diff --git a/builtins/target-nvptx.ll b/builtins/target-nvptx.ll
index 8825d36d..fa537977 100644
--- a/builtins/target-nvptx.ll
+++ b/builtins/target-nvptx.ll
@@ -467,15 +467,103 @@ declare void @__fastmath() nounwind
 
 ;; round/floor/ceil
 
-declare float @__round_uniform_float(float) nounwind readnone
-declare float @__floor_uniform_float(float) nounwind readnone
-declare float @__ceil_uniform_float(float) nounwind readnone
+;; Round a uniform float to the nearest integer value, halfway cases away
+;; from zero. The PTX adds a sign-matched 0.5 and truncates toward zero;
+;; inputs with magnitude above 2^23 pass through unchanged and inputs with
+;; magnitude below 0.5 are truncated directly.
+define float @__round_uniform_float(float) nounwind readnone alwaysinline
+{
+  %2 = tail call float asm sideeffect
+  "{ .reg .pred p<3>; .reg .s32 r<4>; .reg .f32 f<10>;
+     mov.f32 f4, $1;
+     abs.f32 f5, f4;
+     mov.b32 r1, f4;
+     and.b32 r2, r1, -2147483648;
+     or.b32 r3, r2, 1056964608;
+     mov.b32 f6, r3;
+     add.f32 f7, f6, f4;
+     cvt.rzi.f32.f32 f8, f7;
+     setp.gt.f32 p1, f5, 0f4B000000;
+     selp.f32 f9, f4, f8, p1;
+     setp.geu.f32 p2, f5, 0f3F000000;
+     @!p2 cvt.rzi.f32.f32 f9, f4;
+     mov.f32 $0, f9;
+  }", "=f,f"(float %0) nounwind readnone alwaysinline
+  ret float %2
+}
+;; Round a uniform float toward negative infinity (PTX cvt.rmi).
+define float @__floor_uniform_float(float) nounwind readnone alwaysinline
+{
+  %2 = tail call float asm sideeffect "cvt.rmi.f32.f32 $0, $1;", "=f,f"(float %0) nounwind alwaysinline readnone
+  ret float %2
+}
+;; Round a uniform float toward positive infinity (PTX cvt.rpi).
+define float @__ceil_uniform_float(float) nounwind readnone alwaysinline
+{
+  %2 = tail call float asm sideeffect "cvt.rpi.f32.f32 $0, $1;", "=f,f"(float %0) nounwind alwaysinline readnone
+  ret float %2
+}
 
-declare double @__round_uniform_double(double) nounwind readnone
-declare double @__floor_uniform_double(double) nounwind readnone
-declare double @__ceil_uniform_double(double) nounwind readnone
+;; Round a uniform double to the nearest integer value, halfway cases away
+;; from zero. Magnitudes of 2^52 and above are returned unchanged; otherwise
+;; the PTX truncates the magnitude plus 0.5, using zero for magnitudes below
+;; 0.5, and then copies the sign bit of the input onto the result.
+define double @__round_uniform_double(double) nounwind readnone alwaysinline
+{
+  %2 = tail call double asm sideeffect
+  "{
+     .reg .pred p<3>;
+     .reg .s32 r<6>;
+     .reg .f64 fd<9>;
 
-define <1 x float> @__round_varying_float(<1 x float>) nounwind readonly alwaysinline {
+     mov.f64 fd8, $1;
+     abs.f64 fd1, fd8;
+     setp.ge.f64 p1, fd1, 0d4330000000000000;
+     @p1 bra BB5_2;
+
+     add.f64 fd5, fd1, 0d3FE0000000000000;
+     cvt.rzi.f64.f64 fd6, fd5;
+     setp.lt.f64 p2, fd1, 0d3FE0000000000000;
+     selp.f64 fd7, 0d0000000000000000, fd6, p2;
+     {
+       .reg .b32 temp;
+       mov.b64 {r1, temp}, fd7;
+     }
+     {
+       .reg .b32 temp;
+       mov.b64 {temp, r2}, fd7;
+     }
+     {
+       .reg .b32 temp;
+       mov.b64 {temp, r3}, fd8;
+     }
+     and.b32 r4, r3, -2147483648;
+     or.b32 r5, r2, r4;
+     mov.b64 fd8, {r1, r5};
+
+BB5_2:
+     mov.f64 $0, fd8;
+  }", "=d,d"(double %0) nounwind readnone alwaysinline
+  ret double %2
+}
+;; Round a uniform double toward negative infinity (PTX cvt.rmi); the d
+;; constraint binds the operands to 64-bit floating-point registers.
+define double @__floor_uniform_double(double) nounwind readnone alwaysinline
+{
+  %2 = tail call double asm sideeffect "cvt.rmi.f64.f64 $0, $1;", "=d,d"(double %0) nounwind alwaysinline readnone
+  ret double %2
+}
+;; Round a uniform double toward positive infinity (PTX cvt.rpi); the d
+;; constraint binds the operands to 64-bit floating-point registers.
+define double @__ceil_uniform_double(double) nounwind readnone alwaysinline
+{
+  %2 = tail call double asm sideeffect "cvt.rpi.f64.f64 $0, $1;", "=d,d"(double %0) nounwind alwaysinline readnone
+  ret double %2
+}
+
+;; Former hand-written varying round, kept under an X-suffixed name; the
+;; unsuffixed symbol is now generated from the uniform version.
+define <1 x float> @__round_varying_floatX(<1 x float>) nounwind readonly alwaysinline {
   %float_to_int_bitcast.i.i.i.i = bitcast <1 x float> %0 to <1 x i32>
   %bitop.i.i = and <1 x i32> %float_to_int_bitcast.i.i.i.i,
   %bitop.i = xor <1 x i32> %float_to_int_bitcast.i.i.i.i, %bitop.i.i
@@ -487,7 +575,9 @@ define <1 x float> @__round_varying_float(<1 x float>) nounwind readonly always
   %int_to_float_bitcast.i.i.i = bitcast <1 x i32> %bitop31.i to <1 x float>
   ret <1 x float> %int_to_float_bitcast.i.i.i
 }
-define <1 x float> @__floor_varying_float(<1 x float>) nounwind readonly alwaysinline {
+;; Former hand-written varying floor, kept under an X-suffixed name. It still
+;; calls the unsuffixed varying round.
+define <1 x float> @__floor_varying_floatX(<1 x float>) nounwind readonly alwaysinline {
   %calltmp.i = tail call <1 x float> @__round_varying_float(<1 x float> %0) nounwind
   %bincmp.i = fcmp ogt <1 x float> %calltmp.i, %0
   %val_to_boolvec32.i = sext <1 x i1> %bincmp.i to <1 x i32>
@@ -497,11 +587,24 @@ define <1 x float> @__floor_varying_float(<1 x float>) nounwind readonly always
   ret <1 x float> %binop.i
 }
 
-declare @__ceil_varying_float() nounwind readnone
-
-declare @__round_varying_double() nounwind readnone
-declare @__floor_varying_double() nounwind readnone
-declare @__ceil_varying_double() nounwind readnone
+;; Generate the single-lane varying round/floor/ceil entry points by
+;; extracting lane 0, calling the matching uniform routine, and re-wrapping
+;; the result in a one-element vector.
+define(`rfc_varying',`
+define <1 x $2> @__$1_varying_$2(<1 x $2>) nounwind readonly alwaysinline
+{
+  %val = extractelement <1 x $2> %0, i32 0
+  %res = call $2 @__$1_uniform_$2($2 %val)
+  %ret = insertelement <1 x $2> undef, $2 %res, i32 0
+  ret <1 x $2> %ret
+}
+')
+rfc_varying(round, float)
+rfc_varying(floor, float)
+rfc_varying(ceil, float)
+rfc_varying(round, double)
+rfc_varying(floor, double)
+rfc_varying(ceil, double)
 
 ;; min/max uniform
 