From 2e7609156af162026fabfeea50a7684efea25fdb Mon Sep 17 00:00:00 2001
From: Evghenii
Date: Thu, 23 Jan 2014 10:24:44 +0100
Subject: [PATCH] fixes for exclusive_scan_and/or_i32 and shuffle2 and __movmsk

---
 builtins/target-nvptx.ll        | 156 ++++++++++++++++++--------------
 builtins/util-nvptx.m4          | 102 ---------------------
 tests/exclusive-scan-and-1.ispc |   8 --
 tests/exclusive-scan-and-2.ispc |  10 --
 4 files changed, 90 insertions(+), 186 deletions(-)

diff --git a/builtins/target-nvptx.ll b/builtins/target-nvptx.ll
index 840134c8..9151adf8 100644
--- a/builtins/target-nvptx.ll
+++ b/builtins/target-nvptx.ll
@@ -28,6 +28,12 @@ define i32 @__warpsize() nounwind readnone alwaysinline
 ;;  ret i32 %tid
   ret i32 32
 }
+define i32 @__laneidx() nounwind readnone alwaysinline
+{
+  %tid  = tail call i32 @__tid_x()
+  %lane = and i32 %tid, 31
+  ret i32 %lane;
+}
 
 
 define i32 @__ctaid_x()  nounwind readnone alwaysinline
@@ -336,7 +342,6 @@ include(`util-nvptx.m4')
 stdlib_core()
 packed_load_and_store()
 int64minmax()
-scans()
 rdrand_decls()
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -664,14 +669,15 @@ define double @__sqrt_uniform_double(double) nounwind readonly alwaysinline {
 }
 declare  @__sqrt_varying_double() nounwind readnone
 
-;; bit ops
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; population count
 
 declare i32 @llvm.ctpop.i32(i32) nounwind readnone
 define i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
-;;  %call = call i32 @llvm.ctpop.i32(i32 %0)
-;;  ret i32 %call
-  %res = tail call i32 asm sideeffect "popc.b32 $0, $1;", "=r,r"(i32 %0) nounwind readnone alwaysinline
-  ret i32 %res
+  %call = call i32 @llvm.ctpop.i32(i32 %0)
+  ret i32 %call
+;;  %res = tail call i32 asm sideeffect "popc.b32 $0, $1;", "=r,r"(i32 %0) nounwind readnone alwaysinline
+;;  ret i32 %res
 }
 
 declare i64 @llvm.ctpop.i64(i64) nounwind readnone
@@ -680,6 +686,9 @@ define i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
   ret i64 %call
 }
 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; binary prefix sum
+
 define i64 @__warpBinExclusiveScan(i1 %p) nounwind readonly alwaysinline
 {
 entry:
@@ -701,25 +710,21 @@ ctlztz()
 ; or, use the macro to call the 4-wide ones twice with our 8-wide
 ; vectors...
 
-;; svml
+;; svml is not supported in PTX and will generate a linking error
 include(`svml.m4')
 svml_stubs(float,f,WIDTH)
 svml_stubs(double,d,WIDTH)
 
 
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-; population count;
-
-
-
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; reductions
 
 define i64 @__movmsk(<1 x i1>) nounwind readnone alwaysinline {
   %v = extractelement <1 x i1> %0, i32 0
-  %v64 = zext i1 %v to i64
-  ret i64 %v64
+  %call = call i32 @__ballot_nvptx(i1 zeroext %v)
+  %v64  = zext i32 %call to i64
+  ret i64 %v64
 }
 
 define i1 @__any(<1 x i1>) nounwind readnone alwaysinline {
@@ -1012,37 +1017,10 @@ define i64 @__reduce_max_uint64(<1 x i64>) nounwind readnone alwaysinline {
 }
 
 ;;;; reduce equal
-define i1 @__reduce_equal_int32(<1 x i32> %vv, i32 * %samevalue,
-                                <1 x i1> %mask) nounwind alwaysinline {
-  %v=extractelement <1 x i32> %vv, i32 0
-  store i32 %v, i32 * %samevalue
-  ret i1 true
-
-}
-
-define i1 @__reduce_equal_float(<1 x float> %vv, float * %samevalue,
-                                <1 x i1> %mask) nounwind alwaysinline {
-  %v=extractelement <1 x float> %vv, i32 0
-  store float %v, float * %samevalue
-  ret i1 true
-
-}
-
-define i1 @__reduce_equal_int64(<1 x i64> %vv, i64 * %samevalue,
-                                <1 x i1> %mask) nounwind alwaysinline {
-  %v=extractelement <1 x i64> %vv, i32 0
-  store i64 %v, i64 * %samevalue
-  ret i1 true
-
-}
-
-define i1 @__reduce_equal_double(<1 x double> %vv, double * %samevalue,
-                                 <1 x i1> %mask) nounwind alwaysinline {
-  %v=extractelement <1 x double> %vv, i32 0
-  store double %v, double * %samevalue
-  ret i1 true
-
-}
+declare i1 @__reduce_equal_int32(<1 x i32> %vv, i32 * %samevalue, <1 x i1> %mask) nounwind alwaysinline;
+declare i1 @__reduce_equal_float(<1 x float> %vv, float * %samevalue, <1 x i1> %mask) nounwind alwaysinline;
+declare i1 @__reduce_equal_int64(<1 x i64> %vv, i64 * %samevalue, <1 x i1> %mask) nounwind alwaysinline;
+declare i1 @__reduce_equal_double(<1 x double> %vv, double * %samevalue, <1 x i1> %mask) nounwind alwaysinline;
 
 ;;;;;;;;;;; shuffle
 define(`shuffle1', `
@@ -1065,13 +1043,18 @@ shuffle1(double)
 define(`shuffle2',`
 define <1 x $1> @__shuffle2_$1(<1 x $1>, <1 x $1>, <1 x i32>) nounwind readnone alwaysinline {
-  %val1 = extractelement <1 x $1> %0, i32 0
-  %val2 = extractelement <1 x $1> %1, i32 0
+  %val1 = extractelement <1 x $1> %0, i32 0
+  %val2 = extractelement <1 x $1> %1, i32 0
+
+  ;; fetch both values
   %lane      = extractelement <1 x i32> %2, i32 0
-  %c   = icmp slt i32 %lane, 32
-  %val = select i1 %c, $1 %val1, $1 %val2
   %lane_mask = and i32 %lane, 31
-  %rets = tail call $1 @__shfl_$1_nvptx($1 %val, i32 %lane_mask);
+  %ret1 = tail call $1 @__shfl_$1_nvptx($1 %val1, i32 %lane_mask);
+  %ret2 = tail call $1 @__shfl_$1_nvptx($1 %val2, i32 %lane_mask);
+
+  ;; select the correct one
+  %c    = icmp slt i32 %lane, 32
+  %rets = select i1 %c, $1 %ret1, $1 %ret2
   %retv = insertelement <1 x $1> undef, $1 %rets, i32 0
   ret <1 x $1> %retv
 }
@@ -1141,6 +1124,9 @@ broadcast(i64)
 broadcast(float)
 broadcast(double)
 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; prefix sum stuff
+
 define i32 @__shfl_scan_add_step_i32(i32 %partial, i32 %up_offset) nounwind readnone alwaysinline
 {
   %result = tail call i32 asm sideeffect
@@ -1181,10 +1167,18 @@ define i32 @__shfl_scan_or_step_i32(i32 %partial, i32 %up_offset) nounwind readn
 }
 
 define <1 x i32> @__exclusive_scan_or_i32(<1 x i32>, <1 x i1>) nounwind readnone alwaysinline {
-  %shft = tail call <1 x i32> @__shift_i32(<1 x i32> %0, i32 -1)
-  %v0 = extractelement <1 x i32> %shft, i32 0
+  %v0 = extractelement <1 x i32> %0, i32 0
   %mask = extractelement <1 x i1 > %1, i32 0
-  %v = select i1 %mask, i32 %v0, i32 0
+  %v1 = select i1 %mask, i32 %v0, i32 0
+
+  ;; shfl-up by one for exclusive scan
+  %v = tail call i32 asm sideeffect
+      "{.reg .u32 r0;
+        .reg .pred p;
+        shfl.up.b32 r0|p, $1, 1, 0;
+        @!p mov.u32 r0, 0;
+        mov.u32 $0, r0;
+       }","=r,r"(i32 %v1);
 
   %s1 = tail call i32 @__shfl_scan_or_step_i32(i32 %v,  i32 1);
   %s2 = tail call i32 @__shfl_scan_or_step_i32(i32 %s1, i32 2);
@@ -1208,16 +1202,19 @@ define i32 @__shfl_scan_and_step_i32(i32 %partial, i32 %up_offset) nounwind read
 }
 
 define <1 x i32> @__exclusive_scan_and_i32(<1 x i32>, <1 x i1>) nounwind readnone alwaysinline {
-  %shft = tail call <1 x i32> @__shift_i32(<1 x i32> %0, i32 -1)
-  %v0 = extractelement <1 x i32> %shft, i32 0
+  %v0 = extractelement <1 x i32> %0, i32 0
   %mask = extractelement <1 x i1 > %1, i32 0
   %v1 = select i1 %mask, i32 %v0, i32 -1
-  %tid  = tail call i32 @__tid_x()
-  %lane = and i32 %tid, 31
-  %c    = icmp eq i32 %lane, 0
-  %v    = select i1 %c, i32 -1, i32 %v1
-
+  ;; shfl-up by one for exclusive scan
+  %v = tail call i32 asm sideeffect
+      "{.reg .u32 r0;
+        .reg .pred p;
+        shfl.up.b32 r0|p, $1, 1, 0;
+        @!p mov.u32 r0, -1;
+        mov.u32 $0, r0;
+       }","=r,r"(i32 %v1);
+
   %s1 = tail call i32 @__shfl_scan_and_step_i32(i32 %v,  i32 1);
   %s2 = tail call i32 @__shfl_scan_and_step_i32(i32 %s1, i32 2);
   %s3 = tail call i32 @__shfl_scan_and_step_i32(i32 %s2, i32 4);
@@ -1253,6 +1250,10 @@ define <1 x float> @__exclusive_scan_add_float(<1 x float>, <1 x i1>) nounwind r
   %retv = insertelement <1 x float> undef, float %rets, i32 0
   ret <1 x float> %retv
 }
+declare <1 x double> @__exclusive_scan_add_double(<1 x double>, <1 x i1>) nounwind readnone alwaysinline
+declare <1 x i64> @__exclusive_scan_add_i64(<1 x i64>, <1 x i1>) nounwind readnone alwaysinline
+declare <1 x i64> @__exclusive_scan_and_i64(<1 x i64>, <1 x i1>) nounwind readnone alwaysinline
+declare <1 x i64> @__exclusive_scan_or_i64(<1 x i64>, <1 x i1>) nounwind readnone alwaysinline
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; unaligned loads/loads+broadcasts
@@ -1345,15 +1346,38 @@ gen_scatter(double)
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; prefetch
-
-;; define void @__prefetch_read_uniform_1(i8 * nocapture) nounwind alwaysinline { }
-;; define void @__prefetch_read_uniform_2(i8 * nocapture) nounwind alwaysinline { }
-;; define void @__prefetch_read_uniform_3(i8 * nocapture) nounwind alwaysinline { }
-;; define void @__prefetch_read_uniform_nt(i8 * nocapture) nounwind alwaysinline { }
-
 define_prefetches()
+
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; int8/int16 builtins
 
 define_avgs()
 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; vector ops
+
+define(`extract_insert',`
+define $1 @__extract_$2(<1 x $1>, i32) nounwind readnone alwaysinline {
+  %val = extractelement <1 x $1> %0, i32 0
+  %extract = tail call $1 @__shfl_$1_nvptx($1 %val, i32 %1)
+  ret $1 %extract
+}
+
+define <1 x $1> @__insert_$2(<1 x $1>, i32,
+                             $1) nounwind readnone alwaysinline {
+  %orig = extractelement <1 x $1> %0, i32 0
+  %lane = call i32 @__laneidx()
+  %c    = icmp eq i32 %lane, %1
+  %val  = select i1 %c, $1 %2, $1 %orig
+  %insert = insertelement <1 x $1> %0, $1 %val, i32 0
+  ret <1 x $1> %insert
+}
+')
+
+extract_insert(i8,  int8)
+extract_insert(i16, int16)
+extract_insert(i32, int32)
+extract_insert(i64, int64)
+extract_insert(float, float)
+extract_insert(double, double)
+
diff --git a/builtins/util-nvptx.m4 b/builtins/util-nvptx.m4
index 11bd0997..04c63758 100644
--- a/builtins/util-nvptx.m4
+++ b/builtins/util-nvptx.m4
@@ -2503,52 +2503,6 @@ ifelse(HAVE_SCATTER, `1',
 }
 
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; vector ops
-
-define i8 @__extract_int8(<WIDTH x i8>, i32) nounwind readnone alwaysinline {
-  %extract = extractelement <WIDTH x i8> %0, i32 %1
-  ret i8 %extract
-}
-
-define <WIDTH x i8> @__insert_int8(<WIDTH x i8>, i32,
-                                   i8) nounwind readnone alwaysinline {
-  %insert = insertelement <WIDTH x i8> %0, i8 %2, i32 %1
-  ret <WIDTH x i8> %insert
-}
-
-define i16 @__extract_int16(<WIDTH x i16>, i32) nounwind readnone alwaysinline {
-  %extract = extractelement <WIDTH x i16> %0, i32 %1
-  ret i16 %extract
-}
-
-define <WIDTH x i16> @__insert_int16(<WIDTH x i16>, i32,
-                                     i16) nounwind readnone alwaysinline {
-  %insert = insertelement <WIDTH x i16> %0, i16 %2, i32 %1
-  ret <WIDTH x i16> %insert
-}
-
-define i32 @__extract_int32(<WIDTH x i32>, i32) nounwind readnone alwaysinline {
-  %extract = extractelement <WIDTH x i32> %0, i32 %1
-  ret i32 %extract
-}
-
-define <WIDTH x i32> @__insert_int32(<WIDTH x i32>, i32,
-                                     i32) nounwind readnone alwaysinline {
-  %insert = insertelement <WIDTH x i32> %0, i32 %2, i32 %1
-  ret <WIDTH x i32> %insert
-}
-
-define i64 @__extract_int64(<WIDTH x i64>, i32) nounwind readnone alwaysinline {
-  %extract = extractelement <WIDTH x i64> %0, i32 %1
-  ret i64 %extract
-}
-
-define <WIDTH x i64> @__insert_int64(<WIDTH x i64>, i32,
-                                     i64) nounwind readnone alwaysinline {
-  %insert = insertelement <WIDTH x i64> %0, i64 %2, i32 %1
-  ret <WIDTH x i64> %insert
-}
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; various bitcasts from one type to another
@@ -3777,62 +3731,6 @@ reduce_equal_aux($1, i64, int64, i64, icmp, 64, eq)
 reduce_equal_aux($1, double, double, i64, fcmp, 64, oeq)
 ')
 
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; prefix sum stuff
-
-; $1: vector width (e.g. 4)
-; $2: vector element type (e.g. float)
-; $3: bit width of vector element type (e.g. 32)
-; $4: operator to apply (e.g. fadd)
-; $5: identity element value (e.g. 0)
-; $6: suffix for function (e.g. add_float)
-
-define(`exclusive_scan', `
-define <$1 x $2> @__exclusive_scan_$6(<$1 x $2> %v,
-                                      <$1 x MASK> %mask) nounwind alwaysinline {
-  ; first, set the value of any off lanes to the identity value
-  %ptr = alloca <$1 x $2>
-  %idvec1 = bitcast $2 $5 to <1 x $2>
-  %idvec = shufflevector <1 x $2> %idvec1, <1 x $2> undef,
-      <$1 x i32> < forloop(i, 0, eval($1-2), `i32 0, ') i32 0 >
-  store <$1 x $2> %idvec, <$1 x $2> * %ptr
-  %ptr`'$3 = bitcast <$1 x $2> * %ptr to <$1 x i`'$3> *
-  %vi = bitcast <$1 x $2> %v to <$1 x i`'$3>
-  call void @__masked_store_blend_i$3(<$1 x i`'$3> * %ptr`'$3, <$1 x i`'$3> %vi,
-                                      <$1 x MASK> %mask)
-  %v_id = load <$1 x $2> * %ptr
-
-  ; extract elements of the vector to use in computing the scan
-  forloop(i, 0, eval($1-1), `
-  %v`'i = extractelement <$1 x $2> %v_id, i32 i')
-
-  ; and just compute the scan directly.
-  ; 0th element is the identity (so nothing to do here),
-  ; 1st element is identity (op) the 0th element of the original vector,
-  ; each successive element is the previous element (op) the previous element
-  ; of the original vector
-  %s1 = $4 $2 $5, %v0
-  forloop(i, 2, eval($1-1), `
-  %s`'i = $4 $2 %s`'eval(i-1), %v`'eval(i-1)')
-
-  ; and fill in the result vector
-  %r0 = insertelement <$1 x $2> undef, $2 $5, i32 0  ; 0th element gets identity
-  forloop(i, 1, eval($1-1), `
-  %r`'i = insertelement <$1 x $2> %r`'eval(i-1), $2 %s`'i, i32 i')
-
-  ret <$1 x $2> %r`'eval($1-1)
-}
-')
-
-define(`scans', `
-exclusive_scan(WIDTH, i64, 64, add, 0, add_i64)
-exclusive_scan(WIDTH, double, 64, fadd, zeroinitializer, add_double)
-
-exclusive_scan(WIDTH, i64, 64, and, -1, and_i64)
-
-exclusive_scan(WIDTH, i64, 64, or, 0, or_i64)
-')
-
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; per_lane
 ;;
diff --git a/tests/exclusive-scan-and-1.ispc b/tests/exclusive-scan-and-1.ispc
index 525d0c2a..31347b47 100644
--- a/tests/exclusive-scan-and-1.ispc
+++ b/tests/exclusive-scan-and-1.ispc
@@ -4,17 +4,9 @@ export uniform int width() { return programCount; }
 export void f_f(uniform float RET[], uniform float aFOO[]) {
     RET[programIndex] = -1;
     int32 a = (programIndex & 1) ? 0xff : 0;
-#if 0
     if (programIndex & 1)
     {
       RET[programIndex] = exclusive_scan_and(a);
     }
-#else
-    const bool mask = programIndex & 1;
-    const float res = exclusive_scan_and(mask ? a : -1);
-    if (mask) {
-      RET[programIndex] = res;
-    }
-#endif
 }
diff --git a/tests/exclusive-scan-and-2.ispc b/tests/exclusive-scan-and-2.ispc
index 7fd69648..b742a91e 100644
--- a/tests/exclusive-scan-and-2.ispc
+++ b/tests/exclusive-scan-and-2.ispc
@@ -4,19 +4,9 @@ export uniform int width() { return programCount; }
 export void f_f(uniform float RET[], uniform float aFOO[]) {
     RET[programIndex] = -1;
     int32 a = ~(1ul << programIndex);
-#if 0
     if ((programIndex < 32) && (programIndex & 1) == 0)
     {
       RET[programIndex] = exclusive_scan_and(a);
     }
-#else
-    const bool mask = ((programIndex < 32) && (programIndex & 1) == 0);
-    const float res = exclusive_scan_and(mask ? a : -1);
-    if (mask)
-    {
-      RET[programIndex] = res;
-    }
-#endif
-
 }
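For reference, the rewritten __exclusive_scan_and_i32 above follows the usual warp-scan recipe: shift every lane's value up by one lane with shfl.up.b32, let the predicate output inject the AND identity (-1) into lane 0, then run the log2(warpsize) Hillis-Steele steps via __shfl_scan_and_step_i32. The CUDA sketch below illustrates the same idea; warp_exclusive_scan_and is a hypothetical helper name, and the sketch assumes a full 32-lane warp and the __shfl_up_sync intrinsic (CUDA 9 and later) rather than the pre-sync shfl instruction the inline PTX targets.

// Hedged sketch, not part of the patch: CUDA rendering of the warp-level
// exclusive AND-scan implemented above with inline shfl.up.b32 PTX.
__device__ int warp_exclusive_scan_and(int value, bool active)
{
    const unsigned all_lanes = 0xffffffffu;
    const int lane = threadIdx.x & 31;

    // Inactive lanes contribute the AND identity.
    int v = active ? value : -1;

    // Shift values up by one lane; lane 0 receives the identity,
    // mirroring the "@!p mov.u32 r0, -1" predicate trick in the PTX.
    int x = __shfl_up_sync(all_lanes, v, 1);
    if (lane == 0)
        x = -1;

    // Hillis-Steele inclusive scan over the shifted values
    // (the __shfl_scan_and_step_i32 steps with offsets 1, 2, 4, 8, 16).
    for (int offset = 1; offset < 32; offset <<= 1) {
        int y = __shfl_up_sync(all_lanes, x, offset);
        if (lane >= offset)
            x &= y;
    }
    return x;   // exclusive AND-scan of the active lanes' values
}

The exclusive OR-scan variant differs only in the identity (0 instead of -1) and the combining operator, which is exactly the difference between the two inline-asm blocks in the patch.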