From 8c534d4d74db53ee52741f90229a4d0afae04147 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Wed, 10 Aug 2011 15:55:55 -0700 Subject: [PATCH] Add reduce_equal() function to standard library. --- builtins-avx.ll | 1 + builtins-sse.ll | 1 + builtins-sse4x2.ll | 2 + builtins.m4 | 75 ++++++++++++++++++++++++++++++++++++++ docs/ispc.txt | 27 ++++++++++++++ stdlib.ispc | 16 ++++++++ tests/reduce-equal-1.ispc | 11 ++++++ tests/reduce-equal-10.ispc | 13 +++++++ tests/reduce-equal-11.ispc | 13 +++++++ tests/reduce-equal-12.ispc | 18 +++++++++ tests/reduce-equal-13.ispc | 17 +++++++++ tests/reduce-equal-2.ispc | 13 +++++++ tests/reduce-equal-3.ispc | 13 +++++++ tests/reduce-equal-4.ispc | 15 ++++++++ tests/reduce-equal-5.ispc | 13 +++++++ tests/reduce-equal-6.ispc | 13 +++++++ tests/reduce-equal-7.ispc | 13 +++++++ tests/reduce-equal-8.ispc | 13 +++++++ tests/reduce-equal-9.ispc | 15 ++++++++ tests/reduce-equal.ispc | 11 ++++++ 20 files changed, 313 insertions(+) create mode 100644 tests/reduce-equal-1.ispc create mode 100644 tests/reduce-equal-10.ispc create mode 100644 tests/reduce-equal-11.ispc create mode 100644 tests/reduce-equal-12.ispc create mode 100644 tests/reduce-equal-13.ispc create mode 100644 tests/reduce-equal-2.ispc create mode 100644 tests/reduce-equal-3.ispc create mode 100644 tests/reduce-equal-4.ispc create mode 100644 tests/reduce-equal-5.ispc create mode 100644 tests/reduce-equal-6.ispc create mode 100644 tests/reduce-equal-7.ispc create mode 100644 tests/reduce-equal-8.ispc create mode 100644 tests/reduce-equal-9.ispc create mode 100644 tests/reduce-equal.ispc diff --git a/builtins-avx.ll b/builtins-avx.ll index 58a5dd3b..843a7a0b 100644 --- a/builtins-avx.ll +++ b/builtins-avx.ll @@ -409,6 +409,7 @@ define internal float @__reduce_max_float(<8 x float>) nounwind readnone alwaysi reduce8(float, @__max_varying_float, @__max_uniform_float) } +reduce_equal(8) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; horizontal int32 
ops diff --git a/builtins-sse.ll b/builtins-sse.ll index c76ff907..a29e8437 100644 --- a/builtins-sse.ll +++ b/builtins-sse.ll @@ -376,6 +376,7 @@ define internal i64 @__reduce_max_uint64(<4 x i64>) nounwind readnone { reduce4(i64, @__max_varying_uint64, @__max_uniform_uint64) } +reduce_equal(4) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; masked store diff --git a/builtins-sse4x2.ll b/builtins-sse4x2.ll index 10c595f8..ed6cca68 100644 --- a/builtins-sse4x2.ll +++ b/builtins-sse4x2.ll @@ -434,6 +434,8 @@ define internal i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone { reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64) } +reduce_equal(8) + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; unaligned loads/loads+broadcasts diff --git a/builtins.m4 b/builtins.m4 index 47158292..4267b6c6 100644 --- a/builtins.m4 +++ b/builtins.m4 @@ -1400,6 +1400,81 @@ done: } ') +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; reduce_equal + +; count trailing zeros +declare i32 @llvm.cttz.i32(i32) + +define(`reduce_equal_aux', ` +define internal i1 @__reduce_equal_$3(<$1 x $2> %v, $2 * %samevalue, + <$1 x i32> %mask) nounwind alwaysinline { +entry: + %mm = call i32 @__movmsk(<$1 x i32> %mask) + %allon = icmp eq i32 %mm, eval((1<<$1)-1) + br i1 %allon, label %check_neighbors, label %domixed + +domixed: + ; the mask is mixed on/off. First see if the lanes are all off + %alloff = icmp eq i32 %mm, 0 + br i1 %alloff, label %doalloff, label %actuallymixed + +doalloff: + ret i1 undef ;; should we return an actual value here? 
+ +actuallymixed: + ; First, figure out which lane is the first active one + %first = call i32 @llvm.cttz.i32(i32 %mm) + %baseval = extractelement <$1 x $2> %v, i32 %first + %basev1 = bitcast $2 %baseval to <1 x $2> + ; get a vector that is that value smeared across all elements + %basesmear = shufflevector <1 x $2> %basev1, <1 x $2> undef, + <$1 x i32> < forloop(i, 0, eval($1-2), `i32 0, ') i32 0 > + + ; now do a blend of that vector with the original vector, such that the + ; result will be the original value for the active lanes, and the value + ; from the first active lane for the inactive lanes. Given that, we can + ; just unconditionally check if the lanes are all equal in check_neighbors + ; below without worrying about inactive lanes... + %ptr = alloca <$1 x $2> + store <$1 x $2> %basesmear, <$1 x $2> * %ptr + %castptr = bitcast <$1 x $2> * %ptr to <$1 x $4> * + %castv = bitcast <$1 x $2> %v to <$1 x $4> + call void @__masked_store_blend_$6(<$1 x $4> * %castptr, <$1 x $4> %castv, <$1 x i32> %mask) + %blendvec = load <$1 x $2> * %ptr + br label %check_neighbors + +check_neighbors: + %vec = phi <$1 x $2> [ %blendvec, %actuallymixed ], [ %v, %entry ] + ; now we can just rotate once and compare with the vector, which ends + ; up comparing each element to its neighbor on the right. Then see if + ; all of those values are true; if so, then all of the elements are equal. 
+ %castvec = bitcast <$1 x $2> %vec to <$1 x $4> + %castvr = call <$1 x $4> @__rotate_int$6(<$1 x $4> %castvec, i32 1) + %vr = bitcast <$1 x $4> %castvr to <$1 x $2> + %eq = $5 eq <$1 x $2> %vec, %vr + %eq32 = sext <$1 x i1> %eq to <$1 x i32> + %eqmm = call i32 @__movmsk(<$1 x i32> %eq32) + %alleq = icmp eq i32 %eqmm, eval((1<<$1)-1) + br i1 %alleq, label %all_equal, label %not_all_equal + +all_equal: + %the_value = extractelement <$1 x $2> %vec, i32 0 + store $2 %the_value, $2 * %samevalue + ret i1 true + +not_all_equal: + ret i1 false +} +') + +define(`reduce_equal', ` +reduce_equal_aux($1, i32, int32, i32, icmp, 32) +reduce_equal_aux($1, float, float, i32, fcmp, 32) +reduce_equal_aux($1, i64, int64, i64, icmp, 64) +reduce_equal_aux($1, double, double, i64, fcmp, 64) +') + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; per_lane ;; diff --git a/docs/ispc.txt b/docs/ispc.txt index 3ba925dd..5afbbbc6 100644 --- a/docs/ispc.txt +++ b/docs/ispc.txt @@ -1823,7 +1823,34 @@ given value across all of the currently-executing vector lanes. uniform int reduce_max(int a, int b) uniform unsigned int reduce_max(unsigned int a, unsigned int b) +Finally, you can check to see if a particular variable has the same value in +all of the currently-running program instances: +:: + + uniform bool reduce_equal(int32 v) + uniform bool reduce_equal(unsigned int32 v) + uniform bool reduce_equal(float v) + uniform bool reduce_equal(int64 v) + uniform bool reduce_equal(unsigned int64 v) + uniform bool reduce_equal(double v) + +There are also variants of these functions that return the value as a +``uniform`` in the case where the values are all the same. 
+ +:: + + uniform bool reduce_equal(int32 v, reference uniform int32 sameval) + uniform bool reduce_equal(unsigned int32 v, + reference uniform unsigned int32 sameval) + uniform bool reduce_equal(float v, reference uniform float sameval) + uniform bool reduce_equal(int64 v, reference uniform int64 sameval) + uniform bool reduce_equal(unsigned int64 v, + reference uniform unsigned int64 sameval) + uniform bool reduce_equal(double v, reference uniform double sameval) + +The value returned by the ``reduce_equal()`` function is undefined if +it is called when none of the program instances are running. Packed Load and Store Operations -------------------------------- diff --git a/stdlib.ispc b/stdlib.ispc index 9907904f..f9f852a1 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -471,6 +471,22 @@ static inline uniform unsigned int64 reduce_max(unsigned int64 v) { return __reduce_max_uint64(__mask ? v : 0); } +#define REDUCE_EQUAL(TYPE, FUNCTYPE) \ +static inline uniform bool reduce_equal(TYPE v) { \ + uniform TYPE unusedValue; \ + return __reduce_equal_##FUNCTYPE(v, unusedValue, (int32)__mask); \ +} \ +static inline uniform bool reduce_equal(TYPE v, reference uniform TYPE value) { \ + return __reduce_equal_##FUNCTYPE(v, value, (int32)__mask); \ +} + +REDUCE_EQUAL(int32, int32) +REDUCE_EQUAL(unsigned int32, int32) +REDUCE_EQUAL(float, float) +REDUCE_EQUAL(int64, int64) +REDUCE_EQUAL(unsigned int64, int64) +REDUCE_EQUAL(double, double) + /////////////////////////////////////////////////////////////////////////// // packed load, store diff --git a/tests/reduce-equal-1.ispc b/tests/reduce-equal-1.ispc new file mode 100644 index 00000000..da58601c --- /dev/null +++ b/tests/reduce-equal-1.ispc @@ -0,0 +1,11 @@ + +export uniform int width() { return programCount; } + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + int a = b; + RET[programIndex] = reduce_equal(a) ? 
1 : 0; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1; +} diff --git a/tests/reduce-equal-10.ispc b/tests/reduce-equal-10.ispc new file mode 100644 index 00000000..3f24aa91 --- /dev/null +++ b/tests/reduce-equal-10.ispc @@ -0,0 +1,13 @@ + +export uniform int width() { return programCount; } + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + unsigned int64 a = aFOO[programIndex&1]; + RET[programIndex] = 1; + if (programIndex & 1) + RET[programIndex] = reduce_equal(a) ? 1 : 0; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1; +} diff --git a/tests/reduce-equal-11.ispc b/tests/reduce-equal-11.ispc new file mode 100644 index 00000000..aa0ee1df --- /dev/null +++ b/tests/reduce-equal-11.ispc @@ -0,0 +1,13 @@ + +export uniform int width() { return programCount; } + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + double a = aFOO[programIndex]; + RET[programIndex] = 0; + if (programIndex & 1) + RET[programIndex] = reduce_equal(a) ? 
1 : 0; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; +} diff --git a/tests/reduce-equal-12.ispc b/tests/reduce-equal-12.ispc new file mode 100644 index 00000000..8b67e5ce --- /dev/null +++ b/tests/reduce-equal-12.ispc @@ -0,0 +1,18 @@ + +export uniform int width() { return programCount; } + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + int a = 10 + aFOO[programIndex/2]; + RET[programIndex] = 1; + uniform int sameVal; + uniform bool re; + if (a <= 11) { + re = reduce_equal(a, sameVal); +//CO print("% % %\n", re, sameVal, a); + } + RET[programIndex] = ((int)re << 8) + sameVal; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 267; +} diff --git a/tests/reduce-equal-13.ispc b/tests/reduce-equal-13.ispc new file mode 100644 index 00000000..15eac013 --- /dev/null +++ b/tests/reduce-equal-13.ispc @@ -0,0 +1,17 @@ + +export uniform int width() { return programCount; } + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + int a = aFOO[programIndex&1]; + RET[programIndex] = 1; + uniform bool re; + uniform int val; + if (programIndex & 1) { + re = reduce_equal(a, val); + } + RET[programIndex] = ((int)re << 8) + val; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 258; +} diff --git a/tests/reduce-equal-2.ispc b/tests/reduce-equal-2.ispc new file mode 100644 index 00000000..20743905 --- /dev/null +++ b/tests/reduce-equal-2.ispc @@ -0,0 +1,13 @@ + +export uniform int width() { return programCount; } + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + int a = aFOO[programIndex/2]; + RET[programIndex] = 1; + if (a == 1) + RET[programIndex] = reduce_equal(a) ? 
1 : 0; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1; +} diff --git a/tests/reduce-equal-3.ispc b/tests/reduce-equal-3.ispc new file mode 100644 index 00000000..ca158f82 --- /dev/null +++ b/tests/reduce-equal-3.ispc @@ -0,0 +1,13 @@ + +export uniform int width() { return programCount; } + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + int a = aFOO[programIndex/2]; + RET[programIndex] = 1; + if (a < programCount + 4) + RET[programIndex] = reduce_equal(a) ? 1 : 0; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; +} diff --git a/tests/reduce-equal-4.ispc b/tests/reduce-equal-4.ispc new file mode 100644 index 00000000..4c6f9808 --- /dev/null +++ b/tests/reduce-equal-4.ispc @@ -0,0 +1,15 @@ + +export uniform int width() { return programCount; } + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + int a = aFOO[programIndex/(programCount/2)]; + RET[programIndex] = 0; + if (programIndex >= programCount/2 && a < 4) + RET[programIndex] = reduce_equal(a) ? 1 : 0; + else + RET[programIndex] = 1; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1; +} diff --git a/tests/reduce-equal-5.ispc b/tests/reduce-equal-5.ispc new file mode 100644 index 00000000..03f895b9 --- /dev/null +++ b/tests/reduce-equal-5.ispc @@ -0,0 +1,13 @@ + +export uniform int width() { return programCount; } + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + int a = aFOO[programIndex&1]; + RET[programIndex] = 1; + if (programIndex & 1) + RET[programIndex] = reduce_equal(a) ? 
1 : 0; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1; +} diff --git a/tests/reduce-equal-6.ispc b/tests/reduce-equal-6.ispc new file mode 100644 index 00000000..803991db --- /dev/null +++ b/tests/reduce-equal-6.ispc @@ -0,0 +1,13 @@ + +export uniform int width() { return programCount; } + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + int a = aFOO[programIndex]; + RET[programIndex] = 0; + if (programIndex & 1) + RET[programIndex] = reduce_equal(a) ? 1 : 0; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; +} diff --git a/tests/reduce-equal-7.ispc b/tests/reduce-equal-7.ispc new file mode 100644 index 00000000..8aef617c --- /dev/null +++ b/tests/reduce-equal-7.ispc @@ -0,0 +1,13 @@ + +export uniform int width() { return programCount; } + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + float a = aFOO[programIndex/2]; + RET[programIndex] = 1; + if (a == 1) + RET[programIndex] = reduce_equal(a) ? 1 : 0; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1; +} diff --git a/tests/reduce-equal-8.ispc b/tests/reduce-equal-8.ispc new file mode 100644 index 00000000..3f078c55 --- /dev/null +++ b/tests/reduce-equal-8.ispc @@ -0,0 +1,13 @@ + +export uniform int width() { return programCount; } + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + int64 a = aFOO[programIndex/2]; + RET[programIndex] = 1; + if (a == 1) + RET[programIndex] = reduce_equal(a) ? 
1 : 0; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1; +} diff --git a/tests/reduce-equal-9.ispc b/tests/reduce-equal-9.ispc new file mode 100644 index 00000000..24fde82c --- /dev/null +++ b/tests/reduce-equal-9.ispc @@ -0,0 +1,15 @@ + +export uniform int width() { return programCount; } + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + double a = aFOO[programIndex/2]; + RET[programIndex] = 1; + uniform bool eq = false; + if (a < 4) + eq = reduce_equal(a); + RET[programIndex] = eq; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; +} diff --git a/tests/reduce-equal.ispc b/tests/reduce-equal.ispc new file mode 100644 index 00000000..f09ec940 --- /dev/null +++ b/tests/reduce-equal.ispc @@ -0,0 +1,11 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { + float a = aFOO[programIndex]; + RET[programIndex] = reduce_equal(a); +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; +}