From f868a63064cddeea8ec7d90c58cdc3c4629cb8e8 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Sat, 13 Aug 2011 20:11:41 +0100 Subject: [PATCH] Add support for scan operations across program instances (add, and, or). --- builtins-avx.ll | 1 + builtins-sse2.ll | 1 + builtins-sse4.ll | 2 +- builtins-sse4x2.ll | 1 + builtins.m4 | 60 +++++++++++++++++++++++++++ docs/ispc.txt | 71 ++++++++++++++++++++++++++++++++ stdlib.ispc | 56 +++++++++++++++++++++++++ tests/exclusive-scan-add-1.ispc | 12 ++++++ tests/exclusive-scan-add-10.ispc | 20 +++++++++ tests/exclusive-scan-add-2.ispc | 12 ++++++ tests/exclusive-scan-add-3.ispc | 17 ++++++++ tests/exclusive-scan-add-4.ispc | 17 ++++++++ tests/exclusive-scan-add-5.ispc | 20 +++++++++ tests/exclusive-scan-add-6.ispc | 12 ++++++ tests/exclusive-scan-add-7.ispc | 12 ++++++ tests/exclusive-scan-add-8.ispc | 17 ++++++++ tests/exclusive-scan-add-9.ispc | 17 ++++++++ tests/exclusive-scan-and-1.ispc | 22 ++++++++++ tests/exclusive-scan-and-2.ispc | 21 ++++++++++ tests/exclusive-scan-or-1.ispc | 13 ++++++ 20 files changed, 403 insertions(+), 1 deletion(-) create mode 100644 tests/exclusive-scan-add-1.ispc create mode 100644 tests/exclusive-scan-add-10.ispc create mode 100644 tests/exclusive-scan-add-2.ispc create mode 100644 tests/exclusive-scan-add-3.ispc create mode 100644 tests/exclusive-scan-add-4.ispc create mode 100644 tests/exclusive-scan-add-5.ispc create mode 100644 tests/exclusive-scan-add-6.ispc create mode 100644 tests/exclusive-scan-add-7.ispc create mode 100644 tests/exclusive-scan-add-8.ispc create mode 100644 tests/exclusive-scan-add-9.ispc create mode 100644 tests/exclusive-scan-and-1.ispc create mode 100644 tests/exclusive-scan-and-2.ispc create mode 100644 tests/exclusive-scan-or-1.ispc diff --git a/builtins-avx.ll b/builtins-avx.ll index 843a7a0b..18f90f48 100644 --- a/builtins-avx.ll +++ b/builtins-avx.ll @@ -41,6 +41,7 @@ stdlib_core(8) packed_load_and_store(8) +scans(8) int64minmax(8) 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git a/builtins-sse2.ll b/builtins-sse2.ll index de03e422..daa75cc7 100644 --- a/builtins-sse2.ll +++ b/builtins-sse2.ll @@ -35,6 +35,7 @@ ; Define some basics for a 4-wide target stdlib_core(4) packed_load_and_store(4) +scans(4) ; Include the various definitions of things that only require SSE1 and SSE2 include(`builtins-sse.ll') diff --git a/builtins-sse4.ll b/builtins-sse4.ll index 15a8c7ff..3f8d616e 100644 --- a/builtins-sse4.ll +++ b/builtins-sse4.ll @@ -35,6 +35,7 @@ ; Define common 4-wide stuff stdlib_core(4) packed_load_and_store(4) +scans(4) ; Define the stuff that can be done with base SSE1/SSE2 instructions include(`builtins-sse.ll') @@ -229,7 +230,6 @@ define internal float @__reduce_add_float(<4 x float>) nounwind readonly alwaysi ret float %scalar } - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; masked store diff --git a/builtins-sse4x2.ll b/builtins-sse4x2.ll index ed6cca68..6fc1b800 100644 --- a/builtins-sse4x2.ll +++ b/builtins-sse4x2.ll @@ -38,6 +38,7 @@ stdlib_core(8) packed_load_and_store(8) +scans(8) int64minmax(8) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git a/builtins.m4 b/builtins.m4 index 4267b6c6..862a94e0 100644 --- a/builtins.m4 +++ b/builtins.m4 @@ -1475,6 +1475,66 @@ reduce_equal_aux($1, i64, int64, i64, icmp, 64) reduce_equal_aux($1, double, double, i64, fcmp, 64) ') +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; prefix sum stuff + +; $1: vector width (e.g. 4) +; $2: vector element type (e.g. float) +; $3: bit width of vector element type (e.g. 32) +; $4: operator to apply (e.g. fadd) +; $5: identity element value (e.g. 0) +; $6: suffix for function (e.g. 
add_float) + +define(`exclusive_scan', ` +define internal <$1 x $2> @__exclusive_scan_$6(<$1 x $2> %v, + <$1 x i32> %mask) nounwind alwaysinline { + ; first, set the value of any off lanes to the identity value + %ptr = alloca <$1 x $2> + %idvec1 = bitcast $2 $5 to <1 x $2> + %idvec = shufflevector <1 x $2> %idvec1, <1 x $2> undef, + <$1 x i32> < forloop(i, 0, eval($1-2), `i32 0, ') i32 0 > + store <$1 x $2> %idvec, <$1 x $2> * %ptr + %ptr`'$3 = bitcast <$1 x $2> * %ptr to <$1 x i`'$3> * + %vi = bitcast <$1 x $2> %v to <$1 x i`'$3> + call void @__masked_store_blend_$3(<$1 x i`'$3> * %ptr`'$3, <$1 x i`'$3> %vi, + <$1 x i32> %mask) + %v_id = load <$1 x $2> * %ptr + + ; extract elements of the vector to use in computing the scan + forloop(i, 0, eval($1-1), ` + %v`'i = extractelement <$1 x $2> %v_id, i32 i') + + ; and just compute the scan directly. + ; 0th element is the identity (so nothing to do here), + ; 1st element is identity (op) the 0th element of the original vector, + ; each successive element is the previous element (op) the previous element + ; of the original vector + %s1 = $4 $2 $5, %v0 + forloop(i, 2, eval($1-1), ` + %s`'i = $4 $2 %s`'eval(i-1), %v`'eval(i-1)') + + ; and fill in the result vector + %r0 = insertelement <$1 x $2> undef, $2 $5, i32 0 ; 0th element gets identity + forloop(i, 1, eval($1-1), ` + %r`'i = insertelement <$1 x $2> %r`'eval(i-1), $2 %s`'i, i32 i') + + ret <$1 x $2> %r`'eval($1-1) +} +') + +define(`scans', ` +exclusive_scan($1, i32, 32, add, 0, add_i32) +exclusive_scan($1, float, 32, fadd, zeroinitializer, add_float) +exclusive_scan($1, i64, 64, add, 0, add_i64) +exclusive_scan($1, double, 64, fadd, zeroinitializer, add_double) + +exclusive_scan($1, i32, 32, and, -1, and_i32) +exclusive_scan($1, i64, 64, and, -1, and_i64) + +exclusive_scan($1, i32, 32, or, 0, or_i32) +exclusive_scan($1, i64, 64, or, 0, or_i64) +') + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; per_lane ;; diff --git a/docs/ispc.txt 
b/docs/ispc.txt index 5afbbbc6..8621cba3 100644 --- a/docs/ispc.txt +++ b/docs/ispc.txt @@ -101,6 +101,7 @@ Contents: + `"Inline" Aggressively`_ + `Small Performance Tricks`_ + `Instrumenting Your ISPC Programs`_ + + `Using Scan Operations For Variable Output`_ * `Disclaimer and Legal Information`_ @@ -1852,6 +1853,44 @@ There are also variants of these functions that return the value as a The value returned by the ``reduce_equal()`` function is undefined if it is called when none of the program instances are running. +There are also a number of functions to compute "scan"s of values across +the program instances. For example, the ``exclusive_scan_add()`` function +computes, for each program instance, the sum of the given value over all of +the preceding program instances. (The scans currently available in +``ispc`` are all so-called "exclusive" scans, meaning that the value +computed for a given element does not include the value provided for that +element.) In C code, an exclusive add scan over an array might be +implemented as: + +:: + + void scan_add(int *in_array, int *result_array, int count) { + result_array[0] = 0; + for (int i = 1; i < count; ++i) + result_array[i] = result_array[i-1] + in_array[i-1]; + } + +``ispc`` provides the following scan functions--addition, bitwise-and, and +bitwise-or are available: + +:: + + int32 exclusive_scan_add(int32 v) + unsigned int32 exclusive_scan_add(unsigned int32 v) + float exclusive_scan_add(float v) + int64 exclusive_scan_add(int64 v) + unsigned int64 exclusive_scan_add(unsigned int64 v) + double exclusive_scan_add(double v) + int32 exclusive_scan_and(int32 v) + unsigned int32 exclusive_scan_and(unsigned int32 v) + int64 exclusive_scan_and(int64 v) + unsigned int64 exclusive_scan_and(unsigned int64 v) + int32 exclusive_scan_or(int32 v) + unsigned int32 exclusive_scan_or(unsigned int32 v) + int64 exclusive_scan_or(int64 v) + unsigned int64 exclusive_scan_or(unsigned int64 v) + + Packed Load and Store Operations 
-------------------------------- @@ -2760,6 +2799,38 @@ active upon function entry. ao.ispc(0088) - function entry: 36928 calls (0 / 0.00% all off!), 97.40% active lanes ... + +Using Scan Operations For Variable Output +----------------------------------------- + +One important application of the ``exclusive_scan_add()`` function in the +standard library is when program instances want to generate a variable amount +of output and when one would like that output to be densely packed in a +single array. For example, consider the code fragment below: + +:: + + uniform int func(uniform float outArray[], ...) { + int numOut = ...; // figure out how many to be output + float outLocal[MAX_OUT]; // staging area + // put results in outLocal[0], ..., outLocal[numOut-1] + int startOffset = exclusive_scan_add(numOut); + for (int i = 0; i < numOut; ++i) + outArray[startOffset + i] = outLocal[i]; + return reduce_add(numOut); + } + +Here, each program instance has computed a number, ``numOut``, of values to +output, and has stored them in the ``outLocal`` array. Assume that four +program instances are running and that the first one wants to output one +value, the second two values, and the third and fourth three values each. +In this case, ``exclusive_scan_add()`` will return the values (0, 1, 3, 6) +to the four program instances, respectively. The first program instance +will write its one result to ``outArray[0]``, the second will write its two +values to ``outArray[1]`` and ``outArray[2]``, and so forth. The +``reduce_add`` call at the end returns the total number of values that the +program instances have written to the array. 
+ Disclaimer and Legal Information ================================ diff --git a/stdlib.ispc b/stdlib.ispc index f9f852a1..7ff5fce5 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -487,6 +487,62 @@ REDUCE_EQUAL(int64, int64) REDUCE_EQUAL(unsigned int64, int64) REDUCE_EQUAL(double, double) +static int32 exclusive_scan_add(int32 v) { + return __exclusive_scan_add_i32(v, (int32)__mask); +} + +static unsigned int32 exclusive_scan_add(unsigned int32 v) { + return __exclusive_scan_add_i32(v, __mask); +} + +static float exclusive_scan_add(float v) { + return __exclusive_scan_add_float(v, __mask); +} + +static int64 exclusive_scan_add(int64 v) { + return __exclusive_scan_add_i64(v, (int32)__mask); +} + +static unsigned int64 exclusive_scan_add(unsigned int64 v) { + return __exclusive_scan_add_i64(v, __mask); +} + +static double exclusive_scan_add(double v) { + return __exclusive_scan_add_double(v, __mask); +} + +static int32 exclusive_scan_and(int32 v) { + return __exclusive_scan_and_i32(v, (int32)__mask); +} + +static unsigned int32 exclusive_scan_and(unsigned int32 v) { + return __exclusive_scan_and_i32(v, __mask); +} + +static int64 exclusive_scan_and(int64 v) { + return __exclusive_scan_and_i64(v, (int32)__mask); +} + +static unsigned int64 exclusive_scan_and(unsigned int64 v) { + return __exclusive_scan_and_i64(v, __mask); +} + +static int32 exclusive_scan_or(int32 v) { + return __exclusive_scan_or_i32(v, (int32)__mask); +} + +static unsigned int32 exclusive_scan_or(unsigned int32 v) { + return __exclusive_scan_or_i32(v, __mask); +} + +static int64 exclusive_scan_or(int64 v) { + return __exclusive_scan_or_i64(v, (int32)__mask); +} + +static unsigned int64 exclusive_scan_or(unsigned int64 v) { + return __exclusive_scan_or_i64(v, __mask); +} + /////////////////////////////////////////////////////////////////////////// // packed load, store diff --git a/tests/exclusive-scan-add-1.ispc b/tests/exclusive-scan-add-1.ispc new file mode 100644 index 00000000..b4b0cea9 --- 
/dev/null +++ b/tests/exclusive-scan-add-1.ispc @@ -0,0 +1,12 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { + RET[programIndex] = exclusive_scan_add(programIndex); +} + +export void result(uniform float RET[]) { + uniform int result[] = { 0, 0, 1, 3, 6, 10, 15, 21, 28, + 36, 45, 55, 66, 78, 91, 105, 120 }; + RET[programIndex] = result[programIndex]; +} diff --git a/tests/exclusive-scan-add-10.ispc b/tests/exclusive-scan-add-10.ispc new file mode 100644 index 00000000..c5e1aa18 --- /dev/null +++ b/tests/exclusive-scan-add-10.ispc @@ -0,0 +1,20 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { + RET[programIndex] = -1; + unsigned int64 a = aFOO[programIndex]; + if (programIndex & 1) { + RET[programIndex] = exclusive_scan_add(a); + } +} + + +export void result(uniform float RET[]) { + uniform int result[16] = { 0, 0, 0, 2, 0, 6, 0, 12, + 0, 20, 0, 30, 0, 42, 0, 56 }; + if (programIndex & 1) + RET[programIndex] = result[programIndex]; + else + RET[programIndex] = -1; +} diff --git a/tests/exclusive-scan-add-2.ispc b/tests/exclusive-scan-add-2.ispc new file mode 100644 index 00000000..b8a9258f --- /dev/null +++ b/tests/exclusive-scan-add-2.ispc @@ -0,0 +1,12 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { + RET[programIndex] = exclusive_scan_add(aFOO[programIndex]); +} + +export void result(uniform float RET[]) { + uniform int result[] = { 0, 1, 3, 6, 10, 15, 21, 28, + 36, 45, 55, 66, 78, 91, 105, 120, 136 }; + RET[programIndex] = result[programIndex]; +} diff --git a/tests/exclusive-scan-add-3.ispc b/tests/exclusive-scan-add-3.ispc new file mode 100644 index 00000000..2d883c6c --- /dev/null +++ b/tests/exclusive-scan-add-3.ispc @@ -0,0 +1,17 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float 
aFOO[]) { + RET[programIndex] = -1; + float a = aFOO[programIndex]; + if (a <= 2) + RET[programIndex] = exclusive_scan_add(a); +} + +export void result(uniform float RET[]) { + uniform int result[] = { 0, 1, 3, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0 }; + RET[programIndex] = -1; + if (programIndex <= 1) + RET[programIndex] = result[programIndex]; +} diff --git a/tests/exclusive-scan-add-4.ispc b/tests/exclusive-scan-add-4.ispc new file mode 100644 index 00000000..2d883c6c --- /dev/null +++ b/tests/exclusive-scan-add-4.ispc @@ -0,0 +1,17 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { + RET[programIndex] = -1; + float a = aFOO[programIndex]; + if (a <= 2) + RET[programIndex] = exclusive_scan_add(a); +} + +export void result(uniform float RET[]) { + uniform int result[] = { 0, 1, 3, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0 }; + RET[programIndex] = -1; + if (programIndex <= 1) + RET[programIndex] = result[programIndex]; +} diff --git a/tests/exclusive-scan-add-5.ispc b/tests/exclusive-scan-add-5.ispc new file mode 100644 index 00000000..bb4d50db --- /dev/null +++ b/tests/exclusive-scan-add-5.ispc @@ -0,0 +1,20 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { + RET[programIndex] = -1; + float a = aFOO[programIndex]; + if (programIndex & 1) { + RET[programIndex] = exclusive_scan_add(a); + } +} + + +export void result(uniform float RET[]) { + uniform int result[16] = { 0, 0, 0, 2, 0, 6, 0, 12, + 0, 20, 0, 30, 0, 42, 0, 56 }; + if (programIndex & 1) + RET[programIndex] = result[programIndex]; + else + RET[programIndex] = -1; +} diff --git a/tests/exclusive-scan-add-6.ispc b/tests/exclusive-scan-add-6.ispc new file mode 100644 index 00000000..46908efe --- /dev/null +++ b/tests/exclusive-scan-add-6.ispc @@ -0,0 +1,12 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float 
aFOO[]) { + RET[programIndex] = exclusive_scan_add((float)programIndex); +} + +export void result(uniform float RET[]) { + uniform int result[] = { 0, 0, 1, 3, 6, 10, 15, 21, 28, + 36, 45, 55, 66, 78, 91, 105, 120 }; + RET[programIndex] = result[programIndex]; +} diff --git a/tests/exclusive-scan-add-7.ispc b/tests/exclusive-scan-add-7.ispc new file mode 100644 index 00000000..ee0b0fcd --- /dev/null +++ b/tests/exclusive-scan-add-7.ispc @@ -0,0 +1,12 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { + RET[programIndex] = exclusive_scan_add((double)aFOO[programIndex]); +} + +export void result(uniform float RET[]) { + uniform int result[] = { 0, 1, 3, 6, 10, 15, 21, 28, + 36, 45, 55, 66, 78, 91, 105, 120, 136 }; + RET[programIndex] = result[programIndex]; +} diff --git a/tests/exclusive-scan-add-8.ispc b/tests/exclusive-scan-add-8.ispc new file mode 100644 index 00000000..f17a8ce9 --- /dev/null +++ b/tests/exclusive-scan-add-8.ispc @@ -0,0 +1,17 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { + RET[programIndex] = -1; + int64 a = aFOO[programIndex]; + if (a <= 2) + RET[programIndex] = exclusive_scan_add(a); +} + +export void result(uniform float RET[]) { + uniform int result[] = { 0, 1, 3, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0 }; + RET[programIndex] = -1; + if (programIndex <= 1) + RET[programIndex] = result[programIndex]; +} diff --git a/tests/exclusive-scan-add-9.ispc b/tests/exclusive-scan-add-9.ispc new file mode 100644 index 00000000..eb61dc88 --- /dev/null +++ b/tests/exclusive-scan-add-9.ispc @@ -0,0 +1,17 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { + RET[programIndex] = -1; + unsigned int64 a = aFOO[programIndex]; + if (a <= 2) + RET[programIndex] = exclusive_scan_add(a); +} + +export void result(uniform float RET[]) { + uniform int 
result[] = { 0, 1, 3, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0 }; + RET[programIndex] = -1; + if (programIndex <= 1) + RET[programIndex] = result[programIndex]; +} diff --git a/tests/exclusive-scan-and-1.ispc b/tests/exclusive-scan-and-1.ispc new file mode 100644 index 00000000..31347b47 --- /dev/null +++ b/tests/exclusive-scan-and-1.ispc @@ -0,0 +1,22 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { + RET[programIndex] = -1; + int32 a = (programIndex & 1) ? 0xff : 0; + if (programIndex & 1) { + RET[programIndex] = exclusive_scan_and(a); + } +} + + +export void result(uniform float RET[]) { + if (programIndex & 1) { + if (programIndex == 1) + RET[programIndex] = -1; // 0xffffffff, made float + else + RET[programIndex] = 0xff; + } + else + RET[programIndex] = -1; +} diff --git a/tests/exclusive-scan-and-2.ispc b/tests/exclusive-scan-and-2.ispc new file mode 100644 index 00000000..59a9900e --- /dev/null +++ b/tests/exclusive-scan-and-2.ispc @@ -0,0 +1,21 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { + RET[programIndex] = -1; + int32 a = ~(1 << programIndex); + if ((programIndex & 1) == 0) { + RET[programIndex] = exclusive_scan_and(a); + } +} + + +export void result(uniform float RET[]) { + RET[programIndex] = -1; + if ((programIndex & 1) == 0 && programIndex > 0) { + int val = 0xffffffff; + for (int i = 0; i < programIndex-1; i += 2) + val &= ~(1<