Add support for scan operations across program instances (add, and, or).
This commit is contained in:
@@ -41,6 +41,7 @@
|
||||
|
||||
stdlib_core(8)
|
||||
packed_load_and_store(8)
|
||||
scans(8)
|
||||
int64minmax(8)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
@@ -35,6 +35,7 @@
|
||||
; Define some basics for a 4-wide target
|
||||
stdlib_core(4)
|
||||
packed_load_and_store(4)
|
||||
scans(4)
|
||||
|
||||
; Include the various definitions of things that only require SSE1 and SSE2
|
||||
include(`builtins-sse.ll')
|
||||
|
||||
@@ -35,6 +35,7 @@
|
||||
; Define common 4-wide stuff
|
||||
stdlib_core(4)
|
||||
packed_load_and_store(4)
|
||||
scans(4)
|
||||
|
||||
; Define the stuff that can be done with base SSE1/SSE2 instructions
|
||||
include(`builtins-sse.ll')
|
||||
@@ -229,7 +230,6 @@ define internal float @__reduce_add_float(<4 x float>) nounwind readonly alwaysi
|
||||
ret float %scalar
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; masked store
|
||||
|
||||
|
||||
@@ -38,6 +38,7 @@
|
||||
|
||||
stdlib_core(8)
|
||||
packed_load_and_store(8)
|
||||
scans(8)
|
||||
int64minmax(8)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
60
builtins.m4
60
builtins.m4
@@ -1475,6 +1475,66 @@ reduce_equal_aux($1, i64, int64, i64, icmp, 64)
|
||||
reduce_equal_aux($1, double, double, i64, fcmp, 64)
|
||||
')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; prefix sum stuff
|
||||
|
||||
; $1: vector width (e.g. 4)
|
||||
; $2: vector element type (e.g. float)
|
||||
; $3: bit width of vector element type (e.g. 32)
|
||||
; $4: operator to apply (e.g. fadd)
|
||||
; $5: identity element value (e.g. 0)
|
||||
; $6: suffix for function (e.g. add_float)
|
||||
|
||||
; Emits the LLVM function @__exclusive_scan_<suffix> for one element type and
; one operator. Lanes that are off in %mask contribute the identity value.
; The scan itself is fully unrolled: one scalar operation per lane.
define(`exclusive_scan', `
|
||||
define internal <$1 x $2> @__exclusive_scan_$6(<$1 x $2> %v,
|
||||
<$1 x i32> %mask) nounwind alwaysinline {
|
||||
; first, set the value of any off lanes to the identity value
|
||||
; (done through memory: a stack slot is filled with the identity, %v is then
; blend-stored over it under %mask, and the merged vector is loaded back)
%ptr = alloca <$1 x $2>
|
||||
; build a vector holding the identity in every lane: all shuffle indices are 0
%idvec1 = bitcast $2 $5 to <1 x $2>
|
||||
%idvec = shufflevector <1 x $2> %idvec1, <1 x $2> undef,
|
||||
<$1 x i32> < forloop(i, 0, eval($1-2), `i32 0, ') i32 0 >
|
||||
store <$1 x $2> %idvec, <$1 x $2> * %ptr
|
||||
; the blend helper takes integer vectors, so both the slot pointer and %v are
; reinterpreted as integer vectors of the same element width
%ptr`'$3 = bitcast <$1 x $2> * %ptr to <$1 x i`'$3> *
|
||||
%vi = bitcast <$1 x $2> %v to <$1 x i`'$3>
|
||||
; presumably only the lanes that are on in %mask get overwritten here; confirm
; against the definition of the blend helper
call void @__masked_store_blend_$3(<$1 x i`'$3> * %ptr`'$3, <$1 x i`'$3> %vi,
|
||||
<$1 x i32> %mask)
|
||||
%v_id = load <$1 x $2> * %ptr
|
||||
|
||||
; extract elements of the vector to use in computing the scan
|
||||
forloop(i, 0, eval($1-1), `
|
||||
%v`'i = extractelement <$1 x $2> %v_id, i32 i')
|
||||
|
||||
; and just compute the scan directly.
|
||||
; 0th element is the identity (so nothing to do here),
|
||||
; 1st element is identity (op) the 0th element of the original vector,
|
||||
; each successive element is the previous element (op) the previous element
|
||||
; of the original vector
|
||||
; (the last extracted element is never consumed below: the scan is exclusive)
%s1 = $4 $2 $5, %v0
|
||||
forloop(i, 2, eval($1-1), `
|
||||
%s`'i = $4 $2 %s`'eval(i-1), %v`'eval(i-1)')
|
||||
|
||||
; and fill in the result vector
|
||||
; (each insertelement yields a new value; the final one holds every lane)
%r0 = insertelement <$1 x $2> undef, $2 $5, i32 0 ; 0th element gets identity
|
||||
forloop(i, 1, eval($1-1), `
|
||||
%r`'i = insertelement <$1 x $2> %r`'eval(i-1), $2 %s`'i, i32 i')
|
||||
|
||||
ret <$1 x $2> %r`'eval($1-1)
|
||||
}
|
||||
')
|
||||
|
||||
; Instantiates every exclusive scan variant for one target vector width.
; $1: vector width (e.g. 4)
define(`scans', `
|
||||
; addition: identity is zero (zeroinitializer for the floating-point types)
exclusive_scan($1, i32, 32, add, 0, add_i32)
|
||||
exclusive_scan($1, float, 32, fadd, zeroinitializer, add_float)
|
||||
exclusive_scan($1, i64, 64, add, 0, add_i64)
|
||||
exclusive_scan($1, double, 64, fadd, zeroinitializer, add_double)
|
||||
|
||||
; bitwise and: identity is -1 (all bits set)
exclusive_scan($1, i32, 32, and, -1, and_i32)
|
||||
exclusive_scan($1, i64, 64, and, -1, and_i64)
|
||||
|
||||
; bitwise or: identity is 0
exclusive_scan($1, i32, 32, or, 0, or_i32)
|
||||
exclusive_scan($1, i64, 64, or, 0, or_i64)
|
||||
')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; per_lane
|
||||
;;
|
||||
|
||||
@@ -101,6 +101,7 @@ Contents:
|
||||
+ `"Inline" Aggressively`_
|
||||
+ `Small Performance Tricks`_
|
||||
+ `Instrumenting Your ISPC Programs`_
|
||||
+ `Using Scan Operations For Variable Output`_
|
||||
|
||||
* `Disclaimer and Legal Information`_
|
||||
|
||||
@@ -1852,6 +1853,44 @@ There are also variants of these functions that return the value as a
|
||||
The value returned by the ``reduce_equal()`` function is undefined if
|
||||
it is called when none of the program instances are running.
|
||||
|
||||
There are also a number of functions to compute "scan"s of values across
|
||||
the program instances. For example, the ``exclusive_scan_add()`` function
|
||||
computes, for each program instance, the sum of the given value over all of
|
||||
the preceding program instances. (The scans currently available in
|
||||
``ispc`` are all so-called "exclusive" scans, meaning that the value
|
||||
computed for a given element does not include the value provided for that
|
||||
element.) In C code, an exclusive add scan over an array might be
|
||||
implemented as:
|
||||
|
||||
::
|
||||
|
||||
/*
 * Exclusive add scan over an array: result_array[i] receives the sum of
 * in_array[0] .. in_array[i-1], and result_array[0] receives the identity, 0.
 * Both arrays must hold at least count elements; nothing is written when
 * count <= 0.
 */
void scan_add(int *in_array, int *result_array, int count) {
    if (count <= 0)
        return;
    result_array[0] = 0;
    /* Start at 1: element 0 is already set and has no predecessor. */
    for (int i = 1; i < count; ++i)
        result_array[i] = result_array[i-1] + in_array[i-1];
}
|
||||
|
||||
``ispc`` provides the following scan functions--addition, bitwise-and, and
|
||||
bitwise-or are available:
|
||||
|
||||
::
|
||||
|
||||
int32 exclusive_scan_add(int32 v)
|
||||
unsigned int32 exclusive_scan_add(unsigned int32 v)
|
||||
float exclusive_scan_add(float v)
|
||||
int64 exclusive_scan_add(int64 v)
|
||||
unsigned int64 exclusive_scan_add(unsigned int64 v)
|
||||
double exclusive_scan_add(double v)
|
||||
int32 exclusive_scan_and(int32 v)
|
||||
unsigned int32 exclusive_scan_and(unsigned int32 v)
|
||||
int64 exclusive_scan_and(int64 v)
|
||||
unsigned int64 exclusive_scan_and(unsigned int64 v)
|
||||
int32 exclusive_scan_or(int32 v)
|
||||
unsigned int32 exclusive_scan_or(unsigned int32 v)
|
||||
int64 exclusive_scan_or(int64 v)
|
||||
unsigned int64 exclusive_scan_or(unsigned int64 v)
|
||||
|
||||
|
||||
Packed Load and Store Operations
|
||||
--------------------------------
|
||||
|
||||
@@ -2760,6 +2799,38 @@ active upon function entry.
|
||||
ao.ispc(0088) - function entry: 36928 calls (0 / 0.00% all off!), 97.40% active lanes
|
||||
...
|
||||
|
||||
|
||||
Using Scan Operations For Variable Output
|
||||
-----------------------------------------
|
||||
|
||||
One important application of the ``exclusive_scan_add()`` function in the
|
||||
standard library is when program instances want to generate a variable amount
|
||||
of output and when one would like that output to be densely packed in a
|
||||
single array. For example, consider the code fragment below:
|
||||
|
||||
::
|
||||
|
||||
uniform int func(uniform float outArray[], ...) {
|
||||
int numOut = ...; // figure out how many to be output
|
||||
float outLocal[MAX_OUT]; // staging area
|
||||
// put results in outLocal[0], ..., outLocal[numOut-1]
|
||||
int startOffset = exclusive_scan_add(numOut);
|
||||
for (int i = 0; i < numOut; ++i)
|
||||
outArray[startOffset + i] = outLocal[i];
|
||||
return reduce_add(numOut);
|
||||
}
|
||||
|
||||
Here, each program instance has computed a number, ``numOut``, of values to
|
||||
output, and has stored them in the ``outLocal`` array. Assume that four
|
||||
program instances are running and that the first one wants to output one
|
||||
value, the second two values, and the third and fourth three values each.
|
||||
In this case, ``exclusive_scan_add()`` will return the values (0, 1, 3, 6)
|
||||
to the four program instances, respectively. The first program instance
|
||||
will write its one result to ``outArray[0]``, the second will write its two
|
||||
values to ``outArray[1]`` and ``outArray[2]``, and so forth. The
|
||||
``reduce_add`` call at the end returns the total number of values that the
|
||||
program instances have written to the array.
|
||||
|
||||
Disclaimer and Legal Information
|
||||
================================
|
||||
|
||||
|
||||
56
stdlib.ispc
56
stdlib.ispc
@@ -487,6 +487,62 @@ REDUCE_EQUAL(int64, int64)
|
||||
REDUCE_EQUAL(unsigned int64, int64)
|
||||
REDUCE_EQUAL(double, double)
|
||||
|
||||
// exclusive_scan_add(): exclusive add scan across program instances. Each
// overload forwards the value and the current __mask to the matching
// __exclusive_scan_add_* builtin.
// The unsigned overloads call the same i32/i64 builtins as the signed ones.
// NOTE(review): the signed overloads pass (int32)__mask while the unsigned,
// float and double overloads pass __mask uncast -- confirm whether the cast is
// required or a leftover, and make the overloads consistent.
static int32 exclusive_scan_add(int32 v) {
|
||||
return __exclusive_scan_add_i32(v, (int32)__mask);
|
||||
}
|
||||
|
||||
// unsigned 32-bit: forwards to the i32 builtin
static unsigned int32 exclusive_scan_add(unsigned int32 v) {
|
||||
return __exclusive_scan_add_i32(v, __mask);
|
||||
}
|
||||
|
||||
// float: forwards to the float builtin
static float exclusive_scan_add(float v) {
|
||||
return __exclusive_scan_add_float(v, __mask);
|
||||
}
|
||||
|
||||
// signed 64-bit: forwards to the i64 builtin
static int64 exclusive_scan_add(int64 v) {
|
||||
return __exclusive_scan_add_i64(v, (int32)__mask);
|
||||
}
|
||||
|
||||
// unsigned 64-bit: forwards to the i64 builtin
static unsigned int64 exclusive_scan_add(unsigned int64 v) {
|
||||
return __exclusive_scan_add_i64(v, __mask);
|
||||
}
|
||||
|
||||
// double: forwards to the double builtin
static double exclusive_scan_add(double v) {
|
||||
return __exclusive_scan_add_double(v, __mask);
|
||||
}
|
||||
|
||||
// exclusive_scan_and(): exclusive bitwise-and scan across program instances.
// Each overload forwards the value and the current __mask to the matching
// __exclusive_scan_and_* builtin; integer types only.
// NOTE(review): only the signed overloads cast __mask to int32 -- confirm
// whether the cast is needed.
static int32 exclusive_scan_and(int32 v) {
|
||||
return __exclusive_scan_and_i32(v, (int32)__mask);
|
||||
}
|
||||
|
||||
// unsigned 32-bit: forwards to the i32 builtin
static unsigned int32 exclusive_scan_and(unsigned int32 v) {
|
||||
return __exclusive_scan_and_i32(v, __mask);
|
||||
}
|
||||
|
||||
// signed 64-bit: forwards to the i64 builtin
static int64 exclusive_scan_and(int64 v) {
|
||||
return __exclusive_scan_and_i64(v, (int32)__mask);
|
||||
}
|
||||
|
||||
// unsigned 64-bit: forwards to the i64 builtin
static unsigned int64 exclusive_scan_and(unsigned int64 v) {
|
||||
return __exclusive_scan_and_i64(v, __mask);
|
||||
}
|
||||
|
||||
// exclusive_scan_or(): exclusive bitwise-or scan across program instances.
// Each overload forwards the value and the current __mask to the matching
// __exclusive_scan_or_* builtin; integer types only.
// NOTE(review): only the signed overloads cast __mask to int32 -- confirm
// whether the cast is needed.
static int32 exclusive_scan_or(int32 v) {
|
||||
return __exclusive_scan_or_i32(v, (int32)__mask);
|
||||
}
|
||||
|
||||
// unsigned 32-bit: forwards to the i32 builtin
static unsigned int32 exclusive_scan_or(unsigned int32 v) {
|
||||
return __exclusive_scan_or_i32(v, __mask);
|
||||
}
|
||||
|
||||
// signed 64-bit: forwards to the i64 builtin
static int64 exclusive_scan_or(int64 v) {
|
||||
return __exclusive_scan_or_i64(v, (int32)__mask);
|
||||
}
|
||||
|
||||
// unsigned 64-bit: forwards to the i64 builtin
static unsigned int64 exclusive_scan_or(unsigned int64 v) {
|
||||
return __exclusive_scan_or_i64(v, __mask);
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// packed load, store
|
||||
|
||||
|
||||
12
tests/exclusive-scan-add-1.ispc
Normal file
12
tests/exclusive-scan-add-1.ispc
Normal file
@@ -0,0 +1,12 @@
|
||||
|
||||
// Test: exclusive_scan_add() applied to programIndex, with no control flow in
// f_f() that narrows the mask. Lane k is expected to receive
// 0 + 1 + ... + (k-1); result() writes that table.
export uniform int width() { return programCount; }
|
||||
|
||||
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||
RET[programIndex] = exclusive_scan_add(programIndex);
|
||||
}
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
// expected[k] = sum of the lane indices below k
uniform int result[] = { 0, 0, 1, 3, 6, 10, 15, 21, 28,
|
||||
36, 45, 55, 66, 78, 91, 105, 120 };
|
||||
RET[programIndex] = result[programIndex];
|
||||
}
|
||||
20
tests/exclusive-scan-add-10.ispc
Normal file
20
tests/exclusive-scan-add-10.ispc
Normal file
@@ -0,0 +1,20 @@
|
||||
|
||||
// Test: exclusive_scan_add() on unsigned int64 values with only the odd lanes
// running. Even lanes keep the -1 sentinel written before the branch.
// The expected table is consistent with aFOO[i] == i + 1 -- presumably set up
// by the test harness; confirm.
export uniform int width() { return programCount; }
|
||||
|
||||
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||
RET[programIndex] = -1;
|
||||
unsigned int64 a = aFOO[programIndex];
|
||||
if (programIndex & 1) {
|
||||
RET[programIndex] = exclusive_scan_add(a);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
// odd entries: running sum of the values held by the lower odd lanes
uniform int result[16] = { 0, 0, 0, 2, 0, 6, 0, 12,
|
||||
0, 20, 0, 30, 0, 42, 0, 56 };
|
||||
if (programIndex & 1)
|
||||
RET[programIndex] = result[programIndex];
|
||||
else
|
||||
RET[programIndex] = -1;
|
||||
}
|
||||
12
tests/exclusive-scan-add-2.ispc
Normal file
12
tests/exclusive-scan-add-2.ispc
Normal file
@@ -0,0 +1,12 @@
|
||||
|
||||
// Test: exclusive_scan_add() on the float values in aFOO, with no control flow
// in f_f() that narrows the mask.
// The expected table is consistent with aFOO[i] == i + 1 -- presumably set up
// by the test harness; confirm.
export uniform int width() { return programCount; }
|
||||
|
||||
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||
RET[programIndex] = exclusive_scan_add(aFOO[programIndex]);
|
||||
}
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
// expected[k] = 1 + 2 + ... + k
uniform int result[] = { 0, 1, 3, 6, 10, 15, 21, 28,
|
||||
36, 45, 55, 66, 78, 91, 105, 120, 136 };
|
||||
RET[programIndex] = result[programIndex];
|
||||
}
|
||||
17
tests/exclusive-scan-add-3.ispc
Normal file
17
tests/exclusive-scan-add-3.ispc
Normal file
@@ -0,0 +1,17 @@
|
||||
|
||||
// Test: exclusive_scan_add() on float values, called only by lanes whose value
// is <= 2. Every other lane keeps the -1 sentinel.
// result() assumes exactly lanes 0 and 1 take the branch, i.e. aFOO[i] == i + 1
// -- presumably set up by the test harness; confirm.
export uniform int width() { return programCount; }
|
||||
|
||||
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||
RET[programIndex] = -1;
|
||||
float a = aFOO[programIndex];
|
||||
if (a <= 2)
|
||||
RET[programIndex] = exclusive_scan_add(a);
|
||||
}
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
// only the first two entries are read below
uniform int result[] = { 0, 1, 3, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0 };
|
||||
RET[programIndex] = -1;
|
||||
if (programIndex <= 1)
|
||||
RET[programIndex] = result[programIndex];
|
||||
}
|
||||
17
tests/exclusive-scan-add-4.ispc
Normal file
17
tests/exclusive-scan-add-4.ispc
Normal file
@@ -0,0 +1,17 @@
|
||||
|
||||
// Test: exclusive_scan_add() on float values, called only by lanes whose value
// is <= 2. Every other lane keeps the -1 sentinel.
// NOTE(review): this file is token-for-token the same as
// tests/exclusive-scan-add-3.ispc; presumably one of the two was meant to
// exercise a different type or mask -- confirm.
export uniform int width() { return programCount; }
|
||||
|
||||
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||
RET[programIndex] = -1;
|
||||
float a = aFOO[programIndex];
|
||||
if (a <= 2)
|
||||
RET[programIndex] = exclusive_scan_add(a);
|
||||
}
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
// only the first two entries are read below
uniform int result[] = { 0, 1, 3, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0 };
|
||||
RET[programIndex] = -1;
|
||||
if (programIndex <= 1)
|
||||
RET[programIndex] = result[programIndex];
|
||||
}
|
||||
20
tests/exclusive-scan-add-5.ispc
Normal file
20
tests/exclusive-scan-add-5.ispc
Normal file
@@ -0,0 +1,20 @@
|
||||
|
||||
// Test: exclusive_scan_add() on float values with only the odd lanes running.
// Even lanes keep the -1 sentinel written before the branch.
// The expected table is consistent with aFOO[i] == i + 1 -- presumably set up
// by the test harness; confirm.
export uniform int width() { return programCount; }
|
||||
|
||||
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||
RET[programIndex] = -1;
|
||||
float a = aFOO[programIndex];
|
||||
if (programIndex & 1) {
|
||||
RET[programIndex] = exclusive_scan_add(a);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
// odd entries: running sum of the values held by the lower odd lanes
uniform int result[16] = { 0, 0, 0, 2, 0, 6, 0, 12,
|
||||
0, 20, 0, 30, 0, 42, 0, 56 };
|
||||
if (programIndex & 1)
|
||||
RET[programIndex] = result[programIndex];
|
||||
else
|
||||
RET[programIndex] = -1;
|
||||
}
|
||||
12
tests/exclusive-scan-add-6.ispc
Normal file
12
tests/exclusive-scan-add-6.ispc
Normal file
@@ -0,0 +1,12 @@
|
||||
|
||||
// Test: exclusive_scan_add() on programIndex converted to float, with no
// control flow in f_f() that narrows the mask. Lane k is expected to receive
// 0 + 1 + ... + (k-1).
export uniform int width() { return programCount; }
|
||||
|
||||
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||
RET[programIndex] = exclusive_scan_add((float)programIndex);
|
||||
}
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
// expected[k] = sum of the lane indices below k
uniform int result[] = { 0, 0, 1, 3, 6, 10, 15, 21, 28,
|
||||
36, 45, 55, 66, 78, 91, 105, 120 };
|
||||
RET[programIndex] = result[programIndex];
|
||||
}
|
||||
12
tests/exclusive-scan-add-7.ispc
Normal file
12
tests/exclusive-scan-add-7.ispc
Normal file
@@ -0,0 +1,12 @@
|
||||
|
||||
// Test: exclusive_scan_add() on the aFOO values converted to double, with no
// control flow in f_f() that narrows the mask.
// The expected table is consistent with aFOO[i] == i + 1 -- presumably set up
// by the test harness; confirm.
export uniform int width() { return programCount; }
|
||||
|
||||
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||
RET[programIndex] = exclusive_scan_add((double)aFOO[programIndex]);
|
||||
}
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
// expected[k] = 1 + 2 + ... + k
uniform int result[] = { 0, 1, 3, 6, 10, 15, 21, 28,
|
||||
36, 45, 55, 66, 78, 91, 105, 120, 136 };
|
||||
RET[programIndex] = result[programIndex];
|
||||
}
|
||||
17
tests/exclusive-scan-add-8.ispc
Normal file
17
tests/exclusive-scan-add-8.ispc
Normal file
@@ -0,0 +1,17 @@
|
||||
|
||||
// Test: exclusive_scan_add() on int64 values, called only by lanes whose value
// is <= 2. Every other lane keeps the -1 sentinel.
// result() assumes exactly lanes 0 and 1 take the branch, i.e. aFOO[i] == i + 1
// -- presumably set up by the test harness; confirm.
export uniform int width() { return programCount; }
|
||||
|
||||
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||
RET[programIndex] = -1;
|
||||
int64 a = aFOO[programIndex];
|
||||
if (a <= 2)
|
||||
RET[programIndex] = exclusive_scan_add(a);
|
||||
}
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
// only the first two entries are read below
uniform int result[] = { 0, 1, 3, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0 };
|
||||
RET[programIndex] = -1;
|
||||
if (programIndex <= 1)
|
||||
RET[programIndex] = result[programIndex];
|
||||
}
|
||||
17
tests/exclusive-scan-add-9.ispc
Normal file
17
tests/exclusive-scan-add-9.ispc
Normal file
@@ -0,0 +1,17 @@
|
||||
|
||||
// Test: exclusive_scan_add() on unsigned int64 values, called only by lanes
// whose value is <= 2. Every other lane keeps the -1 sentinel.
// result() assumes exactly lanes 0 and 1 take the branch, i.e. aFOO[i] == i + 1
// -- presumably set up by the test harness; confirm.
export uniform int width() { return programCount; }
|
||||
|
||||
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||
RET[programIndex] = -1;
|
||||
unsigned int64 a = aFOO[programIndex];
|
||||
if (a <= 2)
|
||||
RET[programIndex] = exclusive_scan_add(a);
|
||||
}
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
// only the first two entries are read below
uniform int result[] = { 0, 1, 3, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0 };
|
||||
RET[programIndex] = -1;
|
||||
if (programIndex <= 1)
|
||||
RET[programIndex] = result[programIndex];
|
||||
}
|
||||
22
tests/exclusive-scan-and-1.ispc
Normal file
22
tests/exclusive-scan-and-1.ispc
Normal file
@@ -0,0 +1,22 @@
|
||||
|
||||
// Test: exclusive_scan_and() on int32 with only the odd lanes running, each
// contributing 0xff. Lane 1 has no running predecessor, so it is expected to
// get the all-ones value (-1 once stored as float); later odd lanes are
// expected to get 0xff. Even lanes keep the -1 sentinel.
export uniform int width() { return programCount; }
|
||||
|
||||
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||
RET[programIndex] = -1;
|
||||
int32 a = (programIndex & 1) ? 0xff : 0;
|
||||
if (programIndex & 1) {
|
||||
RET[programIndex] = exclusive_scan_and(a);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
if (programIndex & 1) {
|
||||
if (programIndex == 1)
|
||||
RET[programIndex] = -1; // 0xffffffff, made float
|
||||
else
|
||||
RET[programIndex] = 0xff;
|
||||
}
|
||||
else
|
||||
RET[programIndex] = -1;
|
||||
}
|
||||
21
tests/exclusive-scan-and-2.ispc
Normal file
21
tests/exclusive-scan-and-2.ispc
Normal file
@@ -0,0 +1,21 @@
|
||||
|
||||
// Test: exclusive_scan_and() on int32 with only the even lanes running; lane k
// contributes a word with just bit k cleared. An even lane k > 0 is expected to
// see bits 0, 2, ..., k-2 cleared. Lane 0 is expected to get all ones, which is
// the same -1 that the sentinel already holds.
export uniform int width() { return programCount; }
|
||||
|
||||
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||
RET[programIndex] = -1;
|
||||
int32 a = ~(1 << programIndex);
|
||||
if ((programIndex & 1) == 0) {
|
||||
RET[programIndex] = exclusive_scan_and(a);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
RET[programIndex] = -1;
|
||||
if ((programIndex & 1) == 0 && programIndex > 0) {
|
||||
int val = 0xffffffff;
|
||||
// clear the bit of every even lane strictly below this one
for (int i = 0; i < programIndex-1; i += 2)
|
||||
val &= ~(1<<i);
|
||||
RET[programIndex] = val;
|
||||
}
|
||||
}
|
||||
13
tests/exclusive-scan-or-1.ispc
Normal file
13
tests/exclusive-scan-or-1.ispc
Normal file
@@ -0,0 +1,13 @@
|
||||
|
||||
// Test: exclusive_scan_or() on int32, with no control flow in f_f() that
// narrows the mask; lane k contributes the single bit 1 << k. Lane k is
// expected to receive every lower bit set, i.e. (1 << k) - 1.
export uniform int width() { return programCount; }
|
||||
|
||||
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||
RET[programIndex] = -1;
|
||||
int32 a = (1 << programIndex);
|
||||
RET[programIndex] = exclusive_scan_or(a);
|
||||
}
|
||||
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
RET[programIndex] = (1 << programIndex) - 1;
|
||||
}
|
||||
Reference in New Issue
Block a user