Add support for scan operations across program instances (add, and, or).

This commit is contained in:
Matt Pharr
2011-08-13 20:11:41 +01:00
parent c74116aa24
commit f868a63064
20 changed files with 403 additions and 1 deletions

View File

@@ -41,6 +41,7 @@
stdlib_core(8)
packed_load_and_store(8)
scans(8)
int64minmax(8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

View File

@@ -35,6 +35,7 @@
; Define some basics for a 4-wide target
stdlib_core(4)
packed_load_and_store(4)
scans(4)
; Include the various definitions of things that only require SSE1 and SSE2
include(`builtins-sse.ll')

View File

@@ -35,6 +35,7 @@
; Define common 4-wide stuff
stdlib_core(4)
packed_load_and_store(4)
scans(4)
; Define the stuff that can be done with base SSE1/SSE2 instructions
include(`builtins-sse.ll')
@@ -229,7 +230,6 @@ define internal float @__reduce_add_float(<4 x float>) nounwind readonly alwaysi
ret float %scalar
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store

View File

@@ -38,6 +38,7 @@
stdlib_core(8)
packed_load_and_store(8)
scans(8)
int64minmax(8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

View File

@@ -1475,6 +1475,66 @@ reduce_equal_aux($1, i64, int64, i64, icmp, 64)
reduce_equal_aux($1, double, double, i64, fcmp, 64)
')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; prefix sum stuff
; $1: vector width (e.g. 4)
; $2: vector element type (e.g. float)
; $3: bit width of vector element type (e.g. 32)
; $4: operator to apply (e.g. fadd)
; $5: identity element value (e.g. 0)
; $6: suffix for function (e.g. add_float)
; Emits the LLVM IR for one exclusive scan function called
; __exclusive_scan_ plus the suffix argument. The emitted function takes the
; per-lane values and a lane mask and returns a vector where lane 0 holds the
; identity value and each later lane holds the operator folded over all
; earlier lanes (masked-off lanes are replaced by the identity first).
; NOTE: m4 rescans this text so keep quote characters and macro names out of
; any comment added here.
define(`exclusive_scan', `
define internal <$1 x $2> @__exclusive_scan_$6(<$1 x $2> %v,
<$1 x i32> %mask) nounwind alwaysinline {
; first, set the value of any off lanes to the identity value
; (stack slot that will hold the identity vector blended with the input)
%ptr = alloca <$1 x $2>
; splat the scalar identity into every lane using an all-zero shuffle mask
%idvec1 = bitcast $2 $5 to <1 x $2>
%idvec = shufflevector <1 x $2> %idvec1, <1 x $2> undef,
<$1 x i32> < forloop(i, 0, eval($1-2), `i32 0, ') i32 0 >
store <$1 x $2> %idvec, <$1 x $2> * %ptr
; view both the slot and the input as integer vectors of the element bit
; width because that is the type the blend store below is called with
%ptr`'$3 = bitcast <$1 x $2> * %ptr to <$1 x i`'$3> *
%vi = bitcast <$1 x $2> %v to <$1 x i`'$3>
; presumably this only overwrites lanes whose mask is on -- the helper is
; declared elsewhere so confirm against its definition
call void @__masked_store_blend_$3(<$1 x i`'$3> * %ptr`'$3, <$1 x i`'$3> %vi,
<$1 x i32> %mask)
; reload through the original element type
%v_id = load <$1 x $2> * %ptr
; extract elements of the vector to use in computing the scan
forloop(i, 0, eval($1-1), `
%v`'i = extractelement <$1 x $2> %v_id, i32 i')
; and just compute the scan directly.
; 0th element is the identity (so nothing to do here),
; 1st element is identity (op) the 0th element of the original vector,
; each successive element is the previous element (op) the previous element
; of the original vector
%s1 = $4 $2 $5, %v0
forloop(i, 2, eval($1-1), `
%s`'i = $4 $2 %s`'eval(i-1), %v`'eval(i-1)')
; and fill in the result vector
%r0 = insertelement <$1 x $2> undef, $2 $5, i32 0 ; 0th element gets identity
forloop(i, 1, eval($1-1), `
%r`'i = insertelement <$1 x $2> %r`'eval(i-1), $2 %s`'i, i32 i')
; the last insert holds the fully populated result vector
ret <$1 x $2> %r`'eval($1-1)
}
')
; Instantiates every exclusive scan function for one vector width (the single
; macro argument): add over i32 / float / i64 / double and bitwise and / or
; over i32 / i64.
define(`scans', `
; addition: the identity is zero (zeroinitializer for floating point)
exclusive_scan($1, i32, 32, add, 0, add_i32)
exclusive_scan($1, float, 32, fadd, zeroinitializer, add_float)
exclusive_scan($1, i64, 64, add, 0, add_i64)
exclusive_scan($1, double, 64, fadd, zeroinitializer, add_double)
; bitwise and: the identity is all bits set
exclusive_scan($1, i32, 32, and, -1, and_i32)
exclusive_scan($1, i64, 64, and, -1, and_i64)
; bitwise or: the identity is zero
exclusive_scan($1, i32, 32, or, 0, or_i32)
exclusive_scan($1, i64, 64, or, 0, or_i64)
')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; per_lane
;;

View File

@@ -101,6 +101,7 @@ Contents:
+ `"Inline" Aggressively`_
+ `Small Performance Tricks`_
+ `Instrumenting Your ISPC Programs`_
+ `Using Scan Operations For Variable Output`_
* `Disclaimer and Legal Information`_
@@ -1852,6 +1853,44 @@ There are also variants of these functions that return the value as a
The value returned by the ``reduce_equal()`` function is undefined if
it is called when none of the program instances are running.
There are also a number of functions to compute "scan"s of values across
the program instances. For example, the ``exclusive_scan_add()`` function
computes, for each program instance, the sum of the given value over all of
the preceding program instances. (The scans currently available in
``ispc`` are all so-called "exclusive" scans, meaning that the value
computed for a given element does not include the value provided for that
element.) In C code, an exclusive add scan over an array might be
implemented as:
::
/* Exclusive add scan: result_array[i] receives the sum of in_array[0..i-1].
 * Both arrays must hold at least `count` elements; nothing is written when
 * count is zero or negative. */
void scan_add(int *in_array, int *result_array, int count) {
    if (count <= 0) {
        return;
    }
    /* Element 0 has no predecessors, so it gets the additive identity. */
    result_array[0] = 0;
    /* Start at 1 so that the i-1 accesses stay inside both arrays and the
     * seed stored in result_array[0] is not overwritten. */
    for (int i = 1; i < count; ++i) {
        result_array[i] = result_array[i-1] + in_array[i-1];
    }
}
``ispc`` provides the following scan functions--addition, bitwise-and, and
bitwise-or are available:
::
int32 exclusive_scan_add(int32 v)
unsigned int32 exclusive_scan_add(unsigned int32 v)
float exclusive_scan_add(float v)
int64 exclusive_scan_add(int64 v)
unsigned int64 exclusive_scan_add(unsigned int64 v)
double exclusive_scan_add(double v)
int32 exclusive_scan_and(int32 v)
unsigned int32 exclusive_scan_and(unsigned int32 v)
int64 exclusive_scan_and(int64 v)
unsigned int64 exclusive_scan_and(unsigned int64 v)
int32 exclusive_scan_or(int32 v)
unsigned int32 exclusive_scan_or(unsigned int32 v)
int64 exclusive_scan_or(int64 v)
unsigned int64 exclusive_scan_or(unsigned int64 v)
Packed Load and Store Operations
--------------------------------
@@ -2760,6 +2799,38 @@ active upon function entry.
ao.ispc(0088) - function entry: 36928 calls (0 / 0.00% all off!), 97.40% active lanes
...
Using Scan Operations For Variable Output
-----------------------------------------
One important application of the ``exclusive_scan_add()`` function in the
standard library is when program instances want to generate a variable amount
of output and when one would like that output to be densely packed in a
single array. For example, consider the code fragment below:
::
uniform int func(uniform float outArray[], ...) {
int numOut = ...; // figure out how many to be output
float outLocal[MAX_OUT]; // staging area
// put results in outLocal[0], ..., outLocal[numOut-1]
int startOffset = exclusive_scan_add(numOut);
for (int i = 0; i < numOut; ++i)
outArray[startOffset + i] = outLocal[i];
return reduce_add(numOut);
}
Here, each program instance has computed a number, ``numOut``, of values to
output, and has stored them in the ``outLocal`` array. Assume that four
program instances are running and that the first one wants to output one
value, the second two values, and the third and fourth three values each.
In this case, ``exclusive_scan_add()`` will return the values (0, 1, 3, 6)
to the four program instances, respectively. The first program instance
will write its one result to ``outArray[0]``, the second will write its two
values to ``outArray[1]`` and ``outArray[2]``, and so forth. The
``reduce_add`` call at the end returns the total number of values that the
program instances have written to the array.
Disclaimer and Legal Information
================================

View File

@@ -487,6 +487,62 @@ REDUCE_EQUAL(int64, int64)
REDUCE_EQUAL(unsigned int64, int64)
REDUCE_EQUAL(double, double)
// Exclusive add scan across program instances, one overload per element
// type. Each overload forwards the value and the current __mask to the
// matching __exclusive_scan_add_* builtin. Signed and unsigned overloads of
// the same width share one builtin.
// NOTE(review): only the signed integer overloads cast __mask to int32;
// the others pass it as-is -- presumably an implicit conversion applies,
// confirm against the builtin declarations.
static int32 exclusive_scan_add(int32 v) {
return __exclusive_scan_add_i32(v, (int32)__mask);
}
static unsigned int32 exclusive_scan_add(unsigned int32 v) {
return __exclusive_scan_add_i32(v, __mask);
}
static float exclusive_scan_add(float v) {
return __exclusive_scan_add_float(v, __mask);
}
static int64 exclusive_scan_add(int64 v) {
return __exclusive_scan_add_i64(v, (int32)__mask);
}
static unsigned int64 exclusive_scan_add(unsigned int64 v) {
return __exclusive_scan_add_i64(v, __mask);
}
static double exclusive_scan_add(double v) {
return __exclusive_scan_add_double(v, __mask);
}
// Exclusive bitwise-and scan across program instances for 32- and 64-bit
// integers. Each overload forwards the value and the current __mask to the
// matching __exclusive_scan_and_* builtin; signed and unsigned overloads of
// the same width share one builtin.
static int32 exclusive_scan_and(int32 v) {
return __exclusive_scan_and_i32(v, (int32)__mask);
}
static unsigned int32 exclusive_scan_and(unsigned int32 v) {
return __exclusive_scan_and_i32(v, __mask);
}
static int64 exclusive_scan_and(int64 v) {
return __exclusive_scan_and_i64(v, (int32)__mask);
}
static unsigned int64 exclusive_scan_and(unsigned int64 v) {
return __exclusive_scan_and_i64(v, __mask);
}
// Exclusive bitwise-or scan across program instances for 32- and 64-bit
// integers. Each overload forwards the value and the current __mask to the
// matching __exclusive_scan_or_* builtin; signed and unsigned overloads of
// the same width share one builtin.
static int32 exclusive_scan_or(int32 v) {
return __exclusive_scan_or_i32(v, (int32)__mask);
}
static unsigned int32 exclusive_scan_or(unsigned int32 v) {
return __exclusive_scan_or_i32(v, __mask);
}
static int64 exclusive_scan_or(int64 v) {
return __exclusive_scan_or_i64(v, (int32)__mask);
}
static unsigned int64 exclusive_scan_or(unsigned int64 v) {
return __exclusive_scan_or_i64(v, __mask);
}
///////////////////////////////////////////////////////////////////////////
// packed load, store

View File

@@ -0,0 +1,12 @@
// Test: exclusive add scan of the int32 programIndex with no conditional
// around the call.
export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
RET[programIndex] = exclusive_scan_add(programIndex);
}
export void result(uniform float RET[]) {
// Expected: lane i holds 0 + 1 + ... + (i-1), so lanes 0 and 1 are both 0.
// The table has 17 entries and is indexed by programIndex.
uniform int result[] = { 0, 0, 1, 3, 6, 10, 15, 21, 28,
36, 45, 55, 66, 78, 91, 105, 120 };
RET[programIndex] = result[programIndex];
}

View File

@@ -0,0 +1,20 @@
// Test: exclusive add scan on unsigned int64 with only the odd-numbered
// program instances calling the scan; even lanes keep the -1 sentinel.
export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
RET[programIndex] = -1;
unsigned int64 a = aFOO[programIndex];
if (programIndex & 1) {
RET[programIndex] = exclusive_scan_add(a);
}
}
export void result(uniform float RET[]) {
// Odd entries are running sums of 2, 4, 6, ... which matches the scan if
// aFOO[i] == i + 1 (harness-defined -- confirm). Even entries are unused.
uniform int result[16] = { 0, 0, 0, 2, 0, 6, 0, 12,
0, 20, 0, 30, 0, 42, 0, 56 };
if (programIndex & 1)
RET[programIndex] = result[programIndex];
else
RET[programIndex] = -1;
}

View File

@@ -0,0 +1,12 @@
// Test: exclusive add scan of the float input array with no conditional
// around the call.
export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
RET[programIndex] = exclusive_scan_add(aFOO[programIndex]);
}
export void result(uniform float RET[]) {
// Running sums 0, 1, 1+2, 1+2+3, ... -- consistent with aFOO[i] == i + 1
// (harness-defined -- confirm). Indexed by programIndex.
uniform int result[] = { 0, 1, 3, 6, 10, 15, 21, 28,
36, 45, 55, 66, 78, 91, 105, 120, 136 };
RET[programIndex] = result[programIndex];
}

View File

@@ -0,0 +1,17 @@
// Test: exclusive add scan on float where only lanes with a <= 2 call the
// scan; all other lanes keep the -1 sentinel.
export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
RET[programIndex] = -1;
float a = aFOO[programIndex];
if (a <= 2)
RET[programIndex] = exclusive_scan_add(a);
}
export void result(uniform float RET[]) {
// Only entries 0 and 1 are read below; that matches a <= 2 selecting the
// first two lanes if aFOO[i] == i + 1 (harness-defined -- confirm).
uniform int result[] = { 0, 1, 3, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0 };
RET[programIndex] = -1;
if (programIndex <= 1)
RET[programIndex] = result[programIndex];
}

View File

@@ -0,0 +1,17 @@
// Test: exclusive add scan on float restricted to lanes with a <= 2; the
// remaining lanes keep the -1 sentinel.
// NOTE(review): as shown here this file is identical to the preceding test
// file -- confirm whether a different element type was intended.
export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
RET[programIndex] = -1;
float a = aFOO[programIndex];
if (a <= 2)
RET[programIndex] = exclusive_scan_add(a);
}
export void result(uniform float RET[]) {
// Only entries 0 and 1 are read below (lanes 0 and 1).
uniform int result[] = { 0, 1, 3, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0 };
RET[programIndex] = -1;
if (programIndex <= 1)
RET[programIndex] = result[programIndex];
}

View File

@@ -0,0 +1,20 @@
// Test: exclusive add scan on float with only the odd-numbered program
// instances calling the scan; even lanes keep the -1 sentinel.
export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
RET[programIndex] = -1;
float a = aFOO[programIndex];
if (programIndex & 1) {
RET[programIndex] = exclusive_scan_add(a);
}
}
export void result(uniform float RET[]) {
// Odd entries are running sums of 2, 4, 6, ... which matches the scan if
// aFOO[i] == i + 1 (harness-defined -- confirm). Even entries are unused.
uniform int result[16] = { 0, 0, 0, 2, 0, 6, 0, 12,
0, 20, 0, 30, 0, 42, 0, 56 };
if (programIndex & 1)
RET[programIndex] = result[programIndex];
else
RET[programIndex] = -1;
}

View File

@@ -0,0 +1,12 @@
// Test: exclusive add scan of programIndex converted to float, with no
// conditional around the call.
export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
RET[programIndex] = exclusive_scan_add((float)programIndex);
}
export void result(uniform float RET[]) {
// Expected: lane i holds 0 + 1 + ... + (i-1), so lanes 0 and 1 are both 0.
uniform int result[] = { 0, 0, 1, 3, 6, 10, 15, 21, 28,
36, 45, 55, 66, 78, 91, 105, 120 };
RET[programIndex] = result[programIndex];
}

View File

@@ -0,0 +1,12 @@
// Test: exclusive add scan of the input array converted to double, with no
// conditional around the call.
export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
RET[programIndex] = exclusive_scan_add((double)aFOO[programIndex]);
}
export void result(uniform float RET[]) {
// Running sums 0, 1, 1+2, 1+2+3, ... -- consistent with aFOO[i] == i + 1
// (harness-defined -- confirm). Indexed by programIndex.
uniform int result[] = { 0, 1, 3, 6, 10, 15, 21, 28,
36, 45, 55, 66, 78, 91, 105, 120, 136 };
RET[programIndex] = result[programIndex];
}

View File

@@ -0,0 +1,17 @@
// Test: exclusive add scan on int64 where only lanes with a <= 2 call the
// scan; all other lanes keep the -1 sentinel.
export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
RET[programIndex] = -1;
int64 a = aFOO[programIndex];
if (a <= 2)
RET[programIndex] = exclusive_scan_add(a);
}
export void result(uniform float RET[]) {
// Only entries 0 and 1 are read below; that matches a <= 2 selecting the
// first two lanes if aFOO[i] == i + 1 (harness-defined -- confirm).
uniform int result[] = { 0, 1, 3, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0 };
RET[programIndex] = -1;
if (programIndex <= 1)
RET[programIndex] = result[programIndex];
}

View File

@@ -0,0 +1,17 @@
// Test: exclusive add scan on unsigned int64 where only lanes with a <= 2
// call the scan; all other lanes keep the -1 sentinel.
export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
RET[programIndex] = -1;
unsigned int64 a = aFOO[programIndex];
if (a <= 2)
RET[programIndex] = exclusive_scan_add(a);
}
export void result(uniform float RET[]) {
// Only entries 0 and 1 are read below; that matches a <= 2 selecting the
// first two lanes if aFOO[i] == i + 1 (harness-defined -- confirm).
uniform int result[] = { 0, 1, 3, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0 };
RET[programIndex] = -1;
if (programIndex <= 1)
RET[programIndex] = result[programIndex];
}

View File

@@ -0,0 +1,22 @@
// Test: exclusive bitwise-and scan on int32 with only the odd-numbered
// program instances calling the scan; each of them contributes 0xff.
export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
RET[programIndex] = -1;
int32 a = (programIndex & 1) ? 0xff : 0;
if (programIndex & 1) {
RET[programIndex] = exclusive_scan_and(a);
}
}
export void result(uniform float RET[]) {
if (programIndex & 1) {
// Lane 1 is the first lane inside the conditional, so it is expected to
// receive the and-identity (all bits set, i.e. -1 as a signed value).
if (programIndex == 1)
RET[programIndex] = -1; // 0xffffffff, made float
else
// Later odd lanes see the and of one or more 0xff values.
RET[programIndex] = 0xff;
}
else
// Even lanes never call the scan and keep the sentinel.
RET[programIndex] = -1;
}

View File

@@ -0,0 +1,21 @@
// Test: exclusive bitwise-and scan on int32 with only the even-numbered
// program instances calling the scan; lane i contributes all bits set
// except bit i.
export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
RET[programIndex] = -1;
int32 a = ~(1 << programIndex);
if ((programIndex & 1) == 0) {
RET[programIndex] = exclusive_scan_and(a);
}
}
export void result(uniform float RET[]) {
// Odd lanes keep the sentinel; lane 0 is expected to get the and-identity,
// which is also -1, so it is covered by this default.
RET[programIndex] = -1;
if ((programIndex & 1) == 0 && programIndex > 0) {
int val = 0xffffffff;
// Clear the bit of every even lane strictly before this one.
for (int i = 0; i < programIndex-1; i += 2)
val &= ~(1<<i);
RET[programIndex] = val;
}
}

View File

@@ -0,0 +1,13 @@
// Test: exclusive bitwise-or scan on int32 with no conditional around the
// call; lane i contributes only bit i.
export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
RET[programIndex] = -1;
int32 a = (1 << programIndex);
RET[programIndex] = exclusive_scan_or(a);
}
export void result(uniform float RET[]) {
// Or-ing bits 0 .. i-1 gives a value with the low i bits set.
RET[programIndex] = (1 << programIndex) - 1;
}