Improve performance of global atomics, taking advantage of associativity.

For associative atomic ops (add, and, or, xor), we can take advantage of
their associativity to do just a single hardware atomic instruction, 
rather than one for each of the running program instances (as the previous
implementation did.)

The basic approach is to locally compute a reduction across the active
program instances with the given op and to then issue a single HW atomic
with that reduced value as the operand.  We then take the old value that
was stored in the location that is returned from the HW atomic op and
use that to compute the values to return to each of the program instances
(conceptually representing the cumulative effect of each of the preceding
program instances having performed their atomic operation.)

Issue #56.
This commit is contained in:
Matt Pharr
2011-08-31 05:35:01 -07:00
parent 96a297c747
commit e144724979
8 changed files with 224 additions and 38 deletions

16
tests/atomics-10.ispc Normal file
View File

@@ -0,0 +1,16 @@
export uniform int width() { return programCount; }
uniform unsigned int32 s = 0;
export void f_f(uniform float RET[], uniform float aFOO[]) {
float a = aFOO[programIndex];
float b = 0;
if (programIndex < 2)
b = atomic_add_global(s, 1);
RET[programIndex] = s;
}
export void result(uniform float RET[]) {
RET[programIndex] = 2;
}

20
tests/atomics-11.ispc Normal file
View File

@@ -0,0 +1,20 @@
export uniform int width() { return programCount; }
uniform unsigned int32 s = 0;
export void f_f(uniform float RET[], uniform float aFOO[]) {
float a = aFOO[programIndex];
float b = 0;
if (programIndex & 1)
b = atomic_add_global(s, programIndex);
RET[programIndex] = s;
}
export void result(uniform float RET[]) {
uniform int sum = 0;
for (uniform int i = 0; i < programCount; ++i)
if (i & 1)
sum += i;
RET[programIndex] = sum;
}

20
tests/atomics-12.ispc Normal file
View File

@@ -0,0 +1,20 @@
export uniform int width() { return programCount; }
uniform unsigned int32 s = 0;
export void f_f(uniform float RET[], uniform float aFOO[]) {
float a = aFOO[programIndex];
float b = 0;
if (programIndex & 1)
b = atomic_or_global(s, (1 << programIndex));
RET[programIndex] = s;
}
export void result(uniform float RET[]) {
uniform int sum = 0;
for (uniform int i = 0; i < programCount; ++i)
if (i & 1)
sum += (1 << i);
RET[programIndex] = sum;
}

16
tests/atomics-13.ispc Normal file
View File

@@ -0,0 +1,16 @@
export uniform int width() { return programCount; }
uniform unsigned int32 s = 0;
export void f_f(uniform float RET[], uniform float aFOO[]) {
float a = aFOO[programIndex];
float b = 0;
if (programIndex & 1)
b = atomic_or_global(s, (1 << programIndex));
RET[programIndex] = popcnt(reduce_add((int32)b));
}
export void result(uniform float RET[]) {
RET[programIndex] = (programCount/2) - 1;
}

20
tests/atomics-14.ispc Normal file
View File

@@ -0,0 +1,20 @@
export uniform int width() { return programCount; }
uniform unsigned int64 s = 0xffffffffff000000;
export void f_f(uniform float RET[], uniform float aFOO[]) {
float a = aFOO[programIndex];
float b = 0;
if (programIndex & 1)
b = atomic_or_global(s, (1 << programIndex));
RET[programIndex] = (s>>20);
}
export void result(uniform float RET[]) {
uniform int sum = 0;
for (uniform int i = 0; i < programCount; ++i)
if (i & 1)
sum += (1 << i);
RET[programIndex] = ((unsigned int64)(0xffffffffff000000 | sum)) >> 20;
}

16
tests/atomics-9.ispc Normal file
View File

@@ -0,0 +1,16 @@
export uniform int width() { return programCount; }
uniform unsigned int32 s = 0;
export void f_f(uniform float RET[], uniform float aFOO[]) {
float a = aFOO[programIndex];
float b = 0;
if (programIndex < 2)
b = atomic_add_global(s, 1);
RET[programIndex] = reduce_add(b);
}
export void result(uniform float RET[]) {
RET[programIndex] = 1;
}