For associative atomic ops (add, and, or, xor), we can take advantage of their associativity to issue just a single hardware atomic instruction, rather than one for each running program instance (as the previous implementation did). The basic approach is to locally compute a reduction across the active program instances with the given op and then issue a single HW atomic with that reduced value as the operand. We then take the old value of the memory location, as returned by the HW atomic op, and use it to compute the value to return to each program instance (conceptually representing the cumulative effect of all preceding program instances having performed their atomic operations). Issue #56.
21 lines
575 B
Plaintext
21 lines
575 B
Plaintext
|
|
// Returns programCount, i.e. the number of program instances running together.
export uniform int width() {
    return programCount;
}
|
|
|
|
// Global 64-bit value that f_f updates with atomic_or_global. It starts with
// the upper 40 bits set and the low 24 bits clear.
uniform unsigned int64 s = 0xffffffffff000000;
|
|
|
|
// Test kernel: each odd-numbered program instance atomically ORs its own lane
// bit (1 << programIndex) into the global `s`. Afterwards every running
// instance stores (s >> 20) into its slot of RET.
//
// RET  - output array, indexed by programIndex.
// aFOO - input array, indexed by programIndex; the loaded value is not used
//        in the result.
export void f_f(uniform float RET[], uniform float aFOO[]) {
    // Per-lane input; loaded but not referenced again in this function.
    float laneInput = aFOO[programIndex];

    // Receives the value returned by the atomic for the odd lanes; it is not
    // read afterwards.
    float oldValue = 0;
    if (programIndex & 1) {
        oldValue = atomic_or_global(s, (1 << programIndex));
    }

    // NOTE(review): the shift discards the low 20 bits of s, so lane bits
    // below bit 20 do not influence the value written here.
    RET[programIndex] = (s >> 20);
}
|
|
|
|
// Fills RET with the value built from the initial bit pattern
// 0xffffffffff000000 combined with the bits of every odd lane index below
// programCount, shifted right by 20. Presumably this is the reference output
// the test harness compares against f_f -- confirm against the harness.
export void result(uniform float RET[]) {
    // Sum of (1 << lane) over the odd lane indices; each term is a distinct
    // bit, so the sum is also the OR of those bits.
    uniform int oddLaneBits = 0;
    for (uniform int lane = 1; lane < programCount; lane += 2) {
        oddLaneBits += (1 << lane);
    }

    RET[programIndex] = ((unsigned int64)(0xffffffffff000000 | oddLaneBits)) >> 20;
}
|