diff --git a/docs/ispc.rst b/docs/ispc.rst
index 345d6119..aa15158d 100644
--- a/docs/ispc.rst
+++ b/docs/ispc.rst
@@ -3389,24 +3389,53 @@ Systems Programming Support
 Atomic Operations and Memory Fences
 -----------------------------------
 
-The usual range of atomic memory operations are provided in ``ispc``,
-including variants to handle both uniform and varying types.  As a first
-example, consider on variant of the 32-bit integer atomic add routine:
+The standard range of atomic memory operations is provided by the ``ispc``
+standard library, including variants to handle both uniform and varying
+types as well as "local" and "global" atomics.
+
+Local atomics provide atomic behavior across the program instances in a
+gang, but not across multiple gangs or memory operations in different
+hardware threads.  To see why they are needed, consider a histogram
+calculation where each program instance in the gang computes which bucket a
+value lies in and then increments a corresponding counter.  If the code is
+written like this:
 
 ::
 
-    int32 atomic_add_global(uniform int32 * uniform ptr, int32 delta)
+    uniform int count[N_BUCKETS] = ...;
+    float value = ...;
+    int bucket = clamp((int)(value / N_BUCKETS), 0, N_BUCKETS - 1);
+    ++count[bucket];   // ERROR: undefined behavior if there are collisions
 
-The semantics are the expected ones for an atomic add function: the pointer
-points to a single location in memory (the same one for all program
-instances), and for each executing program instance, the value stored in
-the location that ``ptr`` points to has that program instance's value
-"delta" added to it atomically, and the old value at that location is
-returned from the function.  (Thus, if multiple processors simultaneously
-issue atomic adds to the same memory location, the adds will be serialized
-by the hardware so that the correct result is computed in the end.
-Furthermore, the atomic adds are serialized across the running program
-instances.)
+then the program's behavior is undefined: whenever multiple program
+instances have values that map to the same ``bucket``, the effect of the
+increment is undefined.  (See the discussion in the `Data Races Within a
+Gang`_ section; in the case here, there isn't a sequence point between one
+program instance updating ``count[bucket]`` and another program instance
+reading its value.)
+
+The ``atomic_add_local()`` function can be used in this case; as a local
+atomic, it is atomic across the gang of program instances, so the expected
+result is computed.
+
+::
+
+    ...
+    int bucket = clamp((int)(value / N_BUCKETS), 0, N_BUCKETS - 1);
+    atomic_add_local(&count[bucket], 1);
+
+As a first example of these functions' signatures, consider this variant of
+the 32-bit integer atomic add routine:
+
+::
+
+    int32 atomic_add_local(uniform int32 * uniform ptr, int32 delta)
+
+The semantics of this routine are the expected ones for an atomic add
+function: the pointer points to a single location in memory (the same one
+for all program instances), and for each executing program instance, the
+value stored in the location that ``ptr`` points to has that program
+instance's value "delta" added to it atomically, and the old value at that
+location is returned from the function.
 
 One thing to note is that the type of the value being added to is a
 ``uniform`` integer, while the increment amount and the return value are
@@ -3417,45 +3446,76 @@ atomics for the running program instances may be issued in arbitrary order;
 it's not guaranteed that they will be issued in ``programIndex`` order, for
 example.
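+
+As an illustrative sketch (not part of the standard library itself), the
+per-instance return value can be used to have each program instance claim a
+distinct slot in a shared buffer; the names ``MAX_ITEMS``, ``list``,
+``numItems``, and ``append()`` here are hypothetical:
+
+::
+
+    #define MAX_ITEMS 1024
+
+    uniform float list[MAX_ITEMS];   // shared output buffer
+    uniform int numItems = 0;        // number of valid entries in list[]
+
+    void append(float value) {
+        // Each executing program instance atomically bumps the shared
+        // counter and gets back its own distinct old value of numItems,
+        // in no particular order.
+        int index = atomic_add_local(&numItems, (varying int32)1);
+        list[index] = value;
+    }
+
+(The ``(varying int32)`` cast matters for the reason discussed below:
+passing a ``uniform`` value such as the literal ``1`` would select the
+``uniform`` variant, which performs just one atomic for the whole gang.)
+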
-Here are the declarations of the ``int32`` variants of these functions.
-There are also ``int64`` equivalents as well as variants that take
-``unsigned`` ``int32`` and ``int64`` values.  (The ``atomic_swap_global()``
-function can be used with ``float`` and ``double`` types as well.)
+Global atomics are more powerful than local atomics; they are atomic both
+across the program instances in the gang and across different gangs and
+different hardware threads.  For example, with the global variant of the
+atomic add used above,
 
 ::
 
-    int32 atomic_add_global(uniform int32 * uniform ptr, int32 value)
-    int32 atomic_subtract_global(uniform int32 * uniform ptr, int32 value)
-    int32 atomic_min_global(uniform int32 * uniform ptr, int32 value)
-    int32 atomic_max_global(uniform int32 * uniform ptr, int32 value)
-    int32 atomic_and_global(uniform int32 * uniform ptr, int32 value)
-    int32 atomic_or_global(uniform int32 * uniform ptr, int32 value)
-    int32 atomic_xor_global(uniform int32 * uniform ptr, int32 value)
-    int32 atomic_swap_global(uniform int32 * uniform ptr, int32 value)
+    int32 atomic_add_global(uniform int32 * uniform ptr, int32 delta)
 
-There are also variants of these functions that take ``uniform`` values for
-the operand and return a ``uniform`` result.  These correspond to a single
+if multiple processors simultaneously issue atomic adds to the same memory
+location, the adds will be serialized by the hardware so that the correct
+result is computed in the end.
+
+Here are the declarations of the ``int32`` variants of these functions.
+There are also ``int64`` equivalents as well as variants that take
+``unsigned`` ``int32`` and ``int64`` values.
+
+::
+
+    int32 atomic_add_{local,global}(uniform int32 * uniform ptr, int32 value)
+    int32 atomic_subtract_{local,global}(uniform int32 * uniform ptr, int32 value)
+    int32 atomic_min_{local,global}(uniform int32 * uniform ptr, int32 value)
+    int32 atomic_max_{local,global}(uniform int32 * uniform ptr, int32 value)
+    int32 atomic_and_{local,global}(uniform int32 * uniform ptr, int32 value)
+    int32 atomic_or_{local,global}(uniform int32 * uniform ptr, int32 value)
+    int32 atomic_xor_{local,global}(uniform int32 * uniform ptr, int32 value)
+    int32 atomic_swap_{local,global}(uniform int32 * uniform ptr, int32 value)
+
+Support for ``float`` and ``double`` types is also available.  For local
+atomics, all but the logical operations are available; the ``float``
+variants are listed here, and there are corresponding ``double`` variants
+as well.
+
+::
+
+    float atomic_add_local(uniform float * uniform ptr, float value)
+    float atomic_subtract_local(uniform float * uniform ptr, float value)
+    float atomic_min_local(uniform float * uniform ptr, float value)
+    float atomic_max_local(uniform float * uniform ptr, float value)
+    float atomic_swap_local(uniform float * uniform ptr, float value)
+
+For global atomics, only atomic swap is available for these types (plus the
+compare-and-exchange functions described below):
+
+::
+
+    float atomic_swap_global(uniform float * uniform ptr, float value)
+    double atomic_swap_global(uniform double * uniform ptr, double value)
+
+There are also variants of these atomics that take ``uniform`` values for
+the operand and return a ``uniform`` result.  These correspond to a single
 atomic operation being performed for the entire gang of program instances,
 rather than one per program instance.
:: - uniform int32 atomic_add_global(uniform int32 * uniform ptr, - uniform int32 value) - uniform int32 atomic_subtract_global(uniform int32 * uniform ptr, - uniform int32 value) - uniform int32 atomic_min_global(uniform int32 * uniform ptr, - uniform int32 value) - uniform int32 atomic_max_global(uniform int32 * uniform ptr, - uniform int32 value) - uniform int32 atomic_and_global(uniform int32 * uniform ptr, - uniform int32 value) - uniform int32 atomic_or_global(uniform int32 * uniform ptr, - uniform int32 value) - uniform int32 atomic_xor_global(uniform int32 * uniform ptr, - uniform int32 value) - uniform int32 atomic_swap_global(uniform int32 * uniform ptr, - uniform int32 newval) + uniform int32 atomic_add_{local,global}(uniform int32 * uniform ptr, + uniform int32 value) + uniform int32 atomic_subtract_{local,global}(uniform int32 * uniform ptr, + uniform int32 value) + uniform int32 atomic_min_{local,global}(uniform int32 * uniform ptr, + uniform int32 value) + uniform int32 atomic_max_{local,global}(uniform int32 * uniform ptr, + uniform int32 value) + uniform int32 atomic_and_{local,global}(uniform int32 * uniform ptr, + uniform int32 value) + uniform int32 atomic_or_{local,global}(uniform int32 * uniform ptr, + uniform int32 value) + uniform int32 atomic_xor_{local,global}(uniform int32 * uniform ptr, + uniform int32 value) + uniform int32 atomic_swap_{local,global}(uniform int32 * uniform ptr, + uniform int32 newval) Be careful that you use the atomic function that you mean to; consider the following code: @@ -3479,8 +3539,7 @@ will cause the desired atomic add function to be called. :: extern uniform int32 counter; - int32 one = 1; - int32 myCounter = atomic_add_global(&counter, one); + int32 myCounter = atomic_add_global(&counter, (varying int32)1); There is a third variant of each of these atomic functions that takes a ``varying`` pointer; this allows each program instance to issue an atomic @@ -3490,30 +3549,27 @@ the same location in memory!) :: - int32 atomic_add_global(uniform int32 * varying ptr, int32 value) - int32 atomic_subtract_global(uniform int32 * varying ptr, int32 value) - int32 atomic_min_global(uniform int32 * varying ptr, int32 value) - int32 atomic_max_global(uniform int32 * varying ptr, int32 value) - int32 atomic_and_global(uniform int32 * varying ptr, int32 value) - int32 atomic_or_global(uniform int32 * varying ptr, int32 value) - int32 atomic_xor_global(uniform int32 * varying ptr, int32 value) - int32 atomic_swap_global(uniform int32 * varying ptr, int32 value) + int32 atomic_add_{local,global}(uniform int32 * varying ptr, int32 value) + int32 atomic_subtract_{local,global}(uniform int32 * varying ptr, int32 value) + int32 atomic_min_{local,global}(uniform int32 * varying ptr, int32 value) + int32 atomic_max_{local,global}(uniform int32 * varying ptr, int32 value) + int32 atomic_and_{local,global}(uniform int32 * varying ptr, int32 value) + int32 atomic_or_{local,global}(uniform int32 * varying ptr, int32 value) + int32 atomic_xor_{local,global}(uniform int32 * varying ptr, int32 value) + int32 atomic_swap_{local,global}(uniform int32 * varying ptr, int32 value) -There are also atomic swap and "compare and exchange" functions. -Compare and exchange atomically compares the value in "val" to -"compare"--if they match, it assigns "newval" to "val". In either case, -the old value of "val" is returned. (As with the other atomic operations, -there are also ``unsigned`` and 64-bit variants of this function. 
-Furthermore, there are ``float`` and ``double`` variants as well.) +There are also atomic "compare and exchange" functions. Compare and +exchange atomically compares the value in "val" to "compare"--if they +match, it assigns "newval" to "val". In either case, the old value of +"val" is returned. (As with the other atomic operations, there are also +``unsigned`` and 64-bit variants of this function. Furthermore, there are +``float`` and ``double`` variants as well.) :: - int32 atomic_swap_global(uniform int32 * uniform ptr, int32 newvalue) - uniform int32 atomic_swap_global(uniform int32 * uniform ptr, - uniform int32 newvalue) - int32 atomic_compare_exchange_global(uniform int32 * uniform ptr, - int32 compare, int32 newval) - uniform int32 atomic_compare_exchange_global(uniform int32 * uniform ptr, + int32 atomic_compare_exchange_{local,global}(uniform int32 * uniform ptr, + int32 compare, int32 newval) + uniform int32 atomic_compare_exchange_{local,global}(uniform int32 * uniform ptr, uniform int32 compare, uniform int32 newval) ``ispc`` also has a standard library routine that inserts a memory barrier diff --git a/examples/aobench/ao.ispc b/examples/aobench/ao.ispc index ffd85d29..61c2dc7d 100644 --- a/examples/aobench/ao.ispc +++ b/examples/aobench/ao.ispc @@ -212,104 +212,44 @@ static void ao_scanlines(uniform int y0, uniform int y1, uniform int w, RNGState rngstate; seed_rng(&rngstate, y0); + float invSamples = 1.f / nsubsamples; - // Compute the mapping between the 'programCount'-wide program - // instances running in parallel and samples in the image. - // - // For now, we'll always take four samples per pixel, so start by - // initializing du and dv with offsets into subpixel samples. We'll - // take care of further updating du and dv for the case where we're - // doing more than 4 program instances in parallel shortly. - uniform float uSteps[4] = { 0, 1, 0, 1 }; - uniform float vSteps[4] = { 0, 0, 1, 1 }; - float du = uSteps[programIndex % 4] / nsubsamples; - float dv = vSteps[programIndex % 4] / nsubsamples; + foreach_tiled(y = y0 ... y1, x = 0 ... w, + u = 0 ... nsubsamples, v = 0 ... nsubsamples) { + float du = (float)u * invSamples, dv = (float)v * invSamples; - // Now handle the case where we are able to do more than one pixel's - // worth of work at once. nx records the number of pixels in the x - // direction we do per iteration and ny the number in y. - uniform int nx = 1, ny = 1; + // Figure out x,y pixel in NDC + float px = (x + du - (w / 2.0f)) / (w / 2.0f); + float py = -(y + dv - (h / 2.0f)) / (h / 2.0f); + float ret = 0.f; + Ray ray; + Isect isect; - // FIXME: We actually need ny to be 1 regardless of the decomposition, - // since the task decomposition is one scanline high. + ray.org = 0.f; - if (programCount == 8) { - // Do two pixels at once in the x direction - nx = 2; - if (programIndex >= 4) - // And shift the offsets for the second pixel's worth of work - ++du; - } - else if (programCount == 16) { - nx = 4; - ny = 1; - if (programIndex >= 4 && programIndex < 8) - ++du; - if (programIndex >= 8 && programIndex < 12) - du += 2; - if (programIndex >= 12) - du += 3; - } + // Poor man's perspective projection + ray.dir.x = px; + ray.dir.y = py; + ray.dir.z = -1.0; + vnormalize(ray.dir); - // Now loop over all of the pixels, stepping in x and y as calculated - // above. (Assumes that ny divides y and nx divides x...) 
- for (uniform int y = y0; y < y1; y += ny) { - for (uniform int x = 0; x < w; x += nx) { - // Figure out x,y pixel in NDC - float px = (x + du - (w / 2.0f)) / (w / 2.0f); - float py = -(y + dv - (h / 2.0f)) / (h / 2.0f); - float ret = 0.f; - Ray ray; - Isect isect; + isect.t = 1.0e+17; + isect.hit = 0; - ray.org = 0.f; + for (uniform int snum = 0; snum < 3; ++snum) + ray_sphere_intersect(isect, ray, spheres[snum]); + ray_plane_intersect(isect, ray, plane); - // Poor man's perspective projection - ray.dir.x = px; - ray.dir.y = py; - ray.dir.z = -1.0; - vnormalize(ray.dir); + // Note use of 'coherent' if statement; the set of rays we + // trace will often all hit or all miss the scene + cif (isect.hit) { + ret = ambient_occlusion(isect, plane, spheres, rngstate); + ret *= invSamples * invSamples; - isect.t = 1.0e+17; - isect.hit = 0; - - for (uniform int snum = 0; snum < 3; ++snum) - ray_sphere_intersect(isect, ray, spheres[snum]); - ray_plane_intersect(isect, ray, plane); - - // Note use of 'coherent' if statement; the set of rays we - // trace will often all hit or all miss the scene - cif (isect.hit) - ret = ambient_occlusion(isect, plane, spheres, rngstate); - - // This is a little grungy; we have results for - // programCount-worth of values. Because we're doing 2x2 - // subsamples, we need to peel them off in groups of four, - // average the four values for each pixel, and update the - // output image. - // - // Store the varying value to a uniform array of the same size. - // See the discussion about communication among program - // instances in the ispc user's manual for more discussion on - // this idiom. - uniform float retArray[programCount]; - retArray[programIndex] = ret; - - // offset to the first pixel in the image - uniform int offset = 3 * (y * w + x); - for (uniform int p = 0; p < programCount; p += 4, offset += 3) { - // Get the four sample values for this pixel - uniform float sumret = retArray[p] + retArray[p+1] + retArray[p+2] + - retArray[p+3]; - - // Normalize by number of samples taken - sumret /= nsubsamples * nsubsamples; - - // Store result in the image - image[offset+0] = sumret; - image[offset+1] = sumret; - image[offset+2] = sumret; - } + int offset = 3 * (y * w + x); + atomic_add_local(&image[offset], ret); + atomic_add_local(&image[offset+1], ret); + atomic_add_local(&image[offset+2], ret); } } } diff --git a/stdlib.ispc b/stdlib.ispc index 0fe5e8ea..5bc931ec 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -795,217 +795,6 @@ static inline uniform int64 clock() { return __clock(); } -/////////////////////////////////////////////////////////////////////////// -// Atomics and memory barriers - -static inline void memory_barrier() { - __memory_barrier(); -} - -#define DEFINE_ATOMIC_OP(TA,TB,OPA,OPB,MASKTYPE) \ -static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \ - memory_barrier(); \ - TA ret = __atomic_##OPB##_##TB##_global(ptr, value, (MASKTYPE)__mask); \ - memory_barrier(); \ - return ret; \ -} \ -static inline uniform TA atomic_##OPA##_global(uniform TA * uniform ptr, \ - uniform TA value) { \ - memory_barrier(); \ - uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ptr, value); \ - memory_barrier(); \ - return ret; \ -} \ -static inline TA atomic_##OPA##_global(uniform TA * varying ptr, TA value) { \ - uniform TA * uniform ptrArray[programCount]; \ - ptrArray[programIndex] = ptr; \ - memory_barrier(); \ - TA ret; \ - uniform int mask = lanemask(); \ - for (uniform int i = 0; i < programCount; ++i) { \ - if ((mask & (1 << i)) 
== 0) \ - continue; \ - uniform TA * uniform p = ptrArray[i]; \ - uniform TA v = extract(value, i); \ - uniform TA r = __atomic_##OPB##_uniform_##TB##_global(p, v); \ - ret = insert(ret, i, r); \ - } \ - memory_barrier(); \ - return ret; \ -} \ - -#define DEFINE_ATOMIC_SWAP(TA,TB) \ -static inline TA atomic_swap_global(uniform TA * uniform ptr, TA value) { \ - memory_barrier(); \ - uniform int i = 0; \ - TA ret[programCount]; \ - TA memVal; \ - uniform int lastSwap; \ - uniform int mask = lanemask(); \ - /* First, have the first running program instance (if any) perform \ - the swap with memory with its value of "value"; record the \ - value returned. */ \ - for (; i < programCount; ++i) { \ - if ((mask & (1 << i)) == 0) \ - continue; \ - memVal = __atomic_swap_uniform_##TB##_global(ptr, extract(value, i)); \ - lastSwap = i; \ - break; \ - } \ - /* Now, for all of the remaining running program instances, set the \ - return value of the last instance that did a swap with this \ - instance's value of "value"; this gives the same effect as if the \ - current instance had executed a hardware atomic swap right before \ - the last one that did a swap. */ \ - for (; i < programCount; ++i) { \ - if ((mask & (1 << i)) == 0) \ - continue; \ - ret[lastSwap] = extract(value, i); \ - lastSwap = i; \ - } \ - /* And the last instance that wanted to swap gets the value we \ - originally got back from memory... */ \ - ret[lastSwap] = memVal; \ - memory_barrier(); \ - return ret[programIndex]; \ -} \ -static inline uniform TA atomic_swap_global(uniform TA * uniform ptr, \ - uniform TA value) { \ - memory_barrier(); \ - uniform TA ret = __atomic_swap_uniform_##TB##_global(ptr, value); \ - memory_barrier(); \ - return ret; \ -} \ -static inline TA atomic_swap_global(uniform TA * varying ptr, TA value) { \ - uniform TA * uniform ptrArray[programCount]; \ - ptrArray[programIndex] = ptr; \ - memory_barrier(); \ - TA ret; \ - uniform int mask = lanemask(); \ - for (uniform int i = 0; i < programCount; ++i) { \ - if ((mask & (1 << i)) == 0) \ - continue; \ - uniform TA * uniform p = ptrArray[i]; \ - uniform TA v = extract(value, i); \ - uniform TA r = __atomic_swap_uniform_##TB##_global(p, v); \ - ret = insert(ret, i, r); \ - } \ - memory_barrier(); \ - return ret; \ -} \ - -#define DEFINE_ATOMIC_MINMAX_OP(TA,TB,OPA,OPB) \ -static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \ - uniform TA oneval = reduce_##OPA(value); \ - TA ret; \ - if (lanemask() != 0) { \ - memory_barrier(); \ - ret = __atomic_##OPB##_uniform_##TB##_global(ptr, oneval); \ - memory_barrier(); \ - } \ - return ret; \ -} \ -static inline uniform TA atomic_##OPA##_global(uniform TA * uniform ptr, \ - uniform TA value) { \ - memory_barrier(); \ - uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ptr, value); \ - memory_barrier(); \ - return ret; \ -} \ -static inline TA atomic_##OPA##_global(uniform TA * varying ptr, \ - TA value) { \ - uniform TA * uniform ptrArray[programCount]; \ - ptrArray[programIndex] = ptr; \ - memory_barrier(); \ - TA ret; \ - uniform int mask = lanemask(); \ - for (uniform int i = 0; i < programCount; ++i) { \ - if ((mask & (1 << i)) == 0) \ - continue; \ - uniform TA * uniform p = ptrArray[i]; \ - uniform TA v = extract(value, i); \ - uniform TA r = __atomic_##OPB##_uniform_##TB##_global(p, v); \ - ret = insert(ret, i, r); \ - } \ - memory_barrier(); \ - return ret; \ -} - -DEFINE_ATOMIC_OP(int32,int32,add,add,IntMaskType) -DEFINE_ATOMIC_OP(int32,int32,subtract,sub,IntMaskType) 
-DEFINE_ATOMIC_MINMAX_OP(int32,int32,min,min) -DEFINE_ATOMIC_MINMAX_OP(int32,int32,max,max) -DEFINE_ATOMIC_OP(int32,int32,and,and,IntMaskType) -DEFINE_ATOMIC_OP(int32,int32,or,or,IntMaskType) -DEFINE_ATOMIC_OP(int32,int32,xor,xor,IntMaskType) -DEFINE_ATOMIC_SWAP(int32,int32) - -// For everything but atomic min and max, we can use the same -// implementations for unsigned as for signed. -DEFINE_ATOMIC_OP(unsigned int32,int32,add,add,UIntMaskType) -DEFINE_ATOMIC_OP(unsigned int32,int32,subtract,sub,UIntMaskType) -DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,min,umin) -DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,max,umax) -DEFINE_ATOMIC_OP(unsigned int32,int32,and,and,UIntMaskType) -DEFINE_ATOMIC_OP(unsigned int32,int32,or,or,UIntMaskType) -DEFINE_ATOMIC_OP(unsigned int32,int32,xor,xor,UIntMaskType) -DEFINE_ATOMIC_SWAP(unsigned int32,int32) - -DEFINE_ATOMIC_SWAP(float,float) - -DEFINE_ATOMIC_OP(int64,int64,add,add,IntMaskType) -DEFINE_ATOMIC_OP(int64,int64,subtract,sub,IntMaskType) -DEFINE_ATOMIC_MINMAX_OP(int64,int64,min,min) -DEFINE_ATOMIC_MINMAX_OP(int64,int64,max,max) -DEFINE_ATOMIC_OP(int64,int64,and,and,IntMaskType) -DEFINE_ATOMIC_OP(int64,int64,or,or,IntMaskType) -DEFINE_ATOMIC_OP(int64,int64,xor,xor,IntMaskType) -DEFINE_ATOMIC_SWAP(int64,int64) - -// For everything but atomic min and max, we can use the same -// implementations for unsigned as for signed. -DEFINE_ATOMIC_OP(unsigned int64,int64,add,add,UIntMaskType) -DEFINE_ATOMIC_OP(unsigned int64,int64,subtract,sub,UIntMaskType) -DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,min,umin) -DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,max,umax) -DEFINE_ATOMIC_OP(unsigned int64,int64,and,and,UIntMaskType) -DEFINE_ATOMIC_OP(unsigned int64,int64,or,or,UIntMaskType) -DEFINE_ATOMIC_OP(unsigned int64,int64,xor,xor,UIntMaskType) -DEFINE_ATOMIC_SWAP(unsigned int64,int64) - -DEFINE_ATOMIC_SWAP(double,double) - -#undef DEFINE_ATOMIC_OP -#undef DEFINE_ATOMIC_MINMAX_OP -#undef DEFINE_ATOMIC_SWAP - -#define ATOMIC_DECL_CMPXCHG(TA, TB, MASKTYPE) \ -static inline TA atomic_compare_exchange_global( \ - uniform TA * uniform ptr, TA oldval, TA newval) { \ - memory_barrier(); \ - TA ret = __atomic_compare_exchange_##TB##_global(ptr, oldval, newval, \ - (MASKTYPE)__mask); \ - memory_barrier(); \ - return ret; \ -} \ -static inline uniform TA atomic_compare_exchange_global( \ - uniform TA * uniform ptr, uniform TA oldval, uniform TA newval) { \ - memory_barrier(); \ - uniform TA ret = \ - __atomic_compare_exchange_uniform_##TB##_global(ptr, oldval, newval); \ - memory_barrier(); \ - return ret; \ -} - -ATOMIC_DECL_CMPXCHG(int32, int32, IntMaskType) -ATOMIC_DECL_CMPXCHG(unsigned int32, int32, UIntMaskType) -ATOMIC_DECL_CMPXCHG(float, float, IntMaskType) -ATOMIC_DECL_CMPXCHG(int64, int64, IntMaskType) -ATOMIC_DECL_CMPXCHG(unsigned int64, int64, UIntMaskType) -ATOMIC_DECL_CMPXCHG(double, double, IntMaskType) - -#undef ATOMIC_DECL_CMPXCHG - /////////////////////////////////////////////////////////////////////////// // Floating-Point Math @@ -1389,6 +1178,400 @@ static inline uniform int64 clamp(uniform int64 v, uniform int64 low, return min(max(v, low), high); } +/////////////////////////////////////////////////////////////////////////// +// Global atomics and memory barriers + +static inline void memory_barrier() { + __memory_barrier(); +} + +#define DEFINE_ATOMIC_OP(TA,TB,OPA,OPB,MASKTYPE) \ +static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \ + memory_barrier(); \ + TA ret = __atomic_##OPB##_##TB##_global(ptr, value, 
(MASKTYPE)__mask); \ + memory_barrier(); \ + return ret; \ +} \ +static inline uniform TA atomic_##OPA##_global(uniform TA * uniform ptr, \ + uniform TA value) { \ + memory_barrier(); \ + uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ptr, value); \ + memory_barrier(); \ + return ret; \ +} \ +static inline TA atomic_##OPA##_global(uniform TA * varying ptr, TA value) { \ + uniform TA * uniform ptrArray[programCount]; \ + ptrArray[programIndex] = ptr; \ + memory_barrier(); \ + TA ret; \ + uniform int mask = lanemask(); \ + for (uniform int i = 0; i < programCount; ++i) { \ + if ((mask & (1 << i)) == 0) \ + continue; \ + uniform TA * uniform p = ptrArray[i]; \ + uniform TA v = extract(value, i); \ + uniform TA r = __atomic_##OPB##_uniform_##TB##_global(p, v); \ + ret = insert(ret, i, r); \ + } \ + memory_barrier(); \ + return ret; \ +} \ + +#define DEFINE_ATOMIC_SWAP(TA,TB) \ +static inline TA atomic_swap_global(uniform TA * uniform ptr, TA value) { \ + memory_barrier(); \ + uniform int i = 0; \ + TA ret[programCount]; \ + TA memVal; \ + uniform int lastSwap; \ + uniform int mask = lanemask(); \ + /* First, have the first running program instance (if any) perform \ + the swap with memory with its value of "value"; record the \ + value returned. */ \ + for (; i < programCount; ++i) { \ + if ((mask & (1 << i)) == 0) \ + continue; \ + memVal = __atomic_swap_uniform_##TB##_global(ptr, extract(value, i)); \ + lastSwap = i; \ + break; \ + } \ + /* Now, for all of the remaining running program instances, set the \ + return value of the last instance that did a swap with this \ + instance's value of "value"; this gives the same effect as if the \ + current instance had executed a hardware atomic swap right before \ + the last one that did a swap. */ \ + for (; i < programCount; ++i) { \ + if ((mask & (1 << i)) == 0) \ + continue; \ + ret[lastSwap] = extract(value, i); \ + lastSwap = i; \ + } \ + /* And the last instance that wanted to swap gets the value we \ + originally got back from memory... 
*/ \ + ret[lastSwap] = memVal; \ + memory_barrier(); \ + return ret[programIndex]; \ +} \ +static inline uniform TA atomic_swap_global(uniform TA * uniform ptr, \ + uniform TA value) { \ + memory_barrier(); \ + uniform TA ret = __atomic_swap_uniform_##TB##_global(ptr, value); \ + memory_barrier(); \ + return ret; \ +} \ +static inline TA atomic_swap_global(uniform TA * varying ptr, TA value) { \ + uniform TA * uniform ptrArray[programCount]; \ + ptrArray[programIndex] = ptr; \ + memory_barrier(); \ + TA ret; \ + uniform int mask = lanemask(); \ + for (uniform int i = 0; i < programCount; ++i) { \ + if ((mask & (1 << i)) == 0) \ + continue; \ + uniform TA * uniform p = ptrArray[i]; \ + uniform TA v = extract(value, i); \ + uniform TA r = __atomic_swap_uniform_##TB##_global(p, v); \ + ret = insert(ret, i, r); \ + } \ + memory_barrier(); \ + return ret; \ +} \ + +#define DEFINE_ATOMIC_MINMAX_OP(TA,TB,OPA,OPB) \ +static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \ + uniform TA oneval = reduce_##OPA(value); \ + TA ret; \ + if (lanemask() != 0) { \ + memory_barrier(); \ + ret = __atomic_##OPB##_uniform_##TB##_global(ptr, oneval); \ + memory_barrier(); \ + } \ + return ret; \ +} \ +static inline uniform TA atomic_##OPA##_global(uniform TA * uniform ptr, \ + uniform TA value) { \ + memory_barrier(); \ + uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ptr, value); \ + memory_barrier(); \ + return ret; \ +} \ +static inline TA atomic_##OPA##_global(uniform TA * varying ptr, \ + TA value) { \ + uniform TA * uniform ptrArray[programCount]; \ + ptrArray[programIndex] = ptr; \ + memory_barrier(); \ + TA ret; \ + uniform int mask = lanemask(); \ + for (uniform int i = 0; i < programCount; ++i) { \ + if ((mask & (1 << i)) == 0) \ + continue; \ + uniform TA * uniform p = ptrArray[i]; \ + uniform TA v = extract(value, i); \ + uniform TA r = __atomic_##OPB##_uniform_##TB##_global(p, v); \ + ret = insert(ret, i, r); \ + } \ + memory_barrier(); \ + return ret; \ +} + +DEFINE_ATOMIC_OP(int32,int32,add,add,IntMaskType) +DEFINE_ATOMIC_OP(int32,int32,subtract,sub,IntMaskType) +DEFINE_ATOMIC_MINMAX_OP(int32,int32,min,min) +DEFINE_ATOMIC_MINMAX_OP(int32,int32,max,max) +DEFINE_ATOMIC_OP(int32,int32,and,and,IntMaskType) +DEFINE_ATOMIC_OP(int32,int32,or,or,IntMaskType) +DEFINE_ATOMIC_OP(int32,int32,xor,xor,IntMaskType) +DEFINE_ATOMIC_SWAP(int32,int32) + +// For everything but atomic min and max, we can use the same +// implementations for unsigned as for signed. +DEFINE_ATOMIC_OP(unsigned int32,int32,add,add,UIntMaskType) +DEFINE_ATOMIC_OP(unsigned int32,int32,subtract,sub,UIntMaskType) +DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,min,umin) +DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,max,umax) +DEFINE_ATOMIC_OP(unsigned int32,int32,and,and,UIntMaskType) +DEFINE_ATOMIC_OP(unsigned int32,int32,or,or,UIntMaskType) +DEFINE_ATOMIC_OP(unsigned int32,int32,xor,xor,UIntMaskType) +DEFINE_ATOMIC_SWAP(unsigned int32,int32) + +DEFINE_ATOMIC_SWAP(float,float) + +DEFINE_ATOMIC_OP(int64,int64,add,add,IntMaskType) +DEFINE_ATOMIC_OP(int64,int64,subtract,sub,IntMaskType) +DEFINE_ATOMIC_MINMAX_OP(int64,int64,min,min) +DEFINE_ATOMIC_MINMAX_OP(int64,int64,max,max) +DEFINE_ATOMIC_OP(int64,int64,and,and,IntMaskType) +DEFINE_ATOMIC_OP(int64,int64,or,or,IntMaskType) +DEFINE_ATOMIC_OP(int64,int64,xor,xor,IntMaskType) +DEFINE_ATOMIC_SWAP(int64,int64) + +// For everything but atomic min and max, we can use the same +// implementations for unsigned as for signed. 
+DEFINE_ATOMIC_OP(unsigned int64,int64,add,add,UIntMaskType) +DEFINE_ATOMIC_OP(unsigned int64,int64,subtract,sub,UIntMaskType) +DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,min,umin) +DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,max,umax) +DEFINE_ATOMIC_OP(unsigned int64,int64,and,and,UIntMaskType) +DEFINE_ATOMIC_OP(unsigned int64,int64,or,or,UIntMaskType) +DEFINE_ATOMIC_OP(unsigned int64,int64,xor,xor,UIntMaskType) +DEFINE_ATOMIC_SWAP(unsigned int64,int64) + +DEFINE_ATOMIC_SWAP(double,double) + +#undef DEFINE_ATOMIC_OP +#undef DEFINE_ATOMIC_MINMAX_OP +#undef DEFINE_ATOMIC_SWAP + +#define ATOMIC_DECL_CMPXCHG(TA, TB, MASKTYPE) \ +static inline TA atomic_compare_exchange_global( \ + uniform TA * uniform ptr, TA oldval, TA newval) { \ + memory_barrier(); \ + TA ret = __atomic_compare_exchange_##TB##_global(ptr, oldval, newval, \ + (MASKTYPE)__mask); \ + memory_barrier(); \ + return ret; \ +} \ +static inline uniform TA atomic_compare_exchange_global( \ + uniform TA * uniform ptr, uniform TA oldval, uniform TA newval) { \ + memory_barrier(); \ + uniform TA ret = \ + __atomic_compare_exchange_uniform_##TB##_global(ptr, oldval, newval); \ + memory_barrier(); \ + return ret; \ +} + +ATOMIC_DECL_CMPXCHG(int32, int32, IntMaskType) +ATOMIC_DECL_CMPXCHG(unsigned int32, int32, UIntMaskType) +ATOMIC_DECL_CMPXCHG(float, float, IntMaskType) +ATOMIC_DECL_CMPXCHG(int64, int64, IntMaskType) +ATOMIC_DECL_CMPXCHG(unsigned int64, int64, UIntMaskType) +ATOMIC_DECL_CMPXCHG(double, double, IntMaskType) + +#undef ATOMIC_DECL_CMPXCHG + +/////////////////////////////////////////////////////////////////////////// +// local atomics + +#define LOCAL_ATOMIC(TYPE,NAME,OPFUNC) \ +static inline uniform TYPE atomic_##NAME##_local(uniform TYPE * uniform ptr, \ + uniform TYPE value) { \ + uniform TYPE ret = *ptr; \ + *ptr = OPFUNC(*ptr, value); \ + return ret; \ +} \ +static inline TYPE atomic_##NAME##_local(uniform TYPE * uniform ptr, TYPE value) { \ + TYPE ret; \ + uniform int mask = lanemask(); \ + for (uniform int i = 0; i < programCount; ++i) { \ + if ((mask & (1 << i)) == 0) \ + continue; \ + ret = insert(ret, i, *ptr); \ + *ptr = OPFUNC(*ptr, extract(value, i)); \ + } \ + return ret; \ +} \ +static inline TYPE atomic_##NAME##_local(uniform TYPE * p, TYPE value) { \ + TYPE ret; \ + uniform TYPE * uniform ptrs[programCount]; \ + ptrs[programIndex] = p; \ + uniform int mask = lanemask(); \ + for (uniform int i = 0; i < programCount; ++i) { \ + if ((mask & (1 << i)) == 0) \ + continue; \ + ret = insert(ret, i, *ptrs[i]); \ + *ptrs[i] = OPFUNC(*ptrs[i], extract(value, i)); \ + } \ + return ret; \ +} + +static inline uniform int32 __add(uniform int32 a, uniform int32 b) { return a+b; } +static inline uniform int32 __sub(uniform int32 a, uniform int32 b) { return a-b; } +static inline uniform int32 __and(uniform int32 a, uniform int32 b) { return a & b; } +static inline uniform int32 __or(uniform int32 a, uniform int32 b) { return a | b; } +static inline uniform int32 __xor(uniform int32 a, uniform int32 b) { return a ^ b; } +static inline uniform int32 __swap(uniform int32 a, uniform int32 b) { return b; } + +static inline uniform unsigned int32 __add(uniform unsigned int32 a, + uniform unsigned int32 b) { return a+b; } +static inline uniform unsigned int32 __sub(uniform unsigned int32 a, + uniform unsigned int32 b) { return a-b; } +static inline uniform unsigned int32 __and(uniform unsigned int32 a, + uniform unsigned int32 b) { return a & b; } +static inline uniform unsigned int32 __or(uniform unsigned int32 a, + uniform 
unsigned int32 b) { return a | b; }
+static inline uniform unsigned int32 __xor(uniform unsigned int32 a,
+                                           uniform unsigned int32 b) { return a ^ b; }
+static inline uniform unsigned int32 __swap(uniform unsigned int32 a,
+                                            uniform unsigned int32 b) { return b; }
+
+
+static inline uniform float __add(uniform float a, uniform float b) { return a+b; }
+static inline uniform float __sub(uniform float a, uniform float b) { return a-b; }
+static inline uniform float __swap(uniform float a, uniform float b) { return b; }
+
+static inline uniform int64 __add(uniform int64 a, uniform int64 b) { return a+b; }
+static inline uniform int64 __sub(uniform int64 a, uniform int64 b) { return a-b; }
+static inline uniform int64 __and(uniform int64 a, uniform int64 b) { return a & b; }
+static inline uniform int64 __or(uniform int64 a, uniform int64 b) { return a | b; }
+static inline uniform int64 __xor(uniform int64 a, uniform int64 b) { return a ^ b; }
+static inline uniform int64 __swap(uniform int64 a, uniform int64 b) { return b; }
+
+static inline uniform unsigned int64 __add(uniform unsigned int64 a,
+                                           uniform unsigned int64 b) { return a+b; }
+static inline uniform unsigned int64 __sub(uniform unsigned int64 a,
+                                           uniform unsigned int64 b) { return a-b; }
+static inline uniform unsigned int64 __and(uniform unsigned int64 a,
+                                           uniform unsigned int64 b) { return a & b; }
+static inline uniform unsigned int64 __or(uniform unsigned int64 a,
+                                          uniform unsigned int64 b) { return a | b; }
+static inline uniform unsigned int64 __xor(uniform unsigned int64 a,
+                                           uniform unsigned int64 b) { return a ^ b; }
+static inline uniform unsigned int64 __swap(uniform unsigned int64 a,
+                                            uniform unsigned int64 b) { return b; }
+
+static inline uniform double __add(uniform double a, uniform double b) { return a+b; }
+static inline uniform double __sub(uniform double a, uniform double b) { return a-b; }
+static inline uniform double __swap(uniform double a, uniform double b) { return b; }
+
+LOCAL_ATOMIC(int32, add, __add)
+LOCAL_ATOMIC(int32, subtract, __sub)
+LOCAL_ATOMIC(int32, and, __and)
+LOCAL_ATOMIC(int32, or, __or)
+LOCAL_ATOMIC(int32, xor, __xor)
+LOCAL_ATOMIC(int32, min, min)
+LOCAL_ATOMIC(int32, max, max)
+LOCAL_ATOMIC(int32, swap, __swap)
+
+LOCAL_ATOMIC(unsigned int32, add, __add)
+LOCAL_ATOMIC(unsigned int32, subtract, __sub)
+LOCAL_ATOMIC(unsigned int32, and, __and)
+LOCAL_ATOMIC(unsigned int32, or, __or)
+LOCAL_ATOMIC(unsigned int32, xor, __xor)
+LOCAL_ATOMIC(unsigned int32, min, min)
+LOCAL_ATOMIC(unsigned int32, max, max)
+LOCAL_ATOMIC(unsigned int32, swap, __swap)
+
+LOCAL_ATOMIC(float, add, __add)
+LOCAL_ATOMIC(float, subtract, __sub)
+LOCAL_ATOMIC(float, min, min)
+LOCAL_ATOMIC(float, max, max)
+LOCAL_ATOMIC(float, swap, __swap)
+
+LOCAL_ATOMIC(int64, add, __add)
+LOCAL_ATOMIC(int64, subtract, __sub)
+LOCAL_ATOMIC(int64, and, __and)
+LOCAL_ATOMIC(int64, or, __or)
+LOCAL_ATOMIC(int64, xor, __xor)
+LOCAL_ATOMIC(int64, min, min)
+LOCAL_ATOMIC(int64, max, max)
+LOCAL_ATOMIC(int64, swap, __swap)
+
+LOCAL_ATOMIC(unsigned int64, add, __add)
+LOCAL_ATOMIC(unsigned int64, subtract, __sub)
+LOCAL_ATOMIC(unsigned int64, and, __and)
+LOCAL_ATOMIC(unsigned int64, or, __or)
+LOCAL_ATOMIC(unsigned int64, xor, __xor)
+LOCAL_ATOMIC(unsigned int64, min, min)
+LOCAL_ATOMIC(unsigned int64, max, max)
+LOCAL_ATOMIC(unsigned int64, swap, __swap)
+
+LOCAL_ATOMIC(double, add, __add)
+LOCAL_ATOMIC(double, subtract, __sub)
+LOCAL_ATOMIC(double, min, min)
+LOCAL_ATOMIC(double, max, max)
+LOCAL_ATOMIC(double, swap, __swap)
+
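+// Note: the local atomics above only serialize the program instances
+// within the current gang (see the LOCAL_ATOMIC macro); they provide no
+// atomicity with respect to other gangs or other hardware threads, and
+// callers should not rely on the order in which the per-instance
+// operations are applied.
+//
+// Illustrative usage sketch (the names "count" and "old" are hypothetical):
+//
+//     uniform int count = 0;
+//     // each active program instance receives a distinct old value
+//     int old = atomic_add_local(&count, (varying int32)1);
+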
+// compare exchange +#define LOCAL_CMPXCHG(TYPE) \ +static inline uniform TYPE atomic_compare_exchange_local(uniform TYPE * uniform ptr, \ + uniform TYPE cmp, \ + uniform TYPE update) { \ + uniform TYPE old = *ptr; \ + if (old == cmp) \ + *ptr = update; \ + return old; \ +} \ +static inline TYPE atomic_compare_exchange_local(uniform TYPE * uniform ptr, \ + TYPE cmp, TYPE update) { \ + TYPE ret; \ + uniform int mask = lanemask(); \ + for (uniform int i = 0; i < programCount; ++i) { \ + if ((mask & (1 << i)) == 0) \ + continue; \ + uniform TYPE old = *ptr; \ + if (old == extract(cmp, i)) \ + *ptr = extract(update, i); \ + ret = insert(ret, i, old); \ + } \ + return ret; \ +} \ +static inline TYPE atomic_compare_exchange_local(uniform TYPE * varying p, \ + TYPE cmp, TYPE update) { \ + uniform TYPE * uniform ptrs[programCount]; \ + ptrs[programIndex] = p; \ + TYPE ret; \ + uniform int mask = lanemask(); \ + for (uniform int i = 0; i < programCount; ++i) { \ + if ((mask & (1 << i)) == 0) \ + continue; \ + uniform TYPE old = *ptrs[i]; \ + if (old == extract(cmp, i)) \ + *ptrs[i] = extract(update, i); \ + ret = insert(ret, i, old); \ + } \ + return ret; \ +} + +LOCAL_CMPXCHG(int32) +LOCAL_CMPXCHG(unsigned int32) +LOCAL_CMPXCHG(float) +LOCAL_CMPXCHG(int64) +LOCAL_CMPXCHG(unsigned int64) +LOCAL_CMPXCHG(double) + +#undef LOCAL_ATOMIC +#undef LOCAL_CMPXCHG + /////////////////////////////////////////////////////////////////////////// // Transcendentals (float precision) diff --git a/tests/local-atomics-1.ispc b/tests/local-atomics-1.ispc new file mode 100644 index 00000000..1b3b337a --- /dev/null +++ b/tests/local-atomics-1.ispc @@ -0,0 +1,15 @@ + +export uniform int width() { return programCount; } + +uniform unsigned int32 s = 0; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + float a = aFOO[programIndex]; + float delta = 1; + float b = atomic_add_local(&s, delta); + RET[programIndex] = reduce_add(b); +} + +export void result(uniform float RET[]) { + RET[programIndex] = reduce_add(programIndex); +} diff --git a/tests/local-atomics-10.ispc b/tests/local-atomics-10.ispc new file mode 100644 index 00000000..77eb1387 --- /dev/null +++ b/tests/local-atomics-10.ispc @@ -0,0 +1,17 @@ + +export uniform int width() { return programCount; } + +uniform unsigned int32 s = 0; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + float a = aFOO[programIndex]; + float b = 0; + float delta = 1; + if (programIndex < 2) + b = atomic_add_local(&s, delta); + RET[programIndex] = s; +} + +export void result(uniform float RET[]) { + RET[programIndex] = programCount == 1 ? 
1 : 2; +} diff --git a/tests/local-atomics-11.ispc b/tests/local-atomics-11.ispc new file mode 100644 index 00000000..ee17ef30 --- /dev/null +++ b/tests/local-atomics-11.ispc @@ -0,0 +1,20 @@ + +export uniform int width() { return programCount; } + +uniform unsigned int32 s = 0; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + float a = aFOO[programIndex]; + float b = 0; + if (programIndex & 1) + b = atomic_add_local(&s, programIndex); + RET[programIndex] = s; +} + +export void result(uniform float RET[]) { + uniform int sum = 0; + for (uniform int i = 0; i < programCount; ++i) + if (i & 1) + sum += i; + RET[programIndex] = sum; +} diff --git a/tests/local-atomics-12.ispc b/tests/local-atomics-12.ispc new file mode 100644 index 00000000..fc7938ce --- /dev/null +++ b/tests/local-atomics-12.ispc @@ -0,0 +1,20 @@ + +export uniform int width() { return programCount; } + +uniform unsigned int32 s = 0; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + float a = aFOO[programIndex]; + float b = 0; + if (programIndex & 1) + b = atomic_or_local(&s, (1 << programIndex)); + RET[programIndex] = s; +} + +export void result(uniform float RET[]) { + uniform int sum = 0; + for (uniform int i = 0; i < programCount; ++i) + if (i & 1) + sum += (1 << i); + RET[programIndex] = sum; +} diff --git a/tests/local-atomics-13.ispc b/tests/local-atomics-13.ispc new file mode 100644 index 00000000..632e34ea --- /dev/null +++ b/tests/local-atomics-13.ispc @@ -0,0 +1,16 @@ + +export uniform int width() { return programCount; } + +uniform unsigned int32 s = 0; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + float a = aFOO[programIndex]; + float b = 0; + if (programIndex & 1) + b = atomic_or_local(&s, (1 << programIndex)); + RET[programIndex] = popcnt(reduce_max((int32)b)); +} + +export void result(uniform float RET[]) { + RET[programIndex] = programCount == 1 ? 
0 : ((programCount/2) - 1); +} diff --git a/tests/local-atomics-14.ispc b/tests/local-atomics-14.ispc new file mode 100644 index 00000000..a5f7e63f --- /dev/null +++ b/tests/local-atomics-14.ispc @@ -0,0 +1,20 @@ + +export uniform int width() { return programCount; } + +uniform unsigned int64 s = 0xffffffffff000000; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + float a = aFOO[programIndex]; + float b = 0; + if (programIndex & 1) + b = atomic_or_local(&s, (1 << programIndex)); + RET[programIndex] = (s>>20); +} + +export void result(uniform float RET[]) { + uniform int sum = 0; + for (uniform int i = 0; i < programCount; ++i) + if (i & 1) + sum += (1 << i); + RET[programIndex] = ((unsigned int64)(0xffffffffff000000 | sum)) >> 20; +} diff --git a/tests/local-atomics-2.ispc b/tests/local-atomics-2.ispc new file mode 100644 index 00000000..82964afd --- /dev/null +++ b/tests/local-atomics-2.ispc @@ -0,0 +1,15 @@ + +export uniform int width() { return programCount; } + +uniform int64 s = 0; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + float a = aFOO[programIndex]; + float delta = 1; + float b = atomic_add_local(&s, delta); + RET[programIndex] = reduce_add(b); +} + +export void result(uniform float RET[]) { + RET[programIndex] = reduce_add(programIndex); +} diff --git a/tests/local-atomics-3.ispc b/tests/local-atomics-3.ispc new file mode 100644 index 00000000..558335e4 --- /dev/null +++ b/tests/local-atomics-3.ispc @@ -0,0 +1,15 @@ + +export uniform int width() { return programCount; } + +uniform int32 s = 0xff; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + float a = aFOO[programIndex]; + int32 bits = 0xfff0; + float b = atomic_xor_local(&s, bits); + RET[programIndex] = s; +} + +export void result(uniform float RET[]) { + RET[programIndex] = (programCount & 1) ? 0xff0f : 0xff; +} diff --git a/tests/local-atomics-4.ispc b/tests/local-atomics-4.ispc new file mode 100644 index 00000000..651cf4c6 --- /dev/null +++ b/tests/local-atomics-4.ispc @@ -0,0 +1,14 @@ + +export uniform int width() { return programCount; } + +uniform int32 s = 0; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + float a = aFOO[programIndex]; + float b = atomic_or_local(&s, (1<