diff --git a/builtins.m4 b/builtins.m4 index 3c59a1f7..fe6990f4 100644 --- a/builtins.m4 +++ b/builtins.m4 @@ -622,40 +622,6 @@ forloop(i, 1, eval($1-1), ` } ') -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; global_atomic -;; Defines the implementation of a function that handles the mapping from -;; an ispc atomic function to the underlying LLVM intrinsics. Specifically, -;; the function handles loooping over the active lanes, calling the underlying -;; scalar atomic intrinsic for each one, and assembling the vector result. -;; -;; Takes four parameters: -;; $1: vector width of the target -;; $2: operation being performed (w.r.t. LLVM atomic intrinsic names) -;; (add, sub...) -;; $3: return type of the LLVM atomic (e.g. i32) -;; $4: return type of the LLVM atomic type, in ispc naming paralance (e.g. int32) - -define(`global_atomic', ` - -declare $3 @llvm.atomic.load.$2.$3.p0$3($3 * %ptr, $3 %delta) - -define internal <$1 x $3> @__atomic_$2_$4_global($3 * %ptr, <$1 x $3> %val, - <$1 x i32> %mask) nounwind alwaysinline { - %rptr = alloca <$1 x $3> - %rptr32 = bitcast <$1 x $3> * %rptr to $3 * - - per_lane($1, <$1 x i32> %mask, ` - %v_LANE_ID = extractelement <$1 x $3> %val, i32 LANE - %r_LANE_ID = call $3 @llvm.atomic.load.$2.$3.p0$3($3 * %ptr, $3 %v_LANE_ID) - %rp_LANE_ID = getelementptr $3 * %rptr32, i32 LANE - store $3 %r_LANE_ID, $3 * %rp_LANE_ID') - - %r = load <$1 x $3> * %rptr - ret <$1 x $3> %r -} -') - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; global_atomic_associative @@ -681,8 +647,6 @@ define internal <$1 x $3> @__atomic_$2_$4_global($3 * %ptr, <$1 x $3> %val, define(`global_atomic_associative', ` -declare $3 @llvm.atomic.load.$2.$3.p0$3($3 * %ptr, $3 %delta) - ;; note that the mask is expected to be of type $3, so the caller must ensure ;; that for 64-bit types, the mask is cast to a signed int before being passed ;; to this so that it is properly sign extended... 
(The code in stdlib.ispc @@ -751,7 +715,7 @@ define(`global_atomic_uniform', ` declare $3 @llvm.atomic.load.$2.$3.p0$3($3 * %ptr, $3 %delta) -define internal $3 @__atomic_$2_$4_global($3 * %ptr, $3 %val, +define internal $3 @__atomic_$2_uniform_$4_global($3 * %ptr, $3 %val, <$1 x i32> %mask) nounwind alwaysinline { %r = call $3 @llvm.atomic.load.$2.$3.p0$3($3 * %ptr, $3 %val) ret $3 %r @@ -764,9 +728,10 @@ define internal $3 @__atomic_$2_$4_global($3 * %ptr, $3 %val, ;; $2: llvm type of the vector elements (e.g. i32) ;; $3: ispc type of the elements (e.g. int32) -define(`global_swap', ` +declare i32 @llvm.atomic.swap.i32.p0i32(i32 * %ptr, i32 %val) +declare i64 @llvm.atomic.swap.i64.p0i64(i64 * %ptr, i64 %val) -declare $2 @llvm.atomic.swap.$2.p0$2($2 * %ptr, $2 %val) +define(`global_swap', ` define internal <$1 x $2> @__atomic_swap_$3_global($2* %ptr, <$1 x $2> %val, <$1 x i32> %mask) nounwind alwaysinline { @@ -782,6 +747,12 @@ define internal <$1 x $2> @__atomic_swap_$3_global($2* %ptr, <$1 x $2> %val, %r = load <$1 x $2> * %rptr ret <$1 x $2> %r } + +define internal $2 @__atomic_swap_uniform_$3_global($2* %ptr, $2 %val, + <$1 x i32> %mask) nounwind alwaysinline { + %r = call $2 @llvm.atomic.swap.$2.p0$2($2 * %ptr, $2 %val) + ret $2 %r +} ') @@ -811,6 +782,12 @@ define internal <$1 x $2> @__atomic_compare_exchange_$3_global($2* %ptr, <$1 x $ %r = load <$1 x $2> * %rptr ret <$1 x $2> %r } + +define internal $2 @__atomic_compare_exchange_uniform_$3_global($2* %ptr, $2 %cmp, + $2 %val, <$1 x i32> %mask) nounwind alwaysinline { + %r = call $2 @llvm.atomic.cmp.swap.$2.p0$2($2 * %ptr, $2 %cmp, $2 %val) + ret $2 %r +} ') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -1228,6 +1205,11 @@ global_atomic_associative($1, sub, i32, int32, 0) global_atomic_associative($1, and, i32, int32, -1) global_atomic_associative($1, or, i32, int32, 0) global_atomic_associative($1, xor, i32, int32, 0) +global_atomic_uniform($1, add, i32, int32) 
+global_atomic_uniform($1, sub, i32, int32) +global_atomic_uniform($1, and, i32, int32) +global_atomic_uniform($1, or, i32, int32) +global_atomic_uniform($1, xor, i32, int32) global_atomic_uniform($1, min, i32, int32) global_atomic_uniform($1, max, i32, int32) global_atomic_uniform($1, umin, i32, uint32) @@ -1238,6 +1220,11 @@ global_atomic_associative($1, sub, i64, int64, 0) global_atomic_associative($1, and, i64, int64, -1) global_atomic_associative($1, or, i64, int64, 0) global_atomic_associative($1, xor, i64, int64, 0) +global_atomic_uniform($1, add, i64, int64) +global_atomic_uniform($1, sub, i64, int64) +global_atomic_uniform($1, and, i64, int64) +global_atomic_uniform($1, or, i64, int64) +global_atomic_uniform($1, xor, i64, int64) global_atomic_uniform($1, min, i64, int64) global_atomic_uniform($1, max, i64, int64) global_atomic_uniform($1, umin, i64, uint64) @@ -1264,6 +1251,24 @@ define internal <$1 x double> @__atomic_swap_double_global(double * %ptr, <$1 x ret <$1 x double> %ret } +define internal float @__atomic_swap_uniform_float_global(float * %ptr, float %val, + <$1 x i32> %mask) nounwind alwaysinline { + %iptr = bitcast float * %ptr to i32 * + %ival = bitcast float %val to i32 + %iret = call i32 @__atomic_swap_uniform_int32_global(i32 * %iptr, i32 %ival, <$1 x i32> %mask) + %ret = bitcast i32 %iret to float + ret float %ret +} + +define internal double @__atomic_swap_uniform_double_global(double * %ptr, double %val, + <$1 x i32> %mask) nounwind alwaysinline { + %iptr = bitcast double * %ptr to i64 * + %ival = bitcast double %val to i64 + %iret = call i64 @__atomic_swap_uniform_int64_global(i64 * %iptr, i64 %ival, <$1 x i32> %mask) + %ret = bitcast i64 %iret to double + ret double %ret +} + global_atomic_exchange($1, i32, int32) global_atomic_exchange($1, i64, int64) @@ -1288,6 +1293,29 @@ define internal <$1 x double> @__atomic_compare_exchange_double_global(double * %ret = bitcast <$1 x i64> %iret to <$1 x double> ret <$1 x double> %ret } + +define 
internal float @__atomic_compare_exchange_uniform_float_global(float * %ptr, float %cmp, float %val, + <$1 x i32> %mask) nounwind alwaysinline { + %iptr = bitcast float * %ptr to i32 * + %icmp = bitcast float %cmp to i32 + %ival = bitcast float %val to i32 + %iret = call i32 @__atomic_compare_exchange_uniform_int32_global(i32 * %iptr, i32 %icmp, + i32 %ival, <$1 x i32> %mask) + %ret = bitcast i32 %iret to float + ret float %ret +} + +define internal double @__atomic_compare_exchange_uniform_double_global(double * %ptr, double %cmp, + double %val, <$1 x i32> %mask) nounwind alwaysinline { + %iptr = bitcast double * %ptr to i64 * + %icmp = bitcast double %cmp to i64 + %ival = bitcast double %val to i64 + %iret = call i64 @__atomic_compare_exchange_uniform_int64_global(i64 * %iptr, i64 %icmp, + i64 %ival, <$1 x i32> %mask) + %ret = bitcast i64 %iret to double + ret double %ret +} + ') diff --git a/docs/ispc.txt b/docs/ispc.txt index 8bc559b4..f6415e85 100644 --- a/docs/ispc.txt +++ b/docs/ispc.txt @@ -2033,12 +2033,12 @@ end.) One thing to note is that that the value being added to here is a ``uniform`` integer, while the increment amount and the return value are -``varying``. In other words, the semantics are that each running program -instance individually issues the atomic operation with its own ``delta`` -value and gets the previous value of ``val`` back in return. The atomics -for the running program instances may be issued in arbitrary order; it's -not guaranteed that they will be issued in ``programIndex`` order, for -example. +``varying``. In other words, the semantics of this call are that each +running program instance individually issues the atomic operation with its +own ``delta`` value and gets the previous value of ``val`` back in return. +The atomics for the running program instances may be issued in arbitrary +order; it's not guaranteed that they will be issued in ``programIndex`` +order, for example. 
Here are the declarations of the ``int32`` variants of these functions. There are also ``int64`` equivalents as well as variants that take @@ -2056,17 +2056,44 @@ function can be used with ``float`` and ``double`` types as well.) int32 atomic_xor_global(reference uniform int32 val, int32 value) int32 atomic_swap_global(reference uniform int32 val, int32 newval) -There is also an atomic "compare and exchange" function; it atomically -compares the value in "val" to "compare"--if they match, it assigns -"newval" to "val". In either case, the old value of "val" is returned. -(As with the other atomic operations, there are also ``unsigned`` and -64-bit variants of this function. Furthermore, there are ``float`` and -``double`` variants as well.) +There are also variants of these functions that take ``uniform`` values for +the operand and return a ``uniform`` result: :: + uniform int32 atomic_add_global(reference uniform int32 val, + uniform int32 value) + uniform int32 atomic_subtract_global(reference uniform int32 val, + uniform int32 value) + uniform int32 atomic_min_global(reference uniform int32 val, + uniform int32 value) + uniform int32 atomic_max_global(reference uniform int32 val, + uniform int32 value) + uniform int32 atomic_and_global(reference uniform int32 val, + uniform int32 value) + uniform int32 atomic_or_global(reference uniform int32 val, + uniform int32 value) + uniform int32 atomic_xor_global(reference uniform int32 val, + uniform int32 value) + uniform int32 atomic_swap_global(reference uniform int32 val, + uniform int32 newval) + +There are also atomic swap and "compare and exchange" functions. +Compare and exchange atomically compares the value in "val" to +"compare"--if they match, it assigns "newval" to "val". In either case, +the old value of "val" is returned. (As with the other atomic operations, +there are also ``unsigned`` and 64-bit variants of this function. +Furthermore, there are ``float`` and ``double`` variants as well.) 
+ +:: + + int32 atomic_swap_global(reference uniform int32 val, int32 new) + uniform int32 atomic_swap_global(reference uniform int32 val, + uniform int32 new) int32 atomic_compare_exchange_global(reference uniform int32 val, int32 compare, int32 newval) + uniform int32 atomic_compare_exchange_global(reference uniform int32 val, + uniform int32 compare, uniform int32 newval) ``ispc`` also has a standard library routine that inserts a memory barrier into the code; it ensures that all memory reads and writes prior to be diff --git a/stdlib.ispc b/stdlib.ispc index 36d90313..0b848422 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -581,6 +581,13 @@ static inline TA atomic_##OPA##_global(uniform reference TA ref, TA value) { \ TA ret = __atomic_##OPB##_##TB##_global(ref, value, (MASKTYPE)__mask); \ memory_barrier(); \ return ret; \ +} \ +static inline uniform TA atomic_##OPA##_global(uniform reference TA ref, \ + uniform TA value) { \ + memory_barrier(); \ + uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ref, value, (MASKTYPE)__mask); \ + memory_barrier(); \ + return ret; \ } #define DEFINE_ATOMIC_MINMAX_OP(TA,TB,OPA,OPB) \ @@ -589,10 +596,17 @@ static inline TA atomic_##OPA##_global(uniform reference TA ref, TA value) { \ TA ret; \ if (lanemask() != 0) { \ memory_barrier(); \ - ret = __atomic_##OPB##_##TB##_global(ref, oneval, __mask); \ + ret = __atomic_##OPB##_uniform_##TB##_global(ref, oneval, __mask); \ memory_barrier(); \ } \ return ret; \ +} \ +static inline uniform TA atomic_##OPA##_global(uniform reference TA ref, \ + uniform TA value) { \ + memory_barrier(); \ + uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ref, value, __mask); \ + memory_barrier(); \ + return ret; \ } DEFINE_ATOMIC_OP(int32,int32,add,add,int32) @@ -648,6 +662,13 @@ static inline TA atomic_compare_exchange_global( \ TA ret = __atomic_compare_exchange_##TB##_global(ref, oldval, newval, __mask); \ memory_barrier(); \ return ret; \ +} \ +static inline uniform TA 
atomic_compare_exchange_global( \ + uniform reference TA ref, uniform TA oldval, uniform TA newval) { \ + memory_barrier(); \ + uniform TA ret = __atomic_compare_exchange_uniform_##TB##_global(ref, oldval, newval, __mask); \ + memory_barrier(); \ + return ret; \ } ATOMIC_DECL_CMPXCHG(int32, int32) diff --git a/tests/atomics-1.ispc b/tests/atomics-1.ispc index 12d64d85..abe04e55 100644 --- a/tests/atomics-1.ispc +++ b/tests/atomics-1.ispc @@ -5,7 +5,8 @@ uniform unsigned int32 s = 0; export void f_f(uniform float RET[], uniform float aFOO[]) { float a = aFOO[programIndex]; - float b = atomic_add_global(s, 1); + float delta = 1; + float b = atomic_add_global(s, delta); RET[programIndex] = reduce_add(b); } diff --git a/tests/atomics-10.ispc b/tests/atomics-10.ispc index b950988a..c033b0bf 100644 --- a/tests/atomics-10.ispc +++ b/tests/atomics-10.ispc @@ -6,8 +6,9 @@ uniform unsigned int32 s = 0; export void f_f(uniform float RET[], uniform float aFOO[]) { float a = aFOO[programIndex]; float b = 0; + float delta = 1; if (programIndex < 2) - b = atomic_add_global(s, 1); + b = atomic_add_global(s, delta); RET[programIndex] = s; } diff --git a/tests/atomics-2.ispc b/tests/atomics-2.ispc index c32f52dd..9af4281c 100644 --- a/tests/atomics-2.ispc +++ b/tests/atomics-2.ispc @@ -5,7 +5,8 @@ uniform int64 s = 0; export void f_f(uniform float RET[], uniform float aFOO[]) { float a = aFOO[programIndex]; - float b = atomic_add_global(s, 1); + float delta = 1; + float b = atomic_add_global(s, delta); RET[programIndex] = reduce_add(b); } diff --git a/tests/atomics-3.ispc b/tests/atomics-3.ispc index c7282fdb..9b68a90b 100644 --- a/tests/atomics-3.ispc +++ b/tests/atomics-3.ispc @@ -5,7 +5,8 @@ uniform int32 s = 0xff; export void f_f(uniform float RET[], uniform float aFOO[]) { float a = aFOO[programIndex]; - float b = atomic_xor_global(s, 0xfffffff0); + int32 bits = 0xfffffff0; + float b = atomic_xor_global(s, bits); RET[programIndex] = s; } diff --git a/tests/atomics-9.ispc 
b/tests/atomics-9.ispc index c038adc4..1d7ff5ec 100644 --- a/tests/atomics-9.ispc +++ b/tests/atomics-9.ispc @@ -6,8 +6,9 @@ uniform unsigned int32 s = 0; export void f_f(uniform float RET[], uniform float aFOO[]) { float a = aFOO[programIndex]; float b = 0; + int32 delta = 1; if (programIndex < 2) - b = atomic_add_global(s, 1); + b = atomic_add_global(s, delta); RET[programIndex] = reduce_add(b); } diff --git a/tests/atomics-uniform-1.ispc b/tests/atomics-uniform-1.ispc new file mode 100644 index 00000000..8455deb9 --- /dev/null +++ b/tests/atomics-uniform-1.ispc @@ -0,0 +1,14 @@ + +export uniform int width() { return programCount; } + +uniform unsigned int32 s = 10; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + float a = aFOO[programIndex]; + uniform unsigned int32 b = atomic_add_global(s, 1); + RET[programIndex] = s; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 11; +} diff --git a/tests/atomics-uniform-2.ispc b/tests/atomics-uniform-2.ispc new file mode 100644 index 00000000..b878d430 --- /dev/null +++ b/tests/atomics-uniform-2.ispc @@ -0,0 +1,14 @@ + +export uniform int width() { return programCount; } + +uniform unsigned int32 s = 0b1010; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + float a = aFOO[programIndex]; + uniform unsigned int32 b = atomic_or_global(s, 1); + RET[programIndex] = s; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0b1011; +} diff --git a/tests/atomics-uniform-3.ispc b/tests/atomics-uniform-3.ispc new file mode 100644 index 00000000..5b16b249 --- /dev/null +++ b/tests/atomics-uniform-3.ispc @@ -0,0 +1,14 @@ + +export uniform int width() { return programCount; } + +uniform unsigned int32 s = 0b1010; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + float a = aFOO[programIndex]; + uniform unsigned int32 b = atomic_or_global(s, 1); + RET[programIndex] = b; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0b1010; +} diff 
--git a/tests/atomics-uniform-4.ispc b/tests/atomics-uniform-4.ispc new file mode 100644 index 00000000..2e400faa --- /dev/null +++ b/tests/atomics-uniform-4.ispc @@ -0,0 +1,14 @@ + +export uniform int width() { return programCount; } + +uniform unsigned int32 s = 0xffff; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + float a = aFOO[programIndex]; + uniform unsigned int32 b = atomic_min_global(s, 1); + RET[programIndex] = b; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0xffff; +} diff --git a/tests/atomics-uniform-5.ispc b/tests/atomics-uniform-5.ispc new file mode 100644 index 00000000..ac0b849e --- /dev/null +++ b/tests/atomics-uniform-5.ispc @@ -0,0 +1,14 @@ + +export uniform int width() { return programCount; } + +uniform unsigned int32 s = 0xffff; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + float a = aFOO[programIndex]; + uniform unsigned int32 b = atomic_min_global(s, 1); + RET[programIndex] = s; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1; +} diff --git a/tests/atomics-uniform-6.ispc b/tests/atomics-uniform-6.ispc new file mode 100644 index 00000000..4161cd65 --- /dev/null +++ b/tests/atomics-uniform-6.ispc @@ -0,0 +1,14 @@ + +export uniform int width() { return programCount; } + +uniform float s = 100.; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + float a = aFOO[programIndex]; + uniform float b = atomic_swap_global(s, 1.); + RET[programIndex] = s; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1.; +} diff --git a/tests/atomics-uniform-7.ispc b/tests/atomics-uniform-7.ispc new file mode 100644 index 00000000..a7d3816b --- /dev/null +++ b/tests/atomics-uniform-7.ispc @@ -0,0 +1,14 @@ + +export uniform int width() { return programCount; } + +uniform float s = 100.; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + float a = aFOO[programIndex]; + uniform float b = atomic_swap_global(s, 1.); + RET[programIndex] 
= b; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 100.; +} diff --git a/tests/atomics-uniform-8.ispc b/tests/atomics-uniform-8.ispc new file mode 100644 index 00000000..a8f89cc5 --- /dev/null +++ b/tests/atomics-uniform-8.ispc @@ -0,0 +1,14 @@ + +export uniform int width() { return programCount; } + +uniform float s = 100.; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + float a = aFOO[programIndex]; + uniform float b = atomic_compare_exchange_global(s, 1., -100.); + RET[programIndex] = b; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 100.; +} diff --git a/tests/atomics-uniform-9.ispc b/tests/atomics-uniform-9.ispc new file mode 100644 index 00000000..ce632f5c --- /dev/null +++ b/tests/atomics-uniform-9.ispc @@ -0,0 +1,14 @@ + +export uniform int width() { return programCount; } + +uniform int64 s = 100.; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + float a = aFOO[programIndex]; + uniform int64 b = atomic_compare_exchange_global(s, 100, -100); + RET[programIndex] = s; +} + +export void result(uniform float RET[]) { + RET[programIndex] = -100.; +}