diff --git a/builtins.m4 b/builtins.m4
index 59a7b6a3..714a2bd5 100644
--- a/builtins.m4
+++ b/builtins.m4
@@ -656,6 +656,84 @@ define internal <$1 x $3> @__atomic_$2_$4_global($3 * %ptr, <$1 x $3> %val,
 }
 ')

+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; global_atomic_associative
+;; More efficient implementation for atomics that are associative (e.g.,
+;; add, and, ...).  If a basic implementation would do something like:
+;;   result0 = atomic_op(ptr, val0)
+;;   result1 = atomic_op(ptr, val1)
+;;   ..
+;; then instead we can do:
+;;   tmp = (val0 op val1 op ...)
+;;   result0 = atomic_op(ptr, tmp)
+;;   result1 = (result0 op val0)
+;;   ..
+;; and more efficiently compute the same result.
+;;
+;; Takes five parameters:
+;; $1: vector width of the target
+;; $2: operation being performed (w.r.t. LLVM atomic intrinsic names)
+;;     (add, sub...)
+;; $3: return type of the LLVM atomic (e.g. i32)
+;; $4: return type of the LLVM atomic, in ispc naming parlance (e.g. int32)
+;; $5: identity value for the operator (e.g. 0 for add, -1 for AND, ...)
+
+define(`global_atomic_associative', `
+
+declare $3 @llvm.atomic.load.$2.$3.p0$3($3 * %ptr, $3 %delta)
+
+;; Note that the mask is expected to be of type $3, so the caller must ensure
+;; that for 64-bit types the mask is cast to a signed int before being passed
+;; in, so that it is properly sign extended.  (The code in stdlib.ispc does
+;; this.)
+
+define internal <$1 x $3> @__atomic_$2_$4_global($3 * %ptr, <$1 x $3> %val,
+                                                 <$1 x $3> %mask) nounwind alwaysinline {
+  ; first, for any lanes where the mask is off, compute a vector where those
+  ; lanes hold the identity value
+
+  ; zero out any lanes that are off
+  %valoff = and <$1 x $3> %val, %mask
+
+  ; compute an identity vector that is zero in the "on" lanes and has the
+  ; identity value in the "off" lanes
+  %idv1 = bitcast $3 $5 to <1 x $3>
+  %idvec = shufflevector <1 x $3> %idv1, <1 x $3> undef,
+           <$1 x i32> < forloop(i, 1, eval($1-1), `i32 0, ') i32 0 >
+  %notmask = xor <$1 x $3> %mask, < forloop(i, 1, eval($1-1), `$3 -1, ') $3 -1 >
+  %idoff = and <$1 x $3> %idvec, %notmask
+
+  ; and compute the merged vector that holds the identity in the off lanes
+  %valp = or <$1 x $3> %valoff, %idoff
+
+  ; now compute the local reduction (val0 op val1 op ...)--initialize
+  ; %eltvec so that the 0th element is the identity, the first is val0,
+  ; the second is (val0 op val1), ..
+  %red0 = extractelement <$1 x $3> %valp, i32 0
+  %eltvec0 = insertelement <$1 x $3> undef, $3 $5, i32 0
+
+  forloop(i, 1, eval($1-1), `
+  %elt`'i = extractelement <$1 x $3> %valp, i32 i
+  %red`'i = $2 $3 %red`'eval(i-1), %elt`'i
+  %eltvec`'i = insertelement <$1 x $3> %eltvec`'eval(i-1), $3 %red`'eval(i-1), i32 i')
+
+  ; make the atomic call, passing it the final reduced value
+  %final0 = call $3 @llvm.atomic.load.$2.$3.p0$3($3 * %ptr, $3 %red`'eval($1-1))
+
+  ; now go back and compute the values to be returned for each program
+  ; instance--this just involves smearing the old value returned from the
+  ; actual atomic call across the vector and applying the vector op to the
+  ; %eltvec vector computed above
+  %finalv1 = bitcast $3 %final0 to <1 x $3>
+  %final_base = shufflevector <1 x $3> %finalv1, <1 x $3> undef,
+                <$1 x i32> < forloop(i, 1, eval($1-1), `i32 0, ') i32 0 >
+  %r = $2 <$1 x $3> %final_base, %eltvec`'eval($1-1)
+
+  ret <$1 x $3> %r
+}
+')
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; global_atomic_uniform
 ;; Defines the implementation of a function that handles the mapping from
@@ -1143,21 +1221,21 @@ define internal void @__memory_barrier() nounwind readnone alwaysinline {
   ret void
 }

-global_atomic($1, add, i32, int32)
-global_atomic($1, sub, i32, int32)
-global_atomic($1, and, i32, int32)
-global_atomic($1, or, i32, int32)
-global_atomic($1, xor, i32, int32)
+global_atomic_associative($1, add, i32, int32, 0)
+global_atomic_associative($1, sub, i32, int32, 0)
+global_atomic_associative($1, and, i32, int32, -1)
+global_atomic_associative($1, or, i32, int32, 0)
+global_atomic_associative($1, xor, i32, int32, 0)
 global_atomic_uniform($1, min, i32, int32)
 global_atomic_uniform($1, max, i32, int32)
 global_atomic_uniform($1, umin, i32, uint32)
 global_atomic_uniform($1, umax, i32, uint32)

-global_atomic($1, add, i64, int64)
-global_atomic($1, sub, i64, int64)
-global_atomic($1, and, i64, int64)
-global_atomic($1, or, i64, int64)
-global_atomic($1, xor, i64, int64)
+global_atomic_associative($1, add, i64, int64, 0)
+global_atomic_associative($1, sub, i64, int64, 0)
+global_atomic_associative($1, and, i64, int64, -1)
+global_atomic_associative($1, or, i64, int64, 0)
+global_atomic_associative($1, xor, i64, int64, 0)
 global_atomic_uniform($1, min, i64, int64)
 global_atomic_uniform($1, max, i64, int64)
 global_atomic_uniform($1, umin, i64, uint64)
diff --git a/stdlib.ispc b/stdlib.ispc
index 532d723f..6b7ce67f 100644
--- a/stdlib.ispc
+++ b/stdlib.ispc
@@ -575,10 +575,10 @@ static inline void memory_barrier() {
     __memory_barrier();
 }

-#define DEFINE_ATOMIC_OP(TA,TB,OPA,OPB) \
+#define DEFINE_ATOMIC_OP(TA,TB,OPA,OPB,MASKTYPE) \
 static inline TA atomic_##OPA##_global(uniform reference TA ref, TA value) { \
     memory_barrier(); \
-    TA ret = __atomic_##OPB##_##TB##_global(ref, value, __mask); \
+    TA ret = __atomic_##OPB##_##TB##_global(ref, value, (MASKTYPE)__mask); \
     memory_barrier(); \
     return ret; \
 }
@@ -595,49 +595,49 @@ static inline TA atomic_##OPA##_global(uniform reference TA ref, TA value) { \
     return ret; \
 }

-DEFINE_ATOMIC_OP(int32,int32,add,add)
-DEFINE_ATOMIC_OP(int32,int32,subtract,sub)
+DEFINE_ATOMIC_OP(int32,int32,add,add,int32)
+DEFINE_ATOMIC_OP(int32,int32,subtract,sub,int32)
 DEFINE_ATOMIC_MINMAX_OP(int32,int32,min,min)
 DEFINE_ATOMIC_MINMAX_OP(int32,int32,max,max)
-DEFINE_ATOMIC_OP(int32,int32,and,and)
-DEFINE_ATOMIC_OP(int32,int32,or,or)
-DEFINE_ATOMIC_OP(int32,int32,xor,xor)
-DEFINE_ATOMIC_OP(int32,int32,swap,swap)
+DEFINE_ATOMIC_OP(int32,int32,and,and,int32)
+DEFINE_ATOMIC_OP(int32,int32,or,or,int32)
+DEFINE_ATOMIC_OP(int32,int32,xor,xor,int32)
+DEFINE_ATOMIC_OP(int32,int32,swap,swap,int32)

 // For everything but atomic min and max, we can use the same
 // implementations for unsigned as for signed.
-DEFINE_ATOMIC_OP(unsigned int32,int32,add,add)
-DEFINE_ATOMIC_OP(unsigned int32,int32,subtract,sub)
+DEFINE_ATOMIC_OP(unsigned int32,int32,add,add,int32)
+DEFINE_ATOMIC_OP(unsigned int32,int32,subtract,sub,int32)
 DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,min,umin)
 DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,max,umax)
-DEFINE_ATOMIC_OP(unsigned int32,int32,and,and)
-DEFINE_ATOMIC_OP(unsigned int32,int32,or,or)
-DEFINE_ATOMIC_OP(unsigned int32,int32,xor,xor)
-DEFINE_ATOMIC_OP(unsigned int32,int32,swap,swap)
+DEFINE_ATOMIC_OP(unsigned int32,int32,and,and,int32)
+DEFINE_ATOMIC_OP(unsigned int32,int32,or,or,int32)
+DEFINE_ATOMIC_OP(unsigned int32,int32,xor,xor,int32)
+DEFINE_ATOMIC_OP(unsigned int32,int32,swap,swap,int32)

-DEFINE_ATOMIC_OP(float,float,swap,swap)
+DEFINE_ATOMIC_OP(float,float,swap,swap,int32)

-DEFINE_ATOMIC_OP(int64,int64,add,add)
-DEFINE_ATOMIC_OP(int64,int64,subtract,sub)
+DEFINE_ATOMIC_OP(int64,int64,add,add,int64)
+DEFINE_ATOMIC_OP(int64,int64,subtract,sub,int64)
 DEFINE_ATOMIC_MINMAX_OP(int64,int64,min,min)
 DEFINE_ATOMIC_MINMAX_OP(int64,int64,max,max)
-DEFINE_ATOMIC_OP(int64,int64,and,and)
-DEFINE_ATOMIC_OP(int64,int64,or,or)
-DEFINE_ATOMIC_OP(int64,int64,xor,xor)
-DEFINE_ATOMIC_OP(int64,int64,swap,swap)
+DEFINE_ATOMIC_OP(int64,int64,and,and,int64)
+DEFINE_ATOMIC_OP(int64,int64,or,or,int64)
+DEFINE_ATOMIC_OP(int64,int64,xor,xor,int64)
+DEFINE_ATOMIC_OP(int64,int64,swap,swap,int32)

 // For everything but atomic min and max, we can use the same
 // implementations for unsigned as for signed.
-DEFINE_ATOMIC_OP(unsigned int64,int64,add,add)
-DEFINE_ATOMIC_OP(unsigned int64,int64,subtract,sub)
+DEFINE_ATOMIC_OP(unsigned int64,int64,add,add,int64)
+DEFINE_ATOMIC_OP(unsigned int64,int64,subtract,sub,int64)
 DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,min,umin)
 DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,max,umax)
-DEFINE_ATOMIC_OP(unsigned int64,int64,and,and)
-DEFINE_ATOMIC_OP(unsigned int64,int64,or,or)
-DEFINE_ATOMIC_OP(unsigned int64,int64,xor,xor)
-DEFINE_ATOMIC_OP(unsigned int64,int64,swap,swap)
+DEFINE_ATOMIC_OP(unsigned int64,int64,and,and,int64)
+DEFINE_ATOMIC_OP(unsigned int64,int64,or,or,int64)
+DEFINE_ATOMIC_OP(unsigned int64,int64,xor,xor,int64)
+DEFINE_ATOMIC_OP(unsigned int64,int64,swap,swap,int32)

-DEFINE_ATOMIC_OP(double,double,swap,swap)
+DEFINE_ATOMIC_OP(double,double,swap,swap,int32)

 #undef DEFINE_ATOMIC_OP
diff --git a/tests/atomics-10.ispc b/tests/atomics-10.ispc
new file mode 100644
index 00000000..b950988a
--- /dev/null
+++ b/tests/atomics-10.ispc
@@ -0,0 +1,16 @@
+
+export uniform int width() { return programCount; }
+
+uniform unsigned int32 s = 0;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    float a = aFOO[programIndex];
+    float b = 0;
+    if (programIndex < 2)
+        b = atomic_add_global(s, 1);
+    RET[programIndex] = s;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 2;
+}
diff --git a/tests/atomics-11.ispc b/tests/atomics-11.ispc
new file mode 100644
index 00000000..cb94544c
--- /dev/null
+++ b/tests/atomics-11.ispc
@@ -0,0 +1,20 @@
+
+export uniform int width() { return programCount; }
+
+uniform unsigned int32 s = 0;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    float a = aFOO[programIndex];
+    float b = 0;
+    if (programIndex & 1)
+        b = atomic_add_global(s, programIndex);
+    RET[programIndex] = s;
+}
+
+export void result(uniform float RET[]) {
+    uniform int sum = 0;
+    for (uniform int i = 0; i < programCount; ++i)
+        if (i & 1)
+            sum += i;
+    RET[programIndex] = sum;
+}
diff --git a/tests/atomics-12.ispc b/tests/atomics-12.ispc
new file mode 100644
index 00000000..4d7e2c1e
--- /dev/null
+++ b/tests/atomics-12.ispc
@@ -0,0 +1,20 @@
+
+export uniform int width() { return programCount; }
+
+uniform unsigned int32 s = 0;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    float a = aFOO[programIndex];
+    float b = 0;
+    if (programIndex & 1)
+        b = atomic_or_global(s, (1 << programIndex));
+    RET[programIndex] = s;
+}
+
+export void result(uniform float RET[]) {
+    uniform int sum = 0;
+    for (uniform int i = 0; i < programCount; ++i)
+        if (i & 1)
+            sum += (1 << i);
+    RET[programIndex] = sum;
+}
diff --git a/tests/atomics-13.ispc b/tests/atomics-13.ispc
new file mode 100644
index 00000000..dd9c316c
--- /dev/null
+++ b/tests/atomics-13.ispc
@@ -0,0 +1,16 @@
+
+export uniform int width() { return programCount; }
+
+uniform unsigned int32 s = 0;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    float a = aFOO[programIndex];
+    float b = 0;
+    if (programIndex & 1)
+        b = atomic_or_global(s, (1 << programIndex));
+    RET[programIndex] = popcnt(reduce_add((int32)b));
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = (programCount/2) - 1;
+}
diff --git a/tests/atomics-14.ispc b/tests/atomics-14.ispc
new file mode 100644
index 00000000..cf9826cb
--- /dev/null
+++ b/tests/atomics-14.ispc
@@ -0,0 +1,20 @@
+
+export uniform int width() { return programCount; }
+
+uniform unsigned int64 s = 0xffffffffff000000;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    float a = aFOO[programIndex];
+    float b = 0;
+    if (programIndex & 1)
+        b = atomic_or_global(s, (1 << programIndex));
+    RET[programIndex] = (s>>20);
+}
+
+export void result(uniform float RET[]) {
+    uniform int sum = 0;
+    for (uniform int i = 0; i < programCount; ++i)
+        if (i & 1)
+            sum += (1 << i);
+    RET[programIndex] = ((unsigned int64)(0xffffffffff000000 | sum)) >> 20;
+}
diff --git a/tests/atomics-9.ispc b/tests/atomics-9.ispc
new file mode 100644
index 00000000..c038adc4
--- /dev/null
+++ b/tests/atomics-9.ispc
@@ -0,0 +1,16 @@
+
+export uniform int width() { return programCount; }
+
+uniform unsigned int32 s = 0;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    float a = aFOO[programIndex];
+    float b = 0;
+    if (programIndex < 2)
+        b = atomic_add_global(s, 1);
+    RET[programIndex] = reduce_add(b);
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1;
+}
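
Note (illustration only, not part of the patch): the transformation that global_atomic_associative expresses in m4-templated LLVM IR above can be summarized by the following minimal C sketch for the add operator. LANES stands in for the gang size, and all names here (atomic_add_per_lane, atomic_add_reduced, prefix, etc.) are hypothetical; the real code operates on <$1 x $3> vectors and the llvm.atomic.load.* intrinsics.

    /* Sketch of the associative-atomic optimization, assuming C11 atomics. */
    #include <stdatomic.h>
    #include <stdio.h>

    #define LANES 8

    /* Straightforward version: one atomic RMW per active lane. */
    static void atomic_add_per_lane(atomic_int *ptr, const int val[LANES],
                                    const int mask[LANES], int result[LANES]) {
        for (int i = 0; i < LANES; ++i)
            if (mask[i])
                result[i] = atomic_fetch_add(ptr, val[i]);
    }

    /* Associative version: reduce the per-lane values locally, issue a single
     * atomic for the whole gang, then reconstruct each lane's "old value" from
     * the atomic's result plus that lane's exclusive prefix (the %eltvec). */
    static void atomic_add_reduced(atomic_int *ptr, const int val[LANES],
                                   const int mask[LANES], int result[LANES]) {
        int prefix[LANES];       /* exclusive prefix reduction over the lanes  */
        int total = 0;           /* 0 is the identity value for add ($5)       */
        for (int i = 0; i < LANES; ++i) {
            prefix[i] = total;
            total += mask[i] ? val[i] : 0;   /* off lanes contribute the identity */
        }
        int old = atomic_fetch_add(ptr, total);   /* the single atomic call */
        for (int i = 0; i < LANES; ++i)
            result[i] = old + prefix[i];          /* per-lane return values */
    }

    int main(void) {
        int val[LANES]  = {1, 2, 3, 4, 5, 6, 7, 8};
        int mask[LANES] = {1, 0, 1, 0, 1, 0, 1, 0};
        int r1[LANES] = {0}, r2[LANES] = {0};
        atomic_int a = 10, b = 10;

        atomic_add_per_lane(&a, val, mask, r1);
        atomic_add_reduced(&b, val, mask, r2);

        /* both totals are 10 + 1 + 3 + 5 + 7 = 26, and the active lanes of
         * r1 and r2 hold the same old values (10, 11, 14, 19) */
        printf("per-lane total %d, reduced total %d\n",
               (int)atomic_load(&a), (int)atomic_load(&b));
        return 0;
    }

Both formulations leave the same value in the shared location and hand each active lane the same old value; the new atomics-9 through atomics-14 tests check exactly these two properties (the final value of the global, and the values returned to the active program instances).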