From 1bba9d43074fd24387dcb3e17ebae9ca6e725755 Mon Sep 17 00:00:00 2001
From: Matt Pharr
Date: Fri, 20 Jan 2012 10:37:33 -0800
Subject: [PATCH] Improve atomic_swap_global() to take advantage of associativity.

The vector atomic_swap_global() now issues a single hardware atomic swap
and then passes values between the running program instances, so that the
result is the same as if each instance had performed its own hardware swap
in some particular order.

Also cleaned up the uniform atomic built-in implementations
(__atomic_*_uniform_*_global) so that they no longer take the execution
mask, which they weren't using anyway.

Finishes Issue #56.
---
 builtins/util.m4        |  68 +++++--------------------
 stdlib.ispc             | 110 ++++++++++++++++++++++++++++++----------
 tests/atomics-swap.ispc |  17 +++++++
 3 files changed, 113 insertions(+), 82 deletions(-)
 create mode 100644 tests/atomics-swap.ispc

diff --git a/builtins/util.m4 b/builtins/util.m4
index b7c2e43e..883cfb4c 100644
--- a/builtins/util.m4
+++ b/builtins/util.m4
@@ -760,14 +760,12 @@ define(`global_atomic_uniform', `
 ifelse(LLVM_VERSION, `LLVM_2_9',`
 declare $3 @llvm.atomic.load.$2.$3.p0$3($3 * %ptr, $3 %delta)
 
-define $3 @__atomic_$2_uniform_$4_global($3 * %ptr, $3 %val,
-                                         <$1 x MASK> %mask) nounwind alwaysinline {
+define $3 @__atomic_$2_uniform_$4_global($3 * %ptr, $3 %val) nounwind alwaysinline {
   %r = call $3 @llvm.atomic.load.$2.$3.p0$3($3 * %ptr, $3 %val)
   ret $3 %r
 }
 ', `
-define $3 @__atomic_$2_uniform_$4_global($3 * %ptr, $3 %val,
-                                         <$1 x MASK> %mask) nounwind alwaysinline {
+define $3 @__atomic_$2_uniform_$4_global($3 * %ptr, $3 %val) nounwind alwaysinline {
   %r = atomicrmw $2 $3 * %ptr, $3 %val seq_cst
   ret $3 %r
 }
@@ -786,26 +784,7 @@ declare i32 @llvm.atomic.swap.i32.p0i32(i32 * %ptr, i32 %val)
 declare i64 @llvm.atomic.swap.i64.p0i64(i64 * %ptr, i64 %val)')
 
 define(`global_swap', `
-
-define <$1 x $2> @__atomic_swap_$3_global($2* %ptr, <$1 x $2> %val,
-                                          <$1 x MASK> %mask) nounwind alwaysinline {
-  %rptr = alloca <$1 x $2>
-  %rptr32 = bitcast <$1 x $2> * %rptr to $2 *
-
-  per_lane($1, <$1 x MASK> %mask, `
-  %val_LANE_ID = extractelement <$1 x $2> %val, i32 LANE
-ifelse(LLVM_VERSION, `LLVM_2_9',`
-  %r_LANE_ID = call $2 @llvm.atomic.swap.$2.p0$2($2 * %ptr, $2 %val_LANE_ID)', `
-  %r_LANE_ID = atomicrmw xchg $2 * %ptr, $2 %val_LANE_ID seq_cst')
-  %rp_LANE_ID = getelementptr $2 * %rptr32, i32 LANE
-  store $2 %r_LANE_ID, $2 * %rp_LANE_ID')
-
-  %r = load <$1 x $2> * %rptr
-  ret <$1 x $2> %r
-}
-
-define $2 @__atomic_swap_uniform_$3_global($2* %ptr, $2 %val,
-                                           <$1 x MASK> %mask) nounwind alwaysinline {
+define $2 @__atomic_swap_uniform_$3_global($2* %ptr, $2 %val) nounwind alwaysinline {
 ifelse(LLVM_VERSION, `LLVM_2_9',`
   %r = call $2 @llvm.atomic.swap.$2.p0$2($2 * %ptr, $2 %val)', `
   %r = atomicrmw xchg $2 * %ptr, $2 %val seq_cst')
@@ -845,7 +824,7 @@ ifelse(LLVM_VERSION, `LLVM_2_9',`
 }
 
 define $2 @__atomic_compare_exchange_uniform_$3_global($2* %ptr, $2 %cmp,
-                                      $2 %val, <$1 x MASK> %mask) nounwind alwaysinline {
+                                      $2 %val) nounwind alwaysinline {
 ifelse(LLVM_VERSION, `LLVM_2_9',`
   %r = call $2 @llvm.atomic.cmp.swap.$2.p0$2($2 * %ptr, $2 %cmp, $2 %val)', `
   %r = cmpxchg $2 * %ptr, $2 %cmp, $2 %val seq_cst')
@@ -1997,38 +1976,18 @@ global_atomic_uniform(WIDTH, umax, i64, uint64)
 global_swap(WIDTH, i32, int32)
 global_swap(WIDTH, i64, int64)
 
-define <WIDTH x float> @__atomic_swap_float_global(float * %ptr, <WIDTH x float> %val,
-                                                   <WIDTH x MASK> %mask) nounwind alwaysinline {
-  %iptr = bitcast float * %ptr to i32 *
-  %ival = bitcast <WIDTH x float> %val to <WIDTH x i32>
-  %iret = call <WIDTH x i32> @__atomic_swap_int32_global(i32 * %iptr, <WIDTH x i32> %ival, <WIDTH x MASK> %mask)
-  %ret = bitcast <WIDTH x i32> %iret to <WIDTH x float>
-  ret <WIDTH x float> %ret
-}
-
-define <WIDTH x double> @__atomic_swap_double_global(double * %ptr, <WIDTH x double> %val,
-                                                     <WIDTH x MASK> %mask) nounwind alwaysinline {
-  %iptr = bitcast double * %ptr to i64 *
-  %ival = bitcast <WIDTH x double> %val to <WIDTH x i64>
-  %iret = call <WIDTH x i64> @__atomic_swap_int64_global(i64 * %iptr, <WIDTH x i64> %ival, <WIDTH x MASK> %mask)
-  %ret = bitcast <WIDTH x i64> %iret to <WIDTH x double>
-  ret <WIDTH x double> %ret
-}
-
-define float @__atomic_swap_uniform_float_global(float * %ptr, float %val,
-                                                 <WIDTH x MASK> %mask) nounwind alwaysinline {
+define float @__atomic_swap_uniform_float_global(float * %ptr, float %val) nounwind alwaysinline {
   %iptr = bitcast float * %ptr to i32 *
   %ival = bitcast float %val to i32
-  %iret = call i32 @__atomic_swap_uniform_int32_global(i32 * %iptr, i32 %ival, <WIDTH x MASK> %mask)
+  %iret = call i32 @__atomic_swap_uniform_int32_global(i32 * %iptr, i32 %ival)
   %ret = bitcast i32 %iret to float
   ret float %ret
 }
 
-define double @__atomic_swap_uniform_double_global(double * %ptr, double %val,
-                                                   <WIDTH x MASK> %mask) nounwind alwaysinline {
+define double @__atomic_swap_uniform_double_global(double * %ptr, double %val) nounwind alwaysinline {
   %iptr = bitcast double * %ptr to i64 *
   %ival = bitcast double %val to i64
-  %iret = call i64 @__atomic_swap_uniform_int64_global(i64 * %iptr, i64 %ival, <WIDTH x MASK> %mask)
+  %iret = call i64 @__atomic_swap_uniform_int64_global(i64 * %iptr, i64 %ival)
   %ret = bitcast i64 %iret to double
   ret double %ret
 }
@@ -2058,24 +2017,23 @@ define <WIDTH x double> @__atomic_compare_exchange_double_global(double * %ptr,
   ret <WIDTH x double> %ret
 }
 
-define float @__atomic_compare_exchange_uniform_float_global(float * %ptr, float %cmp, float %val,
-                                                              <WIDTH x MASK> %mask) nounwind alwaysinline {
+define float @__atomic_compare_exchange_uniform_float_global(float * %ptr, float %cmp,
+                                                             float %val) nounwind alwaysinline {
   %iptr = bitcast float * %ptr to i32 *
   %icmp = bitcast float %cmp to i32
   %ival = bitcast float %val to i32
-  %iret = call i32 @__atomic_compare_exchange_uniform_int32_global(i32 * %iptr, i32 %icmp,
-                                                                   i32 %ival, <WIDTH x MASK> %mask)
+  %iret = call i32 @__atomic_compare_exchange_uniform_int32_global(i32 * %iptr, i32 %icmp,
+                                                                   i32 %ival)
   %ret = bitcast i32 %iret to float
   ret float %ret
 }
 
 define double @__atomic_compare_exchange_uniform_double_global(double * %ptr, double %cmp,
-                                                               double %val, <WIDTH x MASK> %mask) nounwind alwaysinline {
+                                                               double %val) nounwind alwaysinline {
   %iptr = bitcast double * %ptr to i64 *
   %icmp = bitcast double %cmp to i64
   %ival = bitcast double %val to i64
-  %iret = call i64 @__atomic_compare_exchange_uniform_int64_global(i64 * %iptr, i64 %icmp,
-                                                                   i64 %ival, <WIDTH x MASK> %mask)
+  %iret = call i64 @__atomic_compare_exchange_uniform_int64_global(i64 * %iptr, i64 %icmp, i64 %ival)
   %ret = bitcast i64 %iret to double
   ret double %ret
 }
diff --git a/stdlib.ispc b/stdlib.ispc
index 4013cd4b..cae63abe 100644
--- a/stdlib.ispc
+++ b/stdlib.ispc
@@ -808,8 +808,7 @@ static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
 static inline uniform TA atomic_##OPA##_global(uniform TA * uniform ptr, \
                                                uniform TA value) { \
     memory_barrier(); \
-    uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ptr, value, \
-                                                            (MASKTYPE)__mask); \
+    uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ptr, value); \
     memory_barrier(); \
     return ret; \
 } \
@@ -824,22 +823,80 @@ static inline TA atomic_##OPA##_global(uniform TA * varying ptr, TA value) { \
             continue; \
         uniform TA * uniform p = ptrArray[i]; \
         uniform TA v = extract(value, i); \
-        uniform TA r = __atomic_##OPB##_uniform_##TB##_global(p, v, \
-                                                              (MASKTYPE)__mask); \
+        uniform TA r = __atomic_##OPB##_uniform_##TB##_global(p, v); \
         ret = insert(ret, i, r); \
     } \
     memory_barrier(); \
     return ret; \
 } \
 
-#define DEFINE_ATOMIC_MINMAX_OP(TA,TB,OPA,OPB, MASKTYPE) \
+#define DEFINE_ATOMIC_SWAP(TA,TB) \
+static inline TA atomic_swap_global(uniform TA * uniform ptr, TA value) { \
+    memory_barrier(); \
+    uniform int i = 0; \
+    TA ret[programCount]; \
+    TA memVal; \
+    uniform int lastSwap; \
+    uniform int mask = lanemask(); \
+    /* First, have the first running program instance (if any) perform \
+       the swap with memory with its value of "value"; record the \
+       value returned. */ \
+    for (; i < programCount; ++i) { \
+        if ((mask & (1 << i)) == 0) \
+            continue; \
+        memVal = __atomic_swap_uniform_##TB##_global(ptr, extract(value, i)); \
+        lastSwap = i; \
+        break; \
+    } \
+    /* Now, for all of the remaining running program instances, set the \
+       return value of the last instance that did a swap with this \
+       instance's value of "value"; this gives the same effect as if the \
+       current instance had executed a hardware atomic swap right before \
+       the last one that did a swap. */ \
+    for (; i < programCount; ++i) { \
+        if ((mask & (1 << i)) == 0) \
+            continue; \
+        ret[lastSwap] = extract(value, i); \
+        lastSwap = i; \
+    } \
+    /* And the last instance that wanted to swap gets the value we \
+       originally got back from memory... */ \
+    ret[lastSwap] = memVal; \
+    memory_barrier(); \
+    return ret[programIndex]; \
+} \
+static inline uniform TA atomic_swap_global(uniform TA * uniform ptr, \
+                                            uniform TA value) { \
+    memory_barrier(); \
+    uniform TA ret = __atomic_swap_uniform_##TB##_global(ptr, value); \
+    memory_barrier(); \
+    return ret; \
+} \
+static inline TA atomic_swap_global(uniform TA * varying ptr, TA value) { \
+    uniform TA * uniform ptrArray[programCount]; \
+    ptrArray[programIndex] = ptr; \
+    memory_barrier(); \
+    TA ret; \
+    uniform int mask = lanemask(); \
+    for (uniform int i = 0; i < programCount; ++i) { \
+        if ((mask & (1 << i)) == 0) \
+            continue; \
+        uniform TA * uniform p = ptrArray[i]; \
+        uniform TA v = extract(value, i); \
+        uniform TA r = __atomic_swap_uniform_##TB##_global(p, v); \
+        ret = insert(ret, i, r); \
+    } \
+    memory_barrier(); \
+    return ret; \
+} \
+
+#define DEFINE_ATOMIC_MINMAX_OP(TA,TB,OPA,OPB) \
 static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
     uniform TA oneval = reduce_##OPA(value); \
     TA ret; \
     if (lanemask() != 0) { \
         memory_barrier(); \
-        ret = __atomic_##OPB##_uniform_##TB##_global(ptr, oneval, \
-                                                     (MASKTYPE)__mask); \
+        ret = __atomic_##OPB##_uniform_##TB##_global(ptr, oneval); \
         memory_barrier(); \
     } \
     return ret; \
 } \
@@ -847,8 +904,7 @@ static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
 static inline uniform TA atomic_##OPA##_global(uniform TA * uniform ptr, \
                                                uniform TA value) { \
     memory_barrier(); \
-    uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ptr, value, \
-                                                            (MASKTYPE)__mask); \
+    uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ptr, value); \
     memory_barrier(); \
     return ret; \
 } \
@@ -864,8 +920,7 @@ static inline TA atomic_##OPA##_global(uniform TA * varying ptr, \
             continue; \
         uniform TA * uniform p = ptrArray[i]; \
         uniform TA v = extract(value, i); \
-        uniform TA r = __atomic_##OPB##_uniform_##TB##_global(p, v, \
-                                                              (MASKTYPE)__mask); \
+        uniform TA r = __atomic_##OPB##_uniform_##TB##_global(p, v); \
         ret = insert(ret, i, r); \
     } \
     memory_barrier(); \
@@ -874,49 +929,51 @@ static inline TA atomic_##OPA##_global(uniform TA * varying ptr, \
 
 DEFINE_ATOMIC_OP(int32,int32,add,add,IntMaskType)
 DEFINE_ATOMIC_OP(int32,int32,subtract,sub,IntMaskType)
-DEFINE_ATOMIC_MINMAX_OP(int32,int32,min,min,IntMaskType)
-DEFINE_ATOMIC_MINMAX_OP(int32,int32,max,max,IntMaskType)
+DEFINE_ATOMIC_MINMAX_OP(int32,int32,min,min)
+DEFINE_ATOMIC_MINMAX_OP(int32,int32,max,max)
 DEFINE_ATOMIC_OP(int32,int32,and,and,IntMaskType)
 DEFINE_ATOMIC_OP(int32,int32,or,or,IntMaskType)
 DEFINE_ATOMIC_OP(int32,int32,xor,xor,IntMaskType)
-DEFINE_ATOMIC_OP(int32,int32,swap,swap,IntMaskType)
+DEFINE_ATOMIC_SWAP(int32,int32)
 
 // For everything but atomic min and max, we can use the same
 // implementations for unsigned as for signed.
 DEFINE_ATOMIC_OP(unsigned int32,int32,add,add,UIntMaskType)
 DEFINE_ATOMIC_OP(unsigned int32,int32,subtract,sub,UIntMaskType)
-DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,min,umin,UIntMaskType)
-DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,max,umax,UIntMaskType)
+DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,min,umin)
+DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,max,umax)
 DEFINE_ATOMIC_OP(unsigned int32,int32,and,and,UIntMaskType)
 DEFINE_ATOMIC_OP(unsigned int32,int32,or,or,UIntMaskType)
 DEFINE_ATOMIC_OP(unsigned int32,int32,xor,xor,UIntMaskType)
-DEFINE_ATOMIC_OP(unsigned int32,int32,swap,swap,UIntMaskType)
+DEFINE_ATOMIC_SWAP(unsigned int32,int32)
 
-DEFINE_ATOMIC_OP(float,float,swap,swap,IntMaskType)
+DEFINE_ATOMIC_SWAP(float,float)
 
 DEFINE_ATOMIC_OP(int64,int64,add,add,IntMaskType)
 DEFINE_ATOMIC_OP(int64,int64,subtract,sub,IntMaskType)
-DEFINE_ATOMIC_MINMAX_OP(int64,int64,min,min,IntMaskType)
-DEFINE_ATOMIC_MINMAX_OP(int64,int64,max,max,IntMaskType)
+DEFINE_ATOMIC_MINMAX_OP(int64,int64,min,min)
+DEFINE_ATOMIC_MINMAX_OP(int64,int64,max,max)
 DEFINE_ATOMIC_OP(int64,int64,and,and,IntMaskType)
 DEFINE_ATOMIC_OP(int64,int64,or,or,IntMaskType)
 DEFINE_ATOMIC_OP(int64,int64,xor,xor,IntMaskType)
-DEFINE_ATOMIC_OP(int64,int64,swap,swap,IntMaskType)
+DEFINE_ATOMIC_SWAP(int64,int64)
 
 // For everything but atomic min and max, we can use the same
 // implementations for unsigned as for signed.
 DEFINE_ATOMIC_OP(unsigned int64,int64,add,add,UIntMaskType)
 DEFINE_ATOMIC_OP(unsigned int64,int64,subtract,sub,UIntMaskType)
-DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,min,umin,UIntMaskType)
-DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,max,umax,UIntMaskType)
+DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,min,umin)
+DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,max,umax)
 DEFINE_ATOMIC_OP(unsigned int64,int64,and,and,UIntMaskType)
 DEFINE_ATOMIC_OP(unsigned int64,int64,or,or,UIntMaskType)
 DEFINE_ATOMIC_OP(unsigned int64,int64,xor,xor,UIntMaskType)
-DEFINE_ATOMIC_OP(unsigned int64,int64,swap,swap,UIntMaskType)
+DEFINE_ATOMIC_SWAP(unsigned int64,int64)
 
-DEFINE_ATOMIC_OP(double,double,swap,swap,IntMaskType)
+DEFINE_ATOMIC_SWAP(double,double)
 
 #undef DEFINE_ATOMIC_OP
+#undef DEFINE_ATOMIC_MINMAX_OP
+#undef DEFINE_ATOMIC_SWAP
 
 #define ATOMIC_DECL_CMPXCHG(TA, TB, MASKTYPE) \
 static inline TA atomic_compare_exchange_global( \
@@ -931,8 +988,7 @@ static inline uniform TA atomic_compare_exchange_global( \
     uniform TA * uniform ptr, uniform TA oldval, uniform TA newval) { \
     memory_barrier(); \
     uniform TA ret = \
-        __atomic_compare_exchange_uniform_##TB##_global(ptr, oldval, newval, \
-                                                        (MASKTYPE)__mask); \
+        __atomic_compare_exchange_uniform_##TB##_global(ptr, oldval, newval); \
     memory_barrier(); \
     return ret; \
 }
diff --git a/tests/atomics-swap.ispc b/tests/atomics-swap.ispc
new file mode 100644
index 00000000..9d5f33c4
--- /dev/null
+++ b/tests/atomics-swap.ispc
@@ -0,0 +1,17 @@
+
+export uniform int width() { return programCount; }
+
+uniform int32 s = 1234;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    float a = aFOO[programIndex];
+    float b = 0;
+    if (programIndex & 1) {
+        b = atomic_swap_global(&s, programIndex);
+    }
+    RET[programIndex] = reduce_add(b) + s;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1234 + reduce_add(programIndex & 1 ? programIndex : 0);
+}
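
Reviewer note: the sketch below is not part of the patch. It is a minimal, standalone model (plain C) of the chaining idea that DEFINE_ATOMIC_SWAP relies on: one hardware swap issued with the first running instance's value, followed by value hand-offs between instances, gives exactly the per-lane results that would come from each instance doing its own hardware swap in descending lane order. The lane count, helper names (swap_per_lane_desc, swap_chained), and test values here are illustrative assumptions, not code from the repository.

/* Models the swap-chaining in stdlib.ispc's DEFINE_ATOMIC_SWAP for one gang. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define PROGRAM_COUNT 8

/* Reference: each active lane does its own hardware swap, processed here in
   descending lane order (the ordering the chained version reproduces). */
static void swap_per_lane_desc(int32_t *mem, uint32_t mask,
                               const int32_t value[], int32_t ret[]) {
    for (int i = PROGRAM_COUNT - 1; i >= 0; --i) {
        if ((mask & (1u << i)) == 0)
            continue;
        ret[i] = *mem;       /* what a hardware xchg would return to lane i */
        *mem = value[i];
    }
}

/* Chained version: one "hardware" swap with the first active lane's value,
   then values are handed from each active lane to the previous one. */
static void swap_chained(int32_t *mem, uint32_t mask,
                         const int32_t value[], int32_t ret[]) {
    int i = 0, lastSwap = 0;
    int32_t memVal = 0;
    for (; i < PROGRAM_COUNT; ++i) {       /* first active lane only */
        if ((mask & (1u << i)) == 0)
            continue;
        memVal = *mem;                     /* the single atomic xchg */
        *mem = value[i];
        lastSwap = i;
        break;
    }
    for (; i < PROGRAM_COUNT; ++i) {       /* remaining active lanes */
        if ((mask & (1u << i)) == 0)
            continue;
        ret[lastSwap] = value[i];          /* hand my value to the previous swapper */
        lastSwap = i;
    }
    ret[lastSwap] = memVal;                /* last lane in the chain gets old memory */
}

int main(void) {
    int32_t value[PROGRAM_COUNT] = {10, 11, 12, 13, 14, 15, 16, 17};
    uint32_t mask = 0xAA;                  /* odd lanes active, as in tests/atomics-swap.ispc */

    int32_t memA = 1234, memB = 1234;
    int32_t retA[PROGRAM_COUNT] = {0}, retB[PROGRAM_COUNT] = {0};
    swap_per_lane_desc(&memA, mask, value, retA);
    swap_chained(&memB, mask, value, retB);

    assert(memA == memB);
    for (int i = 0; i < PROGRAM_COUNT; ++i)
        assert(retA[i] == retB[i]);
    printf("chained swap matches per-lane swaps done in descending lane order\n");
    return 0;
}

Either way, the final memory value plus the sum of the per-lane return values equals the old memory value plus the sum of all swapped-in values, which is the order-independent invariant that tests/atomics-swap.ispc checks.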