From 1bba9d43074fd24387dcb3e17ebae9ca6e725755 Mon Sep 17 00:00:00 2001
From: Matt Pharr
Date: Fri, 20 Jan 2012 10:37:33 -0800
Subject: [PATCH] Improve atomic_swap_global() to take advantage of associativity.

The vector atomic_swap_global() now issues a single hardware atomic swap
and then passes values between the running program instances, so that the
result is the same as if each instance had performed its own hardware swap
in some particular order.

Also cleaned up the uniform atomic built-in implementations
(__atomic_*_uniform_*_global) so that they no longer take the execution
mask, which they weren't using anyway.

Finishes Issue #56.
---
 builtins/util.m4        |  68 +++++--------------------
 stdlib.ispc             | 110 ++++++++++++++++++++++++++++++----------
 tests/atomics-swap.ispc |  17 +++++++
 3 files changed, 113 insertions(+), 82 deletions(-)
 create mode 100644 tests/atomics-swap.ispc

diff --git a/builtins/util.m4 b/builtins/util.m4
index b7c2e43e..883cfb4c 100644
--- a/builtins/util.m4
+++ b/builtins/util.m4
@@ -760,14 +760,12 @@ define(`global_atomic_uniform', `
 ifelse(LLVM_VERSION, `LLVM_2_9',`
 declare $3 @llvm.atomic.load.$2.$3.p0$3($3 * %ptr, $3 %delta)
 
-define $3 @__atomic_$2_uniform_$4_global($3 * %ptr, $3 %val,
-                                         <$1 x MASK> %mask) nounwind alwaysinline {
+define $3 @__atomic_$2_uniform_$4_global($3 * %ptr, $3 %val) nounwind alwaysinline {
   %r = call $3 @llvm.atomic.load.$2.$3.p0$3($3 * %ptr, $3 %val)
   ret $3 %r
 }
 ', `
-define $3 @__atomic_$2_uniform_$4_global($3 * %ptr, $3 %val,
-                                         <$1 x MASK> %mask) nounwind alwaysinline {
+define $3 @__atomic_$2_uniform_$4_global($3 * %ptr, $3 %val) nounwind alwaysinline {
   %r = atomicrmw $2 $3 * %ptr, $3 %val seq_cst
   ret $3 %r
 }
@@ -786,26 +784,7 @@ declare i32 @llvm.atomic.swap.i32.p0i32(i32 * %ptr, i32 %val)
 declare i64 @llvm.atomic.swap.i64.p0i64(i64 * %ptr, i64 %val)')
 
 define(`global_swap', `
-
-define <$1 x $2> @__atomic_swap_$3_global($2* %ptr, <$1 x $2> %val,
-                                          <$1 x MASK> %mask) nounwind alwaysinline {
-  %rptr = alloca <$1 x $2>
-  %rptr32 = bitcast <$1 x $2> * %rptr to $2 *
-
-  per_lane($1, <$1 x MASK> %mask, `
-  %val_LANE_ID = extractelement <$1 x $2> %val, i32 LANE
-ifelse(LLVM_VERSION, `LLVM_2_9',`
-  %r_LANE_ID = call $2 @llvm.atomic.swap.$2.p0$2($2 * %ptr, $2 %val_LANE_ID)', `
-  %r_LANE_ID = atomicrmw xchg $2 * %ptr, $2 %val_LANE_ID seq_cst')
-  %rp_LANE_ID = getelementptr $2 * %rptr32, i32 LANE
-  store $2 %r_LANE_ID, $2 * %rp_LANE_ID')
-
-  %r = load <$1 x $2> * %rptr
-  ret <$1 x $2> %r
-}
-
-define $2 @__atomic_swap_uniform_$3_global($2* %ptr, $2 %val,
-                                           <$1 x MASK> %mask) nounwind alwaysinline {
+define $2 @__atomic_swap_uniform_$3_global($2* %ptr, $2 %val) nounwind alwaysinline {
 ifelse(LLVM_VERSION, `LLVM_2_9',`
   %r = call $2 @llvm.atomic.swap.$2.p0$2($2 * %ptr, $2 %val)', `
   %r = atomicrmw xchg $2 * %ptr, $2 %val seq_cst')
@@ -845,7 +824,7 @@ ifelse(LLVM_VERSION, `LLVM_2_9',`
 }
 
 define $2 @__atomic_compare_exchange_uniform_$3_global($2* %ptr, $2 %cmp,
-                                      $2 %val, <$1 x MASK> %mask) nounwind alwaysinline {
+                                      $2 %val) nounwind alwaysinline {
 ifelse(LLVM_VERSION, `LLVM_2_9',`
   %r = call $2 @llvm.atomic.cmp.swap.$2.p0$2($2 * %ptr, $2 %cmp, $2 %val)', `
   %r = cmpxchg $2 * %ptr, $2 %cmp, $2 %val seq_cst')
@@ -1997,38 +1976,18 @@ global_atomic_uniform(WIDTH, umax, i64, uint64)
 global_swap(WIDTH, i32, int32)
 global_swap(WIDTH, i64, int64)
 
-define <WIDTH x float> @__atomic_swap_float_global(float * %ptr, <WIDTH x float> %val,
-                                                   <WIDTH x MASK> %mask) nounwind alwaysinline {
-  %iptr = bitcast float * %ptr to i32 *
-  %ival = bitcast <WIDTH x float> %val to <WIDTH x i32>
-  %iret = call <WIDTH x i32> @__atomic_swap_int32_global(i32 * %iptr, <WIDTH x i32> %ival, <WIDTH x MASK> %mask)
-  %ret = bitcast <WIDTH x i32> %iret to <WIDTH x float>
-  ret <WIDTH x float> %ret
-}
-
-define <WIDTH x double> @__atomic_swap_double_global(double * %ptr, <WIDTH x double> %val,
-                                                     <WIDTH x MASK> %mask) nounwind alwaysinline {
-  %iptr = bitcast double * %ptr to i64 *
-  %ival = bitcast <WIDTH x double> %val to <WIDTH x i64>
-  %iret = call <WIDTH x i64> @__atomic_swap_int64_global(i64 * %iptr, <WIDTH x i64> %ival, <WIDTH x MASK> %mask)
-  %ret = bitcast <WIDTH x i64> %iret to <WIDTH x double>
-  ret <WIDTH x double> %ret
-}
-
-define float @__atomic_swap_uniform_float_global(float * %ptr, float %val,
-                                                 <WIDTH x MASK> %mask) nounwind alwaysinline {
+define float @__atomic_swap_uniform_float_global(float * %ptr, float %val) nounwind alwaysinline {
   %iptr = bitcast float * %ptr to i32 *
   %ival = bitcast float %val to i32
-  %iret = call i32 @__atomic_swap_uniform_int32_global(i32 * %iptr, i32 %ival, <WIDTH x MASK> %mask)
+  %iret = call i32 @__atomic_swap_uniform_int32_global(i32 * %iptr, i32 %ival)
   %ret = bitcast i32 %iret to float
   ret float %ret
 }
 
-define double @__atomic_swap_uniform_double_global(double * %ptr, double %val,
-                                                   <WIDTH x MASK> %mask) nounwind alwaysinline {
+define double @__atomic_swap_uniform_double_global(double * %ptr, double %val) nounwind alwaysinline {
   %iptr = bitcast double * %ptr to i64 *
   %ival = bitcast double %val to i64
-  %iret = call i64 @__atomic_swap_uniform_int64_global(i64 * %iptr, i64 %ival, <WIDTH x MASK> %mask)
+  %iret = call i64 @__atomic_swap_uniform_int64_global(i64 * %iptr, i64 %ival)
   %ret = bitcast i64 %iret to double
   ret double %ret
 }
@@ -2058,24 +2017,23 @@ define <WIDTH x double> @__atomic_compare_exchange_double_global(double * %ptr,
   ret <WIDTH x double> %ret
 }
 
-define float @__atomic_compare_exchange_uniform_float_global(float * %ptr, float %cmp, float %val,
-                                                              <WIDTH x MASK> %mask) nounwind alwaysinline {
+define float @__atomic_compare_exchange_uniform_float_global(float * %ptr, float %cmp,
+                                                             float %val) nounwind alwaysinline {
   %iptr = bitcast float * %ptr to i32 *
   %icmp = bitcast float %cmp to i32
   %ival = bitcast float %val to i32
-  %iret = call i32 @__atomic_compare_exchange_uniform_int32_global(i32 * %iptr, i32 %icmp,
-                                                                   i32 %ival, <WIDTH x MASK> %mask)
+  %iret = call i32 @__atomic_compare_exchange_uniform_int32_global(i32 * %iptr, i32 %icmp,
+                                                                   i32 %ival)
   %ret = bitcast i32 %iret to float
   ret float %ret
 }
 
 define double @__atomic_compare_exchange_uniform_double_global(double * %ptr, double %cmp,
-                                                               double %val, <WIDTH x MASK> %mask) nounwind alwaysinline {
+                                                               double %val) nounwind alwaysinline {
   %iptr = bitcast double * %ptr to i64 *
   %icmp = bitcast double %cmp to i64
   %ival = bitcast double %val to i64
-  %iret = call i64 @__atomic_compare_exchange_uniform_int64_global(i64 * %iptr, i64 %icmp,
-                                                                   i64 %ival, <WIDTH x MASK> %mask)
+  %iret = call i64 @__atomic_compare_exchange_uniform_int64_global(i64 * %iptr, i64 %icmp, i64 %ival)
   %ret = bitcast i64 %iret to double
   ret double %ret
 }
diff --git a/stdlib.ispc b/stdlib.ispc
index 4013cd4b..cae63abe 100644
--- a/stdlib.ispc
+++ b/stdlib.ispc
@@ -808,8 +808,7 @@ static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
 static inline uniform TA atomic_##OPA##_global(uniform TA * uniform ptr, \
                                                uniform TA value) { \
     memory_barrier(); \
-    uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ptr, value, \
-                                                            (MASKTYPE)__mask); \
+    uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ptr, value); \
     memory_barrier(); \
     return ret; \
 } \
@@ -824,22 +823,80 @@ static inline TA atomic_##OPA##_global(uniform TA * varying ptr, TA value) { \
             continue; \
         uniform TA * uniform p = ptrArray[i]; \
         uniform TA v = extract(value, i); \
-        uniform TA r = __atomic_##OPB##_uniform_##TB##_global(p, v, \
-                                                              (MASKTYPE)__mask); \
+        uniform TA r = __atomic_##OPB##_uniform_##TB##_global(p, v); \
         ret = insert(ret, i, r); \
     } \
     memory_barrier(); \
     return ret; \
 } \
 
-#define DEFINE_ATOMIC_MINMAX_OP(TA,TB,OPA,OPB, MASKTYPE) \
+#define DEFINE_ATOMIC_SWAP(TA,TB) \
+static inline TA atomic_swap_global(uniform TA * uniform ptr, TA value) { \
+    memory_barrier(); \
+    uniform int i = 0; \
+    TA ret[programCount]; \
+    TA memVal; \
+    uniform int lastSwap; \
+    uniform int mask = lanemask(); \
+    /* First, have the first running program instance (if any) perform \
+       the swap with memory with its value of "value"; record the \
+       value returned. */ \
+    for (; i < programCount; ++i) { \
+        if ((mask & (1 << i)) == 0) \
+            continue; \
+        memVal = __atomic_swap_uniform_##TB##_global(ptr, extract(value, i)); \
+        lastSwap = i; \
+        break; \
+    } \
+    /* Now, for all of the remaining running program instances, set the \
+       return value of the last instance that did a swap with this \
+       instance's value of "value"; this gives the same effect as if the \
+       current instance had executed a hardware atomic swap right before \
+       the last one that did a swap. */ \
+    for (; i < programCount; ++i) { \
+        if ((mask & (1 << i)) == 0) \
+            continue; \
+        ret[lastSwap] = extract(value, i); \
+        lastSwap = i; \
+    } \
+    /* And the last instance that wanted to swap gets the value we \
+       originally got back from memory... */ \
+    ret[lastSwap] = memVal; \
+    memory_barrier(); \
+    return ret[programIndex]; \
+} \
+static inline uniform TA atomic_swap_global(uniform TA * uniform ptr, \
+                                            uniform TA value) { \
+    memory_barrier(); \
+    uniform TA ret = __atomic_swap_uniform_##TB##_global(ptr, value); \
+    memory_barrier(); \
+    return ret; \
+} \
+static inline TA atomic_swap_global(uniform TA * varying ptr, TA value) { \
+    uniform TA * uniform ptrArray[programCount]; \
+    ptrArray[programIndex] = ptr; \
+    memory_barrier(); \
+    TA ret; \
+    uniform int mask = lanemask(); \
+    for (uniform int i = 0; i < programCount; ++i) { \
+        if ((mask & (1 << i)) == 0) \
+            continue; \
+        uniform TA * uniform p = ptrArray[i]; \
+        uniform TA v = extract(value, i); \
+        uniform TA r = __atomic_swap_uniform_##TB##_global(p, v); \
+        ret = insert(ret, i, r); \
+    } \
+    memory_barrier(); \
+    return ret; \
+} \
+
+#define DEFINE_ATOMIC_MINMAX_OP(TA,TB,OPA,OPB) \
 static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
     uniform TA oneval = reduce_##OPA(value); \
     TA ret; \
     if (lanemask() != 0) { \
         memory_barrier(); \
-        ret = __atomic_##OPB##_uniform_##TB##_global(ptr, oneval, \
-                                                     (MASKTYPE)__mask); \
+        ret = __atomic_##OPB##_uniform_##TB##_global(ptr, oneval); \
         memory_barrier(); \
     } \
     return ret; \
 } \
@@ -847,8 +904,7 @@ static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
 static inline uniform TA atomic_##OPA##_global(uniform TA * uniform ptr, \
                                                uniform TA value) { \
     memory_barrier(); \
-    uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ptr, value, \
-                                                            (MASKTYPE)__mask); \
+    uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ptr, value); \
     memory_barrier(); \
     return ret; \
 } \
@@ -864,8 +920,7 @@ static inline TA atomic_##OPA##_global(uniform TA * varying ptr, \
             continue; \
         uniform TA * uniform p = ptrArray[i]; \
         uniform TA v = extract(value, i); \
-        uniform TA r = __atomic_##OPB##_uniform_##TB##_global(p, v, \
-                                                              (MASKTYPE)__mask); \
+        uniform TA r = __atomic_##OPB##_uniform_##TB##_global(p, v); \
         ret = insert(ret, i, r); \
     } \
     memory_barrier(); \
@@ -874,49 +929,51 @@ static inline TA atomic_##OPA##_global(uniform TA * varying ptr, \
 
 DEFINE_ATOMIC_OP(int32,int32,add,add,IntMaskType)
 DEFINE_ATOMIC_OP(int32,int32,subtract,sub,IntMaskType)
-DEFINE_ATOMIC_MINMAX_OP(int32,int32,min,min,IntMaskType)
-DEFINE_ATOMIC_MINMAX_OP(int32,int32,max,max,IntMaskType)
+DEFINE_ATOMIC_MINMAX_OP(int32,int32,min,min)
+DEFINE_ATOMIC_MINMAX_OP(int32,int32,max,max)
 DEFINE_ATOMIC_OP(int32,int32,and,and,IntMaskType)
 DEFINE_ATOMIC_OP(int32,int32,or,or,IntMaskType)
 DEFINE_ATOMIC_OP(int32,int32,xor,xor,IntMaskType)
-DEFINE_ATOMIC_OP(int32,int32,swap,swap,IntMaskType)
+DEFINE_ATOMIC_SWAP(int32,int32)
 
 // For everything but atomic min and max, we can use the same
 // implementations for unsigned as for signed.
 DEFINE_ATOMIC_OP(unsigned int32,int32,add,add,UIntMaskType)
 DEFINE_ATOMIC_OP(unsigned int32,int32,subtract,sub,UIntMaskType)
-DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,min,umin,UIntMaskType)
-DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,max,umax,UIntMaskType)
+DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,min,umin)
+DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,max,umax)
 DEFINE_ATOMIC_OP(unsigned int32,int32,and,and,UIntMaskType)
 DEFINE_ATOMIC_OP(unsigned int32,int32,or,or,UIntMaskType)
 DEFINE_ATOMIC_OP(unsigned int32,int32,xor,xor,UIntMaskType)
-DEFINE_ATOMIC_OP(unsigned int32,int32,swap,swap,UIntMaskType)
+DEFINE_ATOMIC_SWAP(unsigned int32,int32)
 
-DEFINE_ATOMIC_OP(float,float,swap,swap,IntMaskType)
+DEFINE_ATOMIC_SWAP(float,float)
 
 DEFINE_ATOMIC_OP(int64,int64,add,add,IntMaskType)
 DEFINE_ATOMIC_OP(int64,int64,subtract,sub,IntMaskType)
-DEFINE_ATOMIC_MINMAX_OP(int64,int64,min,min,IntMaskType)
-DEFINE_ATOMIC_MINMAX_OP(int64,int64,max,max,IntMaskType)
+DEFINE_ATOMIC_MINMAX_OP(int64,int64,min,min)
+DEFINE_ATOMIC_MINMAX_OP(int64,int64,max,max)
 DEFINE_ATOMIC_OP(int64,int64,and,and,IntMaskType)
 DEFINE_ATOMIC_OP(int64,int64,or,or,IntMaskType)
 DEFINE_ATOMIC_OP(int64,int64,xor,xor,IntMaskType)
-DEFINE_ATOMIC_OP(int64,int64,swap,swap,IntMaskType)
+DEFINE_ATOMIC_SWAP(int64,int64)
 
 // For everything but atomic min and max, we can use the same
 // implementations for unsigned as for signed.
 DEFINE_ATOMIC_OP(unsigned int64,int64,add,add,UIntMaskType)
 DEFINE_ATOMIC_OP(unsigned int64,int64,subtract,sub,UIntMaskType)
-DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,min,umin,UIntMaskType)
-DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,max,umax,UIntMaskType)
+DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,min,umin)
+DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,max,umax)
 DEFINE_ATOMIC_OP(unsigned int64,int64,and,and,UIntMaskType)
 DEFINE_ATOMIC_OP(unsigned int64,int64,or,or,UIntMaskType)
 DEFINE_ATOMIC_OP(unsigned int64,int64,xor,xor,UIntMaskType)
-DEFINE_ATOMIC_OP(unsigned int64,int64,swap,swap,UIntMaskType)
+DEFINE_ATOMIC_SWAP(unsigned int64,int64)
 
-DEFINE_ATOMIC_OP(double,double,swap,swap,IntMaskType)
+DEFINE_ATOMIC_SWAP(double,double)
 
 #undef DEFINE_ATOMIC_OP
+#undef DEFINE_ATOMIC_MINMAX_OP
+#undef DEFINE_ATOMIC_SWAP
 
 #define ATOMIC_DECL_CMPXCHG(TA, TB, MASKTYPE) \
 static inline TA atomic_compare_exchange_global( \
@@ -931,8 +988,7 @@ static inline uniform TA atomic_compare_exchange_global( \
     uniform TA * uniform ptr, uniform TA oldval, uniform TA newval) { \
     memory_barrier(); \
     uniform TA ret = \
-        __atomic_compare_exchange_uniform_##TB##_global(ptr, oldval, newval, \
-                                                        (MASKTYPE)__mask); \
+        __atomic_compare_exchange_uniform_##TB##_global(ptr, oldval, newval); \
     memory_barrier(); \
     return ret; \
 }
diff --git a/tests/atomics-swap.ispc b/tests/atomics-swap.ispc
new file mode 100644
index 00000000..9d5f33c4
--- /dev/null
+++ b/tests/atomics-swap.ispc
@@ -0,0 +1,17 @@
+
+export uniform int width() { return programCount; }
+
+uniform int32 s = 1234;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    float a = aFOO[programIndex];
+    float b = 0;
+    if (programIndex & 1) {
+        b = atomic_swap_global(&s, programIndex);
+    }
+    RET[programIndex] = reduce_add(b) + s;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1234 + reduce_add(programIndex & 1 ? programIndex : 0);
+}
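
Reviewer note: the sketch below is not part of the patch. It is a minimal, standalone model (plain C) of the chaining idea that DEFINE_ATOMIC_SWAP relies on: one hardware swap issued with the first running instance's value, followed by value hand-offs between instances, gives exactly the per-lane results that would come from each instance doing its own hardware swap in descending lane order. The lane count, helper names (swap_per_lane_desc, swap_chained), and test values here are illustrative assumptions, not code from the repository.

/* Models the swap-chaining in stdlib.ispc's DEFINE_ATOMIC_SWAP for one gang. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define PROGRAM_COUNT 8

/* Reference: each active lane does its own hardware swap, processed here in
   descending lane order (the ordering the chained version reproduces). */
static void swap_per_lane_desc(int32_t *mem, uint32_t mask,
                               const int32_t value[], int32_t ret[]) {
    for (int i = PROGRAM_COUNT - 1; i >= 0; --i) {
        if ((mask & (1u << i)) == 0)
            continue;
        ret[i] = *mem;       /* what a hardware xchg would return to lane i */
        *mem = value[i];
    }
}

/* Chained version: one "hardware" swap with the first active lane's value,
   then values are handed from each active lane to the previous one. */
static void swap_chained(int32_t *mem, uint32_t mask,
                         const int32_t value[], int32_t ret[]) {
    int i = 0, lastSwap = 0;
    int32_t memVal = 0;
    for (; i < PROGRAM_COUNT; ++i) {       /* first active lane only */
        if ((mask & (1u << i)) == 0)
            continue;
        memVal = *mem;                     /* the single atomic xchg */
        *mem = value[i];
        lastSwap = i;
        break;
    }
    for (; i < PROGRAM_COUNT; ++i) {       /* remaining active lanes */
        if ((mask & (1u << i)) == 0)
            continue;
        ret[lastSwap] = value[i];          /* hand my value to the previous swapper */
        lastSwap = i;
    }
    ret[lastSwap] = memVal;                /* last lane in the chain gets old memory */
}

int main(void) {
    int32_t value[PROGRAM_COUNT] = {10, 11, 12, 13, 14, 15, 16, 17};
    uint32_t mask = 0xAA;                  /* odd lanes active, as in tests/atomics-swap.ispc */

    int32_t memA = 1234, memB = 1234;
    int32_t retA[PROGRAM_COUNT] = {0}, retB[PROGRAM_COUNT] = {0};
    swap_per_lane_desc(&memA, mask, value, retA);
    swap_chained(&memB, mask, value, retB);

    assert(memA == memB);
    for (int i = 0; i < PROGRAM_COUNT; ++i)
        assert(retA[i] == retB[i]);
    printf("chained swap matches per-lane swaps done in descending lane order\n");
    return 0;
}

Either way, the final memory value plus the sum of the per-lane return values equals the old memory value plus the sum of all swapped-in values, which is the order-independent invariant that tests/atomics-swap.ispc checks.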