From e144724979e7c051a14d9d6554b119105c5cc097 Mon Sep 17 00:00:00 2001
From: Matt Pharr
Date: Wed, 31 Aug 2011 05:35:01 -0700
Subject: [PATCH] Improve performance of global atomics, taking advantage of
 associativity.

For associative atomic ops (add, and, or, xor), we can take advantage of
their associativity to issue just a single hardware atomic instruction,
rather than one for each of the running program instances (as the previous
implementation did).

The basic approach is to locally compute a reduction across the active
program instances with the given op and to then issue a single HW atomic
with that reduced value as the operand.  We then take the old value that
the HW atomic op returns (i.e., the value previously stored at the
location) and use it to compute the value to return to each of the
program instances, conceptually representing the cumulative effect of
each of the preceding program instances having performed its atomic
operation.

Issue #56.
---
 builtins.m4           | 98 ++++++++++++++++++++++++++++++++++++++-----
 stdlib.ispc           | 56 ++++++++++++-------------
 tests/atomics-10.ispc | 16 +++++++
 tests/atomics-11.ispc | 20 +++++++++
 tests/atomics-12.ispc | 20 +++++++++
 tests/atomics-13.ispc | 16 +++++++
 tests/atomics-14.ispc | 20 +++++++++
 tests/atomics-9.ispc  | 16 +++++++
 8 files changed, 224 insertions(+), 38 deletions(-)
 create mode 100644 tests/atomics-10.ispc
 create mode 100644 tests/atomics-11.ispc
 create mode 100644 tests/atomics-12.ispc
 create mode 100644 tests/atomics-13.ispc
 create mode 100644 tests/atomics-14.ispc
 create mode 100644 tests/atomics-9.ispc

diff --git a/builtins.m4 b/builtins.m4
index 59a7b6a3..714a2bd5 100644
--- a/builtins.m4
+++ b/builtins.m4
@@ -656,6 +656,84 @@ define internal <$1 x $3> @__atomic_$2_$4_global($3 * %ptr, <$1 x $3> %val,
 }
 ')
 
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; global_atomic_associative
+;; More efficient implementation for atomics that are associative (e.g.,
+;; add, and, ...).  If a basic implementation would do something like:
+;;   result0 = atomic_op(ptr, val0)
+;;   result1 = atomic_op(ptr, val1)
+;;   ..
+;; Then instead we can do:
+;;   tmp = (val0 op val1 op ...)
+;;   result0 = atomic_op(ptr, tmp)
+;;   result1 = (result0 op val0)
+;;   ..
+;; and more efficiently compute the same result.
+;;
+;; Takes five parameters:
+;; $1: vector width of the target
+;; $2: operation being performed (w.r.t. LLVM atomic intrinsic names)
+;;     (add, sub...)
+;; $3: return type of the LLVM atomic (e.g. i32)
+;; $4: return type of the LLVM atomic, in ispc naming parlance (e.g. int32)
+;; $5: identity value for the operator (e.g. 0 for add, -1 for AND, ...)
+
+define(`global_atomic_associative', `
+
+declare $3 @llvm.atomic.load.$2.$3.p0$3($3 * %ptr, $3 %delta)
+
+;; note that the mask is expected to be of type $3, so the caller must ensure
+;; that for 64-bit types, the mask is cast to a signed 64-bit int before being
+;; passed in so that it is properly sign extended...  (The code in stdlib.ispc
+;; does do this.)
+
+define internal <$1 x $3> @__atomic_$2_$4_global($3 * %ptr, <$1 x $3> %val,
+                                                 <$1 x $3> %mask) nounwind alwaysinline {
+  ; first, for any lanes where the mask is off, compute a vector where those lanes
+  ; hold the identity value..
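+  ;
+  ; (Illustration, with made-up numbers that are not part of the original
+  ; change: for a 4-wide add with all lanes on, *ptr holding 100, and
+  ; %val = <1, 2, 3, 4>, the exclusive scan below produces
+  ; %eltvec = <0, 1, 3, 6> and the full reduction is 10.  The single atomic
+  ; add of 10 returns the old value 100, so the per-lane results are
+  ; <100, 101, 103, 106> and memory ends up holding 110, exactly as if the
+  ; four lanes had each issued their own atomic add in lane order.)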
+
+  ; zero out any lanes that are off
+  %valoff = and <$1 x $3> %val, %mask
+
+  ; compute an identity vector that is zero in the on lanes and has the
+  ; identity value in the off lanes
+  %idv1 = bitcast $3 $5 to <1 x $3>
+  %idvec = shufflevector <1 x $3> %idv1, <1 x $3> undef,
+           <$1 x i32> < forloop(i, 1, eval($1-1), `i32 0, ') i32 0 >
+  %notmask = xor <$1 x $3> %mask, < forloop(i, 1, eval($1-1), `$3 -1, ') $3 -1 >
+  %idoff = and <$1 x $3> %idvec, %notmask
+
+  ; and compute the merged vector that holds the identity in the off lanes
+  %valp = or <$1 x $3> %valoff, %idoff
+
+  ; now compute the local reduction (val0 op val1 op ...)--initialize
+  ; %eltvec so that the 0th element is the identity, the first is val0,
+  ; the second is (val0 op val1), ..
+  %red0 = extractelement <$1 x $3> %valp, i32 0
+  %eltvec0 = insertelement <$1 x $3> undef, $3 $5, i32 0
+
+  forloop(i, 1, eval($1-1), `
+  %elt`'i = extractelement <$1 x $3> %valp, i32 i
+  %red`'i = $2 $3 %red`'eval(i-1), %elt`'i
+  %eltvec`'i = insertelement <$1 x $3> %eltvec`'eval(i-1), $3 %red`'eval(i-1), i32 i')
+
+  ; make the atomic call, passing it the final reduced value
+  %final0 = call $3 @llvm.atomic.load.$2.$3.p0$3($3 * %ptr, $3 %red`'eval($1-1))
+
+  ; now go back and compute the values to be returned for each program
+  ; instance--this just involves smearing the old value returned from the
+  ; actual atomic call across the vector and applying the vector op to the
+  ; %eltvec vector computed above..
+  %finalv1 = bitcast $3 %final0 to <1 x $3>
+  %final_base = shufflevector <1 x $3> %finalv1, <1 x $3> undef,
+                <$1 x i32> < forloop(i, 1, eval($1-1), `i32 0, ') i32 0 >
+  %r = $2 <$1 x $3> %final_base, %eltvec`'eval($1-1)
+
+  ret <$1 x $3> %r
+}
+')
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; global_atomic_uniform
 ;; Defines the implementation of a function that handles the mapping from
@@ -1143,21 +1221,21 @@ define internal void @__memory_barrier() nounwind readnone alwaysinline {
   ret void
 }
 
-global_atomic($1, add, i32, int32)
-global_atomic($1, sub, i32, int32)
-global_atomic($1, and, i32, int32)
-global_atomic($1, or, i32, int32)
-global_atomic($1, xor, i32, int32)
+global_atomic_associative($1, add, i32, int32, 0)
+global_atomic_associative($1, sub, i32, int32, 0)
+global_atomic_associative($1, and, i32, int32, -1)
+global_atomic_associative($1, or, i32, int32, 0)
+global_atomic_associative($1, xor, i32, int32, 0)
 global_atomic_uniform($1, min, i32, int32)
 global_atomic_uniform($1, max, i32, int32)
 global_atomic_uniform($1, umin, i32, uint32)
 global_atomic_uniform($1, umax, i32, uint32)
 
-global_atomic($1, add, i64, int64)
-global_atomic($1, sub, i64, int64)
-global_atomic($1, and, i64, int64)
-global_atomic($1, or, i64, int64)
-global_atomic($1, xor, i64, int64)
+global_atomic_associative($1, add, i64, int64, 0)
+global_atomic_associative($1, sub, i64, int64, 0)
+global_atomic_associative($1, and, i64, int64, -1)
+global_atomic_associative($1, or, i64, int64, 0)
+global_atomic_associative($1, xor, i64, int64, 0)
 global_atomic_uniform($1, min, i64, int64)
 global_atomic_uniform($1, max, i64, int64)
 global_atomic_uniform($1, umin, i64, uint64)
diff --git a/stdlib.ispc b/stdlib.ispc
index 532d723f..6b7ce67f 100644
--- a/stdlib.ispc
+++ b/stdlib.ispc
@@ -575,10 +575,10 @@ static inline void memory_barrier() {
     __memory_barrier();
 }
 
-#define DEFINE_ATOMIC_OP(TA,TB,OPA,OPB) \
+#define DEFINE_ATOMIC_OP(TA,TB,OPA,OPB,MASKTYPE) \
 static inline TA atomic_##OPA##_global(uniform reference TA ref, TA value) { \
     memory_barrier(); \
-    TA ret = __atomic_##OPB##_##TB##_global(ref, value, __mask); \
+    TA ret = __atomic_##OPB##_##TB##_global(ref, value, (MASKTYPE)__mask); \
     memory_barrier(); \
     return ret; \
 }
@@ -595,49 +595,49 @@ static inline TA atomic_##OPA##_global(uniform reference TA ref, TA value) { \
     return ret; \
 }
 
-DEFINE_ATOMIC_OP(int32,int32,add,add)
-DEFINE_ATOMIC_OP(int32,int32,subtract,sub)
+DEFINE_ATOMIC_OP(int32,int32,add,add,int32)
+DEFINE_ATOMIC_OP(int32,int32,subtract,sub,int32)
 DEFINE_ATOMIC_MINMAX_OP(int32,int32,min,min)
 DEFINE_ATOMIC_MINMAX_OP(int32,int32,max,max)
-DEFINE_ATOMIC_OP(int32,int32,and,and)
-DEFINE_ATOMIC_OP(int32,int32,or,or)
-DEFINE_ATOMIC_OP(int32,int32,xor,xor)
-DEFINE_ATOMIC_OP(int32,int32,swap,swap)
+DEFINE_ATOMIC_OP(int32,int32,and,and,int32)
+DEFINE_ATOMIC_OP(int32,int32,or,or,int32)
+DEFINE_ATOMIC_OP(int32,int32,xor,xor,int32)
+DEFINE_ATOMIC_OP(int32,int32,swap,swap,int32)
 
 // For everything but atomic min and max, we can use the same
 // implementations for unsigned as for signed.
-DEFINE_ATOMIC_OP(unsigned int32,int32,add,add)
-DEFINE_ATOMIC_OP(unsigned int32,int32,subtract,sub)
+DEFINE_ATOMIC_OP(unsigned int32,int32,add,add,int32)
+DEFINE_ATOMIC_OP(unsigned int32,int32,subtract,sub,int32)
 DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,min,umin)
 DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,max,umax)
-DEFINE_ATOMIC_OP(unsigned int32,int32,and,and)
-DEFINE_ATOMIC_OP(unsigned int32,int32,or,or)
-DEFINE_ATOMIC_OP(unsigned int32,int32,xor,xor)
-DEFINE_ATOMIC_OP(unsigned int32,int32,swap,swap)
+DEFINE_ATOMIC_OP(unsigned int32,int32,and,and,int32)
+DEFINE_ATOMIC_OP(unsigned int32,int32,or,or,int32)
+DEFINE_ATOMIC_OP(unsigned int32,int32,xor,xor,int32)
+DEFINE_ATOMIC_OP(unsigned int32,int32,swap,swap,int32)
 
-DEFINE_ATOMIC_OP(float,float,swap,swap)
+DEFINE_ATOMIC_OP(float,float,swap,swap,int32)
 
-DEFINE_ATOMIC_OP(int64,int64,add,add)
-DEFINE_ATOMIC_OP(int64,int64,subtract,sub)
+DEFINE_ATOMIC_OP(int64,int64,add,add,int64)
+DEFINE_ATOMIC_OP(int64,int64,subtract,sub,int64)
 DEFINE_ATOMIC_MINMAX_OP(int64,int64,min,min)
 DEFINE_ATOMIC_MINMAX_OP(int64,int64,max,max)
-DEFINE_ATOMIC_OP(int64,int64,and,and)
-DEFINE_ATOMIC_OP(int64,int64,or,or)
-DEFINE_ATOMIC_OP(int64,int64,xor,xor)
-DEFINE_ATOMIC_OP(int64,int64,swap,swap)
+DEFINE_ATOMIC_OP(int64,int64,and,and,int64)
+DEFINE_ATOMIC_OP(int64,int64,or,or,int64)
+DEFINE_ATOMIC_OP(int64,int64,xor,xor,int64)
+DEFINE_ATOMIC_OP(int64,int64,swap,swap,int32)
 
 // For everything but atomic min and max, we can use the same
 // implementations for unsigned as for signed.
-DEFINE_ATOMIC_OP(unsigned int64,int64,add,add)
-DEFINE_ATOMIC_OP(unsigned int64,int64,subtract,sub)
+DEFINE_ATOMIC_OP(unsigned int64,int64,add,add,int64)
+DEFINE_ATOMIC_OP(unsigned int64,int64,subtract,sub,int64)
 DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,min,umin)
 DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,max,umax)
-DEFINE_ATOMIC_OP(unsigned int64,int64,and,and)
-DEFINE_ATOMIC_OP(unsigned int64,int64,or,or)
-DEFINE_ATOMIC_OP(unsigned int64,int64,xor,xor)
-DEFINE_ATOMIC_OP(unsigned int64,int64,swap,swap)
+DEFINE_ATOMIC_OP(unsigned int64,int64,and,and,int64)
+DEFINE_ATOMIC_OP(unsigned int64,int64,or,or,int64)
+DEFINE_ATOMIC_OP(unsigned int64,int64,xor,xor,int64)
+DEFINE_ATOMIC_OP(unsigned int64,int64,swap,swap,int32)
 
-DEFINE_ATOMIC_OP(double,double,swap,swap)
+DEFINE_ATOMIC_OP(double,double,swap,swap,int32)
 
 #undef DEFINE_ATOMIC_OP
diff --git a/tests/atomics-10.ispc b/tests/atomics-10.ispc
new file mode 100644
index 00000000..b950988a
--- /dev/null
+++ b/tests/atomics-10.ispc
@@ -0,0 +1,16 @@
+
+export uniform int width() { return programCount; }
+
+uniform unsigned int32 s = 0;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    float a = aFOO[programIndex];
+    float b = 0;
+    if (programIndex < 2)
+        b = atomic_add_global(s, 1);
+    RET[programIndex] = s;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 2;
+}
diff --git a/tests/atomics-11.ispc b/tests/atomics-11.ispc
new file mode 100644
index 00000000..cb94544c
--- /dev/null
+++ b/tests/atomics-11.ispc
@@ -0,0 +1,20 @@
+
+export uniform int width() { return programCount; }
+
+uniform unsigned int32 s = 0;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    float a = aFOO[programIndex];
+    float b = 0;
+    if (programIndex & 1)
+        b = atomic_add_global(s, programIndex);
+    RET[programIndex] = s;
+}
+
+export void result(uniform float RET[]) {
+    uniform int sum = 0;
+    for (uniform int i = 0; i < programCount; ++i)
+        if (i & 1)
+            sum += i;
+    RET[programIndex] = sum;
+}
diff --git a/tests/atomics-12.ispc b/tests/atomics-12.ispc
new file mode 100644
index 00000000..4d7e2c1e
--- /dev/null
+++ b/tests/atomics-12.ispc
@@ -0,0 +1,20 @@
+
+export uniform int width() { return programCount; }
+
+uniform unsigned int32 s = 0;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    float a = aFOO[programIndex];
+    float b = 0;
+    if (programIndex & 1)
+        b = atomic_or_global(s, (1 << programIndex));
+    RET[programIndex] = s;
+}
+
+export void result(uniform float RET[]) {
+    uniform int sum = 0;
+    for (uniform int i = 0; i < programCount; ++i)
+        if (i & 1)
+            sum += (1 << i);
+    RET[programIndex] = sum;
+}
diff --git a/tests/atomics-13.ispc b/tests/atomics-13.ispc
new file mode 100644
index 00000000..dd9c316c
--- /dev/null
+++ b/tests/atomics-13.ispc
@@ -0,0 +1,16 @@
+
+export uniform int width() { return programCount; }
+
+uniform unsigned int32 s = 0;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    float a = aFOO[programIndex];
+    float b = 0;
+    if (programIndex & 1)
+        b = atomic_or_global(s, (1 << programIndex));
+    RET[programIndex] = popcnt(reduce_add((int32)b));
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = (programCount/2) - 1;
+}
diff --git a/tests/atomics-14.ispc b/tests/atomics-14.ispc
new file mode 100644
index 00000000..cf9826cb
--- /dev/null
+++ b/tests/atomics-14.ispc
@@ -0,0 +1,20 @@
+
+export uniform int width() { return programCount; }
+
+uniform unsigned int64 s = 0xffffffffff000000;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    float a = aFOO[programIndex];
+    float b = 0;
+    if (programIndex & 1)
+        b = atomic_or_global(s, (1 << programIndex));
+    RET[programIndex] = (s>>20);
+}
+
+export void result(uniform float RET[]) {
+    uniform int sum = 0;
+    for (uniform int i = 0; i < programCount; ++i)
+        if (i & 1)
+            sum += (1 << i);
+    RET[programIndex] = ((unsigned int64)(0xffffffffff000000 | sum)) >> 20;
+}
diff --git a/tests/atomics-9.ispc b/tests/atomics-9.ispc
new file mode 100644
index 00000000..c038adc4
--- /dev/null
+++ b/tests/atomics-9.ispc
@@ -0,0 +1,16 @@
+
+export uniform int width() { return programCount; }
+
+uniform unsigned int32 s = 0;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    float a = aFOO[programIndex];
+    float b = 0;
+    if (programIndex < 2)
+        b = atomic_add_global(s, 1);
+    RET[programIndex] = reduce_add(b);
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1;
+}
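
For reference, the transformation that global_atomic_associative performs can
also be written out in scalar C11 for a single gang.  The sketch below is an
illustration only, not code from this patch; WIDTH, lane_atomic_add(), and
main() are invented names, it models just the add case (identity 0), and the
off lanes' results are computed but ignored, as they are in the macro.

/*
 * Sketch of the associative-atomic transformation: one hardware atomic for
 * the whole gang, with per-lane results reconstructed from the old value.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define WIDTH 8

static void lane_atomic_add(atomic_int *ptr, const int val[WIDTH],
                            const bool mask[WIDTH], int result[WIDTH]) {
    int scan[WIDTH];   /* exclusive scan of the active lanes' operands  */
    int total = 0;     /* full reduction of the active lanes' operands  */
    for (int i = 0; i < WIDTH; ++i) {
        int v = mask[i] ? val[i] : 0;   /* identity (0 for add) in off lanes */
        scan[i] = total;                /* plays the role of %eltvec above   */
        total += v;                     /* plays the role of %red above      */
    }
    int old = atomic_fetch_add(ptr, total);   /* the single HW atomic */
    /* lane i sees the location as if lanes 0..i-1 had already done their
       adds; off lanes get a value too, but callers ignore it */
    for (int i = 0; i < WIDTH; ++i)
        result[i] = old + scan[i];
}

int main(void) {
    atomic_int s = 100;
    const int  val[WIDTH]  = { 1, 2, 3, 4, 5, 6, 7, 8 };
    const bool mask[WIDTH] = { 1, 1, 1, 1, 0, 0, 1, 1 };
    int result[WIDTH];

    lane_atomic_add(&s, val, mask, result);
    for (int i = 0; i < WIDTH; ++i)
        printf("lane %d: old value seen = %d\n", i, result[i]);
    printf("final value = %d\n", atomic_load(&s));   /* 100 + 25 = 125 */
    return 0;
}

Each lane receives the old value combined with an exclusive scan over the
preceding active lanes' operands, which is the same sequential-order
semantics the tests above (e.g. tests/atomics-11.ispc) check for.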