From 5bcc611409b27f85cf62ccc1d25e941a3de2b54e Mon Sep 17 00:00:00 2001
From: Matt Pharr
Date: Mon, 4 Jul 2011 17:20:42 +0100
Subject: [PATCH] Implement global atomics and a memory barrier in the standard library.

This checkin provides the standard set of atomic operations and a memory
barrier in the ispc standard library. Both signed and unsigned 32- and
64-bit integer types are supported.
---
 Makefile              |    2 +-
 docs/ReleaseNotes.txt |   17 ++++--
 docs/build.sh         |    2 +-
 docs/ispc.txt         |   64 +++++++++++++++++++++
 ispc.vcxproj          |    4 +-
 stdlib.ispc           |   69 ++++++++++++++++++++++
 stdlib.m4             |  131 ++++++++++++++++++++++++++++++++++++++++++
 tests/atomics-1.ispc  |   14 +++++
 tests/atomics-2.ispc  |   14 +++++
 tests/atomics-3.ispc  |   14 +++++
 tests/atomics-4.ispc  |   14 +++++
 tests/atomics-5.ispc  |   14 +++++
 tests/atomics-6.ispc  |   14 +++++
 13 files changed, 364 insertions(+), 9 deletions(-)
 create mode 100644 tests/atomics-1.ispc
 create mode 100644 tests/atomics-2.ispc
 create mode 100644 tests/atomics-3.ispc
 create mode 100644 tests/atomics-4.ispc
 create mode 100644 tests/atomics-5.ispc
 create mode 100644 tests/atomics-6.ispc

diff --git a/Makefile b/Makefile
index e5d0541c..fcf708bf 100644
--- a/Makefile
+++ b/Makefile
@@ -121,7 +121,7 @@ objs/stdlib-c.o: objs/stdlib-c.cpp
 
 objs/stdlib_ispc.cpp: stdlib.ispc
 	@echo Creating C++ source from $<
-	@$(CPP) -DISPC=1 -DPI=3.1415926536 $< | ./stdlib2cpp.py > $@
+	@$(CLANG) -E -x c -DISPC=1 -DPI=3.1415926536 $< -o - | ./stdlib2cpp.py > $@
 
 objs/stdlib_ispc.o: objs/stdlib_ispc.cpp
 	@echo Compiling $<
diff --git a/docs/ReleaseNotes.txt b/docs/ReleaseNotes.txt
index a8159e1a..872cbe76 100644
--- a/docs/ReleaseNotes.txt
+++ b/docs/ReleaseNotes.txt
@@ -1,9 +1,9 @@
-=== v1.0.3 === (not yet released)
+=== v1.0.3 === (4 July 2011)
 
 ispc now has a built-in pre-processor (from LLVM's clang compiler).
-(Thanks to Pete Couperus!) It is therefore no longer necessary to use
-cl.exe for preprocessing before on Windows; the MSVC proejct files for the
-examples have been updated accordingly.
+(Thanks to Pete Couperus for this patch!) It is therefore no longer
+necessary to use cl.exe for preprocessing on Windows; the MSVC project
+files for the examples have been updated accordingly.
 
 There is another variant of the shuffle() function in the standard
 library: "shuffle(v0, v1, int permute)", where the
@@ -11,8 +11,15 @@
 permutation vector indexes over the concatenation of the two vectors
 (e.g. the value 0 corresponds to the first element of v0, the value
 2*programCount-1 corresponds to the last element of v1, etc.)
 
+ispc now supports the usual range of atomic operations (add, subtract, min,
+max, and, or, and xor) as well as atomic swap and atomic compare and
+exchange. There is also a facility for inserting memory fences. See the
+"Atomic Operations and Memory Fences" section of the user's guide
+(http://ispc.github.com/ispc.html#atomic-operations-and-memory-fences) for
+more information.
+
 There are now both 'signed' and 'unsigned' variants of the standard library
-functions like packed_load_active() that that references to arrays of
+functions like packed_load_active() that take references to arrays of
 signed int32s and unsigned int32s respectively. (The
 {load_from,store_to}_{int8,int16}() functions have similarly been
 augmented to have both 'signed' and 'unsigned' variants.)
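
To make the new API concrete, here is a minimal editorial sketch (not part
of the patch) of the kind of use the atomics enable. The kernel and array
names are hypothetical; only the documented atomic_add_global signature is
assumed:

    // Each program instance atomically claims a unique slot in out[]:
    // atomic_add_global returns the pre-increment value of the counter,
    // so every running instance receives a distinct index.
    uniform unsigned int32 counter = 0;

    export void append_values(uniform float out[], uniform float in[]) {
        float v = in[programIndex];
        unsigned int32 slot = atomic_add_global(counter, 1);
        out[slot] = v;
    }
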
diff --git a/docs/build.sh b/docs/build.sh
index 6de1e93d..cca3bee6 100755
--- a/docs/build.sh
+++ b/docs/build.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-rst2html ispc.txt > ispc.html
+rst2html.py ispc.txt > ispc.html
 
 #rst2latex --section-numbering --documentclass=article --documentoptions=DIV=9,10pt,letterpaper ispc.txt > ispc.tex
 #pdflatex ispc.tex
diff --git a/docs/ispc.txt b/docs/ispc.txt
index d99661aa..52f02e34 100644
--- a/docs/ispc.txt
+++ b/docs/ispc.txt
@@ -76,6 +76,7 @@ Contents:
       + `Output Functions`_
       + `Cross-Program Instance Operations`_
       + `Packed Load and Store Operations`_
+      + `Atomic Operations and Memory Fences`_
      + `Low-Level Bits`_
 
     * `Interoperability with the Application`_
@@ -1811,6 +1812,69 @@ where the ``i`` th element of ``x`` has been replaced with the value ``v``
 
     int insert(int x, uniform int i, uniform int v)
 
 
+Atomic Operations and Memory Fences
+-----------------------------------
+
+The usual range of atomic memory operations is provided in ``ispc``. As an
+example, consider the 32-bit integer atomic add routine:
+
+::
+
+    int32 atomic_add_global(reference uniform int32 val, int32 delta)
+
+The semantics are the expected ones for an atomic add function: the value
+"val" has the value "delta" added to it atomically, and the old value of
+"val" is returned from the function. (Thus, if multiple processors
+simultaneously issue atomic adds to the same memory location, the adds will
+be serialized by the hardware so that the correct result is computed in the
+end.)
+
+One thing to note is that the value being added to here is a
+``uniform`` integer, while the increment amount and the return value are
+``varying``. In other words, the semantics are that each running program
+instance individually issues the atomic operation with its own ``delta``
+value and gets the previous value of ``val`` back in return.
+
+Here are the declarations of the ``int32`` variants of these functions.
+There are also ``int64`` equivalents as well as variants that take
+``unsigned`` ``int32`` and ``int64`` values.
+
+::
+
+    int32 atomic_add_global(reference uniform int32 val, int32 value)
+    int32 atomic_subtract_global(reference uniform int32 val, int32 value)
+    int32 atomic_min_global(reference uniform int32 val, int32 value)
+    int32 atomic_max_global(reference uniform int32 val, int32 value)
+    int32 atomic_and_global(reference uniform int32 val, int32 value)
+    int32 atomic_or_global(reference uniform int32 val, int32 value)
+    int32 atomic_xor_global(reference uniform int32 val, int32 value)
+    int32 atomic_swap_global(reference uniform int32 val, int32 newval)
+
+There is also an atomic "compare and exchange" function; it atomically
+compares the value in "val" to "compare"--if they match, it assigns
+"newval" to "val". In either case, the old value of "val" is returned.
+(As with the other atomic operations, there are also ``unsigned`` and
+64-bit variants of this function.)
+
+::
+
+    int32 atomic_compare_exchange_global(reference uniform int32 val,
+                                         int32 compare, int32 newval)
+
+``ispc`` also has a standard library routine that inserts a memory barrier
+into the code; it ensures that all memory reads and writes prior to the
+barrier complete before any reads or writes after the barrier are issued.
+See the `Linux kernel documentation on memory barriers`_ for an excellent
+writeup on the need for and the use of memory barriers in multi-threaded
+code.
+
+.. _Linux kernel documentation on memory barriers: http://www.kernel.org/doc/Documentation/memory-barriers.txt
+
+::
+
+    void memory_barrier();
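+
+For illustration, an atomic operation that isn't provided directly--an
+atomic multiply, say--can be built from ``atomic_compare_exchange_global``
+with a per-instance retry loop. (This is an untested sketch for exposition,
+not a standard library routine:)
+
+::
+
+    static int32 atomic_multiply_global(reference uniform int32 val,
+                                        int32 factor) {
+        int32 old, updated;
+        do {
+            // Snapshot the current value and compute the desired result;
+            // retry if another instance modified "val" in the meantime.
+            old = val;
+            updated = old * factor;
+        } while (atomic_compare_exchange_global(val, old, updated) != old);
+        return old;
+    }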
+
+
 Low-Level Bits
 --------------
 
diff --git a/ispc.vcxproj b/ispc.vcxproj
index 40bacc74..81670e0b 100755
--- a/ispc.vcxproj
+++ b/ispc.vcxproj
@@ -59,9 +59,9 @@
       Document
-      cl /EP /TP %(Filename).ispc /DISPC=1 /DPI=3.1415926535 | python stdlib2cpp.py > gen-stdlib.cpp
+      clang -E -x c %(Filename).ispc -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py > gen-stdlib.cpp
       gen-stdlib.cpp
-      cl /EP /TP %(Filename).ispc /DISPC=1 /DPI=3.1415926535 | python stdlib2cpp.py > gen-stdlib.cpp
+      clang -E -x c %(Filename).ispc -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py > gen-stdlib.cpp
       gen-stdlib.cpp
       Building gen-stdlib.cpp
       Building gen-stdlib.cpp
diff --git a/stdlib.ispc b/stdlib.ispc
index a775c680..432d7528 100644
--- a/stdlib.ispc
+++ b/stdlib.ispc
@@ -295,6 +295,75 @@ static inline uniform int packed_store_active(uniform int a[], uniform int start
     return __packed_store_active(a, start, vals, __mask);
 }
 
+///////////////////////////////////////////////////////////////////////////
+// Atomics and memory barriers
+
+static inline void memory_barrier() {
+    __memory_barrier();
+}
+
+#define DEFINE_ATOMIC_OP(TA,TB,OPA,OPB)                                   \
+static inline TA atomic_##OPA##_global(uniform reference TA ref, TA value) { \
+    memory_barrier();                                                     \
+    TA ret = __atomic_##OPB##_##TB##_global(ref, value, __mask);          \
+    memory_barrier();                                                     \
+    return ret;                                                           \
+}
+
+DEFINE_ATOMIC_OP(int32,int32,add,add)
+DEFINE_ATOMIC_OP(int32,int32,subtract,sub)
+DEFINE_ATOMIC_OP(int32,int32,min,min)
+DEFINE_ATOMIC_OP(int32,int32,max,max)
+DEFINE_ATOMIC_OP(int32,int32,and,and)
+DEFINE_ATOMIC_OP(int32,int32,or,or)
+DEFINE_ATOMIC_OP(int32,int32,xor,xor)
+DEFINE_ATOMIC_OP(int32,int32,swap,swap)
+
+// For everything but atomic min and max, we can use the same
+// implementations for unsigned as for signed.
+DEFINE_ATOMIC_OP(unsigned int32,int32,add,add)
+DEFINE_ATOMIC_OP(unsigned int32,int32,subtract,sub)
+DEFINE_ATOMIC_OP(unsigned int32,uint32,min,umin)
+DEFINE_ATOMIC_OP(unsigned int32,uint32,max,umax)
+DEFINE_ATOMIC_OP(unsigned int32,int32,and,and)
+DEFINE_ATOMIC_OP(unsigned int32,int32,or,or)
+DEFINE_ATOMIC_OP(unsigned int32,int32,xor,xor)
+DEFINE_ATOMIC_OP(unsigned int32,int32,swap,swap)
+
+DEFINE_ATOMIC_OP(int64,int64,add,add)
+DEFINE_ATOMIC_OP(int64,int64,subtract,sub)
+DEFINE_ATOMIC_OP(int64,int64,min,min)
+DEFINE_ATOMIC_OP(int64,int64,max,max)
+DEFINE_ATOMIC_OP(int64,int64,and,and)
+DEFINE_ATOMIC_OP(int64,int64,or,or)
+DEFINE_ATOMIC_OP(int64,int64,xor,xor)
+DEFINE_ATOMIC_OP(int64,int64,swap,swap)
+
+// For everything but atomic min and max, we can use the same
+// implementations for unsigned as for signed.
+DEFINE_ATOMIC_OP(unsigned int64,int64,add,add)
+DEFINE_ATOMIC_OP(unsigned int64,int64,subtract,sub)
+DEFINE_ATOMIC_OP(unsigned int64,uint64,min,umin)
+DEFINE_ATOMIC_OP(unsigned int64,uint64,max,umax)
+DEFINE_ATOMIC_OP(unsigned int64,int64,and,and)
+DEFINE_ATOMIC_OP(unsigned int64,int64,or,or)
+DEFINE_ATOMIC_OP(unsigned int64,int64,xor,xor)
+DEFINE_ATOMIC_OP(unsigned int64,int64,swap,swap)
+
+#define ATOMIC_DECL_CMPXCHG(TA, TB)                                       \
+static inline TA atomic_compare_exchange_global(                          \
+    uniform reference TA ref, TA oldval, TA newval) {                     \
+    memory_barrier();                                                     \
+    TA ret = __atomic_compare_exchange_##TB##_global(ref, oldval, newval, __mask); \
+    memory_barrier();                                                     \
+    return ret;                                                           \
+}
+
+ATOMIC_DECL_CMPXCHG(int32, int32)
+ATOMIC_DECL_CMPXCHG(unsigned int32, int32)
+ATOMIC_DECL_CMPXCHG(int64, int64)
+ATOMIC_DECL_CMPXCHG(unsigned int64, int64)
+
 ///////////////////////////////////////////////////////////////////////////
 // Load/store from/to 8/16-bit types
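
As a reading aid (an editorial sketch, not part of the patch): each use of
the DEFINE_ATOMIC_OP macro above generates one type-specific wrapper around
the per-lane helper emitted from stdlib.m4. For example,
DEFINE_ATOMIC_OP(int32,int32,add,add) expands to roughly:

    static inline int32 atomic_add_global(uniform reference int32 ref, int32 value) {
        // Bracket the atomic with fences so it can't be reordered with
        // surrounding memory operations.
        memory_barrier();
        int32 ret = __atomic_add_int32_global(ref, value, __mask);
        memory_barrier();
        return ret;
    }
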
diff --git a/stdlib.m4 b/stdlib.m4
index 30c8c497..7d023aba 100644
--- a/stdlib.m4
+++ b/stdlib.m4
@@ -405,6 +405,95 @@ forloop(i, 1, eval($1-1), `
 }
 ')
 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; global_atomic
+;; Defines the implementation of a function that handles the mapping from
+;; an ispc atomic function to the underlying LLVM intrinsics. Specifically,
+;; the function handles looping over the active lanes, calling the underlying
+;; scalar atomic intrinsic for each one, and assembling the vector result.
+;;
+;; Takes four parameters:
+;; $1: vector width of the target
+;; $2: operation being performed (w.r.t. LLVM atomic intrinsic names)
+;;     (add, sub...)
+;; $3: LLVM type the atomic operates on (e.g. i32)
+;; $4: the same type, in ispc naming parlance (e.g. int32)
+
+define(`global_atomic', `
+
+declare $3 @llvm.atomic.load.$2.$3.p0$3($3 * %ptr, $3 %delta)
+
+define internal <$1 x $3> @__atomic_$2_$4_global($3 * %ptr, <$1 x $3> %val,
+                                                 <$1 x i32> %mask) nounwind alwaysinline {
+  %rptr = alloca <$1 x $3>
+  %rptr32 = bitcast <$1 x $3> * %rptr to $3 *
+
+  per_lane($1, <$1 x i32> %mask, `
+   %v_LANE_ID = extractelement <$1 x $3> %val, i32 LANE
+   %r_LANE_ID = call $3 @llvm.atomic.load.$2.$3.p0$3($3 * %ptr, $3 %v_LANE_ID)
+   %rp_LANE_ID = getelementptr $3 * %rptr32, i32 LANE
+   store $3 %r_LANE_ID, $3 * %rp_LANE_ID')
+
+  %r = load <$1 x $3> * %rptr
+  ret <$1 x $3> %r
+}
+')
+
+;; Macro to declare the function that implements the swap atomic.
+;; Takes three parameters:
+;; $1: vector width of the target
+;; $2: llvm type of the vector elements (e.g. i32)
+;; $3: ispc type of the elements (e.g. int32)
+
+define(`global_swap', `
+
+declare $2 @llvm.atomic.swap.$2.p0$2($2 * %ptr, $2 %val)
+
+define <$1 x $2> @__atomic_swap_$3_global($2* %ptr, <$1 x $2> %val,
+                                          <$1 x i32> %mask) nounwind alwaysinline {
+  %rptr = alloca <$1 x $2>
+  %rptr32 = bitcast <$1 x $2> * %rptr to $2 *
+
+  per_lane($1, <$1 x i32> %mask, `
+   %val_LANE_ID = extractelement <$1 x $2> %val, i32 LANE
+   %r_LANE_ID = call $2 @llvm.atomic.swap.$2.p0$2($2 * %ptr, $2 %val_LANE_ID)
+   %rp_LANE_ID = getelementptr $2 * %rptr32, i32 LANE
+   store $2 %r_LANE_ID, $2 * %rp_LANE_ID')
+
+  %r = load <$1 x $2> * %rptr
+  ret <$1 x $2> %r
+}
+')
+
+
+;; Similarly, macro to declare the function that implements the compare/exchange
+;; atomic. Takes three parameters:
+;; $1: vector width of the target
+;; $2: llvm type of the vector elements (e.g. i32)
+;; $3: ispc type of the elements (e.g. int32)
+
+define(`global_atomic_exchange', `
+
+declare $2 @llvm.atomic.cmp.swap.$2.p0$2($2 * %ptr, $2 %cmp, $2 %val)
+
+define <$1 x $2> @__atomic_compare_exchange_$3_global($2* %ptr, <$1 x $2> %cmp,
+                                                      <$1 x $2> %val, <$1 x i32> %mask) nounwind alwaysinline {
+  %rptr = alloca <$1 x $2>
+  %rptr32 = bitcast <$1 x $2> * %rptr to $2 *
+
+  per_lane($1, <$1 x i32> %mask, `
+   %cmp_LANE_ID = extractelement <$1 x $2> %cmp, i32 LANE
+   %val_LANE_ID = extractelement <$1 x $2> %val, i32 LANE
+   %r_LANE_ID = call $2 @llvm.atomic.cmp.swap.$2.p0$2($2 * %ptr, $2 %cmp_LANE_ID,
+                                                      $2 %val_LANE_ID)
+   %rp_LANE_ID = getelementptr $2 * %rptr32, i32 LANE
+   store $2 %r_LANE_ID, $2 * %rp_LANE_ID')
+
+  %r = load <$1 x $2> * %rptr
+  ret <$1 x $2> %r
+}
+')
+
 define(`stdlib_core', `
@@ -543,6 +632,48 @@ define internal float @__stdlib_pow(float, float) nounwind readnone alwaysinline
 %r = call float @powf(float %0, float %1)
 ret float %r
 }
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; atomics and memory barriers
+
+declare void @llvm.memory.barrier(i1 %loadload, i1 %loadstore, i1 %storeload,
+                                  i1 %storestore, i1 %device)
+
+define internal void @__memory_barrier() nounwind readnone alwaysinline {
+  ;; see http://llvm.org/bugs/show_bug.cgi?id=2829. It seems like we
+  ;; only get an MFENCE on x86 if "device" is true, but IMHO we should
+  ;; also get one in the case where the first 4 args are true and it is
+  ;; false. So we just always set that to true...
+  call void @llvm.memory.barrier(i1 true, i1 true, i1 true, i1 true, i1 true)
+  ret void
+}
+
+global_atomic($1, add, i32, int32)
+global_atomic($1, sub, i32, int32)
+global_atomic($1, and, i32, int32)
+global_atomic($1, or, i32, int32)
+global_atomic($1, xor, i32, int32)
+global_atomic($1, min, i32, int32)
+global_atomic($1, max, i32, int32)
+global_atomic($1, umin, i32, uint32)
+global_atomic($1, umax, i32, uint32)
+
+global_atomic($1, add, i64, int64)
+global_atomic($1, sub, i64, int64)
+global_atomic($1, and, i64, int64)
+global_atomic($1, or, i64, int64)
+global_atomic($1, xor, i64, int64)
+global_atomic($1, min, i64, int64)
+global_atomic($1, max, i64, int64)
+global_atomic($1, umin, i64, uint64)
+global_atomic($1, umax, i64, uint64)
+
+global_swap($1, i32, int32)
+global_swap($1, i64, int64)
+
+global_atomic_exchange($1, i32, int32)
+global_atomic_exchange($1, i64, int64)
+
 ')
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
diff --git a/tests/atomics-1.ispc b/tests/atomics-1.ispc
new file mode 100644
index 00000000..12d64d85
--- /dev/null
+++ b/tests/atomics-1.ispc
@@ -0,0 +1,14 @@
+
+export uniform int width() { return programCount; }
+
+uniform unsigned int32 s = 0;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    float a = aFOO[programIndex];
+    float b = atomic_add_global(s, 1);
+    RET[programIndex] = reduce_add(b);
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = reduce_add(programIndex);
+}
diff --git a/tests/atomics-2.ispc b/tests/atomics-2.ispc
new file mode 100644
index 00000000..c32f52dd
--- /dev/null
+++ b/tests/atomics-2.ispc
@@ -0,0 +1,14 @@
+
+export uniform int width() { return programCount; }
+
+uniform int64 s = 0;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    float a = aFOO[programIndex];
+    float b = atomic_add_global(s, 1);
+    RET[programIndex] = reduce_add(b);
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = reduce_add(programIndex);
+}
diff --git a/tests/atomics-3.ispc b/tests/atomics-3.ispc
new file mode 100644
index 00000000..c7282fdb
--- /dev/null
+++ b/tests/atomics-3.ispc
@@ -0,0 +1,14 @@
+
+export uniform int width() { return programCount; }
+
+uniform int32 s = 0xff;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    float a = aFOO[programIndex];
+    float b = atomic_xor_global(s, 0xfffffff0);
+    RET[programIndex] = s;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 0xff;
+}
diff --git a/tests/atomics-4.ispc b/tests/atomics-4.ispc
new file mode 100644
index 00000000..4a0ea6dc
--- /dev/null
+++ b/tests/atomics-4.ispc
@@ -0,0 +1,14 @@
+
+export uniform int width() { return programCount; }
+
+uniform int32 s = 0;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    float a = aFOO[programIndex];
+    float b = atomic_or_global(s, (1<