From 5bcc611409b27f85cf62ccc1d25e941a3de2b54e Mon Sep 17 00:00:00 2001
From: Matt Pharr
Date: Mon, 4 Jul 2011 17:20:42 +0100
Subject: [PATCH] Implement global atomics and a memory barrier in the standard library.

This checkin provides the standard set of atomic operations and a memory
barrier in the ispc standard library. Both signed and unsigned 32- and
64-bit integer types are supported.
---
 Makefile              |    2 +-
 docs/ReleaseNotes.txt |   17 ++++--
 docs/build.sh         |    2 +-
 docs/ispc.txt         |   64 +++++++++++++++++++++
 ispc.vcxproj          |    4 +-
 stdlib.ispc           |   69 ++++++++++++++++++++++
 stdlib.m4             |  131 ++++++++++++++++++++++++++++++++++++++++++
 tests/atomics-1.ispc  |   14 +++++
 tests/atomics-2.ispc  |   14 +++++
 tests/atomics-3.ispc  |   14 +++++
 tests/atomics-4.ispc  |   14 +++++
 tests/atomics-5.ispc  |   14 +++++
 tests/atomics-6.ispc  |   14 +++++
 13 files changed, 364 insertions(+), 9 deletions(-)
 create mode 100644 tests/atomics-1.ispc
 create mode 100644 tests/atomics-2.ispc
 create mode 100644 tests/atomics-3.ispc
 create mode 100644 tests/atomics-4.ispc
 create mode 100644 tests/atomics-5.ispc
 create mode 100644 tests/atomics-6.ispc

diff --git a/Makefile b/Makefile
index e5d0541c..fcf708bf 100644
--- a/Makefile
+++ b/Makefile
@@ -121,7 +121,7 @@ objs/stdlib-c.o: objs/stdlib-c.cpp
 
 objs/stdlib_ispc.cpp: stdlib.ispc
 	@echo Creating C++ source from $<
-	@$(CPP) -DISPC=1 -DPI=3.1415926536 $< | ./stdlib2cpp.py > $@
+	@$(CLANG) -E -x c -DISPC=1 -DPI=3.1415926536 $< -o - | ./stdlib2cpp.py > $@
 
 objs/stdlib_ispc.o: objs/stdlib_ispc.cpp
 	@echo Compiling $<
diff --git a/docs/ReleaseNotes.txt b/docs/ReleaseNotes.txt
index a8159e1a..872cbe76 100644
--- a/docs/ReleaseNotes.txt
+++ b/docs/ReleaseNotes.txt
@@ -1,9 +1,9 @@
-=== v1.0.3 === (not yet released)
+=== v1.0.3 === (4 July 2011)
 
 ispc now has a built-in pre-processor (from LLVM's clang compiler).
-(Thanks to Pete Couperus!) It is therefore no longer necessary to use
-cl.exe for preprocessing before on Windows; the MSVC proejct files for the
-examples have been updated accordingly.
+(Thanks to Pete Couperus for this patch!) It is therefore no longer
+necessary to use cl.exe for preprocessing on Windows; the MSVC project
+files for the examples have been updated accordingly.
 
 There is another variant of the shuffle() function in the standard
 library: "shuffle(v0, v1, int permute)", where the
@@ -11,8 +11,15 @@
 permutation vector indexes over the concatenation of the two vectors
 (e.g. the value 0 corresponds to the first element of v0, the value
 2*programCount-1 corresponds to the last element of v1, etc.)
 
+ispc now supports the usual range of atomic operations (add, subtract, min,
+max, and, or, and xor) as well as atomic swap and atomic compare and
+exchange. There is also a facility for inserting memory fences. See the
+"Atomic Operations and Memory Fences" section of the user's guide
+(http://ispc.github.com/ispc.html#atomic-operations-and-memory-fences) for
+more information.
+
 There are now both 'signed' and 'unsigned' variants of the standard library
-functions like packed_load_active() that that references to arrays of
+functions like packed_load_active() that take references to arrays of
 signed int32s and unsigned int32s respectively. (The
 {load_from,store_to}_{int8,int16}() functions have similarly been
 augmented to have both 'signed' and 'unsigned' variants.)
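
To make the new API concrete, here is a minimal editorial sketch (not part
of the patch) of the kind of use the atomics enable. The kernel and array
names are hypothetical; only the documented atomic_add_global signature is
assumed:

    // Each program instance atomically claims a unique slot in out[]:
    // atomic_add_global returns the pre-increment value of the counter,
    // so every running instance receives a distinct index.
    uniform unsigned int32 counter = 0;

    export void append_values(uniform float out[], uniform float in[]) {
        float v = in[programIndex];
        unsigned int32 slot = atomic_add_global(counter, 1);
        out[slot] = v;
    }
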
diff --git a/docs/build.sh b/docs/build.sh
index 6de1e93d..cca3bee6 100755
--- a/docs/build.sh
+++ b/docs/build.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-rst2html ispc.txt > ispc.html
+rst2html.py ispc.txt > ispc.html
 
 #rst2latex --section-numbering --documentclass=article --documentoptions=DIV=9,10pt,letterpaper ispc.txt > ispc.tex
 #pdflatex ispc.tex
diff --git a/docs/ispc.txt b/docs/ispc.txt
index d99661aa..52f02e34 100644
--- a/docs/ispc.txt
+++ b/docs/ispc.txt
@@ -76,6 +76,7 @@ Contents:
       + `Output Functions`_
       + `Cross-Program Instance Operations`_
       + `Packed Load and Store Operations`_
+      + `Atomic Operations and Memory Fences`_
      + `Low-Level Bits`_
 
     * `Interoperability with the Application`_
@@ -1811,6 +1812,69 @@ where the ``i`` th element of ``x`` has been replaced with the value ``v``
 
     int insert(int x, uniform int i, uniform int v)
 
 
+Atomic Operations and Memory Fences
+-----------------------------------
+
+The usual range of atomic memory operations is provided in ``ispc``. As an
+example, consider the 32-bit integer atomic add routine:
+
+::
+
+    int32 atomic_add_global(reference uniform int32 val, int32 delta)
+
+The semantics are the expected ones for an atomic add function: the value
+"val" has the value "delta" added to it atomically, and the old value of
+"val" is returned from the function. (Thus, if multiple processors
+simultaneously issue atomic adds to the same memory location, the adds will
+be serialized by the hardware so that the correct result is computed in the
+end.)
+
+One thing to note is that the value being added to here is a
+``uniform`` integer, while the increment amount and the return value are
+``varying``. In other words, the semantics are that each running program
+instance individually issues the atomic operation with its own ``delta``
+value and gets the previous value of ``val`` back in return.
+
+Here are the declarations of the ``int32`` variants of these functions.
+There are also ``int64`` equivalents as well as variants that take
+``unsigned`` ``int32`` and ``int64`` values.
+
+::
+
+    int32 atomic_add_global(reference uniform int32 val, int32 value)
+    int32 atomic_subtract_global(reference uniform int32 val, int32 value)
+    int32 atomic_min_global(reference uniform int32 val, int32 value)
+    int32 atomic_max_global(reference uniform int32 val, int32 value)
+    int32 atomic_and_global(reference uniform int32 val, int32 value)
+    int32 atomic_or_global(reference uniform int32 val, int32 value)
+    int32 atomic_xor_global(reference uniform int32 val, int32 value)
+    int32 atomic_swap_global(reference uniform int32 val, int32 newval)
+
+There is also an atomic "compare and exchange" function; it atomically
+compares the value in "val" to "compare"--if they match, it assigns
+"newval" to "val". In either case, the old value of "val" is returned.
+(As with the other atomic operations, there are also ``unsigned`` and
+64-bit variants of this function.)
+
+::
+
+    int32 atomic_compare_exchange_global(reference uniform int32 val,
+                                         int32 compare, int32 newval)
+
+``ispc`` also has a standard library routine that inserts a memory barrier
+into the code; it ensures that all memory reads and writes prior to the
+barrier complete before any reads or writes after the barrier are issued.
+See the `Linux kernel documentation on memory barriers`_ for an excellent
+writeup on the need for and the use of memory barriers in multi-threaded
+code.
+
+.. _Linux kernel documentation on memory barriers: http://www.kernel.org/doc/Documentation/memory-barriers.txt
+
+::
+
+    void memory_barrier();
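+
+For illustration, an atomic operation that isn't provided directly--an
+atomic multiply, say--can be built from ``atomic_compare_exchange_global``
+with a per-instance retry loop. (This is an untested sketch for exposition,
+not a standard library routine:)
+
+::
+
+    static int32 atomic_multiply_global(reference uniform int32 val,
+                                        int32 factor) {
+        int32 old, updated;
+        do {
+            // Snapshot the current value and compute the desired result;
+            // retry if another instance modified "val" in the meantime.
+            old = val;
+            updated = old * factor;
+        } while (atomic_compare_exchange_global(val, old, updated) != old);
+        return old;
+    }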
+
+
 Low-Level Bits
 --------------
 
diff --git a/ispc.vcxproj b/ispc.vcxproj
index 40bacc74..81670e0b 100755
--- a/ispc.vcxproj
+++ b/ispc.vcxproj
@@ -59,9 +59,9 @@
       Document
-      cl /EP /TP %(Filename).ispc /DISPC=1 /DPI=3.1415926535 | python stdlib2cpp.py > gen-stdlib.cpp
+      clang -E -x c %(Filename).ispc -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py > gen-stdlib.cpp
       gen-stdlib.cpp
-      cl /EP /TP %(Filename).ispc /DISPC=1 /DPI=3.1415926535 | python stdlib2cpp.py > gen-stdlib.cpp
+      clang -E -x c %(Filename).ispc -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py > gen-stdlib.cpp
       gen-stdlib.cpp
       Building gen-stdlib.cpp
       Building gen-stdlib.cpp
diff --git a/stdlib.ispc b/stdlib.ispc
index a775c680..432d7528 100644
--- a/stdlib.ispc
+++ b/stdlib.ispc
@@ -295,6 +295,75 @@ static inline uniform int packed_store_active(uniform int a[], uniform int start
     return __packed_store_active(a, start, vals, __mask);
 }
 
+///////////////////////////////////////////////////////////////////////////
+// Atomics and memory barriers
+
+static inline void memory_barrier() {
+    __memory_barrier();
+}
+
+#define DEFINE_ATOMIC_OP(TA,TB,OPA,OPB)                                   \
+static inline TA atomic_##OPA##_global(uniform reference TA ref, TA value) { \
+    memory_barrier();                                                     \
+    TA ret = __atomic_##OPB##_##TB##_global(ref, value, __mask);          \
+    memory_barrier();                                                     \
+    return ret;                                                           \
+}
+
+DEFINE_ATOMIC_OP(int32,int32,add,add)
+DEFINE_ATOMIC_OP(int32,int32,subtract,sub)
+DEFINE_ATOMIC_OP(int32,int32,min,min)
+DEFINE_ATOMIC_OP(int32,int32,max,max)
+DEFINE_ATOMIC_OP(int32,int32,and,and)
+DEFINE_ATOMIC_OP(int32,int32,or,or)
+DEFINE_ATOMIC_OP(int32,int32,xor,xor)
+DEFINE_ATOMIC_OP(int32,int32,swap,swap)
+
+// For everything but atomic min and max, we can use the same
+// implementations for unsigned as for signed.
+DEFINE_ATOMIC_OP(unsigned int32,int32,add,add)
+DEFINE_ATOMIC_OP(unsigned int32,int32,subtract,sub)
+DEFINE_ATOMIC_OP(unsigned int32,uint32,min,umin)
+DEFINE_ATOMIC_OP(unsigned int32,uint32,max,umax)
+DEFINE_ATOMIC_OP(unsigned int32,int32,and,and)
+DEFINE_ATOMIC_OP(unsigned int32,int32,or,or)
+DEFINE_ATOMIC_OP(unsigned int32,int32,xor,xor)
+DEFINE_ATOMIC_OP(unsigned int32,int32,swap,swap)
+
+DEFINE_ATOMIC_OP(int64,int64,add,add)
+DEFINE_ATOMIC_OP(int64,int64,subtract,sub)
+DEFINE_ATOMIC_OP(int64,int64,min,min)
+DEFINE_ATOMIC_OP(int64,int64,max,max)
+DEFINE_ATOMIC_OP(int64,int64,and,and)
+DEFINE_ATOMIC_OP(int64,int64,or,or)
+DEFINE_ATOMIC_OP(int64,int64,xor,xor)
+DEFINE_ATOMIC_OP(int64,int64,swap,swap)
+
+// For everything but atomic min and max, we can use the same
+// implementations for unsigned as for signed.
+DEFINE_ATOMIC_OP(unsigned int64,int64,add,add)
+DEFINE_ATOMIC_OP(unsigned int64,int64,subtract,sub)
+DEFINE_ATOMIC_OP(unsigned int64,uint64,min,umin)
+DEFINE_ATOMIC_OP(unsigned int64,uint64,max,umax)
+DEFINE_ATOMIC_OP(unsigned int64,int64,and,and)
+DEFINE_ATOMIC_OP(unsigned int64,int64,or,or)
+DEFINE_ATOMIC_OP(unsigned int64,int64,xor,xor)
+DEFINE_ATOMIC_OP(unsigned int64,int64,swap,swap)
+
+#define ATOMIC_DECL_CMPXCHG(TA, TB)                                       \
+static inline TA atomic_compare_exchange_global(                          \
+    uniform reference TA ref, TA oldval, TA newval) {                     \
+    memory_barrier();                                                     \
+    TA ret = __atomic_compare_exchange_##TB##_global(ref, oldval, newval, __mask); \
+    memory_barrier();                                                     \
+    return ret;                                                           \
+}
+
+ATOMIC_DECL_CMPXCHG(int32, int32)
+ATOMIC_DECL_CMPXCHG(unsigned int32, int32)
+ATOMIC_DECL_CMPXCHG(int64, int64)
+ATOMIC_DECL_CMPXCHG(unsigned int64, int64)
+
 ///////////////////////////////////////////////////////////////////////////
 // Load/store from/to 8/16-bit types
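
As a reading aid (an editorial sketch, not part of the patch): each use of
the DEFINE_ATOMIC_OP macro above generates one type-specific wrapper around
the per-lane helper emitted from stdlib.m4. For example,
DEFINE_ATOMIC_OP(int32,int32,add,add) expands to roughly:

    static inline int32 atomic_add_global(uniform reference int32 ref, int32 value) {
        // Bracket the atomic with fences so it can't be reordered with
        // surrounding memory operations.
        memory_barrier();
        int32 ret = __atomic_add_int32_global(ref, value, __mask);
        memory_barrier();
        return ret;
    }
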
diff --git a/stdlib.m4 b/stdlib.m4
index 30c8c497..7d023aba 100644
--- a/stdlib.m4
+++ b/stdlib.m4
@@ -405,6 +405,95 @@ forloop(i, 1, eval($1-1), `
 }
 ')
 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; global_atomic
+;; Defines the implementation of a function that handles the mapping from
+;; an ispc atomic function to the underlying LLVM intrinsics. Specifically,
+;; the function handles looping over the active lanes, calling the underlying
+;; scalar atomic intrinsic for each one, and assembling the vector result.
+;;
+;; Takes four parameters:
+;; $1: vector width of the target
+;; $2: operation being performed (w.r.t. LLVM atomic intrinsic names)
+;;     (add, sub...)
+;; $3: LLVM type the atomic operates on (e.g. i32)
+;; $4: the same type, in ispc naming parlance (e.g. int32)
+
+define(`global_atomic', `
+
+declare $3 @llvm.atomic.load.$2.$3.p0$3($3 * %ptr, $3 %delta)
+
+define internal <$1 x $3> @__atomic_$2_$4_global($3 * %ptr, <$1 x $3> %val,
+                                                 <$1 x i32> %mask) nounwind alwaysinline {
+  %rptr = alloca <$1 x $3>
+  %rptr32 = bitcast <$1 x $3> * %rptr to $3 *
+
+  per_lane($1, <$1 x i32> %mask, `
+   %v_LANE_ID = extractelement <$1 x $3> %val, i32 LANE
+   %r_LANE_ID = call $3 @llvm.atomic.load.$2.$3.p0$3($3 * %ptr, $3 %v_LANE_ID)
+   %rp_LANE_ID = getelementptr $3 * %rptr32, i32 LANE
+   store $3 %r_LANE_ID, $3 * %rp_LANE_ID')
+
+  %r = load <$1 x $3> * %rptr
+  ret <$1 x $3> %r
+}
+')
+
+;; Macro to declare the function that implements the swap atomic.
+;; Takes three parameters:
+;; $1: vector width of the target
+;; $2: llvm type of the vector elements (e.g. i32)
+;; $3: ispc type of the elements (e.g. int32)
+
+define(`global_swap', `
+
+declare $2 @llvm.atomic.swap.$2.p0$2($2 * %ptr, $2 %val)
+
+define <$1 x $2> @__atomic_swap_$3_global($2* %ptr, <$1 x $2> %val,
+                                          <$1 x i32> %mask) nounwind alwaysinline {
+  %rptr = alloca <$1 x $2>
+  %rptr32 = bitcast <$1 x $2> * %rptr to $2 *
+
+  per_lane($1, <$1 x i32> %mask, `
+   %val_LANE_ID = extractelement <$1 x $2> %val, i32 LANE
+   %r_LANE_ID = call $2 @llvm.atomic.swap.$2.p0$2($2 * %ptr, $2 %val_LANE_ID)
+   %rp_LANE_ID = getelementptr $2 * %rptr32, i32 LANE
+   store $2 %r_LANE_ID, $2 * %rp_LANE_ID')
+
+  %r = load <$1 x $2> * %rptr
+  ret <$1 x $2> %r
+}
+')
+
+
+;; Similarly, macro to declare the function that implements the compare/exchange
+;; atomic. Takes three parameters:
+;; $1: vector width of the target
+;; $2: llvm type of the vector elements (e.g. i32)
+;; $3: ispc type of the elements (e.g. int32)
+
+define(`global_atomic_exchange', `
+
+declare $2 @llvm.atomic.cmp.swap.$2.p0$2($2 * %ptr, $2 %cmp, $2 %val)
+
+define <$1 x $2> @__atomic_compare_exchange_$3_global($2* %ptr, <$1 x $2> %cmp,
+                                                      <$1 x $2> %val, <$1 x i32> %mask) nounwind alwaysinline {
+  %rptr = alloca <$1 x $2>
+  %rptr32 = bitcast <$1 x $2> * %rptr to $2 *
+
+  per_lane($1, <$1 x i32> %mask, `
+   %cmp_LANE_ID = extractelement <$1 x $2> %cmp, i32 LANE
+   %val_LANE_ID = extractelement <$1 x $2> %val, i32 LANE
+   %r_LANE_ID = call $2 @llvm.atomic.cmp.swap.$2.p0$2($2 * %ptr, $2 %cmp_LANE_ID,
+                                                      $2 %val_LANE_ID)
+   %rp_LANE_ID = getelementptr $2 * %rptr32, i32 LANE
+   store $2 %r_LANE_ID, $2 * %rp_LANE_ID')
+
+  %r = load <$1 x $2> * %rptr
+  ret <$1 x $2> %r
+}
+')
+
 define(`stdlib_core', `
@@ -543,6 +632,48 @@ define internal float @__stdlib_pow(float, float) nounwind readnone alwaysinline
 %r = call float @powf(float %0, float %1)
 ret float %r
 }
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; atomics and memory barriers
+
+declare void @llvm.memory.barrier(i1 %loadload, i1 %loadstore, i1 %storeload,
+                                  i1 %storestore, i1 %device)
+
+define internal void @__memory_barrier() nounwind readnone alwaysinline {
+  ;; see http://llvm.org/bugs/show_bug.cgi?id=2829. It seems like we
+  ;; only get an MFENCE on x86 if "device" is true, but IMHO we should
+  ;; also get one in the case where the first 4 args are true and it is
+  ;; false. So we just always set that to true...
+  call void @llvm.memory.barrier(i1 true, i1 true, i1 true, i1 true, i1 true)
+  ret void
+}
+
+global_atomic($1, add, i32, int32)
+global_atomic($1, sub, i32, int32)
+global_atomic($1, and, i32, int32)
+global_atomic($1, or, i32, int32)
+global_atomic($1, xor, i32, int32)
+global_atomic($1, min, i32, int32)
+global_atomic($1, max, i32, int32)
+global_atomic($1, umin, i32, uint32)
+global_atomic($1, umax, i32, uint32)
+
+global_atomic($1, add, i64, int64)
+global_atomic($1, sub, i64, int64)
+global_atomic($1, and, i64, int64)
+global_atomic($1, or, i64, int64)
+global_atomic($1, xor, i64, int64)
+global_atomic($1, min, i64, int64)
+global_atomic($1, max, i64, int64)
+global_atomic($1, umin, i64, uint64)
+global_atomic($1, umax, i64, uint64)
+
+global_swap($1, i32, int32)
+global_swap($1, i64, int64)
+
+global_atomic_exchange($1, i32, int32)
+global_atomic_exchange($1, i64, int64)
+
 ')
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
diff --git a/tests/atomics-1.ispc b/tests/atomics-1.ispc
new file mode 100644
index 00000000..12d64d85
--- /dev/null
+++ b/tests/atomics-1.ispc
@@ -0,0 +1,14 @@
+
+export uniform int width() { return programCount; }
+
+uniform unsigned int32 s = 0;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    float a = aFOO[programIndex];
+    float b = atomic_add_global(s, 1);
+    RET[programIndex] = reduce_add(b);
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = reduce_add(programIndex);
+}
diff --git a/tests/atomics-2.ispc b/tests/atomics-2.ispc
new file mode 100644
index 00000000..c32f52dd
--- /dev/null
+++ b/tests/atomics-2.ispc
@@ -0,0 +1,14 @@
+
+export uniform int width() { return programCount; }
+
+uniform int64 s = 0;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    float a = aFOO[programIndex];
+    float b = atomic_add_global(s, 1);
+    RET[programIndex] = reduce_add(b);
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = reduce_add(programIndex);
+}
diff --git a/tests/atomics-3.ispc b/tests/atomics-3.ispc
new file mode 100644
index 00000000..c7282fdb
--- /dev/null
+++ b/tests/atomics-3.ispc
@@ -0,0 +1,14 @@
+
+export uniform int width() { return programCount; }
+
+uniform int32 s = 0xff;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    float a = aFOO[programIndex];
+    float b = atomic_xor_global(s, 0xfffffff0);
+    RET[programIndex] = s;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 0xff;
+}
diff --git a/tests/atomics-4.ispc b/tests/atomics-4.ispc
new file mode 100644
index 00000000..4a0ea6dc
--- /dev/null
+++ b/tests/atomics-4.ispc
@@ -0,0 +1,14 @@
+
+export uniform int width() { return programCount; }
+
+uniform int32 s = 0;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    float a = aFOO[programIndex];
+    float b = atomic_or_global(s, (1<