From 606cbab0d4e8014bd38fedd4aba04189580496eb Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Fri, 26 Aug 2011 10:35:24 -0700 Subject: [PATCH] Performance improvements for global min/max atomics. Issue #57. Compute a "local" min/max across the active program instances and then do a single atomic memory op. Added a few tests to exercise global min/max atomics (which were previously untested!) --- builtins.m4 | 40 ++++++++++++++++++++++++++++++++-------- stdlib.ispc | 28 ++++++++++++++++++++-------- tests/atomics-7.ispc | 14 ++++++++++++++ tests/atomics-8.ispc | 14 ++++++++++++++ 4 files changed, 80 insertions(+), 16 deletions(-) create mode 100644 tests/atomics-7.ispc create mode 100644 tests/atomics-8.ispc diff --git a/builtins.m4 b/builtins.m4 index e7da41b5..c8019847 100644 --- a/builtins.m4 +++ b/builtins.m4 @@ -656,6 +656,30 @@ define internal <$1 x $3> @__atomic_$2_$4_global($3 * %ptr, <$1 x $3> %val, } ') +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; global_atomic_uniform +;; Defines the implementation of a function that handles the mapping from +;; an ispc atomic function to the underlying LLVM intrinsics. This variant +;; just calls the atomic once, for the given uniform value +;; +;; Takes four parameters: +;; $1: vector width of the target +;; $2: operation being performed (w.r.t. LLVM atomic intrinsic names) +;; (add, sub...) +;; $3: return type of the LLVM atomic (e.g. i32) +;; $4: return type of the LLVM atomic, in ispc naming parlance (e.g. int32) + +define(`global_atomic_uniform', ` + +declare $3 @llvm.atomic.load.$2.$3.p0$3($3 * %ptr, $3 %delta) + +define internal $3 @__atomic_$2_$4_global($3 * %ptr, $3 %val, + <$1 x i32> %mask) nounwind alwaysinline { + %r = call $3 @llvm.atomic.load.$2.$3.p0$3($3 * %ptr, $3 %val) + ret $3 %r +} +') + ;; Macro to declare the function that implements the swap atomic. 
;; Takes three parameters: ;; $1: vector width of the target @@ -1124,20 +1148,20 @@ global_atomic($1, sub, i32, int32) global_atomic($1, and, i32, int32) global_atomic($1, or, i32, int32) global_atomic($1, xor, i32, int32) -global_atomic($1, min, i32, int32) -global_atomic($1, max, i32, int32) -global_atomic($1, umin, i32, uint32) -global_atomic($1, umax, i32, uint32) +global_atomic_uniform($1, min, i32, int32) +global_atomic_uniform($1, max, i32, int32) +global_atomic_uniform($1, umin, i32, uint32) +global_atomic_uniform($1, umax, i32, uint32) global_atomic($1, add, i64, int64) global_atomic($1, sub, i64, int64) global_atomic($1, and, i64, int64) global_atomic($1, or, i64, int64) global_atomic($1, xor, i64, int64) -global_atomic($1, min, i64, int64) -global_atomic($1, max, i64, int64) -global_atomic($1, umin, i64, uint64) -global_atomic($1, umax, i64, uint64) +global_atomic_uniform($1, min, i64, int64) +global_atomic_uniform($1, max, i64, int64) +global_atomic_uniform($1, umin, i64, uint64) +global_atomic_uniform($1, umax, i64, uint64) global_swap($1, i32, int32) global_swap($1, i64, int64) diff --git a/stdlib.ispc b/stdlib.ispc index 7ff5fce5..532d723f 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -583,10 +583,22 @@ static inline TA atomic_##OPA##_global(uniform reference TA ref, TA value) { \ return ret; \ } +#define DEFINE_ATOMIC_MINMAX_OP(TA,TB,OPA,OPB) \ +static inline TA atomic_##OPA##_global(uniform reference TA ref, TA value) { \ + uniform TA oneval = reduce_##OPA(value); \ + TA ret; \ + if (lanemask() != 0) { \ + memory_barrier(); \ + ret = __atomic_##OPB##_##TB##_global(ref, oneval, __mask); \ + memory_barrier(); \ + } \ + return ret; \ +} + DEFINE_ATOMIC_OP(int32,int32,add,add) DEFINE_ATOMIC_OP(int32,int32,subtract,sub) -DEFINE_ATOMIC_OP(int32,int32,min,min) -DEFINE_ATOMIC_OP(int32,int32,max,max) +DEFINE_ATOMIC_MINMAX_OP(int32,int32,min,min) +DEFINE_ATOMIC_MINMAX_OP(int32,int32,max,max) DEFINE_ATOMIC_OP(int32,int32,and,and) 
DEFINE_ATOMIC_OP(int32,int32,or,or) DEFINE_ATOMIC_OP(int32,int32,xor,xor) @@ -596,8 +608,8 @@ DEFINE_ATOMIC_OP(int32,int32,swap,swap) // implementations for unsigned as for signed. DEFINE_ATOMIC_OP(unsigned int32,int32,add,add) DEFINE_ATOMIC_OP(unsigned int32,int32,subtract,sub) -DEFINE_ATOMIC_OP(unsigned int32,uint32,min,umin) -DEFINE_ATOMIC_OP(unsigned int32,uint32,max,umax) +DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,min,umin) +DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,max,umax) DEFINE_ATOMIC_OP(unsigned int32,int32,and,and) DEFINE_ATOMIC_OP(unsigned int32,int32,or,or) DEFINE_ATOMIC_OP(unsigned int32,int32,xor,xor) @@ -607,8 +619,8 @@ DEFINE_ATOMIC_OP(float,float,swap,swap) DEFINE_ATOMIC_OP(int64,int64,add,add) DEFINE_ATOMIC_OP(int64,int64,subtract,sub) -DEFINE_ATOMIC_OP(int64,int64,min,min) -DEFINE_ATOMIC_OP(int64,int64,max,max) +DEFINE_ATOMIC_MINMAX_OP(int64,int64,min,min) +DEFINE_ATOMIC_MINMAX_OP(int64,int64,max,max) DEFINE_ATOMIC_OP(int64,int64,and,and) DEFINE_ATOMIC_OP(int64,int64,or,or) DEFINE_ATOMIC_OP(int64,int64,xor,xor) @@ -618,8 +630,8 @@ DEFINE_ATOMIC_OP(int64,int64,swap,swap) // implementations for unsigned as for signed. 
DEFINE_ATOMIC_OP(unsigned int64,int64,add,add) DEFINE_ATOMIC_OP(unsigned int64,int64,subtract,sub) -DEFINE_ATOMIC_OP(unsigned int64,uint64,min,umin) -DEFINE_ATOMIC_OP(unsigned int64,uint64,max,umax) +DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,min,umin) +DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,max,umax) DEFINE_ATOMIC_OP(unsigned int64,int64,and,and) DEFINE_ATOMIC_OP(unsigned int64,int64,or,or) DEFINE_ATOMIC_OP(unsigned int64,int64,xor,xor) diff --git a/tests/atomics-7.ispc b/tests/atomics-7.ispc new file mode 100644 index 00000000..c32f52dd --- /dev/null +++ b/tests/atomics-7.ispc @@ -0,0 +1,14 @@ + +export uniform int width() { return programCount; } + +uniform int64 s = 0; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + float a = aFOO[programIndex]; + float b = atomic_add_global(s, 1); + RET[programIndex] = reduce_add(b); +} + +export void result(uniform float RET[]) { + RET[programIndex] = reduce_add(programIndex); +} diff --git a/tests/atomics-8.ispc b/tests/atomics-8.ispc new file mode 100644 index 00000000..ec34ba8c --- /dev/null +++ b/tests/atomics-8.ispc @@ -0,0 +1,14 @@ + +export uniform int width() { return programCount; } + +uniform int32 s = 0; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + int32 a = aFOO[programIndex]; + float b = atomic_min_global(s, a); + RET[programIndex] = reduce_min(b); +} + +export void result(uniform float RET[]) { + RET[programIndex] = reduce_min(programIndex); +}