Performance improvements for global min/max atomics. Issue #57.
Compute a "local" min/max across the active program instances and then do a single atomic memory op. Added a few tests to exercise global min/max atomics (which were previously untested!)
This commit is contained in:
40
builtins.m4
40
builtins.m4
@@ -656,6 +656,30 @@ define internal <$1 x $3> @__atomic_$2_$4_global($3 * %ptr, <$1 x $3> %val,
|
||||
}
|
||||
')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; global_atomic_uniform
|
||||
;; Defines the implementation of a function that handles the mapping from
|
||||
;; an ispc atomic function to the underlying LLVM intrinsics. This variant
|
||||
;; just calls the atomic once, for the given uniform value
|
||||
;;
|
||||
;; Takes four parameters:
|
||||
;; $1: vector width of the target
|
||||
;; $2: operation being performed (w.r.t. LLVM atomic intrinsic names)
|
||||
;; (add, sub...)
|
||||
;; $3: return type of the LLVM atomic (e.g. i32)
|
||||
;; $4: return type of the LLVM atomic, in ispc naming parlance (e.g. int32)
|
||||
|
||||
;; Note: the generated function takes the execution mask as its last
;; parameter but never reads it. It issues exactly one call to the LLVM
;; atomic with the single value it was given and returns that result
;; unchanged.
define(`global_atomic_uniform', `
|
||||
|
||||
declare $3 @llvm.atomic.load.$2.$3.p0$3($3 * %ptr, $3 %delta)
|
||||
|
||||
define internal $3 @__atomic_$2_$4_global($3 * %ptr, $3 %val,
|
||||
<$1 x i32> %mask) nounwind alwaysinline {
|
||||
%r = call $3 @llvm.atomic.load.$2.$3.p0$3($3 * %ptr, $3 %val)
|
||||
ret $3 %r
|
||||
}
|
||||
')
|
||||
|
||||
;; Macro to declare the function that implements the swap atomic.
|
||||
;; Takes three parameters:
|
||||
;; $1: vector width of the target
|
||||
@@ -1124,20 +1148,20 @@ global_atomic($1, sub, i32, int32)
|
||||
global_atomic($1, and, i32, int32)
|
||||
global_atomic($1, or, i32, int32)
|
||||
global_atomic($1, xor, i32, int32)
|
||||
global_atomic($1, min, i32, int32)
|
||||
global_atomic($1, max, i32, int32)
|
||||
global_atomic($1, umin, i32, uint32)
|
||||
global_atomic($1, umax, i32, uint32)
|
||||
global_atomic_uniform($1, min, i32, int32)
|
||||
global_atomic_uniform($1, max, i32, int32)
|
||||
global_atomic_uniform($1, umin, i32, uint32)
|
||||
global_atomic_uniform($1, umax, i32, uint32)
|
||||
|
||||
global_atomic($1, add, i64, int64)
|
||||
global_atomic($1, sub, i64, int64)
|
||||
global_atomic($1, and, i64, int64)
|
||||
global_atomic($1, or, i64, int64)
|
||||
global_atomic($1, xor, i64, int64)
|
||||
global_atomic($1, min, i64, int64)
|
||||
global_atomic($1, max, i64, int64)
|
||||
global_atomic($1, umin, i64, uint64)
|
||||
global_atomic($1, umax, i64, uint64)
|
||||
global_atomic_uniform($1, min, i64, int64)
|
||||
global_atomic_uniform($1, max, i64, int64)
|
||||
global_atomic_uniform($1, umin, i64, uint64)
|
||||
global_atomic_uniform($1, umax, i64, uint64)
|
||||
|
||||
global_swap($1, i32, int32)
|
||||
global_swap($1, i64, int64)
|
||||
|
||||
28
stdlib.ispc
28
stdlib.ispc
@@ -583,10 +583,22 @@ static inline TA atomic_##OPA##_global(uniform reference TA ref, TA value) { \
|
||||
return ret; \
|
||||
}
|
||||
|
||||
// DEFINE_ATOMIC_MINMAX_OP(TA,TB,OPA,OPB) defines atomic_<OPA>_global() for
// type TA.  The incoming value is first collapsed to one uniform value with
// reduce_<OPA>(), and that single value is handed to
// __atomic_<OPB>_<TB>_global() between two memory_barrier() calls, so one
// atomic memory op is issued per call instead of one per program instance.
// The builtin is only invoked when lanemask() is nonzero; otherwise ret is
// returned without ever having been assigned.
#define DEFINE_ATOMIC_MINMAX_OP(TA,TB,OPA,OPB) \
|
||||
static inline TA atomic_##OPA##_global(uniform reference TA ref, TA value) { \
|
||||
uniform TA oneval = reduce_##OPA(value); \
|
||||
TA ret; \
|
||||
if (lanemask() != 0) { \
|
||||
memory_barrier(); \
|
||||
ret = __atomic_##OPB##_##TB##_global(ref, oneval, __mask); \
|
||||
memory_barrier(); \
|
||||
} \
|
||||
return ret; \
|
||||
}
|
||||
|
||||
DEFINE_ATOMIC_OP(int32,int32,add,add)
|
||||
DEFINE_ATOMIC_OP(int32,int32,subtract,sub)
|
||||
DEFINE_ATOMIC_OP(int32,int32,min,min)
|
||||
DEFINE_ATOMIC_OP(int32,int32,max,max)
|
||||
DEFINE_ATOMIC_MINMAX_OP(int32,int32,min,min)
|
||||
DEFINE_ATOMIC_MINMAX_OP(int32,int32,max,max)
|
||||
DEFINE_ATOMIC_OP(int32,int32,and,and)
|
||||
DEFINE_ATOMIC_OP(int32,int32,or,or)
|
||||
DEFINE_ATOMIC_OP(int32,int32,xor,xor)
|
||||
@@ -596,8 +608,8 @@ DEFINE_ATOMIC_OP(int32,int32,swap,swap)
|
||||
// implementations for unsigned as for signed.
|
||||
DEFINE_ATOMIC_OP(unsigned int32,int32,add,add)
|
||||
DEFINE_ATOMIC_OP(unsigned int32,int32,subtract,sub)
|
||||
DEFINE_ATOMIC_OP(unsigned int32,uint32,min,umin)
|
||||
DEFINE_ATOMIC_OP(unsigned int32,uint32,max,umax)
|
||||
DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,min,umin)
|
||||
DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,max,umax)
|
||||
DEFINE_ATOMIC_OP(unsigned int32,int32,and,and)
|
||||
DEFINE_ATOMIC_OP(unsigned int32,int32,or,or)
|
||||
DEFINE_ATOMIC_OP(unsigned int32,int32,xor,xor)
|
||||
@@ -607,8 +619,8 @@ DEFINE_ATOMIC_OP(float,float,swap,swap)
|
||||
|
||||
DEFINE_ATOMIC_OP(int64,int64,add,add)
|
||||
DEFINE_ATOMIC_OP(int64,int64,subtract,sub)
|
||||
DEFINE_ATOMIC_OP(int64,int64,min,min)
|
||||
DEFINE_ATOMIC_OP(int64,int64,max,max)
|
||||
DEFINE_ATOMIC_MINMAX_OP(int64,int64,min,min)
|
||||
DEFINE_ATOMIC_MINMAX_OP(int64,int64,max,max)
|
||||
DEFINE_ATOMIC_OP(int64,int64,and,and)
|
||||
DEFINE_ATOMIC_OP(int64,int64,or,or)
|
||||
DEFINE_ATOMIC_OP(int64,int64,xor,xor)
|
||||
@@ -618,8 +630,8 @@ DEFINE_ATOMIC_OP(int64,int64,swap,swap)
|
||||
// implementations for unsigned as for signed.
|
||||
DEFINE_ATOMIC_OP(unsigned int64,int64,add,add)
|
||||
DEFINE_ATOMIC_OP(unsigned int64,int64,subtract,sub)
|
||||
DEFINE_ATOMIC_OP(unsigned int64,uint64,min,umin)
|
||||
DEFINE_ATOMIC_OP(unsigned int64,uint64,max,umax)
|
||||
DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,min,umin)
|
||||
DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,max,umax)
|
||||
DEFINE_ATOMIC_OP(unsigned int64,int64,and,and)
|
||||
DEFINE_ATOMIC_OP(unsigned int64,int64,or,or)
|
||||
DEFINE_ATOMIC_OP(unsigned int64,int64,xor,xor)
|
||||
|
||||
14
tests/atomics-7.ispc
Normal file
14
tests/atomics-7.ispc
Normal file
@@ -0,0 +1,14 @@
|
||||
|
||||
// Returns programCount to the caller.
export uniform int width() { return programCount; }
|
||||
|
||||
// Global 64-bit integer, initially 0, that f_f() updates atomically.
uniform int64 s = 0;
|
||||
|
||||
// Calls atomic_add_global(s, 1), then stores reduce_add() of the values it
// returned into RET[programIndex].
// NOTE(review): this exercises the add atomic on an int64, not a min/max
// atomic, and the local a is loaded but never used.
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||
float a = aFOO[programIndex];
|
||||
float b = atomic_add_global(s, 1);
|
||||
RET[programIndex] = reduce_add(b);
|
||||
}
|
||||
|
||||
// Fills RET with reduce_add(programIndex); presumably the reference output
// that f_f() is compared against -- confirm with the test harness.
export void result(uniform float RET[]) {
|
||||
RET[programIndex] = reduce_add(programIndex);
|
||||
}
|
||||
14
tests/atomics-8.ispc
Normal file
14
tests/atomics-8.ispc
Normal file
@@ -0,0 +1,14 @@
|
||||
|
||||
// Returns programCount to the caller.
export uniform int width() { return programCount; }
|
||||
|
||||
// Global 32-bit integer, initially 0, that f_f() updates atomically.
uniform int32 s = 0;
|
||||
|
||||
// Converts aFOO[programIndex] to int32, passes it to atomic_min_global()
// against s, then stores reduce_min() of the values that call returned into
// RET[programIndex].
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||
int32 a = aFOO[programIndex];
|
||||
float b = atomic_min_global(s, a);
|
||||
RET[programIndex] = reduce_min(b);
|
||||
}
|
||||
|
||||
// Fills RET with reduce_min(programIndex); presumably the reference output
// that f_f() is compared against -- confirm with the test harness.
export void result(uniform float RET[]) {
|
||||
RET[programIndex] = reduce_min(programIndex);
|
||||
}
|
||||
Reference in New Issue
Block a user