diff --git a/builtins.cpp b/builtins.cpp index 29d33ba1..fc21a7c3 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -369,6 +369,10 @@ lSetInternalFunctions(llvm::Module *module) { "__atomic_compare_exchange_uniform_float_global", "__atomic_compare_exchange_uniform_int32_global", "__atomic_compare_exchange_uniform_int64_global", + "__atomic_compare_exchange_varying_double_global", + "__atomic_compare_exchange_varying_float_global", + "__atomic_compare_exchange_varying_int32_global", + "__atomic_compare_exchange_varying_int64_global", "__atomic_max_uniform_int32_global", "__atomic_max_uniform_int64_global", "__atomic_min_uniform_int32_global", @@ -397,6 +401,10 @@ lSetInternalFunctions(llvm::Module *module) { "__atomic_swap_uniform_float_global", "__atomic_swap_uniform_int32_global", "__atomic_swap_uniform_int64_global", + "__atomic_swap_varying_double_global", + "__atomic_swap_varying_float_global", + "__atomic_swap_varying_int32_global", + "__atomic_swap_varying_int64_global", "__atomic_umax_uniform_uint32_global", "__atomic_umax_uniform_uint64_global", "__atomic_umin_uniform_uint32_global", diff --git a/builtins/target-nvptx.ll b/builtins/target-nvptx.ll index d0f39c51..24b9c0e3 100644 --- a/builtins/target-nvptx.ll +++ b/builtins/target-nvptx.ll @@ -1660,6 +1660,9 @@ define i64 @__clock() nounwind alwaysinline { ret i64 %r } +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; atomics and memory barriers + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; global_atomic_associative ;; More efficient implementation for atomics that are associative (e.g., @@ -1943,6 +1946,20 @@ define i64 @__atomic_umax_uniform_uint64_global_nvptx(i64* %ptr, i64 %val) nounw ret i64 %old; } +define(`global_atomic',` +define <1 x $3> @__atomic_$2_$4_global($3* %ptr, <1 x $3> %valv, <1 x i1> %maskv) nounwind alwaysinline +{ + %mask = bitcast <1 x i1> %maskv to i1 + %val = bitcast <1 x $3> %valv to $3 + br i1 %mask, label %exec, label %pass +exec: + %old = call $3 @__atomic_$2_uniform_$4_global_nvptx($3 * %ptr, $3 %val); + %oldv = bitcast $3 %old to <1 x $3> + ret <1 x $3> %oldv +pass: + ret <1 x $3> %valv +} +') define(`global_atomic_uniform',` define $3 @__atomic_$2_uniform_$4_global($3 * %ptr, $3 %val) nounwind alwaysinline { @@ -1963,6 +1980,27 @@ p2: ret $3 %old; } ') +define(`global_atomic_varying',` +define <1 x $3> @__atomic_$2_varying_$4_global(<1 x i64> %ptr, <1 x $3> %val, <1 x i1> %maskv) nounwind alwaysinline +{ +entry: + %addr = bitcast <1 x i64> %ptr to i64 + %c = bitcast <1 x i1> %maskv to i1 + br i1 %c, label %p1, label %p2 + +p1: + %sv = bitcast <1 x $3> %val to $3 + %sptr = inttoptr i64 %addr to $3* + %t0 = call $3 @__atomic_$2_uniform_$4_global_nvptx($3 * %sptr, $3 %sv); + %t0v = bitcast $3 %t0 to <1 x $3> + ret < 1x $3> %t0v + +p2: + ret <1 x $3> %val +} +') + + global_atomic_uniform(1, add, i32, int32) global_atomic_uniform(1, sub, i32, int32) global_atomic_uniform(1, and, i32, int32) @@ -1983,25 +2021,6 @@ global_atomic_uniform(1, max, i64, int64) global_atomic_uniform(1, umin, i64, uint64) global_atomic_uniform(1, umax, i64, uint64) -define(`global_atomic_varying',` -define <1 x $3> @__atomic_$2_varying_$4_global(<1 x i64> %ptr, <1 x $3> %val, <1 x i1> %maskv) nounwind alwaysinline -{ -entry: - %addr = bitcast <1 x i64> %ptr to i64 - %c = bitcast <1 x i1> %maskv to i1 - br i1 %c, label %p1, label %p2 - -p1: - %sv = bitcast <1 x $3> %val to $3 - %sptr = inttoptr i64 %addr to $3* - %t0 = call $3 @__atomic_$2_uniform_$4_global_nvptx($3 
* %sptr, $3 %sv); - %t0v = bitcast $3 %t0 to <1 x $3> - ret < 1x $3> %t0v - -p2: - ret <1 x $3> %val -} -') global_atomic_varying(1, add, i32, int32) global_atomic_varying(1, sub, i32, int32) global_atomic_varying(1, and, i32, int32) @@ -2028,9 +2047,42 @@ global_atomic_varying(1, umax, i64, uint64) ;; $2: llvm type of the vector elements (e.g. i32) ;; $3: ispc type of the elements (e.g. int32) -define(`global_swap', ` -declare $2 @__atomic_swap_uniform_$3_global($2* %ptr, $2 %val) nounwind alwaysinline ; -') +define i32 @__atomic_swap_uniform_int32_global_nvptx(i32* %ptr, i32 %val) nounwind alwaysinline +{ + %addr = ptrtoint i32* %ptr to i64 + %old = tail call i32 asm sideeffect "atom.exch.b32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val); + ret i32 %old; +} +define i64 @__atomic_swap_uniform_int64_global_nvptx(i64* %ptr, i64 %val) nounwind alwaysinline +{ + %addr = ptrtoint i64* %ptr to i64 + %old = tail call i64 asm sideeffect "atom.exch.b64 $0, [$1], $2;", "=l,l,l"(i64 %addr, i64 %val); + ret i64 %old; +} +define float @__atomic_swap_uniform_float_global_nvptx(float* %ptr, float %val) nounwind alwaysinline +{ + %ptrI = bitcast float* %ptr to i32* + %valI = bitcast float %val to i32 + %retI = call i32 @__atomic_swap_uniform_int32_global_nvptx(i32* %ptrI, i32 %valI) + %ret = bitcast i32 %retI to float + ret float %ret +} +define double @__atomic_swap_uniform_double_global_nvptx(double* %ptr, double %val) nounwind alwaysinline +{ + %ptrI = bitcast double* %ptr to i64* + %valI = bitcast double %val to i64 + %retI = call i64 @__atomic_swap_uniform_int64_global_nvptx(i64* %ptrI, i64 %valI) + %ret = bitcast i64 %retI to double + ret double %ret +} +global_atomic_uniform(1, swap, i32, int32) +global_atomic_uniform(1, swap, i64, int64) +global_atomic_uniform(1, swap, float, float) +global_atomic_uniform(1, swap, double, double) +global_atomic_varying(1, swap, i32, int32) +global_atomic_varying(1, swap, i64, int64) +global_atomic_varying(1, swap, float, float) +global_atomic_varying(1, swap, double, double) ;; Similarly, macro to declare the function that implements the compare/exchange @@ -2039,34 +2091,109 @@ declare $2 @__atomic_swap_uniform_$3_global($2* %ptr, $2 %val) nounwind alwaysin ;; $2: llvm type of the vector elements (e.g. i32) ;; $3: ispc type of the elements (e.g. 
int32) -define(`global_atomic_exchange', ` +define i32 @__atomic_compare_exchange_uniform_int32_global_nvptx(i32* %ptr, i32 %cmp, i32 %val) nounwind alwaysinline +{ + %addr = ptrtoint i32* %ptr to i64 + %old = tail call i32 asm sideeffect "atom.cas.b32 $0, [$1], $2, $3;", "=r,l,r,r"(i64 %addr, i32 %cmp, i32 %val); + ret i32 %old; +} +define i64 @__atomic_compare_exchange_uniform_int64_global_nvptx(i64* %ptr, i64 %cmp, i64 %val) nounwind alwaysinline +{ + %addr = ptrtoint i64* %ptr to i64 + %old = tail call i64 asm sideeffect "atom.cas.b64 $0, [$1], $2, $3;", "=l,l,l,l"(i64 %addr, i64 %cmp, i64 %val); + ret i64 %old; +} +define float @__atomic_compare_exchange_uniform_float_global_nvptx(float* %ptr, float %cmp, float %val) nounwind alwaysinline +{ + %ptrI = bitcast float* %ptr to i32* + %cmpI = bitcast float %cmp to i32 + %valI = bitcast float %val to i32 + %retI = call i32 @__atomic_compare_exchange_uniform_int32_global_nvptx(i32* %ptrI, i32 %cmpI, i32 %valI) + %ret = bitcast i32 %retI to float + ret float %ret +} +define double @__atomic_compare_exchange_uniform_double_global_nvptx(double* %ptr, double %cmp, double %val) nounwind alwaysinline +{ + %ptrI = bitcast double* %ptr to i64* + %cmpI = bitcast double %cmp to i64 + %valI = bitcast double %val to i64 + %retI = call i64 @__atomic_compare_exchange_uniform_int64_global_nvptx(i64* %ptrI, i64 %cmpI, i64 %valI) + %ret = bitcast i64 %retI to double + ret double %ret +} -declare <$1 x $2> @__atomic_compare_exchange_$3_global($2* %ptr, <$1 x $2> %cmp, - <$1 x $2> %val, <$1 x MASK> %mask) nounwind alwaysinline ; +;;;;;;;;;;;; +define(`global_atomic_cas',` +define <1 x $3> @__atomic_$2_$4_global($3* %ptr, <1 x $3> %cmpv, <1 x $3> %valv, <1 x i1> %maskv) nounwind alwaysinline +{ + %mask = bitcast <1 x i1> %maskv to i1 + %cmp = bitcast <1 x $3> %cmpv to $3 + %val = bitcast <1 x $3> %valv to $3 + br i1 %mask, label %exec, label %pass +exec: + %old = call $3 @__atomic_$2_uniform_$4_global_nvptx($3 * %ptr, $3 %cmp, $3 %val); + %oldv = bitcast $3 %old to <1 x $3> + ret <1 x $3> %oldv +pass: + ret <1 x $3> %valv +} +') +define(`global_atomic_cas_uniform',` +define $3 @__atomic_$2_uniform_$4_global($3 * %ptr, $3 %cmp, $3 %val) nounwind alwaysinline +{ +entry: + %addr = ptrtoint $3 * %ptr to i64 + %active = call i32 @__get_first_active_lane(); + %lane = call i32 @__laneidx(); + %c = icmp eq i32 %lane, %active + br i1 %c, label %p1, label %p2 -declare $2 @__atomic_compare_exchange_uniform_$3_global($2* %ptr, $2 %cmp, - $2 %val) nounwind alwaysinline ; +p1: + %t0 = call $3 @__atomic_$2_uniform_$4_global_nvptx($3 * %ptr, $3 %cmp, $3 %val); + br label %p2; + +p2: + %t1 = phi $3 [%t0, %p1], [zeroinitializer, %entry] + %old = call $3 @__shfl_$3_nvptx($3 %t1, i32 %active) + ret $3 %old; +} +') +define(`global_atomic_cas_varying',` +define <1 x $3> @__atomic_$2_varying_$4_global(<1 x i64> %ptr, <1 x $3> %cmp, <1 x $3> %val, <1 x i1> %maskv) nounwind alwaysinline +{ +entry: + %addr = bitcast <1 x i64> %ptr to i64 + %c = bitcast <1 x i1> %maskv to i1 + br i1 %c, label %p1, label %p2 + +p1: + %sv = bitcast <1 x $3> %val to $3 + %sc = bitcast <1 x $3> %cmp to $3 + %sptr = inttoptr i64 %addr to $3* + %t0 = call $3 @__atomic_$2_uniform_$4_global_nvptx($3 * %sptr, $3 %sc, $3 %sv); + %t0v = bitcast $3 %t0 to <1 x $3> + ret < 1x $3> %t0v + +p2: + ret <1 x $3> %val +} ') -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; atomics and memory barriers +global_atomic_cas_uniform(1, compare_exchange, i32, int32) +global_atomic_cas_uniform(1, 
compare_exchange, i64, int64) +global_atomic_cas_uniform(1, compare_exchange, float, float) +global_atomic_cas_uniform(1, compare_exchange, double, double) +global_atomic_cas_varying(1, compare_exchange, i32, int32) +global_atomic_cas_varying(1, compare_exchange, i64, int64) +global_atomic_cas_varying(1, compare_exchange, float, float) +global_atomic_cas_varying(1, compare_exchange, double, double) +global_atomic_cas(1, compare_exchange, i32, int32) +global_atomic_cas(1, compare_exchange, i64, int64) +global_atomic_cas(1, compare_exchange, float, float) +global_atomic_cas(1, compare_exchange, double, double) -global_swap(WIDTH, i32, int32) -global_swap(WIDTH, i64, int64) -declare float @__atomic_swap_uniform_float_global(float * %ptr, float %val) nounwind alwaysinline ; -declare double @__atomic_swap_uniform_double_global(double * %ptr, double %val) nounwind alwaysinline ; -global_atomic_exchange(WIDTH, i32, int32) -global_atomic_exchange(WIDTH, i64, int64) -declare @__atomic_compare_exchange_float_global(float * %ptr, - %cmp, %val, %mask) nounwind alwaysinline ; -declare @__atomic_compare_exchange_double_global(double * %ptr, - %cmp, %val, %mask) nounwind alwaysinline ; -declare float @__atomic_compare_exchange_uniform_float_global(float * %ptr, float %cmp, - float %val) nounwind alwaysinline ; -declare double @__atomic_compare_exchange_uniform_double_global(double * %ptr, double %cmp, - double %val) nounwind alwaysinline ; declare void @llvm.nvvm.membar.gl() declare void @llvm.nvvm.membar.sys() diff --git a/builtins/util.m4 b/builtins/util.m4 index cd445fc4..6c8df00f 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -4547,3 +4547,40 @@ declare i64* @__cvt_const2gen(i64 addrspace(4)*) nounwind readnone alwaysinline declare i64* @__cvt_loc2gen_var(i64 addrspace(3)*) nounwind readnone alwaysinline ') +define(`global_atomic_varying',` +declare <$1 x $3> @__atomic_$2_varying_$4_global(<$1 x i64> %ptr, <$1 x $3> %val, <$1 x MASK> %maskv) nounwind alwaysinline +') + +define(`global_atomic_cas_varying',` +declare <$1 x $3> @__atomic_$2_varying_$4_global(<$1 x i64> %ptr, <$1 x $3> %cmp, <$1 x $3> %val, <$1 x MASK> %maskv) nounwind alwaysinline +') + +global_atomic_cas_varying(WIDTH, compare_exchange, i32, int32) +global_atomic_cas_varying(WIDTH, compare_exchange, i64, int64) +global_atomic_cas_varying(WIDTH, compare_exchange, float, float) +global_atomic_cas_varying(WIDTH, compare_exchange, double, double) + +global_atomic_varying(WIDTH, swap, i32, int32) +global_atomic_varying(WIDTH, swap, i64, int64) +global_atomic_varying(WIDTH, swap, float, float) +global_atomic_varying(WIDTH, swap, double, double) + +global_atomic_varying(WIDTH, add, i32, int32) +global_atomic_varying(WIDTH, sub, i32, int32) +global_atomic_varying(WIDTH, and, i32, int32) +global_atomic_varying(WIDTH, or, i32, int32) +global_atomic_varying(WIDTH, xor, i32, int32) +global_atomic_varying(WIDTH, min, i32, int32) +global_atomic_varying(WIDTH, max, i32, int32) +global_atomic_varying(WIDTH, umin, i32, uint32) +global_atomic_varying(WIDTH, umax, i32, uint32) + +global_atomic_varying(WIDTH, add, i64, int64) +global_atomic_varying(WIDTH, sub, i64, int64) +global_atomic_varying(WIDTH, and, i64, int64) +global_atomic_varying(WIDTH, or, i64, int64) +global_atomic_varying(WIDTH, xor, i64, int64) +global_atomic_varying(WIDTH, min, i64, int64) +global_atomic_varying(WIDTH, max, i64, int64) +global_atomic_varying(WIDTH, umin, i64, uint64) +global_atomic_varying(WIDTH, umax, i64, uint64) diff --git a/stdlib.ispc b/stdlib.ispc index 
a607fab7..42bb303e 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -1842,8 +1842,12 @@ static inline TA atomic_##OPA##_global(uniform TA * varying ptr, TA value) { \ } \ } \ -#define DEFINE_ATOMIC_SWAP(TA,TB) \ +#define DEFINE_ATOMIC_SWAP(TA,TB,MASKTYPE,TC) \ static inline TA atomic_swap_global(uniform TA * uniform ptr, TA value) { \ + if (__is_nvptx_target) { \ + TA ret = __atomic_swap_varying_##TB##_global((TC)ptr, value, (MASKTYPE)__mask); \ + return ret; \ + } else { \ uniform int i = 0; \ TA ret[programCount]; \ TA memVal; \ @@ -1874,6 +1878,7 @@ static inline TA atomic_swap_global(uniform TA * uniform ptr, TA value) { \ originally got back from memory... */ \ ret[lastSwap] = memVal; \ return ret[programIndex]; \ + }\ } \ static inline uniform TA atomic_swap_global(uniform TA * uniform ptr, \ uniform TA value) { \ @@ -1881,6 +1886,10 @@ static inline uniform TA atomic_swap_global(uniform TA * uniform ptr, \ return ret; \ } \ static inline TA atomic_swap_global(uniform TA * varying ptr, TA value) { \ + if (__is_nvptx_target) { \ + TA ret = __atomic_swap_varying_##TB##_global((TC)ptr, value, (MASKTYPE)__mask); \ + return ret; \ + } else { \ uniform TA * uniform ptrArray[programCount]; \ ptrArray[programIndex] = ptr; \ TA ret; \ @@ -1891,6 +1900,7 @@ static inline TA atomic_swap_global(uniform TA * varying ptr, TA value) { \ ret = insert(ret, i, r); \ } \ return ret; \ + }\ } \ #define DEFINE_ATOMIC_MINMAX_OP(TA,TB,OPA,OPB,MASKTYPE,TC) \ @@ -1932,7 +1942,7 @@ DEFINE_ATOMIC_MINMAX_OP(int32,int32,max,max,IntMaskType,int64) DEFINE_ATOMIC_OP(int32,int32,and,and,IntMaskType,int64) DEFINE_ATOMIC_OP(int32,int32,or,or,IntMaskType,int64) DEFINE_ATOMIC_OP(int32,int32,xor,xor,IntMaskType,int64) -DEFINE_ATOMIC_SWAP(int32,int32) +DEFINE_ATOMIC_SWAP(int32,int32,IntMaskType,int64) // For everything but atomic min and max, we can use the same // implementations for unsigned as for signed. @@ -1943,9 +1953,9 @@ DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,max,umax,UIntMaskType,unsigned int DEFINE_ATOMIC_OP(unsigned int32,int32,and,and,UIntMaskType, unsigned int64) DEFINE_ATOMIC_OP(unsigned int32,int32,or,or,UIntMaskType, unsigned int64) DEFINE_ATOMIC_OP(unsigned int32,int32,xor,xor,UIntMaskType, unsigned int64) -DEFINE_ATOMIC_SWAP(unsigned int32,int32) +DEFINE_ATOMIC_SWAP(unsigned int32,int32,UIntMaskType, unsigned int64) -DEFINE_ATOMIC_SWAP(float,float) +DEFINE_ATOMIC_SWAP(float,float,IntMaskType,int64) DEFINE_ATOMIC_OP(int64,int64,add,add,IntMaskType,int64) DEFINE_ATOMIC_OP(int64,int64,subtract,sub,IntMaskType,int64) @@ -1954,7 +1964,7 @@ DEFINE_ATOMIC_MINMAX_OP(int64,int64,max,max,IntMaskType,int64) DEFINE_ATOMIC_OP(int64,int64,and,and,IntMaskType,int64) DEFINE_ATOMIC_OP(int64,int64,or,or,IntMaskType,int64) DEFINE_ATOMIC_OP(int64,int64,xor,xor,IntMaskType,int64) -DEFINE_ATOMIC_SWAP(int64,int64) +DEFINE_ATOMIC_SWAP(int64,int64,IntMaskType, int64) // For everything but atomic min and max, we can use the same // implementations for unsigned as for signed. 
@@ -1965,15 +1975,15 @@ DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,max,umax,UIntMaskType,unsigned int DEFINE_ATOMIC_OP(unsigned int64,int64,and,and,UIntMaskType,unsigned int64) DEFINE_ATOMIC_OP(unsigned int64,int64,or,or,UIntMaskType,unsigned int64) DEFINE_ATOMIC_OP(unsigned int64,int64,xor,xor,UIntMaskType,unsigned int64) -DEFINE_ATOMIC_SWAP(unsigned int64,int64) +DEFINE_ATOMIC_SWAP(unsigned int64,int64,UIntMaskType, unsigned int64) -DEFINE_ATOMIC_SWAP(double,double) +DEFINE_ATOMIC_SWAP(double,double,IntMaskType, int64) #undef DEFINE_ATOMIC_OP #undef DEFINE_ATOMIC_MINMAX_OP #undef DEFINE_ATOMIC_SWAP -#define ATOMIC_DECL_CMPXCHG(TA, TB, MASKTYPE) \ +#define ATOMIC_DECL_CMPXCHG(TA, TB, MASKTYPE, TC) \ static inline uniform TA atomic_compare_exchange_global( \ uniform TA * uniform ptr, uniform TA oldval, uniform TA newval) { \ uniform TA ret = \ @@ -1988,6 +1998,10 @@ static inline TA atomic_compare_exchange_global( \ } \ static inline TA atomic_compare_exchange_global( \ uniform TA * varying ptr, TA oldval, TA newval) { \ + if (__is_nvptx_target) { \ + TA ret = __atomic_compare_exchange_varying_##TB##_global((TC)ptr, oldval, newval, (MASKTYPE)__mask); \ + return ret; \ + } else { \ uniform TA * uniform ptrArray[programCount]; \ ptrArray[programIndex] = ptr; \ TA ret; \ @@ -1999,14 +2013,15 @@ static inline TA atomic_compare_exchange_global( \ ret = insert(ret, i, r); \ } \ return ret; \ + } \ } -ATOMIC_DECL_CMPXCHG(int32, int32, IntMaskType) -ATOMIC_DECL_CMPXCHG(unsigned int32, int32, UIntMaskType) -ATOMIC_DECL_CMPXCHG(float, float, IntMaskType) -ATOMIC_DECL_CMPXCHG(int64, int64, IntMaskType) -ATOMIC_DECL_CMPXCHG(unsigned int64, int64, UIntMaskType) -ATOMIC_DECL_CMPXCHG(double, double, IntMaskType) +ATOMIC_DECL_CMPXCHG(int32, int32, IntMaskType,int64) +ATOMIC_DECL_CMPXCHG(unsigned int32, int32, UIntMaskType,unsigned int64) +ATOMIC_DECL_CMPXCHG(float, float, IntMaskType,int64) +ATOMIC_DECL_CMPXCHG(int64, int64, IntMaskType,int64) +ATOMIC_DECL_CMPXCHG(unsigned int64, int64, UIntMaskType,unsigned int64) +ATOMIC_DECL_CMPXCHG(double, double, IntMaskType,int64) #undef ATOMIC_DECL_CMPXCHG
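
Note: for reference, this is roughly what the m4 invocation global_atomic_varying(1, swap, i32, int32) above expands to after argument substitution ($1=1, $2=swap, $3=i32, $4=int32). It is a hand-expanded sketch, not compiler output; the declare line is added here only to make the fragment self-contained. On the NVPTX target each program instance maps to one CUDA thread, so the arguments are single-lane <1 x ...> vectors and the whole-vector bitcasts below are plain 1-, 32-, and 64-bit casts.

; PTX inline-asm helper defined earlier in this diff (atom.exch.b32).
declare i32 @__atomic_swap_uniform_int32_global_nvptx(i32* %ptr, i32 %val) nounwind alwaysinline

define <1 x i32> @__atomic_swap_varying_int32_global(<1 x i64> %ptr, <1 x i32> %val,
                                                     <1 x i1> %maskv) nounwind alwaysinline
{
entry:
  ; Unpack the single-lane vectors: pointer bits and execution-mask bit.
  %addr = bitcast <1 x i64> %ptr to i64
  %c = bitcast <1 x i1> %maskv to i1
  br i1 %c, label %p1, label %p2

p1:
  ; Active lane: issue the atomic exchange through the uniform nvptx helper.
  %sv = bitcast <1 x i32> %val to i32
  %sptr = inttoptr i64 %addr to i32*
  %t0 = call i32 @__atomic_swap_uniform_int32_global_nvptx(i32* %sptr, i32 %sv)
  %t0v = bitcast i32 %t0 to <1 x i32>
  ret <1 x i32> %t0v

p2:
  ; Inactive lane: skip the atomic and return the input value unchanged.
  ret <1 x i32> %val
}

The compare-and-swap path follows the same shape through global_atomic_cas_varying, with an extra %cmp operand that becomes the second source operand of the atom.cas.b32/atom.cas.b64 inline asm.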