From a7d4a3f922ef5203f109bc64f5f9321c277dee0a Mon Sep 17 00:00:00 2001
From: Evghenii
Date: Sun, 26 Jan 2014 13:15:13 +0100
Subject: [PATCH] fix for __any

---
 builtins.cpp             |  18 ++
 builtins/target-nvptx.ll | 420 +++++++++++++++++++++++++++++++++++++++
 builtins/util-nvptx.m4   | 130 ------------
 stdlib.ispc              |  70 ++++---
 4 files changed, 478 insertions(+), 160 deletions(-)

diff --git a/builtins.cpp b/builtins.cpp
index 57ed3808..29d33ba1 100644
--- a/builtins.cpp
+++ b/builtins.cpp
@@ -353,10 +353,14 @@ lSetInternalFunctions(llvm::Module *module) {
         "__atomic_add_int64_global",
         "__atomic_add_uniform_int32_global",
         "__atomic_add_uniform_int64_global",
+        "__atomic_add_varying_int32_global",
+        "__atomic_add_varying_int64_global",
         "__atomic_and_int32_global",
         "__atomic_and_int64_global",
         "__atomic_and_uniform_int32_global",
         "__atomic_and_uniform_int64_global",
+        "__atomic_and_varying_int32_global",
+        "__atomic_and_varying_int64_global",
         "__atomic_compare_exchange_double_global",
         "__atomic_compare_exchange_float_global",
         "__atomic_compare_exchange_int32_global",
@@ -369,14 +373,22 @@ lSetInternalFunctions(llvm::Module *module) {
         "__atomic_max_uniform_int64_global",
         "__atomic_min_uniform_int32_global",
         "__atomic_min_uniform_int64_global",
+        "__atomic_max_varying_int32_global",
+        "__atomic_max_varying_int64_global",
+        "__atomic_min_varying_int32_global",
+        "__atomic_min_varying_int64_global",
         "__atomic_or_int32_global",
         "__atomic_or_int64_global",
         "__atomic_or_uniform_int32_global",
         "__atomic_or_uniform_int64_global",
+        "__atomic_or_varying_int32_global",
+        "__atomic_or_varying_int64_global",
         "__atomic_sub_int32_global",
         "__atomic_sub_int64_global",
         "__atomic_sub_uniform_int32_global",
         "__atomic_sub_uniform_int64_global",
+        "__atomic_sub_varying_int32_global",
+        "__atomic_sub_varying_int64_global",
         "__atomic_swap_double_global",
         "__atomic_swap_float_global",
         "__atomic_swap_int32_global",
@@ -389,10 +401,16 @@ lSetInternalFunctions(llvm::Module *module) {
         "__atomic_umax_uniform_uint64_global",
         "__atomic_umin_uniform_uint32_global",
         "__atomic_umin_uniform_uint64_global",
+        "__atomic_umax_varying_uint32_global",
+        "__atomic_umax_varying_uint64_global",
+        "__atomic_umin_varying_uint32_global",
+        "__atomic_umin_varying_uint64_global",
         "__atomic_xor_int32_global",
         "__atomic_xor_int64_global",
         "__atomic_xor_uniform_int32_global",
         "__atomic_xor_uniform_int64_global",
+        "__atomic_xor_varying_int32_global",
+        "__atomic_xor_varying_int64_global",
         "__broadcast_double",
         "__broadcast_float",
         "__broadcast_i16",
diff --git a/builtins/target-nvptx.ll b/builtins/target-nvptx.ll
index dbfedc0d..d0f39c51 100644
--- a/builtins/target-nvptx.ll
+++ b/builtins/target-nvptx.ll
@@ -1660,3 +1660,423 @@ define i64 @__clock() nounwind alwaysinline {
   ret i64 %r
 }
 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; global_atomic_associative
+;; More efficient implementation for atomics that are associative (e.g.,
+;; add, and, ...). If a basic implementation would do something like:
+;; result0 = atomic_op(ptr, val0)
+;; result1 = atomic_op(ptr, val1)
+;; ..
+;; Then instead we can do:
+;; tmp = (val0 op val1 op ...)
+;; result0 = atomic_op(ptr, tmp)
+;; result1 = (result0 op val0)
+;; ..
+;; And more efficiently compute the same result
+;;
+;; Takes five parameters:
+;; $1: vector width of the target
+;; $2: operation being performed (w.r.t. LLVM atomic intrinsic names)
+;; (add, sub...)
+;; $3: return type of the LLVM atomic (e.g. i32)
+;; $4: return type of the LLVM atomic type, in ispc naming parlance (e.g. int32)
+;; $5: identity value for the operator (e.g. 0 for add, -1 for AND, ...)
+;; add
+define <1 x i32> @__atomic_add_int32_global(i32* %ptr, <1 x i32> %valv, <1 x i1> %maskv) nounwind alwaysinline
+{
+  %mask = bitcast <1 x i1> %maskv to i1
+  %val = bitcast <1 x i32> %valv to i32
+  br i1 %mask, label %exec, label %pass
+exec:
+  %addr = ptrtoint i32* %ptr to i64
+  %old = tail call i32 asm sideeffect "atom.add.u32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val);
+  %oldv = bitcast i32 %old to <1 x i32>
+  ret <1 x i32> %oldv
+pass:
+  ret <1 x i32> %valv
+}
+;; sub
+define <1 x i32> @__atomic_sub_int32_global(i32* %ptr, <1 x i32> %valv, <1 x i1> %maskv) nounwind alwaysinline
+{
+  %nvalv = sub <1 x i32> <i32 0>, %valv
+  %ret = call <1 x i32> @__atomic_add_int32_global(i32* %ptr, <1 x i32> %nvalv, <1 x i1> %maskv);
+  ret <1 x i32> %ret;
+}
+;; and
+define <1 x i32> @__atomic_and_int32_global(i32* %ptr, <1 x i32> %valv, <1 x i1> %maskv) nounwind alwaysinline
+{
+  %mask = bitcast <1 x i1> %maskv to i1
+  %val = bitcast <1 x i32> %valv to i32
+  br i1 %mask, label %exec, label %pass
+exec:
+  %addr = ptrtoint i32* %ptr to i64
+  %old = tail call i32 asm sideeffect "atom.and.b32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val);
+  %oldv = bitcast i32 %old to <1 x i32>
+  ret <1 x i32> %oldv
+pass:
+  ret <1 x i32> %valv
+}
+;; or
+define <1 x i32> @__atomic_or_int32_global(i32* %ptr, <1 x i32> %valv, <1 x i1> %maskv) nounwind alwaysinline
+{
+  %mask = bitcast <1 x i1> %maskv to i1
+  %val = bitcast <1 x i32> %valv to i32
+  br i1 %mask, label %exec, label %pass
+exec:
+  %addr = ptrtoint i32* %ptr to i64
+  %old = tail call i32 asm sideeffect "atom.or.b32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val);
+  %oldv = bitcast i32 %old to <1 x i32>
+  ret <1 x i32> %oldv
+pass:
+  ret <1 x i32> %valv
+}
+;; xor
+define <1 x i32> @__atomic_xor_int32_global(i32* %ptr, <1 x i32> %valv, <1 x i1> %maskv) nounwind alwaysinline
+{
+  %mask = bitcast <1 x i1> %maskv to i1
+  %val = bitcast <1 x i32> %valv to i32
+  br i1 %mask, label %exec, label %pass
+exec:
+  %addr = ptrtoint i32* %ptr to i64
+  %old = tail call i32 asm sideeffect "atom.xor.b32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val);
+  %oldv = bitcast i32 %old to <1 x i32>
+  ret <1 x i32> %oldv
+pass:
+  ret <1 x i32> %valv
+}
+
+;;;;;;;;; int64
+define <1 x i64> @__atomic_add_int64_global(i64* %ptr, <1 x i64> %valv, <1 x i1> %maskv) nounwind alwaysinline
+{
+  %mask = bitcast <1 x i1> %maskv to i1
+  %val = bitcast <1 x i64> %valv to i64
+  br i1 %mask, label %exec, label %pass
+exec:
+  %addr = ptrtoint i64* %ptr to i64
+  %old = tail call i64 asm sideeffect "atom.add.u64 $0, [$1], $2;", "=l,l,l"(i64 %addr, i64 %val);
+  %oldv = bitcast i64 %old to <1 x i64>
+  ret <1 x i64> %oldv
+pass:
+  ret <1 x i64> %valv
+}
+define <1 x i64> @__atomic_sub_int64_global(i64* %ptr, <1 x i64> %valv, <1 x i1> %maskv) nounwind alwaysinline
+{
+  %nvalv = sub <1 x i64> <i64 0>, %valv
+  %ret = call <1 x i64> @__atomic_add_int64_global(i64* %ptr, <1 x i64> %nvalv, <1 x i1> %maskv);
+  ret <1 x i64> %ret;
+}
+
+;; and
+define <1 x i64> @__atomic_and_int64_global(i64* %ptr, <1 x i64> %valv, <1 x i1> %maskv) nounwind alwaysinline
+{
+  %mask = bitcast <1 x i1> %maskv to i1
+  %val = bitcast <1 x i64> %valv to i64
+  br i1 %mask, label %exec, label %pass
+exec:
+  %andr = ptrtoint i64* %ptr to i64
+  %old = tail call i64 asm sideeffect "atom.and.b64 $0, [$1], $2;", "=l,l,l"(i64 %andr, i64 %val);
+  %oldv = bitcast i64 %old to <1 x i64>
+  ret <1 x i64> %oldv
+pass:
+  ret <1 x i64> %valv
+}
+
+;; or
+define <1 x i64> @__atomic_or_int64_global(i64* %ptr, <1 x i64> %valv, <1 x i1> %maskv) nounwind alwaysinline
+{
+  %mask = bitcast <1 x i1> %maskv to i1
+  %val = bitcast <1 x i64> %valv to i64
+  br i1 %mask, label %exec, label %pass
+exec:
+  %orr = ptrtoint i64* %ptr to i64
+  %old = tail call i64 asm sideeffect "atom.or.b64 $0, [$1], $2;", "=l,l,l"(i64 %orr, i64 %val);
+  %oldv = bitcast i64 %old to <1 x i64>
+  ret <1 x i64> %oldv
+pass:
+  ret <1 x i64> %valv
+}
+
+;; xor
+define <1 x i64> @__atomic_xor_int64_global(i64* %ptr, <1 x i64> %valv, <1 x i1> %maskv) nounwind alwaysinline
+{
+  %mask = bitcast <1 x i1> %maskv to i1
+  %val = bitcast <1 x i64> %valv to i64
+  br i1 %mask, label %exec, label %pass
+exec:
+  %xorr = ptrtoint i64* %ptr to i64
+  %old = tail call i64 asm sideeffect "atom.xor.b64 $0, [$1], $2;", "=l,l,l"(i64 %xorr, i64 %val);
+  %oldv = bitcast i64 %old to <1 x i64>
+  ret <1 x i64> %oldv
+pass:
+  ret <1 x i64> %valv
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; global_atomic_uniform
+;; Defines the implementation of a function that handles the mapping from
+;; an ispc atomic function to the underlying LLVM intrinsics. This variant
+;; just calls the atomic once, for the given uniform value
+;;
+;; Takes four parameters:
+;; $1: vector width of the target
+;; $2: operation being performed (w.r.t. LLVM atomic intrinsic names)
+;; (add, sub...)
+;; $3: return type of the LLVM atomic (e.g. i32)
+;; $4: return type of the LLVM atomic type, in ispc naming parlance (e.g. int32)
+
+define i32 @__get_first_active_lane()
+{
+  %nact = call i32 @__ballot_nvptx(i1 true);
+  %lane1 = call i32 @__count_leading_zeros_i32(i32 %nact)
+  %lane = sub i32 31, %lane1
+  ret i32 %lane
+}
+
+define i32 @__atomic_add_uniform_int32_global_nvptx(i32* %ptr, i32 %val) nounwind alwaysinline
+{
+  %addr = ptrtoint i32* %ptr to i64
+  %old = tail call i32 asm sideeffect "atom.add.u32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val);
+  ret i32 %old;
+}
+define i32 @__atomic_sub_uniform_int32_global_nvptx(i32* %ptr, i32 %val) nounwind alwaysinline
+{
+  %nval = sub i32 0, %val;
+  %old = tail call i32 @__atomic_add_uniform_int32_global_nvptx(i32* %ptr, i32 %nval);
+  ret i32 %old;
+}
+define i32 @__atomic_and_uniform_int32_global_nvptx(i32* %ptr, i32 %val) nounwind alwaysinline
+{
+  %addr = ptrtoint i32* %ptr to i64
+  %old = tail call i32 asm sideeffect "atom.and.b32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val);
+  ret i32 %old;
+}
+define i32 @__atomic_or_uniform_int32_global_nvptx(i32* %ptr, i32 %val) nounwind alwaysinline
+{
+  %addr = ptrtoint i32* %ptr to i64
+  %old = tail call i32 asm sideeffect "atom.or.b32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val);
+  ret i32 %old;
+}
+define i32 @__atomic_xor_uniform_int32_global_nvptx(i32* %ptr, i32 %val) nounwind alwaysinline
+{
+  %addr = ptrtoint i32* %ptr to i64
+  %old = tail call i32 asm sideeffect "atom.xor.b32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val);
+  ret i32 %old;
+}
+define i32 @__atomic_min_uniform_int32_global_nvptx(i32* %ptr, i32 %val) nounwind alwaysinline
+{
+  %addr = ptrtoint i32* %ptr to i64
+  %old = tail call i32 asm sideeffect "atom.min.s32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val);
+  ret i32 %old;
+}
+define i32 @__atomic_max_uniform_int32_global_nvptx(i32* %ptr, i32 %val) nounwind alwaysinline
+{
+  %addr = ptrtoint i32* %ptr to i64
+  %old = tail call i32 asm sideeffect "atom.max.s32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val);
+  ret i32 %old;
+}
+define i32 @__atomic_umin_uniform_uint32_global_nvptx(i32* %ptr, i32 %val) nounwind alwaysinline
+{
+  %addr = ptrtoint i32* %ptr to i64
+  %old = tail call i32 asm sideeffect "atom.min.u32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val);
+  ret i32 %old;
+}
+define i32 @__atomic_umax_uniform_uint32_global_nvptx(i32* %ptr, i32 %val) nounwind alwaysinline
+{
+  %addr = ptrtoint i32* %ptr to i64
+  %old = tail call i32 asm sideeffect "atom.max.u32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val);
+  ret i32 %old;
+}
+
+
+define i64 @__atomic_add_uniform_int64_global_nvptx(i64* %ptr, i64 %val) nounwind alwaysinline
+{
+  %addr = ptrtoint i64* %ptr to i64
+  %old = tail call i64 asm sideeffect "atom.add.u64 $0, [$1], $2;", "=l,l,l"(i64 %addr, i64 %val);
+  ret i64 %old;
+}
+define i64 @__atomic_sub_uniform_int64_global_nvptx(i64* %ptr, i64 %val) nounwind alwaysinline
+{
+  %nval = sub i64 0, %val;
+  %old = tail call i64 @__atomic_add_uniform_int64_global_nvptx(i64* %ptr, i64 %nval);
+  ret i64 %old;
+}
+define i64 @__atomic_and_uniform_int64_global_nvptx(i64* %ptr, i64 %val) nounwind alwaysinline
+{
+  %addr = ptrtoint i64* %ptr to i64
+  %old = tail call i64 asm sideeffect "atom.and.b64 $0, [$1], $2;", "=l,l,l"(i64 %addr, i64 %val);
+  ret i64 %old;
+}
+define i64 @__atomic_or_uniform_int64_global_nvptx(i64* %ptr, i64 %val) nounwind alwaysinline
+{
+  %addr = ptrtoint i64* %ptr to i64
+  %old = tail call i64 asm sideeffect "atom.or.b64 $0, [$1], $2;", "=l,l,l"(i64 %addr, i64 %val);
+  ret i64 %old;
+}
+define i64 @__atomic_xor_uniform_int64_global_nvptx(i64* %ptr, i64 %val) nounwind alwaysinline
+{
+  %addr = ptrtoint i64* %ptr to i64
+  %old = tail call i64 asm sideeffect "atom.xor.b64 $0, [$1], $2;", "=l,l,l"(i64 %addr, i64 %val);
+  ret i64 %old;
+}
+define i64 @__atomic_min_uniform_int64_global_nvptx(i64* %ptr, i64 %val) nounwind alwaysinline
+{
+  %addr = ptrtoint i64* %ptr to i64
+  %old = tail call i64 asm sideeffect "atom.min.s64 $0, [$1], $2;", "=l,l,l"(i64 %addr, i64 %val);
+  ret i64 %old;
+}
+define i64 @__atomic_max_uniform_int64_global_nvptx(i64* %ptr, i64 %val) nounwind alwaysinline
+{
+  %addr = ptrtoint i64* %ptr to i64
+  %old = tail call i64 asm sideeffect "atom.max.s64 $0, [$1], $2;", "=l,l,l"(i64 %addr, i64 %val);
+  ret i64 %old;
+}
+define i64 @__atomic_umin_uniform_uint64_global_nvptx(i64* %ptr, i64 %val) nounwind alwaysinline
+{
+  %addr = ptrtoint i64* %ptr to i64
+  %old = tail call i64 asm sideeffect "atom.min.u64 $0, [$1], $2;", "=l,l,l"(i64 %addr, i64 %val);
+  ret i64 %old;
+}
+define i64 @__atomic_umax_uniform_uint64_global_nvptx(i64* %ptr, i64 %val) nounwind alwaysinline
+{
+  %addr = ptrtoint i64* %ptr to i64
+  %old = tail call i64 asm sideeffect "atom.max.u64 $0, [$1], $2;", "=l,l,l"(i64 %addr, i64 %val);
+  ret i64 %old;
+}
+
+define(`global_atomic_uniform',`
+define $3 @__atomic_$2_uniform_$4_global($3 * %ptr, $3 %val) nounwind alwaysinline
+{
+entry:
+  %addr = ptrtoint $3 * %ptr to i64
+  %active = call i32 @__get_first_active_lane();
+  %lane = call i32 @__laneidx();
+  %c = icmp eq i32 %lane, %active
+  br i1 %c, label %p1, label %p2
+
+p1:
+  %t0 = call $3 @__atomic_$2_uniform_$4_global_nvptx($3 * %ptr, $3 %val);
+  br label %p2;
+
+p2:
+  %t1 = phi $3 [%t0, %p1], [zeroinitializer, %entry]
+  %old = call $3 @__shfl_$3_nvptx($3 %t1, i32 %active)
+  ret $3 %old;
+}
+')
+global_atomic_uniform(1, add, i32, int32)
+global_atomic_uniform(1, sub, i32, int32)
+global_atomic_uniform(1, and, i32, int32)
+global_atomic_uniform(1, or, i32, int32)
+global_atomic_uniform(1, xor, i32, int32)
+global_atomic_uniform(1, min, i32, int32)
+global_atomic_uniform(1, max, i32, int32)
+global_atomic_uniform(1, umin, i32, uint32)
+global_atomic_uniform(1, umax, i32, uint32)
+
+global_atomic_uniform(1, add, i64, int64)
+global_atomic_uniform(1, sub, i64, int64)
+global_atomic_uniform(1, and, i64, int64)
+global_atomic_uniform(1, or, i64, int64)
+global_atomic_uniform(1, xor, i64, int64)
+global_atomic_uniform(1, min, i64, int64)
+global_atomic_uniform(1, max, i64, int64)
+global_atomic_uniform(1, umin, i64, uint64)
+global_atomic_uniform(1, umax, i64, uint64)
+
+define(`global_atomic_varying',`
+define <1 x $3> @__atomic_$2_varying_$4_global(<1 x i64> %ptr, <1 x $3> %val, <1 x i1> %maskv) nounwind alwaysinline
+{
+entry:
+  %addr = bitcast <1 x i64> %ptr to i64
+  %c = bitcast <1 x i1> %maskv to i1
+  br i1 %c, label %p1, label %p2
+
+p1:
+  %sv = bitcast <1 x $3> %val to $3
+  %sptr = inttoptr i64 %addr to $3*
+  %t0 = call $3 @__atomic_$2_uniform_$4_global_nvptx($3 * %sptr, $3 %sv);
+  %t0v = bitcast $3 %t0 to <1 x $3>
+  ret <1 x $3> %t0v
+
+p2:
+  ret <1 x $3> %val
+}
+')
+global_atomic_varying(1, add, i32, int32)
+global_atomic_varying(1, sub, i32, int32)
+global_atomic_varying(1, and, i32, int32)
+global_atomic_varying(1, or, i32, int32)
+global_atomic_varying(1, xor, i32, int32)
+global_atomic_varying(1, min, i32, int32)
+global_atomic_varying(1, max, i32, int32)
+global_atomic_varying(1, umin, i32, uint32)
+global_atomic_varying(1, umax, i32, uint32)
+
+global_atomic_varying(1, add, i64, int64)
+global_atomic_varying(1, sub, i64, int64)
+global_atomic_varying(1, and, i64, int64)
+global_atomic_varying(1, or, i64, int64)
+global_atomic_varying(1, xor, i64, int64)
+global_atomic_varying(1, min, i64, int64)
+global_atomic_varying(1, max, i64, int64)
+global_atomic_varying(1, umin, i64, uint64)
+global_atomic_varying(1, umax, i64, uint64)
+
+;; Macro to declare the function that implements the swap atomic.
+;; Takes three parameters:
+;; $1: vector width of the target
+;; $2: llvm type of the vector elements (e.g. i32)
+;; $3: ispc type of the elements (e.g. int32)
+
+define(`global_swap', `
+declare $2 @__atomic_swap_uniform_$3_global($2* %ptr, $2 %val) nounwind alwaysinline ;
+')
+
+
+;; Similarly, macro to declare the function that implements the compare/exchange
+;; atomic. Takes three parameters:
+;; $1: vector width of the target
+;; $2: llvm type of the vector elements (e.g. i32)
+;; $3: ispc type of the elements (e.g. int32)
+
+define(`global_atomic_exchange', `
+
+declare <$1 x $2> @__atomic_compare_exchange_$3_global($2* %ptr, <$1 x $2> %cmp,
+                      <$1 x $2> %val, <$1 x MASK> %mask) nounwind alwaysinline ;
+
+declare $2 @__atomic_compare_exchange_uniform_$3_global($2* %ptr, $2 %cmp,
+                      $2 %val) nounwind alwaysinline ;
+')
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; atomics and memory barriers
+
+global_swap(WIDTH, i32, int32)
+global_swap(WIDTH, i64, int64)
+
+declare float @__atomic_swap_uniform_float_global(float * %ptr, float %val) nounwind alwaysinline ;
+declare double @__atomic_swap_uniform_double_global(double * %ptr, double %val) nounwind alwaysinline ;
+global_atomic_exchange(WIDTH, i32, int32)
+global_atomic_exchange(WIDTH, i64, int64)
+
+declare <WIDTH x float> @__atomic_compare_exchange_float_global(float * %ptr,
+                      <WIDTH x float> %cmp, <WIDTH x float> %val, <WIDTH x MASK> %mask) nounwind alwaysinline ;
+declare <WIDTH x double> @__atomic_compare_exchange_double_global(double * %ptr,
+                      <WIDTH x double> %cmp, <WIDTH x double> %val, <WIDTH x MASK> %mask) nounwind alwaysinline ;
+declare float @__atomic_compare_exchange_uniform_float_global(float * %ptr, float %cmp,
+                      float %val) nounwind alwaysinline ;
+declare double @__atomic_compare_exchange_uniform_double_global(double * %ptr, double %cmp,
+                      double %val) nounwind alwaysinline ;
+
+declare void @llvm.nvvm.membar.gl()
+declare void @llvm.nvvm.membar.sys()
+declare void @llvm.nvvm.membar.cta()
+
+define void @__memory_barrier() nounwind readnone alwaysinline {
+  ;; On NVPTX we implement this with a global-scope (device-wide) membar;
+  ;; the membar.cta and membar.sys intrinsics declared above provide
+  ;; narrower (CTA) and wider (system) scopes, respectively, if they are
+  ;; ever needed.
+  call void @llvm.nvvm.membar.gl()
+  ret void
+}
diff --git a/builtins/util-nvptx.m4 b/builtins/util-nvptx.m4
index 7bb1014b..ede70860 100644
--- a/builtins/util-nvptx.m4
+++ b/builtins/util-nvptx.m4
@@ -768,27 +768,6 @@ shuffles(double, 8)
 shuffles(i64, 8)
 ')
 
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; global_atomic_associative
-;; More efficient implementation for atomics that are associative (e.g.,
-;; add, and, ...). If a basic implementation would do sometihng like:
-;; result0 = atomic_op(ptr, val0)
-;; result1 = atomic_op(ptr, val1)
-;; ..
-;; Then instead we can do:
-;; tmp = (val0 op val1 op ...)
-;; result0 = atomic_op(ptr, tmp)
-;; result1 = (result0 op val0)
-;; ..
-;; And more efficiently compute the same result
-;;
-;; Takes five parameters:
-;; $1: vector width of the target
-;; $2: operation being performed (w.r.t. LLVM atomic intrinsic names)
-;; (add, sub...)
-;; $3: return type of the LLVM atomic (e.g. i32)
-;; $4: return type of the LLVM atomic type, in ispc naming paralance (e.g. int32)
-;; $5: identity value for the operator (e.g. 0 for add, -1 for AND, ...)
 
 define(`mask_converts', `
 define internal <$1 x i8> @convertmask_i1_i8_$1(<$1 x i1>) {
@@ -875,54 +854,6 @@ define internal <$1 x i64> @convertmask_i64_i64_$1(<$1 x i64>) {
 
 mask_converts(WIDTH)
 
-define(`global_atomic_associative', `
-
-declare <$1 x $3> @__atomic_$2_$4_global($3 * %ptr, <$1 x $3> %val,
-                      <$1 x MASK> %m) nounwind alwaysinline ;
-')
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; global_atomic_uniform
-;; Defines the implementation of a function that handles the mapping from
-;; an ispc atomic function to the underlying LLVM intrinsics. This variant
-;; just calls the atomic once, for the given uniform value
-;;
-;; Takes four parameters:
-;; $1: vector width of the target
-;; $2: operation being performed (w.r.t. LLVM atomic intrinsic names)
-;; (add, sub...)
-;; $3: return type of the LLVM atomic (e.g. i32)
-;; $4: return type of the LLVM atomic type, in ispc naming paralance (e.g. int32)
-
-define(`global_atomic_uniform', `
-declare $3 @__atomic_$2_uniform_$4_global($3 * %ptr, $3 %val) nounwind alwaysinline ;
-')
-
-;; Macro to declare the function that implements the swap atomic.
-;; Takes three parameters:
-;; $1: vector width of the target
-;; $2: llvm type of the vector elements (e.g. i32)
-;; $3: ispc type of the elements (e.g. int32)
-
-define(`global_swap', `
-declare $2 @__atomic_swap_uniform_$3_global($2* %ptr, $2 %val) nounwind alwaysinline ;
-')
-
-
-;; Similarly, macro to declare the function that implements the compare/exchange
-;; atomic. Takes three parameters:
-;; $1: vector width of the target
-;; $2: llvm type of the vector elements (e.g. i32)
-;; $3: ispc type of the elements (e.g. int32)
-
-define(`global_atomic_exchange', `
-
-declare <$1 x $2> @__atomic_compare_exchange_$3_global($2* %ptr, <$1 x $2> %cmp,
-                      <$1 x $2> %val, <$1 x MASK> %mask) nounwind alwaysinline ;
-
-declare $2 @__atomic_compare_exchange_uniform_$3_global($2* %ptr, $2 %cmp,
-                      $2 %val) nounwind alwaysinline ;
-')
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; count trailing zeros
@@ -2507,67 +2438,6 @@ define double @__stdlib_pow(double, double) nounwind readnone alwaysinline {
   ret double %r
 }
 
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; atomics and memory barriers
-
-declare void @llvm.memory.barrier(i1 %loadload, i1 %loadstore, i1 %storeload,
-                      i1 %storestore, i1 %device)
-
-define void @__memory_barrier() nounwind readnone alwaysinline {
-  ;; see http://llvm.org/bugs/show_bug.cgi?id=2829. It seems like we
-  ;; only get an MFENCE on x86 if "device" is true, but IMHO we should
-  ;; in the case where the first 4 args are true but it is false.
-  ;; So we just always set that to true...
-  call void @llvm.memory.barrier(i1 true, i1 true, i1 true, i1 true, i1 true)
-  ret void
-}
-
-global_atomic_associative(WIDTH, add, i32, int32, 0)
-global_atomic_associative(WIDTH, sub, i32, int32, 0)
-global_atomic_associative(WIDTH, and, i32, int32, -1)
-global_atomic_associative(WIDTH, or, i32, int32, 0)
-global_atomic_associative(WIDTH, xor, i32, int32, 0)
-global_atomic_uniform(WIDTH, add, i32, int32)
-global_atomic_uniform(WIDTH, sub, i32, int32)
-global_atomic_uniform(WIDTH, and, i32, int32)
-global_atomic_uniform(WIDTH, or, i32, int32)
-global_atomic_uniform(WIDTH, xor, i32, int32)
-global_atomic_uniform(WIDTH, min, i32, int32)
-global_atomic_uniform(WIDTH, max, i32, int32)
-global_atomic_uniform(WIDTH, umin, i32, uint32)
-global_atomic_uniform(WIDTH, umax, i32, uint32)
-
-global_atomic_associative(WIDTH, add, i64, int64, 0)
-global_atomic_associative(WIDTH, sub, i64, int64, 0)
-global_atomic_associative(WIDTH, and, i64, int64, -1)
-global_atomic_associative(WIDTH, or, i64, int64, 0)
-global_atomic_associative(WIDTH, xor, i64, int64, 0)
-global_atomic_uniform(WIDTH, add, i64, int64)
-global_atomic_uniform(WIDTH, sub, i64, int64)
-global_atomic_uniform(WIDTH, and, i64, int64)
-global_atomic_uniform(WIDTH, or, i64, int64)
-global_atomic_uniform(WIDTH, xor, i64, int64)
-global_atomic_uniform(WIDTH, min, i64, int64)
-global_atomic_uniform(WIDTH, max, i64, int64)
-global_atomic_uniform(WIDTH, umin, i64, uint64)
-global_atomic_uniform(WIDTH, umax, i64, uint64)
-
-global_swap(WIDTH, i32, int32)
-global_swap(WIDTH, i64, int64)
-
-declare float @__atomic_swap_uniform_float_global(float * %ptr, float %val) nounwind alwaysinline ;
-declare double @__atomic_swap_uniform_double_global(double * %ptr, double %val) nounwind alwaysinline ;
-global_atomic_exchange(WIDTH, i32, int32)
-global_atomic_exchange(WIDTH, i64, int64)
-
-declare <WIDTH x float> @__atomic_compare_exchange_float_global(float * %ptr,
-                      <WIDTH x float> %cmp, <WIDTH x float> %val, <WIDTH x MASK> %mask) nounwind alwaysinline ;
-declare <WIDTH x double> @__atomic_compare_exchange_double_global(double * %ptr,
-                      <WIDTH x double> %cmp, <WIDTH x double> %val, <WIDTH x MASK> %mask) nounwind alwaysinline ;
-declare float @__atomic_compare_exchange_uniform_float_global(float * %ptr, float %cmp,
-                      float %val) nounwind alwaysinline ;
-declare double @__atomic_compare_exchange_uniform_double_global(double * %ptr, double %cmp,
-                      double %val) nounwind alwaysinline ;
 
 ')
diff --git a/stdlib.ispc b/stdlib.ispc
index 2d79bf33..a607fab7 100644
--- a/stdlib.ispc
+++ b/stdlib.ispc
@@ -1814,7 +1814,7 @@ static inline void memory_barrier() {
     __memory_barrier();
 }
 
-#define DEFINE_ATOMIC_OP(TA,TB,OPA,OPB,MASKTYPE) \
+#define DEFINE_ATOMIC_OP(TA,TB,OPA,OPB,MASKTYPE,TC) \
 static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
     TA ret = __atomic_##OPB##_##TB##_global(ptr, value, (MASKTYPE)__mask); \
     return ret; \
 } \
@@ -1825,6 +1825,10 @@ static inline uniform TA atomic_##OPA##_global(uniform TA * uniform ptr, \
     return ret; \
 } \
 static inline TA atomic_##OPA##_global(uniform TA * varying ptr, TA value) { \
+    if (__is_nvptx_target) { \
+        TA ret = __atomic_##OPB##_varying_##TB##_global((TC)ptr, value, (MASKTYPE)__mask); \
+        return ret; \
+    } else { \
     uniform TA * uniform ptrArray[programCount]; \
     ptrArray[programIndex] = ptr; \
     TA ret; \
@@ -1835,6 +1839,7 @@ static inline TA atomic_##OPA##_global(uniform TA * varying ptr, TA value) { \
         ret = insert(ret, i, r); \
     } \
     return ret; \
+    } \
 } \
 
 #define DEFINE_ATOMIC_SWAP(TA,TB) \
@@ -1888,7 +1893,7 @@ static inline TA atomic_swap_global(uniform TA * varying ptr, TA value) { \
     return ret; \
 } \
 
-#define DEFINE_ATOMIC_MINMAX_OP(TA,TB,OPA,OPB) \
+#define DEFINE_ATOMIC_MINMAX_OP(TA,TB,OPA,OPB,MASKTYPE,TC) \
 static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
     uniform TA oneval = reduce_##OPA(value); \
     TA ret; \
@@ -1903,6 +1908,10 @@ static inline uniform TA atomic_##OPA##_global(uniform TA * uniform ptr, \
 } \
 static inline TA atomic_##OPA##_global(uniform TA * varying ptr, \
                                        TA value) { \
+    if (__is_nvptx_target) { \
+        TA ret = __atomic_##OPB##_varying_##TB##_global((TC)ptr, value, (MASKTYPE)__mask); \
+        return ret; \
+    } else { \
     uniform TA * uniform ptrArray[programCount]; \
     ptrArray[programIndex] = ptr; \
     TA ret; \
@@ -1913,48 +1922,49 @@ static inline TA atomic_##OPA##_global(uniform TA * varying ptr, \
         ret = insert(ret, i, r); \
     } \
     return ret; \
+    } \
 }
 
-DEFINE_ATOMIC_OP(int32,int32,add,add,IntMaskType)
-DEFINE_ATOMIC_OP(int32,int32,subtract,sub,IntMaskType)
-DEFINE_ATOMIC_MINMAX_OP(int32,int32,min,min)
-DEFINE_ATOMIC_MINMAX_OP(int32,int32,max,max)
-DEFINE_ATOMIC_OP(int32,int32,and,and,IntMaskType)
-DEFINE_ATOMIC_OP(int32,int32,or,or,IntMaskType)
-DEFINE_ATOMIC_OP(int32,int32,xor,xor,IntMaskType)
+DEFINE_ATOMIC_OP(int32,int32,add,add,IntMaskType,int64)
+DEFINE_ATOMIC_OP(int32,int32,subtract,sub,IntMaskType,int64)
+DEFINE_ATOMIC_MINMAX_OP(int32,int32,min,min,IntMaskType,int64)
+DEFINE_ATOMIC_MINMAX_OP(int32,int32,max,max,IntMaskType,int64)
+DEFINE_ATOMIC_OP(int32,int32,and,and,IntMaskType,int64)
+DEFINE_ATOMIC_OP(int32,int32,or,or,IntMaskType,int64)
+DEFINE_ATOMIC_OP(int32,int32,xor,xor,IntMaskType,int64)
 DEFINE_ATOMIC_SWAP(int32,int32)
 
 // For everything but atomic min and max, we can use the same
 // implementations for unsigned as for signed.
-DEFINE_ATOMIC_OP(unsigned int32,int32,add,add,UIntMaskType)
-DEFINE_ATOMIC_OP(unsigned int32,int32,subtract,sub,UIntMaskType)
-DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,min,umin)
-DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,max,umax)
-DEFINE_ATOMIC_OP(unsigned int32,int32,and,and,UIntMaskType)
-DEFINE_ATOMIC_OP(unsigned int32,int32,or,or,UIntMaskType)
-DEFINE_ATOMIC_OP(unsigned int32,int32,xor,xor,UIntMaskType)
+DEFINE_ATOMIC_OP(unsigned int32,int32,add,add,UIntMaskType, unsigned int64)
+DEFINE_ATOMIC_OP(unsigned int32,int32,subtract,sub,UIntMaskType, unsigned int64)
+DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,min,umin,UIntMaskType,unsigned int64)
+DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,max,umax,UIntMaskType,unsigned int64)
+DEFINE_ATOMIC_OP(unsigned int32,int32,and,and,UIntMaskType, unsigned int64)
+DEFINE_ATOMIC_OP(unsigned int32,int32,or,or,UIntMaskType, unsigned int64)
+DEFINE_ATOMIC_OP(unsigned int32,int32,xor,xor,UIntMaskType, unsigned int64)
 DEFINE_ATOMIC_SWAP(unsigned int32,int32)
 DEFINE_ATOMIC_SWAP(float,float)
 
-DEFINE_ATOMIC_OP(int64,int64,add,add,IntMaskType)
-DEFINE_ATOMIC_OP(int64,int64,subtract,sub,IntMaskType)
-DEFINE_ATOMIC_MINMAX_OP(int64,int64,min,min)
-DEFINE_ATOMIC_MINMAX_OP(int64,int64,max,max)
-DEFINE_ATOMIC_OP(int64,int64,and,and,IntMaskType)
-DEFINE_ATOMIC_OP(int64,int64,or,or,IntMaskType)
-DEFINE_ATOMIC_OP(int64,int64,xor,xor,IntMaskType)
+DEFINE_ATOMIC_OP(int64,int64,add,add,IntMaskType,int64)
+DEFINE_ATOMIC_OP(int64,int64,subtract,sub,IntMaskType,int64)
+DEFINE_ATOMIC_MINMAX_OP(int64,int64,min,min,IntMaskType,int64)
+DEFINE_ATOMIC_MINMAX_OP(int64,int64,max,max,IntMaskType,int64)
+DEFINE_ATOMIC_OP(int64,int64,and,and,IntMaskType,int64)
+DEFINE_ATOMIC_OP(int64,int64,or,or,IntMaskType,int64)
+DEFINE_ATOMIC_OP(int64,int64,xor,xor,IntMaskType,int64)
 DEFINE_ATOMIC_SWAP(int64,int64)
 
 // For everything but atomic min and max, we can use the same
 // implementations for unsigned as for signed.
-DEFINE_ATOMIC_OP(unsigned int64,int64,add,add,UIntMaskType)
-DEFINE_ATOMIC_OP(unsigned int64,int64,subtract,sub,UIntMaskType)
-DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,min,umin)
-DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,max,umax)
-DEFINE_ATOMIC_OP(unsigned int64,int64,and,and,UIntMaskType)
-DEFINE_ATOMIC_OP(unsigned int64,int64,or,or,UIntMaskType)
-DEFINE_ATOMIC_OP(unsigned int64,int64,xor,xor,UIntMaskType)
+DEFINE_ATOMIC_OP(unsigned int64,int64,add,add,UIntMaskType,unsigned int64)
+DEFINE_ATOMIC_OP(unsigned int64,int64,subtract,sub,UIntMaskType,unsigned int64)
+DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,min,umin,UIntMaskType,unsigned int64)
+DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,max,umax,UIntMaskType,unsigned int64)
+DEFINE_ATOMIC_OP(unsigned int64,int64,and,and,UIntMaskType,unsigned int64)
+DEFINE_ATOMIC_OP(unsigned int64,int64,or,or,UIntMaskType,unsigned int64)
+DEFINE_ATOMIC_OP(unsigned int64,int64,xor,xor,UIntMaskType,unsigned int64)
 DEFINE_ATOMIC_SWAP(unsigned int64,int64)
 DEFINE_ATOMIC_SWAP(double,double)
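
Note (illustration only, not part of the patch): the global_atomic_uniform and
global_atomic_associative code in target-nvptx.ll above implements the usual
warp-level trick of electing a single active lane to issue one hardware atomic
and then sharing the result with the other lanes. A minimal CUDA sketch of the
same idea follows; it assumes the CUDA 9+ *_sync warp intrinsics, and the
function names warpAtomicAddUniform / warpAggregatedInc are hypothetical
illustrations, not ispc entry points.

    #include <cuda_runtime.h>

    // Case 1: every active lane adds the same (uniform) value. One lane issues
    // the atomic and broadcasts the old value -- cf. __atomic_*_uniform_*_global,
    // __get_first_active_lane and __shfl_*_nvptx in target-nvptx.ll.
    __device__ int warpAtomicAddUniform(int *ptr, int val) {
        unsigned active = __activemask();          // lanes that reached this call
        int leader = __ffs(active) - 1;            // lowest active lane index
        int old = 0;
        if ((threadIdx.x & 31) == leader)
            old = atomicAdd(ptr, val);             // single atom.add for the warp
        return __shfl_sync(active, old, leader);   // broadcast the old value
    }

    // Case 2: the associative optimization from the global_atomic_associative
    // comment, specialized to "each lane adds 1": combine the per-lane values,
    // issue one atomic, then reconstruct each lane's own "old" result.
    __device__ int warpAggregatedInc(int *ctr) {
        unsigned active = __activemask();
        int leader = __ffs(active) - 1;
        int lane = threadIdx.x & 31;
        int rank = __popc(active & ((1u << lane) - 1)); // active lanes below this one
        int base = 0;
        if (lane == leader)
            base = atomicAdd(ctr, __popc(active));      // one atomic for the warp
        base = __shfl_sync(active, base, leader);
        return base + rank;                             // what atomicAdd(ctr, 1) would
                                                        // have returned for this lane
    }

The ispc code picks the last active lane (31 minus the count of leading zeros
of the ballot) rather than the first; either choice works, as long as exactly
one active lane performs the atomic.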