fix for __any

This commit is contained in:
Evghenii
2014-01-26 13:15:13 +01:00
parent 09ea9c9fd6
commit a7d4a3f922
4 changed files with 482 additions and 160 deletions

View File

@@ -353,10 +353,14 @@ lSetInternalFunctions(llvm::Module *module) {
"__atomic_add_int64_global",
"__atomic_add_uniform_int32_global",
"__atomic_add_uniform_int64_global",
"__atomic_add_varying_int32_global",
"__atomic_add_varying_int64_global",
"__atomic_and_int32_global",
"__atomic_and_int64_global",
"__atomic_and_uniform_int32_global",
"__atomic_and_uniform_int64_global",
"__atomic_and_varying_int32_global",
"__atomic_and_varying_int64_global",
"__atomic_compare_exchange_double_global",
"__atomic_compare_exchange_float_global",
"__atomic_compare_exchange_int32_global",
@@ -369,14 +373,22 @@ lSetInternalFunctions(llvm::Module *module) {
"__atomic_max_uniform_int64_global",
"__atomic_min_uniform_int32_global",
"__atomic_min_uniform_int64_global",
"__atomic_max_varying_int32_global",
"__atomic_max_varying_int64_global",
"__atomic_min_varying_int32_global",
"__atomic_min_varying_int64_global",
"__atomic_or_int32_global",
"__atomic_or_int64_global",
"__atomic_or_uniform_int32_global",
"__atomic_or_uniform_int64_global",
"__atomic_or_varying_int32_global",
"__atomic_or_varying_int64_global",
"__atomic_sub_int32_global",
"__atomic_sub_int64_global",
"__atomic_sub_uniform_int32_global",
"__atomic_sub_uniform_int64_global",
"__atomic_sub_varying_int32_global",
"__atomic_sub_varying_int64_global",
"__atomic_swap_double_global",
"__atomic_swap_float_global",
"__atomic_swap_int32_global",
@@ -389,10 +401,20 @@ lSetInternalFunctions(llvm::Module *module) {
"__atomic_umax_uniform_uint64_global",
"__atomic_umin_uniform_uint32_global",
"__atomic_umin_uniform_uint64_global",
"__atomic_umax_varying_uint32_global",
"__atomic_umax_varying_uint64_global",
"__atomic_umin_varying_uint32_global",
"__atomic_umin_varying_uint64_global",
"__atomic_xor_int32_global",
"__atomic_xor_int64_global",
"__atomic_xor_uniform_int32_global",
"__atomic_xor_uniform_int64_global",
"__atomic_xor_uniform_int32_global",
"__atomic_xor_uniform_int64_global",
"__atomic_xor_varying_int32_global",
"__atomic_xor_varying_int64_global",
"__atomic_xor_varying_int32_global",
"__atomic_xor_varying_int64_global",
"__broadcast_double",
"__broadcast_float",
"__broadcast_i16",

View File

@@ -1660,3 +1660,423 @@ define i64 @__clock() nounwind alwaysinline {
ret i64 %r
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; global_atomic_associative
;; More efficient implementation for atomics that are associative (e.g.,
;; add, and, ...). If a basic implementation would do something like:
;; result0 = atomic_op(ptr, val0)
;; result1 = atomic_op(ptr, val1)
;; ..
;; Then instead we can do:
;; tmp = (val0 op val1 op ...)
;; result0 = atomic_op(ptr, tmp)
;; result1 = (result0 op val0)
;; ..
;; and thereby compute the same result more efficiently.
;;
;; Takes five parameters:
;; $1: vector width of the target
;; $2: operation being performed (w.r.t. LLVM atomic intrinsic names)
;; (add, sub...)
;; $3: return type of the LLVM atomic (e.g. i32)
;; $4: return type of the LLVM atomic, in ispc naming parlance (e.g. int32)
;; $5: identity value for the operator (e.g. 0 for add, -1 for AND, ...)
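;; As a point of reference, this aggregation trick is what CUDA code typically
;; does by hand for warp-aggregated atomics. The sketch below is illustrative
;; only (it assumes the common case of every active lane adding 1, and the
;; helper name is made up for this example); it is not part of this commit.
__device__ int warp_aggregated_inc(int *counter) {
    unsigned active = __activemask();            // mask of currently active lanes
    int lane   = threadIdx.x & 31;
    int leader = __ffs((int)active) - 1;         // lowest active lane issues the atomic
    int base = 0;
    if (lane == leader)
        base = atomicAdd(counter, __popc(active));   // one atomic for the whole warp
    base = __shfl_sync(active, base, leader);        // broadcast the old value
    // reconstruct the value each lane would have seen from per-lane atomics
    unsigned lanes_before = active & ((1u << lane) - 1u);
    return base + __popc(lanes_before);
}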
;; add
define <1 x i32> @__atomic_add_int32_global(i32* %ptr, <1 x i32> %valv, <1 x i1> %maskv) nounwind alwaysinline
{
%mask = bitcast <1 x i1> %maskv to i1
%val = bitcast <1 x i32> %valv to i32
br i1 %mask, label %exec, label %pass
exec:
%addr = ptrtoint i32* %ptr to i64
%old = tail call i32 asm sideeffect "atom.add.u32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val);
%oldv = bitcast i32 %old to <1 x i32>
ret <1 x i32> %oldv
pass:
ret <1 x i32> %valv
}
;; sub
define <1 x i32> @__atomic_sub_int32_global(i32* %ptr, <1 x i32> %valv, <1 x i1> %maskv) nounwind alwaysinline
{
%nvalv = sub <1 x i32> <i32 0>, %valv
%ret = call <1 x i32> @__atomic_add_int32_global(i32* %ptr, <1 x i32> %nvalv, <1 x i1> %maskv);
ret <1 x i32> %ret;
}
;; and
define <1 x i32> @__atomic_and_int32_global(i32* %ptr, <1 x i32> %valv, <1 x i1> %maskv) nounwind alwaysinline
{
%mask = bitcast <1 x i1> %maskv to i1
%val = bitcast <1 x i32> %valv to i32
br i1 %mask, label %exec, label %pass
exec:
%addr = ptrtoint i32* %ptr to i64
%old = tail call i32 asm sideeffect "atom.and.b32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val);
%oldv = bitcast i32 %old to <1 x i32>
ret <1 x i32> %oldv
pass:
ret <1 x i32> %valv
}
;; or
define <1 x i32> @__atomic_or_int32_global(i32* %ptr, <1 x i32> %valv, <1 x i1> %maskv) nounwind alwaysinline
{
%mask = bitcast <1 x i1> %maskv to i1
%val = bitcast <1 x i32> %valv to i32
br i1 %mask, label %exec, label %pass
exec:
%addr = ptrtoint i32* %ptr to i64
%old = tail call i32 asm sideeffect "atom.or.b32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val);
%oldv = bitcast i32 %old to <1 x i32>
ret <1 x i32> %oldv
pass:
ret <1 x i32> %valv
}
;; xor
define <1 x i32> @__atomic_xor_int32_global(i32* %ptr, <1 x i32> %valv, <1 x i1> %maskv) nounwind alwaysinline
{
%mask = bitcast <1 x i1> %maskv to i1
%val = bitcast <1 x i32> %valv to i32
br i1 %mask, label %exec, label %pass
exec:
%addr = ptrtoint i32* %ptr to i64
%old = tail call i32 asm sideeffect "atom.xor.b32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val);
%oldv = bitcast i32 %old to <1 x i32>
ret <1 x i32> %oldv
pass:
ret <1 x i32> %valv
}
;;;;;;;;; int64
define <1 x i64> @__atomic_add_int64_global(i64* %ptr, <1 x i64> %valv, <1 x i1> %maskv) nounwind alwaysinline
{
%mask = bitcast <1 x i1> %maskv to i1
%val = bitcast <1 x i64> %valv to i64
br i1 %mask, label %exec, label %pass
exec:
%addr = ptrtoint i64* %ptr to i64
%old = tail call i64 asm sideeffect "atom.add.u64 $0, [$1], $2;", "=l,l,l"(i64 %addr, i64 %val);
%oldv = bitcast i64 %old to <1 x i64>
ret <1 x i64> %oldv
pass:
ret <1 x i64> %valv
}
define <1 x i64> @__atomic_sub_int64_global(i64* %ptr, <1 x i64> %valv, <1 x i1> %maskv) nounwind alwaysinline
{
%nvalv = sub <1 x i64> <i64 0>, %valv
%ret = call <1 x i64> @__atomic_add_int64_global(i64* %ptr, <1 x i64> %nvalv, <1 x i1> %maskv);
ret <1 x i64> %ret;
}
;; and
define <1 x i64> @__atomic_and_int64_global(i64* %ptr, <1 x i64> %valv, <1 x i1> %maskv) nounwind alwaysinline
{
%mask = bitcast <1 x i1> %maskv to i1
%val = bitcast <1 x i64> %valv to i64
br i1 %mask, label %exec, label %pass
exec:
%addr = ptrtoint i64* %ptr to i64
%old = tail call i64 asm sideeffect "atom.and.b64 $0, [$1], $2;", "=l,l,l"(i64 %addr, i64 %val);
%oldv = bitcast i64 %old to <1 x i64>
ret <1 x i64> %oldv
pass:
ret <1 x i64> %valv
}
;; or
define <1 x i64> @__atomic_or_int64_global(i64* %ptr, <1 x i64> %valv, <1 x i1> %maskv) nounwind alwaysinline
{
%mask = bitcast <1 x i1> %maskv to i1
%val = bitcast <1 x i64> %valv to i64
br i1 %mask, label %exec, label %pass
exec:
%addr = ptrtoint i64* %ptr to i64
%old = tail call i64 asm sideeffect "atom.or.b64 $0, [$1], $2;", "=l,l,l"(i64 %addr, i64 %val);
%oldv = bitcast i64 %old to <1 x i64>
ret <1 x i64> %oldv
pass:
ret <1 x i64> %valv
}
;; xor
define <1 x i64> @__atomic_xor_int64_global(i64* %ptr, <1 x i64> %valv, <1 x i1> %maskv) nounwind alwaysinline
{
%mask = bitcast <1 x i1> %maskv to i1
%val = bitcast <1 x i64> %valv to i64
br i1 %mask, label %exec, label %pass
exec:
%addr = ptrtoint i64* %ptr to i64
%old = tail call i64 asm sideeffect "atom.xor.b64 $0, [$1], $2;", "=l,l,l"(i64 %addr, i64 %val);
%oldv = bitcast i64 %old to <1 x i64>
ret <1 x i64> %oldv
pass:
ret <1 x i64> %valv
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; global_atomic_uniform
;; Defines the implementation of a function that handles the mapping from
;; an ispc atomic function to the underlying LLVM intrinsics. This variant
;; just calls the atomic once, for the given uniform value
;;
;; Takes four parameters:
;; $1: vector width of the target
;; $2: operation being performed (w.r.t. LLVM atomic intrinsic names)
;; (add, sub...)
;; $3: return type of the LLVM atomic (e.g. i32)
;; $4: return type of the LLVM atomic, in ispc naming parlance (e.g. int32)
;; returns the index of a single active lane (here the highest-numbered set bit
;; of the ballot, i.e. 31 - clz); that lane performs the atomic on behalf of the warp
define i32 @__get_first_active_lane()
{
%nact = call i32 @__ballot_nvptx(i1 true);
%lane1 = call i32 @__count_leading_zeros_i32(i32 %nact)
%lane = sub i32 31, %lane1
ret i32 %lane
}
define i32 @__atomic_add_uniform_int32_global_nvptx(i32* %ptr, i32 %val) nounwind alwaysinline
{
%addr = ptrtoint i32* %ptr to i64
%old = tail call i32 asm sideeffect "atom.add.u32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val);
ret i32 %old;
}
define i32 @__atomic_sub_uniform_int32_global_nvptx(i32* %ptr, i32 %val) nounwind alwaysinline
{
%nval = sub i32 0, %val;
%old = tail call i32 @__atomic_add_uniform_int32_global_nvptx(i32* %ptr, i32 %nval);
ret i32 %old;
}
define i32 @__atomic_and_uniform_int32_global_nvptx(i32* %ptr, i32 %val) nounwind alwaysinline
{
%addr = ptrtoint i32* %ptr to i64
%old = tail call i32 asm sideeffect "atom.and.b32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val);
ret i32 %old;
}
define i32 @__atomic_or_uniform_int32_global_nvptx(i32* %ptr, i32 %val) nounwind alwaysinline
{
%addr = ptrtoint i32* %ptr to i64
%old = tail call i32 asm sideeffect "atom.or.b32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val);
ret i32 %old;
}
define i32 @__atomic_xor_uniform_int32_global_nvptx(i32* %ptr, i32 %val) nounwind alwaysinline
{
%addr = ptrtoint i32* %ptr to i64
%old = tail call i32 asm sideeffect "atom.xor.b32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val);
ret i32 %old;
}
define i32 @__atomic_min_uniform_int32_global_nvptx(i32* %ptr, i32 %val) nounwind alwaysinline
{
%addr = ptrtoint i32* %ptr to i64
%old = tail call i32 asm sideeffect "atom.min.s32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val);
ret i32 %old;
}
define i32 @__atomic_max_uniform_int32_global_nvptx(i32* %ptr, i32 %val) nounwind alwaysinline
{
%addr = ptrtoint i32* %ptr to i64
%old = tail call i32 asm sideeffect "atom.max.s32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val);
ret i32 %old;
}
define i32 @__atomic_umin_uniform_uint32_global_nvptx(i32* %ptr, i32 %val) nounwind alwaysinline
{
%addr = ptrtoint i32* %ptr to i64
%old = tail call i32 asm sideeffect "atom.min.u32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val);
ret i32 %old;
}
define i32 @__atomic_umax_uniform_uint32_global_nvptx(i32* %ptr, i32 %val) nounwind alwaysinline
{
%addr = ptrtoint i32* %ptr to i64
%old = tail call i32 asm sideeffect "atom.max.u32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val);
ret i32 %old;
}
define i64 @__atomic_add_uniform_int64_global_nvptx(i64* %ptr, i64 %val) nounwind alwaysinline
{
%addr = ptrtoint i64* %ptr to i64
%old = tail call i64 asm sideeffect "atom.add.u64 $0, [$1], $2;", "=l,l,l"(i64 %addr, i64 %val);
ret i64 %old;
}
define i64 @__atomic_sub_uniform_int64_global_nvptx(i64* %ptr, i64 %val) nounwind alwaysinline
{
%nval = sub i64 0, %val;
%old = tail call i64 @__atomic_add_uniform_int64_global_nvptx(i64* %ptr, i64 %nval);
ret i64 %old;
}
define i64 @__atomic_and_uniform_int64_global_nvptx(i64* %ptr, i64 %val) nounwind alwaysinline
{
%addr = ptrtoint i64* %ptr to i64
%old = tail call i64 asm sideeffect "atom.and.b64 $0, [$1], $2;", "=l,l,l"(i64 %addr, i64 %val);
ret i64 %old;
}
define i64 @__atomic_or_uniform_int64_global_nvptx(i64* %ptr, i64 %val) nounwind alwaysinline
{
%addr = ptrtoint i64* %ptr to i64
%old = tail call i64 asm sideeffect "atom.or.b64 $0, [$1], $2;", "=l,l,l"(i64 %addr, i64 %val);
ret i64 %old;
}
define i64 @__atomic_xor_uniform_int64_global_nvptx(i64* %ptr, i64 %val) nounwind alwaysinline
{
%addr = ptrtoint i64* %ptr to i64
%old = tail call i64 asm sideeffect "atom.xor.b64 $0, [$1], $2;", "=l,l,l"(i64 %addr, i64 %val);
ret i64 %old;
}
define i64 @__atomic_min_uniform_int64_global_nvptx(i64* %ptr, i64 %val) nounwind alwaysinline
{
%addr = ptrtoint i64* %ptr to i64
%old = tail call i64 asm sideeffect "atom.min.s64 $0, [$1], $2;", "=l,l,l"(i64 %addr, i64 %val);
ret i64 %old;
}
define i64 @__atomic_max_uniform_int64_global_nvptx(i64* %ptr, i64 %val) nounwind alwaysinline
{
%addr = ptrtoint i64* %ptr to i64
%old = tail call i64 asm sideeffect "atom.max.s64 $0, [$1], $2;", "=l,l,l"(i64 %addr, i64 %val);
ret i64 %old;
}
define i64 @__atomic_umin_uniform_uint64_global_nvptx(i64* %ptr, i64 %val) nounwind alwaysinline
{
%addr = ptrtoint i64* %ptr to i64
%old = tail call i64 asm sideeffect "atom.min.u64 $0, [$1], $2;", "=l,l,l"(i64 %addr, i64 %val);
ret i64 %old;
}
define i64 @__atomic_umax_uniform_uint64_global_nvptx(i64* %ptr, i64 %val) nounwind alwaysinline
{
%addr = ptrtoint i64* %ptr to i64
%old = tail call i64 asm sideeffect "atom.max.u64 $0, [$1], $2;", "=l,l,l"(i64 %addr, i64 %val);
ret i64 %old;
}
define(`global_atomic_uniform',`
define $3 @__atomic_$2_uniform_$4_global($3 * %ptr, $3 %val) nounwind alwaysinline
{
entry:
%addr = ptrtoint $3 * %ptr to i64
%active = call i32 @__get_first_active_lane();
%lane = call i32 @__laneidx();
%c = icmp eq i32 %lane, %active
br i1 %c, label %p1, label %p2
p1:
%t0 = call $3 @__atomic_$2_uniform_$4_global_nvptx($3 * %ptr, $3 %val);
br label %p2;
p2:
%t1 = phi $3 [%t0, %p1], [zeroinitializer, %entry]
%old = call $3 @__shfl_$3_nvptx($3 %t1, i32 %active)
ret $3 %old;
}
')
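;; For readers more familiar with CUDA, the global_atomic_uniform pattern above
;; is roughly the following (a hedged sketch with made-up names, not part of
;; this commit): one active lane issues the atomic and the old value is
;; shuffled back to every lane.
__device__ int atomic_add_uniform_sketch(int *ptr, int val) {
    unsigned active = __activemask();
    int lane   = threadIdx.x & 31;
    int leader = 31 - __clz((int)active);     // highest-numbered active lane,
                                              // mirroring __get_first_active_lane
    int old = 0;
    if (lane == leader)
        old = atomicAdd(ptr, val);            // a single atomic per warp
    return __shfl_sync(active, old, leader);  // every lane returns the same old value
}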
global_atomic_uniform(1, add, i32, int32)
global_atomic_uniform(1, sub, i32, int32)
global_atomic_uniform(1, and, i32, int32)
global_atomic_uniform(1, or, i32, int32)
global_atomic_uniform(1, xor, i32, int32)
global_atomic_uniform(1, min, i32, int32)
global_atomic_uniform(1, max, i32, int32)
global_atomic_uniform(1, umin, i32, uint32)
global_atomic_uniform(1, umax, i32, uint32)
global_atomic_uniform(1, add, i64, int64)
global_atomic_uniform(1, sub, i64, int64)
global_atomic_uniform(1, and, i64, int64)
global_atomic_uniform(1, or, i64, int64)
global_atomic_uniform(1, xor, i64, int64)
global_atomic_uniform(1, min, i64, int64)
global_atomic_uniform(1, max, i64, int64)
global_atomic_uniform(1, umin, i64, uint64)
global_atomic_uniform(1, umax, i64, uint64)
define(`global_atomic_varying',`
define <1 x $3> @__atomic_$2_varying_$4_global(<1 x i64> %ptr, <1 x $3> %val, <1 x i1> %maskv) nounwind alwaysinline
{
entry:
%addr = bitcast <1 x i64> %ptr to i64
%c = bitcast <1 x i1> %maskv to i1
br i1 %c, label %p1, label %p2
p1:
%sv = bitcast <1 x $3> %val to $3
%sptr = inttoptr i64 %addr to $3*
%t0 = call $3 @__atomic_$2_uniform_$4_global_nvptx($3 * %sptr, $3 %sv);
%t0v = bitcast $3 %t0 to <1 x $3>
ret <1 x $3> %t0v
p2:
ret <1 x $3> %val
}
')
global_atomic_varying(1, add, i32, int32)
global_atomic_varying(1, sub, i32, int32)
global_atomic_varying(1, and, i32, int32)
global_atomic_varying(1, or, i32, int32)
global_atomic_varying(1, xor, i32, int32)
global_atomic_varying(1, min, i32, int32)
global_atomic_varying(1, max, i32, int32)
global_atomic_varying(1, umin, i32, uint32)
global_atomic_varying(1, umax, i32, uint32)
global_atomic_varying(1, add, i64, int64)
global_atomic_varying(1, sub, i64, int64)
global_atomic_varying(1, and, i64, int64)
global_atomic_varying(1, or, i64, int64)
global_atomic_varying(1, xor, i64, int64)
global_atomic_varying(1, min, i64, int64)
global_atomic_varying(1, max, i64, int64)
global_atomic_varying(1, umin, i64, uint64)
global_atomic_varying(1, umax, i64, uint64)
;; Macro to declare the function that implements the swap atomic.
;; Takes three parameters:
;; $1: vector width of the target
;; $2: llvm type of the vector elements (e.g. i32)
;; $3: ispc type of the elements (e.g. int32)
define(`global_swap', `
declare $2 @__atomic_swap_uniform_$3_global($2* %ptr, $2 %val) nounwind alwaysinline ;
')
;; Similarly, macro to declare the function that implements the compare/exchange
;; atomic. Takes three parameters:
;; $1: vector width of the target
;; $2: llvm type of the vector elements (e.g. i32)
;; $3: ispc type of the elements (e.g. int32)
define(`global_atomic_exchange', `
declare <$1 x $2> @__atomic_compare_exchange_$3_global($2* %ptr, <$1 x $2> %cmp,
<$1 x $2> %val, <$1 x MASK> %mask) nounwind alwaysinline ;
declare $2 @__atomic_compare_exchange_uniform_$3_global($2* %ptr, $2 %cmp,
$2 %val) nounwind alwaysinline ;
')
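;; The swap and compare/exchange builtins declared above are implemented
;; elsewhere; for reference, their uniform semantics correspond to the CUDA
;; atomics below (an illustrative sketch, names made up for this example):
__device__ int swap_uniform_int32_sketch(int *ptr, int val) {
    return atomicExch(ptr, val);        // unconditionally store val, return the old value
}
__device__ int cmpxchg_uniform_int32_sketch(int *ptr, int cmp, int val) {
    return atomicCAS(ptr, cmp, val);    // store val only if *ptr == cmp; always return the old value
}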
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; atomics and memory barriers
global_swap(WIDTH, i32, int32)
global_swap(WIDTH, i64, int64)
declare float @__atomic_swap_uniform_float_global(float * %ptr, float %val) nounwind alwaysinline ;
declare double @__atomic_swap_uniform_double_global(double * %ptr, double %val) nounwind alwaysinline ;
global_atomic_exchange(WIDTH, i32, int32)
global_atomic_exchange(WIDTH, i64, int64)
declare <WIDTH x float> @__atomic_compare_exchange_float_global(float * %ptr,
<WIDTH x float> %cmp, <WIDTH x float> %val, <WIDTH x MASK> %mask) nounwind alwaysinline ;
declare <WIDTH x double> @__atomic_compare_exchange_double_global(double * %ptr,
<WIDTH x double> %cmp, <WIDTH x double> %val, <WIDTH x MASK> %mask) nounwind alwaysinline ;
declare float @__atomic_compare_exchange_uniform_float_global(float * %ptr, float %cmp,
float %val) nounwind alwaysinline ;
declare double @__atomic_compare_exchange_uniform_double_global(double * %ptr, double %cmp,
double %val) nounwind alwaysinline ;
declare void @llvm.nvvm.membar.gl()
declare void @llvm.nvvm.membar.sys()
declare void @llvm.nvvm.membar.cta()
define void @__memory_barrier() nounwind readnone alwaysinline {
;; ispc's memory_barrier() maps to a device-wide fence on the NVPTX target:
;; membar.gl orders this thread's prior memory accesses with respect to all
;; other threads on the device.
call void @llvm.nvvm.membar.gl()
ret void
}
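;; For reference, the NVVM membar intrinsics correspond to CUDA's fence
;; intrinsics (membar.cta ~ __threadfence_block, membar.gl ~ __threadfence,
;; membar.sys ~ __threadfence_system). A minimal, illustrative use of the
;; device-wide fence chosen above:
__device__ void publish(int *data, volatile int *flag, int v) {
    *data = v;
    __threadfence();   // device-wide fence: make the store to *data visible
                       // before the flag is raised (maps to membar.gl)
    *flag = 1;         // a consumer polling *flag will then observe *data == v
}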

View File

@@ -768,27 +768,6 @@ shuffles(double, 8)
shuffles(i64, 8)
')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; global_atomic_associative
;; More efficient implementation for atomics that are associative (e.g.,
;; add, and, ...). If a basic implementation would do something like:
;; result0 = atomic_op(ptr, val0)
;; result1 = atomic_op(ptr, val1)
;; ..
;; Then instead we can do:
;; tmp = (val0 op val1 op ...)
;; result0 = atomic_op(ptr, tmp)
;; result1 = (result0 op val0)
;; ..
;; and thereby compute the same result more efficiently.
;;
;; Takes five parameters:
;; $1: vector width of the target
;; $2: operation being performed (w.r.t. LLVM atomic intrinsic names)
;; (add, sub...)
;; $3: return type of the LLVM atomic (e.g. i32)
;; $4: return type of the LLVM atomic, in ispc naming parlance (e.g. int32)
;; $5: identity value for the operator (e.g. 0 for add, -1 for AND, ...)
define(`mask_converts', `
define internal <$1 x i8> @convertmask_i1_i8_$1(<$1 x i1>) {
@@ -875,54 +854,6 @@ define internal <$1 x i64> @convertmask_i64_i64_$1(<$1 x i64>) {
mask_converts(WIDTH)
define(`global_atomic_associative', `
declare <$1 x $3> @__atomic_$2_$4_global($3 * %ptr, <$1 x $3> %val,
<$1 x MASK> %m) nounwind alwaysinline ;
')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; global_atomic_uniform
;; Defines the implementation of a function that handles the mapping from
;; an ispc atomic function to the underlying LLVM intrinsics. This variant
;; just calls the atomic once, for the given uniform value
;;
;; Takes four parameters:
;; $1: vector width of the target
;; $2: operation being performed (w.r.t. LLVM atomic intrinsic names)
;; (add, sub...)
;; $3: return type of the LLVM atomic (e.g. i32)
;; $4: return type of the LLVM atomic, in ispc naming parlance (e.g. int32)
define(`global_atomic_uniform', `
declare $3 @__atomic_$2_uniform_$4_global($3 * %ptr, $3 %val) nounwind alwaysinline ;
')
;; Macro to declare the function that implements the swap atomic.
;; Takes three parameters:
;; $1: vector width of the target
;; $2: llvm type of the vector elements (e.g. i32)
;; $3: ispc type of the elements (e.g. int32)
define(`global_swap', `
declare $2 @__atomic_swap_uniform_$3_global($2* %ptr, $2 %val) nounwind alwaysinline ;
')
;; Similarly, macro to declare the function that implements the compare/exchange
;; atomic. Takes three parameters:
;; $1: vector width of the target
;; $2: llvm type of the vector elements (e.g. i32)
;; $3: ispc type of the elements (e.g. int32)
define(`global_atomic_exchange', `
declare <$1 x $2> @__atomic_compare_exchange_$3_global($2* %ptr, <$1 x $2> %cmp,
<$1 x $2> %val, <$1 x MASK> %mask) nounwind alwaysinline ;
declare $2 @__atomic_compare_exchange_uniform_$3_global($2* %ptr, $2 %cmp,
$2 %val) nounwind alwaysinline ;
')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; count trailing zeros
@@ -2507,67 +2438,6 @@ define double @__stdlib_pow(double, double) nounwind readnone alwaysinline {
ret double %r
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; atomics and memory barriers
declare void @llvm.memory.barrier(i1 %loadload, i1 %loadstore, i1 %storeload,
i1 %storestore, i1 %device)
define void @__memory_barrier() nounwind readnone alwaysinline {
;; see http://llvm.org/bugs/show_bug.cgi?id=2829. It seems like we
;; only get an MFENCE on x86 if "device" is true, but IMHO we should
;; in the case where the first 4 args are true but it is false.
;; So we just always set that to true...
call void @llvm.memory.barrier(i1 true, i1 true, i1 true, i1 true, i1 true)
ret void
}
global_atomic_associative(WIDTH, add, i32, int32, 0)
global_atomic_associative(WIDTH, sub, i32, int32, 0)
global_atomic_associative(WIDTH, and, i32, int32, -1)
global_atomic_associative(WIDTH, or, i32, int32, 0)
global_atomic_associative(WIDTH, xor, i32, int32, 0)
global_atomic_uniform(WIDTH, add, i32, int32)
global_atomic_uniform(WIDTH, sub, i32, int32)
global_atomic_uniform(WIDTH, and, i32, int32)
global_atomic_uniform(WIDTH, or, i32, int32)
global_atomic_uniform(WIDTH, xor, i32, int32)
global_atomic_uniform(WIDTH, min, i32, int32)
global_atomic_uniform(WIDTH, max, i32, int32)
global_atomic_uniform(WIDTH, umin, i32, uint32)
global_atomic_uniform(WIDTH, umax, i32, uint32)
global_atomic_associative(WIDTH, add, i64, int64, 0)
global_atomic_associative(WIDTH, sub, i64, int64, 0)
global_atomic_associative(WIDTH, and, i64, int64, -1)
global_atomic_associative(WIDTH, or, i64, int64, 0)
global_atomic_associative(WIDTH, xor, i64, int64, 0)
global_atomic_uniform(WIDTH, add, i64, int64)
global_atomic_uniform(WIDTH, sub, i64, int64)
global_atomic_uniform(WIDTH, and, i64, int64)
global_atomic_uniform(WIDTH, or, i64, int64)
global_atomic_uniform(WIDTH, xor, i64, int64)
global_atomic_uniform(WIDTH, min, i64, int64)
global_atomic_uniform(WIDTH, max, i64, int64)
global_atomic_uniform(WIDTH, umin, i64, uint64)
global_atomic_uniform(WIDTH, umax, i64, uint64)
global_swap(WIDTH, i32, int32)
global_swap(WIDTH, i64, int64)
declare float @__atomic_swap_uniform_float_global(float * %ptr, float %val) nounwind alwaysinline ;
declare double @__atomic_swap_uniform_double_global(double * %ptr, double %val) nounwind alwaysinline ;
global_atomic_exchange(WIDTH, i32, int32)
global_atomic_exchange(WIDTH, i64, int64)
declare <WIDTH x float> @__atomic_compare_exchange_float_global(float * %ptr,
<WIDTH x float> %cmp, <WIDTH x float> %val, <WIDTH x MASK> %mask) nounwind alwaysinline ;
declare <WIDTH x double> @__atomic_compare_exchange_double_global(double * %ptr,
<WIDTH x double> %cmp, <WIDTH x double> %val, <WIDTH x MASK> %mask) nounwind alwaysinline ;
declare float @__atomic_compare_exchange_uniform_float_global(float * %ptr, float %cmp,
float %val) nounwind alwaysinline ;
declare double @__atomic_compare_exchange_uniform_double_global(double * %ptr, double %cmp,
double %val) nounwind alwaysinline ;
')

View File

@@ -1814,7 +1814,7 @@ static inline void memory_barrier() {
__memory_barrier();
}
#define DEFINE_ATOMIC_OP(TA,TB,OPA,OPB,MASKTYPE) \
#define DEFINE_ATOMIC_OP(TA,TB,OPA,OPB,MASKTYPE,TC) \
static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
TA ret = __atomic_##OPB##_##TB##_global(ptr, value, (MASKTYPE)__mask); \
return ret; \
@@ -1825,6 +1825,10 @@ static inline uniform TA atomic_##OPA##_global(uniform TA * uniform ptr, \
return ret; \
} \
static inline TA atomic_##OPA##_global(uniform TA * varying ptr, TA value) { \
if (__is_nvptx_target) { \
TA ret = __atomic_##OPB##_varying_##TB##_global((TC)ptr, value, (MASKTYPE)__mask); \
return ret; \
} else { \
uniform TA * uniform ptrArray[programCount]; \
ptrArray[programIndex] = ptr; \
TA ret; \
@@ -1835,6 +1839,7 @@ static inline TA atomic_##OPA##_global(uniform TA * varying ptr, TA value) { \
ret = insert(ret, i, r); \
} \
return ret; \
} \
} \
#define DEFINE_ATOMIC_SWAP(TA,TB) \
@@ -1888,7 +1893,7 @@ static inline TA atomic_swap_global(uniform TA * varying ptr, TA value) { \
return ret; \
} \
#define DEFINE_ATOMIC_MINMAX_OP(TA,TB,OPA,OPB) \
#define DEFINE_ATOMIC_MINMAX_OP(TA,TB,OPA,OPB,MASKTYPE,TC) \
static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
uniform TA oneval = reduce_##OPA(value); \
TA ret; \
@@ -1903,6 +1908,10 @@ static inline uniform TA atomic_##OPA##_global(uniform TA * uniform ptr, \
} \
static inline TA atomic_##OPA##_global(uniform TA * varying ptr, \
TA value) { \
if (__is_nvptx_target) { \
TA ret = __atomic_##OPB##_varying_##TB##_global((TC)ptr, value, (MASKTYPE)__mask); \
return ret; \
} else { \
uniform TA * uniform ptrArray[programCount]; \
ptrArray[programIndex] = ptr; \
TA ret; \
@@ -1913,48 +1922,49 @@ static inline TA atomic_##OPA##_global(uniform TA * varying ptr, \
ret = insert(ret, i, r); \
} \
return ret; \
} \
}
DEFINE_ATOMIC_OP(int32,int32,add,add,IntMaskType)
DEFINE_ATOMIC_OP(int32,int32,subtract,sub,IntMaskType)
DEFINE_ATOMIC_MINMAX_OP(int32,int32,min,min)
DEFINE_ATOMIC_MINMAX_OP(int32,int32,max,max)
DEFINE_ATOMIC_OP(int32,int32,and,and,IntMaskType)
DEFINE_ATOMIC_OP(int32,int32,or,or,IntMaskType)
DEFINE_ATOMIC_OP(int32,int32,xor,xor,IntMaskType)
DEFINE_ATOMIC_OP(int32,int32,add,add,IntMaskType,int64)
DEFINE_ATOMIC_OP(int32,int32,subtract,sub,IntMaskType,int64)
DEFINE_ATOMIC_MINMAX_OP(int32,int32,min,min,IntMaskType,int64)
DEFINE_ATOMIC_MINMAX_OP(int32,int32,max,max,IntMaskType,int64)
DEFINE_ATOMIC_OP(int32,int32,and,and,IntMaskType,int64)
DEFINE_ATOMIC_OP(int32,int32,or,or,IntMaskType,int64)
DEFINE_ATOMIC_OP(int32,int32,xor,xor,IntMaskType,int64)
DEFINE_ATOMIC_SWAP(int32,int32)
// For everything but atomic min and max, we can use the same
// implementations for unsigned as for signed.
DEFINE_ATOMIC_OP(unsigned int32,int32,add,add,UIntMaskType)
DEFINE_ATOMIC_OP(unsigned int32,int32,subtract,sub,UIntMaskType)
DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,min,umin)
DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,max,umax)
DEFINE_ATOMIC_OP(unsigned int32,int32,and,and,UIntMaskType)
DEFINE_ATOMIC_OP(unsigned int32,int32,or,or,UIntMaskType)
DEFINE_ATOMIC_OP(unsigned int32,int32,xor,xor,UIntMaskType)
DEFINE_ATOMIC_OP(unsigned int32,int32,add,add,UIntMaskType, unsigned int64)
DEFINE_ATOMIC_OP(unsigned int32,int32,subtract,sub,UIntMaskType, unsigned int64)
DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,min,umin,UIntMaskType,unsigned int64)
DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,max,umax,UIntMaskType,unsigned int64)
DEFINE_ATOMIC_OP(unsigned int32,int32,and,and,UIntMaskType, unsigned int64)
DEFINE_ATOMIC_OP(unsigned int32,int32,or,or,UIntMaskType, unsigned int64)
DEFINE_ATOMIC_OP(unsigned int32,int32,xor,xor,UIntMaskType, unsigned int64)
DEFINE_ATOMIC_SWAP(unsigned int32,int32)
DEFINE_ATOMIC_SWAP(float,float)
DEFINE_ATOMIC_OP(int64,int64,add,add,IntMaskType)
DEFINE_ATOMIC_OP(int64,int64,subtract,sub,IntMaskType)
DEFINE_ATOMIC_MINMAX_OP(int64,int64,min,min)
DEFINE_ATOMIC_MINMAX_OP(int64,int64,max,max)
DEFINE_ATOMIC_OP(int64,int64,and,and,IntMaskType)
DEFINE_ATOMIC_OP(int64,int64,or,or,IntMaskType)
DEFINE_ATOMIC_OP(int64,int64,xor,xor,IntMaskType)
DEFINE_ATOMIC_OP(int64,int64,add,add,IntMaskType,int64)
DEFINE_ATOMIC_OP(int64,int64,subtract,sub,IntMaskType,int64)
DEFINE_ATOMIC_MINMAX_OP(int64,int64,min,min,IntMaskType,int64)
DEFINE_ATOMIC_MINMAX_OP(int64,int64,max,max,IntMaskType,int64)
DEFINE_ATOMIC_OP(int64,int64,and,and,IntMaskType,int64)
DEFINE_ATOMIC_OP(int64,int64,or,or,IntMaskType,int64)
DEFINE_ATOMIC_OP(int64,int64,xor,xor,IntMaskType,int64)
DEFINE_ATOMIC_SWAP(int64,int64)
// For everything but atomic min and max, we can use the same
// implementations for unsigned as for signed.
DEFINE_ATOMIC_OP(unsigned int64,int64,add,add,UIntMaskType)
DEFINE_ATOMIC_OP(unsigned int64,int64,subtract,sub,UIntMaskType)
DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,min,umin)
DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,max,umax)
DEFINE_ATOMIC_OP(unsigned int64,int64,and,and,UIntMaskType)
DEFINE_ATOMIC_OP(unsigned int64,int64,or,or,UIntMaskType)
DEFINE_ATOMIC_OP(unsigned int64,int64,xor,xor,UIntMaskType)
DEFINE_ATOMIC_OP(unsigned int64,int64,add,add,UIntMaskType,unsigned int64)
DEFINE_ATOMIC_OP(unsigned int64,int64,subtract,sub,UIntMaskType,unsigned int64)
DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,min,umin,UIntMaskType,unsigned int64)
DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,max,umax,UIntMaskType,unsigned int64)
DEFINE_ATOMIC_OP(unsigned int64,int64,and,and,UIntMaskType,unsigned int64)
DEFINE_ATOMIC_OP(unsigned int64,int64,or,or,UIntMaskType,unsigned int64)
DEFINE_ATOMIC_OP(unsigned int64,int64,xor,xor,UIntMaskType,unsigned int64)
DEFINE_ATOMIC_SWAP(unsigned int64,int64)
DEFINE_ATOMIC_SWAP(double,double)