fix for __any
builtins.cpp
@@ -353,10 +353,14 @@ lSetInternalFunctions(llvm::Module *module) {
"__atomic_add_int64_global",
"__atomic_add_uniform_int32_global",
"__atomic_add_uniform_int64_global",
"__atomic_add_varying_int32_global",
"__atomic_add_varying_int64_global",
"__atomic_and_int32_global",
"__atomic_and_int64_global",
"__atomic_and_uniform_int32_global",
"__atomic_and_uniform_int64_global",
"__atomic_and_varying_int32_global",
"__atomic_and_varying_int64_global",
"__atomic_compare_exchange_double_global",
"__atomic_compare_exchange_float_global",
"__atomic_compare_exchange_int32_global",
@@ -369,14 +373,22 @@ lSetInternalFunctions(llvm::Module *module) {
"__atomic_max_uniform_int64_global",
"__atomic_min_uniform_int32_global",
"__atomic_min_uniform_int64_global",
"__atomic_max_varying_int32_global",
"__atomic_max_varying_int64_global",
"__atomic_min_varying_int32_global",
"__atomic_min_varying_int64_global",
"__atomic_or_int32_global",
"__atomic_or_int64_global",
"__atomic_or_uniform_int32_global",
"__atomic_or_uniform_int64_global",
"__atomic_or_varying_int32_global",
"__atomic_or_varying_int64_global",
"__atomic_sub_int32_global",
"__atomic_sub_int64_global",
"__atomic_sub_uniform_int32_global",
"__atomic_sub_uniform_int64_global",
"__atomic_sub_varying_int32_global",
"__atomic_sub_varying_int64_global",
"__atomic_swap_double_global",
"__atomic_swap_float_global",
"__atomic_swap_int32_global",
@@ -389,10 +401,20 @@ lSetInternalFunctions(llvm::Module *module) {
"__atomic_umax_uniform_uint64_global",
"__atomic_umin_uniform_uint32_global",
"__atomic_umin_uniform_uint64_global",
"__atomic_umax_varying_uint32_global",
"__atomic_umax_varying_uint64_global",
"__atomic_umin_varying_uint32_global",
"__atomic_umin_varying_uint64_global",
"__atomic_xor_int32_global",
"__atomic_xor_int64_global",
"__atomic_xor_uniform_int32_global",
"__atomic_xor_uniform_int64_global",
"__atomic_xor_uniform_int32_global",
"__atomic_xor_uniform_int64_global",
"__atomic_xor_varying_int32_global",
"__atomic_xor_varying_int64_global",
"__atomic_xor_varying_int32_global",
"__atomic_xor_varying_int64_global",
"__broadcast_double",
"__broadcast_float",
"__broadcast_i16",

@@ -1660,3 +1660,423 @@ define i64 @__clock() nounwind alwaysinline {
ret i64 %r
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; global_atomic_associative
;; More efficient implementation for atomics that are associative (e.g.,
;; add, and, ...). If a basic implementation would do something like:
;; result0 = atomic_op(ptr, val0)
;; result1 = atomic_op(ptr, val1)
;; ..
;; Then instead we can do:
;; tmp = (val0 op val1 op ...)
;; result0 = atomic_op(ptr, tmp)
;; result1 = (result0 op val0)
;; ..
;; and compute the same result more efficiently.
;;
;; Takes five parameters:
;; $1: vector width of the target
;; $2: operation being performed (w.r.t. LLVM atomic intrinsic names)
;; (add, sub...)
;; $3: return type of the LLVM atomic (e.g. i32)
;; $4: return type of the LLVM atomic type, in ispc naming parlance (e.g. int32)
;; $5: identity value for the operator (e.g. 0 for add, -1 for AND, ...)
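For intuition, here is a minimal host-side sketch of the aggregation described above, in plain C++ (illustrative only, not part of the generated library; it assumes a hypothetical four-lane gang with all lanes active and uses add as the operator):

    #include <atomic>
    #include <cstdio>

    int main() {
        std::atomic<int> shared{100};      // the location behind 'ptr'
        int val[4] = {1, 2, 3, 4};         // per-lane operands val0..val3

        // One atomic for the whole gang: apply the combined value.
        int tmp = val[0] + val[1] + val[2] + val[3];
        int result0 = shared.fetch_add(tmp);

        // Reconstruct what per-lane atomics would have returned:
        // lane i observes result0 op (val0 op ... op val_{i-1}).
        int prefix = 0;
        for (int i = 0; i < 4; ++i) {
            std::printf("lane %d old value: %d\n", i, result0 + prefix);
            prefix += val[i];
        }
        return 0;
    }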
;; add
define <1 x i32> @__atomic_add_int32_global(i32* %ptr, <1 x i32> %valv, <1 x i1> %maskv) nounwind alwaysinline
{
%mask = bitcast <1 x i1> %maskv to i1
%val = bitcast <1 x i32> %valv to i32
br i1 %mask, label %exec, label %pass
exec:
%addr = ptrtoint i32* %ptr to i64
%old = tail call i32 asm sideeffect "atom.add.u32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val);
%oldv = bitcast i32 %old to <1 x i32>
ret <1 x i32> %oldv
pass:
ret <1 x i32> %valv
}
;; sub
define <1 x i32> @__atomic_sub_int32_global(i32* %ptr, <1 x i32> %valv, <1 x i1> %maskv) nounwind alwaysinline
{
%nvalv = sub <1 x i32> <i32 0>, %valv
%ret = call <1 x i32> @__atomic_add_int32_global(i32* %ptr, <1 x i32> %nvalv, <1 x i1> %maskv);
ret <1 x i32> %ret;
}
;; and
define <1 x i32> @__atomic_and_int32_global(i32* %ptr, <1 x i32> %valv, <1 x i1> %maskv) nounwind alwaysinline
{
%mask = bitcast <1 x i1> %maskv to i1
%val = bitcast <1 x i32> %valv to i32
br i1 %mask, label %exec, label %pass
exec:
%addr = ptrtoint i32* %ptr to i64
%old = tail call i32 asm sideeffect "atom.and.b32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val);
%oldv = bitcast i32 %old to <1 x i32>
ret <1 x i32> %oldv
pass:
ret <1 x i32> %valv
}
;; or
define <1 x i32> @__atomic_or_int32_global(i32* %ptr, <1 x i32> %valv, <1 x i1> %maskv) nounwind alwaysinline
{
%mask = bitcast <1 x i1> %maskv to i1
%val = bitcast <1 x i32> %valv to i32
br i1 %mask, label %exec, label %pass
exec:
%addr = ptrtoint i32* %ptr to i64
%old = tail call i32 asm sideeffect "atom.or.b32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val);
%oldv = bitcast i32 %old to <1 x i32>
ret <1 x i32> %oldv
pass:
ret <1 x i32> %valv
}
;; xor
define <1 x i32> @__atomic_xor_int32_global(i32* %ptr, <1 x i32> %valv, <1 x i1> %maskv) nounwind alwaysinline
{
%mask = bitcast <1 x i1> %maskv to i1
%val = bitcast <1 x i32> %valv to i32
br i1 %mask, label %exec, label %pass
exec:
%addr = ptrtoint i32* %ptr to i64
%old = tail call i32 asm sideeffect "atom.xor.b32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val);
%oldv = bitcast i32 %old to <1 x i32>
ret <1 x i32> %oldv
pass:
ret <1 x i32> %valv
}

;;;;;;;;; int64
define <1 x i64> @__atomic_add_int64_global(i64* %ptr, <1 x i64> %valv, <1 x i1> %maskv) nounwind alwaysinline
{
%mask = bitcast <1 x i1> %maskv to i1
%val = bitcast <1 x i64> %valv to i64
br i1 %mask, label %exec, label %pass
exec:
%addr = ptrtoint i64* %ptr to i64
%old = tail call i64 asm sideeffect "atom.add.u64 $0, [$1], $2;", "=l,l,l"(i64 %addr, i64 %val);
%oldv = bitcast i64 %old to <1 x i64>
ret <1 x i64> %oldv
pass:
ret <1 x i64> %valv
}
define <1 x i64> @__atomic_sub_int64_global(i64* %ptr, <1 x i64> %valv, <1 x i1> %maskv) nounwind alwaysinline
{
%nvalv = sub <1 x i64> <i64 0>, %valv
%ret = call <1 x i64> @__atomic_add_int64_global(i64* %ptr, <1 x i64> %nvalv, <1 x i1> %maskv);
ret <1 x i64> %ret;
}

;; and
define <1 x i64> @__atomic_and_int64_global(i64* %ptr, <1 x i64> %valv, <1 x i1> %maskv) nounwind alwaysinline
{
%mask = bitcast <1 x i1> %maskv to i1
%val = bitcast <1 x i64> %valv to i64
br i1 %mask, label %exec, label %pass
exec:
%andr = ptrtoint i64* %ptr to i64
%old = tail call i64 asm sideeffect "atom.and.b64 $0, [$1], $2;", "=l,l,l"(i64 %andr, i64 %val);
%oldv = bitcast i64 %old to <1 x i64>
ret <1 x i64> %oldv
pass:
ret <1 x i64> %valv
}

;; or
define <1 x i64> @__atomic_or_int64_global(i64* %ptr, <1 x i64> %valv, <1 x i1> %maskv) nounwind alwaysinline
{
%mask = bitcast <1 x i1> %maskv to i1
%val = bitcast <1 x i64> %valv to i64
br i1 %mask, label %exec, label %pass
exec:
%orr = ptrtoint i64* %ptr to i64
%old = tail call i64 asm sideeffect "atom.or.b64 $0, [$1], $2;", "=l,l,l"(i64 %orr, i64 %val);
%oldv = bitcast i64 %old to <1 x i64>
ret <1 x i64> %oldv
pass:
ret <1 x i64> %valv
}

;; xor
define <1 x i64> @__atomic_xor_int64_global(i64* %ptr, <1 x i64> %valv, <1 x i1> %maskv) nounwind alwaysinline
{
%mask = bitcast <1 x i1> %maskv to i1
%val = bitcast <1 x i64> %valv to i64
br i1 %mask, label %exec, label %pass
exec:
%xorr = ptrtoint i64* %ptr to i64
%old = tail call i64 asm sideeffect "atom.xor.b64 $0, [$1], $2;", "=l,l,l"(i64 %xorr, i64 %val);
%oldv = bitcast i64 %old to <1 x i64>
ret <1 x i64> %oldv
pass:
ret <1 x i64> %valv
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; global_atomic_uniform
;; Defines the implementation of a function that handles the mapping from
;; an ispc atomic function to the underlying LLVM intrinsics. This variant
;; just calls the atomic once, for the given uniform value.
;;
;; Takes four parameters:
;; $1: vector width of the target
;; $2: operation being performed (w.r.t. LLVM atomic intrinsic names)
;; (add, sub...)
;; $3: return type of the LLVM atomic (e.g. i32)
;; $4: return type of the LLVM atomic type, in ispc naming parlance (e.g. int32)

define i32 @__get_first_active_lane()
{
%nact = call i32 @__ballot_nvptx(i1 true);
%lane1 = call i32 @__count_leading_zeros_i32(i32 %nact)
%lane = sub i32 31, %lane1
ret i32 %lane
}

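Note that 31 - clz(ballot) selects the highest-numbered active lane rather than the lowest; for the leader-election use below, any single active lane works, since that lane's result is broadcast to the whole gang. A plain C++ sketch of the same bit arithmetic (illustrative only; it assumes a 32-bit ballot with bit i set when lane i is active and at least one lane active):

    #include <cstdint>
    #include <cstdio>

    // Index of the lane selected from a non-zero ballot mask.
    static int active_lane_from_ballot(uint32_t ballot) {
        return 31 - __builtin_clz(ballot);   // GCC/Clang builtin; index of the highest set bit
    }

    int main() {
        std::printf("%d\n", active_lane_from_ballot(0x0000000Eu));  // lanes 1..3 active -> 3
    }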
define i32 @__atomic_add_uniform_int32_global_nvptx(i32* %ptr, i32 %val) nounwind alwaysinline
{
%addr = ptrtoint i32* %ptr to i64
%old = tail call i32 asm sideeffect "atom.add.u32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val);
ret i32 %old;
}
define i32 @__atomic_sub_uniform_int32_global_nvptx(i32* %ptr, i32 %val) nounwind alwaysinline
{
%nval = sub i32 0, %val;
%old = tail call i32 @__atomic_add_uniform_int32_global_nvptx(i32* %ptr, i32 %nval);
ret i32 %old;
}
define i32 @__atomic_and_uniform_int32_global_nvptx(i32* %ptr, i32 %val) nounwind alwaysinline
{
%addr = ptrtoint i32* %ptr to i64
%old = tail call i32 asm sideeffect "atom.and.b32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val);
ret i32 %old;
}
define i32 @__atomic_or_uniform_int32_global_nvptx(i32* %ptr, i32 %val) nounwind alwaysinline
{
%addr = ptrtoint i32* %ptr to i64
%old = tail call i32 asm sideeffect "atom.or.b32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val);
ret i32 %old;
}
define i32 @__atomic_xor_uniform_int32_global_nvptx(i32* %ptr, i32 %val) nounwind alwaysinline
{
%addr = ptrtoint i32* %ptr to i64
%old = tail call i32 asm sideeffect "atom.xor.b32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val);
ret i32 %old;
}
define i32 @__atomic_min_uniform_int32_global_nvptx(i32* %ptr, i32 %val) nounwind alwaysinline
{
%addr = ptrtoint i32* %ptr to i64
%old = tail call i32 asm sideeffect "atom.min.s32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val);
ret i32 %old;
}
define i32 @__atomic_max_uniform_int32_global_nvptx(i32* %ptr, i32 %val) nounwind alwaysinline
{
%addr = ptrtoint i32* %ptr to i64
%old = tail call i32 asm sideeffect "atom.max.s32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val);
ret i32 %old;
}
define i32 @__atomic_umin_uniform_uint32_global_nvptx(i32* %ptr, i32 %val) nounwind alwaysinline
{
%addr = ptrtoint i32* %ptr to i64
%old = tail call i32 asm sideeffect "atom.min.u32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val);
ret i32 %old;
}
define i32 @__atomic_umax_uniform_uint32_global_nvptx(i32* %ptr, i32 %val) nounwind alwaysinline
{
%addr = ptrtoint i32* %ptr to i64
%old = tail call i32 asm sideeffect "atom.max.u32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val);
ret i32 %old;
}


define i64 @__atomic_add_uniform_int64_global_nvptx(i64* %ptr, i64 %val) nounwind alwaysinline
{
%addr = ptrtoint i64* %ptr to i64
%old = tail call i64 asm sideeffect "atom.add.u64 $0, [$1], $2;", "=l,l,l"(i64 %addr, i64 %val);
ret i64 %old;
}
define i64 @__atomic_sub_uniform_int64_global_nvptx(i64* %ptr, i64 %val) nounwind alwaysinline
{
%nval = sub i64 0, %val;
%old = tail call i64 @__atomic_add_uniform_int64_global_nvptx(i64* %ptr, i64 %nval);
ret i64 %old;
}
define i64 @__atomic_and_uniform_int64_global_nvptx(i64* %ptr, i64 %val) nounwind alwaysinline
{
%addr = ptrtoint i64* %ptr to i64
%old = tail call i64 asm sideeffect "atom.and.b64 $0, [$1], $2;", "=l,l,l"(i64 %addr, i64 %val);
ret i64 %old;
}
define i64 @__atomic_or_uniform_int64_global_nvptx(i64* %ptr, i64 %val) nounwind alwaysinline
{
%addr = ptrtoint i64* %ptr to i64
%old = tail call i64 asm sideeffect "atom.or.b64 $0, [$1], $2;", "=l,l,l"(i64 %addr, i64 %val);
ret i64 %old;
}
define i64 @__atomic_xor_uniform_int64_global_nvptx(i64* %ptr, i64 %val) nounwind alwaysinline
{
%addr = ptrtoint i64* %ptr to i64
%old = tail call i64 asm sideeffect "atom.xor.b64 $0, [$1], $2;", "=l,l,l"(i64 %addr, i64 %val);
ret i64 %old;
}
define i64 @__atomic_min_uniform_int64_global_nvptx(i64* %ptr, i64 %val) nounwind alwaysinline
{
%addr = ptrtoint i64* %ptr to i64
%old = tail call i64 asm sideeffect "atom.min.s64 $0, [$1], $2;", "=l,l,l"(i64 %addr, i64 %val);
ret i64 %old;
}
define i64 @__atomic_max_uniform_int64_global_nvptx(i64* %ptr, i64 %val) nounwind alwaysinline
{
%addr = ptrtoint i64* %ptr to i64
%old = tail call i64 asm sideeffect "atom.max.s64 $0, [$1], $2;", "=l,l,l"(i64 %addr, i64 %val);
ret i64 %old;
}
define i64 @__atomic_umin_uniform_uint64_global_nvptx(i64* %ptr, i64 %val) nounwind alwaysinline
{
%addr = ptrtoint i64* %ptr to i64
%old = tail call i64 asm sideeffect "atom.min.u64 $0, [$1], $2;", "=l,l,l"(i64 %addr, i64 %val);
ret i64 %old;
}
define i64 @__atomic_umax_uniform_uint64_global_nvptx(i64* %ptr, i64 %val) nounwind alwaysinline
{
%addr = ptrtoint i64* %ptr to i64
%old = tail call i64 asm sideeffect "atom.max.u64 $0, [$1], $2;", "=l,l,l"(i64 %addr, i64 %val);
ret i64 %old;
}

define(`global_atomic_uniform',`
define $3 @__atomic_$2_uniform_$4_global($3 * %ptr, $3 %val) nounwind alwaysinline
{
entry:
%addr = ptrtoint $3 * %ptr to i64
%active = call i32 @__get_first_active_lane();
%lane = call i32 @__laneidx();
%c = icmp eq i32 %lane, %active
br i1 %c, label %p1, label %p2

p1:
%t0 = call $3 @__atomic_$2_uniform_$4_global_nvptx($3 * %ptr, $3 %val);
br label %p2;

p2:
%t1 = phi $3 [%t0, %p1], [zeroinitializer, %entry]
%old = call $3 @__shfl_$3_nvptx($3 %t1, i32 %active)
ret $3 %old;
}
')
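In words: all lanes carry the same uniform operand, one active lane is elected, only that lane issues the hardware atomic, and the returned old value is shuffled back to every lane. A rough single-threaded C++ model of that control flow (illustrative only; the loop over lanes stands in for the gang and 'active' for the execution mask):

    #include <atomic>
    #include <cstdio>

    int main() {
        constexpr int width = 4;
        bool active[width] = {true, false, true, true};  // hypothetical execution mask
        std::atomic<long long> target{10};
        long long uniform_val = 5;

        // Elect a single active lane to perform the atomic.
        int leader = -1;
        for (int i = 0; i < width; ++i)
            if (active[i]) { leader = i; break; }

        long long old = 0;
        if (leader >= 0)
            old = target.fetch_add(uniform_val);  // the one atomic issued by the leader

        // "Broadcast": every active lane sees the same old value.
        for (int i = 0; i < width; ++i)
            if (active[i])
                std::printf("lane %d old value: %lld\n", i, old);
    }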
global_atomic_uniform(1, add, i32, int32)
global_atomic_uniform(1, sub, i32, int32)
global_atomic_uniform(1, and, i32, int32)
global_atomic_uniform(1, or, i32, int32)
global_atomic_uniform(1, xor, i32, int32)
global_atomic_uniform(1, min, i32, int32)
global_atomic_uniform(1, max, i32, int32)
global_atomic_uniform(1, umin, i32, uint32)
global_atomic_uniform(1, umax, i32, uint32)

global_atomic_uniform(1, add, i64, int64)
global_atomic_uniform(1, sub, i64, int64)
global_atomic_uniform(1, and, i64, int64)
global_atomic_uniform(1, or, i64, int64)
global_atomic_uniform(1, xor, i64, int64)
global_atomic_uniform(1, min, i64, int64)
global_atomic_uniform(1, max, i64, int64)
global_atomic_uniform(1, umin, i64, uint64)
global_atomic_uniform(1, umax, i64, uint64)

define(`global_atomic_varying',`
define <1 x $3> @__atomic_$2_varying_$4_global(<1 x i64> %ptr, <1 x $3> %val, <1 x i1> %maskv) nounwind alwaysinline
{
entry:
%addr = bitcast <1 x i64> %ptr to i64
%c = bitcast <1 x i1> %maskv to i1
br i1 %c, label %p1, label %p2

p1:
%sv = bitcast <1 x $3> %val to $3
%sptr = inttoptr i64 %addr to $3*
%t0 = call $3 @__atomic_$2_uniform_$4_global_nvptx($3 * %sptr, $3 %sv);
%t0v = bitcast $3 %t0 to <1 x $3>
ret <1 x $3> %t0v

p2:
ret <1 x $3> %val
}
')
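On this width-1 target the "varying" atomics reduce to the masked single-lane case above: the lane's 64-bit pointer is converted to a scalar address, the nvptx helper is called if the lane is active, and an inactive lane simply gets its input value back (the p2 branch). For a wider gang the per-lane picture looks roughly like this C++ sketch (illustrative only; lanes may alias the same cell):

    #include <atomic>
    #include <cstdio>

    int main() {
        constexpr int width = 4;
        std::atomic<int> cells[2] = {};          // memory reachable through per-lane pointers
        std::atomic<int> *ptr[width] = {&cells[0], &cells[1], &cells[1], &cells[0]};
        int val[width] = {1, 2, 3, 4};
        bool mask[width] = {true, true, false, true};

        int result[width];
        for (int lane = 0; lane < width; ++lane)
            result[lane] = mask[lane] ? ptr[lane]->fetch_add(val[lane])
                                      : val[lane];   // inactive lane: value passes through
        for (int lane = 0; lane < width; ++lane)
            std::printf("lane %d old value: %d\n", lane, result[lane]);
    }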
global_atomic_varying(1, add, i32, int32)
global_atomic_varying(1, sub, i32, int32)
global_atomic_varying(1, and, i32, int32)
global_atomic_varying(1, or, i32, int32)
global_atomic_varying(1, xor, i32, int32)
global_atomic_varying(1, min, i32, int32)
global_atomic_varying(1, max, i32, int32)
global_atomic_varying(1, umin, i32, uint32)
global_atomic_varying(1, umax, i32, uint32)

global_atomic_varying(1, add, i64, int64)
global_atomic_varying(1, sub, i64, int64)
global_atomic_varying(1, and, i64, int64)
global_atomic_varying(1, or, i64, int64)
global_atomic_varying(1, xor, i64, int64)
global_atomic_varying(1, min, i64, int64)
global_atomic_varying(1, max, i64, int64)
global_atomic_varying(1, umin, i64, uint64)
global_atomic_varying(1, umax, i64, uint64)

;; Macro to declare the function that implements the swap atomic.
;; Takes three parameters:
;; $1: vector width of the target
;; $2: llvm type of the vector elements (e.g. i32)
;; $3: ispc type of the elements (e.g. int32)

define(`global_swap', `
declare $2 @__atomic_swap_uniform_$3_global($2* %ptr, $2 %val) nounwind alwaysinline ;
')


;; Similarly, macro to declare the function that implements the compare/exchange
;; atomic. Takes three parameters:
;; $1: vector width of the target
;; $2: llvm type of the vector elements (e.g. i32)
;; $3: ispc type of the elements (e.g. int32)

define(`global_atomic_exchange', `

declare <$1 x $2> @__atomic_compare_exchange_$3_global($2* %ptr, <$1 x $2> %cmp,
<$1 x $2> %val, <$1 x MASK> %mask) nounwind alwaysinline ;

declare $2 @__atomic_compare_exchange_uniform_$3_global($2* %ptr, $2 %cmp,
$2 %val) nounwind alwaysinline ;
')

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; atomics and memory barriers

global_swap(WIDTH, i32, int32)
global_swap(WIDTH, i64, int64)

declare float @__atomic_swap_uniform_float_global(float * %ptr, float %val) nounwind alwaysinline ;
declare double @__atomic_swap_uniform_double_global(double * %ptr, double %val) nounwind alwaysinline ;
global_atomic_exchange(WIDTH, i32, int32)
global_atomic_exchange(WIDTH, i64, int64)

declare <WIDTH x float> @__atomic_compare_exchange_float_global(float * %ptr,
<WIDTH x float> %cmp, <WIDTH x float> %val, <WIDTH x MASK> %mask) nounwind alwaysinline ;
declare <WIDTH x double> @__atomic_compare_exchange_double_global(double * %ptr,
<WIDTH x double> %cmp, <WIDTH x double> %val, <WIDTH x MASK> %mask) nounwind alwaysinline ;
declare float @__atomic_compare_exchange_uniform_float_global(float * %ptr, float %cmp,
float %val) nounwind alwaysinline ;
declare double @__atomic_compare_exchange_uniform_double_global(double * %ptr, double %cmp,
double %val) nounwind alwaysinline ;

declare void @llvm.nvvm.membar.gl()
declare void @llvm.nvvm.membar.sys()
declare void @llvm.nvvm.membar.cta()

define void @__memory_barrier() nounwind readnone alwaysinline {
;; see http://llvm.org/bugs/show_bug.cgi?id=2829. It seems like we
;; only get an MFENCE on x86 if "device" is true, but IMHO we should
;; in the case where the first 4 args are true but it is false.
;; So we just always set that to true...
call void @llvm.nvvm.membar.gl()
ret void
}

@@ -768,27 +768,6 @@ shuffles(double, 8)
shuffles(i64, 8)
')

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; global_atomic_associative
;; More efficient implementation for atomics that are associative (e.g.,
;; add, and, ...). If a basic implementation would do something like:
;; result0 = atomic_op(ptr, val0)
;; result1 = atomic_op(ptr, val1)
;; ..
;; Then instead we can do:
;; tmp = (val0 op val1 op ...)
;; result0 = atomic_op(ptr, tmp)
;; result1 = (result0 op val0)
;; ..
;; and compute the same result more efficiently.
;;
;; Takes five parameters:
;; $1: vector width of the target
;; $2: operation being performed (w.r.t. LLVM atomic intrinsic names)
;; (add, sub...)
;; $3: return type of the LLVM atomic (e.g. i32)
;; $4: return type of the LLVM atomic type, in ispc naming parlance (e.g. int32)
;; $5: identity value for the operator (e.g. 0 for add, -1 for AND, ...)

define(`mask_converts', `
define internal <$1 x i8> @convertmask_i1_i8_$1(<$1 x i1>) {
@@ -875,54 +854,6 @@ define internal <$1 x i64> @convertmask_i64_i64_$1(<$1 x i64>) {

mask_converts(WIDTH)

define(`global_atomic_associative', `

declare <$1 x $3> @__atomic_$2_$4_global($3 * %ptr, <$1 x $3> %val,
<$1 x MASK> %m) nounwind alwaysinline ;
')

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; global_atomic_uniform
;; Defines the implementation of a function that handles the mapping from
;; an ispc atomic function to the underlying LLVM intrinsics. This variant
;; just calls the atomic once, for the given uniform value.
;;
;; Takes four parameters:
;; $1: vector width of the target
;; $2: operation being performed (w.r.t. LLVM atomic intrinsic names)
;; (add, sub...)
;; $3: return type of the LLVM atomic (e.g. i32)
;; $4: return type of the LLVM atomic type, in ispc naming parlance (e.g. int32)

define(`global_atomic_uniform', `
declare $3 @__atomic_$2_uniform_$4_global($3 * %ptr, $3 %val) nounwind alwaysinline ;
')

;; Macro to declare the function that implements the swap atomic.
;; Takes three parameters:
;; $1: vector width of the target
;; $2: llvm type of the vector elements (e.g. i32)
;; $3: ispc type of the elements (e.g. int32)

define(`global_swap', `
declare $2 @__atomic_swap_uniform_$3_global($2* %ptr, $2 %val) nounwind alwaysinline ;
')


;; Similarly, macro to declare the function that implements the compare/exchange
;; atomic. Takes three parameters:
;; $1: vector width of the target
;; $2: llvm type of the vector elements (e.g. i32)
;; $3: ispc type of the elements (e.g. int32)

define(`global_atomic_exchange', `

declare <$1 x $2> @__atomic_compare_exchange_$3_global($2* %ptr, <$1 x $2> %cmp,
<$1 x $2> %val, <$1 x MASK> %mask) nounwind alwaysinline ;

declare $2 @__atomic_compare_exchange_uniform_$3_global($2* %ptr, $2 %cmp,
$2 %val) nounwind alwaysinline ;
')

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; count trailing zeros
@@ -2507,67 +2438,6 @@ define double @__stdlib_pow(double, double) nounwind readnone alwaysinline {
ret double %r
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; atomics and memory barriers

declare void @llvm.memory.barrier(i1 %loadload, i1 %loadstore, i1 %storeload,
i1 %storestore, i1 %device)

define void @__memory_barrier() nounwind readnone alwaysinline {
;; see http://llvm.org/bugs/show_bug.cgi?id=2829. It seems like we
;; only get an MFENCE on x86 if "device" is true, but IMHO we should
;; in the case where the first 4 args are true but it is false.
;; So we just always set that to true...
call void @llvm.memory.barrier(i1 true, i1 true, i1 true, i1 true, i1 true)
ret void
}

global_atomic_associative(WIDTH, add, i32, int32, 0)
global_atomic_associative(WIDTH, sub, i32, int32, 0)
global_atomic_associative(WIDTH, and, i32, int32, -1)
global_atomic_associative(WIDTH, or, i32, int32, 0)
global_atomic_associative(WIDTH, xor, i32, int32, 0)
global_atomic_uniform(WIDTH, add, i32, int32)
global_atomic_uniform(WIDTH, sub, i32, int32)
global_atomic_uniform(WIDTH, and, i32, int32)
global_atomic_uniform(WIDTH, or, i32, int32)
global_atomic_uniform(WIDTH, xor, i32, int32)
global_atomic_uniform(WIDTH, min, i32, int32)
global_atomic_uniform(WIDTH, max, i32, int32)
global_atomic_uniform(WIDTH, umin, i32, uint32)
global_atomic_uniform(WIDTH, umax, i32, uint32)

global_atomic_associative(WIDTH, add, i64, int64, 0)
global_atomic_associative(WIDTH, sub, i64, int64, 0)
global_atomic_associative(WIDTH, and, i64, int64, -1)
global_atomic_associative(WIDTH, or, i64, int64, 0)
global_atomic_associative(WIDTH, xor, i64, int64, 0)
global_atomic_uniform(WIDTH, add, i64, int64)
global_atomic_uniform(WIDTH, sub, i64, int64)
global_atomic_uniform(WIDTH, and, i64, int64)
global_atomic_uniform(WIDTH, or, i64, int64)
global_atomic_uniform(WIDTH, xor, i64, int64)
global_atomic_uniform(WIDTH, min, i64, int64)
global_atomic_uniform(WIDTH, max, i64, int64)
global_atomic_uniform(WIDTH, umin, i64, uint64)
global_atomic_uniform(WIDTH, umax, i64, uint64)

global_swap(WIDTH, i32, int32)
global_swap(WIDTH, i64, int64)

declare float @__atomic_swap_uniform_float_global(float * %ptr, float %val) nounwind alwaysinline ;
declare double @__atomic_swap_uniform_double_global(double * %ptr, double %val) nounwind alwaysinline ;
global_atomic_exchange(WIDTH, i32, int32)
global_atomic_exchange(WIDTH, i64, int64)

declare <WIDTH x float> @__atomic_compare_exchange_float_global(float * %ptr,
<WIDTH x float> %cmp, <WIDTH x float> %val, <WIDTH x MASK> %mask) nounwind alwaysinline ;
declare <WIDTH x double> @__atomic_compare_exchange_double_global(double * %ptr,
<WIDTH x double> %cmp, <WIDTH x double> %val, <WIDTH x MASK> %mask) nounwind alwaysinline ;
declare float @__atomic_compare_exchange_uniform_float_global(float * %ptr, float %cmp,
float %val) nounwind alwaysinline ;
declare double @__atomic_compare_exchange_uniform_double_global(double * %ptr, double %cmp,
double %val) nounwind alwaysinline ;

')

stdlib.ispc
@@ -1814,7 +1814,7 @@ static inline void memory_barrier() {
__memory_barrier();
}

#define DEFINE_ATOMIC_OP(TA,TB,OPA,OPB,MASKTYPE) \
#define DEFINE_ATOMIC_OP(TA,TB,OPA,OPB,MASKTYPE,TC) \
static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
TA ret = __atomic_##OPB##_##TB##_global(ptr, value, (MASKTYPE)__mask); \
return ret; \
@@ -1825,6 +1825,10 @@ static inline uniform TA atomic_##OPA##_global(uniform TA * uniform ptr, \
return ret; \
} \
static inline TA atomic_##OPA##_global(uniform TA * varying ptr, TA value) { \
if (__is_nvptx_target) { \
TA ret = __atomic_##OPB##_varying_##TB##_global((TC)ptr, value, (MASKTYPE)__mask); \
return ret; \
} else { \
uniform TA * uniform ptrArray[programCount]; \
ptrArray[programIndex] = ptr; \
TA ret; \
@@ -1835,6 +1839,7 @@ static inline TA atomic_##OPA##_global(uniform TA * varying ptr, TA value) { \
ret = insert(ret, i, r); \
} \
return ret; \
} \
} \

#define DEFINE_ATOMIC_SWAP(TA,TB) \
@@ -1888,7 +1893,7 @@ static inline TA atomic_swap_global(uniform TA * varying ptr, TA value) { \
return ret; \
} \

#define DEFINE_ATOMIC_MINMAX_OP(TA,TB,OPA,OPB) \
#define DEFINE_ATOMIC_MINMAX_OP(TA,TB,OPA,OPB,MASKTYPE,TC) \
static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
uniform TA oneval = reduce_##OPA(value); \
TA ret; \
@@ -1903,6 +1908,10 @@ static inline uniform TA atomic_##OPA##_global(uniform TA * uniform ptr, \
} \
static inline TA atomic_##OPA##_global(uniform TA * varying ptr, \
TA value) { \
if (__is_nvptx_target) { \
TA ret = __atomic_##OPB##_varying_##TB##_global((TC)ptr, value, (MASKTYPE)__mask); \
return ret; \
} else { \
uniform TA * uniform ptrArray[programCount]; \
ptrArray[programIndex] = ptr; \
TA ret; \
@@ -1913,48 +1922,49 @@ static inline TA atomic_##OPA##_global(uniform TA * varying ptr, \
ret = insert(ret, i, r); \
} \
return ret; \
} \
}
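For min/max the gang first collapses its values with reduce_min/reduce_max (the 'oneval' above), so a single atomic suffices for the whole gang. A plain C++ sketch of that shape (illustrative only; std::atomic has no fetch_min, so a compare-exchange loop stands in for the hardware atomic):

    #include <algorithm>
    #include <atomic>
    #include <cstdio>

    // Atomic min that returns the previously stored value.
    static int atomic_fetch_min(std::atomic<int> &obj, int candidate) {
        int cur = obj.load();
        while (candidate < cur && !obj.compare_exchange_weak(cur, candidate)) {
            // on failure 'cur' is reloaded; retry until stored or no longer smaller
        }
        return cur;
    }

    int main() {
        std::atomic<int> shared{50};
        int lane_vals[4] = {70, 42, 99, 63};
        int oneval = *std::min_element(lane_vals, lane_vals + 4);  // reduce_min over the gang
        int old = atomic_fetch_min(shared, oneval);                // one atomic for all lanes
        std::printf("old=%d, now=%d\n", old, shared.load());
    }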
DEFINE_ATOMIC_OP(int32,int32,add,add,IntMaskType)
DEFINE_ATOMIC_OP(int32,int32,subtract,sub,IntMaskType)
DEFINE_ATOMIC_MINMAX_OP(int32,int32,min,min)
DEFINE_ATOMIC_MINMAX_OP(int32,int32,max,max)
DEFINE_ATOMIC_OP(int32,int32,and,and,IntMaskType)
DEFINE_ATOMIC_OP(int32,int32,or,or,IntMaskType)
DEFINE_ATOMIC_OP(int32,int32,xor,xor,IntMaskType)
DEFINE_ATOMIC_OP(int32,int32,add,add,IntMaskType,int64)
DEFINE_ATOMIC_OP(int32,int32,subtract,sub,IntMaskType,int64)
DEFINE_ATOMIC_MINMAX_OP(int32,int32,min,min,IntMaskType,int64)
DEFINE_ATOMIC_MINMAX_OP(int32,int32,max,max,IntMaskType,int64)
DEFINE_ATOMIC_OP(int32,int32,and,and,IntMaskType,int64)
DEFINE_ATOMIC_OP(int32,int32,or,or,IntMaskType,int64)
DEFINE_ATOMIC_OP(int32,int32,xor,xor,IntMaskType,int64)
DEFINE_ATOMIC_SWAP(int32,int32)

// For everything but atomic min and max, we can use the same
// implementations for unsigned as for signed.
DEFINE_ATOMIC_OP(unsigned int32,int32,add,add,UIntMaskType)
DEFINE_ATOMIC_OP(unsigned int32,int32,subtract,sub,UIntMaskType)
DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,min,umin)
DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,max,umax)
DEFINE_ATOMIC_OP(unsigned int32,int32,and,and,UIntMaskType)
DEFINE_ATOMIC_OP(unsigned int32,int32,or,or,UIntMaskType)
DEFINE_ATOMIC_OP(unsigned int32,int32,xor,xor,UIntMaskType)
DEFINE_ATOMIC_OP(unsigned int32,int32,add,add,UIntMaskType, unsigned int64)
DEFINE_ATOMIC_OP(unsigned int32,int32,subtract,sub,UIntMaskType, unsigned int64)
DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,min,umin,UIntMaskType,unsigned int64)
DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,max,umax,UIntMaskType,unsigned int64)
DEFINE_ATOMIC_OP(unsigned int32,int32,and,and,UIntMaskType, unsigned int64)
DEFINE_ATOMIC_OP(unsigned int32,int32,or,or,UIntMaskType, unsigned int64)
DEFINE_ATOMIC_OP(unsigned int32,int32,xor,xor,UIntMaskType, unsigned int64)
DEFINE_ATOMIC_SWAP(unsigned int32,int32)

DEFINE_ATOMIC_SWAP(float,float)

DEFINE_ATOMIC_OP(int64,int64,add,add,IntMaskType)
DEFINE_ATOMIC_OP(int64,int64,subtract,sub,IntMaskType)
DEFINE_ATOMIC_MINMAX_OP(int64,int64,min,min)
DEFINE_ATOMIC_MINMAX_OP(int64,int64,max,max)
DEFINE_ATOMIC_OP(int64,int64,and,and,IntMaskType)
DEFINE_ATOMIC_OP(int64,int64,or,or,IntMaskType)
DEFINE_ATOMIC_OP(int64,int64,xor,xor,IntMaskType)
DEFINE_ATOMIC_OP(int64,int64,add,add,IntMaskType,int64)
DEFINE_ATOMIC_OP(int64,int64,subtract,sub,IntMaskType,int64)
DEFINE_ATOMIC_MINMAX_OP(int64,int64,min,min,IntMaskType,int64)
DEFINE_ATOMIC_MINMAX_OP(int64,int64,max,max,IntMaskType,int64)
DEFINE_ATOMIC_OP(int64,int64,and,and,IntMaskType,int64)
DEFINE_ATOMIC_OP(int64,int64,or,or,IntMaskType,int64)
DEFINE_ATOMIC_OP(int64,int64,xor,xor,IntMaskType,int64)
DEFINE_ATOMIC_SWAP(int64,int64)

// For everything but atomic min and max, we can use the same
// implementations for unsigned as for signed.
DEFINE_ATOMIC_OP(unsigned int64,int64,add,add,UIntMaskType)
DEFINE_ATOMIC_OP(unsigned int64,int64,subtract,sub,UIntMaskType)
DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,min,umin)
DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,max,umax)
DEFINE_ATOMIC_OP(unsigned int64,int64,and,and,UIntMaskType)
DEFINE_ATOMIC_OP(unsigned int64,int64,or,or,UIntMaskType)
DEFINE_ATOMIC_OP(unsigned int64,int64,xor,xor,UIntMaskType)
DEFINE_ATOMIC_OP(unsigned int64,int64,add,add,UIntMaskType,unsigned int64)
DEFINE_ATOMIC_OP(unsigned int64,int64,subtract,sub,UIntMaskType,unsigned int64)
DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,min,umin,UIntMaskType,unsigned int64)
DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,max,umax,UIntMaskType,unsigned int64)
DEFINE_ATOMIC_OP(unsigned int64,int64,and,and,UIntMaskType,unsigned int64)
DEFINE_ATOMIC_OP(unsigned int64,int64,or,or,UIntMaskType,unsigned int64)
DEFINE_ATOMIC_OP(unsigned int64,int64,xor,xor,UIntMaskType,unsigned int64)
DEFINE_ATOMIC_SWAP(unsigned int64,int64)

DEFINE_ATOMIC_SWAP(double,double)