Implement global atomics and a memory barrier in the standard library.

This checkin provides the standard set of atomic operations and a memory barrier in the ispc standard library.  Both signed and unsigned 32- and 64-bit integer types are supported.
This commit is contained in:
Matt Pharr
2011-07-04 17:20:42 +01:00
parent 24f47b300d
commit 5bcc611409
13 changed files with 364 additions and 9 deletions

131
stdlib.m4
View File

@@ -405,6 +405,95 @@ forloop(i, 1, eval($1-1), `
}
')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; global_atomic
;; Defines the implementation of a function that handles the mapping from
;; an ispc atomic function to the underlying LLVM intrinsics. Specifically,
;; the function handles loooping over the active lanes, calling the underlying
;; scalar atomic intrinsic for each one, and assembling the vector result.
;;
;; Takes four parameters:
;; $1: vector width of the target
;; $2: operation being performed (w.r.t. LLVM atomic intrinsic names)
;; (add, sub...)
;; $3: return type of the LLVM atomic (e.g. i32)
;; $4: return type of the LLVM atomic type, in ispc naming paralance (e.g. int32)
define(`global_atomic', `
declare $3 @llvm.atomic.load.$2.$3.p0$3($3 * %ptr, $3 %delta)
define internal <$1 x $3> @__atomic_$2_$4_global($3 * %ptr, <$1 x $3> %val,
<$1 x i32> %mask) nounwind alwaysinline {
%rptr = alloca <$1 x $3>
%rptr32 = bitcast <$1 x $3> * %rptr to $3 *
per_lane($1, <$1 x i32> %mask, `
%v_LANE_ID = extractelement <$1 x $3> %val, i32 LANE
%r_LANE_ID = call $3 @llvm.atomic.load.$2.$3.p0$3($3 * %ptr, $3 %v_LANE_ID)
%rp_LANE_ID = getelementptr $3 * %rptr32, i32 LANE
store $3 %r_LANE_ID, $3 * %rp_LANE_ID')
%r = load <$1 x $3> * %rptr
ret <$1 x $3> %r
}
')
;; Macro to declare the function that implements the swap atomic.
;; Takes three parameters:
;; $1: vector width of the target
;; $2: llvm type of the vector elements (e.g. i32)
;; $3: ispc type of the elements (e.g. int32)
define(`global_swap', `
declare $2 @llvm.atomic.swap.$2.p0$2($2 * %ptr, $2 %val)
define <$1 x $2> @__atomic_swap_$3_global($2* %ptr, <$1 x $2> %val,
<$1 x i32> %mask) nounwind alwaysinline {
%rptr = alloca <$1 x $2>
%rptr32 = bitcast <$1 x $2> * %rptr to $2 *
per_lane($1, <$1 x i32> %mask, `
%val_LANE_ID = extractelement <$1 x $2> %val, i32 LANE
%r_LANE_ID = call $2 @llvm.atomic.swap.$2.p0$2($2 * %ptr, $2 %val_LANE_ID)
%rp_LANE_ID = getelementptr $2 * %rptr32, i32 LANE
store $2 %r_LANE_ID, $2 * %rp_LANE_ID')
%r = load <$1 x $2> * %rptr
ret <$1 x $2> %r
}
')
;; Similarly, macro to declare the function that implements the compare/exchange
;; atomic. Takes three parameters:
;; $1: vector width of the target
;; $2: llvm type of the vector elements (e.g. i32)
;; $3: ispc type of the elements (e.g. int32)
define(`global_atomic_exchange', `
declare $2 @llvm.atomic.cmp.swap.$2.p0$2($2 * %ptr, $2 %cmp, $2 %val)
define <$1 x $2> @__atomic_compare_exchange_$3_global($2* %ptr, <$1 x $2> %cmp,
<$1 x $2> %val, <$1 x i32> %mask) nounwind alwaysinline {
%rptr = alloca <$1 x $2>
%rptr32 = bitcast <$1 x $2> * %rptr to $2 *
per_lane($1, <$1 x i32> %mask, `
%cmp_LANE_ID = extractelement <$1 x $2> %cmp, i32 LANE
%val_LANE_ID = extractelement <$1 x $2> %val, i32 LANE
%r_LANE_ID = call $2 @llvm.atomic.cmp.swap.$2.p0$2($2 * %ptr, $2 %cmp_LANE_ID,
$2 %val_LANE_ID)
%rp_LANE_ID = getelementptr $2 * %rptr32, i32 LANE
store $2 %r_LANE_ID, $2 * %rp_LANE_ID')
%r = load <$1 x $2> * %rptr
ret <$1 x $2> %r
}
')
define(`stdlib_core', `
@@ -543,6 +632,48 @@ define internal float @__stdlib_pow(float, float) nounwind readnone alwaysinline
%r = call float @powf(float %0, float %1)
ret float %r
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; atomics and memory barriers
declare void @llvm.memory.barrier(i1 %loadload, i1 %loadstore, i1 %storeload,
i1 %storestore, i1 %device)
define internal void @__memory_barrier() nounwind readnone alwaysinline {
;; see http://llvm.org/bugs/show_bug.cgi?id=2829. It seems like we
;; only get an MFENCE on x86 if "device" is true, but IMHO we should
;; in the case where the first 4 args are true but it is false.
;; So we just always set that to true...
call void @llvm.memory.barrier(i1 true, i1 true, i1 true, i1 true, i1 true)
ret void
}
global_atomic($1, add, i32, int32)
global_atomic($1, sub, i32, int32)
global_atomic($1, and, i32, int32)
global_atomic($1, or, i32, int32)
global_atomic($1, xor, i32, int32)
global_atomic($1, min, i32, int32)
global_atomic($1, max, i32, int32)
global_atomic($1, umin, i32, uint32)
global_atomic($1, umax, i32, uint32)
global_atomic($1, add, i64, int64)
global_atomic($1, sub, i64, int64)
global_atomic($1, and, i64, int64)
global_atomic($1, or, i64, int64)
global_atomic($1, xor, i64, int64)
global_atomic($1, min, i64, int64)
global_atomic($1, max, i64, int64)
global_atomic($1, umin, i64, uint64)
global_atomic($1, umax, i64, uint64)
global_swap($1, i32, int32)
global_swap($1, i64, int64)
global_atomic_exchange($1, i32, int32)
global_atomic_exchange($1, i64, int64)
')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;