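Update the AVX implementations of the masked load/store builtins to use the AVX masked load/store intrinsics (llvm.x86.avx.maskload.* and llvm.x86.avx.maskstore.*) in place of the previous branch-and-full-vector-load and per-lane scalar store code. (As always, not yet tested, pending fuller LLVM AVX support.)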
This commit is contained in:
@@ -412,6 +412,7 @@ define <8 x i32> @__load_and_broadcast_32(i8 *, <8 x i32> %mask) nounwind always
|
|||||||
br i1 %any_on, label %load, label %skip
|
br i1 %any_on, label %load, label %skip
|
||||||
|
|
||||||
load:
|
load:
|
||||||
|
; TODO: make sure this becomes a vbroadcast...
|
||||||
%ptr = bitcast i8 * %0 to i32 *
|
%ptr = bitcast i8 * %0 to i32 *
|
||||||
%val = load i32 * %ptr
|
%val = load i32 * %ptr
|
||||||
|
|
||||||
@@ -436,6 +437,7 @@ define <8 x i64> @__load_and_broadcast_64(i8 *, <8 x i32> %mask) nounwind always
|
|||||||
br i1 %any_on, label %load, label %skip
|
br i1 %any_on, label %load, label %skip
|
||||||
|
|
||||||
load:
|
load:
|
||||||
|
; TODO: make sure this becomes a vbroadcast...
|
||||||
%ptr = bitcast i8 * %0 to i64 *
|
%ptr = bitcast i8 * %0 to i64 *
|
||||||
%val = load i64 * %ptr
|
%val = load i64 * %ptr
|
||||||
|
|
||||||
@@ -454,55 +456,74 @@ skip:
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
declare <8 x float> @llvm.x86.avx.maskload.ps.256(i8 *, <8 x float> %mask)
|
||||||
|
declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8 *, <4 x double> %mask)
|
||||||
|
|
||||||
define <8 x i32> @__load_masked_32(i8 *, <8 x i32> %mask) nounwind alwaysinline {
|
define <8 x i32> @__load_masked_32(i8 *, <8 x i32> %mask) nounwind alwaysinline {
|
||||||
%mm = call i32 @__movmsk(<8 x i32> %mask)
|
%floatmask = bitcast <8 x i32> %mask to <8 x float>
|
||||||
%any_on = icmp ne i32 %mm, 0
|
%floatval = call <8 x float> @llvm.x86.avx.maskload.ps.256(i8 * %0, <8 x float> %floatmask)
|
||||||
br i1 %any_on, label %load, label %skip
|
%retval = bitcast <8 x float> %floatval to <8 x i32>
|
||||||
|
ret <8 x i32> %retval
|
||||||
load:
|
|
||||||
%ptr = bitcast i8 * %0 to <8 x i32> *
|
|
||||||
%val = load <8 x i32> * %ptr, align 4
|
|
||||||
ret <8 x i32> %val
|
|
||||||
|
|
||||||
skip:
|
|
||||||
ret <8 x i32> undef
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
define <8 x i64> @__load_masked_64(i8 *, <8 x i32> %mask) nounwind alwaysinline {
|
define <8 x i64> @__load_masked_64(i8 *, <8 x i32> %mask) nounwind alwaysinline {
|
||||||
%mm = call i32 @__movmsk(<8 x i32> %mask)
|
; duplicate each 32-bit mask lane so it fills a 64-bit lane, then bitcast the two halves to <4 x double>
|
||||||
%any_on = icmp ne i32 %mm, 0
|
%mask0 = shufflevector <8 x i32> %mask, <8 x i32> undef,
|
||||||
br i1 %any_on, label %load, label %skip
|
<8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
|
||||||
|
%mask1 = shufflevector <8 x i32> %mask, <8 x i32> undef,
|
||||||
|
<8 x i32> <i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
|
||||||
|
%mask0d = bitcast <8 x i32> %mask0 to <4 x double>
|
||||||
|
%mask1d = bitcast <8 x i32> %mask1 to <4 x double>
|
||||||
|
|
||||||
load:
|
%val0d = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %0, <4 x double> %mask0d)
|
||||||
%ptr = bitcast i8 * %0 to <8 x i64> *
|
%ptr1 = getelementptr i8 * %0, i32 32
|
||||||
%val = load <8 x i64> * %ptr, align 8
|
%val1d = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %ptr1, <4 x double> %mask1d)
|
||||||
|
|
||||||
|
%vald = shufflevector <4 x double> %val0d, <4 x double> %val1d,
|
||||||
|
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||||
|
%val = bitcast <8 x double> %vald to <8 x i64>
|
||||||
ret <8 x i64> %val
|
ret <8 x i64> %val
|
||||||
|
|
||||||
skip:
|
|
||||||
ret <8 x i64> undef
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
;; masked store
|
;; masked store
|
||||||
|
|
||||||
|
; note: the maskstore intrinsics take (pointer, mask, value) -- the mask is the 2nd parameter, not the 3rd!
|
||||||
|
declare void @llvm.x86.avx.maskstore.ps.256(i8 *, <8 x float>, <8 x float>)
|
||||||
|
declare void @llvm.x86.avx.maskstore.pd.256(i8 *, <4 x double>, <4 x double>)
|
||||||
|
|
||||||
define void @__masked_store_32(<8 x i32>* nocapture, <8 x i32>,
|
define void @__masked_store_32(<8 x i32>* nocapture, <8 x i32>,
|
||||||
<8 x i32>) nounwind alwaysinline {
|
<8 x i32>) nounwind alwaysinline {
|
||||||
per_lane(8, <8 x i32> %2, `
|
%ptr = bitcast <8 x i32> * %0 to i8 *
|
||||||
; compute address for this one
|
%val = bitcast <8 x i32> %1 to <8 x float>
|
||||||
%ptr_ID = getelementptr <8 x i32> * %0, i32 0, i32 LANE
|
%mask = bitcast <8 x i32> %2 to <8 x float>
|
||||||
%storeval_ID = extractelement <8 x i32> %1, i32 LANE
|
call void @llvm.x86.avx.maskstore.ps.256(i8 * %ptr, <8 x float> %mask, <8 x float> %val)
|
||||||
store i32 %storeval_ID, i32 * %ptr_ID')
|
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
define void @__masked_store_64(<8 x i64>* nocapture, <8 x i64>,
|
define void @__masked_store_64(<8 x i64>* nocapture, <8 x i64>,
|
||||||
<8 x i32>) nounwind alwaysinline {
|
<8 x i32> %mask) nounwind alwaysinline {
|
||||||
per_lane(8, <8 x i32> %2, `
|
%ptr = bitcast <8 x i64> * %0 to i8 *
|
||||||
%ptr_ID = getelementptr <8 x i64> * %0, i32 0, i32 LANE
|
%val = bitcast <8 x i64> %1 to <8 x double>
|
||||||
%storeval_ID = extractelement <8 x i64> %1, i32 LANE
|
|
||||||
store i64 %storeval_ID, i64 * %ptr_ID')
|
%mask0 = shufflevector <8 x i32> %mask, <8 x i32> undef,
|
||||||
|
<8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
|
||||||
|
%mask1 = shufflevector <8 x i32> %mask, <8 x i32> undef,
|
||||||
|
<8 x i32> <i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
|
||||||
|
|
||||||
|
%mask0d = bitcast <8 x i32> %mask0 to <4 x double>
|
||||||
|
%mask1d = bitcast <8 x i32> %mask1 to <4 x double>
|
||||||
|
|
||||||
|
%val0 = shufflevector <8 x double> %val, <8 x double> undef,
|
||||||
|
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||||
|
%val1 = shufflevector <8 x double> %val, <8 x double> undef,
|
||||||
|
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
|
||||||
|
|
||||||
|
call void @llvm.x86.avx.maskstore.pd.256(i8 * %ptr, <4 x double> %mask0d, <4 x double> %val0)
|
||||||
|
%ptr1 = getelementptr i8 * %ptr, i32 32
|
||||||
|
call void @llvm.x86.avx.maskstore.pd.256(i8 * %ptr1, <4 x double> %mask1d, <4 x double> %val1)
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user