Improve naming of masked load/store instructions in builtins.
These now use _i8/_i16/_i32/_i64 suffixes rather than _8/_16/_32/_64. Also cleaned up the m4 macros that generate these functions, using WIDTH to get the target vector width rather than passing it explicitly.
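As a quick sketch of the new convention (illustrative only; the real macro definitions are in builtins/util.m4 below), the m4 helpers now take just the element type, which doubles as the function-name suffix, plus an alignment where needed, and pick up the vector width from WIDTH:

    masked_load(i32, 4)       ;; was: masked_load(8, i32, 32, 4)
    gen_masked_store(i64)     ;; was: gen_masked_store(8, i64, 64)

so the generated and declared builtins end up with names like:

    declare <WIDTH x i32> @__masked_load_i32(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
    declare void @__masked_store_i64(<WIDTH x i64>* nocapture, <WIDTH x i64>, <WIDTH x i1> %mask) nounwind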
@@ -359,13 +359,13 @@ load_and_broadcast(i32)
 load_and_broadcast(i64)

 ; no masked load instruction for i8 and i16 types??
-masked_load(16, i8, 8, 1)
-masked_load(16, i16, 16, 2)
+masked_load(i8, 1)
+masked_load(i16, 2)

 declare <8 x float> @llvm.x86.avx.maskload.ps.256(i8 *, <8 x float> %mask)
 declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8 *, <4 x double> %mask)

-define <16 x i32> @__masked_load_32(i8 *, <16 x i32> %mask) nounwind alwaysinline {
+define <16 x i32> @__masked_load_i32(i8 *, <16 x i32> %mask) nounwind alwaysinline {
 %floatmask = bitcast <16 x i32> %mask to <16 x float>
 %mask0 = shufflevector <16 x float> %floatmask, <16 x float> undef,
 <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -383,7 +383,7 @@ define <16 x i32> @__masked_load_32(i8 *, <16 x i32> %mask) nounwind alwaysinlin
 }


-define <16 x i64> @__masked_load_64(i8 *, <16 x i32> %mask) nounwind alwaysinline {
+define <16 x i64> @__masked_load_i64(i8 *, <16 x i32> %mask) nounwind alwaysinline {
 ; double up masks, bitcast to doubles
 %mask0 = shufflevector <16 x i32> %mask, <16 x i32> undef,
 <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
@@ -424,15 +424,15 @@ define <16 x i64> @__masked_load_64(i8 *, <16 x i32> %mask) nounwind alwaysinlin
 ; FIXME: there is no AVX instruction for these, but we could be clever
 ; by packing the bits down and setting the last 3/4 or half, respectively,
 ; of the mask to zero... Not sure if this would be a win in the end
-gen_masked_store(16, i8, 8)
-gen_masked_store(16, i16, 16)
+gen_masked_store(i8)
+gen_masked_store(i16)

 ; note that mask is the 2nd parameter, not the 3rd one!!
 declare void @llvm.x86.avx.maskstore.ps.256(i8 *, <8 x float>, <8 x float>)
 declare void @llvm.x86.avx.maskstore.pd.256(i8 *, <4 x double>, <4 x double>)

-define void @__masked_store_32(<16 x i32>* nocapture, <16 x i32>,
+define void @__masked_store_i32(<16 x i32>* nocapture, <16 x i32>,
 <16 x i32>) nounwind alwaysinline {
 %ptr = bitcast <16 x i32> * %0 to i8 *
 %val = bitcast <16 x i32> %1 to <16 x float>
 %mask = bitcast <16 x i32> %2 to <16 x float>
@@ -454,8 +454,8 @@ define void @__masked_store_32(<16 x i32>* nocapture, <16 x i32>,
 ret void
 }

-define void @__masked_store_64(<16 x i64>* nocapture, <16 x i64>,
+define void @__masked_store_i64(<16 x i64>* nocapture, <16 x i64>,
 <16 x i32> %mask) nounwind alwaysinline {
 %ptr = bitcast <16 x i64> * %0 to i8 *
 %val = bitcast <16 x i64> %1 to <16 x double>

@@ -499,8 +499,8 @@ masked_store_blend_8_16_by_16()
 declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>,
 <8 x float>) nounwind readnone

-define void @__masked_store_blend_32(<16 x i32>* nocapture, <16 x i32>,
+define void @__masked_store_blend_i32(<16 x i32>* nocapture, <16 x i32>,
 <16 x i32>) nounwind alwaysinline {
 %maskAsFloat = bitcast <16 x i32> %2 to <16 x float>
 %oldValue = load <16 x i32>* %0, align 4
 %oldAsFloat = bitcast <16 x i32> %oldValue to <16 x float>
@@ -537,8 +537,8 @@ define void @__masked_store_blend_32(<16 x i32>* nocapture, <16 x i32>,
 declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>,
 <4 x double>) nounwind readnone

-define void @__masked_store_blend_64(<16 x i64>* nocapture %ptr, <16 x i64> %newi64,
+define void @__masked_store_blend_i64(<16 x i64>* nocapture %ptr, <16 x i64> %newi64,
 <16 x i32> %mask) nounwind alwaysinline {
 %oldValue = load <16 x i64>* %ptr, align 8
 %old = bitcast <16 x i64> %oldValue to <16 x double>
 %old0d = shufflevector <16 x double> %old, <16 x double> undef,
@@ -340,13 +340,13 @@ load_and_broadcast(i32)
 load_and_broadcast(i64)

 ; no masked load instruction for i8 and i16 types??
-masked_load(8, i8, 8, 1)
-masked_load(8, i16, 16, 2)
+masked_load(i8, 1)
+masked_load(i16, 2)

 declare <8 x float> @llvm.x86.avx.maskload.ps.256(i8 *, <8 x float> %mask)
 declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8 *, <4 x double> %mask)

-define <8 x i32> @__masked_load_32(i8 *, <8 x i32> %mask) nounwind alwaysinline {
+define <8 x i32> @__masked_load_i32(i8 *, <8 x i32> %mask) nounwind alwaysinline {
 %floatmask = bitcast <8 x i32> %mask to <8 x float>
 %floatval = call <8 x float> @llvm.x86.avx.maskload.ps.256(i8 * %0, <8 x float> %floatmask)
 %retval = bitcast <8 x float> %floatval to <8 x i32>
@@ -354,7 +354,7 @@ define <8 x i32> @__masked_load_32(i8 *, <8 x i32> %mask) nounwind alwaysinline
 }


-define <8 x i64> @__masked_load_64(i8 *, <8 x i32> %mask) nounwind alwaysinline {
+define <8 x i64> @__masked_load_i64(i8 *, <8 x i32> %mask) nounwind alwaysinline {
 ; double up masks, bitcast to doubles
 %mask0 = shufflevector <8 x i32> %mask, <8 x i32> undef,
 <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
@@ -377,15 +377,15 @@ define <8 x i64> @__masked_load_64(i8 *, <8 x i32> %mask) nounwind alwaysinline
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; masked store

-gen_masked_store(8, i8, 8)
-gen_masked_store(8, i16, 16)
+gen_masked_store(i8)
+gen_masked_store(i16)

 ; note that mask is the 2nd parameter, not the 3rd one!!
 declare void @llvm.x86.avx.maskstore.ps.256(i8 *, <8 x float>, <8 x float>)
 declare void @llvm.x86.avx.maskstore.pd.256(i8 *, <4 x double>, <4 x double>)

-define void @__masked_store_32(<8 x i32>* nocapture, <8 x i32>,
+define void @__masked_store_i32(<8 x i32>* nocapture, <8 x i32>,
 <8 x i32>) nounwind alwaysinline {
 %ptr = bitcast <8 x i32> * %0 to i8 *
 %val = bitcast <8 x i32> %1 to <8 x float>
 %mask = bitcast <8 x i32> %2 to <8 x float>
@@ -393,8 +393,8 @@ define void @__masked_store_32(<8 x i32>* nocapture, <8 x i32>,
 ret void
 }

-define void @__masked_store_64(<8 x i64>* nocapture, <8 x i64>,
+define void @__masked_store_i64(<8 x i64>* nocapture, <8 x i64>,
 <8 x i32> %mask) nounwind alwaysinline {
 %ptr = bitcast <8 x i64> * %0 to i8 *
 %val = bitcast <8 x i64> %1 to <8 x double>

@@ -418,14 +418,13 @@ define void @__masked_store_64(<8 x i64>* nocapture, <8 x i64>,
 }

-

 masked_store_blend_8_16_by_8()

 declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>,
 <8 x float>) nounwind readnone

-define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
+define void @__masked_store_blend_i32(<8 x i32>* nocapture, <8 x i32>,
 <8 x i32>) nounwind alwaysinline {
 %mask_as_float = bitcast <8 x i32> %2 to <8 x float>
 %oldValue = load <8 x i32>* %0, align 4
 %oldAsFloat = bitcast <8 x i32> %oldValue to <8 x float>
@@ -439,8 +438,8 @@ define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
 }


-define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
+define void @__masked_store_blend_i64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
 <8 x i32> %i32mask) nounwind alwaysinline {
 %oldValue = load <8 x i64>* %ptr, align 8
 %mask = bitcast <8 x i32> %i32mask to <8 x float>
@@ -13,10 +13,10 @@ aossoa()
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; masked store

-gen_masked_store(1, i8, 8)
-gen_masked_store(1, i16, 16)
-gen_masked_store(1, i32, 32)
-gen_masked_store(1, i64, 64)
+gen_masked_store(i8)
+gen_masked_store(i16)
+gen_masked_store(i32)
+gen_masked_store(i64)

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; unaligned loads/loads+broadcasts
@@ -26,10 +26,10 @@ load_and_broadcast(i16)
 load_and_broadcast(i32)
 load_and_broadcast(i64)

-masked_load(1, i8, 8, 1)
-masked_load(1, i16, 16, 2)
-masked_load(1, i32, 32, 4)
-masked_load(1, i64, 64, 8)
+masked_load(i8, 1)
+masked_load(i16, 2)
+masked_load(i32, 4)
+masked_load(i64, 8)

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; gather/scatter
@@ -155,23 +155,23 @@ define <1 x float> @__vselect_float(<1 x float>, <1 x float>,
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; masked store

-define void @__masked_store_blend_8(<1 x i8>* nocapture, <1 x i8>,
+define void @__masked_store_blend_i8(<1 x i8>* nocapture, <1 x i8>,
 <1 x i32> %mask) nounwind alwaysinline {
 %val = load <1 x i8> * %0, align 4
 %newval = call <1 x i8> @__vselect_i8(<1 x i8> %val, <1 x i8> %1, <1 x i32> %mask)
 store <1 x i8> %newval, <1 x i8> * %0, align 4
 ret void
 }
-define void @__masked_store_blend_16(<1 x i16>* nocapture, <1 x i16>,
+define void @__masked_store_blend_i16(<1 x i16>* nocapture, <1 x i16>,
 <1 x i32> %mask) nounwind alwaysinline {
 %val = load <1 x i16> * %0, align 4
 %newval = call <1 x i16> @__vselect_i16(<1 x i16> %val, <1 x i16> %1, <1 x i32> %mask)
 store <1 x i16> %newval, <1 x i16> * %0, align 4
 ret void
 }

-define void @__masked_store_blend_32(<1 x i32>* nocapture, <1 x i32>,
+define void @__masked_store_blend_i32(<1 x i32>* nocapture, <1 x i32>,
 <1 x i32> %mask) nounwind alwaysinline {
 %val = load <1 x i32> * %0, align 4
 %newval = call <1 x i32> @__vselect_i32(<1 x i32> %val, <1 x i32> %1, <1 x i32> %mask)
@@ -179,8 +179,8 @@ define void @__masked_store_blend_32(<1 x i32>* nocapture, <1 x i32>,
 ret void
 }

-define void @__masked_store_blend_64(<1 x i64>* nocapture, <1 x i64>,
+define void @__masked_store_blend_i64(<1 x i64>* nocapture, <1 x i64>,
 <1 x i32> %mask) nounwind alwaysinline {
 %val = load <1 x i64> * %0, align 4
 %newval = call <1 x i64> @__vselect_i64(<1 x i64> %val, <1 x i64> %1, <1 x i32> %mask)
 store <1 x i64> %newval, <1 x i64> * %0, align 4
@@ -231,36 +231,36 @@ declare i64 @__reduce_max_uint64(<WIDTH x i64>) nounwind readnone
 ;; unaligned loads/loads+broadcasts


-declare <WIDTH x i8> @__masked_load_8(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
-declare <WIDTH x i16> @__masked_load_16(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
-declare <WIDTH x i32> @__masked_load_32(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
-declare <WIDTH x i64> @__masked_load_64(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly

 load_and_broadcast(i8)
 load_and_broadcast(i16)
 load_and_broadcast(i32)
 load_and_broadcast(i64)

-declare void @__masked_store_8(<WIDTH x i8>* nocapture, <WIDTH x i8>,
-<WIDTH x i1>) nounwind
-declare void @__masked_store_16(<WIDTH x i16>* nocapture, <WIDTH x i16>,
+declare <WIDTH x i8> @__masked_load_i8(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
+declare <WIDTH x i16> @__masked_load_i16(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
+declare <WIDTH x i32> @__masked_load_i32(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
+declare <WIDTH x i64> @__masked_load_i64(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
+declare void @__masked_store_i8(<WIDTH x i8>* nocapture, <WIDTH x i8>,
 <WIDTH x i1>) nounwind
-declare void @__masked_store_32(<WIDTH x i32>* nocapture, <WIDTH x i32>,
+declare void @__masked_store_i16(<WIDTH x i16>* nocapture, <WIDTH x i16>,
 <WIDTH x i1>) nounwind
-declare void @__masked_store_64(<WIDTH x i64>* nocapture, <WIDTH x i64>,
-<WIDTH x i1> %mask) nounwind
+declare void @__masked_store_i32(<WIDTH x i32>* nocapture, <WIDTH x i32>,
+<WIDTH x i1>) nounwind
+declare void @__masked_store_i64(<WIDTH x i64>* nocapture, <WIDTH x i64>,
+<WIDTH x i1> %mask) nounwind

 ifelse(LLVM_VERSION, `LLVM_3_0', `
-declare void @__masked_store_blend_8(<WIDTH x i8>* nocapture, <WIDTH x i8>,
-<WIDTH x i1>) nounwind
-declare void @__masked_store_blend_16(<WIDTH x i16>* nocapture, <WIDTH x i16>,
+declare void @__masked_store_blend_i8(<WIDTH x i8>* nocapture, <WIDTH x i8>,
 <WIDTH x i1>) nounwind
-declare void @__masked_store_blend_32(<WIDTH x i32>* nocapture, <WIDTH x i32>,
+declare void @__masked_store_blend_i16(<WIDTH x i16>* nocapture, <WIDTH x i16>,
 <WIDTH x i1>) nounwind
-declare void @__masked_store_blend_64(<WIDTH x i64>* nocapture, <WIDTH x i64>,
-<WIDTH x i1> %mask) nounwind
+declare void @__masked_store_blend_i32(<WIDTH x i32>* nocapture, <WIDTH x i32>,
+<WIDTH x i1>) nounwind
+declare void @__masked_store_blend_i64(<WIDTH x i64>* nocapture, <WIDTH x i64>,
+<WIDTH x i1> %mask) nounwind
 ', `
-define void @__masked_store_blend_8(<WIDTH x i8>* nocapture, <WIDTH x i8>,
+define void @__masked_store_blend_i8(<WIDTH x i8>* nocapture, <WIDTH x i8>,
 <WIDTH x i1>) nounwind alwaysinline {
 %v = load <WIDTH x i8> * %0
 %v1 = select <WIDTH x i1> %2, <WIDTH x i8> %1, <WIDTH x i8> %v
@@ -268,23 +268,23 @@ define void @__masked_store_blend_8(<WIDTH x i8>* nocapture, <WIDTH x i8>,
 ret void
 }

-define void @__masked_store_blend_16(<WIDTH x i16>* nocapture, <WIDTH x i16>,
+define void @__masked_store_blend_i16(<WIDTH x i16>* nocapture, <WIDTH x i16>,
 <WIDTH x i1>) nounwind alwaysinline {
 %v = load <WIDTH x i16> * %0
 %v1 = select <WIDTH x i1> %2, <WIDTH x i16> %1, <WIDTH x i16> %v
 store <WIDTH x i16> %v1, <WIDTH x i16> * %0
 ret void
 }

-define void @__masked_store_blend_32(<WIDTH x i32>* nocapture, <WIDTH x i32>,
+define void @__masked_store_blend_i32(<WIDTH x i32>* nocapture, <WIDTH x i32>,
 <WIDTH x i1>) nounwind alwaysinline {
 %v = load <WIDTH x i32> * %0
 %v1 = select <WIDTH x i1> %2, <WIDTH x i32> %1, <WIDTH x i32> %v
 store <WIDTH x i32> %v1, <WIDTH x i32> * %0
 ret void
 }

-define void @__masked_store_blend_64(<WIDTH x i64>* nocapture,
+define void @__masked_store_blend_i64(<WIDTH x i64>* nocapture,
 <WIDTH x i64>, <WIDTH x i1>) nounwind alwaysinline {
 %v = load <WIDTH x i64> * %0
 %v1 = select <WIDTH x i1> %2, <WIDTH x i64> %1, <WIDTH x i64> %v
@@ -434,14 +434,14 @@ reduce_equal(8)
 ;; unaligned loads/loads+broadcasts


-masked_load(8, i8, 8, 1)
-masked_load(8, i16, 16, 2)
-masked_load(8, i32, 32, 4)
-masked_load(8, i64, 64, 8)
 load_and_broadcast(i8)
 load_and_broadcast(i16)
 load_and_broadcast(i32)
 load_and_broadcast(i64)
+masked_load(i8, 1)
+masked_load(i16, 2)
+masked_load(i32, 4)
+masked_load(i64, 8)

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; gather/scatter
@@ -558,23 +558,23 @@ define <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alway
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; masked store

-gen_masked_store(8, i8, 8)
-gen_masked_store(8, i16, 16)
-gen_masked_store(8, i32, 32)
-gen_masked_store(8, i64, 64)
+gen_masked_store(i8)
+gen_masked_store(i16)
+gen_masked_store(i32)
+gen_masked_store(i64)

 masked_store_blend_8_16_by_8()

-define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
+define void @__masked_store_blend_i32(<8 x i32>* nocapture, <8 x i32>,
 <8 x i32> %mask) nounwind alwaysinline {
 %val = load <8 x i32> * %0, align 4
 %newval = call <8 x i32> @__vselect_i32(<8 x i32> %val, <8 x i32> %1, <8 x i32> %mask)
 store <8 x i32> %newval, <8 x i32> * %0, align 4
 ret void
 }

-define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
+define void @__masked_store_blend_i64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
 <8 x i32> %mask) nounwind alwaysinline {
 %oldValue = load <8 x i64>* %ptr, align 8

 ; Do 8x64-bit blends by doing two <8 x i32> blends, where the <8 x i32> values
@@ -350,16 +350,16 @@ reduce_equal(4)
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; masked store

-define void @__masked_store_blend_32(<4 x i32>* nocapture, <4 x i32>,
+define void @__masked_store_blend_i32(<4 x i32>* nocapture, <4 x i32>,
 <4 x i32> %mask) nounwind alwaysinline {
 %val = load <4 x i32> * %0, align 4
 %newval = call <4 x i32> @__vselect_i32(<4 x i32> %val, <4 x i32> %1, <4 x i32> %mask)
 store <4 x i32> %newval, <4 x i32> * %0, align 4
 ret void
 }

-define void @__masked_store_blend_64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
+define void @__masked_store_blend_i64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
 <4 x i32> %mask) nounwind alwaysinline {
 %oldValue = load <4 x i64>* %ptr, align 8

 ; Do 4x64-bit blends by doing two <4 x i32> blends, where the <4 x i32> values
@@ -552,10 +552,10 @@ define <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind r

 masked_store_blend_8_16_by_4()

-gen_masked_store(4, i8, 8)
-gen_masked_store(4, i16, 16)
-gen_masked_store(4, i32, 32)
-gen_masked_store(4, i64, 64)
+gen_masked_store(i8)
+gen_masked_store(i16)
+gen_masked_store(i32)
+gen_masked_store(i64)

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; unaligned loads/loads+broadcasts
@@ -565,10 +565,10 @@ load_and_broadcast(i16)
 load_and_broadcast(i32)
 load_and_broadcast(i64)

-masked_load(4, i8, 8, 1)
-masked_load(4, i16, 16, 2)
-masked_load(4, i32, 32, 4)
-masked_load(4, i64, 64, 8)
+masked_load(i8, 1)
+masked_load(i16, 2)
+masked_load(i32, 4)
+masked_load(i64, 8)

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; gather/scatter
@@ -360,15 +360,14 @@ reduce_equal(8)
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; unaligned loads/loads+broadcasts


-masked_load(8, i8, 8, 1)
-masked_load(8, i16, 16, 2)
-masked_load(8, i32, 32, 4)
-masked_load(8, i64, 64, 8)
 load_and_broadcast(i8)
 load_and_broadcast(i16)
 load_and_broadcast(i32)
 load_and_broadcast(i64)
+masked_load(i8, 1)
+masked_load(i16, 2)
+masked_load(i32, 4)
+masked_load(i64, 8)

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; gather/scatter
@@ -444,18 +443,18 @@ define float @__reduce_add_float(<8 x float>) nounwind readonly alwaysinline {
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; masked store

-gen_masked_store(8, i8, 8)
-gen_masked_store(8, i16, 16)
-gen_masked_store(8, i32, 32)
-gen_masked_store(8, i64, 64)
+gen_masked_store(i8)
+gen_masked_store(i16)
+gen_masked_store(i32)
+gen_masked_store(i64)

 masked_store_blend_8_16_by_8()

 declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>,
 <4 x float>) nounwind readnone

-define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
+define void @__masked_store_blend_i32(<8 x i32>* nocapture, <8 x i32>,
 <8 x i32> %mask) nounwind alwaysinline {
 ; do two 4-wide blends with blendvps
 %mask_as_float = bitcast <8 x i32> %mask to <8 x float>
 %mask_a = shufflevector <8 x float> %mask_as_float, <8 x float> undef,
@@ -484,8 +483,8 @@ define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
 ret void
 }

-define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
+define void @__masked_store_blend_i64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
 <8 x i32> %mask) nounwind alwaysinline {
 ; implement this as 4 blends of <4 x i32>s, which are actually bitcast
 ; <2 x i64>s...
@@ -384,8 +384,8 @@ declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>,
 <4 x float>) nounwind readnone


-define void @__masked_store_blend_32(<4 x i32>* nocapture, <4 x i32>,
+define void @__masked_store_blend_i32(<4 x i32>* nocapture, <4 x i32>,
 <4 x i32> %mask) nounwind alwaysinline {
 %mask_as_float = bitcast <4 x i32> %mask to <4 x float>
 %oldValue = load <4 x i32>* %0, align 4
 %oldAsFloat = bitcast <4 x i32> %oldValue to <4 x float>
@@ -399,8 +399,8 @@ define void @__masked_store_blend_32(<4 x i32>* nocapture, <4 x i32>,
 }


-define void @__masked_store_blend_64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
+define void @__masked_store_blend_i64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
 <4 x i32> %i32mask) nounwind alwaysinline {
 %oldValue = load <4 x i64>* %ptr, align 8
 %mask = bitcast <4 x i32> %i32mask to <4 x float>

@@ -451,10 +451,10 @@ define void @__masked_store_blend_64(<4 x i64>* nocapture %ptr, <4 x i64> %new,

 masked_store_blend_8_16_by_4()

-gen_masked_store(4, i8, 8)
-gen_masked_store(4, i16, 16)
-gen_masked_store(4, i32, 32)
-gen_masked_store(4, i64, 64)
+gen_masked_store(i8)
+gen_masked_store(i16)
+gen_masked_store(i32)
+gen_masked_store(i64)

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; unaligned loads/loads+broadcasts
@@ -464,10 +464,10 @@ load_and_broadcast(i16)
 load_and_broadcast(i32)
 load_and_broadcast(i64)

-masked_load(4, i8, 8, 1)
-masked_load(4, i16, 16, 2)
-masked_load(4, i32, 32, 4)
-masked_load(4, i64, 64, 8)
+masked_load(i8, 1)
+masked_load(i16, 2)
+masked_load(i32, 4)
+masked_load(i64, 8)

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; gather/scatter
builtins/util.m4
@@ -1549,19 +1549,19 @@ declare i1 @__is_compile_time_constant_varying_int32(<WIDTH x i32>)
 ; This function declares placeholder masked store functions for the
 ; front-end to use.
 ;
-; void __pseudo_masked_store_8 (uniform int8 *ptr, varying int8 values, mask)
-; void __pseudo_masked_store_16(uniform int16 *ptr, varying int16 values, mask)
-; void __pseudo_masked_store_32(uniform int32 *ptr, varying int32 values, mask)
-; void __pseudo_masked_store_64(uniform int64 *ptr, varying int64 values, mask)
+; void __pseudo_masked_store_i8 (uniform int8 *ptr, varying int8 values, mask)
+; void __pseudo_masked_store_i16(uniform int16 *ptr, varying int16 values, mask)
+; void __pseudo_masked_store_i32(uniform int32 *ptr, varying int32 values, mask)
+; void __pseudo_masked_store_i64(uniform int64 *ptr, varying int64 values, mask)
 ;
 ; These in turn are converted to native masked stores or to regular
 ; stores (if the mask is all on) by the MaskedStoreOptPass optimization
 ; pass.

-declare void @__pseudo_masked_store_8(<WIDTH x i8> * nocapture, <WIDTH x i8>, <WIDTH x MASK>)
-declare void @__pseudo_masked_store_16(<WIDTH x i16> * nocapture, <WIDTH x i16>, <WIDTH x MASK>)
-declare void @__pseudo_masked_store_32(<WIDTH x i32> * nocapture, <WIDTH x i32>, <WIDTH x MASK>)
-declare void @__pseudo_masked_store_64(<WIDTH x i64> * nocapture, <WIDTH x i64>, <WIDTH x MASK>)
+declare void @__pseudo_masked_store_i8(<WIDTH x i8> * nocapture, <WIDTH x i8>, <WIDTH x MASK>)
+declare void @__pseudo_masked_store_i16(<WIDTH x i16> * nocapture, <WIDTH x i16>, <WIDTH x MASK>)
+declare void @__pseudo_masked_store_i32(<WIDTH x i32> * nocapture, <WIDTH x i32>, <WIDTH x MASK>)
+declare void @__pseudo_masked_store_i64(<WIDTH x i64> * nocapture, <WIDTH x i64>, <WIDTH x MASK>)

 ; Declare the pseudo-gather functions. When the ispc front-end needs
 ; to perform a gather, it generates a call to one of these functions,
@@ -1692,13 +1692,13 @@ define void @__keep_funcs_live(i8 * %ptr, <WIDTH x i8> %v8, <WIDTH x i16> %v16,
 <WIDTH x MASK> %mask) {
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; loads
-%ml8 = call <WIDTH x i8> @__masked_load_8(i8 * %ptr, <WIDTH x MASK> %mask)
+%ml8 = call <WIDTH x i8> @__masked_load_i8(i8 * %ptr, <WIDTH x MASK> %mask)
 call void @__use8(<WIDTH x i8> %ml8)
-%ml16 = call <WIDTH x i16> @__masked_load_16(i8 * %ptr, <WIDTH x MASK> %mask)
+%ml16 = call <WIDTH x i16> @__masked_load_i16(i8 * %ptr, <WIDTH x MASK> %mask)
 call void @__use16(<WIDTH x i16> %ml16)
-%ml32 = call <WIDTH x i32> @__masked_load_32(i8 * %ptr, <WIDTH x MASK> %mask)
+%ml32 = call <WIDTH x i32> @__masked_load_i32(i8 * %ptr, <WIDTH x MASK> %mask)
 call void @__use32(<WIDTH x i32> %ml32)
-%ml64 = call <WIDTH x i64> @__masked_load_64(i8 * %ptr, <WIDTH x MASK> %mask)
+%ml64 = call <WIDTH x i64> @__masked_load_i64(i8 * %ptr, <WIDTH x MASK> %mask)
 call void @__use64(<WIDTH x i64> %ml64)

 %lb8 = call <WIDTH x i8> @__load_and_broadcast_i8(i8 * %ptr, <WIDTH x MASK> %mask)
@@ -1713,31 +1713,29 @@ define void @__keep_funcs_live(i8 * %ptr, <WIDTH x i8> %v8, <WIDTH x i16> %v16,
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; stores
 %pv8 = bitcast i8 * %ptr to <WIDTH x i8> *
-call void @__pseudo_masked_store_8(<WIDTH x i8> * %pv8, <WIDTH x i8> %v8,
+call void @__pseudo_masked_store_i8(<WIDTH x i8> * %pv8, <WIDTH x i8> %v8,
 <WIDTH x MASK> %mask)
 %pv16 = bitcast i8 * %ptr to <WIDTH x i16> *
-call void @__pseudo_masked_store_16(<WIDTH x i16> * %pv16, <WIDTH x i16> %v16,
+call void @__pseudo_masked_store_i16(<WIDTH x i16> * %pv16, <WIDTH x i16> %v16,
 <WIDTH x MASK> %mask)
 %pv32 = bitcast i8 * %ptr to <WIDTH x i32> *
-call void @__pseudo_masked_store_32(<WIDTH x i32> * %pv32, <WIDTH x i32> %v32,
+call void @__pseudo_masked_store_i32(<WIDTH x i32> * %pv32, <WIDTH x i32> %v32,
 <WIDTH x MASK> %mask)
 %pv64 = bitcast i8 * %ptr to <WIDTH x i64> *
-call void @__pseudo_masked_store_64(<WIDTH x i64> * %pv64, <WIDTH x i64> %v64,
+call void @__pseudo_masked_store_i64(<WIDTH x i64> * %pv64, <WIDTH x i64> %v64,
+<WIDTH x MASK> %mask)
+call void @__masked_store_i8(<WIDTH x i8> * %pv8, <WIDTH x i8> %v8, <WIDTH x MASK> %mask)
+call void @__masked_store_i16(<WIDTH x i16> * %pv16, <WIDTH x i16> %v16, <WIDTH x MASK> %mask)
+call void @__masked_store_i32(<WIDTH x i32> * %pv32, <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
+call void @__masked_store_i64(<WIDTH x i64> * %pv64, <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
+call void @__masked_store_blend_i8(<WIDTH x i8> * %pv8, <WIDTH x i8> %v8,
+<WIDTH x MASK> %mask)
+call void @__masked_store_blend_i16(<WIDTH x i16> * %pv16, <WIDTH x i16> %v16,
+<WIDTH x MASK> %mask)
+call void @__masked_store_blend_i32(<WIDTH x i32> * %pv32, <WIDTH x i32> %v32,
+<WIDTH x MASK> %mask)
+call void @__masked_store_blend_i64(<WIDTH x i64> * %pv64, <WIDTH x i64> %v64,
 <WIDTH x MASK> %mask)

-call void @__masked_store_8(<WIDTH x i8> * %pv8, <WIDTH x i8> %v8, <WIDTH x MASK> %mask)
-call void @__masked_store_16(<WIDTH x i16> * %pv16, <WIDTH x i16> %v16, <WIDTH x MASK> %mask)
-call void @__masked_store_32(<WIDTH x i32> * %pv32, <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
-call void @__masked_store_64(<WIDTH x i64> * %pv64, <WIDTH x i64> %v64, <WIDTH x MASK> %mask)

-call void @__masked_store_blend_8(<WIDTH x i8> * %pv8, <WIDTH x i8> %v8,
-<WIDTH x MASK> %mask)
-call void @__masked_store_blend_16(<WIDTH x i16> * %pv16, <WIDTH x i16> %v16,
-<WIDTH x MASK> %mask)
-call void @__masked_store_blend_32(<WIDTH x i32> * %pv32, <WIDTH x i32> %v32,
-<WIDTH x MASK> %mask)
-call void @__masked_store_blend_64(<WIDTH x i64> * %pv64, <WIDTH x i64> %v64,
-<WIDTH x MASK> %mask)

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; gathers
@@ -2507,15 +2505,13 @@ define <WIDTH x $1> @__load_and_broadcast_$1(i8 *, <WIDTH x MASK> %mask) nounwin
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; Emit general-purpose code to do a masked load for targets that dont have
 ;; an instruction to do that. Parameters:
-;; $1: target vector width
-;; $2: element type for which to emit the function (i32, i64, ...)
-;; $3: suffix for function name (32, 64, ...)
-;; $4: alignment for elements of type $2 (4, 8, ...)
+;; $1: element type for which to emit the function (i32, i64, ...) (and suffix for function name)
+;; $2: alignment for elements of type $1 (4, 8, ...)

 define(`masked_load', `
-define <$1 x $2> @__masked_load_$3(i8 *, <$1 x MASK> %mask) nounwind alwaysinline {
+define <WIDTH x $1> @__masked_load_$1(i8 *, <WIDTH x MASK> %mask) nounwind alwaysinline {
 entry:
-%mm = call i64 @__movmsk(<$1 x MASK> %mask)
+%mm = call i64 @__movmsk(<WIDTH x MASK> %mask)

 ; if the first lane and the last lane are on, then it is safe to do a vector load
 ; of the whole thing--what the lanes in the middle want turns out to not matter...
@@ -2531,14 +2527,14 @@ entry:
 %can_vload_maybe_fast = or i1 %fast_i1, %can_vload

 ; if we are not able to do a singe vload, we will accumulate lanes in this memory..
-%retptr = alloca <$1 x $2>
-%retptr32 = bitcast <$1 x $2> * %retptr to $2 *
+%retptr = alloca <WIDTH x $1>
+%retptr32 = bitcast <WIDTH x $1> * %retptr to $1 *
 br i1 %can_vload_maybe_fast, label %load, label %loop

 load:
-%ptr = bitcast i8 * %0 to <$1 x $2> *
-%valall = load <$1 x $2> * %ptr, align $4
-ret <$1 x $2> %valall
+%ptr = bitcast i8 * %0 to <WIDTH x $1> *
+%valall = load <WIDTH x $1> * %ptr, align $2
+ret <WIDTH x $1> %valall

 loop:
 ; loop over the lanes and see if each one is on...
@@ -2552,21 +2548,21 @@ loop:
 load_lane:
 ; yes! do the load and store the result into the appropriate place in the
 ; allocaed memory above
-%ptr32 = bitcast i8 * %0 to $2 *
-%lane_ptr = getelementptr $2 * %ptr32, i32 %lane
-%val = load $2 * %lane_ptr
-%store_ptr = getelementptr $2 * %retptr32, i32 %lane
-store $2 %val, $2 * %store_ptr
+%ptr32 = bitcast i8 * %0 to $1 *
+%lane_ptr = getelementptr $1 * %ptr32, i32 %lane
+%val = load $1 * %lane_ptr
+%store_ptr = getelementptr $1 * %retptr32, i32 %lane
+store $1 %val, $1 * %store_ptr
 br label %lane_done

 lane_done:
 %next_lane = add i32 %lane, 1
-%done = icmp eq i32 %lane, eval($1-1)
+%done = icmp eq i32 %lane, eval(WIDTH-1)
 br i1 %done, label %return, label %loop

 return:
-%r = load <$1 x $2> * %retptr
-ret <$1 x $2> %r
+%r = load <WIDTH x $1> * %retptr
+ret <WIDTH x $1> %r
 }
 ')

@@ -2574,23 +2570,21 @@ return:
 ;; masked store
 ;; emit code to do masked store as a set of per-lane scalar stores
 ;; parameters:
-;; $1: target vector width
-;; $2: llvm type of elements
-;; $3: suffix for function name
+;; $1: llvm type of elements (and suffix for function name)

 define(`gen_masked_store', `
-define void @__masked_store_$3(<$1 x $2>* nocapture, <$1 x $2>, <$1 x i32>) nounwind alwaysinline {
-per_lane($1, <$1 x i32> %2, `
-%ptr_LANE_ID = getelementptr <$1 x $2> * %0, i32 0, i32 LANE
-%storeval_LANE_ID = extractelement <$1 x $2> %1, i32 LANE
-store $2 %storeval_LANE_ID, $2 * %ptr_LANE_ID')
+define void @__masked_store_$1(<WIDTH x $1>* nocapture, <WIDTH x $1>, <WIDTH x i32>) nounwind alwaysinline {
+per_lane(WIDTH, <WIDTH x i32> %2, `
+%ptr_LANE_ID = getelementptr <WIDTH x $1> * %0, i32 0, i32 LANE
+%storeval_LANE_ID = extractelement <WIDTH x $1> %1, i32 LANE
+store $1 %storeval_LANE_ID, $1 * %ptr_LANE_ID')
 ret void
 }
 ')

 define(`masked_store_blend_8_16_by_4', `
-define void @__masked_store_blend_8(<4 x i8>* nocapture, <4 x i8>,
+define void @__masked_store_blend_i8(<4 x i8>* nocapture, <4 x i8>,
 <4 x i32>) nounwind alwaysinline {
 %old = load <4 x i8> * %0, align 1
 ifelse(LLVM_VERSION,LLVM_3_1svn,`
 %m = trunc <4 x i32> %2 to <4 x i1>
@@ -2613,8 +2607,8 @@ define void @__masked_store_blend_8(<4 x i8>* nocapture, <4 x i8>,
 ret void
 }

-define void @__masked_store_blend_16(<4 x i16>* nocapture, <4 x i16>,
+define void @__masked_store_blend_i16(<4 x i16>* nocapture, <4 x i16>,
 <4 x i32>) nounwind alwaysinline {
 %old = load <4 x i16> * %0, align 2
 ifelse(LLVM_VERSION,LLVM_3_1svn,`
 %m = trunc <4 x i32> %2 to <4 x i1>
@@ -2639,8 +2633,8 @@ define void @__masked_store_blend_16(<4 x i16>* nocapture, <4 x i16>,
 ')

 define(`masked_store_blend_8_16_by_8', `
-define void @__masked_store_blend_8(<8 x i8>* nocapture, <8 x i8>,
+define void @__masked_store_blend_i8(<8 x i8>* nocapture, <8 x i8>,
 <8 x i32>) nounwind alwaysinline {
 %old = load <8 x i8> * %0, align 1
 ifelse(LLVM_VERSION,LLVM_3_1svn,`
 %m = trunc <8 x i32> %2 to <8 x i1>
@@ -2663,8 +2657,8 @@ define void @__masked_store_blend_8(<8 x i8>* nocapture, <8 x i8>,
 ret void
 }

-define void @__masked_store_blend_16(<8 x i16>* nocapture, <8 x i16>,
+define void @__masked_store_blend_i16(<8 x i16>* nocapture, <8 x i16>,
 <8 x i32>) nounwind alwaysinline {
 %old = load <8 x i16> * %0, align 2
 ifelse(LLVM_VERSION,LLVM_3_1svn,`
 %m = trunc <8 x i32> %2 to <8 x i1>
@@ -2690,8 +2684,8 @@ define void @__masked_store_blend_16(<8 x i16>* nocapture, <8 x i16>,


 define(`masked_store_blend_8_16_by_16', `
-define void @__masked_store_blend_8(<16 x i8>* nocapture, <16 x i8>,
+define void @__masked_store_blend_i8(<16 x i8>* nocapture, <16 x i8>,
 <16 x i32>) nounwind alwaysinline {
 %old = load <16 x i8> * %0, align 1
 ifelse(LLVM_VERSION,LLVM_3_1svn,`
 %m = trunc <16 x i32> %2 to <16 x i1>
@@ -2714,8 +2708,8 @@ define void @__masked_store_blend_8(<16 x i8>* nocapture, <16 x i8>,
 ret void
 }

-define void @__masked_store_blend_16(<16 x i16>* nocapture, <16 x i16>,
+define void @__masked_store_blend_i16(<16 x i16>* nocapture, <16 x i16>,
 <16 x i32>) nounwind alwaysinline {
 %old = load <16 x i16> * %0, align 2
 ifelse(LLVM_VERSION,LLVM_3_1svn,`
 %m = trunc <16 x i32> %2 to <16 x i1>
@@ -2895,7 +2889,7 @@ domixed:
 store <$1 x $2> %basesmear, <$1 x $2> * %ptr
 %castptr = bitcast <$1 x $2> * %ptr to <$1 x $4> *
 %castv = bitcast <$1 x $2> %v to <$1 x $4>
-call void @__masked_store_blend_$6(<$1 x $4> * %castptr, <$1 x $4> %castv, <$1 x MASK> %mask)
+call void @__masked_store_blend_i$6(<$1 x $4> * %castptr, <$1 x $4> %castv, <$1 x MASK> %mask)
 %blendvec = load <$1 x $2> * %ptr
 br label %check_neighbors

@@ -2970,8 +2964,8 @@ define <$1 x $2> @__exclusive_scan_$6(<$1 x $2> %v,
 store <$1 x $2> %idvec, <$1 x $2> * %ptr
 %ptr`'$3 = bitcast <$1 x $2> * %ptr to <$1 x i`'$3> *
 %vi = bitcast <$1 x $2> %v to <$1 x i`'$3>
-call void @__masked_store_blend_$3(<$1 x i`'$3> * %ptr`'$3, <$1 x i`'$3> %vi,
+call void @__masked_store_blend_i$3(<$1 x i`'$3> * %ptr`'$3, <$1 x i`'$3> %vi,
 <$1 x MASK> %mask)
 %v_id = load <$1 x $2> * %ptr

 ; extract elements of the vector to use in computing the scan
@@ -3144,14 +3138,14 @@ define <$1 x $2> @__gather_base_offsets32_$2(i8 * %ptr, <$1 x i32> %offsets, i32
 ; Set the offset to zero for lanes that are off
 %offsetsPtr = alloca <$1 x i32>
 store <$1 x i32> zeroinitializer, <$1 x i32> * %offsetsPtr
-call void @__masked_store_blend_32(<$1 x i32> * %offsetsPtr, <$1 x i32> %offsets,
+call void @__masked_store_blend_i32(<$1 x i32> * %offsetsPtr, <$1 x i32> %offsets,
 <$1 x i32> %vecmask)
 %newOffsets = load <$1 x i32> * %offsetsPtr

 %deltaPtr = alloca <$1 x i32>
 store <$1 x i32> zeroinitializer, <$1 x i32> * %deltaPtr
-call void @__masked_store_blend_32(<$1 x i32> * %deltaPtr, <$1 x i32> %offset_delta,
+call void @__masked_store_blend_i32(<$1 x i32> * %deltaPtr, <$1 x i32> %offset_delta,
 <$1 x i32> %vecmask)
 %newDelta = load <$1 x i32> * %deltaPtr

 %ret0 = call <$1 x $2> @__gather_elt32_$2(i8 * %ptr, <$1 x i32> %newOffsets,
@@ -3175,14 +3169,14 @@ define <$1 x $2> @__gather_base_offsets64_$2(i8 * %ptr, <$1 x i64> %offsets, i32
 ; Set the offset to zero for lanes that are off
 %offsetsPtr = alloca <$1 x i64>
 store <$1 x i64> zeroinitializer, <$1 x i64> * %offsetsPtr
-call void @__masked_store_blend_64(<$1 x i64> * %offsetsPtr, <$1 x i64> %offsets,
+call void @__masked_store_blend_i64(<$1 x i64> * %offsetsPtr, <$1 x i64> %offsets,
 <$1 x i32> %vecmask)
 %newOffsets = load <$1 x i64> * %offsetsPtr

 %deltaPtr = alloca <$1 x i64>
 store <$1 x i64> zeroinitializer, <$1 x i64> * %deltaPtr
-call void @__masked_store_blend_64(<$1 x i64> * %deltaPtr, <$1 x i64> %offset_delta,
+call void @__masked_store_blend_i64(<$1 x i64> * %deltaPtr, <$1 x i64> %offset_delta,
 <$1 x i32> %vecmask)
 %newDelta = load <$1 x i64> * %deltaPtr

 %ret0 = call <$1 x $2> @__gather_elt64_$2(i8 * %ptr, <$1 x i64> %newOffsets,
16
ctx.cpp
16
ctx.cpp
@@ -2691,9 +2691,9 @@ FunctionEmitContext::maskedStore(llvm::Value *value, llvm::Value *ptr,
     }

     if (g->target.is32Bit)
-        maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_32");
+        maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_i32");
     else
-        maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_64");
+        maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_i64");
     }
     else if (Type::Equal(valueType, AtomicType::VaryingBool) &&
              g->target.maskBitCount == 1) {
@@ -2712,35 +2712,31 @@ FunctionEmitContext::maskedStore(llvm::Value *value, llvm::Value *ptr,
     else if (Type::Equal(valueType, AtomicType::VaryingDouble) ||
              Type::Equal(valueType, AtomicType::VaryingInt64) ||
              Type::Equal(valueType, AtomicType::VaryingUInt64)) {
-        maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_64");
         ptr = BitCastInst(ptr, LLVMTypes::Int64VectorPointerType,
                           LLVMGetName(ptr, "_to_int64vecptr"));
         value = BitCastInst(value, LLVMTypes::Int64VectorType,
                             LLVMGetName(value, "_to_int64"));
+        maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_i64");
     }
     else if (Type::Equal(valueType, AtomicType::VaryingFloat) ||
              Type::Equal(valueType, AtomicType::VaryingBool) ||
              Type::Equal(valueType, AtomicType::VaryingInt32) ||
              Type::Equal(valueType, AtomicType::VaryingUInt32) ||
              CastType<EnumType>(valueType) != NULL) {
-        maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_32");
         ptr = BitCastInst(ptr, LLVMTypes::Int32VectorPointerType,
                           LLVMGetName(ptr, "_to_int32vecptr"));
         if (Type::Equal(valueType, AtomicType::VaryingFloat))
             value = BitCastInst(value, LLVMTypes::Int32VectorType,
                                 LLVMGetName(value, "_to_int32"));
+        maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_i32");
     }
     else if (Type::Equal(valueType, AtomicType::VaryingInt16) ||
              Type::Equal(valueType, AtomicType::VaryingUInt16)) {
-        maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_16");
-        ptr = BitCastInst(ptr, LLVMTypes::Int16VectorPointerType,
-                          LLVMGetName(ptr, "_to_int16vecptr"));
+        maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_i16");
     }
     else if (Type::Equal(valueType, AtomicType::VaryingInt8) ||
              Type::Equal(valueType, AtomicType::VaryingUInt8)) {
-        maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_8");
-        ptr = BitCastInst(ptr, LLVMTypes::Int8VectorPointerType,
-                          LLVMGetName(ptr, "_to_int8vecptr"));
+        maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_i8");
     }
     AssertPos(currentPos, maskedStoreFunc != NULL);

@@ -1101,8 +1101,8 @@ REDUCE_MINMAX(uint64_t, __vec16_i64, __reduce_max_uint64, >)
 ///////////////////////////////////////////////////////////////////////////
 // masked load/store

-static FORCEINLINE __vec16_i8 __masked_load_8(void *p,
+static FORCEINLINE __vec16_i8 __masked_load_i8(void *p,
                                               __vec16_i1 mask) {
     __vec16_i8 ret;
     int8_t *ptr = (int8_t *)p;
     for (int i = 0; i < 16; ++i)
@@ -1111,8 +1111,8 @@ static FORCEINLINE __vec16_i8 __masked_load_8(void *p,
     return ret;
 }

-static FORCEINLINE __vec16_i16 __masked_load_16(void *p,
+static FORCEINLINE __vec16_i16 __masked_load_i16(void *p,
                                                 __vec16_i1 mask) {
     __vec16_i16 ret;
     int16_t *ptr = (int16_t *)p;
     for (int i = 0; i < 16; ++i)
@@ -1121,8 +1121,8 @@ static FORCEINLINE __vec16_i16 __masked_load_16(void *p,
     return ret;
 }

-static FORCEINLINE __vec16_i32 __masked_load_32(void *p,
+static FORCEINLINE __vec16_i32 __masked_load_i32(void *p,
                                                 __vec16_i1 mask) {
     __vec16_i32 ret;
     int32_t *ptr = (int32_t *)p;
     for (int i = 0; i < 16; ++i)
@@ -1131,8 +1131,8 @@ static FORCEINLINE __vec16_i32 __masked_load_32(void *p,
     return ret;
 }

-static FORCEINLINE __vec16_i64 __masked_load_64(void *p,
+static FORCEINLINE __vec16_i64 __masked_load_i64(void *p,
                                                 __vec16_i1 mask) {
     __vec16_i64 ret;
     int64_t *ptr = (int64_t *)p;
     for (int i = 0; i < 16; ++i)
@@ -1141,31 +1141,31 @@ static FORCEINLINE __vec16_i64 __masked_load_64(void *p,
     return ret;
 }

-static FORCEINLINE void __masked_store_8(void *p, __vec16_i8 val,
+static FORCEINLINE void __masked_store_i8(void *p, __vec16_i8 val,
                                          __vec16_i1 mask) {
     int8_t *ptr = (int8_t *)p;
     for (int i = 0; i < 16; ++i)
         if ((mask.v & (1 << i)) != 0)
             ptr[i] = val.v[i];
 }

-static FORCEINLINE void __masked_store_16(void *p, __vec16_i16 val,
+static FORCEINLINE void __masked_store_i16(void *p, __vec16_i16 val,
                                           __vec16_i1 mask) {
     int16_t *ptr = (int16_t *)p;
     for (int i = 0; i < 16; ++i)
         if ((mask.v & (1 << i)) != 0)
             ptr[i] = val.v[i];
 }

-static FORCEINLINE void __masked_store_32(void *p, __vec16_i32 val,
+static FORCEINLINE void __masked_store_i32(void *p, __vec16_i32 val,
                                           __vec16_i1 mask) {
     int32_t *ptr = (int32_t *)p;
     for (int i = 0; i < 16; ++i)
         if ((mask.v & (1 << i)) != 0)
             ptr[i] = val.v[i];
 }

-static FORCEINLINE void __masked_store_64(void *p, __vec16_i64 val,
+static FORCEINLINE void __masked_store_i64(void *p, __vec16_i64 val,
                                           __vec16_i1 mask) {
     int64_t *ptr = (int64_t *)p;
     for (int i = 0; i < 16; ++i)
@@ -1173,24 +1173,28 @@ static FORCEINLINE void __masked_store_64(void *p, __vec16_i64 val,
             ptr[i] = val.v[i];
 }

-static FORCEINLINE void __masked_store_blend_8(void *p, __vec16_i8 val,
-                                               __vec16_i1 mask) {
-    __masked_store_8(p, val, mask);
 }

-static FORCEINLINE void __masked_store_blend_16(void *p, __vec16_i16 val,
+static FORCEINLINE void __masked_store_blend_i8(void *p, __vec16_i8 val,
                                                 __vec16_i1 mask) {
-    __masked_store_16(p, val, mask);
+    __masked_store_i8(p, val, mask);
 }

-static FORCEINLINE void __masked_store_blend_32(void *p, __vec16_i32 val,
+static FORCEINLINE void __masked_store_blend_i16(void *p, __vec16_i16 val,
                                                 __vec16_i1 mask) {
-    __masked_store_32(p, val, mask);
+    __masked_store_i16(p, val, mask);
+}
+
+static FORCEINLINE void __masked_store_blend_i32(void *p, __vec16_i32 val,
+                                                 __vec16_i1 mask) {
+    __masked_store_i32(p, val, mask);
+}
+
+static FORCEINLINE void __masked_store_blend_i64(void *p, __vec16_i64 val,
+                                                 __vec16_i1 mask) {
+    __masked_store_i64(p, val, mask);
 }

-static FORCEINLINE void __masked_store_blend_64(void *p, __vec16_i64 val,
-                                                __vec16_i1 mask) {
-    __masked_store_64(p, val, mask);
 }

 ///////////////////////////////////////////////////////////////////////////
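
For reference, the renamed 16-wide helpers above keep the same scalar fallback semantics: a lane is written only when its bit in the mask is set, and the *_blend_* variants simply forward to the same loop on this target. A minimal standalone sketch of that behavior, using plain arrays and a 16-bit mask instead of ispc's __vec16_* wrapper types (the names below are hypothetical, not part of the commit):

#include <stdint.h>

// Write val[i] to dst[i] only for lanes whose mask bit is set.
static inline void masked_store_i32_sketch(int32_t *dst, const int32_t *val,
                                           uint16_t mask) {
    for (int i = 0; i < 16; ++i)
        if ((mask & (1 << i)) != 0)
            dst[i] = val[i];
}

// On this target the "blend" flavor is just an alias for the loop above.
static inline void masked_store_blend_i32_sketch(int32_t *dst, const int32_t *val,
                                                 uint16_t mask) {
    masked_store_i32_sketch(dst, val, mask);
}
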
@@ -2415,8 +2415,7 @@ static FORCEINLINE uint64_t __reduce_max_uint64(__vec4_i64 v) {
 ///////////////////////////////////////////////////////////////////////////
 // masked load/store

-static FORCEINLINE __vec4_i8 __masked_load_8(void *p,
-                                             __vec4_i1 mask) {
+static FORCEINLINE __vec4_i8 __masked_load_i8(void *p, __vec4_i1 mask) {
     int8_t r[4];
     int8_t *ptr = (int8_t *)p;
     uint32_t m = _mm_extract_ps(mask.v, 0);
@@ -2435,8 +2434,7 @@ static FORCEINLINE __vec4_i8 __masked_load_8(void *p,
     return __vec4_i8(r[0], r[1], r[2], r[3]);
 }

-static FORCEINLINE __vec4_i16 __masked_load_16(void *p,
-                                               __vec4_i1 mask) {
+static FORCEINLINE __vec4_i16 __masked_load_i16(void *p, __vec4_i1 mask) {
     int16_t r[4];
     int16_t *ptr = (int16_t *)p;

@@ -2459,8 +2457,7 @@ static FORCEINLINE __vec4_i16 __masked_load_16(void *p,
     return __vec4_i16(r[0], r[1], r[2], r[3]);
 }

-static FORCEINLINE __vec4_i32 __masked_load_32(void *p,
-                                               __vec4_i1 mask) {
+static FORCEINLINE __vec4_i32 __masked_load_i32(void *p, __vec4_i1 mask) {
     __m128i r = _mm_set_epi32(0, 0, 0, 0);
     int32_t *ptr = (int32_t *)p;
     uint32_t m = _mm_extract_ps(mask.v, 0);
@@ -2482,8 +2479,7 @@ static FORCEINLINE __vec4_i32 __masked_load_32(void *p,
     return r;
 }

-static FORCEINLINE __vec4_i64 __masked_load_64(void *p,
-                                               __vec4_i1 mask) {
+static FORCEINLINE __vec4_i64 __masked_load_i64(void *p, __vec4_i1 mask) {
     uint64_t r[4];
     uint64_t *ptr = (uint64_t *)p;
     uint32_t m = _mm_extract_ps(mask.v, 0);
@@ -2505,8 +2501,8 @@ static FORCEINLINE __vec4_i64 __masked_load_64(void *p,
     return __vec4_i64(r[0], r[1], r[2], r[3]);
 }

-static FORCEINLINE void __masked_store_8(void *p, __vec4_i8 val,
+static FORCEINLINE void __masked_store_i8(void *p, __vec4_i8 val,
                                          __vec4_i1 mask) {
     int8_t *ptr = (int8_t *)p;

     uint32_t m = _mm_extract_ps(mask.v, 0);
@@ -2526,8 +2522,8 @@ static FORCEINLINE void __masked_store_8(void *p, __vec4_i8 val,
     ptr[3] = _mm_extract_epi8(val.v, 3);
 }

-static FORCEINLINE void __masked_store_16(void *p, __vec4_i16 val,
+static FORCEINLINE void __masked_store_i16(void *p, __vec4_i16 val,
                                           __vec4_i1 mask) {
     int16_t *ptr = (int16_t *)p;

     uint32_t m = _mm_extract_ps(mask.v, 0);
@@ -2547,8 +2543,8 @@ static FORCEINLINE void __masked_store_16(void *p, __vec4_i16 val,
     ptr[3] = _mm_extract_epi16(val.v, 3);
 }

-static FORCEINLINE void __masked_store_32(void *p, __vec4_i32 val,
+static FORCEINLINE void __masked_store_i32(void *p, __vec4_i32 val,
                                           __vec4_i1 mask) {
     int32_t *ptr = (int32_t *)p;
     uint32_t m = _mm_extract_ps(mask.v, 0);
     if (m != 0)
@@ -2567,8 +2563,8 @@ static FORCEINLINE void __masked_store_32(void *p, __vec4_i32 val,
     ptr[3] = _mm_extract_epi32(val.v, 3);
 }

-static FORCEINLINE void __masked_store_64(void *p, __vec4_i64 val,
+static FORCEINLINE void __masked_store_i64(void *p, __vec4_i64 val,
                                           __vec4_i1 mask) {
     int64_t *ptr = (int64_t *)p;
     uint32_t m = _mm_extract_ps(mask.v, 0);
     if (m != 0)
@@ -2587,26 +2583,29 @@ static FORCEINLINE void __masked_store_64(void *p, __vec4_i64 val,
     ptr[3] = _mm_extract_epi64(val.v[1], 1);
 }

-static FORCEINLINE void __masked_store_blend_8(void *p, __vec4_i8 val,
-                                               __vec4_i1 mask) {
-    __masked_store_8(p, val, mask);
 }

-static FORCEINLINE void __masked_store_blend_16(void *p, __vec4_i16 val,
+static FORCEINLINE void __masked_store_blend_i8(void *p, __vec4_i8 val,
                                                 __vec4_i1 mask) {
-    __masked_store_16(p, val, mask);
+    __masked_store_i8(p, val, mask);
 }

-static FORCEINLINE void __masked_store_blend_32(void *p, __vec4_i32 val,
+static FORCEINLINE void __masked_store_blend_i16(void *p, __vec4_i16 val,
                                                 __vec4_i1 mask) {
+    __masked_store_i16(p, val, mask);
+}
+
+static FORCEINLINE void __masked_store_blend_i32(void *p, __vec4_i32 val,
+                                                 __vec4_i1 mask) {
     // FIXME: do a load, blendvps, store here...
-    __masked_store_32(p, val, mask);
+    __masked_store_i32(p, val, mask);
+}
 }

-static FORCEINLINE void __masked_store_blend_64(void *p, __vec4_i64 val,
+static FORCEINLINE void __masked_store_blend_i64(void *p, __vec4_i64 val,
                                                 __vec4_i1 mask) {
     // FIXME: do a 2x (load, blendvps, store) here...
-    __masked_store_64(p, val, mask);
+    __masked_store_i64(p, val, mask);
 }

 ///////////////////////////////////////////////////////////////////////////
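
The FIXME comments in the __vec4 blend functions above point at an SSE4.1 alternative: read the destination, blend the new values in under the mask, and write the whole vector back. A rough sketch of that idea (a hypothetical helper, not the committed code; note it performs a full-width read-modify-write, which is the "blend" semantics rather than a true masked store):

#include <smmintrin.h>  // SSE4.1: _mm_blendv_ps

// Blend 4 int32 lanes of 'val' into memory at 'p'. Lanes are taken from 'val'
// where the per-lane mask has its sign bit set (all-ones for active lanes,
// as in __vec4_i1); the other lanes keep their previous memory contents.
static inline void masked_store_blend_i32_sketch(void *p, __m128i val, __m128 mask) {
    float *ptr = (float *)p;
    __m128 old = _mm_loadu_ps(ptr);                                    // load current contents
    __m128 blended = _mm_blendv_ps(old, _mm_castsi128_ps(val), mask);  // keep old lanes where mask is off
    _mm_storeu_ps(ptr, blended);                                       // store all four lanes back
}
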

opt.cpp (92 lines changed)

@@ -1877,18 +1877,18 @@ MaskedStoreOptPass::runOnBasicBlock(llvm::BasicBlock &bb) {
     DEBUG_START_PASS("MaskedStoreOpt");

     MSInfo msInfo[] = {
-        MSInfo("__pseudo_masked_store_8", 1),
-        MSInfo("__pseudo_masked_store_16", 2),
-        MSInfo("__pseudo_masked_store_32", 4),
-        MSInfo("__pseudo_masked_store_64", 8),
-        MSInfo("__masked_store_blend_8", 1),
-        MSInfo("__masked_store_blend_16", 2),
-        MSInfo("__masked_store_blend_32", 4),
-        MSInfo("__masked_store_blend_64", 8),
-        MSInfo("__masked_store_8", 1),
-        MSInfo("__masked_store_16", 2),
-        MSInfo("__masked_store_32", 4),
-        MSInfo("__masked_store_64", 8)
+        MSInfo("__pseudo_masked_store_i8", 1),
+        MSInfo("__pseudo_masked_store_i16", 2),
+        MSInfo("__pseudo_masked_store_i32", 4),
+        MSInfo("__pseudo_masked_store_i64", 8),
+        MSInfo("__masked_store_blend_i8", 1),
+        MSInfo("__masked_store_blend_i16", 2),
+        MSInfo("__masked_store_blend_i32", 4),
+        MSInfo("__masked_store_blend_i64", 8),
+        MSInfo("__masked_store_i8", 1),
+        MSInfo("__masked_store_i16", 2),
+        MSInfo("__masked_store_i32", 4),
+        MSInfo("__masked_store_i64", 8),
     };

     bool modifiedAny = false;
@@ -1992,10 +1992,10 @@ MaskedLoadOptPass::runOnBasicBlock(llvm::BasicBlock &bb) {
     DEBUG_START_PASS("MaskedLoadOpt");

     MLInfo mlInfo[] = {
-        MLInfo("__masked_load_8", 1),
-        MLInfo("__masked_load_16", 2),
-        MLInfo("__masked_load_32", 4),
-        MLInfo("__masked_load_64", 8)
+        MLInfo("__masked_load_i8", 1),
+        MLInfo("__masked_load_i16", 2),
+        MLInfo("__masked_load_i32", 4),
+        MLInfo("__masked_load_i64", 8),
     };

     bool modifiedAny = false;
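
These tables are how the optimization passes recognize the masked load/store builtins: they match call targets against the listed names, so the entries here have to stay in sync with the renamed definitions in the builtins. A small illustrative sketch of that kind of name-based lookup (hypothetical struct and helper, not ispc's actual classes):

#include <cstring>

struct MaskedStoreInfoSketch {
    const char *name;   // builtin name to match, e.g. "__masked_store_i32"
    int sizeInBytes;    // second column from the tables above: 1, 2, 4 or 8
};

// Return the table entry whose name matches the called function, or nullptr
// if the call is not one of the masked-store builtins.
static const MaskedStoreInfoSketch *
findEntry(const MaskedStoreInfoSketch *table, int count, const char *calleeName) {
    for (int i = 0; i < count; ++i)
        if (strcmp(table[i].name, calleeName) == 0)
            return &table[i];
    return nullptr;
}
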
@@ -2141,14 +2141,14 @@ PseudoMaskedStorePass::runOnBasicBlock(llvm::BasicBlock &bb) {
     DEBUG_START_PASS("PseudoMaskedStorePass");

     LMSInfo msInfo[] = {
-        LMSInfo("__pseudo_masked_store_8", "__masked_store_blend_8",
-                "__masked_store_8"),
-        LMSInfo("__pseudo_masked_store_16", "__masked_store_blend_16",
-                "__masked_store_16"),
-        LMSInfo("__pseudo_masked_store_32", "__masked_store_blend_32",
-                "__masked_store_32"),
-        LMSInfo("__pseudo_masked_store_64", "__masked_store_blend_64",
-                "__masked_store_64")
+        LMSInfo("__pseudo_masked_store_i8", "__masked_store_blend_i8",
+                "__masked_store_i8"),
+        LMSInfo("__pseudo_masked_store_i16", "__masked_store_blend_i16",
+                "__masked_store_i16"),
+        LMSInfo("__pseudo_masked_store_i32", "__masked_store_blend_i32",
+                "__masked_store_i32"),
+        LMSInfo("__pseudo_masked_store_i64", "__masked_store_blend_i64",
+                "__masked_store_i64"),
     };

     bool modifiedAny = false;
@@ -2282,38 +2282,38 @@ GSToLoadStorePass::runOnBasicBlock(llvm::BasicBlock &bb) {

     GatherImpInfo gInfo[] = {
         GatherImpInfo("__pseudo_gather_base_offsets32_8", "__load_and_broadcast_i8",
-                      "__masked_load_8", 1),
+                      "__masked_load_i8", 1),
         GatherImpInfo("__pseudo_gather_base_offsets32_16", "__load_and_broadcast_i16",
-                      "__masked_load_16", 2),
+                      "__masked_load_i16", 2),
         GatherImpInfo("__pseudo_gather_base_offsets32_32", "__load_and_broadcast_i32",
-                      "__masked_load_32", 4),
+                      "__masked_load_i32", 4),
         GatherImpInfo("__pseudo_gather_base_offsets32_64", "__load_and_broadcast_i64",
-                      "__masked_load_64", 8),
+                      "__masked_load_i64", 8),
         GatherImpInfo("__pseudo_gather_base_offsets64_8", "__load_and_broadcast_i8",
-                      "__masked_load_8", 1),
+                      "__masked_load_i8", 1),
         GatherImpInfo("__pseudo_gather_base_offsets64_16", "__load_and_broadcast_i16",
-                      "__masked_load_16", 2),
+                      "__masked_load_i16", 2),
         GatherImpInfo("__pseudo_gather_base_offsets64_32", "__load_and_broadcast_i32",
-                      "__masked_load_32", 4),
+                      "__masked_load_i32", 4),
         GatherImpInfo("__pseudo_gather_base_offsets64_64", "__load_and_broadcast_i64",
-                      "__masked_load_64", 8)
+                      "__masked_load_i64", 8)
     };
     ScatterImpInfo sInfo[] = {
-        ScatterImpInfo("__pseudo_scatter_base_offsets32_8", "__pseudo_masked_store_8",
+        ScatterImpInfo("__pseudo_scatter_base_offsets32_8", "__pseudo_masked_store_i8",
                        LLVMTypes::Int8VectorPointerType, 1),
-        ScatterImpInfo("__pseudo_scatter_base_offsets32_16", "__pseudo_masked_store_16",
+        ScatterImpInfo("__pseudo_scatter_base_offsets32_16", "__pseudo_masked_store_i16",
                        LLVMTypes::Int16VectorPointerType, 2),
-        ScatterImpInfo("__pseudo_scatter_base_offsets32_32", "__pseudo_masked_store_32",
+        ScatterImpInfo("__pseudo_scatter_base_offsets32_32", "__pseudo_masked_store_i32",
                        LLVMTypes::Int32VectorPointerType, 4),
-        ScatterImpInfo("__pseudo_scatter_base_offsets32_64", "__pseudo_masked_store_64",
+        ScatterImpInfo("__pseudo_scatter_base_offsets32_64", "__pseudo_masked_store_i64",
                        LLVMTypes::Int64VectorPointerType, 8),
-        ScatterImpInfo("__pseudo_scatter_base_offsets64_8", "__pseudo_masked_store_8",
+        ScatterImpInfo("__pseudo_scatter_base_offsets64_8", "__pseudo_masked_store_i8",
                        LLVMTypes::Int8VectorPointerType, 1),
-        ScatterImpInfo("__pseudo_scatter_base_offsets64_16", "__pseudo_masked_store_16",
+        ScatterImpInfo("__pseudo_scatter_base_offsets64_16", "__pseudo_masked_store_i16",
                        LLVMTypes::Int16VectorPointerType, 2),
-        ScatterImpInfo("__pseudo_scatter_base_offsets64_32", "__pseudo_masked_store_32",
+        ScatterImpInfo("__pseudo_scatter_base_offsets64_32", "__pseudo_masked_store_i32",
                        LLVMTypes::Int32VectorPointerType, 4),
-        ScatterImpInfo("__pseudo_scatter_base_offsets64_64", "__pseudo_masked_store_64",
+        ScatterImpInfo("__pseudo_scatter_base_offsets64_64", "__pseudo_masked_store_i64",
                        LLVMTypes::Int64VectorPointerType, 8)
     };

@@ -3815,14 +3815,14 @@ MakeInternalFuncsStaticPass::runOnModule(llvm::Module &module) {
         "__gather_elt32_i32", "__gather_elt32_i64",
         "__gather_elt64_i8", "__gather_elt64_i16",
         "__gather_elt64_i32", "__gather_elt64_i64",
-        "__masked_load_8", "__masked_load_16",
-        "__masked_load_32", "__masked_load_64",
-        "__masked_store_8", "__masked_store_16",
-        "__masked_store_32", "__masked_store_64",
-        "__masked_store_blend_8", "__masked_store_blend_16",
-        "__masked_store_blend_32", "__masked_store_blend_64",
         "__load_and_broadcast_i8", "__load_and_broadcast_i16",
         "__load_and_broadcast_i32", "__load_and_broadcast_i64",
+        "__masked_load_i8", "__masked_load_i16",
+        "__masked_load_i32", "__masked_load_i64",
+        "__masked_store_i8", "__masked_store_i16",
+        "__masked_store_i32", "__masked_store_i64",
+        "__masked_store_blend_i8", "__masked_store_blend_i16",
+        "__masked_store_blend_i32", "__masked_store_blend_i64",
         "__scatter_base_offsets32_i8", "__scatter_base_offsets32_i16",
         "__scatter_base_offsets32_i32", "__scatter_base_offsets32_i64",
         "__scatter_base_offsets64_i8", "__scatter_base_offsets64_i16",