added mask64
@@ -29,7 +29,7 @@
 ;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 ;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-include(`target-avx-h.ll')
+include(`target-avx-i64x4base.ll')
 
 rdrand_decls()
 
@@ -33,7 +33,7 @@
 ;; Basic 4-wide definitions
 
 define(`WIDTH',`4')
-define(`MASK',`i32')
+define(`MASK',`i64')
 include(`util.m4')
 
 stdlib_core()
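With MASK rebound to i64, every `<WIDTH x MASK>` in the templates pulled in through util.m4 now expands to a 64-bit mask vector, which is why the movmsk, masked load/store, and blend helpers in the hunks below are all retyped. A minimal m4 sketch of the expansion (illustrative only, not a line from the diff):

    define(`WIDTH',`4')
    define(`MASK',`i64')
    <WIDTH x MASK>
    dnl m4 rewrites the line above to: <4 x i64>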
@@ -185,32 +185,32 @@ define <4 x float> @__min_varying_float(<4 x float>, <4 x float>) nounwind reado
 ; horizontal ops
 
 ;; sse intrinsic
-declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
+declare i32 @llvm.x86.avx.movmsk.pd.256(<4 x double>) nounwind readnone
 
-define i64 @__movmsk(<4 x i32>) nounwind readnone alwaysinline {
-  %floatmask = bitcast <4 x i32> %0 to <4 x float>
-  %v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
+define i64 @__movmsk(<4 x i64>) nounwind readnone alwaysinline {
+  %floatmask = bitcast <4 x i64> %0 to <4 x double>
+  %v = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %floatmask) nounwind readnone
   %v64 = zext i32 %v to i64
   ret i64 %v64
 }
 
-define i1 @__any(<4 x i32>) nounwind readnone alwaysinline {
-  %floatmask = bitcast <4 x i32> %0 to <4 x float>
-  %v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
+define i1 @__any(<4 x i64>) nounwind readnone alwaysinline {
+  %floatmask = bitcast <4 x i64> %0 to <4 x double>
+  %v = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %floatmask) nounwind readnone
   %cmp = icmp ne i32 %v, 0
   ret i1 %cmp
 }
 
-define i1 @__all(<4 x i32>) nounwind readnone alwaysinline {
-  %floatmask = bitcast <4 x i32> %0 to <4 x float>
-  %v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
+define i1 @__all(<4 x i64>) nounwind readnone alwaysinline {
+  %floatmask = bitcast <4 x i64> %0 to <4 x double>
+  %v = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %floatmask) nounwind readnone
   %cmp = icmp eq i32 %v, 15
   ret i1 %cmp
 }
 
-define i1 @__none(<4 x i32>) nounwind readnone alwaysinline {
-  %floatmask = bitcast <4 x i32> %0 to <4 x float>
-  %v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
+define i1 @__none(<4 x i64>) nounwind readnone alwaysinline {
+  %floatmask = bitcast <4 x i64> %0 to <4 x double>
+  %v = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %floatmask) nounwind readnone
   %cmp = icmp eq i32 %v, 0
   ret i1 %cmp
 }
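The retyped horizontal ops work because ispc mask lanes are all-ones or all-zeros: with 64-bit lanes the truth value still sits in each element's sign bit, so movmsk.pd.256 on the <4 x double> bitcast yields the same 4-bit lane mask that movmsk.ps produced for i32 masks. A self-contained sketch of the new pattern (the function name here is illustrative):

    declare i32 @llvm.x86.avx.movmsk.pd.256(<4 x double>) nounwind readnone

    ; collect the per-lane sign bits of an all-ones/all-zeros <4 x i64> mask
    define i64 @example_movmsk_i64(<4 x i64> %mask) nounwind readnone {
      %d = bitcast <4 x i64> %mask to <4 x double>
      %bits = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %d)
      %r = zext i32 %bits to i64
      ret i64 %r
    }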
@@ -392,7 +392,8 @@ masked_load(i16, 2)
 declare <4 x float> @llvm.x86.avx.maskload.ps(i8 *, <4 x float> %mask)
 declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8 *, <4 x double> %mask)
 
-define <4 x i32> @__masked_load_i32(i8 *, <4 x i32> %mask) nounwind alwaysinline {
+define <4 x i32> @__masked_load_i32(i8 *, <4 x i64> %mask64) nounwind alwaysinline {
+  %mask = trunc <4 x i64> %mask64 to <4 x i32>
   %floatmask = bitcast <4 x i32> %mask to <4 x float>
   %floatval = call <4 x float> @llvm.x86.avx.maskload.ps(i8 * %0, <4 x float> %floatmask)
   %retval = bitcast <4 x float> %floatval to <4 x i32>
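maskload.ps decides each lane from the sign bit of the corresponding 32-bit mask element, and truncating a proper all-ones/all-zeros 64-bit lane yields -1 or 0 in 32 bits, so the sign bit survives the narrowing. A sketch of just that conversion step (assuming lanes are proper masks; the function name is hypothetical):

    define <4 x i32> @example_narrow_mask(<4 x i64> %mask64) nounwind readnone {
      ; -1 truncates to -1 and 0 to 0, preserving the MSB that maskload.ps tests
      %mask = trunc <4 x i64> %mask64 to <4 x i32>
      ret <4 x i32> %mask
    }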
@@ -400,18 +401,11 @@ define <4 x i32> @__masked_load_i32(i8 *, <4 x i32> %mask) nounwind alwaysinline
 }
 
 
-define <4 x i64> @__masked_load_i64(i8 *, <4 x i32> %mask) nounwind alwaysinline {
-  ; double up masks, bitcast to doubles
-  %mask0 = shufflevector <4 x i32> %mask, <4 x i32> undef,
-     <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
-  %mask0d = bitcast <8 x i32> %mask0 to <4 x double>
-
-  %val0d = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %0, <4 x double> %mask0d)
-
-  %vald = shufflevector <4 x double> %val0d, <4 x double> undef,
-     <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %val = bitcast <4 x double> %vald to <4 x i64>
-  ret <4 x i64> %val
+define <4 x i64> @__masked_load_i64(i8 *, <4 x i64> %mask) nounwind alwaysinline {
+  %doublemask = bitcast <4 x i64> %mask to <4 x double>
+  %doubleval = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %0, <4 x double> %doublemask)
+  %retval = bitcast <4 x double> %doubleval to <4 x i64>
+  ret <4 x i64> %retval
 }
 
 masked_load_float_double()
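For the i64 load the wider mask removes the old shufflevector doubling entirely: a <4 x i64> mask is already element-for-element the right width for maskload.pd.256, so one bitcast to <4 x double> suffices. A hypothetical caller, showing how a compare result becomes such a mask (names and the icmp condition are illustrative):

    declare <4 x i64> @__masked_load_i64(i8 *, <4 x i64>) nounwind

    define <4 x i64> @example_load(i8 * %p, <4 x i64> %a, <4 x i64> %b) nounwind {
      %c = icmp slt <4 x i64> %a, %b           ; per-lane condition
      %mask = sext <4 x i1> %c to <4 x i64>    ; all-ones / all-zeros lanes
      %v = call <4 x i64> @__masked_load_i64(i8 * %p, <4 x i64> %mask)
      ret <4 x i64> %v
    }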
@@ -428,83 +422,62 @@ declare void @llvm.x86.avx.maskstore.ps (i8 *, <4 x float>, <4 x float>)
 declare void @llvm.x86.avx.maskstore.pd.256(i8 *, <4 x double>, <4 x double>)
 
 define void @__masked_store_i32(<4 x i32>* nocapture, <4 x i32>,
-                                <4 x i32>) nounwind alwaysinline {
-  %ptr = bitcast <4 x i32> * %0 to i8 *
-  %val = bitcast <4 x i32> %1 to <4 x float>
-  %mask = bitcast <4 x i32> %2 to <4 x float>
+                                <4 x i64>) nounwind alwaysinline {
+  %mask32 = trunc <4 x i64> %2 to <4 x i32>
+
+  %ptr = bitcast <4 x i32> * %0 to i8 *
+  %val = bitcast <4 x i32> %1 to <4 x float>
+  %mask = bitcast <4 x i32> %mask32 to <4 x float>
   call void @llvm.x86.avx.maskstore.ps(i8 * %ptr, <4 x float> %mask, <4 x float> %val)
   ret void
 }
 
 define void @__masked_store_i64(<4 x i64>* nocapture, <4 x i64>,
-                                <4 x i32> %mask) nounwind alwaysinline {
-  %ptr = bitcast <4 x i64> * %0 to i8 *
-  %val = bitcast <4 x i64> %1 to <4 x double>
-
-  %mask0 = shufflevector <4 x i32> %mask, <4 x i32> undef,
-     <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
-
-  %mask0d = bitcast <8 x i32> %mask0 to <4 x double>
-
-  %val0 = shufflevector <4 x double> %val, <4 x double> undef,
-     <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-
-  call void @llvm.x86.avx.maskstore.pd.256(i8 * %ptr, <4 x double> %mask0d, <4 x double> %val0)
+                                <4 x i64>) nounwind alwaysinline {
+  %ptr = bitcast <4 x i64> * %0 to i8 *
+  %val = bitcast <4 x i64> %1 to <4 x double>
+  %mask = bitcast <4 x i64> %2 to <4 x double>
+  call void @llvm.x86.avx.maskstore.pd.256(i8 * %ptr, <4 x double> %mask, <4 x double> %val)
   ret void
 }
 
 
-masked_store_blend_8_16_by_4()
+masked_store_blend_8_16_by_4_mask64()
 
 ;; sse intrinsic
 declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>,
                                              <4 x float>) nounwind readnone
 
 define void @__masked_store_blend_i32(<4 x i32>* nocapture, <4 x i32>,
-                                      <4 x i32> %mask) nounwind alwaysinline {
+                                      <4 x i64>) nounwind alwaysinline {
+  %mask = trunc <4 x i64> %2 to <4 x i32>
   %mask_as_float = bitcast <4 x i32> %mask to <4 x float>
-  %oldValue = load <4 x i32>* %0, align 4
-  %oldAsFloat = bitcast <4 x i32> %oldValue to <4 x float>
-  %newAsFloat = bitcast <4 x i32> %1 to <4 x float>
-  %blend = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %oldAsFloat,
-                                                     <4 x float> %newAsFloat,
-                                                     <4 x float> %mask_as_float)
+  %oldValue = load <4 x i32>* %0, align 4
+  %oldAsFloat = bitcast <4 x i32> %oldValue to <4 x float>
+  %newAsFloat = bitcast <4 x i32> %1 to <4 x float>
+  %blend = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %oldAsFloat,
+                                                     <4 x float> %newAsFloat,
+                                                     <4 x float> %mask_as_float)
   %blendAsInt = bitcast <4 x float> %blend to <4 x i32>
   store <4 x i32> %blendAsInt, <4 x i32>* %0, align 4
   ret void
 }
 
 ;; avx intrinsic
-declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>,
-                                                <8 x float>) nounwind readnone
+declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>,
+                                                 <4 x double>) nounwind readnone
 
-define void @__masked_store_blend_i64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
-                                      <4 x i32> %i32mask) nounwind alwaysinline {
-  %oldValue = load <4 x i64>* %ptr, align 8
-  %mask = bitcast <4 x i32> %i32mask to <4 x float>
-
-  ; Do 4x64-bit blends by doing two <8 x i32> blends, where the <8 x i32> values
-  ; are actually bitcast <4 x i64> values
-  ;
-  ; set up the first four 64-bit values
-  %old01 = bitcast <4 x i64> %oldValue to <4 x i64>
-  %old01f = bitcast <4 x i64> %old01 to <8 x float>
-  %new01 = bitcast <4 x i64> %new to <4 x i64>
-  %new01f = bitcast <4 x i64> %new01 to <8 x float>
-  ; compute mask--note that the indices are all doubled-up
-  %mask01 = shufflevector <4 x float> %mask, <4 x float> undef,
-                          <8 x i32> <i32 0, i32 0, i32 1, i32 1,
-                                     i32 2, i32 2, i32 3, i32 3>
-  ; and blend them
-  %result01f = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %old01f,
-                                                            <8 x float> %new01f,
-                                                            <8 x float> %mask01)
-  %result01 = bitcast <8 x float> %result01f to <4 x i64>
-
-  %final = bitcast <4 x i64> %result01 to <4 x i64>
-  store <4 x i64> %final, <4 x i64> * %ptr, align 8
+define void @__masked_store_blend_i64(<4 x i64>* nocapture , <4 x i64>,
+                                      <4 x i64>) nounwind alwaysinline {
+  %mask_as_double = bitcast <4 x i64> %2 to <4 x double>
+  %oldValue = load <4 x i64>* %0, align 4
+  %oldAsDouble = bitcast <4 x i64> %oldValue to <4 x double>
+  %newAsDouble = bitcast <4 x i64> %1 to <4 x double>
+  %blend = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %oldAsDouble,
+                                                         <4 x double> %newAsDouble,
+                                                         <4 x double> %mask_as_double)
+  %blendAsInt = bitcast <4 x double> %blend to <4 x i64>
+  store <4 x i64> %blendAsInt, <4 x i64>* %0, align 4
   ret void
 }
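blendv.pd selects each lane from the sign bit of the corresponding mask element, so a full-width i64 mask turns the 4x64-bit blend into a single intrinsic call instead of the old doubled-up <8 x float> shuffle. In plain IR the blend is equivalent to a select on the mask sign bits; a minimal sketch of that equivalence (not part of the diff):

    define <4 x i64> @example_blend(<4 x i64> %old, <4 x i64> %new,
                                    <4 x i64> %mask) nounwind readnone {
      %sign = lshr <4 x i64> %mask, <i64 63, i64 63, i64 63, i64 63>
      %m = trunc <4 x i64> %sign to <4 x i1>
      %r = select <4 x i1> %m, <4 x i64> %new, <4 x i64> %old
      ret <4 x i64> %r
    }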
@@ -947,6 +947,22 @@ define internal <$1 x i64> @convertmask_i32_i64_$1(<$1 x i32>) {
   %r = sext <$1 x i32> %0 to <$1 x i64>
   ret <$1 x i64> %r
 }
+
+define internal <$1 x i8> @convertmask_i64_i8_$1(<$1 x i64>) {
+  %r = trunc <$1 x i64> %0 to <$1 x i8>
+  ret <$1 x i8> %r
+}
+define internal <$1 x i16> @convertmask_i64_i16_$1(<$1 x i64>) {
+  %r = trunc <$1 x i64> %0 to <$1 x i16>
+  ret <$1 x i16> %r
+}
+define internal <$1 x i32> @convertmask_i64_i32_$1(<$1 x i64>) {
+  %r = trunc <$1 x i64> %0 to <$1 x i32>
+  ret <$1 x i32> %r
+}
+define internal <$1 x i64> @convertmask_i64_i64_$1(<$1 x i64>) {
+  ret <$1 x i64> %0
+}
 ')
 
 mask_converts(WIDTH)
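Since mask_converts(WIDTH) is instantiated with WIDTH=4 on this target, each new i64 case expands to a plain truncation (the i64-to-i64 case is a pass-through). For example, the i8 conversion becomes:

    define internal <4 x i8> @convertmask_i64_i8_4(<4 x i64>) {
      %r = trunc <4 x i64> %0 to <4 x i8>
      ret <4 x i8> %r
    }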
@@ -2689,9 +2705,13 @@ define i32 @__sext_uniform_bool(i1) nounwind readnone alwaysinline {
 }
 
 define <WIDTH x i32> @__sext_varying_bool(<WIDTH x MASK>) nounwind readnone alwaysinline {
-  ifelse(MASK,i32, `ret <WIDTH x i32> %0',
-         `%se = sext <WIDTH x MASK> %0 to <WIDTH x i32>
-          ret <WIDTH x i32> %se')
+;;  ifelse(MASK,i32, `ret <WIDTH x i32> %0',
+;;         `%se = sext <WIDTH x MASK> %0 to <WIDTH x i32>
+;;          ret <WIDTH x i32> %se')
+  ifelse(MASK,i32, `%se = bitcast <WIDTH x i32> %0 to <WIDTH x i32>',
+         MASK,i64, `%se = trunc <WIDTH x MASK> %0 to <WIDTH x i32>',
+         `%se = sext <WIDTH x MASK> %0 to <WIDTH x i32>')
+  ret <WIDTH x i32> %se
 }
 
 
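Under this target's definitions (WIDTH=4, MASK=i64), the new ifelse arm makes __sext_varying_bool a truncation, mapping each all-ones/all-zeros 64-bit lane to -1 or 0 in 32 bits; the macro expands to:

    define <4 x i32> @__sext_varying_bool(<4 x i64>) nounwind readnone alwaysinline {
      %se = trunc <4 x i64> %0 to <4 x i32>
      ret <4 x i32> %se
    }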
@@ -3508,6 +3528,56 @@ define void @__masked_store_blend_i16(<4 x i16>* nocapture, <4 x i16>,
 }
 ')
 
+define(`masked_store_blend_8_16_by_4_mask64', `
+define void @__masked_store_blend_i8(<4 x i8>* nocapture, <4 x i8>,
+                                     <4 x i64>) nounwind alwaysinline {
+  %old = load <4 x i8> * %0, align 1
+ifelse(LLVM_VERSION,LLVM_3_0,`
+  %old32 = bitcast <4 x i8> %old to i32
+  %new32 = bitcast <4 x i8> %1 to i32
+
+  %mask8 = trunc <4 x i64> %2 to <4 x i8>
+  %mask32 = bitcast <4 x i8> %mask8 to i32
+  %notmask32 = xor i32 %mask32, -1
+
+  %newmasked = and i32 %new32, %mask32
+  %oldmasked = and i32 %old32, %notmask32
+  %result = or i32 %newmasked, %oldmasked
+
+  %resultvec = bitcast i32 %result to <4 x i8>
+',`
+  %m = trunc <4 x i64> %2 to <4 x i1>
+  %resultvec = select <4 x i1> %m, <4 x i8> %1, <4 x i8> %old
+')
+  store <4 x i8> %resultvec, <4 x i8> * %0, align 1
+  ret void
+}
+
+define void @__masked_store_blend_i16(<4 x i16>* nocapture, <4 x i16>,
+                                      <4 x i64>) nounwind alwaysinline {
+  %old = load <4 x i16> * %0, align 2
+ifelse(LLVM_VERSION,LLVM_3_0,`
+  %old64 = bitcast <4 x i16> %old to i64
+  %new64 = bitcast <4 x i16> %1 to i64
+
+  %mask16 = trunc <4 x i64> %2 to <4 x i16>
+  %mask64 = bitcast <4 x i16> %mask16 to i64
+  %notmask64 = xor i64 %mask64, -1
+
+  %newmasked = and i64 %new64, %mask64
+  %oldmasked = and i64 %old64, %notmask64
+  %result = or i64 %newmasked, %oldmasked
+
+  %resultvec = bitcast i64 %result to <4 x i16>
+',`
+  %m = trunc <4 x i64> %2 to <4 x i1>
+  %resultvec = select <4 x i1> %m, <4 x i16> %1, <4 x i16> %old
+')
+  store <4 x i16> %resultvec, <4 x i16> * %0, align 2
+  ret void
+}
+')
+
 define(`masked_store_blend_8_16_by_8', `
 define void @__masked_store_blend_i8(<8 x i8>* nocapture, <8 x i8>,
                                      <8 x i32>) nounwind alwaysinline {
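On LLVM releases newer than 3.0, the mask64 macro's blends reduce to a select: trunc to <4 x i1> keeps each lane's low bit, which for an all-ones/all-zeros mask equals its truth value. The i8 body expanded for that branch looks like this (a sketch assuming LLVM_VERSION is not LLVM_3_0):

    define void @__masked_store_blend_i8(<4 x i8>* nocapture, <4 x i8>,
                                         <4 x i64>) nounwind alwaysinline {
      %old = load <4 x i8> * %0, align 1
      %m = trunc <4 x i64> %2 to <4 x i1>
      %resultvec = select <4 x i1> %m, <4 x i8> %1, <4 x i8> %old
      store <4 x i8> %resultvec, <4 x i8> * %0, align 1
      ret void
    }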