AVX: go back to using blend (vs. masked store) when possible.
All of the masked store calls were preventing values from being kept in registers, which in turn led to a lot of unnecessary stack traffic. The blend-based approach seems to give better code in the end.
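For context, here is a minimal sketch of the load + blend + store idiom the builtins below switch to, shown for an 8-wide 32-bit case. This example is not part of the commit: the function name is illustrative only, and the IR uses the same older typed-pointer syntax as the diffs that follow.

declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>,
                                                <8 x float>) nounwind readnone

;; Hypothetical example: store %new to %ptr only in the "on" mask lanes.
;; Read the old contents, let vblendvps select %new wherever the mask
;; lane's sign bit is set (and %old elsewhere), then write the result
;; back with a regular store.  Everything stays visible to the register
;; allocator, unlike an opaque masked-store call.
define void @example_masked_store_via_blend(<8 x float>* %ptr, <8 x float> %new,
                                            <8 x float> %mask) {
  %old = load <8 x float>* %ptr, align 4
  %blend = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %old,
                                                        <8 x float> %new,
                                                        <8 x float> %mask)
  store <8 x float> %blend, <8 x float>* %ptr, align 4
  ret void
}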
@@ -523,35 +523,104 @@ define void @__masked_store_64(<16 x i64>* nocapture, <16 x i64>,
 }
 
-;; FIXME: various code elsewhere in the builtins implementations makes
-;; calls to the 32/64 bit versions of these, basically assuming that doing
-;; so is faster than doing a full call to an actual masked store, which
-;; isn't likely to be the case on AVX.  So here we provide those functions
-;; but then don't actually do what the caller asked for...
-
-declare void @llvm.trap()
-
-define void @__masked_store_blend_8(<8 x i8>* nocapture, <8 x i8>,
-                                    <8 x i32>) nounwind alwaysinline {
-  call void @llvm.trap()
-  ret void
-}
-
-define void @__masked_store_blend_16(<8 x i16>* nocapture, <8 x i16>,
-                                     <8 x i32>) nounwind alwaysinline {
-  call void @llvm.trap()
-  ret void
-}
+masked_store_blend_8_16_by_16()
+
+declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>,
+                                                <8 x float>) nounwind readnone
 
 define void @__masked_store_blend_32(<16 x i32>* nocapture, <16 x i32>,
                                      <16 x i32>) nounwind alwaysinline {
-  call void @__masked_store_32(<16 x i32> * %0, <16 x i32> %1, <16 x i32> %2)
+  %maskAsFloat = bitcast <16 x i32> %2 to <16 x float>
+  %oldValue = load <16 x i32>* %0, align 4
+  %oldAsFloat = bitcast <16 x i32> %oldValue to <16 x float>
+  %newAsFloat = bitcast <16 x i32> %1 to <16 x float>
+
+  %old0 = shufflevector <16 x float> %oldAsFloat, <16 x float> undef,
+          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %old1 = shufflevector <16 x float> %oldAsFloat, <16 x float> undef,
+          <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %new0 = shufflevector <16 x float> %newAsFloat, <16 x float> undef,
+          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %new1 = shufflevector <16 x float> %newAsFloat, <16 x float> undef,
+          <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %mask0 = shufflevector <16 x float> %maskAsFloat, <16 x float> undef,
+           <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %mask1 = shufflevector <16 x float> %maskAsFloat, <16 x float> undef,
+           <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+
+  %blend0 = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %old0,
+                                                         <8 x float> %new0,
+                                                         <8 x float> %mask0)
+  %blend1 = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %old1,
+                                                         <8 x float> %new1,
+                                                         <8 x float> %mask1)
+  %blend = shufflevector <8 x float> %blend0, <8 x float> %blend1,
+           <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                       i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %blendAsInt = bitcast <16 x float> %blend to <16 x i32>
+  store <16 x i32> %blendAsInt, <16 x i32>* %0, align 4
   ret void
 }
 
-define void @__masked_store_blend_64(<16 x i64>* nocapture, <16 x i64>,
-                                     <16 x i32>) nounwind alwaysinline {
-  call void @__masked_store_64(<16 x i64> * %0, <16 x i64> %1, <16 x i32> %2)
+declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>,
+                                                 <4 x double>) nounwind readnone
+
+define void @__masked_store_blend_64(<16 x i64>* nocapture %ptr, <16 x i64> %newi64,
+                                     <16 x i32> %mask) nounwind alwaysinline {
+  %oldValue = load <16 x i64>* %ptr, align 8
+  %old = bitcast <16 x i64> %oldValue to <16 x double>
+  %old0d = shufflevector <16 x double> %old, <16 x double> undef,
+           <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %old1d = shufflevector <16 x double> %old, <16 x double> undef,
+           <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %old2d = shufflevector <16 x double> %old, <16 x double> undef,
+           <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+  %old3d = shufflevector <16 x double> %old, <16 x double> undef,
+           <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+
+  %new = bitcast <16 x i64> %newi64 to <16 x double>
+  %new0d = shufflevector <16 x double> %new, <16 x double> undef,
+           <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %new1d = shufflevector <16 x double> %new, <16 x double> undef,
+           <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %new2d = shufflevector <16 x double> %new, <16 x double> undef,
+           <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+  %new3d = shufflevector <16 x double> %new, <16 x double> undef,
+           <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+
+  %mask0 = shufflevector <16 x i32> %mask, <16 x i32> undef,
+           <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
+  %mask1 = shufflevector <16 x i32> %mask, <16 x i32> undef,
+           <8 x i32> <i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
+  %mask2 = shufflevector <16 x i32> %mask, <16 x i32> undef,
+           <8 x i32> <i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11>
+  %mask3 = shufflevector <16 x i32> %mask, <16 x i32> undef,
+           <8 x i32> <i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
+  %mask0d = bitcast <8 x i32> %mask0 to <4 x double>
+  %mask1d = bitcast <8 x i32> %mask1 to <4 x double>
+  %mask2d = bitcast <8 x i32> %mask2 to <4 x double>
+  %mask3d = bitcast <8 x i32> %mask3 to <4 x double>
+
+  %result0d = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %old0d,
+                                <4 x double> %new0d, <4 x double> %mask0d)
+  %result1d = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %old1d,
+                                <4 x double> %new1d, <4 x double> %mask1d)
+  %result2d = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %old2d,
+                                <4 x double> %new2d, <4 x double> %mask2d)
+  %result3d = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %old3d,
+                                <4 x double> %new3d, <4 x double> %mask3d)
+
+  %result01 = shufflevector <4 x double> %result0d, <4 x double> %result1d,
+              <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %result23 = shufflevector <4 x double> %result2d, <4 x double> %result3d,
+              <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+
+  %result = shufflevector <8 x double> %result01, <8 x double> %result23,
+            <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                        i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %result64 = bitcast <16 x double> %result to <16 x i64>
+  store <16 x i64> %result64, <16 x i64> * %ptr
   ret void
 }
 
@@ -450,38 +450,74 @@ define void @__masked_store_64(<8 x i64>* nocapture, <8 x i64>,
 }
 
-;; FIXME: various code elsewhere in the builtins implementations makes
-;; calls to the 32/64 bit versions of these, basically assuming that doing
-;; so is faster than doing a full call to an actual masked store, which
-;; isn't likely to be the case on AVX.  So here we provide those functions
-;; but then don't actually do what the caller asked for...
-
-declare void @llvm.trap()
-
-define void @__masked_store_blend_8(<8 x i8>* nocapture, <8 x i8>,
-                                    <8 x i32>) nounwind alwaysinline {
-  call void @llvm.trap()
-  ret void
-}
-
-define void @__masked_store_blend_16(<8 x i16>* nocapture, <8 x i16>,
-                                     <8 x i32>) nounwind alwaysinline {
-  call void @llvm.trap()
-  ret void
-}
+masked_store_blend_8_16_by_8()
+
+declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>,
+                                                <8 x float>) nounwind readnone
 
 define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
                                      <8 x i32>) nounwind alwaysinline {
-  call void @__masked_store_32(<8 x i32> * %0, <8 x i32> %1, <8 x i32> %2)
+  %mask_as_float = bitcast <8 x i32> %2 to <8 x float>
+  %oldValue = load <8 x i32>* %0, align 4
+  %oldAsFloat = bitcast <8 x i32> %oldValue to <8 x float>
+  %newAsFloat = bitcast <8 x i32> %1 to <8 x float>
+  %blend = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %oldAsFloat,
+                                                        <8 x float> %newAsFloat,
+                                                        <8 x float> %mask_as_float)
+  %blendAsInt = bitcast <8 x float> %blend to <8 x i32>
+  store <8 x i32> %blendAsInt, <8 x i32>* %0, align 4
   ret void
 }
 
-define void @__masked_store_blend_64(<8 x i64>* nocapture, <8 x i64>,
-                                     <8 x i32>) nounwind alwaysinline {
-  call void @__masked_store_64(<8 x i64> * %0, <8 x i64> %1, <8 x i32> %2)
+define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
+                                     <8 x i32> %i32mask) nounwind alwaysinline {
+  %oldValue = load <8 x i64>* %ptr, align 8
+  %mask = bitcast <8 x i32> %i32mask to <8 x float>
+
+  ; Do 4x64-bit blends by doing two <8 x i32> blends, where the <8 x i32> values
+  ; are actually bitcast <4 x i64> values
+  ;
+  ; set up the first four 64-bit values
+  %old01 = shufflevector <8 x i64> %oldValue, <8 x i64> undef,
+           <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %old01f = bitcast <4 x i64> %old01 to <8 x float>
+  %new01 = shufflevector <8 x i64> %new, <8 x i64> undef,
+           <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %new01f = bitcast <4 x i64> %new01 to <8 x float>
+  ; compute mask--note that the indices are all doubled-up
+  %mask01 = shufflevector <8 x float> %mask, <8 x float> undef,
+            <8 x i32> <i32 0, i32 0, i32 1, i32 1,
+                       i32 2, i32 2, i32 3, i32 3>
+  ; and blend them
+  %result01f = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %old01f,
+                                                            <8 x float> %new01f,
+                                                            <8 x float> %mask01)
+  %result01 = bitcast <8 x float> %result01f to <4 x i64>
+
+  ; and again
+  %old23 = shufflevector <8 x i64> %oldValue, <8 x i64> undef,
+           <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %old23f = bitcast <4 x i64> %old23 to <8 x float>
+  %new23 = shufflevector <8 x i64> %new, <8 x i64> undef,
+           <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %new23f = bitcast <4 x i64> %new23 to <8 x float>
+  ; compute mask--note that the values are doubled-up...
+  %mask23 = shufflevector <8 x float> %mask, <8 x float> undef,
+            <8 x i32> <i32 4, i32 4, i32 5, i32 5,
+                       i32 6, i32 6, i32 7, i32 7>
+  ; and blend them
+  %result23f = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %old23f,
+                                                            <8 x float> %new23f,
+                                                            <8 x float> %mask23)
+  %result23 = bitcast <8 x float> %result23f to <4 x i64>
+
+  ; reconstruct the final <8 x i64> vector
+  %final = shufflevector <4 x i64> %result01, <4 x i64> %result23,
+           <8 x i32> <i32 0, i32 1, i32 2, i32 3,
+                      i32 4, i32 5, i32 6, i32 7>
+  store <8 x i64> %final, <8 x i64> * %ptr, align 8
   ret void
 }
 
builtins.m4 (40 changed lines)
@@ -1517,6 +1517,46 @@ define void @__masked_store_blend_16(<8 x i16>* nocapture, <8 x i16>,
 ')
 
+
+define(`masked_store_blend_8_16_by_16', `
+define void @__masked_store_blend_8(<16 x i8>* nocapture, <16 x i8>,
+                                    <16 x i32>) nounwind alwaysinline {
+  %old = load <16 x i8> * %0
+  %old128 = bitcast <16 x i8> %old to i128
+  %new128 = bitcast <16 x i8> %1 to i128
+
+  %mask8 = trunc <16 x i32> %2 to <16 x i8>
+  %mask128 = bitcast <16 x i8> %mask8 to i128
+  %notmask128 = xor i128 %mask128, -1
+
+  %newmasked = and i128 %new128, %mask128
+  %oldmasked = and i128 %old128, %notmask128
+  %result = or i128 %newmasked, %oldmasked
+
+  %resultvec = bitcast i128 %result to <16 x i8>
+  store <16 x i8> %resultvec, <16 x i8> * %0
+  ret void
+}
+
+define void @__masked_store_blend_16(<16 x i16>* nocapture, <16 x i16>,
+                                     <16 x i32>) nounwind alwaysinline {
+  %old = load <16 x i16> * %0
+  %old256 = bitcast <16 x i16> %old to i256
+  %new256 = bitcast <16 x i16> %1 to i256
+
+  %mask16 = trunc <16 x i32> %2 to <16 x i16>
+  %mask256 = bitcast <16 x i16> %mask16 to i256
+  %notmask256 = xor i256 %mask256, -1
+
+  %newmasked = and i256 %new256, %mask256
+  %oldmasked = and i256 %old256, %notmask256
+  %result = or i256 %newmasked, %oldmasked
+
+  %resultvec = bitcast i256 %result to <16 x i16>
+  store <16 x i16> %resultvec, <16 x i16> * %0
+  ret void
+}
+')
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; packed load and store functions
 ;;
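The two macros above implement the blend as a wide-integer bitwise select, result = (new & mask) | (old & ~mask), which is correct because each mask lane is either all ones or all zeros. A standalone sketch of the same identity at <8 x i32> width (the function name is hypothetical, not from the commit):

define <8 x i32> @example_bitwise_select(<8 x i32> %old, <8 x i32> %new,
                                         <8 x i32> %mask) {
  ; assumes each %mask element is 0 or -1 (all bits set)
  %notmask = xor <8 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1,
                                   i32 -1, i32 -1, i32 -1, i32 -1>
  %newmasked = and <8 x i32> %new, %mask      ; keep new values in on lanes
  %oldmasked = and <8 x i32> %old, %notmask   ; keep old values in off lanes
  %result = or <8 x i32> %newmasked, %oldmasked
  ret <8 x i32> %result
}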
opt.cpp (14 changed lines)
@@ -1433,16 +1433,12 @@ LowerMaskedStorePass::runOnBasicBlock(llvm::BasicBlock &bb) {
         llvm::Value *rvalue = callInst->getArgOperand(1);
         llvm::Value *mask = callInst->getArgOperand(2);
 
-        // On SSE, we need to choose between doing the load + blend + store
-        // trick, or serializing the masked store.  On targets with a
-        // native masked store instruction, the implementations of
-        // __masked_store_blend_* should be the same as __masked_store_*,
-        // so this doesn't matter.  On SSE, blending is generally more
-        // efficient and is always safe to do on stack-allocated values.(?)
-        bool doBlend = (g->target.isa != Target::AVX &&
-                        lIsStackVariablePointer(lvalue));
-        if (g->target.isa == Target::SSE4 || g->target.isa == Target::SSE2)
-            doBlend |= !g->opt.disableBlendedMaskedStores;
+        // We need to choose between doing the load + blend + store trick,
+        // or serializing the masked store.  Even on targets with a native
+        // masked store instruction, this is preferable since it lets us
+        // keep values in registers rather than going out to the stack.
+        bool doBlend = (!g->opt.disableBlendedMaskedStores ||
+                        lIsStackVariablePointer(lvalue));
 
         // Generate the call to the appropriate masked store function and
         // replace the __pseudo_* one with it.
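To make the rewrite concrete, here is a hypothetical sketch of what this pass does when doBlend is true. The __pseudo_masked_store_32 and example_* names are assumed for illustration; the diff above refers to the placeholder only as __pseudo_*.

declare void @__pseudo_masked_store_32(<8 x i32>*, <8 x i32>, <8 x i32>)
declare void @__masked_store_blend_32(<8 x i32>*, <8 x i32>, <8 x i32>)

;; before the pass: the front end's placeholder masked store
define void @example_before(<8 x i32>* %lvalue, <8 x i32> %rvalue,
                            <8 x i32> %mask) {
  call void @__pseudo_masked_store_32(<8 x i32>* %lvalue, <8 x i32> %rvalue,
                                      <8 x i32> %mask)
  ret void
}

;; after the pass, with doBlend true: the same call, retargeted at the
;; blend-based implementation defined in the builtins above
define void @example_after(<8 x i32>* %lvalue, <8 x i32> %rvalue,
                           <8 x i32> %mask) {
  call void @__masked_store_blend_32(<8 x i32>* %lvalue, <8 x i32> %rvalue,
                                     <8 x i32> %mask)
  ret void
}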