diff --git a/builtins-avx-x2.ll b/builtins-avx-x2.ll
index 5128030a..d263fe25 100644
--- a/builtins-avx-x2.ll
+++ b/builtins-avx-x2.ll
@@ -523,35 +523,104 @@ define void @__masked_store_64(<16 x i64>* nocapture, <16 x i64>,
 }
 
 
-;; FIXME: various code elsewhere in the builtins implementations makes
-;; calls to the 32/64 bit versions of these, basically assuming that doing
-;; so is faster than doing a full call to an actual masked store, which
-;; isn't likely to be the case on AVX.  So here we provide those functions
-;; but then don't actually do what the caller asked for...
+masked_store_blend_8_16_by_16()
 
-declare void @llvm.trap()
-
-define void @__masked_store_blend_8(<8 x i8>* nocapture, <8 x i8>,
-                                    <8 x i32>) nounwind alwaysinline {
-  call void @llvm.trap()
-  ret void
-}
-
-define void @__masked_store_blend_16(<8 x i16>* nocapture, <8 x i16>,
-                                     <8 x i32>) nounwind alwaysinline {
-  call void @llvm.trap()
-  ret void
-}
+declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>,
+                                                <8 x float>) nounwind readnone
 
 define void @__masked_store_blend_32(<16 x i32>* nocapture, <16 x i32>,
                                      <16 x i32>) nounwind alwaysinline {
-  call void @__masked_store_32(<16 x i32> * %0, <16 x i32> %1, <16 x i32> %2)
+  %maskAsFloat = bitcast <16 x i32> %2 to <16 x float>
+  %oldValue = load <16 x i32>* %0, align 4
+  %oldAsFloat = bitcast <16 x i32> %oldValue to <16 x float>
+  %newAsFloat = bitcast <16 x i32> %1 to <16 x float>
+
+  %old0 = shufflevector <16 x float> %oldAsFloat, <16 x float> undef,
+          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %old1 = shufflevector <16 x float> %oldAsFloat, <16 x float> undef,
+          <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %new0 = shufflevector <16 x float> %newAsFloat, <16 x float> undef,
+          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %new1 = shufflevector <16 x float> %newAsFloat, <16 x float> undef,
+          <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %mask0 = shufflevector <16 x float> %maskAsFloat, <16 x float> undef,
+          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %mask1 = shufflevector <16 x float> %maskAsFloat, <16 x float> undef,
+          <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+
+  %blend0 = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %old0,
+                                                         <8 x float> %new0,
+                                                         <8 x float> %mask0)
+  %blend1 = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %old1,
+                                                         <8 x float> %new1,
+                                                         <8 x float> %mask1)
+  %blend = shufflevector <8 x float> %blend0, <8 x float> %blend1,
+           <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                       i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %blendAsInt = bitcast <16 x float> %blend to <16 x i32>
+  store <16 x i32> %blendAsInt, <16 x i32>* %0, align 4
   ret void
 }
 
-define void @__masked_store_blend_64(<16 x i64>* nocapture, <16 x i64>,
-                                     <16 x i32>) nounwind alwaysinline {
-  call void @__masked_store_64(<16 x i64> * %0, <16 x i64> %1, <16 x i32> %2)
+
+declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>,
+                                                 <4 x double>) nounwind readnone
+
+define void @__masked_store_blend_64(<16 x i64>* nocapture %ptr, <16 x i64> %newi64,
+                                     <16 x i32> %mask) nounwind alwaysinline {
+  %oldValue = load <16 x i64>* %ptr, align 8
+  %old = bitcast <16 x i64> %oldValue to <16 x double>
+  %old0d = shufflevector <16 x double> %old, <16 x double> undef,
+           <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %old1d = shufflevector <16 x double> %old, <16 x double> undef,
+           <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %old2d = shufflevector <16 x double> %old, <16 x double> undef,
+           <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+  %old3d = shufflevector <16 x double> %old, <16 x double> undef,
+           <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+
+  %new = bitcast <16 x i64> %newi64 to <16 x double>
+  %new0d = shufflevector <16 x double> %new, <16 x double> undef,
+           <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %new1d = shufflevector <16 x double> %new, <16 x double> undef,
+           <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %new2d = shufflevector <16 x double> %new, <16 x double> undef,
+           <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+  %new3d = shufflevector <16 x double> %new, <16 x double> undef,
+           <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+
+  %mask0 = shufflevector <16 x i32> %mask, <16 x i32> undef,
+           <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
+  %mask1 = shufflevector <16 x i32> %mask, <16 x i32> undef,
+           <8 x i32> <i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
+  %mask2 = shufflevector <16 x i32> %mask, <16 x i32> undef,
+           <8 x i32> <i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11>
+  %mask3 = shufflevector <16 x i32> %mask, <16 x i32> undef,
+           <8 x i32> <i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
+  %mask0d = bitcast <8 x i32> %mask0 to <4 x double>
+  %mask1d = bitcast <8 x i32> %mask1 to <4 x double>
+  %mask2d = bitcast <8 x i32> %mask2 to <4 x double>
+  %mask3d = bitcast <8 x i32> %mask3 to <4 x double>
+
+  %result0d = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %old0d,
+                   <4 x double> %new0d, <4 x double> %mask0d)
+  %result1d = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %old1d,
+                   <4 x double> %new1d, <4 x double> %mask1d)
+  %result2d = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %old2d,
+                   <4 x double> %new2d, <4 x double> %mask2d)
+  %result3d = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %old3d,
+                   <4 x double> %new3d, <4 x double> %mask3d)
+
+  %result01 = shufflevector <4 x double> %result0d, <4 x double> %result1d,
+              <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %result23 = shufflevector <4 x double> %result2d, <4 x double> %result3d,
+              <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+
+  %result = shufflevector <8 x double> %result01, <8 x double> %result23,
+            <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                        i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %result64 = bitcast <16 x double> %result to <16 x i64>
+  store <16 x i64> %result64, <16 x i64> * %ptr
   ret void
 }
diff --git a/builtins-avx.ll b/builtins-avx.ll
index 2cfe3a81..55f0e2da 100644
--- a/builtins-avx.ll
+++ b/builtins-avx.ll
@@ -450,38 +450,74 @@ define void @__masked_store_64(<8 x i64>* nocapture, <8 x i64>,
 }
 
 
-;; FIXME: various code elsewhere in the builtins implementations makes
-;; calls to the 32/64 bit versions of these, basically assuming that doing
-;; so is faster than doing a full call to an actual masked store, which
-;; isn't likely to be the case on AVX.  So here we provide those functions
-;; but then don't actually do what the caller asked for...
-
-declare void @llvm.trap()
-
-define void @__masked_store_blend_8(<8 x i8>* nocapture, <8 x i8>,
-                                    <8 x i32>) nounwind alwaysinline {
-  call void @llvm.trap()
-  ret void
-}
-
-
-define void @__masked_store_blend_16(<8 x i16>* nocapture, <8 x i16>,
-                                     <8 x i32>) nounwind alwaysinline {
-  call void @llvm.trap()
-  ret void
-}
+masked_store_blend_8_16_by_8()
 
+declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>,
+                                                <8 x float>) nounwind readnone
 
 define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
                                      <8 x i32>) nounwind alwaysinline {
-  call void @__masked_store_32(<8 x i32> * %0, <8 x i32> %1, <8 x i32> %2)
+  %mask_as_float = bitcast <8 x i32> %2 to <8 x float>
+  %oldValue = load <8 x i32>* %0, align 4
+  %oldAsFloat = bitcast <8 x i32> %oldValue to <8 x float>
+  %newAsFloat = bitcast <8 x i32> %1 to <8 x float>
+  %blend = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %oldAsFloat,
+                                                        <8 x float> %newAsFloat,
+                                                        <8 x float> %mask_as_float)
+  %blendAsInt = bitcast <8 x float> %blend to <8 x i32>
+  store <8 x i32> %blendAsInt, <8 x i32>* %0, align 4
   ret void
 }
 
-define void @__masked_store_blend_64(<8 x i64>* nocapture, <8 x i64>,
-                                     <8 x i32>) nounwind alwaysinline {
-  call void @__masked_store_64(<8 x i64> * %0, <8 x i64> %1, <8 x i32> %2)
+define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
+                                     <8 x i32> %i32mask) nounwind alwaysinline {
+  %oldValue = load <8 x i64>* %ptr, align 8
+  %mask = bitcast <8 x i32> %i32mask to <8 x float>
+
+  ; Do 4x64-bit blends by doing two <8 x i32> blends, where the <8 x i32> values
+  ; are actually bitcast <4 x i64> values
+  ;
+  ; set up the first four 64-bit values
+  %old01 = shufflevector <8 x i64> %oldValue, <8 x i64> undef,
+           <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %old01f = bitcast <4 x i64> %old01 to <8 x float>
+  %new01 = shufflevector <8 x i64> %new, <8 x i64> undef,
+           <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %new01f = bitcast <4 x i64> %new01 to <8 x float>
+  ; compute mask--note that the indices are all doubled-up
+  %mask01 = shufflevector <8 x float> %mask, <8 x float> undef,
+            <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
+  ; and blend them
+  %result01f = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %old01f,
+                                                            <8 x float> %new01f,
+                                                            <8 x float> %mask01)
+  %result01 = bitcast <8 x float> %result01f to <4 x i64>
+
+  ; and again
+  %old23 = shufflevector <8 x i64> %oldValue, <8 x i64> undef,
+           <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %old23f = bitcast <4 x i64> %old23 to <8 x float>
+  %new23 = shufflevector <8 x i64> %new, <8 x i64> undef,
+           <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %new23f = bitcast <4 x i64> %new23 to <8 x float>
+  ; compute mask--note that the values are doubled-up...
+  %mask23 = shufflevector <8 x float> %mask, <8 x float> undef,
+            <8 x i32> <i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
+  ; and blend them
+  %result23f = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %old23f,
+                                                            <8 x float> %new23f,
+                                                            <8 x float> %mask23)
+  %result23 = bitcast <8 x float> %result23f to <4 x i64>
+
+  ; reconstruct the final <8 x i64> vector
+  %final = shufflevector <4 x i64> %result01, <4 x i64> %result23,
+           <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  store <8 x i64> %final, <8 x i64> * %ptr, align 8
   ret void
 }
diff --git a/builtins.m4 b/builtins.m4
index 714a2bd5..b3a1da0e 100644
--- a/builtins.m4
+++ b/builtins.m4
@@ -1517,6 +1517,46 @@ define void @__masked_store_blend_16(<8 x i16>* nocapture, <8 x i16>,
 ')
 
 
+define(`masked_store_blend_8_16_by_16', `
+define void @__masked_store_blend_8(<16 x i8>* nocapture, <16 x i8>,
+                                    <16 x i32>) nounwind alwaysinline {
+  %old = load <16 x i8> * %0
+  %old128 = bitcast <16 x i8> %old to i128
+  %new128 = bitcast <16 x i8> %1 to i128
+
+  %mask8 = trunc <16 x i32> %2 to <16 x i8>
+  %mask128 = bitcast <16 x i8> %mask8 to i128
+  %notmask128 = xor i128 %mask128, -1
+
+  %newmasked = and i128 %new128, %mask128
+  %oldmasked = and i128 %old128, %notmask128
+  %result = or i128 %newmasked, %oldmasked
+
+  %resultvec = bitcast i128 %result to <16 x i8>
+  store <16 x i8> %resultvec, <16 x i8> * %0
+  ret void
+}
+
+define void @__masked_store_blend_16(<16 x i16>* nocapture, <16 x i16>,
+                                     <16 x i32>) nounwind alwaysinline {
+  %old = load <16 x i16> * %0
+  %old256 = bitcast <16 x i16> %old to i256
+  %new256 = bitcast <16 x i16> %1 to i256
+
+  %mask16 = trunc <16 x i32> %2 to <16 x i16>
+  %mask256 = bitcast <16 x i16> %mask16 to i256
+  %notmask256 = xor i256 %mask256, -1
+
+  %newmasked = and i256 %new256, %mask256
+  %oldmasked = and i256 %old256, %notmask256
+  %result = or i256 %newmasked, %oldmasked
+
+  %resultvec = bitcast i256 %result to <16 x i16>
+  store <16 x i16> %resultvec, <16 x i16> * %0
+  ret void
+}
+')
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; packed load and store functions
 ;;
diff --git a/opt.cpp b/opt.cpp
index 135eb35d..7dfa38d1 100644
--- a/opt.cpp
+++ b/opt.cpp
@@ -1433,16 +1433,12 @@ LowerMaskedStorePass::runOnBasicBlock(llvm::BasicBlock &bb) {
         llvm::Value *rvalue = callInst->getArgOperand(1);
         llvm::Value *mask = callInst->getArgOperand(2);
 
-        // On SSE, we need to choose between doing the load + blend + store
-        // trick, or serializing the masked store.  On targets with a
-        // native masked store instruction, the implementations of
-        // __masked_store_blend_* should be the same as __masked_store_*,
-        // so this doesn't matter.  On SSE, blending is generally more
-        // efficient and is always safe to do on stack-allocated values.(?)
-        bool doBlend = (g->target.isa != Target::AVX &&
+        // We need to choose between doing the load + blend + store trick,
+        // or serializing the masked store.  Even on targets with a native
+        // masked store instruction, this is preferable since it lets us
+        // keep values in registers rather than going out to the stack.
+        bool doBlend = (!g->opt.disableBlendedMaskedStores ||
                         lIsStackVariablePointer(lvalue));
-        if (g->target.isa == Target::SSE4 || g->target.isa == Target::SSE2)
-            doBlend |= !g->opt.disableBlendedMaskedStores;
 
         // Generate the call to the appropriate masked store function and
         // replace the __pseudo_* one with it.
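
Reviewer note, not part of the patch: the new __masked_store_blend_32/_64 bodies above all do a load + blend + store, where vblendvps/vblendvpd take each lane from the new value when the most significant bit of the corresponding mask element is set and from the old memory contents otherwise (ispc mask lanes are all-zeros or all-ones, so the MSB is enough). A minimal scalar C sketch of that per-lane behavior, with illustrative names only:

    #include <stdint.h>

    /* Reference semantics for the blendvps-based masked store: for each
       32-bit lane, keep the old destination value where the mask's MSB is
       clear and take the new value where it is set. */
    static void masked_store_blend_32_ref(int32_t *ptr, const int32_t *newval,
                                          const int32_t *mask, int width) {
        for (int i = 0; i < width; ++i) {
            int32_t old = ptr[i];                        /* load old contents   */
            int take_new = ((uint32_t)mask[i]) >> 31;    /* blendv tests the MSB */
            ptr[i] = take_new ? newval[i] : old;         /* blend, store back    */
        }
    }

Because this reads and rewrites every lane, it is only appropriate where touching the masked-off lanes is safe, which is what the lIsStackVariablePointer() check in the opt.cpp hunk is guarding.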
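Likewise, the masked_store_blend_8_16_by_16() macro added to builtins.m4 blends 8- and 16-bit elements without a blend instruction: it truncates each 32-bit mask lane to the element width (safe since lanes are all-0s or all-1s), bitcasts the whole vector to one wide integer, and does a bitwise select. The same identity on a plain 64-bit word, as a sketch:

    #include <stdint.h>

    /* Bitwise select used by the 8/16-bit paths: new where mask bits are set,
       old where they are clear.  The .ll code applies this to i128/i256 values. */
    static uint64_t bitwise_blend(uint64_t oldbits, uint64_t newbits, uint64_t mask) {
        return (newbits & mask) | (oldbits & ~mask);
    }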