From 4ab982bc16d509fd6ec5905c2d04f8a9e8ef41bc Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Fri, 26 Aug 2011 12:58:02 -0700 Subject: [PATCH] Various AVX fixes (found by inspection). Emit calls to masked_store, not masked_store_blend, when handling masked stores emitted by the frontend. Fix bug in binary8to16 macro in builtins.m4 Fix bug in 16-wide version of __reduce_add_float Remove blend function implementations for masked_store_blend for AVX; just forward those on to the corresponding real masked store functions. --- builtins-avx-x2.ll | 105 +++++---------------------------------------- builtins-avx.ll | 78 +++++++-------------------------- builtins.m4 | 44 +------------------ opt.cpp | 3 +- 4 files changed, 30 insertions(+), 200 deletions(-) diff --git a/builtins-avx-x2.ll b/builtins-avx-x2.ll index 4000425f..3c380e24 100644 --- a/builtins-avx-x2.ll +++ b/builtins-avx-x2.ll @@ -233,7 +233,7 @@ define internal float @__reduce_add_float(<16 x float>) nounwind readonly always %v2 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %v1, <8 x float> %v1) %v3 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %v2, <8 x float> %v2) %scalar1 = extractelement <8 x float> %v2, i32 0 - %scalar2 = extractelement <8 x float> %v2, i32 4 + %scalar2 = extractelement <8 x float> %v2, i32 1 %sum = fadd float %scalar1, %scalar2 ret float %sum } @@ -522,105 +522,22 @@ define void @__masked_store_64(<16 x i64>* nocapture, <16 x i64>, ret void } -masked_store_blend_8_16_by_16() -declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>, - <8 x float>) nounwind readnone +;; FIXME: various code elsewhere in the builtins implementations makes +;; calls to these, basically assuming that doing so is faster than doing +;; a full call to an actual masked store, which isn't likely to be the +;; case on AVX. So here we provide those functions but then don't actually +;; do what the caller asked for... 
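;; For reference, the blend form of a masked store is semantically just a
;; load / per-lane select / store keyed off the sign bit of each mask element
;; (the same bit vblendvps tests).  A minimal illustrative sketch of that
;; semantics follows; it is not part of this patch, and the name
;; @__masked_store_blend_sketch is hypothetical:

define internal void @__masked_store_blend_sketch(<16 x i32>* nocapture %ptr,
                                                  <16 x i32> %new,
                                                  <16 x i32> %mask) nounwind alwaysinline {
  ; illustrative sketch only: lanes whose mask element has its sign bit set
  ; take the new value, all other lanes keep the old value
  %old = load <16 x i32>* %ptr, align 4
  %on = icmp slt <16 x i32> %mask, zeroinitializer
  %blend = select <16 x i1> %on, <16 x i32> %new, <16 x i32> %old
  store <16 x i32> %blend, <16 x i32>* %ptr, align 4
  ret void
}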
- -define void @__masked_store_blend_32(<16 x i32>* nocapture, <16 x i32>, +define void @__masked_store_blend_32(<16 x i32>* nocapture, <16 x i32>, <16 x i32>) nounwind alwaysinline { - %maskAsFloat = bitcast <16 x i32> %2 to <16 x float> - %oldValue = load <16 x i32>* %0, align 4 - %oldAsFloat = bitcast <16 x i32> %oldValue to <16 x float> - %newAsFloat = bitcast <16 x i32> %1 to <16 x float> - - %old0 = shufflevector <16 x float> %oldAsFloat, <16 x float> undef, - <8 x i32> - %old1 = shufflevector <16 x float> %oldAsFloat, <16 x float> undef, - <8 x i32> - %new0 = shufflevector <16 x float> %newAsFloat, <16 x float> undef, - <8 x i32> - %new1 = shufflevector <16 x float> %newAsFloat, <16 x float> undef, - <8 x i32> - %mask0 = shufflevector <16 x float> %maskAsFloat, <16 x float> undef, - <8 x i32> - %mask1 = shufflevector <16 x float> %maskAsFloat, <16 x float> undef, - <8 x i32> - - %blend0 = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %old0, - <8 x float> %new0, - <8 x float> %mask0) - %blend1 = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %old1, - <8 x float> %new1, - <8 x float> %mask1) - %blend = shufflevector <8 x float> %blend0, <8 x float> %blend1, - <16 x i32> - %blendAsInt = bitcast <16 x float> %blend to <16 x i32> - store <16 x i32> %blendAsInt, <16 x i32>* %0, align 4 + call void @__masked_store_32(<16 x i32> * %0, <16 x i32> %1, <16 x i32> %2) ret void } - -declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>, - <4 x double>) nounwind readnone - -define void @__masked_store_blend_64(<16 x i64>* nocapture %ptr, <16 x i64> %newi64, - <16 x i32> %mask) nounwind alwaysinline { - %oldValue = load <16 x i64>* %ptr, align 8 - %old = bitcast <16 x i64> %oldValue to <16 x double> - %old0d = shufflevector <16 x double> %old, <16 x double> undef, - <4 x i32> - %old1d = shufflevector <16 x double> %old, <16 x double> undef, - <4 x i32> - %old2d = shufflevector <16 x double> %old, <16 x double> undef, - <4 x i32> - %old3d = shufflevector <16 x double> %old, <16 x double> undef, - <4 x i32> - - %new = bitcast <16 x i64> %newi64 to <16 x double> - %new0d = shufflevector <16 x double> %new, <16 x double> undef, - <4 x i32> - %new1d = shufflevector <16 x double> %new, <16 x double> undef, - <4 x i32> - %new2d = shufflevector <16 x double> %new, <16 x double> undef, - <4 x i32> - %new3d = shufflevector <16 x double> %new, <16 x double> undef, - <4 x i32> - - %mask0 = shufflevector <16 x i32> %mask, <16 x i32> undef, - <8 x i32> - %mask1 = shufflevector <16 x i32> %mask, <16 x i32> undef, - <8 x i32> - %mask2 = shufflevector <16 x i32> %mask, <16 x i32> undef, - <8 x i32> - %mask3 = shufflevector <16 x i32> %mask, <16 x i32> undef, - <8 x i32> - %mask0d = bitcast <8 x i32> %mask0 to <4 x double> - %mask1d = bitcast <8 x i32> %mask1 to <4 x double> - %mask2d = bitcast <8 x i32> %mask2 to <4 x double> - %mask3d = bitcast <8 x i32> %mask3 to <4 x double> - - %result0d = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %old0d, - <4 x double> %new0d, <4 x double> %mask0d) - %result1d = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %old1d, - <4 x double> %new1d, <4 x double> %mask1d) - %result2d = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %old2d, - <4 x double> %new2d, <4 x double> %mask2d) - %result3d = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %old3d, - <4 x double> %new3d, <4 x double> %mask3d) - - %result01 = shufflevector <4 x double> %result0d, <4 x double> %result1d, - <8 x i32> - 
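;; The reassembly in the removed code above is the usual two-level
;; shufflevector concatenation: pairs of <4 x double> blend results are
;; joined into <8 x double> values, which are then joined into the final
;; <16 x double>.  A minimal sketch of one such join, assuming two
;; <4 x double> halves; the helper name @__concat4to8_sketch is hypothetical
;; and not part of this patch:
;;
;; define internal <8 x double> @__concat4to8_sketch(<4 x double> %lo,
;;                                                   <4 x double> %hi) nounwind alwaysinline {
;;   ; lanes 0-3 of the result come from %lo, lanes 4-7 from %hi
;;   %r = shufflevector <4 x double> %lo, <4 x double> %hi,
;;        <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
;;   ret <8 x double> %r
;; }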
%result23 = shufflevector <4 x double> %result2d, <4 x double> %result3d, - <8 x i32> - - %result = shufflevector <8 x double> %result01, <8 x double> %result23, - <16 x i32> - %result64 = bitcast <16 x double> %result to <16 x i64> - store <16 x i64> %result64, <16 x i64> * %ptr +define void @__masked_store_blend_64(<16 x i64>* nocapture, <16 x i64>, + <16 x i32>) nounwind alwaysinline { + call void @__masked_store_64(<16 x i64> * %0, <16 x i64> %1, <16 x i32> %2) ret void } diff --git a/builtins-avx.ll b/builtins-avx.ll index 41089abf..e06bd87b 100644 --- a/builtins-avx.ll +++ b/builtins-avx.ll @@ -119,9 +119,11 @@ define internal <8 x float> @__rsqrt_varying_float(<8 x float> %v) nounwind read ; return 0.5 * is * (3. - (v * is) * is); %v_is = fmul <8 x float> %v, %is %v_is_is = fmul <8 x float> %v_is, %is - %three_sub = fsub <8 x float> , %v_is_is + %three_sub = fsub <8 x float> , %v_is_is %is_mul = fmul <8 x float> %is, %three_sub - %half_scale = fmul <8 x float> , %is_mul + %half_scale = fmul <8 x float> , %is_mul ret <8 x float> %half_scale } @@ -446,77 +448,27 @@ define void @__masked_store_64(<8 x i64>* nocapture, <8 x i64>, ret void } -masked_store_blend_8_16_by_8() -declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>, - <8 x float>) nounwind readnone +;; FIXME: various code elsewhere in the builtins implementations makes +;; calls to these, basically assuming that doing so is faster than doing +;; a full call to an actual masked store, which isn't likely to be the +;; case on AVX. So here we provide those functions but then don't actually +;; do what the caller asked for... - -define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>, +define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>, <8 x i32>) nounwind alwaysinline { - %mask_as_float = bitcast <8 x i32> %2 to <8 x float> - %oldValue = load <8 x i32>* %0, align 4 - %oldAsFloat = bitcast <8 x i32> %oldValue to <8 x float> - %newAsFloat = bitcast <8 x i32> %1 to <8 x float> - %blend = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %oldAsFloat, - <8 x float> %newAsFloat, - <8 x float> %mask_as_float) - %blendAsInt = bitcast <8 x float> %blend to <8 x i32> - store <8 x i32> %blendAsInt, <8 x i32>* %0, align 4 + call void @__masked_store_32(<8 x i32> * %0, <8 x i32> %1, <8 x i32> %2) ret void } -define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new, - <8 x i32> %i32mask) nounwind alwaysinline { - %oldValue = load <8 x i64>* %ptr, align 8 - %mask = bitcast <8 x i32> %i32mask to <8 x float> - - ; Do 4x64-bit blends by doing two <8 x i32> blends, where the <8 x i32> values - ; are actually bitcast <4 x i64> values - ; - ; set up the first four 64-bit values - %old01 = shufflevector <8 x i64> %oldValue, <8 x i64> undef, - <4 x i32> - %old01f = bitcast <4 x i64> %old01 to <8 x float> - %new01 = shufflevector <8 x i64> %new, <8 x i64> undef, - <4 x i32> - %new01f = bitcast <4 x i64> %new01 to <8 x float> - ; compute mask--note that the indices are all doubled-up - %mask01 = shufflevector <8 x float> %mask, <8 x float> undef, - <8 x i32> - ; and blend them - %result01f = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %old01f, - <8 x float> %new01f, - <8 x float> %mask01) - %result01 = bitcast <8 x float> %result01f to <4 x i64> - - ; and again - %old23 = shufflevector <8 x i64> %oldValue, <8 x i64> undef, - <4 x i32> - %old23f = bitcast <4 x i64> %old23 to <8 x float> - %new23 = shufflevector <8 x i64> %new, <8 x i64> undef, - <4 x i32> - 
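;; The "doubled-up" mask shuffles in the removed code here exist because
;; vblendvps selects each 32-bit lane on that lane's own sign bit: to blend
;; 64-bit elements viewed as <8 x float>, every 32-bit mask element has to be
;; repeated so that both halves of a 64-bit lane see the same sign bit.  A
;; minimal sketch of that mask expansion for the first four 64-bit lanes; the
;; helper name @__double_up_mask_sketch is hypothetical and not part of this
;; patch:
;;
;; define internal <8 x float> @__double_up_mask_sketch(<8 x float> %mask) nounwind alwaysinline {
;;   ; mask lane i (one per 64-bit element) is duplicated into lanes 2i, 2i+1
;;   %m01 = shufflevector <8 x float> %mask, <8 x float> undef,
;;          <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
;;   ret <8 x float> %m01
;; }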
%new23f = bitcast <4 x i64> %new23 to <8 x float> - ; compute mask--note that the values are doubled-up... - %mask23 = shufflevector <8 x float> %mask, <8 x float> undef, - <8 x i32> - ; and blend them - %result23f = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %old23f, - <8 x float> %new23f, - <8 x float> %mask23) - %result23 = bitcast <8 x float> %result23f to <4 x i64> - - ; reconstruct the final <8 x i64> vector - %final = shufflevector <4 x i64> %result01, <4 x i64> %result23, - <8 x i32> - store <8 x i64> %final, <8 x i64> * %ptr, align 8 +define void @__masked_store_blend_64(<8 x i64>* nocapture, <8 x i64>, + <8 x i32>) nounwind alwaysinline { + call void @__masked_store_64(<8 x i64> * %0, <8 x i64> %1, <8 x i32> %2) ret void } + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; gather/scatter diff --git a/builtins.m4 b/builtins.m4 index c8019847..59a7b6a3 100644 --- a/builtins.m4 +++ b/builtins.m4 @@ -300,10 +300,10 @@ define(`binary8to16', ` %$1_0a = shufflevector <16 x $2> $4, <16 x $2> undef, <8 x i32> %$1_0b = shufflevector <16 x $2> $5, <16 x $2> undef, - <8 x i32> + <8 x i32> %v$1_0 = call <8 x $2> $3(<8 x $2> %$1_0a, <8 x $2> %$1_0b) %$1_1a = shufflevector <16 x $2> $4, <16 x $2> undef, - <8 x i32> + <8 x i32> %$1_1b = shufflevector <16 x $2> $5, <16 x $2> undef, <8 x i32> %v$1_1 = call <8 x $2> $3(<8 x $2> %$1_1a, <8 x $2> %$1_1b) @@ -1438,46 +1438,6 @@ define void @__masked_store_blend_16(<8 x i16>* nocapture, <8 x i16>, } ') -define(`masked_store_blend_8_16_by_16', ` -define void @__masked_store_blend_8(<16 x i8>* nocapture, <16 x i8>, - <16 x i32>) nounwind alwaysinline { - %old = load <16 x i8> * %0 - %old128 = bitcast <16 x i8> %old to i128 - %new128 = bitcast <16 x i8> %1 to i128 - - %mask8 = trunc <16 x i32> %2 to <16 x i8> - %mask128 = bitcast <16 x i8> %mask8 to i128 - %notmask128 = xor i128 %mask128, -1 - - %newmasked = and i128 %new128, %mask128 - %oldmasked = and i128 %old128, %notmask128 - %result = or i128 %newmasked, %oldmasked - - %resultvec = bitcast i128 %result to <16 x i8> - store <16 x i8> %resultvec, <16 x i8> * %0 - ret void -} - -define void @__masked_store_blend_16(<16 x i16>* nocapture, <16 x i16>, - <16 x i32>) nounwind alwaysinline { - %old = load <16 x i16> * %0 - %old256 = bitcast <16 x i16> %old to i256 - %new256 = bitcast <16 x i16> %1 to i256 - - %mask16 = trunc <16 x i32> %2 to <16 x i16> - %mask256 = bitcast <16 x i16> %mask16 to i256 - %notmask256 = xor i256 %mask256, -1 - - %newmasked = and i256 %new256, %mask256 - %oldmasked = and i256 %old256, %notmask256 - %result = or i256 %newmasked, %oldmasked - - %resultvec = bitcast i256 %result to <16 x i16> - store <16 x i16> %resultvec, <16 x i16> * %0 - ret void -} -') - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; packed load and store functions diff --git a/opt.cpp b/opt.cpp index d4a6ce87..6d6c3d95 100644 --- a/opt.cpp +++ b/opt.cpp @@ -1422,7 +1422,8 @@ LowerMaskedStorePass::runOnBasicBlock(llvm::BasicBlock &bb) { // __masked_store_blend_* should be the same as __masked_store_*, // so this doesn't matter. On SSE, blending is generally more // efficient and is always safe to do on stack-allocated values.(?) - bool doBlend = lIsStackVariablePointer(lvalue); + bool doBlend = (g->target.isa != Target::AVX && + lIsStackVariablePointer(lvalue)); if (g->target.isa == Target::SSE4 || g->target.isa == Target::SSE2) doBlend |= !g->opt.disableBlendedMaskedStores;
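
For reference, the binary8to16 macro fixed above expands to roughly the
following splitting pattern: each 16-wide operand is broken into its low half
(lanes 0-7) and high half (lanes 8-15), the 8-wide operation is applied to
each pair of halves, and the two partial results are concatenated back into a
16-wide vector.  A minimal hand-written sketch of that pattern, using the real
@llvm.x86.avx.max.ps.256 intrinsic as a stand-in 8-wide operation; the
function name @__binary8to16_sketch is hypothetical and not produced by the
macro:

define internal <16 x float> @__binary8to16_sketch(<16 x float> %a,
                                                   <16 x float> %b) nounwind alwaysinline {
  ; low halves: lanes 0..7 of each operand
  %a_lo = shufflevector <16 x float> %a, <16 x float> undef,
          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %b_lo = shufflevector <16 x float> %b, <16 x float> undef,
          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %r_lo = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %a_lo, <8 x float> %b_lo)
  ; high halves: lanes 8..15 of each operand
  %a_hi = shufflevector <16 x float> %a, <16 x float> undef,
          <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %b_hi = shufflevector <16 x float> %b, <16 x float> undef,
          <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %r_hi = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %a_hi, <8 x float> %b_hi)
  ; concatenate the two 8-wide results back into a 16-wide result
  %r = shufflevector <8 x float> %r_lo, <8 x float> %r_hi,
       <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                   i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x float> %r
}

declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind readnone

Whatever half-extraction constants the macro emits, together they need to
cover each of the 16 input lanes exactly once; that is the property at stake
in the binary8to16 fix above.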