AVX: go back to using blend (vs. masked store) when possible.
All of the masked store calls were preventing values from being kept in registers, which in turn led to a lot of unnecessary stack traffic. The blend-based approach seems to give better code in the end.
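For context, here is a minimal sketch of the load + blend + store idiom the builtins below switch to, shown for an 8-wide 32-bit case. This example is not part of the commit: the function name is illustrative only, and the IR uses the same older typed-pointer syntax as the diffs that follow.

declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>,
                                                <8 x float>) nounwind readnone

;; Hypothetical example: store %new to %ptr only in the "on" mask lanes.
;; Read the old contents, let vblendvps select %new wherever the mask
;; lane's sign bit is set (and %old elsewhere), then write the result
;; back with a regular store.  Everything stays visible to the register
;; allocator, unlike an opaque masked-store call.
define void @example_masked_store_via_blend(<8 x float>* %ptr, <8 x float> %new,
                                            <8 x float> %mask) {
  %old = load <8 x float>* %ptr, align 4
  %blend = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %old,
                                                        <8 x float> %new,
                                                        <8 x float> %mask)
  store <8 x float> %blend, <8 x float>* %ptr, align 4
  ret void
}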
@@ -523,35 +523,104 @@ define void @__masked_store_64(<16 x i64>* nocapture, <16 x i64>,
 }
 
-;; FIXME: various code elsewhere in the builtins implementations makes
-;; calls to the 32/64 bit versions of these, basically assuming that doing
-;; so is faster than doing a full call to an actual masked store, which
-;; isn't likely to be the case on AVX.  So here we provide those functions
-;; but then don't actually do what the caller asked for...
-
-declare void @llvm.trap()
-
-define void @__masked_store_blend_8(<8 x i8>* nocapture, <8 x i8>,
-                                    <8 x i32>) nounwind alwaysinline {
-  call void @llvm.trap()
-  ret void
-}
-
-define void @__masked_store_blend_16(<8 x i16>* nocapture, <8 x i16>,
-                                     <8 x i32>) nounwind alwaysinline {
-  call void @llvm.trap()
-  ret void
-}
+masked_store_blend_8_16_by_16()
+
+declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>,
+                                                <8 x float>) nounwind readnone
 
 define void @__masked_store_blend_32(<16 x i32>* nocapture, <16 x i32>,
                                      <16 x i32>) nounwind alwaysinline {
-  call void @__masked_store_32(<16 x i32> * %0, <16 x i32> %1, <16 x i32> %2)
+  %maskAsFloat = bitcast <16 x i32> %2 to <16 x float>
+  %oldValue = load <16 x i32>* %0, align 4
+  %oldAsFloat = bitcast <16 x i32> %oldValue to <16 x float>
+  %newAsFloat = bitcast <16 x i32> %1 to <16 x float>
+
+  %old0 = shufflevector <16 x float> %oldAsFloat, <16 x float> undef,
+          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %old1 = shufflevector <16 x float> %oldAsFloat, <16 x float> undef,
+          <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %new0 = shufflevector <16 x float> %newAsFloat, <16 x float> undef,
+          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %new1 = shufflevector <16 x float> %newAsFloat, <16 x float> undef,
+          <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %mask0 = shufflevector <16 x float> %maskAsFloat, <16 x float> undef,
+           <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %mask1 = shufflevector <16 x float> %maskAsFloat, <16 x float> undef,
+           <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+
+  %blend0 = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %old0,
+                                                         <8 x float> %new0,
+                                                         <8 x float> %mask0)
+  %blend1 = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %old1,
+                                                         <8 x float> %new1,
+                                                         <8 x float> %mask1)
+  %blend = shufflevector <8 x float> %blend0, <8 x float> %blend1,
+           <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                       i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %blendAsInt = bitcast <16 x float> %blend to <16 x i32>
+  store <16 x i32> %blendAsInt, <16 x i32>* %0, align 4
   ret void
 }
 
-define void @__masked_store_blend_64(<16 x i64>* nocapture, <16 x i64>,
-                                     <16 x i32>) nounwind alwaysinline {
-  call void @__masked_store_64(<16 x i64> * %0, <16 x i64> %1, <16 x i32> %2)
+declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>,
+                                                 <4 x double>) nounwind readnone
+
+define void @__masked_store_blend_64(<16 x i64>* nocapture %ptr, <16 x i64> %newi64,
+                                     <16 x i32> %mask) nounwind alwaysinline {
+  %oldValue = load <16 x i64>* %ptr, align 8
+  %old = bitcast <16 x i64> %oldValue to <16 x double>
+  %old0d = shufflevector <16 x double> %old, <16 x double> undef,
+           <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %old1d = shufflevector <16 x double> %old, <16 x double> undef,
+           <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %old2d = shufflevector <16 x double> %old, <16 x double> undef,
+           <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+  %old3d = shufflevector <16 x double> %old, <16 x double> undef,
+           <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+
+  %new = bitcast <16 x i64> %newi64 to <16 x double>
+  %new0d = shufflevector <16 x double> %new, <16 x double> undef,
+           <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %new1d = shufflevector <16 x double> %new, <16 x double> undef,
+           <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %new2d = shufflevector <16 x double> %new, <16 x double> undef,
+           <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+  %new3d = shufflevector <16 x double> %new, <16 x double> undef,
+           <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+
+  %mask0 = shufflevector <16 x i32> %mask, <16 x i32> undef,
+           <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
+  %mask1 = shufflevector <16 x i32> %mask, <16 x i32> undef,
+           <8 x i32> <i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
+  %mask2 = shufflevector <16 x i32> %mask, <16 x i32> undef,
+           <8 x i32> <i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11>
+  %mask3 = shufflevector <16 x i32> %mask, <16 x i32> undef,
+           <8 x i32> <i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
+  %mask0d = bitcast <8 x i32> %mask0 to <4 x double>
+  %mask1d = bitcast <8 x i32> %mask1 to <4 x double>
+  %mask2d = bitcast <8 x i32> %mask2 to <4 x double>
+  %mask3d = bitcast <8 x i32> %mask3 to <4 x double>
+
+  %result0d = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %old0d,
+                                <4 x double> %new0d, <4 x double> %mask0d)
+  %result1d = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %old1d,
+                                <4 x double> %new1d, <4 x double> %mask1d)
+  %result2d = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %old2d,
+                                <4 x double> %new2d, <4 x double> %mask2d)
+  %result3d = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %old3d,
+                                <4 x double> %new3d, <4 x double> %mask3d)
+
+  %result01 = shufflevector <4 x double> %result0d, <4 x double> %result1d,
+              <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %result23 = shufflevector <4 x double> %result2d, <4 x double> %result3d,
+              <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+
+  %result = shufflevector <8 x double> %result01, <8 x double> %result23,
+            <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                        i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %result64 = bitcast <16 x double> %result to <16 x i64>
+  store <16 x i64> %result64, <16 x i64> * %ptr
   ret void
 }
 
@@ -450,38 +450,74 @@ define void @__masked_store_64(<8 x i64>* nocapture, <8 x i64>,
 }
 
-;; FIXME: various code elsewhere in the builtins implementations makes
-;; calls to the 32/64 bit versions of these, basically assuming that doing
-;; so is faster than doing a full call to an actual masked store, which
-;; isn't likely to be the case on AVX.  So here we provide those functions
-;; but then don't actually do what the caller asked for...
-
-declare void @llvm.trap()
-
-define void @__masked_store_blend_8(<8 x i8>* nocapture, <8 x i8>,
-                                    <8 x i32>) nounwind alwaysinline {
-  call void @llvm.trap()
-  ret void
-}
-
-define void @__masked_store_blend_16(<8 x i16>* nocapture, <8 x i16>,
-                                     <8 x i32>) nounwind alwaysinline {
-  call void @llvm.trap()
-  ret void
-}
+masked_store_blend_8_16_by_8()
+
+declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>,
+                                                <8 x float>) nounwind readnone
 
 define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
                                      <8 x i32>) nounwind alwaysinline {
-  call void @__masked_store_32(<8 x i32> * %0, <8 x i32> %1, <8 x i32> %2)
+  %mask_as_float = bitcast <8 x i32> %2 to <8 x float>
+  %oldValue = load <8 x i32>* %0, align 4
+  %oldAsFloat = bitcast <8 x i32> %oldValue to <8 x float>
+  %newAsFloat = bitcast <8 x i32> %1 to <8 x float>
+  %blend = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %oldAsFloat,
+                                                        <8 x float> %newAsFloat,
+                                                        <8 x float> %mask_as_float)
+  %blendAsInt = bitcast <8 x float> %blend to <8 x i32>
+  store <8 x i32> %blendAsInt, <8 x i32>* %0, align 4
   ret void
 }
 
-define void @__masked_store_blend_64(<8 x i64>* nocapture, <8 x i64>,
-                                     <8 x i32>) nounwind alwaysinline {
-  call void @__masked_store_64(<8 x i64> * %0, <8 x i64> %1, <8 x i32> %2)
+define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
+                                     <8 x i32> %i32mask) nounwind alwaysinline {
+  %oldValue = load <8 x i64>* %ptr, align 8
+  %mask = bitcast <8 x i32> %i32mask to <8 x float>
+
+  ; Do 4x64-bit blends by doing two <8 x i32> blends, where the <8 x i32> values
+  ; are actually bitcast <4 x i64> values
+  ;
+  ; set up the first four 64-bit values
+  %old01 = shufflevector <8 x i64> %oldValue, <8 x i64> undef,
+           <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %old01f = bitcast <4 x i64> %old01 to <8 x float>
+  %new01 = shufflevector <8 x i64> %new, <8 x i64> undef,
+           <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %new01f = bitcast <4 x i64> %new01 to <8 x float>
+  ; compute mask--note that the indices are all doubled-up
+  %mask01 = shufflevector <8 x float> %mask, <8 x float> undef,
+            <8 x i32> <i32 0, i32 0, i32 1, i32 1,
+                       i32 2, i32 2, i32 3, i32 3>
+  ; and blend them
+  %result01f = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %old01f,
+                                                            <8 x float> %new01f,
+                                                            <8 x float> %mask01)
+  %result01 = bitcast <8 x float> %result01f to <4 x i64>
+
+  ; and again
+  %old23 = shufflevector <8 x i64> %oldValue, <8 x i64> undef,
+           <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %old23f = bitcast <4 x i64> %old23 to <8 x float>
+  %new23 = shufflevector <8 x i64> %new, <8 x i64> undef,
+           <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %new23f = bitcast <4 x i64> %new23 to <8 x float>
+  ; compute mask--note that the values are doubled-up...
+  %mask23 = shufflevector <8 x float> %mask, <8 x float> undef,
+            <8 x i32> <i32 4, i32 4, i32 5, i32 5,
+                       i32 6, i32 6, i32 7, i32 7>
+  ; and blend them
+  %result23f = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %old23f,
+                                                            <8 x float> %new23f,
+                                                            <8 x float> %mask23)
+  %result23 = bitcast <8 x float> %result23f to <4 x i64>
+
+  ; reconstruct the final <8 x i64> vector
+  %final = shufflevector <4 x i64> %result01, <4 x i64> %result23,
+           <8 x i32> <i32 0, i32 1, i32 2, i32 3,
+                      i32 4, i32 5, i32 6, i32 7>
+  store <8 x i64> %final, <8 x i64> * %ptr, align 8
   ret void
 }
 
builtins.m4 (40 changed lines)
@@ -1517,6 +1517,46 @@ define void @__masked_store_blend_16(<8 x i16>* nocapture, <8 x i16>,
 ')
 
+
+define(`masked_store_blend_8_16_by_16', `
+define void @__masked_store_blend_8(<16 x i8>* nocapture, <16 x i8>,
+                                    <16 x i32>) nounwind alwaysinline {
+  %old = load <16 x i8> * %0
+  %old128 = bitcast <16 x i8> %old to i128
+  %new128 = bitcast <16 x i8> %1 to i128
+
+  %mask8 = trunc <16 x i32> %2 to <16 x i8>
+  %mask128 = bitcast <16 x i8> %mask8 to i128
+  %notmask128 = xor i128 %mask128, -1
+
+  %newmasked = and i128 %new128, %mask128
+  %oldmasked = and i128 %old128, %notmask128
+  %result = or i128 %newmasked, %oldmasked
+
+  %resultvec = bitcast i128 %result to <16 x i8>
+  store <16 x i8> %resultvec, <16 x i8> * %0
+  ret void
+}
+
+define void @__masked_store_blend_16(<16 x i16>* nocapture, <16 x i16>,
+                                     <16 x i32>) nounwind alwaysinline {
+  %old = load <16 x i16> * %0
+  %old256 = bitcast <16 x i16> %old to i256
+  %new256 = bitcast <16 x i16> %1 to i256
+
+  %mask16 = trunc <16 x i32> %2 to <16 x i16>
+  %mask256 = bitcast <16 x i16> %mask16 to i256
+  %notmask256 = xor i256 %mask256, -1
+
+  %newmasked = and i256 %new256, %mask256
+  %oldmasked = and i256 %old256, %notmask256
+  %result = or i256 %newmasked, %oldmasked
+
+  %resultvec = bitcast i256 %result to <16 x i16>
+  store <16 x i16> %resultvec, <16 x i16> * %0
+  ret void
+}
+')
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; packed load and store functions
 ;;
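The two macros above implement the blend as a wide-integer bitwise select, result = (new & mask) | (old & ~mask), which is correct because each mask lane is either all ones or all zeros. A standalone sketch of the same identity at <8 x i32> width (the function name is hypothetical, not from the commit):

define <8 x i32> @example_bitwise_select(<8 x i32> %old, <8 x i32> %new,
                                         <8 x i32> %mask) {
  ; assumes each %mask element is 0 or -1 (all bits set)
  %notmask = xor <8 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1,
                                   i32 -1, i32 -1, i32 -1, i32 -1>
  %newmasked = and <8 x i32> %new, %mask      ; keep new values in on lanes
  %oldmasked = and <8 x i32> %old, %notmask   ; keep old values in off lanes
  %result = or <8 x i32> %newmasked, %oldmasked
  ret <8 x i32> %result
}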
opt.cpp (14 changed lines)
@@ -1433,16 +1433,12 @@ LowerMaskedStorePass::runOnBasicBlock(llvm::BasicBlock &bb) {
         llvm::Value *rvalue = callInst->getArgOperand(1);
         llvm::Value *mask = callInst->getArgOperand(2);
 
-        // On SSE, we need to choose between doing the load + blend + store
-        // trick, or serializing the masked store.  On targets with a
-        // native masked store instruction, the implementations of
-        // __masked_store_blend_* should be the same as __masked_store_*,
-        // so this doesn't matter.  On SSE, blending is generally more
-        // efficient and is always safe to do on stack-allocated values.(?)
-        bool doBlend = (g->target.isa != Target::AVX &&
-                        lIsStackVariablePointer(lvalue));
-        if (g->target.isa == Target::SSE4 || g->target.isa == Target::SSE2)
-            doBlend |= !g->opt.disableBlendedMaskedStores;
+        // We need to choose between doing the load + blend + store trick,
+        // or serializing the masked store.  Even on targets with a native
+        // masked store instruction, this is preferable since it lets us
+        // keep values in registers rather than going out to the stack.
+        bool doBlend = (!g->opt.disableBlendedMaskedStores ||
+                        lIsStackVariablePointer(lvalue));
 
         // Generate the call to the appropriate masked store function and
         // replace the __pseudo_* one with it.
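To make the rewrite concrete, here is a hypothetical sketch of what this pass does when doBlend is true. The __pseudo_masked_store_32 and example_* names are assumed for illustration; the diff above refers to the placeholder only as __pseudo_*.

declare void @__pseudo_masked_store_32(<8 x i32>*, <8 x i32>, <8 x i32>)
declare void @__masked_store_blend_32(<8 x i32>*, <8 x i32>, <8 x i32>)

;; before the pass: the front end's placeholder masked store
define void @example_before(<8 x i32>* %lvalue, <8 x i32> %rvalue,
                            <8 x i32> %mask) {
  call void @__pseudo_masked_store_32(<8 x i32>* %lvalue, <8 x i32> %rvalue,
                                      <8 x i32> %mask)
  ret void
}

;; after the pass, with doBlend true: the same call, retargeted at the
;; blend-based implementation defined in the builtins above
define void @example_after(<8 x i32>* %lvalue, <8 x i32> %rvalue,
                           <8 x i32> %mask) {
  call void @__masked_store_blend_32(<8 x i32>* %lvalue, <8 x i32> %rvalue,
                                     <8 x i32> %mask)
  ret void
}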