From 4ab982bc16d509fd6ec5905c2d04f8a9e8ef41bc Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Fri, 26 Aug 2011 12:58:02 -0700 Subject: [PATCH] Various AVX fixes (found by inspection). Emit calls to masked_store, not masked_store_blend, when handling masked stores emitted by the frontend. Fix bug in binary8to16 macro in builtins.m4 Fix bug in 16-wide version of __reduce_add_float Remove blend function implementations for masked_store_blend for AVX; just forward those on to the corresponding real masked store functions. --- builtins-avx-x2.ll | 105 +++++---------------------------------------- builtins-avx.ll | 78 +++++++-------------------------- builtins.m4 | 44 +------------------ opt.cpp | 3 +- 4 files changed, 30 insertions(+), 200 deletions(-) diff --git a/builtins-avx-x2.ll b/builtins-avx-x2.ll index 4000425f..3c380e24 100644 --- a/builtins-avx-x2.ll +++ b/builtins-avx-x2.ll @@ -233,7 +233,7 @@ define internal float @__reduce_add_float(<16 x float>) nounwind readonly always %v2 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %v1, <8 x float> %v1) %v3 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %v2, <8 x float> %v2) %scalar1 = extractelement <8 x float> %v2, i32 0 - %scalar2 = extractelement <8 x float> %v2, i32 4 + %scalar2 = extractelement <8 x float> %v2, i32 1 %sum = fadd float %scalar1, %scalar2 ret float %sum } @@ -522,105 +522,22 @@ define void @__masked_store_64(<16 x i64>* nocapture, <16 x i64>, ret void } -masked_store_blend_8_16_by_16() -declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>, - <8 x float>) nounwind readnone +;; FIXME: various code elsewhere in the builtins implementations makes +;; calls to these, basically assuming that doing so is faster than doing +;; a full call to an actual masked store, which isn't likely to be the +;; case on AVX. So here we provide those functions but then don't actually +;; do what the caller asked for... 
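;; For reference, the blend form of a masked store is semantically just a
;; load / per-lane select / store keyed off the sign bit of each mask element
;; (the same bit vblendvps tests).  A minimal illustrative sketch of that
;; semantics follows; it is not part of this patch, and the name
;; @__masked_store_blend_sketch is hypothetical:

define internal void @__masked_store_blend_sketch(<16 x i32>* nocapture %ptr,
                                                  <16 x i32> %new,
                                                  <16 x i32> %mask) nounwind alwaysinline {
  ; illustrative sketch only: lanes whose mask element has its sign bit set
  ; take the new value, all other lanes keep the old value
  %old = load <16 x i32>* %ptr, align 4
  %on = icmp slt <16 x i32> %mask, zeroinitializer
  %blend = select <16 x i1> %on, <16 x i32> %new, <16 x i32> %old
  store <16 x i32> %blend, <16 x i32>* %ptr, align 4
  ret void
}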
- -define void @__masked_store_blend_32(<16 x i32>* nocapture, <16 x i32>, +define void @__masked_store_blend_32(<16 x i32>* nocapture, <16 x i32>, <16 x i32>) nounwind alwaysinline { - %maskAsFloat = bitcast <16 x i32> %2 to <16 x float> - %oldValue = load <16 x i32>* %0, align 4 - %oldAsFloat = bitcast <16 x i32> %oldValue to <16 x float> - %newAsFloat = bitcast <16 x i32> %1 to <16 x float> - - %old0 = shufflevector <16 x float> %oldAsFloat, <16 x float> undef, - <8 x i32> - %old1 = shufflevector <16 x float> %oldAsFloat, <16 x float> undef, - <8 x i32> - %new0 = shufflevector <16 x float> %newAsFloat, <16 x float> undef, - <8 x i32> - %new1 = shufflevector <16 x float> %newAsFloat, <16 x float> undef, - <8 x i32> - %mask0 = shufflevector <16 x float> %maskAsFloat, <16 x float> undef, - <8 x i32> - %mask1 = shufflevector <16 x float> %maskAsFloat, <16 x float> undef, - <8 x i32> - - %blend0 = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %old0, - <8 x float> %new0, - <8 x float> %mask0) - %blend1 = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %old1, - <8 x float> %new1, - <8 x float> %mask1) - %blend = shufflevector <8 x float> %blend0, <8 x float> %blend1, - <16 x i32> - %blendAsInt = bitcast <16 x float> %blend to <16 x i32> - store <16 x i32> %blendAsInt, <16 x i32>* %0, align 4 + call void @__masked_store_32(<16 x i32> * %0, <16 x i32> %1, <16 x i32> %2) ret void } - -declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>, - <4 x double>) nounwind readnone - -define void @__masked_store_blend_64(<16 x i64>* nocapture %ptr, <16 x i64> %newi64, - <16 x i32> %mask) nounwind alwaysinline { - %oldValue = load <16 x i64>* %ptr, align 8 - %old = bitcast <16 x i64> %oldValue to <16 x double> - %old0d = shufflevector <16 x double> %old, <16 x double> undef, - <4 x i32> - %old1d = shufflevector <16 x double> %old, <16 x double> undef, - <4 x i32> - %old2d = shufflevector <16 x double> %old, <16 x double> undef, - <4 x i32> - %old3d = shufflevector <16 x double> %old, <16 x double> undef, - <4 x i32> - - %new = bitcast <16 x i64> %newi64 to <16 x double> - %new0d = shufflevector <16 x double> %new, <16 x double> undef, - <4 x i32> - %new1d = shufflevector <16 x double> %new, <16 x double> undef, - <4 x i32> - %new2d = shufflevector <16 x double> %new, <16 x double> undef, - <4 x i32> - %new3d = shufflevector <16 x double> %new, <16 x double> undef, - <4 x i32> - - %mask0 = shufflevector <16 x i32> %mask, <16 x i32> undef, - <8 x i32> - %mask1 = shufflevector <16 x i32> %mask, <16 x i32> undef, - <8 x i32> - %mask2 = shufflevector <16 x i32> %mask, <16 x i32> undef, - <8 x i32> - %mask3 = shufflevector <16 x i32> %mask, <16 x i32> undef, - <8 x i32> - %mask0d = bitcast <8 x i32> %mask0 to <4 x double> - %mask1d = bitcast <8 x i32> %mask1 to <4 x double> - %mask2d = bitcast <8 x i32> %mask2 to <4 x double> - %mask3d = bitcast <8 x i32> %mask3 to <4 x double> - - %result0d = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %old0d, - <4 x double> %new0d, <4 x double> %mask0d) - %result1d = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %old1d, - <4 x double> %new1d, <4 x double> %mask1d) - %result2d = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %old2d, - <4 x double> %new2d, <4 x double> %mask2d) - %result3d = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %old3d, - <4 x double> %new3d, <4 x double> %mask3d) - - %result01 = shufflevector <4 x double> %result0d, <4 x double> %result1d, - <8 x i32> - 
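;; The reassembly in the removed code above is the usual two-level
;; shufflevector concatenation: pairs of <4 x double> blend results are
;; joined into <8 x double> values, which are then joined into the final
;; <16 x double>.  A minimal sketch of one such join, assuming two
;; <4 x double> halves; the helper name @__concat4to8_sketch is hypothetical
;; and not part of this patch:
;;
;; define internal <8 x double> @__concat4to8_sketch(<4 x double> %lo,
;;                                                   <4 x double> %hi) nounwind alwaysinline {
;;   ; lanes 0-3 of the result come from %lo, lanes 4-7 from %hi
;;   %r = shufflevector <4 x double> %lo, <4 x double> %hi,
;;        <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
;;   ret <8 x double> %r
;; }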
%result23 = shufflevector <4 x double> %result2d, <4 x double> %result3d, - <8 x i32> - - %result = shufflevector <8 x double> %result01, <8 x double> %result23, - <16 x i32> - %result64 = bitcast <16 x double> %result to <16 x i64> - store <16 x i64> %result64, <16 x i64> * %ptr +define void @__masked_store_blend_64(<16 x i64>* nocapture, <16 x i64>, + <16 x i32>) nounwind alwaysinline { + call void @__masked_store_64(<16 x i64> * %0, <16 x i64> %1, <16 x i32> %2) ret void } diff --git a/builtins-avx.ll b/builtins-avx.ll index 41089abf..e06bd87b 100644 --- a/builtins-avx.ll +++ b/builtins-avx.ll @@ -119,9 +119,11 @@ define internal <8 x float> @__rsqrt_varying_float(<8 x float> %v) nounwind read ; return 0.5 * is * (3. - (v * is) * is); %v_is = fmul <8 x float> %v, %is %v_is_is = fmul <8 x float> %v_is, %is - %three_sub = fsub <8 x float> , %v_is_is + %three_sub = fsub <8 x float> , %v_is_is %is_mul = fmul <8 x float> %is, %three_sub - %half_scale = fmul <8 x float> , %is_mul + %half_scale = fmul <8 x float> , %is_mul ret <8 x float> %half_scale } @@ -446,77 +448,27 @@ define void @__masked_store_64(<8 x i64>* nocapture, <8 x i64>, ret void } -masked_store_blend_8_16_by_8() -declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>, - <8 x float>) nounwind readnone +;; FIXME: various code elsewhere in the builtins implementations makes +;; calls to these, basically assuming that doing so is faster than doing +;; a full call to an actual masked store, which isn't likely to be the +;; case on AVX. So here we provide those functions but then don't actually +;; do what the caller asked for... - -define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>, +define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>, <8 x i32>) nounwind alwaysinline { - %mask_as_float = bitcast <8 x i32> %2 to <8 x float> - %oldValue = load <8 x i32>* %0, align 4 - %oldAsFloat = bitcast <8 x i32> %oldValue to <8 x float> - %newAsFloat = bitcast <8 x i32> %1 to <8 x float> - %blend = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %oldAsFloat, - <8 x float> %newAsFloat, - <8 x float> %mask_as_float) - %blendAsInt = bitcast <8 x float> %blend to <8 x i32> - store <8 x i32> %blendAsInt, <8 x i32>* %0, align 4 + call void @__masked_store_32(<8 x i32> * %0, <8 x i32> %1, <8 x i32> %2) ret void } -define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new, - <8 x i32> %i32mask) nounwind alwaysinline { - %oldValue = load <8 x i64>* %ptr, align 8 - %mask = bitcast <8 x i32> %i32mask to <8 x float> - - ; Do 4x64-bit blends by doing two <8 x i32> blends, where the <8 x i32> values - ; are actually bitcast <4 x i64> values - ; - ; set up the first four 64-bit values - %old01 = shufflevector <8 x i64> %oldValue, <8 x i64> undef, - <4 x i32> - %old01f = bitcast <4 x i64> %old01 to <8 x float> - %new01 = shufflevector <8 x i64> %new, <8 x i64> undef, - <4 x i32> - %new01f = bitcast <4 x i64> %new01 to <8 x float> - ; compute mask--note that the indices are all doubled-up - %mask01 = shufflevector <8 x float> %mask, <8 x float> undef, - <8 x i32> - ; and blend them - %result01f = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %old01f, - <8 x float> %new01f, - <8 x float> %mask01) - %result01 = bitcast <8 x float> %result01f to <4 x i64> - - ; and again - %old23 = shufflevector <8 x i64> %oldValue, <8 x i64> undef, - <4 x i32> - %old23f = bitcast <4 x i64> %old23 to <8 x float> - %new23 = shufflevector <8 x i64> %new, <8 x i64> undef, - <4 x i32> - 
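;; The "doubled-up" mask shuffles in the removed code here exist because
;; vblendvps selects each 32-bit lane on that lane's own sign bit: to blend
;; 64-bit elements viewed as <8 x float>, every 32-bit mask element has to be
;; repeated so that both halves of a 64-bit lane see the same sign bit.  A
;; minimal sketch of that mask expansion for the first four 64-bit lanes; the
;; helper name @__double_up_mask_sketch is hypothetical and not part of this
;; patch:
;;
;; define internal <8 x float> @__double_up_mask_sketch(<8 x float> %mask) nounwind alwaysinline {
;;   ; mask lane i (one per 64-bit element) is duplicated into lanes 2i, 2i+1
;;   %m01 = shufflevector <8 x float> %mask, <8 x float> undef,
;;          <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
;;   ret <8 x float> %m01
;; }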
%new23f = bitcast <4 x i64> %new23 to <8 x float> - ; compute mask--note that the values are doubled-up... - %mask23 = shufflevector <8 x float> %mask, <8 x float> undef, - <8 x i32> - ; and blend them - %result23f = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %old23f, - <8 x float> %new23f, - <8 x float> %mask23) - %result23 = bitcast <8 x float> %result23f to <4 x i64> - - ; reconstruct the final <8 x i64> vector - %final = shufflevector <4 x i64> %result01, <4 x i64> %result23, - <8 x i32> - store <8 x i64> %final, <8 x i64> * %ptr, align 8 +define void @__masked_store_blend_64(<8 x i64>* nocapture, <8 x i64>, + <8 x i32>) nounwind alwaysinline { + call void @__masked_store_64(<8 x i64> * %0, <8 x i64> %1, <8 x i32> %2) ret void } + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; gather/scatter diff --git a/builtins.m4 b/builtins.m4 index c8019847..59a7b6a3 100644 --- a/builtins.m4 +++ b/builtins.m4 @@ -300,10 +300,10 @@ define(`binary8to16', ` %$1_0a = shufflevector <16 x $2> $4, <16 x $2> undef, <8 x i32> %$1_0b = shufflevector <16 x $2> $5, <16 x $2> undef, - <8 x i32> + <8 x i32> %v$1_0 = call <8 x $2> $3(<8 x $2> %$1_0a, <8 x $2> %$1_0b) %$1_1a = shufflevector <16 x $2> $4, <16 x $2> undef, - <8 x i32> + <8 x i32> %$1_1b = shufflevector <16 x $2> $5, <16 x $2> undef, <8 x i32> %v$1_1 = call <8 x $2> $3(<8 x $2> %$1_1a, <8 x $2> %$1_1b) @@ -1438,46 +1438,6 @@ define void @__masked_store_blend_16(<8 x i16>* nocapture, <8 x i16>, } ') -define(`masked_store_blend_8_16_by_16', ` -define void @__masked_store_blend_8(<16 x i8>* nocapture, <16 x i8>, - <16 x i32>) nounwind alwaysinline { - %old = load <16 x i8> * %0 - %old128 = bitcast <16 x i8> %old to i128 - %new128 = bitcast <16 x i8> %1 to i128 - - %mask8 = trunc <16 x i32> %2 to <16 x i8> - %mask128 = bitcast <16 x i8> %mask8 to i128 - %notmask128 = xor i128 %mask128, -1 - - %newmasked = and i128 %new128, %mask128 - %oldmasked = and i128 %old128, %notmask128 - %result = or i128 %newmasked, %oldmasked - - %resultvec = bitcast i128 %result to <16 x i8> - store <16 x i8> %resultvec, <16 x i8> * %0 - ret void -} - -define void @__masked_store_blend_16(<16 x i16>* nocapture, <16 x i16>, - <16 x i32>) nounwind alwaysinline { - %old = load <16 x i16> * %0 - %old256 = bitcast <16 x i16> %old to i256 - %new256 = bitcast <16 x i16> %1 to i256 - - %mask16 = trunc <16 x i32> %2 to <16 x i16> - %mask256 = bitcast <16 x i16> %mask16 to i256 - %notmask256 = xor i256 %mask256, -1 - - %newmasked = and i256 %new256, %mask256 - %oldmasked = and i256 %old256, %notmask256 - %result = or i256 %newmasked, %oldmasked - - %resultvec = bitcast i256 %result to <16 x i16> - store <16 x i16> %resultvec, <16 x i16> * %0 - ret void -} -') - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; packed load and store functions diff --git a/opt.cpp b/opt.cpp index d4a6ce87..6d6c3d95 100644 --- a/opt.cpp +++ b/opt.cpp @@ -1422,7 +1422,8 @@ LowerMaskedStorePass::runOnBasicBlock(llvm::BasicBlock &bb) { // __masked_store_blend_* should be the same as __masked_store_*, // so this doesn't matter. On SSE, blending is generally more // efficient and is always safe to do on stack-allocated values.(?) - bool doBlend = lIsStackVariablePointer(lvalue); + bool doBlend = (g->target.isa != Target::AVX && + lIsStackVariablePointer(lvalue)); if (g->target.isa == Target::SSE4 || g->target.isa == Target::SSE2) doBlend |= !g->opt.disableBlendedMaskedStores;
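
For reference, the binary8to16 macro fixed above expands to roughly the
following splitting pattern: each 16-wide operand is broken into its low half
(lanes 0-7) and high half (lanes 8-15), the 8-wide operation is applied to
each pair of halves, and the two partial results are concatenated back into a
16-wide vector.  A minimal hand-written sketch of that pattern, using the real
@llvm.x86.avx.max.ps.256 intrinsic as a stand-in 8-wide operation; the
function name @__binary8to16_sketch is hypothetical and not produced by the
macro:

define internal <16 x float> @__binary8to16_sketch(<16 x float> %a,
                                                   <16 x float> %b) nounwind alwaysinline {
  ; low halves: lanes 0..7 of each operand
  %a_lo = shufflevector <16 x float> %a, <16 x float> undef,
          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %b_lo = shufflevector <16 x float> %b, <16 x float> undef,
          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %r_lo = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %a_lo, <8 x float> %b_lo)
  ; high halves: lanes 8..15 of each operand
  %a_hi = shufflevector <16 x float> %a, <16 x float> undef,
          <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %b_hi = shufflevector <16 x float> %b, <16 x float> undef,
          <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %r_hi = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %a_hi, <8 x float> %b_hi)
  ; concatenate the two 8-wide results back into a 16-wide result
  %r = shufflevector <8 x float> %r_lo, <8 x float> %r_hi,
       <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                   i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x float> %r
}

declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind readnone

Whatever half-extraction constants the macro emits, together they need to
cover each of the 16 input lanes exactly once; that is the property at stake
in the binary8to16 fix above.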