This patch needs to be applied to LLVM 3.3 to fix a performance regression introduced by revision r172868.
The regression is due to increased register pressure after that revision, which causes spills when multiple loads are live at once.
The regression is fixed in 3.4, but the 3.4 changes are not back-portable,
so we roll back r172868 to avoid the regression in 3.3.
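For illustration only, the hypothetical function below (not part of the patch) shows the kind of IR affected. With r172868 applied, every 256-bit load that is not 32-byte aligned is split into two 128-bit loads plus a vinsertf128, so each such load keeps extra intermediate values live; with several of them in flight, register pressure rises and spills follow. The example uses the typed-pointer load syntax of the LLVM 3.3 era, matching the tests below.

; Hypothetical reduced case, not part of the patch: both loads are only
; 16-byte aligned, so r172868 expands each one into two 128-bit loads plus
; a vinsertf128, roughly doubling the number of live values in this block.
define <8 x float> @two_unaligned_loads(<8 x float>* %a, <8 x float>* %b) nounwind {
  %v0 = load <8 x float>* %a, align 16 ; <---- unaligned!
  %v1 = load <8 x float>* %b, align 16 ; <---- unaligned!
  %r = fadd <8 x float> %v0, %v1
  ret <8 x float> %r
}

With the rollback in place, each of these loads should again compile to a single vmovups, which is what the updated v8i1-masks.ll check below expects.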
Index: test/CodeGen/X86/sandybridge-loads.ll
===================================================================
--- test/CodeGen/X86/sandybridge-loads.ll (revision 191082)
+++ test/CodeGen/X86/sandybridge-loads.ll (working copy)
@@ -1,24 +1,5 @@
 ; RUN: llc -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -o - < %s | FileCheck %s
 
-;CHECK: wideloads
-;CHECK: vmovaps
-;CHECK: vinsertf128
-;CHECK: vmovaps
-;CHECK-NOT: vinsertf128
-;CHECK: ret
-
-define void @wideloads(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwind uwtable noinline ssp {
-  %v0 = load <8 x float>* %a, align 16 ; <---- unaligned!
-  %v1 = load <8 x float>* %b, align 32 ; <---- aligned!
-  %m0 = fcmp olt <8 x float> %v1, %v0
-  %v2 = load <8 x float>* %c, align 32 ; <---- aligned!
-  %m1 = fcmp olt <8 x float> %v2, %v0
-  %mand = and <8 x i1> %m1, %m0
-  %r = zext <8 x i1> %mand to <8 x i32>
-  store <8 x i32> %r, <8 x i32>* undef, align 32
-  ret void
-}
-
 ; CHECK: widestores
 ; loads:
 ; CHECK: vmovaps
Index: test/CodeGen/X86/v8i1-masks.ll
===================================================================
--- test/CodeGen/X86/v8i1-masks.ll (revision 172868)
+++ test/CodeGen/X86/v8i1-masks.ll (revision 172866)
@@ -1,7 +1,7 @@
 ; RUN: llc -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -o - < %s | FileCheck %s
 
 ;CHECK: and_masks
-;CHECK: vmovaps
+;CHECK: vmovups
 ;CHECK: vcmpltp
 ;CHECK: vcmpltp
 ;CHECK: vandps
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp (revision 191077)
+++ lib/Target/X86/X86ISelLowering.cpp (working copy)
@@ -16756,42 +16756,9 @@
   EVT MemVT = Ld->getMemoryVT();
   DebugLoc dl = Ld->getDebugLoc();
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-  unsigned RegSz = RegVT.getSizeInBits();
 
-  // On Sandybridge unaligned 256bit loads are inefficient.
   ISD::LoadExtType Ext = Ld->getExtensionType();
-  unsigned Alignment = Ld->getAlignment();
-  bool IsAligned = Alignment == 0 || Alignment >= MemVT.getSizeInBits()/8;
-  if (RegVT.is256BitVector() && !Subtarget->hasInt256() &&
-      !DCI.isBeforeLegalizeOps() && !IsAligned && Ext == ISD::NON_EXTLOAD) {
-    unsigned NumElems = RegVT.getVectorNumElements();
-    if (NumElems < 2)
-      return SDValue();
 
-    SDValue Ptr = Ld->getBasePtr();
-    SDValue Increment = DAG.getConstant(16, TLI.getPointerTy());
-
-    EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
-                                  NumElems/2);
-    SDValue Load1 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
-                                Ld->getPointerInfo(), Ld->isVolatile(),
-                                Ld->isNonTemporal(), Ld->isInvariant(),
-                                Alignment);
-    Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
-    SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
-                                Ld->getPointerInfo(), Ld->isVolatile(),
-                                Ld->isNonTemporal(), Ld->isInvariant(),
-                                std::min(16U, Alignment));
-    SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
-                             Load1.getValue(1),
-                             Load2.getValue(1));
-
-    SDValue NewVec = DAG.getUNDEF(RegVT);
-    NewVec = Insert128BitVector(NewVec, Load1, 0, DAG, dl);
-    NewVec = Insert128BitVector(NewVec, Load2, NumElems/2, DAG, dl);
-    return DCI.CombineTo(N, NewVec, TF, true);
-  }
-
   // If this is a vector EXT Load then attempt to optimize it using a
   // shuffle. If SSSE3 is not available we may emit an illegal shuffle but the
   // expansion is still better than scalar code.
@@ -16805,6 +16772,7 @@
   assert(MemVT.isVector() && "Must load a vector from memory");
 
   unsigned NumElems = RegVT.getVectorNumElements();
+  unsigned RegSz = RegVT.getSizeInBits();
   unsigned MemSz = MemVT.getSizeInBits();
   assert(RegSz > MemSz && "Register size must be greater than the mem size");