diff --git a/llvm_patches/3_3_r172868-vmovups-vinsertf128.patch b/llvm_patches/3_3_r172868-vmovups-vinsertf128.patch
new file mode 100644
index 00000000..36bb5572
--- /dev/null
+++ b/llvm_patches/3_3_r172868-vmovups-vinsertf128.patch
@@ -0,0 +1,102 @@
+This patch needs to be applied to LLVM 3.3 to fix a performance regression introduced by revision r172868.
+The regression is caused by increased register pressure after that revision, which leads to spills when multiple loads are present.
+The regression is fixed in LLVM 3.4, but the 3.4 changes are not back-portable,
+so we roll back r172868 to avoid the regression with 3.3.
+
+Index: test/CodeGen/X86/sandybridge-loads.ll
+===================================================================
+--- test/CodeGen/X86/sandybridge-loads.ll	(revision 191082)
++++ test/CodeGen/X86/sandybridge-loads.ll	(working copy)
+@@ -1,24 +1,5 @@
+ ; RUN: llc -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -o - < %s | FileCheck %s
+ 
+-;CHECK: wideloads
+-;CHECK: vmovaps
+-;CHECK: vinsertf128
+-;CHECK: vmovaps
+-;CHECK-NOT: vinsertf128
+-;CHECK: ret
+-
+-define void @wideloads(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwind uwtable noinline ssp {
+-  %v0 = load <8 x float>* %a, align 16 ; <---- unaligned!
+-  %v1 = load <8 x float>* %b, align 32 ; <---- aligned!
+-  %m0 = fcmp olt <8 x float> %v1, %v0
+-  %v2 = load <8 x float>* %c, align 32 ; <---- aligned!
+-  %m1 = fcmp olt <8 x float> %v2, %v0
+-  %mand = and <8 x i1> %m1, %m0
+-  %r = zext <8 x i1> %mand to <8 x i32>
+-  store <8 x i32> %r, <8 x i32>* undef, align 32
+-  ret void
+-}
+-
+ ; CHECK: widestores
+ ; loads:
+ ; CHECK: vmovaps
+Index: test/CodeGen/X86/v8i1-masks.ll
+===================================================================
+--- test/CodeGen/X86/v8i1-masks.ll	(revision 172868)
++++ test/CodeGen/X86/v8i1-masks.ll	(revision 172866)
+@@ -1,7 +1,7 @@
+ ; RUN: llc -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -o - < %s | FileCheck %s
+ 
+ ;CHECK: and_masks
+-;CHECK: vmovaps
++;CHECK: vmovups
+ ;CHECK: vcmpltp
+ ;CHECK: vcmpltp
+ ;CHECK: vandps
+Index: lib/Target/X86/X86ISelLowering.cpp
+===================================================================
+--- lib/Target/X86/X86ISelLowering.cpp	(revision 191077)
++++ lib/Target/X86/X86ISelLowering.cpp	(working copy)
+@@ -16756,42 +16756,9 @@
+   EVT MemVT = Ld->getMemoryVT();
+   DebugLoc dl = Ld->getDebugLoc();
+   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+-  unsigned RegSz = RegVT.getSizeInBits();
+ 
+-  // On Sandybridge unaligned 256bit loads are inefficient.
+   ISD::LoadExtType Ext = Ld->getExtensionType();
+-  unsigned Alignment = Ld->getAlignment();
+-  bool IsAligned = Alignment == 0 || Alignment >= MemVT.getSizeInBits()/8;
+-  if (RegVT.is256BitVector() && !Subtarget->hasInt256() &&
+-      !DCI.isBeforeLegalizeOps() && !IsAligned && Ext == ISD::NON_EXTLOAD) {
+-    unsigned NumElems = RegVT.getVectorNumElements();
+-    if (NumElems < 2)
+-      return SDValue();
+ 
+-    SDValue Ptr = Ld->getBasePtr();
+-    SDValue Increment = DAG.getConstant(16, TLI.getPointerTy());
+-
+-    EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
+-                                  NumElems/2);
+-    SDValue Load1 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
+-                                Ld->getPointerInfo(), Ld->isVolatile(),
+-                                Ld->isNonTemporal(), Ld->isInvariant(),
+-                                Alignment);
+-    Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
+-    SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
+-                                Ld->getPointerInfo(), Ld->isVolatile(),
+-                                Ld->isNonTemporal(), Ld->isInvariant(),
+-                                std::min(16U, Alignment));
+-    SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+-                             Load1.getValue(1),
+-                             Load2.getValue(1));
+-
+-    SDValue NewVec = DAG.getUNDEF(RegVT);
+-    NewVec = Insert128BitVector(NewVec, Load1, 0, DAG, dl);
+-    NewVec = Insert128BitVector(NewVec, Load2, NumElems/2, DAG, dl);
+-    return DCI.CombineTo(N, NewVec, TF, true);
+-  }
+-
+   // If this is a vector EXT Load then attempt to optimize it using a
+   // shuffle. If SSSE3 is not available we may emit an illegal shuffle but the
+   // expansion is still better than scalar code.
+@@ -16805,6 +16772,7 @@
+   assert(MemVT.isVector() && "Must load a vector from memory");
+ 
+   unsigned NumElems = RegVT.getVectorNumElements();
++  unsigned RegSz = RegVT.getSizeInBits();
+   unsigned MemSz = MemVT.getSizeInBits();
+   assert(RegSz > MemSz && "Register size must be greater than the mem size");