This patch needs to be applied to LLVM 3.3 to fix a performance regression introduced by revision r172868.
The regression is due to increased register pressure after that revision, which causes spills when multiple loads are live at once.
The regression is fixed in 3.4, but the 3.4 changes are not back-portable,
so we roll back r172868 to avoid the regression in 3.3.
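For illustration only, the hypothetical function below (not part of the patch) shows the kind of IR affected. With r172868 applied, every 256-bit load that is not 32-byte aligned is split into two 128-bit loads plus a vinsertf128, so each such load keeps extra intermediate values live; with several of them in flight, register pressure rises and spills follow. The example uses the typed-pointer load syntax of the LLVM 3.3 era, matching the tests below.

; Hypothetical reduced case, not part of the patch: both loads are only
; 16-byte aligned, so r172868 expands each one into two 128-bit loads plus
; a vinsertf128, roughly doubling the number of live values in this block.
define <8 x float> @two_unaligned_loads(<8 x float>* %a, <8 x float>* %b) nounwind {
  %v0 = load <8 x float>* %a, align 16 ; <---- unaligned!
  %v1 = load <8 x float>* %b, align 16 ; <---- unaligned!
  %r = fadd <8 x float> %v0, %v1
  ret <8 x float> %r
}

With the rollback in place, each of these loads should again compile to a single vmovups, which is what the updated v8i1-masks.ll check below expects.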
Index: test/CodeGen/X86/sandybridge-loads.ll
===================================================================
--- test/CodeGen/X86/sandybridge-loads.ll (revision 191082)
+++ test/CodeGen/X86/sandybridge-loads.ll (working copy)
@@ -1,24 +1,5 @@
 ; RUN: llc -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -o - < %s | FileCheck %s
 
-;CHECK: wideloads
-;CHECK: vmovaps
-;CHECK: vinsertf128
-;CHECK: vmovaps
-;CHECK-NOT: vinsertf128
-;CHECK: ret
-
-define void @wideloads(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwind uwtable noinline ssp {
-  %v0 = load <8 x float>* %a, align 16 ; <---- unaligned!
-  %v1 = load <8 x float>* %b, align 32 ; <---- aligned!
-  %m0 = fcmp olt <8 x float> %v1, %v0
-  %v2 = load <8 x float>* %c, align 32 ; <---- aligned!
-  %m1 = fcmp olt <8 x float> %v2, %v0
-  %mand = and <8 x i1> %m1, %m0
-  %r = zext <8 x i1> %mand to <8 x i32>
-  store <8 x i32> %r, <8 x i32>* undef, align 32
-  ret void
-}
-
 ; CHECK: widestores
 ; loads:
 ; CHECK: vmovaps
Index: test/CodeGen/X86/v8i1-masks.ll
===================================================================
--- test/CodeGen/X86/v8i1-masks.ll (revision 172868)
+++ test/CodeGen/X86/v8i1-masks.ll (revision 172866)
@@ -1,7 +1,7 @@
 ; RUN: llc -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -o - < %s | FileCheck %s
 
 ;CHECK: and_masks
-;CHECK: vmovaps
+;CHECK: vmovups
 ;CHECK: vcmpltp
 ;CHECK: vcmpltp
 ;CHECK: vandps
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp (revision 191077)
+++ lib/Target/X86/X86ISelLowering.cpp (working copy)
@@ -16756,42 +16756,9 @@
   EVT MemVT = Ld->getMemoryVT();
   DebugLoc dl = Ld->getDebugLoc();
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-  unsigned RegSz = RegVT.getSizeInBits();
 
-  // On Sandybridge unaligned 256bit loads are inefficient.
   ISD::LoadExtType Ext = Ld->getExtensionType();
-  unsigned Alignment = Ld->getAlignment();
-  bool IsAligned = Alignment == 0 || Alignment >= MemVT.getSizeInBits()/8;
-  if (RegVT.is256BitVector() && !Subtarget->hasInt256() &&
-      !DCI.isBeforeLegalizeOps() && !IsAligned && Ext == ISD::NON_EXTLOAD) {
-    unsigned NumElems = RegVT.getVectorNumElements();
-    if (NumElems < 2)
-      return SDValue();
 
-    SDValue Ptr = Ld->getBasePtr();
-    SDValue Increment = DAG.getConstant(16, TLI.getPointerTy());
-
-    EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
-                                  NumElems/2);
-    SDValue Load1 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
-                                Ld->getPointerInfo(), Ld->isVolatile(),
-                                Ld->isNonTemporal(), Ld->isInvariant(),
-                                Alignment);
-    Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
-    SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
-                                Ld->getPointerInfo(), Ld->isVolatile(),
-                                Ld->isNonTemporal(), Ld->isInvariant(),
-                                std::min(16U, Alignment));
-    SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
-                             Load1.getValue(1),
-                             Load2.getValue(1));
-
-    SDValue NewVec = DAG.getUNDEF(RegVT);
-    NewVec = Insert128BitVector(NewVec, Load1, 0, DAG, dl);
-    NewVec = Insert128BitVector(NewVec, Load2, NumElems/2, DAG, dl);
-    return DCI.CombineTo(N, NewVec, TF, true);
-  }
-
   // If this is a vector EXT Load then attempt to optimize it using a
   // shuffle. If SSSE3 is not available we may emit an illegal shuffle but the
   // expansion is still better than scalar code.
@@ -16805,6 +16772,7 @@
   assert(MemVT.isVector() && "Must load a vector from memory");
 
   unsigned NumElems = RegVT.getVectorNumElements();
+  unsigned RegSz = RegVT.getSizeInBits();
   unsigned MemSz = MemVT.getSizeInBits();
   assert(RegSz > MemSz && "Register size must be greater than the mem size");