Merge pull request #596 from ifilippov/patch_3_
adding patch for LLVM 3.3 which increases performance after regression
This commit is contained in:
102
llvm_patches/3_3_r172868-vmovups-vinsertf128.patch
Normal file
102
llvm_patches/3_3_r172868-vmovups-vinsertf128.patch
Normal file
@@ -0,0 +1,102 @@
|
||||
This patch needs to be applied to LLVM 3.3 to fix a performance regression introduced by revision r172868.
|
||||
This regression is due to increased register pressure after that revision, which causes spills when there are multiple loads.
|
||||
This regression is fixed in 3.4, but the changes in 3.4 are not backportable,
|
||||
so we roll back r172868 to avoid the regression with 3.3.
|
||||
|
||||
Index: test/CodeGen/X86/sandybridge-loads.ll
|
||||
===================================================================
|
||||
--- test/CodeGen/X86/sandybridge-loads.ll (revision 191082)
|
||||
+++ test/CodeGen/X86/sandybridge-loads.ll (working copy)
|
||||
@@ -1,24 +1,5 @@
|
||||
; RUN: llc -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -o - < %s | FileCheck %s
|
||||
|
||||
-;CHECK: wideloads
|
||||
-;CHECK: vmovaps
|
||||
-;CHECK: vinsertf128
|
||||
-;CHECK: vmovaps
|
||||
-;CHECK-NOT: vinsertf128
|
||||
-;CHECK: ret
|
||||
-
|
||||
-define void @wideloads(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwind uwtable noinline ssp {
|
||||
- %v0 = load <8 x float>* %a, align 16 ; <---- unaligned!
|
||||
- %v1 = load <8 x float>* %b, align 32 ; <---- aligned!
|
||||
- %m0 = fcmp olt <8 x float> %v1, %v0
|
||||
- %v2 = load <8 x float>* %c, align 32 ; <---- aligned!
|
||||
- %m1 = fcmp olt <8 x float> %v2, %v0
|
||||
- %mand = and <8 x i1> %m1, %m0
|
||||
- %r = zext <8 x i1> %mand to <8 x i32>
|
||||
- store <8 x i32> %r, <8 x i32>* undef, align 32
|
||||
- ret void
|
||||
-}
|
||||
-
|
||||
; CHECK: widestores
|
||||
; loads:
|
||||
; CHECK: vmovaps
|
||||
Index: test/CodeGen/X86/v8i1-masks.ll
|
||||
===================================================================
|
||||
--- test/CodeGen/X86/v8i1-masks.ll (revision 172868)
|
||||
+++ test/CodeGen/X86/v8i1-masks.ll (revision 172866)
|
||||
@@ -1,7 +1,7 @@
|
||||
; RUN: llc -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -o - < %s | FileCheck %s
|
||||
|
||||
;CHECK: and_masks
|
||||
-;CHECK: vmovaps
|
||||
+;CHECK: vmovups
|
||||
;CHECK: vcmpltp
|
||||
;CHECK: vcmpltp
|
||||
;CHECK: vandps
|
||||
Index: lib/Target/X86/X86ISelLowering.cpp
|
||||
===================================================================
|
||||
--- lib/Target/X86/X86ISelLowering.cpp (revision 191077)
|
||||
+++ lib/Target/X86/X86ISelLowering.cpp (working copy)
|
||||
@@ -16756,42 +16756,9 @@
|
||||
EVT MemVT = Ld->getMemoryVT();
|
||||
DebugLoc dl = Ld->getDebugLoc();
|
||||
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
|
||||
- unsigned RegSz = RegVT.getSizeInBits();
|
||||
|
||||
- // On Sandybridge unaligned 256bit loads are inefficient.
|
||||
ISD::LoadExtType Ext = Ld->getExtensionType();
|
||||
- unsigned Alignment = Ld->getAlignment();
|
||||
- bool IsAligned = Alignment == 0 || Alignment >= MemVT.getSizeInBits()/8;
|
||||
- if (RegVT.is256BitVector() && !Subtarget->hasInt256() &&
|
||||
- !DCI.isBeforeLegalizeOps() && !IsAligned && Ext == ISD::NON_EXTLOAD) {
|
||||
- unsigned NumElems = RegVT.getVectorNumElements();
|
||||
- if (NumElems < 2)
|
||||
- return SDValue();
|
||||
|
||||
- SDValue Ptr = Ld->getBasePtr();
|
||||
- SDValue Increment = DAG.getConstant(16, TLI.getPointerTy());
|
||||
-
|
||||
- EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
|
||||
- NumElems/2);
|
||||
- SDValue Load1 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
|
||||
- Ld->getPointerInfo(), Ld->isVolatile(),
|
||||
- Ld->isNonTemporal(), Ld->isInvariant(),
|
||||
- Alignment);
|
||||
- Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
|
||||
- SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
|
||||
- Ld->getPointerInfo(), Ld->isVolatile(),
|
||||
- Ld->isNonTemporal(), Ld->isInvariant(),
|
||||
- std::min(16U, Alignment));
|
||||
- SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
|
||||
- Load1.getValue(1),
|
||||
- Load2.getValue(1));
|
||||
-
|
||||
- SDValue NewVec = DAG.getUNDEF(RegVT);
|
||||
- NewVec = Insert128BitVector(NewVec, Load1, 0, DAG, dl);
|
||||
- NewVec = Insert128BitVector(NewVec, Load2, NumElems/2, DAG, dl);
|
||||
- return DCI.CombineTo(N, NewVec, TF, true);
|
||||
- }
|
||||
-
|
||||
// If this is a vector EXT Load then attempt to optimize it using a
|
||||
// shuffle. If SSSE3 is not available we may emit an illegal shuffle but the
|
||||
// expansion is still better than scalar code.
|
||||
@@ -16805,6 +16772,7 @@
|
||||
assert(MemVT.isVector() && "Must load a vector from memory");
|
||||
|
||||
unsigned NumElems = RegVT.getVectorNumElements();
|
||||
+ unsigned RegSz = RegVT.getSizeInBits();
|
||||
unsigned MemSz = MemVT.getSizeInBits();
|
||||
assert(RegSz > MemSz && "Register size must be greater than the mem size");
|
||||
|
||||
Reference in New Issue
Block a user