diff --git a/llvm_patches/3_3_r172868-vmovups-vinsertf128.patch b/llvm_patches/3_3_r172868-vmovups-vinsertf128.patch
new file mode 100644
index 00000000..36bb5572
--- /dev/null
+++ b/llvm_patches/3_3_r172868-vmovups-vinsertf128.patch
@@ -0,0 +1,102 @@
+This patch needs to be applied to LLVM 3.3 to fix a performance regression introduced by revision r172868.
+The regression is caused by increased register pressure after that revision, which leads to spills when multiple loads are present.
+The regression is fixed in LLVM 3.4, but the 3.4 changes are not back-portable,
+so we roll back r172868 to avoid the regression with 3.3.
+
+Index: test/CodeGen/X86/sandybridge-loads.ll
+===================================================================
+--- test/CodeGen/X86/sandybridge-loads.ll	(revision 191082)
++++ test/CodeGen/X86/sandybridge-loads.ll	(working copy)
+@@ -1,24 +1,5 @@
+ ; RUN: llc -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -o - < %s | FileCheck %s
+ 
+-;CHECK: wideloads
+-;CHECK: vmovaps
+-;CHECK: vinsertf128
+-;CHECK: vmovaps
+-;CHECK-NOT: vinsertf128
+-;CHECK: ret
+-
+-define void @wideloads(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwind uwtable noinline ssp {
+-  %v0 = load <8 x float>* %a, align 16 ; <---- unaligned!
+-  %v1 = load <8 x float>* %b, align 32 ; <---- aligned!
+-  %m0 = fcmp olt <8 x float> %v1, %v0
+-  %v2 = load <8 x float>* %c, align 32 ; <---- aligned!
+-  %m1 = fcmp olt <8 x float> %v2, %v0
+-  %mand = and <8 x i1> %m1, %m0
+-  %r = zext <8 x i1> %mand to <8 x i32>
+-  store <8 x i32> %r, <8 x i32>* undef, align 32
+-  ret void
+-}
+-
+ ; CHECK: widestores
+ ; loads:
+ ; CHECK: vmovaps
+Index: test/CodeGen/X86/v8i1-masks.ll
+===================================================================
+--- test/CodeGen/X86/v8i1-masks.ll	(revision 172868)
++++ test/CodeGen/X86/v8i1-masks.ll	(revision 172866)
+@@ -1,7 +1,7 @@
+ ; RUN: llc -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -o - < %s | FileCheck %s
+ 
+ ;CHECK: and_masks
+-;CHECK: vmovaps
++;CHECK: vmovups
+ ;CHECK: vcmpltp
+ ;CHECK: vcmpltp
+ ;CHECK: vandps
+Index: lib/Target/X86/X86ISelLowering.cpp
+===================================================================
+--- lib/Target/X86/X86ISelLowering.cpp	(revision 191077)
++++ lib/Target/X86/X86ISelLowering.cpp	(working copy)
+@@ -16756,42 +16756,9 @@
+   EVT MemVT = Ld->getMemoryVT();
+   DebugLoc dl = Ld->getDebugLoc();
+   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+-  unsigned RegSz = RegVT.getSizeInBits();
+ 
+-  // On Sandybridge unaligned 256bit loads are inefficient.
+   ISD::LoadExtType Ext = Ld->getExtensionType();
+-  unsigned Alignment = Ld->getAlignment();
+-  bool IsAligned = Alignment == 0 || Alignment >= MemVT.getSizeInBits()/8;
+-  if (RegVT.is256BitVector() && !Subtarget->hasInt256() &&
+-      !DCI.isBeforeLegalizeOps() && !IsAligned && Ext == ISD::NON_EXTLOAD) {
+-    unsigned NumElems = RegVT.getVectorNumElements();
+-    if (NumElems < 2)
+-      return SDValue();
+ 
+-    SDValue Ptr = Ld->getBasePtr();
+-    SDValue Increment = DAG.getConstant(16, TLI.getPointerTy());
+-
+-    EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
+-                                  NumElems/2);
+-    SDValue Load1 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
+-                                Ld->getPointerInfo(), Ld->isVolatile(),
+-                                Ld->isNonTemporal(), Ld->isInvariant(),
+-                                Alignment);
+-    Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
+-    SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
+-                                Ld->getPointerInfo(), Ld->isVolatile(),
+-                                Ld->isNonTemporal(), Ld->isInvariant(),
+-                                std::min(16U, Alignment));
+-    SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+-                             Load1.getValue(1),
+-                             Load2.getValue(1));
+-
+-    SDValue NewVec = DAG.getUNDEF(RegVT);
+-    NewVec = Insert128BitVector(NewVec, Load1, 0, DAG, dl);
+-    NewVec = Insert128BitVector(NewVec, Load2, NumElems/2, DAG, dl);
+-    return DCI.CombineTo(N, NewVec, TF, true);
+-  }
+-
+   // If this is a vector EXT Load then attempt to optimize it using a
+   // shuffle. If SSSE3 is not available we may emit an illegal shuffle but the
+   // expansion is still better than scalar code.
+@@ -16805,6 +16772,7 @@
+   assert(MemVT.isVector() && "Must load a vector from memory");
+ 
+   unsigned NumElems = RegVT.getVectorNumElements();
++  unsigned RegSz = RegVT.getSizeInBits();
+   unsigned MemSz = MemVT.getSizeInBits();
+   assert(RegSz > MemSz && "Register size must be greater than the mem size");