diff --git a/ctx.cpp b/ctx.cpp
index c1a7e61a..e5c60363 100644
--- a/ctx.cpp
+++ b/ctx.cpp
@@ -316,7 +316,11 @@ FunctionEmitContext::FunctionEmitContext(Function *func, Symbol *funSym,
         llvm::BasicBlock *offBB = llvm::BasicBlock::Create(*g->ctx, "entry",
                                                            (llvm::Function *)offFunc, 0);
-        new llvm::StoreInst(LLVMMaskAllOff, globalAllOnMaskPtr, offBB);
+        llvm::StoreInst *inst =
+            new llvm::StoreInst(LLVMMaskAllOff, globalAllOnMaskPtr, offBB);
+        if (g->opt.forceAlignedMemory) {
+            inst->setAlignment(g->target->getNativeVectorAlignment());
+        }
         llvm::ReturnInst::Create(*g->ctx, offBB);
     }
@@ -2437,7 +2441,13 @@ FunctionEmitContext::LoadInst(llvm::Value *ptr, const char *name) {
     if (name == NULL)
         name = LLVMGetName(ptr, "_load");
 
-    llvm::Instruction *inst = new llvm::LoadInst(ptr, name, bblock);
+    llvm::LoadInst *inst = new llvm::LoadInst(ptr, name, bblock);
+
+    if (g->opt.forceAlignedMemory &&
+        llvm::dyn_cast<llvm::VectorType>(pt->getElementType())) {
+        inst->setAlignment(g->target->getNativeVectorAlignment());
+    }
+
     AddDebugPos(inst);
     return inst;
 }
@@ -2719,7 +2729,7 @@ FunctionEmitContext::AllocaInst(llvm::Type *llvmType,
         inst = new llvm::AllocaInst(llvmType, name ? name : "", bblock);
 
     // If no alignment was specified but we have an array of a uniform
-    // type, then align it to 4 * the native vector width; it's not
+    // type, then align it to the native vector alignment; it's not
     // unlikely that this array will be loaded into varying variables with
     // what will be aligned accesses if the uniform -> varying load is done
    // in regular chunks.
@@ -2727,7 +2737,7 @@ FunctionEmitContext::AllocaInst(llvm::Type *llvmType,
         llvm::dyn_cast<llvm::ArrayType>(llvmType);
     if (align == 0 && arrayType != NULL &&
         !llvm::isa<llvm::VectorType>(arrayType->getElementType()))
-        align = 4 * g->target->getNativeVectorWidth();
+        align = g->target->getNativeVectorAlignment();
 
     if (align != 0)
         inst->setAlignment(align);
@@ -2986,7 +2996,17 @@ FunctionEmitContext::StoreInst(llvm::Value *value, llvm::Value *ptr) {
         return;
     }
 
-    llvm::Instruction *inst = new llvm::StoreInst(value, ptr, bblock);
+    llvm::PointerType *pt =
+        llvm::dyn_cast<llvm::PointerType>(ptr->getType());
+    AssertPos(currentPos, pt != NULL);
+
+    llvm::StoreInst *inst = new llvm::StoreInst(value, ptr, bblock);
+
+    if (g->opt.forceAlignedMemory &&
+        llvm::dyn_cast<llvm::VectorType>(pt->getElementType())) {
+        inst->setAlignment(g->target->getNativeVectorAlignment());
+    }
+
     AddDebugPos(inst);
 }
diff --git a/examples/mandelbrot_tasks/mandelbrot_tasks.cpp b/examples/mandelbrot_tasks/mandelbrot_tasks.cpp
index 698daf0f..1c4d2ca5 100644
--- a/examples/mandelbrot_tasks/mandelbrot_tasks.cpp
+++ b/examples/mandelbrot_tasks/mandelbrot_tasks.cpp
@@ -74,8 +74,8 @@ static void usage() {
 }
 
 int main(int argc, char *argv[]) {
-    unsigned int width = 1536;
-    unsigned int height = 1024;
+    unsigned int width = 1536 * 8;
+    unsigned int height = 1024 * 8;
     float x0 = -2;
     float x1 = 1;
     float y0 = -1;
diff --git a/examples/noise/noise.cpp b/examples/noise/noise.cpp
index 123f98c7..86b4f761 100644
--- a/examples/noise/noise.cpp
+++ b/examples/noise/noise.cpp
@@ -66,8 +66,8 @@ writePPM(float *buf, int width, int height, const char *fn) {
 
 
 int main() {
-    unsigned int width = 768;
-    unsigned int height = 768;
+    unsigned int width = 768 * 4;
+    unsigned int height = 768 * 4;
     float x0 = -10;
     float x1 = 10;
     float y0 = -10;
diff --git a/examples/stencil/stencil.cpp b/examples/stencil/stencil.cpp
index 593d901f..9cd12674 100644
--- a/examples/stencil/stencil.cpp
+++ b/examples/stencil/stencil.cpp
@@ -67,7 +67,7 @@ void InitData(int Nx, int Ny, int Nz, float *A[2], float *vsq) {
 int main() {
-    int Nx = 256, Ny = 256, Nz = 256;
+    int Nx = 256 * 2, Ny = 256 * 2, Nz = 256 * 2;
     int width = 4;
     float *Aserial[2], *Aispc[2];
     Aserial[0] = new float [Nx * Ny * Nz];
diff --git a/ispc.cpp b/ispc.cpp
index 36d31580..b1790dc3 100644
--- a/ispc.cpp
+++ b/ispc.cpp
@@ -191,6 +191,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
     m_tf_attributes(NULL),
 #endif
     m_nativeVectorWidth(-1),
+    m_nativeVectorAlignment(-1),
     m_dataTypeWidth(-1),
     m_vectorWidth(-1),
     m_generatePIC(pic),
@@ -309,6 +310,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
              !strcasecmp(isa, "sse2-i32x4")) {
         this->m_isa = Target::SSE2;
         this->m_nativeVectorWidth = 4;
+        this->m_nativeVectorAlignment = 16;
         this->m_dataTypeWidth = 32;
         this->m_vectorWidth = 4;
         this->m_attributes = "+sse,+sse2,-sse3,-sse4a,-ssse3,-popcnt"
@@ -325,6 +327,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
              !strcasecmp(isa, "sse2-i32x8")) {
         this->m_isa = Target::SSE2;
         this->m_nativeVectorWidth = 4;
+        this->m_nativeVectorAlignment = 16;
         this->m_dataTypeWidth = 32;
         this->m_vectorWidth = 8;
         this->m_attributes = "+sse,+sse2,-sse3,-sse4a,-ssse3,-popcnt"
@@ -341,6 +344,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
              !strcasecmp(isa, "sse4-i32x4")) {
         this->m_isa = Target::SSE4;
         this->m_nativeVectorWidth = 4;
+        this->m_nativeVectorAlignment = 16;
         this->m_dataTypeWidth = 32;
         this->m_vectorWidth = 4;
         // TODO: why not sse42 and popcnt?
@@ -359,6 +363,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
              !strcasecmp(isa, "sse4-i32x8")) {
         this->m_isa = Target::SSE4;
         this->m_nativeVectorWidth = 4;
+        this->m_nativeVectorAlignment = 16;
         this->m_dataTypeWidth = 32;
         this->m_vectorWidth = 8;
         this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov"
@@ -374,6 +379,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
     else if (!strcasecmp(isa, "sse4-i8x16")) {
         this->m_isa = Target::SSE4;
         this->m_nativeVectorWidth = 16;
+        this->m_nativeVectorAlignment = 16;
         this->m_dataTypeWidth = 8;
         this->m_vectorWidth = 16;
         this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov"
@@ -389,6 +395,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
     else if (!strcasecmp(isa, "sse4-i16x8")) {
         this->m_isa = Target::SSE4;
         this->m_nativeVectorWidth = 8;
+        this->m_nativeVectorAlignment = 16;
         this->m_dataTypeWidth = 16;
         this->m_vectorWidth = 8;
         this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov"
@@ -405,6 +412,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
              !strcasecmp(isa, "generic-x4")) {
         this->m_isa = Target::GENERIC;
         this->m_nativeVectorWidth = 4;
+        this->m_nativeVectorAlignment = 16;
         this->m_vectorWidth = 4;
         this->m_maskingIsFree = true;
         this->m_maskBitCount = 1;
@@ -416,6 +424,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
              !strcasecmp(isa, "generic-x8")) {
         this->m_isa = Target::GENERIC;
         this->m_nativeVectorWidth = 8;
+        this->m_nativeVectorAlignment = 32;
         this->m_vectorWidth = 8;
         this->m_maskingIsFree = true;
         this->m_maskBitCount = 1;
@@ -427,6 +436,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
              !strcasecmp(isa, "generic-x16")) {
         this->m_isa = Target::GENERIC;
         this->m_nativeVectorWidth = 16;
+        this->m_nativeVectorAlignment = 64;
         this->m_vectorWidth = 16;
         this->m_maskingIsFree = true;
         this->m_maskBitCount = 1;
@@ -438,6 +448,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
              !strcasecmp(isa, "generic-x32")) {
         this->m_isa = Target::GENERIC;
         this->m_nativeVectorWidth = 32;
+        this->m_nativeVectorAlignment = 64;
         this->m_vectorWidth = 32;
         this->m_maskingIsFree = true;
         this->m_maskBitCount = 1;
@@ -449,6 +460,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
              !strcasecmp(isa, "generic-x64")) {
         this->m_isa = Target::GENERIC;
         this->m_nativeVectorWidth = 64;
+        this->m_nativeVectorAlignment = 64;
         this->m_vectorWidth = 64;
         this->m_maskingIsFree = true;
         this->m_maskBitCount = 1;
@@ -460,6 +472,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
              !strcasecmp(isa, "generic-x1")) {
         this->m_isa = Target::GENERIC;
         this->m_nativeVectorWidth = 1;
+        this->m_nativeVectorAlignment = 16;
         this->m_vectorWidth = 1;
         this->m_maskingIsFree = false;
         this->m_maskBitCount = 32;
@@ -467,6 +480,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
     else if (!strcasecmp(isa, "avx1-i32x4")) {
         this->m_isa = Target::AVX;
         this->m_nativeVectorWidth = 8;
+        this->m_nativeVectorAlignment = 32;
         this->m_dataTypeWidth = 32;
         this->m_vectorWidth = 4;
         this->m_attributes = "+avx,+popcnt,+cmov";
@@ -478,6 +492,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
              !strcasecmp(isa, "avx1-i32x8")) {
         this->m_isa = Target::AVX;
         this->m_nativeVectorWidth = 8;
+        this->m_nativeVectorAlignment = 32;
         this->m_dataTypeWidth = 32;
         this->m_vectorWidth = 8;
         this->m_attributes = "+avx,+popcnt,+cmov";
@@ -488,6 +503,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
              !strcasecmp(isa, "avx1-i64x4")) {
         this->m_isa = Target::AVX;
         this->m_nativeVectorWidth = 8; /* native vector width in terms of floats */
+        this->m_nativeVectorAlignment = 32;
         this->m_dataTypeWidth = 64;
         this->m_vectorWidth = 4;
         this->m_attributes = "+avx,+popcnt,+cmov";
@@ -499,6 +515,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
              !strcasecmp(isa, "avx1-i32x16")) {
         this->m_isa = Target::AVX;
         this->m_nativeVectorWidth = 8;
+        this->m_nativeVectorAlignment = 32;
         this->m_dataTypeWidth = 32;
         this->m_vectorWidth = 16;
         this->m_attributes = "+avx,+popcnt,+cmov";
@@ -509,6 +526,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
              !strcasecmp(isa, "avx1.1-i32x8")) {
         this->m_isa = Target::AVX11;
         this->m_nativeVectorWidth = 8;
+        this->m_nativeVectorAlignment = 32;
         this->m_dataTypeWidth = 32;
         this->m_vectorWidth = 8;
         this->m_attributes = "+avx,+popcnt,+cmov,+f16c"
@@ -530,6 +548,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
              !strcasecmp(isa, "avx1.1-i32x16")) {
         this->m_isa = Target::AVX11;
         this->m_nativeVectorWidth = 8;
+        this->m_nativeVectorAlignment = 32;
         this->m_dataTypeWidth = 32;
         this->m_vectorWidth = 16;
         this->m_attributes = "+avx,+popcnt,+cmov,+f16c"
@@ -550,6 +569,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
     else if (!strcasecmp(isa, "avx1.1-i64x4")) {
         this->m_isa = Target::AVX11;
         this->m_nativeVectorWidth = 8; /* native vector width in terms of floats */
+        this->m_nativeVectorAlignment = 32;
         this->m_dataTypeWidth = 64;
         this->m_vectorWidth = 4;
         this->m_attributes = "+avx,+popcnt,+cmov,+f16c"
@@ -571,6 +591,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
              !strcasecmp(isa, "avx2-i32x8")) {
         this->m_isa = Target::AVX2;
         this->m_nativeVectorWidth = 8;
+        this->m_nativeVectorAlignment = 32;
         this->m_dataTypeWidth = 32;
         this->m_vectorWidth = 8;
         this->m_attributes = "+avx2,+popcnt,+cmov,+f16c"
@@ -596,6 +617,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
              !strcasecmp(isa, "avx2-i32x16")) {
         this->m_isa = Target::AVX2;
         this->m_nativeVectorWidth = 16;
+        this->m_nativeVectorAlignment = 32;
         this->m_dataTypeWidth = 32;
         this->m_vectorWidth = 16;
         this->m_attributes = "+avx2,+popcnt,+cmov,+f16c"
@@ -620,6 +642,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
     else if (!strcasecmp(isa, "avx2-i64x4")) {
         this->m_isa = Target::AVX2;
         this->m_nativeVectorWidth = 8; /* native vector width in terms of floats */
+        this->m_nativeVectorAlignment = 32;
         this->m_dataTypeWidth = 64;
         this->m_vectorWidth = 4;
         this->m_attributes = "+avx2,+popcnt,+cmov,+f16c"
@@ -645,6 +668,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
     else if (!strcasecmp(isa, "neon-i8x16")) {
         this->m_isa = Target::NEON8;
         this->m_nativeVectorWidth = 16;
+        this->m_nativeVectorAlignment = 16;
         this->m_dataTypeWidth = 8;
         this->m_vectorWidth = 16;
         this->m_attributes = "+neon,+fp16";
@@ -655,6 +679,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
     else if (!strcasecmp(isa, "neon-i16x8")) {
         this->m_isa = Target::NEON16;
         this->m_nativeVectorWidth = 8;
+        this->m_nativeVectorAlignment = 16;
         this->m_dataTypeWidth = 16;
         this->m_vectorWidth = 8;
         this->m_attributes = "+neon,+fp16";
@@ -666,6 +691,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
              !strcasecmp(isa, "neon-i32x4")) {
         this->m_isa = Target::NEON32;
         this->m_nativeVectorWidth = 4;
+        this->m_nativeVectorAlignment = 16;
         this->m_dataTypeWidth = 32;
         this->m_vectorWidth = 4;
         this->m_attributes = "+neon,+fp16";
diff --git a/ispc.h b/ispc.h
index b319d656..4b333861 100644
--- a/ispc.h
+++ b/ispc.h
@@ -260,6 +260,8 @@ public:
 
     int getNativeVectorWidth() const {return m_nativeVectorWidth;}
 
+    int getNativeVectorAlignment() const {return m_nativeVectorAlignment;}
+
     int getDataTypeWidth() const {return m_dataTypeWidth;}
 
     int getVectorWidth() const {return m_vectorWidth;}
@@ -332,6 +334,13 @@ private:
         SSE, 8 for AVX, etc.) */
     int m_nativeVectorWidth;
 
+    /** Native vector alignment in bytes. In principle this could be derived
+        from the vector size, but it's better to manage the alignment
+        directly; that makes experimentation and per-platform fine-tuning
+        easier. This information is primarily used when
+        --opt=force-aligned-memory is used. */
+    int m_nativeVectorAlignment;
+
     /** Data type with in bits. Typically it's 32, but could be 8, 16 or 64.
         For generic it's -1, which means undefined. */
     int m_dataTypeWidth;
diff --git a/llvm_patches/3_3_r193261_bug17631_196261_win_vzeroupper.patch b/llvm_patches/3_3_r193261_bug17631_196261_win_vzeroupper.patch
new file mode 100644
index 00000000..8f0a790b
--- /dev/null
+++ b/llvm_patches/3_3_r193261_bug17631_196261_win_vzeroupper.patch
@@ -0,0 +1,115 @@
+From b9b016cda57d8afc26a150de7ee329b54a994c85 Mon Sep 17 00:00:00 2001
+From: Michael Liao
+Date: Mon, 21 Oct 2013 17:47:58 -0700
+Subject: [PATCH] Fix PR17631
+
+- Skip instructions added in prolog. For specific targets, prolog may
+  insert helper function calls (e.g. _chkstk will be called when
+  there're more than 4K bytes allocated on stack). However, these
+  helpers don't use/def YMM/XMM registers.
+  It also includes a second fix for the problem: r196261+r196391.
+
+diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp
+index 477f75a..0d37a7d 100644
+--- lib/Target/X86/X86VZeroUpper.cpp
++++ lib/Target/X86/X86VZeroUpper.cpp
+@@ -121,7 +121,7 @@
+ }
+ 
+ static bool clobbersAllYmmRegs(const MachineOperand &MO) {
+-  for (unsigned reg = X86::YMM0; reg < X86::YMM15; ++reg) {
++  for (unsigned reg = X86::YMM0; reg <= X86::YMM15; ++reg) {
+     if (!MO.clobbersPhysReg(reg))
+       return false;
+   }
+@@ -143,6 +143,21 @@
+   return false;
+ }
+ 
++/// clobbersAnyYmmReg() - Check if any YMM register will be clobbered by this
++/// instruction.
++static bool clobbersAnyYmmReg(MachineInstr *MI) {
++  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
++    const MachineOperand &MO = MI->getOperand(i);
++    if (!MO.isRegMask())
++      continue;
++    for (unsigned reg = X86::YMM0; reg <= X86::YMM15; ++reg) {
++      if (MO.clobbersPhysReg(reg))
++        return true;
++    }
++  }
++  return false;
++}
++
+ /// runOnMachineFunction - Loop over all of the basic blocks, inserting
+ /// vzero upper instructions before function calls.
+ bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) {
+@@ -226,8 +241,9 @@
+   bool BBHasCall = false;
+ 
+   for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) {
++    DebugLoc dl = I->getDebugLoc();
+     MachineInstr *MI = I;
+-    DebugLoc dl = I->getDebugLoc();
++
+     bool isControlFlow = MI->isCall() || MI->isReturn();
+ 
+     // Shortcut: don't need to check regular instructions in dirty state.
+@@ -246,6 +262,14 @@
+     if (!isControlFlow)
+       continue;
+ 
++    // If the call won't clobber any YMM register, skip it as well. It usually
++    // happens on helper function calls (such as '_chkstk', '_ftol2') where
++    // standard calling convention is not used (RegMask is not used to mark
++    // register clobbered and register usage (def/imp-def/use) is well-dfined
++    // and explicitly specified.
++    if (MI->isCall() && !clobbersAnyYmmReg(MI))
++      continue;
++
+     BBHasCall = true;
+ 
+     // The VZEROUPPER instruction resets the upper 128 bits of all Intel AVX
+diff --git a/test/CodeGen/X86/pr17631.ll b/test/CodeGen/X86/pr17631.ll
+new file mode 100644
+index 0000000..a572ff2
+--- /dev/null
++++ test/CodeGen/X86/pr17631.ll
+@@ -0,0 +1,34 @@
++; RUN: llc < %s -mcpu=core-avx-i -mtriple=i386-pc-win32 | FileCheck %s
++
++%struct_type = type { [64 x <8 x float>], <8 x float> }
++
++; Function Attrs: nounwind readnone
++declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>)
++
++; Function Attrs: nounwind
++define i32 @equal(<8 x i32> %A) {
++allocas:
++  %first_alloc = alloca [64 x <8 x i32>]
++  %second_alloc = alloca %struct_type
++
++  %A1 = bitcast <8 x i32> %A to <8 x float>
++  %A2 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %A1)
++  ret i32 %A2
++}
++
++; CHECK: equal
++; CHECK-NOT: vzeroupper
++; CHECK: _chkstk
++; CHECK: ret
++
++define <8 x float> @foo(<8 x float> %y, i64* %p, double %x) {
++  %i = fptoui double %x to i64
++  store i64 %i, i64* %p
++  %ret = fadd <8 x float> %y, %y
++  ret <8 x float> %ret
++}
++
++; CHECK: foo
++; CHECK-NOT: vzeroupper
++; CHECK: _ftol2
++; CHECK: ret
+-- 
+1.8.1.2
+
diff --git a/llvm_patches/3_3_r193261_bug17631_win_vzeroupper.patch b/llvm_patches/3_3_r193261_bug17631_win_vzeroupper.patch
deleted file mode 100644
index b6abb1d3..00000000
--- a/llvm_patches/3_3_r193261_bug17631_win_vzeroupper.patch
+++ /dev/null
@@ -1,69 +0,0 @@
-From b9b016cda57d8afc26a150de7ee329b54a994c85 Mon Sep 17 00:00:00 2001
-From: Michael Liao
-Date: Mon, 21 Oct 2013 17:47:58 -0700
-Subject: [PATCH] Fix PR17631
-
-- Skip instructions added in prolog. For specific targets, prolog may
-  insert helper function calls (e.g. _chkstk will be called when
-  there're more than 4K bytes allocated on stack). However, these
-  helpers don't use/def YMM/XMM registers.
----
- lib/Target/X86/X86VZeroUpper.cpp | 11 ++++++++++-
- test/CodeGen/X86/pr17631.ll      | 22 ++++++++++++++++++++++
- 2 files changed, 32 insertions(+), 1 deletion(-)
- create mode 100644 test/CodeGen/X86/pr17631.ll
-
-diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp
-index 477f75a..0d37a7d 100644
---- lib/Target/X86/X86VZeroUpper.cpp
-+++ lib/Target/X86/X86VZeroUpper.cpp
-@@ -231,8 +231,17 @@ bool VZeroUpperInserter::processBasicBlock(MachineFunction &MF,
-   bool BBHasCall = false;
- 
-   for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) {
--    MachineInstr *MI = I;
-     DebugLoc dl = I->getDebugLoc();
-+    MachineInstr *MI = I;
-+
-+    // Don't need to check instructions added in prolog.
-+    // In prolog, special function calls may be added for specific targets
-+    // (e.g. on Windows, a prolog helper '_chkstk' is called when the local
-+    // variables exceed 4K bytes on stack.) These helpers won't use/def YMM/XMM
-+    // registers.
-+    if (MI->getFlag(MachineInstr::FrameSetup))
-+      continue;
-+
-     bool isControlFlow = MI->isCall() || MI->isReturn();
- 
-     // Shortcut: don't need to check regular instructions in dirty state.
-diff --git a/test/CodeGen/X86/pr17631.ll b/test/CodeGen/X86/pr17631.ll
-new file mode 100644
-index 0000000..a572ff2
---- /dev/null
-+++ test/CodeGen/X86/pr17631.ll
-@@ -0,0 +1,22 @@
-+; RUN: llc < %s -mcpu=core-avx-i -mtriple=i386-pc-win32 | FileCheck %s
-+
-+%struct_type = type { [64 x <8 x float>], <8 x float> }
-+
-+; Function Attrs: nounwind readnone
-+declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>)
-+
-+; Function Attrs: nounwind
-+define i32 @equal(<8 x i32> %A) {
-+allocas:
-+  %first_alloc = alloca [64 x <8 x i32>]
-+  %second_alloc = alloca %struct_type
-+
-+  %A1 = bitcast <8 x i32> %A to <8 x float>
-+  %A2 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %A1)
-+  ret i32 %A2
-+}
-+
-+; CHECK: equal
-+; CHECK-NOT: vzeroupper
-+; CHECK: _chkstk
-+; CHECK: ret
--- 
-1.8.1.2
-
diff --git a/opt.cpp b/opt.cpp
index 3e320b4b..9059c746 100644
--- a/opt.cpp
+++ b/opt.cpp
@@ -904,7 +904,7 @@ IntrinsicsOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
             lCopyMetadata(castPtr, callInst);
             int align;
             if (g->opt.forceAlignedMemory)
-                align = 0;
+                align = g->target->getNativeVectorAlignment();
             else
                 align = callInst->getCalledFunction() == avxMaskedLoad32 ? 4 : 8;
             name = LLVMGetName(callInst->getArgOperand(0), "_load");
@@ -946,7 +946,7 @@ IntrinsicsOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
                 new llvm::StoreInst(rvalue, castPtr, (llvm::Instruction *)NULL);
             int align;
             if (g->opt.forceAlignedMemory)
-                align = 0;
+                align = g->target->getNativeVectorAlignment();
             else
                 align = callInst->getCalledFunction() == avxMaskedStore32 ? 4 : 8;
             storeInst->setAlignment(align);
@@ -2758,7 +2758,8 @@ lImproveMaskedStore(llvm::CallInst *callInst) {
         lCopyMetadata(lvalue, callInst);
         llvm::Instruction *store =
             new llvm::StoreInst(rvalue, lvalue, false /* not volatile */,
-                                g->opt.forceAlignedMemory ? 0 : info->align);
+                                g->opt.forceAlignedMemory ?
+                                g->target->getNativeVectorAlignment() : info->align);
         lCopyMetadata(store, callInst);
         llvm::ReplaceInstWithInst(callInst, store);
         return true;
@@ -2821,7 +2822,8 @@ lImproveMaskedLoad(llvm::CallInst *callInst,
                                           callInst);
         llvm::Instruction *load =
             new llvm::LoadInst(ptr, callInst->getName(), false /* not volatile */,
-                               g->opt.forceAlignedMemory ? 0 : info->align,
+                               g->opt.forceAlignedMemory ?
+                               g->target->getNativeVectorAlignment() : info->align,
                                (llvm::Instruction *)NULL);
         lCopyMetadata(load, callInst);
         llvm::ReplaceInstWithInst(callInst, load);
@@ -3226,6 +3228,9 @@ lEmitLoads(llvm::Value *basePtr, std::vector<CoalescedLoadOp> &loadOps,
     }
     case 4: {
         // 4-wide vector load
+        if (g->opt.forceAlignedMemory) {
+            align = g->target->getNativeVectorAlignment();
+        }
         llvm::VectorType *vt =
             llvm::VectorType::get(LLVMTypes::Int32Type, 4);
         loadOps[i].load = lGEPAndLoad(basePtr, start, align,
@@ -3234,6 +3239,9 @@ lEmitLoads(llvm::Value *basePtr, std::vector<CoalescedLoadOp> &loadOps,
     }
     case 8: {
         // 8-wide vector load
+        if (g->opt.forceAlignedMemory) {
+            align = g->target->getNativeVectorAlignment();
+        }
         llvm::VectorType *vt =
             llvm::VectorType::get(LLVMTypes::Int32Type, 8);
         loadOps[i].load = lGEPAndLoad(basePtr, start, align,
diff --git a/perf.ini b/perf.ini
index 249c25f4..b44a2853 100755
--- a/perf.ini
+++ b/perf.ini
@@ -10,7 +10,7 @@
 %****************************************************************************************************
 AOBench
 aobench
-10 512 512
+3 2048 2048
 #***
 Deferred Shading
 deferred
@@ -41,7 +41,7 @@
 options
 #***
 Ray Tracer
 rt
-sponza
+sponza --scale=6.0
 #***
 3D Stencil
 stencil
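Every compiler-side hunk above applies the same rule: when --opt=force-aligned-memory is enabled, vector loads and stores are tagged with the target's native vector alignment (now stored per target as m_nativeVectorAlignment) instead of the conservative default the call site used before. The standalone C++ sketch below only illustrates that selection rule; TargetInfo and Opts are simplified, hypothetical stand-ins for ispc's Target and opt structures, not the actual classes touched by this diff.

#include <cassert>

// Simplified stand-ins for ispc's Target and opt structures (illustrative only).
struct TargetInfo {
    int nativeVectorWidth;      // lanes, e.g. 8 for AVX
    int nativeVectorAlignment;  // bytes, e.g. 32 for AVX; no longer assumed to be 4 * width
};

struct Opts {
    bool forceAlignedMemory;    // corresponds to --opt=force-aligned-memory
};

// Alignment chosen for a vector load/store: the target's native vector
// alignment when aligned memory is forced, otherwise the conservative
// default the call site already used (e.g. 4 or 8 bytes for masked ops).
static int chooseAlignment(const Opts &opt, const TargetInfo &target,
                           int defaultAlign) {
    return opt.forceAlignedMemory ? target.nativeVectorAlignment : defaultAlign;
}

int main() {
    TargetInfo avx = { 8, 32 };
    Opts forced = { true };
    Opts relaxed = { false };
    assert(chooseAlignment(forced, avx, 4) == 32);   // forced: native vector alignment
    assert(chooseAlignment(relaxed, avx, 4) == 4);   // default path unchanged
    return 0;
}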