Merge branch 'master' of https://github.com/ispc/ispc

2013-12-06 17:22:19 +04:00
parent ea94658411 8766e44b95
commit 02dc2f460e
10 changed files with 194 additions and 85 deletions
--- a/ctx.cpp
+++ b/ctx.cpp
@@ -316,7 +316,11 @@ FunctionEmitContext::FunctionEmitContext(Function *func, Symbol *funSym,
            llvm::BasicBlock *offBB =
                   llvm::BasicBlock::Create(*g->ctx, "entry",
                                            (llvm::Function *)offFunc, 0);
            llvm::StoreInst *inst =
                new llvm::StoreInst(LLVMMaskAllOff, globalAllOnMaskPtr, offBB);
            if (g->opt.forceAlignedMemory) {
                inst->setAlignment(g->target->getNativeVectorAlignment());
            }
            llvm::ReturnInst::Create(*g->ctx, offBB);
        }
@@ -2437,7 +2441,13 @@ FunctionEmitContext::LoadInst(llvm::Value *ptr, const char *name) {
    if (name == NULL)
        name = LLVMGetName(ptr, "_load");
-    llvm::Instruction *inst = new llvm::LoadInst(ptr, name, bblock);
+    llvm::LoadInst *inst = new llvm::LoadInst(ptr, name, bblock);
    if (g->opt.forceAlignedMemory &&
        llvm::dyn_cast<llvm::VectorType>(pt->getElementType())) {
        inst->setAlignment(g->target->getNativeVectorAlignment());
    }
    AddDebugPos(inst);
    return inst;
 }
@@ -2719,7 +2729,7 @@ FunctionEmitContext::AllocaInst(llvm::Type *llvmType,
        inst = new llvm::AllocaInst(llvmType, name ? name : "", bblock);
    // If no alignment was specified but we have an array of a uniform
-    // type, then align it to 4 * the native vector width; it's not
+    // type, then align it to the native vector alignment; it's not
    // unlikely that this array will be loaded into varying variables with
    // what will be aligned accesses if the uniform -> varying load is done
    // in regular chunks.
@@ -2727,7 +2737,7 @@ FunctionEmitContext::AllocaInst(llvm::Type *llvmType,
        llvm::dyn_cast<llvm::ArrayType>(llvmType);
    if (align == 0 && arrayType != NULL &&
        !llvm::isa<llvm::VectorType>(arrayType->getElementType()))
-        align = 4 * g->target->getNativeVectorWidth();
+        align = g->target->getNativeVectorAlignment();
    if (align != 0)
        inst->setAlignment(align);
@@ -2986,7 +2996,17 @@ FunctionEmitContext::StoreInst(llvm::Value *value, llvm::Value *ptr) {
        return;
    }
-    llvm::Instruction *inst = new llvm::StoreInst(value, ptr, bblock);
+    llvm::PointerType *pt =
        llvm::dyn_cast<llvm::PointerType>(ptr->getType());
    AssertPos(currentPos, pt != NULL);
    llvm::StoreInst *inst = new llvm::StoreInst(value, ptr, bblock);
    if (g->opt.forceAlignedMemory &&
        llvm::dyn_cast<llvm::VectorType>(pt->getElementType())) {
        inst->setAlignment(g->target->getNativeVectorAlignment());
    }
    AddDebugPos(inst);
 }
--- a/examples/mandelbrot_tasks/mandelbrot_tasks.cpp
+++ b/examples/mandelbrot_tasks/mandelbrot_tasks.cpp
@@ -74,8 +74,8 @@ static void usage() {
 }
 int main(int argc, char *argv[]) {
-    unsigned int width = 1536;
+    unsigned int width = 1536 * 8;
-    unsigned int height = 1024;
+    unsigned int height = 1024 * 8;
    float x0 = -2;
    float x1 = 1;
    float y0 = -1;
--- a/examples/noise/noise.cpp
+++ b/examples/noise/noise.cpp
@@ -66,8 +66,8 @@ writePPM(float *buf, int width, int height, const char *fn) {
 int main() {
-    unsigned int width = 768;
+    unsigned int width = 768 * 4;
-    unsigned int height = 768;
+    unsigned int height = 768 * 4;
    float x0 = -10;
    float x1 = 10;
    float y0 = -10;
--- a/examples/stencil/stencil.cpp
+++ b/examples/stencil/stencil.cpp
@@ -67,7 +67,7 @@ void InitData(int Nx, int Ny, int Nz, float *A[2], float *vsq) {
 int main() {
-    int Nx = 256, Ny = 256, Nz = 256;
+    int Nx = 256 * 2, Ny = 256 * 2, Nz = 256 * 2;
    int width = 4;
    float *Aserial[2], *Aispc[2];
    Aserial[0] = new float [Nx * Ny * Nz];
--- a/ispc.cpp
+++ b/ispc.cpp
@@ -191,6 +191,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
    m_tf_attributes(NULL),
 #endif
    m_nativeVectorWidth(-1),
    m_nativeVectorAlignment(-1),
    m_dataTypeWidth(-1),
    m_vectorWidth(-1),
    m_generatePIC(pic),
@@ -309,6 +310,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
        !strcasecmp(isa, "sse2-i32x4")) {
        this->m_isa = Target::SSE2;
        this->m_nativeVectorWidth = 4;
        this->m_nativeVectorAlignment = 16;
        this->m_dataTypeWidth = 32;
        this->m_vectorWidth = 4;
        this->m_attributes = "+sse,+sse2,-sse3,-sse4a,-ssse3,-popcnt"
@@ -325,6 +327,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
             !strcasecmp(isa, "sse2-i32x8")) {
        this->m_isa = Target::SSE2;
        this->m_nativeVectorWidth = 4;
        this->m_nativeVectorAlignment = 16;
        this->m_dataTypeWidth = 32;
        this->m_vectorWidth = 8;
        this->m_attributes = "+sse,+sse2,-sse3,-sse4a,-ssse3,-popcnt"
@@ -341,6 +344,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
             !strcasecmp(isa, "sse4-i32x4")) {
        this->m_isa = Target::SSE4;
        this->m_nativeVectorWidth = 4;
        this->m_nativeVectorAlignment = 16;
        this->m_dataTypeWidth = 32;
        this->m_vectorWidth = 4;
        // TODO: why not sse42 and popcnt?
@@ -359,6 +363,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
             !strcasecmp(isa, "sse4-i32x8")) {
        this->m_isa = Target::SSE4;
        this->m_nativeVectorWidth = 4;
        this->m_nativeVectorAlignment = 16;
        this->m_dataTypeWidth = 32;
        this->m_vectorWidth = 8;
        this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov"
@@ -374,6 +379,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
    else if (!strcasecmp(isa, "sse4-i8x16")) {
        this->m_isa = Target::SSE4;
        this->m_nativeVectorWidth = 16;
        this->m_nativeVectorAlignment = 16;
        this->m_dataTypeWidth = 8;
        this->m_vectorWidth = 16;
        this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov"
@@ -389,6 +395,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
    else if (!strcasecmp(isa, "sse4-i16x8")) {
        this->m_isa = Target::SSE4;
        this->m_nativeVectorWidth = 8;
        this->m_nativeVectorAlignment = 16;
        this->m_dataTypeWidth = 16;
        this->m_vectorWidth = 8;
        this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov"
@@ -405,6 +412,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
             !strcasecmp(isa, "generic-x4")) {
        this->m_isa = Target::GENERIC;
        this->m_nativeVectorWidth = 4;
        this->m_nativeVectorAlignment = 16;
        this->m_vectorWidth = 4;
        this->m_maskingIsFree = true;
        this->m_maskBitCount = 1;
@@ -416,6 +424,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
             !strcasecmp(isa, "generic-x8")) {
        this->m_isa = Target::GENERIC;
        this->m_nativeVectorWidth = 8;
        this->m_nativeVectorAlignment = 32;
        this->m_vectorWidth = 8;
        this->m_maskingIsFree = true;
        this->m_maskBitCount = 1;
@@ -427,6 +436,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
             !strcasecmp(isa, "generic-x16")) {
        this->m_isa = Target::GENERIC;
        this->m_nativeVectorWidth = 16;
        this->m_nativeVectorAlignment = 64;
        this->m_vectorWidth = 16;
        this->m_maskingIsFree = true;
        this->m_maskBitCount = 1;
@@ -438,6 +448,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
             !strcasecmp(isa, "generic-x32")) {
        this->m_isa = Target::GENERIC;
        this->m_nativeVectorWidth = 32;
        this->m_nativeVectorAlignment = 64;
        this->m_vectorWidth = 32;
        this->m_maskingIsFree = true;
        this->m_maskBitCount = 1;
@@ -449,6 +460,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
             !strcasecmp(isa, "generic-x64")) {
        this->m_isa = Target::GENERIC;
        this->m_nativeVectorWidth = 64;
        this->m_nativeVectorAlignment = 64;
        this->m_vectorWidth = 64;
        this->m_maskingIsFree = true;
        this->m_maskBitCount = 1;
@@ -460,6 +472,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
             !strcasecmp(isa, "generic-x1")) {
        this->m_isa = Target::GENERIC;
        this->m_nativeVectorWidth = 1;
        this->m_nativeVectorAlignment = 16;
        this->m_vectorWidth = 1;
        this->m_maskingIsFree = false;
        this->m_maskBitCount = 32;
@@ -467,6 +480,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
    else if (!strcasecmp(isa, "avx1-i32x4")) {
        this->m_isa = Target::AVX;
        this->m_nativeVectorWidth = 8;
        this->m_nativeVectorAlignment = 32;
        this->m_dataTypeWidth = 32;
        this->m_vectorWidth = 4;
        this->m_attributes = "+avx,+popcnt,+cmov";
@@ -478,6 +492,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
             !strcasecmp(isa, "avx1-i32x8")) {
        this->m_isa = Target::AVX;
        this->m_nativeVectorWidth = 8;
        this->m_nativeVectorAlignment = 32;
        this->m_dataTypeWidth = 32;
        this->m_vectorWidth = 8;
        this->m_attributes = "+avx,+popcnt,+cmov";
@@ -488,6 +503,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
             !strcasecmp(isa, "avx1-i64x4")) {
        this->m_isa = Target::AVX;
        this->m_nativeVectorWidth = 8;  /* native vector width in terms of floats */
        this->m_nativeVectorAlignment = 32;
        this->m_dataTypeWidth = 64;
        this->m_vectorWidth = 4;
        this->m_attributes = "+avx,+popcnt,+cmov";
@@ -499,6 +515,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
             !strcasecmp(isa, "avx1-i32x16")) {
        this->m_isa = Target::AVX;
        this->m_nativeVectorWidth = 8;
        this->m_nativeVectorAlignment = 32;
        this->m_dataTypeWidth = 32;
        this->m_vectorWidth = 16;
        this->m_attributes = "+avx,+popcnt,+cmov";
@@ -509,6 +526,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
             !strcasecmp(isa, "avx1.1-i32x8")) {
        this->m_isa = Target::AVX11;
        this->m_nativeVectorWidth = 8;
        this->m_nativeVectorAlignment = 32;
        this->m_dataTypeWidth = 32;
        this->m_vectorWidth = 8;
        this->m_attributes = "+avx,+popcnt,+cmov,+f16c"
@@ -530,6 +548,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
             !strcasecmp(isa, "avx1.1-i32x16")) {
        this->m_isa = Target::AVX11;
        this->m_nativeVectorWidth = 8;
        this->m_nativeVectorAlignment = 32;
        this->m_dataTypeWidth = 32;
        this->m_vectorWidth = 16;
        this->m_attributes = "+avx,+popcnt,+cmov,+f16c"
@@ -550,6 +569,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
    else if (!strcasecmp(isa, "avx1.1-i64x4")) {
        this->m_isa = Target::AVX11;
        this->m_nativeVectorWidth = 8;  /* native vector width in terms of floats */
        this->m_nativeVectorAlignment = 32;
        this->m_dataTypeWidth = 64;
        this->m_vectorWidth = 4;
        this->m_attributes = "+avx,+popcnt,+cmov,+f16c"
@@ -571,6 +591,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
             !strcasecmp(isa, "avx2-i32x8")) {
        this->m_isa = Target::AVX2;
        this->m_nativeVectorWidth = 8;
        this->m_nativeVectorAlignment = 32;
        this->m_dataTypeWidth = 32;
        this->m_vectorWidth = 8;
        this->m_attributes = "+avx2,+popcnt,+cmov,+f16c"
@@ -596,6 +617,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
             !strcasecmp(isa, "avx2-i32x16")) {
        this->m_isa = Target::AVX2;
        this->m_nativeVectorWidth = 16;
        this->m_nativeVectorAlignment = 32;
        this->m_dataTypeWidth = 32;
        this->m_vectorWidth = 16;
        this->m_attributes = "+avx2,+popcnt,+cmov,+f16c"
@@ -620,6 +642,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
    else if (!strcasecmp(isa, "avx2-i64x4")) {
        this->m_isa = Target::AVX2;
        this->m_nativeVectorWidth = 8;  /* native vector width in terms of floats */
        this->m_nativeVectorAlignment = 32;
        this->m_dataTypeWidth = 64;
        this->m_vectorWidth = 4;
        this->m_attributes = "+avx2,+popcnt,+cmov,+f16c"
@@ -645,6 +668,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
    else if (!strcasecmp(isa, "neon-i8x16")) {
        this->m_isa = Target::NEON8;
        this->m_nativeVectorWidth = 16;
        this->m_nativeVectorAlignment = 16;
        this->m_dataTypeWidth = 8;
        this->m_vectorWidth = 16;
        this->m_attributes = "+neon,+fp16";
@@ -655,6 +679,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
    else if (!strcasecmp(isa, "neon-i16x8")) {
        this->m_isa = Target::NEON16;
        this->m_nativeVectorWidth = 8;
        this->m_nativeVectorAlignment = 16;
        this->m_dataTypeWidth = 16;
        this->m_vectorWidth = 8;
        this->m_attributes = "+neon,+fp16";
@@ -666,6 +691,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
             !strcasecmp(isa, "neon-i32x4")) {
        this->m_isa = Target::NEON32;
        this->m_nativeVectorWidth = 4;
        this->m_nativeVectorAlignment = 16;
        this->m_dataTypeWidth = 32;
        this->m_vectorWidth = 4;
        this->m_attributes = "+neon,+fp16";
--- a/ispc.h
+++ b/ispc.h
@@ -260,6 +260,8 @@ public:
    int getNativeVectorWidth() const {return m_nativeVectorWidth;}
    int getNativeVectorAlignment() const {return m_nativeVectorAlignment;}
    int getDataTypeWidth() const {return m_dataTypeWidth;}
    int getVectorWidth() const {return m_vectorWidth;}
@@ -332,6 +334,13 @@ private:
        SSE, 8 for AVX, etc.) */
    int m_nativeVectorWidth;
    /** Native vector alignment in bytes. Theoretically this may be derived
        from the vector size, but it's better to manage the alignment directly.
        It allows easier experimenting and better fine tuning for a particular
        platform. This information is primarily used when
        --opt=force-aligned-memory is used. */
    int m_nativeVectorAlignment;
    /** Data type width in bits. Typically it's 32, but could be 8, 16 or 64.
        For generic it's -1, which means undefined. */
    int m_dataTypeWidth;
--- a/llvm_patches/3_3_r193261_bug17631_196261_win_vzeroupper.patch
+++ b/llvm_patches/3_3_r193261_bug17631_196261_win_vzeroupper.patch
@@ -0,0 +1,115 @@
 From b9b016cda57d8afc26a150de7ee329b54a994c85 Mon Sep 17 00:00:00 2001
 From: Michael Liao <michael.hliao@gmail.com>
 Date: Mon, 21 Oct 2013 17:47:58 -0700
 Subject: [PATCH] Fix PR17631
 - Skip instructions added in prolog. For specific targets, prolog may
  insert helper function calls (e.g. _chkstk will be called when
  there're more than 4K bytes allocated on stack). However, these
  helpers don't use/def YMM/XMM registers.
  It also includes a second fix for the problem: r196261+r196391.
 diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp
 index 477f75a..0d37a7d 100644
 --- lib/Target/X86/X86VZeroUpper.cpp
 +++ lib/Target/X86/X86VZeroUpper.cpp
@@ -121,7 +121,7 @@
 }
 static bool clobbersAllYmmRegs(const MachineOperand &MO) {
 -  for (unsigned reg = X86::YMM0; reg < X86::YMM15; ++reg) {
 +  for (unsigned reg = X86::YMM0; reg <= X86::YMM15; ++reg) {
     if (!MO.clobbersPhysReg(reg))
       return false;
   }
@@ -143,6 +143,21 @@
   return false;
 }
 +/// clobbersAnyYmmReg() - Check if any YMM register will be clobbered by this
 +/// instruction.
 +static bool clobbersAnyYmmReg(MachineInstr *MI) {
 +  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
 +    const MachineOperand &MO = MI->getOperand(i);
 +    if (!MO.isRegMask())
 +      continue;
 +    for (unsigned reg = X86::YMM0; reg <= X86::YMM15; ++reg) {
 +      if (MO.clobbersPhysReg(reg))
 +        return true;
 +    }
 +  }
 +  return false;
 +}
 +
 /// runOnMachineFunction - Loop over all of the basic blocks, inserting
 /// vzero upper instructions before function calls.
 bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) {
@@ -226,8 +241,9 @@
   bool BBHasCall = false;
   for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) {
 +    DebugLoc dl = I->getDebugLoc();
     MachineInstr *MI = I;
 -    DebugLoc dl = I->getDebugLoc();
 +
     bool isControlFlow = MI->isCall() || MI->isReturn();
     // Shortcut: don't need to check regular instructions in dirty state.
@@ -246,6 +262,14 @@
     if (!isControlFlow)
       continue;
 +    // If the call won't clobber any YMM register, skip it as well. It usually
 +    // happens on helper function calls (such as '_chkstk', '_ftol2') where
 +    // standard calling convention is not used (RegMask is not used to mark
 +    // register clobbered and register usage (def/imp-def/use) is well-defined
 +    // and explicitly specified.
 +    if (MI->isCall() && !clobbersAnyYmmReg(MI))
 +      continue;
 +
     BBHasCall = true;
     // The VZEROUPPER instruction resets the upper 128 bits of all Intel AVX
 diff --git a/test/CodeGen/X86/pr17631.ll b/test/CodeGen/X86/pr17631.ll
 new file mode 100644
 index 0000000..a572ff2
 --- /dev/null
 +++ test/CodeGen/X86/pr17631.ll
@@ -0,0 +1,34 @@
 +; RUN: llc < %s -mcpu=core-avx-i -mtriple=i386-pc-win32 | FileCheck %s
 +
 +%struct_type = type { [64 x <8 x float>], <8 x float> }
 +
 +; Function Attrs: nounwind readnone
 +declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>)
 +
 +; Function Attrs: nounwind
 +define i32 @equal(<8 x i32> %A) {
 +allocas:
 +  %first_alloc  = alloca [64 x <8 x i32>]
 +  %second_alloc = alloca %struct_type
 + 
 +  %A1 = bitcast <8 x i32> %A to <8 x float>
 +  %A2 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %A1)
 +  ret i32 %A2
 +}
 +
 +; CHECK: equal
 +; CHECK-NOT: vzeroupper
 +; CHECK: _chkstk
 +; CHECK: ret
 +
 +define <8 x float> @foo(<8 x float> %y, i64* %p, double %x) {
 +  %i = fptoui double %x to i64
 +  store i64 %i, i64* %p
 +  %ret = fadd <8 x float> %y, %y
 +  ret <8 x float> %ret
 +}
 +
 +; CHECK: foo
 +; CHECK-NOT: vzeroupper
 +; CHECK: _ftol2
 +; CHECK: ret
 -- 
 1.8.1.2
--- a/llvm_patches/3_3_r193261_bug17631_win_vzeroupper.patch
+++ b/llvm_patches/3_3_r193261_bug17631_win_vzeroupper.patch
@@ -1,69 +0,0 @@
 From b9b016cda57d8afc26a150de7ee329b54a994c85 Mon Sep 17 00:00:00 2001
 From: Michael Liao <michael.hliao@gmail.com>
 Date: Mon, 21 Oct 2013 17:47:58 -0700
 Subject: [PATCH] Fix PR17631
 - Skip instructions added in prolog. For specific targets, prolog may
  insert helper function calls (e.g. _chkstk will be called when
  there're more than 4K bytes allocated on stack). However, these
  helpers don't use/def YMM/XMM registers.
 ---
 lib/Target/X86/X86VZeroUpper.cpp | 11 ++++++++++-
 test/CodeGen/X86/pr17631.ll      | 22 ++++++++++++++++++++++
 2 files changed, 32 insertions(+), 1 deletion(-)
 create mode 100644 test/CodeGen/X86/pr17631.ll
 diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp
 index 477f75a..0d37a7d 100644
 --- lib/Target/X86/X86VZeroUpper.cpp
 +++ lib/Target/X86/X86VZeroUpper.cpp
@@ -231,8 +231,17 @@ bool VZeroUpperInserter::processBasicBlock(MachineFunction &MF,
   bool BBHasCall = false;
   for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) {
 -    MachineInstr *MI = I;
     DebugLoc dl = I->getDebugLoc();
 +    MachineInstr *MI = I;
 +
 +    // Don't need to check instructions added in prolog.
 +    // In prolog, special function calls may be added for specific targets
 +    // (e.g. on Windows, a prolog helper '_chkstk' is called when the local
 +    // variables exceed 4K bytes on stack.) These helpers won't use/def YMM/XMM
 +    // registers.
 +    if (MI->getFlag(MachineInstr::FrameSetup))
 +      continue;
 +
     bool isControlFlow = MI->isCall() || MI->isReturn();
     // Shortcut: don't need to check regular instructions in dirty state.
 diff --git a/test/CodeGen/X86/pr17631.ll b/test/CodeGen/X86/pr17631.ll
 new file mode 100644
 index 0000000..a572ff2
 --- /dev/null
 +++ test/CodeGen/X86/pr17631.ll
@@ -0,0 +1,22 @@
 +; RUN: llc < %s -mcpu=core-avx-i -mtriple=i386-pc-win32 | FileCheck %s
 + 
 +%struct_type = type { [64 x <8 x float>], <8 x float> }
 + 
 +; Function Attrs: nounwind readnone
 +declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>)
 + 
 +; Function Attrs: nounwind
 +define i32 @equal(<8 x i32> %A) {
 +allocas:
 +  %first_alloc  = alloca [64 x <8 x i32>]
 +  %second_alloc = alloca %struct_type
 + 
 +  %A1 = bitcast <8 x i32> %A to <8 x float>
 +  %A2 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %A1)
 +  ret i32 %A2
 +}
 +
 +; CHECK: equal
 +; CHECK-NOT: vzeroupper
 +; CHECK: _chkstk
 +; CHECK: ret
 -- 
 1.8.1.2
--- a/opt.cpp
+++ b/opt.cpp
@@ -904,7 +904,7 @@ IntrinsicsOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
                    lCopyMetadata(castPtr, callInst);
                    int align;
                    if (g->opt.forceAlignedMemory)
-                        align = 0;
+                        align = g->target->getNativeVectorAlignment();
                    else
                        align = callInst->getCalledFunction() == avxMaskedLoad32 ? 4 : 8;
                    name = LLVMGetName(callInst->getArgOperand(0), "_load");
@@ -946,7 +946,7 @@ IntrinsicsOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
                        new llvm::StoreInst(rvalue, castPtr, (llvm::Instruction *)NULL);
                    int align;
                    if (g->opt.forceAlignedMemory)
-                        align = 0;
+                        align = g->target->getNativeVectorAlignment();
                    else
                        align = callInst->getCalledFunction() == avxMaskedStore32 ? 4 : 8;
                    storeInst->setAlignment(align);
@@ -2758,7 +2758,8 @@ lImproveMaskedStore(llvm::CallInst *callInst) {
        lCopyMetadata(lvalue, callInst);
        llvm::Instruction *store =
            new llvm::StoreInst(rvalue, lvalue, false /* not volatile */,
-                                g->opt.forceAlignedMemory ? 0 : info->align);
+                                g->opt.forceAlignedMemory ?
                                    g->target->getNativeVectorAlignment() : info->align);
        lCopyMetadata(store, callInst);
        llvm::ReplaceInstWithInst(callInst, store);
        return true;
@@ -2821,7 +2822,8 @@ lImproveMaskedLoad(llvm::CallInst *callInst,
                                    callInst);
        llvm::Instruction *load =
            new llvm::LoadInst(ptr, callInst->getName(), false /* not volatile */,
-                               g->opt.forceAlignedMemory ? 0 : info->align,
+                               g->opt.forceAlignedMemory ?
                                   g->target->getNativeVectorAlignment() : info->align,
                               (llvm::Instruction *)NULL);
        lCopyMetadata(load, callInst);
        llvm::ReplaceInstWithInst(callInst, load);
@@ -3226,6 +3228,9 @@ lEmitLoads(llvm::Value *basePtr, std::vector<CoalescedLoadOp> &loadOps,
        }
        case 4: {
            // 4-wide vector load
            if (g->opt.forceAlignedMemory) {
                align = g->target->getNativeVectorAlignment();
            }
            llvm::VectorType *vt =
                llvm::VectorType::get(LLVMTypes::Int32Type, 4);
            loadOps[i].load = lGEPAndLoad(basePtr, start, align,
@@ -3234,6 +3239,9 @@ lEmitLoads(llvm::Value *basePtr, std::vector<CoalescedLoadOp> &loadOps,
        }
        case 8: {
            // 8-wide vector load
            if (g->opt.forceAlignedMemory) {
                align = g->target->getNativeVectorAlignment();
            }
            llvm::VectorType *vt =
                llvm::VectorType::get(LLVMTypes::Int32Type, 8);
            loadOps[i].load = lGEPAndLoad(basePtr, start, align,
--- a/perf.ini
+++ b/perf.ini
@@ -10,7 +10,7 @@
 %****************************************************************************************************
 AOBench
 aobench
-10 512 512
+3 2048 2048
 #***
 Deferred Shading
 deferred
@@ -41,7 +41,7 @@ options
 #***
 Ray Tracer
 rt
-sponza
+sponza --scale=6.0
 #***
 3D Stencil
 stencil