diff --git a/ctx.cpp b/ctx.cpp
index c1a7e61a..e5c60363 100644
--- a/ctx.cpp
+++ b/ctx.cpp
@@ -316,7 +316,11 @@ FunctionEmitContext::FunctionEmitContext(Function *func, Symbol *funSym,
         llvm::BasicBlock *offBB = llvm::BasicBlock::Create(*g->ctx, "entry",
                                                            (llvm::Function *)offFunc, 0);
-        new llvm::StoreInst(LLVMMaskAllOff, globalAllOnMaskPtr, offBB);
+        llvm::StoreInst *inst =
+            new llvm::StoreInst(LLVMMaskAllOff, globalAllOnMaskPtr, offBB);
+        if (g->opt.forceAlignedMemory) {
+            inst->setAlignment(g->target->getNativeVectorAlignment());
+        }
         llvm::ReturnInst::Create(*g->ctx, offBB);
     }
@@ -2437,7 +2441,13 @@ FunctionEmitContext::LoadInst(llvm::Value *ptr, const char *name) {
     if (name == NULL)
         name = LLVMGetName(ptr, "_load");
 
-    llvm::Instruction *inst = new llvm::LoadInst(ptr, name, bblock);
+    llvm::LoadInst *inst = new llvm::LoadInst(ptr, name, bblock);
+
+    if (g->opt.forceAlignedMemory &&
+        llvm::dyn_cast<llvm::VectorType>(pt->getElementType())) {
+        inst->setAlignment(g->target->getNativeVectorAlignment());
+    }
+
     AddDebugPos(inst);
     return inst;
 }
@@ -2719,7 +2729,7 @@ FunctionEmitContext::AllocaInst(llvm::Type *llvmType,
         inst = new llvm::AllocaInst(llvmType, name ? name : "", bblock);
 
     // If no alignment was specified but we have an array of a uniform
-    // type, then align it to 4 * the native vector width; it's not
+    // type, then align it to the native vector alignment; it's not
     // unlikely that this array will be loaded into varying variables with
     // what will be aligned accesses if the uniform -> varying load is done
    // in regular chunks.
@@ -2727,7 +2737,7 @@ FunctionEmitContext::AllocaInst(llvm::Type *llvmType,
         llvm::dyn_cast<llvm::ArrayType>(llvmType);
     if (align == 0 && arrayType != NULL &&
         !llvm::isa<llvm::VectorType>(arrayType->getElementType()))
-        align = 4 * g->target->getNativeVectorWidth();
+        align = g->target->getNativeVectorAlignment();
 
     if (align != 0)
         inst->setAlignment(align);
@@ -2986,7 +2996,17 @@ FunctionEmitContext::StoreInst(llvm::Value *value, llvm::Value *ptr) {
         return;
     }
 
-    llvm::Instruction *inst = new llvm::StoreInst(value, ptr, bblock);
+    llvm::PointerType *pt =
+        llvm::dyn_cast<llvm::PointerType>(ptr->getType());
+    AssertPos(currentPos, pt != NULL);
+
+    llvm::StoreInst *inst = new llvm::StoreInst(value, ptr, bblock);
+
+    if (g->opt.forceAlignedMemory &&
+        llvm::dyn_cast<llvm::VectorType>(pt->getElementType())) {
+        inst->setAlignment(g->target->getNativeVectorAlignment());
+    }
+
     AddDebugPos(inst);
 }
diff --git a/examples/mandelbrot_tasks/mandelbrot_tasks.cpp b/examples/mandelbrot_tasks/mandelbrot_tasks.cpp
index 698daf0f..1c4d2ca5 100644
--- a/examples/mandelbrot_tasks/mandelbrot_tasks.cpp
+++ b/examples/mandelbrot_tasks/mandelbrot_tasks.cpp
@@ -74,8 +74,8 @@ static void usage() {
 }
 
 int main(int argc, char *argv[]) {
-    unsigned int width = 1536;
-    unsigned int height = 1024;
+    unsigned int width = 1536 * 8;
+    unsigned int height = 1024 * 8;
     float x0 = -2;
     float x1 = 1;
     float y0 = -1;
diff --git a/examples/noise/noise.cpp b/examples/noise/noise.cpp
index 123f98c7..86b4f761 100644
--- a/examples/noise/noise.cpp
+++ b/examples/noise/noise.cpp
@@ -66,8 +66,8 @@ writePPM(float *buf, int width, int height, const char *fn) {
 
 
 int main() {
-    unsigned int width = 768;
-    unsigned int height = 768;
+    unsigned int width = 768 * 4;
+    unsigned int height = 768 * 4;
     float x0 = -10;
     float x1 = 10;
     float y0 = -10;
diff --git a/examples/stencil/stencil.cpp b/examples/stencil/stencil.cpp
index 593d901f..9cd12674 100644
--- a/examples/stencil/stencil.cpp
+++ b/examples/stencil/stencil.cpp
@@ -67,7 +67,7 @@ void InitData(int Nx, int Ny, int Nz, float *A[2], float *vsq) {
 int main() {
-    int Nx = 256, Ny = 256, Nz = 256;
+    int Nx = 256 * 2, Ny = 256 * 2, Nz = 256 * 2;
     int width = 4;
     float *Aserial[2], *Aispc[2];
     Aserial[0] = new float [Nx * Ny * Nz];
diff --git a/ispc.cpp b/ispc.cpp
index 36d31580..b1790dc3 100644
--- a/ispc.cpp
+++ b/ispc.cpp
@@ -191,6 +191,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
     m_tf_attributes(NULL),
 #endif
     m_nativeVectorWidth(-1),
+    m_nativeVectorAlignment(-1),
     m_dataTypeWidth(-1),
     m_vectorWidth(-1),
     m_generatePIC(pic),
@@ -309,6 +310,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
              !strcasecmp(isa, "sse2-i32x4")) {
         this->m_isa = Target::SSE2;
         this->m_nativeVectorWidth = 4;
+        this->m_nativeVectorAlignment = 16;
         this->m_dataTypeWidth = 32;
         this->m_vectorWidth = 4;
         this->m_attributes = "+sse,+sse2,-sse3,-sse4a,-ssse3,-popcnt"
@@ -325,6 +327,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
              !strcasecmp(isa, "sse2-i32x8")) {
         this->m_isa = Target::SSE2;
         this->m_nativeVectorWidth = 4;
+        this->m_nativeVectorAlignment = 16;
         this->m_dataTypeWidth = 32;
         this->m_vectorWidth = 8;
         this->m_attributes = "+sse,+sse2,-sse3,-sse4a,-ssse3,-popcnt"
@@ -341,6 +344,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
              !strcasecmp(isa, "sse4-i32x4")) {
         this->m_isa = Target::SSE4;
         this->m_nativeVectorWidth = 4;
+        this->m_nativeVectorAlignment = 16;
         this->m_dataTypeWidth = 32;
         this->m_vectorWidth = 4;
         // TODO: why not sse42 and popcnt?
@@ -359,6 +363,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
              !strcasecmp(isa, "sse4-i32x8")) {
         this->m_isa = Target::SSE4;
         this->m_nativeVectorWidth = 4;
+        this->m_nativeVectorAlignment = 16;
         this->m_dataTypeWidth = 32;
         this->m_vectorWidth = 8;
         this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov"
@@ -374,6 +379,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
     else if (!strcasecmp(isa, "sse4-i8x16")) {
         this->m_isa = Target::SSE4;
         this->m_nativeVectorWidth = 16;
+        this->m_nativeVectorAlignment = 16;
         this->m_dataTypeWidth = 8;
         this->m_vectorWidth = 16;
         this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov"
@@ -389,6 +395,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
     else if (!strcasecmp(isa, "sse4-i16x8")) {
         this->m_isa = Target::SSE4;
         this->m_nativeVectorWidth = 8;
+        this->m_nativeVectorAlignment = 16;
         this->m_dataTypeWidth = 16;
         this->m_vectorWidth = 8;
         this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov"
@@ -405,6 +412,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
              !strcasecmp(isa, "generic-x4")) {
         this->m_isa = Target::GENERIC;
         this->m_nativeVectorWidth = 4;
+        this->m_nativeVectorAlignment = 16;
         this->m_vectorWidth = 4;
         this->m_maskingIsFree = true;
         this->m_maskBitCount = 1;
@@ -416,6 +424,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
              !strcasecmp(isa, "generic-x8")) {
         this->m_isa = Target::GENERIC;
         this->m_nativeVectorWidth = 8;
+        this->m_nativeVectorAlignment = 32;
         this->m_vectorWidth = 8;
         this->m_maskingIsFree = true;
         this->m_maskBitCount = 1;
@@ -427,6 +436,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
              !strcasecmp(isa, "generic-x16")) {
         this->m_isa = Target::GENERIC;
         this->m_nativeVectorWidth = 16;
+        this->m_nativeVectorAlignment = 64;
         this->m_vectorWidth = 16;
         this->m_maskingIsFree = true;
         this->m_maskBitCount = 1;
@@ -438,6 +448,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
              !strcasecmp(isa, "generic-x32")) {
         this->m_isa = Target::GENERIC;
         this->m_nativeVectorWidth = 32;
+        this->m_nativeVectorAlignment = 64;
         this->m_vectorWidth = 32;
         this->m_maskingIsFree = true;
         this->m_maskBitCount = 1;
@@ -449,6 +460,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
              !strcasecmp(isa, "generic-x64")) {
         this->m_isa = Target::GENERIC;
         this->m_nativeVectorWidth = 64;
+        this->m_nativeVectorAlignment = 64;
         this->m_vectorWidth = 64;
         this->m_maskingIsFree = true;
         this->m_maskBitCount = 1;
@@ -460,6 +472,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
              !strcasecmp(isa, "generic-x1")) {
         this->m_isa = Target::GENERIC;
         this->m_nativeVectorWidth = 1;
+        this->m_nativeVectorAlignment = 16;
         this->m_vectorWidth = 1;
         this->m_maskingIsFree = false;
         this->m_maskBitCount = 32;
@@ -467,6 +480,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
     else if (!strcasecmp(isa, "avx1-i32x4")) {
         this->m_isa = Target::AVX;
         this->m_nativeVectorWidth = 8;
+        this->m_nativeVectorAlignment = 32;
         this->m_dataTypeWidth = 32;
         this->m_vectorWidth = 4;
         this->m_attributes = "+avx,+popcnt,+cmov";
@@ -478,6 +492,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
              !strcasecmp(isa, "avx1-i32x8")) {
         this->m_isa = Target::AVX;
         this->m_nativeVectorWidth = 8;
+        this->m_nativeVectorAlignment = 32;
         this->m_dataTypeWidth = 32;
         this->m_vectorWidth = 8;
         this->m_attributes = "+avx,+popcnt,+cmov";
@@ -488,6 +503,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
              !strcasecmp(isa, "avx1-i64x4")) {
         this->m_isa = Target::AVX;
         this->m_nativeVectorWidth = 8; /* native vector width in terms of floats */
+        this->m_nativeVectorAlignment = 32;
         this->m_dataTypeWidth = 64;
         this->m_vectorWidth = 4;
         this->m_attributes = "+avx,+popcnt,+cmov";
@@ -499,6 +515,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
              !strcasecmp(isa, "avx1-i32x16")) {
         this->m_isa = Target::AVX;
         this->m_nativeVectorWidth = 8;
+        this->m_nativeVectorAlignment = 32;
         this->m_dataTypeWidth = 32;
         this->m_vectorWidth = 16;
         this->m_attributes = "+avx,+popcnt,+cmov";
@@ -509,6 +526,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
              !strcasecmp(isa, "avx1.1-i32x8")) {
         this->m_isa = Target::AVX11;
         this->m_nativeVectorWidth = 8;
+        this->m_nativeVectorAlignment = 32;
         this->m_dataTypeWidth = 32;
         this->m_vectorWidth = 8;
         this->m_attributes = "+avx,+popcnt,+cmov,+f16c"
@@ -530,6 +548,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
              !strcasecmp(isa, "avx1.1-i32x16")) {
         this->m_isa = Target::AVX11;
         this->m_nativeVectorWidth = 8;
+        this->m_nativeVectorAlignment = 32;
         this->m_dataTypeWidth = 32;
         this->m_vectorWidth = 16;
         this->m_attributes = "+avx,+popcnt,+cmov,+f16c"
@@ -550,6 +569,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
     else if (!strcasecmp(isa, "avx1.1-i64x4")) {
         this->m_isa = Target::AVX11;
         this->m_nativeVectorWidth = 8; /* native vector width in terms of floats */
+        this->m_nativeVectorAlignment = 32;
         this->m_dataTypeWidth = 64;
         this->m_vectorWidth = 4;
         this->m_attributes = "+avx,+popcnt,+cmov,+f16c"
@@ -571,6 +591,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
              !strcasecmp(isa, "avx2-i32x8")) {
         this->m_isa = Target::AVX2;
         this->m_nativeVectorWidth = 8;
+        this->m_nativeVectorAlignment = 32;
         this->m_dataTypeWidth = 32;
         this->m_vectorWidth = 8;
         this->m_attributes = "+avx2,+popcnt,+cmov,+f16c"
@@ -596,6 +617,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
              !strcasecmp(isa, "avx2-i32x16")) {
         this->m_isa = Target::AVX2;
         this->m_nativeVectorWidth = 16;
+        this->m_nativeVectorAlignment = 32;
         this->m_dataTypeWidth = 32;
         this->m_vectorWidth = 16;
         this->m_attributes = "+avx2,+popcnt,+cmov,+f16c"
@@ -620,6 +642,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
     else if (!strcasecmp(isa, "avx2-i64x4")) {
         this->m_isa = Target::AVX2;
         this->m_nativeVectorWidth = 8; /* native vector width in terms of floats */
+        this->m_nativeVectorAlignment = 32;
         this->m_dataTypeWidth = 64;
         this->m_vectorWidth = 4;
         this->m_attributes = "+avx2,+popcnt,+cmov,+f16c"
@@ -645,6 +668,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
     else if (!strcasecmp(isa, "neon-i8x16")) {
         this->m_isa = Target::NEON8;
         this->m_nativeVectorWidth = 16;
+        this->m_nativeVectorAlignment = 16;
         this->m_dataTypeWidth = 8;
         this->m_vectorWidth = 16;
         this->m_attributes = "+neon,+fp16";
@@ -655,6 +679,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
     else if (!strcasecmp(isa, "neon-i16x8")) {
         this->m_isa = Target::NEON16;
         this->m_nativeVectorWidth = 8;
+        this->m_nativeVectorAlignment = 16;
         this->m_dataTypeWidth = 16;
         this->m_vectorWidth = 8;
         this->m_attributes = "+neon,+fp16";
@@ -666,6 +691,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
              !strcasecmp(isa, "neon-i32x4")) {
         this->m_isa = Target::NEON32;
         this->m_nativeVectorWidth = 4;
+        this->m_nativeVectorAlignment = 16;
         this->m_dataTypeWidth = 32;
         this->m_vectorWidth = 4;
         this->m_attributes = "+neon,+fp16";
diff --git a/ispc.h b/ispc.h
index b319d656..4b333861 100644
--- a/ispc.h
+++ b/ispc.h
@@ -260,6 +260,8 @@ public:
 
     int getNativeVectorWidth() const {return m_nativeVectorWidth;}
 
+    int getNativeVectorAlignment() const {return m_nativeVectorAlignment;}
+
     int getDataTypeWidth() const {return m_dataTypeWidth;}
 
     int getVectorWidth() const {return m_vectorWidth;}
@@ -332,6 +334,13 @@ private:
         SSE, 8 for AVX, etc.) */
     int m_nativeVectorWidth;
 
+    /** Native vector alignment in bytes. In principle this could be derived
+        from the vector size, but it's better to manage the alignment
+        directly; that makes experimentation and per-platform fine-tuning
+        easier. This information is primarily used when
+        --opt=force-aligned-memory is used. */
+    int m_nativeVectorAlignment;
+
     /** Data type with in bits. Typically it's 32, but could be 8, 16 or 64.
         For generic it's -1, which means undefined. */
     int m_dataTypeWidth;
diff --git a/llvm_patches/3_3_r193261_bug17631_196261_win_vzeroupper.patch b/llvm_patches/3_3_r193261_bug17631_196261_win_vzeroupper.patch
new file mode 100644
index 00000000..8f0a790b
--- /dev/null
+++ b/llvm_patches/3_3_r193261_bug17631_196261_win_vzeroupper.patch
@@ -0,0 +1,115 @@
+From b9b016cda57d8afc26a150de7ee329b54a994c85 Mon Sep 17 00:00:00 2001
+From: Michael Liao
+Date: Mon, 21 Oct 2013 17:47:58 -0700
+Subject: [PATCH] Fix PR17631
+
+- Skip instructions added in prolog. For specific targets, prolog may
+  insert helper function calls (e.g. _chkstk will be called when
+  there're more than 4K bytes allocated on stack). However, these
+  helpers don't use/def YMM/XMM registers.
+  It also includes a second fix for the problem: r196261+r196391.
+
+diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp
+index 477f75a..0d37a7d 100644
+--- lib/Target/X86/X86VZeroUpper.cpp
++++ lib/Target/X86/X86VZeroUpper.cpp
+@@ -121,7 +121,7 @@
+ }
+ 
+ static bool clobbersAllYmmRegs(const MachineOperand &MO) {
+-  for (unsigned reg = X86::YMM0; reg < X86::YMM15; ++reg) {
++  for (unsigned reg = X86::YMM0; reg <= X86::YMM15; ++reg) {
+     if (!MO.clobbersPhysReg(reg))
+       return false;
+   }
+@@ -143,6 +143,21 @@
+   return false;
+ }
+ 
++/// clobbersAnyYmmReg() - Check if any YMM register will be clobbered by this
++/// instruction.
++static bool clobbersAnyYmmReg(MachineInstr *MI) {
++  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
++    const MachineOperand &MO = MI->getOperand(i);
++    if (!MO.isRegMask())
++      continue;
++    for (unsigned reg = X86::YMM0; reg <= X86::YMM15; ++reg) {
++      if (MO.clobbersPhysReg(reg))
++        return true;
++    }
++  }
++  return false;
++}
++
+ /// runOnMachineFunction - Loop over all of the basic blocks, inserting
+ /// vzero upper instructions before function calls.
+ bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) {
+@@ -226,8 +241,9 @@
+   bool BBHasCall = false;
+ 
+   for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) {
++    DebugLoc dl = I->getDebugLoc();
+     MachineInstr *MI = I;
+-    DebugLoc dl = I->getDebugLoc();
++
+     bool isControlFlow = MI->isCall() || MI->isReturn();
+ 
+     // Shortcut: don't need to check regular instructions in dirty state.
+@@ -246,6 +262,14 @@
+     if (!isControlFlow)
+       continue;
+ 
++    // If the call won't clobber any YMM register, skip it as well. It usually
++    // happens on helper function calls (such as '_chkstk', '_ftol2') where
++    // standard calling convention is not used (RegMask is not used to mark
++    // register clobbered and register usage (def/imp-def/use) is well-dfined
++    // and explicitly specified.
++    if (MI->isCall() && !clobbersAnyYmmReg(MI))
++      continue;
++
+     BBHasCall = true;
+ 
+     // The VZEROUPPER instruction resets the upper 128 bits of all Intel AVX
+diff --git a/test/CodeGen/X86/pr17631.ll b/test/CodeGen/X86/pr17631.ll
+new file mode 100644
+index 0000000..a572ff2
+--- /dev/null
++++ test/CodeGen/X86/pr17631.ll
+@@ -0,0 +1,34 @@
++; RUN: llc < %s -mcpu=core-avx-i -mtriple=i386-pc-win32 | FileCheck %s
++
++%struct_type = type { [64 x <8 x float>], <8 x float> }
++
++; Function Attrs: nounwind readnone
++declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>)
++
++; Function Attrs: nounwind
++define i32 @equal(<8 x i32> %A) {
++allocas:
++  %first_alloc = alloca [64 x <8 x i32>]
++  %second_alloc = alloca %struct_type
++
++  %A1 = bitcast <8 x i32> %A to <8 x float>
++  %A2 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %A1)
++  ret i32 %A2
++}
++
++; CHECK: equal
++; CHECK-NOT: vzeroupper
++; CHECK: _chkstk
++; CHECK: ret
++
++define <8 x float> @foo(<8 x float> %y, i64* %p, double %x) {
++  %i = fptoui double %x to i64
++  store i64 %i, i64* %p
++  %ret = fadd <8 x float> %y, %y
++  ret <8 x float> %ret
++}
++
++; CHECK: foo
++; CHECK-NOT: vzeroupper
++; CHECK: _ftol2
++; CHECK: ret
+-- 
+1.8.1.2
+
diff --git a/llvm_patches/3_3_r193261_bug17631_win_vzeroupper.patch b/llvm_patches/3_3_r193261_bug17631_win_vzeroupper.patch
deleted file mode 100644
index b6abb1d3..00000000
--- a/llvm_patches/3_3_r193261_bug17631_win_vzeroupper.patch
+++ /dev/null
@@ -1,69 +0,0 @@
-From b9b016cda57d8afc26a150de7ee329b54a994c85 Mon Sep 17 00:00:00 2001
-From: Michael Liao
-Date: Mon, 21 Oct 2013 17:47:58 -0700
-Subject: [PATCH] Fix PR17631
-
-- Skip instructions added in prolog. For specific targets, prolog may
-  insert helper function calls (e.g. _chkstk will be called when
-  there're more than 4K bytes allocated on stack). However, these
-  helpers don't use/def YMM/XMM registers.
----
- lib/Target/X86/X86VZeroUpper.cpp | 11 ++++++++++-
- test/CodeGen/X86/pr17631.ll      | 22 ++++++++++++++++++++++
- 2 files changed, 32 insertions(+), 1 deletion(-)
- create mode 100644 test/CodeGen/X86/pr17631.ll
-
-diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp
-index 477f75a..0d37a7d 100644
---- lib/Target/X86/X86VZeroUpper.cpp
-+++ lib/Target/X86/X86VZeroUpper.cpp
-@@ -231,8 +231,17 @@ bool VZeroUpperInserter::processBasicBlock(MachineFunction &MF,
-   bool BBHasCall = false;
- 
-   for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) {
--    MachineInstr *MI = I;
-     DebugLoc dl = I->getDebugLoc();
-+    MachineInstr *MI = I;
-+
-+    // Don't need to check instructions added in prolog.
-+    // In prolog, special function calls may be added for specific targets
-+    // (e.g. on Windows, a prolog helper '_chkstk' is called when the local
-+    // variables exceed 4K bytes on stack.) These helpers won't use/def YMM/XMM
-+    // registers.
-+    if (MI->getFlag(MachineInstr::FrameSetup))
-+      continue;
-+
-     bool isControlFlow = MI->isCall() || MI->isReturn();
- 
-     // Shortcut: don't need to check regular instructions in dirty state.
-diff --git a/test/CodeGen/X86/pr17631.ll b/test/CodeGen/X86/pr17631.ll
-new file mode 100644
-index 0000000..a572ff2
---- /dev/null
-+++ test/CodeGen/X86/pr17631.ll
-@@ -0,0 +1,22 @@
-+; RUN: llc < %s -mcpu=core-avx-i -mtriple=i386-pc-win32 | FileCheck %s
-+
-+%struct_type = type { [64 x <8 x float>], <8 x float> }
-+
-+; Function Attrs: nounwind readnone
-+declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>)
-+
-+; Function Attrs: nounwind
-+define i32 @equal(<8 x i32> %A) {
-+allocas:
-+  %first_alloc = alloca [64 x <8 x i32>]
-+  %second_alloc = alloca %struct_type
-+
-+  %A1 = bitcast <8 x i32> %A to <8 x float>
-+  %A2 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %A1)
-+  ret i32 %A2
-+}
-+
-+; CHECK: equal
-+; CHECK-NOT: vzeroupper
-+; CHECK: _chkstk
-+; CHECK: ret
--- 
-1.8.1.2
-
diff --git a/opt.cpp b/opt.cpp
index 3e320b4b..9059c746 100644
--- a/opt.cpp
+++ b/opt.cpp
@@ -904,7 +904,7 @@ IntrinsicsOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
             lCopyMetadata(castPtr, callInst);
             int align;
             if (g->opt.forceAlignedMemory)
-                align = 0;
+                align = g->target->getNativeVectorAlignment();
             else
                 align = callInst->getCalledFunction() == avxMaskedLoad32 ? 4 : 8;
             name = LLVMGetName(callInst->getArgOperand(0), "_load");
@@ -946,7 +946,7 @@ IntrinsicsOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
                 new llvm::StoreInst(rvalue, castPtr, (llvm::Instruction *)NULL);
             int align;
             if (g->opt.forceAlignedMemory)
-                align = 0;
+                align = g->target->getNativeVectorAlignment();
             else
                 align = callInst->getCalledFunction() == avxMaskedStore32 ? 4 : 8;
             storeInst->setAlignment(align);
@@ -2758,7 +2758,8 @@ lImproveMaskedStore(llvm::CallInst *callInst) {
         lCopyMetadata(lvalue, callInst);
         llvm::Instruction *store =
             new llvm::StoreInst(rvalue, lvalue, false /* not volatile */,
-                                g->opt.forceAlignedMemory ? 0 : info->align);
+                                g->opt.forceAlignedMemory ?
+                                g->target->getNativeVectorAlignment() : info->align);
         lCopyMetadata(store, callInst);
         llvm::ReplaceInstWithInst(callInst, store);
         return true;
@@ -2821,7 +2822,8 @@ lImproveMaskedLoad(llvm::CallInst *callInst,
                                           callInst);
         llvm::Instruction *load =
             new llvm::LoadInst(ptr, callInst->getName(), false /* not volatile */,
-                               g->opt.forceAlignedMemory ? 0 : info->align,
+                               g->opt.forceAlignedMemory ?
+                               g->target->getNativeVectorAlignment() : info->align,
                                (llvm::Instruction *)NULL);
         lCopyMetadata(load, callInst);
         llvm::ReplaceInstWithInst(callInst, load);
@@ -3226,6 +3228,9 @@ lEmitLoads(llvm::Value *basePtr, std::vector<CoalescedLoadOp> &loadOps,
     }
     case 4: {
         // 4-wide vector load
+        if (g->opt.forceAlignedMemory) {
+            align = g->target->getNativeVectorAlignment();
+        }
         llvm::VectorType *vt =
             llvm::VectorType::get(LLVMTypes::Int32Type, 4);
         loadOps[i].load = lGEPAndLoad(basePtr, start, align,
@@ -3234,6 +3239,9 @@ lEmitLoads(llvm::Value *basePtr, std::vector<CoalescedLoadOp> &loadOps,
     }
     case 8: {
         // 8-wide vector load
+        if (g->opt.forceAlignedMemory) {
+            align = g->target->getNativeVectorAlignment();
+        }
         llvm::VectorType *vt =
             llvm::VectorType::get(LLVMTypes::Int32Type, 8);
         loadOps[i].load = lGEPAndLoad(basePtr, start, align,
diff --git a/perf.ini b/perf.ini
index 249c25f4..b44a2853 100755
--- a/perf.ini
+++ b/perf.ini
@@ -10,7 +10,7 @@
 %****************************************************************************************************
 AOBench
 aobench
-10 512 512
+3 2048 2048
 #***
 Deferred Shading
 deferred
@@ -41,7 +41,7 @@
 options
 #***
 Ray Tracer
 rt
-sponza
+sponza --scale=6.0
 #***
 3D Stencil
 stencil
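Every compiler-side hunk above applies the same rule: when --opt=force-aligned-memory is enabled, vector loads and stores are tagged with the target's native vector alignment (now stored per target as m_nativeVectorAlignment) instead of the conservative default the call site used before. The standalone C++ sketch below only illustrates that selection rule; TargetInfo and Opts are simplified, hypothetical stand-ins for ispc's Target and opt structures, not the actual classes touched by this diff.

#include <cassert>

// Simplified stand-ins for ispc's Target and opt structures (illustrative only).
struct TargetInfo {
    int nativeVectorWidth;      // lanes, e.g. 8 for AVX
    int nativeVectorAlignment;  // bytes, e.g. 32 for AVX; no longer assumed to be 4 * width
};

struct Opts {
    bool forceAlignedMemory;    // corresponds to --opt=force-aligned-memory
};

// Alignment chosen for a vector load/store: the target's native vector
// alignment when aligned memory is forced, otherwise the conservative
// default the call site already used (e.g. 4 or 8 bytes for masked ops).
static int chooseAlignment(const Opts &opt, const TargetInfo &target,
                           int defaultAlign) {
    return opt.forceAlignedMemory ? target.nativeVectorAlignment : defaultAlign;
}

int main() {
    TargetInfo avx = { 8, 32 };
    Opts forced = { true };
    Opts relaxed = { false };
    assert(chooseAlignment(forced, avx, 4) == 32);   // forced: native vector alignment
    assert(chooseAlignment(relaxed, avx, 4) == 4);   // default path unchanged
    return 0;
}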