Vsevolod Livinskij
2013-12-06 17:22:19 +04:00
10 changed files with 194 additions and 85 deletions

ctx.cpp

@@ -316,7 +316,11 @@ FunctionEmitContext::FunctionEmitContext(Function *func, Symbol *funSym,
llvm::BasicBlock *offBB =
llvm::BasicBlock::Create(*g->ctx, "entry",
(llvm::Function *)offFunc, 0);
-new llvm::StoreInst(LLVMMaskAllOff, globalAllOnMaskPtr, offBB);
+llvm::StoreInst *inst =
+    new llvm::StoreInst(LLVMMaskAllOff, globalAllOnMaskPtr, offBB);
+if (g->opt.forceAlignedMemory) {
+    inst->setAlignment(g->target->getNativeVectorAlignment());
+}
llvm::ReturnInst::Create(*g->ctx, offBB);
}
@@ -2437,7 +2441,13 @@ FunctionEmitContext::LoadInst(llvm::Value *ptr, const char *name) {
if (name == NULL)
name = LLVMGetName(ptr, "_load");
-llvm::Instruction *inst = new llvm::LoadInst(ptr, name, bblock);
+llvm::LoadInst *inst = new llvm::LoadInst(ptr, name, bblock);
+if (g->opt.forceAlignedMemory &&
+    llvm::dyn_cast<llvm::VectorType>(pt->getElementType())) {
+    inst->setAlignment(g->target->getNativeVectorAlignment());
+}
AddDebugPos(inst);
return inst;
}
@@ -2719,7 +2729,7 @@ FunctionEmitContext::AllocaInst(llvm::Type *llvmType,
inst = new llvm::AllocaInst(llvmType, name ? name : "", bblock);
// If no alignment was specified but we have an array of a uniform
-// type, then align it to 4 * the native vector width; it's not
+// type, then align it to the native vector alignment; it's not
// unlikely that this array will be loaded into varying variables with
// what will be aligned accesses if the uniform -> varying load is done
// in regular chunks.
@@ -2727,7 +2737,7 @@ FunctionEmitContext::AllocaInst(llvm::Type *llvmType,
llvm::dyn_cast<llvm::ArrayType>(llvmType);
if (align == 0 && arrayType != NULL &&
!llvm::isa<llvm::VectorType>(arrayType->getElementType()))
-    align = 4 * g->target->getNativeVectorWidth();
+    align = g->target->getNativeVectorAlignment();
if (align != 0)
inst->setAlignment(align);
@@ -2986,7 +2996,17 @@ FunctionEmitContext::StoreInst(llvm::Value *value, llvm::Value *ptr) {
return;
}
-llvm::Instruction *inst = new llvm::StoreInst(value, ptr, bblock);
+llvm::PointerType *pt =
+    llvm::dyn_cast<llvm::PointerType>(ptr->getType());
+AssertPos(currentPos, pt != NULL);
+llvm::StoreInst *inst = new llvm::StoreInst(value, ptr, bblock);
+if (g->opt.forceAlignedMemory &&
+    llvm::dyn_cast<llvm::VectorType>(pt->getElementType())) {
+    inst->setAlignment(g->target->getNativeVectorAlignment());
+}
AddDebugPos(inst);
}
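Taken together, the ctx.cpp hunks apply one pattern: when --opt=force-aligned-memory is set, vector loads and stores are stamped with the target's native alignment instead of LLVM's default. A condensed sketch of that pattern follows; it is a sketch only, not a drop-in implementation. The helper name is hypothetical, and g, bblock, and the LLVM 3.x-era constructors are assumed from the diff's surrounding context.

    // Mirrors the LoadInst hunk above; the StoreInst hunk is symmetric.
    llvm::LoadInst *EmitForceAlignedLoad(llvm::Value *ptr, const char *name,
                                         llvm::BasicBlock *bblock) {
        llvm::PointerType *pt =
            llvm::dyn_cast<llvm::PointerType>(ptr->getType());
        llvm::LoadInst *inst = new llvm::LoadInst(ptr, name, bblock);
        // Only vector-typed accesses get the forced alignment; scalar
        // accesses keep whatever LLVM chooses by default.
        if (g->opt.forceAlignedMemory &&
            llvm::dyn_cast<llvm::VectorType>(pt->getElementType()))
            inst->setAlignment(g->target->getNativeVectorAlignment());
        return inst;
    }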


@@ -74,8 +74,8 @@ static void usage() {
}
int main(int argc, char *argv[]) {
-unsigned int width = 1536;
-unsigned int height = 1024;
+unsigned int width = 1536 * 8;
+unsigned int height = 1024 * 8;
float x0 = -2;
float x1 = 1;
float y0 = -1;


@@ -66,8 +66,8 @@ writePPM(float *buf, int width, int height, const char *fn) {
int main() {
-unsigned int width = 768;
-unsigned int height = 768;
+unsigned int width = 768 * 4;
+unsigned int height = 768 * 4;
float x0 = -10;
float x1 = 10;
float y0 = -10;


@@ -67,7 +67,7 @@ void InitData(int Nx, int Ny, int Nz, float *A[2], float *vsq) {
int main() {
-int Nx = 256, Ny = 256, Nz = 256;
+int Nx = 256 * 2, Ny = 256 * 2, Nz = 256 * 2;
int width = 4;
float *Aserial[2], *Aispc[2];
Aserial[0] = new float [Nx * Ny * Nz];


@@ -191,6 +191,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
m_tf_attributes(NULL),
#endif
m_nativeVectorWidth(-1),
+m_nativeVectorAlignment(-1),
m_dataTypeWidth(-1),
m_vectorWidth(-1),
m_generatePIC(pic),
@@ -309,6 +310,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
!strcasecmp(isa, "sse2-i32x4")) {
this->m_isa = Target::SSE2;
this->m_nativeVectorWidth = 4;
+this->m_nativeVectorAlignment = 16;
this->m_dataTypeWidth = 32;
this->m_vectorWidth = 4;
this->m_attributes = "+sse,+sse2,-sse3,-sse4a,-ssse3,-popcnt"
@@ -325,6 +327,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
!strcasecmp(isa, "sse2-i32x8")) {
this->m_isa = Target::SSE2;
this->m_nativeVectorWidth = 4;
+this->m_nativeVectorAlignment = 16;
this->m_dataTypeWidth = 32;
this->m_vectorWidth = 8;
this->m_attributes = "+sse,+sse2,-sse3,-sse4a,-ssse3,-popcnt"
@@ -341,6 +344,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
!strcasecmp(isa, "sse4-i32x4")) {
this->m_isa = Target::SSE4;
this->m_nativeVectorWidth = 4;
+this->m_nativeVectorAlignment = 16;
this->m_dataTypeWidth = 32;
this->m_vectorWidth = 4;
// TODO: why not sse42 and popcnt?
@@ -359,6 +363,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
!strcasecmp(isa, "sse4-i32x8")) {
this->m_isa = Target::SSE4;
this->m_nativeVectorWidth = 4;
+this->m_nativeVectorAlignment = 16;
this->m_dataTypeWidth = 32;
this->m_vectorWidth = 8;
this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov"
@@ -374,6 +379,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
else if (!strcasecmp(isa, "sse4-i8x16")) {
this->m_isa = Target::SSE4;
this->m_nativeVectorWidth = 16;
+this->m_nativeVectorAlignment = 16;
this->m_dataTypeWidth = 8;
this->m_vectorWidth = 16;
this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov"
@@ -389,6 +395,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
else if (!strcasecmp(isa, "sse4-i16x8")) {
this->m_isa = Target::SSE4;
this->m_nativeVectorWidth = 8;
+this->m_nativeVectorAlignment = 16;
this->m_dataTypeWidth = 16;
this->m_vectorWidth = 8;
this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov"
@@ -405,6 +412,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
!strcasecmp(isa, "generic-x4")) {
this->m_isa = Target::GENERIC;
this->m_nativeVectorWidth = 4;
+this->m_nativeVectorAlignment = 16;
this->m_vectorWidth = 4;
this->m_maskingIsFree = true;
this->m_maskBitCount = 1;
@@ -416,6 +424,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
!strcasecmp(isa, "generic-x8")) {
this->m_isa = Target::GENERIC;
this->m_nativeVectorWidth = 8;
+this->m_nativeVectorAlignment = 32;
this->m_vectorWidth = 8;
this->m_maskingIsFree = true;
this->m_maskBitCount = 1;
@@ -427,6 +436,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
!strcasecmp(isa, "generic-x16")) {
this->m_isa = Target::GENERIC;
this->m_nativeVectorWidth = 16;
+this->m_nativeVectorAlignment = 64;
this->m_vectorWidth = 16;
this->m_maskingIsFree = true;
this->m_maskBitCount = 1;
@@ -438,6 +448,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
!strcasecmp(isa, "generic-x32")) {
this->m_isa = Target::GENERIC;
this->m_nativeVectorWidth = 32;
+this->m_nativeVectorAlignment = 64;
this->m_vectorWidth = 32;
this->m_maskingIsFree = true;
this->m_maskBitCount = 1;
@@ -449,6 +460,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
!strcasecmp(isa, "generic-x64")) {
this->m_isa = Target::GENERIC;
this->m_nativeVectorWidth = 64;
+this->m_nativeVectorAlignment = 64;
this->m_vectorWidth = 64;
this->m_maskingIsFree = true;
this->m_maskBitCount = 1;
@@ -460,6 +472,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
!strcasecmp(isa, "generic-x1")) {
this->m_isa = Target::GENERIC;
this->m_nativeVectorWidth = 1;
+this->m_nativeVectorAlignment = 16;
this->m_vectorWidth = 1;
this->m_maskingIsFree = false;
this->m_maskBitCount = 32;
@@ -467,6 +480,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
else if (!strcasecmp(isa, "avx1-i32x4")) {
this->m_isa = Target::AVX;
this->m_nativeVectorWidth = 8;
+this->m_nativeVectorAlignment = 32;
this->m_dataTypeWidth = 32;
this->m_vectorWidth = 4;
this->m_attributes = "+avx,+popcnt,+cmov";
@@ -478,6 +492,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
!strcasecmp(isa, "avx1-i32x8")) {
this->m_isa = Target::AVX;
this->m_nativeVectorWidth = 8;
+this->m_nativeVectorAlignment = 32;
this->m_dataTypeWidth = 32;
this->m_vectorWidth = 8;
this->m_attributes = "+avx,+popcnt,+cmov";
@@ -488,6 +503,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
!strcasecmp(isa, "avx1-i64x4")) {
this->m_isa = Target::AVX;
this->m_nativeVectorWidth = 8; /* native vector width in terms of floats */
+this->m_nativeVectorAlignment = 32;
this->m_dataTypeWidth = 64;
this->m_vectorWidth = 4;
this->m_attributes = "+avx,+popcnt,+cmov";
@@ -499,6 +515,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
!strcasecmp(isa, "avx1-i32x16")) {
this->m_isa = Target::AVX;
this->m_nativeVectorWidth = 8;
+this->m_nativeVectorAlignment = 32;
this->m_dataTypeWidth = 32;
this->m_vectorWidth = 16;
this->m_attributes = "+avx,+popcnt,+cmov";
@@ -509,6 +526,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
!strcasecmp(isa, "avx1.1-i32x8")) {
this->m_isa = Target::AVX11;
this->m_nativeVectorWidth = 8;
+this->m_nativeVectorAlignment = 32;
this->m_dataTypeWidth = 32;
this->m_vectorWidth = 8;
this->m_attributes = "+avx,+popcnt,+cmov,+f16c"
@@ -530,6 +548,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
!strcasecmp(isa, "avx1.1-i32x16")) {
this->m_isa = Target::AVX11;
this->m_nativeVectorWidth = 8;
+this->m_nativeVectorAlignment = 32;
this->m_dataTypeWidth = 32;
this->m_vectorWidth = 16;
this->m_attributes = "+avx,+popcnt,+cmov,+f16c"
@@ -550,6 +569,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
else if (!strcasecmp(isa, "avx1.1-i64x4")) {
this->m_isa = Target::AVX11;
this->m_nativeVectorWidth = 8; /* native vector width in terms of floats */
+this->m_nativeVectorAlignment = 32;
this->m_dataTypeWidth = 64;
this->m_vectorWidth = 4;
this->m_attributes = "+avx,+popcnt,+cmov,+f16c"
@@ -571,6 +591,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
!strcasecmp(isa, "avx2-i32x8")) {
this->m_isa = Target::AVX2;
this->m_nativeVectorWidth = 8;
+this->m_nativeVectorAlignment = 32;
this->m_dataTypeWidth = 32;
this->m_vectorWidth = 8;
this->m_attributes = "+avx2,+popcnt,+cmov,+f16c"
@@ -596,6 +617,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
!strcasecmp(isa, "avx2-i32x16")) {
this->m_isa = Target::AVX2;
this->m_nativeVectorWidth = 16;
+this->m_nativeVectorAlignment = 32;
this->m_dataTypeWidth = 32;
this->m_vectorWidth = 16;
this->m_attributes = "+avx2,+popcnt,+cmov,+f16c"
@@ -620,6 +642,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
else if (!strcasecmp(isa, "avx2-i64x4")) {
this->m_isa = Target::AVX2;
this->m_nativeVectorWidth = 8; /* native vector width in terms of floats */
+this->m_nativeVectorAlignment = 32;
this->m_dataTypeWidth = 64;
this->m_vectorWidth = 4;
this->m_attributes = "+avx2,+popcnt,+cmov,+f16c"
@@ -645,6 +668,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
else if (!strcasecmp(isa, "neon-i8x16")) {
this->m_isa = Target::NEON8;
this->m_nativeVectorWidth = 16;
+this->m_nativeVectorAlignment = 16;
this->m_dataTypeWidth = 8;
this->m_vectorWidth = 16;
this->m_attributes = "+neon,+fp16";
@@ -655,6 +679,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
else if (!strcasecmp(isa, "neon-i16x8")) {
this->m_isa = Target::NEON16;
this->m_nativeVectorWidth = 8;
+this->m_nativeVectorAlignment = 16;
this->m_dataTypeWidth = 16;
this->m_vectorWidth = 8;
this->m_attributes = "+neon,+fp16";
@@ -666,6 +691,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
!strcasecmp(isa, "neon-i32x4")) {
this->m_isa = Target::NEON32;
this->m_nativeVectorWidth = 4;
+this->m_nativeVectorAlignment = 16;
this->m_dataTypeWidth = 32;
this->m_vectorWidth = 4;
this->m_attributes = "+neon,+fp16";
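Read across the hunks above, the new alignment tracks the hardware register width rather than the programmatic vector width. A quick summary, restating what the diff sets rather than any new behavior:

    // Native vector alignment per ISA family, as set in the hunks above:
    //   SSE2/SSE4, NEON, generic-x1/x4 : 16 bytes (128-bit registers)
    //   AVX 1/1.1/2, generic-x8        : 32 bytes (256-bit registers)
    //   generic-x16/x32/x64            : 64 bytes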

ispc.h

@@ -260,6 +260,8 @@ public:
int getNativeVectorWidth() const {return m_nativeVectorWidth;}
+int getNativeVectorAlignment() const {return m_nativeVectorAlignment;}
int getDataTypeWidth() const {return m_dataTypeWidth;}
int getVectorWidth() const {return m_vectorWidth;}
@@ -332,6 +334,13 @@ private:
SSE, 8 for AVX, etc.) */
int m_nativeVectorWidth;
+/** Native vector alignment in bytes. Theoretically this could be derived
+    from the vector size, but it's better to manage the alignment directly;
+    that makes experimenting easier and allows finer tuning for a
+    particular platform. This information is primarily used when
+    --opt=force-aligned-memory is used. */
+int m_nativeVectorAlignment;
/** Data type width in bits. Typically it's 32, but could be 8, 16 or 64.
For generic it's -1, which means undefined. */
int m_dataTypeWidth;
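One concrete case for storing the alignment instead of deriving it, read off the ispc.cpp hunks above: avx1-i32x16 runs a 16-wide program on 32-byte registers, so a derived value would over-align. Illustrative arithmetic only; derived is not a variable in the diff.

    // Hypothetical derivation for avx1-i32x16:
    int derived = m_vectorWidth * (m_dataTypeWidth / 8);  // 16 * 4 = 64 bytes
    // The diff instead sets m_nativeVectorAlignment = 32, matching the
    // 32-byte YMM registers rather than the 64-byte program width.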


@@ -0,0 +1,115 @@
From b9b016cda57d8afc26a150de7ee329b54a994c85 Mon Sep 17 00:00:00 2001
From: Michael Liao <michael.hliao@gmail.com>
Date: Mon, 21 Oct 2013 17:47:58 -0700
Subject: [PATCH] Fix PR17631
- Skip instructions added in prolog. For specific targets, prolog may
insert helper function calls (e.g. _chkstk will be called when
there're more than 4K bytes allocated on stack). However, these
helpers don't use/def YMM/XMM registers.
It also includes a second fix for the problem: r196261+r196391.
diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp
index 477f75a..0d37a7d 100644
--- lib/Target/X86/X86VZeroUpper.cpp
+++ lib/Target/X86/X86VZeroUpper.cpp
@@ -121,7 +121,7 @@
}
static bool clobbersAllYmmRegs(const MachineOperand &MO) {
- for (unsigned reg = X86::YMM0; reg < X86::YMM15; ++reg) {
+ for (unsigned reg = X86::YMM0; reg <= X86::YMM15; ++reg) {
if (!MO.clobbersPhysReg(reg))
return false;
}
@@ -143,6 +143,21 @@
return false;
}
+/// clobbersAnyYmmReg() - Check if any YMM register will be clobbered by this
+/// instruction.
+static bool clobbersAnyYmmReg(MachineInstr *MI) {
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+ if (!MO.isRegMask())
+ continue;
+ for (unsigned reg = X86::YMM0; reg <= X86::YMM15; ++reg) {
+ if (MO.clobbersPhysReg(reg))
+ return true;
+ }
+ }
+ return false;
+}
+
/// runOnMachineFunction - Loop over all of the basic blocks, inserting
/// vzero upper instructions before function calls.
bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) {
@@ -226,8 +241,9 @@
bool BBHasCall = false;
for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) {
+ DebugLoc dl = I->getDebugLoc();
MachineInstr *MI = I;
- DebugLoc dl = I->getDebugLoc();
+
bool isControlFlow = MI->isCall() || MI->isReturn();
// Shortcut: don't need to check regular instructions in dirty state.
@@ -246,6 +262,14 @@
if (!isControlFlow)
continue;
+ // If the call won't clobber any YMM register, skip it as well. It usually
+ // happens on helper function calls (such as '_chkstk', '_ftol2') where the
+ // standard calling convention is not used (RegMask is not used to mark
+ // registers clobbered, and register usage (def/imp-def/use) is well-defined
+ // and explicitly specified).
+ if (MI->isCall() && !clobbersAnyYmmReg(MI))
+ continue;
+
BBHasCall = true;
// The VZEROUPPER instruction resets the upper 128 bits of all Intel AVX
diff --git a/test/CodeGen/X86/pr17631.ll b/test/CodeGen/X86/pr17631.ll
new file mode 100644
index 0000000..a572ff2
--- /dev/null
+++ test/CodeGen/X86/pr17631.ll
@@ -0,0 +1,34 @@
+; RUN: llc < %s -mcpu=core-avx-i -mtriple=i386-pc-win32 | FileCheck %s
+
+%struct_type = type { [64 x <8 x float>], <8 x float> }
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>)
+
+; Function Attrs: nounwind
+define i32 @equal(<8 x i32> %A) {
+allocas:
+ %first_alloc = alloca [64 x <8 x i32>]
+ %second_alloc = alloca %struct_type
+
+ %A1 = bitcast <8 x i32> %A to <8 x float>
+ %A2 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %A1)
+ ret i32 %A2
+}
+
+; CHECK: equal
+; CHECK-NOT: vzeroupper
+; CHECK: _chkstk
+; CHECK: ret
+
+define <8 x float> @foo(<8 x float> %y, i64* %p, double %x) {
+ %i = fptoui double %x to i64
+ store i64 %i, i64* %p
+ %ret = fadd <8 x float> %y, %y
+ ret <8 x float> %ret
+}
+
+; CHECK: foo
+; CHECK-NOT: vzeroupper
+; CHECK: _ftol2
+; CHECK: ret
--
1.8.1.2


@@ -1,69 +0,0 @@
From b9b016cda57d8afc26a150de7ee329b54a994c85 Mon Sep 17 00:00:00 2001
From: Michael Liao <michael.hliao@gmail.com>
Date: Mon, 21 Oct 2013 17:47:58 -0700
Subject: [PATCH] Fix PR17631
- Skip instructions added in prolog. For specific targets, prolog may
insert helper function calls (e.g. _chkstk will be called when
there're more than 4K bytes allocated on stack). However, these
helpers don't use/def YMM/XMM registers.
---
lib/Target/X86/X86VZeroUpper.cpp | 11 ++++++++++-
test/CodeGen/X86/pr17631.ll | 22 ++++++++++++++++++++++
2 files changed, 32 insertions(+), 1 deletion(-)
create mode 100644 test/CodeGen/X86/pr17631.ll
diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp
index 477f75a..0d37a7d 100644
--- lib/Target/X86/X86VZeroUpper.cpp
+++ lib/Target/X86/X86VZeroUpper.cpp
@@ -231,8 +231,17 @@ bool VZeroUpperInserter::processBasicBlock(MachineFunction &MF,
bool BBHasCall = false;
for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) {
- MachineInstr *MI = I;
DebugLoc dl = I->getDebugLoc();
+ MachineInstr *MI = I;
+
+ // Don't need to check instructions added in prolog.
+ // In prolog, special function calls may be added for specific targets
+ // (e.g. on Windows, a prolog helper '_chkstk' is called when the local
+ // variables exceed 4K bytes on stack.) These helpers won't use/def YMM/XMM
+ // registers.
+ if (MI->getFlag(MachineInstr::FrameSetup))
+ continue;
+
bool isControlFlow = MI->isCall() || MI->isReturn();
// Shortcut: don't need to check regular instructions in dirty state.
diff --git a/test/CodeGen/X86/pr17631.ll b/test/CodeGen/X86/pr17631.ll
new file mode 100644
index 0000000..a572ff2
--- /dev/null
+++ test/CodeGen/X86/pr17631.ll
@@ -0,0 +1,22 @@
+; RUN: llc < %s -mcpu=core-avx-i -mtriple=i386-pc-win32 | FileCheck %s
+
+%struct_type = type { [64 x <8 x float>], <8 x float> }
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>)
+
+; Function Attrs: nounwind
+define i32 @equal(<8 x i32> %A) {
+allocas:
+ %first_alloc = alloca [64 x <8 x i32>]
+ %second_alloc = alloca %struct_type
+
+ %A1 = bitcast <8 x i32> %A to <8 x float>
+ %A2 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %A1)
+ ret i32 %A2
+}
+
+; CHECK: equal
+; CHECK-NOT: vzeroupper
+; CHECK: _chkstk
+; CHECK: ret
--
1.8.1.2

opt.cpp

@@ -904,7 +904,7 @@ IntrinsicsOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
lCopyMetadata(castPtr, callInst);
int align;
if (g->opt.forceAlignedMemory)
-    align = 0;
+    align = g->target->getNativeVectorAlignment();
else
align = callInst->getCalledFunction() == avxMaskedLoad32 ? 4 : 8;
name = LLVMGetName(callInst->getArgOperand(0), "_load");
@@ -946,7 +946,7 @@ IntrinsicsOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
new llvm::StoreInst(rvalue, castPtr, (llvm::Instruction *)NULL);
int align;
if (g->opt.forceAlignedMemory)
-    align = 0;
+    align = g->target->getNativeVectorAlignment();
else
align = callInst->getCalledFunction() == avxMaskedStore32 ? 4 : 8;
storeInst->setAlignment(align);
@@ -2758,7 +2758,8 @@ lImproveMaskedStore(llvm::CallInst *callInst) {
lCopyMetadata(lvalue, callInst);
llvm::Instruction *store =
new llvm::StoreInst(rvalue, lvalue, false /* not volatile */,
-    g->opt.forceAlignedMemory ? 0 : info->align);
+    g->opt.forceAlignedMemory ?
+        g->target->getNativeVectorAlignment() : info->align);
lCopyMetadata(store, callInst);
llvm::ReplaceInstWithInst(callInst, store);
return true;
@@ -2821,7 +2822,8 @@ lImproveMaskedLoad(llvm::CallInst *callInst,
callInst);
llvm::Instruction *load =
new llvm::LoadInst(ptr, callInst->getName(), false /* not volatile */,
-    g->opt.forceAlignedMemory ? 0 : info->align,
+    g->opt.forceAlignedMemory ?
+        g->target->getNativeVectorAlignment() : info->align,
(llvm::Instruction *)NULL);
lCopyMetadata(load, callInst);
llvm::ReplaceInstWithInst(callInst, load);
@@ -3226,6 +3228,9 @@ lEmitLoads(llvm::Value *basePtr, std::vector<CoalescedLoadOp> &loadOps,
}
case 4: {
// 4-wide vector load
+if (g->opt.forceAlignedMemory) {
+    align = g->target->getNativeVectorAlignment();
+}
llvm::VectorType *vt =
llvm::VectorType::get(LLVMTypes::Int32Type, 4);
loadOps[i].load = lGEPAndLoad(basePtr, start, align,
@@ -3234,6 +3239,9 @@ lEmitLoads(llvm::Value *basePtr, std::vector<CoalescedLoadOp> &loadOps,
}
case 8: {
// 8-wide vector load
+if (g->opt.forceAlignedMemory) {
+    align = g->target->getNativeVectorAlignment();
+}
llvm::VectorType *vt =
llvm::VectorType::get(LLVMTypes::Int32Type, 8);
loadOps[i].load = lGEPAndLoad(basePtr, start, align,
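The opt.cpp hunks repeat a single selection; pulled out here as a sketch for clarity, where info->align is the conservative alignment carried by the masked load/store info in the diff:

    // Alignment choice used by the masked load/store rewrites above. The old
    // code passed 0 under forceAlignedMemory, leaving the choice to LLVM; the
    // new code asks the target for its native vector alignment explicitly.
    int align = g->opt.forceAlignedMemory
                    ? g->target->getNativeVectorAlignment()
                    : info->align;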


@@ -10,7 +10,7 @@
%****************************************************************************************************
AOBench
aobench
-10 512 512
+3 2048 2048
#***
Deferred Shading
deferred
@@ -41,7 +41,7 @@ options
#***
Ray Tracer
rt
-sponza
+sponza --scale=6.0
#***
3D Stencil
stencil