Fixing --opt=force-aligned-memory for LLVM 3.3+

This commit is contained in:
Dmitry Babokin
2013-12-04 19:00:02 +04:00
parent 4a53ed1201
commit 2d2d14744b
4 changed files with 72 additions and 9 deletions

30
ctx.cpp
View File

@@ -316,7 +316,11 @@ FunctionEmitContext::FunctionEmitContext(Function *func, Symbol *funSym,
llvm::BasicBlock *offBB =
llvm::BasicBlock::Create(*g->ctx, "entry",
(llvm::Function *)offFunc, 0);
new llvm::StoreInst(LLVMMaskAllOff, globalAllOnMaskPtr, offBB);
llvm::StoreInst *inst =
new llvm::StoreInst(LLVMMaskAllOff, globalAllOnMaskPtr, offBB);
if (g->opt.forceAlignedMemory) {
inst->setAlignment(g->target->getNativeVectorAlignment());
}
llvm::ReturnInst::Create(*g->ctx, offBB);
}
@@ -2437,7 +2441,13 @@ FunctionEmitContext::LoadInst(llvm::Value *ptr, const char *name) {
if (name == NULL)
name = LLVMGetName(ptr, "_load");
llvm::Instruction *inst = new llvm::LoadInst(ptr, name, bblock);
llvm::LoadInst *inst = new llvm::LoadInst(ptr, name, bblock);
if (g->opt.forceAlignedMemory &&
llvm::dyn_cast<llvm::VectorType>(pt->getElementType())) {
inst->setAlignment(g->target->getNativeVectorAlignment());
}
AddDebugPos(inst);
return inst;
}
@@ -2719,7 +2729,7 @@ FunctionEmitContext::AllocaInst(llvm::Type *llvmType,
inst = new llvm::AllocaInst(llvmType, name ? name : "", bblock);
// If no alignment was specified but we have an array of a uniform
// type, then align it to 4 * the native vector width; it's not
// type, then align it to the native vector alignment; it's not
// unlikely that this array will be loaded into varying variables with
// what will be aligned accesses if the uniform -> varying load is done
// in regular chunks.
@@ -2727,7 +2737,7 @@ FunctionEmitContext::AllocaInst(llvm::Type *llvmType,
llvm::dyn_cast<llvm::ArrayType>(llvmType);
if (align == 0 && arrayType != NULL &&
!llvm::isa<llvm::VectorType>(arrayType->getElementType()))
align = 4 * g->target->getNativeVectorWidth();
align = g->target->getNativeVectorAlignment();
if (align != 0)
inst->setAlignment(align);
@@ -2986,7 +2996,17 @@ FunctionEmitContext::StoreInst(llvm::Value *value, llvm::Value *ptr) {
return;
}
llvm::Instruction *inst = new llvm::StoreInst(value, ptr, bblock);
llvm::PointerType *pt =
llvm::dyn_cast<llvm::PointerType>(ptr->getType());
AssertPos(currentPos, pt != NULL);
llvm::StoreInst *inst = new llvm::StoreInst(value, ptr, bblock);
if (g->opt.forceAlignedMemory &&
llvm::dyn_cast<llvm::VectorType>(pt->getElementType())) {
inst->setAlignment(g->target->getNativeVectorAlignment());
}
AddDebugPos(inst);
}

View File

@@ -191,6 +191,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
m_tf_attributes(NULL),
#endif
m_nativeVectorWidth(-1),
m_nativeVectorAlignment(-1),
m_dataTypeWidth(-1),
m_vectorWidth(-1),
m_generatePIC(pic),
@@ -309,6 +310,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
!strcasecmp(isa, "sse2-i32x4")) {
this->m_isa = Target::SSE2;
this->m_nativeVectorWidth = 4;
this->m_nativeVectorAlignment = 16;
this->m_dataTypeWidth = 32;
this->m_vectorWidth = 4;
this->m_attributes = "+sse,+sse2,-sse3,-sse4a,-ssse3,-popcnt"
@@ -325,6 +327,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
!strcasecmp(isa, "sse2-i32x8")) {
this->m_isa = Target::SSE2;
this->m_nativeVectorWidth = 4;
this->m_nativeVectorAlignment = 16;
this->m_dataTypeWidth = 32;
this->m_vectorWidth = 8;
this->m_attributes = "+sse,+sse2,-sse3,-sse4a,-ssse3,-popcnt"
@@ -341,6 +344,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
!strcasecmp(isa, "sse4-i32x4")) {
this->m_isa = Target::SSE4;
this->m_nativeVectorWidth = 4;
this->m_nativeVectorAlignment = 16;
this->m_dataTypeWidth = 32;
this->m_vectorWidth = 4;
// TODO: why not sse42 and popcnt?
@@ -359,6 +363,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
!strcasecmp(isa, "sse4-i32x8")) {
this->m_isa = Target::SSE4;
this->m_nativeVectorWidth = 4;
this->m_nativeVectorAlignment = 16;
this->m_dataTypeWidth = 32;
this->m_vectorWidth = 8;
this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov"
@@ -374,6 +379,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
else if (!strcasecmp(isa, "sse4-i8x16")) {
this->m_isa = Target::SSE4;
this->m_nativeVectorWidth = 16;
this->m_nativeVectorAlignment = 16;
this->m_dataTypeWidth = 8;
this->m_vectorWidth = 16;
this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov"
@@ -389,6 +395,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
else if (!strcasecmp(isa, "sse4-i16x8")) {
this->m_isa = Target::SSE4;
this->m_nativeVectorWidth = 8;
this->m_nativeVectorAlignment = 16;
this->m_dataTypeWidth = 16;
this->m_vectorWidth = 8;
this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov"
@@ -405,6 +412,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
!strcasecmp(isa, "generic-x4")) {
this->m_isa = Target::GENERIC;
this->m_nativeVectorWidth = 4;
this->m_nativeVectorAlignment = 16;
this->m_vectorWidth = 4;
this->m_maskingIsFree = true;
this->m_maskBitCount = 1;
@@ -416,6 +424,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
!strcasecmp(isa, "generic-x8")) {
this->m_isa = Target::GENERIC;
this->m_nativeVectorWidth = 8;
this->m_nativeVectorAlignment = 32;
this->m_vectorWidth = 8;
this->m_maskingIsFree = true;
this->m_maskBitCount = 1;
@@ -427,6 +436,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
!strcasecmp(isa, "generic-x16")) {
this->m_isa = Target::GENERIC;
this->m_nativeVectorWidth = 16;
this->m_nativeVectorAlignment = 64;
this->m_vectorWidth = 16;
this->m_maskingIsFree = true;
this->m_maskBitCount = 1;
@@ -438,6 +448,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
!strcasecmp(isa, "generic-x32")) {
this->m_isa = Target::GENERIC;
this->m_nativeVectorWidth = 32;
this->m_nativeVectorAlignment = 64;
this->m_vectorWidth = 32;
this->m_maskingIsFree = true;
this->m_maskBitCount = 1;
@@ -449,6 +460,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
!strcasecmp(isa, "generic-x64")) {
this->m_isa = Target::GENERIC;
this->m_nativeVectorWidth = 64;
this->m_nativeVectorAlignment = 64;
this->m_vectorWidth = 64;
this->m_maskingIsFree = true;
this->m_maskBitCount = 1;
@@ -460,6 +472,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
!strcasecmp(isa, "generic-x1")) {
this->m_isa = Target::GENERIC;
this->m_nativeVectorWidth = 1;
this->m_nativeVectorAlignment = 16;
this->m_vectorWidth = 1;
this->m_maskingIsFree = false;
this->m_maskBitCount = 32;
@@ -467,6 +480,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
else if (!strcasecmp(isa, "avx1-i32x4")) {
this->m_isa = Target::AVX;
this->m_nativeVectorWidth = 8;
this->m_nativeVectorAlignment = 32;
this->m_dataTypeWidth = 32;
this->m_vectorWidth = 4;
this->m_attributes = "+avx,+popcnt,+cmov";
@@ -478,6 +492,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
!strcasecmp(isa, "avx1-i32x8")) {
this->m_isa = Target::AVX;
this->m_nativeVectorWidth = 8;
this->m_nativeVectorAlignment = 32;
this->m_dataTypeWidth = 32;
this->m_vectorWidth = 8;
this->m_attributes = "+avx,+popcnt,+cmov";
@@ -488,6 +503,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
!strcasecmp(isa, "avx1-i64x4")) {
this->m_isa = Target::AVX;
this->m_nativeVectorWidth = 8; /* native vector width in terms of floats */
this->m_nativeVectorAlignment = 32;
this->m_dataTypeWidth = 64;
this->m_vectorWidth = 4;
this->m_attributes = "+avx,+popcnt,+cmov";
@@ -499,6 +515,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
!strcasecmp(isa, "avx1-i32x16")) {
this->m_isa = Target::AVX;
this->m_nativeVectorWidth = 8;
this->m_nativeVectorAlignment = 32;
this->m_dataTypeWidth = 32;
this->m_vectorWidth = 16;
this->m_attributes = "+avx,+popcnt,+cmov";
@@ -509,6 +526,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
!strcasecmp(isa, "avx1.1-i32x8")) {
this->m_isa = Target::AVX11;
this->m_nativeVectorWidth = 8;
this->m_nativeVectorAlignment = 32;
this->m_dataTypeWidth = 32;
this->m_vectorWidth = 8;
this->m_attributes = "+avx,+popcnt,+cmov,+f16c"
@@ -530,6 +548,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
!strcasecmp(isa, "avx1.1-i32x16")) {
this->m_isa = Target::AVX11;
this->m_nativeVectorWidth = 8;
this->m_nativeVectorAlignment = 32;
this->m_dataTypeWidth = 32;
this->m_vectorWidth = 16;
this->m_attributes = "+avx,+popcnt,+cmov,+f16c"
@@ -550,6 +569,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
else if (!strcasecmp(isa, "avx1.1-i64x4")) {
this->m_isa = Target::AVX11;
this->m_nativeVectorWidth = 8; /* native vector width in terms of floats */
this->m_nativeVectorAlignment = 32;
this->m_dataTypeWidth = 64;
this->m_vectorWidth = 4;
this->m_attributes = "+avx,+popcnt,+cmov,+f16c"
@@ -571,6 +591,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
!strcasecmp(isa, "avx2-i32x8")) {
this->m_isa = Target::AVX2;
this->m_nativeVectorWidth = 8;
this->m_nativeVectorAlignment = 32;
this->m_dataTypeWidth = 32;
this->m_vectorWidth = 8;
this->m_attributes = "+avx2,+popcnt,+cmov,+f16c"
@@ -596,6 +617,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
!strcasecmp(isa, "avx2-i32x16")) {
this->m_isa = Target::AVX2;
this->m_nativeVectorWidth = 16;
this->m_nativeVectorAlignment = 32;
this->m_dataTypeWidth = 32;
this->m_vectorWidth = 16;
this->m_attributes = "+avx2,+popcnt,+cmov,+f16c"
@@ -620,6 +642,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
else if (!strcasecmp(isa, "avx2-i64x4")) {
this->m_isa = Target::AVX2;
this->m_nativeVectorWidth = 8; /* native vector width in terms of floats */
this->m_nativeVectorAlignment = 32;
this->m_dataTypeWidth = 64;
this->m_vectorWidth = 4;
this->m_attributes = "+avx2,+popcnt,+cmov,+f16c"
@@ -645,6 +668,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
else if (!strcasecmp(isa, "neon-i8x16")) {
this->m_isa = Target::NEON8;
this->m_nativeVectorWidth = 16;
this->m_nativeVectorAlignment = 16;
this->m_dataTypeWidth = 8;
this->m_vectorWidth = 16;
this->m_attributes = "+neon,+fp16";
@@ -655,6 +679,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
else if (!strcasecmp(isa, "neon-i16x8")) {
this->m_isa = Target::NEON16;
this->m_nativeVectorWidth = 8;
this->m_nativeVectorAlignment = 16;
this->m_dataTypeWidth = 16;
this->m_vectorWidth = 8;
this->m_attributes = "+neon,+fp16";
@@ -666,6 +691,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
!strcasecmp(isa, "neon-i32x4")) {
this->m_isa = Target::NEON32;
this->m_nativeVectorWidth = 4;
this->m_nativeVectorAlignment = 16;
this->m_dataTypeWidth = 32;
this->m_vectorWidth = 4;
this->m_attributes = "+neon,+fp16";

9
ispc.h
View File

@@ -260,6 +260,8 @@ public:
int getNativeVectorWidth() const {return m_nativeVectorWidth;}
int getNativeVectorAlignment() const {return m_nativeVectorAlignment;}
int getDataTypeWidth() const {return m_dataTypeWidth;}
int getVectorWidth() const {return m_vectorWidth;}
@@ -332,6 +334,13 @@ private:
SSE, 8 for AVX, etc.) */
int m_nativeVectorWidth;
/** Native vector alignment in bytes. Theoretically this may be derived
from the vector size, but it's better to manage directly the alignement.
It allows easier experimenting and better fine tuning for particular
platform. This information is primatily used when
--opt=force-aligned-memory is used. */
int m_nativeVectorAlignment;
/** Data type with in bits. Typically it's 32, but could be 8, 16 or 64.
For generic it's -1, which means undefined. */
int m_dataTypeWidth;

16
opt.cpp
View File

@@ -904,7 +904,7 @@ IntrinsicsOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
lCopyMetadata(castPtr, callInst);
int align;
if (g->opt.forceAlignedMemory)
align = 0;
align = g->target->getNativeVectorAlignment();
else
align = callInst->getCalledFunction() == avxMaskedLoad32 ? 4 : 8;
name = LLVMGetName(callInst->getArgOperand(0), "_load");
@@ -946,7 +946,7 @@ IntrinsicsOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
new llvm::StoreInst(rvalue, castPtr, (llvm::Instruction *)NULL);
int align;
if (g->opt.forceAlignedMemory)
align = 0;
align = g->target->getNativeVectorAlignment();
else
align = callInst->getCalledFunction() == avxMaskedStore32 ? 4 : 8;
storeInst->setAlignment(align);
@@ -2758,7 +2758,8 @@ lImproveMaskedStore(llvm::CallInst *callInst) {
lCopyMetadata(lvalue, callInst);
llvm::Instruction *store =
new llvm::StoreInst(rvalue, lvalue, false /* not volatile */,
g->opt.forceAlignedMemory ? 0 : info->align);
g->opt.forceAlignedMemory ?
g->target->getNativeVectorAlignment() : info->align);
lCopyMetadata(store, callInst);
llvm::ReplaceInstWithInst(callInst, store);
return true;
@@ -2821,7 +2822,8 @@ lImproveMaskedLoad(llvm::CallInst *callInst,
callInst);
llvm::Instruction *load =
new llvm::LoadInst(ptr, callInst->getName(), false /* not volatile */,
g->opt.forceAlignedMemory ? 0 : info->align,
g->opt.forceAlignedMemory ?
g->target->getNativeVectorAlignment() : info->align,
(llvm::Instruction *)NULL);
lCopyMetadata(load, callInst);
llvm::ReplaceInstWithInst(callInst, load);
@@ -3226,6 +3228,9 @@ lEmitLoads(llvm::Value *basePtr, std::vector<CoalescedLoadOp> &loadOps,
}
case 4: {
// 4-wide vector load
if (g->opt.forceAlignedMemory) {
align = g->target->getNativeVectorAlignment();
}
llvm::VectorType *vt =
llvm::VectorType::get(LLVMTypes::Int32Type, 4);
loadOps[i].load = lGEPAndLoad(basePtr, start, align,
@@ -3234,6 +3239,9 @@ lEmitLoads(llvm::Value *basePtr, std::vector<CoalescedLoadOp> &loadOps,
}
case 8: {
// 8-wide vector load
if (g->opt.forceAlignedMemory) {
align = g->target->getNativeVectorAlignment();
}
llvm::VectorType *vt =
llvm::VectorType::get(LLVMTypes::Int32Type, 8);
loadOps[i].load = lGEPAndLoad(basePtr, start, align,