diff --git a/ctx.cpp b/ctx.cpp index c1a7e61a..e5c60363 100644 --- a/ctx.cpp +++ b/ctx.cpp @@ -316,7 +316,11 @@ FunctionEmitContext::FunctionEmitContext(Function *func, Symbol *funSym, llvm::BasicBlock *offBB = llvm::BasicBlock::Create(*g->ctx, "entry", (llvm::Function *)offFunc, 0); - new llvm::StoreInst(LLVMMaskAllOff, globalAllOnMaskPtr, offBB); + llvm::StoreInst *inst = + new llvm::StoreInst(LLVMMaskAllOff, globalAllOnMaskPtr, offBB); + if (g->opt.forceAlignedMemory) { + inst->setAlignment(g->target->getNativeVectorAlignment()); + } llvm::ReturnInst::Create(*g->ctx, offBB); } @@ -2437,7 +2441,13 @@ FunctionEmitContext::LoadInst(llvm::Value *ptr, const char *name) { if (name == NULL) name = LLVMGetName(ptr, "_load"); - llvm::Instruction *inst = new llvm::LoadInst(ptr, name, bblock); + llvm::LoadInst *inst = new llvm::LoadInst(ptr, name, bblock); + + if (g->opt.forceAlignedMemory && + llvm::dyn_cast(pt->getElementType())) { + inst->setAlignment(g->target->getNativeVectorAlignment()); + } + AddDebugPos(inst); return inst; } @@ -2719,7 +2729,7 @@ FunctionEmitContext::AllocaInst(llvm::Type *llvmType, inst = new llvm::AllocaInst(llvmType, name ? name : "", bblock); // If no alignment was specified but we have an array of a uniform - // type, then align it to 4 * the native vector width; it's not + // type, then align it to the native vector alignment; it's not // unlikely that this array will be loaded into varying variables with // what will be aligned accesses if the uniform -> varying load is done // in regular chunks. @@ -2727,7 +2737,7 @@ FunctionEmitContext::AllocaInst(llvm::Type *llvmType, llvm::dyn_cast(llvmType); if (align == 0 && arrayType != NULL && !llvm::isa(arrayType->getElementType())) - align = 4 * g->target->getNativeVectorWidth(); + align = g->target->getNativeVectorAlignment(); if (align != 0) inst->setAlignment(align); @@ -2986,7 +2996,17 @@ FunctionEmitContext::StoreInst(llvm::Value *value, llvm::Value *ptr) { return; } - llvm::Instruction *inst = new llvm::StoreInst(value, ptr, bblock); + llvm::PointerType *pt = + llvm::dyn_cast(ptr->getType()); + AssertPos(currentPos, pt != NULL); + + llvm::StoreInst *inst = new llvm::StoreInst(value, ptr, bblock); + + if (g->opt.forceAlignedMemory && + llvm::dyn_cast(pt->getElementType())) { + inst->setAlignment(g->target->getNativeVectorAlignment()); + } + AddDebugPos(inst); } diff --git a/ispc.cpp b/ispc.cpp index 36d31580..b1790dc3 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -191,6 +191,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : m_tf_attributes(NULL), #endif m_nativeVectorWidth(-1), + m_nativeVectorAlignment(-1), m_dataTypeWidth(-1), m_vectorWidth(-1), m_generatePIC(pic), @@ -309,6 +310,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : !strcasecmp(isa, "sse2-i32x4")) { this->m_isa = Target::SSE2; this->m_nativeVectorWidth = 4; + this->m_nativeVectorAlignment = 16; this->m_dataTypeWidth = 32; this->m_vectorWidth = 4; this->m_attributes = "+sse,+sse2,-sse3,-sse4a,-ssse3,-popcnt" @@ -325,6 +327,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : !strcasecmp(isa, "sse2-i32x8")) { this->m_isa = Target::SSE2; this->m_nativeVectorWidth = 4; + this->m_nativeVectorAlignment = 16; this->m_dataTypeWidth = 32; this->m_vectorWidth = 8; this->m_attributes = "+sse,+sse2,-sse3,-sse4a,-ssse3,-popcnt" @@ -341,6 +344,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : !strcasecmp(isa, "sse4-i32x4")) { this->m_isa = Target::SSE4; this->m_nativeVectorWidth = 4; + this->m_nativeVectorAlignment = 16; this->m_dataTypeWidth = 32; this->m_vectorWidth = 4; // TODO: why not sse42 and popcnt? @@ -359,6 +363,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : !strcasecmp(isa, "sse4-i32x8")) { this->m_isa = Target::SSE4; this->m_nativeVectorWidth = 4; + this->m_nativeVectorAlignment = 16; this->m_dataTypeWidth = 32; this->m_vectorWidth = 8; this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov" @@ -374,6 +379,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : else if (!strcasecmp(isa, "sse4-i8x16")) { this->m_isa = Target::SSE4; this->m_nativeVectorWidth = 16; + this->m_nativeVectorAlignment = 16; this->m_dataTypeWidth = 8; this->m_vectorWidth = 16; this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov" @@ -389,6 +395,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : else if (!strcasecmp(isa, "sse4-i16x8")) { this->m_isa = Target::SSE4; this->m_nativeVectorWidth = 8; + this->m_nativeVectorAlignment = 16; this->m_dataTypeWidth = 16; this->m_vectorWidth = 8; this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov" @@ -405,6 +412,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : !strcasecmp(isa, "generic-x4")) { this->m_isa = Target::GENERIC; this->m_nativeVectorWidth = 4; + this->m_nativeVectorAlignment = 16; this->m_vectorWidth = 4; this->m_maskingIsFree = true; this->m_maskBitCount = 1; @@ -416,6 +424,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : !strcasecmp(isa, "generic-x8")) { this->m_isa = Target::GENERIC; this->m_nativeVectorWidth = 8; + this->m_nativeVectorAlignment = 32; this->m_vectorWidth = 8; this->m_maskingIsFree = true; this->m_maskBitCount = 1; @@ -427,6 +436,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : !strcasecmp(isa, "generic-x16")) { this->m_isa = Target::GENERIC; this->m_nativeVectorWidth = 16; + this->m_nativeVectorAlignment = 64; this->m_vectorWidth = 16; this->m_maskingIsFree = true; this->m_maskBitCount = 1; @@ -438,6 +448,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : !strcasecmp(isa, "generic-x32")) { this->m_isa = Target::GENERIC; this->m_nativeVectorWidth = 32; + this->m_nativeVectorAlignment = 64; this->m_vectorWidth = 32; this->m_maskingIsFree = true; this->m_maskBitCount = 1; @@ -449,6 +460,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : !strcasecmp(isa, "generic-x64")) { this->m_isa = Target::GENERIC; this->m_nativeVectorWidth = 64; + this->m_nativeVectorAlignment = 64; this->m_vectorWidth = 64; this->m_maskingIsFree = true; this->m_maskBitCount = 1; @@ -460,6 +472,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : !strcasecmp(isa, "generic-x1")) { this->m_isa = Target::GENERIC; this->m_nativeVectorWidth = 1; + this->m_nativeVectorAlignment = 16; this->m_vectorWidth = 1; this->m_maskingIsFree = false; this->m_maskBitCount = 32; @@ -467,6 +480,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : else if (!strcasecmp(isa, "avx1-i32x4")) { this->m_isa = Target::AVX; this->m_nativeVectorWidth = 8; + this->m_nativeVectorAlignment = 32; this->m_dataTypeWidth = 32; this->m_vectorWidth = 4; this->m_attributes = "+avx,+popcnt,+cmov"; @@ -478,6 +492,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : !strcasecmp(isa, "avx1-i32x8")) { this->m_isa = Target::AVX; this->m_nativeVectorWidth = 8; + this->m_nativeVectorAlignment = 32; this->m_dataTypeWidth = 32; this->m_vectorWidth = 8; this->m_attributes = "+avx,+popcnt,+cmov"; @@ -488,6 +503,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : !strcasecmp(isa, "avx1-i64x4")) { this->m_isa = Target::AVX; this->m_nativeVectorWidth = 8; /* native vector width in terms of floats */ + this->m_nativeVectorAlignment = 32; this->m_dataTypeWidth = 64; this->m_vectorWidth = 4; this->m_attributes = "+avx,+popcnt,+cmov"; @@ -499,6 +515,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : !strcasecmp(isa, "avx1-i32x16")) { this->m_isa = Target::AVX; this->m_nativeVectorWidth = 8; + this->m_nativeVectorAlignment = 32; this->m_dataTypeWidth = 32; this->m_vectorWidth = 16; this->m_attributes = "+avx,+popcnt,+cmov"; @@ -509,6 +526,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : !strcasecmp(isa, "avx1.1-i32x8")) { this->m_isa = Target::AVX11; this->m_nativeVectorWidth = 8; + this->m_nativeVectorAlignment = 32; this->m_dataTypeWidth = 32; this->m_vectorWidth = 8; this->m_attributes = "+avx,+popcnt,+cmov,+f16c" @@ -530,6 +548,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : !strcasecmp(isa, "avx1.1-i32x16")) { this->m_isa = Target::AVX11; this->m_nativeVectorWidth = 8; + this->m_nativeVectorAlignment = 32; this->m_dataTypeWidth = 32; this->m_vectorWidth = 16; this->m_attributes = "+avx,+popcnt,+cmov,+f16c" @@ -550,6 +569,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : else if (!strcasecmp(isa, "avx1.1-i64x4")) { this->m_isa = Target::AVX11; this->m_nativeVectorWidth = 8; /* native vector width in terms of floats */ + this->m_nativeVectorAlignment = 32; this->m_dataTypeWidth = 64; this->m_vectorWidth = 4; this->m_attributes = "+avx,+popcnt,+cmov,+f16c" @@ -571,6 +591,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : !strcasecmp(isa, "avx2-i32x8")) { this->m_isa = Target::AVX2; this->m_nativeVectorWidth = 8; + this->m_nativeVectorAlignment = 32; this->m_dataTypeWidth = 32; this->m_vectorWidth = 8; this->m_attributes = "+avx2,+popcnt,+cmov,+f16c" @@ -596,6 +617,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : !strcasecmp(isa, "avx2-i32x16")) { this->m_isa = Target::AVX2; this->m_nativeVectorWidth = 16; + this->m_nativeVectorAlignment = 32; this->m_dataTypeWidth = 32; this->m_vectorWidth = 16; this->m_attributes = "+avx2,+popcnt,+cmov,+f16c" @@ -620,6 +642,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : else if (!strcasecmp(isa, "avx2-i64x4")) { this->m_isa = Target::AVX2; this->m_nativeVectorWidth = 8; /* native vector width in terms of floats */ + this->m_nativeVectorAlignment = 32; this->m_dataTypeWidth = 64; this->m_vectorWidth = 4; this->m_attributes = "+avx2,+popcnt,+cmov,+f16c" @@ -645,6 +668,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : else if (!strcasecmp(isa, "neon-i8x16")) { this->m_isa = Target::NEON8; this->m_nativeVectorWidth = 16; + this->m_nativeVectorAlignment = 16; this->m_dataTypeWidth = 8; this->m_vectorWidth = 16; this->m_attributes = "+neon,+fp16"; @@ -655,6 +679,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : else if (!strcasecmp(isa, "neon-i16x8")) { this->m_isa = Target::NEON16; this->m_nativeVectorWidth = 8; + this->m_nativeVectorAlignment = 16; this->m_dataTypeWidth = 16; this->m_vectorWidth = 8; this->m_attributes = "+neon,+fp16"; @@ -666,6 +691,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : !strcasecmp(isa, "neon-i32x4")) { this->m_isa = Target::NEON32; this->m_nativeVectorWidth = 4; + this->m_nativeVectorAlignment = 16; this->m_dataTypeWidth = 32; this->m_vectorWidth = 4; this->m_attributes = "+neon,+fp16"; diff --git a/ispc.h b/ispc.h index b319d656..4b333861 100644 --- a/ispc.h +++ b/ispc.h @@ -260,6 +260,8 @@ public: int getNativeVectorWidth() const {return m_nativeVectorWidth;} + int getNativeVectorAlignment() const {return m_nativeVectorAlignment;} + int getDataTypeWidth() const {return m_dataTypeWidth;} int getVectorWidth() const {return m_vectorWidth;} @@ -332,6 +334,13 @@ private: SSE, 8 for AVX, etc.) */ int m_nativeVectorWidth; + /** Native vector alignment in bytes. Theoretically this may be derived + from the vector size, but it's better to manage directly the alignement. + It allows easier experimenting and better fine tuning for particular + platform. This information is primatily used when + --opt=force-aligned-memory is used. */ + int m_nativeVectorAlignment; + /** Data type with in bits. Typically it's 32, but could be 8, 16 or 64. For generic it's -1, which means undefined. */ int m_dataTypeWidth; diff --git a/opt.cpp b/opt.cpp index 3e320b4b..9059c746 100644 --- a/opt.cpp +++ b/opt.cpp @@ -904,7 +904,7 @@ IntrinsicsOpt::runOnBasicBlock(llvm::BasicBlock &bb) { lCopyMetadata(castPtr, callInst); int align; if (g->opt.forceAlignedMemory) - align = 0; + align = g->target->getNativeVectorAlignment(); else align = callInst->getCalledFunction() == avxMaskedLoad32 ? 4 : 8; name = LLVMGetName(callInst->getArgOperand(0), "_load"); @@ -946,7 +946,7 @@ IntrinsicsOpt::runOnBasicBlock(llvm::BasicBlock &bb) { new llvm::StoreInst(rvalue, castPtr, (llvm::Instruction *)NULL); int align; if (g->opt.forceAlignedMemory) - align = 0; + align = g->target->getNativeVectorAlignment(); else align = callInst->getCalledFunction() == avxMaskedStore32 ? 4 : 8; storeInst->setAlignment(align); @@ -2758,7 +2758,8 @@ lImproveMaskedStore(llvm::CallInst *callInst) { lCopyMetadata(lvalue, callInst); llvm::Instruction *store = new llvm::StoreInst(rvalue, lvalue, false /* not volatile */, - g->opt.forceAlignedMemory ? 0 : info->align); + g->opt.forceAlignedMemory ? + g->target->getNativeVectorAlignment() : info->align); lCopyMetadata(store, callInst); llvm::ReplaceInstWithInst(callInst, store); return true; @@ -2821,7 +2822,8 @@ lImproveMaskedLoad(llvm::CallInst *callInst, callInst); llvm::Instruction *load = new llvm::LoadInst(ptr, callInst->getName(), false /* not volatile */, - g->opt.forceAlignedMemory ? 0 : info->align, + g->opt.forceAlignedMemory ? + g->target->getNativeVectorAlignment() : info->align, (llvm::Instruction *)NULL); lCopyMetadata(load, callInst); llvm::ReplaceInstWithInst(callInst, load); @@ -3226,6 +3228,9 @@ lEmitLoads(llvm::Value *basePtr, std::vector &loadOps, } case 4: { // 4-wide vector load + if (g->opt.forceAlignedMemory) { + align = g->target->getNativeVectorAlignment(); + } llvm::VectorType *vt = llvm::VectorType::get(LLVMTypes::Int32Type, 4); loadOps[i].load = lGEPAndLoad(basePtr, start, align, @@ -3234,6 +3239,9 @@ lEmitLoads(llvm::Value *basePtr, std::vector &loadOps, } case 8: { // 8-wide vector load + if (g->opt.forceAlignedMemory) { + align = g->target->getNativeVectorAlignment(); + } llvm::VectorType *vt = llvm::VectorType::get(LLVMTypes::Int32Type, 8); loadOps[i].load = lGEPAndLoad(basePtr, start, align,