diff --git a/ispc.cpp b/ispc.cpp index 29801359..0aae5e90 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -640,6 +640,7 @@ Opt::Opt() { unrollLoops = true; disableAsserts = false; disableFMA = false; + forceAlignedMemory = false; disableMaskAllOnOptimizations = false; disableHandlePseudoMemoryOps = false; disableBlendedMaskedStores = false; diff --git a/ispc.h b/ispc.h index e376df46..045916ab 100644 --- a/ispc.h +++ b/ispc.h @@ -311,6 +311,12 @@ struct Opt { that support them). */ bool disableFMA; + /** Always generate aligned vector load/store instructions; this + implies a guarantee that all dynamic access through pointers that + becomes a vector load/store will be a cache-aligned sequence of + locations. */ + bool forceAlignedMemory; + /** If enabled, disables the various optimizations that kick in when the execution mask can be determined to be "all on" at compile time. */ diff --git a/main.cpp b/main.cpp index 63c4d572..8076456f 100644 --- a/main.cpp +++ b/main.cpp @@ -119,6 +119,7 @@ usage(int ret) { printf(" disable-loop-unroll\t\tDisable loop unrolling.\n"); printf(" fast-masked-vload\t\tFaster masked vector loads on SSE (may go past end of array)\n"); printf(" fast-math\t\t\tPerform non-IEEE-compliant optimizations of numeric expressions\n"); + printf(" force-aligned-memory\t\tAlways issue \"aligned\" vector load and store instructions\n"); #ifndef ISPC_IS_WINDOWS printf(" [--pic]\t\t\t\tGenerate position-independent code\n"); #endif // !ISPC_IS_WINDOWS @@ -336,6 +337,8 @@ int main(int Argc, char *Argv[]) { g->opt.unrollLoops = false; else if (!strcmp(opt, "disable-fma")) g->opt.disableFMA = true; + else if (!strcmp(opt, "force-aligned-memory")) + g->opt.forceAlignedMemory = true; // These are only used for performance tests of specific // optimizations diff --git a/module.cpp b/module.cpp index 365653ce..9ba7cea9 100644 --- a/module.cpp +++ b/module.cpp @@ -1783,6 +1783,8 @@ Module::execPreprocessor(const char* infilename, llvm::raw_string_ostream* ostre opts.addMacroDef("ISPC_TARGET_HAS_HALF"); if (g->target.hasTranscendentals) opts.addMacroDef("ISPC_TARGET_HAS_TRANSCENDENTALS"); + if (g->opt.forceAlignedMemory) + opts.addMacroDef("ISPC_FORCE_ALIGNED_MEMORY"); opts.addMacroDef("ISPC_MAJOR_VERSION=1"); opts.addMacroDef("ISPC_MINOR_VERSION=3"); diff --git a/opt.cpp b/opt.cpp index a623466b..e2f38d8d 100644 --- a/opt.cpp +++ b/opt.cpp @@ -791,7 +791,11 @@ IntrinsicsOpt::runOnBasicBlock(llvm::BasicBlock &bb) { llvm::PointerType::get(returnType, 0), name, callInst); lCopyMetadata(castPtr, callInst); - int align = callInst->getCalledFunction() == avxMaskedLoad32 ? 4 : 8; + int align; + if (g->opt.forceAlignedMemory) + align = 0; + else + align = callInst->getCalledFunction() == avxMaskedLoad32 ? 4 : 8; name = LLVMGetName(callInst->getArgOperand(0), "_load"); llvm::Instruction *loadInst = new llvm::LoadInst(castPtr, name, false /* not volatile */, @@ -829,7 +833,11 @@ IntrinsicsOpt::runOnBasicBlock(llvm::BasicBlock &bb) { llvm::StoreInst *storeInst = new llvm::StoreInst(rvalue, castPtr, (llvm::Instruction *)NULL); - int align = callInst->getCalledFunction() == avxMaskedStore32 ? 4 : 8; + int align; + if (g->opt.forceAlignedMemory) + align = 0; + else + align = callInst->getCalledFunction() == avxMaskedStore32 ? 4 : 8; storeInst->setAlignment(align); lCopyMetadata(storeInst, callInst); llvm::ReplaceInstWithInst(callInst, storeInst); @@ -2553,7 +2561,7 @@ lImproveMaskedStore(llvm::CallInst *callInst) { lCopyMetadata(lvalue, callInst); llvm::Instruction *store = new llvm::StoreInst(rvalue, lvalue, false /* not volatile */, - info->align); + g->opt.forceAlignedMemory ? 0 : info->align); lCopyMetadata(store, callInst); llvm::ReplaceInstWithInst(callInst, store); return true; @@ -2616,7 +2624,8 @@ lImproveMaskedLoad(llvm::CallInst *callInst, callInst); llvm::Instruction *load = new llvm::LoadInst(ptr, callInst->getName(), false /* not volatile */, - info->align, (llvm::Instruction *)NULL); + g->opt.forceAlignedMemory ? 0 : info->align, + (llvm::Instruction *)NULL); lCopyMetadata(load, callInst); llvm::ReplaceInstWithInst(callInst, load); return true; diff --git a/test_static.cpp b/test_static.cpp index e798f960..ec91960e 100644 --- a/test_static.cpp +++ b/test_static.cpp @@ -99,15 +99,21 @@ void *ISPCAlloc(void **handle, int64_t size, int32_t alignment) { } +#if defined(_WIN32) || defined(_WIN64) +#define ALIGN +#else +#define ALIGN __attribute__((aligned(64))) +#endif int main(int argc, char *argv[]) { int w = width(); assert(w <= 64); - float returned_result[64]; - float vfloat[64]; - double vdouble[64]; - int vint[64], vint2[64]; + float returned_result[64] ALIGN; + float vfloat[64] ALIGN; + double vdouble[64] ALIGN; + int vint[64] ALIGN; + int vint2[64] ALIGN; for (int i = 0; i < 64; ++i) { returned_result[i] = -1e20;