From 05a5a42a080e98ec52c94221f2e0c5750cea5abd Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Fri, 14 Sep 2012 12:17:25 -0700 Subject: [PATCH 1/4] Don't force loads/stores from varying types to be unaligned. These should always actually be aligned in memory. --- ctx.cpp | 23 ++--------------------- 1 file changed, 2 insertions(+), 21 deletions(-) diff --git a/ctx.cpp b/ctx.cpp index fec38065..a066679b 100644 --- a/ctx.cpp +++ b/ctx.cpp @@ -2397,16 +2397,7 @@ FunctionEmitContext::LoadInst(llvm::Value *ptr, const char *name) { if (name == NULL) name = LLVMGetName(ptr, "_load"); - // FIXME: it's not clear to me that we generate unaligned vector loads - // of varying stuff out of the front-end any more. (Only by the - // optimization passes that lower gathers to vector loads, I think..) - // So remove this?? - int align = 0; - if (llvm::isa(pt->getElementType())) - align = 1; - llvm::Instruction *inst = new llvm::LoadInst(ptr, name, - false /* not volatile */, - align, bblock); + llvm::Instruction *inst = new llvm::LoadInst(ptr, name, bblock); AddDebugPos(inst); return inst; } @@ -2958,17 +2949,7 @@ FunctionEmitContext::StoreInst(llvm::Value *value, llvm::Value *ptr) { return; } - llvm::Instruction *inst; - if (llvm::isa(value->getType())) - // FIXME: same for load--do we still need/want this?? - // Specify an unaligned store, since we don't know that the ptr - // will in fact be aligned to a vector width here. (Actually - // should be aligned to the alignment of the vector elment type...) - inst = new llvm::StoreInst(value, ptr, false /* not volatile */, - 1, bblock); - else - inst = new llvm::StoreInst(value, ptr, bblock); - + llvm::Instruction *inst = new llvm::StoreInst(value, ptr, bblock); AddDebugPos(inst); } From 59b0a2b208ffe6201e8c7a81e0b120290ed7becc Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Fri, 14 Sep 2012 13:32:42 -0700 Subject: [PATCH 2/4] Mark __any(), __all(), and __none() as internal after they're linked in. 
This fixes multiple symbol definition errors when compiling a single binary for multiple ISA targets. --- builtins.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/builtins.cpp b/builtins.cpp index e75f2107..81a9a64b 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -322,6 +322,8 @@ lSetInternalFunctions(llvm::Module *module) { "__add_varying_double", "__add_varying_int32", "__add_varying_int64", + "__all", + "__any", "__aos_to_soa3_float", "__aos_to_soa3_float16", "__aos_to_soa3_float4", @@ -465,6 +467,7 @@ lSetInternalFunctions(llvm::Module *module) { "__new_uniform", "__new_varying32", "__new_varying64", + "__none", "__num_cores", "__packed_load_active", "__packed_store_active", From be2108260ea66d7a0e64a43d83ebf8444a3b18c7 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Fri, 14 Sep 2012 13:49:45 -0700 Subject: [PATCH 3/4] Add --opt=force-aligned-memory option. This forces all vector loads/stores to be done assuming that the given pointer is aligned to the vector size, thus allowing the use of instructions that are sometimes more efficient. (If the memory is not actually aligned, the program will fail!) --- ispc.cpp | 1 + ispc.h | 6 ++++++ main.cpp | 3 +++ module.cpp | 2 ++ opt.cpp | 17 +++++++++++++---- test_static.cpp | 14 ++++++++++---- 6 files changed, 35 insertions(+), 8 deletions(-) diff --git a/ispc.cpp b/ispc.cpp index 29801359..0aae5e90 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -640,6 +640,7 @@ Opt::Opt() { unrollLoops = true; disableAsserts = false; disableFMA = false; + forceAlignedMemory = false; disableMaskAllOnOptimizations = false; disableHandlePseudoMemoryOps = false; disableBlendedMaskedStores = false; diff --git a/ispc.h b/ispc.h index e376df46..045916ab 100644 --- a/ispc.h +++ b/ispc.h @@ -311,6 +311,12 @@ struct Opt { that support them). 
*/ bool disableFMA; + /** Always generate aligned vector load/store instructions; this + implies a guarantee that all dynamic access through pointers that + becomes a vector load/store will be a cache-aligned sequence of + locations. */ + bool forceAlignedMemory; + /** If enabled, disables the various optimizations that kick in when the execution mask can be determined to be "all on" at compile time. */ diff --git a/main.cpp b/main.cpp index 63c4d572..8076456f 100644 --- a/main.cpp +++ b/main.cpp @@ -119,6 +119,7 @@ usage(int ret) { printf(" disable-loop-unroll\t\tDisable loop unrolling.\n"); printf(" fast-masked-vload\t\tFaster masked vector loads on SSE (may go past end of array)\n"); printf(" fast-math\t\t\tPerform non-IEEE-compliant optimizations of numeric expressions\n"); + printf(" force-aligned-memory\t\tAlways issue \"aligned\" vector load and store instructions\n"); #ifndef ISPC_IS_WINDOWS printf(" [--pic]\t\t\t\tGenerate position-independent code\n"); #endif // !ISPC_IS_WINDOWS @@ -336,6 +337,8 @@ int main(int Argc, char *Argv[]) { g->opt.unrollLoops = false; else if (!strcmp(opt, "disable-fma")) g->opt.disableFMA = true; + else if (!strcmp(opt, "force-aligned-memory")) + g->opt.forceAlignedMemory = true; // These are only used for performance tests of specific // optimizations diff --git a/module.cpp b/module.cpp index 365653ce..9ba7cea9 100644 --- a/module.cpp +++ b/module.cpp @@ -1783,6 +1783,8 @@ Module::execPreprocessor(const char* infilename, llvm::raw_string_ostream* ostre opts.addMacroDef("ISPC_TARGET_HAS_HALF"); if (g->target.hasTranscendentals) opts.addMacroDef("ISPC_TARGET_HAS_TRANSCENDENTALS"); + if (g->opt.forceAlignedMemory) + opts.addMacroDef("ISPC_FORCE_ALIGNED_MEMORY"); opts.addMacroDef("ISPC_MAJOR_VERSION=1"); opts.addMacroDef("ISPC_MINOR_VERSION=3"); diff --git a/opt.cpp b/opt.cpp index a623466b..e2f38d8d 100644 --- a/opt.cpp +++ b/opt.cpp @@ -791,7 +791,11 @@ IntrinsicsOpt::runOnBasicBlock(llvm::BasicBlock &bb) { 
llvm::PointerType::get(returnType, 0), name, callInst); lCopyMetadata(castPtr, callInst); - int align = callInst->getCalledFunction() == avxMaskedLoad32 ? 4 : 8; + int align; + if (g->opt.forceAlignedMemory) + align = 0; + else + align = callInst->getCalledFunction() == avxMaskedLoad32 ? 4 : 8; name = LLVMGetName(callInst->getArgOperand(0), "_load"); llvm::Instruction *loadInst = new llvm::LoadInst(castPtr, name, false /* not volatile */, @@ -829,7 +833,11 @@ IntrinsicsOpt::runOnBasicBlock(llvm::BasicBlock &bb) { llvm::StoreInst *storeInst = new llvm::StoreInst(rvalue, castPtr, (llvm::Instruction *)NULL); - int align = callInst->getCalledFunction() == avxMaskedStore32 ? 4 : 8; + int align; + if (g->opt.forceAlignedMemory) + align = 0; + else + align = callInst->getCalledFunction() == avxMaskedStore32 ? 4 : 8; storeInst->setAlignment(align); lCopyMetadata(storeInst, callInst); llvm::ReplaceInstWithInst(callInst, storeInst); @@ -2553,7 +2561,7 @@ lImproveMaskedStore(llvm::CallInst *callInst) { lCopyMetadata(lvalue, callInst); llvm::Instruction *store = new llvm::StoreInst(rvalue, lvalue, false /* not volatile */, - info->align); + g->opt.forceAlignedMemory ? 0 : info->align); lCopyMetadata(store, callInst); llvm::ReplaceInstWithInst(callInst, store); return true; @@ -2616,7 +2624,8 @@ lImproveMaskedLoad(llvm::CallInst *callInst, callInst); llvm::Instruction *load = new llvm::LoadInst(ptr, callInst->getName(), false /* not volatile */, - info->align, (llvm::Instruction *)NULL); + g->opt.forceAlignedMemory ? 
0 : info->align, + (llvm::Instruction *)NULL); lCopyMetadata(load, callInst); llvm::ReplaceInstWithInst(callInst, load); return true; diff --git a/test_static.cpp b/test_static.cpp index e798f960..ec91960e 100644 --- a/test_static.cpp +++ b/test_static.cpp @@ -99,15 +99,21 @@ void *ISPCAlloc(void **handle, int64_t size, int32_t alignment) { } +#if defined(_WIN32) || defined(_WIN64) +#define ALIGN +#else +#define ALIGN __attribute__((aligned(64))) +#endif int main(int argc, char *argv[]) { int w = width(); assert(w <= 64); - float returned_result[64]; - float vfloat[64]; - double vdouble[64]; - int vint[64], vint2[64]; + float returned_result[64] ALIGN; + float vfloat[64] ALIGN; + double vdouble[64] ALIGN; + int vint[64] ALIGN; + int vint2[64] ALIGN; for (int i = 0; i < 64; ++i) { returned_result[i] = -1e20; From a13e7f24358be3cd661b7f4f81e85c69730fd2b8 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Fri, 14 Sep 2012 13:53:12 -0700 Subject: [PATCH 4/4] #define ISPC_FORCE_ALIGNED_MEMORY, if appropriate, in C++ output. --- cbackend.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cbackend.cpp b/cbackend.cpp index cb6236bc..b49e1d10 100644 --- a/cbackend.cpp +++ b/cbackend.cpp @@ -2203,6 +2203,10 @@ bool CWriter::doInitialization(llvm::Module &M) { Out << "#undef ISPC_FAST_MATH\n"; } + if (g->opt.forceAlignedMemory) { + Out << "#define ISPC_FORCE_ALIGNED_MEMORY\n"; + } + Out << "#include \"" << includeName << "\"\n"; Out << "\n/* Basic Library Function Declarations */\n";