Add --opt=force-aligned-memory option.

This forces all vector loads and stores to be issued under the assumption
that the given pointer is aligned to the vector size, allowing the use of
sometimes-more-efficient aligned instructions. (If the memory is not in
fact aligned, the program will fail!)
commit be2108260e
Author: Matt Pharr
Date:   2012-09-14 13:49:45 -07:00
Parent: 59b0a2b208

6 changed files with 35 additions and 8 deletions
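For context (not part of the commit): the "sometimes-more-efficient
instructions" are aligned vector moves such as SSE's movaps, which fault at
run time if the address is not vector-size aligned, as opposed to movups,
which accepts any address. A minimal C sketch of that distinction, using SSE
intrinsics:

    #include <immintrin.h>

    void sketch(const float *p /* assumed 16-byte aligned */, float out[4]) {
        /* _mm_load_ps compiles to an aligned load (movaps) and faults if p
           is not 16-byte aligned -- the failure mode warned about above. */
        __m128 v = _mm_load_ps(p);
        /* _mm_loadu_ps (movups) tolerates any alignment, at a possible
           performance cost. */
        __m128 u = _mm_loadu_ps(p + 1);
        _mm_storeu_ps(out, _mm_add_ps(v, u));
    }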


@@ -640,6 +640,7 @@ Opt::Opt() {
     unrollLoops = true;
     disableAsserts = false;
     disableFMA = false;
+    forceAlignedMemory = false;
     disableMaskAllOnOptimizations = false;
     disableHandlePseudoMemoryOps = false;
     disableBlendedMaskedStores = false;

ispc.h (6 changed lines)

@@ -311,6 +311,12 @@ struct Opt {
         that support them). */
     bool disableFMA;
 
+    /** Always generate aligned vector load/store instructions; this
+        implies a guarantee that all dynamic access through pointers that
+        becomes a vector load/store will be a cache-aligned sequence of
+        locations. */
+    bool forceAlignedMemory;
+
     /** If enabled, disables the various optimizations that kick in when
         the execution mask can be determined to be "all on" at compile
         time. */
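In caller terms, the guarantee documented above means that any buffer reaching
a vector load/store must live in suitably aligned storage. One way a host
program might honor that contract (a sketch; 64 bytes matches the cache-line
wording, and posix_memalign is just one allocation option):

    #include <stdlib.h>

    /* Allocate data for an ispc kernel so that vector loads/stores derived
       from this pointer land on cache-aligned locations. */
    float *alloc_aligned_floats(size_t n) {
        void *p = NULL;
        if (posix_memalign(&p, 64, n * sizeof(float)) != 0)
            return NULL;  /* allocation failed */
        return (float *)p;
    }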


@@ -119,6 +119,7 @@ usage(int ret) {
     printf("    disable-loop-unroll\t\tDisable loop unrolling.\n");
     printf("    fast-masked-vload\t\tFaster masked vector loads on SSE (may go past end of array)\n");
     printf("    fast-math\t\t\tPerform non-IEEE-compliant optimizations of numeric expressions\n");
+    printf("    force-aligned-memory\t\tAlways issue \"aligned\" vector load and store instructions\n");
 #ifndef ISPC_IS_WINDOWS
     printf("    [--pic]\t\t\t\tGenerate position-independent code\n");
 #endif // !ISPC_IS_WINDOWS
@@ -336,6 +337,8 @@ int main(int Argc, char *Argv[]) {
                 g->opt.unrollLoops = false;
             else if (!strcmp(opt, "disable-fma"))
                 g->opt.disableFMA = true;
+            else if (!strcmp(opt, "force-aligned-memory"))
+                g->opt.forceAlignedMemory = true;
             // These are only used for performance tests of specific
             // optimizations
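With the parsing above in place, the option is enabled like any other --opt
flag, e.g. (file names hypothetical):

    ispc --opt=force-aligned-memory foo.ispc -o foo.o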


@@ -1783,6 +1783,8 @@ Module::execPreprocessor(const char *infilename, llvm::raw_string_ostream *ostream
         opts.addMacroDef("ISPC_TARGET_HAS_HALF");
     if (g->target.hasTranscendentals)
         opts.addMacroDef("ISPC_TARGET_HAS_TRANSCENDENTALS");
+    if (g->opt.forceAlignedMemory)
+        opts.addMacroDef("ISPC_FORCE_ALIGNED_MEMORY");
     opts.addMacroDef("ISPC_MAJOR_VERSION=1");
     opts.addMacroDef("ISPC_MINOR_VERSION=3");

opt.cpp (17 changed lines)

@@ -791,7 +791,11 @@ IntrinsicsOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
                                    llvm::PointerType::get(returnType, 0),
                                    name, callInst);
             lCopyMetadata(castPtr, callInst);
-            int align = callInst->getCalledFunction() == avxMaskedLoad32 ? 4 : 8;
+            int align;
+            if (g->opt.forceAlignedMemory)
+                align = 0;
+            else
+                align = callInst->getCalledFunction() == avxMaskedLoad32 ? 4 : 8;
             name = LLVMGetName(callInst->getArgOperand(0), "_load");
             llvm::Instruction *loadInst =
                 new llvm::LoadInst(castPtr, name, false /* not volatile */,
@@ -829,7 +833,11 @@ IntrinsicsOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
             llvm::StoreInst *storeInst =
                 new llvm::StoreInst(rvalue, castPtr, (llvm::Instruction *)NULL);
-            int align = callInst->getCalledFunction() == avxMaskedStore32 ? 4 : 8;
+            int align;
+            if (g->opt.forceAlignedMemory)
+                align = 0;
+            else
+                align = callInst->getCalledFunction() == avxMaskedStore32 ? 4 : 8;
             storeInst->setAlignment(align);
             lCopyMetadata(storeInst, callInst);
             llvm::ReplaceInstWithInst(callInst, storeInst);
@@ -2553,7 +2561,7 @@ lImproveMaskedStore(llvm::CallInst *callInst) {
         lCopyMetadata(lvalue, callInst);
         llvm::Instruction *store =
             new llvm::StoreInst(rvalue, lvalue, false /* not volatile */,
-                                info->align);
+                                g->opt.forceAlignedMemory ? 0 : info->align);
         lCopyMetadata(store, callInst);
         llvm::ReplaceInstWithInst(callInst, store);
         return true;
@@ -2616,7 +2624,8 @@ lImproveMaskedLoad(llvm::CallInst *callInst,
                             callInst);
         llvm::Instruction *load =
             new llvm::LoadInst(ptr, callInst->getName(), false /* not volatile */,
-                               info->align, (llvm::Instruction *)NULL);
+                               g->opt.forceAlignedMemory ? 0 : info->align,
+                               (llvm::Instruction *)NULL);
         lCopyMetadata(load, callInst);
         llvm::ReplaceInstWithInst(callInst, load);
         return true;
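The align = 0 cases above rely on an LLVM convention: in the LLVM 3.x-era API
used here, an alignment of 0 on a load or store means "assume the ABI
alignment of the accessed type," which for a vector type is the full vector
alignment. A sketch of the same idea in isolation (variable names
hypothetical):

    /* Passing 0 lets LLVM assume the type's natural ABI alignment -- for a
       vector, the full vector width -- instead of the conservative 4/8. */
    llvm::LoadInst *load =
        new llvm::LoadInst(castPtr, "aligned_load", false /* not volatile */,
                           0 /* 0 => ABI alignment of the loaded type */,
                           insertBefore);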


@@ -99,15 +99,21 @@ void *ISPCAlloc(void **handle, int64_t size, int32_t alignment) {
 }
 
+#if defined(_WIN32) || defined(_WIN64)
+#define ALIGN
+#else
+#define ALIGN __attribute__((aligned(64)))
+#endif
 int main(int argc, char *argv[]) {
     int w = width();
     assert(w <= 64);
-    float returned_result[64];
-    float vfloat[64];
-    double vdouble[64];
-    int vint[64], vint2[64];
+    float returned_result[64] ALIGN;
+    float vfloat[64] ALIGN;
+    double vdouble[64] ALIGN;
+    int vint[64] ALIGN;
+    int vint2[64] ALIGN;
     for (int i = 0; i < 64; ++i) {
         returned_result[i] = -1e20;
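One note on the ALIGN macro added above: it expands to nothing on Windows, so
the test arrays there keep their default alignment. A hypothetical portable
variant (not part of this commit) could cover MSVC as well:

    #if defined(_MSC_VER)
    #define ALIGN __declspec(align(64))        /* MSVC spelling */
    #elif defined(__GNUC__)
    #define ALIGN __attribute__((aligned(64)))
    #else
    #define ALIGN                              /* unknown compiler: no-op */
    #endif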