Add --opt=force-aligned-memory option.

This forces all vector loads and stores to be issued under the assumption
that the given pointer is aligned to the vector size, allowing the use of
sometimes-more-efficient aligned instructions. (If the memory is not in
fact aligned, the program will fail!)
commit be2108260e
Author: Matt Pharr
Date:   2012-09-14 13:49:45 -07:00
Parent: 59b0a2b208

6 changed files with 35 additions and 8 deletions
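For context (not part of the commit): the "sometimes-more-efficient
instructions" are aligned vector moves such as SSE's movaps, which fault at
run time if the address is not vector-size aligned, as opposed to movups,
which accepts any address. A minimal C sketch of that distinction, using SSE
intrinsics:

    #include <immintrin.h>

    void sketch(const float *p /* assumed 16-byte aligned */, float out[4]) {
        /* _mm_load_ps compiles to an aligned load (movaps) and faults if p
           is not 16-byte aligned -- the failure mode warned about above. */
        __m128 v = _mm_load_ps(p);
        /* _mm_loadu_ps (movups) tolerates any alignment, at a possible
           performance cost. */
        __m128 u = _mm_loadu_ps(p + 1);
        _mm_storeu_ps(out, _mm_add_ps(v, u));
    }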


@@ -640,6 +640,7 @@ Opt::Opt() {
     unrollLoops = true;
     disableAsserts = false;
     disableFMA = false;
+    forceAlignedMemory = false;
     disableMaskAllOnOptimizations = false;
     disableHandlePseudoMemoryOps = false;
     disableBlendedMaskedStores = false;

ispc.h (6 changed lines)

@@ -311,6 +311,12 @@ struct Opt {
         that support them). */
     bool disableFMA;
 
+    /** Always generate aligned vector load/store instructions; this
+        implies a guarantee that all dynamic access through pointers that
+        becomes a vector load/store will be a cache-aligned sequence of
+        locations. */
+    bool forceAlignedMemory;
+
     /** If enabled, disables the various optimizations that kick in when
         the execution mask can be determined to be "all on" at compile
         time. */
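In caller terms, the guarantee documented above means that any buffer reaching
a vector load/store must live in suitably aligned storage. One way a host
program might honor that contract (a sketch; 64 bytes matches the cache-line
wording, and posix_memalign is just one allocation option):

    #include <stdlib.h>

    /* Allocate data for an ispc kernel so that vector loads/stores derived
       from this pointer land on cache-aligned locations. */
    float *alloc_aligned_floats(size_t n) {
        void *p = NULL;
        if (posix_memalign(&p, 64, n * sizeof(float)) != 0)
            return NULL;  /* allocation failed */
        return (float *)p;
    }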


@@ -119,6 +119,7 @@ usage(int ret) {
     printf("    disable-loop-unroll\t\tDisable loop unrolling.\n");
     printf("    fast-masked-vload\t\tFaster masked vector loads on SSE (may go past end of array)\n");
     printf("    fast-math\t\t\tPerform non-IEEE-compliant optimizations of numeric expressions\n");
+    printf("    force-aligned-memory\t\tAlways issue \"aligned\" vector load and store instructions\n");
 #ifndef ISPC_IS_WINDOWS
     printf("    [--pic]\t\t\t\tGenerate position-independent code\n");
 #endif // !ISPC_IS_WINDOWS
@@ -336,6 +337,8 @@ int main(int Argc, char *Argv[]) {
                 g->opt.unrollLoops = false;
             else if (!strcmp(opt, "disable-fma"))
                 g->opt.disableFMA = true;
+            else if (!strcmp(opt, "force-aligned-memory"))
+                g->opt.forceAlignedMemory = true;
             // These are only used for performance tests of specific
             // optimizations
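With the parsing above in place, the option is enabled like any other --opt
flag, e.g. (file names hypothetical):

    ispc --opt=force-aligned-memory foo.ispc -o foo.o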


@@ -1783,6 +1783,8 @@ Module::execPreprocessor(const char *infilename, llvm::raw_string_ostream *ostream
         opts.addMacroDef("ISPC_TARGET_HAS_HALF");
     if (g->target.hasTranscendentals)
         opts.addMacroDef("ISPC_TARGET_HAS_TRANSCENDENTALS");
+    if (g->opt.forceAlignedMemory)
+        opts.addMacroDef("ISPC_FORCE_ALIGNED_MEMORY");
     opts.addMacroDef("ISPC_MAJOR_VERSION=1");
     opts.addMacroDef("ISPC_MINOR_VERSION=3");

opt.cpp (17 changed lines)

@@ -791,7 +791,11 @@ IntrinsicsOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
                                    llvm::PointerType::get(returnType, 0),
                                    name, callInst);
             lCopyMetadata(castPtr, callInst);
-            int align = callInst->getCalledFunction() == avxMaskedLoad32 ? 4 : 8;
+            int align;
+            if (g->opt.forceAlignedMemory)
+                align = 0;
+            else
+                align = callInst->getCalledFunction() == avxMaskedLoad32 ? 4 : 8;
             name = LLVMGetName(callInst->getArgOperand(0), "_load");
             llvm::Instruction *loadInst =
                 new llvm::LoadInst(castPtr, name, false /* not volatile */,
@@ -829,7 +833,11 @@ IntrinsicsOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
             llvm::StoreInst *storeInst =
                 new llvm::StoreInst(rvalue, castPtr, (llvm::Instruction *)NULL);
-            int align = callInst->getCalledFunction() == avxMaskedStore32 ? 4 : 8;
+            int align;
+            if (g->opt.forceAlignedMemory)
+                align = 0;
+            else
+                align = callInst->getCalledFunction() == avxMaskedStore32 ? 4 : 8;
             storeInst->setAlignment(align);
             lCopyMetadata(storeInst, callInst);
             llvm::ReplaceInstWithInst(callInst, storeInst);
@@ -2553,7 +2561,7 @@ lImproveMaskedStore(llvm::CallInst *callInst) {
         lCopyMetadata(lvalue, callInst);
         llvm::Instruction *store =
             new llvm::StoreInst(rvalue, lvalue, false /* not volatile */,
-                                info->align);
+                                g->opt.forceAlignedMemory ? 0 : info->align);
         lCopyMetadata(store, callInst);
         llvm::ReplaceInstWithInst(callInst, store);
         return true;
@@ -2616,7 +2624,8 @@ lImproveMaskedLoad(llvm::CallInst *callInst,
                             callInst);
         llvm::Instruction *load =
             new llvm::LoadInst(ptr, callInst->getName(), false /* not volatile */,
-                               info->align, (llvm::Instruction *)NULL);
+                               g->opt.forceAlignedMemory ? 0 : info->align,
+                               (llvm::Instruction *)NULL);
         lCopyMetadata(load, callInst);
         llvm::ReplaceInstWithInst(callInst, load);
         return true;
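The align = 0 cases above rely on an LLVM convention: in the LLVM 3.x-era API
used here, an alignment of 0 on a load or store means "assume the ABI
alignment of the accessed type," which for a vector type is the full vector
alignment. A sketch of the same idea in isolation (variable names
hypothetical):

    /* Passing 0 lets LLVM assume the type's natural ABI alignment -- for a
       vector, the full vector width -- instead of the conservative 4/8. */
    llvm::LoadInst *load =
        new llvm::LoadInst(castPtr, "aligned_load", false /* not volatile */,
                           0 /* 0 => ABI alignment of the loaded type */,
                           insertBefore);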


@@ -99,15 +99,21 @@ void *ISPCAlloc(void **handle, int64_t size, int32_t alignment) {
 }
 
+#if defined(_WIN32) || defined(_WIN64)
+#define ALIGN
+#else
+#define ALIGN __attribute__((aligned(64)))
+#endif
 int main(int argc, char *argv[]) {
     int w = width();
     assert(w <= 64);
-    float returned_result[64];
-    float vfloat[64];
-    double vdouble[64];
-    int vint[64], vint2[64];
+    float returned_result[64] ALIGN;
+    float vfloat[64] ALIGN;
+    double vdouble[64] ALIGN;
+    int vint[64] ALIGN;
+    int vint2[64] ALIGN;
     for (int i = 0; i < 64; ++i) {
         returned_result[i] = -1e20;
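One note on the ALIGN macro added above: it expands to nothing on Windows, so
the test arrays there keep their default alignment. A hypothetical portable
variant (not part of this commit) could cover MSVC as well:

    #if defined(_MSC_VER)
    #define ALIGN __declspec(align(64))        /* MSVC spelling */
    #elif defined(__GNUC__)
    #define ALIGN __attribute__((aligned(64)))
    #else
    #define ALIGN                              /* unknown compiler: no-op */
    #endif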