Do all memory op improvements in a single optimization pass.
Rather than having separate passes that each perform, when possible, one of these conversions:

- a general gather/scatter on a vector of pointers into a gather/scatter on a single base pointer plus a vector of integer offsets;
- a gather/scatter into a masked load/store or a load-plus-broadcast;
- a masked load/store into a regular load/store;

all of them are now done by a single ImproveMemoryOps pass. In particular, this change addresses phase-ordering issues that showed up with multidimensional array accesses: after determining that an outer dimension had the same index value across all program instances, we previously weren't able to take advantage of the uniformity of the resulting pointer.
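At a high level, the merged pass keeps each transformation's existing rewrite logic but drives all of them from one fixed-point loop over the basic block, so that one rewrite can immediately feed the next (for example, a gather whose pointers share a uniform base can be flattened to base+offsets form and then, on a later iteration of the same pass, become a masked or regular load). The following is a minimal, self-contained sketch of that dispatch structure only; Inst and the transform callbacks are hypothetical stand-ins for llvm::CallInst and the lGSToGSBaseOffsets / lGSToLoadStore / lImproveMaskedStore / lImproveMaskedLoad helpers added in the diff below, not actual ispc or LLVM API.

// Sketch only: the fixed-point dispatch of the merged pass, with
// simplified, hypothetical types (Inst stands in for llvm::CallInst).
#include <functional>
#include <vector>

struct Inst { int kind; };

// Each transform inspects one instruction and returns true if it rewrote
// it; after a rewrite it must no longer fire for that instruction, which
// is what guarantees the loop below terminates (in the real pass, firing
// replaces the call instruction entirely).
using Transform = std::function<bool(Inst &)>;

bool improveMemoryOps(std::vector<Inst> &block,
                      const std::vector<Transform> &transforms) {
    bool modifiedAny = false;
 restart:
    for (Inst &inst : block) {
        for (const Transform &t : transforms) {
            if (t(inst)) {
                // A successful rewrite may expose work for an *earlier*
                // transform -- the phase-ordering problem this commit
                // fixes -- so rescan the whole block from the top.
                modifiedAny = true;
                goto restart;
            }
        }
    }
    return modifiedAny;
}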
opt.cpp
@@ -89,12 +89,9 @@
 static llvm::Pass *CreateIntrinsicsOptPass();
 static llvm::Pass *CreateVSelMovmskOptPass();
-static llvm::Pass *CreateDetectGSBaseOffsetsPass();
-static llvm::Pass *CreateGSToLoadStorePass();
-static llvm::Pass *CreateGatherCoalescePass();
-static llvm::Pass *CreateMaskedStoreOptPass();
-static llvm::Pass *CreateMaskedLoadOptPass();
+static llvm::Pass *CreateImproveMemoryOpsPass();
+static llvm::Pass *CreateGatherCoalescePass();
 static llvm::Pass *CreateReplacePseudoMemoryOpsPass();

 static llvm::Pass *CreateIsCompileTimeConstantPass(bool isLastTry);
@@ -414,7 +411,7 @@ Optimize(llvm::Module *module, int optLevel) {
         // run absolutely no optimizations, since the front-end needs us to
         // take the various __pseudo_* functions it has emitted and turn
         // them into something that can actually execute.
-        optPM.add(CreateDetectGSBaseOffsetsPass());
+        optPM.add(CreateImproveMemoryOpsPass());
         if (g->opt.disableHandlePseudoMemoryOps == false)
             optPM.add(CreateReplacePseudoMemoryOpsPass());

@@ -446,12 +443,13 @@ Optimize(llvm::Module *module, int optLevel) {
         optPM.add(llvm::createDeadInstEliminationPass());
         optPM.add(llvm::createCFGSimplificationPass());

-        optPM.add(CreateDetectGSBaseOffsetsPass());
+        if (g->opt.disableGatherScatterOptimizations == false &&
+            g->target.vectorWidth > 1) {
+            optPM.add(CreateImproveMemoryOpsPass());
+        }
         if (!g->opt.disableMaskAllOnOptimizations) {
             optPM.add(CreateIntrinsicsOptPass());
             optPM.add(CreateVSelMovmskOptPass());
-            optPM.add(CreateMaskedStoreOptPass());
-            optPM.add(CreateMaskedLoadOptPass());
         }
         optPM.add(llvm::createDeadInstEliminationPass());
@@ -485,13 +483,11 @@ Optimize(llvm::Module *module, int optLevel) {
         if (!g->opt.disableMaskAllOnOptimizations) {
             optPM.add(CreateIntrinsicsOptPass());
             optPM.add(CreateVSelMovmskOptPass());
-            optPM.add(CreateMaskedStoreOptPass());
-            optPM.add(CreateMaskedLoadOptPass());
         }

         if (g->opt.disableGatherScatterOptimizations == false &&
             g->target.vectorWidth > 1) {
-            optPM.add(CreateGSToLoadStorePass());
+            optPM.add(CreateImproveMemoryOpsPass());

             if (g->opt.disableCoalescing == false &&
                 g->target.isa != Target::GENERIC) {
@@ -502,23 +498,26 @@ Optimize(llvm::Module *module, int optLevel) {
             }
         }

         if (g->opt.disableHandlePseudoMemoryOps == false)
             optPM.add(CreateReplacePseudoMemoryOpsPass());

-        if (!g->opt.disableMaskAllOnOptimizations) {
-            optPM.add(CreateMaskedStoreOptPass());
-            optPM.add(CreateMaskedLoadOptPass());
-        }
-
         optPM.add(llvm::createFunctionInliningPass());
         optPM.add(llvm::createConstantPropagationPass());
         optPM.add(CreateIntrinsicsOptPass());
         optPM.add(CreateVSelMovmskOptPass());

+        if (g->opt.disableGatherScatterOptimizations == false &&
+            g->target.vectorWidth > 1) {
+            optPM.add(CreateImproveMemoryOpsPass());
+        }
+
         optPM.add(llvm::createIPSCCPPass());
         optPM.add(llvm::createDeadArgEliminationPass());
         optPM.add(llvm::createInstructionCombiningPass());
         optPM.add(llvm::createCFGSimplificationPass());

         if (g->opt.disableHandlePseudoMemoryOps == false)
             optPM.add(CreateReplacePseudoMemoryOpsPass());
         optPM.add(CreateIntrinsicsOptPass());
         optPM.add(CreateVSelMovmskOptPass());

         optPM.add(llvm::createFunctionInliningPass());
         optPM.add(llvm::createArgumentPromotionPass());
         optPM.add(llvm::createScalarReplAggregatesPass(-1, false));
@@ -959,7 +958,7 @@ CreateVSelMovmskOptPass() {


 ///////////////////////////////////////////////////////////////////////////
-// DetectGSBaseOffsetsPass
+// ImproveMemoryOpsPass

 /** When the front-end emits gathers and scatters, it generates an array of
     vector-width pointers to represent the set of addresses to read from or
@@ -971,16 +970,16 @@ CreateVSelMovmskOptPass() {
     See for example the comments discussing the __pseudo_gather functions
     in builtins.cpp for more information about this.
 */
-class DetectGSBaseOffsetsPass : public llvm::BasicBlockPass {
+class ImproveMemoryOpsPass : public llvm::BasicBlockPass {
 public:
     static char ID;
-    DetectGSBaseOffsetsPass() : BasicBlockPass(ID) { }
+    ImproveMemoryOpsPass() : BasicBlockPass(ID) { }

-    const char *getPassName() const { return "Gather/Scatter Flattening"; }
+    const char *getPassName() const { return "Improve Memory Ops"; }
     bool runOnBasicBlock(llvm::BasicBlock &BB);
 };

-char DetectGSBaseOffsetsPass::ID = 0;
+char ImproveMemoryOpsPass::ID = 0;


@@ -1670,6 +1669,8 @@ lOffsets32BitSafe(llvm::Value **variableOffsetPtr,
 }


+static bool
+lGSToGSBaseOffsets(llvm::CallInst *callInst) {
 struct GSInfo {
     GSInfo(const char *pgFuncName, const char *pgboFuncName,
            const char *pgbo32FuncName, bool ig)
@@ -1683,11 +1684,6 @@ struct GSInfo {
     const bool isGather;
 };


-bool
-DetectGSBaseOffsetsPass::runOnBasicBlock(llvm::BasicBlock &bb) {
-    DEBUG_START_PASS("DetectGSBaseOffsets");
-
     GSInfo gsFuncs[] = {
         GSInfo("__pseudo_gather32_i8", "__pseudo_gather_base_offsets32_i8",
                "__pseudo_gather_base_offsets32_i8", true),
@@ -1741,21 +1737,12 @@ DetectGSBaseOffsetsPass::runOnBasicBlock(llvm::BasicBlock &bb) {
         GSInfo("__pseudo_scatter64_double", "__pseudo_scatter_base_offsets64_double",
                "__pseudo_scatter_base_offsets32_double", false),
     };

     int numGSFuncs = sizeof(gsFuncs) / sizeof(gsFuncs[0]);
     for (int i = 0; i < numGSFuncs; ++i)
         Assert(gsFuncs[i].func != NULL && gsFuncs[i].baseOffsetsFunc != NULL &&
                gsFuncs[i].baseOffsets32Func != NULL);

-    bool modifiedAny = false;
- restart:
-    // Iterate through all of the instructions in the basic block.
-    for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) {
-        llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*iter);
-        // If we don't have a call to one of the
-        // __pseudo_{gather,scatter}_* functions, then just go on to the
-        // next instruction.
-        if (callInst == NULL)
-            continue;
     GSInfo *info = NULL;
     for (int i = 0; i < numGSFuncs; ++i)
         if (gsFuncs[i].func != NULL &&
@@ -1764,7 +1751,7 @@ DetectGSBaseOffsetsPass::runOnBasicBlock(llvm::BasicBlock &bb) {
             break;
         }
     if (info == NULL)
-        continue;
+        return false;

     // Try to transform the array of pointers to a single base pointer
     // and an array of int32 offsets.  (All the hard work is done by
@@ -1778,7 +1765,7 @@ DetectGSBaseOffsetsPass::runOnBasicBlock(llvm::BasicBlock &bb) {
         // It's actually a fully general gather/scatter with a varying
         // set of base pointers, so leave it as is and continue onward
         // to the next instruction...
-        continue;
+        return false;

     // Try to decompose the offset vector into a compile time constant
     // component and a varying component.  The constant component is
@@ -1843,254 +1830,19 @@ DetectGSBaseOffsetsPass::runOnBasicBlock(llvm::BasicBlock &bb) {
         lCopyMetadata(newCall, callInst);
         llvm::ReplaceInstWithInst(callInst, newCall);
     }
-        modifiedAny = true;
-        goto restart;
-    }
-
-    DEBUG_END_PASS("DetectGSBaseOffsets");
-
-    return modifiedAny;
+    return true;
 }


-static llvm::Pass *
-CreateDetectGSBaseOffsetsPass() {
-    return new DetectGSBaseOffsetsPass;
+static llvm::Value *
+lComputeCommonPointer(llvm::Value *base, llvm::Value *offsets,
+                      llvm::Instruction *insertBefore) {
+    llvm::Value *firstOffset = LLVMExtractFirstVectorElement(offsets);
+    return lGEPInst(base, firstOffset, "ptr", insertBefore);
 }


-///////////////////////////////////////////////////////////////////////////
-// MaskedStoreOptPass
-
-/** Masked stores are generally more complex than regular stores; for
-    example, they require multiple instructions to simulate under SSE.
-    This optimization detects cases where masked stores can be replaced
-    with regular stores or removed entirely, for the cases of an 'all on'
-    mask and an 'all off' mask, respectively.
-*/
-class MaskedStoreOptPass : public llvm::BasicBlockPass {
-public:
-    static char ID;
-    MaskedStoreOptPass() : BasicBlockPass(ID) { }
-
-    const char *getPassName() const { return "Masked Store Scalarize"; }
-    bool runOnBasicBlock(llvm::BasicBlock &BB);
-};
-
-
-char MaskedStoreOptPass::ID = 0;
-
-struct MSInfo {
-    MSInfo(const char *name, const int a)
-        : align(a) {
-        func = m->module->getFunction(name);
-        Assert(func != NULL);
-    }
-    llvm::Function *func;
-    const int align;
-};
-
-
-bool
-MaskedStoreOptPass::runOnBasicBlock(llvm::BasicBlock &bb) {
-    DEBUG_START_PASS("MaskedStoreOpt");
-
-    MSInfo msInfo[] = {
-        MSInfo("__pseudo_masked_store_i8", 1),
-        MSInfo("__pseudo_masked_store_i16", 2),
-        MSInfo("__pseudo_masked_store_i32", 4),
-        MSInfo("__pseudo_masked_store_float", 4),
-        MSInfo("__pseudo_masked_store_i64", 8),
-        MSInfo("__pseudo_masked_store_double", 8),
-        MSInfo("__masked_store_blend_i8", 1),
-        MSInfo("__masked_store_blend_i16", 2),
-        MSInfo("__masked_store_blend_i32", 4),
-        MSInfo("__masked_store_blend_float", 4),
-        MSInfo("__masked_store_blend_i64", 8),
-        MSInfo("__masked_store_blend_double", 8),
-        MSInfo("__masked_store_i8", 1),
-        MSInfo("__masked_store_i16", 2),
-        MSInfo("__masked_store_i32", 4),
-        MSInfo("__masked_store_float", 4),
-        MSInfo("__masked_store_i64", 8),
-        MSInfo("__masked_store_double", 8)
-    };
-
-    bool modifiedAny = false;
- restart:
-    // Iterate over all of the instructions to look for one of the various
-    // masked store functions
-    for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) {
-        llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*iter);
-        if (callInst == NULL)
-            continue;
-
-        llvm::Function *called = callInst->getCalledFunction();
-        if (called == NULL)
-            continue;
-
-        int nMSFuncs = sizeof(msInfo) / sizeof(msInfo[0]);
-        MSInfo *info = NULL;
-        for (int i = 0; i < nMSFuncs; ++i) {
-            if (msInfo[i].func != NULL && called == msInfo[i].func) {
-                info = &msInfo[i];
-                break;
-            }
-        }
-        if (info == NULL)
-            continue;
-
-        // Got one; grab the operands
-        llvm::Value *lvalue = callInst->getArgOperand(0);
-        llvm::Value *rvalue = callInst->getArgOperand(1);
-        llvm::Value *mask = callInst->getArgOperand(2);
-
-        MaskStatus maskStatus = lGetMaskStatus(mask);
-        if (maskStatus == ALL_OFF) {
-            // Zero mask - no-op, so remove the store completely.  (This
-            // may in turn lead to being able to optimize out instructions
-            // that compute the rvalue...)
-            callInst->eraseFromParent();
-            modifiedAny = true;
-            goto restart;
-        }
-        else if (maskStatus == ALL_ON) {
-            // The mask is all on, so turn this into a regular store
-            llvm::Type *rvalueType = rvalue->getType();
-            llvm::Type *ptrType = llvm::PointerType::get(rvalueType, 0);
-
-            lvalue = new llvm::BitCastInst(lvalue, ptrType, "lvalue_to_ptr_type", callInst);
-            lCopyMetadata(lvalue, callInst);
-            llvm::Instruction *store =
-                new llvm::StoreInst(rvalue, lvalue, false /* not volatile */,
-                                    info->align);
-            lCopyMetadata(store, callInst);
-            llvm::ReplaceInstWithInst(callInst, store);
-
-            modifiedAny = true;
-            goto restart;
-        }
-    }
-
-    DEBUG_END_PASS("MaskedStoreOpt");
-
-    return modifiedAny;
-}
-
-
-static llvm::Pass *
-CreateMaskedStoreOptPass() {
-    return new MaskedStoreOptPass;
-}
-
-
-///////////////////////////////////////////////////////////////////////////
-// MaskedLoadOptPass
-
-/** Masked load improvements for the all on/all off mask cases.
-*/
-class MaskedLoadOptPass : public llvm::BasicBlockPass {
-public:
-    static char ID;
-    MaskedLoadOptPass() : BasicBlockPass(ID) { }
-
-    const char *getPassName() const { return "Masked Load Improvements"; }
-    bool runOnBasicBlock(llvm::BasicBlock &BB);
-};
-
-
-char MaskedLoadOptPass::ID = 0;
-
-struct MLInfo {
-    MLInfo(const char *name, const int a)
-        : align(a) {
-        func = m->module->getFunction(name);
-        Assert(func != NULL);
-    }
-    llvm::Function *func;
-    const int align;
-};
-
-
-bool
-MaskedLoadOptPass::runOnBasicBlock(llvm::BasicBlock &bb) {
-    DEBUG_START_PASS("MaskedLoadOpt");
-
-    MLInfo mlInfo[] = {
-        MLInfo("__masked_load_i8", 1),
-        MLInfo("__masked_load_i16", 2),
-        MLInfo("__masked_load_i32", 4),
-        MLInfo("__masked_load_float", 4),
-        MLInfo("__masked_load_i64", 8),
-        MLInfo("__masked_load_double", 8)
-    };
-
-    bool modifiedAny = false;
- restart:
-    // Iterate over all of the instructions to look for one of the various
-    // masked load functions
-    for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) {
-        llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*iter);
-        if (!callInst)
-            continue;
-
-        llvm::Function *called = callInst->getCalledFunction();
-        if (called == NULL)
-            continue;
-
-        int nFuncs = sizeof(mlInfo) / sizeof(mlInfo[0]);
-        MLInfo *info = NULL;
-        for (int i = 0; i < nFuncs; ++i) {
-            if (mlInfo[i].func != NULL && called == mlInfo[i].func) {
-                info = &mlInfo[i];
-                break;
-            }
-        }
-        if (info == NULL)
-            continue;
-
-        // Got one; grab the operands
-        llvm::Value *ptr = callInst->getArgOperand(0);
-        llvm::Value *mask = callInst->getArgOperand(1);
-
-        MaskStatus maskStatus = lGetMaskStatus(mask);
-        if (maskStatus == ALL_OFF) {
-            // Zero mask - no-op, so replace the load with an undef value
-            llvm::ReplaceInstWithValue(iter->getParent()->getInstList(),
-                                       iter, llvm::UndefValue::get(callInst->getType()));
-            modifiedAny = true;
-            goto restart;
-        }
-        else if (maskStatus == ALL_ON) {
-            // The mask is all on, so turn this into a regular load
-            llvm::Type *ptrType = llvm::PointerType::get(callInst->getType(), 0);
-            ptr = new llvm::BitCastInst(ptr, ptrType, "ptr_cast_for_load",
-                                        callInst);
-            llvm::Instruction *load =
-                new llvm::LoadInst(ptr, callInst->getName(), false /* not volatile */,
-                                   info->align, (llvm::Instruction *)NULL);
-            lCopyMetadata(load, callInst);
-            llvm::ReplaceInstWithInst(callInst, load);
-            modifiedAny = true;
-            goto restart;
-        }
-    }
-
-    DEBUG_END_PASS("MaskedLoadOpt");
-
-    return modifiedAny;
-}
-
-
-static llvm::Pass *
-CreateMaskedLoadOptPass() {
-    return new MaskedLoadOptPass;
-}
-
-
 ///////////////////////////////////////////////////////////////////////////
 // GSToLoadStorePass

 /** After earlier optimization passes have run, we are sometimes able to
     determine that gathers/scatters are actually accessing memory in a more
     regular fashion and then change the operation to something simpler and
@@ -2106,19 +1858,8 @@ CreateMaskedLoadOptPass() {
     shuffle or things that could be handled with hybrids of e.g. 2 4-wide
     vector loads with AVX, etc.
 */
-class GSToLoadStorePass : public llvm::BasicBlockPass {
-public:
-    static char ID;
-    GSToLoadStorePass() : BasicBlockPass(ID) { }
-
-    const char *getPassName() const { return "Gather/Scatter Improvements"; }
-    bool runOnBasicBlock(llvm::BasicBlock &BB);
-};
-
-
-char GSToLoadStorePass::ID = 0;
-
-
+static bool
+lGSToLoadStore(llvm::CallInst *callInst) {
 struct GatherImpInfo {
     GatherImpInfo(const char *pName, const char *lmName, llvm::Type *st,
                   int a)
@@ -2135,35 +1876,6 @@ struct GatherImpInfo {
     const int align;
 };


-static llvm::Value *
-lComputeCommonPointer(llvm::Value *base, llvm::Value *offsets,
-                      llvm::Instruction *insertBefore) {
-    llvm::Value *firstOffset = LLVMExtractFirstVectorElement(offsets);
-    return lGEPInst(base, firstOffset, "ptr", insertBefore);
-}
-
-
-struct ScatterImpInfo {
-    ScatterImpInfo(const char *pName, const char *msName,
-                   llvm::Type *vpt, int a)
-        : align(a) {
-        pseudoFunc = m->module->getFunction(pName);
-        maskedStoreFunc = m->module->getFunction(msName);
-        vecPtrType = vpt;
-        Assert(pseudoFunc != NULL && maskedStoreFunc != NULL);
-    }
-    llvm::Function *pseudoFunc;
-    llvm::Function *maskedStoreFunc;
-    llvm::Type *vecPtrType;
-    const int align;
-};
-
-
-bool
-GSToLoadStorePass::runOnBasicBlock(llvm::BasicBlock &bb) {
-    DEBUG_START_PASS("GSToLoadStorePass");
-
     GatherImpInfo gInfo[] = {
         GatherImpInfo("__pseudo_gather_base_offsets32_i8", "__masked_load_i8",
                       LLVMTypes::Int8Type, 1),
@@ -2190,6 +1902,22 @@ GSToLoadStorePass::runOnBasicBlock(llvm::BasicBlock &bb) {
         GatherImpInfo("__pseudo_gather_base_offsets64_double", "__masked_load_double",
                       LLVMTypes::DoubleType, 8)
     };

+    struct ScatterImpInfo {
+        ScatterImpInfo(const char *pName, const char *msName,
+                       llvm::Type *vpt, int a)
+            : align(a) {
+            pseudoFunc = m->module->getFunction(pName);
+            maskedStoreFunc = m->module->getFunction(msName);
+            vecPtrType = vpt;
+            Assert(pseudoFunc != NULL && maskedStoreFunc != NULL);
+        }
+        llvm::Function *pseudoFunc;
+        llvm::Function *maskedStoreFunc;
+        llvm::Type *vecPtrType;
+        const int align;
+    };
+
     ScatterImpInfo sInfo[] = {
         ScatterImpInfo("__pseudo_scatter_base_offsets32_i8", "__pseudo_masked_store_i8",
                        LLVMTypes::Int8VectorPointerType, 1),
@@ -2217,19 +1945,7 @@ GSToLoadStorePass::runOnBasicBlock(llvm::BasicBlock &bb) {
                        LLVMTypes::DoubleVectorPointerType, 8)
     };

-    bool modifiedAny = false;
-
- restart:
-    for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) {
-        // Iterate over all of the instructions and look for calls to
-        // __pseudo_*_base_offsets_* calls.
-        llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*iter);
-        if (callInst == NULL)
-            continue;
-
     llvm::Function *calledFunc = callInst->getCalledFunction();
-        if (calledFunc == NULL)
-            continue;
-
     GatherImpInfo *gatherInfo = NULL;
     ScatterImpInfo *scatterInfo = NULL;
@@ -2248,7 +1964,7 @@ GSToLoadStorePass::runOnBasicBlock(llvm::BasicBlock &bb) {
         }
     }
     if (gatherInfo == NULL && scatterInfo == NULL)
-        continue;
+        return false;

     SourcePos pos;
     lGetSourcePosFromMetadata(callInst, &pos);
@@ -2313,8 +2029,7 @@ GSToLoadStorePass::runOnBasicBlock(llvm::BasicBlock &bb) {
             lCopyMetadata(vecValue, callInst);
             llvm::ReplaceInstWithInst(callInst,
                                       llvm::dyn_cast<llvm::Instruction>(vecValue));
-            modifiedAny = true;
-            goto restart;
+            return true;
         }
         else {
             // A scatter with everyone going to the same location is
@@ -2331,6 +2046,7 @@ GSToLoadStorePass::runOnBasicBlock(llvm::BasicBlock &bb) {
             // case.  We'll just let a bunch of the program instances
             // do redundant writes, since this isn't important to make
            // fast anyway...
+            return false;
         }
     }
     else {
@@ -2351,6 +2067,7 @@ GSToLoadStorePass::runOnBasicBlock(llvm::BasicBlock &bb) {
                                 LLVMGetName(ptr, "_masked_load"));
             lCopyMetadata(newCall, callInst);
             llvm::ReplaceInstWithInst(callInst, newCall);
+            return true;
         }
         else {
             Debug(pos, "Transformed scatter to unaligned vector store!");
@@ -2361,23 +2078,207 @@ GSToLoadStorePass::runOnBasicBlock(llvm::BasicBlock &bb) {
                                            mask, "");
             lCopyMetadata(newCall, callInst);
             llvm::ReplaceInstWithInst(callInst, newCall);
+            return true;
         }
     }
+    return false;
 }
-}


+///////////////////////////////////////////////////////////////////////////
+// MaskedStoreOptPass
+
+/** Masked stores are generally more complex than regular stores; for
+    example, they require multiple instructions to simulate under SSE.
+    This optimization detects cases where masked stores can be replaced
+    with regular stores or removed entirely, for the cases of an 'all on'
+    mask and an 'all off' mask, respectively.
+*/
+static bool
+lImproveMaskedStore(llvm::CallInst *callInst) {
+    struct MSInfo {
+        MSInfo(const char *name, const int a)
+            : align(a) {
+            func = m->module->getFunction(name);
+            Assert(func != NULL);
+        }
+        llvm::Function *func;
+        const int align;
+    };
+
+    MSInfo msInfo[] = {
+        MSInfo("__pseudo_masked_store_i8", 1),
+        MSInfo("__pseudo_masked_store_i16", 2),
+        MSInfo("__pseudo_masked_store_i32", 4),
+        MSInfo("__pseudo_masked_store_float", 4),
+        MSInfo("__pseudo_masked_store_i64", 8),
+        MSInfo("__pseudo_masked_store_double", 8),
+        MSInfo("__masked_store_blend_i8", 1),
+        MSInfo("__masked_store_blend_i16", 2),
+        MSInfo("__masked_store_blend_i32", 4),
+        MSInfo("__masked_store_blend_float", 4),
+        MSInfo("__masked_store_blend_i64", 8),
+        MSInfo("__masked_store_blend_double", 8),
+        MSInfo("__masked_store_i8", 1),
+        MSInfo("__masked_store_i16", 2),
+        MSInfo("__masked_store_i32", 4),
+        MSInfo("__masked_store_float", 4),
+        MSInfo("__masked_store_i64", 8),
+        MSInfo("__masked_store_double", 8)
+    };
+
+    llvm::Function *called = callInst->getCalledFunction();
+
+    int nMSFuncs = sizeof(msInfo) / sizeof(msInfo[0]);
+    MSInfo *info = NULL;
+    for (int i = 0; i < nMSFuncs; ++i) {
+        if (msInfo[i].func != NULL && called == msInfo[i].func) {
+            info = &msInfo[i];
+            break;
+        }
+    }
+    if (info == NULL)
+        return false;
+
+    // Got one; grab the operands
+    llvm::Value *lvalue = callInst->getArgOperand(0);
+    llvm::Value *rvalue = callInst->getArgOperand(1);
+    llvm::Value *mask = callInst->getArgOperand(2);
+
+    MaskStatus maskStatus = lGetMaskStatus(mask);
+    if (maskStatus == ALL_OFF) {
+        // Zero mask - no-op, so remove the store completely.  (This
+        // may in turn lead to being able to optimize out instructions
+        // that compute the rvalue...)
+        callInst->eraseFromParent();
+        return true;
+    }
+    else if (maskStatus == ALL_ON) {
+        // The mask is all on, so turn this into a regular store
+        llvm::Type *rvalueType = rvalue->getType();
+        llvm::Type *ptrType = llvm::PointerType::get(rvalueType, 0);
+
+        lvalue = new llvm::BitCastInst(lvalue, ptrType, "lvalue_to_ptr_type", callInst);
+        lCopyMetadata(lvalue, callInst);
+        llvm::Instruction *store =
+            new llvm::StoreInst(rvalue, lvalue, false /* not volatile */,
+                                info->align);
+        lCopyMetadata(store, callInst);
+        llvm::ReplaceInstWithInst(callInst, store);
+        return true;
+    }
+
+    return false;
+}
+
+
+static bool
+lImproveMaskedLoad(llvm::CallInst *callInst,
+                   llvm::BasicBlock::iterator iter) {
+    struct MLInfo {
+        MLInfo(const char *name, const int a)
+            : align(a) {
+            func = m->module->getFunction(name);
+            Assert(func != NULL);
+        }
+        llvm::Function *func;
+        const int align;
+    };
+
+    MLInfo mlInfo[] = {
+        MLInfo("__masked_load_i8", 1),
+        MLInfo("__masked_load_i16", 2),
+        MLInfo("__masked_load_i32", 4),
+        MLInfo("__masked_load_float", 4),
+        MLInfo("__masked_load_i64", 8),
+        MLInfo("__masked_load_double", 8)
+    };
+
+    llvm::Function *called = callInst->getCalledFunction();
+
+    int nFuncs = sizeof(mlInfo) / sizeof(mlInfo[0]);
+    MLInfo *info = NULL;
+    for (int i = 0; i < nFuncs; ++i) {
+        if (mlInfo[i].func != NULL && called == mlInfo[i].func) {
+            info = &mlInfo[i];
+            break;
+        }
+    }
+    if (info == NULL)
+        return false;
+
+    // Got one; grab the operands
+    llvm::Value *ptr = callInst->getArgOperand(0);
+    llvm::Value *mask = callInst->getArgOperand(1);
+
+    MaskStatus maskStatus = lGetMaskStatus(mask);
+    if (maskStatus == ALL_OFF) {
+        // Zero mask - no-op, so replace the load with an undef value
+        llvm::ReplaceInstWithValue(iter->getParent()->getInstList(),
+                                   iter, llvm::UndefValue::get(callInst->getType()));
+        return true;
+    }
+    else if (maskStatus == ALL_ON) {
+        // The mask is all on, so turn this into a regular load
+        llvm::Type *ptrType = llvm::PointerType::get(callInst->getType(), 0);
+        ptr = new llvm::BitCastInst(ptr, ptrType, "ptr_cast_for_load",
+                                    callInst);
+        llvm::Instruction *load =
+            new llvm::LoadInst(ptr, callInst->getName(), false /* not volatile */,
+                               info->align, (llvm::Instruction *)NULL);
+        lCopyMetadata(load, callInst);
+        llvm::ReplaceInstWithInst(callInst, load);
+        return true;
+    }
+    else
+        return false;
+}
+
+
+bool
+ImproveMemoryOpsPass::runOnBasicBlock(llvm::BasicBlock &bb) {
+    DEBUG_START_PASS("ImproveMemoryOps");
+
+    bool modifiedAny = false;
+ restart:
+    // Iterate through all of the instructions in the basic block.
+    for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) {
+        llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*iter);
+        // If we don't have a call to one of the
+        // __pseudo_{gather,scatter}_* functions, then just go on to the
+        // next instruction.
+        if (callInst == NULL ||
+            callInst->getCalledFunction() == NULL)
+            continue;
+
+        if (lGSToGSBaseOffsets(callInst)) {
+            modifiedAny = true;
+            goto restart;
+        }
+        if (lGSToLoadStore(callInst)) {
+            modifiedAny = true;
+            goto restart;
+        }
+        if (lImproveMaskedStore(callInst)) {
+            modifiedAny = true;
+            goto restart;
+        }
+        if (lImproveMaskedLoad(callInst, iter)) {
+            modifiedAny = true;
+            goto restart;
+        }
+    }
-    }

-    DEBUG_END_PASS("GSToLoadStorePass");
+    DEBUG_END_PASS("ImproveMemoryOps");

     return modifiedAny;
 }


 static llvm::Pass *
-CreateGSToLoadStorePass() {
-    return new GSToLoadStorePass;
+CreateImproveMemoryOpsPass() {
+    return new ImproveMemoryOpsPass;
 }