Handle ConstantExpressions when computing address+offset vectors for scatter/gather.

In particular, this fixes issue #81, where a global variable access led to
ConstantExpressions showing up in code that wasn't previously expecting them.
Matt Pharr
2011-10-14 11:20:08 -07:00
parent 2460fa5c83
commit 9f2aa8d92a
5 changed files with 163 additions and 15 deletions

opt.cpp (110 changed lines)

@@ -888,6 +888,74 @@ lGetTypeSize(LLVM_TYPE_CONST llvm::Type *type, llvm::Instruction *insertBefore)
}
static llvm::Value *
lTraverseConstantExpr(llvm::Constant *value, llvm::Value **offsetPtr,
LLVM_TYPE_CONST llvm::Type **scaleType,
bool *leafIsVarying, llvm::Instruction *insertBefore) {
llvm::GlobalVariable *gv = NULL;
llvm::ConstantExpr *ce = llvm::dyn_cast<llvm::ConstantExpr>(value);
if (ce != NULL) {
switch (ce->getOpcode()) {
case llvm::Instruction::BitCast:
*offsetPtr = LLVMInt32(0);
return lTraverseConstantExpr(ce->getOperand(0), offsetPtr,
scaleType, leafIsVarying, insertBefore);
case llvm::Instruction::GetElementPtr: {
gv = llvm::dyn_cast<llvm::GlobalVariable>(ce->getOperand(0));
assert(gv != NULL);
assert(lGetIntValue(ce->getOperand(1)) == 0);
LLVM_TYPE_CONST llvm::PointerType *targetPtrType =
llvm::dyn_cast<LLVM_TYPE_CONST llvm::PointerType>(ce->getOperand(0)->getType());
assert(targetPtrType);
LLVM_TYPE_CONST llvm::Type *targetType = targetPtrType->getElementType();
if (llvm::isa<const llvm::StructType>(targetType)) {
*offsetPtr = lStructOffset(targetType, lGetIntValue(ce->getOperand(2)),
insertBefore);
*offsetPtr = new llvm::TruncInst(*offsetPtr, LLVMTypes::Int32Type, "member32",
insertBefore);
//CO lCopyMetadata(*offset, insertBefore);
*scaleType = LLVMTypes::Int8Type; // aka char, i.e. a one-byte scale: the offset is already in bytes
}
else {
*offsetPtr = ce->getOperand(2);
assert(*scaleType == NULL || *scaleType == targetType);
*scaleType = targetType;
}
break;
}
default:
FATAL("Unexpected opcode in constant expression!");
//printf("other op %s\n", ce->getOpcodeName());
break;
}
}
if (gv == NULL)
gv = llvm::dyn_cast<llvm::GlobalVariable>(value);
if (gv != NULL) {
// FIXME: is this broken for arrays of varying?? (I think so.)
// If so, then why does the other copy of this code work? (Or is that
// broken, too?)
if (leafIsVarying != NULL) {
LLVM_TYPE_CONST llvm::Type *pt = value->getType();
LLVM_TYPE_CONST llvm::PointerType *ptrType =
llvm::dyn_cast<LLVM_TYPE_CONST llvm::PointerType>(pt);
assert(ptrType);
LLVM_TYPE_CONST llvm::Type *eltType = ptrType->getElementType();
*leafIsVarying = llvm::isa<LLVM_TYPE_CONST llvm::VectorType>(eltType);
//printf("decided that leaf %s varying!\n", *leafIsVarying ? "is" : "is not");
}
return gv;
}
return NULL;
}
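For illustration, here is a minimal standalone sketch of the base-pointer half of this recursion (lSketchGetBaseGlobal is a hypothetical helper, using the same LLVM 2.9-era API as the code above): peel BitCast and GetElementPtr constant expressions until the underlying global appears.
static llvm::GlobalVariable *
lSketchGetBaseGlobal(llvm::Constant *c) {
    // Base case: the constant is the global variable itself.
    if (llvm::GlobalVariable *gv = llvm::dyn_cast<llvm::GlobalVariable>(c))
        return gv;
    // Bitcast and GEP constant expressions carry the pointer they operate
    // on as operand 0; recurse through it.
    llvm::ConstantExpr *ce = llvm::dyn_cast<llvm::ConstantExpr>(c);
    if (ce != NULL && (ce->getOpcode() == llvm::Instruction::BitCast ||
                       ce->getOpcode() == llvm::Instruction::GetElementPtr))
        return lSketchGetBaseGlobal(llvm::cast<llvm::Constant>(ce->getOperand(0)));
    // Anything else (e.g. an inttoptr) isn't handled here.
    return NULL;
}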
static llvm::Value *
lGetOffsetForLane(int lane, llvm::Value *value, llvm::Value **offset,
LLVM_TYPE_CONST llvm::Type **scaleType, bool *leafIsVarying,
@@ -973,11 +1041,31 @@ lGetOffsetForLane(int lane, llvm::Value *value, llvm::Value **offset,
@todo All of the additional indexing magic for varying stuff should
happen in the front end.
*/
static llvm::Value *
lTraverseInsertChain(llvm::Value *ptrs, llvm::Value *offsets[ISPC_MAX_NVEC],
LLVM_TYPE_CONST llvm::Type **scaleType, bool *leafIsVarying,
llvm::Instruction *insertBefore) {
// The pointer values may be an array of constant pointers (this
// happens, for example, when indexing into global arrays). In that
// case, we have llvm::ConstantExprs to deconstruct to dig out the
// common base pointer and the per-lane offsets.
llvm::ConstantArray *ca = llvm::dyn_cast<llvm::ConstantArray>(ptrs);
if (ca != NULL) {
assert((int)ca->getNumOperands() == g->target.vectorWidth);
llvm::Value *base = NULL;
for (int i = 0; i < g->target.vectorWidth; ++i) {
llvm::Value *b = lTraverseConstantExpr(ca->getOperand(i), &offsets[i],
scaleType, leafIsVarying,
insertBefore);
if (i == 0)
base = b;
else
assert(base == b);
}
return base;
}
// This depends on the front-end constructing the arrays of pointers
// via InsertValue instructions. (Which it does do in
// FunctionEmitContext::GetElementPtrInst()).
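As a sketch of what walking such a chain looks like (lSketchCollectLanes is a hypothetical helper; the actual loop that follows also deals with offsets and scale types), each llvm::InsertValueInst records the aggregate it extends, the inserted value, and the lane index:
static void
lSketchCollectLanes(llvm::Value *ptrs, llvm::Value *lanes[ISPC_MAX_NVEC]) {
    // Follow the insertvalue chain back toward the initial undef
    // aggregate, recording the value stored into each lane.
    llvm::InsertValueInst *iv = llvm::dyn_cast<llvm::InsertValueInst>(ptrs);
    while (iv != NULL) {
        assert(iv->getNumIndices() == 1);
        lanes[*iv->idx_begin()] = iv->getInsertedValueOperand();
        iv = llvm::dyn_cast<llvm::InsertValueInst>(iv->getAggregateOperand());
    }
}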
@@ -1029,7 +1117,7 @@ lTraverseInsertChain(llvm::Value *ptrs, llvm::Value *offsets[ISPC_MAX_NVEC],
@todo Using shufflevector to do this seems more idiomatic (and would be
just a single instruction). Switch to that?
*/
static llvm::Value *
lSmearScalar(llvm::Value *scalar, llvm::Instruction *insertBefore) {
LLVM_TYPE_CONST llvm::Type *vectorType = llvm::VectorType::get(scalar->getType(),
@@ -1237,7 +1325,7 @@ CreateGatherScatterFlattenPass() {
This optimization detects cases where masked stores can be replaced
with regular stores or removed entirely, for the cases of an 'all on'
mask and an 'all off' mask, respectively.
*/
class MaskedStoreOptPass : public llvm::BasicBlockPass {
public:
static char ID;
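The compile-time mask test that drives the 'all off' case can be sketched as follows (lSketchMaskAllOff is a hypothetical helper; the real pass inspects the mask in whatever form ispc's codegen produces it):
static bool
lSketchMaskAllOff(llvm::Value *mask) {
    llvm::ConstantVector *cv = llvm::dyn_cast<llvm::ConstantVector>(mask);
    if (cv == NULL)
        return false;  // not a compile-time constant; be conservative
    for (unsigned i = 0; i < cv->getNumOperands(); ++i) {
        llvm::ConstantInt *ci = llvm::dyn_cast<llvm::ConstantInt>(cv->getOperand(i));
        if (ci == NULL || !ci->isZero())
            return false;  // this lane may be on
    }
    return true;  // provably all off: the masked store is dead
}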
@@ -1353,7 +1441,7 @@ CreateMaskedStoreOptPass() {
__pseudo_masked_store_{8,16,32,64} call as a placeholder. This pass
lowers these calls to either __masked_store_{8,16,32,64} or
__masked_store_blend_{8,16,32,64} calls.
*/
class LowerMaskedStorePass : public llvm::BasicBlockPass {
public:
static char ID;
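The lowering itself amounts to retargeting the placeholder call; a sketch (lSketchLowerPseudoStore is hypothetical, and the choice between the store and blend variants, which depends on the target, is elided):
static void
lSketchLowerPseudoStore(llvm::CallInst *call) {
    // CallInst -> BasicBlock -> Function -> Module
    llvm::Module *module = call->getParent()->getParent()->getParent();
    // Swap __pseudo_masked_store_32 for the real implementation; the
    // (pointer, value, mask) arguments carry over unchanged.
    llvm::Function *impl = module->getFunction("__masked_store_32");
    assert(impl != NULL);
    call->setCalledFunction(impl);
}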
@@ -1376,7 +1464,7 @@ llvm::RegisterPass<LowerMaskedStorePass> lms("masked-store-lower",
false for memory that actually is stack allocated. The basic strategy
is to traverse through the operands and see if the pointer originally
comes from an AllocaInst.
*/
static bool
lIsStackVariablePointer(llvm::Value *lvalue) {
llvm::BitCastInst *bc = llvm::dyn_cast<llvm::BitCastInst>(lvalue);
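In sketch form, that traversal looks like this (lSketchComesFromAlloca is a hypothetical helper; the real function, whose first case is shown above, handles additional instruction kinds):
static bool
lSketchComesFromAlloca(llvm::Value *ptr) {
    if (llvm::isa<llvm::AllocaInst>(ptr))
        return true;  // found the stack allocation itself
    if (llvm::BitCastInst *bc = llvm::dyn_cast<llvm::BitCastInst>(ptr))
        return lSketchComesFromAlloca(bc->getOperand(0));
    if (llvm::GetElementPtrInst *gep = llvm::dyn_cast<llvm::GetElementPtrInst>(ptr))
        return lSketchComesFromAlloca(gep->getPointerOperand());
    return false;  // can't prove it; assume not stack memory
}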
@@ -1498,7 +1586,7 @@ CreateLowerMaskedStorePass() {
look for, including things that could be handled with a vector load +
shuffle or things that could be handled with hybrids of e.g. 2 4-wide
vector loads with AVX, etc.
*/
class GSImprovementsPass : public llvm::BasicBlockPass {
public:
static char ID;
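The decision structure can be sketched in terms of the predicates defined later in this file (lSketchClassifyGather is a hypothetical wrapper; the real pass also rewrites the IR for each case it recognizes):
static const char *
lSketchClassifyGather(llvm::Value *offsets[ISPC_MAX_NVEC], int width) {
    if (lVectorValuesAllEqual(offsets, width))
        return "scalar load + broadcast";  // every lane reads one address
    std::set<llvm::PHINode *> seenPhis;
    if (lVectorIsLinear(offsets, width, 1, seenPhis))
        return "single vector load";       // consecutive addresses
    return "general gather";               // no improvement found
}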
@@ -1557,7 +1645,7 @@ static void lPrintVector(const char *info, llvm::Value *elements[ISPC_MAX_NVEC])
this length as well.)
@returns True if the vector was successfully scalarized and
the values in offsets[] are valid; false otherwise
*/
static bool
lScalarizeVector(llvm::Value *vec, llvm::Value **scalarizedVector,
int vectorLength, std::map<llvm::PHINode *, llvm::Value **> &phiMap) {
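One case a scalarizer like this has to handle can be sketched directly (lSketchScalarizeInsertElts is a hypothetical helper): a vector assembled lane by lane through insertelement instructions with constant indices.
static bool
lSketchScalarizeInsertElts(llvm::Value *vec, llvm::Value **scalarized, int width) {
    for (int i = 0; i < width; ++i)
        scalarized[i] = NULL;
    llvm::InsertElementInst *ie = llvm::dyn_cast<llvm::InsertElementInst>(vec);
    while (ie != NULL) {
        // Operands are (vector, element, lane index); walking backwards,
        // the first write we see for a lane is the live one.
        llvm::ConstantInt *lane = llvm::dyn_cast<llvm::ConstantInt>(ie->getOperand(2));
        if (lane == NULL)
            return false;  // non-constant lane index; give up
        if (scalarized[lane->getZExtValue()] == NULL)
            scalarized[lane->getZExtValue()] = ie->getOperand(1);
        ie = llvm::dyn_cast<llvm::InsertElementInst>(ie->getOperand(0));
    }
    for (int i = 0; i < width; ++i)
        if (scalarized[i] == NULL)
            return false;  // some lane never received a value
    return true;
}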
@@ -1847,7 +1935,7 @@ lScalarizeVector(llvm::Value *vec, llvm::Value **scalarizedVector,
practice, but it'd be nice to make it a little more robust/general. In
general, though, a little something called the halting problem means we
won't get all of them.
*/
static bool
lValuesAreEqual(llvm::Value *v0, llvm::Value *v1,
std::vector<llvm::PHINode *> &seenPhi0,
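Its conservative core can be sketched without the phi-node handling (lSketchValuesEqual is a hypothetical helper; the seenPhi vectors in the real function exist to reason through loops):
static bool
lSketchValuesEqual(llvm::Value *v0, llvm::Value *v1) {
    if (v0 == v1)
        return true;  // literally the same SSA value
    llvm::ConstantInt *c0 = llvm::dyn_cast<llvm::ConstantInt>(v0);
    llvm::ConstantInt *c1 = llvm::dyn_cast<llvm::ConstantInt>(v1);
    if (c0 != NULL && c1 != NULL)
        return c0->getSExtValue() == c1->getSExtValue();
    return false;  // can't prove equality, so report "unequal"
}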
@@ -1911,7 +1999,7 @@ lValuesAreEqual(llvm::Value *v0, llvm::Value *v1,
/** Tests to see if all of the llvm::Values in the array are equal. Like
lValuesAreEqual, this is a conservative test and may return false for
arrays where the values are actually all equal.
*/
static bool
lVectorValuesAllEqual(llvm::Value **v, int vectorLength) {
std::vector<llvm::PHINode *> seenPhi0, seenPhi1;
@@ -1926,7 +2014,7 @@ lVectorValuesAllEqual(llvm::Value **v, int vectorLength) {
linear sequence of compile-time constant integers starting from an
arbitrary value but then having a step of value "stride" between
elements.
*/
static bool
lVectorIsLinearConstantInts(llvm::Value **v, int vectorLength, int stride) {
llvm::ConstantInt *prev = llvm::dyn_cast<llvm::ConstantInt>(v[0]);
@@ -1962,7 +2050,7 @@ lVectorIsLinearConstantInts(llvm::Value **v, int vectorLength, int stride) {
subtract the constants [v[0], v[0]+stride, v[0]+2*stride, ...] from the
given values, throw the LLVM optimizer at those, and then see if we get
back an array of all zeros?
*/
static bool
lVectorIsLinear(llvm::Value **v, int vectorLength, int stride,
std::set<llvm::PHINode *> &seenPhis) {

tests/global-array-1.ispc (new file, 15 lines)

@@ -0,0 +1,15 @@
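// Uniform global array, varying index (programIndex+b-5): a gather through a global.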
uniform float a[100];
export uniform int width() { return programCount; }
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
for (uniform int i = 0; i < programCount; ++i)
a[i] = i;
RET[programIndex] = a[programIndex+b-5];
}
export void result(uniform float RET[]) {
RET[programIndex] = programIndex;
}

tests/global-array-3.ispc (new file, 15 lines)

@@ -0,0 +1,15 @@
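// Global array of varying floats, indexed with a uniform value.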
float a[100];
export uniform int width() { return programCount; }
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
for (uniform int i = 0; i < programCount; ++i)
a[i] = i*programCount+programIndex;
RET[programIndex] = a[b-5];
}
export void result(uniform float RET[]) {
RET[programIndex] = programIndex;
}

tests/global-array-4.ispc (new file, 15 lines)

@@ -0,0 +1,15 @@
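// Global array of varying floats, read back with a varying index.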
float a[100];
export uniform int width() { return programCount; }
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
for (uniform int i = 0; i < programCount; ++i)
a[i] = i*programCount+programIndex;
RET[programIndex] = a[programIndex+b-5];
}
export void result(uniform float RET[]) {
RET[programIndex] = (programCount+1) * programIndex;
}

tests/global-array.ispc (new file, 15 lines)

@@ -0,0 +1,15 @@
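// Baseline: gather from a uniform global array, indexed by programIndex.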
uniform float a[100];
export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
for (uniform int i = 0; i < programCount; ++i)
a[i] = i;
RET[programIndex] = a[programIndex];
}
export void result(uniform float RET[]) {
RET[programIndex] = programIndex;
}