Merge pull request #467 from dbabokin/broadcast

Broadcast implementation as InsertElement+Shuffle and related improvements
2013-04-11 13:42:56 -07:00
parent 603abf70dc 7371d82bdf
commit 78a840f48d
8 changed files with 301 additions and 125 deletions
--- a/cbackend.cpp
+++ b/cbackend.cpp
@@ -4395,16 +4395,21 @@ public:
    static char ID;
    llvm::Module *module;
-    int vectorWidth;
+    unsigned int vectorWidth;
 private:
    unsigned int ChainLength(llvm::InsertElementInst *inst) const;
    llvm::Value *getInsertChainSmearValue(llvm::Instruction* inst) const;
    llvm::Value *getShuffleSmearValue(llvm::Instruction* inst) const;
 };
 char SmearCleanupPass::ID = 0;
-static int
+unsigned int
-lChainLength(llvm::InsertElementInst *inst) {
+SmearCleanupPass::ChainLength(llvm::InsertElementInst *inst) const {
-    int length = 0;
+    unsigned int length = 0;
    while (inst != NULL) {
        ++length;
        inst = llvm::dyn_cast<llvm::InsertElementInst>(inst->getOperand(0));
@@ -4413,45 +4418,105 @@ lChainLength(llvm::InsertElementInst *inst) {
 }
 /** Detects a broadcast written as a chain of insertelement instructions.

     @param inst  Candidate instruction; expected to be the last
                  insertelement of the chain.
     @return The scalar that is inserted by every link of the chain, or
             NULL if inst is not an insertelement, the chain does not have
             exactly vectorWidth links, the vector has a single element,
             the links insert different values, or the insertion indices
             are not distinct constants in the range [0, vectorWidth).

     This way of doing broadcast is obsolete and should be probably removed
     some day.
  */
 llvm::Value *
 SmearCleanupPass::getInsertChainSmearValue(llvm::Instruction* inst) const {
    llvm::InsertElementInst *insertInst =
        llvm::dyn_cast<llvm::InsertElementInst>(inst);
    if (!insertInst) {
        return NULL;
    }

    // We consider only chains of vectorWidth length.
    if (ChainLength(insertInst) != vectorWidth) {
        return NULL;
    }

    // FIXME: we only want to do this to vectors with width equal to
    // the target vector width.  But we can't easily get that here, so
    // for now we at least avoid one case where we definitely don't
    // want to do this.
    llvm::VectorType *vt = llvm::dyn_cast<llvm::VectorType>(insertInst->getType());
    if (vt == NULL || vt->getNumElements() == 1) {
        return NULL;
    }

    // Written lanes are tracked in a 64-bit mask, so wider chains are
    // conservatively left alone.
    if (vectorWidth > 64) {
        return NULL;
    }

    unsigned long long lanesWritten = 0;
    llvm::Value *smearValue = NULL;
    while (insertInst != NULL) {
        // operand 2 is the lane index; it has to be a compile-time constant
        // for us to know which lane is being written.
        llvm::ConstantInt *laneIndex =
            llvm::dyn_cast<llvm::ConstantInt>(insertInst->getOperand(2));
        if (laneIndex == NULL) {
            return NULL;
        }
        // getLimitedValue() clamps to vectorWidth, so any out-of-range index
        // shows up as exactly vectorWidth here.
        unsigned long long lane = laneIndex->getLimitedValue(vectorWidth);
        if (lane >= vectorWidth) {
            return NULL;
        }
        // The chain has vectorWidth links, so requiring distinct lanes
        // means every lane in [0, vectorWidth) is written exactly once.
        unsigned long long laneBit = 1ULL << lane;
        if (lanesWritten & laneBit) {
            return NULL;
        }
        lanesWritten |= laneBit;

        // operand 1 is inserted value
        llvm::Value *insertValue = insertInst->getOperand(1);
        if (smearValue == NULL) {
            smearValue = insertValue;
        }
        else if (smearValue != insertValue) {
            return NULL;
        }

        // operand 0 is a vector to insert into.
        insertInst =
            llvm::dyn_cast<llvm::InsertElementInst>(insertInst->getOperand(0));
    }
    assert(smearValue != NULL);

    return smearValue;
 }
 /** Detects a broadcast written as insertelement + shufflevector.

     @param inst  Candidate instruction; expected to be the shufflevector.
     @return The scalar inserted into element 0 of the shuffle's first
             operand, or NULL if inst is not a shufflevector whose mask is
             an all-zero constant vector of vectorWidth elements and whose
             first operand is an insertelement at constant index zero.
  */
 llvm::Value *
 SmearCleanupPass::getShuffleSmearValue(llvm::Instruction* inst) const {
    llvm::ShuffleVectorInst *shuffleInst =
        llvm::dyn_cast<llvm::ShuffleVectorInst>(inst);
    if (!shuffleInst) {
        return NULL;
    }

    // Check that the shuffle is a broadcast of the first element of the first vector,
    // i.e. mask vector is all-zeros vector of expected size.
    llvm::Constant *mask =
        llvm::dyn_cast<llvm::Constant>(shuffleInst->getOperand(2));
    if (mask == NULL || !mask->isNullValue()) {
        return NULL;
    }
    // Check the cast result before using it rather than dereferencing a
    // dyn_cast directly.
    llvm::VectorType *maskType =
        llvm::dyn_cast<llvm::VectorType>(mask->getType());
    if (maskType == NULL || maskType->getNumElements() != vectorWidth) {
        return NULL;
    }

    // Check that it's an InsertElementInst that inserts a value to first element.
    llvm::InsertElementInst *insertInst =
        llvm::dyn_cast<llvm::InsertElementInst>(shuffleInst->getOperand(0));
    if (insertInst == NULL) {
        return NULL;
    }
    llvm::Constant *insertIndex =
        llvm::dyn_cast<llvm::Constant>(insertInst->getOperand(2));
    if (insertIndex == NULL || !insertIndex->isNullValue()) {
        return NULL;
    }

    // operand 1 is the inserted scalar, i.e. the value being broadcast.
    return insertInst->getOperand(1);
 }
 bool
 SmearCleanupPass::runOnBasicBlock(llvm::BasicBlock &bb) {
    bool modifiedAny = false;
 restart:
    for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) {
-        llvm::InsertElementInst *insertInst =
+        llvm::Value *smearValue = NULL;
-            llvm::dyn_cast<llvm::InsertElementInst>(&*iter);
+
-        if (insertInst == NULL)
+        if (!(smearValue = getInsertChainSmearValue(iter)) &&
            !(smearValue = getShuffleSmearValue(iter))) {
            continue;
        // Only do this on the last insert in a chain...
        if (lChainLength(insertInst) != vectorWidth)
            continue;
        // FIXME: we only want to do this to vectors with width equal to
        // the target vector width.  But we can't easily get that here, so
        // for now we at least avoid one case where we definitely don't
        // want to do this.
        llvm::VectorType *vt = llvm::dyn_cast<llvm::VectorType>(insertInst->getType());
        if (vt->getNumElements() == 1)
            continue;
        llvm::Value *toMatch = NULL;
        while (insertInst != NULL) {
            llvm::Value *insertValue = insertInst->getOperand(1);
            if (toMatch == NULL)
                toMatch = insertValue;
            else if (toMatch != insertValue)
                goto not_equal;
            insertInst =
                llvm::dyn_cast<llvm::InsertElementInst>(insertInst->getOperand(0));
        }
        assert(toMatch != NULL);
-        {
+        llvm::Type *smearType = smearValue->getType();
-        llvm::Type *matchType = toMatch->getType();
+        const char *smearFuncName = lGetTypedFunc("smear", smearType, vectorWidth);
        const char *smearFuncName = lGetTypedFunc("smear", matchType, vectorWidth);
        if (smearFuncName != NULL) {
            llvm::Function *smearFunc = module->getFunction(smearFuncName);
            if (smearFunc == NULL) {
@@ -4460,7 +4525,7 @@ SmearCleanupPass::runOnBasicBlock(llvm::BasicBlock &bb) {
                // parameter type.
                llvm::Constant *sf =
                    module->getOrInsertFunction(smearFuncName, iter->getType(),
-                                                matchType, NULL);
+                                                smearType, NULL);
                smearFunc = llvm::dyn_cast<llvm::Function>(sf);
                assert(smearFunc != NULL);
 #if defined(LLVM_3_1)
@@ -4473,10 +4538,10 @@ SmearCleanupPass::runOnBasicBlock(llvm::BasicBlock &bb) {
            }
            assert(smearFunc != NULL);
-            llvm::Value *args[1] = { toMatch };
+            llvm::Value *args[1] = { smearValue };
            llvm::ArrayRef<llvm::Value *> argArray(&args[0], &args[1]);
            llvm::Instruction *smearCall =
-                llvm::CallInst::Create(smearFunc, argArray, LLVMGetName(toMatch, "_smear"),
+                llvm::CallInst::Create(smearFunc, argArray, LLVMGetName(smearValue, "_smear"),
                                 (llvm::Instruction *)NULL);
            ReplaceInstWithInst(iter, smearCall);
@@ -4485,9 +4550,6 @@ SmearCleanupPass::runOnBasicBlock(llvm::BasicBlock &bb) {
            goto restart;
        }
    }
        not_equal:
            ;
    }
    return modifiedAny;
 }
--- a/ctx.cpp
+++ b/ctx.cpp
@@ -1379,6 +1379,19 @@ FunctionEmitContext::MasksAllEqual(llvm::Value *v1, llvm::Value *v2) {
 #endif
 }
 /** Builds the constant vector <0, 1, 2, ...> with one element per lane
     of the target vector width.

     @param is32bits  When true the elements are produced by LLVMInt32(),
                      otherwise by LLVMInt64().
     @return The resulting llvm::ConstantVector::get() value.
  */
 llvm::Value *
 FunctionEmitContext::ProgramIndexVector(bool is32bits) {
    llvm::SmallVector<llvm::Constant*, 16> lanes;
    for (int lane = 0; lane < g->target->getVectorWidth(); ++lane) {
        if (is32bits) {
            lanes.push_back(LLVMInt32(lane));
        }
        else {
            lanes.push_back(LLVMInt64(lane));
        }
    }

    return llvm::ConstantVector::get(lanes);
 }
 llvm::Value *
 FunctionEmitContext::GetStringPtr(const std::string &str) {
@@ -1729,26 +1742,31 @@ FunctionEmitContext::SmearUniform(llvm::Value *value, const char *name) {
    llvm::Value *ret = NULL;
    llvm::Type *eltType = value->getType();
    llvm::Type *vecType = NULL;
    llvm::PointerType *pt =
        llvm::dyn_cast<llvm::PointerType>(eltType);
    if (pt != NULL) {
        // Varying pointers are represented as vectors of i32/i64s
-        ret = llvm::UndefValue::get(LLVMTypes::VoidPointerVectorType);
+        vecType = LLVMTypes::VoidPointerVectorType;
        value = PtrToIntInst(value);
    }
-    else
+    else {
        // All other varying types are represented as vectors of the
        // underlying type.
-        ret = llvm::UndefValue::get(llvm::VectorType::get(eltType,
+        vecType = llvm::VectorType::get(eltType, g->target->getVectorWidth());
                                                          g->target->getVectorWidth()));
    for (int i = 0; i < g->target->getVectorWidth(); ++i) {
        llvm::Twine n = llvm::Twine("smear.") + llvm::Twine(name ? name : "") +
            llvm::Twine(i);
        ret = InsertInst(ret, value, i, n.str().c_str());
    }
    // Check for a constant case.
    if (llvm::Constant *const_val = llvm::dyn_cast<llvm::Constant>(value)) {
        ret = llvm::ConstantVector::getSplat(
            g->target->getVectorWidth(),
            const_val);
        return ret;
    }
    ret = BroadcastValue(value, vecType, name);
    return ret;
 }
@@ -3131,6 +3149,66 @@ FunctionEmitContext::InsertInst(llvm::Value *v, llvm::Value *eltVal, int elt,
 }
 /** Emits an llvm::ShuffleVectorInst into the current basic block.

     @param v1    First input vector.
     @param v2    Second input vector.
     @param mask  Shuffle mask.
     @param name  Name for the new instruction; when NULL a name is derived
                  from v1 with a "_shuffle" suffix.
     @return The new instruction, or NULL if any of v1, v2 or mask is NULL
             (which is only expected after an error has been reported).
  */
 llvm::Value *
 FunctionEmitContext::ShuffleInst(llvm::Value *v1, llvm::Value *v2, llvm::Value *mask,
                                 const char *name) {
    if (v1 == NULL || v2 == NULL || mask == NULL) {
        AssertPos(currentPos, m->errorCount > 0);
        return NULL;
    }

    if (name == NULL) {
        // The suffix is a fixed string, so there is no need to format it
        // into a temporary buffer first.
        name = LLVMGetName(v1, "_shuffle");
    }

    llvm::Instruction *ii = new llvm::ShuffleVectorInst(v1, v2, mask, name, bblock);

    AddDebugPos(ii);
    return ii;
 }
 /** Emits the insertelement + shufflevector sequence that broadcasts a
     scalar to every element of a vector.

     @param v        Scalar to broadcast; its type must equal the element
                     type of vecType.
     @param vecType  Vector type of the result.
     @param name     Name for the resulting shuffle; when NULL a name is
                     derived from v with a "_broadcast" suffix.  The
                     intermediate insertelement gets the same name with
                     "_init" appended.
     @return The shuffle produced by ShuffleInst(), or NULL if v or vecType
             is NULL (which is only expected after an error has been
             reported).
  */
 llvm::Value *
 FunctionEmitContext::BroadcastValue(llvm::Value *v, llvm::Type* vecType,
                                     const char *name) {
    if (v == NULL || vecType == NULL) {
        AssertPos(currentPos, m->errorCount > 0);
        return NULL;
    }

    llvm::VectorType *ty = llvm::dyn_cast<llvm::VectorType>(vecType);
    Assert(ty && ty->getVectorElementType() == v->getType());

    if (name == NULL) {
        // The suffix is a fixed string, so there is no need to format it
        // into a temporary buffer first.
        name = LLVMGetName(v, "_broadcast");
    }

    // Generate the following sequence:
    //   %name_init.i = insertelement <4 x i32> undef, i32 %val, i32 0
    //   %name.i = shufflevector <4 x i32> %name_init.i, <4 x i32> undef,
    //                                              <4 x i32> zeroinitializer

    // The same undef vector serves both as the base of the insert and as
    // the unused second shuffle operand.
    llvm::Value *undefVec = llvm::UndefValue::get(vecType);

    // InsertElement
    // Build the name as a std::string: a Twine must not be kept in a named
    // variable, since it may refer to temporaries that are already gone.
    const std::string initName = std::string(name) + "_init";
    llvm::Value *insert = InsertInst(undefVec, v, 0, initName.c_str());

    // ShuffleVector
    // An all-zero i32 mask selects element 0 of the first operand for
    // every result element.
    llvm::Constant *zeroVec = llvm::ConstantVector::getSplat(
        ty->getNumElements(),
        llvm::Constant::getNullValue(llvm::Type::getInt32Ty(*g->ctx)));

    return ShuffleInst(insert, undefVec, zeroVec, name);
 }
 llvm::PHINode *
 FunctionEmitContext::PhiNode(llvm::Type *type, int count,
                             const char *name) {
@@ -3509,12 +3587,9 @@ FunctionEmitContext::addVaryingOffsetsIfNeeded(llvm::Value *ptr,
    unifSize = SmearUniform(unifSize);
    // Compute offset = <0, 1, .. > * unifSize
-    llvm::Value *varyingOffsets = llvm::UndefValue::get(unifSize->getType());
+    bool is32bits = g->target->is32Bit() || g->opt.force32BitAddressing;
-    for (int i = 0; i < g->target->getVectorWidth(); ++i) {
+    llvm::Value *varyingOffsets = ProgramIndexVector(is32bits);
-        llvm::Value *iValue = (g->target->is32Bit() || g->opt.force32BitAddressing) ?
+
            LLVMInt32(i) : LLVMInt64(i);
        varyingOffsets = InsertInst(varyingOffsets, iValue, i, "varying_delta");
    }
    llvm::Value *offset = BinaryOperator(llvm::Instruction::Mul, unifSize,
                                         varyingOffsets);
--- a/ctx.h
+++ b/ctx.h
@@ -1,5 +1,5 @@
 /*
-  Copyright (c) 2010-2012, Intel Corporation
+  Copyright (c) 2010-2013, Intel Corporation
  All rights reserved.
  Redistribution and use in source and binary forms, with or without
@@ -295,6 +295,10 @@ public:
        that indicates whether the two masks are equal. */
    llvm::Value *MasksAllEqual(llvm::Value *mask1, llvm::Value *mask2);
    /** Generate ConstantVector, which contains ProgramIndex, i.e.
        < i32 0, i32 1, i32 2, i32 3> */
    llvm::Value *ProgramIndexVector(bool is32bits = true);
    /** Given a string, create an anonymous global variable to hold its
        value and return the pointer to the string. */
    llvm::Value *GetStringPtr(const std::string &str);
@@ -500,6 +504,16 @@ public:
    llvm::Value *InsertInst(llvm::Value *v, llvm::Value *eltVal, int elt,
                            const char *name = NULL);
    /** This convenience method maps to an llvm::ShuffleVectorInst. */
    llvm::Value *ShuffleInst(llvm::Value *v1, llvm::Value *v2, llvm::Value *mask,
                            const char *name = NULL);
    /** This convenience method generates the broadcast pattern. It takes a
        value and a vector type. The type of the value must match the
        element type of the vector. */
    llvm::Value *BroadcastValue(llvm::Value *v, llvm::Type *vecType,
                                const char *name = NULL);
    llvm::PHINode *PhiNode(llvm::Type *type, int count,
                           const char *name = NULL);
    llvm::Instruction *SelectInst(llvm::Value *test, llvm::Value *val0,
--- a/expr.cpp
+++ b/expr.cpp
@@ -3905,11 +3905,7 @@ lAddVaryingOffsetsIfNeeded(FunctionEmitContext *ctx, llvm::Value *ptr,
        return ptr;
    // Onward: compute the per lane offsets.
-    llvm::Value *varyingOffsets =
+    llvm::Value *varyingOffsets = ctx->ProgramIndexVector();
        llvm::UndefValue::get(LLVMTypes::Int32VectorType);
    for (int i = 0; i < g->target->getVectorWidth(); ++i)
        varyingOffsets = ctx->InsertInst(varyingOffsets, LLVMInt32(i), i,
                                         "varying_delta");
    // And finally add the per-lane offsets.  Note that we lie to the GEP
    // call and tell it that the pointers are to uniform elements and not
@@ -6768,9 +6764,8 @@ TypeCastExpr::GetValue(FunctionEmitContext *ctx) const {
        if (!conv)
            return NULL;
-        llvm::Value *cast = llvm::UndefValue::get(toType->LLVMType(g->ctx));
+        llvm::Value *cast = ctx->BroadcastValue(conv, toType->LLVMType(g->ctx));
-        for (int i = 0; i < toVector->GetElementCount(); ++i)
+
            cast = ctx->InsertInst(cast, conv, i);
        return cast;
    }
    else if (toPointerType != NULL) {
--- a/llvmutil.cpp
+++ b/llvmutil.cpp
@@ -601,11 +601,15 @@ lGetIntValue(llvm::Value *offset) {
 void
-LLVMFlattenInsertChain(llvm::InsertElementInst *ie, int vectorWidth,
+LLVMFlattenInsertChain(llvm::Value *inst, int vectorWidth,
                       llvm::Value **elements) {
-    for (int i = 0; i < vectorWidth; ++i)
+    for (int i = 0; i < vectorWidth; ++i) {
        elements[i] = NULL;
    }
    // Catch a pattern of InsertElement chain.
    if (llvm::InsertElementInst *ie =
            llvm::dyn_cast<llvm::InsertElementInst>(inst)) {
        while (ie != NULL) {
            int64_t iOffset = lGetIntValue(ie->getOperand(2));
            Assert(iOffset >= 0 && iOffset < vectorWidth);
@@ -618,8 +622,9 @@ LLVMFlattenInsertChain(llvm::InsertElementInst *ie, int vectorWidth,
            llvm::Value *insertBase = ie->getOperand(0);
            ie = llvm::dyn_cast<llvm::InsertElementInst>(insertBase);
            if (ie == NULL) {
-            if (llvm::isa<llvm::UndefValue>(insertBase))
+                if (llvm::isa<llvm::UndefValue>(insertBase)) {
                    return;
                }
                // Get the value out of a constant vector if that's what we
                // have
@@ -641,6 +646,31 @@ LLVMFlattenInsertChain(llvm::InsertElementInst *ie, int vectorWidth,
                elements[iOffset] = cv->getOperand((int32_t)iOffset);
            }
        }
    }
    // Catch a pattern of broadcast implemented as InsertElement + Shuffle:
    //   %broadcast_init.0 = insertelement <4 x i32> undef, i32 %val, i32 0
    //   %broadcast.1 = shufflevector <4 x i32> %smear.0, <4 x i32> undef,
    //                                              <4 x i32> zeroinitializer
    else if (llvm::ShuffleVectorInst *shuf =
        llvm::dyn_cast<llvm::ShuffleVectorInst>(inst)) {
        llvm::Value *indices = shuf->getOperand(2);
        if (llvm::isa<llvm::ConstantAggregateZero>(indices)) {
            llvm::Value *op = shuf->getOperand(0);
            llvm::InsertElementInst *ie = llvm::dyn_cast<llvm::InsertElementInst>(op);
            if (ie != NULL &&
                llvm::isa<llvm::UndefValue>(ie->getOperand(0))) {
                llvm::ConstantInt *ci =
                    llvm::dyn_cast<llvm::ConstantInt>(ie->getOperand(2));
                if (ci->isZero()) {
                    for (int i = 0; i < vectorWidth; ++i) {
                        elements[i] = ie->getOperand(1);
                    }
                    return;
                }
            }
        }
    }
 }
@@ -694,10 +724,10 @@ lIsExactMultiple(llvm::Value *val, int baseValue, int vectorLength,
    else
        Assert(LLVMVectorValuesAllEqual(val));
-    llvm::InsertElementInst *ie = llvm::dyn_cast<llvm::InsertElementInst>(val);
+    if (llvm::isa<llvm::InsertElementInst>(val) ||
-    if (ie != NULL) {
+        llvm::isa<llvm::ShuffleVectorInst>(val)) {
        llvm::Value *elts[ISPC_MAX_NVEC];
-        LLVMFlattenInsertChain(ie, g->target->getVectorWidth(), elts);
+        LLVMFlattenInsertChain(val, g->target->getVectorWidth(), elts);
        // We just need to check the scalar first value, since we know that
        // all elements are equal
        return lIsExactMultiple(elts[0], baseValue, vectorLength,
@@ -1440,10 +1470,10 @@ lExtractFirstVectorElement(llvm::Value *v,
    // If we have a chain of insertelement instructions, then we can just
    // flatten them out and grab the value for the first one.
-    llvm::InsertElementInst *ie = llvm::dyn_cast<llvm::InsertElementInst>(v);
+    if (llvm::isa<llvm::InsertElementInst>(v) ||
-    if (ie != NULL) {
+        llvm::isa<llvm::ShuffleVectorInst>(v)) {
        llvm::Value *elements[ISPC_MAX_NVEC];
-        LLVMFlattenInsertChain(ie, vt->getNumElements(), elements);
+        LLVMFlattenInsertChain(v, vt->getNumElements(), elements);
        return elements[0];
    }
--- a/llvmutil.h
+++ b/llvmutil.h
@@ -264,8 +264,13 @@ extern bool LLVMExtractVectorInts(llvm::Value *v, int64_t ret[], int *nElts);
    constant vector.  For anything more complex (e.g. some other arbitrary
    value), it doesn't try to extract element values into the returned
    array.
    This also handles the common broadcast pattern:
       %broadcast_init.0 = insertelement <4 x i32> undef, i32 %val, i32 0
       %broadcast.1 = shufflevector <4 x i32> %broadcast_init.0, <4 x i32> undef,
                                                  <4 x i32> zeroinitializer
 */
-extern void LLVMFlattenInsertChain(llvm::InsertElementInst *ie, int vectorWidth,
+extern void LLVMFlattenInsertChain(llvm::Value *inst, int vectorWidth,
                                   llvm::Value **elements);
 /** This is a utility routine for debugging that dumps out the given LLVM
--- a/opt.cpp
+++ b/opt.cpp
@@ -1058,10 +1058,10 @@ lCheckForActualPointer(llvm::Value *v) {
 */
 static llvm::Value *
 lGetBasePointer(llvm::Value *v) {
-    llvm::InsertElementInst *ie = llvm::dyn_cast<llvm::InsertElementInst>(v);
+    if (llvm::isa<llvm::InsertElementInst>(v) ||
-    if (ie != NULL) {
+        llvm::isa<llvm::ShuffleVectorInst>(v)) {
        llvm::Value *elements[ISPC_MAX_NVEC];
-        LLVMFlattenInsertChain(ie, g->target->getVectorWidth(), elements);
+        LLVMFlattenInsertChain(v, g->target->getVectorWidth(), elements);
        // Make sure none of the elements is undefined.
        // TODO: it's probably ok to allow undefined elements and return
@@ -1080,9 +1080,12 @@ lGetBasePointer(llvm::Value *v) {
    }
    // This case comes up with global/static arrays
-    llvm::ConstantVector *cv = llvm::dyn_cast<llvm::ConstantVector>(v);
+    if (llvm::ConstantVector *cv = llvm::dyn_cast<llvm::ConstantVector>(v)) {
    if (cv != NULL)
        return lCheckForActualPointer(cv->getSplatValue());
    }
    else if (llvm::ConstantDataVector *cdv = llvm::dyn_cast<llvm::ConstantDataVector>(v)) {
        return lCheckForActualPointer(cdv->getSplatValue());
    }
    return NULL;
 }
--- a/stmt.cpp
+++ b/stmt.cpp
@@ -1272,11 +1272,8 @@ lUpdateVaryingCounter(int dim, int nDims, FunctionEmitContext *ctx,
                      const std::vector<int> &spans) {
    // Smear the uniform counter value out to be varying
    llvm::Value *counter = ctx->LoadInst(uniformCounterPtr);
-    llvm::Value *smearCounter =
+    llvm::Value *smearCounter = ctx->BroadcastValue(
-        llvm::UndefValue::get(LLVMTypes::Int32VectorType);
+        counter, LLVMTypes::Int32VectorType, "smear_counter");
    for (int i = 0; i < g->target->getVectorWidth(); ++i)
        smearCounter =
            ctx->InsertInst(smearCounter, counter, i, "smear_counter");
    // Figure out the offsets; this is a little bit tricky.  As an example,
    // consider a 2D tiled foreach loop, where we're running 8-wide and
@@ -1517,9 +1514,9 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const {
            lUpdateVaryingCounter(i, nDims, ctx, uniformCounterPtrs[i],
                                  dimVariables[i]->storagePtr, span);
-        llvm::Value *smearEnd = llvm::UndefValue::get(LLVMTypes::Int32VectorType);
+        llvm::Value *smearEnd = ctx->BroadcastValue(
-        for (int j = 0; j < g->target->getVectorWidth(); ++j)
+            endVals[i], LLVMTypes::Int32VectorType, "smear_end");
-            smearEnd = ctx->InsertInst(smearEnd, endVals[i], j, "smear_end");
+
        // Do a vector compare of its value to the end value to generate a
        // mask for this last bit of work.
        llvm::Value *emask =
@@ -1662,9 +1659,9 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const {
    ctx->SetCurrentBasicBlock(bbPartial); {
        llvm::Value *varyingCounter =
            ctx->LoadInst(dimVariables[nDims-1]->storagePtr);
-        llvm::Value *smearEnd = llvm::UndefValue::get(LLVMTypes::Int32VectorType);
+        llvm::Value *smearEnd = ctx->BroadcastValue(
-        for (int j = 0; j < g->target->getVectorWidth(); ++j)
+            endVals[nDims-1], LLVMTypes::Int32VectorType, "smear_end");
-            smearEnd = ctx->InsertInst(smearEnd, endVals[nDims-1], j, "smear_end");
+
        llvm::Value *emask =
            ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT,
                         varyingCounter, smearEnd);
@@ -1758,9 +1755,8 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const {
        llvm::Value *varyingCounter =
            lUpdateVaryingCounter(nDims-1, nDims, ctx, uniformCounterPtrs[nDims-1],
                                  dimVariables[nDims-1]->storagePtr, span);
-        llvm::Value *smearEnd = llvm::UndefValue::get(LLVMTypes::Int32VectorType);
+        llvm::Value *smearEnd = ctx->BroadcastValue(
-        for (int j = 0; j < g->target->getVectorWidth(); ++j)
+            endVals[nDims-1], LLVMTypes::Int32VectorType, "smear_end");
            smearEnd = ctx->InsertInst(smearEnd, endVals[nDims-1], j, "smear_end");
        llvm::Value *emask =
            ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT,
                         varyingCounter, smearEnd);
@@ -1993,11 +1989,7 @@ ForeachActiveStmt::EmitCode(FunctionEmitContext *ctx) const {
        // math...)
        // Get the "program index" vector value
-        llvm::Value *programIndex =
+        llvm::Value *programIndex = ctx->ProgramIndexVector();
            llvm::UndefValue::get(LLVMTypes::Int32VectorType);
        for (int i = 0; i < g->target->getVectorWidth(); ++i)
            programIndex = ctx->InsertInst(programIndex, LLVMInt32(i), i,
                                           "prog_index");
        // And smear the current lane out to a vector
        llvm::Value *firstSet32 =