Add foreach and foreach_tiled looping constructs

These make it easier to iterate over an arbitrary number of data elements; specifically, they automatically handle the "ragged extra bits" that come up when the number of elements to be processed isn't evenly divisible by programCount. TODO: documentation
2011-11-30 13:17:31 -08:00
parent b48775a549
commit 8bc7367109
32 changed files with 1120 additions and 78 deletions
--- a/ctx.cpp
+++ b/ctx.cpp
@@ -68,12 +68,19 @@ struct CFInfo {
                           llvm::Value *savedContinueLanesPtr,
                           llvm::Value *savedMask, llvm::Value *savedLoopMask);

+    static CFInfo *GetForeach(llvm::BasicBlock *breakTarget,
+                              llvm::BasicBlock *continueTarget, 
+                              llvm::Value *savedBreakLanesPtr,
+                              llvm::Value *savedContinueLanesPtr,
+                              llvm::Value *savedMask, llvm::Value *savedLoopMask);
+
    bool IsIf() { return type == If; }
    bool IsLoop() { return type == Loop; }
+    bool IsForeach() { return type == Foreach; }
    bool IsVaryingType() { return !isUniform; }
    bool IsUniform() { return isUniform; }

-    enum CFType { If, Loop };
+    enum CFType { If, Loop, Foreach };
    CFType type;
    bool isUniform;
    llvm::BasicBlock *savedBreakTarget, *savedContinueTarget;
@@ -102,6 +109,19 @@ private:
        savedMask = sm;
        savedLoopMask = lm;
    }
+    CFInfo(CFType t, llvm::BasicBlock *bt, llvm::BasicBlock *ct,
+           llvm::Value *sb, llvm::Value *sc, llvm::Value *sm,
+           llvm::Value *lm) {  // records the enclosing loop state for a foreach
+        assert(t == Foreach);  // this constructor only builds foreach records
+        type = t;
+        isUniform = false;  // so IsVaryingType() is true for every foreach record
+        savedBreakTarget = bt;
+        savedContinueTarget = ct;
+        savedBreakLanesPtr = sb;
+        savedContinueLanesPtr = sc;
+        savedMask = sm;
+        savedLoopMask = lm;
+    }
 };


@@ -122,6 +142,18 @@ CFInfo::GetLoop(bool isUniform, llvm::BasicBlock *breakTarget,
                      savedMask, savedLoopMask);
 }

+
+CFInfo *
+CFInfo::GetForeach(llvm::BasicBlock *breakTarget,
+                   llvm::BasicBlock *continueTarget, 
+                   llvm::Value *savedBreakLanesPtr,
+                   llvm::Value *savedContinueLanesPtr,
+                   llvm::Value *savedMask, llvm::Value *savedForeachMask) {  // savedForeachMask is stored as savedLoopMask
+    return new CFInfo(Foreach, breakTarget, continueTarget,  // heap-allocated Foreach record
+                      savedBreakLanesPtr, savedContinueLanesPtr,
+                      savedMask, savedForeachMask);
+}
+
 ///////////////////////////////////////////////////////////////////////////

 FunctionEmitContext::FunctionEmitContext(Function *func, Symbol *funSym,
@@ -422,7 +454,7 @@ FunctionEmitContext::StartLoop(llvm::BasicBlock *bt, llvm::BasicBlock *ct,

 void
 FunctionEmitContext::EndLoop() {
-    assert(controlFlowInfo.size() && !controlFlowInfo.back()->IsIf());
+    assert(controlFlowInfo.size() && controlFlowInfo.back()->IsLoop());
    CFInfo *ci = controlFlowInfo.back();
    controlFlowInfo.pop_back();

@@ -444,6 +476,36 @@ FunctionEmitContext::EndLoop() {
 }


+void
+FunctionEmitContext::StartForeach() {
+    // Push a Foreach record holding the current break/continue targets, lane pointers, internal
+    // mask and loop mask, then NULL out the targets, lane pointers and loop mask; EndForeach() restores them.
+    llvm::Value *oldMask = GetInternalMask();
+    controlFlowInfo.push_back(CFInfo::GetForeach(breakTarget, continueTarget, breakLanesPtr,
+                                                 continueLanesPtr, oldMask, loopMask));
+    continueLanesPtr = breakLanesPtr = NULL;
+    breakTarget = NULL;
+    continueTarget = NULL;
+    loopMask = NULL;
+}
+
+
+void
+FunctionEmitContext::EndForeach() {
+    assert(controlFlowInfo.size() && controlFlowInfo.back()->IsForeach());
+    CFInfo *ci = controlFlowInfo.back();
+    controlFlowInfo.pop_back();
+
+    // Restore the break/continue targets, lane pointers and loop mask that StartForeach() saved.
+    // NOTE(review): savedMask is not restored and ci is not freed in this function -- confirm both are intended.
+    breakTarget = ci->savedBreakTarget;
+    continueTarget = ci->savedContinueTarget;
+    breakLanesPtr = ci->savedBreakLanesPtr;
+    continueLanesPtr = ci->savedContinueLanesPtr;
+    loopMask = ci->savedLoopMask;
+}
+
+
 void
 FunctionEmitContext::restoreMaskGivenReturns(llvm::Value *oldMask) {
    if (!bblock)
@@ -638,6 +700,15 @@ FunctionEmitContext::VaryingCFDepth() const {
 }


+bool
+FunctionEmitContext::InForeachLoop() const {  // true if any record on controlFlowInfo is a Foreach record, at any depth
+    for (unsigned int i = 0; i < controlFlowInfo.size(); ++i)
+        if (controlFlowInfo[i]->IsForeach())
+            return true;
+    return false;
+}
+
+
 void
 FunctionEmitContext::CurrentLanesReturned(Expr *expr, bool doCoherenceCheck) {
    const Type *returnType = function->GetReturnType();
--- a/ctx.h
+++ b/ctx.h
@@ -159,6 +159,10 @@ public:
        finished. */
    void EndLoop();

+    /** State save/restore pair used when emitting code for a 'foreach' loop. */
+    void StartForeach();  // saves the break/continue state and loop mask, then clears them
+    void EndForeach();    // restores what the matching StartForeach() saved
+
    /** Emit code for a 'break' statement in a loop.  If doCoherenceCheck
        is true, then if we're in a 'varying' loop, code will be emitted to
        see if all of the lanes want to break, in which case a jump to the
@@ -183,6 +187,8 @@ public:
        flow */
    int VaryingCFDepth() const;

+    bool InForeachLoop() const;
+
    /** Called to generate code for 'return' statement; value is the
        expression in the return statement (if non-NULL), and
        doCoherenceCheck indicates whether instructions should be generated
--- a/examples/mandelbrot/mandelbrot.ispc
+++ b/examples/mandelbrot/mandelbrot.ispc
@@ -60,16 +60,16 @@ export void mandelbrot_ispc(uniform float x0, uniform float y0,
        // Note that we'll be doing programCount computations in parallel,
        // so increment i by that much.  This assumes that width evenly
        // divides programCount.
-        for (uniform int i = 0; i < width; i += programCount) {
+        foreach (i = 0 ... width) {
            // Figure out the position on the complex plane to compute the
            // number of iterations at.  Note that the x values are
            // different across different program instances, since its
            // initializer incorporates the value of the programIndex
            // variable.
-            float x = x0 + (programIndex + i) * dx;
+            float x = x0 + i * dx;
            float y = y0 + j * dy;

-            int index = j * width + i + programIndex;
+            int index = j * width + i;
            output[index] = mandel(x, y, maxIterations);
        }
    }
--- a/examples/mandelbrot_tasks/mandelbrot.ispc
+++ b/examples/mandelbrot_tasks/mandelbrot.ispc
@@ -61,14 +61,12 @@ mandelbrot_scanlines(uniform int ybase, uniform int span,
    uniform int ystart = ybase + taskIndex * span;
    uniform int yend = ystart + span;

-    for (uniform int j = ystart; j < yend; ++j) {
-        for (uniform int i = 0; i < width; i += programCount) {
-            float x = x0 + (programIndex + i) * dx;
-            float y = y0 + j * dy;
+    foreach (yi = ystart ... yend, xi = 0 ... width) {
+        float x = x0 + xi * dx;
+        float y = y0 + yi * dy;

-            int index = j * width + i + programIndex;
-            output[index] = mandel(x, y, maxIterations);
-        }
+        int index = yi * width + xi;
+        output[index] = mandel(x, y, maxIterations);
    }
 }
                               
--- a/examples/options/options.ispc
+++ b/examples/options/options.ispc
@@ -59,15 +59,13 @@ export void
 black_scholes_ispc(uniform float Sa[], uniform float Xa[], uniform float Ta[],
                   uniform float ra[], uniform float va[], 
                   uniform float result[], uniform int count) {
-    for (uniform int i = 0; i < count; i += programCount) {
-        float S = Sa[i + programIndex], X = Xa[i + programIndex];
-        float T = Ta[i + programIndex], r = ra[i + programIndex];
-        float v = va[i + programIndex];
+    foreach (i = 0 ... count) {
+        float S = Sa[i], X = Xa[i], T = Ta[i], r = ra[i], v = va[i];

        float d1 = (log(S/X) + (r + v * v * .5f) * T) / (v * sqrt(T));
        float d2 = d1 - v * sqrt(T);

-        result[i + programIndex] = S * CND(d1) - X * exp(-r * T) * CND(d2);
+        result[i] = S * CND(d1) - X * exp(-r * T) * CND(d2);
    }
 }

@@ -78,10 +76,8 @@ binomial_put_ispc(uniform float Sa[], uniform float Xa[], uniform float Ta[],
                  uniform float result[], uniform int count) {
    float V[BINOMIAL_NUM];

-    for (uniform int i = 0; i < count; i += programCount) {
-        float S = Sa[i + programIndex], X = Xa[i + programIndex];
-        float T = Ta[i + programIndex], r = ra[i + programIndex];
-        float v = va[i + programIndex];
+    foreach (i = 0 ... count) {
+        float S = Sa[i], X = Xa[i], T = Ta[i], r = ra[i], v = va[i];

        float dt = T / BINOMIAL_NUM;
        float u = exp(v * sqrt(dt));
@@ -98,6 +94,6 @@ binomial_put_ispc(uniform float Sa[], uniform float Xa[], uniform float Ta[],
            for (uniform int k = 0; k < j; ++k)
                V[k] = ((1 - Pu) * V[k] + Pu * V[k + 1]) / disc;

-        result[i + programIndex] = V[0];
+        result[i] = V[0];
    }
 }
--- a/examples/rt/rt.cpp
+++ b/examples/rt/rt.cpp
@@ -199,10 +199,8 @@ int main(int argc, char *argv[]) {
    }
    fclose(f);

-    // round image resolution up to multiple of 16 to make things easy for
-    // the code that assigns pixels to ispc program instances
-    int height = (int(baseHeight * scale) + 0xf) & ~0xf;
-    int width = (int(baseWidth * scale) + 0xf) & ~0xf;
+    int height = int(baseHeight * scale);
+    int width = int(baseWidth * scale);

    // allocate images; one to hold hit object ids, one to hold depth to
    // the first interseciton
--- a/examples/rt/rt.ispc
+++ b/examples/rt/rt.ispc
@@ -244,34 +244,15 @@ static void raytrace_tile(uniform int x0, uniform int x1,
    uniform float widthScale = (float)(baseWidth) / (float)(width);
    uniform float heightScale = (float)(baseHeight) / (float)(height);

-    static const uniform float udx[16] = { 0, 1, 0, 1, 2, 3, 2, 3, 
-                                           0, 1, 0, 1, 2, 3, 2, 3 };
-    static const uniform float udy[16] = { 0, 0, 1, 1, 0, 0, 1, 1, 
-                                           2, 2, 3, 3, 2, 2, 3, 3 };
+    foreach_tiled (y = y0 ... y1, x = x0 ... x1) {
+        Ray ray;
+        generateRay(raster2camera, camera2world, x*widthScale,
+                    y*heightScale, ray);
+        BVHIntersect(nodes, triangles, ray);

-    // The outer loops are always over blocks of 4x4 pixels
-    for (uniform int y = y0; y < y1; y += 4) {
-        for (uniform int x = x0; x < x1; x += 4) {
-            // Now we have a block of 4x4=16 pixels to process; it will
-            // take 16/programCount iterations of this loop to process
-            // them.
-            for (uniform int o = 0; o < 16 / programCount; ++o) {
-                // Map program instances to samples in the udx/udy arrays
-                // to figure out which pixel each program instance is
-                // responsible for
-                const float dx = udx[o * programCount + programIndex];
-                const float dy = udy[o * programCount + programIndex];
-
-                Ray ray;
-                generateRay(raster2camera, camera2world, (x+dx)*widthScale,
-                            (y+dy)*heightScale, ray);
-                BVHIntersect(nodes, triangles, ray);
-
-                int offset = (y + (int)dy) * width + (x + (int)dx);
-                image[offset] = ray.maxt;
-                id[offset] = ray.hitId;
-            }
-        }
+        int offset = y * width + x;
+        image[offset] = ray.maxt;
+        id[offset] = ray.hitId;
    }
 }

--- a/examples/stencil/stencil.ispc
+++ b/examples/stencil/stencil.ispc
@@ -43,9 +43,8 @@ stencil_step(uniform int x0, uniform int x1,

    for (uniform int z = z0; z < z1; ++z) {
        for (uniform int y = y0; y < y1; ++y) {
-            // Assumes that (x1-x0) % programCount == 0
-            for (uniform int x = x0; x < x1; x += programCount) {
-                int index = (z * Nxy) + (y * Nx) + x + programIndex;
+            foreach (x = x0 ... x1) {
+                int index = (z * Nxy) + (y * Nx) + x;
 #define A_cur(x, y, z) Ain[index + (x) + ((y) * Nx) + ((z) * Nxy)]
 #define A_next(x, y, z) Aout[index + (x) + ((y) * Nx) + ((z) * Nxy)]
                float div = coef[0] * A_cur(0, 0, 0) +
--- a/examples/volume_rendering/volume.ispc
+++ b/examples/volume_rendering/volume.ispc
@@ -310,11 +310,7 @@ volume_tile(uniform int x0, uniform int y0, uniform int x1,
    // by 4.
    for (uniform int y = y0; y < y1; y += 4) {
        for (uniform int x = x0; x < x1; x += 4) {
-            // For each such tile, process programCount pixels at a time,
-            // until we've done all 16 of them.  Thus, we're also assuming
-            // that programCount <= 16 and that 16 is evenly dividible by
-            // programCount.
-            for (uniform int o = 0; o < 16; o += programCount) {
+            foreach (o = 0 ... 16) {
                // These two arrays encode the mapping from [0,15] to
                // offsets within the 4x4 pixel block so that we render
                // each pixel inside the block
@@ -324,8 +320,7 @@ volume_tile(uniform int x0, uniform int y0, uniform int x1,
                                                   2, 2, 3, 3, 2, 2, 3, 3 };

                // Figure out the pixel to render for this program instance
-                int xo = x + xoffsets[o + programIndex];
-                int yo = y + yoffsets[o + programIndex];
+                int xo = x + xoffsets[o], yo = y + yoffsets[o];

                // Use viewing parameters to compute the corresponding ray
                // for the pixel
--- a/lex.ll
+++ b/lex.ll
@@ -101,6 +101,8 @@ extern { return TOKEN_EXTERN; }
 false { return TOKEN_FALSE; }
 float { return TOKEN_FLOAT; }
 for { return TOKEN_FOR; }
+foreach { return TOKEN_FOREACH; }
+foreach_tiled { return TOKEN_FOREACH_TILED; }
 goto { return TOKEN_GOTO; }
 if { return TOKEN_IF; }
 inline { return TOKEN_INLINE; }
@@ -132,6 +134,7 @@ varying { return TOKEN_VARYING; }
 void { return TOKEN_VOID; }
 while { return TOKEN_WHILE; }
 \"C\" { return TOKEN_STRING_C_LITERAL; }
+\.\.\. { return TOKEN_DOTDOTDOT; }

 L?\"(\\.|[^\\"])*\" { lStringConst(yylval, yylloc); return TOKEN_STRING_LITERAL; }

--- a/parse.yy
+++ b/parse.yy
@@ -62,8 +62,12 @@
          (Current).name = NULL;                        /* new */ \
        }                                                              \
    while (0)
+
+struct ForeachDimension;
+
 }

+
 %{

 #include "ispc.h"
@@ -102,11 +106,11 @@ static void lFinalizeEnumeratorSymbols(std::vector<Symbol *> &enums,
                                       const EnumType *enumType);

 static const char *lBuiltinTokens[] = {
-    "bool", "break", "case", "cbreak", "ccontinue", "cdo", "cfor",
+    "assert", "bool", "break", "case", "cbreak", "ccontinue", "cdo", "cfor",
    "cif", "cwhile", "const", "continue", "creturn", "default", "do", "double", 
-    "else", "enum", "export", "extern", "false", "float", "for", "goto", "if",
-    "inline", "int", "int8", "int16", "int32", "int64", "launch", "NULL",
-    "print", "return", "signed", "sizeof",
+    "else", "enum", "export", "extern", "false", "float", "for", "foreach",
+    "foreach_tiled", "goto", "if", "inline", "int", "int8", "int16",
+    "int32", "int64", "launch", "NULL", "print", "return", "signed", "sizeof",
    "static", "struct", "switch", "sync", "task", "true", "typedef", "uniform",
    "unsigned", "varying", "void", "while", NULL 
 };
@@ -116,10 +120,26 @@ static const char *lParamListTokens[] = {
    "int8", "int16", "int32", "int64", "signed", "struct", "true",
    "uniform", "unsigned", "varying", "void", NULL 
 };
-    
+
+struct ForeachDimension {  // one "sym = begin ... end" clause of a foreach header, as built by the parser
+    ForeachDimension(Symbol *s = NULL, Expr *b = NULL, Expr *e = NULL) {
+        sym = s;
+        beginExpr = b;
+        endExpr = e;
+    }
+    Symbol *sym;  // the dimension's index variable
+    Expr *beginExpr, *endExpr;  // the expressions on either side of the "..."
+};
+
 %}

 %union {
+    int32_t int32Val;
+    double floatVal;
+    int64_t int64Val;
+    std::string *stringVal;
+    const char *constCharPtr;
+
    Expr *expr;
    ExprList *exprList;
    const Type *type;
@@ -136,13 +156,10 @@ static const char *lParamListTokens[] = {
    StructDeclaration *structDeclaration;
    std::vector<StructDeclaration *> *structDeclarationList;
    const EnumType *enumType;
-    Symbol *enumerator;
-    std::vector<Symbol *> *enumeratorList;
-    int32_t int32Val;
-    double floatVal;
-    int64_t int64Val;
-    std::string *stringVal;
-    const char *constCharPtr;
+    Symbol *symbol;
+    std::vector<Symbol *> *symbolList;
+    ForeachDimension *foreachDimension;
+    std::vector<ForeachDimension *> *foreachDimensionList;
 }


@@ -163,7 +180,7 @@ static const char *lParamListTokens[] = {
 %token TOKEN_ENUM TOKEN_STRUCT TOKEN_TRUE TOKEN_FALSE

 %token TOKEN_CASE TOKEN_DEFAULT TOKEN_IF TOKEN_ELSE TOKEN_SWITCH
-%token TOKEN_WHILE TOKEN_DO TOKEN_LAUNCH
+%token TOKEN_WHILE TOKEN_DO TOKEN_LAUNCH TOKEN_FOREACH TOKEN_FOREACH_TILED TOKEN_DOTDOTDOT
 %token TOKEN_FOR TOKEN_GOTO TOKEN_CONTINUE TOKEN_BREAK TOKEN_RETURN
 %token TOKEN_CIF TOKEN_CDO TOKEN_CFOR TOKEN_CWHILE TOKEN_CBREAK
 %token TOKEN_CCONTINUE TOKEN_CRETURN TOKEN_SYNC TOKEN_PRINT TOKEN_ASSERT
@@ -194,8 +211,8 @@ static const char *lParamListTokens[] = {
 %type <structDeclaration> struct_declaration
 %type <structDeclarationList> struct_declaration_list

-%type <enumeratorList> enumerator_list
-%type <enumerator> enumerator
+%type <symbolList> enumerator_list
+%type <symbol> enumerator foreach_identifier
 %type <enumType> enum_specifier

 %type <type> specifier_qualifier_list struct_or_union_specifier
@@ -211,6 +228,9 @@ static const char *lParamListTokens[] = {
 %type <constCharPtr> struct_or_union_name enum_identifier
 %type <int32Val> int_constant soa_width_specifier

+%type <foreachDimension> foreach_dimension_specifier
+%type <foreachDimensionList> foreach_dimension_list
+
 %start translation_unit
 %%

@@ -1295,6 +1315,40 @@ cfor_scope
    : TOKEN_CFOR { m->symbolTable->PushScope(); }
    ;

+foreach_scope
+    : TOKEN_FOREACH { m->symbolTable->PushScope(); /* popped by the foreach alternative of iteration_statement */ }
+    ;
+
+foreach_tiled_scope
+    : TOKEN_FOREACH_TILED { m->symbolTable->PushScope(); /* popped by the foreach_tiled alternative of iteration_statement */ }
+    ;
+
+foreach_identifier
+    : TOKEN_IDENTIFIER
+    {
+        $$ = new Symbol(yytext, @1, AtomicType::VaryingConstInt32);  // added to the symbol table later, in iteration_statement
+    }
+    ;
+
+foreach_dimension_specifier
+    : foreach_identifier '=' assignment_expression TOKEN_DOTDOTDOT assignment_expression
+    {
+        $$ = new ForeachDimension($1, $3, $5);  // (index symbol, begin expression, end expression)
+    }
+    ;
+
+foreach_dimension_list
+    : foreach_dimension_specifier
+    {
+        $$ = new std::vector<ForeachDimension *>;  // start the list with the first dimension
+        $$->push_back($1);
+    }
+    | foreach_dimension_list ',' foreach_dimension_specifier
+    {
+        $$ = $1; $$->push_back($3);  // set $$ explicitly instead of relying on the parser pre-loading it with $1
+    }
+    ;
+
 iteration_statement
    : TOKEN_WHILE '(' expression ')' statement
      { $$ = new ForStmt(NULL, $3, NULL, $5, false, @1); }
@@ -1320,6 +1374,44 @@ iteration_statement
      { $$ = new ForStmt($3, $4, new ExprStmt($5, @5), $7, true, @1);
        m->symbolTable->PopScope();
      }
+    | foreach_scope '(' foreach_dimension_list ')'
+     {
+         std::vector<ForeachDimension *> &dims = *$3;
+         for (unsigned int i = 0; i < dims.size(); ++i)
+             m->symbolTable->AddVariable(dims[i]->sym);
+     }
+     statement
+     {
+         std::vector<ForeachDimension *> &dims = *$3;
+         std::vector<Symbol *> syms;
+         std::vector<Expr *> begins, ends;
+         for (unsigned int i = 0; i < dims.size(); ++i) {
+             syms.push_back(dims[i]->sym);
+             begins.push_back(dims[i]->beginExpr);
+             ends.push_back(dims[i]->endExpr);
+         }
+         $$ = new ForeachStmt(syms, begins, ends, $6, false, @1);
+         m->symbolTable->PopScope();
+     }
+    | foreach_tiled_scope '(' foreach_dimension_list ')'
+     {
+         std::vector<ForeachDimension *> &dims = *$3;
+         for (unsigned int i = 0; i < dims.size(); ++i)
+             m->symbolTable->AddVariable(dims[i]->sym);
+     }
+     statement
+     {
+         std::vector<ForeachDimension *> &dims = *$3;
+         std::vector<Symbol *> syms;
+         std::vector<Expr *> begins, ends;
+         for (unsigned int i = 0; i < dims.size(); ++i) {
+             syms.push_back(dims[i]->sym);
+             begins.push_back(dims[i]->beginExpr);
+             ends.push_back(dims[i]->endExpr);
+         }
+         $$ = new ForeachStmt(syms, begins, ends, $6, true, @1);
+         m->symbolTable->PopScope();
+     }
    ;

 jump_statement
--- a/stmt.cpp
+++ b/stmt.cpp
@@ -819,6 +819,17 @@ lSafeToRunWithAllLanesOff(Stmt *stmt) {
                lSafeToRunWithAllLanesOff(fs->step) &&
                lSafeToRunWithAllLanesOff(fs->stmts));

+    ForeachStmt *fes;
+    if ((fes = dynamic_cast<ForeachStmt *>(stmt)) != NULL) {
+        for (unsigned int i = 0; i < fes->startExprs.size(); ++i)
+            if (!lSafeToRunWithAllLanesOff(fes->startExprs[i]))
+                return false;
+        for (unsigned int i = 0; i < fes->endExprs.size(); ++i)
+            if (!lSafeToRunWithAllLanesOff(fes->endExprs[i]))
+                return false;
+        return lSafeToRunWithAllLanesOff(fes->stmts);
+    }
+
    if (dynamic_cast<BreakStmt *>(stmt) != NULL ||
        dynamic_cast<ContinueStmt *>(stmt) != NULL)
        return true;
@@ -1592,6 +1603,463 @@ ContinueStmt::Print(int indent) const {
 }


+///////////////////////////////////////////////////////////////////////////
+// ForeachStmt
+
+ForeachStmt::ForeachStmt(const std::vector<Symbol *> &lvs,  // index symbols, one per dimension
+                         const std::vector<Expr *> &se,  // start expression for each dimension
+                         const std::vector<Expr *> &ee,  // end expression for each dimension
+                         Stmt *s, bool t, SourcePos pos)  // loop body; t is true for foreach_tiled
+    : Stmt(pos), dimVariables(lvs), startExprs(se), endExprs(ee), isTiled(t),
+      stmts(s) {
+}
+
+
+/* Load the uniform counter from uniformCounterPtr, broadcast it to every lane, add the per-lane
+   offset for dimension dim (out of nDims; spans[] holds each dimension's span), store the
+   resulting varying counter to varyingCounterPtr, and return it for use within the loop body.
+ */
+static llvm::Value *
+lUpdateVaryingCounter(int dim, int nDims, FunctionEmitContext *ctx, 
+                      llvm::Value *uniformCounterPtr,
+                      llvm::Value *varyingCounterPtr,
+                      const std::vector<int> &spans) {
+    // Smear the uniform counter value out to be varying
+    llvm::Value *counter = ctx->LoadInst(uniformCounterPtr);
+    llvm::Value *smearCounter = 
+        llvm::UndefValue::get(LLVMTypes::Int32VectorType);
+    for (int i = 0; i < g->target.vectorWidth; ++i)
+        smearCounter = 
+            ctx->InsertInst(smearCounter, counter, i, "smear_counter");
+
+    // Figure out the offsets; this is a little bit tricky.  As an example,
+    // consider a 2D tiled foreach loop, where we're running 8-wide and
+    // where the inner dimension has a stride of 4 and the outer dimension
+    // has a stride of 2.  For the inner dimension, we want the offsets
+    // (0,1,2,3,0,1,2,3), and for the outer dimension we want
+    // (0,0,0,0,1,1,1,1).
+    int32_t delta[ISPC_MAX_NVEC];
+    for (int i = 0; i < g->target.vectorWidth; ++i) {
+        int d = i;
+        // First, account for the effect of any dimensions at deeper
+        // nesting levels than the current one.
+        int prevDimSpanCount = 1;  // becomes the product of spans[dim+1 .. nDims-1]
+        for (int j = dim; j < nDims-1; ++j)
+            prevDimSpanCount *= spans[j+1];
+        d /= prevDimSpanCount;
+
+        // And now with what's left, figure out our own offset
+        delta[i] = d % spans[dim];
+    }
+
+    // Add the deltas to compute the varying counter values; store the
+    // result to memory and then return it directly as well.
+    llvm::Value *varyingCounter = 
+        ctx->BinaryOperator(llvm::Instruction::Add, smearCounter,
+                            LLVMInt32Vector(delta), "iter_val");
+    ctx->StoreInst(varyingCounter, varyingCounterPtr);
+    return varyingCounter;
+}
+
+
+/** Returns floor(log2(i)) for i > 0 (e.g. 16 -> 4, 17 -> 4), and -1 for i == 0. */
+static int
+lLog2(int i) {
+    int ret = 0;
+    while (i != 0) {  // NOTE(review): a negative i would presumably never reach 0 if >> is an arithmetic shift -- confirm callers only pass i >= 0
+        ++ret;
+        i >>= 1;
+    }
+    return ret-1;
+}
+
+
+/* Figure out how many elements to process in each dimension for each time through a
+   foreach loop.  The untiled case is easy; all of the outer dimensions up until the
+   innermost one have a span of 1, and the innermost one takes the entire vector width.
+   For the tiled case, we give wider spans to the innermost dimensions while also trying
+   to generate relatively square domains.
+   This code works recursively from outer dimensions to inner dimensions: each call
+   writes the current dimension's span to *a and recurses on a+1 with dimsLeft-1, so a must
+   have room for dimsLeft+1 ints.  itemsLeft is divided by each span chosen; nDims is only forwarded.
+ */
+static void
+lGetSpans(int dimsLeft, int nDims, int itemsLeft, bool isTiled, int *a) {
+    if (dimsLeft == 0) {
+        // Nothing left to do but give all of the remaining work to the
+        // innermost domain.
+        *a = itemsLeft;
+        return;
+    }
+
+    if (isTiled == false || (dimsLeft >= lLog2(itemsLeft)))
+        // If we're not tiled, or if there are enough dimensions left that
+        // giving this one any more than a span of one would mean that a
+        // later dimension would have to have a span of one, give this one
+        // a span of one to save the available items for later.
+        *a = 1;
+    else if (itemsLeft >= 16 && (dimsLeft == 1))
+        // Special case to have 4x4 domains for the 2D case when running
+        // 16-wide.
+        *a = 4;
+    else
+        // Otherwise give this dimension a span of two. 
+        *a = 2;
+
+    lGetSpans(dimsLeft-1, nDims, itemsLeft / *a, isTiled, a+1);
+}
+
+
+/* Emit code for a foreach statement.  We effectively emit code to run the
+   set of n-dimensional nested loops corresponding to the dimensionality of
+   the foreach statement along with the extra logic to deal with mismatches
+   between the vector width we're compiling to and the number of elements
+   to process.
+ */
+void
+ForeachStmt::EmitCode(FunctionEmitContext *ctx) const {
+    if (ctx->GetCurrentBasicBlock() == NULL || stmts == NULL) 
+        return;
+
+    llvm::BasicBlock *bbCheckExtras = ctx->CreateBasicBlock("foreach_check_extras");
+    llvm::BasicBlock *bbDoExtras = ctx->CreateBasicBlock("foreach_do_extras");
+    llvm::BasicBlock *bbBody = ctx->CreateBasicBlock("foreach_body");
+    llvm::BasicBlock *bbExit = ctx->CreateBasicBlock("foreach_exit");
+
+    llvm::Value *oldMask = ctx->GetInternalMask();
+
+    ctx->StartForeach();
+    ctx->SetDebugPos(pos);
+    ctx->StartScope();
+
+    // This should be caught during typechecking
+    assert(startExprs.size() == dimVariables.size() && 
+           endExprs.size() == dimVariables.size());
+    int nDims = (int)dimVariables.size();
+
+    ///////////////////////////////////////////////////////////////////////
+    // Setup: compute the number of items we have to work on in each
+    // dimension and a number of derived values.
+    std::vector<llvm::BasicBlock *> bbReset, bbStep, bbTest;
+    std::vector<llvm::Value *> startVals, endVals, uniformCounterPtrs;
+    std::vector<llvm::Value *> nItems, nExtras, alignedEnd;
+    std::vector<llvm::Value *> extrasMaskPtrs;
+
+    std::vector<int> span(nDims, 0);
+    lGetSpans(nDims-1, nDims, g->target.vectorWidth, isTiled, &span[0]);
+
+    for (int i = 0; i < nDims; ++i) {
+        // Basic blocks that we'll fill in later with the looping logic for
+        // this dimension.
+        bbReset.push_back(ctx->CreateBasicBlock("foreach_reset"));
+        bbStep.push_back(ctx->CreateBasicBlock("foreach_step"));
+        bbTest.push_back(ctx->CreateBasicBlock("foreach_test"));
+
+        // Start and end value for this loop dimension
+        llvm::Value *sv = startExprs[i]->GetValue(ctx);
+        llvm::Value *ev = endExprs[i]->GetValue(ctx);
+        if (sv == NULL || ev == NULL)
+            return;
+        startVals.push_back(sv);
+        endVals.push_back(ev);
+
+        // nItems = endVal - startVal
+        nItems.push_back(ctx->BinaryOperator(llvm::Instruction::Sub, ev, sv,
+                                             "nitems"));
+
+        // nExtras = nItems % (span for this dimension)
+        // This gives us the number of extra elements we need to deal with
+        // at the end of the loop for this dimension that don't fit cleanly
+        // into a vector width.
+        nExtras.push_back(ctx->BinaryOperator(llvm::Instruction::SRem, nItems[i],
+                                              LLVMInt32(span[i]), "nextras"));
+
+        // alignedEnd = endVal - nExtras
+        alignedEnd.push_back(ctx->BinaryOperator(llvm::Instruction::Sub, ev,
+                                                 nExtras[i], "aligned_end"));
+
+        ///////////////////////////////////////////////////////////////////////
+        // Each dimension has a loop counter that is a uniform value that
+        // goes from startVal to endVal, in steps of the span for this
+        // dimension.  Its value is only used internally here for looping
+        // logic and isn't directly available in the user's program code.
+        uniformCounterPtrs.push_back(ctx->AllocaInst(LLVMTypes::Int32Type, 
+                                                     "counter"));
+        ctx->StoreInst(startVals[i], uniformCounterPtrs[i]);
+
+        // There is also a varying variable that holds the set of index
+        // values for each dimension in the current loop iteration; this is
+        // the value that is program-visible.
+        dimVariables[i]->storagePtr = ctx->AllocaInst(LLVMTypes::Int32VectorType, 
+                                                  dimVariables[i]->name.c_str());
+        dimVariables[i]->parentFunction = ctx->GetFunction();
+        ctx->EmitVariableDebugInfo(dimVariables[i]);
+
+        // Each dimension also maintains a mask that represents which of
+        // the varying elements in the current iteration should be
+        // processed.  (i.e. this is used to disable the lanes that have
+        // out-of-bounds offsets.)
+        extrasMaskPtrs.push_back(ctx->AllocaInst(LLVMTypes::MaskType, "extras mask"));
+        ctx->StoreInst(LLVMMaskAllOn, extrasMaskPtrs[i]);
+    }
+
+    // On to the outermost loop's test
+    ctx->BranchInst(bbTest[0]);
+
+    ///////////////////////////////////////////////////////////////////////////
+    // foreach_reset: this code runs when we need to reset the counter for
+    // a given dimension in preparation for running through its loop again,
+    // after the enclosing level advances its counter.
+    for (int i = 0; i < nDims; ++i) {
+        ctx->SetCurrentBasicBlock(bbReset[i]);
+        if (i == 0)
+            ctx->BranchInst(bbExit);
+        else {
+            ctx->StoreInst(LLVMMaskAllOn, extrasMaskPtrs[i]);
+            ctx->StoreInst(startVals[i], uniformCounterPtrs[i]);
+            ctx->BranchInst(bbStep[i-1]);
+        }
+    }
+
+    ///////////////////////////////////////////////////////////////////////////
+    // foreach_test
+    std::vector<llvm::Value *> inExtras;
+    for (int i = 0; i < nDims; ++i) {
+        ctx->SetCurrentBasicBlock(bbTest[i]);
+
+        llvm::Value *haveExtras = 
+            ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SGT,
+                         endVals[i], alignedEnd[i], "have_extras");
+
+        llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[i], "counter");
+        llvm::Value *atAlignedEnd = 
+            ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ,
+                         counter, alignedEnd[i], "at_aligned_end");
+        llvm::Value *inEx = 
+            ctx->BinaryOperator(llvm::Instruction::And, haveExtras,
+                                atAlignedEnd, "in_extras");
+
+        if (i == 0)
+            inExtras.push_back(inEx);
+        else
+            inExtras.push_back(ctx->BinaryOperator(llvm::Instruction::Or, inEx,
+                                                   inExtras[i-1], "in_extras_all"));
+
+        llvm::Value *varyingCounter = 
+            lUpdateVaryingCounter(i, nDims, ctx, uniformCounterPtrs[i], 
+                                  dimVariables[i]->storagePtr, span);
+
+        llvm::Value *smearEnd = llvm::UndefValue::get(LLVMTypes::Int32VectorType);
+        for (int j = 0; j < g->target.vectorWidth; ++j)
+            smearEnd = ctx->InsertInst(smearEnd, endVals[i], j, "smear_end");
+        // Do a vector compare of its value to the end value to generate a
+        // mask for this last bit of work.
+        llvm::Value *emask = 
+            ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT,
+                         varyingCounter, smearEnd);
+        emask = ctx->I1VecToBoolVec(emask);
+
+        if (i == 0)
+            ctx->StoreInst(emask, extrasMaskPtrs[i]);
+        else {
+            // FIXME: at least specialize the innermost loop to not do all
+            // this mask stuff each time through the test...
+            llvm::Value *oldMask = ctx->LoadInst(extrasMaskPtrs[i-1]);
+            llvm::Value *newMask =
+                ctx->BinaryOperator(llvm::Instruction::And, oldMask, emask,
+                                    "extras_mask");
+            ctx->StoreInst(newMask, extrasMaskPtrs[i]);
+        }
+
+        llvm::Value *notAtEnd = 
+            ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT,
+                         counter, endVals[i]);
+        if (i != nDims-1)
+            ctx->BranchInst(bbTest[i+1], bbReset[i], notAtEnd);
+        else
+            ctx->BranchInst(bbCheckExtras, bbReset[i], notAtEnd);
+    }
+
+    ///////////////////////////////////////////////////////////////////////////
+    // foreach_step: increment the uniform counter by the vector width.
+    // Note that we don't increment the varying counter here as well but
+    // just generate its value when we need it in the loop body.
+    for (int i = 0; i < nDims; ++i) {
+        ctx->SetCurrentBasicBlock(bbStep[i]);
+        llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[i]);
+        llvm::Value *newCounter =  
+            ctx->BinaryOperator(llvm::Instruction::Add, counter,
+                                LLVMInt32(span[i]), "new_counter");
+        ctx->StoreInst(newCounter, uniformCounterPtrs[i]);
+        ctx->BranchInst(bbTest[i]);
+    }
+
+    ///////////////////////////////////////////////////////////////////////////
+    // foreach_check_extras: see if we need to deal with any partial
+    // vector's worth of work that's left.
+    ctx->SetCurrentBasicBlock(bbCheckExtras);
+    ctx->AddInstrumentationPoint("foreach loop check extras");
+    ctx->BranchInst(bbDoExtras, bbBody, inExtras[nDims-1]);
+
+    ///////////////////////////////////////////////////////////////////////////
+    // foreach_body: do a full vector's worth of work.  We know that all
+    // lanes will be running here, so we explicitly set the mask to be 'all
+    // on'.  This ends up being relatively straightforward: just update the
+    // value of the varying loop counter and have the statements in the
+    // loop body emit their code.
+    ctx->SetCurrentBasicBlock(bbBody);
+    ctx->SetInternalMask(LLVMMaskAllOn);
+    ctx->AddInstrumentationPoint("foreach loop body");
+    stmts->EmitCode(ctx);
+    assert(ctx->GetCurrentBasicBlock() != NULL);
+    ctx->BranchInst(bbStep[nDims-1]);
+
+    ///////////////////////////////////////////////////////////////////////////
+    // foreach_doextras: set the mask and have the statements emit their
+    // code again.  Note that it's generally worthwhile having two copies
+    // of the statements' code, since the code above is emitted with the
+    // mask known to be all-on, which in turn leads to more efficient code
+    // for that case.
+    ctx->SetCurrentBasicBlock(bbDoExtras);
+    llvm::Value *mask = ctx->LoadInst(extrasMaskPtrs[nDims-1]);
+    ctx->SetInternalMask(mask);
+    stmts->EmitCode(ctx);
+    ctx->BranchInst(bbStep[nDims-1]);
+
+    ///////////////////////////////////////////////////////////////////////////
+    // foreach_exit: All done.  Restore the old mask and clean up
+    ctx->SetCurrentBasicBlock(bbExit);
+    ctx->SetInternalMask(oldMask);
+
+    ctx->EndForeach();
+    ctx->EndScope();
+}
+
+
+/** Runs the optimization pass over the children of the foreach statement:
+    the per-dimension start and end expressions and the loop body.
+
+    @return this statement, or NULL if any start/end expression or the
+            loop body is NULL after optimization.
+ */
+Stmt *
+ForeachStmt::Optimize() {
+    bool anyErrors = false;
+    for (unsigned int i = 0; i < startExprs.size(); ++i) {
+        // Store the node handed back by Optimize() so that a replaced
+        // expression (or a NULL result) is not silently dropped.
+        if (startExprs[i] != NULL)
+            startExprs[i] = startExprs[i]->Optimize();
+        anyErrors |= (startExprs[i] == NULL);
+    }
+    for (unsigned int i = 0; i < endExprs.size(); ++i) {
+        if (endExprs[i] != NULL)
+            endExprs[i] = endExprs[i]->Optimize();
+        anyErrors |= (endExprs[i] == NULL);
+    }
+
+    // The body must be optimized here; type checking is TypeCheck()'s job.
+    if (stmts != NULL)
+        stmts = stmts->Optimize();
+    anyErrors |= (stmts == NULL);
+
+    return anyErrors ? NULL : this;
+}
+
+
+/** Type checks the foreach statement.
+
+    Every start and end expression is converted to a uniform int32 and
+    then type checked, the loop body is type checked, and the number of
+    start and end expressions is compared against the number of loop
+    dimension variables.
+
+    @return this statement, or NULL if any child is NULL after checking
+            or if the start/end expression counts don't match the number
+            of dimension variables (an error is reported in that case).
+ */
+Stmt *
+ForeachStmt::TypeCheck() {
+    bool anyErrors = false;
+    for (unsigned int i = 0; i < startExprs.size(); ++i) {
+        if (startExprs[i] != NULL)
+            startExprs[i] = TypeConvertExpr(startExprs[i], 
+                                            AtomicType::UniformInt32, 
+                                            "foreach starting value");
+        // Keep the result of TypeCheck() so that a failure (NULL) is
+        // picked up by the check below instead of being discarded.
+        if (startExprs[i] != NULL)
+            startExprs[i] = startExprs[i]->TypeCheck();
+        anyErrors |= (startExprs[i] == NULL);
+    }
+    for (unsigned int i = 0; i < endExprs.size(); ++i) {
+        if (endExprs[i] != NULL)
+            endExprs[i] = TypeConvertExpr(endExprs[i], AtomicType::UniformInt32,
+                                          "foreach ending value");
+        if (endExprs[i] != NULL)
+            endExprs[i] = endExprs[i]->TypeCheck();
+        anyErrors |= (endExprs[i] == NULL);
+    }
+
+    if (stmts != NULL) 
+        stmts = stmts->TypeCheck();
+    anyErrors |= (stmts == NULL);
+
+    // There must be exactly one start value per loop dimension...
+    if (startExprs.size() < dimVariables.size()) {
+        Error(pos, "Not enough initial values provided for \"foreach\" loop; "
+              "got %d, expected %d\n", (int)startExprs.size(), (int)dimVariables.size());
+        anyErrors = true;
+    }
+    else if (startExprs.size() > dimVariables.size()) {
+        Error(pos, "Too many initial values provided for \"foreach\" loop; "
+              "got %d, expected %d\n", (int)startExprs.size(), (int)dimVariables.size());
+        anyErrors = true;
+    }
+
+    // ...and exactly one end value per loop dimension.
+    if (endExprs.size() < dimVariables.size()) {
+        Error(pos, "Not enough ending values provided for \"foreach\" loop; "
+              "got %d, expected %d\n", (int)endExprs.size(), (int)dimVariables.size());
+        anyErrors = true;
+    }
+    else if (endExprs.size() > dimVariables.size()) {
+        Error(pos, "Too many ending values provided for \"foreach\" loop; "
+              "got %d, expected %d\n", (int)endExprs.size(), (int)dimVariables.size());
+        anyErrors = true;
+    }
+
+    return anyErrors ? NULL : this;
+}
+
+
+/** Cost estimate for the foreach statement: a fixed charge for each loop
+    dimension plus the estimated cost of the loop body, if there is one. */
+int
+ForeachStmt::EstimateCost() const {
+    // Charge per dimension: one uniform loop plus one simple arithmetic op.
+    const int perDimCost = COST_UNIFORM_LOOP + COST_SIMPLE_ARITH_LOGIC_OP;
+
+    int bodyCost = 0;
+    if (stmts)
+        bodyCost = stmts->EstimateCost();
+
+    return dimVariables.size() * perDimCost + bodyCost;
+}
+
+
+/** Debug dump of the foreach statement to stdout: a header with the source
+    position, the dimension variables, the start and end expressions, and
+    finally the loop body (indented further than the header). */
+void
+ForeachStmt::Print(int indent) const {
+    // Header line: statement kind followed by its source position.
+    printf("%*cForeach Stmt", indent, ' ');
+    pos.Print();
+    printf("\n");
+    
+    // One line per loop dimension variable.
+    const int childIndent = indent+4;
+    for (unsigned int dim = 0; dim < dimVariables.size(); ++dim) {
+        const Symbol *sym = dimVariables[dim];
+        if (sym == NULL) {
+            printf("%*cVar %d: NULL\n", childIndent, ' ', dim);
+            continue;
+        }
+        printf("%*cVar %d: %s\n", childIndent, ' ', dim, sym->name.c_str());
+    }
+
+    // Start expressions, comma separated, with a newline after the last.
+    printf("Start values:\n");
+    for (unsigned int s = 0; s < startExprs.size(); ++s) {
+        Expr *start = startExprs[s];
+        if (start == NULL)
+            printf("NULL");
+        else
+            start->Print();
+        const bool isLast = (s + 1 == startExprs.size());
+        printf("%s", isLast ? "\n" : ", ");
+    }
+
+    // End expressions, in the same format.
+    printf("End values:\n");
+    for (unsigned int e = 0; e < endExprs.size(); ++e) {
+        Expr *end = endExprs[e];
+        if (end == NULL)
+            printf("NULL");
+        else
+            end->Print();
+        const bool isLast = (e + 1 == endExprs.size());
+        printf("%s", isLast ? "\n" : ", ");
+    }
+
+    // Loop body, if any.
+    if (stmts == NULL)
+        return;
+    printf("%*cStmts:\n", childIndent, ' ');
+    stmts->Print(indent+8);
+}
+
+
 ///////////////////////////////////////////////////////////////////////////
 // ReturnStmt

@@ -1606,6 +2074,11 @@ ReturnStmt::EmitCode(FunctionEmitContext *ctx) const {
    if (!ctx->GetCurrentBasicBlock()) 
        return;

+    if (ctx->InForeachLoop()) {
+        Error(pos, "\"return\" statement is illegal inside a \"foreach\" loop.");
+        return;
+    }
+
    ctx->SetDebugPos(pos);
    ctx->CurrentLanesReturned(val, doCoherenceCheck);
 }
--- a/stmt.h
+++ b/stmt.h
@@ -241,6 +241,31 @@ private:
 };


+/** @brief Statement implementation for parallel 'foreach' loops.
+
+    Holds the loop dimension variables, one start and one end expression
+    per dimension, and the loop body.
+ */
+class ForeachStmt : public Stmt {
+public:
+    ForeachStmt(const std::vector<Symbol *> &loopVars, 
+                const std::vector<Expr *> &startExprs, 
+                const std::vector<Expr *> &endExprs, 
+                Stmt *bodyStatements, bool tiled, SourcePos pos);
+
+    /** Emits the code for the loop into the given function context. */
+    void EmitCode(FunctionEmitContext *ctx) const;
+    /** Prints a debug dump of the statement and its children to stdout. */
+    void Print(int indent) const;
+
+    /** Optimizes the child expressions and body; returns NULL on error. */
+    Stmt *Optimize();
+    /** Converts the start/end expressions to uniform int32, type checks
+        the children and verifies that the number of start and end
+        expressions matches the number of dimension variables; returns
+        NULL on error. */
+    Stmt *TypeCheck();
+    /** Returns a per-dimension charge plus the cost of the body. */
+    int EstimateCost() const;
+
+    /** Loop variables, one per loop dimension. */
+    std::vector<Symbol *> dimVariables;
+    /** Starting value expression for each dimension. */
+    std::vector<Expr *> startExprs;
+    /** Ending value expression for each dimension. */
+    std::vector<Expr *> endExprs;
+    /** NOTE(review): presumably set from the constructor's 'tiled'
+        argument to distinguish 'foreach_tiled' from 'foreach' -- confirm. */
+    bool isTiled;
+    /** Statements making up the loop body. */
+    Stmt *stmts;
+};
+
+
+
 /** @brief Statement implementation for a 'return' or 'coherent' return
    statement in the program. */
 class ReturnStmt : public Stmt {
--- a/tests/foreach-1.ispc
+++ b/tests/foreach-1.ispc
@@ -0,0 +1,22 @@
+
+// Reports programCount; presumably queried by the test harness.
+export uniform int width() { return programCount; }
+
+
+// foreach over exactly programCount elements: each val[i] accumulates
+// aFOO[i] - 1, and the uniform sum of val[] is written to RET.
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    uniform float val[programCount];
+    for (uniform int i = 0; i < programCount; ++i)
+        val[i] = 0;
+
+    // Single-statement body, no braces.
+    foreach (i = 0 ... programCount)
+        val[i] += aFOO[i] - 1;
+
+    uniform float sum = 0;
+    for (uniform int i = 0; i < programCount; ++i)
+        sum += val[i];
+
+    RET[programIndex] = sum; 
+}
+
+// Expected output (presumably compared against f_f's RET by the harness):
+// the sum of programIndex over all program instances.
+// NOTE(review): matches f_f only if aFOO[i] == i+1 -- confirm.
+export void result(uniform float RET[]) {
+    RET[programIndex] = reduce_add(programIndex);
+}
--- a/tests/foreach-10.ispc
+++ b/tests/foreach-10.ispc
@@ -0,0 +1,33 @@
+
+// Reports programCount; presumably queried by the test harness.
+export uniform int width() { return programCount; }
+
+// 3D foreach_tiled over an NA x NB x NC array: every element is zeroed,
+// incremented once by the foreach_tiled body, and then checked to be
+// exactly 1.  The number of mismatching elements is written to RET.
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+#define NA 4
+#define NB 8
+#define NC 7
+    uniform int a[NA][NB][NC];
+
+    // Zero the whole array; the innermost loop must test and step k (not
+    // j), otherwise most of the array is left uninitialized.
+    for (uniform int i = 0; i < NA; ++i)
+        for (uniform int j = 0; j < NB; ++j)
+            for (uniform int k = 0; k < NC; ++k)
+                a[i][j][k] = 0;
+
+    foreach_tiled (i = 0 ... NA, j = 0 ... NB, k = 0 ... NC) {
+        a[i][j][k] += 1;
+    }
+
+    // Every element must have been visited exactly once.
+    uniform int errs = 0;
+    for (uniform int i = 0; i < NA; ++i)
+        for (uniform int j = 0; j < NB; ++j)
+            for (uniform int k = 0; k < NC; ++k)
+                if (a[i][j][k] != 1) {
+//CO                    print("% % % = %\n", i, j, k, a[i][j][k]);
+                    ++errs;
+                }
+
+    RET[programIndex] = errs;
+}
+
+// Expected output (presumably compared against f_f's RET by the harness):
+// zero mismatching elements.
+export void result(uniform float RET[]) {
+    RET[programIndex] = 0;
+}
--- a/tests/foreach-11.ispc
+++ b/tests/foreach-11.ispc
@@ -0,0 +1,22 @@
+
+// Reports programCount; presumably queried by the test harness.
+export uniform int width() { return programCount; }
+
+
+// foreach_tiled over exactly programCount elements: each val[i]
+// accumulates aFOO[i] - 1, and the uniform sum of val[] is written to RET.
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    uniform float val[programCount];
+    for (uniform int i = 0; i < programCount; ++i)
+        val[i] = 0;
+
+    // Single-statement body, no braces.
+    foreach_tiled (i = 0 ... programCount)
+        val[i] += aFOO[i] - 1;
+
+    uniform float sum = 0;
+    for (uniform int i = 0; i < programCount; ++i)
+        sum += val[i];
+
+    RET[programIndex] = sum; 
+}
+
+// Expected output (presumably compared against f_f's RET by the harness):
+// the sum of programIndex over all program instances.
+// NOTE(review): matches f_f only if aFOO[i] == i+1 -- confirm.
+export void result(uniform float RET[]) {
+    RET[programIndex] = reduce_add(programIndex);
+}
--- a/tests/foreach-12.ispc
+++ b/tests/foreach-12.ispc
@@ -0,0 +1,26 @@
+
+// Reports programCount; presumably queried by the test harness.
+export uniform int width() { return programCount; }
+
+
+uniform int foo(int i);
+
+// foreach_tiled with a non-zero start value: val[i] accumulates i for i
+// in [2, programCount), and the uniform sum of val[] is written to RET.
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    uniform float val[programCount];
+    for (uniform int i = 0; i < programCount; ++i)
+        val[i] = 0;
+
+    // The range starts at 2, so val[0] and val[1] must stay zero.
+    foreach_tiled (i = 2 ... programCount)
+        val[i] += i;
+
+    uniform float sum = 0;
+    for (uniform int i = 0; i < programCount; ++i) {
+        sum += val[i];
+    }
+
+    RET[programIndex] = sum; 
+}
+
+// Expected output (presumably compared against f_f's RET by the harness):
+// the sum of programIndex over the instances with programIndex >= 2.
+export void result(uniform float RET[]) {
+    int pi = (programIndex >= 2) ? programIndex : 0;
+    RET[programIndex] = reduce_add(pi);
+}
--- a/tests/foreach-13.ispc
+++ b/tests/foreach-13.ispc
@@ -0,0 +1,19 @@
+
+// Reports programCount; presumably queried by the test harness.
+export uniform int width() { return programCount; }
+
+
+uniform int foo(int i);
+
+// foreach_tiled over a fixed two-element range: RET is zeroed and then
+// RET[i] is set to i+1 for i in [0, 2) only.
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    for (uniform int i = 0; i < programCount; ++i)
+        RET[i] = 0;
+
+    // Elements at index 2 and above must not be written.
+    foreach_tiled (i = 0 ... 2)
+        RET[i] = i+1;
+}
+
+// Expected output (presumably compared against f_f's RET by the harness):
+// RET[0] == 1, RET[1] == 2, and zero in the elements written through
+// programIndex otherwise.
+export void result(uniform float RET[]) {
+    RET[programIndex] = 0;
+    RET[0] = 1;
+    RET[1] = 2;
+}
--- a/tests/foreach-14.ispc
+++ b/tests/foreach-14.ispc
@@ -0,0 +1,17 @@
+
+// Reports programCount; presumably queried by the test harness.
+export uniform int width() { return programCount; }
+
+
+uniform int foo(int i);
+
+// foreach_tiled with a start value greater than its end value (2 ... 0).
+// result() expects RET to stay all zero, i.e. the body must never run.
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    for (uniform int i = 0; i < programCount; ++i)
+        RET[i] = 0;
+
+    foreach_tiled (i = 2 ... 0)
+        RET[i] += 1234;
+}
+
+// Expected output (presumably compared against f_f's RET by the harness):
+// all zeros.
+export void result(uniform float RET[]) {
+    RET[programIndex] = 0;
+}
--- a/tests/foreach-15.ispc
+++ b/tests/foreach-15.ispc
@@ -0,0 +1,17 @@
+
+// Reports programCount; presumably queried by the test harness.
+export uniform int width() { return programCount; }
+
+
+uniform int foo(int i);
+
+// foreach_tiled over an empty range (start == end == 1).  result()
+// expects RET to stay all zero, i.e. the body must never run.
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    for (uniform int i = 0; i < programCount; ++i)
+        RET[i] = 0;
+
+    foreach_tiled (i = 1 ... 1)
+        RET[i] = 1234;
+}
+
+// Expected output (presumably compared against f_f's RET by the harness):
+// all zeros.
+export void result(uniform float RET[]) {
+    RET[programIndex] = 0;
+}
--- a/tests/foreach-16.ispc
+++ b/tests/foreach-16.ispc
@@ -0,0 +1,17 @@
+
+// Reports programCount; presumably queried by the test harness.
+export uniform int width() { return programCount; }
+
+
+uniform int foo(int i);
+
+// foreach_tiled with a negative start value: i runs over
+// [-2, programCount-2) and RET[i+2] is set, so the first programCount
+// elements of RET should all end up as 1234.
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    for (uniform int i = 0; i < programCount; ++i)
+        RET[i] = 0;
+
+    foreach_tiled (i = -2 ... programCount-2)
+        RET[i+2] = 1234;
+}
+
+// Expected output (presumably compared against f_f's RET by the harness):
+// 1234 in every element.
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1234;
+}
--- a/tests/foreach-17.ispc
+++ b/tests/foreach-17.ispc
@@ -0,0 +1,13 @@
+
+// Reports programCount; presumably queried by the test harness.
+export uniform int width() { return programCount; }
+
+// foreach_tiled accumulating into a varying variable: aFOO[0..5] is
+// summed into 'sum', which is then reduced across the program instances.
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    float sum = 0;
+    foreach_tiled (i = 0 ... 6)
+        sum += aFOO[i];
+    RET[programIndex] = reduce_add(sum);
+}
+
+// Expected output (presumably compared against f_f's RET by the harness).
+// NOTE(review): 21 == 1+2+...+6, which matches f_f only if
+// aFOO[i] == i+1 -- confirm.
+export void result(uniform float RET[]) {
+    RET[programIndex] = 21;
+}
--- a/tests/foreach-18.ispc
+++ b/tests/foreach-18.ispc
@@ -0,0 +1,29 @@
+
+// Reports programCount; presumably queried by the test harness.
+export uniform int width() { return programCount; }
+
+// 2D foreach_tiled over a 3 x 8 array: every element is zeroed,
+// incremented by the foreach_tiled body, and then checked to be exactly
+// 1.  The number of mismatching elements is written to RET.
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+#define NA 3
+#define NB 8
+    uniform int a[NA][NB];
+
+    for (uniform int i = 0; i < NA; ++i)
+        for (uniform int j = 0; j < NB; ++j)
+            a[i][j] = 0;
+
+    foreach_tiled (i = 0 ... NA, j = 0 ... NB) {
+        a[i][j] += 1;
+    }
+
+    // Count elements that were skipped or visited more than once.
+    uniform int errs = 0;
+    for (uniform int i = 0; i < NA; ++i)
+        for (uniform int j = 0; j < NB; ++j)
+            if (a[i][j] != 1) {
+                ++errs;
+            }
+
+    RET[programIndex] = errs;
+}
+
+// Expected output (presumably compared against f_f's RET by the harness):
+// zero mismatching elements.
+export void result(uniform float RET[]) {
+    RET[programIndex] = 0;
+}
--- a/tests/foreach-19.ispc
+++ b/tests/foreach-19.ispc
@@ -0,0 +1,29 @@
+
+// Reports programCount; presumably queried by the test harness.
+export uniform int width() { return programCount; }
+
+// 2D foreach_tiled over a 3 x 4 array: every element is zeroed,
+// incremented by the foreach_tiled body, and then checked to be exactly
+// 1.  The number of mismatching elements is written to RET.
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+#define NA 3
+#define NB 4
+    uniform int a[NA][NB];
+
+    for (uniform int i = 0; i < NA; ++i)
+        for (uniform int j = 0; j < NB; ++j)
+            a[i][j] = 0;
+
+    foreach_tiled (i = 0 ... NA, j = 0 ... NB) {
+        a[i][j] += 1;
+    }
+
+    // Count elements that were skipped or visited more than once.
+    uniform int errs = 0;
+    for (uniform int i = 0; i < NA; ++i)
+        for (uniform int j = 0; j < NB; ++j)
+            if (a[i][j] != 1) {
+                ++errs;
+            }
+
+    RET[programIndex] = errs;
+}
+
+// Expected output (presumably compared against f_f's RET by the harness):
+// zero mismatching elements.
+export void result(uniform float RET[]) {
+    RET[programIndex] = 0;
+}
--- a/tests/foreach-2.ispc
+++ b/tests/foreach-2.ispc
@@ -0,0 +1,26 @@
+
+// Reports programCount; presumably queried by the test harness.
+export uniform int width() { return programCount; }
+
+
+uniform int foo(int i);
+
+// foreach with a non-zero start value: val[i] accumulates i for i in
+// [2, programCount), and the uniform sum of val[] is written to RET.
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    uniform float val[programCount];
+    for (uniform int i = 0; i < programCount; ++i)
+        val[i] = 0;
+
+    // The range starts at 2, so val[0] and val[1] must stay zero.
+    foreach (i = 2 ... programCount)
+        val[i] += i;
+
+    uniform float sum = 0;
+    for (uniform int i = 0; i < programCount; ++i) {
+        sum += val[i];
+    }
+
+    RET[programIndex] = sum; 
+}
+
+// Expected output (presumably compared against f_f's RET by the harness):
+// the sum of programIndex over the instances with programIndex >= 2.
+export void result(uniform float RET[]) {
+    int pi = (programIndex >= 2) ? programIndex : 0;
+    RET[programIndex] = reduce_add(pi);
+}
--- a/tests/foreach-3.ispc
+++ b/tests/foreach-3.ispc
@@ -0,0 +1,19 @@
+
+// Reports programCount; presumably queried by the test harness.
+export uniform int width() { return programCount; }
+
+
+uniform int foo(int i);
+
+// foreach over a fixed two-element range: RET is zeroed and then RET[i]
+// is set to i+1 for i in [0, 2) only.
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    for (uniform int i = 0; i < programCount; ++i)
+        RET[i] = 0;
+
+    // Elements at index 2 and above must not be written.
+    foreach (i = 0 ... 2)
+        RET[i] = i+1;
+}
+
+// Expected output (presumably compared against f_f's RET by the harness):
+// RET[0] == 1, RET[1] == 2, and zero in the elements written through
+// programIndex otherwise.
+export void result(uniform float RET[]) {
+    RET[programIndex] = 0;
+    RET[0] = 1;
+    RET[1] = 2;
+}
--- a/tests/foreach-4.ispc
+++ b/tests/foreach-4.ispc
@@ -0,0 +1,17 @@
+
+// Reports programCount; presumably queried by the test harness.
+export uniform int width() { return programCount; }
+
+
+uniform int foo(int i);
+
+// foreach with a start value greater than its end value (2 ... 0).
+// result() expects RET to stay all zero, i.e. the body must never run.
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    for (uniform int i = 0; i < programCount; ++i)
+        RET[i] = 0;
+
+    foreach (i = 2 ... 0)
+        RET[i] -= 1234;
+}
+
+// Expected output (presumably compared against f_f's RET by the harness):
+// all zeros.
+export void result(uniform float RET[]) {
+    RET[programIndex] = 0;
+}
--- a/tests/foreach-5.ispc
+++ b/tests/foreach-5.ispc
@@ -0,0 +1,17 @@
+
+// Reports programCount; presumably queried by the test harness.
+export uniform int width() { return programCount; }
+
+
+uniform int foo(int i);
+
+// foreach over an empty range (start == end == 1).  result() expects RET
+// to stay all zero, i.e. the body must never run.
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    for (uniform int i = 0; i < programCount; ++i)
+        RET[i] = 0;
+
+    foreach (i = 1 ... 1)
+        RET[i] = 1234;
+}
+
+// Expected output (presumably compared against f_f's RET by the harness):
+// all zeros.
+export void result(uniform float RET[]) {
+    RET[programIndex] = 0;
+}
--- a/tests/foreach-6.ispc
+++ b/tests/foreach-6.ispc
@@ -0,0 +1,17 @@
+
+// Reports programCount; presumably queried by the test harness.
+export uniform int width() { return programCount; }
+
+
+uniform int foo(int i);
+
+// foreach with a negative start value: i runs over [-2, programCount-2)
+// and 1234 is added to RET[i+2], so the first programCount elements of
+// RET should all end up as 1234.
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    for (uniform int i = 0; i < programCount; ++i)
+        RET[i] = 0;
+
+    foreach (i = -2 ... programCount-2)
+        RET[i+2] += 1234;
+}
+
+// Expected output (presumably compared against f_f's RET by the harness):
+// 1234 in every element.
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1234;
+}
--- a/tests/foreach-7.ispc
+++ b/tests/foreach-7.ispc
@@ -0,0 +1,13 @@
+
+// Reports programCount; presumably queried by the test harness.
+export uniform int width() { return programCount; }
+
+// foreach accumulating into a varying variable: aFOO[0..5] is summed
+// into 'sum', which is then reduced across the program instances.
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    float sum = 0;
+    foreach (i = 0 ... 6)
+        sum += aFOO[i];
+    RET[programIndex] = reduce_add(sum);
+}
+
+// Expected output (presumably compared against f_f's RET by the harness).
+// NOTE(review): 21 == 1+2+...+6, which matches f_f only if
+// aFOO[i] == i+1 -- confirm.
+export void result(uniform float RET[]) {
+    RET[programIndex] = 21;
+}
--- a/tests/foreach-8.ispc
+++ b/tests/foreach-8.ispc
@@ -0,0 +1,23 @@
+
+// Reports programCount; presumably queried by the test harness.
+export uniform int width() { return programCount; }
+
+// 2D foreach compared against a hand-written equivalent: sum1 is
+// accumulated by a foreach over x in [0,10) and i in [0,6); sum2 by
+// explicit loops that step i by programCount and mask off indices >= 6.
+// The per-instance difference is written to RET.
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    float sum1 = 0, sum2 = 0;
+    // The body only reads aFOO[i]; x is not used in the body.
+    foreach (x = 0 ... 10, i = 0 ... 6) {
+        sum1 += aFOO[i];
+    }
+
+    // Reference implementation using ordinary loops.
+    for (uniform int x = 0; x < 10; ++x) {
+        for (uniform int i = 0; i < 6; i += programCount) {
+            int index = i + programIndex;
+            if (index < 6)
+                sum2 += aFOO[index];
+        }
+    }
+
+    RET[programIndex] = sum1 - sum2;
+}
+
+// Expected output (presumably compared against f_f's RET by the harness):
+// no difference between the foreach sum and the reference sum.
+export void result(uniform float RET[]) {
+    RET[programIndex] = 0;
+}
--- a/tests/foreach-9.ispc
+++ b/tests/foreach-9.ispc
@@ -0,0 +1,29 @@
+
+// Reports programCount; presumably queried by the test harness.
+export uniform int width() { return programCount; }
+
+#define NA 1
+#define NB 3
+
+// 2D foreach over an NA x NB array: every element is zeroed, incremented
+// by the foreach body, and then checked to be exactly 1.  The number of
+// mismatching elements is written to RET.
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    uniform int a[NA][NB];
+
+    for (uniform int i = 0; i < NA; ++i)
+        for (uniform int j = 0; j < NB; ++j)
+            a[i][j] = 0;
+
+    foreach (i = 0 ... NA, j = 0 ... NB) {
+        a[i][j] += 1;
+    }
+
+    // Count elements that were skipped or visited more than once.
+    uniform int errs = 0;
+    for (uniform int i = 0; i < NA; ++i)
+        for (uniform int j = 0; j < NB; ++j)
+            if (a[i][j] != 1)
+                ++errs;
+
+    RET[programIndex] = errs;
+}
+
+// Expected output (presumably compared against f_f's RET by the harness):
+// zero mismatching elements.
+export void result(uniform float RET[]) {
+    RET[programIndex] = 0;
+}