From b4a078e2f68c1be7f41bd712c4d3ab631f69b973 Mon Sep 17 00:00:00 2001
From: Matt Pharr <matt.pharr@intel.com>
Date: Fri, 22 Jun 2012 10:35:43 -0700
Subject: [PATCH] Add foreach_active iteration statement.

Issue #298.
---
 ast.cpp                     |  17 +-
 ctx.cpp                     |  29 ++-
 ctx.h                       |   6 +-
 docs/ispc.rst               | 376 ++++++++++++++++++++++--------------
 docs/perfguide.rst          |  38 ++++
 lex.ll                      |   8 +-
 parse.yy                    |   8 +-
 stdlib.ispc                 |  74 +++----
 stmt.cpp                    | 259 +++++++++++++++++--------
 stmt.h                      |  17 ++
 tests/foreach-active-1.ispc |  16 ++
 tests/foreach-active-2.ispc |  16 ++
 tests/foreach-active-3.ispc |  16 ++
 tests/foreach-active-4.ispc |  21 ++
 tests/foreach-active-5.ispc |  22 +++
 15 files changed, 644 insertions(+), 279 deletions(-)
 create mode 100644 tests/foreach-active-1.ispc
 create mode 100644 tests/foreach-active-2.ispc
 create mode 100644 tests/foreach-active-3.ispc
 create mode 100644 tests/foreach-active-4.ispc
 create mode 100644 tests/foreach-active-5.ispc
diff --git a/ast.cpp b/ast.cpp
index 06ccc1a9..66b597d5 100644
--- a/ast.cpp
+++ b/ast.cpp
@@ -92,6 +92,7 @@ WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc,
         DoStmt *dos;
         ForStmt *fs;
         ForeachStmt *fes;
+        ForeachActiveStmt *fas;
         ForeachUniqueStmt *fus;
         CaseStmt *cs;
         DefaultStmt *defs;
@@ -138,6 +139,9 @@ WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc,
                                                    postFunc, data);
             fes->stmts = (Stmt *)WalkAST(fes->stmts, preFunc, postFunc, data);
         }
+        else if ((fas = dynamic_cast<ForeachActiveStmt *>(node)) != NULL) {
+            fas->stmts = (Stmt *)WalkAST(fas->stmts, preFunc, postFunc, data);
+        }
         else if ((fus = dynamic_cast<ForeachUniqueStmt *>(node)) != NULL) {
             fus->expr = (Expr *)WalkAST(fus->expr, preFunc, postFunc, data);
             fus->stmts = (Stmt *)WalkAST(fus->stmts, preFunc, postFunc, data);
@@ -391,14 +395,15 @@ lCheckAllOffSafety(ASTNode *node, void *data) {
     }
 
     if (dynamic_cast<ForeachStmt *>(node) != NULL ||
+        dynamic_cast<ForeachActiveStmt *>(node) != NULL ||
         dynamic_cast<ForeachUniqueStmt *>(node) != NULL) {
-        // foreach() statements also shouldn't be run with an all-off mask.
-        // Since they re-establish an 'all on' mask, this would be pretty
-        // unintuitive.  (More generally, it's possibly a little strange to
-        // allow foreach() in the presence of any non-uniform control
-        // flow...)
+        // The various foreach statements also shouldn't be run with an
+        // all-off mask.  Since they can re-establish an 'all on' mask,
+        // this would be pretty unintuitive.  (More generally, it's
+        // possibly a little strange to allow foreach in the presence of
+        // any non-uniform control flow...)
         //
-        // Similarly, the implementation foreach_unique assumes as a
+        // Similarly, the implementation of foreach_unique assumes as a
         // precondition that the mask won't be all off going into it, so
         // we'll enforce that here...
         *okPtr = false;
diff --git a/ctx.cpp b/ctx.cpp
index 194ca30b..75c05769 100644
--- a/ctx.cpp
+++ b/ctx.cpp
@@ -89,12 +89,14 @@ struct CFInfo {
     bool IsIf() { return type == If; }
     bool IsLoop() { return type == Loop; }
     bool IsForeach() { return (type == ForeachRegular ||
+                               type == ForeachActive ||
                                type == ForeachUnique); }
     bool IsSwitch() { return type == Switch; }
     bool IsVarying() { return !isUniform; }
     bool IsUniform() { return isUniform; }
 
-    enum CFType { If, Loop, ForeachRegular, ForeachUnique, Switch };
+    enum CFType { If, Loop, ForeachRegular, ForeachActive, ForeachUnique, 
+                  Switch };
     CFType type;
     bool isUniform;
     llvm::BasicBlock *savedBreakTarget, *savedContinueTarget;
@@ -143,7 +145,7 @@ private:
     CFInfo(CFType t, llvm::BasicBlock *bt, llvm::BasicBlock *ct,
            llvm::Value *sb, llvm::Value *sc, llvm::Value *sm,
            llvm::Value *lm) {
-        Assert(t == ForeachRegular || t == ForeachUnique);
+        Assert(t == ForeachRegular || t == ForeachActive || t == ForeachUnique);
         type = t;
         isUniform = false;
         savedBreakTarget = bt;
@@ -190,6 +192,9 @@ CFInfo::GetForeach(FunctionEmitContext::ForeachType ft,
     case FunctionEmitContext::FOREACH_REGULAR:
         cfType = ForeachRegular;
         break;
+    case FunctionEmitContext::FOREACH_ACTIVE:
+        cfType = ForeachActive;
+        break;
     case FunctionEmitContext::FOREACH_UNIQUE:
         cfType = ForeachUnique;
         break;
@@ -744,6 +749,16 @@ FunctionEmitContext::Break(bool doCoherenceCheck) {
 }
 
 
+static bool
+lEnclosingLoopIsForeachActive(const std::vector<CFInfo *> &controlFlowInfo) {
+    for (int i = (int)controlFlowInfo.size() - 1; i >= 0; --i) {
+        if (controlFlowInfo[i]->type == CFInfo::ForeachActive)
+            return true;
+    }
+    return false;
+}
+
+
 void
 FunctionEmitContext::Continue(bool doCoherenceCheck) {
     if (!continueTarget) {
@@ -753,12 +768,16 @@ FunctionEmitContext::Continue(bool doCoherenceCheck) {
     }
     AssertPos(currentPos, controlFlowInfo.size() > 0);
 
-    if (ifsInCFAllUniform(CFInfo::Loop)) {
+    if (ifsInCFAllUniform(CFInfo::Loop) ||
+        lEnclosingLoopIsForeachActive(controlFlowInfo)) {
         // Similarly to 'break' statements, we can immediately jump to the
         // continue target if we're only in 'uniform' control flow within
-        // loop or if we can tell that the mask is all on.
+        // loop or if we can tell that the mask is all on.  Here, we can
+        // also jump if the enclosing loop is a 'foreach_active' loop, in
+        // which case we know that only a single program instance is
+        // executing.
         AddInstrumentationPoint("continue: uniform CF, jumped");
-        if (ifsInCFAllUniform(CFInfo::Loop) && doCoherenceCheck)
+        if (doCoherenceCheck)
             Warning(currentPos, "Coherent continue statement not necessary in "
                     "fully uniform control flow.");
         BranchInst(continueTarget);
diff --git a/ctx.h b/ctx.h
index a37842af..59975c01 100644
--- a/ctx.h
+++ b/ctx.h
@@ -160,9 +160,9 @@ public:
         finished. */
     void EndLoop();
 
-    /** Indicates that code generation for a 'foreach', 'foreach_tiled', or
-        'foreach_unique' loop is about to start. */
-    enum ForeachType { FOREACH_REGULAR, FOREACH_UNIQUE };
+    /** Indicates that code generation for a 'foreach', 'foreach_tiled',
+        'foreach_active', or 'foreach_unique' loop is about to start. */
+    enum ForeachType { FOREACH_REGULAR, FOREACH_ACTIVE, FOREACH_UNIQUE };
     void StartForeach(ForeachType ft);
     void EndForeach();
 
diff --git a/docs/ispc.rst b/docs/ispc.rst
index 7ab01571..58c58d6a 100644
--- a/docs/ispc.rst
+++ b/docs/ispc.rst
@@ -105,12 +105,16 @@ Contents:
 
     * `Conditional Statements: "if"`_
     * `Conditional Statements: "switch"`_
-    * `Basic Iteration Statements: "for", "while", and "do"`_
-    * `Iteration over unique elements: "foreach_unique"`_
+    * `Iteration Statements`_
+
+      + `Basic Iteration Statements: "for", "while", and "do"`_
+      + `Iteration over active program instances: "foreach_active"`_
+      + `Iteration over unique elements: "foreach_unique"`_
+      + `Parallel Iteration Statements: "foreach" and "foreach_tiled"`_
+      + `Parallel Iteration with "programIndex" and "programCount"`_
+
     * `Unstructured Control Flow: "goto"`_
     * `"Coherent" Control Flow Statements: "cif" and Friends`_
-    * `Parallel Iteration Statements: "foreach" and "foreach_tiled"`_
-    * `Parallel Iteration with "programIndex" and "programCount"`_
     * `Functions and Function Calls`_
 
       + `Function Overloading`_
@@ -1984,7 +1988,7 @@ format in memory; the benefits from SOA layout are discussed in more detail
 in the `Use "Structure of Arrays" Layout When Possible`_ section in the
 ispc Performance Guide.
 
-.. _Use "Structure of Arrays" Layout When Possible: perf.html#use-structure-of-arrays-layout-when-possible
+.. _Use "Structure of Arrays" Layout When Possible: perfguide.html#use-structure-of-arrays-layout-when-possible
 
 ``ispc`` provides two key language-level capabilities for laying out and
 accessing data in SOA format:
@@ -2348,11 +2352,19 @@ code below.
         x *= x;
     }
 
+
+Iteration Statements
+--------------------
+
+In addition to the standard iteration statements ``for``, ``while``, and
+``do``, inherited from C/C++, ``ispc`` provides a number of additional
+specialized ways to iterate over data.
+
 Basic Iteration Statements: "for", "while", and "do"
 ----------------------------------------------------
 
 ``ispc`` supports ``for``, ``while``, and ``do`` loops, with the same
-specification as in C.  Like C++, variables can be declared in the ``for``
+specification as in C.  As in C++, variables can be declared in the ``for``
 statement itself:
 
 ::
@@ -2374,6 +2386,58 @@ executing code in the loop body that didn't execute the ``continue`` will
 be unaffected by it.
 
 
+Iteration over active program instances: "foreach_active"
+---------------------------------------------------------
+
+The ``foreach_active`` construct specifies a loop that serializes over the
+active program instances: the loop body executes once for each active
+program instance, and with only that program instance executing.  
+
+As an example of the use of this construct, consider an application where
+each program instance independently computes an offset into a shared array
+that is being updated:
+
+::
+
+    uniform float array[...] = { ... };    
+    int index = ...;
+    ++array[index];
+
+If more than one active program instance computes the same value for
+``index``, the above code has undefined behavior (see the section `Data
+Races Within a Gang`_ for details.)  The increment of ``array[index]``
+could instead be written inside a ``foreach_active`` statement:
+
+::
+
+    foreach_active (i) {
+        ++array[index];
+    }  
+
+
+The variable name provided in parenthesis after the ``foreach_active``
+keyword (here, ``index``), causes a ``const uniform int64`` local variable
+of that name to be declared, where the variable takes the ``programIndex``
+value of the program instance executing at each loop iteraton.  
+
+In the code above, because only one program instance is executing at a time
+when the loop body executes, the update to ``array`` is well-defined.
+Note that for this particular example, the "local atomic" operations in
+the standard library could be used instead to safely update ``array``.
+However, local atomics functions aren't always available or appropriate for
+more complex cases.)
+
+``continue`` statements may be used inside ``foreach_active`` loops, though
+``break`` and ``return`` are prohibited.  The order in which the active
+program instances are processed in the loop is not defined.
+
+See the `Using "foreach_active" Effectively`_ Section in the ispc
+Performance Guide for more details about ``foreach_active``.
+
+.. _Using "foreach_active" Effectively: perfguide.html#using-foreach-active-effectively
+
+
+
 Iteration over unique elements: "foreach_unique"
 ------------------------------------------------
 
@@ -2408,7 +2472,144 @@ evaluated once, and it must be of an atomic type (``float``, ``int``,
 etc.), an ``enum`` type, or a pointer type.  The iteration variable ``val``
 is a variable of ``const uniform`` type of the iteration type; it can't be
 modified within the loop.  Finally, ``break`` and ``return`` statements are
-illegal within the loop body, but ``continue`` statemetns are allowed.
+illegal within the loop body, but ``continue`` statements are allowed.
+
+
+Parallel Iteration Statements: "foreach" and "foreach_tiled"
+------------------------------------------------------------
+
+The ``foreach`` and ``foreach_tiled`` constructs specify loops over a
+possibly multi-dimensional domain of integer ranges.  Their role goes
+beyond "syntactic sugar"; they provides one of the two key ways of
+expressing parallel computation in ``ispc``.
+
+In general, a ``foreach`` or ``foreach_tiled`` statement takes one or more
+dimension specifiers separated by commas, where each dimension is specified
+by ``identifier = start ... end``, where ``start`` is a signed integer
+value less than or equal to ``end``, specifying iteration over all integer
+values from ``start`` up to and including ``end-1``.  An arbitrary number
+of iteration dimensions may be specified, with each one spanning a
+different range of values.  Within the ``foreach`` loop, the given
+identifiers are available as ``const varying int32`` variables.  The
+execution mask starts out "all on" at the start of each ``foreach`` loop
+iteration, but may be changed by control flow constructs within the loop.
+
+It is illegal to have a ``break`` statement or a ``return`` statement
+within a ``foreach`` loop; a compile-time error will be issued in this
+case.  (It is legal to have a ``break`` in a regular ``for`` loop that's
+nested inside a ``foreach`` loop.)  ``continue`` statements are legal in
+``foreach`` loops; they have the same effect as in regular ``for`` loops:
+a program instances that executes a ``continue`` statement effectively
+skips over the rest of the loop body for the current iteration.
+
+It is also currently illegal to have nested ``foreach`` statements; this
+limitation will be removed in a future release of ``ispc``.
+
+As a specific example, consider the following ``foreach`` statement:
+
+::
+
+    foreach (j = 0 ... height, i = 0 ... width) {
+        // loop body--process data element (i,j)
+    }
+
+It specifies a loop over a 2D domain, where the ``j`` variable goes from 0
+to ``height-1`` and ``i`` goes from 0 to ``width-1``.  Within the loop, the
+variables ``i`` and ``j`` are available and initialized accordingly.
+
+``foreach`` loops actually cause the given iteration domain to be
+automatically mapped to the program instances in the gang, so that all of
+the data can be processed, in gang-sized chunks.  As a specific example,
+consider a simple ``foreach`` loop like the following, on a target where
+the gang size is 8:
+
+::
+
+    foreach (i = 0 ... 16) {
+        // perform computation on element i
+    }
+
+One possible valid execution path of this loop would be for the program
+counter the step through the statements of this loop just ``16/8==2``
+times; the first time through, with the ``varying int32`` variable ``i``
+having the values (0,1,2,3,4,5,6,7) over the program instances, and the
+second time through, having the values (8,9,10,11,12,13,14,15), thus
+mapping the available program instances to all of the data by the end of
+the loop's execution.  
+
+In general, however, you shouldn't make any assumptions about the order in
+which elements of the iteration domain will be processed by a ``foreach``
+loop.  For example, the following code exhibits undefined behavior:
+
+::
+
+    uniform float a[10][100];
+    foreach (i = 0 ... 10, j = 0 ... 100) {
+        if (i == 0)
+            a[i][j] = j;
+        else
+            // Error: can't assume that a[i-1][j] has been set yet
+            a[i][j] = a[i-1][j];
+
+The ``foreach`` statement generally subdivides the iteration domain by
+selecting sets of contiguous elements in the inner-most dimension of the
+iteration domain.  This decomposition approach generally leads to coherent
+memory reads and writes, but may lead to worse control flow coherence than
+other decompositions.
+
+Therefore, ``foreach_tiled`` decomposes the iteration domain in a way that
+tries to map locations in the domain to program instances in a way that is
+compact across all of the dimensions.  For example, on a target with an
+8-wide gang size, the following ``foreach_tiled`` statement might process
+the iteration domain in chunks of 2 elements in ``j`` and 4 elements in
+``i`` each time.  (The trade-offs between these two constructs are
+discussed in more detail in the `ispc Performance Guide`_.)
+
+.. _ispc Performance Guide: perfguide.html#improving-control-flow-coherence-with-foreach-tiled
+
+::
+
+    foreach_tiled (j = 0 ... height, i = 0 ... width) {
+        // loop body--process data element (i,j)
+    }
+
+
+Parallel Iteration with "programIndex" and "programCount"
+---------------------------------------------------------
+
+In addition to ``foreach`` and ``foreach_tiled``, ``ispc`` provides a
+lower-level mechanism for mapping SPMD program instances to data to operate
+on via the built-in ``programIndex`` and ``programCount`` variables.
+
+``programIndex`` gives the index of the SIMD-lane being used for running
+each program instance.  (In other words, it's a varying integer value that
+has value zero for the first program instance, and so forth.)  The
+``programCount`` builtin gives the total number of instances in the gang.
+Together, these can be used to uniquely map executing program instances to
+input data. [#]_
+
+.. [#] ``programIndex`` is analogous to ``get_global_id()`` in OpenCL* and
+       ``threadIdx`` in CUDA*.
+
+As a specific example, consider an ``ispc`` function that needs to perform
+some computation on an array of data.
+
+::
+
+    for (uniform int i = 0; i < count; i += programCount) {
+        float d = data[i + programIndex];
+        float r = ....
+        result[i + programIndex] = r;
+    }
+
+Here, we've written a loop that explicitly loops over the data in chunks of
+``programCount`` elements.  In each loop iteration, the running program
+instances effectively collude amongst themselves using ``programIndex`` to
+determine which elements to work on in a way that ensures that all of the
+data elements will be processed.  In this particular case, a ``foreach``
+loop would be preferable, as ``foreach`` naturally handles the case where
+``programCount`` doesn't evenly divide the number of elements to be
+processed, while the loop above assumes that case implicitly. 
 
 
 Unstructured Control Flow: "goto"
@@ -2479,139 +2680,6 @@ constructs in ``ispc`` that a loop will never be executed with an "all off"
 execution mask.
 
 
-Parallel Iteration Statements: "foreach" and "foreach_tiled"
-------------------------------------------------------------
-
-The ``foreach`` and ``foreach_tiled`` constructs specify loops over a
-possibly multi-dimensional domain of integer ranges.  Their role goes
-beyond "syntactic sugar"; they provides one of the two key ways of
-expressing parallel computation in ``ispc``.
-
-In general, a ``foreach`` or ``foreach_tiled`` statement takes one or more
-dimension specifiers separated by commas, where each dimension is specified
-by ``identifier = start ... end``, where ``start`` is a signed integer
-value less than or equal to ``end``, specifying iteration over all integer
-values from ``start`` up to and including ``end-1``.  An arbitrary number
-of iteration dimensions may be specified, with each one spanning a
-different range of values.  Within the ``foreach`` loop, the given
-identifiers are available as ``const varying int32`` variables.  The
-execution mask starts out "all on" at the start of each ``foreach`` loop
-iteration, but may be changed by control flow constructs within the loop.
-
-It is illegal to have a ``break`` statement or a ``return`` statement
-within a ``foreach`` loop; a compile-time error will be issued in this
-case.  (It is legal to have a ``break`` in a regular ``for`` loop that's
-nested inside a ``foreach`` loop.)  ``continue`` statements are legal in
-``foreach`` loops; they have the same effect as in regular ``for`` loops:
-a program instances that executes a ``continue`` statement effectively
-skips over the rest of the loop body for the current iteration.
-
-As a specific example, consider the following ``foreach`` statement:
-
-::
-
-    foreach (j = 0 ... height, i = 0 ... width) {
-        // loop body--process data element (i,j)
-    }
-
-It specifies a loop over a 2D domain, where the ``j`` variable goes from 0
-to ``height-1`` and ``i`` goes from 0 to ``width-1``.  Within the loop, the
-variables ``i`` and ``j`` are available and initialized accordingly.
-
-``foreach`` loops actually cause the given iteration domain to be
-automatically mapped to the program instances in the gang, so that all of
-the data can be processed, in gang-sized chunks.  As a specific example,
-consider a simple ``foreach`` loop like the following, on a target where
-the gang size is 8:
-
-::
-
-    foreach (i = 0 ... 16) {
-        // perform computation on element i
-    }
-
-One possible valid execution path of this loop would be for the program
-counter the step through the statements of this loop just ``16/8==2``
-times; the first time through, with the ``varying int32`` variable ``i``
-having the values (0,1,2,3,4,5,6,7) over the program instances, and the
-second time through, having the values (8,9,10,11,12,13,14,15), thus
-mapping the available program instances to all of the data by the end of
-the loop's execution.  
-
-In general, however, you shouldn't make any assumptions about the order in
-which elements of the iteration domain will be processed by a ``foreach``
-loop.  For example, the following code exhibits undefined behavior:
-
-::
-
-    uniform float a[10][100];
-    foreach (i = 0 ... 10, j = 0 ... 100) {
-        if (i == 0)
-            a[i][j] = j;
-        else
-            // Error: can't assume that a[i-1][j] has been set yet
-            a[i][j] = a[i-1][j];
-
-The ``foreach`` statement generally subdivides the iteration domain by
-selecting sets of contiguous elements in the inner-most dimension of the
-iteration domain.  This decomposition approach generally leads to coherent
-memory reads and writes, but may lead to worse control flow coherence than
-other decompositions.
-
-Therefore, ``foreach_tiled`` decomposes the iteration domain in a way that
-tries to map locations in the domain to program instances in a way that is
-compact across all of the dimensions.  For example, on a target with an
-8-wide gang size, the following ``foreach_tiled`` statement might process
-the iteration domain in chunks of 2 elements in ``j`` and 4 elements in
-``i`` each time.  (The trade-offs between these two constructs are
-discussed in more detail in the `ispc Performance Guide`_.)
-
-.. _ispc Performance Guide: perf.html#improving-control-flow-coherence-with-foreach-tiled
-
-::
-
-    foreach_tiled (j = 0 ... height, i = 0 ... width) {
-        // loop body--process data element (i,j)
-    }
-
-
-Parallel Iteration with "programIndex" and "programCount"
----------------------------------------------------------
-
-In addition to ``foreach`` and ``foreach_tiled``, ``ispc`` provides a
-lower-level mechanism for mapping SPMD program instances to data to operate
-on via the built-in ``programIndex`` and ``programCount`` variables.
-
-``programIndex`` gives the index of the SIMD-lane being used for running
-each program instance.  (In other words, it's a varying integer value that
-has value zero for the first program instance, and so forth.)  The
-``programCount`` builtin gives the total number of instances in the gang.
-Together, these can be used to uniquely map executing program instances to
-input data. [#]_
-
-.. [#] ``programIndex`` is analogous to ``get_global_id()`` in OpenCL* and
-       ``threadIdx`` in CUDA*.
-
-As a specific example, consider an ``ispc`` function that needs to perform
-some computation on an array of data.
-
-::
-
-    for (uniform int i = 0; i < count; i += programCount) {
-        float d = data[i + programIndex];
-        float r = ....
-        result[i + programIndex] = r;
-    }
-
-Here, we've written a loop that explicitly loops over the data in chunks of
-``programCount`` elements.  In each loop iteration, the running program
-instances effectively collude amongst themselves using ``programIndex`` to
-determine which elements to work on in a way that ensures that all of the
-data elements will be processed.  In this particular case, a ``foreach``
-loop would be preferable, as ``foreach`` naturally handles the case where
-``programCount`` doesn't evenly divide the number of elements to be
-processed, while the loop above assumes that case implicitly. 
-
 Functions and Function Calls
 ----------------------------
 
@@ -3452,7 +3520,7 @@ There are also variants of these functions that return the value as a
 discussion of an application of this variant to improve memory access
 performance in the `Performance Guide`_.
 
-.. _Performance Guide: perf.html#understanding-gather-and-scatter
+.. _Performance Guide: perfguide.html#understanding-gather-and-scatter
 
 ::
 
@@ -4130,8 +4198,10 @@ from ``ispc`` must be declared as follows:
 It is illegal to overload functions declared with ``extern "C"`` linkage;
 ``ispc`` issues an error in this case.
 
-Function calls back to C/C++ are not made if none of the program instances
-want to make the call.  For example, given code like:
+**Only a single function call is made back to C++ for the entire gang of
+runing program instances**.  Furthermore, function calls back to C/C++ are not
+made if none of the program instances want to make the call.  For example,
+given code like:
 
 ::
 
@@ -4174,6 +4244,24 @@ Application code can thus be written as:
             }
     }
 
+In some cases, it can be desirable to generate a single call for each
+executing program instance, rather than one call for a gang.  For example,
+the code below shows how one might call an existing math library routine
+that takes a scalar parameter.
+
+::
+
+    extern "C" uniform double erf(uniform double);
+    double v = ...;
+    double result;
+    foreach_active (instance) {
+        uniform double r = erf(extract(v, instance));
+        result = insert(result, instance, r);
+    }
+
+This code calls ``erf()`` once for each active program instance, passing it
+the program instance's value of ``v`` and storing the result in the
+instance's ``result`` value.
 
 Data Layout
 -----------
@@ -4309,7 +4397,7 @@ can also have a significant effect on performance; in general, creating
 groups of work that will tend to do similar computation across the SPMD
 program instances improves performance.
 
-.. _ispc Performance Tuning Guide: http://ispc.github.com/perf.html
+.. _ispc Performance Tuning Guide: http://ispc.github.com/perfguide.html
 
 
 Disclaimer and Legal Information
diff --git a/docs/perfguide.rst b/docs/perfguide.rst
index b8e65893..b1d110b2 100644
--- a/docs/perfguide.rst
+++ b/docs/perfguide.rst
@@ -21,6 +21,7 @@ the most out of ``ispc`` in practice.
   + `Avoid 64-bit Addressing Calculations When Possible`_
   + `Avoid Computation With 8 and 16-bit Integer Types`_
   + `Implementing Reductions Efficiently`_
+  + `Using "foreach_active" Effectively`_
   + `Using Low-level Vector Tricks`_
   + `The "Fast math" Option`_
   + `"inline" Aggressively`_
@@ -510,6 +511,43 @@ values--very efficient code in the end.
         return reduce_add(sum);
     } 
 
+Using "foreach_active" Effectively
+----------------------------------
+
+For high-performance code,
+
+For example, consider this segment of code, from the introduction of
+``foreach_active`` in the ispc User's Guide:
+
+::
+
+    uniform float array[...] = { ... };    
+    int index = ...;
+    foreach_active (i) {
+        ++array[index];
+    }  
+
+Here, ``index`` was assumed to possibly have the same value for multiple
+program instances, so the updates to ``array[index]`` are serialized by the
+``foreach_active`` statement in order to not have undefined results when
+``index`` values do collide.
+
+The code generated by the compiler can be improved  in this case by making
+it clear that only a single element of the array is accessed by
+``array[index]`` and that thus a general gather or scatter isn't required.
+Specifically, by using the ``extract()`` function from the standard library
+to extract the current program instance's value of ``index`` into a
+``uniform`` variable and then using that to index into ``array``, as below,
+more efficient code is generated.
+
+::
+
+    foreach_active (instanceNum) {
+        uniform int unifIndex = extract(index, instanceNum);
+        ++array[unifIndex];
+    }
+
+
 Using Low-level Vector Tricks
 -----------------------------
 
diff --git a/lex.ll b/lex.ll
index d55ce930..9bf768e4 100644
--- a/lex.ll
+++ b/lex.ll
@@ -66,8 +66,8 @@ static int allTokens[] = {
   TOKEN_CONST, TOKEN_CONTINUE, TOKEN_CRETURN, TOKEN_DEFAULT, TOKEN_DO,
   TOKEN_DELETE, TOKEN_DOUBLE, TOKEN_ELSE, TOKEN_ENUM,
   TOKEN_EXPORT, TOKEN_EXTERN, TOKEN_FALSE, TOKEN_FLOAT, TOKEN_FOR,
-  TOKEN_FOREACH, TOKEN_FOREACH_TILED, TOKEN_FOREACH_UNIQUE,
-  TOKEN_GOTO, TOKEN_IF, TOKEN_IN, TOKEN_INLINE,
+  TOKEN_FOREACH, TOKEN_FOREACH_ACTIVE, TOKEN_FOREACH_TILED,
+  TOKEN_FOREACH_UNIQUE, TOKEN_GOTO, TOKEN_IF, TOKEN_IN, TOKEN_INLINE,
   TOKEN_INT, TOKEN_INT8, TOKEN_INT16, TOKEN_INT, TOKEN_INT64, TOKEN_LAUNCH,
   TOKEN_NEW, TOKEN_NULL, TOKEN_PRINT, TOKEN_RETURN, TOKEN_SOA, TOKEN_SIGNED,
   TOKEN_SIZEOF, TOKEN_STATIC, TOKEN_STRUCT, TOKEN_SWITCH, TOKEN_SYNC,
@@ -115,6 +115,7 @@ void ParserInit() {
     tokenToName[TOKEN_FLOAT] = "float";
     tokenToName[TOKEN_FOR] = "for";
     tokenToName[TOKEN_FOREACH] = "foreach";
+    tokenToName[TOKEN_FOREACH_ACTIVE] = "foreach_active";
     tokenToName[TOKEN_FOREACH_TILED] = "foreach_tiled";
     tokenToName[TOKEN_FOREACH_UNIQUE] = "foreach_unique";
     tokenToName[TOKEN_GOTO] = "goto";
@@ -226,6 +227,7 @@ void ParserInit() {
     tokenNameRemap["TOKEN_FLOAT"] = "\'float\'";
     tokenNameRemap["TOKEN_FOR"] = "\'for\'";
     tokenNameRemap["TOKEN_FOREACH"] = "\'foreach\'";
+    tokenNameRemap["TOKEN_FOREACH_ACTIVE"] = "\'foreach_active\'";
     tokenNameRemap["TOKEN_FOREACH_TILED"] = "\'foreach_tiled\'";
     tokenNameRemap["TOKEN_FOREACH_UNIQUE"] = "\'foreach_unique\'";
     tokenNameRemap["TOKEN_GOTO"] = "\'goto\'";
@@ -369,8 +371,8 @@ extern { RT; return TOKEN_EXTERN; }
 false { RT; return TOKEN_FALSE; }
 float { RT; return TOKEN_FLOAT; }
 for { RT; return TOKEN_FOR; }
-__foreach_active { RT; return TOKEN_FOREACH_ACTIVE; }
 foreach { RT; return TOKEN_FOREACH; }
+foreach_active { RT; return TOKEN_FOREACH_ACTIVE; }
 foreach_tiled { RT; return TOKEN_FOREACH_TILED; }
 foreach_unique { RT; return TOKEN_FOREACH_UNIQUE; }
 goto { RT; return TOKEN_GOTO; }
diff --git a/parse.yy b/parse.yy
index 8200fb3d..7a9026e3 100644
--- a/parse.yy
+++ b/parse.yy
@@ -117,8 +117,8 @@ static const char *lBuiltinTokens[] = {
     "assert", "bool", "break", "case", "cbreak", "ccontinue", "cdo",
     "cfor", "cif", "cwhile", "const", "continue", "creturn", "default",
     "do", "delete", "double", "else", "enum", "export", "extern", "false",
-    "float", "for", "foreach", "foreach_tiled", "foreach_unique",
-    "goto", "if", "in", "inline",
+    "float", "for", "foreach", "foreach_active", "foreach_tiled",
+     "foreach_unique", "goto", "if", "in", "inline",
     "int", "int8", "int16", "int32", "int64", "launch", "new", "NULL",
     "print", "return", "signed", "sizeof", "static", "struct", "switch",
     "sync", "task", "true", "typedef", "uniform", "unmasked", "unsigned", 
@@ -1688,7 +1688,7 @@ foreach_active_scope
 foreach_active_identifier
     : TOKEN_IDENTIFIER
     {
-        $$ = new Symbol(yytext, @1, AtomicType::UniformInt32);
+        $$ = new Symbol(yytext, @1, AtomicType::UniformInt64->GetAsConstType());
     }
     ;
 
@@ -1838,7 +1838,7 @@ iteration_statement
      }
      statement
      {
-         $$ = CreateForeachActiveStmt($3, $6, Union(@1, @4));
+         $$ = new ForeachActiveStmt($3, $6, Union(@1, @4));
          m->symbolTable->PopScope();
      }
     | foreach_unique_scope '(' foreach_unique_identifier TOKEN_IN
diff --git a/stdlib.ispc b/stdlib.ispc
index 84865d90..a7499930 100644
--- a/stdlib.ispc
+++ b/stdlib.ispc
@@ -421,7 +421,7 @@ static inline void memcpy(void * varying dst, void * varying src,
     da[programIndex] = dst;
     sa[programIndex] = src;
 
-    __foreach_active (i) {
+    foreach_active (i) {
         void * uniform d = da[i], * uniform s = sa[i];
         __memcpy32((int8 * uniform)d, (int8 * uniform)s, extract(count, i));
     }
@@ -435,7 +435,7 @@ static inline void memcpy64(void * varying dst, void * varying src,
     da[programIndex] = dst;
     sa[programIndex] = src;
 
-    __foreach_active (i) {
+    foreach_active (i) {
         void * uniform d = da[i], * uniform s = sa[i];
         __memcpy64((int8 * uniform)d, (int8 * uniform)s, extract(count, i));
     }
@@ -459,7 +459,7 @@ static inline void memmove(void * varying dst, void * varying src,
     da[programIndex] = dst;
     sa[programIndex] = src;
 
-    __foreach_active (i) {
+    foreach_active (i) {
         void * uniform d = da[i], * uniform s = sa[i];
         __memmove32((int8 * uniform)d, (int8 * uniform)s, extract(count, i));
     }
@@ -473,7 +473,7 @@ static inline void memmove64(void * varying dst, void * varying src,
     da[programIndex] = dst;
     sa[programIndex] = src;
 
-    __foreach_active (i) {
+    foreach_active (i) {
         void * uniform d = da[i], * uniform s = sa[i];
         __memmove64((int8 * uniform)d, (int8 * uniform)s, extract(count, i));
     }
@@ -493,7 +493,7 @@ static inline void memset(void * varying ptr, int8 val, int32 count) {
     void * uniform pa[programCount];
     pa[programIndex] = ptr;
 
-    __foreach_active (i) {
+    foreach_active (i) {
         __memset32((int8 * uniform)pa[i], extract(val, i), extract(count, i));
     }
 }
@@ -502,7 +502,7 @@ static inline void memset64(void * varying ptr, int8 val, int64 count) {
     void * uniform pa[programCount];
     pa[programIndex] = ptr;
 
-    __foreach_active (i) {
+    foreach_active (i) {
         __memset64((int8 * uniform)pa[i], extract(val, i), extract(count, i));
     }
 }
@@ -711,7 +711,7 @@ static inline void prefetch_l1(const void * varying ptr) {
     const void * uniform ptrArray[programCount];
     ptrArray[programIndex] = ptr;
 
-    __foreach_active (i) {
+    foreach_active (i) {
         const void * uniform p = ptrArray[i];
         prefetch_l1(p);
     }
@@ -721,7 +721,7 @@ static inline void prefetch_l2(const void * varying ptr) {
     const void * uniform ptrArray[programCount];
     ptrArray[programIndex] = ptr;
 
-    __foreach_active (i) {
+    foreach_active (i) {
         const void * uniform p = ptrArray[i];
         prefetch_l2(p);
     }
@@ -731,7 +731,7 @@ static inline void prefetch_l3(const void * varying ptr) {
     const void * uniform ptrArray[programCount];
     ptrArray[programIndex] = ptr;
 
-    __foreach_active (i) {
+    foreach_active (i) {
         const void * uniform p = ptrArray[i];
         prefetch_l3(p);
     }
@@ -741,7 +741,7 @@ static inline void prefetch_nt(const void * varying ptr) {
     const void * uniform ptrArray[programCount];
     ptrArray[programIndex] = ptr;
 
-    __foreach_active (i) {
+    foreach_active (i) {
         const void * uniform p = ptrArray[i];
         prefetch_nt(p);
     }
@@ -1621,7 +1621,7 @@ static inline TA atomic_##OPA##_global(uniform TA * varying ptr, TA value) { \
     uniform TA * uniform ptrArray[programCount];                        \
     ptrArray[programIndex] = ptr;                                       \
     TA ret;                                                             \
-    __foreach_active (i) {                                              \
+    foreach_active (i) {                                              \
         uniform TA * uniform p = ptrArray[i];                           \
         uniform TA v = extract(value, i);                               \
         uniform TA r = __atomic_##OPB##_uniform_##TB##_global(p, v);    \
@@ -1672,7 +1672,7 @@ static inline TA atomic_swap_global(uniform TA * varying ptr, TA value) { \
     uniform TA * uniform ptrArray[programCount];                        \
     ptrArray[programIndex] = ptr;                                       \
     TA ret;                                                             \
-    __foreach_active (i) {                                              \
+    foreach_active (i) {                                              \
         uniform TA * uniform p = ptrArray[i];                           \
         uniform TA v = extract(value, i);                               \
         uniform TA r = __atomic_swap_uniform_##TB##_global(p, v);       \
@@ -1699,7 +1699,7 @@ static inline TA atomic_##OPA##_global(uniform TA * varying ptr,        \
     uniform TA * uniform ptrArray[programCount];                        \
     ptrArray[programIndex] = ptr;                                       \
     TA ret;                                                             \
-    __foreach_active (i) {                                              \
+    foreach_active (i) {                                              \
         uniform TA * uniform p = ptrArray[i];                           \
         uniform TA v = extract(value, i);                               \
         uniform TA r = __atomic_##OPB##_uniform_##TB##_global(p, v);    \
@@ -1774,7 +1774,7 @@ static inline TA atomic_compare_exchange_global(               \
     uniform TA * uniform ptrArray[programCount];                        \
     ptrArray[programIndex] = ptr;                                       \
     TA ret;                                                             \
-    __foreach_active (i) {                                              \
+    foreach_active (i) {                                              \
         uniform TA r =                                                  \
             __atomic_compare_exchange_uniform_##TB##_global(ptrArray[i], \
                                                             extract(oldval, i), \
@@ -1848,7 +1848,7 @@ static inline uniform TYPE atomic_##NAME##_local(uniform TYPE * uniform ptr, \
 }                                                                      \
 static inline TYPE atomic_##NAME##_local(uniform TYPE * uniform ptr, TYPE value) { \
     TYPE ret;                                                          \
-    __foreach_active (i) {                                             \
+    foreach_active (i) {                                             \
         ret = insert(ret, i, *ptr);                                    \
         *ptr = OPFUNC(*ptr, extract(value, i));                        \
     }                                                                  \
@@ -1858,7 +1858,7 @@ static inline TYPE atomic_##NAME##_local(uniform TYPE * p, TYPE value) {    \
     TYPE ret;                                                          \
     uniform TYPE * uniform ptrs[programCount];                         \
     ptrs[programIndex] = p;                                            \
-    __foreach_active (i) {                                             \
+    foreach_active (i) {                                             \
         ret = insert(ret, i, *ptrs[i]);                                \
         *ptrs[i] = OPFUNC(*ptrs[i], extract(value, i));                \
     }                                                                  \
@@ -1975,7 +1975,7 @@ static inline uniform TYPE atomic_compare_exchange_local(uniform TYPE * uniform
 static inline TYPE atomic_compare_exchange_local(uniform TYPE * uniform ptr, \
                                                  TYPE cmp, TYPE update) { \
     TYPE ret;                                                          \
-    __foreach_active (i) {                                             \
+    foreach_active (i) {                                             \
         uniform TYPE old = *ptr;                                       \
         if (old == extract(cmp, i))                                    \
             *ptr = extract(update, i);                                 \
@@ -1988,7 +1988,7 @@ static inline TYPE atomic_compare_exchange_local(uniform TYPE * varying p, \
     uniform TYPE * uniform ptrs[programCount];                          \
     ptrs[programIndex] = p;                                            \
     TYPE ret;                                                          \
-    __foreach_active (i) {                                             \
+    foreach_active (i) {                                             \
         uniform TYPE old = *ptrs[i];                                   \
         if (old == extract(cmp, i))                                    \
             *ptrs[i] = extract(update, i);                             \
@@ -2127,7 +2127,7 @@ static inline float sin(float x_full) {
     }
     else if (__math_lib == __math_lib_system) {
         float ret;
-        __foreach_active (i) {
+        foreach_active (i) {
             uniform float r = __stdlib_sinf(extract(x_full, i));
             ret = insert(ret, i, r);
         }
@@ -2259,7 +2259,7 @@ static inline float asin(float x) {
     if (__math_lib == __math_lib_svml ||
         __math_lib == __math_lib_system) {
         float ret;
-        __foreach_active (i) {
+        foreach_active (i) {
             uniform float r = __stdlib_asinf(extract(x, i));
             ret = insert(ret, i, r);
         }
@@ -2364,7 +2364,7 @@ static inline float cos(float x_full) {
     }
     else if (__math_lib == __math_lib_system) {
         float ret;
-        __foreach_active (i) {
+        foreach_active (i) {
             uniform float r = __stdlib_cosf(extract(x_full, i));
             ret = insert(ret, i, r);
         }
@@ -2502,7 +2502,7 @@ static inline void sincos(float x_full, varying float * uniform sin_result,
         __svml_sincos(x_full, sin_result, cos_result);
     }
     else if (__math_lib == __math_lib_system) {
-        __foreach_active (i) {
+        foreach_active (i) {
             uniform float s, c;
             __stdlib_sincosf(extract(x_full, i), &s, &c);
             *sin_result = insert(*sin_result, i, s);
@@ -2635,7 +2635,7 @@ static inline float tan(float x_full) {
     }
     else if (__math_lib == __math_lib_system) {
         float ret;
-        __foreach_active (i) {
+        foreach_active (i) {
             uniform float r = __stdlib_tanf(extract(x_full, i));
             ret = insert(ret, i, r);
         }
@@ -2786,7 +2786,7 @@ static inline float atan(float x_full) {
     }
     else if (__math_lib == __math_lib_system) {
         float ret;
-        __foreach_active (i) {
+        foreach_active (i) {
             uniform float r = __stdlib_atanf(extract(x_full, i));
             ret = insert(ret, i, r);
         }
@@ -2881,7 +2881,7 @@ static inline float atan2(float y, float x) {
     }
     else if (__math_lib == __math_lib_system) {
         float ret;
-        __foreach_active (i) {
+        foreach_active (i) {
             uniform float r = __stdlib_atan2f(extract(y, i), extract(x, i));
             ret = insert(ret, i, r);
         }
@@ -2944,7 +2944,7 @@ static inline float exp(float x_full) {
     }
     else if (__math_lib == __math_lib_system) {
         float ret;
-        __foreach_active (i) {
+        foreach_active (i) {
             uniform float r = __stdlib_expf(extract(x_full, i));
             ret = insert(ret, i, r);
         }
@@ -3151,7 +3151,7 @@ static inline float log(float x_full) {
     }
     else if (__math_lib == __math_lib_system) {
         float ret;
-        __foreach_active (i) {
+        foreach_active (i) {
             uniform float r = __stdlib_logf(extract(x_full, i));
             ret = insert(ret, i, r);
         }
@@ -3326,7 +3326,7 @@ static inline float pow(float a, float b) {
     }
     else if (__math_lib == __math_lib_system) {
         float ret;
-        __foreach_active (i) {
+        foreach_active (i) {
             uniform float r = __stdlib_powf(extract(a, i), extract(b, i));
             ret = insert(ret, i, r);
         }
@@ -3416,7 +3416,7 @@ static inline double sin(double x) {
         return sin((float)x);
     else {
         double ret;
-        __foreach_active (i) {
+        foreach_active (i) {
             uniform double r = __stdlib_sin(extract(x, i));
             ret = insert(ret, i, r);
         }
@@ -3438,7 +3438,7 @@ static inline double cos(double x) {
         return cos((float)x);
     else {
         double ret;
-        __foreach_active (i) {
+        foreach_active (i) {
             uniform double r = __stdlib_cos(extract(x, i));
             ret = insert(ret, i, r);
         }
@@ -3464,7 +3464,7 @@ static inline void sincos(double x, varying double * uniform sin_result,
         *cos_result = cr;
     }
     else {
-        __foreach_active (i) {
+        foreach_active (i) {
             uniform double sr, cr;
             __stdlib_sincos(extract(x, i), &sr, &cr);
             *sin_result = insert(*sin_result, i, sr);
@@ -3492,7 +3492,7 @@ static inline double tan(double x) {
         return tan((float)x);
     else {
         double ret;
-        __foreach_active (i) {
+        foreach_active (i) {
             uniform double r = __stdlib_tan(extract(x, i));
             ret = insert(ret, i, r);
         }
@@ -3514,7 +3514,7 @@ static inline double atan(double x) {
         return atan((float)x);
     else {
         double ret;
-        __foreach_active (i) {
+        foreach_active (i) {
             uniform double r = __stdlib_atan(extract(x, i));
             ret = insert(ret, i, r);
         }
@@ -3536,7 +3536,7 @@ static inline double atan2(double y, double x) {
         return atan2((float)y, (float)x);
     else {
         double ret;
-        __foreach_active (i) {
+        foreach_active (i) {
             uniform double r = __stdlib_atan2(extract(y, i), extract(x, i));
             ret = insert(ret, i, r);
         }
@@ -3558,7 +3558,7 @@ static inline double exp(double x) {
         return exp((float)x);
     else {
         double ret;
-        __foreach_active (i) {
+        foreach_active (i) {
             uniform double r = __stdlib_exp(extract(x, i));
             ret = insert(ret, i, r);
         }
@@ -3580,7 +3580,7 @@ static inline double log(double x) {
         return log((float)x);
     else {
         double ret;
-        __foreach_active (i) {
+        foreach_active (i) {
             uniform double r = __stdlib_log(extract(x, i));
             ret = insert(ret, i, r);
         }
@@ -3602,7 +3602,7 @@ static inline double pow(double a, double b) {
         return pow((float)a, (float)b);
     else {
         double ret;
-        __foreach_active (i) {
+        foreach_active (i) {
             uniform double r = __stdlib_pow(extract(a, i), extract(b, i));
             ret = insert(ret, i, r);
         }
diff --git a/stmt.cpp b/stmt.cpp
index e6622bb2..533cbac8 100644
--- a/stmt.cpp
+++ b/stmt.cpp
@@ -1915,6 +1915,188 @@ ForeachStmt::Print(int indent) const {
 }
 
 
+///////////////////////////////////////////////////////////////////////////
+// ForeachActiveStmt
+
+ForeachActiveStmt::ForeachActiveStmt(Symbol *s, Stmt *st, SourcePos pos) 
+    : Stmt(pos) {
+    sym = s;
+    stmts = st;
+}
+
+
+void
+ForeachActiveStmt::EmitCode(FunctionEmitContext *ctx) const {
+    if (!ctx->GetCurrentBasicBlock()) 
+        return;
+
+    // Allocate storage for the symbol that we'll use for the uniform
+    // variable that holds the current program instance in each loop
+    // iteration.
+    if (sym->type == NULL) {
+        Assert(m->errorCount > 0);
+        return;
+    }
+    Assert(Type::Equal(sym->type, 
+                       AtomicType::UniformInt64->GetAsConstType()));
+    sym->storagePtr = ctx->AllocaInst(LLVMTypes::Int64Type, sym->name.c_str());
+
+    ctx->SetDebugPos(pos);
+    ctx->EmitVariableDebugInfo(sym);
+
+    // The various basic blocks that we'll need in the below
+    llvm::BasicBlock *bbFindNext = 
+        ctx->CreateBasicBlock("foreach_active_find_next");
+    llvm::BasicBlock *bbBody = ctx->CreateBasicBlock("foreach_active_body");
+    llvm::BasicBlock *bbCheckForMore = 
+        ctx->CreateBasicBlock("foreach_active_check_for_more");
+    llvm::BasicBlock *bbDone = ctx->CreateBasicBlock("foreach_active_done");
+
+    // Save the old mask so that we can restore it at the end
+    llvm::Value *oldInternalMask = ctx->GetInternalMask();
+    
+    // Now, *maskBitsPtr will maintain a bitmask for the lanes that remain
+    // to be processed by a pass through the loop body.  It starts out with
+    // the current execution mask (which should never be all off going in
+    // to this)...
+    llvm::Value *oldFullMask = ctx->GetFullMask();
+    llvm::Value *maskBitsPtr = 
+        ctx->AllocaInst(LLVMTypes::Int64Type, "mask_bits");
+    llvm::Value *movmsk = ctx->LaneMask(oldFullMask);
+    ctx->StoreInst(movmsk, maskBitsPtr);
+
+    // Officially start the loop.
+    ctx->StartScope();
+    ctx->StartForeach(FunctionEmitContext::FOREACH_ACTIVE);
+    ctx->SetContinueTarget(bbCheckForMore);
+
+    // Onward to find the first set of program instance to run the loop for
+    ctx->BranchInst(bbFindNext);
+    
+    ctx->SetCurrentBasicBlock(bbFindNext); {
+        // Load the bitmask of the lanes left to be processed
+        llvm::Value *remainingBits = ctx->LoadInst(maskBitsPtr, "remaining_bits");
+
+        // Find the index of the first set bit in the mask
+        llvm::Function *ctlzFunc = 
+            m->module->getFunction("__count_trailing_zeros_i64");
+        Assert(ctlzFunc != NULL);
+        llvm::Value *firstSet = ctx->CallInst(ctlzFunc, NULL, remainingBits,
+                                              "first_set");
+
+        // Store that value into the storage allocated for the iteration
+        // variable.
+        ctx->StoreInst(firstSet, sym->storagePtr);
+
+        // Now set the execution mask to be only on for the current program
+        // instance.  (TODO: is there a more efficient way to do this? e.g.
+        // for AVX1, we might want to do this as float rather than int
+        // math...)
+
+        // Get the "program index" vector value
+        llvm::Value *programIndex = 
+            llvm::UndefValue::get(LLVMTypes::Int32VectorType);
+        for (int i = 0; i < g->target.vectorWidth; ++i)
+            programIndex = ctx->InsertInst(programIndex, LLVMInt32(i), i,
+                                           "prog_index");
+
+        // And smear the current lane out to a vector
+        llvm::Value *firstSet32 = 
+            ctx->TruncInst(firstSet, LLVMTypes::Int32Type, "first_set32");
+        llvm::Value *firstSet32Smear = ctx->SmearUniform(firstSet32);
+
+        // Now set the execution mask based on doing a vector compare of
+        // these two
+        llvm::Value *iterMask = 
+            ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ,
+                         firstSet32Smear, programIndex);
+        iterMask = ctx->I1VecToBoolVec(iterMask);
+
+        ctx->SetInternalMask(iterMask);
+
+        // Also update the bitvector of lanes left to turn off the bit for
+        // the lane we're about to run.
+        llvm::Value *setMask = 
+            ctx->BinaryOperator(llvm::Instruction::Shl, LLVMInt64(1),
+                                firstSet, "set_mask");
+        llvm::Value *notSetMask = ctx->NotOperator(setMask);
+        llvm::Value *newRemaining = 
+            ctx->BinaryOperator(llvm::Instruction::And, remainingBits, 
+                                notSetMask, "new_remaining");
+        ctx->StoreInst(newRemaining, maskBitsPtr);
+
+        // and onward to run the loop body...
+        ctx->BranchInst(bbBody);
+    }
+
+    ctx->SetCurrentBasicBlock(bbBody); {
+        // Run the code in the body of the loop.  This is easy now.
+        if (stmts)
+            stmts->EmitCode(ctx);
+
+        Assert(ctx->GetCurrentBasicBlock() != NULL);
+        ctx->BranchInst(bbCheckForMore);
+    }
+
+    ctx->SetCurrentBasicBlock(bbCheckForMore); {
+        // At the end of the loop body (either due to running the
+        // statements normally, or a continue statement in the middle of
+        // the loop that jumps to the end, see if there are any lanes left
+        // to be processed.
+        llvm::Value *remainingBits = ctx->LoadInst(maskBitsPtr, "remaining_bits");
+        llvm::Value *nonZero = 
+            ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_NE,
+                         remainingBits, LLVMInt64(0), "remaining_ne_zero");
+        ctx->BranchInst(bbFindNext, bbDone, nonZero);
+    }
+
+    ctx->SetCurrentBasicBlock(bbDone);
+    ctx->SetInternalMask(oldInternalMask);
+    ctx->EndForeach();
+    ctx->EndScope();
+}
+
+
+void
+ForeachActiveStmt::Print(int indent) const {
+    printf("%*cForeach_active Stmt", indent, ' ');
+    pos.Print();
+    printf("\n");
+
+    printf("%*cIter symbol: ", indent+4, ' ');
+    if (sym != NULL) {
+        printf("%s", sym->name.c_str());
+        if (sym->type != NULL)
+            printf(" %s", sym->type->GetString().c_str());
+    }
+    else
+        printf("NULL");
+    printf("\n");
+
+    printf("%*cStmts:\n", indent+4, ' ');
+    if (stmts != NULL)
+        stmts->Print(indent+8);
+    else
+        printf("NULL");
+    printf("\n");
+}
+
+
+Stmt *
+ForeachActiveStmt::TypeCheck() {
+    if (sym == NULL)
+        return NULL;
+
+    return this;
+}
+
+
+int
+ForeachActiveStmt::EstimateCost() const {
+    return COST_VARYING_LOOP;
+}
+
+
 ///////////////////////////////////////////////////////////////////////////
 // ForeachUniqueStmt
 
@@ -3043,80 +3225,3 @@ int
 DeleteStmt::EstimateCost() const {
     return COST_DELETE;
 }
-
-///////////////////////////////////////////////////////////////////////////
-
-/** This generates AST nodes for an __foreach_active statement.  This
-    construct can be synthesized ouf of the existing ForStmt (and other AST
-    nodes), so here we just build up the AST that we need rather than
-    having a new Stmt implementation for __foreach_active.
-
-    @param iterSym  Symbol for the iteration variable (e.g. "i" in
-                    __foreach_active (i) { .. .}
-    @param stmts    Statements to execute each time through the loop, for
-                    each active program instance.
-    @param pos      Position of the __foreach_active statement in the source 
-                    file.
- */
-Stmt *
-CreateForeachActiveStmt(Symbol *iterSym, Stmt *stmts, SourcePos pos) {
-    if (iterSym == NULL) {
-        AssertPos(pos, m->errorCount > 0);
-        return NULL;
-    }
-
-    // loop initializer: set iter = 0
-    std::vector<VariableDeclaration> var;
-    ConstExpr *zeroExpr = new ConstExpr(AtomicType::UniformInt32, 0,
-                                        iterSym->pos);
-    var.push_back(VariableDeclaration(iterSym, zeroExpr));
-    Stmt *initStmt = new DeclStmt(var, iterSym->pos);
-
-    // loop test: (iter < programCount)
-    ConstExpr *progCountExpr = 
-        new ConstExpr(AtomicType::UniformInt32, g->target.vectorWidth,
-                      pos);
-    SymbolExpr *symExpr = new SymbolExpr(iterSym, iterSym->pos);
-    Expr *testExpr = new BinaryExpr(BinaryExpr::Lt, symExpr, progCountExpr,
-                                    pos);
-    
-    // loop step: ++iterSym
-    UnaryExpr *incExpr = new UnaryExpr(UnaryExpr::PreInc, symExpr, pos);
-    Stmt *stepStmt = new ExprStmt(incExpr, pos);
-
-    // loop body
-    // First, call __movmsk(__mask)) to get the mask as a set of bits.
-    // This should be hoisted out of the loop
-    Symbol *maskSym = m->symbolTable->LookupVariable("__mask");
-    AssertPos(pos, maskSym != NULL);
-    Expr *maskVecExpr = new SymbolExpr(maskSym, pos);
-    std::vector<Symbol *> mmFuns;
-    m->symbolTable->LookupFunction("__movmsk", &mmFuns);
-    AssertPos(pos, mmFuns.size() == (g->target.maskBitCount == 32 ? 2 : 1));
-    FunctionSymbolExpr *movmskFunc = new FunctionSymbolExpr("__movmsk", mmFuns,
-                                                            pos);
-    ExprList *movmskArgs = new ExprList(maskVecExpr, pos);
-    FunctionCallExpr *movmskExpr = new FunctionCallExpr(movmskFunc, movmskArgs,
-                                                        pos);
-
-    // Compute the per lane mask to test the mask bits against: (1 << iter)
-    ConstExpr *oneExpr = new ConstExpr(AtomicType::UniformInt64, int64_t(1),
-                                       iterSym->pos);
-    Expr *shiftLaneExpr = new BinaryExpr(BinaryExpr::Shl, oneExpr, symExpr, 
-                                         pos);
-
-    // Compute the AND: movmsk & (1 << iter)
-    Expr *maskAndLaneExpr = new BinaryExpr(BinaryExpr::BitAnd, movmskExpr,
-                                           shiftLaneExpr, pos);
-    // Test to see if it's non-zero: (mask & (1 << iter)) != 0
-    Expr *ifTestExpr = new BinaryExpr(BinaryExpr::NotEqual, maskAndLaneExpr,
-                                      zeroExpr, pos);
-
-    // Now, enclose the provided statements in an if test such that they
-    // only run if the mask is non-zero for the lane we're currently
-    // handling in the loop.
-    IfStmt *laneCheckIf = new IfStmt(ifTestExpr, stmts, NULL, false, pos);
-
-    // And return a for loop that wires it all together.
-    return new ForStmt(initStmt, testExpr, stepStmt, laneCheckIf, false, pos);
-}
diff --git a/stmt.h b/stmt.h
index 01a7244d..ee6bd2f0 100644
--- a/stmt.h
+++ b/stmt.h
@@ -260,6 +260,23 @@ public:
 };
 
 
+/** Iteration over each executing program instance.
+ */
+class ForeachActiveStmt : public Stmt {
+public:
+    ForeachActiveStmt(Symbol *iterSym, Stmt *stmts, SourcePos pos);
+
+    void EmitCode(FunctionEmitContext *ctx) const;
+    void Print(int indent) const;
+
+    Stmt *TypeCheck();
+    int EstimateCost() const;
+
+    Symbol *sym;
+    Stmt *stmts;
+};
+
+
 /** Parallel iteration over each unique value in the given (varying)
     expression.
  */
diff --git a/tests/foreach-active-1.ispc b/tests/foreach-active-1.ispc
new file mode 100644
index 00000000..d7c5ccef
--- /dev/null
+++ b/tests/foreach-active-1.ispc
@@ -0,0 +1,16 @@
+
+export uniform int width() { return programCount; }
+
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    float a = aFOO[programIndex]; 
+    uniform int count = 0;
+    if (programIndex & 1)
+        foreach_active (i)
+            ++count;
+    RET[programIndex] = count; 
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = programCount / 2;
+}
diff --git a/tests/foreach-active-2.ispc b/tests/foreach-active-2.ispc
new file mode 100644
index 00000000..80b4f606
--- /dev/null
+++ b/tests/foreach-active-2.ispc
@@ -0,0 +1,16 @@
+
+export uniform int width() { return programCount; }
+
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    float a = aFOO[programIndex]; 
+    uniform int count = 0;
+    if (programIndex & 1)
+        foreach_active (i)
+            ++a;
+    RET[programIndex] = a; 
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = (1 + programIndex) + ((programIndex & 1) ? 1 : 0);
+}
diff --git a/tests/foreach-active-3.ispc b/tests/foreach-active-3.ispc
new file mode 100644
index 00000000..b258907e
--- /dev/null
+++ b/tests/foreach-active-3.ispc
@@ -0,0 +1,16 @@
+
+export uniform int width() { return programCount; }
+
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    float a = aFOO[programIndex]; 
+    RET[programIndex] = a; 
+
+    if (programIndex & 1)
+        foreach_active (i)
+            ++RET[i];
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = (1 + programIndex) + ((programIndex & 1) ? 1 : 0);
+}
diff --git a/tests/foreach-active-4.ispc b/tests/foreach-active-4.ispc
new file mode 100644
index 00000000..ffe72db0
--- /dev/null
+++ b/tests/foreach-active-4.ispc
@@ -0,0 +1,21 @@
+
+export uniform int width() { return programCount; }
+
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    float a = aFOO[programIndex]; 
+    RET[programIndex] = a; 
+
+    if (programIndex & 1) {
+        foreach_active (i) {
+            if (i == 1)
+                continue;
+            ++RET[i];
+        }
+    }
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = (1 + programIndex) + ((programIndex & 1) ? 1 : 0);
+    --RET[1];
+}
diff --git a/tests/foreach-active-5.ispc b/tests/foreach-active-5.ispc
new file mode 100644
index 00000000..2b00f220
--- /dev/null
+++ b/tests/foreach-active-5.ispc
@@ -0,0 +1,22 @@
+
+export uniform int width() { return programCount; }
+
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    float a = aFOO[programIndex]; 
+    RET[programIndex] = a; 
+
+    if (programIndex & 1) {
+        foreach_active (i) {
+            if (i & 1)
+                continue;
+
+            int * uniform null = 0;
+            *null = 0;
+        }
+    }
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = (1 + programIndex);
+}