Add foreach_active iteration statement.

Issue #298.
This commit is contained in:
Matt Pharr
2012-06-22 10:35:43 -07:00
parent ed13dd066b
commit b4a078e2f6
15 changed files with 644 additions and 279 deletions

17
ast.cpp
View File

@@ -92,6 +92,7 @@ WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc,
DoStmt *dos;
ForStmt *fs;
ForeachStmt *fes;
ForeachActiveStmt *fas;
ForeachUniqueStmt *fus;
CaseStmt *cs;
DefaultStmt *defs;
@@ -138,6 +139,9 @@ WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc,
postFunc, data);
fes->stmts = (Stmt *)WalkAST(fes->stmts, preFunc, postFunc, data);
}
else if ((fas = dynamic_cast<ForeachActiveStmt *>(node)) != NULL) {
fas->stmts = (Stmt *)WalkAST(fas->stmts, preFunc, postFunc, data);
}
else if ((fus = dynamic_cast<ForeachUniqueStmt *>(node)) != NULL) {
fus->expr = (Expr *)WalkAST(fus->expr, preFunc, postFunc, data);
fus->stmts = (Stmt *)WalkAST(fus->stmts, preFunc, postFunc, data);
@@ -391,14 +395,15 @@ lCheckAllOffSafety(ASTNode *node, void *data) {
}
if (dynamic_cast<ForeachStmt *>(node) != NULL ||
dynamic_cast<ForeachActiveStmt *>(node) != NULL ||
dynamic_cast<ForeachUniqueStmt *>(node) != NULL) {
// foreach() statements also shouldn't be run with an all-off mask.
// Since they re-establish an 'all on' mask, this would be pretty
// unintuitive. (More generally, it's possibly a little strange to
// allow foreach() in the presence of any non-uniform control
// flow...)
// The various foreach statements also shouldn't be run with an
// all-off mask. Since they can re-establish an 'all on' mask,
// this would be pretty unintuitive. (More generally, it's
// possibly a little strange to allow foreach in the presence of
// any non-uniform control flow...)
//
// Similarly, the implementation foreach_unique assumes as a
// Similarly, the implementation of foreach_unique assumes as a
// precondition that the mask won't be all off going into it, so
// we'll enforce that here...
*okPtr = false;

29
ctx.cpp
View File

@@ -89,12 +89,14 @@ struct CFInfo {
bool IsIf() { return type == If; }
bool IsLoop() { return type == Loop; }
bool IsForeach() { return (type == ForeachRegular ||
type == ForeachActive ||
type == ForeachUnique); }
bool IsSwitch() { return type == Switch; }
bool IsVarying() { return !isUniform; }
bool IsUniform() { return isUniform; }
enum CFType { If, Loop, ForeachRegular, ForeachUnique, Switch };
enum CFType { If, Loop, ForeachRegular, ForeachActive, ForeachUnique,
Switch };
CFType type;
bool isUniform;
llvm::BasicBlock *savedBreakTarget, *savedContinueTarget;
@@ -143,7 +145,7 @@ private:
CFInfo(CFType t, llvm::BasicBlock *bt, llvm::BasicBlock *ct,
llvm::Value *sb, llvm::Value *sc, llvm::Value *sm,
llvm::Value *lm) {
Assert(t == ForeachRegular || t == ForeachUnique);
Assert(t == ForeachRegular || t == ForeachActive || t == ForeachUnique);
type = t;
isUniform = false;
savedBreakTarget = bt;
@@ -190,6 +192,9 @@ CFInfo::GetForeach(FunctionEmitContext::ForeachType ft,
case FunctionEmitContext::FOREACH_REGULAR:
cfType = ForeachRegular;
break;
case FunctionEmitContext::FOREACH_ACTIVE:
cfType = ForeachActive;
break;
case FunctionEmitContext::FOREACH_UNIQUE:
cfType = ForeachUnique;
break;
@@ -744,6 +749,16 @@ FunctionEmitContext::Break(bool doCoherenceCheck) {
}
static bool
lEnclosingLoopIsForeachActive(const std::vector<CFInfo *> &controlFlowInfo) {
for (int i = (int)controlFlowInfo.size() - 1; i >= 0; --i) {
if (controlFlowInfo[i]->type == CFInfo::ForeachActive)
return true;
}
return false;
}
void
FunctionEmitContext::Continue(bool doCoherenceCheck) {
if (!continueTarget) {
@@ -753,12 +768,16 @@ FunctionEmitContext::Continue(bool doCoherenceCheck) {
}
AssertPos(currentPos, controlFlowInfo.size() > 0);
if (ifsInCFAllUniform(CFInfo::Loop)) {
if (ifsInCFAllUniform(CFInfo::Loop) ||
lEnclosingLoopIsForeachActive(controlFlowInfo)) {
// Similarly to 'break' statements, we can immediately jump to the
// continue target if we're only in 'uniform' control flow within
// loop or if we can tell that the mask is all on.
// loop or if we can tell that the mask is all on. Here, we can
// also jump if the enclosing loop is a 'foreach_active' loop, in
// which case we know that only a single program instance is
// executing.
AddInstrumentationPoint("continue: uniform CF, jumped");
if (ifsInCFAllUniform(CFInfo::Loop) && doCoherenceCheck)
if (doCoherenceCheck)
Warning(currentPos, "Coherent continue statement not necessary in "
"fully uniform control flow.");
BranchInst(continueTarget);

6
ctx.h
View File

@@ -160,9 +160,9 @@ public:
finished. */
void EndLoop();
/** Indicates that code generation for a 'foreach', 'foreach_tiled', or
'foreach_unique' loop is about to start. */
enum ForeachType { FOREACH_REGULAR, FOREACH_UNIQUE };
/** Indicates that code generation for a 'foreach', 'foreach_tiled',
'foreach_active', or 'foreach_unique' loop is about to start. */
enum ForeachType { FOREACH_REGULAR, FOREACH_ACTIVE, FOREACH_UNIQUE };
void StartForeach(ForeachType ft);
void EndForeach();

View File

@@ -105,12 +105,16 @@ Contents:
* `Conditional Statements: "if"`_
* `Conditional Statements: "switch"`_
* `Basic Iteration Statements: "for", "while", and "do"`_
* `Iteration over unique elements: "foreach_unique"`_
* `Iteration Statements`_
+ `Basic Iteration Statements: "for", "while", and "do"`_
+ `Iteration over active program instances: "foreach_active"`_
+ `Iteration over unique elements: "foreach_unique"`_
+ `Parallel Iteration Statements: "foreach" and "foreach_tiled"`_
+ `Parallel Iteration with "programIndex" and "programCount"`_
* `Unstructured Control Flow: "goto"`_
* `"Coherent" Control Flow Statements: "cif" and Friends`_
* `Parallel Iteration Statements: "foreach" and "foreach_tiled"`_
* `Parallel Iteration with "programIndex" and "programCount"`_
* `Functions and Function Calls`_
+ `Function Overloading`_
@@ -1984,7 +1988,7 @@ format in memory; the benefits from SOA layout are discussed in more detail
in the `Use "Structure of Arrays" Layout When Possible`_ section in the
ispc Performance Guide.
.. _Use "Structure of Arrays" Layout When Possible: perf.html#use-structure-of-arrays-layout-when-possible
.. _Use "Structure of Arrays" Layout When Possible: perfguide.html#use-structure-of-arrays-layout-when-possible
``ispc`` provides two key language-level capabilities for laying out and
accessing data in SOA format:
@@ -2348,11 +2352,19 @@ code below.
x *= x;
}
Iteration Statements
--------------------
In addition to the standard iteration statements ``for``, ``while``, and
``do``, inherited from C/C++, ``ispc`` provides a number of additional
specialized ways to iterate over data.
Basic Iteration Statements: "for", "while", and "do"
----------------------------------------------------
``ispc`` supports ``for``, ``while``, and ``do`` loops, with the same
specification as in C. Like C++, variables can be declared in the ``for``
specification as in C. As in C++, variables can be declared in the ``for``
statement itself:
::
@@ -2374,6 +2386,58 @@ executing code in the loop body that didn't execute the ``continue`` will
be unaffected by it.
Iteration over active program instances: "foreach_active"
---------------------------------------------------------
The ``foreach_active`` construct specifies a loop that serializes over the
active program instances: the loop body executes once for each active
program instance, and with only that program instance executing.
As an example of the use of this construct, consider an application where
each program instance independently computes an offset into a shared array
that is being updated:
::
uniform float array[...] = { ... };
int index = ...;
++array[index];
If more than one active program instance computes the same value for
``index``, the above code has undefined behavior (see the section `Data
Races Within a Gang`_ for details.) The increment of ``array[index]``
could instead be written inside a ``foreach_active`` statement:
::
foreach_active (i) {
++array[index];
}
The variable name provided in parenthesis after the ``foreach_active``
keyword (here, ``index``), causes a ``const uniform int64`` local variable
of that name to be declared, where the variable takes the ``programIndex``
value of the program instance executing at each loop iteraton.
In the code above, because only one program instance is executing at a time
when the loop body executes, the update to ``array`` is well-defined.
Note that for this particular example, the "local atomic" operations in
the standard library could be used instead to safely update ``array``.
However, local atomics functions aren't always available or appropriate for
more complex cases.)
``continue`` statements may be used inside ``foreach_active`` loops, though
``break`` and ``return`` are prohibited. The order in which the active
program instances are processed in the loop is not defined.
See the `Using "foreach_active" Effectively`_ Section in the ispc
Performance Guide for more details about ``foreach_active``.
.. _Using "foreach_active" Effectively: perfguide.html#using-foreach-active-effectively
Iteration over unique elements: "foreach_unique"
------------------------------------------------
@@ -2408,7 +2472,144 @@ evaluated once, and it must be of an atomic type (``float``, ``int``,
etc.), an ``enum`` type, or a pointer type. The iteration variable ``val``
is a variable of ``const uniform`` type of the iteration type; it can't be
modified within the loop. Finally, ``break`` and ``return`` statements are
illegal within the loop body, but ``continue`` statemetns are allowed.
illegal within the loop body, but ``continue`` statements are allowed.
Parallel Iteration Statements: "foreach" and "foreach_tiled"
------------------------------------------------------------
The ``foreach`` and ``foreach_tiled`` constructs specify loops over a
possibly multi-dimensional domain of integer ranges. Their role goes
beyond "syntactic sugar"; they provides one of the two key ways of
expressing parallel computation in ``ispc``.
In general, a ``foreach`` or ``foreach_tiled`` statement takes one or more
dimension specifiers separated by commas, where each dimension is specified
by ``identifier = start ... end``, where ``start`` is a signed integer
value less than or equal to ``end``, specifying iteration over all integer
values from ``start`` up to and including ``end-1``. An arbitrary number
of iteration dimensions may be specified, with each one spanning a
different range of values. Within the ``foreach`` loop, the given
identifiers are available as ``const varying int32`` variables. The
execution mask starts out "all on" at the start of each ``foreach`` loop
iteration, but may be changed by control flow constructs within the loop.
It is illegal to have a ``break`` statement or a ``return`` statement
within a ``foreach`` loop; a compile-time error will be issued in this
case. (It is legal to have a ``break`` in a regular ``for`` loop that's
nested inside a ``foreach`` loop.) ``continue`` statements are legal in
``foreach`` loops; they have the same effect as in regular ``for`` loops:
a program instances that executes a ``continue`` statement effectively
skips over the rest of the loop body for the current iteration.
It is also currently illegal to have nested ``foreach`` statements; this
limitation will be removed in a future release of ``ispc``.
As a specific example, consider the following ``foreach`` statement:
::
foreach (j = 0 ... height, i = 0 ... width) {
// loop body--process data element (i,j)
}
It specifies a loop over a 2D domain, where the ``j`` variable goes from 0
to ``height-1`` and ``i`` goes from 0 to ``width-1``. Within the loop, the
variables ``i`` and ``j`` are available and initialized accordingly.
``foreach`` loops actually cause the given iteration domain to be
automatically mapped to the program instances in the gang, so that all of
the data can be processed, in gang-sized chunks. As a specific example,
consider a simple ``foreach`` loop like the following, on a target where
the gang size is 8:
::
foreach (i = 0 ... 16) {
// perform computation on element i
}
One possible valid execution path of this loop would be for the program
counter the step through the statements of this loop just ``16/8==2``
times; the first time through, with the ``varying int32`` variable ``i``
having the values (0,1,2,3,4,5,6,7) over the program instances, and the
second time through, having the values (8,9,10,11,12,13,14,15), thus
mapping the available program instances to all of the data by the end of
the loop's execution.
In general, however, you shouldn't make any assumptions about the order in
which elements of the iteration domain will be processed by a ``foreach``
loop. For example, the following code exhibits undefined behavior:
::
uniform float a[10][100];
foreach (i = 0 ... 10, j = 0 ... 100) {
if (i == 0)
a[i][j] = j;
else
// Error: can't assume that a[i-1][j] has been set yet
a[i][j] = a[i-1][j];
The ``foreach`` statement generally subdivides the iteration domain by
selecting sets of contiguous elements in the inner-most dimension of the
iteration domain. This decomposition approach generally leads to coherent
memory reads and writes, but may lead to worse control flow coherence than
other decompositions.
Therefore, ``foreach_tiled`` decomposes the iteration domain in a way that
tries to map locations in the domain to program instances in a way that is
compact across all of the dimensions. For example, on a target with an
8-wide gang size, the following ``foreach_tiled`` statement might process
the iteration domain in chunks of 2 elements in ``j`` and 4 elements in
``i`` each time. (The trade-offs between these two constructs are
discussed in more detail in the `ispc Performance Guide`_.)
.. _ispc Performance Guide: perfguide.html#improving-control-flow-coherence-with-foreach-tiled
::
foreach_tiled (j = 0 ... height, i = 0 ... width) {
// loop body--process data element (i,j)
}
Parallel Iteration with "programIndex" and "programCount"
---------------------------------------------------------
In addition to ``foreach`` and ``foreach_tiled``, ``ispc`` provides a
lower-level mechanism for mapping SPMD program instances to data to operate
on via the built-in ``programIndex`` and ``programCount`` variables.
``programIndex`` gives the index of the SIMD-lane being used for running
each program instance. (In other words, it's a varying integer value that
has value zero for the first program instance, and so forth.) The
``programCount`` builtin gives the total number of instances in the gang.
Together, these can be used to uniquely map executing program instances to
input data. [#]_
.. [#] ``programIndex`` is analogous to ``get_global_id()`` in OpenCL* and
``threadIdx`` in CUDA*.
As a specific example, consider an ``ispc`` function that needs to perform
some computation on an array of data.
::
for (uniform int i = 0; i < count; i += programCount) {
float d = data[i + programIndex];
float r = ....
result[i + programIndex] = r;
}
Here, we've written a loop that explicitly loops over the data in chunks of
``programCount`` elements. In each loop iteration, the running program
instances effectively collude amongst themselves using ``programIndex`` to
determine which elements to work on in a way that ensures that all of the
data elements will be processed. In this particular case, a ``foreach``
loop would be preferable, as ``foreach`` naturally handles the case where
``programCount`` doesn't evenly divide the number of elements to be
processed, while the loop above assumes that case implicitly.
Unstructured Control Flow: "goto"
@@ -2479,139 +2680,6 @@ constructs in ``ispc`` that a loop will never be executed with an "all off"
execution mask.
Parallel Iteration Statements: "foreach" and "foreach_tiled"
------------------------------------------------------------
The ``foreach`` and ``foreach_tiled`` constructs specify loops over a
possibly multi-dimensional domain of integer ranges. Their role goes
beyond "syntactic sugar"; they provides one of the two key ways of
expressing parallel computation in ``ispc``.
In general, a ``foreach`` or ``foreach_tiled`` statement takes one or more
dimension specifiers separated by commas, where each dimension is specified
by ``identifier = start ... end``, where ``start`` is a signed integer
value less than or equal to ``end``, specifying iteration over all integer
values from ``start`` up to and including ``end-1``. An arbitrary number
of iteration dimensions may be specified, with each one spanning a
different range of values. Within the ``foreach`` loop, the given
identifiers are available as ``const varying int32`` variables. The
execution mask starts out "all on" at the start of each ``foreach`` loop
iteration, but may be changed by control flow constructs within the loop.
It is illegal to have a ``break`` statement or a ``return`` statement
within a ``foreach`` loop; a compile-time error will be issued in this
case. (It is legal to have a ``break`` in a regular ``for`` loop that's
nested inside a ``foreach`` loop.) ``continue`` statements are legal in
``foreach`` loops; they have the same effect as in regular ``for`` loops:
a program instances that executes a ``continue`` statement effectively
skips over the rest of the loop body for the current iteration.
As a specific example, consider the following ``foreach`` statement:
::
foreach (j = 0 ... height, i = 0 ... width) {
// loop body--process data element (i,j)
}
It specifies a loop over a 2D domain, where the ``j`` variable goes from 0
to ``height-1`` and ``i`` goes from 0 to ``width-1``. Within the loop, the
variables ``i`` and ``j`` are available and initialized accordingly.
``foreach`` loops actually cause the given iteration domain to be
automatically mapped to the program instances in the gang, so that all of
the data can be processed, in gang-sized chunks. As a specific example,
consider a simple ``foreach`` loop like the following, on a target where
the gang size is 8:
::
foreach (i = 0 ... 16) {
// perform computation on element i
}
One possible valid execution path of this loop would be for the program
counter the step through the statements of this loop just ``16/8==2``
times; the first time through, with the ``varying int32`` variable ``i``
having the values (0,1,2,3,4,5,6,7) over the program instances, and the
second time through, having the values (8,9,10,11,12,13,14,15), thus
mapping the available program instances to all of the data by the end of
the loop's execution.
In general, however, you shouldn't make any assumptions about the order in
which elements of the iteration domain will be processed by a ``foreach``
loop. For example, the following code exhibits undefined behavior:
::
uniform float a[10][100];
foreach (i = 0 ... 10, j = 0 ... 100) {
if (i == 0)
a[i][j] = j;
else
// Error: can't assume that a[i-1][j] has been set yet
a[i][j] = a[i-1][j];
The ``foreach`` statement generally subdivides the iteration domain by
selecting sets of contiguous elements in the inner-most dimension of the
iteration domain. This decomposition approach generally leads to coherent
memory reads and writes, but may lead to worse control flow coherence than
other decompositions.
Therefore, ``foreach_tiled`` decomposes the iteration domain in a way that
tries to map locations in the domain to program instances in a way that is
compact across all of the dimensions. For example, on a target with an
8-wide gang size, the following ``foreach_tiled`` statement might process
the iteration domain in chunks of 2 elements in ``j`` and 4 elements in
``i`` each time. (The trade-offs between these two constructs are
discussed in more detail in the `ispc Performance Guide`_.)
.. _ispc Performance Guide: perf.html#improving-control-flow-coherence-with-foreach-tiled
::
foreach_tiled (j = 0 ... height, i = 0 ... width) {
// loop body--process data element (i,j)
}
Parallel Iteration with "programIndex" and "programCount"
---------------------------------------------------------
In addition to ``foreach`` and ``foreach_tiled``, ``ispc`` provides a
lower-level mechanism for mapping SPMD program instances to data to operate
on via the built-in ``programIndex`` and ``programCount`` variables.
``programIndex`` gives the index of the SIMD-lane being used for running
each program instance. (In other words, it's a varying integer value that
has value zero for the first program instance, and so forth.) The
``programCount`` builtin gives the total number of instances in the gang.
Together, these can be used to uniquely map executing program instances to
input data. [#]_
.. [#] ``programIndex`` is analogous to ``get_global_id()`` in OpenCL* and
``threadIdx`` in CUDA*.
As a specific example, consider an ``ispc`` function that needs to perform
some computation on an array of data.
::
for (uniform int i = 0; i < count; i += programCount) {
float d = data[i + programIndex];
float r = ....
result[i + programIndex] = r;
}
Here, we've written a loop that explicitly loops over the data in chunks of
``programCount`` elements. In each loop iteration, the running program
instances effectively collude amongst themselves using ``programIndex`` to
determine which elements to work on in a way that ensures that all of the
data elements will be processed. In this particular case, a ``foreach``
loop would be preferable, as ``foreach`` naturally handles the case where
``programCount`` doesn't evenly divide the number of elements to be
processed, while the loop above assumes that case implicitly.
Functions and Function Calls
----------------------------
@@ -3452,7 +3520,7 @@ There are also variants of these functions that return the value as a
discussion of an application of this variant to improve memory access
performance in the `Performance Guide`_.
.. _Performance Guide: perf.html#understanding-gather-and-scatter
.. _Performance Guide: perfguide.html#understanding-gather-and-scatter
::
@@ -4130,8 +4198,10 @@ from ``ispc`` must be declared as follows:
It is illegal to overload functions declared with ``extern "C"`` linkage;
``ispc`` issues an error in this case.
Function calls back to C/C++ are not made if none of the program instances
want to make the call. For example, given code like:
**Only a single function call is made back to C++ for the entire gang of
runing program instances**. Furthermore, function calls back to C/C++ are not
made if none of the program instances want to make the call. For example,
given code like:
::
@@ -4174,6 +4244,24 @@ Application code can thus be written as:
}
}
In some cases, it can be desirable to generate a single call for each
executing program instance, rather than one call for a gang. For example,
the code below shows how one might call an existing math library routine
that takes a scalar parameter.
::
extern "C" uniform double erf(uniform double);
double v = ...;
double result;
foreach_active (instance) {
uniform double r = erf(extract(v, instance));
result = insert(result, instance, r);
}
This code calls ``erf()`` once for each active program instance, passing it
the program instance's value of ``v`` and storing the result in the
instance's ``result`` value.
Data Layout
-----------
@@ -4309,7 +4397,7 @@ can also have a significant effect on performance; in general, creating
groups of work that will tend to do similar computation across the SPMD
program instances improves performance.
.. _ispc Performance Tuning Guide: http://ispc.github.com/perf.html
.. _ispc Performance Tuning Guide: http://ispc.github.com/perfguide.html
Disclaimer and Legal Information

View File

@@ -21,6 +21,7 @@ the most out of ``ispc`` in practice.
+ `Avoid 64-bit Addressing Calculations When Possible`_
+ `Avoid Computation With 8 and 16-bit Integer Types`_
+ `Implementing Reductions Efficiently`_
+ `Using "foreach_active" Effectively`_
+ `Using Low-level Vector Tricks`_
+ `The "Fast math" Option`_
+ `"inline" Aggressively`_
@@ -510,6 +511,43 @@ values--very efficient code in the end.
return reduce_add(sum);
}
Using "foreach_active" Effectively
----------------------------------
For high-performance code,
For example, consider this segment of code, from the introduction of
``foreach_active`` in the ispc User's Guide:
::
uniform float array[...] = { ... };
int index = ...;
foreach_active (i) {
++array[index];
}
Here, ``index`` was assumed to possibly have the same value for multiple
program instances, so the updates to ``array[index]`` are serialized by the
``foreach_active`` statement in order to not have undefined results when
``index`` values do collide.
The code generated by the compiler can be improved in this case by making
it clear that only a single element of the array is accessed by
``array[index]`` and that thus a general gather or scatter isn't required.
Specifically, by using the ``extract()`` function from the standard library
to extract the current program instance's value of ``index`` into a
``uniform`` variable and then using that to index into ``array``, as below,
more efficient code is generated.
::
foreach_active (instanceNum) {
uniform int unifIndex = extract(index, instanceNum);
++array[unifIndex];
}
Using Low-level Vector Tricks
-----------------------------

8
lex.ll
View File

@@ -66,8 +66,8 @@ static int allTokens[] = {
TOKEN_CONST, TOKEN_CONTINUE, TOKEN_CRETURN, TOKEN_DEFAULT, TOKEN_DO,
TOKEN_DELETE, TOKEN_DOUBLE, TOKEN_ELSE, TOKEN_ENUM,
TOKEN_EXPORT, TOKEN_EXTERN, TOKEN_FALSE, TOKEN_FLOAT, TOKEN_FOR,
TOKEN_FOREACH, TOKEN_FOREACH_TILED, TOKEN_FOREACH_UNIQUE,
TOKEN_GOTO, TOKEN_IF, TOKEN_IN, TOKEN_INLINE,
TOKEN_FOREACH, TOKEN_FOREACH_ACTIVE, TOKEN_FOREACH_TILED,
TOKEN_FOREACH_UNIQUE, TOKEN_GOTO, TOKEN_IF, TOKEN_IN, TOKEN_INLINE,
TOKEN_INT, TOKEN_INT8, TOKEN_INT16, TOKEN_INT, TOKEN_INT64, TOKEN_LAUNCH,
TOKEN_NEW, TOKEN_NULL, TOKEN_PRINT, TOKEN_RETURN, TOKEN_SOA, TOKEN_SIGNED,
TOKEN_SIZEOF, TOKEN_STATIC, TOKEN_STRUCT, TOKEN_SWITCH, TOKEN_SYNC,
@@ -115,6 +115,7 @@ void ParserInit() {
tokenToName[TOKEN_FLOAT] = "float";
tokenToName[TOKEN_FOR] = "for";
tokenToName[TOKEN_FOREACH] = "foreach";
tokenToName[TOKEN_FOREACH_ACTIVE] = "foreach_active";
tokenToName[TOKEN_FOREACH_TILED] = "foreach_tiled";
tokenToName[TOKEN_FOREACH_UNIQUE] = "foreach_unique";
tokenToName[TOKEN_GOTO] = "goto";
@@ -226,6 +227,7 @@ void ParserInit() {
tokenNameRemap["TOKEN_FLOAT"] = "\'float\'";
tokenNameRemap["TOKEN_FOR"] = "\'for\'";
tokenNameRemap["TOKEN_FOREACH"] = "\'foreach\'";
tokenNameRemap["TOKEN_FOREACH_ACTIVE"] = "\'foreach_active\'";
tokenNameRemap["TOKEN_FOREACH_TILED"] = "\'foreach_tiled\'";
tokenNameRemap["TOKEN_FOREACH_UNIQUE"] = "\'foreach_unique\'";
tokenNameRemap["TOKEN_GOTO"] = "\'goto\'";
@@ -369,8 +371,8 @@ extern { RT; return TOKEN_EXTERN; }
false { RT; return TOKEN_FALSE; }
float { RT; return TOKEN_FLOAT; }
for { RT; return TOKEN_FOR; }
__foreach_active { RT; return TOKEN_FOREACH_ACTIVE; }
foreach { RT; return TOKEN_FOREACH; }
foreach_active { RT; return TOKEN_FOREACH_ACTIVE; }
foreach_tiled { RT; return TOKEN_FOREACH_TILED; }
foreach_unique { RT; return TOKEN_FOREACH_UNIQUE; }
goto { RT; return TOKEN_GOTO; }

View File

@@ -117,8 +117,8 @@ static const char *lBuiltinTokens[] = {
"assert", "bool", "break", "case", "cbreak", "ccontinue", "cdo",
"cfor", "cif", "cwhile", "const", "continue", "creturn", "default",
"do", "delete", "double", "else", "enum", "export", "extern", "false",
"float", "for", "foreach", "foreach_tiled", "foreach_unique",
"goto", "if", "in", "inline",
"float", "for", "foreach", "foreach_active", "foreach_tiled",
"foreach_unique", "goto", "if", "in", "inline",
"int", "int8", "int16", "int32", "int64", "launch", "new", "NULL",
"print", "return", "signed", "sizeof", "static", "struct", "switch",
"sync", "task", "true", "typedef", "uniform", "unmasked", "unsigned",
@@ -1688,7 +1688,7 @@ foreach_active_scope
foreach_active_identifier
: TOKEN_IDENTIFIER
{
$$ = new Symbol(yytext, @1, AtomicType::UniformInt32);
$$ = new Symbol(yytext, @1, AtomicType::UniformInt64->GetAsConstType());
}
;
@@ -1838,7 +1838,7 @@ iteration_statement
}
statement
{
$$ = CreateForeachActiveStmt($3, $6, Union(@1, @4));
$$ = new ForeachActiveStmt($3, $6, Union(@1, @4));
m->symbolTable->PopScope();
}
| foreach_unique_scope '(' foreach_unique_identifier TOKEN_IN

View File

@@ -421,7 +421,7 @@ static inline void memcpy(void * varying dst, void * varying src,
da[programIndex] = dst;
sa[programIndex] = src;
__foreach_active (i) {
foreach_active (i) {
void * uniform d = da[i], * uniform s = sa[i];
__memcpy32((int8 * uniform)d, (int8 * uniform)s, extract(count, i));
}
@@ -435,7 +435,7 @@ static inline void memcpy64(void * varying dst, void * varying src,
da[programIndex] = dst;
sa[programIndex] = src;
__foreach_active (i) {
foreach_active (i) {
void * uniform d = da[i], * uniform s = sa[i];
__memcpy64((int8 * uniform)d, (int8 * uniform)s, extract(count, i));
}
@@ -459,7 +459,7 @@ static inline void memmove(void * varying dst, void * varying src,
da[programIndex] = dst;
sa[programIndex] = src;
__foreach_active (i) {
foreach_active (i) {
void * uniform d = da[i], * uniform s = sa[i];
__memmove32((int8 * uniform)d, (int8 * uniform)s, extract(count, i));
}
@@ -473,7 +473,7 @@ static inline void memmove64(void * varying dst, void * varying src,
da[programIndex] = dst;
sa[programIndex] = src;
__foreach_active (i) {
foreach_active (i) {
void * uniform d = da[i], * uniform s = sa[i];
__memmove64((int8 * uniform)d, (int8 * uniform)s, extract(count, i));
}
@@ -493,7 +493,7 @@ static inline void memset(void * varying ptr, int8 val, int32 count) {
void * uniform pa[programCount];
pa[programIndex] = ptr;
__foreach_active (i) {
foreach_active (i) {
__memset32((int8 * uniform)pa[i], extract(val, i), extract(count, i));
}
}
@@ -502,7 +502,7 @@ static inline void memset64(void * varying ptr, int8 val, int64 count) {
void * uniform pa[programCount];
pa[programIndex] = ptr;
__foreach_active (i) {
foreach_active (i) {
__memset64((int8 * uniform)pa[i], extract(val, i), extract(count, i));
}
}
@@ -711,7 +711,7 @@ static inline void prefetch_l1(const void * varying ptr) {
const void * uniform ptrArray[programCount];
ptrArray[programIndex] = ptr;
__foreach_active (i) {
foreach_active (i) {
const void * uniform p = ptrArray[i];
prefetch_l1(p);
}
@@ -721,7 +721,7 @@ static inline void prefetch_l2(const void * varying ptr) {
const void * uniform ptrArray[programCount];
ptrArray[programIndex] = ptr;
__foreach_active (i) {
foreach_active (i) {
const void * uniform p = ptrArray[i];
prefetch_l2(p);
}
@@ -731,7 +731,7 @@ static inline void prefetch_l3(const void * varying ptr) {
const void * uniform ptrArray[programCount];
ptrArray[programIndex] = ptr;
__foreach_active (i) {
foreach_active (i) {
const void * uniform p = ptrArray[i];
prefetch_l3(p);
}
@@ -741,7 +741,7 @@ static inline void prefetch_nt(const void * varying ptr) {
const void * uniform ptrArray[programCount];
ptrArray[programIndex] = ptr;
__foreach_active (i) {
foreach_active (i) {
const void * uniform p = ptrArray[i];
prefetch_nt(p);
}
@@ -1621,7 +1621,7 @@ static inline TA atomic_##OPA##_global(uniform TA * varying ptr, TA value) { \
uniform TA * uniform ptrArray[programCount]; \
ptrArray[programIndex] = ptr; \
TA ret; \
__foreach_active (i) { \
foreach_active (i) { \
uniform TA * uniform p = ptrArray[i]; \
uniform TA v = extract(value, i); \
uniform TA r = __atomic_##OPB##_uniform_##TB##_global(p, v); \
@@ -1672,7 +1672,7 @@ static inline TA atomic_swap_global(uniform TA * varying ptr, TA value) { \
uniform TA * uniform ptrArray[programCount]; \
ptrArray[programIndex] = ptr; \
TA ret; \
__foreach_active (i) { \
foreach_active (i) { \
uniform TA * uniform p = ptrArray[i]; \
uniform TA v = extract(value, i); \
uniform TA r = __atomic_swap_uniform_##TB##_global(p, v); \
@@ -1699,7 +1699,7 @@ static inline TA atomic_##OPA##_global(uniform TA * varying ptr, \
uniform TA * uniform ptrArray[programCount]; \
ptrArray[programIndex] = ptr; \
TA ret; \
__foreach_active (i) { \
foreach_active (i) { \
uniform TA * uniform p = ptrArray[i]; \
uniform TA v = extract(value, i); \
uniform TA r = __atomic_##OPB##_uniform_##TB##_global(p, v); \
@@ -1774,7 +1774,7 @@ static inline TA atomic_compare_exchange_global( \
uniform TA * uniform ptrArray[programCount]; \
ptrArray[programIndex] = ptr; \
TA ret; \
__foreach_active (i) { \
foreach_active (i) { \
uniform TA r = \
__atomic_compare_exchange_uniform_##TB##_global(ptrArray[i], \
extract(oldval, i), \
@@ -1848,7 +1848,7 @@ static inline uniform TYPE atomic_##NAME##_local(uniform TYPE * uniform ptr, \
} \
static inline TYPE atomic_##NAME##_local(uniform TYPE * uniform ptr, TYPE value) { \
TYPE ret; \
__foreach_active (i) { \
foreach_active (i) { \
ret = insert(ret, i, *ptr); \
*ptr = OPFUNC(*ptr, extract(value, i)); \
} \
@@ -1858,7 +1858,7 @@ static inline TYPE atomic_##NAME##_local(uniform TYPE * p, TYPE value) { \
TYPE ret; \
uniform TYPE * uniform ptrs[programCount]; \
ptrs[programIndex] = p; \
__foreach_active (i) { \
foreach_active (i) { \
ret = insert(ret, i, *ptrs[i]); \
*ptrs[i] = OPFUNC(*ptrs[i], extract(value, i)); \
} \
@@ -1975,7 +1975,7 @@ static inline uniform TYPE atomic_compare_exchange_local(uniform TYPE * uniform
static inline TYPE atomic_compare_exchange_local(uniform TYPE * uniform ptr, \
TYPE cmp, TYPE update) { \
TYPE ret; \
__foreach_active (i) { \
foreach_active (i) { \
uniform TYPE old = *ptr; \
if (old == extract(cmp, i)) \
*ptr = extract(update, i); \
@@ -1988,7 +1988,7 @@ static inline TYPE atomic_compare_exchange_local(uniform TYPE * varying p, \
uniform TYPE * uniform ptrs[programCount]; \
ptrs[programIndex] = p; \
TYPE ret; \
__foreach_active (i) { \
foreach_active (i) { \
uniform TYPE old = *ptrs[i]; \
if (old == extract(cmp, i)) \
*ptrs[i] = extract(update, i); \
@@ -2127,7 +2127,7 @@ static inline float sin(float x_full) {
}
else if (__math_lib == __math_lib_system) {
float ret;
__foreach_active (i) {
foreach_active (i) {
uniform float r = __stdlib_sinf(extract(x_full, i));
ret = insert(ret, i, r);
}
@@ -2259,7 +2259,7 @@ static inline float asin(float x) {
if (__math_lib == __math_lib_svml ||
__math_lib == __math_lib_system) {
float ret;
__foreach_active (i) {
foreach_active (i) {
uniform float r = __stdlib_asinf(extract(x, i));
ret = insert(ret, i, r);
}
@@ -2364,7 +2364,7 @@ static inline float cos(float x_full) {
}
else if (__math_lib == __math_lib_system) {
float ret;
__foreach_active (i) {
foreach_active (i) {
uniform float r = __stdlib_cosf(extract(x_full, i));
ret = insert(ret, i, r);
}
@@ -2502,7 +2502,7 @@ static inline void sincos(float x_full, varying float * uniform sin_result,
__svml_sincos(x_full, sin_result, cos_result);
}
else if (__math_lib == __math_lib_system) {
__foreach_active (i) {
foreach_active (i) {
uniform float s, c;
__stdlib_sincosf(extract(x_full, i), &s, &c);
*sin_result = insert(*sin_result, i, s);
@@ -2635,7 +2635,7 @@ static inline float tan(float x_full) {
}
else if (__math_lib == __math_lib_system) {
float ret;
__foreach_active (i) {
foreach_active (i) {
uniform float r = __stdlib_tanf(extract(x_full, i));
ret = insert(ret, i, r);
}
@@ -2786,7 +2786,7 @@ static inline float atan(float x_full) {
}
else if (__math_lib == __math_lib_system) {
float ret;
__foreach_active (i) {
foreach_active (i) {
uniform float r = __stdlib_atanf(extract(x_full, i));
ret = insert(ret, i, r);
}
@@ -2881,7 +2881,7 @@ static inline float atan2(float y, float x) {
}
else if (__math_lib == __math_lib_system) {
float ret;
__foreach_active (i) {
foreach_active (i) {
uniform float r = __stdlib_atan2f(extract(y, i), extract(x, i));
ret = insert(ret, i, r);
}
@@ -2944,7 +2944,7 @@ static inline float exp(float x_full) {
}
else if (__math_lib == __math_lib_system) {
float ret;
__foreach_active (i) {
foreach_active (i) {
uniform float r = __stdlib_expf(extract(x_full, i));
ret = insert(ret, i, r);
}
@@ -3151,7 +3151,7 @@ static inline float log(float x_full) {
}
else if (__math_lib == __math_lib_system) {
float ret;
__foreach_active (i) {
foreach_active (i) {
uniform float r = __stdlib_logf(extract(x_full, i));
ret = insert(ret, i, r);
}
@@ -3326,7 +3326,7 @@ static inline float pow(float a, float b) {
}
else if (__math_lib == __math_lib_system) {
float ret;
__foreach_active (i) {
foreach_active (i) {
uniform float r = __stdlib_powf(extract(a, i), extract(b, i));
ret = insert(ret, i, r);
}
@@ -3416,7 +3416,7 @@ static inline double sin(double x) {
return sin((float)x);
else {
double ret;
__foreach_active (i) {
foreach_active (i) {
uniform double r = __stdlib_sin(extract(x, i));
ret = insert(ret, i, r);
}
@@ -3438,7 +3438,7 @@ static inline double cos(double x) {
return cos((float)x);
else {
double ret;
__foreach_active (i) {
foreach_active (i) {
uniform double r = __stdlib_cos(extract(x, i));
ret = insert(ret, i, r);
}
@@ -3464,7 +3464,7 @@ static inline void sincos(double x, varying double * uniform sin_result,
*cos_result = cr;
}
else {
__foreach_active (i) {
foreach_active (i) {
uniform double sr, cr;
__stdlib_sincos(extract(x, i), &sr, &cr);
*sin_result = insert(*sin_result, i, sr);
@@ -3492,7 +3492,7 @@ static inline double tan(double x) {
return tan((float)x);
else {
double ret;
__foreach_active (i) {
foreach_active (i) {
uniform double r = __stdlib_tan(extract(x, i));
ret = insert(ret, i, r);
}
@@ -3514,7 +3514,7 @@ static inline double atan(double x) {
return atan((float)x);
else {
double ret;
__foreach_active (i) {
foreach_active (i) {
uniform double r = __stdlib_atan(extract(x, i));
ret = insert(ret, i, r);
}
@@ -3536,7 +3536,7 @@ static inline double atan2(double y, double x) {
return atan2((float)y, (float)x);
else {
double ret;
__foreach_active (i) {
foreach_active (i) {
uniform double r = __stdlib_atan2(extract(y, i), extract(x, i));
ret = insert(ret, i, r);
}
@@ -3558,7 +3558,7 @@ static inline double exp(double x) {
return exp((float)x);
else {
double ret;
__foreach_active (i) {
foreach_active (i) {
uniform double r = __stdlib_exp(extract(x, i));
ret = insert(ret, i, r);
}
@@ -3580,7 +3580,7 @@ static inline double log(double x) {
return log((float)x);
else {
double ret;
__foreach_active (i) {
foreach_active (i) {
uniform double r = __stdlib_log(extract(x, i));
ret = insert(ret, i, r);
}
@@ -3602,7 +3602,7 @@ static inline double pow(double a, double b) {
return pow((float)a, (float)b);
else {
double ret;
__foreach_active (i) {
foreach_active (i) {
uniform double r = __stdlib_pow(extract(a, i), extract(b, i));
ret = insert(ret, i, r);
}

259
stmt.cpp
View File

@@ -1915,6 +1915,188 @@ ForeachStmt::Print(int indent) const {
}
///////////////////////////////////////////////////////////////////////////
// ForeachActiveStmt
ForeachActiveStmt::ForeachActiveStmt(Symbol *s, Stmt *st, SourcePos pos)
: Stmt(pos) {
sym = s;
stmts = st;
}
void
ForeachActiveStmt::EmitCode(FunctionEmitContext *ctx) const {
if (!ctx->GetCurrentBasicBlock())
return;
// Allocate storage for the symbol that we'll use for the uniform
// variable that holds the current program instance in each loop
// iteration.
if (sym->type == NULL) {
Assert(m->errorCount > 0);
return;
}
Assert(Type::Equal(sym->type,
AtomicType::UniformInt64->GetAsConstType()));
sym->storagePtr = ctx->AllocaInst(LLVMTypes::Int64Type, sym->name.c_str());
ctx->SetDebugPos(pos);
ctx->EmitVariableDebugInfo(sym);
// The various basic blocks that we'll need in the below
llvm::BasicBlock *bbFindNext =
ctx->CreateBasicBlock("foreach_active_find_next");
llvm::BasicBlock *bbBody = ctx->CreateBasicBlock("foreach_active_body");
llvm::BasicBlock *bbCheckForMore =
ctx->CreateBasicBlock("foreach_active_check_for_more");
llvm::BasicBlock *bbDone = ctx->CreateBasicBlock("foreach_active_done");
// Save the old mask so that we can restore it at the end
llvm::Value *oldInternalMask = ctx->GetInternalMask();
// Now, *maskBitsPtr will maintain a bitmask for the lanes that remain
// to be processed by a pass through the loop body. It starts out with
// the current execution mask (which should never be all off going in
// to this)...
llvm::Value *oldFullMask = ctx->GetFullMask();
llvm::Value *maskBitsPtr =
ctx->AllocaInst(LLVMTypes::Int64Type, "mask_bits");
llvm::Value *movmsk = ctx->LaneMask(oldFullMask);
ctx->StoreInst(movmsk, maskBitsPtr);
// Officially start the loop.
ctx->StartScope();
ctx->StartForeach(FunctionEmitContext::FOREACH_ACTIVE);
ctx->SetContinueTarget(bbCheckForMore);
// Onward to find the first set of program instance to run the loop for
ctx->BranchInst(bbFindNext);
ctx->SetCurrentBasicBlock(bbFindNext); {
// Load the bitmask of the lanes left to be processed
llvm::Value *remainingBits = ctx->LoadInst(maskBitsPtr, "remaining_bits");
// Find the index of the first set bit in the mask
llvm::Function *ctlzFunc =
m->module->getFunction("__count_trailing_zeros_i64");
Assert(ctlzFunc != NULL);
llvm::Value *firstSet = ctx->CallInst(ctlzFunc, NULL, remainingBits,
"first_set");
// Store that value into the storage allocated for the iteration
// variable.
ctx->StoreInst(firstSet, sym->storagePtr);
// Now set the execution mask to be only on for the current program
// instance. (TODO: is there a more efficient way to do this? e.g.
// for AVX1, we might want to do this as float rather than int
// math...)
// Get the "program index" vector value
llvm::Value *programIndex =
llvm::UndefValue::get(LLVMTypes::Int32VectorType);
for (int i = 0; i < g->target.vectorWidth; ++i)
programIndex = ctx->InsertInst(programIndex, LLVMInt32(i), i,
"prog_index");
// And smear the current lane out to a vector
llvm::Value *firstSet32 =
ctx->TruncInst(firstSet, LLVMTypes::Int32Type, "first_set32");
llvm::Value *firstSet32Smear = ctx->SmearUniform(firstSet32);
// Now set the execution mask based on doing a vector compare of
// these two
llvm::Value *iterMask =
ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ,
firstSet32Smear, programIndex);
iterMask = ctx->I1VecToBoolVec(iterMask);
ctx->SetInternalMask(iterMask);
// Also update the bitvector of lanes left to turn off the bit for
// the lane we're about to run.
llvm::Value *setMask =
ctx->BinaryOperator(llvm::Instruction::Shl, LLVMInt64(1),
firstSet, "set_mask");
llvm::Value *notSetMask = ctx->NotOperator(setMask);
llvm::Value *newRemaining =
ctx->BinaryOperator(llvm::Instruction::And, remainingBits,
notSetMask, "new_remaining");
ctx->StoreInst(newRemaining, maskBitsPtr);
// and onward to run the loop body...
ctx->BranchInst(bbBody);
}
ctx->SetCurrentBasicBlock(bbBody); {
// Run the code in the body of the loop. This is easy now.
if (stmts)
stmts->EmitCode(ctx);
Assert(ctx->GetCurrentBasicBlock() != NULL);
ctx->BranchInst(bbCheckForMore);
}
ctx->SetCurrentBasicBlock(bbCheckForMore); {
// At the end of the loop body (either due to running the
// statements normally, or a continue statement in the middle of
// the loop that jumps to the end, see if there are any lanes left
// to be processed.
llvm::Value *remainingBits = ctx->LoadInst(maskBitsPtr, "remaining_bits");
llvm::Value *nonZero =
ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_NE,
remainingBits, LLVMInt64(0), "remaining_ne_zero");
ctx->BranchInst(bbFindNext, bbDone, nonZero);
}
ctx->SetCurrentBasicBlock(bbDone);
ctx->SetInternalMask(oldInternalMask);
ctx->EndForeach();
ctx->EndScope();
}
void
ForeachActiveStmt::Print(int indent) const {
printf("%*cForeach_active Stmt", indent, ' ');
pos.Print();
printf("\n");
printf("%*cIter symbol: ", indent+4, ' ');
if (sym != NULL) {
printf("%s", sym->name.c_str());
if (sym->type != NULL)
printf(" %s", sym->type->GetString().c_str());
}
else
printf("NULL");
printf("\n");
printf("%*cStmts:\n", indent+4, ' ');
if (stmts != NULL)
stmts->Print(indent+8);
else
printf("NULL");
printf("\n");
}
Stmt *
ForeachActiveStmt::TypeCheck() {
if (sym == NULL)
return NULL;
return this;
}
int
ForeachActiveStmt::EstimateCost() const {
return COST_VARYING_LOOP;
}
///////////////////////////////////////////////////////////////////////////
// ForeachUniqueStmt
@@ -3043,80 +3225,3 @@ int
DeleteStmt::EstimateCost() const {
return COST_DELETE;
}
///////////////////////////////////////////////////////////////////////////
/** This generates AST nodes for an __foreach_active statement. This
construct can be synthesized ouf of the existing ForStmt (and other AST
nodes), so here we just build up the AST that we need rather than
having a new Stmt implementation for __foreach_active.
@param iterSym Symbol for the iteration variable (e.g. "i" in
__foreach_active (i) { .. .}
@param stmts Statements to execute each time through the loop, for
each active program instance.
@param pos Position of the __foreach_active statement in the source
file.
*/
Stmt *
CreateForeachActiveStmt(Symbol *iterSym, Stmt *stmts, SourcePos pos) {
if (iterSym == NULL) {
AssertPos(pos, m->errorCount > 0);
return NULL;
}
// loop initializer: set iter = 0
std::vector<VariableDeclaration> var;
ConstExpr *zeroExpr = new ConstExpr(AtomicType::UniformInt32, 0,
iterSym->pos);
var.push_back(VariableDeclaration(iterSym, zeroExpr));
Stmt *initStmt = new DeclStmt(var, iterSym->pos);
// loop test: (iter < programCount)
ConstExpr *progCountExpr =
new ConstExpr(AtomicType::UniformInt32, g->target.vectorWidth,
pos);
SymbolExpr *symExpr = new SymbolExpr(iterSym, iterSym->pos);
Expr *testExpr = new BinaryExpr(BinaryExpr::Lt, symExpr, progCountExpr,
pos);
// loop step: ++iterSym
UnaryExpr *incExpr = new UnaryExpr(UnaryExpr::PreInc, symExpr, pos);
Stmt *stepStmt = new ExprStmt(incExpr, pos);
// loop body
// First, call __movmsk(__mask)) to get the mask as a set of bits.
// This should be hoisted out of the loop
Symbol *maskSym = m->symbolTable->LookupVariable("__mask");
AssertPos(pos, maskSym != NULL);
Expr *maskVecExpr = new SymbolExpr(maskSym, pos);
std::vector<Symbol *> mmFuns;
m->symbolTable->LookupFunction("__movmsk", &mmFuns);
AssertPos(pos, mmFuns.size() == (g->target.maskBitCount == 32 ? 2 : 1));
FunctionSymbolExpr *movmskFunc = new FunctionSymbolExpr("__movmsk", mmFuns,
pos);
ExprList *movmskArgs = new ExprList(maskVecExpr, pos);
FunctionCallExpr *movmskExpr = new FunctionCallExpr(movmskFunc, movmskArgs,
pos);
// Compute the per lane mask to test the mask bits against: (1 << iter)
ConstExpr *oneExpr = new ConstExpr(AtomicType::UniformInt64, int64_t(1),
iterSym->pos);
Expr *shiftLaneExpr = new BinaryExpr(BinaryExpr::Shl, oneExpr, symExpr,
pos);
// Compute the AND: movmsk & (1 << iter)
Expr *maskAndLaneExpr = new BinaryExpr(BinaryExpr::BitAnd, movmskExpr,
shiftLaneExpr, pos);
// Test to see if it's non-zero: (mask & (1 << iter)) != 0
Expr *ifTestExpr = new BinaryExpr(BinaryExpr::NotEqual, maskAndLaneExpr,
zeroExpr, pos);
// Now, enclose the provided statements in an if test such that they
// only run if the mask is non-zero for the lane we're currently
// handling in the loop.
IfStmt *laneCheckIf = new IfStmt(ifTestExpr, stmts, NULL, false, pos);
// And return a for loop that wires it all together.
return new ForStmt(initStmt, testExpr, stepStmt, laneCheckIf, false, pos);
}

17
stmt.h
View File

@@ -260,6 +260,23 @@ public:
};
/** Iteration over each executing program instance.
*/
class ForeachActiveStmt : public Stmt {
public:
ForeachActiveStmt(Symbol *iterSym, Stmt *stmts, SourcePos pos);
void EmitCode(FunctionEmitContext *ctx) const;
void Print(int indent) const;
Stmt *TypeCheck();
int EstimateCost() const;
Symbol *sym;
Stmt *stmts;
};
/** Parallel iteration over each unique value in the given (varying)
expression.
*/

View File

@@ -0,0 +1,16 @@
export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
float a = aFOO[programIndex];
uniform int count = 0;
if (programIndex & 1)
foreach_active (i)
++count;
RET[programIndex] = count;
}
export void result(uniform float RET[]) {
RET[programIndex] = programCount / 2;
}

View File

@@ -0,0 +1,16 @@
export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
float a = aFOO[programIndex];
uniform int count = 0;
if (programIndex & 1)
foreach_active (i)
++a;
RET[programIndex] = a;
}
export void result(uniform float RET[]) {
RET[programIndex] = (1 + programIndex) + ((programIndex & 1) ? 1 : 0);
}

View File

@@ -0,0 +1,16 @@
export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
float a = aFOO[programIndex];
RET[programIndex] = a;
if (programIndex & 1)
foreach_active (i)
++RET[i];
}
export void result(uniform float RET[]) {
RET[programIndex] = (1 + programIndex) + ((programIndex & 1) ? 1 : 0);
}

View File

@@ -0,0 +1,21 @@
export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
float a = aFOO[programIndex];
RET[programIndex] = a;
if (programIndex & 1) {
foreach_active (i) {
if (i == 1)
continue;
++RET[i];
}
}
}
export void result(uniform float RET[]) {
RET[programIndex] = (1 + programIndex) + ((programIndex & 1) ? 1 : 0);
--RET[1];
}

View File

@@ -0,0 +1,22 @@
export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
float a = aFOO[programIndex];
RET[programIndex] = a;
if (programIndex & 1) {
foreach_active (i) {
if (i & 1)
continue;
int * uniform null = 0;
*null = 0;
}
}
}
export void result(uniform float RET[]) {
RET[programIndex] = (1 + programIndex);
}