17
ast.cpp
17
ast.cpp
@@ -92,6 +92,7 @@ WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc,
|
|||||||
DoStmt *dos;
|
DoStmt *dos;
|
||||||
ForStmt *fs;
|
ForStmt *fs;
|
||||||
ForeachStmt *fes;
|
ForeachStmt *fes;
|
||||||
|
ForeachActiveStmt *fas;
|
||||||
ForeachUniqueStmt *fus;
|
ForeachUniqueStmt *fus;
|
||||||
CaseStmt *cs;
|
CaseStmt *cs;
|
||||||
DefaultStmt *defs;
|
DefaultStmt *defs;
|
||||||
@@ -138,6 +139,9 @@ WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc,
|
|||||||
postFunc, data);
|
postFunc, data);
|
||||||
fes->stmts = (Stmt *)WalkAST(fes->stmts, preFunc, postFunc, data);
|
fes->stmts = (Stmt *)WalkAST(fes->stmts, preFunc, postFunc, data);
|
||||||
}
|
}
|
||||||
|
else if ((fas = dynamic_cast<ForeachActiveStmt *>(node)) != NULL) {
|
||||||
|
fas->stmts = (Stmt *)WalkAST(fas->stmts, preFunc, postFunc, data);
|
||||||
|
}
|
||||||
else if ((fus = dynamic_cast<ForeachUniqueStmt *>(node)) != NULL) {
|
else if ((fus = dynamic_cast<ForeachUniqueStmt *>(node)) != NULL) {
|
||||||
fus->expr = (Expr *)WalkAST(fus->expr, preFunc, postFunc, data);
|
fus->expr = (Expr *)WalkAST(fus->expr, preFunc, postFunc, data);
|
||||||
fus->stmts = (Stmt *)WalkAST(fus->stmts, preFunc, postFunc, data);
|
fus->stmts = (Stmt *)WalkAST(fus->stmts, preFunc, postFunc, data);
|
||||||
@@ -391,14 +395,15 @@ lCheckAllOffSafety(ASTNode *node, void *data) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (dynamic_cast<ForeachStmt *>(node) != NULL ||
|
if (dynamic_cast<ForeachStmt *>(node) != NULL ||
|
||||||
|
dynamic_cast<ForeachActiveStmt *>(node) != NULL ||
|
||||||
dynamic_cast<ForeachUniqueStmt *>(node) != NULL) {
|
dynamic_cast<ForeachUniqueStmt *>(node) != NULL) {
|
||||||
// foreach() statements also shouldn't be run with an all-off mask.
|
// The various foreach statements also shouldn't be run with an
|
||||||
// Since they re-establish an 'all on' mask, this would be pretty
|
// all-off mask. Since they can re-establish an 'all on' mask,
|
||||||
// unintuitive. (More generally, it's possibly a little strange to
|
// this would be pretty unintuitive. (More generally, it's
|
||||||
// allow foreach() in the presence of any non-uniform control
|
// possibly a little strange to allow foreach in the presence of
|
||||||
// flow...)
|
// any non-uniform control flow...)
|
||||||
//
|
//
|
||||||
// Similarly, the implementation foreach_unique assumes as a
|
// Similarly, the implementation of foreach_unique assumes as a
|
||||||
// precondition that the mask won't be all off going into it, so
|
// precondition that the mask won't be all off going into it, so
|
||||||
// we'll enforce that here...
|
// we'll enforce that here...
|
||||||
*okPtr = false;
|
*okPtr = false;
|
||||||
|
|||||||
29
ctx.cpp
29
ctx.cpp
@@ -89,12 +89,14 @@ struct CFInfo {
|
|||||||
bool IsIf() { return type == If; }
|
bool IsIf() { return type == If; }
|
||||||
bool IsLoop() { return type == Loop; }
|
bool IsLoop() { return type == Loop; }
|
||||||
bool IsForeach() { return (type == ForeachRegular ||
|
bool IsForeach() { return (type == ForeachRegular ||
|
||||||
|
type == ForeachActive ||
|
||||||
type == ForeachUnique); }
|
type == ForeachUnique); }
|
||||||
bool IsSwitch() { return type == Switch; }
|
bool IsSwitch() { return type == Switch; }
|
||||||
bool IsVarying() { return !isUniform; }
|
bool IsVarying() { return !isUniform; }
|
||||||
bool IsUniform() { return isUniform; }
|
bool IsUniform() { return isUniform; }
|
||||||
|
|
||||||
enum CFType { If, Loop, ForeachRegular, ForeachUnique, Switch };
|
enum CFType { If, Loop, ForeachRegular, ForeachActive, ForeachUnique,
|
||||||
|
Switch };
|
||||||
CFType type;
|
CFType type;
|
||||||
bool isUniform;
|
bool isUniform;
|
||||||
llvm::BasicBlock *savedBreakTarget, *savedContinueTarget;
|
llvm::BasicBlock *savedBreakTarget, *savedContinueTarget;
|
||||||
@@ -143,7 +145,7 @@ private:
|
|||||||
CFInfo(CFType t, llvm::BasicBlock *bt, llvm::BasicBlock *ct,
|
CFInfo(CFType t, llvm::BasicBlock *bt, llvm::BasicBlock *ct,
|
||||||
llvm::Value *sb, llvm::Value *sc, llvm::Value *sm,
|
llvm::Value *sb, llvm::Value *sc, llvm::Value *sm,
|
||||||
llvm::Value *lm) {
|
llvm::Value *lm) {
|
||||||
Assert(t == ForeachRegular || t == ForeachUnique);
|
Assert(t == ForeachRegular || t == ForeachActive || t == ForeachUnique);
|
||||||
type = t;
|
type = t;
|
||||||
isUniform = false;
|
isUniform = false;
|
||||||
savedBreakTarget = bt;
|
savedBreakTarget = bt;
|
||||||
@@ -190,6 +192,9 @@ CFInfo::GetForeach(FunctionEmitContext::ForeachType ft,
|
|||||||
case FunctionEmitContext::FOREACH_REGULAR:
|
case FunctionEmitContext::FOREACH_REGULAR:
|
||||||
cfType = ForeachRegular;
|
cfType = ForeachRegular;
|
||||||
break;
|
break;
|
||||||
|
case FunctionEmitContext::FOREACH_ACTIVE:
|
||||||
|
cfType = ForeachActive;
|
||||||
|
break;
|
||||||
case FunctionEmitContext::FOREACH_UNIQUE:
|
case FunctionEmitContext::FOREACH_UNIQUE:
|
||||||
cfType = ForeachUnique;
|
cfType = ForeachUnique;
|
||||||
break;
|
break;
|
||||||
@@ -744,6 +749,16 @@ FunctionEmitContext::Break(bool doCoherenceCheck) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
static bool
|
||||||
|
lEnclosingLoopIsForeachActive(const std::vector<CFInfo *> &controlFlowInfo) {
|
||||||
|
for (int i = (int)controlFlowInfo.size() - 1; i >= 0; --i) {
|
||||||
|
if (controlFlowInfo[i]->type == CFInfo::ForeachActive)
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
void
|
void
|
||||||
FunctionEmitContext::Continue(bool doCoherenceCheck) {
|
FunctionEmitContext::Continue(bool doCoherenceCheck) {
|
||||||
if (!continueTarget) {
|
if (!continueTarget) {
|
||||||
@@ -753,12 +768,16 @@ FunctionEmitContext::Continue(bool doCoherenceCheck) {
|
|||||||
}
|
}
|
||||||
AssertPos(currentPos, controlFlowInfo.size() > 0);
|
AssertPos(currentPos, controlFlowInfo.size() > 0);
|
||||||
|
|
||||||
if (ifsInCFAllUniform(CFInfo::Loop)) {
|
if (ifsInCFAllUniform(CFInfo::Loop) ||
|
||||||
|
lEnclosingLoopIsForeachActive(controlFlowInfo)) {
|
||||||
// Similarly to 'break' statements, we can immediately jump to the
|
// Similarly to 'break' statements, we can immediately jump to the
|
||||||
// continue target if we're only in 'uniform' control flow within
|
// continue target if we're only in 'uniform' control flow within
|
||||||
// loop or if we can tell that the mask is all on.
|
// loop or if we can tell that the mask is all on. Here, we can
|
||||||
|
// also jump if the enclosing loop is a 'foreach_active' loop, in
|
||||||
|
// which case we know that only a single program instance is
|
||||||
|
// executing.
|
||||||
AddInstrumentationPoint("continue: uniform CF, jumped");
|
AddInstrumentationPoint("continue: uniform CF, jumped");
|
||||||
if (ifsInCFAllUniform(CFInfo::Loop) && doCoherenceCheck)
|
if (doCoherenceCheck)
|
||||||
Warning(currentPos, "Coherent continue statement not necessary in "
|
Warning(currentPos, "Coherent continue statement not necessary in "
|
||||||
"fully uniform control flow.");
|
"fully uniform control flow.");
|
||||||
BranchInst(continueTarget);
|
BranchInst(continueTarget);
|
||||||
|
|||||||
6
ctx.h
6
ctx.h
@@ -160,9 +160,9 @@ public:
|
|||||||
finished. */
|
finished. */
|
||||||
void EndLoop();
|
void EndLoop();
|
||||||
|
|
||||||
/** Indicates that code generation for a 'foreach', 'foreach_tiled', or
|
/** Indicates that code generation for a 'foreach', 'foreach_tiled',
|
||||||
'foreach_unique' loop is about to start. */
|
'foreach_active', or 'foreach_unique' loop is about to start. */
|
||||||
enum ForeachType { FOREACH_REGULAR, FOREACH_UNIQUE };
|
enum ForeachType { FOREACH_REGULAR, FOREACH_ACTIVE, FOREACH_UNIQUE };
|
||||||
void StartForeach(ForeachType ft);
|
void StartForeach(ForeachType ft);
|
||||||
void EndForeach();
|
void EndForeach();
|
||||||
|
|
||||||
|
|||||||
376
docs/ispc.rst
376
docs/ispc.rst
@@ -105,12 +105,16 @@ Contents:
|
|||||||
|
|
||||||
* `Conditional Statements: "if"`_
|
* `Conditional Statements: "if"`_
|
||||||
* `Conditional Statements: "switch"`_
|
* `Conditional Statements: "switch"`_
|
||||||
* `Basic Iteration Statements: "for", "while", and "do"`_
|
* `Iteration Statements`_
|
||||||
* `Iteration over unique elements: "foreach_unique"`_
|
|
||||||
|
+ `Basic Iteration Statements: "for", "while", and "do"`_
|
||||||
|
+ `Iteration over active program instances: "foreach_active"`_
|
||||||
|
+ `Iteration over unique elements: "foreach_unique"`_
|
||||||
|
+ `Parallel Iteration Statements: "foreach" and "foreach_tiled"`_
|
||||||
|
+ `Parallel Iteration with "programIndex" and "programCount"`_
|
||||||
|
|
||||||
* `Unstructured Control Flow: "goto"`_
|
* `Unstructured Control Flow: "goto"`_
|
||||||
* `"Coherent" Control Flow Statements: "cif" and Friends`_
|
* `"Coherent" Control Flow Statements: "cif" and Friends`_
|
||||||
* `Parallel Iteration Statements: "foreach" and "foreach_tiled"`_
|
|
||||||
* `Parallel Iteration with "programIndex" and "programCount"`_
|
|
||||||
* `Functions and Function Calls`_
|
* `Functions and Function Calls`_
|
||||||
|
|
||||||
+ `Function Overloading`_
|
+ `Function Overloading`_
|
||||||
@@ -1984,7 +1988,7 @@ format in memory; the benefits from SOA layout are discussed in more detail
|
|||||||
in the `Use "Structure of Arrays" Layout When Possible`_ section in the
|
in the `Use "Structure of Arrays" Layout When Possible`_ section in the
|
||||||
ispc Performance Guide.
|
ispc Performance Guide.
|
||||||
|
|
||||||
.. _Use "Structure of Arrays" Layout When Possible: perf.html#use-structure-of-arrays-layout-when-possible
|
.. _Use "Structure of Arrays" Layout When Possible: perfguide.html#use-structure-of-arrays-layout-when-possible
|
||||||
|
|
||||||
``ispc`` provides two key language-level capabilities for laying out and
|
``ispc`` provides two key language-level capabilities for laying out and
|
||||||
accessing data in SOA format:
|
accessing data in SOA format:
|
||||||
@@ -2348,11 +2352,19 @@ code below.
|
|||||||
x *= x;
|
x *= x;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
Iteration Statements
|
||||||
|
--------------------
|
||||||
|
|
||||||
|
In addition to the standard iteration statements ``for``, ``while``, and
|
||||||
|
``do``, inherited from C/C++, ``ispc`` provides a number of additional
|
||||||
|
specialized ways to iterate over data.
|
||||||
|
|
||||||
Basic Iteration Statements: "for", "while", and "do"
|
Basic Iteration Statements: "for", "while", and "do"
|
||||||
----------------------------------------------------
|
----------------------------------------------------
|
||||||
|
|
||||||
``ispc`` supports ``for``, ``while``, and ``do`` loops, with the same
|
``ispc`` supports ``for``, ``while``, and ``do`` loops, with the same
|
||||||
specification as in C. Like C++, variables can be declared in the ``for``
|
specification as in C. As in C++, variables can be declared in the ``for``
|
||||||
statement itself:
|
statement itself:
|
||||||
|
|
||||||
::
|
::
|
||||||
@@ -2374,6 +2386,58 @@ executing code in the loop body that didn't execute the ``continue`` will
|
|||||||
be unaffected by it.
|
be unaffected by it.
|
||||||
|
|
||||||
|
|
||||||
|
Iteration over active program instances: "foreach_active"
|
||||||
|
---------------------------------------------------------
|
||||||
|
|
||||||
|
The ``foreach_active`` construct specifies a loop that serializes over the
|
||||||
|
active program instances: the loop body executes once for each active
|
||||||
|
program instance, and with only that program instance executing.
|
||||||
|
|
||||||
|
As an example of the use of this construct, consider an application where
|
||||||
|
each program instance independently computes an offset into a shared array
|
||||||
|
that is being updated:
|
||||||
|
|
||||||
|
::
|
||||||
|
|
||||||
|
uniform float array[...] = { ... };
|
||||||
|
int index = ...;
|
||||||
|
++array[index];
|
||||||
|
|
||||||
|
If more than one active program instance computes the same value for
|
||||||
|
``index``, the above code has undefined behavior (see the section `Data
|
||||||
|
Races Within a Gang`_ for details.) The increment of ``array[index]``
|
||||||
|
could instead be written inside a ``foreach_active`` statement:
|
||||||
|
|
||||||
|
::
|
||||||
|
|
||||||
|
foreach_active (i) {
|
||||||
|
++array[index];
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
The variable name provided in parenthesis after the ``foreach_active``
|
||||||
|
keyword (here, ``index``), causes a ``const uniform int64`` local variable
|
||||||
|
of that name to be declared, where the variable takes the ``programIndex``
|
||||||
|
value of the program instance executing at each loop iteraton.
|
||||||
|
|
||||||
|
In the code above, because only one program instance is executing at a time
|
||||||
|
when the loop body executes, the update to ``array`` is well-defined.
|
||||||
|
Note that for this particular example, the "local atomic" operations in
|
||||||
|
the standard library could be used instead to safely update ``array``.
|
||||||
|
However, local atomics functions aren't always available or appropriate for
|
||||||
|
more complex cases.)
|
||||||
|
|
||||||
|
``continue`` statements may be used inside ``foreach_active`` loops, though
|
||||||
|
``break`` and ``return`` are prohibited. The order in which the active
|
||||||
|
program instances are processed in the loop is not defined.
|
||||||
|
|
||||||
|
See the `Using "foreach_active" Effectively`_ Section in the ispc
|
||||||
|
Performance Guide for more details about ``foreach_active``.
|
||||||
|
|
||||||
|
.. _Using "foreach_active" Effectively: perfguide.html#using-foreach-active-effectively
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
Iteration over unique elements: "foreach_unique"
|
Iteration over unique elements: "foreach_unique"
|
||||||
------------------------------------------------
|
------------------------------------------------
|
||||||
|
|
||||||
@@ -2408,7 +2472,144 @@ evaluated once, and it must be of an atomic type (``float``, ``int``,
|
|||||||
etc.), an ``enum`` type, or a pointer type. The iteration variable ``val``
|
etc.), an ``enum`` type, or a pointer type. The iteration variable ``val``
|
||||||
is a variable of ``const uniform`` type of the iteration type; it can't be
|
is a variable of ``const uniform`` type of the iteration type; it can't be
|
||||||
modified within the loop. Finally, ``break`` and ``return`` statements are
|
modified within the loop. Finally, ``break`` and ``return`` statements are
|
||||||
illegal within the loop body, but ``continue`` statemetns are allowed.
|
illegal within the loop body, but ``continue`` statements are allowed.
|
||||||
|
|
||||||
|
|
||||||
|
Parallel Iteration Statements: "foreach" and "foreach_tiled"
|
||||||
|
------------------------------------------------------------
|
||||||
|
|
||||||
|
The ``foreach`` and ``foreach_tiled`` constructs specify loops over a
|
||||||
|
possibly multi-dimensional domain of integer ranges. Their role goes
|
||||||
|
beyond "syntactic sugar"; they provides one of the two key ways of
|
||||||
|
expressing parallel computation in ``ispc``.
|
||||||
|
|
||||||
|
In general, a ``foreach`` or ``foreach_tiled`` statement takes one or more
|
||||||
|
dimension specifiers separated by commas, where each dimension is specified
|
||||||
|
by ``identifier = start ... end``, where ``start`` is a signed integer
|
||||||
|
value less than or equal to ``end``, specifying iteration over all integer
|
||||||
|
values from ``start`` up to and including ``end-1``. An arbitrary number
|
||||||
|
of iteration dimensions may be specified, with each one spanning a
|
||||||
|
different range of values. Within the ``foreach`` loop, the given
|
||||||
|
identifiers are available as ``const varying int32`` variables. The
|
||||||
|
execution mask starts out "all on" at the start of each ``foreach`` loop
|
||||||
|
iteration, but may be changed by control flow constructs within the loop.
|
||||||
|
|
||||||
|
It is illegal to have a ``break`` statement or a ``return`` statement
|
||||||
|
within a ``foreach`` loop; a compile-time error will be issued in this
|
||||||
|
case. (It is legal to have a ``break`` in a regular ``for`` loop that's
|
||||||
|
nested inside a ``foreach`` loop.) ``continue`` statements are legal in
|
||||||
|
``foreach`` loops; they have the same effect as in regular ``for`` loops:
|
||||||
|
a program instances that executes a ``continue`` statement effectively
|
||||||
|
skips over the rest of the loop body for the current iteration.
|
||||||
|
|
||||||
|
It is also currently illegal to have nested ``foreach`` statements; this
|
||||||
|
limitation will be removed in a future release of ``ispc``.
|
||||||
|
|
||||||
|
As a specific example, consider the following ``foreach`` statement:
|
||||||
|
|
||||||
|
::
|
||||||
|
|
||||||
|
foreach (j = 0 ... height, i = 0 ... width) {
|
||||||
|
// loop body--process data element (i,j)
|
||||||
|
}
|
||||||
|
|
||||||
|
It specifies a loop over a 2D domain, where the ``j`` variable goes from 0
|
||||||
|
to ``height-1`` and ``i`` goes from 0 to ``width-1``. Within the loop, the
|
||||||
|
variables ``i`` and ``j`` are available and initialized accordingly.
|
||||||
|
|
||||||
|
``foreach`` loops actually cause the given iteration domain to be
|
||||||
|
automatically mapped to the program instances in the gang, so that all of
|
||||||
|
the data can be processed, in gang-sized chunks. As a specific example,
|
||||||
|
consider a simple ``foreach`` loop like the following, on a target where
|
||||||
|
the gang size is 8:
|
||||||
|
|
||||||
|
::
|
||||||
|
|
||||||
|
foreach (i = 0 ... 16) {
|
||||||
|
// perform computation on element i
|
||||||
|
}
|
||||||
|
|
||||||
|
One possible valid execution path of this loop would be for the program
|
||||||
|
counter the step through the statements of this loop just ``16/8==2``
|
||||||
|
times; the first time through, with the ``varying int32`` variable ``i``
|
||||||
|
having the values (0,1,2,3,4,5,6,7) over the program instances, and the
|
||||||
|
second time through, having the values (8,9,10,11,12,13,14,15), thus
|
||||||
|
mapping the available program instances to all of the data by the end of
|
||||||
|
the loop's execution.
|
||||||
|
|
||||||
|
In general, however, you shouldn't make any assumptions about the order in
|
||||||
|
which elements of the iteration domain will be processed by a ``foreach``
|
||||||
|
loop. For example, the following code exhibits undefined behavior:
|
||||||
|
|
||||||
|
::
|
||||||
|
|
||||||
|
uniform float a[10][100];
|
||||||
|
foreach (i = 0 ... 10, j = 0 ... 100) {
|
||||||
|
if (i == 0)
|
||||||
|
a[i][j] = j;
|
||||||
|
else
|
||||||
|
// Error: can't assume that a[i-1][j] has been set yet
|
||||||
|
a[i][j] = a[i-1][j];
|
||||||
|
|
||||||
|
The ``foreach`` statement generally subdivides the iteration domain by
|
||||||
|
selecting sets of contiguous elements in the inner-most dimension of the
|
||||||
|
iteration domain. This decomposition approach generally leads to coherent
|
||||||
|
memory reads and writes, but may lead to worse control flow coherence than
|
||||||
|
other decompositions.
|
||||||
|
|
||||||
|
Therefore, ``foreach_tiled`` decomposes the iteration domain in a way that
|
||||||
|
tries to map locations in the domain to program instances in a way that is
|
||||||
|
compact across all of the dimensions. For example, on a target with an
|
||||||
|
8-wide gang size, the following ``foreach_tiled`` statement might process
|
||||||
|
the iteration domain in chunks of 2 elements in ``j`` and 4 elements in
|
||||||
|
``i`` each time. (The trade-offs between these two constructs are
|
||||||
|
discussed in more detail in the `ispc Performance Guide`_.)
|
||||||
|
|
||||||
|
.. _ispc Performance Guide: perfguide.html#improving-control-flow-coherence-with-foreach-tiled
|
||||||
|
|
||||||
|
::
|
||||||
|
|
||||||
|
foreach_tiled (j = 0 ... height, i = 0 ... width) {
|
||||||
|
// loop body--process data element (i,j)
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
Parallel Iteration with "programIndex" and "programCount"
|
||||||
|
---------------------------------------------------------
|
||||||
|
|
||||||
|
In addition to ``foreach`` and ``foreach_tiled``, ``ispc`` provides a
|
||||||
|
lower-level mechanism for mapping SPMD program instances to data to operate
|
||||||
|
on via the built-in ``programIndex`` and ``programCount`` variables.
|
||||||
|
|
||||||
|
``programIndex`` gives the index of the SIMD-lane being used for running
|
||||||
|
each program instance. (In other words, it's a varying integer value that
|
||||||
|
has value zero for the first program instance, and so forth.) The
|
||||||
|
``programCount`` builtin gives the total number of instances in the gang.
|
||||||
|
Together, these can be used to uniquely map executing program instances to
|
||||||
|
input data. [#]_
|
||||||
|
|
||||||
|
.. [#] ``programIndex`` is analogous to ``get_global_id()`` in OpenCL* and
|
||||||
|
``threadIdx`` in CUDA*.
|
||||||
|
|
||||||
|
As a specific example, consider an ``ispc`` function that needs to perform
|
||||||
|
some computation on an array of data.
|
||||||
|
|
||||||
|
::
|
||||||
|
|
||||||
|
for (uniform int i = 0; i < count; i += programCount) {
|
||||||
|
float d = data[i + programIndex];
|
||||||
|
float r = ....
|
||||||
|
result[i + programIndex] = r;
|
||||||
|
}
|
||||||
|
|
||||||
|
Here, we've written a loop that explicitly loops over the data in chunks of
|
||||||
|
``programCount`` elements. In each loop iteration, the running program
|
||||||
|
instances effectively collude amongst themselves using ``programIndex`` to
|
||||||
|
determine which elements to work on in a way that ensures that all of the
|
||||||
|
data elements will be processed. In this particular case, a ``foreach``
|
||||||
|
loop would be preferable, as ``foreach`` naturally handles the case where
|
||||||
|
``programCount`` doesn't evenly divide the number of elements to be
|
||||||
|
processed, while the loop above assumes that case implicitly.
|
||||||
|
|
||||||
|
|
||||||
Unstructured Control Flow: "goto"
|
Unstructured Control Flow: "goto"
|
||||||
@@ -2479,139 +2680,6 @@ constructs in ``ispc`` that a loop will never be executed with an "all off"
|
|||||||
execution mask.
|
execution mask.
|
||||||
|
|
||||||
|
|
||||||
Parallel Iteration Statements: "foreach" and "foreach_tiled"
|
|
||||||
------------------------------------------------------------
|
|
||||||
|
|
||||||
The ``foreach`` and ``foreach_tiled`` constructs specify loops over a
|
|
||||||
possibly multi-dimensional domain of integer ranges. Their role goes
|
|
||||||
beyond "syntactic sugar"; they provides one of the two key ways of
|
|
||||||
expressing parallel computation in ``ispc``.
|
|
||||||
|
|
||||||
In general, a ``foreach`` or ``foreach_tiled`` statement takes one or more
|
|
||||||
dimension specifiers separated by commas, where each dimension is specified
|
|
||||||
by ``identifier = start ... end``, where ``start`` is a signed integer
|
|
||||||
value less than or equal to ``end``, specifying iteration over all integer
|
|
||||||
values from ``start`` up to and including ``end-1``. An arbitrary number
|
|
||||||
of iteration dimensions may be specified, with each one spanning a
|
|
||||||
different range of values. Within the ``foreach`` loop, the given
|
|
||||||
identifiers are available as ``const varying int32`` variables. The
|
|
||||||
execution mask starts out "all on" at the start of each ``foreach`` loop
|
|
||||||
iteration, but may be changed by control flow constructs within the loop.
|
|
||||||
|
|
||||||
It is illegal to have a ``break`` statement or a ``return`` statement
|
|
||||||
within a ``foreach`` loop; a compile-time error will be issued in this
|
|
||||||
case. (It is legal to have a ``break`` in a regular ``for`` loop that's
|
|
||||||
nested inside a ``foreach`` loop.) ``continue`` statements are legal in
|
|
||||||
``foreach`` loops; they have the same effect as in regular ``for`` loops:
|
|
||||||
a program instances that executes a ``continue`` statement effectively
|
|
||||||
skips over the rest of the loop body for the current iteration.
|
|
||||||
|
|
||||||
As a specific example, consider the following ``foreach`` statement:
|
|
||||||
|
|
||||||
::
|
|
||||||
|
|
||||||
foreach (j = 0 ... height, i = 0 ... width) {
|
|
||||||
// loop body--process data element (i,j)
|
|
||||||
}
|
|
||||||
|
|
||||||
It specifies a loop over a 2D domain, where the ``j`` variable goes from 0
|
|
||||||
to ``height-1`` and ``i`` goes from 0 to ``width-1``. Within the loop, the
|
|
||||||
variables ``i`` and ``j`` are available and initialized accordingly.
|
|
||||||
|
|
||||||
``foreach`` loops actually cause the given iteration domain to be
|
|
||||||
automatically mapped to the program instances in the gang, so that all of
|
|
||||||
the data can be processed, in gang-sized chunks. As a specific example,
|
|
||||||
consider a simple ``foreach`` loop like the following, on a target where
|
|
||||||
the gang size is 8:
|
|
||||||
|
|
||||||
::
|
|
||||||
|
|
||||||
foreach (i = 0 ... 16) {
|
|
||||||
// perform computation on element i
|
|
||||||
}
|
|
||||||
|
|
||||||
One possible valid execution path of this loop would be for the program
|
|
||||||
counter the step through the statements of this loop just ``16/8==2``
|
|
||||||
times; the first time through, with the ``varying int32`` variable ``i``
|
|
||||||
having the values (0,1,2,3,4,5,6,7) over the program instances, and the
|
|
||||||
second time through, having the values (8,9,10,11,12,13,14,15), thus
|
|
||||||
mapping the available program instances to all of the data by the end of
|
|
||||||
the loop's execution.
|
|
||||||
|
|
||||||
In general, however, you shouldn't make any assumptions about the order in
|
|
||||||
which elements of the iteration domain will be processed by a ``foreach``
|
|
||||||
loop. For example, the following code exhibits undefined behavior:
|
|
||||||
|
|
||||||
::
|
|
||||||
|
|
||||||
uniform float a[10][100];
|
|
||||||
foreach (i = 0 ... 10, j = 0 ... 100) {
|
|
||||||
if (i == 0)
|
|
||||||
a[i][j] = j;
|
|
||||||
else
|
|
||||||
// Error: can't assume that a[i-1][j] has been set yet
|
|
||||||
a[i][j] = a[i-1][j];
|
|
||||||
|
|
||||||
The ``foreach`` statement generally subdivides the iteration domain by
|
|
||||||
selecting sets of contiguous elements in the inner-most dimension of the
|
|
||||||
iteration domain. This decomposition approach generally leads to coherent
|
|
||||||
memory reads and writes, but may lead to worse control flow coherence than
|
|
||||||
other decompositions.
|
|
||||||
|
|
||||||
Therefore, ``foreach_tiled`` decomposes the iteration domain in a way that
|
|
||||||
tries to map locations in the domain to program instances in a way that is
|
|
||||||
compact across all of the dimensions. For example, on a target with an
|
|
||||||
8-wide gang size, the following ``foreach_tiled`` statement might process
|
|
||||||
the iteration domain in chunks of 2 elements in ``j`` and 4 elements in
|
|
||||||
``i`` each time. (The trade-offs between these two constructs are
|
|
||||||
discussed in more detail in the `ispc Performance Guide`_.)
|
|
||||||
|
|
||||||
.. _ispc Performance Guide: perf.html#improving-control-flow-coherence-with-foreach-tiled
|
|
||||||
|
|
||||||
::
|
|
||||||
|
|
||||||
foreach_tiled (j = 0 ... height, i = 0 ... width) {
|
|
||||||
// loop body--process data element (i,j)
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
Parallel Iteration with "programIndex" and "programCount"
|
|
||||||
---------------------------------------------------------
|
|
||||||
|
|
||||||
In addition to ``foreach`` and ``foreach_tiled``, ``ispc`` provides a
|
|
||||||
lower-level mechanism for mapping SPMD program instances to data to operate
|
|
||||||
on via the built-in ``programIndex`` and ``programCount`` variables.
|
|
||||||
|
|
||||||
``programIndex`` gives the index of the SIMD-lane being used for running
|
|
||||||
each program instance. (In other words, it's a varying integer value that
|
|
||||||
has value zero for the first program instance, and so forth.) The
|
|
||||||
``programCount`` builtin gives the total number of instances in the gang.
|
|
||||||
Together, these can be used to uniquely map executing program instances to
|
|
||||||
input data. [#]_
|
|
||||||
|
|
||||||
.. [#] ``programIndex`` is analogous to ``get_global_id()`` in OpenCL* and
|
|
||||||
``threadIdx`` in CUDA*.
|
|
||||||
|
|
||||||
As a specific example, consider an ``ispc`` function that needs to perform
|
|
||||||
some computation on an array of data.
|
|
||||||
|
|
||||||
::
|
|
||||||
|
|
||||||
for (uniform int i = 0; i < count; i += programCount) {
|
|
||||||
float d = data[i + programIndex];
|
|
||||||
float r = ....
|
|
||||||
result[i + programIndex] = r;
|
|
||||||
}
|
|
||||||
|
|
||||||
Here, we've written a loop that explicitly loops over the data in chunks of
|
|
||||||
``programCount`` elements. In each loop iteration, the running program
|
|
||||||
instances effectively collude amongst themselves using ``programIndex`` to
|
|
||||||
determine which elements to work on in a way that ensures that all of the
|
|
||||||
data elements will be processed. In this particular case, a ``foreach``
|
|
||||||
loop would be preferable, as ``foreach`` naturally handles the case where
|
|
||||||
``programCount`` doesn't evenly divide the number of elements to be
|
|
||||||
processed, while the loop above assumes that case implicitly.
|
|
||||||
|
|
||||||
Functions and Function Calls
|
Functions and Function Calls
|
||||||
----------------------------
|
----------------------------
|
||||||
|
|
||||||
@@ -3452,7 +3520,7 @@ There are also variants of these functions that return the value as a
|
|||||||
discussion of an application of this variant to improve memory access
|
discussion of an application of this variant to improve memory access
|
||||||
performance in the `Performance Guide`_.
|
performance in the `Performance Guide`_.
|
||||||
|
|
||||||
.. _Performance Guide: perf.html#understanding-gather-and-scatter
|
.. _Performance Guide: perfguide.html#understanding-gather-and-scatter
|
||||||
|
|
||||||
::
|
::
|
||||||
|
|
||||||
@@ -4130,8 +4198,10 @@ from ``ispc`` must be declared as follows:
|
|||||||
It is illegal to overload functions declared with ``extern "C"`` linkage;
|
It is illegal to overload functions declared with ``extern "C"`` linkage;
|
||||||
``ispc`` issues an error in this case.
|
``ispc`` issues an error in this case.
|
||||||
|
|
||||||
Function calls back to C/C++ are not made if none of the program instances
|
**Only a single function call is made back to C++ for the entire gang of
|
||||||
want to make the call. For example, given code like:
|
runing program instances**. Furthermore, function calls back to C/C++ are not
|
||||||
|
made if none of the program instances want to make the call. For example,
|
||||||
|
given code like:
|
||||||
|
|
||||||
::
|
::
|
||||||
|
|
||||||
@@ -4174,6 +4244,24 @@ Application code can thus be written as:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
In some cases, it can be desirable to generate a single call for each
|
||||||
|
executing program instance, rather than one call for a gang. For example,
|
||||||
|
the code below shows how one might call an existing math library routine
|
||||||
|
that takes a scalar parameter.
|
||||||
|
|
||||||
|
::
|
||||||
|
|
||||||
|
extern "C" uniform double erf(uniform double);
|
||||||
|
double v = ...;
|
||||||
|
double result;
|
||||||
|
foreach_active (instance) {
|
||||||
|
uniform double r = erf(extract(v, instance));
|
||||||
|
result = insert(result, instance, r);
|
||||||
|
}
|
||||||
|
|
||||||
|
This code calls ``erf()`` once for each active program instance, passing it
|
||||||
|
the program instance's value of ``v`` and storing the result in the
|
||||||
|
instance's ``result`` value.
|
||||||
|
|
||||||
Data Layout
|
Data Layout
|
||||||
-----------
|
-----------
|
||||||
@@ -4309,7 +4397,7 @@ can also have a significant effect on performance; in general, creating
|
|||||||
groups of work that will tend to do similar computation across the SPMD
|
groups of work that will tend to do similar computation across the SPMD
|
||||||
program instances improves performance.
|
program instances improves performance.
|
||||||
|
|
||||||
.. _ispc Performance Tuning Guide: http://ispc.github.com/perf.html
|
.. _ispc Performance Tuning Guide: http://ispc.github.com/perfguide.html
|
||||||
|
|
||||||
|
|
||||||
Disclaimer and Legal Information
|
Disclaimer and Legal Information
|
||||||
|
|||||||
@@ -21,6 +21,7 @@ the most out of ``ispc`` in practice.
|
|||||||
+ `Avoid 64-bit Addressing Calculations When Possible`_
|
+ `Avoid 64-bit Addressing Calculations When Possible`_
|
||||||
+ `Avoid Computation With 8 and 16-bit Integer Types`_
|
+ `Avoid Computation With 8 and 16-bit Integer Types`_
|
||||||
+ `Implementing Reductions Efficiently`_
|
+ `Implementing Reductions Efficiently`_
|
||||||
|
+ `Using "foreach_active" Effectively`_
|
||||||
+ `Using Low-level Vector Tricks`_
|
+ `Using Low-level Vector Tricks`_
|
||||||
+ `The "Fast math" Option`_
|
+ `The "Fast math" Option`_
|
||||||
+ `"inline" Aggressively`_
|
+ `"inline" Aggressively`_
|
||||||
@@ -510,6 +511,43 @@ values--very efficient code in the end.
|
|||||||
return reduce_add(sum);
|
return reduce_add(sum);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Using "foreach_active" Effectively
|
||||||
|
----------------------------------
|
||||||
|
|
||||||
|
For high-performance code,
|
||||||
|
|
||||||
|
For example, consider this segment of code, from the introduction of
|
||||||
|
``foreach_active`` in the ispc User's Guide:
|
||||||
|
|
||||||
|
::
|
||||||
|
|
||||||
|
uniform float array[...] = { ... };
|
||||||
|
int index = ...;
|
||||||
|
foreach_active (i) {
|
||||||
|
++array[index];
|
||||||
|
}
|
||||||
|
|
||||||
|
Here, ``index`` was assumed to possibly have the same value for multiple
|
||||||
|
program instances, so the updates to ``array[index]`` are serialized by the
|
||||||
|
``foreach_active`` statement in order to not have undefined results when
|
||||||
|
``index`` values do collide.
|
||||||
|
|
||||||
|
The code generated by the compiler can be improved in this case by making
|
||||||
|
it clear that only a single element of the array is accessed by
|
||||||
|
``array[index]`` and that thus a general gather or scatter isn't required.
|
||||||
|
Specifically, by using the ``extract()`` function from the standard library
|
||||||
|
to extract the current program instance's value of ``index`` into a
|
||||||
|
``uniform`` variable and then using that to index into ``array``, as below,
|
||||||
|
more efficient code is generated.
|
||||||
|
|
||||||
|
::
|
||||||
|
|
||||||
|
foreach_active (instanceNum) {
|
||||||
|
uniform int unifIndex = extract(index, instanceNum);
|
||||||
|
++array[unifIndex];
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
Using Low-level Vector Tricks
|
Using Low-level Vector Tricks
|
||||||
-----------------------------
|
-----------------------------
|
||||||
|
|
||||||
|
|||||||
8
lex.ll
8
lex.ll
@@ -66,8 +66,8 @@ static int allTokens[] = {
|
|||||||
TOKEN_CONST, TOKEN_CONTINUE, TOKEN_CRETURN, TOKEN_DEFAULT, TOKEN_DO,
|
TOKEN_CONST, TOKEN_CONTINUE, TOKEN_CRETURN, TOKEN_DEFAULT, TOKEN_DO,
|
||||||
TOKEN_DELETE, TOKEN_DOUBLE, TOKEN_ELSE, TOKEN_ENUM,
|
TOKEN_DELETE, TOKEN_DOUBLE, TOKEN_ELSE, TOKEN_ENUM,
|
||||||
TOKEN_EXPORT, TOKEN_EXTERN, TOKEN_FALSE, TOKEN_FLOAT, TOKEN_FOR,
|
TOKEN_EXPORT, TOKEN_EXTERN, TOKEN_FALSE, TOKEN_FLOAT, TOKEN_FOR,
|
||||||
TOKEN_FOREACH, TOKEN_FOREACH_TILED, TOKEN_FOREACH_UNIQUE,
|
TOKEN_FOREACH, TOKEN_FOREACH_ACTIVE, TOKEN_FOREACH_TILED,
|
||||||
TOKEN_GOTO, TOKEN_IF, TOKEN_IN, TOKEN_INLINE,
|
TOKEN_FOREACH_UNIQUE, TOKEN_GOTO, TOKEN_IF, TOKEN_IN, TOKEN_INLINE,
|
||||||
TOKEN_INT, TOKEN_INT8, TOKEN_INT16, TOKEN_INT, TOKEN_INT64, TOKEN_LAUNCH,
|
TOKEN_INT, TOKEN_INT8, TOKEN_INT16, TOKEN_INT, TOKEN_INT64, TOKEN_LAUNCH,
|
||||||
TOKEN_NEW, TOKEN_NULL, TOKEN_PRINT, TOKEN_RETURN, TOKEN_SOA, TOKEN_SIGNED,
|
TOKEN_NEW, TOKEN_NULL, TOKEN_PRINT, TOKEN_RETURN, TOKEN_SOA, TOKEN_SIGNED,
|
||||||
TOKEN_SIZEOF, TOKEN_STATIC, TOKEN_STRUCT, TOKEN_SWITCH, TOKEN_SYNC,
|
TOKEN_SIZEOF, TOKEN_STATIC, TOKEN_STRUCT, TOKEN_SWITCH, TOKEN_SYNC,
|
||||||
@@ -115,6 +115,7 @@ void ParserInit() {
|
|||||||
tokenToName[TOKEN_FLOAT] = "float";
|
tokenToName[TOKEN_FLOAT] = "float";
|
||||||
tokenToName[TOKEN_FOR] = "for";
|
tokenToName[TOKEN_FOR] = "for";
|
||||||
tokenToName[TOKEN_FOREACH] = "foreach";
|
tokenToName[TOKEN_FOREACH] = "foreach";
|
||||||
|
tokenToName[TOKEN_FOREACH_ACTIVE] = "foreach_active";
|
||||||
tokenToName[TOKEN_FOREACH_TILED] = "foreach_tiled";
|
tokenToName[TOKEN_FOREACH_TILED] = "foreach_tiled";
|
||||||
tokenToName[TOKEN_FOREACH_UNIQUE] = "foreach_unique";
|
tokenToName[TOKEN_FOREACH_UNIQUE] = "foreach_unique";
|
||||||
tokenToName[TOKEN_GOTO] = "goto";
|
tokenToName[TOKEN_GOTO] = "goto";
|
||||||
@@ -226,6 +227,7 @@ void ParserInit() {
|
|||||||
tokenNameRemap["TOKEN_FLOAT"] = "\'float\'";
|
tokenNameRemap["TOKEN_FLOAT"] = "\'float\'";
|
||||||
tokenNameRemap["TOKEN_FOR"] = "\'for\'";
|
tokenNameRemap["TOKEN_FOR"] = "\'for\'";
|
||||||
tokenNameRemap["TOKEN_FOREACH"] = "\'foreach\'";
|
tokenNameRemap["TOKEN_FOREACH"] = "\'foreach\'";
|
||||||
|
tokenNameRemap["TOKEN_FOREACH_ACTIVE"] = "\'foreach_active\'";
|
||||||
tokenNameRemap["TOKEN_FOREACH_TILED"] = "\'foreach_tiled\'";
|
tokenNameRemap["TOKEN_FOREACH_TILED"] = "\'foreach_tiled\'";
|
||||||
tokenNameRemap["TOKEN_FOREACH_UNIQUE"] = "\'foreach_unique\'";
|
tokenNameRemap["TOKEN_FOREACH_UNIQUE"] = "\'foreach_unique\'";
|
||||||
tokenNameRemap["TOKEN_GOTO"] = "\'goto\'";
|
tokenNameRemap["TOKEN_GOTO"] = "\'goto\'";
|
||||||
@@ -369,8 +371,8 @@ extern { RT; return TOKEN_EXTERN; }
|
|||||||
false { RT; return TOKEN_FALSE; }
|
false { RT; return TOKEN_FALSE; }
|
||||||
float { RT; return TOKEN_FLOAT; }
|
float { RT; return TOKEN_FLOAT; }
|
||||||
for { RT; return TOKEN_FOR; }
|
for { RT; return TOKEN_FOR; }
|
||||||
__foreach_active { RT; return TOKEN_FOREACH_ACTIVE; }
|
|
||||||
foreach { RT; return TOKEN_FOREACH; }
|
foreach { RT; return TOKEN_FOREACH; }
|
||||||
|
foreach_active { RT; return TOKEN_FOREACH_ACTIVE; }
|
||||||
foreach_tiled { RT; return TOKEN_FOREACH_TILED; }
|
foreach_tiled { RT; return TOKEN_FOREACH_TILED; }
|
||||||
foreach_unique { RT; return TOKEN_FOREACH_UNIQUE; }
|
foreach_unique { RT; return TOKEN_FOREACH_UNIQUE; }
|
||||||
goto { RT; return TOKEN_GOTO; }
|
goto { RT; return TOKEN_GOTO; }
|
||||||
|
|||||||
8
parse.yy
8
parse.yy
@@ -117,8 +117,8 @@ static const char *lBuiltinTokens[] = {
|
|||||||
"assert", "bool", "break", "case", "cbreak", "ccontinue", "cdo",
|
"assert", "bool", "break", "case", "cbreak", "ccontinue", "cdo",
|
||||||
"cfor", "cif", "cwhile", "const", "continue", "creturn", "default",
|
"cfor", "cif", "cwhile", "const", "continue", "creturn", "default",
|
||||||
"do", "delete", "double", "else", "enum", "export", "extern", "false",
|
"do", "delete", "double", "else", "enum", "export", "extern", "false",
|
||||||
"float", "for", "foreach", "foreach_tiled", "foreach_unique",
|
"float", "for", "foreach", "foreach_active", "foreach_tiled",
|
||||||
"goto", "if", "in", "inline",
|
"foreach_unique", "goto", "if", "in", "inline",
|
||||||
"int", "int8", "int16", "int32", "int64", "launch", "new", "NULL",
|
"int", "int8", "int16", "int32", "int64", "launch", "new", "NULL",
|
||||||
"print", "return", "signed", "sizeof", "static", "struct", "switch",
|
"print", "return", "signed", "sizeof", "static", "struct", "switch",
|
||||||
"sync", "task", "true", "typedef", "uniform", "unmasked", "unsigned",
|
"sync", "task", "true", "typedef", "uniform", "unmasked", "unsigned",
|
||||||
@@ -1688,7 +1688,7 @@ foreach_active_scope
|
|||||||
foreach_active_identifier
|
foreach_active_identifier
|
||||||
: TOKEN_IDENTIFIER
|
: TOKEN_IDENTIFIER
|
||||||
{
|
{
|
||||||
$$ = new Symbol(yytext, @1, AtomicType::UniformInt32);
|
$$ = new Symbol(yytext, @1, AtomicType::UniformInt64->GetAsConstType());
|
||||||
}
|
}
|
||||||
;
|
;
|
||||||
|
|
||||||
@@ -1838,7 +1838,7 @@ iteration_statement
|
|||||||
}
|
}
|
||||||
statement
|
statement
|
||||||
{
|
{
|
||||||
$$ = CreateForeachActiveStmt($3, $6, Union(@1, @4));
|
$$ = new ForeachActiveStmt($3, $6, Union(@1, @4));
|
||||||
m->symbolTable->PopScope();
|
m->symbolTable->PopScope();
|
||||||
}
|
}
|
||||||
| foreach_unique_scope '(' foreach_unique_identifier TOKEN_IN
|
| foreach_unique_scope '(' foreach_unique_identifier TOKEN_IN
|
||||||
|
|||||||
74
stdlib.ispc
74
stdlib.ispc
@@ -421,7 +421,7 @@ static inline void memcpy(void * varying dst, void * varying src,
|
|||||||
da[programIndex] = dst;
|
da[programIndex] = dst;
|
||||||
sa[programIndex] = src;
|
sa[programIndex] = src;
|
||||||
|
|
||||||
__foreach_active (i) {
|
foreach_active (i) {
|
||||||
void * uniform d = da[i], * uniform s = sa[i];
|
void * uniform d = da[i], * uniform s = sa[i];
|
||||||
__memcpy32((int8 * uniform)d, (int8 * uniform)s, extract(count, i));
|
__memcpy32((int8 * uniform)d, (int8 * uniform)s, extract(count, i));
|
||||||
}
|
}
|
||||||
@@ -435,7 +435,7 @@ static inline void memcpy64(void * varying dst, void * varying src,
|
|||||||
da[programIndex] = dst;
|
da[programIndex] = dst;
|
||||||
sa[programIndex] = src;
|
sa[programIndex] = src;
|
||||||
|
|
||||||
__foreach_active (i) {
|
foreach_active (i) {
|
||||||
void * uniform d = da[i], * uniform s = sa[i];
|
void * uniform d = da[i], * uniform s = sa[i];
|
||||||
__memcpy64((int8 * uniform)d, (int8 * uniform)s, extract(count, i));
|
__memcpy64((int8 * uniform)d, (int8 * uniform)s, extract(count, i));
|
||||||
}
|
}
|
||||||
@@ -459,7 +459,7 @@ static inline void memmove(void * varying dst, void * varying src,
|
|||||||
da[programIndex] = dst;
|
da[programIndex] = dst;
|
||||||
sa[programIndex] = src;
|
sa[programIndex] = src;
|
||||||
|
|
||||||
__foreach_active (i) {
|
foreach_active (i) {
|
||||||
void * uniform d = da[i], * uniform s = sa[i];
|
void * uniform d = da[i], * uniform s = sa[i];
|
||||||
__memmove32((int8 * uniform)d, (int8 * uniform)s, extract(count, i));
|
__memmove32((int8 * uniform)d, (int8 * uniform)s, extract(count, i));
|
||||||
}
|
}
|
||||||
@@ -473,7 +473,7 @@ static inline void memmove64(void * varying dst, void * varying src,
|
|||||||
da[programIndex] = dst;
|
da[programIndex] = dst;
|
||||||
sa[programIndex] = src;
|
sa[programIndex] = src;
|
||||||
|
|
||||||
__foreach_active (i) {
|
foreach_active (i) {
|
||||||
void * uniform d = da[i], * uniform s = sa[i];
|
void * uniform d = da[i], * uniform s = sa[i];
|
||||||
__memmove64((int8 * uniform)d, (int8 * uniform)s, extract(count, i));
|
__memmove64((int8 * uniform)d, (int8 * uniform)s, extract(count, i));
|
||||||
}
|
}
|
||||||
@@ -493,7 +493,7 @@ static inline void memset(void * varying ptr, int8 val, int32 count) {
|
|||||||
void * uniform pa[programCount];
|
void * uniform pa[programCount];
|
||||||
pa[programIndex] = ptr;
|
pa[programIndex] = ptr;
|
||||||
|
|
||||||
__foreach_active (i) {
|
foreach_active (i) {
|
||||||
__memset32((int8 * uniform)pa[i], extract(val, i), extract(count, i));
|
__memset32((int8 * uniform)pa[i], extract(val, i), extract(count, i));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -502,7 +502,7 @@ static inline void memset64(void * varying ptr, int8 val, int64 count) {
|
|||||||
void * uniform pa[programCount];
|
void * uniform pa[programCount];
|
||||||
pa[programIndex] = ptr;
|
pa[programIndex] = ptr;
|
||||||
|
|
||||||
__foreach_active (i) {
|
foreach_active (i) {
|
||||||
__memset64((int8 * uniform)pa[i], extract(val, i), extract(count, i));
|
__memset64((int8 * uniform)pa[i], extract(val, i), extract(count, i));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -711,7 +711,7 @@ static inline void prefetch_l1(const void * varying ptr) {
|
|||||||
const void * uniform ptrArray[programCount];
|
const void * uniform ptrArray[programCount];
|
||||||
ptrArray[programIndex] = ptr;
|
ptrArray[programIndex] = ptr;
|
||||||
|
|
||||||
__foreach_active (i) {
|
foreach_active (i) {
|
||||||
const void * uniform p = ptrArray[i];
|
const void * uniform p = ptrArray[i];
|
||||||
prefetch_l1(p);
|
prefetch_l1(p);
|
||||||
}
|
}
|
||||||
@@ -721,7 +721,7 @@ static inline void prefetch_l2(const void * varying ptr) {
|
|||||||
const void * uniform ptrArray[programCount];
|
const void * uniform ptrArray[programCount];
|
||||||
ptrArray[programIndex] = ptr;
|
ptrArray[programIndex] = ptr;
|
||||||
|
|
||||||
__foreach_active (i) {
|
foreach_active (i) {
|
||||||
const void * uniform p = ptrArray[i];
|
const void * uniform p = ptrArray[i];
|
||||||
prefetch_l2(p);
|
prefetch_l2(p);
|
||||||
}
|
}
|
||||||
@@ -731,7 +731,7 @@ static inline void prefetch_l3(const void * varying ptr) {
|
|||||||
const void * uniform ptrArray[programCount];
|
const void * uniform ptrArray[programCount];
|
||||||
ptrArray[programIndex] = ptr;
|
ptrArray[programIndex] = ptr;
|
||||||
|
|
||||||
__foreach_active (i) {
|
foreach_active (i) {
|
||||||
const void * uniform p = ptrArray[i];
|
const void * uniform p = ptrArray[i];
|
||||||
prefetch_l3(p);
|
prefetch_l3(p);
|
||||||
}
|
}
|
||||||
@@ -741,7 +741,7 @@ static inline void prefetch_nt(const void * varying ptr) {
|
|||||||
const void * uniform ptrArray[programCount];
|
const void * uniform ptrArray[programCount];
|
||||||
ptrArray[programIndex] = ptr;
|
ptrArray[programIndex] = ptr;
|
||||||
|
|
||||||
__foreach_active (i) {
|
foreach_active (i) {
|
||||||
const void * uniform p = ptrArray[i];
|
const void * uniform p = ptrArray[i];
|
||||||
prefetch_nt(p);
|
prefetch_nt(p);
|
||||||
}
|
}
|
||||||
@@ -1621,7 +1621,7 @@ static inline TA atomic_##OPA##_global(uniform TA * varying ptr, TA value) { \
|
|||||||
uniform TA * uniform ptrArray[programCount]; \
|
uniform TA * uniform ptrArray[programCount]; \
|
||||||
ptrArray[programIndex] = ptr; \
|
ptrArray[programIndex] = ptr; \
|
||||||
TA ret; \
|
TA ret; \
|
||||||
__foreach_active (i) { \
|
foreach_active (i) { \
|
||||||
uniform TA * uniform p = ptrArray[i]; \
|
uniform TA * uniform p = ptrArray[i]; \
|
||||||
uniform TA v = extract(value, i); \
|
uniform TA v = extract(value, i); \
|
||||||
uniform TA r = __atomic_##OPB##_uniform_##TB##_global(p, v); \
|
uniform TA r = __atomic_##OPB##_uniform_##TB##_global(p, v); \
|
||||||
@@ -1672,7 +1672,7 @@ static inline TA atomic_swap_global(uniform TA * varying ptr, TA value) { \
|
|||||||
uniform TA * uniform ptrArray[programCount]; \
|
uniform TA * uniform ptrArray[programCount]; \
|
||||||
ptrArray[programIndex] = ptr; \
|
ptrArray[programIndex] = ptr; \
|
||||||
TA ret; \
|
TA ret; \
|
||||||
__foreach_active (i) { \
|
foreach_active (i) { \
|
||||||
uniform TA * uniform p = ptrArray[i]; \
|
uniform TA * uniform p = ptrArray[i]; \
|
||||||
uniform TA v = extract(value, i); \
|
uniform TA v = extract(value, i); \
|
||||||
uniform TA r = __atomic_swap_uniform_##TB##_global(p, v); \
|
uniform TA r = __atomic_swap_uniform_##TB##_global(p, v); \
|
||||||
@@ -1699,7 +1699,7 @@ static inline TA atomic_##OPA##_global(uniform TA * varying ptr, \
|
|||||||
uniform TA * uniform ptrArray[programCount]; \
|
uniform TA * uniform ptrArray[programCount]; \
|
||||||
ptrArray[programIndex] = ptr; \
|
ptrArray[programIndex] = ptr; \
|
||||||
TA ret; \
|
TA ret; \
|
||||||
__foreach_active (i) { \
|
foreach_active (i) { \
|
||||||
uniform TA * uniform p = ptrArray[i]; \
|
uniform TA * uniform p = ptrArray[i]; \
|
||||||
uniform TA v = extract(value, i); \
|
uniform TA v = extract(value, i); \
|
||||||
uniform TA r = __atomic_##OPB##_uniform_##TB##_global(p, v); \
|
uniform TA r = __atomic_##OPB##_uniform_##TB##_global(p, v); \
|
||||||
@@ -1774,7 +1774,7 @@ static inline TA atomic_compare_exchange_global( \
|
|||||||
uniform TA * uniform ptrArray[programCount]; \
|
uniform TA * uniform ptrArray[programCount]; \
|
||||||
ptrArray[programIndex] = ptr; \
|
ptrArray[programIndex] = ptr; \
|
||||||
TA ret; \
|
TA ret; \
|
||||||
__foreach_active (i) { \
|
foreach_active (i) { \
|
||||||
uniform TA r = \
|
uniform TA r = \
|
||||||
__atomic_compare_exchange_uniform_##TB##_global(ptrArray[i], \
|
__atomic_compare_exchange_uniform_##TB##_global(ptrArray[i], \
|
||||||
extract(oldval, i), \
|
extract(oldval, i), \
|
||||||
@@ -1848,7 +1848,7 @@ static inline uniform TYPE atomic_##NAME##_local(uniform TYPE * uniform ptr, \
|
|||||||
} \
|
} \
|
||||||
static inline TYPE atomic_##NAME##_local(uniform TYPE * uniform ptr, TYPE value) { \
|
static inline TYPE atomic_##NAME##_local(uniform TYPE * uniform ptr, TYPE value) { \
|
||||||
TYPE ret; \
|
TYPE ret; \
|
||||||
__foreach_active (i) { \
|
foreach_active (i) { \
|
||||||
ret = insert(ret, i, *ptr); \
|
ret = insert(ret, i, *ptr); \
|
||||||
*ptr = OPFUNC(*ptr, extract(value, i)); \
|
*ptr = OPFUNC(*ptr, extract(value, i)); \
|
||||||
} \
|
} \
|
||||||
@@ -1858,7 +1858,7 @@ static inline TYPE atomic_##NAME##_local(uniform TYPE * p, TYPE value) { \
|
|||||||
TYPE ret; \
|
TYPE ret; \
|
||||||
uniform TYPE * uniform ptrs[programCount]; \
|
uniform TYPE * uniform ptrs[programCount]; \
|
||||||
ptrs[programIndex] = p; \
|
ptrs[programIndex] = p; \
|
||||||
__foreach_active (i) { \
|
foreach_active (i) { \
|
||||||
ret = insert(ret, i, *ptrs[i]); \
|
ret = insert(ret, i, *ptrs[i]); \
|
||||||
*ptrs[i] = OPFUNC(*ptrs[i], extract(value, i)); \
|
*ptrs[i] = OPFUNC(*ptrs[i], extract(value, i)); \
|
||||||
} \
|
} \
|
||||||
@@ -1975,7 +1975,7 @@ static inline uniform TYPE atomic_compare_exchange_local(uniform TYPE * uniform
|
|||||||
static inline TYPE atomic_compare_exchange_local(uniform TYPE * uniform ptr, \
|
static inline TYPE atomic_compare_exchange_local(uniform TYPE * uniform ptr, \
|
||||||
TYPE cmp, TYPE update) { \
|
TYPE cmp, TYPE update) { \
|
||||||
TYPE ret; \
|
TYPE ret; \
|
||||||
__foreach_active (i) { \
|
foreach_active (i) { \
|
||||||
uniform TYPE old = *ptr; \
|
uniform TYPE old = *ptr; \
|
||||||
if (old == extract(cmp, i)) \
|
if (old == extract(cmp, i)) \
|
||||||
*ptr = extract(update, i); \
|
*ptr = extract(update, i); \
|
||||||
@@ -1988,7 +1988,7 @@ static inline TYPE atomic_compare_exchange_local(uniform TYPE * varying p, \
|
|||||||
uniform TYPE * uniform ptrs[programCount]; \
|
uniform TYPE * uniform ptrs[programCount]; \
|
||||||
ptrs[programIndex] = p; \
|
ptrs[programIndex] = p; \
|
||||||
TYPE ret; \
|
TYPE ret; \
|
||||||
__foreach_active (i) { \
|
foreach_active (i) { \
|
||||||
uniform TYPE old = *ptrs[i]; \
|
uniform TYPE old = *ptrs[i]; \
|
||||||
if (old == extract(cmp, i)) \
|
if (old == extract(cmp, i)) \
|
||||||
*ptrs[i] = extract(update, i); \
|
*ptrs[i] = extract(update, i); \
|
||||||
@@ -2127,7 +2127,7 @@ static inline float sin(float x_full) {
|
|||||||
}
|
}
|
||||||
else if (__math_lib == __math_lib_system) {
|
else if (__math_lib == __math_lib_system) {
|
||||||
float ret;
|
float ret;
|
||||||
__foreach_active (i) {
|
foreach_active (i) {
|
||||||
uniform float r = __stdlib_sinf(extract(x_full, i));
|
uniform float r = __stdlib_sinf(extract(x_full, i));
|
||||||
ret = insert(ret, i, r);
|
ret = insert(ret, i, r);
|
||||||
}
|
}
|
||||||
@@ -2259,7 +2259,7 @@ static inline float asin(float x) {
|
|||||||
if (__math_lib == __math_lib_svml ||
|
if (__math_lib == __math_lib_svml ||
|
||||||
__math_lib == __math_lib_system) {
|
__math_lib == __math_lib_system) {
|
||||||
float ret;
|
float ret;
|
||||||
__foreach_active (i) {
|
foreach_active (i) {
|
||||||
uniform float r = __stdlib_asinf(extract(x, i));
|
uniform float r = __stdlib_asinf(extract(x, i));
|
||||||
ret = insert(ret, i, r);
|
ret = insert(ret, i, r);
|
||||||
}
|
}
|
||||||
@@ -2364,7 +2364,7 @@ static inline float cos(float x_full) {
|
|||||||
}
|
}
|
||||||
else if (__math_lib == __math_lib_system) {
|
else if (__math_lib == __math_lib_system) {
|
||||||
float ret;
|
float ret;
|
||||||
__foreach_active (i) {
|
foreach_active (i) {
|
||||||
uniform float r = __stdlib_cosf(extract(x_full, i));
|
uniform float r = __stdlib_cosf(extract(x_full, i));
|
||||||
ret = insert(ret, i, r);
|
ret = insert(ret, i, r);
|
||||||
}
|
}
|
||||||
@@ -2502,7 +2502,7 @@ static inline void sincos(float x_full, varying float * uniform sin_result,
|
|||||||
__svml_sincos(x_full, sin_result, cos_result);
|
__svml_sincos(x_full, sin_result, cos_result);
|
||||||
}
|
}
|
||||||
else if (__math_lib == __math_lib_system) {
|
else if (__math_lib == __math_lib_system) {
|
||||||
__foreach_active (i) {
|
foreach_active (i) {
|
||||||
uniform float s, c;
|
uniform float s, c;
|
||||||
__stdlib_sincosf(extract(x_full, i), &s, &c);
|
__stdlib_sincosf(extract(x_full, i), &s, &c);
|
||||||
*sin_result = insert(*sin_result, i, s);
|
*sin_result = insert(*sin_result, i, s);
|
||||||
@@ -2635,7 +2635,7 @@ static inline float tan(float x_full) {
|
|||||||
}
|
}
|
||||||
else if (__math_lib == __math_lib_system) {
|
else if (__math_lib == __math_lib_system) {
|
||||||
float ret;
|
float ret;
|
||||||
__foreach_active (i) {
|
foreach_active (i) {
|
||||||
uniform float r = __stdlib_tanf(extract(x_full, i));
|
uniform float r = __stdlib_tanf(extract(x_full, i));
|
||||||
ret = insert(ret, i, r);
|
ret = insert(ret, i, r);
|
||||||
}
|
}
|
||||||
@@ -2786,7 +2786,7 @@ static inline float atan(float x_full) {
|
|||||||
}
|
}
|
||||||
else if (__math_lib == __math_lib_system) {
|
else if (__math_lib == __math_lib_system) {
|
||||||
float ret;
|
float ret;
|
||||||
__foreach_active (i) {
|
foreach_active (i) {
|
||||||
uniform float r = __stdlib_atanf(extract(x_full, i));
|
uniform float r = __stdlib_atanf(extract(x_full, i));
|
||||||
ret = insert(ret, i, r);
|
ret = insert(ret, i, r);
|
||||||
}
|
}
|
||||||
@@ -2881,7 +2881,7 @@ static inline float atan2(float y, float x) {
|
|||||||
}
|
}
|
||||||
else if (__math_lib == __math_lib_system) {
|
else if (__math_lib == __math_lib_system) {
|
||||||
float ret;
|
float ret;
|
||||||
__foreach_active (i) {
|
foreach_active (i) {
|
||||||
uniform float r = __stdlib_atan2f(extract(y, i), extract(x, i));
|
uniform float r = __stdlib_atan2f(extract(y, i), extract(x, i));
|
||||||
ret = insert(ret, i, r);
|
ret = insert(ret, i, r);
|
||||||
}
|
}
|
||||||
@@ -2944,7 +2944,7 @@ static inline float exp(float x_full) {
|
|||||||
}
|
}
|
||||||
else if (__math_lib == __math_lib_system) {
|
else if (__math_lib == __math_lib_system) {
|
||||||
float ret;
|
float ret;
|
||||||
__foreach_active (i) {
|
foreach_active (i) {
|
||||||
uniform float r = __stdlib_expf(extract(x_full, i));
|
uniform float r = __stdlib_expf(extract(x_full, i));
|
||||||
ret = insert(ret, i, r);
|
ret = insert(ret, i, r);
|
||||||
}
|
}
|
||||||
@@ -3151,7 +3151,7 @@ static inline float log(float x_full) {
|
|||||||
}
|
}
|
||||||
else if (__math_lib == __math_lib_system) {
|
else if (__math_lib == __math_lib_system) {
|
||||||
float ret;
|
float ret;
|
||||||
__foreach_active (i) {
|
foreach_active (i) {
|
||||||
uniform float r = __stdlib_logf(extract(x_full, i));
|
uniform float r = __stdlib_logf(extract(x_full, i));
|
||||||
ret = insert(ret, i, r);
|
ret = insert(ret, i, r);
|
||||||
}
|
}
|
||||||
@@ -3326,7 +3326,7 @@ static inline float pow(float a, float b) {
|
|||||||
}
|
}
|
||||||
else if (__math_lib == __math_lib_system) {
|
else if (__math_lib == __math_lib_system) {
|
||||||
float ret;
|
float ret;
|
||||||
__foreach_active (i) {
|
foreach_active (i) {
|
||||||
uniform float r = __stdlib_powf(extract(a, i), extract(b, i));
|
uniform float r = __stdlib_powf(extract(a, i), extract(b, i));
|
||||||
ret = insert(ret, i, r);
|
ret = insert(ret, i, r);
|
||||||
}
|
}
|
||||||
@@ -3416,7 +3416,7 @@ static inline double sin(double x) {
|
|||||||
return sin((float)x);
|
return sin((float)x);
|
||||||
else {
|
else {
|
||||||
double ret;
|
double ret;
|
||||||
__foreach_active (i) {
|
foreach_active (i) {
|
||||||
uniform double r = __stdlib_sin(extract(x, i));
|
uniform double r = __stdlib_sin(extract(x, i));
|
||||||
ret = insert(ret, i, r);
|
ret = insert(ret, i, r);
|
||||||
}
|
}
|
||||||
@@ -3438,7 +3438,7 @@ static inline double cos(double x) {
|
|||||||
return cos((float)x);
|
return cos((float)x);
|
||||||
else {
|
else {
|
||||||
double ret;
|
double ret;
|
||||||
__foreach_active (i) {
|
foreach_active (i) {
|
||||||
uniform double r = __stdlib_cos(extract(x, i));
|
uniform double r = __stdlib_cos(extract(x, i));
|
||||||
ret = insert(ret, i, r);
|
ret = insert(ret, i, r);
|
||||||
}
|
}
|
||||||
@@ -3464,7 +3464,7 @@ static inline void sincos(double x, varying double * uniform sin_result,
|
|||||||
*cos_result = cr;
|
*cos_result = cr;
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
__foreach_active (i) {
|
foreach_active (i) {
|
||||||
uniform double sr, cr;
|
uniform double sr, cr;
|
||||||
__stdlib_sincos(extract(x, i), &sr, &cr);
|
__stdlib_sincos(extract(x, i), &sr, &cr);
|
||||||
*sin_result = insert(*sin_result, i, sr);
|
*sin_result = insert(*sin_result, i, sr);
|
||||||
@@ -3492,7 +3492,7 @@ static inline double tan(double x) {
|
|||||||
return tan((float)x);
|
return tan((float)x);
|
||||||
else {
|
else {
|
||||||
double ret;
|
double ret;
|
||||||
__foreach_active (i) {
|
foreach_active (i) {
|
||||||
uniform double r = __stdlib_tan(extract(x, i));
|
uniform double r = __stdlib_tan(extract(x, i));
|
||||||
ret = insert(ret, i, r);
|
ret = insert(ret, i, r);
|
||||||
}
|
}
|
||||||
@@ -3514,7 +3514,7 @@ static inline double atan(double x) {
|
|||||||
return atan((float)x);
|
return atan((float)x);
|
||||||
else {
|
else {
|
||||||
double ret;
|
double ret;
|
||||||
__foreach_active (i) {
|
foreach_active (i) {
|
||||||
uniform double r = __stdlib_atan(extract(x, i));
|
uniform double r = __stdlib_atan(extract(x, i));
|
||||||
ret = insert(ret, i, r);
|
ret = insert(ret, i, r);
|
||||||
}
|
}
|
||||||
@@ -3536,7 +3536,7 @@ static inline double atan2(double y, double x) {
|
|||||||
return atan2((float)y, (float)x);
|
return atan2((float)y, (float)x);
|
||||||
else {
|
else {
|
||||||
double ret;
|
double ret;
|
||||||
__foreach_active (i) {
|
foreach_active (i) {
|
||||||
uniform double r = __stdlib_atan2(extract(y, i), extract(x, i));
|
uniform double r = __stdlib_atan2(extract(y, i), extract(x, i));
|
||||||
ret = insert(ret, i, r);
|
ret = insert(ret, i, r);
|
||||||
}
|
}
|
||||||
@@ -3558,7 +3558,7 @@ static inline double exp(double x) {
|
|||||||
return exp((float)x);
|
return exp((float)x);
|
||||||
else {
|
else {
|
||||||
double ret;
|
double ret;
|
||||||
__foreach_active (i) {
|
foreach_active (i) {
|
||||||
uniform double r = __stdlib_exp(extract(x, i));
|
uniform double r = __stdlib_exp(extract(x, i));
|
||||||
ret = insert(ret, i, r);
|
ret = insert(ret, i, r);
|
||||||
}
|
}
|
||||||
@@ -3580,7 +3580,7 @@ static inline double log(double x) {
|
|||||||
return log((float)x);
|
return log((float)x);
|
||||||
else {
|
else {
|
||||||
double ret;
|
double ret;
|
||||||
__foreach_active (i) {
|
foreach_active (i) {
|
||||||
uniform double r = __stdlib_log(extract(x, i));
|
uniform double r = __stdlib_log(extract(x, i));
|
||||||
ret = insert(ret, i, r);
|
ret = insert(ret, i, r);
|
||||||
}
|
}
|
||||||
@@ -3602,7 +3602,7 @@ static inline double pow(double a, double b) {
|
|||||||
return pow((float)a, (float)b);
|
return pow((float)a, (float)b);
|
||||||
else {
|
else {
|
||||||
double ret;
|
double ret;
|
||||||
__foreach_active (i) {
|
foreach_active (i) {
|
||||||
uniform double r = __stdlib_pow(extract(a, i), extract(b, i));
|
uniform double r = __stdlib_pow(extract(a, i), extract(b, i));
|
||||||
ret = insert(ret, i, r);
|
ret = insert(ret, i, r);
|
||||||
}
|
}
|
||||||
|
|||||||
259
stmt.cpp
259
stmt.cpp
@@ -1915,6 +1915,188 @@ ForeachStmt::Print(int indent) const {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////////////////
|
||||||
|
// ForeachActiveStmt
|
||||||
|
|
||||||
|
ForeachActiveStmt::ForeachActiveStmt(Symbol *s, Stmt *st, SourcePos pos)
|
||||||
|
: Stmt(pos) {
|
||||||
|
sym = s;
|
||||||
|
stmts = st;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void
|
||||||
|
ForeachActiveStmt::EmitCode(FunctionEmitContext *ctx) const {
|
||||||
|
if (!ctx->GetCurrentBasicBlock())
|
||||||
|
return;
|
||||||
|
|
||||||
|
// Allocate storage for the symbol that we'll use for the uniform
|
||||||
|
// variable that holds the current program instance in each loop
|
||||||
|
// iteration.
|
||||||
|
if (sym->type == NULL) {
|
||||||
|
Assert(m->errorCount > 0);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
Assert(Type::Equal(sym->type,
|
||||||
|
AtomicType::UniformInt64->GetAsConstType()));
|
||||||
|
sym->storagePtr = ctx->AllocaInst(LLVMTypes::Int64Type, sym->name.c_str());
|
||||||
|
|
||||||
|
ctx->SetDebugPos(pos);
|
||||||
|
ctx->EmitVariableDebugInfo(sym);
|
||||||
|
|
||||||
|
// The various basic blocks that we'll need in the below
|
||||||
|
llvm::BasicBlock *bbFindNext =
|
||||||
|
ctx->CreateBasicBlock("foreach_active_find_next");
|
||||||
|
llvm::BasicBlock *bbBody = ctx->CreateBasicBlock("foreach_active_body");
|
||||||
|
llvm::BasicBlock *bbCheckForMore =
|
||||||
|
ctx->CreateBasicBlock("foreach_active_check_for_more");
|
||||||
|
llvm::BasicBlock *bbDone = ctx->CreateBasicBlock("foreach_active_done");
|
||||||
|
|
||||||
|
// Save the old mask so that we can restore it at the end
|
||||||
|
llvm::Value *oldInternalMask = ctx->GetInternalMask();
|
||||||
|
|
||||||
|
// Now, *maskBitsPtr will maintain a bitmask for the lanes that remain
|
||||||
|
// to be processed by a pass through the loop body. It starts out with
|
||||||
|
// the current execution mask (which should never be all off going in
|
||||||
|
// to this)...
|
||||||
|
llvm::Value *oldFullMask = ctx->GetFullMask();
|
||||||
|
llvm::Value *maskBitsPtr =
|
||||||
|
ctx->AllocaInst(LLVMTypes::Int64Type, "mask_bits");
|
||||||
|
llvm::Value *movmsk = ctx->LaneMask(oldFullMask);
|
||||||
|
ctx->StoreInst(movmsk, maskBitsPtr);
|
||||||
|
|
||||||
|
// Officially start the loop.
|
||||||
|
ctx->StartScope();
|
||||||
|
ctx->StartForeach(FunctionEmitContext::FOREACH_ACTIVE);
|
||||||
|
ctx->SetContinueTarget(bbCheckForMore);
|
||||||
|
|
||||||
|
// Onward to find the first set of program instance to run the loop for
|
||||||
|
ctx->BranchInst(bbFindNext);
|
||||||
|
|
||||||
|
ctx->SetCurrentBasicBlock(bbFindNext); {
|
||||||
|
// Load the bitmask of the lanes left to be processed
|
||||||
|
llvm::Value *remainingBits = ctx->LoadInst(maskBitsPtr, "remaining_bits");
|
||||||
|
|
||||||
|
// Find the index of the first set bit in the mask
|
||||||
|
llvm::Function *ctlzFunc =
|
||||||
|
m->module->getFunction("__count_trailing_zeros_i64");
|
||||||
|
Assert(ctlzFunc != NULL);
|
||||||
|
llvm::Value *firstSet = ctx->CallInst(ctlzFunc, NULL, remainingBits,
|
||||||
|
"first_set");
|
||||||
|
|
||||||
|
// Store that value into the storage allocated for the iteration
|
||||||
|
// variable.
|
||||||
|
ctx->StoreInst(firstSet, sym->storagePtr);
|
||||||
|
|
||||||
|
// Now set the execution mask to be only on for the current program
|
||||||
|
// instance. (TODO: is there a more efficient way to do this? e.g.
|
||||||
|
// for AVX1, we might want to do this as float rather than int
|
||||||
|
// math...)
|
||||||
|
|
||||||
|
// Get the "program index" vector value
|
||||||
|
llvm::Value *programIndex =
|
||||||
|
llvm::UndefValue::get(LLVMTypes::Int32VectorType);
|
||||||
|
for (int i = 0; i < g->target.vectorWidth; ++i)
|
||||||
|
programIndex = ctx->InsertInst(programIndex, LLVMInt32(i), i,
|
||||||
|
"prog_index");
|
||||||
|
|
||||||
|
// And smear the current lane out to a vector
|
||||||
|
llvm::Value *firstSet32 =
|
||||||
|
ctx->TruncInst(firstSet, LLVMTypes::Int32Type, "first_set32");
|
||||||
|
llvm::Value *firstSet32Smear = ctx->SmearUniform(firstSet32);
|
||||||
|
|
||||||
|
// Now set the execution mask based on doing a vector compare of
|
||||||
|
// these two
|
||||||
|
llvm::Value *iterMask =
|
||||||
|
ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ,
|
||||||
|
firstSet32Smear, programIndex);
|
||||||
|
iterMask = ctx->I1VecToBoolVec(iterMask);
|
||||||
|
|
||||||
|
ctx->SetInternalMask(iterMask);
|
||||||
|
|
||||||
|
// Also update the bitvector of lanes left to turn off the bit for
|
||||||
|
// the lane we're about to run.
|
||||||
|
llvm::Value *setMask =
|
||||||
|
ctx->BinaryOperator(llvm::Instruction::Shl, LLVMInt64(1),
|
||||||
|
firstSet, "set_mask");
|
||||||
|
llvm::Value *notSetMask = ctx->NotOperator(setMask);
|
||||||
|
llvm::Value *newRemaining =
|
||||||
|
ctx->BinaryOperator(llvm::Instruction::And, remainingBits,
|
||||||
|
notSetMask, "new_remaining");
|
||||||
|
ctx->StoreInst(newRemaining, maskBitsPtr);
|
||||||
|
|
||||||
|
// and onward to run the loop body...
|
||||||
|
ctx->BranchInst(bbBody);
|
||||||
|
}
|
||||||
|
|
||||||
|
ctx->SetCurrentBasicBlock(bbBody); {
|
||||||
|
// Run the code in the body of the loop. This is easy now.
|
||||||
|
if (stmts)
|
||||||
|
stmts->EmitCode(ctx);
|
||||||
|
|
||||||
|
Assert(ctx->GetCurrentBasicBlock() != NULL);
|
||||||
|
ctx->BranchInst(bbCheckForMore);
|
||||||
|
}
|
||||||
|
|
||||||
|
ctx->SetCurrentBasicBlock(bbCheckForMore); {
|
||||||
|
// At the end of the loop body (either due to running the
|
||||||
|
// statements normally, or a continue statement in the middle of
|
||||||
|
// the loop that jumps to the end, see if there are any lanes left
|
||||||
|
// to be processed.
|
||||||
|
llvm::Value *remainingBits = ctx->LoadInst(maskBitsPtr, "remaining_bits");
|
||||||
|
llvm::Value *nonZero =
|
||||||
|
ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_NE,
|
||||||
|
remainingBits, LLVMInt64(0), "remaining_ne_zero");
|
||||||
|
ctx->BranchInst(bbFindNext, bbDone, nonZero);
|
||||||
|
}
|
||||||
|
|
||||||
|
ctx->SetCurrentBasicBlock(bbDone);
|
||||||
|
ctx->SetInternalMask(oldInternalMask);
|
||||||
|
ctx->EndForeach();
|
||||||
|
ctx->EndScope();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void
|
||||||
|
ForeachActiveStmt::Print(int indent) const {
|
||||||
|
printf("%*cForeach_active Stmt", indent, ' ');
|
||||||
|
pos.Print();
|
||||||
|
printf("\n");
|
||||||
|
|
||||||
|
printf("%*cIter symbol: ", indent+4, ' ');
|
||||||
|
if (sym != NULL) {
|
||||||
|
printf("%s", sym->name.c_str());
|
||||||
|
if (sym->type != NULL)
|
||||||
|
printf(" %s", sym->type->GetString().c_str());
|
||||||
|
}
|
||||||
|
else
|
||||||
|
printf("NULL");
|
||||||
|
printf("\n");
|
||||||
|
|
||||||
|
printf("%*cStmts:\n", indent+4, ' ');
|
||||||
|
if (stmts != NULL)
|
||||||
|
stmts->Print(indent+8);
|
||||||
|
else
|
||||||
|
printf("NULL");
|
||||||
|
printf("\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
Stmt *
|
||||||
|
ForeachActiveStmt::TypeCheck() {
|
||||||
|
if (sym == NULL)
|
||||||
|
return NULL;
|
||||||
|
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
int
|
||||||
|
ForeachActiveStmt::EstimateCost() const {
|
||||||
|
return COST_VARYING_LOOP;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////
|
||||||
// ForeachUniqueStmt
|
// ForeachUniqueStmt
|
||||||
|
|
||||||
@@ -3043,80 +3225,3 @@ int
|
|||||||
DeleteStmt::EstimateCost() const {
|
DeleteStmt::EstimateCost() const {
|
||||||
return COST_DELETE;
|
return COST_DELETE;
|
||||||
}
|
}
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////////////
|
|
||||||
|
|
||||||
/** This generates AST nodes for an __foreach_active statement. This
|
|
||||||
construct can be synthesized ouf of the existing ForStmt (and other AST
|
|
||||||
nodes), so here we just build up the AST that we need rather than
|
|
||||||
having a new Stmt implementation for __foreach_active.
|
|
||||||
|
|
||||||
@param iterSym Symbol for the iteration variable (e.g. "i" in
|
|
||||||
__foreach_active (i) { .. .}
|
|
||||||
@param stmts Statements to execute each time through the loop, for
|
|
||||||
each active program instance.
|
|
||||||
@param pos Position of the __foreach_active statement in the source
|
|
||||||
file.
|
|
||||||
*/
|
|
||||||
Stmt *
|
|
||||||
CreateForeachActiveStmt(Symbol *iterSym, Stmt *stmts, SourcePos pos) {
|
|
||||||
if (iterSym == NULL) {
|
|
||||||
AssertPos(pos, m->errorCount > 0);
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
// loop initializer: set iter = 0
|
|
||||||
std::vector<VariableDeclaration> var;
|
|
||||||
ConstExpr *zeroExpr = new ConstExpr(AtomicType::UniformInt32, 0,
|
|
||||||
iterSym->pos);
|
|
||||||
var.push_back(VariableDeclaration(iterSym, zeroExpr));
|
|
||||||
Stmt *initStmt = new DeclStmt(var, iterSym->pos);
|
|
||||||
|
|
||||||
// loop test: (iter < programCount)
|
|
||||||
ConstExpr *progCountExpr =
|
|
||||||
new ConstExpr(AtomicType::UniformInt32, g->target.vectorWidth,
|
|
||||||
pos);
|
|
||||||
SymbolExpr *symExpr = new SymbolExpr(iterSym, iterSym->pos);
|
|
||||||
Expr *testExpr = new BinaryExpr(BinaryExpr::Lt, symExpr, progCountExpr,
|
|
||||||
pos);
|
|
||||||
|
|
||||||
// loop step: ++iterSym
|
|
||||||
UnaryExpr *incExpr = new UnaryExpr(UnaryExpr::PreInc, symExpr, pos);
|
|
||||||
Stmt *stepStmt = new ExprStmt(incExpr, pos);
|
|
||||||
|
|
||||||
// loop body
|
|
||||||
// First, call __movmsk(__mask)) to get the mask as a set of bits.
|
|
||||||
// This should be hoisted out of the loop
|
|
||||||
Symbol *maskSym = m->symbolTable->LookupVariable("__mask");
|
|
||||||
AssertPos(pos, maskSym != NULL);
|
|
||||||
Expr *maskVecExpr = new SymbolExpr(maskSym, pos);
|
|
||||||
std::vector<Symbol *> mmFuns;
|
|
||||||
m->symbolTable->LookupFunction("__movmsk", &mmFuns);
|
|
||||||
AssertPos(pos, mmFuns.size() == (g->target.maskBitCount == 32 ? 2 : 1));
|
|
||||||
FunctionSymbolExpr *movmskFunc = new FunctionSymbolExpr("__movmsk", mmFuns,
|
|
||||||
pos);
|
|
||||||
ExprList *movmskArgs = new ExprList(maskVecExpr, pos);
|
|
||||||
FunctionCallExpr *movmskExpr = new FunctionCallExpr(movmskFunc, movmskArgs,
|
|
||||||
pos);
|
|
||||||
|
|
||||||
// Compute the per lane mask to test the mask bits against: (1 << iter)
|
|
||||||
ConstExpr *oneExpr = new ConstExpr(AtomicType::UniformInt64, int64_t(1),
|
|
||||||
iterSym->pos);
|
|
||||||
Expr *shiftLaneExpr = new BinaryExpr(BinaryExpr::Shl, oneExpr, symExpr,
|
|
||||||
pos);
|
|
||||||
|
|
||||||
// Compute the AND: movmsk & (1 << iter)
|
|
||||||
Expr *maskAndLaneExpr = new BinaryExpr(BinaryExpr::BitAnd, movmskExpr,
|
|
||||||
shiftLaneExpr, pos);
|
|
||||||
// Test to see if it's non-zero: (mask & (1 << iter)) != 0
|
|
||||||
Expr *ifTestExpr = new BinaryExpr(BinaryExpr::NotEqual, maskAndLaneExpr,
|
|
||||||
zeroExpr, pos);
|
|
||||||
|
|
||||||
// Now, enclose the provided statements in an if test such that they
|
|
||||||
// only run if the mask is non-zero for the lane we're currently
|
|
||||||
// handling in the loop.
|
|
||||||
IfStmt *laneCheckIf = new IfStmt(ifTestExpr, stmts, NULL, false, pos);
|
|
||||||
|
|
||||||
// And return a for loop that wires it all together.
|
|
||||||
return new ForStmt(initStmt, testExpr, stepStmt, laneCheckIf, false, pos);
|
|
||||||
}
|
|
||||||
|
|||||||
17
stmt.h
17
stmt.h
@@ -260,6 +260,23 @@ public:
|
|||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
/** Iteration over each executing program instance.
|
||||||
|
*/
|
||||||
|
class ForeachActiveStmt : public Stmt {
|
||||||
|
public:
|
||||||
|
ForeachActiveStmt(Symbol *iterSym, Stmt *stmts, SourcePos pos);
|
||||||
|
|
||||||
|
void EmitCode(FunctionEmitContext *ctx) const;
|
||||||
|
void Print(int indent) const;
|
||||||
|
|
||||||
|
Stmt *TypeCheck();
|
||||||
|
int EstimateCost() const;
|
||||||
|
|
||||||
|
Symbol *sym;
|
||||||
|
Stmt *stmts;
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
/** Parallel iteration over each unique value in the given (varying)
|
/** Parallel iteration over each unique value in the given (varying)
|
||||||
expression.
|
expression.
|
||||||
*/
|
*/
|
||||||
|
|||||||
16
tests/foreach-active-1.ispc
Normal file
16
tests/foreach-active-1.ispc
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
|
||||||
|
export uniform int width() { return programCount; }
|
||||||
|
|
||||||
|
|
||||||
|
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||||
|
float a = aFOO[programIndex];
|
||||||
|
uniform int count = 0;
|
||||||
|
if (programIndex & 1)
|
||||||
|
foreach_active (i)
|
||||||
|
++count;
|
||||||
|
RET[programIndex] = count;
|
||||||
|
}
|
||||||
|
|
||||||
|
export void result(uniform float RET[]) {
|
||||||
|
RET[programIndex] = programCount / 2;
|
||||||
|
}
|
||||||
16
tests/foreach-active-2.ispc
Normal file
16
tests/foreach-active-2.ispc
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
|
||||||
|
export uniform int width() { return programCount; }
|
||||||
|
|
||||||
|
|
||||||
|
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||||
|
float a = aFOO[programIndex];
|
||||||
|
uniform int count = 0;
|
||||||
|
if (programIndex & 1)
|
||||||
|
foreach_active (i)
|
||||||
|
++a;
|
||||||
|
RET[programIndex] = a;
|
||||||
|
}
|
||||||
|
|
||||||
|
export void result(uniform float RET[]) {
|
||||||
|
RET[programIndex] = (1 + programIndex) + ((programIndex & 1) ? 1 : 0);
|
||||||
|
}
|
||||||
16
tests/foreach-active-3.ispc
Normal file
16
tests/foreach-active-3.ispc
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
|
||||||
|
export uniform int width() { return programCount; }
|
||||||
|
|
||||||
|
|
||||||
|
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||||
|
float a = aFOO[programIndex];
|
||||||
|
RET[programIndex] = a;
|
||||||
|
|
||||||
|
if (programIndex & 1)
|
||||||
|
foreach_active (i)
|
||||||
|
++RET[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
export void result(uniform float RET[]) {
|
||||||
|
RET[programIndex] = (1 + programIndex) + ((programIndex & 1) ? 1 : 0);
|
||||||
|
}
|
||||||
21
tests/foreach-active-4.ispc
Normal file
21
tests/foreach-active-4.ispc
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
|
||||||
|
export uniform int width() { return programCount; }
|
||||||
|
|
||||||
|
|
||||||
|
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||||
|
float a = aFOO[programIndex];
|
||||||
|
RET[programIndex] = a;
|
||||||
|
|
||||||
|
if (programIndex & 1) {
|
||||||
|
foreach_active (i) {
|
||||||
|
if (i == 1)
|
||||||
|
continue;
|
||||||
|
++RET[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
export void result(uniform float RET[]) {
|
||||||
|
RET[programIndex] = (1 + programIndex) + ((programIndex & 1) ? 1 : 0);
|
||||||
|
--RET[1];
|
||||||
|
}
|
||||||
22
tests/foreach-active-5.ispc
Normal file
22
tests/foreach-active-5.ispc
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
|
||||||
|
export uniform int width() { return programCount; }
|
||||||
|
|
||||||
|
|
||||||
|
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||||
|
float a = aFOO[programIndex];
|
||||||
|
RET[programIndex] = a;
|
||||||
|
|
||||||
|
if (programIndex & 1) {
|
||||||
|
foreach_active (i) {
|
||||||
|
if (i & 1)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
int * uniform null = 0;
|
||||||
|
*null = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
export void result(uniform float RET[]) {
|
||||||
|
RET[programIndex] = (1 + programIndex);
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user