From b4a078e2f68c1be7f41bd712c4d3ab631f69b973 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Fri, 22 Jun 2012 10:35:43 -0700 Subject: [PATCH] Add foreach_active iteration statement. Issue #298. --- ast.cpp | 17 +- ctx.cpp | 29 ++- ctx.h | 6 +- docs/ispc.rst | 376 ++++++++++++++++++++++-------------- docs/perfguide.rst | 38 ++++ lex.ll | 8 +- parse.yy | 8 +- stdlib.ispc | 74 +++---- stmt.cpp | 259 +++++++++++++++++-------- stmt.h | 17 ++ tests/foreach-active-1.ispc | 16 ++ tests/foreach-active-2.ispc | 16 ++ tests/foreach-active-3.ispc | 16 ++ tests/foreach-active-4.ispc | 21 ++ tests/foreach-active-5.ispc | 22 +++ 15 files changed, 644 insertions(+), 279 deletions(-) create mode 100644 tests/foreach-active-1.ispc create mode 100644 tests/foreach-active-2.ispc create mode 100644 tests/foreach-active-3.ispc create mode 100644 tests/foreach-active-4.ispc create mode 100644 tests/foreach-active-5.ispc diff --git a/ast.cpp b/ast.cpp index 06ccc1a9..66b597d5 100644 --- a/ast.cpp +++ b/ast.cpp @@ -92,6 +92,7 @@ WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc, DoStmt *dos; ForStmt *fs; ForeachStmt *fes; + ForeachActiveStmt *fas; ForeachUniqueStmt *fus; CaseStmt *cs; DefaultStmt *defs; @@ -138,6 +139,9 @@ WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc, postFunc, data); fes->stmts = (Stmt *)WalkAST(fes->stmts, preFunc, postFunc, data); } + else if ((fas = dynamic_cast(node)) != NULL) { + fas->stmts = (Stmt *)WalkAST(fas->stmts, preFunc, postFunc, data); + } else if ((fus = dynamic_cast(node)) != NULL) { fus->expr = (Expr *)WalkAST(fus->expr, preFunc, postFunc, data); fus->stmts = (Stmt *)WalkAST(fus->stmts, preFunc, postFunc, data); @@ -391,14 +395,15 @@ lCheckAllOffSafety(ASTNode *node, void *data) { } if (dynamic_cast(node) != NULL || + dynamic_cast(node) != NULL || dynamic_cast(node) != NULL) { - // foreach() statements also shouldn't be run with an all-off mask. - // Since they re-establish an 'all on' mask, this would be pretty - // unintuitive. (More generally, it's possibly a little strange to - // allow foreach() in the presence of any non-uniform control - // flow...) + // The various foreach statements also shouldn't be run with an + // all-off mask. Since they can re-establish an 'all on' mask, + // this would be pretty unintuitive. (More generally, it's + // possibly a little strange to allow foreach in the presence of + // any non-uniform control flow...) // - // Similarly, the implementation foreach_unique assumes as a + // Similarly, the implementation of foreach_unique assumes as a // precondition that the mask won't be all off going into it, so // we'll enforce that here... *okPtr = false; diff --git a/ctx.cpp b/ctx.cpp index 194ca30b..75c05769 100644 --- a/ctx.cpp +++ b/ctx.cpp @@ -89,12 +89,14 @@ struct CFInfo { bool IsIf() { return type == If; } bool IsLoop() { return type == Loop; } bool IsForeach() { return (type == ForeachRegular || + type == ForeachActive || type == ForeachUnique); } bool IsSwitch() { return type == Switch; } bool IsVarying() { return !isUniform; } bool IsUniform() { return isUniform; } - enum CFType { If, Loop, ForeachRegular, ForeachUnique, Switch }; + enum CFType { If, Loop, ForeachRegular, ForeachActive, ForeachUnique, + Switch }; CFType type; bool isUniform; llvm::BasicBlock *savedBreakTarget, *savedContinueTarget; @@ -143,7 +145,7 @@ private: CFInfo(CFType t, llvm::BasicBlock *bt, llvm::BasicBlock *ct, llvm::Value *sb, llvm::Value *sc, llvm::Value *sm, llvm::Value *lm) { - Assert(t == ForeachRegular || t == ForeachUnique); + Assert(t == ForeachRegular || t == ForeachActive || t == ForeachUnique); type = t; isUniform = false; savedBreakTarget = bt; @@ -190,6 +192,9 @@ CFInfo::GetForeach(FunctionEmitContext::ForeachType ft, case FunctionEmitContext::FOREACH_REGULAR: cfType = ForeachRegular; break; + case FunctionEmitContext::FOREACH_ACTIVE: + cfType = ForeachActive; + break; case FunctionEmitContext::FOREACH_UNIQUE: cfType = ForeachUnique; break; @@ -744,6 +749,16 @@ FunctionEmitContext::Break(bool doCoherenceCheck) { } +static bool +lEnclosingLoopIsForeachActive(const std::vector &controlFlowInfo) { + for (int i = (int)controlFlowInfo.size() - 1; i >= 0; --i) { + if (controlFlowInfo[i]->type == CFInfo::ForeachActive) + return true; + } + return false; +} + + void FunctionEmitContext::Continue(bool doCoherenceCheck) { if (!continueTarget) { @@ -753,12 +768,16 @@ FunctionEmitContext::Continue(bool doCoherenceCheck) { } AssertPos(currentPos, controlFlowInfo.size() > 0); - if (ifsInCFAllUniform(CFInfo::Loop)) { + if (ifsInCFAllUniform(CFInfo::Loop) || + lEnclosingLoopIsForeachActive(controlFlowInfo)) { // Similarly to 'break' statements, we can immediately jump to the // continue target if we're only in 'uniform' control flow within - // loop or if we can tell that the mask is all on. + // loop or if we can tell that the mask is all on. Here, we can + // also jump if the enclosing loop is a 'foreach_active' loop, in + // which case we know that only a single program instance is + // executing. AddInstrumentationPoint("continue: uniform CF, jumped"); - if (ifsInCFAllUniform(CFInfo::Loop) && doCoherenceCheck) + if (doCoherenceCheck) Warning(currentPos, "Coherent continue statement not necessary in " "fully uniform control flow."); BranchInst(continueTarget); diff --git a/ctx.h b/ctx.h index a37842af..59975c01 100644 --- a/ctx.h +++ b/ctx.h @@ -160,9 +160,9 @@ public: finished. */ void EndLoop(); - /** Indicates that code generation for a 'foreach', 'foreach_tiled', or - 'foreach_unique' loop is about to start. */ - enum ForeachType { FOREACH_REGULAR, FOREACH_UNIQUE }; + /** Indicates that code generation for a 'foreach', 'foreach_tiled', + 'foreach_active', or 'foreach_unique' loop is about to start. */ + enum ForeachType { FOREACH_REGULAR, FOREACH_ACTIVE, FOREACH_UNIQUE }; void StartForeach(ForeachType ft); void EndForeach(); diff --git a/docs/ispc.rst b/docs/ispc.rst index 7ab01571..58c58d6a 100644 --- a/docs/ispc.rst +++ b/docs/ispc.rst @@ -105,12 +105,16 @@ Contents: * `Conditional Statements: "if"`_ * `Conditional Statements: "switch"`_ - * `Basic Iteration Statements: "for", "while", and "do"`_ - * `Iteration over unique elements: "foreach_unique"`_ + * `Iteration Statements`_ + + + `Basic Iteration Statements: "for", "while", and "do"`_ + + `Iteration over active program instances: "foreach_active"`_ + + `Iteration over unique elements: "foreach_unique"`_ + + `Parallel Iteration Statements: "foreach" and "foreach_tiled"`_ + + `Parallel Iteration with "programIndex" and "programCount"`_ + * `Unstructured Control Flow: "goto"`_ * `"Coherent" Control Flow Statements: "cif" and Friends`_ - * `Parallel Iteration Statements: "foreach" and "foreach_tiled"`_ - * `Parallel Iteration with "programIndex" and "programCount"`_ * `Functions and Function Calls`_ + `Function Overloading`_ @@ -1984,7 +1988,7 @@ format in memory; the benefits from SOA layout are discussed in more detail in the `Use "Structure of Arrays" Layout When Possible`_ section in the ispc Performance Guide. -.. _Use "Structure of Arrays" Layout When Possible: perf.html#use-structure-of-arrays-layout-when-possible +.. _Use "Structure of Arrays" Layout When Possible: perfguide.html#use-structure-of-arrays-layout-when-possible ``ispc`` provides two key language-level capabilities for laying out and accessing data in SOA format: @@ -2348,11 +2352,19 @@ code below. x *= x; } + +Iteration Statements +-------------------- + +In addition to the standard iteration statements ``for``, ``while``, and +``do``, inherited from C/C++, ``ispc`` provides a number of additional +specialized ways to iterate over data. + Basic Iteration Statements: "for", "while", and "do" ---------------------------------------------------- ``ispc`` supports ``for``, ``while``, and ``do`` loops, with the same -specification as in C. Like C++, variables can be declared in the ``for`` +specification as in C. As in C++, variables can be declared in the ``for`` statement itself: :: @@ -2374,6 +2386,58 @@ executing code in the loop body that didn't execute the ``continue`` will be unaffected by it. +Iteration over active program instances: "foreach_active" +--------------------------------------------------------- + +The ``foreach_active`` construct specifies a loop that serializes over the +active program instances: the loop body executes once for each active +program instance, and with only that program instance executing. + +As an example of the use of this construct, consider an application where +each program instance independently computes an offset into a shared array +that is being updated: + +:: + + uniform float array[...] = { ... }; + int index = ...; + ++array[index]; + +If more than one active program instance computes the same value for +``index``, the above code has undefined behavior (see the section `Data +Races Within a Gang`_ for details.) The increment of ``array[index]`` +could instead be written inside a ``foreach_active`` statement: + +:: + + foreach_active (i) { + ++array[index]; + } + + +The variable name provided in parenthesis after the ``foreach_active`` +keyword (here, ``index``), causes a ``const uniform int64`` local variable +of that name to be declared, where the variable takes the ``programIndex`` +value of the program instance executing at each loop iteraton. + +In the code above, because only one program instance is executing at a time +when the loop body executes, the update to ``array`` is well-defined. +Note that for this particular example, the "local atomic" operations in +the standard library could be used instead to safely update ``array``. +However, local atomics functions aren't always available or appropriate for +more complex cases.) + +``continue`` statements may be used inside ``foreach_active`` loops, though +``break`` and ``return`` are prohibited. The order in which the active +program instances are processed in the loop is not defined. + +See the `Using "foreach_active" Effectively`_ Section in the ispc +Performance Guide for more details about ``foreach_active``. + +.. _Using "foreach_active" Effectively: perfguide.html#using-foreach-active-effectively + + + Iteration over unique elements: "foreach_unique" ------------------------------------------------ @@ -2408,7 +2472,144 @@ evaluated once, and it must be of an atomic type (``float``, ``int``, etc.), an ``enum`` type, or a pointer type. The iteration variable ``val`` is a variable of ``const uniform`` type of the iteration type; it can't be modified within the loop. Finally, ``break`` and ``return`` statements are -illegal within the loop body, but ``continue`` statemetns are allowed. +illegal within the loop body, but ``continue`` statements are allowed. + + +Parallel Iteration Statements: "foreach" and "foreach_tiled" +------------------------------------------------------------ + +The ``foreach`` and ``foreach_tiled`` constructs specify loops over a +possibly multi-dimensional domain of integer ranges. Their role goes +beyond "syntactic sugar"; they provides one of the two key ways of +expressing parallel computation in ``ispc``. + +In general, a ``foreach`` or ``foreach_tiled`` statement takes one or more +dimension specifiers separated by commas, where each dimension is specified +by ``identifier = start ... end``, where ``start`` is a signed integer +value less than or equal to ``end``, specifying iteration over all integer +values from ``start`` up to and including ``end-1``. An arbitrary number +of iteration dimensions may be specified, with each one spanning a +different range of values. Within the ``foreach`` loop, the given +identifiers are available as ``const varying int32`` variables. The +execution mask starts out "all on" at the start of each ``foreach`` loop +iteration, but may be changed by control flow constructs within the loop. + +It is illegal to have a ``break`` statement or a ``return`` statement +within a ``foreach`` loop; a compile-time error will be issued in this +case. (It is legal to have a ``break`` in a regular ``for`` loop that's +nested inside a ``foreach`` loop.) ``continue`` statements are legal in +``foreach`` loops; they have the same effect as in regular ``for`` loops: +a program instances that executes a ``continue`` statement effectively +skips over the rest of the loop body for the current iteration. + +It is also currently illegal to have nested ``foreach`` statements; this +limitation will be removed in a future release of ``ispc``. + +As a specific example, consider the following ``foreach`` statement: + +:: + + foreach (j = 0 ... height, i = 0 ... width) { + // loop body--process data element (i,j) + } + +It specifies a loop over a 2D domain, where the ``j`` variable goes from 0 +to ``height-1`` and ``i`` goes from 0 to ``width-1``. Within the loop, the +variables ``i`` and ``j`` are available and initialized accordingly. + +``foreach`` loops actually cause the given iteration domain to be +automatically mapped to the program instances in the gang, so that all of +the data can be processed, in gang-sized chunks. As a specific example, +consider a simple ``foreach`` loop like the following, on a target where +the gang size is 8: + +:: + + foreach (i = 0 ... 16) { + // perform computation on element i + } + +One possible valid execution path of this loop would be for the program +counter the step through the statements of this loop just ``16/8==2`` +times; the first time through, with the ``varying int32`` variable ``i`` +having the values (0,1,2,3,4,5,6,7) over the program instances, and the +second time through, having the values (8,9,10,11,12,13,14,15), thus +mapping the available program instances to all of the data by the end of +the loop's execution. + +In general, however, you shouldn't make any assumptions about the order in +which elements of the iteration domain will be processed by a ``foreach`` +loop. For example, the following code exhibits undefined behavior: + +:: + + uniform float a[10][100]; + foreach (i = 0 ... 10, j = 0 ... 100) { + if (i == 0) + a[i][j] = j; + else + // Error: can't assume that a[i-1][j] has been set yet + a[i][j] = a[i-1][j]; + +The ``foreach`` statement generally subdivides the iteration domain by +selecting sets of contiguous elements in the inner-most dimension of the +iteration domain. This decomposition approach generally leads to coherent +memory reads and writes, but may lead to worse control flow coherence than +other decompositions. + +Therefore, ``foreach_tiled`` decomposes the iteration domain in a way that +tries to map locations in the domain to program instances in a way that is +compact across all of the dimensions. For example, on a target with an +8-wide gang size, the following ``foreach_tiled`` statement might process +the iteration domain in chunks of 2 elements in ``j`` and 4 elements in +``i`` each time. (The trade-offs between these two constructs are +discussed in more detail in the `ispc Performance Guide`_.) + +.. _ispc Performance Guide: perfguide.html#improving-control-flow-coherence-with-foreach-tiled + +:: + + foreach_tiled (j = 0 ... height, i = 0 ... width) { + // loop body--process data element (i,j) + } + + +Parallel Iteration with "programIndex" and "programCount" +--------------------------------------------------------- + +In addition to ``foreach`` and ``foreach_tiled``, ``ispc`` provides a +lower-level mechanism for mapping SPMD program instances to data to operate +on via the built-in ``programIndex`` and ``programCount`` variables. + +``programIndex`` gives the index of the SIMD-lane being used for running +each program instance. (In other words, it's a varying integer value that +has value zero for the first program instance, and so forth.) The +``programCount`` builtin gives the total number of instances in the gang. +Together, these can be used to uniquely map executing program instances to +input data. [#]_ + +.. [#] ``programIndex`` is analogous to ``get_global_id()`` in OpenCL* and + ``threadIdx`` in CUDA*. + +As a specific example, consider an ``ispc`` function that needs to perform +some computation on an array of data. + +:: + + for (uniform int i = 0; i < count; i += programCount) { + float d = data[i + programIndex]; + float r = .... + result[i + programIndex] = r; + } + +Here, we've written a loop that explicitly loops over the data in chunks of +``programCount`` elements. In each loop iteration, the running program +instances effectively collude amongst themselves using ``programIndex`` to +determine which elements to work on in a way that ensures that all of the +data elements will be processed. In this particular case, a ``foreach`` +loop would be preferable, as ``foreach`` naturally handles the case where +``programCount`` doesn't evenly divide the number of elements to be +processed, while the loop above assumes that case implicitly. Unstructured Control Flow: "goto" @@ -2479,139 +2680,6 @@ constructs in ``ispc`` that a loop will never be executed with an "all off" execution mask. -Parallel Iteration Statements: "foreach" and "foreach_tiled" ------------------------------------------------------------- - -The ``foreach`` and ``foreach_tiled`` constructs specify loops over a -possibly multi-dimensional domain of integer ranges. Their role goes -beyond "syntactic sugar"; they provides one of the two key ways of -expressing parallel computation in ``ispc``. - -In general, a ``foreach`` or ``foreach_tiled`` statement takes one or more -dimension specifiers separated by commas, where each dimension is specified -by ``identifier = start ... end``, where ``start`` is a signed integer -value less than or equal to ``end``, specifying iteration over all integer -values from ``start`` up to and including ``end-1``. An arbitrary number -of iteration dimensions may be specified, with each one spanning a -different range of values. Within the ``foreach`` loop, the given -identifiers are available as ``const varying int32`` variables. The -execution mask starts out "all on" at the start of each ``foreach`` loop -iteration, but may be changed by control flow constructs within the loop. - -It is illegal to have a ``break`` statement or a ``return`` statement -within a ``foreach`` loop; a compile-time error will be issued in this -case. (It is legal to have a ``break`` in a regular ``for`` loop that's -nested inside a ``foreach`` loop.) ``continue`` statements are legal in -``foreach`` loops; they have the same effect as in regular ``for`` loops: -a program instances that executes a ``continue`` statement effectively -skips over the rest of the loop body for the current iteration. - -As a specific example, consider the following ``foreach`` statement: - -:: - - foreach (j = 0 ... height, i = 0 ... width) { - // loop body--process data element (i,j) - } - -It specifies a loop over a 2D domain, where the ``j`` variable goes from 0 -to ``height-1`` and ``i`` goes from 0 to ``width-1``. Within the loop, the -variables ``i`` and ``j`` are available and initialized accordingly. - -``foreach`` loops actually cause the given iteration domain to be -automatically mapped to the program instances in the gang, so that all of -the data can be processed, in gang-sized chunks. As a specific example, -consider a simple ``foreach`` loop like the following, on a target where -the gang size is 8: - -:: - - foreach (i = 0 ... 16) { - // perform computation on element i - } - -One possible valid execution path of this loop would be for the program -counter the step through the statements of this loop just ``16/8==2`` -times; the first time through, with the ``varying int32`` variable ``i`` -having the values (0,1,2,3,4,5,6,7) over the program instances, and the -second time through, having the values (8,9,10,11,12,13,14,15), thus -mapping the available program instances to all of the data by the end of -the loop's execution. - -In general, however, you shouldn't make any assumptions about the order in -which elements of the iteration domain will be processed by a ``foreach`` -loop. For example, the following code exhibits undefined behavior: - -:: - - uniform float a[10][100]; - foreach (i = 0 ... 10, j = 0 ... 100) { - if (i == 0) - a[i][j] = j; - else - // Error: can't assume that a[i-1][j] has been set yet - a[i][j] = a[i-1][j]; - -The ``foreach`` statement generally subdivides the iteration domain by -selecting sets of contiguous elements in the inner-most dimension of the -iteration domain. This decomposition approach generally leads to coherent -memory reads and writes, but may lead to worse control flow coherence than -other decompositions. - -Therefore, ``foreach_tiled`` decomposes the iteration domain in a way that -tries to map locations in the domain to program instances in a way that is -compact across all of the dimensions. For example, on a target with an -8-wide gang size, the following ``foreach_tiled`` statement might process -the iteration domain in chunks of 2 elements in ``j`` and 4 elements in -``i`` each time. (The trade-offs between these two constructs are -discussed in more detail in the `ispc Performance Guide`_.) - -.. _ispc Performance Guide: perf.html#improving-control-flow-coherence-with-foreach-tiled - -:: - - foreach_tiled (j = 0 ... height, i = 0 ... width) { - // loop body--process data element (i,j) - } - - -Parallel Iteration with "programIndex" and "programCount" ---------------------------------------------------------- - -In addition to ``foreach`` and ``foreach_tiled``, ``ispc`` provides a -lower-level mechanism for mapping SPMD program instances to data to operate -on via the built-in ``programIndex`` and ``programCount`` variables. - -``programIndex`` gives the index of the SIMD-lane being used for running -each program instance. (In other words, it's a varying integer value that -has value zero for the first program instance, and so forth.) The -``programCount`` builtin gives the total number of instances in the gang. -Together, these can be used to uniquely map executing program instances to -input data. [#]_ - -.. [#] ``programIndex`` is analogous to ``get_global_id()`` in OpenCL* and - ``threadIdx`` in CUDA*. - -As a specific example, consider an ``ispc`` function that needs to perform -some computation on an array of data. - -:: - - for (uniform int i = 0; i < count; i += programCount) { - float d = data[i + programIndex]; - float r = .... - result[i + programIndex] = r; - } - -Here, we've written a loop that explicitly loops over the data in chunks of -``programCount`` elements. In each loop iteration, the running program -instances effectively collude amongst themselves using ``programIndex`` to -determine which elements to work on in a way that ensures that all of the -data elements will be processed. In this particular case, a ``foreach`` -loop would be preferable, as ``foreach`` naturally handles the case where -``programCount`` doesn't evenly divide the number of elements to be -processed, while the loop above assumes that case implicitly. - Functions and Function Calls ---------------------------- @@ -3452,7 +3520,7 @@ There are also variants of these functions that return the value as a discussion of an application of this variant to improve memory access performance in the `Performance Guide`_. -.. _Performance Guide: perf.html#understanding-gather-and-scatter +.. _Performance Guide: perfguide.html#understanding-gather-and-scatter :: @@ -4130,8 +4198,10 @@ from ``ispc`` must be declared as follows: It is illegal to overload functions declared with ``extern "C"`` linkage; ``ispc`` issues an error in this case. -Function calls back to C/C++ are not made if none of the program instances -want to make the call. For example, given code like: +**Only a single function call is made back to C++ for the entire gang of +runing program instances**. Furthermore, function calls back to C/C++ are not +made if none of the program instances want to make the call. For example, +given code like: :: @@ -4174,6 +4244,24 @@ Application code can thus be written as: } } +In some cases, it can be desirable to generate a single call for each +executing program instance, rather than one call for a gang. For example, +the code below shows how one might call an existing math library routine +that takes a scalar parameter. + +:: + + extern "C" uniform double erf(uniform double); + double v = ...; + double result; + foreach_active (instance) { + uniform double r = erf(extract(v, instance)); + result = insert(result, instance, r); + } + +This code calls ``erf()`` once for each active program instance, passing it +the program instance's value of ``v`` and storing the result in the +instance's ``result`` value. Data Layout ----------- @@ -4309,7 +4397,7 @@ can also have a significant effect on performance; in general, creating groups of work that will tend to do similar computation across the SPMD program instances improves performance. -.. _ispc Performance Tuning Guide: http://ispc.github.com/perf.html +.. _ispc Performance Tuning Guide: http://ispc.github.com/perfguide.html Disclaimer and Legal Information diff --git a/docs/perfguide.rst b/docs/perfguide.rst index b8e65893..b1d110b2 100644 --- a/docs/perfguide.rst +++ b/docs/perfguide.rst @@ -21,6 +21,7 @@ the most out of ``ispc`` in practice. + `Avoid 64-bit Addressing Calculations When Possible`_ + `Avoid Computation With 8 and 16-bit Integer Types`_ + `Implementing Reductions Efficiently`_ + + `Using "foreach_active" Effectively`_ + `Using Low-level Vector Tricks`_ + `The "Fast math" Option`_ + `"inline" Aggressively`_ @@ -510,6 +511,43 @@ values--very efficient code in the end. return reduce_add(sum); } +Using "foreach_active" Effectively +---------------------------------- + +For high-performance code, + +For example, consider this segment of code, from the introduction of +``foreach_active`` in the ispc User's Guide: + +:: + + uniform float array[...] = { ... }; + int index = ...; + foreach_active (i) { + ++array[index]; + } + +Here, ``index`` was assumed to possibly have the same value for multiple +program instances, so the updates to ``array[index]`` are serialized by the +``foreach_active`` statement in order to not have undefined results when +``index`` values do collide. + +The code generated by the compiler can be improved in this case by making +it clear that only a single element of the array is accessed by +``array[index]`` and that thus a general gather or scatter isn't required. +Specifically, by using the ``extract()`` function from the standard library +to extract the current program instance's value of ``index`` into a +``uniform`` variable and then using that to index into ``array``, as below, +more efficient code is generated. + +:: + + foreach_active (instanceNum) { + uniform int unifIndex = extract(index, instanceNum); + ++array[unifIndex]; + } + + Using Low-level Vector Tricks ----------------------------- diff --git a/lex.ll b/lex.ll index d55ce930..9bf768e4 100644 --- a/lex.ll +++ b/lex.ll @@ -66,8 +66,8 @@ static int allTokens[] = { TOKEN_CONST, TOKEN_CONTINUE, TOKEN_CRETURN, TOKEN_DEFAULT, TOKEN_DO, TOKEN_DELETE, TOKEN_DOUBLE, TOKEN_ELSE, TOKEN_ENUM, TOKEN_EXPORT, TOKEN_EXTERN, TOKEN_FALSE, TOKEN_FLOAT, TOKEN_FOR, - TOKEN_FOREACH, TOKEN_FOREACH_TILED, TOKEN_FOREACH_UNIQUE, - TOKEN_GOTO, TOKEN_IF, TOKEN_IN, TOKEN_INLINE, + TOKEN_FOREACH, TOKEN_FOREACH_ACTIVE, TOKEN_FOREACH_TILED, + TOKEN_FOREACH_UNIQUE, TOKEN_GOTO, TOKEN_IF, TOKEN_IN, TOKEN_INLINE, TOKEN_INT, TOKEN_INT8, TOKEN_INT16, TOKEN_INT, TOKEN_INT64, TOKEN_LAUNCH, TOKEN_NEW, TOKEN_NULL, TOKEN_PRINT, TOKEN_RETURN, TOKEN_SOA, TOKEN_SIGNED, TOKEN_SIZEOF, TOKEN_STATIC, TOKEN_STRUCT, TOKEN_SWITCH, TOKEN_SYNC, @@ -115,6 +115,7 @@ void ParserInit() { tokenToName[TOKEN_FLOAT] = "float"; tokenToName[TOKEN_FOR] = "for"; tokenToName[TOKEN_FOREACH] = "foreach"; + tokenToName[TOKEN_FOREACH_ACTIVE] = "foreach_active"; tokenToName[TOKEN_FOREACH_TILED] = "foreach_tiled"; tokenToName[TOKEN_FOREACH_UNIQUE] = "foreach_unique"; tokenToName[TOKEN_GOTO] = "goto"; @@ -226,6 +227,7 @@ void ParserInit() { tokenNameRemap["TOKEN_FLOAT"] = "\'float\'"; tokenNameRemap["TOKEN_FOR"] = "\'for\'"; tokenNameRemap["TOKEN_FOREACH"] = "\'foreach\'"; + tokenNameRemap["TOKEN_FOREACH_ACTIVE"] = "\'foreach_active\'"; tokenNameRemap["TOKEN_FOREACH_TILED"] = "\'foreach_tiled\'"; tokenNameRemap["TOKEN_FOREACH_UNIQUE"] = "\'foreach_unique\'"; tokenNameRemap["TOKEN_GOTO"] = "\'goto\'"; @@ -369,8 +371,8 @@ extern { RT; return TOKEN_EXTERN; } false { RT; return TOKEN_FALSE; } float { RT; return TOKEN_FLOAT; } for { RT; return TOKEN_FOR; } -__foreach_active { RT; return TOKEN_FOREACH_ACTIVE; } foreach { RT; return TOKEN_FOREACH; } +foreach_active { RT; return TOKEN_FOREACH_ACTIVE; } foreach_tiled { RT; return TOKEN_FOREACH_TILED; } foreach_unique { RT; return TOKEN_FOREACH_UNIQUE; } goto { RT; return TOKEN_GOTO; } diff --git a/parse.yy b/parse.yy index 8200fb3d..7a9026e3 100644 --- a/parse.yy +++ b/parse.yy @@ -117,8 +117,8 @@ static const char *lBuiltinTokens[] = { "assert", "bool", "break", "case", "cbreak", "ccontinue", "cdo", "cfor", "cif", "cwhile", "const", "continue", "creturn", "default", "do", "delete", "double", "else", "enum", "export", "extern", "false", - "float", "for", "foreach", "foreach_tiled", "foreach_unique", - "goto", "if", "in", "inline", + "float", "for", "foreach", "foreach_active", "foreach_tiled", + "foreach_unique", "goto", "if", "in", "inline", "int", "int8", "int16", "int32", "int64", "launch", "new", "NULL", "print", "return", "signed", "sizeof", "static", "struct", "switch", "sync", "task", "true", "typedef", "uniform", "unmasked", "unsigned", @@ -1688,7 +1688,7 @@ foreach_active_scope foreach_active_identifier : TOKEN_IDENTIFIER { - $$ = new Symbol(yytext, @1, AtomicType::UniformInt32); + $$ = new Symbol(yytext, @1, AtomicType::UniformInt64->GetAsConstType()); } ; @@ -1838,7 +1838,7 @@ iteration_statement } statement { - $$ = CreateForeachActiveStmt($3, $6, Union(@1, @4)); + $$ = new ForeachActiveStmt($3, $6, Union(@1, @4)); m->symbolTable->PopScope(); } | foreach_unique_scope '(' foreach_unique_identifier TOKEN_IN diff --git a/stdlib.ispc b/stdlib.ispc index 84865d90..a7499930 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -421,7 +421,7 @@ static inline void memcpy(void * varying dst, void * varying src, da[programIndex] = dst; sa[programIndex] = src; - __foreach_active (i) { + foreach_active (i) { void * uniform d = da[i], * uniform s = sa[i]; __memcpy32((int8 * uniform)d, (int8 * uniform)s, extract(count, i)); } @@ -435,7 +435,7 @@ static inline void memcpy64(void * varying dst, void * varying src, da[programIndex] = dst; sa[programIndex] = src; - __foreach_active (i) { + foreach_active (i) { void * uniform d = da[i], * uniform s = sa[i]; __memcpy64((int8 * uniform)d, (int8 * uniform)s, extract(count, i)); } @@ -459,7 +459,7 @@ static inline void memmove(void * varying dst, void * varying src, da[programIndex] = dst; sa[programIndex] = src; - __foreach_active (i) { + foreach_active (i) { void * uniform d = da[i], * uniform s = sa[i]; __memmove32((int8 * uniform)d, (int8 * uniform)s, extract(count, i)); } @@ -473,7 +473,7 @@ static inline void memmove64(void * varying dst, void * varying src, da[programIndex] = dst; sa[programIndex] = src; - __foreach_active (i) { + foreach_active (i) { void * uniform d = da[i], * uniform s = sa[i]; __memmove64((int8 * uniform)d, (int8 * uniform)s, extract(count, i)); } @@ -493,7 +493,7 @@ static inline void memset(void * varying ptr, int8 val, int32 count) { void * uniform pa[programCount]; pa[programIndex] = ptr; - __foreach_active (i) { + foreach_active (i) { __memset32((int8 * uniform)pa[i], extract(val, i), extract(count, i)); } } @@ -502,7 +502,7 @@ static inline void memset64(void * varying ptr, int8 val, int64 count) { void * uniform pa[programCount]; pa[programIndex] = ptr; - __foreach_active (i) { + foreach_active (i) { __memset64((int8 * uniform)pa[i], extract(val, i), extract(count, i)); } } @@ -711,7 +711,7 @@ static inline void prefetch_l1(const void * varying ptr) { const void * uniform ptrArray[programCount]; ptrArray[programIndex] = ptr; - __foreach_active (i) { + foreach_active (i) { const void * uniform p = ptrArray[i]; prefetch_l1(p); } @@ -721,7 +721,7 @@ static inline void prefetch_l2(const void * varying ptr) { const void * uniform ptrArray[programCount]; ptrArray[programIndex] = ptr; - __foreach_active (i) { + foreach_active (i) { const void * uniform p = ptrArray[i]; prefetch_l2(p); } @@ -731,7 +731,7 @@ static inline void prefetch_l3(const void * varying ptr) { const void * uniform ptrArray[programCount]; ptrArray[programIndex] = ptr; - __foreach_active (i) { + foreach_active (i) { const void * uniform p = ptrArray[i]; prefetch_l3(p); } @@ -741,7 +741,7 @@ static inline void prefetch_nt(const void * varying ptr) { const void * uniform ptrArray[programCount]; ptrArray[programIndex] = ptr; - __foreach_active (i) { + foreach_active (i) { const void * uniform p = ptrArray[i]; prefetch_nt(p); } @@ -1621,7 +1621,7 @@ static inline TA atomic_##OPA##_global(uniform TA * varying ptr, TA value) { \ uniform TA * uniform ptrArray[programCount]; \ ptrArray[programIndex] = ptr; \ TA ret; \ - __foreach_active (i) { \ + foreach_active (i) { \ uniform TA * uniform p = ptrArray[i]; \ uniform TA v = extract(value, i); \ uniform TA r = __atomic_##OPB##_uniform_##TB##_global(p, v); \ @@ -1672,7 +1672,7 @@ static inline TA atomic_swap_global(uniform TA * varying ptr, TA value) { \ uniform TA * uniform ptrArray[programCount]; \ ptrArray[programIndex] = ptr; \ TA ret; \ - __foreach_active (i) { \ + foreach_active (i) { \ uniform TA * uniform p = ptrArray[i]; \ uniform TA v = extract(value, i); \ uniform TA r = __atomic_swap_uniform_##TB##_global(p, v); \ @@ -1699,7 +1699,7 @@ static inline TA atomic_##OPA##_global(uniform TA * varying ptr, \ uniform TA * uniform ptrArray[programCount]; \ ptrArray[programIndex] = ptr; \ TA ret; \ - __foreach_active (i) { \ + foreach_active (i) { \ uniform TA * uniform p = ptrArray[i]; \ uniform TA v = extract(value, i); \ uniform TA r = __atomic_##OPB##_uniform_##TB##_global(p, v); \ @@ -1774,7 +1774,7 @@ static inline TA atomic_compare_exchange_global( \ uniform TA * uniform ptrArray[programCount]; \ ptrArray[programIndex] = ptr; \ TA ret; \ - __foreach_active (i) { \ + foreach_active (i) { \ uniform TA r = \ __atomic_compare_exchange_uniform_##TB##_global(ptrArray[i], \ extract(oldval, i), \ @@ -1848,7 +1848,7 @@ static inline uniform TYPE atomic_##NAME##_local(uniform TYPE * uniform ptr, \ } \ static inline TYPE atomic_##NAME##_local(uniform TYPE * uniform ptr, TYPE value) { \ TYPE ret; \ - __foreach_active (i) { \ + foreach_active (i) { \ ret = insert(ret, i, *ptr); \ *ptr = OPFUNC(*ptr, extract(value, i)); \ } \ @@ -1858,7 +1858,7 @@ static inline TYPE atomic_##NAME##_local(uniform TYPE * p, TYPE value) { \ TYPE ret; \ uniform TYPE * uniform ptrs[programCount]; \ ptrs[programIndex] = p; \ - __foreach_active (i) { \ + foreach_active (i) { \ ret = insert(ret, i, *ptrs[i]); \ *ptrs[i] = OPFUNC(*ptrs[i], extract(value, i)); \ } \ @@ -1975,7 +1975,7 @@ static inline uniform TYPE atomic_compare_exchange_local(uniform TYPE * uniform static inline TYPE atomic_compare_exchange_local(uniform TYPE * uniform ptr, \ TYPE cmp, TYPE update) { \ TYPE ret; \ - __foreach_active (i) { \ + foreach_active (i) { \ uniform TYPE old = *ptr; \ if (old == extract(cmp, i)) \ *ptr = extract(update, i); \ @@ -1988,7 +1988,7 @@ static inline TYPE atomic_compare_exchange_local(uniform TYPE * varying p, \ uniform TYPE * uniform ptrs[programCount]; \ ptrs[programIndex] = p; \ TYPE ret; \ - __foreach_active (i) { \ + foreach_active (i) { \ uniform TYPE old = *ptrs[i]; \ if (old == extract(cmp, i)) \ *ptrs[i] = extract(update, i); \ @@ -2127,7 +2127,7 @@ static inline float sin(float x_full) { } else if (__math_lib == __math_lib_system) { float ret; - __foreach_active (i) { + foreach_active (i) { uniform float r = __stdlib_sinf(extract(x_full, i)); ret = insert(ret, i, r); } @@ -2259,7 +2259,7 @@ static inline float asin(float x) { if (__math_lib == __math_lib_svml || __math_lib == __math_lib_system) { float ret; - __foreach_active (i) { + foreach_active (i) { uniform float r = __stdlib_asinf(extract(x, i)); ret = insert(ret, i, r); } @@ -2364,7 +2364,7 @@ static inline float cos(float x_full) { } else if (__math_lib == __math_lib_system) { float ret; - __foreach_active (i) { + foreach_active (i) { uniform float r = __stdlib_cosf(extract(x_full, i)); ret = insert(ret, i, r); } @@ -2502,7 +2502,7 @@ static inline void sincos(float x_full, varying float * uniform sin_result, __svml_sincos(x_full, sin_result, cos_result); } else if (__math_lib == __math_lib_system) { - __foreach_active (i) { + foreach_active (i) { uniform float s, c; __stdlib_sincosf(extract(x_full, i), &s, &c); *sin_result = insert(*sin_result, i, s); @@ -2635,7 +2635,7 @@ static inline float tan(float x_full) { } else if (__math_lib == __math_lib_system) { float ret; - __foreach_active (i) { + foreach_active (i) { uniform float r = __stdlib_tanf(extract(x_full, i)); ret = insert(ret, i, r); } @@ -2786,7 +2786,7 @@ static inline float atan(float x_full) { } else if (__math_lib == __math_lib_system) { float ret; - __foreach_active (i) { + foreach_active (i) { uniform float r = __stdlib_atanf(extract(x_full, i)); ret = insert(ret, i, r); } @@ -2881,7 +2881,7 @@ static inline float atan2(float y, float x) { } else if (__math_lib == __math_lib_system) { float ret; - __foreach_active (i) { + foreach_active (i) { uniform float r = __stdlib_atan2f(extract(y, i), extract(x, i)); ret = insert(ret, i, r); } @@ -2944,7 +2944,7 @@ static inline float exp(float x_full) { } else if (__math_lib == __math_lib_system) { float ret; - __foreach_active (i) { + foreach_active (i) { uniform float r = __stdlib_expf(extract(x_full, i)); ret = insert(ret, i, r); } @@ -3151,7 +3151,7 @@ static inline float log(float x_full) { } else if (__math_lib == __math_lib_system) { float ret; - __foreach_active (i) { + foreach_active (i) { uniform float r = __stdlib_logf(extract(x_full, i)); ret = insert(ret, i, r); } @@ -3326,7 +3326,7 @@ static inline float pow(float a, float b) { } else if (__math_lib == __math_lib_system) { float ret; - __foreach_active (i) { + foreach_active (i) { uniform float r = __stdlib_powf(extract(a, i), extract(b, i)); ret = insert(ret, i, r); } @@ -3416,7 +3416,7 @@ static inline double sin(double x) { return sin((float)x); else { double ret; - __foreach_active (i) { + foreach_active (i) { uniform double r = __stdlib_sin(extract(x, i)); ret = insert(ret, i, r); } @@ -3438,7 +3438,7 @@ static inline double cos(double x) { return cos((float)x); else { double ret; - __foreach_active (i) { + foreach_active (i) { uniform double r = __stdlib_cos(extract(x, i)); ret = insert(ret, i, r); } @@ -3464,7 +3464,7 @@ static inline void sincos(double x, varying double * uniform sin_result, *cos_result = cr; } else { - __foreach_active (i) { + foreach_active (i) { uniform double sr, cr; __stdlib_sincos(extract(x, i), &sr, &cr); *sin_result = insert(*sin_result, i, sr); @@ -3492,7 +3492,7 @@ static inline double tan(double x) { return tan((float)x); else { double ret; - __foreach_active (i) { + foreach_active (i) { uniform double r = __stdlib_tan(extract(x, i)); ret = insert(ret, i, r); } @@ -3514,7 +3514,7 @@ static inline double atan(double x) { return atan((float)x); else { double ret; - __foreach_active (i) { + foreach_active (i) { uniform double r = __stdlib_atan(extract(x, i)); ret = insert(ret, i, r); } @@ -3536,7 +3536,7 @@ static inline double atan2(double y, double x) { return atan2((float)y, (float)x); else { double ret; - __foreach_active (i) { + foreach_active (i) { uniform double r = __stdlib_atan2(extract(y, i), extract(x, i)); ret = insert(ret, i, r); } @@ -3558,7 +3558,7 @@ static inline double exp(double x) { return exp((float)x); else { double ret; - __foreach_active (i) { + foreach_active (i) { uniform double r = __stdlib_exp(extract(x, i)); ret = insert(ret, i, r); } @@ -3580,7 +3580,7 @@ static inline double log(double x) { return log((float)x); else { double ret; - __foreach_active (i) { + foreach_active (i) { uniform double r = __stdlib_log(extract(x, i)); ret = insert(ret, i, r); } @@ -3602,7 +3602,7 @@ static inline double pow(double a, double b) { return pow((float)a, (float)b); else { double ret; - __foreach_active (i) { + foreach_active (i) { uniform double r = __stdlib_pow(extract(a, i), extract(b, i)); ret = insert(ret, i, r); } diff --git a/stmt.cpp b/stmt.cpp index e6622bb2..533cbac8 100644 --- a/stmt.cpp +++ b/stmt.cpp @@ -1915,6 +1915,188 @@ ForeachStmt::Print(int indent) const { } +/////////////////////////////////////////////////////////////////////////// +// ForeachActiveStmt + +ForeachActiveStmt::ForeachActiveStmt(Symbol *s, Stmt *st, SourcePos pos) + : Stmt(pos) { + sym = s; + stmts = st; +} + + +void +ForeachActiveStmt::EmitCode(FunctionEmitContext *ctx) const { + if (!ctx->GetCurrentBasicBlock()) + return; + + // Allocate storage for the symbol that we'll use for the uniform + // variable that holds the current program instance in each loop + // iteration. + if (sym->type == NULL) { + Assert(m->errorCount > 0); + return; + } + Assert(Type::Equal(sym->type, + AtomicType::UniformInt64->GetAsConstType())); + sym->storagePtr = ctx->AllocaInst(LLVMTypes::Int64Type, sym->name.c_str()); + + ctx->SetDebugPos(pos); + ctx->EmitVariableDebugInfo(sym); + + // The various basic blocks that we'll need in the below + llvm::BasicBlock *bbFindNext = + ctx->CreateBasicBlock("foreach_active_find_next"); + llvm::BasicBlock *bbBody = ctx->CreateBasicBlock("foreach_active_body"); + llvm::BasicBlock *bbCheckForMore = + ctx->CreateBasicBlock("foreach_active_check_for_more"); + llvm::BasicBlock *bbDone = ctx->CreateBasicBlock("foreach_active_done"); + + // Save the old mask so that we can restore it at the end + llvm::Value *oldInternalMask = ctx->GetInternalMask(); + + // Now, *maskBitsPtr will maintain a bitmask for the lanes that remain + // to be processed by a pass through the loop body. It starts out with + // the current execution mask (which should never be all off going in + // to this)... + llvm::Value *oldFullMask = ctx->GetFullMask(); + llvm::Value *maskBitsPtr = + ctx->AllocaInst(LLVMTypes::Int64Type, "mask_bits"); + llvm::Value *movmsk = ctx->LaneMask(oldFullMask); + ctx->StoreInst(movmsk, maskBitsPtr); + + // Officially start the loop. + ctx->StartScope(); + ctx->StartForeach(FunctionEmitContext::FOREACH_ACTIVE); + ctx->SetContinueTarget(bbCheckForMore); + + // Onward to find the first set of program instance to run the loop for + ctx->BranchInst(bbFindNext); + + ctx->SetCurrentBasicBlock(bbFindNext); { + // Load the bitmask of the lanes left to be processed + llvm::Value *remainingBits = ctx->LoadInst(maskBitsPtr, "remaining_bits"); + + // Find the index of the first set bit in the mask + llvm::Function *ctlzFunc = + m->module->getFunction("__count_trailing_zeros_i64"); + Assert(ctlzFunc != NULL); + llvm::Value *firstSet = ctx->CallInst(ctlzFunc, NULL, remainingBits, + "first_set"); + + // Store that value into the storage allocated for the iteration + // variable. + ctx->StoreInst(firstSet, sym->storagePtr); + + // Now set the execution mask to be only on for the current program + // instance. (TODO: is there a more efficient way to do this? e.g. + // for AVX1, we might want to do this as float rather than int + // math...) + + // Get the "program index" vector value + llvm::Value *programIndex = + llvm::UndefValue::get(LLVMTypes::Int32VectorType); + for (int i = 0; i < g->target.vectorWidth; ++i) + programIndex = ctx->InsertInst(programIndex, LLVMInt32(i), i, + "prog_index"); + + // And smear the current lane out to a vector + llvm::Value *firstSet32 = + ctx->TruncInst(firstSet, LLVMTypes::Int32Type, "first_set32"); + llvm::Value *firstSet32Smear = ctx->SmearUniform(firstSet32); + + // Now set the execution mask based on doing a vector compare of + // these two + llvm::Value *iterMask = + ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, + firstSet32Smear, programIndex); + iterMask = ctx->I1VecToBoolVec(iterMask); + + ctx->SetInternalMask(iterMask); + + // Also update the bitvector of lanes left to turn off the bit for + // the lane we're about to run. + llvm::Value *setMask = + ctx->BinaryOperator(llvm::Instruction::Shl, LLVMInt64(1), + firstSet, "set_mask"); + llvm::Value *notSetMask = ctx->NotOperator(setMask); + llvm::Value *newRemaining = + ctx->BinaryOperator(llvm::Instruction::And, remainingBits, + notSetMask, "new_remaining"); + ctx->StoreInst(newRemaining, maskBitsPtr); + + // and onward to run the loop body... + ctx->BranchInst(bbBody); + } + + ctx->SetCurrentBasicBlock(bbBody); { + // Run the code in the body of the loop. This is easy now. + if (stmts) + stmts->EmitCode(ctx); + + Assert(ctx->GetCurrentBasicBlock() != NULL); + ctx->BranchInst(bbCheckForMore); + } + + ctx->SetCurrentBasicBlock(bbCheckForMore); { + // At the end of the loop body (either due to running the + // statements normally, or a continue statement in the middle of + // the loop that jumps to the end, see if there are any lanes left + // to be processed. + llvm::Value *remainingBits = ctx->LoadInst(maskBitsPtr, "remaining_bits"); + llvm::Value *nonZero = + ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_NE, + remainingBits, LLVMInt64(0), "remaining_ne_zero"); + ctx->BranchInst(bbFindNext, bbDone, nonZero); + } + + ctx->SetCurrentBasicBlock(bbDone); + ctx->SetInternalMask(oldInternalMask); + ctx->EndForeach(); + ctx->EndScope(); +} + + +void +ForeachActiveStmt::Print(int indent) const { + printf("%*cForeach_active Stmt", indent, ' '); + pos.Print(); + printf("\n"); + + printf("%*cIter symbol: ", indent+4, ' '); + if (sym != NULL) { + printf("%s", sym->name.c_str()); + if (sym->type != NULL) + printf(" %s", sym->type->GetString().c_str()); + } + else + printf("NULL"); + printf("\n"); + + printf("%*cStmts:\n", indent+4, ' '); + if (stmts != NULL) + stmts->Print(indent+8); + else + printf("NULL"); + printf("\n"); +} + + +Stmt * +ForeachActiveStmt::TypeCheck() { + if (sym == NULL) + return NULL; + + return this; +} + + +int +ForeachActiveStmt::EstimateCost() const { + return COST_VARYING_LOOP; +} + + /////////////////////////////////////////////////////////////////////////// // ForeachUniqueStmt @@ -3043,80 +3225,3 @@ int DeleteStmt::EstimateCost() const { return COST_DELETE; } - -/////////////////////////////////////////////////////////////////////////// - -/** This generates AST nodes for an __foreach_active statement. This - construct can be synthesized ouf of the existing ForStmt (and other AST - nodes), so here we just build up the AST that we need rather than - having a new Stmt implementation for __foreach_active. - - @param iterSym Symbol for the iteration variable (e.g. "i" in - __foreach_active (i) { .. .} - @param stmts Statements to execute each time through the loop, for - each active program instance. - @param pos Position of the __foreach_active statement in the source - file. - */ -Stmt * -CreateForeachActiveStmt(Symbol *iterSym, Stmt *stmts, SourcePos pos) { - if (iterSym == NULL) { - AssertPos(pos, m->errorCount > 0); - return NULL; - } - - // loop initializer: set iter = 0 - std::vector var; - ConstExpr *zeroExpr = new ConstExpr(AtomicType::UniformInt32, 0, - iterSym->pos); - var.push_back(VariableDeclaration(iterSym, zeroExpr)); - Stmt *initStmt = new DeclStmt(var, iterSym->pos); - - // loop test: (iter < programCount) - ConstExpr *progCountExpr = - new ConstExpr(AtomicType::UniformInt32, g->target.vectorWidth, - pos); - SymbolExpr *symExpr = new SymbolExpr(iterSym, iterSym->pos); - Expr *testExpr = new BinaryExpr(BinaryExpr::Lt, symExpr, progCountExpr, - pos); - - // loop step: ++iterSym - UnaryExpr *incExpr = new UnaryExpr(UnaryExpr::PreInc, symExpr, pos); - Stmt *stepStmt = new ExprStmt(incExpr, pos); - - // loop body - // First, call __movmsk(__mask)) to get the mask as a set of bits. - // This should be hoisted out of the loop - Symbol *maskSym = m->symbolTable->LookupVariable("__mask"); - AssertPos(pos, maskSym != NULL); - Expr *maskVecExpr = new SymbolExpr(maskSym, pos); - std::vector mmFuns; - m->symbolTable->LookupFunction("__movmsk", &mmFuns); - AssertPos(pos, mmFuns.size() == (g->target.maskBitCount == 32 ? 2 : 1)); - FunctionSymbolExpr *movmskFunc = new FunctionSymbolExpr("__movmsk", mmFuns, - pos); - ExprList *movmskArgs = new ExprList(maskVecExpr, pos); - FunctionCallExpr *movmskExpr = new FunctionCallExpr(movmskFunc, movmskArgs, - pos); - - // Compute the per lane mask to test the mask bits against: (1 << iter) - ConstExpr *oneExpr = new ConstExpr(AtomicType::UniformInt64, int64_t(1), - iterSym->pos); - Expr *shiftLaneExpr = new BinaryExpr(BinaryExpr::Shl, oneExpr, symExpr, - pos); - - // Compute the AND: movmsk & (1 << iter) - Expr *maskAndLaneExpr = new BinaryExpr(BinaryExpr::BitAnd, movmskExpr, - shiftLaneExpr, pos); - // Test to see if it's non-zero: (mask & (1 << iter)) != 0 - Expr *ifTestExpr = new BinaryExpr(BinaryExpr::NotEqual, maskAndLaneExpr, - zeroExpr, pos); - - // Now, enclose the provided statements in an if test such that they - // only run if the mask is non-zero for the lane we're currently - // handling in the loop. - IfStmt *laneCheckIf = new IfStmt(ifTestExpr, stmts, NULL, false, pos); - - // And return a for loop that wires it all together. - return new ForStmt(initStmt, testExpr, stepStmt, laneCheckIf, false, pos); -} diff --git a/stmt.h b/stmt.h index 01a7244d..ee6bd2f0 100644 --- a/stmt.h +++ b/stmt.h @@ -260,6 +260,23 @@ public: }; +/** Iteration over each executing program instance. + */ +class ForeachActiveStmt : public Stmt { +public: + ForeachActiveStmt(Symbol *iterSym, Stmt *stmts, SourcePos pos); + + void EmitCode(FunctionEmitContext *ctx) const; + void Print(int indent) const; + + Stmt *TypeCheck(); + int EstimateCost() const; + + Symbol *sym; + Stmt *stmts; +}; + + /** Parallel iteration over each unique value in the given (varying) expression. */ diff --git a/tests/foreach-active-1.ispc b/tests/foreach-active-1.ispc new file mode 100644 index 00000000..d7c5ccef --- /dev/null +++ b/tests/foreach-active-1.ispc @@ -0,0 +1,16 @@ + +export uniform int width() { return programCount; } + + +export void f_f(uniform float RET[], uniform float aFOO[]) { + float a = aFOO[programIndex]; + uniform int count = 0; + if (programIndex & 1) + foreach_active (i) + ++count; + RET[programIndex] = count; +} + +export void result(uniform float RET[]) { + RET[programIndex] = programCount / 2; +} diff --git a/tests/foreach-active-2.ispc b/tests/foreach-active-2.ispc new file mode 100644 index 00000000..80b4f606 --- /dev/null +++ b/tests/foreach-active-2.ispc @@ -0,0 +1,16 @@ + +export uniform int width() { return programCount; } + + +export void f_f(uniform float RET[], uniform float aFOO[]) { + float a = aFOO[programIndex]; + uniform int count = 0; + if (programIndex & 1) + foreach_active (i) + ++a; + RET[programIndex] = a; +} + +export void result(uniform float RET[]) { + RET[programIndex] = (1 + programIndex) + ((programIndex & 1) ? 1 : 0); +} diff --git a/tests/foreach-active-3.ispc b/tests/foreach-active-3.ispc new file mode 100644 index 00000000..b258907e --- /dev/null +++ b/tests/foreach-active-3.ispc @@ -0,0 +1,16 @@ + +export uniform int width() { return programCount; } + + +export void f_f(uniform float RET[], uniform float aFOO[]) { + float a = aFOO[programIndex]; + RET[programIndex] = a; + + if (programIndex & 1) + foreach_active (i) + ++RET[i]; +} + +export void result(uniform float RET[]) { + RET[programIndex] = (1 + programIndex) + ((programIndex & 1) ? 1 : 0); +} diff --git a/tests/foreach-active-4.ispc b/tests/foreach-active-4.ispc new file mode 100644 index 00000000..ffe72db0 --- /dev/null +++ b/tests/foreach-active-4.ispc @@ -0,0 +1,21 @@ + +export uniform int width() { return programCount; } + + +export void f_f(uniform float RET[], uniform float aFOO[]) { + float a = aFOO[programIndex]; + RET[programIndex] = a; + + if (programIndex & 1) { + foreach_active (i) { + if (i == 1) + continue; + ++RET[i]; + } + } +} + +export void result(uniform float RET[]) { + RET[programIndex] = (1 + programIndex) + ((programIndex & 1) ? 1 : 0); + --RET[1]; +} diff --git a/tests/foreach-active-5.ispc b/tests/foreach-active-5.ispc new file mode 100644 index 00000000..2b00f220 --- /dev/null +++ b/tests/foreach-active-5.ispc @@ -0,0 +1,22 @@ + +export uniform int width() { return programCount; } + + +export void f_f(uniform float RET[], uniform float aFOO[]) { + float a = aFOO[programIndex]; + RET[programIndex] = a; + + if (programIndex & 1) { + foreach_active (i) { + if (i & 1) + continue; + + int * uniform null = 0; + *null = 0; + } + } +} + +export void result(uniform float RET[]) { + RET[programIndex] = (1 + programIndex); +}