diff --git a/ast.cpp b/ast.cpp index 66b597d5..beb6004f 100644 --- a/ast.cpp +++ b/ast.cpp @@ -103,6 +103,7 @@ WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc, PrintStmt *ps; AssertStmt *as; DeleteStmt *dels; + UnmaskedStmt *ums; if ((es = dynamic_cast(node)) != NULL) es->expr = (Expr *)WalkAST(es->expr, preFunc, postFunc, data); @@ -174,6 +175,8 @@ WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc, as->expr = (Expr *)WalkAST(as->expr, preFunc, postFunc, data); else if ((dels = dynamic_cast(node)) != NULL) dels->expr = (Expr *)WalkAST(dels->expr, preFunc, postFunc, data); + else if ((ums = dynamic_cast(node)) != NULL) + ums->stmts = (Stmt *)WalkAST(ums->stmts, preFunc, postFunc, data); else FATAL("Unhandled statement type in WalkAST()"); } @@ -396,7 +399,8 @@ lCheckAllOffSafety(ASTNode *node, void *data) { if (dynamic_cast(node) != NULL || dynamic_cast(node) != NULL || - dynamic_cast(node) != NULL) { + dynamic_cast(node) != NULL || + dynamic_cast(node) != NULL) { // The various foreach statements also shouldn't be run with an // all-off mask. Since they can re-establish an 'all on' mask, // this would be pretty unintuitive. (More generally, it's diff --git a/docs/ispc.rst b/docs/ispc.rst index 58c58d6a..7487da3a 100644 --- a/docs/ispc.rst +++ b/docs/ispc.rst @@ -119,6 +119,7 @@ Contents: + `Function Overloading`_ + * `Re-establishing The Execution Mask`_ * `Task Parallel Execution`_ + `Task Parallelism: "launch" and "sync" Statements`_ @@ -2712,6 +2713,15 @@ Any function that can be launched with the ``launch`` construct in ``ispc`` must have a ``task`` qualifier; see `Task Parallelism: "launch" and "sync" Statements`_ for more discussion of launching tasks in ``ispc``. 
+A function can also be given the ``unmasked`` qualifier; this qualifier +indicates that all program instances should be made active at the start of +the function execution (or, equivalently, that the current execution mask +shouldn't be passed to the function from the function call site.) If it is +known that a function will always be called when all program instances are +executing, adding this qualifier can slightly improve performance. See the +Section `Re-establishing The Execution Mask`_ for more discussion of +``unmasked`` program code. + Functions that are intended to be called from C/C++ application code must have the ``export`` qualifier. This causes them to have regular C linkage and to have their declarations included in header files, if the ``ispc`` @@ -2754,6 +2764,95 @@ of a given type are found, an error is issued. variability from ``uniform`` to ``varying`` as needed. +Re-establishing The Execution Mask +---------------------------------- + +As discussed in `Functions and Function Calls`_, a function that is +declared with an ``unmasked`` qualifier starts execution with all program +instances running, regardless of the execution mask at the site of the +function call. A block of statements can also be enclosed with +``unmasked`` to have the same effect within a function: + +:: + + int a = ..., b = ...; + if (a < b) { + // only program instances where a < b are executing here + unmasked { + // now all program instances are executing + } + // and again only the a < b instances + } + +``unmasked`` can be useful in cases where the programmer wants to "change +the axis of parallelism" or use nested parallelism, as shown in the +following code: + +:: + + uniform WorkItem items[...] = ...; + foreach (itemNum = 0 ... numItems) { + // do computation on items[itemNum] to determine if it needs + // further processing... 
+ if (/* itemNum needs processing */) { + foreach_active (i) { + unmasked { + uniform int uItemNum = extract(itemNum, i); + // apply entire gang of program instances to uItemNum + } + } + } + } + +The general idea is that we are first using SPMD parallelism to determine +which of the items requires further processing, checking a gang's worth of +them concurrently inside the ``foreach`` loop. Assuming that only a subset +of them need further processing, it would be wasteful to do this work within +the ``foreach`` loop in the same program instance that made the initial +determination of whether more work was needed; in this case, all of the +program instances corresponding to items that didn't need further +processing would be inactive, with corresponding unused computational +capability in the system. + +In the above code, this issue is avoided by working on each of the items +requiring more processing in turn with ``foreach_active`` and then using +``unmasked`` to re-establish execution of all of the program instances. +The entire gang can in turn be applied to the computation to be done for +each ``items[itemNum]``. + +The ``unmasked`` statement should be used with care; it can lead to a +number of surprising cases of undefined program behavior. For example, +consider the following code: + +:: + + void func(float); + float a = ...; + float b; + if (a < 0) { + b = 0; + unmasked { + if (b == 0) + func(a); + } + } + +The variable ``a`` is initialized to some value and ``b`` is declared but +not initialized, and thus has an undefined value. Within the ``if`` test, +we have assigned zero to ``b``, though only for the program instances +currently executing--i.e. those where ``a < 0``. 
After re-establishing the +execution mask with ``unmasked``, we then compare ``b`` to zero--this +comparison is well-defined (and "true") for the program instances where ``a +< 0``, but it is undefined for any program instances where that isn't the +case, since the value of ``b`` is undefined for those program instances. +Similar surprising cases can arise when writing to ``varying`` variables +within ``unmasked`` code. + +As a general rule, code within an ``unmasked`` block, or a function with +the ``unmasked`` qualifier, should use great care when accessing ``varying`` +variables that were declared in an outer scope. + + Task Parallel Execution ----------------------- @@ -2789,17 +2888,46 @@ Any function that is launched as a task must be declared with the Tasks must return ``void``; a compile time error is issued if a non-``void`` task is defined. -Given a task definitions, there are two ways to write code that launches -tasks, using the ``launch`` construct. First, one task can be launched at -a time, with parameters passed to the task to help it determine what part -of the overall computation it's responsible for: +Given a task declaration, a task can be launched with ``launch``: + +:: + + uniform float a[...] = ...; + launch func(a, 1); + +Program execution continues asynchronously after a ``launch`` statement in +a function; thus, a function shouldn't access values written by a task it +has launched within the function without synchronization. A function can +use a ``sync`` statement to wait for all launched tasks to finish: + +:: + + launch func(a, 1); + sync; + // now safe to use computed values in a[]... + +Alternatively, any function that launches tasks has an automatically-added +implicit ``sync`` statement before it returns, so that functions that call +a function that launches tasks don't have to worry about outstanding +asynchronous computation from that function. + +The task generated by a ``launch`` statement is a single gang's worth of +work. 
The same program instances are respectively active and inactive at +the start of the task as were active and inactive when their ``launch`` +statement executed. To make all program instances in the launched gang be +active, the ``unmasked`` construct can be used (see `Re-establishing The +Execution Mask`_.) + +There are two ways to write code that launches a group of multiple tasks. +First, one task can be launched at a time, with parameters passed to the +task to help it determine what part of the overall computation it's +responsible for: :: for (uniform int i = 0; i < 100; ++i) launch func(a, i); -Note the ``launch`` keyword before the function call expression. This code launches 100 tasks, each of which presumably does some computation that is keyed off of given the value ``i``. In general, one should launch many more tasks than there are processors in the system to @@ -2830,23 +2958,6 @@ implementation of ``func2`` to determine which array element to process. a[taskIndex] = ... } -Program execution continues asynchronously after a ``launch`` statement in -a function; thus, a function shouldn't access values being generated by the -tasks it has launched within the function without synchronization. If -results are needed before function return, a function can use a ``sync`` -statement to wait for all launched tasks to finish: - -:: - - launch[100] func2(a); - sync; - // now safe to use computed values in a[]... - -Alternatively, any function that launches tasks has an automatically-added -``sync`` statement before it returns, so that functions that call a -function that launches tasks don't have to worry about outstanding -asynchronous computation from that function. - Inside functions with the ``task`` qualifier, two additional built-in variables are provided in addition to ``taskIndex`` and ``taskCount``: ``threadIndex`` and ``threadCount``. 
``threadCount`` gives the total diff --git a/examples/mandelbrot/mandelbrot.ispc b/examples/mandelbrot/mandelbrot.ispc index 8b471139..e6bebca8 100644 --- a/examples/mandelbrot/mandelbrot.ispc +++ b/examples/mandelbrot/mandelbrot.ispc @@ -1,5 +1,5 @@ /* - Copyright (c) 2010-2011, Intel Corporation + Copyright (c) 2010-2012, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without @@ -40,8 +40,10 @@ static inline int mandel(float c_re, float c_im, int count) { float new_re = z_re*z_re - z_im*z_im; float new_im = 2.f * z_re * z_im; - z_re = c_re + new_re; - z_im = c_im + new_im; + unmasked { + z_re = c_re + new_re; + z_im = c_im + new_im; + } } return i; diff --git a/examples/mandelbrot_tasks/mandelbrot.ispc b/examples/mandelbrot_tasks/mandelbrot.ispc index ef99dd72..84d4ccd4 100644 --- a/examples/mandelbrot_tasks/mandelbrot.ispc +++ b/examples/mandelbrot_tasks/mandelbrot.ispc @@ -1,5 +1,5 @@ /* - Copyright (c) 2010-2011, Intel Corporation + Copyright (c) 2010-2012, Intel Corporation All rights reserved. 
Redistribution and use in source and binary forms, with or without @@ -41,8 +41,10 @@ mandel(float c_re, float c_im, int count) { float new_re = z_re*z_re - z_im*z_im; float new_im = 2.f * z_re * z_im; - z_re = c_re + new_re; - z_im = c_im + new_im; + unmasked { + z_re = c_re + new_re; + z_im = c_im + new_im; + } } return i; diff --git a/parse.yy b/parse.yy index 7a9026e3..e068efe9 100644 --- a/parse.yy +++ b/parse.yy @@ -214,7 +214,7 @@ struct ForeachDimension { %type statement labeled_statement compound_statement for_init_statement %type expression_statement selection_statement iteration_statement %type jump_statement statement_list declaration_statement print_statement -%type assert_statement sync_statement delete_statement +%type assert_statement sync_statement delete_statement unmasked_statement %type declaration parameter_declaration %type init_declarator_list @@ -1570,6 +1570,7 @@ statement | assert_statement | sync_statement | delete_statement + | unmasked_statement | error ';' { lSuggestBuiltinAlternates(); @@ -1898,6 +1899,13 @@ delete_statement } ; +unmasked_statement + : TOKEN_UNMASKED '{' statement_list '}' + { + $$ = new UnmaskedStmt($3, @1); + } + ; + print_statement : TOKEN_PRINT '(' string_constant ')' ';' { diff --git a/stmt.cpp b/stmt.cpp index 533cbac8..ade7e1ed 100644 --- a/stmt.cpp +++ b/stmt.cpp @@ -2624,6 +2624,60 @@ SwitchStmt::EstimateCost() const { } +/////////////////////////////////////////////////////////////////////////// +// UnmaskedStmt + +UnmaskedStmt::UnmaskedStmt(Stmt *s, SourcePos pos) + : Stmt(pos) { + stmts = s; +} + + +void +UnmaskedStmt::EmitCode(FunctionEmitContext *ctx) const { + if (!ctx->GetCurrentBasicBlock() || !stmts) + return; + + llvm::Value *oldInternalMask = ctx->GetInternalMask(); + llvm::Value *oldFunctionMask = ctx->GetFunctionMask(); + + ctx->SetInternalMask(LLVMMaskAllOn); + ctx->SetFunctionMask(LLVMMaskAllOn); + + stmts->EmitCode(ctx); + + ctx->SetInternalMask(oldInternalMask); + 
ctx->SetFunctionMask(oldFunctionMask); +} + + +void +UnmaskedStmt::Print(int indent) const { + printf("%*cUnmasked Stmt", indent, ' '); + pos.Print(); + printf("\n"); + + printf("%*cStmts:\n", indent+4, ' '); + if (stmts != NULL) + stmts->Print(indent+8); + else + printf("NULL"); + printf("\n"); +} + + +Stmt * +UnmaskedStmt::TypeCheck() { + return this; +} + + +int +UnmaskedStmt::EstimateCost() const { + return COST_ASSIGN; +} + + /////////////////////////////////////////////////////////////////////////// // ReturnStmt diff --git a/stmt.h b/stmt.h index ee6bd2f0..0c8d784e 100644 --- a/stmt.h +++ b/stmt.h @@ -297,6 +297,23 @@ public: }; +/** @brief Statement implementation for an 'unmasked' block: the enclosed statements are emitted with both the internal mask and the function mask set to all-on, and the previous values of both masks are restored afterwards. + */ +class UnmaskedStmt : public Stmt { +public: + UnmaskedStmt(Stmt *stmt, SourcePos pos); + + void EmitCode(FunctionEmitContext *ctx) const; + void Print(int indent) const; + + Stmt *TypeCheck(); + int EstimateCost() const; + + Stmt *stmts; +}; + + + /** @brief Statement implementation for a 'return' or 'coherent' return statement in the program. */ class ReturnStmt : public Stmt { diff --git a/tests/unmasked-1.ispc b/tests/unmasked-1.ispc new file mode 100644 index 00000000..1433b5b7 --- /dev/null +++ b/tests/unmasked-1.ispc @@ -0,0 +1,17 @@ + +export uniform int width() { return programCount; } + + +export void f_f(uniform float RET[], uniform float aFOO[]) { + float a = aFOO[programIndex]; + if (a == 2) { + unmasked { + a = 0; + } + } + RET[programIndex] = a; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; +} diff --git a/tests/unmasked-2.ispc b/tests/unmasked-2.ispc new file mode 100644 index 00000000..3d70204d --- /dev/null +++ b/tests/unmasked-2.ispc @@ -0,0 +1,18 @@ + +export uniform int width() { return programCount; } + + +export void f_f(uniform float RET[], uniform float aFOO[]) { + float a = aFOO[programIndex]; + int count = 0; + for (int i = 0; i < a; ++i) { + unmasked { + ++count; + } + } + RET[programIndex] = count; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 
programCount; +} diff --git a/tests/unmasked-3.ispc b/tests/unmasked-3.ispc new file mode 100644 index 00000000..61ce1653 --- /dev/null +++ b/tests/unmasked-3.ispc @@ -0,0 +1,19 @@ + +export uniform int width() { return programCount; } + + +export void f_f(uniform float RET[], uniform float aFOO[]) { + float a = aFOO[programIndex]; + int count = 0; + for (int i = 0; i < a; ++i) { + ccontinue; + unmasked { + ++count; + } + } + RET[programIndex] = count; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; +}