Add unmasked { } statement.

This reestablishes an "all on" execution mask for the gang, which can be useful for nested parallelism.
2012-06-22 14:30:58 -07:00
parent b4a078e2f6
commit 54459255d4
10 changed files with 282 additions and 30 deletions
--- a/ast.cpp
+++ b/ast.cpp
@@ -103,6 +103,7 @@ WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc,
        PrintStmt *ps;
        AssertStmt *as;
        DeleteStmt *dels;
+        UnmaskedStmt *ums;

        if ((es = dynamic_cast<ExprStmt *>(node)) != NULL)
            es->expr = (Expr *)WalkAST(es->expr, preFunc, postFunc, data);
@@ -174,6 +175,8 @@ WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc,
            as->expr = (Expr *)WalkAST(as->expr, preFunc, postFunc, data);
        else if ((dels = dynamic_cast<DeleteStmt *>(node)) != NULL)
            dels->expr = (Expr *)WalkAST(dels->expr, preFunc, postFunc, data);
+        else if ((ums = dynamic_cast<UnmaskedStmt *>(node)) != NULL)
+            ums->stmts = (Stmt *)WalkAST(ums->stmts, preFunc, postFunc, data);
        else
            FATAL("Unhandled statement type in WalkAST()");
    }
@@ -396,7 +399,8 @@ lCheckAllOffSafety(ASTNode *node, void *data) {

    if (dynamic_cast<ForeachStmt *>(node) != NULL ||
        dynamic_cast<ForeachActiveStmt *>(node) != NULL ||
-        dynamic_cast<ForeachUniqueStmt *>(node) != NULL) {
+        dynamic_cast<ForeachUniqueStmt *>(node) != NULL ||
+        dynamic_cast<UnmaskedStmt *>(node) != NULL) {
        // The various foreach statements also shouldn't be run with an
        // all-off mask.  Since they can re-establish an 'all on' mask,
        // this would be pretty unintuitive.  (More generally, it's
--- a/docs/ispc.rst
+++ b/docs/ispc.rst
@@ -119,6 +119,7 @@ Contents:

      + `Function Overloading`_

+    * `Re-establishing The Execution Mask`_
    * `Task Parallel Execution`_

      + `Task Parallelism: "launch" and "sync" Statements`_
@@ -2712,6 +2713,15 @@ Any function that can be launched with the ``launch`` construct in ``ispc``
 must have a ``task`` qualifier; see `Task Parallelism: "launch" and "sync"
 Statements`_ for more discussion of launching tasks in ``ispc``.

+A function can also be given the ``unmasked`` qualifier; this qualifier
+indicates that all program instances should be made active at the start of
+the function execution (or, equivalently, that the current execution mask
+shouldn't be passed to the function from the function call site.)  If it is
+known that a function will always be called when all program instances are
+executing, adding this qualifier can slightly improve performance.  See the
+Section `Re-establishing The Execution Mask`_ for more discussion of
+``unmasked`` program code.
+
 Functions that are intended to be called from C/C++ application code must
 have the ``export`` qualifier.  This causes them to have regular C linkage
 and to have their declarations included in header files, if the ``ispc``
@@ -2754,6 +2764,95 @@ of a given type are found, an error is issued.
  variability from ``uniform`` to ``varying`` as needed.


+Re-establishing The Execution Mask
+----------------------------------
+
+As discussed in `Functions and Function Calls`_, a function that is
+declared with an ``unmasked`` qualifier starts execution with all program
+instances running, regardless of the execution mask at the site of the
+function call.  A block of statements can also be enclosed with
+``unmasked`` to have the same effect within a function:
+
+::
+
+    int a = ..., b = ...;
+    if (a < b) {
+        // only program instances where a < b are executing here
+        unmasked {
+            // now all program instances are executing
+        }
+        // and again only the a < b instances
+    }
+
+``unmasked`` can be useful in cases where the programmer wants to "change
+the axis of parallelism" or use nested parallelism, as shown in the
+following code:
+
+::
+
+    uniform WorkItem items[...] = ...;
+    foreach (itemNum = 0 ... numItems) {
+        // do computation on items[itemNum] to determine if it needs
+        // further processing...
+        if (/* itemNum needs processing */) {
+            foreach_active (i) {
+                unmasked {
+                    uniform int uItemNum = extract(itemNum, i);
+                    // apply entire gang of program instances to uItemNum
+                }
+            }
+        }
+    }
+
+The general idea is that we are first using SPMD parallelism to determine
+which of the items requires further processing, checking a gang's worth of
+them concurrently inside the ``foreach`` loop.  Assuming that only a subset
+of them need further processing, it would be wasteful to do this work within
+the ``foreach`` loop in the same program instance that made the initial
+determination of whether more work was needed; in this case, all of the
+program instances corresponding to items that didn't need further
+processing would be inactive, with corresponding unused computational
+capability in the system.
+
+In the above code, this issue is avoided by working on each of the items
+requiring more processing in turn with ``foreach_active`` and then using
+``unmasked`` to re-establish execution of all of the program instances.
+The entire gang can in turn be applied to the computation to be done for
+each ``items[itemNum]``.
+
+The ``unmasked`` statement should be used with care; it can lead to a
+number of surprising cases of undefined program behavior.  For example,
+consider the following code:
+
+::
+
+    void func(float);
+    float a = ...;
+    float b;
+    if (a < 0) {
+        b = 0;
+        unmasked {
+            if (b == 0)
+                func(a);
+        }
+    }
+
+The variable ``a`` is initialized to some value and ``b`` is declared but
+not initialized, and thus has an undefined value.  Within the ``if`` test,
+we have assigned zero to ``b``, though only for the program instances
+currently executing--i.e. those where ``a < 0``.  After re-establishing the
+execution mask with ``unmasked``, we then compare ``b`` to zero--this
+comparison is well-defined (and "true") for the program instances where ``a
+< 0``, but it is undefined for any program instances where that isn't the
+case, since the value of ``b`` is undefined for those program instances.
+Similar surprising cases can arise when writing to ``varying`` variables
+within ``unmasked`` code.
+
+As a general rule, code within an ``unmasked`` block, or a function with
+the ``unmasked`` qualifier, should use great care when accessing ``varying``
+variables that were declared in an outer scope.
+
+
 Task Parallel Execution
 -----------------------

@@ -2789,17 +2888,46 @@ Any function that is launched as a task must be declared with the
 Tasks must return ``void``; a compile time error is issued if a
 non-``void`` task is defined.

-Given a task definitions, there are two ways to write code that launches
-tasks, using the ``launch`` construct.  First, one task can be launched at
-a time, with parameters passed to the task to help it determine what part
-of the overall computation it's responsible for:
+Given a task declaration, a task can be launched with ``launch``:
+
+::
+
+    uniform float a[...] = ...;
+    launch func(a, 1);
+
+Program execution continues asynchronously after a ``launch`` statement in
+a function; thus, a function shouldn't access values written by a task it
+has launched within the function without synchronization.  A function can
+use a ``sync`` statement to wait for all launched tasks to finish:
+
+::
+
+    launch func(a, 1);
+    sync;
+    // now safe to use computed values in a[]...
+
+Alternatively, any function that launches tasks has an automatically-added
+implicit ``sync`` statement before it returns, so that functions that call
+a function that launches tasks don't have to worry about outstanding
+asynchronous computation from that function.
+
+The task generated by a ``launch`` statement is a single gang's worth of
+work.  The same program instances are respectively active and inactive at
+the start of the task as were active and inactive when their ``launch``
+statement executed.  To make all program instances in the launched gang be
+active, the ``unmasked`` construct can be used (see `Re-establishing The
+Execution Mask`_.)
+
+There are two ways to write code that launches a group of multiple tasks.
+First, one task can be launched at a time, with parameters passed to the
+task to help it determine what part of the overall computation it's
+responsible for:

 ::

    for (uniform int i = 0; i < 100; ++i)
        launch func(a, i);

-Note the ``launch`` keyword before the function call expression.
 This code launches 100 tasks, each of which presumably does some
 computation that is keyed off of given the value ``i``.  In general, one
 should launch many more tasks than there are processors in the system to
@@ -2830,23 +2958,6 @@ implementation of ``func2`` to determine which array element to process.
        a[taskIndex] = ...
    }

-Program execution continues asynchronously after a ``launch`` statement in
-a function; thus, a function shouldn't access values being generated by the
-tasks it has launched within the function without synchronization.  If
-results are needed before function return, a function can use a ``sync``
-statement to wait for all launched tasks to finish:
-
-::
-
-    launch[100] func2(a);
-    sync;
-    // now safe to use computed values in a[]...
-
-Alternatively, any function that launches tasks has an automatically-added
-``sync`` statement before it returns, so that functions that call a
-function that launches tasks don't have to worry about outstanding
-asynchronous computation from that function.
-
 Inside functions with the ``task`` qualifier, two additional built-in
 variables are provided in addition to ``taskIndex`` and ``taskCount``:
 ``threadIndex`` and ``threadCount``.  ``threadCount`` gives the total
--- a/examples/mandelbrot/mandelbrot.ispc
+++ b/examples/mandelbrot/mandelbrot.ispc
@@ -1,5 +1,5 @@
 /*
-  Copyright (c) 2010-2011, Intel Corporation
+  Copyright (c) 2010-2012, Intel Corporation
  All rights reserved.

  Redistribution and use in source and binary forms, with or without
@@ -40,8 +40,10 @@ static inline int mandel(float c_re, float c_im, int count) {

        float new_re = z_re*z_re - z_im*z_im;
        float new_im = 2.f * z_re * z_im;
-        z_re = c_re + new_re;
-        z_im = c_im + new_im;
+        unmasked {
+            z_re = c_re + new_re;
+            z_im = c_im + new_im;
+        }
    }

    return i;
--- a/examples/mandelbrot_tasks/mandelbrot.ispc
+++ b/examples/mandelbrot_tasks/mandelbrot.ispc
@@ -1,5 +1,5 @@
 /*
-  Copyright (c) 2010-2011, Intel Corporation
+  Copyright (c) 2010-2012, Intel Corporation
  All rights reserved.

  Redistribution and use in source and binary forms, with or without
@@ -41,8 +41,10 @@ mandel(float c_re, float c_im, int count) {

        float new_re = z_re*z_re - z_im*z_im;
        float new_im = 2.f * z_re * z_im;
-        z_re = c_re + new_re;
-        z_im = c_im + new_im;
+        unmasked {
+            z_re = c_re + new_re;
+            z_im = c_im + new_im;
+        }
    }

    return i;
--- a/parse.yy
+++ b/parse.yy
@@ -214,7 +214,7 @@ struct ForeachDimension {
 %type <stmt> statement labeled_statement compound_statement for_init_statement
 %type <stmt> expression_statement selection_statement iteration_statement
 %type <stmt> jump_statement statement_list declaration_statement print_statement
-%type <stmt> assert_statement sync_statement delete_statement
+%type <stmt> assert_statement sync_statement delete_statement unmasked_statement

 %type <declaration> declaration parameter_declaration
 %type <declarators> init_declarator_list 
@@ -1570,6 +1570,7 @@ statement
    | assert_statement
    | sync_statement
    | delete_statement
+    | unmasked_statement
    | error ';'
    {
        lSuggestBuiltinAlternates();
@@ -1898,6 +1899,13 @@ delete_statement
    }
    ;

+unmasked_statement
+    : TOKEN_UNMASKED '{' statement_list '}'
+    {
+        $$ = new UnmaskedStmt($3, @1);
+    }
+    ;
+
 print_statement
    : TOKEN_PRINT '(' string_constant ')' ';'
      {
--- a/stmt.cpp
+++ b/stmt.cpp
@@ -2624,6 +2624,60 @@ SwitchStmt::EstimateCost() const {
 }


+///////////////////////////////////////////////////////////////////////////
+// UnmaskedStmt
+
+UnmaskedStmt::UnmaskedStmt(Stmt *s, SourcePos pos)
+    : Stmt(pos) {
+    stmts = s;  // body of the unmasked block; EmitCode()/Print() tolerate NULL
+}
+
+
+void
+UnmaskedStmt::EmitCode(FunctionEmitContext *ctx) const {
+    if (!ctx->GetCurrentBasicBlock() || !stmts)
+        return;
+    // Run the body with both masks "all on"; the previous masks are saved here.
+    llvm::Value *oldInternalMask = ctx->GetInternalMask();
+    llvm::Value *oldFunctionMask = ctx->GetFunctionMask();
+    ctx->SetInternalMask(LLVMMaskAllOn);
+    ctx->SetFunctionMask(LLVMMaskAllOn);
+    stmts->EmitCode(ctx);
+    // Restore only if a current block remains (the body may have ended it).
+    if (ctx->GetCurrentBasicBlock() != NULL) {
+        ctx->SetInternalMask(oldInternalMask);
+        ctx->SetFunctionMask(oldFunctionMask);
+    }
+}
+
+
+void
+UnmaskedStmt::Print(int indent) const {
+    printf("%*cUnmasked Stmt", indent, ' ');  // indentation, then the statement name
+    pos.Print();
+    printf("\n");
+
+    printf("%*cStmts:\n", indent+4, ' ');
+    if (stmts != NULL)
+        stmts->Print(indent+8);  // body is printed deeper than the "Stmts:" label
+    else
+        printf("NULL");  // NOTE: printed without the indentation used above
+    printf("\n");
+}
+
+
+Stmt *
+UnmaskedStmt::TypeCheck() {
+    return this;  // always returns the statement itself unchanged
+}
+
+
+int
+UnmaskedStmt::EstimateCost() const {
+    return COST_ASSIGN;  // constant; the enclosed statements are not examined here
+}
+
+
 ///////////////////////////////////////////////////////////////////////////
 // ReturnStmt

--- a/stmt.h
+++ b/stmt.h
@@ -297,6 +297,23 @@ public:
 };


+/** @brief Statement implementation for an 'unmasked { }' block, whose
+    body is emitted with the execution masks set to "all on". */
+class UnmaskedStmt : public Stmt {
+public:
+    UnmaskedStmt(Stmt *stmt, SourcePos pos);
+
+    void EmitCode(FunctionEmitContext *ctx) const;
+    void Print(int indent) const;
+
+    Stmt *TypeCheck();
+    int EstimateCost() const;
+
+    Stmt *stmts;  // statements inside the block; NULL is handled by EmitCode()/Print()
+};
+
+
+
 /** @brief Statement implementation for a 'return' or 'coherent' return
    statement in the program. */
 class ReturnStmt : public Stmt {
--- a/tests/unmasked-1.ispc
+++ b/tests/unmasked-1.ispc
@@ -0,0 +1,17 @@
+
+export uniform int width() { return programCount; }
+
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    float a = aFOO[programIndex]; 
+    if (a == 2) {
+        unmasked {
+            a = 0;
+        }
+    }
+    RET[programIndex] = a; 
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 0;
+}
--- a/tests/unmasked-2.ispc
+++ b/tests/unmasked-2.ispc
@@ -0,0 +1,18 @@
+
+export uniform int width() { return programCount; }
+
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    float a = aFOO[programIndex]; 
+    int count = 0;
+    for (int i = 0; i < a; ++i) {
+        unmasked {
+            ++count;
+        }
+    }
+    RET[programIndex] = count; 
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = programCount;
+}
--- a/tests/unmasked-3.ispc
+++ b/tests/unmasked-3.ispc
@@ -0,0 +1,19 @@
+
+export uniform int width() { return programCount; }
+
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    float a = aFOO[programIndex]; 
+    int count = 0;
+    for (int i = 0; i < a; ++i) {
+        ccontinue;
+        unmasked {
+            ++count;
+        }
+    }
+    RET[programIndex] = count; 
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 0;
+}