From f90aa172a65a05aac3ea1f1a5a624d5c80108c8d Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Thu, 1 Dec 2011 09:42:56 -0800 Subject: [PATCH] Documentation work; first pass perf guide complete --- docs/faq.txt | 124 ++++++++--- docs/ispc.txt | 157 +++++++------- docs/perf.txt | 587 +++++++++++++++++++++++++++++++++++++------------- 3 files changed, 611 insertions(+), 257 deletions(-) diff --git a/docs/faq.txt b/docs/faq.txt index 9e66d16f..3f184cf1 100644 --- a/docs/faq.txt +++ b/docs/faq.txt @@ -23,7 +23,7 @@ distribution. * Programming Techniques + `What primitives are there for communicating between SPMD program instances?`_ - + `How can a gang of program instances generate variable output efficiently?`_ + + `How can a gang of program instances generate variable amounts of output efficiently?`_ + `Is it possible to use ispc for explicit vector programming?`_ @@ -48,8 +48,7 @@ If the SSE4 target is used, then the following assembly is printed: :: - _foo: ## @foo - ## BB#0: ## %allocas + _foo: addl %esi, %edi movl %edi, %eax ret @@ -98,7 +97,7 @@ output array. } Here is the assembly code for the application-callable instance of the -function--note that the selected instructions are ideal. +function. :: @@ -111,21 +110,7 @@ function--note that the selected instructions are ideal. And here is the assembly code for the ``ispc``-callable instance of the -function. There are a few things to notice in this code. - -The current program mask is coming in via the %xmm0 register and the -initial few instructions in the function essentially check to see if the -mask is all-on or all-off. If the mask is all on, the code at the label -LBB0_3 executes; it's the same as the code that was generated for ``_foo`` -above. If the mask is all off, then there's nothing to be done, and the -function can return immediately. 
- -In the case of a mixed mask, a substantial amount of code is generated to -load from and then store to only the array elements that correspond to -program instances where the mask is on. (This code is elided below). This -general pattern of having two-code paths for the "all on" and "mixed" mask -cases is used in the code generated for almost all but the most simple -functions (where the overhead of the test isn't worthwhile.) +function. :: @@ -148,11 +133,84 @@ functions (where the overhead of the test isn't worthwhile.) #### ret +There are a few things to notice in this code. First, the current program +mask is coming in via the ``%xmm0`` register and the initial few +instructions in the function essentially check to see if the mask is all on +or all off. If the mask is all on, the code at the label LBB0_3 executes; +it's the same as the code that was generated for ``_foo`` above. If the +mask is all off, then there's nothing to be done, and the function can +return immediately. + +In the case of a mixed mask, a substantial amount of code is generated to +load from and then store to only the array elements that correspond to +program instances where the mask is on. (This code is elided below). This +general pattern of having two-code paths for the "all on" and "mixed" mask +cases is used in the code generated for almost all but the most simple +functions (where the overhead of the test isn't worthwhile.) How can I more easily see gathers and scatters in generated assembly? --------------------------------------------------------------------- -FIXME +Because CPU vector ISAs don't have native gather and scatter instructions, +these memory operations are turned into sequences of a series of +instructions in the code that ``ispc`` generates. In some cases, it can be +useful to see where gathers and scatters actually happen in code; there is +an otherwise undocumented command-line flag that provides this information. 
+ +Consider this simple program: + +:: + + void set(uniform int a[], int value, int index) { + a[index] = value; + } + +When compiled normally to the SSE4 target, this program generates this +extensive code sequence, which makes it more difficult to see what the +program is actually doing. + +:: + + "_set___uptrii": + pmulld LCPI0_0(%rip), %xmm1 + movmskps %xmm2, %eax + testb $1, %al + je LBB0_2 + movd %xmm1, %ecx + movd %xmm0, (%rcx,%rdi) + LBB0_2: + testb $2, %al + je LBB0_4 + pextrd $1, %xmm1, %ecx + pextrd $1, %xmm0, (%rcx,%rdi) + LBB0_4: + testb $4, %al + je LBB0_6 + pextrd $2, %xmm1, %ecx + pextrd $2, %xmm0, (%rcx,%rdi) + LBB0_6: + testb $8, %al + je LBB0_8 + pextrd $3, %xmm1, %eax + pextrd $3, %xmm0, (%rax,%rdi) + LBB0_8: + ret + +If this program is compiled with the +``--opt=disable-handle-pseudo-memory-ops`` command-line flag, then the +scatter is left as an unresolved function call. The resulting program +won't link without unresolved symbols, but the assembly output is much +easier to understand: + +:: + + "_set___uptrii": + movaps %xmm0, %xmm3 + pmulld LCPI0_0(%rip), %xmm1 + movdqa %xmm1, %xmm0 + movaps %xmm3, %xmm1 + jmp ___pseudo_scatter_base_offsets32_32 ## TAILCALL + Interoperability ================ @@ -301,13 +359,17 @@ need to synchronize the program instances before communicating between them, due to the synchronized execution model of gangs of program instances in ``ispc``. -How can a gang of program instances generate variable output efficiently? -------------------------------------------------------------------------- +How can a gang of program instances generate variable amounts of output efficiently? +------------------------------------------------------------------------------------ -A useful application of the ``exclusive_scan_add()`` function in the -standard library is when program instances want to generate a variable -amount of output and when one would like that output to be densely packed -in a single array. 
For example, consider the code fragment below: +It's not unusual to have a gang of program instances where each program +instance generates a variable amount of output (perhaps some generate no +output, some generate one output value, some generate many output values +and so forth), and where one would like to have the output densely packed +in an output array. The ``exclusive_scan_add()`` function from the +standard library is quite useful in this situation. + +Consider the following function: :: @@ -331,11 +393,11 @@ value, the second two values, and the third and fourth three values each. In this case, ``exclusive_scan_add()`` will return the values (0, 1, 3, 6) to the four program instances, respectively. -The first program instance will write its one result to ``outArray[0]``, -the second will write its two values to ``outArray[1]`` and -``outArray[2]``, and so forth. The ``reduce_add`` call at the end returns -the total number of values that all of the program instances have written -to the array. +The first program instance will then write its one result to +``outArray[0]``, the second will write its two values to ``outArray[1]`` +and ``outArray[2]``, and so forth. The ``reduce_add()`` call at the end +returns the total number of values that all of the program instances have +written to the array. 
FIXME: add discussion of foreach_active as an option here once that's in diff --git a/docs/ispc.txt b/docs/ispc.txt index 848c8dd7..7dbe1399 100644 --- a/docs/ispc.txt +++ b/docs/ispc.txt @@ -48,6 +48,8 @@ Contents: * `Recent Changes to ISPC`_ + + `Updating ISPC Programs For Changes In ISPC 1.1`_ + * `Getting Started with ISPC`_ + `Installing ISPC`_ @@ -62,18 +64,27 @@ Contents: * `The ISPC Language`_ + + `Relationship To The C Programming Language`_ + `Lexical Structure`_ - + `Basic Types and Type Qualifiers`_ - + `Function Pointer Types`_ - + `Enumeration Types`_ - + `Short Vector Types`_ - + `Struct and Array Types`_ + + `Types`_ + + * `Basic Types and Type Qualifiers`_ + * `Function Pointer Types`_ + * `Enumeration Types`_ + * `Short Vector Types`_ + * `Struct and Array Types`_ + + `Declarations and Initializers`_ - + `Function Declarations`_ + `Expressions`_ + `Control Flow`_ - + `Functions`_ - + `C Constructs not in ISPC`_ + + * `Conditional Statements: "if"`_ + * `Basic Iteration Statements: "for", "while", and "do"`_ + * `Parallel Iteration Statements: "foreach" and "foreach_tiled"`_ + * `Functions and Function Calls`_ + + + `Function Declarations`_ + + `Function Overloading`_ * `Parallel Execution Model in ISPC`_ @@ -110,6 +121,8 @@ Contents: + `Restructuring Existing Programs to Use ISPC`_ + `Understanding How to Interoperate With the Application's Data`_ +* `Related Languages`_ + * `Disclaimer and Legal Information`_ * `Optimization Notice`_ @@ -120,6 +133,9 @@ Recent Changes to ISPC See the file ``ReleaseNotes.txt`` in the ``ispc`` distribution for a list of recent changes to the compiler. 
+Updating ISPC Programs For Changes In ISPC 1.1 +---------------------------------------------- + Getting Started with ISPC ========================= @@ -407,6 +423,9 @@ parallel execution model (versus C's serial model), C code is not directly portable to ``ispc``, although starting with working C code and porting it to ``ispc`` can be an efficient way to write ``ispc`` programs. +Relationship To The C Programming Language +------------------------------------------ + Lexical Structure ----------------- @@ -541,6 +560,9 @@ A number of tokens are used for grouping in ``ispc``: - Compound statements +Types +----- + Basic Types and Type Qualifiers ------------------------------- @@ -906,31 +928,6 @@ Structures can also be initialized only with element values in braces: Color d = { 0.5, .75, 1.0 }; // r = 0.5, ... -Function Declarations ---------------------- - -Functions can be declared with a number of qualifiers that affect their -visibility and capabilities. As in C/C++, functions have global visibility -by default. If a function is declared with a ``static`` qualifier, then it -is only visible in the file in which it was declared. - -Any function that can be launched with the ``launch`` construct in ``ispc`` -must have a ``task`` qualifier; see `Task Parallelism: Language Syntax`_ -for more discussion of launching tasks in ``ispc``. - -Functions that are intended to be called from C/C++ application code must -have the ``export`` qualifier. This causes them to have regular C linkage -and to have their declarations included in header files, if the ``ispc`` -compiler is directed to generated a C/C++ header file for the file it -compiled. - -Finally, any function defined with an ``inline`` qualifier will always be -inlined by ``ispc``; ``inline`` is not a hint, but forces inlining. The -compiler will opportunistically inline short functions depending on their -complexity, but any function that should always be inlined should have the -``inline`` qualifier. 
- - Expressions ----------- @@ -959,6 +956,46 @@ Structure member access and array indexing also work as in C. Control Flow ------------ +Conditional Statements: "if" +---------------------------- + +Basic Iteration Statements: "for", "while", and "do" +---------------------------------------------------- + +Parallel Iteration Statements: "foreach" and "foreach_tiled" +------------------------------------------------------------ + +Functions and Function Calls +---------------------------- + +Function Declarations +--------------------- + +Functions can be declared with a number of qualifiers that affect their +visibility and capabilities. As in C/C++, functions have global visibility +by default. If a function is declared with a ``static`` qualifier, then it +is only visible in the file in which it was declared. + +Any function that can be launched with the ``launch`` construct in ``ispc`` +must have a ``task`` qualifier; see `Task Parallelism: Language Syntax`_ +for more discussion of launching tasks in ``ispc``. + +Functions that are intended to be called from C/C++ application code must +have the ``export`` qualifier. This causes them to have regular C linkage +and to have their declarations included in header files, if the ``ispc`` +compiler is directed to generated a C/C++ header file for the file it +compiled. + +Finally, any function defined with an ``inline`` qualifier will always be +inlined by ``ispc``; ``inline`` is not a hint, but forces inlining. The +compiler will opportunistically inline short functions depending on their +complexity, but any function that should always be inlined should have the +``inline`` qualifier. + + +Function Overloading +-------------------- + ``ispc`` supports most of C's control flow constructs, including ``if``, ``for``, ``while``, ``do``. You can use ``break`` and ``continue`` statements in ``for``, ``while``, and ``do`` loops. 
@@ -1335,13 +1372,8 @@ at run-time, in which case it can jump to a simpler code path or otherwise save work. The first of these statements is ``cif``, indicating an ``if`` statement -that is expected to be coherent. Recall from the `The -SPMD-on-SIMD Execution Model`_ section that ``if`` statements with a -``uniform`` test compile to more efficient code than ``if`` tests with -varying tests. ``cif`` can provide many benefits of ``if`` with a -uniform test in the case where the test is actually varying. - -The usage of ``cif`` in code is just the same as ``if``: +that is expected to be coherent. The usage of ``cif`` in code is just the +same as ``if``: :: @@ -1353,47 +1385,7 @@ The usage of ``cif`` in code is just the same as ``if``: ``cif`` provides a hint to the compiler that you expect that most of the executing SPMD programs will all have the same result for the ``if`` -condition. In this case, the code the compiler generates for the ``if`` -test is along the lines of the following pseudo-code: - -:: - - bool expr = /* evaluate cif condition */ - if (all(expr)) { - // run "true" case of if test only - } else if (!any(expr)) { - // run "false" case of if test only - } else { - // run both true and false cases, updating mask appropriately - } - -(For comparison, see the discussion of how regular ``if`` statements are -executed from the `The SPMD-on-SIMD Execution Model`_ -section.) - -For ``if`` statements where the different running SPMD program instances -don't have coherent values for the boolean ``if`` test, using ``cif`` -introduces some additional overhead from the ``all`` and ``any`` tests as -well as the corresponding branches. For cases where the program -instances often do compute the same boolean value, this overhead is -worthwhile. If the control flow is in fact usually incoherent, this -overhead only costs performance. - -In a similar fashion, ``ispc`` provides ``cfor``, ``cwhile``, ``cdo``, -``cbreak``, ``ccontinue``, and ``creturn`` statements. 
These statements -are semantically the same as the corresponding non-"c"-prefixed functions. - -For example, when ``ispc`` encounters a regular ``continue`` statement in -the middle of loop, it disables the mask bits for the program instances -that executed the ``continue`` and then executes the remainder of the loop -body, under the expectation that other executing program instances will -still need to run those instructions. If you expect that all running -program instances will often execute ``continue`` together, then -``ccontinue`` provides the compiler a hint to do extra work to check if -every running program instance continued, in which case it can jump to the -end of the loop, saving the work of executing the otherwise meaningless -instructions. - +condition. Program Instance Convergence ---------------------------- @@ -2952,6 +2944,9 @@ elements to work with and then proceeds with the computation. } +Related Languages +================= + Disclaimer and Legal Information ================================ diff --git a/docs/perf.txt b/docs/perf.txt index 03f83ba1..b2d98207 100644 --- a/docs/perf.txt +++ b/docs/perf.txt @@ -2,38 +2,275 @@ Intel® SPMD Program Compiler Performance Guide ============================================== +The SPMD programming model provided by ``ispc`` naturally delivers +excellent performance for many workloads thanks to efficient use of CPU +SIMD vector hardware. This guide provides more details about how to get +the most out of ``ispc`` in practice. 
-* `Using ISPC Effectively`_ +* `Key Concepts`_ - + `Gather and Scatter`_ - + `8 and 16-bit Integer Types`_ - + `Low-level Vector Tricks`_ - + `The "Fast math" Option`_ - + `"Inline" Aggressively`_ - + `Small Performance Tricks`_ - + `Instrumenting Your ISPC Programs`_ - + `Choosing A Target Vector Width`_ + + `Efficient Iteration With "foreach"`_ + + `Improving Control Flow Coherence With "foreach_tiled"`_ + + `Using Coherent Control Flow Constructs`_ + + `Use "uniform" Whenever Appropriate`_ + +* `Tips and Techniques`_ + + + `Understanding Gather and Scatter`_ + + `Avoid 64-bit Addressing Calculations When Possible`_ + + `Avoid Computation With 8 and 16-bit Integer Types`_ + `Implementing Reductions Efficiently`_ + + `Using Low-level Vector Tricks`_ + + `The "Fast math" Option`_ + + `"inline" Aggressively`_ + + `Avoid The System Math Library`_ + + `Declare Variables In The Scope Where They're Used`_ + + `Instrumenting ISPC Programs To Understand Runtime Behavior`_ + + `Choosing A Target Vector Width`_ * `Disclaimer and Legal Information`_ * `Optimization Notice`_ +Key Concepts +============ -don't use the system math library unless it's absolutely necessary +This section describes the four most important concepts to understand and +keep in mind when writing high-performance ``ispc`` programs. It assumes +good familiarity with the topics covered in the ``ispc`` `Users Guide`_. -opt=32-bit-addressing +.. _Users Guide: ispc.html -Using ISPC Effectively -====================== +Efficient Iteration With "foreach" +---------------------------------- + +The ``foreach`` parallel iteration construct is semantically equivalent to +a regular ``for()`` loop, though it offers meaningful performance benefits. +(See the `documentation on "foreach" in the Users Guide`_ for a review of +its syntax and semantics.) As an example, consider this simple function +that iterates over some number of elements in an array, doing computation +on each one: + +.. 
_documentation on "foreach" in the Users Guide: ispc.html#parallel-iteration-statements-foreach-and-foreach-tiled + +:: + + export void foo(uniform int a[], uniform int count) { + for (int i = programIndex; i < count; i += programCount) { + // do some computation on a[i] + } + } + +Depending on the specifics of the computation being performed, the code +generated for this function could likely be improved by modifying the code +so that the loop only goes as far through the data as is possible to pack +an entire gang of program instances with computation each time through the +loop. Doing so enables the ``ispc`` compiler to generate more efficient +code for cases where it knows that the execution mask is "all on". Then, +an ``if`` statement at the end handles processing the ragged extra bits of +data that didn't fully fill a gang. + +:: + + export void foo(uniform int a[], uniform int count) { + // First, just loop up to the point where all program instances + // in the gang will be active at the loop iteration start + uniform int countBase = count & ~(programCount-1); + for (uniform int i = 0; i < countBase; i += programCount) { + int index = i + programIndex; + // do some computation on a[index] + } + // Now handle the ragged extra bits at the end + if (countBase < count) { + int index = countBase + programIndex; + // do some computation on a[index] + } + } + +While the performance of the above code will likely be better than the +first version of the function, the loop body code has been duplicated (or +has been forced to move into a separate utility function). + +Using the ``foreach`` looping construct as below provides all of the +performance benefits of the second version of this function, with the +compactness of the first. + +:: + + export void foo(uniform int a[], uniform int count) { + foreach (i = 0 ...
count) { + // do some computation on a[i] + } + } + +Improving Control Flow Coherence With "foreach_tiled" +----------------------------------------------------- + +Depending on the computation being performed, ``foreach_tiled`` may give +better performance than ``foreach``. (See the `documentation in the Users +Guide`_ for the syntax and semantics of ``foreach_tiled``.) Given a +multi-dimensional iteration like: + +.. _documentation in the Users Guide: ispc.html#parallel-iteration-statements-foreach-and-foreach-tiled + +:: + + foreach (i = 0 ... width, j = 0 ... height) { + // do computation on element (i,j) + } + +if the ``foreach`` statement is used, elements in the gang of program +instances will be mapped to values of ``i`` and ``j`` by taking spans of +``programCount`` elements across ``i`` with a single value of ``j``. For +example, the ``foreach`` statement above roughly corresponds to: + +:: + + for (uniform int j = 0; j < height; ++j) + for (int i = 0; i < width; i += programCount) { + // do computation + } + +When a multi-dimensional domain is being iterated over, ``foreach_tiled`` +statement maps program instances to data in a way that tries to select +square n-dimensional segments of the domain. For example, on a compilation +target with 8-wide gangs of program instances, it generates code that +iterates over the domain the same way as the following code (though more +efficiently): + +:: + + for (int j = programIndex/4; j < height; j += 2) + for (int i = programIndex%4; i < width; i += 4) { + // do computation + } + +Thus, each gang of program instances operates on a 2x4 tile of the domain. +With higher-dimensional iteration and different gang sizes, a similar +mapping is performed--e.g. for 2D iteration with a 16-wide gang size, 4x4 +tiles are iterated over; for 4D iteration with a 8-gang, 1x2x2x2 tiles are +processed, and so forth. 
+ +Performance benefit can come from using ``foreach_tiled`` in that it +essentially optimizes for the benefit of iterating over *compact* regions +of the domain (while ``foreach`` iterates over the domain in a way that +generally allows linear memory access.) There are two benefits from +processing compact regions of the domain. + +First, it's often the case that the control flow coherence of the program +instances in the gang is improved; if data-dependent control flow decisions +are related to the values of the data in the domain being processed, and if +the data values have some coherence, iterating with compact regions will +improve control flow coherence. + +Second, processing compact regions may mean that the data accessed by +program instances in the gang is more coherent, leading to performance +benefits from better cache hit rates. + +As a concrete example, for the ray tracer example in the ``ispc`` +distribution (in the ``examples/rt`` directory), performance is 20% better +when the pixels are iterated over using ``foreach_tiled`` than ``foreach``, +because more coherent regions of the scene are accessed by the set of rays +in the gang of program instances. -Gather and Scatter ------------------ +Using Coherent Control Flow Constructs -------------------------------------- -The CPU is a poor fit for SPMD execution in some ways, the worst of which -is handling of general memory reads and writes from SPMD program instances. -For example, in a "simple" array index: +Recall from the ``ispc`` Users Guide, in the `SPMD-on-SIMD Execution Model +section`_ that ``if`` statements with a ``uniform`` test compile to more +efficient code than ``if`` tests with varying tests. The coherent ``cif`` +statement can provide many benefits of ``if`` with a uniform test in the +case where the test is actually varying. + +..
_SPMD-on-SIMD Execution Model section: ispc.html#the-spmd-on-simd-execution-model + +In this case, the code the compiler generates for the ``if`` +test is along the lines of the following pseudo-code: + +:: + + bool expr = /* evaluate cif condition */ + if (all(expr)) { + // run "true" case of if test only + } else if (!any(expr)) { + // run "false" case of if test only + } else { + // run both true and false cases, updating mask appropriately + } + +For ``if`` statements where the different running SPMD program instances +don't have coherent values for the boolean ``if`` test, using ``cif`` +introduces some additional overhead from the ``all`` and ``any`` tests as +well as the corresponding branches. For cases where the program +instances often do compute the same boolean value, this overhead is +worthwhile. If the control flow is in fact usually incoherent, this +overhead only costs performance. + +In a similar fashion, ``ispc`` provides ``cfor``, ``cwhile``, ``cdo``, +``cbreak``, ``ccontinue``, and ``creturn`` statements. These statements +are semantically the same as the corresponding non-"c"-prefixed functions. + +For example, when ``ispc`` encounters a regular ``continue`` statement in +the middle of loop, it disables the mask bits for the program instances +that executed the ``continue`` and then executes the remainder of the loop +body, under the expectation that other executing program instances will +still need to run those instructions. If you expect that all running +program instances will often execute ``continue`` together, then +``ccontinue`` provides the compiler a hint to do extra work to check if +every running program instance continued, in which case it can jump to the +end of the loop, saving the work of executing the otherwise meaningless +instructions. 
+ +Use "uniform" Whenever Appropriate +---------------------------------- + +For any variable that will always have the same value across all of the +program instances in a gang, declare the variable with the ``uniform`` +qualifier. Doing so enables the ``ispc`` compiler to emit better code in +many different ways. + +As a simple example, consider a ``for`` loop that always does the same +number of iterations: + +:: + + for (int i = 0; i < 10; ++i) + // do something ten times + +If this is written with ``i`` as a ``varying`` variable, as above, there's +additional overhead in the code generated for the loop as the compiler +emits instructions to handle the possibility of not all program instances +following the same control flow path (as might be the case if the loop +limit, 10, was itself a ``varying`` value.) + +If the above loop is instead written with ``i`` ``uniform``, as: + +:: + + for (uniform int i = 0; i < 10; ++i) + // do something ten times + +Then better code can be generated (and the loop possibly unrolled). + +In some cases, the compiler may be able to detect simple cases like these, +but it's always best to provide the compiler with as much help as possible +to understand the actual form of your computation. + + +Tips and Techniques +=================== + +This section introduces a number of additional techniques that are worth +keeping in mind when writing ``ispc`` programs. + +Understanding Gather and Scatter +-------------------------------- + +Memory reads and writes from the program instances in a gang that access +irregular memory locations (rather than a consecutive set of locations, or +a single location) can be relatively inefficient. As an example, consider +the "simple" array indexing calculation below: :: @@ -41,23 +278,23 @@ For example, in a "simple" array index: uniform float x[10] = { ...
}; float f = x[i]; -Since the index ``i`` is a varying value, the various SPMD program -instances will in general be reading different locations in the array -``x``. Because the CPU doesn't have a gather instruction, the ``ispc`` -compiler has to serialize these memory reads, performing a separate memory -load for each running program instance, packing the result into ``f``. -(And the analogous case would happen for a write into ``x[i]``.) +Since the index ``i`` is a varying value, the program instances in the gang +will in general be reading different locations in the array ``x``. Because +current CPUs don't have a "gather" instruction, the ``ispc`` compiler has to +serialize these memory reads, performing a separate memory load for each +running program instance, packing the result into ``f``. (The analogous +case happens for a write into ``x[i]``.) -In many cases, gathers like these are unavoidable; the running program -instances just need to access incoherent memory locations. However, if the -array index ``i`` could actually be declared and used as a ``uniform`` -variable, the resulting array index is substantially more -efficient. This is another case where using ``uniform`` whenever applicable -is of benefit. +In many cases, gathers like these are unavoidable; the program instances +just need to access incoherent memory locations. However, if the array +index ``i`` actually has the same value for all of the program instances or +if it represents an access to a consecutive set of array locations, much +more efficient load and store instructions can be generated instead of +gathers and scatters, respectively. -In some cases, the ``ispc`` compiler is able to deduce that the memory -locations accessed are either all the same or are uniform. For example, -given: +In many cases, the ``ispc`` compiler is able to deduce that the memory +locations accessed by a varying index are either all the same or are +uniform.
For example, given: :: @@ -67,37 +304,160 @@ given: The compiler is able to determine that all of the program instances are loading from the same location, even though ``y`` is not a ``uniform`` -variable. In this case, the compiler will transform this load to a regular vector -load, rather than a general gather. +variable. In this case, the compiler will transform this load to a regular +vector load, rather than a general gather. -Sometimes the running program instances will access a -linear sequence of memory locations; this happens most frequently when -array indexing is done based on the built-in ``programIndex`` variable. In -many of these cases, the compiler is also able to detect this case and then -do a vector load. For example, given: +Sometimes the running program instances will access a linear sequence of +memory locations; this happens most frequently when array indexing is done +based on the built-in ``programIndex`` variable. In many of these cases, +the compiler is also able to detect this case and then do a vector load. +For example, given: :: - uniform int x = ...; - return array[2*x + programIndex]; + for (int i = programIndex; i < count; i += programCount) + // process array[i]; -A regular vector load is done from array, starting at offset ``2*x``. +Regular vector loads and stores are issued for accesses to ``array[i]``. +Both of these cases have been ones where the compiler is able to determine +statically that the index has the same value at compile-time. It's +often the case that this determination can't be made at compile time, but +this is often the case at run time. The ``reduce_equal()`` function from +the standard library can be used in this case; it checks to see if the +given value is the same across all of the running program instances, +returning true and its ``uniform`` value if so.
-8 and 16-bit Integer Types --------------------------- +The following function shows the use of ``reduce_equal()`` to check for an +equal index at execution time and then either do a scalar load and +broadcast or a general gather. + +:: + + uniform float array[..] = { ... }; + float value; + int i = ...; + uniform int ui; + if (reduce_equal(i, &ui) == true) + value = array[ui]; // scalar load + broadcast + else + value = array[i]; // gather + +For a simple case like the one above, the overhead of doing the +``reduce_equal()`` check is likely not worthwhile compared to just always +doing a gather. In more complex cases, where a number of accesses are done +based on the index, it can be worth doing. See the example +``examples/volume_rendering`` in the ``ispc`` distribution for the use of +this technique in an instance where it is beneficial to performance. + +Avoid 64-bit Addressing Calculations When Possible +-------------------------------------------------- + +Even when compiling to a 64-bit architecture target, ``ispc`` does many of +the addressing calculations in 32-bit precision by default--this behavior +can be overridden with the ``--addressing=64`` command-line argument. This +option should only be used if it's necessary to be able to address over 4GB +of memory in the ``ispc`` code, as it essentially doubles the cost of +memory addressing calculations in the generated code. + +Avoid Computation With 8 and 16-bit Integer Types +------------------------------------------------- The code generated for 8 and 16-bit integer types is generally not as efficient as the code generated for 32-bit integer types. It is generally worthwhile to use 32-bit integer types for intermediate computations, even if the final result will be stored in a smaller integer type. -Low-level Vector Tricks ------------------------ +Implementing Reductions Efficiently +----------------------------------- -Many low-level Intel® SSE coding constructs can be implemented in ``ispc`` -code. 
For example, the following code efficiently reverses the sign of the -given values. +It's often necessary to compute a reduction over a data set--for example, +one might want to add all of the values in an array, compute their minimum, +etc. ``ispc`` provides a few capabilities that make it easy to efficiently +compute reductions like these. However, it's important to use these +capabilities appropriately for best results. + +As an example, consider the task of computing the sum of all of the values +in an array. In C code, we might have: + +:: + + /* C implementation of a sum reduction */ + float sum(const float array[], int count) { + float sum = 0; + for (int i = 0; i < count; ++i) + sum += array[i]; + return sum; + } + +Exactly this computation could also be expressed as a purely uniform +computation in ``ispc``, though without any benefit from vectorization: + +:: + + /* inefficient ispc implementation of a sum reduction */ + uniform float sum(const uniform float array[], uniform int count) { + uniform float sum = 0; + for (uniform int i = 0; i < count; ++i) + sum += array[i]; + return sum; + } + +As a first try, one might try using the ``reduce_add()`` function from the +``ispc`` standard library; it takes a ``varying`` value and returns the sum +of that value across all of the active program instances. + +:: + + /* inefficient ispc implementation of a sum reduction */ + uniform float sum(const uniform float array[], uniform int count) { + uniform float sum = 0; + foreach (i = 0 ... count) + sum += reduce_add(array[i+programIndex]); + return sum; + } + +This implementation loads a gang's worth of values from the array, one for +each of the program instances, and then uses ``reduce_add()`` to reduce +across the program instances and then update the sum. Unfortunately this +approach loses most benefit from vectorization, as it does more work on the +cross-program instance ``reduce_add()`` call than it saves from the vector +load of values. 
+
+The most efficient approach is to do the reduction in two phases: rather
+than using a ``uniform`` variable to store the sum, we maintain a varying
+value, such that each program instance is effectively computing a local
+partial sum on the subset of array values that it has loaded from the
+array. When the loop over array elements concludes, a single call to
+``reduce_add()`` computes the final reduction across each of the program
+instances' elements of ``sum``. This approach effectively compiles to a
+single vector load and a single vector add for each loop iteration's worth
+of values--very efficient code in the end.
+
+::
+
+    /* good ispc implementation of a sum reduction */
+    uniform float sum(const uniform float array[], uniform int count) {
+        float sum = 0;
+        foreach (i = 0 ... count)
+            sum += array[i+programIndex];
+        return reduce_add(sum);
+    }
+
+Using Low-level Vector Tricks
+-----------------------------
+
+Many low-level Intel® SSE and AVX coding constructs can be implemented in
+``ispc`` code. The ``ispc`` standard library functions ``intbits()`` and
+``floatbits()`` are often useful in this context. Recall that
+``intbits()`` takes a ``float`` value and returns it as an integer where
+the bits of the integer are the same as the bit representation in memory of
+the ``float``. (In other words, it does *not* perform a floating-point to
+integer conversion.) ``floatbits()``, then, performs the inverse
+computation.
+
+As an example of the use of these functions, the following code efficiently
+reverses the sign of the given values.
 
 ::
 
@@ -112,12 +472,12 @@ This code compiles down to a single XOR instruction.
 The "Fast math" Option
 ----------------------
 
-``ispc`` has a ``--fast-math`` command-line flag that enables a number of
-optimizations that may be undesirable in code where numerical preceision is
-critically important. For many graphics applications, the
-approximations may be acceptable. The following two optimizations are
-performed when ``--fast-math`` is used. By default, the ``--fast-math``
-flag is off.
+``ispc`` has a ``--opt=fast-math`` command-line flag that enables a number of
+optimizations that may be undesirable in code where numerical precision is
+critically important. For many graphics applications, for example, the
+approximations introduced may be acceptable, however. The following two
+optimizations are performed when ``--opt=fast-math`` is used. By default, the
+``--opt=fast-math`` flag is off.
 
 * Expressions like ``x / y``, where ``y`` is a compile-time constant, are
   transformed to ``x * (1./y)``, where the inverse value of ``y`` is
@@ -125,18 +485,34 @@ flag is off.
 * Expressions like ``x / y``, where ``y`` is not a compile-time constant,
   are transformed to ``x * rcp(y)``, where ``rcp()`` maps to the
-  approximate reciprocal instruction from the standard library.
+  approximate reciprocal instruction from the ``ispc`` standard library.
 
-"Inline" Aggressively
+"inline" Aggressively
 ---------------------
 
 Inlining functions aggressively is generally beneficial for performance
 with ``ispc``. Definitely use the ``inline`` qualifier for any short
 functions (a few lines long), and experiment with it for longer functions.
 
-Small Performance Tricks
-------------------------
+Avoid The System Math Library
+-----------------------------
+
+The default math library for transcendentals and the like that ``ispc``
+uses has higher error than the system's math library, though it is much
+more efficient due to being vectorized across the program instances and due
+to the fact that the functions can be inlined in the final code. (It
+generally has errors in the range of 10ulps, while the system math library
+generally has no more than 1ulp of error for transcendentals.)
+
+If the ``--math-lib=system`` command-line option is used when compiling an
+``ispc`` program, then calls to the system math library will be generated
+instead. 
This option should only be used if the higher precision is +absolutely required as the performance impact of using it can be +significant. + +Declare Variables In The Scope Where They're Used +------------------------------------------------- Performance is slightly improved by declaring variables at the same block scope where they are first used. For example, in code like the @@ -168,8 +544,8 @@ Try not to write code as: Doing so can reduce the amount of masked store instructions that the compiler needs to generate. -Instrumenting Your ISPC Programs --------------------------------- +Instrumenting ISPC Programs To Understand Runtime Behavior +---------------------------------------------------------- ``ispc`` has an optional instrumentation feature that can help you understand performance issues. If a program is compiled using the @@ -187,7 +563,7 @@ gathers happen.) This function is passed the file name of the ``ispc`` file running, a short note indicating what is happening, the line number in the source file, and -the current mask of active SPMD program lanes. You must provide an +the current mask of active program instances in the gang. You must provide an implementation of this function and link it in with your application. For example, when the ``ispc`` program runs, this function might be called @@ -199,13 +575,13 @@ as follows: This call indicates that at the currently executing program has just entered the function defined at line 55 of the file ``foo.ispc``, with a -mask of all lanes currently executing (assuming a four-wide Intel® SSE +mask of all lanes currently executing (assuming a four-wide gang size target machine). For a fuller example of the utility of this functionality, see ``examples/aobench_instrumented`` in the ``ispc`` distribution. Ths -example includes an implementation of the ``ISPCInstrument`` function that -collects aggregate data about the program's execution behavior. 
+example includes an implementation of the ``ISPCInstrument()`` function +that collects aggregate data about the program's execution behavior. When running this example, you will want to direct to the ``ao`` executable to generate a low resolution image, because the instrumentation adds @@ -252,85 +628,6 @@ It is selected with the ``--target=sse2-x2``, ``--target=sse4-x2`` and ``--target=avx-x2`` options, respectively. -Implementing Reductions Efficiently ------------------------------------ - -It's often necessary to compute a "reduction" over a data set--for example, -one might want to add all of the values in an array, compute their minimum, -etc. ``ispc`` provides a few capabilities that make it easy to efficiently -compute reductions like these. However, it's important to use these -capabilities appropriately for best results. - -As an example, consider the task of computing the sum of all of the values -in an array. In C code, we might have: - -:: - - /* C implementation of a sum reduction */ - float sum(const float array[], int count) { - float sum = 0; - for (int i = 0; i < count; ++i) - sum += array[i]; - return sum; - } - -Of course, exactly this computation could also be expressed in ``ispc``, -though without any benefit from vectorization: - -:: - - /* inefficient ispc implementation of a sum reduction */ - uniform float sum(const uniform float array[], uniform int count) { - uniform float sum = 0; - for (uniform int i = 0; i < count; ++i) - sum += array[i]; - return sum; - } - -As a first try, one might try using the ``reduce_add()`` function from the -``ispc`` standard library; it takes a ``varying`` value and returns the sum -of that value across all of the active program instances. 
- -:: - - /* inefficient ispc implementation of a sum reduction */ - uniform float sum(const uniform float array[], uniform int count) { - uniform float sum = 0; - // Assumes programCount evenly divides count - for (uniform int i = 0; i < count; i += programCount) - sum += reduce_add(array[i+programIndex]); - return sum; - } - -This implementation loads a set of ``programCount`` values from the array, -one for each of the program instances, and then uses ``reduce_add`` to -reduce across the program instances and then update the sum. Unfortunately -this approach loses most benefit from vectorization, as it does more work -on the cross-program instance ``reduce_add()`` call than it saves from the -vector load of values. - -The most efficient approach is to do the reduction in two phases: rather -than using a ``uniform`` variable to store the sum, we maintain a varying -value, such that each program instance is effectively computing a local -partial sum on the subset of array values that it has loaded from the -array. When the loop over array elements concludes, a single call to -``reduce_add()`` computes the final reduction across each of the program -instances' elements of ``sum``. This approach effectively compiles to a -single vector load and a single vector add for each ``programCount`` worth -of values--very efficient code in the end. - -:: - - /* good ispc implementation of a sum reduction */ - uniform float sum(const uniform float array[], uniform int count) { - float sum = 0; - // Assumes programCount evenly divides count - for (uniform int i = 0; i < count; i += programCount) - sum += array[i+programIndex]; - return reduce_add(sum); - } - - Disclaimer and Legal Information ================================