From 2951cad365f7dbaf4bc3f73b0a822ddd4a40357b Mon Sep 17 00:00:00 2001 From: evghenii Date: Mon, 9 Dec 2013 13:10:26 +0100 Subject: [PATCH 1/8] added description for multi-dimensional tasking --- docs/ispc.rst | 63 +++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 51 insertions(+), 12 deletions(-) diff --git a/docs/ispc.rst b/docs/ispc.rst index eac9b24e..3aab730b 100644 --- a/docs/ispc.rst +++ b/docs/ispc.rst @@ -3012,8 +3012,8 @@ Intel® Cilk(tm), Intel® Thread Building Blocks or another task system), and for tasks to use ``ispc`` for SPMD parallelism across the vector lanes as appropriate. Alternatively, ``ispc`` also has support for launching tasks from ``ispc`` code. The approach is similar to Intel® Cilk's task launch -feature. (See the ``examples/mandelbrot_tasks`` example to see it used in -a small example.) +feature. (See the ``examples/mandelbrot_tasks`` and +``examples/mandelbrot_tasks3d`` examples to see it used in a small example.) Any function that is launched as a task must be declared with the ``task`` qualifier: @@ -3108,6 +3108,38 @@ executing the current task. The ``threadIndex`` can be used for accessing data that is private to the current thread and thus doesn't require synchronization to access under parallel execution. +The tasking system also supports multi-dimensional partitioning (currently up +to three dimensions). To launch a 3D grid of tasks, for example with ``N0``, +``N1`` and ``N2`` tasks in x-, y- and z-dimension respectively + +:: + + float data[N2][N1][N0] + task void foo_task() + { + data[taskIndex2][taskIndex1][taskIndex0] = taskIndex; + } + +we use the following ``launch`` expressions: + +:: + + launch [N2][N1][N0] foo_task() + +or + +:: + + launch [N0,N1,N2] foo_task() + +The value of ``taskIndex`` is equal to ``taskIndex0 + taskCount0*(taskIndex1 + +taskCount1*taskIndex2)`` and it ranges from ``0`` to ``taskCount-1``, where +``taskCount = taskCount0*taskCount1*taskCount2``.
If ``N1`` and/or ``N2`` are +not specified in the ``launch`` expression, a value of ``1`` is assumed. +Finally, for a one-dimensional grid of tasks, ``taskIndex`` is equivalent to +``taskIndex0`` and ``taskCount`` is equivalent to ``taskCount0``. + + Task Parallelism: Runtime Requirements -------------------------------------- @@ -3138,7 +3170,7 @@ manage tasks in ``ispc``: :: void *ISPCAlloc(void **handlePtr, int64_t size, int32_t alignment); - void ISPCLaunch(void **handlePtr, void *f, void *data, int count); + void ISPCLaunch(void **handlePtr, void *f, void *data, int count0, int count1, int count2); void ISPCSync(void *handle); All three of these functions take an opaque handle (or a pointer to an @@ -3175,16 +3207,20 @@ tasks. Each ``launch`` statement in ``ispc`` code causes a call to after the handle pointer to the function are relatively straightforward; the ``void *f`` parameter holds a pointer to a function to call to run the work for this task, ``data`` holds a pointer to data to pass to this -function, and ``count`` is the number of instances of this function to -enqueue for asynchronous execution. (In other words, ``count`` corresponds -to the value ``n`` in a multiple-task launch statement like ``launch[n]``.) +function, and ``count0``, ``count1`` and ``count2`` are the numbers of instances +of this function to enqueue for asynchronous execution. (In other words, +``count0``, ``count1`` and ``count2`` correspond to the values ``n0``, ``n1`` +and ``n2`` in a multiple-task launch statement like ``launch[n2][n1][n0]`` or +``launch [n0,n1,n2]`` respectively.)
The signature of the provided function pointer ``f`` is :: void (*TaskFuncPtr)(void *data, int threadIndex, int threadCount, - int taskIndex, int taskCount) + int taskIndex, int taskCount, + int taskIndex0, int taskIndex1, int taskIndex2, + int taskCount0, int taskCount1, int taskCount2); When this function pointer is called by one of the hardware threads managed by the task system, the ``data`` pointer passed to ``ISPCLaunch()`` should @@ -3194,11 +3230,14 @@ number of hardware threads that have been spawned to run tasks and uniquely identifying the hardware thread that is running the task. (These values can be used to index into thread-local storage.) -The value of ``taskCount`` should be the number of tasks launched in the -``launch`` statement that caused the call to ``ISPCLaunch()`` and each of -the calls to this function should be given a unique value of ``taskIndex`` -between zero and ``taskCount``, to distinguish which of the instances -of the set of launched tasks is running. +The value of ``taskCount`` should be the total number of tasks launched in the +``launch`` statement (it must be equal to ``taskCount0*taskCount1*taskCount2``) +that caused the call to ``ISPCLaunch()`` and each of the calls to this function +should be given a unique value of ``taskIndex``, ``taskIndex0``, ``taskIndex1`` +and ``taskIndex2`` between zero and ``taskCount``, ``taskCount0``, +``taskCount1`` and ``taskCount2`` respectively, with ``taskIndex = taskIndex0 ++ taskCount0*(taskIndex1 + taskCount1*taskIndex2)``, to distinguish which of +the instances of the set of launched tasks is running. From c06ec92d0d79acaa398a9c109baa52525b22cd1d Mon Sep 17 00:00:00 2001 From: evghenii Date: Fri, 13 Dec 2013 11:49:11 +0100 Subject: [PATCH 2/8] added commas, added multi-dimensional tasking to mandelbrot_tasks & removed mandelbrot_task3d. 
Also adjusted documentaiton a bit --- builtins/util.m4 | 2 +- docs/ispc.rst | 3 +- examples/mandelbrot_tasks/Makefile | 2 +- .../mandelbrot_tasks/mandelbrot_tasks.cpp | 3 +- .../mandelbrot_tasks/mandelbrot_tasks.ispc | 29 ++- examples/mandelbrot_tasks3d/.gitignore | 2 - examples/mandelbrot_tasks3d/Makefile | 8 - .../mandelbrot_tasks.vcxproj | 180 ------------------ .../mandelbrot_tasks3d/mandelbrot_tasks3d.cpp | 146 -------------- .../mandelbrot_tasks3d.ispc | 99 ---------- .../mandelbrot_tasks_serial.cpp | 68 ------- examples/tasksys.cpp | 2 +- test_static.cpp | 2 +- 13 files changed, 28 insertions(+), 518 deletions(-) delete mode 100644 examples/mandelbrot_tasks3d/.gitignore delete mode 100644 examples/mandelbrot_tasks3d/Makefile delete mode 100644 examples/mandelbrot_tasks3d/mandelbrot_tasks.vcxproj delete mode 100644 examples/mandelbrot_tasks3d/mandelbrot_tasks3d.cpp delete mode 100644 examples/mandelbrot_tasks3d/mandelbrot_tasks3d.ispc delete mode 100644 examples/mandelbrot_tasks3d/mandelbrot_tasks_serial.cpp diff --git a/builtins/util.m4 b/builtins/util.m4 index c90e8adc..1580dc08 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -1813,7 +1813,7 @@ define(`stdlib_core', ` declare i32 @__fast_masked_vload() declare i8* @ISPCAlloc(i8**, i64, i32) nounwind -declare void @ISPCLaunch(i8**, i8*, i8*, i32,i32,i32) nounwind +declare void @ISPCLaunch(i8**, i8*, i8*, i32, i32, i32) nounwind declare void @ISPCSync(i8*) nounwind declare void @ISPCInstrument(i8*, i8*, i32, i64) nounwind diff --git a/docs/ispc.rst b/docs/ispc.rst index 3aab730b..04f478dc 100644 --- a/docs/ispc.rst +++ b/docs/ispc.rst @@ -3012,8 +3012,7 @@ Intel® Cilk(tm), Intel® Thread Building Blocks or another task system), and for tasks to use ``ispc`` for SPMD parallelism across the vector lanes as appropriate. Alternatively, ``ispc`` also has support for launching tasks from ``ispc`` code. The approach is similar to Intel® Cilk's task launch -feature. 
(See the ``examples/mandelbrot_tasks`` and -``examples/mandelbrot_tasks3d`` examples to see it used in a small example.) +feature. (Check the ``examples/mandelbrot_tasks`` example to see how it is used.) Any function that is launched as a task must be declared with the ``task`` qualifier: diff --git a/examples/mandelbrot_tasks/Makefile b/examples/mandelbrot_tasks/Makefile index 1a565ffd..a50631ab 100644 --- a/examples/mandelbrot_tasks/Makefile +++ b/examples/mandelbrot_tasks/Makefile @@ -2,7 +2,7 @@ EXAMPLE=mandelbrot_tasks CPP_SRC=mandelbrot_tasks.cpp mandelbrot_tasks_serial.cpp ISPC_SRC=mandelbrot_tasks.ispc -ISPC_IA_TARGETS=sse2,sse4-x2,avx-x2 +ISPC_IA_TARGETS=sse2,sse4-x2,avx ISPC_ARM_TARGETS=neon include ../common.mk diff --git a/examples/mandelbrot_tasks/mandelbrot_tasks.cpp b/examples/mandelbrot_tasks/mandelbrot_tasks.cpp index 698daf0f..802afde0 100644 --- a/examples/mandelbrot_tasks/mandelbrot_tasks.cpp +++ b/examples/mandelbrot_tasks/mandelbrot_tasks.cpp @@ -38,7 +38,8 @@ #pragma warning (disable: 4305) #endif -#include +#include +#include #include #include #include "../timing.h" diff --git a/examples/mandelbrot_tasks/mandelbrot_tasks.ispc b/examples/mandelbrot_tasks/mandelbrot_tasks.ispc index 84d4ccd4..f9b0be4c 100644 --- a/examples/mandelbrot_tasks/mandelbrot_tasks.ispc +++ b/examples/mandelbrot_tasks/mandelbrot_tasks.ispc @@ -57,21 +57,26 @@ task void mandelbrot_scanline(uniform float x0, uniform float dx, uniform float y0, uniform float dy, uniform int width, uniform int height, - uniform int span, + uniform int xspan, uniform int yspan, uniform int maxIterations, uniform int output[]) { - uniform int ystart = taskIndex * span; - uniform int yend = min((taskIndex+1) * span, (unsigned int)height); + const uniform int xstart = taskIndex0 * xspan; + const uniform int xend = min(xstart + xspan, width); - foreach (yi = ystart ... yend, xi = 0 ... 
width) { + const uniform int ystart = taskIndex1 * yspan; + const uniform int yend = min(ystart + yspan, height); + + + foreach (yi = ystart ... yend, xi = xstart ... xend) { float x = x0 + xi * dx; float y = y0 + yi * dy; int index = yi * width + xi; output[index] = mandel(x, y, maxIterations); } + } - +#if 1 export void mandelbrot_ispc(uniform float x0, uniform float y0, uniform float x1, uniform float y1, @@ -79,8 +84,16 @@ mandelbrot_ispc(uniform float x0, uniform float y0, uniform int maxIterations, uniform int output[]) { uniform float dx = (x1 - x0) / width; uniform float dy = (y1 - y0) / height; - uniform int span = 4; + const uniform int xspan = max(32, programCount*2); /* make sure it is big enough to avoid false-sharing */ + const uniform int yspan = 16; - launch[height/span] mandelbrot_scanline(x0, dx, y0, dy, width, height, span, - maxIterations, output); + +#if 1 + launch [width/xspan, height/yspan] +#else + launch [height/yspan][width/xspan] +#endif + mandelbrot_scanline(x0, dx, y0, dy, width, height, xspan, yspan, + maxIterations, output); } +#endif diff --git a/examples/mandelbrot_tasks3d/.gitignore b/examples/mandelbrot_tasks3d/.gitignore deleted file mode 100644 index c2471c27..00000000 --- a/examples/mandelbrot_tasks3d/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -mandelbrot -*.ppm diff --git a/examples/mandelbrot_tasks3d/Makefile b/examples/mandelbrot_tasks3d/Makefile deleted file mode 100644 index 3dd44d65..00000000 --- a/examples/mandelbrot_tasks3d/Makefile +++ /dev/null @@ -1,8 +0,0 @@ - -EXAMPLE=mandelbrot_tasks3d -CPP_SRC=mandelbrot_tasks3d.cpp mandelbrot_tasks_serial.cpp -ISPC_SRC=mandelbrot_tasks3d.ispc -ISPC_IA_TARGETS=avx,sse2,sse4 -ISPC_ARM_TARGETS=neon - -include ../common.mk diff --git a/examples/mandelbrot_tasks3d/mandelbrot_tasks.vcxproj b/examples/mandelbrot_tasks3d/mandelbrot_tasks.vcxproj deleted file mode 100644 index 3a8fca79..00000000 --- a/examples/mandelbrot_tasks3d/mandelbrot_tasks.vcxproj +++ /dev/null @@ -1,180 +0,0 @@ - - 
- - - Debug - Win32 - - - Debug - x64 - - - Release - Win32 - - - Release - x64 - - - - {E80DA7D4-AB22-4648-A068-327307156BE6} - Win32Proj - mandelbrot_tasks - - - - Application - true - Unicode - - - Application - true - Unicode - - - Application - false - true - Unicode - - - Application - false - true - Unicode - - - - - - - - - - - - - - - - - - - true - $(ProjectDir)..\..;$(ExecutablePath) - mandelbrot_tasks - - - true - $(ProjectDir)..\..;$(ExecutablePath) - mandelbrot_tasks - - - false - $(ProjectDir)..\..;$(ExecutablePath) - mandelbrot_tasks - - - false - $(ProjectDir)..\..;$(ExecutablePath) - mandelbrot_tasks - - - - - - Level3 - Disabled - WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - true - Fast - - - Console - true - - - - - - - Level3 - Disabled - WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - true - Fast - - - Console - true - - - - - Level3 - - - MaxSpeed - true - true - WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - Fast - - - Console - true - true - true - - - - - Level3 - - - MaxSpeed - true - true - WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - Fast - - - Console - true - true - true - - - - - - - - - - Document - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2 - - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2 - - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2 - - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj 
-h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2 - - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - - - - - - diff --git a/examples/mandelbrot_tasks3d/mandelbrot_tasks3d.cpp b/examples/mandelbrot_tasks3d/mandelbrot_tasks3d.cpp deleted file mode 100644 index 9cbb966a..00000000 --- a/examples/mandelbrot_tasks3d/mandelbrot_tasks3d.cpp +++ /dev/null @@ -1,146 +0,0 @@ -/* - Copyright (c) 2010-2011, Intel Corporation - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of Intel Corporation nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS - IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER - OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*/ - -#ifdef _MSC_VER -#define _CRT_SECURE_NO_WARNINGS -#define NOMINMAX -#pragma warning (disable: 4244) -#pragma warning (disable: 4305) -#endif - -#include -#include -#include -#include "../timing.h" -#include "mandelbrot_tasks3d_ispc.h" -using namespace ispc; - -extern void mandelbrot_serial(float x0, float y0, float x1, float y1, - int width, int height, int maxIterations, - int output[]); - -/* Write a PPM image file with the image of the Mandelbrot set */ -static void -writePPM(int *buf, int width, int height, const char *fn) { - FILE *fp = fopen(fn, "wb"); - fprintf(fp, "P6\n"); - fprintf(fp, "%d %d\n", width, height); - fprintf(fp, "255\n"); - for (int i = 0; i < width*height; ++i) { - // Map the iteration count to colors by just alternating between - // two greys. - char c = (buf[i] & 0x1) ? 
240 : 20; - for (int j = 0; j < 3; ++j) - fputc(c, fp); - } - fclose(fp); - printf("Wrote image file %s\n", fn); -} - - -static void usage() { - fprintf(stderr, "usage: mandelbrot [--scale=]\n"); - exit(1); -} - -int main(int argc, char *argv[]) { - unsigned int width = 1536; - unsigned int height = 1024; - float x0 = -2; - float x1 = 1; - float y0 = -1; - float y1 = 1; - - if (argc == 1) - ; - else if (argc == 2) { - if (strncmp(argv[1], "--scale=", 8) == 0) { - float scale = atof(argv[1] + 8); - if (scale == 0.f) - usage(); - width *= scale; - height *= scale; - // round up to multiples of 16 - width = (width + 0xf) & ~0xf; - height = (height + 0xf) & ~0xf; - } - else - usage(); - } - else - usage(); - - int maxIterations = 512; - int *buf = new int[width*height]; - - // - // Compute the image using the ispc implementation; report the minimum - // time of three runs. - // - double minISPC = 1e30; - for (int i = 0; i < 3; ++i) { - // Clear out the buffer - for (unsigned int i = 0; i < width * height; ++i) - buf[i] = 0; - reset_and_start_timer(); - mandelbrot_ispc(x0, y0, x1, y1, width, height, maxIterations, buf); - double dt = get_elapsed_mcycles(); - minISPC = std::min(minISPC, dt); - } - - printf("[mandelbrot ispc+tasks]:\t[%.3f] million cycles\n", minISPC); - writePPM(buf, width, height, "mandelbrot-ispc.ppm"); - - - // - // And run the serial implementation 3 times, again reporting the - // minimum time. 
- // - double minSerial = 1e30; - for (int i = 0; i < 3; ++i) { - // Clear out the buffer - for (unsigned int i = 0; i < width * height; ++i) - buf[i] = 0; - reset_and_start_timer(); - mandelbrot_serial(x0, y0, x1, y1, width, height, maxIterations, buf); - double dt = get_elapsed_mcycles(); - minSerial = std::min(minSerial, dt); - } - - printf("[mandelbrot serial]:\t\t[%.3f] million cycles\n", minSerial); - writePPM(buf, width, height, "mandelbrot-serial.ppm"); - - printf("\t\t\t\t(%.2fx speedup from ISPC + tasks)\n", minSerial/minISPC); - - return 0; -} diff --git a/examples/mandelbrot_tasks3d/mandelbrot_tasks3d.ispc b/examples/mandelbrot_tasks3d/mandelbrot_tasks3d.ispc deleted file mode 100644 index 395bdca4..00000000 --- a/examples/mandelbrot_tasks3d/mandelbrot_tasks3d.ispc +++ /dev/null @@ -1,99 +0,0 @@ -/* - Copyright (c) 2010-2012, Intel Corporation - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of Intel Corporation nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS - IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER - OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*/ - -static inline int -mandel(float c_re, float c_im, int count) { - float z_re = c_re, z_im = c_im; - int i; - for (i = 0; i < count; ++i) { - if (z_re * z_re + z_im * z_im > 4.) - break; - - float new_re = z_re*z_re - z_im*z_im; - float new_im = 2.f * z_re * z_im; - unmasked { - z_re = c_re + new_re; - z_im = c_im + new_im; - } - } - - return i; -} - - -/* Task to compute the Mandelbrot iterations for a single scanline. - */ -task void -mandelbrot_scanline(uniform float x0, uniform float dx, - uniform float y0, uniform float dy, - uniform int width, uniform int height, - uniform int xspan, uniform int yspan, - uniform int maxIterations, uniform int output[]) { - const uniform int xstart = taskIndex0 * xspan; - const uniform int xend = min(xstart + xspan, width); - - const uniform int ystart = taskIndex1 * yspan; - const uniform int yend = min(ystart + yspan, height); - - - foreach (yi = ystart ... yend, xi = xstart ... 
xend) { - float x = x0 + xi * dx; - float y = y0 + yi * dy; - - int index = yi * width + xi; - output[index] = mandel(x, y, maxIterations); - } - -} - -#if 1 -export void -mandelbrot_ispc(uniform float x0, uniform float y0, - uniform float x1, uniform float y1, - uniform int width, uniform int height, - uniform int maxIterations, uniform int output[]) { - uniform float dx = (x1 - x0) / width; - uniform float dy = (y1 - y0) / height; - const uniform int xspan = 16; /* make sure it is big enough to avoid false-sharing */ - const uniform int yspan = 16; - - -#if 1 - launch [width/xspan, height/yspan] -#else - launch [height/yspan][width/xspan] -#endif - mandelbrot_scanline(x0, dx, y0, dy, width, height, xspan, yspan, - maxIterations, output); -} -#endif diff --git a/examples/mandelbrot_tasks3d/mandelbrot_tasks_serial.cpp b/examples/mandelbrot_tasks3d/mandelbrot_tasks_serial.cpp deleted file mode 100644 index a76fb5ca..00000000 --- a/examples/mandelbrot_tasks3d/mandelbrot_tasks_serial.cpp +++ /dev/null @@ -1,68 +0,0 @@ -/* - Copyright (c) 2010-2011, Intel Corporation - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of Intel Corporation nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. 
- - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS - IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER - OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*/ - - -static int mandel(float c_re, float c_im, int count) { - float z_re = c_re, z_im = c_im; - int i; - for (i = 0; i < count; ++i) { - if (z_re * z_re + z_im * z_im > 4.f) - break; - - float new_re = z_re*z_re - z_im*z_im; - float new_im = 2.f * z_re * z_im; - z_re = c_re + new_re; - z_im = c_im + new_im; - } - - return i; -} - -void mandelbrot_serial(float x0, float y0, float x1, float y1, - int width, int height, int maxIterations, - int output[]) -{ - float dx = (x1 - x0) / width; - float dy = (y1 - y0) / height; - - for (int j = 0; j < height; j++) { - for (int i = 0; i < width; ++i) { - float x = x0 + i * dx; - float y = y0 + j * dy; - - int index = (j * width + i); - output[index] = mandel(x, y, maxIterations); - } - } -} - diff --git a/examples/tasksys.cpp b/examples/tasksys.cpp index 6bc60129..b914068e 100644 --- a/examples/tasksys.cpp +++ b/examples/tasksys.cpp @@ -204,7 +204,7 @@ struct TaskInfo { // ispc expects these functions to have C linkage / not be mangled extern "C" { - void ISPCLaunch(void **handlePtr, void *f, void *data, int countx,int county, int countz); + void ISPCLaunch(void **handlePtr, void *f, void *data, int countx, int county, int countz); void *ISPCAlloc(void 
**handlePtr, int64_t size, int32_t alignment); void ISPCSync(void *handle); } diff --git a/test_static.cpp b/test_static.cpp index fceeb64e..27a5b136 100644 --- a/test_static.cpp +++ b/test_static.cpp @@ -69,7 +69,7 @@ extern "C" { void ISPCLaunch(void **handle, void *f, void *d, int count0, int count1, int count2) { *handle = (void *)0xdeadbeef; - typedef void (*TaskFuncType)(void *, int, int, int, int, int,int,int, int,int,int); + typedef void (*TaskFuncType)(void *, int, int, int, int, int, int, int, int, int, int); TaskFuncType func = (TaskFuncType)f; int count = count0*count1*count2, idx = 0; for (int k = 0; k < count2; ++k) From b506c92d21e68fd859f6835a255be43edcf43fd9 Mon Sep 17 00:00:00 2001 From: evghenii Date: Fri, 13 Dec 2013 13:55:58 +0100 Subject: [PATCH 3/8] restored-x2 --- examples/mandelbrot_tasks/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/mandelbrot_tasks/Makefile b/examples/mandelbrot_tasks/Makefile index a50631ab..1a565ffd 100644 --- a/examples/mandelbrot_tasks/Makefile +++ b/examples/mandelbrot_tasks/Makefile @@ -2,7 +2,7 @@ EXAMPLE=mandelbrot_tasks CPP_SRC=mandelbrot_tasks.cpp mandelbrot_tasks_serial.cpp ISPC_SRC=mandelbrot_tasks.ispc -ISPC_IA_TARGETS=sse2,sse4-x2,avx +ISPC_IA_TARGETS=sse2,sse4-x2,avx-x2 ISPC_ARM_TARGETS=neon include ../common.mk From b5dc78b06ea8f06405d93ce81a675ea5032ed3db Mon Sep 17 00:00:00 2001 From: Ilia Filippov Date: Mon, 16 Dec 2013 15:11:13 +0400 Subject: [PATCH 4/8] adding support of shl instruction in lExtractConstantOffset optimization --- opt.cpp | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/opt.cpp b/opt.cpp index 9059c746..c75d4225 100644 --- a/opt.cpp +++ b/opt.cpp @@ -1479,6 +1479,33 @@ lExtractConstantOffset(llvm::Value *vec, llvm::Value **constOffset, insertBefore); return; } + else if (bop->getOpcode() == llvm::Instruction::Shl) { + lExtractConstantOffset(op0, &c0, &v0, insertBefore); + lExtractConstantOffset(op1, &c1, &v1, 
insertBefore); + + // Given the product of constant and variable terms, we have: + // (c0 + v0) * (2^(c1 + v1)) = c0 * 2^c1 * 2^v1 + v0 * 2^c1 * 2^v1 + // We can optimize only if v1 == NULL. + if ((v1 != NULL) || (c0 == NULL) || (c1 == NULL)) { + *constOffset = NULL; + *variableOffset = vec; + } + else if (v0 == NULL) { + *constOffset = vec; + *variableOffset = NULL; + } + else { + *constOffset = + llvm::BinaryOperator::Create(llvm::Instruction::Shl, c0, c1, + LLVMGetName("shl", c0, c1), + insertBefore); + *variableOffset = + llvm::BinaryOperator::Create(llvm::Instruction::Shl, v0, c1, + LLVMGetName("shl", v0, c1), + insertBefore); + } + return; + } else if (bop->getOpcode() == llvm::Instruction::Mul) { lExtractConstantOffset(op0, &c0, &v0, insertBefore); lExtractConstantOffset(op1, &c1, &v1, insertBefore); From 37f3c0926cbb52b3cfedf178bb87eb7fc46fb8f4 Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Tue, 17 Dec 2013 17:11:57 +0400 Subject: [PATCH 5/8] Adding missing 3.4 handing in alloy.py (for alloy-build) --- alloy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/alloy.py b/alloy.py index 83296b46..0055842c 100755 --- a/alloy.py +++ b/alloy.py @@ -584,7 +584,7 @@ def Main(): if os.environ.get("SMTP_ISPC") == None: error("you have no SMTP_ISPC in your environment for option notify", 1) if options.only != "": - test_only_r = " 3.1 3.2 3.3 trunk current build stability performance x86 x86-64 -O0 -O2 native " + test_only_r = " 3.1 3.2 3.3 3.4 trunk current build stability performance x86 x86-64 -O0 -O2 native " test_only = options.only.split(" ") for iterator in test_only: if not (" " + iterator + " " in test_only_r): From 63ecf009ecbe6864511a64c6ddb8215aacae50a3 Mon Sep 17 00:00:00 2001 From: Evghenii Date: Tue, 17 Dec 2013 15:06:29 +0100 Subject: [PATCH 6/8] fix compilation for Visual Studio --- expr.cpp | 11 ++++++++--- expr.h | 2 +- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/expr.cpp b/expr.cpp index 60d9ce66..9f75ab08 
100644 --- a/expr.cpp +++ b/expr.cpp @@ -3544,9 +3544,14 @@ FunctionCallExpr::FunctionCallExpr(Expr *f, ExprList *a, SourcePos p, : Expr(p), isLaunch(il) { func = f; args = a; - launchCountExpr[0] = lce[0]; - launchCountExpr[1] = lce[1]; - launchCountExpr[2] = lce[2]; + if (lce != NULL) + { + launchCountExpr[0] = lce[0]; + launchCountExpr[1] = lce[1]; + launchCountExpr[2] = lce[2]; + } + else + launchCountExpr[0] = launchCountExpr[1] = launchCountExpr[2] = NULL; } diff --git a/expr.h b/expr.h index 0d46191b..e4d7e07b 100644 --- a/expr.h +++ b/expr.h @@ -247,7 +247,7 @@ class FunctionCallExpr : public Expr { public: FunctionCallExpr(Expr *func, ExprList *args, SourcePos p, bool isLaunch = false, - Expr *launchCountExpr[3] = (Expr*[3]){NULL, NULL, NULL}); + Expr *launchCountExpr[3] = NULL); llvm::Value *GetValue(FunctionEmitContext *ctx) const; llvm::Value *GetLValue(FunctionEmitContext *ctx) const; From 59b989d243eb47c1135d464da2badabf5eee7ec2 Mon Sep 17 00:00:00 2001 From: Evghenii Date: Tue, 17 Dec 2013 16:06:20 +0100 Subject: [PATCH 7/8] fix for --target=sse4-i18x16 --- tests/launch-8.ispc | 16 ++++++++-------- tests/launch-9.ispc | 16 ++++++++-------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/tests/launch-8.ispc b/tests/launch-8.ispc index a0b976e4..eacba673 100644 --- a/tests/launch-8.ispc +++ b/tests/launch-8.ispc @@ -10,14 +10,14 @@ static uniform float array[N2][N1][N0]; task void x(const float f) { uniform int j; - assert(taskCount == N0*N1*N2); - assert(taskCount0 == N0); - assert(taskCount1 == N1); - assert(taskCount2 == N2); - assert(taskIndex == taskIndex0 + N0*(taskIndex1 + N1*taskIndex2)); - assert(taskIndex0 < N0); - assert(taskIndex1 < N1); - assert(taskIndex2 < N2); + assert(taskCount == (int32)N0*N1*N2); + assert(taskCount0 == (int32)N0); + assert(taskCount1 == (int32)N1); + assert(taskCount2 == (int32)N2); + assert(taskIndex == (int32)taskIndex0 + (int32)N0*(taskIndex1 +(int32) N1*taskIndex2)); + assert(taskIndex0 < 
(int32)N0); + assert(taskIndex1 < (int32)N1); + assert(taskIndex2 < (int32)N2); const uniform int i0 = taskIndex0; const uniform int i1 = taskIndex1; diff --git a/tests/launch-9.ispc b/tests/launch-9.ispc index 761b070c..1952e8e7 100644 --- a/tests/launch-9.ispc +++ b/tests/launch-9.ispc @@ -10,14 +10,14 @@ static uniform float array[N2][N1][N0]; task void x(const float f) { uniform int j; - assert(taskCount == N0*N1*N2); - assert(taskCount0 == N0); - assert(taskCount1 == N1); - assert(taskCount2 == N2); - assert(taskIndex == taskIndex0 + N0*(taskIndex1 + N1*taskIndex2)); - assert(taskIndex0 < N0); - assert(taskIndex1 < N1); - assert(taskIndex2 < N2); + assert(taskCount == (int32)N0*N1*N2); + assert(taskCount0 == (int32)N0); + assert(taskCount1 == (int32)N1); + assert(taskCount2 == (int32)N2); + assert(taskIndex == (int32)taskIndex0 + (int32)N0*(taskIndex1 +(int32) N1*taskIndex2)); + assert(taskIndex0 < (int32)N0); + assert(taskIndex1 < (int32)N1); + assert(taskIndex2 < (int32)N2); const uniform int i0 = taskIndex0; const uniform int i1 = taskIndex1; From 473f1cb4d2f196e20bed159aef7a041053173f80 Mon Sep 17 00:00:00 2001 From: Ilia Filippov Date: Tue, 10 Dec 2013 20:39:24 +0400 Subject: [PATCH 8/8] packed_store_active2 --- builtins.cpp | 1 + builtins/util.m4 | 45 ++++++++++++++++++++++++++++++++++++++ stdlib.ispc | 13 +++++++++++ tests/packed-store2-1.ispc | 16 ++++++++++++++ tests/packed-store2-2.ispc | 21 ++++++++++++++++++ tests/packed-store2-3.ispc | 17 ++++++++++++++ tests/packed-store2.ispc | 15 +++++++++++++ 7 files changed, 128 insertions(+) create mode 100644 tests/packed-store2-1.ispc create mode 100644 tests/packed-store2-2.ispc create mode 100644 tests/packed-store2-3.ispc create mode 100644 tests/packed-store2.ispc diff --git a/builtins.cpp b/builtins.cpp index 2afd92d9..6be41f13 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -488,6 +488,7 @@ lSetInternalFunctions(llvm::Module *module) { "__num_cores", "__packed_load_active", "__packed_store_active", + 
"__packed_store_active2", "__popcnt_int32", "__popcnt_int64", "__prefetch_read_uniform_1", diff --git a/builtins/util.m4 b/builtins/util.m4 index e1c9bf97..7ce4ab7f 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -3815,6 +3815,51 @@ loopend: done: ret i32 %nextoffset } + +define MASK @__packed_store_active2(i32 * %startptr, <WIDTH x i32> %vals, + <WIDTH x MASK> %full_mask) nounwind alwaysinline { +entry: + %mask = call i64 @__movmsk(<WIDTH x MASK> %full_mask) + %mask_known = call i1 @__is_compile_time_constant_mask(<WIDTH x MASK> %full_mask) + br i1 %mask_known, label %known_mask, label %unknown_mask + +known_mask: + %allon = icmp eq i64 %mask, ALL_ON_MASK + br i1 %allon, label %all_on, label %unknown_mask + +all_on: + %vecptr = bitcast i32 *%startptr to <WIDTH x i32> * + store <WIDTH x i32> %vals, <WIDTH x i32> * %vecptr, align 4 + ret MASK WIDTH + +unknown_mask: + br label %loop + +loop: + %offset = phi MASK [ 0, %unknown_mask ], [ %ch_offset, %loop ] + %i = phi i32 [ 0, %unknown_mask ], [ %ch_i, %loop ] + %storeval = extractelement <WIDTH x i32> %vals, i32 %i + +;; Offset has value in range from 0 to WIDTH-1. So it does not matter if we +;; zero or sign extending it, while zero extend is free. Also do nothing for +;; i64 MASK, as we need i64 value. +ifelse(MASK, `i64', +` %storeptr = getelementptr i32 *%startptr, MASK %offset', +` %offset1 = zext MASK %offset to i64 + %storeptr = getelementptr i32 *%startptr, i64 %offset1') + store i32 %storeval, i32 *%storeptr + + %mull_mask = extractelement <WIDTH x MASK> %full_mask, i32 %i + %ch_offset = sub MASK %offset, %mull_mask + + ; are we done yet?
+ %ch_i = add i32 %i, 1 + %test = icmp ne i32 %ch_i, WIDTH + br i1 %test, label %loop, label %done + +done: + ret MASK %ch_offset +} ') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git a/stdlib.ispc b/stdlib.ispc index 6768594b..3b17283d 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -1209,6 +1209,13 @@ packed_store_active(uniform unsigned int a[], return __packed_store_active(a, vals, (UIntMaskType)__mask); } +static inline uniform int +packed_store_active2(uniform unsigned int a[], + unsigned int vals) { + return __packed_store_active2(a, vals, (UIntMaskType)__mask); +} + + static inline uniform int packed_load_active(uniform int a[], varying int * uniform vals) { return __packed_load_active(a, vals, (IntMaskType)__mask); @@ -1219,6 +1226,12 @@ packed_store_active(uniform int a[], int vals) { return __packed_store_active(a, vals, (IntMaskType)__mask); } +static inline uniform int +packed_store_active2(uniform int a[], int vals) { + return __packed_store_active2(a, vals, (IntMaskType)__mask); +} + + /////////////////////////////////////////////////////////////////////////// // System information diff --git a/tests/packed-store2-1.ispc b/tests/packed-store2-1.ispc new file mode 100644 index 00000000..0ca3230a --- /dev/null +++ b/tests/packed-store2-1.ispc @@ -0,0 +1,16 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { + float a = aFOO[programIndex]; + uniform int pack[2+programCount]; + for (uniform int i = 0; i < 2+programCount; ++i) + pack[i] = 0; + packed_store_active2(&pack[2], a); + RET[programIndex] = pack[programIndex]; +} + +export void result(uniform float RET[]) { + RET[programIndex] = programIndex-1; + RET[0] = RET[1] = 0; +} diff --git a/tests/packed-store2-2.ispc b/tests/packed-store2-2.ispc new file mode 100644 index 00000000..c29230ca --- /dev/null +++ b/tests/packed-store2-2.ispc @@ -0,0 +1,21 @@ + +export uniform int width() { return 
programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { + float a = aFOO[programIndex]; + uniform int pack[2+programCount]; + uniform int number; + for (uniform int i = 0; i < 2+programCount; ++i) + pack[i] = 0; + if ((int)a & 1) + number = packed_store_active2(&pack[2], a); + pack[2+number] = 0; + RET[programIndex] = pack[programIndex]; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; + uniform int val = 1; + for (uniform int i = 2; i < 2+programCount/2; ++i, val += 2) + RET[i] = val; +} diff --git a/tests/packed-store2-3.ispc b/tests/packed-store2-3.ispc new file mode 100644 index 00000000..9192525e --- /dev/null +++ b/tests/packed-store2-3.ispc @@ -0,0 +1,17 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { + float a = aFOO[programIndex]; + uniform int pack[2+programCount]; + for (uniform int i = 0; i < 2+programCount; ++i) + pack[i] = 0; + uniform int count = 0; + if ((int)a & 1) + count += packed_store_active2(&pack[2], a); + RET[programIndex] = count; +} + +export void result(uniform float RET[]) { + RET[programIndex] = (programCount == 1) ? 1 : programCount/2; +} diff --git a/tests/packed-store2.ispc b/tests/packed-store2.ispc new file mode 100644 index 00000000..13973bc3 --- /dev/null +++ b/tests/packed-store2.ispc @@ -0,0 +1,15 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { + float a = aFOO[programIndex]; + uniform unsigned int pack[programCount]; + for (uniform int i = 0; i < programCount; ++i) + pack[i] = 0; + packed_store_active2(pack, (unsigned int)a); + RET[programIndex] = pack[programIndex]; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1 + programIndex; +}