added stencil

2014-01-05 17:24:38 +01:00
parent 07049d6c76
commit 6e2d59e279
10 changed files with 747 additions and 0 deletions
--- a/examples_ptx/stencil/.gitignore
+++ b/examples_ptx/stencil/.gitignore
@@ -0,0 +1,2 @@
+stencil
+objs
--- a/examples_ptx/stencil/Makefile
+++ b/examples_ptx/stencil/Makefile
@@ -0,0 +1,8 @@
+
+EXAMPLE=stencil
+CPP_SRC=stencil.cpp stencil_serial.cpp
+ISPC_SRC=stencil.ispc
+ISPC_IA_TARGETS=sse2-i32x4,sse4-i32x8,avx1-i32x16,avx2-i32x16
+ISPC_ARM_TARGETS=neon
+
+include ../common.mk
--- a/examples_ptx/stencil/Makefile_cpu
+++ b/examples_ptx/stencil/Makefile_cpu
@@ -0,0 +1,8 @@
+
+EXAMPLE=stencil
+CPP_SRC=stencil.cpp stencil_serial.cpp
+ISPC_SRC=stencil.ispc
+ISPC_IA_TARGETS=avx1-i32x16
+ISPC_ARM_TARGETS=neon
+
+include ../common.mk
--- a/examples_ptx/stencil/Makefile_gpu
+++ b/examples_ptx/stencil/Makefile_gpu
@@ -0,0 +1,10 @@
+PROG=stencil
+ISPC_SRC=stencil.ispc
+CU_SRC=stencil.cu
+CXX_SRC=stencil.cpp  stencil_serial.cpp
+PTXCC_REGMAX=128
+
+include ../common_gpu.mk
+
+
+
--- a/examples_ptx/stencil/stencil.cpp
+++ b/examples_ptx/stencil/stencil.cpp
@@ -0,0 +1,184 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+#ifdef _MSC_VER
+#define _CRT_SECURE_NO_WARNINGS
+#define NOMINMAX
+#pragma warning (disable: 4244)
+#pragma warning (disable: 4305)
+#endif
+
+#include <cstdio>
+#include <algorithm>
+#include <cstring>
+#include <math.h>
+#include "../timing.h"
+#include "../ispc_malloc.h"
+#include "stencil_ispc.h"
+using namespace ispc;
+
+
+extern void loop_stencil_serial(int t0, int t1, int x0, int x1,
+                                int y0, int y1, int z0, int z1,
+                                int Nx, int Ny, int Nz,
+                                const double coef[5], 
+                                const double vsq[],
+                                double Aeven[], double Aodd[]);
+
+
+void InitData(int Nx, int Ny, int Nz, double *A[2], double *vsq) {
+    int offset = 0;
+    for (int z = 0; z < Nz; ++z)
+        for (int y = 0; y < Ny; ++y)
+            for (int x = 0; x < Nx; ++x, ++offset) {
+                A[0][offset] = (x < Nx / 2) ? x / double(Nx) : y / double(Ny);
+                A[1][offset] = 0;
+                vsq[offset] = x*y*z / double(Nx * Ny * Nz);
+            }
+}
+
+
+int main(int argc, char *argv[]) {
+    static unsigned int test_iterations[] = {3, 3, 3};//the last two numbers must be equal here
+    int Nx = 256, Ny = 256, Nz = 256;
+    int width = 4;
+
+    if (argc > 1) {
+        if (strncmp(argv[1], "--scale=", 8) == 0) {
+            double scale = atof(argv[1] + 8);
+            Nx *= scale;
+            Ny *= scale;
+            Nz *= scale;
+        }
+    }
+    if ((argc == 4) || (argc == 5)) {
+        for (int i = 0; i < 3; i++) {
+            test_iterations[i] = atoi(argv[argc - 3 + i]);
+        }
+    }
+
+    double *Aserial[2], *Aispc[2];
+    Aserial[0] = new double [Nx * Ny * Nz];
+    Aserial[1] = new double [Nx * Ny * Nz];
+    Aispc[0] = new double [Nx * Ny * Nz];
+    Aispc[1] = new double [Nx * Ny * Nz];
+    double *vsq = new double [Nx * Ny * Nz];
+    double *coeff = new double[4];
+
+    coeff[0] = 0.5;
+    coeff[1] = -.25;
+    coeff[2] = .125;
+    coeff[3] = -.0625;
+
+    InitData(Nx, Ny, Nz, Aispc, vsq);
+    //
+    // Compute the image using the ispc implementation on one core; report
+    // the minimum time of three runs.
+    // 
+#if 0    
+//#ifndef _CUDA_
+    double minTimeISPC = 1e30;
+    for (int i = 0; i < test_iterations[0]; ++i) {
+        reset_and_start_timer();
+        loop_stencil_ispc(0, 6, width, Nx - width, width, Ny - width,
+                          width, Nz - width, Nx, Ny, Nz, coeff, vsq,
+                          Aispc[0], Aispc[1]);
+        double dt = get_elapsed_msec();
+        printf("@time of ISPC run:\t\t\t[%.3f] msec\n", dt);
+        minTimeISPC = std::min(minTimeISPC, dt);
+    }
+
+    printf("[stencil ispc 1 core]:\t\t[%.3f] msec\n", minTimeISPC);
+
+    InitData(Nx, Ny, Nz, Aispc, vsq);
+#endif
+
+    //
+    // Compute the image using the ispc implementation with tasks; report
+    // the minimum time of three runs.
+    //
+    double minTimeISPCTasks = 1e30;
+    for (int i = 0; i < test_iterations[1]; ++i) {
+        reset_and_start_timer();
+        loop_stencil_ispc_tasks(0, 6, width, Nx - width, width, Ny - width,
+                                width, Nz - width, Nx, Ny, Nz, coeff, vsq,
+                                Aispc[0], Aispc[1]);
+        double dt = get_elapsed_msec();
+        printf("@time of ISPC + TASKS run:\t\t\t[%.3f] msec\n", dt);
+        minTimeISPCTasks = std::min(minTimeISPCTasks, dt);
+    }
+
+    printf("[stencil ispc + tasks]:\t\t[%.3f] msec\n", minTimeISPCTasks);
+
+    InitData(Nx, Ny, Nz, Aserial, vsq);
+
+    // 
+    // And run the serial implementation 3 times, again reporting the
+    // minimum time.
+    //
+    double minTimeSerial = 1e30;
+    for (int i = 0; i < test_iterations[2]; ++i) {
+        reset_and_start_timer();
+        loop_stencil_serial(0, 6, width, Nx-width, width, Ny - width,
+                            width, Nz - width, Nx, Ny, Nz, coeff, vsq,
+                            Aserial[0], Aserial[1]);
+        double dt = get_elapsed_msec();
+        printf("@time of serial run:\t\t\t[%.3f] msec\n", dt);
+        minTimeSerial = std::min(minTimeSerial, dt);
+    }
+
+    printf("[stencil serial]:\t\t[%.3f] msec\n", minTimeSerial);
+
+#if 0
+//#ifndef _CUDA_
+    printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n", 
+           minTimeSerial / minTimeISPC, minTimeSerial / minTimeISPCTasks);
+#else
+    printf("\t\t\t\t(%.2fx speedup from ISPC + tasks)\n", 
+           minTimeSerial / minTimeISPCTasks);
+#endif
+
+    // Check for agreement
+    int offset = 0;
+    for (int z = 0; z < Nz; ++z)
+        for (int y = 0; y < Ny; ++y)
+            for (int x = 0; x < Nx; ++x, ++offset) {
+                double error = fabsf((Aserial[1][offset] - Aispc[1][offset]) /
+                                    Aserial[1][offset]);
+                if (error > 1e-4)
+                    printf("Error @ (%d,%d,%d): ispc = %f, serial = %f\n",
+                           x, y, z, Aispc[1][offset], Aserial[1][offset]);
+            }
+
+    return 0;
+}
--- a/examples_ptx/stencil/stencil.cu
+++ b/examples_ptx/stencil/stencil.cu
@@ -0,0 +1,129 @@
+#include "cuda_helpers.cuh"
+
+__device__ static void
+stencil_step( int x0,  int x1,
+              int y0,  int y1,
+              int z0,  int z1,
+              int Nx,  int Ny,  int Nz,
+              const double coef[4],  const double vsq[],
+              const double Ain[],  double Aout[]) 
+{
+  const  int Nxy = Nx * Ny;
+
+
+  const  double coef0 = coef[0];
+  const  double coef1 = coef[1];
+  const  double coef2 = coef[2];
+  const  double coef3 = coef[3];
+  for ( int z = z0; z < z1; z++)
+    for ( int y = y0 ; y < y1; y++)
+      for ( int xb = x0; xb < x1; xb += programCount)
+      {
+        const int x = xb + programIndex;
+
+        int index = (z * Nxy) + (y * Nx) + x;
+#define A_cur(x, y, z) __ldg(&Ain[index + (x) + ((y) * Nx) + ((z) * Nxy)])
+#define A_next(x, y, z) Aout[index + (x) + ((y) * Nx) + ((z) * Nxy)]
+        double div = 
+          coef0 *  A_cur(0, 0, 0) +
+          coef1 * (A_cur(+1, 0, 0) + A_cur(-1, 0, 0) +
+              A_cur(0, +1, 0) + A_cur(0, -1, 0) +
+              A_cur(0, 0, +1) + A_cur(0, 0, -1)) +
+          coef2 * (A_cur(+2, 0, 0) + A_cur(-2, 0, 0) +
+              A_cur(0, +2, 0) + A_cur(0, -2, 0) +
+              A_cur(0, 0, +2) + A_cur(0, 0, -2)) +
+          coef3 * (A_cur(+3, 0, 0) + A_cur(-3, 0, 0) +
+              A_cur(0, +3, 0) + A_cur(0, -3, 0) +
+              A_cur(0, 0, +3) + A_cur(0, 0, -3));
+
+        if (x < x1)
+          A_next(0, 0, 0) = 2.0 * A_cur(0, 0, 0) - A_next(0, 0, 0) + 
+            __ldg(&vsq[index]) * div;
+      }
+}
+
+
+#define SPANX 32
+#define SPANY 2
+#define SPANZ 4
+
+__global__  void
+stencil_step_task( int x0,  int x1,
+                   int y0,  int y1,
+                   int z0,  int z1,
+                   int Nx,  int Ny,  int Nz,
+                   const double coef[4],  const double vsq[],
+                   const double Ain[],  double Aout[]) {
+  if (taskIndex0 >= taskCount0 || 
+      taskIndex1 >= taskCount1 || 
+      taskIndex2 >= taskCount2)
+    return;
+
+  const  int xfirst = x0 + taskIndex0 * SPANX;
+  const  int xlast  = min(x1, xfirst + SPANX);
+
+  const  int yfirst = y0 + taskIndex1 * SPANY;
+  const  int ylast  = min(y1, yfirst + SPANY);
+
+  const  int zfirst = z0 + taskIndex2 * SPANZ;
+  const  int zlast  = min(z1, zfirst + SPANZ);
+
+  stencil_step(xfirst,xlast, yfirst,ylast, zfirst,zlast,
+      Nx, Ny, Nz, coef, vsq, Ain, Aout);
+}
+
+
+
+extern "C"
+__global__ void
+loop_stencil_ispc_tasks___export( int t0,  int t1, 
+                         int x0,  int x1,
+                         int y0,  int y1,
+                         int z0,  int z1,
+                         int Nx,  int Ny,  int Nz,
+                         const double coef[4], 
+                         const double vsq[],
+                         double Aeven[],  double Aodd[])
+{
+#define NB(x,n) (((x)+(n)-1)/(n))
+
+  dim3 grid((NB(x1-x0,SPANX)-1)/4+1, NB(y1-y0,SPANY), NB(z1-z0,SPANZ));
+
+    for ( int t = t0; t < t1; ++t) 
+    {
+      // Parallelize across cores as well: each task will work on a slice
+      // of 1 in the z extent of the volume.
+      if ((t & 1) == 0)
+      {
+        if (programIndex == 0)
+          stencil_step_task<<<grid,128>>>(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, 
+              coef, vsq, Aeven, Aodd);
+      }
+      else
+      {
+        if (programIndex == 0)
+          stencil_step_task<<<grid,128>>>(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, 
+              coef, vsq, Aodd, Aeven);
+      }
+
+      // We need to wait for all of the launched tasks to finish before
+      // starting the next iteration
+      cudaDeviceSynchronize();
+    }
+}
+
+extern "C"
+__host__ void
+loop_stencil_ispc_tasks( int t0,  int t1, 
+                         int x0,  int x1,
+                         int y0,  int y1,
+                         int z0,  int z1,
+                         int Nx,  int Ny,  int Nz,
+                         const double coef[4], 
+                         const double vsq[],
+                         double Aeven[],  double Aodd[])
+{
+  loop_stencil_ispc_tasks___export<<<1,32>>>(t0,t1,x0,x1,y0,y1,z0,z1,Nx,Ny,Nz,coef,vsq,Aeven,Aodd);
+  cudaDeviceSynchronize();
+}
+
--- a/examples_ptx/stencil/stencil.ispc
+++ b/examples_ptx/stencil/stencil.ispc
@@ -0,0 +1,164 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+static inline void
+stencil_step(uniform int x0, uniform int x1,
+             uniform int y0, uniform int y1,
+             uniform int z0, uniform int z1,
+             uniform int Nx, uniform int Ny, uniform int Nz,
+             uniform const double coef[4], uniform const double vsq[],
+             uniform const double Ain[], uniform double Aout[]) {
+    const uniform int Nxy = Nx * Ny;
+
+#if 0
+#define VER1
+#endif
+
+#ifdef VER1
+    const uniform int x1o = 1;
+    const uniform int x2o = 2;
+    const uniform int x3o = 3;
+    const uniform int y1o = Nx;
+    const uniform int y2o = Nx*2;
+    const uniform int y3o = Nx*3;
+    const uniform int z1o = Nxy;
+    const uniform int z2o = Nxy*2;
+    const uniform int z3o = Nxy*3;
+#endif
+
+    foreach (z = z0 ... z1, y = y0 ... y1, x = x0 ... x1) 
+    {
+#ifdef VER1
+
+      int index = (z * Nxy) + (y * Nx) + x;
+#define A_cur(x, y, z)  Ain [index + (x) + (y) + (z)]
+#define A_next(x, y, z) Aout[index + (x) + (y) + (z)]
+          double div = coef[0] * A_cur(0, 0, 0) +
+            coef[1] * (A_cur(+x1o, 0, 0) + A_cur(-x1o, 0, 0) +
+                A_cur(0, +y1o, 0) + A_cur(0, -y1o, 0) +
+                A_cur(0, 0, +z1o) + A_cur(0, 0, -z1o)) +
+            coef[2] * (A_cur(+x2o, 0, 0) + A_cur(-x2o, 0, 0) +
+                A_cur(0, +y2o, 0) + A_cur(0, -y2o, 0) +
+                A_cur(0, 0, +z2o) + A_cur(0, 0, -z2o)) +
+            coef[3] * (A_cur(+x3o, 0, 0) + A_cur(-x3o, 0, 0) + 
+                A_cur(0, +y3o, 0) + A_cur(0, -y3o, 0) +
+                A_cur(0, 0, +z3o) + A_cur(0, 0, -z3o));
+            A_next(0, 0, 0) = 2.0d0 * A_cur(0, 0, 0) - A_next(0, 0, 0) + 
+              vsq[index] * div;
+
+#else
+
+      int index = (z * Nxy) + (y * Nx) + x;
+#define A_cur(x, y, z) Ain[index + (x) + ((y) * Nx) + ((z) * Nxy)]
+#define A_next(x, y, z) Aout[index + (x) + ((y) * Nx) + ((z) * Nxy)]
+      double div = coef[0] * A_cur(0, 0, 0) +
+        coef[1] * (A_cur(+1, 0, 0) + A_cur(-1, 0, 0) +
+            A_cur(0, +1, 0) + A_cur(0, -1, 0) +
+            A_cur(0, 0, +1) + A_cur(0, 0, -1)) +
+        coef[2] * (A_cur(+2, 0, 0) + A_cur(-2, 0, 0) +
+            A_cur(0, +2, 0) + A_cur(0, -2, 0) +
+            A_cur(0, 0, +2) + A_cur(0, 0, -2)) +
+        coef[3] * (A_cur(+3, 0, 0) + A_cur(-3, 0, 0) +
+            A_cur(0, +3, 0) + A_cur(0, -3, 0) +
+            A_cur(0, 0, +3) + A_cur(0, 0, -3));
+
+      A_next(0, 0, 0) = 2.0 * A_cur(0, 0, 0) - A_next(0, 0, 0) + 
+        vsq[index] * div;
+
+#endif
+
+    }
+}
+
+#define SPANX 32
+#define SPANY 2
+#define SPANZ 4
+
+static task void
+stencil_step_task(uniform int x0, uniform int x1,
+                  uniform int y0, uniform int y1,
+                  uniform int z0, uniform int z1,
+                  uniform int Nx, uniform int Ny, uniform int Nz,
+                  uniform const double coef[4], uniform const double vsq[],
+                  uniform const double Ain[], uniform double Aout[]) {
+  if (taskIndex0 >= taskCount0 || 
+      taskIndex1 >= taskCount1 || 
+      taskIndex2 >= taskCount2)
+    return;
+
+  const uniform int xfirst = x0 + taskIndex0 * SPANX;
+  const uniform int xlast  = min(x1, xfirst + SPANX);
+
+  const uniform int yfirst = y0 + taskIndex1 * SPANY;
+  const uniform int ylast  = min(y1, yfirst + SPANY);
+
+  const uniform int zfirst = z0 + taskIndex2 * SPANZ;
+  const uniform int zlast  = min(z1, zfirst + SPANZ);
+
+  stencil_step(xfirst,xlast, yfirst,ylast, zfirst,zlast,
+      Nx, Ny, Nz, coef, vsq, Ain, Aout);
+}
+
+
+
+export void
+loop_stencil_ispc_tasks(uniform int t0, uniform int t1, 
+                        uniform int x0, uniform int x1,
+                        uniform int y0, uniform int y1,
+                        uniform int z0, uniform int z1,
+                        uniform int Nx, uniform int Ny, uniform int Nz,
+                        uniform const double coef[4], 
+                        uniform const double vsq[],
+                        uniform double Aeven[], uniform double Aodd[])
+{
+#define NB(x,n) (((x)+(n)-1)/(n))
+
+    for (uniform int t = t0; t < t1; ++t) 
+    {
+      // Parallelize across cores as well: each task will work on a slice
+      // of 1 in the z extent of the volume.
+      if ((t & 1) == 0)
+        launch[NB(z1-z0,SPANZ)][NB(y1-y0,SPANY)][NB(x1-x0,SPANX)] 
+          stencil_step_task(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, 
+            coef, vsq, Aeven, Aodd);
+      else
+         launch[NB(z1-z0,SPANZ)][NB(y1-y0,SPANY)][NB(x1-x0,SPANX)] 
+           stencil_step_task(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, 
+             coef, vsq, Aodd, Aeven);
+
+      // We need to wait for all of the launched tasks to finish before
+      // starting the next iteration.
+      sync;
+    }
+}
+
--- a/examples_ptx/stencil/stencil.orig.ispc
+++ b/examples_ptx/stencil/stencil.orig.ispc
@@ -0,0 +1,122 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+
+static void
+stencil_step(uniform int x0, uniform int x1,
+             uniform int y0, uniform int y1,
+             uniform int z0, uniform int z1,
+             uniform int Nx, uniform int Ny, uniform int Nz,
+             uniform const double coef[4], uniform const double vsq[],
+             uniform const double Ain[], uniform double Aout[]) {
+    const uniform int Nxy = Nx * Ny;
+
+    foreach (z = z0 ... z1, y = y0 ... y1, x = x0 ... x1) {
+        int index = (z * Nxy) + (y * Nx) + x;
+#define A_cur(x, y, z) Ain[index + (x) + ((y) * Nx) + ((z) * Nxy)]
+#define A_next(x, y, z) Aout[index + (x) + ((y) * Nx) + ((z) * Nxy)]
+        double div = coef[0] * A_cur(0, 0, 0) +
+            coef[1] * (A_cur(+1, 0, 0) + A_cur(-1, 0, 0) +
+                       A_cur(0, +1, 0) + A_cur(0, -1, 0) +
+                       A_cur(0, 0, +1) + A_cur(0, 0, -1)) +
+            coef[2] * (A_cur(+2, 0, 0) + A_cur(-2, 0, 0) +
+                       A_cur(0, +2, 0) + A_cur(0, -2, 0) +
+                       A_cur(0, 0, +2) + A_cur(0, 0, -2)) +
+            coef[3] * (A_cur(+3, 0, 0) + A_cur(-3, 0, 0) +
+                       A_cur(0, +3, 0) + A_cur(0, -3, 0) +
+                       A_cur(0, 0, +3) + A_cur(0, 0, -3));
+
+        A_next(0, 0, 0) = 2.0d0 * A_cur(0, 0, 0) - A_next(0, 0, 0) + 
+            vsq[index] * div;
+    }
+}
+
+
+static task void
+stencil_step_task(uniform int x0, uniform int x1,
+                  uniform int y0, uniform int y1,
+                  uniform int z0,
+                  uniform int Nx, uniform int Ny, uniform int Nz,
+                  uniform const double coef[4], uniform const double vsq[],
+                  uniform const double Ain[], uniform double Aout[]) {
+    stencil_step(x0, x1, y0, y1, z0+taskIndex, z0+taskIndex+1,
+                 Nx, Ny, Nz, coef, vsq, Ain, Aout);
+}
+
+
+export void
+loop_stencil_ispc_tasks(uniform int t0, uniform int t1, 
+                        uniform int x0, uniform int x1,
+                        uniform int y0, uniform int y1,
+                        uniform int z0, uniform int z1,
+                        uniform int Nx, uniform int Ny, uniform int Nz,
+                        uniform const double coef[4], 
+                        uniform const double vsq[],
+                        uniform double Aeven[], uniform double Aodd[])
+{
+    for (uniform int t = t0; t < t1; ++t) {
+        // Parallelize across cores as well: each task will work on a slice
+        // of 1 in the z extent of the volume.
+        if ((t & 1) == 0)
+            launch[z1-z0] stencil_step_task(x0, x1, y0, y1, z0, Nx, Ny, Nz, 
+                                            coef, vsq, Aeven, Aodd);
+        else
+            launch[z1-z0] stencil_step_task(x0, x1, y0, y1, z0, Nx, Ny, Nz, 
+                                            coef, vsq, Aodd, Aeven);
+
+        // We need to wait for all of the launched tasks to finish before
+        // starting the next iteration.
+        sync;
+    }
+}
+
+
+export void
+loop_stencil_ispc(uniform int t0, uniform int t1, 
+                  uniform int x0, uniform int x1,
+                  uniform int y0, uniform int y1,
+                  uniform int z0, uniform int z1,
+                  uniform int Nx, uniform int Ny, uniform int Nz,
+                  uniform const double coef[4], 
+                  uniform const double vsq[],
+                  uniform double Aeven[], uniform double Aodd[])
+{
+    for (uniform int t = t0; t < t1; ++t) {
+        if ((t & 1) == 0)
+            stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq, 
+                         Aeven, Aodd);
+        else
+            stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq, 
+                         Aodd, Aeven);
+    }
+}
--- a/examples_ptx/stencil/stencil.vcxproj
+++ b/examples_ptx/stencil/stencil.vcxproj
@@ -0,0 +1,34 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{2ef070a1-f62f-4e6a-944b-88d140945c3c}</ProjectGuid>
+    <Keyword>Win32Proj</Keyword>
+    <RootNamespace>rt</RootNamespace>
+    <ISPC_file>stencil</ISPC_file>
+    <default_targets>sse2,sse4-x2,avx1-i32x8</default_targets>
+  </PropertyGroup>
+  <Import Project="..\common.props" />
+  <ItemGroup>
+    <ClCompile Include="stencil.cpp" />
+    <ClCompile Include="stencil_serial.cpp" />
+    <ClCompile Include="../tasksys.cpp" />
+  </ItemGroup>
+</Project>
--- a/examples_ptx/stencil/stencil_serial.cpp
+++ b/examples_ptx/stencil/stencil_serial.cpp
@@ -0,0 +1,86 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+
+static void
+stencil_step(int x0, int x1,
+             int y0, int y1,
+             int z0, int z1,
+             int Nx, int Ny, int Nz,
+             const double coef[4], const double vsq[],
+             const double Ain[], double Aout[]) {
+    int Nxy = Nx * Ny;
+
+    for (int z = z0; z < z1; ++z) {
+        for (int y = y0; y < y1; ++y) {
+            for (int x = x0; x < x1; ++x) {
+                int index = (z * Nxy) + (y * Nx) + x;
+#define A_cur(x, y, z) Ain[index + (x) + ((y) * Nx) + ((z) * Nxy)]
+#define A_next(x, y, z) Aout[index + (x) + ((y) * Nx) + ((z) * Nxy)]
+                double div = coef[0] * A_cur(0, 0, 0) +
+                            coef[1] * (A_cur(+1, 0, 0) + A_cur(-1, 0, 0) +
+                                       A_cur(0, +1, 0) + A_cur(0, -1, 0) +
+                                       A_cur(0, 0, +1) + A_cur(0, 0, -1)) +
+                            coef[2] * (A_cur(+2, 0, 0) + A_cur(-2, 0, 0) +
+                                       A_cur(0, +2, 0) + A_cur(0, -2, 0) +
+                                       A_cur(0, 0, +2) + A_cur(0, 0, -2)) +
+                            coef[3] * (A_cur(+3, 0, 0) + A_cur(-3, 0, 0) +
+                                       A_cur(0, +3, 0) + A_cur(0, -3, 0) +
+                                       A_cur(0, 0, +3) + A_cur(0, 0, -3));
+
+                A_next(0, 0, 0) = 2 * A_cur(0, 0, 0) - A_next(0, 0, 0) + 
+                    vsq[index] * div;
+            }
+        }
+    }
+}
+
+
+void loop_stencil_serial(int t0, int t1, 
+                         int x0, int x1,
+                         int y0, int y1,
+                         int z0, int z1,
+                         int Nx, int Ny, int Nz,
+                         const double coef[4], 
+                         const double vsq[],
+                         double Aeven[], double Aodd[])
+{
+    for (int t = t0; t < t1; ++t) {
+        if ((t & 1) == 0)
+            stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq, 
+                         Aeven, Aodd);
+        else
+            stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq, 
+                         Aodd, Aeven);
+    }
+}