added stencil code

This commit is contained in:
Evghenii
2013-11-18 12:04:00 +01:00
parent 8d4dd13750
commit 589538bf39
38 changed files with 72 additions and 8820 deletions

View File

@@ -10,7 +10,8 @@ CCFLAGS+=-Iobjs/ -O2
LIBS=-lm $(TASK_LIB) -lstdc++
ISPC=ispc
ISPC_FLAGS+=-O2 --opt=fast-math --math-lib=default
ISPC_FLAGS+=-O2
ISPC_FLAGS+=--opt=fast-math --math-lib=default
ISPC_HEADER=objs/$(ISPC_SRC:.ispc=_ispc.h)
ARCH:=$(shell uname -m | sed -e s/x86_64/x86/ -e s/i686/x86/ -e s/arm.*/arm/ -e s/sa110/arm/)

Binary file not shown.

File diff suppressed because it is too large Load Diff

View File

@@ -1,370 +0,0 @@
/*
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#ifndef _DRVAPI_ERROR_STRING_H_
#define _DRVAPI_ERROR_STRING_H_
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
// Error Code string definitions here
typedef struct
{
char const *error_string;
int error_id;
} s_CudaErrorStr;
/**
* Error codes
*/
static s_CudaErrorStr sCudaDrvErrorString[] =
{
/**
* The API call returned with no errors. In the case of query calls, this
* can also mean that the operation being queried is complete (see
* ::cuEventQuery() and ::cuStreamQuery()).
*/
{ "CUDA_SUCCESS", 0 },
/**
* This indicates that one or more of the parameters passed to the API call
* is not within an acceptable range of values.
*/
{ "CUDA_ERROR_INVALID_VALUE", 1 },
/**
* The API call failed because it was unable to allocate enough memory to
* perform the requested operation.
*/
{ "CUDA_ERROR_OUT_OF_MEMORY", 2 },
/**
* This indicates that the CUDA driver has not been initialized with
* ::cuInit() or that initialization has failed.
*/
{ "CUDA_ERROR_NOT_INITIALIZED", 3 },
/**
* This indicates that the CUDA driver is in the process of shutting down.
*/
{ "CUDA_ERROR_DEINITIALIZED", 4 },
/**
* This indicates profiling APIs are called while application is running
* in visual profiler mode.
*/
{ "CUDA_ERROR_PROFILER_DISABLED", 5 },
/**
* This indicates profiling has not been initialized for this context.
* Call cuProfilerInitialize() to resolve this.
*/
{ "CUDA_ERROR_PROFILER_NOT_INITIALIZED", 6 },
/**
* This indicates profiler has already been started and probably
* cuProfilerStart() is incorrectly called.
*/
{ "CUDA_ERROR_PROFILER_ALREADY_STARTED", 7 },
/**
* This indicates profiler has already been stopped and probably
* cuProfilerStop() is incorrectly called.
*/
{ "CUDA_ERROR_PROFILER_ALREADY_STOPPED", 8 },
/**
* This indicates that no CUDA-capable devices were detected by the installed
* CUDA driver.
*/
{ "CUDA_ERROR_NO_DEVICE (no CUDA-capable devices were detected)", 100 },
/**
* This indicates that the device ordinal supplied by the user does not
* correspond to a valid CUDA device.
*/
{ "CUDA_ERROR_INVALID_DEVICE (device specified is not a valid CUDA device)", 101 },
/**
* This indicates that the device kernel image is invalid. This can also
* indicate an invalid CUDA module.
*/
{ "CUDA_ERROR_INVALID_IMAGE", 200 },
/**
* This most frequently indicates that there is no context bound to the
* current thread. This can also be returned if the context passed to an
* API call is not a valid handle (such as a context that has had
* ::cuCtxDestroy() invoked on it). This can also be returned if a user
* mixes different API versions (i.e. 3010 context with 3020 API calls).
* See ::cuCtxGetApiVersion() for more details.
*/
{ "CUDA_ERROR_INVALID_CONTEXT", 201 },
/**
* This indicated that the context being supplied as a parameter to the
* API call was already the active context.
* \deprecated
* This error return is deprecated as of CUDA 3.2. It is no longer an
* error to attempt to push the active context via ::cuCtxPushCurrent().
*/
{ "CUDA_ERROR_CONTEXT_ALREADY_CURRENT", 202 },
/**
* This indicates that a map or register operation has failed.
*/
{ "CUDA_ERROR_MAP_FAILED", 205 },
/**
* This indicates that an unmap or unregister operation has failed.
*/
{ "CUDA_ERROR_UNMAP_FAILED", 206 },
/**
* This indicates that the specified array is currently mapped and thus
* cannot be destroyed.
*/
{ "CUDA_ERROR_ARRAY_IS_MAPPED", 207 },
/**
* This indicates that the resource is already mapped.
*/
{ "CUDA_ERROR_ALREADY_MAPPED", 208 },
/**
* This indicates that there is no kernel image available that is suitable
* for the device. This can occur when a user specifies code generation
* options for a particular CUDA source file that do not include the
* corresponding device configuration.
*/
{ "CUDA_ERROR_NO_BINARY_FOR_GPU", 209 },
/**
* This indicates that a resource has already been acquired.
*/
{ "CUDA_ERROR_ALREADY_ACQUIRED", 210 },
/**
* This indicates that a resource is not mapped.
*/
{ "CUDA_ERROR_NOT_MAPPED", 211 },
/**
* This indicates that a mapped resource is not available for access as an
* array.
*/
{ "CUDA_ERROR_NOT_MAPPED_AS_ARRAY", 212 },
/**
* This indicates that a mapped resource is not available for access as a
* pointer.
*/
{ "CUDA_ERROR_NOT_MAPPED_AS_POINTER", 213 },
/**
* This indicates that an uncorrectable ECC error was detected during
* execution.
*/
{ "CUDA_ERROR_ECC_UNCORRECTABLE", 214 },
/**
* This indicates that the ::CUlimit passed to the API call is not
* supported by the active device.
*/
{ "CUDA_ERROR_UNSUPPORTED_LIMIT", 215 },
/**
* This indicates that the ::CUcontext passed to the API call can
* only be bound to a single CPU thread at a time but is already
* bound to a CPU thread.
*/
{ "CUDA_ERROR_CONTEXT_ALREADY_IN_USE", 216 },
/**
* This indicates that peer access is not supported across the given
* devices.
*/
{ "CUDA_ERROR_PEER_ACCESS_UNSUPPORTED", 217},
/**
* This indicates that the device kernel source is invalid.
*/
{ "CUDA_ERROR_INVALID_SOURCE", 300 },
/**
* This indicates that the file specified was not found.
*/
{ "CUDA_ERROR_FILE_NOT_FOUND", 301 },
/**
* This indicates that a link to a shared object failed to resolve.
*/
{ "CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND", 302 },
/**
* This indicates that initialization of a shared object failed.
*/
{ "CUDA_ERROR_SHARED_OBJECT_INIT_FAILED", 303 },
/**
* This indicates that an OS call failed.
*/
{ "CUDA_ERROR_OPERATING_SYSTEM", 304 },
/**
* This indicates that a resource handle passed to the API call was not
* valid. Resource handles are opaque types like ::CUstream and ::CUevent.
*/
{ "CUDA_ERROR_INVALID_HANDLE", 400 },
/**
* This indicates that a named symbol was not found. Examples of symbols
* are global/constant variable names, texture names }, and surface names.
*/
{ "CUDA_ERROR_NOT_FOUND", 500 },
/**
* This indicates that asynchronous operations issued previously have not
* completed yet. This result is not actually an error, but must be indicated
* differently than ::CUDA_SUCCESS (which indicates completion). Calls that
* may return this value include ::cuEventQuery() and ::cuStreamQuery().
*/
{ "CUDA_ERROR_NOT_READY", 600 },
/**
* An exception occurred on the device while executing a kernel. Common
* causes include dereferencing an invalid device pointer and accessing
* out of bounds shared memory. The context cannot be used }, so it must
* be destroyed (and a new one should be created). All existing device
* memory allocations from this context are invalid and must be
* reconstructed if the program is to continue using CUDA.
*/
{ "CUDA_ERROR_LAUNCH_FAILED", 700 },
/**
* This indicates that a launch did not occur because it did not have
* appropriate resources. This error usually indicates that the user has
* attempted to pass too many arguments to the device kernel, or the
* kernel launch specifies too many threads for the kernel's register
* count. Passing arguments of the wrong size (i.e. a 64-bit pointer
* when a 32-bit int is expected) is equivalent to passing too many
* arguments and can also result in this error.
*/
{ "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES", 701 },
/**
* This indicates that the device kernel took too long to execute. This can
* only occur if timeouts are enabled - see the device attribute
* ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT for more information. The
* context cannot be used (and must be destroyed similar to
* ::CUDA_ERROR_LAUNCH_FAILED). All existing device memory allocations from
* this context are invalid and must be reconstructed if the program is to
* continue using CUDA.
*/
{ "CUDA_ERROR_LAUNCH_TIMEOUT", 702 },
/**
* This error indicates a kernel launch that uses an incompatible texturing
* mode.
*/
{ "CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING", 703 },
/**
* This error indicates that a call to ::cuCtxEnablePeerAccess() is
* trying to re-enable peer access to a context which has already
* had peer access to it enabled.
*/
{ "CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED", 704 },
/**
* This error indicates that ::cuCtxDisablePeerAccess() is
* trying to disable peer access which has not been enabled yet
* via ::cuCtxEnablePeerAccess().
*/
{ "CUDA_ERROR_PEER_ACCESS_NOT_ENABLED", 705 },
/**
* This error indicates that the primary context for the specified device
* has already been initialized.
*/
{ "CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE", 708 },
/**
* This error indicates that the context current to the calling thread
* has been destroyed using ::cuCtxDestroy }, or is a primary context which
* has not yet been initialized.
*/
{ "CUDA_ERROR_CONTEXT_IS_DESTROYED", 709 },
/**
* A device-side assert triggered during kernel execution. The context
* cannot be used anymore, and must be destroyed. All existing device
* memory allocations from this context are invalid and must be
* reconstructed if the program is to continue using CUDA.
*/
{ "CUDA_ERROR_ASSERT", 710 },
/**
* This error indicates that the hardware resources required to enable
* peer access have been exhausted for one or more of the devices
* passed to ::cuCtxEnablePeerAccess().
*/
{ "CUDA_ERROR_TOO_MANY_PEERS", 711 },
/**
* This error indicates that the memory range passed to ::cuMemHostRegister()
* has already been registered.
*/
{ "CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED", 712 },
/**
* This error indicates that the pointer passed to ::cuMemHostUnregister()
* does not correspond to any currently registered memory region.
*/
{ "CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED", 713 },
/**
* This error indicates that the attempted operation is not permitted.
*/
{ "CUDA_ERROR_NOT_PERMITTED", 800 },
/**
* This error indicates that the attempted operation is not supported
* on the current system or device.
*/
{ "CUDA_ERROR_NOT_SUPPORTED", 801 },
/**
* This indicates that an unknown internal error has occurred.
*/
{ "CUDA_ERROR_UNKNOWN", 999 },
{ NULL, -1 }
};
// This is just a linear search through the array, since the error_id's are not
// always ocurring consecutively
const char * getCudaDrvErrorString(CUresult error_id)
{
int index = 0;
while (sCudaDrvErrorString[index].error_id != error_id &&
sCudaDrvErrorString[index].error_id != -1)
{
index++;
}
if (sCudaDrvErrorString[index].error_id == error_id)
return (const char *)sCudaDrvErrorString[index].error_string;
else
return (const char *)"CUDA_ERROR not found!";
}
#endif

File diff suppressed because it is too large Load Diff

Binary file not shown.

View File

@@ -1,6 +1,11 @@
#define programCount 32
#define programIndex (threadIdx.x & 31)
#define taskIndex (blockIdx.x*4 + (threadIdx.x >> 5))
#define taskIndex0 (blockIdx.x*4 + (threadIdx.x >> 5))
#define taskIndex1 (blockIdx.y)
#define taskIndex2 (blockIdx.z)
#define taskCount0 (gridDim.x*4)
#define taskCount1 (gridDim.y)
#define taskCount2 (gridDim.z)
__device__ static void
stencil_step( int x0, int x1,
@@ -48,15 +53,71 @@ stencil_step( int x0, int x1,
}
extern "C"
#define SPANX 32
#define SPANY 8
#define SPANZ 8
__global__ void
stencil_step_task( int x0, int x1,
int y0, int y1,
int z0,
int z0, int z1,
int Nx, int Ny, int Nz,
const double coef[4], const double vsq[],
const double Ain[], double Aout[]) {
stencil_step(x0, x1, y0, y1, z0+taskIndex, z0+taskIndex+1,
Nx, Ny, Nz, coef, vsq, Ain, Aout);
if (taskIndex0 >= taskCount0 ||
taskIndex1 >= taskCount1 ||
taskIndex2 >= taskCount2)
return;
const int xfirst = x0 + taskIndex0 * SPANX;
const int xlast = min(x1, xfirst + SPANX);
const int yfirst = y0 + taskIndex1 * SPANY;
const int ylast = min(y1, yfirst + SPANY);
const int zfirst = z0 + taskIndex2 * SPANZ;
const int zlast = min(z1, zfirst + SPANZ);
stencil_step(xfirst,xlast, yfirst,ylast, zfirst,zlast,
Nx, Ny, Nz, coef, vsq, Ain, Aout);
}
extern "C"
__global__ void
loop_stencil_ispc_tasks( int t0, int t1,
int x0, int x1,
int y0, int y1,
int z0, int z1,
int Nx, int Ny, int Nz,
const double coef[4],
const double vsq[],
double Aeven[], double Aodd[])
{
#define NB(x,n) (((x)+(n)-1)/(n))
dim3 grid((NB(x1-x0,SPANX)-1)/4+1, NB(y1-y0,SPANY), NB(z1-z0,SPANZ));
for ( int t = t0; t < t1; ++t)
{
// Parallelize across cores as well: each task will work on a slice
// of 1 in the z extent of the volume.
if ((t & 1) == 0)
{
if (programIndex == 0)
stencil_step_task<<<grid,128>>>(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz,
coef, vsq, Aeven, Aodd);
}
else
{
if (programIndex == 0)
stencil_step_task<<<grid,128>>>(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz,
coef, vsq, Aodd, Aeven);
}
// We need to wait for all of the launched tasks to finish before
// starting the next iteration
cudaDeviceSynchronize();
}
}

View File

@@ -1,159 +0,0 @@
/*
Copyright (c) 2010-2011, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
static inline void
stencil_step(uniform int x0, uniform int x1,
uniform int y0, uniform int y1,
uniform int z0, uniform int z1,
uniform int Nx, uniform int Ny, uniform int Nz,
uniform const double coef[4], uniform const double vsq[],
uniform const double Ain[], uniform double Aout[]) {
const uniform int Nxy = Nx * Ny;
#if 0
#define VER1
#endif
#ifdef VER1
const uniform int x1o = 1;
const uniform int x2o = 2;
const uniform int x3o = 3;
const uniform int y1o = Nx;
const uniform int y2o = Nx*2;
const uniform int y3o = Nx*3;
const uniform int z1o = Nxy;
const uniform int z2o = Nxy*2;
const uniform int z3o = Nxy*3;
#endif
foreach (z = z0 ... z1, y = y0 ... y1, x = x0 ... x1)
{
const int index= (z * Nxy) + (y * Nx) + x;
#ifndef VER1
#define A_cur(x, y, z) Ain[index + (x) + ((y) * Nx) + ((z) * Nxy)]
#define A_next(x, y, z) Aout[index + (x) + ((y) * Nx) + ((z) * Nxy)]
double div = coef[0] * A_cur(0, 0, 0) +
coef[1] * (A_cur(+1, 0, 0) + A_cur(-1, 0, 0) +
A_cur(0, +1, 0) + A_cur(0, -1, 0) +
A_cur(0, 0, +1) + A_cur(0, 0, -1)) +
coef[2] * (A_cur(+2, 0, 0) + A_cur(-2, 0, 0) +
A_cur(0, +2, 0) + A_cur(0, -2, 0) +
A_cur(0, 0, +2) + A_cur(0, 0, -2)) +
coef[3] * (A_cur(+3, 0, 0) + A_cur(-3, 0, 0) +
A_cur(0, +3, 0) + A_cur(0, -3, 0) +
A_cur(0, 0, +3) + A_cur(0, 0, -3));
#else
#define A_cur(x, y, z) Ain [index + (x) + (y) + (z)]
#define A_next(x, y, z) Aout[index + (x) + (y) + (z)]
double div = coef[0] * A_cur(0, 0, 0) +
coef[1] * (A_cur(+x1o, 0, 0) + A_cur(-x1o, 0, 0) +
A_cur(0, +y1o, 0) + A_cur(0, -y1o, 0) +
A_cur(0, 0, +z1o) + A_cur(0, 0, -z1o)) +
coef[2] * (A_cur(+x2o, 0, 0) + A_cur(-x2o, 0, 0) +
A_cur(0, +y2o, 0) + A_cur(0, -y2o, 0) +
A_cur(0, 0, +z2o) + A_cur(0, 0, -z2o)) +
coef[3] * (A_cur(+x3o, 0, 0) + A_cur(-x3o, 0, 0) +
A_cur(0, +y3o, 0) + A_cur(0, -y3o, 0) +
A_cur(0, 0, +z3o) + A_cur(0, 0, -z3o));
#endif
A_next(0, 0, 0) = 2.0d0 * A_cur(0, 0, 0) - A_next(0, 0, 0) +
vsq[index] * div;
}
}
#define SPANX 32
#define SPANY 8
#define SPANZ 8
static task void
stencil_step_task(uniform int x0, uniform int x1,
uniform int y0, uniform int y1,
uniform int z0, uniform int z1,
uniform int Nx, uniform int Ny, uniform int Nz,
uniform const double coef[4], uniform const double vsq[],
uniform const double Ain[], uniform double Aout[]) {
if (taskIndex0 >= taskCount0 ||
taskIndex1 >= taskCount1 ||
taskIndex2 >= taskCount2)
return;
const uniform int xfirst = x0 + taskIndex0 * SPANX;
const uniform int xlast = min(x1, xfirst + SPANX);
const uniform int yfirst = y0 + taskIndex1 * SPANY;
const uniform int ylast = min(y1, yfirst + SPANY);
const uniform int zfirst = z0 + taskIndex2 * SPANZ;
const uniform int zlast = min(z1, zfirst + SPANZ);
stencil_step(xfirst,xlast, yfirst,ylast, zfirst,zlast,
Nx, Ny, Nz, coef, vsq, Ain, Aout);
}
export void
loop_stencil_ispc_tasks(uniform int t0, uniform int t1,
uniform int x0, uniform int x1,
uniform int y0, uniform int y1,
uniform int z0, uniform int z1,
uniform int Nx, uniform int Ny, uniform int Nz,
uniform const double coef[4],
uniform const double vsq[],
uniform double Aeven[], uniform double Aodd[])
{
#define NB(x,n) (((x)+(n)-1)/(n))
for (uniform int t = t0; t < t1; ++t)
{
// Parallelize across cores as well: each task will work on a slice
// of 1 in the z extent of the volume.
if ((t & 1) == 0)
launch[NB(z1-z0,SPANZ)][NB(y1-y0,SPANY)][NB(x1-x0,SPANX)]
stencil_step_task(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz,
coef, vsq, Aeven, Aodd);
else
launch[NB(z1-z0,SPANZ)][NB(y1-y0,SPANY)][NB(x1-x0,SPANX)]
stencil_step_task(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz,
coef, vsq, Aodd, Aeven);
// We need to wait for all of the launched tasks to finish before
// starting the next iteration.
sync;
}
}

View File

@@ -1,126 +0,0 @@
/*
Copyright (c) 2010-2011, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
static inline void
stencil_step(uniform int x0, uniform int x1,
uniform int y0, uniform int y1,
uniform int z0, uniform int z1,
uniform int Nx, uniform int Ny, uniform int Nz,
uniform const double coef[4], uniform const double vsq[],
uniform const double Ain[], uniform double Aout[]) {
const uniform int Nxy = Nx * Ny;
foreach (z = z0 ... z1, y = y0 ... y1, x = x0 ... x1)
{
int index = (z * Nxy) + (y * Nx) + x;
#define A_cur(x, y, z) Ain[index + (x) + ((y) * Nx) + ((z) * Nxy)]
#define A_next(x, y, z) Aout[index + (x) + ((y) * Nx) + ((z) * Nxy)]
double div = coef[0] * A_cur(0, 0, 0) +
coef[1] * (A_cur(+1, 0, 0) + A_cur(-1, 0, 0) +
A_cur(0, +1, 0) + A_cur(0, -1, 0) +
A_cur(0, 0, +1) + A_cur(0, 0, -1)) +
coef[2] * (A_cur(+2, 0, 0) + A_cur(-2, 0, 0) +
A_cur(0, +2, 0) + A_cur(0, -2, 0) +
A_cur(0, 0, +2) + A_cur(0, 0, -2)) +
coef[3] * (A_cur(+3, 0, 0) + A_cur(-3, 0, 0) +
A_cur(0, +3, 0) + A_cur(0, -3, 0) +
A_cur(0, 0, +3) + A_cur(0, 0, -3));
A_next(0, 0, 0) = 2.0 * A_cur(0, 0, 0) - A_next(0, 0, 0) +
vsq[index] * div;
}
}
#define SPANX 32
#define SPANY 8
#define SPANZ 8
static task void
stencil_step_task(uniform int x0, uniform int x1,
uniform int y0, uniform int y1,
uniform int z0, uniform int z1,
uniform int Nx, uniform int Ny, uniform int Nz,
uniform const double coef[4], uniform const double vsq[],
uniform const double Ain[], uniform double Aout[]) {
if (taskIndex0 >= taskCount0 ||
taskIndex1 >= taskCount1 ||
taskIndex2 >= taskCount2)
return;
const uniform int xfirst = x0 + taskIndex0 * SPANX;
const uniform int xlast = min(x1, xfirst + SPANX);
const uniform int yfirst = y0 + taskIndex1 * SPANY;
const uniform int ylast = min(y1, yfirst + SPANY);
const uniform int zfirst = z0 + taskIndex2 * SPANZ;
const uniform int zlast = min(z1, zfirst + SPANZ);
stencil_step(xfirst,xlast, yfirst,ylast, zfirst,zlast,
Nx, Ny, Nz, coef, vsq, Ain, Aout);
}
export void
loop_stencil_ispc_tasks(uniform int t0, uniform int t1,
uniform int x0, uniform int x1,
uniform int y0, uniform int y1,
uniform int z0, uniform int z1,
uniform int Nx, uniform int Ny, uniform int Nz,
uniform const double coef[4],
uniform const double vsq[],
uniform double Aeven[], uniform double Aodd[])
{
#define NB(x,n) (((x)+(n)-1)/(n))
for (uniform int t = t0; t < t1; ++t)
{
// Parallelize across cores as well: each task will work on a slice
// of 1 in the z extent of the volume.
if ((t & 1) == 0)
launch[NB(z1-z0,SPANZ)][NB(y1-y0,SPANY)][NB(x1-x0,SPANX)]
stencil_step_task(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz,
coef, vsq, Aeven, Aodd);
else
launch[NB(z1-z0,SPANZ)][NB(y1-y0,SPANY)][NB(x1-x0,SPANX)]
stencil_step_task(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz,
coef, vsq, Aodd, Aeven);
// We need to wait for all of the launched tasks to finish before
// starting the next iteration.
sync;
}
}

Binary file not shown.

Binary file not shown.

View File

@@ -1,34 +0,0 @@
//
// stencil_ispc.h
// (Header automatically generated by the ispc compiler.)
// DO NOT EDIT THIS FILE.
//
#ifndef ISPC_STENCIL_ISPC_H
#define ISPC_STENCIL_ISPC_H
#include <stdint.h>
#ifdef __cplusplus
namespace ispc { /* namespace */
#endif // __cplusplus
///////////////////////////////////////////////////////////////////////////
// Functions exported from ispc code
///////////////////////////////////////////////////////////////////////////
#if defined(__cplusplus) && !defined(__ISPC_NO_EXTERN_C)
extern "C" {
#endif // __cplusplus
extern void loop_stencil_ispc_tasks(int32_t t0, int32_t t1, int32_t x0, int32_t x1, int32_t y0, int32_t y1, int32_t z0, int32_t z1, int32_t Nx, int32_t Ny, int32_t Nz, const double * coef, const double * vsq, double * Aeven, double * Aodd);
#if defined(__cplusplus) && !defined(__ISPC_NO_EXTERN_C)
} /* end extern C */
#endif // __cplusplus
#ifdef __cplusplus
} /* namespace */
#endif // __cplusplus
#endif // ISPC_STENCIL_ISPC_H

View File

@@ -1,974 +0,0 @@
; ModuleID = 'stencil_ispc_nvptx64.bc'
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
target triple = "nvptx64"
module asm ""
module asm ".extern .func (.param .b32 func_retval0) cudaLaunchDevice"
module asm "("
module asm " .param .b64 cudaLaunchDevice_param_0,"
module asm " .param .b64 cudaLaunchDevice_param_1,"
module asm " .param .align 4 .b8 cudaLaunchDevice_param_2[12],"
module asm " .param .align 4 .b8 cudaLaunchDevice_param_3[12],"
module asm " .param .b32 cudaLaunchDevice_param_4,"
module asm " .param .b64 cudaLaunchDevice_param_5"
module asm ");"
@constDeltaForeach1 = private unnamed_addr constant [32 x i8] zeroinitializer
@constDeltaForeach4 = private unnamed_addr constant [32 x i8] c"\00\01\02\03\04\05\06\07\08\09\0A\0B\0C\0D\0E\0F\10\11\12\13\14\15\16\17\18\19\1A\1B\1C\1D\1E\1F"
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.z() nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.x() nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.y() nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.z() nounwind readnone
define i32 @__shfl_i32(i32, i32) {
%shfl = tail call i32 asm sideeffect "shfl.idx.b32 $0, $1, $2, 0x1f;", "=r,r,r"(i32 %0, i32 %1)
ret i32 %shfl
}
define float @__shfl_xor_float(float, i32) {
%shfl = tail call float asm sideeffect "shfl.bfly.b32 $0, $1, $2, 0x1f;", "=f,f,r"(float %0, i32 %1)
ret float %shfl
}
define i32 @__shfl_xor_i32(i32, i32) {
%shfl = tail call i32 asm sideeffect "shfl.bfly.b32 $0, $1, $2, 0x1f;", "=r,r,r"(i32 %0, i32 %1)
ret i32 %shfl
}
define float @__fminf(float, float) {
%min = tail call float asm sideeffect "min.f32 $0, $1, $2;", "=f,f,f"(float %0, float %1)
ret float %min
}
define float @__fmaxf(float, float) {
%max = tail call float asm sideeffect "max.f32 $0, $1, $2;", "=f,f,f"(float %0, float %1)
ret float %max
}
define i32 @__ballot(i1) {
%conv = zext i1 %0 to i32
%res = tail call i32 asm sideeffect "{ .reg .pred %p1; \0A setp.ne.u32 %p1, $1, 0; \0A vote.ballot.b32 $0, %p1; \0A }", "=r,r"(i32 %conv)
ret i32 %res
}
define i32 @__lanemask_lt() {
%mask = tail call i32 asm sideeffect "mov.u32 $0, %lanemask_lt;", "=r"()
ret i32 %mask
}
define i8* @ISPCAlloc(i8**, i64, i32) {
ret i8* inttoptr (i64 1 to i8*)
}
declare i64 @cudaGetParameterBuffer(i64, i64)
define i8* @ISPCGetParamBuffer(i8**, i64 %align, i64 %size) {
entry:
%tid.i = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
%and = and i32 %tid.i, 31
%cmp = icmp eq i32 %and, 0
br i1 %cmp, label %if.then, label %if.end
if.then: ; preds = %entry
%ptri64tmp = tail call i64 @cudaGetParameterBuffer(i64 %align, i64 %size)
%phitmp = inttoptr i64 %ptri64tmp to i8*
br label %if.end
if.end: ; preds = %if.then, %entry
%ptri64 = phi i8* [ %phitmp, %if.then ], [ null, %entry ]
ret i8* %ptri64
}
define void @ISPCLaunch(i8**, i8* %func_ptr, i8* %func_args, i32 %ntx, i32 %nty, i32 %ntz) {
entry:
%tid.i = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
%and = and i32 %tid.i, 31
%cmp = icmp eq i32 %and, 0
br i1 %cmp, label %if.then, label %if.end
if.then: ; preds = %entry
%ntxm1 = add nsw i32 %ntx, -1
%ntxm1d4 = ashr i32 %ntxm1, 2
%nbx = add nsw i32 %ntxm1d4, 1
%args_i64 = ptrtoint i8* %func_args to i64
%func_i64 = ptrtoint i8* %func_ptr to i64
%res_tmp = tail call i32 asm sideeffect "{\0A .param .b64 param0;\0A st.param.b64\09[param0+0], $1;\0A .param .b64 param1;\0A st.param.b64\09[param1+0], $2;\0A .param .align 4 .b8 param2[12];\0A st.param.b32\09[param2+0], $3; \0A st.param.b32\09[param2+4], $4; \0A st.param.b32\09[param2+8], $5; \0A .param .align 4 .b8 param3[12];\0A st.param.b32\09[param3+0], $6; \0A st.param.b32\09[param3+4], $7; \0A st.param.b32\09[param3+8], $8; \0A .param .b32 param4;\0A st.param.b32\09[param4+0], $9; \0A .param .b64 param5;\0A st.param.b64\09[param5+0], $10; \0A\0A .param .b32 retval0;\0A call.uni (retval0), \0A cudaLaunchDevice,\0A (\0A param0, \0A param1, \0A param2, \0A param3, \0A param4, \0A param5\0A );\0A ld.param.b32\09$0, [retval0+0];\0A }\0A ", "=r, l,l, r,r,r, r,r,r, r,l"(i64 %func_i64, i64 %args_i64, i32 %nbx, i32 %nty, i32 %ntz, i32 128, i32 1, i32 1, i32 0, i64 0)
br label %if.end
if.end: ; preds = %if.then, %entry
ret void
}
declare i32 @cudaDeviceSynchronize()
define void @ISPCSync(i8*) {
%2 = tail call i32 @cudaDeviceSynchronize()
ret void
}
define i64 @__warpBinExclusiveScan(i1 %p) {
entry:
%conv.i = zext i1 %p to i32
%res.i = tail call i32 asm sideeffect "{ .reg .pred %p1; \0A setp.ne.u32 %p1, $1, 0; \0A vote.ballot.b32 $0, %p1; \0A }", "=r,r"(i32 %conv.i)
%res.i1 = tail call i32 asm sideeffect "popc.b32 $0, $1;", "=r,r"(i32 %res.i)
%mask.i = tail call i32 asm sideeffect "mov.u32 $0, %lanemask_lt;", "=r"()
%and = and i32 %mask.i, %res.i
%res.i2 = tail call i32 asm sideeffect "popc.b32 $0, $1;", "=r,r"(i32 %and)
%retval.sroa.1.4.insert.ext.i = zext i32 %res.i2 to i64
%retval.sroa.1.4.insert.shift.i = shl nuw i64 %retval.sroa.1.4.insert.ext.i, 32
%retval.sroa.0.0.insert.ext.i = zext i32 %res.i1 to i64
%retval.sroa.0.0.insert.insert.i = or i64 %retval.sroa.1.4.insert.shift.i, %retval.sroa.0.0.insert.ext.i
ret i64 %retval.sroa.0.0.insert.insert.i
}
define internal void @stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_(i32 %x0, i32 %x1, i32 %y0, i32 %y1, i32 %z0, i32 %z1, i32 %Nx, i32 %Ny, i32 %Nz, double* %coef, double* %vsq, double* %Ain, double* %Aout) {
allocas:
%bid.i.i = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
%mul_calltmp_.i = shl i32 %bid.i.i, 2
%tid.i.i = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
%bitop.i = ashr i32 %tid.i.i, 5
%add_mul_calltmp__bitop.i = add i32 %bitop.i, %mul_calltmp_.i
%nb.i.i = tail call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
%mul_calltmp_.i57 = shl i32 %nb.i.i, 2
%greaterequal_calltmp_calltmp18 = icmp sge i32 %add_mul_calltmp__bitop.i, %mul_calltmp_.i57
%bid.i.i58 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
%nb.i.i59 = tail call i32 @llvm.nvvm.read.ptx.sreg.nctaid.y()
%greaterequal_calltmp21_calltmp24 = icmp sge i32 %bid.i.i58, %nb.i.i59
%logical_or = or i1 %greaterequal_calltmp_calltmp18, %greaterequal_calltmp21_calltmp24
%bid.i.i60 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z()
%nb.i.i61 = tail call i32 @llvm.nvvm.read.ptx.sreg.nctaid.z()
%greaterequal_calltmp27_calltmp30 = icmp sge i32 %bid.i.i60, %nb.i.i61
%logical_or31 = or i1 %logical_or, %greaterequal_calltmp27_calltmp30
br i1 %logical_or31, label %if_then, label %if_exit
if_then: ; preds = %foreach_reset19.i, %if_exit, %allocas
ret void
if_exit: ; preds = %allocas
%bid.i.i62 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
%mul_calltmp_.i63 = shl i32 %bid.i.i62, 7
%tid.i.i64 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
%bitop.i657375 = add i32 %tid.i.i64, %mul_calltmp_.i63
%mul_calltmp35_ = and i32 %bitop.i657375, -32
%add_x0_load_mul_calltmp35_ = add i32 %mul_calltmp35_, %x0
%add_xfirst_load_ = add i32 %add_x0_load_mul_calltmp35_, 32
%c.i.i = icmp sgt i32 %add_xfirst_load_, %x1
%r.i.i = select i1 %c.i.i, i32 %x1, i32 %add_xfirst_load_
%bid.i.i67 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
%mul_calltmp41_ = shl i32 %bid.i.i67, 3
%add_y0_load_mul_calltmp41_ = add i32 %mul_calltmp41_, %y0
%add_yfirst_load_ = add i32 %add_y0_load_mul_calltmp41_, 8
%bid.i.i70 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z()
%mul_calltmp47_ = shl i32 %bid.i.i70, 3
%add_z0_load_mul_calltmp47_ = add i32 %mul_calltmp47_, %z0
%add_zfirst_load_ = add i32 %add_z0_load_mul_calltmp47_, 8
%c.i.i71 = icmp sgt i32 %add_zfirst_load_, %z1
%r.i.i72 = select i1 %c.i.i71, i32 %z1, i32 %add_zfirst_load_
%mul_Nx_load_Ny_load.i = mul i32 %Ny, %Nx
%nitems29.i = sub i32 %r.i.i, %add_x0_load_mul_calltmp35_
%nextras30.i = srem i32 %nitems29.i, 32
%aligned_end31.i = sub i32 %r.i.i, %nextras30.i
%tid.i4.i = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
%__laneidx.i = and i32 %tid.i4.i, 31
%0 = zext i32 %__laneidx.i to i64
%arrayidx.i = getelementptr [32 x i8]* @constDeltaForeach1, i64 0, i64 %0
%cmp38.i396 = icmp slt i32 %add_z0_load_mul_calltmp47_, %r.i.i72
br i1 %cmp38.i396, label %foreach_test21.i.preheader.lr.ph, label %if_then
foreach_test21.i.preheader.lr.ph: ; preds = %if_exit
%c.i.i68 = icmp sgt i32 %add_yfirst_load_, %y1
%r.i.i69 = select i1 %c.i.i68, i32 %y1, i32 %add_yfirst_load_
%1 = load i8* %arrayidx.i, align 1
%_zext.i394 = zext i8 %1 to i32
%2 = insertelement <1 x i32> undef, i32 %_zext.i394, i32 0
%smear_counter_init.i393 = insertelement <1 x i32> undef, i32 %add_z0_load_mul_calltmp47_, i32 0
%iter_val.i395 = add <1 x i32> %smear_counter_init.i393, %2
%smear_counter_init44.i387 = insertelement <1 x i32> undef, i32 %add_y0_load_mul_calltmp41_, i32 0
%cmp54.i390 = icmp slt i32 %add_y0_load_mul_calltmp41_, %r.i.i69
%before_aligned_end73.i385 = icmp slt i32 %add_x0_load_mul_calltmp35_, %aligned_end31.i
%smear_end_init289.i = insertelement <1 x i32> undef, i32 %r.i.i, i32 0
%Nxy_load298_broadcast_init.i = insertelement <1 x i32> undef, i32 %mul_Nx_load_Ny_load.i, i32 0
%Nx_load300_broadcast_init.i = insertelement <1 x i32> undef, i32 %Nx, i32 0
%Ain_load309_ptr2int.i = ptrtoint double* %Ain to i64
%coef_load314_offset.i = getelementptr double* %coef, i64 1
%coef_load365_offset.i = getelementptr double* %coef, i64 2
%mul__Nx_load385.i = shl i32 %Nx, 1
%mul__Nx_load393.i = mul i32 %Nx, -2
%mul__Nxy_load402.i = shl i32 %mul_Nx_load_Ny_load.i, 1
%mul__Nxy_load410.i = mul i32 %mul_Nx_load_Ny_load.i, -2
%coef_load416_offset.i = getelementptr double* %coef, i64 3
%mul__Nx_load436.i = mul i32 %Nx, 3
%mul__Nx_load444.i = mul i32 %Nx, -3
%mul__Nxy_load453.i = mul i32 %mul_Nx_load_Ny_load.i, 3
%mul__Nxy_load461.i = mul i32 %mul_Nx_load_Ny_load.i, -3
%Aout_load470_ptr2int.i = ptrtoint double* %Aout to i64
%vsq_load488_ptr2int.i = ptrtoint double* %vsq to i64
%3 = sub i32 -9, %y0
%4 = shl i32 %bid.i.i67, 3
%5 = sub i32 %3, %4
%6 = xor i32 %y1, -1
%7 = icmp sgt i32 %5, %6
%smax = select i1 %7, i32 %5, i32 %6
%8 = xor i32 %smax, -1
%9 = sub i32 -9, %z0
%10 = shl i32 %bid.i.i70, 3
%11 = sub i32 %9, %10
%12 = xor i32 %z1, -1
%13 = icmp sgt i32 %11, %12
%smax399 = select i1 %13, i32 %11, i32 %12
%14 = xor i32 %smax399, -1
br label %foreach_test21.i.preheader
foreach_full_body.i: ; preds = %outer_not_in_extras.i.preheader, %foreach_full_body.i
%counter32.4.i386 = phi i32 [ %new_counter279.i, %foreach_full_body.i ], [ %add_x0_load_mul_calltmp35_, %outer_not_in_extras.i.preheader ]
%tid.i.i56 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
%__laneidx80.i = and i32 %tid.i.i56, 31
%15 = zext i32 %__laneidx80.i to i64
%arrayidx81.i = getelementptr [32 x i8]* @constDeltaForeach4, i64 0, i64 %15
%16 = load i8* %arrayidx81.i, align 1
%_zext82.i = zext i8 %16 to i32
%coef_load_offset_load.i = load double* %coef, align 8
%.lhs362.lhs.lhs = extractelement <1 x i32> %mul_z_load297_Nxy_load298_broadcast.i, i32 0
%.lhs362.lhs.rhs.lhs = extractelement <1 x i32> %iter_val50.i392, i32 0
%.lhs362.lhs.rhs = mul i32 %.lhs362.lhs.rhs.lhs, %Nx
%.lhs362.lhs = add i32 %.lhs362.lhs.lhs, %.lhs362.lhs.rhs
%.lhs362.rhs = add i32 %counter32.4.i386, %_zext82.i
%.lhs362 = add i32 %.lhs362.lhs, %.lhs362.rhs
%17 = shl i32 %.lhs362, 3
%iptr__id.i.rhs = sext i32 %17 to i64
%iptr__id.i = add i64 %iptr__id.i.rhs, %Ain_load309_ptr2int.i
%ptr__id.i = inttoptr i64 %iptr__id.i to double*
%val__id.i = load double* %ptr__id.i, align 8
%coef_load94_offset_load.i = load double* %coef_load314_offset.i, align 8
%18 = add i32 %17, 8
%iptr__id.i335.rhs = sext i32 %18 to i64
%iptr__id.i335 = add i64 %iptr__id.i335.rhs, %Ain_load309_ptr2int.i
%ptr__id.i336 = inttoptr i64 %iptr__id.i335 to double*
%val__id.i337 = load double* %ptr__id.i336, align 8
%19 = add i32 %17, -8
%iptr__id.i330.rhs = sext i32 %19 to i64
%iptr__id.i330 = add i64 %iptr__id.i330.rhs, %Ain_load309_ptr2int.i
%ptr__id.i331 = inttoptr i64 %iptr__id.i330 to double*
%val__id.i332 = load double* %ptr__id.i331, align 8
%.lhs365 = add i32 %.lhs362, %Nx
%20 = shl i32 %.lhs365, 3
%iptr__id.i325.rhs = sext i32 %20 to i64
%iptr__id.i325 = add i64 %iptr__id.i325.rhs, %Ain_load309_ptr2int.i
%ptr__id.i326 = inttoptr i64 %iptr__id.i325 to double*
%val__id.i327 = load double* %ptr__id.i326, align 8
%.lhs366 = sub i32 %.lhs362, %Nx
%21 = shl i32 %.lhs366, 3
%iptr__id.i320.rhs = sext i32 %21 to i64
%iptr__id.i320 = add i64 %iptr__id.i320.rhs, %Ain_load309_ptr2int.i
%ptr__id.i321 = inttoptr i64 %iptr__id.i320 to double*
%val__id.i322 = load double* %ptr__id.i321, align 8
%.lhs367 = add i32 %.lhs362, %mul_Nx_load_Ny_load.i
%22 = shl i32 %.lhs367, 3
%iptr__id.i315.rhs = sext i32 %22 to i64
%iptr__id.i315 = add i64 %iptr__id.i315.rhs, %Ain_load309_ptr2int.i
%ptr__id.i316 = inttoptr i64 %iptr__id.i315 to double*
%val__id.i317 = load double* %ptr__id.i316, align 8
%.lhs368 = sub i32 %.lhs362, %mul_Nx_load_Ny_load.i
%23 = shl i32 %.lhs368, 3
%iptr__id.i310.rhs = sext i32 %23 to i64
%iptr__id.i310 = add i64 %iptr__id.i310.rhs, %Ain_load309_ptr2int.i
%ptr__id.i311 = inttoptr i64 %iptr__id.i310 to double*
%val__id.i312 = load double* %ptr__id.i311, align 8
%coef_load145_offset_load.i = load double* %coef_load365_offset.i, align 8
%24 = add i32 %17, 16
%iptr__id.i305.rhs = sext i32 %24 to i64
%iptr__id.i305 = add i64 %iptr__id.i305.rhs, %Ain_load309_ptr2int.i
%ptr__id.i306 = inttoptr i64 %iptr__id.i305 to double*
%val__id.i307 = load double* %ptr__id.i306, align 8
%25 = add i32 %17, -16
%iptr__id.i300.rhs = sext i32 %25 to i64
%iptr__id.i300 = add i64 %iptr__id.i300.rhs, %Ain_load309_ptr2int.i
%ptr__id.i301 = inttoptr i64 %iptr__id.i300 to double*
%val__id.i302 = load double* %ptr__id.i301, align 8
%.lhs371 = add i32 %.lhs362, %mul__Nx_load385.i
%26 = shl i32 %.lhs371, 3
%iptr__id.i295.rhs = sext i32 %26 to i64
%iptr__id.i295 = add i64 %iptr__id.i295.rhs, %Ain_load309_ptr2int.i
%ptr__id.i296 = inttoptr i64 %iptr__id.i295 to double*
%val__id.i297 = load double* %ptr__id.i296, align 8
%.lhs372 = add i32 %.lhs362, %mul__Nx_load393.i
%27 = shl i32 %.lhs372, 3
%iptr__id.i290.rhs = sext i32 %27 to i64
%iptr__id.i290 = add i64 %iptr__id.i290.rhs, %Ain_load309_ptr2int.i
%ptr__id.i291 = inttoptr i64 %iptr__id.i290 to double*
%val__id.i292 = load double* %ptr__id.i291, align 8
%.lhs373 = add i32 %.lhs362, %mul__Nxy_load402.i
%28 = shl i32 %.lhs373, 3
%iptr__id.i285.rhs = sext i32 %28 to i64
%iptr__id.i285 = add i64 %iptr__id.i285.rhs, %Ain_load309_ptr2int.i
%ptr__id.i286 = inttoptr i64 %iptr__id.i285 to double*
%val__id.i287 = load double* %ptr__id.i286, align 8
%.lhs374 = add i32 %.lhs362, %mul__Nxy_load410.i
%29 = shl i32 %.lhs374, 3
%iptr__id.i280.rhs = sext i32 %29 to i64
%iptr__id.i280 = add i64 %iptr__id.i280.rhs, %Ain_load309_ptr2int.i
%ptr__id.i281 = inttoptr i64 %iptr__id.i280 to double*
%val__id.i282 = load double* %ptr__id.i281, align 8
%coef_load196_offset_load.i = load double* %coef_load416_offset.i, align 8
%30 = add i32 %17, 24
%iptr__id.i275.rhs = sext i32 %30 to i64
%iptr__id.i275 = add i64 %iptr__id.i275.rhs, %Ain_load309_ptr2int.i
%ptr__id.i276 = inttoptr i64 %iptr__id.i275 to double*
%val__id.i277 = load double* %ptr__id.i276, align 8
%31 = add i32 %17, -24
%iptr__id.i270.rhs = sext i32 %31 to i64
%iptr__id.i270 = add i64 %iptr__id.i270.rhs, %Ain_load309_ptr2int.i
%ptr__id.i271 = inttoptr i64 %iptr__id.i270 to double*
%val__id.i272 = load double* %ptr__id.i271, align 8
%.lhs377 = add i32 %.lhs362, %mul__Nx_load436.i
%32 = shl i32 %.lhs377, 3
%iptr__id.i265.rhs = sext i32 %32 to i64
%iptr__id.i265 = add i64 %iptr__id.i265.rhs, %Ain_load309_ptr2int.i
%ptr__id.i266 = inttoptr i64 %iptr__id.i265 to double*
%val__id.i267 = load double* %ptr__id.i266, align 8
%.lhs378 = add i32 %.lhs362, %mul__Nx_load444.i
%33 = shl i32 %.lhs378, 3
%iptr__id.i260.rhs = sext i32 %33 to i64
%iptr__id.i260 = add i64 %iptr__id.i260.rhs, %Ain_load309_ptr2int.i
%ptr__id.i261 = inttoptr i64 %iptr__id.i260 to double*
%val__id.i262 = load double* %ptr__id.i261, align 8
%.lhs379 = add i32 %.lhs362, %mul__Nxy_load453.i
%34 = shl i32 %.lhs379, 3
%iptr__id.i255.rhs = sext i32 %34 to i64
%iptr__id.i255 = add i64 %iptr__id.i255.rhs, %Ain_load309_ptr2int.i
%ptr__id.i256 = inttoptr i64 %iptr__id.i255 to double*
%val__id.i257 = load double* %ptr__id.i256, align 8
%.lhs380 = add i32 %.lhs362, %mul__Nxy_load461.i
%35 = shl i32 %.lhs380, 3
%iptr__id.i250.rhs = sext i32 %35 to i64
%iptr__id.i250 = add i64 %iptr__id.i250.rhs, %Ain_load309_ptr2int.i
%ptr__id.i251 = inttoptr i64 %iptr__id.i250 to double*
%val__id.i252 = load double* %ptr__id.i251, align 8
%val__id.i247 = load double* %ptr__id.i, align 8
%iptr__id.i240 = add i64 %iptr__id.i.rhs, %Aout_load470_ptr2int.i
%ptr__id.i241 = inttoptr i64 %iptr__id.i240 to double*
%val__id.i242 = load double* %ptr__id.i241, align 8
%iptr__id.i235 = add i64 %iptr__id.i.rhs, %vsq_load488_ptr2int.i
%ptr__id.i236 = inttoptr i64 %iptr__id.i235 to double*
%val__id.i237 = load double* %ptr__id.i236, align 8
%val__id.i233.lhs.lhs = fmul double %val__id.i247, 2.000000e+00
%val__id.i233.lhs = fsub double %val__id.i233.lhs.lhs, %val__id.i242
%val__id.i233.rhs.rhs.lhs.lhs.lhs = fmul double %coef_load_offset_load.i, %val__id.i
%val__id.i233.rhs.rhs.lhs.lhs.rhs.rhs.lhs.lhs.lhs.lhs = fadd double %val__id.i337, %val__id.i332
%val__id.i233.rhs.rhs.lhs.lhs.rhs.rhs.lhs.lhs.lhs = fadd double %val__id.i233.rhs.rhs.lhs.lhs.rhs.rhs.lhs.lhs.lhs.lhs, %val__id.i327
%val__id.i233.rhs.rhs.lhs.lhs.rhs.rhs.lhs.lhs = fadd double %val__id.i233.rhs.rhs.lhs.lhs.rhs.rhs.lhs.lhs.lhs, %val__id.i322
%val__id.i233.rhs.rhs.lhs.lhs.rhs.rhs.lhs = fadd double %val__id.i233.rhs.rhs.lhs.lhs.rhs.rhs.lhs.lhs, %val__id.i317
%val__id.i233.rhs.rhs.lhs.lhs.rhs.rhs = fadd double %val__id.i233.rhs.rhs.lhs.lhs.rhs.rhs.lhs, %val__id.i312
%val__id.i233.rhs.rhs.lhs.lhs.rhs = fmul double %coef_load94_offset_load.i, %val__id.i233.rhs.rhs.lhs.lhs.rhs.rhs
%val__id.i233.rhs.rhs.lhs.lhs = fadd double %val__id.i233.rhs.rhs.lhs.lhs.lhs, %val__id.i233.rhs.rhs.lhs.lhs.rhs
%val__id.i233.rhs.rhs.lhs.rhs.rhs.lhs.lhs.lhs.lhs = fadd double %val__id.i307, %val__id.i302
%val__id.i233.rhs.rhs.lhs.rhs.rhs.lhs.lhs.lhs = fadd double %val__id.i233.rhs.rhs.lhs.rhs.rhs.lhs.lhs.lhs.lhs, %val__id.i297
%val__id.i233.rhs.rhs.lhs.rhs.rhs.lhs.lhs = fadd double %val__id.i233.rhs.rhs.lhs.rhs.rhs.lhs.lhs.lhs, %val__id.i292
%val__id.i233.rhs.rhs.lhs.rhs.rhs.lhs = fadd double %val__id.i233.rhs.rhs.lhs.rhs.rhs.lhs.lhs, %val__id.i287
%val__id.i233.rhs.rhs.lhs.rhs.rhs = fadd double %val__id.i233.rhs.rhs.lhs.rhs.rhs.lhs, %val__id.i282
%val__id.i233.rhs.rhs.lhs.rhs = fmul double %coef_load145_offset_load.i, %val__id.i233.rhs.rhs.lhs.rhs.rhs
%val__id.i233.rhs.rhs.lhs = fadd double %val__id.i233.rhs.rhs.lhs.lhs, %val__id.i233.rhs.rhs.lhs.rhs
%val__id.i233.rhs.rhs.rhs.rhs.lhs.lhs.lhs.lhs = fadd double %val__id.i277, %val__id.i272
%val__id.i233.rhs.rhs.rhs.rhs.lhs.lhs.lhs = fadd double %val__id.i233.rhs.rhs.rhs.rhs.lhs.lhs.lhs.lhs, %val__id.i267
%val__id.i233.rhs.rhs.rhs.rhs.lhs.lhs = fadd double %val__id.i233.rhs.rhs.rhs.rhs.lhs.lhs.lhs, %val__id.i262
%val__id.i233.rhs.rhs.rhs.rhs.lhs = fadd double %val__id.i233.rhs.rhs.rhs.rhs.lhs.lhs, %val__id.i257
%val__id.i233.rhs.rhs.rhs.rhs = fadd double %val__id.i233.rhs.rhs.rhs.rhs.lhs, %val__id.i252
%val__id.i233.rhs.rhs.rhs = fmul double %coef_load196_offset_load.i, %val__id.i233.rhs.rhs.rhs.rhs
%val__id.i233.rhs.rhs = fadd double %val__id.i233.rhs.rhs.lhs, %val__id.i233.rhs.rhs.rhs
%val__id.i233.rhs = fmul double %val__id.i237, %val__id.i233.rhs.rhs
%val__id.i233 = fadd double %val__id.i233.lhs, %val__id.i233.rhs
store double %val__id.i233, double* %ptr__id.i241, align 8
%new_counter279.i = add i32 %counter32.4.i386, 32
%before_aligned_end73.i = icmp slt i32 %new_counter279.i, %aligned_end31.i
br i1 %before_aligned_end73.i, label %foreach_full_body.i, label %partial_inner_all_outer.i
foreach_test21.i.preheader: ; preds = %foreach_reset19.i, %foreach_test21.i.preheader.lr.ph
%iter_val.i398 = phi <1 x i32> [ %iter_val.i395, %foreach_test21.i.preheader.lr.ph ], [ %iter_val.i, %foreach_reset19.i ]
%counter.0.i397 = phi i32 [ %add_z0_load_mul_calltmp47_, %foreach_test21.i.preheader.lr.ph ], [ %new_counter.i, %foreach_reset19.i ]
%tid.i3.i = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
%__laneidx47.i = and i32 %tid.i3.i, 31
%36 = zext i32 %__laneidx47.i to i64
%arrayidx48.i = getelementptr [32 x i8]* @constDeltaForeach1, i64 0, i64 %36
br i1 %cmp54.i390, label %outer_not_in_extras.i.preheader.lr.ph, label %foreach_reset19.i
outer_not_in_extras.i.preheader.lr.ph: ; preds = %foreach_test21.i.preheader
%37 = load i8* %arrayidx48.i, align 1
%_zext49.i388 = zext i8 %37 to i32
%38 = insertelement <1 x i32> undef, i32 %_zext49.i388, i32 0
%iter_val50.i389 = add <1 x i32> %smear_counter_init44.i387, %38
%mul_z_load297_Nxy_load298_broadcast.i = mul <1 x i32> %iter_val.i398, %Nxy_load298_broadcast_init.i
br label %outer_not_in_extras.i.preheader
foreach_reset19.i: ; preds = %foreach_reset27.i, %foreach_test21.i.preheader
%new_counter.i = add i32 %counter.0.i397, 1
%smear_counter_init.i = insertelement <1 x i32> undef, i32 %new_counter.i, i32 0
%39 = load i8* %arrayidx.i, align 1
%_zext.i = zext i8 %39 to i32
%40 = insertelement <1 x i32> undef, i32 %_zext.i, i32 0
%iter_val.i = add <1 x i32> %smear_counter_init.i, %40
%exitcond400 = icmp eq i32 %new_counter.i, %14
br i1 %exitcond400, label %if_then, label %foreach_test21.i.preheader
outer_not_in_extras.i.preheader: ; preds = %foreach_reset27.i, %outer_not_in_extras.i.preheader.lr.ph
%iter_val50.i392 = phi <1 x i32> [ %iter_val50.i389, %outer_not_in_extras.i.preheader.lr.ph ], [ %iter_val50.i, %foreach_reset27.i ]
%counter25.1.i391 = phi i32 [ %add_y0_load_mul_calltmp41_, %outer_not_in_extras.i.preheader.lr.ph ], [ %new_counter35.i, %foreach_reset27.i ]
br i1 %before_aligned_end73.i385, label %foreach_full_body.i, label %partial_inner_all_outer.i
foreach_reset27.i: ; preds = %pl_dolane.i, %partial_inner_only.i, %partial_inner_all_outer.i
%new_counter35.i = add i32 %counter25.1.i391, 1
%smear_counter_init44.i = insertelement <1 x i32> undef, i32 %new_counter35.i, i32 0
%41 = load i8* %arrayidx48.i, align 1
%_zext49.i = zext i8 %41 to i32
%42 = insertelement <1 x i32> undef, i32 %_zext49.i, i32 0
%iter_val50.i = add <1 x i32> %smear_counter_init44.i, %42
%exitcond = icmp eq i32 %new_counter35.i, %8
br i1 %exitcond, label %foreach_reset19.i, label %outer_not_in_extras.i.preheader
partial_inner_all_outer.i: ; preds = %outer_not_in_extras.i.preheader, %foreach_full_body.i
%counter32.4.i.lcssa = phi i32 [ %add_x0_load_mul_calltmp35_, %outer_not_in_extras.i.preheader ], [ %new_counter279.i, %foreach_full_body.i ]
%before_full_end.i = icmp slt i32 %counter32.4.i.lcssa, %r.i.i
br i1 %before_full_end.i, label %partial_inner_only.i, label %foreach_reset27.i
partial_inner_only.i: ; preds = %partial_inner_all_outer.i
%smear_counter_init282.i = insertelement <1 x i32> undef, i32 %counter32.4.i.lcssa, i32 0
%tid.i2.i = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
%__laneidx285.i = and i32 %tid.i2.i, 31
%43 = zext i32 %__laneidx285.i to i64
%arrayidx286.i = getelementptr [32 x i8]* @constDeltaForeach4, i64 0, i64 %43
%44 = load i8* %arrayidx286.i, align 1
%_zext287.i = zext i8 %44 to i32
%45 = insertelement <1 x i32> undef, i32 %_zext287.i, i32 0
%iter_val288.i = add <1 x i32> %smear_counter_init282.i, %45
%cmp291.i = icmp slt <1 x i32> %iter_val288.i, %smear_end_init289.i
%mul_y_load299_Nx_load300_broadcast.i = mul <1 x i32> %iter_val50.i392, %Nx_load300_broadcast_init.i
%add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast.i = add <1 x i32> %mul_z_load297_Nxy_load298_broadcast.i, %mul_y_load299_Nx_load300_broadcast.i
%add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i = add <1 x i32> %add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast.i, %iter_val288.i
%v.i.i224 = extractelement <1 x i1> %cmp291.i, i32 0
br i1 %v.i.i224, label %pl_dolane.i, label %foreach_reset27.i
pl_dolane.i: ; preds = %partial_inner_only.i
%coef_load303_offset_load.i = load double* %coef, align 8
%.lhs361 = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0
%46 = shl i32 %.lhs361, 3
%iptr__id.i225.rhs = sext i32 %46 to i64
%iptr__id.i225 = add i64 %iptr__id.i225.rhs, %Ain_load309_ptr2int.i
%ptr__id.i226 = inttoptr i64 %iptr__id.i225 to double*
%val__id.i227 = load double* %ptr__id.i226, align 8
%coef_load314_offset_load.i401 = load double* %coef_load314_offset.i, align 8
%.lhs360.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0
%.lhs360 = shl i32 %.lhs360.lhs, 3
%47 = add i32 %.lhs360, 8
%iptr__id.i218.rhs = sext i32 %47 to i64
%iptr__id.i218 = add i64 %iptr__id.i218.rhs, %Ain_load309_ptr2int.i
%ptr__id.i219 = inttoptr i64 %iptr__id.i218 to double*
%val__id.i220 = load double* %ptr__id.i219, align 8
%.lhs359.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0
%.lhs359 = shl i32 %.lhs359.lhs, 3
%48 = add i32 %.lhs359, -8
%iptr__id.i211.rhs = sext i32 %48 to i64
%iptr__id.i211 = add i64 %iptr__id.i211.rhs, %Ain_load309_ptr2int.i
%ptr__id.i212 = inttoptr i64 %iptr__id.i211 to double*
%val__id.i213 = load double* %ptr__id.i212, align 8
%.lhs358.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0
%.lhs358 = add i32 %.lhs358.lhs, %Nx
%49 = shl i32 %.lhs358, 3
%iptr__id.i204.rhs = sext i32 %49 to i64
%iptr__id.i204 = add i64 %iptr__id.i204.rhs, %Ain_load309_ptr2int.i
%ptr__id.i205 = inttoptr i64 %iptr__id.i204 to double*
%val__id.i206 = load double* %ptr__id.i205, align 8
%.lhs357.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0
%.lhs357 = sub i32 %.lhs357.lhs, %Nx
%50 = shl i32 %.lhs357, 3
%iptr__id.i197.rhs = sext i32 %50 to i64
%iptr__id.i197 = add i64 %iptr__id.i197.rhs, %Ain_load309_ptr2int.i
%ptr__id.i198 = inttoptr i64 %iptr__id.i197 to double*
%val__id.i199 = load double* %ptr__id.i198, align 8
%.lhs356.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0
%.lhs356 = add i32 %.lhs356.lhs, %mul_Nx_load_Ny_load.i
%51 = shl i32 %.lhs356, 3
%iptr__id.i190.rhs = sext i32 %51 to i64
%iptr__id.i190 = add i64 %iptr__id.i190.rhs, %Ain_load309_ptr2int.i
%ptr__id.i191 = inttoptr i64 %iptr__id.i190 to double*
%val__id.i192 = load double* %ptr__id.i191, align 8
%.lhs355.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0
%.lhs355 = sub i32 %.lhs355.lhs, %mul_Nx_load_Ny_load.i
%52 = shl i32 %.lhs355, 3
%iptr__id.i183.rhs = sext i32 %52 to i64
%iptr__id.i183 = add i64 %iptr__id.i183.rhs, %Ain_load309_ptr2int.i
%ptr__id.i184 = inttoptr i64 %iptr__id.i183 to double*
%val__id.i185 = load double* %ptr__id.i184, align 8
%coef_load365_offset_load.i457 = load double* %coef_load365_offset.i, align 8
%.lhs354.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0
%.lhs354 = shl i32 %.lhs354.lhs, 3
%53 = add i32 %.lhs354, 16
%iptr__id.i176.rhs = sext i32 %53 to i64
%iptr__id.i176 = add i64 %iptr__id.i176.rhs, %Ain_load309_ptr2int.i
%ptr__id.i177 = inttoptr i64 %iptr__id.i176 to double*
%val__id.i178 = load double* %ptr__id.i177, align 8
%.lhs353.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0
%.lhs353 = shl i32 %.lhs353.lhs, 3
%54 = add i32 %.lhs353, -16
%iptr__id.i169.rhs = sext i32 %54 to i64
%iptr__id.i169 = add i64 %iptr__id.i169.rhs, %Ain_load309_ptr2int.i
%ptr__id.i170 = inttoptr i64 %iptr__id.i169 to double*
%val__id.i171 = load double* %ptr__id.i170, align 8
%.lhs352.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0
%.lhs352 = add i32 %.lhs352.lhs, %mul__Nx_load385.i
%55 = shl i32 %.lhs352, 3
%iptr__id.i162.rhs = sext i32 %55 to i64
%iptr__id.i162 = add i64 %iptr__id.i162.rhs, %Ain_load309_ptr2int.i
%ptr__id.i163 = inttoptr i64 %iptr__id.i162 to double*
%val__id.i164 = load double* %ptr__id.i163, align 8
%.lhs351.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0
%.lhs351 = add i32 %.lhs351.lhs, %mul__Nx_load393.i
%56 = shl i32 %.lhs351, 3
%iptr__id.i155.rhs = sext i32 %56 to i64
%iptr__id.i155 = add i64 %iptr__id.i155.rhs, %Ain_load309_ptr2int.i
%ptr__id.i156 = inttoptr i64 %iptr__id.i155 to double*
%val__id.i157 = load double* %ptr__id.i156, align 8
%.lhs350.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0
%.lhs350 = add i32 %.lhs350.lhs, %mul__Nxy_load402.i
%57 = shl i32 %.lhs350, 3
%iptr__id.i148.rhs = sext i32 %57 to i64
%iptr__id.i148 = add i64 %iptr__id.i148.rhs, %Ain_load309_ptr2int.i
%ptr__id.i149 = inttoptr i64 %iptr__id.i148 to double*
%val__id.i150 = load double* %ptr__id.i149, align 8
%.lhs349.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0
%.lhs349 = add i32 %.lhs349.lhs, %mul__Nxy_load410.i
%58 = shl i32 %.lhs349, 3
%iptr__id.i141.rhs = sext i32 %58 to i64
%iptr__id.i141 = add i64 %iptr__id.i141.rhs, %Ain_load309_ptr2int.i
%ptr__id.i142 = inttoptr i64 %iptr__id.i141 to double*
%val__id.i143 = load double* %ptr__id.i142, align 8
%coef_load416_offset_load.i544 = load double* %coef_load416_offset.i, align 8
%.lhs348.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0
%.lhs348 = shl i32 %.lhs348.lhs, 3
%59 = add i32 %.lhs348, 24
%iptr__id.i134.rhs = sext i32 %59 to i64
%iptr__id.i134 = add i64 %iptr__id.i134.rhs, %Ain_load309_ptr2int.i
%ptr__id.i135 = inttoptr i64 %iptr__id.i134 to double*
%val__id.i136 = load double* %ptr__id.i135, align 8
%.lhs347.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0
%.lhs347 = shl i32 %.lhs347.lhs, 3
%60 = add i32 %.lhs347, -24
%iptr__id.i127.rhs = sext i32 %60 to i64
%iptr__id.i127 = add i64 %iptr__id.i127.rhs, %Ain_load309_ptr2int.i
%ptr__id.i128 = inttoptr i64 %iptr__id.i127 to double*
%val__id.i129 = load double* %ptr__id.i128, align 8
%.lhs346.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0
%.lhs346 = add i32 %.lhs346.lhs, %mul__Nx_load436.i
%61 = shl i32 %.lhs346, 3
%iptr__id.i120.rhs = sext i32 %61 to i64
%iptr__id.i120 = add i64 %iptr__id.i120.rhs, %Ain_load309_ptr2int.i
%ptr__id.i121 = inttoptr i64 %iptr__id.i120 to double*
%val__id.i122 = load double* %ptr__id.i121, align 8
%.lhs345.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0
%.lhs345 = add i32 %.lhs345.lhs, %mul__Nx_load444.i
%62 = shl i32 %.lhs345, 3
%iptr__id.i113.rhs = sext i32 %62 to i64
%iptr__id.i113 = add i64 %iptr__id.i113.rhs, %Ain_load309_ptr2int.i
%ptr__id.i114 = inttoptr i64 %iptr__id.i113 to double*
%val__id.i115 = load double* %ptr__id.i114, align 8
%.lhs344.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0
%.lhs344 = add i32 %.lhs344.lhs, %mul__Nxy_load453.i
%63 = shl i32 %.lhs344, 3
%iptr__id.i106.rhs = sext i32 %63 to i64
%iptr__id.i106 = add i64 %iptr__id.i106.rhs, %Ain_load309_ptr2int.i
%ptr__id.i107 = inttoptr i64 %iptr__id.i106 to double*
%val__id.i108 = load double* %ptr__id.i107, align 8
%.lhs343.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0
%.lhs343 = add i32 %.lhs343.lhs, %mul__Nxy_load461.i
%64 = shl i32 %.lhs343, 3
%iptr__id.i99.rhs = sext i32 %64 to i64
%iptr__id.i99 = add i64 %iptr__id.i99.rhs, %Ain_load309_ptr2int.i
%ptr__id.i100 = inttoptr i64 %iptr__id.i99 to double*
%val__id.i101 = load double* %ptr__id.i100, align 8
%.lhs342 = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0
%65 = shl i32 %.lhs342, 3
%iptr__id.i92.rhs = sext i32 %65 to i64
%iptr__id.i92 = add i64 %iptr__id.i92.rhs, %Ain_load309_ptr2int.i
%ptr__id.i93 = inttoptr i64 %iptr__id.i92 to double*
%val__id.i94 = load double* %ptr__id.i93, align 8
%.lhs341 = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0
%66 = shl i32 %.lhs341, 3
%iptr__id.i85.rhs = sext i32 %66 to i64
%iptr__id.i85 = add i64 %iptr__id.i85.rhs, %Aout_load470_ptr2int.i
%ptr__id.i86 = inttoptr i64 %iptr__id.i85 to double*
%val__id.i87 = load double* %ptr__id.i86, align 8
%.lhs340 = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0
%67 = shl i32 %.lhs340, 3
%iptr__id.i80.rhs = sext i32 %67 to i64
%iptr__id.i80 = add i64 %iptr__id.i80.rhs, %vsq_load488_ptr2int.i
%ptr__id.i81 = inttoptr i64 %iptr__id.i80 to double*
%val__id.i82 = load double* %ptr__id.i81, align 8
%.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0
%68 = shl i32 %.lhs, 3
%iptr__id.i76.rhs = sext i32 %68 to i64
%iptr__id.i76 = add i64 %iptr__id.i76.rhs, %Aout_load470_ptr2int.i
%ptr__id.i77 = inttoptr i64 %iptr__id.i76 to double*
%val__id.i78.lhs.lhs = fmul double %val__id.i94, 2.000000e+00
%val__id.i78.lhs = fsub double %val__id.i78.lhs.lhs, %val__id.i87
%val__id.i78.rhs.rhs.lhs.lhs.lhs = fmul double %coef_load303_offset_load.i, %val__id.i227
%val__id.i78.rhs.rhs.lhs.lhs.rhs.rhs.lhs.lhs.lhs.lhs = fadd double %val__id.i220, %val__id.i213
%val__id.i78.rhs.rhs.lhs.lhs.rhs.rhs.lhs.lhs.lhs = fadd double %val__id.i78.rhs.rhs.lhs.lhs.rhs.rhs.lhs.lhs.lhs.lhs, %val__id.i206
%val__id.i78.rhs.rhs.lhs.lhs.rhs.rhs.lhs.lhs = fadd double %val__id.i78.rhs.rhs.lhs.lhs.rhs.rhs.lhs.lhs.lhs, %val__id.i199
%val__id.i78.rhs.rhs.lhs.lhs.rhs.rhs.lhs = fadd double %val__id.i78.rhs.rhs.lhs.lhs.rhs.rhs.lhs.lhs, %val__id.i192
%val__id.i78.rhs.rhs.lhs.lhs.rhs.rhs = fadd double %val__id.i78.rhs.rhs.lhs.lhs.rhs.rhs.lhs, %val__id.i185
%val__id.i78.rhs.rhs.lhs.lhs.rhs = fmul double %coef_load314_offset_load.i401, %val__id.i78.rhs.rhs.lhs.lhs.rhs.rhs
%val__id.i78.rhs.rhs.lhs.lhs = fadd double %val__id.i78.rhs.rhs.lhs.lhs.lhs, %val__id.i78.rhs.rhs.lhs.lhs.rhs
%val__id.i78.rhs.rhs.lhs.rhs.rhs.lhs.lhs.lhs.lhs = fadd double %val__id.i178, %val__id.i171
%val__id.i78.rhs.rhs.lhs.rhs.rhs.lhs.lhs.lhs = fadd double %val__id.i78.rhs.rhs.lhs.rhs.rhs.lhs.lhs.lhs.lhs, %val__id.i164
%val__id.i78.rhs.rhs.lhs.rhs.rhs.lhs.lhs = fadd double %val__id.i78.rhs.rhs.lhs.rhs.rhs.lhs.lhs.lhs, %val__id.i157
%val__id.i78.rhs.rhs.lhs.rhs.rhs.lhs = fadd double %val__id.i78.rhs.rhs.lhs.rhs.rhs.lhs.lhs, %val__id.i150
%val__id.i78.rhs.rhs.lhs.rhs.rhs = fadd double %val__id.i78.rhs.rhs.lhs.rhs.rhs.lhs, %val__id.i143
%val__id.i78.rhs.rhs.lhs.rhs = fmul double %coef_load365_offset_load.i457, %val__id.i78.rhs.rhs.lhs.rhs.rhs
%val__id.i78.rhs.rhs.lhs = fadd double %val__id.i78.rhs.rhs.lhs.lhs, %val__id.i78.rhs.rhs.lhs.rhs
%val__id.i78.rhs.rhs.rhs.rhs.lhs.lhs.lhs.lhs = fadd double %val__id.i136, %val__id.i129
%val__id.i78.rhs.rhs.rhs.rhs.lhs.lhs.lhs = fadd double %val__id.i78.rhs.rhs.rhs.rhs.lhs.lhs.lhs.lhs, %val__id.i122
%val__id.i78.rhs.rhs.rhs.rhs.lhs.lhs = fadd double %val__id.i78.rhs.rhs.rhs.rhs.lhs.lhs.lhs, %val__id.i115
%val__id.i78.rhs.rhs.rhs.rhs.lhs = fadd double %val__id.i78.rhs.rhs.rhs.rhs.lhs.lhs, %val__id.i108
%val__id.i78.rhs.rhs.rhs.rhs = fadd double %val__id.i78.rhs.rhs.rhs.rhs.lhs, %val__id.i101
%val__id.i78.rhs.rhs.rhs = fmul double %coef_load416_offset_load.i544, %val__id.i78.rhs.rhs.rhs.rhs
%val__id.i78.rhs.rhs = fadd double %val__id.i78.rhs.rhs.lhs, %val__id.i78.rhs.rhs.rhs
%val__id.i78.rhs = fmul double %val__id.i78.rhs.rhs, %val__id.i82
%val__id.i78 = fadd double %val__id.i78.lhs, %val__id.i78.rhs
store double %val__id.i78, double* %ptr__id.i77, align 8
br label %foreach_reset27.i
}
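; --- loop_stencil_ispc_tasks (mangled ISPC export, device-side driver) ---
; Per timestep: t&1 selects the Aeven/Aodd buffer order; lane 0 of the warp
; grabs a 72-byte argument block via cudaGetParameterBuffer, packs the scalar
; and pointer arguments into it, and launches stencil_step_task through the
; inline-PTX cudaLaunchDevice call (CUDA dynamic parallelism); each timestep
; ends with cudaDeviceSynchronize before the next iteration.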
define void @loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E_(i32 %t0, i32 %t1, i32 %x0, i32 %x1, i32 %y0, i32 %y1, i32 %z0, i32 %z1, i32 %Nx, i32 %Ny, i32 %Nz, double* %coef, double* %vsq, double* %Aeven, double* %Aodd, <1 x i1> %__mask) {
allocas:
%less_t_load_t1_load94 = icmp slt i32 %t0, %t1
br i1 %less_t_load_t1_load94, label %for_loop.lr.ph, label %for_exit
for_loop.lr.ph: ; preds = %allocas
%add_sub_x1_load21_x0_load22_ = sub i32 31, %x0
%sub_add_sub_x1_load21_x0_load22__ = add i32 %add_sub_x1_load21_x0_load22_, %x1
%div_sub_add_sub_x1_load21_x0_load22___ = sdiv i32 %sub_add_sub_x1_load21_x0_load22__, 32
%add_sub_y1_load23_y0_load24_ = sub i32 7, %y0
%sub_add_sub_y1_load23_y0_load24__ = add i32 %add_sub_y1_load23_y0_load24_, %y1
%div_sub_add_sub_y1_load23_y0_load24___ = sdiv i32 %sub_add_sub_y1_load23_y0_load24__, 8
%add_sub_z1_load25_z0_load26_ = sub i32 7, %z0
%sub_add_sub_z1_load25_z0_load26__ = add i32 %add_sub_z1_load25_z0_load26_, %z1
%div_sub_add_sub_z1_load25_z0_load26___ = sdiv i32 %sub_add_sub_z1_load25_z0_load26__, 8
%ntxm1.i = add nsw i32 %div_sub_add_sub_x1_load21_x0_load22___, -1
%ntxm1d4.i = ashr i32 %ntxm1.i, 2
%nbx.i = add nsw i32 %ntxm1d4.i, 1
br label %for_loop
for_loop: ; preds = %if_exit, %for_loop.lr.ph
%t.095 = phi i32 [ %t0, %for_loop.lr.ph ], [ %t_load78_plus1, %if_exit ]
%bitop = and i32 %t.095, 1
%equal_bitop_ = icmp eq i32 %bitop, 0
%tid.i.i = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
%and.i = and i32 %tid.i.i, 31
%cmp.i = icmp eq i32 %and.i, 0
br i1 %cmp.i, label %if.then.i, label %ISPCGetParamBuffer.exit
if.then.i: ; preds = %for_loop
%ptri64tmp.i = tail call i64 @cudaGetParameterBuffer(i64 8, i64 72)
%phitmp.i = inttoptr i64 %ptri64tmp.i to i8*
br label %ISPCGetParamBuffer.exit
ISPCGetParamBuffer.exit: ; preds = %if.then.i, %for_loop
%ptri64.i = phi i8* [ %phitmp.i, %if.then.i ], [ null, %for_loop ]
%cmp1 = icmp eq i8* %ptri64.i, null
br i1 %equal_bitop_, label %if_then, label %if_else
for_exit: ; preds = %if_exit, %allocas
%0 = tail call i32 @cudaDeviceSynchronize()
ret void
if_then: ; preds = %ISPCGetParamBuffer.exit
br i1 %cmp1, label %if_false, label %if_true
if_else: ; preds = %ISPCGetParamBuffer.exit
br i1 %cmp1, label %if_false62, label %if_true61
if_exit: ; preds = %if.then.i92, %if_false62, %if.then.i83, %if_false
%1 = tail call i32 @cudaDeviceSynchronize()
%t_load78_plus1 = add i32 %t.095, 1
%exitcond = icmp eq i32 %t_load78_plus1, %t1
br i1 %exitcond, label %for_exit, label %for_loop
if_true: ; preds = %if_then
%funarg = bitcast i8* %ptri64.i to i32*
store i32 %x0, i32* %funarg, align 4
%funarg27 = getelementptr i8* %ptri64.i, i64 4
%2 = bitcast i8* %funarg27 to i32*
store i32 %x1, i32* %2, align 4
%funarg28 = getelementptr i8* %ptri64.i, i64 8
%3 = bitcast i8* %funarg28 to i32*
store i32 %y0, i32* %3, align 4
%funarg29 = getelementptr i8* %ptri64.i, i64 12
%4 = bitcast i8* %funarg29 to i32*
store i32 %y1, i32* %4, align 4
%funarg30 = getelementptr i8* %ptri64.i, i64 16
%5 = bitcast i8* %funarg30 to i32*
store i32 %z0, i32* %5, align 4
%funarg31 = getelementptr i8* %ptri64.i, i64 20
%6 = bitcast i8* %funarg31 to i32*
store i32 %z1, i32* %6, align 4
%funarg32 = getelementptr i8* %ptri64.i, i64 24
%7 = bitcast i8* %funarg32 to i32*
store i32 %Nx, i32* %7, align 4
%funarg33 = getelementptr i8* %ptri64.i, i64 28
%8 = bitcast i8* %funarg33 to i32*
store i32 %Ny, i32* %8, align 4
%funarg34 = getelementptr i8* %ptri64.i, i64 32
%9 = bitcast i8* %funarg34 to i32*
store i32 %Nz, i32* %9, align 4
%funarg35 = getelementptr i8* %ptri64.i, i64 40
%10 = bitcast i8* %funarg35 to double**
store double* %coef, double** %10, align 8
%funarg36 = getelementptr i8* %ptri64.i, i64 48
%11 = bitcast i8* %funarg36 to double**
store double* %vsq, double** %11, align 8
%funarg37 = getelementptr i8* %ptri64.i, i64 56
%12 = bitcast i8* %funarg37 to double**
store double* %Aeven, double** %12, align 8
%funarg38 = getelementptr i8* %ptri64.i, i64 64
%13 = bitcast i8* %funarg38 to double**
store double* %Aodd, double** %13, align 8
br label %if_false
if_false: ; preds = %if_true, %if_then
%tid.i.i80 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
%and.i81 = and i32 %tid.i.i80, 31
%cmp.i82 = icmp eq i32 %and.i81, 0
br i1 %cmp.i82, label %if.then.i83, label %if_exit
if.then.i83: ; preds = %if_false
%args_i64.i = ptrtoint i8* %ptri64.i to i64
%res_tmp.i = tail call i32 asm sideeffect "{\0A .param .b64 param0;\0A st.param.b64\09[param0+0], $1;\0A .param .b64 param1;\0A st.param.b64\09[param1+0], $2;\0A .param .align 4 .b8 param2[12];\0A st.param.b32\09[param2+0], $3; \0A st.param.b32\09[param2+4], $4; \0A st.param.b32\09[param2+8], $5; \0A .param .align 4 .b8 param3[12];\0A st.param.b32\09[param3+0], $6; \0A st.param.b32\09[param3+4], $7; \0A st.param.b32\09[param3+8], $8; \0A .param .b32 param4;\0A st.param.b32\09[param4+0], $9; \0A .param .b64 param5;\0A st.param.b64\09[param5+0], $10; \0A\0A .param .b32 retval0;\0A call.uni (retval0), \0A cudaLaunchDevice,\0A (\0A param0, \0A param1, \0A param2, \0A param3, \0A param4, \0A param5\0A );\0A ld.param.b32\09$0, [retval0+0];\0A }\0A ", "=r, l,l, r,r,r, r,r,r, r,l"(i64 ptrtoint (void (i32, i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*)* @stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_ to i64), i64 %args_i64.i, i32 %nbx.i, i32 %div_sub_add_sub_y1_load23_y0_load24___, i32 %div_sub_add_sub_z1_load25_z0_load26___, i32 128, i32 1, i32 1, i32 0, i64 0)
br label %if_exit
if_true61: ; preds = %if_else
%funarg64 = bitcast i8* %ptri64.i to i32*
store i32 %x0, i32* %funarg64, align 4
%funarg65 = getelementptr i8* %ptri64.i, i64 4
%14 = bitcast i8* %funarg65 to i32*
store i32 %x1, i32* %14, align 4
%funarg66 = getelementptr i8* %ptri64.i, i64 8
%15 = bitcast i8* %funarg66 to i32*
store i32 %y0, i32* %15, align 4
%funarg67 = getelementptr i8* %ptri64.i, i64 12
%16 = bitcast i8* %funarg67 to i32*
store i32 %y1, i32* %16, align 4
%funarg68 = getelementptr i8* %ptri64.i, i64 16
%17 = bitcast i8* %funarg68 to i32*
store i32 %z0, i32* %17, align 4
%funarg69 = getelementptr i8* %ptri64.i, i64 20
%18 = bitcast i8* %funarg69 to i32*
store i32 %z1, i32* %18, align 4
%funarg70 = getelementptr i8* %ptri64.i, i64 24
%19 = bitcast i8* %funarg70 to i32*
store i32 %Nx, i32* %19, align 4
%funarg71 = getelementptr i8* %ptri64.i, i64 28
%20 = bitcast i8* %funarg71 to i32*
store i32 %Ny, i32* %20, align 4
%funarg72 = getelementptr i8* %ptri64.i, i64 32
%21 = bitcast i8* %funarg72 to i32*
store i32 %Nz, i32* %21, align 4
%funarg73 = getelementptr i8* %ptri64.i, i64 40
%22 = bitcast i8* %funarg73 to double**
store double* %coef, double** %22, align 8
%funarg74 = getelementptr i8* %ptri64.i, i64 48
%23 = bitcast i8* %funarg74 to double**
store double* %vsq, double** %23, align 8
%funarg75 = getelementptr i8* %ptri64.i, i64 56
%24 = bitcast i8* %funarg75 to double**
store double* %Aodd, double** %24, align 8
%funarg76 = getelementptr i8* %ptri64.i, i64 64
%25 = bitcast i8* %funarg76 to double**
store double* %Aeven, double** %25, align 8
br label %if_false62
if_false62: ; preds = %if_true61, %if_else
%tid.i.i84 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
%and.i85 = and i32 %tid.i.i84, 31
%cmp.i86 = icmp eq i32 %and.i85, 0
br i1 %cmp.i86, label %if.then.i92, label %if_exit
if.then.i92: ; preds = %if_false62
%args_i64.i90 = ptrtoint i8* %ptri64.i to i64
%res_tmp.i91 = tail call i32 asm sideeffect "{\0A .param .b64 param0;\0A st.param.b64\09[param0+0], $1;\0A .param .b64 param1;\0A st.param.b64\09[param1+0], $2;\0A .param .align 4 .b8 param2[12];\0A st.param.b32\09[param2+0], $3; \0A st.param.b32\09[param2+4], $4; \0A st.param.b32\09[param2+8], $5; \0A .param .align 4 .b8 param3[12];\0A st.param.b32\09[param3+0], $6; \0A st.param.b32\09[param3+4], $7; \0A st.param.b32\09[param3+8], $8; \0A .param .b32 param4;\0A st.param.b32\09[param4+0], $9; \0A .param .b64 param5;\0A st.param.b64\09[param5+0], $10; \0A\0A .param .b32 retval0;\0A call.uni (retval0), \0A cudaLaunchDevice,\0A (\0A param0, \0A param1, \0A param2, \0A param3, \0A param4, \0A param5\0A );\0A ld.param.b32\09$0, [retval0+0];\0A }\0A ", "=r, l,l, r,r,r, r,r,r, r,l"(i64 ptrtoint (void (i32, i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*)* @stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_ to i64), i64 %args_i64.i90, i32 %nbx.i, i32 %div_sub_add_sub_y1_load23_y0_load24___, i32 %div_sub_add_sub_z1_load25_z0_load26___, i32 128, i32 1, i32 1, i32 0, i64 0)
br label %if_exit
}
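; --- loop_stencil_ispc_tasks (unmangled, application-visible entry point) ---
; Same body as the mangled variant above, minus the <1 x i1> execution-mask
; parameter.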
define void @loop_stencil_ispc_tasks(i32 %t0, i32 %t1, i32 %x0, i32 %x1, i32 %y0, i32 %y1, i32 %z0, i32 %z1, i32 %Nx, i32 %Ny, i32 %Nz, double* %coef, double* %vsq, double* %Aeven, double* %Aodd) {
allocas:
%less_t_load_t1_load94 = icmp slt i32 %t0, %t1
br i1 %less_t_load_t1_load94, label %for_loop.lr.ph, label %for_exit
for_loop.lr.ph: ; preds = %allocas
%add_sub_x1_load21_x0_load22_ = sub i32 31, %x0
%sub_add_sub_x1_load21_x0_load22__ = add i32 %add_sub_x1_load21_x0_load22_, %x1
%div_sub_add_sub_x1_load21_x0_load22___ = sdiv i32 %sub_add_sub_x1_load21_x0_load22__, 32
%add_sub_y1_load23_y0_load24_ = sub i32 7, %y0
%sub_add_sub_y1_load23_y0_load24__ = add i32 %add_sub_y1_load23_y0_load24_, %y1
%div_sub_add_sub_y1_load23_y0_load24___ = sdiv i32 %sub_add_sub_y1_load23_y0_load24__, 8
%add_sub_z1_load25_z0_load26_ = sub i32 7, %z0
%sub_add_sub_z1_load25_z0_load26__ = add i32 %add_sub_z1_load25_z0_load26_, %z1
%div_sub_add_sub_z1_load25_z0_load26___ = sdiv i32 %sub_add_sub_z1_load25_z0_load26__, 8
%ntxm1.i = add nsw i32 %div_sub_add_sub_x1_load21_x0_load22___, -1
%ntxm1d4.i = ashr i32 %ntxm1.i, 2
%nbx.i = add nsw i32 %ntxm1d4.i, 1
br label %for_loop
for_loop: ; preds = %if_exit, %for_loop.lr.ph
%t.095 = phi i32 [ %t0, %for_loop.lr.ph ], [ %t_load78_plus1, %if_exit ]
%bitop = and i32 %t.095, 1
%equal_bitop_ = icmp eq i32 %bitop, 0
%tid.i.i = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
%and.i = and i32 %tid.i.i, 31
%cmp.i = icmp eq i32 %and.i, 0
br i1 %cmp.i, label %if.then.i, label %ISPCGetParamBuffer.exit
if.then.i: ; preds = %for_loop
%ptri64tmp.i = tail call i64 @cudaGetParameterBuffer(i64 8, i64 72)
%phitmp.i = inttoptr i64 %ptri64tmp.i to i8*
br label %ISPCGetParamBuffer.exit
ISPCGetParamBuffer.exit: ; preds = %if.then.i, %for_loop
%ptri64.i = phi i8* [ %phitmp.i, %if.then.i ], [ null, %for_loop ]
%cmp1 = icmp eq i8* %ptri64.i, null
br i1 %equal_bitop_, label %if_then, label %if_else
for_exit: ; preds = %if_exit, %allocas
%0 = tail call i32 @cudaDeviceSynchronize()
ret void
if_then: ; preds = %ISPCGetParamBuffer.exit
br i1 %cmp1, label %if_false, label %if_true
if_else: ; preds = %ISPCGetParamBuffer.exit
br i1 %cmp1, label %if_false62, label %if_true61
if_exit: ; preds = %if.then.i92, %if_false62, %if.then.i83, %if_false
%1 = tail call i32 @cudaDeviceSynchronize()
%t_load78_plus1 = add i32 %t.095, 1
%exitcond = icmp eq i32 %t_load78_plus1, %t1
br i1 %exitcond, label %for_exit, label %for_loop
if_true: ; preds = %if_then
%funarg = bitcast i8* %ptri64.i to i32*
store i32 %x0, i32* %funarg, align 4
%funarg27 = getelementptr i8* %ptri64.i, i64 4
%2 = bitcast i8* %funarg27 to i32*
store i32 %x1, i32* %2, align 4
%funarg28 = getelementptr i8* %ptri64.i, i64 8
%3 = bitcast i8* %funarg28 to i32*
store i32 %y0, i32* %3, align 4
%funarg29 = getelementptr i8* %ptri64.i, i64 12
%4 = bitcast i8* %funarg29 to i32*
store i32 %y1, i32* %4, align 4
%funarg30 = getelementptr i8* %ptri64.i, i64 16
%5 = bitcast i8* %funarg30 to i32*
store i32 %z0, i32* %5, align 4
%funarg31 = getelementptr i8* %ptri64.i, i64 20
%6 = bitcast i8* %funarg31 to i32*
store i32 %z1, i32* %6, align 4
%funarg32 = getelementptr i8* %ptri64.i, i64 24
%7 = bitcast i8* %funarg32 to i32*
store i32 %Nx, i32* %7, align 4
%funarg33 = getelementptr i8* %ptri64.i, i64 28
%8 = bitcast i8* %funarg33 to i32*
store i32 %Ny, i32* %8, align 4
%funarg34 = getelementptr i8* %ptri64.i, i64 32
%9 = bitcast i8* %funarg34 to i32*
store i32 %Nz, i32* %9, align 4
%funarg35 = getelementptr i8* %ptri64.i, i64 40
%10 = bitcast i8* %funarg35 to double**
store double* %coef, double** %10, align 8
%funarg36 = getelementptr i8* %ptri64.i, i64 48
%11 = bitcast i8* %funarg36 to double**
store double* %vsq, double** %11, align 8
%funarg37 = getelementptr i8* %ptri64.i, i64 56
%12 = bitcast i8* %funarg37 to double**
store double* %Aeven, double** %12, align 8
%funarg38 = getelementptr i8* %ptri64.i, i64 64
%13 = bitcast i8* %funarg38 to double**
store double* %Aodd, double** %13, align 8
br label %if_false
if_false: ; preds = %if_true, %if_then
%tid.i.i80 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
%and.i81 = and i32 %tid.i.i80, 31
%cmp.i82 = icmp eq i32 %and.i81, 0
br i1 %cmp.i82, label %if.then.i83, label %if_exit
if.then.i83: ; preds = %if_false
%args_i64.i = ptrtoint i8* %ptri64.i to i64
%res_tmp.i = tail call i32 asm sideeffect "{\0A .param .b64 param0;\0A st.param.b64\09[param0+0], $1;\0A .param .b64 param1;\0A st.param.b64\09[param1+0], $2;\0A .param .align 4 .b8 param2[12];\0A st.param.b32\09[param2+0], $3; \0A st.param.b32\09[param2+4], $4; \0A st.param.b32\09[param2+8], $5; \0A .param .align 4 .b8 param3[12];\0A st.param.b32\09[param3+0], $6; \0A st.param.b32\09[param3+4], $7; \0A st.param.b32\09[param3+8], $8; \0A .param .b32 param4;\0A st.param.b32\09[param4+0], $9; \0A .param .b64 param5;\0A st.param.b64\09[param5+0], $10; \0A\0A .param .b32 retval0;\0A call.uni (retval0), \0A cudaLaunchDevice,\0A (\0A param0, \0A param1, \0A param2, \0A param3, \0A param4, \0A param5\0A );\0A ld.param.b32\09$0, [retval0+0];\0A }\0A ", "=r, l,l, r,r,r, r,r,r, r,l"(i64 ptrtoint (void (i32, i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*)* @stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_ to i64), i64 %args_i64.i, i32 %nbx.i, i32 %div_sub_add_sub_y1_load23_y0_load24___, i32 %div_sub_add_sub_z1_load25_z0_load26___, i32 128, i32 1, i32 1, i32 0, i64 0)
br label %if_exit
if_true61: ; preds = %if_else
%funarg64 = bitcast i8* %ptri64.i to i32*
store i32 %x0, i32* %funarg64, align 4
%funarg65 = getelementptr i8* %ptri64.i, i64 4
%14 = bitcast i8* %funarg65 to i32*
store i32 %x1, i32* %14, align 4
%funarg66 = getelementptr i8* %ptri64.i, i64 8
%15 = bitcast i8* %funarg66 to i32*
store i32 %y0, i32* %15, align 4
%funarg67 = getelementptr i8* %ptri64.i, i64 12
%16 = bitcast i8* %funarg67 to i32*
store i32 %y1, i32* %16, align 4
%funarg68 = getelementptr i8* %ptri64.i, i64 16
%17 = bitcast i8* %funarg68 to i32*
store i32 %z0, i32* %17, align 4
%funarg69 = getelementptr i8* %ptri64.i, i64 20
%18 = bitcast i8* %funarg69 to i32*
store i32 %z1, i32* %18, align 4
%funarg70 = getelementptr i8* %ptri64.i, i64 24
%19 = bitcast i8* %funarg70 to i32*
store i32 %Nx, i32* %19, align 4
%funarg71 = getelementptr i8* %ptri64.i, i64 28
%20 = bitcast i8* %funarg71 to i32*
store i32 %Ny, i32* %20, align 4
%funarg72 = getelementptr i8* %ptri64.i, i64 32
%21 = bitcast i8* %funarg72 to i32*
store i32 %Nz, i32* %21, align 4
%funarg73 = getelementptr i8* %ptri64.i, i64 40
%22 = bitcast i8* %funarg73 to double**
store double* %coef, double** %22, align 8
%funarg74 = getelementptr i8* %ptri64.i, i64 48
%23 = bitcast i8* %funarg74 to double**
store double* %vsq, double** %23, align 8
%funarg75 = getelementptr i8* %ptri64.i, i64 56
%24 = bitcast i8* %funarg75 to double**
store double* %Aodd, double** %24, align 8
%funarg76 = getelementptr i8* %ptri64.i, i64 64
%25 = bitcast i8* %funarg76 to double**
store double* %Aeven, double** %25, align 8
br label %if_false62
if_false62: ; preds = %if_true61, %if_else
%tid.i.i84 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
%and.i85 = and i32 %tid.i.i84, 31
%cmp.i86 = icmp eq i32 %and.i85, 0
br i1 %cmp.i86, label %if.then.i92, label %if_exit
if.then.i92: ; preds = %if_false62
%args_i64.i90 = ptrtoint i8* %ptri64.i to i64
%res_tmp.i91 = tail call i32 asm sideeffect "{\0A .param .b64 param0;\0A st.param.b64\09[param0+0], $1;\0A .param .b64 param1;\0A st.param.b64\09[param1+0], $2;\0A .param .align 4 .b8 param2[12];\0A st.param.b32\09[param2+0], $3; \0A st.param.b32\09[param2+4], $4; \0A st.param.b32\09[param2+8], $5; \0A .param .align 4 .b8 param3[12];\0A st.param.b32\09[param3+0], $6; \0A st.param.b32\09[param3+4], $7; \0A st.param.b32\09[param3+8], $8; \0A .param .b32 param4;\0A st.param.b32\09[param4+0], $9; \0A .param .b64 param5;\0A st.param.b64\09[param5+0], $10; \0A\0A .param .b32 retval0;\0A call.uni (retval0), \0A cudaLaunchDevice,\0A (\0A param0, \0A param1, \0A param2, \0A param3, \0A param4, \0A param5\0A );\0A ld.param.b32\09$0, [retval0+0];\0A }\0A ", "=r, l,l, r,r,r, r,r,r, r,l"(i64 ptrtoint (void (i32, i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*)* @stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_ to i64), i64 %args_i64.i90, i32 %nbx.i, i32 %div_sub_add_sub_y1_load23_y0_load24___, i32 %div_sub_add_sub_z1_load25_z0_load26___, i32 128, i32 1, i32 1, i32 0, i64 0)
br label %if_exit
}
!llvm.ident = !{!0}
!nvvm.annotations = !{!1, !2}
!0 = metadata !{metadata !"clang version 3.4 (trunk 194723)"}
!1 = metadata !{void (i32, i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*)* @stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_, metadata !"kernel", i32 1}
!2 = metadata !{void (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*)* @loop_stencil_ispc_tasks, metadata !"kernel", i32 1}

File diff suppressed because it is too large

View File

@@ -1,172 +0,0 @@
/*
Copyright (c) 2010-2011, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef _MSC_VER
#define _CRT_SECURE_NO_WARNINGS
#define NOMINMAX
#pragma warning (disable: 4244)
#pragma warning (disable: 4305)
#endif
#include <stdio.h>
#include <algorithm>
#include <math.h>
#include "../timing.h"
#include "stencil_ispc.h"
using namespace ispc;
#include <sys/time.h>
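// Wall-clock timer: returns seconds since the epoch with microsecond
// resolution via gettimeofday(); used below for the task-parallel runs in
// place of get_elapsed_mcycles().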
double rtc(void)
{
    struct timeval Tvalue;
    double etime;
    struct timezone dummy;

    gettimeofday(&Tvalue, &dummy);
    etime = (double)Tvalue.tv_sec +
            1.e-6 * ((double)Tvalue.tv_usec);
    return etime;
}
extern void loop_stencil_serial(int t0, int t1, int x0, int x1,
                                int y0, int y1, int z0, int z1,
                                int Nx, int Ny, int Nz,
                                const double coef[4],
                                const double vsq[],
                                double Aeven[], double Aodd[]);
void InitData(int Nx, int Ny, int Nz, double *A[2], double *vsq) {
    int offset = 0;
    for (int z = 0; z < Nz; ++z)
        for (int y = 0; y < Ny; ++y)
            for (int x = 0; x < Nx; ++x, ++offset) {
                A[0][offset] = (x < Nx / 2) ? x / double(Nx) : y / double(Ny);
                A[1][offset] = 0;
                vsq[offset] = x*y*z / double(Nx * Ny * Nz);
            }
}
int main() {
    int Nx = 256, Ny = 256, Nz = 256;
    int width = 4;
    double *Aserial[2], *Aispc[2];
    Aserial[0] = new double[Nx * Ny * Nz];
    Aserial[1] = new double[Nx * Ny * Nz];
    Aispc[0] = new double[Nx * Ny * Nz];
    Aispc[1] = new double[Nx * Ny * Nz];
    double *vsq = new double[Nx * Ny * Nz];
    double coeff[4] = { 0.5, -.25, .125, -.0625 };
    // InitData(Nx, Ny, Nz, Aispc, vsq);

    //
    // Compute the image using the ispc implementation on one core; report
    // the minimum time of three runs.
    //
    double minTimeISPC = 1e30;
#if 0
    for (int i = 0; i < 3; ++i) {
        reset_and_start_timer();
        loop_stencil_ispc(0, 6, width, Nx - width, width, Ny - width,
                          width, Nz - width, Nx, Ny, Nz, coeff, vsq,
                          Aispc[0], Aispc[1]);
        double dt = get_elapsed_mcycles();
        minTimeISPC = std::min(minTimeISPC, dt);
    }
    printf("[stencil ispc 1 core]:\t\t[%.3f] million cycles\n", minTimeISPC);
#endif

    fprintf(stderr, " -- init -- \n");
    InitData(Nx, Ny, Nz, Aispc, vsq);
    fprintf(stderr, " -- done init -- \n");

    //
    // Compute the image using the ispc implementation with tasks; report
    // the minimum time of three runs.
    //
    double minTimeISPCTasks = 1e30;
    for (int i = 0; i < 3; ++i) {
        reset_and_start_timer();
        const double t0 = rtc();
        loop_stencil_ispc_tasks(0, 6, width, Nx - width, width, Ny - width,
                                width, Nz - width, Nx, Ny, Nz, coeff, vsq,
                                Aispc[0], Aispc[1]);
        double dt = 1e3 * (rtc() - t0); // milliseconds; was get_elapsed_mcycles()
        minTimeISPCTasks = std::min(minTimeISPCTasks, dt);
    }
    fprintf(stderr, "[stencil ispc + tasks]:\t\t[%.3f] msec\n", minTimeISPCTasks);
    InitData(Nx, Ny, Nz, Aserial, vsq);

    //
    // And run the serial implementation 3 times, again reporting the
    // minimum time.
    //
    double minTimeSerial = 1e30;
    for (int i = 0; i < 3; ++i) {
        reset_and_start_timer();
        loop_stencil_serial(0, 6, width, Nx - width, width, Ny - width,
                            width, Nz - width, Nx, Ny, Nz, coeff, vsq,
                            Aserial[0], Aserial[1]);
        double dt = get_elapsed_mcycles();
        minTimeSerial = std::min(minTimeSerial, dt);
    }
    printf("[stencil serial]:\t\t[%.3f] million cycles\n", minTimeSerial);
    // No speedup ratios are printed: the single-core ispc run is compiled out
    // above, so minTimeISPC is never measured, and the serial (mcycles) and
    // tasks (msec) timings are in different units.
    // Check for agreement
    int offset = 0;
    for (int z = 0; z < Nz; ++z)
        for (int y = 0; y < Ny; ++y)
            for (int x = 0; x < Nx; ++x, ++offset) {
                // fabs, not fabsf: the arrays are double precision
                double error = fabs((Aserial[1][offset] - Aispc[1][offset]) /
                                    Aserial[1][offset]);
                if (error > 1e-4)
                    printf("Error @ (%d,%d,%d): ispc = %f, serial = %f\n",
                           x, y, z, Aispc[1][offset], Aserial[1][offset]);
            }

    return 0;
}

View File

@@ -1,172 +0,0 @@
/*
Copyright (c) 2010-2011, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef __NVPTX__
#warning "emitting DEVICE code"
#define taskIndex blockIndex0()
#define taskCount blockCount0()
#define programIndex laneIndex()
#define programCount warpSize()
#else
#warning "emitting HOST code"
#endif
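// On the device this maps ISPC's tasking model onto CUDA dynamic parallelism:
// each task is one thread block along grid dimension 0, and each program
// instance is one lane of a 32-wide warp (blockIndex0(), blockCount0(),
// laneIndex() and warpSize() are the device-side built-ins this port assumes).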
static inline void
stencil_step(uniform int x0, uniform int x1,
             uniform int y0, uniform int y1,
             uniform int z0, uniform int z1,
             uniform int Nx, uniform int Ny, uniform int Nz,
             uniform const double coef[4], uniform const double vsq[],
             uniform const double Ain[], uniform double Aout[]) {
    const uniform int Nxy = Nx * Ny;
// foreach (z = z0 ... z1, y = y0 ... y1, x = x0 ... x1)
#if 0
#define VER1
#endif
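// VER1 (disabled above) precomputes the neighbor offsets as uniform linear
// strides (1, Nx, Nxy and their multiples) instead of recomputing
// (x) + (y)*Nx + (z)*Nxy inside the A_cur/A_next macros on every access.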
#ifdef VER1
    const uniform long x1o = 1;
    const uniform long x2o = 2;
    const uniform long x3o = 3;
    const uniform long y1o = Nx;
    const uniform long y2o = Nx * 2;
    const uniform long y3o = Nx * 3;
    const uniform long z1o = Nxy;
    const uniform long z2o = Nxy * 2;
    const uniform long z3o = Nxy * 3;
#endif
    for (uniform int z = z0; z < z1; z++)
        for (uniform int y = y0; y < y1; y++)
        {
            const int index_base = (z * Nxy) + (y * Nx);
            for (uniform int xb = x0; xb < x1; xb += programCount)
            {
                const int x = xb + programIndex;
                int index = index_base + x;
#ifndef VER1
#define A_cur(x, y, z)  Ain[index + (x) + ((y) * Nx) + ((z) * Nxy)]
#define A_next(x, y, z) Aout[index + (x) + ((y) * Nx) + ((z) * Nxy)]
                double div = coef[0] * A_cur(0, 0, 0) +
                             coef[1] * (A_cur(+1, 0, 0) + A_cur(-1, 0, 0) +
                                        A_cur(0, +1, 0) + A_cur(0, -1, 0) +
                                        A_cur(0, 0, +1) + A_cur(0, 0, -1)) +
                             coef[2] * (A_cur(+2, 0, 0) + A_cur(-2, 0, 0) +
                                        A_cur(0, +2, 0) + A_cur(0, -2, 0) +
                                        A_cur(0, 0, +2) + A_cur(0, 0, -2)) +
                             coef[3] * (A_cur(+3, 0, 0) + A_cur(-3, 0, 0) +
                                        A_cur(0, +3, 0) + A_cur(0, -3, 0) +
                                        A_cur(0, 0, +3) + A_cur(0, 0, -3));
#else
#define A_cur(x, y, z)  Ain [index + (x) + (y) + (z)]
#define A_next(x, y, z) Aout[index + (x) + (y) + (z)]
                double div = coef[0] * A_cur(0, 0, 0) +
                             coef[1] * (A_cur(+x1o, 0, 0) + A_cur(-x1o, 0, 0) +
                                        A_cur(0, +y1o, 0) + A_cur(0, -y1o, 0) +
                                        A_cur(0, 0, +z1o) + A_cur(0, 0, -z1o)) +
                             coef[2] * (A_cur(+x2o, 0, 0) + A_cur(-x2o, 0, 0) +
                                        A_cur(0, +y2o, 0) + A_cur(0, -y2o, 0) +
                                        A_cur(0, 0, +z2o) + A_cur(0, 0, -z2o)) +
                             coef[3] * (A_cur(+x3o, 0, 0) + A_cur(-x3o, 0, 0) +
                                        A_cur(0, +y3o, 0) + A_cur(0, -y3o, 0) +
                                        A_cur(0, 0, +z3o) + A_cur(0, 0, -z3o));
#endif
                // x0..x1 need not be a multiple of programCount, so mask off
                // the lanes that fall past x1 before storing.
                if (x < x1)
                    A_next(0, 0, 0) = 2.0d0 * A_cur(0, 0, 0) - A_next(0, 0, 0) +
                                      vsq[index] * div;
            }
        }
}
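// One task per z-slice. The guard below matters on the device: assuming the
// launch rounds the number of thread blocks up (as the generated launch code
// does), trailing tasks may see taskIndex >= taskCount and must do nothing.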
static task void
stencil_step_task(uniform int x0, uniform int x1,
                  uniform int y0, uniform int y1,
                  uniform int z0,
                  uniform int Nx, uniform int Ny, uniform int Nz,
                  uniform const double coef[4], uniform const double vsq[],
                  uniform const double Ain[], uniform double Aout[]) {
    if (taskIndex >= taskCount) return;
    stencil_step(x0, x1, y0, y1, z0 + taskIndex, z0 + taskIndex + 1,
                 Nx, Ny, Nz, coef, vsq, Ain, Aout);
}
export void
loop_stencil_ispc_tasks(uniform int t0, uniform int t1,
                        uniform int x0, uniform int x1,
                        uniform int y0, uniform int y1,
                        uniform int z0, uniform int z1,
                        uniform int Nx, uniform int Ny, uniform int Nz,
                        uniform const double coef[4],
                        uniform const double vsq[],
                        uniform double Aeven[], uniform double Aodd[])
{
    for (uniform int t = t0; t < t1; ++t) {
        // Parallelize across cores as well: each task will work on a slice
        // of 1 in the z extent of the volume.
        if ((t & 1) == 0)
            launch[z1-z0] stencil_step_task(x0, x1, y0, y1, z0, Nx, Ny, Nz,
                                            coef, vsq, Aeven, Aodd);
        else
            launch[z1-z0] stencil_step_task(x0, x1, y0, y1, z0, Nx, Ny, Nz,
                                            coef, vsq, Aodd, Aeven);

        // We need to wait for all of the launched tasks to finish before
        // starting the next iteration.
        sync;
    }
}
export void
loop_stencil_ispc(uniform int t0, uniform int t1,
                  uniform int x0, uniform int x1,
                  uniform int y0, uniform int y1,
                  uniform int z0, uniform int z1,
                  uniform int Nx, uniform int Ny, uniform int Nz,
                  uniform const double coef[4],
                  uniform const double vsq[],
                  uniform double Aeven[], uniform double Aodd[])
{
    for (uniform int t = t0; t < t1; ++t) {
        if ((t & 1) == 0)
            stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq,
                         Aeven, Aodd);
        else
            stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq,
                         Aodd, Aeven);
    }
}