diff --git a/examples_cuda/deferred/kernels.cu b/examples_cuda/deferred/kernels.cu
index c04c5b0d..35c18099 100644
--- a/examples_cuda/deferred/kernels.cu
+++ b/examples_cuda/deferred/kernels.cu
@@ -184,7 +184,7 @@ struct Uniform
 
   __device__ inline Uniform()
   {
-#if 1
+#if 0
     if (programIndex == 0)
       data = new T[N];
     ptr[0] = __shfl(ptr[0], 0);
@@ -200,7 +200,7 @@ struct Uniform
   }
   __device__ inline ~Uniform()
   {
-#if 1
+#if 0
     if (programIndex == 0)
       delete data;
 #else
@@ -730,20 +730,21 @@ ShadeTile(
 ///////////////////////////////////////////////////////////////////////////
 // Static decomposition
 
-extern "C" __global__ void
+
+__global__ void
 RenderTile( int num_groups_x,  int num_groups_y,
-           const  InputHeader *inputHeaderPtr,
-           const  InputDataArrays *inputDataPtr,
+           const  InputHeader inputHeaderPtr[],
+           const  InputDataArrays inputDataPtr[],
             int visualizeLightCount,
            // Output
             unsigned int8 framebuffer_r[],
             unsigned int8 framebuffer_g[],
             unsigned int8 framebuffer_b[]) {
   if (taskIndex >= taskCount) return;
+  const InputHeader &inputHeader = *inputHeaderPtr;
+  const InputDataArrays &inputData = *inputDataPtr;
 
 #if 1
-  const  InputHeader inputHeader = *inputHeaderPtr;
-  const  InputDataArrays inputData = *inputDataPtr;
      int32 group_y = taskIndex / num_groups_x;
      int32 group_x = taskIndex % num_groups_x;
 
@@ -794,3 +795,28 @@ RenderTile( int num_groups_x,  int num_groups_y,
 }
 
 
+  extern "C"
+__global__ void
+RenderStatic(InputHeader inputHeaderPtr[],
+             InputDataArrays inputDataPtr[],
+             int visualizeLightCount,
+             // Output
+             unsigned int8 framebuffer_r[],
+             unsigned int8 framebuffer_g[],
+             unsigned int8 framebuffer_b[]) {
+
+  const InputHeader &inputHeader = *inputHeaderPtr;
+  const InputDataArrays &inputData = *inputDataPtr;
+
+    int num_groups_x = (inputHeader.framebufferWidth + 
+                                MIN_TILE_WIDTH - 1) / MIN_TILE_WIDTH;
+    int num_groups_y = (inputHeader.framebufferHeight + 
+                                MIN_TILE_HEIGHT - 1) / MIN_TILE_HEIGHT;
+    int num_groups = num_groups_x * num_groups_y;
+
+    // Launch a task to render each tile, each of which is MIN_TILE_WIDTH
+    // by MIN_TILE_HEIGHT pixels.
+    RenderTile<<<dim3(num_groups_x,num_groups_y), 128>>>(num_groups_x, num_groups_y,
+                                  inputHeaderPtr, inputDataPtr, visualizeLightCount,
+                                  framebuffer_r, framebuffer_g, framebuffer_b);
+}
diff --git a/examples_cuda/deferred/kernels1.ispc b/examples_cuda/deferred/kernels1.ispc
index 3a81daf6..9871aa9f 100644
--- a/examples_cuda/deferred/kernels1.ispc
+++ b/examples_cuda/deferred/kernels1.ispc
@@ -477,19 +477,18 @@ ShadeTile(
 ///////////////////////////////////////////////////////////////////////////
 // Static decomposition
 
-task void
+void task
 RenderTile(uniform int num_groups_x, uniform int num_groups_y,
-           const uniform InputHeader inputHeaderPtr[],
-           const uniform InputDataArrays inputDataPtr[],
+           const  uniform InputHeader inputHeaderPtr[],
+           const  uniform InputDataArrays inputDataPtr[],
            uniform int visualizeLightCount,
            // Output
            uniform unsigned int8 framebuffer_r[],
            uniform unsigned int8 framebuffer_g[],
            uniform unsigned int8 framebuffer_b[]) {
   if (taskIndex >= taskCount) return;
-
-  const uniform InputHeader inputHeader = *inputHeaderPtr;
-  const uniform InputDataArrays inputData = *inputDataPtr;
+  const  uniform InputHeader inputHeader = *inputHeaderPtr;
+  const  uniform InputDataArrays inputData = *inputDataPtr;
 
     uniform int32 group_y = taskIndex / num_groups_x;
     uniform int32 group_x = taskIndex % num_groups_x;
@@ -543,13 +542,16 @@ RenderTile(uniform int num_groups_x, uniform int num_groups_y,
 export void
 RenderStatic(uniform InputHeader inputHeaderPtr[],
              uniform InputDataArrays inputDataPtr[],
-             uniform InputHeader &inputHeader,
              uniform int visualizeLightCount,
              // Output
              uniform unsigned int8 framebuffer_r[],
              uniform unsigned int8 framebuffer_g[],
              uniform unsigned int8 framebuffer_b[]) {
 
+  const uniform InputHeader inputHeader = *inputHeaderPtr;
+  const uniform InputDataArrays inputData = *inputDataPtr;
+
+
     uniform int num_groups_x = (inputHeader.framebufferWidth + 
                                 MIN_TILE_WIDTH - 1) / MIN_TILE_WIDTH;
     uniform int num_groups_y = (inputHeader.framebufferHeight + 
diff --git a/examples_cuda/deferred/main_cu.cpp b/examples_cuda/deferred/main_cu.cpp
index eafb28b9..7fb02c85 100755
--- a/examples_cuda/deferred/main_cu.cpp
+++ b/examples_cuda/deferred/main_cu.cpp
@@ -124,10 +124,12 @@ void destroyContext()
 
 CUmodule loadModule(const char * module)
 {
+  const double t0 = rtc();
   CUmodule cudaModule;
   // in this branch we use compilation with parameters
 
-  const unsigned int jitNumOptions = 1;
+#if 0
+  unsigned int jitNumOptions = 1;
   CUjit_option *jitOptions = new CUjit_option[jitNumOptions];
   void **jitOptVals = new void*[jitNumOptions];
   // set up pointer to set the Maximum # of registers for a particular kernel
@@ -136,23 +138,106 @@ CUmodule loadModule(const char * module)
   jitOptVals[0] = (void *)(size_t)jitRegCount;
 #if 0
 
-        // set up size of compilation log buffer
-        jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
-        int jitLogBufferSize = 1024;
-        jitOptVals[0] = (void *)(size_t)jitLogBufferSize;
+  {
+    jitNumOptions = 3;
+    // set up size of compilation log buffer
+    jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
+    int jitLogBufferSize = 1024;
+    jitOptVals[0] = (void *)(size_t)jitLogBufferSize;
 
-        // set up pointer to the compilation log buffer
-        jitOptions[1] = CU_JIT_INFO_LOG_BUFFER;
-        char *jitLogBuffer = new char[jitLogBufferSize];
-        jitOptVals[1] = jitLogBuffer;
+    // set up pointer to the compilation log buffer
+    jitOptions[1] = CU_JIT_INFO_LOG_BUFFER;
+    char *jitLogBuffer = new char[jitLogBufferSize];
+    jitOptVals[1] = jitLogBuffer;
 
-        // set up pointer to set the Maximum # of registers for a particular kernel
-        jitOptions[2] = CU_JIT_MAX_REGISTERS;
-        int jitRegCount = 32;
-        jitOptVals[2] = (void *)(size_t)jitRegCount;
+    // set up pointer to set the Maximum # of registers for a particular kernel
+    jitOptions[2] = CU_JIT_MAX_REGISTERS;
+    int jitRegCount = 32;
+    jitOptVals[2] = (void *)(size_t)jitRegCount;
+  }
 #endif
 
   checkCudaErrors(cuModuleLoadDataEx(&cudaModule, module,jitNumOptions, jitOptions, (void **)jitOptVals));
+#else
+  CUlinkState  CUState;
+  CUlinkState *lState = &CUState;
+  const int nOptions = 7;
+    CUjit_option options[nOptions];
+    void* optionVals[nOptions];
+    float walltime;
+    const unsigned int logSize = 32768;
+    char error_log[logSize],
+         info_log[logSize];
+    void *cuOut;
+    size_t outSize;
+    int myErr = 0;
+
+    // Setup linker options
+    // Return walltime from JIT compilation
+    options[0] = CU_JIT_WALL_TIME;
+    optionVals[0] = (void*) &walltime;
+    // Pass a buffer for info messages
+    options[1] = CU_JIT_INFO_LOG_BUFFER;
+    optionVals[1] = (void*) info_log;
+    // Pass the size of the info buffer
+    options[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
+    optionVals[2] = (void*) logSize;
+    // Pass a buffer for error message
+    options[3] = CU_JIT_ERROR_LOG_BUFFER;
+    optionVals[3] = (void*) error_log;
+    // Pass the size of the error buffer
+    options[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
+    optionVals[4] = (void*) logSize;
+    // Make the linker verbose
+    options[5] = CU_JIT_LOG_VERBOSE;
+    optionVals[5] = (void*) 1;
+    // Max # of registers/pthread
+    options[6] = CU_JIT_MAX_REGISTERS;
+    int jitRegCount = 48;
+    optionVals[6] = (void *)(size_t)jitRegCount;
+
+    // Create a pending linker invocation
+    checkCudaErrors(cuLinkCreate(nOptions,options, optionVals, lState));
+
+#if 0
+    if (sizeof(void *)==4)
+    {
+        // Load the PTX from the string myPtx32
+        printf("Loading myPtx32[] program\n");
+        // PTX May also be loaded from file, as per below.
+        myErr = cuLinkAddData(*lState, CU_JIT_INPUT_PTX, (void*)myPtx32, strlen(myPtx32)+1, 0, 0, 0, 0);
+    }
+    else
+#endif
+    {
+        // Load the PTX from the string myPtx (64-bit)
+        fprintf(stderr, "Loading ptx..\n");
+        myErr = cuLinkAddData(*lState, CU_JIT_INPUT_PTX, (void*)module, strlen(module)+1, 0, 0, 0, 0);
+        myErr = cuLinkAddFile(*lState, CU_JIT_INPUT_LIBRARY, "libcudadevrt.a", 0,0,0); 
+        // PTX May also be loaded from file, as per below.
+        // myErr = cuLinkAddFile(*lState, CU_JIT_INPUT_PTX, "myPtx64.ptx",0,0,0);
+    }
+
+    // Complete the linker step
+    myErr = cuLinkComplete(*lState, &cuOut, &outSize);
+
+    if ( myErr != CUDA_SUCCESS )
+    {
+      // Errors will be put in error_log, per CU_JIT_ERROR_LOG_BUFFER option above. 
+      fprintf(stderr,"PTX Linker Error:\n%s\n",error_log);
+      assert(0);
+    }    
+
+    // Linker walltime and info_log were requested in options above.
+    fprintf(stderr, "CUDA Link Completed in %fms [ %g ms]. Linker Output:\n%s\n",walltime,info_log,1e3*(rtc() - t0));
+
+    // Load resulting cuBin into module
+    checkCudaErrors(cuModuleLoadData(&cudaModule, cuOut));
+
+    // Destroy the linker invocation
+    checkCudaErrors(cuLinkDestroy(*lState));
+#endif
+  fprintf(stderr, " loadModule took %g ms \n", 1e3*(rtc() - t0));
   return cudaModule;
 }
 void unloadModule(CUmodule &cudaModule)
@@ -185,16 +270,17 @@ void memcpyH2D(CUdeviceptr d_buf, void * h_buf, const size_t size)
 {
   checkCudaErrors(cuMemcpyHtoD(d_buf, h_buf, size));
 }
-#define deviceLaunch(func,nbx,nby,nbz,params) \
+#define deviceLaunch(func,params) \
   checkCudaErrors(cuFuncSetCacheConfig((func), CU_FUNC_CACHE_PREFER_SHARED)); \
   checkCudaErrors( \
       cuLaunchKernel( \
         (func), \
-        ((nbx-1)/(128/32)+1), (nby), (nbz), \
-        128, 1, 1, \
+        1,1,1, \
+        32, 1, 1, \
         0, NULL, (params), NULL \
         ));
 
+
 typedef CUdeviceptr devicePtr;
 
 
@@ -241,25 +327,14 @@ extern "C"
   }
   void CUDALaunch(
       void **handlePtr, 
-      const char * module_name,
-      const char * module_1,
       const char * func_name,
-      void **func_args, 
-      int countx, int county, int countz)
+      void **func_args)
   {
-    assert(module_name != NULL);
-    assert(module_1 != NULL);
-    assert(func_name != NULL);
-    assert(func_args != NULL);
-#if 0
-    const char * module = module_1;
-#else
-    const std::vector<char> module_str = readBinary("kernel.cubin");
+    const std::vector<char> module_str = readBinary("kernel.ptx");
     const char *  module = &module_str[0];
-#endif
     CUmodule   cudaModule   = loadModule(module);
     CUfunction cudaFunction = getFunction(cudaModule, func_name);
-    deviceLaunch(cudaFunction, countx, county, countz, func_args);
+    deviceLaunch(cudaFunction, func_args);
     unloadModule(cudaModule);
   }
   void CUDASync(void *handle)
@@ -372,14 +447,18 @@ int main(int argc, char** argv) {
         framebuffer.clear();
         const double t0 = rtc();
         for (int j = 0; j < nframes; ++j)
-            ispc::RenderStatic(
-                (ispc::InputHeader*)d_header, 
-                (ispc::InputDataArrays*)d_arrays,
-                input->header,
-                VISUALIZE_LIGHT_COUNT,
-                (uint8_t*)d_r, 
-                (uint8_t*)d_g, 
-                (uint8_t*)d_b);
+        {
+        const char * func_name = "RenderStatic";
+        int light_count = VISUALIZE_LIGHT_COUNT;
+        void *func_args[] = {
+                &d_header, 
+                &d_arrays,
+                &light_count,
+                &d_r, 
+                &d_g, 
+                &d_b};
+        CUDALaunch(NULL, func_name, func_args);
+        }
         double mcycles = 1000*(rtc() - t0) / nframes;
         fprintf(stderr, "dt= %g\n", mcycles);
         ispcCycles = std::min(ispcCycles, mcycles);