diff --git a/builtins/target-nvptx64.ll b/builtins/target-nvptx64.ll
index f3c1a314..65cf405a 100644
--- a/builtins/target-nvptx64.ll
+++ b/builtins/target-nvptx64.ll
@@ -484,25 +484,28 @@ svml_stubs(double,d,WIDTH)
 
 define  i64 @__movmsk(<1 x i1>) nounwind readnone alwaysinline {
   %v = extractelement <1 x i1> %0, i32 0
   %v64 = zext i1 %v to i64
   ret i64 %v64
 }
 
 define  i1 @__any(<1 x i1>) nounwind readnone alwaysinline {
   %v = extractelement <1 x i1> %0, i32 0
-   %cmp = icmp ne i1 %v, 0
+  %res = call i32 @__ballot(i1 %v)  ; NOTE(review): __ballot is defined outside this hunk; presumably a warp vote yielding one bit per lane - confirm
+  %cmp = icmp ne i32 %res, 0  ; any(): true when the ballot result is non-zero
   ret i1 %cmp
 }
 
 define  i1 @__all(<1 x i1>) nounwind readnone alwaysinline {
   %v = extractelement <1 x i1> %0, i32 0
-  %cmp = icmp eq i1 %v, 1
+  %res = call i32 @__ballot(i1 %v)  ; NOTE(review): __ballot is defined outside this hunk; presumably a warp vote yielding one bit per lane - confirm
+  %cmp = icmp eq i32 %res, -1  ; all(): every bit set (0xFFFFFFFF), not 31 (0b11111); NOTE(review): assumes all 32 lanes participate - confirm
   ret i1 %cmp
 }
 
 define  i1 @__none(<1 x i1>) nounwind readnone alwaysinline {
   %v = extractelement <1 x i1> %0, i32 0
-  %cmp = icmp eq i1 %v, 0
+  %res = call i32 @__ballot(i1 %v)  ; NOTE(review): __ballot is defined outside this hunk; presumably a warp vote yielding one bit per lane - confirm
+  %cmp = icmp eq i32 %res, 0  ; none(): true when the ballot result is zero
   ret i1 %cmp
 }
 
diff --git a/examples_cuda/deferred/common.cpp b/examples_cuda/deferred/common.cpp
index fa4ee57b..309be180 100644
--- a/examples_cuda/deferred/common.cpp
+++ b/examples_cuda/deferred/common.cpp
@@ -131,6 +131,7 @@ CreateInputDataFromFile(const char *path) {
         fprintf(stderr, "Preumature EOF reading file \"%s\"\n", path);
         return NULL;
     }
+    fprintf(stderr, " numLights= %d\n", input->header.numLights);
 
     // Load data chunk and update pointers
     input->chunk = (uint8_t *)lAlignedMalloc(input->header.inputDataChunkSize, 
diff --git a/examples_cuda/deferred/kernels.ispc b/examples_cuda/deferred/kernels.ispc
index 82f8bdf7..2b90c014 100644
--- a/examples_cuda/deferred/kernels.ispc
+++ b/examples_cuda/deferred/kernels.ispc
@@ -200,36 +200,33 @@ IntersectLightsWithTileMinMax(
         // don't actually need to mask the rest of this function - this is
         // just a greedy early-out.  Could also structure all of this as
         // nested if() statements, but this a bit easier to read
-      bool active = false;
-        if (any(inFrustum)) {
-            float light_positionView_x = light_positionView_x_array[lightIndex];
-            float light_positionView_y = light_positionView_y_array[lightIndex];
+      if (any(inFrustum)) {
+        float light_positionView_x = light_positionView_x_array[lightIndex];
+        float light_positionView_y = light_positionView_y_array[lightIndex];
 
-            d = light_positionView_z * frustumPlanes_z[0] + 
-                light_positionView_x * frustumPlanes_xy[0];
-            inFrustum = inFrustum && (d >= light_attenuationEndNeg);
+        d = light_positionView_z * frustumPlanes_z[0] + 
+          light_positionView_x * frustumPlanes_xy[0];
+        inFrustum = inFrustum && (d >= light_attenuationEndNeg);
 
-            d = light_positionView_z * frustumPlanes_z[1] + 
-                light_positionView_x * frustumPlanes_xy[1];
-            inFrustum = inFrustum && (d >= light_attenuationEndNeg);
+        d = light_positionView_z * frustumPlanes_z[1] + 
+          light_positionView_x * frustumPlanes_xy[1];
+        inFrustum = inFrustum && (d >= light_attenuationEndNeg);
 
-            d = light_positionView_z * frustumPlanes_z[2] + 
-                light_positionView_y * frustumPlanes_xy[2];
-            inFrustum = inFrustum && (d >= light_attenuationEndNeg);
+        d = light_positionView_z * frustumPlanes_z[2] + 
+          light_positionView_y * frustumPlanes_xy[2];
+        inFrustum = inFrustum && (d >= light_attenuationEndNeg);
 
-            d = light_positionView_z * frustumPlanes_z[3] + 
-                light_positionView_y * frustumPlanes_xy[3];
-            inFrustum = inFrustum && (d >= light_attenuationEndNeg);
-        
-            // Pack and store intersecting lights
-            if (inFrustum)
-              active = true;
-            }
-            if (lightIndex >= numLights) 
-              active = false;
+        d = light_positionView_z * frustumPlanes_z[3] + 
+          light_positionView_y * frustumPlanes_xy[3];
+        inFrustum = inFrustum && (d >= light_attenuationEndNeg);
 
-            tileNumLights += packed_store_active(active, &tileLightIndices[tileNumLights], lightIndex);
-        }
+        // Pack and store intersecting lights
+        const bool active = inFrustum && lightIndex < numLights;
+
+        if (any(active))
+          tileNumLights += packed_store_active(active, &tileLightIndices[tileNumLights], lightIndex);
+      }
+    }
 
     return tileNumLights;
 }
diff --git a/examples_cuda/deferred/kernels1.ispc b/examples_cuda/deferred/kernels1.ispc
index 251da1f2..07afe33a 100644
--- a/examples_cuda/deferred/kernels1.ispc
+++ b/examples_cuda/deferred/kernels1.ispc
@@ -37,6 +37,7 @@
 #define programIndex laneIndex()
 #define taskIndex    blockIndex0()
 #define taskCount    blockCount0()
+#define cif          if
 #else
 #warning "emitting HOST code"
 #endif
@@ -212,8 +213,8 @@ IntersectLightsWithTileMinMax(
       // don't actually need to mask the rest of this function - this is
       // just a greedy early-out.  Could also structure all of this as
       // nested if() statements, but this a bit easier to read
-      bool active = false;
-      if (any(inFrustum)) {
+      if (any(inFrustum)) 
+      {
         float light_positionView_x = light_positionView_x_array[lightIndex];
         float light_positionView_y = light_positionView_y_array[lightIndex];
 
@@ -234,13 +235,11 @@ IntersectLightsWithTileMinMax(
         inFrustum = inFrustum && (d >= light_attenuationEndNeg);
 
         // Pack and store intersecting lights
-        if (inFrustum) 
-          active = true;
-      }
-      if (lightIndex >= numLights) 
-        active = false;
+        const bool active = inFrustum && lightIndex < numLights;
 
-      tileNumLights += packed_store_active(active, &tileLightIndices[tileNumLights], lightIndex);
+        if(any(active))
+          tileNumLights += packed_store_active(active, &tileLightIndices[tileNumLights], lightIndex);
+      }
     }
 
     return tileNumLights;
@@ -402,7 +401,7 @@ ShadeTile(
                     // Clip at end of attenuation
                     float light_attenutaionEnd2 = light_attenuationEnd * light_attenuationEnd;
 
-                    if (distanceToLight2 < light_attenutaionEnd2) {                    
+                    cif (distanceToLight2 < light_attenutaionEnd2) {                    
                         float distanceToLight = sqrt(distanceToLight2);
 
                         // HLSL "rcp" is allowed to be fairly inaccurate
@@ -416,7 +415,7 @@ ShadeTile(
                                            surface_normal_z, L_x, L_y, L_z);
                     
                         // Clip back facing
-                        if (NdotL > 0.0f) {
+                        cif (NdotL > 0.0f) {
                             uniform float light_attenuationBegin = 
                                 inputData.lightAttenuationBegin[lightIndex];
 
diff --git a/examples_cuda/deferred/main_cu.cpp b/examples_cuda/deferred/main_cu.cpp
index df96b209..e16745ec 100755
--- a/examples_cuda/deferred/main_cu.cpp
+++ b/examples_cuda/deferred/main_cu.cpp
@@ -186,7 +186,7 @@ void memcpyH2D(CUdeviceptr d_buf, void * h_buf, const size_t size)
   checkCudaErrors(cuMemcpyHtoD(d_buf, h_buf, size));
 }
 #define deviceLaunch(func,nbx,nby,nbz,params) \
-  checkCudaErrors(cuFuncSetCacheConfig((func), CU_FUNC_CACHE_PREFER_EQUAL)); \
+  checkCudaErrors(cuFuncSetCacheConfig((func), CU_FUNC_CACHE_PREFER_L1)); \
   checkCudaErrors( \
       cuLaunchKernel( \
         (func), \
@@ -251,7 +251,7 @@ extern "C"
     assert(module_1 != NULL);
     assert(func_name != NULL);
     assert(func_args != NULL);
-#if 1
+#if 0
     const char * module = module_1;
 #else
     const std::vector<char> module_str = readBinary("kernel.cubin");
@@ -388,7 +388,7 @@ int main(int argc, char** argv) {
     memcpyD2H(framebuffer.g, d_g, buffsize);
     memcpyD2H(framebuffer.b, d_b, buffsize);
 
-    printf("[ispc static + tasks]:\t\t[%.3f] million cycles to render "
+    printf("[ispc cuda]:\t\t[%.3f] million cycles to render "
            "%d x %d image\n", ispcCycles,
            input->header.framebufferWidth, input->header.framebufferHeight);
     WriteFrame("deferred-cuda.ppm", input, framebuffer);