diff --git a/examples_cuda/deferred/kernels.ispc b/examples_cuda/deferred/kernels.ispc
index 2b90c014..7bc42777 100644
--- a/examples_cuda/deferred/kernels.ispc
+++ b/examples_cuda/deferred/kernels.ispc
@@ -116,11 +116,8 @@ ComputeZBounds(
     float laneMinZ = cameraFar;
     float laneMaxZ = cameraNear;
     for (uniform int32 y = tileStartY; y < tileEndY; ++y) {
-//        foreach (x = tileStartX ... tileEndX) 
-        for (uniform int xb = tileStartX; xb < tileEndX; xb += programCount)
+        foreach (x = tileStartX ... tileEndX) 
         {
-          const int x = xb + programIndex;
-          if (x >= tileEndX) break;
             // Unproject depth buffer Z value into view space
             float z = zBuffer[y * gBufferWidth + x];
             float viewSpaceZ = cameraProj_43 / (z - cameraProj_33);
@@ -182,10 +179,8 @@ IntersectLightsWithTileMinMax(
 
     uniform int32 tileNumLights = 0;
 
-//    foreach (lightIndex = 0 ... numLights) 
-    for (uniform int lightIndexB = 0; lightIndexB < numLights; lightIndexB += programCount)
+    foreach (lightIndex = 0 ... numLights) 
     {
-      const int lightIndex = lightIndexB + programIndex;
         float light_positionView_z = light_positionView_z_array[lightIndex];
         float light_attenuationEnd = light_attenuationEnd_array[lightIndex];
         float light_attenuationEndNeg = -light_attenuationEnd;
@@ -292,11 +287,8 @@ ShadeTile(
     if (tileNumLights == 0 || visualizeLightCount) {
         uniform unsigned int8 c = (unsigned int8)(min(tileNumLights << 2, 255));
         for (uniform int32 y = tileStartY; y < tileEndY; ++y) {
-//            foreach (x = tileStartX ... tileEndX) 
-            for (uniform int xb = tileStartX ; xb < tileEndX; xb += programCount)
+            foreach (x = tileStartX ... tileEndX) 
             { 
-              const int x = xb + programIndex;
-              if (x >= tileEndX) continue;
                 int32 framebufferIndex = (y * gBufferWidth + x);
                 framebuffer_r[framebufferIndex] = c;
                 framebuffer_g[framebufferIndex] = c;
@@ -310,10 +302,7 @@ ShadeTile(
         for (uniform int32 y = tileStartY; y < tileEndY; ++y) {
             uniform float positionScreen_y = -(((0.5f + y) * twoOverGBufferHeight) - 1.f);
 
-//            foreach (x = tileStartX ... tileEndX) {
-            for (uniform int xb = tileStartX ; xb < tileEndX; xb += programCount)
-            { 
-              const int x = xb + programIndex;
+            foreach (x = tileStartX ... tileEndX) {
                 int32 gBufferOffset = y * gBufferWidth + x;
                 
                 // Reconstruct position and (negative) view vector from G-buffer
diff --git a/examples_cuda/deferred/kernels1.ispc b/examples_cuda/deferred/kernels1.ispc
index 5def1900..740d86a3 100644
--- a/examples_cuda/deferred/kernels1.ispc
+++ b/examples_cuda/deferred/kernels1.ispc
@@ -127,19 +127,20 @@ ComputeZBounds(
     // Find Z bounds
     float laneMinZ = cameraFar;
     float laneMaxZ = cameraNear;
-    foreach_tiled (y = tileStartY ... tileEndY, x = tileStartX ... tileEndX) 
-    {
-      // Unproject depth buffer Z value into view space
-      float z = zBuffer[y * gBufferWidth + x];
-      float viewSpaceZ = cameraProj_43 / (z - cameraProj_33);
+    for (uniform int32 y = tileStartY; y < tileEndY; ++y) 
+      foreach (x = tileStartX ... tileEndX) 
+      {
+        // Unproject depth buffer Z value into view space
+        float z = zBuffer[y * gBufferWidth + x];
+        float viewSpaceZ = cameraProj_43 / (z - cameraProj_33);
 
-      // Work out Z bounds for our samples
-      // Avoid considering skybox/background or otherwise invalid pixels
-      if ((viewSpaceZ < cameraFar) && (viewSpaceZ >= cameraNear)) {
-        laneMinZ = min(laneMinZ, viewSpaceZ);
-        laneMaxZ = max(laneMaxZ, viewSpaceZ);
+        // Work out Z bounds for our samples
+        // Avoid considering skybox/background or otherwise invalid pixels
+        if ((viewSpaceZ < cameraFar) && (viewSpaceZ >= cameraNear)) {
+          laneMinZ = min(laneMinZ, viewSpaceZ);
+          laneMaxZ = max(laneMaxZ, viewSpaceZ);
+        }
       }
-    }
     minZ = reduce_min(laneMinZ);
     maxZ = reduce_max(laneMaxZ);
 }
@@ -297,160 +298,163 @@ ShadeTile(
 {
     if (tileNumLights == 0 || visualizeLightCount) {
         uniform unsigned int8 c = (unsigned int8)(min(tileNumLights << 2, 255));
-        foreach_tiled (y = tileStartY ... tileEndY, x = tileStartX ... tileEndX) 
-        {
-          int32 framebufferIndex = (y * gBufferWidth + x);
-          framebuffer_r[framebufferIndex] = c;
-          framebuffer_g[framebufferIndex] = c;
-          framebuffer_b[framebufferIndex] = c;
-        }
+        for (uniform int32 y = tileStartY; y < tileEndY; ++y) 
+          foreach (x = tileStartX ... tileEndX) 
+          { 
+            int32 framebufferIndex = (y * gBufferWidth + x);
+            framebuffer_r[framebufferIndex] = c;
+            framebuffer_g[framebufferIndex] = c;
+            framebuffer_b[framebufferIndex] = c;
+          }
     } else {
         uniform float twoOverGBufferWidth = 2.0f / gBufferWidth;
         uniform float twoOverGBufferHeight = 2.0f / gBufferHeight;
         
-        foreach_tiled (y = tileStartY ... tileEndY, x = tileStartX ... tileEndX) 
-        {
-          float positionScreen_y = -(((0.5f + y) * twoOverGBufferHeight) - 1.f);
-          int32 gBufferOffset = y * gBufferWidth + x;
+        for (uniform int32 y = tileStartY; y < tileEndY; ++y) {
+          uniform float positionScreen_y = -(((0.5f + y) * twoOverGBufferHeight) - 1.f);
 
-          // Reconstruct position and (negative) view vector from G-buffer
-          float surface_positionView_x, surface_positionView_y, surface_positionView_z;
-          float Vneg_x, Vneg_y, Vneg_z;
+          foreach (x = tileStartX ... tileEndX) {
+            int32 gBufferOffset = y * gBufferWidth + x;
 
-          float z = inputData.zBuffer[gBufferOffset];
+            // Reconstruct position and (negative) view vector from G-buffer
+            float surface_positionView_x, surface_positionView_y, surface_positionView_z;
+            float Vneg_x, Vneg_y, Vneg_z;
 
-          // Compute screen/clip-space position
-          // NOTE: Mind DX11 viewport transform and pixel center!
-          float positionScreen_x = (0.5f + (float)(x)) * 
-            twoOverGBufferWidth - 1.0f;
+            float z = inputData.zBuffer[gBufferOffset];
 
-          // Unproject depth buffer Z value into view space
-          surface_positionView_z = cameraProj_43 / (z - cameraProj_33);
-          surface_positionView_x = positionScreen_x * surface_positionView_z / 
-            cameraProj_11;
-          surface_positionView_y = positionScreen_y * surface_positionView_z / 
-            cameraProj_22;
+            // Compute screen/clip-space position
+            // NOTE: Mind DX11 viewport transform and pixel center!
+            float positionScreen_x = (0.5f + (float)(x)) * 
+              twoOverGBufferWidth - 1.0f;
 
-          // We actually end up with a vector pointing *at* the
-          // surface (i.e. the negative view vector)
-          normalize3(surface_positionView_x, surface_positionView_y, 
-              surface_positionView_z, Vneg_x, Vneg_y, Vneg_z);
+            // Unproject depth buffer Z value into view space
+            surface_positionView_z = cameraProj_43 / (z - cameraProj_33);
+            surface_positionView_x = positionScreen_x * surface_positionView_z / 
+              cameraProj_11;
+            surface_positionView_y = positionScreen_y * surface_positionView_z / 
+              cameraProj_22;
 
-          // Reconstruct normal from G-buffer
-          float surface_normal_x, surface_normal_y, surface_normal_z;
-          float normal_x = half_to_float(inputData.normalEncoded_x[gBufferOffset]);
-          float normal_y = half_to_float(inputData.normalEncoded_y[gBufferOffset]);
+            // We actually end up with a vector pointing *at* the
+            // surface (i.e. the negative view vector)
+            normalize3(surface_positionView_x, surface_positionView_y, 
+                surface_positionView_z, Vneg_x, Vneg_y, Vneg_z);
 
-          float f = (normal_x - normal_x * normal_x) + (normal_y - normal_y * normal_y);
-          float m = sqrt(4.0f * f - 1.0f);
+            // Reconstruct normal from G-buffer
+            float surface_normal_x, surface_normal_y, surface_normal_z;
+            float normal_x = half_to_float(inputData.normalEncoded_x[gBufferOffset]);
+            float normal_y = half_to_float(inputData.normalEncoded_y[gBufferOffset]);
 
-          surface_normal_x = m * (4.0f * normal_x - 2.0f);
-          surface_normal_y = m * (4.0f * normal_y - 2.0f);
-          surface_normal_z = 3.0f - 8.0f * f;
+            float f = (normal_x - normal_x * normal_x) + (normal_y - normal_y * normal_y);
+            float m = sqrt(4.0f * f - 1.0f);
 
-          // Load other G-buffer parameters
-          float surface_specularAmount = 
-            half_to_float(inputData.specularAmount[gBufferOffset]);
-          float surface_specularPower  = 
-            half_to_float(inputData.specularPower[gBufferOffset]);
-          float surface_albedo_x = Unorm8ToFloat32(inputData.albedo_x[gBufferOffset]);
-          float surface_albedo_y = Unorm8ToFloat32(inputData.albedo_y[gBufferOffset]);
-          float surface_albedo_z = Unorm8ToFloat32(inputData.albedo_z[gBufferOffset]);
+            surface_normal_x = m * (4.0f * normal_x - 2.0f);
+            surface_normal_y = m * (4.0f * normal_y - 2.0f);
+            surface_normal_z = 3.0f - 8.0f * f;
 
-          float lit_x = 0.0f;
-          float lit_y = 0.0f;
-          float lit_z = 0.0f;
-          for (uniform int32 tileLightIndex = 0; tileLightIndex < tileNumLights; 
-              ++tileLightIndex) {
-            uniform int32 lightIndex = tileLightIndices[tileLightIndex];
+            // Load other G-buffer parameters
+            float surface_specularAmount = 
+              half_to_float(inputData.specularAmount[gBufferOffset]);
+            float surface_specularPower  = 
+              half_to_float(inputData.specularPower[gBufferOffset]);
+            float surface_albedo_x = Unorm8ToFloat32(inputData.albedo_x[gBufferOffset]);
+            float surface_albedo_y = Unorm8ToFloat32(inputData.albedo_y[gBufferOffset]);
+            float surface_albedo_z = Unorm8ToFloat32(inputData.albedo_z[gBufferOffset]);
 
-            // Gather light data relevant to initial culling
-            uniform float light_positionView_x = 
-              inputData.lightPositionView_x[lightIndex];
-            uniform float light_positionView_y = 
-              inputData.lightPositionView_y[lightIndex];
-            uniform float light_positionView_z = 
-              inputData.lightPositionView_z[lightIndex];
-            uniform float light_attenuationEnd = 
-              inputData.lightAttenuationEnd[lightIndex];
+            float lit_x = 0.0f;
+            float lit_y = 0.0f;
+            float lit_z = 0.0f;
+            for (uniform int32 tileLightIndex = 0; tileLightIndex < tileNumLights; 
+                ++tileLightIndex) {
+              uniform int32 lightIndex = tileLightIndices[tileLightIndex];
 
-            // Compute light vector
-            float L_x = light_positionView_x - surface_positionView_x;
-            float L_y = light_positionView_y - surface_positionView_y;
-            float L_z = light_positionView_z - surface_positionView_z;
+              // Gather light data relevant to initial culling
+              uniform float light_positionView_x = 
+                inputData.lightPositionView_x[lightIndex];
+              uniform float light_positionView_y = 
+                inputData.lightPositionView_y[lightIndex];
+              uniform float light_positionView_z = 
+                inputData.lightPositionView_z[lightIndex];
+              uniform float light_attenuationEnd = 
+                inputData.lightAttenuationEnd[lightIndex];
 
-            float distanceToLight2 = dot3(L_x, L_y, L_z, L_x, L_y, L_z);
+              // Compute light vector
+              float L_x = light_positionView_x - surface_positionView_x;
+              float L_y = light_positionView_y - surface_positionView_y;
+              float L_z = light_positionView_z - surface_positionView_z;
 
-            // Clip at end of attenuation
-            float light_attenutaionEnd2 = light_attenuationEnd * light_attenuationEnd;
+              float distanceToLight2 = dot3(L_x, L_y, L_z, L_x, L_y, L_z);
 
-            cif (distanceToLight2 < light_attenutaionEnd2) {                    
-              float distanceToLight = sqrt(distanceToLight2);
+              // Clip at end of attenuation
+              float light_attenutaionEnd2 = light_attenuationEnd * light_attenuationEnd;
 
-              // HLSL "rcp" is allowed to be fairly inaccurate
-              float distanceToLightRcp = rcp(distanceToLight);
-              L_x *= distanceToLightRcp;
-              L_y *= distanceToLightRcp;
-              L_z *= distanceToLightRcp;
+              cif (distanceToLight2 < light_attenutaionEnd2) {                    
+                float distanceToLight = sqrt(distanceToLight2);
 
-              // Start computing brdf
-              float NdotL = dot3(surface_normal_x, surface_normal_y, 
-                  surface_normal_z, L_x, L_y, L_z);
+                // HLSL "rcp" is allowed to be fairly inaccurate
+                float distanceToLightRcp = rcp(distanceToLight);
+                L_x *= distanceToLightRcp;
+                L_y *= distanceToLightRcp;
+                L_z *= distanceToLightRcp;
 
-              // Clip back facing
-              cif (NdotL > 0.0f) {
-                uniform float light_attenuationBegin = 
-                  inputData.lightAttenuationBegin[lightIndex];
+                // Start computing brdf
+                float NdotL = dot3(surface_normal_x, surface_normal_y, 
+                    surface_normal_z, L_x, L_y, L_z);
 
-                // Light distance attenuation (linstep)
-                float lightRange = (light_attenuationEnd - light_attenuationBegin);
-                float falloffPosition = (light_attenuationEnd - distanceToLight);
-                float attenuation = min(falloffPosition / lightRange, 1.0f);
+                // Clip back facing
+                cif (NdotL > 0.0f) {
+                  uniform float light_attenuationBegin = 
+                    inputData.lightAttenuationBegin[lightIndex];
 
-                float H_x = (L_x - Vneg_x);
-                float H_y = (L_y - Vneg_y);
-                float H_z = (L_z - Vneg_z);
-                normalize3(H_x, H_y, H_z, H_x, H_y, H_z);
+                  // Light distance attenuation (linstep)
+                  float lightRange = (light_attenuationEnd - light_attenuationBegin);
+                  float falloffPosition = (light_attenuationEnd - distanceToLight);
+                  float attenuation = min(falloffPosition / lightRange, 1.0f);
 
-                float NdotH = dot3(surface_normal_x, surface_normal_y, 
-                    surface_normal_z, H_x, H_y, H_z);
-                NdotH = max(NdotH, 0.0f);
+                  float H_x = (L_x - Vneg_x);
+                  float H_y = (L_y - Vneg_y);
+                  float H_z = (L_z - Vneg_z);
+                  normalize3(H_x, H_y, H_z, H_x, H_y, H_z);
 
-                float specular = pow(NdotH, surface_specularPower);
-                float specularNorm = (surface_specularPower + 2.0f) * 
-                  (1.0f / 8.0f);
-                float specularContrib = surface_specularAmount * 
-                  specularNorm * specular;
+                  float NdotH = dot3(surface_normal_x, surface_normal_y, 
+                      surface_normal_z, H_x, H_y, H_z);
+                  NdotH = max(NdotH, 0.0f);
 
-                float k = attenuation * NdotL * (1.0f + specularContrib);
+                  float specular = pow(NdotH, surface_specularPower);
+                  float specularNorm = (surface_specularPower + 2.0f) * 
+                    (1.0f / 8.0f);
+                  float specularContrib = surface_specularAmount * 
+                    specularNorm * specular;
 
-                uniform float light_color_x = inputData.lightColor_x[lightIndex];
-                uniform float light_color_y = inputData.lightColor_y[lightIndex];
-                uniform float light_color_z = inputData.lightColor_z[lightIndex];
+                  float k = attenuation * NdotL * (1.0f + specularContrib);
 
-                float lightContrib_x = surface_albedo_x * light_color_x;
-                float lightContrib_y = surface_albedo_y * light_color_y;
-                float lightContrib_z = surface_albedo_z * light_color_z;
+                  uniform float light_color_x = inputData.lightColor_x[lightIndex];
+                  uniform float light_color_y = inputData.lightColor_y[lightIndex];
+                  uniform float light_color_z = inputData.lightColor_z[lightIndex];
 
-                lit_x += lightContrib_x * k;
-                lit_y += lightContrib_y * k;
-                lit_z += lightContrib_z * k;
+                  float lightContrib_x = surface_albedo_x * light_color_x;
+                  float lightContrib_y = surface_albedo_y * light_color_y;
+                  float lightContrib_z = surface_albedo_z * light_color_z;
+
+                  lit_x += lightContrib_x * k;
+                  lit_y += lightContrib_y * k;
+                  lit_z += lightContrib_z * k;
+                }
               }
             }
+
+            // Gamma correct
+            // These pows are pretty slow right now, but we can do
+            // something faster if really necessary to squeeze every
+            // last bit of performance out of it
+            float gamma = 1.0 / 2.2f;
+            lit_x = pow(clamp(lit_x, 0.0f, 1.0f), gamma);
+            lit_y = pow(clamp(lit_y, 0.0f, 1.0f), gamma);
+            lit_z = pow(clamp(lit_z, 0.0f, 1.0f), gamma);
+
+            framebuffer_r[gBufferOffset] = Float32ToUnorm8(lit_x);
+            framebuffer_g[gBufferOffset] = Float32ToUnorm8(lit_y);
+            framebuffer_b[gBufferOffset] = Float32ToUnorm8(lit_z);
           }
-
-          // Gamma correct
-          // These pows are pretty slow right now, but we can do
-          // something faster if really necessary to squeeze every
-          // last bit of performance out of it
-          float gamma = 1.0 / 2.2f;
-          lit_x = pow(clamp(lit_x, 0.0f, 1.0f), gamma);
-          lit_y = pow(clamp(lit_y, 0.0f, 1.0f), gamma);
-          lit_z = pow(clamp(lit_z, 0.0f, 1.0f), gamma);
-
-          framebuffer_r[gBufferOffset] = Float32ToUnorm8(lit_x);
-          framebuffer_g[gBufferOffset] = Float32ToUnorm8(lit_y);
-          framebuffer_b[gBufferOffset] = Float32ToUnorm8(lit_z);
         }
     }
 }