working on deferred

2014-01-05 15:30:18 +01:00
parent 50e3b68586
commit cd929b17a7
13 changed files with 3908 additions and 0 deletions
--- a/examples_ptx/deferred/Makefile_cpu
+++ b/examples_ptx/deferred/Makefile_cpu
@@ -0,0 +1,9 @@
+
+EXAMPLE=deferred_shading
+CPP_SRC=common.cpp main.cpp dynamic_c.cpp dynamic_cilk.cpp
+ISPC_SRC=kernels.ispc
+ISPC_IA_TARGETS=sse2-i32x4,sse4-i32x8,avx1-i32x16,avx2-i32x16
+ISPC_ARM_TARGETS=neon
+ISPC_FLAGS=--opt=fast-math
+
+include ../common.mk
--- a/examples_ptx/deferred/common.cpp
+++ b/examples_ptx/deferred/common.cpp
@@ -0,0 +1,210 @@
+/*
+  Copyright (c) 2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+#ifdef _MSC_VER
+#define _CRT_SECURE_NO_WARNINGS
+#define ISPC_IS_WINDOWS
+#elif defined(__linux__)
+#define ISPC_IS_LINUX
+#elif defined(__APPLE__)
+#define ISPC_IS_APPLE
+#endif
+
+#include <fcntl.h>
+#include <float.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <stdint.h>
+#include <algorithm>
+#include <assert.h>
+#include <vector>
+#ifdef ISPC_IS_WINDOWS
+  #define WIN32_LEAN_AND_MEAN
+  #include <windows.h>
+#endif
+#ifdef ISPC_IS_LINUX
+  #include <malloc.h>
+#endif
+#include "deferred.h"
+#include "../timing.h"
+
+///////////////////////////////////////////////////////////////////////////
+
+static void *
+lAlignedMalloc(size_t size, int32_t alignment) {
+#ifdef ISPC_IS_WINDOWS
+    return _aligned_malloc(size, alignment);
+#endif
+#ifdef ISPC_IS_LINUX
+    return memalign(alignment, size);
+#endif
+#ifdef ISPC_IS_APPLE
+    void *mem = malloc(size + (alignment-1) + sizeof(void*));
+    char *amem = ((char*)mem) + sizeof(void*);
+    amem = amem + uint32_t(alignment - (reinterpret_cast<uint64_t>(amem) &
+                                        (alignment - 1)));
+    ((void**)amem)[-1] = mem;
+    return amem;
+#endif
+}
+
+
+static void
+lAlignedFree(void *ptr) {
+#ifdef ISPC_IS_WINDOWS
+    _aligned_free(ptr);
+#endif
+#ifdef ISPC_IS_LINUX
+    free(ptr);
+#endif
+#ifdef ISPC_IS_APPLE
+    free(((void**)ptr)[-1]);
+#endif
+}
+
+
+Framebuffer::Framebuffer(int width, int height) {
+    nPixels = width*height;
+    r = (uint8_t *)lAlignedMalloc(nPixels, ALIGNMENT_BYTES);
+    g = (uint8_t *)lAlignedMalloc(nPixels, ALIGNMENT_BYTES);
+    b = (uint8_t *)lAlignedMalloc(nPixels, ALIGNMENT_BYTES);
+}
+
+
+Framebuffer::~Framebuffer() {
+    lAlignedFree(r);
+    lAlignedFree(g);
+    lAlignedFree(b);
+}
+
+
+void
+Framebuffer::clear() {
+    memset(r, 0, nPixels);
+    memset(g, 0, nPixels);
+    memset(b, 0, nPixels);
+}
+
+
+InputData *
+CreateInputDataFromFile(const char *path) {
+    FILE *in = fopen(path, "rb");
+    if (!in) return 0;
+
+    InputData *input = new InputData;
+
+    // Load header
+    if (fread(&input->header, sizeof(ispc::InputHeader), 1, in) != 1) {
+        fprintf(stderr, "Preumature EOF reading file \"%s\"\n", path);
+        return NULL;
+    }
+
+    // Load data chunk and update pointers
+    input->chunk = (uint8_t *)lAlignedMalloc(input->header.inputDataChunkSize, 
+                                             ALIGNMENT_BYTES);
+    if (fread(input->chunk, input->header.inputDataChunkSize, 1, in) != 1) {
+        fprintf(stderr, "Preumature EOF reading file \"%s\"\n", path);
+        return NULL;
+    }
+    
+    input->arrays.zBuffer =
+        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaZBuffer]];
+    input->arrays.normalEncoded_x =
+        (uint16_t *)&input->chunk[input->header.inputDataArrayOffsets[idaNormalEncoded_x]];
+    input->arrays.normalEncoded_y =
+        (uint16_t *)&input->chunk[input->header.inputDataArrayOffsets[idaNormalEncoded_y]];
+    input->arrays.specularAmount =
+        (uint16_t *)&input->chunk[input->header.inputDataArrayOffsets[idaSpecularAmount]];
+    input->arrays.specularPower =
+        (uint16_t *)&input->chunk[input->header.inputDataArrayOffsets[idaSpecularPower]];
+    input->arrays.albedo_x =
+        (uint8_t *)&input->chunk[input->header.inputDataArrayOffsets[idaAlbedo_x]];
+    input->arrays.albedo_y =
+        (uint8_t *)&input->chunk[input->header.inputDataArrayOffsets[idaAlbedo_y]];
+    input->arrays.albedo_z =
+        (uint8_t *)&input->chunk[input->header.inputDataArrayOffsets[idaAlbedo_z]];
+    input->arrays.lightPositionView_x =
+        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightPositionView_x]];
+    input->arrays.lightPositionView_y =
+        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightPositionView_y]];
+    input->arrays.lightPositionView_z =
+        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightPositionView_z]];
+    input->arrays.lightAttenuationBegin =
+        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightAttenuationBegin]];
+    input->arrays.lightColor_x =
+        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightColor_x]];
+    input->arrays.lightColor_y =
+        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightColor_y]];
+    input->arrays.lightColor_z =
+        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightColor_z]];
+    input->arrays.lightAttenuationEnd =
+        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightAttenuationEnd]];
+
+    fclose(in);
+    return input;
+}
+
+
+void DeleteInputData(InputData *input) {
+    lAlignedFree(input->chunk);
+}
+
+
+void WriteFrame(const char *filename, const InputData *input,
+                const Framebuffer &framebuffer) {
+    // Deswizzle and copy to RGBA output
+    // Doesn't need to be fast... only happens once
+    size_t imageBytes = 3 * input->header.framebufferWidth * 
+        input->header.framebufferHeight;
+    uint8_t* framebufferAOS = (uint8_t *)lAlignedMalloc(imageBytes, ALIGNMENT_BYTES);
+    memset(framebufferAOS, 0, imageBytes);
+
+    for (int i = 0; i < input->header.framebufferWidth * 
+                        input->header.framebufferHeight; ++i) {
+        framebufferAOS[3 * i + 0] = framebuffer.r[i];
+        framebufferAOS[3 * i + 1] = framebuffer.g[i];
+        framebufferAOS[3 * i + 2] = framebuffer.b[i];
+    }
+    
+    // Write out simple PPM file
+    FILE *out = fopen(filename, "wb");
+    fprintf(out, "P6 %d %d 255\n", input->header.framebufferWidth, 
+            input->header.framebufferHeight);
+    fwrite(framebufferAOS, imageBytes, 1, out);
+    fclose(out);
+
+    lAlignedFree(framebufferAOS);
+}
--- a/examples_ptx/deferred/data/pp1280x720.bin
+++ b/examples_ptx/deferred/data/pp1280x720.bin
--- a/examples_ptx/deferred/data/pp1920x1200.bin
+++ b/examples_ptx/deferred/data/pp1920x1200.bin
--- a/examples_ptx/deferred/deferred.h
+++ b/examples_ptx/deferred/deferred.h
@@ -0,0 +1,108 @@
+/*
+  Copyright (c) 2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+#ifndef DEFERRED_H
+#define DEFERRED_H
+
+// Currently tile widths must be a multiple of SIMD width (i.e. 8 for ispc sse4x2)!
+#define MIN_TILE_WIDTH 16
+#define MIN_TILE_HEIGHT 16
+#define MAX_LIGHTS 1024
+
+enum InputDataArraysEnum {
+    idaZBuffer = 0,
+    idaNormalEncoded_x,
+    idaNormalEncoded_y,
+    idaSpecularAmount,
+    idaSpecularPower,
+    idaAlbedo_x,
+    idaAlbedo_y,
+    idaAlbedo_z,
+    idaLightPositionView_x,
+    idaLightPositionView_y,
+    idaLightPositionView_z,
+    idaLightAttenuationBegin,
+    idaLightColor_x,
+    idaLightColor_y,
+    idaLightColor_z,
+    idaLightAttenuationEnd,
+
+    idaNum
+};
+
+#ifndef ISPC
+
+#include <stdint.h>
+#include "kernels_ispc.h"
+
+#define ALIGNMENT_BYTES 64
+
+#define MAX_LIGHTS 1024
+
+#define VISUALIZE_LIGHT_COUNT 0
+
+struct InputData
+{
+    ispc::InputHeader header;
+    ispc::InputDataArrays arrays;
+    uint8_t *chunk;
+};
+
+
+struct Framebuffer {
+    Framebuffer(int width, int height);
+    ~Framebuffer();
+
+    void clear();
+
+    uint8_t *r, *g, *b;
+
+private:
+    int nPixels;
+    Framebuffer(const Framebuffer &);
+    Framebuffer &operator=(const Framebuffer *);
+};
+
+
+InputData *CreateInputDataFromFile(const char *path);
+void DeleteInputData(InputData *input);
+void WriteFrame(const char *filename, const InputData *input,
+                const Framebuffer &framebuffer);
+void InitDynamicC(InputData *input);
+void InitDynamicCilk(InputData *input);
+void DispatchDynamicC(InputData *input, Framebuffer *framebuffer);
+void DispatchDynamicCilk(InputData *input, Framebuffer *framebuffer);
+
+#endif // !ISPC
+
+#endif // DEFERRED_H
--- a/examples_ptx/deferred/deferred_shading.vcxproj
+++ b/examples_ptx/deferred/deferred_shading.vcxproj
@@ -0,0 +1,36 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>	
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{87f53c53-957e-4e91-878a-bc27828fb9eb}</ProjectGuid>
+    <Keyword>Win32Proj</Keyword>
+    <RootNamespace>deferred</RootNamespace>
+    <ISPC_file>kernels</ISPC_file>
+    <default_targets>sse2,sse4-x2,avx1-x2</default_targets>
+  </PropertyGroup>
+  <Import Project="..\common.props" />
+  <ItemGroup>
+    <ClCompile Include="common.cpp" />
+    <ClCompile Include="dynamic_c.cpp" />
+    <ClCompile Include="dynamic_cilk.cpp" />
+    <ClCompile Include="main.cpp" />
+    <ClCompile Include="../tasksys.cpp" />
+  </ItemGroup>
+</Project>
--- a/examples_ptx/deferred/dynamic_c.cpp
+++ b/examples_ptx/deferred/dynamic_c.cpp
@@ -0,0 +1,870 @@
+/*
+  Copyright (c) 2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+#include "deferred.h"
+#include "kernels_ispc.h"
+#include <algorithm>
+#include <stdint.h>
+#include <assert.h>
+#include <math.h>
+
+#ifdef _MSC_VER
+#define ISPC_IS_WINDOWS
+#elif defined(__linux__)
+#define ISPC_IS_LINUX
+#elif defined(__APPLE__)
+#define ISPC_IS_APPLE
+#endif
+
+#ifdef ISPC_IS_LINUX
+#include <malloc.h>
+#endif // ISPC_IS_LINUX
+
+// Currently tile widths must be a multiple of SIMD width (i.e. 8 for ispc sse4x2)!
+#define MIN_TILE_WIDTH 16
+#define MIN_TILE_HEIGHT 16
+
+
+#define DYNAMIC_TREE_LEVELS 5
+// If this is set to 1 then the result will be identical to the static version
+#define DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE 1
+
+static void *
+lAlignedMalloc(size_t size, int32_t alignment) {
+#ifdef ISPC_IS_WINDOWS
+    return _aligned_malloc(size, alignment);
+#endif
+#ifdef ISPC_IS_LINUX
+    return memalign(alignment, size);
+#endif
+#ifdef ISPC_IS_APPLE
+    void *mem = malloc(size + (alignment-1) + sizeof(void*));
+    char *amem = ((char*)mem) + sizeof(void*);
+    amem = amem + uint32_t(alignment - (reinterpret_cast<uint64_t>(amem) &
+                                        (alignment - 1)));
+    ((void**)amem)[-1] = mem;
+    return amem;
+#endif
+}
+
+
+static void
+lAlignedFree(void *ptr) {
+#ifdef ISPC_IS_WINDOWS
+    _aligned_free(ptr);
+#endif
+#ifdef ISPC_IS_LINUX
+    free(ptr);
+#endif
+#ifdef ISPC_IS_APPLE
+    free(((void**)ptr)[-1]);
+#endif
+}
+
+
+static void
+ComputeZBounds(int tileStartX, int tileEndX,
+               int tileStartY, int tileEndY,
+               // G-buffer data
+               float zBuffer[],
+               int gBufferWidth,
+               // Camera data
+               float cameraProj_33, float cameraProj_43,
+               float cameraNear, float cameraFar,
+               // Output
+               float *minZ, float *maxZ)
+{
+    // Find Z bounds
+    float laneMinZ = cameraFar;
+    float laneMaxZ = cameraNear;
+    for (int y = tileStartY; y < tileEndY; ++y) {
+        for (int x = tileStartX; x < tileEndX; ++x) {
+            // Unproject depth buffer Z value into view space
+            float z = zBuffer[(y * gBufferWidth + x)];
+            float viewSpaceZ = cameraProj_43 / (z - cameraProj_33);
+
+            // Work out Z bounds for our samples
+            // Avoid considering skybox/background or otherwise invalid pixels
+            if ((viewSpaceZ < cameraFar) && (viewSpaceZ >= cameraNear)) {
+                laneMinZ = std::min(laneMinZ, viewSpaceZ);
+                laneMaxZ = std::max(laneMaxZ, viewSpaceZ);
+            }
+        }
+    }
+    *minZ = laneMinZ;
+    *maxZ = laneMaxZ;
+}
+
+
+static void
+ComputeZBoundsRow(int tileY, int tileWidth, int tileHeight,
+                  int numTilesX, int numTilesY,
+                  // G-buffer data
+                  float zBuffer[],
+                  int gBufferWidth,
+                  // Camera data
+                  float cameraProj_33, float cameraProj_43,
+                  float cameraNear, float cameraFar,
+                  // Output
+                  float minZArray[],
+                  float maxZArray[])
+{
+    for (int tileX = 0; tileX < numTilesX; ++tileX) {
+        float minZ, maxZ;
+        ComputeZBounds(tileX * tileWidth, tileX * tileWidth + tileWidth,
+                       tileY * tileHeight, tileY * tileHeight + tileHeight,
+                       zBuffer, gBufferWidth, cameraProj_33, cameraProj_43, 
+                       cameraNear, cameraFar, &minZ, &maxZ);
+        minZArray[tileX] = minZ;
+        maxZArray[tileX] = maxZ;
+    }
+}
+
+
+class MinMaxZTree
+{
+public:
+    // Currently (min) tile dimensions must divide gBuffer dimensions evenly
+    // Levels must be small enough that neither dimension goes below one tile
+    MinMaxZTree(
+        int tileWidth, int tileHeight, int levels,
+        int gBufferWidth, int gBufferHeight)
+        : mTileWidth(tileWidth), mTileHeight(tileHeight), mLevels(levels)
+    {
+        mNumTilesX = gBufferWidth / mTileWidth;
+        mNumTilesY = gBufferHeight / mTileHeight;
+        
+        // Allocate arrays
+        mMinZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16);
+        mMaxZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16);
+        for (int i = 0; i < mLevels; ++i) {
+            int x = NumTilesX(i);
+            int y = NumTilesY(i);
+            assert(x > 0);
+            assert(y > 0);
+            // NOTE: If the following two asserts fire it probably means that
+            // the base tile dimensions do not evenly divide the G-buffer dimensions
+            assert(x * (mTileWidth << i) >= gBufferWidth);
+            assert(y * (mTileHeight << i) >= gBufferHeight);
+            mMinZArrays[i] = (float *)lAlignedMalloc(sizeof(float) * x * y, 16);
+            mMaxZArrays[i] = (float *)lAlignedMalloc(sizeof(float) * x * y, 16);
+        }
+    }
+
+    void Update(float *zBuffer, int gBufferPitchInElements,
+        float cameraProj_33, float cameraProj_43,
+        float cameraNear, float cameraFar)
+    {
+        for (int tileY = 0; tileY < mNumTilesY; ++tileY) {
+            ComputeZBoundsRow(tileY, mTileWidth, mTileHeight, mNumTilesX, mNumTilesY,
+                              zBuffer, gBufferPitchInElements,
+                              cameraProj_33, cameraProj_43, cameraNear, cameraFar,
+                              mMinZArrays[0] + (tileY * mNumTilesX),
+                              mMaxZArrays[0] + (tileY * mNumTilesX));
+        }
+
+        // Generate other levels
+        for (int level = 1; level < mLevels; ++level) {
+            int destTilesX = NumTilesX(level);
+            int destTilesY = NumTilesY(level);
+            int srcLevel = level - 1;
+            int srcTilesX = NumTilesX(srcLevel);
+            int srcTilesY = NumTilesY(srcLevel);
+            for (int y = 0; y < destTilesY; ++y) {
+                for (int x = 0; x < destTilesX; ++x) {
+                    int srcX = x << 1;
+                    int srcY = y << 1;
+                    // NOTE: Ugly branches to deal with non-multiple dimensions at some levels
+                    // TODO: SSE branchless min/max is probably better...
+                    float minZ = mMinZArrays[srcLevel][(srcY) * srcTilesX + (srcX)];
+                    float maxZ = mMaxZArrays[srcLevel][(srcY) * srcTilesX + (srcX)];
+                    if (srcX + 1 < srcTilesX) {
+                        minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY) * srcTilesX + 
+                                                                    (srcX + 1)]);
+                        maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY) * srcTilesX +
+                                                                    (srcX + 1)]);
+                        if (srcY + 1 < srcTilesY) {
+                            minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY + 1) * srcTilesX +
+                                                                        (srcX + 1)]);
+                            maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY + 1) * srcTilesX +
+                                                                        (srcX + 1)]);
+                        }
+                    }
+                    if (srcY + 1 < srcTilesY) {
+                        minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY + 1) * srcTilesX +
+                                                                    (srcX    )]);
+                        maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY + 1) * srcTilesX +
+                                                                    (srcX    )]);
+                    }
+                    mMinZArrays[level][y * destTilesX + x] = minZ;
+                    mMaxZArrays[level][y * destTilesX + x] = maxZ;
+                }
+            }
+        }
+    }
+
+    ~MinMaxZTree() {
+        for (int i = 0; i < mLevels; ++i) {
+            lAlignedFree(mMinZArrays[i]);
+            lAlignedFree(mMaxZArrays[i]);
+        }
+        lAlignedFree(mMinZArrays);
+        lAlignedFree(mMaxZArrays); 
+    }
+
+    int Levels() const { return mLevels; }
+
+    // These round UP, so beware that the last tile for a given level may not be completely full
+    // TODO: Verify this...
+    int NumTilesX(int level = 0) const { return (mNumTilesX + (1 << level) - 1) >> level; }
+    int NumTilesY(int level = 0) const { return (mNumTilesY + (1 << level) - 1) >> level; }
+    int TileWidth(int level = 0) const { return (mTileWidth << level); }
+    int TileHeight(int level = 0) const { return (mTileHeight << level); }
+
+    float MinZ(int level, int tileX, int tileY) const {
+        return mMinZArrays[level][tileY * NumTilesX(level) + tileX];
+    }
+    float MaxZ(int level, int tileX, int tileY) const {
+        return mMaxZArrays[level][tileY * NumTilesX(level) + tileX];
+    }
+
+private:
+    int mTileWidth;
+    int mTileHeight;
+    int mLevels;
+    int mNumTilesX;
+    int mNumTilesY;
+
+    // One array for each "level" in the tree
+    float **mMinZArrays;
+    float **mMaxZArrays;
+};
+
+static MinMaxZTree *gMinMaxZTree = 0;
+
+void InitDynamicC(InputData *input) {
+    gMinMaxZTree = 
+        new MinMaxZTree(MIN_TILE_WIDTH, MIN_TILE_HEIGHT, DYNAMIC_TREE_LEVELS,
+                        input->header.framebufferWidth, 
+                        input->header.framebufferHeight);
+}
+
+
+/* We're going to split a tile into 4 sub-tiles.  This function
+   reclassifies the tile's lights with respect to the sub-tiles. */
+static void
+SplitTileMinMax(
+    int tileMidX, int tileMidY,
+    // Subtile data (00, 10, 01, 11)
+    float subtileMinZ[],
+    float subtileMaxZ[],
+    // G-buffer data
+    int gBufferWidth, int gBufferHeight,
+    // Camera data
+    float cameraProj_11, float cameraProj_22,
+    // Light Data
+    int lightIndices[],
+    int numLights,
+    float light_positionView_x_array[],
+    float light_positionView_y_array[],
+    float light_positionView_z_array[],
+    float light_attenuationEnd_array[],
+    // Outputs
+    int subtileIndices[],
+    int subtileIndicesPitch,
+    int subtileNumLights[]
+    )
+{
+    float gBufferScale_x = 0.5f * (float)gBufferWidth;
+    float gBufferScale_y = 0.5f * (float)gBufferHeight;
+        
+    float frustumPlanes_xy[2] = { -(cameraProj_11 * gBufferScale_x),
+                                   (cameraProj_22 * gBufferScale_y) };
+    float frustumPlanes_z[2] = { tileMidX - gBufferScale_x,
+                                 tileMidY - gBufferScale_y };
+
+    for (int i = 0; i < 2; ++i) {
+        // Normalize
+        float norm = 1.f / sqrtf(frustumPlanes_xy[i] * frustumPlanes_xy[i] + 
+                                 frustumPlanes_z[i] * frustumPlanes_z[i]);
+        frustumPlanes_xy[i] *= norm;
+        frustumPlanes_z[i] *= norm;
+    }
+
+    // Initialize
+    int subtileLightOffset[4];
+    subtileLightOffset[0] = 0 * subtileIndicesPitch;
+    subtileLightOffset[1] = 1 * subtileIndicesPitch;
+    subtileLightOffset[2] = 2 * subtileIndicesPitch;
+    subtileLightOffset[3] = 3 * subtileIndicesPitch;
+
+    for (int i = 0; i < numLights; ++i) {
+        int lightIndex = lightIndices[i];
+
+        float light_positionView_x = light_positionView_x_array[lightIndex];
+        float light_positionView_y = light_positionView_y_array[lightIndex];
+        float light_positionView_z = light_positionView_z_array[lightIndex];
+        float light_attenuationEnd = light_attenuationEnd_array[lightIndex];
+        float light_attenuationEndNeg = -light_attenuationEnd;
+        
+        // Test lights again against subtile z bounds
+        bool inFrustum[4];
+        inFrustum[0] = (light_positionView_z - subtileMinZ[0] >= light_attenuationEndNeg) &&
+            (subtileMaxZ[0] - light_positionView_z >= light_attenuationEndNeg);
+        inFrustum[1] = (light_positionView_z - subtileMinZ[1] >= light_attenuationEndNeg) && 
+            (subtileMaxZ[1] - light_positionView_z >= light_attenuationEndNeg);
+        inFrustum[2] = (light_positionView_z - subtileMinZ[2] >= light_attenuationEndNeg) && 
+            (subtileMaxZ[2] - light_positionView_z >= light_attenuationEndNeg);
+        inFrustum[3] = (light_positionView_z - subtileMinZ[3] >= light_attenuationEndNeg) && 
+            (subtileMaxZ[3] - light_positionView_z >= light_attenuationEndNeg);
+
+        float dx = light_positionView_z * frustumPlanes_z[0] + 
+            light_positionView_x * frustumPlanes_xy[0];
+        float dy = light_positionView_z * frustumPlanes_z[1] +
+            light_positionView_y * frustumPlanes_xy[1];
+        
+        if (fabsf(dx) > light_attenuationEnd) {
+            bool positiveX = dx > 0.0f;
+            inFrustum[0] = inFrustum[0] &&  positiveX;    // 00 subtile
+            inFrustum[1] = inFrustum[1] && !positiveX;    // 10 subtile
+            inFrustum[2] = inFrustum[2] &&  positiveX;    // 01 subtile
+            inFrustum[3] = inFrustum[3] && !positiveX;    // 11 subtile
+        }
+        if (fabsf(dy) > light_attenuationEnd) {
+            bool positiveY = dy > 0.0f;
+            inFrustum[0] = inFrustum[0] &&  positiveY;    // 00 subtile
+            inFrustum[1] = inFrustum[1] &&  positiveY;    // 10 subtile
+            inFrustum[2] = inFrustum[2] && !positiveY;    // 01 subtile
+            inFrustum[3] = inFrustum[3] && !positiveY;    // 11 subtile
+        }
+
+        if (inFrustum[0])
+            subtileIndices[subtileLightOffset[0]++] = lightIndex;
+        if (inFrustum[1])
+            subtileIndices[subtileLightOffset[1]++] = lightIndex;
+        if (inFrustum[2])
+            subtileIndices[subtileLightOffset[2]++] = lightIndex;
+        if (inFrustum[3])
+            subtileIndices[subtileLightOffset[3]++] = lightIndex;
+    }
+
+    subtileNumLights[0] = subtileLightOffset[0] - 0 * subtileIndicesPitch;
+    subtileNumLights[1] = subtileLightOffset[1] - 1 * subtileIndicesPitch;
+    subtileNumLights[2] = subtileLightOffset[2] - 2 * subtileIndicesPitch;
+    subtileNumLights[3] = subtileLightOffset[3] - 3 * subtileIndicesPitch;
+}
+
+
+static inline float
+dot3(float x, float y, float z, float a, float b, float c) {
+    return (x*a + y*b + z*c);
+}
+
+
+static inline void
+normalize3(float x, float y, float z, float &ox, float &oy, float &oz) {
+    float n = 1.f / sqrtf(x*x + y*y + z*z);
+    ox = x * n;
+    oy = y * n;
+    oz = z * n;
+}
+
+
+static inline float
+Unorm8ToFloat32(uint8_t u) {
+    return (float)u * (1.0f / 255.0f);
+}
+
+
+static inline uint8_t
+Float32ToUnorm8(float f) {
+    return (uint8_t)(f * 255.0f);
+}
+
+
+static inline float
+half_to_float_fast(uint16_t h) {
+    uint32_t hs = h & (int32_t)0x8000u;  // Pick off sign bit
+    uint32_t he = h & (int32_t)0x7C00u;  // Pick off exponent bits
+    uint32_t hm = h & (int32_t)0x03FFu;  // Pick off mantissa bits
+
+    // sign
+    uint32_t xs = ((uint32_t) hs) << 16; 
+    // Exponent: unbias the halfp, then bias the single
+    int32_t xes = ((int32_t) (he >> 10)) - 15 + 127; 
+    // Exponent
+    uint32_t xe = (uint32_t) (xes << 23);
+    // Mantissa
+    uint32_t xm = ((uint32_t) hm) << 13; 
+
+    uint32_t bits = (xs | xe | xm);
+    float *fp = reinterpret_cast<float *>(&bits);
+    return *fp;
+}
+
+
+static void
+ShadeTileC(
+    int32_t tileStartX, int32_t tileEndX,
+    int32_t tileStartY, int32_t tileEndY,
+    int32_t gBufferWidth, int32_t gBufferHeight,
+    const ispc::InputDataArrays &inputData,
+    // Camera data
+    float cameraProj_11, float cameraProj_22,
+    float cameraProj_33, float cameraProj_43,
+    // Light list
+    int32_t tileLightIndices[],
+    int32_t tileNumLights,
+    // UI
+    bool visualizeLightCount,
+    // Output
+    uint8_t framebuffer_r[],
+    uint8_t framebuffer_g[],
+    uint8_t framebuffer_b[]
+    )
+{
+    if (tileNumLights == 0 || visualizeLightCount) {
+        uint8_t c = (uint8_t)(std::min(tileNumLights << 2, 255));
+        for (int32_t y = tileStartY; y < tileEndY; ++y) {
+            for (int32_t x = tileStartX; x < tileEndX; ++x) {
+                int32_t framebufferIndex = (y * gBufferWidth + x);
+                framebuffer_r[framebufferIndex] = c;
+                framebuffer_g[framebufferIndex] = c;
+                framebuffer_b[framebufferIndex] = c;
+            }
+        }
+    } else {
+        float twoOverGBufferWidth = 2.0f / gBufferWidth;
+        float twoOverGBufferHeight = 2.0f / gBufferHeight;
+        
+        for (int32_t y = tileStartY; y < tileEndY; ++y) {
+            float positionScreen_y = -(((0.5f + y) * twoOverGBufferHeight) - 1.f);
+
+            for (int32_t x = tileStartX; x < tileEndX; ++x) {
+                int32_t gBufferOffset = y * gBufferWidth + x;
+                
+                // Reconstruct position and (negative) view vector from G-buffer
+                float surface_positionView_x, surface_positionView_y, surface_positionView_z;
+                float Vneg_x, Vneg_y, Vneg_z;
+
+                float z = inputData.zBuffer[gBufferOffset];
+
+                // Compute screen/clip-space position
+                // NOTE: Mind DX11 viewport transform and pixel center!
+                float positionScreen_x = (0.5f + (float)(x)) * 
+                    twoOverGBufferWidth - 1.0f;
+
+                // Unproject depth buffer Z value into view space
+                surface_positionView_z = cameraProj_43 / (z - cameraProj_33);
+                surface_positionView_x = positionScreen_x * surface_positionView_z / 
+                    cameraProj_11;
+                surface_positionView_y = positionScreen_y * surface_positionView_z / 
+                    cameraProj_22;
+                
+                // We actually end up with a vector pointing *at* the
+                // surface (i.e. the negative view vector)
+                normalize3(surface_positionView_x, surface_positionView_y, 
+                           surface_positionView_z, Vneg_x, Vneg_y, Vneg_z);
+
+                // Reconstruct normal from G-buffer
+                float surface_normal_x, surface_normal_y, surface_normal_z;
+                float normal_x = half_to_float_fast(inputData.normalEncoded_x[gBufferOffset]);
+                float normal_y = half_to_float_fast(inputData.normalEncoded_y[gBufferOffset]);
+                    
+                float f = (normal_x - normal_x * normal_x) + (normal_y - normal_y * normal_y);
+                float m = sqrtf(4.0f * f - 1.0f);
+                    
+                surface_normal_x = m * (4.0f * normal_x - 2.0f);
+                surface_normal_y = m * (4.0f * normal_y - 2.0f);
+                surface_normal_z = 3.0f - 8.0f * f;
+
+                // Load other G-buffer parameters
+                float surface_specularAmount = 
+                    half_to_float_fast(inputData.specularAmount[gBufferOffset]);
+                float surface_specularPower  = 
+                    half_to_float_fast(inputData.specularPower[gBufferOffset]);
+                float surface_albedo_x = Unorm8ToFloat32(inputData.albedo_x[gBufferOffset]);
+                float surface_albedo_y = Unorm8ToFloat32(inputData.albedo_y[gBufferOffset]);
+                float surface_albedo_z = Unorm8ToFloat32(inputData.albedo_z[gBufferOffset]);
+                
+                float lit_x = 0.0f;
+                float lit_y = 0.0f;
+                float lit_z = 0.0f;
+                for (int32_t tileLightIndex = 0; tileLightIndex < tileNumLights; 
+                     ++tileLightIndex) {
+                    int32_t lightIndex = tileLightIndices[tileLightIndex];
+                                        
+                    // Gather light data relevant to initial culling
+                    float light_positionView_x = 
+                        inputData.lightPositionView_x[lightIndex];
+                    float light_positionView_y = 
+                        inputData.lightPositionView_y[lightIndex];
+                    float light_positionView_z = 
+                        inputData.lightPositionView_z[lightIndex];
+                    float light_attenuationEnd = 
+                        inputData.lightAttenuationEnd[lightIndex];
+                    
+                    // Compute light vector
+                    float L_x = light_positionView_x - surface_positionView_x;
+                    float L_y = light_positionView_y - surface_positionView_y;
+                    float L_z = light_positionView_z - surface_positionView_z;
+
+                    float distanceToLight2 = dot3(L_x, L_y, L_z, L_x, L_y, L_z);
+                    
+                    // Clip at end of attenuation
+                    float light_attenutaionEnd2 = light_attenuationEnd * light_attenuationEnd;
+
+                    if (distanceToLight2 < light_attenutaionEnd2) {                    
+                        float distanceToLight = sqrtf(distanceToLight2);
+
+                        float distanceToLightRcp = 1.f / distanceToLight;
+                        L_x *= distanceToLightRcp;
+                        L_y *= distanceToLightRcp;
+                        L_z *= distanceToLightRcp;
+
+                        // Start computing brdf
+                        float NdotL = dot3(surface_normal_x, surface_normal_y, 
+                                           surface_normal_z, L_x, L_y, L_z);
+                    
+                        // Clip back facing
+                        if (NdotL > 0.0f) {
+                            float light_attenuationBegin = 
+                                inputData.lightAttenuationBegin[lightIndex];
+
+                            // Light distance attenuation (linstep)
+                            float lightRange = (light_attenuationEnd - light_attenuationBegin);
+                            float falloffPosition = (light_attenuationEnd - distanceToLight);
+                            float attenuation = std::min(falloffPosition / lightRange, 1.0f);
+
+                            float H_x = (L_x - Vneg_x);
+                            float H_y = (L_y - Vneg_y);
+                            float H_z = (L_z - Vneg_z);
+                            normalize3(H_x, H_y, H_z, H_x, H_y, H_z);
+                    
+                            float NdotH = dot3(surface_normal_x, surface_normal_y, 
+                                               surface_normal_z, H_x, H_y, H_z);
+                            NdotH = std::max(NdotH, 0.0f);
+
+                            float specular = powf(NdotH, surface_specularPower);
+                            float specularNorm = (surface_specularPower + 2.0f) * 
+                                (1.0f / 8.0f);
+                            float specularContrib = surface_specularAmount * 
+                                specularNorm * specular;
+
+                            float k = attenuation * NdotL * (1.0f + specularContrib);
+                    
+                            float light_color_x = inputData.lightColor_x[lightIndex];
+                            float light_color_y = inputData.lightColor_y[lightIndex];
+                            float light_color_z = inputData.lightColor_z[lightIndex];
+
+                            float lightContrib_x = surface_albedo_x * light_color_x;
+                            float lightContrib_y = surface_albedo_y * light_color_y;
+                            float lightContrib_z = surface_albedo_z * light_color_z;
+
+                            lit_x += lightContrib_x * k;
+                            lit_y += lightContrib_y * k;
+                            lit_z += lightContrib_z * k;
+                        }
+                    }
+                }
+
+                // Gamma correct
+                float gamma = 1.0 / 2.2f;
+                lit_x = powf(std::min(std::max(lit_x, 0.0f), 1.0f), gamma);
+                lit_y = powf(std::min(std::max(lit_y, 0.0f), 1.0f), gamma);
+                lit_z = powf(std::min(std::max(lit_z, 0.0f), 1.0f), gamma);
+                
+                framebuffer_r[gBufferOffset] = Float32ToUnorm8(lit_x);
+                framebuffer_g[gBufferOffset] = Float32ToUnorm8(lit_y);
+                framebuffer_b[gBufferOffset] = Float32ToUnorm8(lit_z);
+            }
+        }
+    }
+}
+
+
+void
+ShadeDynamicTileRecurse(InputData *input, int level, int tileX, int tileY, 
+                        int *lightIndices, int numLights, 
+                        Framebuffer *framebuffer) {
+    const MinMaxZTree *minMaxZTree = gMinMaxZTree;
+    
+    // If we few enough lights or this is the base case (last level), shade
+    // this full tile directly
+    if (level == 0 || numLights < DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE) {
+        int width = minMaxZTree->TileWidth(level);
+        int height = minMaxZTree->TileHeight(level);
+        int startX = tileX * width;
+        int startY = tileY * height;
+        int endX = std::min(input->header.framebufferWidth, startX + width);
+        int endY = std::min(input->header.framebufferHeight, startY + height);
+        
+        // Skip entirely offscreen tiles
+        if (endX > startX && endY > startY) {
+            ShadeTileC(startX, endX, startY, endY,
+                       input->header.framebufferWidth, input->header.framebufferHeight,
+                       input->arrays,
+                       input->header.cameraProj[0][0], input->header.cameraProj[1][1], 
+                       input->header.cameraProj[2][2], input->header.cameraProj[3][2],
+                       lightIndices, numLights, VISUALIZE_LIGHT_COUNT, 
+                       framebuffer->r, framebuffer->g, framebuffer->b);
+        }
+    } 
+    else {
+        // Otherwise, subdivide and 4-way recurse using X and Y splitting planes
+        // Move down a level in the tree
+        --level;
+        tileX <<= 1;
+        tileY <<= 1;
+        int width = minMaxZTree->TileWidth(level);
+        int height = minMaxZTree->TileHeight(level);
+
+        // Work out splitting coords
+        int midX = (tileX + 1) * width;
+        int midY = (tileY + 1) * height;
+
+        // Read subtile min/max data
+        // NOTE: We must be sure to handle out-of-bounds access here since
+        // sometimes we'll only have 1 or 2 subtiles for non-pow-2
+        // framebuffer sizes.
+        bool rightTileExists = (tileX + 1 < minMaxZTree->NumTilesX(level));
+        bool bottomTileExists = (tileY + 1 < minMaxZTree->NumTilesY(level));
+
+        // NOTE: Order is 00, 10, 01, 11
+        // Set defaults up to cull all lights if the tile doesn't exist (offscreen)
+        float minZ[4] = {input->header.cameraFar, input->header.cameraFar, 
+                         input->header.cameraFar, input->header.cameraFar};
+        float maxZ[4] = {input->header.cameraNear, input->header.cameraNear, 
+                         input->header.cameraNear, input->header.cameraNear};
+
+        minZ[0] = minMaxZTree->MinZ(level, tileX, tileY);
+        maxZ[0] = minMaxZTree->MaxZ(level, tileX, tileY);
+        if (rightTileExists) {
+            minZ[1] = minMaxZTree->MinZ(level, tileX + 1, tileY);
+            maxZ[1] = minMaxZTree->MaxZ(level, tileX + 1, tileY);
+            if (bottomTileExists) {
+                minZ[3] = minMaxZTree->MinZ(level, tileX + 1, tileY + 1);
+                maxZ[3] = minMaxZTree->MaxZ(level, tileX + 1, tileY + 1);
+            }
+        }
+        if (bottomTileExists) {
+            minZ[2] = minMaxZTree->MinZ(level, tileX, tileY + 1);
+            maxZ[2] = minMaxZTree->MaxZ(level, tileX, tileY + 1);
+        }
+
+        // Cull lights into subtile lists
+#ifdef ISPC_IS_WINDOWS
+        __declspec(align(ALIGNMENT_BYTES)) 
+#endif
+            int subtileLightIndices[4][MAX_LIGHTS]
+#ifndef ISPC_IS_WINDOWS
+            __attribute__ ((aligned(ALIGNMENT_BYTES)))
+#endif
+;
+        int subtileNumLights[4];
+        SplitTileMinMax(midX, midY, minZ, maxZ,
+            input->header.framebufferWidth, input->header.framebufferHeight, 
+            input->header.cameraProj[0][0], input->header.cameraProj[1][1],
+            lightIndices, numLights, input->arrays.lightPositionView_x, 
+            input->arrays.lightPositionView_y, input->arrays.lightPositionView_z, 
+            input->arrays.lightAttenuationEnd,
+            subtileLightIndices[0], MAX_LIGHTS, subtileNumLights);
+        
+        // Recurse into subtiles
+        ShadeDynamicTileRecurse(input, level, tileX    , tileY, 
+                                subtileLightIndices[0], subtileNumLights[0],
+                                framebuffer);
+        ShadeDynamicTileRecurse(input, level, tileX + 1, tileY,
+                                subtileLightIndices[1], subtileNumLights[1],
+                                framebuffer);
+        ShadeDynamicTileRecurse(input, level, tileX    , tileY + 1,
+                                subtileLightIndices[2], subtileNumLights[2],
+                                framebuffer);
+        ShadeDynamicTileRecurse(input, level, tileX + 1, tileY + 1,
+                                subtileLightIndices[3], subtileNumLights[3],
+                                framebuffer);
+    }
+}
+
+
+static int
+IntersectLightsWithTileMinMax(
+    int tileStartX, int tileEndX,
+    int tileStartY, int tileEndY,
+    // Tile data
+    float minZ,
+    float maxZ,
+    // G-buffer data
+    int gBufferWidth, int gBufferHeight,
+    // Camera data
+    float cameraProj_11, float cameraProj_22,
+    // Light Data
+    int numLights,
+    float light_positionView_x_array[],
+    float light_positionView_y_array[],
+    float light_positionView_z_array[],
+    float light_attenuationEnd_array[],
+    // Output
+    int tileLightIndices[]
+    )
+{
+    float gBufferScale_x = 0.5f * (float)gBufferWidth;
+    float gBufferScale_y = 0.5f * (float)gBufferHeight;
+        
+    float frustumPlanes_xy[4];
+    float frustumPlanes_z[4];
+
+    // This one is totally constant over the whole screen... worth pulling it up at all?
+    float frustumPlanes_xy_v[4] = { -(cameraProj_11 * gBufferScale_x),
+                                    (cameraProj_11 * gBufferScale_x),
+                                    (cameraProj_22 * gBufferScale_y),
+                                    -(cameraProj_22 * gBufferScale_y) };
+    
+    float frustumPlanes_z_v[4] = {  tileEndX - gBufferScale_x,
+                                    -tileStartX + gBufferScale_x,
+                                    tileEndY - gBufferScale_y,
+                                    -tileStartY + gBufferScale_y };
+
+    for (int i = 0; i < 4; ++i) {
+        float norm = 1.f / sqrtf(frustumPlanes_xy_v[i] * frustumPlanes_xy_v[i] + 
+                                 frustumPlanes_z_v[i] * frustumPlanes_z_v[i]);
+        frustumPlanes_xy_v[i] *= norm;
+        frustumPlanes_z_v[i] *= norm;
+
+        frustumPlanes_xy[i] = frustumPlanes_xy_v[i];
+        frustumPlanes_z[i] = frustumPlanes_z_v[i];
+    }
+
+    int tileNumLights = 0;
+
+    for (int lightIndex = 0; lightIndex < numLights; ++lightIndex) {
+        float light_positionView_z = light_positionView_z_array[lightIndex];
+        float light_attenuationEnd = light_attenuationEnd_array[lightIndex];
+        float light_attenuationEndNeg = -light_attenuationEnd;
+
+        float d = light_positionView_z - minZ;
+        bool inFrustum = (d >= light_attenuationEndNeg);
+
+        d = maxZ - light_positionView_z;
+        inFrustum = inFrustum && (d >= light_attenuationEndNeg);
+        
+        if (!inFrustum) 
+            continue;
+
+        float light_positionView_x = light_positionView_x_array[lightIndex];
+        float light_positionView_y = light_positionView_y_array[lightIndex];
+
+        d = light_positionView_z * frustumPlanes_z[0] + 
+            light_positionView_x * frustumPlanes_xy[0];
+        inFrustum = inFrustum && (d >= light_attenuationEndNeg);
+
+        d = light_positionView_z * frustumPlanes_z[1] + 
+            light_positionView_x * frustumPlanes_xy[1];
+        inFrustum = inFrustum && (d >= light_attenuationEndNeg);
+
+        d = light_positionView_z * frustumPlanes_z[2] + 
+            light_positionView_y * frustumPlanes_xy[2];
+        inFrustum = inFrustum && (d >= light_attenuationEndNeg);
+
+        d = light_positionView_z * frustumPlanes_z[3] + 
+            light_positionView_y * frustumPlanes_xy[3];
+        inFrustum = inFrustum && (d >= light_attenuationEndNeg);
+        
+        // Pack and store intersecting lights
+        if (inFrustum)
+            tileLightIndices[tileNumLights++] = lightIndex;
+    }
+
+    return tileNumLights;
+}
+
+
+void
+ShadeDynamicTile(InputData *input, int level, int tileX, int tileY,
+                 Framebuffer *framebuffer) {
+    const MinMaxZTree *minMaxZTree = gMinMaxZTree;
+
+    // Get Z min/max for this tile
+    int width = minMaxZTree->TileWidth(level);
+    int height = minMaxZTree->TileHeight(level);
+    float minZ = minMaxZTree->MinZ(level, tileX, tileY);
+    float maxZ = minMaxZTree->MaxZ(level, tileX, tileY);
+
+    int startX = tileX * width;
+    int startY = tileY * height;
+    int endX = std::min(input->header.framebufferWidth, startX + width);
+    int endY = std::min(input->header.framebufferHeight, startY + height);
+
+    // This is a root tile, so first do a full 6-plane cull
+#ifdef ISPC_IS_WINDOWS
+    __declspec(align(ALIGNMENT_BYTES)) 
+#endif
+        int lightIndices[MAX_LIGHTS]
+#ifndef ISPC_IS_WINDOWS
+        __attribute__ ((aligned(ALIGNMENT_BYTES)))
+#endif
+;
+    int numLights = IntersectLightsWithTileMinMax(
+        startX, endX, startY, endY,    minZ, maxZ,
+        input->header.framebufferWidth, input->header.framebufferHeight,
+        input->header.cameraProj[0][0], input->header.cameraProj[1][1],
+        MAX_LIGHTS, input->arrays.lightPositionView_x, 
+        input->arrays.lightPositionView_y, input->arrays.lightPositionView_z, 
+        input->arrays.lightAttenuationEnd, lightIndices);
+
+    // Now kick off the recursive process for this tile
+    ShadeDynamicTileRecurse(input, level, tileX, tileY, lightIndices, 
+                            numLights, framebuffer);
+}
+
+
+void
+DispatchDynamicC(InputData *input, Framebuffer *framebuffer)
+{
+    MinMaxZTree *minMaxZTree = gMinMaxZTree;
+        
+    // Update min/max Z tree
+    minMaxZTree->Update(input->arrays.zBuffer, input->header.framebufferWidth,
+        input->header.cameraProj[2][2], input->header.cameraProj[3][2], 
+        input->header.cameraNear, input->header.cameraFar);
+
+    int rootLevel = minMaxZTree->Levels() - 1;
+    int rootTilesX = minMaxZTree->NumTilesX(rootLevel);
+    int rootTilesY = minMaxZTree->NumTilesY(rootLevel);
+    int rootTiles = rootTilesX * rootTilesY;
+    for (int g = 0; g < rootTiles; ++g) {
+        uint32_t tileY = g / rootTilesX;
+        uint32_t tileX = g % rootTilesX;
+        ShadeDynamicTile(input, rootLevel, tileX, tileY, framebuffer);
+    }
+}
--- a/examples_ptx/deferred/dynamic_cilk.cpp
+++ b/examples_ptx/deferred/dynamic_cilk.cpp
@@ -0,0 +1,398 @@
+/*
+  Copyright (c) 2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+#ifdef __cilk
+
+#include "deferred.h"
+#include "kernels_ispc.h"
+#include <algorithm>
+#include <assert.h>
+
+#ifdef _MSC_VER
+#define ISPC_IS_WINDOWS
+#elif defined(__linux__)
+#define ISPC_IS_LINUX
+#elif defined(__APPLE__)
+#define ISPC_IS_APPLE
+#endif
+
+#ifdef ISPC_IS_LINUX
+#include <malloc.h>
+#endif // ISPC_IS_LINUX
+
+// Currently tile widths must be a multiple of SIMD width (i.e. 8 for ispc sse4x2)!
+#define MIN_TILE_WIDTH 16
+#define MIN_TILE_HEIGHT 16
+
+
+#define DYNAMIC_TREE_LEVELS 5
+// If this is set to 1 then the result will be identical to the static version
+#define DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE 1
+
+static void *
+lAlignedMalloc(size_t size, int32_t alignment) {
+#ifdef ISPC_IS_WINDOWS
+    return _aligned_malloc(size, alignment);
+#endif
+#ifdef ISPC_IS_LINUX
+    return memalign(alignment, size);
+#endif
+#ifdef ISPC_IS_APPLE
+    void *mem = malloc(size + (alignment-1) + sizeof(void*));
+    char *amem = ((char*)mem) + sizeof(void*);
+    amem = amem + uint32_t(alignment - (reinterpret_cast<uint64_t>(amem) &
+                                        (alignment - 1)));
+    ((void**)amem)[-1] = mem;
+    return amem;
+#endif
+}
+
+
+static void
+lAlignedFree(void *ptr) {
+#ifdef ISPC_IS_WINDOWS
+    _aligned_free(ptr);
+#endif
+#ifdef ISPC_IS_LINUX
+    free(ptr);
+#endif
+#ifdef ISPC_IS_APPLE
+    free(((void**)ptr)[-1]);
+#endif
+}
+
+
+class MinMaxZTreeCilk
+{
+public:
+    // Currently (min) tile dimensions must divide gBuffer dimensions evenly
+    // Levels must be small enough that neither dimension goes below one tile
+    MinMaxZTreeCilk(
+        int tileWidth, int tileHeight, int levels,
+        int gBufferWidth, int gBufferHeight)
+        : mTileWidth(tileWidth), mTileHeight(tileHeight), mLevels(levels)
+    {
+        mNumTilesX = gBufferWidth / mTileWidth;
+        mNumTilesY = gBufferHeight / mTileHeight;
+        
+        // Allocate arrays
+        mMinZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16);
+        mMaxZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16);
+        for (int i = 0; i < mLevels; ++i) {
+            int x = NumTilesX(i);
+            int y = NumTilesY(i);
+            assert(x > 0);
+            assert(y > 0);
+            // NOTE: If the following two asserts fire it probably means that
+            // the base tile dimensions do not evenly divide the G-buffer dimensions
+            assert(x * (mTileWidth << i) >= gBufferWidth);
+            assert(y * (mTileHeight << i) >= gBufferHeight);
+            mMinZArrays[i] = (float *)lAlignedMalloc(sizeof(float) * x * y, 16);
+            mMaxZArrays[i] = (float *)lAlignedMalloc(sizeof(float) * x * y, 16);
+        }
+    }
+
+    void Update(float *zBuffer, int gBufferPitchInElements,
+        float cameraProj_33, float cameraProj_43,
+        float cameraNear, float cameraFar)
+    {
+        // Compute level 0 in parallel. Outer loops is here since we use Cilk
+        _Cilk_for (int tileY = 0; tileY < mNumTilesY; ++tileY) {
+            ispc::ComputeZBoundsRow(tileY,
+                mTileWidth, mTileHeight, mNumTilesX, mNumTilesY,
+                zBuffer, gBufferPitchInElements,
+                cameraProj_33, cameraProj_43, cameraNear, cameraFar,
+                mMinZArrays[0] + (tileY * mNumTilesX),
+                mMaxZArrays[0] + (tileY * mNumTilesX));
+        }
+
+        // Generate other levels
+        // NOTE: We currently don't use ispc here since it's sort of an
+        // awkward gather-based reduction Using SSE odd pack/unpack
+        // instructions might actually work here when we need to optimize
+        for (int level = 1; level < mLevels; ++level) {
+            int destTilesX = NumTilesX(level);
+            int destTilesY = NumTilesY(level);
+            int srcLevel = level - 1;
+            int srcTilesX = NumTilesX(srcLevel);
+            int srcTilesY = NumTilesY(srcLevel);
+            _Cilk_for (int y = 0; y < destTilesY; ++y) {
+                for (int x = 0; x < destTilesX; ++x) {
+                    int srcX = x << 1;
+                    int srcY = y << 1;
+                    // NOTE: Ugly branches to deal with non-multiple dimensions at some levels
+                    // TODO: SSE branchless min/max is probably better...
+                    float minZ = mMinZArrays[srcLevel][(srcY) * srcTilesX + (srcX)];
+                    float maxZ = mMaxZArrays[srcLevel][(srcY) * srcTilesX + (srcX)];
+                    if (srcX + 1 < srcTilesX) {
+                        minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY) * srcTilesX + 
+                                                                    (srcX + 1)]);
+                        maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY) * srcTilesX +
+                                                                    (srcX + 1)]);
+                        if (srcY + 1 < srcTilesY) {
+                            minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY + 1) * srcTilesX +
+                                                                        (srcX + 1)]);
+                            maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY + 1) * srcTilesX +
+                                                                        (srcX + 1)]);
+                        }
+                    }
+                    if (srcY + 1 < srcTilesY) {
+                        minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY + 1) * srcTilesX +
+                                                                    (srcX    )]);
+                        maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY + 1) * srcTilesX +
+                                                                    (srcX    )]);
+                    }
+                    mMinZArrays[level][y * destTilesX + x] = minZ;
+                    mMaxZArrays[level][y * destTilesX + x] = maxZ;
+                }
+            }
+        }
+    }
+
+    ~MinMaxZTreeCilk() {
+        for (int i = 0; i < mLevels; ++i) {
+            lAlignedFree(mMinZArrays[i]);
+            lAlignedFree(mMaxZArrays[i]);
+        }
+        lAlignedFree(mMinZArrays);
+        lAlignedFree(mMaxZArrays); 
+    }
+
+    int Levels() const { return mLevels; }
+
+    // These round UP, so beware that the last tile for a given level may not be completely full
+    // TODO: Verify this...
+    int NumTilesX(int level = 0) const { return (mNumTilesX + (1 << level) - 1) >> level; }
+    int NumTilesY(int level = 0) const { return (mNumTilesY + (1 << level) - 1) >> level; }
+    int TileWidth(int level = 0) const { return (mTileWidth << level); }
+    int TileHeight(int level = 0) const { return (mTileHeight << level); }
+
+    float MinZ(int level, int tileX, int tileY) const {
+        return mMinZArrays[level][tileY * NumTilesX(level) + tileX];
+    }
+    float MaxZ(int level, int tileX, int tileY) const {
+        return mMaxZArrays[level][tileY * NumTilesX(level) + tileX];
+    }
+
+private:
+    int mTileWidth;
+    int mTileHeight;
+    int mLevels;
+    int mNumTilesX;
+    int mNumTilesY;
+
+    // One array for each "level" in the tree
+    float **mMinZArrays;
+    float **mMaxZArrays;
+};
+
+static MinMaxZTreeCilk *gMinMaxZTreeCilk = 0;
+
+void InitDynamicCilk(InputData *input) {
+    gMinMaxZTreeCilk = 
+        new MinMaxZTreeCilk(MIN_TILE_WIDTH, MIN_TILE_HEIGHT, DYNAMIC_TREE_LEVELS,
+                            input->header.framebufferWidth, 
+                            input->header.framebufferHeight);
+}
+
+
+static void
+ShadeDynamicTileRecurse(InputData *input, int level, int tileX, int tileY, 
+                        int *lightIndices, int numLights, 
+                        Framebuffer *framebuffer) {
+    const MinMaxZTreeCilk *minMaxZTree = gMinMaxZTreeCilk;
+    
+    // If we few enough lights or this is the base case (last level), shade
+    // this full tile directly
+    if (level == 0 || numLights < DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE) {
+        int width = minMaxZTree->TileWidth(level);
+        int height = minMaxZTree->TileHeight(level);
+        int startX = tileX * width;
+        int startY = tileY * height;
+        int endX = std::min(input->header.framebufferWidth, startX + width);
+        int endY = std::min(input->header.framebufferHeight, startY + height);
+        
+        // Skip entirely offscreen tiles
+        if (endX > startX && endY > startY) {
+            ispc::ShadeTile(
+                startX, endX, startY, endY,
+                input->header.framebufferWidth, input->header.framebufferHeight,
+                &input->arrays,
+                input->header.cameraProj[0][0], input->header.cameraProj[1][1], 
+                input->header.cameraProj[2][2], input->header.cameraProj[3][2],
+                lightIndices, numLights, VISUALIZE_LIGHT_COUNT, 
+                framebuffer->r, framebuffer->g, framebuffer->b);
+        }
+    } 
+    else {
+        // Otherwise, subdivide and 4-way recurse using X and Y splitting planes
+        // Move down a level in the tree
+        --level;
+        tileX <<= 1;
+        tileY <<= 1;
+        int width = minMaxZTree->TileWidth(level);
+        int height = minMaxZTree->TileHeight(level);
+
+        // Work out splitting coords
+        int midX = (tileX + 1) * width;
+        int midY = (tileY + 1) * height;
+
+        // Read subtile min/max data
+        // NOTE: We must be sure to handle out-of-bounds access here since
+        // sometimes we'll only have 1 or 2 subtiles for non-pow-2
+        // framebuffer sizes.
+        bool rightTileExists = (tileX + 1 < minMaxZTree->NumTilesX(level));
+        bool bottomTileExists = (tileY + 1 < minMaxZTree->NumTilesY(level));
+
+        // NOTE: Order is 00, 10, 01, 11
+        // Set defaults up to cull all lights if the tile doesn't exist (offscreen)
+        float minZ[4] = {input->header.cameraFar, input->header.cameraFar, 
+                         input->header.cameraFar, input->header.cameraFar};
+        float maxZ[4] = {input->header.cameraNear, input->header.cameraNear, 
+                         input->header.cameraNear, input->header.cameraNear};
+
+        minZ[0] = minMaxZTree->MinZ(level, tileX, tileY);
+        maxZ[0] = minMaxZTree->MaxZ(level, tileX, tileY);
+        if (rightTileExists) {
+            minZ[1] = minMaxZTree->MinZ(level, tileX + 1, tileY);
+            maxZ[1] = minMaxZTree->MaxZ(level, tileX + 1, tileY);
+            if (bottomTileExists) {
+                minZ[3] = minMaxZTree->MinZ(level, tileX + 1, tileY + 1);
+                maxZ[3] = minMaxZTree->MaxZ(level, tileX + 1, tileY + 1);
+            }
+        }
+        if (bottomTileExists) {
+            minZ[2] = minMaxZTree->MinZ(level, tileX, tileY + 1);
+            maxZ[2] = minMaxZTree->MaxZ(level, tileX, tileY + 1);
+        }
+
+        // Cull lights into subtile lists
+#ifdef ISPC_IS_WINDOWS
+        __declspec(align(ALIGNMENT_BYTES)) 
+#endif
+            int subtileLightIndices[4][MAX_LIGHTS]
+#ifndef ISPC_IS_WINDOWS
+            __attribute__ ((aligned(ALIGNMENT_BYTES)))
+#endif
+;
+        int subtileNumLights[4];
+        ispc::SplitTileMinMax(midX, midY, minZ, maxZ,
+            input->header.framebufferWidth, input->header.framebufferHeight, 
+            input->header.cameraProj[0][0], input->header.cameraProj[1][1],
+            lightIndices, numLights, input->arrays.lightPositionView_x, 
+            input->arrays.lightPositionView_y, input->arrays.lightPositionView_z, 
+            input->arrays.lightAttenuationEnd,
+            subtileLightIndices[0], MAX_LIGHTS, subtileNumLights);
+        
+        // Recurse into subtiles
+        _Cilk_spawn ShadeDynamicTileRecurse(input, level, tileX    , tileY, 
+                                            subtileLightIndices[0], subtileNumLights[0],
+                                            framebuffer);
+        _Cilk_spawn ShadeDynamicTileRecurse(input, level, tileX + 1, tileY,
+                                            subtileLightIndices[1], subtileNumLights[1],
+                                            framebuffer);
+        _Cilk_spawn ShadeDynamicTileRecurse(input, level, tileX    , tileY + 1,
+                                            subtileLightIndices[2], subtileNumLights[2],
+                                            framebuffer);
+        ShadeDynamicTileRecurse(input, level, tileX + 1, tileY + 1,
+                                subtileLightIndices[3], subtileNumLights[3],
+                                framebuffer);
+    }
+}
+
+
+static void
+ShadeDynamicTile(InputData *input, int level, int tileX, int tileY,
+                 Framebuffer *framebuffer) {
+    const MinMaxZTreeCilk *minMaxZTree = gMinMaxZTreeCilk;
+
+    // Get Z min/max for this tile
+    int width = minMaxZTree->TileWidth(level);
+    int height = minMaxZTree->TileHeight(level);
+    float minZ = minMaxZTree->MinZ(level, tileX, tileY);
+    float maxZ = minMaxZTree->MaxZ(level, tileX, tileY);
+
+    int startX = tileX * width;
+    int startY = tileY * height;
+    int endX = std::min(input->header.framebufferWidth, startX + width);
+    int endY = std::min(input->header.framebufferHeight, startY + height);
+
+    // This is a root tile, so first do a full 6-plane cull
+#ifdef ISPC_IS_WINDOWS
+    __declspec(align(ALIGNMENT_BYTES)) 
+#endif
+        int lightIndices[MAX_LIGHTS]
+#ifndef ISPC_IS_WINDOWS
+        __attribute__ ((aligned(ALIGNMENT_BYTES)))
+#endif
+;
+    int numLights = ispc::IntersectLightsWithTileMinMax(
+        startX, endX, startY, endY,    minZ, maxZ,
+        input->header.framebufferWidth, input->header.framebufferHeight,
+        input->header.cameraProj[0][0], input->header.cameraProj[1][1],
+        MAX_LIGHTS, input->arrays.lightPositionView_x, 
+        input->arrays.lightPositionView_y, input->arrays.lightPositionView_z, 
+        input->arrays.lightAttenuationEnd, lightIndices);
+
+    // Now kick off the recursive process for this tile
+    ShadeDynamicTileRecurse(input, level, tileX, tileY, lightIndices, 
+                            numLights, framebuffer);
+}
+
+
+void
+DispatchDynamicCilk(InputData *input, Framebuffer *framebuffer)
+{
+    MinMaxZTreeCilk *minMaxZTree = gMinMaxZTreeCilk;
+        
+    // Update min/max Z tree
+    minMaxZTree->Update(input->arrays.zBuffer, input->header.framebufferWidth,
+        input->header.cameraProj[2][2], input->header.cameraProj[3][2], 
+        input->header.cameraNear, input->header.cameraFar);
+
+    // Launch the "root" tiles.  Ideally these should at least fill the
+    // machine... at the moment we have a static number of "levels" to the
+    // mip tree but it might make sense to compute it based on the width of
+    // the machine.
+    int rootLevel = minMaxZTree->Levels() - 1;
+    int rootTilesX = minMaxZTree->NumTilesX(rootLevel);
+    int rootTilesY = minMaxZTree->NumTilesY(rootLevel);
+    int rootTiles = rootTilesX * rootTilesY;
+    _Cilk_for (int g = 0; g < rootTiles; ++g) {
+        uint32_t tileY = g / rootTilesX;
+        uint32_t tileX = g % rootTilesX;
+        ShadeDynamicTile(input, rootLevel, tileX, tileY, framebuffer);
+    }
+}
+
+#endif // __cilk
--- a/examples_ptx/deferred/kernels.cu
+++ b/examples_ptx/deferred/kernels.cu
@@ -0,0 +1,761 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+
+#include "deferred.h"
+#include <stdio.h>
+#include <assert.h>
+
+#define programCount 32
+#define programIndex (threadIdx.x & 31)
+#define taskIndex (blockIdx.x*4 + (threadIdx.x >> 5))
+#define taskCount (gridDim.x*4)
+#define warpIdx (threadIdx.x >> 5)
+
+#define int32 int
+#define int16 short
+#define int8 char
+
+__device__ static inline float clamp(float v, float low, float high) 
+{
+      return min(max(v, low), high);
+}
+
+struct InputDataArrays
+{
+    float *zBuffer;
+    unsigned int16 *normalEncoded_x; // half float
+    unsigned int16 *normalEncoded_y; // half float
+    unsigned int16 *specularAmount; // half float
+    unsigned int16 *specularPower; // half float
+    unsigned int8 *albedo_x; // unorm8
+    unsigned int8 *albedo_y; // unorm8
+    unsigned int8 *albedo_z; // unorm8
+    float *lightPositionView_x;
+    float *lightPositionView_y;
+    float *lightPositionView_z;
+    float *lightAttenuationBegin;
+    float *lightColor_x;
+    float *lightColor_y;
+    float *lightColor_z;
+    float *lightAttenuationEnd;
+};
+
+struct InputHeader
+{
+    float cameraProj[4][4];
+    float cameraNear;
+    float cameraFar;
+
+    int32 framebufferWidth;
+    int32 framebufferHeight;
+    int32 numLights;
+    int32 inputDataChunkSize;
+    int32 inputDataArrayOffsets[idaNum];
+};
+
+
+///////////////////////////////////////////////////////////////////////////
+// Common utility routines
+
+__device__
+static inline float
+dot3(float x, float y, float z, float a, float b, float c) {
+    return (x*a + y*b + z*c);
+}
+
+
+#if 0
+static __shared__ int shdata_full[128];
+template<typename T, int N>
+struct Uniform
+{
+  T data[(N+programCount-1)/programCount];
+  volatile T *shdata;
+
+  __device__ inline Uniform()
+  {
+    shdata = ((T*)shdata_full) + warpIdx*32;
+  }
+
+  __device__ inline int2 get_chunk(const int i) const
+  {
+    const int elem  = i & (programCount - 1);
+    const int chunk = i >> 5;
+    shdata[programIndex] = chunk;
+    shdata[        elem] = chunk;
+    return make_int2(shdata[programIndex], elem);
+  }
+
+  __device__ inline const T get(const int i) const
+  {
+    const int2 idx = get_chunk(i);
+    return __shfl(data[idx.x], idx.y);
+  }
+  
+  __device__ inline void set(const bool active, const int i, T value) 
+  {
+    const int2 idx = get_chunk(i);
+    const int chunkIdx = idx.x;
+    const int elemIdx = idx.y;
+    shdata[programIndex] = data[chunkIdx];
+    if (active) shdata[elemIdx] = value;
+    data[chunkIdx] = shdata[programIndex];
+  }
+};
+#elif 1
+template<typename T, int N>
+struct Uniform
+{
+  union
+  {
+    T *data;
+    int32_t ptr[2];
+  };
+
+  __device__ inline Uniform()
+  {
+    if (programIndex == 0)
+      data = (T*)malloc(N*sizeof(T));
+    ptr[0] = __shfl(ptr[0], 0);
+    ptr[1] = __shfl(ptr[1], 0);
+  }
+  __device__ inline ~Uniform()
+  {
+    if (programIndex == 0)
+      free(data);
+  }
+
+  __device__ inline const T get(const int i) const
+  {
+    return data[i];
+  }
+  
+  __device__ inline T* get_ptr(const int i) {return &data[i]; }
+  __device__ inline void set(const bool active, const int i, T value) 
+  {
+    if (active)
+      data[i] = value;
+  }
+};
+
+#else
+__shared__ int shdata_full[4*MAX_LIGHTS];
+template<typename T, int N>
+struct Uniform
+{
+  volatile T *shdata;
+
+  __device__ Uniform()
+  {
+    shdata = (T*)&shdata_full[warpIdx*MAX_LIGHTS];
+  }
+
+  __device__ inline const T get(const int i) const
+  {
+    return shdata[i];
+  }
+  
+  __device__ inline void set(const bool active, const int i, T value) 
+  {
+    if (active)
+      shdata[i] = value;
+  }
+};
+#endif
+
+
+__device__
+static inline void
+normalize3(float x, float y, float z, float &ox, float &oy, float &oz) {
+    float n = rsqrt(x*x + y*y + z*z);
+    ox = x * n;
+    oy = y * n;
+    oz = z * n;
+}
+
+__device__ inline
+static float reduce_min(float value)
+{
+#pragma unroll
+  for (int i = 4; i >=0; i--)
+    value = fminf(value, __shfl_xor(value, 1<<i, 32));
+  return value;
+}
+__device__ inline
+static float reduce_max(float value)
+{
+#pragma unroll
+  for (int i = 4; i >=0; i--)
+    value = fmaxf(value, __shfl_xor(value, 1<<i, 32));
+  return value;
+}
+
+#if 0
+__device__ inline
+static int reduce_sum(int value)
+{
+#pragma unroll
+  for (int i = 4; i >=0; i--)
+    value +=  __shfl_xor(value, 1<<i, 32);
+  return value;
+}
+static __device__ __forceinline__ uint shfl_scan_add_step(uint partial, uint up_offset)
+{
+  uint result;
+  asm(
+      "{.reg .u32 r0;"
+      ".reg .pred p;"
+      "shfl.up.b32 r0|p, %1, %2, 0;"
+      "@p add.u32 r0, r0, %3;"
+      "mov.u32 %0, r0;}"
+      : "=r"(result) : "r"(partial), "r"(up_offset), "r"(partial));
+  return result;
+}
+static __device__ __forceinline__ int inclusive_scan_warp(const int value)
+{
+  uint sum = value;
+#pragma unroll
+  for(int i = 0; i < 5; ++i)
+    sum = shfl_scan_add_step(sum, 1 << i);
+  return sum - value;
+}
+#endif
+
+
+static __device__ __forceinline__ int lanemask_lt()
+{
+  int mask;
+  asm("mov.u32 %0, %lanemask_lt;" : "=r" (mask));
+  return mask;
+}
+static __device__ __forceinline__ int2 warpBinExclusiveScan(const bool p)
+{
+  const int b = __ballot(p);
+  return make_int2(__popc(b), __popc(b & lanemask_lt()));
+}
+  __device__ static inline 
+int packed_store_active(bool active, int* ptr, int value)
+{
+  const int2 res = warpBinExclusiveScan(active);
+  const int idx = res.y;
+  const int nactive = res.x;
+  if (active)
+    ptr[idx] = value;
+  return nactive;
+}
+
+
+
+
+
+__device__
+static inline float
+Unorm8ToFloat32(unsigned int8 u) {
+    return (float)u * (1.0f / 255.0f);
+}
+
+
+__device__
+static inline unsigned int8
+Float32ToUnorm8(float f) {
+    return (unsigned int8)(f * 255.0f);
+}
+
+
+__device__
+static inline void
+ComputeZBounds(
+     int32 tileStartX,  int32 tileEndX,
+     int32 tileStartY,  int32 tileEndY,
+    // G-buffer data
+     float zBuffer[],
+     int32 gBufferWidth,
+    // Camera data
+     float cameraProj_33,  float cameraProj_43,
+     float cameraNear,  float cameraFar,
+    // Output
+     float &minZ,
+     float &maxZ
+    )
+{
+    // Find Z bounds
+    float laneMinZ = cameraFar;
+    float laneMaxZ = cameraNear;
+    for ( int32 y = tileStartY; y < tileEndY; ++y) {
+        for ( int xb = tileStartX; xb < tileEndX; xb += programCount)
+        {
+          const int x = xb + programIndex;
+          if (x >= tileEndX) break;
+            // Unproject depth buffer Z value into view space
+            float z = zBuffer[y * gBufferWidth + x];
+            float viewSpaceZ = cameraProj_43 / (z - cameraProj_33);
+
+            // Work out Z bounds for our samples
+            // Avoid considering skybox/background or otherwise invalid pixels
+            if ((viewSpaceZ < cameraFar) && (viewSpaceZ >= cameraNear)) {
+                laneMinZ = min(laneMinZ, viewSpaceZ);
+                laneMaxZ = max(laneMaxZ, viewSpaceZ);
+            }
+        }
+    }
+    minZ = reduce_min(laneMinZ);
+    maxZ = reduce_max(laneMaxZ);
+}
+
+
+__device__
+static inline  int32
+IntersectLightsWithTileMinMax(
+     int32 tileStartX,  int32 tileEndX,
+     int32 tileStartY,  int32 tileEndY,
+    // Tile data
+     float minZ,
+     float maxZ,
+    // G-buffer data
+     int32 gBufferWidth,  int32 gBufferHeight,
+    // Camera data
+     float cameraProj_11,  float cameraProj_22,
+    // Light Data
+     int32 numLights,
+     float light_positionView_x_array[],
+     float light_positionView_y_array[],
+     float light_positionView_z_array[],
+     float light_attenuationEnd_array[],
+    // Output
+     Uniform<int,MAX_LIGHTS> &tileLightIndices
+    )
+{
+     float gBufferScale_x = 0.5f * (float)gBufferWidth;
+     float gBufferScale_y = 0.5f * (float)gBufferHeight;
+        
+     float frustumPlanes_xy[4] = {
+        -(cameraProj_11 * gBufferScale_x),
+         (cameraProj_11 * gBufferScale_x),
+         (cameraProj_22 * gBufferScale_y),
+        -(cameraProj_22 * gBufferScale_y) };
+     float frustumPlanes_z[4] = {
+         tileEndX - gBufferScale_x,
+        -tileStartX + gBufferScale_x,
+         tileEndY - gBufferScale_y,
+        -tileStartY + gBufferScale_y };
+
+    for ( int i = 0; i < 4; ++i) {
+         float norm = rsqrt(frustumPlanes_xy[i] * frustumPlanes_xy[i] + 
+                                   frustumPlanes_z[i] * frustumPlanes_z[i]);
+        frustumPlanes_xy[i] *= norm;
+        frustumPlanes_z[i] *= norm;
+    }
+
+     int32 tileNumLights = 0;
+
+    for ( int lightIndexB = 0; lightIndexB < numLights; lightIndexB += programCount)
+    {
+      const int lightIndex = lightIndexB + programIndex;
+      if (lightIndex >= numLights) break;
+
+        float light_positionView_z = light_positionView_z_array[lightIndex];
+        float light_attenuationEnd = light_attenuationEnd_array[lightIndex];
+        float light_attenuationEndNeg = -light_attenuationEnd;
+
+        float d = light_positionView_z - minZ;
+        bool inFrustum = (d >= light_attenuationEndNeg);
+
+        d = maxZ - light_positionView_z;
+        inFrustum = inFrustum && (d >= light_attenuationEndNeg);
+        
+        // This seems better than cif(!inFrustum) ccontinue; here since we
+        // don't actually need to mask the rest of this function - this is
+        // just a greedy early-out.  Could also structure all of this as
+        // nested if() statements, but this a bit easier to read
+        if (__ballot(inFrustum) > 0) 
+        {
+            float light_positionView_x = light_positionView_x_array[lightIndex];
+            float light_positionView_y = light_positionView_y_array[lightIndex];
+
+            d = light_positionView_z * frustumPlanes_z[0] + 
+                light_positionView_x * frustumPlanes_xy[0];
+            inFrustum = inFrustum && (d >= light_attenuationEndNeg);
+
+            d = light_positionView_z * frustumPlanes_z[1] + 
+                light_positionView_x * frustumPlanes_xy[1];
+            inFrustum = inFrustum && (d >= light_attenuationEndNeg);
+
+            d = light_positionView_z * frustumPlanes_z[2] + 
+                light_positionView_y * frustumPlanes_xy[2];
+            inFrustum = inFrustum && (d >= light_attenuationEndNeg);
+
+            d = light_positionView_z * frustumPlanes_z[3] + 
+                light_positionView_y * frustumPlanes_xy[3];
+            inFrustum = inFrustum && (d >= light_attenuationEndNeg);
+        
+            // Pack and store intersecting lights
+            const bool active = inFrustum && lightIndex < numLights;
+#if 0
+            if (__ballot(active) > 0)
+              tileNumLights += packed_store_active(active, tileLightIndices.get_ptr(tileNumLights), lightIndex);
+#else
+            if (__ballot(active) > 0)
+            {
+              const int2 res = warpBinExclusiveScan(active);
+              const int idx = tileNumLights + res.y;
+              const int nactive = res.x;
+              tileLightIndices.set(active, idx, lightIndex);
+              tileNumLights += nactive;
+            }
+#endif
+        }
+    }
+
+    return tileNumLights;
+}
+
+
+__device__
+static inline   int32
+IntersectLightsWithTile(
+     int32 tileStartX,  int32 tileEndX,
+     int32 tileStartY,  int32 tileEndY,
+     int32 gBufferWidth,  int32 gBufferHeight,
+    // G-buffer data
+     float zBuffer[],
+    // Camera data
+     float cameraProj_11,  float cameraProj_22,
+     float cameraProj_33,  float cameraProj_43,
+     float cameraNear,  float cameraFar,
+    // Light Data
+     int32 numLights,
+     float light_positionView_x_array[],
+     float light_positionView_y_array[],
+     float light_positionView_z_array[],
+     float light_attenuationEnd_array[],
+    // Output
+     Uniform<int,MAX_LIGHTS> &tileLightIndices
+    )
+{
+     float minZ, maxZ;
+    ComputeZBounds(tileStartX, tileEndX, tileStartY, tileEndY,
+        zBuffer, gBufferWidth, cameraProj_33, cameraProj_43, cameraNear, cameraFar,
+        minZ, maxZ);
+
+
+     int32 tileNumLights = IntersectLightsWithTileMinMax(
+        tileStartX, tileEndX, tileStartY, tileEndY, minZ, maxZ,
+        gBufferWidth, gBufferHeight, cameraProj_11, cameraProj_22,
+        MAX_LIGHTS, light_positionView_x_array, light_positionView_y_array, 
+        light_positionView_z_array, light_attenuationEnd_array,
+        tileLightIndices);
+
+    return tileNumLights;
+}
+
+
+__device__
+static inline void
+ShadeTile(
+     int32 tileStartX,  int32 tileEndX,
+     int32 tileStartY,  int32 tileEndY,
+     int32 gBufferWidth,  int32 gBufferHeight,
+    const  InputDataArrays &inputData,
+    // Camera data
+     float cameraProj_11,  float cameraProj_22,
+     float cameraProj_33,  float cameraProj_43,
+    // Light list
+     Uniform<int,MAX_LIGHTS> &tileLightIndices,
+     int32 tileNumLights,
+    // UI
+     bool visualizeLightCount,
+    // Output
+     unsigned int8 framebuffer_r[],
+     unsigned int8 framebuffer_g[],
+     unsigned int8 framebuffer_b[]
+    )
+{
+    if (tileNumLights == 0 || visualizeLightCount) {
+         unsigned int8 c = (unsigned int8)(min(tileNumLights << 2, 255));
+        for ( int32 y = tileStartY; y < tileEndY; ++y) {
+            for ( int xb = tileStartX ; xb < tileEndX; xb += programCount)
+            { 
+              const int x = xb + programIndex;
+              if (x >= tileEndX) continue;
+                int32 framebufferIndex = (y * gBufferWidth + x);
+                framebuffer_r[framebufferIndex] = c;
+                framebuffer_g[framebufferIndex] = c;
+                framebuffer_b[framebufferIndex] = c;
+            }
+        }
+    } else {
+         float twoOverGBufferWidth = 2.0f / gBufferWidth;
+         float twoOverGBufferHeight = 2.0f / gBufferHeight;
+        
+        for ( int32 y = tileStartY; y < tileEndY; ++y) {
+             float positionScreen_y = -(((0.5f + y) * twoOverGBufferHeight) - 1.f);
+
+            for ( int xb = tileStartX ; xb < tileEndX; xb += programCount)
+            { 
+              const int x = xb + programIndex;
+//              if (x >= tileEndX) break;
+                int32 gBufferOffset = y * gBufferWidth + x;
+                
+                // Reconstruct position and (negative) view vector from G-buffer
+                float surface_positionView_x, surface_positionView_y, surface_positionView_z;
+                float Vneg_x, Vneg_y, Vneg_z;
+
+                float z = inputData.zBuffer[gBufferOffset];
+
+                // Compute screen/clip-space position
+                // NOTE: Mind DX11 viewport transform and pixel center!
+                float positionScreen_x = (0.5f + (float)(x)) * 
+                    twoOverGBufferWidth - 1.0f;
+
+                // Unproject depth buffer Z value into view space
+                surface_positionView_z = cameraProj_43 / (z - cameraProj_33);
+                surface_positionView_x = positionScreen_x * surface_positionView_z / 
+                    cameraProj_11;
+                surface_positionView_y = positionScreen_y * surface_positionView_z / 
+                    cameraProj_22;
+                
+                // We actually end up with a vector pointing *at* the
+                // surface (i.e. the negative view vector)
+                normalize3(surface_positionView_x, surface_positionView_y, 
+                           surface_positionView_z, Vneg_x, Vneg_y, Vneg_z);
+
+                // Reconstruct normal from G-buffer
+                float surface_normal_x, surface_normal_y, surface_normal_z;
+                asm("// half2float //");
+                float normal_x = __half2float(inputData.normalEncoded_x[gBufferOffset]);
+                float normal_y = __half2float(inputData.normalEncoded_y[gBufferOffset]);
+                asm("// half2float //");
+                    
+                float f = (normal_x - normal_x * normal_x) + (normal_y - normal_y * normal_y);
+                float m = sqrt(4.0f * f - 1.0f);
+                    
+                surface_normal_x = m * (4.0f * normal_x - 2.0f);
+                surface_normal_y = m * (4.0f * normal_y - 2.0f);
+                surface_normal_z = 3.0f - 8.0f * f;
+
+                // Load other G-buffer parameters
+                float surface_specularAmount = 
+                    __half2float(inputData.specularAmount[gBufferOffset]);
+                float surface_specularPower  = 
+                    __half2float(inputData.specularPower[gBufferOffset]);
+                float surface_albedo_x = Unorm8ToFloat32(inputData.albedo_x[gBufferOffset]);
+                float surface_albedo_y = Unorm8ToFloat32(inputData.albedo_y[gBufferOffset]);
+                float surface_albedo_z = Unorm8ToFloat32(inputData.albedo_z[gBufferOffset]);
+                
+                float lit_x = 0.0f;
+                float lit_y = 0.0f;
+                float lit_z = 0.0f;
+                for ( int32 tileLightIndex = 0; tileLightIndex < tileNumLights; 
+                     ++tileLightIndex) {
+                     int32 lightIndex = tileLightIndices.get(tileLightIndex);
+                                        
+                    // Gather light data relevant to initial culling
+                     float light_positionView_x = 
+                        __ldg(&inputData.lightPositionView_x[lightIndex]);
+                     float light_positionView_y = 
+                        __ldg(&inputData.lightPositionView_y[lightIndex]);
+                     float light_positionView_z = 
+                        __ldg(&inputData.lightPositionView_z[lightIndex]);
+                     float light_attenuationEnd = 
+                        __ldg(&inputData.lightAttenuationEnd[lightIndex]);
+                    
+                    // Compute light vector
+                    float L_x = light_positionView_x - surface_positionView_x;
+                    float L_y = light_positionView_y - surface_positionView_y;
+                    float L_z = light_positionView_z - surface_positionView_z;
+
+                    float distanceToLight2 = dot3(L_x, L_y, L_z, L_x, L_y, L_z);
+                    
+                    // Clip at end of attenuation
+                    float light_attenutaionEnd2 = light_attenuationEnd * light_attenuationEnd;
+
+                    if (distanceToLight2 < light_attenutaionEnd2) {                    
+                        float distanceToLight = sqrt(distanceToLight2);
+
+                        // HLSL "rcp" is allowed to be fairly inaccurate
+                        float distanceToLightRcp = 1.0f/distanceToLight;
+                        L_x *= distanceToLightRcp;
+                        L_y *= distanceToLightRcp;
+                        L_z *= distanceToLightRcp;
+
+                        // Start computing brdf
+                        float NdotL = dot3(surface_normal_x, surface_normal_y, 
+                                           surface_normal_z, L_x, L_y, L_z);
+                    
+                        // Clip back facing
+                        if (NdotL > 0.0f) {
+                             float light_attenuationBegin = 
+                                inputData.lightAttenuationBegin[lightIndex];
+
+                            // Light distance attenuation (linstep)
+                            float lightRange = (light_attenuationEnd - light_attenuationBegin);
+                            float falloffPosition = (light_attenuationEnd - distanceToLight);
+                            float attenuation = min(falloffPosition / lightRange, 1.0f);
+
+                            float H_x = (L_x - Vneg_x);
+                            float H_y = (L_y - Vneg_y);
+                            float H_z = (L_z - Vneg_z);
+                            normalize3(H_x, H_y, H_z, H_x, H_y, H_z);
+                    
+                            float NdotH = dot3(surface_normal_x, surface_normal_y, 
+                                               surface_normal_z, H_x, H_y, H_z);
+                            NdotH = max(NdotH, 0.0f);
+
+                            float specular = pow(NdotH, surface_specularPower);
+                            float specularNorm = (surface_specularPower + 2.0f) * 
+                                (1.0f / 8.0f);
+                            float specularContrib = surface_specularAmount * 
+                                specularNorm * specular;
+
+                            float k = attenuation * NdotL * (1.0f + specularContrib);
+                    
+                             float light_color_x = inputData.lightColor_x[lightIndex];
+                             float light_color_y = inputData.lightColor_y[lightIndex];
+                             float light_color_z = inputData.lightColor_z[lightIndex];
+
+                            float lightContrib_x = surface_albedo_x * light_color_x;
+                            float lightContrib_y = surface_albedo_y * light_color_y;
+                            float lightContrib_z = surface_albedo_z * light_color_z;
+
+                            lit_x += lightContrib_x * k;
+                            lit_y += lightContrib_y * k;
+                            lit_z += lightContrib_z * k;
+                        }
+                    }
+                }
+
+                // Gamma correct
+                // These pows are pretty slow right now, but we can do
+                // something faster if really necessary to squeeze every
+                // last bit of performance out of it
+                float gamma = 1.0 / 2.2f;
+                lit_x = pow(clamp(lit_x, 0.0f, 1.0f), gamma);
+                lit_y = pow(clamp(lit_y, 0.0f, 1.0f), gamma);
+                lit_z = pow(clamp(lit_z, 0.0f, 1.0f), gamma);
+                
+                framebuffer_r[gBufferOffset] = Float32ToUnorm8(lit_x);
+                framebuffer_g[gBufferOffset] = Float32ToUnorm8(lit_y);
+                framebuffer_b[gBufferOffset] = Float32ToUnorm8(lit_z);
+            }
+        }
+    }
+}
+
+
+///////////////////////////////////////////////////////////////////////////
+// Static decomposition
+
+__global__ void
+RenderTile( int num_groups_x,  int num_groups_y,
+           const  InputHeader *inputHeaderPtr,
+           const  InputDataArrays *inputDataPtr,
+            int visualizeLightCount,
+           // Output
+            unsigned int8 framebuffer_r[],
+            unsigned int8 framebuffer_g[],
+            unsigned int8 framebuffer_b[]) {
+  if (taskIndex >= taskCount) return;
+
+  const  InputHeader inputHeader = *inputHeaderPtr;
+  const  InputDataArrays inputData = *inputDataPtr;
+     int32 group_y = taskIndex / num_groups_x;
+     int32 group_x = taskIndex % num_groups_x;
+
+     int32 tile_start_x = group_x * MIN_TILE_WIDTH;
+     int32 tile_start_y = group_y * MIN_TILE_HEIGHT;
+     int32 tile_end_x = tile_start_x + MIN_TILE_WIDTH;
+     int32 tile_end_y = tile_start_y + MIN_TILE_HEIGHT;
+
+     int framebufferWidth = inputHeader.framebufferWidth;
+     int framebufferHeight = inputHeader.framebufferHeight;
+     float cameraProj_00 = inputHeader.cameraProj[0][0];
+     float cameraProj_11 = inputHeader.cameraProj[1][1];
+     float cameraProj_22 = inputHeader.cameraProj[2][2];
+     float cameraProj_32 = inputHeader.cameraProj[3][2];
+
+    // Light intersection: figure out which lights illuminate this tile.
+     Uniform<int,MAX_LIGHTS> tileLightIndices;  // Light list for the tile
+#if 1
+     int numTileLights = 
+        IntersectLightsWithTile(tile_start_x, tile_end_x, 
+                                tile_start_y, tile_end_y,
+                                framebufferWidth, framebufferHeight,
+                                inputData.zBuffer,
+                                cameraProj_00, cameraProj_11,
+                                cameraProj_22, cameraProj_32,
+                                inputHeader.cameraNear, inputHeader.cameraFar,
+                                MAX_LIGHTS,
+                                inputData.lightPositionView_x, 
+                                inputData.lightPositionView_y, 
+                                inputData.lightPositionView_z, 
+                                inputData.lightAttenuationEnd,
+                                tileLightIndices);
+
+    // And now shade the tile, using the lights in tileLightIndices
+    ShadeTile(tile_start_x, tile_end_x, tile_start_y, tile_end_y,
+              framebufferWidth, framebufferHeight, inputData,
+              cameraProj_00, cameraProj_11, cameraProj_22, cameraProj_32,
+              tileLightIndices, numTileLights, visualizeLightCount, 
+              framebuffer_r, framebuffer_g, framebuffer_b);
+#endif
+}
+
+
+extern "C" __global__ void
+RenderStatic( InputHeader inputHeaderPtr[],
+              InputDataArrays inputDataPtr[],
+              int visualizeLightCount,
+             // Output
+              unsigned int8 framebuffer_r[],
+              unsigned int8 framebuffer_g[],
+              unsigned int8 framebuffer_b[]) {
+
+  const  InputHeader inputHeader = *inputHeaderPtr;
+  const  InputDataArrays inputData = *inputDataPtr;
+
+
+     int num_groups_x = (inputHeader.framebufferWidth + 
+                                MIN_TILE_WIDTH - 1) / MIN_TILE_WIDTH;
+     int num_groups_y = (inputHeader.framebufferHeight + 
+                                MIN_TILE_HEIGHT - 1) / MIN_TILE_HEIGHT;
+     int num_groups = num_groups_x * num_groups_y;
+
+    // Launch a task to render each tile, each of which is MIN_TILE_WIDTH
+    // by MIN_TILE_HEIGHT pixels.
+     if (programIndex == 0)
+       RenderTile<<<(num_groups+4-1)/4,128>>>(num_groups_x, num_groups_y,
+           inputHeaderPtr, inputDataPtr, visualizeLightCount,
+           framebuffer_r, framebuffer_g, framebuffer_b);
+     cudaDeviceSynchronize();
+}
--- a/examples_ptx/deferred/kernels.ispc
+++ b/examples_ptx/deferred/kernels.ispc
@@ -0,0 +1,672 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+#include "deferred.h"
+
+struct InputDataArrays
+{
+    float *zBuffer;
+    unsigned int16 *normalEncoded_x; // half float
+    unsigned int16 *normalEncoded_y; // half float
+    unsigned int16 *specularAmount; // half float
+    unsigned int16 *specularPower; // half float
+    unsigned int8 *albedo_x; // unorm8
+    unsigned int8 *albedo_y; // unorm8
+    unsigned int8 *albedo_z; // unorm8
+    float *lightPositionView_x;
+    float *lightPositionView_y;
+    float *lightPositionView_z;
+    float *lightAttenuationBegin;
+    float *lightColor_x;
+    float *lightColor_y;
+    float *lightColor_z;
+    float *lightAttenuationEnd;
+};
+
+struct InputHeader
+{
+    float cameraProj[4][4];
+    float cameraNear;
+    float cameraFar;
+
+    int32 framebufferWidth;
+    int32 framebufferHeight;
+    int32 numLights;
+    int32 inputDataChunkSize;
+    int32 inputDataArrayOffsets[idaNum];
+};
+
+
+///////////////////////////////////////////////////////////////////////////
+// Common utility routines
+
+static inline float
+dot3(float x, float y, float z, float a, float b, float c) {
+    return (x*a + y*b + z*c);
+}
+
+
+static inline void
+normalize3(float x, float y, float z, float &ox, float &oy, float &oz) {
+    float n = rsqrt(x*x + y*y + z*z);
+    ox = x * n;
+    oy = y * n;
+    oz = z * n;
+}
+
+
+static inline float
+Unorm8ToFloat32(unsigned int8 u) {
+    return (float)u * (1.0f / 255.0f);
+}
+
+
+static inline unsigned int8
+Float32ToUnorm8(float f) {
+    return (unsigned int8)(f * 255.0f);
+}
+
+
+static void
+ComputeZBounds(
+    uniform int32 tileStartX, uniform int32 tileEndX,
+    uniform int32 tileStartY, uniform int32 tileEndY,
+    // G-buffer data
+    uniform float zBuffer[],
+    uniform int32 gBufferWidth,
+    // Camera data
+    uniform float cameraProj_33, uniform float cameraProj_43,
+    uniform float cameraNear, uniform float cameraFar,
+    // Output
+    uniform float &minZ,
+    uniform float &maxZ
+    )
+{
+    // Find Z bounds
+    float laneMinZ = cameraFar;
+    float laneMaxZ = cameraNear;
+    for (uniform int32 y = tileStartY; y < tileEndY; ++y) {
+        foreach (x = tileStartX ... tileEndX) {
+            // Unproject depth buffer Z value into view space
+            float z = zBuffer[y * gBufferWidth + x];
+            float viewSpaceZ = cameraProj_43 / (z - cameraProj_33);
+
+            // Work out Z bounds for our samples
+            // Avoid considering skybox/background or otherwise invalid pixels
+            if ((viewSpaceZ < cameraFar) && (viewSpaceZ >= cameraNear)) {
+                laneMinZ = min(laneMinZ, viewSpaceZ);
+                laneMaxZ = max(laneMaxZ, viewSpaceZ);
+            }
+        }
+    }
+    minZ = reduce_min(laneMinZ);
+    maxZ = reduce_max(laneMaxZ);
+}
+
+
+export uniform int32
+IntersectLightsWithTileMinMax(
+    uniform int32 tileStartX, uniform int32 tileEndX,
+    uniform int32 tileStartY, uniform int32 tileEndY,
+    // Tile data
+    uniform float minZ,
+    uniform float maxZ,
+    // G-buffer data
+    uniform int32 gBufferWidth, uniform int32 gBufferHeight,
+    // Camera data
+    uniform float cameraProj_11, uniform float cameraProj_22,
+    // Light Data
+    uniform int32 numLights,
+    uniform float light_positionView_x_array[],
+    uniform float light_positionView_y_array[],
+    uniform float light_positionView_z_array[],
+    uniform float light_attenuationEnd_array[],
+    // Output
+    uniform int32 tileLightIndices[]
+    )
+{
+    uniform float gBufferScale_x = 0.5f * (float)gBufferWidth;
+    uniform float gBufferScale_y = 0.5f * (float)gBufferHeight;
+        
+    uniform float frustumPlanes_xy[4] = {
+        -(cameraProj_11 * gBufferScale_x),
+         (cameraProj_11 * gBufferScale_x),
+         (cameraProj_22 * gBufferScale_y),
+        -(cameraProj_22 * gBufferScale_y) };
+    uniform float frustumPlanes_z[4] = {
+         tileEndX - gBufferScale_x,
+        -tileStartX + gBufferScale_x,
+         tileEndY - gBufferScale_y,
+        -tileStartY + gBufferScale_y };
+
+    for (uniform int i = 0; i < 4; ++i) {
+        uniform float norm = rsqrt(frustumPlanes_xy[i] * frustumPlanes_xy[i] + 
+                                   frustumPlanes_z[i] * frustumPlanes_z[i]);
+        frustumPlanes_xy[i] *= norm;
+        frustumPlanes_z[i] *= norm;
+    }
+
+    uniform int32 tileNumLights = 0;
+
+    foreach (lightIndex = 0 ... numLights) {
+        float light_positionView_z = light_positionView_z_array[lightIndex];
+        float light_attenuationEnd = light_attenuationEnd_array[lightIndex];
+        float light_attenuationEndNeg = -light_attenuationEnd;
+
+        float d = light_positionView_z - minZ;
+        bool inFrustum = (d >= light_attenuationEndNeg);
+
+        d = maxZ - light_positionView_z;
+        inFrustum = inFrustum && (d >= light_attenuationEndNeg);
+        
+        // This seems better than cif(!inFrustum) ccontinue; here since we
+        // don't actually need to mask the rest of this function - this is
+        // just a greedy early-out.  Could also structure all of this as
+        // nested if() statements, but this a bit easier to read
+        if (any(inFrustum)) {
+            float light_positionView_x = light_positionView_x_array[lightIndex];
+            float light_positionView_y = light_positionView_y_array[lightIndex];
+
+            d = light_positionView_z * frustumPlanes_z[0] + 
+                light_positionView_x * frustumPlanes_xy[0];
+            inFrustum = inFrustum && (d >= light_attenuationEndNeg);
+
+            d = light_positionView_z * frustumPlanes_z[1] + 
+                light_positionView_x * frustumPlanes_xy[1];
+            inFrustum = inFrustum && (d >= light_attenuationEndNeg);
+
+            d = light_positionView_z * frustumPlanes_z[2] + 
+                light_positionView_y * frustumPlanes_xy[2];
+            inFrustum = inFrustum && (d >= light_attenuationEndNeg);
+
+            d = light_positionView_z * frustumPlanes_z[3] + 
+                light_positionView_y * frustumPlanes_xy[3];
+            inFrustum = inFrustum && (d >= light_attenuationEndNeg);
+        
+            // Pack and store intersecting lights
+            cif (inFrustum) {
+                tileNumLights += packed_store_active(&tileLightIndices[tileNumLights], 
+                                                     lightIndex);
+            }
+        }
+    }
+
+    return tileNumLights;
+}
+
+
+static uniform int32
+IntersectLightsWithTile(
+    uniform int32 tileStartX, uniform int32 tileEndX,
+    uniform int32 tileStartY, uniform int32 tileEndY,
+    uniform int32 gBufferWidth, uniform int32 gBufferHeight,
+    // G-buffer data
+    uniform float zBuffer[],
+    // Camera data
+    uniform float cameraProj_11, uniform float cameraProj_22,
+    uniform float cameraProj_33, uniform float cameraProj_43,
+    uniform float cameraNear, uniform float cameraFar,
+    // Light Data
+    uniform int32 numLights,
+    uniform float light_positionView_x_array[],
+    uniform float light_positionView_y_array[],
+    uniform float light_positionView_z_array[],
+    uniform float light_attenuationEnd_array[],
+    // Output
+    uniform int32 tileLightIndices[]
+    )
+{
+    uniform float minZ, maxZ;
+    ComputeZBounds(tileStartX, tileEndX, tileStartY, tileEndY,
+        zBuffer, gBufferWidth, cameraProj_33, cameraProj_43, cameraNear, cameraFar,
+        minZ, maxZ);
+
+    uniform int32 tileNumLights = IntersectLightsWithTileMinMax(
+        tileStartX, tileEndX, tileStartY, tileEndY, minZ, maxZ,
+        gBufferWidth, gBufferHeight, cameraProj_11, cameraProj_22,
+        MAX_LIGHTS, light_positionView_x_array, light_positionView_y_array, 
+        light_positionView_z_array, light_attenuationEnd_array,
+        tileLightIndices);
+
+    return tileNumLights;
+}
+
+
+export void
+ShadeTile(
+    uniform int32 tileStartX, uniform int32 tileEndX,
+    uniform int32 tileStartY, uniform int32 tileEndY,
+    uniform int32 gBufferWidth, uniform int32 gBufferHeight,
+    uniform InputDataArrays &inputData,
+    // Camera data
+    uniform float cameraProj_11, uniform float cameraProj_22,
+    uniform float cameraProj_33, uniform float cameraProj_43,
+    // Light list
+    uniform int32 tileLightIndices[],
+    uniform int32 tileNumLights,
+    // UI
+    uniform bool visualizeLightCount,
+    // Output
+    uniform unsigned int8 framebuffer_r[],
+    uniform unsigned int8 framebuffer_g[],
+    uniform unsigned int8 framebuffer_b[]
+    )
+{
+    if (tileNumLights == 0 || visualizeLightCount) {
+        uniform unsigned int8 c = (unsigned int8)(min(tileNumLights << 2, 255));
+        for (uniform int32 y = tileStartY; y < tileEndY; ++y) {
+            foreach (x = tileStartX ... tileEndX) {
+                int32 framebufferIndex = (y * gBufferWidth + x);
+                framebuffer_r[framebufferIndex] = c;
+                framebuffer_g[framebufferIndex] = c;
+                framebuffer_b[framebufferIndex] = c;
+            }
+        }
+    } else {
+        uniform float twoOverGBufferWidth = 2.0f / gBufferWidth;
+        uniform float twoOverGBufferHeight = 2.0f / gBufferHeight;
+        
+        for (uniform int32 y = tileStartY; y < tileEndY; ++y) {
+            uniform float positionScreen_y = -(((0.5f + y) * twoOverGBufferHeight) - 1.f);
+
+            foreach (x = tileStartX ... tileEndX) {
+                int32 gBufferOffset = y * gBufferWidth + x;
+                
+                // Reconstruct position and (negative) view vector from G-buffer
+                float surface_positionView_x, surface_positionView_y, surface_positionView_z;
+                float Vneg_x, Vneg_y, Vneg_z;
+
+                float z = inputData.zBuffer[gBufferOffset];
+
+                // Compute screen/clip-space position
+                // NOTE: Mind DX11 viewport transform and pixel center!
+                float positionScreen_x = (0.5f + (float)(x)) * 
+                    twoOverGBufferWidth - 1.0f;
+
+                // Unproject depth buffer Z value into view space
+                surface_positionView_z = cameraProj_43 / (z - cameraProj_33);
+                surface_positionView_x = positionScreen_x * surface_positionView_z / 
+                    cameraProj_11;
+                surface_positionView_y = positionScreen_y * surface_positionView_z / 
+                    cameraProj_22;
+                
+                // We actually end up with a vector pointing *at* the
+                // surface (i.e. the negative view vector)
+                normalize3(surface_positionView_x, surface_positionView_y, 
+                           surface_positionView_z, Vneg_x, Vneg_y, Vneg_z);
+
+                // Reconstruct normal from G-buffer
+                float surface_normal_x, surface_normal_y, surface_normal_z;
+                float normal_x = half_to_float(inputData.normalEncoded_x[gBufferOffset]);
+                float normal_y = half_to_float(inputData.normalEncoded_y[gBufferOffset]);
+                    
+                float f = (normal_x - normal_x * normal_x) + (normal_y - normal_y * normal_y);
+                float m = sqrt(4.0f * f - 1.0f);
+                    
+                surface_normal_x = m * (4.0f * normal_x - 2.0f);
+                surface_normal_y = m * (4.0f * normal_y - 2.0f);
+                surface_normal_z = 3.0f - 8.0f * f;
+
+                // Load other G-buffer parameters
+                float surface_specularAmount = 
+                    half_to_float(inputData.specularAmount[gBufferOffset]);
+                float surface_specularPower  = 
+                    half_to_float(inputData.specularPower[gBufferOffset]);
+                float surface_albedo_x = Unorm8ToFloat32(inputData.albedo_x[gBufferOffset]);
+                float surface_albedo_y = Unorm8ToFloat32(inputData.albedo_y[gBufferOffset]);
+                float surface_albedo_z = Unorm8ToFloat32(inputData.albedo_z[gBufferOffset]);
+                
+                float lit_x = 0.0f;
+                float lit_y = 0.0f;
+                float lit_z = 0.0f;
+                for (uniform int32 tileLightIndex = 0; tileLightIndex < tileNumLights; 
+                     ++tileLightIndex) {
+                    uniform int32 lightIndex = tileLightIndices[tileLightIndex];
+                                        
+                    // Gather light data relevant to initial culling
+                    uniform float light_positionView_x = 
+                        inputData.lightPositionView_x[lightIndex];
+                    uniform float light_positionView_y = 
+                        inputData.lightPositionView_y[lightIndex];
+                    uniform float light_positionView_z = 
+                        inputData.lightPositionView_z[lightIndex];
+                    uniform float light_attenuationEnd = 
+                        inputData.lightAttenuationEnd[lightIndex];
+                    
+                    // Compute light vector
+                    float L_x = light_positionView_x - surface_positionView_x;
+                    float L_y = light_positionView_y - surface_positionView_y;
+                    float L_z = light_positionView_z - surface_positionView_z;
+
+                    float distanceToLight2 = dot3(L_x, L_y, L_z, L_x, L_y, L_z);
+                    
+                    // Clip at end of attenuation
+                    float light_attenutaionEnd2 = light_attenuationEnd * light_attenuationEnd;
+
+                    cif (distanceToLight2 < light_attenutaionEnd2) {                    
+                        float distanceToLight = sqrt(distanceToLight2);
+
+                        // HLSL "rcp" is allowed to be fairly inaccurate
+                        float distanceToLightRcp = rcp(distanceToLight);
+                        L_x *= distanceToLightRcp;
+                        L_y *= distanceToLightRcp;
+                        L_z *= distanceToLightRcp;
+
+                        // Start computing brdf
+                        float NdotL = dot3(surface_normal_x, surface_normal_y, 
+                                           surface_normal_z, L_x, L_y, L_z);
+                    
+                        // Clip back facing
+                        cif (NdotL > 0.0f) {
+                            uniform float light_attenuationBegin = 
+                                inputData.lightAttenuationBegin[lightIndex];
+
+                            // Light distance attenuation (linstep)
+                            float lightRange = (light_attenuationEnd - light_attenuationBegin);
+                            float falloffPosition = (light_attenuationEnd - distanceToLight);
+                            float attenuation = min(falloffPosition / lightRange, 1.0f);
+
+                            float H_x = (L_x - Vneg_x);
+                            float H_y = (L_y - Vneg_y);
+                            float H_z = (L_z - Vneg_z);
+                            normalize3(H_x, H_y, H_z, H_x, H_y, H_z);
+                    
+                            float NdotH = dot3(surface_normal_x, surface_normal_y, 
+                                               surface_normal_z, H_x, H_y, H_z);
+                            NdotH = max(NdotH, 0.0f);
+
+                            float specular = pow(NdotH, surface_specularPower);
+                            float specularNorm = (surface_specularPower + 2.0f) * 
+                                (1.0f / 8.0f);
+                            float specularContrib = surface_specularAmount * 
+                                specularNorm * specular;
+
+                            float k = attenuation * NdotL * (1.0f + specularContrib);
+                    
+                            uniform float light_color_x = inputData.lightColor_x[lightIndex];
+                            uniform float light_color_y = inputData.lightColor_y[lightIndex];
+                            uniform float light_color_z = inputData.lightColor_z[lightIndex];
+
+                            float lightContrib_x = surface_albedo_x * light_color_x;
+                            float lightContrib_y = surface_albedo_y * light_color_y;
+                            float lightContrib_z = surface_albedo_z * light_color_z;
+
+                            lit_x += lightContrib_x * k;
+                            lit_y += lightContrib_y * k;
+                            lit_z += lightContrib_z * k;
+                        }
+                    }
+                }
+
+                // Gamma correct
+                // These pows are pretty slow right now, but we can do
+                // something faster if really necessary to squeeze every
+                // last bit of performance out of it
+                float gamma = 1.0 / 2.2f;
+                lit_x = pow(clamp(lit_x, 0.0f, 1.0f), gamma);
+                lit_y = pow(clamp(lit_y, 0.0f, 1.0f), gamma);
+                lit_z = pow(clamp(lit_z, 0.0f, 1.0f), gamma);
+                
+                framebuffer_r[gBufferOffset] = Float32ToUnorm8(lit_x);
+                framebuffer_g[gBufferOffset] = Float32ToUnorm8(lit_y);
+                framebuffer_b[gBufferOffset] = Float32ToUnorm8(lit_z);
+            }
+        }
+    }
+}
+
+
+///////////////////////////////////////////////////////////////////////////
+// Static decomposition
+
+task void
+RenderTile(uniform int num_groups_x, uniform int num_groups_y,
+           uniform InputHeader &inputHeader,
+           uniform InputDataArrays &inputData,
+           uniform int visualizeLightCount,
+           // Output
+           uniform unsigned int8 framebuffer_r[],
+           uniform unsigned int8 framebuffer_g[],
+           uniform unsigned int8 framebuffer_b[]) {
+    uniform int32 group_y = taskIndex / num_groups_x;
+    uniform int32 group_x = taskIndex % num_groups_x;
+    uniform int32 tile_start_x = group_x * MIN_TILE_WIDTH;
+    uniform int32 tile_start_y = group_y * MIN_TILE_HEIGHT;
+    uniform int32 tile_end_x = tile_start_x + MIN_TILE_WIDTH;
+    uniform int32 tile_end_y = tile_start_y + MIN_TILE_HEIGHT;
+
+    uniform int framebufferWidth = inputHeader.framebufferWidth;
+    uniform int framebufferHeight = inputHeader.framebufferHeight;
+    uniform float cameraProj_00 = inputHeader.cameraProj[0][0];
+    uniform float cameraProj_11 = inputHeader.cameraProj[1][1];
+    uniform float cameraProj_22 = inputHeader.cameraProj[2][2];
+    uniform float cameraProj_32 = inputHeader.cameraProj[3][2];
+
+    // Light intersection: figure out which lights illuminate this tile.
+    uniform int tileLightIndices[MAX_LIGHTS];  // Light list for the tile
+    uniform int numTileLights = 
+        IntersectLightsWithTile(tile_start_x, tile_end_x, 
+                                tile_start_y, tile_end_y,
+                                framebufferWidth, framebufferHeight,
+                                inputData.zBuffer,
+                                cameraProj_00, cameraProj_11,
+                                cameraProj_22, cameraProj_32,
+                                inputHeader.cameraNear, inputHeader.cameraFar,
+                                MAX_LIGHTS,
+                                inputData.lightPositionView_x, 
+                                inputData.lightPositionView_y, 
+                                inputData.lightPositionView_z, 
+                                inputData.lightAttenuationEnd,
+                                tileLightIndices);
+
+    // And now shade the tile, using the lights in tileLightIndices
+    ShadeTile(tile_start_x, tile_end_x, tile_start_y, tile_end_y,
+              framebufferWidth, framebufferHeight, inputData,
+              cameraProj_00, cameraProj_11, cameraProj_22, cameraProj_32,
+              tileLightIndices, numTileLights, visualizeLightCount, 
+              framebuffer_r, framebuffer_g, framebuffer_b);
+}
+
+
+export void
+RenderStatic(uniform InputHeader &inputHeader,
+             uniform InputDataArrays &inputData,
+             uniform int visualizeLightCount,
+             // Output
+             uniform unsigned int8 framebuffer_r[],
+             uniform unsigned int8 framebuffer_g[],
+             uniform unsigned int8 framebuffer_b[]) {
+    uniform int num_groups_x = (inputHeader.framebufferWidth + 
+                                MIN_TILE_WIDTH - 1) / MIN_TILE_WIDTH;
+    uniform int num_groups_y = (inputHeader.framebufferHeight + 
+                                MIN_TILE_HEIGHT - 1) / MIN_TILE_HEIGHT;
+    uniform int num_groups = num_groups_x * num_groups_y;
+
+    // Launch a task to render each tile, each of which is MIN_TILE_WIDTH
+    // by MIN_TILE_HEIGHT pixels.
+    launch[num_groups] RenderTile(num_groups_x, num_groups_y,
+                                  inputHeader, inputData, visualizeLightCount,
+                                  framebuffer_r, framebuffer_g, framebuffer_b);
+}
+
+
+///////////////////////////////////////////////////////////////////////////
+// Routines for dynamic decomposition path
+
+// This computes the z min/max range for a whole row worth of tiles.
+export void
+ComputeZBoundsRow(
+    uniform int32 tileY,
+    uniform int32 tileWidth, uniform int32 tileHeight,
+    uniform int32 numTilesX, uniform int32 numTilesY,
+    // G-buffer data
+    uniform float zBuffer[],
+    uniform int32 gBufferWidth,
+    // Camera data
+    uniform float cameraProj_33, uniform float cameraProj_43,
+    uniform float cameraNear, uniform float cameraFar,
+    // Output
+    uniform float minZArray[],
+    uniform float maxZArray[]
+    )
+{
+    for (uniform int32 tileX = 0; tileX < numTilesX; ++tileX) {
+        uniform float minZ, maxZ;
+        ComputeZBounds(
+            tileX * tileWidth, tileX * tileWidth + tileWidth,
+            tileY * tileHeight, tileY * tileHeight + tileHeight,
+            zBuffer, gBufferWidth,
+            cameraProj_33, cameraProj_43, cameraNear, cameraFar,
+            minZ, maxZ);
+        minZArray[tileX] = minZ;
+        maxZArray[tileX] = maxZ;
+    }
+}
+
+
+// Reclassifies the lights with respect to four sub-tiles when we refine a tile.
+// numLights need not be a multiple of programCount here, but the input and output arrays
+// should be able to handle programCount-sized load/stores.
+export void
+SplitTileMinMax(
+    uniform int32 tileMidX, uniform int32 tileMidY,
+    // Subtile data (00, 10, 01, 11)
+    uniform float subtileMinZ[],
+    uniform float subtileMaxZ[],
+    // G-buffer data
+    uniform int32 gBufferWidth, uniform int32 gBufferHeight,
+    // Camera data
+    uniform float cameraProj_11, uniform float cameraProj_22,
+    // Light Data
+    uniform int32 lightIndices[],
+    uniform int32 numLights,
+    uniform float light_positionView_x_array[],
+    uniform float light_positionView_y_array[],
+    uniform float light_positionView_z_array[],
+    uniform float light_attenuationEnd_array[],
+    // Outputs
+    uniform int32 subtileIndices[],
+    uniform int32 subtileIndicesPitch,
+    uniform int32 subtileNumLights[]
+    )
+{
+    uniform float gBufferScale_x = 0.5f * (float)gBufferWidth;
+    uniform float gBufferScale_y = 0.5f * (float)gBufferHeight;
+        
+    uniform float frustumPlanes_xy[2] = { -(cameraProj_11 * gBufferScale_x),
+                                           (cameraProj_22 * gBufferScale_y) };
+    uniform float frustumPlanes_z[2] = { tileMidX - gBufferScale_x,
+                                         tileMidY - gBufferScale_y };
+
+    // Normalize
+    uniform float norm[2] = { rsqrt(frustumPlanes_xy[0] * frustumPlanes_xy[0] + 
+                                    frustumPlanes_z[0] * frustumPlanes_z[0]),
+                              rsqrt(frustumPlanes_xy[1] * frustumPlanes_xy[1] + 
+                                    frustumPlanes_z[1] * frustumPlanes_z[1]) };
+    frustumPlanes_xy[0] *= norm[0];
+    frustumPlanes_xy[1] *= norm[1];
+    frustumPlanes_z[0] *= norm[0];
+    frustumPlanes_z[1] *= norm[1];
+
+    // Initialize
+    uniform int32 subtileLightOffset[4];
+    subtileLightOffset[0] = 0 * subtileIndicesPitch;
+    subtileLightOffset[1] = 1 * subtileIndicesPitch;
+    subtileLightOffset[2] = 2 * subtileIndicesPitch;
+    subtileLightOffset[3] = 3 * subtileIndicesPitch;
+
+    foreach (i = 0 ... numLights) {
+        int32 lightIndex = lightIndices[i];
+
+        float light_positionView_x = light_positionView_x_array[lightIndex];
+        float light_positionView_y = light_positionView_y_array[lightIndex];
+        float light_positionView_z = light_positionView_z_array[lightIndex];
+        float light_attenuationEnd = light_attenuationEnd_array[lightIndex];
+        float light_attenuationEndNeg = -light_attenuationEnd;
+        
+        // Test lights again subtile z bounds
+        bool inFrustum[4];
+        inFrustum[0] = (light_positionView_z - subtileMinZ[0] >= light_attenuationEndNeg) &&
+            (subtileMaxZ[0] - light_positionView_z >= light_attenuationEndNeg);
+        inFrustum[1] = (light_positionView_z - subtileMinZ[1] >= light_attenuationEndNeg) && 
+            (subtileMaxZ[1] - light_positionView_z >= light_attenuationEndNeg);
+        inFrustum[2] = (light_positionView_z - subtileMinZ[2] >= light_attenuationEndNeg) && 
+            (subtileMaxZ[2] - light_positionView_z >= light_attenuationEndNeg);
+        inFrustum[3] = (light_positionView_z - subtileMinZ[3] >= light_attenuationEndNeg) && 
+            (subtileMaxZ[3] - light_positionView_z >= light_attenuationEndNeg);
+
+        float dx = light_positionView_z * frustumPlanes_z[0] + 
+            light_positionView_x * frustumPlanes_xy[0];
+        float dy = light_positionView_z * frustumPlanes_z[1] +
+            light_positionView_y * frustumPlanes_xy[1];
+        
+        cif (abs(dx) > light_attenuationEnd) {
+            bool positiveX = dx > 0.0f;
+            inFrustum[0] = inFrustum[0] &&  positiveX;    // 00 subtile
+            inFrustum[1] = inFrustum[1] && !positiveX;    // 10 subtile
+            inFrustum[2] = inFrustum[2] &&  positiveX;    // 01 subtile
+            inFrustum[3] = inFrustum[3] && !positiveX;    // 11 subtile
+        }
+        cif (abs(dy) > light_attenuationEnd) {
+            bool positiveY = dy > 0.0f;
+            inFrustum[0] = inFrustum[0] &&  positiveY;    // 00 subtile
+            inFrustum[1] = inFrustum[1] &&  positiveY;    // 10 subtile
+            inFrustum[2] = inFrustum[2] && !positiveY;    // 01 subtile
+            inFrustum[3] = inFrustum[3] && !positiveY;    // 11 subtile
+        }
+
+        // Pack and store intersecting lights
+        // TODO: Experiment with a loop here instead
+        cif (inFrustum[0])
+            subtileLightOffset[0] += 
+            packed_store_active(&subtileIndices[subtileLightOffset[0]],
+                                lightIndex);
+        cif (inFrustum[1])
+            subtileLightOffset[1] += 
+            packed_store_active(&subtileIndices[subtileLightOffset[1]],
+                                lightIndex);
+        cif (inFrustum[2])
+            subtileLightOffset[2] += 
+            packed_store_active(&subtileIndices[subtileLightOffset[2]], 
+                                lightIndex);
+        cif (inFrustum[3])
+            subtileLightOffset[3] += 
+            packed_store_active(&subtileIndices[subtileLightOffset[3]], 
+                                lightIndex);
+    }
+
+    subtileNumLights[0] = subtileLightOffset[0] - 0 * subtileIndicesPitch;
+    subtileNumLights[1] = subtileLightOffset[1] - 1 * subtileIndicesPitch;
+    subtileNumLights[2] = subtileLightOffset[2] - 2 * subtileIndicesPitch;
+    subtileNumLights[3] = subtileLightOffset[3] - 3 * subtileIndicesPitch;
+}
--- a/examples_ptx/deferred/kernels1.ispc
+++ b/examples_ptx/deferred/kernels1.ispc
@@ -0,0 +1,556 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+#ifdef __NVPTX__
+#warning "emitting DEVICE code"
+#define programCount warpSize()
+#define programIndex laneIndex()
+#define taskIndex    blockIndex0()
+#define taskCount    blockCount0()
+#define cif          if
+#else
+#warning "emitting HOST code"
+#endif
+
+
+#include "deferred.h"
+
+struct InputDataArrays
+{
+    float *zBuffer;
+    unsigned int16 *normalEncoded_x; // half float
+    unsigned int16 *normalEncoded_y; // half float
+    unsigned int16 *specularAmount; // half float
+    unsigned int16 *specularPower; // half float
+    unsigned int8 *albedo_x; // unorm8
+    unsigned int8 *albedo_y; // unorm8
+    unsigned int8 *albedo_z; // unorm8
+    float *lightPositionView_x;
+    float *lightPositionView_y;
+    float *lightPositionView_z;
+    float *lightAttenuationBegin;
+    float *lightColor_x;
+    float *lightColor_y;
+    float *lightColor_z;
+    float *lightAttenuationEnd;
+};
+
+struct InputHeader
+{
+    float cameraProj[4][4];
+    float cameraNear;
+    float cameraFar;
+
+    int32 framebufferWidth;
+    int32 framebufferHeight;
+    int32 numLights;
+    int32 inputDataChunkSize;
+    int32 inputDataArrayOffsets[idaNum];
+};
+
+
+///////////////////////////////////////////////////////////////////////////
+// Common utility routines
+
+static inline float
+dot3(float x, float y, float z, float a, float b, float c) {
+    return (x*a + y*b + z*c);
+}
+
+
+static inline void
+normalize3(float x, float y, float z, float &ox, float &oy, float &oz) {
+    float n = rsqrt(x*x + y*y + z*z);
+    ox = x * n;
+    oy = y * n;
+    oz = z * n;
+}
+
+
+static inline float
+Unorm8ToFloat32(unsigned int8 u) {
+    return (float)u * (1.0f / 255.0f);
+}
+
+
+static inline unsigned int8
+Float32ToUnorm8(float f) {
+    return (unsigned int8)(f * 255.0f);
+}
+
+
+static inline void
+ComputeZBounds(
+    uniform int32 tileStartX, uniform int32 tileEndX,
+    uniform int32 tileStartY, uniform int32 tileEndY,
+    // G-buffer data
+    uniform float zBuffer[],
+    uniform int32 gBufferWidth,
+    // Camera data
+    uniform float cameraProj_33, uniform float cameraProj_43,
+    uniform float cameraNear, uniform float cameraFar,
+    // Output
+    uniform float &minZ,
+    uniform float &maxZ
+    )
+{
+    // Find Z bounds
+    float laneMinZ = cameraFar;
+    float laneMaxZ = cameraNear;
+    for (uniform int32 y = tileStartY; y < tileEndY; ++y) 
+      foreach (x = tileStartX ... tileEndX) 
+      {
+        // Unproject depth buffer Z value into view space
+        float z = zBuffer[y * gBufferWidth + x];
+        float viewSpaceZ = cameraProj_43 / (z - cameraProj_33);
+
+        // Work out Z bounds for our samples
+        // Avoid considering skybox/background or otherwise invalid pixels
+        if ((viewSpaceZ < cameraFar) && (viewSpaceZ >= cameraNear)) {
+          laneMinZ = min(laneMinZ, viewSpaceZ);
+          laneMaxZ = max(laneMaxZ, viewSpaceZ);
+        }
+      }
+    minZ = reduce_min(laneMinZ);
+    maxZ = reduce_max(laneMaxZ);
+}
+
+
+static inline uniform int32
+IntersectLightsWithTileMinMax(
+    uniform int32 tileStartX, uniform int32 tileEndX,
+    uniform int32 tileStartY, uniform int32 tileEndY,
+    // Tile data
+    uniform float minZ,
+    uniform float maxZ,
+    // G-buffer data
+    uniform int32 gBufferWidth, uniform int32 gBufferHeight,
+    // Camera data
+    uniform float cameraProj_11, uniform float cameraProj_22,
+    // Light Data
+    uniform int32 numLights,
+    uniform float light_positionView_x_array[],
+    uniform float light_positionView_y_array[],
+    uniform float light_positionView_z_array[],
+    uniform float light_attenuationEnd_array[],
+    // Output
+    uniform int32 tileLightIndices[]
+    )
+{
+    uniform float gBufferScale_x = 0.5f * (float)gBufferWidth;
+    uniform float gBufferScale_y = 0.5f * (float)gBufferHeight;
+        
+    uniform float frustumPlanes_xy[4] = {
+        -(cameraProj_11 * gBufferScale_x),
+         (cameraProj_11 * gBufferScale_x),
+         (cameraProj_22 * gBufferScale_y),
+        -(cameraProj_22 * gBufferScale_y) };
+    uniform float frustumPlanes_z[4] = {
+         tileEndX - gBufferScale_x,
+        -tileStartX + gBufferScale_x,
+         tileEndY - gBufferScale_y,
+        -tileStartY + gBufferScale_y };
+
+    for (uniform int i = 0; i < 4; ++i) {
+        uniform float norm = rsqrt(frustumPlanes_xy[i] * frustumPlanes_xy[i] + 
+                                   frustumPlanes_z[i] * frustumPlanes_z[i]);
+        frustumPlanes_xy[i] *= norm;
+        frustumPlanes_z[i] *= norm;
+    }
+
+    uniform int32 tileNumLights = 0;
+
+    foreach (lightIndex = 0 ... numLights) 
+    {
+      float light_positionView_z = light_positionView_z_array[lightIndex];
+      float light_attenuationEnd = light_attenuationEnd_array[lightIndex];
+      float light_attenuationEndNeg = -light_attenuationEnd;
+
+      float d = light_positionView_z - minZ;
+      bool inFrustum = (d >= light_attenuationEndNeg);
+
+      d = maxZ - light_positionView_z;
+      inFrustum = inFrustum && (d >= light_attenuationEndNeg);
+
+      // This seems better than cif(!inFrustum) ccontinue; here since we
+      // don't actually need to mask the rest of this function - this is
+      // just a greedy early-out.  Could also structure all of this as
+      // nested if() statements, but this a bit easier to read
+      if (any(inFrustum)) 
+      {
+        float light_positionView_x = light_positionView_x_array[lightIndex];
+        float light_positionView_y = light_positionView_y_array[lightIndex];
+
+        d = light_positionView_z * frustumPlanes_z[0] + 
+          light_positionView_x * frustumPlanes_xy[0];
+        inFrustum = inFrustum && (d >= light_attenuationEndNeg);
+
+        d = light_positionView_z * frustumPlanes_z[1] + 
+          light_positionView_x * frustumPlanes_xy[1];
+        inFrustum = inFrustum && (d >= light_attenuationEndNeg);
+
+        d = light_positionView_z * frustumPlanes_z[2] + 
+          light_positionView_y * frustumPlanes_xy[2];
+        inFrustum = inFrustum && (d >= light_attenuationEndNeg);
+
+        d = light_positionView_z * frustumPlanes_z[3] + 
+          light_positionView_y * frustumPlanes_xy[3];
+        inFrustum = inFrustum && (d >= light_attenuationEndNeg);
+
+        // Pack and store intersecting lights
+        const bool active = inFrustum && lightIndex < numLights;
+
+        if(any(active))
+          tileNumLights += packed_store_active(active, &tileLightIndices[tileNumLights], lightIndex);
+      }
+    }
+
+    return tileNumLights;
+}
+
+
+static inline  uniform int32
+IntersectLightsWithTile(
+    uniform int32 tileStartX, uniform int32 tileEndX,
+    uniform int32 tileStartY, uniform int32 tileEndY,
+    uniform int32 gBufferWidth, uniform int32 gBufferHeight,
+    // G-buffer data
+    uniform float zBuffer[],
+    // Camera data
+    uniform float cameraProj_11, uniform float cameraProj_22,
+    uniform float cameraProj_33, uniform float cameraProj_43,
+    uniform float cameraNear, uniform float cameraFar,
+    // Light Data
+    uniform int32 numLights,
+    uniform float light_positionView_x_array[],
+    uniform float light_positionView_y_array[],
+    uniform float light_positionView_z_array[],
+    uniform float light_attenuationEnd_array[],
+    // Output
+    uniform int32 tileLightIndices[]
+    )
+{
+    uniform float minZ, maxZ;
+    ComputeZBounds(tileStartX, tileEndX, tileStartY, tileEndY,
+        zBuffer, gBufferWidth, cameraProj_33, cameraProj_43, cameraNear, cameraFar,
+        minZ, maxZ);
+
+    uniform int32 tileNumLights = IntersectLightsWithTileMinMax(
+        tileStartX, tileEndX, tileStartY, tileEndY, minZ, maxZ,
+        gBufferWidth, gBufferHeight, cameraProj_11, cameraProj_22,
+        MAX_LIGHTS, light_positionView_x_array, light_positionView_y_array, 
+        light_positionView_z_array, light_attenuationEnd_array,
+        tileLightIndices);
+
+    return tileNumLights;
+}
+
+
+static inline void
+ShadeTile(
+    uniform int32 tileStartX, uniform int32 tileEndX,
+    uniform int32 tileStartY, uniform int32 tileEndY,
+    uniform int32 gBufferWidth, uniform int32 gBufferHeight,
+    const uniform InputDataArrays &inputData,
+    // Camera data
+    uniform float cameraProj_11, uniform float cameraProj_22,
+    uniform float cameraProj_33, uniform float cameraProj_43,
+    // Light list
+    uniform int32 tileLightIndices[],
+    uniform int32 tileNumLights,
+    // UI
+    uniform bool visualizeLightCount,
+    // Output
+    uniform unsigned int8 framebuffer_r[],
+    uniform unsigned int8 framebuffer_g[],
+    uniform unsigned int8 framebuffer_b[]
+    )
+{
+    if (tileNumLights == 0 || visualizeLightCount) {
+        uniform unsigned int8 c = (unsigned int8)(min(tileNumLights << 2, 255));
+        for (uniform int32 y = tileStartY; y < tileEndY; ++y) 
+          foreach (x = tileStartX ... tileEndX) 
+          { 
+            int32 framebufferIndex = (y * gBufferWidth + x);
+            framebuffer_r[framebufferIndex] = c;
+            framebuffer_g[framebufferIndex] = c;
+            framebuffer_b[framebufferIndex] = c;
+          }
+    } else {
+        uniform float twoOverGBufferWidth = 2.0f / gBufferWidth;
+        uniform float twoOverGBufferHeight = 2.0f / gBufferHeight;
+        
+        for (uniform int32 y = tileStartY; y < tileEndY; ++y) {
+          uniform float positionScreen_y = -(((0.5f + y) * twoOverGBufferHeight) - 1.f);
+
+          foreach (x = tileStartX ... tileEndX) {
+            int32 gBufferOffset = y * gBufferWidth + x;
+
+            // Reconstruct position and (negative) view vector from G-buffer
+            float surface_positionView_x, surface_positionView_y, surface_positionView_z;
+            float Vneg_x, Vneg_y, Vneg_z;
+
+            float z = inputData.zBuffer[gBufferOffset];
+
+            // Compute screen/clip-space position
+            // NOTE: Mind DX11 viewport transform and pixel center!
+            float positionScreen_x = (0.5f + (float)(x)) * 
+              twoOverGBufferWidth - 1.0f;
+
+            // Unproject depth buffer Z value into view space
+            surface_positionView_z = cameraProj_43 / (z - cameraProj_33);
+            surface_positionView_x = positionScreen_x * surface_positionView_z / 
+              cameraProj_11;
+            surface_positionView_y = positionScreen_y * surface_positionView_z / 
+              cameraProj_22;
+
+            // We actually end up with a vector pointing *at* the
+            // surface (i.e. the negative view vector)
+            normalize3(surface_positionView_x, surface_positionView_y, 
+                surface_positionView_z, Vneg_x, Vneg_y, Vneg_z);
+
+            // Reconstruct normal from G-buffer
+            float surface_normal_x, surface_normal_y, surface_normal_z;
+            float normal_x = half_to_float(inputData.normalEncoded_x[gBufferOffset]);
+            float normal_y = half_to_float(inputData.normalEncoded_y[gBufferOffset]);
+
+            float f = (normal_x - normal_x * normal_x) + (normal_y - normal_y * normal_y);
+            float m = sqrt(4.0f * f - 1.0f);
+
+            surface_normal_x = m * (4.0f * normal_x - 2.0f);
+            surface_normal_y = m * (4.0f * normal_y - 2.0f);
+            surface_normal_z = 3.0f - 8.0f * f;
+
+            // Load other G-buffer parameters
+            float surface_specularAmount = 
+              half_to_float(inputData.specularAmount[gBufferOffset]);
+            float surface_specularPower  = 
+              half_to_float(inputData.specularPower[gBufferOffset]);
+            float surface_albedo_x = Unorm8ToFloat32(inputData.albedo_x[gBufferOffset]);
+            float surface_albedo_y = Unorm8ToFloat32(inputData.albedo_y[gBufferOffset]);
+            float surface_albedo_z = Unorm8ToFloat32(inputData.albedo_z[gBufferOffset]);
+
+            float lit_x = 0.0f;
+            float lit_y = 0.0f;
+            float lit_z = 0.0f;
+            for (uniform int32 tileLightIndex = 0; tileLightIndex < tileNumLights; 
+                ++tileLightIndex) {
+              uniform int32 lightIndex = tileLightIndices[tileLightIndex];
+
+              // Gather light data relevant to initial culling
+              uniform float light_positionView_x = 
+                inputData.lightPositionView_x[lightIndex];
+              uniform float light_positionView_y = 
+                inputData.lightPositionView_y[lightIndex];
+              uniform float light_positionView_z = 
+                inputData.lightPositionView_z[lightIndex];
+              uniform float light_attenuationEnd = 
+                inputData.lightAttenuationEnd[lightIndex];
+
+              // Compute light vector
+              float L_x = light_positionView_x - surface_positionView_x;
+              float L_y = light_positionView_y - surface_positionView_y;
+              float L_z = light_positionView_z - surface_positionView_z;
+
+              float distanceToLight2 = dot3(L_x, L_y, L_z, L_x, L_y, L_z);
+
+              // Clip at end of attenuation
+              float light_attenutaionEnd2 = light_attenuationEnd * light_attenuationEnd;
+
+              cif (distanceToLight2 < light_attenutaionEnd2) {                    
+                float distanceToLight = sqrt(distanceToLight2);
+
+                // HLSL "rcp" is allowed to be fairly inaccurate
+                float distanceToLightRcp = rcp(distanceToLight);
+                L_x *= distanceToLightRcp;
+                L_y *= distanceToLightRcp;
+                L_z *= distanceToLightRcp;
+
+                // Start computing brdf
+                float NdotL = dot3(surface_normal_x, surface_normal_y, 
+                    surface_normal_z, L_x, L_y, L_z);
+
+                // Clip back facing
+                cif (NdotL > 0.0f) {
+                  uniform float light_attenuationBegin = 
+                    inputData.lightAttenuationBegin[lightIndex];
+
+                  // Light distance attenuation (linstep)
+                  float lightRange = (light_attenuationEnd - light_attenuationBegin);
+                  float falloffPosition = (light_attenuationEnd - distanceToLight);
+                  float attenuation = min(falloffPosition / lightRange, 1.0f);
+
+                  float H_x = (L_x - Vneg_x);
+                  float H_y = (L_y - Vneg_y);
+                  float H_z = (L_z - Vneg_z);
+                  normalize3(H_x, H_y, H_z, H_x, H_y, H_z);
+
+                  float NdotH = dot3(surface_normal_x, surface_normal_y, 
+                      surface_normal_z, H_x, H_y, H_z);
+                  NdotH = max(NdotH, 0.0f);
+
+                  float specular = pow(NdotH, surface_specularPower);
+                  float specularNorm = (surface_specularPower + 2.0f) * 
+                    (1.0f / 8.0f);
+                  float specularContrib = surface_specularAmount * 
+                    specularNorm * specular;
+
+                  float k = attenuation * NdotL * (1.0f + specularContrib);
+
+                  uniform float light_color_x = inputData.lightColor_x[lightIndex];
+                  uniform float light_color_y = inputData.lightColor_y[lightIndex];
+                  uniform float light_color_z = inputData.lightColor_z[lightIndex];
+
+                  float lightContrib_x = surface_albedo_x * light_color_x;
+                  float lightContrib_y = surface_albedo_y * light_color_y;
+                  float lightContrib_z = surface_albedo_z * light_color_z;
+
+                  lit_x += lightContrib_x * k;
+                  lit_y += lightContrib_y * k;
+                  lit_z += lightContrib_z * k;
+                }
+              }
+            }
+
+            // Gamma correct
+            // These pows are pretty slow right now, but we can do
+            // something faster if really necessary to squeeze every
+            // last bit of performance out of it
+            float gamma = 1.0 / 2.2f;
+            lit_x = pow(clamp(lit_x, 0.0f, 1.0f), gamma);
+            lit_y = pow(clamp(lit_y, 0.0f, 1.0f), gamma);
+            lit_z = pow(clamp(lit_z, 0.0f, 1.0f), gamma);
+
+            framebuffer_r[gBufferOffset] = Float32ToUnorm8(lit_x);
+            framebuffer_g[gBufferOffset] = Float32ToUnorm8(lit_y);
+            framebuffer_b[gBufferOffset] = Float32ToUnorm8(lit_z);
+          }
+        }
+    }
+}
+
+
+///////////////////////////////////////////////////////////////////////////
+// Static decomposition
+
+void task
+RenderTile(uniform int num_groups_x, uniform int num_groups_y,
+           const  uniform InputHeader inputHeaderPtr[],
+           const  uniform InputDataArrays inputDataPtr[],
+           uniform int visualizeLightCount,
+           // Output
+           uniform unsigned int8 framebuffer_r[],
+           uniform unsigned int8 framebuffer_g[],
+           uniform unsigned int8 framebuffer_b[]) {
+  if (taskIndex >= taskCount) return;
+  const  uniform InputHeader inputHeader = *inputHeaderPtr;
+  const  uniform InputDataArrays inputData = *inputDataPtr;
+
+    uniform int32 group_y = taskIndex / num_groups_x;
+    uniform int32 group_x = taskIndex % num_groups_x;
+    uniform int32 tile_start_x = group_x * MIN_TILE_WIDTH;
+    uniform int32 tile_start_y = group_y * MIN_TILE_HEIGHT;
+    uniform int32 tile_end_x = tile_start_x + MIN_TILE_WIDTH;
+    uniform int32 tile_end_y = tile_start_y + MIN_TILE_HEIGHT;
+
+    uniform int framebufferWidth = inputHeader.framebufferWidth;
+    uniform int framebufferHeight = inputHeader.framebufferHeight;
+    uniform float cameraProj_00 = inputHeader.cameraProj[0][0];
+    uniform float cameraProj_11 = inputHeader.cameraProj[1][1];
+    uniform float cameraProj_22 = inputHeader.cameraProj[2][2];
+    uniform float cameraProj_32 = inputHeader.cameraProj[3][2];
+
+    // Light intersection: figure out which lights illuminate this tile.
+#if 1
+    uniform int * uniform tileLightIndices = uniform new uniform int [MAX_LIGHTS];
+#else
+    uniform int tileLightIndices[MAX_LIGHTS];  // Light list for the tile
+#endif
+#if 1
+    uniform int numTileLights = 
+        IntersectLightsWithTile(tile_start_x, tile_end_x, 
+                                tile_start_y, tile_end_y,
+                                framebufferWidth, framebufferHeight,
+                                inputData.zBuffer,
+                                cameraProj_00, cameraProj_11,
+                                cameraProj_22, cameraProj_32,
+                                inputHeader.cameraNear, inputHeader.cameraFar,
+                                MAX_LIGHTS,
+                                inputData.lightPositionView_x, 
+                                inputData.lightPositionView_y, 
+                                inputData.lightPositionView_z, 
+                                inputData.lightAttenuationEnd,
+                                tileLightIndices);
+
+    // And now shade the tile, using the lights in tileLightIndices
+    ShadeTile(tile_start_x, tile_end_x, tile_start_y, tile_end_y,
+              framebufferWidth, framebufferHeight, inputData,
+              cameraProj_00, cameraProj_11, cameraProj_22, cameraProj_32,
+              tileLightIndices, numTileLights, visualizeLightCount, 
+              framebuffer_r, framebuffer_g, framebuffer_b);
+#endif
+#if 1
+    delete tileLightIndices;
+#endif
+}
+
+
+export void
+RenderStatic(uniform InputHeader inputHeaderPtr[],
+             uniform InputDataArrays inputDataPtr[],
+             uniform int visualizeLightCount,
+             // Output
+             uniform unsigned int8 framebuffer_r[],
+             uniform unsigned int8 framebuffer_g[],
+             uniform unsigned int8 framebuffer_b[]) {
+
+  const uniform InputHeader inputHeader = *inputHeaderPtr;
+  const uniform InputDataArrays inputData = *inputDataPtr;
+
+
+    uniform int num_groups_x = (inputHeader.framebufferWidth + 
+                                MIN_TILE_WIDTH - 1) / MIN_TILE_WIDTH;
+    uniform int num_groups_y = (inputHeader.framebufferHeight + 
+                                MIN_TILE_HEIGHT - 1) / MIN_TILE_HEIGHT;
+    uniform int num_groups = num_groups_x * num_groups_y;
+
+    // Launch a task to render each tile, each of which is MIN_TILE_WIDTH
+    // by MIN_TILE_HEIGHT pixels.
+    launch[num_groups] RenderTile(num_groups_x, num_groups_y,
+                                  inputHeaderPtr, inputDataPtr, visualizeLightCount,
+                                  framebuffer_r, framebuffer_g, framebuffer_b);
+    sync;
+}
+
+
+
--- a/examples_ptx/deferred/main.cpp
+++ b/examples_ptx/deferred/main.cpp
@@ -0,0 +1,149 @@
+/*
+  Copyright (c) 2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+#ifdef _MSC_VER
+#define ISPC_IS_WINDOWS
+#define NOMINMAX
+#elif defined(__linux__)
+#define ISPC_IS_LINUX
+#elif defined(__APPLE__)
+#define ISPC_IS_APPLE
+#endif
+
+#include <fcntl.h>
+#include <float.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <stdint.h>
+#include <algorithm>
+#include <assert.h>
+#include <vector>
+#ifdef ISPC_IS_WINDOWS
+  #define WIN32_LEAN_AND_MEAN
+  #include <windows.h>
+#endif
+#include "deferred.h"
+#include "kernels_ispc.h"
+#include "../timing.h"
+
+///////////////////////////////////////////////////////////////////////////
+
+int main(int argc, char** argv) {
+    if (argc < 2) {
+        printf("usage: deferred_shading <input_file (e.g. data/pp1280x720.bin)> [tasks iterations] [serial iterations]\n");
+        return 1;
+    }
+    static unsigned int test_iterations[] = {5, 3, 500}; //last value is for nframes, it is scale.
+    if (argc == 5) {
+        for (int i = 0; i < 3; i++) {
+            test_iterations[i] = atoi(argv[2 + i]);
+        }
+    }
+
+    InputData *input = CreateInputDataFromFile(argv[1]);
+    if (!input) {
+        printf("Failed to load input file \"%s\"!\n", argv[1]);
+        return 1;
+    }
+
+    Framebuffer framebuffer(input->header.framebufferWidth,
+                            input->header.framebufferHeight);
+
+    InitDynamicC(input);
+#ifdef __cilk
+    InitDynamicCilk(input);
+#endif // __cilk
+
+    int nframes = test_iterations[2];
+    double ispcCycles = 1e30;
+    for (int i = 0; i < test_iterations[0]; ++i) {
+        framebuffer.clear();
+        reset_and_start_timer();
+        for (int j = 0; j < nframes; ++j)
+            ispc::RenderStatic(input->header, input->arrays,
+                               VISUALIZE_LIGHT_COUNT,
+                               framebuffer.r, framebuffer.g, framebuffer.b);
+        double mcycles = get_elapsed_mcycles() / nframes;
+        printf("@time of ISPC + TASKS run:\t\t\t[%.3f] million cycles\n", mcycles);
+        ispcCycles = std::min(ispcCycles, mcycles);
+    }
+    printf("[ispc static + tasks]:\t\t[%.3f] million cycles to render "
+           "%d x %d image\n", ispcCycles,
+           input->header.framebufferWidth, input->header.framebufferHeight);
+    WriteFrame("deferred-ispc-static.ppm", input, framebuffer);
+
+    nframes = 3;
+#ifdef __cilk
+    double dynamicCilkCycles = 1e30;
+    for (int i = 0; i < test_iterations[1]; ++i) {
+        framebuffer.clear();
+        reset_and_start_timer();
+        for (int j = 0; j < nframes; ++j)
+            DispatchDynamicCilk(input, &framebuffer);
+        double mcycles = get_elapsed_mcycles() / nframes;
+        printf("@time of serial run:\t\t\t[%.3f] million cycles\n", mcycles);
+        dynamicCilkCycles = std::min(dynamicCilkCycles, mcycles);
+    }
+    printf("[ispc + Cilk dynamic]:\t\t[%.3f] million cycles to render image\n", 
+           dynamicCilkCycles);
+    WriteFrame("deferred-ispc-dynamic.ppm", input, framebuffer);
+#endif // __cilk
+
+    double serialCycles = 1e30;
+    for (int i = 0; i < test_iterations[1]; ++i) {
+        framebuffer.clear();
+        reset_and_start_timer();
+        for (int j = 0; j < nframes; ++j)
+            DispatchDynamicC(input, &framebuffer);
+        double mcycles = get_elapsed_mcycles() / nframes;
+        printf("@time of serial run:\t\t\t[%.3f] million cycles\n", mcycles);
+        serialCycles = std::min(serialCycles, mcycles);
+    }
+    printf("[C++ serial dynamic, 1 core]:\t[%.3f] million cycles to render image\n", 
+           serialCycles);
+    WriteFrame("deferred-serial-dynamic.ppm", input, framebuffer);
+
+#ifdef __cilk
+    printf("\t\t\t\t(%.2fx speedup from static ISPC, %.2fx from Cilk+ISPC)\n", 
+           serialCycles/ispcCycles, serialCycles/dynamicCilkCycles);
+#else
+    printf("\t\t\t\t(%.2fx speedup from ISPC + tasks)\n", serialCycles/ispcCycles);
+#endif // __cilk
+
+    DeleteInputData(input);
+
+    return 0;
+}
--- a/examples_ptx/deferred/main_cu.cpp
+++ b/examples_ptx/deferred/main_cu.cpp
@@ -0,0 +1,139 @@
+/*
+  Copyright (c) 2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+#ifdef _MSC_VER
+#define ISPC_IS_WINDOWS
+#define NOMINMAX
+#elif defined(__linux__)
+#define ISPC_IS_LINUX
+#elif defined(__APPLE__)
+#define ISPC_IS_APPLE
+#endif
+
+#include <fcntl.h>
+#include <float.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <stdint.h>
+#include <algorithm>
+#include <assert.h>
+#include <vector>
+#ifdef ISPC_IS_WINDOWS
+  #define WIN32_LEAN_AND_MEAN
+  #include <windows.h>
+#endif
+#include "deferred.h"
+#include "kernels_ispc.h"
+#include "../timing.h"
+
+///////////////////////////////////////////////////////////////////////////
+
+int main(int argc, char** argv) {
+    if (argc != 2) {
+        printf("usage: deferred_shading <input_file (e.g. data/pp1280x720.bin)>\n");
+        return 1;
+    }
+
+    InputData *input = CreateInputDataFromFile(argv[1]);
+    if (!input) {
+        printf("Failed to load input file \"%s\"!\n", argv[1]);
+        return 1;
+    }
+
+    Framebuffer framebuffer(input->header.framebufferWidth,
+                            input->header.framebufferHeight);
+
+    InitDynamicC(input);
+#ifdef __cilk
+    InitDynamicCilk(input);
+#endif // __cilk
+
+    int nframes = 5;
+    double ispcCycles = 1e30;
+    for (int i = 0; i < 5; ++i) {
+        framebuffer.clear();
+        reset_and_start_timer();
+        for (int j = 0; j < nframes; ++j)
+            ispc::RenderStatic(input->header, input->arrays,
+                               VISUALIZE_LIGHT_COUNT,
+                               framebuffer.r, framebuffer.g, framebuffer.b);
+        double mcycles = get_elapsed_mcycles() / nframes;
+        ispcCycles = std::min(ispcCycles, mcycles);
+    }
+    printf("[ispc static + tasks]:\t\t[%.3f] million cycles to render "
+           "%d x %d image\n", ispcCycles,
+           input->header.framebufferWidth, input->header.framebufferHeight);
+    WriteFrame("deferred-ispc-static.ppm", input, framebuffer);
+
+#ifdef __cilk
+    double dynamicCilkCycles = 1e30;
+    for (int i = 0; i < 5; ++i) {
+        framebuffer.clear();
+        reset_and_start_timer();
+        for (int j = 0; j < nframes; ++j)
+            DispatchDynamicCilk(input, &framebuffer);
+        double mcycles = get_elapsed_mcycles() / nframes;
+        dynamicCilkCycles = std::min(dynamicCilkCycles, mcycles);
+    }
+    printf("[ispc + Cilk dynamic]:\t\t[%.3f] million cycles to render image\n", 
+           dynamicCilkCycles);
+    WriteFrame("deferred-ispc-dynamic.ppm", input, framebuffer);
+#endif // __cilk
+
+    double serialCycles = 1e30;
+    for (int i = 0; i < 5; ++i) {
+        framebuffer.clear();
+        reset_and_start_timer();
+        for (int j = 0; j < nframes; ++j)
+            DispatchDynamicC(input, &framebuffer);
+        double mcycles = get_elapsed_mcycles() / nframes;
+        serialCycles = std::min(serialCycles, mcycles);
+    }
+    printf("[C++ serial dynamic, 1 core]:\t[%.3f] million cycles to render image\n", 
+           serialCycles);
+    WriteFrame("deferred-serial-dynamic.ppm", input, framebuffer);
+
+#ifdef __cilk
+    printf("\t\t\t\t(%.2fx speedup from static ISPC, %.2fx from Cilk+ISPC)\n", 
+           serialCycles/ispcCycles, serialCycles/dynamicCilkCycles);
+#else
+    printf("\t\t\t\t(%.2fx speedup from ISPC + tasks)\n", serialCycles/ispcCycles);
+#endif // __cilk
+
+    DeleteInputData(input);
+
+    return 0;
+}