diff --git a/examples/README.txt b/examples/README.txt
index 32717e44..7dcae8bf 100644
--- a/examples/README.txt
+++ b/examples/README.txt
@@ -13,6 +13,7 @@ against regular serial C++ implementations, printing out a comparison of
 the runtimes and the speedup delivered by ispc.  It may be instructive to
 do a side-by-side diff of the C++ and ispc implementations of these
 algorithms to learn more about wirting ispc code.
+
  
 AOBench
 =======
@@ -27,6 +28,7 @@ It executes the program for the given number of iterations, rendering an
 (xres x yres) image each time and measuring the computation time with both
 serial and ispc implementations.
 
+
 AOBench_Instrumented
 ====================
 
@@ -40,12 +42,47 @@ is provided in the instrument.cpp file.
 *** Note: on Linux, this example currently hits an assertion in LLVM during
 *** compilation
 
+
+Deferred
+========
+
+This is an extensive example of using ispc for efficient
+deferred shading of scenes with thousands of lights; it's an implementation
+of the algorithm that Johan Andersson described at SIGGRAPH 2009,
+implemented by Andrew Lauritzen and Jefferson Montgomery.  The basic idea
+is that a pre-rendered G-buffer is partitioned into tiles, and in each
+tile, the set of lights that contribute to the tile is first computed.
+Then, the pixels in the tile are shaded using just those light
+sources. (See slides 19-29 of
+http://s09.idav.ucdavis.edu/talks/04-JAndersson-ParallelFrostbite-Siggraph09.pdf
+for more details on the algorithm.)
+
+This directory includes three implementations of the algorithm:
+
+- An ispc implementation that first does a static partitioning of the
+  screen into tiles to parallelize across the CPU cores.  Within each tile
+  ispc kernels provide highly efficient implementations of the light
+  culling and shading calculations.
+- A "best practices" serial C++ implementation.  This implementation does a
+  dynamic partitioning of the screen, refining tiles with significant Z
+  depth complexity (these tiles often have a large number of lights that
+  affect them).  Within each final tile, the pixels are shaded using
+  regular C++ code.
+- If the Cilk extensions are available in your compiler, an ispc
+  implementation that uses Cilk will also be built.
+  (See http://software.intel.com/en-us/articles/intel-cilk-plus/).  Like 
+  the "best practices" serial implementation, this version does dynamic
+  tile partitioning for better load balancing and then uses ispc for the
+  light culling and shading.
+
+
 Mandelbrot
 ==========
 
 Mandelbrot set generation.  This example is extensively documented at the
 http://ispc.github.com/example.html page.
 
+
 Mandelbrot_tasks
 ================
 
@@ -58,6 +95,7 @@ using tasks with ispc, no task system is mandated; the user is free to plug
 in any task system they want, for ease of interoperating with existing task
 systems.
 
+
 Noise
 =====
 
@@ -71,6 +109,7 @@ Options
 This program implements both the Black-Scholes and Binomial options pricing
 models in both ispc and regular serial C++ code.
 
+
 RT
 ==
 
@@ -87,6 +126,7 @@ and triangle intersection code from pbrt; see the pbrt source code and/or
 "Physically Based Rendering" book for more about the basic algorithmic
 details.
 
+
 Simple
 ======
 
@@ -94,6 +134,7 @@ This is a simple "hello world" type program that shows a ~10 line
 application program calling out to a ~5 line ispc program to do a simple
 computation.
 
+
 Volume
 ======
 
diff --git a/examples/deferred/Makefile b/examples/deferred/Makefile
new file mode 100644
index 00000000..840af3de
--- /dev/null
+++ b/examples/deferred/Makefile
@@ -0,0 +1,44 @@
+
+# Builds the deferred_shading example.  C++ sources are compiled with $(CXX)
+# and kernels.ispc with $(ISPC); all objects and the generated
+# kernels_ispc.h header are placed in objs/.
+
+ARCH := $(shell uname)
+
+TASK_CXX=../tasksys.cpp
+TASK_LIB=-lpthread
+TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
+
+CXX=g++
+CXXFLAGS=-Iobjs/ -O3 -Wall -m64
+ISPC=ispc
+ISPCFLAGS=-O2 --target=sse4x2 --arch=x86-64 --math-lib=fast
+
+OBJS=objs/main.o objs/common.o objs/kernels_ispc.o objs/dynamic_c.o objs/dynamic_cilk.o
+
+default: deferred_shading
+
+.PHONY: default dirs clean
+.PRECIOUS: objs/kernels_ispc.h
+
+dirs:
+	/bin/mkdir -p objs/
+
+clean:
+	/bin/rm -rf objs *~ deferred_shading
+
+# "dirs" is phony, so every rule below lists it as an order-only prerequisite
+# (after the "|"): objs/ is created before anything is written into it, even
+# under "make -j", but it never forces a relink or recompile by itself.
+deferred_shading: $(OBJS) $(TASK_OBJ) | dirs
+	$(CXX) $(CXXFLAGS) -o $@ $(OBJS) $(TASK_OBJ) -lm $(TASK_LIB)
+
+objs/%.o: %.cpp objs/kernels_ispc.h deferred.h | dirs
+	$(CXX) $< $(CXXFLAGS) -c -o $@
+
+objs/%.o: ../%.cpp | dirs
+	$(CXX) $< $(CXXFLAGS) -c -o $@
+
+# One ispc invocation produces both the object file and the C++ header.
+objs/%_ispc.h objs/%_ispc.o: %.ispc | dirs
+	$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
diff --git a/examples/deferred/common.cpp b/examples/deferred/common.cpp
new file mode 100644
index 00000000..c8fdc36a
--- /dev/null
+++ b/examples/deferred/common.cpp
@@ -0,0 +1,209 @@
+/*
+  Copyright (c) 2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+#ifdef _MSC_VER
+#define _CRT_SECURE_NO_WARNINGS
+#define ISPC_IS_WINDOWS
+#elif defined(__linux__)
+#define ISPC_IS_LINUX
+#elif defined(__APPLE__)
+#define ISPC_IS_APPLE
+#endif
+
+#include <fcntl.h>
+#include <float.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <stdint.h>
+#include <algorithm>
+#include <assert.h>
+#include <vector>
+#ifdef ISPC_IS_WINDOWS
+  #define WIN32_LEAN_AND_MEAN
+  #include <windows.h>
+#endif
+#ifdef ISPC_IS_LINUX
+  #include <malloc.h>
+#endif
+#include "deferred.h"
+#include "../timing.h"
+
+///////////////////////////////////////////////////////////////////////////
+
+// Allocates `size` bytes aligned to `alignment` (which must be a power of
+// two); returns NULL on failure.  Release the block with lAlignedFree().
+static void *
+lAlignedMalloc(int64_t size, int32_t alignment) {
+#if defined(ISPC_IS_WINDOWS)
+    return _aligned_malloc(size, alignment);
+#elif defined(ISPC_IS_LINUX)
+    return memalign(alignment, size);
+#else
+    // Apple/other: over-allocate and stash the malloc() pointer below the block.
+    void *mem = malloc(size + (alignment-1) + sizeof(void*));
+    if (!mem) return 0;
+    char *amem = ((char*)mem) + sizeof(void*);
+    // Mask so the padding is 0, not `alignment`, when amem is already aligned.
+    amem += (alignment - (reinterpret_cast<uint64_t>(amem) & (alignment-1))) & (alignment-1);
+    ((void**)amem)[-1] = mem;
+    return amem;
+#endif
+}
+
+// Releases a block obtained from lAlignedMalloc(); NULL is ignored.
+static void
+lAlignedFree(void *ptr) {
+#if defined(ISPC_IS_WINDOWS)
+    _aligned_free(ptr);
+#elif defined(ISPC_IS_LINUX)
+    free(ptr);
+#else
+    if (ptr) free(((void**)ptr)[-1]);
+#endif
+}
+
+
+// Allocates the three nPixels-byte, ALIGNMENT_BYTES-aligned color planes.
+Framebuffer::Framebuffer(int width, int height)
+    : nPixels(width * height) {
+    uint8_t **planes[] = { &r, &g, &b };
+    for (int i = 0; i < 3; ++i)
+        *planes[i] = (uint8_t *)lAlignedMalloc(nPixels, ALIGNMENT_BYTES);
+}
+
+// Releases the color planes allocated by the constructor.
+Framebuffer::~Framebuffer() {
+    uint8_t *planes[] = { r, g, b };
+    for (int i = 0; i < 3; ++i)
+        lAlignedFree(planes[i]);
+}
+
+// Zeroes every byte of all three color planes.
+void Framebuffer::clear() {
+    uint8_t *planes[] = { r, g, b };
+    for (int i = 0; i < 3; ++i)
+        memset(planes[i], 0, nPixels);
+}
+
+// Loads an input file: an ispc::InputHeader followed by a chunk of
+// header.inputDataChunkSize bytes holding all of the arrays.  Returns a new
+// InputData whose `arrays` pointers point into that chunk, or NULL (after
+// releasing everything acquired so far) if the file cannot be read.
+InputData *
+CreateInputDataFromFile(const char *path) {
+    FILE *in = fopen(path, "rb");
+    if (!in) return 0;
+
+    InputData *input = new InputData;
+    // Lets the cleanup below tell whether the chunk was ever allocated
+    input->chunk = NULL;
+
+    // Load header, then the data chunk it describes
+    const char *problem = NULL;
+    if (fread(&input->header, sizeof(ispc::InputHeader), 1, in) != 1)
+        problem = "Premature EOF reading";
+    else {
+        input->chunk = (uint8_t *)lAlignedMalloc(input->header.inputDataChunkSize,
+                                                 ALIGNMENT_BYTES);
+        if (input->chunk == NULL)
+            problem = "Out of memory loading";
+        else if (fread(input->chunk, input->header.inputDataChunkSize, 1, in) != 1)
+            problem = "Premature EOF reading";
+    }
+    // The stream is not needed any more on either the success or error path
+    fclose(in);
+    if (problem != NULL) {
+        fprintf(stderr, "%s file \"%s\"\n", problem, path);
+        if (input->chunk != NULL) lAlignedFree(input->chunk);
+        delete input;
+        return NULL;
+    }
+
+    // Update pointers: each array starts at its byte offset within the chunk
+#define CHUNK_ARRAY(type, which) \
+    ((type *)&input->chunk[input->header.inputDataArrayOffsets[which]])
+    input->arrays.zBuffer = CHUNK_ARRAY(float, idaZBuffer);
+    input->arrays.normalEncoded_x = CHUNK_ARRAY(uint16_t, idaNormalEncoded_x);
+    input->arrays.normalEncoded_y = CHUNK_ARRAY(uint16_t, idaNormalEncoded_y);
+    input->arrays.specularAmount = CHUNK_ARRAY(uint16_t, idaSpecularAmount);
+    input->arrays.specularPower = CHUNK_ARRAY(uint16_t, idaSpecularPower);
+    input->arrays.albedo_x = CHUNK_ARRAY(uint8_t, idaAlbedo_x);
+    input->arrays.albedo_y = CHUNK_ARRAY(uint8_t, idaAlbedo_y);
+    input->arrays.albedo_z = CHUNK_ARRAY(uint8_t, idaAlbedo_z);
+    input->arrays.lightPositionView_x = CHUNK_ARRAY(float, idaLightPositionView_x);
+    input->arrays.lightPositionView_y = CHUNK_ARRAY(float, idaLightPositionView_y);
+    input->arrays.lightPositionView_z = CHUNK_ARRAY(float, idaLightPositionView_z);
+    input->arrays.lightAttenuationBegin = CHUNK_ARRAY(float, idaLightAttenuationBegin);
+    input->arrays.lightColor_x = CHUNK_ARRAY(float, idaLightColor_x);
+    input->arrays.lightColor_y = CHUNK_ARRAY(float, idaLightColor_y);
+    input->arrays.lightColor_z = CHUNK_ARRAY(float, idaLightColor_z);
+    input->arrays.lightAttenuationEnd = CHUNK_ARRAY(float, idaLightAttenuationEnd);
+#undef CHUNK_ARRAY
+
+    return input;
+}
+
+// Frees input->chunk only.  NOTE(review): `input` itself is not deleted here; confirm the caller does.
+void DeleteInputData(InputData *input)
+{
+    lAlignedFree(input->chunk);
+}
+
+// Writes the framebuffer to `filename` as a binary PPM ("P6") image; prints
+// a message to stderr and writes nothing if the file cannot be opened.
+void WriteFrame(const char *filename, const InputData *input,
+                const Framebuffer &framebuffer) {
+    FILE *out = fopen(filename, "wb");
+    if (!out) {
+        fprintf(stderr, "Unable to open \"%s\" for writing\n", filename);
+        return;
+    }
+    // Deswizzle the planar r/g/b data into interleaved RGB (not fast; runs once)
+    size_t imageBytes = 3 * input->header.framebufferWidth * input->header.framebufferHeight;
+    uint8_t* framebufferAOS = (uint8_t *)lAlignedMalloc(imageBytes, ALIGNMENT_BYTES);
+    for (int i = 0; i < input->header.framebufferWidth *
+                        input->header.framebufferHeight; ++i) {
+        framebufferAOS[3 * i + 0] = framebuffer.r[i];
+        framebufferAOS[3 * i + 1] = framebuffer.g[i];
+        framebufferAOS[3 * i + 2] = framebuffer.b[i];
+    }
+    // Write out simple PPM file, then close it so the data is flushed to disk
+    fprintf(out, "P6 %d %d 255\n", input->header.framebufferWidth,
+            input->header.framebufferHeight);
+    fwrite(framebufferAOS, imageBytes, 1, out);
+    fclose(out);
+    lAlignedFree(framebufferAOS);
+}
diff --git a/examples/deferred/data/pp1280x720.bin b/examples/deferred/data/pp1280x720.bin
new file mode 100644
index 00000000..32a6a6af
Binary files /dev/null and b/examples/deferred/data/pp1280x720.bin differ
diff --git a/examples/deferred/data/pp1920x1200.bin b/examples/deferred/data/pp1920x1200.bin
new file mode 100644
index 00000000..1bf84c46
Binary files /dev/null and b/examples/deferred/data/pp1920x1200.bin differ
diff --git a/examples/deferred/deferred.h b/examples/deferred/deferred.h
new file mode 100644
index 00000000..5e814ca5
--- /dev/null
+++ b/examples/deferred/deferred.h
@@ -0,0 +1,108 @@
+/*
+  Copyright (c) 2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+#ifndef DEFERRED_H
+#define DEFERRED_H
+
+// Currently tile widths must be a multiple of SIMD width (i.e. 8 for ispc sse4x2)!
+#define MIN_TILE_WIDTH 16
+#define MIN_TILE_HEIGHT 16
+#define MAX_LIGHTS 1024
+// Indices into InputHeader::inputDataArrayOffsets, one for each input array.
+enum InputDataArraysEnum {
+    idaZBuffer = 0,
+    idaNormalEncoded_x,
+    idaNormalEncoded_y,
+    idaSpecularAmount,
+    idaSpecularPower,
+    idaAlbedo_x,
+    idaAlbedo_y,
+    idaAlbedo_z,
+    idaLightPositionView_x,
+    idaLightPositionView_y,
+    idaLightPositionView_z,
+    idaLightAttenuationBegin,
+    idaLightColor_x,
+    idaLightColor_y,
+    idaLightColor_z,
+    idaLightAttenuationEnd,
+    // Number of arrays listed above (not itself an array index).
+    idaNum
+};
+
+#ifndef ISPC
+
+#include <stdint.h>
+#include "kernels_ispc.h"
+
+#define ALIGNMENT_BYTES 64
+
+#define MAX_LIGHTS 1024
+
+#define VISUALIZE_LIGHT_COUNT 0
+// A loaded input file: its header, pointers to each array, and the backing chunk.
+struct InputData
+{
+    ispc::InputHeader header;      // read verbatim from the start of the file
+    ispc::InputDataArrays arrays;  // pointers into `chunk`, one per array
+    uint8_t *chunk;                // aligned buffer holding the file's array data
+};
+
+// Planar 8-bit framebuffer: r, g and b each hold one byte per pixel.
+struct Framebuffer {
+    Framebuffer(int width, int height);
+    ~Framebuffer();
+    void clear();
+
+    uint8_t *r, *g, *b;
+private:
+    int nPixels;
+    // Not copyable: copy operations are private (operator= must take a
+    // reference to replace the implicitly generated copy assignment).
+    Framebuffer(const Framebuffer &);
+    Framebuffer &operator=(const Framebuffer &);
+};
+
+
+InputData *CreateInputDataFromFile(const char *path);
+void DeleteInputData(InputData *input);
+void WriteFrame(const char *filename, const InputData *input,
+                const Framebuffer &framebuffer);
+void InitDynamicC(InputData *input);
+void InitDynamicCilk(InputData *input);
+void DispatchDynamicC(InputData *input, Framebuffer *framebuffer);
+void DispatchDynamicCilk(InputData *input, Framebuffer *framebuffer);
+
+#endif // !ISPC
+
+#endif // DEFERRED_H
diff --git a/examples/deferred/deferred_shading.vcxproj b/examples/deferred/deferred_shading.vcxproj
new file mode 100755
index 00000000..b87bec51
--- /dev/null
+++ b/examples/deferred/deferred_shading.vcxproj
@@ -0,0 +1,170 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{87f53c53-957e-4e91-878a-bc27828fb9eb}</ProjectGuid>
+    <Keyword>Win32Proj</Keyword>
+    <RootNamespace>mandelbrot</RootNamespace>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <LinkIncremental>true</LinkIncremental>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <LinkIncremental>true</LinkIncremental>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <LinkIncremental>false</LinkIncremental>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <LinkIncremental>false</LinkIncremental>
+  </PropertyGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <ClCompile>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <FloatingPointModel>Fast</FloatingPointModel>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <ClCompile>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <FloatingPointModel>Fast</FloatingPointModel>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <FloatingPointModel>Fast</FloatingPointModel>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <FloatingPointModel>Fast</FloatingPointModel>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <ClCompile Include="common.cpp" />
+    <ClCompile Include="dynamic_c.cpp" />
+    <ClCompile Include="dynamic_cilk.cpp" />
+    <ClCompile Include="main.cpp" />
+    <ClCompile Include="../tasksys.cpp" />
+  </ItemGroup>
+  <ItemGroup>
+    <CustomBuild Include="kernels.ispc">
+      <FileType>Document</FileType>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
+</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
+</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
+    </CustomBuild>
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+  </ImportGroup>
+</Project>
diff --git a/examples/deferred/dynamic_c.cpp b/examples/deferred/dynamic_c.cpp
new file mode 100644
index 00000000..27e9a839
--- /dev/null
+++ b/examples/deferred/dynamic_c.cpp
@@ -0,0 +1,871 @@
+/*
+  Copyright (c) 2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+#include "deferred.h"
+#include "kernels_ispc.h"
+#include <algorithm>
+#include <stdint.h>
+#include <assert.h>
+#include <math.h>
+
+#ifdef _MSC_VER
+#define ISPC_IS_WINDOWS
+#elif defined(__linux__)
+#define ISPC_IS_LINUX
+#elif defined(__APPLE__)
+#define ISPC_IS_APPLE
+#endif
+
+#ifdef ISPC_IS_LINUX
+#include <malloc.h>
+#endif // ISPC_IS_LINUX
+
+// Currently tile widths must be a multiple of SIMD width (i.e. 8 for ispc sse4x2)!
+#define MIN_TILE_WIDTH 16
+#define MIN_TILE_HEIGHT 16
+
+
+#define DYNAMIC_TREE_LEVELS 5
+// If this is set to 1 then the result will be identical to the static version
+#define DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE 1
+
+// Allocates `size` bytes aligned to `alignment` (which must be a power of
+// two); returns NULL on failure.  Release the block with lAlignedFree().
+static void *
+lAlignedMalloc(int64_t size, int32_t alignment) {
+#if defined(ISPC_IS_WINDOWS)
+    return _aligned_malloc(size, alignment);
+#elif defined(ISPC_IS_LINUX)
+    return memalign(alignment, size);
+#else
+    // Apple/other: over-allocate and stash the malloc() pointer below the block.
+    void *mem = malloc(size + (alignment-1) + sizeof(void*));
+    if (!mem) return 0;
+    char *amem = ((char*)mem) + sizeof(void*);
+    // Mask so the padding is 0, not `alignment`, when amem is already aligned.
+    amem += (alignment - (reinterpret_cast<uint64_t>(amem) & (alignment-1))) & (alignment-1);
+    ((void**)amem)[-1] = mem;
+    return amem;
+#endif
+}
+
+// Releases a block obtained from lAlignedMalloc(); NULL is ignored.
+static void
+lAlignedFree(void *ptr) {
+#if defined(ISPC_IS_WINDOWS)
+    _aligned_free(ptr);
+#elif defined(ISPC_IS_LINUX)
+    free(ptr);
+#else
+    if (ptr) free(((void**)ptr)[-1]);
+#endif
+}
+
+// Scans the tile [tileStartX, tileEndX) x [tileStartY, tileEndY) of the
+// depth buffer and returns, through minZ/maxZ, the smallest and largest
+// view-space depth found.  If no pixel qualifies, *minZ is cameraFar and
+// *maxZ is cameraNear.
+static void
+ComputeZBounds(int tileStartX, int tileEndX,
+               int tileStartY, int tileEndY,
+               // G-buffer data
+               float zBuffer[],
+               int gBufferWidth,
+               // Camera data
+               float cameraProj_33, float cameraProj_43,
+               float cameraNear, float cameraFar,
+               // Output
+               float *minZ, float *maxZ)
+{
+    // Start from an inverted range so that any valid sample tightens it.
+    float nearest = cameraFar, farthest = cameraNear;
+    for (int row = tileStartY; row < tileEndY; ++row) {
+        const float *depthRow = zBuffer + row * gBufferWidth;
+        for (int col = tileStartX; col < tileEndX; ++col) {
+            // Unproject the depth buffer value into view space.
+            float viewZ = cameraProj_43 / (depthRow[col] - cameraProj_33);
+            // Skip skybox/background or otherwise invalid pixels.
+            if (!(viewZ < cameraFar && viewZ >= cameraNear))
+                continue;
+            nearest = std::min(nearest, viewZ);
+            farthest = std::max(farthest, viewZ);
+        }
+    }
+    *minZ = nearest;
+    *maxZ = farthest;
+}
+
+// Computes the view-space Z bounds of every tile in tile row `tileY`,
+// writing one min and one max per tile (indexed by tile column) into
+// minZArray/maxZArray.  numTilesY is not used.
+static void
+ComputeZBoundsRow(int tileY, int tileWidth, int tileHeight,
+                  int numTilesX, int numTilesY,
+                  // G-buffer data
+                  float zBuffer[],
+                  int gBufferWidth,
+                  // Camera data
+                  float cameraProj_33, float cameraProj_43,
+                  float cameraNear, float cameraFar,
+                  // Output
+                  float minZArray[],
+                  float maxZArray[])
+{
+    const int startY = tileY * tileHeight;
+    const int endY = startY + tileHeight;
+    for (int tileX = 0; tileX < numTilesX; ++tileX) {
+        const int startX = tileX * tileWidth;
+        ComputeZBounds(startX, startX + tileWidth, startY, endY,
+                       zBuffer, gBufferWidth,
+                       cameraProj_33, cameraProj_43, cameraNear, cameraFar,
+                       &minZArray[tileX], &maxZArray[tileX]);
+    }
+}
+
+
+// Pyramid of per-tile min/max Z bounds.  Level 0 is filled from the Z buffer by
+// ComputeZBoundsRow(); each higher level merges up to 2x2 tiles of the level below it.
+class MinMaxZTree {
+public:
+    // Currently (min) tile dimensions must divide gBuffer dimensions evenly
+    // Levels must be small enough that neither dimension goes below one tile
+    MinMaxZTree(int tileWidth, int tileHeight, int levels,
+                int gBufferWidth, int gBufferHeight)
+        : mTileWidth(tileWidth), mTileHeight(tileHeight), mLevels(levels)
+    {
+        mNumTilesX = gBufferWidth / mTileWidth;
+        mNumTilesY = gBufferHeight / mTileHeight;
+        // Allocate the per-level pointer tables, then a min and a max array per level
+        mMinZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16);
+        mMaxZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16);
+        for (int i = 0; i < mLevels; ++i) {
+            int x = NumTilesX(i);
+            int y = NumTilesY(i);
+            assert(x > 0 && y > 0);
+            // NOTE: If the following two asserts fire it probably means that
+            // the base tile dimensions do not evenly divide the G-buffer dimensions
+            assert(x * (mTileWidth << i) >= gBufferWidth);
+            assert(y * (mTileHeight << i) >= gBufferHeight);
+            mMinZArrays[i] = (float *)lAlignedMalloc(sizeof(float) * x * y, 16);
+            mMaxZArrays[i] = (float *)lAlignedMalloc(sizeof(float) * x * y, 16);
+        }
+    }
+
+    // Recomputes every level from zBuffer: level 0 row by row, then each coarser level
+    void Update(float *zBuffer, int gBufferPitchInElements,
+                float cameraProj_33, float cameraProj_43, float cameraNear, float cameraFar)
+    {
+        for (int tileY = 0; tileY < mNumTilesY; ++tileY) {
+            ComputeZBoundsRow(tileY, mTileWidth, mTileHeight, mNumTilesX, mNumTilesY,
+                              zBuffer, gBufferPitchInElements,
+                              cameraProj_33, cameraProj_43, cameraNear, cameraFar,
+                              mMinZArrays[0] + (tileY * mNumTilesX),
+                              mMaxZArrays[0] + (tileY * mNumTilesX));
+        }
+
+        // Generate other levels
+        for (int level = 1; level < mLevels; ++level) {
+            int destTilesX = NumTilesX(level);
+            int destTilesY = NumTilesY(level);
+            int srcLevel = level - 1;
+            int srcTilesX = NumTilesX(srcLevel);
+            int srcTilesY = NumTilesY(srcLevel);
+            for (int y = 0; y < destTilesY; ++y) {
+                for (int x = 0; x < destTilesX; ++x) {
+                    int srcX = x << 1;
+                    int srcY = y << 1;
+                    // NOTE: Ugly branches to deal with non-multiple dimensions at some levels
+                    // TODO: SSE branchless min/max is probably better...
+                    float minZ = mMinZArrays[srcLevel][(srcY) * srcTilesX + (srcX)];
+                    float maxZ = mMaxZArrays[srcLevel][(srcY) * srcTilesX + (srcX)];
+                    if (srcX + 1 < srcTilesX) {
+                        minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY) * srcTilesX + 
+                                                                    (srcX + 1)]);
+                        maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY) * srcTilesX +
+                                                                    (srcX + 1)]);
+                        if (srcY + 1 < srcTilesY) {
+                            minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY + 1) * srcTilesX +
+                                                                        (srcX + 1)]);
+                            maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY + 1) * srcTilesX +
+                                                                        (srcX + 1)]);
+                        }
+                    }
+                    if (srcY + 1 < srcTilesY) {
+                        minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY + 1) * srcTilesX +
+                                                                    (srcX    )]);
+                        maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY + 1) * srcTilesX +
+                                                                    (srcX    )]);
+                    }
+                    mMinZArrays[level][y * destTilesX + x] = minZ;
+                    mMaxZArrays[level][y * destTilesX + x] = maxZ;
+                }
+            }
+        }
+    }
+
+    ~MinMaxZTree() {
+        for (int i = 0; i < mLevels; ++i) {
+            lAlignedFree(mMinZArrays[i]);
+            lAlignedFree(mMaxZArrays[i]);
+        }
+        lAlignedFree(mMinZArrays);
+        lAlignedFree(mMaxZArrays); 
+    }
+
+    int Levels() const { return mLevels; }
+
+    // Tile counts are ceil(base count / 2^level), so a level's last tile may be only partly full
+    int NumTilesX(int level = 0) const { return (mNumTilesX + (1 << level) - 1) >> level; }
+    int NumTilesY(int level = 0) const { return (mNumTilesY + (1 << level) - 1) >> level; }
+    int TileWidth(int level = 0) const { return (mTileWidth << level); }
+    int TileHeight(int level = 0) const { return (mTileHeight << level); }
+
+    float MinZ(int level, int tileX, int tileY) const {
+        return mMinZArrays[level][tileY * NumTilesX(level) + tileX];
+    }
+    float MaxZ(int level, int tileX, int tileY) const {
+        return mMaxZArrays[level][tileY * NumTilesX(level) + tileX];
+    }
+
+private:
+    // Not copyable: a copy would free the same arrays twice (declared, never defined)
+    MinMaxZTree(const MinMaxZTree &);
+    MinMaxZTree &operator=(const MinMaxZTree &);
+
+    int mTileWidth;
+    int mTileHeight;
+    int mLevels;
+    int mNumTilesX;
+    int mNumTilesY;
+    // One array for each "level" in the tree
+    float **mMinZArrays;
+    float **mMaxZArrays;
+};
+
+static MinMaxZTree *gMinMaxZTree = 0;  // owned by this file; (re)created by InitDynamicC()
+
+// Builds the global min/max Z tree for the input's framebuffer size, replacing any earlier tree.
+void InitDynamicC(InputData *input) {
+    delete gMinMaxZTree;  // no-op while null; frees the old tree when called again
+    gMinMaxZTree = new MinMaxZTree(MIN_TILE_WIDTH, MIN_TILE_HEIGHT, DYNAMIC_TREE_LEVELS,
+        input->header.framebufferWidth, input->header.framebufferHeight);
+}
+
+
+// Splits a tile's light list among its four subtiles (order 00, 10, 01, 11), which
+// meet at pixel (tileMidX, tileMidY).  A light is kept for a subtile when its
+// attenuation range overlaps the subtile's [min, max] Z interval and it is not
+// entirely on the other side of the X or Y splitting plane.
+//
+// lightIndices holds numLights indices into the light_* arrays.  Subtile s gets
+// its surviving indices at subtileIndices[s * subtileIndicesPitch ...] and its
+// count in subtileNumLights[s], so subtileIndicesPitch must be at least numLights
+// for the four output lists not to overlap.
+static void
+SplitTileMinMax(
+    int tileMidX, int tileMidY,
+    // Subtile data (00, 10, 01, 11)
+    float subtileMinZ[],
+    float subtileMaxZ[],
+    // G-buffer data
+    int gBufferWidth, int gBufferHeight,
+    // Camera data
+    float cameraProj_11, float cameraProj_22,
+    // Light Data
+    int lightIndices[],
+    int numLights,
+    float light_positionView_x_array[],
+    float light_positionView_y_array[],
+    float light_positionView_z_array[],
+    float light_attenuationEnd_array[],
+    // Outputs
+    int subtileIndices[],
+    int subtileIndicesPitch,
+    int subtileNumLights[]
+    )
+{
+    const float halfWidth = 0.5f * (float)gBufferWidth;
+    const float halfHeight = 0.5f * (float)gBufferHeight;
+
+    // Splitting planes: entry 0 separates the subtiles in X, entry 1 in Y.  Each
+    // plane is a 2D normal made of a lateral (x or y) part and a depth (z) part.
+    float planeLateral[2] = { -(cameraProj_11 * halfWidth),
+                               (cameraProj_22 * halfHeight) };
+    float planeDepth[2] = { tileMidX - halfWidth,
+                            tileMidY - halfHeight };
+    for (int p = 0; p < 2; ++p) {
+        // Scale the normal to unit length
+        const float invLength = 1.f / sqrtf(planeLateral[p] * planeLateral[p] +
+                                            planeDepth[p] * planeDepth[p]);
+        planeLateral[p] *= invLength;
+        planeDepth[p] *= invLength;
+    }
+
+    // Next free slot in subtileIndices for each subtile
+    int writePos[4];
+    for (int s = 0; s < 4; ++s)
+        writePos[s] = s * subtileIndicesPitch;
+
+    for (int i = 0; i < numLights; ++i) {
+        const int lightIndex = lightIndices[i];
+        const float lightX = light_positionView_x_array[lightIndex];
+        const float lightY = light_positionView_y_array[lightIndex];
+        const float lightZ = light_positionView_z_array[lightIndex];
+        const float reach = light_attenuationEnd_array[lightIndex];
+        const float negReach = -reach;
+
+        // Test the light against each subtile's Z bounds
+        bool keep[4];
+        for (int s = 0; s < 4; ++s) {
+            keep[s] = (lightZ - subtileMinZ[s] >= negReach) &&
+                (subtileMaxZ[s] - lightZ >= negReach);
+        }
+
+        // Signed distances from the light to the X and Y splitting planes
+        const float dx = lightZ * planeDepth[0] + lightX * planeLateral[0];
+        const float dy = lightZ * planeDepth[1] + lightY * planeLateral[1];
+
+        // Farther from the X plane than its reach: only the subtiles on the
+        // light's side survive (00/01 when dx > 0, otherwise 10/11)
+        if (fabsf(dx) > reach) {
+            const bool positiveX = dx > 0.0f;
+            for (int s = 0; s < 4; ++s) {
+                const bool wantsPositive = ((s & 1) == 0);   // subtiles 00 and 01
+                keep[s] = keep[s] && (positiveX == wantsPositive);
+            }
+        }
+
+        // Same for the Y plane (00/10 when dy > 0, otherwise 01/11)
+        if (fabsf(dy) > reach) {
+            const bool positiveY = dy > 0.0f;
+            for (int s = 0; s < 4; ++s) {
+                const bool wantsPositive = (s < 2);          // subtiles 00 and 10
+                keep[s] = keep[s] && (positiveY == wantsPositive);
+            }
+        }
+
+        // Append the light to every subtile that kept it
+        for (int s = 0; s < 4; ++s) {
+            if (keep[s])
+                subtileIndices[writePos[s]++] = lightIndex;
+        }
+    }
+
+    // Convert the final write positions back into per-subtile counts
+    for (int s = 0; s < 4; ++s)
+        subtileNumLights[s] = writePos[s] - s * subtileIndicesPitch;
+}
+
+
+// Returns the dot product of the 3-vectors (x, y, z) and (a, b, c).
+static inline float dot3(float x, float y, float z, float a, float b, float c) {
+    return x * a + y * b + z * c;
+}
+
+
+// Writes (x, y, z) scaled to unit length into (ox, oy, oz); a zero vector is not guarded.
+static inline void normalize3(float x, float y, float z, float &ox, float &oy, float &oz) {
+    const float invLength = 1.f / sqrtf(x*x + y*y + z*z);
+    ox = invLength * x;
+    oy = invLength * y;
+    oz = invLength * z;
+}
+
+
+// Maps an 8-bit unsigned-normalized value (0..255) to a float in roughly [0, 1].
+static inline float Unorm8ToFloat32(uint8_t u) {
+    return (1.0f / 255.0f) * (float)u;
+}
+
+
+// Scales f by 255 and truncates toward zero; expects f already clamped to [0, 1].
+static inline uint8_t Float32ToUnorm8(float f) {
+    return (uint8_t)(255.0f * f);
+}
+
+
+// Expands an IEEE half-precision bit pattern to a 32-bit float.  "Fast" because
+// the exponent is only rebiased: zero, denormal, infinity and NaN encodings get
+// no special handling.
+static inline float half_to_float_fast(uint16_t h) {
+    const uint32_t signBit  = h & 0x8000u;   // bit 15
+    const uint32_t expBits  = h & 0x7C00u;   // bits 10-14
+    const uint32_t mantBits = h & 0x03FFu;   // bits 0-9
+
+    // Exponent: remove the half bias (15), then apply the single bias (127)
+    const int32_t rebiased = (int32_t)(expBits >> 10) - 15 + 127;
+
+    // Shift each field into its place in the 32-bit layout and combine them
+    uint32_t bits = (signBit << 16) | (uint32_t)(rebiased << 23) | (mantBits << 13);
+
+    // Reinterpret the assembled bit pattern as a float
+    float *fp = reinterpret_cast<float *>(&bits);
+    return *fp;
+}
+
+// Shades pixels [tileStartX, tileEndX) x [tileStartY, tileEndY) with the tile's light list into 8-bit RGB.
+static void
+ShadeTileC(
+    int32_t tileStartX, int32_t tileEndX,
+    int32_t tileStartY, int32_t tileEndY,
+    int32_t gBufferWidth, int32_t gBufferHeight,
+    const ispc::InputDataArrays &inputData,
+    // Camera data
+    float cameraProj_11, float cameraProj_22,
+    float cameraProj_33, float cameraProj_43,
+    // Light list
+    int32_t tileLightIndices[],
+    int32_t tileNumLights,
+    // UI
+    bool visualizeLightCount,
+    // Output
+    uint8_t framebuffer_r[],
+    uint8_t framebuffer_g[],
+    uint8_t framebuffer_b[]
+    )
+{
+    if (tileNumLights == 0 || visualizeLightCount) {
+        uint8_t c = (uint8_t)(std::min(tileNumLights << 2, 255));  // gray = 4 * light count, capped at 255
+        for (int32_t y = tileStartY; y < tileEndY; ++y) {
+            for (int32_t x = tileStartX; x < tileEndX; ++x) {
+                int32_t framebufferIndex = (y * gBufferWidth + x);
+                framebuffer_r[framebufferIndex] = c;
+                framebuffer_g[framebufferIndex] = c;
+                framebuffer_b[framebufferIndex] = c;
+            }
+        }
+    } else {
+        float twoOverGBufferWidth = 2.0f / gBufferWidth;
+        float twoOverGBufferHeight = 2.0f / gBufferHeight;
+        
+        for (int32_t y = tileStartY; y < tileEndY; ++y) {
+            float positionScreen_y = -(((0.5f + y) * twoOverGBufferHeight) - 1.f);  // Y flipped: row 0 maps near +1
+
+            for (int32_t x = tileStartX; x < tileEndX; ++x) {
+                int32_t gBufferOffset = y * gBufferWidth + x;
+                
+                // Reconstruct position and (negative) view vector from G-buffer
+                float surface_positionView_x, surface_positionView_y, surface_positionView_z;
+                float Vneg_x, Vneg_y, Vneg_z;
+
+                float z = inputData.zBuffer[gBufferOffset];
+
+                // Compute screen/clip-space position
+                // NOTE: Mind DX11 viewport transform and pixel center!
+                float positionScreen_x = (0.5f + (float)(x)) * 
+                    twoOverGBufferWidth - 1.0f;
+
+                // Unproject depth buffer Z value into view space
+                surface_positionView_z = cameraProj_43 / (z - cameraProj_33);
+                surface_positionView_x = positionScreen_x * surface_positionView_z / 
+                    cameraProj_11;
+                surface_positionView_y = positionScreen_y * surface_positionView_z / 
+                    cameraProj_22;
+                
+                // We actually end up with a vector pointing *at* the
+                // surface (i.e. the negative view vector)
+                normalize3(surface_positionView_x, surface_positionView_y, 
+                           surface_positionView_z, Vneg_x, Vneg_y, Vneg_z);
+
+                // Reconstruct normal from G-buffer
+                float surface_normal_x, surface_normal_y, surface_normal_z;
+                float normal_x = half_to_float_fast(inputData.normalEncoded_x[gBufferOffset]);
+                float normal_y = half_to_float_fast(inputData.normalEncoded_y[gBufferOffset]);
+                    
+                float f = (normal_x - normal_x * normal_x) + (normal_y - normal_y * normal_y);
+                float m = sqrtf(4.0f * f - 1.0f);  // NOTE(review): NaN if 4*f < 1; presumably the encoding rules that out
+                    
+                surface_normal_x = m * (4.0f * normal_x - 2.0f);
+                surface_normal_y = m * (4.0f * normal_y - 2.0f);
+                surface_normal_z = 3.0f - 8.0f * f;
+
+                // Load other G-buffer parameters
+                float surface_specularAmount = 
+                    half_to_float_fast(inputData.specularAmount[gBufferOffset]);
+                float surface_specularPower  = 
+                    half_to_float_fast(inputData.specularPower[gBufferOffset]);
+                float surface_albedo_x = Unorm8ToFloat32(inputData.albedo_x[gBufferOffset]);
+                float surface_albedo_y = Unorm8ToFloat32(inputData.albedo_y[gBufferOffset]);
+                float surface_albedo_z = Unorm8ToFloat32(inputData.albedo_z[gBufferOffset]);
+                
+                // Accumulated lit color for this pixel, summed over the tile's lights
+                float lit_x = 0.0f;
+                float lit_y = 0.0f;
+                float lit_z = 0.0f;
+                for (int32_t tileLightIndex = 0; tileLightIndex < tileNumLights; 
+                     ++tileLightIndex) {
+                    int32_t lightIndex = tileLightIndices[tileLightIndex];
+                                        
+                    // Gather light data relevant to initial culling
+                    float light_positionView_x = 
+                        inputData.lightPositionView_x[lightIndex];
+                    float light_positionView_y = 
+                        inputData.lightPositionView_y[lightIndex];
+                    float light_positionView_z = 
+                        inputData.lightPositionView_z[lightIndex];
+                    float light_attenuationEnd = 
+                        inputData.lightAttenuationEnd[lightIndex];
+                    
+                    // Compute light vector
+                    float L_x = light_positionView_x - surface_positionView_x;
+                    float L_y = light_positionView_y - surface_positionView_y;
+                    float L_z = light_positionView_z - surface_positionView_z;
+
+                    float distanceToLight2 = dot3(L_x, L_y, L_z, L_x, L_y, L_z);
+                    
+                    // Clip at end of attenuation
+                    float light_attenutaionEnd2 = light_attenuationEnd * light_attenuationEnd;
+
+                    if (distanceToLight2 < light_attenutaionEnd2) {                    
+                        float distanceToLight = sqrtf(distanceToLight2);
+
+                        float distanceToLightRcp = 1.f / distanceToLight;
+                        L_x *= distanceToLightRcp;
+                        L_y *= distanceToLightRcp;
+                        L_z *= distanceToLightRcp;
+
+                        // Start computing brdf
+                        float NdotL = dot3(surface_normal_x, surface_normal_y, 
+                                           surface_normal_z, L_x, L_y, L_z);
+                    
+                        // Clip back facing
+                        if (NdotL > 0.0f) {
+                            float light_attenuationBegin = 
+                                inputData.lightAttenuationBegin[lightIndex];
+
+                            // Light distance attenuation (linstep)
+                            // NOTE(review): lightRange is 0 if attenuation begin == end; the division is unguarded
+                            float lightRange = (light_attenuationEnd - light_attenuationBegin);
+                            float falloffPosition = (light_attenuationEnd - distanceToLight);
+                            float attenuation = std::min(falloffPosition / lightRange, 1.0f);
+
+                            // Half vector: L + V, where V = -Vneg
+                            float H_x = (L_x - Vneg_x);
+                            float H_y = (L_y - Vneg_y);
+                            float H_z = (L_z - Vneg_z);
+                            normalize3(H_x, H_y, H_z, H_x, H_y, H_z);
+                    
+                            float NdotH = dot3(surface_normal_x, surface_normal_y, 
+                                               surface_normal_z, H_x, H_y, H_z);
+                            NdotH = std::max(NdotH, 0.0f);
+
+                            float specular = powf(NdotH, surface_specularPower);
+                            float specularNorm = (surface_specularPower + 2.0f) * 
+                                (1.0f / 8.0f);
+                            float specularContrib = surface_specularAmount * 
+                                specularNorm * specular;
+
+                            float k = attenuation * NdotL * (1.0f + specularContrib);
+                    
+                            float light_color_x = inputData.lightColor_x[lightIndex];
+                            float light_color_y = inputData.lightColor_y[lightIndex];
+                            float light_color_z = inputData.lightColor_z[lightIndex];
+
+                            float lightContrib_x = surface_albedo_x * light_color_x;
+                            float lightContrib_y = surface_albedo_y * light_color_y;
+                            float lightContrib_z = surface_albedo_z * light_color_z;
+
+                            lit_x += lightContrib_x * k;
+                            lit_y += lightContrib_y * k;
+                            lit_z += lightContrib_z * k;
+                        }
+                    }
+                }
+
+                // Gamma correct (clamp to [0, 1] first, then raise to 1/2.2)
+                float gamma = 1.0 / 2.2f;  // 1.0 is a double literal; the quotient is narrowed to float
+                lit_x = powf(std::min(std::max(lit_x, 0.0f), 1.0f), gamma);
+                lit_y = powf(std::min(std::max(lit_y, 0.0f), 1.0f), gamma);
+                lit_z = powf(std::min(std::max(lit_z, 0.0f), 1.0f), gamma);
+                
+                framebuffer_r[gBufferOffset] = Float32ToUnorm8(lit_x);
+                framebuffer_g[gBufferOffset] = Float32ToUnorm8(lit_y);
+                framebuffer_b[gBufferOffset] = Float32ToUnorm8(lit_z);
+            }
+        }
+    }
+}
+
+// Shades tile (tileX, tileY) of tree level `level`, splitting it into four subtiles while its light list is long.
+void
+ShadeDynamicTileRecurse(InputData *input, int level, int tileX, int tileY, 
+                        int *lightIndices, int numLights, 
+                        Framebuffer *framebuffer) {
+    const MinMaxZTree *minMaxZTree = gMinMaxZTree;
+    
+    // If we have few enough lights or this is the base case (last level), shade
+    // this full tile directly
+    if (level == 0 || numLights < DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE) {
+        int width = minMaxZTree->TileWidth(level);
+        int height = minMaxZTree->TileHeight(level);
+        int startX = tileX * width;
+        int startY = tileY * height;
+        int endX = std::min(input->header.framebufferWidth, startX + width);   // clip to the framebuffer
+        int endY = std::min(input->header.framebufferHeight, startY + height);
+        
+        // Skip entirely offscreen tiles
+        if (endX > startX && endY > startY) {
+            ShadeTileC(startX, endX, startY, endY,
+                       input->header.framebufferWidth, input->header.framebufferHeight,
+                       input->arrays,
+                       input->header.cameraProj[0][0], input->header.cameraProj[1][1], 
+                       input->header.cameraProj[2][2], input->header.cameraProj[3][2],
+                       lightIndices, numLights, VISUALIZE_LIGHT_COUNT, 
+                       framebuffer->r, framebuffer->g, framebuffer->b);
+        }
+    } 
+    else {
+        // Otherwise, subdivide and 4-way recurse using X and Y splitting planes
+        // Move down a level in the tree
+        --level;
+        tileX <<= 1;   // coordinates of the 00 subtile at the finer level
+        tileY <<= 1;
+        int width = minMaxZTree->TileWidth(level);
+        int height = minMaxZTree->TileHeight(level);
+
+        // Work out splitting coords
+        int midX = (tileX + 1) * width;    // pixel column where the 00/10 subtiles meet
+        int midY = (tileY + 1) * height;   // pixel row where the 00/01 subtiles meet
+
+        // Read subtile min/max data
+        // NOTE: We must be sure to handle out-of-bounds access here since
+        // sometimes we'll only have 1 or 2 subtiles for non-pow-2
+        // framebuffer sizes.
+        bool rightTileExists = (tileX + 1 < minMaxZTree->NumTilesX(level));
+        bool bottomTileExists = (tileY + 1 < minMaxZTree->NumTilesY(level));
+
+        // NOTE: Order is 00, 10, 01, 11
+        // Set defaults up to cull all lights if the tile doesn't exist (offscreen)
+        float minZ[4] = {input->header.cameraFar, input->header.cameraFar, 
+                         input->header.cameraFar, input->header.cameraFar};
+        float maxZ[4] = {input->header.cameraNear, input->header.cameraNear, 
+                         input->header.cameraNear, input->header.cameraNear};
+
+        minZ[0] = minMaxZTree->MinZ(level, tileX, tileY);   // the 00 subtile is read unconditionally
+        maxZ[0] = minMaxZTree->MaxZ(level, tileX, tileY);
+        if (rightTileExists) {
+            minZ[1] = minMaxZTree->MinZ(level, tileX + 1, tileY);
+            maxZ[1] = minMaxZTree->MaxZ(level, tileX + 1, tileY);
+            if (bottomTileExists) {
+                minZ[3] = minMaxZTree->MinZ(level, tileX + 1, tileY + 1);
+                maxZ[3] = minMaxZTree->MaxZ(level, tileX + 1, tileY + 1);
+            }
+        }
+        if (bottomTileExists) {
+            minZ[2] = minMaxZTree->MinZ(level, tileX, tileY + 1);
+            maxZ[2] = minMaxZTree->MaxZ(level, tileX, tileY + 1);
+        }
+
+        // Cull lights into subtile lists
+#ifdef ISPC_IS_WINDOWS
+        __declspec(align(ALIGNMENT_BYTES)) 
+#endif
+            int subtileLightIndices[4][MAX_LIGHTS]   // one list per subtile, on the stack of each recursion level
+#ifndef ISPC_IS_WINDOWS
+            __attribute__ ((aligned(ALIGNMENT_BYTES)))
+#endif
+;
+        int subtileNumLights[4];
+        SplitTileMinMax(midX, midY, minZ, maxZ,
+            input->header.framebufferWidth, input->header.framebufferHeight, 
+            input->header.cameraProj[0][0], input->header.cameraProj[1][1],
+            lightIndices, numLights, input->arrays.lightPositionView_x, 
+            input->arrays.lightPositionView_y, input->arrays.lightPositionView_z, 
+            input->arrays.lightAttenuationEnd,
+            subtileLightIndices[0], MAX_LIGHTS, subtileNumLights);   // pitch: MAX_LIGHTS ints per subtile
+        
+        // Recurse into subtiles
+        ShadeDynamicTileRecurse(input, level, tileX    , tileY, 
+                                subtileLightIndices[0], subtileNumLights[0],
+                                framebuffer);
+        ShadeDynamicTileRecurse(input, level, tileX + 1, tileY,
+                                subtileLightIndices[1], subtileNumLights[1],
+                                framebuffer);
+        ShadeDynamicTileRecurse(input, level, tileX    , tileY + 1,
+                                subtileLightIndices[2], subtileNumLights[2],
+                                framebuffer);
+        ShadeDynamicTileRecurse(input, level, tileX + 1, tileY + 1,
+                                subtileLightIndices[3], subtileNumLights[3],
+                                framebuffer);
+    }
+}
+
+
+// Culls the first numLights lights against one tile and returns how many survive.
+// A light survives when its attenuation range overlaps the [minZ, maxZ] depth
+// interval and reaches inside all four side planes built from the tile's pixel
+// bounds.  Surviving light indices are written to tileLightIndices in ascending
+// order.
+//
+//   tileStartX/tileEndX, tileStartY/tileEndY  tile bounds in pixels
+//   minZ, maxZ                                depth interval to test against
+//   gBufferWidth, gBufferHeight               G-buffer size in pixels
+//   cameraProj_11, cameraProj_22              projection terms used for the side planes
+//   numLights                                 number of lights to test (indices 0..numLights-1)
+//   light_*_array                             per-light position and attenuation end
+//   tileLightIndices                          output; needs room for numLights entries
+//
+// Returns the number of indices written.
+static int
+IntersectLightsWithTileMinMax(
+    int tileStartX, int tileEndX,
+    int tileStartY, int tileEndY,
+    // Tile data
+    float minZ,
+    float maxZ,
+    // G-buffer data
+    int gBufferWidth, int gBufferHeight,
+    // Camera data
+    float cameraProj_11, float cameraProj_22,
+    // Light Data
+    int numLights,
+    float light_positionView_x_array[],
+    float light_positionView_y_array[],
+    float light_positionView_z_array[],
+    float light_attenuationEnd_array[],
+    // Output
+    int tileLightIndices[]
+    )
+{
+    const float halfWidth = 0.5f * (float)gBufferWidth;
+    const float halfHeight = 0.5f * (float)gBufferHeight;
+
+    // Side planes 0-1 come from the tile's X extent and planes 2-3 from its Y
+    // extent.  Each is a 2D normal: a lateral (x or y) part and a depth (z) part.
+    // The lateral parts do not depend on the tile at all.
+    float planeLateral[4] = { -(cameraProj_11 * halfWidth),
+                               (cameraProj_11 * halfWidth),
+                               (cameraProj_22 * halfHeight),
+                              -(cameraProj_22 * halfHeight) };
+    float planeDepth[4] = {  tileEndX - halfWidth,
+                            -tileStartX + halfWidth,
+                             tileEndY - halfHeight,
+                            -tileStartY + halfHeight };
+
+    for (int p = 0; p < 4; ++p) {
+        // Scale the normal to unit length
+        const float invLength = 1.f / sqrtf(planeLateral[p] * planeLateral[p] +
+                                            planeDepth[p] * planeDepth[p]);
+        planeLateral[p] *= invLength;
+        planeDepth[p] *= invLength;
+    }
+
+    // Number of surviving lights written so far
+    int tileNumLights = 0;
+
+    for (int lightIndex = 0; lightIndex < numLights; ++lightIndex) {
+        const float lightZ = light_positionView_z_array[lightIndex];
+        const float negReach = -light_attenuationEnd_array[lightIndex];
+
+        // Depth test first, so lights outside [minZ, maxZ] never load x and y
+        const bool overlapsDepth = (lightZ - minZ >= negReach) &&
+            (maxZ - lightZ >= negReach);
+        if (!overlapsDepth)
+            continue;
+
+        const float lightX = light_positionView_x_array[lightIndex];
+        const float lightY = light_positionView_y_array[lightIndex];
+
+        // The light must be no farther than its reach outside each side plane
+        bool inFrustum = true;
+        for (int p = 0; p < 4; ++p) {
+            const float lateral = (p < 2) ? lightX : lightY;
+            const float d = lightZ * planeDepth[p] + lateral * planeLateral[p];
+            inFrustum = inFrustum && (d >= negReach);
+        }
+
+        // Pack and store intersecting lights
+        if (inFrustum)
+            tileLightIndices[tileNumLights++] = lightIndex;
+    }
+
+    return tileNumLights;
+}
+
+// Shades one root tile: culls the lights against the tile's bounds, then hands the list to the recursion.
+void
+ShadeDynamicTile(InputData *input, int level, int tileX, int tileY,
+                 Framebuffer *framebuffer) {
+    const MinMaxZTree *minMaxZTree = gMinMaxZTree;
+
+    // Get Z min/max for this tile
+    int width = minMaxZTree->TileWidth(level);
+    int height = minMaxZTree->TileHeight(level);
+    float minZ = minMaxZTree->MinZ(level, tileX, tileY);
+    float maxZ = minMaxZTree->MaxZ(level, tileX, tileY);
+
+    // Pixel bounds of the tile, clipped to the framebuffer
+    int startX = tileX * width;
+    int startY = tileY * height;
+    int endX = std::min(input->header.framebufferWidth, startX + width);
+    int endY = std::min(input->header.framebufferHeight, startY + height);
+
+    // This is a root tile, so first do a full 6-plane cull
+#ifdef ISPC_IS_WINDOWS
+    __declspec(align(ALIGNMENT_BYTES)) 
+#endif
+        int lightIndices[MAX_LIGHTS]   // receives the indices of the lights that survive the cull
+#ifndef ISPC_IS_WINDOWS
+        __attribute__ ((aligned(ALIGNMENT_BYTES)))
+#endif
+;
+    int numLights = IntersectLightsWithTileMinMax(
+        startX, endX, startY, endY,    minZ, maxZ,
+        input->header.framebufferWidth, input->header.framebufferHeight,
+        input->header.cameraProj[0][0], input->header.cameraProj[1][1],
+        MAX_LIGHTS, input->arrays.lightPositionView_x,   // NOTE(review): tests MAX_LIGHTS lights, not a scene count - confirm
+        input->arrays.lightPositionView_y, input->arrays.lightPositionView_z, 
+        input->arrays.lightAttenuationEnd, lightIndices);
+
+    // Now kick off the recursive process for this tile
+    ShadeDynamicTileRecurse(input, level, tileX, tileY, lightIndices, 
+                            numLights, framebuffer);
+}
+
+
+// Rebuilds the global min/max Z tree from the input Z buffer, then shades every
+// root-level tile in row-major order.  InitDynamicC() must have been called first.
+void DispatchDynamicC(InputData *input, Framebuffer *framebuffer)
+{
+    MinMaxZTree *tree = gMinMaxZTree;
+
+    // Refresh the per-tile depth bounds for this frame
+    tree->Update(input->arrays.zBuffer, input->header.framebufferWidth,
+                 input->header.cameraProj[2][2], input->header.cameraProj[3][2],
+                 input->header.cameraNear, input->header.cameraFar);
+
+    // The coarsest level of the tree holds the root tiles
+    const int rootLevel = tree->Levels() - 1;
+    const int rootTilesX = tree->NumTilesX(rootLevel);
+    const int rootTilesY = tree->NumTilesY(rootLevel);
+    for (int tileY = 0; tileY < rootTilesY; ++tileY) {
+        for (int tileX = 0; tileX < rootTilesX; ++tileX)
+            ShadeDynamicTile(input, rootLevel, tileX, tileY, framebuffer);
+    }
+}
diff --git a/examples/deferred/dynamic_cilk.cpp b/examples/deferred/dynamic_cilk.cpp
new file mode 100644
index 00000000..2bcfced6
--- /dev/null
+++ b/examples/deferred/dynamic_cilk.cpp
@@ -0,0 +1,398 @@
+/*
+  Copyright (c) 2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+#ifdef __cilkplusplus
+
+#include "deferred.h"
+#include "kernels_ispc.h"
+#include <algorithm>
+#include <assert.h>
+
+#ifdef _MSC_VER
+#define ISPC_IS_WINDOWS
+#elif defined(__linux__)
+#define ISPC_IS_LINUX
+#elif defined(__APPLE__)
+#define ISPC_IS_APPLE
+#endif
+
+#ifdef ISPC_IS_LINUX
+#include <malloc.h>
+#endif // ISPC_IS_LINUX
+
+// Currently tile widths must be a multiple of SIMD width (i.e. 8 for ispc sse4x2)!
+#define MIN_TILE_WIDTH 16
+#define MIN_TILE_HEIGHT 16
+
+// Number of levels in the min/max Z tree; root tiles are taken from level DYNAMIC_TREE_LEVELS - 1
+#define DYNAMIC_TREE_LEVELS 5
+// If this is set to 1 then the result will be identical to the static version
+#define DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE 1
+
+// Returns size bytes aligned to alignment (a power of two) or NULL on failure; release with lAlignedFree().
+static void *
+lAlignedMalloc(int64_t size, int32_t alignment) {
+#if defined(ISPC_IS_WINDOWS)
+    return _aligned_malloc(size, alignment);
+#elif defined(ISPC_IS_LINUX)
+    return memalign(alignment, size);
+#else
+    char *mem = (char *)malloc(size + (alignment - 1) + sizeof(void *));
+    if (mem == NULL) return NULL;
+    char *amem = mem + sizeof(void *);
+    // Pad up to the next boundary; the outer mask makes the pad 0 (not alignment) when already aligned
+    amem += (alignment - (reinterpret_cast<uint64_t>(amem) & (alignment - 1))) & (alignment - 1);
+    ((void **)amem)[-1] = mem;  // lAlignedFree() reads the raw pointer back from here
+    return amem;
+#endif
+}
+
+
+// Releases a block obtained from lAlignedMalloc(); a NULL pointer is ignored.
+static void
+lAlignedFree(void *ptr) {
+#if defined(ISPC_IS_WINDOWS)
+    _aligned_free(ptr);
+#elif defined(ISPC_IS_LINUX)
+    free(ptr);
+#else
+    // Portable path: the raw malloc() pointer is stored just below the block
+    if (ptr != NULL) free(((void**)ptr)[-1]);
+#endif
+}
+
+
+// Pyramid of per-tile min/max Z values.  Level 0 holds one min/max pair per
+// base tile; each higher level merges up to 2x2 tiles of the level below it.
+class MinMaxZTreeCilk
+{
+public:
+    // Currently (min) tile dimensions must divide gBuffer dimensions evenly
+    // Levels must be small enough that neither dimension goes below one tile
+    MinMaxZTreeCilk(
+        int tileWidth, int tileHeight, int levels,
+        int gBufferWidth, int gBufferHeight)
+        : mTileWidth(tileWidth), mTileHeight(tileHeight), mLevels(levels)
+    {
+        mNumTilesX = gBufferWidth / mTileWidth;
+        mNumTilesY = gBufferHeight / mTileHeight;
+        
+        // Allocate the per-level pointer tables, then one min and one max array per level
+        mMinZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16);
+        mMaxZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16);
+        for (int i = 0; i < mLevels; ++i) {
+            int x = NumTilesX(i);
+            int y = NumTilesY(i);
+            assert(x > 0);
+            assert(y > 0);
+            // NOTE: If the following two asserts fire it probably means that
+            // the base tile dimensions do not evenly divide the G-buffer dimensions
+            assert(x * (mTileWidth << i) >= gBufferWidth);
+            assert(y * (mTileHeight << i) >= gBufferHeight);
+            mMinZArrays[i] = (float *)lAlignedMalloc(sizeof(float) * x * y, 16);
+            mMaxZArrays[i] = (float *)lAlignedMalloc(sizeof(float) * x * y, 16);
+        }
+    }
+
+    // Recomputes every level from zBuffer: level 0 through the ispc kernel (one row
+    // of tiles per Cilk iteration), then each higher level from the level below it.
+    void Update(float *zBuffer, int gBufferPitchInElements,
+        float cameraProj_33, float cameraProj_43,
+        float cameraNear, float cameraFar)
+    {
+        // Compute level 0 in parallel. The outer loop is here since we use Cilk
+        _Cilk_for (int tileY = 0; tileY < mNumTilesY; ++tileY) {
+            ispc::ComputeZBoundsRow(tileY,
+                mTileWidth, mTileHeight, mNumTilesX, mNumTilesY,
+                zBuffer, gBufferPitchInElements,
+                cameraProj_33, cameraProj_43, cameraNear, cameraFar,
+                mMinZArrays[0] + (tileY * mNumTilesX),
+                mMaxZArrays[0] + (tileY * mNumTilesX));
+        }
+
+        // Generate other levels
+        // NOTE: We currently don't use ispc here since it's sort of an
+        // awkward gather-based reduction.  Using SSE odd pack/unpack
+        // instructions might actually work here when we need to optimize
+        for (int level = 1; level < mLevels; ++level) {
+            int destTilesX = NumTilesX(level);
+            int destTilesY = NumTilesY(level);
+            int srcLevel = level - 1;
+            int srcTilesX = NumTilesX(srcLevel);
+            int srcTilesY = NumTilesY(srcLevel);
+            const float *srcMin = mMinZArrays[srcLevel];
+            const float *srcMax = mMaxZArrays[srcLevel];
+            _Cilk_for (int y = 0; y < destTilesY; ++y) {
+                for (int x = 0; x < destTilesX; ++x) {
+                    int srcX = x << 1;
+                    int srcY = y << 1;
+                    int src00 = srcY * srcTilesX + srcX;  // top-left tile of the 2x2 source block
+                    // NOTE: Ugly branches to deal with non-multiple dimensions at some levels
+                    // TODO: SSE branchless min/max is probably better...
+                    float minZ = srcMin[src00];
+                    float maxZ = srcMax[src00];
+                    if (srcX + 1 < srcTilesX) {
+                        minZ = std::min(minZ, srcMin[src00 + 1]);
+                        maxZ = std::max(maxZ, srcMax[src00 + 1]);
+                        if (srcY + 1 < srcTilesY) {
+                            minZ = std::min(minZ, srcMin[src00 + srcTilesX + 1]);
+                            maxZ = std::max(maxZ, srcMax[src00 + srcTilesX + 1]);
+                        }
+                    }
+                    if (srcY + 1 < srcTilesY) {
+                        minZ = std::min(minZ, srcMin[src00 + srcTilesX]);
+                        maxZ = std::max(maxZ, srcMax[src00 + srcTilesX]);
+                    }
+                    mMinZArrays[level][y * destTilesX + x] = minZ;
+                    mMaxZArrays[level][y * destTilesX + x] = maxZ;
+                }
+            }
+        }
+    }
+
+    ~MinMaxZTreeCilk() {
+        for (int i = 0; i < mLevels; ++i) {
+            lAlignedFree(mMinZArrays[i]);
+            lAlignedFree(mMaxZArrays[i]);
+        }
+        lAlignedFree(mMinZArrays);
+        lAlignedFree(mMaxZArrays); 
+    }
+
+    int Levels() const { return mLevels; }
+
+    // These round UP, so beware that the last tile for a given level may not be completely full
+    // TODO: Verify this...
+    int NumTilesX(int level = 0) const { return (mNumTilesX + (1 << level) - 1) >> level; }
+    int NumTilesY(int level = 0) const { return (mNumTilesY + (1 << level) - 1) >> level; }
+    int TileWidth(int level = 0) const { return (mTileWidth << level); }
+    int TileHeight(int level = 0) const { return (mTileHeight << level); }
+
+    // Bounds of one tile; the tiles of a level are stored row-major, NumTilesX(level) per row
+    float MinZ(int level, int tileX, int tileY) const { return mMinZArrays[level][tileY * NumTilesX(level) + tileX]; }
+    float MaxZ(int level, int tileX, int tileY) const { return mMaxZArrays[level][tileY * NumTilesX(level) + tileX]; }
+
+private:
+    // Not copyable: the destructor frees the arrays, so a copy would free them twice
+    MinMaxZTreeCilk(const MinMaxZTreeCilk &);
+    MinMaxZTreeCilk &operator=(const MinMaxZTreeCilk &);
+
+    int mTileWidth, mTileHeight;  // base (level 0) tile size
+    int mLevels;
+    int mNumTilesX, mNumTilesY;   // tile counts at level 0
+
+    // One array for each "level" in the tree
+    float **mMinZArrays;
+    float **mMaxZArrays;
+};
+
+static MinMaxZTreeCilk *gMinMaxZTreeCilk = 0;  // assigned by InitDynamicCilk(); null until then
+
+// (Re)creates the global min/max Z tree for the framebuffer size in input->header.
+void InitDynamicCilk(InputData *input) {
+    delete gMinMaxZTreeCilk;  // frees the tree of an earlier call (no-op while the pointer is null)
+    gMinMaxZTreeCilk = new MinMaxZTreeCilk(MIN_TILE_WIDTH, MIN_TILE_HEIGHT, DYNAMIC_TREE_LEVELS,
+        input->header.framebufferWidth, input->header.framebufferHeight);
+}
+
+
+// Shades tile (tileX, tileY) of tree level `level` with the lights listed in
+// lightIndices[0..numLights): directly at level 0 or when the list is short enough,
+// otherwise by splitting the list across the four child tiles and recursing.
+static void
+ShadeDynamicTileRecurse(InputData *input, int level, int tileX, int tileY, 
+                        int *lightIndices, int numLights, 
+                        Framebuffer *framebuffer) {
+    const MinMaxZTreeCilk *minMaxZTree = gMinMaxZTreeCilk;
+    
+    // If we have few enough lights or this is the base case (last level), shade
+    // this full tile directly
+    if (level == 0 || numLights < DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE) {
+        int width = minMaxZTree->TileWidth(level);
+        int height = minMaxZTree->TileHeight(level);
+        int startX = tileX * width;
+        int startY = tileY * height;
+        int endX = std::min(input->header.framebufferWidth, startX + width);
+        int endY = std::min(input->header.framebufferHeight, startY + height);
+        
+        // Skip entirely offscreen tiles
+        if (endX > startX && endY > startY) {
+            ispc::ShadeTile(
+                startX, endX, startY, endY,
+                input->header.framebufferWidth, input->header.framebufferHeight,
+                &input->arrays,
+                input->header.cameraProj[0][0], input->header.cameraProj[1][1], 
+                input->header.cameraProj[2][2], input->header.cameraProj[3][2],
+                lightIndices, numLights, VISUALIZE_LIGHT_COUNT, 
+                framebuffer->r, framebuffer->g, framebuffer->b);
+        }
+    } 
+    else {
+        // Otherwise, subdivide and 4-way recurse using X and Y splitting planes
+        // Move down a level in the tree
+        --level;
+        tileX <<= 1;
+        tileY <<= 1;
+        int width = minMaxZTree->TileWidth(level);
+        int height = minMaxZTree->TileHeight(level);
+
+        // Work out splitting coords
+        int midX = (tileX + 1) * width;
+        int midY = (tileY + 1) * height;
+
+        // Read subtile min/max data
+        // NOTE: We must be sure to handle out-of-bounds access here since sometimes
+        // we'll only have 1 or 2 subtiles for non-pow-2 framebuffer sizes.
+        bool rightTileExists = (tileX + 1 < minMaxZTree->NumTilesX(level));
+        bool bottomTileExists = (tileY + 1 < minMaxZTree->NumTilesY(level));
+
+        // NOTE: Order is 00, 10, 01, 11
+        // Set defaults up to cull all lights if the tile doesn't exist (offscreen)
+        float minZ[4] = {input->header.cameraFar, input->header.cameraFar, 
+                         input->header.cameraFar, input->header.cameraFar};
+        float maxZ[4] = {input->header.cameraNear, input->header.cameraNear, 
+                         input->header.cameraNear, input->header.cameraNear};
+        minZ[0] = minMaxZTree->MinZ(level, tileX, tileY);
+        maxZ[0] = minMaxZTree->MaxZ(level, tileX, tileY);
+        if (rightTileExists) {
+            minZ[1] = minMaxZTree->MinZ(level, tileX + 1, tileY);
+            maxZ[1] = minMaxZTree->MaxZ(level, tileX + 1, tileY);
+            if (bottomTileExists) {
+                minZ[3] = minMaxZTree->MinZ(level, tileX + 1, tileY + 1);
+                maxZ[3] = minMaxZTree->MaxZ(level, tileX + 1, tileY + 1);
+            }
+        }
+        if (bottomTileExists) {
+            minZ[2] = minMaxZTree->MinZ(level, tileX, tileY + 1);
+            maxZ[2] = minMaxZTree->MaxZ(level, tileX, tileY + 1);
+        }
+
+        // Cull lights into subtile lists
+#ifdef ISPC_IS_WINDOWS
+        __declspec(align(ALIGNMENT_BYTES)) 
+#endif
+            int subtileLightIndices[4][MAX_LIGHTS]
+#ifndef ISPC_IS_WINDOWS
+            __attribute__ ((aligned(ALIGNMENT_BYTES)))
+#endif
+;
+        int subtileNumLights[4];
+        ispc::SplitTileMinMax(midX, midY, minZ, maxZ,
+            input->header.framebufferWidth, input->header.framebufferHeight, 
+            input->header.cameraProj[0][0], input->header.cameraProj[1][1],
+            lightIndices, numLights, input->arrays.lightPositionView_x, 
+            input->arrays.lightPositionView_y, input->arrays.lightPositionView_z, 
+            input->arrays.lightAttenuationEnd, subtileLightIndices[0], MAX_LIGHTS, subtileNumLights);
+
+        // Recurse into subtiles.  Subtiles past the edge of the tree are skipped: they have
+        // no pixels to shade, and recursing into one would index the tree out of bounds.
+        _Cilk_spawn ShadeDynamicTileRecurse(input, level, tileX, tileY,
+            subtileLightIndices[0], subtileNumLights[0], framebuffer);
+        if (rightTileExists)
+            _Cilk_spawn ShadeDynamicTileRecurse(input, level, tileX + 1, tileY,
+                subtileLightIndices[1], subtileNumLights[1], framebuffer);
+        if (bottomTileExists)
+            _Cilk_spawn ShadeDynamicTileRecurse(input, level, tileX, tileY + 1,
+                subtileLightIndices[2], subtileNumLights[2], framebuffer);
+        if (rightTileExists && bottomTileExists)
+            ShadeDynamicTileRecurse(input, level, tileX + 1, tileY + 1,
+                subtileLightIndices[3], subtileNumLights[3], framebuffer);
+    }
+}
+
+
+// Entry point for one root tile: culls the light list against the tile's pixel
+// rectangle and Z range, then hands the surviving lights to the recursive shader.
+static void
+ShadeDynamicTile(InputData *input, int level, int tileX, int tileY,
+                 Framebuffer *framebuffer) {
+    const MinMaxZTreeCilk *tree = gMinMaxZTreeCilk;
+
+    // Pixel rectangle of the tile (clamped to the framebuffer) and its Z range from the tree
+    const int tileW = tree->TileWidth(level);
+    const int tileH = tree->TileHeight(level);
+    const int x0 = tileX * tileW;
+    const int y0 = tileY * tileH;
+    const int x1 = std::min(input->header.framebufferWidth, x0 + tileW);
+    const int y1 = std::min(input->header.framebufferHeight, y0 + tileH);
+    const float zMin = tree->MinZ(level, tileX, tileY);
+    const float zMax = tree->MaxZ(level, tileX, tileY);
+
+    // This is a root tile, so first do a full 6-plane cull
+#ifdef ISPC_IS_WINDOWS
+    __declspec(align(ALIGNMENT_BYTES)) 
+#endif
+        int culledLights[MAX_LIGHTS]
+#ifndef ISPC_IS_WINDOWS
+        __attribute__ ((aligned(ALIGNMENT_BYTES)))
+#endif
+;
+    const int culledCount = ispc::IntersectLightsWithTileMinMax(
+        x0, x1, y0, y1, zMin, zMax,
+        input->header.framebufferWidth, input->header.framebufferHeight,
+        input->header.cameraProj[0][0], input->header.cameraProj[1][1],
+        MAX_LIGHTS, input->arrays.lightPositionView_x,
+        input->arrays.lightPositionView_y, input->arrays.lightPositionView_z,
+        input->arrays.lightAttenuationEnd, culledLights);
+
+    // Now kick off the recursive process for this tile
+    ShadeDynamicTileRecurse(input, level, tileX, tileY, culledLights, culledCount, framebuffer);
+}
+
+
+// Parallel (Cilk) driver for the dynamic path: refreshes the global min/max Z
+// tree from the depth buffer, then shades every root tile inside a _Cilk_for.
+void
+DispatchDynamicCilk(InputData *input, Framebuffer *framebuffer)
+{
+    MinMaxZTreeCilk *tree = gMinMaxZTreeCilk;
+        
+    // Update min/max Z tree
+    tree->Update(input->arrays.zBuffer, input->header.framebufferWidth,
+                 input->header.cameraProj[2][2], input->header.cameraProj[3][2],
+                 input->header.cameraNear, input->header.cameraFar);
+
+    // Launch the "root" tiles.  Ideally these should at least fill the
+    // machine... at the moment we have a static number of "levels" to the
+    // mip tree but it might make sense to compute it based on the width of
+    // the machine.
+    const int topLevel = tree->Levels() - 1;
+    const int tilesPerRow = tree->NumTilesX(topLevel);
+    const int tileCount = tilesPerRow * tree->NumTilesY(topLevel);
+    _Cilk_for (int t = 0; t < tileCount; ++t) {
+        // Flat index -> (column, row), row-major
+        ShadeDynamicTile(input, topLevel, t % tilesPerRow, t / tilesPerRow, framebuffer);
+    }
+}
+
+#endif // __cilkplusplus
diff --git a/examples/deferred/kernels.ispc b/examples/deferred/kernels.ispc
new file mode 100644
index 00000000..27e9c5d1
--- /dev/null
+++ b/examples/deferred/kernels.ispc
@@ -0,0 +1,717 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+#include "deferred.h"
+
+struct InputDataArrays  // G-buffer arrays (indexed per pixel in ShadeTile) followed by light arrays (indexed per light)
+{
+    uniform float zBuffer[];  // depth-buffer Z; the kernels unproject it with cameraProj_33/_43
+    uniform unsigned int16 normalEncoded_x[]; // half float
+    uniform unsigned int16 normalEncoded_y[]; // half float
+    uniform unsigned int16 specularAmount[]; // half float
+    uniform unsigned int16 specularPower[]; // half float
+    uniform unsigned int8 albedo_x[]; // unorm8
+    uniform unsigned int8 albedo_y[]; // unorm8
+    uniform unsigned int8 albedo_z[]; // unorm8
+    uniform float lightPositionView_x[];  // light positions; compared against view-space Z in the culling code
+    uniform float lightPositionView_y[];
+    uniform float lightPositionView_z[];
+    uniform float lightAttenuationBegin[];
+    uniform float lightColor_x[];
+    uniform float lightColor_y[];
+    uniform float lightColor_z[];
+    uniform float lightAttenuationEnd[];  // used as the light's culling radius by the tile/light intersection tests
+};
+
+struct InputHeader  // NOTE(review): presumably mirrors the header struct used on the C++ side (input->header) - confirm
+{
+    uniform float cameraProj[4][4];  // C++ callers pass elements [0][0], [1][1], [2][2] and [3][2] to the kernels
+    uniform float cameraNear;
+    uniform float cameraFar;
+
+    uniform int32 framebufferWidth;
+    uniform int32 framebufferHeight;
+    uniform int32 numLights;
+    uniform int32 inputDataChunkSize;
+    uniform int32 inputDataArrayOffsets[idaNum];  // idaNum is not defined in this file; presumably from deferred.h
+};
+
+export void foo(reference InputHeader h) { }  // NOTE(review): empty export; presumably only here so InputHeader is emitted into the generated header - confirm
+
+
+///////////////////////////////////////////////////////////////////////////
+// Common utility routines
+
+// Returns the dot product of the 3-vectors (x, y, z) and (a, b, c).
+static inline float dot3(float x, float y, float z, float a, float b, float c) {
+    return (x*a + y*b + z*c);
+}
+
+
+// Scales (x, y, z) by rsqrt() of its squared length and writes the result to (ox, oy, oz).
+static inline void normalize3(float x, float y, float z,
+                              reference float ox, reference float oy, reference float oz) {
+    float invLen = rsqrt(x*x + y*y + z*z);
+    ox = x * invLen;
+    oy = y * invLen;
+    oz = z * invLen;
+}
+
+
+// Converts an 8-bit unorm value to a float by scaling it with 1/255 (0 -> 0.0).
+static inline float Unorm8ToFloat32(unsigned int8 u) {
+    return (1.0f / 255.0f) * (float)u;
+}
+
+
+// Scales f by 255 and converts to unsigned int8; no clamping or rounding is applied here.
+static inline unsigned int8 Float32ToUnorm8(float f) {
+    return (unsigned int8)(255.0f * f);
+}
+
+
+// Computes the min/max view-space Z over the valid pixels of the tile; if no pixel
+// is valid the outputs are minZ = cameraFar and maxZ = cameraNear.
+// tile width must be a multiple of programCount (SIMD size)
+static void
+ComputeZBounds(
+    uniform int32 tileStartX, uniform int32 tileEndX,
+    uniform int32 tileStartY, uniform int32 tileEndY,
+    // G-buffer data
+    uniform float zBuffer[], uniform int32 gBufferWidth,
+    // Camera data
+    uniform float cameraProj_33, uniform float cameraProj_43,
+    uniform float cameraNear, uniform float cameraFar,
+    // Output
+    reference uniform float minZ, reference uniform float maxZ
+    )
+{
+    // Per-lane running bounds; they start inverted so any valid sample replaces them
+    float zLo = cameraFar;
+    float zHi = cameraNear;
+    for (uniform int32 row = tileStartY; row < tileEndY; ++row) {
+        uniform int32 rowBase = row * gBufferWidth;
+        for (uniform int32 col = tileStartX; col < tileEndX; col += programCount) {
+            // Unproject depth buffer Z value into view space
+            float depth = zBuffer[(rowBase + col) + programIndex];
+            float zView = cameraProj_43 / (depth - cameraProj_33);
+
+            // Avoid considering skybox/background or otherwise invalid pixels
+            if ((zView >= cameraNear) && (zView < cameraFar)) {
+                zLo = min(zLo, zView);
+                zHi = max(zHi, zView);
+            }
+        }
+    }
+    minZ = reduce_min(zLo);
+    maxZ = reduce_max(zHi);
+}
+
+
+// Culls lights [0, numLights) against the tile's four side planes and its [minZ, maxZ] range, packs the surviving indices into tileLightIndices and returns their count.
+// tile width and numLights must currently be multiples of programCount (SIMD size)
+export uniform int32
+IntersectLightsWithTileMinMax(
+    uniform int32 tileStartX, uniform int32 tileEndX,
+    uniform int32 tileStartY, uniform int32 tileEndY,
+    // Tile data
+    uniform float minZ,
+    uniform float maxZ,
+    // G-buffer data
+    uniform int32 gBufferWidth, uniform int32 gBufferHeight,
+    // Camera data
+    uniform float cameraProj_11, uniform float cameraProj_22,
+    // Light Data
+    uniform int32 numLights,
+    uniform float light_positionView_x_array[],
+    uniform float light_positionView_y_array[],
+    uniform float light_positionView_z_array[],
+    uniform float light_attenuationEnd_array[],
+    // Output
+    reference uniform int32 tileLightIndices[]
+    )
+{
+    uniform float gBufferScale_x = 0.5f * (float)gBufferWidth;
+    uniform float gBufferScale_y = 0.5f * (float)gBufferHeight;
+        
+    // Parallelize across frustum planes.
+    // We really only have four side planes here, but write the code to
+    // handle programCount > 4 robustly
+    uniform float frustumPlanes_xy[programCount];  // x coefficient for planes 0-1, y coefficient for planes 2-3
+    uniform float frustumPlanes_z[programCount];   // z coefficient of each plane
+
+    // TODO: If programIndex < 4 here? Don't care about masking off the
+    // rest but if interleaving ("x2" modes) the other lanes should ideally
+    // not be emitted...
+    {
+        // This one is totally constant over the whole screen... worth pulling it up at all?
+        float frustumPlanes_xy_v;  // only lanes 0-3 are set below; other lanes are never read back
+        frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 0, -(cameraProj_11 * gBufferScale_x));
+        frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 1,  (cameraProj_11 * gBufferScale_x));
+        frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 2,  (cameraProj_22 * gBufferScale_y));
+        frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 3, -(cameraProj_22 * gBufferScale_y));
+    
+        float frustumPlanes_z_v;  // only lanes 0-3 are set below; other lanes are never read back
+        frustumPlanes_z_v = insert(frustumPlanes_z_v, 0,  tileEndX - gBufferScale_x);
+        frustumPlanes_z_v = insert(frustumPlanes_z_v, 1, -tileStartX + gBufferScale_x);
+        frustumPlanes_z_v = insert(frustumPlanes_z_v, 2,  tileEndY - gBufferScale_y);
+        frustumPlanes_z_v = insert(frustumPlanes_z_v, 3, -tileStartY + gBufferScale_y);
+
+        // Normalize each plane (one plane per lane) to unit length
+        float norm = rsqrt(frustumPlanes_xy_v * frustumPlanes_xy_v + 
+                           frustumPlanes_z_v * frustumPlanes_z_v);
+            frustumPlanes_xy_v *= norm;
+            frustumPlanes_z_v *= norm;
+
+        // Save out for uniform use later
+        frustumPlanes_xy[programIndex] = frustumPlanes_xy_v;
+        frustumPlanes_z[programIndex] = frustumPlanes_z_v;
+    }
+
+    uniform int32 tileNumLights = 0;
+
+    for (uniform int32 baseLightIndex = 0; baseLightIndex < numLights; 
+         baseLightIndex += programCount) {
+        int32 lightIndex = baseLightIndex + programIndex;  // one light per lane
+        float light_positionView_z = light_positionView_z_array[lightIndex];
+        float light_attenuationEnd = light_attenuationEnd_array[lightIndex];
+        float light_attenuationEndNeg = -light_attenuationEnd;
+
+        float d = light_positionView_z - minZ;
+        bool inFrustum = (d >= light_attenuationEndNeg);  // light reaches at least as far as minZ
+
+        d = maxZ - light_positionView_z;
+        inFrustum = inFrustum && (d >= light_attenuationEndNeg);  // ...and does not start beyond maxZ
+        
+        // This seems better than cif(!inFrustum) ccontinue; here since we
+        // don't actually need to mask the rest of this function - this is
+        // just a greedy early-out.  Could also structure all of this as
+        // nested if() statements, but this is a bit easier to read
+        if (!any(inFrustum)) 
+            continue;
+
+        float light_positionView_x = light_positionView_x_array[lightIndex];
+        float light_positionView_y = light_positionView_y_array[lightIndex];
+
+        d = light_positionView_z * frustumPlanes_z[0] + 
+            light_positionView_x * frustumPlanes_xy[0];
+        inFrustum = inFrustum && (d >= light_attenuationEndNeg);
+
+        d = light_positionView_z * frustumPlanes_z[1] + 
+            light_positionView_x * frustumPlanes_xy[1];
+        inFrustum = inFrustum && (d >= light_attenuationEndNeg);
+
+        d = light_positionView_z * frustumPlanes_z[2] + 
+            light_positionView_y * frustumPlanes_xy[2];
+        inFrustum = inFrustum && (d >= light_attenuationEndNeg);
+
+        d = light_positionView_z * frustumPlanes_z[3] + 
+            light_positionView_y * frustumPlanes_xy[3];
+        inFrustum = inFrustum && (d >= light_attenuationEndNeg);
+        
+        // Pack and store intersecting lights; the count grows by the number of lanes that stored
+        cif (inFrustum) {
+            tileNumLights += packed_store_active(tileLightIndices, tileNumLights, 
+                                                 lightIndex);
+        }
+    }
+
+    return tileNumLights;
+}
+
+
+// Computes the tile's Z bounds from zBuffer, then culls lights [0, numLights) against the
+// tile.  Returns the number of light indices written to tileLightIndices.
+// tile width and numLights must currently be multiples of programCount (SIMD size)
+static uniform int32
+IntersectLightsWithTile(
+    uniform int32 tileStartX, uniform int32 tileEndX,
+    uniform int32 tileStartY, uniform int32 tileEndY,
+    uniform int32 gBufferWidth, uniform int32 gBufferHeight,
+    // G-buffer data
+    uniform float zBuffer[],
+    // Camera data
+    uniform float cameraProj_11, uniform float cameraProj_22,
+    uniform float cameraProj_33, uniform float cameraProj_43,
+    uniform float cameraNear, uniform float cameraFar,
+    // Light Data
+    uniform int32 numLights,
+    uniform float light_positionView_x_array[],
+    uniform float light_positionView_y_array[],
+    uniform float light_positionView_z_array[],
+    uniform float light_attenuationEnd_array[],
+    // Output
+    reference uniform int32 tileLightIndices[]
+    )
+{
+    uniform float minZ, maxZ;
+    ComputeZBounds(tileStartX, tileEndX, tileStartY, tileEndY,
+        zBuffer, gBufferWidth, cameraProj_33, cameraProj_43, cameraNear, cameraFar,
+        minZ, maxZ);
+
+    // Forward the caller's light count (it used to be ignored in favour of MAX_LIGHTS)
+    return IntersectLightsWithTileMinMax(
+        tileStartX, tileEndX, tileStartY, tileEndY, minZ, maxZ,
+        gBufferWidth, gBufferHeight, cameraProj_11, cameraProj_22,
+        numLights, light_positionView_x_array, light_positionView_y_array,
+        light_positionView_z_array, light_attenuationEnd_array,
+        tileLightIndices);
+}
+
+
+// tile width must be a multiple of programCount (SIMD size)
+export void
+ShadeTile(
+    uniform int32 tileStartX, uniform int32 tileEndX,
+    uniform int32 tileStartY, uniform int32 tileEndY,
+    uniform int32 gBufferWidth, uniform int32 gBufferHeight,
+    reference uniform InputDataArrays inputData,
+    // Camera data
+    uniform float cameraProj_11, uniform float cameraProj_22,
+    uniform float cameraProj_33, uniform float cameraProj_43,
+    // Light list
+    reference uniform int32 tileLightIndices[],
+    uniform int32 tileNumLights,
+    // UI
+    uniform bool visualizeLightCount,
+    // Output
+    reference uniform unsigned int8 framebuffer_r[],
+    reference uniform unsigned int8 framebuffer_g[],
+    reference uniform unsigned int8 framebuffer_b[]
+    )
+{
+    if (tileNumLights == 0 || visualizeLightCount) {
+        uniform unsigned int8 c = (unsigned int8)(min(tileNumLights << 2, 255));
+        for (uniform int32 y = tileStartY; y < tileEndY; ++y) {
+            for (uniform int32 x = tileStartX; x < tileEndX; x += programCount) {
+                int32 framebufferIndex = (y * gBufferWidth + x) + programIndex;
+                framebuffer_r[framebufferIndex] = c;
+                framebuffer_g[framebufferIndex] = c;
+                framebuffer_b[framebufferIndex] = c;
+            }
+        }
+    } else {
+        uniform float twoOverGBufferWidth = 2.0f / gBufferWidth;
+        uniform float twoOverGBufferHeight = 2.0f / gBufferHeight;
+        
+        for (uniform int32 y = tileStartY; y < tileEndY; ++y) {
+            uniform float positionScreen_y = -(((0.5f + y) * twoOverGBufferHeight) - 1.f);
+
+            for (uniform int32 x = tileStartX; x < tileEndX; x += programCount) {
+                uniform int32 gBufferOffsetBase = y * gBufferWidth + x;
+                int32 gBufferOffset = gBufferOffsetBase + programIndex;
+                
+                // Reconstruct position and (negative) view vector from G-buffer
+                float surface_positionView_x, surface_positionView_y, surface_positionView_z;
+                float Vneg_x, Vneg_y, Vneg_z;
+
+                float z = inputData.zBuffer[gBufferOffset];
+
+                // Compute screen/clip-space position
+                // NOTE: Mind DX11 viewport transform and pixel center!
+                float positionScreen_x = (0.5f + (float)(x + programIndex)) * 
+                    twoOverGBufferWidth - 1.0f;
+
+                // Unproject depth buffer Z value into view space
+                surface_positionView_z = cameraProj_43 / (z - cameraProj_33);
+                surface_positionView_x = positionScreen_x * surface_positionView_z / 
+                    cameraProj_11;
+                surface_positionView_y = positionScreen_y * surface_positionView_z / 
+                    cameraProj_22;
+                
+                // We actually end up with a vector pointing *at* the
+                // surface (i.e. the negative view vector)
+                normalize3(surface_positionView_x, surface_positionView_y, 
+                           surface_positionView_z, Vneg_x, Vneg_y, Vneg_z);
+
+                // Reconstruct normal from G-buffer
+                float surface_normal_x, surface_normal_y, surface_normal_z;
+                float normal_x = half_to_float_fast(inputData.normalEncoded_x[gBufferOffset]);
+                float normal_y = half_to_float_fast(inputData.normalEncoded_y[gBufferOffset]);
+                    
+                float f = (normal_x - normal_x * normal_x) + (normal_y - normal_y * normal_y);
+                float m = sqrt(4.0f * f - 1.0f);
+                    
+                surface_normal_x = m * (4.0f * normal_x - 2.0f);
+                surface_normal_y = m * (4.0f * normal_y - 2.0f);
+                surface_normal_z = 3.0f - 8.0f * f;
+
+                // Load other G-buffer parameters
+                float surface_specularAmount = 
+                    half_to_float_fast(inputData.specularAmount[gBufferOffset]);
+                float surface_specularPower  = 
+                    half_to_float_fast(inputData.specularPower[gBufferOffset]);
+                float surface_albedo_x = Unorm8ToFloat32(inputData.albedo_x[gBufferOffset]);
+                float surface_albedo_y = Unorm8ToFloat32(inputData.albedo_y[gBufferOffset]);
+                float surface_albedo_z = Unorm8ToFloat32(inputData.albedo_z[gBufferOffset]);
+                
+                float lit_x = 0.0f;
+                float lit_y = 0.0f;
+                float lit_z = 0.0f;
+                for (uniform int32 tileLightIndex = 0; tileLightIndex < tileNumLights; 
+                     ++tileLightIndex) {
+                    uniform int32 lightIndex = tileLightIndices[tileLightIndex];
+                                        
+                    // Gather light data relevant to initial culling
+                    uniform float light_positionView_x = 
+                        inputData.lightPositionView_x[lightIndex];
+                    uniform float light_positionView_y = 
+                        inputData.lightPositionView_y[lightIndex];
+                    uniform float light_positionView_z = 
+                        inputData.lightPositionView_z[lightIndex];
+                    uniform float light_attenuationEnd = 
+                        inputData.lightAttenuationEnd[lightIndex];
+                    
+                    // Compute light vector
+                    float L_x = light_positionView_x - surface_positionView_x;
+                    float L_y = light_positionView_y - surface_positionView_y;
+                    float L_z = light_positionView_z - surface_positionView_z;
+
+                    float distanceToLight2 = dot3(L_x, L_y, L_z, L_x, L_y, L_z);
+                    
+                    // Clip at end of attenuation
+                    float light_attenutaionEnd2 = light_attenuationEnd * light_attenuationEnd;
+
+                    cif (distanceToLight2 < light_attenutaionEnd2) {                    
+                        float distanceToLight = sqrt(distanceToLight2);
+
+                        // HLSL "rcp" is allowed to be fairly inaccurate
+                        float distanceToLightRcp = rcp(distanceToLight);
+                        L_x *= distanceToLightRcp;
+                        L_y *= distanceToLightRcp;
+                        L_z *= distanceToLightRcp;
+
+                        // Start computing brdf
+                        float NdotL = dot3(surface_normal_x, surface_normal_y, 
+                                           surface_normal_z, L_x, L_y, L_z);
+                    
+                        // Clip back facing
+                        cif (NdotL > 0.0f) {
+                            uniform float light_attenuationBegin = 
+                                inputData.lightAttenuationBegin[lightIndex];
+
+                            // Light distance attenuation (linstep)
+                            float lightRange = (light_attenuationEnd - light_attenuationBegin);
+                            float falloffPosition = (light_attenuationEnd - distanceToLight);
+                            float attenuation = min(falloffPosition / lightRange, 1.0f);
+
+                            float H_x = (L_x - Vneg_x);
+                            float H_y = (L_y - Vneg_y);
+                            float H_z = (L_z - Vneg_z);
+                            normalize3(H_x, H_y, H_z, H_x, H_y, H_z);
+                    
+                            float NdotH = dot3(surface_normal_x, surface_normal_y, 
+                                               surface_normal_z, H_x, H_y, H_z);
+                            NdotH = max(NdotH, 0.0f);
+
+                            float specular = pow(NdotH, surface_specularPower);
+                            float specularNorm = (surface_specularPower + 2.0f) * 
+                                (1.0f / 8.0f);
+                            float specularContrib = surface_specularAmount * 
+                                specularNorm * specular;
+
+                            float k = attenuation * NdotL * (1.0f + specularContrib);
+                    
+                            uniform float light_color_x = inputData.lightColor_x[lightIndex];
+                            uniform float light_color_y = inputData.lightColor_y[lightIndex];
+                            uniform float light_color_z = inputData.lightColor_z[lightIndex];
+
+                            float lightContrib_x = surface_albedo_x * light_color_x;
+                            float lightContrib_y = surface_albedo_y * light_color_y;
+                            float lightContrib_z = surface_albedo_z * light_color_z;
+
+                            lit_x += lightContrib_x * k;
+                            lit_y += lightContrib_y * k;
+                            lit_z += lightContrib_z * k;
+                        }
+                    }
+                }
+
+                // Gamma correct
+                // These pows are pretty slow right now, but we can do
+                // something faster if really necessary to squeeze every
+                // last bit of performance out of it
+                float gamma = 1.0 / 2.2f;
+                lit_x = pow(clamp(lit_x, 0.0f, 1.0f), gamma);
+                lit_y = pow(clamp(lit_y, 0.0f, 1.0f), gamma);
+                lit_z = pow(clamp(lit_z, 0.0f, 1.0f), gamma);
+                
+                framebuffer_r[gBufferOffset] = Float32ToUnorm8(lit_x);
+                framebuffer_g[gBufferOffset] = Float32ToUnorm8(lit_y);
+                framebuffer_b[gBufferOffset] = Float32ToUnorm8(lit_z);
+            }
+        }
+    }
+}
+
+
+///////////////////////////////////////////////////////////////////////////
+// Static decomposition
+
+task void
+RenderTile(uniform int g, uniform int num_groups_x, uniform int num_groups_y,
+           reference uniform InputHeader inputHeader,
+           reference uniform InputDataArrays inputData,
+           uniform int visualizeLightCount,
+           // Output
+           reference uniform unsigned int8 framebuffer_r[],
+           reference uniform unsigned int8 framebuffer_g[],
+           reference uniform unsigned int8 framebuffer_b[]) {
+    // Per-tile task: g indexes a row-major grid of num_groups_x tiles per row.
+    // Culls the lights against the tile, then shades it.  The tile extents
+    // computed here are not clamped to the framebuffer dimensions.
+    uniform int32 tileRow = g / num_groups_x;
+    uniform int32 tileCol = g % num_groups_x;
+    uniform int32 x0 = tileCol * MIN_TILE_WIDTH;
+    uniform int32 x1 = x0 + MIN_TILE_WIDTH;
+    uniform int32 y0 = tileRow * MIN_TILE_HEIGHT;
+    uniform int32 y1 = y0 + MIN_TILE_HEIGHT;
+    uniform int width = inputHeader.framebufferWidth;
+    uniform int height = inputHeader.framebufferHeight;
+
+    // Projection matrix entries forwarded to the culling and shading helpers
+    uniform float proj00 = inputHeader.cameraProj[0][0];
+    uniform float proj11 = inputHeader.cameraProj[1][1];
+    uniform float proj22 = inputHeader.cameraProj[2][2];
+    uniform float proj32 = inputHeader.cameraProj[3][2];
+
+    // Build the list of lights that intersect this tile
+    uniform int tileLights[MAX_LIGHTS];
+    uniform int numTileLights =
+        IntersectLightsWithTile(x0, x1, y0, y1,
+                                width, height,
+                                inputData.zBuffer,
+                                proj00, proj11, proj22, proj32,
+                                inputHeader.cameraNear, inputHeader.cameraFar,
+                                MAX_LIGHTS,
+                                inputData.lightPositionView_x,
+                                inputData.lightPositionView_y,
+                                inputData.lightPositionView_z,
+                                inputData.lightAttenuationEnd,
+                                tileLights);
+
+    // Shade every pixel of the tile using only those lights
+    ShadeTile(x0, x1, y0, y1, width, height, inputData,
+              proj00, proj11, proj22, proj32,
+              tileLights, numTileLights, visualizeLightCount,
+              framebuffer_r, framebuffer_g, framebuffer_b);
+}
+
+
+export void
+RenderStatic(reference uniform InputHeader inputHeader,
+             reference uniform InputDataArrays inputData,
+             uniform int visualizeLightCount,
+             // Output
+             reference uniform unsigned int8 framebuffer_r[],
+             reference uniform unsigned int8 framebuffer_g[],
+             reference uniform unsigned int8 framebuffer_b[]) {
+    // Ceiling division, so partially covered tiles at the edges get a task too
+    uniform int tilesX =
+        (inputHeader.framebufferWidth + MIN_TILE_WIDTH - 1) / MIN_TILE_WIDTH;
+    uniform int tilesY =
+        (inputHeader.framebufferHeight + MIN_TILE_HEIGHT - 1) / MIN_TILE_HEIGHT;
+    // Launch one RenderTile task per tile, numbered in row-major order
+    for (uniform int tile = 0; tile < tilesX * tilesY; ++tile)
+        launch < RenderTile(tile, tilesX, tilesY,
+                            inputHeader, inputData, visualizeLightCount,
+                            framebuffer_r, framebuffer_g, framebuffer_b) >;
+}
+
+
+///////////////////////////////////////////////////////////////////////////
+// Routines for dynamic decomposition path
+
+// Runs ComputeZBounds on every tile of row tileY; tile width must be a multiple of programCount
+export void
+ComputeZBoundsRow(
+    uniform int32 tileY,
+    uniform int32 tileWidth, uniform int32 tileHeight,
+    uniform int32 numTilesX, uniform int32 numTilesY,
+    // G-buffer data
+    uniform float zBuffer[],
+    uniform int32 gBufferWidth,
+    // Camera data
+    uniform float cameraProj_33, uniform float cameraProj_43,
+    uniform float cameraNear, uniform float cameraFar,
+    // Output: one entry per tile of the row, indexed by the tile's X index
+    reference uniform float minZArray[],
+    reference uniform float maxZArray[]
+    )
+{
+    uniform int32 rowStartY = tileY * tileHeight;
+    uniform int32 rowEndY = rowStartY + tileHeight;
+    for (uniform int32 col = 0; col < numTilesX; ++col) {
+        uniform int32 colStartX = col * tileWidth;
+        uniform float tileMinZ, tileMaxZ;
+        ComputeZBounds(colStartX, colStartX + tileWidth, rowStartY, rowEndY,
+                       zBuffer, gBufferWidth, cameraProj_33, cameraProj_43,
+                       cameraNear, cameraFar, tileMinZ, tileMaxZ);
+        minZArray[col] = tileMinZ;
+        maxZArray[col] = tileMaxZ;
+    }
+}
+
+
+// Bins the lights listed in lightIndices[0..numLights) into the four subtiles of a tile.  numLights
+// need not be a multiple of programCount, but the arrays should handle programCount-sized load/stores.
+export void
+SplitTileMinMax(
+    uniform int32 tileMidX, uniform int32 tileMidY,
+    // Per-subtile Z bounds, 4 entries each, in the order 00, 10, 01, 11
+    uniform float subtileMinZ[],
+    uniform float subtileMaxZ[],
+    // G-buffer data
+    uniform int32 gBufferWidth, uniform int32 gBufferHeight,
+    // Camera data
+    uniform float cameraProj_11, uniform float cameraProj_22,
+    // Light data: lightIndices selects entries of the per-light arrays below
+    reference uniform int32 lightIndices[],
+    uniform int32 numLights,
+    uniform float light_positionView_x_array[],
+    uniform float light_positionView_y_array[],
+    uniform float light_positionView_z_array[],
+    uniform float light_attenuationEnd_array[],
+    // Outputs: subtile k's light list starts at subtileIndices[k * subtileIndicesPitch]
+    // TODO: ISPC doesn't currently like multidimensional arrays so we'll do the
+    // indexing math ourselves; subtileNumLights[k] receives the list length
+    reference uniform int32 subtileIndices[],
+    uniform int32 subtileIndicesPitch,
+    reference uniform int32 subtileNumLights[]
+    )
+{
+    uniform float gBufferScale_x = 0.5f * (float)gBufferWidth;
+    uniform float gBufferScale_y = 0.5f * (float)gBufferHeight;
+        
+    // Parallelize across frustum planes
+    // Only have 2 frustum split planes here so may not be worth it, but
+    // we'll do it for now for consistency
+    uniform float frustumPlanes_xy[programCount];
+    uniform float frustumPlanes_z[programCount];
+    // Lane 0 holds the X split plane, lane 1 the Y split plane; only those two are read back
+    // This one is totally constant over the whole screen... worth pulling it up at all?
+    float frustumPlanes_xy_v;
+    frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 0, -(cameraProj_11 * gBufferScale_x));
+    frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 1,  (cameraProj_22 * gBufferScale_y));
+    
+    float frustumPlanes_z_v;
+    frustumPlanes_z_v = insert(frustumPlanes_z_v, 0, tileMidX - gBufferScale_x);
+    frustumPlanes_z_v = insert(frustumPlanes_z_v, 1, tileMidY - gBufferScale_y);
+
+    // Normalize each plane's (xy, z) pair to unit length
+    float norm = rsqrt(frustumPlanes_xy_v * frustumPlanes_xy_v + 
+                       frustumPlanes_z_v * frustumPlanes_z_v);
+    frustumPlanes_xy_v *= norm;
+    frustumPlanes_z_v *= norm;
+
+    // Save out for uniform use later
+    frustumPlanes_xy[programIndex] = frustumPlanes_xy_v;
+    frustumPlanes_z[programIndex] = frustumPlanes_z_v;
+
+    // Write cursor per subtile, starting at that subtile's region of subtileIndices
+    uniform int32 subtileLightOffset[4];
+    subtileLightOffset[0] = 0 * subtileIndicesPitch;
+    subtileLightOffset[1] = 1 * subtileIndicesPitch;
+    subtileLightOffset[2] = 2 * subtileIndicesPitch;
+    subtileLightOffset[3] = 3 * subtileIndicesPitch;
+    // Each program instance handles every programCount-th entry of lightIndices
+    for (int32 i = programIndex; i < numLights; i += programCount) {
+        // TODO: ISPC says gather required here when it actually
+        // isn't... this could be fixed by nesting an if() within a
+        // uniform loop, but I'm not totally sure if that's a win
+        // overall. For now we'll just eat the perf cost for cleanliness
+        // since the below are real gathers anyways.
+        int32 lightIndex = lightIndices[i];
+
+        float light_positionView_x = light_positionView_x_array[lightIndex];
+        float light_positionView_y = light_positionView_y_array[lightIndex];
+        float light_positionView_z = light_positionView_z_array[lightIndex];
+        float light_attenuationEnd = light_attenuationEnd_array[lightIndex];
+        float light_attenuationEndNeg = -light_attenuationEnd;
+        
+        // Test lights against subtile z bounds (expanded by light_attenuationEnd)
+        bool inFrustum[4];
+        inFrustum[0] = (light_positionView_z - subtileMinZ[0] >= light_attenuationEndNeg) &&
+            (subtileMaxZ[0] - light_positionView_z >= light_attenuationEndNeg);
+        inFrustum[1] = (light_positionView_z - subtileMinZ[1] >= light_attenuationEndNeg) && 
+            (subtileMaxZ[1] - light_positionView_z >= light_attenuationEndNeg);
+        inFrustum[2] = (light_positionView_z - subtileMinZ[2] >= light_attenuationEndNeg) && 
+            (subtileMaxZ[2] - light_positionView_z >= light_attenuationEndNeg);
+        inFrustum[3] = (light_positionView_z - subtileMinZ[3] >= light_attenuationEndNeg) && 
+            (subtileMaxZ[3] - light_positionView_z >= light_attenuationEndNeg);
+        // Dot the light position with the normalized X and Y split planes
+        float dx = light_positionView_z * frustumPlanes_z[0] + 
+            light_positionView_x * frustumPlanes_xy[0];
+        float dy = light_positionView_z * frustumPlanes_z[1] +
+            light_positionView_y * frustumPlanes_xy[1];
+        // Beyond light_attenuationEnd from a split plane, keep only the subtiles on the light's side
+        cif (abs(dx) > light_attenuationEnd) {
+            bool positiveX = dx > 0.0f;
+            inFrustum[0] = inFrustum[0] &&  positiveX;    // 00 subtile
+            inFrustum[1] = inFrustum[1] && !positiveX;    // 10 subtile
+            inFrustum[2] = inFrustum[2] &&  positiveX;    // 01 subtile
+            inFrustum[3] = inFrustum[3] && !positiveX;    // 11 subtile
+        }
+        cif (abs(dy) > light_attenuationEnd) {
+            bool positiveY = dy > 0.0f;
+            inFrustum[0] = inFrustum[0] &&  positiveY;    // 00 subtile
+            inFrustum[1] = inFrustum[1] &&  positiveY;    // 10 subtile
+            inFrustum[2] = inFrustum[2] && !positiveY;    // 01 subtile
+            inFrustum[3] = inFrustum[3] && !positiveY;    // 11 subtile
+        }
+
+        // Pack and store intersecting lights, advancing each subtile's write cursor
+        // TODO: Experiment with a loop here instead
+        cif (inFrustum[0])
+            subtileLightOffset[0] += packed_store_active(subtileIndices, 
+                                                         subtileLightOffset[0], 
+                                                         lightIndex);
+        cif (inFrustum[1])
+            subtileLightOffset[1] += packed_store_active(subtileIndices, 
+                                                         subtileLightOffset[1], 
+                                                         lightIndex);
+        cif (inFrustum[2])
+            subtileLightOffset[2] += packed_store_active(subtileIndices, 
+                                                         subtileLightOffset[2], 
+                                                         lightIndex);
+        cif (inFrustum[3])
+            subtileLightOffset[3] += packed_store_active(subtileIndices, 
+                                                         subtileLightOffset[3], 
+                                                         lightIndex);
+    }
+    // Convert the end cursors back into per-subtile light counts
+    subtileNumLights[0] = subtileLightOffset[0] - 0 * subtileIndicesPitch;
+    subtileNumLights[1] = subtileLightOffset[1] - 1 * subtileIndicesPitch;
+    subtileNumLights[2] = subtileLightOffset[2] - 2 * subtileIndicesPitch;
+    subtileNumLights[3] = subtileLightOffset[3] - 3 * subtileIndicesPitch;
+}
diff --git a/examples/deferred/main.cpp b/examples/deferred/main.cpp
new file mode 100644
index 00000000..40964295
--- /dev/null
+++ b/examples/deferred/main.cpp
@@ -0,0 +1,137 @@
+/*
+  Copyright (c) 2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+#ifdef _MSC_VER
+#define ISPC_IS_WINDOWS
+#define NOMINMAX
+#elif defined(__linux__)
+#define ISPC_IS_LINUX
+#elif defined(__APPLE__)
+#define ISPC_IS_APPLE
+#endif
+
+#include <fcntl.h>
+#include <float.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <stdint.h>
+#include <algorithm>
+#include <assert.h>
+#include <vector>
+#ifdef ISPC_IS_WINDOWS
+  #define WIN32_LEAN_AND_MEAN
+  #include <windows.h>
+#endif
+#include "deferred.h"
+#include "kernels_ispc.h"
+#include "../timing.h"
+
+///////////////////////////////////////////////////////////////////////////
+
+int main(int argc, char** argv) {
+    if (argc != 2) {
+        printf("usage: deferred_shading <input_file (e.g. data/pp1280x720.bin)>\n");
+        return 1;
+    }
+
+    // Load the input named on the command line and size the framebuffer to match
+    InputData *input = CreateInputDataFromFile(argv[1]);
+    if (input == NULL) {
+        printf("Failed to load input file \"%s\"!\n", argv[1]);
+        return 1;
+    }
+    Framebuffer framebuffer(input->header.framebufferWidth,
+                            input->header.framebufferHeight);
+
+    InitDynamicC(input);
+#ifdef __cilkplusplus
+    InitDynamicCilk(input);
+#endif // __cilkplusplus
+
+    // Each variant: numRuns timed runs of framesPerRun frames; best per-frame average is kept
+    const int numRuns = 5, framesPerRun = 5;
+    double staticBest = 1e30;
+    for (int run = 0; run < numRuns; ++run) {
+        framebuffer.clear();
+        reset_and_start_timer();
+        for (int frame = 0; frame < framesPerRun; ++frame)
+            ispc::RenderStatic(&input->header, &input->arrays, VISUALIZE_LIGHT_COUNT,
+                               framebuffer.r, framebuffer.g, framebuffer.b);
+        double perFrame = get_elapsed_mcycles() / framesPerRun;
+        if (perFrame < staticBest) staticBest = perFrame;
+    }
+    printf("[ispc static + tasks]:\t\t[%.3f] million cycles to render "
+           "%d x %d image\n", staticBest,
+           input->header.framebufferWidth, input->header.framebufferHeight);
+    WriteFrame("deferred-ispc-static.ppm", input, framebuffer);
+
+    // Serial C++ implementation, timed the same way
+    double serialBest = 1e30;
+    for (int run = 0; run < numRuns; ++run) {
+        framebuffer.clear();
+        reset_and_start_timer();
+        for (int frame = 0; frame < framesPerRun; ++frame)
+            DispatchDynamicC(input, &framebuffer);
+        double perFrame = get_elapsed_mcycles() / framesPerRun;
+        if (perFrame < serialBest) serialBest = perFrame;
+    }
+    printf("[C++ serial dynamic, 1 core]:\t[%.3f] million cycles\n", serialBest);
+    WriteFrame("deferred-serial-dynamic.ppm", input, framebuffer);
+
+#ifdef __cilkplusplus
+    // Cilk-dispatched variant, only built when the compiler defines __cilkplusplus
+    double cilkBest = 1e30;
+    for (int run = 0; run < numRuns; ++run) {
+        framebuffer.clear();
+        reset_and_start_timer();
+        for (int frame = 0; frame < framesPerRun; ++frame)
+            DispatchDynamicCilk(input, &framebuffer);
+        double perFrame = get_elapsed_mcycles() / framesPerRun;
+        if (perFrame < cilkBest) cilkBest = perFrame;
+    }
+    printf("[ispc + Cilk dynamic]:\t\t[%.3f] million cycles\n", cilkBest);
+    WriteFrame("deferred-ispc-dynamic.ppm", input, framebuffer);
+
+    printf("\t\t\t\t(%.2fx speedup from static ISPC, %.2fx from Cilk+ISPC)\n", 
+           serialBest/staticBest, serialBest/cilkBest);
+#else
+    printf("\t\t\t\t(%.2fx speedup from ISPC)\n", serialBest/staticBest);
+#endif // __cilkplusplus
+
+    DeleteInputData(input);
+
+    return 0;
+}
diff --git a/examples/examples.sln b/examples/examples.sln
index 4088045c..102dbade 100755
--- a/examples/examples.sln
+++ b/examples/examples.sln
@@ -18,8 +18,11 @@ EndProject
 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "noise", "noise\noise.vcxproj", "{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}"
 EndProject
 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "volume", "volume_rendering\volume.vcxproj", "{DEE5733A-E93E-449D-9114-9BFFCAEB4DF9}"
+EndProject
 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "stencil", "stencil\stencil.vcxproj", "{2EF070A1-F62F-4E6A-944B-88D140945C3C}"
 EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "deferred_shading", "deferred\deferred_shading.vcxproj", "{87F53C53-957E-4E91-878A-BC27828FB9EB}"
+EndProject
 Global
 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
 		Debug|Win32 = Debug|Win32
@@ -108,6 +111,14 @@ Global
 		{2EF070A1-F62F-4E6A-944B-88D140945C3C}.Release|Win32.Build.0 = Release|Win32
 		{2EF070A1-F62F-4E6A-944B-88D140945C3C}.Release|x64.ActiveCfg = Release|x64
 		{2EF070A1-F62F-4E6A-944B-88D140945C3C}.Release|x64.Build.0 = Release|x64
+		{87F53C53-957E-4E91-878A-BC27828FB9EB}.Debug|Win32.ActiveCfg = Debug|Win32
+		{87F53C53-957E-4E91-878A-BC27828FB9EB}.Debug|Win32.Build.0 = Debug|Win32
+		{87F53C53-957E-4E91-878A-BC27828FB9EB}.Debug|x64.ActiveCfg = Debug|x64
+		{87F53C53-957E-4E91-878A-BC27828FB9EB}.Debug|x64.Build.0 = Debug|x64
+		{87F53C53-957E-4E91-878A-BC27828FB9EB}.Release|Win32.ActiveCfg = Release|Win32
+		{87F53C53-957E-4E91-878A-BC27828FB9EB}.Release|Win32.Build.0 = Release|Win32
+		{87F53C53-957E-4E91-878A-BC27828FB9EB}.Release|x64.ActiveCfg = Release|x64
+		{87F53C53-957E-4E91-878A-BC27828FB9EB}.Release|x64.Build.0 = Release|x64
 	EndGlobalSection
 	GlobalSection(SolutionProperties) = preSolution
 		HideSolutionNode = FALSE