From 260b1ad8872b9287307f4a84fb88a1799759ebb2 Mon Sep 17 00:00:00 2001
From: Evghenii <egaburov@dds.nl>
Date: Mon, 6 Jan 2014 12:26:49 +0100
Subject: [PATCH] added deferred example

---
 examples_ptx/common_gpu.mk              |  21 +-
 examples_ptx/deferred/Makefile_cpu      |   3 +-
 examples_ptx/deferred/Makefile_gpu      |   3 +
 examples_ptx/deferred/common.cpp        |  12 +
 examples_ptx/deferred/deferred.h        |   2 +-
 examples_ptx/deferred/ispc_malloc_gcc.o | Bin 0 -> 3040 bytes
 examples_ptx/deferred/kernels.cu        |  19 +-
 examples_ptx/deferred/kernels.ispc      |  56 ++-
 examples_ptx/deferred/kernels1.ispc     | 556 ------------------------
 examples_ptx/deferred/main.cpp          |  18 +-
 examples_ptx/deferred/test.cpp          |  37 ++
 11 files changed, 144 insertions(+), 583 deletions(-)
 create mode 100644 examples_ptx/deferred/ispc_malloc_gcc.o
 delete mode 100644 examples_ptx/deferred/kernels1.ispc
 create mode 100644 examples_ptx/deferred/test.cpp

diff --git a/examples_ptx/common_gpu.mk b/examples_ptx/common_gpu.mk
index bd3e106f..c4628559 100644
--- a/examples_ptx/common_gpu.mk
+++ b/examples_ptx/common_gpu.mk
@@ -5,7 +5,7 @@ CXX=g++ -ffast-math
 CXXFLAGS=-O3 -I$(CUDATK)/include -Iobjs_gpu/ -D_CUDA_
 #
 NVCC=nvcc
-NVCC_FLAGS=-O3 -arch=sm_35 -D_CUDA_ -I../ -Xptxas=-v
+NVCC_FLAGS=-O3 -arch=sm_35 -D_CUDA_ -I../ -Xptxas=-v -Iobjs_gpu/
 ifdef PTXCC_REGMAX
   NVCC_FLAGS += --maxrregcount=$(PTXCC_REGMAX)
 endif
@@ -52,13 +52,18 @@ LLC_FLAGS=-march=nvptx64 -mcpu=sm_35
 # .SUFFIXES: .bc .o .cu 
 
 ifdef LLVM_GPU
-  OBJSgpu_llvm=$(ISPC_LLVM_OBJS) $(CXX_OBJS) $(NVCC_OBJS)
-  PROGgpu_llvm = $(PROG)_llvm_gpu
+  OBJSgpu_llvm=$(ISPC_LLVM_OBJS) $(CXX_OBJS) $(NVCC_OBJS) 
+  PROGgpu_llvm=$(PROG)_llvm_gpu
+else
+  ISPC_LLVM_PTX=
 endif
 
+
 ifdef NVVM_GPU
-  OBJSgpu_nvvm=$(ISPC_NVVM_OBJS) $(CXX_OBJS) $(NVCC_OBJS)
-  PROGgpu_nvvm = $(PROG)_nvvm_gpu
+  OBJSgpu_nvvm=$(ISPC_NVVM_OBJS) $(CXX_OBJS) $(NVCC_OBJS) $(ISPC_LVVM_PTX)
+  PROGgpu_nvvm=$(PROG)_nvvm_gpu
+else
+  ISPC_NVVM_PTX=
 endif
 
 ifdef CU_SRC
@@ -68,9 +73,9 @@ endif
 
 
 all: dirs  \
-	$(PROGgpu_nvvm) $(ISPC_NVVM_PTX)  \
-	$(PROGgpu_llvm) $(ISPC_LLVM_PTX)  \
-	$(PROGcu) $(ISPC_BC) 
+	$(PROGgpu_nvvm)  \
+	$(PROGgpu_llvm)  \
+	$(PROGcu) $(ISPC_BC)  $(ISPC_HEADERS) $(ISPC_NVVM_PTX) $(ISPC_LLVM_PTX)
 
 dirs:
 	/bin/mkdir -p objs_gpu/
diff --git a/examples_ptx/deferred/Makefile_cpu b/examples_ptx/deferred/Makefile_cpu
index b21cc643..63f442d0 100644
--- a/examples_ptx/deferred/Makefile_cpu
+++ b/examples_ptx/deferred/Makefile_cpu
@@ -1,6 +1,7 @@
 
 EXAMPLE=deferred_shading
-CPP_SRC=common.cpp main.cpp dynamic_c.cpp dynamic_cilk.cpp
+CPP_SRC=common.cpp main.cpp dynamic_c.cpp 
+# CPP_SRC+=dynamic_cilk.cpp
 ISPC_SRC=kernels.ispc
 ISPC_IA_TARGETS=avx1-i32x16
 ISPC_ARM_TARGETS=neon
diff --git a/examples_ptx/deferred/Makefile_gpu b/examples_ptx/deferred/Makefile_gpu
index df90e13b..1145ea4d 100644
--- a/examples_ptx/deferred/Makefile_gpu
+++ b/examples_ptx/deferred/Makefile_gpu
@@ -4,6 +4,9 @@ CU_SRC=kernels.cu
 CXX_SRC=common.cpp  main.cpp
 PTXCC_REGMAX=64
 
+NVVM_GPU=1
+LLVM_GPU=1
+
 include ../common_gpu.mk
 
 
diff --git a/examples_ptx/deferred/common.cpp b/examples_ptx/deferred/common.cpp
index fa4ee57b..3604d850 100644
--- a/examples_ptx/deferred/common.cpp
+++ b/examples_ptx/deferred/common.cpp
@@ -60,11 +60,13 @@
 #endif
 #include "deferred.h"
 #include "../timing.h"
+#include "../ispc_malloc.h"
 
 ///////////////////////////////////////////////////////////////////////////
 
 static void *
 lAlignedMalloc(size_t size, int32_t alignment) {
+#ifndef _CUDA_
 #ifdef ISPC_IS_WINDOWS
     return _aligned_malloc(size, alignment);
 #endif
@@ -79,11 +81,18 @@ lAlignedMalloc(size_t size, int32_t alignment) {
     ((void**)amem)[-1] = mem;
     return amem;
 #endif
+#else
+    void *ptr;
+    ispc_malloc(&ptr, size);
+    return ptr;
+#endif
+
 }
 
 
 static void
 lAlignedFree(void *ptr) {
+#ifndef _CUDA_
 #ifdef ISPC_IS_WINDOWS
     _aligned_free(ptr);
 #endif
@@ -93,6 +102,9 @@ lAlignedFree(void *ptr) {
 #ifdef ISPC_IS_APPLE
     free(((void**)ptr)[-1]);
 #endif
+#else
+    ispc_free(ptr);
+#endif
 }
 
 
diff --git a/examples_ptx/deferred/deferred.h b/examples_ptx/deferred/deferred.h
index 5e814ca5..485315a8 100644
--- a/examples_ptx/deferred/deferred.h
+++ b/examples_ptx/deferred/deferred.h
@@ -35,7 +35,7 @@
 #define DEFERRED_H
 
 // Currently tile widths must be a multiple of SIMD width (i.e. 8 for ispc sse4x2)!
-#define MIN_TILE_WIDTH 16
+#define MIN_TILE_WIDTH 64
 #define MIN_TILE_HEIGHT 16
 #define MAX_LIGHTS 1024
 
diff --git a/examples_ptx/deferred/ispc_malloc_gcc.o b/examples_ptx/deferred/ispc_malloc_gcc.o
new file mode 100644
index 0000000000000000000000000000000000000000..0e0954c097ad87889a6bd2dee363698f6d48a500
GIT binary patch
literal 3040
zcmbtW&2Jk;6rar|WC^X~hK~jniKMC^RbdrN1yYbuN>Z|s+~PJ=LP2O)ueXVXy|%oY
z)a|8}EDjL|a^QftaqE#Y2S88>{sI05id3X>tt!I%&Av${ll7@j+IjE&-q*ahGhW}h
zRQ+(oafBvEye|$kjS4Y0av(1%aZyZ(Nf8VlP6s>x1v`H|02+KXdL|g$pZ+Zv{OSZd
zzh`3#R?Fn8%j7V4=;fY+`(BY3CE0UNINrqAv@<&)#2r;k?P<?F?s%hPCFkS;j(k7E
zMJ4?a(oa|vMmzEThGx*p($c##b1PS_pPf1HzvEZf^a^w5cCC6%IQ{at3&cHjY~0(W
zfxv!-ea;adg}hvvS(_+%7oW@R)SjYQTG?J+W%~<kb8f5bj?N2Q*M1E?%cb&XB~J)H
zz17abI`OQR^Gz-F;#TNqaX%A&Bg{hK*V9z^3f+hz8TX^On@I|4#SFul5{12NtIJT7
zb$S#Pb=vJXp-f39i+vi)o8%UK!A|j}ywMBWF>n4U*bK$uTs>puSY-XGg_kY;6zOWF
z7B1*0fOc~x5QjQ6K}FNdF<ap3tuhAojUk(&UF!`pBImGp<TlBUf18Z(F|Ltv*Y(eu
z!6e)9IHxv-kX`Oy_lf?;=&}31Oc?6h^Z$h~kUD$0s8)Oa0ii|p?-k5{oP6#2pA%Lz
z{|AJFw12K0)juUeQT+wFdmyv_XGz!jG`>L?>d(_-(j!);Ops*wf;8_MvNG^8q8&`I
z!1qhW?Dy|+{H_(R_pjl*Boy^u<NkGgnf#8^qvLh|CkV68E<xg3uKzU@gY34T+k1!m
zJw8yMun~2N>f@ah)fX@jJ<i$wYS-U4SOWY;#lL$NtW<aZ?H#td#XBLiXu{YxZq!k7
z1y1^5LjNNMuJ0ANW8^pcbM+7or=q&YEF@Q5HQ}QsSMXslOmC4qWz69m+ukFNvtm<?
zfFisx1W$+HJ45jA2*-ZriGyZyCcY<LRQHD=_#eOv4&<Lh{Qnt(i)Px5WIJrNI*}iB
zyF#v1Kl<QeRmya$E?1>oPj0q_Twlwo3sN>a&7_%;szABX@5^r7OFK!}YG$|O_B@#?
z6(eP>woTF3lbg+?A+N>h7JW;ljQXLBsHEIV;(j-dvbaHkAFr-u7wTa{s?8A4Xw|lf
zT~PUM^x{~lSJtwHW+#>PFpbZz(sq_Am&i>v(vI8=lLmeC<@yn)<=hFCL;5#BP2{9c
z3`RLg&y&5JPO}|2-iyXxH*lPFjpLkw;E(?_8V>*x*?I6i0s%Mk=re%Nius$YZy<2<
zmh$IE<#~~8x-NPF!Tyqx@&$VX*?C^hVTGS$8SFI6*e8BVb$*7`&ElUx1wGFO<S%I_
n;&Ut`4<1bw7T~XY1Z2(){__BduI5bb6aM=5{VwD1HTUyBuFWE_

literal 0
HcmV?d00001

diff --git a/examples_ptx/deferred/kernels.cu b/examples_ptx/deferred/kernels.cu
index 2530532a..f6bb02f2 100644
--- a/examples_ptx/deferred/kernels.cu
+++ b/examples_ptx/deferred/kernels.cu
@@ -733,7 +733,7 @@ RenderTile( int num_groups_x,  int num_groups_y,
 
 
 extern "C" __global__ void
-RenderStatic( InputHeader inputHeaderPtr[],
+RenderStatic___export( InputHeader inputHeaderPtr[],
               InputDataArrays inputDataPtr[],
               int visualizeLightCount,
              // Output
@@ -759,3 +759,20 @@ RenderStatic( InputHeader inputHeaderPtr[],
            framebuffer_r, framebuffer_g, framebuffer_b);
      cudaDeviceSynchronize();
 }
+extern "C" __host__ void
+RenderStatic( InputHeader inputHeaderPtr[],
+              InputDataArrays inputDataPtr[],
+              int visualizeLightCount,
+             // Output
+              unsigned int8 framebuffer_r[],
+              unsigned int8 framebuffer_g[],
+              unsigned int8 framebuffer_b[]) {
+  RenderStatic___export<<<1,32>>>( inputHeaderPtr,
+              inputDataPtr,
+              visualizeLightCount,
+             // Output
+              framebuffer_r,
+              framebuffer_g,
+              framebuffer_b);
+     cudaDeviceSynchronize();
+}
diff --git a/examples_ptx/deferred/kernels.ispc b/examples_ptx/deferred/kernels.ispc
index 6d2a8cc9..a39723c4 100644
--- a/examples_ptx/deferred/kernels.ispc
+++ b/examples_ptx/deferred/kernels.ispc
@@ -97,6 +97,9 @@ Float32ToUnorm8(float f) {
 }
 
 
+#if 1
+inline
+#endif
 static void
 ComputeZBounds(
     uniform int32 tileStartX, uniform int32 tileEndX,
@@ -133,8 +136,13 @@ ComputeZBounds(
     maxZ = reduce_max(laneMaxZ);
 }
 
-
-export uniform int32
+#if 1 
+inline
+#endif
+#ifndef __NVPTX__
+export 
+#endif
+uniform int32
 IntersectLightsWithTileMinMax(
     uniform int32 tileStartX, uniform int32 tileEndX,
     uniform int32 tileStartY, uniform int32 tileEndY,
@@ -212,12 +220,18 @@ IntersectLightsWithTileMinMax(
             d = light_positionView_z * frustumPlanes_z[3] + 
                 light_positionView_y * frustumPlanes_xy[3];
             inFrustum = inFrustum && (d >= light_attenuationEndNeg);
-        
+       
+#if 0 
             // Pack and store intersecting lights
             cif (inFrustum) {
                 tileNumLights += packed_store_active(&tileLightIndices[tileNumLights], 
                                                      lightIndex);
             }
+#else
+        const bool active = inFrustum && lightIndex < numLights;
+        if(any(active))
+          tileNumLights += packed_store_active(active, &tileLightIndices[tileNumLights], lightIndex);
+#endif
         }
     }
 
@@ -225,6 +239,9 @@ IntersectLightsWithTileMinMax(
 }
 
 
+#if 1 
+inline
+#endif
 static uniform int32
 IntersectLightsWithTile(
     uniform int32 tileStartX, uniform int32 tileEndX,
@@ -262,7 +279,13 @@ IntersectLightsWithTile(
 }
 
 
-export void
+#if 1 
+inline
+#endif
+#ifndef __NVPTX__
+export
+#endif
+void
 ShadeTile(
     uniform int32 tileStartX, uniform int32 tileEndX,
     uniform int32 tileStartY, uniform int32 tileEndY,
@@ -451,13 +474,17 @@ ShadeTile(
 
 task void
 RenderTile(uniform int num_groups_x, uniform int num_groups_y,
-           uniform InputHeader &inputHeader,
-           uniform InputDataArrays &inputData,
+           uniform InputHeader inputHeaderPtr[],
+           uniform InputDataArrays inputDataPtr[],
            uniform int visualizeLightCount,
            // Output
            uniform unsigned int8 framebuffer_r[],
            uniform unsigned int8 framebuffer_g[],
            uniform unsigned int8 framebuffer_b[]) {
+
+  uniform InputHeader inputHeader = *inputHeaderPtr;
+  uniform InputDataArrays inputData = *inputDataPtr;
+
     uniform int32 group_y = taskIndex / num_groups_x;
     uniform int32 group_x = taskIndex % num_groups_x;
     uniform int32 tile_start_x = group_x * MIN_TILE_WIDTH;
@@ -473,7 +500,11 @@ RenderTile(uniform int num_groups_x, uniform int num_groups_y,
     uniform float cameraProj_32 = inputHeader.cameraProj[3][2];
 
     // Light intersection: figure out which lights illuminate this tile.
+#ifdef __NVPTX__
+    uniform int * uniform tileLightIndices = uniform new uniform int [MAX_LIGHTS];
+#else
     uniform int tileLightIndices[MAX_LIGHTS];  // Light list for the tile
+#endif
     uniform int numTileLights = 
         IntersectLightsWithTile(tile_start_x, tile_end_x, 
                                 tile_start_y, tile_end_y,
@@ -495,17 +526,24 @@ RenderTile(uniform int num_groups_x, uniform int num_groups_y,
               cameraProj_00, cameraProj_11, cameraProj_22, cameraProj_32,
               tileLightIndices, numTileLights, visualizeLightCount, 
               framebuffer_r, framebuffer_g, framebuffer_b);
+#ifdef __NVPTX__
+    delete tileLightIndices;
+#endif
 }
 
 
 export void
-RenderStatic(uniform InputHeader &inputHeader,
-             uniform InputDataArrays &inputData,
+RenderStatic(uniform InputHeader inputHeaderPtr[],
+             uniform InputDataArrays inputDataPtr[],
              uniform int visualizeLightCount,
              // Output
              uniform unsigned int8 framebuffer_r[],
              uniform unsigned int8 framebuffer_g[],
              uniform unsigned int8 framebuffer_b[]) {
+
+    uniform InputHeader inputHeader = *inputHeaderPtr;
+    uniform InputDataArrays inputData = *inputDataPtr;
+
     uniform int num_groups_x = (inputHeader.framebufferWidth + 
                                 MIN_TILE_WIDTH - 1) / MIN_TILE_WIDTH;
     uniform int num_groups_y = (inputHeader.framebufferHeight + 
@@ -515,7 +553,7 @@ RenderStatic(uniform InputHeader &inputHeader,
     // Launch a task to render each tile, each of which is MIN_TILE_WIDTH
     // by MIN_TILE_HEIGHT pixels.
     launch[num_groups] RenderTile(num_groups_x, num_groups_y,
-                                  inputHeader, inputData, visualizeLightCount,
+                                  inputHeaderPtr, inputDataPtr, visualizeLightCount,
                                   framebuffer_r, framebuffer_g, framebuffer_b);
 }
 
diff --git a/examples_ptx/deferred/kernels1.ispc b/examples_ptx/deferred/kernels1.ispc
deleted file mode 100644
index 1c0962cc..00000000
--- a/examples_ptx/deferred/kernels1.ispc
+++ /dev/null
@@ -1,556 +0,0 @@
-/*
-  Copyright (c) 2010-2011, Intel Corporation
-  All rights reserved.
-
-  Redistribution and use in source and binary forms, with or without
-  modification, are permitted provided that the following conditions are
-  met:
-
-    * Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-
-    * Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-
-    * Neither the name of Intel Corporation nor the names of its
-      contributors may be used to endorse or promote products derived from
-      this software without specific prior written permission.
-
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
-   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
-   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
-   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
-*/
-
-#ifdef __NVPTX__
-#warning "emitting DEVICE code"
-#define programCount warpSize()
-#define programIndex laneIndex()
-#define taskIndex    blockIndex0()
-#define taskCount    blockCount0()
-#define cif          if
-#else
-#warning "emitting HOST code"
-#endif
-
-
-#include "deferred.h"
-
-struct InputDataArrays
-{
-    float *zBuffer;
-    unsigned int16 *normalEncoded_x; // half float
-    unsigned int16 *normalEncoded_y; // half float
-    unsigned int16 *specularAmount; // half float
-    unsigned int16 *specularPower; // half float
-    unsigned int8 *albedo_x; // unorm8
-    unsigned int8 *albedo_y; // unorm8
-    unsigned int8 *albedo_z; // unorm8
-    float *lightPositionView_x;
-    float *lightPositionView_y;
-    float *lightPositionView_z;
-    float *lightAttenuationBegin;
-    float *lightColor_x;
-    float *lightColor_y;
-    float *lightColor_z;
-    float *lightAttenuationEnd;
-};
-
-struct InputHeader
-{
-    float cameraProj[4][4];
-    float cameraNear;
-    float cameraFar;
-
-    int32 framebufferWidth;
-    int32 framebufferHeight;
-    int32 numLights;
-    int32 inputDataChunkSize;
-    int32 inputDataArrayOffsets[idaNum];
-};
-
-
-///////////////////////////////////////////////////////////////////////////
-// Common utility routines
-
-static inline float
-dot3(float x, float y, float z, float a, float b, float c) {
-    return (x*a + y*b + z*c);
-}
-
-
-static inline void
-normalize3(float x, float y, float z, float &ox, float &oy, float &oz) {
-    float n = rsqrt(x*x + y*y + z*z);
-    ox = x * n;
-    oy = y * n;
-    oz = z * n;
-}
-
-
-static inline float
-Unorm8ToFloat32(unsigned int8 u) {
-    return (float)u * (1.0f / 255.0f);
-}
-
-
-static inline unsigned int8
-Float32ToUnorm8(float f) {
-    return (unsigned int8)(f * 255.0f);
-}
-
-
-static inline void
-ComputeZBounds(
-    uniform int32 tileStartX, uniform int32 tileEndX,
-    uniform int32 tileStartY, uniform int32 tileEndY,
-    // G-buffer data
-    uniform float zBuffer[],
-    uniform int32 gBufferWidth,
-    // Camera data
-    uniform float cameraProj_33, uniform float cameraProj_43,
-    uniform float cameraNear, uniform float cameraFar,
-    // Output
-    uniform float &minZ,
-    uniform float &maxZ
-    )
-{
-    // Find Z bounds
-    float laneMinZ = cameraFar;
-    float laneMaxZ = cameraNear;
-    for (uniform int32 y = tileStartY; y < tileEndY; ++y) 
-      foreach (x = tileStartX ... tileEndX) 
-      {
-        // Unproject depth buffer Z value into view space
-        float z = zBuffer[y * gBufferWidth + x];
-        float viewSpaceZ = cameraProj_43 / (z - cameraProj_33);
-
-        // Work out Z bounds for our samples
-        // Avoid considering skybox/background or otherwise invalid pixels
-        if ((viewSpaceZ < cameraFar) && (viewSpaceZ >= cameraNear)) {
-          laneMinZ = min(laneMinZ, viewSpaceZ);
-          laneMaxZ = max(laneMaxZ, viewSpaceZ);
-        }
-      }
-    minZ = reduce_min(laneMinZ);
-    maxZ = reduce_max(laneMaxZ);
-}
-
-
-static inline uniform int32
-IntersectLightsWithTileMinMax(
-    uniform int32 tileStartX, uniform int32 tileEndX,
-    uniform int32 tileStartY, uniform int32 tileEndY,
-    // Tile data
-    uniform float minZ,
-    uniform float maxZ,
-    // G-buffer data
-    uniform int32 gBufferWidth, uniform int32 gBufferHeight,
-    // Camera data
-    uniform float cameraProj_11, uniform float cameraProj_22,
-    // Light Data
-    uniform int32 numLights,
-    uniform float light_positionView_x_array[],
-    uniform float light_positionView_y_array[],
-    uniform float light_positionView_z_array[],
-    uniform float light_attenuationEnd_array[],
-    // Output
-    uniform int32 tileLightIndices[]
-    )
-{
-    uniform float gBufferScale_x = 0.5f * (float)gBufferWidth;
-    uniform float gBufferScale_y = 0.5f * (float)gBufferHeight;
-        
-    uniform float frustumPlanes_xy[4] = {
-        -(cameraProj_11 * gBufferScale_x),
-         (cameraProj_11 * gBufferScale_x),
-         (cameraProj_22 * gBufferScale_y),
-        -(cameraProj_22 * gBufferScale_y) };
-    uniform float frustumPlanes_z[4] = {
-         tileEndX - gBufferScale_x,
-        -tileStartX + gBufferScale_x,
-         tileEndY - gBufferScale_y,
-        -tileStartY + gBufferScale_y };
-
-    for (uniform int i = 0; i < 4; ++i) {
-        uniform float norm = rsqrt(frustumPlanes_xy[i] * frustumPlanes_xy[i] + 
-                                   frustumPlanes_z[i] * frustumPlanes_z[i]);
-        frustumPlanes_xy[i] *= norm;
-        frustumPlanes_z[i] *= norm;
-    }
-
-    uniform int32 tileNumLights = 0;
-
-    foreach (lightIndex = 0 ... numLights) 
-    {
-      float light_positionView_z = light_positionView_z_array[lightIndex];
-      float light_attenuationEnd = light_attenuationEnd_array[lightIndex];
-      float light_attenuationEndNeg = -light_attenuationEnd;
-
-      float d = light_positionView_z - minZ;
-      bool inFrustum = (d >= light_attenuationEndNeg);
-
-      d = maxZ - light_positionView_z;
-      inFrustum = inFrustum && (d >= light_attenuationEndNeg);
-
-      // This seems better than cif(!inFrustum) ccontinue; here since we
-      // don't actually need to mask the rest of this function - this is
-      // just a greedy early-out.  Could also structure all of this as
-      // nested if() statements, but this a bit easier to read
-      if (any(inFrustum)) 
-      {
-        float light_positionView_x = light_positionView_x_array[lightIndex];
-        float light_positionView_y = light_positionView_y_array[lightIndex];
-
-        d = light_positionView_z * frustumPlanes_z[0] + 
-          light_positionView_x * frustumPlanes_xy[0];
-        inFrustum = inFrustum && (d >= light_attenuationEndNeg);
-
-        d = light_positionView_z * frustumPlanes_z[1] + 
-          light_positionView_x * frustumPlanes_xy[1];
-        inFrustum = inFrustum && (d >= light_attenuationEndNeg);
-
-        d = light_positionView_z * frustumPlanes_z[2] + 
-          light_positionView_y * frustumPlanes_xy[2];
-        inFrustum = inFrustum && (d >= light_attenuationEndNeg);
-
-        d = light_positionView_z * frustumPlanes_z[3] + 
-          light_positionView_y * frustumPlanes_xy[3];
-        inFrustum = inFrustum && (d >= light_attenuationEndNeg);
-
-        // Pack and store intersecting lights
-        const bool active = inFrustum && lightIndex < numLights;
-
-        if(any(active))
-          tileNumLights += packed_store_active(active, &tileLightIndices[tileNumLights], lightIndex);
-      }
-    }
-
-    return tileNumLights;
-}
-
-
-static inline  uniform int32
-IntersectLightsWithTile(
-    uniform int32 tileStartX, uniform int32 tileEndX,
-    uniform int32 tileStartY, uniform int32 tileEndY,
-    uniform int32 gBufferWidth, uniform int32 gBufferHeight,
-    // G-buffer data
-    uniform float zBuffer[],
-    // Camera data
-    uniform float cameraProj_11, uniform float cameraProj_22,
-    uniform float cameraProj_33, uniform float cameraProj_43,
-    uniform float cameraNear, uniform float cameraFar,
-    // Light Data
-    uniform int32 numLights,
-    uniform float light_positionView_x_array[],
-    uniform float light_positionView_y_array[],
-    uniform float light_positionView_z_array[],
-    uniform float light_attenuationEnd_array[],
-    // Output
-    uniform int32 tileLightIndices[]
-    )
-{
-    uniform float minZ, maxZ;
-    ComputeZBounds(tileStartX, tileEndX, tileStartY, tileEndY,
-        zBuffer, gBufferWidth, cameraProj_33, cameraProj_43, cameraNear, cameraFar,
-        minZ, maxZ);
-
-    uniform int32 tileNumLights = IntersectLightsWithTileMinMax(
-        tileStartX, tileEndX, tileStartY, tileEndY, minZ, maxZ,
-        gBufferWidth, gBufferHeight, cameraProj_11, cameraProj_22,
-        MAX_LIGHTS, light_positionView_x_array, light_positionView_y_array, 
-        light_positionView_z_array, light_attenuationEnd_array,
-        tileLightIndices);
-
-    return tileNumLights;
-}
-
-
-static inline void
-ShadeTile(
-    uniform int32 tileStartX, uniform int32 tileEndX,
-    uniform int32 tileStartY, uniform int32 tileEndY,
-    uniform int32 gBufferWidth, uniform int32 gBufferHeight,
-    const uniform InputDataArrays &inputData,
-    // Camera data
-    uniform float cameraProj_11, uniform float cameraProj_22,
-    uniform float cameraProj_33, uniform float cameraProj_43,
-    // Light list
-    uniform int32 tileLightIndices[],
-    uniform int32 tileNumLights,
-    // UI
-    uniform bool visualizeLightCount,
-    // Output
-    uniform unsigned int8 framebuffer_r[],
-    uniform unsigned int8 framebuffer_g[],
-    uniform unsigned int8 framebuffer_b[]
-    )
-{
-    if (tileNumLights == 0 || visualizeLightCount) {
-        uniform unsigned int8 c = (unsigned int8)(min(tileNumLights << 2, 255));
-        for (uniform int32 y = tileStartY; y < tileEndY; ++y) 
-          foreach (x = tileStartX ... tileEndX) 
-          { 
-            int32 framebufferIndex = (y * gBufferWidth + x);
-            framebuffer_r[framebufferIndex] = c;
-            framebuffer_g[framebufferIndex] = c;
-            framebuffer_b[framebufferIndex] = c;
-          }
-    } else {
-        uniform float twoOverGBufferWidth = 2.0f / gBufferWidth;
-        uniform float twoOverGBufferHeight = 2.0f / gBufferHeight;
-        
-        for (uniform int32 y = tileStartY; y < tileEndY; ++y) {
-          uniform float positionScreen_y = -(((0.5f + y) * twoOverGBufferHeight) - 1.f);
-
-          foreach (x = tileStartX ... tileEndX) {
-            int32 gBufferOffset = y * gBufferWidth + x;
-
-            // Reconstruct position and (negative) view vector from G-buffer
-            float surface_positionView_x, surface_positionView_y, surface_positionView_z;
-            float Vneg_x, Vneg_y, Vneg_z;
-
-            float z = inputData.zBuffer[gBufferOffset];
-
-            // Compute screen/clip-space position
-            // NOTE: Mind DX11 viewport transform and pixel center!
-            float positionScreen_x = (0.5f + (float)(x)) * 
-              twoOverGBufferWidth - 1.0f;
-
-            // Unproject depth buffer Z value into view space
-            surface_positionView_z = cameraProj_43 / (z - cameraProj_33);
-            surface_positionView_x = positionScreen_x * surface_positionView_z / 
-              cameraProj_11;
-            surface_positionView_y = positionScreen_y * surface_positionView_z / 
-              cameraProj_22;
-
-            // We actually end up with a vector pointing *at* the
-            // surface (i.e. the negative view vector)
-            normalize3(surface_positionView_x, surface_positionView_y, 
-                surface_positionView_z, Vneg_x, Vneg_y, Vneg_z);
-
-            // Reconstruct normal from G-buffer
-            float surface_normal_x, surface_normal_y, surface_normal_z;
-            float normal_x = half_to_float(inputData.normalEncoded_x[gBufferOffset]);
-            float normal_y = half_to_float(inputData.normalEncoded_y[gBufferOffset]);
-
-            float f = (normal_x - normal_x * normal_x) + (normal_y - normal_y * normal_y);
-            float m = sqrt(4.0f * f - 1.0f);
-
-            surface_normal_x = m * (4.0f * normal_x - 2.0f);
-            surface_normal_y = m * (4.0f * normal_y - 2.0f);
-            surface_normal_z = 3.0f - 8.0f * f;
-
-            // Load other G-buffer parameters
-            float surface_specularAmount = 
-              half_to_float(inputData.specularAmount[gBufferOffset]);
-            float surface_specularPower  = 
-              half_to_float(inputData.specularPower[gBufferOffset]);
-            float surface_albedo_x = Unorm8ToFloat32(inputData.albedo_x[gBufferOffset]);
-            float surface_albedo_y = Unorm8ToFloat32(inputData.albedo_y[gBufferOffset]);
-            float surface_albedo_z = Unorm8ToFloat32(inputData.albedo_z[gBufferOffset]);
-
-            float lit_x = 0.0f;
-            float lit_y = 0.0f;
-            float lit_z = 0.0f;
-            for (uniform int32 tileLightIndex = 0; tileLightIndex < tileNumLights; 
-                ++tileLightIndex) {
-              uniform int32 lightIndex = tileLightIndices[tileLightIndex];
-
-              // Gather light data relevant to initial culling
-              uniform float light_positionView_x = 
-                inputData.lightPositionView_x[lightIndex];
-              uniform float light_positionView_y = 
-                inputData.lightPositionView_y[lightIndex];
-              uniform float light_positionView_z = 
-                inputData.lightPositionView_z[lightIndex];
-              uniform float light_attenuationEnd = 
-                inputData.lightAttenuationEnd[lightIndex];
-
-              // Compute light vector
-              float L_x = light_positionView_x - surface_positionView_x;
-              float L_y = light_positionView_y - surface_positionView_y;
-              float L_z = light_positionView_z - surface_positionView_z;
-
-              float distanceToLight2 = dot3(L_x, L_y, L_z, L_x, L_y, L_z);
-
-              // Clip at end of attenuation
-              float light_attenutaionEnd2 = light_attenuationEnd * light_attenuationEnd;
-
-              cif (distanceToLight2 < light_attenutaionEnd2) {                    
-                float distanceToLight = sqrt(distanceToLight2);
-
-                // HLSL "rcp" is allowed to be fairly inaccurate
-                float distanceToLightRcp = rcp(distanceToLight);
-                L_x *= distanceToLightRcp;
-                L_y *= distanceToLightRcp;
-                L_z *= distanceToLightRcp;
-
-                // Start computing brdf
-                float NdotL = dot3(surface_normal_x, surface_normal_y, 
-                    surface_normal_z, L_x, L_y, L_z);
-
-                // Clip back facing
-                cif (NdotL > 0.0f) {
-                  uniform float light_attenuationBegin = 
-                    inputData.lightAttenuationBegin[lightIndex];
-
-                  // Light distance attenuation (linstep)
-                  float lightRange = (light_attenuationEnd - light_attenuationBegin);
-                  float falloffPosition = (light_attenuationEnd - distanceToLight);
-                  float attenuation = min(falloffPosition / lightRange, 1.0f);
-
-                  float H_x = (L_x - Vneg_x);
-                  float H_y = (L_y - Vneg_y);
-                  float H_z = (L_z - Vneg_z);
-                  normalize3(H_x, H_y, H_z, H_x, H_y, H_z);
-
-                  float NdotH = dot3(surface_normal_x, surface_normal_y, 
-                      surface_normal_z, H_x, H_y, H_z);
-                  NdotH = max(NdotH, 0.0f);
-
-                  float specular = pow(NdotH, surface_specularPower);
-                  float specularNorm = (surface_specularPower + 2.0f) * 
-                    (1.0f / 8.0f);
-                  float specularContrib = surface_specularAmount * 
-                    specularNorm * specular;
-
-                  float k = attenuation * NdotL * (1.0f + specularContrib);
-
-                  uniform float light_color_x = inputData.lightColor_x[lightIndex];
-                  uniform float light_color_y = inputData.lightColor_y[lightIndex];
-                  uniform float light_color_z = inputData.lightColor_z[lightIndex];
-
-                  float lightContrib_x = surface_albedo_x * light_color_x;
-                  float lightContrib_y = surface_albedo_y * light_color_y;
-                  float lightContrib_z = surface_albedo_z * light_color_z;
-
-                  lit_x += lightContrib_x * k;
-                  lit_y += lightContrib_y * k;
-                  lit_z += lightContrib_z * k;
-                }
-              }
-            }
-
-            // Gamma correct
-            // These pows are pretty slow right now, but we can do
-            // something faster if really necessary to squeeze every
-            // last bit of performance out of it
-            float gamma = 1.0 / 2.2f;
-            lit_x = pow(clamp(lit_x, 0.0f, 1.0f), gamma);
-            lit_y = pow(clamp(lit_y, 0.0f, 1.0f), gamma);
-            lit_z = pow(clamp(lit_z, 0.0f, 1.0f), gamma);
-
-            framebuffer_r[gBufferOffset] = Float32ToUnorm8(lit_x);
-            framebuffer_g[gBufferOffset] = Float32ToUnorm8(lit_y);
-            framebuffer_b[gBufferOffset] = Float32ToUnorm8(lit_z);
-          }
-        }
-    }
-}
-
-
-///////////////////////////////////////////////////////////////////////////
-// Static decomposition
-
-void task
-RenderTile(uniform int num_groups_x, uniform int num_groups_y,
-           const  uniform InputHeader inputHeaderPtr[],
-           const  uniform InputDataArrays inputDataPtr[],
-           uniform int visualizeLightCount,
-           // Output
-           uniform unsigned int8 framebuffer_r[],
-           uniform unsigned int8 framebuffer_g[],
-           uniform unsigned int8 framebuffer_b[]) {
-  if (taskIndex >= taskCount) return;
-  const  uniform InputHeader inputHeader = *inputHeaderPtr;
-  const  uniform InputDataArrays inputData = *inputDataPtr;
-
-    uniform int32 group_y = taskIndex / num_groups_x;
-    uniform int32 group_x = taskIndex % num_groups_x;
-    uniform int32 tile_start_x = group_x * MIN_TILE_WIDTH;
-    uniform int32 tile_start_y = group_y * MIN_TILE_HEIGHT;
-    uniform int32 tile_end_x = tile_start_x + MIN_TILE_WIDTH;
-    uniform int32 tile_end_y = tile_start_y + MIN_TILE_HEIGHT;
-
-    uniform int framebufferWidth = inputHeader.framebufferWidth;
-    uniform int framebufferHeight = inputHeader.framebufferHeight;
-    uniform float cameraProj_00 = inputHeader.cameraProj[0][0];
-    uniform float cameraProj_11 = inputHeader.cameraProj[1][1];
-    uniform float cameraProj_22 = inputHeader.cameraProj[2][2];
-    uniform float cameraProj_32 = inputHeader.cameraProj[3][2];
-
-    // Light intersection: figure out which lights illuminate this tile.
-#if 1
-    uniform int * uniform tileLightIndices = uniform new uniform int [MAX_LIGHTS];
-#else
-    uniform int tileLightIndices[MAX_LIGHTS];  // Light list for the tile
-#endif
-#if 1
-    uniform int numTileLights = 
-        IntersectLightsWithTile(tile_start_x, tile_end_x, 
-                                tile_start_y, tile_end_y,
-                                framebufferWidth, framebufferHeight,
-                                inputData.zBuffer,
-                                cameraProj_00, cameraProj_11,
-                                cameraProj_22, cameraProj_32,
-                                inputHeader.cameraNear, inputHeader.cameraFar,
-                                MAX_LIGHTS,
-                                inputData.lightPositionView_x, 
-                                inputData.lightPositionView_y, 
-                                inputData.lightPositionView_z, 
-                                inputData.lightAttenuationEnd,
-                                tileLightIndices);
-
-    // And now shade the tile, using the lights in tileLightIndices
-    ShadeTile(tile_start_x, tile_end_x, tile_start_y, tile_end_y,
-              framebufferWidth, framebufferHeight, inputData,
-              cameraProj_00, cameraProj_11, cameraProj_22, cameraProj_32,
-              tileLightIndices, numTileLights, visualizeLightCount, 
-              framebuffer_r, framebuffer_g, framebuffer_b);
-#endif
-#if 1
-    delete tileLightIndices;
-#endif
-}
-
-
-export void
-RenderStatic(uniform InputHeader inputHeaderPtr[],
-             uniform InputDataArrays inputDataPtr[],
-             uniform int visualizeLightCount,
-             // Output
-             uniform unsigned int8 framebuffer_r[],
-             uniform unsigned int8 framebuffer_g[],
-             uniform unsigned int8 framebuffer_b[]) {
-
-  const uniform InputHeader inputHeader = *inputHeaderPtr;
-  const uniform InputDataArrays inputData = *inputDataPtr;
-
-
-    uniform int num_groups_x = (inputHeader.framebufferWidth + 
-                                MIN_TILE_WIDTH - 1) / MIN_TILE_WIDTH;
-    uniform int num_groups_y = (inputHeader.framebufferHeight + 
-                                MIN_TILE_HEIGHT - 1) / MIN_TILE_HEIGHT;
-    uniform int num_groups = num_groups_x * num_groups_y;
-
-    // Launch a task to render each tile, each of which is MIN_TILE_WIDTH
-    // by MIN_TILE_HEIGHT pixels.
-    launch[num_groups] RenderTile(num_groups_x, num_groups_y,
-                                  inputHeaderPtr, inputDataPtr, visualizeLightCount,
-                                  framebuffer_r, framebuffer_g, framebuffer_b);
-    sync;
-}
-
-
-
diff --git a/examples_ptx/deferred/main.cpp b/examples_ptx/deferred/main.cpp
index cc5ce300..05d38f28 100644
--- a/examples_ptx/deferred/main.cpp
+++ b/examples_ptx/deferred/main.cpp
@@ -83,10 +83,12 @@ int main(int argc, char** argv) {
     Framebuffer framebuffer(input->header.framebufferWidth,
                             input->header.framebufferHeight);
 
+#ifndef _CUDA_
     InitDynamicC(input);
-#ifdef __cilk
+#if 0 //def __cilk
     InitDynamicCilk(input);
 #endif // __cilk
+#endif
 
     int nframes = test_iterations[2];
     double ispcCycles = 1e30;
@@ -94,11 +96,11 @@ int main(int argc, char** argv) {
         framebuffer.clear();
         reset_and_start_timer();
         for (int j = 0; j < nframes; ++j)
-            ispc::RenderStatic(input->header, input->arrays,
+            ispc::RenderStatic(&input->header, &input->arrays,
                                VISUALIZE_LIGHT_COUNT,
                                framebuffer.r, framebuffer.g, framebuffer.b);
         double msec = get_elapsed_msec() / nframes;
-        printf("@time of ISPC + TASKS run:\t\t\t[%.3f] msec\n", msec);
+        printf("@time of ISPC + TASKS run:\t\t\t[%.3f] msec [%.3f fps]\n", msec, 1.0e3/msec);
         ispcCycles = std::min(ispcCycles, msec);
     }
     printf("[ispc static + tasks]:\t\t[%.3f] msec to render "
@@ -106,8 +108,9 @@ int main(int argc, char** argv) {
            input->header.framebufferWidth, input->header.framebufferHeight);
     WriteFrame("deferred-ispc-static.ppm", input, framebuffer);
 
+#ifndef _CUDA_
     nframes = 3;
-#ifdef __cilk
+#if 0 //def __cilk
     double dynamicCilkCycles = 1e30;
     for (int i = 0; i < test_iterations[1]; ++i) {
         framebuffer.clear();
@@ -115,7 +118,7 @@ int main(int argc, char** argv) {
         for (int j = 0; j < nframes; ++j)
             DispatchDynamicCilk(input, &framebuffer);
         double msec = get_elapsed_msec() / nframes;
-        printf("@time of serial run:\t\t\t[%.3f] msec\n", msec);
+        printf("@time of serial run:\t\t\t[%.3f] msec [%.3f fps]\n", msec, 1.0e3/msec);
         dynamicCilkCycles = std::min(dynamicCilkCycles, msec);
     }
     printf("[ispc + Cilk dynamic]:\t\t[%.3f] msec to render image\n", 
@@ -130,19 +133,20 @@ int main(int argc, char** argv) {
         for (int j = 0; j < nframes; ++j)
             DispatchDynamicC(input, &framebuffer);
         double msec = get_elapsed_msec() / nframes;
-        printf("@time of serial run:\t\t\t[%.3f] msec\n", msec);
+        printf("@time of serial run:\t\t\t[%.3f] msec [%.3f fps]\n", msec, 1.0e3/msec);
         serialCycles = std::min(serialCycles, msec);
     }
     printf("[C++ serial dynamic, 1 core]:\t[%.3f] msec to render image\n", 
            serialCycles);
     WriteFrame("deferred-serial-dynamic.ppm", input, framebuffer);
 
-#ifdef __cilk
+#if 0 //def __cilk
     printf("\t\t\t\t(%.2fx speedup from static ISPC, %.2fx from Cilk+ISPC)\n", 
            serialCycles/ispcCycles, serialCycles/dynamicCilkCycles);
 #else
     printf("\t\t\t\t(%.2fx speedup from ISPC + tasks)\n", serialCycles/ispcCycles);
 #endif // __cilk
+#endif
 
     DeleteInputData(input);
 
diff --git a/examples_ptx/deferred/test.cpp b/examples_ptx/deferred/test.cpp
new file mode 100644
index 00000000..0c2e49dc
--- /dev/null
+++ b/examples_ptx/deferred/test.cpp
@@ -0,0 +1,37 @@
+#include <iostream>
+#include <cstdlib>
+#include <cstdio>
+
+struct Case
+{
+  int a; float b;
+};
+
+#if 0
+void * operator new(size_t s) throw(std::bad_alloc)
+{
+  fprintf(stderr, "alloc %d bytes\n", (int)s);
+  return (void*)0x123;
+}
+void operator delete(void *p) throw()
+{
+  fprintf(stderr, " free \n");
+}
+#else
+inline void *malloc(size_t size)
+{
+  fprintf(stderr, "alloc %d bytes\n", (int)size);
+  return (void*)0x123;
+}
+inline void free(void *ptr)
+{
+  fprintf(stderr, " free \n");
+}
+#endif
+
+int main()
+{
+  Case *ptr = new Case[10];
+  delete ptr;
+  return 0;
+}