From 599ada8354d41958c8a615cf7805a329833e174c Mon Sep 17 00:00:00 2001
From: Evghenii <egaburov@dds.nl>
Date: Thu, 14 Nov 2013 16:49:47 +0100
Subject: [PATCH] added deferred shading foreach_tile

---
 examples_cuda/deferred/kernels1.ispc | 311 +++++++++++++--------------
 examples_cuda/deferred/main_cu.cpp   |   4 +
 2 files changed, 159 insertions(+), 156 deletions(-)

diff --git a/examples_cuda/deferred/kernels1.ispc b/examples_cuda/deferred/kernels1.ispc
index 9250f5ab..5def1900 100644
--- a/examples_cuda/deferred/kernels1.ispc
+++ b/examples_cuda/deferred/kernels1.ispc
@@ -127,7 +127,7 @@ ComputeZBounds(
     // Find Z bounds
     float laneMinZ = cameraFar;
     float laneMaxZ = cameraNear;
-    foreach_tiled (y = tileStartY ... tileEndY, x = tileStartX ... tileEndX)
+    foreach_tiled (y = tileStartY ... tileEndY, x = tileStartX ... tileEndX) 
     {
       // Unproject depth buffer Z value into view space
       float z = zBuffer[y * gBufferWidth + x];
@@ -191,7 +191,6 @@ IntersectLightsWithTileMinMax(
 
     foreach (lightIndex = 0 ... numLights) 
     {
-
       float light_positionView_z = light_positionView_z_array[lightIndex];
       float light_attenuationEnd = light_attenuationEnd_array[lightIndex];
       float light_attenuationEndNeg = -light_attenuationEnd;
@@ -296,164 +295,164 @@ ShadeTile(
     uniform unsigned int8 framebuffer_b[]
     )
 {
-  if (tileNumLights == 0 || visualizeLightCount) {
-    uniform unsigned int8 c = (unsigned int8)(min(tileNumLights << 2, 255));
-    foreach_tiled (y = tileStartY ... tileEndY, x = tileStartX ... tileEndX) 
-    { 
-      int32 framebufferIndex = (y * gBufferWidth + x);
-      framebuffer_r[framebufferIndex] = c;
-      framebuffer_g[framebufferIndex] = c;
-      framebuffer_b[framebufferIndex] = c;
-    }
-  } else {
-    uniform float twoOverGBufferWidth = 2.0f / gBufferWidth;
-    uniform float twoOverGBufferHeight = 2.0f / gBufferHeight;
-
-    foreach_tiled(y = tileStartY ... tileEndY, x = tileStartX ... tileEndX) 
-    {
-      float positionScreen_y = -(((0.5f + y) * twoOverGBufferHeight) - 1.f);
-      int32 gBufferOffset = y * gBufferWidth + x;
-
-      // Reconstruct position and (negative) view vector from G-buffer
-      float surface_positionView_x, surface_positionView_y, surface_positionView_z;
-      float Vneg_x, Vneg_y, Vneg_z;
-
-      float z = inputData.zBuffer[gBufferOffset];
-
-      // Compute screen/clip-space position
-      // NOTE: Mind DX11 viewport transform and pixel center!
-      float positionScreen_x = (0.5f + (float)(x)) * 
-        twoOverGBufferWidth - 1.0f;
-
-      // Unproject depth buffer Z value into view space
-      surface_positionView_z = cameraProj_43 / (z - cameraProj_33);
-      surface_positionView_x = positionScreen_x * surface_positionView_z / 
-        cameraProj_11;
-      surface_positionView_y = positionScreen_y * surface_positionView_z / 
-        cameraProj_22;
-
-      // We actually end up with a vector pointing *at* the
-      // surface (i.e. the negative view vector)
-      normalize3(surface_positionView_x, surface_positionView_y, 
-          surface_positionView_z, Vneg_x, Vneg_y, Vneg_z);
-
-      // Reconstruct normal from G-buffer
-      float surface_normal_x, surface_normal_y, surface_normal_z;
-      float normal_x = half_to_float(inputData.normalEncoded_x[gBufferOffset]);
-      float normal_y = half_to_float(inputData.normalEncoded_y[gBufferOffset]);
-
-      float f = (normal_x - normal_x * normal_x) + (normal_y - normal_y * normal_y);
-      float m = sqrt(4.0f * f - 1.0f);
-
-      surface_normal_x = m * (4.0f * normal_x - 2.0f);
-      surface_normal_y = m * (4.0f * normal_y - 2.0f);
-      surface_normal_z = 3.0f - 8.0f * f;
-
-      // Load other G-buffer parameters
-      float surface_specularAmount = 
-        half_to_float(inputData.specularAmount[gBufferOffset]);
-      float surface_specularPower  = 
-        half_to_float(inputData.specularPower[gBufferOffset]);
-      float surface_albedo_x = Unorm8ToFloat32(inputData.albedo_x[gBufferOffset]);
-      float surface_albedo_y = Unorm8ToFloat32(inputData.albedo_y[gBufferOffset]);
-      float surface_albedo_z = Unorm8ToFloat32(inputData.albedo_z[gBufferOffset]);
-
-      float lit_x = 0.0f;
-      float lit_y = 0.0f;
-      float lit_z = 0.0f;
-      for (uniform int32 tileLightIndex = 0; tileLightIndex < tileNumLights; 
-          ++tileLightIndex) {
-        uniform int32 lightIndex = tileLightIndices[tileLightIndex];
-
-        // Gather light data relevant to initial culling
-        uniform float light_positionView_x = 
-          inputData.lightPositionView_x[lightIndex];
-        uniform float light_positionView_y = 
-          inputData.lightPositionView_y[lightIndex];
-        uniform float light_positionView_z = 
-          inputData.lightPositionView_z[lightIndex];
-        uniform float light_attenuationEnd = 
-          inputData.lightAttenuationEnd[lightIndex];
-
-        // Compute light vector
-        float L_x = light_positionView_x - surface_positionView_x;
-        float L_y = light_positionView_y - surface_positionView_y;
-        float L_z = light_positionView_z - surface_positionView_z;
-
-        float distanceToLight2 = dot3(L_x, L_y, L_z, L_x, L_y, L_z);
-
-        // Clip at end of attenuation
-        float light_attenutaionEnd2 = light_attenuationEnd * light_attenuationEnd;
-
-        cif (distanceToLight2 < light_attenutaionEnd2) {                    
-          float distanceToLight = sqrt(distanceToLight2);
-
-          // HLSL "rcp" is allowed to be fairly inaccurate
-          float distanceToLightRcp = rcp(distanceToLight);
-          L_x *= distanceToLightRcp;
-          L_y *= distanceToLightRcp;
-          L_z *= distanceToLightRcp;
-
-          // Start computing brdf
-          float NdotL = dot3(surface_normal_x, surface_normal_y, 
-              surface_normal_z, L_x, L_y, L_z);
-
-          // Clip back facing
-          cif (NdotL > 0.0f) {
-            uniform float light_attenuationBegin = 
-              inputData.lightAttenuationBegin[lightIndex];
-
-            // Light distance attenuation (linstep)
-            float lightRange = (light_attenuationEnd - light_attenuationBegin);
-            float falloffPosition = (light_attenuationEnd - distanceToLight);
-            float attenuation = min(falloffPosition / lightRange, 1.0f);
-
-            float H_x = (L_x - Vneg_x);
-            float H_y = (L_y - Vneg_y);
-            float H_z = (L_z - Vneg_z);
-            normalize3(H_x, H_y, H_z, H_x, H_y, H_z);
-
-            float NdotH = dot3(surface_normal_x, surface_normal_y, 
-                surface_normal_z, H_x, H_y, H_z);
-            NdotH = max(NdotH, 0.0f);
-
-            float specular = pow(NdotH, surface_specularPower);
-            float specularNorm = (surface_specularPower + 2.0f) * 
-              (1.0f / 8.0f);
-            float specularContrib = surface_specularAmount * 
-              specularNorm * specular;
-
-            float k = attenuation * NdotL * (1.0f + specularContrib);
-
-            uniform float light_color_x = inputData.lightColor_x[lightIndex];
-            uniform float light_color_y = inputData.lightColor_y[lightIndex];
-            uniform float light_color_z = inputData.lightColor_z[lightIndex];
-
-            float lightContrib_x = surface_albedo_x * light_color_x;
-            float lightContrib_y = surface_albedo_y * light_color_y;
-            float lightContrib_z = surface_albedo_z * light_color_z;
-
-            lit_x += lightContrib_x * k;
-            lit_y += lightContrib_y * k;
-            lit_z += lightContrib_z * k;
-          }
+    if (tileNumLights == 0 || visualizeLightCount) {
+        uniform unsigned int8 c = (unsigned int8)(min(tileNumLights << 2, 255));
+        foreach_tiled (y = tileStartY ... tileEndY, x = tileStartX ... tileEndX) 
+        {
+          int32 framebufferIndex = (y * gBufferWidth + x);
+          framebuffer_r[framebufferIndex] = c;
+          framebuffer_g[framebufferIndex] = c;
+          framebuffer_b[framebufferIndex] = c;
         }
-      }
+    } else {
+        uniform float twoOverGBufferWidth = 2.0f / gBufferWidth;
+        uniform float twoOverGBufferHeight = 2.0f / gBufferHeight;
+        
+        foreach_tiled (y = tileStartY ... tileEndY, x = tileStartX ... tileEndX) 
+        {
+          float positionScreen_y = -(((0.5f + y) * twoOverGBufferHeight) - 1.f);
+          int32 gBufferOffset = y * gBufferWidth + x;
 
-      // Gamma correct
-      // These pows are pretty slow right now, but we can do
-      // something faster if really necessary to squeeze every
-      // last bit of performance out of it
-      float gamma = 1.0 / 2.2f;
-      lit_x = pow(clamp(lit_x, 0.0f, 1.0f), gamma);
-      lit_y = pow(clamp(lit_y, 0.0f, 1.0f), gamma);
-      lit_z = pow(clamp(lit_z, 0.0f, 1.0f), gamma);
+          // Reconstruct position and (negative) view vector from G-buffer
+          float surface_positionView_x, surface_positionView_y, surface_positionView_z;
+          float Vneg_x, Vneg_y, Vneg_z;
 
-      framebuffer_r[gBufferOffset] = Float32ToUnorm8(lit_x);
-      framebuffer_g[gBufferOffset] = Float32ToUnorm8(lit_y);
-      framebuffer_b[gBufferOffset] = Float32ToUnorm8(lit_z);
+          float z = inputData.zBuffer[gBufferOffset];
+
+          // Compute screen/clip-space position
+          // NOTE: Mind DX11 viewport transform and pixel center!
+          float positionScreen_x = (0.5f + (float)(x)) * 
+            twoOverGBufferWidth - 1.0f;
+
+          // Unproject depth buffer Z value into view space
+          surface_positionView_z = cameraProj_43 / (z - cameraProj_33);
+          surface_positionView_x = positionScreen_x * surface_positionView_z / 
+            cameraProj_11;
+          surface_positionView_y = positionScreen_y * surface_positionView_z / 
+            cameraProj_22;
+
+          // We actually end up with a vector pointing *at* the
+          // surface (i.e. the negative view vector)
+          normalize3(surface_positionView_x, surface_positionView_y, 
+              surface_positionView_z, Vneg_x, Vneg_y, Vneg_z);
+
+          // Reconstruct normal from G-buffer
+          float surface_normal_x, surface_normal_y, surface_normal_z;
+          float normal_x = half_to_float(inputData.normalEncoded_x[gBufferOffset]);
+          float normal_y = half_to_float(inputData.normalEncoded_y[gBufferOffset]);
+
+          float f = (normal_x - normal_x * normal_x) + (normal_y - normal_y * normal_y);
+          float m = sqrt(4.0f * f - 1.0f);
+
+          surface_normal_x = m * (4.0f * normal_x - 2.0f);
+          surface_normal_y = m * (4.0f * normal_y - 2.0f);
+          surface_normal_z = 3.0f - 8.0f * f;
+
+          // Load other G-buffer parameters
+          float surface_specularAmount = 
+            half_to_float(inputData.specularAmount[gBufferOffset]);
+          float surface_specularPower  = 
+            half_to_float(inputData.specularPower[gBufferOffset]);
+          float surface_albedo_x = Unorm8ToFloat32(inputData.albedo_x[gBufferOffset]);
+          float surface_albedo_y = Unorm8ToFloat32(inputData.albedo_y[gBufferOffset]);
+          float surface_albedo_z = Unorm8ToFloat32(inputData.albedo_z[gBufferOffset]);
+
+          float lit_x = 0.0f;
+          float lit_y = 0.0f;
+          float lit_z = 0.0f;
+          for (uniform int32 tileLightIndex = 0; tileLightIndex < tileNumLights; 
+              ++tileLightIndex) {
+            uniform int32 lightIndex = tileLightIndices[tileLightIndex];
+
+            // Gather light data relevant to initial culling
+            uniform float light_positionView_x = 
+              inputData.lightPositionView_x[lightIndex];
+            uniform float light_positionView_y = 
+              inputData.lightPositionView_y[lightIndex];
+            uniform float light_positionView_z = 
+              inputData.lightPositionView_z[lightIndex];
+            uniform float light_attenuationEnd = 
+              inputData.lightAttenuationEnd[lightIndex];
+
+            // Compute light vector
+            float L_x = light_positionView_x - surface_positionView_x;
+            float L_y = light_positionView_y - surface_positionView_y;
+            float L_z = light_positionView_z - surface_positionView_z;
+
+            float distanceToLight2 = dot3(L_x, L_y, L_z, L_x, L_y, L_z);
+
+            // Clip at end of attenuation
+            float light_attenutaionEnd2 = light_attenuationEnd * light_attenuationEnd;
+
+            cif (distanceToLight2 < light_attenutaionEnd2) {                    
+              float distanceToLight = sqrt(distanceToLight2);
+
+              // HLSL "rcp" is allowed to be fairly inaccurate
+              float distanceToLightRcp = rcp(distanceToLight);
+              L_x *= distanceToLightRcp;
+              L_y *= distanceToLightRcp;
+              L_z *= distanceToLightRcp;
+
+              // Start computing brdf
+              float NdotL = dot3(surface_normal_x, surface_normal_y, 
+                  surface_normal_z, L_x, L_y, L_z);
+
+              // Clip back facing
+              cif (NdotL > 0.0f) {
+                uniform float light_attenuationBegin = 
+                  inputData.lightAttenuationBegin[lightIndex];
+
+                // Light distance attenuation (linstep)
+                float lightRange = (light_attenuationEnd - light_attenuationBegin);
+                float falloffPosition = (light_attenuationEnd - distanceToLight);
+                float attenuation = min(falloffPosition / lightRange, 1.0f);
+
+                float H_x = (L_x - Vneg_x);
+                float H_y = (L_y - Vneg_y);
+                float H_z = (L_z - Vneg_z);
+                normalize3(H_x, H_y, H_z, H_x, H_y, H_z);
+
+                float NdotH = dot3(surface_normal_x, surface_normal_y, 
+                    surface_normal_z, H_x, H_y, H_z);
+                NdotH = max(NdotH, 0.0f);
+
+                float specular = pow(NdotH, surface_specularPower);
+                float specularNorm = (surface_specularPower + 2.0f) * 
+                  (1.0f / 8.0f);
+                float specularContrib = surface_specularAmount * 
+                  specularNorm * specular;
+
+                float k = attenuation * NdotL * (1.0f + specularContrib);
+
+                uniform float light_color_x = inputData.lightColor_x[lightIndex];
+                uniform float light_color_y = inputData.lightColor_y[lightIndex];
+                uniform float light_color_z = inputData.lightColor_z[lightIndex];
+
+                float lightContrib_x = surface_albedo_x * light_color_x;
+                float lightContrib_y = surface_albedo_y * light_color_y;
+                float lightContrib_z = surface_albedo_z * light_color_z;
+
+                lit_x += lightContrib_x * k;
+                lit_y += lightContrib_y * k;
+                lit_z += lightContrib_z * k;
+              }
+            }
+          }
+
+          // Gamma correct
+          // These pows are pretty slow right now, but we can do
+          // something faster if really necessary to squeeze every
+          // last bit of performance out of it
+          float gamma = 1.0 / 2.2f;
+          lit_x = pow(clamp(lit_x, 0.0f, 1.0f), gamma);
+          lit_y = pow(clamp(lit_y, 0.0f, 1.0f), gamma);
+          lit_z = pow(clamp(lit_z, 0.0f, 1.0f), gamma);
+
+          framebuffer_r[gBufferOffset] = Float32ToUnorm8(lit_x);
+          framebuffer_g[gBufferOffset] = Float32ToUnorm8(lit_y);
+          framebuffer_b[gBufferOffset] = Float32ToUnorm8(lit_z);
+        }
     }
-  }
 }
 
 
diff --git a/examples_cuda/deferred/main_cu.cpp b/examples_cuda/deferred/main_cu.cpp
index 697c1d98..ff308202 100755
--- a/examples_cuda/deferred/main_cu.cpp
+++ b/examples_cuda/deferred/main_cu.cpp
@@ -116,6 +116,10 @@ void createContext(const int deviceId = 0)
 
   // Create driver context
   checkCudaErrors(cuCtxCreate(&context, 0, device));
+    const size_t stackLimit = 4*1024;  // 4 KiB; CU_LIMIT_STACK_SIZE is expressed in bytes per GPU thread
+ //   const size_t heapLimit = 1024*1024*1024;  (disabled together with the heap-limit call below)
+  checkCudaErrors(cuCtxSetLimit(CU_LIMIT_STACK_SIZE,stackLimit));  // cuCtxSetLimit acts on the current context, i.e. the one created just above
+//  checkCudaErrors(cuCtxSetLimit(CU_LIMIT_MALLOC_HEAP_SIZE,heapLimit));  (disabled: the device malloc heap size is not changed here)
 }
 void destroyContext()
 {