added deferred shading foreach_tiled

2013-11-14 16:49:47 +01:00
parent 83b9cc5c0a
commit 599ada8354
2 changed files with 159 additions and 156 deletions
--- a/examples_cuda/deferred/kernels1.ispc
+++ b/examples_cuda/deferred/kernels1.ispc
@@ -191,7 +191,6 @@ IntersectLightsWithTileMinMax(
    foreach (lightIndex = 0 ... numLights) 
    {
      float light_positionView_z = light_positionView_z_array[lightIndex];
      float light_attenuationEnd = light_attenuationEnd_array[lightIndex];
      float light_attenuationEndNeg = -light_attenuationEnd;
@@ -296,164 +295,164 @@ ShadeTile(
    uniform unsigned int8 framebuffer_b[]
    )
 {
-  if (tileNumLights == 0 || visualizeLightCount) {
+    if (tileNumLights == 0 || visualizeLightCount) {
-    uniform unsigned int8 c = (unsigned int8)(min(tileNumLights << 2, 255));
+        uniform unsigned int8 c = (unsigned int8)(min(tileNumLights << 2, 255));
-    foreach_tiled (y = tileStartY ... tileEndY, x = tileStartX ... tileEndX) 
+        foreach_tiled (y = tileStartY ... tileEndY, x = tileStartX ... tileEndX) 
-    { 
+        {
-      int32 framebufferIndex = (y * gBufferWidth + x);
+          int32 framebufferIndex = (y * gBufferWidth + x);
-      framebuffer_r[framebufferIndex] = c;
+          framebuffer_r[framebufferIndex] = c;
-      framebuffer_g[framebufferIndex] = c;
+          framebuffer_g[framebufferIndex] = c;
-      framebuffer_b[framebufferIndex] = c;
+          framebuffer_b[framebufferIndex] = c;
    }
  } else {
    uniform float twoOverGBufferWidth = 2.0f / gBufferWidth;
    uniform float twoOverGBufferHeight = 2.0f / gBufferHeight;
    foreach_tiled(y = tileStartY ... tileEndY, x = tileStartX ... tileEndX) 
    {
      float positionScreen_y = -(((0.5f + y) * twoOverGBufferHeight) - 1.f);
      int32 gBufferOffset = y * gBufferWidth + x;
      // Reconstruct position and (negative) view vector from G-buffer
      float surface_positionView_x, surface_positionView_y, surface_positionView_z;
      float Vneg_x, Vneg_y, Vneg_z;
      float z = inputData.zBuffer[gBufferOffset];
      // Compute screen/clip-space position
      // NOTE: Mind DX11 viewport transform and pixel center!
      float positionScreen_x = (0.5f + (float)(x)) * 
        twoOverGBufferWidth - 1.0f;
      // Unproject depth buffer Z value into view space
      surface_positionView_z = cameraProj_43 / (z - cameraProj_33);
      surface_positionView_x = positionScreen_x * surface_positionView_z / 
        cameraProj_11;
      surface_positionView_y = positionScreen_y * surface_positionView_z / 
        cameraProj_22;
      // We actually end up with a vector pointing *at* the
      // surface (i.e. the negative view vector)
      normalize3(surface_positionView_x, surface_positionView_y, 
          surface_positionView_z, Vneg_x, Vneg_y, Vneg_z);
      // Reconstruct normal from G-buffer
      float surface_normal_x, surface_normal_y, surface_normal_z;
      float normal_x = half_to_float(inputData.normalEncoded_x[gBufferOffset]);
      float normal_y = half_to_float(inputData.normalEncoded_y[gBufferOffset]);
      float f = (normal_x - normal_x * normal_x) + (normal_y - normal_y * normal_y);
      float m = sqrt(4.0f * f - 1.0f);
      surface_normal_x = m * (4.0f * normal_x - 2.0f);
      surface_normal_y = m * (4.0f * normal_y - 2.0f);
      surface_normal_z = 3.0f - 8.0f * f;
      // Load other G-buffer parameters
      float surface_specularAmount = 
        half_to_float(inputData.specularAmount[gBufferOffset]);
      float surface_specularPower  = 
        half_to_float(inputData.specularPower[gBufferOffset]);
      float surface_albedo_x = Unorm8ToFloat32(inputData.albedo_x[gBufferOffset]);
      float surface_albedo_y = Unorm8ToFloat32(inputData.albedo_y[gBufferOffset]);
      float surface_albedo_z = Unorm8ToFloat32(inputData.albedo_z[gBufferOffset]);
      float lit_x = 0.0f;
      float lit_y = 0.0f;
      float lit_z = 0.0f;
      for (uniform int32 tileLightIndex = 0; tileLightIndex < tileNumLights; 
          ++tileLightIndex) {
        uniform int32 lightIndex = tileLightIndices[tileLightIndex];
        // Gather light data relevant to initial culling
        uniform float light_positionView_x = 
          inputData.lightPositionView_x[lightIndex];
        uniform float light_positionView_y = 
          inputData.lightPositionView_y[lightIndex];
        uniform float light_positionView_z = 
          inputData.lightPositionView_z[lightIndex];
        uniform float light_attenuationEnd = 
          inputData.lightAttenuationEnd[lightIndex];
        // Compute light vector
        float L_x = light_positionView_x - surface_positionView_x;
        float L_y = light_positionView_y - surface_positionView_y;
        float L_z = light_positionView_z - surface_positionView_z;
        float distanceToLight2 = dot3(L_x, L_y, L_z, L_x, L_y, L_z);
        // Clip at end of attenuation
        float light_attenutaionEnd2 = light_attenuationEnd * light_attenuationEnd;
        cif (distanceToLight2 < light_attenutaionEnd2) {                    
          float distanceToLight = sqrt(distanceToLight2);
          // HLSL "rcp" is allowed to be fairly inaccurate
          float distanceToLightRcp = rcp(distanceToLight);
          L_x *= distanceToLightRcp;
          L_y *= distanceToLightRcp;
          L_z *= distanceToLightRcp;
          // Start computing brdf
          float NdotL = dot3(surface_normal_x, surface_normal_y, 
              surface_normal_z, L_x, L_y, L_z);
          // Clip back facing
          cif (NdotL > 0.0f) {
            uniform float light_attenuationBegin = 
              inputData.lightAttenuationBegin[lightIndex];
            // Light distance attenuation (linstep)
            float lightRange = (light_attenuationEnd - light_attenuationBegin);
            float falloffPosition = (light_attenuationEnd - distanceToLight);
            float attenuation = min(falloffPosition / lightRange, 1.0f);
            float H_x = (L_x - Vneg_x);
            float H_y = (L_y - Vneg_y);
            float H_z = (L_z - Vneg_z);
            normalize3(H_x, H_y, H_z, H_x, H_y, H_z);
            float NdotH = dot3(surface_normal_x, surface_normal_y, 
                surface_normal_z, H_x, H_y, H_z);
            NdotH = max(NdotH, 0.0f);
            float specular = pow(NdotH, surface_specularPower);
            float specularNorm = (surface_specularPower + 2.0f) * 
              (1.0f / 8.0f);
            float specularContrib = surface_specularAmount * 
              specularNorm * specular;
            float k = attenuation * NdotL * (1.0f + specularContrib);
            uniform float light_color_x = inputData.lightColor_x[lightIndex];
            uniform float light_color_y = inputData.lightColor_y[lightIndex];
            uniform float light_color_z = inputData.lightColor_z[lightIndex];
            float lightContrib_x = surface_albedo_x * light_color_x;
            float lightContrib_y = surface_albedo_y * light_color_y;
            float lightContrib_z = surface_albedo_z * light_color_z;
            lit_x += lightContrib_x * k;
            lit_y += lightContrib_y * k;
            lit_z += lightContrib_z * k;
          }
        }
-      }
+    } else {
        uniform float twoOverGBufferWidth = 2.0f / gBufferWidth;
        uniform float twoOverGBufferHeight = 2.0f / gBufferHeight;
-      // Gamma correct
+        foreach_tiled (y = tileStartY ... tileEndY, x = tileStartX ... tileEndX) 
-      // These pows are pretty slow right now, but we can do
+        {
-      // something faster if really necessary to squeeze every
+          float positionScreen_y = -(((0.5f + y) * twoOverGBufferHeight) - 1.f);
-      // last bit of performance out of it
+          int32 gBufferOffset = y * gBufferWidth + x;
      float gamma = 1.0 / 2.2f;
      lit_x = pow(clamp(lit_x, 0.0f, 1.0f), gamma);
      lit_y = pow(clamp(lit_y, 0.0f, 1.0f), gamma);
      lit_z = pow(clamp(lit_z, 0.0f, 1.0f), gamma);
-      framebuffer_r[gBufferOffset] = Float32ToUnorm8(lit_x);
+          // Reconstruct position and (negative) view vector from G-buffer
-      framebuffer_g[gBufferOffset] = Float32ToUnorm8(lit_y);
+          float surface_positionView_x, surface_positionView_y, surface_positionView_z;
-      framebuffer_b[gBufferOffset] = Float32ToUnorm8(lit_z);
+          float Vneg_x, Vneg_y, Vneg_z;
          float z = inputData.zBuffer[gBufferOffset];
          // Compute screen/clip-space position
          // NOTE: Mind DX11 viewport transform and pixel center!
          float positionScreen_x = (0.5f + (float)(x)) * 
            twoOverGBufferWidth - 1.0f;
          // Unproject depth buffer Z value into view space
          surface_positionView_z = cameraProj_43 / (z - cameraProj_33);
          surface_positionView_x = positionScreen_x * surface_positionView_z / 
            cameraProj_11;
          surface_positionView_y = positionScreen_y * surface_positionView_z / 
            cameraProj_22;
          // We actually end up with a vector pointing *at* the
          // surface (i.e. the negative view vector)
          normalize3(surface_positionView_x, surface_positionView_y, 
              surface_positionView_z, Vneg_x, Vneg_y, Vneg_z);
          // Reconstruct normal from G-buffer
          float surface_normal_x, surface_normal_y, surface_normal_z;
          float normal_x = half_to_float(inputData.normalEncoded_x[gBufferOffset]);
          float normal_y = half_to_float(inputData.normalEncoded_y[gBufferOffset]);
          float f = (normal_x - normal_x * normal_x) + (normal_y - normal_y * normal_y);
          float m = sqrt(4.0f * f - 1.0f);
          surface_normal_x = m * (4.0f * normal_x - 2.0f);
          surface_normal_y = m * (4.0f * normal_y - 2.0f);
          surface_normal_z = 3.0f - 8.0f * f;
          // Load other G-buffer parameters
          float surface_specularAmount = 
            half_to_float(inputData.specularAmount[gBufferOffset]);
          float surface_specularPower  = 
            half_to_float(inputData.specularPower[gBufferOffset]);
          float surface_albedo_x = Unorm8ToFloat32(inputData.albedo_x[gBufferOffset]);
          float surface_albedo_y = Unorm8ToFloat32(inputData.albedo_y[gBufferOffset]);
          float surface_albedo_z = Unorm8ToFloat32(inputData.albedo_z[gBufferOffset]);
          float lit_x = 0.0f;
          float lit_y = 0.0f;
          float lit_z = 0.0f;
          for (uniform int32 tileLightIndex = 0; tileLightIndex < tileNumLights; 
              ++tileLightIndex) {
            uniform int32 lightIndex = tileLightIndices[tileLightIndex];
            // Gather light data relevant to initial culling
            uniform float light_positionView_x = 
              inputData.lightPositionView_x[lightIndex];
            uniform float light_positionView_y = 
              inputData.lightPositionView_y[lightIndex];
            uniform float light_positionView_z = 
              inputData.lightPositionView_z[lightIndex];
            uniform float light_attenuationEnd = 
              inputData.lightAttenuationEnd[lightIndex];
            // Compute light vector
            float L_x = light_positionView_x - surface_positionView_x;
            float L_y = light_positionView_y - surface_positionView_y;
            float L_z = light_positionView_z - surface_positionView_z;
            float distanceToLight2 = dot3(L_x, L_y, L_z, L_x, L_y, L_z);
            // Clip at end of attenuation
            float light_attenutaionEnd2 = light_attenuationEnd * light_attenuationEnd;
            cif (distanceToLight2 < light_attenutaionEnd2) {                    
              float distanceToLight = sqrt(distanceToLight2);
              // HLSL "rcp" is allowed to be fairly inaccurate
              float distanceToLightRcp = rcp(distanceToLight);
              L_x *= distanceToLightRcp;
              L_y *= distanceToLightRcp;
              L_z *= distanceToLightRcp;
              // Start computing brdf
              float NdotL = dot3(surface_normal_x, surface_normal_y, 
                  surface_normal_z, L_x, L_y, L_z);
              // Clip back facing
              cif (NdotL > 0.0f) {
                uniform float light_attenuationBegin = 
                  inputData.lightAttenuationBegin[lightIndex];
                // Light distance attenuation (linstep)
                float lightRange = (light_attenuationEnd - light_attenuationBegin);
                float falloffPosition = (light_attenuationEnd - distanceToLight);
                float attenuation = min(falloffPosition / lightRange, 1.0f);
                float H_x = (L_x - Vneg_x);
                float H_y = (L_y - Vneg_y);
                float H_z = (L_z - Vneg_z);
                normalize3(H_x, H_y, H_z, H_x, H_y, H_z);
                float NdotH = dot3(surface_normal_x, surface_normal_y, 
                    surface_normal_z, H_x, H_y, H_z);
                NdotH = max(NdotH, 0.0f);
                float specular = pow(NdotH, surface_specularPower);
                float specularNorm = (surface_specularPower + 2.0f) * 
                  (1.0f / 8.0f);
                float specularContrib = surface_specularAmount * 
                  specularNorm * specular;
                float k = attenuation * NdotL * (1.0f + specularContrib);
                uniform float light_color_x = inputData.lightColor_x[lightIndex];
                uniform float light_color_y = inputData.lightColor_y[lightIndex];
                uniform float light_color_z = inputData.lightColor_z[lightIndex];
                float lightContrib_x = surface_albedo_x * light_color_x;
                float lightContrib_y = surface_albedo_y * light_color_y;
                float lightContrib_z = surface_albedo_z * light_color_z;
                lit_x += lightContrib_x * k;
                lit_y += lightContrib_y * k;
                lit_z += lightContrib_z * k;
              }
            }
          }
          // Gamma correct
          // These pows are pretty slow right now, but we can do
          // something faster if really necessary to squeeze every
          // last bit of performance out of it
          float gamma = 1.0 / 2.2f;
          lit_x = pow(clamp(lit_x, 0.0f, 1.0f), gamma);
          lit_y = pow(clamp(lit_y, 0.0f, 1.0f), gamma);
          lit_z = pow(clamp(lit_z, 0.0f, 1.0f), gamma);
          framebuffer_r[gBufferOffset] = Float32ToUnorm8(lit_x);
          framebuffer_g[gBufferOffset] = Float32ToUnorm8(lit_y);
          framebuffer_b[gBufferOffset] = Float32ToUnorm8(lit_z);
        }
    }
  }
 }
--- a/examples_cuda/deferred/main_cu.cpp
+++ b/examples_cuda/deferred/main_cu.cpp
@@ -116,6 +116,10 @@ void createContext(const int deviceId = 0)
  // Create driver context
  checkCudaErrors(cuCtxCreate(&context, 0, device));
    const size_t stackLimit = 4*1024;
 //   const size_t heapLimit = 1024*1024*1024;
  checkCudaErrors(cuCtxSetLimit(CU_LIMIT_STACK_SIZE,stackLimit));
 //  checkCudaErrors(cuCtxSetLimit(CU_LIMIT_MALLOC_HEAP_SIZE,heapLimit));
 }
 void destroyContext()
 {