From 599ada8354d41958c8a615cf7805a329833e174c Mon Sep 17 00:00:00 2001 From: Evghenii Date: Thu, 14 Nov 2013 16:49:47 +0100 Subject: [PATCH] added deferred shading foreach_tile --- examples_cuda/deferred/kernels1.ispc | 311 +++++++++++++-------------- examples_cuda/deferred/main_cu.cpp | 4 + 2 files changed, 159 insertions(+), 156 deletions(-) diff --git a/examples_cuda/deferred/kernels1.ispc b/examples_cuda/deferred/kernels1.ispc index 9250f5ab..5def1900 100644 --- a/examples_cuda/deferred/kernels1.ispc +++ b/examples_cuda/deferred/kernels1.ispc @@ -127,7 +127,7 @@ ComputeZBounds( // Find Z bounds float laneMinZ = cameraFar; float laneMaxZ = cameraNear; - foreach_tiled (y = tileStartY ... tileEndY, x = tileStartX ... tileEndX) + foreach_tiled (y = tileStartY ... tileEndY, x = tileStartX ... tileEndX) { // Unproject depth buffer Z value into view space float z = zBuffer[y * gBufferWidth + x]; @@ -191,7 +191,6 @@ IntersectLightsWithTileMinMax( foreach (lightIndex = 0 ... numLights) { - float light_positionView_z = light_positionView_z_array[lightIndex]; float light_attenuationEnd = light_attenuationEnd_array[lightIndex]; float light_attenuationEndNeg = -light_attenuationEnd; @@ -296,164 +295,164 @@ ShadeTile( uniform unsigned int8 framebuffer_b[] ) { - if (tileNumLights == 0 || visualizeLightCount) { - uniform unsigned int8 c = (unsigned int8)(min(tileNumLights << 2, 255)); - foreach_tiled (y = tileStartY ... tileEndY, x = tileStartX ... tileEndX) - { - int32 framebufferIndex = (y * gBufferWidth + x); - framebuffer_r[framebufferIndex] = c; - framebuffer_g[framebufferIndex] = c; - framebuffer_b[framebufferIndex] = c; - } - } else { - uniform float twoOverGBufferWidth = 2.0f / gBufferWidth; - uniform float twoOverGBufferHeight = 2.0f / gBufferHeight; - - foreach_tiled(y = tileStartY ... tileEndY, x = tileStartX ... tileEndX) - { - float positionScreen_y = -(((0.5f + y) * twoOverGBufferHeight) - 1.f); - int32 gBufferOffset = y * gBufferWidth + x; - - // Reconstruct position and (negative) view vector from G-buffer - float surface_positionView_x, surface_positionView_y, surface_positionView_z; - float Vneg_x, Vneg_y, Vneg_z; - - float z = inputData.zBuffer[gBufferOffset]; - - // Compute screen/clip-space position - // NOTE: Mind DX11 viewport transform and pixel center! - float positionScreen_x = (0.5f + (float)(x)) * - twoOverGBufferWidth - 1.0f; - - // Unproject depth buffer Z value into view space - surface_positionView_z = cameraProj_43 / (z - cameraProj_33); - surface_positionView_x = positionScreen_x * surface_positionView_z / - cameraProj_11; - surface_positionView_y = positionScreen_y * surface_positionView_z / - cameraProj_22; - - // We actually end up with a vector pointing *at* the - // surface (i.e. the negative view vector) - normalize3(surface_positionView_x, surface_positionView_y, - surface_positionView_z, Vneg_x, Vneg_y, Vneg_z); - - // Reconstruct normal from G-buffer - float surface_normal_x, surface_normal_y, surface_normal_z; - float normal_x = half_to_float(inputData.normalEncoded_x[gBufferOffset]); - float normal_y = half_to_float(inputData.normalEncoded_y[gBufferOffset]); - - float f = (normal_x - normal_x * normal_x) + (normal_y - normal_y * normal_y); - float m = sqrt(4.0f * f - 1.0f); - - surface_normal_x = m * (4.0f * normal_x - 2.0f); - surface_normal_y = m * (4.0f * normal_y - 2.0f); - surface_normal_z = 3.0f - 8.0f * f; - - // Load other G-buffer parameters - float surface_specularAmount = - half_to_float(inputData.specularAmount[gBufferOffset]); - float surface_specularPower = - half_to_float(inputData.specularPower[gBufferOffset]); - float surface_albedo_x = Unorm8ToFloat32(inputData.albedo_x[gBufferOffset]); - float surface_albedo_y = Unorm8ToFloat32(inputData.albedo_y[gBufferOffset]); - float surface_albedo_z = Unorm8ToFloat32(inputData.albedo_z[gBufferOffset]); - - float lit_x = 0.0f; - float lit_y = 0.0f; - float lit_z = 0.0f; - for (uniform int32 tileLightIndex = 0; tileLightIndex < tileNumLights; - ++tileLightIndex) { - uniform int32 lightIndex = tileLightIndices[tileLightIndex]; - - // Gather light data relevant to initial culling - uniform float light_positionView_x = - inputData.lightPositionView_x[lightIndex]; - uniform float light_positionView_y = - inputData.lightPositionView_y[lightIndex]; - uniform float light_positionView_z = - inputData.lightPositionView_z[lightIndex]; - uniform float light_attenuationEnd = - inputData.lightAttenuationEnd[lightIndex]; - - // Compute light vector - float L_x = light_positionView_x - surface_positionView_x; - float L_y = light_positionView_y - surface_positionView_y; - float L_z = light_positionView_z - surface_positionView_z; - - float distanceToLight2 = dot3(L_x, L_y, L_z, L_x, L_y, L_z); - - // Clip at end of attenuation - float light_attenutaionEnd2 = light_attenuationEnd * light_attenuationEnd; - - cif (distanceToLight2 < light_attenutaionEnd2) { - float distanceToLight = sqrt(distanceToLight2); - - // HLSL "rcp" is allowed to be fairly inaccurate - float distanceToLightRcp = rcp(distanceToLight); - L_x *= distanceToLightRcp; - L_y *= distanceToLightRcp; - L_z *= distanceToLightRcp; - - // Start computing brdf - float NdotL = dot3(surface_normal_x, surface_normal_y, - surface_normal_z, L_x, L_y, L_z); - - // Clip back facing - cif (NdotL > 0.0f) { - uniform float light_attenuationBegin = - inputData.lightAttenuationBegin[lightIndex]; - - // Light distance attenuation (linstep) - float lightRange = (light_attenuationEnd - light_attenuationBegin); - float falloffPosition = (light_attenuationEnd - distanceToLight); - float attenuation = min(falloffPosition / lightRange, 1.0f); - - float H_x = (L_x - Vneg_x); - float H_y = (L_y - Vneg_y); - float H_z = (L_z - Vneg_z); - normalize3(H_x, H_y, H_z, H_x, H_y, H_z); - - float NdotH = dot3(surface_normal_x, surface_normal_y, - surface_normal_z, H_x, H_y, H_z); - NdotH = max(NdotH, 0.0f); - - float specular = pow(NdotH, surface_specularPower); - float specularNorm = (surface_specularPower + 2.0f) * - (1.0f / 8.0f); - float specularContrib = surface_specularAmount * - specularNorm * specular; - - float k = attenuation * NdotL * (1.0f + specularContrib); - - uniform float light_color_x = inputData.lightColor_x[lightIndex]; - uniform float light_color_y = inputData.lightColor_y[lightIndex]; - uniform float light_color_z = inputData.lightColor_z[lightIndex]; - - float lightContrib_x = surface_albedo_x * light_color_x; - float lightContrib_y = surface_albedo_y * light_color_y; - float lightContrib_z = surface_albedo_z * light_color_z; - - lit_x += lightContrib_x * k; - lit_y += lightContrib_y * k; - lit_z += lightContrib_z * k; - } + if (tileNumLights == 0 || visualizeLightCount) { + uniform unsigned int8 c = (unsigned int8)(min(tileNumLights << 2, 255)); + foreach_tiled (y = tileStartY ... tileEndY, x = tileStartX ... tileEndX) + { + int32 framebufferIndex = (y * gBufferWidth + x); + framebuffer_r[framebufferIndex] = c; + framebuffer_g[framebufferIndex] = c; + framebuffer_b[framebufferIndex] = c; } - } + } else { + uniform float twoOverGBufferWidth = 2.0f / gBufferWidth; + uniform float twoOverGBufferHeight = 2.0f / gBufferHeight; + + foreach_tiled (y = tileStartY ... tileEndY, x = tileStartX ... tileEndX) + { + float positionScreen_y = -(((0.5f + y) * twoOverGBufferHeight) - 1.f); + int32 gBufferOffset = y * gBufferWidth + x; - // Gamma correct - // These pows are pretty slow right now, but we can do - // something faster if really necessary to squeeze every - // last bit of performance out of it - float gamma = 1.0 / 2.2f; - lit_x = pow(clamp(lit_x, 0.0f, 1.0f), gamma); - lit_y = pow(clamp(lit_y, 0.0f, 1.0f), gamma); - lit_z = pow(clamp(lit_z, 0.0f, 1.0f), gamma); + // Reconstruct position and (negative) view vector from G-buffer + float surface_positionView_x, surface_positionView_y, surface_positionView_z; + float Vneg_x, Vneg_y, Vneg_z; - framebuffer_r[gBufferOffset] = Float32ToUnorm8(lit_x); - framebuffer_g[gBufferOffset] = Float32ToUnorm8(lit_y); - framebuffer_b[gBufferOffset] = Float32ToUnorm8(lit_z); + float z = inputData.zBuffer[gBufferOffset]; + + // Compute screen/clip-space position + // NOTE: Mind DX11 viewport transform and pixel center! + float positionScreen_x = (0.5f + (float)(x)) * + twoOverGBufferWidth - 1.0f; + + // Unproject depth buffer Z value into view space + surface_positionView_z = cameraProj_43 / (z - cameraProj_33); + surface_positionView_x = positionScreen_x * surface_positionView_z / + cameraProj_11; + surface_positionView_y = positionScreen_y * surface_positionView_z / + cameraProj_22; + + // We actually end up with a vector pointing *at* the + // surface (i.e. the negative view vector) + normalize3(surface_positionView_x, surface_positionView_y, + surface_positionView_z, Vneg_x, Vneg_y, Vneg_z); + + // Reconstruct normal from G-buffer + float surface_normal_x, surface_normal_y, surface_normal_z; + float normal_x = half_to_float(inputData.normalEncoded_x[gBufferOffset]); + float normal_y = half_to_float(inputData.normalEncoded_y[gBufferOffset]); + + float f = (normal_x - normal_x * normal_x) + (normal_y - normal_y * normal_y); + float m = sqrt(4.0f * f - 1.0f); + + surface_normal_x = m * (4.0f * normal_x - 2.0f); + surface_normal_y = m * (4.0f * normal_y - 2.0f); + surface_normal_z = 3.0f - 8.0f * f; + + // Load other G-buffer parameters + float surface_specularAmount = + half_to_float(inputData.specularAmount[gBufferOffset]); + float surface_specularPower = + half_to_float(inputData.specularPower[gBufferOffset]); + float surface_albedo_x = Unorm8ToFloat32(inputData.albedo_x[gBufferOffset]); + float surface_albedo_y = Unorm8ToFloat32(inputData.albedo_y[gBufferOffset]); + float surface_albedo_z = Unorm8ToFloat32(inputData.albedo_z[gBufferOffset]); + + float lit_x = 0.0f; + float lit_y = 0.0f; + float lit_z = 0.0f; + for (uniform int32 tileLightIndex = 0; tileLightIndex < tileNumLights; + ++tileLightIndex) { + uniform int32 lightIndex = tileLightIndices[tileLightIndex]; + + // Gather light data relevant to initial culling + uniform float light_positionView_x = + inputData.lightPositionView_x[lightIndex]; + uniform float light_positionView_y = + inputData.lightPositionView_y[lightIndex]; + uniform float light_positionView_z = + inputData.lightPositionView_z[lightIndex]; + uniform float light_attenuationEnd = + inputData.lightAttenuationEnd[lightIndex]; + + // Compute light vector + float L_x = light_positionView_x - surface_positionView_x; + float L_y = light_positionView_y - surface_positionView_y; + float L_z = light_positionView_z - surface_positionView_z; + + float distanceToLight2 = dot3(L_x, L_y, L_z, L_x, L_y, L_z); + + // Clip at end of attenuation + float light_attenutaionEnd2 = light_attenuationEnd * light_attenuationEnd; + + cif (distanceToLight2 < light_attenutaionEnd2) { + float distanceToLight = sqrt(distanceToLight2); + + // HLSL "rcp" is allowed to be fairly inaccurate + float distanceToLightRcp = rcp(distanceToLight); + L_x *= distanceToLightRcp; + L_y *= distanceToLightRcp; + L_z *= distanceToLightRcp; + + // Start computing brdf + float NdotL = dot3(surface_normal_x, surface_normal_y, + surface_normal_z, L_x, L_y, L_z); + + // Clip back facing + cif (NdotL > 0.0f) { + uniform float light_attenuationBegin = + inputData.lightAttenuationBegin[lightIndex]; + + // Light distance attenuation (linstep) + float lightRange = (light_attenuationEnd - light_attenuationBegin); + float falloffPosition = (light_attenuationEnd - distanceToLight); + float attenuation = min(falloffPosition / lightRange, 1.0f); + + float H_x = (L_x - Vneg_x); + float H_y = (L_y - Vneg_y); + float H_z = (L_z - Vneg_z); + normalize3(H_x, H_y, H_z, H_x, H_y, H_z); + + float NdotH = dot3(surface_normal_x, surface_normal_y, + surface_normal_z, H_x, H_y, H_z); + NdotH = max(NdotH, 0.0f); + + float specular = pow(NdotH, surface_specularPower); + float specularNorm = (surface_specularPower + 2.0f) * + (1.0f / 8.0f); + float specularContrib = surface_specularAmount * + specularNorm * specular; + + float k = attenuation * NdotL * (1.0f + specularContrib); + + uniform float light_color_x = inputData.lightColor_x[lightIndex]; + uniform float light_color_y = inputData.lightColor_y[lightIndex]; + uniform float light_color_z = inputData.lightColor_z[lightIndex]; + + float lightContrib_x = surface_albedo_x * light_color_x; + float lightContrib_y = surface_albedo_y * light_color_y; + float lightContrib_z = surface_albedo_z * light_color_z; + + lit_x += lightContrib_x * k; + lit_y += lightContrib_y * k; + lit_z += lightContrib_z * k; + } + } + } + + // Gamma correct + // These pows are pretty slow right now, but we can do + // something faster if really necessary to squeeze every + // last bit of performance out of it + float gamma = 1.0 / 2.2f; + lit_x = pow(clamp(lit_x, 0.0f, 1.0f), gamma); + lit_y = pow(clamp(lit_y, 0.0f, 1.0f), gamma); + lit_z = pow(clamp(lit_z, 0.0f, 1.0f), gamma); + + framebuffer_r[gBufferOffset] = Float32ToUnorm8(lit_x); + framebuffer_g[gBufferOffset] = Float32ToUnorm8(lit_y); + framebuffer_b[gBufferOffset] = Float32ToUnorm8(lit_z); + } } - } } diff --git a/examples_cuda/deferred/main_cu.cpp b/examples_cuda/deferred/main_cu.cpp index 697c1d98..ff308202 100755 --- a/examples_cuda/deferred/main_cu.cpp +++ b/examples_cuda/deferred/main_cu.cpp @@ -116,6 +116,10 @@ void createContext(const int deviceId = 0) // Create driver context checkCudaErrors(cuCtxCreate(&context, 0, device)); + const size_t stackLimit = 4*1024; + // const size_t heapLimit = 1024*1024*1024; + checkCudaErrors(cuCtxSetLimit(CU_LIMIT_STACK_SIZE,stackLimit)); +// checkCudaErrors(cuCtxSetLimit(CU_LIMIT_MALLOC_HEAP_SIZE,heapLimit)); } void destroyContext() {