diff --git a/examples_cuda/deferred/kernels.ispc b/examples_cuda/deferred/kernels.ispc index 2b90c014..7bc42777 100644 --- a/examples_cuda/deferred/kernels.ispc +++ b/examples_cuda/deferred/kernels.ispc @@ -116,11 +116,8 @@ ComputeZBounds( float laneMinZ = cameraFar; float laneMaxZ = cameraNear; for (uniform int32 y = tileStartY; y < tileEndY; ++y) { -// foreach (x = tileStartX ... tileEndX) - for (uniform int xb = tileStartX; xb < tileEndX; xb += programCount) + foreach (x = tileStartX ... tileEndX) { - const int x = xb + programIndex; - if (x >= tileEndX) break; // Unproject depth buffer Z value into view space float z = zBuffer[y * gBufferWidth + x]; float viewSpaceZ = cameraProj_43 / (z - cameraProj_33); @@ -182,10 +179,8 @@ IntersectLightsWithTileMinMax( uniform int32 tileNumLights = 0; -// foreach (lightIndex = 0 ... numLights) - for (uniform int lightIndexB = 0; lightIndexB < numLights; lightIndexB += programCount) + foreach (lightIndex = 0 ... numLights) { - const int lightIndex = lightIndexB + programIndex; float light_positionView_z = light_positionView_z_array[lightIndex]; float light_attenuationEnd = light_attenuationEnd_array[lightIndex]; float light_attenuationEndNeg = -light_attenuationEnd; @@ -292,11 +287,8 @@ ShadeTile( if (tileNumLights == 0 || visualizeLightCount) { uniform unsigned int8 c = (unsigned int8)(min(tileNumLights << 2, 255)); for (uniform int32 y = tileStartY; y < tileEndY; ++y) { -// foreach (x = tileStartX ... tileEndX) - for (uniform int xb = tileStartX ; xb < tileEndX; xb += programCount) + foreach (x = tileStartX ... tileEndX) { - const int x = xb + programIndex; - if (x >= tileEndX) continue; int32 framebufferIndex = (y * gBufferWidth + x); framebuffer_r[framebufferIndex] = c; framebuffer_g[framebufferIndex] = c; @@ -310,10 +302,7 @@ ShadeTile( for (uniform int32 y = tileStartY; y < tileEndY; ++y) { uniform float positionScreen_y = -(((0.5f + y) * twoOverGBufferHeight) - 1.f); -// foreach (x = tileStartX ... tileEndX) { - for (uniform int xb = tileStartX ; xb < tileEndX; xb += programCount) - { - const int x = xb + programIndex; + foreach (x = tileStartX ... tileEndX) { int32 gBufferOffset = y * gBufferWidth + x; // Reconstruct position and (negative) view vector from G-buffer diff --git a/examples_cuda/deferred/kernels1.ispc b/examples_cuda/deferred/kernels1.ispc index 5def1900..740d86a3 100644 --- a/examples_cuda/deferred/kernels1.ispc +++ b/examples_cuda/deferred/kernels1.ispc @@ -127,19 +127,20 @@ ComputeZBounds( // Find Z bounds float laneMinZ = cameraFar; float laneMaxZ = cameraNear; - foreach_tiled (y = tileStartY ... tileEndY, x = tileStartX ... tileEndX) - { - // Unproject depth buffer Z value into view space - float z = zBuffer[y * gBufferWidth + x]; - float viewSpaceZ = cameraProj_43 / (z - cameraProj_33); + for (uniform int32 y = tileStartY; y < tileEndY; ++y) + foreach (x = tileStartX ... tileEndX) + { + // Unproject depth buffer Z value into view space + float z = zBuffer[y * gBufferWidth + x]; + float viewSpaceZ = cameraProj_43 / (z - cameraProj_33); - // Work out Z bounds for our samples - // Avoid considering skybox/background or otherwise invalid pixels - if ((viewSpaceZ < cameraFar) && (viewSpaceZ >= cameraNear)) { - laneMinZ = min(laneMinZ, viewSpaceZ); - laneMaxZ = max(laneMaxZ, viewSpaceZ); + // Work out Z bounds for our samples + // Avoid considering skybox/background or otherwise invalid pixels + if ((viewSpaceZ < cameraFar) && (viewSpaceZ >= cameraNear)) { + laneMinZ = min(laneMinZ, viewSpaceZ); + laneMaxZ = max(laneMaxZ, viewSpaceZ); + } } - } minZ = reduce_min(laneMinZ); maxZ = reduce_max(laneMaxZ); } @@ -297,160 +298,163 @@ ShadeTile( { if (tileNumLights == 0 || visualizeLightCount) { uniform unsigned int8 c = (unsigned int8)(min(tileNumLights << 2, 255)); - foreach_tiled (y = tileStartY ... tileEndY, x = tileStartX ... tileEndX) - { - int32 framebufferIndex = (y * gBufferWidth + x); - framebuffer_r[framebufferIndex] = c; - framebuffer_g[framebufferIndex] = c; - framebuffer_b[framebufferIndex] = c; - } + for (uniform int32 y = tileStartY; y < tileEndY; ++y) + foreach (x = tileStartX ... tileEndX) + { + int32 framebufferIndex = (y * gBufferWidth + x); + framebuffer_r[framebufferIndex] = c; + framebuffer_g[framebufferIndex] = c; + framebuffer_b[framebufferIndex] = c; + } } else { uniform float twoOverGBufferWidth = 2.0f / gBufferWidth; uniform float twoOverGBufferHeight = 2.0f / gBufferHeight; - foreach_tiled (y = tileStartY ... tileEndY, x = tileStartX ... tileEndX) - { - float positionScreen_y = -(((0.5f + y) * twoOverGBufferHeight) - 1.f); - int32 gBufferOffset = y * gBufferWidth + x; + for (uniform int32 y = tileStartY; y < tileEndY; ++y) { + uniform float positionScreen_y = -(((0.5f + y) * twoOverGBufferHeight) - 1.f); - // Reconstruct position and (negative) view vector from G-buffer - float surface_positionView_x, surface_positionView_y, surface_positionView_z; - float Vneg_x, Vneg_y, Vneg_z; + foreach (x = tileStartX ... tileEndX) { + int32 gBufferOffset = y * gBufferWidth + x; - float z = inputData.zBuffer[gBufferOffset]; + // Reconstruct position and (negative) view vector from G-buffer + float surface_positionView_x, surface_positionView_y, surface_positionView_z; + float Vneg_x, Vneg_y, Vneg_z; - // Compute screen/clip-space position - // NOTE: Mind DX11 viewport transform and pixel center! - float positionScreen_x = (0.5f + (float)(x)) * - twoOverGBufferWidth - 1.0f; + float z = inputData.zBuffer[gBufferOffset]; - // Unproject depth buffer Z value into view space - surface_positionView_z = cameraProj_43 / (z - cameraProj_33); - surface_positionView_x = positionScreen_x * surface_positionView_z / - cameraProj_11; - surface_positionView_y = positionScreen_y * surface_positionView_z / - cameraProj_22; + // Compute screen/clip-space position + // NOTE: Mind DX11 viewport transform and pixel center! + float positionScreen_x = (0.5f + (float)(x)) * + twoOverGBufferWidth - 1.0f; - // We actually end up with a vector pointing *at* the - // surface (i.e. the negative view vector) - normalize3(surface_positionView_x, surface_positionView_y, - surface_positionView_z, Vneg_x, Vneg_y, Vneg_z); + // Unproject depth buffer Z value into view space + surface_positionView_z = cameraProj_43 / (z - cameraProj_33); + surface_positionView_x = positionScreen_x * surface_positionView_z / + cameraProj_11; + surface_positionView_y = positionScreen_y * surface_positionView_z / + cameraProj_22; - // Reconstruct normal from G-buffer - float surface_normal_x, surface_normal_y, surface_normal_z; - float normal_x = half_to_float(inputData.normalEncoded_x[gBufferOffset]); - float normal_y = half_to_float(inputData.normalEncoded_y[gBufferOffset]); + // We actually end up with a vector pointing *at* the + // surface (i.e. the negative view vector) + normalize3(surface_positionView_x, surface_positionView_y, + surface_positionView_z, Vneg_x, Vneg_y, Vneg_z); - float f = (normal_x - normal_x * normal_x) + (normal_y - normal_y * normal_y); - float m = sqrt(4.0f * f - 1.0f); + // Reconstruct normal from G-buffer + float surface_normal_x, surface_normal_y, surface_normal_z; + float normal_x = half_to_float(inputData.normalEncoded_x[gBufferOffset]); + float normal_y = half_to_float(inputData.normalEncoded_y[gBufferOffset]); - surface_normal_x = m * (4.0f * normal_x - 2.0f); - surface_normal_y = m * (4.0f * normal_y - 2.0f); - surface_normal_z = 3.0f - 8.0f * f; + float f = (normal_x - normal_x * normal_x) + (normal_y - normal_y * normal_y); + float m = sqrt(4.0f * f - 1.0f); - // Load other G-buffer parameters - float surface_specularAmount = - half_to_float(inputData.specularAmount[gBufferOffset]); - float surface_specularPower = - half_to_float(inputData.specularPower[gBufferOffset]); - float surface_albedo_x = Unorm8ToFloat32(inputData.albedo_x[gBufferOffset]); - float surface_albedo_y = Unorm8ToFloat32(inputData.albedo_y[gBufferOffset]); - float surface_albedo_z = Unorm8ToFloat32(inputData.albedo_z[gBufferOffset]); + surface_normal_x = m * (4.0f * normal_x - 2.0f); + surface_normal_y = m * (4.0f * normal_y - 2.0f); + surface_normal_z = 3.0f - 8.0f * f; - float lit_x = 0.0f; - float lit_y = 0.0f; - float lit_z = 0.0f; - for (uniform int32 tileLightIndex = 0; tileLightIndex < tileNumLights; - ++tileLightIndex) { - uniform int32 lightIndex = tileLightIndices[tileLightIndex]; + // Load other G-buffer parameters + float surface_specularAmount = + half_to_float(inputData.specularAmount[gBufferOffset]); + float surface_specularPower = + half_to_float(inputData.specularPower[gBufferOffset]); + float surface_albedo_x = Unorm8ToFloat32(inputData.albedo_x[gBufferOffset]); + float surface_albedo_y = Unorm8ToFloat32(inputData.albedo_y[gBufferOffset]); + float surface_albedo_z = Unorm8ToFloat32(inputData.albedo_z[gBufferOffset]); - // Gather light data relevant to initial culling - uniform float light_positionView_x = - inputData.lightPositionView_x[lightIndex]; - uniform float light_positionView_y = - inputData.lightPositionView_y[lightIndex]; - uniform float light_positionView_z = - inputData.lightPositionView_z[lightIndex]; - uniform float light_attenuationEnd = - inputData.lightAttenuationEnd[lightIndex]; + float lit_x = 0.0f; + float lit_y = 0.0f; + float lit_z = 0.0f; + for (uniform int32 tileLightIndex = 0; tileLightIndex < tileNumLights; + ++tileLightIndex) { + uniform int32 lightIndex = tileLightIndices[tileLightIndex]; - // Compute light vector - float L_x = light_positionView_x - surface_positionView_x; - float L_y = light_positionView_y - surface_positionView_y; - float L_z = light_positionView_z - surface_positionView_z; + // Gather light data relevant to initial culling + uniform float light_positionView_x = + inputData.lightPositionView_x[lightIndex]; + uniform float light_positionView_y = + inputData.lightPositionView_y[lightIndex]; + uniform float light_positionView_z = + inputData.lightPositionView_z[lightIndex]; + uniform float light_attenuationEnd = + inputData.lightAttenuationEnd[lightIndex]; - float distanceToLight2 = dot3(L_x, L_y, L_z, L_x, L_y, L_z); + // Compute light vector + float L_x = light_positionView_x - surface_positionView_x; + float L_y = light_positionView_y - surface_positionView_y; + float L_z = light_positionView_z - surface_positionView_z; - // Clip at end of attenuation - float light_attenutaionEnd2 = light_attenuationEnd * light_attenuationEnd; + float distanceToLight2 = dot3(L_x, L_y, L_z, L_x, L_y, L_z); - cif (distanceToLight2 < light_attenutaionEnd2) { - float distanceToLight = sqrt(distanceToLight2); + // Clip at end of attenuation + float light_attenutaionEnd2 = light_attenuationEnd * light_attenuationEnd; - // HLSL "rcp" is allowed to be fairly inaccurate - float distanceToLightRcp = rcp(distanceToLight); - L_x *= distanceToLightRcp; - L_y *= distanceToLightRcp; - L_z *= distanceToLightRcp; + cif (distanceToLight2 < light_attenutaionEnd2) { + float distanceToLight = sqrt(distanceToLight2); - // Start computing brdf - float NdotL = dot3(surface_normal_x, surface_normal_y, - surface_normal_z, L_x, L_y, L_z); + // HLSL "rcp" is allowed to be fairly inaccurate + float distanceToLightRcp = rcp(distanceToLight); + L_x *= distanceToLightRcp; + L_y *= distanceToLightRcp; + L_z *= distanceToLightRcp; - // Clip back facing - cif (NdotL > 0.0f) { - uniform float light_attenuationBegin = - inputData.lightAttenuationBegin[lightIndex]; + // Start computing brdf + float NdotL = dot3(surface_normal_x, surface_normal_y, + surface_normal_z, L_x, L_y, L_z); - // Light distance attenuation (linstep) - float lightRange = (light_attenuationEnd - light_attenuationBegin); - float falloffPosition = (light_attenuationEnd - distanceToLight); - float attenuation = min(falloffPosition / lightRange, 1.0f); + // Clip back facing + cif (NdotL > 0.0f) { + uniform float light_attenuationBegin = + inputData.lightAttenuationBegin[lightIndex]; - float H_x = (L_x - Vneg_x); - float H_y = (L_y - Vneg_y); - float H_z = (L_z - Vneg_z); - normalize3(H_x, H_y, H_z, H_x, H_y, H_z); + // Light distance attenuation (linstep) + float lightRange = (light_attenuationEnd - light_attenuationBegin); + float falloffPosition = (light_attenuationEnd - distanceToLight); + float attenuation = min(falloffPosition / lightRange, 1.0f); - float NdotH = dot3(surface_normal_x, surface_normal_y, - surface_normal_z, H_x, H_y, H_z); - NdotH = max(NdotH, 0.0f); + float H_x = (L_x - Vneg_x); + float H_y = (L_y - Vneg_y); + float H_z = (L_z - Vneg_z); + normalize3(H_x, H_y, H_z, H_x, H_y, H_z); - float specular = pow(NdotH, surface_specularPower); - float specularNorm = (surface_specularPower + 2.0f) * - (1.0f / 8.0f); - float specularContrib = surface_specularAmount * - specularNorm * specular; + float NdotH = dot3(surface_normal_x, surface_normal_y, + surface_normal_z, H_x, H_y, H_z); + NdotH = max(NdotH, 0.0f); - float k = attenuation * NdotL * (1.0f + specularContrib); + float specular = pow(NdotH, surface_specularPower); + float specularNorm = (surface_specularPower + 2.0f) * + (1.0f / 8.0f); + float specularContrib = surface_specularAmount * + specularNorm * specular; - uniform float light_color_x = inputData.lightColor_x[lightIndex]; - uniform float light_color_y = inputData.lightColor_y[lightIndex]; - uniform float light_color_z = inputData.lightColor_z[lightIndex]; + float k = attenuation * NdotL * (1.0f + specularContrib); - float lightContrib_x = surface_albedo_x * light_color_x; - float lightContrib_y = surface_albedo_y * light_color_y; - float lightContrib_z = surface_albedo_z * light_color_z; + uniform float light_color_x = inputData.lightColor_x[lightIndex]; + uniform float light_color_y = inputData.lightColor_y[lightIndex]; + uniform float light_color_z = inputData.lightColor_z[lightIndex]; - lit_x += lightContrib_x * k; - lit_y += lightContrib_y * k; - lit_z += lightContrib_z * k; + float lightContrib_x = surface_albedo_x * light_color_x; + float lightContrib_y = surface_albedo_y * light_color_y; + float lightContrib_z = surface_albedo_z * light_color_z; + + lit_x += lightContrib_x * k; + lit_y += lightContrib_y * k; + lit_z += lightContrib_z * k; + } } } + + // Gamma correct + // These pows are pretty slow right now, but we can do + // something faster if really necessary to squeeze every + // last bit of performance out of it + float gamma = 1.0 / 2.2f; + lit_x = pow(clamp(lit_x, 0.0f, 1.0f), gamma); + lit_y = pow(clamp(lit_y, 0.0f, 1.0f), gamma); + lit_z = pow(clamp(lit_z, 0.0f, 1.0f), gamma); + + framebuffer_r[gBufferOffset] = Float32ToUnorm8(lit_x); + framebuffer_g[gBufferOffset] = Float32ToUnorm8(lit_y); + framebuffer_b[gBufferOffset] = Float32ToUnorm8(lit_z); } - - // Gamma correct - // These pows are pretty slow right now, but we can do - // something faster if really necessary to squeeze every - // last bit of performance out of it - float gamma = 1.0 / 2.2f; - lit_x = pow(clamp(lit_x, 0.0f, 1.0f), gamma); - lit_y = pow(clamp(lit_y, 0.0f, 1.0f), gamma); - lit_z = pow(clamp(lit_z, 0.0f, 1.0f), gamma); - - framebuffer_r[gBufferOffset] = Float32ToUnorm8(lit_x); - framebuffer_g[gBufferOffset] = Float32ToUnorm8(lit_y); - framebuffer_b[gBufferOffset] = Float32ToUnorm8(lit_z); } } }