diff --git a/builtins/target-nvptx64.ll b/builtins/target-nvptx64.ll index 85b5ead2..21ddf1bb 100644 --- a/builtins/target-nvptx64.ll +++ b/builtins/target-nvptx64.ll @@ -337,8 +337,20 @@ define float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline ret float %ret } -declare @__rcp_varying_float() nounwind readnone -declare @__rsqrt_varying_float() nounwind readnone +define @__rcp_varying_float() nounwind readnone alwaysinline +{ + %v = extractelement <1 x float> %0, i32 0 + %r = call float @__rcp_uniform_float(float %v) + %rv = insertelement <1 x float> undef, float %r, i32 0 + ret %rv +} +define @__rsqrt_varying_float() nounwind readnone alwaysinline +{ + %v = extractelement <1 x float> %0, i32 0 + %r = call float @__rsqrt_uniform_float(float %v) + %rv = insertelement <1 x float> undef, float %r, i32 0 + ret %rv +} define @__sqrt_varying_float() nounwind readnone alwaysinline { %v = extractelement <1 x float> %0, i32 0 @@ -397,19 +409,19 @@ define i64 @__movmsk(<1 x i1>) nounwind readnone alwaysinline { define i1 @__any(<1 x i1>) nounwind readnone alwaysinline { %v = extractelement <1 x i1> %0, i32 0 -; %cmp = icmp ne i1 %v, 0 - ret i1 %v + %cmp = icmp ne i1 %v, 0 + ret i1 %cmp } define i1 @__all(<1 x i1>) nounwind readnone alwaysinline { %v = extractelement <1 x i1> %0, i32 0 -; %cmp = icmp eq i1 %v, 1 - ret i1 %v + %cmp = icmp eq i1 %v, 1 + ret i1 %cmp } define i1 @__none(<1 x i1>) nounwind readnone alwaysinline { %v = extractelement <1 x i1> %0, i32 0 - %cmp = xor i1 %v, 1; ;icmp eq i1 %v, 0 + %cmp = icmp eq i1 %v, 0 ret i1 %cmp } diff --git a/examples_cuda/deferred/deferred-ispc-static.ppm b/examples_cuda/deferred/deferred-ispc-static.ppm deleted file mode 100644 index 6236e266..00000000 Binary files a/examples_cuda/deferred/deferred-ispc-static.ppm and /dev/null differ diff --git a/examples_cuda/deferred/deferred.h b/examples_cuda/deferred/deferred.h index 5e814ca5..485315a8 100644 --- a/examples_cuda/deferred/deferred.h +++ b/examples_cuda/deferred/deferred.h @@ -35,7 +35,7 @@ #define DEFERRED_H // Currently tile widths must be a multiple of SIMD width (i.e. 8 for ispc sse4x2)! -#define MIN_TILE_WIDTH 16 +#define MIN_TILE_WIDTH 64 #define MIN_TILE_HEIGHT 16 #define MAX_LIGHTS 1024 diff --git a/examples_cuda/deferred/deferred_shading b/examples_cuda/deferred/deferred_shading deleted file mode 100755 index 8cecdec6..00000000 Binary files a/examples_cuda/deferred/deferred_shading and /dev/null differ diff --git a/examples_cuda/deferred/kernels.cu b/examples_cuda/deferred/kernels.cu new file mode 100644 index 00000000..e977bf72 --- /dev/null +++ b/examples_cuda/deferred/kernels.cu @@ -0,0 +1,650 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + + +#include "deferred.h" +#include +#include + +#define programCount 32 +#define programIndex (threadIdx.x & 31) +#define taskIndex (blockIdx.x*4 + (threadIdx.x >> 5)) +#define taskCount (gridDim.x*4) +#define warpIdx (threadIdx.x >> 5) + +#define int32 int +#define int16 short +#define int8 char + +__device__ static inline float clamp(float v, float low, float high) +{ + return min(max(v, low), high); +} + +struct InputDataArrays +{ + float *zBuffer; + unsigned int16 *normalEncoded_x; // half float + unsigned int16 *normalEncoded_y; // half float + unsigned int16 *specularAmount; // half float + unsigned int16 *specularPower; // half float + unsigned int8 *albedo_x; // unorm8 + unsigned int8 *albedo_y; // unorm8 + unsigned int8 *albedo_z; // unorm8 + float *lightPositionView_x; + float *lightPositionView_y; + float *lightPositionView_z; + float *lightAttenuationBegin; + float *lightColor_x; + float *lightColor_y; + float *lightColor_z; + float *lightAttenuationEnd; +}; + +struct InputHeader +{ + float cameraProj[4][4]; + float cameraNear; + float cameraFar; + + int32 framebufferWidth; + int32 framebufferHeight; + int32 numLights; + int32 inputDataChunkSize; + int32 inputDataArrayOffsets[idaNum]; +}; + + +/////////////////////////////////////////////////////////////////////////// +// Common utility routines + +__device__ +static inline float +dot3(float x, float y, float z, float a, float b, float c) { + return (x*a + y*b + z*c); +} + + +#if 0 +template +struct Uniform +{ + T data[(N-1)/programCount+1]; + + __device__ inline const T& operator[](const int i) const + { + const int laneIdx = i & (programCount-1); + const int chunkIdx = i >> 5; + return __shfl(data[chunkIdx], laneIdx); + } +} +#endif + + +__device__ +static inline void +normalize3(float x, float y, float z, float &ox, float &oy, float &oz) { + float n = rsqrt(x*x + y*y + z*z); + ox = x * n; + oy = y * n; + oz = z * n; +} + +__device__ inline +static float reduce_min(float value) +{ +#pragma unroll + for (int i = 4; i >=0; i--) + value = min(value, __shfl_xor(value, 1<=0; i--) + value = max(value, __shfl_xor(value, 1<=0; i--) + value += __shfl_xor(value, 1<= tileEndX) break; + // Unproject depth buffer Z value into view space + float z = zBuffer[y * gBufferWidth + x]; + float viewSpaceZ = cameraProj_43 / (z - cameraProj_33); + + // Work out Z bounds for our samples + // Avoid considering skybox/background or otherwise invalid pixels + if ((viewSpaceZ < cameraFar) && (viewSpaceZ >= cameraNear)) { + laneMinZ = min(laneMinZ, viewSpaceZ); + laneMaxZ = max(laneMaxZ, viewSpaceZ); + } + } + } + minZ = reduce_min(laneMinZ); + maxZ = reduce_max(laneMaxZ); +} + + +__device__ +static inline int32 +IntersectLightsWithTileMinMax( + int32 tileStartX, int32 tileEndX, + int32 tileStartY, int32 tileEndY, + // Tile data + float minZ, + float maxZ, + // G-buffer data + int32 gBufferWidth, int32 gBufferHeight, + // Camera data + float cameraProj_11, float cameraProj_22, + // Light Data + int32 numLights, + float light_positionView_x_array[], + float light_positionView_y_array[], + float light_positionView_z_array[], + float light_attenuationEnd_array[], + // Output + volatile int32 tileLightIndices[] + ) +{ + float gBufferScale_x = 0.5f * (float)gBufferWidth; + float gBufferScale_y = 0.5f * (float)gBufferHeight; + + float frustumPlanes_xy[4] = { + -(cameraProj_11 * gBufferScale_x), + (cameraProj_11 * gBufferScale_x), + (cameraProj_22 * gBufferScale_y), + -(cameraProj_22 * gBufferScale_y) }; + float frustumPlanes_z[4] = { + tileEndX - gBufferScale_x, + -tileStartX + gBufferScale_x, + tileEndY - gBufferScale_y, + -tileStartY + gBufferScale_y }; + + for ( int i = 0; i < 4; ++i) { + float norm = rsqrt(frustumPlanes_xy[i] * frustumPlanes_xy[i] + + frustumPlanes_z[i] * frustumPlanes_z[i]); + frustumPlanes_xy[i] *= norm; + frustumPlanes_z[i] *= norm; + } + + int32 tileNumLights = 0; + + for ( int lightIndexB = 0; lightIndexB < numLights; lightIndexB += programCount) + { + const int lightIndex = lightIndexB + programIndex; + + float light_positionView_z = light_positionView_z_array[lightIndex]; + float light_attenuationEnd = light_attenuationEnd_array[lightIndex]; + float light_attenuationEndNeg = -light_attenuationEnd; + + float d = light_positionView_z - minZ; + bool inFrustum = (d >= light_attenuationEndNeg); + + d = maxZ - light_positionView_z; + inFrustum = inFrustum && (d >= light_attenuationEndNeg); + + // This seems better than cif(!inFrustum) ccontinue; here since we + // don't actually need to mask the rest of this function - this is + // just a greedy early-out. Could also structure all of this as + // nested if() statements, but this a bit easier to read + int active = 0; + if ((inFrustum)) { + float light_positionView_x = light_positionView_x_array[lightIndex]; + float light_positionView_y = light_positionView_y_array[lightIndex]; + + d = light_positionView_z * frustumPlanes_z[0] + + light_positionView_x * frustumPlanes_xy[0]; + inFrustum = inFrustum && (d >= light_attenuationEndNeg); + + d = light_positionView_z * frustumPlanes_z[1] + + light_positionView_x * frustumPlanes_xy[1]; + inFrustum = inFrustum && (d >= light_attenuationEndNeg); + + d = light_positionView_z * frustumPlanes_z[2] + + light_positionView_y * frustumPlanes_xy[2]; + inFrustum = inFrustum && (d >= light_attenuationEndNeg); + + d = light_positionView_z * frustumPlanes_z[3] + + light_positionView_y * frustumPlanes_xy[3]; + inFrustum = inFrustum && (d >= light_attenuationEndNeg); + + // Pack and store intersecting lights +#if 0 + if (inFrustum) { + tileNumLights += packed_store_active(&tileLightIndices[tileNumLights], + lightIndex); + } +#else + if (inFrustum) + { + active = 1; + } +#endif + } +#if 1 + if (lightIndex >= numLights) + active = 0; + +#if 0 + const int idx = tileNumLights + inclusive_scan_warp(active); + const int nactive = reduce_sum(active); +#else + const int2 res = warpBinExclusiveScan(active); + const int idx = tileNumLights + res.x; + const int nactive = res.y; +#endif + if (active) + tileLightIndices[idx] = lightIndex; + tileNumLights += nactive; +#endif + } + + return tileNumLights; +} + + +__device__ +static inline int32 +IntersectLightsWithTile( + int32 tileStartX, int32 tileEndX, + int32 tileStartY, int32 tileEndY, + int32 gBufferWidth, int32 gBufferHeight, + // G-buffer data + float zBuffer[], + // Camera data + float cameraProj_11, float cameraProj_22, + float cameraProj_33, float cameraProj_43, + float cameraNear, float cameraFar, + // Light Data + int32 numLights, + float light_positionView_x_array[], + float light_positionView_y_array[], + float light_positionView_z_array[], + float light_attenuationEnd_array[], + // Output + int32 tileLightIndices[] + ) +{ + float minZ, maxZ; + ComputeZBounds(tileStartX, tileEndX, tileStartY, tileEndY, + zBuffer, gBufferWidth, cameraProj_33, cameraProj_43, cameraNear, cameraFar, + minZ, maxZ); + + + int32 tileNumLights = IntersectLightsWithTileMinMax( + tileStartX, tileEndX, tileStartY, tileEndY, minZ, maxZ, + gBufferWidth, gBufferHeight, cameraProj_11, cameraProj_22, + MAX_LIGHTS, light_positionView_x_array, light_positionView_y_array, + light_positionView_z_array, light_attenuationEnd_array, + tileLightIndices); + + return tileNumLights; +} + + +__device__ +static inline void +ShadeTile( + int32 tileStartX, int32 tileEndX, + int32 tileStartY, int32 tileEndY, + int32 gBufferWidth, int32 gBufferHeight, + const InputDataArrays &inputData, + // Camera data + float cameraProj_11, float cameraProj_22, + float cameraProj_33, float cameraProj_43, + // Light list + volatile int32 tileLightIndices[], + int32 tileNumLights, + // UI + bool visualizeLightCount, + // Output + unsigned int8 framebuffer_r[], + unsigned int8 framebuffer_g[], + unsigned int8 framebuffer_b[] + ) +{ + if (tileNumLights == 0 || visualizeLightCount) { + unsigned int8 c = (unsigned int8)(min(tileNumLights << 2, 255)); + for ( int32 y = tileStartY; y < tileEndY; ++y) { + for ( int xb = tileStartX ; xb < tileEndX; xb += programCount) + { + const int x = xb + programIndex; + if (x >= tileEndX) continue; + int32 framebufferIndex = (y * gBufferWidth + x); + framebuffer_r[framebufferIndex] = c; + framebuffer_g[framebufferIndex] = c; + framebuffer_b[framebufferIndex] = c; + } + } + } else { + float twoOverGBufferWidth = 2.0f / gBufferWidth; + float twoOverGBufferHeight = 2.0f / gBufferHeight; + + for ( int32 y = tileStartY; y < tileEndY; ++y) { + float positionScreen_y = -(((0.5f + y) * twoOverGBufferHeight) - 1.f); + + for ( int xb = tileStartX ; xb < tileEndX; xb += programCount) + { + const int x = xb + programIndex; +// if (x >= tileEndX) break; + int32 gBufferOffset = y * gBufferWidth + x; + + // Reconstruct position and (negative) view vector from G-buffer + float surface_positionView_x, surface_positionView_y, surface_positionView_z; + float Vneg_x, Vneg_y, Vneg_z; + + float z = inputData.zBuffer[gBufferOffset]; + + // Compute screen/clip-space position + // NOTE: Mind DX11 viewport transform and pixel center! + float positionScreen_x = (0.5f + (float)(x)) * + twoOverGBufferWidth - 1.0f; + + // Unproject depth buffer Z value into view space + surface_positionView_z = cameraProj_43 / (z - cameraProj_33); + surface_positionView_x = positionScreen_x * surface_positionView_z / + cameraProj_11; + surface_positionView_y = positionScreen_y * surface_positionView_z / + cameraProj_22; + + // We actually end up with a vector pointing *at* the + // surface (i.e. the negative view vector) + normalize3(surface_positionView_x, surface_positionView_y, + surface_positionView_z, Vneg_x, Vneg_y, Vneg_z); + + // Reconstruct normal from G-buffer + float surface_normal_x, surface_normal_y, surface_normal_z; + float normal_x = __half2float(inputData.normalEncoded_x[gBufferOffset]); + float normal_y = __half2float(inputData.normalEncoded_y[gBufferOffset]); + + float f = (normal_x - normal_x * normal_x) + (normal_y - normal_y * normal_y); + float m = sqrt(4.0f * f - 1.0f); + + surface_normal_x = m * (4.0f * normal_x - 2.0f); + surface_normal_y = m * (4.0f * normal_y - 2.0f); + surface_normal_z = 3.0f - 8.0f * f; + + // Load other G-buffer parameters + float surface_specularAmount = + __half2float(inputData.specularAmount[gBufferOffset]); + float surface_specularPower = + __half2float(inputData.specularPower[gBufferOffset]); + float surface_albedo_x = Unorm8ToFloat32(inputData.albedo_x[gBufferOffset]); + float surface_albedo_y = Unorm8ToFloat32(inputData.albedo_y[gBufferOffset]); + float surface_albedo_z = Unorm8ToFloat32(inputData.albedo_z[gBufferOffset]); + + float lit_x = 0.0f; + float lit_y = 0.0f; + float lit_z = 0.0f; + for ( int32 tileLightIndex = 0; tileLightIndex < tileNumLights; + ++tileLightIndex) { + int32 lightIndex = tileLightIndices[tileLightIndex]; + + // Gather light data relevant to initial culling + float light_positionView_x = + inputData.lightPositionView_x[lightIndex]; + float light_positionView_y = + inputData.lightPositionView_y[lightIndex]; + float light_positionView_z = + inputData.lightPositionView_z[lightIndex]; + float light_attenuationEnd = + inputData.lightAttenuationEnd[lightIndex]; + + // Compute light vector + float L_x = light_positionView_x - surface_positionView_x; + float L_y = light_positionView_y - surface_positionView_y; + float L_z = light_positionView_z - surface_positionView_z; + + float distanceToLight2 = dot3(L_x, L_y, L_z, L_x, L_y, L_z); + + // Clip at end of attenuation + float light_attenutaionEnd2 = light_attenuationEnd * light_attenuationEnd; + + if (distanceToLight2 < light_attenutaionEnd2) { + float distanceToLight = sqrt(distanceToLight2); + + // HLSL "rcp" is allowed to be fairly inaccurate + float distanceToLightRcp = 1.0f/distanceToLight; + L_x *= distanceToLightRcp; + L_y *= distanceToLightRcp; + L_z *= distanceToLightRcp; + + // Start computing brdf + float NdotL = dot3(surface_normal_x, surface_normal_y, + surface_normal_z, L_x, L_y, L_z); + + // Clip back facing + if (NdotL > 0.0f) { + float light_attenuationBegin = + inputData.lightAttenuationBegin[lightIndex]; + + // Light distance attenuation (linstep) + float lightRange = (light_attenuationEnd - light_attenuationBegin); + float falloffPosition = (light_attenuationEnd - distanceToLight); + float attenuation = min(falloffPosition / lightRange, 1.0f); + + float H_x = (L_x - Vneg_x); + float H_y = (L_y - Vneg_y); + float H_z = (L_z - Vneg_z); + normalize3(H_x, H_y, H_z, H_x, H_y, H_z); + + float NdotH = dot3(surface_normal_x, surface_normal_y, + surface_normal_z, H_x, H_y, H_z); + NdotH = max(NdotH, 0.0f); + + float specular = pow(NdotH, surface_specularPower); + float specularNorm = (surface_specularPower + 2.0f) * + (1.0f / 8.0f); + float specularContrib = surface_specularAmount * + specularNorm * specular; + + float k = attenuation * NdotL * (1.0f + specularContrib); + + float light_color_x = inputData.lightColor_x[lightIndex]; + float light_color_y = inputData.lightColor_y[lightIndex]; + float light_color_z = inputData.lightColor_z[lightIndex]; + + float lightContrib_x = surface_albedo_x * light_color_x; + float lightContrib_y = surface_albedo_y * light_color_y; + float lightContrib_z = surface_albedo_z * light_color_z; + + lit_x += lightContrib_x * k; + lit_y += lightContrib_y * k; + lit_z += lightContrib_z * k; + } + } + } + + // Gamma correct + // These pows are pretty slow right now, but we can do + // something faster if really necessary to squeeze every + // last bit of performance out of it + float gamma = 1.0 / 2.2f; + lit_x = pow(clamp(lit_x, 0.0f, 1.0f), gamma); + lit_y = pow(clamp(lit_y, 0.0f, 1.0f), gamma); + lit_z = pow(clamp(lit_z, 0.0f, 1.0f), gamma); + + framebuffer_r[gBufferOffset] = Float32ToUnorm8(lit_x); + framebuffer_g[gBufferOffset] = Float32ToUnorm8(lit_y); + framebuffer_b[gBufferOffset] = Float32ToUnorm8(lit_z); + } + } + } +} + + +/////////////////////////////////////////////////////////////////////////// +// Static decomposition + +extern "C" __global__ void +RenderTile( int num_groups_x, int num_groups_y, + const InputHeader *inputHeaderPtr, + const InputDataArrays *inputDataPtr, + int visualizeLightCount, + // Output + unsigned int8 framebuffer_r[], + unsigned int8 framebuffer_g[], + unsigned int8 framebuffer_b[]) { + if (taskIndex >= taskCount) return; + + const InputHeader inputHeader = *inputHeaderPtr; + const InputDataArrays inputData = *inputDataPtr; + int32 group_y = taskIndex / num_groups_x; + int32 group_x = taskIndex % num_groups_x; + + int32 tile_start_x = group_x * MIN_TILE_WIDTH; + int32 tile_start_y = group_y * MIN_TILE_HEIGHT; + int32 tile_end_x = tile_start_x + MIN_TILE_WIDTH; + int32 tile_end_y = tile_start_y + MIN_TILE_HEIGHT; + + int framebufferWidth = inputHeader.framebufferWidth; + int framebufferHeight = inputHeader.framebufferHeight; + float cameraProj_00 = inputHeader.cameraProj[0][0]; + float cameraProj_11 = inputHeader.cameraProj[1][1]; + float cameraProj_22 = inputHeader.cameraProj[2][2]; + float cameraProj_32 = inputHeader.cameraProj[3][2]; + + // Light intersection: figure out which lights illuminate this tile. +#if 0 + int tileLightIndices[MAX_LIGHTS]; // Light list for the tile +#else + __shared__ int tileLightIndicesFull[4*MAX_LIGHTS]; // Light list for the tile + int *tileLightIndices = &tileLightIndicesFull[warpIdx*MAX_LIGHTS]; +#endif + int numTileLights = + IntersectLightsWithTile(tile_start_x, tile_end_x, + tile_start_y, tile_end_y, + framebufferWidth, framebufferHeight, + inputData.zBuffer, + cameraProj_00, cameraProj_11, + cameraProj_22, cameraProj_32, + inputHeader.cameraNear, inputHeader.cameraFar, + MAX_LIGHTS, + inputData.lightPositionView_x, + inputData.lightPositionView_y, + inputData.lightPositionView_z, + inputData.lightAttenuationEnd, + tileLightIndices); + + // And now shade the tile, using the lights in tileLightIndices + ShadeTile(tile_start_x, tile_end_x, tile_start_y, tile_end_y, + framebufferWidth, framebufferHeight, inputData, + cameraProj_00, cameraProj_11, cameraProj_22, cameraProj_32, + tileLightIndices, numTileLights, visualizeLightCount, + framebuffer_r, framebuffer_g, framebuffer_b); +} + + diff --git a/examples_cuda/deferred/kernels.ispc b/examples_cuda/deferred/kernels.ispc index 6d2a8cc9..80b70ed4 100644 --- a/examples_cuda/deferred/kernels.ispc +++ b/examples_cuda/deferred/kernels.ispc @@ -472,6 +472,7 @@ RenderTile(uniform int num_groups_x, uniform int num_groups_y, uniform float cameraProj_22 = inputHeader.cameraProj[2][2]; uniform float cameraProj_32 = inputHeader.cameraProj[3][2]; + // Light intersection: figure out which lights illuminate this tile. uniform int tileLightIndices[MAX_LIGHTS]; // Light list for the tile uniform int numTileLights = @@ -489,6 +490,7 @@ RenderTile(uniform int num_groups_x, uniform int num_groups_y, inputData.lightAttenuationEnd, tileLightIndices); + // And now shade the tile, using the lights in tileLightIndices ShadeTile(tile_start_x, tile_end_x, tile_start_y, tile_end_y, framebufferWidth, framebufferHeight, inputData, @@ -519,154 +521,3 @@ RenderStatic(uniform InputHeader &inputHeader, framebuffer_r, framebuffer_g, framebuffer_b); } - -/////////////////////////////////////////////////////////////////////////// -// Routines for dynamic decomposition path - -// This computes the z min/max range for a whole row worth of tiles. -export void -ComputeZBoundsRow( - uniform int32 tileY, - uniform int32 tileWidth, uniform int32 tileHeight, - uniform int32 numTilesX, uniform int32 numTilesY, - // G-buffer data - uniform float zBuffer[], - uniform int32 gBufferWidth, - // Camera data - uniform float cameraProj_33, uniform float cameraProj_43, - uniform float cameraNear, uniform float cameraFar, - // Output - uniform float minZArray[], - uniform float maxZArray[] - ) -{ - for (uniform int32 tileX = 0; tileX < numTilesX; ++tileX) { - uniform float minZ, maxZ; - ComputeZBounds( - tileX * tileWidth, tileX * tileWidth + tileWidth, - tileY * tileHeight, tileY * tileHeight + tileHeight, - zBuffer, gBufferWidth, - cameraProj_33, cameraProj_43, cameraNear, cameraFar, - minZ, maxZ); - minZArray[tileX] = minZ; - maxZArray[tileX] = maxZ; - } -} - - -// Reclassifies the lights with respect to four sub-tiles when we refine a tile. -// numLights need not be a multiple of programCount here, but the input and output arrays -// should be able to handle programCount-sized load/stores. -export void -SplitTileMinMax( - uniform int32 tileMidX, uniform int32 tileMidY, - // Subtile data (00, 10, 01, 11) - uniform float subtileMinZ[], - uniform float subtileMaxZ[], - // G-buffer data - uniform int32 gBufferWidth, uniform int32 gBufferHeight, - // Camera data - uniform float cameraProj_11, uniform float cameraProj_22, - // Light Data - uniform int32 lightIndices[], - uniform int32 numLights, - uniform float light_positionView_x_array[], - uniform float light_positionView_y_array[], - uniform float light_positionView_z_array[], - uniform float light_attenuationEnd_array[], - // Outputs - uniform int32 subtileIndices[], - uniform int32 subtileIndicesPitch, - uniform int32 subtileNumLights[] - ) -{ - uniform float gBufferScale_x = 0.5f * (float)gBufferWidth; - uniform float gBufferScale_y = 0.5f * (float)gBufferHeight; - - uniform float frustumPlanes_xy[2] = { -(cameraProj_11 * gBufferScale_x), - (cameraProj_22 * gBufferScale_y) }; - uniform float frustumPlanes_z[2] = { tileMidX - gBufferScale_x, - tileMidY - gBufferScale_y }; - - // Normalize - uniform float norm[2] = { rsqrt(frustumPlanes_xy[0] * frustumPlanes_xy[0] + - frustumPlanes_z[0] * frustumPlanes_z[0]), - rsqrt(frustumPlanes_xy[1] * frustumPlanes_xy[1] + - frustumPlanes_z[1] * frustumPlanes_z[1]) }; - frustumPlanes_xy[0] *= norm[0]; - frustumPlanes_xy[1] *= norm[1]; - frustumPlanes_z[0] *= norm[0]; - frustumPlanes_z[1] *= norm[1]; - - // Initialize - uniform int32 subtileLightOffset[4]; - subtileLightOffset[0] = 0 * subtileIndicesPitch; - subtileLightOffset[1] = 1 * subtileIndicesPitch; - subtileLightOffset[2] = 2 * subtileIndicesPitch; - subtileLightOffset[3] = 3 * subtileIndicesPitch; - - foreach (i = 0 ... numLights) { - int32 lightIndex = lightIndices[i]; - - float light_positionView_x = light_positionView_x_array[lightIndex]; - float light_positionView_y = light_positionView_y_array[lightIndex]; - float light_positionView_z = light_positionView_z_array[lightIndex]; - float light_attenuationEnd = light_attenuationEnd_array[lightIndex]; - float light_attenuationEndNeg = -light_attenuationEnd; - - // Test lights again subtile z bounds - bool inFrustum[4]; - inFrustum[0] = (light_positionView_z - subtileMinZ[0] >= light_attenuationEndNeg) && - (subtileMaxZ[0] - light_positionView_z >= light_attenuationEndNeg); - inFrustum[1] = (light_positionView_z - subtileMinZ[1] >= light_attenuationEndNeg) && - (subtileMaxZ[1] - light_positionView_z >= light_attenuationEndNeg); - inFrustum[2] = (light_positionView_z - subtileMinZ[2] >= light_attenuationEndNeg) && - (subtileMaxZ[2] - light_positionView_z >= light_attenuationEndNeg); - inFrustum[3] = (light_positionView_z - subtileMinZ[3] >= light_attenuationEndNeg) && - (subtileMaxZ[3] - light_positionView_z >= light_attenuationEndNeg); - - float dx = light_positionView_z * frustumPlanes_z[0] + - light_positionView_x * frustumPlanes_xy[0]; - float dy = light_positionView_z * frustumPlanes_z[1] + - light_positionView_y * frustumPlanes_xy[1]; - - cif (abs(dx) > light_attenuationEnd) { - bool positiveX = dx > 0.0f; - inFrustum[0] = inFrustum[0] && positiveX; // 00 subtile - inFrustum[1] = inFrustum[1] && !positiveX; // 10 subtile - inFrustum[2] = inFrustum[2] && positiveX; // 01 subtile - inFrustum[3] = inFrustum[3] && !positiveX; // 11 subtile - } - cif (abs(dy) > light_attenuationEnd) { - bool positiveY = dy > 0.0f; - inFrustum[0] = inFrustum[0] && positiveY; // 00 subtile - inFrustum[1] = inFrustum[1] && positiveY; // 10 subtile - inFrustum[2] = inFrustum[2] && !positiveY; // 01 subtile - inFrustum[3] = inFrustum[3] && !positiveY; // 11 subtile - } - - // Pack and store intersecting lights - // TODO: Experiment with a loop here instead - cif (inFrustum[0]) - subtileLightOffset[0] += - packed_store_active(&subtileIndices[subtileLightOffset[0]], - lightIndex); - cif (inFrustum[1]) - subtileLightOffset[1] += - packed_store_active(&subtileIndices[subtileLightOffset[1]], - lightIndex); - cif (inFrustum[2]) - subtileLightOffset[2] += - packed_store_active(&subtileIndices[subtileLightOffset[2]], - lightIndex); - cif (inFrustum[3]) - subtileLightOffset[3] += - packed_store_active(&subtileIndices[subtileLightOffset[3]], - lightIndex); - } - - subtileNumLights[0] = subtileLightOffset[0] - 0 * subtileIndicesPitch; - subtileNumLights[1] = subtileLightOffset[1] - 1 * subtileIndicesPitch; - subtileNumLights[2] = subtileLightOffset[2] - 2 * subtileIndicesPitch; - subtileNumLights[3] = subtileLightOffset[3] - 3 * subtileIndicesPitch; -} diff --git a/examples_cuda/deferred/kernels1.ispc b/examples_cuda/deferred/kernels1.ispc new file mode 100644 index 00000000..83f785b1 --- /dev/null +++ b/examples_cuda/deferred/kernels1.ispc @@ -0,0 +1,557 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifdef __NVPTX__ +#warning "emitting DEVICE code" +#define programCount warpSize() +#define programIndex laneIndex() +#define taskIndex blockIndex0() +#define taskCount blockCount0() +#else +#warning "emitting HOST code" +#endif + + +#include "deferred.h" + +struct InputDataArrays +{ + float *zBuffer; + unsigned int16 *normalEncoded_x; // half float + unsigned int16 *normalEncoded_y; // half float + unsigned int16 *specularAmount; // half float + unsigned int16 *specularPower; // half float + unsigned int8 *albedo_x; // unorm8 + unsigned int8 *albedo_y; // unorm8 + unsigned int8 *albedo_z; // unorm8 + float *lightPositionView_x; + float *lightPositionView_y; + float *lightPositionView_z; + float *lightAttenuationBegin; + float *lightColor_x; + float *lightColor_y; + float *lightColor_z; + float *lightAttenuationEnd; +}; + +struct InputHeader +{ + float cameraProj[4][4]; + float cameraNear; + float cameraFar; + + int32 framebufferWidth; + int32 framebufferHeight; + int32 numLights; + int32 inputDataChunkSize; + int32 inputDataArrayOffsets[idaNum]; +}; + + +/////////////////////////////////////////////////////////////////////////// +// Common utility routines + +static inline float +dot3(float x, float y, float z, float a, float b, float c) { + return (x*a + y*b + z*c); +} + + +static inline void +normalize3(float x, float y, float z, float &ox, float &oy, float &oz) { + float n = rsqrt(x*x + y*y + z*z); + ox = x * n; + oy = y * n; + oz = z * n; +} + + +static inline float +Unorm8ToFloat32(unsigned int8 u) { + return (float)u * (1.0f / 255.0f); +} + + +static inline unsigned int8 +Float32ToUnorm8(float f) { + return (unsigned int8)(f * 255.0f); +} + + +static inline void +ComputeZBounds( + uniform int32 tileStartX, uniform int32 tileEndX, + uniform int32 tileStartY, uniform int32 tileEndY, + // G-buffer data + uniform float zBuffer[], + uniform int32 gBufferWidth, + // Camera data + uniform float cameraProj_33, uniform float cameraProj_43, + uniform float cameraNear, uniform float cameraFar, + // Output + uniform float &minZ, + uniform float &maxZ + ) +{ + // Find Z bounds + float laneMinZ = cameraFar; + float laneMaxZ = cameraNear; + for (uniform int32 y = tileStartY; y < tileEndY; ++y) { +// foreach (x = tileStartX ... tileEndX) { + for (uniform int xb = tileStartX; xb < tileEndX; xb += programCount) + { + const int x = xb + programIndex; + if (x >= tileEndX) continue; + // Unproject depth buffer Z value into view space + float z = zBuffer[y * gBufferWidth + x]; + float viewSpaceZ = cameraProj_43 / (z - cameraProj_33); + + // Work out Z bounds for our samples + // Avoid considering skybox/background or otherwise invalid pixels + if ((viewSpaceZ < cameraFar) && (viewSpaceZ >= cameraNear)) { + laneMinZ = min(laneMinZ, viewSpaceZ); + laneMaxZ = max(laneMaxZ, viewSpaceZ); + } + } + } + minZ = reduce_min(laneMinZ); + maxZ = reduce_max(laneMaxZ); +} + + +static inline uniform int32 +IntersectLightsWithTileMinMax( + uniform int32 tileStartX, uniform int32 tileEndX, + uniform int32 tileStartY, uniform int32 tileEndY, + // Tile data + uniform float minZ, + uniform float maxZ, + // G-buffer data + uniform int32 gBufferWidth, uniform int32 gBufferHeight, + // Camera data + uniform float cameraProj_11, uniform float cameraProj_22, + // Light Data + uniform int32 numLights, + uniform float light_positionView_x_array[], + uniform float light_positionView_y_array[], + uniform float light_positionView_z_array[], + uniform float light_attenuationEnd_array[], + // Output + uniform int32 tileLightIndices[] + ) +{ + uniform float gBufferScale_x = 0.5f * (float)gBufferWidth; + uniform float gBufferScale_y = 0.5f * (float)gBufferHeight; + + uniform float frustumPlanes_xy[4] = { + -(cameraProj_11 * gBufferScale_x), + (cameraProj_11 * gBufferScale_x), + (cameraProj_22 * gBufferScale_y), + -(cameraProj_22 * gBufferScale_y) }; + uniform float frustumPlanes_z[4] = { + tileEndX - gBufferScale_x, + -tileStartX + gBufferScale_x, + tileEndY - gBufferScale_y, + -tileStartY + gBufferScale_y }; + + for (uniform int i = 0; i < 4; ++i) { + uniform float norm = rsqrt(frustumPlanes_xy[i] * frustumPlanes_xy[i] + + frustumPlanes_z[i] * frustumPlanes_z[i]); + frustumPlanes_xy[i] *= norm; + frustumPlanes_z[i] *= norm; + } + + uniform int32 tileNumLights = 0; + +// foreach (lightIndex = 0 ... numLights) { + for (uniform int lightIndexB = 0; lightIndexB < numLights; lightIndexB += programCount) + { + const int lightIndex = lightIndexB + programIndex; + if (lightIndex >= numLights) continue; + + float light_positionView_z = light_positionView_z_array[lightIndex]; + float light_attenuationEnd = light_attenuationEnd_array[lightIndex]; + float light_attenuationEndNeg = -light_attenuationEnd; + + float d = light_positionView_z - minZ; + bool inFrustum = (d >= light_attenuationEndNeg); + + d = maxZ - light_positionView_z; + inFrustum = inFrustum && (d >= light_attenuationEndNeg); + + // This seems better than cif(!inFrustum) ccontinue; here since we + // don't actually need to mask the rest of this function - this is + // just a greedy early-out. Could also structure all of this as + // nested if() statements, but this a bit easier to read + if (any(inFrustum)) { + float light_positionView_x = light_positionView_x_array[lightIndex]; + float light_positionView_y = light_positionView_y_array[lightIndex]; + + d = light_positionView_z * frustumPlanes_z[0] + + light_positionView_x * frustumPlanes_xy[0]; + inFrustum = inFrustum && (d >= light_attenuationEndNeg); + + d = light_positionView_z * frustumPlanes_z[1] + + light_positionView_x * frustumPlanes_xy[1]; + inFrustum = inFrustum && (d >= light_attenuationEndNeg); + + d = light_positionView_z * frustumPlanes_z[2] + + light_positionView_y * frustumPlanes_xy[2]; + inFrustum = inFrustum && (d >= light_attenuationEndNeg); + + d = light_positionView_z * frustumPlanes_z[3] + + light_positionView_y * frustumPlanes_xy[3]; + inFrustum = inFrustum && (d >= light_attenuationEndNeg); + + // Pack and store intersecting lights + if (inFrustum) { + tileNumLights += packed_store_active(&tileLightIndices[tileNumLights], + lightIndex); + } + } + } + + return tileNumLights; +} + + +static inline uniform int32 +IntersectLightsWithTile( + uniform int32 tileStartX, uniform int32 tileEndX, + uniform int32 tileStartY, uniform int32 tileEndY, + uniform int32 gBufferWidth, uniform int32 gBufferHeight, + // G-buffer data + uniform float zBuffer[], + // Camera data + uniform float cameraProj_11, uniform float cameraProj_22, + uniform float cameraProj_33, uniform float cameraProj_43, + uniform float cameraNear, uniform float cameraFar, + // Light Data + uniform int32 numLights, + uniform float light_positionView_x_array[], + uniform float light_positionView_y_array[], + uniform float light_positionView_z_array[], + uniform float light_attenuationEnd_array[], + // Output + uniform int32 tileLightIndices[] + ) +{ + uniform float minZ, maxZ; + ComputeZBounds(tileStartX, tileEndX, tileStartY, tileEndY, + zBuffer, gBufferWidth, cameraProj_33, cameraProj_43, cameraNear, cameraFar, + minZ, maxZ); + + uniform int32 tileNumLights = IntersectLightsWithTileMinMax( + tileStartX, tileEndX, tileStartY, tileEndY, minZ, maxZ, + gBufferWidth, gBufferHeight, cameraProj_11, cameraProj_22, + MAX_LIGHTS, light_positionView_x_array, light_positionView_y_array, + light_positionView_z_array, light_attenuationEnd_array, + tileLightIndices); + + return tileNumLights; +} + + +static inline void +ShadeTile( + uniform int32 tileStartX, uniform int32 tileEndX, + uniform int32 tileStartY, uniform int32 tileEndY, + uniform int32 gBufferWidth, uniform int32 gBufferHeight, + const uniform InputDataArrays &inputData, + // Camera data + uniform float cameraProj_11, uniform float cameraProj_22, + uniform float cameraProj_33, uniform float cameraProj_43, + // Light list + uniform int32 tileLightIndices[], + uniform int32 tileNumLights, + // UI + uniform bool visualizeLightCount, + // Output + uniform unsigned int8 framebuffer_r[], + uniform unsigned int8 framebuffer_g[], + uniform unsigned int8 framebuffer_b[] + ) +{ + if (tileNumLights == 0 || visualizeLightCount) { + uniform unsigned int8 c = (unsigned int8)(min(tileNumLights << 2, 255)); + for (uniform int32 y = tileStartY; y < tileEndY; ++y) { +// foreach (x = tileStartX ... tileEndX) + for (uniform int xb = tileStartX ; xb < tileEndX; xb += programCount) + { + const int x = xb + programIndex; + if (x >= tileEndX) continue; + int32 framebufferIndex = (y * gBufferWidth + x); + framebuffer_r[framebufferIndex] = c; + framebuffer_g[framebufferIndex] = c; + framebuffer_b[framebufferIndex] = c; + } + } + } else { + uniform float twoOverGBufferWidth = 2.0f / gBufferWidth; + uniform float twoOverGBufferHeight = 2.0f / gBufferHeight; + + for (uniform int32 y = tileStartY; y < tileEndY; ++y) { + uniform float positionScreen_y = -(((0.5f + y) * twoOverGBufferHeight) - 1.f); + +// foreach (x = tileStartX ... tileEndX) { + for (uniform int xb = tileStartX ; xb < tileEndX; xb += programCount) + { + const int x = xb + programIndex; + int32 gBufferOffset = y * gBufferWidth + x; + + // Reconstruct position and (negative) view vector from G-buffer + float surface_positionView_x, surface_positionView_y, surface_positionView_z; + float Vneg_x, Vneg_y, Vneg_z; + + float z = inputData.zBuffer[gBufferOffset]; + + // Compute screen/clip-space position + // NOTE: Mind DX11 viewport transform and pixel center! + float positionScreen_x = (0.5f + (float)(x)) * + twoOverGBufferWidth - 1.0f; + + // Unproject depth buffer Z value into view space + surface_positionView_z = cameraProj_43 / (z - cameraProj_33); + surface_positionView_x = positionScreen_x * surface_positionView_z / + cameraProj_11; + surface_positionView_y = positionScreen_y * surface_positionView_z / + cameraProj_22; + + // We actually end up with a vector pointing *at* the + // surface (i.e. the negative view vector) + normalize3(surface_positionView_x, surface_positionView_y, + surface_positionView_z, Vneg_x, Vneg_y, Vneg_z); + + // Reconstruct normal from G-buffer + float surface_normal_x, surface_normal_y, surface_normal_z; + float normal_x = half_to_float(inputData.normalEncoded_x[gBufferOffset]); + float normal_y = half_to_float(inputData.normalEncoded_y[gBufferOffset]); + + float f = (normal_x - normal_x * normal_x) + (normal_y - normal_y * normal_y); + float m = sqrt(4.0f * f - 1.0f); + + surface_normal_x = m * (4.0f * normal_x - 2.0f); + surface_normal_y = m * (4.0f * normal_y - 2.0f); + surface_normal_z = 3.0f - 8.0f * f; + + // Load other G-buffer parameters + float surface_specularAmount = + half_to_float(inputData.specularAmount[gBufferOffset]); + float surface_specularPower = + half_to_float(inputData.specularPower[gBufferOffset]); + float surface_albedo_x = Unorm8ToFloat32(inputData.albedo_x[gBufferOffset]); + float surface_albedo_y = Unorm8ToFloat32(inputData.albedo_y[gBufferOffset]); + float surface_albedo_z = Unorm8ToFloat32(inputData.albedo_z[gBufferOffset]); + + float lit_x = 0.0f; + float lit_y = 0.0f; + float lit_z = 0.0f; + for (uniform int32 tileLightIndex = 0; tileLightIndex < tileNumLights; + ++tileLightIndex) { + uniform int32 lightIndex = tileLightIndices[tileLightIndex]; + + // Gather light data relevant to initial culling + uniform float light_positionView_x = + inputData.lightPositionView_x[lightIndex]; + uniform float light_positionView_y = + inputData.lightPositionView_y[lightIndex]; + uniform float light_positionView_z = + inputData.lightPositionView_z[lightIndex]; + uniform float light_attenuationEnd = + inputData.lightAttenuationEnd[lightIndex]; + + // Compute light vector + float L_x = light_positionView_x - surface_positionView_x; + float L_y = light_positionView_y - surface_positionView_y; + float L_z = light_positionView_z - surface_positionView_z; + + float distanceToLight2 = dot3(L_x, L_y, L_z, L_x, L_y, L_z); + + // Clip at end of attenuation + float light_attenutaionEnd2 = light_attenuationEnd * light_attenuationEnd; + + if (distanceToLight2 < light_attenutaionEnd2) { + float distanceToLight = sqrt(distanceToLight2); + + // HLSL "rcp" is allowed to be fairly inaccurate + float distanceToLightRcp = rcp(distanceToLight); + L_x *= distanceToLightRcp; + L_y *= distanceToLightRcp; + L_z *= distanceToLightRcp; + + // Start computing brdf + float NdotL = dot3(surface_normal_x, surface_normal_y, + surface_normal_z, L_x, L_y, L_z); + + // Clip back facing + if (NdotL > 0.0f) { + uniform float light_attenuationBegin = + inputData.lightAttenuationBegin[lightIndex]; + + // Light distance attenuation (linstep) + float lightRange = (light_attenuationEnd - light_attenuationBegin); + float falloffPosition = (light_attenuationEnd - distanceToLight); + float attenuation = min(falloffPosition / lightRange, 1.0f); + + float H_x = (L_x - Vneg_x); + float H_y = (L_y - Vneg_y); + float H_z = (L_z - Vneg_z); + normalize3(H_x, H_y, H_z, H_x, H_y, H_z); + + float NdotH = dot3(surface_normal_x, surface_normal_y, + surface_normal_z, H_x, H_y, H_z); + NdotH = max(NdotH, 0.0f); + + float specular = pow(NdotH, surface_specularPower); + float specularNorm = (surface_specularPower + 2.0f) * + (1.0f / 8.0f); + float specularContrib = surface_specularAmount * + specularNorm * specular; + + float k = attenuation * NdotL * (1.0f + specularContrib); + + uniform float light_color_x = inputData.lightColor_x[lightIndex]; + uniform float light_color_y = inputData.lightColor_y[lightIndex]; + uniform float light_color_z = inputData.lightColor_z[lightIndex]; + + float lightContrib_x = surface_albedo_x * light_color_x; + float lightContrib_y = surface_albedo_y * light_color_y; + float lightContrib_z = surface_albedo_z * light_color_z; + + lit_x += lightContrib_x * k; + lit_y += lightContrib_y * k; + lit_z += lightContrib_z * k; + } + } + } + + // Gamma correct + // These pows are pretty slow right now, but we can do + // something faster if really necessary to squeeze every + // last bit of performance out of it + float gamma = 1.0 / 2.2f; + lit_x = pow(clamp(lit_x, 0.0f, 1.0f), gamma); + lit_y = pow(clamp(lit_y, 0.0f, 1.0f), gamma); + lit_z = pow(clamp(lit_z, 0.0f, 1.0f), gamma); + + framebuffer_r[gBufferOffset] = Float32ToUnorm8(lit_x); + framebuffer_g[gBufferOffset] = Float32ToUnorm8(lit_y); + framebuffer_b[gBufferOffset] = Float32ToUnorm8(lit_z); + } + } + } +} + + +/////////////////////////////////////////////////////////////////////////// +// Static decomposition + +task void +RenderTile(uniform int num_groups_x, uniform int num_groups_y, + const uniform InputHeader inputHeaderPtr[], + const uniform InputDataArrays inputDataPtr[], + uniform int visualizeLightCount, + // Output + uniform unsigned int8 framebuffer_r[], + uniform unsigned int8 framebuffer_g[], + uniform unsigned int8 framebuffer_b[]) { + if (taskIndex >= taskCount) return; + + const uniform InputHeader inputHeader = *inputHeaderPtr; + const uniform InputDataArrays inputData = *inputDataPtr; + + uniform int32 group_y = taskIndex / num_groups_x; + uniform int32 group_x = taskIndex % num_groups_x; + uniform int32 tile_start_x = group_x * MIN_TILE_WIDTH; + uniform int32 tile_start_y = group_y * MIN_TILE_HEIGHT; + uniform int32 tile_end_x = tile_start_x + MIN_TILE_WIDTH; + uniform int32 tile_end_y = tile_start_y + MIN_TILE_HEIGHT; + + uniform int framebufferWidth = inputHeader.framebufferWidth; + uniform int framebufferHeight = inputHeader.framebufferHeight; + uniform float cameraProj_00 = inputHeader.cameraProj[0][0]; + uniform float cameraProj_11 = inputHeader.cameraProj[1][1]; + uniform float cameraProj_22 = inputHeader.cameraProj[2][2]; + uniform float cameraProj_32 = inputHeader.cameraProj[3][2]; + + // Light intersection: figure out which lights illuminate this tile. + uniform int tileLightIndices[MAX_LIGHTS]; // Light list for the tile + uniform int numTileLights = + IntersectLightsWithTile(tile_start_x, tile_end_x, + tile_start_y, tile_end_y, + framebufferWidth, framebufferHeight, + inputData.zBuffer, + cameraProj_00, cameraProj_11, + cameraProj_22, cameraProj_32, + inputHeader.cameraNear, inputHeader.cameraFar, + MAX_LIGHTS, + inputData.lightPositionView_x, + inputData.lightPositionView_y, + inputData.lightPositionView_z, + inputData.lightAttenuationEnd, + tileLightIndices); + + // And now shade the tile, using the lights in tileLightIndices + ShadeTile(tile_start_x, tile_end_x, tile_start_y, tile_end_y, + framebufferWidth, framebufferHeight, inputData, + cameraProj_00, cameraProj_11, cameraProj_22, cameraProj_32, + tileLightIndices, numTileLights, visualizeLightCount, + framebuffer_r, framebuffer_g, framebuffer_b); +} + + +export void +RenderStatic(uniform InputHeader inputHeaderPtr[], + uniform InputDataArrays inputDataPtr[], + uniform InputHeader &inputHeader, + uniform int visualizeLightCount, + // Output + uniform unsigned int8 framebuffer_r[], + uniform unsigned int8 framebuffer_g[], + uniform unsigned int8 framebuffer_b[]) { + + uniform int num_groups_x = (inputHeader.framebufferWidth + + MIN_TILE_WIDTH - 1) / MIN_TILE_WIDTH; + uniform int num_groups_y = (inputHeader.framebufferHeight + + MIN_TILE_HEIGHT - 1) / MIN_TILE_HEIGHT; + uniform int num_groups = num_groups_x * num_groups_y; + + // Launch a task to render each tile, each of which is MIN_TILE_WIDTH + // by MIN_TILE_HEIGHT pixels. + launch[num_groups] RenderTile(num_groups_x, num_groups_y, + inputHeaderPtr, inputDataPtr, visualizeLightCount, + framebuffer_r, framebuffer_g, framebuffer_b); +} + + + diff --git a/examples_cuda/deferred/main.cpp b/examples_cuda/deferred/main.cpp index 4f2be879..cdfc0591 100644 --- a/examples_cuda/deferred/main.cpp +++ b/examples_cuda/deferred/main.cpp @@ -59,6 +59,19 @@ #include "kernels_ispc.h" #include "../timing.h" +#include +static inline double rtc(void) +{ + struct timeval Tvalue; + double etime; + struct timezone dummy; + + gettimeofday(&Tvalue,&dummy); + etime = (double) Tvalue.tv_sec + + 1.e-6*((double) Tvalue.tv_usec); + return etime; +} + /////////////////////////////////////////////////////////////////////////// int main(int argc, char** argv) { @@ -76,27 +89,36 @@ int main(int argc, char** argv) { Framebuffer framebuffer(input->header.framebufferWidth, input->header.framebufferHeight); +#if 0 InitDynamicC(input); #ifdef __cilk InitDynamicCilk(input); #endif // __cilk +#endif + + const int buffsize = input->header.framebufferWidth*input->header.framebufferHeight; + for (int i = 0; i < buffsize; i++) + framebuffer.r[i] = framebuffer.g[i] = framebuffer.b[i] = 0; int nframes = 5; double ispcCycles = 1e30; for (int i = 0; i < 5; ++i) { framebuffer.clear(); - reset_and_start_timer(); + const double t0 = rtc(); for (int j = 0; j < nframes; ++j) ispc::RenderStatic(input->header, input->arrays, VISUALIZE_LIGHT_COUNT, framebuffer.r, framebuffer.g, framebuffer.b); - double mcycles = get_elapsed_mcycles() / nframes; + double mcycles = (rtc() - t0) / nframes; ispcCycles = std::min(ispcCycles, mcycles); } printf("[ispc static + tasks]:\t\t[%.3f] million cycles to render " "%d x %d image\n", ispcCycles, input->header.framebufferWidth, input->header.framebufferHeight); WriteFrame("deferred-ispc-static.ppm", input, framebuffer); + return 0; + +#if 0 #ifdef __cilk double dynamicCilkCycles = 1e30; @@ -132,6 +154,7 @@ int main(int argc, char** argv) { #else printf("\t\t\t\t(%.2fx speedup from ISPC + tasks)\n", serialCycles/ispcCycles); #endif // __cilk +#endif DeleteInputData(input); diff --git a/examples_cuda/deferred/main_cu.cpp b/examples_cuda/deferred/main_cu.cpp old mode 100644 new mode 100755 index 4f2be879..a9d9ed4f --- a/examples_cuda/deferred/main_cu.cpp +++ b/examples_cuda/deferred/main_cu.cpp @@ -59,6 +59,222 @@ #include "kernels_ispc.h" #include "../timing.h" +#include +static inline double rtc(void) +{ + struct timeval Tvalue; + double etime; + struct timezone dummy; + + gettimeofday(&Tvalue,&dummy); + etime = (double) Tvalue.tv_sec + + 1.e-6*((double) Tvalue.tv_usec); + return etime; +} +/******************************/ #include +#include +#include +#include "drvapi_error_string.h" + +#define checkCudaErrors(err) __checkCudaErrors (err, __FILE__, __LINE__) +// These are the inline versions for all of the SDK helper functions +void __checkCudaErrors(CUresult err, const char *file, const int line) { + if(CUDA_SUCCESS != err) { + std::cerr << "checkCudeErrors() Driver API error = " << err << "\"" + << getCudaDrvErrorString(err) << "\" from file <" << file + << ", line " << line << "\n"; + exit(-1); + } +} + +/**********************/ +/* Basic CUDriver API */ +CUcontext context; + +void createContext(const int deviceId = 0) +{ + CUdevice device; + int devCount; + checkCudaErrors(cuInit(0)); + checkCudaErrors(cuDeviceGetCount(&devCount)); + assert(devCount > 0); + checkCudaErrors(cuDeviceGet(&device, deviceId < devCount ? deviceId : 0)); + + char name[128]; + checkCudaErrors(cuDeviceGetName(name, 128, device)); + std::cout << "Using CUDA Device [0]: " << name << "\n"; + + int devMajor, devMinor; + checkCudaErrors(cuDeviceComputeCapability(&devMajor, &devMinor, device)); + std::cout << "Device Compute Capability: " + << devMajor << "." << devMinor << "\n"; + if (devMajor < 2) { + std::cerr << "ERROR: Device 0 is not SM 2.0 or greater\n"; + exit(1); + } + + // Create driver context + checkCudaErrors(cuCtxCreate(&context, 0, device)); +} +void destroyContext() +{ + checkCudaErrors(cuCtxDestroy(context)); +} + +CUmodule loadModule(const char * module) +{ + CUmodule cudaModule; + // in this branch we use compilation with parameters + + const unsigned int jitNumOptions = 1; + CUjit_option *jitOptions = new CUjit_option[jitNumOptions]; + void **jitOptVals = new void*[jitNumOptions]; + // set up pointer to set the Maximum # of registers for a particular kernel + jitOptions[0] = CU_JIT_MAX_REGISTERS; + int jitRegCount = 64; + jitOptVals[0] = (void *)(size_t)jitRegCount; +#if 0 + + // set up size of compilation log buffer + jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; + int jitLogBufferSize = 1024; + jitOptVals[0] = (void *)(size_t)jitLogBufferSize; + + // set up pointer to the compilation log buffer + jitOptions[1] = CU_JIT_INFO_LOG_BUFFER; + char *jitLogBuffer = new char[jitLogBufferSize]; + jitOptVals[1] = jitLogBuffer; + + // set up pointer to set the Maximum # of registers for a particular kernel + jitOptions[2] = CU_JIT_MAX_REGISTERS; + int jitRegCount = 32; + jitOptVals[2] = (void *)(size_t)jitRegCount; +#endif + + checkCudaErrors(cuModuleLoadDataEx(&cudaModule, module,jitNumOptions, jitOptions, (void **)jitOptVals)); + return cudaModule; +} +void unloadModule(CUmodule &cudaModule) +{ + checkCudaErrors(cuModuleUnload(cudaModule)); +} + +CUfunction getFunction(CUmodule &cudaModule, const char * function) +{ + CUfunction cudaFunction; + checkCudaErrors(cuModuleGetFunction(&cudaFunction, cudaModule, function)); + return cudaFunction; +} + +CUdeviceptr deviceMalloc(const size_t size) +{ + CUdeviceptr d_buf; + checkCudaErrors(cuMemAlloc(&d_buf, size)); + return d_buf; +} +void deviceFree(CUdeviceptr d_buf) +{ + checkCudaErrors(cuMemFree(d_buf)); +} +void memcpyD2H(void * h_buf, CUdeviceptr d_buf, const size_t size) +{ + checkCudaErrors(cuMemcpyDtoH(h_buf, d_buf, size)); +} +void memcpyH2D(CUdeviceptr d_buf, void * h_buf, const size_t size) +{ + checkCudaErrors(cuMemcpyHtoD(d_buf, h_buf, size)); +} +#define deviceLaunch(func,nbx,nby,nbz,params) \ + checkCudaErrors(cuFuncSetCacheConfig((func), CU_FUNC_CACHE_PREFER_EQUAL)); \ + checkCudaErrors( \ + cuLaunchKernel( \ + (func), \ + ((nbx-1)/(128/32)+1), (nby), (nbz), \ + 128, 1, 1, \ + 0, NULL, (params), NULL \ + )); + +typedef CUdeviceptr devicePtr; + + +/**************/ +#include +std::vector readBinary(const char * filename) +{ + std::vector buffer; + FILE *fp = fopen(filename, "rb"); + if (!fp ) + { + fprintf(stderr, "file %s not found\n", filename); + assert(0); + } +#if 0 + char c; + while ((c = fgetc(fp)) != EOF) + buffer.push_back(c); +#else + fseek(fp, 0, SEEK_END); + const unsigned long long size = ftell(fp); /*calc the size needed*/ + fseek(fp, 0, SEEK_SET); + buffer.resize(size); + + if (fp == NULL){ /*ERROR detection if file == empty*/ + fprintf(stderr, "Error: There was an Error reading the file %s \n",filename); + exit(1); + } + else if (fread(&buffer[0], sizeof(char), size, fp) != size){ /* if count of read bytes != calculated size of .bin file -> ERROR*/ + fprintf(stderr, "Error: There was an Error reading the file %s \n", filename); + exit(1); + } +#endif + fprintf(stderr, " read buffer of size= %d bytes \n", (int)buffer.size()); + return buffer; +} + +extern "C" +{ + + void *CUDAAlloc(void **handlePtr, int64_t size, int32_t alignment) + { + return NULL; + } + void CUDALaunch( + void **handlePtr, + const char * module_name, + const char * module_1, + const char * func_name, + void **func_args, + int countx, int county, int countz) + { + assert(module_name != NULL); + assert(module_1 != NULL); + assert(func_name != NULL); + assert(func_args != NULL); +#if 0 + const char * module = module_1; +#else + const std::vector module_str = readBinary("kernel.cubin"); + const char * module = &module_str[0]; +#endif + CUmodule cudaModule = loadModule(module); + CUfunction cudaFunction = getFunction(cudaModule, func_name); + deviceLaunch(cudaFunction, countx, county, countz, func_args); + unloadModule(cudaModule); + } + void CUDASync(void *handle) + { + checkCudaErrors(cuStreamSynchronize(0)); + } + void ISPCSync(void *handle) + { + checkCudaErrors(cuStreamSynchronize(0)); + } + void CUDAFree(void *handle) + { + } +} +/******************************/ + /////////////////////////////////////////////////////////////////////////// int main(int argc, char** argv) { @@ -76,27 +292,112 @@ int main(int argc, char** argv) { Framebuffer framebuffer(input->header.framebufferWidth, input->header.framebufferHeight); - InitDynamicC(input); +// InitDynamicC(input); +#if 0 #ifdef __cilk InitDynamicCilk(input); #endif // __cilk +#endif + + /*******************/ + createContext(); + /*******************/ + + devicePtr d_header = deviceMalloc(sizeof(ispc::InputHeader)); + devicePtr d_arrays = deviceMalloc(sizeof(ispc::InputDataArrays)); + const int buffsize = input->header.framebufferWidth*input->header.framebufferHeight; + devicePtr d_r = deviceMalloc(buffsize); + devicePtr d_g = deviceMalloc(buffsize); + devicePtr d_b = deviceMalloc(buffsize); + + for (int i = 0; i < buffsize; i++) + framebuffer.r[i] = framebuffer.g[i] = framebuffer.b[i] = 0; + + + ispc::InputDataArrays dh_arrays; + { + devicePtr d_chunk = deviceMalloc(input->header.inputDataChunkSize); + memcpyH2D(d_chunk, input->chunk, input->header.inputDataChunkSize); + + dh_arrays.zBuffer = (float*)(d_chunk + input->header.inputDataArrayOffsets[idaZBuffer]); + dh_arrays.normalEncoded_x = + (uint16_t *)(d_chunk+input->header.inputDataArrayOffsets[idaNormalEncoded_x]); + fprintf(stderr, "%p %p \n", + dh_arrays.zBuffer, dh_arrays.normalEncoded_x); + fprintf(stderr, " diff= %d %d \n", + input->header.inputDataArrayOffsets[idaZBuffer], + input->header.inputDataArrayOffsets[idaNormalEncoded_x]); + + dh_arrays.normalEncoded_y = + (uint16_t *)(d_chunk+input->header.inputDataArrayOffsets[idaNormalEncoded_y]); + dh_arrays.specularAmount = + (uint16_t *)(d_chunk+input->header.inputDataArrayOffsets[idaSpecularAmount]); + dh_arrays.specularPower = + (uint16_t *)(d_chunk+input->header.inputDataArrayOffsets[idaSpecularPower]); + dh_arrays.albedo_x = + (uint8_t *)(d_chunk+input->header.inputDataArrayOffsets[idaAlbedo_x]); + dh_arrays.albedo_y = + (uint8_t *)(d_chunk+input->header.inputDataArrayOffsets[idaAlbedo_y]); + dh_arrays.albedo_z = + (uint8_t *)(d_chunk+input->header.inputDataArrayOffsets[idaAlbedo_z]); + dh_arrays.lightPositionView_x = + (float *)(d_chunk+input->header.inputDataArrayOffsets[idaLightPositionView_x]); + dh_arrays.lightPositionView_y = + (float *)(d_chunk+input->header.inputDataArrayOffsets[idaLightPositionView_y]); + dh_arrays.lightPositionView_z = + (float *)(d_chunk+input->header.inputDataArrayOffsets[idaLightPositionView_z]); + dh_arrays.lightAttenuationBegin = + (float *)(d_chunk+input->header.inputDataArrayOffsets[idaLightAttenuationBegin]); + dh_arrays.lightColor_x = + (float *)(d_chunk+input->header.inputDataArrayOffsets[idaLightColor_x]); + dh_arrays.lightColor_y = + (float *)(d_chunk+input->header.inputDataArrayOffsets[idaLightColor_y]); + dh_arrays.lightColor_z = + (float *)(d_chunk+input->header.inputDataArrayOffsets[idaLightColor_z]); + dh_arrays.lightAttenuationEnd = + (float *)(d_chunk+input->header.inputDataArrayOffsets[idaLightAttenuationEnd]); + } + + memcpyH2D(d_header, &input->header, sizeof(ispc::InputHeader)); + memcpyH2D(d_arrays, &dh_arrays, sizeof(ispc::InputDataArrays)); + memcpyH2D(d_r, framebuffer.r, buffsize); + memcpyH2D(d_g, framebuffer.g, buffsize); + memcpyH2D(d_b, framebuffer.b, buffsize); + int nframes = 5; double ispcCycles = 1e30; for (int i = 0; i < 5; ++i) { framebuffer.clear(); - reset_and_start_timer(); + const double t0 = rtc(); for (int j = 0; j < nframes; ++j) - ispc::RenderStatic(input->header, input->arrays, - VISUALIZE_LIGHT_COUNT, - framebuffer.r, framebuffer.g, framebuffer.b); - double mcycles = get_elapsed_mcycles() / nframes; + ispc::RenderStatic( + (ispc::InputHeader*)d_header, + (ispc::InputDataArrays*)d_arrays, + input->header, + VISUALIZE_LIGHT_COUNT, + (uint8_t*)d_r, + (uint8_t*)d_g, + (uint8_t*)d_b); + double mcycles = (rtc() - t0) / nframes; ispcCycles = std::min(ispcCycles, mcycles); } + + memcpyD2H(framebuffer.r, d_r, buffsize); + memcpyD2H(framebuffer.g, d_g, buffsize); + memcpyD2H(framebuffer.b, d_b, buffsize); + printf("[ispc static + tasks]:\t\t[%.3f] million cycles to render " "%d x %d image\n", ispcCycles, input->header.framebufferWidth, input->header.framebufferHeight); - WriteFrame("deferred-ispc-static.ppm", input, framebuffer); + WriteFrame("deferred-cuda.ppm", input, framebuffer); + + /*******************/ + destroyContext(); + /*******************/ + return 0; + +#if 0 #ifdef __cilk double dynamicCilkCycles = 1e30; @@ -132,6 +433,7 @@ int main(int argc, char** argv) { #else printf("\t\t\t\t(%.2fx speedup from ISPC + tasks)\n", serialCycles/ispcCycles); #endif // __cilk +#endif DeleteInputData(input);