diff --git a/examples_cuda/deferred/kernels.cu b/examples_cuda/deferred/kernels.cu index 2af43a28..57740ff4 100644 --- a/examples_cuda/deferred/kernels.cu +++ b/examples_cuda/deferred/kernels.cu @@ -398,8 +398,8 @@ IntersectLightsWithTileMinMax( // don't actually need to mask the rest of this function - this is // just a greedy early-out. Could also structure all of this as // nested if() statements, but this a bit easier to read - int active = 0; - if ((inFrustum)) { + if (__ballot(inFrustum) > 0) + { float light_positionView_x = light_positionView_x_array[lightIndex]; float light_positionView_y = light_positionView_y_array[lightIndex]; @@ -420,29 +420,21 @@ IntersectLightsWithTileMinMax( inFrustum = inFrustum && (d >= light_attenuationEndNeg); // Pack and store intersecting lights + const bool active = inFrustum && lightIndex < numLights; #if 0 - if (inFrustum) { - tileNumLights += packed_store_active(&tileLightIndices[tileNumLights], - lightIndex); - } + if (__ballot(active) > 0) + tileNumLights += packed_store_active(active, tileLightIndices.get_ptr(tileNumLights), lightIndex); #else - if (inFrustum) + if (__ballot(active) > 0) { - active = 1; + const int2 res = warpBinExclusiveScan(active); + const int idx = tileNumLights + res.y; + const int nactive = res.x; + tileLightIndices.set(active, idx, lightIndex); + tileNumLights += nactive; } #endif } - -#if 0 - const int2 res = warpBinExclusiveScan(active); - const int idx = tileNumLights + res.x; - const int nactive = res.y; - tileLightIndices.set(active, idx,lightIndex); - tileNumLights += nactive; -#else - tileNumLights += packed_store_active(active, tileLightIndices.get_ptr(tileNumLights), - lightIndex); -#endif } return tileNumLights; @@ -590,13 +582,13 @@ ShadeTile( // Gather light data relevant to initial culling float light_positionView_x = - inputData.lightPositionView_x[lightIndex]; + __ldg(&inputData.lightPositionView_x[lightIndex]); float light_positionView_y = - inputData.lightPositionView_y[lightIndex]; + __ldg(&inputData.lightPositionView_y[lightIndex]); float light_positionView_z = - inputData.lightPositionView_z[lightIndex]; + __ldg(&inputData.lightPositionView_z[lightIndex]); float light_attenuationEnd = - inputData.lightAttenuationEnd[lightIndex]; + __ldg(&inputData.lightAttenuationEnd[lightIndex]); // Compute light vector float L_x = light_positionView_x - surface_positionView_x; @@ -713,13 +705,8 @@ RenderTile( int num_groups_x, int num_groups_y, float cameraProj_32 = inputHeader.cameraProj[3][2]; // Light intersection: figure out which lights illuminate this tile. -#if 1 -// int tileLightIndices[MAX_LIGHTS]; // Light list for the tile Uniform tileLightIndices; // Light list for the tile -#else - __shared__ int tileLightIndicesFull[4*MAX_LIGHTS]; // Light list for the tile - int *tileLightIndices = &tileLightIndicesFull[warpIdx*MAX_LIGHTS]; -#endif +#if 1 int numTileLights = IntersectLightsWithTile(tile_start_x, tile_end_x, tile_start_y, tile_end_y, @@ -741,6 +728,7 @@ RenderTile( int num_groups_x, int num_groups_y, cameraProj_00, cameraProj_11, cameraProj_22, cameraProj_32, tileLightIndices, numTileLights, visualizeLightCount, framebuffer_r, framebuffer_g, framebuffer_b); +#endif } diff --git a/examples_cuda/deferred/kernels1.ispc b/examples_cuda/deferred/kernels1.ispc index 07afe33a..3a81daf6 100644 --- a/examples_cuda/deferred/kernels1.ispc +++ b/examples_cuda/deferred/kernels1.ispc @@ -511,6 +511,7 @@ RenderTile(uniform int num_groups_x, uniform int num_groups_y, #else uniform int tileLightIndices[MAX_LIGHTS]; // Light list for the tile #endif +#if 1 uniform int numTileLights = IntersectLightsWithTile(tile_start_x, tile_end_x, tile_start_y, tile_end_y, @@ -532,6 +533,7 @@ RenderTile(uniform int num_groups_x, uniform int num_groups_y, cameraProj_00, cameraProj_11, cameraProj_22, cameraProj_32, tileLightIndices, numTileLights, visualizeLightCount, framebuffer_r, framebuffer_g, framebuffer_b); +#endif #if 1 delete tileLightIndices; #endif