From ee61a265f485f0397bfe63eecaf86a72508419fc Mon Sep 17 00:00:00 2001 From: Evghenii Date: Mon, 18 Nov 2013 13:01:36 +0100 Subject: [PATCH] fixed kernel --- examples_cuda/deferred/kernels.cu | 148 +++++++++++++----------------- 1 file changed, 64 insertions(+), 84 deletions(-) diff --git a/examples_cuda/deferred/kernels.cu b/examples_cuda/deferred/kernels.cu index 9914256c..2530532a 100644 --- a/examples_cuda/deferred/kernels.cu +++ b/examples_cuda/deferred/kernels.cu @@ -95,7 +95,7 @@ dot3(float x, float y, float z, float a, float b, float c) { } -#if 1 +#if 0 static __shared__ int shdata_full[128]; template struct Uniform @@ -133,44 +133,6 @@ struct Uniform data[chunkIdx] = shdata[programIndex]; } }; -#elif 0 -static __shared__ void* shptr_full[128]; -template -struct Uniform -{ - T data[(N+programCount-1)/programCount]; - T* *shptr; - - __device__ inline Uniform() - { - shptr = (T**)shptr_full; - shptr[threadIdx.x] = data; - __syncthreads(); - } - - __device__ inline int2 get_chunk(const int i) const - { - const int elem = i & (programCount - 1); - const int chunk = i >> 5; - return make_int2(chunk, elem); - } - - __device__ inline const T get(const int i) const - { - const int2 idx = get_chunk(i); - const int chunk = idx.x; - const int elem = idx.y; - return shptr[chunk][elem]; - } - - __device__ inline void set(const bool active, const int i, T value) - { - const int2 idx = get_chunk(i); - const int chunk = idx.x; - const int elem = idx.y; - shptr[chunk][elem] = value; - } -}; #elif 1 template struct Uniform @@ -181,32 +143,17 @@ struct Uniform int32_t ptr[2]; }; - __device__ inline Uniform() { -#if 1 if (programIndex == 0) - data = new T[N]; + data = (T*)malloc(N*sizeof(T)); ptr[0] = __shfl(ptr[0], 0); ptr[1] = __shfl(ptr[1], 0); -#else - __shared__ T *ptr; - if (threadIdx.x == 0) - ptr = new T[4*N]; - __syncthreads(); - data = ptr; - data += warpIdx*N; -#endif } __device__ inline ~Uniform() { -#if 1 if (programIndex == 0) - delete data; -#else - if (threadIdx.x == 0) - delete data; -#endif + free(data); } __device__ inline const T get(const int i) const @@ -274,6 +221,36 @@ static float reduce_max(float value) return value; } +#if 0 +__device__ inline +static int reduce_sum(int value) +{ +#pragma unroll + for (int i = 4; i >=0; i--) + value += __shfl_xor(value, 1< 0) { float light_positionView_x = light_positionView_x_array[lightIndex]; float light_positionView_y = light_positionView_y_array[lightIndex]; @@ -444,7 +421,11 @@ IntersectLightsWithTileMinMax( // Pack and store intersecting lights const bool active = inFrustum && lightIndex < numLights; - if (any(active)) +#if 0 + if (__ballot(active) > 0) + tileNumLights += packed_store_active(active, tileLightIndices.get_ptr(tileNumLights), lightIndex); +#else + if (__ballot(active) > 0) { const int2 res = warpBinExclusiveScan(active); const int idx = tileNumLights + res.y; @@ -452,6 +433,7 @@ IntersectLightsWithTileMinMax( tileLightIndices.set(active, idx, lightIndex); tileNumLights += nactive; } +#endif } } @@ -682,7 +664,6 @@ ShadeTile( lit_y = pow(clamp(lit_y, 0.0f, 1.0f), gamma); lit_z = pow(clamp(lit_z, 0.0f, 1.0f), gamma); - // if (x >= tileEndX) break; framebuffer_r[gBufferOffset] = Float32ToUnorm8(lit_x); framebuffer_g[gBufferOffset] = Float32ToUnorm8(lit_y); framebuffer_b[gBufferOffset] = Float32ToUnorm8(lit_z); @@ -695,20 +676,19 @@ ShadeTile( /////////////////////////////////////////////////////////////////////////// // Static decomposition - __global__ void RenderTile( int num_groups_x, int num_groups_y, - const InputHeader inputHeaderPtr[], - const InputDataArrays inputDataPtr[], + const InputHeader *inputHeaderPtr, + const InputDataArrays *inputDataPtr, int visualizeLightCount, // Output unsigned int8 framebuffer_r[], unsigned int8 framebuffer_g[], unsigned int8 framebuffer_b[]) { if (taskIndex >= taskCount) return; - const InputHeader &inputHeader = *inputHeaderPtr; - const InputDataArrays &inputData = *inputDataPtr; + const InputHeader inputHeader = *inputHeaderPtr; + const InputDataArrays inputData = *inputDataPtr; int32 group_y = taskIndex / num_groups_x; int32 group_x = taskIndex % num_groups_x; @@ -726,8 +706,7 @@ RenderTile( int num_groups_x, int num_groups_y, // Light intersection: figure out which lights illuminate this tile. Uniform tileLightIndices; // Light list for the tile - - +#if 1 int numTileLights = IntersectLightsWithTile(tile_start_x, tile_end_x, tile_start_y, tile_end_y, @@ -749,33 +728,34 @@ RenderTile( int num_groups_x, int num_groups_y, cameraProj_00, cameraProj_11, cameraProj_22, cameraProj_32, tileLightIndices, numTileLights, visualizeLightCount, framebuffer_r, framebuffer_g, framebuffer_b); +#endif } - extern "C" -__global__ void -RenderStatic(InputHeader inputHeaderPtr[], - InputDataArrays inputDataPtr[], - int visualizeLightCount, +extern "C" __global__ void +RenderStatic( InputHeader inputHeaderPtr[], + InputDataArrays inputDataPtr[], + int visualizeLightCount, // Output - unsigned int8 framebuffer_r[], - unsigned int8 framebuffer_g[], - unsigned int8 framebuffer_b[]) { + unsigned int8 framebuffer_r[], + unsigned int8 framebuffer_g[], + unsigned int8 framebuffer_b[]) { - const InputHeader inputHeader = *inputHeaderPtr; + const InputHeader inputHeader = *inputHeaderPtr; + const InputDataArrays inputData = *inputDataPtr; - int num_groups_x = (inputHeader.framebufferWidth + + + int num_groups_x = (inputHeader.framebufferWidth + MIN_TILE_WIDTH - 1) / MIN_TILE_WIDTH; - int num_groups_y = (inputHeader.framebufferHeight + + int num_groups_y = (inputHeader.framebufferHeight + MIN_TILE_HEIGHT - 1) / MIN_TILE_HEIGHT; - int num_groups = num_groups_x * num_groups_y; + int num_groups = num_groups_x * num_groups_y; // Launch a task to render each tile, each of which is MIN_TILE_WIDTH // by MIN_TILE_HEIGHT pixels. - if (programIndex == 0) - RenderTile<<>>(num_groups_x, num_groups_y, - inputHeaderPtr, inputDataPtr, visualizeLightCount, - framebuffer_r, framebuffer_g, framebuffer_b); - cudaDeviceSynchronize(); - cudaDeviceSynchronize(); + if (programIndex == 0) + RenderTile<<<(num_groups+4-1)/4,128>>>(num_groups_x, num_groups_y, + inputHeaderPtr, inputDataPtr, visualizeLightCount, + framebuffer_r, framebuffer_g, framebuffer_b); + cudaDeviceSynchronize(); }