From 4b7dbbf43bb51e33810759cda268440b9785a7f6 Mon Sep 17 00:00:00 2001 From: Evghenii Date: Mon, 18 Nov 2013 12:46:30 +0100 Subject: [PATCH 1/8] added cuda kernel --- .../mandelbrot_tasks3d/mandelbrot_tasks3d.cu | 104 ++++++++++++++++++ 1 file changed, 104 insertions(+) create mode 100644 examples_cuda/mandelbrot_tasks3d/mandelbrot_tasks3d.cu diff --git a/examples_cuda/mandelbrot_tasks3d/mandelbrot_tasks3d.cu b/examples_cuda/mandelbrot_tasks3d/mandelbrot_tasks3d.cu new file mode 100644 index 00000000..e642042a --- /dev/null +++ b/examples_cuda/mandelbrot_tasks3d/mandelbrot_tasks3d.cu @@ -0,0 +1,104 @@ +/* + Copyright (c) 2010-2012, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#define programCount 32 +#define programIndex (threadIdx.x & 31) +#define taskIndex0 (blockIdx.x*4 + (threadIdx.x >> 5)) +#define taskCount0 (gridDim.x*4) +#define taskIndex1 (blockIdx.y) +#define taskCount1 (gridDim.y) +#define warpIdx (threadIdx.x >> 5) + + +__device__ +static inline int +mandel(float c_re, float c_im, int count) { + float z_re = c_re, z_im = c_im; + int i; + for (i = 0; i < count; ++i) { + if (z_re * z_re + z_im * z_im > 4.0f) + break; + + float new_re = z_re*z_re - z_im*z_im; + float new_im = 2.f * z_re * z_im; + z_re = c_re + new_re; + z_im = c_im + new_im; + } + + return i; +} + + +/* Task to compute the Mandelbrot iterations for a single scanline. + */ +__global__ void +mandelbrot_scanline( float x0, float dx, + float y0, float dy, + int width, int height, + int xspan, int yspan, + int maxIterations, int output[]) { + const int xstart = taskIndex0 * xspan; + const int xend = min(xstart + xspan, width); + + const int ystart = taskIndex1 * yspan; + const int yend = min(ystart + yspan, height); + + for ( int yi = ystart; yi < yend; yi++) + for ( int xi = xstart; xi < xend; xi += programCount) + { + const float x = x0 + (xi + programIndex) * dx; + const float y = y0 + yi * dy; + + const int res = mandel(x,y,maxIterations); + const int index = yi * width + (xi + programIndex); + if (xi + programIndex < xend) + output[index] = res; + } +} + +extern "C" __global__ void +mandelbrot_ispc( float x0, float y0, + float x1, float y1, + int width, int height, + int maxIterations, int output[]) { + float dx = (x1 - x0) / width; + float dy = (y1 - y0) / height; + const int xspan = 64; /* make sure it is big enough to avoid false-sharing */ + const int yspan = 8; + + + if (programIndex == 0) + mandelbrot_scanline<<>> + (x0, dx, y0, dy, width, height, xspan, yspan, maxIterations, output); + cudaDeviceSynchronize(); +} From db4abfe198d0519e570b9ca6d5589f1d08e9d45d Mon Sep 17 00:00:00 2001 From: Evghenii Date: Mon, 18 Nov 2013 12:58:30 +0100 Subject: [PATCH 2/8] +1 --- examples_cuda/deferred/kernels.cu | 54 +++--------------------- examples_cuda/stencil/.stencil.ispc.swn | Bin 16384 -> 0 bytes 2 files changed, 5 insertions(+), 49 deletions(-) delete mode 100644 examples_cuda/stencil/.stencil.ispc.swn diff --git a/examples_cuda/deferred/kernels.cu b/examples_cuda/deferred/kernels.cu index 2388ea22..9914256c 100644 --- a/examples_cuda/deferred/kernels.cu +++ b/examples_cuda/deferred/kernels.cu @@ -95,7 +95,7 @@ dot3(float x, float y, float z, float a, float b, float c) { } -#if 0 +#if 1 static __shared__ int shdata_full[128]; template struct Uniform @@ -171,7 +171,7 @@ struct Uniform shptr[chunk][elem] = value; } }; -#elif 0 +#elif 1 template struct Uniform { @@ -274,36 +274,6 @@ static float reduce_max(float value) return value; } -#if 0 -__device__ inline -static int reduce_sum(int value) -{ -#pragma unroll - for (int i = 4; i >=0; i--) - value += __shfl_xor(value, 1< 0) + if (any(inFrustum)) { float light_positionView_x = light_positionView_x_array[lightIndex]; float light_positionView_y = light_positionView_y_array[lightIndex]; @@ -474,11 +444,7 @@ IntersectLightsWithTileMinMax( // Pack and store intersecting lights const bool active = inFrustum && lightIndex < numLights; -#if 0 - if (__ballot(active) > 0) - tileNumLights += packed_store_active(active, tileLightIndices.get_ptr(tileNumLights), lightIndex); -#else - if (__ballot(active) > 0) + if (any(active)) { const int2 res = warpBinExclusiveScan(active); const int idx = tileNumLights + res.y; @@ -486,7 +452,6 @@ IntersectLightsWithTileMinMax( tileLightIndices.set(active, idx, lightIndex); tileNumLights += nactive; } -#endif } } @@ -717,7 +682,7 @@ ShadeTile( lit_y = pow(clamp(lit_y, 0.0f, 1.0f), gamma); lit_z = pow(clamp(lit_z, 0.0f, 1.0f), gamma); - if (x >= tileEndX) break; + // if (x >= tileEndX) break; framebuffer_r[gBufferOffset] = Float32ToUnorm8(lit_x); framebuffer_g[gBufferOffset] = Float32ToUnorm8(lit_y); framebuffer_b[gBufferOffset] = Float32ToUnorm8(lit_z); @@ -744,7 +709,6 @@ RenderTile( int num_groups_x, int num_groups_y, const InputHeader &inputHeader = *inputHeaderPtr; const InputDataArrays &inputData = *inputDataPtr; -#if 1 int32 group_y = taskIndex / num_groups_x; int32 group_x = taskIndex % num_groups_x; @@ -759,17 +723,11 @@ RenderTile( int num_groups_x, int num_groups_y, float cameraProj_11 = inputHeader.cameraProj[1][1]; float cameraProj_22 = inputHeader.cameraProj[2][2]; float cameraProj_32 = inputHeader.cameraProj[3][2]; -#endif // Light intersection: figure out which lights illuminate this tile. Uniform tileLightIndices; // Light list for the tile -#if 0 - tileLightIndices.set(threadIdx.x&1, threadIdx.x, framebuffer_g[blockIdx.x]); - framebuffer_r[threadIdx.x] = tileLightIndices.get(threadIdx.x); -#endif -#if 1 int numTileLights = IntersectLightsWithTile(tile_start_x, tile_end_x, tile_start_y, tile_end_y, @@ -791,7 +749,6 @@ RenderTile( int num_groups_x, int num_groups_y, cameraProj_00, cameraProj_11, cameraProj_22, cameraProj_32, tileLightIndices, numTileLights, visualizeLightCount, framebuffer_r, framebuffer_g, framebuffer_b); -#endif } @@ -806,7 +763,6 @@ RenderStatic(InputHeader inputHeaderPtr[], unsigned int8 framebuffer_b[]) { const InputHeader inputHeader = *inputHeaderPtr; - const InputDataArrays inputData = *inputDataPtr; int num_groups_x = (inputHeader.framebufferWidth + MIN_TILE_WIDTH - 1) / MIN_TILE_WIDTH; diff --git a/examples_cuda/stencil/.stencil.ispc.swn b/examples_cuda/stencil/.stencil.ispc.swn deleted file mode 100644 index ad3f6c7805836bce454e0ea3c69a06f27d7ae0cb..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 16384 zcmeHOU2No56?PX|3bYh02&obehus#f5=GAV&c<4g zJ=mUPCbZ=R34v4|SSnHe9*__(JRk%Tg%>0qkXj)j0pg(`fdHxei}D0&f$v<~i6@!e zY`ZTA*~&K_U!QxtMPk4^Q zw{CS@7Xw!Wm2epJLMyN;o;zLo=~Urd==Y*ICmeLkUEdb++ng9#edq~yxLF%~o>0>%{p@$@Rm@`H2(vlgs>+GmtZoGmtZoGmtZoGmtZo zGmtZoGmtZoGmtazKgfV(E6Tg!*XMY_k3Ii%{r}nnit-cSdEmQ12+RN%feXN|-k~VZ z0iOq2z&*hC?^l%X0b`&7{OvwP`77`j;H$tE@E+iWw=2riz$3ul-=-*E2X=r5fnU5; zQGN#e6!-yf1Ly&d0k7VxDBl4>paHA{=YjixU%y3Bz6@*vGr&LYQIsD7p9QW0v%tUZ zMj3De=mNU{0rvoR1OIumqPzyY3j7Ip33w5B0eBwxB+vma0>42*K9l91uXo>ocS2(Pe z=&n6q$ zp(Pqm7^^16wr*l4sX5OI1`I8re`Au^wc#CncD(=LoV)b@wFt{~O%u5Oo(x;HKL zhtbvTB7F>&P99U$bbZmg`hkq12_yUAM`7d^x;UyDqhmMZ_4vf~;aM6E=O=3*G|#X( zK2Zblr6fQ1*}-|R!xR_N7PD8Q6EzSds9{A;SN0g~`;LuaX^WoYi8!$7(TVG=6W5!x zh+jNCB|UwL$3T2AXATdLZG@5S?5A;~?s#Gp72pQ{6)_4IDjaDleykK67ne~mDL6~V z%GpG=!t7!ZV=`M(UM$LJPrS*>&n`;C8ToQjqNLI?92VL0LabEQnW!e~#3e`7xO&!! z*@aVXEfi5Sy~6CmgaOb~PD+mCrC$qKzgS|Tnyhn{U)587Rd3-J<`HyMman8_(ywaP zFP507Rz5CCswHC;v4J3vEt_WzSpwPYQSc=JfnkP`byy^o2e}L}_*NLPhlRqp2o;Uw z0z!N-E^cR(aF`wI!1QsuDo7qYh~gScR5(oZv+41X;ZYpx@#KT2jQ(K|t zWooZ!)M}chzO2{v_7)eJo!T0$sO##gRv~`cQA1m;>#JI$rcu+NCh8a)P=FvdbW?Av zlHMS-u|*py6m;6D-i`@*HldrWrLL>7krHjJX{?0n)tZgAq1M_ZGTVk;YiGq^tlc!) zRL~o>dS?ZeSWnAnUe?>@CH7^x1N)k3LQ`8aj83bqHycG-Yi^*e2Gvyfuma!E*4Rau zZ5rInL{qI}XzN--`fPTV&9>g|v^837HdjnGU~0w{y{5rh9V~Hkps@sc+G+{+S&O!* zt+(o`vBd@49#i{p2P$>7p3rD!T2O+|ZE7^Rq3e1Z-nET0P*Td#HC6ty}A`d33&X;5I+>Z-o3tyGx$ zMpH&ao6I$}UYDUzYqquweRU0dY&104Jh&%s>1d=5cM(d4iq>h8jdjhat>H51T1ho} zNA`*In;Fv03-SgxLfdQ^++k?bX^q^rq^D2u#ELkSj^Y_=ITo&fffZWbI^q9Xv%aE% zM(%qVV=1o}R1>e-P=+Jgb3)?ZARkzPpaUn`!>ml{JtCo&h9YsgOK{xn05sXLt#_olkjYm3W9<`;P?GV@Yx=AqmXP7IQtOm z;f7R&`B1>P2;h+*&+4OIk902*F4g>C;0Jie@jddOEtLpa#EHVREw%VU~DiBoNIeg z+PXV&DekTrI_V4=75c-V%jRrBi8GNEw}7U2(ihR=Xj$L4IX)!QCF0PLv4JSDj~R~k ztmp(wSqcNDXwoCeE8>S)N*rURGlYeuxZ^k~<6vceRM#a*RtPRb5g@d<$+eV2Ds+pq zFkhW7;{#9YQ*out%_zM7e+=vHKLEV`e;6k|euOptSAZ`8Uj)7YJO?}jw1M{mzr&jU z8-NXzfq!Dn|1IE~z-NGGfL-7tzJSo{AN_%xsc4*?GXKSv(m2G9pw;344O@aGla z&%hr6_U~mNKXL|g266^+266^+266^+266^&kAXDVOY#WEUN=oe&CL%_2sez^bg9Oy*X(cX{Djwc z==2@?%)2y&h~7^~NiHoR3#`P>qXW@V)nuhghlFm%!a41|ec#1GI@8-N{4cZX3fDOl zWOV~S4AEy;X^|DufpFc&N#;nov#~5g7##S)b>c-hg)SD-((F+_l_pYo5fY-&{3VL$ zgA`RS(d=x*`O|a@1N}29c}27XDdO3JS1ebHb2t^1SYO&3*>jp!Om~yGcc$(#**12W zq;uKo>GxF4L^}1Rdr5JvsdbK0uxFZjLn+tPUz}6(tmn+^>rLuSOT=bGC%$IwP1~As zeSg3^S^N*|JN%F9az`ti3;>MVteyyCSP>(ne4-?an{H1b51Sd=+41Md(Kn3b=u7S4 dJSfDQqb0esig`lrr Date: Mon, 18 Nov 2013 13:01:36 +0100 Subject: [PATCH 3/8] fixed kernel --- examples_cuda/deferred/kernels.cu | 148 +++++++++++++----------------- 1 file changed, 64 insertions(+), 84 deletions(-) diff --git a/examples_cuda/deferred/kernels.cu b/examples_cuda/deferred/kernels.cu index 9914256c..2530532a 100644 --- a/examples_cuda/deferred/kernels.cu +++ b/examples_cuda/deferred/kernels.cu @@ -95,7 +95,7 @@ dot3(float x, float y, float z, float a, float b, float c) { } -#if 1 +#if 0 static __shared__ int shdata_full[128]; template struct Uniform @@ -133,44 +133,6 @@ struct Uniform data[chunkIdx] = shdata[programIndex]; } }; -#elif 0 -static __shared__ void* shptr_full[128]; -template -struct Uniform -{ - T data[(N+programCount-1)/programCount]; - T* *shptr; - - __device__ inline Uniform() - { - shptr = (T**)shptr_full; - shptr[threadIdx.x] = data; - __syncthreads(); - } - - __device__ inline int2 get_chunk(const int i) const - { - const int elem = i & (programCount - 1); - const int chunk = i >> 5; - return make_int2(chunk, elem); - } - - __device__ inline const T get(const int i) const - { - const int2 idx = get_chunk(i); - const int chunk = idx.x; - const int elem = idx.y; - return shptr[chunk][elem]; - } - - __device__ inline void set(const bool active, const int i, T value) - { - const int2 idx = get_chunk(i); - const int chunk = idx.x; - const int elem = idx.y; - shptr[chunk][elem] = value; - } -}; #elif 1 template struct Uniform @@ -181,32 +143,17 @@ struct Uniform int32_t ptr[2]; }; - __device__ inline Uniform() { -#if 1 if (programIndex == 0) - data = new T[N]; + data = (T*)malloc(N*sizeof(T)); ptr[0] = __shfl(ptr[0], 0); ptr[1] = __shfl(ptr[1], 0); -#else - __shared__ T *ptr; - if (threadIdx.x == 0) - ptr = new T[4*N]; - __syncthreads(); - data = ptr; - data += warpIdx*N; -#endif } __device__ inline ~Uniform() { -#if 1 if (programIndex == 0) - delete data; -#else - if (threadIdx.x == 0) - delete data; -#endif + free(data); } __device__ inline const T get(const int i) const @@ -274,6 +221,36 @@ static float reduce_max(float value) return value; } +#if 0 +__device__ inline +static int reduce_sum(int value) +{ +#pragma unroll + for (int i = 4; i >=0; i--) + value += __shfl_xor(value, 1< 0) { float light_positionView_x = light_positionView_x_array[lightIndex]; float light_positionView_y = light_positionView_y_array[lightIndex]; @@ -444,7 +421,11 @@ IntersectLightsWithTileMinMax( // Pack and store intersecting lights const bool active = inFrustum && lightIndex < numLights; - if (any(active)) +#if 0 + if (__ballot(active) > 0) + tileNumLights += packed_store_active(active, tileLightIndices.get_ptr(tileNumLights), lightIndex); +#else + if (__ballot(active) > 0) { const int2 res = warpBinExclusiveScan(active); const int idx = tileNumLights + res.y; @@ -452,6 +433,7 @@ IntersectLightsWithTileMinMax( tileLightIndices.set(active, idx, lightIndex); tileNumLights += nactive; } +#endif } } @@ -682,7 +664,6 @@ ShadeTile( lit_y = pow(clamp(lit_y, 0.0f, 1.0f), gamma); lit_z = pow(clamp(lit_z, 0.0f, 1.0f), gamma); - // if (x >= tileEndX) break; framebuffer_r[gBufferOffset] = Float32ToUnorm8(lit_x); framebuffer_g[gBufferOffset] = Float32ToUnorm8(lit_y); framebuffer_b[gBufferOffset] = Float32ToUnorm8(lit_z); @@ -695,20 +676,19 @@ ShadeTile( /////////////////////////////////////////////////////////////////////////// // Static decomposition - __global__ void RenderTile( int num_groups_x, int num_groups_y, - const InputHeader inputHeaderPtr[], - const InputDataArrays inputDataPtr[], + const InputHeader *inputHeaderPtr, + const InputDataArrays *inputDataPtr, int visualizeLightCount, // Output unsigned int8 framebuffer_r[], unsigned int8 framebuffer_g[], unsigned int8 framebuffer_b[]) { if (taskIndex >= taskCount) return; - const InputHeader &inputHeader = *inputHeaderPtr; - const InputDataArrays &inputData = *inputDataPtr; + const InputHeader inputHeader = *inputHeaderPtr; + const InputDataArrays inputData = *inputDataPtr; int32 group_y = taskIndex / num_groups_x; int32 group_x = taskIndex % num_groups_x; @@ -726,8 +706,7 @@ RenderTile( int num_groups_x, int num_groups_y, // Light intersection: figure out which lights illuminate this tile. Uniform tileLightIndices; // Light list for the tile - - +#if 1 int numTileLights = IntersectLightsWithTile(tile_start_x, tile_end_x, tile_start_y, tile_end_y, @@ -749,33 +728,34 @@ RenderTile( int num_groups_x, int num_groups_y, cameraProj_00, cameraProj_11, cameraProj_22, cameraProj_32, tileLightIndices, numTileLights, visualizeLightCount, framebuffer_r, framebuffer_g, framebuffer_b); +#endif } - extern "C" -__global__ void -RenderStatic(InputHeader inputHeaderPtr[], - InputDataArrays inputDataPtr[], - int visualizeLightCount, +extern "C" __global__ void +RenderStatic( InputHeader inputHeaderPtr[], + InputDataArrays inputDataPtr[], + int visualizeLightCount, // Output - unsigned int8 framebuffer_r[], - unsigned int8 framebuffer_g[], - unsigned int8 framebuffer_b[]) { + unsigned int8 framebuffer_r[], + unsigned int8 framebuffer_g[], + unsigned int8 framebuffer_b[]) { - const InputHeader inputHeader = *inputHeaderPtr; + const InputHeader inputHeader = *inputHeaderPtr; + const InputDataArrays inputData = *inputDataPtr; - int num_groups_x = (inputHeader.framebufferWidth + + + int num_groups_x = (inputHeader.framebufferWidth + MIN_TILE_WIDTH - 1) / MIN_TILE_WIDTH; - int num_groups_y = (inputHeader.framebufferHeight + + int num_groups_y = (inputHeader.framebufferHeight + MIN_TILE_HEIGHT - 1) / MIN_TILE_HEIGHT; - int num_groups = num_groups_x * num_groups_y; + int num_groups = num_groups_x * num_groups_y; // Launch a task to render each tile, each of which is MIN_TILE_WIDTH // by MIN_TILE_HEIGHT pixels. - if (programIndex == 0) - RenderTile<<>>(num_groups_x, num_groups_y, - inputHeaderPtr, inputDataPtr, visualizeLightCount, - framebuffer_r, framebuffer_g, framebuffer_b); - cudaDeviceSynchronize(); - cudaDeviceSynchronize(); + if (programIndex == 0) + RenderTile<<<(num_groups+4-1)/4,128>>>(num_groups_x, num_groups_y, + inputHeaderPtr, inputDataPtr, visualizeLightCount, + framebuffer_r, framebuffer_g, framebuffer_b); + cudaDeviceSynchronize(); } From 4f9b8ebc7319b44e9e438108e24c7115c973e6f3 Mon Sep 17 00:00:00 2001 From: Evghenii Date: Mon, 18 Nov 2013 13:11:57 +0100 Subject: [PATCH 4/8] +1 --- examples_cuda/rt/rt.cu | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/examples_cuda/rt/rt.cu b/examples_cuda/rt/rt.cu index 2575662d..b7277cf6 100644 --- a/examples_cuda/rt/rt.cu +++ b/examples_cuda/rt/rt.cu @@ -319,7 +319,6 @@ static inline void raytrace_tile( int x0, int x1, -extern "C" __global__ void raytrace_tile_task( int width, int height, int baseWidth, int baseHeight, @@ -330,7 +329,7 @@ void raytrace_tile_task( int width, int height, const Triangle triangles[]) { if (taskIndex >= taskCount) return; - int dx = 32, dy = 16; // must match dx, dy below + int dx = 64, dy = 8; // must match dx, dy below int xBuckets = (width + (dx-1)) / dx; int x0 = (taskIndex % xBuckets) * dx; int x1 = min(x0 + dx, width); @@ -343,3 +342,20 @@ void raytrace_tile_task( int width, int height, } +extern "C" __global__ void raytrace_ispc_tasks( int width, int height, + int baseWidth, int baseHeight, + const float raster2camera[4][4], + const float camera2world[4][4], + float image[], int id[], + const LinearBVHNode nodes[], + const Triangle triangles[]) { + int dx = 64, dy = 8; + int xBuckets = (width + (dx-1)) / dx; + int yBuckets = (height + (dy-1)) / dy; + int nTasks = xBuckets * yBuckets; + if (programIndex == 0) + raytrace_tile_task<<<(nTasks-1+4)/4,128>>>(width, height, baseWidth, baseHeight, + raster2camera, camera2world, + image, id, nodes, triangles); + cudaDeviceSynchronize(); +} From 64762c5acddba333f2d86c02b4df07479d98525f Mon Sep 17 00:00:00 2001 From: Evghenii Date: Mon, 18 Nov 2013 13:15:05 +0100 Subject: [PATCH 5/8] +1 --- examples_cuda/rt/rt.cu | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/examples_cuda/rt/rt.cu b/examples_cuda/rt/rt.cu index b7277cf6..46267941 100644 --- a/examples_cuda/rt/rt.cu +++ b/examples_cuda/rt/rt.cu @@ -2,6 +2,7 @@ #define programIndex (threadIdx.x & 31) #define taskIndex (blockIdx.x*4 + (threadIdx.x >> 5)) #define taskCount (gridDim.x*4) +#define warpIdx (threadIdx.x >> 5) #define float3 Float3 struct Float3 @@ -235,7 +236,12 @@ static inline bool BVHIntersect(const LinearBVHNode nodes[], bool hit = false; // Follow ray through BVH nodes to find primitive intersections int todoOffset = 0, nodeNum = 0; +#if 0 + __shared__ int todoX[64*4]; + volatile int * todo = &todoX[warpIdx * 64]; +#else int todo[64]; +#endif while (true) { // Check ray against BVH node From cf2116e167e409f0e73cd68c92b36202802f332d Mon Sep 17 00:00:00 2001 From: Evghenii Date: Mon, 18 Nov 2013 13:16:30 +0100 Subject: [PATCH 6/8] +1 --- examples_cuda/rt/rt.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/examples_cuda/rt/rt.cu b/examples_cuda/rt/rt.cu index 46267941..e0c3855a 100644 --- a/examples_cuda/rt/rt.cu +++ b/examples_cuda/rt/rt.cu @@ -345,6 +345,7 @@ void raytrace_tile_task( int width, int height, raytrace_tile(x0, x1, y0, y1, width, height, baseWidth, baseHeight, raster2camera, camera2world, image, id, nodes, triangles); + cudaDeviceSynchronize(); } From 915dc4be7fdf6c8a4bf311801a1869076082571c Mon Sep 17 00:00:00 2001 From: Evghenii Date: Mon, 18 Nov 2013 13:24:01 +0100 Subject: [PATCH 7/8] +1 --- examples_cuda/sort/sort1.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples_cuda/sort/sort1.cu b/examples_cuda/sort/sort1.cu index 49886fdb..2c94a409 100644 --- a/examples_cuda/sort/sort1.cu +++ b/examples_cuda/sort/sort1.cu @@ -247,7 +247,7 @@ void sort_ispc ( int n, unsigned int code[], int order[], int ntasks, { int num = ntasks; int span = n / num; -#if 0 +#if 1 int hsize = 256*programCount*num; int * hist = __new< int>(hsize); int64 * pair = __new< int64>(n); @@ -293,7 +293,7 @@ void sort_ispc ( int n, unsigned int code[], int order[], int ntasks, unpack<<>> (span, n, pair, code, order); sync; -#if ALLOCATED +#ifdef ALLOCATED __delete(g); __delete(hist); __delete(pair); From 4bc8c79bd3371b663f1b34289df97843f085c990 Mon Sep 17 00:00:00 2001 From: Evghenii Date: Mon, 18 Nov 2013 13:28:12 +0100 Subject: [PATCH 8/8] fixed cuda kernel --- examples_cuda/rt/rt.cu | 91 +++++++++++++++++++----------------------- 1 file changed, 41 insertions(+), 50 deletions(-) diff --git a/examples_cuda/rt/rt.cu b/examples_cuda/rt/rt.cu index e0c3855a..8decd03a 100644 --- a/examples_cuda/rt/rt.cu +++ b/examples_cuda/rt/rt.cu @@ -58,17 +58,12 @@ struct Float3 } }; -#if 0 -#define DIRISNEG -#endif +#define int8 char +#define int16 short struct Ray { float3 origin, dir, invDir; -#ifdef DIRISNEG /* this fails to compile with nvvm */ - unsigned int dirIsNeg[3]; -#else unsigned int dirIsNeg0, dirIsNeg1, dirIsNeg2; -#endif float mint, maxt; int hitId; }; @@ -79,8 +74,6 @@ struct Triangle { int pad[3]; }; -#define int8 char -#define int16 short struct LinearBVHNode { float bounds[2][3]; unsigned int offset; // num primitives for leaf, second child for interior @@ -106,7 +99,8 @@ static inline float Dot(const float3 a, const float3 b) { } __device__ -static inline void generateRay( const float raster2camera[4][4], +inline +static void generateRay( const float raster2camera[4][4], const float camera2world[4][4], float x, float y, Ray &ray) { ray.mint = 0.f; @@ -136,7 +130,7 @@ static inline void generateRay( const float raster2camera[4][4], ray.invDir = 1.f / ray.dir; -#ifdef DIRISNEG +#if 0 ray.dirIsNeg[0] = any(ray.invDir.x < 0) ? 1 : 0; ray.dirIsNeg[1] = any(ray.invDir.y < 0) ? 1 : 0; ray.dirIsNeg[2] = any(ray.invDir.z < 0) ? 1 : 0; @@ -147,9 +141,9 @@ static inline void generateRay( const float raster2camera[4][4], #endif } - __device__ -static inline bool BBoxIntersect(const float bounds[2][3], +inline +static bool BBoxIntersect(const float bounds[2][3], const Ray &ray) { float3 bounds0 = { bounds[0][0], bounds[0][1], bounds[0][2] }; float3 bounds1 = { bounds[1][0], bounds[1][1], bounds[1][2] }; @@ -187,9 +181,9 @@ static inline bool BBoxIntersect(const float bounds[2][3], } - __device__ -static inline bool TriIntersect(const Triangle &tri, Ray &ray) { +inline +static bool TriIntersect(const Triangle &tri, Ray &ray) { float3 p0 = { tri.p[0][0], tri.p[0][1], tri.p[0][2] }; float3 p1 = { tri.p[1][0], tri.p[1][1], tri.p[1][2] }; float3 p2 = { tri.p[2][0], tri.p[2][1], tri.p[2][2] }; @@ -228,20 +222,15 @@ static inline bool TriIntersect(const Triangle &tri, Ray &ray) { return hit; } - __device__ -static inline bool BVHIntersect(const LinearBVHNode nodes[], - const Triangle tris[], Ray &r) { +inline +bool BVHIntersect(const LinearBVHNode nodes[], + const Triangle tris[], Ray &r, + int todo[]) { Ray ray = r; bool hit = false; // Follow ray through BVH nodes to find primitive intersections int todoOffset = 0, nodeNum = 0; -#if 0 - __shared__ int todoX[64*4]; - volatile int * todo = &todoX[warpIdx * 64]; -#else - int todo[64]; -#endif while (true) { // Check ray against BVH node @@ -255,20 +244,17 @@ static inline bool BVHIntersect(const LinearBVHNode nodes[], if (TriIntersect(tris[primitivesOffset+i], ray)) hit = true; } + if (todoOffset == 0) + break; nodeNum = todo[--todoOffset]; } else { // Put far BVH node on _todo_ stack, advance to near node -#ifdef DIRISNEG - const int dirIsNeg = r.dirIsNeg[node.splitAxis]; -#else int dirIsNeg; if (node.splitAxis == 0) dirIsNeg = r.dirIsNeg0; if (node.splitAxis == 1) dirIsNeg = r.dirIsNeg1; if (node.splitAxis == 2) dirIsNeg = r.dirIsNeg2; -#endif - if (dirIsNeg) - { + if (dirIsNeg) { todo[todoOffset++] = nodeNum + 1; nodeNum = node.offset; } @@ -279,10 +265,10 @@ static inline bool BVHIntersect(const LinearBVHNode nodes[], } } else { - nodeNum = todo[--todoOffset]; - } if (todoOffset == 0) break; + nodeNum = todo[--todoOffset]; + } } r.maxt = ray.maxt; r.hitId = ray.hitId; @@ -290,9 +276,9 @@ static inline bool BVHIntersect(const LinearBVHNode nodes[], return hit; } - __device__ -static inline void raytrace_tile( int x0, int x1, +inline +static void raytrace_tile( int x0, int x1, int y0, int y1, int width, int height, int baseWidth, int baseHeight, @@ -304,23 +290,30 @@ static inline void raytrace_tile( int x0, int x1, float widthScale = (float)(baseWidth) / (float)(width); float heightScale = (float)(baseHeight) / (float)(height); -// foreach_tiled (y = y0 ... y1, x = x0 ... x1) - for ( int y = y0; y < y1; y++) - for ( int xb = x0; xb < x1; xb += programCount) - { - const int x = xb + programIndex; - Ray ray; - generateRay(raster2camera, camera2world, x*widthScale, - y*heightScale, ray); - BVHIntersect(nodes, triangles, ray); +#if 0 + int * todo = new int[64]; +#define ALLOC +#else + int todo[64]; +#endif - int offset = y * width + x; + for (int y = y0 ;y < y1; y++) + for (int x = x0 + programIndex; x < x1; x += programCount) if (x < x1) { + Ray ray; + generateRay(raster2camera, camera2world, x*widthScale, + y*heightScale, ray); + BVHIntersect(nodes, triangles, ray, todo); + + int offset = y * width + x; image[offset] = ray.maxt; id[offset] = ray.hitId; } - } + +#ifdef ALLOC + delete todo; +#endif } @@ -333,19 +326,16 @@ void raytrace_tile_task( int width, int height, float image[], int id[], const LinearBVHNode nodes[], const Triangle triangles[]) { - - if (taskIndex >= taskCount) return; int dx = 64, dy = 8; // must match dx, dy below int xBuckets = (width + (dx-1)) / dx; int x0 = (taskIndex % xBuckets) * dx; int x1 = min(x0 + dx, width); int y0 = (taskIndex / xBuckets) * dy; int y1 = min(y0 + dy, height); - + raytrace_tile(x0, x1, y0, y1, width, height, baseWidth, baseHeight, raster2camera, camera2world, image, id, nodes, triangles); - cudaDeviceSynchronize(); } @@ -361,8 +351,9 @@ extern "C" __global__ void raytrace_ispc_tasks( int width, int height, int yBuckets = (height + (dy-1)) / dy; int nTasks = xBuckets * yBuckets; if (programIndex == 0) - raytrace_tile_task<<<(nTasks-1+4)/4,128>>>(width, height, baseWidth, baseHeight, + raytrace_tile_task<<<(nTasks+4-1)/4,128>>>(width, height, baseWidth, baseHeight, raster2camera, camera2world, image, id, nodes, triangles); cudaDeviceSynchronize(); } +