diff --git a/examples_cuda/deferred/kernels.cu b/examples_cuda/deferred/kernels.cu index 2388ea22..2530532a 100644 --- a/examples_cuda/deferred/kernels.cu +++ b/examples_cuda/deferred/kernels.cu @@ -133,45 +133,7 @@ struct Uniform data[chunkIdx] = shdata[programIndex]; } }; -#elif 0 -static __shared__ void* shptr_full[128]; -template -struct Uniform -{ - T data[(N+programCount-1)/programCount]; - T* *shptr; - - __device__ inline Uniform() - { - shptr = (T**)shptr_full; - shptr[threadIdx.x] = data; - __syncthreads(); - } - - __device__ inline int2 get_chunk(const int i) const - { - const int elem = i & (programCount - 1); - const int chunk = i >> 5; - return make_int2(chunk, elem); - } - - __device__ inline const T get(const int i) const - { - const int2 idx = get_chunk(i); - const int chunk = idx.x; - const int elem = idx.y; - return shptr[chunk][elem]; - } - - __device__ inline void set(const bool active, const int i, T value) - { - const int2 idx = get_chunk(i); - const int chunk = idx.x; - const int elem = idx.y; - shptr[chunk][elem] = value; - } -}; -#elif 0 +#elif 1 template struct Uniform { @@ -181,32 +143,17 @@ struct Uniform int32_t ptr[2]; }; - __device__ inline Uniform() { -#if 1 if (programIndex == 0) - data = new T[N]; + data = (T*)malloc(N*sizeof(T)); ptr[0] = __shfl(ptr[0], 0); ptr[1] = __shfl(ptr[1], 0); -#else - __shared__ T *ptr; - if (threadIdx.x == 0) - ptr = new T[4*N]; - __syncthreads(); - data = ptr; - data += warpIdx*N; -#endif } __device__ inline ~Uniform() { -#if 1 if (programIndex == 0) - delete data; -#else - if (threadIdx.x == 0) - delete data; -#endif + free(data); } __device__ inline const T get(const int i) const @@ -717,7 +664,6 @@ ShadeTile( lit_y = pow(clamp(lit_y, 0.0f, 1.0f), gamma); lit_z = pow(clamp(lit_z, 0.0f, 1.0f), gamma); - if (x >= tileEndX) break; framebuffer_r[gBufferOffset] = Float32ToUnorm8(lit_x); framebuffer_g[gBufferOffset] = Float32ToUnorm8(lit_y); framebuffer_b[gBufferOffset] = Float32ToUnorm8(lit_z); @@ -730,21 +676,19 @@ ShadeTile( /////////////////////////////////////////////////////////////////////////// // Static decomposition - __global__ void RenderTile( int num_groups_x, int num_groups_y, - const InputHeader inputHeaderPtr[], - const InputDataArrays inputDataPtr[], + const InputHeader *inputHeaderPtr, + const InputDataArrays *inputDataPtr, int visualizeLightCount, // Output unsigned int8 framebuffer_r[], unsigned int8 framebuffer_g[], unsigned int8 framebuffer_b[]) { if (taskIndex >= taskCount) return; - const InputHeader &inputHeader = *inputHeaderPtr; - const InputDataArrays &inputData = *inputDataPtr; -#if 1 + const InputHeader inputHeader = *inputHeaderPtr; + const InputDataArrays inputData = *inputDataPtr; int32 group_y = taskIndex / num_groups_x; int32 group_x = taskIndex % num_groups_x; @@ -759,16 +703,9 @@ RenderTile( int num_groups_x, int num_groups_y, float cameraProj_11 = inputHeader.cameraProj[1][1]; float cameraProj_22 = inputHeader.cameraProj[2][2]; float cameraProj_32 = inputHeader.cameraProj[3][2]; -#endif // Light intersection: figure out which lights illuminate this tile. Uniform tileLightIndices; // Light list for the tile - -#if 0 - tileLightIndices.set(threadIdx.x&1, threadIdx.x, framebuffer_g[blockIdx.x]); - framebuffer_r[threadIdx.x] = tileLightIndices.get(threadIdx.x); -#endif - #if 1 int numTileLights = IntersectLightsWithTile(tile_start_x, tile_end_x, @@ -795,31 +732,30 @@ RenderTile( int num_groups_x, int num_groups_y, } - extern "C" -__global__ void -RenderStatic(InputHeader inputHeaderPtr[], - InputDataArrays inputDataPtr[], - int visualizeLightCount, +extern "C" __global__ void +RenderStatic( InputHeader inputHeaderPtr[], + InputDataArrays inputDataPtr[], + int visualizeLightCount, // Output - unsigned int8 framebuffer_r[], - unsigned int8 framebuffer_g[], - unsigned int8 framebuffer_b[]) { + unsigned int8 framebuffer_r[], + unsigned int8 framebuffer_g[], + unsigned int8 framebuffer_b[]) { - const InputHeader inputHeader = *inputHeaderPtr; - const InputDataArrays inputData = *inputDataPtr; + const InputHeader inputHeader = *inputHeaderPtr; + const InputDataArrays inputData = *inputDataPtr; - int num_groups_x = (inputHeader.framebufferWidth + + + int num_groups_x = (inputHeader.framebufferWidth + MIN_TILE_WIDTH - 1) / MIN_TILE_WIDTH; - int num_groups_y = (inputHeader.framebufferHeight + + int num_groups_y = (inputHeader.framebufferHeight + MIN_TILE_HEIGHT - 1) / MIN_TILE_HEIGHT; - int num_groups = num_groups_x * num_groups_y; + int num_groups = num_groups_x * num_groups_y; // Launch a task to render each tile, each of which is MIN_TILE_WIDTH // by MIN_TILE_HEIGHT pixels. - if (programIndex == 0) - RenderTile<<>>(num_groups_x, num_groups_y, - inputHeaderPtr, inputDataPtr, visualizeLightCount, - framebuffer_r, framebuffer_g, framebuffer_b); - cudaDeviceSynchronize(); - cudaDeviceSynchronize(); + if (programIndex == 0) + RenderTile<<<(num_groups+4-1)/4,128>>>(num_groups_x, num_groups_y, + inputHeaderPtr, inputDataPtr, visualizeLightCount, + framebuffer_r, framebuffer_g, framebuffer_b); + cudaDeviceSynchronize(); } diff --git a/examples_cuda/mandelbrot_tasks3d/mandelbrot_tasks3d.cu b/examples_cuda/mandelbrot_tasks3d/mandelbrot_tasks3d.cu new file mode 100644 index 00000000..e642042a --- /dev/null +++ b/examples_cuda/mandelbrot_tasks3d/mandelbrot_tasks3d.cu @@ -0,0 +1,104 @@ +/* + Copyright (c) 2010-2012, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#define programCount 32 +#define programIndex (threadIdx.x & 31) +#define taskIndex0 (blockIdx.x*4 + (threadIdx.x >> 5)) +#define taskCount0 (gridDim.x*4) +#define taskIndex1 (blockIdx.y) +#define taskCount1 (gridDim.y) +#define warpIdx (threadIdx.x >> 5) + + +__device__ +static inline int +mandel(float c_re, float c_im, int count) { + float z_re = c_re, z_im = c_im; + int i; + for (i = 0; i < count; ++i) { + if (z_re * z_re + z_im * z_im > 4.0f) + break; + + float new_re = z_re*z_re - z_im*z_im; + float new_im = 2.f * z_re * z_im; + z_re = c_re + new_re; + z_im = c_im + new_im; + } + + return i; +} + + +/* Task to compute the Mandelbrot iterations for a single scanline. + */ +__global__ void +mandelbrot_scanline( float x0, float dx, + float y0, float dy, + int width, int height, + int xspan, int yspan, + int maxIterations, int output[]) { + const int xstart = taskIndex0 * xspan; + const int xend = min(xstart + xspan, width); + + const int ystart = taskIndex1 * yspan; + const int yend = min(ystart + yspan, height); + + for ( int yi = ystart; yi < yend; yi++) + for ( int xi = xstart; xi < xend; xi += programCount) + { + const float x = x0 + (xi + programIndex) * dx; + const float y = y0 + yi * dy; + + const int res = mandel(x,y,maxIterations); + const int index = yi * width + (xi + programIndex); + if (xi + programIndex < xend) + output[index] = res; + } +} + +extern "C" __global__ void +mandelbrot_ispc( float x0, float y0, + float x1, float y1, + int width, int height, + int maxIterations, int output[]) { + float dx = (x1 - x0) / width; + float dy = (y1 - y0) / height; + const int xspan = 64; /* make sure it is big enough to avoid false-sharing */ + const int yspan = 8; + + + if (programIndex == 0) + mandelbrot_scanline<<>> + (x0, dx, y0, dy, width, height, xspan, yspan, maxIterations, output); + cudaDeviceSynchronize(); +} diff --git a/examples_cuda/rt/rt.cu b/examples_cuda/rt/rt.cu index 2575662d..8decd03a 100644 --- a/examples_cuda/rt/rt.cu +++ b/examples_cuda/rt/rt.cu @@ -2,6 +2,7 @@ #define programIndex (threadIdx.x & 31) #define taskIndex (blockIdx.x*4 + (threadIdx.x >> 5)) #define taskCount (gridDim.x*4) +#define warpIdx (threadIdx.x >> 5) #define float3 Float3 struct Float3 @@ -57,17 +58,12 @@ struct Float3 } }; -#if 0 -#define DIRISNEG -#endif +#define int8 char +#define int16 short struct Ray { float3 origin, dir, invDir; -#ifdef DIRISNEG /* this fails to compile with nvvm */ - unsigned int dirIsNeg[3]; -#else unsigned int dirIsNeg0, dirIsNeg1, dirIsNeg2; -#endif float mint, maxt; int hitId; }; @@ -78,8 +74,6 @@ struct Triangle { int pad[3]; }; -#define int8 char -#define int16 short struct LinearBVHNode { float bounds[2][3]; unsigned int offset; // num primitives for leaf, second child for interior @@ -105,7 +99,8 @@ static inline float Dot(const float3 a, const float3 b) { } __device__ -static inline void generateRay( const float raster2camera[4][4], +inline +static void generateRay( const float raster2camera[4][4], const float camera2world[4][4], float x, float y, Ray &ray) { ray.mint = 0.f; @@ -135,7 +130,7 @@ static inline void generateRay( const float raster2camera[4][4], ray.invDir = 1.f / ray.dir; -#ifdef DIRISNEG +#if 0 ray.dirIsNeg[0] = any(ray.invDir.x < 0) ? 1 : 0; ray.dirIsNeg[1] = any(ray.invDir.y < 0) ? 1 : 0; ray.dirIsNeg[2] = any(ray.invDir.z < 0) ? 1 : 0; @@ -146,9 +141,9 @@ static inline void generateRay( const float raster2camera[4][4], #endif } - __device__ -static inline bool BBoxIntersect(const float bounds[2][3], +inline +static bool BBoxIntersect(const float bounds[2][3], const Ray &ray) { float3 bounds0 = { bounds[0][0], bounds[0][1], bounds[0][2] }; float3 bounds1 = { bounds[1][0], bounds[1][1], bounds[1][2] }; @@ -186,9 +181,9 @@ static inline bool BBoxIntersect(const float bounds[2][3], } - __device__ -static inline bool TriIntersect(const Triangle &tri, Ray &ray) { +inline +static bool TriIntersect(const Triangle &tri, Ray &ray) { float3 p0 = { tri.p[0][0], tri.p[0][1], tri.p[0][2] }; float3 p1 = { tri.p[1][0], tri.p[1][1], tri.p[1][2] }; float3 p2 = { tri.p[2][0], tri.p[2][1], tri.p[2][2] }; @@ -227,15 +222,15 @@ static inline bool TriIntersect(const Triangle &tri, Ray &ray) { return hit; } - __device__ -static inline bool BVHIntersect(const LinearBVHNode nodes[], - const Triangle tris[], Ray &r) { +inline +bool BVHIntersect(const LinearBVHNode nodes[], + const Triangle tris[], Ray &r, + int todo[]) { Ray ray = r; bool hit = false; // Follow ray through BVH nodes to find primitive intersections int todoOffset = 0, nodeNum = 0; - int todo[64]; while (true) { // Check ray against BVH node @@ -249,20 +244,17 @@ static inline bool BVHIntersect(const LinearBVHNode nodes[], if (TriIntersect(tris[primitivesOffset+i], ray)) hit = true; } + if (todoOffset == 0) + break; nodeNum = todo[--todoOffset]; } else { // Put far BVH node on _todo_ stack, advance to near node -#ifdef DIRISNEG - const int dirIsNeg = r.dirIsNeg[node.splitAxis]; -#else int dirIsNeg; if (node.splitAxis == 0) dirIsNeg = r.dirIsNeg0; if (node.splitAxis == 1) dirIsNeg = r.dirIsNeg1; if (node.splitAxis == 2) dirIsNeg = r.dirIsNeg2; -#endif - if (dirIsNeg) - { + if (dirIsNeg) { todo[todoOffset++] = nodeNum + 1; nodeNum = node.offset; } @@ -273,10 +265,10 @@ static inline bool BVHIntersect(const LinearBVHNode nodes[], } } else { - nodeNum = todo[--todoOffset]; - } if (todoOffset == 0) break; + nodeNum = todo[--todoOffset]; + } } r.maxt = ray.maxt; r.hitId = ray.hitId; @@ -284,9 +276,9 @@ static inline bool BVHIntersect(const LinearBVHNode nodes[], return hit; } - __device__ -static inline void raytrace_tile( int x0, int x1, +inline +static void raytrace_tile( int x0, int x1, int y0, int y1, int width, int height, int baseWidth, int baseHeight, @@ -298,28 +290,34 @@ static inline void raytrace_tile( int x0, int x1, float widthScale = (float)(baseWidth) / (float)(width); float heightScale = (float)(baseHeight) / (float)(height); -// foreach_tiled (y = y0 ... y1, x = x0 ... x1) - for ( int y = y0; y < y1; y++) - for ( int xb = x0; xb < x1; xb += programCount) - { - const int x = xb + programIndex; - Ray ray; - generateRay(raster2camera, camera2world, x*widthScale, - y*heightScale, ray); - BVHIntersect(nodes, triangles, ray); +#if 0 + int * todo = new int[64]; +#define ALLOC +#else + int todo[64]; +#endif - int offset = y * width + x; + for (int y = y0 ;y < y1; y++) + for (int x = x0 + programIndex; x < x1; x += programCount) if (x < x1) { + Ray ray; + generateRay(raster2camera, camera2world, x*widthScale, + y*heightScale, ray); + BVHIntersect(nodes, triangles, ray, todo); + + int offset = y * width + x; image[offset] = ray.maxt; id[offset] = ray.hitId; } - } + +#ifdef ALLOC + delete todo; +#endif } -extern "C" __global__ void raytrace_tile_task( int width, int height, int baseWidth, int baseHeight, @@ -328,18 +326,34 @@ void raytrace_tile_task( int width, int height, float image[], int id[], const LinearBVHNode nodes[], const Triangle triangles[]) { - - if (taskIndex >= taskCount) return; - int dx = 32, dy = 16; // must match dx, dy below + int dx = 64, dy = 8; // must match dx, dy below int xBuckets = (width + (dx-1)) / dx; int x0 = (taskIndex % xBuckets) * dx; int x1 = min(x0 + dx, width); int y0 = (taskIndex / xBuckets) * dy; int y1 = min(y0 + dy, height); - + raytrace_tile(x0, x1, y0, y1, width, height, baseWidth, baseHeight, raster2camera, camera2world, image, id, nodes, triangles); } +extern "C" __global__ void raytrace_ispc_tasks( int width, int height, + int baseWidth, int baseHeight, + const float raster2camera[4][4], + const float camera2world[4][4], + float image[], int id[], + const LinearBVHNode nodes[], + const Triangle triangles[]) { + int dx = 64, dy = 8; + int xBuckets = (width + (dx-1)) / dx; + int yBuckets = (height + (dy-1)) / dy; + int nTasks = xBuckets * yBuckets; + if (programIndex == 0) + raytrace_tile_task<<<(nTasks+4-1)/4,128>>>(width, height, baseWidth, baseHeight, + raster2camera, camera2world, + image, id, nodes, triangles); + cudaDeviceSynchronize(); +} + diff --git a/examples_cuda/sort/sort1.cu b/examples_cuda/sort/sort1.cu index 49886fdb..2c94a409 100644 --- a/examples_cuda/sort/sort1.cu +++ b/examples_cuda/sort/sort1.cu @@ -247,7 +247,7 @@ void sort_ispc ( int n, unsigned int code[], int order[], int ntasks, { int num = ntasks; int span = n / num; -#if 0 +#if 1 int hsize = 256*programCount*num; int * hist = __new< int>(hsize); int64 * pair = __new< int64>(n); @@ -293,7 +293,7 @@ void sort_ispc ( int n, unsigned int code[], int order[], int ntasks, unpack<<>> (span, n, pair, code, order); sync; -#if ALLOCATED +#ifdef ALLOCATED __delete(g); __delete(hist); __delete(pair); diff --git a/examples_cuda/stencil/.stencil.ispc.swn b/examples_cuda/stencil/.stencil.ispc.swn deleted file mode 100644 index ad3f6c78..00000000 Binary files a/examples_cuda/stencil/.stencil.ispc.swn and /dev/null differ