Merge branch 'sm35_foreach' of github.com:egaburov/ispc into sm35_foreach

This commit is contained in:
evghenii
2013-11-18 21:59:09 +01:00
5 changed files with 190 additions and 136 deletions

View File

@@ -133,45 +133,7 @@ struct Uniform
data[chunkIdx] = shdata[programIndex];
}
};
#elif 0
static __shared__ void* shptr_full[128];
template<typename T, int N>
struct Uniform
{
T data[(N+programCount-1)/programCount];
T* *shptr;
__device__ inline Uniform()
{
shptr = (T**)shptr_full;
shptr[threadIdx.x] = data;
__syncthreads();
}
__device__ inline int2 get_chunk(const int i) const
{
const int elem = i & (programCount - 1);
const int chunk = i >> 5;
return make_int2(chunk, elem);
}
__device__ inline const T get(const int i) const
{
const int2 idx = get_chunk(i);
const int chunk = idx.x;
const int elem = idx.y;
return shptr[chunk][elem];
}
__device__ inline void set(const bool active, const int i, T value)
{
const int2 idx = get_chunk(i);
const int chunk = idx.x;
const int elem = idx.y;
shptr[chunk][elem] = value;
}
};
#elif 0
#elif 1
template<typename T, int N>
struct Uniform
{
@@ -181,32 +143,17 @@ struct Uniform
int32_t ptr[2];
};
__device__ inline Uniform()
{
#if 1
if (programIndex == 0)
data = new T[N];
data = (T*)malloc(N*sizeof(T));
ptr[0] = __shfl(ptr[0], 0);
ptr[1] = __shfl(ptr[1], 0);
#else
__shared__ T *ptr;
if (threadIdx.x == 0)
ptr = new T[4*N];
__syncthreads();
data = ptr;
data += warpIdx*N;
#endif
}
__device__ inline ~Uniform()
{
#if 1
if (programIndex == 0)
delete data;
#else
if (threadIdx.x == 0)
delete data;
#endif
free(data);
}
__device__ inline const T get(const int i) const
@@ -717,7 +664,6 @@ ShadeTile(
lit_y = pow(clamp(lit_y, 0.0f, 1.0f), gamma);
lit_z = pow(clamp(lit_z, 0.0f, 1.0f), gamma);
if (x >= tileEndX) break;
framebuffer_r[gBufferOffset] = Float32ToUnorm8(lit_x);
framebuffer_g[gBufferOffset] = Float32ToUnorm8(lit_y);
framebuffer_b[gBufferOffset] = Float32ToUnorm8(lit_z);
@@ -730,21 +676,19 @@ ShadeTile(
///////////////////////////////////////////////////////////////////////////
// Static decomposition
__global__ void
RenderTile( int num_groups_x, int num_groups_y,
const InputHeader inputHeaderPtr[],
const InputDataArrays inputDataPtr[],
const InputHeader *inputHeaderPtr,
const InputDataArrays *inputDataPtr,
int visualizeLightCount,
// Output
unsigned int8 framebuffer_r[],
unsigned int8 framebuffer_g[],
unsigned int8 framebuffer_b[]) {
if (taskIndex >= taskCount) return;
const InputHeader &inputHeader = *inputHeaderPtr;
const InputDataArrays &inputData = *inputDataPtr;
#if 1
const InputHeader inputHeader = *inputHeaderPtr;
const InputDataArrays inputData = *inputDataPtr;
int32 group_y = taskIndex / num_groups_x;
int32 group_x = taskIndex % num_groups_x;
@@ -759,16 +703,9 @@ RenderTile( int num_groups_x, int num_groups_y,
float cameraProj_11 = inputHeader.cameraProj[1][1];
float cameraProj_22 = inputHeader.cameraProj[2][2];
float cameraProj_32 = inputHeader.cameraProj[3][2];
#endif
// Light intersection: figure out which lights illuminate this tile.
Uniform<int,MAX_LIGHTS> tileLightIndices; // Light list for the tile
#if 0
tileLightIndices.set(threadIdx.x&1, threadIdx.x, framebuffer_g[blockIdx.x]);
framebuffer_r[threadIdx.x] = tileLightIndices.get(threadIdx.x);
#endif
#if 1
int numTileLights =
IntersectLightsWithTile(tile_start_x, tile_end_x,
@@ -795,31 +732,30 @@ RenderTile( int num_groups_x, int num_groups_y,
}
extern "C"
__global__ void
RenderStatic(InputHeader inputHeaderPtr[],
InputDataArrays inputDataPtr[],
int visualizeLightCount,
extern "C" __global__ void
RenderStatic( InputHeader inputHeaderPtr[],
InputDataArrays inputDataPtr[],
int visualizeLightCount,
// Output
unsigned int8 framebuffer_r[],
unsigned int8 framebuffer_g[],
unsigned int8 framebuffer_b[]) {
unsigned int8 framebuffer_r[],
unsigned int8 framebuffer_g[],
unsigned int8 framebuffer_b[]) {
const InputHeader inputHeader = *inputHeaderPtr;
const InputDataArrays inputData = *inputDataPtr;
const InputHeader inputHeader = *inputHeaderPtr;
const InputDataArrays inputData = *inputDataPtr;
int num_groups_x = (inputHeader.framebufferWidth +
int num_groups_x = (inputHeader.framebufferWidth +
MIN_TILE_WIDTH - 1) / MIN_TILE_WIDTH;
int num_groups_y = (inputHeader.framebufferHeight +
int num_groups_y = (inputHeader.framebufferHeight +
MIN_TILE_HEIGHT - 1) / MIN_TILE_HEIGHT;
int num_groups = num_groups_x * num_groups_y;
int num_groups = num_groups_x * num_groups_y;
// Launch a task to render each tile, each of which is MIN_TILE_WIDTH
// by MIN_TILE_HEIGHT pixels.
if (programIndex == 0)
RenderTile<<<num_groups, 128>>>(num_groups_x, num_groups_y,
inputHeaderPtr, inputDataPtr, visualizeLightCount,
framebuffer_r, framebuffer_g, framebuffer_b);
cudaDeviceSynchronize();
cudaDeviceSynchronize();
if (programIndex == 0)
RenderTile<<<(num_groups+4-1)/4,128>>>(num_groups_x, num_groups_y,
inputHeaderPtr, inputDataPtr, visualizeLightCount,
framebuffer_r, framebuffer_g, framebuffer_b);
cudaDeviceSynchronize();
}

View File

@@ -0,0 +1,104 @@
/*
Copyright (c) 2010-2012, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#define programCount 32
#define programIndex (threadIdx.x & 31)
#define taskIndex0 (blockIdx.x*4 + (threadIdx.x >> 5))
#define taskCount0 (gridDim.x*4)
#define taskIndex1 (blockIdx.y)
#define taskCount1 (gridDim.y)
#define warpIdx (threadIdx.x >> 5)
__device__
static inline int
mandel(float c_re, float c_im, int count) {
float z_re = c_re, z_im = c_im;
int i;
for (i = 0; i < count; ++i) {
if (z_re * z_re + z_im * z_im > 4.0f)
break;
float new_re = z_re*z_re - z_im*z_im;
float new_im = 2.f * z_re * z_im;
z_re = c_re + new_re;
z_im = c_im + new_im;
}
return i;
}
/* Task to compute the Mandelbrot iterations for a single scanline.
*/
__global__ void
mandelbrot_scanline( float x0, float dx,
float y0, float dy,
int width, int height,
int xspan, int yspan,
int maxIterations, int output[]) {
const int xstart = taskIndex0 * xspan;
const int xend = min(xstart + xspan, width);
const int ystart = taskIndex1 * yspan;
const int yend = min(ystart + yspan, height);
for ( int yi = ystart; yi < yend; yi++)
for ( int xi = xstart; xi < xend; xi += programCount)
{
const float x = x0 + (xi + programIndex) * dx;
const float y = y0 + yi * dy;
const int res = mandel(x,y,maxIterations);
const int index = yi * width + (xi + programIndex);
if (xi + programIndex < xend)
output[index] = res;
}
}
extern "C" __global__ void
mandelbrot_ispc( float x0, float y0,
float x1, float y1,
int width, int height,
int maxIterations, int output[]) {
float dx = (x1 - x0) / width;
float dy = (y1 - y0) / height;
const int xspan = 64; /* make sure it is big enough to avoid false-sharing */
const int yspan = 8;
if (programIndex == 0)
mandelbrot_scanline<<<dim3((width+4-1)/xspan/4,height/yspan),128>>>
(x0, dx, y0, dy, width, height, xspan, yspan, maxIterations, output);
cudaDeviceSynchronize();
}

View File

@@ -2,6 +2,7 @@
#define programIndex (threadIdx.x & 31)
#define taskIndex (blockIdx.x*4 + (threadIdx.x >> 5))
#define taskCount (gridDim.x*4)
#define warpIdx (threadIdx.x >> 5)
#define float3 Float3
struct Float3
@@ -57,17 +58,12 @@ struct Float3
}
};
#if 0
#define DIRISNEG
#endif
#define int8 char
#define int16 short
struct Ray {
float3 origin, dir, invDir;
#ifdef DIRISNEG /* this fails to compile with nvvm */
unsigned int dirIsNeg[3];
#else
unsigned int dirIsNeg0, dirIsNeg1, dirIsNeg2;
#endif
float mint, maxt;
int hitId;
};
@@ -78,8 +74,6 @@ struct Triangle {
int pad[3];
};
#define int8 char
#define int16 short
struct LinearBVHNode {
float bounds[2][3];
unsigned int offset; // num primitives for leaf, second child for interior
@@ -105,7 +99,8 @@ static inline float Dot(const float3 a, const float3 b) {
}
__device__
static inline void generateRay( const float raster2camera[4][4],
inline
static void generateRay( const float raster2camera[4][4],
const float camera2world[4][4],
float x, float y, Ray &ray) {
ray.mint = 0.f;
@@ -135,7 +130,7 @@ static inline void generateRay( const float raster2camera[4][4],
ray.invDir = 1.f / ray.dir;
#ifdef DIRISNEG
#if 0
ray.dirIsNeg[0] = any(ray.invDir.x < 0) ? 1 : 0;
ray.dirIsNeg[1] = any(ray.invDir.y < 0) ? 1 : 0;
ray.dirIsNeg[2] = any(ray.invDir.z < 0) ? 1 : 0;
@@ -146,9 +141,9 @@ static inline void generateRay( const float raster2camera[4][4],
#endif
}
__device__
static inline bool BBoxIntersect(const float bounds[2][3],
inline
static bool BBoxIntersect(const float bounds[2][3],
const Ray &ray) {
float3 bounds0 = { bounds[0][0], bounds[0][1], bounds[0][2] };
float3 bounds1 = { bounds[1][0], bounds[1][1], bounds[1][2] };
@@ -186,9 +181,9 @@ static inline bool BBoxIntersect(const float bounds[2][3],
}
__device__
static inline bool TriIntersect(const Triangle &tri, Ray &ray) {
inline
static bool TriIntersect(const Triangle &tri, Ray &ray) {
float3 p0 = { tri.p[0][0], tri.p[0][1], tri.p[0][2] };
float3 p1 = { tri.p[1][0], tri.p[1][1], tri.p[1][2] };
float3 p2 = { tri.p[2][0], tri.p[2][1], tri.p[2][2] };
@@ -227,15 +222,15 @@ static inline bool TriIntersect(const Triangle &tri, Ray &ray) {
return hit;
}
__device__
static inline bool BVHIntersect(const LinearBVHNode nodes[],
const Triangle tris[], Ray &r) {
inline
bool BVHIntersect(const LinearBVHNode nodes[],
const Triangle tris[], Ray &r,
int todo[]) {
Ray ray = r;
bool hit = false;
// Follow ray through BVH nodes to find primitive intersections
int todoOffset = 0, nodeNum = 0;
int todo[64];
while (true) {
// Check ray against BVH node
@@ -249,20 +244,17 @@ static inline bool BVHIntersect(const LinearBVHNode nodes[],
if (TriIntersect(tris[primitivesOffset+i], ray))
hit = true;
}
if (todoOffset == 0)
break;
nodeNum = todo[--todoOffset];
}
else {
// Put far BVH node on _todo_ stack, advance to near node
#ifdef DIRISNEG
const int dirIsNeg = r.dirIsNeg[node.splitAxis];
#else
int dirIsNeg;
if (node.splitAxis == 0) dirIsNeg = r.dirIsNeg0;
if (node.splitAxis == 1) dirIsNeg = r.dirIsNeg1;
if (node.splitAxis == 2) dirIsNeg = r.dirIsNeg2;
#endif
if (dirIsNeg)
{
if (dirIsNeg) {
todo[todoOffset++] = nodeNum + 1;
nodeNum = node.offset;
}
@@ -273,10 +265,10 @@ static inline bool BVHIntersect(const LinearBVHNode nodes[],
}
}
else {
nodeNum = todo[--todoOffset];
}
if (todoOffset == 0)
break;
nodeNum = todo[--todoOffset];
}
}
r.maxt = ray.maxt;
r.hitId = ray.hitId;
@@ -284,9 +276,9 @@ static inline bool BVHIntersect(const LinearBVHNode nodes[],
return hit;
}
__device__
static inline void raytrace_tile( int x0, int x1,
inline
static void raytrace_tile( int x0, int x1,
int y0, int y1,
int width, int height,
int baseWidth, int baseHeight,
@@ -298,28 +290,34 @@ static inline void raytrace_tile( int x0, int x1,
float widthScale = (float)(baseWidth) / (float)(width);
float heightScale = (float)(baseHeight) / (float)(height);
// foreach_tiled (y = y0 ... y1, x = x0 ... x1)
for ( int y = y0; y < y1; y++)
for ( int xb = x0; xb < x1; xb += programCount)
{
const int x = xb + programIndex;
Ray ray;
generateRay(raster2camera, camera2world, x*widthScale,
y*heightScale, ray);
BVHIntersect(nodes, triangles, ray);
#if 0
int * todo = new int[64];
#define ALLOC
#else
int todo[64];
#endif
int offset = y * width + x;
for (int y = y0 ;y < y1; y++)
for (int x = x0 + programIndex; x < x1; x += programCount)
if (x < x1)
{
Ray ray;
generateRay(raster2camera, camera2world, x*widthScale,
y*heightScale, ray);
BVHIntersect(nodes, triangles, ray, todo);
int offset = y * width + x;
image[offset] = ray.maxt;
id[offset] = ray.hitId;
}
}
#ifdef ALLOC
delete todo;
#endif
}
extern "C"
__global__
void raytrace_tile_task( int width, int height,
int baseWidth, int baseHeight,
@@ -328,18 +326,34 @@ void raytrace_tile_task( int width, int height,
float image[], int id[],
const LinearBVHNode nodes[],
const Triangle triangles[]) {
if (taskIndex >= taskCount) return;
int dx = 32, dy = 16; // must match dx, dy below
int dx = 64, dy = 8; // must match dx, dy below
int xBuckets = (width + (dx-1)) / dx;
int x0 = (taskIndex % xBuckets) * dx;
int x1 = min(x0 + dx, width);
int y0 = (taskIndex / xBuckets) * dy;
int y1 = min(y0 + dy, height);
raytrace_tile(x0, x1, y0, y1, width, height, baseWidth, baseHeight,
raster2camera, camera2world, image,
id, nodes, triangles);
}
extern "C" __global__ void raytrace_ispc_tasks( int width, int height,
int baseWidth, int baseHeight,
const float raster2camera[4][4],
const float camera2world[4][4],
float image[], int id[],
const LinearBVHNode nodes[],
const Triangle triangles[]) {
int dx = 64, dy = 8;
int xBuckets = (width + (dx-1)) / dx;
int yBuckets = (height + (dy-1)) / dy;
int nTasks = xBuckets * yBuckets;
if (programIndex == 0)
raytrace_tile_task<<<(nTasks+4-1)/4,128>>>(width, height, baseWidth, baseHeight,
raster2camera, camera2world,
image, id, nodes, triangles);
cudaDeviceSynchronize();
}

View File

@@ -247,7 +247,7 @@ void sort_ispc ( int n, unsigned int code[], int order[], int ntasks,
{
int num = ntasks;
int span = n / num;
#if 0
#if 1
int hsize = 256*programCount*num;
int * hist = __new< int>(hsize);
int64 * pair = __new< int64>(n);
@@ -293,7 +293,7 @@ void sort_ispc ( int n, unsigned int code[], int order[], int ntasks,
unpack<<<nbx(num),128>>> (span, n, pair, code, order);
sync;
#if ALLOCATED
#ifdef ALLOCATED
__delete(g);
__delete(hist);
__delete(pair);