+1
This commit is contained in:
@@ -1,5 +1,6 @@
|
||||
PROG=ao_gpu
|
||||
PROG=ao
|
||||
ISPC_SRC=ao.ispc
|
||||
CU_SRC=ao.cu
|
||||
CXX_SRC=ao.cpp ao_serial.cpp
|
||||
PTXCC_REGMAX=64
|
||||
|
||||
|
||||
@@ -5,10 +5,11 @@ CXX=g++
|
||||
CXXFLAGS=-O3 -I$(CUDATK)/include -Iobjs_gpu/ -D_CUDA_
|
||||
#
|
||||
NVCC=nvcc
|
||||
NVCC_FLAGS=-O3 -arch=sm_35 -D_CUDA_ -I../
|
||||
NVCC_FLAGS=-O3 -arch=sm_35 -D_CUDA_ -I../ -Xptxas=-v
|
||||
ifdef PTXCC_REGMAX
|
||||
NVCC_FLAGS += --maxrregcount=$(PTXCC_REGMAX)
|
||||
endif
|
||||
NVCC_FLAGS+=--use_fast_math
|
||||
#
|
||||
LD=nvcc
|
||||
LDFLAGS=-lcudart -lcudadevrt -arch=sm_35
|
||||
|
||||
@@ -6,5 +6,6 @@
|
||||
#define taskCount0 (gridDim.x*4)
|
||||
#define taskIndex1 (blockIdx.y)
|
||||
#define taskCount1 (gridDim.y)
|
||||
#define taskIndex (taskIndex0 + taskCount0*taskIndex1)
|
||||
#define warpIdx (threadIdx.x >> 5)
|
||||
#define launch(ntx,nty,ntz,func) if (programIndex==0) func<<<dim3(((ntx)+4-1)/4,nty,ntz),128>>>
|
||||
|
||||
@@ -197,7 +197,12 @@ int main(int argc, char *argv[]) {
|
||||
// And then read the triangles
|
||||
uint nTris;
|
||||
READ(nTris, 1);
|
||||
#if 0
|
||||
Triangle *triangles = new Triangle[nTris];
|
||||
#else
|
||||
Triangle *triangles;
|
||||
ispc_malloc((void**)&triangles, nTris*sizeof(Triangle));
|
||||
#endif
|
||||
for (uint i = 0; i < nTris; ++i) {
|
||||
// 9x floats for the 3 vertices
|
||||
float v[9];
|
||||
@@ -246,8 +251,8 @@ int main(int argc, char *argv[]) {
|
||||
writeImage(id, image, width, height, "rt-ispc-1core.ppm");
|
||||
#endif
|
||||
|
||||
memset(id, 0, width*height*sizeof(int));
|
||||
memset(image, 0, width*height*sizeof(float));
|
||||
ispc_memset(id, 0, width*height*sizeof(int));
|
||||
ispc_memset(image, 0, width*height*sizeof(float));
|
||||
|
||||
//
|
||||
// Run 3 iterations with ispc + 1 core, record the minimum time
|
||||
@@ -266,8 +271,8 @@ int main(int argc, char *argv[]) {
|
||||
|
||||
writeImage(id, image, width, height, "rt-ispc-tasks.ppm");
|
||||
|
||||
memset(id, 0, width*height*sizeof(int));
|
||||
memset(image, 0, width*height*sizeof(float));
|
||||
ispc_memset(id, 0, width*height*sizeof(int));
|
||||
ispc_memset(image, 0, width*height*sizeof(float));
|
||||
|
||||
//
|
||||
// And 3 iterations with the serial implementation, reporting the
|
||||
|
||||
@@ -1,8 +1,4 @@
|
||||
#define programCount 32
|
||||
#define programIndex (threadIdx.x & 31)
|
||||
#define taskIndex (blockIdx.x*4 + (threadIdx.x >> 5))
|
||||
#define taskCount (gridDim.x*4)
|
||||
#define warpIdx (threadIdx.x >> 5)
|
||||
#include "cuda_helpers.cuh"
|
||||
|
||||
#define float3 Float3
|
||||
struct Float3
|
||||
@@ -339,7 +335,7 @@ void raytrace_tile_task( int width, int height,
|
||||
}
|
||||
|
||||
|
||||
extern "C" __global__ void raytrace_ispc_tasks( int width, int height,
|
||||
extern "C" __global__ void raytrace_ispc_tasks___export( int width, int height,
|
||||
int baseWidth, int baseHeight,
|
||||
const float raster2camera[4][4],
|
||||
const float camera2world[4][4],
|
||||
@@ -350,10 +346,28 @@ extern "C" __global__ void raytrace_ispc_tasks( int width, int height,
|
||||
int xBuckets = (width + (dx-1)) / dx;
|
||||
int yBuckets = (height + (dy-1)) / dy;
|
||||
int nTasks = xBuckets * yBuckets;
|
||||
if (programIndex == 0)
|
||||
raytrace_tile_task<<<(nTasks+4-1)/4,128>>>(width, height, baseWidth, baseHeight,
|
||||
raster2camera, camera2world,
|
||||
image, id, nodes, triangles);
|
||||
launch(nTasks,1,1,raytrace_tile_task)
|
||||
(width, height, baseWidth, baseHeight,
|
||||
raster2camera, camera2world,
|
||||
image, id, nodes, triangles);
|
||||
cudaDeviceSynchronize();
|
||||
}
|
||||
|
||||
|
||||
|
||||
extern "C" __host__ void raytrace_ispc_tasks( int width, int height,
|
||||
int baseWidth, int baseHeight,
|
||||
const float raster2camera[4][4],
|
||||
const float camera2world[4][4],
|
||||
float image[], int id[],
|
||||
const LinearBVHNode nodes[],
|
||||
const Triangle triangles[]) {
|
||||
raytrace_ispc_tasks___export<<<1,32>>>( width, height,
|
||||
baseWidth, baseHeight,
|
||||
raster2camera,
|
||||
camera2world,
|
||||
image, id,
|
||||
nodes,
|
||||
triangles);
|
||||
cudaDeviceSynchronize();
|
||||
}
|
||||
|
||||
@@ -31,13 +31,25 @@
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#define bool int
|
||||
|
||||
typedef bool bool_t;
|
||||
typedef float<3> float3;
|
||||
|
||||
#if 1
|
||||
#define __ORIG
|
||||
#endif
|
||||
|
||||
struct int3
|
||||
{
|
||||
int x,y,z;
|
||||
};
|
||||
|
||||
struct Ray {
|
||||
float3 origin, dir, invDir;
|
||||
#ifdef __ORIG
|
||||
uniform unsigned int dirIsNeg[3];
|
||||
#else
|
||||
uniform int3 dirIsNeg;
|
||||
#endif
|
||||
float mint, maxt;
|
||||
int hitId;
|
||||
};
|
||||
@@ -101,13 +113,19 @@ static void generateRay(uniform const float raster2camera[4][4],
|
||||
|
||||
ray.invDir = 1.f / ray.dir;
|
||||
|
||||
#ifdef __ORIG
|
||||
ray.dirIsNeg[0] = any(ray.invDir.x < 0) ? 1 : 0;
|
||||
ray.dirIsNeg[1] = any(ray.invDir.y < 0) ? 1 : 0;
|
||||
ray.dirIsNeg[2] = any(ray.invDir.z < 0) ? 1 : 0;
|
||||
#else
|
||||
ray.dirIsNeg.x = any(ray.invDir.x < 0) ? 1 : 0;
|
||||
ray.dirIsNeg.y = any(ray.invDir.y < 0) ? 1 : 0;
|
||||
ray.dirIsNeg.z = any(ray.invDir.z < 0) ? 1 : 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
static bool BBoxIntersect(const uniform float bounds[2][3],
|
||||
static bool_t BBoxIntersect(const uniform float bounds[2][3],
|
||||
const Ray &ray) {
|
||||
uniform float3 bounds0 = { bounds[0][0], bounds[0][1], bounds[0][2] };
|
||||
uniform float3 bounds1 = { bounds[1][0], bounds[1][1], bounds[1][2] };
|
||||
@@ -146,7 +164,7 @@ static bool BBoxIntersect(const uniform float bounds[2][3],
|
||||
|
||||
|
||||
|
||||
static bool TriIntersect(const uniform Triangle &tri, Ray &ray) {
|
||||
static bool_t TriIntersect(const uniform Triangle &tri, Ray &ray) {
|
||||
uniform float3 p0 = { tri.p[0][0], tri.p[0][1], tri.p[0][2] };
|
||||
uniform float3 p1 = { tri.p[1][0], tri.p[1][1], tri.p[1][2] };
|
||||
uniform float3 p2 = { tri.p[2][0], tri.p[2][1], tri.p[2][2] };
|
||||
@@ -155,7 +173,7 @@ static bool TriIntersect(const uniform Triangle &tri, Ray &ray) {
|
||||
|
||||
float3 s1 = Cross(ray.dir, e2);
|
||||
float divisor = Dot(s1, e1);
|
||||
bool hit = true;
|
||||
bool_t hit = true;
|
||||
|
||||
if (divisor == 0.)
|
||||
hit = false;
|
||||
@@ -186,10 +204,11 @@ static bool TriIntersect(const uniform Triangle &tri, Ray &ray) {
|
||||
}
|
||||
|
||||
|
||||
bool BVHIntersect(const uniform LinearBVHNode nodes[],
|
||||
bool_t
|
||||
BVHIntersect(const uniform LinearBVHNode nodes[],
|
||||
const uniform Triangle tris[], Ray &r) {
|
||||
Ray ray = r;
|
||||
bool hit = false;
|
||||
bool_t hit = false;
|
||||
// Follow ray through BVH nodes to find primitive intersections
|
||||
uniform int todoOffset = 0, nodeNum = 0;
|
||||
uniform int todo[64];
|
||||
@@ -212,7 +231,15 @@ bool BVHIntersect(const uniform LinearBVHNode nodes[],
|
||||
}
|
||||
else {
|
||||
// Put far BVH node on _todo_ stack, advance to near node
|
||||
if (r.dirIsNeg[node.splitAxis]) {
|
||||
#ifdef __ORIG
|
||||
int dirIsNeg = r.dirIsNeg[node.splitAxis];
|
||||
#else
|
||||
int dirIsNeg;
|
||||
if (node.splitAxis == 0) dirIsNeg = r.dirIsNeg.x;
|
||||
if (node.splitAxis == 1) dirIsNeg = r.dirIsNeg.y;
|
||||
if (node.splitAxis == 2) dirIsNeg = r.dirIsNeg.z;
|
||||
#endif
|
||||
if (dirIsNeg) {
|
||||
todo[todoOffset++] = nodeNum + 1;
|
||||
nodeNum = node.offset;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user