+1
This commit is contained in:
@@ -2,7 +2,7 @@
|
|||||||
EXAMPLE=rt
|
EXAMPLE=rt
|
||||||
CPP_SRC=rt.cpp rt_serial.cpp
|
CPP_SRC=rt.cpp rt_serial.cpp
|
||||||
ISPC_SRC=rt.ispc
|
ISPC_SRC=rt.ispc
|
||||||
ISPC_IA_TARGETS=sse2-i32x4,sse4-i32x8,avx1-i32x8,avx2-i32x8
|
ISPC_IA_TARGETS=avx1-i32x8
|
||||||
ISPC_ARM_TARGETS=neon
|
ISPC_ARM_TARGETS=neon
|
||||||
|
|
||||||
include ../common.mk
|
include ../common.mk
|
||||||
|
|||||||
@@ -45,7 +45,7 @@
|
|||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include <sys/types.h>
|
#include <sys/types.h>
|
||||||
#include "../timing.h"
|
#include "../timing.h"
|
||||||
#include "rt_ispc.h"
|
#include "rt1_ispc.h"
|
||||||
|
|
||||||
#include <sys/time.h>
|
#include <sys/time.h>
|
||||||
|
|
||||||
@@ -513,7 +513,7 @@ int main(int argc, char *argv[]) {
|
|||||||
(Triangle*)d_triangles);
|
(Triangle*)d_triangles);
|
||||||
double dt = rtc() - t0; //get_elapsed_mcycles();
|
double dt = rtc() - t0; //get_elapsed_mcycles();
|
||||||
#else
|
#else
|
||||||
const char * func_name = "raytrace_ispc_tasks";
|
const char * func_name = "raytrace_ispc_tasks___export";
|
||||||
void *func_args[] = {&width, &height, &baseWidth, &baseHeight,
|
void *func_args[] = {&width, &height, &baseWidth, &baseHeight,
|
||||||
&d_raster2camera, &d_camera2world,
|
&d_raster2camera, &d_camera2world,
|
||||||
&d_image, &d_id,
|
&d_image, &d_id,
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
PROG=ao_gpu
|
PROG=ao
|
||||||
ISPC_SRC=ao.ispc
|
ISPC_SRC=ao.ispc
|
||||||
|
CU_SRC=ao.cu
|
||||||
CXX_SRC=ao.cpp ao_serial.cpp
|
CXX_SRC=ao.cpp ao_serial.cpp
|
||||||
PTXCC_REGMAX=64
|
PTXCC_REGMAX=64
|
||||||
|
|
||||||
|
|||||||
@@ -5,10 +5,11 @@ CXX=g++
|
|||||||
CXXFLAGS=-O3 -I$(CUDATK)/include -Iobjs_gpu/ -D_CUDA_
|
CXXFLAGS=-O3 -I$(CUDATK)/include -Iobjs_gpu/ -D_CUDA_
|
||||||
#
|
#
|
||||||
NVCC=nvcc
|
NVCC=nvcc
|
||||||
NVCC_FLAGS=-O3 -arch=sm_35 -D_CUDA_ -I../
|
NVCC_FLAGS=-O3 -arch=sm_35 -D_CUDA_ -I../ -Xptxas=-v
|
||||||
ifdef PTXCC_REGMAX
|
ifdef PTXCC_REGMAX
|
||||||
NVCC_FLAGS += --maxrregcount=$(PTXCC_REGMAX)
|
NVCC_FLAGS += --maxrregcount=$(PTXCC_REGMAX)
|
||||||
endif
|
endif
|
||||||
|
NVCC_FLAGS+=--use_fast_math
|
||||||
#
|
#
|
||||||
LD=nvcc
|
LD=nvcc
|
||||||
LDFLAGS=-lcudart -lcudadevrt -arch=sm_35
|
LDFLAGS=-lcudart -lcudadevrt -arch=sm_35
|
||||||
|
|||||||
@@ -6,5 +6,6 @@
|
|||||||
#define taskCount0 (gridDim.x*4)
|
#define taskCount0 (gridDim.x*4)
|
||||||
#define taskIndex1 (blockIdx.y)
|
#define taskIndex1 (blockIdx.y)
|
||||||
#define taskCount1 (gridDim.y)
|
#define taskCount1 (gridDim.y)
|
||||||
|
#define taskIndex (taskIndex0 + taskCount0*taskIndex1)
|
||||||
#define warpIdx (threadIdx.x >> 5)
|
#define warpIdx (threadIdx.x >> 5)
|
||||||
#define launch(ntx,nty,ntz,func) if (programIndex==0) func<<<dim3(((ntx)+4-1)/4,nty,ntz),128>>>
|
#define launch(ntx,nty,ntz,func) if (programIndex==0) func<<<dim3(((ntx)+4-1)/4,nty,ntz),128>>>
|
||||||
|
|||||||
@@ -197,7 +197,12 @@ int main(int argc, char *argv[]) {
|
|||||||
// And then read the triangles
|
// And then read the triangles
|
||||||
uint nTris;
|
uint nTris;
|
||||||
READ(nTris, 1);
|
READ(nTris, 1);
|
||||||
|
#if 0
|
||||||
Triangle *triangles = new Triangle[nTris];
|
Triangle *triangles = new Triangle[nTris];
|
||||||
|
#else
|
||||||
|
Triangle *triangles;
|
||||||
|
ispc_malloc((void**)&triangles, nTris*sizeof(Triangle));
|
||||||
|
#endif
|
||||||
for (uint i = 0; i < nTris; ++i) {
|
for (uint i = 0; i < nTris; ++i) {
|
||||||
// 9x floats for the 3 vertices
|
// 9x floats for the 3 vertices
|
||||||
float v[9];
|
float v[9];
|
||||||
@@ -246,8 +251,8 @@ int main(int argc, char *argv[]) {
|
|||||||
writeImage(id, image, width, height, "rt-ispc-1core.ppm");
|
writeImage(id, image, width, height, "rt-ispc-1core.ppm");
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
memset(id, 0, width*height*sizeof(int));
|
ispc_memset(id, 0, width*height*sizeof(int));
|
||||||
memset(image, 0, width*height*sizeof(float));
|
ispc_memset(image, 0, width*height*sizeof(float));
|
||||||
|
|
||||||
//
|
//
|
||||||
// Run 3 iterations with ispc + 1 core, record the minimum time
|
// Run 3 iterations with ispc + 1 core, record the minimum time
|
||||||
@@ -266,8 +271,8 @@ int main(int argc, char *argv[]) {
|
|||||||
|
|
||||||
writeImage(id, image, width, height, "rt-ispc-tasks.ppm");
|
writeImage(id, image, width, height, "rt-ispc-tasks.ppm");
|
||||||
|
|
||||||
memset(id, 0, width*height*sizeof(int));
|
ispc_memset(id, 0, width*height*sizeof(int));
|
||||||
memset(image, 0, width*height*sizeof(float));
|
ispc_memset(image, 0, width*height*sizeof(float));
|
||||||
|
|
||||||
//
|
//
|
||||||
// And 3 iterations with the serial implementation, reporting the
|
// And 3 iterations with the serial implementation, reporting the
|
||||||
|
|||||||
@@ -1,8 +1,4 @@
|
|||||||
#define programCount 32
|
#include "cuda_helpers.cuh"
|
||||||
#define programIndex (threadIdx.x & 31)
|
|
||||||
#define taskIndex (blockIdx.x*4 + (threadIdx.x >> 5))
|
|
||||||
#define taskCount (gridDim.x*4)
|
|
||||||
#define warpIdx (threadIdx.x >> 5)
|
|
||||||
|
|
||||||
#define float3 Float3
|
#define float3 Float3
|
||||||
struct Float3
|
struct Float3
|
||||||
@@ -339,7 +335,7 @@ void raytrace_tile_task( int width, int height,
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
extern "C" __global__ void raytrace_ispc_tasks( int width, int height,
|
extern "C" __global__ void raytrace_ispc_tasks___export( int width, int height,
|
||||||
int baseWidth, int baseHeight,
|
int baseWidth, int baseHeight,
|
||||||
const float raster2camera[4][4],
|
const float raster2camera[4][4],
|
||||||
const float camera2world[4][4],
|
const float camera2world[4][4],
|
||||||
@@ -350,10 +346,28 @@ extern "C" __global__ void raytrace_ispc_tasks( int width, int height,
|
|||||||
int xBuckets = (width + (dx-1)) / dx;
|
int xBuckets = (width + (dx-1)) / dx;
|
||||||
int yBuckets = (height + (dy-1)) / dy;
|
int yBuckets = (height + (dy-1)) / dy;
|
||||||
int nTasks = xBuckets * yBuckets;
|
int nTasks = xBuckets * yBuckets;
|
||||||
if (programIndex == 0)
|
launch(nTasks,1,1,raytrace_tile_task)
|
||||||
raytrace_tile_task<<<(nTasks+4-1)/4,128>>>(width, height, baseWidth, baseHeight,
|
(width, height, baseWidth, baseHeight,
|
||||||
raster2camera, camera2world,
|
raster2camera, camera2world,
|
||||||
image, id, nodes, triangles);
|
image, id, nodes, triangles);
|
||||||
cudaDeviceSynchronize();
|
cudaDeviceSynchronize();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
extern "C" __host__ void raytrace_ispc_tasks( int width, int height,
|
||||||
|
int baseWidth, int baseHeight,
|
||||||
|
const float raster2camera[4][4],
|
||||||
|
const float camera2world[4][4],
|
||||||
|
float image[], int id[],
|
||||||
|
const LinearBVHNode nodes[],
|
||||||
|
const Triangle triangles[]) {
|
||||||
|
raytrace_ispc_tasks___export<<<1,32>>>( width, height,
|
||||||
|
baseWidth, baseHeight,
|
||||||
|
raster2camera,
|
||||||
|
camera2world,
|
||||||
|
image, id,
|
||||||
|
nodes,
|
||||||
|
triangles);
|
||||||
|
cudaDeviceSynchronize();
|
||||||
|
}
|
||||||
|
|||||||
@@ -31,13 +31,25 @@
|
|||||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#define bool int
|
typedef bool bool_t;
|
||||||
|
|
||||||
typedef float<3> float3;
|
typedef float<3> float3;
|
||||||
|
|
||||||
|
#if 1
|
||||||
|
#define __ORIG
|
||||||
|
#endif
|
||||||
|
|
||||||
|
struct int3
|
||||||
|
{
|
||||||
|
int x,y,z;
|
||||||
|
};
|
||||||
|
|
||||||
struct Ray {
|
struct Ray {
|
||||||
float3 origin, dir, invDir;
|
float3 origin, dir, invDir;
|
||||||
|
#ifdef __ORIG
|
||||||
uniform unsigned int dirIsNeg[3];
|
uniform unsigned int dirIsNeg[3];
|
||||||
|
#else
|
||||||
|
uniform int3 dirIsNeg;
|
||||||
|
#endif
|
||||||
float mint, maxt;
|
float mint, maxt;
|
||||||
int hitId;
|
int hitId;
|
||||||
};
|
};
|
||||||
@@ -101,13 +113,19 @@ static void generateRay(uniform const float raster2camera[4][4],
|
|||||||
|
|
||||||
ray.invDir = 1.f / ray.dir;
|
ray.invDir = 1.f / ray.dir;
|
||||||
|
|
||||||
|
#ifdef __ORIG
|
||||||
ray.dirIsNeg[0] = any(ray.invDir.x < 0) ? 1 : 0;
|
ray.dirIsNeg[0] = any(ray.invDir.x < 0) ? 1 : 0;
|
||||||
ray.dirIsNeg[1] = any(ray.invDir.y < 0) ? 1 : 0;
|
ray.dirIsNeg[1] = any(ray.invDir.y < 0) ? 1 : 0;
|
||||||
ray.dirIsNeg[2] = any(ray.invDir.z < 0) ? 1 : 0;
|
ray.dirIsNeg[2] = any(ray.invDir.z < 0) ? 1 : 0;
|
||||||
|
#else
|
||||||
|
ray.dirIsNeg.x = any(ray.invDir.x < 0) ? 1 : 0;
|
||||||
|
ray.dirIsNeg.y = any(ray.invDir.y < 0) ? 1 : 0;
|
||||||
|
ray.dirIsNeg.z = any(ray.invDir.z < 0) ? 1 : 0;
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static bool BBoxIntersect(const uniform float bounds[2][3],
|
static bool_t BBoxIntersect(const uniform float bounds[2][3],
|
||||||
const Ray &ray) {
|
const Ray &ray) {
|
||||||
uniform float3 bounds0 = { bounds[0][0], bounds[0][1], bounds[0][2] };
|
uniform float3 bounds0 = { bounds[0][0], bounds[0][1], bounds[0][2] };
|
||||||
uniform float3 bounds1 = { bounds[1][0], bounds[1][1], bounds[1][2] };
|
uniform float3 bounds1 = { bounds[1][0], bounds[1][1], bounds[1][2] };
|
||||||
@@ -146,7 +164,7 @@ static bool BBoxIntersect(const uniform float bounds[2][3],
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
static bool TriIntersect(const uniform Triangle &tri, Ray &ray) {
|
static bool_t TriIntersect(const uniform Triangle &tri, Ray &ray) {
|
||||||
uniform float3 p0 = { tri.p[0][0], tri.p[0][1], tri.p[0][2] };
|
uniform float3 p0 = { tri.p[0][0], tri.p[0][1], tri.p[0][2] };
|
||||||
uniform float3 p1 = { tri.p[1][0], tri.p[1][1], tri.p[1][2] };
|
uniform float3 p1 = { tri.p[1][0], tri.p[1][1], tri.p[1][2] };
|
||||||
uniform float3 p2 = { tri.p[2][0], tri.p[2][1], tri.p[2][2] };
|
uniform float3 p2 = { tri.p[2][0], tri.p[2][1], tri.p[2][2] };
|
||||||
@@ -155,7 +173,7 @@ static bool TriIntersect(const uniform Triangle &tri, Ray &ray) {
|
|||||||
|
|
||||||
float3 s1 = Cross(ray.dir, e2);
|
float3 s1 = Cross(ray.dir, e2);
|
||||||
float divisor = Dot(s1, e1);
|
float divisor = Dot(s1, e1);
|
||||||
bool hit = true;
|
bool_t hit = true;
|
||||||
|
|
||||||
if (divisor == 0.)
|
if (divisor == 0.)
|
||||||
hit = false;
|
hit = false;
|
||||||
@@ -186,10 +204,11 @@ static bool TriIntersect(const uniform Triangle &tri, Ray &ray) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
bool BVHIntersect(const uniform LinearBVHNode nodes[],
|
bool_t
|
||||||
|
BVHIntersect(const uniform LinearBVHNode nodes[],
|
||||||
const uniform Triangle tris[], Ray &r) {
|
const uniform Triangle tris[], Ray &r) {
|
||||||
Ray ray = r;
|
Ray ray = r;
|
||||||
bool hit = false;
|
bool_t hit = false;
|
||||||
// Follow ray through BVH nodes to find primitive intersections
|
// Follow ray through BVH nodes to find primitive intersections
|
||||||
uniform int todoOffset = 0, nodeNum = 0;
|
uniform int todoOffset = 0, nodeNum = 0;
|
||||||
uniform int todo[64];
|
uniform int todo[64];
|
||||||
@@ -212,7 +231,15 @@ bool BVHIntersect(const uniform LinearBVHNode nodes[],
|
|||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
// Put far BVH node on _todo_ stack, advance to near node
|
// Put far BVH node on _todo_ stack, advance to near node
|
||||||
if (r.dirIsNeg[node.splitAxis]) {
|
#ifdef __ORIG
|
||||||
|
int dirIsNeg = r.dirIsNeg[node.splitAxis];
|
||||||
|
#else
|
||||||
|
int dirIsNeg;
|
||||||
|
if (node.splitAxis == 0) dirIsNeg = r.dirIsNeg.x;
|
||||||
|
if (node.splitAxis == 1) dirIsNeg = r.dirIsNeg.y;
|
||||||
|
if (node.splitAxis == 2) dirIsNeg = r.dirIsNeg.z;
|
||||||
|
#endif
|
||||||
|
if (dirIsNeg) {
|
||||||
todo[todoOffset++] = nodeNum + 1;
|
todo[todoOffset++] = nodeNum + 1;
|
||||||
nodeNum = node.offset;
|
nodeNum = node.offset;
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user