diff --git a/examples/rt/Makefile b/examples/rt/Makefile index e52b02e4..0c72f104 100644 --- a/examples/rt/Makefile +++ b/examples/rt/Makefile @@ -2,7 +2,7 @@ EXAMPLE=rt CPP_SRC=rt.cpp rt_serial.cpp ISPC_SRC=rt.ispc -ISPC_IA_TARGETS=sse2-i32x4,sse4-i32x8,avx1-i32x8,avx2-i32x8 +ISPC_IA_TARGETS=avx1-i32x8 ISPC_ARM_TARGETS=neon include ../common.mk diff --git a/examples_cuda/rt/rt_cu.cpp b/examples_cuda/rt/rt_cu.cpp index b8447b83..6969e4be 100644 --- a/examples_cuda/rt/rt_cu.cpp +++ b/examples_cuda/rt/rt_cu.cpp @@ -45,7 +45,7 @@ #include #include #include "../timing.h" -#include "rt_ispc.h" +#include "rt1_ispc.h" #include @@ -513,7 +513,7 @@ int main(int argc, char *argv[]) { (Triangle*)d_triangles); double dt = rtc() - t0; //get_elapsed_mcycles(); #else - const char * func_name = "raytrace_ispc_tasks"; + const char * func_name = "raytrace_ispc_tasks___export"; void *func_args[] = {&width, &height, &baseWidth, &baseHeight, &d_raster2camera, &d_camera2world, &d_image, &d_id, diff --git a/examples_ptx/aobench/Makefile_gpu b/examples_ptx/aobench/Makefile_gpu index fa58d307..d3cc3663 100644 --- a/examples_ptx/aobench/Makefile_gpu +++ b/examples_ptx/aobench/Makefile_gpu @@ -1,5 +1,6 @@ -PROG=ao_gpu +PROG=ao ISPC_SRC=ao.ispc +CU_SRC=ao.cu CXX_SRC=ao.cpp ao_serial.cpp PTXCC_REGMAX=64 diff --git a/examples_ptx/common_gpu.mk b/examples_ptx/common_gpu.mk index adc806bc..79e3b47b 100644 --- a/examples_ptx/common_gpu.mk +++ b/examples_ptx/common_gpu.mk @@ -5,10 +5,11 @@ CXX=g++ CXXFLAGS=-O3 -I$(CUDATK)/include -Iobjs_gpu/ -D_CUDA_ # NVCC=nvcc -NVCC_FLAGS=-O3 -arch=sm_35 -D_CUDA_ -I../ +NVCC_FLAGS=-O3 -arch=sm_35 -D_CUDA_ -I../ -Xptxas=-v ifdef PTXCC_REGMAX NVCC_FLAGS += --maxrregcount=$(PTXCC_REGMAX) endif +NVCC_FLAGS+=--use_fast_math # LD=nvcc LDFLAGS=-lcudart -lcudadevrt -arch=sm_35 diff --git a/examples_ptx/cuda_helpers.cuh b/examples_ptx/cuda_helpers.cuh index 1a3ff226..689753e1 100644 --- a/examples_ptx/cuda_helpers.cuh +++ b/examples_ptx/cuda_helpers.cuh @@ -6,5 +6,6 @@ #define taskCount0 (gridDim.x*4) #define taskIndex1 (blockIdx.y) #define taskCount1 (gridDim.y) +#define taskIndex (taskIndex0 + taskCount0*taskIndex1) #define warpIdx (threadIdx.x >> 5) #define launch(ntx,nty,ntz,func) if (programIndex==0) func<<>> diff --git a/examples_ptx/rt/rt.cpp b/examples_ptx/rt/rt.cpp index ea1594c5..f269ca3b 100644 --- a/examples_ptx/rt/rt.cpp +++ b/examples_ptx/rt/rt.cpp @@ -197,7 +197,12 @@ int main(int argc, char *argv[]) { // And then read the triangles uint nTris; READ(nTris, 1); +#if 0 Triangle *triangles = new Triangle[nTris]; +#else + Triangle *triangles; + ispc_malloc((void**)&triangles, nTris*sizeof(Triangle)); +#endif for (uint i = 0; i < nTris; ++i) { // 9x floats for the 3 vertices float v[9]; @@ -246,8 +251,8 @@ int main(int argc, char *argv[]) { writeImage(id, image, width, height, "rt-ispc-1core.ppm"); #endif - memset(id, 0, width*height*sizeof(int)); - memset(image, 0, width*height*sizeof(float)); + ispc_memset(id, 0, width*height*sizeof(int)); + ispc_memset(image, 0, width*height*sizeof(float)); // // Run 3 iterations with ispc + 1 core, record the minimum time @@ -266,8 +271,8 @@ int main(int argc, char *argv[]) { writeImage(id, image, width, height, "rt-ispc-tasks.ppm"); - memset(id, 0, width*height*sizeof(int)); - memset(image, 0, width*height*sizeof(float)); + ispc_memset(id, 0, width*height*sizeof(int)); + ispc_memset(image, 0, width*height*sizeof(float)); // // And 3 iterations with the serial implementation, reporting the diff --git a/examples_ptx/rt/rt.cu b/examples_ptx/rt/rt.cu index 8decd03a..b60929bf 100644 --- a/examples_ptx/rt/rt.cu +++ b/examples_ptx/rt/rt.cu @@ -1,8 +1,4 @@ -#define programCount 32 -#define programIndex (threadIdx.x & 31) -#define taskIndex (blockIdx.x*4 + (threadIdx.x >> 5)) -#define taskCount (gridDim.x*4) -#define warpIdx (threadIdx.x >> 5) +#include "cuda_helpers.cuh" #define float3 Float3 struct Float3 @@ -339,7 +335,7 @@ void raytrace_tile_task( int width, int height, } -extern "C" __global__ void raytrace_ispc_tasks( int width, int height, +extern "C" __global__ void raytrace_ispc_tasks___export( int width, int height, int baseWidth, int baseHeight, const float raster2camera[4][4], const float camera2world[4][4], @@ -350,10 +346,28 @@ extern "C" __global__ void raytrace_ispc_tasks( int width, int height, int xBuckets = (width + (dx-1)) / dx; int yBuckets = (height + (dy-1)) / dy; int nTasks = xBuckets * yBuckets; - if (programIndex == 0) - raytrace_tile_task<<<(nTasks+4-1)/4,128>>>(width, height, baseWidth, baseHeight, - raster2camera, camera2world, - image, id, nodes, triangles); + launch(nTasks,1,1,raytrace_tile_task) + (width, height, baseWidth, baseHeight, + raster2camera, camera2world, + image, id, nodes, triangles); cudaDeviceSynchronize(); } + + +extern "C" __host__ void raytrace_ispc_tasks( int width, int height, + int baseWidth, int baseHeight, + const float raster2camera[4][4], + const float camera2world[4][4], + float image[], int id[], + const LinearBVHNode nodes[], + const Triangle triangles[]) { + raytrace_ispc_tasks___export<<<1,32>>>( width, height, + baseWidth, baseHeight, + raster2camera, + camera2world, + image, id, + nodes, + triangles); + cudaDeviceSynchronize(); +} diff --git a/examples_ptx/rt/rt.ispc b/examples_ptx/rt/rt.ispc index dd655085..1db7ee5e 100644 --- a/examples_ptx/rt/rt.ispc +++ b/examples_ptx/rt/rt.ispc @@ -31,13 +31,25 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#define bool int - +typedef bool bool_t; typedef float<3> float3; +#if 1 +#define __ORIG +#endif + +struct int3 +{ + int x,y,z; +}; + struct Ray { float3 origin, dir, invDir; +#ifdef __ORIG uniform unsigned int dirIsNeg[3]; +#else + uniform int3 dirIsNeg; +#endif float mint, maxt; int hitId; }; @@ -101,13 +113,19 @@ static void generateRay(uniform const float raster2camera[4][4], ray.invDir = 1.f / ray.dir; +#ifdef __ORIG ray.dirIsNeg[0] = any(ray.invDir.x < 0) ? 1 : 0; ray.dirIsNeg[1] = any(ray.invDir.y < 0) ? 1 : 0; ray.dirIsNeg[2] = any(ray.invDir.z < 0) ? 1 : 0; +#else + ray.dirIsNeg.x = any(ray.invDir.x < 0) ? 1 : 0; + ray.dirIsNeg.y = any(ray.invDir.y < 0) ? 1 : 0; + ray.dirIsNeg.z = any(ray.invDir.z < 0) ? 1 : 0; +#endif } -static bool BBoxIntersect(const uniform float bounds[2][3], +static bool_t BBoxIntersect(const uniform float bounds[2][3], const Ray &ray) { uniform float3 bounds0 = { bounds[0][0], bounds[0][1], bounds[0][2] }; uniform float3 bounds1 = { bounds[1][0], bounds[1][1], bounds[1][2] }; @@ -146,7 +164,7 @@ static bool BBoxIntersect(const uniform float bounds[2][3], -static bool TriIntersect(const uniform Triangle &tri, Ray &ray) { +static bool_t TriIntersect(const uniform Triangle &tri, Ray &ray) { uniform float3 p0 = { tri.p[0][0], tri.p[0][1], tri.p[0][2] }; uniform float3 p1 = { tri.p[1][0], tri.p[1][1], tri.p[1][2] }; uniform float3 p2 = { tri.p[2][0], tri.p[2][1], tri.p[2][2] }; @@ -155,7 +173,7 @@ static bool TriIntersect(const uniform Triangle &tri, Ray &ray) { float3 s1 = Cross(ray.dir, e2); float divisor = Dot(s1, e1); - bool hit = true; + bool_t hit = true; if (divisor == 0.) hit = false; @@ -186,10 +204,11 @@ static bool TriIntersect(const uniform Triangle &tri, Ray &ray) { } -bool BVHIntersect(const uniform LinearBVHNode nodes[], +bool_t +BVHIntersect(const uniform LinearBVHNode nodes[], const uniform Triangle tris[], Ray &r) { Ray ray = r; - bool hit = false; + bool_t hit = false; // Follow ray through BVH nodes to find primitive intersections uniform int todoOffset = 0, nodeNum = 0; uniform int todo[64]; @@ -212,7 +231,15 @@ bool BVHIntersect(const uniform LinearBVHNode nodes[], } else { // Put far BVH node on _todo_ stack, advance to near node - if (r.dirIsNeg[node.splitAxis]) { +#ifdef __ORIG + int dirIsNeg = r.dirIsNeg[node.splitAxis]; +#else + int dirIsNeg; + if (node.splitAxis == 0) dirIsNeg = r.dirIsNeg.x; + if (node.splitAxis == 1) dirIsNeg = r.dirIsNeg.y; + if (node.splitAxis == 2) dirIsNeg = r.dirIsNeg.z; +#endif + if (dirIsNeg) { todo[todoOffset++] = nodeNum + 1; nodeNum = node.offset; }