diff --git a/examples_ptx/common.mk b/examples_ptx/common.mk index 04a566bb..81b1a770 100644 --- a/examples_ptx/common.mk +++ b/examples_ptx/common.mk @@ -1,7 +1,7 @@ -TASK_CXX=../tasksys.cpp +TASK_CXX=../tasksys.cpp ../ispc_malloc.cpp TASK_LIB=-lpthread -TASK_OBJ=objs/tasksys.o +TASK_OBJ=objs/tasksys.o objs/ispc_malloc.o CXX=clang++ CXXFLAGS+=-Iobjs/ -O2 diff --git a/examples_ptx/common_gpu.mk b/examples_ptx/common_gpu.mk index cd0ef686..5460faee 100644 --- a/examples_ptx/common_gpu.mk +++ b/examples_ptx/common_gpu.mk @@ -33,6 +33,9 @@ CXX_OBJS=$(CXX_SRC:%.cpp=objs_gpu/%_gcc.o) CU_OBJS=$(CU_SRC:%.cu=objs_gpu/%_cu.o) #NVCC_OBJS=$(NVCC_SRC:%.cu=objs_gpu/%_nvcc.o) +CXX_SRC+=../ispc_malloc.cpp +CXX_OJS+=objs/ispc_malloc_gcc.o + # PTXGEN = $(HOME)/ptxgen # PTXGEN += -opt=3 # PTXGEN += -ftz=1 -prec-div=0 -prec-sqrt=0 -fma=1 diff --git a/examples_ptx/deferred/Makefile_cpu b/examples_ptx/deferred/Makefile_cpu index be8ce7c4..b21cc643 100644 --- a/examples_ptx/deferred/Makefile_cpu +++ b/examples_ptx/deferred/Makefile_cpu @@ -2,7 +2,7 @@ EXAMPLE=deferred_shading CPP_SRC=common.cpp main.cpp dynamic_c.cpp dynamic_cilk.cpp ISPC_SRC=kernels.ispc -ISPC_IA_TARGETS=sse2-i32x4,sse4-i32x8,avx1-i32x16,avx2-i32x16 +ISPC_IA_TARGETS=avx1-i32x16 ISPC_ARM_TARGETS=neon ISPC_FLAGS=--opt=fast-math diff --git a/examples_ptx/deferred/main.cpp b/examples_ptx/deferred/main.cpp index d7f62f50..cc5ce300 100644 --- a/examples_ptx/deferred/main.cpp +++ b/examples_ptx/deferred/main.cpp @@ -41,15 +41,15 @@ #endif #include -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include #include #include #include -#include +#include #include #ifdef ISPC_IS_WINDOWS #define WIN32_LEAN_AND_MEAN @@ -58,6 +58,7 @@ #include "deferred.h" #include "kernels_ispc.h" #include "../timing.h" +#include "../ispc_malloc.h" /////////////////////////////////////////////////////////////////////////// @@ -96,11 +97,11 @@ int main(int argc, char** argv) { ispc::RenderStatic(input->header, input->arrays, VISUALIZE_LIGHT_COUNT, framebuffer.r, framebuffer.g, framebuffer.b); - double mcycles = get_elapsed_mcycles() / nframes; - printf("@time of ISPC + TASKS run:\t\t\t[%.3f] million cycles\n", mcycles); - ispcCycles = std::min(ispcCycles, mcycles); + double msec = get_elapsed_msec() / nframes; + printf("@time of ISPC + TASKS run:\t\t\t[%.3f] msec\n", msec); + ispcCycles = std::min(ispcCycles, msec); } - printf("[ispc static + tasks]:\t\t[%.3f] million cycles to render " + printf("[ispc static + tasks]:\t\t[%.3f] msec to render " "%d x %d image\n", ispcCycles, input->header.framebufferWidth, input->header.framebufferHeight); WriteFrame("deferred-ispc-static.ppm", input, framebuffer); @@ -113,11 +114,11 @@ int main(int argc, char** argv) { reset_and_start_timer(); for (int j = 0; j < nframes; ++j) DispatchDynamicCilk(input, &framebuffer); - double mcycles = get_elapsed_mcycles() / nframes; - printf("@time of serial run:\t\t\t[%.3f] million cycles\n", mcycles); - dynamicCilkCycles = std::min(dynamicCilkCycles, mcycles); + double msec = get_elapsed_msec() / nframes; + printf("@time of serial run:\t\t\t[%.3f] msec\n", msec); + dynamicCilkCycles = std::min(dynamicCilkCycles, msec); } - printf("[ispc + Cilk dynamic]:\t\t[%.3f] million cycles to render image\n", + printf("[ispc + Cilk dynamic]:\t\t[%.3f] msec to render image\n", dynamicCilkCycles); WriteFrame("deferred-ispc-dynamic.ppm", input, framebuffer); #endif // __cilk @@ -128,11 +129,11 @@ int main(int argc, char** argv) { reset_and_start_timer(); for (int j = 0; j < nframes; ++j) DispatchDynamicC(input, &framebuffer); - double mcycles = get_elapsed_mcycles() / nframes; - printf("@time of serial run:\t\t\t[%.3f] million cycles\n", mcycles); - serialCycles = std::min(serialCycles, mcycles); + double msec = get_elapsed_msec() / nframes; + printf("@time of serial run:\t\t\t[%.3f] msec\n", msec); + serialCycles = std::min(serialCycles, msec); } - printf("[C++ serial dynamic, 1 core]:\t[%.3f] million cycles to render image\n", + printf("[C++ serial dynamic, 1 core]:\t[%.3f] msec to render image\n", serialCycles); WriteFrame("deferred-serial-dynamic.ppm", input, framebuffer); diff --git a/examples_ptx/deferred/main_cu.cpp b/examples_ptx/deferred/main_cu.cpp deleted file mode 100644 index 4f2be879..00000000 --- a/examples_ptx/deferred/main_cu.cpp +++ /dev/null @@ -1,139 +0,0 @@ -/* - Copyright (c) 2011, Intel Corporation - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of Intel Corporation nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS - IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER - OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*/ - -#ifdef _MSC_VER -#define ISPC_IS_WINDOWS -#define NOMINMAX -#elif defined(__linux__) -#define ISPC_IS_LINUX -#elif defined(__APPLE__) -#define ISPC_IS_APPLE -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#ifdef ISPC_IS_WINDOWS - #define WIN32_LEAN_AND_MEAN - #include -#endif -#include "deferred.h" -#include "kernels_ispc.h" -#include "../timing.h" - -/////////////////////////////////////////////////////////////////////////// - -int main(int argc, char** argv) { - if (argc != 2) { - printf("usage: deferred_shading \n"); - return 1; - } - - InputData *input = CreateInputDataFromFile(argv[1]); - if (!input) { - printf("Failed to load input file \"%s\"!\n", argv[1]); - return 1; - } - - Framebuffer framebuffer(input->header.framebufferWidth, - input->header.framebufferHeight); - - InitDynamicC(input); -#ifdef __cilk - InitDynamicCilk(input); -#endif // __cilk - - int nframes = 5; - double ispcCycles = 1e30; - for (int i = 0; i < 5; ++i) { - framebuffer.clear(); - reset_and_start_timer(); - for (int j = 0; j < nframes; ++j) - ispc::RenderStatic(input->header, input->arrays, - VISUALIZE_LIGHT_COUNT, - framebuffer.r, framebuffer.g, framebuffer.b); - double mcycles = get_elapsed_mcycles() / nframes; - ispcCycles = std::min(ispcCycles, mcycles); - } - printf("[ispc static + tasks]:\t\t[%.3f] million cycles to render " - "%d x %d image\n", ispcCycles, - input->header.framebufferWidth, input->header.framebufferHeight); - WriteFrame("deferred-ispc-static.ppm", input, framebuffer); - -#ifdef __cilk - double dynamicCilkCycles = 1e30; - for (int i = 0; i < 5; ++i) { - framebuffer.clear(); - reset_and_start_timer(); - for (int j = 0; j < nframes; ++j) - DispatchDynamicCilk(input, &framebuffer); - double mcycles = get_elapsed_mcycles() / nframes; - dynamicCilkCycles = std::min(dynamicCilkCycles, mcycles); - } - printf("[ispc + Cilk dynamic]:\t\t[%.3f] million cycles to render image\n", - dynamicCilkCycles); - WriteFrame("deferred-ispc-dynamic.ppm", input, framebuffer); -#endif // __cilk - - double serialCycles = 1e30; - for (int i = 0; i < 5; ++i) { - framebuffer.clear(); - reset_and_start_timer(); - for (int j = 0; j < nframes; ++j) - DispatchDynamicC(input, &framebuffer); - double mcycles = get_elapsed_mcycles() / nframes; - serialCycles = std::min(serialCycles, mcycles); - } - printf("[C++ serial dynamic, 1 core]:\t[%.3f] million cycles to render image\n", - serialCycles); - WriteFrame("deferred-serial-dynamic.ppm", input, framebuffer); - -#ifdef __cilk - printf("\t\t\t\t(%.2fx speedup from static ISPC, %.2fx from Cilk+ISPC)\n", - serialCycles/ispcCycles, serialCycles/dynamicCilkCycles); -#else - printf("\t\t\t\t(%.2fx speedup from ISPC + tasks)\n", serialCycles/ispcCycles); -#endif // __cilk - - DeleteInputData(input); - - return 0; -} diff --git a/examples_ptx/ispc_malloc.cpp b/examples_ptx/ispc_malloc.cpp new file mode 100644 index 00000000..4b83751f --- /dev/null +++ b/examples_ptx/ispc_malloc.cpp @@ -0,0 +1,35 @@ +#include +#include +#include +#include +#include "ispc_malloc.h" + +#ifdef _CUDA_ + +void * operator new(size_t size) throw(std::bad_alloc) +{ + void *ptr; + ispc_malloc(&ptr, size); + return ptr; +} +void operator delete(void *ptr) throw() +{ + ispc_free(ptr); +} + +#else + +void ispc_malloc(void **ptr, const size_t size) +{ + *ptr = malloc(size); +} +void ispc_free(void *ptr) +{ + free(ptr); +} +void ispc_memset(void *ptr, int value, size_t size) +{ + memset(ptr, value, size); +} + +#endif diff --git a/examples_ptx/ispc_malloc.h b/examples_ptx/ispc_malloc.h index ffecb691..c42ac56b 100644 --- a/examples_ptx/ispc_malloc.h +++ b/examples_ptx/ispc_malloc.h @@ -1,22 +1,5 @@ #pragma once -#ifdef _CUDA_ extern void ispc_malloc(void **ptr, const size_t size); extern void ispc_free(void *ptr); extern void ispc_memset(void *ptr, int value, size_t size); -#else -#include -static inline void ispc_malloc(void **ptr, const size_t size) -{ - *ptr = malloc(size); -} -static inline void ispc_free(void *ptr) -{ - free(ptr); -} -static inline void ispc_memset(void *ptr, int value, size_t size) -{ - memset(ptr, value, size); -} - -#endif diff --git a/examples_ptx/mandelbrot_tasks/mandelbrot_tasks.cpp b/examples_ptx/mandelbrot_tasks/mandelbrot_tasks.cpp index f40950c1..3ff6303b 100644 --- a/examples_ptx/mandelbrot_tasks/mandelbrot_tasks.cpp +++ b/examples_ptx/mandelbrot_tasks/mandelbrot_tasks.cpp @@ -103,8 +103,7 @@ int main(int argc, char *argv[]) { int maxIterations = 512; - int *buf; - ispc_malloc((void**)&buf, sizeof(int)*width*height); + int *buf = new int[width*height]; for (unsigned int i = 0; i < width * height; ++i) buf[i] = 0; diff --git a/examples_ptx/nvcc_helpers.cu b/examples_ptx/nvcc_helpers.cu index 783f8bda..3def93a1 100644 --- a/examples_ptx/nvcc_helpers.cu +++ b/examples_ptx/nvcc_helpers.cu @@ -1,5 +1,3 @@ -#include "ispc_malloc.h" - #ifndef _CUDA_ #error "Something went wrong..." #endif diff --git a/examples_ptx/rt/rt.cpp b/examples_ptx/rt/rt.cpp index f269ca3b..220216c4 100644 --- a/examples_ptx/rt/rt.cpp +++ b/examples_ptx/rt/rt.cpp @@ -45,8 +45,8 @@ #include #include #include "../timing.h" -#include "../ispc_malloc.h" #include "rt_ispc.h" +#include "../ispc_malloc.h" using namespace ispc; @@ -141,15 +141,11 @@ int main(int argc, char *argv[]) { // fread in the bits // int baseWidth, baseHeight; -#if 0 - float camera2world[4][4], raster2camera[4][4]; -#else - float *camera2world_ispc, *raster2camera_ispc; - ispc_malloc((void**)&camera2world_ispc, 4*4*sizeof(float)); - ispc_malloc((void**)&raster2camera_ispc, 4*4*sizeof(float)); +// float camera2world[4][4], raster2camera[4][4]; + float *camera2world_ispc = new float[4*4]; + float *raster2camera_ispc = new float[4*4]; float (*camera2world )[4] = (float (*)[4])camera2world_ispc; float (*raster2camera)[4] = (float (*)[4])raster2camera_ispc; -#endif READ(baseWidth, 1); READ(baseHeight, 1); READ(camera2world[0][0], 16); @@ -170,12 +166,7 @@ int main(int argc, char *argv[]) { uint nNodes; READ(nNodes, 1); -#if 0 LinearBVHNode *nodes = new LinearBVHNode[nNodes]; -#else - LinearBVHNode *nodes; - ispc_malloc((void**)&nodes, nNodes*sizeof(LinearBVHNode)); -#endif for (unsigned int i = 0; i < nNodes; ++i) { // Each node is 6x floats for a boox, then an integer for an offset // to the second child node, then an integer that encodes the type @@ -197,12 +188,7 @@ int main(int argc, char *argv[]) { // And then read the triangles uint nTris; READ(nTris, 1); -#if 0 Triangle *triangles = new Triangle[nTris]; -#else - Triangle *triangles; - ispc_malloc((void**)&triangles, nTris*sizeof(Triangle)); -#endif for (uint i = 0; i < nTris; ++i) { // 9x floats for the 3 vertices float v[9]; @@ -223,15 +209,9 @@ int main(int argc, char *argv[]) { // allocate images; one to hold hit object ids, one to hold depth to // the first interseciton -#if 0 int *id = new int[width*height]; float *image = new float[width*height]; -#else - int *id; - float *image; - ispc_malloc((void**)&id, sizeof( int)*width*height); - ispc_malloc((void**)&image, sizeof(float)*width*height); -#endif + // // Run 3 iterations with ispc + 1 core, record the minimum time // diff --git a/examples_ptx/rt/rt1.ispc b/examples_ptx/rt/rt1.ispc deleted file mode 100644 index 4461bdb9..00000000 --- a/examples_ptx/rt/rt1.ispc +++ /dev/null @@ -1,334 +0,0 @@ -/* - Copyright (c) 2010-2011, Intel Corporation - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of Intel Corporation nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS - IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER - OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*/ - -#define bool int - -typedef float<3> float3; - -struct Ray { - float3 origin, dir, invDir; - //uniform unsigned int dirIsNeg[3]; - unsigned int dirIsNeg0, dirIsNeg1, dirIsNeg2; - float mint, maxt; - int hitId; -}; - -struct Triangle { - float p[3][4]; - int id; - int pad[3]; -}; - -struct LinearBVHNode { - float bounds[2][3]; - unsigned int offset; // num primitives for leaf, second child for interior - unsigned int8 nPrimitives; - unsigned int8 splitAxis; - unsigned int16 pad; -}; - -static inline float3 Cross(const float3 v1, const float3 v2) { - float v1x = v1.x, v1y = v1.y, v1z = v1.z; - float v2x = v2.x, v2y = v2.y, v2z = v2.z; - float3 ret; - ret.x = (v1y * v2z) - (v1z * v2y); - ret.y = (v1z * v2x) - (v1x * v2z); - ret.z = (v1x * v2y) - (v1y * v2x); - return ret; -} - -static inline float Dot(const float3 a, const float3 b) { - return a.x * b.x + a.y * b.y + a.z * b.z; -} - -inline -static void generateRay(uniform const float raster2camera[4][4], - uniform const float camera2world[4][4], - float x, float y, Ray &ray) { - ray.mint = 0.f; - ray.maxt = 1e30f; - - ray.hitId = 0; - - // transform raster coordinate (x, y, 0) to camera space - float camx = raster2camera[0][0] * x + raster2camera[0][1] * y + raster2camera[0][3]; - float camy = raster2camera[1][0] * x + raster2camera[1][1] * y + raster2camera[1][3]; - float camz = raster2camera[2][3]; - float camw = raster2camera[3][3]; - camx /= camw; - camy /= camw; - camz /= camw; - - ray.dir.x = camera2world[0][0] * camx + camera2world[0][1] * camy + - camera2world[0][2] * camz; - ray.dir.y = camera2world[1][0] * camx + camera2world[1][1] * camy + - camera2world[1][2] * camz; - ray.dir.z = camera2world[2][0] * camx + camera2world[2][1] * camy + - camera2world[2][2] * camz; - - ray.origin.x = camera2world[0][3] / camera2world[3][3]; - ray.origin.y = camera2world[1][3] / camera2world[3][3]; - ray.origin.z = camera2world[2][3] / camera2world[3][3]; - - ray.invDir = 1.f / ray.dir; - -#if 0 - ray.dirIsNeg[0] = any(ray.invDir.x < 0) ? 1 : 0; - ray.dirIsNeg[1] = any(ray.invDir.y < 0) ? 1 : 0; - ray.dirIsNeg[2] = any(ray.invDir.z < 0) ? 1 : 0; -#else - ray.dirIsNeg0 = any(ray.invDir.x < 0) ? 1 : 0; - ray.dirIsNeg1 = any(ray.invDir.y < 0) ? 1 : 0; - ray.dirIsNeg2 = any(ray.invDir.z < 0) ? 1 : 0; -#endif -} - -inline -static bool BBoxIntersect(const uniform float bounds[2][3], - const Ray &ray) { - uniform float3 bounds0 = { bounds[0][0], bounds[0][1], bounds[0][2] }; - uniform float3 bounds1 = { bounds[1][0], bounds[1][1], bounds[1][2] }; - float t0 = ray.mint, t1 = ray.maxt; - - // Check all three axis-aligned slabs. Don't try to early out; it's - // not worth the trouble - float3 tNear = (bounds0 - ray.origin) * ray.invDir; - float3 tFar = (bounds1 - ray.origin) * ray.invDir; - if (tNear.x > tFar.x) { - float tmp = tNear.x; - tNear.x = tFar.x; - tFar.x = tmp; - } - t0 = max(tNear.x, t0); - t1 = min(tFar.x, t1); - - if (tNear.y > tFar.y) { - float tmp = tNear.y; - tNear.y = tFar.y; - tFar.y = tmp; - } - t0 = max(tNear.y, t0); - t1 = min(tFar.y, t1); - - if (tNear.z > tFar.z) { - float tmp = tNear.z; - tNear.z = tFar.z; - tFar.z = tmp; - } - t0 = max(tNear.z, t0); - t1 = min(tFar.z, t1); - - return (t0 <= t1); -} - - -inline -static bool TriIntersect(const uniform Triangle &tri, Ray &ray) { - uniform float3 p0 = { tri.p[0][0], tri.p[0][1], tri.p[0][2] }; - uniform float3 p1 = { tri.p[1][0], tri.p[1][1], tri.p[1][2] }; - uniform float3 p2 = { tri.p[2][0], tri.p[2][1], tri.p[2][2] }; - uniform float3 e1 = p1 - p0; - uniform float3 e2 = p2 - p0; - - float3 s1 = Cross(ray.dir, e2); - float divisor = Dot(s1, e1); - bool hit = true; - - if (divisor == 0.) - hit = false; - float invDivisor = 1.f / divisor; - - // Compute first barycentric coordinate - float3 d = ray.origin - p0; - float b1 = Dot(d, s1) * invDivisor; - if (b1 < 0. || b1 > 1.) - hit = false; - - // Compute second barycentric coordinate - float3 s2 = Cross(d, e1); - float b2 = Dot(ray.dir, s2) * invDivisor; - if (b2 < 0. || b1 + b2 > 1.) - hit = false; - - // Compute _t_ to intersection point - float t = Dot(e2, s2) * invDivisor; - if (t < ray.mint || t > ray.maxt) - hit = false; - - if (hit) { - ray.maxt = t; - ray.hitId = tri.id; - } - return hit; -} - -inline -bool BVHIntersect(const uniform LinearBVHNode nodes[], - const uniform Triangle tris[], Ray &r, - uniform int todo[]) { - Ray ray = r; - bool hit = false; - // Follow ray through BVH nodes to find primitive intersections - uniform int todoOffset = 0, nodeNum = 0; - - while (true) { - // Check ray against BVH node - uniform LinearBVHNode node = nodes[nodeNum]; - if (any(BBoxIntersect(node.bounds, ray))) { - uniform unsigned int nPrimitives = node.nPrimitives; - if (nPrimitives > 0) { - // Intersect ray with primitives in leaf BVH node - uniform unsigned int primitivesOffset = node.offset; - for (uniform unsigned int i = 0; i < nPrimitives; ++i) { - if (TriIntersect(tris[primitivesOffset+i], ray)) - hit = true; - } - if (todoOffset == 0) - break; - nodeNum = todo[--todoOffset]; - } - else { - // Put far BVH node on _todo_ stack, advance to near node - int dirIsNeg; - if (node.splitAxis == 0) dirIsNeg = r.dirIsNeg0; - if (node.splitAxis == 1) dirIsNeg = r.dirIsNeg1; - if (node.splitAxis == 2) dirIsNeg = r.dirIsNeg2; - if (dirIsNeg) { - todo[todoOffset++] = nodeNum + 1; - nodeNum = node.offset; - } - else { - todo[todoOffset++] = node.offset; - nodeNum = nodeNum + 1; - } - } - } - else { - if (todoOffset == 0) - break; - nodeNum = todo[--todoOffset]; - } - } - r.maxt = ray.maxt; - r.hitId = ray.hitId; - - return hit; -} - -inline -static void raytrace_tile(uniform int x0, uniform int x1, - uniform int y0, uniform int y1, - uniform int width, uniform int height, - uniform int baseWidth, uniform int baseHeight, - const uniform float raster2camera[4][4], - const uniform float camera2world[4][4], - uniform float image[], uniform int id[], - const uniform LinearBVHNode nodes[], - const uniform Triangle triangles[]) { - uniform float widthScale = (float)(baseWidth) / (float)(width); - uniform float heightScale = (float)(baseHeight) / (float)(height); - -#if 0 - uniform int * uniform todo = uniform new uniform int[64]; -#define ALLOC -#else - uniform int todo[64]; -#endif - - foreach_tiled (y = y0 ... y1, x = x0 ... x1) { - Ray ray; - generateRay(raster2camera, camera2world, x*widthScale, - y*heightScale, ray); - BVHIntersect(nodes, triangles, ray, todo); - - int offset = y * width + x; - image[offset] = ray.maxt; - id[offset] = ray.hitId; - } - -#ifdef ALLOC - delete todo; -#endif -} - - -export void raytrace_ispc(uniform int width, uniform int height, - uniform int baseWidth, uniform int baseHeight, - const uniform float raster2camera[4][4], - const uniform float camera2world[4][4], - uniform float image[], uniform int id[], - const uniform LinearBVHNode nodes[], - const uniform Triangle triangles[]) { - raytrace_tile(0, width, 0, height, width, height, baseWidth, baseHeight, - raster2camera, camera2world, image, - id, nodes, triangles); -} - - -task void raytrace_tile_task(uniform int width, uniform int height, - uniform int baseWidth, uniform int baseHeight, - const uniform float raster2camera[4][4], - const uniform float camera2world[4][4], - uniform float image[], uniform int id[], - const uniform LinearBVHNode nodes[], - const uniform Triangle triangles[]) { - uniform int dx = 64, dy = 8; // must match dx, dy below - uniform int xBuckets = (width + (dx-1)) / dx; - uniform int x0 = (taskIndex % xBuckets) * dx; - uniform int x1 = min(x0 + dx, width); - uniform int y0 = (taskIndex / xBuckets) * dy; - uniform int y1 = min(y0 + dy, height); - - raytrace_tile(x0, x1, y0, y1, width, height, baseWidth, baseHeight, - raster2camera, camera2world, image, - id, nodes, triangles); -} - - -export void raytrace_ispc_tasks(uniform int width, uniform int height, - uniform int baseWidth, uniform int baseHeight, - const uniform float raster2camera[4][4], - const uniform float camera2world[4][4], - uniform float image[], uniform int id[], - const uniform LinearBVHNode nodes[], - const uniform Triangle triangles[]) { - uniform int dx = 64, dy = 8; - uniform int xBuckets = (width + (dx-1)) / dx; - uniform int yBuckets = (height + (dy-1)) / dy; - uniform int nTasks = xBuckets * yBuckets; - launch[nTasks] raytrace_tile_task(width, height, baseWidth, baseHeight, - raster2camera, camera2world, - image, id, nodes, triangles); - sync; -} -