diff --git a/examples/aobench/Makefile b/examples/aobench/Makefile index c8122c07..28f0f051 100644 --- a/examples/aobench/Makefile +++ b/examples/aobench/Makefile @@ -2,7 +2,7 @@ EXAMPLE=ao CPP_SRC=ao.cpp ao_serial.cpp ISPC_SRC=ao.ispc -ISPC_IA_TARGETS=sse2-i32x4,sse4-i32x4,avx1-i32x8,avx2-i32x8 +ISPC_IA_TARGETS=avx1-i32x8 ISPC_ARM_TARGETS=neon include ../common.mk diff --git a/examples_ptx/aobench/ao.cpp b/examples_ptx/aobench/ao.cpp index b951113c..629c252e 100644 --- a/examples_ptx/aobench/ao.cpp +++ b/examples_ptx/aobench/ao.cpp @@ -137,8 +137,8 @@ int main(int argc, char **argv) // #ifndef _CUDA_ double minTimeISPC = 1e30; - memset((void *)fimg, 0, sizeof(float) * width * height * 3); for (unsigned int i = 0; i < test_iterations[0]; i++) { + ispc_memset((void *)fimg, 0, sizeof(float) * width * height * 3); assert(NSUBSAMPLES == 2); reset_and_start_timer(); ispc::ao_ispc(width, height, NSUBSAMPLES, fimg); @@ -158,8 +158,8 @@ int main(int argc, char **argv) // minimum time for any of them. // double minTimeISPCTasks = 1e30; - memset((void *)fimg, 0, sizeof(float) * width * height * 3); for (unsigned int i = 0; i < test_iterations[1]; i++) { + ispc_memset((void *)fimg, 0, sizeof(float) * width * height * 3); assert(NSUBSAMPLES == 2); reset_and_start_timer(); @@ -179,8 +179,8 @@ int main(int argc, char **argv) // minimum time. // double minTimeSerial = 1e30; - memset((void *)fimg, 0, sizeof(float) * width * height * 3); for (unsigned int i = 0; i < test_iterations[2]; i++) { + ispc_memset((void *)fimg, 0, sizeof(float) * width * height * 3); reset_and_start_timer(); ao_serial(width, height, NSUBSAMPLES, fimg); double t = get_elapsed_msec(); @@ -199,6 +199,9 @@ int main(int argc, char **argv) minTimeSerial / minTimeISPCTasks); #endif savePPM("ao-serial.ppm", width, height); + + ispc_free(img); + ispc_free(fimg); return 0; } diff --git a/examples_ptx/aobench/ao.ispc b/examples_ptx/aobench/ao.ispc index ec234eaf..a9d8ec31 100644 --- a/examples_ptx/aobench/ao.ispc +++ b/examples_ptx/aobench/ao.ispc @@ -259,7 +259,7 @@ export void ao_ispc(uniform int w, uniform int h, uniform int nsubsamples, ao_scanlines(0, h, w, h, nsubsamples, image); } - +#if 0 static void task ao_task(uniform int width, uniform int height, uniform int nsubsamples, uniform float image[]) { ao_scanlines(taskIndex, taskIndex+1, width, height, nsubsamples, image); @@ -270,3 +270,95 @@ export void ao_ispc_tasks(uniform int w, uniform int h, uniform int nsubsamples, uniform float image[]) { launch[h] ao_task(w, h, nsubsamples, image); } +#else + +static inline void ao_tile( + uniform int x0, uniform int x1, + uniform int y0, uniform int y1, + uniform int w, uniform int h, + uniform int nsubsamples, + uniform float image[]) +{ + uniform Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } }; + uniform Sphere spheres[3] = { + { { -2.0f, 0.0f, -3.5f }, 0.5f }, + { { -0.5f, 0.0f, -3.0f }, 0.5f }, + { { 1.0f, 0.0f, -2.2f }, 0.5f } }; + RNGState rngstate; + + seed_rng(&rngstate, programIndex + (y0 << (programIndex & 15))); + float invSamples = 1.f / nsubsamples; + foreach_tiled (y = y0 ... y1, x = x0 ... x1) + { + const int offset = 3 * (y * w + x); + float res = 0.0f; + + for (uniform int u = 0; u < nsubsamples; u++) + for (uniform int v = 0; v < nsubsamples; v++) + { + float du = (float)u * invSamples, dv = (float)v * invSamples; + + // Figure out x,y pixel in NDC + float px = (x + du - (w / 2.0f)) / (w / 2.0f); + float py = -(y + dv - (h / 2.0f)) / (h / 2.0f); + float ret = 0.f; + Ray ray; + Isect isect; + + ray.org = 0.f; + + // Poor man's perspective projection + ray.dir.x = px; + ray.dir.y = py; + ray.dir.z = -1.0; + vnormalize(ray.dir); + + isect.t = 1.0e+17; + isect.hit = 0; + + for (uniform int snum = 0; snum < 3; ++snum) + ray_sphere_intersect(isect, ray, spheres[snum]); + ray_plane_intersect(isect, ray, plane); + + // Note use of 'coherent' if statement; the set of rays we + // trace will often all hit or all miss the scene + if (isect.hit) { + ret = ambient_occlusion(isect, plane, spheres, rngstate); + ret *= invSamples * invSamples; + res += ret; + } + } + + image[offset ] = res; + image[offset+1] = res; + image[offset+2] = res; + } +} + +#define TILEX 64 +#define TILEY 4 + +void task ao_task(uniform int width, uniform int height, + uniform int nsubsamples, uniform float image[]) +{ + if (taskIndex0 >= taskCount0) return; + if (taskIndex1 >= taskCount1) return; + + const uniform int x0 = taskIndex0 * TILEX; + const uniform int x1 = min(x0 + TILEX, width); + + const uniform int y0 = taskIndex1 * TILEY; + const uniform int y1 = min(y0 + TILEY, height); + ao_tile(x0,x1,y0,y1, width, height, nsubsamples, image); +} + + +export void ao_ispc_tasks(uniform int w, uniform int h, uniform int nsubsamples, + uniform float image[]) +{ + const uniform int ntilex = (w+TILEX-1)/TILEX; + const uniform int ntiley = (h+TILEY-1)/TILEY; + launch[ntilex,ntiley] ao_task(w, h, nsubsamples, image); + sync; +} +#endif diff --git a/examples_ptx/ispc_malloc.h b/examples_ptx/ispc_malloc.h index 6eb206db..ffecb691 100644 --- a/examples_ptx/ispc_malloc.h +++ b/examples_ptx/ispc_malloc.h @@ -3,6 +3,7 @@ #ifdef _CUDA_ extern void ispc_malloc(void **ptr, const size_t size); extern void ispc_free(void *ptr); +extern void ispc_memset(void *ptr, int value, size_t size); #else #include static inline void ispc_malloc(void **ptr, const size_t size) @@ -13,5 +14,9 @@ static inline void ispc_free(void *ptr) { free(ptr); } +static inline void ispc_memset(void *ptr, int value, size_t size) +{ + memset(ptr, value, size); +} #endif diff --git a/examples_ptx/nvcc_helpers.cu b/examples_ptx/nvcc_helpers.cu index 0e6fe8f4..783f8bda 100644 --- a/examples_ptx/nvcc_helpers.cu +++ b/examples_ptx/nvcc_helpers.cu @@ -12,5 +12,9 @@ void ispc_free(void *ptr) { cudaFree(ptr); } +void ispc_memset(void *ptr, int value, size_t size) +{ + cudaMemset(ptr, value, size); +}