From 89169d55060ae293ef2d6950d489c46dd9736c83 Mon Sep 17 00:00:00 2001 From: Evghenii Date: Sun, 5 Jan 2014 10:12:45 +0100 Subject: [PATCH] +1 --- examples_ptx/aobench/.gitignore | 2 + examples_ptx/aobench/Makefile_cpu | 8 + examples_ptx/aobench/Makefile_gpu | 9 + examples_ptx/aobench/ao.cpp | 204 +++++++++++++++++ examples_ptx/aobench/ao.ispc | 272 +++++++++++++++++++++++ examples_ptx/aobench/ao_serial.cpp | 314 +++++++++++++++++++++++++++ examples_ptx/aobench/aobench.vcxproj | 34 +++ examples_ptx/common_gpu.mk | 6 +- 8 files changed, 848 insertions(+), 1 deletion(-) create mode 100644 examples_ptx/aobench/.gitignore create mode 100644 examples_ptx/aobench/Makefile_cpu create mode 100644 examples_ptx/aobench/Makefile_gpu create mode 100644 examples_ptx/aobench/ao.cpp create mode 100644 examples_ptx/aobench/ao.ispc create mode 100644 examples_ptx/aobench/ao_serial.cpp create mode 100644 examples_ptx/aobench/aobench.vcxproj diff --git a/examples_ptx/aobench/.gitignore b/examples_ptx/aobench/.gitignore new file mode 100644 index 00000000..ad080f43 --- /dev/null +++ b/examples_ptx/aobench/.gitignore @@ -0,0 +1,2 @@ +ao +*.ppm diff --git a/examples_ptx/aobench/Makefile_cpu b/examples_ptx/aobench/Makefile_cpu new file mode 100644 index 00000000..28f0f051 --- /dev/null +++ b/examples_ptx/aobench/Makefile_cpu @@ -0,0 +1,8 @@ + +EXAMPLE=ao +CPP_SRC=ao.cpp ao_serial.cpp +ISPC_SRC=ao.ispc +ISPC_IA_TARGETS=avx1-i32x8 +ISPC_ARM_TARGETS=neon + +include ../common.mk diff --git a/examples_ptx/aobench/Makefile_gpu b/examples_ptx/aobench/Makefile_gpu new file mode 100644 index 00000000..fa58d307 --- /dev/null +++ b/examples_ptx/aobench/Makefile_gpu @@ -0,0 +1,9 @@ +PROG=ao_gpu +ISPC_SRC=ao.ispc +CXX_SRC=ao.cpp ao_serial.cpp +PTXCC_REGMAX=64 + +include ../common_gpu.mk + + + diff --git a/examples_ptx/aobench/ao.cpp b/examples_ptx/aobench/ao.cpp new file mode 100644 index 00000000..b951113c --- /dev/null +++ b/examples_ptx/aobench/ao.cpp @@ -0,0 +1,204 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifdef _MSC_VER +#define _CRT_SECURE_NO_WARNINGS +#define NOMINMAX +#pragma warning (disable: 4244) +#pragma warning (disable: 4305) +#endif + +#include +#include +#include +#include +#ifdef __linux__ +#include +#endif +#include +#include +#include +#include +#include + +#include "ao_ispc.h" + +#include "../timing.h" +#include "../ispc_malloc.h" + +#define NSUBSAMPLES 2 + +extern void ao_serial(int w, int h, int nsubsamples, float image[]); + +static unsigned int test_iterations[] = {3, 7, 1}; +static unsigned int width, height; +static unsigned char *img; +static float *fimg; + + +static unsigned char +clamp(float f) +{ + int i = (int)(f * 255.5); + + if (i < 0) i = 0; + if (i > 255) i = 255; + + return (unsigned char)i; +} + + +static void +savePPM(const char *fname, int w, int h) +{ + for (int y = 0; y < h; y++) { + for (int x = 0; x < w; x++) { + img[3 * (y * w + x) + 0] = clamp(fimg[3 *(y * w + x) + 0]); + img[3 * (y * w + x) + 1] = clamp(fimg[3 *(y * w + x) + 1]); + img[3 * (y * w + x) + 2] = clamp(fimg[3 *(y * w + x) + 2]); + } + } + + FILE *fp = fopen(fname, "wb"); + if (!fp) { + perror(fname); + exit(1); + } + + fprintf(fp, "P6\n"); + fprintf(fp, "%d %d\n", w, h); + fprintf(fp, "255\n"); + fwrite(img, w * h * 3, 1, fp); + fclose(fp); + printf("Wrote image file %s\n", fname); +} + + +int main(int argc, char **argv) +{ + if (argc < 3) { + printf ("%s\n", argv[0]); + printf ("Usage: ao [width] [height] [ispc iterations] [tasks iterations] [serial iterations]\n"); + getchar(); + exit(-1); + } + else { + if (argc == 6) { + for (int i = 0; i < 3; i++) { + test_iterations[i] = atoi(argv[3 + i]); + } + } + width = atoi (argv[1]); + height = atoi (argv[2]); + } + + // Allocate space for output images +#if 0 + img = new unsigned char[width * height * 3]; + fimg = new float[width * height * 3]; +#else + ispc_malloc((void**) &img, sizeof(unsigned char)*width*height*3); + ispc_malloc((void**)&fimg, sizeof( float)*width*height*3); +#endif + + // + // Run the ispc path, test_iterations times, and report the minimum + // time for any of them. + // +#ifndef _CUDA_ + double minTimeISPC = 1e30; + memset((void *)fimg, 0, sizeof(float) * width * height * 3); + for (unsigned int i = 0; i < test_iterations[0]; i++) { + assert(NSUBSAMPLES == 2); + reset_and_start_timer(); + ispc::ao_ispc(width, height, NSUBSAMPLES, fimg); + double t = get_elapsed_msec(); + printf("@time of ISPC run:\t\t\t[%.3f] msec\n", t); + minTimeISPC = std::min(minTimeISPC, t); + } + + // Report results and save image + printf("[aobench ispc]:\t\t\t[%.3f] msec (%d x %d image)\n", + minTimeISPC, width, height); + savePPM("ao-ispc.ppm", width, height); +#endif + + // + // Run the ispc + tasks path, test_iterations times, and report the + // minimum time for any of them. + // + double minTimeISPCTasks = 1e30; + memset((void *)fimg, 0, sizeof(float) * width * height * 3); + for (unsigned int i = 0; i < test_iterations[1]; i++) { + assert(NSUBSAMPLES == 2); + + reset_and_start_timer(); + ispc::ao_ispc_tasks(width, height, NSUBSAMPLES, fimg); + double t = get_elapsed_msec(); + printf("@time of ISPC + TASKS run:\t\t\t[%.3f] msec\n", t); + minTimeISPCTasks = std::min(minTimeISPCTasks, t); + } + + // Report results and save image + printf("[aobench ispc + tasks]:\t\t[%.3f] msec (%d x %d image)\n", + minTimeISPCTasks, width, height); + savePPM("ao-ispc-tasks.ppm", width, height); + + // + // Run the serial path, again test_iteration times, and report the + // minimum time. + // + double minTimeSerial = 1e30; + memset((void *)fimg, 0, sizeof(float) * width * height * 3); + for (unsigned int i = 0; i < test_iterations[2]; i++) { + reset_and_start_timer(); + ao_serial(width, height, NSUBSAMPLES, fimg); + double t = get_elapsed_msec(); + printf("@time of serial run:\t\t\t\t[%.3f] msec\n", t); + minTimeSerial = std::min(minTimeSerial, t); + } + + // Report more results, save another image... + printf("[aobench serial]:\t\t[%.3f] msec (%d x %d image)\n", minTimeSerial, + width, height); +#ifndef _CUDA_ + printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n", + minTimeSerial / minTimeISPC, minTimeSerial / minTimeISPCTasks); +#else + printf("\t\t\t\t(%.2fx speedup from ISPC + tasks)\n", + minTimeSerial / minTimeISPCTasks); +#endif + savePPM("ao-serial.ppm", width, height); + + return 0; +} diff --git a/examples_ptx/aobench/ao.ispc b/examples_ptx/aobench/ao.ispc new file mode 100644 index 00000000..ec234eaf --- /dev/null +++ b/examples_ptx/aobench/ao.ispc @@ -0,0 +1,272 @@ +// -*- mode: c++ -*- +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +/* + Based on Syoyo Fujita's aobench: http://code.google.com/p/aobench +*/ + +#define NAO_SAMPLES 8 +#define M_PI 3.1415926535f + +typedef float<3> vec; + +struct Isect { + float t; + vec p; + vec n; + int hit; +}; + +struct Sphere { + vec center; + float radius; +}; + +struct Plane { + vec p; + vec n; +}; + +struct Ray { + vec org; + vec dir; +}; + +static inline float dot(vec a, vec b) { + return a.x * b.x + a.y * b.y + a.z * b.z; +} + +static inline vec vcross(vec v0, vec v1) { + vec ret; + ret.x = v0.y * v1.z - v0.z * v1.y; + ret.y = v0.z * v1.x - v0.x * v1.z; + ret.z = v0.x * v1.y - v0.y * v1.x; + return ret; +} + +static inline void vnormalize(vec &v) { + float len2 = dot(v, v); + float invlen = rsqrt(len2); + v *= invlen; +} + + +static void +ray_plane_intersect(Isect &isect, Ray &ray, const uniform Plane &plane) { + float d = -dot(plane.p, plane.n); + float v = dot(ray.dir, plane.n); + + cif (abs(v) < 1.0e-17) + return; + else { + float t = -(dot(ray.org, plane.n) + d) / v; + + cif ((t > 0.0) && (t < isect.t)) { + isect.t = t; + isect.hit = 1; + isect.p = ray.org + ray.dir * t; + isect.n = plane.n; + } + } +} + + +static inline void +ray_sphere_intersect(Isect &isect, Ray &ray, const uniform Sphere &sphere) { + vec rs = ray.org - sphere.center; + + float B = dot(rs, ray.dir); + float C = dot(rs, rs) - sphere.radius * sphere.radius; + float D = B * B - C; + + cif (D > 0.) { + float t = -B - sqrt(D); + + cif ((t > 0.0) && (t < isect.t)) { + isect.t = t; + isect.hit = 1; + isect.p = ray.org + t * ray.dir; + isect.n = isect.p - sphere.center; + vnormalize(isect.n); + } + } +} + + +static void +orthoBasis(vec basis[3], vec n) { + basis[2] = n; + basis[1].x = 0.0; basis[1].y = 0.0; basis[1].z = 0.0; + + if ((n.x < 0.6) && (n.x > -0.6)) { + basis[1].x = 1.0; + } else if ((n.y < 0.6) && (n.y > -0.6)) { + basis[1].y = 1.0; + } else if ((n.z < 0.6) && (n.z > -0.6)) { + basis[1].z = 1.0; + } else { + basis[1].x = 1.0; + } + + basis[0] = vcross(basis[1], basis[2]); + vnormalize(basis[0]); + + basis[1] = vcross(basis[2], basis[0]); + vnormalize(basis[1]); +} + + +static float +ambient_occlusion(Isect &isect, const uniform Plane &plane, const uniform Sphere spheres[3], + RNGState &rngstate) { + float eps = 0.0001f; + vec p, n; + vec basis[3]; + float occlusion = 0.0; + + p = isect.p + eps * isect.n; + + orthoBasis(basis, isect.n); + + static const uniform int ntheta = NAO_SAMPLES; + static const uniform int nphi = NAO_SAMPLES; + for (uniform int j = 0; j < ntheta; j++) { + for (uniform int i = 0; i < nphi; i++) { + Ray ray; + Isect occIsect; + + float theta = sqrt(frandom(&rngstate)); + float phi = 2.0f * M_PI * frandom(&rngstate); + float x = cos(phi) * theta; + float y = sin(phi) * theta; + float z = sqrt(1.0 - theta * theta); + + // local . global + float rx = x * basis[0].x + y * basis[1].x + z * basis[2].x; + float ry = x * basis[0].y + y * basis[1].y + z * basis[2].y; + float rz = x * basis[0].z + y * basis[1].z + z * basis[2].z; + + ray.org = p; + ray.dir.x = rx; + ray.dir.y = ry; + ray.dir.z = rz; + + occIsect.t = 1.0e+17; + occIsect.hit = 0; + + for (uniform int snum = 0; snum < 3; ++snum) + ray_sphere_intersect(occIsect, ray, spheres[snum]); + ray_plane_intersect (occIsect, ray, plane); + + if (occIsect.hit) occlusion += 1.0; + } + } + + occlusion = (ntheta * nphi - occlusion) / (float)(ntheta * nphi); + return occlusion; +} + + +/* Compute the image for the scanlines from [y0,y1), for an overall image + of width w and height h. + */ +static void ao_scanlines(uniform int y0, uniform int y1, uniform int w, + uniform int h, uniform int nsubsamples, + uniform float image[]) { + const static uniform Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } }; + const static uniform Sphere spheres[3] = { + { { -2.0f, 0.0f, -3.5f }, 0.5f }, + { { -0.5f, 0.0f, -3.0f }, 0.5f }, + { { 1.0f, 0.0f, -2.2f }, 0.5f } }; + RNGState rngstate; + + seed_rng(&rngstate, programIndex + (y0 << (programIndex & 15))); + float invSamples = 1.f / nsubsamples; + + foreach_tiled(y = y0 ... y1, x = 0 ... w, + u = 0 ... nsubsamples, v = 0 ... nsubsamples) { + float du = (float)u * invSamples, dv = (float)v * invSamples; + + // Figure out x,y pixel in NDC + float px = (x + du - (w / 2.0f)) / (w / 2.0f); + float py = -(y + dv - (h / 2.0f)) / (h / 2.0f); + float ret = 0.f; + Ray ray; + Isect isect; + + ray.org = 0.f; + + // Poor man's perspective projection + ray.dir.x = px; + ray.dir.y = py; + ray.dir.z = -1.0; + vnormalize(ray.dir); + + isect.t = 1.0e+17; + isect.hit = 0; + + for (uniform int snum = 0; snum < 3; ++snum) + ray_sphere_intersect(isect, ray, spheres[snum]); + ray_plane_intersect(isect, ray, plane); + + // Note use of 'coherent' if statement; the set of rays we + // trace will often all hit or all miss the scene + cif (isect.hit) { + ret = ambient_occlusion(isect, plane, spheres, rngstate); + ret *= invSamples * invSamples; + + int offset = 3 * (y * w + x); + atomic_add_local(&image[offset], ret); + atomic_add_local(&image[offset+1], ret); + atomic_add_local(&image[offset+2], ret); + } + } +} + + +export void ao_ispc(uniform int w, uniform int h, uniform int nsubsamples, + uniform float image[]) { + ao_scanlines(0, h, w, h, nsubsamples, image); +} + + +static void task ao_task(uniform int width, uniform int height, + uniform int nsubsamples, uniform float image[]) { + ao_scanlines(taskIndex, taskIndex+1, width, height, nsubsamples, image); +} + + +export void ao_ispc_tasks(uniform int w, uniform int h, uniform int nsubsamples, + uniform float image[]) { + launch[h] ao_task(w, h, nsubsamples, image); +} diff --git a/examples_ptx/aobench/ao_serial.cpp b/examples_ptx/aobench/ao_serial.cpp new file mode 100644 index 00000000..69af2eba --- /dev/null +++ b/examples_ptx/aobench/ao_serial.cpp @@ -0,0 +1,314 @@ +// -*- mode: c++ -*- +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +/* + Based on Syoyo Fujita's aobench: http://code.google.com/p/aobench +*/ + +#ifdef _MSC_VER +#define _CRT_SECURE_NO_WARNINGS +#define NOMINMAX +#pragma warning (disable: 4244) +#pragma warning (disable: 4305) +#endif + +#include +#include + +#ifdef _MSC_VER +static long long drand48_x = 0x1234ABCD330E; + +static inline void srand48(int x) { + drand48_x = x ^ (x << 16); +} + +static inline double drand48() { + drand48_x = drand48_x * 0x5DEECE66D + 0xB; + return (drand48_x & 0xFFFFFFFFFFFF) * (1.0 / 281474976710656.0); +} +#endif // _MSC_VER + +#ifdef _MSC_VER +__declspec(align(16)) +#endif +struct vec { + vec() { x=y=z=pad=0.; } + vec(float xx, float yy, float zz) { x = xx; y = yy; z = zz; } + + vec operator*(float f) const { return vec(x*f, y*f, z*f); } + vec operator+(const vec &f2) const { + return vec(x+f2.x, y+f2.y, z+f2.z); + } + vec operator-(const vec &f2) const { + return vec(x-f2.x, y-f2.y, z-f2.z); + } + vec operator*(const vec &f2) const { + return vec(x*f2.x, y*f2.y, z*f2.z); + } + float x, y, z; + float pad; +} +#ifndef _MSC_VER +__attribute__ ((aligned(16))) +#endif +; +inline vec operator*(float f, const vec &v) { return vec(f*v.x, f*v.y, f*v.z); } + + +#define NAO_SAMPLES 8 + +#ifdef M_PI +#undef M_PI +#endif +#define M_PI 3.1415926535f + +struct Isect { + float t; + vec p; + vec n; + int hit; +}; + +struct Sphere { + vec center; + float radius; + +}; + +struct Plane { + vec p; + vec n; +}; + +struct Ray { + vec org; + vec dir; +}; + +static inline float dot(const vec &a, const vec &b) { + return a.x * b.x + a.y * b.y + a.z * b.z; +} + +static inline vec vcross(const vec &v0, const vec &v1) { + vec ret; + ret.x = v0.y * v1.z - v0.z * v1.y; + ret.y = v0.z * v1.x - v0.x * v1.z; + ret.z = v0.x * v1.y - v0.y * v1.x; + return ret; +} + +static inline void vnormalize(vec &v) { + float len2 = dot(v, v); + float invlen = 1.f / sqrtf(len2); + v = v * invlen; +} + + +static inline void +ray_plane_intersect(Isect &isect, Ray &ray, + Plane &plane) { + float d = -dot(plane.p, plane.n); + float v = dot(ray.dir, plane.n); + + if (fabsf(v) < 1.0e-17f) + return; + else { + float t = -(dot(ray.org, plane.n) + d) / v; + + if ((t > 0.0) && (t < isect.t)) { + isect.t = t; + isect.hit = 1; + isect.p = ray.org + ray.dir * t; + isect.n = plane.n; + } + } +} + + +static inline void +ray_sphere_intersect(Isect &isect, Ray &ray, + Sphere &sphere) { + vec rs = ray.org - sphere.center; + + float B = dot(rs, ray.dir); + float C = dot(rs, rs) - sphere.radius * sphere.radius; + float D = B * B - C; + + if (D > 0.) { + float t = -B - sqrtf(D); + + if ((t > 0.0) && (t < isect.t)) { + isect.t = t; + isect.hit = 1; + isect.p = ray.org + t * ray.dir; + isect.n = isect.p - sphere.center; + vnormalize(isect.n); + } + } +} + + +static inline void +orthoBasis(vec basis[3], const vec &n) { + basis[2] = n; + basis[1].x = 0.0; basis[1].y = 0.0; basis[1].z = 0.0; + + if ((n.x < 0.6f) && (n.x > -0.6f)) { + basis[1].x = 1.0; + } else if ((n.y < 0.6f) && (n.y > -0.6f)) { + basis[1].y = 1.0; + } else if ((n.z < 0.6f) && (n.z > -0.6f)) { + basis[1].z = 1.0; + } else { + basis[1].x = 1.0; + } + + basis[0] = vcross(basis[1], basis[2]); + vnormalize(basis[0]); + + basis[1] = vcross(basis[2], basis[0]); + vnormalize(basis[1]); +} + + +static float +ambient_occlusion(Isect &isect, Plane &plane, + Sphere spheres[3]) { + float eps = 0.0001f; + vec p, n; + vec basis[3]; + float occlusion = 0.0; + + p = isect.p + eps * isect.n; + + orthoBasis(basis, isect.n); + + static const int ntheta = NAO_SAMPLES; + static const int nphi = NAO_SAMPLES; + for (int j = 0; j < ntheta; j++) { + for (int i = 0; i < nphi; i++) { + Ray ray; + Isect occIsect; + + float theta = sqrtf(drand48()); + float phi = 2.0f * M_PI * drand48(); + float x = cosf(phi) * theta; + float y = sinf(phi) * theta; + float z = sqrtf(1.0f - theta * theta); + + // local . global + float rx = x * basis[0].x + y * basis[1].x + z * basis[2].x; + float ry = x * basis[0].y + y * basis[1].y + z * basis[2].y; + float rz = x * basis[0].z + y * basis[1].z + z * basis[2].z; + + ray.org = p; + ray.dir.x = rx; + ray.dir.y = ry; + ray.dir.z = rz; + + occIsect.t = 1.0e+17f; + occIsect.hit = 0; + + for (int snum = 0; snum < 3; ++snum) + ray_sphere_intersect(occIsect, ray, spheres[snum]); + ray_plane_intersect (occIsect, ray, plane); + + if (occIsect.hit) occlusion += 1.f; + } + } + + occlusion = (ntheta * nphi - occlusion) / (float)(ntheta * nphi); + return occlusion; +} + + +/* Compute the image for the scanlines from [y0,y1), for an overall image + of width w and height h. + */ +static void ao_scanlines(int y0, int y1, int w, int h, int nsubsamples, + float image[]) { + static Plane plane = { vec(0.0f, -0.5f, 0.0f), vec(0.f, 1.f, 0.f) }; + static Sphere spheres[3] = { + { vec(-2.0f, 0.0f, -3.5f), 0.5f }, + { vec(-0.5f, 0.0f, -3.0f), 0.5f }, + { vec(1.0f, 0.0f, -2.2f), 0.5f } }; + + srand48(y0); + + for (int y = y0; y < y1; ++y) { + for (int x = 0; x < w; ++x) { + int offset = 3 * (y * w + x); + for (int u = 0; u < nsubsamples; ++u) { + for (int v = 0; v < nsubsamples; ++v) { + float px = (x + (u / (float)nsubsamples) - (w / 2.0f)) / (w / 2.0f); + float py = -(y + (v / (float)nsubsamples) - (h / 2.0f)) / (h / 2.0f); + float ret = 0.f; + Ray ray; + Isect isect; + + ray.org = vec(0.f, 0.f, 0.f); + + ray.dir.x = px; + ray.dir.y = py; + ray.dir.z = -1.0f; + vnormalize(ray.dir); + + isect.t = 1.0e+17f; + isect.hit = 0; + + for (int snum = 0; snum < 3; ++snum) + ray_sphere_intersect(isect, ray, spheres[snum]); + ray_plane_intersect(isect, ray, plane); + + if (isect.hit) + ret = ambient_occlusion(isect, plane, spheres); + + // Update image for AO for this ray + image[offset+0] += ret; + image[offset+1] += ret; + image[offset+2] += ret; + } + } + // Normalize image pixels by number of samples taken per pixel + image[offset+0] /= nsubsamples * nsubsamples; + image[offset+1] /= nsubsamples * nsubsamples; + image[offset+2] /= nsubsamples * nsubsamples; + } + } +} + + +void ao_serial(int w, int h, int nsubsamples, + float image[]) { + ao_scanlines(0, h, w, h, nsubsamples, image); +} diff --git a/examples_ptx/aobench/aobench.vcxproj b/examples_ptx/aobench/aobench.vcxproj new file mode 100644 index 00000000..298be2cb --- /dev/null +++ b/examples_ptx/aobench/aobench.vcxproj @@ -0,0 +1,34 @@ + + + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + + + {F29204CA-19DF-4F3C-87D5-03F4EEDAAFEB} + Win32Proj + aobench + ao + sse2,sse4,avx1-i32x8 + + + + + + + + diff --git a/examples_ptx/common_gpu.mk b/examples_ptx/common_gpu.mk index 46aa5190..2a0c4af9 100644 --- a/examples_ptx/common_gpu.mk +++ b/examples_ptx/common_gpu.mk @@ -11,7 +11,11 @@ LD=nvcc LDFLAGS=-lcudart -lcudadevrt -arch=sm_35 # PTXCC=ptxcc -PTXCC_FLAGS = -maxrregcount=32 -Xptxas=-v +PTXCC_FLAGS = -Xptxas=-v +ifdef PTXCC_REGMAX + PTXCC_FLAGS += -maxrregcount=$(PTXCC_REGMAX) +endif + # ISPC=ispc ISPC_FLAGS=-O3 --math-lib=default --target=nvptx64 --opt=fast-math