diff --git a/examples_ptx/aobench/ao.cu b/examples_ptx/aobench/ao.cu new file mode 100644 index 00000000..7c8c2f43 --- /dev/null +++ b/examples_ptx/aobench/ao.cu @@ -0,0 +1,419 @@ +// -*- mode: c++ -*- +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +/* + Based on Syoyo Fujita's aobench: http://code.google.com/p/aobench + */ + +#include "cuda_helpers.cuh" + +#define NAO_SAMPLES 8 +//#define M_PI 3.1415926535f + +#define vec Float3 +struct Float3 +{ + float x,y,z; + + __device__ friend Float3 operator+(const Float3 a, const Float3 b) + { + Float3 c; + c.x = a.x+b.x; + c.y = a.y+b.y; + c.z = a.z+b.z; + return c; + } + __device__ friend Float3 operator-(const Float3 a, const Float3 b) + { + Float3 c; + c.x = a.x-b.x; + c.y = a.y-b.y; + c.z = a.z-b.z; + return c; + } + __device__ friend Float3 operator/(const Float3 a, const Float3 b) + { + Float3 c; + c.x = a.x/b.x; + c.y = a.y/b.y; + c.z = a.z/b.z; + return c; + } + __device__ friend Float3 operator/(const float a, const Float3 b) + { + Float3 c; + c.x = a/b.x; + c.y = a/b.y; + c.z = a/b.z; + return c; + } + __device__ friend Float3 operator*(const Float3 a, const Float3 b) + { + Float3 c; + c.x = a.x*b.x; + c.y = a.y*b.y; + c.z = a.z*b.z; + return c; + } + __device__ friend Float3 operator*(const Float3 a, const float b) + { + Float3 c; + c.x = a.x*b; + c.y = a.y*b; + c.z = a.z*b; + return c; + } +}; + +/////////////////////////////////////////////////////////////////////////// +// RNG stuff + +struct RNGState { + unsigned int z1, z2, z3, z4; +}; + +__device__ +static inline unsigned int random(RNGState * state) +{ + unsigned int b; + + b = ((state->z1 << 6) ^ state->z1) >> 13; + state->z1 = ((state->z1 & 4294967294U) << 18) ^ b; + b = ((state->z2 << 2) ^ state->z2) >> 27; + state->z2 = ((state->z2 & 4294967288U) << 2) ^ b; + b = ((state->z3 << 13) ^ state->z3) >> 21; + state->z3 = ((state->z3 & 4294967280U) << 7) ^ b; + b = ((state->z4 << 3) ^ state->z4) >> 12; + state->z4 = ((state->z4 & 4294967168U) << 13) ^ b; + return (state->z1 ^ state->z2 ^ state->z3 ^ state->z4); +} + + +__device__ +static inline float frandom(RNGState * state) +{ + unsigned int irand = random(state); + irand &= (1ul<<23)-1; + return __int_as_float(0x3F800000 | irand)-1.0f; +} + +__device__ +static inline void seed_rng(RNGState * state, + unsigned int seed) { + state->z1 = seed; + state->z2 = seed ^ 0xbeeff00d; + state->z3 = ((seed & 0xfffful) << 16) | (seed >> 16); + state->z4 = (((seed & 0xfful) << 24) | ((seed & 0xff00ul) << 8) | + ((seed & 0xff0000ul) >> 8) | (seed & 0xff000000ul) >> 24); +} + + + +struct Isect { + float t; + vec p; + vec n; + int hit; +}; + +struct Sphere { + vec center; + float radius; +}; + +struct Plane { + vec p; + vec n; +}; + +struct Ray { + vec org; + vec dir; +}; + +__device__ +static inline float dot(vec a, vec b) { + return a.x * b.x + a.y * b.y + a.z * b.z; +} + +__device__ +static inline vec vcross(vec v0, vec v1) { + vec ret; + ret.x = v0.y * v1.z - v0.z * v1.y; + ret.y = v0.z * v1.x - v0.x * v1.z; + ret.z = v0.x * v1.y - v0.y * v1.x; + return ret; +} + +__device__ +static inline void vnormalize(vec &v) { + float len2 = dot(v, v); + float invlen = rsqrt(len2); + v = v*invlen; +} + + +__device__ +static inline void +ray_plane_intersect(Isect &isect,const Ray &ray, const Plane &plane) { + float d = -dot(plane.p, plane.n); + float v = dot(ray.dir, plane.n); + + if (abs(v) < 1.0e-17) + return; + else { + float t = -(dot(ray.org, plane.n) + d) / v; + + if ((t > 0.0) && (t < isect.t)) { + isect.t = t; + isect.hit = 1; + isect.p = ray.org + ray.dir * t; + isect.n = plane.n; + } + } +} + + +__device__ +static inline void +ray_sphere_intersect(Isect &isect,const Ray &ray, const Sphere &sphere) { + vec rs = ray.org - sphere.center; + + float B = dot(rs, ray.dir); + float C = dot(rs, rs) - sphere.radius * sphere.radius; + float D = B * B - C; + + if (D > 0.) { + float t = -B - sqrt(D); + + if ((t > 0.0) && (t < isect.t)) { + isect.t = t; + isect.hit = 1; + isect.p = ray.org + ray.dir * t; + isect.n = isect.p - sphere.center; + vnormalize(isect.n); + } + } +} + + +__device__ +static inline void +orthoBasis(vec basis[3], vec n) { + basis[2] = n; + basis[1].x = 0.0; basis[1].y = 0.0; basis[1].z = 0.0; + + if ((n.x < 0.6) && (n.x > -0.6)) { + basis[1].x = 1.0; + } else if ((n.y < 0.6) && (n.y > -0.6)) { + basis[1].y = 1.0; + } else if ((n.z < 0.6) && (n.z > -0.6)) { + basis[1].z = 1.0; + } else { + basis[1].x = 1.0; + } + + basis[0] = vcross(basis[1], basis[2]); + vnormalize(basis[0]); + + basis[1] = vcross(basis[2], basis[0]); + vnormalize(basis[1]); +} + + +__device__ +static inline float +ambient_occlusion(Isect &isect, const Plane &plane, const Sphere spheres[3], + RNGState &rngstate) { + float eps = 0.0001f; + vec p; //, n; + vec basis[3]; + float occlusion = 0.0; + + p = isect.p + isect.n * eps; + + orthoBasis(basis, isect.n); + + const int ntheta = NAO_SAMPLES; + const int nphi = NAO_SAMPLES; + for ( int j = 0; j < ntheta; j++) { + for ( int i = 0; i < nphi; i++) { + Ray ray; + Isect occIsect; + + float theta = sqrt(frandom(&rngstate)); + float phi = 2.0f * M_PI * frandom(&rngstate); + float x = cos(phi) * theta; + float y = sin(phi) * theta; + float z = sqrt(1.0 - theta * theta); + + // local . global + float rx = x * basis[0].x + y * basis[1].x + z * basis[2].x; + float ry = x * basis[0].y + y * basis[1].y + z * basis[2].y; + float rz = x * basis[0].z + y * basis[1].z + z * basis[2].z; + + ray.org = p; + ray.dir.x = rx; + ray.dir.y = ry; + ray.dir.z = rz; + + occIsect.t = 1.0e+17; + occIsect.hit = 0; + + for ( int snum = 0; snum < 3; ++snum) + ray_sphere_intersect(occIsect, ray, spheres[snum]); + ray_plane_intersect (occIsect, ray, plane); + + if (occIsect.hit) occlusion += 1.0; + } + } + + occlusion = (ntheta * nphi - occlusion) / (float)(ntheta * nphi); + return occlusion; +} + + +/* Compute the image for the scanlines from [y0,y1), for an overall image + of width w and height h. + */ +__device__ +static inline void ao_tiles( + int x0, int x1, + int y0, int y1, + int w, int h, + int nsubsamples, + float image[]) +{ + const Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } }; + const Sphere spheres[3] = { + { { -2.0f, 0.0f, -3.5f }, 0.5f }, + { { -0.5f, 0.0f, -3.0f }, 0.5f }, + { { 1.0f, 0.0f, -2.2f }, 0.5f } }; + RNGState rngstate; + + seed_rng(&rngstate, programIndex + (y0 << (programIndex & 15))); + float invSamples = 1.f / nsubsamples; + for ( int y = y0; y < y1; y++) + for ( int x = programIndex+x0; x < x1; x += programCount) + { + const int offset = 3 * (y * w + x); + float res = 0.0f; + + for ( int u = 0; u < nsubsamples; u++) + for ( int v = 0; v < nsubsamples; v++) + { + float du = (float)u * invSamples, dv = (float)v * invSamples; + + // Figure out x,y pixel in NDC + float px = (x + du - (w / 2.0f)) / (w / 2.0f); + float py = -(y + dv - (h / 2.0f)) / (h / 2.0f); + float ret = 0.f; + Ray ray; + Isect isect; + + ray.org.x = 0.0f; + ray.org.y = 0.0f; + ray.org.z = 0.0f; + + // Poor man's perspective projection + ray.dir.x = px; + ray.dir.y = py; + ray.dir.z = -1.0; + vnormalize(ray.dir); + + isect.t = 1.0e+17; + isect.hit = 0; + + for ( int snum = 0; snum < 3; ++snum) + ray_sphere_intersect(isect, ray, spheres[snum]); + ray_plane_intersect(isect, ray, plane); + + // Note use of 'coherent' if statement; the set of rays we + // trace will often all hit or all miss the scene + if (isect.hit) { + ret = ambient_occlusion(isect, plane, spheres, rngstate); + ret *= invSamples * invSamples; + res += ret; + } + } + + if (x < x1) + { + image[offset ] = res; + image[offset+1] = res; + image[offset+2] = res; + } + } +} + + + +#define TILEX 64 +#define TILEY 4 + +extern "C" +__global__ +void ao_task( int width, int height, + int nsubsamples, float image[]) +{ + if (taskIndex0 >= taskCount0) return; + if (taskIndex1 >= taskCount1) return; + + const int x0 = taskIndex0 * TILEX; + const int x1 = min(x0 + TILEX, width); + + const int y0 = taskIndex1 * TILEY; + const int y1 = min(y0 + TILEY, height); + ao_tiles(x0,x1,y0,y1, width, height, nsubsamples, image); +} + +extern "C" +__global__ +void ao_ispc_tasks___export( + int w, int h, int nsubsamples, + float image[]) +{ + const int ntilex = (w+TILEX-1)/TILEX; + const int ntiley = (h+TILEY-1)/TILEY; + launch(ntilex,ntiley,1,ao_task)(w,h,nsubsamples,image); + cudaDeviceSynchronize(); +} + +extern "C" +__host__ void ao_ispc_tasks( + int w, int h, int nsubsamples, + float image[]) +{ + ao_ispc_tasks___export<<<1,32>>>(w,h,nsubsamples,image); + cudaDeviceSynchronize(); +}