diff --git a/examples_cuda/aobench/Makefile b/examples_cuda/aobench/Makefile index 05d548fa..aca9428a 100644 --- a/examples_cuda/aobench/Makefile +++ b/examples_cuda/aobench/Makefile @@ -1,7 +1,7 @@ EXAMPLE=ao CPP_SRC=ao.cpp ao_serial.cpp -ISPC_SRC=ao.ispc +ISPC_SRC=ao1.ispc ISPC_IA_TARGETS=avx ISPC_ARM_TARGETS=neon diff --git a/examples_cuda/aobench/ao.cu b/examples_cuda/aobench/ao.cu index dd096e86..ca0db34b 100644 --- a/examples_cuda/aobench/ao.cu +++ b/examples_cuda/aobench/ao.cu @@ -1,283 +1,362 @@ // -*- mode: c++ -*- /* - Copyright (c) 2010-2011, Intel Corporation - All rights reserved. + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are +met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. - * Neither the name of Intel Corporation nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS - IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER - OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*/ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ /* - Based on Syoyo Fujita's aobench: http://code.google.com/p/aobench -*/ + Based on Syoyo Fujita's aobench: http://code.google.com/p/aobench + */ #define NAO_SAMPLES 8 -#define M_PI 3.1415926535f +//#define M_PI 3.1415926535f -typedef float<3> vec; +#define vec Float3 +struct Float3 +{ + float x,y,z; -#ifdef __NVPTX__ -#warning "emitting DEVICE code" -#define programCount warpSize() -#define programIndex laneIndex() -#define taskIndex blockIndex0() -#define taskCount blockCount0() -#else -#warning "emitting HOST code" -#endif + __device__ friend Float3 operator+(const Float3 a, const Float3 b) + { + Float3 c; + c.x = a.x+b.x; + c.y = a.y+b.y; + c.z = a.z+b.z; + return c; + } + __device__ friend Float3 operator-(const Float3 a, const Float3 b) + { + Float3 c; + c.x = a.x-b.x; + c.y = a.y-b.y; + c.z = a.z-b.z; + return c; + } + __device__ friend Float3 operator/(const Float3 a, const Float3 b) + { + Float3 c; + c.x = a.x/b.x; + c.y = a.y/b.y; + c.z = a.z/b.z; + return c; + } + __device__ friend Float3 operator/(const float a, const Float3 b) + { + Float3 c; + c.x = a/b.x; + c.y = a/b.y; + c.z = a/b.z; + return c; + } + __device__ friend Float3 operator*(const Float3 a, const Float3 b) + { + Float3 c; + c.x = a.x*b.x; + c.y = a.y*b.y; + c.z = a.z*b.z; + return c; + } + __device__ friend Float3 operator*(const Float3 a, const float b) + { + Float3 c; + c.x = a.x*b; + c.y = a.y*b; + c.z = a.z*b; + return c; + } +}; + +#define programCount 32 +#define programIndex (threadIdx.x & 31) +#define taskIndex0 (blockIdx.x*4 + (threadIdx.x >> 5)) +#define taskCount0 (gridDim.x*4) +#define taskIndex1 (blockIdx.y) +#define taskCount1 (gridDim.y) +#define warpIdx (threadIdx.x >> 5) struct Isect { - float t; - vec p; - vec n; - int hit; + float t; + vec p; + vec n; + int hit; }; struct Sphere { - vec center; - float radius; + vec center; + float radius; }; struct Plane { - vec p; - vec n; + vec p; + vec n; }; struct Ray { - vec org; - vec dir; + vec org; + vec dir; }; +__device__ static inline float dot(vec a, vec b) { - return a.x * b.x + a.y * b.y + a.z * b.z; + return a.x * b.x + a.y * b.y + a.z * b.z; } +__device__ static inline vec vcross(vec v0, vec v1) { - vec ret; - ret.x = v0.y * v1.z - v0.z * v1.y; - ret.y = v0.z * v1.x - v0.x * v1.z; - ret.z = v0.x * v1.y - v0.y * v1.x; - return ret; + vec ret; + ret.x = v0.y * v1.z - v0.z * v1.y; + ret.y = v0.z * v1.x - v0.x * v1.z; + ret.z = v0.x * v1.y - v0.y * v1.x; + return ret; } +__device__ static inline void vnormalize(vec &v) { - float len2 = dot(v, v); - float invlen = rsqrt(len2); - v *= invlen; + float len2 = dot(v, v); + float invlen = rsqrt(len2); + v = v*invlen; } +__device__ static inline void -ray_plane_intersect(Isect &isect, Ray &ray, uniform Plane &plane) { - float d = -dot(plane.p, plane.n); - float v = dot(ray.dir, plane.n); +ray_plane_intersect(Isect &isect,const Ray &ray, const Plane &plane) { + float d = -dot(plane.p, plane.n); + float v = dot(ray.dir, plane.n); - if (abs(v) < 1.0e-17) - return; - else { - float t = -(dot(ray.org, plane.n) + d) / v; + if (abs(v) < 1.0e-17) + return; + else { + float t = -(dot(ray.org, plane.n) + d) / v; - if ((t > 0.0) && (t < isect.t)) { - isect.t = t; - isect.hit = 1; - isect.p = ray.org + ray.dir * t; - isect.n = plane.n; - } + if ((t > 0.0) && (t < isect.t)) { + isect.t = t; + isect.hit = 1; + isect.p = ray.org + ray.dir * t; + isect.n = plane.n; } + } } +__device__ static inline void -ray_sphere_intersect(Isect &isect, Ray &ray, uniform Sphere &sphere) { - vec rs = ray.org - sphere.center; +ray_sphere_intersect(Isect &isect,const Ray &ray, const Sphere &sphere) { + vec rs = ray.org - sphere.center; - float B = dot(rs, ray.dir); - float C = dot(rs, rs) - sphere.radius * sphere.radius; - float D = B * B - C; + float B = dot(rs, ray.dir); + float C = dot(rs, rs) - sphere.radius * sphere.radius; + float D = B * B - C; - if (D > 0.) { - float t = -B - sqrt(D); + if (D > 0.) { + float t = -B - sqrt(D); - if ((t > 0.0) && (t < isect.t)) { - isect.t = t; - isect.hit = 1; - isect.p = ray.org + t * ray.dir; - isect.n = isect.p - sphere.center; - vnormalize(isect.n); - } + if ((t > 0.0) && (t < isect.t)) { + isect.t = t; + isect.hit = 1; + isect.p = ray.org + ray.dir * t; + isect.n = isect.p - sphere.center; + vnormalize(isect.n); } + } } +__device__ static inline void orthoBasis(vec basis[3], vec n) { - basis[2] = n; - basis[1].x = 0.0; basis[1].y = 0.0; basis[1].z = 0.0; + basis[2] = n; + basis[1].x = 0.0; basis[1].y = 0.0; basis[1].z = 0.0; - if ((n.x < 0.6) && (n.x > -0.6)) { - basis[1].x = 1.0; - } else if ((n.y < 0.6) && (n.y > -0.6)) { - basis[1].y = 1.0; - } else if ((n.z < 0.6) && (n.z > -0.6)) { - basis[1].z = 1.0; - } else { - basis[1].x = 1.0; - } + if ((n.x < 0.6) && (n.x > -0.6)) { + basis[1].x = 1.0; + } else if ((n.y < 0.6) && (n.y > -0.6)) { + basis[1].y = 1.0; + } else if ((n.z < 0.6) && (n.z > -0.6)) { + basis[1].z = 1.0; + } else { + basis[1].x = 1.0; + } - basis[0] = vcross(basis[1], basis[2]); - vnormalize(basis[0]); + basis[0] = vcross(basis[1], basis[2]); + vnormalize(basis[0]); - basis[1] = vcross(basis[2], basis[0]); - vnormalize(basis[1]); + basis[1] = vcross(basis[2], basis[0]); + vnormalize(basis[1]); } +__device__ static inline float -ambient_occlusion(Isect &isect, uniform Plane &plane, uniform Sphere spheres[3], - RNGState &rngstate) { - float eps = 0.0001f; - vec p, n; - vec basis[3]; - float occlusion = 0.0; +ambient_occlusion(Isect &isect, const Plane &plane, const Sphere spheres[3], + RNGState &rngstate) { + float eps = 0.0001f; + vec p; //, n; + vec basis[3]; + float occlusion = 0.0; - p = isect.p + eps * isect.n; + p = isect.p + isect.n * eps; - orthoBasis(basis, isect.n); + orthoBasis(basis, isect.n); - static const uniform int ntheta = NAO_SAMPLES; - static const uniform int nphi = NAO_SAMPLES; - for (uniform int j = 0; j < ntheta; j++) { - for (uniform int i = 0; i < nphi; i++) { - Ray ray; - Isect occIsect; + const int ntheta = NAO_SAMPLES; + const int nphi = NAO_SAMPLES; + for ( int j = 0; j < ntheta; j++) { + for ( int i = 0; i < nphi; i++) { + Ray ray; + Isect occIsect; - float theta = sqrt(frandom(&rngstate)); - float phi = 2.0f * M_PI * frandom(&rngstate); - float x = cos(phi) * theta; - float y = sin(phi) * theta; - float z = sqrt(1.0 - theta * theta); + float theta = sqrt(frandom(&rngstate)); + float phi = 2.0f * M_PI * frandom(&rngstate); + float x = cos(phi) * theta; + float y = sin(phi) * theta; + float z = sqrt(1.0 - theta * theta); - // local . global - float rx = x * basis[0].x + y * basis[1].x + z * basis[2].x; - float ry = x * basis[0].y + y * basis[1].y + z * basis[2].y; - float rz = x * basis[0].z + y * basis[1].z + z * basis[2].z; + // local . global + float rx = x * basis[0].x + y * basis[1].x + z * basis[2].x; + float ry = x * basis[0].y + y * basis[1].y + z * basis[2].y; + float rz = x * basis[0].z + y * basis[1].z + z * basis[2].z; - ray.org = p; - ray.dir.x = rx; - ray.dir.y = ry; - ray.dir.z = rz; + ray.org = p; + ray.dir.x = rx; + ray.dir.y = ry; + ray.dir.z = rz; - occIsect.t = 1.0e+17; - occIsect.hit = 0; + occIsect.t = 1.0e+17; + occIsect.hit = 0; - for (uniform int snum = 0; snum < 3; ++snum) - ray_sphere_intersect(occIsect, ray, spheres[snum]); - ray_plane_intersect (occIsect, ray, plane); + for ( int snum = 0; snum < 3; ++snum) + ray_sphere_intersect(occIsect, ray, spheres[snum]); + ray_plane_intersect (occIsect, ray, plane); - if (occIsect.hit) occlusion += 1.0; - } + if (occIsect.hit) occlusion += 1.0; } + } - occlusion = (ntheta * nphi - occlusion) / (float)(ntheta * nphi); - return occlusion; + occlusion = (ntheta * nphi - occlusion) / (float)(ntheta * nphi); + return occlusion; } /* Compute the image for the scanlines from [y0,y1), for an overall image of width w and height h. - */ -static inline void ao_scanlines(uniform int y0, uniform int y1, uniform int w, - uniform int h, uniform int nsubsamples, - uniform float image[]) { - static uniform Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } }; - static uniform Sphere spheres[3] = { - { { -2.0f, 0.0f, -3.5f }, 0.5f }, - { { -0.5f, 0.0f, -3.0f }, 0.5f }, - { { 1.0f, 0.0f, -2.2f }, 0.5f } }; - RNGState rngstate; + */ +__device__ +static inline void ao_tile( + int x0, int x1, + int y0, int y1, + int w, int h, + int nsubsamples, + float image[]) +{ + const Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } }; + const Sphere spheres[3] = { + { { -2.0f, 0.0f, -3.5f }, 0.5f }, + { { -0.5f, 0.0f, -3.0f }, 0.5f }, + { { 1.0f, 0.0f, -2.2f }, 0.5f } }; + RNGState rngstate; - seed_rng(&rngstate, programIndex + (y0 << (programIndex & 15))); - float invSamples = 1.f / nsubsamples; - - foreach_tiled(y = y0 ... y1, x = 0 ... w, - u = 0 ... nsubsamples, v = 0 ... nsubsamples) + seed_rng(&rngstate, programIndex + (y0 << (programIndex & 31))); + float invSamples = 1.f / nsubsamples; + for ( int y = y0; y < y1; y++) + for ( int xb = x0; xb < x1; xb += programCount) { - float du = (float)u * invSamples, dv = (float)v * invSamples; + const int x = xb + programIndex; + const int offset = 3 * (y * w + x); + float res = 0.0f; - // Figure out x,y pixel in NDC - float px = (x + du - (w / 2.0f)) / (w / 2.0f); - float py = -(y + dv - (h / 2.0f)) / (h / 2.0f); - float ret = 0.f; - Ray ray; - Isect isect; + for ( int u = 0; u < nsubsamples; u++) + for ( int v = 0; v < nsubsamples; v++) + { + float du = (float)u * invSamples, dv = (float)v * invSamples; - ray.org = 0.f; + // Figure out x,y pixel in NDC + float px = (x + du - (w / 2.0f)) / (w / 2.0f); + float py = -(y + dv - (h / 2.0f)) / (h / 2.0f); + float ret = 0.f; + Ray ray; + Isect isect; - // Poor man's perspective projection - ray.dir.x = px; - ray.dir.y = py; - ray.dir.z = -1.0; - vnormalize(ray.dir); + ray.org.x = 0.0f; + ray.org.y = 0.0f; + ray.org.z = 0.0f; - isect.t = 1.0e+17; - isect.hit = 0; + // Poor man's perspective projection + ray.dir.x = px; + ray.dir.y = py; + ray.dir.z = -1.0; + vnormalize(ray.dir); - for (uniform int snum = 0; snum < 3; ++snum) + isect.t = 1.0e+17; + isect.hit = 0; + + for ( int snum = 0; snum < 3; ++snum) ray_sphere_intersect(isect, ray, spheres[snum]); - ray_plane_intersect(isect, ray, plane); + ray_plane_intersect(isect, ray, plane); - // Note use of 'coherent' if statement; the set of rays we - // trace will often all hit or all miss the scene - if (isect.hit) { + // Note use of 'coherent' if statement; the set of rays we + // trace will often all hit or all miss the scene + if (isect.hit) { ret = ambient_occlusion(isect, plane, spheres, rngstate); ret *= invSamples * invSamples; - - int offset = 3 * (y * w + x); - atomic_add_local(&image[offset], ret); - atomic_add_local(&image[offset+1], ret); - atomic_add_local(&image[offset+2], ret); + res += ret; + } } + + if (xb < x1) + { + image[offset ] = res; + image[offset+1] = res; + image[offset+2] = res; + } + } } -export void ao_ispc(uniform int w, uniform int h, uniform int nsubsamples, - uniform float image[]) { - ao_scanlines(0, h, w, h, nsubsamples, image); -} - - -static void task ao_task(uniform int width, uniform int height, - uniform int nsubsamples, uniform float image[]) { - ao_scanlines(taskIndex, taskIndex+1, width, height, nsubsamples, image); -} - - -export void ao_ispc_tasks(uniform int w, uniform int h, uniform int nsubsamples, - uniform float image[]) { - launch[h] ao_task(w, h, nsubsamples, image); + +#define TILEX 64 +#define TILEY 4 + +extern "C" +__global__ +void ao_task( int width, int height, + int nsubsamples, float image[]) +{ + if (taskIndex0 >= taskCount0) return; + if (taskIndex1 >= taskCount1) return; + + const int x0 = taskIndex0 * TILEX; + const int x1 = min(x0 + TILEX, width); + + const int y0 = taskIndex1 * TILEY; + const int y1 = min(y0 + TILEY, height); + ao_tile(x0,x1,y0,y1, width, height, nsubsamples, image); } diff --git a/examples_cuda/aobench/ao1.ispc b/examples_cuda/aobench/ao1.ispc index dd096e86..9a568fe9 100644 --- a/examples_cuda/aobench/ao1.ispc +++ b/examples_cuda/aobench/ao1.ispc @@ -1,39 +1,39 @@ // -*- mode: c++ -*- /* - Copyright (c) 2010-2011, Intel Corporation - All rights reserved. + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are +met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. - * Neither the name of Intel Corporation nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS - IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER - OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*/ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ /* - Based on Syoyo Fujita's aobench: http://code.google.com/p/aobench -*/ + Based on Syoyo Fujita's aobench: http://code.google.com/p/aobench + */ #define NAO_SAMPLES 8 #define M_PI 3.1415926535f @@ -44,240 +44,267 @@ typedef float<3> vec; #warning "emitting DEVICE code" #define programCount warpSize() #define programIndex laneIndex() -#define taskIndex blockIndex0() -#define taskCount blockCount0() +#define taskIndex0 blockIndex0() +#define taskCount0 blockCount0() +#define taskIndex1 blockIndex1() +#define taskCount1 blockCount1() #else #warning "emitting HOST code" #endif struct Isect { - float t; - vec p; - vec n; - int hit; + float t; + vec p; + vec n; + int hit; }; struct Sphere { - vec center; - float radius; + vec center; + float radius; }; struct Plane { - vec p; - vec n; + vec p; + vec n; }; struct Ray { - vec org; - vec dir; + vec org; + vec dir; }; static inline float dot(vec a, vec b) { - return a.x * b.x + a.y * b.y + a.z * b.z; + return a.x * b.x + a.y * b.y + a.z * b.z; } static inline vec vcross(vec v0, vec v1) { - vec ret; - ret.x = v0.y * v1.z - v0.z * v1.y; - ret.y = v0.z * v1.x - v0.x * v1.z; - ret.z = v0.x * v1.y - v0.y * v1.x; - return ret; + vec ret; + ret.x = v0.y * v1.z - v0.z * v1.y; + ret.y = v0.z * v1.x - v0.x * v1.z; + ret.z = v0.x * v1.y - v0.y * v1.x; + return ret; } static inline void vnormalize(vec &v) { - float len2 = dot(v, v); - float invlen = rsqrt(len2); - v *= invlen; + float len2 = dot(v, v); + float invlen = rsqrt(len2); + v *= invlen; } static inline void ray_plane_intersect(Isect &isect, Ray &ray, uniform Plane &plane) { - float d = -dot(plane.p, plane.n); - float v = dot(ray.dir, plane.n); + float d = -dot(plane.p, plane.n); + float v = dot(ray.dir, plane.n); - if (abs(v) < 1.0e-17) - return; - else { - float t = -(dot(ray.org, plane.n) + d) / v; + if (abs(v) < 1.0e-17) + return; + else { + float t = -(dot(ray.org, plane.n) + d) / v; - if ((t > 0.0) && (t < isect.t)) { - isect.t = t; - isect.hit = 1; - isect.p = ray.org + ray.dir * t; - isect.n = plane.n; - } + if ((t > 0.0) && (t < isect.t)) { + isect.t = t; + isect.hit = 1; + isect.p = ray.org + ray.dir * t; + isect.n = plane.n; } + } } static inline void ray_sphere_intersect(Isect &isect, Ray &ray, uniform Sphere &sphere) { - vec rs = ray.org - sphere.center; + vec rs = ray.org - sphere.center; - float B = dot(rs, ray.dir); - float C = dot(rs, rs) - sphere.radius * sphere.radius; - float D = B * B - C; + float B = dot(rs, ray.dir); + float C = dot(rs, rs) - sphere.radius * sphere.radius; + float D = B * B - C; - if (D > 0.) { - float t = -B - sqrt(D); + if (D > 0.) { + float t = -B - sqrt(D); - if ((t > 0.0) && (t < isect.t)) { - isect.t = t; - isect.hit = 1; - isect.p = ray.org + t * ray.dir; - isect.n = isect.p - sphere.center; - vnormalize(isect.n); - } + if ((t > 0.0) && (t < isect.t)) { + isect.t = t; + isect.hit = 1; + isect.p = ray.org + t * ray.dir; + isect.n = isect.p - sphere.center; + vnormalize(isect.n); } + } } static inline void orthoBasis(vec basis[3], vec n) { - basis[2] = n; - basis[1].x = 0.0; basis[1].y = 0.0; basis[1].z = 0.0; + basis[2] = n; + basis[1].x = 0.0; basis[1].y = 0.0; basis[1].z = 0.0; - if ((n.x < 0.6) && (n.x > -0.6)) { - basis[1].x = 1.0; - } else if ((n.y < 0.6) && (n.y > -0.6)) { - basis[1].y = 1.0; - } else if ((n.z < 0.6) && (n.z > -0.6)) { - basis[1].z = 1.0; - } else { - basis[1].x = 1.0; - } + if ((n.x < 0.6) && (n.x > -0.6)) { + basis[1].x = 1.0; + } else if ((n.y < 0.6) && (n.y > -0.6)) { + basis[1].y = 1.0; + } else if ((n.z < 0.6) && (n.z > -0.6)) { + basis[1].z = 1.0; + } else { + basis[1].x = 1.0; + } - basis[0] = vcross(basis[1], basis[2]); - vnormalize(basis[0]); + basis[0] = vcross(basis[1], basis[2]); + vnormalize(basis[0]); - basis[1] = vcross(basis[2], basis[0]); - vnormalize(basis[1]); + basis[1] = vcross(basis[2], basis[0]); + vnormalize(basis[1]); } static inline float ambient_occlusion(Isect &isect, uniform Plane &plane, uniform Sphere spheres[3], - RNGState &rngstate) { - float eps = 0.0001f; - vec p, n; - vec basis[3]; - float occlusion = 0.0; + RNGState &rngstate) { + float eps = 0.0001f; + vec p, n; + vec basis[3]; + float occlusion = 0.0; - p = isect.p + eps * isect.n; + p = isect.p + eps * isect.n; - orthoBasis(basis, isect.n); + orthoBasis(basis, isect.n); - static const uniform int ntheta = NAO_SAMPLES; - static const uniform int nphi = NAO_SAMPLES; - for (uniform int j = 0; j < ntheta; j++) { - for (uniform int i = 0; i < nphi; i++) { - Ray ray; - Isect occIsect; + const uniform int ntheta = NAO_SAMPLES; + const uniform int nphi = NAO_SAMPLES; + for (uniform int j = 0; j < ntheta; j++) { + for (uniform int i = 0; i < nphi; i++) { + Ray ray; + Isect occIsect; - float theta = sqrt(frandom(&rngstate)); - float phi = 2.0f * M_PI * frandom(&rngstate); - float x = cos(phi) * theta; - float y = sin(phi) * theta; - float z = sqrt(1.0 - theta * theta); + float theta = sqrt(frandom(&rngstate)); + float phi = 2.0f * M_PI * frandom(&rngstate); + float x = cos(phi) * theta; + float y = sin(phi) * theta; + float z = sqrt(1.0 - theta * theta); - // local . global - float rx = x * basis[0].x + y * basis[1].x + z * basis[2].x; - float ry = x * basis[0].y + y * basis[1].y + z * basis[2].y; - float rz = x * basis[0].z + y * basis[1].z + z * basis[2].z; + // local . global + float rx = x * basis[0].x + y * basis[1].x + z * basis[2].x; + float ry = x * basis[0].y + y * basis[1].y + z * basis[2].y; + float rz = x * basis[0].z + y * basis[1].z + z * basis[2].z; - ray.org = p; - ray.dir.x = rx; - ray.dir.y = ry; - ray.dir.z = rz; + ray.org = p; + ray.dir.x = rx; + ray.dir.y = ry; + ray.dir.z = rz; - occIsect.t = 1.0e+17; - occIsect.hit = 0; + occIsect.t = 1.0e+17; + occIsect.hit = 0; - for (uniform int snum = 0; snum < 3; ++snum) - ray_sphere_intersect(occIsect, ray, spheres[snum]); - ray_plane_intersect (occIsect, ray, plane); + for (uniform int snum = 0; snum < 3; ++snum) + ray_sphere_intersect(occIsect, ray, spheres[snum]); + ray_plane_intersect (occIsect, ray, plane); - if (occIsect.hit) occlusion += 1.0; - } + if (occIsect.hit) occlusion += 1.0; } + } - occlusion = (ntheta * nphi - occlusion) / (float)(ntheta * nphi); - return occlusion; + occlusion = (ntheta * nphi - occlusion) / (float)(ntheta * nphi); + return occlusion; } /* Compute the image for the scanlines from [y0,y1), for an overall image of width w and height h. - */ -static inline void ao_scanlines(uniform int y0, uniform int y1, uniform int w, - uniform int h, uniform int nsubsamples, - uniform float image[]) { - static uniform Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } }; - static uniform Sphere spheres[3] = { - { { -2.0f, 0.0f, -3.5f }, 0.5f }, - { { -0.5f, 0.0f, -3.0f }, 0.5f }, - { { 1.0f, 0.0f, -2.2f }, 0.5f } }; - RNGState rngstate; + */ +static inline void ao_tile( + uniform int x0, uniform int x1, + uniform int y0, uniform int y1, + uniform int w, uniform int h, + uniform int nsubsamples, + uniform float image[]) +{ + uniform Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } }; + uniform Sphere spheres[3] = { + { { -2.0f, 0.0f, -3.5f }, 0.5f }, + { { -0.5f, 0.0f, -3.0f }, 0.5f }, + { { 1.0f, 0.0f, -2.2f }, 0.5f } }; + RNGState rngstate; - seed_rng(&rngstate, programIndex + (y0 << (programIndex & 15))); - float invSamples = 1.f / nsubsamples; - - foreach_tiled(y = y0 ... y1, x = 0 ... w, - u = 0 ... nsubsamples, v = 0 ... nsubsamples) + seed_rng(&rngstate, programIndex + (y0 << (programIndex & 31))); + float invSamples = 1.f / nsubsamples; + for (uniform int y = y0; y < y1; y++) + for (uniform int xb = x0; xb < x1; xb += programCount) { - float du = (float)u * invSamples, dv = (float)v * invSamples; + const int x = xb + programIndex; + const int offset = 3 * (y * w + x); + float res = 0.0f; - // Figure out x,y pixel in NDC - float px = (x + du - (w / 2.0f)) / (w / 2.0f); - float py = -(y + dv - (h / 2.0f)) / (h / 2.0f); - float ret = 0.f; - Ray ray; - Isect isect; + for (uniform int u = 0; u < nsubsamples; u++) + for (uniform int v = 0; v < nsubsamples; v++) + { + float du = (float)u * invSamples, dv = (float)v * invSamples; - ray.org = 0.f; + // Figure out x,y pixel in NDC + float px = (x + du - (w / 2.0f)) / (w / 2.0f); + float py = -(y + dv - (h / 2.0f)) / (h / 2.0f); + float ret = 0.f; + Ray ray; + Isect isect; - // Poor man's perspective projection - ray.dir.x = px; - ray.dir.y = py; - ray.dir.z = -1.0; - vnormalize(ray.dir); + ray.org = 0.f; - isect.t = 1.0e+17; - isect.hit = 0; + // Poor man's perspective projection + ray.dir.x = px; + ray.dir.y = py; + ray.dir.z = -1.0; + vnormalize(ray.dir); - for (uniform int snum = 0; snum < 3; ++snum) + isect.t = 1.0e+17; + isect.hit = 0; + + for (uniform int snum = 0; snum < 3; ++snum) ray_sphere_intersect(isect, ray, spheres[snum]); - ray_plane_intersect(isect, ray, plane); + ray_plane_intersect(isect, ray, plane); - // Note use of 'coherent' if statement; the set of rays we - // trace will often all hit or all miss the scene - if (isect.hit) { + // Note use of 'coherent' if statement; the set of rays we + // trace will often all hit or all miss the scene + if (isect.hit) { ret = ambient_occlusion(isect, plane, spheres, rngstate); ret *= invSamples * invSamples; - - int offset = 3 * (y * w + x); - atomic_add_local(&image[offset], ret); - atomic_add_local(&image[offset+1], ret); - atomic_add_local(&image[offset+2], ret); + res += ret; + } } + + if (xb < x1) + { + image[offset ] = res; + image[offset+1] = res; + image[offset+2] = res; + } + } } -export void ao_ispc(uniform int w, uniform int h, uniform int nsubsamples, - uniform float image[]) { - ao_scanlines(0, h, w, h, nsubsamples, image); -} +#define TILEX 64 +#define TILEY 4 -static void task ao_task(uniform int width, uniform int height, - uniform int nsubsamples, uniform float image[]) { - ao_scanlines(taskIndex, taskIndex+1, width, height, nsubsamples, image); +void task ao_task(uniform int width, uniform int height, + uniform int nsubsamples, uniform float image[]) +{ + if (taskIndex0 >= taskCount0) return; + if (taskIndex1 >= taskCount1) return; + + const uniform int x0 = taskIndex0 * TILEX; + const uniform int x1 = min(x0 + TILEX, width); + + const uniform int y0 = taskIndex1 * TILEY; + const uniform int y1 = min(y0 + TILEY, height); + ao_tile(x0,x1,y0,y1, width, height, nsubsamples, image); } export void ao_ispc_tasks(uniform int w, uniform int h, uniform int nsubsamples, - uniform float image[]) { - launch[h] ao_task(w, h, nsubsamples, image); + uniform float image[]) +{ + const uniform int ntilex = (w+TILEX-1)/TILEX; + const uniform int ntiley = (h+TILEY-1)/TILEY; + launch[ntilex,ntiley] ao_task(w, h, nsubsamples, image); } diff --git a/examples_cuda/aobench/ao_cu.cpp b/examples_cuda/aobench/ao_cu.cpp old mode 100644 new mode 100755 index 747f8466..1432a380 --- a/examples_cuda/aobench/ao_cu.cpp +++ b/examples_cuda/aobench/ao_cu.cpp @@ -69,6 +69,211 @@ static inline double rtc(void) return etime; } +/******************************/ +#include +#include +#include +#include "drvapi_error_string.h" + +#define checkCudaErrors(err) __checkCudaErrors (err, __FILE__, __LINE__) +// These are the inline versions for all of the SDK helper functions +void __checkCudaErrors(CUresult err, const char *file, const int line) { + if(CUDA_SUCCESS != err) { + std::cerr << "checkCudeErrors() Driver API error = " << err << "\"" + << getCudaDrvErrorString(err) << "\" from file <" << file + << ", line " << line << "\n"; + exit(-1); + } +} + +/**********************/ +/* Basic CUDriver API */ +CUcontext context; + +void createContext(const int deviceId = 0) +{ + CUdevice device; + int devCount; + checkCudaErrors(cuInit(0)); + checkCudaErrors(cuDeviceGetCount(&devCount)); + assert(devCount > 0); + checkCudaErrors(cuDeviceGet(&device, deviceId < devCount ? deviceId : 0)); + + char name[128]; + checkCudaErrors(cuDeviceGetName(name, 128, device)); + std::cout << "Using CUDA Device [0]: " << name << "\n"; + + int devMajor, devMinor; + checkCudaErrors(cuDeviceComputeCapability(&devMajor, &devMinor, device)); + std::cout << "Device Compute Capability: " + << devMajor << "." << devMinor << "\n"; + if (devMajor < 2) { + std::cerr << "ERROR: Device 0 is not SM 2.0 or greater\n"; + exit(1); + } + + // Create driver context + checkCudaErrors(cuCtxCreate(&context, 0, device)); +} +void destroyContext() +{ + checkCudaErrors(cuCtxDestroy(context)); +} + +CUmodule loadModule(const char * module) +{ + CUmodule cudaModule; + // in this branch we use compilation with parameters + + const unsigned int jitNumOptions = 1; + CUjit_option *jitOptions = new CUjit_option[jitNumOptions]; + void **jitOptVals = new void*[jitNumOptions]; + // set up pointer to set the Maximum # of registers for a particular kernel + jitOptions[0] = CU_JIT_MAX_REGISTERS; + int jitRegCount = 64; + jitOptVals[0] = (void *)(size_t)jitRegCount; +#if 0 + + // set up size of compilation log buffer + jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; + int jitLogBufferSize = 1024; + jitOptVals[0] = (void *)(size_t)jitLogBufferSize; + + // set up pointer to the compilation log buffer + jitOptions[1] = CU_JIT_INFO_LOG_BUFFER; + char *jitLogBuffer = new char[jitLogBufferSize]; + jitOptVals[1] = jitLogBuffer; + + // set up pointer to set the Maximum # of registers for a particular kernel + jitOptions[2] = CU_JIT_MAX_REGISTERS; + int jitRegCount = 32; + jitOptVals[2] = (void *)(size_t)jitRegCount; +#endif + + checkCudaErrors(cuModuleLoadDataEx(&cudaModule, module,jitNumOptions, jitOptions, (void **)jitOptVals)); + return cudaModule; +} +void unloadModule(CUmodule &cudaModule) +{ + checkCudaErrors(cuModuleUnload(cudaModule)); +} + +CUfunction getFunction(CUmodule &cudaModule, const char * function) +{ + CUfunction cudaFunction; + checkCudaErrors(cuModuleGetFunction(&cudaFunction, cudaModule, function)); + return cudaFunction; +} + +CUdeviceptr deviceMalloc(const size_t size) +{ + CUdeviceptr d_buf; + checkCudaErrors(cuMemAlloc(&d_buf, size)); + return d_buf; +} +void deviceFree(CUdeviceptr d_buf) +{ + checkCudaErrors(cuMemFree(d_buf)); +} +void memcpyD2H(void * h_buf, CUdeviceptr d_buf, const size_t size) +{ + checkCudaErrors(cuMemcpyDtoH(h_buf, d_buf, size)); +} +void memcpyH2D(CUdeviceptr d_buf, void * h_buf, const size_t size) +{ + checkCudaErrors(cuMemcpyHtoD(d_buf, h_buf, size)); +} +#define deviceLaunch(func,nbx,nby,nbz,params) \ + checkCudaErrors(cuFuncSetCacheConfig((func), CU_FUNC_CACHE_PREFER_EQUAL)); \ + checkCudaErrors( \ + cuLaunchKernel( \ + (func), \ + ((nbx-1)/(128/32)+1), (nby), (nbz), \ + 128, 1, 1, \ + 0, NULL, (params), NULL \ + )); + +typedef CUdeviceptr devicePtr; + + +/**************/ +#include +std::vector readBinary(const char * filename) +{ + std::vector buffer; + FILE *fp = fopen(filename, "rb"); + if (!fp ) + { + fprintf(stderr, "file %s not found\n", filename); + assert(0); + } +#if 0 + char c; + while ((c = fgetc(fp)) != EOF) + buffer.push_back(c); +#else + fseek(fp, 0, SEEK_END); + const unsigned long long size = ftell(fp); /*calc the size needed*/ + fseek(fp, 0, SEEK_SET); + buffer.resize(size); + + if (fp == NULL){ /*ERROR detection if file == empty*/ + fprintf(stderr, "Error: There was an Error reading the file %s \n",filename); + exit(1); + } + else if (fread(&buffer[0], sizeof(char), size, fp) != size){ /* if count of read bytes != calculated size of .bin file -> ERROR*/ + fprintf(stderr, "Error: There was an Error reading the file %s \n", filename); + exit(1); + } +#endif + fprintf(stderr, " read buffer of size= %d bytes \n", (int)buffer.size()); + return buffer; +} + +extern "C" +{ + + void *CUDAAlloc(void **handlePtr, int64_t size, int32_t alignment) + { + return NULL; + } + void CUDALaunch( + void **handlePtr, + const char * module_name, + const char * module_1, + const char * func_name, + void **func_args, + int countx, int county, int countz) + { + assert(module_name != NULL); + assert(module_1 != NULL); + assert(func_name != NULL); + assert(func_args != NULL); +#if 0 + const char * module = module_1; +#else + const std::vector module_str = readBinary("kernel.cubin"); + const char * module = &module_str[0]; +#endif + CUmodule cudaModule = loadModule(module); + CUfunction cudaFunction = getFunction(cudaModule, func_name); + deviceLaunch(cudaFunction, countx, county, countz, func_args); + unloadModule(cudaModule); + } + void CUDASync(void *handle) + { + checkCudaErrors(cuStreamSynchronize(0)); + } + void ISPCSync(void *handle) + { + checkCudaErrors(cuStreamSynchronize(0)); + } + void CUDAFree(void *handle) + { + } +} +/******************************/ + #define NSUBSAMPLES 2 @@ -158,6 +363,11 @@ int main(int argc, char **argv) savePPM("ao-ispc.ppm", width, height); #endif + /*******************/ + createContext(); + /*******************/ + devicePtr d_fimg = deviceMalloc(width*height*3*sizeof(float)); + // // Run the ispc + tasks path, test_iterations times, and report the // minimum time for any of them. @@ -166,18 +376,28 @@ int main(int argc, char **argv) for (unsigned int i = 0; i < test_iterations; i++) { memset((void *)fimg, 0, sizeof(float) * width * height * 3); assert(NSUBSAMPLES == 2); + memcpyH2D(d_fimg, fimg, width*height*3*sizeof(float)); reset_and_start_timer(); const double t0 = rtc(); - ao_ispc_tasks(width, height, NSUBSAMPLES, fimg); + ao_ispc_tasks( + width, + height, + NSUBSAMPLES, + (float*)d_fimg); double t = (rtc() - t0); //get_elapsed_mcycles(); minTimeISPCTasks = std::min(minTimeISPCTasks, t); } + memcpyD2H(fimg, d_fimg, width*height*3*sizeof(float)); + // Report results and save image printf("[aobench ispc + tasks]:\t\t[%.3f] million cycles (%d x %d image)\n", minTimeISPCTasks, width, height); - savePPM("ao-ispc-tasks.ppm", width, height); + savePPM("ao-cuda.ppm", width, height); + /*******************/ + destroyContext(); + /*******************/ return 0; // diff --git a/examples_cuda/aobench/drvapi_error_string.h b/examples_cuda/aobench/drvapi_error_string.h new file mode 100644 index 00000000..ce85f152 --- /dev/null +++ b/examples_cuda/aobench/drvapi_error_string.h @@ -0,0 +1,370 @@ +/* + * Copyright 1993-2012 NVIDIA Corporation. All rights reserved. + * + * Please refer to the NVIDIA end user license agreement (EULA) associated + * with this source code for terms and conditions that govern your use of + * this software. Any use, reproduction, disclosure, or distribution of + * this software and related documentation outside the terms of the EULA + * is strictly prohibited. + * + */ + +#ifndef _DRVAPI_ERROR_STRING_H_ +#define _DRVAPI_ERROR_STRING_H_ + +#include +#include +#include + +// Error Code string definitions here +typedef struct +{ + char const *error_string; + int error_id; +} s_CudaErrorStr; + +/** + * Error codes + */ +static s_CudaErrorStr sCudaDrvErrorString[] = +{ + /** + * The API call returned with no errors. In the case of query calls, this + * can also mean that the operation being queried is complete (see + * ::cuEventQuery() and ::cuStreamQuery()). + */ + { "CUDA_SUCCESS", 0 }, + + /** + * This indicates that one or more of the parameters passed to the API call + * is not within an acceptable range of values. + */ + { "CUDA_ERROR_INVALID_VALUE", 1 }, + + /** + * The API call failed because it was unable to allocate enough memory to + * perform the requested operation. + */ + { "CUDA_ERROR_OUT_OF_MEMORY", 2 }, + + /** + * This indicates that the CUDA driver has not been initialized with + * ::cuInit() or that initialization has failed. + */ + { "CUDA_ERROR_NOT_INITIALIZED", 3 }, + + /** + * This indicates that the CUDA driver is in the process of shutting down. + */ + { "CUDA_ERROR_DEINITIALIZED", 4 }, + + /** + * This indicates profiling APIs are called while application is running + * in visual profiler mode. + */ + { "CUDA_ERROR_PROFILER_DISABLED", 5 }, + /** + * This indicates profiling has not been initialized for this context. + * Call cuProfilerInitialize() to resolve this. + */ + { "CUDA_ERROR_PROFILER_NOT_INITIALIZED", 6 }, + /** + * This indicates profiler has already been started and probably + * cuProfilerStart() is incorrectly called. + */ + { "CUDA_ERROR_PROFILER_ALREADY_STARTED", 7 }, + /** + * This indicates profiler has already been stopped and probably + * cuProfilerStop() is incorrectly called. + */ + { "CUDA_ERROR_PROFILER_ALREADY_STOPPED", 8 }, + /** + * This indicates that no CUDA-capable devices were detected by the installed + * CUDA driver. + */ + { "CUDA_ERROR_NO_DEVICE (no CUDA-capable devices were detected)", 100 }, + + /** + * This indicates that the device ordinal supplied by the user does not + * correspond to a valid CUDA device. + */ + { "CUDA_ERROR_INVALID_DEVICE (device specified is not a valid CUDA device)", 101 }, + + + /** + * This indicates that the device kernel image is invalid. This can also + * indicate an invalid CUDA module. + */ + { "CUDA_ERROR_INVALID_IMAGE", 200 }, + + /** + * This most frequently indicates that there is no context bound to the + * current thread. This can also be returned if the context passed to an + * API call is not a valid handle (such as a context that has had + * ::cuCtxDestroy() invoked on it). This can also be returned if a user + * mixes different API versions (i.e. 3010 context with 3020 API calls). + * See ::cuCtxGetApiVersion() for more details. + */ + { "CUDA_ERROR_INVALID_CONTEXT", 201 }, + + /** + * This indicated that the context being supplied as a parameter to the + * API call was already the active context. + * \deprecated + * This error return is deprecated as of CUDA 3.2. It is no longer an + * error to attempt to push the active context via ::cuCtxPushCurrent(). + */ + { "CUDA_ERROR_CONTEXT_ALREADY_CURRENT", 202 }, + + /** + * This indicates that a map or register operation has failed. + */ + { "CUDA_ERROR_MAP_FAILED", 205 }, + + /** + * This indicates that an unmap or unregister operation has failed. + */ + { "CUDA_ERROR_UNMAP_FAILED", 206 }, + + /** + * This indicates that the specified array is currently mapped and thus + * cannot be destroyed. + */ + { "CUDA_ERROR_ARRAY_IS_MAPPED", 207 }, + + /** + * This indicates that the resource is already mapped. + */ + { "CUDA_ERROR_ALREADY_MAPPED", 208 }, + + /** + * This indicates that there is no kernel image available that is suitable + * for the device. This can occur when a user specifies code generation + * options for a particular CUDA source file that do not include the + * corresponding device configuration. + */ + { "CUDA_ERROR_NO_BINARY_FOR_GPU", 209 }, + + /** + * This indicates that a resource has already been acquired. + */ + { "CUDA_ERROR_ALREADY_ACQUIRED", 210 }, + + /** + * This indicates that a resource is not mapped. + */ + { "CUDA_ERROR_NOT_MAPPED", 211 }, + + /** + * This indicates that a mapped resource is not available for access as an + * array. + */ + { "CUDA_ERROR_NOT_MAPPED_AS_ARRAY", 212 }, + + /** + * This indicates that a mapped resource is not available for access as a + * pointer. + */ + { "CUDA_ERROR_NOT_MAPPED_AS_POINTER", 213 }, + + /** + * This indicates that an uncorrectable ECC error was detected during + * execution. + */ + { "CUDA_ERROR_ECC_UNCORRECTABLE", 214 }, + + /** + * This indicates that the ::CUlimit passed to the API call is not + * supported by the active device. + */ + { "CUDA_ERROR_UNSUPPORTED_LIMIT", 215 }, + + /** + * This indicates that the ::CUcontext passed to the API call can + * only be bound to a single CPU thread at a time but is already + * bound to a CPU thread. + */ + { "CUDA_ERROR_CONTEXT_ALREADY_IN_USE", 216 }, + + /** + * This indicates that peer access is not supported across the given + * devices. + */ + { "CUDA_ERROR_PEER_ACCESS_UNSUPPORTED", 217}, + + /** + * This indicates that the device kernel source is invalid. + */ + { "CUDA_ERROR_INVALID_SOURCE", 300 }, + + /** + * This indicates that the file specified was not found. + */ + { "CUDA_ERROR_FILE_NOT_FOUND", 301 }, + + /** + * This indicates that a link to a shared object failed to resolve. + */ + { "CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND", 302 }, + + /** + * This indicates that initialization of a shared object failed. + */ + { "CUDA_ERROR_SHARED_OBJECT_INIT_FAILED", 303 }, + + /** + * This indicates that an OS call failed. + */ + { "CUDA_ERROR_OPERATING_SYSTEM", 304 }, + + + /** + * This indicates that a resource handle passed to the API call was not + * valid. Resource handles are opaque types like ::CUstream and ::CUevent. + */ + { "CUDA_ERROR_INVALID_HANDLE", 400 }, + + + /** + * This indicates that a named symbol was not found. Examples of symbols + * are global/constant variable names, texture names }, and surface names. + */ + { "CUDA_ERROR_NOT_FOUND", 500 }, + + + /** + * This indicates that asynchronous operations issued previously have not + * completed yet. This result is not actually an error, but must be indicated + * differently than ::CUDA_SUCCESS (which indicates completion). Calls that + * may return this value include ::cuEventQuery() and ::cuStreamQuery(). + */ + { "CUDA_ERROR_NOT_READY", 600 }, + + + /** + * An exception occurred on the device while executing a kernel. Common + * causes include dereferencing an invalid device pointer and accessing + * out of bounds shared memory. The context cannot be used }, so it must + * be destroyed (and a new one should be created). All existing device + * memory allocations from this context are invalid and must be + * reconstructed if the program is to continue using CUDA. + */ + { "CUDA_ERROR_LAUNCH_FAILED", 700 }, + + /** + * This indicates that a launch did not occur because it did not have + * appropriate resources. This error usually indicates that the user has + * attempted to pass too many arguments to the device kernel, or the + * kernel launch specifies too many threads for the kernel's register + * count. Passing arguments of the wrong size (i.e. a 64-bit pointer + * when a 32-bit int is expected) is equivalent to passing too many + * arguments and can also result in this error. + */ + { "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES", 701 }, + + /** + * This indicates that the device kernel took too long to execute. This can + * only occur if timeouts are enabled - see the device attribute + * ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT for more information. The + * context cannot be used (and must be destroyed similar to + * ::CUDA_ERROR_LAUNCH_FAILED). All existing device memory allocations from + * this context are invalid and must be reconstructed if the program is to + * continue using CUDA. + */ + { "CUDA_ERROR_LAUNCH_TIMEOUT", 702 }, + + /** + * This error indicates a kernel launch that uses an incompatible texturing + * mode. + */ + { "CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING", 703 }, + + /** + * This error indicates that a call to ::cuCtxEnablePeerAccess() is + * trying to re-enable peer access to a context which has already + * had peer access to it enabled. + */ + { "CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED", 704 }, + + /** + * This error indicates that ::cuCtxDisablePeerAccess() is + * trying to disable peer access which has not been enabled yet + * via ::cuCtxEnablePeerAccess(). + */ + { "CUDA_ERROR_PEER_ACCESS_NOT_ENABLED", 705 }, + + /** + * This error indicates that the primary context for the specified device + * has already been initialized. + */ + { "CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE", 708 }, + + /** + * This error indicates that the context current to the calling thread + * has been destroyed using ::cuCtxDestroy }, or is a primary context which + * has not yet been initialized. + */ + { "CUDA_ERROR_CONTEXT_IS_DESTROYED", 709 }, + + /** + * A device-side assert triggered during kernel execution. The context + * cannot be used anymore, and must be destroyed. All existing device + * memory allocations from this context are invalid and must be + * reconstructed if the program is to continue using CUDA. + */ + { "CUDA_ERROR_ASSERT", 710 }, + + /** + * This error indicates that the hardware resources required to enable + * peer access have been exhausted for one or more of the devices + * passed to ::cuCtxEnablePeerAccess(). + */ + { "CUDA_ERROR_TOO_MANY_PEERS", 711 }, + + /** + * This error indicates that the memory range passed to ::cuMemHostRegister() + * has already been registered. + */ + { "CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED", 712 }, + + /** + * This error indicates that the pointer passed to ::cuMemHostUnregister() + * does not correspond to any currently registered memory region. + */ + { "CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED", 713 }, + + /** + * This error indicates that the attempted operation is not permitted. + */ + { "CUDA_ERROR_NOT_PERMITTED", 800 }, + + /** + * This error indicates that the attempted operation is not supported + * on the current system or device. + */ + { "CUDA_ERROR_NOT_SUPPORTED", 801 }, + + /** + * This indicates that an unknown internal error has occurred. + */ + { "CUDA_ERROR_UNKNOWN", 999 }, + { NULL, -1 } +}; + +// This is just a linear search through the array, since the error_id's are not +// always ocurring consecutively +const char * getCudaDrvErrorString(CUresult error_id) +{ + int index = 0; + while (sCudaDrvErrorString[index].error_id != error_id && + sCudaDrvErrorString[index].error_id != -1) + { + index++; + } + if (sCudaDrvErrorString[index].error_id == error_id) + return (const char *)sCudaDrvErrorString[index].error_string; + else + return (const char *)"CUDA_ERROR not found!"; +} + +#endif diff --git a/examples_cuda/deferred/main_cu.cpp b/examples_cuda/deferred/main_cu.cpp index 884b5cd0..930db971 100755 --- a/examples_cuda/deferred/main_cu.cpp +++ b/examples_cuda/deferred/main_cu.cpp @@ -71,7 +71,8 @@ static inline double rtc(void) 1.e-6*((double) Tvalue.tv_usec); return etime; } -/******************************/ #include +/******************************/ +#include #include #include #include "drvapi_error_string.h"