added tuned code

This commit is contained in:
Evghenii
2013-11-10 16:02:10 +01:00
parent 66edc180be
commit 8a7801264a
6 changed files with 1063 additions and 366 deletions

View File

@@ -1,7 +1,7 @@
EXAMPLE=ao
CPP_SRC=ao.cpp ao_serial.cpp
ISPC_SRC=ao.ispc
ISPC_SRC=ao1.ispc
ISPC_IA_TARGETS=avx
ISPC_ARM_TARGETS=neon

View File

@@ -1,283 +1,362 @@
// -*- mode: c++ -*-
/*
Copyright (c) 2010-2011, Intel Corporation
All rights reserved.
Copyright (c) 2010-2011, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
Based on Syoyo Fujita's aobench: http://code.google.com/p/aobench
*/
Based on Syoyo Fujita's aobench: http://code.google.com/p/aobench
*/
#define NAO_SAMPLES 8
#define M_PI 3.1415926535f
//#define M_PI 3.1415926535f
typedef float<3> vec;
#define vec Float3
struct Float3
{
float x,y,z;
#ifdef __NVPTX__
#warning "emitting DEVICE code"
#define programCount warpSize()
#define programIndex laneIndex()
#define taskIndex blockIndex0()
#define taskCount blockCount0()
#else
#warning "emitting HOST code"
#endif
__device__ friend Float3 operator+(const Float3 a, const Float3 b)
{
Float3 c;
c.x = a.x+b.x;
c.y = a.y+b.y;
c.z = a.z+b.z;
return c;
}
__device__ friend Float3 operator-(const Float3 a, const Float3 b)
{
Float3 c;
c.x = a.x-b.x;
c.y = a.y-b.y;
c.z = a.z-b.z;
return c;
}
__device__ friend Float3 operator/(const Float3 a, const Float3 b)
{
Float3 c;
c.x = a.x/b.x;
c.y = a.y/b.y;
c.z = a.z/b.z;
return c;
}
__device__ friend Float3 operator/(const float a, const Float3 b)
{
Float3 c;
c.x = a/b.x;
c.y = a/b.y;
c.z = a/b.z;
return c;
}
__device__ friend Float3 operator*(const Float3 a, const Float3 b)
{
Float3 c;
c.x = a.x*b.x;
c.y = a.y*b.y;
c.z = a.z*b.z;
return c;
}
__device__ friend Float3 operator*(const Float3 a, const float b)
{
Float3 c;
c.x = a.x*b;
c.y = a.y*b;
c.z = a.z*b;
return c;
}
};
#define programCount 32
#define programIndex (threadIdx.x & 31)
#define taskIndex0 (blockIdx.x*4 + (threadIdx.x >> 5))
#define taskCount0 (gridDim.x*4)
#define taskIndex1 (blockIdx.y)
#define taskCount1 (gridDim.y)
#define warpIdx (threadIdx.x >> 5)
struct Isect {
float t;
vec p;
vec n;
int hit;
float t;
vec p;
vec n;
int hit;
};
struct Sphere {
vec center;
float radius;
vec center;
float radius;
};
struct Plane {
vec p;
vec n;
vec p;
vec n;
};
struct Ray {
vec org;
vec dir;
vec org;
vec dir;
};
__device__
static inline float dot(vec a, vec b) {
return a.x * b.x + a.y * b.y + a.z * b.z;
return a.x * b.x + a.y * b.y + a.z * b.z;
}
__device__
static inline vec vcross(vec v0, vec v1) {
vec ret;
ret.x = v0.y * v1.z - v0.z * v1.y;
ret.y = v0.z * v1.x - v0.x * v1.z;
ret.z = v0.x * v1.y - v0.y * v1.x;
return ret;
vec ret;
ret.x = v0.y * v1.z - v0.z * v1.y;
ret.y = v0.z * v1.x - v0.x * v1.z;
ret.z = v0.x * v1.y - v0.y * v1.x;
return ret;
}
__device__
static inline void vnormalize(vec &v) {
float len2 = dot(v, v);
float invlen = rsqrt(len2);
v *= invlen;
float len2 = dot(v, v);
float invlen = rsqrt(len2);
v = v*invlen;
}
__device__
static inline void
ray_plane_intersect(Isect &isect, Ray &ray, uniform Plane &plane) {
float d = -dot(plane.p, plane.n);
float v = dot(ray.dir, plane.n);
ray_plane_intersect(Isect &isect,const Ray &ray, const Plane &plane) {
float d = -dot(plane.p, plane.n);
float v = dot(ray.dir, plane.n);
if (abs(v) < 1.0e-17)
return;
else {
float t = -(dot(ray.org, plane.n) + d) / v;
if (abs(v) < 1.0e-17)
return;
else {
float t = -(dot(ray.org, plane.n) + d) / v;
if ((t > 0.0) && (t < isect.t)) {
isect.t = t;
isect.hit = 1;
isect.p = ray.org + ray.dir * t;
isect.n = plane.n;
}
if ((t > 0.0) && (t < isect.t)) {
isect.t = t;
isect.hit = 1;
isect.p = ray.org + ray.dir * t;
isect.n = plane.n;
}
}
}
__device__
static inline void
ray_sphere_intersect(Isect &isect, Ray &ray, uniform Sphere &sphere) {
vec rs = ray.org - sphere.center;
ray_sphere_intersect(Isect &isect,const Ray &ray, const Sphere &sphere) {
vec rs = ray.org - sphere.center;
float B = dot(rs, ray.dir);
float C = dot(rs, rs) - sphere.radius * sphere.radius;
float D = B * B - C;
float B = dot(rs, ray.dir);
float C = dot(rs, rs) - sphere.radius * sphere.radius;
float D = B * B - C;
if (D > 0.) {
float t = -B - sqrt(D);
if (D > 0.) {
float t = -B - sqrt(D);
if ((t > 0.0) && (t < isect.t)) {
isect.t = t;
isect.hit = 1;
isect.p = ray.org + t * ray.dir;
isect.n = isect.p - sphere.center;
vnormalize(isect.n);
}
if ((t > 0.0) && (t < isect.t)) {
isect.t = t;
isect.hit = 1;
isect.p = ray.org + ray.dir * t;
isect.n = isect.p - sphere.center;
vnormalize(isect.n);
}
}
}
__device__
static inline void
orthoBasis(vec basis[3], vec n) {
basis[2] = n;
basis[1].x = 0.0; basis[1].y = 0.0; basis[1].z = 0.0;
basis[2] = n;
basis[1].x = 0.0; basis[1].y = 0.0; basis[1].z = 0.0;
if ((n.x < 0.6) && (n.x > -0.6)) {
basis[1].x = 1.0;
} else if ((n.y < 0.6) && (n.y > -0.6)) {
basis[1].y = 1.0;
} else if ((n.z < 0.6) && (n.z > -0.6)) {
basis[1].z = 1.0;
} else {
basis[1].x = 1.0;
}
if ((n.x < 0.6) && (n.x > -0.6)) {
basis[1].x = 1.0;
} else if ((n.y < 0.6) && (n.y > -0.6)) {
basis[1].y = 1.0;
} else if ((n.z < 0.6) && (n.z > -0.6)) {
basis[1].z = 1.0;
} else {
basis[1].x = 1.0;
}
basis[0] = vcross(basis[1], basis[2]);
vnormalize(basis[0]);
basis[0] = vcross(basis[1], basis[2]);
vnormalize(basis[0]);
basis[1] = vcross(basis[2], basis[0]);
vnormalize(basis[1]);
basis[1] = vcross(basis[2], basis[0]);
vnormalize(basis[1]);
}
__device__
static inline float
ambient_occlusion(Isect &isect, uniform Plane &plane, uniform Sphere spheres[3],
RNGState &rngstate) {
float eps = 0.0001f;
vec p, n;
vec basis[3];
float occlusion = 0.0;
ambient_occlusion(Isect &isect, const Plane &plane, const Sphere spheres[3],
RNGState &rngstate) {
float eps = 0.0001f;
vec p; //, n;
vec basis[3];
float occlusion = 0.0;
p = isect.p + eps * isect.n;
p = isect.p + isect.n * eps;
orthoBasis(basis, isect.n);
orthoBasis(basis, isect.n);
static const uniform int ntheta = NAO_SAMPLES;
static const uniform int nphi = NAO_SAMPLES;
for (uniform int j = 0; j < ntheta; j++) {
for (uniform int i = 0; i < nphi; i++) {
Ray ray;
Isect occIsect;
const int ntheta = NAO_SAMPLES;
const int nphi = NAO_SAMPLES;
for ( int j = 0; j < ntheta; j++) {
for ( int i = 0; i < nphi; i++) {
Ray ray;
Isect occIsect;
float theta = sqrt(frandom(&rngstate));
float phi = 2.0f * M_PI * frandom(&rngstate);
float x = cos(phi) * theta;
float y = sin(phi) * theta;
float z = sqrt(1.0 - theta * theta);
float theta = sqrt(frandom(&rngstate));
float phi = 2.0f * M_PI * frandom(&rngstate);
float x = cos(phi) * theta;
float y = sin(phi) * theta;
float z = sqrt(1.0 - theta * theta);
// local . global
float rx = x * basis[0].x + y * basis[1].x + z * basis[2].x;
float ry = x * basis[0].y + y * basis[1].y + z * basis[2].y;
float rz = x * basis[0].z + y * basis[1].z + z * basis[2].z;
// local . global
float rx = x * basis[0].x + y * basis[1].x + z * basis[2].x;
float ry = x * basis[0].y + y * basis[1].y + z * basis[2].y;
float rz = x * basis[0].z + y * basis[1].z + z * basis[2].z;
ray.org = p;
ray.dir.x = rx;
ray.dir.y = ry;
ray.dir.z = rz;
ray.org = p;
ray.dir.x = rx;
ray.dir.y = ry;
ray.dir.z = rz;
occIsect.t = 1.0e+17;
occIsect.hit = 0;
occIsect.t = 1.0e+17;
occIsect.hit = 0;
for (uniform int snum = 0; snum < 3; ++snum)
ray_sphere_intersect(occIsect, ray, spheres[snum]);
ray_plane_intersect (occIsect, ray, plane);
for ( int snum = 0; snum < 3; ++snum)
ray_sphere_intersect(occIsect, ray, spheres[snum]);
ray_plane_intersect (occIsect, ray, plane);
if (occIsect.hit) occlusion += 1.0;
}
if (occIsect.hit) occlusion += 1.0;
}
}
occlusion = (ntheta * nphi - occlusion) / (float)(ntheta * nphi);
return occlusion;
occlusion = (ntheta * nphi - occlusion) / (float)(ntheta * nphi);
return occlusion;
}
/* Compute the image for the scanlines from [y0,y1), for an overall image
of width w and height h.
*/
static inline void ao_scanlines(uniform int y0, uniform int y1, uniform int w,
uniform int h, uniform int nsubsamples,
uniform float image[]) {
static uniform Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } };
static uniform Sphere spheres[3] = {
{ { -2.0f, 0.0f, -3.5f }, 0.5f },
{ { -0.5f, 0.0f, -3.0f }, 0.5f },
{ { 1.0f, 0.0f, -2.2f }, 0.5f } };
RNGState rngstate;
*/
__device__
static inline void ao_tile(
int x0, int x1,
int y0, int y1,
int w, int h,
int nsubsamples,
float image[])
{
const Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } };
const Sphere spheres[3] = {
{ { -2.0f, 0.0f, -3.5f }, 0.5f },
{ { -0.5f, 0.0f, -3.0f }, 0.5f },
{ { 1.0f, 0.0f, -2.2f }, 0.5f } };
RNGState rngstate;
seed_rng(&rngstate, programIndex + (y0 << (programIndex & 15)));
float invSamples = 1.f / nsubsamples;
foreach_tiled(y = y0 ... y1, x = 0 ... w,
u = 0 ... nsubsamples, v = 0 ... nsubsamples)
seed_rng(&rngstate, programIndex + (y0 << (programIndex & 31)));
float invSamples = 1.f / nsubsamples;
for ( int y = y0; y < y1; y++)
for ( int xb = x0; xb < x1; xb += programCount)
{
float du = (float)u * invSamples, dv = (float)v * invSamples;
const int x = xb + programIndex;
const int offset = 3 * (y * w + x);
float res = 0.0f;
// Figure out x,y pixel in NDC
float px = (x + du - (w / 2.0f)) / (w / 2.0f);
float py = -(y + dv - (h / 2.0f)) / (h / 2.0f);
float ret = 0.f;
Ray ray;
Isect isect;
for ( int u = 0; u < nsubsamples; u++)
for ( int v = 0; v < nsubsamples; v++)
{
float du = (float)u * invSamples, dv = (float)v * invSamples;
ray.org = 0.f;
// Figure out x,y pixel in NDC
float px = (x + du - (w / 2.0f)) / (w / 2.0f);
float py = -(y + dv - (h / 2.0f)) / (h / 2.0f);
float ret = 0.f;
Ray ray;
Isect isect;
// Poor man's perspective projection
ray.dir.x = px;
ray.dir.y = py;
ray.dir.z = -1.0;
vnormalize(ray.dir);
ray.org.x = 0.0f;
ray.org.y = 0.0f;
ray.org.z = 0.0f;
isect.t = 1.0e+17;
isect.hit = 0;
// Poor man's perspective projection
ray.dir.x = px;
ray.dir.y = py;
ray.dir.z = -1.0;
vnormalize(ray.dir);
for (uniform int snum = 0; snum < 3; ++snum)
isect.t = 1.0e+17;
isect.hit = 0;
for ( int snum = 0; snum < 3; ++snum)
ray_sphere_intersect(isect, ray, spheres[snum]);
ray_plane_intersect(isect, ray, plane);
ray_plane_intersect(isect, ray, plane);
// Note use of 'coherent' if statement; the set of rays we
// trace will often all hit or all miss the scene
if (isect.hit) {
// Note use of 'coherent' if statement; the set of rays we
// trace will often all hit or all miss the scene
if (isect.hit) {
ret = ambient_occlusion(isect, plane, spheres, rngstate);
ret *= invSamples * invSamples;
int offset = 3 * (y * w + x);
atomic_add_local(&image[offset], ret);
atomic_add_local(&image[offset+1], ret);
atomic_add_local(&image[offset+2], ret);
res += ret;
}
}
if (xb < x1)
{
image[offset ] = res;
image[offset+1] = res;
image[offset+2] = res;
}
}
}
export void ao_ispc(uniform int w, uniform int h, uniform int nsubsamples,
uniform float image[]) {
ao_scanlines(0, h, w, h, nsubsamples, image);
}
static void task ao_task(uniform int width, uniform int height,
uniform int nsubsamples, uniform float image[]) {
ao_scanlines(taskIndex, taskIndex+1, width, height, nsubsamples, image);
}
export void ao_ispc_tasks(uniform int w, uniform int h, uniform int nsubsamples,
uniform float image[]) {
launch[h] ao_task(w, h, nsubsamples, image);
#define TILEX 64
#define TILEY 4
extern "C"
__global__
void ao_task( int width, int height,
int nsubsamples, float image[])
{
if (taskIndex0 >= taskCount0) return;
if (taskIndex1 >= taskCount1) return;
const int x0 = taskIndex0 * TILEX;
const int x1 = min(x0 + TILEX, width);
const int y0 = taskIndex1 * TILEY;
const int y1 = min(y0 + TILEY, height);
ao_tile(x0,x1,y0,y1, width, height, nsubsamples, image);
}

View File

@@ -1,39 +1,39 @@
// -*- mode: c++ -*-
/*
Copyright (c) 2010-2011, Intel Corporation
All rights reserved.
Copyright (c) 2010-2011, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
Based on Syoyo Fujita's aobench: http://code.google.com/p/aobench
*/
Based on Syoyo Fujita's aobench: http://code.google.com/p/aobench
*/
#define NAO_SAMPLES 8
#define M_PI 3.1415926535f
@@ -44,240 +44,267 @@ typedef float<3> vec;
#warning "emitting DEVICE code"
#define programCount warpSize()
#define programIndex laneIndex()
#define taskIndex blockIndex0()
#define taskCount blockCount0()
#define taskIndex0 blockIndex0()
#define taskCount0 blockCount0()
#define taskIndex1 blockIndex1()
#define taskCount1 blockCount1()
#else
#warning "emitting HOST code"
#endif
struct Isect {
float t;
vec p;
vec n;
int hit;
float t;
vec p;
vec n;
int hit;
};
struct Sphere {
vec center;
float radius;
vec center;
float radius;
};
struct Plane {
vec p;
vec n;
vec p;
vec n;
};
struct Ray {
vec org;
vec dir;
vec org;
vec dir;
};
static inline float dot(vec a, vec b) {
return a.x * b.x + a.y * b.y + a.z * b.z;
return a.x * b.x + a.y * b.y + a.z * b.z;
}
static inline vec vcross(vec v0, vec v1) {
vec ret;
ret.x = v0.y * v1.z - v0.z * v1.y;
ret.y = v0.z * v1.x - v0.x * v1.z;
ret.z = v0.x * v1.y - v0.y * v1.x;
return ret;
vec ret;
ret.x = v0.y * v1.z - v0.z * v1.y;
ret.y = v0.z * v1.x - v0.x * v1.z;
ret.z = v0.x * v1.y - v0.y * v1.x;
return ret;
}
static inline void vnormalize(vec &v) {
float len2 = dot(v, v);
float invlen = rsqrt(len2);
v *= invlen;
float len2 = dot(v, v);
float invlen = rsqrt(len2);
v *= invlen;
}
static inline void
ray_plane_intersect(Isect &isect, Ray &ray, uniform Plane &plane) {
float d = -dot(plane.p, plane.n);
float v = dot(ray.dir, plane.n);
float d = -dot(plane.p, plane.n);
float v = dot(ray.dir, plane.n);
if (abs(v) < 1.0e-17)
return;
else {
float t = -(dot(ray.org, plane.n) + d) / v;
if (abs(v) < 1.0e-17)
return;
else {
float t = -(dot(ray.org, plane.n) + d) / v;
if ((t > 0.0) && (t < isect.t)) {
isect.t = t;
isect.hit = 1;
isect.p = ray.org + ray.dir * t;
isect.n = plane.n;
}
if ((t > 0.0) && (t < isect.t)) {
isect.t = t;
isect.hit = 1;
isect.p = ray.org + ray.dir * t;
isect.n = plane.n;
}
}
}
static inline void
ray_sphere_intersect(Isect &isect, Ray &ray, uniform Sphere &sphere) {
vec rs = ray.org - sphere.center;
vec rs = ray.org - sphere.center;
float B = dot(rs, ray.dir);
float C = dot(rs, rs) - sphere.radius * sphere.radius;
float D = B * B - C;
float B = dot(rs, ray.dir);
float C = dot(rs, rs) - sphere.radius * sphere.radius;
float D = B * B - C;
if (D > 0.) {
float t = -B - sqrt(D);
if (D > 0.) {
float t = -B - sqrt(D);
if ((t > 0.0) && (t < isect.t)) {
isect.t = t;
isect.hit = 1;
isect.p = ray.org + t * ray.dir;
isect.n = isect.p - sphere.center;
vnormalize(isect.n);
}
if ((t > 0.0) && (t < isect.t)) {
isect.t = t;
isect.hit = 1;
isect.p = ray.org + t * ray.dir;
isect.n = isect.p - sphere.center;
vnormalize(isect.n);
}
}
}
static inline void
orthoBasis(vec basis[3], vec n) {
basis[2] = n;
basis[1].x = 0.0; basis[1].y = 0.0; basis[1].z = 0.0;
basis[2] = n;
basis[1].x = 0.0; basis[1].y = 0.0; basis[1].z = 0.0;
if ((n.x < 0.6) && (n.x > -0.6)) {
basis[1].x = 1.0;
} else if ((n.y < 0.6) && (n.y > -0.6)) {
basis[1].y = 1.0;
} else if ((n.z < 0.6) && (n.z > -0.6)) {
basis[1].z = 1.0;
} else {
basis[1].x = 1.0;
}
if ((n.x < 0.6) && (n.x > -0.6)) {
basis[1].x = 1.0;
} else if ((n.y < 0.6) && (n.y > -0.6)) {
basis[1].y = 1.0;
} else if ((n.z < 0.6) && (n.z > -0.6)) {
basis[1].z = 1.0;
} else {
basis[1].x = 1.0;
}
basis[0] = vcross(basis[1], basis[2]);
vnormalize(basis[0]);
basis[0] = vcross(basis[1], basis[2]);
vnormalize(basis[0]);
basis[1] = vcross(basis[2], basis[0]);
vnormalize(basis[1]);
basis[1] = vcross(basis[2], basis[0]);
vnormalize(basis[1]);
}
static inline float
ambient_occlusion(Isect &isect, uniform Plane &plane, uniform Sphere spheres[3],
RNGState &rngstate) {
float eps = 0.0001f;
vec p, n;
vec basis[3];
float occlusion = 0.0;
RNGState &rngstate) {
float eps = 0.0001f;
vec p, n;
vec basis[3];
float occlusion = 0.0;
p = isect.p + eps * isect.n;
p = isect.p + eps * isect.n;
orthoBasis(basis, isect.n);
orthoBasis(basis, isect.n);
static const uniform int ntheta = NAO_SAMPLES;
static const uniform int nphi = NAO_SAMPLES;
for (uniform int j = 0; j < ntheta; j++) {
for (uniform int i = 0; i < nphi; i++) {
Ray ray;
Isect occIsect;
const uniform int ntheta = NAO_SAMPLES;
const uniform int nphi = NAO_SAMPLES;
for (uniform int j = 0; j < ntheta; j++) {
for (uniform int i = 0; i < nphi; i++) {
Ray ray;
Isect occIsect;
float theta = sqrt(frandom(&rngstate));
float phi = 2.0f * M_PI * frandom(&rngstate);
float x = cos(phi) * theta;
float y = sin(phi) * theta;
float z = sqrt(1.0 - theta * theta);
float theta = sqrt(frandom(&rngstate));
float phi = 2.0f * M_PI * frandom(&rngstate);
float x = cos(phi) * theta;
float y = sin(phi) * theta;
float z = sqrt(1.0 - theta * theta);
// local . global
float rx = x * basis[0].x + y * basis[1].x + z * basis[2].x;
float ry = x * basis[0].y + y * basis[1].y + z * basis[2].y;
float rz = x * basis[0].z + y * basis[1].z + z * basis[2].z;
// local . global
float rx = x * basis[0].x + y * basis[1].x + z * basis[2].x;
float ry = x * basis[0].y + y * basis[1].y + z * basis[2].y;
float rz = x * basis[0].z + y * basis[1].z + z * basis[2].z;
ray.org = p;
ray.dir.x = rx;
ray.dir.y = ry;
ray.dir.z = rz;
ray.org = p;
ray.dir.x = rx;
ray.dir.y = ry;
ray.dir.z = rz;
occIsect.t = 1.0e+17;
occIsect.hit = 0;
occIsect.t = 1.0e+17;
occIsect.hit = 0;
for (uniform int snum = 0; snum < 3; ++snum)
ray_sphere_intersect(occIsect, ray, spheres[snum]);
ray_plane_intersect (occIsect, ray, plane);
for (uniform int snum = 0; snum < 3; ++snum)
ray_sphere_intersect(occIsect, ray, spheres[snum]);
ray_plane_intersect (occIsect, ray, plane);
if (occIsect.hit) occlusion += 1.0;
}
if (occIsect.hit) occlusion += 1.0;
}
}
occlusion = (ntheta * nphi - occlusion) / (float)(ntheta * nphi);
return occlusion;
occlusion = (ntheta * nphi - occlusion) / (float)(ntheta * nphi);
return occlusion;
}
/* Compute the image for the scanlines from [y0,y1), for an overall image
of width w and height h.
*/
static inline void ao_scanlines(uniform int y0, uniform int y1, uniform int w,
uniform int h, uniform int nsubsamples,
uniform float image[]) {
static uniform Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } };
static uniform Sphere spheres[3] = {
{ { -2.0f, 0.0f, -3.5f }, 0.5f },
{ { -0.5f, 0.0f, -3.0f }, 0.5f },
{ { 1.0f, 0.0f, -2.2f }, 0.5f } };
RNGState rngstate;
*/
static inline void ao_tile(
uniform int x0, uniform int x1,
uniform int y0, uniform int y1,
uniform int w, uniform int h,
uniform int nsubsamples,
uniform float image[])
{
uniform Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } };
uniform Sphere spheres[3] = {
{ { -2.0f, 0.0f, -3.5f }, 0.5f },
{ { -0.5f, 0.0f, -3.0f }, 0.5f },
{ { 1.0f, 0.0f, -2.2f }, 0.5f } };
RNGState rngstate;
seed_rng(&rngstate, programIndex + (y0 << (programIndex & 15)));
float invSamples = 1.f / nsubsamples;
foreach_tiled(y = y0 ... y1, x = 0 ... w,
u = 0 ... nsubsamples, v = 0 ... nsubsamples)
seed_rng(&rngstate, programIndex + (y0 << (programIndex & 31)));
float invSamples = 1.f / nsubsamples;
for (uniform int y = y0; y < y1; y++)
for (uniform int xb = x0; xb < x1; xb += programCount)
{
float du = (float)u * invSamples, dv = (float)v * invSamples;
const int x = xb + programIndex;
const int offset = 3 * (y * w + x);
float res = 0.0f;
// Figure out x,y pixel in NDC
float px = (x + du - (w / 2.0f)) / (w / 2.0f);
float py = -(y + dv - (h / 2.0f)) / (h / 2.0f);
float ret = 0.f;
Ray ray;
Isect isect;
for (uniform int u = 0; u < nsubsamples; u++)
for (uniform int v = 0; v < nsubsamples; v++)
{
float du = (float)u * invSamples, dv = (float)v * invSamples;
ray.org = 0.f;
// Figure out x,y pixel in NDC
float px = (x + du - (w / 2.0f)) / (w / 2.0f);
float py = -(y + dv - (h / 2.0f)) / (h / 2.0f);
float ret = 0.f;
Ray ray;
Isect isect;
// Poor man's perspective projection
ray.dir.x = px;
ray.dir.y = py;
ray.dir.z = -1.0;
vnormalize(ray.dir);
ray.org = 0.f;
isect.t = 1.0e+17;
isect.hit = 0;
// Poor man's perspective projection
ray.dir.x = px;
ray.dir.y = py;
ray.dir.z = -1.0;
vnormalize(ray.dir);
for (uniform int snum = 0; snum < 3; ++snum)
isect.t = 1.0e+17;
isect.hit = 0;
for (uniform int snum = 0; snum < 3; ++snum)
ray_sphere_intersect(isect, ray, spheres[snum]);
ray_plane_intersect(isect, ray, plane);
ray_plane_intersect(isect, ray, plane);
// Note use of 'coherent' if statement; the set of rays we
// trace will often all hit or all miss the scene
if (isect.hit) {
// Note use of 'coherent' if statement; the set of rays we
// trace will often all hit or all miss the scene
if (isect.hit) {
ret = ambient_occlusion(isect, plane, spheres, rngstate);
ret *= invSamples * invSamples;
int offset = 3 * (y * w + x);
atomic_add_local(&image[offset], ret);
atomic_add_local(&image[offset+1], ret);
atomic_add_local(&image[offset+2], ret);
res += ret;
}
}
if (xb < x1)
{
image[offset ] = res;
image[offset+1] = res;
image[offset+2] = res;
}
}
}
export void ao_ispc(uniform int w, uniform int h, uniform int nsubsamples,
uniform float image[]) {
ao_scanlines(0, h, w, h, nsubsamples, image);
}
#define TILEX 64
#define TILEY 4
static void task ao_task(uniform int width, uniform int height,
uniform int nsubsamples, uniform float image[]) {
ao_scanlines(taskIndex, taskIndex+1, width, height, nsubsamples, image);
void task ao_task(uniform int width, uniform int height,
uniform int nsubsamples, uniform float image[])
{
if (taskIndex0 >= taskCount0) return;
if (taskIndex1 >= taskCount1) return;
const uniform int x0 = taskIndex0 * TILEX;
const uniform int x1 = min(x0 + TILEX, width);
const uniform int y0 = taskIndex1 * TILEY;
const uniform int y1 = min(y0 + TILEY, height);
ao_tile(x0,x1,y0,y1, width, height, nsubsamples, image);
}
export void ao_ispc_tasks(uniform int w, uniform int h, uniform int nsubsamples,
uniform float image[]) {
launch[h] ao_task(w, h, nsubsamples, image);
uniform float image[])
{
const uniform int ntilex = (w+TILEX-1)/TILEX;
const uniform int ntiley = (h+TILEY-1)/TILEY;
launch[ntilex,ntiley] ao_task(w, h, nsubsamples, image);
}

224
examples_cuda/aobench/ao_cu.cpp Normal file → Executable file
View File

@@ -69,6 +69,211 @@ static inline double rtc(void)
return etime;
}
/******************************/
#include <cassert>
#include <iostream>
#include <cuda.h>
#include "drvapi_error_string.h"
#define checkCudaErrors(err) __checkCudaErrors (err, __FILE__, __LINE__)
// These are the inline versions for all of the SDK helper functions
void __checkCudaErrors(CUresult err, const char *file, const int line) {
if(CUDA_SUCCESS != err) {
std::cerr << "checkCudeErrors() Driver API error = " << err << "\""
<< getCudaDrvErrorString(err) << "\" from file <" << file
<< ", line " << line << "\n";
exit(-1);
}
}
/**********************/
/* Basic CUDriver API */
CUcontext context;
void createContext(const int deviceId = 0)
{
CUdevice device;
int devCount;
checkCudaErrors(cuInit(0));
checkCudaErrors(cuDeviceGetCount(&devCount));
assert(devCount > 0);
checkCudaErrors(cuDeviceGet(&device, deviceId < devCount ? deviceId : 0));
char name[128];
checkCudaErrors(cuDeviceGetName(name, 128, device));
std::cout << "Using CUDA Device [0]: " << name << "\n";
int devMajor, devMinor;
checkCudaErrors(cuDeviceComputeCapability(&devMajor, &devMinor, device));
std::cout << "Device Compute Capability: "
<< devMajor << "." << devMinor << "\n";
if (devMajor < 2) {
std::cerr << "ERROR: Device 0 is not SM 2.0 or greater\n";
exit(1);
}
// Create driver context
checkCudaErrors(cuCtxCreate(&context, 0, device));
}
void destroyContext()
{
checkCudaErrors(cuCtxDestroy(context));
}
CUmodule loadModule(const char * module)
{
CUmodule cudaModule;
// in this branch we use compilation with parameters
const unsigned int jitNumOptions = 1;
CUjit_option *jitOptions = new CUjit_option[jitNumOptions];
void **jitOptVals = new void*[jitNumOptions];
// set up pointer to set the Maximum # of registers for a particular kernel
jitOptions[0] = CU_JIT_MAX_REGISTERS;
int jitRegCount = 64;
jitOptVals[0] = (void *)(size_t)jitRegCount;
#if 0
// set up size of compilation log buffer
jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
int jitLogBufferSize = 1024;
jitOptVals[0] = (void *)(size_t)jitLogBufferSize;
// set up pointer to the compilation log buffer
jitOptions[1] = CU_JIT_INFO_LOG_BUFFER;
char *jitLogBuffer = new char[jitLogBufferSize];
jitOptVals[1] = jitLogBuffer;
// set up pointer to set the Maximum # of registers for a particular kernel
jitOptions[2] = CU_JIT_MAX_REGISTERS;
int jitRegCount = 32;
jitOptVals[2] = (void *)(size_t)jitRegCount;
#endif
checkCudaErrors(cuModuleLoadDataEx(&cudaModule, module,jitNumOptions, jitOptions, (void **)jitOptVals));
return cudaModule;
}
void unloadModule(CUmodule &cudaModule)
{
checkCudaErrors(cuModuleUnload(cudaModule));
}
CUfunction getFunction(CUmodule &cudaModule, const char * function)
{
CUfunction cudaFunction;
checkCudaErrors(cuModuleGetFunction(&cudaFunction, cudaModule, function));
return cudaFunction;
}
CUdeviceptr deviceMalloc(const size_t size)
{
CUdeviceptr d_buf;
checkCudaErrors(cuMemAlloc(&d_buf, size));
return d_buf;
}
void deviceFree(CUdeviceptr d_buf)
{
checkCudaErrors(cuMemFree(d_buf));
}
void memcpyD2H(void * h_buf, CUdeviceptr d_buf, const size_t size)
{
checkCudaErrors(cuMemcpyDtoH(h_buf, d_buf, size));
}
void memcpyH2D(CUdeviceptr d_buf, void * h_buf, const size_t size)
{
checkCudaErrors(cuMemcpyHtoD(d_buf, h_buf, size));
}
#define deviceLaunch(func,nbx,nby,nbz,params) \
checkCudaErrors(cuFuncSetCacheConfig((func), CU_FUNC_CACHE_PREFER_EQUAL)); \
checkCudaErrors( \
cuLaunchKernel( \
(func), \
((nbx-1)/(128/32)+1), (nby), (nbz), \
128, 1, 1, \
0, NULL, (params), NULL \
));
typedef CUdeviceptr devicePtr;
/**************/
#include <vector>
std::vector<char> readBinary(const char * filename)
{
std::vector<char> buffer;
FILE *fp = fopen(filename, "rb");
if (!fp )
{
fprintf(stderr, "file %s not found\n", filename);
assert(0);
}
#if 0
char c;
while ((c = fgetc(fp)) != EOF)
buffer.push_back(c);
#else
fseek(fp, 0, SEEK_END);
const unsigned long long size = ftell(fp); /*calc the size needed*/
fseek(fp, 0, SEEK_SET);
buffer.resize(size);
if (fp == NULL){ /*ERROR detection if file == empty*/
fprintf(stderr, "Error: There was an Error reading the file %s \n",filename);
exit(1);
}
else if (fread(&buffer[0], sizeof(char), size, fp) != size){ /* if count of read bytes != calculated size of .bin file -> ERROR*/
fprintf(stderr, "Error: There was an Error reading the file %s \n", filename);
exit(1);
}
#endif
fprintf(stderr, " read buffer of size= %d bytes \n", (int)buffer.size());
return buffer;
}
extern "C"
{
void *CUDAAlloc(void **handlePtr, int64_t size, int32_t alignment)
{
return NULL;
}
void CUDALaunch(
void **handlePtr,
const char * module_name,
const char * module_1,
const char * func_name,
void **func_args,
int countx, int county, int countz)
{
assert(module_name != NULL);
assert(module_1 != NULL);
assert(func_name != NULL);
assert(func_args != NULL);
#if 0
const char * module = module_1;
#else
const std::vector<char> module_str = readBinary("kernel.cubin");
const char * module = &module_str[0];
#endif
CUmodule cudaModule = loadModule(module);
CUfunction cudaFunction = getFunction(cudaModule, func_name);
deviceLaunch(cudaFunction, countx, county, countz, func_args);
unloadModule(cudaModule);
}
void CUDASync(void *handle)
{
checkCudaErrors(cuStreamSynchronize(0));
}
void ISPCSync(void *handle)
{
checkCudaErrors(cuStreamSynchronize(0));
}
void CUDAFree(void *handle)
{
}
}
/******************************/
#define NSUBSAMPLES 2
@@ -158,6 +363,11 @@ int main(int argc, char **argv)
savePPM("ao-ispc.ppm", width, height);
#endif
/*******************/
createContext();
/*******************/
devicePtr d_fimg = deviceMalloc(width*height*3*sizeof(float));
//
// Run the ispc + tasks path, test_iterations times, and report the
// minimum time for any of them.
@@ -166,18 +376,28 @@ int main(int argc, char **argv)
for (unsigned int i = 0; i < test_iterations; i++) {
memset((void *)fimg, 0, sizeof(float) * width * height * 3);
assert(NSUBSAMPLES == 2);
memcpyH2D(d_fimg, fimg, width*height*3*sizeof(float));
reset_and_start_timer();
const double t0 = rtc();
ao_ispc_tasks(width, height, NSUBSAMPLES, fimg);
ao_ispc_tasks(
width,
height,
NSUBSAMPLES,
(float*)d_fimg);
double t = (rtc() - t0); //get_elapsed_mcycles();
minTimeISPCTasks = std::min(minTimeISPCTasks, t);
}
memcpyD2H(fimg, d_fimg, width*height*3*sizeof(float));
// Report results and save image
printf("[aobench ispc + tasks]:\t\t[%.3f] million cycles (%d x %d image)\n",
minTimeISPCTasks, width, height);
savePPM("ao-ispc-tasks.ppm", width, height);
savePPM("ao-cuda.ppm", width, height);
/*******************/
destroyContext();
/*******************/
return 0;
//

View File

@@ -0,0 +1,370 @@
/*
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#ifndef _DRVAPI_ERROR_STRING_H_
#define _DRVAPI_ERROR_STRING_H_
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
// Error Code string definitions here
typedef struct
{
char const *error_string;
int error_id;
} s_CudaErrorStr;
/**
* Error codes
*/
static s_CudaErrorStr sCudaDrvErrorString[] =
{
/**
* The API call returned with no errors. In the case of query calls, this
* can also mean that the operation being queried is complete (see
* ::cuEventQuery() and ::cuStreamQuery()).
*/
{ "CUDA_SUCCESS", 0 },
/**
* This indicates that one or more of the parameters passed to the API call
* is not within an acceptable range of values.
*/
{ "CUDA_ERROR_INVALID_VALUE", 1 },
/**
* The API call failed because it was unable to allocate enough memory to
* perform the requested operation.
*/
{ "CUDA_ERROR_OUT_OF_MEMORY", 2 },
/**
* This indicates that the CUDA driver has not been initialized with
* ::cuInit() or that initialization has failed.
*/
{ "CUDA_ERROR_NOT_INITIALIZED", 3 },
/**
* This indicates that the CUDA driver is in the process of shutting down.
*/
{ "CUDA_ERROR_DEINITIALIZED", 4 },
/**
* This indicates profiling APIs are called while application is running
* in visual profiler mode.
*/
{ "CUDA_ERROR_PROFILER_DISABLED", 5 },
/**
* This indicates profiling has not been initialized for this context.
* Call cuProfilerInitialize() to resolve this.
*/
{ "CUDA_ERROR_PROFILER_NOT_INITIALIZED", 6 },
/**
* This indicates profiler has already been started and probably
* cuProfilerStart() is incorrectly called.
*/
{ "CUDA_ERROR_PROFILER_ALREADY_STARTED", 7 },
/**
* This indicates profiler has already been stopped and probably
* cuProfilerStop() is incorrectly called.
*/
{ "CUDA_ERROR_PROFILER_ALREADY_STOPPED", 8 },
/**
* This indicates that no CUDA-capable devices were detected by the installed
* CUDA driver.
*/
{ "CUDA_ERROR_NO_DEVICE (no CUDA-capable devices were detected)", 100 },
/**
* This indicates that the device ordinal supplied by the user does not
* correspond to a valid CUDA device.
*/
{ "CUDA_ERROR_INVALID_DEVICE (device specified is not a valid CUDA device)", 101 },
/**
* This indicates that the device kernel image is invalid. This can also
* indicate an invalid CUDA module.
*/
{ "CUDA_ERROR_INVALID_IMAGE", 200 },
/**
* This most frequently indicates that there is no context bound to the
* current thread. This can also be returned if the context passed to an
* API call is not a valid handle (such as a context that has had
* ::cuCtxDestroy() invoked on it). This can also be returned if a user
* mixes different API versions (i.e. 3010 context with 3020 API calls).
* See ::cuCtxGetApiVersion() for more details.
*/
{ "CUDA_ERROR_INVALID_CONTEXT", 201 },
/**
* This indicated that the context being supplied as a parameter to the
* API call was already the active context.
* \deprecated
* This error return is deprecated as of CUDA 3.2. It is no longer an
* error to attempt to push the active context via ::cuCtxPushCurrent().
*/
{ "CUDA_ERROR_CONTEXT_ALREADY_CURRENT", 202 },
/**
* This indicates that a map or register operation has failed.
*/
{ "CUDA_ERROR_MAP_FAILED", 205 },
/**
* This indicates that an unmap or unregister operation has failed.
*/
{ "CUDA_ERROR_UNMAP_FAILED", 206 },
/**
* This indicates that the specified array is currently mapped and thus
* cannot be destroyed.
*/
{ "CUDA_ERROR_ARRAY_IS_MAPPED", 207 },
/**
* This indicates that the resource is already mapped.
*/
{ "CUDA_ERROR_ALREADY_MAPPED", 208 },
/**
* This indicates that there is no kernel image available that is suitable
* for the device. This can occur when a user specifies code generation
* options for a particular CUDA source file that do not include the
* corresponding device configuration.
*/
{ "CUDA_ERROR_NO_BINARY_FOR_GPU", 209 },
/**
* This indicates that a resource has already been acquired.
*/
{ "CUDA_ERROR_ALREADY_ACQUIRED", 210 },
/**
* This indicates that a resource is not mapped.
*/
{ "CUDA_ERROR_NOT_MAPPED", 211 },
/**
* This indicates that a mapped resource is not available for access as an
* array.
*/
{ "CUDA_ERROR_NOT_MAPPED_AS_ARRAY", 212 },
/**
* This indicates that a mapped resource is not available for access as a
* pointer.
*/
{ "CUDA_ERROR_NOT_MAPPED_AS_POINTER", 213 },
/**
* This indicates that an uncorrectable ECC error was detected during
* execution.
*/
{ "CUDA_ERROR_ECC_UNCORRECTABLE", 214 },
/**
* This indicates that the ::CUlimit passed to the API call is not
* supported by the active device.
*/
{ "CUDA_ERROR_UNSUPPORTED_LIMIT", 215 },
/**
* This indicates that the ::CUcontext passed to the API call can
* only be bound to a single CPU thread at a time but is already
* bound to a CPU thread.
*/
{ "CUDA_ERROR_CONTEXT_ALREADY_IN_USE", 216 },
/**
* This indicates that peer access is not supported across the given
* devices.
*/
{ "CUDA_ERROR_PEER_ACCESS_UNSUPPORTED", 217},
/**
* This indicates that the device kernel source is invalid.
*/
{ "CUDA_ERROR_INVALID_SOURCE", 300 },
/**
* This indicates that the file specified was not found.
*/
{ "CUDA_ERROR_FILE_NOT_FOUND", 301 },
/**
* This indicates that a link to a shared object failed to resolve.
*/
{ "CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND", 302 },
/**
* This indicates that initialization of a shared object failed.
*/
{ "CUDA_ERROR_SHARED_OBJECT_INIT_FAILED", 303 },
/**
* This indicates that an OS call failed.
*/
{ "CUDA_ERROR_OPERATING_SYSTEM", 304 },
/**
* This indicates that a resource handle passed to the API call was not
* valid. Resource handles are opaque types like ::CUstream and ::CUevent.
*/
{ "CUDA_ERROR_INVALID_HANDLE", 400 },
/**
* This indicates that a named symbol was not found. Examples of symbols
* are global/constant variable names, texture names }, and surface names.
*/
{ "CUDA_ERROR_NOT_FOUND", 500 },
/**
* This indicates that asynchronous operations issued previously have not
* completed yet. This result is not actually an error, but must be indicated
* differently than ::CUDA_SUCCESS (which indicates completion). Calls that
* may return this value include ::cuEventQuery() and ::cuStreamQuery().
*/
{ "CUDA_ERROR_NOT_READY", 600 },
/**
* An exception occurred on the device while executing a kernel. Common
* causes include dereferencing an invalid device pointer and accessing
* out of bounds shared memory. The context cannot be used }, so it must
* be destroyed (and a new one should be created). All existing device
* memory allocations from this context are invalid and must be
* reconstructed if the program is to continue using CUDA.
*/
{ "CUDA_ERROR_LAUNCH_FAILED", 700 },
/**
* This indicates that a launch did not occur because it did not have
* appropriate resources. This error usually indicates that the user has
* attempted to pass too many arguments to the device kernel, or the
* kernel launch specifies too many threads for the kernel's register
* count. Passing arguments of the wrong size (i.e. a 64-bit pointer
* when a 32-bit int is expected) is equivalent to passing too many
* arguments and can also result in this error.
*/
{ "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES", 701 },
/**
* This indicates that the device kernel took too long to execute. This can
* only occur if timeouts are enabled - see the device attribute
* ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT for more information. The
* context cannot be used (and must be destroyed similar to
* ::CUDA_ERROR_LAUNCH_FAILED). All existing device memory allocations from
* this context are invalid and must be reconstructed if the program is to
* continue using CUDA.
*/
{ "CUDA_ERROR_LAUNCH_TIMEOUT", 702 },
/**
* This error indicates a kernel launch that uses an incompatible texturing
* mode.
*/
{ "CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING", 703 },
/**
* This error indicates that a call to ::cuCtxEnablePeerAccess() is
* trying to re-enable peer access to a context which has already
* had peer access to it enabled.
*/
{ "CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED", 704 },
/**
* This error indicates that ::cuCtxDisablePeerAccess() is
* trying to disable peer access which has not been enabled yet
* via ::cuCtxEnablePeerAccess().
*/
{ "CUDA_ERROR_PEER_ACCESS_NOT_ENABLED", 705 },
/**
* This error indicates that the primary context for the specified device
* has already been initialized.
*/
{ "CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE", 708 },
/**
* This error indicates that the context current to the calling thread
* has been destroyed using ::cuCtxDestroy }, or is a primary context which
* has not yet been initialized.
*/
{ "CUDA_ERROR_CONTEXT_IS_DESTROYED", 709 },
/**
* A device-side assert triggered during kernel execution. The context
* cannot be used anymore, and must be destroyed. All existing device
* memory allocations from this context are invalid and must be
* reconstructed if the program is to continue using CUDA.
*/
{ "CUDA_ERROR_ASSERT", 710 },
/**
* This error indicates that the hardware resources required to enable
* peer access have been exhausted for one or more of the devices
* passed to ::cuCtxEnablePeerAccess().
*/
{ "CUDA_ERROR_TOO_MANY_PEERS", 711 },
/**
* This error indicates that the memory range passed to ::cuMemHostRegister()
* has already been registered.
*/
{ "CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED", 712 },
/**
* This error indicates that the pointer passed to ::cuMemHostUnregister()
* does not correspond to any currently registered memory region.
*/
{ "CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED", 713 },
/**
* This error indicates that the attempted operation is not permitted.
*/
{ "CUDA_ERROR_NOT_PERMITTED", 800 },
/**
* This error indicates that the attempted operation is not supported
* on the current system or device.
*/
{ "CUDA_ERROR_NOT_SUPPORTED", 801 },
/**
* This indicates that an unknown internal error has occurred.
*/
{ "CUDA_ERROR_UNKNOWN", 999 },
{ NULL, -1 }
};
// This is just a linear search through the array, since the error_id's are not
// always ocurring consecutively
const char * getCudaDrvErrorString(CUresult error_id)
{
int index = 0;
while (sCudaDrvErrorString[index].error_id != error_id &&
sCudaDrvErrorString[index].error_id != -1)
{
index++;
}
if (sCudaDrvErrorString[index].error_id == error_id)
return (const char *)sCudaDrvErrorString[index].error_string;
else
return (const char *)"CUDA_ERROR not found!";
}
#endif

View File

@@ -71,7 +71,8 @@ static inline double rtc(void)
1.e-6*((double) Tvalue.tv_usec);
return etime;
}
/******************************/ #include <cassert>
/******************************/
#include <cassert>
#include <iostream>
#include <cuda.h>
#include "drvapi_error_string.h"