adding portable examples
This commit is contained in:
2
examples/portable/aobench/.gitignore
vendored
Normal file
2
examples/portable/aobench/.gitignore
vendored
Normal file
@@ -0,0 +1,2 @@
|
||||
ao
|
||||
*.ppm
|
||||
8
examples/portable/aobench/Makefile_cpu
Normal file
8
examples/portable/aobench/Makefile_cpu
Normal file
@@ -0,0 +1,8 @@
|
||||
|
||||
EXAMPLE=ao
|
||||
CPP_SRC=ao.cpp
|
||||
ISPC_SRC=ao.ispc
|
||||
ISPC_IA_TARGETS=avx1-i32x8
|
||||
ISPC_ARM_TARGETS=neon
|
||||
|
||||
include ../common.mk
|
||||
14
examples/portable/aobench/Makefile_gpu
Normal file
14
examples/portable/aobench/Makefile_gpu
Normal file
@@ -0,0 +1,14 @@
|
||||
PROG=ao
|
||||
ISPC_SRC=ao.ispc
|
||||
CU_SRC=ao.cu
|
||||
CXX_SRC=ao.cpp ao_serial.cpp
|
||||
PTXCC_REGMAX=64
|
||||
#ISPC_FLAGS= --opt=disable-uniform-control-flow
|
||||
|
||||
LLVM_GPU=1
|
||||
NVVM_GPU=1
|
||||
|
||||
include ../common_gpu.mk
|
||||
|
||||
|
||||
|
||||
180
examples/portable/aobench/ao.cpp
Normal file
180
examples/portable/aobench/ao.cpp
Normal file
@@ -0,0 +1,180 @@
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define _CRT_SECURE_NO_WARNINGS
|
||||
#define NOMINMAX
|
||||
#pragma warning (disable: 4244)
|
||||
#pragma warning (disable: 4305)
|
||||
#endif
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <cassert>
|
||||
#ifdef __linux__
|
||||
#include <malloc.h>
|
||||
#endif
|
||||
#include <math.h>
|
||||
#include <map>
|
||||
#include <string>
|
||||
#include <algorithm>
|
||||
#include <sys/types.h>
|
||||
|
||||
#include "ao_ispc.h"
|
||||
|
||||
#include "timing.h"
|
||||
#include "ispc_malloc.h"
|
||||
|
||||
#define NSUBSAMPLES 2
|
||||
|
||||
extern void ao_serial(int w, int h, int nsubsamples, float image[]);
|
||||
|
||||
static unsigned int test_iterations[] = {3, 7, 1};
|
||||
static unsigned int width, height;
|
||||
static unsigned char *img;
|
||||
static float *fimg;
|
||||
|
||||
|
||||
static unsigned char
|
||||
clamp(float f)
|
||||
{
|
||||
int i = (int)(f * 255.5);
|
||||
|
||||
if (i < 0) i = 0;
|
||||
if (i > 255) i = 255;
|
||||
|
||||
return (unsigned char)i;
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
savePPM(const char *fname, int w, int h)
|
||||
{
|
||||
for (int y = 0; y < h; y++) {
|
||||
for (int x = 0; x < w; x++) {
|
||||
img[3 * (y * w + x) + 0] = clamp(fimg[3 *(y * w + x) + 0]);
|
||||
img[3 * (y * w + x) + 1] = clamp(fimg[3 *(y * w + x) + 1]);
|
||||
img[3 * (y * w + x) + 2] = clamp(fimg[3 *(y * w + x) + 2]);
|
||||
}
|
||||
}
|
||||
|
||||
FILE *fp = fopen(fname, "wb");
|
||||
if (!fp) {
|
||||
perror(fname);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
fprintf(fp, "P6\n");
|
||||
fprintf(fp, "%d %d\n", w, h);
|
||||
fprintf(fp, "255\n");
|
||||
fwrite(img, w * h * 3, 1, fp);
|
||||
fclose(fp);
|
||||
printf("Wrote image file %s\n", fname);
|
||||
}
|
||||
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
if (argc < 3) {
|
||||
printf ("%s\n", argv[0]);
|
||||
printf ("Usage: ao [width] [height] [ispc iterations] [tasks iterations] [serial iterations]\n");
|
||||
getchar();
|
||||
exit(-1);
|
||||
}
|
||||
else {
|
||||
if (argc == 6) {
|
||||
for (int i = 0; i < 3; i++) {
|
||||
test_iterations[i] = atoi(argv[3 + i]);
|
||||
}
|
||||
}
|
||||
width = atoi (argv[1]);
|
||||
height = atoi (argv[2]);
|
||||
}
|
||||
|
||||
// Allocate space for output images
|
||||
#if 0
|
||||
img = new unsigned char[width * height * 3];
|
||||
fimg = new float[width * height * 3];
|
||||
#else
|
||||
ispc_malloc((void**) &img, sizeof(unsigned char)*width*height*3);
|
||||
ispc_malloc((void**)&fimg, sizeof( float)*width*height*3);
|
||||
#endif
|
||||
|
||||
//
|
||||
// Run the ispc + tasks path, test_iterations times, and report the
|
||||
// minimum time for any of them.
|
||||
//
|
||||
double minTimeISPCTasks = 1e30;
|
||||
for (unsigned int i = 0; i < test_iterations[1]; i++) {
|
||||
ispc_memset((void *)fimg, 0, sizeof(float) * width * height * 3);
|
||||
assert(NSUBSAMPLES == 2);
|
||||
|
||||
reset_and_start_timer();
|
||||
ispc::ao_ispc_tasks(width, height, NSUBSAMPLES, fimg);
|
||||
double t = get_elapsed_msec();
|
||||
printf("@time of ISPC + TASKS run:\t\t\t[%.3f] msec\n", t);
|
||||
minTimeISPCTasks = std::min(minTimeISPCTasks, t);
|
||||
}
|
||||
|
||||
// Report results and save image
|
||||
printf("[aobench ispc + tasks]:\t\t[%.3f] msec (%d x %d image)\n",
|
||||
minTimeISPCTasks, width, height);
|
||||
savePPM("ao-ispc-tasks.ppm", width, height);
|
||||
|
||||
//
|
||||
// Run the serial path, again test_iteration times, and report the
|
||||
// minimum time.
|
||||
//
|
||||
double minTimeSerial = 1e30;
|
||||
for (unsigned int i = 0; i < test_iterations[2]; i++) {
|
||||
ispc_memset((void *)fimg, 0, sizeof(float) * width * height * 3);
|
||||
reset_and_start_timer();
|
||||
ao_serial(width, height, NSUBSAMPLES, fimg);
|
||||
double t = get_elapsed_msec();
|
||||
printf("@time of serial run:\t\t\t\t[%.3f] msec\n", t);
|
||||
minTimeSerial = std::min(minTimeSerial, t);
|
||||
}
|
||||
|
||||
// Report more results, save another image...
|
||||
printf("[aobench serial]:\t\t[%.3f] msec (%d x %d image)\n", minTimeSerial,
|
||||
width, height);
|
||||
printf("\t\t\t\t(%.2fx speedup from ISPC + tasks)\n",
|
||||
minTimeSerial / minTimeISPCTasks);
|
||||
savePPM("ao-serial.ppm", width, height);
|
||||
|
||||
ispc_free(img);
|
||||
ispc_free(fimg);
|
||||
|
||||
return 0;
|
||||
}
|
||||
447
examples/portable/aobench/ao.cu
Normal file
447
examples/portable/aobench/ao.cu
Normal file
@@ -0,0 +1,447 @@
|
||||
// -*- mode: c++ -*-
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
/*
|
||||
Based on Syoyo Fujita's aobench: http://code.google.com/p/aobench
|
||||
*/
|
||||
|
||||
#include "cuda_helpers.cuh"
|
||||
|
||||
#define NAO_SAMPLES 8
|
||||
//#define M_PI 3.1415926535f
|
||||
|
||||
#define vec Float3
|
||||
struct Float3
|
||||
{
|
||||
float x,y,z;
|
||||
|
||||
__device__ friend Float3 operator+(const Float3 a, const Float3 b)
|
||||
{
|
||||
Float3 c;
|
||||
c.x = a.x+b.x;
|
||||
c.y = a.y+b.y;
|
||||
c.z = a.z+b.z;
|
||||
return c;
|
||||
}
|
||||
__device__ friend Float3 operator-(const Float3 a, const Float3 b)
|
||||
{
|
||||
Float3 c;
|
||||
c.x = a.x-b.x;
|
||||
c.y = a.y-b.y;
|
||||
c.z = a.z-b.z;
|
||||
return c;
|
||||
}
|
||||
__device__ friend Float3 operator/(const Float3 a, const Float3 b)
|
||||
{
|
||||
Float3 c;
|
||||
c.x = a.x/b.x;
|
||||
c.y = a.y/b.y;
|
||||
c.z = a.z/b.z;
|
||||
return c;
|
||||
}
|
||||
__device__ friend Float3 operator/(const float a, const Float3 b)
|
||||
{
|
||||
Float3 c;
|
||||
c.x = a/b.x;
|
||||
c.y = a/b.y;
|
||||
c.z = a/b.z;
|
||||
return c;
|
||||
}
|
||||
__device__ friend Float3 operator*(const Float3 a, const Float3 b)
|
||||
{
|
||||
Float3 c;
|
||||
c.x = a.x*b.x;
|
||||
c.y = a.y*b.y;
|
||||
c.z = a.z*b.z;
|
||||
return c;
|
||||
}
|
||||
__device__ friend Float3 operator*(const Float3 a, const float b)
|
||||
{
|
||||
Float3 c;
|
||||
c.x = a.x*b;
|
||||
c.y = a.y*b;
|
||||
c.z = a.z*b;
|
||||
return c;
|
||||
}
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// RNG stuff
|
||||
|
||||
struct RNGState {
|
||||
unsigned int z1, z2, z3, z4;
|
||||
};
|
||||
|
||||
__device__
|
||||
static inline unsigned int random(RNGState * state)
|
||||
{
|
||||
unsigned int b;
|
||||
|
||||
b = ((state->z1 << 6) ^ state->z1) >> 13;
|
||||
state->z1 = ((state->z1 & 4294967294U) << 18) ^ b;
|
||||
b = ((state->z2 << 2) ^ state->z2) >> 27;
|
||||
state->z2 = ((state->z2 & 4294967288U) << 2) ^ b;
|
||||
b = ((state->z3 << 13) ^ state->z3) >> 21;
|
||||
state->z3 = ((state->z3 & 4294967280U) << 7) ^ b;
|
||||
b = ((state->z4 << 3) ^ state->z4) >> 12;
|
||||
state->z4 = ((state->z4 & 4294967168U) << 13) ^ b;
|
||||
return (state->z1 ^ state->z2 ^ state->z3 ^ state->z4);
|
||||
}
|
||||
|
||||
|
||||
__device__
|
||||
static inline float frandom(RNGState * state)
|
||||
{
|
||||
unsigned int irand = random(state);
|
||||
irand &= (1ul<<23)-1;
|
||||
return __int_as_float(0x3F800000 | irand)-1.0f;
|
||||
}
|
||||
|
||||
__device__
|
||||
static inline void seed_rng(RNGState * state,
|
||||
unsigned int seed) {
|
||||
state->z1 = seed;
|
||||
state->z2 = seed ^ 0xbeeff00d;
|
||||
state->z3 = ((seed & 0xfffful) << 16) | (seed >> 16);
|
||||
state->z4 = (((seed & 0xfful) << 24) | ((seed & 0xff00ul) << 8) |
|
||||
((seed & 0xff0000ul) >> 8) | (seed & 0xff000000ul) >> 24);
|
||||
}
|
||||
|
||||
|
||||
|
||||
struct Isect {
|
||||
float t;
|
||||
vec p;
|
||||
vec n;
|
||||
int hit;
|
||||
};
|
||||
|
||||
struct Sphere {
|
||||
vec center;
|
||||
float radius;
|
||||
};
|
||||
|
||||
struct Plane {
|
||||
vec p;
|
||||
vec n;
|
||||
};
|
||||
|
||||
struct Ray {
|
||||
vec org;
|
||||
vec dir;
|
||||
};
|
||||
|
||||
__device__
|
||||
static inline float dot(vec a, vec b) {
|
||||
return a.x * b.x + a.y * b.y + a.z * b.z;
|
||||
}
|
||||
|
||||
__device__
|
||||
static inline vec vcross(vec v0, vec v1) {
|
||||
vec ret;
|
||||
ret.x = v0.y * v1.z - v0.z * v1.y;
|
||||
ret.y = v0.z * v1.x - v0.x * v1.z;
|
||||
ret.z = v0.x * v1.y - v0.y * v1.x;
|
||||
return ret;
|
||||
}
|
||||
|
||||
__device__
|
||||
static inline void vnormalize(vec &v) {
|
||||
float len2 = dot(v, v);
|
||||
float invlen = rsqrt(len2);
|
||||
v = v*invlen;
|
||||
}
|
||||
|
||||
|
||||
__device__
|
||||
static inline void
|
||||
ray_plane_intersect(Isect &isect,const Ray &ray, const Plane &plane) {
|
||||
float d = -dot(plane.p, plane.n);
|
||||
float v = dot(ray.dir, plane.n);
|
||||
|
||||
#if 0
|
||||
if (abs(v) < 1.0f-17)
|
||||
return;
|
||||
else {
|
||||
float t = -(dot(ray.org, plane.n) + d) / v;
|
||||
|
||||
if ((t > 0.0) && (t < isect.t)) {
|
||||
isect.t = t;
|
||||
isect.hit = 1;
|
||||
isect.p = ray.org + ray.dir * t;
|
||||
isect.n = plane.n;
|
||||
}
|
||||
}
|
||||
#else
|
||||
if (abs(v) <= 1.0e-17)
|
||||
return;
|
||||
float t = -(dot(ray.org, plane.n) + d) / v;
|
||||
if ((t > 0.0) && (t < isect.t)) {
|
||||
isect.t = t;
|
||||
isect.hit = 1;
|
||||
isect.p = ray.org + ray.dir * t;
|
||||
isect.n = plane.n;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
__device__
|
||||
static inline void
|
||||
ray_sphere_intersect(Isect &isect,const Ray &ray, const Sphere &sphere) {
|
||||
vec rs = ray.org - sphere.center;
|
||||
|
||||
float B = dot(rs, ray.dir);
|
||||
float C = dot(rs, rs) - sphere.radius * sphere.radius;
|
||||
float D = B * B - C;
|
||||
|
||||
#if 0
|
||||
if (D > 0.) {
|
||||
float t = -B - sqrt(D);
|
||||
|
||||
if ((t > 0.0) && (t < isect.t)) {
|
||||
isect.t = t;
|
||||
isect.hit = 1;
|
||||
isect.p = ray.org + ray.dir * t;
|
||||
isect.n = isect.p - sphere.center;
|
||||
vnormalize(isect.n);
|
||||
}
|
||||
}
|
||||
#else
|
||||
if (D <= 0.0f)
|
||||
return;
|
||||
|
||||
float t = -B - sqrt(D);
|
||||
|
||||
if ((t > 0.0) && (t < isect.t)) {
|
||||
isect.t = t;
|
||||
isect.hit = 1;
|
||||
isect.p = ray.org + ray.dir * t;
|
||||
isect.n = isect.p - sphere.center;
|
||||
vnormalize(isect.n);
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
|
||||
__device__
|
||||
static inline void
|
||||
orthoBasis(vec basis[3], vec n) {
|
||||
basis[2] = n;
|
||||
basis[1].x = 0.0f; basis[1].y = 0.0f; basis[1].z = 0.0f;
|
||||
|
||||
if ((n.x < 0.6f) && (n.x > -0.6f)) {
|
||||
basis[1].x = 1.0f;
|
||||
} else if ((n.y < 0.6f) && (n.y > -0.6f)) {
|
||||
basis[1].y = 1.0f;
|
||||
} else if ((n.z < 0.6f) && (n.z > -0.6f)) {
|
||||
basis[1].z = 1.0f;
|
||||
} else {
|
||||
basis[1].x = 1.0f;
|
||||
}
|
||||
|
||||
basis[0] = vcross(basis[1], basis[2]);
|
||||
vnormalize(basis[0]);
|
||||
|
||||
basis[1] = vcross(basis[2], basis[0]);
|
||||
vnormalize(basis[1]);
|
||||
}
|
||||
|
||||
|
||||
__device__
|
||||
static inline float
|
||||
ambient_occlusion(Isect &isect, const Plane &plane, const Sphere spheres[3],
|
||||
RNGState &rngstate) {
|
||||
float eps = 0.0001f;
|
||||
vec p; //, n;
|
||||
vec basis[3];
|
||||
float occlusion = 0.0f;
|
||||
|
||||
p = isect.p + isect.n * eps;
|
||||
|
||||
orthoBasis(basis, isect.n);
|
||||
|
||||
const int ntheta = NAO_SAMPLES;
|
||||
const int nphi = NAO_SAMPLES;
|
||||
for ( int j = 0; j < ntheta; j++) {
|
||||
for ( int i = 0; i < nphi; i++) {
|
||||
Ray ray;
|
||||
Isect occIsect;
|
||||
|
||||
float theta = sqrt(frandom(&rngstate));
|
||||
float phi = 2.0f * M_PI * frandom(&rngstate);
|
||||
float x = cos(phi) * theta;
|
||||
float y = sin(phi) * theta;
|
||||
float z = sqrtf(1.0f - theta * theta);
|
||||
|
||||
// local . global
|
||||
float rx = x * basis[0].x + y * basis[1].x + z * basis[2].x;
|
||||
float ry = x * basis[0].y + y * basis[1].y + z * basis[2].y;
|
||||
float rz = x * basis[0].z + y * basis[1].z + z * basis[2].z;
|
||||
|
||||
ray.org = p;
|
||||
ray.dir.x = rx;
|
||||
ray.dir.y = ry;
|
||||
ray.dir.z = rz;
|
||||
|
||||
occIsect.t = 1.0f+17;
|
||||
occIsect.hit = 0;
|
||||
|
||||
for ( int snum = 0; snum < 3; ++snum)
|
||||
ray_sphere_intersect(occIsect, ray, spheres[snum]);
|
||||
ray_plane_intersect (occIsect, ray, plane);
|
||||
|
||||
if (occIsect.hit) occlusion += 1.0f;
|
||||
}
|
||||
}
|
||||
|
||||
occlusion = (ntheta * nphi - occlusion) / (float)(ntheta * nphi);
|
||||
return occlusion;
|
||||
}
|
||||
|
||||
|
||||
/* Compute the image for the scanlines from [y0,y1), for an overall image
|
||||
of width w and height h.
|
||||
*/
|
||||
__device__
|
||||
static inline void ao_tiles(
|
||||
int x0, int x1,
|
||||
int y0, int y1,
|
||||
int w, int h,
|
||||
int nsubsamples,
|
||||
float image[])
|
||||
{
|
||||
const Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } };
|
||||
const Sphere spheres[3] = {
|
||||
{ { -2.0f, 0.0f, -3.5f }, 0.5f },
|
||||
{ { -0.5f, 0.0f, -3.0f }, 0.5f },
|
||||
{ { 1.0f, 0.0f, -2.2f }, 0.5f } };
|
||||
RNGState rngstate;
|
||||
|
||||
seed_rng(&rngstate, programIndex + (y0 << (programIndex & 15)));
|
||||
float invSamples = 1.f / nsubsamples;
|
||||
for ( int y = y0; y < y1; y++)
|
||||
for ( int x = programIndex+x0; x < x1; x += programCount)
|
||||
{
|
||||
const int offset = 3 * (y * w + x);
|
||||
float res = 0.0f;
|
||||
|
||||
for ( int u = 0; u < nsubsamples; u++)
|
||||
for ( int v = 0; v < nsubsamples; v++)
|
||||
{
|
||||
float du = (float)u * invSamples, dv = (float)v * invSamples;
|
||||
|
||||
// Figure out x,y pixel in NDC
|
||||
float px = (x + du - (w / 2.0f)) / (w / 2.0f);
|
||||
float py = -(y + dv - (h / 2.0f)) / (h / 2.0f);
|
||||
float ret = 0.f;
|
||||
Ray ray;
|
||||
Isect isect;
|
||||
|
||||
ray.org.x = 0.0f;
|
||||
ray.org.y = 0.0f;
|
||||
ray.org.z = 0.0f;
|
||||
|
||||
// Poor man's perspective projection
|
||||
ray.dir.x = px;
|
||||
ray.dir.y = py;
|
||||
ray.dir.z = -1.0;
|
||||
vnormalize(ray.dir);
|
||||
|
||||
isect.t = 1.0e+17;
|
||||
isect.hit = 0;
|
||||
|
||||
for ( int snum = 0; snum < 3; ++snum)
|
||||
ray_sphere_intersect(isect, ray, spheres[snum]);
|
||||
ray_plane_intersect(isect, ray, plane);
|
||||
|
||||
// Note use of 'coherent' if statement; the set of rays we
|
||||
// trace will often all hit or all miss the scene
|
||||
if (any(isect.hit)) {
|
||||
ret = isect.hit*ambient_occlusion(isect, plane, spheres, rngstate);
|
||||
ret *= invSamples * invSamples;
|
||||
res += ret;
|
||||
}
|
||||
}
|
||||
|
||||
if (x < x1)
|
||||
{
|
||||
image[offset ] = res;
|
||||
image[offset+1] = res;
|
||||
image[offset+2] = res;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
#define TILEX 64
|
||||
#define TILEY 4
|
||||
|
||||
extern "C"
|
||||
__global__
|
||||
void ao_task( int width, int height,
|
||||
int nsubsamples, float image[])
|
||||
{
|
||||
if (taskIndex0 >= taskCount0) return;
|
||||
if (taskIndex1 >= taskCount1) return;
|
||||
|
||||
const int x0 = taskIndex0 * TILEX;
|
||||
const int x1 = min(x0 + TILEX, width);
|
||||
|
||||
const int y0 = taskIndex1 * TILEY;
|
||||
const int y1 = min(y0 + TILEY, height);
|
||||
ao_tiles(x0,x1,y0,y1, width, height, nsubsamples, image);
|
||||
}
|
||||
|
||||
extern "C"
|
||||
__global__
|
||||
void ao_ispc_tasks___export(
|
||||
int w, int h, int nsubsamples,
|
||||
float image[])
|
||||
{
|
||||
const int ntilex = (w+TILEX-1)/TILEX;
|
||||
const int ntiley = (h+TILEY-1)/TILEY;
|
||||
launch(ntilex,ntiley,1,ao_task)(w,h,nsubsamples,image);
|
||||
cudaDeviceSynchronize();
|
||||
}
|
||||
|
||||
extern "C"
|
||||
__host__ void ao_ispc_tasks(
|
||||
int w, int h, int nsubsamples,
|
||||
float image[])
|
||||
{
|
||||
ao_ispc_tasks___export<<<1,32>>>(w,h,nsubsamples,image);
|
||||
cudaDeviceSynchronize();
|
||||
}
|
||||
305
examples/portable/aobench/ao.fix.ispc
Normal file
305
examples/portable/aobench/ao.fix.ispc
Normal file
@@ -0,0 +1,305 @@
|
||||
// -*- mode: c++ -*-
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
/*
|
||||
Based on Syoyo Fujita's aobench: http://code.google.com/p/aobench
|
||||
*/
|
||||
|
||||
#define NAO_SAMPLES 8
|
||||
#define M_PI 3.1415926535f
|
||||
|
||||
typedef float<3> vec;
|
||||
|
||||
struct Isect {
|
||||
float t;
|
||||
vec p;
|
||||
vec n;
|
||||
int hit;
|
||||
};
|
||||
|
||||
struct Sphere {
|
||||
vec center;
|
||||
float radius;
|
||||
};
|
||||
|
||||
struct Plane {
|
||||
vec p;
|
||||
vec n;
|
||||
};
|
||||
|
||||
struct Ray {
|
||||
vec org;
|
||||
vec dir;
|
||||
};
|
||||
|
||||
static inline float dot(vec a, vec b) {
|
||||
return a.x * b.x + a.y * b.y + a.z * b.z;
|
||||
}
|
||||
|
||||
static inline vec vcross(vec v0, vec v1) {
|
||||
vec ret;
|
||||
ret.x = v0.y * v1.z - v0.z * v1.y;
|
||||
ret.y = v0.z * v1.x - v0.x * v1.z;
|
||||
ret.z = v0.x * v1.y - v0.y * v1.x;
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline void vnormalize(vec &v) {
|
||||
float len2 = dot(v, v);
|
||||
float invlen = rsqrt(len2);
|
||||
v *= invlen;
|
||||
}
|
||||
|
||||
|
||||
#if 1
|
||||
inline
|
||||
#endif
|
||||
static void
|
||||
ray_plane_intersect(Isect &isect, Ray &ray, const Plane &plane) {
|
||||
float d = -dot(plane.p, plane.n);
|
||||
float v = dot(ray.dir, plane.n);
|
||||
|
||||
cif (abs(v) < 1.0e-17)
|
||||
return;
|
||||
else {
|
||||
float t = -(dot(ray.org, plane.n) + d) / v;
|
||||
|
||||
cif ((t > 0.0) && (t < isect.t)) {
|
||||
isect.t = t;
|
||||
isect.hit = 1;
|
||||
isect.p = ray.org + ray.dir * t;
|
||||
isect.n = plane.n;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static inline void
|
||||
ray_sphere_intersect(Isect &isect, Ray &ray, const Sphere &sphere) {
|
||||
vec rs = ray.org - sphere.center;
|
||||
|
||||
float B = dot(rs, ray.dir);
|
||||
float C = dot(rs, rs) - sphere.radius * sphere.radius;
|
||||
float D = B * B - C;
|
||||
|
||||
cif (D > 0.) {
|
||||
float t = -B - sqrt(D);
|
||||
|
||||
cif ((t > 0.0) && (t < isect.t)) {
|
||||
isect.t = t;
|
||||
isect.hit = 1;
|
||||
isect.p = ray.org + t * ray.dir;
|
||||
isect.n = isect.p - sphere.center;
|
||||
vnormalize(isect.n);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#if 1
|
||||
inline
|
||||
#endif
|
||||
static void
|
||||
orthoBasis(vec basis[3], vec n) {
|
||||
basis[2] = n;
|
||||
basis[1].x = 0.0; basis[1].y = 0.0; basis[1].z = 0.0;
|
||||
|
||||
if ((n.x < 0.6) && (n.x > -0.6)) {
|
||||
basis[1].x = 1.0;
|
||||
} else if ((n.y < 0.6) && (n.y > -0.6)) {
|
||||
basis[1].y = 1.0;
|
||||
} else if ((n.z < 0.6) && (n.z > -0.6)) {
|
||||
basis[1].z = 1.0;
|
||||
} else {
|
||||
basis[1].x = 1.0;
|
||||
}
|
||||
|
||||
basis[0] = vcross(basis[1], basis[2]);
|
||||
vnormalize(basis[0]);
|
||||
|
||||
basis[1] = vcross(basis[2], basis[0]);
|
||||
vnormalize(basis[1]);
|
||||
}
|
||||
|
||||
|
||||
#if 1
|
||||
inline
|
||||
#endif
|
||||
static float
|
||||
ambient_occlusion(Isect &isect, const Plane &plane, const Sphere spheres[3],
|
||||
RNGState &rngstate) {
|
||||
float eps = 0.0001f;
|
||||
vec p, n;
|
||||
vec basis[3];
|
||||
float occlusion = 0.0;
|
||||
|
||||
p = isect.p + eps * isect.n;
|
||||
|
||||
orthoBasis(basis, isect.n);
|
||||
|
||||
static const uniform int ntheta = NAO_SAMPLES;
|
||||
static const uniform int nphi = NAO_SAMPLES;
|
||||
for (uniform int j = 0; j < ntheta; j++) {
|
||||
for (uniform int i = 0; i < nphi; i++) {
|
||||
Ray ray;
|
||||
Isect occIsect;
|
||||
|
||||
float theta = sqrt(frandom(&rngstate));
|
||||
float phi = 2.0f * M_PI * frandom(&rngstate);
|
||||
float x = cos(phi) * theta;
|
||||
float y = sin(phi) * theta;
|
||||
float z = sqrt(1.0 - theta * theta);
|
||||
|
||||
// local . global
|
||||
float rx = x * basis[0].x + y * basis[1].x + z * basis[2].x;
|
||||
float ry = x * basis[0].y + y * basis[1].y + z * basis[2].y;
|
||||
float rz = x * basis[0].z + y * basis[1].z + z * basis[2].z;
|
||||
|
||||
ray.org = p;
|
||||
ray.dir.x = rx;
|
||||
ray.dir.y = ry;
|
||||
ray.dir.z = rz;
|
||||
|
||||
occIsect.t = 1.0e+17;
|
||||
occIsect.hit = 0;
|
||||
|
||||
for (uniform int snum = 0; snum < 3; ++snum)
|
||||
ray_sphere_intersect(occIsect, ray, spheres[snum]);
|
||||
ray_plane_intersect (occIsect, ray, plane);
|
||||
|
||||
if (occIsect.hit) occlusion += 1.0;
|
||||
}
|
||||
}
|
||||
|
||||
occlusion = (ntheta * nphi - occlusion) / (float)(ntheta * nphi);
|
||||
return occlusion;
|
||||
}
|
||||
|
||||
static inline void ao_tiles(
|
||||
uniform int x0, uniform int x1,
|
||||
uniform int y0, uniform int y1,
|
||||
uniform int w, uniform int h,
|
||||
uniform int nsubsamples,
|
||||
uniform float image[])
|
||||
{
|
||||
const Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } };
|
||||
const Sphere spheres[3] = {
|
||||
{ { -2.0f, 0.0f, -3.5f }, 0.5f },
|
||||
{ { -0.5f, 0.0f, -3.0f }, 0.5f },
|
||||
{ { 1.0f, 0.0f, -2.2f }, 0.5f } };
|
||||
RNGState rngstate;
|
||||
|
||||
seed_rng(&rngstate, programIndex + (y0 << (programIndex & 15)));
|
||||
float invSamples = 1.f / nsubsamples;
|
||||
foreach_tiled (y = y0 ... y1, x = x0 ... x1)
|
||||
{
|
||||
const int offset = 3 * (y * w + x);
|
||||
float res = 0.0f;
|
||||
|
||||
for (uniform int u = 0; u < nsubsamples; u++)
|
||||
for (uniform int v = 0; v < nsubsamples; v++)
|
||||
{
|
||||
float du = (float)u * invSamples, dv = (float)v * invSamples;
|
||||
|
||||
// Figure out x,y pixel in NDC
|
||||
float px = (x + du - (w / 2.0f)) / (w / 2.0f);
|
||||
float py = -(y + dv - (h / 2.0f)) / (h / 2.0f);
|
||||
float ret = 0.f;
|
||||
Ray ray;
|
||||
Isect isect;
|
||||
|
||||
ray.org = 0.f;
|
||||
|
||||
// Poor man's perspective projection
|
||||
ray.dir.x = px;
|
||||
ray.dir.y = py;
|
||||
ray.dir.z = -1.0;
|
||||
vnormalize(ray.dir);
|
||||
|
||||
isect.t = 1.0e+17;
|
||||
isect.hit = 0;
|
||||
|
||||
for (uniform int snum = 0; snum < 3; ++snum)
|
||||
ray_sphere_intersect(isect, ray, spheres[snum]);
|
||||
ray_plane_intersect(isect, ray, plane);
|
||||
|
||||
// Note use of 'coherent' if statement; the set of rays we
|
||||
// trace will often all hit or all miss the scene
|
||||
if (isect.hit) {
|
||||
ret = ambient_occlusion(isect, plane, spheres, rngstate);
|
||||
ret *= invSamples * invSamples;
|
||||
res += ret;
|
||||
}
|
||||
}
|
||||
|
||||
image[offset ] = res;
|
||||
image[offset+1] = res;
|
||||
image[offset+2] = res;
|
||||
}
|
||||
}
|
||||
|
||||
#define TILEX max(64,programCount*2)
|
||||
#define TILEY 4
|
||||
|
||||
export void ao_ispc(uniform int w, uniform int h, uniform int nsubsamples,
|
||||
uniform float image[]) {
|
||||
const uniform int x0 = 0;
|
||||
const uniform int x1 = w;
|
||||
const uniform int y0 = 0;
|
||||
const uniform int y1 = h;
|
||||
ao_tiles(x0,x1,y0,y1, w, h, nsubsamples, image);
|
||||
}
|
||||
|
||||
void task ao_task(uniform int width, uniform int height,
|
||||
uniform int nsubsamples, uniform float image[])
|
||||
{
|
||||
if (taskIndex0 >= taskCount0) return;
|
||||
if (taskIndex1 >= taskCount1) return;
|
||||
|
||||
const uniform int x0 = taskIndex0 * TILEX;
|
||||
const uniform int x1 = min(x0 + TILEX, width);
|
||||
|
||||
const uniform int y0 = taskIndex1 * TILEY;
|
||||
const uniform int y1 = min(y0 + TILEY, height);
|
||||
ao_tiles(x0,x1,y0,y1, width, height, nsubsamples, image);
|
||||
}
|
||||
|
||||
|
||||
export void ao_ispc_tasks(uniform int w, uniform int h, uniform int nsubsamples,
|
||||
uniform float image[])
|
||||
{
|
||||
const uniform int ntilex = (w+TILEX-1)/TILEX;
|
||||
const uniform int ntiley = (h+TILEY-1)/TILEY;
|
||||
launch[ntilex,ntiley] ao_task(w, h, nsubsamples, image);
|
||||
sync;
|
||||
}
|
||||
342
examples/portable/aobench/ao.ispc
Normal file
342
examples/portable/aobench/ao.ispc
Normal file
@@ -0,0 +1,342 @@
|
||||
// -*- mode: c++ -*-
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
/*
|
||||
Based on Syoyo Fujita's aobench: http://code.google.com/p/aobench
|
||||
*/
|
||||
|
||||
#define NAO_SAMPLES 8
|
||||
#define M_PI 3.1415926535f
|
||||
|
||||
typedef float<3> vec;
|
||||
|
||||
struct Isect {
|
||||
float t;
|
||||
vec p;
|
||||
vec n;
|
||||
int hit;
|
||||
};
|
||||
|
||||
struct Sphere {
|
||||
vec center;
|
||||
float radius;
|
||||
};
|
||||
|
||||
struct Plane {
|
||||
vec p;
|
||||
vec n;
|
||||
};
|
||||
|
||||
struct Ray {
|
||||
vec org;
|
||||
vec dir;
|
||||
};
|
||||
|
||||
static inline float dot(vec a, vec b) {
|
||||
return a.x * b.x + a.y * b.y + a.z * b.z;
|
||||
}
|
||||
|
||||
static inline vec vcross(vec v0, vec v1) {
|
||||
vec ret;
|
||||
ret.x = v0.y * v1.z - v0.z * v1.y;
|
||||
ret.y = v0.z * v1.x - v0.x * v1.z;
|
||||
ret.z = v0.x * v1.y - v0.y * v1.x;
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline void vnormalize(vec &v) {
|
||||
float len2 = dot(v, v);
|
||||
float invlen = rsqrt(len2);
|
||||
v *= invlen;
|
||||
}
|
||||
|
||||
|
||||
#if 1
|
||||
inline
|
||||
#endif
|
||||
static void
|
||||
ray_plane_intersect(Isect &isect, Ray &ray, const uniform Plane &plane) {
|
||||
float d = -dot(plane.p, plane.n);
|
||||
float v = dot(ray.dir, plane.n);
|
||||
|
||||
#if 0
|
||||
cif (abs(v) < 1.0e-17)
|
||||
return;
|
||||
else {
|
||||
float t = -(dot(ray.org, plane.n) + d) / v;
|
||||
|
||||
cif ((t > 0.0) && (t < isect.t)) {
|
||||
isect.t = t;
|
||||
isect.hit = 1;
|
||||
isect.p = ray.org + ray.dir * t;
|
||||
isect.n = plane.n;
|
||||
}
|
||||
}
|
||||
#else
|
||||
cif (abs(v) <= 1.0e-17)
|
||||
return;
|
||||
float t = -(dot(ray.org, plane.n) + d) / v;
|
||||
cif ((t > 0.0) && (t < isect.t)) {
|
||||
isect.t = t;
|
||||
isect.hit = 1;
|
||||
isect.p = ray.org + ray.dir * t;
|
||||
isect.n = plane.n;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
static inline void
|
||||
ray_sphere_intersect(Isect &isect, Ray &ray, const uniform Sphere &sphere) {
|
||||
vec rs = ray.org - sphere.center;
|
||||
|
||||
float B = dot(rs, ray.dir);
|
||||
float C = dot(rs, rs) - sphere.radius * sphere.radius;
|
||||
float D = B * B - C;
|
||||
|
||||
#if 0
|
||||
cif (D > 0.) {
|
||||
float t = -B - sqrt(D);
|
||||
|
||||
cif ((t > 0.0) && (t < isect.t)) {
|
||||
isect.t = t;
|
||||
isect.hit = 1;
|
||||
isect.p = ray.org + t * ray.dir;
|
||||
isect.n = isect.p - sphere.center;
|
||||
vnormalize(isect.n);
|
||||
}
|
||||
}
|
||||
#else
|
||||
cif (D <=0.0f)
|
||||
return;
|
||||
|
||||
float t = -B - sqrt(D);
|
||||
cif ((t > 0.0) && (t < isect.t)) {
|
||||
isect.t = t;
|
||||
isect.hit = 1;
|
||||
isect.p = ray.org + t * ray.dir;
|
||||
isect.n = isect.p - sphere.center;
|
||||
vnormalize(isect.n);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
#if 1
|
||||
inline
|
||||
#endif
|
||||
static void
|
||||
orthoBasis(vec basis[3], vec n) {
|
||||
basis[2] = n;
|
||||
basis[1].x = 0.0; basis[1].y = 0.0; basis[1].z = 0.0;
|
||||
|
||||
if ((n.x < 0.6) && (n.x > -0.6)) {
|
||||
basis[1].x = 1.0;
|
||||
} else if ((n.y < 0.6) && (n.y > -0.6)) {
|
||||
basis[1].y = 1.0;
|
||||
} else if ((n.z < 0.6) && (n.z > -0.6)) {
|
||||
basis[1].z = 1.0;
|
||||
} else {
|
||||
basis[1].x = 1.0;
|
||||
}
|
||||
|
||||
basis[0] = vcross(basis[1], basis[2]);
|
||||
vnormalize(basis[0]);
|
||||
|
||||
basis[1] = vcross(basis[2], basis[0]);
|
||||
vnormalize(basis[1]);
|
||||
}
|
||||
|
||||
|
||||
#if 1
|
||||
inline
|
||||
#endif
|
||||
static float
|
||||
ambient_occlusion(Isect &isect, const uniform Plane &plane, const uniform Sphere spheres[3],
|
||||
RNGState &rngstate) {
|
||||
float eps = 0.0001f;
|
||||
vec p, n;
|
||||
vec basis[3];
|
||||
float occlusion = 0.0;
|
||||
|
||||
p = isect.p + eps * isect.n;
|
||||
|
||||
orthoBasis(basis, isect.n);
|
||||
|
||||
static const uniform int ntheta = NAO_SAMPLES;
|
||||
static const uniform int nphi = NAO_SAMPLES;
|
||||
for (uniform int j = 0; j < ntheta; j++) {
|
||||
for (uniform int i = 0; i < nphi; i++) {
|
||||
Ray ray;
|
||||
Isect occIsect;
|
||||
|
||||
float theta = sqrt(frandom(&rngstate));
|
||||
float phi = 2.0f * M_PI * frandom(&rngstate);
|
||||
float x = cos(phi) * theta;
|
||||
float y = sin(phi) * theta;
|
||||
float z = sqrt(1.0 - theta * theta);
|
||||
|
||||
// local . global
|
||||
float rx = x * basis[0].x + y * basis[1].x + z * basis[2].x;
|
||||
float ry = x * basis[0].y + y * basis[1].y + z * basis[2].y;
|
||||
float rz = x * basis[0].z + y * basis[1].z + z * basis[2].z;
|
||||
|
||||
ray.org = p;
|
||||
ray.dir.x = rx;
|
||||
ray.dir.y = ry;
|
||||
ray.dir.z = rz;
|
||||
|
||||
occIsect.t = 1.0e+17;
|
||||
occIsect.hit = 0;
|
||||
|
||||
for (uniform int snum = 0; snum < 3; ++snum)
|
||||
ray_sphere_intersect(occIsect, ray, spheres[snum]);
|
||||
ray_plane_intersect (occIsect, ray, plane);
|
||||
|
||||
if (occIsect.hit) occlusion += 1.0;
|
||||
}
|
||||
}
|
||||
|
||||
occlusion = (ntheta * nphi - occlusion) / (float)(ntheta * nphi);
|
||||
return occlusion;
|
||||
}
|
||||
|
||||
static inline void ao_tiles(
|
||||
uniform int x0, uniform int x1,
|
||||
uniform int y0, uniform int y1,
|
||||
uniform int w, uniform int h,
|
||||
uniform int nsubsamples,
|
||||
uniform float image[])
|
||||
{
|
||||
const uniform Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } };
|
||||
const uniform Sphere spheres[3] = {
|
||||
{ { -2.0f, 0.0f, -3.5f }, 0.5f },
|
||||
{ { -0.5f, 0.0f, -3.0f }, 0.5f },
|
||||
{ { 1.0f, 0.0f, -2.2f }, 0.5f } };
|
||||
RNGState rngstate;
|
||||
|
||||
seed_rng(&rngstate, programIndex + (y0 << (programIndex & 15)));
|
||||
float invSamples = 1.f / nsubsamples;
|
||||
foreach_tiled (y = y0 ... y1, x = x0 ... x1)
|
||||
{
|
||||
const int offset = 3 * (y * w + x);
|
||||
float res = 0.0f;
|
||||
|
||||
for (uniform int u = 0; u < nsubsamples; u++)
|
||||
for (uniform int v = 0; v < nsubsamples; v++)
|
||||
{
|
||||
float du = (float)u * invSamples, dv = (float)v * invSamples;
|
||||
|
||||
// Figure out x,y pixel in NDC
|
||||
float px = (x + du - (w / 2.0f)) / (w / 2.0f);
|
||||
float py = -(y + dv - (h / 2.0f)) / (h / 2.0f);
|
||||
float ret = 0.f;
|
||||
Ray ray;
|
||||
Isect isect;
|
||||
|
||||
ray.org = 0.f;
|
||||
|
||||
// Poor man's perspective projection
|
||||
ray.dir.x = px;
|
||||
ray.dir.y = py;
|
||||
ray.dir.z = -1.0;
|
||||
vnormalize(ray.dir);
|
||||
|
||||
isect.t = 1.0e+17;
|
||||
isect.hit = 0;
|
||||
|
||||
for (uniform int snum = 0; snum < 3; ++snum)
|
||||
ray_sphere_intersect(isect, ray, spheres[snum]);
|
||||
ray_plane_intersect(isect, ray, plane);
|
||||
|
||||
// Note use of 'coherent' if statement; the set of rays we
|
||||
// trace will often all hit or all miss the scene
|
||||
#if 0
|
||||
if (isect.hit) {
|
||||
ret = ambient_occlusion(isect, plane, spheres, rngstate);
|
||||
ret *= invSamples * invSamples;
|
||||
res += ret;
|
||||
}
|
||||
#else
|
||||
if(any(isect.hit))
|
||||
{
|
||||
ret = isect.hit*ambient_occlusion(isect, plane, spheres, rngstate);
|
||||
ret *= invSamples * invSamples;
|
||||
res += ret;
|
||||
}
|
||||
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
image[offset ] = res;
|
||||
image[offset+1] = res;
|
||||
image[offset+2] = res;
|
||||
}
|
||||
}
|
||||
|
||||
#define TILEX max(64,programCount*2)
|
||||
#define TILEY 4
|
||||
|
||||
export void ao_ispc(uniform int w, uniform int h, uniform int nsubsamples,
|
||||
uniform float image[]) {
|
||||
const uniform int x0 = 0;
|
||||
const uniform int x1 = w;
|
||||
const uniform int y0 = 0;
|
||||
const uniform int y1 = h;
|
||||
ao_tiles(x0,x1,y0,y1, w, h, nsubsamples, image);
|
||||
}
|
||||
|
||||
void task ao_task(uniform int width, uniform int height,
|
||||
uniform int nsubsamples, uniform float image[])
|
||||
{
|
||||
if (taskIndex0 >= taskCount0) return;
|
||||
if (taskIndex1 >= taskCount1) return;
|
||||
|
||||
const uniform int x0 = taskIndex0 * TILEX;
|
||||
const uniform int x1 = min(x0 + TILEX, width);
|
||||
|
||||
const uniform int y0 = taskIndex1 * TILEY;
|
||||
const uniform int y1 = min(y0 + TILEY, height);
|
||||
ao_tiles(x0,x1,y0,y1, width, height, nsubsamples, image);
|
||||
}
|
||||
|
||||
|
||||
export void ao_ispc_tasks(uniform int w, uniform int h, uniform int nsubsamples,
|
||||
uniform float image[])
|
||||
{
|
||||
const uniform int ntilex = (w+TILEX-1)/TILEX;
|
||||
const uniform int ntiley = (h+TILEY-1)/TILEY;
|
||||
launch[ntilex,ntiley] ao_task(w, h, nsubsamples, image);
|
||||
sync;
|
||||
}
|
||||
419
examples/portable/aobench/ao_fast.cu
Normal file
419
examples/portable/aobench/ao_fast.cu
Normal file
@@ -0,0 +1,419 @@
|
||||
// -*- mode: c++ -*-
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
/*
|
||||
Based on Syoyo Fujita's aobench: http://code.google.com/p/aobench
|
||||
*/
|
||||
|
||||
#include "cuda_helpers.cuh"
|
||||
|
||||
#define NAO_SAMPLES 8
|
||||
//#define M_PI 3.1415926535f
|
||||
|
||||
#define vec Float3
|
||||
struct Float3
|
||||
{
|
||||
float x,y,z;
|
||||
|
||||
__device__ friend Float3 operator+(const Float3 a, const Float3 b)
|
||||
{
|
||||
Float3 c;
|
||||
c.x = a.x+b.x;
|
||||
c.y = a.y+b.y;
|
||||
c.z = a.z+b.z;
|
||||
return c;
|
||||
}
|
||||
__device__ friend Float3 operator-(const Float3 a, const Float3 b)
|
||||
{
|
||||
Float3 c;
|
||||
c.x = a.x-b.x;
|
||||
c.y = a.y-b.y;
|
||||
c.z = a.z-b.z;
|
||||
return c;
|
||||
}
|
||||
__device__ friend Float3 operator/(const Float3 a, const Float3 b)
|
||||
{
|
||||
Float3 c;
|
||||
c.x = a.x/b.x;
|
||||
c.y = a.y/b.y;
|
||||
c.z = a.z/b.z;
|
||||
return c;
|
||||
}
|
||||
__device__ friend Float3 operator/(const float a, const Float3 b)
|
||||
{
|
||||
Float3 c;
|
||||
c.x = a/b.x;
|
||||
c.y = a/b.y;
|
||||
c.z = a/b.z;
|
||||
return c;
|
||||
}
|
||||
__device__ friend Float3 operator*(const Float3 a, const Float3 b)
|
||||
{
|
||||
Float3 c;
|
||||
c.x = a.x*b.x;
|
||||
c.y = a.y*b.y;
|
||||
c.z = a.z*b.z;
|
||||
return c;
|
||||
}
|
||||
__device__ friend Float3 operator*(const Float3 a, const float b)
|
||||
{
|
||||
Float3 c;
|
||||
c.x = a.x*b;
|
||||
c.y = a.y*b;
|
||||
c.z = a.z*b;
|
||||
return c;
|
||||
}
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// RNG stuff
|
||||
|
||||
struct RNGState {
|
||||
unsigned int z1, z2, z3, z4;
|
||||
};
|
||||
|
||||
__device__
|
||||
static inline unsigned int random(RNGState * state)
|
||||
{
|
||||
unsigned int b;
|
||||
|
||||
b = ((state->z1 << 6) ^ state->z1) >> 13;
|
||||
state->z1 = ((state->z1 & 4294967294U) << 18) ^ b;
|
||||
b = ((state->z2 << 2) ^ state->z2) >> 27;
|
||||
state->z2 = ((state->z2 & 4294967288U) << 2) ^ b;
|
||||
b = ((state->z3 << 13) ^ state->z3) >> 21;
|
||||
state->z3 = ((state->z3 & 4294967280U) << 7) ^ b;
|
||||
b = ((state->z4 << 3) ^ state->z4) >> 12;
|
||||
state->z4 = ((state->z4 & 4294967168U) << 13) ^ b;
|
||||
return (state->z1 ^ state->z2 ^ state->z3 ^ state->z4);
|
||||
}
|
||||
|
||||
|
||||
__device__
|
||||
static inline float frandom(RNGState * state)
|
||||
{
|
||||
unsigned int irand = random(state);
|
||||
irand &= (1ul<<23)-1;
|
||||
return __int_as_float(0x3F800000 | irand)-1.0f;
|
||||
}
|
||||
|
||||
__device__
|
||||
static inline void seed_rng(RNGState * state,
|
||||
unsigned int seed) {
|
||||
state->z1 = seed;
|
||||
state->z2 = seed ^ 0xbeeff00d;
|
||||
state->z3 = ((seed & 0xfffful) << 16) | (seed >> 16);
|
||||
state->z4 = (((seed & 0xfful) << 24) | ((seed & 0xff00ul) << 8) |
|
||||
((seed & 0xff0000ul) >> 8) | (seed & 0xff000000ul) >> 24);
|
||||
}
|
||||
|
||||
|
||||
|
||||
struct Isect {
|
||||
float t;
|
||||
vec p;
|
||||
vec n;
|
||||
int hit;
|
||||
};
|
||||
|
||||
struct Sphere {
|
||||
vec center;
|
||||
float radius;
|
||||
};
|
||||
|
||||
struct Plane {
|
||||
vec p;
|
||||
vec n;
|
||||
};
|
||||
|
||||
struct Ray {
|
||||
vec org;
|
||||
vec dir;
|
||||
};
|
||||
|
||||
__device__
|
||||
static inline float dot(vec a, vec b) {
|
||||
return a.x * b.x + a.y * b.y + a.z * b.z;
|
||||
}
|
||||
|
||||
__device__
|
||||
static inline vec vcross(vec v0, vec v1) {
|
||||
vec ret;
|
||||
ret.x = v0.y * v1.z - v0.z * v1.y;
|
||||
ret.y = v0.z * v1.x - v0.x * v1.z;
|
||||
ret.z = v0.x * v1.y - v0.y * v1.x;
|
||||
return ret;
|
||||
}
|
||||
|
||||
__device__
|
||||
static inline void vnormalize(vec &v) {
|
||||
float len2 = dot(v, v);
|
||||
float invlen = rsqrt(len2);
|
||||
v = v*invlen;
|
||||
}
|
||||
|
||||
|
||||
__device__
|
||||
static inline void
|
||||
ray_plane_intersect(Isect &isect,const Ray &ray, const Plane &plane) {
|
||||
float d = -dot(plane.p, plane.n);
|
||||
float v = dot(ray.dir, plane.n);
|
||||
|
||||
if (abs(v) < 1.0f-17)
|
||||
return;
|
||||
else {
|
||||
float t = -(dot(ray.org, plane.n) + d) / v;
|
||||
|
||||
if ((t > 0.0) && (t < isect.t)) {
|
||||
isect.t = t;
|
||||
isect.hit = 1;
|
||||
isect.p = ray.org + ray.dir * t;
|
||||
isect.n = plane.n;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
__device__
|
||||
static inline void
|
||||
ray_sphere_intersect(Isect &isect,const Ray &ray, const Sphere &sphere) {
|
||||
vec rs = ray.org - sphere.center;
|
||||
|
||||
float B = dot(rs, ray.dir);
|
||||
float C = dot(rs, rs) - sphere.radius * sphere.radius;
|
||||
float D = B * B - C;
|
||||
|
||||
if (D > 0.) {
|
||||
float t = -B - sqrt(D);
|
||||
|
||||
if ((t > 0.0) && (t < isect.t)) {
|
||||
isect.t = t;
|
||||
isect.hit = 1;
|
||||
isect.p = ray.org + ray.dir * t;
|
||||
isect.n = isect.p - sphere.center;
|
||||
vnormalize(isect.n);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
__device__
|
||||
static inline void
|
||||
orthoBasis(vec basis[3], vec n) {
|
||||
basis[2] = n;
|
||||
basis[1].x = 0.0f; basis[1].y = 0.0f; basis[1].z = 0.0f;
|
||||
|
||||
if ((n.x < 0.6f) && (n.x > -0.6f)) {
|
||||
basis[1].x = 1.0f;
|
||||
} else if ((n.y < 0.6f) && (n.y > -0.6f)) {
|
||||
basis[1].y = 1.0f;
|
||||
} else if ((n.z < 0.6f) && (n.z > -0.6f)) {
|
||||
basis[1].z = 1.0f;
|
||||
} else {
|
||||
basis[1].x = 1.0f;
|
||||
}
|
||||
|
||||
basis[0] = vcross(basis[1], basis[2]);
|
||||
vnormalize(basis[0]);
|
||||
|
||||
basis[1] = vcross(basis[2], basis[0]);
|
||||
vnormalize(basis[1]);
|
||||
}
|
||||
|
||||
|
||||
__device__
|
||||
static inline float
|
||||
ambient_occlusion(Isect &isect, const Plane &plane, const Sphere spheres[3],
|
||||
RNGState &rngstate) {
|
||||
float eps = 0.0001f;
|
||||
vec p; //, n;
|
||||
vec basis[3];
|
||||
float occlusion = 0.0f;
|
||||
|
||||
p = isect.p + isect.n * eps;
|
||||
|
||||
orthoBasis(basis, isect.n);
|
||||
|
||||
const int ntheta = NAO_SAMPLES;
|
||||
const int nphi = NAO_SAMPLES;
|
||||
for ( int j = 0; j < ntheta; j++) {
|
||||
for ( int i = 0; i < nphi; i++) {
|
||||
Ray ray;
|
||||
Isect occIsect;
|
||||
|
||||
float theta = sqrt(frandom(&rngstate));
|
||||
float phi = 2.0f * M_PI * frandom(&rngstate);
|
||||
float x = cos(phi) * theta;
|
||||
float y = sin(phi) * theta;
|
||||
float z = sqrtf(1.0f - theta * theta);
|
||||
|
||||
// local . global
|
||||
float rx = x * basis[0].x + y * basis[1].x + z * basis[2].x;
|
||||
float ry = x * basis[0].y + y * basis[1].y + z * basis[2].y;
|
||||
float rz = x * basis[0].z + y * basis[1].z + z * basis[2].z;
|
||||
|
||||
ray.org = p;
|
||||
ray.dir.x = rx;
|
||||
ray.dir.y = ry;
|
||||
ray.dir.z = rz;
|
||||
|
||||
occIsect.t = 1.0f+17;
|
||||
occIsect.hit = 0;
|
||||
|
||||
for ( int snum = 0; snum < 3; ++snum)
|
||||
ray_sphere_intersect(occIsect, ray, spheres[snum]);
|
||||
ray_plane_intersect (occIsect, ray, plane);
|
||||
|
||||
if (occIsect.hit) occlusion += 1.0f;
|
||||
}
|
||||
}
|
||||
|
||||
occlusion = (ntheta * nphi - occlusion) / (float)(ntheta * nphi);
|
||||
return occlusion;
|
||||
}
|
||||
|
||||
|
||||
/* Compute the image for the scanlines from [y0,y1), for an overall image
|
||||
of width w and height h.
|
||||
*/
|
||||
__device__
|
||||
static inline void ao_tiles(
|
||||
int x0, int x1,
|
||||
int y0, int y1,
|
||||
int w, int h,
|
||||
int nsubsamples,
|
||||
float image[])
|
||||
{
|
||||
const Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } };
|
||||
const Sphere spheres[3] = {
|
||||
{ { -2.0f, 0.0f, -3.5f }, 0.5f },
|
||||
{ { -0.5f, 0.0f, -3.0f }, 0.5f },
|
||||
{ { 1.0f, 0.0f, -2.2f }, 0.5f } };
|
||||
RNGState rngstate;
|
||||
|
||||
seed_rng(&rngstate, programIndex + (y0 << (programIndex & 15)));
|
||||
float invSamples = 1.f / nsubsamples;
|
||||
for ( int y = y0; y < y1; y++)
|
||||
for ( int x = programIndex+x0; x < x1; x += programCount)
|
||||
{
|
||||
const int offset = 3 * (y * w + x);
|
||||
float res = 0.0f;
|
||||
|
||||
for ( int u = 0; u < nsubsamples; u++)
|
||||
for ( int v = 0; v < nsubsamples; v++)
|
||||
{
|
||||
float du = (float)u * invSamples, dv = (float)v * invSamples;
|
||||
|
||||
// Figure out x,y pixel in NDC
|
||||
float px = (x + du - (w / 2.0f)) / (w / 2.0f);
|
||||
float py = -(y + dv - (h / 2.0f)) / (h / 2.0f);
|
||||
float ret = 0.f;
|
||||
Ray ray;
|
||||
Isect isect;
|
||||
|
||||
ray.org.x = 0.0f;
|
||||
ray.org.y = 0.0f;
|
||||
ray.org.z = 0.0f;
|
||||
|
||||
// Poor man's perspective projection
|
||||
ray.dir.x = px;
|
||||
ray.dir.y = py;
|
||||
ray.dir.z = -1.0;
|
||||
vnormalize(ray.dir);
|
||||
|
||||
isect.t = 1.0e+17;
|
||||
isect.hit = 0;
|
||||
|
||||
for ( int snum = 0; snum < 3; ++snum)
|
||||
ray_sphere_intersect(isect, ray, spheres[snum]);
|
||||
ray_plane_intersect(isect, ray, plane);
|
||||
|
||||
// Note use of 'coherent' if statement; the set of rays we
|
||||
// trace will often all hit or all miss the scene
|
||||
if (isect.hit) {
|
||||
ret = ambient_occlusion(isect, plane, spheres, rngstate);
|
||||
ret *= invSamples * invSamples;
|
||||
res += ret;
|
||||
}
|
||||
}
|
||||
|
||||
if (x < x1)
|
||||
{
|
||||
image[offset ] = res;
|
||||
image[offset+1] = res;
|
||||
image[offset+2] = res;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
#define TILEX 64
|
||||
#define TILEY 4
|
||||
|
||||
extern "C"
|
||||
__global__
|
||||
void ao_task( int width, int height,
|
||||
int nsubsamples, float image[])
|
||||
{
|
||||
if (taskIndex0 >= taskCount0) return;
|
||||
if (taskIndex1 >= taskCount1) return;
|
||||
|
||||
const int x0 = taskIndex0 * TILEX;
|
||||
const int x1 = min(x0 + TILEX, width);
|
||||
|
||||
const int y0 = taskIndex1 * TILEY;
|
||||
const int y1 = min(y0 + TILEY, height);
|
||||
ao_tiles(x0,x1,y0,y1, width, height, nsubsamples, image);
|
||||
}
|
||||
|
||||
extern "C"
|
||||
__global__
|
||||
void ao_ispc_tasks___export(
|
||||
int w, int h, int nsubsamples,
|
||||
float image[])
|
||||
{
|
||||
const int ntilex = (w+TILEX-1)/TILEX;
|
||||
const int ntiley = (h+TILEY-1)/TILEY;
|
||||
launch(ntilex,ntiley,1,ao_task)(w,h,nsubsamples,image);
|
||||
cudaDeviceSynchronize();
|
||||
}
|
||||
|
||||
extern "C"
|
||||
__host__ void ao_ispc_tasks(
|
||||
int w, int h, int nsubsamples,
|
||||
float image[])
|
||||
{
|
||||
ao_ispc_tasks___export<<<1,32>>>(w,h,nsubsamples,image);
|
||||
cudaDeviceSynchronize();
|
||||
}
|
||||
129
examples/portable/common_gpu.mk
Normal file
129
examples/portable/common_gpu.mk
Normal file
@@ -0,0 +1,129 @@
|
||||
NVCC_SRC=../../util/nvcc_helpers.cu
|
||||
NVCC_OBJS=objs_gpu/nvcc_helpers_nvcc.o
|
||||
#
|
||||
CXX=g++ -ffast-math
|
||||
CXXFLAGS=-O3 -I$(CUDATK)/include -Iobjs_gpu/ -D_CUDA_ -I../../util -I../../
|
||||
#
|
||||
NVCC=nvcc
|
||||
NVCC_FLAGS+=-O3 -arch=sm_35 -D_CUDA_ -I../../util -Xptxas=-v -Iobjs_gpu/
|
||||
ifdef PTXCC_REGMAX
|
||||
NVCC_FLAGS += --maxrregcount=$(PTXCC_REGMAX)
|
||||
endif
|
||||
NVCC_FLAGS+=--use_fast_math
|
||||
#
|
||||
LD=nvcc
|
||||
LDFLAGS=-lcudart -lcudadevrt -arch=sm_35
|
||||
#
|
||||
PTXCC=$(ISPC_HOME)/ptxtools/ptxcc
|
||||
PTXCC_FLAGS+= -Xptxas=-v
|
||||
ifdef PTXCC_REGMAX
|
||||
PTXCC_FLAGS += -maxrregcount=$(PTXCC_REGMAX)
|
||||
endif
|
||||
|
||||
#
|
||||
ISPC=ispc
|
||||
ISPC_FLAGS+=-O3 --math-lib=fast --target=nvptx --opt=fast-math
|
||||
#
|
||||
#
|
||||
#
|
||||
ISPC_LLVM_OBJS=$(ISPC_SRC:%.ispc=objs_gpu/%_llvm_ispc.o)
|
||||
ISPC_NVVM_OBJS=$(ISPC_SRC:%.ispc=objs_gpu/%_nvvm_ispc.o)
|
||||
ISPC_BCS=$(ISPC_SRC:%.ispc=objs_gpu/%_ispc.bc)
|
||||
ISPC_LLVM_PTX=$(ISPC_SRC:%.ispc=objs_gpu/%_llvm_ispc.ptx)
|
||||
ISPC_NVVM_PTX=$(ISPC_SRC:%.ispc=objs_gpu/%_nvvm_ispc.ptx)
|
||||
ISPC_HEADERS=$(ISPC_SRC:%.ispc=objs_gpu/%_ispc.h)
|
||||
CXX_OBJS=$(CXX_SRC:%.cpp=objs_gpu/%_gcc.o)
|
||||
CU_OBJS=$(CU_SRC:%.cu=objs_gpu/%_cu.o)
|
||||
#NVCC_OBJS=$(NVCC_SRC:%.cu=objs_gpu/%_nvcc.o)
|
||||
|
||||
CXX_SRC+=ispc_malloc.cpp
|
||||
CXX_OBJS+=objs_gpu/ispc_malloc_gcc.o
|
||||
|
||||
PTXGEN = $(ISPC_HOME)/ptxtools/ptxgen
|
||||
PTXGEN += --use_fast_math
|
||||
|
||||
LLVM32=$(HOME)/usr/local/llvm/bin-3.2
|
||||
LLVM32DIS=$(LLVM32)/bin/llvm-dis
|
||||
|
||||
LLC=$(HOME)/usr/local/llvm/bin-trunk/bin/llc
|
||||
LLC_FLAGS=-march=nvptx64 -mcpu=sm_35
|
||||
|
||||
# .SUFFIXES: .bc .o .cu
|
||||
|
||||
ifdef LLVM_GPU
|
||||
OBJSgpu_llvm=$(ISPC_LLVM_OBJS) $(CXX_OBJS) $(NVCC_OBJS)
|
||||
PROGgpu_llvm=$(PROG)_llvm_gpu
|
||||
else
|
||||
ISPC_LLVM_PTX=
|
||||
endif
|
||||
|
||||
|
||||
ifdef NVVM_GPU
|
||||
OBJSgpu_nvvm=$(ISPC_NVVM_OBJS) $(CXX_OBJS) $(NVCC_OBJS) $(ISPC_LVVM_PTX)
|
||||
PROGgpu_nvvm=$(PROG)_nvvm_gpu
|
||||
else
|
||||
ISPC_NVVM_PTX=
|
||||
endif
|
||||
|
||||
ifdef CU_SRC
|
||||
OBJScu=$(CU_OBJS) $(CXX_OBJS) $(NVCC_OBJS)
|
||||
PROGcu=$(PROG)_cu
|
||||
endif
|
||||
|
||||
|
||||
all: dirs \
|
||||
$(PROGgpu_nvvm) \
|
||||
$(PROGgpu_llvm) \
|
||||
$(PROGcu) $(ISPC_BC) $(ISPC_HEADERS) $(ISPC_NVVM_PTX) $(ISPC_LLVM_PTX)
|
||||
|
||||
dirs:
|
||||
/bin/mkdir -p objs_gpu/
|
||||
|
||||
objs_gpu/%.cpp objs_gpu/%.o objs_gpu/%.h: dirs
|
||||
|
||||
clean:
|
||||
/bin/rm -rf $(PROGgpu_nvvm) $(PROGgpu_llvm) $(PROGcu) objs_gpu
|
||||
|
||||
# generate binaries
|
||||
$(PROGgpu_llvm): $(OBJSgpu_llvm)
|
||||
$(LD) -o $@ $^ $(LDFLAGS)
|
||||
$(PROGgpu_nvvm): $(OBJSgpu_nvvm)
|
||||
$(LD) -o $@ $^ $(LDFLAGS)
|
||||
$(PROGcu): $(OBJScu)
|
||||
$(LD) -o $@ $^ $(LDFLAGS)
|
||||
|
||||
# compile C++ code
|
||||
objs_gpu/%_gcc.o: %.cpp $(ISPC_HEADERS)
|
||||
$(CXX) $(CXXFLAGS) -o $@ -c $<
|
||||
objs_gpu/%_gcc.o: ../../util/%.cpp
|
||||
$(CXX) $(CXXFLAGS) -o $@ -c $<
|
||||
|
||||
# CUDA helpers
|
||||
objs_gpu/%_cu.o: %.cu $(ISPC_HEADERS)
|
||||
$(NVCC) $(NVCC_FLAGS) -o $@ -dc $<
|
||||
|
||||
# compile CUDA code
|
||||
objs_gpu/%_nvcc.o: ../../util/%.cu
|
||||
$(NVCC) $(NVCC_FLAGS) -o $@ -c $<
|
||||
objs_gpu/%_nvcc.o: %.cu
|
||||
$(NVCC) $(NVCC_FLAGS) -o $@ -c $<
|
||||
|
||||
# compile ISPC to LLVM BC
|
||||
objs_gpu/%_ispc.h objs_gpu/%_ispc.bc: %.ispc
|
||||
$(ISPC) $(ISPC_FLAGS) --emit-llvm -h objs_gpu/$*_ispc.h -o objs_gpu/$*_ispc.bc $<
|
||||
|
||||
# generate PTX from LLVM BC
|
||||
objs_gpu/%_llvm_ispc.ptx: objs_gpu/%_ispc.bc
|
||||
$(LLC) $(LLC_FLAGS) -o $@ $<
|
||||
objs_gpu/%_nvvm_ispc.ptx: objs_gpu/%_ispc.bc
|
||||
$(LLVM32DIS) $< -o objs_gpu/$*_ispc-ll32.ll
|
||||
$(PTXGEN) objs_gpu/$*_ispc-ll32.ll -o $@
|
||||
|
||||
# generate an object file from PTX
|
||||
objs_gpu/%_ispc.o: objs_gpu/%_ispc.ptx
|
||||
$(PTXCC) $< -Xnvcc="$(PTXCC_FLAGS)" -o $@
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user