Merge pull request #1 from dbabokin/egaburov-nvptx_clean

Egaburov nvptx clean
This commit is contained in:
Evghenii Gaburov
2014-07-09 07:48:01 +02:00
33 changed files with 577 additions and 577 deletions

View File

@@ -1,5 +1,5 @@
/* /*
Copyright (c) 2010-2011, Intel Corporation Copyright (c) 2010-2014, Intel Corporation
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
@@ -28,7 +28,7 @@
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ */
#ifdef _MSC_VER #ifdef _MSC_VER
@@ -141,12 +141,12 @@ int main(int argc, char **argv)
} }
// Report results and save image // Report results and save image
printf("[aobench ispc + tasks]:\t\t[%.3f] msec (%d x %d image)\n", printf("[aobench ispc + tasks]:\t\t[%.3f] msec (%d x %d image)\n",
minTimeISPCTasks, width, height); minTimeISPCTasks, width, height);
savePPM("ao-ispc-tasks.ppm", width, height); savePPM("ao-ispc-tasks.ppm", width, height);
delete img; delete img;
delete fimg; delete fimg;
return 0; return 0;
} }

View File

@@ -1,6 +1,6 @@
// -*- mode: c++ -*- // -*- mode: c++ -*-
/* /*
Copyright (c) 2010-2011, Intel Corporation Copyright (c) 2010-2014, Intel Corporation
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
@@ -29,7 +29,7 @@ met:
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ */
/* /*
Based on Syoyo Fujita's aobench: http://code.google.com/p/aobench Based on Syoyo Fujita's aobench: http://code.google.com/p/aobench
@@ -37,7 +37,7 @@ met:
#include "cuda_helpers.cuh" #include "cuda_helpers.cuh"
#define NAO_SAMPLES 8 #define NAO_SAMPLES 8
//#define M_PI 3.1415926535f //#define M_PI 3.1415926535f
#define vec Float3 #define vec Float3
@@ -109,7 +109,7 @@ static inline unsigned int random(RNGState * state)
b = ((state->z1 << 6) ^ state->z1) >> 13; b = ((state->z1 << 6) ^ state->z1) >> 13;
state->z1 = ((state->z1 & 4294967294U) << 18) ^ b; state->z1 = ((state->z1 & 4294967294U) << 18) ^ b;
b = ((state->z2 << 2) ^ state->z2) >> 27; b = ((state->z2 << 2) ^ state->z2) >> 27;
state->z2 = ((state->z2 & 4294967288U) << 2) ^ b; state->z2 = ((state->z2 & 4294967288U) << 2) ^ b;
b = ((state->z3 << 13) ^ state->z3) >> 21; b = ((state->z3 << 13) ^ state->z3) >> 21;
state->z3 = ((state->z3 & 4294967280U) << 7) ^ b; state->z3 = ((state->z3 & 4294967280U) << 7) ^ b;
@@ -128,7 +128,7 @@ static inline float frandom(RNGState * state)
} }
__device__ __device__
static inline void seed_rng(RNGState * state, static inline void seed_rng(RNGState * state,
unsigned int seed) { unsigned int seed) {
state->z1 = seed; state->z1 = seed;
state->z2 = seed ^ 0xbeeff00d; state->z2 = seed ^ 0xbeeff00d;
@@ -143,7 +143,7 @@ struct Isect {
float t; float t;
vec p; vec p;
vec n; vec n;
int hit; int hit;
}; };
struct Sphere { struct Sphere {
@@ -190,7 +190,7 @@ ray_plane_intersect(Isect &isect,const Ray &ray, const Plane &plane) {
float v = dot(ray.dir, plane.n); float v = dot(ray.dir, plane.n);
#if 0 #if 0
if (abs(v) < 1.0f-17) if (abs(v) < 1.0f-17)
return; return;
else { else {
float t = -(dot(ray.org, plane.n) + d) / v; float t = -(dot(ray.org, plane.n) + d) / v;
@@ -238,7 +238,7 @@ ray_sphere_intersect(Isect &isect,const Ray &ray, const Sphere &sphere) {
} }
} }
#else #else
if (D <= 0.0f) if (D <= 0.0f)
return; return;
float t = -B - sqrt(D); float t = -B - sqrt(D);
@@ -319,8 +319,8 @@ ambient_occlusion(Isect &isect, const Plane &plane, const Sphere spheres[3],
occIsect.hit = 0; occIsect.hit = 0;
for ( int snum = 0; snum < 3; ++snum) for ( int snum = 0; snum < 3; ++snum)
ray_sphere_intersect(occIsect, ray, spheres[snum]); ray_sphere_intersect(occIsect, ray, spheres[snum]);
ray_plane_intersect (occIsect, ray, plane); ray_plane_intersect (occIsect, ray, plane);
if (occIsect.hit) occlusion += 1.0f; if (occIsect.hit) occlusion += 1.0f;
} }
@@ -337,10 +337,10 @@ ambient_occlusion(Isect &isect, const Plane &plane, const Sphere spheres[3],
__device__ __device__
static inline void ao_tiles( static inline void ao_tiles(
int x0, int x1, int x0, int x1,
int y0, int y1, int y0, int y1,
int w, int h, int w, int h,
int nsubsamples, int nsubsamples,
float image[]) float image[])
{ {
const Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } }; const Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } };
const Sphere spheres[3] = { const Sphere spheres[3] = {
@@ -411,8 +411,8 @@ static inline void ao_tiles(
extern "C" extern "C"
__global__ __global__
void ao_task( int width, int height, void ao_task( int width, int height,
int nsubsamples, float image[]) int nsubsamples, float image[])
{ {
if (taskIndex0 >= taskCount0) return; if (taskIndex0 >= taskCount0) return;
if (taskIndex1 >= taskCount1) return; if (taskIndex1 >= taskCount1) return;
@@ -428,8 +428,8 @@ void ao_task( int width, int height,
extern "C" extern "C"
__global__ __global__
void ao_ispc_tasks___export( void ao_ispc_tasks___export(
int w, int h, int nsubsamples, int w, int h, int nsubsamples,
float image[]) float image[])
{ {
const int ntilex = (w+TILEX-1)/TILEX; const int ntilex = (w+TILEX-1)/TILEX;
const int ntiley = (h+TILEY-1)/TILEY; const int ntiley = (h+TILEY-1)/TILEY;
@@ -439,8 +439,8 @@ void ao_ispc_tasks___export(
extern "C" extern "C"
__host__ void ao_ispc_tasks( __host__ void ao_ispc_tasks(
int w, int h, int nsubsamples, int w, int h, int nsubsamples,
float image[]) float image[])
{ {
ao_ispc_tasks___export<<<1,32>>>(w,h,nsubsamples,image); ao_ispc_tasks___export<<<1,32>>>(w,h,nsubsamples,image);
cudaDeviceSynchronize(); cudaDeviceSynchronize();

View File

@@ -1,6 +1,6 @@
// -*- mode: c++ -*- // -*- mode: c++ -*-
/* /*
Copyright (c) 2010-2011, Intel Corporation Copyright (c) 2010-2014, Intel Corporation
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
@@ -29,13 +29,13 @@
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ */
/* /*
Based on Syoyo Fujita's aobench: http://code.google.com/p/aobench Based on Syoyo Fujita's aobench: http://code.google.com/p/aobench
*/ */
#define NAO_SAMPLES 8 #define NAO_SAMPLES 8
#define M_PI 3.1415926535f #define M_PI 3.1415926535f
typedef float<3> vec; typedef float<3> vec;
@@ -50,7 +50,7 @@ struct Isect {
float t; float t;
vec p; vec p;
vec n; vec n;
int hit; int hit;
}; };
struct Sphere { struct Sphere {
@@ -94,7 +94,7 @@ ray_plane_intersect(Isect &isect, Ray &ray, const Plane &plane) {
float v = dot(ray.dir, plane.n); float v = dot(ray.dir, plane.n);
#if 0 #if 0
cif (abs(v) < 1.0e-17) cif (abs(v) < 1.0e-17)
return; return;
else { else {
float t = -(dot(ray.org, plane.n) + d) / v; float t = -(dot(ray.org, plane.n) + d) / v;
@@ -141,7 +141,7 @@ ray_sphere_intersect(Isect &isect, Ray &ray, const Sphere &sphere) {
} }
} }
#else #else
cif (D <=0.0f) cif (D <=0.0f)
return; return;
float t = -B - sqrt(D); float t = -B - sqrt(D);
@@ -220,8 +220,8 @@ ambient_occlusion(Isect &isect, const Plane &plane, const Sphere spheres[3],
occIsect.hit = 0; occIsect.hit = 0;
for (uniform int snum = 0; snum < 3; ++snum) for (uniform int snum = 0; snum < 3; ++snum)
ray_sphere_intersect(occIsect, ray, spheres[snum]); ray_sphere_intersect(occIsect, ray, spheres[snum]);
ray_plane_intersect (occIsect, ray, plane); ray_plane_intersect (occIsect, ray, plane);
if (occIsect.hit) occlusion += 1.0; if (occIsect.hit) occlusion += 1.0;
} }
@@ -233,10 +233,10 @@ ambient_occlusion(Isect &isect, const Plane &plane, const Sphere spheres[3],
static inline void ao_tiles( static inline void ao_tiles(
uniform int x0, uniform int x1, uniform int x0, uniform int x1,
uniform int y0, uniform int y1, uniform int y0, uniform int y1,
uniform int w, uniform int h, uniform int w, uniform int h,
uniform int nsubsamples, uniform int nsubsamples,
uniform float image[]) uniform float image[])
{ {
const Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } }; const Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } };
const Sphere spheres[3] = { const Sphere spheres[3] = {
@@ -306,7 +306,7 @@ static inline void ao_tiles(
#define TILEX max(64,programCount*2) #define TILEX max(64,programCount*2)
#define TILEY 4 #define TILEY 4
export void ao_ispc(uniform int w, uniform int h, uniform int nsubsamples, export void ao_ispc(uniform int w, uniform int h, uniform int nsubsamples,
uniform float image[]) { uniform float image[]) {
const uniform int x0 = 0; const uniform int x0 = 0;
const uniform int x1 = w; const uniform int x1 = w;
@@ -315,8 +315,8 @@ export void ao_ispc(uniform int w, uniform int h, uniform int nsubsamples,
ao_tiles(x0,x1,y0,y1, w, h, nsubsamples, image); ao_tiles(x0,x1,y0,y1, w, h, nsubsamples, image);
} }
void task ao_task(uniform int width, uniform int height, void task ao_task(uniform int width, uniform int height,
uniform int nsubsamples, uniform float image[]) uniform int nsubsamples, uniform float image[])
{ {
if (taskIndex0 >= taskCount0) return; if (taskIndex0 >= taskCount0) return;
if (taskIndex1 >= taskCount1) return; if (taskIndex1 >= taskCount1) return;
@@ -330,8 +330,8 @@ void task ao_task(uniform int width, uniform int height,
} }
export void ao_ispc_tasks(uniform int w, uniform int h, uniform int nsubsamples, export void ao_ispc_tasks(uniform int w, uniform int h, uniform int nsubsamples,
uniform float image[]) uniform float image[])
{ {
const uniform int ntilex = (w+TILEX-1)/TILEX; const uniform int ntilex = (w+TILEX-1)/TILEX;
const uniform int ntiley = (h+TILEY-1)/TILEY; const uniform int ntiley = (h+TILEY-1)/TILEY;

View File

@@ -1,5 +1,5 @@
/* /*
Copyright (c) 2011, Intel Corporation Copyright (c) 2011-2014, Intel Corporation
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
@@ -28,7 +28,7 @@
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ */
#ifdef _MSC_VER #ifdef _MSC_VER
@@ -145,13 +145,13 @@ CreateInputDataFromFile(const char *path) {
} }
// Load data chunk and update pointers // Load data chunk and update pointers
input->chunk = (uint8_t *)lAlignedMalloc(input->header.inputDataChunkSize, input->chunk = (uint8_t *)lAlignedMalloc(input->header.inputDataChunkSize,
ALIGNMENT_BYTES); ALIGNMENT_BYTES);
if (fread(input->chunk, input->header.inputDataChunkSize, 1, in) != 1) { if (fread(input->chunk, input->header.inputDataChunkSize, 1, in) != 1) {
fprintf(stderr, "Preumature EOF reading file \"%s\"\n", path); fprintf(stderr, "Preumature EOF reading file \"%s\"\n", path);
return NULL; return NULL;
} }
input->arrays.zBuffer = input->arrays.zBuffer =
(float *)&input->chunk[input->header.inputDataArrayOffsets[idaZBuffer]]; (float *)&input->chunk[input->header.inputDataArrayOffsets[idaZBuffer]];
input->arrays.normalEncoded_x = input->arrays.normalEncoded_x =
@@ -199,21 +199,21 @@ void WriteFrame(const char *filename, const InputData *input,
const Framebuffer &framebuffer) { const Framebuffer &framebuffer) {
// Deswizzle and copy to RGBA output // Deswizzle and copy to RGBA output
// Doesn't need to be fast... only happens once // Doesn't need to be fast... only happens once
size_t imageBytes = 3 * input->header.framebufferWidth * size_t imageBytes = 3 * input->header.framebufferWidth *
input->header.framebufferHeight; input->header.framebufferHeight;
uint8_t* framebufferAOS = (uint8_t *)lAlignedMalloc(imageBytes, ALIGNMENT_BYTES); uint8_t* framebufferAOS = (uint8_t *)lAlignedMalloc(imageBytes, ALIGNMENT_BYTES);
memset(framebufferAOS, 0, imageBytes); memset(framebufferAOS, 0, imageBytes);
for (int i = 0; i < input->header.framebufferWidth * for (int i = 0; i < input->header.framebufferWidth *
input->header.framebufferHeight; ++i) { input->header.framebufferHeight; ++i) {
framebufferAOS[3 * i + 0] = framebuffer.r[i]; framebufferAOS[3 * i + 0] = framebuffer.r[i];
framebufferAOS[3 * i + 1] = framebuffer.g[i]; framebufferAOS[3 * i + 1] = framebuffer.g[i];
framebufferAOS[3 * i + 2] = framebuffer.b[i]; framebufferAOS[3 * i + 2] = framebuffer.b[i];
} }
// Write out simple PPM file // Write out simple PPM file
FILE *out = fopen(filename, "wb"); FILE *out = fopen(filename, "wb");
fprintf(out, "P6 %d %d 255\n", input->header.framebufferWidth, fprintf(out, "P6 %d %d 255\n", input->header.framebufferWidth,
input->header.framebufferHeight); input->header.framebufferHeight);
fwrite(framebufferAOS, imageBytes, 1, out); fwrite(framebufferAOS, imageBytes, 1, out);
fclose(out); fclose(out);

View File

@@ -1,5 +1,5 @@
/* /*
Copyright (c) 2011, Intel Corporation Copyright (c) 2011-2014, Intel Corporation
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
@@ -28,7 +28,7 @@
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ */
#ifndef DEFERRED_H #ifndef DEFERRED_H

View File

@@ -1,5 +1,5 @@
/* /*
Copyright (c) 2011, Intel Corporation Copyright (c) 2011-2014, Intel Corporation
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
@@ -28,7 +28,7 @@
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ */
#include "deferred.h" #include "deferred.h"
@@ -147,7 +147,7 @@ ComputeZBoundsRow(int tileY, int tileWidth, int tileHeight,
float minZ, maxZ; float minZ, maxZ;
ComputeZBounds(tileX * tileWidth, tileX * tileWidth + tileWidth, ComputeZBounds(tileX * tileWidth, tileX * tileWidth + tileWidth,
tileY * tileHeight, tileY * tileHeight + tileHeight, tileY * tileHeight, tileY * tileHeight + tileHeight,
zBuffer, gBufferWidth, cameraProj_33, cameraProj_43, zBuffer, gBufferWidth, cameraProj_33, cameraProj_43,
cameraNear, cameraFar, &minZ, &maxZ); cameraNear, cameraFar, &minZ, &maxZ);
minZArray[tileX] = minZ; minZArray[tileX] = minZ;
maxZArray[tileX] = maxZ; maxZArray[tileX] = maxZ;
@@ -167,7 +167,7 @@ public:
{ {
mNumTilesX = gBufferWidth / mTileWidth; mNumTilesX = gBufferWidth / mTileWidth;
mNumTilesY = gBufferHeight / mTileHeight; mNumTilesY = gBufferHeight / mTileHeight;
// Allocate arrays // Allocate arrays
mMinZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16); mMinZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16);
mMaxZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16); mMaxZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16);
@@ -213,7 +213,7 @@ public:
float minZ = mMinZArrays[srcLevel][(srcY) * srcTilesX + (srcX)]; float minZ = mMinZArrays[srcLevel][(srcY) * srcTilesX + (srcX)];
float maxZ = mMaxZArrays[srcLevel][(srcY) * srcTilesX + (srcX)]; float maxZ = mMaxZArrays[srcLevel][(srcY) * srcTilesX + (srcX)];
if (srcX + 1 < srcTilesX) { if (srcX + 1 < srcTilesX) {
minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY) * srcTilesX + minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY) * srcTilesX +
(srcX + 1)]); (srcX + 1)]);
maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY) * srcTilesX + maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY) * srcTilesX +
(srcX + 1)]); (srcX + 1)]);
@@ -243,7 +243,7 @@ public:
lAlignedFree(mMaxZArrays[i]); lAlignedFree(mMaxZArrays[i]);
} }
lAlignedFree(mMinZArrays); lAlignedFree(mMinZArrays);
lAlignedFree(mMaxZArrays); lAlignedFree(mMaxZArrays);
} }
int Levels() const { return mLevels; } int Levels() const { return mLevels; }
@@ -277,9 +277,9 @@ private:
static MinMaxZTree *gMinMaxZTree = 0; static MinMaxZTree *gMinMaxZTree = 0;
void InitDynamicC(InputData *input) { void InitDynamicC(InputData *input) {
gMinMaxZTree = gMinMaxZTree =
new MinMaxZTree(MIN_TILE_WIDTH, MIN_TILE_HEIGHT, DYNAMIC_TREE_LEVELS, new MinMaxZTree(MIN_TILE_WIDTH, MIN_TILE_HEIGHT, DYNAMIC_TREE_LEVELS,
input->header.framebufferWidth, input->header.framebufferWidth,
input->header.framebufferHeight); input->header.framebufferHeight);
} }
@@ -311,7 +311,7 @@ SplitTileMinMax(
{ {
float gBufferScale_x = 0.5f * (float)gBufferWidth; float gBufferScale_x = 0.5f * (float)gBufferWidth;
float gBufferScale_y = 0.5f * (float)gBufferHeight; float gBufferScale_y = 0.5f * (float)gBufferHeight;
float frustumPlanes_xy[2] = { -(cameraProj_11 * gBufferScale_x), float frustumPlanes_xy[2] = { -(cameraProj_11 * gBufferScale_x),
(cameraProj_22 * gBufferScale_y) }; (cameraProj_22 * gBufferScale_y) };
float frustumPlanes_z[2] = { tileMidX - gBufferScale_x, float frustumPlanes_z[2] = { tileMidX - gBufferScale_x,
@@ -319,7 +319,7 @@ SplitTileMinMax(
for (int i = 0; i < 2; ++i) { for (int i = 0; i < 2; ++i) {
// Normalize // Normalize
float norm = 1.f / sqrtf(frustumPlanes_xy[i] * frustumPlanes_xy[i] + float norm = 1.f / sqrtf(frustumPlanes_xy[i] * frustumPlanes_xy[i] +
frustumPlanes_z[i] * frustumPlanes_z[i]); frustumPlanes_z[i] * frustumPlanes_z[i]);
frustumPlanes_xy[i] *= norm; frustumPlanes_xy[i] *= norm;
frustumPlanes_z[i] *= norm; frustumPlanes_z[i] *= norm;
@@ -340,23 +340,23 @@ SplitTileMinMax(
float light_positionView_z = light_positionView_z_array[lightIndex]; float light_positionView_z = light_positionView_z_array[lightIndex];
float light_attenuationEnd = light_attenuationEnd_array[lightIndex]; float light_attenuationEnd = light_attenuationEnd_array[lightIndex];
float light_attenuationEndNeg = -light_attenuationEnd; float light_attenuationEndNeg = -light_attenuationEnd;
// Test lights again against subtile z bounds // Test lights again against subtile z bounds
bool inFrustum[4]; bool inFrustum[4];
inFrustum[0] = (light_positionView_z - subtileMinZ[0] >= light_attenuationEndNeg) && inFrustum[0] = (light_positionView_z - subtileMinZ[0] >= light_attenuationEndNeg) &&
(subtileMaxZ[0] - light_positionView_z >= light_attenuationEndNeg); (subtileMaxZ[0] - light_positionView_z >= light_attenuationEndNeg);
inFrustum[1] = (light_positionView_z - subtileMinZ[1] >= light_attenuationEndNeg) && inFrustum[1] = (light_positionView_z - subtileMinZ[1] >= light_attenuationEndNeg) &&
(subtileMaxZ[1] - light_positionView_z >= light_attenuationEndNeg); (subtileMaxZ[1] - light_positionView_z >= light_attenuationEndNeg);
inFrustum[2] = (light_positionView_z - subtileMinZ[2] >= light_attenuationEndNeg) && inFrustum[2] = (light_positionView_z - subtileMinZ[2] >= light_attenuationEndNeg) &&
(subtileMaxZ[2] - light_positionView_z >= light_attenuationEndNeg); (subtileMaxZ[2] - light_positionView_z >= light_attenuationEndNeg);
inFrustum[3] = (light_positionView_z - subtileMinZ[3] >= light_attenuationEndNeg) && inFrustum[3] = (light_positionView_z - subtileMinZ[3] >= light_attenuationEndNeg) &&
(subtileMaxZ[3] - light_positionView_z >= light_attenuationEndNeg); (subtileMaxZ[3] - light_positionView_z >= light_attenuationEndNeg);
float dx = light_positionView_z * frustumPlanes_z[0] + float dx = light_positionView_z * frustumPlanes_z[0] +
light_positionView_x * frustumPlanes_xy[0]; light_positionView_x * frustumPlanes_xy[0];
float dy = light_positionView_z * frustumPlanes_z[1] + float dy = light_positionView_z * frustumPlanes_z[1] +
light_positionView_y * frustumPlanes_xy[1]; light_positionView_y * frustumPlanes_xy[1];
if (fabsf(dx) > light_attenuationEnd) { if (fabsf(dx) > light_attenuationEnd) {
bool positiveX = dx > 0.0f; bool positiveX = dx > 0.0f;
inFrustum[0] = inFrustum[0] && positiveX; // 00 subtile inFrustum[0] = inFrustum[0] && positiveX; // 00 subtile
@@ -423,13 +423,13 @@ half_to_float_fast(uint16_t h) {
uint32_t hm = h & (int32_t)0x03FFu; // Pick off mantissa bits uint32_t hm = h & (int32_t)0x03FFu; // Pick off mantissa bits
// sign // sign
uint32_t xs = ((uint32_t) hs) << 16; uint32_t xs = ((uint32_t) hs) << 16;
// Exponent: unbias the halfp, then bias the single // Exponent: unbias the halfp, then bias the single
int32_t xes = ((int32_t) (he >> 10)) - 15 + 127; int32_t xes = ((int32_t) (he >> 10)) - 15 + 127;
// Exponent // Exponent
uint32_t xe = (uint32_t) (xes << 23); uint32_t xe = (uint32_t) (xes << 23);
// Mantissa // Mantissa
uint32_t xm = ((uint32_t) hm) << 13; uint32_t xm = ((uint32_t) hm) << 13;
uint32_t bits = (xs | xe | xm); uint32_t bits = (xs | xe | xm);
float *fp = reinterpret_cast<float *>(&bits); float *fp = reinterpret_cast<float *>(&bits);
@@ -470,13 +470,13 @@ ShadeTileC(
} else { } else {
float twoOverGBufferWidth = 2.0f / gBufferWidth; float twoOverGBufferWidth = 2.0f / gBufferWidth;
float twoOverGBufferHeight = 2.0f / gBufferHeight; float twoOverGBufferHeight = 2.0f / gBufferHeight;
for (int32_t y = tileStartY; y < tileEndY; ++y) { for (int32_t y = tileStartY; y < tileEndY; ++y) {
float positionScreen_y = -(((0.5f + y) * twoOverGBufferHeight) - 1.f); float positionScreen_y = -(((0.5f + y) * twoOverGBufferHeight) - 1.f);
for (int32_t x = tileStartX; x < tileEndX; ++x) { for (int32_t x = tileStartX; x < tileEndX; ++x) {
int32_t gBufferOffset = y * gBufferWidth + x; int32_t gBufferOffset = y * gBufferWidth + x;
// Reconstruct position and (negative) view vector from G-buffer // Reconstruct position and (negative) view vector from G-buffer
float surface_positionView_x, surface_positionView_y, surface_positionView_z; float surface_positionView_x, surface_positionView_y, surface_positionView_z;
float Vneg_x, Vneg_y, Vneg_z; float Vneg_x, Vneg_y, Vneg_z;
@@ -485,70 +485,70 @@ ShadeTileC(
// Compute screen/clip-space position // Compute screen/clip-space position
// NOTE: Mind DX11 viewport transform and pixel center! // NOTE: Mind DX11 viewport transform and pixel center!
float positionScreen_x = (0.5f + (float)(x)) * float positionScreen_x = (0.5f + (float)(x)) *
twoOverGBufferWidth - 1.0f; twoOverGBufferWidth - 1.0f;
// Unproject depth buffer Z value into view space // Unproject depth buffer Z value into view space
surface_positionView_z = cameraProj_43 / (z - cameraProj_33); surface_positionView_z = cameraProj_43 / (z - cameraProj_33);
surface_positionView_x = positionScreen_x * surface_positionView_z / surface_positionView_x = positionScreen_x * surface_positionView_z /
cameraProj_11; cameraProj_11;
surface_positionView_y = positionScreen_y * surface_positionView_z / surface_positionView_y = positionScreen_y * surface_positionView_z /
cameraProj_22; cameraProj_22;
// We actually end up with a vector pointing *at* the // We actually end up with a vector pointing *at* the
// surface (i.e. the negative view vector) // surface (i.e. the negative view vector)
normalize3(surface_positionView_x, surface_positionView_y, normalize3(surface_positionView_x, surface_positionView_y,
surface_positionView_z, Vneg_x, Vneg_y, Vneg_z); surface_positionView_z, Vneg_x, Vneg_y, Vneg_z);
// Reconstruct normal from G-buffer // Reconstruct normal from G-buffer
float surface_normal_x, surface_normal_y, surface_normal_z; float surface_normal_x, surface_normal_y, surface_normal_z;
float normal_x = half_to_float_fast(inputData.normalEncoded_x[gBufferOffset]); float normal_x = half_to_float_fast(inputData.normalEncoded_x[gBufferOffset]);
float normal_y = half_to_float_fast(inputData.normalEncoded_y[gBufferOffset]); float normal_y = half_to_float_fast(inputData.normalEncoded_y[gBufferOffset]);
float f = (normal_x - normal_x * normal_x) + (normal_y - normal_y * normal_y); float f = (normal_x - normal_x * normal_x) + (normal_y - normal_y * normal_y);
float m = sqrtf(4.0f * f - 1.0f); float m = sqrtf(4.0f * f - 1.0f);
surface_normal_x = m * (4.0f * normal_x - 2.0f); surface_normal_x = m * (4.0f * normal_x - 2.0f);
surface_normal_y = m * (4.0f * normal_y - 2.0f); surface_normal_y = m * (4.0f * normal_y - 2.0f);
surface_normal_z = 3.0f - 8.0f * f; surface_normal_z = 3.0f - 8.0f * f;
// Load other G-buffer parameters // Load other G-buffer parameters
float surface_specularAmount = float surface_specularAmount =
half_to_float_fast(inputData.specularAmount[gBufferOffset]); half_to_float_fast(inputData.specularAmount[gBufferOffset]);
float surface_specularPower = float surface_specularPower =
half_to_float_fast(inputData.specularPower[gBufferOffset]); half_to_float_fast(inputData.specularPower[gBufferOffset]);
float surface_albedo_x = Unorm8ToFloat32(inputData.albedo_x[gBufferOffset]); float surface_albedo_x = Unorm8ToFloat32(inputData.albedo_x[gBufferOffset]);
float surface_albedo_y = Unorm8ToFloat32(inputData.albedo_y[gBufferOffset]); float surface_albedo_y = Unorm8ToFloat32(inputData.albedo_y[gBufferOffset]);
float surface_albedo_z = Unorm8ToFloat32(inputData.albedo_z[gBufferOffset]); float surface_albedo_z = Unorm8ToFloat32(inputData.albedo_z[gBufferOffset]);
float lit_x = 0.0f; float lit_x = 0.0f;
float lit_y = 0.0f; float lit_y = 0.0f;
float lit_z = 0.0f; float lit_z = 0.0f;
for (int32_t tileLightIndex = 0; tileLightIndex < tileNumLights; for (int32_t tileLightIndex = 0; tileLightIndex < tileNumLights;
++tileLightIndex) { ++tileLightIndex) {
int32_t lightIndex = tileLightIndices[tileLightIndex]; int32_t lightIndex = tileLightIndices[tileLightIndex];
// Gather light data relevant to initial culling // Gather light data relevant to initial culling
float light_positionView_x = float light_positionView_x =
inputData.lightPositionView_x[lightIndex]; inputData.lightPositionView_x[lightIndex];
float light_positionView_y = float light_positionView_y =
inputData.lightPositionView_y[lightIndex]; inputData.lightPositionView_y[lightIndex];
float light_positionView_z = float light_positionView_z =
inputData.lightPositionView_z[lightIndex]; inputData.lightPositionView_z[lightIndex];
float light_attenuationEnd = float light_attenuationEnd =
inputData.lightAttenuationEnd[lightIndex]; inputData.lightAttenuationEnd[lightIndex];
// Compute light vector // Compute light vector
float L_x = light_positionView_x - surface_positionView_x; float L_x = light_positionView_x - surface_positionView_x;
float L_y = light_positionView_y - surface_positionView_y; float L_y = light_positionView_y - surface_positionView_y;
float L_z = light_positionView_z - surface_positionView_z; float L_z = light_positionView_z - surface_positionView_z;
float distanceToLight2 = dot3(L_x, L_y, L_z, L_x, L_y, L_z); float distanceToLight2 = dot3(L_x, L_y, L_z, L_x, L_y, L_z);
// Clip at end of attenuation // Clip at end of attenuation
float light_attenutaionEnd2 = light_attenuationEnd * light_attenuationEnd; float light_attenutaionEnd2 = light_attenuationEnd * light_attenuationEnd;
if (distanceToLight2 < light_attenutaionEnd2) { if (distanceToLight2 < light_attenutaionEnd2) {
float distanceToLight = sqrtf(distanceToLight2); float distanceToLight = sqrtf(distanceToLight2);
float distanceToLightRcp = 1.f / distanceToLight; float distanceToLightRcp = 1.f / distanceToLight;
@@ -557,12 +557,12 @@ ShadeTileC(
L_z *= distanceToLightRcp; L_z *= distanceToLightRcp;
// Start computing brdf // Start computing brdf
float NdotL = dot3(surface_normal_x, surface_normal_y, float NdotL = dot3(surface_normal_x, surface_normal_y,
surface_normal_z, L_x, L_y, L_z); surface_normal_z, L_x, L_y, L_z);
// Clip back facing // Clip back facing
if (NdotL > 0.0f) { if (NdotL > 0.0f) {
float light_attenuationBegin = float light_attenuationBegin =
inputData.lightAttenuationBegin[lightIndex]; inputData.lightAttenuationBegin[lightIndex];
// Light distance attenuation (linstep) // Light distance attenuation (linstep)
@@ -574,19 +574,19 @@ ShadeTileC(
float H_y = (L_y - Vneg_y); float H_y = (L_y - Vneg_y);
float H_z = (L_z - Vneg_z); float H_z = (L_z - Vneg_z);
normalize3(H_x, H_y, H_z, H_x, H_y, H_z); normalize3(H_x, H_y, H_z, H_x, H_y, H_z);
float NdotH = dot3(surface_normal_x, surface_normal_y, float NdotH = dot3(surface_normal_x, surface_normal_y,
surface_normal_z, H_x, H_y, H_z); surface_normal_z, H_x, H_y, H_z);
NdotH = std::max(NdotH, 0.0f); NdotH = std::max(NdotH, 0.0f);
float specular = powf(NdotH, surface_specularPower); float specular = powf(NdotH, surface_specularPower);
float specularNorm = (surface_specularPower + 2.0f) * float specularNorm = (surface_specularPower + 2.0f) *
(1.0f / 8.0f); (1.0f / 8.0f);
float specularContrib = surface_specularAmount * float specularContrib = surface_specularAmount *
specularNorm * specular; specularNorm * specular;
float k = attenuation * NdotL * (1.0f + specularContrib); float k = attenuation * NdotL * (1.0f + specularContrib);
float light_color_x = inputData.lightColor_x[lightIndex]; float light_color_x = inputData.lightColor_x[lightIndex];
float light_color_y = inputData.lightColor_y[lightIndex]; float light_color_y = inputData.lightColor_y[lightIndex];
float light_color_z = inputData.lightColor_z[lightIndex]; float light_color_z = inputData.lightColor_z[lightIndex];
@@ -607,7 +607,7 @@ ShadeTileC(
lit_x = powf(std::min(std::max(lit_x, 0.0f), 1.0f), gamma); lit_x = powf(std::min(std::max(lit_x, 0.0f), 1.0f), gamma);
lit_y = powf(std::min(std::max(lit_y, 0.0f), 1.0f), gamma); lit_y = powf(std::min(std::max(lit_y, 0.0f), 1.0f), gamma);
lit_z = powf(std::min(std::max(lit_z, 0.0f), 1.0f), gamma); lit_z = powf(std::min(std::max(lit_z, 0.0f), 1.0f), gamma);
framebuffer_r[gBufferOffset] = Float32ToUnorm8(lit_x); framebuffer_r[gBufferOffset] = Float32ToUnorm8(lit_x);
framebuffer_g[gBufferOffset] = Float32ToUnorm8(lit_y); framebuffer_g[gBufferOffset] = Float32ToUnorm8(lit_y);
framebuffer_b[gBufferOffset] = Float32ToUnorm8(lit_z); framebuffer_b[gBufferOffset] = Float32ToUnorm8(lit_z);
@@ -618,11 +618,11 @@ ShadeTileC(
void void
ShadeDynamicTileRecurse(InputData *input, int level, int tileX, int tileY, ShadeDynamicTileRecurse(InputData *input, int level, int tileX, int tileY,
int *lightIndices, int numLights, int *lightIndices, int numLights,
Framebuffer *framebuffer) { Framebuffer *framebuffer) {
const MinMaxZTree *minMaxZTree = gMinMaxZTree; const MinMaxZTree *minMaxZTree = gMinMaxZTree;
// If we few enough lights or this is the base case (last level), shade // If we few enough lights or this is the base case (last level), shade
// this full tile directly // this full tile directly
if (level == 0 || numLights < DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE) { if (level == 0 || numLights < DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE) {
@@ -632,18 +632,18 @@ ShadeDynamicTileRecurse(InputData *input, int level, int tileX, int tileY,
int startY = tileY * height; int startY = tileY * height;
int endX = std::min(input->header.framebufferWidth, startX + width); int endX = std::min(input->header.framebufferWidth, startX + width);
int endY = std::min(input->header.framebufferHeight, startY + height); int endY = std::min(input->header.framebufferHeight, startY + height);
// Skip entirely offscreen tiles // Skip entirely offscreen tiles
if (endX > startX && endY > startY) { if (endX > startX && endY > startY) {
ShadeTileC(startX, endX, startY, endY, ShadeTileC(startX, endX, startY, endY,
input->header.framebufferWidth, input->header.framebufferHeight, input->header.framebufferWidth, input->header.framebufferHeight,
input->arrays, input->arrays,
input->header.cameraProj[0][0], input->header.cameraProj[1][1], input->header.cameraProj[0][0], input->header.cameraProj[1][1],
input->header.cameraProj[2][2], input->header.cameraProj[3][2], input->header.cameraProj[2][2], input->header.cameraProj[3][2],
lightIndices, numLights, VISUALIZE_LIGHT_COUNT, lightIndices, numLights, VISUALIZE_LIGHT_COUNT,
framebuffer->r, framebuffer->g, framebuffer->b); framebuffer->r, framebuffer->g, framebuffer->b);
} }
} }
else { else {
// Otherwise, subdivide and 4-way recurse using X and Y splitting planes // Otherwise, subdivide and 4-way recurse using X and Y splitting planes
// Move down a level in the tree // Move down a level in the tree
@@ -666,9 +666,9 @@ ShadeDynamicTileRecurse(InputData *input, int level, int tileX, int tileY,
// NOTE: Order is 00, 10, 01, 11 // NOTE: Order is 00, 10, 01, 11
// Set defaults up to cull all lights if the tile doesn't exist (offscreen) // Set defaults up to cull all lights if the tile doesn't exist (offscreen)
float minZ[4] = {input->header.cameraFar, input->header.cameraFar, float minZ[4] = {input->header.cameraFar, input->header.cameraFar,
input->header.cameraFar, input->header.cameraFar}; input->header.cameraFar, input->header.cameraFar};
float maxZ[4] = {input->header.cameraNear, input->header.cameraNear, float maxZ[4] = {input->header.cameraNear, input->header.cameraNear,
input->header.cameraNear, input->header.cameraNear}; input->header.cameraNear, input->header.cameraNear};
minZ[0] = minMaxZTree->MinZ(level, tileX, tileY); minZ[0] = minMaxZTree->MinZ(level, tileX, tileY);
@@ -688,7 +688,7 @@ ShadeDynamicTileRecurse(InputData *input, int level, int tileX, int tileY,
// Cull lights into subtile lists // Cull lights into subtile lists
#ifdef ISPC_IS_WINDOWS #ifdef ISPC_IS_WINDOWS
__declspec(align(ALIGNMENT_BYTES)) __declspec(align(ALIGNMENT_BYTES))
#endif #endif
int subtileLightIndices[4][MAX_LIGHTS] int subtileLightIndices[4][MAX_LIGHTS]
#ifndef ISPC_IS_WINDOWS #ifndef ISPC_IS_WINDOWS
@@ -697,15 +697,15 @@ ShadeDynamicTileRecurse(InputData *input, int level, int tileX, int tileY,
; ;
int subtileNumLights[4]; int subtileNumLights[4];
SplitTileMinMax(midX, midY, minZ, maxZ, SplitTileMinMax(midX, midY, minZ, maxZ,
input->header.framebufferWidth, input->header.framebufferHeight, input->header.framebufferWidth, input->header.framebufferHeight,
input->header.cameraProj[0][0], input->header.cameraProj[1][1], input->header.cameraProj[0][0], input->header.cameraProj[1][1],
lightIndices, numLights, input->arrays.lightPositionView_x, lightIndices, numLights, input->arrays.lightPositionView_x,
input->arrays.lightPositionView_y, input->arrays.lightPositionView_z, input->arrays.lightPositionView_y, input->arrays.lightPositionView_z,
input->arrays.lightAttenuationEnd, input->arrays.lightAttenuationEnd,
subtileLightIndices[0], MAX_LIGHTS, subtileNumLights); subtileLightIndices[0], MAX_LIGHTS, subtileNumLights);
// Recurse into subtiles // Recurse into subtiles
ShadeDynamicTileRecurse(input, level, tileX , tileY, ShadeDynamicTileRecurse(input, level, tileX , tileY,
subtileLightIndices[0], subtileNumLights[0], subtileLightIndices[0], subtileNumLights[0],
framebuffer); framebuffer);
ShadeDynamicTileRecurse(input, level, tileX + 1, tileY, ShadeDynamicTileRecurse(input, level, tileX + 1, tileY,
@@ -744,7 +744,7 @@ IntersectLightsWithTileMinMax(
{ {
float gBufferScale_x = 0.5f * (float)gBufferWidth; float gBufferScale_x = 0.5f * (float)gBufferWidth;
float gBufferScale_y = 0.5f * (float)gBufferHeight; float gBufferScale_y = 0.5f * (float)gBufferHeight;
float frustumPlanes_xy[4]; float frustumPlanes_xy[4];
float frustumPlanes_z[4]; float frustumPlanes_z[4];
@@ -753,14 +753,14 @@ IntersectLightsWithTileMinMax(
(cameraProj_11 * gBufferScale_x), (cameraProj_11 * gBufferScale_x),
(cameraProj_22 * gBufferScale_y), (cameraProj_22 * gBufferScale_y),
-(cameraProj_22 * gBufferScale_y) }; -(cameraProj_22 * gBufferScale_y) };
float frustumPlanes_z_v[4] = { tileEndX - gBufferScale_x, float frustumPlanes_z_v[4] = { tileEndX - gBufferScale_x,
-tileStartX + gBufferScale_x, -tileStartX + gBufferScale_x,
tileEndY - gBufferScale_y, tileEndY - gBufferScale_y,
-tileStartY + gBufferScale_y }; -tileStartY + gBufferScale_y };
for (int i = 0; i < 4; ++i) { for (int i = 0; i < 4; ++i) {
float norm = 1.f / sqrtf(frustumPlanes_xy_v[i] * frustumPlanes_xy_v[i] + float norm = 1.f / sqrtf(frustumPlanes_xy_v[i] * frustumPlanes_xy_v[i] +
frustumPlanes_z_v[i] * frustumPlanes_z_v[i]); frustumPlanes_z_v[i] * frustumPlanes_z_v[i]);
frustumPlanes_xy_v[i] *= norm; frustumPlanes_xy_v[i] *= norm;
frustumPlanes_z_v[i] *= norm; frustumPlanes_z_v[i] *= norm;
@@ -781,29 +781,29 @@ IntersectLightsWithTileMinMax(
d = maxZ - light_positionView_z; d = maxZ - light_positionView_z;
inFrustum = inFrustum && (d >= light_attenuationEndNeg); inFrustum = inFrustum && (d >= light_attenuationEndNeg);
if (!inFrustum) if (!inFrustum)
continue; continue;
float light_positionView_x = light_positionView_x_array[lightIndex]; float light_positionView_x = light_positionView_x_array[lightIndex];
float light_positionView_y = light_positionView_y_array[lightIndex]; float light_positionView_y = light_positionView_y_array[lightIndex];
d = light_positionView_z * frustumPlanes_z[0] + d = light_positionView_z * frustumPlanes_z[0] +
light_positionView_x * frustumPlanes_xy[0]; light_positionView_x * frustumPlanes_xy[0];
inFrustum = inFrustum && (d >= light_attenuationEndNeg); inFrustum = inFrustum && (d >= light_attenuationEndNeg);
d = light_positionView_z * frustumPlanes_z[1] + d = light_positionView_z * frustumPlanes_z[1] +
light_positionView_x * frustumPlanes_xy[1]; light_positionView_x * frustumPlanes_xy[1];
inFrustum = inFrustum && (d >= light_attenuationEndNeg); inFrustum = inFrustum && (d >= light_attenuationEndNeg);
d = light_positionView_z * frustumPlanes_z[2] + d = light_positionView_z * frustumPlanes_z[2] +
light_positionView_y * frustumPlanes_xy[2]; light_positionView_y * frustumPlanes_xy[2];
inFrustum = inFrustum && (d >= light_attenuationEndNeg); inFrustum = inFrustum && (d >= light_attenuationEndNeg);
d = light_positionView_z * frustumPlanes_z[3] + d = light_positionView_z * frustumPlanes_z[3] +
light_positionView_y * frustumPlanes_xy[3]; light_positionView_y * frustumPlanes_xy[3];
inFrustum = inFrustum && (d >= light_attenuationEndNeg); inFrustum = inFrustum && (d >= light_attenuationEndNeg);
// Pack and store intersecting lights // Pack and store intersecting lights
if (inFrustum) if (inFrustum)
tileLightIndices[tileNumLights++] = lightIndex; tileLightIndices[tileNumLights++] = lightIndex;
@@ -831,7 +831,7 @@ ShadeDynamicTile(InputData *input, int level, int tileX, int tileY,
// This is a root tile, so first do a full 6-plane cull // This is a root tile, so first do a full 6-plane cull
#ifdef ISPC_IS_WINDOWS #ifdef ISPC_IS_WINDOWS
__declspec(align(ALIGNMENT_BYTES)) __declspec(align(ALIGNMENT_BYTES))
#endif #endif
int lightIndices[MAX_LIGHTS] int lightIndices[MAX_LIGHTS]
#ifndef ISPC_IS_WINDOWS #ifndef ISPC_IS_WINDOWS
@@ -842,12 +842,12 @@ ShadeDynamicTile(InputData *input, int level, int tileX, int tileY,
startX, endX, startY, endY, minZ, maxZ, startX, endX, startY, endY, minZ, maxZ,
input->header.framebufferWidth, input->header.framebufferHeight, input->header.framebufferWidth, input->header.framebufferHeight,
input->header.cameraProj[0][0], input->header.cameraProj[1][1], input->header.cameraProj[0][0], input->header.cameraProj[1][1],
MAX_LIGHTS, input->arrays.lightPositionView_x, MAX_LIGHTS, input->arrays.lightPositionView_x,
input->arrays.lightPositionView_y, input->arrays.lightPositionView_z, input->arrays.lightPositionView_y, input->arrays.lightPositionView_z,
input->arrays.lightAttenuationEnd, lightIndices); input->arrays.lightAttenuationEnd, lightIndices);
// Now kick off the recursive process for this tile // Now kick off the recursive process for this tile
ShadeDynamicTileRecurse(input, level, tileX, tileY, lightIndices, ShadeDynamicTileRecurse(input, level, tileX, tileY, lightIndices,
numLights, framebuffer); numLights, framebuffer);
} }
@@ -856,10 +856,10 @@ void
DispatchDynamicC(InputData *input, Framebuffer *framebuffer) DispatchDynamicC(InputData *input, Framebuffer *framebuffer)
{ {
MinMaxZTree *minMaxZTree = gMinMaxZTree; MinMaxZTree *minMaxZTree = gMinMaxZTree;
// Update min/max Z tree // Update min/max Z tree
minMaxZTree->Update(input->arrays.zBuffer, input->header.framebufferWidth, minMaxZTree->Update(input->arrays.zBuffer, input->header.framebufferWidth,
input->header.cameraProj[2][2], input->header.cameraProj[3][2], input->header.cameraProj[2][2], input->header.cameraProj[3][2],
input->header.cameraNear, input->header.cameraFar); input->header.cameraNear, input->header.cameraFar);
int rootLevel = minMaxZTree->Levels() - 1; int rootLevel = minMaxZTree->Levels() - 1;

View File

@@ -1,5 +1,5 @@
/* /*
Copyright (c) 2011, Intel Corporation Copyright (c) 2011-2014, Intel Corporation
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
@@ -28,7 +28,7 @@
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ */
#ifdef __cilk #ifdef __cilk
@@ -104,7 +104,7 @@ public:
{ {
mNumTilesX = gBufferWidth / mTileWidth; mNumTilesX = gBufferWidth / mTileWidth;
mNumTilesY = gBufferHeight / mTileHeight; mNumTilesY = gBufferHeight / mTileHeight;
// Allocate arrays // Allocate arrays
mMinZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16); mMinZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16);
mMaxZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16); mMaxZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16);
@@ -155,7 +155,7 @@ public:
float minZ = mMinZArrays[srcLevel][(srcY) * srcTilesX + (srcX)]; float minZ = mMinZArrays[srcLevel][(srcY) * srcTilesX + (srcX)];
float maxZ = mMaxZArrays[srcLevel][(srcY) * srcTilesX + (srcX)]; float maxZ = mMaxZArrays[srcLevel][(srcY) * srcTilesX + (srcX)];
if (srcX + 1 < srcTilesX) { if (srcX + 1 < srcTilesX) {
minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY) * srcTilesX + minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY) * srcTilesX +
(srcX + 1)]); (srcX + 1)]);
maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY) * srcTilesX + maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY) * srcTilesX +
(srcX + 1)]); (srcX + 1)]);
@@ -185,7 +185,7 @@ public:
lAlignedFree(mMaxZArrays[i]); lAlignedFree(mMaxZArrays[i]);
} }
lAlignedFree(mMinZArrays); lAlignedFree(mMinZArrays);
lAlignedFree(mMaxZArrays); lAlignedFree(mMaxZArrays);
} }
int Levels() const { return mLevels; } int Levels() const { return mLevels; }
@@ -219,19 +219,19 @@ private:
static MinMaxZTreeCilk *gMinMaxZTreeCilk = 0; static MinMaxZTreeCilk *gMinMaxZTreeCilk = 0;
void InitDynamicCilk(InputData *input) { void InitDynamicCilk(InputData *input) {
gMinMaxZTreeCilk = gMinMaxZTreeCilk =
new MinMaxZTreeCilk(MIN_TILE_WIDTH, MIN_TILE_HEIGHT, DYNAMIC_TREE_LEVELS, new MinMaxZTreeCilk(MIN_TILE_WIDTH, MIN_TILE_HEIGHT, DYNAMIC_TREE_LEVELS,
input->header.framebufferWidth, input->header.framebufferWidth,
input->header.framebufferHeight); input->header.framebufferHeight);
} }
static void static void
ShadeDynamicTileRecurse(InputData *input, int level, int tileX, int tileY, ShadeDynamicTileRecurse(InputData *input, int level, int tileX, int tileY,
int *lightIndices, int numLights, int *lightIndices, int numLights,
Framebuffer *framebuffer) { Framebuffer *framebuffer) {
const MinMaxZTreeCilk *minMaxZTree = gMinMaxZTreeCilk; const MinMaxZTreeCilk *minMaxZTree = gMinMaxZTreeCilk;
// If we few enough lights or this is the base case (last level), shade // If we few enough lights or this is the base case (last level), shade
// this full tile directly // this full tile directly
if (level == 0 || numLights < DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE) { if (level == 0 || numLights < DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE) {
@@ -241,19 +241,19 @@ ShadeDynamicTileRecurse(InputData *input, int level, int tileX, int tileY,
int startY = tileY * height; int startY = tileY * height;
int endX = std::min(input->header.framebufferWidth, startX + width); int endX = std::min(input->header.framebufferWidth, startX + width);
int endY = std::min(input->header.framebufferHeight, startY + height); int endY = std::min(input->header.framebufferHeight, startY + height);
// Skip entirely offscreen tiles // Skip entirely offscreen tiles
if (endX > startX && endY > startY) { if (endX > startX && endY > startY) {
ispc::ShadeTile( ispc::ShadeTile(
startX, endX, startY, endY, startX, endX, startY, endY,
input->header.framebufferWidth, input->header.framebufferHeight, input->header.framebufferWidth, input->header.framebufferHeight,
&input->arrays, &input->arrays,
input->header.cameraProj[0][0], input->header.cameraProj[1][1], input->header.cameraProj[0][0], input->header.cameraProj[1][1],
input->header.cameraProj[2][2], input->header.cameraProj[3][2], input->header.cameraProj[2][2], input->header.cameraProj[3][2],
lightIndices, numLights, VISUALIZE_LIGHT_COUNT, lightIndices, numLights, VISUALIZE_LIGHT_COUNT,
framebuffer->r, framebuffer->g, framebuffer->b); framebuffer->r, framebuffer->g, framebuffer->b);
} }
} }
else { else {
// Otherwise, subdivide and 4-way recurse using X and Y splitting planes // Otherwise, subdivide and 4-way recurse using X and Y splitting planes
// Move down a level in the tree // Move down a level in the tree
@@ -276,9 +276,9 @@ ShadeDynamicTileRecurse(InputData *input, int level, int tileX, int tileY,
// NOTE: Order is 00, 10, 01, 11 // NOTE: Order is 00, 10, 01, 11
// Set defaults up to cull all lights if the tile doesn't exist (offscreen) // Set defaults up to cull all lights if the tile doesn't exist (offscreen)
float minZ[4] = {input->header.cameraFar, input->header.cameraFar, float minZ[4] = {input->header.cameraFar, input->header.cameraFar,
input->header.cameraFar, input->header.cameraFar}; input->header.cameraFar, input->header.cameraFar};
float maxZ[4] = {input->header.cameraNear, input->header.cameraNear, float maxZ[4] = {input->header.cameraNear, input->header.cameraNear,
input->header.cameraNear, input->header.cameraNear}; input->header.cameraNear, input->header.cameraNear};
minZ[0] = minMaxZTree->MinZ(level, tileX, tileY); minZ[0] = minMaxZTree->MinZ(level, tileX, tileY);
@@ -298,7 +298,7 @@ ShadeDynamicTileRecurse(InputData *input, int level, int tileX, int tileY,
// Cull lights into subtile lists // Cull lights into subtile lists
#ifdef ISPC_IS_WINDOWS #ifdef ISPC_IS_WINDOWS
__declspec(align(ALIGNMENT_BYTES)) __declspec(align(ALIGNMENT_BYTES))
#endif #endif
int subtileLightIndices[4][MAX_LIGHTS] int subtileLightIndices[4][MAX_LIGHTS]
#ifndef ISPC_IS_WINDOWS #ifndef ISPC_IS_WINDOWS
@@ -307,15 +307,15 @@ ShadeDynamicTileRecurse(InputData *input, int level, int tileX, int tileY,
; ;
int subtileNumLights[4]; int subtileNumLights[4];
ispc::SplitTileMinMax(midX, midY, minZ, maxZ, ispc::SplitTileMinMax(midX, midY, minZ, maxZ,
input->header.framebufferWidth, input->header.framebufferHeight, input->header.framebufferWidth, input->header.framebufferHeight,
input->header.cameraProj[0][0], input->header.cameraProj[1][1], input->header.cameraProj[0][0], input->header.cameraProj[1][1],
lightIndices, numLights, input->arrays.lightPositionView_x, lightIndices, numLights, input->arrays.lightPositionView_x,
input->arrays.lightPositionView_y, input->arrays.lightPositionView_z, input->arrays.lightPositionView_y, input->arrays.lightPositionView_z,
input->arrays.lightAttenuationEnd, input->arrays.lightAttenuationEnd,
subtileLightIndices[0], MAX_LIGHTS, subtileNumLights); subtileLightIndices[0], MAX_LIGHTS, subtileNumLights);
// Recurse into subtiles // Recurse into subtiles
_Cilk_spawn ShadeDynamicTileRecurse(input, level, tileX , tileY, _Cilk_spawn ShadeDynamicTileRecurse(input, level, tileX , tileY,
subtileLightIndices[0], subtileNumLights[0], subtileLightIndices[0], subtileNumLights[0],
framebuffer); framebuffer);
_Cilk_spawn ShadeDynamicTileRecurse(input, level, tileX + 1, tileY, _Cilk_spawn ShadeDynamicTileRecurse(input, level, tileX + 1, tileY,
@@ -349,7 +349,7 @@ ShadeDynamicTile(InputData *input, int level, int tileX, int tileY,
// This is a root tile, so first do a full 6-plane cull // This is a root tile, so first do a full 6-plane cull
#ifdef ISPC_IS_WINDOWS #ifdef ISPC_IS_WINDOWS
__declspec(align(ALIGNMENT_BYTES)) __declspec(align(ALIGNMENT_BYTES))
#endif #endif
int lightIndices[MAX_LIGHTS] int lightIndices[MAX_LIGHTS]
#ifndef ISPC_IS_WINDOWS #ifndef ISPC_IS_WINDOWS
@@ -360,12 +360,12 @@ ShadeDynamicTile(InputData *input, int level, int tileX, int tileY,
startX, endX, startY, endY, minZ, maxZ, startX, endX, startY, endY, minZ, maxZ,
input->header.framebufferWidth, input->header.framebufferHeight, input->header.framebufferWidth, input->header.framebufferHeight,
input->header.cameraProj[0][0], input->header.cameraProj[1][1], input->header.cameraProj[0][0], input->header.cameraProj[1][1],
MAX_LIGHTS, input->arrays.lightPositionView_x, MAX_LIGHTS, input->arrays.lightPositionView_x,
input->arrays.lightPositionView_y, input->arrays.lightPositionView_z, input->arrays.lightPositionView_y, input->arrays.lightPositionView_z,
input->arrays.lightAttenuationEnd, lightIndices); input->arrays.lightAttenuationEnd, lightIndices);
// Now kick off the recursive process for this tile // Now kick off the recursive process for this tile
ShadeDynamicTileRecurse(input, level, tileX, tileY, lightIndices, ShadeDynamicTileRecurse(input, level, tileX, tileY, lightIndices,
numLights, framebuffer); numLights, framebuffer);
} }
@@ -374,10 +374,10 @@ void
DispatchDynamicCilk(InputData *input, Framebuffer *framebuffer) DispatchDynamicCilk(InputData *input, Framebuffer *framebuffer)
{ {
MinMaxZTreeCilk *minMaxZTree = gMinMaxZTreeCilk; MinMaxZTreeCilk *minMaxZTree = gMinMaxZTreeCilk;
// Update min/max Z tree // Update min/max Z tree
minMaxZTree->Update(input->arrays.zBuffer, input->header.framebufferWidth, minMaxZTree->Update(input->arrays.zBuffer, input->header.framebufferWidth,
input->header.cameraProj[2][2], input->header.cameraProj[3][2], input->header.cameraProj[2][2], input->header.cameraProj[3][2],
input->header.cameraNear, input->header.cameraFar); input->header.cameraNear, input->header.cameraFar);
// Launch the "root" tiles. Ideally these should at least fill the // Launch the "root" tiles. Ideally these should at least fill the

View File

@@ -1,5 +1,5 @@
/* /*
Copyright (c) 2010-2011, Intel Corporation Copyright (c) 2010-2014, Intel Corporation
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
@@ -28,7 +28,7 @@
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ */
@@ -46,7 +46,7 @@
#define int16 short #define int16 short
#define int8 char #define int8 char
__device__ static inline float clamp(float v, float low, float high) __device__ static inline float clamp(float v, float low, float high)
{ {
return min(max(v, low), high); return min(max(v, low), high);
} }
@@ -122,8 +122,8 @@ struct Uniform
const int2 idx = get_chunk(i); const int2 idx = get_chunk(i);
return __shfl(data[idx.x], idx.y); return __shfl(data[idx.x], idx.y);
} }
__device__ inline void set(const bool active, const int i, T value) __device__ inline void set(const bool active, const int i, T value)
{ {
const int2 idx = get_chunk(i); const int2 idx = get_chunk(i);
const int chunkIdx = idx.x; const int chunkIdx = idx.x;
@@ -160,9 +160,9 @@ struct Uniform
{ {
return data[i]; return data[i];
} }
__device__ inline T* get_ptr(const int i) {return &data[i]; } __device__ inline T* get_ptr(const int i) {return &data[i]; }
__device__ inline void set(const bool active, const int i, T value) __device__ inline void set(const bool active, const int i, T value)
{ {
if (active) if (active)
data[i] = value; data[i] = value;
@@ -185,8 +185,8 @@ struct Uniform
{ {
return shdata[i]; return shdata[i];
} }
__device__ inline void set(const bool active, const int i, T value) __device__ inline void set(const bool active, const int i, T value)
{ {
if (active) if (active)
shdata[i] = value; shdata[i] = value;
@@ -264,7 +264,7 @@ static __device__ __forceinline__ int2 warpBinExclusiveScan(const bool p)
const int b = __ballot(p); const int b = __ballot(p);
return make_int2(__popc(b), __popc(b & lanemask_lt())); return make_int2(__popc(b), __popc(b & lanemask_lt()));
} }
__device__ static inline __device__ static inline
int packed_store_active(bool active, int* ptr, int value) int packed_store_active(bool active, int* ptr, int value)
{ {
const int2 res = warpBinExclusiveScan(active); const int2 res = warpBinExclusiveScan(active);
@@ -358,7 +358,7 @@ IntersectLightsWithTileMinMax(
{ {
float gBufferScale_x = 0.5f * (float)gBufferWidth; float gBufferScale_x = 0.5f * (float)gBufferWidth;
float gBufferScale_y = 0.5f * (float)gBufferHeight; float gBufferScale_y = 0.5f * (float)gBufferHeight;
float frustumPlanes_xy[4] = { float frustumPlanes_xy[4] = {
-(cameraProj_11 * gBufferScale_x), -(cameraProj_11 * gBufferScale_x),
(cameraProj_11 * gBufferScale_x), (cameraProj_11 * gBufferScale_x),
@@ -371,7 +371,7 @@ IntersectLightsWithTileMinMax(
-tileStartY + gBufferScale_y }; -tileStartY + gBufferScale_y };
for ( int i = 0; i < 4; ++i) { for ( int i = 0; i < 4; ++i) {
float norm = rsqrt(frustumPlanes_xy[i] * frustumPlanes_xy[i] + float norm = rsqrt(frustumPlanes_xy[i] * frustumPlanes_xy[i] +
frustumPlanes_z[i] * frustumPlanes_z[i]); frustumPlanes_z[i] * frustumPlanes_z[i]);
frustumPlanes_xy[i] *= norm; frustumPlanes_xy[i] *= norm;
frustumPlanes_z[i] *= norm; frustumPlanes_z[i] *= norm;
@@ -393,32 +393,32 @@ IntersectLightsWithTileMinMax(
d = maxZ - light_positionView_z; d = maxZ - light_positionView_z;
inFrustum = inFrustum && (d >= light_attenuationEndNeg); inFrustum = inFrustum && (d >= light_attenuationEndNeg);
// This seems better than cif(!inFrustum) ccontinue; here since we // This seems better than cif(!inFrustum) ccontinue; here since we
// don't actually need to mask the rest of this function - this is // don't actually need to mask the rest of this function - this is
// just a greedy early-out. Could also structure all of this as // just a greedy early-out. Could also structure all of this as
// nested if() statements, but this a bit easier to read // nested if() statements, but this a bit easier to read
if (__ballot(inFrustum) > 0) if (__ballot(inFrustum) > 0)
{ {
float light_positionView_x = light_positionView_x_array[lightIndex]; float light_positionView_x = light_positionView_x_array[lightIndex];
float light_positionView_y = light_positionView_y_array[lightIndex]; float light_positionView_y = light_positionView_y_array[lightIndex];
d = light_positionView_z * frustumPlanes_z[0] + d = light_positionView_z * frustumPlanes_z[0] +
light_positionView_x * frustumPlanes_xy[0]; light_positionView_x * frustumPlanes_xy[0];
inFrustum = inFrustum && (d >= light_attenuationEndNeg); inFrustum = inFrustum && (d >= light_attenuationEndNeg);
d = light_positionView_z * frustumPlanes_z[1] + d = light_positionView_z * frustumPlanes_z[1] +
light_positionView_x * frustumPlanes_xy[1]; light_positionView_x * frustumPlanes_xy[1];
inFrustum = inFrustum && (d >= light_attenuationEndNeg); inFrustum = inFrustum && (d >= light_attenuationEndNeg);
d = light_positionView_z * frustumPlanes_z[2] + d = light_positionView_z * frustumPlanes_z[2] +
light_positionView_y * frustumPlanes_xy[2]; light_positionView_y * frustumPlanes_xy[2];
inFrustum = inFrustum && (d >= light_attenuationEndNeg); inFrustum = inFrustum && (d >= light_attenuationEndNeg);
d = light_positionView_z * frustumPlanes_z[3] + d = light_positionView_z * frustumPlanes_z[3] +
light_positionView_y * frustumPlanes_xy[3]; light_positionView_y * frustumPlanes_xy[3];
inFrustum = inFrustum && (d >= light_attenuationEndNeg); inFrustum = inFrustum && (d >= light_attenuationEndNeg);
// Pack and store intersecting lights // Pack and store intersecting lights
const bool active = inFrustum && lightIndex < numLights; const bool active = inFrustum && lightIndex < numLights;
#if 0 #if 0
@@ -472,7 +472,7 @@ IntersectLightsWithTile(
int32 tileNumLights = IntersectLightsWithTileMinMax( int32 tileNumLights = IntersectLightsWithTileMinMax(
tileStartX, tileEndX, tileStartY, tileEndY, minZ, maxZ, tileStartX, tileEndX, tileStartY, tileEndY, minZ, maxZ,
gBufferWidth, gBufferHeight, cameraProj_11, cameraProj_22, gBufferWidth, gBufferHeight, cameraProj_11, cameraProj_22,
MAX_LIGHTS, light_positionView_x_array, light_positionView_y_array, MAX_LIGHTS, light_positionView_x_array, light_positionView_y_array,
light_positionView_z_array, light_attenuationEnd_array, light_positionView_z_array, light_attenuationEnd_array,
tileLightIndices); tileLightIndices);
@@ -505,7 +505,7 @@ ShadeTile(
unsigned int8 c = (unsigned int8)(min(tileNumLights << 2, 255)); unsigned int8 c = (unsigned int8)(min(tileNumLights << 2, 255));
for ( int32 y = tileStartY; y < tileEndY; ++y) { for ( int32 y = tileStartY; y < tileEndY; ++y) {
for ( int xb = tileStartX ; xb < tileEndX; xb += programCount) for ( int xb = tileStartX ; xb < tileEndX; xb += programCount)
{ {
const int x = xb + programIndex; const int x = xb + programIndex;
if (x >= tileEndX) continue; if (x >= tileEndX) continue;
int32 framebufferIndex = (y * gBufferWidth + x); int32 framebufferIndex = (y * gBufferWidth + x);
@@ -517,16 +517,16 @@ ShadeTile(
} else { } else {
float twoOverGBufferWidth = 2.0f / gBufferWidth; float twoOverGBufferWidth = 2.0f / gBufferWidth;
float twoOverGBufferHeight = 2.0f / gBufferHeight; float twoOverGBufferHeight = 2.0f / gBufferHeight;
for ( int32 y = tileStartY; y < tileEndY; ++y) { for ( int32 y = tileStartY; y < tileEndY; ++y) {
float positionScreen_y = -(((0.5f + y) * twoOverGBufferHeight) - 1.f); float positionScreen_y = -(((0.5f + y) * twoOverGBufferHeight) - 1.f);
for ( int xb = tileStartX ; xb < tileEndX; xb += programCount) for ( int xb = tileStartX ; xb < tileEndX; xb += programCount)
{ {
const int x = xb + programIndex; const int x = xb + programIndex;
// if (x >= tileEndX) break; // if (x >= tileEndX) break;
int32 gBufferOffset = y * gBufferWidth + x; int32 gBufferOffset = y * gBufferWidth + x;
// Reconstruct position and (negative) view vector from G-buffer // Reconstruct position and (negative) view vector from G-buffer
float surface_positionView_x, surface_positionView_y, surface_positionView_z; float surface_positionView_x, surface_positionView_y, surface_positionView_z;
float Vneg_x, Vneg_y, Vneg_z; float Vneg_x, Vneg_y, Vneg_z;
@@ -535,19 +535,19 @@ ShadeTile(
// Compute screen/clip-space position // Compute screen/clip-space position
// NOTE: Mind DX11 viewport transform and pixel center! // NOTE: Mind DX11 viewport transform and pixel center!
float positionScreen_x = (0.5f + (float)(x)) * float positionScreen_x = (0.5f + (float)(x)) *
twoOverGBufferWidth - 1.0f; twoOverGBufferWidth - 1.0f;
// Unproject depth buffer Z value into view space // Unproject depth buffer Z value into view space
surface_positionView_z = cameraProj_43 / (z - cameraProj_33); surface_positionView_z = cameraProj_43 / (z - cameraProj_33);
surface_positionView_x = positionScreen_x * surface_positionView_z / surface_positionView_x = positionScreen_x * surface_positionView_z /
cameraProj_11; cameraProj_11;
surface_positionView_y = positionScreen_y * surface_positionView_z / surface_positionView_y = positionScreen_y * surface_positionView_z /
cameraProj_22; cameraProj_22;
// We actually end up with a vector pointing *at* the // We actually end up with a vector pointing *at* the
// surface (i.e. the negative view vector) // surface (i.e. the negative view vector)
normalize3(surface_positionView_x, surface_positionView_y, normalize3(surface_positionView_x, surface_positionView_y,
surface_positionView_z, Vneg_x, Vneg_y, Vneg_z); surface_positionView_z, Vneg_x, Vneg_y, Vneg_z);
// Reconstruct normal from G-buffer // Reconstruct normal from G-buffer
@@ -556,51 +556,51 @@ ShadeTile(
float normal_x = __half2float(inputData.normalEncoded_x[gBufferOffset]); float normal_x = __half2float(inputData.normalEncoded_x[gBufferOffset]);
float normal_y = __half2float(inputData.normalEncoded_y[gBufferOffset]); float normal_y = __half2float(inputData.normalEncoded_y[gBufferOffset]);
asm("// half2float //"); asm("// half2float //");
float f = (normal_x - normal_x * normal_x) + (normal_y - normal_y * normal_y); float f = (normal_x - normal_x * normal_x) + (normal_y - normal_y * normal_y);
float m = sqrt(4.0f * f - 1.0f); float m = sqrt(4.0f * f - 1.0f);
surface_normal_x = m * (4.0f * normal_x - 2.0f); surface_normal_x = m * (4.0f * normal_x - 2.0f);
surface_normal_y = m * (4.0f * normal_y - 2.0f); surface_normal_y = m * (4.0f * normal_y - 2.0f);
surface_normal_z = 3.0f - 8.0f * f; surface_normal_z = 3.0f - 8.0f * f;
// Load other G-buffer parameters // Load other G-buffer parameters
float surface_specularAmount = float surface_specularAmount =
__half2float(inputData.specularAmount[gBufferOffset]); __half2float(inputData.specularAmount[gBufferOffset]);
float surface_specularPower = float surface_specularPower =
__half2float(inputData.specularPower[gBufferOffset]); __half2float(inputData.specularPower[gBufferOffset]);
float surface_albedo_x = Unorm8ToFloat32(inputData.albedo_x[gBufferOffset]); float surface_albedo_x = Unorm8ToFloat32(inputData.albedo_x[gBufferOffset]);
float surface_albedo_y = Unorm8ToFloat32(inputData.albedo_y[gBufferOffset]); float surface_albedo_y = Unorm8ToFloat32(inputData.albedo_y[gBufferOffset]);
float surface_albedo_z = Unorm8ToFloat32(inputData.albedo_z[gBufferOffset]); float surface_albedo_z = Unorm8ToFloat32(inputData.albedo_z[gBufferOffset]);
float lit_x = 0.0f; float lit_x = 0.0f;
float lit_y = 0.0f; float lit_y = 0.0f;
float lit_z = 0.0f; float lit_z = 0.0f;
for ( int32 tileLightIndex = 0; tileLightIndex < tileNumLights; for ( int32 tileLightIndex = 0; tileLightIndex < tileNumLights;
++tileLightIndex) { ++tileLightIndex) {
int32 lightIndex = tileLightIndices.get(tileLightIndex); int32 lightIndex = tileLightIndices.get(tileLightIndex);
// Gather light data relevant to initial culling // Gather light data relevant to initial culling
float light_positionView_x = float light_positionView_x =
__ldg(&inputData.lightPositionView_x[lightIndex]); __ldg(&inputData.lightPositionView_x[lightIndex]);
float light_positionView_y = float light_positionView_y =
__ldg(&inputData.lightPositionView_y[lightIndex]); __ldg(&inputData.lightPositionView_y[lightIndex]);
float light_positionView_z = float light_positionView_z =
__ldg(&inputData.lightPositionView_z[lightIndex]); __ldg(&inputData.lightPositionView_z[lightIndex]);
float light_attenuationEnd = float light_attenuationEnd =
__ldg(&inputData.lightAttenuationEnd[lightIndex]); __ldg(&inputData.lightAttenuationEnd[lightIndex]);
// Compute light vector // Compute light vector
float L_x = light_positionView_x - surface_positionView_x; float L_x = light_positionView_x - surface_positionView_x;
float L_y = light_positionView_y - surface_positionView_y; float L_y = light_positionView_y - surface_positionView_y;
float L_z = light_positionView_z - surface_positionView_z; float L_z = light_positionView_z - surface_positionView_z;
float distanceToLight2 = dot3(L_x, L_y, L_z, L_x, L_y, L_z); float distanceToLight2 = dot3(L_x, L_y, L_z, L_x, L_y, L_z);
// Clip at end of attenuation // Clip at end of attenuation
float light_attenutaionEnd2 = light_attenuationEnd * light_attenuationEnd; float light_attenutaionEnd2 = light_attenuationEnd * light_attenuationEnd;
if (distanceToLight2 < light_attenutaionEnd2) { if (distanceToLight2 < light_attenutaionEnd2) {
float distanceToLight = sqrt(distanceToLight2); float distanceToLight = sqrt(distanceToLight2);
// HLSL "rcp" is allowed to be fairly inaccurate // HLSL "rcp" is allowed to be fairly inaccurate
@@ -610,12 +610,12 @@ ShadeTile(
L_z *= distanceToLightRcp; L_z *= distanceToLightRcp;
// Start computing brdf // Start computing brdf
float NdotL = dot3(surface_normal_x, surface_normal_y, float NdotL = dot3(surface_normal_x, surface_normal_y,
surface_normal_z, L_x, L_y, L_z); surface_normal_z, L_x, L_y, L_z);
// Clip back facing // Clip back facing
if (NdotL > 0.0f) { if (NdotL > 0.0f) {
float light_attenuationBegin = float light_attenuationBegin =
inputData.lightAttenuationBegin[lightIndex]; inputData.lightAttenuationBegin[lightIndex];
// Light distance attenuation (linstep) // Light distance attenuation (linstep)
@@ -627,19 +627,19 @@ ShadeTile(
float H_y = (L_y - Vneg_y); float H_y = (L_y - Vneg_y);
float H_z = (L_z - Vneg_z); float H_z = (L_z - Vneg_z);
normalize3(H_x, H_y, H_z, H_x, H_y, H_z); normalize3(H_x, H_y, H_z, H_x, H_y, H_z);
float NdotH = dot3(surface_normal_x, surface_normal_y, float NdotH = dot3(surface_normal_x, surface_normal_y,
surface_normal_z, H_x, H_y, H_z); surface_normal_z, H_x, H_y, H_z);
NdotH = max(NdotH, 0.0f); NdotH = max(NdotH, 0.0f);
float specular = pow(NdotH, surface_specularPower); float specular = pow(NdotH, surface_specularPower);
float specularNorm = (surface_specularPower + 2.0f) * float specularNorm = (surface_specularPower + 2.0f) *
(1.0f / 8.0f); (1.0f / 8.0f);
float specularContrib = surface_specularAmount * float specularContrib = surface_specularAmount *
specularNorm * specular; specularNorm * specular;
float k = attenuation * NdotL * (1.0f + specularContrib); float k = attenuation * NdotL * (1.0f + specularContrib);
float light_color_x = inputData.lightColor_x[lightIndex]; float light_color_x = inputData.lightColor_x[lightIndex];
float light_color_y = inputData.lightColor_y[lightIndex]; float light_color_y = inputData.lightColor_y[lightIndex];
float light_color_z = inputData.lightColor_z[lightIndex]; float light_color_z = inputData.lightColor_z[lightIndex];
@@ -663,7 +663,7 @@ ShadeTile(
lit_x = pow(clamp(lit_x, 0.0f, 1.0f), gamma); lit_x = pow(clamp(lit_x, 0.0f, 1.0f), gamma);
lit_y = pow(clamp(lit_y, 0.0f, 1.0f), gamma); lit_y = pow(clamp(lit_y, 0.0f, 1.0f), gamma);
lit_z = pow(clamp(lit_z, 0.0f, 1.0f), gamma); lit_z = pow(clamp(lit_z, 0.0f, 1.0f), gamma);
framebuffer_r[gBufferOffset] = Float32ToUnorm8(lit_x); framebuffer_r[gBufferOffset] = Float32ToUnorm8(lit_x);
framebuffer_g[gBufferOffset] = Float32ToUnorm8(lit_y); framebuffer_g[gBufferOffset] = Float32ToUnorm8(lit_y);
framebuffer_b[gBufferOffset] = Float32ToUnorm8(lit_z); framebuffer_b[gBufferOffset] = Float32ToUnorm8(lit_z);
@@ -707,8 +707,8 @@ RenderTile( int num_groups_x, int num_groups_y,
// Light intersection: figure out which lights illuminate this tile. // Light intersection: figure out which lights illuminate this tile.
Uniform<int,MAX_LIGHTS> tileLightIndices; // Light list for the tile Uniform<int,MAX_LIGHTS> tileLightIndices; // Light list for the tile
#if 1 #if 1
int numTileLights = int numTileLights =
IntersectLightsWithTile(tile_start_x, tile_end_x, IntersectLightsWithTile(tile_start_x, tile_end_x,
tile_start_y, tile_end_y, tile_start_y, tile_end_y,
framebufferWidth, framebufferHeight, framebufferWidth, framebufferHeight,
inputData.zBuffer, inputData.zBuffer,
@@ -716,9 +716,9 @@ RenderTile( int num_groups_x, int num_groups_y,
cameraProj_22, cameraProj_32, cameraProj_22, cameraProj_32,
inputHeader.cameraNear, inputHeader.cameraFar, inputHeader.cameraNear, inputHeader.cameraFar,
MAX_LIGHTS, MAX_LIGHTS,
inputData.lightPositionView_x, inputData.lightPositionView_x,
inputData.lightPositionView_y, inputData.lightPositionView_y,
inputData.lightPositionView_z, inputData.lightPositionView_z,
inputData.lightAttenuationEnd, inputData.lightAttenuationEnd,
tileLightIndices); tileLightIndices);
@@ -726,7 +726,7 @@ RenderTile( int num_groups_x, int num_groups_y,
ShadeTile(tile_start_x, tile_end_x, tile_start_y, tile_end_y, ShadeTile(tile_start_x, tile_end_x, tile_start_y, tile_end_y,
framebufferWidth, framebufferHeight, inputData, framebufferWidth, framebufferHeight, inputData,
cameraProj_00, cameraProj_11, cameraProj_22, cameraProj_32, cameraProj_00, cameraProj_11, cameraProj_22, cameraProj_32,
tileLightIndices, numTileLights, visualizeLightCount, tileLightIndices, numTileLights, visualizeLightCount,
framebuffer_r, framebuffer_g, framebuffer_b); framebuffer_r, framebuffer_g, framebuffer_b);
#endif #endif
} }
@@ -745,9 +745,9 @@ RenderStatic___export( InputHeader inputHeaderPtr[],
const InputDataArrays inputData = *inputDataPtr; const InputDataArrays inputData = *inputDataPtr;
int num_groups_x = (inputHeader.framebufferWidth + int num_groups_x = (inputHeader.framebufferWidth +
MIN_TILE_WIDTH - 1) / MIN_TILE_WIDTH; MIN_TILE_WIDTH - 1) / MIN_TILE_WIDTH;
int num_groups_y = (inputHeader.framebufferHeight + int num_groups_y = (inputHeader.framebufferHeight +
MIN_TILE_HEIGHT - 1) / MIN_TILE_HEIGHT; MIN_TILE_HEIGHT - 1) / MIN_TILE_HEIGHT;
int num_groups = num_groups_x * num_groups_y; int num_groups = num_groups_x * num_groups_y;

View File

@@ -1,5 +1,5 @@
/* /*
Copyright (c) 2010-2011, Intel Corporation Copyright (c) 2010-2014, Intel Corporation
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
@@ -28,7 +28,7 @@
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ */
#include "deferred.h" #include "deferred.h"
@@ -142,11 +142,11 @@ ComputeZBounds(
maxZ = reduce_max(laneMaxZ); maxZ = reduce_max(laneMaxZ);
} }
#if 1 #if 1
inline inline
#endif #endif
#ifndef __NVPTX__ #ifndef __NVPTX__
export export
#endif #endif
uniform int32 uniform int32
IntersectLightsWithTileMinMax( IntersectLightsWithTileMinMax(
@@ -171,7 +171,7 @@ IntersectLightsWithTileMinMax(
{ {
uniform float gBufferScale_x = 0.5f * (float)gBufferWidth; uniform float gBufferScale_x = 0.5f * (float)gBufferWidth;
uniform float gBufferScale_y = 0.5f * (float)gBufferHeight; uniform float gBufferScale_y = 0.5f * (float)gBufferHeight;
uniform_t float frustumPlanes_xy[4] = { uniform_t float frustumPlanes_xy[4] = {
-(cameraProj_11 * gBufferScale_x), -(cameraProj_11 * gBufferScale_x),
(cameraProj_11 * gBufferScale_x), (cameraProj_11 * gBufferScale_x),
@@ -184,7 +184,7 @@ IntersectLightsWithTileMinMax(
-tileStartY + gBufferScale_y }; -tileStartY + gBufferScale_y };
for (uniform int i = 0; i < 4; ++i) { for (uniform int i = 0; i < 4; ++i) {
uniform_t float norm = rsqrt(frustumPlanes_xy[i] * frustumPlanes_xy[i] + uniform_t float norm = rsqrt(frustumPlanes_xy[i] * frustumPlanes_xy[i] +
frustumPlanes_z[i] * frustumPlanes_z[i]); frustumPlanes_z[i] * frustumPlanes_z[i]);
frustumPlanes_xy[i] *= norm; frustumPlanes_xy[i] *= norm;
frustumPlanes_z[i] *= norm; frustumPlanes_z[i] *= norm;
@@ -202,7 +202,7 @@ IntersectLightsWithTileMinMax(
d = maxZ - light_positionView_z; d = maxZ - light_positionView_z;
inFrustum = inFrustum && (d >= light_attenuationEndNeg); inFrustum = inFrustum && (d >= light_attenuationEndNeg);
// This seems better than cif(!inFrustum) ccontinue; here since we // This seems better than cif(!inFrustum) ccontinue; here since we
// don't actually need to mask the rest of this function - this is // don't actually need to mask the rest of this function - this is
// just a greedy early-out. Could also structure all of this as // just a greedy early-out. Could also structure all of this as
@@ -211,26 +211,26 @@ IntersectLightsWithTileMinMax(
float light_positionView_x = light_positionView_x_array[lightIndex]; float light_positionView_x = light_positionView_x_array[lightIndex];
float light_positionView_y = light_positionView_y_array[lightIndex]; float light_positionView_y = light_positionView_y_array[lightIndex];
d = light_positionView_z * frustumPlanes_z[0] + d = light_positionView_z * frustumPlanes_z[0] +
light_positionView_x * frustumPlanes_xy[0]; light_positionView_x * frustumPlanes_xy[0];
inFrustum = inFrustum && (d >= light_attenuationEndNeg); inFrustum = inFrustum && (d >= light_attenuationEndNeg);
d = light_positionView_z * frustumPlanes_z[1] + d = light_positionView_z * frustumPlanes_z[1] +
light_positionView_x * frustumPlanes_xy[1]; light_positionView_x * frustumPlanes_xy[1];
inFrustum = inFrustum && (d >= light_attenuationEndNeg); inFrustum = inFrustum && (d >= light_attenuationEndNeg);
d = light_positionView_z * frustumPlanes_z[2] + d = light_positionView_z * frustumPlanes_z[2] +
light_positionView_y * frustumPlanes_xy[2]; light_positionView_y * frustumPlanes_xy[2];
inFrustum = inFrustum && (d >= light_attenuationEndNeg); inFrustum = inFrustum && (d >= light_attenuationEndNeg);
d = light_positionView_z * frustumPlanes_z[3] + d = light_positionView_z * frustumPlanes_z[3] +
light_positionView_y * frustumPlanes_xy[3]; light_positionView_y * frustumPlanes_xy[3];
inFrustum = inFrustum && (d >= light_attenuationEndNeg); inFrustum = inFrustum && (d >= light_attenuationEndNeg);
#if 0 #if 0
// Pack and store intersecting lights // Pack and store intersecting lights
cif (inFrustum) { cif (inFrustum) {
tileNumLights += packed_store_active(&tileLightIndices[tileNumLights], tileNumLights += packed_store_active(&tileLightIndices[tileNumLights],
lightIndex); lightIndex);
} }
#else #else
@@ -245,7 +245,7 @@ IntersectLightsWithTileMinMax(
} }
#if 1 #if 1
inline inline
#endif #endif
static uniform int32 static uniform int32
@@ -277,7 +277,7 @@ IntersectLightsWithTile(
uniform int32 tileNumLights = IntersectLightsWithTileMinMax( uniform int32 tileNumLights = IntersectLightsWithTileMinMax(
tileStartX, tileEndX, tileStartY, tileEndY, minZ, maxZ, tileStartX, tileEndX, tileStartY, tileEndY, minZ, maxZ,
gBufferWidth, gBufferHeight, cameraProj_11, cameraProj_22, gBufferWidth, gBufferHeight, cameraProj_11, cameraProj_22,
MAX_LIGHTS, light_positionView_x_array, light_positionView_y_array, MAX_LIGHTS, light_positionView_x_array, light_positionView_y_array,
light_positionView_z_array, light_attenuationEnd_array, light_positionView_z_array, light_attenuationEnd_array,
tileLightIndices); tileLightIndices);
@@ -285,7 +285,7 @@ IntersectLightsWithTile(
} }
#if 1 #if 1
inline inline
#endif #endif
#ifndef __NVPTX__ #ifndef __NVPTX__
@@ -324,13 +324,13 @@ ShadeTile(
} else { } else {
uniform float twoOverGBufferWidth = 2.0f / gBufferWidth; uniform float twoOverGBufferWidth = 2.0f / gBufferWidth;
uniform float twoOverGBufferHeight = 2.0f / gBufferHeight; uniform float twoOverGBufferHeight = 2.0f / gBufferHeight;
for (uniform int32 y = tileStartY; y < tileEndY; ++y) { for (uniform int32 y = tileStartY; y < tileEndY; ++y) {
uniform float positionScreen_y = -(((0.5f + y) * twoOverGBufferHeight) - 1.f); uniform float positionScreen_y = -(((0.5f + y) * twoOverGBufferHeight) - 1.f);
foreach (x = tileStartX ... tileEndX) { foreach (x = tileStartX ... tileEndX) {
int32 gBufferOffset = y * gBufferWidth + x; int32 gBufferOffset = y * gBufferWidth + x;
// Reconstruct position and (negative) view vector from G-buffer // Reconstruct position and (negative) view vector from G-buffer
float surface_positionView_x, surface_positionView_y, surface_positionView_z; float surface_positionView_x, surface_positionView_y, surface_positionView_z;
float Vneg_x, Vneg_y, Vneg_z; float Vneg_x, Vneg_y, Vneg_z;
@@ -339,70 +339,70 @@ ShadeTile(
// Compute screen/clip-space position // Compute screen/clip-space position
// NOTE: Mind DX11 viewport transform and pixel center! // NOTE: Mind DX11 viewport transform and pixel center!
float positionScreen_x = (0.5f + (float)(x)) * float positionScreen_x = (0.5f + (float)(x)) *
twoOverGBufferWidth - 1.0f; twoOverGBufferWidth - 1.0f;
// Unproject depth buffer Z value into view space // Unproject depth buffer Z value into view space
surface_positionView_z = cameraProj_43 / (z - cameraProj_33); surface_positionView_z = cameraProj_43 / (z - cameraProj_33);
surface_positionView_x = positionScreen_x * surface_positionView_z / surface_positionView_x = positionScreen_x * surface_positionView_z /
cameraProj_11; cameraProj_11;
surface_positionView_y = positionScreen_y * surface_positionView_z / surface_positionView_y = positionScreen_y * surface_positionView_z /
cameraProj_22; cameraProj_22;
// We actually end up with a vector pointing *at* the // We actually end up with a vector pointing *at* the
// surface (i.e. the negative view vector) // surface (i.e. the negative view vector)
normalize3(surface_positionView_x, surface_positionView_y, normalize3(surface_positionView_x, surface_positionView_y,
surface_positionView_z, Vneg_x, Vneg_y, Vneg_z); surface_positionView_z, Vneg_x, Vneg_y, Vneg_z);
// Reconstruct normal from G-buffer // Reconstruct normal from G-buffer
float surface_normal_x, surface_normal_y, surface_normal_z; float surface_normal_x, surface_normal_y, surface_normal_z;
float normal_x = half_to_float(inputData.normalEncoded_x[gBufferOffset]); float normal_x = half_to_float(inputData.normalEncoded_x[gBufferOffset]);
float normal_y = half_to_float(inputData.normalEncoded_y[gBufferOffset]); float normal_y = half_to_float(inputData.normalEncoded_y[gBufferOffset]);
float f = (normal_x - normal_x * normal_x) + (normal_y - normal_y * normal_y); float f = (normal_x - normal_x * normal_x) + (normal_y - normal_y * normal_y);
float m = sqrt(4.0f * f - 1.0f); float m = sqrt(4.0f * f - 1.0f);
surface_normal_x = m * (4.0f * normal_x - 2.0f); surface_normal_x = m * (4.0f * normal_x - 2.0f);
surface_normal_y = m * (4.0f * normal_y - 2.0f); surface_normal_y = m * (4.0f * normal_y - 2.0f);
surface_normal_z = 3.0f - 8.0f * f; surface_normal_z = 3.0f - 8.0f * f;
// Load other G-buffer parameters // Load other G-buffer parameters
float surface_specularAmount = float surface_specularAmount =
half_to_float(inputData.specularAmount[gBufferOffset]); half_to_float(inputData.specularAmount[gBufferOffset]);
float surface_specularPower = float surface_specularPower =
half_to_float(inputData.specularPower[gBufferOffset]); half_to_float(inputData.specularPower[gBufferOffset]);
float surface_albedo_x = Unorm8ToFloat32(inputData.albedo_x[gBufferOffset]); float surface_albedo_x = Unorm8ToFloat32(inputData.albedo_x[gBufferOffset]);
float surface_albedo_y = Unorm8ToFloat32(inputData.albedo_y[gBufferOffset]); float surface_albedo_y = Unorm8ToFloat32(inputData.albedo_y[gBufferOffset]);
float surface_albedo_z = Unorm8ToFloat32(inputData.albedo_z[gBufferOffset]); float surface_albedo_z = Unorm8ToFloat32(inputData.albedo_z[gBufferOffset]);
float lit_x = 0.0f; float lit_x = 0.0f;
float lit_y = 0.0f; float lit_y = 0.0f;
float lit_z = 0.0f; float lit_z = 0.0f;
for (uniform int32 tileLightIndex = 0; tileLightIndex < tileNumLights; for (uniform int32 tileLightIndex = 0; tileLightIndex < tileNumLights;
++tileLightIndex) { ++tileLightIndex) {
uniform int32 lightIndex = tileLightIndices[tileLightIndex]; uniform int32 lightIndex = tileLightIndices[tileLightIndex];
// Gather light data relevant to initial culling // Gather light data relevant to initial culling
uniform float light_positionView_x = uniform float light_positionView_x =
inputData.lightPositionView_x[lightIndex]; inputData.lightPositionView_x[lightIndex];
uniform float light_positionView_y = uniform float light_positionView_y =
inputData.lightPositionView_y[lightIndex]; inputData.lightPositionView_y[lightIndex];
uniform float light_positionView_z = uniform float light_positionView_z =
inputData.lightPositionView_z[lightIndex]; inputData.lightPositionView_z[lightIndex];
uniform float light_attenuationEnd = uniform float light_attenuationEnd =
inputData.lightAttenuationEnd[lightIndex]; inputData.lightAttenuationEnd[lightIndex];
// Compute light vector // Compute light vector
float L_x = light_positionView_x - surface_positionView_x; float L_x = light_positionView_x - surface_positionView_x;
float L_y = light_positionView_y - surface_positionView_y; float L_y = light_positionView_y - surface_positionView_y;
float L_z = light_positionView_z - surface_positionView_z; float L_z = light_positionView_z - surface_positionView_z;
float distanceToLight2 = dot3(L_x, L_y, L_z, L_x, L_y, L_z); float distanceToLight2 = dot3(L_x, L_y, L_z, L_x, L_y, L_z);
// Clip at end of attenuation // Clip at end of attenuation
float light_attenutaionEnd2 = light_attenuationEnd * light_attenuationEnd; float light_attenutaionEnd2 = light_attenuationEnd * light_attenuationEnd;
cif (distanceToLight2 < light_attenutaionEnd2) { cif (distanceToLight2 < light_attenutaionEnd2) {
float distanceToLight = sqrt(distanceToLight2); float distanceToLight = sqrt(distanceToLight2);
// HLSL "rcp" is allowed to be fairly inaccurate // HLSL "rcp" is allowed to be fairly inaccurate
@@ -412,12 +412,12 @@ ShadeTile(
L_z *= distanceToLightRcp; L_z *= distanceToLightRcp;
// Start computing brdf // Start computing brdf
float NdotL = dot3(surface_normal_x, surface_normal_y, float NdotL = dot3(surface_normal_x, surface_normal_y,
surface_normal_z, L_x, L_y, L_z); surface_normal_z, L_x, L_y, L_z);
// Clip back facing // Clip back facing
cif (NdotL > 0.0f) { cif (NdotL > 0.0f) {
uniform float light_attenuationBegin = uniform float light_attenuationBegin =
inputData.lightAttenuationBegin[lightIndex]; inputData.lightAttenuationBegin[lightIndex];
// Light distance attenuation (linstep) // Light distance attenuation (linstep)
@@ -429,19 +429,19 @@ ShadeTile(
float H_y = (L_y - Vneg_y); float H_y = (L_y - Vneg_y);
float H_z = (L_z - Vneg_z); float H_z = (L_z - Vneg_z);
normalize3(H_x, H_y, H_z, H_x, H_y, H_z); normalize3(H_x, H_y, H_z, H_x, H_y, H_z);
float NdotH = dot3(surface_normal_x, surface_normal_y, float NdotH = dot3(surface_normal_x, surface_normal_y,
surface_normal_z, H_x, H_y, H_z); surface_normal_z, H_x, H_y, H_z);
NdotH = max(NdotH, 0.0f); NdotH = max(NdotH, 0.0f);
float specular = pow(NdotH, surface_specularPower); float specular = pow(NdotH, surface_specularPower);
float specularNorm = (surface_specularPower + 2.0f) * float specularNorm = (surface_specularPower + 2.0f) *
(1.0f / 8.0f); (1.0f / 8.0f);
float specularContrib = surface_specularAmount * float specularContrib = surface_specularAmount *
specularNorm * specular; specularNorm * specular;
float k = attenuation * NdotL * (1.0f + specularContrib); float k = attenuation * NdotL * (1.0f + specularContrib);
uniform float light_color_x = inputData.lightColor_x[lightIndex]; uniform float light_color_x = inputData.lightColor_x[lightIndex];
uniform float light_color_y = inputData.lightColor_y[lightIndex]; uniform float light_color_y = inputData.lightColor_y[lightIndex];
uniform float light_color_z = inputData.lightColor_z[lightIndex]; uniform float light_color_z = inputData.lightColor_z[lightIndex];
@@ -465,7 +465,7 @@ ShadeTile(
lit_x = pow(clamp(lit_x, 0.0f, 1.0f), gamma); lit_x = pow(clamp(lit_x, 0.0f, 1.0f), gamma);
lit_y = pow(clamp(lit_y, 0.0f, 1.0f), gamma); lit_y = pow(clamp(lit_y, 0.0f, 1.0f), gamma);
lit_z = pow(clamp(lit_z, 0.0f, 1.0f), gamma); lit_z = pow(clamp(lit_z, 0.0f, 1.0f), gamma);
framebuffer_r[gBufferOffset] = Float32ToUnorm8(lit_x); framebuffer_r[gBufferOffset] = Float32ToUnorm8(lit_x);
framebuffer_g[gBufferOffset] = Float32ToUnorm8(lit_y); framebuffer_g[gBufferOffset] = Float32ToUnorm8(lit_y);
framebuffer_b[gBufferOffset] = Float32ToUnorm8(lit_z); framebuffer_b[gBufferOffset] = Float32ToUnorm8(lit_z);
@@ -512,8 +512,8 @@ RenderTile(uniform int num_groups_x, uniform int num_groups_y,
#else /* shared memory doesn't full work... why? */ #else /* shared memory doesn't full work... why? */
uniform int tileLightIndices[MAX_LIGHTS]; // Light list for the tile uniform int tileLightIndices[MAX_LIGHTS]; // Light list for the tile
#endif #endif
uniform int numTileLights = uniform int numTileLights =
IntersectLightsWithTile(tile_start_x, tile_end_x, IntersectLightsWithTile(tile_start_x, tile_end_x,
tile_start_y, tile_end_y, tile_start_y, tile_end_y,
framebufferWidth, framebufferHeight, framebufferWidth, framebufferHeight,
inputData.zBuffer, inputData.zBuffer,
@@ -521,9 +521,9 @@ RenderTile(uniform int num_groups_x, uniform int num_groups_y,
cameraProj_22, cameraProj_32, cameraProj_22, cameraProj_32,
inputHeader.cameraNear, inputHeader.cameraFar, inputHeader.cameraNear, inputHeader.cameraFar,
MAX_LIGHTS, MAX_LIGHTS,
inputData.lightPositionView_x, inputData.lightPositionView_x,
inputData.lightPositionView_y, inputData.lightPositionView_y,
inputData.lightPositionView_z, inputData.lightPositionView_z,
inputData.lightAttenuationEnd, inputData.lightAttenuationEnd,
tileLightIndices); tileLightIndices);
@@ -531,7 +531,7 @@ RenderTile(uniform int num_groups_x, uniform int num_groups_y,
ShadeTile(tile_start_x, tile_end_x, tile_start_y, tile_end_y, ShadeTile(tile_start_x, tile_end_x, tile_start_y, tile_end_y,
framebufferWidth, framebufferHeight, inputData, framebufferWidth, framebufferHeight, inputData,
cameraProj_00, cameraProj_11, cameraProj_22, cameraProj_32, cameraProj_00, cameraProj_11, cameraProj_22, cameraProj_32,
tileLightIndices, numTileLights, visualizeLightCount, tileLightIndices, numTileLights, visualizeLightCount,
framebuffer_r, framebuffer_g, framebuffer_b); framebuffer_r, framebuffer_g, framebuffer_b);
#ifdef MALLOC #ifdef MALLOC
delete tileLightIndices; delete tileLightIndices;
@@ -551,9 +551,9 @@ RenderStatic(uniform InputHeader inputHeaderPtr[],
uniform InputHeader inputHeader = *inputHeaderPtr; uniform InputHeader inputHeader = *inputHeaderPtr;
uniform InputDataArrays inputData = *inputDataPtr; uniform InputDataArrays inputData = *inputDataPtr;
uniform int num_groups_x = (inputHeader.framebufferWidth + uniform int num_groups_x = (inputHeader.framebufferWidth +
MIN_TILE_WIDTH - 1) / MIN_TILE_WIDTH; MIN_TILE_WIDTH - 1) / MIN_TILE_WIDTH;
uniform int num_groups_y = (inputHeader.framebufferHeight + uniform int num_groups_y = (inputHeader.framebufferHeight +
MIN_TILE_HEIGHT - 1) / MIN_TILE_HEIGHT; MIN_TILE_HEIGHT - 1) / MIN_TILE_HEIGHT;
uniform int num_groups = num_groups_x * num_groups_y; uniform int num_groups = num_groups_x * num_groups_y;
@@ -627,16 +627,16 @@ SplitTileMinMax(
{ {
uniform float gBufferScale_x = 0.5f * (float)gBufferWidth; uniform float gBufferScale_x = 0.5f * (float)gBufferWidth;
uniform float gBufferScale_y = 0.5f * (float)gBufferHeight; uniform float gBufferScale_y = 0.5f * (float)gBufferHeight;
uniform_t float frustumPlanes_xy[2] = { -(cameraProj_11 * gBufferScale_x), uniform_t float frustumPlanes_xy[2] = { -(cameraProj_11 * gBufferScale_x),
(cameraProj_22 * gBufferScale_y) }; (cameraProj_22 * gBufferScale_y) };
uniform_t float frustumPlanes_z[2] = { tileMidX - gBufferScale_x, uniform_t float frustumPlanes_z[2] = { tileMidX - gBufferScale_x,
tileMidY - gBufferScale_y }; tileMidY - gBufferScale_y };
// Normalize // Normalize
uniform_t float norm[2] = { rsqrt(frustumPlanes_xy[0] * frustumPlanes_xy[0] + uniform_t float norm[2] = { rsqrt(frustumPlanes_xy[0] * frustumPlanes_xy[0] +
frustumPlanes_z[0] * frustumPlanes_z[0]), frustumPlanes_z[0] * frustumPlanes_z[0]),
rsqrt(frustumPlanes_xy[1] * frustumPlanes_xy[1] + rsqrt(frustumPlanes_xy[1] * frustumPlanes_xy[1] +
frustumPlanes_z[1] * frustumPlanes_z[1]) }; frustumPlanes_z[1] * frustumPlanes_z[1]) };
frustumPlanes_xy[0] *= norm[0]; frustumPlanes_xy[0] *= norm[0];
frustumPlanes_xy[1] *= norm[1]; frustumPlanes_xy[1] *= norm[1];
@@ -658,23 +658,23 @@ SplitTileMinMax(
float light_positionView_z = light_positionView_z_array[lightIndex]; float light_positionView_z = light_positionView_z_array[lightIndex];
float light_attenuationEnd = light_attenuationEnd_array[lightIndex]; float light_attenuationEnd = light_attenuationEnd_array[lightIndex];
float light_attenuationEndNeg = -light_attenuationEnd; float light_attenuationEndNeg = -light_attenuationEnd;
// Test lights again subtile z bounds // Test lights again subtile z bounds
bool inFrustum[4]; bool inFrustum[4];
inFrustum[0] = (light_positionView_z - subtileMinZ[0] >= light_attenuationEndNeg) && inFrustum[0] = (light_positionView_z - subtileMinZ[0] >= light_attenuationEndNeg) &&
(subtileMaxZ[0] - light_positionView_z >= light_attenuationEndNeg); (subtileMaxZ[0] - light_positionView_z >= light_attenuationEndNeg);
inFrustum[1] = (light_positionView_z - subtileMinZ[1] >= light_attenuationEndNeg) && inFrustum[1] = (light_positionView_z - subtileMinZ[1] >= light_attenuationEndNeg) &&
(subtileMaxZ[1] - light_positionView_z >= light_attenuationEndNeg); (subtileMaxZ[1] - light_positionView_z >= light_attenuationEndNeg);
inFrustum[2] = (light_positionView_z - subtileMinZ[2] >= light_attenuationEndNeg) && inFrustum[2] = (light_positionView_z - subtileMinZ[2] >= light_attenuationEndNeg) &&
(subtileMaxZ[2] - light_positionView_z >= light_attenuationEndNeg); (subtileMaxZ[2] - light_positionView_z >= light_attenuationEndNeg);
inFrustum[3] = (light_positionView_z - subtileMinZ[3] >= light_attenuationEndNeg) && inFrustum[3] = (light_positionView_z - subtileMinZ[3] >= light_attenuationEndNeg) &&
(subtileMaxZ[3] - light_positionView_z >= light_attenuationEndNeg); (subtileMaxZ[3] - light_positionView_z >= light_attenuationEndNeg);
float dx = light_positionView_z * frustumPlanes_z[0] + float dx = light_positionView_z * frustumPlanes_z[0] +
light_positionView_x * frustumPlanes_xy[0]; light_positionView_x * frustumPlanes_xy[0];
float dy = light_positionView_z * frustumPlanes_z[1] + float dy = light_positionView_z * frustumPlanes_z[1] +
light_positionView_y * frustumPlanes_xy[1]; light_positionView_y * frustumPlanes_xy[1];
cif (abs(dx) > light_attenuationEnd) { cif (abs(dx) > light_attenuationEnd) {
bool positiveX = dx > 0.0f; bool positiveX = dx > 0.0f;
inFrustum[0] = inFrustum[0] && positiveX; // 00 subtile inFrustum[0] = inFrustum[0] && positiveX; // 00 subtile
@@ -693,20 +693,20 @@ SplitTileMinMax(
// Pack and store intersecting lights // Pack and store intersecting lights
// TODO: Experiment with a loop here instead // TODO: Experiment with a loop here instead
cif (inFrustum[0]) cif (inFrustum[0])
subtileLightOffset[0] += subtileLightOffset[0] +=
packed_store_active(&subtileIndices[subtileLightOffset[0]], packed_store_active(&subtileIndices[subtileLightOffset[0]],
lightIndex); lightIndex);
cif (inFrustum[1]) cif (inFrustum[1])
subtileLightOffset[1] += subtileLightOffset[1] +=
packed_store_active(&subtileIndices[subtileLightOffset[1]], packed_store_active(&subtileIndices[subtileLightOffset[1]],
lightIndex); lightIndex);
cif (inFrustum[2]) cif (inFrustum[2])
subtileLightOffset[2] += subtileLightOffset[2] +=
packed_store_active(&subtileIndices[subtileLightOffset[2]], packed_store_active(&subtileIndices[subtileLightOffset[2]],
lightIndex); lightIndex);
cif (inFrustum[3]) cif (inFrustum[3])
subtileLightOffset[3] += subtileLightOffset[3] +=
packed_store_active(&subtileIndices[subtileLightOffset[3]], packed_store_active(&subtileIndices[subtileLightOffset[3]],
lightIndex); lightIndex);
} }

View File

@@ -1,5 +1,5 @@
/* /*
Copyright (c) 2011, Intel Corporation Copyright (c) 2011-2014, Intel Corporation
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
@@ -28,7 +28,7 @@
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ */
#ifdef _MSC_VER #ifdef _MSC_VER

View File

@@ -5,8 +5,8 @@ uniform int gcd(uniform int a, uniform int b)
{ {
while ( a != 0 ) while ( a != 0 )
{ {
uniform int c = a; uniform int c = a;
a = b%a; a = b%a;
b = c; b = c;
} }
return b; return b;
@@ -32,7 +32,7 @@ int __sj(const int i, const uniform int j, const uniform int m, const uniform in
#if 0 #if 0
static inline static inline
void transpose_serial(uniform T A[], const uniform int m, const uniform int n) void transpose_serial(uniform T A[], const uniform int m, const uniform int n)
{ {
const uniform int tmpSize = max(m,n) * programCount; const uniform int tmpSize = max(m,n) * programCount;
@@ -90,7 +90,7 @@ static uniform int * uniform joverb = NULL;
static uniform int * uniform iovera = NULL; static uniform int * uniform iovera = NULL;
static uniform int a,b,c; static uniform int a,b,c;
static static
void transpose_init(const uniform int m, const uniform int n, const uniform int nTask) void transpose_init(const uniform int m, const uniform int n, const uniform int nTask)
{ {
joverb = uniform new uniform int[n]; joverb = uniform new uniform int[n];
@@ -105,14 +105,14 @@ void transpose_init(const uniform int m, const uniform int n, const uniform int
iovera[i] = i/a; iovera[i] = i/a;
} }
static static
void transpose_finalize() void transpose_finalize()
{ {
delete iovera; delete iovera;
delete joverb; delete joverb;
} }
task task
void transpose_step1(uniform T A[], const uniform int m, const uniform int n) void transpose_step1(uniform T A[], const uniform int m, const uniform int n)
{ {
const uniform int n_per_task = (n + taskCount - 1)/taskCount; const uniform int n_per_task = (n + taskCount - 1)/taskCount;
@@ -140,7 +140,7 @@ void transpose_step2(uniform T A[], const uniform int m, const uniform int n)
const uniform int m_per_task = (m + taskCount - 1)/taskCount; const uniform int m_per_task = (m + taskCount - 1)/taskCount;
const uniform int mibeg = taskIndex * m_per_task; const uniform int mibeg = taskIndex * m_per_task;
const uniform int miend = min(mibeg + m_per_task, m); const uniform int miend = min(mibeg + m_per_task, m);
uniform T * uniform tmp = uniform new uniform T[n*programCount]; uniform T * uniform tmp = uniform new uniform T[n*programCount];
uniform T (*uniform tmp2D)[programCount] = (uniform T (*uniform)[programCount])tmp; uniform T (*uniform tmp2D)[programCount] = (uniform T (*uniform)[programCount])tmp;
@@ -161,7 +161,7 @@ void transpose_step3(uniform T A[], const uniform int m, const uniform int n)
const uniform int n_per_task = (n + taskCount - 1)/taskCount; const uniform int n_per_task = (n + taskCount - 1)/taskCount;
const uniform int nibeg = taskIndex * n_per_task; const uniform int nibeg = taskIndex * n_per_task;
const uniform int niend = min(nibeg + n_per_task, n); const uniform int niend = min(nibeg + n_per_task, n);
uniform T * uniform tmp = uniform new uniform T[m]; uniform T * uniform tmp = uniform new uniform T[m];
for (uniform int j = nibeg; j < niend; j++) for (uniform int j = nibeg; j < niend; j++)
@@ -176,7 +176,7 @@ void transpose_step3(uniform T A[], const uniform int m, const uniform int n)
delete tmp; delete tmp;
} }
export export
void transpose(uniform T A[], const uniform int m, const uniform int n) void transpose(uniform T A[], const uniform int m, const uniform int n)
{ {
#if 0 #if 0
@@ -187,7 +187,7 @@ void transpose(uniform T A[], const uniform int m, const uniform int n)
launch [nTask] transpose_step1(A, m, n); launch [nTask] transpose_step1(A, m, n);
sync; sync;
launch [nTask] transpose_step2(A, m, n); launch [nTask] transpose_step2(A, m, n);
sync; sync;

View File

@@ -71,7 +71,7 @@ int main (int argc, char *argv[])
valsGld[i] = valsSrc[i]; valsGld[i] = valsSrc[i];
} }
delete keys; delete keys;
ispcSetMallocHeapLimit(1024*1024*1024); ispcSetMallocHeapLimit(1024*1024*1024);
ispc::openMergeSort(); ispc::openMergeSort();
@@ -115,7 +115,7 @@ int main (int argc, char *argv[])
} }
printf("\n---\n"); printf("\n---\n");
#endif #endif
std::sort(keysGld, keysGld + n); std::sort(keysGld, keysGld + n);

View File

@@ -30,9 +30,9 @@ int nextPowerOfTwo(int x)
__device__ static inline __device__ static inline
int binarySearchInclusiveRanks( int binarySearchInclusiveRanks(
const int val, const int val,
uniform int *data, uniform int *data,
const int L, const int L,
int stride) int stride)
{ {
if (L == 0) if (L == 0)
@@ -52,9 +52,9 @@ int binarySearchInclusiveRanks(
__device__ static inline __device__ static inline
int binarySearchExclusiveRanks( int binarySearchExclusiveRanks(
const int val, const int val,
uniform int *data, uniform int *data,
const int L, const int L,
int stride) int stride)
{ {
if (L == 0) if (L == 0)
@@ -74,9 +74,9 @@ int binarySearchExclusiveRanks(
__device__ static inline __device__ static inline
int binarySearchInclusive( int binarySearchInclusive(
const Key_t val, const Key_t val,
uniform Key_t *data, uniform Key_t *data,
const int L, const int L,
int stride) int stride)
{ {
if (L == 0) if (L == 0)
@@ -96,9 +96,9 @@ int binarySearchInclusive(
__device__ static inline __device__ static inline
int binarySearchExclusive( int binarySearchExclusive(
const Key_t val, const Key_t val,
uniform Key_t *data, uniform Key_t *data,
const int L, const int L,
int stride) int stride)
{ {
if (L == 0) if (L == 0)
@@ -118,9 +118,9 @@ int binarySearchExclusive(
__device__ static inline __device__ static inline
int binarySearchInclusive1( int binarySearchInclusive1(
const Key_t val, const Key_t val,
Key_t data, Key_t data,
const uniform int L, const uniform int L,
uniform int stride) uniform int stride)
{ {
if (L == 0) if (L == 0)
@@ -140,9 +140,9 @@ int binarySearchInclusive1(
__device__ static inline __device__ static inline
int binarySearchExclusive1( int binarySearchExclusive1(
const Key_t val, const Key_t val,
Key_t data, Key_t data,
const uniform int L, const uniform int L,
uniform int stride) uniform int stride)
{ {
if (L == 0) if (L == 0)
@@ -245,7 +245,7 @@ void generateSampleRanksKernel(
const uniform int blkDim = (nBlocks + taskCount - 1)/taskCount; const uniform int blkDim = (nBlocks + taskCount - 1)/taskCount;
const uniform int blkBeg = blkIdx * blkDim; const uniform int blkBeg = blkIdx * blkDim;
const uniform int blkEnd = min(blkBeg + blkDim, nBlocks); const uniform int blkEnd = min(blkBeg + blkDim, nBlocks);
for (uniform int blk = blkBeg; blk < blkEnd; blk++) for (uniform int blk = blkBeg; blk < blkEnd; blk++)
{ {
const int pos = blk * programCount + programIndex; const int pos = blk * programCount + programIndex;
@@ -291,8 +291,8 @@ void generateSampleRanks(
uniform int N) uniform int N)
{ {
uniform int lastSegmentElements = N % (2 * stride); uniform int lastSegmentElements = N % (2 * stride);
uniform int threadCount = (lastSegmentElements > stride) ? uniform int threadCount = (lastSegmentElements > stride) ?
(N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) : (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) :
(N - lastSegmentElements) / (2 * SAMPLE_STRIDE); (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);
uniform int nBlocks = iDivUp(threadCount, SAMPLE_STRIDE); uniform int nBlocks = iDivUp(threadCount, SAMPLE_STRIDE);
@@ -304,7 +304,7 @@ void generateSampleRanks(
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Merge step 2: generate sample ranks and indices // Merge step 2: generate sample ranks and indices
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
__global__ __global__
void mergeRanksAndIndicesKernel( void mergeRanksAndIndicesKernel(
uniform int nBlocks, uniform int nBlocks,
uniform int in_Limits[], uniform int in_Limits[],
@@ -317,7 +317,7 @@ void mergeRanksAndIndicesKernel(
const uniform int blkDim = (nBlocks + taskCount - 1)/taskCount; const uniform int blkDim = (nBlocks + taskCount - 1)/taskCount;
const uniform int blkBeg = blkIdx * blkDim; const uniform int blkBeg = blkIdx * blkDim;
const uniform int blkEnd = min(blkBeg + blkDim, nBlocks); const uniform int blkEnd = min(blkBeg + blkDim, nBlocks);
for (uniform int blk = blkBeg; blk < blkEnd; blk++) for (uniform int blk = blkBeg; blk < blkEnd; blk++)
{ {
int pos = blk * programCount + programIndex; int pos = blk * programCount + programIndex;
@@ -357,8 +357,8 @@ void mergeRanksAndIndices(
uniform int N) uniform int N)
{ {
const uniform int lastSegmentElements = N % (2 * stride); const uniform int lastSegmentElements = N % (2 * stride);
const uniform int threadCount = (lastSegmentElements > stride) ? const uniform int threadCount = (lastSegmentElements > stride) ?
(N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) : (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) :
(N - lastSegmentElements) / (2 * SAMPLE_STRIDE); (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);
const uniform int nBlocks = iDivUp(threadCount, SAMPLE_STRIDE); const uniform int nBlocks = iDivUp(threadCount, SAMPLE_STRIDE);
@@ -457,13 +457,13 @@ void mergeElementaryIntervalsKernel(
dstB = segmentBase + startDstB + dstPosB; dstB = segmentBase + startDstB + dstPosB;
// store merge data // store merge data
if (dstA >= 0) if (dstA >= 0)
{ {
// int dstA = segmentBase + startSrcA + programIndex; // int dstA = segmentBase + startSrcA + programIndex;
dstKey[dstA] = keyA; dstKey[dstA] = keyA;
dstVal[dstA] = valA; dstVal[dstA] = valA;
} }
if (dstB >= 0) if (dstB >= 0)
{ {
// int dstB = segmentBase + stride + startSrcB + programIndex; // int dstB = segmentBase + stride + startSrcB + programIndex;
dstKey[dstB] = keyB; dstKey[dstB] = keyB;
@@ -513,7 +513,7 @@ __device__ static uniform int * uniform limitsB;
__device__ static uniform int nTasks; __device__ static uniform int nTasks;
__device__ static uniform int MAX_SAMPLE_COUNT = 0; __device__ static uniform int MAX_SAMPLE_COUNT = 0;
__global__ __global__
void openMergeSort___export() void openMergeSort___export()
{ {
nTasks = 13*32*13; nTasks = 13*32*13;

View File

@@ -25,9 +25,9 @@ int nextPowerOfTwo(int x)
static inline static inline
int binarySearchInclusiveRanks( int binarySearchInclusiveRanks(
const int val, const int val,
uniform int *data, uniform int *data,
const int L, const int L,
int stride) int stride)
{ {
cif (L == 0) cif (L == 0)
@@ -47,9 +47,9 @@ int binarySearchInclusiveRanks(
static inline static inline
int binarySearchExclusiveRanks( int binarySearchExclusiveRanks(
const int val, const int val,
uniform int *data, uniform int *data,
const int L, const int L,
int stride) int stride)
{ {
cif (L == 0) cif (L == 0)
@@ -69,9 +69,9 @@ int binarySearchExclusiveRanks(
static inline static inline
int binarySearchInclusive( int binarySearchInclusive(
const Key_t val, const Key_t val,
uniform Key_t *data, uniform Key_t *data,
const int L, const int L,
int stride) int stride)
{ {
cif (L == 0) cif (L == 0)
@@ -91,9 +91,9 @@ int binarySearchInclusive(
static inline static inline
int binarySearchExclusive( int binarySearchExclusive(
const Key_t val, const Key_t val,
uniform Key_t *data, uniform Key_t *data,
const int L, const int L,
int stride) int stride)
{ {
cif (L == 0) cif (L == 0)
@@ -113,9 +113,9 @@ int binarySearchExclusive(
static inline static inline
int binarySearchInclusive1( int binarySearchInclusive1(
const Key_t val, const Key_t val,
Key_t data, Key_t data,
const uniform int L, const uniform int L,
uniform int stride) uniform int stride)
{ {
if (L == 0) if (L == 0)
@@ -135,9 +135,9 @@ int binarySearchInclusive1(
static inline static inline
int binarySearchExclusive1( int binarySearchExclusive1(
const Key_t val, const Key_t val,
Key_t data, Key_t data,
const uniform int L, const uniform int L,
uniform int stride) uniform int stride)
{ {
if (L == 0) if (L == 0)
@@ -158,7 +158,7 @@ int binarySearchExclusive1(
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Bottom-level merge sort (binary search-based) // Bottom-level merge sort (binary search-based)
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
task task
void mergeSortGangKernel( void mergeSortGangKernel(
uniform int batchSize, uniform int batchSize,
uniform Key_t dstKey[], uniform Key_t dstKey[],
@@ -189,7 +189,7 @@ void mergeSortGangKernel(
const int offset = 2 * (programIndex - lPos); const int offset = 2 * (programIndex - lPos);
uniform Key_t *baseKey = s_key + 2 * (programIndex - lPos); uniform Key_t *baseKey = s_key + 2 * (programIndex - lPos);
uniform Val_t *baseVal = s_val + 2 * (programIndex - lPos); uniform Val_t *baseVal = s_val + 2 * (programIndex - lPos);
Key_t keyA = baseKey[lPos + 0]; Key_t keyA = baseKey[lPos + 0];
Val_t valA = baseVal[lPos + 0]; Val_t valA = baseVal[lPos + 0];
Key_t keyB = baseKey[lPos + stride]; Key_t keyB = baseKey[lPos + stride];
@@ -244,7 +244,7 @@ void generateSampleRanksKernel(
const uniform int blockDim = (nBlocks + taskCount - 1)/taskCount; const uniform int blockDim = (nBlocks + taskCount - 1)/taskCount;
const uniform int blockBeg = blockIdx * blockDim; const uniform int blockBeg = blockIdx * blockDim;
const uniform int blockEnd = min(blockBeg + blockDim, nBlocks); const uniform int blockEnd = min(blockBeg + blockDim, nBlocks);
for (uniform int block = blockBeg; block < blockEnd; block++) for (uniform int block = blockBeg; block < blockEnd; block++)
{ {
const int pos = block * programCount + programIndex; const int pos = block * programCount + programIndex;
@@ -290,8 +290,8 @@ void generateSampleRanks(
uniform int N) uniform int N)
{ {
uniform int lastSegmentElements = N % (2 * stride); uniform int lastSegmentElements = N % (2 * stride);
uniform int threadCount = (lastSegmentElements > stride) ? uniform int threadCount = (lastSegmentElements > stride) ?
(N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) : (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) :
(N - lastSegmentElements) / (2 * SAMPLE_STRIDE); (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);
uniform int nBlocks = iDivUp(threadCount, SAMPLE_STRIDE); uniform int nBlocks = iDivUp(threadCount, SAMPLE_STRIDE);
@@ -306,7 +306,7 @@ void generateSampleRanks(
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Merge step 2: generate sample ranks and indices // Merge step 2: generate sample ranks and indices
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
task task
void mergeRanksAndIndicesKernel( void mergeRanksAndIndicesKernel(
uniform int nBlocks, uniform int nBlocks,
uniform int in_Limits[], uniform int in_Limits[],
@@ -319,7 +319,7 @@ void mergeRanksAndIndicesKernel(
const uniform int blockDim = (nBlocks + taskCount - 1)/taskCount; const uniform int blockDim = (nBlocks + taskCount - 1)/taskCount;
const uniform int blockBeg = blockIdx * blockDim; const uniform int blockBeg = blockIdx * blockDim;
const uniform int blockEnd = min(blockBeg + blockDim, nBlocks); const uniform int blockEnd = min(blockBeg + blockDim, nBlocks);
for (uniform int block = blockBeg; block < blockEnd; block++) for (uniform int block = blockBeg; block < blockEnd; block++)
{ {
int pos = block * programCount + programIndex; int pos = block * programCount + programIndex;
@@ -359,8 +359,8 @@ void mergeRanksAndIndices(
uniform int N) uniform int N)
{ {
const uniform int lastSegmentElements = N % (2 * stride); const uniform int lastSegmentElements = N % (2 * stride);
const uniform int threadCount = (lastSegmentElements > stride) ? const uniform int threadCount = (lastSegmentElements > stride) ?
(N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) : (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) :
(N - lastSegmentElements) / (2 * SAMPLE_STRIDE); (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);
const uniform int nBlocks = iDivUp(threadCount, SAMPLE_STRIDE); const uniform int nBlocks = iDivUp(threadCount, SAMPLE_STRIDE);
@@ -462,12 +462,12 @@ void mergeElementaryIntervalsKernel(
if (programIndex < lenSrcB && dstPosB < lenSrcB) if (programIndex < lenSrcB && dstPosB < lenSrcB)
dstB = segmentBase + startDstB + dstPosB; dstB = segmentBase + startDstB + dstPosB;
if (dstA >= 0) if (dstA >= 0)
{ {
dstKey[dstA] = keyA; dstKey[dstA] = keyA;
dstVal[dstA] = valA; dstVal[dstA] = valA;
} }
if (dstB >= 0) if (dstB >= 0)
{ {
dstKey[dstB] = keyB; dstKey[dstB] = keyB;
dstVal[dstB] = valB; dstVal[dstB] = valB;
@@ -521,7 +521,7 @@ static uniform int * uniform limitsA;
static uniform int * uniform limitsB; static uniform int * uniform limitsB;
static uniform int MAX_SAMPLE_COUNT = 0; static uniform int MAX_SAMPLE_COUNT = 0;
export export
void openMergeSort() void openMergeSort()
{ {
MAX_SAMPLE_COUNT = 8*32 * 131072 / programCount; MAX_SAMPLE_COUNT = 8*32 * 131072 / programCount;
@@ -542,7 +542,7 @@ void closeMergeSort()
memPool = NULL; memPool = NULL;
} }
export export
void mergeSort( void mergeSort(
uniform Key_t dstKey[], uniform Key_t dstKey[],
uniform Val_t dstVal[], uniform Val_t dstVal[],
@@ -601,7 +601,7 @@ void mergeSort(
} }
#endif #endif
// cpu: 287 gpu: 194 M/s // cpu: 287 gpu: 194 M/s
//Merge elementary intervals //Merge elementary intervals
mergeElementaryIntervals(oKey, oVal, iKey, iVal, limitsA, limitsB, stride, N); mergeElementaryIntervals(oKey, oVal, iKey, iVal, limitsA, limitsB, stride, N);
} }

View File

@@ -61,7 +61,7 @@ struct Hermite4
const real R0 = 1; const real R0 = 1;
const real mp = 1.0/n; const real mp = 1.0/n;
#pragma omp parallel for schedule(runtime) #pragma omp parallel for schedule(runtime)
for (int i = 0; i < n; i++) for (int i = 0; i < n; i++)
{ {
real xp, yp, zp, s2 = 2*R0; real xp, yp, zp, s2 = 2*R0;
real vx, vy, vz; real vx, vy, vz;
@@ -73,7 +73,7 @@ struct Hermite4
vx = drand48() * 0.1; vx = drand48() * 0.1;
vy = drand48() * 0.1; vy = drand48() * 0.1;
vz = drand48() * 0.1; vz = drand48() * 0.1;
} }
g_posx[i] = xp; g_posx[i] = xp;
g_posy[i] = yp; g_posy[i] = yp;
g_posz[i] = zp; g_posz[i] = zp;
@@ -104,7 +104,7 @@ struct Hermite4
void forces(); void forces();
real step(const real dt) real step(const real dt)
{ {
const real dt2 = dt*real(1.0/2.0); const real dt2 = dt*real(1.0/2.0);
const real dt3 = dt*real(1.0/3.0); const real dt3 = dt*real(1.0/3.0);
@@ -149,9 +149,9 @@ struct Hermite4
{ {
/* compute snp & crk */ /* compute snp & crk */
const real Amx = g_accx[i] - accx0[i]; const real Amx = g_accx[i] - accx0[i];
const real Amy = g_accy[i] - accy0[i]; const real Amy = g_accy[i] - accy0[i];
const real Amz = g_accz[i] - accz0[i]; const real Amz = g_accz[i] - accz0[i];
const real Jmx = h*(g_jrkx[i] - jrkx0[i]); const real Jmx = h*(g_jrkx[i] - jrkx0[i]);
const real Jmy = h*(g_jrky[i] - jrky0[i]); const real Jmy = h*(g_jrky[i] - jrky0[i]);
@@ -199,18 +199,18 @@ struct Hermite4
} }
} }
if (dt_min == HUGE) if (dt_min == HUGE)
return dt; return dt;
else else
return dt_min; return dt_min;
} }
void energy(real &Ekin, real &Epot) void energy(real &Ekin, real &Epot)
{ {
real ekin = 0, epot = 0; real ekin = 0, epot = 0;
#pragma omp parallel for reduction(+:ekin,epot) #pragma omp parallel for reduction(+:ekin,epot)
for (int i = 0; i < n; i++) for (int i = 0; i < n; i++)
{ {
ekin += g_mass[i] * (g_velx[i]*g_velx[i] + g_vely[i]*g_vely[i] + g_velz[i]*g_velz[i]) * real(0.5f); ekin += g_mass[i] * (g_velx[i]*g_velx[i] + g_vely[i]*g_vely[i] + g_velz[i]*g_velz[i]) * real(0.5f);
epot += real(0.5f)*g_mass[i] * g_gpot[i]; epot += real(0.5f)*g_mass[i] * g_gpot[i];
@@ -241,7 +241,7 @@ struct Hermite4
real dt = 1.0/131072; real dt = 1.0/131072;
real Epot, Ekin, Etot = Etot0; real Epot, Ekin, Etot = Etot0;
while (t_global < t_end) { while (t_global < t_end) {
if (iter % ntime == 0) if (iter % ntime == 0)
t0 = rtc(); t0 = rtc();
if (iter >= niter) return; if (iter >= niter) return;
@@ -302,7 +302,7 @@ void run(const int nbodies, const real eta, const int nstep)
h4.integrate(nstep); h4.integrate(nstep);
} }
int main(int argc, char *argv[]) int main(int argc, char *argv[])
{ {
printf(" Usage: %s [nbodies=8192] [nsteps=40] [eta=0.1] \n", argv[0]); printf(" Usage: %s [nbodies=8192] [nsteps=40] [eta=0.1] \n", argv[0]);

View File

@@ -12,7 +12,7 @@ struct Predictor
vec3 pos, vel; vec3 pos, vel;
}; };
static inline static inline
void body_body_force( void body_body_force(
Force &fi, Force &fi,
const Predictor &pi, const Predictor &pi,
@@ -40,14 +40,14 @@ void body_body_force(
fi.acc.y += minv_ds3 * dy; fi.acc.y += minv_ds3 * dy;
fi.acc.z += minv_ds3 * dz; fi.acc.z += minv_ds3 * dz;
fi.pot -= minv_ds; fi.pot -= minv_ds;
const real dvx = pj.vel.x - pi.vel.x; const real dvx = pj.vel.x - pi.vel.x;
const real dvy = pj.vel.y - pi.vel.y; const real dvy = pj.vel.y - pi.vel.y;
const real dvz = pj.vel.z - pi.vel.z; const real dvz = pj.vel.z - pi.vel.z;
const real rv = dx*dvx + dy*dvy + dz*dvz; const real rv = dx*dvx + dy*dvy + dz*dvz;
const real Jij = (real)(-3.0) * (rv * inv_ds2 * minv_ds3); const real Jij = (real)(-3.0) * (rv * inv_ds2 * minv_ds3);
fi.jrk.x += minv_ds3*dvx + Jij*dx; fi.jrk.x += minv_ds3*dvx + Jij*dx;
fi.jrk.y += minv_ds3*dvy + Jij*dy; fi.jrk.y += minv_ds3*dvy + Jij*dy;
fi.jrk.z += minv_ds3*dvz + Jij*dz; fi.jrk.z += minv_ds3*dvz + Jij*dz;
@@ -75,7 +75,7 @@ task void compute_forces_task(
const uniform int nibeg = taskIndex * nPerTask; const uniform int nibeg = taskIndex * nPerTask;
const uniform int niend = min(n, nibeg + nPerTask); const uniform int niend = min(n, nibeg + nPerTask);
if (nibeg >= n) if (nibeg >= n)
return; return;
uniform real shdata[7][programCount]; uniform real shdata[7][programCount];
@@ -88,7 +88,7 @@ task void compute_forces_task(
fi.acc = (real)0.0; fi.acc = (real)0.0;
fi.jrk = (real)0.0; fi.jrk = (real)0.0;
fi.pot = (real)0.0; fi.pot = (real)0.0;
Predictor pi; Predictor pi;
pi.pos.x = posx[i]; pi.pos.x = posx[i];
pi.pos.y = posy[i]; pi.pos.y = posy[i];
@@ -155,7 +155,7 @@ export void compute_forces(
launch [nTask] compute_forces_task( launch [nTask] compute_forces_task(
n, nPerTask, n, nPerTask,
mass, mass,
posx,posy,posz, posx,posy,posz,
velx,vely,velz, velx,vely,velz,
accx,accy,accz, accx,accy,accz,

View File

@@ -1,5 +1,5 @@
/* /*
Copyright (c) 2011-2012, Intel Corporation Copyright (c) 2014, Intel Corporation
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
@@ -28,11 +28,11 @@
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ */
#define DBG(x) #define DBG(x)
#include <omp.h> #include <omp.h>
#include <malloc.h> #include <malloc.h>
@@ -62,15 +62,15 @@ struct TaskInfo {
event taskEvent; event taskEvent;
#endif #endif
int taskCount() const { return taskCount3d[0]*taskCount3d[1]*taskCount3d[2]; } int taskCount() const { return taskCount3d[0]*taskCount3d[1]*taskCount3d[2]; }
int taskIndex0() const int taskIndex0() const
{ {
return taskIndex % taskCount3d[0]; return taskIndex % taskCount3d[0];
} }
int taskIndex1() const int taskIndex1() const
{ {
return ( taskIndex / taskCount3d[0] ) % taskCount3d[1]; return ( taskIndex / taskCount3d[0] ) % taskCount3d[1];
} }
int taskIndex2() const int taskIndex2() const
{ {
return taskIndex / ( taskCount3d[0]*taskCount3d[1] ); return taskIndex / ( taskCount3d[0]*taskCount3d[1] );
} }
@@ -85,7 +85,7 @@ __attribute__((aligned(32)));
; ;
// ispc expects these functions to have C linkage / not be mangled // ispc expects these functions to have C linkage / not be mangled
extern "C" { extern "C" {
void ISPCLaunch(void **handlePtr, void *f, void *data, int countx, int county, int countz); void ISPCLaunch(void **handlePtr, void *f, void *data, int countx, int county, int countz);
void *ISPCAlloc(void **handlePtr, int64_t size, int32_t alignment); void *ISPCAlloc(void **handlePtr, int64_t size, int32_t alignment);
void ISPCSync(void *handle); void ISPCSync(void *handle);
@@ -144,10 +144,10 @@ private:
}; };
inline TaskGroupBase::TaskGroupBase() { inline TaskGroupBase::TaskGroupBase() {
nextTaskInfoIndex = 0; nextTaskInfoIndex = 0;
curMemBuffer = 0; curMemBuffer = 0;
curMemBufferOffset = 0; curMemBufferOffset = 0;
memBuffers[0] = mem; memBuffers[0] = mem;
memBufferSize[0] = sizeof(mem) / sizeof(mem[0]); memBufferSize[0] = sizeof(mem) / sizeof(mem[0]);
@@ -171,8 +171,8 @@ inline TaskGroupBase::~TaskGroupBase() {
inline void inline void
TaskGroupBase::Reset() { TaskGroupBase::Reset() {
nextTaskInfoIndex = 0; nextTaskInfoIndex = 0;
curMemBuffer = 0; curMemBuffer = 0;
curMemBufferOffset = 0; curMemBufferOffset = 0;
} }
@@ -253,7 +253,7 @@ lAtomicCompareAndSwapPointer(void **v, void *newValue, void *oldValue) {
#endif // ISPC_IS_WINDOWS #endif // ISPC_IS_WINDOWS
} }
static int32_t static int32_t
lAtomicCompareAndSwap32(volatile int32_t *v, int32_t newValue, int32_t oldValue) { lAtomicCompareAndSwap32(volatile int32_t *v, int32_t newValue, int32_t oldValue) {
#ifdef ISPC_IS_WINDOWS #ifdef ISPC_IS_WINDOWS
return InterlockedCompareExchange((volatile LONG *)v, newValue, oldValue); return InterlockedCompareExchange((volatile LONG *)v, newValue, oldValue);
@@ -264,7 +264,7 @@ lAtomicCompareAndSwap32(volatile int32_t *v, int32_t newValue, int32_t oldValue)
#endif // ISPC_IS_WINDOWS #endif // ISPC_IS_WINDOWS
} }
static inline int32_t static inline int32_t
lAtomicAdd(volatile int32_t *v, int32_t delta) { lAtomicAdd(volatile int32_t *v, int32_t delta) {
#ifdef ISPC_IS_WINDOWS #ifdef ISPC_IS_WINDOWS
return InterlockedExchangeAdd((volatile LONG *)v, delta)+delta; return InterlockedExchangeAdd((volatile LONG *)v, delta)+delta;
@@ -300,11 +300,11 @@ TaskGroup::Launch(int baseIndex, int count) {
TaskInfo ti = *GetTaskInfo(baseIndex); TaskInfo ti = *GetTaskInfo(baseIndex);
#pragma omp for schedule(runtime) #pragma omp for schedule(runtime)
for(int i = 0; i < count; i++) for(int i = 0; i < count; i++)
{ {
ti.taskIndex = i; ti.taskIndex = i;
// Actually run the task. // Actually run the task.
ti.func(ti.data, threadIndex, threadCount, ti.taskIndex, ti.taskCount(), ti.func(ti.data, threadIndex, threadCount, ti.taskIndex, ti.taskCount(),
ti.taskIndex0(), ti.taskIndex1(), ti.taskIndex2(), ti.taskIndex0(), ti.taskIndex1(), ti.taskIndex2(),
ti.taskCount0(), ti.taskCount1(), ti.taskCount2()); ti.taskCount0(), ti.taskCount1(), ti.taskCount2());
@@ -322,7 +322,7 @@ TaskGroup::Sync() {
static TaskGroup *freeTaskGroups[MAX_FREE_TASK_GROUPS]; static TaskGroup *freeTaskGroups[MAX_FREE_TASK_GROUPS];
static inline TaskGroup * static inline TaskGroup *
AllocTaskGroup() AllocTaskGroup()
{ {
for (int i = 0; i < MAX_FREE_TASK_GROUPS; ++i) { for (int i = 0; i < MAX_FREE_TASK_GROUPS; ++i) {
TaskGroup *tg = freeTaskGroups[i]; TaskGroup *tg = freeTaskGroups[i];
@@ -339,7 +339,7 @@ AllocTaskGroup()
static inline void static inline void
FreeTaskGroup(TaskGroup *tg) FreeTaskGroup(TaskGroup *tg)
{ {
tg->Reset(); tg->Reset();
@@ -355,7 +355,7 @@ FreeTaskGroup(TaskGroup *tg)
} }
void void
ISPCLaunch(void **taskGroupPtr, void *func, void *data, int count0, int count1, int count2) ISPCLaunch(void **taskGroupPtr, void *func, void *data, int count0, int count1, int count2)
{ {
const int count = count0*count1*count2; const int count = count0*count1*count2;
TaskGroup *taskGroup; TaskGroup *taskGroup;
@@ -382,7 +382,7 @@ ISPCLaunch(void **taskGroupPtr, void *func, void *data, int count0, int count1,
void void
ISPCSync(void *h) ISPCSync(void *h)
{ {
TaskGroup *taskGroup = (TaskGroup *)h; TaskGroup *taskGroup = (TaskGroup *)h;
if (taskGroup != NULL) { if (taskGroup != NULL) {
@@ -393,7 +393,7 @@ ISPCSync(void *h)
void * void *
ISPCAlloc(void **taskGroupPtr, int64_t size, int32_t alignment) ISPCAlloc(void **taskGroupPtr, int64_t size, int32_t alignment)
{ {
TaskGroup *taskGroup; TaskGroup *taskGroup;
if (*taskGroupPtr == NULL) { if (*taskGroupPtr == NULL) {

View File

@@ -1,5 +1,5 @@
/* /*
Copyright (c) 2010-2011, Intel Corporation Copyright (c) 2010-2014, Intel Corporation
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
@@ -28,7 +28,7 @@
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ */
#define NOMINMAX #define NOMINMAX
@@ -96,7 +96,7 @@ int main(int argc, char *argv[]) {
sum = 0.; sum = 0.;
for (int i = 0; i < nOptions; ++i) for (int i = 0; i < nOptions; ++i)
sum += result[i]; sum += result[i];
printf("[binomial ispc, tasks]:\t\t[%.3f] msec (avg %f)\n", printf("[binomial ispc, tasks]:\t\t[%.3f] msec (avg %f)\n",
binomial_tasks, sum / nOptions); binomial_tasks, sum / nOptions);
// //
@@ -112,7 +112,7 @@ int main(int argc, char *argv[]) {
sum += result[i]; sum += result[i];
bs_ispc_tasks = std::min(bs_ispc_tasks, dt); bs_ispc_tasks = std::min(bs_ispc_tasks, dt);
} }
printf("[black-scholes ispc, tasks]:\t[%.3f] msec (avg %f)\n", printf("[black-scholes ispc, tasks]:\t[%.3f] msec (avg %f)\n",
bs_ispc_tasks, sum / nOptions); bs_ispc_tasks, sum / nOptions);

View File

@@ -1,6 +1,6 @@
// -*- mode: c++ -*- // -*- mode: c++ -*-
/* /*
Copyright (c) 2010-2011, Intel Corporation Copyright (c) 2010-2014, Intel Corporation
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
@@ -29,13 +29,13 @@
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ */
#include "options_defs.h" #include "options_defs.h"
#include "cuda_helpers.cuh" #include "cuda_helpers.cuh"
__device__ static inline void __range_reduce_log(float input, float * reduced, __device__ static inline void __range_reduce_log(float input, float * reduced,
int * exponent) { int * exponent) {
int int_version = __float_as_int(input); //intbits(input); int int_version = __float_as_int(input); //intbits(input);
// single precision = SEEE EEEE EMMM MMMM MMMM MMMM MMMM MMMM // single precision = SEEE EEEE EMMM MMMM MMMM MMMM MMMM MMMM
@@ -195,9 +195,9 @@ CND(float X) {
return w; return w;
} }
__global__ __global__
void bs_task( float Sa[], float Xa[], float Ta[], void bs_task( float Sa[], float Xa[], float Ta[],
float ra[], float va[], float ra[], float va[],
float result[], int count) { float result[], int count) {
if (taskIndex >= taskCount) return; if (taskIndex >= taskCount) return;
int first = taskIndex * (count/taskCount); int first = taskIndex * (count/taskCount);
@@ -218,7 +218,7 @@ void bs_task( float Sa[], float Xa[], float Ta[],
extern "C" extern "C"
__global__ void __global__ void
black_scholes_ispc_tasks___export( float Sa[], float Xa[], float Ta[], black_scholes_ispc_tasks___export( float Sa[], float Xa[], float Ta[],
float ra[], float va[], float ra[], float va[],
float result[], int count) { float result[], int count) {
int nTasks = 2048; //count/16384; //max((int)64, (int)count/16384); int nTasks = 2048; //count/16384; //max((int)64, (int)count/16384);
launch(nTasks,1,1,bs_task) launch(nTasks,1,1,bs_task)
@@ -228,7 +228,7 @@ black_scholes_ispc_tasks___export( float Sa[], float Xa[], float Ta[],
extern "C" extern "C"
__host__ void __host__ void
black_scholes_ispc_tasks( float Sa[], float Xa[], float Ta[], black_scholes_ispc_tasks( float Sa[], float Xa[], float Ta[],
float ra[], float va[], float ra[], float va[],
float result[], int count) { float result[], int count) {
black_scholes_ispc_tasks___export<<<1,32>>>(Sa,Xa,Ta,ra,va,result,count); black_scholes_ispc_tasks___export<<<1,32>>>(Sa,Xa,Ta,ra,va,result,count);
cudaDeviceSynchronize(); cudaDeviceSynchronize();
@@ -243,8 +243,8 @@ struct loop
__device__ static void op1(float V[], const float u, const float X, const float S) __device__ static void op1(float V[], const float u, const float X, const float S)
{ {
const int j = NBEG; const int j = NBEG;
float upow = powf(u, (float)(2*j-BINOMIAL_NUM)); float upow = powf(u, (float)(2*j-BINOMIAL_NUM));
V[j] = max(0.0f, X - S * upow); V[j] = max(0.0f, X - S * upow);
loop<j+STEP,NEND,STEP>::op1(V,u,X,S); loop<j+STEP,NEND,STEP>::op1(V,u,X,S);
} }
__device__ static void op2(float V[], const float Pu, const float disc) __device__ static void op2(float V[], const float Pu, const float disc)
@@ -257,9 +257,9 @@ struct loop
} }
}; };
template<int NEND, int STEP> template<int NEND, int STEP>
struct loop<NEND,NEND,STEP> struct loop<NEND,NEND,STEP>
{ {
__device__ static void op1(float V[], const float u, const float X, const float S) {} __device__ static void op1(float V[], const float u, const float X, const float S) {}
__device__ static void op2(float V[], const float Pu, const float disc) {} __device__ static void op2(float V[], const float Pu, const float disc) {}
}; };
@@ -295,10 +295,10 @@ binomial_put(float S, float X, float T, float r, float v)
__global__ void __global__ void
binomial_task( float Sa[], float Xa[], binomial_task( float Sa[], float Xa[],
float Ta[], float ra[], float Ta[], float ra[],
float va[], float result[], float va[], float result[],
int count) int count)
{ {
int first = taskIndex * (count/taskCount); int first = taskIndex * (count/taskCount);
int last = min(count, (int)((taskIndex+1) * (count/taskCount))); int last = min(count, (int)((taskIndex+1) * (count/taskCount)));
@@ -313,9 +313,9 @@ binomial_task( float Sa[], float Xa[],
extern "C" __global__ void extern "C" __global__ void
binomial_put_ispc_tasks___export( float Sa[], float Xa[], binomial_put_ispc_tasks___export( float Sa[], float Xa[],
float Ta[], float ra[], float Ta[], float ra[],
float va[], float result[], float va[], float result[],
int count) { int count) {
int nTasks = 2048; //count/16384; //max((int)64, (int)count/16384); int nTasks = 2048; //count/16384; //max((int)64, (int)count/16384);
launch(nTasks,1,1,binomial_task) launch(nTasks,1,1,binomial_task)
@@ -325,7 +325,7 @@ binomial_put_ispc_tasks___export( float Sa[], float Xa[],
extern "C" extern "C"
__host__ void __host__ void
binomial_put_ispc_tasks( float Sa[], float Xa[], float Ta[], binomial_put_ispc_tasks( float Sa[], float Xa[], float Ta[],
float ra[], float va[], float ra[], float va[],
float result[], int count) { float result[], int count) {
cudaDeviceSetCacheConfig (cudaFuncCachePreferL1); cudaDeviceSetCacheConfig (cudaFuncCachePreferL1);

View File

@@ -1,6 +1,6 @@
// -*- mode: c++ -*- // -*- mode: c++ -*-
/* /*
Copyright (c) 2010-2011, Intel Corporation Copyright (c) 2010-2014, Intel Corporation
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
@@ -29,7 +29,7 @@
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ */
#include "options_defs.h" #include "options_defs.h"
@@ -57,7 +57,7 @@ CND(float X) {
task void task void
bs_task(uniform float Sa[], uniform float Xa[], uniform float Ta[], bs_task(uniform float Sa[], uniform float Xa[], uniform float Ta[],
uniform float ra[], uniform float va[], uniform float ra[], uniform float va[],
uniform float result[], uniform int count) { uniform float result[], uniform int count) {
uniform int first = taskIndex * (count/taskCount); uniform int first = taskIndex * (count/taskCount);
uniform int last = min(count, (int)((taskIndex+1) * (count/taskCount))); uniform int last = min(count, (int)((taskIndex+1) * (count/taskCount)));
@@ -74,7 +74,7 @@ bs_task(uniform float Sa[], uniform float Xa[], uniform float Ta[],
export void export void
black_scholes_ispc_tasks(uniform float Sa[], uniform float Xa[], uniform float Ta[], black_scholes_ispc_tasks(uniform float Sa[], uniform float Xa[], uniform float Ta[],
uniform float ra[], uniform float va[], uniform float ra[], uniform float va[],
uniform float result[], uniform int count) { uniform float result[], uniform int count) {
uniform int nTasks = 2048; //count/16384; //max((int)64, (int)count/16384); uniform int nTasks = 2048; //count/16384; //max((int)64, (int)count/16384);
launch[nTasks] bs_task(Sa, Xa, Ta, ra, va, result, count); launch[nTasks] bs_task(Sa, Xa, Ta, ra, va, result, count);
@@ -85,7 +85,7 @@ black_scholes_ispc_tasks(uniform float Sa[], uniform float Xa[], uniform float T
export void export void
black_scholes_ispc(uniform float Sa[], uniform float Xa[], uniform float Ta[], black_scholes_ispc(uniform float Sa[], uniform float Xa[], uniform float Ta[],
uniform float ra[], uniform float va[], uniform float ra[], uniform float va[],
uniform float result[], uniform int count) { uniform float result[], uniform int count) {
foreach (i = 0 ... count) { foreach (i = 0 ... count) {
float S = Sa[i], X = Xa[i], T = Ta[i], r = ra[i], v = va[i]; float S = Sa[i], X = Xa[i], T = Ta[i], r = ra[i], v = va[i];
@@ -135,7 +135,7 @@ binomial_put(float S, float X, float T, float r, float v) {
V[j] = max(0., X - S * upow); } V[j] = max(0., X - S * upow); }
#define OP10(k) \ #define OP10(k) \
OP(k+0); OP(k+1); OP(k+2); OP(k+3); OP(k+4) \ OP(k+0); OP(k+1); OP(k+2); OP(k+3); OP(k+4) \
OP(k+5); OP(k+6); OP(k+7); OP(k+8); OP(k+9); OP(k+5); OP(k+6); OP(k+7); OP(k+8); OP(k+9);
OP10(0) OP10(0)
OP10(10) OP10(10)
OP10(20) OP10(20)
@@ -176,8 +176,8 @@ binomial_put(float S, float X, float T, float r, float v) {
export void export void
binomial_put_ispc(uniform float Sa[], uniform float Xa[], uniform float Ta[], binomial_put_ispc(uniform float Sa[], uniform float Xa[], uniform float Ta[],
uniform float ra[], uniform float va[], uniform float ra[], uniform float va[],
uniform float result[], uniform int count) { uniform float result[], uniform int count) {
foreach (i = 0 ... count) { foreach (i = 0 ... count) {
float S = Sa[i], X = Xa[i], T = Ta[i], r = ra[i], v = va[i]; float S = Sa[i], X = Xa[i], T = Ta[i], r = ra[i], v = va[i];
@@ -187,9 +187,9 @@ binomial_put_ispc(uniform float Sa[], uniform float Xa[], uniform float Ta[],
task void task void
binomial_task(uniform float Sa[], uniform float Xa[], binomial_task(uniform float Sa[], uniform float Xa[],
uniform float Ta[], uniform float ra[], uniform float Ta[], uniform float ra[],
uniform float va[], uniform float result[], uniform float va[], uniform float result[],
uniform int count) { uniform int count) {
uniform int first = taskIndex * (count/taskCount); uniform int first = taskIndex * (count/taskCount);
uniform int last = min(count, (int)((taskIndex+1) * (count/taskCount))); uniform int last = min(count, (int)((taskIndex+1) * (count/taskCount)));
@@ -202,9 +202,9 @@ binomial_task(uniform float Sa[], uniform float Xa[],
export void export void
binomial_put_ispc_tasks(uniform float Sa[], uniform float Xa[], binomial_put_ispc_tasks(uniform float Sa[], uniform float Xa[],
uniform float Ta[], uniform float ra[], uniform float Ta[], uniform float ra[],
uniform float va[], uniform float result[], uniform float va[], uniform float result[],
uniform int count) { uniform int count) {
uniform int nTasks = 2048; //count/16384; //max((int)64, (int)count/16384); uniform int nTasks = 2048; //count/16384; //max((int)64, (int)count/16384);
launch[nTasks] binomial_task(Sa, Xa, Ta, ra, va, result, count); launch[nTasks] binomial_task(Sa, Xa, Ta, ra, va, result, count);

View File

@@ -1,5 +1,5 @@
/* /*
Copyright (c) 2010-2011, Intel Corporation Copyright (c) 2010-2014, Intel Corporation
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
@@ -28,7 +28,7 @@
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ */
#ifndef OPTIONS_DEFS_H #ifndef OPTIONS_DEFS_H

View File

@@ -42,7 +42,7 @@ int main (int argc, char *argv[])
Key *keys = new Key [n]; Key *keys = new Key [n];
Key *keys_orig = new Key [n]; Key *keys_orig = new Key [n];
unsigned int *keys_gold = new unsigned int [n]; unsigned int *keys_gold = new unsigned int [n];
srand48(rtc()*65536); srand48(rtc()*65536);
int sortBits = 32; int sortBits = 32;
@@ -63,7 +63,7 @@ int main (int argc, char *argv[])
keys_gold[i] = keys[i].key; keys_gold[i] = keys[i].key;
keys_orig[i] = keys[i]; keys_orig[i] = keys[i];
} }
ispcSetMallocHeapLimit(1024*1024*1024); ispcSetMallocHeapLimit(1024*1024*1024);
ispc::radixSort_alloc(n); ispc::radixSort_alloc(n);

View File

@@ -9,7 +9,7 @@ typedef long long Key;
__forceinline__ __device__ int atomic_add_global(int* ptr, int value) __forceinline__ __device__ int atomic_add_global(int* ptr, int value)
{ {
return atomicAdd(ptr, value); return atomicAdd(ptr, value);
} }
static __device__ __forceinline__ int shfl_scan_add_step(int partial, int up_offset) static __device__ __forceinline__ int shfl_scan_add_step(int partial, int up_offset)
{ {
@@ -92,7 +92,7 @@ void sortPass(
const int mask = (1 << NUMBITS) - 1; const int mask = (1 << NUMBITS) - 1;
/* copy digit offset from Gmem to Lmem */ /* copy digit offset from Gmem to Lmem */
#if 1 #if 1
__shared__ int digitOffsets_sh[NUMDIGITS*4]; __shared__ int digitOffsets_sh[NUMDIGITS*4];
volatile int *digitOffsets = digitOffsets_sh + warpIdx*NUMDIGITS; volatile int *digitOffsets = digitOffsets_sh + warpIdx*NUMDIGITS;
@@ -191,7 +191,7 @@ void completeScanGlobal(
} }
} }
__device__ static __device__ static
inline void radixExclusiveScan( inline void radixExclusiveScan(
const int numBlocks, const int numBlocks,
int excScanPtr[], int excScanPtr[],
@@ -242,11 +242,11 @@ void radixSort_alloc___export(const int n)
nPrefixSum = NUMDIGITS*numBlocks; nPrefixSum = NUMDIGITS*numBlocks;
const int nalloc = const int nalloc =
nSharedCounts + nSharedCounts +
nCountsGlobal + nCountsGlobal +
nExcScan + nExcScan +
nCountsBlock + nCountsBlock +
nPartialSum + nPartialSum +
nPrefixSum; nPrefixSum;
@@ -261,7 +261,7 @@ void radixSort_alloc___export(const int n)
prefixSum = partialSum + nPartialSum; prefixSum = partialSum + nPartialSum;
} }
extern "C" extern "C"
void radixSort_alloc(const int n) void radixSort_alloc(const int n)
{ {
radixSort_alloc___export<<<1,32>>>(n); radixSort_alloc___export<<<1,32>>>(n);
@@ -269,7 +269,7 @@ void radixSort_alloc(const int n)
} }
__device__ static __device__ static
void radixSort_freeBufKeys() void radixSort_freeBufKeys()
{ {
if (numElementsBuf > 0) if (numElementsBuf > 0)
@@ -344,9 +344,9 @@ __global__ void radixSort___export(
/* sorting */ /* sorting */
launch (numBlocks,1,1, launch (numBlocks,1,1,
sortPass)( sortPass)(
bufKeys, bufKeys,
keys, keys,
bit, bit,
numElements, numElements,
excScan); excScan);
sync; sync;

View File

@@ -63,7 +63,7 @@ void sortPass(
const uniform int mask = (1 << NUMBITS) - 1; const uniform int mask = (1 << NUMBITS) - 1;
/* copy digit offset from Gmem to Lmem */ /* copy digit offset from Gmem to Lmem */
#if 1 #if 1
uniform int digitOffsets[NUMDIGITS]; uniform int digitOffsets[NUMDIGITS];
foreach (digit = 0 ... NUMDIGITS) foreach (digit = 0 ... NUMDIGITS)
@@ -95,7 +95,7 @@ void partialScanLocal(
const uniform int blockDim = (numBlocks+taskCount-1)/taskCount; const uniform int blockDim = (numBlocks+taskCount-1)/taskCount;
const uniform int bbeg = blockIdx * blockDim; const uniform int bbeg = blockIdx * blockDim;
const uniform int bend = min(bbeg + blockDim, numBlocks); const uniform int bend = min(bbeg + blockDim, numBlocks);
uniform int (* uniform countsBlock)[NUMDIGITS] = (uniform int (*)[NUMDIGITS])countsAll; uniform int (* uniform countsBlock)[NUMDIGITS] = (uniform int (*)[NUMDIGITS])countsAll;
uniform int (* uniform excScanBlock)[NUMDIGITS] = (uniform int (*)[NUMDIGITS])excScanAll; uniform int (* uniform excScanBlock)[NUMDIGITS] = (uniform int (*)[NUMDIGITS])excScanAll;
uniform int (* uniform partialSum)[NUMDIGITS] = (uniform int (*)[NUMDIGITS])partialSumAll; uniform int (* uniform partialSum)[NUMDIGITS] = (uniform int (*)[NUMDIGITS])partialSumAll;
@@ -142,7 +142,7 @@ void completeScanGlobal(
const uniform int blockDim = (numBlocks+taskCount-1)/taskCount; const uniform int blockDim = (numBlocks+taskCount-1)/taskCount;
const uniform int bbeg = blockIdx * blockDim; const uniform int bbeg = blockIdx * blockDim;
const uniform int bend = min(bbeg + blockDim, numBlocks); const uniform int bend = min(bbeg + blockDim, numBlocks);
uniform int (* uniform excScanBlock)[NUMDIGITS] = (uniform int (*)[NUMDIGITS])excScanAll; uniform int (* uniform excScanBlock)[NUMDIGITS] = (uniform int (*)[NUMDIGITS])excScanAll;
uniform int (* uniform carryValue)[NUMDIGITS] = (uniform int (*)[NUMDIGITS])carryValueAll; uniform int (* uniform carryValue)[NUMDIGITS] = (uniform int (*)[NUMDIGITS])carryValueAll;
@@ -154,7 +154,7 @@ void completeScanGlobal(
} }
} }
static static
inline void radixExclusiveScan( inline void radixExclusiveScan(
const uniform int numBlocks, const uniform int numBlocks,
uniform int excScanPtr[], uniform int excScanPtr[],
@@ -207,11 +207,11 @@ export void radixSort_alloc(const uniform int n)
nPrefixSum = NUMDIGITS*numBlocks; nPrefixSum = NUMDIGITS*numBlocks;
const uniform int nalloc = const uniform int nalloc =
nSharedCounts + nSharedCounts +
nCountsGlobal + nCountsGlobal +
nExcScan + nExcScan +
nCountsBlock + nCountsBlock +
nPartialSum + nPartialSum +
nPrefixSum; nPrefixSum;
@@ -225,7 +225,7 @@ export void radixSort_alloc(const uniform int n)
prefixSum = partialSum + nPartialSum; prefixSum = partialSum + nPartialSum;
} }
static static
void radixSort_freeBufKeys() void radixSort_freeBufKeys()
{ {
if (numElementsBuf > 0) if (numElementsBuf > 0)
@@ -283,16 +283,16 @@ export void radixSort(
excScan[digit] = scan + carry; excScan[digit] = scan + carry;
carry += broadcast(scan+value, programCount-1); carry += broadcast(scan+value, programCount-1);
} }
/* computing offsets for each digit */ /* computing offsets for each digit */
radixExclusiveScan(numBlocks, excScan, counts, partialSum, prefixSum); radixExclusiveScan(numBlocks, excScan, counts, partialSum, prefixSum);
/* sorting */ /* sorting */
launch [numBlocks] launch [numBlocks]
sortPass( sortPass(
bufKeys, bufKeys,
keys, keys,
bit, bit,
numElements, numElements,
excScan); excScan);
sync; sync;

View File

@@ -1,5 +1,5 @@
/* /*
Copyright (c) 2010-2011, Intel Corporation Copyright (c) 2010-2014, Intel Corporation
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
@@ -28,7 +28,7 @@
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ */
#ifdef _MSC_VER #ifdef _MSC_VER
@@ -83,7 +83,7 @@ static void writeImage(int *idImage, float *depthImage, int width, int height,
fputc(g, f); fputc(g, f);
fputc(b, f); fputc(b, f);
} }
} }
fclose(f); fclose(f);
printf("Wrote image file %s\n", filename); printf("Wrote image file %s\n", filename);
} }
@@ -116,7 +116,7 @@ int main(int argc, char *argv[]) {
if (fread(&(var), sizeof(var), n, f) != (unsigned int)n) { \ if (fread(&(var), sizeof(var), n, f) != (unsigned int)n) { \
fprintf(stderr, "Unexpected EOF reading scene file\n"); \ fprintf(stderr, "Unexpected EOF reading scene file\n"); \
return 1; \ return 1; \
} else /* eat ; */ } else /* eat ; */
// //
// Read the camera specification information from the camera file // Read the camera specification information from the camera file
@@ -145,7 +145,7 @@ int main(int argc, char *argv[]) {
READ(raster2camera[0][0], 16); READ(raster2camera[0][0], 16);
// //
// Read in the serialized BVH // Read in the serialized BVH
// //
sprintf(fnbuf, "%s.bvh", filename); sprintf(fnbuf, "%s.bvh", filename);
f = fopen(fnbuf, "rb"); f = fopen(fnbuf, "rb");
@@ -178,7 +178,7 @@ int main(int argc, char *argv[]) {
READ(nodes[i].pad, 1); READ(nodes[i].pad, 1);
} }
// And then read the triangles // And then read the triangles
uint nTris; uint nTris;
READ(nTris, 1); READ(nTris, 1);
Triangle *triangles = new Triangle[nTris]; Triangle *triangles = new Triangle[nTris];
@@ -204,7 +204,7 @@ int main(int argc, char *argv[]) {
// the first interseciton // the first interseciton
int *id = new int[width*height]; int *id = new int[width*height];
float *image = new float[width*height]; float *image = new float[width*height];
ispc_memset(id, 0, width*height*sizeof(int)); ispc_memset(id, 0, width*height*sizeof(int));
ispc_memset(image, 0, width*height*sizeof(float)); ispc_memset(image, 0, width*height*sizeof(float));
@@ -220,7 +220,7 @@ int main(int argc, char *argv[]) {
printf("@time of ISPC + TASKS run:\t\t\t[%.3f] msec\n", dt); printf("@time of ISPC + TASKS run:\t\t\t[%.3f] msec\n", dt);
minTimeISPCtasks = std::min(dt, minTimeISPCtasks); minTimeISPCtasks = std::min(dt, minTimeISPCtasks);
} }
printf("[rt ispc + tasks]:\t\t[%.3f] msec for %d x %d image\n", printf("[rt ispc + tasks]:\t\t[%.3f] msec for %d x %d image\n",
minTimeISPCtasks, width, height); minTimeISPCtasks, width, height);
writeImage(id, image, width, height, "rt-ispc-tasks.ppm"); writeImage(id, image, width, height, "rt-ispc-tasks.ppm");

View File

@@ -96,7 +96,7 @@ static inline float Dot(const float3 a, const float3 b) {
__device__ __device__
inline inline
static void generateRay( const float raster2camera[4][4], static void generateRay( const float raster2camera[4][4],
const float camera2world[4][4], const float camera2world[4][4],
float x, float y, Ray &ray) { float x, float y, Ray &ray) {
ray.mint = 0.f; ray.mint = 0.f;
@@ -113,11 +113,11 @@ static void generateRay( const float raster2camera[4][4],
camy /= camw; camy /= camw;
camz /= camw; camz /= camw;
ray.dir.x = camera2world[0][0] * camx + camera2world[0][1] * camy + ray.dir.x = camera2world[0][0] * camx + camera2world[0][1] * camy +
camera2world[0][2] * camz; camera2world[0][2] * camz;
ray.dir.y = camera2world[1][0] * camx + camera2world[1][1] * camy + ray.dir.y = camera2world[1][0] * camx + camera2world[1][1] * camy +
camera2world[1][2] * camz; camera2world[1][2] * camz;
ray.dir.z = camera2world[2][0] * camx + camera2world[2][1] * camy + ray.dir.z = camera2world[2][0] * camx + camera2world[2][1] * camy +
camera2world[2][2] * camz; camera2world[2][2] * camz;
ray.origin.x = camera2world[0][3] / camera2world[3][3]; ray.origin.x = camera2world[0][3] / camera2world[3][3];
@@ -139,7 +139,7 @@ static void generateRay( const float raster2camera[4][4],
__device__ __device__
inline inline
static bool BBoxIntersect(const float bounds[2][3], static bool BBoxIntersect(const float bounds[2][3],
const Ray &ray) { const Ray &ray) {
float3 bounds0 = { bounds[0][0], bounds[0][1], bounds[0][2] }; float3 bounds0 = { bounds[0][0], bounds[0][1], bounds[0][2] };
float3 bounds1 = { bounds[1][0], bounds[1][1], bounds[1][2] }; float3 bounds1 = { bounds[1][0], bounds[1][1], bounds[1][2] };
@@ -172,7 +172,7 @@ static bool BBoxIntersect(const float bounds[2][3],
} }
t0 = max(tNear.z, t0); t0 = max(tNear.z, t0);
t1 = min(tFar.z, t1); t1 = min(tFar.z, t1);
return (t0 <= t1); return (t0 <= t1);
} }
@@ -220,7 +220,7 @@ static bool TriIntersect(const Triangle &tri, Ray &ray) {
__device__ __device__
inline inline
bool BVHIntersect(const LinearBVHNode nodes[], bool BVHIntersect(const LinearBVHNode nodes[],
const Triangle tris[], Ray &r, const Triangle tris[], Ray &r,
int todo[]) { int todo[]) {
Ray ray = r; Ray ray = r;
@@ -240,7 +240,7 @@ bool BVHIntersect(const LinearBVHNode nodes[],
if (TriIntersect(tris[primitivesOffset+i], ray)) if (TriIntersect(tris[primitivesOffset+i], ray))
hit = true; hit = true;
} }
if (todoOffset == 0) if (todoOffset == 0)
break; break;
nodeNum = todo[--todoOffset]; nodeNum = todo[--todoOffset];
} }
@@ -275,10 +275,10 @@ bool BVHIntersect(const LinearBVHNode nodes[],
__device__ __device__
inline inline
static void raytrace_tile( int x0, int x1, static void raytrace_tile( int x0, int x1,
int y0, int y1, int y0, int y1,
int width, int height, int width, int height,
int baseWidth, int baseHeight, int baseWidth, int baseHeight,
const float raster2camera[4][4], const float raster2camera[4][4],
const float camera2world[4][4], const float camera2world[4][4],
float image[], int id[], float image[], int id[],
const LinearBVHNode nodes[], const LinearBVHNode nodes[],
@@ -317,7 +317,7 @@ static void raytrace_tile( int x0, int x1,
__global__ __global__
void raytrace_tile_task( int width, int height, void raytrace_tile_task( int width, int height,
int baseWidth, int baseHeight, int baseWidth, int baseHeight,
const float raster2camera[4][4], const float raster2camera[4][4],
const float camera2world[4][4], const float camera2world[4][4],
float image[], int id[], float image[], int id[],
const LinearBVHNode nodes[], const LinearBVHNode nodes[],
@@ -328,8 +328,8 @@ void raytrace_tile_task( int width, int height,
int x1 = min(x0 + dx, width); int x1 = min(x0 + dx, width);
int y0 = (taskIndex / xBuckets) * dy; int y0 = (taskIndex / xBuckets) * dy;
int y1 = min(y0 + dy, height); int y1 = min(y0 + dy, height);
raytrace_tile(x0, x1, y0, y1, width, height, baseWidth, baseHeight, raytrace_tile(x0, x1, y0, y1, width, height, baseWidth, baseHeight,
raster2camera, camera2world, image, raster2camera, camera2world, image,
id, nodes, triangles); id, nodes, triangles);
} }
@@ -337,7 +337,7 @@ void raytrace_tile_task( int width, int height,
extern "C" __global__ void raytrace_ispc_tasks___export( int width, int height, extern "C" __global__ void raytrace_ispc_tasks___export( int width, int height,
int baseWidth, int baseHeight, int baseWidth, int baseHeight,
const float raster2camera[4][4], const float raster2camera[4][4],
const float camera2world[4][4], const float camera2world[4][4],
float image[], int id[], float image[], int id[],
const LinearBVHNode nodes[], const LinearBVHNode nodes[],
@@ -347,8 +347,8 @@ extern "C" __global__ void raytrace_ispc_tasks___export( int width, int height,
int yBuckets = (height + (dy-1)) / dy; int yBuckets = (height + (dy-1)) / dy;
int nTasks = xBuckets * yBuckets; int nTasks = xBuckets * yBuckets;
launch(nTasks,1,1,raytrace_tile_task) launch(nTasks,1,1,raytrace_tile_task)
(width, height, baseWidth, baseHeight, (width, height, baseWidth, baseHeight,
raster2camera, camera2world, raster2camera, camera2world,
image, id, nodes, triangles); image, id, nodes, triangles);
cudaDeviceSynchronize(); cudaDeviceSynchronize();
} }
@@ -357,7 +357,7 @@ extern "C" __global__ void raytrace_ispc_tasks___export( int width, int height,
extern "C" __host__ void raytrace_ispc_tasks( int width, int height, extern "C" __host__ void raytrace_ispc_tasks( int width, int height,
int baseWidth, int baseHeight, int baseWidth, int baseHeight,
const float raster2camera[4][4], const float raster2camera[4][4],
const float camera2world[4][4], const float camera2world[4][4],
float image[], int id[], float image[], int id[],
const LinearBVHNode nodes[], const LinearBVHNode nodes[],

View File

@@ -1,5 +1,5 @@
/* /*
Copyright (c) 2010-2011, Intel Corporation Copyright (c) 2010-2014, Intel Corporation
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
@@ -28,7 +28,7 @@
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ */
#if 1 #if 1
@@ -90,7 +90,7 @@ static inline float Dot(const float3 a, const float3 b) {
#if 1 #if 1
inline inline
#endif #endif
static void generateRay(uniform const float raster2camera[4][4], static void generateRay(uniform const float raster2camera[4][4],
uniform const float camera2world[4][4], uniform const float camera2world[4][4],
float x, float y, Ray &ray) { float x, float y, Ray &ray) {
ray.mint = 0.f; ray.mint = 0.f;
@@ -107,11 +107,11 @@ static void generateRay(uniform const float raster2camera[4][4],
camy /= camw; camy /= camw;
camz /= camw; camz /= camw;
ray.dir.x = camera2world[0][0] * camx + camera2world[0][1] * camy + ray.dir.x = camera2world[0][0] * camx + camera2world[0][1] * camy +
camera2world[0][2] * camz; camera2world[0][2] * camz;
ray.dir.y = camera2world[1][0] * camx + camera2world[1][1] * camy + ray.dir.y = camera2world[1][0] * camx + camera2world[1][1] * camy +
camera2world[1][2] * camz; camera2world[1][2] * camz;
ray.dir.z = camera2world[2][0] * camx + camera2world[2][1] * camy + ray.dir.z = camera2world[2][0] * camx + camera2world[2][1] * camy +
camera2world[2][2] * camz; camera2world[2][2] * camz;
ray.origin.x = camera2world[0][3] / camera2world[3][3]; ray.origin.x = camera2world[0][3] / camera2world[3][3];
@@ -129,7 +129,7 @@ static void generateRay(uniform const float raster2camera[4][4],
#if 1 #if 1
inline inline
#endif #endif
static bool_t BBoxIntersect(const uniform float bounds[2][3], static bool_t BBoxIntersect(const uniform float bounds[2][3],
const Ray &ray) { const Ray &ray) {
const uniform float3 bounds0 = { bounds[0][0], bounds[0][1], bounds[0][2] }; const uniform float3 bounds0 = { bounds[0][0], bounds[0][1], bounds[0][2] };
const uniform float3 bounds1 = { bounds[1][0], bounds[1][1], bounds[1][2] }; const uniform float3 bounds1 = { bounds[1][0], bounds[1][1], bounds[1][2] };
@@ -162,7 +162,7 @@ static bool_t BBoxIntersect(const uniform float bounds[2][3],
} }
t0 = max(tNear.z, t0); t0 = max(tNear.z, t0);
t1 = min(tFar.z, t1); t1 = min(tFar.z, t1);
return (t0 <= t1); return (t0 <= t1);
} }
@@ -215,7 +215,7 @@ static bool_t TriIntersect(const uniform_t Triangle tri, Ray &ray) {
inline inline
#endif #endif
bool_t bool_t
BVHIntersect(const uniform LinearBVHNode nodes[], BVHIntersect(const uniform LinearBVHNode nodes[],
const uniform Triangle tris[], Ray &r) { const uniform Triangle tris[], Ray &r) {
Ray ray = r; Ray ray = r;
bool_t hit = false; bool_t hit = false;
@@ -235,7 +235,7 @@ BVHIntersect(const uniform LinearBVHNode nodes[],
if (TriIntersect(tris[primitivesOffset+i], ray)) if (TriIntersect(tris[primitivesOffset+i], ray))
hit = true; hit = true;
} }
if (todoOffset == 0) if (todoOffset == 0)
break; break;
nodeNum = todo[--todoOffset]; nodeNum = todo[--todoOffset];
} }
@@ -276,10 +276,10 @@ BVHIntersect(const uniform LinearBVHNode nodes[],
inline inline
#endif #endif
static void raytrace_tile(uniform int x0, uniform int x1, static void raytrace_tile(uniform int x0, uniform int x1,
uniform int y0, uniform int y1, uniform int y0, uniform int y1,
uniform int width, uniform int height, uniform int width, uniform int height,
uniform int baseWidth, uniform int baseHeight, uniform int baseWidth, uniform int baseHeight,
const uniform float raster2camera[4][4], const uniform float raster2camera[4][4],
const uniform float camera2world[4][4], const uniform float camera2world[4][4],
uniform float image[], uniform int id[], uniform float image[], uniform int id[],
const uniform LinearBVHNode nodes[], const uniform LinearBVHNode nodes[],
@@ -302,7 +302,7 @@ static void raytrace_tile(uniform int x0, uniform int x1,
export void raytrace_ispc(uniform int width, uniform int height, export void raytrace_ispc(uniform int width, uniform int height,
uniform int baseWidth, uniform int baseHeight, uniform int baseWidth, uniform int baseHeight,
const uniform float raster2camera[4][4], const uniform float raster2camera[4][4],
const uniform float camera2world[4][4], const uniform float camera2world[4][4],
uniform float image[], uniform int id[], uniform float image[], uniform int id[],
const uniform LinearBVHNode nodes[], const uniform LinearBVHNode nodes[],
@@ -315,7 +315,7 @@ export void raytrace_ispc(uniform int width, uniform int height,
task void raytrace_tile_task(uniform int width, uniform int height, task void raytrace_tile_task(uniform int width, uniform int height,
uniform int baseWidth, uniform int baseHeight, uniform int baseWidth, uniform int baseHeight,
const uniform float raster2camera[4][4], const uniform float raster2camera[4][4],
const uniform float camera2world[4][4], const uniform float camera2world[4][4],
uniform float image[], uniform int id[], uniform float image[], uniform int id[],
const uniform LinearBVHNode nodes[], const uniform LinearBVHNode nodes[],
@@ -326,8 +326,8 @@ task void raytrace_tile_task(uniform int width, uniform int height,
const uniform int x1 = min(x0 + dx, width); const uniform int x1 = min(x0 + dx, width);
const uniform int y0 = (taskIndex / xBuckets) * dy; const uniform int y0 = (taskIndex / xBuckets) * dy;
const uniform int y1 = min(y0 + dy, height); const uniform int y1 = min(y0 + dy, height);
raytrace_tile(x0, x1, y0, y1, width, height, baseWidth, baseHeight, raytrace_tile(x0, x1, y0, y1, width, height, baseWidth, baseHeight,
raster2camera, camera2world, image, raster2camera, camera2world, image,
id, nodes, triangles); id, nodes, triangles);
} }
@@ -335,7 +335,7 @@ task void raytrace_tile_task(uniform int width, uniform int height,
export void raytrace_ispc_tasks(uniform int width, uniform int height, export void raytrace_ispc_tasks(uniform int width, uniform int height,
uniform int baseWidth, uniform int baseHeight, uniform int baseWidth, uniform int baseHeight,
const uniform float raster2camera[4][4], const uniform float raster2camera[4][4],
const uniform float camera2world[4][4], const uniform float camera2world[4][4],
uniform float image[], uniform int id[], uniform float image[], uniform int id[],
const uniform LinearBVHNode nodes[], const uniform LinearBVHNode nodes[],
@@ -344,8 +344,8 @@ export void raytrace_ispc_tasks(uniform int width, uniform int height,
const uniform int xBuckets = (width + (dx-1)) / dx; const uniform int xBuckets = (width + (dx-1)) / dx;
const uniform int yBuckets = (height + (dy-1)) / dy; const uniform int yBuckets = (height + (dy-1)) / dy;
const uniform int nTasks = xBuckets * yBuckets; const uniform int nTasks = xBuckets * yBuckets;
launch[nTasks] raytrace_tile_task(width, height, baseWidth, baseHeight, launch[nTasks] raytrace_tile_task(width, height, baseWidth, baseHeight,
raster2camera, camera2world, raster2camera, camera2world,
image, id, nodes, triangles); image, id, nodes, triangles);
} }

View File

@@ -1,5 +1,5 @@
/* /*
Copyright (c) 2011, Intel Corporation Copyright (c) 2011-2014, Intel Corporation
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
@@ -28,7 +28,7 @@
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ */
#ifdef _MSC_VER #ifdef _MSC_VER

View File

@@ -1,5 +1,5 @@
/* /*
Copyright (c) 2011, Intel Corporation Copyright (c) 2011-2014, Intel Corporation
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
@@ -28,11 +28,11 @@
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ */
#include "cuda_helpers.cuh" #include "cuda_helpers.cuh"
__device__ static inline float clamp(float v, float low, float high) __device__ static inline float clamp(float v, float low, float high)
{ {
return min(max(v, low), high); return min(max(v, low), high);
} }
@@ -90,7 +90,7 @@ struct Ray {
__device__ static void __device__ static void
generateRay(const float raster2camera[4][4], generateRay(const float raster2camera[4][4],
const float camera2world[4][4], const float camera2world[4][4],
float x, float y, Ray &ray) { float x, float y, Ray &ray) {
// transform raster coordinate (x, y, 0) to camera space // transform raster coordinate (x, y, 0) to camera space
@@ -149,7 +149,7 @@ IntersectP(Ray ray, float3 pMin, float3 pMax, float &hit0, float &hit1) {
} }
t0 = max(tNear.z, t0); t0 = max(tNear.z, t0);
t1 = min(tFar.z, t1); t1 = min(tFar.z, t1);
if (t0 <= t1) { if (t0 <= t1) {
hit0 = t0; hit0 = t0;
hit1 = t1; hit1 = t1;
@@ -165,7 +165,7 @@ __device__ static inline float Lerp(float t, float a, float b) {
} }
__device__ static inline float D(int x, int y, int z, int nVoxels[3], __device__ static inline float D(int x, int y, int z, int nVoxels[3],
float density[]) { float density[]) {
x = clamp(x, 0, nVoxels[0]-1); x = clamp(x, 0, nVoxels[0]-1);
y = clamp(y, 0, nVoxels[1]-1); y = clamp(y, 0, nVoxels[1]-1);
@@ -180,9 +180,9 @@ __device__ static inline float3 Offset(float3 p, float3 pMin, float3 pMax) {
} }
__device__ static inline float Density(float3 Pobj, float3 pMin, float3 pMax, __device__ static inline float Density(float3 Pobj, float3 pMin, float3 pMax,
float density[], int nVoxels[3]) { float density[], int nVoxels[3]) {
if (!Inside(Pobj, pMin, pMax)) if (!Inside(Pobj, pMin, pMax))
return 0; return 0;
// Compute voxel coordinates and offsets for _Pobj_ // Compute voxel coordinates and offsets for _Pobj_
float3 vox = Offset(Pobj, pMin, pMax); float3 vox = Offset(Pobj, pMin, pMax);
@@ -193,13 +193,13 @@ __device__ static inline float Density(float3 Pobj, float3 pMin, float3 pMax,
float dx = vox.x - vx, dy = vox.y - vy, dz = vox.z - vz; float dx = vox.x - vx, dy = vox.y - vy, dz = vox.z - vz;
// Trilinearly interpolate density values to compute local density // Trilinearly interpolate density values to compute local density
float d00 = Lerp(dx, D(vx, vy, vz, nVoxels, density), float d00 = Lerp(dx, D(vx, vy, vz, nVoxels, density),
D(vx+1, vy, vz, nVoxels, density)); D(vx+1, vy, vz, nVoxels, density));
float d10 = Lerp(dx, D(vx, vy+1, vz, nVoxels, density), float d10 = Lerp(dx, D(vx, vy+1, vz, nVoxels, density),
D(vx+1, vy+1, vz, nVoxels, density)); D(vx+1, vy+1, vz, nVoxels, density));
float d01 = Lerp(dx, D(vx, vy, vz+1, nVoxels, density), float d01 = Lerp(dx, D(vx, vy, vz+1, nVoxels, density),
D(vx+1, vy, vz+1, nVoxels, density)); D(vx+1, vy, vz+1, nVoxels, density));
float d11 = Lerp(dx, D(vx, vy+1, vz+1, nVoxels, density), float d11 = Lerp(dx, D(vx, vy+1, vz+1, nVoxels, density),
D(vx+1, vy+1, vz+1, nVoxels, density)); D(vx+1, vy+1, vz+1, nVoxels, density));
float d0 = Lerp(dy, d00, d10); float d0 = Lerp(dy, d00, d10);
float d1 = Lerp(dy, d01, d11); float d1 = Lerp(dy, d01, d11);
@@ -213,7 +213,7 @@ __device__ static inline float Density(float3 Pobj, float3 pMin, float3 pMax,
array. */ array. */
__device__ static inline float __device__ static inline float
transmittance(float3 p0, float3 p1, float3 pMin, transmittance(float3 p0, float3 p1, float3 pMin,
float3 pMax, float sigma_t, float3 pMax, float sigma_t,
float density[], int nVoxels[3]) { float density[], int nVoxels[3]) {
float rayT0, rayT1; float rayT0, rayT1;
Ray ray; Ray ray;
@@ -253,7 +253,7 @@ distanceSquared(float3 a, float3 b) {
} }
__device__ static inline float __device__ static inline float
raymarch(float density[], int nVoxels[3], Ray ray) { raymarch(float density[], int nVoxels[3], Ray ray) {
float rayT0, rayT1; float rayT0, rayT1;
float3 pMin = {.3f, -.2f, .3f}, pMax = {1.8f, 2.3f, 1.8f}; float3 pMin = {.3f, -.2f, .3f}, pMax = {1.8f, 2.3f, 1.8f};
@@ -281,7 +281,7 @@ raymarch(float density[], int nVoxels[3], Ray ray) {
float t = rayT0; float t = rayT0;
float3 pos = ray.origin + ray.dir * rayT0; float3 pos = ray.origin + ray.dir * rayT0;
float3 dirStep = ray.dir * stepT; float3 dirStep = ray.dir * stepT;
while (t < rayT1) while (t < rayT1)
{ {
float d = Density(pos, pMin, pMax, density, nVoxels); float d = Density(pos, pMin, pMax, density, nVoxels);
@@ -291,7 +291,7 @@ raymarch(float density[], int nVoxels[3], Ray ray) {
break; break;
// direct lighting // direct lighting
float Li = lightIntensity / distanceSquared(lightPos, pos) * float Li = lightIntensity / distanceSquared(lightPos, pos) *
transmittance(lightPos, pos, pMin, pMax, sigma_a + sigma_s, transmittance(lightPos, pos, pMin, pMax, sigma_a + sigma_s,
density, nVoxels); density, nVoxels);
L += stepDist * atten * d * sigma_s * (Li + Le); L += stepDist * atten * d * sigma_s * (Li + Le);
@@ -314,20 +314,20 @@ raymarch(float density[], int nVoxels[3], Ray ray) {
*/ */
__device__ static void __device__ static void
volume_tile(int x0, int y0, int x1, volume_tile(int x0, int y0, int x1,
int y1, float density[], int nVoxels[3], int y1, float density[], int nVoxels[3],
const float raster2camera[4][4], const float raster2camera[4][4],
const float camera2world[4][4], const float camera2world[4][4],
int width, int height, float image[]) { int width, int height, float image[]) {
// Work on 4x4=16 pixel big tiles of the image. This function thus // Work on 4x4=16 pixel big tiles of the image. This function thus
// implicitly assumes that both (x1-x0) and (y1-y0) are evenly divisble // implicitly assumes that both (x1-x0) and (y1-y0) are evenly divisble
// by 4. // by 4.
for (int y = y0; y < y1; y += 8) { for (int y = y0; y < y1; y += 8) {
for (int x = x0; x < x1; x += 8) { for (int x = x0; x < x1; x += 8) {
for (int ob = 0; ob < 64; ob += programCount) for (int ob = 0; ob < 64; ob += programCount)
{ {
const int o = ob + programIndex; const int o = ob + programIndex;
// These two arrays encode the mapping from [0,15] to // These two arrays encode the mapping from [0,15] to
// offsets within the 4x4 pixel block so that we render // offsets within the 4x4 pixel block so that we render
// each pixel inside the block // each pixel inside the block
@@ -360,9 +360,9 @@ volume_tile(int x0, int y0, int x1,
__global__ void __global__ void
volume_task(float density[], int _nVoxels[3], volume_task(float density[], int _nVoxels[3],
const float _raster2camera[4][4], const float _raster2camera[4][4],
const float _camera2world[4][4], const float _camera2world[4][4],
int width, int height, float image[]) { int width, int height, float image[]) {
if (taskIndex0 >= taskCount0) return; if (taskIndex0 >= taskCount0) return;
@@ -389,7 +389,7 @@ volume_task(float density[], int _nVoxels[3],
raster2camera[3][1] = _raster2camera[3][1]; raster2camera[3][1] = _raster2camera[3][1];
raster2camera[3][2] = _raster2camera[3][2]; raster2camera[3][2] = _raster2camera[3][2];
raster2camera[3][3] = _raster2camera[3][3]; raster2camera[3][3] = _raster2camera[3][3];
float camera2world[4][4]; float camera2world[4][4];
camera2world[0][0] = _camera2world[0][0]; camera2world[0][0] = _camera2world[0][0];
camera2world[0][1] = _camera2world[0][1]; camera2world[0][1] = _camera2world[0][1];
@@ -430,24 +430,24 @@ volume_task(float density[], int _nVoxels[3],
extern "C" extern "C"
__global__ void __global__ void
volume_ispc_tasks___export( float density[], int nVoxels[3], volume_ispc_tasks___export( float density[], int nVoxels[3],
const float raster2camera[4][4], const float raster2camera[4][4],
const float camera2world[4][4], const float camera2world[4][4],
int width, int height, float image[]) { int width, int height, float image[]) {
// Launch tasks to work on (dx,dy)-sized tiles of the image // Launch tasks to work on (dx,dy)-sized tiles of the image
int dx = 8, dy = 8; int dx = 8, dy = 8;
int nTasks = ((width+(dx-1))/dx) * ((height+(dy-1))/dy); int nTasks = ((width+(dx-1))/dx) * ((height+(dy-1))/dy);
launch(nTasks,1,1,volume_task) launch(nTasks,1,1,volume_task)
(density, nVoxels, raster2camera, camera2world, (density, nVoxels, raster2camera, camera2world,
width, height, image); width, height, image);
cudaDeviceSynchronize(); cudaDeviceSynchronize();
} }
extern "C" extern "C"
__host__ void __host__ void
volume_ispc_tasks( float density[], int nVoxels[3], volume_ispc_tasks( float density[], int nVoxels[3],
const float raster2camera[4][4], const float raster2camera[4][4],
const float camera2world[4][4], const float camera2world[4][4],
int width, int height, float image[]) { int width, int height, float image[]) {
volume_ispc_tasks___export<<<1,32>>>(density, nVoxels, raster2camera, camera2world, width, height,image); volume_ispc_tasks___export<<<1,32>>>(density, nVoxels, raster2camera, camera2world, width, height,image);
cudaDeviceSynchronize(); cudaDeviceSynchronize();

View File

@@ -1,5 +1,5 @@
/* /*
Copyright (c) 2011, Intel Corporation Copyright (c) 2011-2014, Intel Corporation
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
@@ -28,7 +28,7 @@
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ */
@@ -41,7 +41,7 @@ struct Ray {
static inline void static inline void
generateRay(const uniform float raster2camera[4][4], generateRay(const uniform float raster2camera[4][4],
const uniform float camera2world[4][4], const uniform float camera2world[4][4],
float x, float y, Ray &ray) { float x, float y, Ray &ray) {
// transform raster coordinate (x, y, 0) to camera space // transform raster coordinate (x, y, 0) to camera space
@@ -100,7 +100,7 @@ IntersectP(Ray ray, float3 pMin, float3 pMax, float &hit0, float &hit1) {
} }
t0 = max(tNear.z, t0); t0 = max(tNear.z, t0);
t1 = min(tFar.z, t1); t1 = min(tFar.z, t1);
if (t0 <= t1) { if (t0 <= t1) {
hit0 = t0; hit0 = t0;
hit1 = t1; hit1 = t1;
@@ -116,7 +116,7 @@ static inline float Lerp(float t, float a, float b) {
} }
static inline float D(int x, int y, int z, uniform int nVoxels[3], static inline float D(int x, int y, int z, uniform int nVoxels[3],
uniform float density[]) { uniform float density[]) {
x = clamp(x, 0, nVoxels[0]-1); x = clamp(x, 0, nVoxels[0]-1);
y = clamp(y, 0, nVoxels[1]-1); y = clamp(y, 0, nVoxels[1]-1);
@@ -131,9 +131,9 @@ static inline float3 Offset(float3 p, float3 pMin, float3 pMax) {
} }
static inline float Density(float3 Pobj, float3 pMin, float3 pMax, static inline float Density(float3 Pobj, float3 pMin, float3 pMax,
uniform float density[], uniform int nVoxels[3]) { uniform float density[], uniform int nVoxels[3]) {
if (!Inside(Pobj, pMin, pMax)) if (!Inside(Pobj, pMin, pMax))
return 0; return 0;
// Compute voxel coordinates and offsets for _Pobj_ // Compute voxel coordinates and offsets for _Pobj_
float3 vox = Offset(Pobj, pMin, pMax); float3 vox = Offset(Pobj, pMin, pMax);
@@ -144,13 +144,13 @@ static inline float Density(float3 Pobj, float3 pMin, float3 pMax,
float dx = vox.x - vx, dy = vox.y - vy, dz = vox.z - vz; float dx = vox.x - vx, dy = vox.y - vy, dz = vox.z - vz;
// Trilinearly interpolate density values to compute local density // Trilinearly interpolate density values to compute local density
float d00 = Lerp(dx, D(vx, vy, vz, nVoxels, density), float d00 = Lerp(dx, D(vx, vy, vz, nVoxels, density),
D(vx+1, vy, vz, nVoxels, density)); D(vx+1, vy, vz, nVoxels, density));
float d10 = Lerp(dx, D(vx, vy+1, vz, nVoxels, density), float d10 = Lerp(dx, D(vx, vy+1, vz, nVoxels, density),
D(vx+1, vy+1, vz, nVoxels, density)); D(vx+1, vy+1, vz, nVoxels, density));
float d01 = Lerp(dx, D(vx, vy, vz+1, nVoxels, density), float d01 = Lerp(dx, D(vx, vy, vz+1, nVoxels, density),
D(vx+1, vy, vz+1, nVoxels, density)); D(vx+1, vy, vz+1, nVoxels, density));
float d11 = Lerp(dx, D(vx, vy+1, vz+1, nVoxels, density), float d11 = Lerp(dx, D(vx, vy+1, vz+1, nVoxels, density),
D(vx+1, vy+1, vz+1, nVoxels, density)); D(vx+1, vy+1, vz+1, nVoxels, density));
float d0 = Lerp(dy, d00, d10); float d0 = Lerp(dy, d00, d10);
float d1 = Lerp(dy, d01, d11); float d1 = Lerp(dy, d01, d11);
@@ -164,7 +164,7 @@ static inline float Density(float3 Pobj, float3 pMin, float3 pMax,
array. */ array. */
static inline float static inline float
transmittance(uniform float3 p0, float3 p1, uniform float3 pMin, transmittance(uniform float3 p0, float3 p1, uniform float3 pMin,
uniform float3 pMax, uniform float sigma_t, uniform float3 pMax, uniform float sigma_t,
uniform float density[], uniform int nVoxels[3]) { uniform float density[], uniform int nVoxels[3]) {
float rayT0, rayT1; float rayT0, rayT1;
Ray ray; Ray ray;
@@ -204,7 +204,7 @@ distanceSquared(float3 a, float3 b) {
} }
static inline float static inline float
raymarch(uniform float density[], uniform int nVoxels[3], Ray ray) { raymarch(uniform float density[], uniform int nVoxels[3], Ray ray) {
float rayT0, rayT1; float rayT0, rayT1;
const uniform float3 pMin = {.3, -.2, .3}, pMax = {1.8, 2.3, 1.8}; const uniform float3 pMin = {.3, -.2, .3}, pMax = {1.8, 2.3, 1.8};
@@ -232,7 +232,7 @@ raymarch(uniform float density[], uniform int nVoxels[3], Ray ray) {
float t = rayT0; float t = rayT0;
float3 pos = ray.origin + ray.dir * rayT0; float3 pos = ray.origin + ray.dir * rayT0;
float3 dirStep = ray.dir * stepT; float3 dirStep = ray.dir * stepT;
while (t < rayT1) while (t < rayT1)
{ {
float d = Density(pos, pMin, pMax, density, nVoxels); float d = Density(pos, pMin, pMax, density, nVoxels);
@@ -242,7 +242,7 @@ raymarch(uniform float density[], uniform int nVoxels[3], Ray ray) {
break; break;
// direct lighting // direct lighting
float Li = lightIntensity / distanceSquared(lightPos, pos) * float Li = lightIntensity / distanceSquared(lightPos, pos) *
transmittance(lightPos, pos, pMin, pMax, sigma_a + sigma_s, transmittance(lightPos, pos, pMin, pMax, sigma_a + sigma_s,
density, nVoxels); density, nVoxels);
L += stepDist * atten * d * sigma_s * (Li + Le); L += stepDist * atten * d * sigma_s * (Li + Le);
@@ -265,16 +265,16 @@ raymarch(uniform float density[], uniform int nVoxels[3], Ray ray) {
*/ */
static inline void static inline void
volume_tile(uniform int x0, uniform int y0, uniform int x1, volume_tile(uniform int x0, uniform int y0, uniform int x1,
uniform int y1, uniform float density[], uniform int nVoxels[3], uniform int y1, uniform float density[], uniform int nVoxels[3],
const uniform float raster2camera[4][4], const uniform float raster2camera[4][4],
const uniform float camera2world[4][4], const uniform float camera2world[4][4],
uniform int width, uniform int height, uniform float image[]) { uniform int width, uniform int height, uniform float image[]) {
// Work on 4x4=16 pixel big tiles of the image. This function thus // Work on 4x4=16 pixel big tiles of the image. This function thus
// implicitly assumes that both (x1-x0) and (y1-y0) are evenly divisble // implicitly assumes that both (x1-x0) and (y1-y0) are evenly divisble
// by 4. // by 4.
#if 0 #if 0
for (uniform int y = y0; y < y1; y += 8) for (uniform int y = y0; y < y1; y += 8)
for (uniform int x = x0; x < x1; x += 8) for (uniform int x = x0; x < x1; x += 8)
foreach (o = 0 ... 64) foreach (o = 0 ... 64)
{ {
// These two arrays encode the mapping from [0,15] to // These two arrays encode the mapping from [0,15] to
@@ -304,7 +304,7 @@ volume_tile(uniform int x0, uniform int y0, uniform int x1,
image[offset] = raymarch(density, nVoxels, ray); image[offset] = raymarch(density, nVoxels, ray);
} }
#else #else
foreach_tiled (y = y0 ... y1, x = x0 ... x1) foreach_tiled (y = y0 ... y1, x = x0 ... x1)
{ {
// Use viewing parameters to compute the corresponding ray // Use viewing parameters to compute the corresponding ray
// for the pixel // for the pixel
@@ -321,10 +321,10 @@ volume_tile(uniform int x0, uniform int y0, uniform int x1,
task void task void
volume_task(uniform float density[], uniform int _nVoxels[3], volume_task(uniform float density[], uniform int _nVoxels[3],
const uniform float _raster2camera[4][4], const uniform float _raster2camera[4][4],
const uniform float _camera2world[4][4], const uniform float _camera2world[4][4],
uniform int width, uniform int height, uniform float image[]) uniform int width, uniform int height, uniform float image[])
{ {
if (taskIndex >= taskCount) return; if (taskIndex >= taskCount) return;
@@ -351,7 +351,7 @@ volume_task(uniform float density[], uniform int _nVoxels[3],
raster2camera[3][1] = _raster2camera[3][1]; raster2camera[3][1] = _raster2camera[3][1];
raster2camera[3][2] = _raster2camera[3][2]; raster2camera[3][2] = _raster2camera[3][2];
raster2camera[3][3] = _raster2camera[3][3]; raster2camera[3][3] = _raster2camera[3][3];
uniform float camera2world[4][4]; uniform float camera2world[4][4];
camera2world[0][0] = _camera2world[0][0]; camera2world[0][0] = _camera2world[0][0];
camera2world[0][1] = _camera2world[0][1]; camera2world[0][1] = _camera2world[0][1];
@@ -390,24 +390,24 @@ volume_task(uniform float density[], uniform int _nVoxels[3],
export void export void
volume_ispc(uniform float density[], uniform int nVoxels[3], volume_ispc(uniform float density[], uniform int nVoxels[3],
const uniform float raster2camera[4][4], const uniform float raster2camera[4][4],
const uniform float camera2world[4][4], const uniform float camera2world[4][4],
uniform int width, uniform int height, uniform float image[]) { uniform int width, uniform int height, uniform float image[]) {
volume_tile(0, 0, width, height, density, nVoxels, raster2camera, volume_tile(0, 0, width, height, density, nVoxels, raster2camera,
camera2world, width, height, image); camera2world, width, height, image);
} }
export void export void
volume_ispc_tasks(uniform float density[], uniform int nVoxels[3], volume_ispc_tasks(uniform float density[], uniform int nVoxels[3],
const uniform float raster2camera[4][4], const uniform float raster2camera[4][4],
const uniform float camera2world[4][4], const uniform float camera2world[4][4],
uniform int width, uniform int height, uniform float image[]) { uniform int width, uniform int height, uniform float image[]) {
// Launch tasks to work on (dx,dy)-sized tiles of the image // Launch tasks to work on (dx,dy)-sized tiles of the image
const uniform int dx = 8, dy = 8; const uniform int dx = 8, dy = 8;
const uniform int nTasks = ((width+(dx-1))/dx) * ((height+(dy-1))/dy); const uniform int nTasks = ((width+(dx-1))/dx) * ((height+(dy-1))/dy);
launch[nTasks] volume_task(density, nVoxels, raster2camera, camera2world, launch[nTasks] volume_task(density, nVoxels, raster2camera, camera2world,
width, height, image); width, height, image);
sync; sync;
} }

View File

@@ -66,7 +66,7 @@ extern "C" {
void ISPCSync(void *handle); void ISPCSync(void *handle);
void *ISPCAlloc(void **handlePtr, int64_t size, int32_t alignment); void *ISPCAlloc(void **handlePtr, int64_t size, int32_t alignment);
} }
void ISPCLaunch(void **handle, void *f, void *d, int count0, int count1, int count2) { void ISPCLaunch(void **handle, void *f, void *d, int count0, int count1, int count2) {
*handle = (void *)0xdeadbeef; *handle = (void *)0xdeadbeef;
typedef void (*TaskFuncType)(void *, int, int, int, int, int, int, int, int, int, int); typedef void (*TaskFuncType)(void *, int, int, int, int, int, int, int, int, int, int);

View File

@@ -1,5 +1,5 @@
/* /*
Copyright (c) 2010-2011, Intel Corporation Copyright (c) 2010-2014, Intel Corporation
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
@@ -94,12 +94,12 @@ static void createContext(const int deviceId = 0, const bool verbose = true)
int devMajor, devMinor; int devMajor, devMinor;
checkCudaErrors(cuDeviceComputeCapability(&devMajor, &devMinor, device)); checkCudaErrors(cuDeviceComputeCapability(&devMajor, &devMinor, device));
if (verbose) if (verbose)
std::cout << "Device Compute Capability: " std::cout << "Device Compute Capability: "
<< devMajor << "." << devMinor << "\n"; << devMajor << "." << devMinor << "\n";
if (devMajor < 2) { if (devMajor < 2) {
if (verbose) if (verbose)
std::cerr << "ERROR: Device 0 is not SM 2.0 or greater\n"; std::cerr << "ERROR: Device 0 is not SM 2.0 or greater\n";
exit(1); exit(1);
} }
// Create driver context // Create driver context
@@ -180,7 +180,7 @@ static CUmodule loadModule(
if (print_log) if (print_log)
fprintf(stderr, "Loading ptx..\n"); fprintf(stderr, "Loading ptx..\n");
myErr = cuLinkAddData(*lState, CU_JIT_INPUT_PTX, (void*)module, strlen(module)+1, 0, 0, 0, 0); myErr = cuLinkAddData(*lState, CU_JIT_INPUT_PTX, (void*)module, strlen(module)+1, 0, 0, 0, 0);
myErr = cuLinkAddFile(*lState, CU_JIT_INPUT_LIBRARY, cudadevrt_lib, 0,0,0); myErr = cuLinkAddFile(*lState, CU_JIT_INPUT_LIBRARY, cudadevrt_lib, 0,0,0);
// PTX May also be loaded from file, as per below. // PTX May also be loaded from file, as per below.
// myErr = cuLinkAddFile(*lState, CU_JIT_INPUT_PTX, "myPtx64.ptx",0,0,0); // myErr = cuLinkAddFile(*lState, CU_JIT_INPUT_PTX, "myPtx64.ptx",0,0,0);
} }
@@ -190,10 +190,10 @@ static CUmodule loadModule(
if ( myErr != CUDA_SUCCESS ) if ( myErr != CUDA_SUCCESS )
{ {
// Errors will be put in error_log, per CU_JIT_ERROR_LOG_BUFFER option above. // Errors will be put in error_log, per CU_JIT_ERROR_LOG_BUFFER option above.
fprintf(stderr,"PTX Linker Error:\n%s\n",error_log); fprintf(stderr,"PTX Linker Error:\n%s\n",error_log);
assert(0); assert(0);
} }
// Linker walltime and info_log were requested in options above. // Linker walltime and info_log were requested in options above.
if (print_log) if (print_log)
@@ -260,13 +260,13 @@ static std::vector<char> readBinary(const char * filename, const bool print_size
fprintf(stderr, "file %s not found\n", filename); fprintf(stderr, "file %s not found\n", filename);
assert(0); assert(0);
} }
fseek(fp, 0, SEEK_END); fseek(fp, 0, SEEK_END);
const unsigned long long size = ftell(fp); /*calc the size needed*/ const unsigned long long size = ftell(fp); /*calc the size needed*/
fseek(fp, 0, SEEK_SET); fseek(fp, 0, SEEK_SET);
buffer.resize(size); buffer.resize(size);
if (fp == NULL){ /*ERROR detection if file == empty*/ if (fp == NULL){ /*ERROR detection if file == empty*/
fprintf(stderr, "Error: There was an Error reading the file %s \n",filename); fprintf(stderr, "Error: There was an Error reading the file %s \n",filename);
exit(1); exit(1);
} }
else if (fread(&buffer[0], sizeof(char), size, fp) != size){ /* if count of read bytes != calculated size of .bin file -> ERROR*/ else if (fread(&buffer[0], sizeof(char), size, fp) != size){ /* if count of read bytes != calculated size of .bin file -> ERROR*/
@@ -279,7 +279,7 @@ static std::vector<char> readBinary(const char * filename, const bool print_size
} }
static double CUDALaunch( static double CUDALaunch(
void **handlePtr, void **handlePtr,
const char * func_name, const char * func_name,
void **func_args, void **func_args,
const bool print_log = true, const bool print_log = true,
@@ -396,7 +396,7 @@ int main(int argc, char *argv[]) {
#endif #endif
float expected_result[64]; float expected_result[64];
memset(expected_result, 0, 64*sizeof(float)); memset(expected_result, 0, 64*sizeof(float));
devicePtr d_expected_result = deviceMalloc(64*sizeof(float)); devicePtr d_expected_result = deviceMalloc(64*sizeof(float));
memcpyH2D(d_expected_result, expected_result, 64*sizeof(float)); memcpyH2D(d_expected_result, expected_result, 64*sizeof(float));

View File

@@ -1,5 +1,5 @@
/* /*
Copyright (c) 2010-2011, Intel Corporation Copyright (c) 2010-2014, Intel Corporation
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
@@ -110,7 +110,7 @@ int main(int argc, char *argv[]) {
int errors = 0; int errors = 0;
for (int i = 0; i < w; ++i) { for (int i = 0; i < w; ++i) {
if (returned_result[i] != expected_result[i]) if (returned_result[i] != expected_result[i])
{ {
#ifdef EXPECT_FAILURE #ifdef EXPECT_FAILURE
// bingo, failed // bingo, failed