White space and copyright fixes in examples.
This commit is contained in:
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
Copyright (c) 2010-2014, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -28,7 +28,7 @@
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifdef _MSC_VER
|
||||
@@ -141,12 +141,12 @@ int main(int argc, char **argv)
|
||||
}
|
||||
|
||||
// Report results and save image
|
||||
printf("[aobench ispc + tasks]:\t\t[%.3f] msec (%d x %d image)\n",
|
||||
printf("[aobench ispc + tasks]:\t\t[%.3f] msec (%d x %d image)\n",
|
||||
minTimeISPCTasks, width, height);
|
||||
savePPM("ao-ispc-tasks.ppm", width, height);
|
||||
savePPM("ao-ispc-tasks.ppm", width, height);
|
||||
|
||||
delete img;
|
||||
delete fimg;
|
||||
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
// -*- mode: c++ -*-
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
Copyright (c) 2010-2014, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -29,7 +29,7 @@ met:
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
/*
|
||||
Based on Syoyo Fujita's aobench: http://code.google.com/p/aobench
|
||||
@@ -37,7 +37,7 @@ met:
|
||||
|
||||
#include "cuda_helpers.cuh"
|
||||
|
||||
#define NAO_SAMPLES 8
|
||||
#define NAO_SAMPLES 8
|
||||
//#define M_PI 3.1415926535f
|
||||
|
||||
#define vec Float3
|
||||
@@ -109,7 +109,7 @@ static inline unsigned int random(RNGState * state)
|
||||
|
||||
b = ((state->z1 << 6) ^ state->z1) >> 13;
|
||||
state->z1 = ((state->z1 & 4294967294U) << 18) ^ b;
|
||||
b = ((state->z2 << 2) ^ state->z2) >> 27;
|
||||
b = ((state->z2 << 2) ^ state->z2) >> 27;
|
||||
state->z2 = ((state->z2 & 4294967288U) << 2) ^ b;
|
||||
b = ((state->z3 << 13) ^ state->z3) >> 21;
|
||||
state->z3 = ((state->z3 & 4294967280U) << 7) ^ b;
|
||||
@@ -128,7 +128,7 @@ static inline float frandom(RNGState * state)
|
||||
}
|
||||
|
||||
__device__
|
||||
static inline void seed_rng(RNGState * state,
|
||||
static inline void seed_rng(RNGState * state,
|
||||
unsigned int seed) {
|
||||
state->z1 = seed;
|
||||
state->z2 = seed ^ 0xbeeff00d;
|
||||
@@ -143,7 +143,7 @@ struct Isect {
|
||||
float t;
|
||||
vec p;
|
||||
vec n;
|
||||
int hit;
|
||||
int hit;
|
||||
};
|
||||
|
||||
struct Sphere {
|
||||
@@ -190,7 +190,7 @@ ray_plane_intersect(Isect &isect,const Ray &ray, const Plane &plane) {
|
||||
float v = dot(ray.dir, plane.n);
|
||||
|
||||
#if 0
|
||||
if (abs(v) < 1.0f-17)
|
||||
if (abs(v) < 1.0f-17)
|
||||
return;
|
||||
else {
|
||||
float t = -(dot(ray.org, plane.n) + d) / v;
|
||||
@@ -238,7 +238,7 @@ ray_sphere_intersect(Isect &isect,const Ray &ray, const Sphere &sphere) {
|
||||
}
|
||||
}
|
||||
#else
|
||||
if (D <= 0.0f)
|
||||
if (D <= 0.0f)
|
||||
return;
|
||||
|
||||
float t = -B - sqrt(D);
|
||||
@@ -319,8 +319,8 @@ ambient_occlusion(Isect &isect, const Plane &plane, const Sphere spheres[3],
|
||||
occIsect.hit = 0;
|
||||
|
||||
for ( int snum = 0; snum < 3; ++snum)
|
||||
ray_sphere_intersect(occIsect, ray, spheres[snum]);
|
||||
ray_plane_intersect (occIsect, ray, plane);
|
||||
ray_sphere_intersect(occIsect, ray, spheres[snum]);
|
||||
ray_plane_intersect (occIsect, ray, plane);
|
||||
|
||||
if (occIsect.hit) occlusion += 1.0f;
|
||||
}
|
||||
@@ -337,10 +337,10 @@ ambient_occlusion(Isect &isect, const Plane &plane, const Sphere spheres[3],
|
||||
__device__
|
||||
static inline void ao_tiles(
|
||||
int x0, int x1,
|
||||
int y0, int y1,
|
||||
int y0, int y1,
|
||||
int w, int h,
|
||||
int nsubsamples,
|
||||
float image[])
|
||||
int nsubsamples,
|
||||
float image[])
|
||||
{
|
||||
const Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } };
|
||||
const Sphere spheres[3] = {
|
||||
@@ -411,8 +411,8 @@ static inline void ao_tiles(
|
||||
|
||||
extern "C"
|
||||
__global__
|
||||
void ao_task( int width, int height,
|
||||
int nsubsamples, float image[])
|
||||
void ao_task( int width, int height,
|
||||
int nsubsamples, float image[])
|
||||
{
|
||||
if (taskIndex0 >= taskCount0) return;
|
||||
if (taskIndex1 >= taskCount1) return;
|
||||
@@ -428,8 +428,8 @@ void ao_task( int width, int height,
|
||||
extern "C"
|
||||
__global__
|
||||
void ao_ispc_tasks___export(
|
||||
int w, int h, int nsubsamples,
|
||||
float image[])
|
||||
int w, int h, int nsubsamples,
|
||||
float image[])
|
||||
{
|
||||
const int ntilex = (w+TILEX-1)/TILEX;
|
||||
const int ntiley = (h+TILEY-1)/TILEY;
|
||||
@@ -439,8 +439,8 @@ void ao_ispc_tasks___export(
|
||||
|
||||
extern "C"
|
||||
__host__ void ao_ispc_tasks(
|
||||
int w, int h, int nsubsamples,
|
||||
float image[])
|
||||
int w, int h, int nsubsamples,
|
||||
float image[])
|
||||
{
|
||||
ao_ispc_tasks___export<<<1,32>>>(w,h,nsubsamples,image);
|
||||
cudaDeviceSynchronize();
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
// -*- mode: c++ -*-
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
Copyright (c) 2010-2014, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -29,13 +29,13 @@
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
/*
|
||||
Based on Syoyo Fujita's aobench: http://code.google.com/p/aobench
|
||||
*/
|
||||
|
||||
#define NAO_SAMPLES 8
|
||||
#define NAO_SAMPLES 8
|
||||
#define M_PI 3.1415926535f
|
||||
|
||||
typedef float<3> vec;
|
||||
@@ -50,7 +50,7 @@ struct Isect {
|
||||
float t;
|
||||
vec p;
|
||||
vec n;
|
||||
int hit;
|
||||
int hit;
|
||||
};
|
||||
|
||||
struct Sphere {
|
||||
@@ -94,7 +94,7 @@ ray_plane_intersect(Isect &isect, Ray &ray, const Plane &plane) {
|
||||
float v = dot(ray.dir, plane.n);
|
||||
|
||||
#if 0
|
||||
cif (abs(v) < 1.0e-17)
|
||||
cif (abs(v) < 1.0e-17)
|
||||
return;
|
||||
else {
|
||||
float t = -(dot(ray.org, plane.n) + d) / v;
|
||||
@@ -141,7 +141,7 @@ ray_sphere_intersect(Isect &isect, Ray &ray, const Sphere &sphere) {
|
||||
}
|
||||
}
|
||||
#else
|
||||
cif (D <=0.0f)
|
||||
cif (D <=0.0f)
|
||||
return;
|
||||
|
||||
float t = -B - sqrt(D);
|
||||
@@ -220,8 +220,8 @@ ambient_occlusion(Isect &isect, const Plane &plane, const Sphere spheres[3],
|
||||
occIsect.hit = 0;
|
||||
|
||||
for (uniform int snum = 0; snum < 3; ++snum)
|
||||
ray_sphere_intersect(occIsect, ray, spheres[snum]);
|
||||
ray_plane_intersect (occIsect, ray, plane);
|
||||
ray_sphere_intersect(occIsect, ray, spheres[snum]);
|
||||
ray_plane_intersect (occIsect, ray, plane);
|
||||
|
||||
if (occIsect.hit) occlusion += 1.0;
|
||||
}
|
||||
@@ -233,10 +233,10 @@ ambient_occlusion(Isect &isect, const Plane &plane, const Sphere spheres[3],
|
||||
|
||||
static inline void ao_tiles(
|
||||
uniform int x0, uniform int x1,
|
||||
uniform int y0, uniform int y1,
|
||||
uniform int y0, uniform int y1,
|
||||
uniform int w, uniform int h,
|
||||
uniform int nsubsamples,
|
||||
uniform float image[])
|
||||
uniform int nsubsamples,
|
||||
uniform float image[])
|
||||
{
|
||||
const Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } };
|
||||
const Sphere spheres[3] = {
|
||||
@@ -306,7 +306,7 @@ static inline void ao_tiles(
|
||||
#define TILEX max(64,programCount*2)
|
||||
#define TILEY 4
|
||||
|
||||
export void ao_ispc(uniform int w, uniform int h, uniform int nsubsamples,
|
||||
export void ao_ispc(uniform int w, uniform int h, uniform int nsubsamples,
|
||||
uniform float image[]) {
|
||||
const uniform int x0 = 0;
|
||||
const uniform int x1 = w;
|
||||
@@ -315,8 +315,8 @@ export void ao_ispc(uniform int w, uniform int h, uniform int nsubsamples,
|
||||
ao_tiles(x0,x1,y0,y1, w, h, nsubsamples, image);
|
||||
}
|
||||
|
||||
void task ao_task(uniform int width, uniform int height,
|
||||
uniform int nsubsamples, uniform float image[])
|
||||
void task ao_task(uniform int width, uniform int height,
|
||||
uniform int nsubsamples, uniform float image[])
|
||||
{
|
||||
if (taskIndex0 >= taskCount0) return;
|
||||
if (taskIndex1 >= taskCount1) return;
|
||||
@@ -330,8 +330,8 @@ void task ao_task(uniform int width, uniform int height,
|
||||
}
|
||||
|
||||
|
||||
export void ao_ispc_tasks(uniform int w, uniform int h, uniform int nsubsamples,
|
||||
uniform float image[])
|
||||
export void ao_ispc_tasks(uniform int w, uniform int h, uniform int nsubsamples,
|
||||
uniform float image[])
|
||||
{
|
||||
const uniform int ntilex = (w+TILEX-1)/TILEX;
|
||||
const uniform int ntiley = (h+TILEY-1)/TILEY;
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2011, Intel Corporation
|
||||
Copyright (c) 2011-2014, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -28,7 +28,7 @@
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifdef _MSC_VER
|
||||
@@ -145,13 +145,13 @@ CreateInputDataFromFile(const char *path) {
|
||||
}
|
||||
|
||||
// Load data chunk and update pointers
|
||||
input->chunk = (uint8_t *)lAlignedMalloc(input->header.inputDataChunkSize,
|
||||
input->chunk = (uint8_t *)lAlignedMalloc(input->header.inputDataChunkSize,
|
||||
ALIGNMENT_BYTES);
|
||||
if (fread(input->chunk, input->header.inputDataChunkSize, 1, in) != 1) {
|
||||
fprintf(stderr, "Preumature EOF reading file \"%s\"\n", path);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
input->arrays.zBuffer =
|
||||
(float *)&input->chunk[input->header.inputDataArrayOffsets[idaZBuffer]];
|
||||
input->arrays.normalEncoded_x =
|
||||
@@ -199,21 +199,21 @@ void WriteFrame(const char *filename, const InputData *input,
|
||||
const Framebuffer &framebuffer) {
|
||||
// Deswizzle and copy to RGBA output
|
||||
// Doesn't need to be fast... only happens once
|
||||
size_t imageBytes = 3 * input->header.framebufferWidth *
|
||||
size_t imageBytes = 3 * input->header.framebufferWidth *
|
||||
input->header.framebufferHeight;
|
||||
uint8_t* framebufferAOS = (uint8_t *)lAlignedMalloc(imageBytes, ALIGNMENT_BYTES);
|
||||
memset(framebufferAOS, 0, imageBytes);
|
||||
|
||||
for (int i = 0; i < input->header.framebufferWidth *
|
||||
for (int i = 0; i < input->header.framebufferWidth *
|
||||
input->header.framebufferHeight; ++i) {
|
||||
framebufferAOS[3 * i + 0] = framebuffer.r[i];
|
||||
framebufferAOS[3 * i + 1] = framebuffer.g[i];
|
||||
framebufferAOS[3 * i + 2] = framebuffer.b[i];
|
||||
}
|
||||
|
||||
|
||||
// Write out simple PPM file
|
||||
FILE *out = fopen(filename, "wb");
|
||||
fprintf(out, "P6 %d %d 255\n", input->header.framebufferWidth,
|
||||
fprintf(out, "P6 %d %d 255\n", input->header.framebufferWidth,
|
||||
input->header.framebufferHeight);
|
||||
fwrite(framebufferAOS, imageBytes, 1, out);
|
||||
fclose(out);
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2011, Intel Corporation
|
||||
Copyright (c) 2011-2014, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -28,7 +28,7 @@
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifndef DEFERRED_H
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2011, Intel Corporation
|
||||
Copyright (c) 2011-2014, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -28,7 +28,7 @@
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "deferred.h"
|
||||
@@ -147,7 +147,7 @@ ComputeZBoundsRow(int tileY, int tileWidth, int tileHeight,
|
||||
float minZ, maxZ;
|
||||
ComputeZBounds(tileX * tileWidth, tileX * tileWidth + tileWidth,
|
||||
tileY * tileHeight, tileY * tileHeight + tileHeight,
|
||||
zBuffer, gBufferWidth, cameraProj_33, cameraProj_43,
|
||||
zBuffer, gBufferWidth, cameraProj_33, cameraProj_43,
|
||||
cameraNear, cameraFar, &minZ, &maxZ);
|
||||
minZArray[tileX] = minZ;
|
||||
maxZArray[tileX] = maxZ;
|
||||
@@ -167,7 +167,7 @@ public:
|
||||
{
|
||||
mNumTilesX = gBufferWidth / mTileWidth;
|
||||
mNumTilesY = gBufferHeight / mTileHeight;
|
||||
|
||||
|
||||
// Allocate arrays
|
||||
mMinZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16);
|
||||
mMaxZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16);
|
||||
@@ -213,7 +213,7 @@ public:
|
||||
float minZ = mMinZArrays[srcLevel][(srcY) * srcTilesX + (srcX)];
|
||||
float maxZ = mMaxZArrays[srcLevel][(srcY) * srcTilesX + (srcX)];
|
||||
if (srcX + 1 < srcTilesX) {
|
||||
minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY) * srcTilesX +
|
||||
minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY) * srcTilesX +
|
||||
(srcX + 1)]);
|
||||
maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY) * srcTilesX +
|
||||
(srcX + 1)]);
|
||||
@@ -243,7 +243,7 @@ public:
|
||||
lAlignedFree(mMaxZArrays[i]);
|
||||
}
|
||||
lAlignedFree(mMinZArrays);
|
||||
lAlignedFree(mMaxZArrays);
|
||||
lAlignedFree(mMaxZArrays);
|
||||
}
|
||||
|
||||
int Levels() const { return mLevels; }
|
||||
@@ -277,9 +277,9 @@ private:
|
||||
static MinMaxZTree *gMinMaxZTree = 0;
|
||||
|
||||
void InitDynamicC(InputData *input) {
|
||||
gMinMaxZTree =
|
||||
gMinMaxZTree =
|
||||
new MinMaxZTree(MIN_TILE_WIDTH, MIN_TILE_HEIGHT, DYNAMIC_TREE_LEVELS,
|
||||
input->header.framebufferWidth,
|
||||
input->header.framebufferWidth,
|
||||
input->header.framebufferHeight);
|
||||
}
|
||||
|
||||
@@ -311,7 +311,7 @@ SplitTileMinMax(
|
||||
{
|
||||
float gBufferScale_x = 0.5f * (float)gBufferWidth;
|
||||
float gBufferScale_y = 0.5f * (float)gBufferHeight;
|
||||
|
||||
|
||||
float frustumPlanes_xy[2] = { -(cameraProj_11 * gBufferScale_x),
|
||||
(cameraProj_22 * gBufferScale_y) };
|
||||
float frustumPlanes_z[2] = { tileMidX - gBufferScale_x,
|
||||
@@ -319,7 +319,7 @@ SplitTileMinMax(
|
||||
|
||||
for (int i = 0; i < 2; ++i) {
|
||||
// Normalize
|
||||
float norm = 1.f / sqrtf(frustumPlanes_xy[i] * frustumPlanes_xy[i] +
|
||||
float norm = 1.f / sqrtf(frustumPlanes_xy[i] * frustumPlanes_xy[i] +
|
||||
frustumPlanes_z[i] * frustumPlanes_z[i]);
|
||||
frustumPlanes_xy[i] *= norm;
|
||||
frustumPlanes_z[i] *= norm;
|
||||
@@ -340,23 +340,23 @@ SplitTileMinMax(
|
||||
float light_positionView_z = light_positionView_z_array[lightIndex];
|
||||
float light_attenuationEnd = light_attenuationEnd_array[lightIndex];
|
||||
float light_attenuationEndNeg = -light_attenuationEnd;
|
||||
|
||||
|
||||
// Test lights again against subtile z bounds
|
||||
bool inFrustum[4];
|
||||
inFrustum[0] = (light_positionView_z - subtileMinZ[0] >= light_attenuationEndNeg) &&
|
||||
(subtileMaxZ[0] - light_positionView_z >= light_attenuationEndNeg);
|
||||
inFrustum[1] = (light_positionView_z - subtileMinZ[1] >= light_attenuationEndNeg) &&
|
||||
inFrustum[1] = (light_positionView_z - subtileMinZ[1] >= light_attenuationEndNeg) &&
|
||||
(subtileMaxZ[1] - light_positionView_z >= light_attenuationEndNeg);
|
||||
inFrustum[2] = (light_positionView_z - subtileMinZ[2] >= light_attenuationEndNeg) &&
|
||||
inFrustum[2] = (light_positionView_z - subtileMinZ[2] >= light_attenuationEndNeg) &&
|
||||
(subtileMaxZ[2] - light_positionView_z >= light_attenuationEndNeg);
|
||||
inFrustum[3] = (light_positionView_z - subtileMinZ[3] >= light_attenuationEndNeg) &&
|
||||
inFrustum[3] = (light_positionView_z - subtileMinZ[3] >= light_attenuationEndNeg) &&
|
||||
(subtileMaxZ[3] - light_positionView_z >= light_attenuationEndNeg);
|
||||
|
||||
float dx = light_positionView_z * frustumPlanes_z[0] +
|
||||
float dx = light_positionView_z * frustumPlanes_z[0] +
|
||||
light_positionView_x * frustumPlanes_xy[0];
|
||||
float dy = light_positionView_z * frustumPlanes_z[1] +
|
||||
light_positionView_y * frustumPlanes_xy[1];
|
||||
|
||||
|
||||
if (fabsf(dx) > light_attenuationEnd) {
|
||||
bool positiveX = dx > 0.0f;
|
||||
inFrustum[0] = inFrustum[0] && positiveX; // 00 subtile
|
||||
@@ -423,13 +423,13 @@ half_to_float_fast(uint16_t h) {
|
||||
uint32_t hm = h & (int32_t)0x03FFu; // Pick off mantissa bits
|
||||
|
||||
// sign
|
||||
uint32_t xs = ((uint32_t) hs) << 16;
|
||||
uint32_t xs = ((uint32_t) hs) << 16;
|
||||
// Exponent: unbias the halfp, then bias the single
|
||||
int32_t xes = ((int32_t) (he >> 10)) - 15 + 127;
|
||||
int32_t xes = ((int32_t) (he >> 10)) - 15 + 127;
|
||||
// Exponent
|
||||
uint32_t xe = (uint32_t) (xes << 23);
|
||||
// Mantissa
|
||||
uint32_t xm = ((uint32_t) hm) << 13;
|
||||
uint32_t xm = ((uint32_t) hm) << 13;
|
||||
|
||||
uint32_t bits = (xs | xe | xm);
|
||||
float *fp = reinterpret_cast<float *>(&bits);
|
||||
@@ -470,13 +470,13 @@ ShadeTileC(
|
||||
} else {
|
||||
float twoOverGBufferWidth = 2.0f / gBufferWidth;
|
||||
float twoOverGBufferHeight = 2.0f / gBufferHeight;
|
||||
|
||||
|
||||
for (int32_t y = tileStartY; y < tileEndY; ++y) {
|
||||
float positionScreen_y = -(((0.5f + y) * twoOverGBufferHeight) - 1.f);
|
||||
|
||||
for (int32_t x = tileStartX; x < tileEndX; ++x) {
|
||||
int32_t gBufferOffset = y * gBufferWidth + x;
|
||||
|
||||
|
||||
// Reconstruct position and (negative) view vector from G-buffer
|
||||
float surface_positionView_x, surface_positionView_y, surface_positionView_z;
|
||||
float Vneg_x, Vneg_y, Vneg_z;
|
||||
@@ -485,70 +485,70 @@ ShadeTileC(
|
||||
|
||||
// Compute screen/clip-space position
|
||||
// NOTE: Mind DX11 viewport transform and pixel center!
|
||||
float positionScreen_x = (0.5f + (float)(x)) *
|
||||
float positionScreen_x = (0.5f + (float)(x)) *
|
||||
twoOverGBufferWidth - 1.0f;
|
||||
|
||||
// Unproject depth buffer Z value into view space
|
||||
surface_positionView_z = cameraProj_43 / (z - cameraProj_33);
|
||||
surface_positionView_x = positionScreen_x * surface_positionView_z /
|
||||
surface_positionView_x = positionScreen_x * surface_positionView_z /
|
||||
cameraProj_11;
|
||||
surface_positionView_y = positionScreen_y * surface_positionView_z /
|
||||
surface_positionView_y = positionScreen_y * surface_positionView_z /
|
||||
cameraProj_22;
|
||||
|
||||
|
||||
// We actually end up with a vector pointing *at* the
|
||||
// surface (i.e. the negative view vector)
|
||||
normalize3(surface_positionView_x, surface_positionView_y,
|
||||
normalize3(surface_positionView_x, surface_positionView_y,
|
||||
surface_positionView_z, Vneg_x, Vneg_y, Vneg_z);
|
||||
|
||||
// Reconstruct normal from G-buffer
|
||||
float surface_normal_x, surface_normal_y, surface_normal_z;
|
||||
float normal_x = half_to_float_fast(inputData.normalEncoded_x[gBufferOffset]);
|
||||
float normal_y = half_to_float_fast(inputData.normalEncoded_y[gBufferOffset]);
|
||||
|
||||
|
||||
float f = (normal_x - normal_x * normal_x) + (normal_y - normal_y * normal_y);
|
||||
float m = sqrtf(4.0f * f - 1.0f);
|
||||
|
||||
|
||||
surface_normal_x = m * (4.0f * normal_x - 2.0f);
|
||||
surface_normal_y = m * (4.0f * normal_y - 2.0f);
|
||||
surface_normal_z = 3.0f - 8.0f * f;
|
||||
|
||||
// Load other G-buffer parameters
|
||||
float surface_specularAmount =
|
||||
float surface_specularAmount =
|
||||
half_to_float_fast(inputData.specularAmount[gBufferOffset]);
|
||||
float surface_specularPower =
|
||||
float surface_specularPower =
|
||||
half_to_float_fast(inputData.specularPower[gBufferOffset]);
|
||||
float surface_albedo_x = Unorm8ToFloat32(inputData.albedo_x[gBufferOffset]);
|
||||
float surface_albedo_y = Unorm8ToFloat32(inputData.albedo_y[gBufferOffset]);
|
||||
float surface_albedo_z = Unorm8ToFloat32(inputData.albedo_z[gBufferOffset]);
|
||||
|
||||
|
||||
float lit_x = 0.0f;
|
||||
float lit_y = 0.0f;
|
||||
float lit_z = 0.0f;
|
||||
for (int32_t tileLightIndex = 0; tileLightIndex < tileNumLights;
|
||||
for (int32_t tileLightIndex = 0; tileLightIndex < tileNumLights;
|
||||
++tileLightIndex) {
|
||||
int32_t lightIndex = tileLightIndices[tileLightIndex];
|
||||
|
||||
|
||||
// Gather light data relevant to initial culling
|
||||
float light_positionView_x =
|
||||
float light_positionView_x =
|
||||
inputData.lightPositionView_x[lightIndex];
|
||||
float light_positionView_y =
|
||||
float light_positionView_y =
|
||||
inputData.lightPositionView_y[lightIndex];
|
||||
float light_positionView_z =
|
||||
float light_positionView_z =
|
||||
inputData.lightPositionView_z[lightIndex];
|
||||
float light_attenuationEnd =
|
||||
float light_attenuationEnd =
|
||||
inputData.lightAttenuationEnd[lightIndex];
|
||||
|
||||
|
||||
// Compute light vector
|
||||
float L_x = light_positionView_x - surface_positionView_x;
|
||||
float L_y = light_positionView_y - surface_positionView_y;
|
||||
float L_z = light_positionView_z - surface_positionView_z;
|
||||
|
||||
float distanceToLight2 = dot3(L_x, L_y, L_z, L_x, L_y, L_z);
|
||||
|
||||
|
||||
// Clip at end of attenuation
|
||||
float light_attenutaionEnd2 = light_attenuationEnd * light_attenuationEnd;
|
||||
|
||||
if (distanceToLight2 < light_attenutaionEnd2) {
|
||||
if (distanceToLight2 < light_attenutaionEnd2) {
|
||||
float distanceToLight = sqrtf(distanceToLight2);
|
||||
|
||||
float distanceToLightRcp = 1.f / distanceToLight;
|
||||
@@ -557,12 +557,12 @@ ShadeTileC(
|
||||
L_z *= distanceToLightRcp;
|
||||
|
||||
// Start computing brdf
|
||||
float NdotL = dot3(surface_normal_x, surface_normal_y,
|
||||
float NdotL = dot3(surface_normal_x, surface_normal_y,
|
||||
surface_normal_z, L_x, L_y, L_z);
|
||||
|
||||
|
||||
// Clip back facing
|
||||
if (NdotL > 0.0f) {
|
||||
float light_attenuationBegin =
|
||||
float light_attenuationBegin =
|
||||
inputData.lightAttenuationBegin[lightIndex];
|
||||
|
||||
// Light distance attenuation (linstep)
|
||||
@@ -574,19 +574,19 @@ ShadeTileC(
|
||||
float H_y = (L_y - Vneg_y);
|
||||
float H_z = (L_z - Vneg_z);
|
||||
normalize3(H_x, H_y, H_z, H_x, H_y, H_z);
|
||||
|
||||
float NdotH = dot3(surface_normal_x, surface_normal_y,
|
||||
|
||||
float NdotH = dot3(surface_normal_x, surface_normal_y,
|
||||
surface_normal_z, H_x, H_y, H_z);
|
||||
NdotH = std::max(NdotH, 0.0f);
|
||||
|
||||
float specular = powf(NdotH, surface_specularPower);
|
||||
float specularNorm = (surface_specularPower + 2.0f) *
|
||||
float specularNorm = (surface_specularPower + 2.0f) *
|
||||
(1.0f / 8.0f);
|
||||
float specularContrib = surface_specularAmount *
|
||||
float specularContrib = surface_specularAmount *
|
||||
specularNorm * specular;
|
||||
|
||||
float k = attenuation * NdotL * (1.0f + specularContrib);
|
||||
|
||||
|
||||
float light_color_x = inputData.lightColor_x[lightIndex];
|
||||
float light_color_y = inputData.lightColor_y[lightIndex];
|
||||
float light_color_z = inputData.lightColor_z[lightIndex];
|
||||
@@ -607,7 +607,7 @@ ShadeTileC(
|
||||
lit_x = powf(std::min(std::max(lit_x, 0.0f), 1.0f), gamma);
|
||||
lit_y = powf(std::min(std::max(lit_y, 0.0f), 1.0f), gamma);
|
||||
lit_z = powf(std::min(std::max(lit_z, 0.0f), 1.0f), gamma);
|
||||
|
||||
|
||||
framebuffer_r[gBufferOffset] = Float32ToUnorm8(lit_x);
|
||||
framebuffer_g[gBufferOffset] = Float32ToUnorm8(lit_y);
|
||||
framebuffer_b[gBufferOffset] = Float32ToUnorm8(lit_z);
|
||||
@@ -618,11 +618,11 @@ ShadeTileC(
|
||||
|
||||
|
||||
void
|
||||
ShadeDynamicTileRecurse(InputData *input, int level, int tileX, int tileY,
|
||||
int *lightIndices, int numLights,
|
||||
ShadeDynamicTileRecurse(InputData *input, int level, int tileX, int tileY,
|
||||
int *lightIndices, int numLights,
|
||||
Framebuffer *framebuffer) {
|
||||
const MinMaxZTree *minMaxZTree = gMinMaxZTree;
|
||||
|
||||
|
||||
// If we few enough lights or this is the base case (last level), shade
|
||||
// this full tile directly
|
||||
if (level == 0 || numLights < DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE) {
|
||||
@@ -632,18 +632,18 @@ ShadeDynamicTileRecurse(InputData *input, int level, int tileX, int tileY,
|
||||
int startY = tileY * height;
|
||||
int endX = std::min(input->header.framebufferWidth, startX + width);
|
||||
int endY = std::min(input->header.framebufferHeight, startY + height);
|
||||
|
||||
|
||||
// Skip entirely offscreen tiles
|
||||
if (endX > startX && endY > startY) {
|
||||
ShadeTileC(startX, endX, startY, endY,
|
||||
input->header.framebufferWidth, input->header.framebufferHeight,
|
||||
input->arrays,
|
||||
input->header.cameraProj[0][0], input->header.cameraProj[1][1],
|
||||
input->header.cameraProj[0][0], input->header.cameraProj[1][1],
|
||||
input->header.cameraProj[2][2], input->header.cameraProj[3][2],
|
||||
lightIndices, numLights, VISUALIZE_LIGHT_COUNT,
|
||||
lightIndices, numLights, VISUALIZE_LIGHT_COUNT,
|
||||
framebuffer->r, framebuffer->g, framebuffer->b);
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
// Otherwise, subdivide and 4-way recurse using X and Y splitting planes
|
||||
// Move down a level in the tree
|
||||
@@ -666,9 +666,9 @@ ShadeDynamicTileRecurse(InputData *input, int level, int tileX, int tileY,
|
||||
|
||||
// NOTE: Order is 00, 10, 01, 11
|
||||
// Set defaults up to cull all lights if the tile doesn't exist (offscreen)
|
||||
float minZ[4] = {input->header.cameraFar, input->header.cameraFar,
|
||||
float minZ[4] = {input->header.cameraFar, input->header.cameraFar,
|
||||
input->header.cameraFar, input->header.cameraFar};
|
||||
float maxZ[4] = {input->header.cameraNear, input->header.cameraNear,
|
||||
float maxZ[4] = {input->header.cameraNear, input->header.cameraNear,
|
||||
input->header.cameraNear, input->header.cameraNear};
|
||||
|
||||
minZ[0] = minMaxZTree->MinZ(level, tileX, tileY);
|
||||
@@ -688,7 +688,7 @@ ShadeDynamicTileRecurse(InputData *input, int level, int tileX, int tileY,
|
||||
|
||||
// Cull lights into subtile lists
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
__declspec(align(ALIGNMENT_BYTES))
|
||||
__declspec(align(ALIGNMENT_BYTES))
|
||||
#endif
|
||||
int subtileLightIndices[4][MAX_LIGHTS]
|
||||
#ifndef ISPC_IS_WINDOWS
|
||||
@@ -697,15 +697,15 @@ ShadeDynamicTileRecurse(InputData *input, int level, int tileX, int tileY,
|
||||
;
|
||||
int subtileNumLights[4];
|
||||
SplitTileMinMax(midX, midY, minZ, maxZ,
|
||||
input->header.framebufferWidth, input->header.framebufferHeight,
|
||||
input->header.framebufferWidth, input->header.framebufferHeight,
|
||||
input->header.cameraProj[0][0], input->header.cameraProj[1][1],
|
||||
lightIndices, numLights, input->arrays.lightPositionView_x,
|
||||
input->arrays.lightPositionView_y, input->arrays.lightPositionView_z,
|
||||
lightIndices, numLights, input->arrays.lightPositionView_x,
|
||||
input->arrays.lightPositionView_y, input->arrays.lightPositionView_z,
|
||||
input->arrays.lightAttenuationEnd,
|
||||
subtileLightIndices[0], MAX_LIGHTS, subtileNumLights);
|
||||
|
||||
|
||||
// Recurse into subtiles
|
||||
ShadeDynamicTileRecurse(input, level, tileX , tileY,
|
||||
ShadeDynamicTileRecurse(input, level, tileX , tileY,
|
||||
subtileLightIndices[0], subtileNumLights[0],
|
||||
framebuffer);
|
||||
ShadeDynamicTileRecurse(input, level, tileX + 1, tileY,
|
||||
@@ -744,7 +744,7 @@ IntersectLightsWithTileMinMax(
|
||||
{
|
||||
float gBufferScale_x = 0.5f * (float)gBufferWidth;
|
||||
float gBufferScale_y = 0.5f * (float)gBufferHeight;
|
||||
|
||||
|
||||
float frustumPlanes_xy[4];
|
||||
float frustumPlanes_z[4];
|
||||
|
||||
@@ -753,14 +753,14 @@ IntersectLightsWithTileMinMax(
|
||||
(cameraProj_11 * gBufferScale_x),
|
||||
(cameraProj_22 * gBufferScale_y),
|
||||
-(cameraProj_22 * gBufferScale_y) };
|
||||
|
||||
|
||||
float frustumPlanes_z_v[4] = { tileEndX - gBufferScale_x,
|
||||
-tileStartX + gBufferScale_x,
|
||||
tileEndY - gBufferScale_y,
|
||||
-tileStartY + gBufferScale_y };
|
||||
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
float norm = 1.f / sqrtf(frustumPlanes_xy_v[i] * frustumPlanes_xy_v[i] +
|
||||
float norm = 1.f / sqrtf(frustumPlanes_xy_v[i] * frustumPlanes_xy_v[i] +
|
||||
frustumPlanes_z_v[i] * frustumPlanes_z_v[i]);
|
||||
frustumPlanes_xy_v[i] *= norm;
|
||||
frustumPlanes_z_v[i] *= norm;
|
||||
@@ -781,29 +781,29 @@ IntersectLightsWithTileMinMax(
|
||||
|
||||
d = maxZ - light_positionView_z;
|
||||
inFrustum = inFrustum && (d >= light_attenuationEndNeg);
|
||||
|
||||
if (!inFrustum)
|
||||
|
||||
if (!inFrustum)
|
||||
continue;
|
||||
|
||||
float light_positionView_x = light_positionView_x_array[lightIndex];
|
||||
float light_positionView_y = light_positionView_y_array[lightIndex];
|
||||
|
||||
d = light_positionView_z * frustumPlanes_z[0] +
|
||||
d = light_positionView_z * frustumPlanes_z[0] +
|
||||
light_positionView_x * frustumPlanes_xy[0];
|
||||
inFrustum = inFrustum && (d >= light_attenuationEndNeg);
|
||||
|
||||
d = light_positionView_z * frustumPlanes_z[1] +
|
||||
d = light_positionView_z * frustumPlanes_z[1] +
|
||||
light_positionView_x * frustumPlanes_xy[1];
|
||||
inFrustum = inFrustum && (d >= light_attenuationEndNeg);
|
||||
|
||||
d = light_positionView_z * frustumPlanes_z[2] +
|
||||
d = light_positionView_z * frustumPlanes_z[2] +
|
||||
light_positionView_y * frustumPlanes_xy[2];
|
||||
inFrustum = inFrustum && (d >= light_attenuationEndNeg);
|
||||
|
||||
d = light_positionView_z * frustumPlanes_z[3] +
|
||||
d = light_positionView_z * frustumPlanes_z[3] +
|
||||
light_positionView_y * frustumPlanes_xy[3];
|
||||
inFrustum = inFrustum && (d >= light_attenuationEndNeg);
|
||||
|
||||
|
||||
// Pack and store intersecting lights
|
||||
if (inFrustum)
|
||||
tileLightIndices[tileNumLights++] = lightIndex;
|
||||
@@ -831,7 +831,7 @@ ShadeDynamicTile(InputData *input, int level, int tileX, int tileY,
|
||||
|
||||
// This is a root tile, so first do a full 6-plane cull
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
__declspec(align(ALIGNMENT_BYTES))
|
||||
__declspec(align(ALIGNMENT_BYTES))
|
||||
#endif
|
||||
int lightIndices[MAX_LIGHTS]
|
||||
#ifndef ISPC_IS_WINDOWS
|
||||
@@ -842,12 +842,12 @@ ShadeDynamicTile(InputData *input, int level, int tileX, int tileY,
|
||||
startX, endX, startY, endY, minZ, maxZ,
|
||||
input->header.framebufferWidth, input->header.framebufferHeight,
|
||||
input->header.cameraProj[0][0], input->header.cameraProj[1][1],
|
||||
MAX_LIGHTS, input->arrays.lightPositionView_x,
|
||||
input->arrays.lightPositionView_y, input->arrays.lightPositionView_z,
|
||||
MAX_LIGHTS, input->arrays.lightPositionView_x,
|
||||
input->arrays.lightPositionView_y, input->arrays.lightPositionView_z,
|
||||
input->arrays.lightAttenuationEnd, lightIndices);
|
||||
|
||||
// Now kick off the recursive process for this tile
|
||||
ShadeDynamicTileRecurse(input, level, tileX, tileY, lightIndices,
|
||||
ShadeDynamicTileRecurse(input, level, tileX, tileY, lightIndices,
|
||||
numLights, framebuffer);
|
||||
}
|
||||
|
||||
@@ -856,10 +856,10 @@ void
|
||||
DispatchDynamicC(InputData *input, Framebuffer *framebuffer)
|
||||
{
|
||||
MinMaxZTree *minMaxZTree = gMinMaxZTree;
|
||||
|
||||
|
||||
// Update min/max Z tree
|
||||
minMaxZTree->Update(input->arrays.zBuffer, input->header.framebufferWidth,
|
||||
input->header.cameraProj[2][2], input->header.cameraProj[3][2],
|
||||
input->header.cameraProj[2][2], input->header.cameraProj[3][2],
|
||||
input->header.cameraNear, input->header.cameraFar);
|
||||
|
||||
int rootLevel = minMaxZTree->Levels() - 1;
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2011, Intel Corporation
|
||||
Copyright (c) 2011-2014, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -28,7 +28,7 @@
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifdef __cilk
|
||||
@@ -104,7 +104,7 @@ public:
|
||||
{
|
||||
mNumTilesX = gBufferWidth / mTileWidth;
|
||||
mNumTilesY = gBufferHeight / mTileHeight;
|
||||
|
||||
|
||||
// Allocate arrays
|
||||
mMinZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16);
|
||||
mMaxZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16);
|
||||
@@ -155,7 +155,7 @@ public:
|
||||
float minZ = mMinZArrays[srcLevel][(srcY) * srcTilesX + (srcX)];
|
||||
float maxZ = mMaxZArrays[srcLevel][(srcY) * srcTilesX + (srcX)];
|
||||
if (srcX + 1 < srcTilesX) {
|
||||
minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY) * srcTilesX +
|
||||
minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY) * srcTilesX +
|
||||
(srcX + 1)]);
|
||||
maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY) * srcTilesX +
|
||||
(srcX + 1)]);
|
||||
@@ -185,7 +185,7 @@ public:
|
||||
lAlignedFree(mMaxZArrays[i]);
|
||||
}
|
||||
lAlignedFree(mMinZArrays);
|
||||
lAlignedFree(mMaxZArrays);
|
||||
lAlignedFree(mMaxZArrays);
|
||||
}
|
||||
|
||||
int Levels() const { return mLevels; }
|
||||
@@ -219,19 +219,19 @@ private:
|
||||
static MinMaxZTreeCilk *gMinMaxZTreeCilk = 0;
|
||||
|
||||
void InitDynamicCilk(InputData *input) {
|
||||
gMinMaxZTreeCilk =
|
||||
gMinMaxZTreeCilk =
|
||||
new MinMaxZTreeCilk(MIN_TILE_WIDTH, MIN_TILE_HEIGHT, DYNAMIC_TREE_LEVELS,
|
||||
input->header.framebufferWidth,
|
||||
input->header.framebufferWidth,
|
||||
input->header.framebufferHeight);
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
ShadeDynamicTileRecurse(InputData *input, int level, int tileX, int tileY,
|
||||
int *lightIndices, int numLights,
|
||||
ShadeDynamicTileRecurse(InputData *input, int level, int tileX, int tileY,
|
||||
int *lightIndices, int numLights,
|
||||
Framebuffer *framebuffer) {
|
||||
const MinMaxZTreeCilk *minMaxZTree = gMinMaxZTreeCilk;
|
||||
|
||||
|
||||
// If we few enough lights or this is the base case (last level), shade
|
||||
// this full tile directly
|
||||
if (level == 0 || numLights < DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE) {
|
||||
@@ -241,19 +241,19 @@ ShadeDynamicTileRecurse(InputData *input, int level, int tileX, int tileY,
|
||||
int startY = tileY * height;
|
||||
int endX = std::min(input->header.framebufferWidth, startX + width);
|
||||
int endY = std::min(input->header.framebufferHeight, startY + height);
|
||||
|
||||
|
||||
// Skip entirely offscreen tiles
|
||||
if (endX > startX && endY > startY) {
|
||||
ispc::ShadeTile(
|
||||
startX, endX, startY, endY,
|
||||
input->header.framebufferWidth, input->header.framebufferHeight,
|
||||
&input->arrays,
|
||||
input->header.cameraProj[0][0], input->header.cameraProj[1][1],
|
||||
input->header.cameraProj[0][0], input->header.cameraProj[1][1],
|
||||
input->header.cameraProj[2][2], input->header.cameraProj[3][2],
|
||||
lightIndices, numLights, VISUALIZE_LIGHT_COUNT,
|
||||
lightIndices, numLights, VISUALIZE_LIGHT_COUNT,
|
||||
framebuffer->r, framebuffer->g, framebuffer->b);
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
// Otherwise, subdivide and 4-way recurse using X and Y splitting planes
|
||||
// Move down a level in the tree
|
||||
@@ -276,9 +276,9 @@ ShadeDynamicTileRecurse(InputData *input, int level, int tileX, int tileY,
|
||||
|
||||
// NOTE: Order is 00, 10, 01, 11
|
||||
// Set defaults up to cull all lights if the tile doesn't exist (offscreen)
|
||||
float minZ[4] = {input->header.cameraFar, input->header.cameraFar,
|
||||
float minZ[4] = {input->header.cameraFar, input->header.cameraFar,
|
||||
input->header.cameraFar, input->header.cameraFar};
|
||||
float maxZ[4] = {input->header.cameraNear, input->header.cameraNear,
|
||||
float maxZ[4] = {input->header.cameraNear, input->header.cameraNear,
|
||||
input->header.cameraNear, input->header.cameraNear};
|
||||
|
||||
minZ[0] = minMaxZTree->MinZ(level, tileX, tileY);
|
||||
@@ -298,7 +298,7 @@ ShadeDynamicTileRecurse(InputData *input, int level, int tileX, int tileY,
|
||||
|
||||
// Cull lights into subtile lists
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
__declspec(align(ALIGNMENT_BYTES))
|
||||
__declspec(align(ALIGNMENT_BYTES))
|
||||
#endif
|
||||
int subtileLightIndices[4][MAX_LIGHTS]
|
||||
#ifndef ISPC_IS_WINDOWS
|
||||
@@ -307,15 +307,15 @@ ShadeDynamicTileRecurse(InputData *input, int level, int tileX, int tileY,
|
||||
;
|
||||
int subtileNumLights[4];
|
||||
ispc::SplitTileMinMax(midX, midY, minZ, maxZ,
|
||||
input->header.framebufferWidth, input->header.framebufferHeight,
|
||||
input->header.framebufferWidth, input->header.framebufferHeight,
|
||||
input->header.cameraProj[0][0], input->header.cameraProj[1][1],
|
||||
lightIndices, numLights, input->arrays.lightPositionView_x,
|
||||
input->arrays.lightPositionView_y, input->arrays.lightPositionView_z,
|
||||
lightIndices, numLights, input->arrays.lightPositionView_x,
|
||||
input->arrays.lightPositionView_y, input->arrays.lightPositionView_z,
|
||||
input->arrays.lightAttenuationEnd,
|
||||
subtileLightIndices[0], MAX_LIGHTS, subtileNumLights);
|
||||
|
||||
|
||||
// Recurse into subtiles
|
||||
_Cilk_spawn ShadeDynamicTileRecurse(input, level, tileX , tileY,
|
||||
_Cilk_spawn ShadeDynamicTileRecurse(input, level, tileX , tileY,
|
||||
subtileLightIndices[0], subtileNumLights[0],
|
||||
framebuffer);
|
||||
_Cilk_spawn ShadeDynamicTileRecurse(input, level, tileX + 1, tileY,
|
||||
@@ -349,7 +349,7 @@ ShadeDynamicTile(InputData *input, int level, int tileX, int tileY,
|
||||
|
||||
// This is a root tile, so first do a full 6-plane cull
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
__declspec(align(ALIGNMENT_BYTES))
|
||||
__declspec(align(ALIGNMENT_BYTES))
|
||||
#endif
|
||||
int lightIndices[MAX_LIGHTS]
|
||||
#ifndef ISPC_IS_WINDOWS
|
||||
@@ -360,12 +360,12 @@ ShadeDynamicTile(InputData *input, int level, int tileX, int tileY,
|
||||
startX, endX, startY, endY, minZ, maxZ,
|
||||
input->header.framebufferWidth, input->header.framebufferHeight,
|
||||
input->header.cameraProj[0][0], input->header.cameraProj[1][1],
|
||||
MAX_LIGHTS, input->arrays.lightPositionView_x,
|
||||
input->arrays.lightPositionView_y, input->arrays.lightPositionView_z,
|
||||
MAX_LIGHTS, input->arrays.lightPositionView_x,
|
||||
input->arrays.lightPositionView_y, input->arrays.lightPositionView_z,
|
||||
input->arrays.lightAttenuationEnd, lightIndices);
|
||||
|
||||
// Now kick off the recursive process for this tile
|
||||
ShadeDynamicTileRecurse(input, level, tileX, tileY, lightIndices,
|
||||
ShadeDynamicTileRecurse(input, level, tileX, tileY, lightIndices,
|
||||
numLights, framebuffer);
|
||||
}
|
||||
|
||||
@@ -374,10 +374,10 @@ void
|
||||
DispatchDynamicCilk(InputData *input, Framebuffer *framebuffer)
|
||||
{
|
||||
MinMaxZTreeCilk *minMaxZTree = gMinMaxZTreeCilk;
|
||||
|
||||
|
||||
// Update min/max Z tree
|
||||
minMaxZTree->Update(input->arrays.zBuffer, input->header.framebufferWidth,
|
||||
input->header.cameraProj[2][2], input->header.cameraProj[3][2],
|
||||
input->header.cameraProj[2][2], input->header.cameraProj[3][2],
|
||||
input->header.cameraNear, input->header.cameraFar);
|
||||
|
||||
// Launch the "root" tiles. Ideally these should at least fill the
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
Copyright (c) 2010-2014, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -28,7 +28,7 @@
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
|
||||
@@ -46,7 +46,7 @@
|
||||
#define int16 short
|
||||
#define int8 char
|
||||
|
||||
__device__ static inline float clamp(float v, float low, float high)
|
||||
__device__ static inline float clamp(float v, float low, float high)
|
||||
{
|
||||
return min(max(v, low), high);
|
||||
}
|
||||
@@ -122,8 +122,8 @@ struct Uniform
|
||||
const int2 idx = get_chunk(i);
|
||||
return __shfl(data[idx.x], idx.y);
|
||||
}
|
||||
|
||||
__device__ inline void set(const bool active, const int i, T value)
|
||||
|
||||
__device__ inline void set(const bool active, const int i, T value)
|
||||
{
|
||||
const int2 idx = get_chunk(i);
|
||||
const int chunkIdx = idx.x;
|
||||
@@ -160,9 +160,9 @@ struct Uniform
|
||||
{
|
||||
return data[i];
|
||||
}
|
||||
|
||||
|
||||
__device__ inline T* get_ptr(const int i) {return &data[i]; }
|
||||
__device__ inline void set(const bool active, const int i, T value)
|
||||
__device__ inline void set(const bool active, const int i, T value)
|
||||
{
|
||||
if (active)
|
||||
data[i] = value;
|
||||
@@ -185,8 +185,8 @@ struct Uniform
|
||||
{
|
||||
return shdata[i];
|
||||
}
|
||||
|
||||
__device__ inline void set(const bool active, const int i, T value)
|
||||
|
||||
__device__ inline void set(const bool active, const int i, T value)
|
||||
{
|
||||
if (active)
|
||||
shdata[i] = value;
|
||||
@@ -264,7 +264,7 @@ static __device__ __forceinline__ int2 warpBinExclusiveScan(const bool p)
|
||||
const int b = __ballot(p);
|
||||
return make_int2(__popc(b), __popc(b & lanemask_lt()));
|
||||
}
|
||||
__device__ static inline
|
||||
__device__ static inline
|
||||
int packed_store_active(bool active, int* ptr, int value)
|
||||
{
|
||||
const int2 res = warpBinExclusiveScan(active);
|
||||
@@ -358,7 +358,7 @@ IntersectLightsWithTileMinMax(
|
||||
{
|
||||
float gBufferScale_x = 0.5f * (float)gBufferWidth;
|
||||
float gBufferScale_y = 0.5f * (float)gBufferHeight;
|
||||
|
||||
|
||||
float frustumPlanes_xy[4] = {
|
||||
-(cameraProj_11 * gBufferScale_x),
|
||||
(cameraProj_11 * gBufferScale_x),
|
||||
@@ -371,7 +371,7 @@ IntersectLightsWithTileMinMax(
|
||||
-tileStartY + gBufferScale_y };
|
||||
|
||||
for ( int i = 0; i < 4; ++i) {
|
||||
float norm = rsqrt(frustumPlanes_xy[i] * frustumPlanes_xy[i] +
|
||||
float norm = rsqrt(frustumPlanes_xy[i] * frustumPlanes_xy[i] +
|
||||
frustumPlanes_z[i] * frustumPlanes_z[i]);
|
||||
frustumPlanes_xy[i] *= norm;
|
||||
frustumPlanes_z[i] *= norm;
|
||||
@@ -393,32 +393,32 @@ IntersectLightsWithTileMinMax(
|
||||
|
||||
d = maxZ - light_positionView_z;
|
||||
inFrustum = inFrustum && (d >= light_attenuationEndNeg);
|
||||
|
||||
|
||||
// This seems better than cif(!inFrustum) ccontinue; here since we
|
||||
// don't actually need to mask the rest of this function - this is
|
||||
// just a greedy early-out. Could also structure all of this as
|
||||
// nested if() statements, but this a bit easier to read
|
||||
if (__ballot(inFrustum) > 0)
|
||||
if (__ballot(inFrustum) > 0)
|
||||
{
|
||||
float light_positionView_x = light_positionView_x_array[lightIndex];
|
||||
float light_positionView_y = light_positionView_y_array[lightIndex];
|
||||
|
||||
d = light_positionView_z * frustumPlanes_z[0] +
|
||||
d = light_positionView_z * frustumPlanes_z[0] +
|
||||
light_positionView_x * frustumPlanes_xy[0];
|
||||
inFrustum = inFrustum && (d >= light_attenuationEndNeg);
|
||||
|
||||
d = light_positionView_z * frustumPlanes_z[1] +
|
||||
d = light_positionView_z * frustumPlanes_z[1] +
|
||||
light_positionView_x * frustumPlanes_xy[1];
|
||||
inFrustum = inFrustum && (d >= light_attenuationEndNeg);
|
||||
|
||||
d = light_positionView_z * frustumPlanes_z[2] +
|
||||
d = light_positionView_z * frustumPlanes_z[2] +
|
||||
light_positionView_y * frustumPlanes_xy[2];
|
||||
inFrustum = inFrustum && (d >= light_attenuationEndNeg);
|
||||
|
||||
d = light_positionView_z * frustumPlanes_z[3] +
|
||||
d = light_positionView_z * frustumPlanes_z[3] +
|
||||
light_positionView_y * frustumPlanes_xy[3];
|
||||
inFrustum = inFrustum && (d >= light_attenuationEndNeg);
|
||||
|
||||
|
||||
// Pack and store intersecting lights
|
||||
const bool active = inFrustum && lightIndex < numLights;
|
||||
#if 0
|
||||
@@ -472,7 +472,7 @@ IntersectLightsWithTile(
|
||||
int32 tileNumLights = IntersectLightsWithTileMinMax(
|
||||
tileStartX, tileEndX, tileStartY, tileEndY, minZ, maxZ,
|
||||
gBufferWidth, gBufferHeight, cameraProj_11, cameraProj_22,
|
||||
MAX_LIGHTS, light_positionView_x_array, light_positionView_y_array,
|
||||
MAX_LIGHTS, light_positionView_x_array, light_positionView_y_array,
|
||||
light_positionView_z_array, light_attenuationEnd_array,
|
||||
tileLightIndices);
|
||||
|
||||
@@ -505,7 +505,7 @@ ShadeTile(
|
||||
unsigned int8 c = (unsigned int8)(min(tileNumLights << 2, 255));
|
||||
for ( int32 y = tileStartY; y < tileEndY; ++y) {
|
||||
for ( int xb = tileStartX ; xb < tileEndX; xb += programCount)
|
||||
{
|
||||
{
|
||||
const int x = xb + programIndex;
|
||||
if (x >= tileEndX) continue;
|
||||
int32 framebufferIndex = (y * gBufferWidth + x);
|
||||
@@ -517,16 +517,16 @@ ShadeTile(
|
||||
} else {
|
||||
float twoOverGBufferWidth = 2.0f / gBufferWidth;
|
||||
float twoOverGBufferHeight = 2.0f / gBufferHeight;
|
||||
|
||||
|
||||
for ( int32 y = tileStartY; y < tileEndY; ++y) {
|
||||
float positionScreen_y = -(((0.5f + y) * twoOverGBufferHeight) - 1.f);
|
||||
|
||||
for ( int xb = tileStartX ; xb < tileEndX; xb += programCount)
|
||||
{
|
||||
{
|
||||
const int x = xb + programIndex;
|
||||
// if (x >= tileEndX) break;
|
||||
int32 gBufferOffset = y * gBufferWidth + x;
|
||||
|
||||
|
||||
// Reconstruct position and (negative) view vector from G-buffer
|
||||
float surface_positionView_x, surface_positionView_y, surface_positionView_z;
|
||||
float Vneg_x, Vneg_y, Vneg_z;
|
||||
@@ -535,19 +535,19 @@ ShadeTile(
|
||||
|
||||
// Compute screen/clip-space position
|
||||
// NOTE: Mind DX11 viewport transform and pixel center!
|
||||
float positionScreen_x = (0.5f + (float)(x)) *
|
||||
float positionScreen_x = (0.5f + (float)(x)) *
|
||||
twoOverGBufferWidth - 1.0f;
|
||||
|
||||
// Unproject depth buffer Z value into view space
|
||||
surface_positionView_z = cameraProj_43 / (z - cameraProj_33);
|
||||
surface_positionView_x = positionScreen_x * surface_positionView_z /
|
||||
surface_positionView_x = positionScreen_x * surface_positionView_z /
|
||||
cameraProj_11;
|
||||
surface_positionView_y = positionScreen_y * surface_positionView_z /
|
||||
surface_positionView_y = positionScreen_y * surface_positionView_z /
|
||||
cameraProj_22;
|
||||
|
||||
|
||||
// We actually end up with a vector pointing *at* the
|
||||
// surface (i.e. the negative view vector)
|
||||
normalize3(surface_positionView_x, surface_positionView_y,
|
||||
normalize3(surface_positionView_x, surface_positionView_y,
|
||||
surface_positionView_z, Vneg_x, Vneg_y, Vneg_z);
|
||||
|
||||
// Reconstruct normal from G-buffer
|
||||
@@ -556,51 +556,51 @@ ShadeTile(
|
||||
float normal_x = __half2float(inputData.normalEncoded_x[gBufferOffset]);
|
||||
float normal_y = __half2float(inputData.normalEncoded_y[gBufferOffset]);
|
||||
asm("// half2float //");
|
||||
|
||||
|
||||
float f = (normal_x - normal_x * normal_x) + (normal_y - normal_y * normal_y);
|
||||
float m = sqrt(4.0f * f - 1.0f);
|
||||
|
||||
|
||||
surface_normal_x = m * (4.0f * normal_x - 2.0f);
|
||||
surface_normal_y = m * (4.0f * normal_y - 2.0f);
|
||||
surface_normal_z = 3.0f - 8.0f * f;
|
||||
|
||||
// Load other G-buffer parameters
|
||||
float surface_specularAmount =
|
||||
float surface_specularAmount =
|
||||
__half2float(inputData.specularAmount[gBufferOffset]);
|
||||
float surface_specularPower =
|
||||
float surface_specularPower =
|
||||
__half2float(inputData.specularPower[gBufferOffset]);
|
||||
float surface_albedo_x = Unorm8ToFloat32(inputData.albedo_x[gBufferOffset]);
|
||||
float surface_albedo_y = Unorm8ToFloat32(inputData.albedo_y[gBufferOffset]);
|
||||
float surface_albedo_z = Unorm8ToFloat32(inputData.albedo_z[gBufferOffset]);
|
||||
|
||||
|
||||
float lit_x = 0.0f;
|
||||
float lit_y = 0.0f;
|
||||
float lit_z = 0.0f;
|
||||
for ( int32 tileLightIndex = 0; tileLightIndex < tileNumLights;
|
||||
for ( int32 tileLightIndex = 0; tileLightIndex < tileNumLights;
|
||||
++tileLightIndex) {
|
||||
int32 lightIndex = tileLightIndices.get(tileLightIndex);
|
||||
|
||||
|
||||
// Gather light data relevant to initial culling
|
||||
float light_positionView_x =
|
||||
float light_positionView_x =
|
||||
__ldg(&inputData.lightPositionView_x[lightIndex]);
|
||||
float light_positionView_y =
|
||||
float light_positionView_y =
|
||||
__ldg(&inputData.lightPositionView_y[lightIndex]);
|
||||
float light_positionView_z =
|
||||
float light_positionView_z =
|
||||
__ldg(&inputData.lightPositionView_z[lightIndex]);
|
||||
float light_attenuationEnd =
|
||||
float light_attenuationEnd =
|
||||
__ldg(&inputData.lightAttenuationEnd[lightIndex]);
|
||||
|
||||
|
||||
// Compute light vector
|
||||
float L_x = light_positionView_x - surface_positionView_x;
|
||||
float L_y = light_positionView_y - surface_positionView_y;
|
||||
float L_z = light_positionView_z - surface_positionView_z;
|
||||
|
||||
float distanceToLight2 = dot3(L_x, L_y, L_z, L_x, L_y, L_z);
|
||||
|
||||
|
||||
// Clip at end of attenuation
|
||||
float light_attenutaionEnd2 = light_attenuationEnd * light_attenuationEnd;
|
||||
|
||||
if (distanceToLight2 < light_attenutaionEnd2) {
|
||||
if (distanceToLight2 < light_attenutaionEnd2) {
|
||||
float distanceToLight = sqrt(distanceToLight2);
|
||||
|
||||
// HLSL "rcp" is allowed to be fairly inaccurate
|
||||
@@ -610,12 +610,12 @@ ShadeTile(
|
||||
L_z *= distanceToLightRcp;
|
||||
|
||||
// Start computing brdf
|
||||
float NdotL = dot3(surface_normal_x, surface_normal_y,
|
||||
float NdotL = dot3(surface_normal_x, surface_normal_y,
|
||||
surface_normal_z, L_x, L_y, L_z);
|
||||
|
||||
|
||||
// Clip back facing
|
||||
if (NdotL > 0.0f) {
|
||||
float light_attenuationBegin =
|
||||
float light_attenuationBegin =
|
||||
inputData.lightAttenuationBegin[lightIndex];
|
||||
|
||||
// Light distance attenuation (linstep)
|
||||
@@ -627,19 +627,19 @@ ShadeTile(
|
||||
float H_y = (L_y - Vneg_y);
|
||||
float H_z = (L_z - Vneg_z);
|
||||
normalize3(H_x, H_y, H_z, H_x, H_y, H_z);
|
||||
|
||||
float NdotH = dot3(surface_normal_x, surface_normal_y,
|
||||
|
||||
float NdotH = dot3(surface_normal_x, surface_normal_y,
|
||||
surface_normal_z, H_x, H_y, H_z);
|
||||
NdotH = max(NdotH, 0.0f);
|
||||
|
||||
float specular = pow(NdotH, surface_specularPower);
|
||||
float specularNorm = (surface_specularPower + 2.0f) *
|
||||
float specularNorm = (surface_specularPower + 2.0f) *
|
||||
(1.0f / 8.0f);
|
||||
float specularContrib = surface_specularAmount *
|
||||
float specularContrib = surface_specularAmount *
|
||||
specularNorm * specular;
|
||||
|
||||
float k = attenuation * NdotL * (1.0f + specularContrib);
|
||||
|
||||
|
||||
float light_color_x = inputData.lightColor_x[lightIndex];
|
||||
float light_color_y = inputData.lightColor_y[lightIndex];
|
||||
float light_color_z = inputData.lightColor_z[lightIndex];
|
||||
@@ -663,7 +663,7 @@ ShadeTile(
|
||||
lit_x = pow(clamp(lit_x, 0.0f, 1.0f), gamma);
|
||||
lit_y = pow(clamp(lit_y, 0.0f, 1.0f), gamma);
|
||||
lit_z = pow(clamp(lit_z, 0.0f, 1.0f), gamma);
|
||||
|
||||
|
||||
framebuffer_r[gBufferOffset] = Float32ToUnorm8(lit_x);
|
||||
framebuffer_g[gBufferOffset] = Float32ToUnorm8(lit_y);
|
||||
framebuffer_b[gBufferOffset] = Float32ToUnorm8(lit_z);
|
||||
@@ -707,8 +707,8 @@ RenderTile( int num_groups_x, int num_groups_y,
|
||||
// Light intersection: figure out which lights illuminate this tile.
|
||||
Uniform<int,MAX_LIGHTS> tileLightIndices; // Light list for the tile
|
||||
#if 1
|
||||
int numTileLights =
|
||||
IntersectLightsWithTile(tile_start_x, tile_end_x,
|
||||
int numTileLights =
|
||||
IntersectLightsWithTile(tile_start_x, tile_end_x,
|
||||
tile_start_y, tile_end_y,
|
||||
framebufferWidth, framebufferHeight,
|
||||
inputData.zBuffer,
|
||||
@@ -716,9 +716,9 @@ RenderTile( int num_groups_x, int num_groups_y,
|
||||
cameraProj_22, cameraProj_32,
|
||||
inputHeader.cameraNear, inputHeader.cameraFar,
|
||||
MAX_LIGHTS,
|
||||
inputData.lightPositionView_x,
|
||||
inputData.lightPositionView_y,
|
||||
inputData.lightPositionView_z,
|
||||
inputData.lightPositionView_x,
|
||||
inputData.lightPositionView_y,
|
||||
inputData.lightPositionView_z,
|
||||
inputData.lightAttenuationEnd,
|
||||
tileLightIndices);
|
||||
|
||||
@@ -726,7 +726,7 @@ RenderTile( int num_groups_x, int num_groups_y,
|
||||
ShadeTile(tile_start_x, tile_end_x, tile_start_y, tile_end_y,
|
||||
framebufferWidth, framebufferHeight, inputData,
|
||||
cameraProj_00, cameraProj_11, cameraProj_22, cameraProj_32,
|
||||
tileLightIndices, numTileLights, visualizeLightCount,
|
||||
tileLightIndices, numTileLights, visualizeLightCount,
|
||||
framebuffer_r, framebuffer_g, framebuffer_b);
|
||||
#endif
|
||||
}
|
||||
@@ -745,9 +745,9 @@ RenderStatic___export( InputHeader inputHeaderPtr[],
|
||||
const InputDataArrays inputData = *inputDataPtr;
|
||||
|
||||
|
||||
int num_groups_x = (inputHeader.framebufferWidth +
|
||||
int num_groups_x = (inputHeader.framebufferWidth +
|
||||
MIN_TILE_WIDTH - 1) / MIN_TILE_WIDTH;
|
||||
int num_groups_y = (inputHeader.framebufferHeight +
|
||||
int num_groups_y = (inputHeader.framebufferHeight +
|
||||
MIN_TILE_HEIGHT - 1) / MIN_TILE_HEIGHT;
|
||||
int num_groups = num_groups_x * num_groups_y;
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
Copyright (c) 2010-2014, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -28,7 +28,7 @@
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "deferred.h"
|
||||
@@ -142,11 +142,11 @@ ComputeZBounds(
|
||||
maxZ = reduce_max(laneMaxZ);
|
||||
}
|
||||
|
||||
#if 1
|
||||
#if 1
|
||||
inline
|
||||
#endif
|
||||
#ifndef __NVPTX__
|
||||
export
|
||||
export
|
||||
#endif
|
||||
uniform int32
|
||||
IntersectLightsWithTileMinMax(
|
||||
@@ -171,7 +171,7 @@ IntersectLightsWithTileMinMax(
|
||||
{
|
||||
uniform float gBufferScale_x = 0.5f * (float)gBufferWidth;
|
||||
uniform float gBufferScale_y = 0.5f * (float)gBufferHeight;
|
||||
|
||||
|
||||
uniform_t float frustumPlanes_xy[4] = {
|
||||
-(cameraProj_11 * gBufferScale_x),
|
||||
(cameraProj_11 * gBufferScale_x),
|
||||
@@ -184,7 +184,7 @@ IntersectLightsWithTileMinMax(
|
||||
-tileStartY + gBufferScale_y };
|
||||
|
||||
for (uniform int i = 0; i < 4; ++i) {
|
||||
uniform_t float norm = rsqrt(frustumPlanes_xy[i] * frustumPlanes_xy[i] +
|
||||
uniform_t float norm = rsqrt(frustumPlanes_xy[i] * frustumPlanes_xy[i] +
|
||||
frustumPlanes_z[i] * frustumPlanes_z[i]);
|
||||
frustumPlanes_xy[i] *= norm;
|
||||
frustumPlanes_z[i] *= norm;
|
||||
@@ -202,7 +202,7 @@ IntersectLightsWithTileMinMax(
|
||||
|
||||
d = maxZ - light_positionView_z;
|
||||
inFrustum = inFrustum && (d >= light_attenuationEndNeg);
|
||||
|
||||
|
||||
// This seems better than cif(!inFrustum) ccontinue; here since we
|
||||
// don't actually need to mask the rest of this function - this is
|
||||
// just a greedy early-out. Could also structure all of this as
|
||||
@@ -211,26 +211,26 @@ IntersectLightsWithTileMinMax(
|
||||
float light_positionView_x = light_positionView_x_array[lightIndex];
|
||||
float light_positionView_y = light_positionView_y_array[lightIndex];
|
||||
|
||||
d = light_positionView_z * frustumPlanes_z[0] +
|
||||
d = light_positionView_z * frustumPlanes_z[0] +
|
||||
light_positionView_x * frustumPlanes_xy[0];
|
||||
inFrustum = inFrustum && (d >= light_attenuationEndNeg);
|
||||
|
||||
d = light_positionView_z * frustumPlanes_z[1] +
|
||||
d = light_positionView_z * frustumPlanes_z[1] +
|
||||
light_positionView_x * frustumPlanes_xy[1];
|
||||
inFrustum = inFrustum && (d >= light_attenuationEndNeg);
|
||||
|
||||
d = light_positionView_z * frustumPlanes_z[2] +
|
||||
d = light_positionView_z * frustumPlanes_z[2] +
|
||||
light_positionView_y * frustumPlanes_xy[2];
|
||||
inFrustum = inFrustum && (d >= light_attenuationEndNeg);
|
||||
|
||||
d = light_positionView_z * frustumPlanes_z[3] +
|
||||
d = light_positionView_z * frustumPlanes_z[3] +
|
||||
light_positionView_y * frustumPlanes_xy[3];
|
||||
inFrustum = inFrustum && (d >= light_attenuationEndNeg);
|
||||
|
||||
#if 0
|
||||
|
||||
#if 0
|
||||
// Pack and store intersecting lights
|
||||
cif (inFrustum) {
|
||||
tileNumLights += packed_store_active(&tileLightIndices[tileNumLights],
|
||||
tileNumLights += packed_store_active(&tileLightIndices[tileNumLights],
|
||||
lightIndex);
|
||||
}
|
||||
#else
|
||||
@@ -245,7 +245,7 @@ IntersectLightsWithTileMinMax(
|
||||
}
|
||||
|
||||
|
||||
#if 1
|
||||
#if 1
|
||||
inline
|
||||
#endif
|
||||
static uniform int32
|
||||
@@ -277,7 +277,7 @@ IntersectLightsWithTile(
|
||||
uniform int32 tileNumLights = IntersectLightsWithTileMinMax(
|
||||
tileStartX, tileEndX, tileStartY, tileEndY, minZ, maxZ,
|
||||
gBufferWidth, gBufferHeight, cameraProj_11, cameraProj_22,
|
||||
MAX_LIGHTS, light_positionView_x_array, light_positionView_y_array,
|
||||
MAX_LIGHTS, light_positionView_x_array, light_positionView_y_array,
|
||||
light_positionView_z_array, light_attenuationEnd_array,
|
||||
tileLightIndices);
|
||||
|
||||
@@ -285,7 +285,7 @@ IntersectLightsWithTile(
|
||||
}
|
||||
|
||||
|
||||
#if 1
|
||||
#if 1
|
||||
inline
|
||||
#endif
|
||||
#ifndef __NVPTX__
|
||||
@@ -324,13 +324,13 @@ ShadeTile(
|
||||
} else {
|
||||
uniform float twoOverGBufferWidth = 2.0f / gBufferWidth;
|
||||
uniform float twoOverGBufferHeight = 2.0f / gBufferHeight;
|
||||
|
||||
|
||||
for (uniform int32 y = tileStartY; y < tileEndY; ++y) {
|
||||
uniform float positionScreen_y = -(((0.5f + y) * twoOverGBufferHeight) - 1.f);
|
||||
|
||||
foreach (x = tileStartX ... tileEndX) {
|
||||
int32 gBufferOffset = y * gBufferWidth + x;
|
||||
|
||||
|
||||
// Reconstruct position and (negative) view vector from G-buffer
|
||||
float surface_positionView_x, surface_positionView_y, surface_positionView_z;
|
||||
float Vneg_x, Vneg_y, Vneg_z;
|
||||
@@ -339,70 +339,70 @@ ShadeTile(
|
||||
|
||||
// Compute screen/clip-space position
|
||||
// NOTE: Mind DX11 viewport transform and pixel center!
|
||||
float positionScreen_x = (0.5f + (float)(x)) *
|
||||
float positionScreen_x = (0.5f + (float)(x)) *
|
||||
twoOverGBufferWidth - 1.0f;
|
||||
|
||||
// Unproject depth buffer Z value into view space
|
||||
surface_positionView_z = cameraProj_43 / (z - cameraProj_33);
|
||||
surface_positionView_x = positionScreen_x * surface_positionView_z /
|
||||
surface_positionView_x = positionScreen_x * surface_positionView_z /
|
||||
cameraProj_11;
|
||||
surface_positionView_y = positionScreen_y * surface_positionView_z /
|
||||
surface_positionView_y = positionScreen_y * surface_positionView_z /
|
||||
cameraProj_22;
|
||||
|
||||
|
||||
// We actually end up with a vector pointing *at* the
|
||||
// surface (i.e. the negative view vector)
|
||||
normalize3(surface_positionView_x, surface_positionView_y,
|
||||
normalize3(surface_positionView_x, surface_positionView_y,
|
||||
surface_positionView_z, Vneg_x, Vneg_y, Vneg_z);
|
||||
|
||||
// Reconstruct normal from G-buffer
|
||||
float surface_normal_x, surface_normal_y, surface_normal_z;
|
||||
float normal_x = half_to_float(inputData.normalEncoded_x[gBufferOffset]);
|
||||
float normal_y = half_to_float(inputData.normalEncoded_y[gBufferOffset]);
|
||||
|
||||
|
||||
float f = (normal_x - normal_x * normal_x) + (normal_y - normal_y * normal_y);
|
||||
float m = sqrt(4.0f * f - 1.0f);
|
||||
|
||||
|
||||
surface_normal_x = m * (4.0f * normal_x - 2.0f);
|
||||
surface_normal_y = m * (4.0f * normal_y - 2.0f);
|
||||
surface_normal_z = 3.0f - 8.0f * f;
|
||||
|
||||
// Load other G-buffer parameters
|
||||
float surface_specularAmount =
|
||||
float surface_specularAmount =
|
||||
half_to_float(inputData.specularAmount[gBufferOffset]);
|
||||
float surface_specularPower =
|
||||
float surface_specularPower =
|
||||
half_to_float(inputData.specularPower[gBufferOffset]);
|
||||
float surface_albedo_x = Unorm8ToFloat32(inputData.albedo_x[gBufferOffset]);
|
||||
float surface_albedo_y = Unorm8ToFloat32(inputData.albedo_y[gBufferOffset]);
|
||||
float surface_albedo_z = Unorm8ToFloat32(inputData.albedo_z[gBufferOffset]);
|
||||
|
||||
|
||||
float lit_x = 0.0f;
|
||||
float lit_y = 0.0f;
|
||||
float lit_z = 0.0f;
|
||||
for (uniform int32 tileLightIndex = 0; tileLightIndex < tileNumLights;
|
||||
for (uniform int32 tileLightIndex = 0; tileLightIndex < tileNumLights;
|
||||
++tileLightIndex) {
|
||||
uniform int32 lightIndex = tileLightIndices[tileLightIndex];
|
||||
|
||||
|
||||
// Gather light data relevant to initial culling
|
||||
uniform float light_positionView_x =
|
||||
uniform float light_positionView_x =
|
||||
inputData.lightPositionView_x[lightIndex];
|
||||
uniform float light_positionView_y =
|
||||
uniform float light_positionView_y =
|
||||
inputData.lightPositionView_y[lightIndex];
|
||||
uniform float light_positionView_z =
|
||||
uniform float light_positionView_z =
|
||||
inputData.lightPositionView_z[lightIndex];
|
||||
uniform float light_attenuationEnd =
|
||||
uniform float light_attenuationEnd =
|
||||
inputData.lightAttenuationEnd[lightIndex];
|
||||
|
||||
|
||||
// Compute light vector
|
||||
float L_x = light_positionView_x - surface_positionView_x;
|
||||
float L_y = light_positionView_y - surface_positionView_y;
|
||||
float L_z = light_positionView_z - surface_positionView_z;
|
||||
|
||||
float distanceToLight2 = dot3(L_x, L_y, L_z, L_x, L_y, L_z);
|
||||
|
||||
|
||||
// Clip at end of attenuation
|
||||
float light_attenutaionEnd2 = light_attenuationEnd * light_attenuationEnd;
|
||||
|
||||
cif (distanceToLight2 < light_attenutaionEnd2) {
|
||||
cif (distanceToLight2 < light_attenutaionEnd2) {
|
||||
float distanceToLight = sqrt(distanceToLight2);
|
||||
|
||||
// HLSL "rcp" is allowed to be fairly inaccurate
|
||||
@@ -412,12 +412,12 @@ ShadeTile(
|
||||
L_z *= distanceToLightRcp;
|
||||
|
||||
// Start computing brdf
|
||||
float NdotL = dot3(surface_normal_x, surface_normal_y,
|
||||
float NdotL = dot3(surface_normal_x, surface_normal_y,
|
||||
surface_normal_z, L_x, L_y, L_z);
|
||||
|
||||
|
||||
// Clip back facing
|
||||
cif (NdotL > 0.0f) {
|
||||
uniform float light_attenuationBegin =
|
||||
uniform float light_attenuationBegin =
|
||||
inputData.lightAttenuationBegin[lightIndex];
|
||||
|
||||
// Light distance attenuation (linstep)
|
||||
@@ -429,19 +429,19 @@ ShadeTile(
|
||||
float H_y = (L_y - Vneg_y);
|
||||
float H_z = (L_z - Vneg_z);
|
||||
normalize3(H_x, H_y, H_z, H_x, H_y, H_z);
|
||||
|
||||
float NdotH = dot3(surface_normal_x, surface_normal_y,
|
||||
|
||||
float NdotH = dot3(surface_normal_x, surface_normal_y,
|
||||
surface_normal_z, H_x, H_y, H_z);
|
||||
NdotH = max(NdotH, 0.0f);
|
||||
|
||||
float specular = pow(NdotH, surface_specularPower);
|
||||
float specularNorm = (surface_specularPower + 2.0f) *
|
||||
float specularNorm = (surface_specularPower + 2.0f) *
|
||||
(1.0f / 8.0f);
|
||||
float specularContrib = surface_specularAmount *
|
||||
float specularContrib = surface_specularAmount *
|
||||
specularNorm * specular;
|
||||
|
||||
float k = attenuation * NdotL * (1.0f + specularContrib);
|
||||
|
||||
|
||||
uniform float light_color_x = inputData.lightColor_x[lightIndex];
|
||||
uniform float light_color_y = inputData.lightColor_y[lightIndex];
|
||||
uniform float light_color_z = inputData.lightColor_z[lightIndex];
|
||||
@@ -465,7 +465,7 @@ ShadeTile(
|
||||
lit_x = pow(clamp(lit_x, 0.0f, 1.0f), gamma);
|
||||
lit_y = pow(clamp(lit_y, 0.0f, 1.0f), gamma);
|
||||
lit_z = pow(clamp(lit_z, 0.0f, 1.0f), gamma);
|
||||
|
||||
|
||||
framebuffer_r[gBufferOffset] = Float32ToUnorm8(lit_x);
|
||||
framebuffer_g[gBufferOffset] = Float32ToUnorm8(lit_y);
|
||||
framebuffer_b[gBufferOffset] = Float32ToUnorm8(lit_z);
|
||||
@@ -512,8 +512,8 @@ RenderTile(uniform int num_groups_x, uniform int num_groups_y,
|
||||
#else /* shared memory doesn't full work... why? */
|
||||
uniform int tileLightIndices[MAX_LIGHTS]; // Light list for the tile
|
||||
#endif
|
||||
uniform int numTileLights =
|
||||
IntersectLightsWithTile(tile_start_x, tile_end_x,
|
||||
uniform int numTileLights =
|
||||
IntersectLightsWithTile(tile_start_x, tile_end_x,
|
||||
tile_start_y, tile_end_y,
|
||||
framebufferWidth, framebufferHeight,
|
||||
inputData.zBuffer,
|
||||
@@ -521,9 +521,9 @@ RenderTile(uniform int num_groups_x, uniform int num_groups_y,
|
||||
cameraProj_22, cameraProj_32,
|
||||
inputHeader.cameraNear, inputHeader.cameraFar,
|
||||
MAX_LIGHTS,
|
||||
inputData.lightPositionView_x,
|
||||
inputData.lightPositionView_y,
|
||||
inputData.lightPositionView_z,
|
||||
inputData.lightPositionView_x,
|
||||
inputData.lightPositionView_y,
|
||||
inputData.lightPositionView_z,
|
||||
inputData.lightAttenuationEnd,
|
||||
tileLightIndices);
|
||||
|
||||
@@ -531,7 +531,7 @@ RenderTile(uniform int num_groups_x, uniform int num_groups_y,
|
||||
ShadeTile(tile_start_x, tile_end_x, tile_start_y, tile_end_y,
|
||||
framebufferWidth, framebufferHeight, inputData,
|
||||
cameraProj_00, cameraProj_11, cameraProj_22, cameraProj_32,
|
||||
tileLightIndices, numTileLights, visualizeLightCount,
|
||||
tileLightIndices, numTileLights, visualizeLightCount,
|
||||
framebuffer_r, framebuffer_g, framebuffer_b);
|
||||
#ifdef MALLOC
|
||||
delete tileLightIndices;
|
||||
@@ -551,9 +551,9 @@ RenderStatic(uniform InputHeader inputHeaderPtr[],
|
||||
uniform InputHeader inputHeader = *inputHeaderPtr;
|
||||
uniform InputDataArrays inputData = *inputDataPtr;
|
||||
|
||||
uniform int num_groups_x = (inputHeader.framebufferWidth +
|
||||
uniform int num_groups_x = (inputHeader.framebufferWidth +
|
||||
MIN_TILE_WIDTH - 1) / MIN_TILE_WIDTH;
|
||||
uniform int num_groups_y = (inputHeader.framebufferHeight +
|
||||
uniform int num_groups_y = (inputHeader.framebufferHeight +
|
||||
MIN_TILE_HEIGHT - 1) / MIN_TILE_HEIGHT;
|
||||
uniform int num_groups = num_groups_x * num_groups_y;
|
||||
|
||||
@@ -627,16 +627,16 @@ SplitTileMinMax(
|
||||
{
|
||||
uniform float gBufferScale_x = 0.5f * (float)gBufferWidth;
|
||||
uniform float gBufferScale_y = 0.5f * (float)gBufferHeight;
|
||||
|
||||
|
||||
uniform_t float frustumPlanes_xy[2] = { -(cameraProj_11 * gBufferScale_x),
|
||||
(cameraProj_22 * gBufferScale_y) };
|
||||
uniform_t float frustumPlanes_z[2] = { tileMidX - gBufferScale_x,
|
||||
tileMidY - gBufferScale_y };
|
||||
|
||||
// Normalize
|
||||
uniform_t float norm[2] = { rsqrt(frustumPlanes_xy[0] * frustumPlanes_xy[0] +
|
||||
uniform_t float norm[2] = { rsqrt(frustumPlanes_xy[0] * frustumPlanes_xy[0] +
|
||||
frustumPlanes_z[0] * frustumPlanes_z[0]),
|
||||
rsqrt(frustumPlanes_xy[1] * frustumPlanes_xy[1] +
|
||||
rsqrt(frustumPlanes_xy[1] * frustumPlanes_xy[1] +
|
||||
frustumPlanes_z[1] * frustumPlanes_z[1]) };
|
||||
frustumPlanes_xy[0] *= norm[0];
|
||||
frustumPlanes_xy[1] *= norm[1];
|
||||
@@ -658,23 +658,23 @@ SplitTileMinMax(
|
||||
float light_positionView_z = light_positionView_z_array[lightIndex];
|
||||
float light_attenuationEnd = light_attenuationEnd_array[lightIndex];
|
||||
float light_attenuationEndNeg = -light_attenuationEnd;
|
||||
|
||||
|
||||
// Test lights again subtile z bounds
|
||||
bool inFrustum[4];
|
||||
inFrustum[0] = (light_positionView_z - subtileMinZ[0] >= light_attenuationEndNeg) &&
|
||||
(subtileMaxZ[0] - light_positionView_z >= light_attenuationEndNeg);
|
||||
inFrustum[1] = (light_positionView_z - subtileMinZ[1] >= light_attenuationEndNeg) &&
|
||||
inFrustum[1] = (light_positionView_z - subtileMinZ[1] >= light_attenuationEndNeg) &&
|
||||
(subtileMaxZ[1] - light_positionView_z >= light_attenuationEndNeg);
|
||||
inFrustum[2] = (light_positionView_z - subtileMinZ[2] >= light_attenuationEndNeg) &&
|
||||
inFrustum[2] = (light_positionView_z - subtileMinZ[2] >= light_attenuationEndNeg) &&
|
||||
(subtileMaxZ[2] - light_positionView_z >= light_attenuationEndNeg);
|
||||
inFrustum[3] = (light_positionView_z - subtileMinZ[3] >= light_attenuationEndNeg) &&
|
||||
inFrustum[3] = (light_positionView_z - subtileMinZ[3] >= light_attenuationEndNeg) &&
|
||||
(subtileMaxZ[3] - light_positionView_z >= light_attenuationEndNeg);
|
||||
|
||||
float dx = light_positionView_z * frustumPlanes_z[0] +
|
||||
float dx = light_positionView_z * frustumPlanes_z[0] +
|
||||
light_positionView_x * frustumPlanes_xy[0];
|
||||
float dy = light_positionView_z * frustumPlanes_z[1] +
|
||||
light_positionView_y * frustumPlanes_xy[1];
|
||||
|
||||
|
||||
cif (abs(dx) > light_attenuationEnd) {
|
||||
bool positiveX = dx > 0.0f;
|
||||
inFrustum[0] = inFrustum[0] && positiveX; // 00 subtile
|
||||
@@ -693,20 +693,20 @@ SplitTileMinMax(
|
||||
// Pack and store intersecting lights
|
||||
// TODO: Experiment with a loop here instead
|
||||
cif (inFrustum[0])
|
||||
subtileLightOffset[0] +=
|
||||
subtileLightOffset[0] +=
|
||||
packed_store_active(&subtileIndices[subtileLightOffset[0]],
|
||||
lightIndex);
|
||||
cif (inFrustum[1])
|
||||
subtileLightOffset[1] +=
|
||||
subtileLightOffset[1] +=
|
||||
packed_store_active(&subtileIndices[subtileLightOffset[1]],
|
||||
lightIndex);
|
||||
cif (inFrustum[2])
|
||||
subtileLightOffset[2] +=
|
||||
packed_store_active(&subtileIndices[subtileLightOffset[2]],
|
||||
subtileLightOffset[2] +=
|
||||
packed_store_active(&subtileIndices[subtileLightOffset[2]],
|
||||
lightIndex);
|
||||
cif (inFrustum[3])
|
||||
subtileLightOffset[3] +=
|
||||
packed_store_active(&subtileIndices[subtileLightOffset[3]],
|
||||
subtileLightOffset[3] +=
|
||||
packed_store_active(&subtileIndices[subtileLightOffset[3]],
|
||||
lightIndex);
|
||||
}
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2011, Intel Corporation
|
||||
Copyright (c) 2011-2014, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -28,7 +28,7 @@
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifdef _MSC_VER
|
||||
|
||||
@@ -5,8 +5,8 @@ uniform int gcd(uniform int a, uniform int b)
|
||||
{
|
||||
while ( a != 0 )
|
||||
{
|
||||
uniform int c = a;
|
||||
a = b%a;
|
||||
uniform int c = a;
|
||||
a = b%a;
|
||||
b = c;
|
||||
}
|
||||
return b;
|
||||
@@ -32,7 +32,7 @@ int __sj(const int i, const uniform int j, const uniform int m, const uniform in
|
||||
|
||||
|
||||
#if 0
|
||||
static inline
|
||||
static inline
|
||||
void transpose_serial(uniform T A[], const uniform int m, const uniform int n)
|
||||
{
|
||||
const uniform int tmpSize = max(m,n) * programCount;
|
||||
@@ -90,7 +90,7 @@ static uniform int * uniform joverb = NULL;
|
||||
static uniform int * uniform iovera = NULL;
|
||||
static uniform int a,b,c;
|
||||
|
||||
static
|
||||
static
|
||||
void transpose_init(const uniform int m, const uniform int n, const uniform int nTask)
|
||||
{
|
||||
joverb = uniform new uniform int[n];
|
||||
@@ -105,14 +105,14 @@ void transpose_init(const uniform int m, const uniform int n, const uniform int
|
||||
iovera[i] = i/a;
|
||||
}
|
||||
|
||||
static
|
||||
static
|
||||
void transpose_finalize()
|
||||
{
|
||||
delete iovera;
|
||||
delete joverb;
|
||||
}
|
||||
|
||||
task
|
||||
task
|
||||
void transpose_step1(uniform T A[], const uniform int m, const uniform int n)
|
||||
{
|
||||
const uniform int n_per_task = (n + taskCount - 1)/taskCount;
|
||||
@@ -140,7 +140,7 @@ void transpose_step2(uniform T A[], const uniform int m, const uniform int n)
|
||||
const uniform int m_per_task = (m + taskCount - 1)/taskCount;
|
||||
const uniform int mibeg = taskIndex * m_per_task;
|
||||
const uniform int miend = min(mibeg + m_per_task, m);
|
||||
|
||||
|
||||
uniform T * uniform tmp = uniform new uniform T[n*programCount];
|
||||
|
||||
uniform T (*uniform tmp2D)[programCount] = (uniform T (*uniform)[programCount])tmp;
|
||||
@@ -161,7 +161,7 @@ void transpose_step3(uniform T A[], const uniform int m, const uniform int n)
|
||||
const uniform int n_per_task = (n + taskCount - 1)/taskCount;
|
||||
const uniform int nibeg = taskIndex * n_per_task;
|
||||
const uniform int niend = min(nibeg + n_per_task, n);
|
||||
|
||||
|
||||
uniform T * uniform tmp = uniform new uniform T[m];
|
||||
|
||||
for (uniform int j = nibeg; j < niend; j++)
|
||||
@@ -176,7 +176,7 @@ void transpose_step3(uniform T A[], const uniform int m, const uniform int n)
|
||||
delete tmp;
|
||||
}
|
||||
|
||||
export
|
||||
export
|
||||
void transpose(uniform T A[], const uniform int m, const uniform int n)
|
||||
{
|
||||
#if 0
|
||||
@@ -187,7 +187,7 @@ void transpose(uniform T A[], const uniform int m, const uniform int n)
|
||||
|
||||
launch [nTask] transpose_step1(A, m, n);
|
||||
sync;
|
||||
|
||||
|
||||
launch [nTask] transpose_step2(A, m, n);
|
||||
sync;
|
||||
|
||||
|
||||
@@ -71,7 +71,7 @@ int main (int argc, char *argv[])
|
||||
valsGld[i] = valsSrc[i];
|
||||
}
|
||||
delete keys;
|
||||
|
||||
|
||||
ispcSetMallocHeapLimit(1024*1024*1024);
|
||||
|
||||
ispc::openMergeSort();
|
||||
@@ -115,7 +115,7 @@ int main (int argc, char *argv[])
|
||||
}
|
||||
printf("\n---\n");
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
|
||||
std::sort(keysGld, keysGld + n);
|
||||
|
||||
@@ -30,9 +30,9 @@ int nextPowerOfTwo(int x)
|
||||
|
||||
__device__ static inline
|
||||
int binarySearchInclusiveRanks(
|
||||
const int val,
|
||||
const int val,
|
||||
uniform int *data,
|
||||
const int L,
|
||||
const int L,
|
||||
int stride)
|
||||
{
|
||||
if (L == 0)
|
||||
@@ -52,9 +52,9 @@ int binarySearchInclusiveRanks(
|
||||
|
||||
__device__ static inline
|
||||
int binarySearchExclusiveRanks(
|
||||
const int val,
|
||||
uniform int *data,
|
||||
const int L,
|
||||
const int val,
|
||||
uniform int *data,
|
||||
const int L,
|
||||
int stride)
|
||||
{
|
||||
if (L == 0)
|
||||
@@ -74,9 +74,9 @@ int binarySearchExclusiveRanks(
|
||||
|
||||
__device__ static inline
|
||||
int binarySearchInclusive(
|
||||
const Key_t val,
|
||||
const Key_t val,
|
||||
uniform Key_t *data,
|
||||
const int L,
|
||||
const int L,
|
||||
int stride)
|
||||
{
|
||||
if (L == 0)
|
||||
@@ -96,9 +96,9 @@ int binarySearchInclusive(
|
||||
|
||||
__device__ static inline
|
||||
int binarySearchExclusive(
|
||||
const Key_t val,
|
||||
uniform Key_t *data,
|
||||
const int L,
|
||||
const Key_t val,
|
||||
uniform Key_t *data,
|
||||
const int L,
|
||||
int stride)
|
||||
{
|
||||
if (L == 0)
|
||||
@@ -118,9 +118,9 @@ int binarySearchExclusive(
|
||||
|
||||
__device__ static inline
|
||||
int binarySearchInclusive1(
|
||||
const Key_t val,
|
||||
const Key_t val,
|
||||
Key_t data,
|
||||
const uniform int L,
|
||||
const uniform int L,
|
||||
uniform int stride)
|
||||
{
|
||||
if (L == 0)
|
||||
@@ -140,9 +140,9 @@ int binarySearchInclusive1(
|
||||
|
||||
__device__ static inline
|
||||
int binarySearchExclusive1(
|
||||
const Key_t val,
|
||||
Key_t data,
|
||||
const uniform int L,
|
||||
const Key_t val,
|
||||
Key_t data,
|
||||
const uniform int L,
|
||||
uniform int stride)
|
||||
{
|
||||
if (L == 0)
|
||||
@@ -245,7 +245,7 @@ void generateSampleRanksKernel(
|
||||
const uniform int blkDim = (nBlocks + taskCount - 1)/taskCount;
|
||||
const uniform int blkBeg = blkIdx * blkDim;
|
||||
const uniform int blkEnd = min(blkBeg + blkDim, nBlocks);
|
||||
|
||||
|
||||
for (uniform int blk = blkBeg; blk < blkEnd; blk++)
|
||||
{
|
||||
const int pos = blk * programCount + programIndex;
|
||||
@@ -291,8 +291,8 @@ void generateSampleRanks(
|
||||
uniform int N)
|
||||
{
|
||||
uniform int lastSegmentElements = N % (2 * stride);
|
||||
uniform int threadCount = (lastSegmentElements > stride) ?
|
||||
(N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) :
|
||||
uniform int threadCount = (lastSegmentElements > stride) ?
|
||||
(N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) :
|
||||
(N - lastSegmentElements) / (2 * SAMPLE_STRIDE);
|
||||
|
||||
uniform int nBlocks = iDivUp(threadCount, SAMPLE_STRIDE);
|
||||
@@ -304,7 +304,7 @@ void generateSampleRanks(
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Merge step 2: generate sample ranks and indices
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
__global__
|
||||
__global__
|
||||
void mergeRanksAndIndicesKernel(
|
||||
uniform int nBlocks,
|
||||
uniform int in_Limits[],
|
||||
@@ -317,7 +317,7 @@ void mergeRanksAndIndicesKernel(
|
||||
const uniform int blkDim = (nBlocks + taskCount - 1)/taskCount;
|
||||
const uniform int blkBeg = blkIdx * blkDim;
|
||||
const uniform int blkEnd = min(blkBeg + blkDim, nBlocks);
|
||||
|
||||
|
||||
for (uniform int blk = blkBeg; blk < blkEnd; blk++)
|
||||
{
|
||||
int pos = blk * programCount + programIndex;
|
||||
@@ -357,8 +357,8 @@ void mergeRanksAndIndices(
|
||||
uniform int N)
|
||||
{
|
||||
const uniform int lastSegmentElements = N % (2 * stride);
|
||||
const uniform int threadCount = (lastSegmentElements > stride) ?
|
||||
(N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) :
|
||||
const uniform int threadCount = (lastSegmentElements > stride) ?
|
||||
(N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) :
|
||||
(N - lastSegmentElements) / (2 * SAMPLE_STRIDE);
|
||||
|
||||
const uniform int nBlocks = iDivUp(threadCount, SAMPLE_STRIDE);
|
||||
@@ -457,13 +457,13 @@ void mergeElementaryIntervalsKernel(
|
||||
dstB = segmentBase + startDstB + dstPosB;
|
||||
|
||||
// store merge data
|
||||
if (dstA >= 0)
|
||||
if (dstA >= 0)
|
||||
{
|
||||
// int dstA = segmentBase + startSrcA + programIndex;
|
||||
dstKey[dstA] = keyA;
|
||||
dstVal[dstA] = valA;
|
||||
}
|
||||
if (dstB >= 0)
|
||||
if (dstB >= 0)
|
||||
{
|
||||
// int dstB = segmentBase + stride + startSrcB + programIndex;
|
||||
dstKey[dstB] = keyB;
|
||||
@@ -513,7 +513,7 @@ __device__ static uniform int * uniform limitsB;
|
||||
__device__ static uniform int nTasks;
|
||||
__device__ static uniform int MAX_SAMPLE_COUNT = 0;
|
||||
|
||||
__global__
|
||||
__global__
|
||||
void openMergeSort___export()
|
||||
{
|
||||
nTasks = 13*32*13;
|
||||
|
||||
@@ -25,9 +25,9 @@ int nextPowerOfTwo(int x)
|
||||
|
||||
static inline
|
||||
int binarySearchInclusiveRanks(
|
||||
const int val,
|
||||
const int val,
|
||||
uniform int *data,
|
||||
const int L,
|
||||
const int L,
|
||||
int stride)
|
||||
{
|
||||
cif (L == 0)
|
||||
@@ -47,9 +47,9 @@ int binarySearchInclusiveRanks(
|
||||
|
||||
static inline
|
||||
int binarySearchExclusiveRanks(
|
||||
const int val,
|
||||
uniform int *data,
|
||||
const int L,
|
||||
const int val,
|
||||
uniform int *data,
|
||||
const int L,
|
||||
int stride)
|
||||
{
|
||||
cif (L == 0)
|
||||
@@ -69,9 +69,9 @@ int binarySearchExclusiveRanks(
|
||||
|
||||
static inline
|
||||
int binarySearchInclusive(
|
||||
const Key_t val,
|
||||
const Key_t val,
|
||||
uniform Key_t *data,
|
||||
const int L,
|
||||
const int L,
|
||||
int stride)
|
||||
{
|
||||
cif (L == 0)
|
||||
@@ -91,9 +91,9 @@ int binarySearchInclusive(
|
||||
|
||||
static inline
|
||||
int binarySearchExclusive(
|
||||
const Key_t val,
|
||||
uniform Key_t *data,
|
||||
const int L,
|
||||
const Key_t val,
|
||||
uniform Key_t *data,
|
||||
const int L,
|
||||
int stride)
|
||||
{
|
||||
cif (L == 0)
|
||||
@@ -113,9 +113,9 @@ int binarySearchExclusive(
|
||||
|
||||
static inline
|
||||
int binarySearchInclusive1(
|
||||
const Key_t val,
|
||||
const Key_t val,
|
||||
Key_t data,
|
||||
const uniform int L,
|
||||
const uniform int L,
|
||||
uniform int stride)
|
||||
{
|
||||
if (L == 0)
|
||||
@@ -135,9 +135,9 @@ int binarySearchInclusive1(
|
||||
|
||||
static inline
|
||||
int binarySearchExclusive1(
|
||||
const Key_t val,
|
||||
Key_t data,
|
||||
const uniform int L,
|
||||
const Key_t val,
|
||||
Key_t data,
|
||||
const uniform int L,
|
||||
uniform int stride)
|
||||
{
|
||||
if (L == 0)
|
||||
@@ -158,7 +158,7 @@ int binarySearchExclusive1(
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Bottom-level merge sort (binary search-based)
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
task
|
||||
task
|
||||
void mergeSortGangKernel(
|
||||
uniform int batchSize,
|
||||
uniform Key_t dstKey[],
|
||||
@@ -189,7 +189,7 @@ void mergeSortGangKernel(
|
||||
const int offset = 2 * (programIndex - lPos);
|
||||
uniform Key_t *baseKey = s_key + 2 * (programIndex - lPos);
|
||||
uniform Val_t *baseVal = s_val + 2 * (programIndex - lPos);
|
||||
|
||||
|
||||
Key_t keyA = baseKey[lPos + 0];
|
||||
Val_t valA = baseVal[lPos + 0];
|
||||
Key_t keyB = baseKey[lPos + stride];
|
||||
@@ -244,7 +244,7 @@ void generateSampleRanksKernel(
|
||||
const uniform int blockDim = (nBlocks + taskCount - 1)/taskCount;
|
||||
const uniform int blockBeg = blockIdx * blockDim;
|
||||
const uniform int blockEnd = min(blockBeg + blockDim, nBlocks);
|
||||
|
||||
|
||||
for (uniform int block = blockBeg; block < blockEnd; block++)
|
||||
{
|
||||
const int pos = block * programCount + programIndex;
|
||||
@@ -290,8 +290,8 @@ void generateSampleRanks(
|
||||
uniform int N)
|
||||
{
|
||||
uniform int lastSegmentElements = N % (2 * stride);
|
||||
uniform int threadCount = (lastSegmentElements > stride) ?
|
||||
(N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) :
|
||||
uniform int threadCount = (lastSegmentElements > stride) ?
|
||||
(N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) :
|
||||
(N - lastSegmentElements) / (2 * SAMPLE_STRIDE);
|
||||
|
||||
uniform int nBlocks = iDivUp(threadCount, SAMPLE_STRIDE);
|
||||
@@ -306,7 +306,7 @@ void generateSampleRanks(
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Merge step 2: generate sample ranks and indices
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
task
|
||||
task
|
||||
void mergeRanksAndIndicesKernel(
|
||||
uniform int nBlocks,
|
||||
uniform int in_Limits[],
|
||||
@@ -319,7 +319,7 @@ void mergeRanksAndIndicesKernel(
|
||||
const uniform int blockDim = (nBlocks + taskCount - 1)/taskCount;
|
||||
const uniform int blockBeg = blockIdx * blockDim;
|
||||
const uniform int blockEnd = min(blockBeg + blockDim, nBlocks);
|
||||
|
||||
|
||||
for (uniform int block = blockBeg; block < blockEnd; block++)
|
||||
{
|
||||
int pos = block * programCount + programIndex;
|
||||
@@ -359,8 +359,8 @@ void mergeRanksAndIndices(
|
||||
uniform int N)
|
||||
{
|
||||
const uniform int lastSegmentElements = N % (2 * stride);
|
||||
const uniform int threadCount = (lastSegmentElements > stride) ?
|
||||
(N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) :
|
||||
const uniform int threadCount = (lastSegmentElements > stride) ?
|
||||
(N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) :
|
||||
(N - lastSegmentElements) / (2 * SAMPLE_STRIDE);
|
||||
|
||||
const uniform int nBlocks = iDivUp(threadCount, SAMPLE_STRIDE);
|
||||
@@ -462,12 +462,12 @@ void mergeElementaryIntervalsKernel(
|
||||
if (programIndex < lenSrcB && dstPosB < lenSrcB)
|
||||
dstB = segmentBase + startDstB + dstPosB;
|
||||
|
||||
if (dstA >= 0)
|
||||
if (dstA >= 0)
|
||||
{
|
||||
dstKey[dstA] = keyA;
|
||||
dstVal[dstA] = valA;
|
||||
}
|
||||
if (dstB >= 0)
|
||||
if (dstB >= 0)
|
||||
{
|
||||
dstKey[dstB] = keyB;
|
||||
dstVal[dstB] = valB;
|
||||
@@ -521,7 +521,7 @@ static uniform int * uniform limitsA;
|
||||
static uniform int * uniform limitsB;
|
||||
static uniform int MAX_SAMPLE_COUNT = 0;
|
||||
|
||||
export
|
||||
export
|
||||
void openMergeSort()
|
||||
{
|
||||
MAX_SAMPLE_COUNT = 8*32 * 131072 / programCount;
|
||||
@@ -542,7 +542,7 @@ void closeMergeSort()
|
||||
memPool = NULL;
|
||||
}
|
||||
|
||||
export
|
||||
export
|
||||
void mergeSort(
|
||||
uniform Key_t dstKey[],
|
||||
uniform Val_t dstVal[],
|
||||
@@ -601,7 +601,7 @@ void mergeSort(
|
||||
}
|
||||
#endif
|
||||
|
||||
// cpu: 287 gpu: 194 M/s
|
||||
// cpu: 287 gpu: 194 M/s
|
||||
//Merge elementary intervals
|
||||
mergeElementaryIntervals(oKey, oVal, iKey, iVal, limitsA, limitsB, stride, N);
|
||||
}
|
||||
|
||||
@@ -61,7 +61,7 @@ struct Hermite4
|
||||
const real R0 = 1;
|
||||
const real mp = 1.0/n;
|
||||
#pragma omp parallel for schedule(runtime)
|
||||
for (int i = 0; i < n; i++)
|
||||
for (int i = 0; i < n; i++)
|
||||
{
|
||||
real xp, yp, zp, s2 = 2*R0;
|
||||
real vx, vy, vz;
|
||||
@@ -73,7 +73,7 @@ struct Hermite4
|
||||
vx = drand48() * 0.1;
|
||||
vy = drand48() * 0.1;
|
||||
vz = drand48() * 0.1;
|
||||
}
|
||||
}
|
||||
g_posx[i] = xp;
|
||||
g_posy[i] = yp;
|
||||
g_posz[i] = zp;
|
||||
@@ -104,7 +104,7 @@ struct Hermite4
|
||||
|
||||
void forces();
|
||||
|
||||
real step(const real dt)
|
||||
real step(const real dt)
|
||||
{
|
||||
const real dt2 = dt*real(1.0/2.0);
|
||||
const real dt3 = dt*real(1.0/3.0);
|
||||
@@ -149,9 +149,9 @@ struct Hermite4
|
||||
{
|
||||
/* compute snp & crk */
|
||||
|
||||
const real Amx = g_accx[i] - accx0[i];
|
||||
const real Amy = g_accy[i] - accy0[i];
|
||||
const real Amz = g_accz[i] - accz0[i];
|
||||
const real Amx = g_accx[i] - accx0[i];
|
||||
const real Amy = g_accy[i] - accy0[i];
|
||||
const real Amz = g_accz[i] - accz0[i];
|
||||
|
||||
const real Jmx = h*(g_jrkx[i] - jrkx0[i]);
|
||||
const real Jmy = h*(g_jrky[i] - jrky0[i]);
|
||||
@@ -199,18 +199,18 @@ struct Hermite4
|
||||
}
|
||||
}
|
||||
|
||||
if (dt_min == HUGE)
|
||||
if (dt_min == HUGE)
|
||||
return dt;
|
||||
else
|
||||
else
|
||||
return dt_min;
|
||||
}
|
||||
|
||||
void energy(real &Ekin, real &Epot)
|
||||
void energy(real &Ekin, real &Epot)
|
||||
{
|
||||
real ekin = 0, epot = 0;
|
||||
|
||||
#pragma omp parallel for reduction(+:ekin,epot)
|
||||
for (int i = 0; i < n; i++)
|
||||
for (int i = 0; i < n; i++)
|
||||
{
|
||||
ekin += g_mass[i] * (g_velx[i]*g_velx[i] + g_vely[i]*g_vely[i] + g_velz[i]*g_velz[i]) * real(0.5f);
|
||||
epot += real(0.5f)*g_mass[i] * g_gpot[i];
|
||||
@@ -241,7 +241,7 @@ struct Hermite4
|
||||
real dt = 1.0/131072;
|
||||
real Epot, Ekin, Etot = Etot0;
|
||||
while (t_global < t_end) {
|
||||
if (iter % ntime == 0)
|
||||
if (iter % ntime == 0)
|
||||
t0 = rtc();
|
||||
|
||||
if (iter >= niter) return;
|
||||
@@ -302,7 +302,7 @@ void run(const int nbodies, const real eta, const int nstep)
|
||||
h4.integrate(nstep);
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
printf(" Usage: %s [nbodies=8192] [nsteps=40] [eta=0.1] \n", argv[0]);
|
||||
|
||||
|
||||
@@ -12,7 +12,7 @@ struct Predictor
|
||||
vec3 pos, vel;
|
||||
};
|
||||
|
||||
static inline
|
||||
static inline
|
||||
void body_body_force(
|
||||
Force &fi,
|
||||
const Predictor &pi,
|
||||
@@ -40,14 +40,14 @@ void body_body_force(
|
||||
fi.acc.y += minv_ds3 * dy;
|
||||
fi.acc.z += minv_ds3 * dz;
|
||||
fi.pot -= minv_ds;
|
||||
|
||||
|
||||
const real dvx = pj.vel.x - pi.vel.x;
|
||||
const real dvy = pj.vel.y - pi.vel.y;
|
||||
const real dvz = pj.vel.z - pi.vel.z;
|
||||
const real rv = dx*dvx + dy*dvy + dz*dvz;
|
||||
|
||||
|
||||
const real Jij = (real)(-3.0) * (rv * inv_ds2 * minv_ds3);
|
||||
|
||||
|
||||
fi.jrk.x += minv_ds3*dvx + Jij*dx;
|
||||
fi.jrk.y += minv_ds3*dvy + Jij*dy;
|
||||
fi.jrk.z += minv_ds3*dvz + Jij*dz;
|
||||
@@ -75,7 +75,7 @@ task void compute_forces_task(
|
||||
const uniform int nibeg = taskIndex * nPerTask;
|
||||
const uniform int niend = min(n, nibeg + nPerTask);
|
||||
|
||||
if (nibeg >= n)
|
||||
if (nibeg >= n)
|
||||
return;
|
||||
|
||||
uniform real shdata[7][programCount];
|
||||
@@ -88,7 +88,7 @@ task void compute_forces_task(
|
||||
fi.acc = (real)0.0;
|
||||
fi.jrk = (real)0.0;
|
||||
fi.pot = (real)0.0;
|
||||
|
||||
|
||||
Predictor pi;
|
||||
pi.pos.x = posx[i];
|
||||
pi.pos.y = posy[i];
|
||||
@@ -155,7 +155,7 @@ export void compute_forces(
|
||||
|
||||
launch [nTask] compute_forces_task(
|
||||
n, nPerTask,
|
||||
mass,
|
||||
mass,
|
||||
posx,posy,posz,
|
||||
velx,vely,velz,
|
||||
accx,accy,accz,
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2011-2012, Intel Corporation
|
||||
Copyright (c) 2014, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -28,11 +28,11 @@
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
|
||||
#define DBG(x)
|
||||
#define DBG(x)
|
||||
#include <omp.h>
|
||||
#include <malloc.h>
|
||||
|
||||
@@ -62,15 +62,15 @@ struct TaskInfo {
|
||||
event taskEvent;
|
||||
#endif
|
||||
int taskCount() const { return taskCount3d[0]*taskCount3d[1]*taskCount3d[2]; }
|
||||
int taskIndex0() const
|
||||
int taskIndex0() const
|
||||
{
|
||||
return taskIndex % taskCount3d[0];
|
||||
}
|
||||
int taskIndex1() const
|
||||
int taskIndex1() const
|
||||
{
|
||||
return ( taskIndex / taskCount3d[0] ) % taskCount3d[1];
|
||||
}
|
||||
int taskIndex2() const
|
||||
int taskIndex2() const
|
||||
{
|
||||
return taskIndex / ( taskCount3d[0]*taskCount3d[1] );
|
||||
}
|
||||
@@ -85,7 +85,7 @@ __attribute__((aligned(32)));
|
||||
;
|
||||
|
||||
// ispc expects these functions to have C linkage / not be mangled
|
||||
extern "C" {
|
||||
extern "C" {
|
||||
void ISPCLaunch(void **handlePtr, void *f, void *data, int countx, int county, int countz);
|
||||
void *ISPCAlloc(void **handlePtr, int64_t size, int32_t alignment);
|
||||
void ISPCSync(void *handle);
|
||||
@@ -144,10 +144,10 @@ private:
|
||||
};
|
||||
|
||||
|
||||
inline TaskGroupBase::TaskGroupBase() {
|
||||
nextTaskInfoIndex = 0;
|
||||
inline TaskGroupBase::TaskGroupBase() {
|
||||
nextTaskInfoIndex = 0;
|
||||
|
||||
curMemBuffer = 0;
|
||||
curMemBuffer = 0;
|
||||
curMemBufferOffset = 0;
|
||||
memBuffers[0] = mem;
|
||||
memBufferSize[0] = sizeof(mem) / sizeof(mem[0]);
|
||||
@@ -171,8 +171,8 @@ inline TaskGroupBase::~TaskGroupBase() {
|
||||
|
||||
inline void
|
||||
TaskGroupBase::Reset() {
|
||||
nextTaskInfoIndex = 0;
|
||||
curMemBuffer = 0;
|
||||
nextTaskInfoIndex = 0;
|
||||
curMemBuffer = 0;
|
||||
curMemBufferOffset = 0;
|
||||
}
|
||||
|
||||
@@ -253,7 +253,7 @@ lAtomicCompareAndSwapPointer(void **v, void *newValue, void *oldValue) {
|
||||
#endif // ISPC_IS_WINDOWS
|
||||
}
|
||||
|
||||
static int32_t
|
||||
static int32_t
|
||||
lAtomicCompareAndSwap32(volatile int32_t *v, int32_t newValue, int32_t oldValue) {
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
return InterlockedCompareExchange((volatile LONG *)v, newValue, oldValue);
|
||||
@@ -264,7 +264,7 @@ lAtomicCompareAndSwap32(volatile int32_t *v, int32_t newValue, int32_t oldValue)
|
||||
#endif // ISPC_IS_WINDOWS
|
||||
}
|
||||
|
||||
static inline int32_t
|
||||
static inline int32_t
|
||||
lAtomicAdd(volatile int32_t *v, int32_t delta) {
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
return InterlockedExchangeAdd((volatile LONG *)v, delta)+delta;
|
||||
@@ -300,11 +300,11 @@ TaskGroup::Launch(int baseIndex, int count) {
|
||||
|
||||
TaskInfo ti = *GetTaskInfo(baseIndex);
|
||||
#pragma omp for schedule(runtime)
|
||||
for(int i = 0; i < count; i++)
|
||||
for(int i = 0; i < count; i++)
|
||||
{
|
||||
ti.taskIndex = i;
|
||||
|
||||
// Actually run the task.
|
||||
// Actually run the task.
|
||||
ti.func(ti.data, threadIndex, threadCount, ti.taskIndex, ti.taskCount(),
|
||||
ti.taskIndex0(), ti.taskIndex1(), ti.taskIndex2(),
|
||||
ti.taskCount0(), ti.taskCount1(), ti.taskCount2());
|
||||
@@ -322,7 +322,7 @@ TaskGroup::Sync() {
|
||||
static TaskGroup *freeTaskGroups[MAX_FREE_TASK_GROUPS];
|
||||
|
||||
static inline TaskGroup *
|
||||
AllocTaskGroup()
|
||||
AllocTaskGroup()
|
||||
{
|
||||
for (int i = 0; i < MAX_FREE_TASK_GROUPS; ++i) {
|
||||
TaskGroup *tg = freeTaskGroups[i];
|
||||
@@ -339,7 +339,7 @@ AllocTaskGroup()
|
||||
|
||||
|
||||
static inline void
|
||||
FreeTaskGroup(TaskGroup *tg)
|
||||
FreeTaskGroup(TaskGroup *tg)
|
||||
{
|
||||
tg->Reset();
|
||||
|
||||
@@ -355,7 +355,7 @@ FreeTaskGroup(TaskGroup *tg)
|
||||
}
|
||||
|
||||
void
|
||||
ISPCLaunch(void **taskGroupPtr, void *func, void *data, int count0, int count1, int count2)
|
||||
ISPCLaunch(void **taskGroupPtr, void *func, void *data, int count0, int count1, int count2)
|
||||
{
|
||||
const int count = count0*count1*count2;
|
||||
TaskGroup *taskGroup;
|
||||
@@ -382,7 +382,7 @@ ISPCLaunch(void **taskGroupPtr, void *func, void *data, int count0, int count1,
|
||||
|
||||
|
||||
void
|
||||
ISPCSync(void *h)
|
||||
ISPCSync(void *h)
|
||||
{
|
||||
TaskGroup *taskGroup = (TaskGroup *)h;
|
||||
if (taskGroup != NULL) {
|
||||
@@ -393,7 +393,7 @@ ISPCSync(void *h)
|
||||
|
||||
|
||||
void *
|
||||
ISPCAlloc(void **taskGroupPtr, int64_t size, int32_t alignment)
|
||||
ISPCAlloc(void **taskGroupPtr, int64_t size, int32_t alignment)
|
||||
{
|
||||
TaskGroup *taskGroup;
|
||||
if (*taskGroupPtr == NULL) {
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
Copyright (c) 2010-2014, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -28,7 +28,7 @@
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#define NOMINMAX
|
||||
@@ -96,7 +96,7 @@ int main(int argc, char *argv[]) {
|
||||
sum = 0.;
|
||||
for (int i = 0; i < nOptions; ++i)
|
||||
sum += result[i];
|
||||
printf("[binomial ispc, tasks]:\t\t[%.3f] msec (avg %f)\n",
|
||||
printf("[binomial ispc, tasks]:\t\t[%.3f] msec (avg %f)\n",
|
||||
binomial_tasks, sum / nOptions);
|
||||
|
||||
//
|
||||
@@ -112,7 +112,7 @@ int main(int argc, char *argv[]) {
|
||||
sum += result[i];
|
||||
bs_ispc_tasks = std::min(bs_ispc_tasks, dt);
|
||||
}
|
||||
printf("[black-scholes ispc, tasks]:\t[%.3f] msec (avg %f)\n",
|
||||
printf("[black-scholes ispc, tasks]:\t[%.3f] msec (avg %f)\n",
|
||||
bs_ispc_tasks, sum / nOptions);
|
||||
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
// -*- mode: c++ -*-
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
Copyright (c) 2010-2014, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -29,13 +29,13 @@
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "options_defs.h"
|
||||
#include "cuda_helpers.cuh"
|
||||
|
||||
__device__ static inline void __range_reduce_log(float input, float * reduced,
|
||||
__device__ static inline void __range_reduce_log(float input, float * reduced,
|
||||
int * exponent) {
|
||||
int int_version = __float_as_int(input); //intbits(input);
|
||||
// single precision = SEEE EEEE EMMM MMMM MMMM MMMM MMMM MMMM
|
||||
@@ -195,9 +195,9 @@ CND(float X) {
|
||||
return w;
|
||||
}
|
||||
|
||||
__global__
|
||||
__global__
|
||||
void bs_task( float Sa[], float Xa[], float Ta[],
|
||||
float ra[], float va[],
|
||||
float ra[], float va[],
|
||||
float result[], int count) {
|
||||
if (taskIndex >= taskCount) return;
|
||||
int first = taskIndex * (count/taskCount);
|
||||
@@ -218,7 +218,7 @@ void bs_task( float Sa[], float Xa[], float Ta[],
|
||||
extern "C"
|
||||
__global__ void
|
||||
black_scholes_ispc_tasks___export( float Sa[], float Xa[], float Ta[],
|
||||
float ra[], float va[],
|
||||
float ra[], float va[],
|
||||
float result[], int count) {
|
||||
int nTasks = 2048; //count/16384; //max((int)64, (int)count/16384);
|
||||
launch(nTasks,1,1,bs_task)
|
||||
@@ -228,7 +228,7 @@ black_scholes_ispc_tasks___export( float Sa[], float Xa[], float Ta[],
|
||||
extern "C"
|
||||
__host__ void
|
||||
black_scholes_ispc_tasks( float Sa[], float Xa[], float Ta[],
|
||||
float ra[], float va[],
|
||||
float ra[], float va[],
|
||||
float result[], int count) {
|
||||
black_scholes_ispc_tasks___export<<<1,32>>>(Sa,Xa,Ta,ra,va,result,count);
|
||||
cudaDeviceSynchronize();
|
||||
@@ -243,8 +243,8 @@ struct loop
|
||||
__device__ static void op1(float V[], const float u, const float X, const float S)
|
||||
{
|
||||
const int j = NBEG;
|
||||
float upow = powf(u, (float)(2*j-BINOMIAL_NUM));
|
||||
V[j] = max(0.0f, X - S * upow);
|
||||
float upow = powf(u, (float)(2*j-BINOMIAL_NUM));
|
||||
V[j] = max(0.0f, X - S * upow);
|
||||
loop<j+STEP,NEND,STEP>::op1(V,u,X,S);
|
||||
}
|
||||
__device__ static void op2(float V[], const float Pu, const float disc)
|
||||
@@ -257,9 +257,9 @@ struct loop
|
||||
}
|
||||
};
|
||||
|
||||
template<int NEND, int STEP>
|
||||
template<int NEND, int STEP>
|
||||
struct loop<NEND,NEND,STEP>
|
||||
{
|
||||
{
|
||||
__device__ static void op1(float V[], const float u, const float X, const float S) {}
|
||||
__device__ static void op2(float V[], const float Pu, const float disc) {}
|
||||
};
|
||||
@@ -295,10 +295,10 @@ binomial_put(float S, float X, float T, float r, float v)
|
||||
|
||||
|
||||
__global__ void
|
||||
binomial_task( float Sa[], float Xa[],
|
||||
float Ta[], float ra[],
|
||||
float va[], float result[],
|
||||
int count)
|
||||
binomial_task( float Sa[], float Xa[],
|
||||
float Ta[], float ra[],
|
||||
float va[], float result[],
|
||||
int count)
|
||||
{
|
||||
int first = taskIndex * (count/taskCount);
|
||||
int last = min(count, (int)((taskIndex+1) * (count/taskCount)));
|
||||
@@ -313,9 +313,9 @@ binomial_task( float Sa[], float Xa[],
|
||||
|
||||
|
||||
extern "C" __global__ void
|
||||
binomial_put_ispc_tasks___export( float Sa[], float Xa[],
|
||||
float Ta[], float ra[],
|
||||
float va[], float result[],
|
||||
binomial_put_ispc_tasks___export( float Sa[], float Xa[],
|
||||
float Ta[], float ra[],
|
||||
float va[], float result[],
|
||||
int count) {
|
||||
int nTasks = 2048; //count/16384; //max((int)64, (int)count/16384);
|
||||
launch(nTasks,1,1,binomial_task)
|
||||
@@ -325,7 +325,7 @@ binomial_put_ispc_tasks___export( float Sa[], float Xa[],
|
||||
extern "C"
|
||||
__host__ void
|
||||
binomial_put_ispc_tasks( float Sa[], float Xa[], float Ta[],
|
||||
float ra[], float va[],
|
||||
float ra[], float va[],
|
||||
float result[], int count) {
|
||||
|
||||
cudaDeviceSetCacheConfig (cudaFuncCachePreferL1);
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
// -*- mode: c++ -*-
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
Copyright (c) 2010-2014, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -29,7 +29,7 @@
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "options_defs.h"
|
||||
@@ -57,7 +57,7 @@ CND(float X) {
|
||||
|
||||
task void
|
||||
bs_task(uniform float Sa[], uniform float Xa[], uniform float Ta[],
|
||||
uniform float ra[], uniform float va[],
|
||||
uniform float ra[], uniform float va[],
|
||||
uniform float result[], uniform int count) {
|
||||
uniform int first = taskIndex * (count/taskCount);
|
||||
uniform int last = min(count, (int)((taskIndex+1) * (count/taskCount)));
|
||||
@@ -74,7 +74,7 @@ bs_task(uniform float Sa[], uniform float Xa[], uniform float Ta[],
|
||||
|
||||
export void
|
||||
black_scholes_ispc_tasks(uniform float Sa[], uniform float Xa[], uniform float Ta[],
|
||||
uniform float ra[], uniform float va[],
|
||||
uniform float ra[], uniform float va[],
|
||||
uniform float result[], uniform int count) {
|
||||
uniform int nTasks = 2048; //count/16384; //max((int)64, (int)count/16384);
|
||||
launch[nTasks] bs_task(Sa, Xa, Ta, ra, va, result, count);
|
||||
@@ -85,7 +85,7 @@ black_scholes_ispc_tasks(uniform float Sa[], uniform float Xa[], uniform float T
|
||||
|
||||
export void
|
||||
black_scholes_ispc(uniform float Sa[], uniform float Xa[], uniform float Ta[],
|
||||
uniform float ra[], uniform float va[],
|
||||
uniform float ra[], uniform float va[],
|
||||
uniform float result[], uniform int count) {
|
||||
foreach (i = 0 ... count) {
|
||||
float S = Sa[i], X = Xa[i], T = Ta[i], r = ra[i], v = va[i];
|
||||
@@ -135,7 +135,7 @@ binomial_put(float S, float X, float T, float r, float v) {
|
||||
V[j] = max(0., X - S * upow); }
|
||||
#define OP10(k) \
|
||||
OP(k+0); OP(k+1); OP(k+2); OP(k+3); OP(k+4) \
|
||||
OP(k+5); OP(k+6); OP(k+7); OP(k+8); OP(k+9);
|
||||
OP(k+5); OP(k+6); OP(k+7); OP(k+8); OP(k+9);
|
||||
OP10(0)
|
||||
OP10(10)
|
||||
OP10(20)
|
||||
@@ -176,8 +176,8 @@ binomial_put(float S, float X, float T, float r, float v) {
|
||||
|
||||
|
||||
export void
|
||||
binomial_put_ispc(uniform float Sa[], uniform float Xa[], uniform float Ta[],
|
||||
uniform float ra[], uniform float va[],
|
||||
binomial_put_ispc(uniform float Sa[], uniform float Xa[], uniform float Ta[],
|
||||
uniform float ra[], uniform float va[],
|
||||
uniform float result[], uniform int count) {
|
||||
foreach (i = 0 ... count) {
|
||||
float S = Sa[i], X = Xa[i], T = Ta[i], r = ra[i], v = va[i];
|
||||
@@ -187,9 +187,9 @@ binomial_put_ispc(uniform float Sa[], uniform float Xa[], uniform float Ta[],
|
||||
|
||||
|
||||
task void
|
||||
binomial_task(uniform float Sa[], uniform float Xa[],
|
||||
uniform float Ta[], uniform float ra[],
|
||||
uniform float va[], uniform float result[],
|
||||
binomial_task(uniform float Sa[], uniform float Xa[],
|
||||
uniform float Ta[], uniform float ra[],
|
||||
uniform float va[], uniform float result[],
|
||||
uniform int count) {
|
||||
uniform int first = taskIndex * (count/taskCount);
|
||||
uniform int last = min(count, (int)((taskIndex+1) * (count/taskCount)));
|
||||
@@ -202,9 +202,9 @@ binomial_task(uniform float Sa[], uniform float Xa[],
|
||||
|
||||
|
||||
export void
|
||||
binomial_put_ispc_tasks(uniform float Sa[], uniform float Xa[],
|
||||
uniform float Ta[], uniform float ra[],
|
||||
uniform float va[], uniform float result[],
|
||||
binomial_put_ispc_tasks(uniform float Sa[], uniform float Xa[],
|
||||
uniform float Ta[], uniform float ra[],
|
||||
uniform float va[], uniform float result[],
|
||||
uniform int count) {
|
||||
uniform int nTasks = 2048; //count/16384; //max((int)64, (int)count/16384);
|
||||
launch[nTasks] binomial_task(Sa, Xa, Ta, ra, va, result, count);
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
Copyright (c) 2010-2014, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -28,7 +28,7 @@
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifndef OPTIONS_DEFS_H
|
||||
|
||||
@@ -42,7 +42,7 @@ int main (int argc, char *argv[])
|
||||
Key *keys = new Key [n];
|
||||
Key *keys_orig = new Key [n];
|
||||
unsigned int *keys_gold = new unsigned int [n];
|
||||
|
||||
|
||||
srand48(rtc()*65536);
|
||||
|
||||
int sortBits = 32;
|
||||
@@ -63,7 +63,7 @@ int main (int argc, char *argv[])
|
||||
keys_gold[i] = keys[i].key;
|
||||
keys_orig[i] = keys[i];
|
||||
}
|
||||
|
||||
|
||||
ispcSetMallocHeapLimit(1024*1024*1024);
|
||||
|
||||
ispc::radixSort_alloc(n);
|
||||
|
||||
@@ -9,7 +9,7 @@ typedef long long Key;
|
||||
__forceinline__ __device__ int atomic_add_global(int* ptr, int value)
|
||||
{
|
||||
return atomicAdd(ptr, value);
|
||||
}
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ int shfl_scan_add_step(int partial, int up_offset)
|
||||
{
|
||||
@@ -92,7 +92,7 @@ void sortPass(
|
||||
|
||||
const int mask = (1 << NUMBITS) - 1;
|
||||
|
||||
/* copy digit offset from Gmem to Lmem */
|
||||
/* copy digit offset from Gmem to Lmem */
|
||||
#if 1
|
||||
__shared__ int digitOffsets_sh[NUMDIGITS*4];
|
||||
volatile int *digitOffsets = digitOffsets_sh + warpIdx*NUMDIGITS;
|
||||
@@ -191,7 +191,7 @@ void completeScanGlobal(
|
||||
}
|
||||
}
|
||||
|
||||
__device__ static
|
||||
__device__ static
|
||||
inline void radixExclusiveScan(
|
||||
const int numBlocks,
|
||||
int excScanPtr[],
|
||||
@@ -242,11 +242,11 @@ void radixSort_alloc___export(const int n)
|
||||
nPrefixSum = NUMDIGITS*numBlocks;
|
||||
|
||||
|
||||
const int nalloc =
|
||||
const int nalloc =
|
||||
nSharedCounts +
|
||||
nCountsGlobal +
|
||||
nExcScan +
|
||||
nCountsBlock +
|
||||
nCountsBlock +
|
||||
nPartialSum +
|
||||
nPrefixSum;
|
||||
|
||||
@@ -261,7 +261,7 @@ void radixSort_alloc___export(const int n)
|
||||
prefixSum = partialSum + nPartialSum;
|
||||
}
|
||||
|
||||
extern "C"
|
||||
extern "C"
|
||||
void radixSort_alloc(const int n)
|
||||
{
|
||||
radixSort_alloc___export<<<1,32>>>(n);
|
||||
@@ -269,7 +269,7 @@ void radixSort_alloc(const int n)
|
||||
}
|
||||
|
||||
|
||||
__device__ static
|
||||
__device__ static
|
||||
void radixSort_freeBufKeys()
|
||||
{
|
||||
if (numElementsBuf > 0)
|
||||
@@ -344,9 +344,9 @@ __global__ void radixSort___export(
|
||||
/* sorting */
|
||||
launch (numBlocks,1,1,
|
||||
sortPass)(
|
||||
bufKeys,
|
||||
keys,
|
||||
bit,
|
||||
bufKeys,
|
||||
keys,
|
||||
bit,
|
||||
numElements,
|
||||
excScan);
|
||||
sync;
|
||||
|
||||
@@ -63,7 +63,7 @@ void sortPass(
|
||||
|
||||
const uniform int mask = (1 << NUMBITS) - 1;
|
||||
|
||||
/* copy digit offset from Gmem to Lmem */
|
||||
/* copy digit offset from Gmem to Lmem */
|
||||
#if 1
|
||||
uniform int digitOffsets[NUMDIGITS];
|
||||
foreach (digit = 0 ... NUMDIGITS)
|
||||
@@ -95,7 +95,7 @@ void partialScanLocal(
|
||||
const uniform int blockDim = (numBlocks+taskCount-1)/taskCount;
|
||||
const uniform int bbeg = blockIdx * blockDim;
|
||||
const uniform int bend = min(bbeg + blockDim, numBlocks);
|
||||
|
||||
|
||||
uniform int (* uniform countsBlock)[NUMDIGITS] = (uniform int (*)[NUMDIGITS])countsAll;
|
||||
uniform int (* uniform excScanBlock)[NUMDIGITS] = (uniform int (*)[NUMDIGITS])excScanAll;
|
||||
uniform int (* uniform partialSum)[NUMDIGITS] = (uniform int (*)[NUMDIGITS])partialSumAll;
|
||||
@@ -142,7 +142,7 @@ void completeScanGlobal(
|
||||
const uniform int blockDim = (numBlocks+taskCount-1)/taskCount;
|
||||
const uniform int bbeg = blockIdx * blockDim;
|
||||
const uniform int bend = min(bbeg + blockDim, numBlocks);
|
||||
|
||||
|
||||
uniform int (* uniform excScanBlock)[NUMDIGITS] = (uniform int (*)[NUMDIGITS])excScanAll;
|
||||
uniform int (* uniform carryValue)[NUMDIGITS] = (uniform int (*)[NUMDIGITS])carryValueAll;
|
||||
|
||||
@@ -154,7 +154,7 @@ void completeScanGlobal(
|
||||
}
|
||||
}
|
||||
|
||||
static
|
||||
static
|
||||
inline void radixExclusiveScan(
|
||||
const uniform int numBlocks,
|
||||
uniform int excScanPtr[],
|
||||
@@ -207,11 +207,11 @@ export void radixSort_alloc(const uniform int n)
|
||||
nPrefixSum = NUMDIGITS*numBlocks;
|
||||
|
||||
|
||||
const uniform int nalloc =
|
||||
const uniform int nalloc =
|
||||
nSharedCounts +
|
||||
nCountsGlobal +
|
||||
nExcScan +
|
||||
nCountsBlock +
|
||||
nCountsBlock +
|
||||
nPartialSum +
|
||||
nPrefixSum;
|
||||
|
||||
@@ -225,7 +225,7 @@ export void radixSort_alloc(const uniform int n)
|
||||
prefixSum = partialSum + nPartialSum;
|
||||
}
|
||||
|
||||
static
|
||||
static
|
||||
void radixSort_freeBufKeys()
|
||||
{
|
||||
if (numElementsBuf > 0)
|
||||
@@ -283,16 +283,16 @@ export void radixSort(
|
||||
excScan[digit] = scan + carry;
|
||||
carry += broadcast(scan+value, programCount-1);
|
||||
}
|
||||
|
||||
|
||||
/* computing offsets for each digit */
|
||||
radixExclusiveScan(numBlocks, excScan, counts, partialSum, prefixSum);
|
||||
|
||||
/* sorting */
|
||||
launch [numBlocks]
|
||||
launch [numBlocks]
|
||||
sortPass(
|
||||
bufKeys,
|
||||
keys,
|
||||
bit,
|
||||
bufKeys,
|
||||
keys,
|
||||
bit,
|
||||
numElements,
|
||||
excScan);
|
||||
sync;
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
Copyright (c) 2010-2014, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -28,7 +28,7 @@
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifdef _MSC_VER
|
||||
@@ -83,7 +83,7 @@ static void writeImage(int *idImage, float *depthImage, int width, int height,
|
||||
fputc(g, f);
|
||||
fputc(b, f);
|
||||
}
|
||||
}
|
||||
}
|
||||
fclose(f);
|
||||
printf("Wrote image file %s\n", filename);
|
||||
}
|
||||
@@ -116,7 +116,7 @@ int main(int argc, char *argv[]) {
|
||||
if (fread(&(var), sizeof(var), n, f) != (unsigned int)n) { \
|
||||
fprintf(stderr, "Unexpected EOF reading scene file\n"); \
|
||||
return 1; \
|
||||
} else /* eat ; */
|
||||
} else /* eat ; */
|
||||
|
||||
//
|
||||
// Read the camera specification information from the camera file
|
||||
@@ -145,7 +145,7 @@ int main(int argc, char *argv[]) {
|
||||
READ(raster2camera[0][0], 16);
|
||||
|
||||
//
|
||||
// Read in the serialized BVH
|
||||
// Read in the serialized BVH
|
||||
//
|
||||
sprintf(fnbuf, "%s.bvh", filename);
|
||||
f = fopen(fnbuf, "rb");
|
||||
@@ -178,7 +178,7 @@ int main(int argc, char *argv[]) {
|
||||
READ(nodes[i].pad, 1);
|
||||
}
|
||||
|
||||
// And then read the triangles
|
||||
// And then read the triangles
|
||||
uint nTris;
|
||||
READ(nTris, 1);
|
||||
Triangle *triangles = new Triangle[nTris];
|
||||
@@ -204,7 +204,7 @@ int main(int argc, char *argv[]) {
|
||||
// the first interseciton
|
||||
int *id = new int[width*height];
|
||||
float *image = new float[width*height];
|
||||
|
||||
|
||||
ispc_memset(id, 0, width*height*sizeof(int));
|
||||
ispc_memset(image, 0, width*height*sizeof(float));
|
||||
|
||||
@@ -220,7 +220,7 @@ int main(int argc, char *argv[]) {
|
||||
printf("@time of ISPC + TASKS run:\t\t\t[%.3f] msec\n", dt);
|
||||
minTimeISPCtasks = std::min(dt, minTimeISPCtasks);
|
||||
}
|
||||
printf("[rt ispc + tasks]:\t\t[%.3f] msec for %d x %d image\n",
|
||||
printf("[rt ispc + tasks]:\t\t[%.3f] msec for %d x %d image\n",
|
||||
minTimeISPCtasks, width, height);
|
||||
|
||||
writeImage(id, image, width, height, "rt-ispc-tasks.ppm");
|
||||
|
||||
@@ -96,7 +96,7 @@ static inline float Dot(const float3 a, const float3 b) {
|
||||
|
||||
__device__
|
||||
inline
|
||||
static void generateRay( const float raster2camera[4][4],
|
||||
static void generateRay( const float raster2camera[4][4],
|
||||
const float camera2world[4][4],
|
||||
float x, float y, Ray &ray) {
|
||||
ray.mint = 0.f;
|
||||
@@ -113,11 +113,11 @@ static void generateRay( const float raster2camera[4][4],
|
||||
camy /= camw;
|
||||
camz /= camw;
|
||||
|
||||
ray.dir.x = camera2world[0][0] * camx + camera2world[0][1] * camy +
|
||||
ray.dir.x = camera2world[0][0] * camx + camera2world[0][1] * camy +
|
||||
camera2world[0][2] * camz;
|
||||
ray.dir.y = camera2world[1][0] * camx + camera2world[1][1] * camy +
|
||||
ray.dir.y = camera2world[1][0] * camx + camera2world[1][1] * camy +
|
||||
camera2world[1][2] * camz;
|
||||
ray.dir.z = camera2world[2][0] * camx + camera2world[2][1] * camy +
|
||||
ray.dir.z = camera2world[2][0] * camx + camera2world[2][1] * camy +
|
||||
camera2world[2][2] * camz;
|
||||
|
||||
ray.origin.x = camera2world[0][3] / camera2world[3][3];
|
||||
@@ -139,7 +139,7 @@ static void generateRay( const float raster2camera[4][4],
|
||||
|
||||
__device__
|
||||
inline
|
||||
static bool BBoxIntersect(const float bounds[2][3],
|
||||
static bool BBoxIntersect(const float bounds[2][3],
|
||||
const Ray &ray) {
|
||||
float3 bounds0 = { bounds[0][0], bounds[0][1], bounds[0][2] };
|
||||
float3 bounds1 = { bounds[1][0], bounds[1][1], bounds[1][2] };
|
||||
@@ -172,7 +172,7 @@ static bool BBoxIntersect(const float bounds[2][3],
|
||||
}
|
||||
t0 = max(tNear.z, t0);
|
||||
t1 = min(tFar.z, t1);
|
||||
|
||||
|
||||
return (t0 <= t1);
|
||||
}
|
||||
|
||||
@@ -220,7 +220,7 @@ static bool TriIntersect(const Triangle &tri, Ray &ray) {
|
||||
|
||||
__device__
|
||||
inline
|
||||
bool BVHIntersect(const LinearBVHNode nodes[],
|
||||
bool BVHIntersect(const LinearBVHNode nodes[],
|
||||
const Triangle tris[], Ray &r,
|
||||
int todo[]) {
|
||||
Ray ray = r;
|
||||
@@ -240,7 +240,7 @@ bool BVHIntersect(const LinearBVHNode nodes[],
|
||||
if (TriIntersect(tris[primitivesOffset+i], ray))
|
||||
hit = true;
|
||||
}
|
||||
if (todoOffset == 0)
|
||||
if (todoOffset == 0)
|
||||
break;
|
||||
nodeNum = todo[--todoOffset];
|
||||
}
|
||||
@@ -275,10 +275,10 @@ bool BVHIntersect(const LinearBVHNode nodes[],
|
||||
__device__
|
||||
inline
|
||||
static void raytrace_tile( int x0, int x1,
|
||||
int y0, int y1,
|
||||
int y0, int y1,
|
||||
int width, int height,
|
||||
int baseWidth, int baseHeight,
|
||||
const float raster2camera[4][4],
|
||||
const float raster2camera[4][4],
|
||||
const float camera2world[4][4],
|
||||
float image[], int id[],
|
||||
const LinearBVHNode nodes[],
|
||||
@@ -317,7 +317,7 @@ static void raytrace_tile( int x0, int x1,
|
||||
__global__
|
||||
void raytrace_tile_task( int width, int height,
|
||||
int baseWidth, int baseHeight,
|
||||
const float raster2camera[4][4],
|
||||
const float raster2camera[4][4],
|
||||
const float camera2world[4][4],
|
||||
float image[], int id[],
|
||||
const LinearBVHNode nodes[],
|
||||
@@ -328,8 +328,8 @@ void raytrace_tile_task( int width, int height,
|
||||
int x1 = min(x0 + dx, width);
|
||||
int y0 = (taskIndex / xBuckets) * dy;
|
||||
int y1 = min(y0 + dy, height);
|
||||
|
||||
raytrace_tile(x0, x1, y0, y1, width, height, baseWidth, baseHeight,
|
||||
|
||||
raytrace_tile(x0, x1, y0, y1, width, height, baseWidth, baseHeight,
|
||||
raster2camera, camera2world, image,
|
||||
id, nodes, triangles);
|
||||
}
|
||||
@@ -337,7 +337,7 @@ void raytrace_tile_task( int width, int height,
|
||||
|
||||
extern "C" __global__ void raytrace_ispc_tasks___export( int width, int height,
|
||||
int baseWidth, int baseHeight,
|
||||
const float raster2camera[4][4],
|
||||
const float raster2camera[4][4],
|
||||
const float camera2world[4][4],
|
||||
float image[], int id[],
|
||||
const LinearBVHNode nodes[],
|
||||
@@ -347,8 +347,8 @@ extern "C" __global__ void raytrace_ispc_tasks___export( int width, int height,
|
||||
int yBuckets = (height + (dy-1)) / dy;
|
||||
int nTasks = xBuckets * yBuckets;
|
||||
launch(nTasks,1,1,raytrace_tile_task)
|
||||
(width, height, baseWidth, baseHeight,
|
||||
raster2camera, camera2world,
|
||||
(width, height, baseWidth, baseHeight,
|
||||
raster2camera, camera2world,
|
||||
image, id, nodes, triangles);
|
||||
cudaDeviceSynchronize();
|
||||
}
|
||||
@@ -357,7 +357,7 @@ extern "C" __global__ void raytrace_ispc_tasks___export( int width, int height,
|
||||
|
||||
extern "C" __host__ void raytrace_ispc_tasks( int width, int height,
|
||||
int baseWidth, int baseHeight,
|
||||
const float raster2camera[4][4],
|
||||
const float raster2camera[4][4],
|
||||
const float camera2world[4][4],
|
||||
float image[], int id[],
|
||||
const LinearBVHNode nodes[],
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
Copyright (c) 2010-2014, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -28,7 +28,7 @@
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#if 1
|
||||
@@ -90,7 +90,7 @@ static inline float Dot(const float3 a, const float3 b) {
|
||||
#if 1
|
||||
inline
|
||||
#endif
|
||||
static void generateRay(uniform const float raster2camera[4][4],
|
||||
static void generateRay(uniform const float raster2camera[4][4],
|
||||
uniform const float camera2world[4][4],
|
||||
float x, float y, Ray &ray) {
|
||||
ray.mint = 0.f;
|
||||
@@ -107,11 +107,11 @@ static void generateRay(uniform const float raster2camera[4][4],
|
||||
camy /= camw;
|
||||
camz /= camw;
|
||||
|
||||
ray.dir.x = camera2world[0][0] * camx + camera2world[0][1] * camy +
|
||||
ray.dir.x = camera2world[0][0] * camx + camera2world[0][1] * camy +
|
||||
camera2world[0][2] * camz;
|
||||
ray.dir.y = camera2world[1][0] * camx + camera2world[1][1] * camy +
|
||||
ray.dir.y = camera2world[1][0] * camx + camera2world[1][1] * camy +
|
||||
camera2world[1][2] * camz;
|
||||
ray.dir.z = camera2world[2][0] * camx + camera2world[2][1] * camy +
|
||||
ray.dir.z = camera2world[2][0] * camx + camera2world[2][1] * camy +
|
||||
camera2world[2][2] * camz;
|
||||
|
||||
ray.origin.x = camera2world[0][3] / camera2world[3][3];
|
||||
@@ -129,7 +129,7 @@ static void generateRay(uniform const float raster2camera[4][4],
|
||||
#if 1
|
||||
inline
|
||||
#endif
|
||||
static bool_t BBoxIntersect(const uniform float bounds[2][3],
|
||||
static bool_t BBoxIntersect(const uniform float bounds[2][3],
|
||||
const Ray &ray) {
|
||||
const uniform float3 bounds0 = { bounds[0][0], bounds[0][1], bounds[0][2] };
|
||||
const uniform float3 bounds1 = { bounds[1][0], bounds[1][1], bounds[1][2] };
|
||||
@@ -162,7 +162,7 @@ static bool_t BBoxIntersect(const uniform float bounds[2][3],
|
||||
}
|
||||
t0 = max(tNear.z, t0);
|
||||
t1 = min(tFar.z, t1);
|
||||
|
||||
|
||||
return (t0 <= t1);
|
||||
}
|
||||
|
||||
@@ -215,7 +215,7 @@ static bool_t TriIntersect(const uniform_t Triangle tri, Ray &ray) {
|
||||
inline
|
||||
#endif
|
||||
bool_t
|
||||
BVHIntersect(const uniform LinearBVHNode nodes[],
|
||||
BVHIntersect(const uniform LinearBVHNode nodes[],
|
||||
const uniform Triangle tris[], Ray &r) {
|
||||
Ray ray = r;
|
||||
bool_t hit = false;
|
||||
@@ -235,7 +235,7 @@ BVHIntersect(const uniform LinearBVHNode nodes[],
|
||||
if (TriIntersect(tris[primitivesOffset+i], ray))
|
||||
hit = true;
|
||||
}
|
||||
if (todoOffset == 0)
|
||||
if (todoOffset == 0)
|
||||
break;
|
||||
nodeNum = todo[--todoOffset];
|
||||
}
|
||||
@@ -276,10 +276,10 @@ BVHIntersect(const uniform LinearBVHNode nodes[],
|
||||
inline
|
||||
#endif
|
||||
static void raytrace_tile(uniform int x0, uniform int x1,
|
||||
uniform int y0, uniform int y1,
|
||||
uniform int y0, uniform int y1,
|
||||
uniform int width, uniform int height,
|
||||
uniform int baseWidth, uniform int baseHeight,
|
||||
const uniform float raster2camera[4][4],
|
||||
const uniform float raster2camera[4][4],
|
||||
const uniform float camera2world[4][4],
|
||||
uniform float image[], uniform int id[],
|
||||
const uniform LinearBVHNode nodes[],
|
||||
@@ -302,7 +302,7 @@ static void raytrace_tile(uniform int x0, uniform int x1,
|
||||
|
||||
export void raytrace_ispc(uniform int width, uniform int height,
|
||||
uniform int baseWidth, uniform int baseHeight,
|
||||
const uniform float raster2camera[4][4],
|
||||
const uniform float raster2camera[4][4],
|
||||
const uniform float camera2world[4][4],
|
||||
uniform float image[], uniform int id[],
|
||||
const uniform LinearBVHNode nodes[],
|
||||
@@ -315,7 +315,7 @@ export void raytrace_ispc(uniform int width, uniform int height,
|
||||
|
||||
task void raytrace_tile_task(uniform int width, uniform int height,
|
||||
uniform int baseWidth, uniform int baseHeight,
|
||||
const uniform float raster2camera[4][4],
|
||||
const uniform float raster2camera[4][4],
|
||||
const uniform float camera2world[4][4],
|
||||
uniform float image[], uniform int id[],
|
||||
const uniform LinearBVHNode nodes[],
|
||||
@@ -326,8 +326,8 @@ task void raytrace_tile_task(uniform int width, uniform int height,
|
||||
const uniform int x1 = min(x0 + dx, width);
|
||||
const uniform int y0 = (taskIndex / xBuckets) * dy;
|
||||
const uniform int y1 = min(y0 + dy, height);
|
||||
|
||||
raytrace_tile(x0, x1, y0, y1, width, height, baseWidth, baseHeight,
|
||||
|
||||
raytrace_tile(x0, x1, y0, y1, width, height, baseWidth, baseHeight,
|
||||
raster2camera, camera2world, image,
|
||||
id, nodes, triangles);
|
||||
}
|
||||
@@ -335,7 +335,7 @@ task void raytrace_tile_task(uniform int width, uniform int height,
|
||||
|
||||
export void raytrace_ispc_tasks(uniform int width, uniform int height,
|
||||
uniform int baseWidth, uniform int baseHeight,
|
||||
const uniform float raster2camera[4][4],
|
||||
const uniform float raster2camera[4][4],
|
||||
const uniform float camera2world[4][4],
|
||||
uniform float image[], uniform int id[],
|
||||
const uniform LinearBVHNode nodes[],
|
||||
@@ -344,8 +344,8 @@ export void raytrace_ispc_tasks(uniform int width, uniform int height,
|
||||
const uniform int xBuckets = (width + (dx-1)) / dx;
|
||||
const uniform int yBuckets = (height + (dy-1)) / dy;
|
||||
const uniform int nTasks = xBuckets * yBuckets;
|
||||
launch[nTasks] raytrace_tile_task(width, height, baseWidth, baseHeight,
|
||||
raster2camera, camera2world,
|
||||
launch[nTasks] raytrace_tile_task(width, height, baseWidth, baseHeight,
|
||||
raster2camera, camera2world,
|
||||
image, id, nodes, triangles);
|
||||
}
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2011, Intel Corporation
|
||||
Copyright (c) 2011-2014, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -28,7 +28,7 @@
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifdef _MSC_VER
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2011, Intel Corporation
|
||||
Copyright (c) 2011-2014, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -28,11 +28,11 @@
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "cuda_helpers.cuh"
|
||||
__device__ static inline float clamp(float v, float low, float high)
|
||||
__device__ static inline float clamp(float v, float low, float high)
|
||||
{
|
||||
return min(max(v, low), high);
|
||||
}
|
||||
@@ -90,7 +90,7 @@ struct Ray {
|
||||
|
||||
|
||||
__device__ static void
|
||||
generateRay(const float raster2camera[4][4],
|
||||
generateRay(const float raster2camera[4][4],
|
||||
const float camera2world[4][4],
|
||||
float x, float y, Ray &ray) {
|
||||
// transform raster coordinate (x, y, 0) to camera space
|
||||
@@ -149,7 +149,7 @@ IntersectP(Ray ray, float3 pMin, float3 pMax, float &hit0, float &hit1) {
|
||||
}
|
||||
t0 = max(tNear.z, t0);
|
||||
t1 = min(tFar.z, t1);
|
||||
|
||||
|
||||
if (t0 <= t1) {
|
||||
hit0 = t0;
|
||||
hit1 = t1;
|
||||
@@ -165,7 +165,7 @@ __device__ static inline float Lerp(float t, float a, float b) {
|
||||
}
|
||||
|
||||
|
||||
__device__ static inline float D(int x, int y, int z, int nVoxels[3],
|
||||
__device__ static inline float D(int x, int y, int z, int nVoxels[3],
|
||||
float density[]) {
|
||||
x = clamp(x, 0, nVoxels[0]-1);
|
||||
y = clamp(y, 0, nVoxels[1]-1);
|
||||
@@ -180,9 +180,9 @@ __device__ static inline float3 Offset(float3 p, float3 pMin, float3 pMax) {
|
||||
}
|
||||
|
||||
|
||||
__device__ static inline float Density(float3 Pobj, float3 pMin, float3 pMax,
|
||||
__device__ static inline float Density(float3 Pobj, float3 pMin, float3 pMax,
|
||||
float density[], int nVoxels[3]) {
|
||||
if (!Inside(Pobj, pMin, pMax))
|
||||
if (!Inside(Pobj, pMin, pMax))
|
||||
return 0;
|
||||
// Compute voxel coordinates and offsets for _Pobj_
|
||||
float3 vox = Offset(Pobj, pMin, pMax);
|
||||
@@ -193,13 +193,13 @@ __device__ static inline float Density(float3 Pobj, float3 pMin, float3 pMax,
|
||||
float dx = vox.x - vx, dy = vox.y - vy, dz = vox.z - vz;
|
||||
|
||||
// Trilinearly interpolate density values to compute local density
|
||||
float d00 = Lerp(dx, D(vx, vy, vz, nVoxels, density),
|
||||
float d00 = Lerp(dx, D(vx, vy, vz, nVoxels, density),
|
||||
D(vx+1, vy, vz, nVoxels, density));
|
||||
float d10 = Lerp(dx, D(vx, vy+1, vz, nVoxels, density),
|
||||
float d10 = Lerp(dx, D(vx, vy+1, vz, nVoxels, density),
|
||||
D(vx+1, vy+1, vz, nVoxels, density));
|
||||
float d01 = Lerp(dx, D(vx, vy, vz+1, nVoxels, density),
|
||||
float d01 = Lerp(dx, D(vx, vy, vz+1, nVoxels, density),
|
||||
D(vx+1, vy, vz+1, nVoxels, density));
|
||||
float d11 = Lerp(dx, D(vx, vy+1, vz+1, nVoxels, density),
|
||||
float d11 = Lerp(dx, D(vx, vy+1, vz+1, nVoxels, density),
|
||||
D(vx+1, vy+1, vz+1, nVoxels, density));
|
||||
float d0 = Lerp(dy, d00, d10);
|
||||
float d1 = Lerp(dy, d01, d11);
|
||||
@@ -213,7 +213,7 @@ __device__ static inline float Density(float3 Pobj, float3 pMin, float3 pMax,
|
||||
array. */
|
||||
__device__ static inline float
|
||||
transmittance(float3 p0, float3 p1, float3 pMin,
|
||||
float3 pMax, float sigma_t,
|
||||
float3 pMax, float sigma_t,
|
||||
float density[], int nVoxels[3]) {
|
||||
float rayT0, rayT1;
|
||||
Ray ray;
|
||||
@@ -253,7 +253,7 @@ distanceSquared(float3 a, float3 b) {
|
||||
}
|
||||
|
||||
|
||||
__device__ static inline float
|
||||
__device__ static inline float
|
||||
raymarch(float density[], int nVoxels[3], Ray ray) {
|
||||
float rayT0, rayT1;
|
||||
float3 pMin = {.3f, -.2f, .3f}, pMax = {1.8f, 2.3f, 1.8f};
|
||||
@@ -281,7 +281,7 @@ raymarch(float density[], int nVoxels[3], Ray ray) {
|
||||
float t = rayT0;
|
||||
float3 pos = ray.origin + ray.dir * rayT0;
|
||||
float3 dirStep = ray.dir * stepT;
|
||||
while (t < rayT1)
|
||||
while (t < rayT1)
|
||||
{
|
||||
float d = Density(pos, pMin, pMax, density, nVoxels);
|
||||
|
||||
@@ -291,7 +291,7 @@ raymarch(float density[], int nVoxels[3], Ray ray) {
|
||||
break;
|
||||
|
||||
// direct lighting
|
||||
float Li = lightIntensity / distanceSquared(lightPos, pos) *
|
||||
float Li = lightIntensity / distanceSquared(lightPos, pos) *
|
||||
transmittance(lightPos, pos, pMin, pMax, sigma_a + sigma_s,
|
||||
density, nVoxels);
|
||||
L += stepDist * atten * d * sigma_s * (Li + Le);
|
||||
@@ -314,20 +314,20 @@ raymarch(float density[], int nVoxels[3], Ray ray) {
|
||||
*/
|
||||
__device__ static void
|
||||
volume_tile(int x0, int y0, int x1,
|
||||
int y1, float density[], int nVoxels[3],
|
||||
int y1, float density[], int nVoxels[3],
|
||||
const float raster2camera[4][4],
|
||||
const float camera2world[4][4],
|
||||
const float camera2world[4][4],
|
||||
int width, int height, float image[]) {
|
||||
// Work on 4x4=16 pixel big tiles of the image. This function thus
|
||||
// implicitly assumes that both (x1-x0) and (y1-y0) are evenly divisble
|
||||
// by 4.
|
||||
for (int y = y0; y < y1; y += 8) {
|
||||
for (int x = x0; x < x1; x += 8) {
|
||||
for (int ob = 0; ob < 64; ob += programCount)
|
||||
for (int ob = 0; ob < 64; ob += programCount)
|
||||
{
|
||||
const int o = ob + programIndex;
|
||||
|
||||
|
||||
|
||||
// These two arrays encode the mapping from [0,15] to
|
||||
// offsets within the 4x4 pixel block so that we render
|
||||
// each pixel inside the block
|
||||
@@ -360,9 +360,9 @@ volume_tile(int x0, int y0, int x1,
|
||||
|
||||
|
||||
__global__ void
|
||||
volume_task(float density[], int _nVoxels[3],
|
||||
volume_task(float density[], int _nVoxels[3],
|
||||
const float _raster2camera[4][4],
|
||||
const float _camera2world[4][4],
|
||||
const float _camera2world[4][4],
|
||||
int width, int height, float image[]) {
|
||||
if (taskIndex0 >= taskCount0) return;
|
||||
|
||||
@@ -389,7 +389,7 @@ volume_task(float density[], int _nVoxels[3],
|
||||
raster2camera[3][1] = _raster2camera[3][1];
|
||||
raster2camera[3][2] = _raster2camera[3][2];
|
||||
raster2camera[3][3] = _raster2camera[3][3];
|
||||
|
||||
|
||||
float camera2world[4][4];
|
||||
camera2world[0][0] = _camera2world[0][0];
|
||||
camera2world[0][1] = _camera2world[0][1];
|
||||
@@ -430,24 +430,24 @@ volume_task(float density[], int _nVoxels[3],
|
||||
|
||||
extern "C"
|
||||
__global__ void
|
||||
volume_ispc_tasks___export( float density[], int nVoxels[3],
|
||||
volume_ispc_tasks___export( float density[], int nVoxels[3],
|
||||
const float raster2camera[4][4],
|
||||
const float camera2world[4][4],
|
||||
const float camera2world[4][4],
|
||||
int width, int height, float image[]) {
|
||||
// Launch tasks to work on (dx,dy)-sized tiles of the image
|
||||
int dx = 8, dy = 8;
|
||||
int nTasks = ((width+(dx-1))/dx) * ((height+(dy-1))/dy);
|
||||
launch(nTasks,1,1,volume_task)
|
||||
(density, nVoxels, raster2camera, camera2world,
|
||||
(density, nVoxels, raster2camera, camera2world,
|
||||
width, height, image);
|
||||
cudaDeviceSynchronize();
|
||||
}
|
||||
|
||||
extern "C"
|
||||
__host__ void
|
||||
volume_ispc_tasks( float density[], int nVoxels[3],
|
||||
volume_ispc_tasks( float density[], int nVoxels[3],
|
||||
const float raster2camera[4][4],
|
||||
const float camera2world[4][4],
|
||||
const float camera2world[4][4],
|
||||
int width, int height, float image[]) {
|
||||
volume_ispc_tasks___export<<<1,32>>>(density, nVoxels, raster2camera, camera2world, width, height,image);
|
||||
cudaDeviceSynchronize();
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2011, Intel Corporation
|
||||
Copyright (c) 2011-2014, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -28,7 +28,7 @@
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
|
||||
@@ -41,7 +41,7 @@ struct Ray {
|
||||
|
||||
|
||||
static inline void
|
||||
generateRay(const uniform float raster2camera[4][4],
|
||||
generateRay(const uniform float raster2camera[4][4],
|
||||
const uniform float camera2world[4][4],
|
||||
float x, float y, Ray &ray) {
|
||||
// transform raster coordinate (x, y, 0) to camera space
|
||||
@@ -100,7 +100,7 @@ IntersectP(Ray ray, float3 pMin, float3 pMax, float &hit0, float &hit1) {
|
||||
}
|
||||
t0 = max(tNear.z, t0);
|
||||
t1 = min(tFar.z, t1);
|
||||
|
||||
|
||||
if (t0 <= t1) {
|
||||
hit0 = t0;
|
||||
hit1 = t1;
|
||||
@@ -116,7 +116,7 @@ static inline float Lerp(float t, float a, float b) {
|
||||
}
|
||||
|
||||
|
||||
static inline float D(int x, int y, int z, uniform int nVoxels[3],
|
||||
static inline float D(int x, int y, int z, uniform int nVoxels[3],
|
||||
uniform float density[]) {
|
||||
x = clamp(x, 0, nVoxels[0]-1);
|
||||
y = clamp(y, 0, nVoxels[1]-1);
|
||||
@@ -131,9 +131,9 @@ static inline float3 Offset(float3 p, float3 pMin, float3 pMax) {
|
||||
}
|
||||
|
||||
|
||||
static inline float Density(float3 Pobj, float3 pMin, float3 pMax,
|
||||
static inline float Density(float3 Pobj, float3 pMin, float3 pMax,
|
||||
uniform float density[], uniform int nVoxels[3]) {
|
||||
if (!Inside(Pobj, pMin, pMax))
|
||||
if (!Inside(Pobj, pMin, pMax))
|
||||
return 0;
|
||||
// Compute voxel coordinates and offsets for _Pobj_
|
||||
float3 vox = Offset(Pobj, pMin, pMax);
|
||||
@@ -144,13 +144,13 @@ static inline float Density(float3 Pobj, float3 pMin, float3 pMax,
|
||||
float dx = vox.x - vx, dy = vox.y - vy, dz = vox.z - vz;
|
||||
|
||||
// Trilinearly interpolate density values to compute local density
|
||||
float d00 = Lerp(dx, D(vx, vy, vz, nVoxels, density),
|
||||
float d00 = Lerp(dx, D(vx, vy, vz, nVoxels, density),
|
||||
D(vx+1, vy, vz, nVoxels, density));
|
||||
float d10 = Lerp(dx, D(vx, vy+1, vz, nVoxels, density),
|
||||
float d10 = Lerp(dx, D(vx, vy+1, vz, nVoxels, density),
|
||||
D(vx+1, vy+1, vz, nVoxels, density));
|
||||
float d01 = Lerp(dx, D(vx, vy, vz+1, nVoxels, density),
|
||||
float d01 = Lerp(dx, D(vx, vy, vz+1, nVoxels, density),
|
||||
D(vx+1, vy, vz+1, nVoxels, density));
|
||||
float d11 = Lerp(dx, D(vx, vy+1, vz+1, nVoxels, density),
|
||||
float d11 = Lerp(dx, D(vx, vy+1, vz+1, nVoxels, density),
|
||||
D(vx+1, vy+1, vz+1, nVoxels, density));
|
||||
float d0 = Lerp(dy, d00, d10);
|
||||
float d1 = Lerp(dy, d01, d11);
|
||||
@@ -164,7 +164,7 @@ static inline float Density(float3 Pobj, float3 pMin, float3 pMax,
|
||||
array. */
|
||||
static inline float
|
||||
transmittance(uniform float3 p0, float3 p1, uniform float3 pMin,
|
||||
uniform float3 pMax, uniform float sigma_t,
|
||||
uniform float3 pMax, uniform float sigma_t,
|
||||
uniform float density[], uniform int nVoxels[3]) {
|
||||
float rayT0, rayT1;
|
||||
Ray ray;
|
||||
@@ -204,7 +204,7 @@ distanceSquared(float3 a, float3 b) {
|
||||
}
|
||||
|
||||
|
||||
static inline float
|
||||
static inline float
|
||||
raymarch(uniform float density[], uniform int nVoxels[3], Ray ray) {
|
||||
float rayT0, rayT1;
|
||||
const uniform float3 pMin = {.3, -.2, .3}, pMax = {1.8, 2.3, 1.8};
|
||||
@@ -232,7 +232,7 @@ raymarch(uniform float density[], uniform int nVoxels[3], Ray ray) {
|
||||
float t = rayT0;
|
||||
float3 pos = ray.origin + ray.dir * rayT0;
|
||||
float3 dirStep = ray.dir * stepT;
|
||||
while (t < rayT1)
|
||||
while (t < rayT1)
|
||||
{
|
||||
float d = Density(pos, pMin, pMax, density, nVoxels);
|
||||
|
||||
@@ -242,7 +242,7 @@ raymarch(uniform float density[], uniform int nVoxels[3], Ray ray) {
|
||||
break;
|
||||
|
||||
// direct lighting
|
||||
float Li = lightIntensity / distanceSquared(lightPos, pos) *
|
||||
float Li = lightIntensity / distanceSquared(lightPos, pos) *
|
||||
transmittance(lightPos, pos, pMin, pMax, sigma_a + sigma_s,
|
||||
density, nVoxels);
|
||||
L += stepDist * atten * d * sigma_s * (Li + Le);
|
||||
@@ -265,16 +265,16 @@ raymarch(uniform float density[], uniform int nVoxels[3], Ray ray) {
|
||||
*/
|
||||
static inline void
|
||||
volume_tile(uniform int x0, uniform int y0, uniform int x1,
|
||||
uniform int y1, uniform float density[], uniform int nVoxels[3],
|
||||
uniform int y1, uniform float density[], uniform int nVoxels[3],
|
||||
const uniform float raster2camera[4][4],
|
||||
const uniform float camera2world[4][4],
|
||||
const uniform float camera2world[4][4],
|
||||
uniform int width, uniform int height, uniform float image[]) {
|
||||
// Work on 4x4=16 pixel big tiles of the image. This function thus
|
||||
// implicitly assumes that both (x1-x0) and (y1-y0) are evenly divisble
|
||||
// by 4.
|
||||
#if 0
|
||||
for (uniform int y = y0; y < y1; y += 8)
|
||||
for (uniform int x = x0; x < x1; x += 8)
|
||||
for (uniform int y = y0; y < y1; y += 8)
|
||||
for (uniform int x = x0; x < x1; x += 8)
|
||||
foreach (o = 0 ... 64)
|
||||
{
|
||||
// These two arrays encode the mapping from [0,15] to
|
||||
@@ -304,7 +304,7 @@ volume_tile(uniform int x0, uniform int y0, uniform int x1,
|
||||
image[offset] = raymarch(density, nVoxels, ray);
|
||||
}
|
||||
#else
|
||||
foreach_tiled (y = y0 ... y1, x = x0 ... x1)
|
||||
foreach_tiled (y = y0 ... y1, x = x0 ... x1)
|
||||
{
|
||||
// Use viewing parameters to compute the corresponding ray
|
||||
// for the pixel
|
||||
@@ -321,10 +321,10 @@ volume_tile(uniform int x0, uniform int y0, uniform int x1,
|
||||
|
||||
|
||||
task void
|
||||
volume_task(uniform float density[], uniform int _nVoxels[3],
|
||||
volume_task(uniform float density[], uniform int _nVoxels[3],
|
||||
const uniform float _raster2camera[4][4],
|
||||
const uniform float _camera2world[4][4],
|
||||
uniform int width, uniform int height, uniform float image[])
|
||||
const uniform float _camera2world[4][4],
|
||||
uniform int width, uniform int height, uniform float image[])
|
||||
{
|
||||
if (taskIndex >= taskCount) return;
|
||||
|
||||
@@ -351,7 +351,7 @@ volume_task(uniform float density[], uniform int _nVoxels[3],
|
||||
raster2camera[3][1] = _raster2camera[3][1];
|
||||
raster2camera[3][2] = _raster2camera[3][2];
|
||||
raster2camera[3][3] = _raster2camera[3][3];
|
||||
|
||||
|
||||
uniform float camera2world[4][4];
|
||||
camera2world[0][0] = _camera2world[0][0];
|
||||
camera2world[0][1] = _camera2world[0][1];
|
||||
@@ -390,24 +390,24 @@ volume_task(uniform float density[], uniform int _nVoxels[3],
|
||||
|
||||
|
||||
export void
|
||||
volume_ispc(uniform float density[], uniform int nVoxels[3],
|
||||
volume_ispc(uniform float density[], uniform int nVoxels[3],
|
||||
const uniform float raster2camera[4][4],
|
||||
const uniform float camera2world[4][4],
|
||||
const uniform float camera2world[4][4],
|
||||
uniform int width, uniform int height, uniform float image[]) {
|
||||
volume_tile(0, 0, width, height, density, nVoxels, raster2camera,
|
||||
volume_tile(0, 0, width, height, density, nVoxels, raster2camera,
|
||||
camera2world, width, height, image);
|
||||
}
|
||||
|
||||
|
||||
export void
|
||||
volume_ispc_tasks(uniform float density[], uniform int nVoxels[3],
|
||||
volume_ispc_tasks(uniform float density[], uniform int nVoxels[3],
|
||||
const uniform float raster2camera[4][4],
|
||||
const uniform float camera2world[4][4],
|
||||
const uniform float camera2world[4][4],
|
||||
uniform int width, uniform int height, uniform float image[]) {
|
||||
// Launch tasks to work on (dx,dy)-sized tiles of the image
|
||||
const uniform int dx = 8, dy = 8;
|
||||
const uniform int nTasks = ((width+(dx-1))/dx) * ((height+(dy-1))/dy);
|
||||
launch[nTasks] volume_task(density, nVoxels, raster2camera, camera2world,
|
||||
launch[nTasks] volume_task(density, nVoxels, raster2camera, camera2world,
|
||||
width, height, image);
|
||||
sync;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user