White space and copyright fixes in examples.

This commit is contained in:
Dmitry Babokin
2014-07-08 20:08:34 +04:00
parent 8894156df5
commit d8e2fdf913
30 changed files with 563 additions and 563 deletions

View File

@@ -1,5 +1,5 @@
/*
Copyright (c) 2010-2011, Intel Corporation
Copyright (c) 2010-2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -28,7 +28,7 @@
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef _MSC_VER
@@ -141,12 +141,12 @@ int main(int argc, char **argv)
}
// Report results and save image
printf("[aobench ispc + tasks]:\t\t[%.3f] msec (%d x %d image)\n",
printf("[aobench ispc + tasks]:\t\t[%.3f] msec (%d x %d image)\n",
minTimeISPCTasks, width, height);
savePPM("ao-ispc-tasks.ppm", width, height);
savePPM("ao-ispc-tasks.ppm", width, height);
delete img;
delete fimg;
return 0;
}

View File

@@ -1,6 +1,6 @@
// -*- mode: c++ -*-
/*
Copyright (c) 2010-2011, Intel Corporation
Copyright (c) 2010-2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -29,7 +29,7 @@ met:
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
Based on Syoyo Fujita's aobench: http://code.google.com/p/aobench
@@ -37,7 +37,7 @@ met:
#include "cuda_helpers.cuh"
#define NAO_SAMPLES 8
#define NAO_SAMPLES 8
//#define M_PI 3.1415926535f
#define vec Float3
@@ -109,7 +109,7 @@ static inline unsigned int random(RNGState * state)
b = ((state->z1 << 6) ^ state->z1) >> 13;
state->z1 = ((state->z1 & 4294967294U) << 18) ^ b;
b = ((state->z2 << 2) ^ state->z2) >> 27;
b = ((state->z2 << 2) ^ state->z2) >> 27;
state->z2 = ((state->z2 & 4294967288U) << 2) ^ b;
b = ((state->z3 << 13) ^ state->z3) >> 21;
state->z3 = ((state->z3 & 4294967280U) << 7) ^ b;
@@ -128,7 +128,7 @@ static inline float frandom(RNGState * state)
}
__device__
static inline void seed_rng(RNGState * state,
static inline void seed_rng(RNGState * state,
unsigned int seed) {
state->z1 = seed;
state->z2 = seed ^ 0xbeeff00d;
@@ -143,7 +143,7 @@ struct Isect {
float t;
vec p;
vec n;
int hit;
int hit;
};
struct Sphere {
@@ -190,7 +190,7 @@ ray_plane_intersect(Isect &isect,const Ray &ray, const Plane &plane) {
float v = dot(ray.dir, plane.n);
#if 0
if (abs(v) < 1.0f-17)
if (abs(v) < 1.0f-17)
return;
else {
float t = -(dot(ray.org, plane.n) + d) / v;
@@ -238,7 +238,7 @@ ray_sphere_intersect(Isect &isect,const Ray &ray, const Sphere &sphere) {
}
}
#else
if (D <= 0.0f)
if (D <= 0.0f)
return;
float t = -B - sqrt(D);
@@ -319,8 +319,8 @@ ambient_occlusion(Isect &isect, const Plane &plane, const Sphere spheres[3],
occIsect.hit = 0;
for ( int snum = 0; snum < 3; ++snum)
ray_sphere_intersect(occIsect, ray, spheres[snum]);
ray_plane_intersect (occIsect, ray, plane);
ray_sphere_intersect(occIsect, ray, spheres[snum]);
ray_plane_intersect (occIsect, ray, plane);
if (occIsect.hit) occlusion += 1.0f;
}
@@ -337,10 +337,10 @@ ambient_occlusion(Isect &isect, const Plane &plane, const Sphere spheres[3],
__device__
static inline void ao_tiles(
int x0, int x1,
int y0, int y1,
int y0, int y1,
int w, int h,
int nsubsamples,
float image[])
int nsubsamples,
float image[])
{
const Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } };
const Sphere spheres[3] = {
@@ -411,8 +411,8 @@ static inline void ao_tiles(
extern "C"
__global__
void ao_task( int width, int height,
int nsubsamples, float image[])
void ao_task( int width, int height,
int nsubsamples, float image[])
{
if (taskIndex0 >= taskCount0) return;
if (taskIndex1 >= taskCount1) return;
@@ -428,8 +428,8 @@ void ao_task( int width, int height,
extern "C"
__global__
void ao_ispc_tasks___export(
int w, int h, int nsubsamples,
float image[])
int w, int h, int nsubsamples,
float image[])
{
const int ntilex = (w+TILEX-1)/TILEX;
const int ntiley = (h+TILEY-1)/TILEY;
@@ -439,8 +439,8 @@ void ao_ispc_tasks___export(
extern "C"
__host__ void ao_ispc_tasks(
int w, int h, int nsubsamples,
float image[])
int w, int h, int nsubsamples,
float image[])
{
ao_ispc_tasks___export<<<1,32>>>(w,h,nsubsamples,image);
cudaDeviceSynchronize();

View File

@@ -1,6 +1,6 @@
// -*- mode: c++ -*-
/*
Copyright (c) 2010-2011, Intel Corporation
Copyright (c) 2010-2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -29,13 +29,13 @@
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
Based on Syoyo Fujita's aobench: http://code.google.com/p/aobench
*/
#define NAO_SAMPLES 8
#define NAO_SAMPLES 8
#define M_PI 3.1415926535f
typedef float<3> vec;
@@ -50,7 +50,7 @@ struct Isect {
float t;
vec p;
vec n;
int hit;
int hit;
};
struct Sphere {
@@ -94,7 +94,7 @@ ray_plane_intersect(Isect &isect, Ray &ray, const Plane &plane) {
float v = dot(ray.dir, plane.n);
#if 0
cif (abs(v) < 1.0e-17)
cif (abs(v) < 1.0e-17)
return;
else {
float t = -(dot(ray.org, plane.n) + d) / v;
@@ -141,7 +141,7 @@ ray_sphere_intersect(Isect &isect, Ray &ray, const Sphere &sphere) {
}
}
#else
cif (D <=0.0f)
cif (D <=0.0f)
return;
float t = -B - sqrt(D);
@@ -220,8 +220,8 @@ ambient_occlusion(Isect &isect, const Plane &plane, const Sphere spheres[3],
occIsect.hit = 0;
for (uniform int snum = 0; snum < 3; ++snum)
ray_sphere_intersect(occIsect, ray, spheres[snum]);
ray_plane_intersect (occIsect, ray, plane);
ray_sphere_intersect(occIsect, ray, spheres[snum]);
ray_plane_intersect (occIsect, ray, plane);
if (occIsect.hit) occlusion += 1.0;
}
@@ -233,10 +233,10 @@ ambient_occlusion(Isect &isect, const Plane &plane, const Sphere spheres[3],
static inline void ao_tiles(
uniform int x0, uniform int x1,
uniform int y0, uniform int y1,
uniform int y0, uniform int y1,
uniform int w, uniform int h,
uniform int nsubsamples,
uniform float image[])
uniform int nsubsamples,
uniform float image[])
{
const Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } };
const Sphere spheres[3] = {
@@ -306,7 +306,7 @@ static inline void ao_tiles(
#define TILEX max(64,programCount*2)
#define TILEY 4
export void ao_ispc(uniform int w, uniform int h, uniform int nsubsamples,
export void ao_ispc(uniform int w, uniform int h, uniform int nsubsamples,
uniform float image[]) {
const uniform int x0 = 0;
const uniform int x1 = w;
@@ -315,8 +315,8 @@ export void ao_ispc(uniform int w, uniform int h, uniform int nsubsamples,
ao_tiles(x0,x1,y0,y1, w, h, nsubsamples, image);
}
void task ao_task(uniform int width, uniform int height,
uniform int nsubsamples, uniform float image[])
void task ao_task(uniform int width, uniform int height,
uniform int nsubsamples, uniform float image[])
{
if (taskIndex0 >= taskCount0) return;
if (taskIndex1 >= taskCount1) return;
@@ -330,8 +330,8 @@ void task ao_task(uniform int width, uniform int height,
}
export void ao_ispc_tasks(uniform int w, uniform int h, uniform int nsubsamples,
uniform float image[])
export void ao_ispc_tasks(uniform int w, uniform int h, uniform int nsubsamples,
uniform float image[])
{
const uniform int ntilex = (w+TILEX-1)/TILEX;
const uniform int ntiley = (h+TILEY-1)/TILEY;

View File

@@ -1,5 +1,5 @@
/*
Copyright (c) 2011, Intel Corporation
Copyright (c) 2011-2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -28,7 +28,7 @@
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef _MSC_VER
@@ -145,13 +145,13 @@ CreateInputDataFromFile(const char *path) {
}
// Load data chunk and update pointers
input->chunk = (uint8_t *)lAlignedMalloc(input->header.inputDataChunkSize,
input->chunk = (uint8_t *)lAlignedMalloc(input->header.inputDataChunkSize,
ALIGNMENT_BYTES);
if (fread(input->chunk, input->header.inputDataChunkSize, 1, in) != 1) {
fprintf(stderr, "Preumature EOF reading file \"%s\"\n", path);
return NULL;
}
input->arrays.zBuffer =
(float *)&input->chunk[input->header.inputDataArrayOffsets[idaZBuffer]];
input->arrays.normalEncoded_x =
@@ -199,21 +199,21 @@ void WriteFrame(const char *filename, const InputData *input,
const Framebuffer &framebuffer) {
// Deswizzle and copy to RGBA output
// Doesn't need to be fast... only happens once
size_t imageBytes = 3 * input->header.framebufferWidth *
size_t imageBytes = 3 * input->header.framebufferWidth *
input->header.framebufferHeight;
uint8_t* framebufferAOS = (uint8_t *)lAlignedMalloc(imageBytes, ALIGNMENT_BYTES);
memset(framebufferAOS, 0, imageBytes);
for (int i = 0; i < input->header.framebufferWidth *
for (int i = 0; i < input->header.framebufferWidth *
input->header.framebufferHeight; ++i) {
framebufferAOS[3 * i + 0] = framebuffer.r[i];
framebufferAOS[3 * i + 1] = framebuffer.g[i];
framebufferAOS[3 * i + 2] = framebuffer.b[i];
}
// Write out simple PPM file
FILE *out = fopen(filename, "wb");
fprintf(out, "P6 %d %d 255\n", input->header.framebufferWidth,
fprintf(out, "P6 %d %d 255\n", input->header.framebufferWidth,
input->header.framebufferHeight);
fwrite(framebufferAOS, imageBytes, 1, out);
fclose(out);

View File

@@ -1,5 +1,5 @@
/*
Copyright (c) 2011, Intel Corporation
Copyright (c) 2011-2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -28,7 +28,7 @@
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef DEFERRED_H

View File

@@ -1,5 +1,5 @@
/*
Copyright (c) 2011, Intel Corporation
Copyright (c) 2011-2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -28,7 +28,7 @@
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "deferred.h"
@@ -147,7 +147,7 @@ ComputeZBoundsRow(int tileY, int tileWidth, int tileHeight,
float minZ, maxZ;
ComputeZBounds(tileX * tileWidth, tileX * tileWidth + tileWidth,
tileY * tileHeight, tileY * tileHeight + tileHeight,
zBuffer, gBufferWidth, cameraProj_33, cameraProj_43,
zBuffer, gBufferWidth, cameraProj_33, cameraProj_43,
cameraNear, cameraFar, &minZ, &maxZ);
minZArray[tileX] = minZ;
maxZArray[tileX] = maxZ;
@@ -167,7 +167,7 @@ public:
{
mNumTilesX = gBufferWidth / mTileWidth;
mNumTilesY = gBufferHeight / mTileHeight;
// Allocate arrays
mMinZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16);
mMaxZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16);
@@ -213,7 +213,7 @@ public:
float minZ = mMinZArrays[srcLevel][(srcY) * srcTilesX + (srcX)];
float maxZ = mMaxZArrays[srcLevel][(srcY) * srcTilesX + (srcX)];
if (srcX + 1 < srcTilesX) {
minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY) * srcTilesX +
minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY) * srcTilesX +
(srcX + 1)]);
maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY) * srcTilesX +
(srcX + 1)]);
@@ -243,7 +243,7 @@ public:
lAlignedFree(mMaxZArrays[i]);
}
lAlignedFree(mMinZArrays);
lAlignedFree(mMaxZArrays);
lAlignedFree(mMaxZArrays);
}
int Levels() const { return mLevels; }
@@ -277,9 +277,9 @@ private:
static MinMaxZTree *gMinMaxZTree = 0;
void InitDynamicC(InputData *input) {
gMinMaxZTree =
gMinMaxZTree =
new MinMaxZTree(MIN_TILE_WIDTH, MIN_TILE_HEIGHT, DYNAMIC_TREE_LEVELS,
input->header.framebufferWidth,
input->header.framebufferWidth,
input->header.framebufferHeight);
}
@@ -311,7 +311,7 @@ SplitTileMinMax(
{
float gBufferScale_x = 0.5f * (float)gBufferWidth;
float gBufferScale_y = 0.5f * (float)gBufferHeight;
float frustumPlanes_xy[2] = { -(cameraProj_11 * gBufferScale_x),
(cameraProj_22 * gBufferScale_y) };
float frustumPlanes_z[2] = { tileMidX - gBufferScale_x,
@@ -319,7 +319,7 @@ SplitTileMinMax(
for (int i = 0; i < 2; ++i) {
// Normalize
float norm = 1.f / sqrtf(frustumPlanes_xy[i] * frustumPlanes_xy[i] +
float norm = 1.f / sqrtf(frustumPlanes_xy[i] * frustumPlanes_xy[i] +
frustumPlanes_z[i] * frustumPlanes_z[i]);
frustumPlanes_xy[i] *= norm;
frustumPlanes_z[i] *= norm;
@@ -340,23 +340,23 @@ SplitTileMinMax(
float light_positionView_z = light_positionView_z_array[lightIndex];
float light_attenuationEnd = light_attenuationEnd_array[lightIndex];
float light_attenuationEndNeg = -light_attenuationEnd;
// Test lights again against subtile z bounds
bool inFrustum[4];
inFrustum[0] = (light_positionView_z - subtileMinZ[0] >= light_attenuationEndNeg) &&
(subtileMaxZ[0] - light_positionView_z >= light_attenuationEndNeg);
inFrustum[1] = (light_positionView_z - subtileMinZ[1] >= light_attenuationEndNeg) &&
inFrustum[1] = (light_positionView_z - subtileMinZ[1] >= light_attenuationEndNeg) &&
(subtileMaxZ[1] - light_positionView_z >= light_attenuationEndNeg);
inFrustum[2] = (light_positionView_z - subtileMinZ[2] >= light_attenuationEndNeg) &&
inFrustum[2] = (light_positionView_z - subtileMinZ[2] >= light_attenuationEndNeg) &&
(subtileMaxZ[2] - light_positionView_z >= light_attenuationEndNeg);
inFrustum[3] = (light_positionView_z - subtileMinZ[3] >= light_attenuationEndNeg) &&
inFrustum[3] = (light_positionView_z - subtileMinZ[3] >= light_attenuationEndNeg) &&
(subtileMaxZ[3] - light_positionView_z >= light_attenuationEndNeg);
float dx = light_positionView_z * frustumPlanes_z[0] +
float dx = light_positionView_z * frustumPlanes_z[0] +
light_positionView_x * frustumPlanes_xy[0];
float dy = light_positionView_z * frustumPlanes_z[1] +
light_positionView_y * frustumPlanes_xy[1];
if (fabsf(dx) > light_attenuationEnd) {
bool positiveX = dx > 0.0f;
inFrustum[0] = inFrustum[0] && positiveX; // 00 subtile
@@ -423,13 +423,13 @@ half_to_float_fast(uint16_t h) {
uint32_t hm = h & (int32_t)0x03FFu; // Pick off mantissa bits
// sign
uint32_t xs = ((uint32_t) hs) << 16;
uint32_t xs = ((uint32_t) hs) << 16;
// Exponent: unbias the halfp, then bias the single
int32_t xes = ((int32_t) (he >> 10)) - 15 + 127;
int32_t xes = ((int32_t) (he >> 10)) - 15 + 127;
// Exponent
uint32_t xe = (uint32_t) (xes << 23);
// Mantissa
uint32_t xm = ((uint32_t) hm) << 13;
uint32_t xm = ((uint32_t) hm) << 13;
uint32_t bits = (xs | xe | xm);
float *fp = reinterpret_cast<float *>(&bits);
@@ -470,13 +470,13 @@ ShadeTileC(
} else {
float twoOverGBufferWidth = 2.0f / gBufferWidth;
float twoOverGBufferHeight = 2.0f / gBufferHeight;
for (int32_t y = tileStartY; y < tileEndY; ++y) {
float positionScreen_y = -(((0.5f + y) * twoOverGBufferHeight) - 1.f);
for (int32_t x = tileStartX; x < tileEndX; ++x) {
int32_t gBufferOffset = y * gBufferWidth + x;
// Reconstruct position and (negative) view vector from G-buffer
float surface_positionView_x, surface_positionView_y, surface_positionView_z;
float Vneg_x, Vneg_y, Vneg_z;
@@ -485,70 +485,70 @@ ShadeTileC(
// Compute screen/clip-space position
// NOTE: Mind DX11 viewport transform and pixel center!
float positionScreen_x = (0.5f + (float)(x)) *
float positionScreen_x = (0.5f + (float)(x)) *
twoOverGBufferWidth - 1.0f;
// Unproject depth buffer Z value into view space
surface_positionView_z = cameraProj_43 / (z - cameraProj_33);
surface_positionView_x = positionScreen_x * surface_positionView_z /
surface_positionView_x = positionScreen_x * surface_positionView_z /
cameraProj_11;
surface_positionView_y = positionScreen_y * surface_positionView_z /
surface_positionView_y = positionScreen_y * surface_positionView_z /
cameraProj_22;
// We actually end up with a vector pointing *at* the
// surface (i.e. the negative view vector)
normalize3(surface_positionView_x, surface_positionView_y,
normalize3(surface_positionView_x, surface_positionView_y,
surface_positionView_z, Vneg_x, Vneg_y, Vneg_z);
// Reconstruct normal from G-buffer
float surface_normal_x, surface_normal_y, surface_normal_z;
float normal_x = half_to_float_fast(inputData.normalEncoded_x[gBufferOffset]);
float normal_y = half_to_float_fast(inputData.normalEncoded_y[gBufferOffset]);
float f = (normal_x - normal_x * normal_x) + (normal_y - normal_y * normal_y);
float m = sqrtf(4.0f * f - 1.0f);
surface_normal_x = m * (4.0f * normal_x - 2.0f);
surface_normal_y = m * (4.0f * normal_y - 2.0f);
surface_normal_z = 3.0f - 8.0f * f;
// Load other G-buffer parameters
float surface_specularAmount =
float surface_specularAmount =
half_to_float_fast(inputData.specularAmount[gBufferOffset]);
float surface_specularPower =
float surface_specularPower =
half_to_float_fast(inputData.specularPower[gBufferOffset]);
float surface_albedo_x = Unorm8ToFloat32(inputData.albedo_x[gBufferOffset]);
float surface_albedo_y = Unorm8ToFloat32(inputData.albedo_y[gBufferOffset]);
float surface_albedo_z = Unorm8ToFloat32(inputData.albedo_z[gBufferOffset]);
float lit_x = 0.0f;
float lit_y = 0.0f;
float lit_z = 0.0f;
for (int32_t tileLightIndex = 0; tileLightIndex < tileNumLights;
for (int32_t tileLightIndex = 0; tileLightIndex < tileNumLights;
++tileLightIndex) {
int32_t lightIndex = tileLightIndices[tileLightIndex];
// Gather light data relevant to initial culling
float light_positionView_x =
float light_positionView_x =
inputData.lightPositionView_x[lightIndex];
float light_positionView_y =
float light_positionView_y =
inputData.lightPositionView_y[lightIndex];
float light_positionView_z =
float light_positionView_z =
inputData.lightPositionView_z[lightIndex];
float light_attenuationEnd =
float light_attenuationEnd =
inputData.lightAttenuationEnd[lightIndex];
// Compute light vector
float L_x = light_positionView_x - surface_positionView_x;
float L_y = light_positionView_y - surface_positionView_y;
float L_z = light_positionView_z - surface_positionView_z;
float distanceToLight2 = dot3(L_x, L_y, L_z, L_x, L_y, L_z);
// Clip at end of attenuation
float light_attenutaionEnd2 = light_attenuationEnd * light_attenuationEnd;
if (distanceToLight2 < light_attenutaionEnd2) {
if (distanceToLight2 < light_attenutaionEnd2) {
float distanceToLight = sqrtf(distanceToLight2);
float distanceToLightRcp = 1.f / distanceToLight;
@@ -557,12 +557,12 @@ ShadeTileC(
L_z *= distanceToLightRcp;
// Start computing brdf
float NdotL = dot3(surface_normal_x, surface_normal_y,
float NdotL = dot3(surface_normal_x, surface_normal_y,
surface_normal_z, L_x, L_y, L_z);
// Clip back facing
if (NdotL > 0.0f) {
float light_attenuationBegin =
float light_attenuationBegin =
inputData.lightAttenuationBegin[lightIndex];
// Light distance attenuation (linstep)
@@ -574,19 +574,19 @@ ShadeTileC(
float H_y = (L_y - Vneg_y);
float H_z = (L_z - Vneg_z);
normalize3(H_x, H_y, H_z, H_x, H_y, H_z);
float NdotH = dot3(surface_normal_x, surface_normal_y,
float NdotH = dot3(surface_normal_x, surface_normal_y,
surface_normal_z, H_x, H_y, H_z);
NdotH = std::max(NdotH, 0.0f);
float specular = powf(NdotH, surface_specularPower);
float specularNorm = (surface_specularPower + 2.0f) *
float specularNorm = (surface_specularPower + 2.0f) *
(1.0f / 8.0f);
float specularContrib = surface_specularAmount *
float specularContrib = surface_specularAmount *
specularNorm * specular;
float k = attenuation * NdotL * (1.0f + specularContrib);
float light_color_x = inputData.lightColor_x[lightIndex];
float light_color_y = inputData.lightColor_y[lightIndex];
float light_color_z = inputData.lightColor_z[lightIndex];
@@ -607,7 +607,7 @@ ShadeTileC(
lit_x = powf(std::min(std::max(lit_x, 0.0f), 1.0f), gamma);
lit_y = powf(std::min(std::max(lit_y, 0.0f), 1.0f), gamma);
lit_z = powf(std::min(std::max(lit_z, 0.0f), 1.0f), gamma);
framebuffer_r[gBufferOffset] = Float32ToUnorm8(lit_x);
framebuffer_g[gBufferOffset] = Float32ToUnorm8(lit_y);
framebuffer_b[gBufferOffset] = Float32ToUnorm8(lit_z);
@@ -618,11 +618,11 @@ ShadeTileC(
void
ShadeDynamicTileRecurse(InputData *input, int level, int tileX, int tileY,
int *lightIndices, int numLights,
ShadeDynamicTileRecurse(InputData *input, int level, int tileX, int tileY,
int *lightIndices, int numLights,
Framebuffer *framebuffer) {
const MinMaxZTree *minMaxZTree = gMinMaxZTree;
// If we few enough lights or this is the base case (last level), shade
// this full tile directly
if (level == 0 || numLights < DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE) {
@@ -632,18 +632,18 @@ ShadeDynamicTileRecurse(InputData *input, int level, int tileX, int tileY,
int startY = tileY * height;
int endX = std::min(input->header.framebufferWidth, startX + width);
int endY = std::min(input->header.framebufferHeight, startY + height);
// Skip entirely offscreen tiles
if (endX > startX && endY > startY) {
ShadeTileC(startX, endX, startY, endY,
input->header.framebufferWidth, input->header.framebufferHeight,
input->arrays,
input->header.cameraProj[0][0], input->header.cameraProj[1][1],
input->header.cameraProj[0][0], input->header.cameraProj[1][1],
input->header.cameraProj[2][2], input->header.cameraProj[3][2],
lightIndices, numLights, VISUALIZE_LIGHT_COUNT,
lightIndices, numLights, VISUALIZE_LIGHT_COUNT,
framebuffer->r, framebuffer->g, framebuffer->b);
}
}
}
else {
// Otherwise, subdivide and 4-way recurse using X and Y splitting planes
// Move down a level in the tree
@@ -666,9 +666,9 @@ ShadeDynamicTileRecurse(InputData *input, int level, int tileX, int tileY,
// NOTE: Order is 00, 10, 01, 11
// Set defaults up to cull all lights if the tile doesn't exist (offscreen)
float minZ[4] = {input->header.cameraFar, input->header.cameraFar,
float minZ[4] = {input->header.cameraFar, input->header.cameraFar,
input->header.cameraFar, input->header.cameraFar};
float maxZ[4] = {input->header.cameraNear, input->header.cameraNear,
float maxZ[4] = {input->header.cameraNear, input->header.cameraNear,
input->header.cameraNear, input->header.cameraNear};
minZ[0] = minMaxZTree->MinZ(level, tileX, tileY);
@@ -688,7 +688,7 @@ ShadeDynamicTileRecurse(InputData *input, int level, int tileX, int tileY,
// Cull lights into subtile lists
#ifdef ISPC_IS_WINDOWS
__declspec(align(ALIGNMENT_BYTES))
__declspec(align(ALIGNMENT_BYTES))
#endif
int subtileLightIndices[4][MAX_LIGHTS]
#ifndef ISPC_IS_WINDOWS
@@ -697,15 +697,15 @@ ShadeDynamicTileRecurse(InputData *input, int level, int tileX, int tileY,
;
int subtileNumLights[4];
SplitTileMinMax(midX, midY, minZ, maxZ,
input->header.framebufferWidth, input->header.framebufferHeight,
input->header.framebufferWidth, input->header.framebufferHeight,
input->header.cameraProj[0][0], input->header.cameraProj[1][1],
lightIndices, numLights, input->arrays.lightPositionView_x,
input->arrays.lightPositionView_y, input->arrays.lightPositionView_z,
lightIndices, numLights, input->arrays.lightPositionView_x,
input->arrays.lightPositionView_y, input->arrays.lightPositionView_z,
input->arrays.lightAttenuationEnd,
subtileLightIndices[0], MAX_LIGHTS, subtileNumLights);
// Recurse into subtiles
ShadeDynamicTileRecurse(input, level, tileX , tileY,
ShadeDynamicTileRecurse(input, level, tileX , tileY,
subtileLightIndices[0], subtileNumLights[0],
framebuffer);
ShadeDynamicTileRecurse(input, level, tileX + 1, tileY,
@@ -744,7 +744,7 @@ IntersectLightsWithTileMinMax(
{
float gBufferScale_x = 0.5f * (float)gBufferWidth;
float gBufferScale_y = 0.5f * (float)gBufferHeight;
float frustumPlanes_xy[4];
float frustumPlanes_z[4];
@@ -753,14 +753,14 @@ IntersectLightsWithTileMinMax(
(cameraProj_11 * gBufferScale_x),
(cameraProj_22 * gBufferScale_y),
-(cameraProj_22 * gBufferScale_y) };
float frustumPlanes_z_v[4] = { tileEndX - gBufferScale_x,
-tileStartX + gBufferScale_x,
tileEndY - gBufferScale_y,
-tileStartY + gBufferScale_y };
for (int i = 0; i < 4; ++i) {
float norm = 1.f / sqrtf(frustumPlanes_xy_v[i] * frustumPlanes_xy_v[i] +
float norm = 1.f / sqrtf(frustumPlanes_xy_v[i] * frustumPlanes_xy_v[i] +
frustumPlanes_z_v[i] * frustumPlanes_z_v[i]);
frustumPlanes_xy_v[i] *= norm;
frustumPlanes_z_v[i] *= norm;
@@ -781,29 +781,29 @@ IntersectLightsWithTileMinMax(
d = maxZ - light_positionView_z;
inFrustum = inFrustum && (d >= light_attenuationEndNeg);
if (!inFrustum)
if (!inFrustum)
continue;
float light_positionView_x = light_positionView_x_array[lightIndex];
float light_positionView_y = light_positionView_y_array[lightIndex];
d = light_positionView_z * frustumPlanes_z[0] +
d = light_positionView_z * frustumPlanes_z[0] +
light_positionView_x * frustumPlanes_xy[0];
inFrustum = inFrustum && (d >= light_attenuationEndNeg);
d = light_positionView_z * frustumPlanes_z[1] +
d = light_positionView_z * frustumPlanes_z[1] +
light_positionView_x * frustumPlanes_xy[1];
inFrustum = inFrustum && (d >= light_attenuationEndNeg);
d = light_positionView_z * frustumPlanes_z[2] +
d = light_positionView_z * frustumPlanes_z[2] +
light_positionView_y * frustumPlanes_xy[2];
inFrustum = inFrustum && (d >= light_attenuationEndNeg);
d = light_positionView_z * frustumPlanes_z[3] +
d = light_positionView_z * frustumPlanes_z[3] +
light_positionView_y * frustumPlanes_xy[3];
inFrustum = inFrustum && (d >= light_attenuationEndNeg);
// Pack and store intersecting lights
if (inFrustum)
tileLightIndices[tileNumLights++] = lightIndex;
@@ -831,7 +831,7 @@ ShadeDynamicTile(InputData *input, int level, int tileX, int tileY,
// This is a root tile, so first do a full 6-plane cull
#ifdef ISPC_IS_WINDOWS
__declspec(align(ALIGNMENT_BYTES))
__declspec(align(ALIGNMENT_BYTES))
#endif
int lightIndices[MAX_LIGHTS]
#ifndef ISPC_IS_WINDOWS
@@ -842,12 +842,12 @@ ShadeDynamicTile(InputData *input, int level, int tileX, int tileY,
startX, endX, startY, endY, minZ, maxZ,
input->header.framebufferWidth, input->header.framebufferHeight,
input->header.cameraProj[0][0], input->header.cameraProj[1][1],
MAX_LIGHTS, input->arrays.lightPositionView_x,
input->arrays.lightPositionView_y, input->arrays.lightPositionView_z,
MAX_LIGHTS, input->arrays.lightPositionView_x,
input->arrays.lightPositionView_y, input->arrays.lightPositionView_z,
input->arrays.lightAttenuationEnd, lightIndices);
// Now kick off the recursive process for this tile
ShadeDynamicTileRecurse(input, level, tileX, tileY, lightIndices,
ShadeDynamicTileRecurse(input, level, tileX, tileY, lightIndices,
numLights, framebuffer);
}
@@ -856,10 +856,10 @@ void
DispatchDynamicC(InputData *input, Framebuffer *framebuffer)
{
MinMaxZTree *minMaxZTree = gMinMaxZTree;
// Update min/max Z tree
minMaxZTree->Update(input->arrays.zBuffer, input->header.framebufferWidth,
input->header.cameraProj[2][2], input->header.cameraProj[3][2],
input->header.cameraProj[2][2], input->header.cameraProj[3][2],
input->header.cameraNear, input->header.cameraFar);
int rootLevel = minMaxZTree->Levels() - 1;

View File

@@ -1,5 +1,5 @@
/*
Copyright (c) 2011, Intel Corporation
Copyright (c) 2011-2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -28,7 +28,7 @@
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef __cilk
@@ -104,7 +104,7 @@ public:
{
mNumTilesX = gBufferWidth / mTileWidth;
mNumTilesY = gBufferHeight / mTileHeight;
// Allocate arrays
mMinZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16);
mMaxZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16);
@@ -155,7 +155,7 @@ public:
float minZ = mMinZArrays[srcLevel][(srcY) * srcTilesX + (srcX)];
float maxZ = mMaxZArrays[srcLevel][(srcY) * srcTilesX + (srcX)];
if (srcX + 1 < srcTilesX) {
minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY) * srcTilesX +
minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY) * srcTilesX +
(srcX + 1)]);
maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY) * srcTilesX +
(srcX + 1)]);
@@ -185,7 +185,7 @@ public:
lAlignedFree(mMaxZArrays[i]);
}
lAlignedFree(mMinZArrays);
lAlignedFree(mMaxZArrays);
lAlignedFree(mMaxZArrays);
}
int Levels() const { return mLevels; }
@@ -219,19 +219,19 @@ private:
static MinMaxZTreeCilk *gMinMaxZTreeCilk = 0;
void InitDynamicCilk(InputData *input) {
gMinMaxZTreeCilk =
gMinMaxZTreeCilk =
new MinMaxZTreeCilk(MIN_TILE_WIDTH, MIN_TILE_HEIGHT, DYNAMIC_TREE_LEVELS,
input->header.framebufferWidth,
input->header.framebufferWidth,
input->header.framebufferHeight);
}
static void
ShadeDynamicTileRecurse(InputData *input, int level, int tileX, int tileY,
int *lightIndices, int numLights,
ShadeDynamicTileRecurse(InputData *input, int level, int tileX, int tileY,
int *lightIndices, int numLights,
Framebuffer *framebuffer) {
const MinMaxZTreeCilk *minMaxZTree = gMinMaxZTreeCilk;
// If we few enough lights or this is the base case (last level), shade
// this full tile directly
if (level == 0 || numLights < DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE) {
@@ -241,19 +241,19 @@ ShadeDynamicTileRecurse(InputData *input, int level, int tileX, int tileY,
int startY = tileY * height;
int endX = std::min(input->header.framebufferWidth, startX + width);
int endY = std::min(input->header.framebufferHeight, startY + height);
// Skip entirely offscreen tiles
if (endX > startX && endY > startY) {
ispc::ShadeTile(
startX, endX, startY, endY,
input->header.framebufferWidth, input->header.framebufferHeight,
&input->arrays,
input->header.cameraProj[0][0], input->header.cameraProj[1][1],
input->header.cameraProj[0][0], input->header.cameraProj[1][1],
input->header.cameraProj[2][2], input->header.cameraProj[3][2],
lightIndices, numLights, VISUALIZE_LIGHT_COUNT,
lightIndices, numLights, VISUALIZE_LIGHT_COUNT,
framebuffer->r, framebuffer->g, framebuffer->b);
}
}
}
else {
// Otherwise, subdivide and 4-way recurse using X and Y splitting planes
// Move down a level in the tree
@@ -276,9 +276,9 @@ ShadeDynamicTileRecurse(InputData *input, int level, int tileX, int tileY,
// NOTE: Order is 00, 10, 01, 11
// Set defaults up to cull all lights if the tile doesn't exist (offscreen)
float minZ[4] = {input->header.cameraFar, input->header.cameraFar,
float minZ[4] = {input->header.cameraFar, input->header.cameraFar,
input->header.cameraFar, input->header.cameraFar};
float maxZ[4] = {input->header.cameraNear, input->header.cameraNear,
float maxZ[4] = {input->header.cameraNear, input->header.cameraNear,
input->header.cameraNear, input->header.cameraNear};
minZ[0] = minMaxZTree->MinZ(level, tileX, tileY);
@@ -298,7 +298,7 @@ ShadeDynamicTileRecurse(InputData *input, int level, int tileX, int tileY,
// Cull lights into subtile lists
#ifdef ISPC_IS_WINDOWS
__declspec(align(ALIGNMENT_BYTES))
__declspec(align(ALIGNMENT_BYTES))
#endif
int subtileLightIndices[4][MAX_LIGHTS]
#ifndef ISPC_IS_WINDOWS
@@ -307,15 +307,15 @@ ShadeDynamicTileRecurse(InputData *input, int level, int tileX, int tileY,
;
int subtileNumLights[4];
ispc::SplitTileMinMax(midX, midY, minZ, maxZ,
input->header.framebufferWidth, input->header.framebufferHeight,
input->header.framebufferWidth, input->header.framebufferHeight,
input->header.cameraProj[0][0], input->header.cameraProj[1][1],
lightIndices, numLights, input->arrays.lightPositionView_x,
input->arrays.lightPositionView_y, input->arrays.lightPositionView_z,
lightIndices, numLights, input->arrays.lightPositionView_x,
input->arrays.lightPositionView_y, input->arrays.lightPositionView_z,
input->arrays.lightAttenuationEnd,
subtileLightIndices[0], MAX_LIGHTS, subtileNumLights);
// Recurse into subtiles
_Cilk_spawn ShadeDynamicTileRecurse(input, level, tileX , tileY,
_Cilk_spawn ShadeDynamicTileRecurse(input, level, tileX , tileY,
subtileLightIndices[0], subtileNumLights[0],
framebuffer);
_Cilk_spawn ShadeDynamicTileRecurse(input, level, tileX + 1, tileY,
@@ -349,7 +349,7 @@ ShadeDynamicTile(InputData *input, int level, int tileX, int tileY,
// This is a root tile, so first do a full 6-plane cull
#ifdef ISPC_IS_WINDOWS
__declspec(align(ALIGNMENT_BYTES))
__declspec(align(ALIGNMENT_BYTES))
#endif
int lightIndices[MAX_LIGHTS]
#ifndef ISPC_IS_WINDOWS
@@ -360,12 +360,12 @@ ShadeDynamicTile(InputData *input, int level, int tileX, int tileY,
startX, endX, startY, endY, minZ, maxZ,
input->header.framebufferWidth, input->header.framebufferHeight,
input->header.cameraProj[0][0], input->header.cameraProj[1][1],
MAX_LIGHTS, input->arrays.lightPositionView_x,
input->arrays.lightPositionView_y, input->arrays.lightPositionView_z,
MAX_LIGHTS, input->arrays.lightPositionView_x,
input->arrays.lightPositionView_y, input->arrays.lightPositionView_z,
input->arrays.lightAttenuationEnd, lightIndices);
// Now kick off the recursive process for this tile
ShadeDynamicTileRecurse(input, level, tileX, tileY, lightIndices,
ShadeDynamicTileRecurse(input, level, tileX, tileY, lightIndices,
numLights, framebuffer);
}
@@ -374,10 +374,10 @@ void
DispatchDynamicCilk(InputData *input, Framebuffer *framebuffer)
{
MinMaxZTreeCilk *minMaxZTree = gMinMaxZTreeCilk;
// Update min/max Z tree
minMaxZTree->Update(input->arrays.zBuffer, input->header.framebufferWidth,
input->header.cameraProj[2][2], input->header.cameraProj[3][2],
input->header.cameraProj[2][2], input->header.cameraProj[3][2],
input->header.cameraNear, input->header.cameraFar);
// Launch the "root" tiles. Ideally these should at least fill the

View File

@@ -1,5 +1,5 @@
/*
Copyright (c) 2010-2011, Intel Corporation
Copyright (c) 2010-2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -28,7 +28,7 @@
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
@@ -46,7 +46,7 @@
#define int16 short
#define int8 char
__device__ static inline float clamp(float v, float low, float high)
__device__ static inline float clamp(float v, float low, float high)
{
return min(max(v, low), high);
}
@@ -122,8 +122,8 @@ struct Uniform
const int2 idx = get_chunk(i);
return __shfl(data[idx.x], idx.y);
}
__device__ inline void set(const bool active, const int i, T value)
__device__ inline void set(const bool active, const int i, T value)
{
const int2 idx = get_chunk(i);
const int chunkIdx = idx.x;
@@ -160,9 +160,9 @@ struct Uniform
{
return data[i];
}
__device__ inline T* get_ptr(const int i) {return &data[i]; }
__device__ inline void set(const bool active, const int i, T value)
__device__ inline void set(const bool active, const int i, T value)
{
if (active)
data[i] = value;
@@ -185,8 +185,8 @@ struct Uniform
{
return shdata[i];
}
__device__ inline void set(const bool active, const int i, T value)
__device__ inline void set(const bool active, const int i, T value)
{
if (active)
shdata[i] = value;
@@ -264,7 +264,7 @@ static __device__ __forceinline__ int2 warpBinExclusiveScan(const bool p)
const int b = __ballot(p);
return make_int2(__popc(b), __popc(b & lanemask_lt()));
}
__device__ static inline
__device__ static inline
int packed_store_active(bool active, int* ptr, int value)
{
const int2 res = warpBinExclusiveScan(active);
@@ -358,7 +358,7 @@ IntersectLightsWithTileMinMax(
{
float gBufferScale_x = 0.5f * (float)gBufferWidth;
float gBufferScale_y = 0.5f * (float)gBufferHeight;
float frustumPlanes_xy[4] = {
-(cameraProj_11 * gBufferScale_x),
(cameraProj_11 * gBufferScale_x),
@@ -371,7 +371,7 @@ IntersectLightsWithTileMinMax(
-tileStartY + gBufferScale_y };
for ( int i = 0; i < 4; ++i) {
float norm = rsqrt(frustumPlanes_xy[i] * frustumPlanes_xy[i] +
float norm = rsqrt(frustumPlanes_xy[i] * frustumPlanes_xy[i] +
frustumPlanes_z[i] * frustumPlanes_z[i]);
frustumPlanes_xy[i] *= norm;
frustumPlanes_z[i] *= norm;
@@ -393,32 +393,32 @@ IntersectLightsWithTileMinMax(
d = maxZ - light_positionView_z;
inFrustum = inFrustum && (d >= light_attenuationEndNeg);
// This seems better than cif(!inFrustum) ccontinue; here since we
// don't actually need to mask the rest of this function - this is
// just a greedy early-out. Could also structure all of this as
// nested if() statements, but this a bit easier to read
if (__ballot(inFrustum) > 0)
if (__ballot(inFrustum) > 0)
{
float light_positionView_x = light_positionView_x_array[lightIndex];
float light_positionView_y = light_positionView_y_array[lightIndex];
d = light_positionView_z * frustumPlanes_z[0] +
d = light_positionView_z * frustumPlanes_z[0] +
light_positionView_x * frustumPlanes_xy[0];
inFrustum = inFrustum && (d >= light_attenuationEndNeg);
d = light_positionView_z * frustumPlanes_z[1] +
d = light_positionView_z * frustumPlanes_z[1] +
light_positionView_x * frustumPlanes_xy[1];
inFrustum = inFrustum && (d >= light_attenuationEndNeg);
d = light_positionView_z * frustumPlanes_z[2] +
d = light_positionView_z * frustumPlanes_z[2] +
light_positionView_y * frustumPlanes_xy[2];
inFrustum = inFrustum && (d >= light_attenuationEndNeg);
d = light_positionView_z * frustumPlanes_z[3] +
d = light_positionView_z * frustumPlanes_z[3] +
light_positionView_y * frustumPlanes_xy[3];
inFrustum = inFrustum && (d >= light_attenuationEndNeg);
// Pack and store intersecting lights
const bool active = inFrustum && lightIndex < numLights;
#if 0
@@ -472,7 +472,7 @@ IntersectLightsWithTile(
int32 tileNumLights = IntersectLightsWithTileMinMax(
tileStartX, tileEndX, tileStartY, tileEndY, minZ, maxZ,
gBufferWidth, gBufferHeight, cameraProj_11, cameraProj_22,
MAX_LIGHTS, light_positionView_x_array, light_positionView_y_array,
MAX_LIGHTS, light_positionView_x_array, light_positionView_y_array,
light_positionView_z_array, light_attenuationEnd_array,
tileLightIndices);
@@ -505,7 +505,7 @@ ShadeTile(
unsigned int8 c = (unsigned int8)(min(tileNumLights << 2, 255));
for ( int32 y = tileStartY; y < tileEndY; ++y) {
for ( int xb = tileStartX ; xb < tileEndX; xb += programCount)
{
{
const int x = xb + programIndex;
if (x >= tileEndX) continue;
int32 framebufferIndex = (y * gBufferWidth + x);
@@ -517,16 +517,16 @@ ShadeTile(
} else {
float twoOverGBufferWidth = 2.0f / gBufferWidth;
float twoOverGBufferHeight = 2.0f / gBufferHeight;
for ( int32 y = tileStartY; y < tileEndY; ++y) {
float positionScreen_y = -(((0.5f + y) * twoOverGBufferHeight) - 1.f);
for ( int xb = tileStartX ; xb < tileEndX; xb += programCount)
{
{
const int x = xb + programIndex;
// if (x >= tileEndX) break;
int32 gBufferOffset = y * gBufferWidth + x;
// Reconstruct position and (negative) view vector from G-buffer
float surface_positionView_x, surface_positionView_y, surface_positionView_z;
float Vneg_x, Vneg_y, Vneg_z;
@@ -535,19 +535,19 @@ ShadeTile(
// Compute screen/clip-space position
// NOTE: Mind DX11 viewport transform and pixel center!
float positionScreen_x = (0.5f + (float)(x)) *
float positionScreen_x = (0.5f + (float)(x)) *
twoOverGBufferWidth - 1.0f;
// Unproject depth buffer Z value into view space
surface_positionView_z = cameraProj_43 / (z - cameraProj_33);
surface_positionView_x = positionScreen_x * surface_positionView_z /
surface_positionView_x = positionScreen_x * surface_positionView_z /
cameraProj_11;
surface_positionView_y = positionScreen_y * surface_positionView_z /
surface_positionView_y = positionScreen_y * surface_positionView_z /
cameraProj_22;
// We actually end up with a vector pointing *at* the
// surface (i.e. the negative view vector)
normalize3(surface_positionView_x, surface_positionView_y,
normalize3(surface_positionView_x, surface_positionView_y,
surface_positionView_z, Vneg_x, Vneg_y, Vneg_z);
// Reconstruct normal from G-buffer
@@ -556,51 +556,51 @@ ShadeTile(
float normal_x = __half2float(inputData.normalEncoded_x[gBufferOffset]);
float normal_y = __half2float(inputData.normalEncoded_y[gBufferOffset]);
asm("// half2float //");
float f = (normal_x - normal_x * normal_x) + (normal_y - normal_y * normal_y);
float m = sqrt(4.0f * f - 1.0f);
surface_normal_x = m * (4.0f * normal_x - 2.0f);
surface_normal_y = m * (4.0f * normal_y - 2.0f);
surface_normal_z = 3.0f - 8.0f * f;
// Load other G-buffer parameters
float surface_specularAmount =
float surface_specularAmount =
__half2float(inputData.specularAmount[gBufferOffset]);
float surface_specularPower =
float surface_specularPower =
__half2float(inputData.specularPower[gBufferOffset]);
float surface_albedo_x = Unorm8ToFloat32(inputData.albedo_x[gBufferOffset]);
float surface_albedo_y = Unorm8ToFloat32(inputData.albedo_y[gBufferOffset]);
float surface_albedo_z = Unorm8ToFloat32(inputData.albedo_z[gBufferOffset]);
float lit_x = 0.0f;
float lit_y = 0.0f;
float lit_z = 0.0f;
for ( int32 tileLightIndex = 0; tileLightIndex < tileNumLights;
for ( int32 tileLightIndex = 0; tileLightIndex < tileNumLights;
++tileLightIndex) {
int32 lightIndex = tileLightIndices.get(tileLightIndex);
// Gather light data relevant to initial culling
float light_positionView_x =
float light_positionView_x =
__ldg(&inputData.lightPositionView_x[lightIndex]);
float light_positionView_y =
float light_positionView_y =
__ldg(&inputData.lightPositionView_y[lightIndex]);
float light_positionView_z =
float light_positionView_z =
__ldg(&inputData.lightPositionView_z[lightIndex]);
float light_attenuationEnd =
float light_attenuationEnd =
__ldg(&inputData.lightAttenuationEnd[lightIndex]);
// Compute light vector
float L_x = light_positionView_x - surface_positionView_x;
float L_y = light_positionView_y - surface_positionView_y;
float L_z = light_positionView_z - surface_positionView_z;
float distanceToLight2 = dot3(L_x, L_y, L_z, L_x, L_y, L_z);
// Clip at end of attenuation
float light_attenutaionEnd2 = light_attenuationEnd * light_attenuationEnd;
if (distanceToLight2 < light_attenutaionEnd2) {
if (distanceToLight2 < light_attenutaionEnd2) {
float distanceToLight = sqrt(distanceToLight2);
// HLSL "rcp" is allowed to be fairly inaccurate
@@ -610,12 +610,12 @@ ShadeTile(
L_z *= distanceToLightRcp;
// Start computing brdf
float NdotL = dot3(surface_normal_x, surface_normal_y,
float NdotL = dot3(surface_normal_x, surface_normal_y,
surface_normal_z, L_x, L_y, L_z);
// Clip back facing
if (NdotL > 0.0f) {
float light_attenuationBegin =
float light_attenuationBegin =
inputData.lightAttenuationBegin[lightIndex];
// Light distance attenuation (linstep)
@@ -627,19 +627,19 @@ ShadeTile(
float H_y = (L_y - Vneg_y);
float H_z = (L_z - Vneg_z);
normalize3(H_x, H_y, H_z, H_x, H_y, H_z);
float NdotH = dot3(surface_normal_x, surface_normal_y,
float NdotH = dot3(surface_normal_x, surface_normal_y,
surface_normal_z, H_x, H_y, H_z);
NdotH = max(NdotH, 0.0f);
float specular = pow(NdotH, surface_specularPower);
float specularNorm = (surface_specularPower + 2.0f) *
float specularNorm = (surface_specularPower + 2.0f) *
(1.0f / 8.0f);
float specularContrib = surface_specularAmount *
float specularContrib = surface_specularAmount *
specularNorm * specular;
float k = attenuation * NdotL * (1.0f + specularContrib);
float light_color_x = inputData.lightColor_x[lightIndex];
float light_color_y = inputData.lightColor_y[lightIndex];
float light_color_z = inputData.lightColor_z[lightIndex];
@@ -663,7 +663,7 @@ ShadeTile(
lit_x = pow(clamp(lit_x, 0.0f, 1.0f), gamma);
lit_y = pow(clamp(lit_y, 0.0f, 1.0f), gamma);
lit_z = pow(clamp(lit_z, 0.0f, 1.0f), gamma);
framebuffer_r[gBufferOffset] = Float32ToUnorm8(lit_x);
framebuffer_g[gBufferOffset] = Float32ToUnorm8(lit_y);
framebuffer_b[gBufferOffset] = Float32ToUnorm8(lit_z);
@@ -707,8 +707,8 @@ RenderTile( int num_groups_x, int num_groups_y,
// Light intersection: figure out which lights illuminate this tile.
Uniform<int,MAX_LIGHTS> tileLightIndices; // Light list for the tile
#if 1
int numTileLights =
IntersectLightsWithTile(tile_start_x, tile_end_x,
int numTileLights =
IntersectLightsWithTile(tile_start_x, tile_end_x,
tile_start_y, tile_end_y,
framebufferWidth, framebufferHeight,
inputData.zBuffer,
@@ -716,9 +716,9 @@ RenderTile( int num_groups_x, int num_groups_y,
cameraProj_22, cameraProj_32,
inputHeader.cameraNear, inputHeader.cameraFar,
MAX_LIGHTS,
inputData.lightPositionView_x,
inputData.lightPositionView_y,
inputData.lightPositionView_z,
inputData.lightPositionView_x,
inputData.lightPositionView_y,
inputData.lightPositionView_z,
inputData.lightAttenuationEnd,
tileLightIndices);
@@ -726,7 +726,7 @@ RenderTile( int num_groups_x, int num_groups_y,
ShadeTile(tile_start_x, tile_end_x, tile_start_y, tile_end_y,
framebufferWidth, framebufferHeight, inputData,
cameraProj_00, cameraProj_11, cameraProj_22, cameraProj_32,
tileLightIndices, numTileLights, visualizeLightCount,
tileLightIndices, numTileLights, visualizeLightCount,
framebuffer_r, framebuffer_g, framebuffer_b);
#endif
}
@@ -745,9 +745,9 @@ RenderStatic___export( InputHeader inputHeaderPtr[],
const InputDataArrays inputData = *inputDataPtr;
int num_groups_x = (inputHeader.framebufferWidth +
int num_groups_x = (inputHeader.framebufferWidth +
MIN_TILE_WIDTH - 1) / MIN_TILE_WIDTH;
int num_groups_y = (inputHeader.framebufferHeight +
int num_groups_y = (inputHeader.framebufferHeight +
MIN_TILE_HEIGHT - 1) / MIN_TILE_HEIGHT;
int num_groups = num_groups_x * num_groups_y;

View File

@@ -1,5 +1,5 @@
/*
Copyright (c) 2010-2011, Intel Corporation
Copyright (c) 2010-2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -28,7 +28,7 @@
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "deferred.h"
@@ -142,11 +142,11 @@ ComputeZBounds(
maxZ = reduce_max(laneMaxZ);
}
#if 1
#if 1
inline
#endif
#ifndef __NVPTX__
export
export
#endif
uniform int32
IntersectLightsWithTileMinMax(
@@ -171,7 +171,7 @@ IntersectLightsWithTileMinMax(
{
uniform float gBufferScale_x = 0.5f * (float)gBufferWidth;
uniform float gBufferScale_y = 0.5f * (float)gBufferHeight;
uniform_t float frustumPlanes_xy[4] = {
-(cameraProj_11 * gBufferScale_x),
(cameraProj_11 * gBufferScale_x),
@@ -184,7 +184,7 @@ IntersectLightsWithTileMinMax(
-tileStartY + gBufferScale_y };
for (uniform int i = 0; i < 4; ++i) {
uniform_t float norm = rsqrt(frustumPlanes_xy[i] * frustumPlanes_xy[i] +
uniform_t float norm = rsqrt(frustumPlanes_xy[i] * frustumPlanes_xy[i] +
frustumPlanes_z[i] * frustumPlanes_z[i]);
frustumPlanes_xy[i] *= norm;
frustumPlanes_z[i] *= norm;
@@ -202,7 +202,7 @@ IntersectLightsWithTileMinMax(
d = maxZ - light_positionView_z;
inFrustum = inFrustum && (d >= light_attenuationEndNeg);
// This seems better than cif(!inFrustum) ccontinue; here since we
// don't actually need to mask the rest of this function - this is
// just a greedy early-out. Could also structure all of this as
@@ -211,26 +211,26 @@ IntersectLightsWithTileMinMax(
float light_positionView_x = light_positionView_x_array[lightIndex];
float light_positionView_y = light_positionView_y_array[lightIndex];
d = light_positionView_z * frustumPlanes_z[0] +
d = light_positionView_z * frustumPlanes_z[0] +
light_positionView_x * frustumPlanes_xy[0];
inFrustum = inFrustum && (d >= light_attenuationEndNeg);
d = light_positionView_z * frustumPlanes_z[1] +
d = light_positionView_z * frustumPlanes_z[1] +
light_positionView_x * frustumPlanes_xy[1];
inFrustum = inFrustum && (d >= light_attenuationEndNeg);
d = light_positionView_z * frustumPlanes_z[2] +
d = light_positionView_z * frustumPlanes_z[2] +
light_positionView_y * frustumPlanes_xy[2];
inFrustum = inFrustum && (d >= light_attenuationEndNeg);
d = light_positionView_z * frustumPlanes_z[3] +
d = light_positionView_z * frustumPlanes_z[3] +
light_positionView_y * frustumPlanes_xy[3];
inFrustum = inFrustum && (d >= light_attenuationEndNeg);
#if 0
#if 0
// Pack and store intersecting lights
cif (inFrustum) {
tileNumLights += packed_store_active(&tileLightIndices[tileNumLights],
tileNumLights += packed_store_active(&tileLightIndices[tileNumLights],
lightIndex);
}
#else
@@ -245,7 +245,7 @@ IntersectLightsWithTileMinMax(
}
#if 1
#if 1
inline
#endif
static uniform int32
@@ -277,7 +277,7 @@ IntersectLightsWithTile(
uniform int32 tileNumLights = IntersectLightsWithTileMinMax(
tileStartX, tileEndX, tileStartY, tileEndY, minZ, maxZ,
gBufferWidth, gBufferHeight, cameraProj_11, cameraProj_22,
MAX_LIGHTS, light_positionView_x_array, light_positionView_y_array,
MAX_LIGHTS, light_positionView_x_array, light_positionView_y_array,
light_positionView_z_array, light_attenuationEnd_array,
tileLightIndices);
@@ -285,7 +285,7 @@ IntersectLightsWithTile(
}
#if 1
#if 1
inline
#endif
#ifndef __NVPTX__
@@ -324,13 +324,13 @@ ShadeTile(
} else {
uniform float twoOverGBufferWidth = 2.0f / gBufferWidth;
uniform float twoOverGBufferHeight = 2.0f / gBufferHeight;
for (uniform int32 y = tileStartY; y < tileEndY; ++y) {
uniform float positionScreen_y = -(((0.5f + y) * twoOverGBufferHeight) - 1.f);
foreach (x = tileStartX ... tileEndX) {
int32 gBufferOffset = y * gBufferWidth + x;
// Reconstruct position and (negative) view vector from G-buffer
float surface_positionView_x, surface_positionView_y, surface_positionView_z;
float Vneg_x, Vneg_y, Vneg_z;
@@ -339,70 +339,70 @@ ShadeTile(
// Compute screen/clip-space position
// NOTE: Mind DX11 viewport transform and pixel center!
float positionScreen_x = (0.5f + (float)(x)) *
float positionScreen_x = (0.5f + (float)(x)) *
twoOverGBufferWidth - 1.0f;
// Unproject depth buffer Z value into view space
surface_positionView_z = cameraProj_43 / (z - cameraProj_33);
surface_positionView_x = positionScreen_x * surface_positionView_z /
surface_positionView_x = positionScreen_x * surface_positionView_z /
cameraProj_11;
surface_positionView_y = positionScreen_y * surface_positionView_z /
surface_positionView_y = positionScreen_y * surface_positionView_z /
cameraProj_22;
// We actually end up with a vector pointing *at* the
// surface (i.e. the negative view vector)
normalize3(surface_positionView_x, surface_positionView_y,
normalize3(surface_positionView_x, surface_positionView_y,
surface_positionView_z, Vneg_x, Vneg_y, Vneg_z);
// Reconstruct normal from G-buffer
float surface_normal_x, surface_normal_y, surface_normal_z;
float normal_x = half_to_float(inputData.normalEncoded_x[gBufferOffset]);
float normal_y = half_to_float(inputData.normalEncoded_y[gBufferOffset]);
float f = (normal_x - normal_x * normal_x) + (normal_y - normal_y * normal_y);
float m = sqrt(4.0f * f - 1.0f);
surface_normal_x = m * (4.0f * normal_x - 2.0f);
surface_normal_y = m * (4.0f * normal_y - 2.0f);
surface_normal_z = 3.0f - 8.0f * f;
// Load other G-buffer parameters
float surface_specularAmount =
float surface_specularAmount =
half_to_float(inputData.specularAmount[gBufferOffset]);
float surface_specularPower =
float surface_specularPower =
half_to_float(inputData.specularPower[gBufferOffset]);
float surface_albedo_x = Unorm8ToFloat32(inputData.albedo_x[gBufferOffset]);
float surface_albedo_y = Unorm8ToFloat32(inputData.albedo_y[gBufferOffset]);
float surface_albedo_z = Unorm8ToFloat32(inputData.albedo_z[gBufferOffset]);
float lit_x = 0.0f;
float lit_y = 0.0f;
float lit_z = 0.0f;
for (uniform int32 tileLightIndex = 0; tileLightIndex < tileNumLights;
for (uniform int32 tileLightIndex = 0; tileLightIndex < tileNumLights;
++tileLightIndex) {
uniform int32 lightIndex = tileLightIndices[tileLightIndex];
// Gather light data relevant to initial culling
uniform float light_positionView_x =
uniform float light_positionView_x =
inputData.lightPositionView_x[lightIndex];
uniform float light_positionView_y =
uniform float light_positionView_y =
inputData.lightPositionView_y[lightIndex];
uniform float light_positionView_z =
uniform float light_positionView_z =
inputData.lightPositionView_z[lightIndex];
uniform float light_attenuationEnd =
uniform float light_attenuationEnd =
inputData.lightAttenuationEnd[lightIndex];
// Compute light vector
float L_x = light_positionView_x - surface_positionView_x;
float L_y = light_positionView_y - surface_positionView_y;
float L_z = light_positionView_z - surface_positionView_z;
float distanceToLight2 = dot3(L_x, L_y, L_z, L_x, L_y, L_z);
// Clip at end of attenuation
float light_attenutaionEnd2 = light_attenuationEnd * light_attenuationEnd;
cif (distanceToLight2 < light_attenutaionEnd2) {
cif (distanceToLight2 < light_attenutaionEnd2) {
float distanceToLight = sqrt(distanceToLight2);
// HLSL "rcp" is allowed to be fairly inaccurate
@@ -412,12 +412,12 @@ ShadeTile(
L_z *= distanceToLightRcp;
// Start computing brdf
float NdotL = dot3(surface_normal_x, surface_normal_y,
float NdotL = dot3(surface_normal_x, surface_normal_y,
surface_normal_z, L_x, L_y, L_z);
// Clip back facing
cif (NdotL > 0.0f) {
uniform float light_attenuationBegin =
uniform float light_attenuationBegin =
inputData.lightAttenuationBegin[lightIndex];
// Light distance attenuation (linstep)
@@ -429,19 +429,19 @@ ShadeTile(
float H_y = (L_y - Vneg_y);
float H_z = (L_z - Vneg_z);
normalize3(H_x, H_y, H_z, H_x, H_y, H_z);
float NdotH = dot3(surface_normal_x, surface_normal_y,
float NdotH = dot3(surface_normal_x, surface_normal_y,
surface_normal_z, H_x, H_y, H_z);
NdotH = max(NdotH, 0.0f);
float specular = pow(NdotH, surface_specularPower);
float specularNorm = (surface_specularPower + 2.0f) *
float specularNorm = (surface_specularPower + 2.0f) *
(1.0f / 8.0f);
float specularContrib = surface_specularAmount *
float specularContrib = surface_specularAmount *
specularNorm * specular;
float k = attenuation * NdotL * (1.0f + specularContrib);
uniform float light_color_x = inputData.lightColor_x[lightIndex];
uniform float light_color_y = inputData.lightColor_y[lightIndex];
uniform float light_color_z = inputData.lightColor_z[lightIndex];
@@ -465,7 +465,7 @@ ShadeTile(
lit_x = pow(clamp(lit_x, 0.0f, 1.0f), gamma);
lit_y = pow(clamp(lit_y, 0.0f, 1.0f), gamma);
lit_z = pow(clamp(lit_z, 0.0f, 1.0f), gamma);
framebuffer_r[gBufferOffset] = Float32ToUnorm8(lit_x);
framebuffer_g[gBufferOffset] = Float32ToUnorm8(lit_y);
framebuffer_b[gBufferOffset] = Float32ToUnorm8(lit_z);
@@ -512,8 +512,8 @@ RenderTile(uniform int num_groups_x, uniform int num_groups_y,
#else /* shared memory doesn't full work... why? */
uniform int tileLightIndices[MAX_LIGHTS]; // Light list for the tile
#endif
uniform int numTileLights =
IntersectLightsWithTile(tile_start_x, tile_end_x,
uniform int numTileLights =
IntersectLightsWithTile(tile_start_x, tile_end_x,
tile_start_y, tile_end_y,
framebufferWidth, framebufferHeight,
inputData.zBuffer,
@@ -521,9 +521,9 @@ RenderTile(uniform int num_groups_x, uniform int num_groups_y,
cameraProj_22, cameraProj_32,
inputHeader.cameraNear, inputHeader.cameraFar,
MAX_LIGHTS,
inputData.lightPositionView_x,
inputData.lightPositionView_y,
inputData.lightPositionView_z,
inputData.lightPositionView_x,
inputData.lightPositionView_y,
inputData.lightPositionView_z,
inputData.lightAttenuationEnd,
tileLightIndices);
@@ -531,7 +531,7 @@ RenderTile(uniform int num_groups_x, uniform int num_groups_y,
ShadeTile(tile_start_x, tile_end_x, tile_start_y, tile_end_y,
framebufferWidth, framebufferHeight, inputData,
cameraProj_00, cameraProj_11, cameraProj_22, cameraProj_32,
tileLightIndices, numTileLights, visualizeLightCount,
tileLightIndices, numTileLights, visualizeLightCount,
framebuffer_r, framebuffer_g, framebuffer_b);
#ifdef MALLOC
delete tileLightIndices;
@@ -551,9 +551,9 @@ RenderStatic(uniform InputHeader inputHeaderPtr[],
uniform InputHeader inputHeader = *inputHeaderPtr;
uniform InputDataArrays inputData = *inputDataPtr;
uniform int num_groups_x = (inputHeader.framebufferWidth +
uniform int num_groups_x = (inputHeader.framebufferWidth +
MIN_TILE_WIDTH - 1) / MIN_TILE_WIDTH;
uniform int num_groups_y = (inputHeader.framebufferHeight +
uniform int num_groups_y = (inputHeader.framebufferHeight +
MIN_TILE_HEIGHT - 1) / MIN_TILE_HEIGHT;
uniform int num_groups = num_groups_x * num_groups_y;
@@ -627,16 +627,16 @@ SplitTileMinMax(
{
uniform float gBufferScale_x = 0.5f * (float)gBufferWidth;
uniform float gBufferScale_y = 0.5f * (float)gBufferHeight;
uniform_t float frustumPlanes_xy[2] = { -(cameraProj_11 * gBufferScale_x),
(cameraProj_22 * gBufferScale_y) };
uniform_t float frustumPlanes_z[2] = { tileMidX - gBufferScale_x,
tileMidY - gBufferScale_y };
// Normalize
uniform_t float norm[2] = { rsqrt(frustumPlanes_xy[0] * frustumPlanes_xy[0] +
uniform_t float norm[2] = { rsqrt(frustumPlanes_xy[0] * frustumPlanes_xy[0] +
frustumPlanes_z[0] * frustumPlanes_z[0]),
rsqrt(frustumPlanes_xy[1] * frustumPlanes_xy[1] +
rsqrt(frustumPlanes_xy[1] * frustumPlanes_xy[1] +
frustumPlanes_z[1] * frustumPlanes_z[1]) };
frustumPlanes_xy[0] *= norm[0];
frustumPlanes_xy[1] *= norm[1];
@@ -658,23 +658,23 @@ SplitTileMinMax(
float light_positionView_z = light_positionView_z_array[lightIndex];
float light_attenuationEnd = light_attenuationEnd_array[lightIndex];
float light_attenuationEndNeg = -light_attenuationEnd;
// Test lights again subtile z bounds
bool inFrustum[4];
inFrustum[0] = (light_positionView_z - subtileMinZ[0] >= light_attenuationEndNeg) &&
(subtileMaxZ[0] - light_positionView_z >= light_attenuationEndNeg);
inFrustum[1] = (light_positionView_z - subtileMinZ[1] >= light_attenuationEndNeg) &&
inFrustum[1] = (light_positionView_z - subtileMinZ[1] >= light_attenuationEndNeg) &&
(subtileMaxZ[1] - light_positionView_z >= light_attenuationEndNeg);
inFrustum[2] = (light_positionView_z - subtileMinZ[2] >= light_attenuationEndNeg) &&
inFrustum[2] = (light_positionView_z - subtileMinZ[2] >= light_attenuationEndNeg) &&
(subtileMaxZ[2] - light_positionView_z >= light_attenuationEndNeg);
inFrustum[3] = (light_positionView_z - subtileMinZ[3] >= light_attenuationEndNeg) &&
inFrustum[3] = (light_positionView_z - subtileMinZ[3] >= light_attenuationEndNeg) &&
(subtileMaxZ[3] - light_positionView_z >= light_attenuationEndNeg);
float dx = light_positionView_z * frustumPlanes_z[0] +
float dx = light_positionView_z * frustumPlanes_z[0] +
light_positionView_x * frustumPlanes_xy[0];
float dy = light_positionView_z * frustumPlanes_z[1] +
light_positionView_y * frustumPlanes_xy[1];
cif (abs(dx) > light_attenuationEnd) {
bool positiveX = dx > 0.0f;
inFrustum[0] = inFrustum[0] && positiveX; // 00 subtile
@@ -693,20 +693,20 @@ SplitTileMinMax(
// Pack and store intersecting lights
// TODO: Experiment with a loop here instead
cif (inFrustum[0])
subtileLightOffset[0] +=
subtileLightOffset[0] +=
packed_store_active(&subtileIndices[subtileLightOffset[0]],
lightIndex);
cif (inFrustum[1])
subtileLightOffset[1] +=
subtileLightOffset[1] +=
packed_store_active(&subtileIndices[subtileLightOffset[1]],
lightIndex);
cif (inFrustum[2])
subtileLightOffset[2] +=
packed_store_active(&subtileIndices[subtileLightOffset[2]],
subtileLightOffset[2] +=
packed_store_active(&subtileIndices[subtileLightOffset[2]],
lightIndex);
cif (inFrustum[3])
subtileLightOffset[3] +=
packed_store_active(&subtileIndices[subtileLightOffset[3]],
subtileLightOffset[3] +=
packed_store_active(&subtileIndices[subtileLightOffset[3]],
lightIndex);
}

View File

@@ -1,5 +1,5 @@
/*
Copyright (c) 2011, Intel Corporation
Copyright (c) 2011-2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -28,7 +28,7 @@
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef _MSC_VER

View File

@@ -5,8 +5,8 @@ uniform int gcd(uniform int a, uniform int b)
{
while ( a != 0 )
{
uniform int c = a;
a = b%a;
uniform int c = a;
a = b%a;
b = c;
}
return b;
@@ -32,7 +32,7 @@ int __sj(const int i, const uniform int j, const uniform int m, const uniform in
#if 0
static inline
static inline
void transpose_serial(uniform T A[], const uniform int m, const uniform int n)
{
const uniform int tmpSize = max(m,n) * programCount;
@@ -90,7 +90,7 @@ static uniform int * uniform joverb = NULL;
static uniform int * uniform iovera = NULL;
static uniform int a,b,c;
static
static
void transpose_init(const uniform int m, const uniform int n, const uniform int nTask)
{
joverb = uniform new uniform int[n];
@@ -105,14 +105,14 @@ void transpose_init(const uniform int m, const uniform int n, const uniform int
iovera[i] = i/a;
}
static
static
void transpose_finalize()
{
delete iovera;
delete joverb;
}
task
task
void transpose_step1(uniform T A[], const uniform int m, const uniform int n)
{
const uniform int n_per_task = (n + taskCount - 1)/taskCount;
@@ -140,7 +140,7 @@ void transpose_step2(uniform T A[], const uniform int m, const uniform int n)
const uniform int m_per_task = (m + taskCount - 1)/taskCount;
const uniform int mibeg = taskIndex * m_per_task;
const uniform int miend = min(mibeg + m_per_task, m);
uniform T * uniform tmp = uniform new uniform T[n*programCount];
uniform T (*uniform tmp2D)[programCount] = (uniform T (*uniform)[programCount])tmp;
@@ -161,7 +161,7 @@ void transpose_step3(uniform T A[], const uniform int m, const uniform int n)
const uniform int n_per_task = (n + taskCount - 1)/taskCount;
const uniform int nibeg = taskIndex * n_per_task;
const uniform int niend = min(nibeg + n_per_task, n);
uniform T * uniform tmp = uniform new uniform T[m];
for (uniform int j = nibeg; j < niend; j++)
@@ -176,7 +176,7 @@ void transpose_step3(uniform T A[], const uniform int m, const uniform int n)
delete tmp;
}
export
export
void transpose(uniform T A[], const uniform int m, const uniform int n)
{
#if 0
@@ -187,7 +187,7 @@ void transpose(uniform T A[], const uniform int m, const uniform int n)
launch [nTask] transpose_step1(A, m, n);
sync;
launch [nTask] transpose_step2(A, m, n);
sync;

View File

@@ -71,7 +71,7 @@ int main (int argc, char *argv[])
valsGld[i] = valsSrc[i];
}
delete keys;
ispcSetMallocHeapLimit(1024*1024*1024);
ispc::openMergeSort();
@@ -115,7 +115,7 @@ int main (int argc, char *argv[])
}
printf("\n---\n");
#endif
std::sort(keysGld, keysGld + n);

View File

@@ -30,9 +30,9 @@ int nextPowerOfTwo(int x)
__device__ static inline
int binarySearchInclusiveRanks(
const int val,
const int val,
uniform int *data,
const int L,
const int L,
int stride)
{
if (L == 0)
@@ -52,9 +52,9 @@ int binarySearchInclusiveRanks(
__device__ static inline
int binarySearchExclusiveRanks(
const int val,
uniform int *data,
const int L,
const int val,
uniform int *data,
const int L,
int stride)
{
if (L == 0)
@@ -74,9 +74,9 @@ int binarySearchExclusiveRanks(
__device__ static inline
int binarySearchInclusive(
const Key_t val,
const Key_t val,
uniform Key_t *data,
const int L,
const int L,
int stride)
{
if (L == 0)
@@ -96,9 +96,9 @@ int binarySearchInclusive(
__device__ static inline
int binarySearchExclusive(
const Key_t val,
uniform Key_t *data,
const int L,
const Key_t val,
uniform Key_t *data,
const int L,
int stride)
{
if (L == 0)
@@ -118,9 +118,9 @@ int binarySearchExclusive(
__device__ static inline
int binarySearchInclusive1(
const Key_t val,
const Key_t val,
Key_t data,
const uniform int L,
const uniform int L,
uniform int stride)
{
if (L == 0)
@@ -140,9 +140,9 @@ int binarySearchInclusive1(
__device__ static inline
int binarySearchExclusive1(
const Key_t val,
Key_t data,
const uniform int L,
const Key_t val,
Key_t data,
const uniform int L,
uniform int stride)
{
if (L == 0)
@@ -245,7 +245,7 @@ void generateSampleRanksKernel(
const uniform int blkDim = (nBlocks + taskCount - 1)/taskCount;
const uniform int blkBeg = blkIdx * blkDim;
const uniform int blkEnd = min(blkBeg + blkDim, nBlocks);
for (uniform int blk = blkBeg; blk < blkEnd; blk++)
{
const int pos = blk * programCount + programIndex;
@@ -291,8 +291,8 @@ void generateSampleRanks(
uniform int N)
{
uniform int lastSegmentElements = N % (2 * stride);
uniform int threadCount = (lastSegmentElements > stride) ?
(N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) :
uniform int threadCount = (lastSegmentElements > stride) ?
(N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) :
(N - lastSegmentElements) / (2 * SAMPLE_STRIDE);
uniform int nBlocks = iDivUp(threadCount, SAMPLE_STRIDE);
@@ -304,7 +304,7 @@ void generateSampleRanks(
////////////////////////////////////////////////////////////////////////////////
// Merge step 2: generate sample ranks and indices
////////////////////////////////////////////////////////////////////////////////
__global__
__global__
void mergeRanksAndIndicesKernel(
uniform int nBlocks,
uniform int in_Limits[],
@@ -317,7 +317,7 @@ void mergeRanksAndIndicesKernel(
const uniform int blkDim = (nBlocks + taskCount - 1)/taskCount;
const uniform int blkBeg = blkIdx * blkDim;
const uniform int blkEnd = min(blkBeg + blkDim, nBlocks);
for (uniform int blk = blkBeg; blk < blkEnd; blk++)
{
int pos = blk * programCount + programIndex;
@@ -357,8 +357,8 @@ void mergeRanksAndIndices(
uniform int N)
{
const uniform int lastSegmentElements = N % (2 * stride);
const uniform int threadCount = (lastSegmentElements > stride) ?
(N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) :
const uniform int threadCount = (lastSegmentElements > stride) ?
(N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) :
(N - lastSegmentElements) / (2 * SAMPLE_STRIDE);
const uniform int nBlocks = iDivUp(threadCount, SAMPLE_STRIDE);
@@ -457,13 +457,13 @@ void mergeElementaryIntervalsKernel(
dstB = segmentBase + startDstB + dstPosB;
// store merge data
if (dstA >= 0)
if (dstA >= 0)
{
// int dstA = segmentBase + startSrcA + programIndex;
dstKey[dstA] = keyA;
dstVal[dstA] = valA;
}
if (dstB >= 0)
if (dstB >= 0)
{
// int dstB = segmentBase + stride + startSrcB + programIndex;
dstKey[dstB] = keyB;
@@ -513,7 +513,7 @@ __device__ static uniform int * uniform limitsB;
__device__ static uniform int nTasks;
__device__ static uniform int MAX_SAMPLE_COUNT = 0;
__global__
__global__
void openMergeSort___export()
{
nTasks = 13*32*13;

View File

@@ -25,9 +25,9 @@ int nextPowerOfTwo(int x)
static inline
int binarySearchInclusiveRanks(
const int val,
const int val,
uniform int *data,
const int L,
const int L,
int stride)
{
cif (L == 0)
@@ -47,9 +47,9 @@ int binarySearchInclusiveRanks(
static inline
int binarySearchExclusiveRanks(
const int val,
uniform int *data,
const int L,
const int val,
uniform int *data,
const int L,
int stride)
{
cif (L == 0)
@@ -69,9 +69,9 @@ int binarySearchExclusiveRanks(
static inline
int binarySearchInclusive(
const Key_t val,
const Key_t val,
uniform Key_t *data,
const int L,
const int L,
int stride)
{
cif (L == 0)
@@ -91,9 +91,9 @@ int binarySearchInclusive(
static inline
int binarySearchExclusive(
const Key_t val,
uniform Key_t *data,
const int L,
const Key_t val,
uniform Key_t *data,
const int L,
int stride)
{
cif (L == 0)
@@ -113,9 +113,9 @@ int binarySearchExclusive(
static inline
int binarySearchInclusive1(
const Key_t val,
const Key_t val,
Key_t data,
const uniform int L,
const uniform int L,
uniform int stride)
{
if (L == 0)
@@ -135,9 +135,9 @@ int binarySearchInclusive1(
static inline
int binarySearchExclusive1(
const Key_t val,
Key_t data,
const uniform int L,
const Key_t val,
Key_t data,
const uniform int L,
uniform int stride)
{
if (L == 0)
@@ -158,7 +158,7 @@ int binarySearchExclusive1(
////////////////////////////////////////////////////////////////////////////////
// Bottom-level merge sort (binary search-based)
////////////////////////////////////////////////////////////////////////////////
task
task
void mergeSortGangKernel(
uniform int batchSize,
uniform Key_t dstKey[],
@@ -189,7 +189,7 @@ void mergeSortGangKernel(
const int offset = 2 * (programIndex - lPos);
uniform Key_t *baseKey = s_key + 2 * (programIndex - lPos);
uniform Val_t *baseVal = s_val + 2 * (programIndex - lPos);
Key_t keyA = baseKey[lPos + 0];
Val_t valA = baseVal[lPos + 0];
Key_t keyB = baseKey[lPos + stride];
@@ -244,7 +244,7 @@ void generateSampleRanksKernel(
const uniform int blockDim = (nBlocks + taskCount - 1)/taskCount;
const uniform int blockBeg = blockIdx * blockDim;
const uniform int blockEnd = min(blockBeg + blockDim, nBlocks);
for (uniform int block = blockBeg; block < blockEnd; block++)
{
const int pos = block * programCount + programIndex;
@@ -290,8 +290,8 @@ void generateSampleRanks(
uniform int N)
{
uniform int lastSegmentElements = N % (2 * stride);
uniform int threadCount = (lastSegmentElements > stride) ?
(N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) :
uniform int threadCount = (lastSegmentElements > stride) ?
(N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) :
(N - lastSegmentElements) / (2 * SAMPLE_STRIDE);
uniform int nBlocks = iDivUp(threadCount, SAMPLE_STRIDE);
@@ -306,7 +306,7 @@ void generateSampleRanks(
////////////////////////////////////////////////////////////////////////////////
// Merge step 2: generate sample ranks and indices
////////////////////////////////////////////////////////////////////////////////
task
task
void mergeRanksAndIndicesKernel(
uniform int nBlocks,
uniform int in_Limits[],
@@ -319,7 +319,7 @@ void mergeRanksAndIndicesKernel(
const uniform int blockDim = (nBlocks + taskCount - 1)/taskCount;
const uniform int blockBeg = blockIdx * blockDim;
const uniform int blockEnd = min(blockBeg + blockDim, nBlocks);
for (uniform int block = blockBeg; block < blockEnd; block++)
{
int pos = block * programCount + programIndex;
@@ -359,8 +359,8 @@ void mergeRanksAndIndices(
uniform int N)
{
const uniform int lastSegmentElements = N % (2 * stride);
const uniform int threadCount = (lastSegmentElements > stride) ?
(N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) :
const uniform int threadCount = (lastSegmentElements > stride) ?
(N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) :
(N - lastSegmentElements) / (2 * SAMPLE_STRIDE);
const uniform int nBlocks = iDivUp(threadCount, SAMPLE_STRIDE);
@@ -462,12 +462,12 @@ void mergeElementaryIntervalsKernel(
if (programIndex < lenSrcB && dstPosB < lenSrcB)
dstB = segmentBase + startDstB + dstPosB;
if (dstA >= 0)
if (dstA >= 0)
{
dstKey[dstA] = keyA;
dstVal[dstA] = valA;
}
if (dstB >= 0)
if (dstB >= 0)
{
dstKey[dstB] = keyB;
dstVal[dstB] = valB;
@@ -521,7 +521,7 @@ static uniform int * uniform limitsA;
static uniform int * uniform limitsB;
static uniform int MAX_SAMPLE_COUNT = 0;
export
export
void openMergeSort()
{
MAX_SAMPLE_COUNT = 8*32 * 131072 / programCount;
@@ -542,7 +542,7 @@ void closeMergeSort()
memPool = NULL;
}
export
export
void mergeSort(
uniform Key_t dstKey[],
uniform Val_t dstVal[],
@@ -601,7 +601,7 @@ void mergeSort(
}
#endif
// cpu: 287 gpu: 194 M/s
// cpu: 287 gpu: 194 M/s
//Merge elementary intervals
mergeElementaryIntervals(oKey, oVal, iKey, iVal, limitsA, limitsB, stride, N);
}

View File

@@ -61,7 +61,7 @@ struct Hermite4
const real R0 = 1;
const real mp = 1.0/n;
#pragma omp parallel for schedule(runtime)
for (int i = 0; i < n; i++)
for (int i = 0; i < n; i++)
{
real xp, yp, zp, s2 = 2*R0;
real vx, vy, vz;
@@ -73,7 +73,7 @@ struct Hermite4
vx = drand48() * 0.1;
vy = drand48() * 0.1;
vz = drand48() * 0.1;
}
}
g_posx[i] = xp;
g_posy[i] = yp;
g_posz[i] = zp;
@@ -104,7 +104,7 @@ struct Hermite4
void forces();
real step(const real dt)
real step(const real dt)
{
const real dt2 = dt*real(1.0/2.0);
const real dt3 = dt*real(1.0/3.0);
@@ -149,9 +149,9 @@ struct Hermite4
{
/* compute snp & crk */
const real Amx = g_accx[i] - accx0[i];
const real Amy = g_accy[i] - accy0[i];
const real Amz = g_accz[i] - accz0[i];
const real Amx = g_accx[i] - accx0[i];
const real Amy = g_accy[i] - accy0[i];
const real Amz = g_accz[i] - accz0[i];
const real Jmx = h*(g_jrkx[i] - jrkx0[i]);
const real Jmy = h*(g_jrky[i] - jrky0[i]);
@@ -199,18 +199,18 @@ struct Hermite4
}
}
if (dt_min == HUGE)
if (dt_min == HUGE)
return dt;
else
else
return dt_min;
}
void energy(real &Ekin, real &Epot)
void energy(real &Ekin, real &Epot)
{
real ekin = 0, epot = 0;
#pragma omp parallel for reduction(+:ekin,epot)
for (int i = 0; i < n; i++)
for (int i = 0; i < n; i++)
{
ekin += g_mass[i] * (g_velx[i]*g_velx[i] + g_vely[i]*g_vely[i] + g_velz[i]*g_velz[i]) * real(0.5f);
epot += real(0.5f)*g_mass[i] * g_gpot[i];
@@ -241,7 +241,7 @@ struct Hermite4
real dt = 1.0/131072;
real Epot, Ekin, Etot = Etot0;
while (t_global < t_end) {
if (iter % ntime == 0)
if (iter % ntime == 0)
t0 = rtc();
if (iter >= niter) return;
@@ -302,7 +302,7 @@ void run(const int nbodies, const real eta, const int nstep)
h4.integrate(nstep);
}
int main(int argc, char *argv[])
int main(int argc, char *argv[])
{
printf(" Usage: %s [nbodies=8192] [nsteps=40] [eta=0.1] \n", argv[0]);

View File

@@ -12,7 +12,7 @@ struct Predictor
vec3 pos, vel;
};
static inline
static inline
void body_body_force(
Force &fi,
const Predictor &pi,
@@ -40,14 +40,14 @@ void body_body_force(
fi.acc.y += minv_ds3 * dy;
fi.acc.z += minv_ds3 * dz;
fi.pot -= minv_ds;
const real dvx = pj.vel.x - pi.vel.x;
const real dvy = pj.vel.y - pi.vel.y;
const real dvz = pj.vel.z - pi.vel.z;
const real rv = dx*dvx + dy*dvy + dz*dvz;
const real Jij = (real)(-3.0) * (rv * inv_ds2 * minv_ds3);
fi.jrk.x += minv_ds3*dvx + Jij*dx;
fi.jrk.y += minv_ds3*dvy + Jij*dy;
fi.jrk.z += minv_ds3*dvz + Jij*dz;
@@ -75,7 +75,7 @@ task void compute_forces_task(
const uniform int nibeg = taskIndex * nPerTask;
const uniform int niend = min(n, nibeg + nPerTask);
if (nibeg >= n)
if (nibeg >= n)
return;
uniform real shdata[7][programCount];
@@ -88,7 +88,7 @@ task void compute_forces_task(
fi.acc = (real)0.0;
fi.jrk = (real)0.0;
fi.pot = (real)0.0;
Predictor pi;
pi.pos.x = posx[i];
pi.pos.y = posy[i];
@@ -155,7 +155,7 @@ export void compute_forces(
launch [nTask] compute_forces_task(
n, nPerTask,
mass,
mass,
posx,posy,posz,
velx,vely,velz,
accx,accy,accz,

View File

@@ -1,5 +1,5 @@
/*
Copyright (c) 2011-2012, Intel Corporation
Copyright (c) 2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -28,11 +28,11 @@
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#define DBG(x)
#define DBG(x)
#include <omp.h>
#include <malloc.h>
@@ -62,15 +62,15 @@ struct TaskInfo {
event taskEvent;
#endif
int taskCount() const { return taskCount3d[0]*taskCount3d[1]*taskCount3d[2]; }
int taskIndex0() const
int taskIndex0() const
{
return taskIndex % taskCount3d[0];
}
int taskIndex1() const
int taskIndex1() const
{
return ( taskIndex / taskCount3d[0] ) % taskCount3d[1];
}
int taskIndex2() const
int taskIndex2() const
{
return taskIndex / ( taskCount3d[0]*taskCount3d[1] );
}
@@ -85,7 +85,7 @@ __attribute__((aligned(32)));
;
// ispc expects these functions to have C linkage / not be mangled
extern "C" {
extern "C" {
void ISPCLaunch(void **handlePtr, void *f, void *data, int countx, int county, int countz);
void *ISPCAlloc(void **handlePtr, int64_t size, int32_t alignment);
void ISPCSync(void *handle);
@@ -144,10 +144,10 @@ private:
};
inline TaskGroupBase::TaskGroupBase() {
nextTaskInfoIndex = 0;
inline TaskGroupBase::TaskGroupBase() {
nextTaskInfoIndex = 0;
curMemBuffer = 0;
curMemBuffer = 0;
curMemBufferOffset = 0;
memBuffers[0] = mem;
memBufferSize[0] = sizeof(mem) / sizeof(mem[0]);
@@ -171,8 +171,8 @@ inline TaskGroupBase::~TaskGroupBase() {
inline void
TaskGroupBase::Reset() {
nextTaskInfoIndex = 0;
curMemBuffer = 0;
nextTaskInfoIndex = 0;
curMemBuffer = 0;
curMemBufferOffset = 0;
}
@@ -253,7 +253,7 @@ lAtomicCompareAndSwapPointer(void **v, void *newValue, void *oldValue) {
#endif // ISPC_IS_WINDOWS
}
static int32_t
static int32_t
lAtomicCompareAndSwap32(volatile int32_t *v, int32_t newValue, int32_t oldValue) {
#ifdef ISPC_IS_WINDOWS
return InterlockedCompareExchange((volatile LONG *)v, newValue, oldValue);
@@ -264,7 +264,7 @@ lAtomicCompareAndSwap32(volatile int32_t *v, int32_t newValue, int32_t oldValue)
#endif // ISPC_IS_WINDOWS
}
static inline int32_t
static inline int32_t
lAtomicAdd(volatile int32_t *v, int32_t delta) {
#ifdef ISPC_IS_WINDOWS
return InterlockedExchangeAdd((volatile LONG *)v, delta)+delta;
@@ -300,11 +300,11 @@ TaskGroup::Launch(int baseIndex, int count) {
TaskInfo ti = *GetTaskInfo(baseIndex);
#pragma omp for schedule(runtime)
for(int i = 0; i < count; i++)
for(int i = 0; i < count; i++)
{
ti.taskIndex = i;
// Actually run the task.
// Actually run the task.
ti.func(ti.data, threadIndex, threadCount, ti.taskIndex, ti.taskCount(),
ti.taskIndex0(), ti.taskIndex1(), ti.taskIndex2(),
ti.taskCount0(), ti.taskCount1(), ti.taskCount2());
@@ -322,7 +322,7 @@ TaskGroup::Sync() {
static TaskGroup *freeTaskGroups[MAX_FREE_TASK_GROUPS];
static inline TaskGroup *
AllocTaskGroup()
AllocTaskGroup()
{
for (int i = 0; i < MAX_FREE_TASK_GROUPS; ++i) {
TaskGroup *tg = freeTaskGroups[i];
@@ -339,7 +339,7 @@ AllocTaskGroup()
static inline void
FreeTaskGroup(TaskGroup *tg)
FreeTaskGroup(TaskGroup *tg)
{
tg->Reset();
@@ -355,7 +355,7 @@ FreeTaskGroup(TaskGroup *tg)
}
void
ISPCLaunch(void **taskGroupPtr, void *func, void *data, int count0, int count1, int count2)
ISPCLaunch(void **taskGroupPtr, void *func, void *data, int count0, int count1, int count2)
{
const int count = count0*count1*count2;
TaskGroup *taskGroup;
@@ -382,7 +382,7 @@ ISPCLaunch(void **taskGroupPtr, void *func, void *data, int count0, int count1,
void
ISPCSync(void *h)
ISPCSync(void *h)
{
TaskGroup *taskGroup = (TaskGroup *)h;
if (taskGroup != NULL) {
@@ -393,7 +393,7 @@ ISPCSync(void *h)
void *
ISPCAlloc(void **taskGroupPtr, int64_t size, int32_t alignment)
ISPCAlloc(void **taskGroupPtr, int64_t size, int32_t alignment)
{
TaskGroup *taskGroup;
if (*taskGroupPtr == NULL) {

View File

@@ -1,5 +1,5 @@
/*
Copyright (c) 2010-2011, Intel Corporation
Copyright (c) 2010-2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -28,7 +28,7 @@
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#define NOMINMAX
@@ -96,7 +96,7 @@ int main(int argc, char *argv[]) {
sum = 0.;
for (int i = 0; i < nOptions; ++i)
sum += result[i];
printf("[binomial ispc, tasks]:\t\t[%.3f] msec (avg %f)\n",
printf("[binomial ispc, tasks]:\t\t[%.3f] msec (avg %f)\n",
binomial_tasks, sum / nOptions);
//
@@ -112,7 +112,7 @@ int main(int argc, char *argv[]) {
sum += result[i];
bs_ispc_tasks = std::min(bs_ispc_tasks, dt);
}
printf("[black-scholes ispc, tasks]:\t[%.3f] msec (avg %f)\n",
printf("[black-scholes ispc, tasks]:\t[%.3f] msec (avg %f)\n",
bs_ispc_tasks, sum / nOptions);

View File

@@ -1,6 +1,6 @@
// -*- mode: c++ -*-
/*
Copyright (c) 2010-2011, Intel Corporation
Copyright (c) 2010-2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -29,13 +29,13 @@
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "options_defs.h"
#include "cuda_helpers.cuh"
__device__ static inline void __range_reduce_log(float input, float * reduced,
__device__ static inline void __range_reduce_log(float input, float * reduced,
int * exponent) {
int int_version = __float_as_int(input); //intbits(input);
// single precision = SEEE EEEE EMMM MMMM MMMM MMMM MMMM MMMM
@@ -195,9 +195,9 @@ CND(float X) {
return w;
}
__global__
__global__
void bs_task( float Sa[], float Xa[], float Ta[],
float ra[], float va[],
float ra[], float va[],
float result[], int count) {
if (taskIndex >= taskCount) return;
int first = taskIndex * (count/taskCount);
@@ -218,7 +218,7 @@ void bs_task( float Sa[], float Xa[], float Ta[],
extern "C"
__global__ void
black_scholes_ispc_tasks___export( float Sa[], float Xa[], float Ta[],
float ra[], float va[],
float ra[], float va[],
float result[], int count) {
int nTasks = 2048; //count/16384; //max((int)64, (int)count/16384);
launch(nTasks,1,1,bs_task)
@@ -228,7 +228,7 @@ black_scholes_ispc_tasks___export( float Sa[], float Xa[], float Ta[],
extern "C"
__host__ void
black_scholes_ispc_tasks( float Sa[], float Xa[], float Ta[],
float ra[], float va[],
float ra[], float va[],
float result[], int count) {
black_scholes_ispc_tasks___export<<<1,32>>>(Sa,Xa,Ta,ra,va,result,count);
cudaDeviceSynchronize();
@@ -243,8 +243,8 @@ struct loop
__device__ static void op1(float V[], const float u, const float X, const float S)
{
const int j = NBEG;
float upow = powf(u, (float)(2*j-BINOMIAL_NUM));
V[j] = max(0.0f, X - S * upow);
float upow = powf(u, (float)(2*j-BINOMIAL_NUM));
V[j] = max(0.0f, X - S * upow);
loop<j+STEP,NEND,STEP>::op1(V,u,X,S);
}
__device__ static void op2(float V[], const float Pu, const float disc)
@@ -257,9 +257,9 @@ struct loop
}
};
template<int NEND, int STEP>
template<int NEND, int STEP>
struct loop<NEND,NEND,STEP>
{
{
__device__ static void op1(float V[], const float u, const float X, const float S) {}
__device__ static void op2(float V[], const float Pu, const float disc) {}
};
@@ -295,10 +295,10 @@ binomial_put(float S, float X, float T, float r, float v)
__global__ void
binomial_task( float Sa[], float Xa[],
float Ta[], float ra[],
float va[], float result[],
int count)
binomial_task( float Sa[], float Xa[],
float Ta[], float ra[],
float va[], float result[],
int count)
{
int first = taskIndex * (count/taskCount);
int last = min(count, (int)((taskIndex+1) * (count/taskCount)));
@@ -313,9 +313,9 @@ binomial_task( float Sa[], float Xa[],
extern "C" __global__ void
binomial_put_ispc_tasks___export( float Sa[], float Xa[],
float Ta[], float ra[],
float va[], float result[],
binomial_put_ispc_tasks___export( float Sa[], float Xa[],
float Ta[], float ra[],
float va[], float result[],
int count) {
int nTasks = 2048; //count/16384; //max((int)64, (int)count/16384);
launch(nTasks,1,1,binomial_task)
@@ -325,7 +325,7 @@ binomial_put_ispc_tasks___export( float Sa[], float Xa[],
extern "C"
__host__ void
binomial_put_ispc_tasks( float Sa[], float Xa[], float Ta[],
float ra[], float va[],
float ra[], float va[],
float result[], int count) {
cudaDeviceSetCacheConfig (cudaFuncCachePreferL1);

View File

@@ -1,6 +1,6 @@
// -*- mode: c++ -*-
/*
Copyright (c) 2010-2011, Intel Corporation
Copyright (c) 2010-2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -29,7 +29,7 @@
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "options_defs.h"
@@ -57,7 +57,7 @@ CND(float X) {
task void
bs_task(uniform float Sa[], uniform float Xa[], uniform float Ta[],
uniform float ra[], uniform float va[],
uniform float ra[], uniform float va[],
uniform float result[], uniform int count) {
uniform int first = taskIndex * (count/taskCount);
uniform int last = min(count, (int)((taskIndex+1) * (count/taskCount)));
@@ -74,7 +74,7 @@ bs_task(uniform float Sa[], uniform float Xa[], uniform float Ta[],
export void
black_scholes_ispc_tasks(uniform float Sa[], uniform float Xa[], uniform float Ta[],
uniform float ra[], uniform float va[],
uniform float ra[], uniform float va[],
uniform float result[], uniform int count) {
uniform int nTasks = 2048; //count/16384; //max((int)64, (int)count/16384);
launch[nTasks] bs_task(Sa, Xa, Ta, ra, va, result, count);
@@ -85,7 +85,7 @@ black_scholes_ispc_tasks(uniform float Sa[], uniform float Xa[], uniform float T
export void
black_scholes_ispc(uniform float Sa[], uniform float Xa[], uniform float Ta[],
uniform float ra[], uniform float va[],
uniform float ra[], uniform float va[],
uniform float result[], uniform int count) {
foreach (i = 0 ... count) {
float S = Sa[i], X = Xa[i], T = Ta[i], r = ra[i], v = va[i];
@@ -135,7 +135,7 @@ binomial_put(float S, float X, float T, float r, float v) {
V[j] = max(0., X - S * upow); }
#define OP10(k) \
OP(k+0); OP(k+1); OP(k+2); OP(k+3); OP(k+4) \
OP(k+5); OP(k+6); OP(k+7); OP(k+8); OP(k+9);
OP(k+5); OP(k+6); OP(k+7); OP(k+8); OP(k+9);
OP10(0)
OP10(10)
OP10(20)
@@ -176,8 +176,8 @@ binomial_put(float S, float X, float T, float r, float v) {
export void
binomial_put_ispc(uniform float Sa[], uniform float Xa[], uniform float Ta[],
uniform float ra[], uniform float va[],
binomial_put_ispc(uniform float Sa[], uniform float Xa[], uniform float Ta[],
uniform float ra[], uniform float va[],
uniform float result[], uniform int count) {
foreach (i = 0 ... count) {
float S = Sa[i], X = Xa[i], T = Ta[i], r = ra[i], v = va[i];
@@ -187,9 +187,9 @@ binomial_put_ispc(uniform float Sa[], uniform float Xa[], uniform float Ta[],
task void
binomial_task(uniform float Sa[], uniform float Xa[],
uniform float Ta[], uniform float ra[],
uniform float va[], uniform float result[],
binomial_task(uniform float Sa[], uniform float Xa[],
uniform float Ta[], uniform float ra[],
uniform float va[], uniform float result[],
uniform int count) {
uniform int first = taskIndex * (count/taskCount);
uniform int last = min(count, (int)((taskIndex+1) * (count/taskCount)));
@@ -202,9 +202,9 @@ binomial_task(uniform float Sa[], uniform float Xa[],
export void
binomial_put_ispc_tasks(uniform float Sa[], uniform float Xa[],
uniform float Ta[], uniform float ra[],
uniform float va[], uniform float result[],
binomial_put_ispc_tasks(uniform float Sa[], uniform float Xa[],
uniform float Ta[], uniform float ra[],
uniform float va[], uniform float result[],
uniform int count) {
uniform int nTasks = 2048; //count/16384; //max((int)64, (int)count/16384);
launch[nTasks] binomial_task(Sa, Xa, Ta, ra, va, result, count);

View File

@@ -1,5 +1,5 @@
/*
Copyright (c) 2010-2011, Intel Corporation
Copyright (c) 2010-2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -28,7 +28,7 @@
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef OPTIONS_DEFS_H

View File

@@ -42,7 +42,7 @@ int main (int argc, char *argv[])
Key *keys = new Key [n];
Key *keys_orig = new Key [n];
unsigned int *keys_gold = new unsigned int [n];
srand48(rtc()*65536);
int sortBits = 32;
@@ -63,7 +63,7 @@ int main (int argc, char *argv[])
keys_gold[i] = keys[i].key;
keys_orig[i] = keys[i];
}
ispcSetMallocHeapLimit(1024*1024*1024);
ispc::radixSort_alloc(n);

View File

@@ -9,7 +9,7 @@ typedef long long Key;
__forceinline__ __device__ int atomic_add_global(int* ptr, int value)
{
return atomicAdd(ptr, value);
}
}
static __device__ __forceinline__ int shfl_scan_add_step(int partial, int up_offset)
{
@@ -92,7 +92,7 @@ void sortPass(
const int mask = (1 << NUMBITS) - 1;
/* copy digit offset from Gmem to Lmem */
/* copy digit offset from Gmem to Lmem */
#if 1
__shared__ int digitOffsets_sh[NUMDIGITS*4];
volatile int *digitOffsets = digitOffsets_sh + warpIdx*NUMDIGITS;
@@ -191,7 +191,7 @@ void completeScanGlobal(
}
}
__device__ static
__device__ static
inline void radixExclusiveScan(
const int numBlocks,
int excScanPtr[],
@@ -242,11 +242,11 @@ void radixSort_alloc___export(const int n)
nPrefixSum = NUMDIGITS*numBlocks;
const int nalloc =
const int nalloc =
nSharedCounts +
nCountsGlobal +
nExcScan +
nCountsBlock +
nCountsBlock +
nPartialSum +
nPrefixSum;
@@ -261,7 +261,7 @@ void radixSort_alloc___export(const int n)
prefixSum = partialSum + nPartialSum;
}
extern "C"
extern "C"
void radixSort_alloc(const int n)
{
radixSort_alloc___export<<<1,32>>>(n);
@@ -269,7 +269,7 @@ void radixSort_alloc(const int n)
}
__device__ static
__device__ static
void radixSort_freeBufKeys()
{
if (numElementsBuf > 0)
@@ -344,9 +344,9 @@ __global__ void radixSort___export(
/* sorting */
launch (numBlocks,1,1,
sortPass)(
bufKeys,
keys,
bit,
bufKeys,
keys,
bit,
numElements,
excScan);
sync;

View File

@@ -63,7 +63,7 @@ void sortPass(
const uniform int mask = (1 << NUMBITS) - 1;
/* copy digit offset from Gmem to Lmem */
/* copy digit offset from Gmem to Lmem */
#if 1
uniform int digitOffsets[NUMDIGITS];
foreach (digit = 0 ... NUMDIGITS)
@@ -95,7 +95,7 @@ void partialScanLocal(
const uniform int blockDim = (numBlocks+taskCount-1)/taskCount;
const uniform int bbeg = blockIdx * blockDim;
const uniform int bend = min(bbeg + blockDim, numBlocks);
uniform int (* uniform countsBlock)[NUMDIGITS] = (uniform int (*)[NUMDIGITS])countsAll;
uniform int (* uniform excScanBlock)[NUMDIGITS] = (uniform int (*)[NUMDIGITS])excScanAll;
uniform int (* uniform partialSum)[NUMDIGITS] = (uniform int (*)[NUMDIGITS])partialSumAll;
@@ -142,7 +142,7 @@ void completeScanGlobal(
const uniform int blockDim = (numBlocks+taskCount-1)/taskCount;
const uniform int bbeg = blockIdx * blockDim;
const uniform int bend = min(bbeg + blockDim, numBlocks);
uniform int (* uniform excScanBlock)[NUMDIGITS] = (uniform int (*)[NUMDIGITS])excScanAll;
uniform int (* uniform carryValue)[NUMDIGITS] = (uniform int (*)[NUMDIGITS])carryValueAll;
@@ -154,7 +154,7 @@ void completeScanGlobal(
}
}
static
static
inline void radixExclusiveScan(
const uniform int numBlocks,
uniform int excScanPtr[],
@@ -207,11 +207,11 @@ export void radixSort_alloc(const uniform int n)
nPrefixSum = NUMDIGITS*numBlocks;
const uniform int nalloc =
const uniform int nalloc =
nSharedCounts +
nCountsGlobal +
nExcScan +
nCountsBlock +
nCountsBlock +
nPartialSum +
nPrefixSum;
@@ -225,7 +225,7 @@ export void radixSort_alloc(const uniform int n)
prefixSum = partialSum + nPartialSum;
}
static
static
void radixSort_freeBufKeys()
{
if (numElementsBuf > 0)
@@ -283,16 +283,16 @@ export void radixSort(
excScan[digit] = scan + carry;
carry += broadcast(scan+value, programCount-1);
}
/* computing offsets for each digit */
radixExclusiveScan(numBlocks, excScan, counts, partialSum, prefixSum);
/* sorting */
launch [numBlocks]
launch [numBlocks]
sortPass(
bufKeys,
keys,
bit,
bufKeys,
keys,
bit,
numElements,
excScan);
sync;

View File

@@ -1,5 +1,5 @@
/*
Copyright (c) 2010-2011, Intel Corporation
Copyright (c) 2010-2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -28,7 +28,7 @@
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef _MSC_VER
@@ -83,7 +83,7 @@ static void writeImage(int *idImage, float *depthImage, int width, int height,
fputc(g, f);
fputc(b, f);
}
}
}
fclose(f);
printf("Wrote image file %s\n", filename);
}
@@ -116,7 +116,7 @@ int main(int argc, char *argv[]) {
if (fread(&(var), sizeof(var), n, f) != (unsigned int)n) { \
fprintf(stderr, "Unexpected EOF reading scene file\n"); \
return 1; \
} else /* eat ; */
} else /* eat ; */
//
// Read the camera specification information from the camera file
@@ -145,7 +145,7 @@ int main(int argc, char *argv[]) {
READ(raster2camera[0][0], 16);
//
// Read in the serialized BVH
// Read in the serialized BVH
//
sprintf(fnbuf, "%s.bvh", filename);
f = fopen(fnbuf, "rb");
@@ -178,7 +178,7 @@ int main(int argc, char *argv[]) {
READ(nodes[i].pad, 1);
}
// And then read the triangles
// And then read the triangles
uint nTris;
READ(nTris, 1);
Triangle *triangles = new Triangle[nTris];
@@ -204,7 +204,7 @@ int main(int argc, char *argv[]) {
// the first interseciton
int *id = new int[width*height];
float *image = new float[width*height];
ispc_memset(id, 0, width*height*sizeof(int));
ispc_memset(image, 0, width*height*sizeof(float));
@@ -220,7 +220,7 @@ int main(int argc, char *argv[]) {
printf("@time of ISPC + TASKS run:\t\t\t[%.3f] msec\n", dt);
minTimeISPCtasks = std::min(dt, minTimeISPCtasks);
}
printf("[rt ispc + tasks]:\t\t[%.3f] msec for %d x %d image\n",
printf("[rt ispc + tasks]:\t\t[%.3f] msec for %d x %d image\n",
minTimeISPCtasks, width, height);
writeImage(id, image, width, height, "rt-ispc-tasks.ppm");

View File

@@ -96,7 +96,7 @@ static inline float Dot(const float3 a, const float3 b) {
__device__
inline
static void generateRay( const float raster2camera[4][4],
static void generateRay( const float raster2camera[4][4],
const float camera2world[4][4],
float x, float y, Ray &ray) {
ray.mint = 0.f;
@@ -113,11 +113,11 @@ static void generateRay( const float raster2camera[4][4],
camy /= camw;
camz /= camw;
ray.dir.x = camera2world[0][0] * camx + camera2world[0][1] * camy +
ray.dir.x = camera2world[0][0] * camx + camera2world[0][1] * camy +
camera2world[0][2] * camz;
ray.dir.y = camera2world[1][0] * camx + camera2world[1][1] * camy +
ray.dir.y = camera2world[1][0] * camx + camera2world[1][1] * camy +
camera2world[1][2] * camz;
ray.dir.z = camera2world[2][0] * camx + camera2world[2][1] * camy +
ray.dir.z = camera2world[2][0] * camx + camera2world[2][1] * camy +
camera2world[2][2] * camz;
ray.origin.x = camera2world[0][3] / camera2world[3][3];
@@ -139,7 +139,7 @@ static void generateRay( const float raster2camera[4][4],
__device__
inline
static bool BBoxIntersect(const float bounds[2][3],
static bool BBoxIntersect(const float bounds[2][3],
const Ray &ray) {
float3 bounds0 = { bounds[0][0], bounds[0][1], bounds[0][2] };
float3 bounds1 = { bounds[1][0], bounds[1][1], bounds[1][2] };
@@ -172,7 +172,7 @@ static bool BBoxIntersect(const float bounds[2][3],
}
t0 = max(tNear.z, t0);
t1 = min(tFar.z, t1);
return (t0 <= t1);
}
@@ -220,7 +220,7 @@ static bool TriIntersect(const Triangle &tri, Ray &ray) {
__device__
inline
bool BVHIntersect(const LinearBVHNode nodes[],
bool BVHIntersect(const LinearBVHNode nodes[],
const Triangle tris[], Ray &r,
int todo[]) {
Ray ray = r;
@@ -240,7 +240,7 @@ bool BVHIntersect(const LinearBVHNode nodes[],
if (TriIntersect(tris[primitivesOffset+i], ray))
hit = true;
}
if (todoOffset == 0)
if (todoOffset == 0)
break;
nodeNum = todo[--todoOffset];
}
@@ -275,10 +275,10 @@ bool BVHIntersect(const LinearBVHNode nodes[],
__device__
inline
static void raytrace_tile( int x0, int x1,
int y0, int y1,
int y0, int y1,
int width, int height,
int baseWidth, int baseHeight,
const float raster2camera[4][4],
const float raster2camera[4][4],
const float camera2world[4][4],
float image[], int id[],
const LinearBVHNode nodes[],
@@ -317,7 +317,7 @@ static void raytrace_tile( int x0, int x1,
__global__
void raytrace_tile_task( int width, int height,
int baseWidth, int baseHeight,
const float raster2camera[4][4],
const float raster2camera[4][4],
const float camera2world[4][4],
float image[], int id[],
const LinearBVHNode nodes[],
@@ -328,8 +328,8 @@ void raytrace_tile_task( int width, int height,
int x1 = min(x0 + dx, width);
int y0 = (taskIndex / xBuckets) * dy;
int y1 = min(y0 + dy, height);
raytrace_tile(x0, x1, y0, y1, width, height, baseWidth, baseHeight,
raytrace_tile(x0, x1, y0, y1, width, height, baseWidth, baseHeight,
raster2camera, camera2world, image,
id, nodes, triangles);
}
@@ -337,7 +337,7 @@ void raytrace_tile_task( int width, int height,
extern "C" __global__ void raytrace_ispc_tasks___export( int width, int height,
int baseWidth, int baseHeight,
const float raster2camera[4][4],
const float raster2camera[4][4],
const float camera2world[4][4],
float image[], int id[],
const LinearBVHNode nodes[],
@@ -347,8 +347,8 @@ extern "C" __global__ void raytrace_ispc_tasks___export( int width, int height,
int yBuckets = (height + (dy-1)) / dy;
int nTasks = xBuckets * yBuckets;
launch(nTasks,1,1,raytrace_tile_task)
(width, height, baseWidth, baseHeight,
raster2camera, camera2world,
(width, height, baseWidth, baseHeight,
raster2camera, camera2world,
image, id, nodes, triangles);
cudaDeviceSynchronize();
}
@@ -357,7 +357,7 @@ extern "C" __global__ void raytrace_ispc_tasks___export( int width, int height,
extern "C" __host__ void raytrace_ispc_tasks( int width, int height,
int baseWidth, int baseHeight,
const float raster2camera[4][4],
const float raster2camera[4][4],
const float camera2world[4][4],
float image[], int id[],
const LinearBVHNode nodes[],

View File

@@ -1,5 +1,5 @@
/*
Copyright (c) 2010-2011, Intel Corporation
Copyright (c) 2010-2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -28,7 +28,7 @@
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#if 1
@@ -90,7 +90,7 @@ static inline float Dot(const float3 a, const float3 b) {
#if 1
inline
#endif
static void generateRay(uniform const float raster2camera[4][4],
static void generateRay(uniform const float raster2camera[4][4],
uniform const float camera2world[4][4],
float x, float y, Ray &ray) {
ray.mint = 0.f;
@@ -107,11 +107,11 @@ static void generateRay(uniform const float raster2camera[4][4],
camy /= camw;
camz /= camw;
ray.dir.x = camera2world[0][0] * camx + camera2world[0][1] * camy +
ray.dir.x = camera2world[0][0] * camx + camera2world[0][1] * camy +
camera2world[0][2] * camz;
ray.dir.y = camera2world[1][0] * camx + camera2world[1][1] * camy +
ray.dir.y = camera2world[1][0] * camx + camera2world[1][1] * camy +
camera2world[1][2] * camz;
ray.dir.z = camera2world[2][0] * camx + camera2world[2][1] * camy +
ray.dir.z = camera2world[2][0] * camx + camera2world[2][1] * camy +
camera2world[2][2] * camz;
ray.origin.x = camera2world[0][3] / camera2world[3][3];
@@ -129,7 +129,7 @@ static void generateRay(uniform const float raster2camera[4][4],
#if 1
inline
#endif
static bool_t BBoxIntersect(const uniform float bounds[2][3],
static bool_t BBoxIntersect(const uniform float bounds[2][3],
const Ray &ray) {
const uniform float3 bounds0 = { bounds[0][0], bounds[0][1], bounds[0][2] };
const uniform float3 bounds1 = { bounds[1][0], bounds[1][1], bounds[1][2] };
@@ -162,7 +162,7 @@ static bool_t BBoxIntersect(const uniform float bounds[2][3],
}
t0 = max(tNear.z, t0);
t1 = min(tFar.z, t1);
return (t0 <= t1);
}
@@ -215,7 +215,7 @@ static bool_t TriIntersect(const uniform_t Triangle tri, Ray &ray) {
inline
#endif
bool_t
BVHIntersect(const uniform LinearBVHNode nodes[],
BVHIntersect(const uniform LinearBVHNode nodes[],
const uniform Triangle tris[], Ray &r) {
Ray ray = r;
bool_t hit = false;
@@ -235,7 +235,7 @@ BVHIntersect(const uniform LinearBVHNode nodes[],
if (TriIntersect(tris[primitivesOffset+i], ray))
hit = true;
}
if (todoOffset == 0)
if (todoOffset == 0)
break;
nodeNum = todo[--todoOffset];
}
@@ -276,10 +276,10 @@ BVHIntersect(const uniform LinearBVHNode nodes[],
inline
#endif
static void raytrace_tile(uniform int x0, uniform int x1,
uniform int y0, uniform int y1,
uniform int y0, uniform int y1,
uniform int width, uniform int height,
uniform int baseWidth, uniform int baseHeight,
const uniform float raster2camera[4][4],
const uniform float raster2camera[4][4],
const uniform float camera2world[4][4],
uniform float image[], uniform int id[],
const uniform LinearBVHNode nodes[],
@@ -302,7 +302,7 @@ static void raytrace_tile(uniform int x0, uniform int x1,
export void raytrace_ispc(uniform int width, uniform int height,
uniform int baseWidth, uniform int baseHeight,
const uniform float raster2camera[4][4],
const uniform float raster2camera[4][4],
const uniform float camera2world[4][4],
uniform float image[], uniform int id[],
const uniform LinearBVHNode nodes[],
@@ -315,7 +315,7 @@ export void raytrace_ispc(uniform int width, uniform int height,
task void raytrace_tile_task(uniform int width, uniform int height,
uniform int baseWidth, uniform int baseHeight,
const uniform float raster2camera[4][4],
const uniform float raster2camera[4][4],
const uniform float camera2world[4][4],
uniform float image[], uniform int id[],
const uniform LinearBVHNode nodes[],
@@ -326,8 +326,8 @@ task void raytrace_tile_task(uniform int width, uniform int height,
const uniform int x1 = min(x0 + dx, width);
const uniform int y0 = (taskIndex / xBuckets) * dy;
const uniform int y1 = min(y0 + dy, height);
raytrace_tile(x0, x1, y0, y1, width, height, baseWidth, baseHeight,
raytrace_tile(x0, x1, y0, y1, width, height, baseWidth, baseHeight,
raster2camera, camera2world, image,
id, nodes, triangles);
}
@@ -335,7 +335,7 @@ task void raytrace_tile_task(uniform int width, uniform int height,
export void raytrace_ispc_tasks(uniform int width, uniform int height,
uniform int baseWidth, uniform int baseHeight,
const uniform float raster2camera[4][4],
const uniform float raster2camera[4][4],
const uniform float camera2world[4][4],
uniform float image[], uniform int id[],
const uniform LinearBVHNode nodes[],
@@ -344,8 +344,8 @@ export void raytrace_ispc_tasks(uniform int width, uniform int height,
const uniform int xBuckets = (width + (dx-1)) / dx;
const uniform int yBuckets = (height + (dy-1)) / dy;
const uniform int nTasks = xBuckets * yBuckets;
launch[nTasks] raytrace_tile_task(width, height, baseWidth, baseHeight,
raster2camera, camera2world,
launch[nTasks] raytrace_tile_task(width, height, baseWidth, baseHeight,
raster2camera, camera2world,
image, id, nodes, triangles);
}

View File

@@ -1,5 +1,5 @@
/*
Copyright (c) 2011, Intel Corporation
Copyright (c) 2011-2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -28,7 +28,7 @@
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef _MSC_VER

View File

@@ -1,5 +1,5 @@
/*
Copyright (c) 2011, Intel Corporation
Copyright (c) 2011-2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -28,11 +28,11 @@
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "cuda_helpers.cuh"
__device__ static inline float clamp(float v, float low, float high)
__device__ static inline float clamp(float v, float low, float high)
{
return min(max(v, low), high);
}
@@ -90,7 +90,7 @@ struct Ray {
__device__ static void
generateRay(const float raster2camera[4][4],
generateRay(const float raster2camera[4][4],
const float camera2world[4][4],
float x, float y, Ray &ray) {
// transform raster coordinate (x, y, 0) to camera space
@@ -149,7 +149,7 @@ IntersectP(Ray ray, float3 pMin, float3 pMax, float &hit0, float &hit1) {
}
t0 = max(tNear.z, t0);
t1 = min(tFar.z, t1);
if (t0 <= t1) {
hit0 = t0;
hit1 = t1;
@@ -165,7 +165,7 @@ __device__ static inline float Lerp(float t, float a, float b) {
}
__device__ static inline float D(int x, int y, int z, int nVoxels[3],
__device__ static inline float D(int x, int y, int z, int nVoxels[3],
float density[]) {
x = clamp(x, 0, nVoxels[0]-1);
y = clamp(y, 0, nVoxels[1]-1);
@@ -180,9 +180,9 @@ __device__ static inline float3 Offset(float3 p, float3 pMin, float3 pMax) {
}
__device__ static inline float Density(float3 Pobj, float3 pMin, float3 pMax,
__device__ static inline float Density(float3 Pobj, float3 pMin, float3 pMax,
float density[], int nVoxels[3]) {
if (!Inside(Pobj, pMin, pMax))
if (!Inside(Pobj, pMin, pMax))
return 0;
// Compute voxel coordinates and offsets for _Pobj_
float3 vox = Offset(Pobj, pMin, pMax);
@@ -193,13 +193,13 @@ __device__ static inline float Density(float3 Pobj, float3 pMin, float3 pMax,
float dx = vox.x - vx, dy = vox.y - vy, dz = vox.z - vz;
// Trilinearly interpolate density values to compute local density
float d00 = Lerp(dx, D(vx, vy, vz, nVoxels, density),
float d00 = Lerp(dx, D(vx, vy, vz, nVoxels, density),
D(vx+1, vy, vz, nVoxels, density));
float d10 = Lerp(dx, D(vx, vy+1, vz, nVoxels, density),
float d10 = Lerp(dx, D(vx, vy+1, vz, nVoxels, density),
D(vx+1, vy+1, vz, nVoxels, density));
float d01 = Lerp(dx, D(vx, vy, vz+1, nVoxels, density),
float d01 = Lerp(dx, D(vx, vy, vz+1, nVoxels, density),
D(vx+1, vy, vz+1, nVoxels, density));
float d11 = Lerp(dx, D(vx, vy+1, vz+1, nVoxels, density),
float d11 = Lerp(dx, D(vx, vy+1, vz+1, nVoxels, density),
D(vx+1, vy+1, vz+1, nVoxels, density));
float d0 = Lerp(dy, d00, d10);
float d1 = Lerp(dy, d01, d11);
@@ -213,7 +213,7 @@ __device__ static inline float Density(float3 Pobj, float3 pMin, float3 pMax,
array. */
__device__ static inline float
transmittance(float3 p0, float3 p1, float3 pMin,
float3 pMax, float sigma_t,
float3 pMax, float sigma_t,
float density[], int nVoxels[3]) {
float rayT0, rayT1;
Ray ray;
@@ -253,7 +253,7 @@ distanceSquared(float3 a, float3 b) {
}
__device__ static inline float
__device__ static inline float
raymarch(float density[], int nVoxels[3], Ray ray) {
float rayT0, rayT1;
float3 pMin = {.3f, -.2f, .3f}, pMax = {1.8f, 2.3f, 1.8f};
@@ -281,7 +281,7 @@ raymarch(float density[], int nVoxels[3], Ray ray) {
float t = rayT0;
float3 pos = ray.origin + ray.dir * rayT0;
float3 dirStep = ray.dir * stepT;
while (t < rayT1)
while (t < rayT1)
{
float d = Density(pos, pMin, pMax, density, nVoxels);
@@ -291,7 +291,7 @@ raymarch(float density[], int nVoxels[3], Ray ray) {
break;
// direct lighting
float Li = lightIntensity / distanceSquared(lightPos, pos) *
float Li = lightIntensity / distanceSquared(lightPos, pos) *
transmittance(lightPos, pos, pMin, pMax, sigma_a + sigma_s,
density, nVoxels);
L += stepDist * atten * d * sigma_s * (Li + Le);
@@ -314,20 +314,20 @@ raymarch(float density[], int nVoxels[3], Ray ray) {
*/
__device__ static void
volume_tile(int x0, int y0, int x1,
int y1, float density[], int nVoxels[3],
int y1, float density[], int nVoxels[3],
const float raster2camera[4][4],
const float camera2world[4][4],
const float camera2world[4][4],
int width, int height, float image[]) {
// Work on 4x4=16 pixel big tiles of the image. This function thus
// implicitly assumes that both (x1-x0) and (y1-y0) are evenly divisble
// by 4.
for (int y = y0; y < y1; y += 8) {
for (int x = x0; x < x1; x += 8) {
for (int ob = 0; ob < 64; ob += programCount)
for (int ob = 0; ob < 64; ob += programCount)
{
const int o = ob + programIndex;
// These two arrays encode the mapping from [0,15] to
// offsets within the 4x4 pixel block so that we render
// each pixel inside the block
@@ -360,9 +360,9 @@ volume_tile(int x0, int y0, int x1,
__global__ void
volume_task(float density[], int _nVoxels[3],
volume_task(float density[], int _nVoxels[3],
const float _raster2camera[4][4],
const float _camera2world[4][4],
const float _camera2world[4][4],
int width, int height, float image[]) {
if (taskIndex0 >= taskCount0) return;
@@ -389,7 +389,7 @@ volume_task(float density[], int _nVoxels[3],
raster2camera[3][1] = _raster2camera[3][1];
raster2camera[3][2] = _raster2camera[3][2];
raster2camera[3][3] = _raster2camera[3][3];
float camera2world[4][4];
camera2world[0][0] = _camera2world[0][0];
camera2world[0][1] = _camera2world[0][1];
@@ -430,24 +430,24 @@ volume_task(float density[], int _nVoxels[3],
extern "C"
__global__ void
volume_ispc_tasks___export( float density[], int nVoxels[3],
volume_ispc_tasks___export( float density[], int nVoxels[3],
const float raster2camera[4][4],
const float camera2world[4][4],
const float camera2world[4][4],
int width, int height, float image[]) {
// Launch tasks to work on (dx,dy)-sized tiles of the image
int dx = 8, dy = 8;
int nTasks = ((width+(dx-1))/dx) * ((height+(dy-1))/dy);
launch(nTasks,1,1,volume_task)
(density, nVoxels, raster2camera, camera2world,
(density, nVoxels, raster2camera, camera2world,
width, height, image);
cudaDeviceSynchronize();
}
extern "C"
__host__ void
volume_ispc_tasks( float density[], int nVoxels[3],
volume_ispc_tasks( float density[], int nVoxels[3],
const float raster2camera[4][4],
const float camera2world[4][4],
const float camera2world[4][4],
int width, int height, float image[]) {
volume_ispc_tasks___export<<<1,32>>>(density, nVoxels, raster2camera, camera2world, width, height,image);
cudaDeviceSynchronize();

View File

@@ -1,5 +1,5 @@
/*
Copyright (c) 2011, Intel Corporation
Copyright (c) 2011-2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -28,7 +28,7 @@
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
@@ -41,7 +41,7 @@ struct Ray {
static inline void
generateRay(const uniform float raster2camera[4][4],
generateRay(const uniform float raster2camera[4][4],
const uniform float camera2world[4][4],
float x, float y, Ray &ray) {
// transform raster coordinate (x, y, 0) to camera space
@@ -100,7 +100,7 @@ IntersectP(Ray ray, float3 pMin, float3 pMax, float &hit0, float &hit1) {
}
t0 = max(tNear.z, t0);
t1 = min(tFar.z, t1);
if (t0 <= t1) {
hit0 = t0;
hit1 = t1;
@@ -116,7 +116,7 @@ static inline float Lerp(float t, float a, float b) {
}
static inline float D(int x, int y, int z, uniform int nVoxels[3],
static inline float D(int x, int y, int z, uniform int nVoxels[3],
uniform float density[]) {
x = clamp(x, 0, nVoxels[0]-1);
y = clamp(y, 0, nVoxels[1]-1);
@@ -131,9 +131,9 @@ static inline float3 Offset(float3 p, float3 pMin, float3 pMax) {
}
static inline float Density(float3 Pobj, float3 pMin, float3 pMax,
static inline float Density(float3 Pobj, float3 pMin, float3 pMax,
uniform float density[], uniform int nVoxels[3]) {
if (!Inside(Pobj, pMin, pMax))
if (!Inside(Pobj, pMin, pMax))
return 0;
// Compute voxel coordinates and offsets for _Pobj_
float3 vox = Offset(Pobj, pMin, pMax);
@@ -144,13 +144,13 @@ static inline float Density(float3 Pobj, float3 pMin, float3 pMax,
float dx = vox.x - vx, dy = vox.y - vy, dz = vox.z - vz;
// Trilinearly interpolate density values to compute local density
float d00 = Lerp(dx, D(vx, vy, vz, nVoxels, density),
float d00 = Lerp(dx, D(vx, vy, vz, nVoxels, density),
D(vx+1, vy, vz, nVoxels, density));
float d10 = Lerp(dx, D(vx, vy+1, vz, nVoxels, density),
float d10 = Lerp(dx, D(vx, vy+1, vz, nVoxels, density),
D(vx+1, vy+1, vz, nVoxels, density));
float d01 = Lerp(dx, D(vx, vy, vz+1, nVoxels, density),
float d01 = Lerp(dx, D(vx, vy, vz+1, nVoxels, density),
D(vx+1, vy, vz+1, nVoxels, density));
float d11 = Lerp(dx, D(vx, vy+1, vz+1, nVoxels, density),
float d11 = Lerp(dx, D(vx, vy+1, vz+1, nVoxels, density),
D(vx+1, vy+1, vz+1, nVoxels, density));
float d0 = Lerp(dy, d00, d10);
float d1 = Lerp(dy, d01, d11);
@@ -164,7 +164,7 @@ static inline float Density(float3 Pobj, float3 pMin, float3 pMax,
array. */
static inline float
transmittance(uniform float3 p0, float3 p1, uniform float3 pMin,
uniform float3 pMax, uniform float sigma_t,
uniform float3 pMax, uniform float sigma_t,
uniform float density[], uniform int nVoxels[3]) {
float rayT0, rayT1;
Ray ray;
@@ -204,7 +204,7 @@ distanceSquared(float3 a, float3 b) {
}
static inline float
static inline float
raymarch(uniform float density[], uniform int nVoxels[3], Ray ray) {
float rayT0, rayT1;
const uniform float3 pMin = {.3, -.2, .3}, pMax = {1.8, 2.3, 1.8};
@@ -232,7 +232,7 @@ raymarch(uniform float density[], uniform int nVoxels[3], Ray ray) {
float t = rayT0;
float3 pos = ray.origin + ray.dir * rayT0;
float3 dirStep = ray.dir * stepT;
while (t < rayT1)
while (t < rayT1)
{
float d = Density(pos, pMin, pMax, density, nVoxels);
@@ -242,7 +242,7 @@ raymarch(uniform float density[], uniform int nVoxels[3], Ray ray) {
break;
// direct lighting
float Li = lightIntensity / distanceSquared(lightPos, pos) *
float Li = lightIntensity / distanceSquared(lightPos, pos) *
transmittance(lightPos, pos, pMin, pMax, sigma_a + sigma_s,
density, nVoxels);
L += stepDist * atten * d * sigma_s * (Li + Le);
@@ -265,16 +265,16 @@ raymarch(uniform float density[], uniform int nVoxels[3], Ray ray) {
*/
static inline void
volume_tile(uniform int x0, uniform int y0, uniform int x1,
uniform int y1, uniform float density[], uniform int nVoxels[3],
uniform int y1, uniform float density[], uniform int nVoxels[3],
const uniform float raster2camera[4][4],
const uniform float camera2world[4][4],
const uniform float camera2world[4][4],
uniform int width, uniform int height, uniform float image[]) {
// Work on 4x4=16 pixel big tiles of the image. This function thus
// implicitly assumes that both (x1-x0) and (y1-y0) are evenly divisble
// by 4.
#if 0
for (uniform int y = y0; y < y1; y += 8)
for (uniform int x = x0; x < x1; x += 8)
for (uniform int y = y0; y < y1; y += 8)
for (uniform int x = x0; x < x1; x += 8)
foreach (o = 0 ... 64)
{
// These two arrays encode the mapping from [0,15] to
@@ -304,7 +304,7 @@ volume_tile(uniform int x0, uniform int y0, uniform int x1,
image[offset] = raymarch(density, nVoxels, ray);
}
#else
foreach_tiled (y = y0 ... y1, x = x0 ... x1)
foreach_tiled (y = y0 ... y1, x = x0 ... x1)
{
// Use viewing parameters to compute the corresponding ray
// for the pixel
@@ -321,10 +321,10 @@ volume_tile(uniform int x0, uniform int y0, uniform int x1,
task void
volume_task(uniform float density[], uniform int _nVoxels[3],
volume_task(uniform float density[], uniform int _nVoxels[3],
const uniform float _raster2camera[4][4],
const uniform float _camera2world[4][4],
uniform int width, uniform int height, uniform float image[])
const uniform float _camera2world[4][4],
uniform int width, uniform int height, uniform float image[])
{
if (taskIndex >= taskCount) return;
@@ -351,7 +351,7 @@ volume_task(uniform float density[], uniform int _nVoxels[3],
raster2camera[3][1] = _raster2camera[3][1];
raster2camera[3][2] = _raster2camera[3][2];
raster2camera[3][3] = _raster2camera[3][3];
uniform float camera2world[4][4];
camera2world[0][0] = _camera2world[0][0];
camera2world[0][1] = _camera2world[0][1];
@@ -390,24 +390,24 @@ volume_task(uniform float density[], uniform int _nVoxels[3],
export void
volume_ispc(uniform float density[], uniform int nVoxels[3],
volume_ispc(uniform float density[], uniform int nVoxels[3],
const uniform float raster2camera[4][4],
const uniform float camera2world[4][4],
const uniform float camera2world[4][4],
uniform int width, uniform int height, uniform float image[]) {
volume_tile(0, 0, width, height, density, nVoxels, raster2camera,
volume_tile(0, 0, width, height, density, nVoxels, raster2camera,
camera2world, width, height, image);
}
export void
volume_ispc_tasks(uniform float density[], uniform int nVoxels[3],
volume_ispc_tasks(uniform float density[], uniform int nVoxels[3],
const uniform float raster2camera[4][4],
const uniform float camera2world[4][4],
const uniform float camera2world[4][4],
uniform int width, uniform int height, uniform float image[]) {
// Launch tasks to work on (dx,dy)-sized tiles of the image
const uniform int dx = 8, dy = 8;
const uniform int nTasks = ((width+(dx-1))/dx) * ((height+(dy-1))/dy);
launch[nTasks] volume_task(density, nVoxels, raster2camera, camera2world,
launch[nTasks] volume_task(density, nVoxels, raster2camera, camera2world,
width, height, image);
sync;
}