working on deferred shading

This commit is contained in:
Evghenii
2013-11-10 12:10:10 +01:00
parent 1a37135f98
commit 78d509dba5
9 changed files with 1563 additions and 168 deletions

View File

@@ -337,8 +337,20 @@ define float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline
ret float %ret
}
declare <WIDTH x float> @__rcp_varying_float(<WIDTH x float>) nounwind readnone
declare <WIDTH x float> @__rsqrt_varying_float(<WIDTH x float>) nounwind readnone
define <WIDTH x float> @__rcp_varying_float(<WIDTH x float>) nounwind readnone alwaysinline
{
%v = extractelement <1 x float> %0, i32 0
%r = call float @__rcp_uniform_float(float %v)
%rv = insertelement <1 x float> undef, float %r, i32 0
ret <WIDTH x float> %rv
}
define <WIDTH x float> @__rsqrt_varying_float(<WIDTH x float>) nounwind readnone alwaysinline
{
%v = extractelement <1 x float> %0, i32 0
%r = call float @__rsqrt_uniform_float(float %v)
%rv = insertelement <1 x float> undef, float %r, i32 0
ret <WIDTH x float> %rv
}
define <WIDTH x float> @__sqrt_varying_float(<WIDTH x float>) nounwind readnone alwaysinline
{
%v = extractelement <1 x float> %0, i32 0
@@ -397,19 +409,19 @@ define i64 @__movmsk(<1 x i1>) nounwind readnone alwaysinline {
define i1 @__any(<1 x i1>) nounwind readnone alwaysinline {
%v = extractelement <1 x i1> %0, i32 0
; %cmp = icmp ne i1 %v, 0
ret i1 %v
%cmp = icmp ne i1 %v, 0
ret i1 %cmp
}
define i1 @__all(<1 x i1>) nounwind readnone alwaysinline {
%v = extractelement <1 x i1> %0, i32 0
; %cmp = icmp eq i1 %v, 1
ret i1 %v
%cmp = icmp eq i1 %v, 1
ret i1 %cmp
}
define i1 @__none(<1 x i1>) nounwind readnone alwaysinline {
%v = extractelement <1 x i1> %0, i32 0
%cmp = xor i1 %v, 1; ;icmp eq i1 %v, 0
%cmp = icmp eq i1 %v, 0
ret i1 %cmp
}

View File

@@ -35,7 +35,7 @@
#define DEFERRED_H
// Currently tile widths must be a multiple of SIMD width (i.e. 8 for ispc sse4x2)!
#define MIN_TILE_WIDTH 16
#define MIN_TILE_WIDTH 64
#define MIN_TILE_HEIGHT 16
#define MAX_LIGHTS 1024

View File

@@ -0,0 +1,650 @@
/*
Copyright (c) 2010-2011, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "deferred.h"
#include <stdio.h>
#include <assert.h>
#define programCount 32
#define programIndex (threadIdx.x & 31)
#define taskIndex (blockIdx.x*4 + (threadIdx.x >> 5))
#define taskCount (gridDim.x*4)
#define warpIdx (threadIdx.x >> 5)
#define int32 int
#define int16 short
#define int8 char
__device__ static inline float clamp(float v, float low, float high)
{
return min(max(v, low), high);
}
struct InputDataArrays
{
float *zBuffer;
unsigned int16 *normalEncoded_x; // half float
unsigned int16 *normalEncoded_y; // half float
unsigned int16 *specularAmount; // half float
unsigned int16 *specularPower; // half float
unsigned int8 *albedo_x; // unorm8
unsigned int8 *albedo_y; // unorm8
unsigned int8 *albedo_z; // unorm8
float *lightPositionView_x;
float *lightPositionView_y;
float *lightPositionView_z;
float *lightAttenuationBegin;
float *lightColor_x;
float *lightColor_y;
float *lightColor_z;
float *lightAttenuationEnd;
};
struct InputHeader
{
float cameraProj[4][4];
float cameraNear;
float cameraFar;
int32 framebufferWidth;
int32 framebufferHeight;
int32 numLights;
int32 inputDataChunkSize;
int32 inputDataArrayOffsets[idaNum];
};
///////////////////////////////////////////////////////////////////////////
// Common utility routines
__device__
static inline float
dot3(float x, float y, float z, float a, float b, float c) {
return (x*a + y*b + z*c);
}
#if 0
template<typename T, int N>
struct Uniform
{
T data[(N-1)/programCount+1];
__device__ inline const T& operator[](const int i) const
{
const int laneIdx = i & (programCount-1);
const int chunkIdx = i >> 5;
return __shfl(data[chunkIdx], laneIdx);
}
}
#endif
__device__
static inline void
normalize3(float x, float y, float z, float &ox, float &oy, float &oz) {
float n = rsqrt(x*x + y*y + z*z);
ox = x * n;
oy = y * n;
oz = z * n;
}
__device__ inline
static float reduce_min(float value)
{
#pragma unroll
for (int i = 4; i >=0; i--)
value = min(value, __shfl_xor(value, 1<<i, 32));
return value;
}
__device__ inline
static float reduce_max(float value)
{
#pragma unroll
for (int i = 4; i >=0; i--)
value = max(value, __shfl_xor(value, 1<<i, 32));
return value;
}
__device__ inline
static int reduce_sum(int value)
{
#pragma unroll
for (int i = 4; i >=0; i--)
value += __shfl_xor(value, 1<<i, 32);
return value;
}
static __device__ __forceinline__ uint shfl_scan_add_step(uint partial, uint up_offset)
{
uint result;
asm(
"{.reg .u32 r0;"
".reg .pred p;"
"shfl.up.b32 r0|p, %1, %2, 0;"
"@p add.u32 r0, r0, %3;"
"mov.u32 %0, r0;}"
: "=r"(result) : "r"(partial), "r"(up_offset), "r"(partial));
return result;
}
static __device__ __forceinline__ int inclusive_scan_warp(const int value)
{
uint sum = value;
#pragma unroll
for(int i = 0; i < 5; ++i)
sum = shfl_scan_add_step(sum, 1 << i);
return sum - value;
}
static __device__ __forceinline__ int lanemask_lt()
{
int mask;
asm("mov.u32 %0, %lanemask_lt;" : "=r" (mask));
return mask;
}
static __device__ __forceinline__ int2 warpBinExclusiveScan(const bool p)
{
const unsigned int b = __ballot(p);
return make_int2(__popc(b & lanemask_lt()), __popc(b));
}
__device__
static inline float
Unorm8ToFloat32(unsigned int8 u) {
return (float)u * (1.0f / 255.0f);
}
__device__
static inline unsigned int8
Float32ToUnorm8(float f) {
return (unsigned int8)(f * 255.0f);
}
__device__
static inline void
ComputeZBounds(
int32 tileStartX, int32 tileEndX,
int32 tileStartY, int32 tileEndY,
// G-buffer data
float zBuffer[],
int32 gBufferWidth,
// Camera data
float cameraProj_33, float cameraProj_43,
float cameraNear, float cameraFar,
// Output
float &minZ,
float &maxZ
)
{
// Find Z bounds
float laneMinZ = cameraFar;
float laneMaxZ = cameraNear;
for ( int32 y = tileStartY; y < tileEndY; ++y) {
for ( int xb = tileStartX; xb < tileEndX; xb += programCount)
{
const int x = xb + programIndex;
if (x >= tileEndX) break;
// Unproject depth buffer Z value into view space
float z = zBuffer[y * gBufferWidth + x];
float viewSpaceZ = cameraProj_43 / (z - cameraProj_33);
// Work out Z bounds for our samples
// Avoid considering skybox/background or otherwise invalid pixels
if ((viewSpaceZ < cameraFar) && (viewSpaceZ >= cameraNear)) {
laneMinZ = min(laneMinZ, viewSpaceZ);
laneMaxZ = max(laneMaxZ, viewSpaceZ);
}
}
}
minZ = reduce_min(laneMinZ);
maxZ = reduce_max(laneMaxZ);
}
__device__
static inline int32
IntersectLightsWithTileMinMax(
int32 tileStartX, int32 tileEndX,
int32 tileStartY, int32 tileEndY,
// Tile data
float minZ,
float maxZ,
// G-buffer data
int32 gBufferWidth, int32 gBufferHeight,
// Camera data
float cameraProj_11, float cameraProj_22,
// Light Data
int32 numLights,
float light_positionView_x_array[],
float light_positionView_y_array[],
float light_positionView_z_array[],
float light_attenuationEnd_array[],
// Output
volatile int32 tileLightIndices[]
)
{
float gBufferScale_x = 0.5f * (float)gBufferWidth;
float gBufferScale_y = 0.5f * (float)gBufferHeight;
float frustumPlanes_xy[4] = {
-(cameraProj_11 * gBufferScale_x),
(cameraProj_11 * gBufferScale_x),
(cameraProj_22 * gBufferScale_y),
-(cameraProj_22 * gBufferScale_y) };
float frustumPlanes_z[4] = {
tileEndX - gBufferScale_x,
-tileStartX + gBufferScale_x,
tileEndY - gBufferScale_y,
-tileStartY + gBufferScale_y };
for ( int i = 0; i < 4; ++i) {
float norm = rsqrt(frustumPlanes_xy[i] * frustumPlanes_xy[i] +
frustumPlanes_z[i] * frustumPlanes_z[i]);
frustumPlanes_xy[i] *= norm;
frustumPlanes_z[i] *= norm;
}
int32 tileNumLights = 0;
for ( int lightIndexB = 0; lightIndexB < numLights; lightIndexB += programCount)
{
const int lightIndex = lightIndexB + programIndex;
float light_positionView_z = light_positionView_z_array[lightIndex];
float light_attenuationEnd = light_attenuationEnd_array[lightIndex];
float light_attenuationEndNeg = -light_attenuationEnd;
float d = light_positionView_z - minZ;
bool inFrustum = (d >= light_attenuationEndNeg);
d = maxZ - light_positionView_z;
inFrustum = inFrustum && (d >= light_attenuationEndNeg);
// This seems better than cif(!inFrustum) ccontinue; here since we
// don't actually need to mask the rest of this function - this is
// just a greedy early-out. Could also structure all of this as
// nested if() statements, but this a bit easier to read
int active = 0;
if ((inFrustum)) {
float light_positionView_x = light_positionView_x_array[lightIndex];
float light_positionView_y = light_positionView_y_array[lightIndex];
d = light_positionView_z * frustumPlanes_z[0] +
light_positionView_x * frustumPlanes_xy[0];
inFrustum = inFrustum && (d >= light_attenuationEndNeg);
d = light_positionView_z * frustumPlanes_z[1] +
light_positionView_x * frustumPlanes_xy[1];
inFrustum = inFrustum && (d >= light_attenuationEndNeg);
d = light_positionView_z * frustumPlanes_z[2] +
light_positionView_y * frustumPlanes_xy[2];
inFrustum = inFrustum && (d >= light_attenuationEndNeg);
d = light_positionView_z * frustumPlanes_z[3] +
light_positionView_y * frustumPlanes_xy[3];
inFrustum = inFrustum && (d >= light_attenuationEndNeg);
// Pack and store intersecting lights
#if 0
if (inFrustum) {
tileNumLights += packed_store_active(&tileLightIndices[tileNumLights],
lightIndex);
}
#else
if (inFrustum)
{
active = 1;
}
#endif
}
#if 1
if (lightIndex >= numLights)
active = 0;
#if 0
const int idx = tileNumLights + inclusive_scan_warp(active);
const int nactive = reduce_sum(active);
#else
const int2 res = warpBinExclusiveScan(active);
const int idx = tileNumLights + res.x;
const int nactive = res.y;
#endif
if (active)
tileLightIndices[idx] = lightIndex;
tileNumLights += nactive;
#endif
}
return tileNumLights;
}
__device__
static inline int32
IntersectLightsWithTile(
int32 tileStartX, int32 tileEndX,
int32 tileStartY, int32 tileEndY,
int32 gBufferWidth, int32 gBufferHeight,
// G-buffer data
float zBuffer[],
// Camera data
float cameraProj_11, float cameraProj_22,
float cameraProj_33, float cameraProj_43,
float cameraNear, float cameraFar,
// Light Data
int32 numLights,
float light_positionView_x_array[],
float light_positionView_y_array[],
float light_positionView_z_array[],
float light_attenuationEnd_array[],
// Output
int32 tileLightIndices[]
)
{
float minZ, maxZ;
ComputeZBounds(tileStartX, tileEndX, tileStartY, tileEndY,
zBuffer, gBufferWidth, cameraProj_33, cameraProj_43, cameraNear, cameraFar,
minZ, maxZ);
int32 tileNumLights = IntersectLightsWithTileMinMax(
tileStartX, tileEndX, tileStartY, tileEndY, minZ, maxZ,
gBufferWidth, gBufferHeight, cameraProj_11, cameraProj_22,
MAX_LIGHTS, light_positionView_x_array, light_positionView_y_array,
light_positionView_z_array, light_attenuationEnd_array,
tileLightIndices);
return tileNumLights;
}
__device__
static inline void
ShadeTile(
int32 tileStartX, int32 tileEndX,
int32 tileStartY, int32 tileEndY,
int32 gBufferWidth, int32 gBufferHeight,
const InputDataArrays &inputData,
// Camera data
float cameraProj_11, float cameraProj_22,
float cameraProj_33, float cameraProj_43,
// Light list
volatile int32 tileLightIndices[],
int32 tileNumLights,
// UI
bool visualizeLightCount,
// Output
unsigned int8 framebuffer_r[],
unsigned int8 framebuffer_g[],
unsigned int8 framebuffer_b[]
)
{
if (tileNumLights == 0 || visualizeLightCount) {
unsigned int8 c = (unsigned int8)(min(tileNumLights << 2, 255));
for ( int32 y = tileStartY; y < tileEndY; ++y) {
for ( int xb = tileStartX ; xb < tileEndX; xb += programCount)
{
const int x = xb + programIndex;
if (x >= tileEndX) continue;
int32 framebufferIndex = (y * gBufferWidth + x);
framebuffer_r[framebufferIndex] = c;
framebuffer_g[framebufferIndex] = c;
framebuffer_b[framebufferIndex] = c;
}
}
} else {
float twoOverGBufferWidth = 2.0f / gBufferWidth;
float twoOverGBufferHeight = 2.0f / gBufferHeight;
for ( int32 y = tileStartY; y < tileEndY; ++y) {
float positionScreen_y = -(((0.5f + y) * twoOverGBufferHeight) - 1.f);
for ( int xb = tileStartX ; xb < tileEndX; xb += programCount)
{
const int x = xb + programIndex;
// if (x >= tileEndX) break;
int32 gBufferOffset = y * gBufferWidth + x;
// Reconstruct position and (negative) view vector from G-buffer
float surface_positionView_x, surface_positionView_y, surface_positionView_z;
float Vneg_x, Vneg_y, Vneg_z;
float z = inputData.zBuffer[gBufferOffset];
// Compute screen/clip-space position
// NOTE: Mind DX11 viewport transform and pixel center!
float positionScreen_x = (0.5f + (float)(x)) *
twoOverGBufferWidth - 1.0f;
// Unproject depth buffer Z value into view space
surface_positionView_z = cameraProj_43 / (z - cameraProj_33);
surface_positionView_x = positionScreen_x * surface_positionView_z /
cameraProj_11;
surface_positionView_y = positionScreen_y * surface_positionView_z /
cameraProj_22;
// We actually end up with a vector pointing *at* the
// surface (i.e. the negative view vector)
normalize3(surface_positionView_x, surface_positionView_y,
surface_positionView_z, Vneg_x, Vneg_y, Vneg_z);
// Reconstruct normal from G-buffer
float surface_normal_x, surface_normal_y, surface_normal_z;
float normal_x = __half2float(inputData.normalEncoded_x[gBufferOffset]);
float normal_y = __half2float(inputData.normalEncoded_y[gBufferOffset]);
float f = (normal_x - normal_x * normal_x) + (normal_y - normal_y * normal_y);
float m = sqrt(4.0f * f - 1.0f);
surface_normal_x = m * (4.0f * normal_x - 2.0f);
surface_normal_y = m * (4.0f * normal_y - 2.0f);
surface_normal_z = 3.0f - 8.0f * f;
// Load other G-buffer parameters
float surface_specularAmount =
__half2float(inputData.specularAmount[gBufferOffset]);
float surface_specularPower =
__half2float(inputData.specularPower[gBufferOffset]);
float surface_albedo_x = Unorm8ToFloat32(inputData.albedo_x[gBufferOffset]);
float surface_albedo_y = Unorm8ToFloat32(inputData.albedo_y[gBufferOffset]);
float surface_albedo_z = Unorm8ToFloat32(inputData.albedo_z[gBufferOffset]);
float lit_x = 0.0f;
float lit_y = 0.0f;
float lit_z = 0.0f;
for ( int32 tileLightIndex = 0; tileLightIndex < tileNumLights;
++tileLightIndex) {
int32 lightIndex = tileLightIndices[tileLightIndex];
// Gather light data relevant to initial culling
float light_positionView_x =
inputData.lightPositionView_x[lightIndex];
float light_positionView_y =
inputData.lightPositionView_y[lightIndex];
float light_positionView_z =
inputData.lightPositionView_z[lightIndex];
float light_attenuationEnd =
inputData.lightAttenuationEnd[lightIndex];
// Compute light vector
float L_x = light_positionView_x - surface_positionView_x;
float L_y = light_positionView_y - surface_positionView_y;
float L_z = light_positionView_z - surface_positionView_z;
float distanceToLight2 = dot3(L_x, L_y, L_z, L_x, L_y, L_z);
// Clip at end of attenuation
float light_attenutaionEnd2 = light_attenuationEnd * light_attenuationEnd;
if (distanceToLight2 < light_attenutaionEnd2) {
float distanceToLight = sqrt(distanceToLight2);
// HLSL "rcp" is allowed to be fairly inaccurate
float distanceToLightRcp = 1.0f/distanceToLight;
L_x *= distanceToLightRcp;
L_y *= distanceToLightRcp;
L_z *= distanceToLightRcp;
// Start computing brdf
float NdotL = dot3(surface_normal_x, surface_normal_y,
surface_normal_z, L_x, L_y, L_z);
// Clip back facing
if (NdotL > 0.0f) {
float light_attenuationBegin =
inputData.lightAttenuationBegin[lightIndex];
// Light distance attenuation (linstep)
float lightRange = (light_attenuationEnd - light_attenuationBegin);
float falloffPosition = (light_attenuationEnd - distanceToLight);
float attenuation = min(falloffPosition / lightRange, 1.0f);
float H_x = (L_x - Vneg_x);
float H_y = (L_y - Vneg_y);
float H_z = (L_z - Vneg_z);
normalize3(H_x, H_y, H_z, H_x, H_y, H_z);
float NdotH = dot3(surface_normal_x, surface_normal_y,
surface_normal_z, H_x, H_y, H_z);
NdotH = max(NdotH, 0.0f);
float specular = pow(NdotH, surface_specularPower);
float specularNorm = (surface_specularPower + 2.0f) *
(1.0f / 8.0f);
float specularContrib = surface_specularAmount *
specularNorm * specular;
float k = attenuation * NdotL * (1.0f + specularContrib);
float light_color_x = inputData.lightColor_x[lightIndex];
float light_color_y = inputData.lightColor_y[lightIndex];
float light_color_z = inputData.lightColor_z[lightIndex];
float lightContrib_x = surface_albedo_x * light_color_x;
float lightContrib_y = surface_albedo_y * light_color_y;
float lightContrib_z = surface_albedo_z * light_color_z;
lit_x += lightContrib_x * k;
lit_y += lightContrib_y * k;
lit_z += lightContrib_z * k;
}
}
}
// Gamma correct
// These pows are pretty slow right now, but we can do
// something faster if really necessary to squeeze every
// last bit of performance out of it
float gamma = 1.0 / 2.2f;
lit_x = pow(clamp(lit_x, 0.0f, 1.0f), gamma);
lit_y = pow(clamp(lit_y, 0.0f, 1.0f), gamma);
lit_z = pow(clamp(lit_z, 0.0f, 1.0f), gamma);
framebuffer_r[gBufferOffset] = Float32ToUnorm8(lit_x);
framebuffer_g[gBufferOffset] = Float32ToUnorm8(lit_y);
framebuffer_b[gBufferOffset] = Float32ToUnorm8(lit_z);
}
}
}
}
///////////////////////////////////////////////////////////////////////////
// Static decomposition
extern "C" __global__ void
RenderTile( int num_groups_x, int num_groups_y,
const InputHeader *inputHeaderPtr,
const InputDataArrays *inputDataPtr,
int visualizeLightCount,
// Output
unsigned int8 framebuffer_r[],
unsigned int8 framebuffer_g[],
unsigned int8 framebuffer_b[]) {
if (taskIndex >= taskCount) return;
const InputHeader inputHeader = *inputHeaderPtr;
const InputDataArrays inputData = *inputDataPtr;
int32 group_y = taskIndex / num_groups_x;
int32 group_x = taskIndex % num_groups_x;
int32 tile_start_x = group_x * MIN_TILE_WIDTH;
int32 tile_start_y = group_y * MIN_TILE_HEIGHT;
int32 tile_end_x = tile_start_x + MIN_TILE_WIDTH;
int32 tile_end_y = tile_start_y + MIN_TILE_HEIGHT;
int framebufferWidth = inputHeader.framebufferWidth;
int framebufferHeight = inputHeader.framebufferHeight;
float cameraProj_00 = inputHeader.cameraProj[0][0];
float cameraProj_11 = inputHeader.cameraProj[1][1];
float cameraProj_22 = inputHeader.cameraProj[2][2];
float cameraProj_32 = inputHeader.cameraProj[3][2];
// Light intersection: figure out which lights illuminate this tile.
#if 0
int tileLightIndices[MAX_LIGHTS]; // Light list for the tile
#else
__shared__ int tileLightIndicesFull[4*MAX_LIGHTS]; // Light list for the tile
int *tileLightIndices = &tileLightIndicesFull[warpIdx*MAX_LIGHTS];
#endif
int numTileLights =
IntersectLightsWithTile(tile_start_x, tile_end_x,
tile_start_y, tile_end_y,
framebufferWidth, framebufferHeight,
inputData.zBuffer,
cameraProj_00, cameraProj_11,
cameraProj_22, cameraProj_32,
inputHeader.cameraNear, inputHeader.cameraFar,
MAX_LIGHTS,
inputData.lightPositionView_x,
inputData.lightPositionView_y,
inputData.lightPositionView_z,
inputData.lightAttenuationEnd,
tileLightIndices);
// And now shade the tile, using the lights in tileLightIndices
ShadeTile(tile_start_x, tile_end_x, tile_start_y, tile_end_y,
framebufferWidth, framebufferHeight, inputData,
cameraProj_00, cameraProj_11, cameraProj_22, cameraProj_32,
tileLightIndices, numTileLights, visualizeLightCount,
framebuffer_r, framebuffer_g, framebuffer_b);
}

View File

@@ -472,6 +472,7 @@ RenderTile(uniform int num_groups_x, uniform int num_groups_y,
uniform float cameraProj_22 = inputHeader.cameraProj[2][2];
uniform float cameraProj_32 = inputHeader.cameraProj[3][2];
// Light intersection: figure out which lights illuminate this tile.
uniform int tileLightIndices[MAX_LIGHTS]; // Light list for the tile
uniform int numTileLights =
@@ -489,6 +490,7 @@ RenderTile(uniform int num_groups_x, uniform int num_groups_y,
inputData.lightAttenuationEnd,
tileLightIndices);
// And now shade the tile, using the lights in tileLightIndices
ShadeTile(tile_start_x, tile_end_x, tile_start_y, tile_end_y,
framebufferWidth, framebufferHeight, inputData,
@@ -519,154 +521,3 @@ RenderStatic(uniform InputHeader &inputHeader,
framebuffer_r, framebuffer_g, framebuffer_b);
}
///////////////////////////////////////////////////////////////////////////
// Routines for dynamic decomposition path
// This computes the z min/max range for a whole row worth of tiles.
export void
ComputeZBoundsRow(
uniform int32 tileY,
uniform int32 tileWidth, uniform int32 tileHeight,
uniform int32 numTilesX, uniform int32 numTilesY,
// G-buffer data
uniform float zBuffer[],
uniform int32 gBufferWidth,
// Camera data
uniform float cameraProj_33, uniform float cameraProj_43,
uniform float cameraNear, uniform float cameraFar,
// Output
uniform float minZArray[],
uniform float maxZArray[]
)
{
for (uniform int32 tileX = 0; tileX < numTilesX; ++tileX) {
uniform float minZ, maxZ;
ComputeZBounds(
tileX * tileWidth, tileX * tileWidth + tileWidth,
tileY * tileHeight, tileY * tileHeight + tileHeight,
zBuffer, gBufferWidth,
cameraProj_33, cameraProj_43, cameraNear, cameraFar,
minZ, maxZ);
minZArray[tileX] = minZ;
maxZArray[tileX] = maxZ;
}
}
// Reclassifies the lights with respect to four sub-tiles when we refine a tile.
// numLights need not be a multiple of programCount here, but the input and output arrays
// should be able to handle programCount-sized load/stores.
export void
SplitTileMinMax(
uniform int32 tileMidX, uniform int32 tileMidY,
// Subtile data (00, 10, 01, 11)
uniform float subtileMinZ[],
uniform float subtileMaxZ[],
// G-buffer data
uniform int32 gBufferWidth, uniform int32 gBufferHeight,
// Camera data
uniform float cameraProj_11, uniform float cameraProj_22,
// Light Data
uniform int32 lightIndices[],
uniform int32 numLights,
uniform float light_positionView_x_array[],
uniform float light_positionView_y_array[],
uniform float light_positionView_z_array[],
uniform float light_attenuationEnd_array[],
// Outputs
uniform int32 subtileIndices[],
uniform int32 subtileIndicesPitch,
uniform int32 subtileNumLights[]
)
{
uniform float gBufferScale_x = 0.5f * (float)gBufferWidth;
uniform float gBufferScale_y = 0.5f * (float)gBufferHeight;
uniform float frustumPlanes_xy[2] = { -(cameraProj_11 * gBufferScale_x),
(cameraProj_22 * gBufferScale_y) };
uniform float frustumPlanes_z[2] = { tileMidX - gBufferScale_x,
tileMidY - gBufferScale_y };
// Normalize
uniform float norm[2] = { rsqrt(frustumPlanes_xy[0] * frustumPlanes_xy[0] +
frustumPlanes_z[0] * frustumPlanes_z[0]),
rsqrt(frustumPlanes_xy[1] * frustumPlanes_xy[1] +
frustumPlanes_z[1] * frustumPlanes_z[1]) };
frustumPlanes_xy[0] *= norm[0];
frustumPlanes_xy[1] *= norm[1];
frustumPlanes_z[0] *= norm[0];
frustumPlanes_z[1] *= norm[1];
// Initialize
uniform int32 subtileLightOffset[4];
subtileLightOffset[0] = 0 * subtileIndicesPitch;
subtileLightOffset[1] = 1 * subtileIndicesPitch;
subtileLightOffset[2] = 2 * subtileIndicesPitch;
subtileLightOffset[3] = 3 * subtileIndicesPitch;
foreach (i = 0 ... numLights) {
int32 lightIndex = lightIndices[i];
float light_positionView_x = light_positionView_x_array[lightIndex];
float light_positionView_y = light_positionView_y_array[lightIndex];
float light_positionView_z = light_positionView_z_array[lightIndex];
float light_attenuationEnd = light_attenuationEnd_array[lightIndex];
float light_attenuationEndNeg = -light_attenuationEnd;
// Test lights again subtile z bounds
bool inFrustum[4];
inFrustum[0] = (light_positionView_z - subtileMinZ[0] >= light_attenuationEndNeg) &&
(subtileMaxZ[0] - light_positionView_z >= light_attenuationEndNeg);
inFrustum[1] = (light_positionView_z - subtileMinZ[1] >= light_attenuationEndNeg) &&
(subtileMaxZ[1] - light_positionView_z >= light_attenuationEndNeg);
inFrustum[2] = (light_positionView_z - subtileMinZ[2] >= light_attenuationEndNeg) &&
(subtileMaxZ[2] - light_positionView_z >= light_attenuationEndNeg);
inFrustum[3] = (light_positionView_z - subtileMinZ[3] >= light_attenuationEndNeg) &&
(subtileMaxZ[3] - light_positionView_z >= light_attenuationEndNeg);
float dx = light_positionView_z * frustumPlanes_z[0] +
light_positionView_x * frustumPlanes_xy[0];
float dy = light_positionView_z * frustumPlanes_z[1] +
light_positionView_y * frustumPlanes_xy[1];
cif (abs(dx) > light_attenuationEnd) {
bool positiveX = dx > 0.0f;
inFrustum[0] = inFrustum[0] && positiveX; // 00 subtile
inFrustum[1] = inFrustum[1] && !positiveX; // 10 subtile
inFrustum[2] = inFrustum[2] && positiveX; // 01 subtile
inFrustum[3] = inFrustum[3] && !positiveX; // 11 subtile
}
cif (abs(dy) > light_attenuationEnd) {
bool positiveY = dy > 0.0f;
inFrustum[0] = inFrustum[0] && positiveY; // 00 subtile
inFrustum[1] = inFrustum[1] && positiveY; // 10 subtile
inFrustum[2] = inFrustum[2] && !positiveY; // 01 subtile
inFrustum[3] = inFrustum[3] && !positiveY; // 11 subtile
}
// Pack and store intersecting lights
// TODO: Experiment with a loop here instead
cif (inFrustum[0])
subtileLightOffset[0] +=
packed_store_active(&subtileIndices[subtileLightOffset[0]],
lightIndex);
cif (inFrustum[1])
subtileLightOffset[1] +=
packed_store_active(&subtileIndices[subtileLightOffset[1]],
lightIndex);
cif (inFrustum[2])
subtileLightOffset[2] +=
packed_store_active(&subtileIndices[subtileLightOffset[2]],
lightIndex);
cif (inFrustum[3])
subtileLightOffset[3] +=
packed_store_active(&subtileIndices[subtileLightOffset[3]],
lightIndex);
}
subtileNumLights[0] = subtileLightOffset[0] - 0 * subtileIndicesPitch;
subtileNumLights[1] = subtileLightOffset[1] - 1 * subtileIndicesPitch;
subtileNumLights[2] = subtileLightOffset[2] - 2 * subtileIndicesPitch;
subtileNumLights[3] = subtileLightOffset[3] - 3 * subtileIndicesPitch;
}

View File

@@ -0,0 +1,557 @@
/*
Copyright (c) 2010-2011, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef __NVPTX__
#warning "emitting DEVICE code"
#define programCount warpSize()
#define programIndex laneIndex()
#define taskIndex blockIndex0()
#define taskCount blockCount0()
#else
#warning "emitting HOST code"
#endif
#include "deferred.h"
struct InputDataArrays
{
float *zBuffer;
unsigned int16 *normalEncoded_x; // half float
unsigned int16 *normalEncoded_y; // half float
unsigned int16 *specularAmount; // half float
unsigned int16 *specularPower; // half float
unsigned int8 *albedo_x; // unorm8
unsigned int8 *albedo_y; // unorm8
unsigned int8 *albedo_z; // unorm8
float *lightPositionView_x;
float *lightPositionView_y;
float *lightPositionView_z;
float *lightAttenuationBegin;
float *lightColor_x;
float *lightColor_y;
float *lightColor_z;
float *lightAttenuationEnd;
};
struct InputHeader
{
float cameraProj[4][4];
float cameraNear;
float cameraFar;
int32 framebufferWidth;
int32 framebufferHeight;
int32 numLights;
int32 inputDataChunkSize;
int32 inputDataArrayOffsets[idaNum];
};
///////////////////////////////////////////////////////////////////////////
// Common utility routines
static inline float
dot3(float x, float y, float z, float a, float b, float c) {
return (x*a + y*b + z*c);
}
static inline void
normalize3(float x, float y, float z, float &ox, float &oy, float &oz) {
float n = rsqrt(x*x + y*y + z*z);
ox = x * n;
oy = y * n;
oz = z * n;
}
static inline float
Unorm8ToFloat32(unsigned int8 u) {
return (float)u * (1.0f / 255.0f);
}
static inline unsigned int8
Float32ToUnorm8(float f) {
return (unsigned int8)(f * 255.0f);
}
static inline void
ComputeZBounds(
uniform int32 tileStartX, uniform int32 tileEndX,
uniform int32 tileStartY, uniform int32 tileEndY,
// G-buffer data
uniform float zBuffer[],
uniform int32 gBufferWidth,
// Camera data
uniform float cameraProj_33, uniform float cameraProj_43,
uniform float cameraNear, uniform float cameraFar,
// Output
uniform float &minZ,
uniform float &maxZ
)
{
// Find Z bounds
float laneMinZ = cameraFar;
float laneMaxZ = cameraNear;
for (uniform int32 y = tileStartY; y < tileEndY; ++y) {
// foreach (x = tileStartX ... tileEndX) {
for (uniform int xb = tileStartX; xb < tileEndX; xb += programCount)
{
const int x = xb + programIndex;
if (x >= tileEndX) continue;
// Unproject depth buffer Z value into view space
float z = zBuffer[y * gBufferWidth + x];
float viewSpaceZ = cameraProj_43 / (z - cameraProj_33);
// Work out Z bounds for our samples
// Avoid considering skybox/background or otherwise invalid pixels
if ((viewSpaceZ < cameraFar) && (viewSpaceZ >= cameraNear)) {
laneMinZ = min(laneMinZ, viewSpaceZ);
laneMaxZ = max(laneMaxZ, viewSpaceZ);
}
}
}
minZ = reduce_min(laneMinZ);
maxZ = reduce_max(laneMaxZ);
}
static inline uniform int32
IntersectLightsWithTileMinMax(
uniform int32 tileStartX, uniform int32 tileEndX,
uniform int32 tileStartY, uniform int32 tileEndY,
// Tile data
uniform float minZ,
uniform float maxZ,
// G-buffer data
uniform int32 gBufferWidth, uniform int32 gBufferHeight,
// Camera data
uniform float cameraProj_11, uniform float cameraProj_22,
// Light Data
uniform int32 numLights,
uniform float light_positionView_x_array[],
uniform float light_positionView_y_array[],
uniform float light_positionView_z_array[],
uniform float light_attenuationEnd_array[],
// Output
uniform int32 tileLightIndices[]
)
{
uniform float gBufferScale_x = 0.5f * (float)gBufferWidth;
uniform float gBufferScale_y = 0.5f * (float)gBufferHeight;
uniform float frustumPlanes_xy[4] = {
-(cameraProj_11 * gBufferScale_x),
(cameraProj_11 * gBufferScale_x),
(cameraProj_22 * gBufferScale_y),
-(cameraProj_22 * gBufferScale_y) };
uniform float frustumPlanes_z[4] = {
tileEndX - gBufferScale_x,
-tileStartX + gBufferScale_x,
tileEndY - gBufferScale_y,
-tileStartY + gBufferScale_y };
for (uniform int i = 0; i < 4; ++i) {
uniform float norm = rsqrt(frustumPlanes_xy[i] * frustumPlanes_xy[i] +
frustumPlanes_z[i] * frustumPlanes_z[i]);
frustumPlanes_xy[i] *= norm;
frustumPlanes_z[i] *= norm;
}
uniform int32 tileNumLights = 0;
// foreach (lightIndex = 0 ... numLights) {
for (uniform int lightIndexB = 0; lightIndexB < numLights; lightIndexB += programCount)
{
const int lightIndex = lightIndexB + programIndex;
if (lightIndex >= numLights) continue;
float light_positionView_z = light_positionView_z_array[lightIndex];
float light_attenuationEnd = light_attenuationEnd_array[lightIndex];
float light_attenuationEndNeg = -light_attenuationEnd;
float d = light_positionView_z - minZ;
bool inFrustum = (d >= light_attenuationEndNeg);
d = maxZ - light_positionView_z;
inFrustum = inFrustum && (d >= light_attenuationEndNeg);
// This seems better than cif(!inFrustum) ccontinue; here since we
// don't actually need to mask the rest of this function - this is
// just a greedy early-out. Could also structure all of this as
// nested if() statements, but this a bit easier to read
if (any(inFrustum)) {
float light_positionView_x = light_positionView_x_array[lightIndex];
float light_positionView_y = light_positionView_y_array[lightIndex];
d = light_positionView_z * frustumPlanes_z[0] +
light_positionView_x * frustumPlanes_xy[0];
inFrustum = inFrustum && (d >= light_attenuationEndNeg);
d = light_positionView_z * frustumPlanes_z[1] +
light_positionView_x * frustumPlanes_xy[1];
inFrustum = inFrustum && (d >= light_attenuationEndNeg);
d = light_positionView_z * frustumPlanes_z[2] +
light_positionView_y * frustumPlanes_xy[2];
inFrustum = inFrustum && (d >= light_attenuationEndNeg);
d = light_positionView_z * frustumPlanes_z[3] +
light_positionView_y * frustumPlanes_xy[3];
inFrustum = inFrustum && (d >= light_attenuationEndNeg);
// Pack and store intersecting lights
if (inFrustum) {
tileNumLights += packed_store_active(&tileLightIndices[tileNumLights],
lightIndex);
}
}
}
return tileNumLights;
}
static inline uniform int32
IntersectLightsWithTile(
uniform int32 tileStartX, uniform int32 tileEndX,
uniform int32 tileStartY, uniform int32 tileEndY,
uniform int32 gBufferWidth, uniform int32 gBufferHeight,
// G-buffer data
uniform float zBuffer[],
// Camera data
uniform float cameraProj_11, uniform float cameraProj_22,
uniform float cameraProj_33, uniform float cameraProj_43,
uniform float cameraNear, uniform float cameraFar,
// Light Data
uniform int32 numLights,
uniform float light_positionView_x_array[],
uniform float light_positionView_y_array[],
uniform float light_positionView_z_array[],
uniform float light_attenuationEnd_array[],
// Output
uniform int32 tileLightIndices[]
)
{
uniform float minZ, maxZ;
ComputeZBounds(tileStartX, tileEndX, tileStartY, tileEndY,
zBuffer, gBufferWidth, cameraProj_33, cameraProj_43, cameraNear, cameraFar,
minZ, maxZ);
uniform int32 tileNumLights = IntersectLightsWithTileMinMax(
tileStartX, tileEndX, tileStartY, tileEndY, minZ, maxZ,
gBufferWidth, gBufferHeight, cameraProj_11, cameraProj_22,
MAX_LIGHTS, light_positionView_x_array, light_positionView_y_array,
light_positionView_z_array, light_attenuationEnd_array,
tileLightIndices);
return tileNumLights;
}
static inline void
ShadeTile(
uniform int32 tileStartX, uniform int32 tileEndX,
uniform int32 tileStartY, uniform int32 tileEndY,
uniform int32 gBufferWidth, uniform int32 gBufferHeight,
const uniform InputDataArrays &inputData,
// Camera data
uniform float cameraProj_11, uniform float cameraProj_22,
uniform float cameraProj_33, uniform float cameraProj_43,
// Light list
uniform int32 tileLightIndices[],
uniform int32 tileNumLights,
// UI
uniform bool visualizeLightCount,
// Output
uniform unsigned int8 framebuffer_r[],
uniform unsigned int8 framebuffer_g[],
uniform unsigned int8 framebuffer_b[]
)
{
if (tileNumLights == 0 || visualizeLightCount) {
uniform unsigned int8 c = (unsigned int8)(min(tileNumLights << 2, 255));
for (uniform int32 y = tileStartY; y < tileEndY; ++y) {
// foreach (x = tileStartX ... tileEndX)
for (uniform int xb = tileStartX ; xb < tileEndX; xb += programCount)
{
const int x = xb + programIndex;
if (x >= tileEndX) continue;
int32 framebufferIndex = (y * gBufferWidth + x);
framebuffer_r[framebufferIndex] = c;
framebuffer_g[framebufferIndex] = c;
framebuffer_b[framebufferIndex] = c;
}
}
} else {
uniform float twoOverGBufferWidth = 2.0f / gBufferWidth;
uniform float twoOverGBufferHeight = 2.0f / gBufferHeight;
for (uniform int32 y = tileStartY; y < tileEndY; ++y) {
uniform float positionScreen_y = -(((0.5f + y) * twoOverGBufferHeight) - 1.f);
// foreach (x = tileStartX ... tileEndX) {
for (uniform int xb = tileStartX ; xb < tileEndX; xb += programCount)
{
const int x = xb + programIndex;
int32 gBufferOffset = y * gBufferWidth + x;
// Reconstruct position and (negative) view vector from G-buffer
float surface_positionView_x, surface_positionView_y, surface_positionView_z;
float Vneg_x, Vneg_y, Vneg_z;
float z = inputData.zBuffer[gBufferOffset];
// Compute screen/clip-space position
// NOTE: Mind DX11 viewport transform and pixel center!
float positionScreen_x = (0.5f + (float)(x)) *
twoOverGBufferWidth - 1.0f;
// Unproject depth buffer Z value into view space
surface_positionView_z = cameraProj_43 / (z - cameraProj_33);
surface_positionView_x = positionScreen_x * surface_positionView_z /
cameraProj_11;
surface_positionView_y = positionScreen_y * surface_positionView_z /
cameraProj_22;
// We actually end up with a vector pointing *at* the
// surface (i.e. the negative view vector)
normalize3(surface_positionView_x, surface_positionView_y,
surface_positionView_z, Vneg_x, Vneg_y, Vneg_z);
// Reconstruct normal from G-buffer
float surface_normal_x, surface_normal_y, surface_normal_z;
float normal_x = half_to_float(inputData.normalEncoded_x[gBufferOffset]);
float normal_y = half_to_float(inputData.normalEncoded_y[gBufferOffset]);
float f = (normal_x - normal_x * normal_x) + (normal_y - normal_y * normal_y);
float m = sqrt(4.0f * f - 1.0f);
surface_normal_x = m * (4.0f * normal_x - 2.0f);
surface_normal_y = m * (4.0f * normal_y - 2.0f);
surface_normal_z = 3.0f - 8.0f * f;
// Load other G-buffer parameters
float surface_specularAmount =
half_to_float(inputData.specularAmount[gBufferOffset]);
float surface_specularPower =
half_to_float(inputData.specularPower[gBufferOffset]);
float surface_albedo_x = Unorm8ToFloat32(inputData.albedo_x[gBufferOffset]);
float surface_albedo_y = Unorm8ToFloat32(inputData.albedo_y[gBufferOffset]);
float surface_albedo_z = Unorm8ToFloat32(inputData.albedo_z[gBufferOffset]);
float lit_x = 0.0f;
float lit_y = 0.0f;
float lit_z = 0.0f;
for (uniform int32 tileLightIndex = 0; tileLightIndex < tileNumLights;
++tileLightIndex) {
uniform int32 lightIndex = tileLightIndices[tileLightIndex];
// Gather light data relevant to initial culling
uniform float light_positionView_x =
inputData.lightPositionView_x[lightIndex];
uniform float light_positionView_y =
inputData.lightPositionView_y[lightIndex];
uniform float light_positionView_z =
inputData.lightPositionView_z[lightIndex];
uniform float light_attenuationEnd =
inputData.lightAttenuationEnd[lightIndex];
// Compute light vector
float L_x = light_positionView_x - surface_positionView_x;
float L_y = light_positionView_y - surface_positionView_y;
float L_z = light_positionView_z - surface_positionView_z;
float distanceToLight2 = dot3(L_x, L_y, L_z, L_x, L_y, L_z);
// Clip at end of attenuation
float light_attenutaionEnd2 = light_attenuationEnd * light_attenuationEnd;
if (distanceToLight2 < light_attenutaionEnd2) {
float distanceToLight = sqrt(distanceToLight2);
// HLSL "rcp" is allowed to be fairly inaccurate
float distanceToLightRcp = rcp(distanceToLight);
L_x *= distanceToLightRcp;
L_y *= distanceToLightRcp;
L_z *= distanceToLightRcp;
// Start computing brdf
float NdotL = dot3(surface_normal_x, surface_normal_y,
surface_normal_z, L_x, L_y, L_z);
// Clip back facing
if (NdotL > 0.0f) {
uniform float light_attenuationBegin =
inputData.lightAttenuationBegin[lightIndex];
// Light distance attenuation (linstep)
float lightRange = (light_attenuationEnd - light_attenuationBegin);
float falloffPosition = (light_attenuationEnd - distanceToLight);
float attenuation = min(falloffPosition / lightRange, 1.0f);
float H_x = (L_x - Vneg_x);
float H_y = (L_y - Vneg_y);
float H_z = (L_z - Vneg_z);
normalize3(H_x, H_y, H_z, H_x, H_y, H_z);
float NdotH = dot3(surface_normal_x, surface_normal_y,
surface_normal_z, H_x, H_y, H_z);
NdotH = max(NdotH, 0.0f);
float specular = pow(NdotH, surface_specularPower);
float specularNorm = (surface_specularPower + 2.0f) *
(1.0f / 8.0f);
float specularContrib = surface_specularAmount *
specularNorm * specular;
float k = attenuation * NdotL * (1.0f + specularContrib);
uniform float light_color_x = inputData.lightColor_x[lightIndex];
uniform float light_color_y = inputData.lightColor_y[lightIndex];
uniform float light_color_z = inputData.lightColor_z[lightIndex];
float lightContrib_x = surface_albedo_x * light_color_x;
float lightContrib_y = surface_albedo_y * light_color_y;
float lightContrib_z = surface_albedo_z * light_color_z;
lit_x += lightContrib_x * k;
lit_y += lightContrib_y * k;
lit_z += lightContrib_z * k;
}
}
}
// Gamma correct
// These pows are pretty slow right now, but we can do
// something faster if really necessary to squeeze every
// last bit of performance out of it
float gamma = 1.0 / 2.2f;
lit_x = pow(clamp(lit_x, 0.0f, 1.0f), gamma);
lit_y = pow(clamp(lit_y, 0.0f, 1.0f), gamma);
lit_z = pow(clamp(lit_z, 0.0f, 1.0f), gamma);
framebuffer_r[gBufferOffset] = Float32ToUnorm8(lit_x);
framebuffer_g[gBufferOffset] = Float32ToUnorm8(lit_y);
framebuffer_b[gBufferOffset] = Float32ToUnorm8(lit_z);
}
}
}
}
///////////////////////////////////////////////////////////////////////////
// Static decomposition
task void
RenderTile(uniform int num_groups_x, uniform int num_groups_y,
const uniform InputHeader inputHeaderPtr[],
const uniform InputDataArrays inputDataPtr[],
uniform int visualizeLightCount,
// Output
uniform unsigned int8 framebuffer_r[],
uniform unsigned int8 framebuffer_g[],
uniform unsigned int8 framebuffer_b[]) {
if (taskIndex >= taskCount) return;
const uniform InputHeader inputHeader = *inputHeaderPtr;
const uniform InputDataArrays inputData = *inputDataPtr;
uniform int32 group_y = taskIndex / num_groups_x;
uniform int32 group_x = taskIndex % num_groups_x;
uniform int32 tile_start_x = group_x * MIN_TILE_WIDTH;
uniform int32 tile_start_y = group_y * MIN_TILE_HEIGHT;
uniform int32 tile_end_x = tile_start_x + MIN_TILE_WIDTH;
uniform int32 tile_end_y = tile_start_y + MIN_TILE_HEIGHT;
uniform int framebufferWidth = inputHeader.framebufferWidth;
uniform int framebufferHeight = inputHeader.framebufferHeight;
uniform float cameraProj_00 = inputHeader.cameraProj[0][0];
uniform float cameraProj_11 = inputHeader.cameraProj[1][1];
uniform float cameraProj_22 = inputHeader.cameraProj[2][2];
uniform float cameraProj_32 = inputHeader.cameraProj[3][2];
// Light intersection: figure out which lights illuminate this tile.
uniform int tileLightIndices[MAX_LIGHTS]; // Light list for the tile
uniform int numTileLights =
IntersectLightsWithTile(tile_start_x, tile_end_x,
tile_start_y, tile_end_y,
framebufferWidth, framebufferHeight,
inputData.zBuffer,
cameraProj_00, cameraProj_11,
cameraProj_22, cameraProj_32,
inputHeader.cameraNear, inputHeader.cameraFar,
MAX_LIGHTS,
inputData.lightPositionView_x,
inputData.lightPositionView_y,
inputData.lightPositionView_z,
inputData.lightAttenuationEnd,
tileLightIndices);
// And now shade the tile, using the lights in tileLightIndices
ShadeTile(tile_start_x, tile_end_x, tile_start_y, tile_end_y,
framebufferWidth, framebufferHeight, inputData,
cameraProj_00, cameraProj_11, cameraProj_22, cameraProj_32,
tileLightIndices, numTileLights, visualizeLightCount,
framebuffer_r, framebuffer_g, framebuffer_b);
}
export void
RenderStatic(uniform InputHeader inputHeaderPtr[],
uniform InputDataArrays inputDataPtr[],
uniform InputHeader &inputHeader,
uniform int visualizeLightCount,
// Output
uniform unsigned int8 framebuffer_r[],
uniform unsigned int8 framebuffer_g[],
uniform unsigned int8 framebuffer_b[]) {
uniform int num_groups_x = (inputHeader.framebufferWidth +
MIN_TILE_WIDTH - 1) / MIN_TILE_WIDTH;
uniform int num_groups_y = (inputHeader.framebufferHeight +
MIN_TILE_HEIGHT - 1) / MIN_TILE_HEIGHT;
uniform int num_groups = num_groups_x * num_groups_y;
// Launch a task to render each tile, each of which is MIN_TILE_WIDTH
// by MIN_TILE_HEIGHT pixels.
launch[num_groups] RenderTile(num_groups_x, num_groups_y,
inputHeaderPtr, inputDataPtr, visualizeLightCount,
framebuffer_r, framebuffer_g, framebuffer_b);
}

View File

@@ -59,6 +59,19 @@
#include "kernels_ispc.h"
#include "../timing.h"
#include <sys/time.h>
static inline double rtc(void)
{
struct timeval Tvalue;
double etime;
struct timezone dummy;
gettimeofday(&Tvalue,&dummy);
etime = (double) Tvalue.tv_sec +
1.e-6*((double) Tvalue.tv_usec);
return etime;
}
///////////////////////////////////////////////////////////////////////////
int main(int argc, char** argv) {
@@ -76,27 +89,36 @@ int main(int argc, char** argv) {
Framebuffer framebuffer(input->header.framebufferWidth,
input->header.framebufferHeight);
#if 0
InitDynamicC(input);
#ifdef __cilk
InitDynamicCilk(input);
#endif // __cilk
#endif
const int buffsize = input->header.framebufferWidth*input->header.framebufferHeight;
for (int i = 0; i < buffsize; i++)
framebuffer.r[i] = framebuffer.g[i] = framebuffer.b[i] = 0;
int nframes = 5;
double ispcCycles = 1e30;
for (int i = 0; i < 5; ++i) {
framebuffer.clear();
reset_and_start_timer();
const double t0 = rtc();
for (int j = 0; j < nframes; ++j)
ispc::RenderStatic(input->header, input->arrays,
VISUALIZE_LIGHT_COUNT,
framebuffer.r, framebuffer.g, framebuffer.b);
double mcycles = get_elapsed_mcycles() / nframes;
double mcycles = (rtc() - t0) / nframes;
ispcCycles = std::min(ispcCycles, mcycles);
}
printf("[ispc static + tasks]:\t\t[%.3f] million cycles to render "
"%d x %d image\n", ispcCycles,
input->header.framebufferWidth, input->header.framebufferHeight);
WriteFrame("deferred-ispc-static.ppm", input, framebuffer);
return 0;
#if 0
#ifdef __cilk
double dynamicCilkCycles = 1e30;
@@ -132,6 +154,7 @@ int main(int argc, char** argv) {
#else
printf("\t\t\t\t(%.2fx speedup from ISPC + tasks)\n", serialCycles/ispcCycles);
#endif // __cilk
#endif
DeleteInputData(input);

316
examples_cuda/deferred/main_cu.cpp Normal file → Executable file
View File

@@ -59,6 +59,222 @@
#include "kernels_ispc.h"
#include "../timing.h"
#include <sys/time.h>
static inline double rtc(void)
{
struct timeval Tvalue;
double etime;
struct timezone dummy;
gettimeofday(&Tvalue,&dummy);
etime = (double) Tvalue.tv_sec +
1.e-6*((double) Tvalue.tv_usec);
return etime;
}
/******************************/ #include <cassert>
#include <iostream>
#include <cuda.h>
#include "drvapi_error_string.h"
#define checkCudaErrors(err) __checkCudaErrors (err, __FILE__, __LINE__)
// These are the inline versions for all of the SDK helper functions
void __checkCudaErrors(CUresult err, const char *file, const int line) {
if(CUDA_SUCCESS != err) {
std::cerr << "checkCudeErrors() Driver API error = " << err << "\""
<< getCudaDrvErrorString(err) << "\" from file <" << file
<< ", line " << line << "\n";
exit(-1);
}
}
/**********************/
/* Basic CUDriver API */
CUcontext context;
void createContext(const int deviceId = 0)
{
CUdevice device;
int devCount;
checkCudaErrors(cuInit(0));
checkCudaErrors(cuDeviceGetCount(&devCount));
assert(devCount > 0);
checkCudaErrors(cuDeviceGet(&device, deviceId < devCount ? deviceId : 0));
char name[128];
checkCudaErrors(cuDeviceGetName(name, 128, device));
std::cout << "Using CUDA Device [0]: " << name << "\n";
int devMajor, devMinor;
checkCudaErrors(cuDeviceComputeCapability(&devMajor, &devMinor, device));
std::cout << "Device Compute Capability: "
<< devMajor << "." << devMinor << "\n";
if (devMajor < 2) {
std::cerr << "ERROR: Device 0 is not SM 2.0 or greater\n";
exit(1);
}
// Create driver context
checkCudaErrors(cuCtxCreate(&context, 0, device));
}
void destroyContext()
{
checkCudaErrors(cuCtxDestroy(context));
}
CUmodule loadModule(const char * module)
{
CUmodule cudaModule;
// in this branch we use compilation with parameters
const unsigned int jitNumOptions = 1;
CUjit_option *jitOptions = new CUjit_option[jitNumOptions];
void **jitOptVals = new void*[jitNumOptions];
// set up pointer to set the Maximum # of registers for a particular kernel
jitOptions[0] = CU_JIT_MAX_REGISTERS;
int jitRegCount = 64;
jitOptVals[0] = (void *)(size_t)jitRegCount;
#if 0
// set up size of compilation log buffer
jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
int jitLogBufferSize = 1024;
jitOptVals[0] = (void *)(size_t)jitLogBufferSize;
// set up pointer to the compilation log buffer
jitOptions[1] = CU_JIT_INFO_LOG_BUFFER;
char *jitLogBuffer = new char[jitLogBufferSize];
jitOptVals[1] = jitLogBuffer;
// set up pointer to set the Maximum # of registers for a particular kernel
jitOptions[2] = CU_JIT_MAX_REGISTERS;
int jitRegCount = 32;
jitOptVals[2] = (void *)(size_t)jitRegCount;
#endif
checkCudaErrors(cuModuleLoadDataEx(&cudaModule, module,jitNumOptions, jitOptions, (void **)jitOptVals));
return cudaModule;
}
void unloadModule(CUmodule &cudaModule)
{
checkCudaErrors(cuModuleUnload(cudaModule));
}
CUfunction getFunction(CUmodule &cudaModule, const char * function)
{
CUfunction cudaFunction;
checkCudaErrors(cuModuleGetFunction(&cudaFunction, cudaModule, function));
return cudaFunction;
}
CUdeviceptr deviceMalloc(const size_t size)
{
CUdeviceptr d_buf;
checkCudaErrors(cuMemAlloc(&d_buf, size));
return d_buf;
}
void deviceFree(CUdeviceptr d_buf)
{
checkCudaErrors(cuMemFree(d_buf));
}
void memcpyD2H(void * h_buf, CUdeviceptr d_buf, const size_t size)
{
checkCudaErrors(cuMemcpyDtoH(h_buf, d_buf, size));
}
void memcpyH2D(CUdeviceptr d_buf, void * h_buf, const size_t size)
{
checkCudaErrors(cuMemcpyHtoD(d_buf, h_buf, size));
}
#define deviceLaunch(func,nbx,nby,nbz,params) \
checkCudaErrors(cuFuncSetCacheConfig((func), CU_FUNC_CACHE_PREFER_EQUAL)); \
checkCudaErrors( \
cuLaunchKernel( \
(func), \
((nbx-1)/(128/32)+1), (nby), (nbz), \
128, 1, 1, \
0, NULL, (params), NULL \
));
typedef CUdeviceptr devicePtr;
/**************/
#include <vector>
std::vector<char> readBinary(const char * filename)
{
std::vector<char> buffer;
FILE *fp = fopen(filename, "rb");
if (!fp )
{
fprintf(stderr, "file %s not found\n", filename);
assert(0);
}
#if 0
char c;
while ((c = fgetc(fp)) != EOF)
buffer.push_back(c);
#else
fseek(fp, 0, SEEK_END);
const unsigned long long size = ftell(fp); /*calc the size needed*/
fseek(fp, 0, SEEK_SET);
buffer.resize(size);
if (fp == NULL){ /*ERROR detection if file == empty*/
fprintf(stderr, "Error: There was an Error reading the file %s \n",filename);
exit(1);
}
else if (fread(&buffer[0], sizeof(char), size, fp) != size){ /* if count of read bytes != calculated size of .bin file -> ERROR*/
fprintf(stderr, "Error: There was an Error reading the file %s \n", filename);
exit(1);
}
#endif
fprintf(stderr, " read buffer of size= %d bytes \n", (int)buffer.size());
return buffer;
}
extern "C"
{
void *CUDAAlloc(void **handlePtr, int64_t size, int32_t alignment)
{
return NULL;
}
void CUDALaunch(
void **handlePtr,
const char * module_name,
const char * module_1,
const char * func_name,
void **func_args,
int countx, int county, int countz)
{
assert(module_name != NULL);
assert(module_1 != NULL);
assert(func_name != NULL);
assert(func_args != NULL);
#if 0
const char * module = module_1;
#else
const std::vector<char> module_str = readBinary("kernel.cubin");
const char * module = &module_str[0];
#endif
CUmodule cudaModule = loadModule(module);
CUfunction cudaFunction = getFunction(cudaModule, func_name);
deviceLaunch(cudaFunction, countx, county, countz, func_args);
unloadModule(cudaModule);
}
void CUDASync(void *handle)
{
checkCudaErrors(cuStreamSynchronize(0));
}
void ISPCSync(void *handle)
{
checkCudaErrors(cuStreamSynchronize(0));
}
void CUDAFree(void *handle)
{
}
}
/******************************/
///////////////////////////////////////////////////////////////////////////
int main(int argc, char** argv) {
@@ -76,27 +292,112 @@ int main(int argc, char** argv) {
Framebuffer framebuffer(input->header.framebufferWidth,
input->header.framebufferHeight);
InitDynamicC(input);
// InitDynamicC(input);
#if 0
#ifdef __cilk
InitDynamicCilk(input);
#endif // __cilk
#endif
/*******************/
createContext();
/*******************/
devicePtr d_header = deviceMalloc(sizeof(ispc::InputHeader));
devicePtr d_arrays = deviceMalloc(sizeof(ispc::InputDataArrays));
const int buffsize = input->header.framebufferWidth*input->header.framebufferHeight;
devicePtr d_r = deviceMalloc(buffsize);
devicePtr d_g = deviceMalloc(buffsize);
devicePtr d_b = deviceMalloc(buffsize);
for (int i = 0; i < buffsize; i++)
framebuffer.r[i] = framebuffer.g[i] = framebuffer.b[i] = 0;
ispc::InputDataArrays dh_arrays;
{
devicePtr d_chunk = deviceMalloc(input->header.inputDataChunkSize);
memcpyH2D(d_chunk, input->chunk, input->header.inputDataChunkSize);
dh_arrays.zBuffer = (float*)(d_chunk + input->header.inputDataArrayOffsets[idaZBuffer]);
dh_arrays.normalEncoded_x =
(uint16_t *)(d_chunk+input->header.inputDataArrayOffsets[idaNormalEncoded_x]);
fprintf(stderr, "%p %p \n",
dh_arrays.zBuffer, dh_arrays.normalEncoded_x);
fprintf(stderr, " diff= %d %d \n",
input->header.inputDataArrayOffsets[idaZBuffer],
input->header.inputDataArrayOffsets[idaNormalEncoded_x]);
dh_arrays.normalEncoded_y =
(uint16_t *)(d_chunk+input->header.inputDataArrayOffsets[idaNormalEncoded_y]);
dh_arrays.specularAmount =
(uint16_t *)(d_chunk+input->header.inputDataArrayOffsets[idaSpecularAmount]);
dh_arrays.specularPower =
(uint16_t *)(d_chunk+input->header.inputDataArrayOffsets[idaSpecularPower]);
dh_arrays.albedo_x =
(uint8_t *)(d_chunk+input->header.inputDataArrayOffsets[idaAlbedo_x]);
dh_arrays.albedo_y =
(uint8_t *)(d_chunk+input->header.inputDataArrayOffsets[idaAlbedo_y]);
dh_arrays.albedo_z =
(uint8_t *)(d_chunk+input->header.inputDataArrayOffsets[idaAlbedo_z]);
dh_arrays.lightPositionView_x =
(float *)(d_chunk+input->header.inputDataArrayOffsets[idaLightPositionView_x]);
dh_arrays.lightPositionView_y =
(float *)(d_chunk+input->header.inputDataArrayOffsets[idaLightPositionView_y]);
dh_arrays.lightPositionView_z =
(float *)(d_chunk+input->header.inputDataArrayOffsets[idaLightPositionView_z]);
dh_arrays.lightAttenuationBegin =
(float *)(d_chunk+input->header.inputDataArrayOffsets[idaLightAttenuationBegin]);
dh_arrays.lightColor_x =
(float *)(d_chunk+input->header.inputDataArrayOffsets[idaLightColor_x]);
dh_arrays.lightColor_y =
(float *)(d_chunk+input->header.inputDataArrayOffsets[idaLightColor_y]);
dh_arrays.lightColor_z =
(float *)(d_chunk+input->header.inputDataArrayOffsets[idaLightColor_z]);
dh_arrays.lightAttenuationEnd =
(float *)(d_chunk+input->header.inputDataArrayOffsets[idaLightAttenuationEnd]);
}
memcpyH2D(d_header, &input->header, sizeof(ispc::InputHeader));
memcpyH2D(d_arrays, &dh_arrays, sizeof(ispc::InputDataArrays));
memcpyH2D(d_r, framebuffer.r, buffsize);
memcpyH2D(d_g, framebuffer.g, buffsize);
memcpyH2D(d_b, framebuffer.b, buffsize);
int nframes = 5;
double ispcCycles = 1e30;
for (int i = 0; i < 5; ++i) {
framebuffer.clear();
reset_and_start_timer();
const double t0 = rtc();
for (int j = 0; j < nframes; ++j)
ispc::RenderStatic(input->header, input->arrays,
VISUALIZE_LIGHT_COUNT,
framebuffer.r, framebuffer.g, framebuffer.b);
double mcycles = get_elapsed_mcycles() / nframes;
ispc::RenderStatic(
(ispc::InputHeader*)d_header,
(ispc::InputDataArrays*)d_arrays,
input->header,
VISUALIZE_LIGHT_COUNT,
(uint8_t*)d_r,
(uint8_t*)d_g,
(uint8_t*)d_b);
double mcycles = (rtc() - t0) / nframes;
ispcCycles = std::min(ispcCycles, mcycles);
}
memcpyD2H(framebuffer.r, d_r, buffsize);
memcpyD2H(framebuffer.g, d_g, buffsize);
memcpyD2H(framebuffer.b, d_b, buffsize);
printf("[ispc static + tasks]:\t\t[%.3f] million cycles to render "
"%d x %d image\n", ispcCycles,
input->header.framebufferWidth, input->header.framebufferHeight);
WriteFrame("deferred-ispc-static.ppm", input, framebuffer);
WriteFrame("deferred-cuda.ppm", input, framebuffer);
/*******************/
destroyContext();
/*******************/
return 0;
#if 0
#ifdef __cilk
double dynamicCilkCycles = 1e30;
@@ -132,6 +433,7 @@ int main(int argc, char** argv) {
#else
printf("\t\t\t\t(%.2fx speedup from ISPC + tasks)\n", serialCycles/ispcCycles);
#endif // __cilk
#endif
DeleteInputData(input);