Cleanups to deferred shading workload

This commit is contained in:
Matt Pharr
2011-09-30 20:35:42 -07:00
parent 9de34eb22c
commit 65c50b60fc
5 changed files with 87 additions and 49 deletions

View File

@@ -64,7 +64,7 @@
///////////////////////////////////////////////////////////////////////////
static void *
lAlignedMalloc(int64_t size, int32_t alignment) {
lAlignedMalloc(size_t size, int32_t alignment) {
#ifdef ISPC_IS_WINDOWS
return _aligned_malloc(size, alignment);
#endif
@@ -118,6 +118,7 @@ Framebuffer::clear() {
memset(b, 0, nPixels);
}
InputData *
CreateInputDataFromFile(const char *path) {
FILE *in = fopen(path, "rb");
@@ -177,8 +178,7 @@ CreateInputDataFromFile(const char *path) {
}
void DeleteInputData(InputData *input)
{
void DeleteInputData(InputData *input) {
lAlignedFree(input->chunk);
}

View File

@@ -60,7 +60,7 @@
#define DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE 1
static void *
lAlignedMalloc(int64_t size, int32_t alignment) {
lAlignedMalloc(size_t size, int32_t alignment) {
#ifdef ISPC_IS_WINDOWS
return _aligned_malloc(size, alignment);
#endif
@@ -141,12 +141,10 @@ ComputeZBoundsRow(int tileY, int tileWidth, int tileHeight,
{
for (int tileX = 0; tileX < numTilesX; ++tileX) {
float minZ, maxZ;
ComputeZBounds(
tileX * tileWidth, tileX * tileWidth + tileWidth,
tileY * tileHeight, tileY * tileHeight + tileHeight,
zBuffer, gBufferWidth,
cameraProj_33, cameraProj_43, cameraNear, cameraFar,
&minZ, &maxZ);
ComputeZBounds(tileX * tileWidth, tileX * tileWidth + tileWidth,
tileY * tileHeight, tileY * tileHeight + tileHeight,
zBuffer, gBufferWidth, cameraProj_33, cameraProj_43,
cameraNear, cameraFar, &minZ, &maxZ);
minZArray[tileX] = minZ;
maxZArray[tileX] = maxZ;
}
@@ -282,8 +280,8 @@ void InitDynamicC(InputData *input) {
}
// numLights need not be a multiple of programCount here, but the input and output arrays
// should be able to handle programCount-sized load/stores.
/* We're going to split a tile into 4 sub-tiles. This function
reclassifies the tile's lights with respect to the sub-tiles. */
static void
SplitTileMinMax(
int tileMidX, int tileMidY,
@@ -339,7 +337,7 @@ SplitTileMinMax(
float light_attenuationEnd = light_attenuationEnd_array[lightIndex];
float light_attenuationEndNeg = -light_attenuationEnd;
// Test lights again subtile z bounds
// Test lights again against subtile z bounds
bool inFrustum[4];
inFrustum[0] = (light_positionView_z - subtileMinZ[0] >= light_attenuationEndNeg) &&
(subtileMaxZ[0] - light_positionView_z >= light_attenuationEndNeg);
@@ -414,7 +412,8 @@ Float32ToUnorm8(float f) {
}
static inline float half_to_float_fast(uint16_t h) {
static inline float
half_to_float_fast(uint16_t h) {
uint32_t hs = h & (int32_t)0x8000u; // Pick off sign bit
uint32_t he = h & (int32_t)0x7C00u; // Pick off exponent bits
uint32_t hm = h & (int32_t)0x03FFu; // Pick off mantissa bits

View File

@@ -60,7 +60,7 @@
#define DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE 1
static void *
lAlignedMalloc(int64_t size, int32_t alignment) {
lAlignedMalloc(size_t size, int32_t alignment) {
#ifdef ISPC_IS_WINDOWS
return _aligned_malloc(size, alignment);
#endif
@@ -395,4 +395,4 @@ DispatchDynamicCilk(InputData *input, Framebuffer *framebuffer)
}
}
#endif // __cilkplusplus
#endif // __cilk

View File

@@ -479,7 +479,7 @@ ShadeTile(
// Static decomposition
task void
RenderTile(uniform int g, uniform int num_groups_x, uniform int num_groups_y,
RenderTile(uniform int num_groups_x, uniform int num_groups_y,
reference uniform InputHeader inputHeader,
reference uniform InputDataArrays inputData,
uniform int visualizeLightCount,
@@ -487,16 +487,13 @@ RenderTile(uniform int g, uniform int num_groups_x, uniform int num_groups_y,
reference uniform unsigned int8 framebuffer_r[],
reference uniform unsigned int8 framebuffer_g[],
reference uniform unsigned int8 framebuffer_b[]) {
uniform int32 group_y = g / num_groups_x;
uniform int32 group_x = g % num_groups_x;
uniform int32 group_y = taskIndex / num_groups_x;
uniform int32 group_x = taskIndex % num_groups_x;
uniform int32 tile_start_x = group_x * MIN_TILE_WIDTH;
uniform int32 tile_start_y = group_y * MIN_TILE_HEIGHT;
uniform int32 tile_end_x = tile_start_x + MIN_TILE_WIDTH;
uniform int32 tile_end_y = tile_start_y + MIN_TILE_HEIGHT;
uniform int sTileNumLights = 0;
uniform int sTileLightIndices[MAX_LIGHTS]; // Light list for the tile
uniform int framebufferWidth = inputHeader.framebufferWidth;
uniform int framebufferHeight = inputHeader.framebufferHeight;
uniform float cameraProj_00 = inputHeader.cameraProj[0][0];
@@ -504,8 +501,9 @@ RenderTile(uniform int g, uniform int num_groups_x, uniform int num_groups_y,
uniform float cameraProj_22 = inputHeader.cameraProj[2][2];
uniform float cameraProj_32 = inputHeader.cameraProj[3][2];
// Light intersection
sTileNumLights =
// Light intersection: figure out which lights illuminate this tile.
uniform int tileLightIndices[MAX_LIGHTS]; // Light list for the tile
uniform int numTileLights =
IntersectLightsWithTile(tile_start_x, tile_end_x,
tile_start_y, tile_end_y,
framebufferWidth, framebufferHeight,
@@ -518,12 +516,13 @@ RenderTile(uniform int g, uniform int num_groups_x, uniform int num_groups_y,
inputData.lightPositionView_y,
inputData.lightPositionView_z,
inputData.lightAttenuationEnd,
sTileLightIndices);
tileLightIndices);
// And now shade the tile, using the lights in tileLightIndices
ShadeTile(tile_start_x, tile_end_x, tile_start_y, tile_end_y,
framebufferWidth, framebufferHeight, inputData,
cameraProj_00, cameraProj_11, cameraProj_22, cameraProj_32,
sTileLightIndices, sTileNumLights, visualizeLightCount,
tileLightIndices, numTileLights, visualizeLightCount,
framebuffer_r, framebuffer_g, framebuffer_b);
}
@@ -542,17 +541,19 @@ RenderStatic(reference uniform InputHeader inputHeader,
MIN_TILE_HEIGHT - 1) / MIN_TILE_HEIGHT;
uniform int num_groups = num_groups_x * num_groups_y;
for (uniform int g = 0; g < num_groups; ++g)
launch < RenderTile(g, num_groups_x, num_groups_y,
inputHeader, inputData, visualizeLightCount,
framebuffer_r, framebuffer_g, framebuffer_b) >;
// Launch a task to render each tile, each of which is MIN_TILE_WIDTH
// by MIN_TILE_HEIGHT pixels.
launch[num_groups] < RenderTile(num_groups_x, num_groups_y,
inputHeader, inputData, visualizeLightCount,
framebuffer_r, framebuffer_g, framebuffer_b) >;
}
///////////////////////////////////////////////////////////////////////////
// Routines for dynamic decomposition path
// tile width must be a multiple of programCount (SIMD size)
// This computes the z min/max range for a whole row worth of tiles.
// The tile width must be a multiple of programCount (SIMD size)
export void
ComputeZBoundsRow(
uniform int32 tileY,
@@ -583,6 +584,7 @@ ComputeZBoundsRow(
}
// Reclassifies the lights with respect to four sub-tiles when we refine a tile.
// numLights need not be a multiple of programCount here, but the input and output arrays
// should be able to handle programCount-sized load/stores.
export void

View File

@@ -58,10 +58,45 @@
#include "deferred.h"
#include "kernels_ispc.h"
#include "../timing.h"
#include "../cpuid.h"
///////////////////////////////////////////////////////////////////////////
// Make sure that the vector ISA used during compilation is supported by
// the processor. The ISPC_TARGET_* macro is set in the ispc-generated
// header file that we include above.
static void
ensureTargetISAIsSupported() {
#if defined(ISPC_TARGET_SSE2)
bool isaSupported = CPUSupportsSSE2();
const char *target = "SSE2";
#elif defined(ISPC_TARGET_SSE4)
bool isaSupported = CPUSupportsSSE4();
const char *target = "SSE4";
#elif defined(ISPC_TARGET_AVX)
bool isaSupported = CPUSupportsAVX();
const char *target = "AVX";
#else
#error "Unknown ISPC_TARGET_* value"
#endif
if (!isaSupported) {
fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
"set, which isn't\n*** supported by this computer's CPU!\n", target);
fprintf(stderr, "***\n*** Please modify the "
#ifdef _MSC_VER
"MSVC project file "
#else
"Makefile "
#endif
"to select another target (e.g. sse2)\n***\n");
exit(1);
}
}
int main(int argc, char** argv) {
ensureTargetISAIsSupported();
if (argc != 2) {
printf("usage: deferred_shading <input_file (e.g. data/pp1280x720.bin)>\n");
return 1;
@@ -77,9 +112,9 @@ int main(int argc, char** argv) {
input->header.framebufferHeight);
InitDynamicC(input);
#ifdef __cilkplusplus
#ifdef __cilk
InitDynamicCilk(input);
#endif // __cilkplusplus
#endif // __cilk
int nframes = 5;
double ispcCycles = 1e30;
@@ -98,20 +133,7 @@ int main(int argc, char** argv) {
input->header.framebufferWidth, input->header.framebufferHeight);
WriteFrame("deferred-ispc-static.ppm", input, framebuffer);
double serialCycles = 1e30;
for (int i = 0; i < 5; ++i) {
framebuffer.clear();
reset_and_start_timer();
for (int j = 0; j < nframes; ++j)
DispatchDynamicC(input, &framebuffer);
double mcycles = get_elapsed_mcycles() / nframes;
serialCycles = std::min(serialCycles, mcycles);
}
printf("[C++ serial dynamic, 1 core]:\t[%.3f] million cycles\n",
serialCycles);
WriteFrame("deferred-serial-dynamic.ppm", input, framebuffer);
#ifdef __cilkplusplus
#ifdef __cilk
double dynamicCilkCycles = 1e30;
for (int i = 0; i < 5; ++i) {
framebuffer.clear();
@@ -121,15 +143,30 @@ int main(int argc, char** argv) {
double mcycles = get_elapsed_mcycles() / nframes;
dynamicCilkCycles = std::min(dynamicCilkCycles, mcycles);
}
printf("[ispc + Cilk dynamic]:\t\t[%.3f] million cycles\n",
printf("[ispc + Cilk dynamic]:\t\t[%.3f] million cycles to render image\n",
dynamicCilkCycles);
WriteFrame("deferred-ispc-dynamic.ppm", input, framebuffer);
#endif // __cilk
double serialCycles = 1e30;
for (int i = 0; i < 5; ++i) {
framebuffer.clear();
reset_and_start_timer();
for (int j = 0; j < nframes; ++j)
DispatchDynamicC(input, &framebuffer);
double mcycles = get_elapsed_mcycles() / nframes;
serialCycles = std::min(serialCycles, mcycles);
}
printf("[C++ serial dynamic, 1 core]:\t[%.3f] million cycles to render image\n",
serialCycles);
WriteFrame("deferred-serial-dynamic.ppm", input, framebuffer);
#ifdef __cilk
printf("\t\t\t\t(%.2fx speedup from static ISPC, %.2fx from Cilk+ISPC)\n",
serialCycles/ispcCycles, serialCycles/dynamicCilkCycles);
#else
printf("\t\t\t\t(%.2fx speedup from ISPC)\n", serialCycles/ispcCycles);
#endif // __cilkplusplus
#endif // __cilk
DeleteInputData(input);