Cleanups to deferred shading workload
This commit is contained in:
@@ -64,7 +64,7 @@
|
|||||||
///////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
static void *
|
static void *
|
||||||
lAlignedMalloc(int64_t size, int32_t alignment) {
|
lAlignedMalloc(size_t size, int32_t alignment) {
|
||||||
#ifdef ISPC_IS_WINDOWS
|
#ifdef ISPC_IS_WINDOWS
|
||||||
return _aligned_malloc(size, alignment);
|
return _aligned_malloc(size, alignment);
|
||||||
#endif
|
#endif
|
||||||
@@ -118,6 +118,7 @@ Framebuffer::clear() {
|
|||||||
memset(b, 0, nPixels);
|
memset(b, 0, nPixels);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
InputData *
|
InputData *
|
||||||
CreateInputDataFromFile(const char *path) {
|
CreateInputDataFromFile(const char *path) {
|
||||||
FILE *in = fopen(path, "rb");
|
FILE *in = fopen(path, "rb");
|
||||||
@@ -177,8 +178,7 @@ CreateInputDataFromFile(const char *path) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void DeleteInputData(InputData *input)
|
void DeleteInputData(InputData *input) {
|
||||||
{
|
|
||||||
lAlignedFree(input->chunk);
|
lAlignedFree(input->chunk);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -60,7 +60,7 @@
|
|||||||
#define DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE 1
|
#define DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE 1
|
||||||
|
|
||||||
static void *
|
static void *
|
||||||
lAlignedMalloc(int64_t size, int32_t alignment) {
|
lAlignedMalloc(size_t size, int32_t alignment) {
|
||||||
#ifdef ISPC_IS_WINDOWS
|
#ifdef ISPC_IS_WINDOWS
|
||||||
return _aligned_malloc(size, alignment);
|
return _aligned_malloc(size, alignment);
|
||||||
#endif
|
#endif
|
||||||
@@ -141,12 +141,10 @@ ComputeZBoundsRow(int tileY, int tileWidth, int tileHeight,
|
|||||||
{
|
{
|
||||||
for (int tileX = 0; tileX < numTilesX; ++tileX) {
|
for (int tileX = 0; tileX < numTilesX; ++tileX) {
|
||||||
float minZ, maxZ;
|
float minZ, maxZ;
|
||||||
ComputeZBounds(
|
ComputeZBounds(tileX * tileWidth, tileX * tileWidth + tileWidth,
|
||||||
tileX * tileWidth, tileX * tileWidth + tileWidth,
|
tileY * tileHeight, tileY * tileHeight + tileHeight,
|
||||||
tileY * tileHeight, tileY * tileHeight + tileHeight,
|
zBuffer, gBufferWidth, cameraProj_33, cameraProj_43,
|
||||||
zBuffer, gBufferWidth,
|
cameraNear, cameraFar, &minZ, &maxZ);
|
||||||
cameraProj_33, cameraProj_43, cameraNear, cameraFar,
|
|
||||||
&minZ, &maxZ);
|
|
||||||
minZArray[tileX] = minZ;
|
minZArray[tileX] = minZ;
|
||||||
maxZArray[tileX] = maxZ;
|
maxZArray[tileX] = maxZ;
|
||||||
}
|
}
|
||||||
@@ -282,8 +280,8 @@ void InitDynamicC(InputData *input) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// numLights need not be a multiple of programCount here, but the input and output arrays
|
/* We're going to split a tile into 4 sub-tiles. This function
|
||||||
// should be able to handle programCount-sized load/stores.
|
reclassifies the tile's lights with respect to the sub-tiles. */
|
||||||
static void
|
static void
|
||||||
SplitTileMinMax(
|
SplitTileMinMax(
|
||||||
int tileMidX, int tileMidY,
|
int tileMidX, int tileMidY,
|
||||||
@@ -339,7 +337,7 @@ SplitTileMinMax(
|
|||||||
float light_attenuationEnd = light_attenuationEnd_array[lightIndex];
|
float light_attenuationEnd = light_attenuationEnd_array[lightIndex];
|
||||||
float light_attenuationEndNeg = -light_attenuationEnd;
|
float light_attenuationEndNeg = -light_attenuationEnd;
|
||||||
|
|
||||||
// Test lights again subtile z bounds
|
// Test lights again against subtile z bounds
|
||||||
bool inFrustum[4];
|
bool inFrustum[4];
|
||||||
inFrustum[0] = (light_positionView_z - subtileMinZ[0] >= light_attenuationEndNeg) &&
|
inFrustum[0] = (light_positionView_z - subtileMinZ[0] >= light_attenuationEndNeg) &&
|
||||||
(subtileMaxZ[0] - light_positionView_z >= light_attenuationEndNeg);
|
(subtileMaxZ[0] - light_positionView_z >= light_attenuationEndNeg);
|
||||||
@@ -414,7 +412,8 @@ Float32ToUnorm8(float f) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static inline float half_to_float_fast(uint16_t h) {
|
static inline float
|
||||||
|
half_to_float_fast(uint16_t h) {
|
||||||
uint32_t hs = h & (int32_t)0x8000u; // Pick off sign bit
|
uint32_t hs = h & (int32_t)0x8000u; // Pick off sign bit
|
||||||
uint32_t he = h & (int32_t)0x7C00u; // Pick off exponent bits
|
uint32_t he = h & (int32_t)0x7C00u; // Pick off exponent bits
|
||||||
uint32_t hm = h & (int32_t)0x03FFu; // Pick off mantissa bits
|
uint32_t hm = h & (int32_t)0x03FFu; // Pick off mantissa bits
|
||||||
|
|||||||
@@ -60,7 +60,7 @@
|
|||||||
#define DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE 1
|
#define DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE 1
|
||||||
|
|
||||||
static void *
|
static void *
|
||||||
lAlignedMalloc(int64_t size, int32_t alignment) {
|
lAlignedMalloc(size_t size, int32_t alignment) {
|
||||||
#ifdef ISPC_IS_WINDOWS
|
#ifdef ISPC_IS_WINDOWS
|
||||||
return _aligned_malloc(size, alignment);
|
return _aligned_malloc(size, alignment);
|
||||||
#endif
|
#endif
|
||||||
@@ -395,4 +395,4 @@ DispatchDynamicCilk(InputData *input, Framebuffer *framebuffer)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif // __cilkplusplus
|
#endif // __cilk
|
||||||
|
|||||||
@@ -479,7 +479,7 @@ ShadeTile(
|
|||||||
// Static decomposition
|
// Static decomposition
|
||||||
|
|
||||||
task void
|
task void
|
||||||
RenderTile(uniform int g, uniform int num_groups_x, uniform int num_groups_y,
|
RenderTile(uniform int num_groups_x, uniform int num_groups_y,
|
||||||
reference uniform InputHeader inputHeader,
|
reference uniform InputHeader inputHeader,
|
||||||
reference uniform InputDataArrays inputData,
|
reference uniform InputDataArrays inputData,
|
||||||
uniform int visualizeLightCount,
|
uniform int visualizeLightCount,
|
||||||
@@ -487,16 +487,13 @@ RenderTile(uniform int g, uniform int num_groups_x, uniform int num_groups_y,
|
|||||||
reference uniform unsigned int8 framebuffer_r[],
|
reference uniform unsigned int8 framebuffer_r[],
|
||||||
reference uniform unsigned int8 framebuffer_g[],
|
reference uniform unsigned int8 framebuffer_g[],
|
||||||
reference uniform unsigned int8 framebuffer_b[]) {
|
reference uniform unsigned int8 framebuffer_b[]) {
|
||||||
uniform int32 group_y = g / num_groups_x;
|
uniform int32 group_y = taskIndex / num_groups_x;
|
||||||
uniform int32 group_x = g % num_groups_x;
|
uniform int32 group_x = taskIndex % num_groups_x;
|
||||||
uniform int32 tile_start_x = group_x * MIN_TILE_WIDTH;
|
uniform int32 tile_start_x = group_x * MIN_TILE_WIDTH;
|
||||||
uniform int32 tile_start_y = group_y * MIN_TILE_HEIGHT;
|
uniform int32 tile_start_y = group_y * MIN_TILE_HEIGHT;
|
||||||
uniform int32 tile_end_x = tile_start_x + MIN_TILE_WIDTH;
|
uniform int32 tile_end_x = tile_start_x + MIN_TILE_WIDTH;
|
||||||
uniform int32 tile_end_y = tile_start_y + MIN_TILE_HEIGHT;
|
uniform int32 tile_end_y = tile_start_y + MIN_TILE_HEIGHT;
|
||||||
|
|
||||||
uniform int sTileNumLights = 0;
|
|
||||||
uniform int sTileLightIndices[MAX_LIGHTS]; // Light list for the tile
|
|
||||||
|
|
||||||
uniform int framebufferWidth = inputHeader.framebufferWidth;
|
uniform int framebufferWidth = inputHeader.framebufferWidth;
|
||||||
uniform int framebufferHeight = inputHeader.framebufferHeight;
|
uniform int framebufferHeight = inputHeader.framebufferHeight;
|
||||||
uniform float cameraProj_00 = inputHeader.cameraProj[0][0];
|
uniform float cameraProj_00 = inputHeader.cameraProj[0][0];
|
||||||
@@ -504,8 +501,9 @@ RenderTile(uniform int g, uniform int num_groups_x, uniform int num_groups_y,
|
|||||||
uniform float cameraProj_22 = inputHeader.cameraProj[2][2];
|
uniform float cameraProj_22 = inputHeader.cameraProj[2][2];
|
||||||
uniform float cameraProj_32 = inputHeader.cameraProj[3][2];
|
uniform float cameraProj_32 = inputHeader.cameraProj[3][2];
|
||||||
|
|
||||||
// Light intersection
|
// Light intersection: figure out which lights illuminate this tile.
|
||||||
sTileNumLights =
|
uniform int tileLightIndices[MAX_LIGHTS]; // Light list for the tile
|
||||||
|
uniform int numTileLights =
|
||||||
IntersectLightsWithTile(tile_start_x, tile_end_x,
|
IntersectLightsWithTile(tile_start_x, tile_end_x,
|
||||||
tile_start_y, tile_end_y,
|
tile_start_y, tile_end_y,
|
||||||
framebufferWidth, framebufferHeight,
|
framebufferWidth, framebufferHeight,
|
||||||
@@ -518,12 +516,13 @@ RenderTile(uniform int g, uniform int num_groups_x, uniform int num_groups_y,
|
|||||||
inputData.lightPositionView_y,
|
inputData.lightPositionView_y,
|
||||||
inputData.lightPositionView_z,
|
inputData.lightPositionView_z,
|
||||||
inputData.lightAttenuationEnd,
|
inputData.lightAttenuationEnd,
|
||||||
sTileLightIndices);
|
tileLightIndices);
|
||||||
|
|
||||||
|
// And now shade the tile, using the lights in tileLightIndices
|
||||||
ShadeTile(tile_start_x, tile_end_x, tile_start_y, tile_end_y,
|
ShadeTile(tile_start_x, tile_end_x, tile_start_y, tile_end_y,
|
||||||
framebufferWidth, framebufferHeight, inputData,
|
framebufferWidth, framebufferHeight, inputData,
|
||||||
cameraProj_00, cameraProj_11, cameraProj_22, cameraProj_32,
|
cameraProj_00, cameraProj_11, cameraProj_22, cameraProj_32,
|
||||||
sTileLightIndices, sTileNumLights, visualizeLightCount,
|
tileLightIndices, numTileLights, visualizeLightCount,
|
||||||
framebuffer_r, framebuffer_g, framebuffer_b);
|
framebuffer_r, framebuffer_g, framebuffer_b);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -542,17 +541,19 @@ RenderStatic(reference uniform InputHeader inputHeader,
|
|||||||
MIN_TILE_HEIGHT - 1) / MIN_TILE_HEIGHT;
|
MIN_TILE_HEIGHT - 1) / MIN_TILE_HEIGHT;
|
||||||
uniform int num_groups = num_groups_x * num_groups_y;
|
uniform int num_groups = num_groups_x * num_groups_y;
|
||||||
|
|
||||||
for (uniform int g = 0; g < num_groups; ++g)
|
// Launch a task to render each tile, each of which is MIN_TILE_WIDTH
|
||||||
launch < RenderTile(g, num_groups_x, num_groups_y,
|
// by MIN_TILE_HEIGHT pixels.
|
||||||
inputHeader, inputData, visualizeLightCount,
|
launch[num_groups] < RenderTile(num_groups_x, num_groups_y,
|
||||||
framebuffer_r, framebuffer_g, framebuffer_b) >;
|
inputHeader, inputData, visualizeLightCount,
|
||||||
|
framebuffer_r, framebuffer_g, framebuffer_b) >;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////
|
||||||
// Routines for dynamic decomposition path
|
// Routines for dynamic decomposition path
|
||||||
|
|
||||||
// tile width must be a multiple of programCount (SIMD size)
|
// This computes the z min/max range for a whole row worth of tiles.
|
||||||
|
// The tile width must be a multiple of programCount (SIMD size)
|
||||||
export void
|
export void
|
||||||
ComputeZBoundsRow(
|
ComputeZBoundsRow(
|
||||||
uniform int32 tileY,
|
uniform int32 tileY,
|
||||||
@@ -583,6 +584,7 @@ ComputeZBoundsRow(
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// Reclassifies the lights with respect to four sub-tiles when we refine a tile.
|
||||||
// numLights need not be a multiple of programCount here, but the input and output arrays
|
// numLights need not be a multiple of programCount here, but the input and output arrays
|
||||||
// should be able to handle programCount-sized load/stores.
|
// should be able to handle programCount-sized load/stores.
|
||||||
export void
|
export void
|
||||||
|
|||||||
@@ -58,10 +58,45 @@
|
|||||||
#include "deferred.h"
|
#include "deferred.h"
|
||||||
#include "kernels_ispc.h"
|
#include "kernels_ispc.h"
|
||||||
#include "../timing.h"
|
#include "../timing.h"
|
||||||
|
#include "../cpuid.h"
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
// Make sure that the vector ISA used during compilation is supported by
|
||||||
|
// the processor. The ISPC_TARGET_* macro is set in the ispc-generated
|
||||||
|
// header file that we include above.
|
||||||
|
static void
|
||||||
|
ensureTargetISAIsSupported() {
|
||||||
|
#if defined(ISPC_TARGET_SSE2)
|
||||||
|
bool isaSupported = CPUSupportsSSE2();
|
||||||
|
const char *target = "SSE2";
|
||||||
|
#elif defined(ISPC_TARGET_SSE4)
|
||||||
|
bool isaSupported = CPUSupportsSSE4();
|
||||||
|
const char *target = "SSE4";
|
||||||
|
#elif defined(ISPC_TARGET_AVX)
|
||||||
|
bool isaSupported = CPUSupportsAVX();
|
||||||
|
const char *target = "AVX";
|
||||||
|
#else
|
||||||
|
#error "Unknown ISPC_TARGET_* value"
|
||||||
|
#endif
|
||||||
|
if (!isaSupported) {
|
||||||
|
fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
|
||||||
|
"set, which isn't\n*** supported by this computer's CPU!\n", target);
|
||||||
|
fprintf(stderr, "***\n*** Please modify the "
|
||||||
|
#ifdef _MSC_VER
|
||||||
|
"MSVC project file "
|
||||||
|
#else
|
||||||
|
"Makefile "
|
||||||
|
#endif
|
||||||
|
"to select another target (e.g. sse2)\n***\n");
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
int main(int argc, char** argv) {
|
int main(int argc, char** argv) {
|
||||||
|
ensureTargetISAIsSupported();
|
||||||
|
|
||||||
if (argc != 2) {
|
if (argc != 2) {
|
||||||
printf("usage: deferred_shading <input_file (e.g. data/pp1280x720.bin)>\n");
|
printf("usage: deferred_shading <input_file (e.g. data/pp1280x720.bin)>\n");
|
||||||
return 1;
|
return 1;
|
||||||
@@ -77,9 +112,9 @@ int main(int argc, char** argv) {
|
|||||||
input->header.framebufferHeight);
|
input->header.framebufferHeight);
|
||||||
|
|
||||||
InitDynamicC(input);
|
InitDynamicC(input);
|
||||||
#ifdef __cilkplusplus
|
#ifdef __cilk
|
||||||
InitDynamicCilk(input);
|
InitDynamicCilk(input);
|
||||||
#endif // __cilkplusplus
|
#endif // __cilk
|
||||||
|
|
||||||
int nframes = 5;
|
int nframes = 5;
|
||||||
double ispcCycles = 1e30;
|
double ispcCycles = 1e30;
|
||||||
@@ -98,20 +133,7 @@ int main(int argc, char** argv) {
|
|||||||
input->header.framebufferWidth, input->header.framebufferHeight);
|
input->header.framebufferWidth, input->header.framebufferHeight);
|
||||||
WriteFrame("deferred-ispc-static.ppm", input, framebuffer);
|
WriteFrame("deferred-ispc-static.ppm", input, framebuffer);
|
||||||
|
|
||||||
double serialCycles = 1e30;
|
#ifdef __cilk
|
||||||
for (int i = 0; i < 5; ++i) {
|
|
||||||
framebuffer.clear();
|
|
||||||
reset_and_start_timer();
|
|
||||||
for (int j = 0; j < nframes; ++j)
|
|
||||||
DispatchDynamicC(input, &framebuffer);
|
|
||||||
double mcycles = get_elapsed_mcycles() / nframes;
|
|
||||||
serialCycles = std::min(serialCycles, mcycles);
|
|
||||||
}
|
|
||||||
printf("[C++ serial dynamic, 1 core]:\t[%.3f] million cycles\n",
|
|
||||||
serialCycles);
|
|
||||||
WriteFrame("deferred-serial-dynamic.ppm", input, framebuffer);
|
|
||||||
|
|
||||||
#ifdef __cilkplusplus
|
|
||||||
double dynamicCilkCycles = 1e30;
|
double dynamicCilkCycles = 1e30;
|
||||||
for (int i = 0; i < 5; ++i) {
|
for (int i = 0; i < 5; ++i) {
|
||||||
framebuffer.clear();
|
framebuffer.clear();
|
||||||
@@ -121,15 +143,30 @@ int main(int argc, char** argv) {
|
|||||||
double mcycles = get_elapsed_mcycles() / nframes;
|
double mcycles = get_elapsed_mcycles() / nframes;
|
||||||
dynamicCilkCycles = std::min(dynamicCilkCycles, mcycles);
|
dynamicCilkCycles = std::min(dynamicCilkCycles, mcycles);
|
||||||
}
|
}
|
||||||
printf("[ispc + Cilk dynamic]:\t\t[%.3f] million cycles\n",
|
printf("[ispc + Cilk dynamic]:\t\t[%.3f] million cycles to render image\n",
|
||||||
dynamicCilkCycles);
|
dynamicCilkCycles);
|
||||||
WriteFrame("deferred-ispc-dynamic.ppm", input, framebuffer);
|
WriteFrame("deferred-ispc-dynamic.ppm", input, framebuffer);
|
||||||
|
#endif // __cilk
|
||||||
|
|
||||||
|
double serialCycles = 1e30;
|
||||||
|
for (int i = 0; i < 5; ++i) {
|
||||||
|
framebuffer.clear();
|
||||||
|
reset_and_start_timer();
|
||||||
|
for (int j = 0; j < nframes; ++j)
|
||||||
|
DispatchDynamicC(input, &framebuffer);
|
||||||
|
double mcycles = get_elapsed_mcycles() / nframes;
|
||||||
|
serialCycles = std::min(serialCycles, mcycles);
|
||||||
|
}
|
||||||
|
printf("[C++ serial dynamic, 1 core]:\t[%.3f] million cycles to render image\n",
|
||||||
|
serialCycles);
|
||||||
|
WriteFrame("deferred-serial-dynamic.ppm", input, framebuffer);
|
||||||
|
|
||||||
|
#ifdef __cilk
|
||||||
printf("\t\t\t\t(%.2fx speedup from static ISPC, %.2fx from Cilk+ISPC)\n",
|
printf("\t\t\t\t(%.2fx speedup from static ISPC, %.2fx from Cilk+ISPC)\n",
|
||||||
serialCycles/ispcCycles, serialCycles/dynamicCilkCycles);
|
serialCycles/ispcCycles, serialCycles/dynamicCilkCycles);
|
||||||
#else
|
#else
|
||||||
printf("\t\t\t\t(%.2fx speedup from ISPC)\n", serialCycles/ispcCycles);
|
printf("\t\t\t\t(%.2fx speedup from ISPC)\n", serialCycles/ispcCycles);
|
||||||
#endif // __cilkplusplus
|
#endif // __cilk
|
||||||
|
|
||||||
DeleteInputData(input);
|
DeleteInputData(input);
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user