diff --git a/examples/deferred/common.cpp b/examples/deferred/common.cpp index c8fdc36a..dc2b928f 100644 --- a/examples/deferred/common.cpp +++ b/examples/deferred/common.cpp @@ -64,7 +64,7 @@ /////////////////////////////////////////////////////////////////////////// static void * -lAlignedMalloc(int64_t size, int32_t alignment) { +lAlignedMalloc(size_t size, int32_t alignment) { #ifdef ISPC_IS_WINDOWS return _aligned_malloc(size, alignment); #endif @@ -118,6 +118,7 @@ Framebuffer::clear() { memset(b, 0, nPixels); } + InputData * CreateInputDataFromFile(const char *path) { FILE *in = fopen(path, "rb"); @@ -177,8 +178,7 @@ CreateInputDataFromFile(const char *path) { } -void DeleteInputData(InputData *input) -{ +void DeleteInputData(InputData *input) { lAlignedFree(input->chunk); } diff --git a/examples/deferred/dynamic_c.cpp b/examples/deferred/dynamic_c.cpp index 27e9a839..8ed9a648 100644 --- a/examples/deferred/dynamic_c.cpp +++ b/examples/deferred/dynamic_c.cpp @@ -60,7 +60,7 @@ #define DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE 1 static void * -lAlignedMalloc(int64_t size, int32_t alignment) { +lAlignedMalloc(size_t size, int32_t alignment) { #ifdef ISPC_IS_WINDOWS return _aligned_malloc(size, alignment); #endif @@ -141,12 +141,10 @@ ComputeZBoundsRow(int tileY, int tileWidth, int tileHeight, { for (int tileX = 0; tileX < numTilesX; ++tileX) { float minZ, maxZ; - ComputeZBounds( - tileX * tileWidth, tileX * tileWidth + tileWidth, - tileY * tileHeight, tileY * tileHeight + tileHeight, - zBuffer, gBufferWidth, - cameraProj_33, cameraProj_43, cameraNear, cameraFar, - &minZ, &maxZ); + ComputeZBounds(tileX * tileWidth, tileX * tileWidth + tileWidth, + tileY * tileHeight, tileY * tileHeight + tileHeight, + zBuffer, gBufferWidth, cameraProj_33, cameraProj_43, + cameraNear, cameraFar, &minZ, &maxZ); minZArray[tileX] = minZ; maxZArray[tileX] = maxZ; } @@ -282,8 +280,8 @@ void InitDynamicC(InputData *input) { } -// numLights need not be a multiple of programCount here, but the input and output arrays -// should be able to handle programCount-sized load/stores. +/* We're going to split a tile into 4 sub-tiles. This function + reclassifies the tile's lights with respect to the sub-tiles. */ static void SplitTileMinMax( int tileMidX, int tileMidY, @@ -339,7 +337,7 @@ SplitTileMinMax( float light_attenuationEnd = light_attenuationEnd_array[lightIndex]; float light_attenuationEndNeg = -light_attenuationEnd; - // Test lights again subtile z bounds + // Test lights again against subtile z bounds bool inFrustum[4]; inFrustum[0] = (light_positionView_z - subtileMinZ[0] >= light_attenuationEndNeg) && (subtileMaxZ[0] - light_positionView_z >= light_attenuationEndNeg); @@ -414,7 +412,8 @@ Float32ToUnorm8(float f) { } -static inline float half_to_float_fast(uint16_t h) { +static inline float +half_to_float_fast(uint16_t h) { uint32_t hs = h & (int32_t)0x8000u; // Pick off sign bit uint32_t he = h & (int32_t)0x7C00u; // Pick off exponent bits uint32_t hm = h & (int32_t)0x03FFu; // Pick off mantissa bits diff --git a/examples/deferred/dynamic_cilk.cpp b/examples/deferred/dynamic_cilk.cpp index 2bcfced6..58cfb7b1 100644 --- a/examples/deferred/dynamic_cilk.cpp +++ b/examples/deferred/dynamic_cilk.cpp @@ -60,7 +60,7 @@ #define DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE 1 static void * -lAlignedMalloc(int64_t size, int32_t alignment) { +lAlignedMalloc(size_t size, int32_t alignment) { #ifdef ISPC_IS_WINDOWS return _aligned_malloc(size, alignment); #endif @@ -395,4 +395,4 @@ DispatchDynamicCilk(InputData *input, Framebuffer *framebuffer) } } -#endif // __cilkplusplus +#endif // __cilk diff --git a/examples/deferred/kernels.ispc b/examples/deferred/kernels.ispc index 27e9c5d1..7fc046e4 100644 --- a/examples/deferred/kernels.ispc +++ b/examples/deferred/kernels.ispc @@ -479,7 +479,7 @@ ShadeTile( // Static decomposition task void -RenderTile(uniform int g, uniform int num_groups_x, uniform int num_groups_y, +RenderTile(uniform int num_groups_x, uniform int num_groups_y, reference uniform InputHeader inputHeader, reference uniform InputDataArrays inputData, uniform int visualizeLightCount, @@ -487,16 +487,13 @@ RenderTile(uniform int g, uniform int num_groups_x, uniform int num_groups_y, reference uniform unsigned int8 framebuffer_r[], reference uniform unsigned int8 framebuffer_g[], reference uniform unsigned int8 framebuffer_b[]) { - uniform int32 group_y = g / num_groups_x; - uniform int32 group_x = g % num_groups_x; + uniform int32 group_y = taskIndex / num_groups_x; + uniform int32 group_x = taskIndex % num_groups_x; uniform int32 tile_start_x = group_x * MIN_TILE_WIDTH; uniform int32 tile_start_y = group_y * MIN_TILE_HEIGHT; uniform int32 tile_end_x = tile_start_x + MIN_TILE_WIDTH; uniform int32 tile_end_y = tile_start_y + MIN_TILE_HEIGHT; - uniform int sTileNumLights = 0; - uniform int sTileLightIndices[MAX_LIGHTS]; // Light list for the tile - uniform int framebufferWidth = inputHeader.framebufferWidth; uniform int framebufferHeight = inputHeader.framebufferHeight; uniform float cameraProj_00 = inputHeader.cameraProj[0][0]; @@ -504,8 +501,9 @@ RenderTile(uniform int g, uniform int num_groups_x, uniform int num_groups_y, uniform float cameraProj_22 = inputHeader.cameraProj[2][2]; uniform float cameraProj_32 = inputHeader.cameraProj[3][2]; - // Light intersection - sTileNumLights = + // Light intersection: figure out which lights illuminate this tile. + uniform int tileLightIndices[MAX_LIGHTS]; // Light list for the tile + uniform int numTileLights = IntersectLightsWithTile(tile_start_x, tile_end_x, tile_start_y, tile_end_y, framebufferWidth, framebufferHeight, @@ -518,12 +516,13 @@ RenderTile(uniform int g, uniform int num_groups_x, uniform int num_groups_y, inputData.lightPositionView_y, inputData.lightPositionView_z, inputData.lightAttenuationEnd, - sTileLightIndices); + tileLightIndices); + // And now shade the tile, using the lights in tileLightIndices ShadeTile(tile_start_x, tile_end_x, tile_start_y, tile_end_y, framebufferWidth, framebufferHeight, inputData, cameraProj_00, cameraProj_11, cameraProj_22, cameraProj_32, - sTileLightIndices, sTileNumLights, visualizeLightCount, + tileLightIndices, numTileLights, visualizeLightCount, framebuffer_r, framebuffer_g, framebuffer_b); } @@ -542,17 +541,19 @@ RenderStatic(reference uniform InputHeader inputHeader, MIN_TILE_HEIGHT - 1) / MIN_TILE_HEIGHT; uniform int num_groups = num_groups_x * num_groups_y; - for (uniform int g = 0; g < num_groups; ++g) - launch < RenderTile(g, num_groups_x, num_groups_y, - inputHeader, inputData, visualizeLightCount, - framebuffer_r, framebuffer_g, framebuffer_b) >; + // Launch a task to render each tile, each of which is MIN_TILE_WIDTH + // by MIN_TILE_HEIGHT pixels. + launch[num_groups] < RenderTile(num_groups_x, num_groups_y, + inputHeader, inputData, visualizeLightCount, + framebuffer_r, framebuffer_g, framebuffer_b) >; } /////////////////////////////////////////////////////////////////////////// // Routines for dynamic decomposition path -// tile width must be a multiple of programCount (SIMD size) +// This computes the z min/max range for a whole row worth of tiles. +// The tile width must be a multiple of programCount (SIMD size) export void ComputeZBoundsRow( uniform int32 tileY, @@ -583,6 +584,7 @@ ComputeZBoundsRow( } +// Reclassifies the lights with respect to four sub-tiles when we refine a tile. // numLights need not be a multiple of programCount here, but the input and output arrays // should be able to handle programCount-sized load/stores. export void diff --git a/examples/deferred/main.cpp b/examples/deferred/main.cpp index 40964295..0c770049 100644 --- a/examples/deferred/main.cpp +++ b/examples/deferred/main.cpp @@ -58,10 +58,45 @@ #include "deferred.h" #include "kernels_ispc.h" #include "../timing.h" +#include "../cpuid.h" /////////////////////////////////////////////////////////////////////////// +// Make sure that the vector ISA used during compilation is supported by +// the processor. The ISPC_TARGET_* macro is set in the ispc-generated +// header file that we include above. +static void +ensureTargetISAIsSupported() { +#if defined(ISPC_TARGET_SSE2) + bool isaSupported = CPUSupportsSSE2(); + const char *target = "SSE2"; +#elif defined(ISPC_TARGET_SSE4) + bool isaSupported = CPUSupportsSSE4(); + const char *target = "SSE4"; +#elif defined(ISPC_TARGET_AVX) + bool isaSupported = CPUSupportsAVX(); + const char *target = "AVX"; +#else +#error "Unknown ISPC_TARGET_* value" +#endif + if (!isaSupported) { + fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction " + "set, which isn't\n*** supported by this computer's CPU!\n", target); + fprintf(stderr, "***\n*** Please modify the " +#ifdef _MSC_VER + "MSVC project file " +#else + "Makefile " +#endif + "to select another target (e.g. sse2)\n***\n"); + exit(1); + } +} + + int main(int argc, char** argv) { + ensureTargetISAIsSupported(); + if (argc != 2) { printf("usage: deferred_shading \n"); return 1; @@ -77,9 +112,9 @@ int main(int argc, char** argv) { input->header.framebufferHeight); InitDynamicC(input); -#ifdef __cilkplusplus +#ifdef __cilk InitDynamicCilk(input); -#endif // __cilkplusplus +#endif // __cilk int nframes = 5; double ispcCycles = 1e30; @@ -98,20 +133,7 @@ int main(int argc, char** argv) { input->header.framebufferWidth, input->header.framebufferHeight); WriteFrame("deferred-ispc-static.ppm", input, framebuffer); - double serialCycles = 1e30; - for (int i = 0; i < 5; ++i) { - framebuffer.clear(); - reset_and_start_timer(); - for (int j = 0; j < nframes; ++j) - DispatchDynamicC(input, &framebuffer); - double mcycles = get_elapsed_mcycles() / nframes; - serialCycles = std::min(serialCycles, mcycles); - } - printf("[C++ serial dynamic, 1 core]:\t[%.3f] million cycles\n", - serialCycles); - WriteFrame("deferred-serial-dynamic.ppm", input, framebuffer); - -#ifdef __cilkplusplus +#ifdef __cilk double dynamicCilkCycles = 1e30; for (int i = 0; i < 5; ++i) { framebuffer.clear(); @@ -121,15 +143,30 @@ int main(int argc, char** argv) { double mcycles = get_elapsed_mcycles() / nframes; dynamicCilkCycles = std::min(dynamicCilkCycles, mcycles); } - printf("[ispc + Cilk dynamic]:\t\t[%.3f] million cycles\n", + printf("[ispc + Cilk dynamic]:\t\t[%.3f] million cycles to render image\n", dynamicCilkCycles); WriteFrame("deferred-ispc-dynamic.ppm", input, framebuffer); +#endif // __cilk + double serialCycles = 1e30; + for (int i = 0; i < 5; ++i) { + framebuffer.clear(); + reset_and_start_timer(); + for (int j = 0; j < nframes; ++j) + DispatchDynamicC(input, &framebuffer); + double mcycles = get_elapsed_mcycles() / nframes; + serialCycles = std::min(serialCycles, mcycles); + } + printf("[C++ serial dynamic, 1 core]:\t[%.3f] million cycles to render image\n", + serialCycles); + WriteFrame("deferred-serial-dynamic.ppm", input, framebuffer); + +#ifdef __cilk printf("\t\t\t\t(%.2fx speedup from static ISPC, %.2fx from Cilk+ISPC)\n", serialCycles/ispcCycles, serialCycles/dynamicCilkCycles); #else printf("\t\t\t\t(%.2fx speedup from ISPC)\n", serialCycles/ispcCycles); -#endif // __cilkplusplus +#endif // __cilk DeleteInputData(input);