diff --git a/examples_cuda/common.mk b/examples_cuda/common.mk index db7b8eee..fc956329 100644 --- a/examples_cuda/common.mk +++ b/examples_cuda/common.mk @@ -3,9 +3,9 @@ TASK_CXX=../tasksys.cpp TASK_LIB=-lpthread TASK_OBJ=objs/tasksys.o -CXX=clang++ +CXX=icc -openmp CXXFLAGS+=-Iobjs/ -O2 -CC=clang +CC=icc -openmp CCFLAGS+=-Iobjs/ -O2 LIBS=-lm $(TASK_LIB) -lstdc++ diff --git a/examples_cuda/deferred/deferred-serial-dynamic.ppm b/examples_cuda/deferred/deferred-serial-dynamic.ppm index 57aa4cf1..a3f3b9bf 100644 Binary files a/examples_cuda/deferred/deferred-serial-dynamic.ppm and b/examples_cuda/deferred/deferred-serial-dynamic.ppm differ diff --git a/examples_cuda/deferred/dynamic_c.cpp b/examples_cuda/deferred/dynamic_c.cpp index 8ed9a648..f0d4703a 100644 --- a/examples_cuda/deferred/dynamic_c.cpp +++ b/examples_cuda/deferred/dynamic_c.cpp @@ -51,8 +51,8 @@ #endif // ISPC_IS_LINUX // Currently tile widths must be a multiple of SIMD width (i.e. 8 for ispc sse4x2)! -#define MIN_TILE_WIDTH 16 -#define MIN_TILE_HEIGHT 16 +//#define MIN_TILE_WIDTH 16 +//#define MIN_TILE_HEIGHT 16 #define DYNAMIC_TREE_LEVELS 5 diff --git a/examples_cuda/deferred/dynamic_cilk.cpp b/examples_cuda/deferred/dynamic_cilk.cpp index 87a0c7da..a6125559 100644 --- a/examples_cuda/deferred/dynamic_cilk.cpp +++ b/examples_cuda/deferred/dynamic_cilk.cpp @@ -51,8 +51,8 @@ #endif // ISPC_IS_LINUX // Currently tile widths must be a multiple of SIMD width (i.e. 8 for ispc sse4x2)! 
-#define MIN_TILE_WIDTH 16 -#define MIN_TILE_HEIGHT 16 +//#define MIN_TILE_WIDTH 64 +//#define MIN_TILE_HEIGHT 16 #define DYNAMIC_TREE_LEVELS 5 @@ -247,7 +247,7 @@ ShadeDynamicTileRecurse(InputData *input, int level, int tileX, int tileY, ispc::ShadeTile( startX, endX, startY, endY, input->header.framebufferWidth, input->header.framebufferHeight, - &input->arrays, + input->arrays, input->header.cameraProj[0][0], input->header.cameraProj[1][1], input->header.cameraProj[2][2], input->header.cameraProj[3][2], lightIndices, numLights, VISUALIZE_LIGHT_COUNT, diff --git a/examples_cuda/deferred/kernels.ispc b/examples_cuda/deferred/kernels.ispc index 80b70ed4..6d2a8cc9 100644 --- a/examples_cuda/deferred/kernels.ispc +++ b/examples_cuda/deferred/kernels.ispc @@ -472,7 +472,6 @@ RenderTile(uniform int num_groups_x, uniform int num_groups_y, uniform float cameraProj_22 = inputHeader.cameraProj[2][2]; uniform float cameraProj_32 = inputHeader.cameraProj[3][2]; - // Light intersection: figure out which lights illuminate this tile. uniform int tileLightIndices[MAX_LIGHTS]; // Light list for the tile uniform int numTileLights = @@ -490,7 +489,6 @@ RenderTile(uniform int num_groups_x, uniform int num_groups_y, inputData.lightAttenuationEnd, tileLightIndices); - // And now shade the tile, using the lights in tileLightIndices ShadeTile(tile_start_x, tile_end_x, tile_start_y, tile_end_y, framebufferWidth, framebufferHeight, inputData, @@ -521,3 +519,154 @@ RenderStatic(uniform InputHeader &inputHeader, framebuffer_r, framebuffer_g, framebuffer_b); } + +/////////////////////////////////////////////////////////////////////////// +// Routines for dynamic decomposition path + +// This computes the z min/max range for a whole row worth of tiles. 
+export void
+ComputeZBoundsRow(
+    uniform int32 tileY,                                // tile row index; scaled by tileHeight to get the y extent
+    uniform int32 tileWidth, uniform int32 tileHeight,
+    uniform int32 numTilesX, uniform int32 numTilesY,   // numTilesY is not read in this function
+    // G-buffer data (forwarded unchanged to ComputeZBounds)
+    uniform float zBuffer[],
+    uniform int32 gBufferWidth,
+    // Camera data (forwarded unchanged to ComputeZBounds)
+    uniform float cameraProj_33, uniform float cameraProj_43,
+    uniform float cameraNear, uniform float cameraFar,
+    // Output: one entry is written per tile of the row, at index tileX
+    uniform float minZArray[],
+    uniform float maxZArray[]
+    )
+{
+    for (uniform int32 tileX = 0; tileX < numTilesX; ++tileX) {   // one iteration per tile in row tileY
+        uniform float minZ, maxZ;   // presumably filled in by ComputeZBounds (defined outside this view)
+        ComputeZBounds(
+            tileX * tileWidth,  tileX * tileWidth + tileWidth,
+            tileY * tileHeight, tileY * tileHeight + tileHeight,
+            zBuffer, gBufferWidth,
+            cameraProj_33, cameraProj_43, cameraNear, cameraFar,
+            minZ, maxZ);
+        minZArray[tileX] = minZ;
+        maxZArray[tileX] = maxZ;
+    }
+}
+
+
+// Reclassifies the lights with respect to four sub-tiles when we refine a tile.
+// numLights need not be a multiple of programCount here, but the input and output arrays
+// should be able to handle programCount-sized load/stores.
+export void
+SplitTileMinMax(
+    uniform int32 tileMidX, uniform int32 tileMidY,
+    // Subtile data (00, 10, 01, 11): per-subtile z bounds, entries 0..3 are read
+    uniform float subtileMinZ[],
+    uniform float subtileMaxZ[],
+    // G-buffer data
+    uniform int32 gBufferWidth, uniform int32 gBufferHeight,
+    // Camera data
+    uniform float cameraProj_11, uniform float cameraProj_22,
+    // Light Data: lightIndices[0 .. numLights) index into the four light_* arrays
+    uniform int32 lightIndices[],
+    uniform int32 numLights,
+    uniform float light_positionView_x_array[],
+    uniform float light_positionView_y_array[],
+    uniform float light_positionView_z_array[],
+    uniform float light_attenuationEnd_array[],
+    // Outputs: lights kept for subtile k are packed starting at subtileIndices[k * subtileIndicesPitch]
+    uniform int32 subtileIndices[],
+    uniform int32 subtileIndicesPitch,
+    uniform int32 subtileNumLights[]     // receives the number of lights kept for each of the 4 subtiles
+    )
+{
+    uniform float gBufferScale_x = 0.5f * (float)gBufferWidth;
+    uniform float gBufferScale_y = 0.5f * (float)gBufferHeight;
+
+    uniform float frustumPlanes_xy[2] = { -(cameraProj_11 * gBufferScale_x),
+                                           (cameraProj_22 * gBufferScale_y) };
+    uniform float frustumPlanes_z[2] = { tileMidX - gBufferScale_x,
+                                         tileMidY - gBufferScale_y };
+
+    // Normalize each (xy, z) coefficient pair to unit length
+    uniform float norm[2] = { rsqrt(frustumPlanes_xy[0] * frustumPlanes_xy[0] +
+                                    frustumPlanes_z[0] * frustumPlanes_z[0]),
+                              rsqrt(frustumPlanes_xy[1] * frustumPlanes_xy[1] +
+                                    frustumPlanes_z[1] * frustumPlanes_z[1]) };
+    frustumPlanes_xy[0] *= norm[0];
+    frustumPlanes_xy[1] *= norm[1];
+    frustumPlanes_z[0] *= norm[0];
+    frustumPlanes_z[1] *= norm[1];
+
+    // Initialize each subtile's write cursor to the start of its region in subtileIndices
+    uniform int32 subtileLightOffset[4];
+    subtileLightOffset[0] = 0 * subtileIndicesPitch;
+    subtileLightOffset[1] = 1 * subtileIndicesPitch;
+    subtileLightOffset[2] = 2 * subtileIndicesPitch;
+    subtileLightOffset[3] = 3 * subtileIndicesPitch;
+
+    foreach (i = 0 ... numLights) {
+        int32 lightIndex = lightIndices[i];
+
+        float light_positionView_x = light_positionView_x_array[lightIndex];
+        float light_positionView_y = light_positionView_y_array[lightIndex];
+        float light_positionView_z = light_positionView_z_array[lightIndex];
+        float light_attenuationEnd = light_attenuationEnd_array[lightIndex];
+        float light_attenuationEndNeg = -light_attenuationEnd;
+
+        // Test lights against subtile z bounds
+        bool inFrustum[4];
+        inFrustum[0] = (light_positionView_z - subtileMinZ[0] >= light_attenuationEndNeg) &&
+                       (subtileMaxZ[0] - light_positionView_z >= light_attenuationEndNeg);
+        inFrustum[1] = (light_positionView_z - subtileMinZ[1] >= light_attenuationEndNeg) &&
+                       (subtileMaxZ[1] - light_positionView_z >= light_attenuationEndNeg);
+        inFrustum[2] = (light_positionView_z - subtileMinZ[2] >= light_attenuationEndNeg) &&
+                       (subtileMaxZ[2] - light_positionView_z >= light_attenuationEndNeg);
+        inFrustum[3] = (light_positionView_z - subtileMinZ[3] >= light_attenuationEndNeg) &&
+                       (subtileMaxZ[3] - light_positionView_z >= light_attenuationEndNeg);
+
+        float dx = light_positionView_z * frustumPlanes_z[0] +
+                   light_positionView_x * frustumPlanes_xy[0];   // dot product with the normalized x-split coefficients
+        float dy = light_positionView_z * frustumPlanes_z[1] +
+                   light_positionView_y * frustumPlanes_xy[1];   // dot product with the normalized y-split coefficients
+
+        cif (abs(dx) > light_attenuationEnd) {   // farther than attenuationEnd from the x split: keep one side only
+            bool positiveX = dx > 0.0f;
+            inFrustum[0] = inFrustum[0] && positiveX;   // 00 subtile
+            inFrustum[1] = inFrustum[1] && !positiveX;  // 10 subtile
+            inFrustum[2] = inFrustum[2] && positiveX;   // 01 subtile
+            inFrustum[3] = inFrustum[3] && !positiveX;  // 11 subtile
+        }
+        cif (abs(dy) > light_attenuationEnd) {   // farther than attenuationEnd from the y split: keep one side only
+            bool positiveY = dy > 0.0f;
+            inFrustum[0] = inFrustum[0] && positiveY;   // 00 subtile
+            inFrustum[1] = inFrustum[1] && positiveY;   // 10 subtile
+            inFrustum[2] = inFrustum[2] && !positiveY;  // 01 subtile
+            inFrustum[3] = inFrustum[3] && !positiveY;  // 11 subtile
+        }
+
+        // Pack and store intersecting lights
+        // TODO: Experiment with a loop here instead
+        cif (inFrustum[0])
+            subtileLightOffset[0] +=
+                packed_store_active(&subtileIndices[subtileLightOffset[0]],
+                                    lightIndex);
+        cif (inFrustum[1])
+            subtileLightOffset[1] +=
+                packed_store_active(&subtileIndices[subtileLightOffset[1]],
+                                    lightIndex);
+        cif (inFrustum[2])
+            subtileLightOffset[2] +=
+                packed_store_active(&subtileIndices[subtileLightOffset[2]],
+                                    lightIndex);
+        cif (inFrustum[3])
+            subtileLightOffset[3] +=
+                packed_store_active(&subtileIndices[subtileLightOffset[3]],
+                                    lightIndex);
+    }
+
+    subtileNumLights[0] = subtileLightOffset[0] - 0 * subtileIndicesPitch;   // cursor minus region start = count
+    subtileNumLights[1] = subtileLightOffset[1] - 1 * subtileIndicesPitch;
+    subtileNumLights[2] = subtileLightOffset[2] - 2 * subtileIndicesPitch;
+    subtileNumLights[3] = subtileLightOffset[3] - 3 * subtileIndicesPitch;
+}
diff --git a/examples_cuda/deferred/main.cpp b/examples_cuda/deferred/main.cpp
index 4f2be879..1e4cee4d 100644
--- a/examples_cuda/deferred/main.cpp
+++ b/examples_cuda/deferred/main.cpp
@@ -59,6 +59,19 @@
 #include "kernels_ispc.h"
 #include "../timing.h"
 
+#include <sys/time.h>
+static inline double rtc(void)   // wall-clock time in seconds, read via gettimeofday()
+{
+  struct timeval Tvalue;
+  double etime;
+  struct timezone dummy;
+
+  gettimeofday(&Tvalue,&dummy);
+  etime =  (double) Tvalue.tv_sec +
+    1.e-6*((double) Tvalue.tv_usec);   // microseconds folded in as a fraction of a second
+  return etime;
+}
+
 ///////////////////////////////////////////////////////////////////////////
 
 int main(int argc, char** argv) {
@@ -85,12 +98,12 @@ int main(int argc, char** argv) {
     double ispcCycles = 1e30;
     for (int i = 0; i < 5; ++i) {
         framebuffer.clear();
-        reset_and_start_timer();
+        const double t0 = rtc();
         for (int j = 0; j < nframes; ++j)
             ispc::RenderStatic(input->header, input->arrays,
                                VISUALIZE_LIGHT_COUNT,
                                framebuffer.r, framebuffer.g, framebuffer.b);
-        double mcycles = get_elapsed_mcycles() / nframes;
+        double mcycles = 1000*(rtc() - t0) / nframes;   // NOTE: wall-clock milliseconds per frame, despite the "mcycles" name
         ispcCycles = std::min(ispcCycles, mcycles);
     }
     printf("[ispc static + tasks]:\t\t[%.3f] million cycles to render "
@@ -102,10 +115,10 @@ int main(int
argc, char** argv) { double dynamicCilkCycles = 1e30; for (int i = 0; i < 5; ++i) { framebuffer.clear(); - reset_and_start_timer(); + const double t0 = rtc(); for (int j = 0; j < nframes; ++j) DispatchDynamicCilk(input, &framebuffer); - double mcycles = get_elapsed_mcycles() / nframes; + double mcycles = 1000*(rtc() - t0) / nframes; dynamicCilkCycles = std::min(dynamicCilkCycles, mcycles); } printf("[ispc + Cilk dynamic]:\t\t[%.3f] million cycles to render image\n", @@ -116,10 +129,10 @@ int main(int argc, char** argv) { double serialCycles = 1e30; for (int i = 0; i < 5; ++i) { framebuffer.clear(); - reset_and_start_timer(); + const double t0 = rtc(); for (int j = 0; j < nframes; ++j) DispatchDynamicC(input, &framebuffer); - double mcycles = get_elapsed_mcycles() / nframes; + double mcycles = 1000*(rtc() - t0) / nframes; serialCycles = std::min(serialCycles, mcycles); } printf("[C++ serial dynamic, 1 core]:\t[%.3f] million cycles to render image\n",