diff --git a/examples_ptx/deferred/Makefile_cpu b/examples_ptx/deferred/Makefile_cpu new file mode 100644 index 00000000..be8ce7c4 --- /dev/null +++ b/examples_ptx/deferred/Makefile_cpu @@ -0,0 +1,9 @@ + +EXAMPLE=deferred_shading +CPP_SRC=common.cpp main.cpp dynamic_c.cpp dynamic_cilk.cpp +ISPC_SRC=kernels.ispc +ISPC_IA_TARGETS=sse2-i32x4,sse4-i32x8,avx1-i32x16,avx2-i32x16 +ISPC_ARM_TARGETS=neon +ISPC_FLAGS=--opt=fast-math + +include ../common.mk diff --git a/examples_ptx/deferred/common.cpp b/examples_ptx/deferred/common.cpp new file mode 100644 index 00000000..fa4ee57b --- /dev/null +++ b/examples_ptx/deferred/common.cpp @@ -0,0 +1,210 @@ +/* + Copyright (c) 2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifdef _MSC_VER +#define _CRT_SECURE_NO_WARNINGS +#define ISPC_IS_WINDOWS +#elif defined(__linux__) +#define ISPC_IS_LINUX +#elif defined(__APPLE__) +#define ISPC_IS_APPLE +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef ISPC_IS_WINDOWS + #define WIN32_LEAN_AND_MEAN + #include +#endif +#ifdef ISPC_IS_LINUX + #include +#endif +#include "deferred.h" +#include "../timing.h" + +/////////////////////////////////////////////////////////////////////////// + +static void * +lAlignedMalloc(size_t size, int32_t alignment) { +#ifdef ISPC_IS_WINDOWS + return _aligned_malloc(size, alignment); +#endif +#ifdef ISPC_IS_LINUX + return memalign(alignment, size); +#endif +#ifdef ISPC_IS_APPLE + void *mem = malloc(size + (alignment-1) + sizeof(void*)); + char *amem = ((char*)mem) + sizeof(void*); + amem = amem + uint32_t(alignment - (reinterpret_cast(amem) & + (alignment - 1))); + ((void**)amem)[-1] = mem; + return amem; +#endif +} + + +static void +lAlignedFree(void *ptr) { +#ifdef ISPC_IS_WINDOWS + _aligned_free(ptr); +#endif +#ifdef ISPC_IS_LINUX + free(ptr); +#endif +#ifdef ISPC_IS_APPLE + free(((void**)ptr)[-1]); +#endif +} + + +Framebuffer::Framebuffer(int width, int height) { + nPixels = width*height; + r = (uint8_t *)lAlignedMalloc(nPixels, ALIGNMENT_BYTES); + g = (uint8_t *)lAlignedMalloc(nPixels, ALIGNMENT_BYTES); + b = (uint8_t *)lAlignedMalloc(nPixels, ALIGNMENT_BYTES); +} + + +Framebuffer::~Framebuffer() { + lAlignedFree(r); + lAlignedFree(g); + lAlignedFree(b); +} + + +void +Framebuffer::clear() { + memset(r, 0, nPixels); + memset(g, 0, nPixels); + memset(b, 0, nPixels); +} + + +InputData * +CreateInputDataFromFile(const char *path) { + FILE *in = fopen(path, "rb"); + if (!in) return 0; + + InputData *input = new InputData; + + // Load header + if (fread(&input->header, sizeof(ispc::InputHeader), 1, in) != 1) { + fprintf(stderr, "Preumature EOF reading file \"%s\"\n", path); + return NULL; + } + + // Load data chunk and update pointers + input->chunk = (uint8_t *)lAlignedMalloc(input->header.inputDataChunkSize, + ALIGNMENT_BYTES); + if (fread(input->chunk, input->header.inputDataChunkSize, 1, in) != 1) { + fprintf(stderr, "Preumature EOF reading file \"%s\"\n", path); + return NULL; + } + + input->arrays.zBuffer = + (float *)&input->chunk[input->header.inputDataArrayOffsets[idaZBuffer]]; + input->arrays.normalEncoded_x = + (uint16_t *)&input->chunk[input->header.inputDataArrayOffsets[idaNormalEncoded_x]]; + input->arrays.normalEncoded_y = + (uint16_t *)&input->chunk[input->header.inputDataArrayOffsets[idaNormalEncoded_y]]; + input->arrays.specularAmount = + (uint16_t *)&input->chunk[input->header.inputDataArrayOffsets[idaSpecularAmount]]; + input->arrays.specularPower = + (uint16_t *)&input->chunk[input->header.inputDataArrayOffsets[idaSpecularPower]]; + input->arrays.albedo_x = + (uint8_t *)&input->chunk[input->header.inputDataArrayOffsets[idaAlbedo_x]]; + input->arrays.albedo_y = + (uint8_t *)&input->chunk[input->header.inputDataArrayOffsets[idaAlbedo_y]]; + input->arrays.albedo_z = + (uint8_t *)&input->chunk[input->header.inputDataArrayOffsets[idaAlbedo_z]]; + input->arrays.lightPositionView_x = + (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightPositionView_x]]; + input->arrays.lightPositionView_y = + (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightPositionView_y]]; + input->arrays.lightPositionView_z = + (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightPositionView_z]]; + input->arrays.lightAttenuationBegin = + (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightAttenuationBegin]]; + input->arrays.lightColor_x = + (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightColor_x]]; + input->arrays.lightColor_y = + (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightColor_y]]; + input->arrays.lightColor_z = + (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightColor_z]]; + input->arrays.lightAttenuationEnd = + (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightAttenuationEnd]]; + + fclose(in); + return input; +} + + +void DeleteInputData(InputData *input) { + lAlignedFree(input->chunk); +} + + +void WriteFrame(const char *filename, const InputData *input, + const Framebuffer &framebuffer) { + // Deswizzle and copy to RGBA output + // Doesn't need to be fast... only happens once + size_t imageBytes = 3 * input->header.framebufferWidth * + input->header.framebufferHeight; + uint8_t* framebufferAOS = (uint8_t *)lAlignedMalloc(imageBytes, ALIGNMENT_BYTES); + memset(framebufferAOS, 0, imageBytes); + + for (int i = 0; i < input->header.framebufferWidth * + input->header.framebufferHeight; ++i) { + framebufferAOS[3 * i + 0] = framebuffer.r[i]; + framebufferAOS[3 * i + 1] = framebuffer.g[i]; + framebufferAOS[3 * i + 2] = framebuffer.b[i]; + } + + // Write out simple PPM file + FILE *out = fopen(filename, "wb"); + fprintf(out, "P6 %d %d 255\n", input->header.framebufferWidth, + input->header.framebufferHeight); + fwrite(framebufferAOS, imageBytes, 1, out); + fclose(out); + + lAlignedFree(framebufferAOS); +} diff --git a/examples_ptx/deferred/data/pp1280x720.bin b/examples_ptx/deferred/data/pp1280x720.bin new file mode 100644 index 00000000..32a6a6af Binary files /dev/null and b/examples_ptx/deferred/data/pp1280x720.bin differ diff --git a/examples_ptx/deferred/data/pp1920x1200.bin b/examples_ptx/deferred/data/pp1920x1200.bin new file mode 100644 index 00000000..1bf84c46 Binary files /dev/null and b/examples_ptx/deferred/data/pp1920x1200.bin differ diff --git a/examples_ptx/deferred/deferred.h b/examples_ptx/deferred/deferred.h new file mode 100644 index 00000000..5e814ca5 --- /dev/null +++ b/examples_ptx/deferred/deferred.h @@ -0,0 +1,108 @@ +/* + Copyright (c) 2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef DEFERRED_H +#define DEFERRED_H + +// Currently tile widths must be a multiple of SIMD width (i.e. 8 for ispc sse4x2)! +#define MIN_TILE_WIDTH 16 +#define MIN_TILE_HEIGHT 16 +#define MAX_LIGHTS 1024 + +enum InputDataArraysEnum { + idaZBuffer = 0, + idaNormalEncoded_x, + idaNormalEncoded_y, + idaSpecularAmount, + idaSpecularPower, + idaAlbedo_x, + idaAlbedo_y, + idaAlbedo_z, + idaLightPositionView_x, + idaLightPositionView_y, + idaLightPositionView_z, + idaLightAttenuationBegin, + idaLightColor_x, + idaLightColor_y, + idaLightColor_z, + idaLightAttenuationEnd, + + idaNum +}; + +#ifndef ISPC + +#include +#include "kernels_ispc.h" + +#define ALIGNMENT_BYTES 64 + +#define MAX_LIGHTS 1024 + +#define VISUALIZE_LIGHT_COUNT 0 + +struct InputData +{ + ispc::InputHeader header; + ispc::InputDataArrays arrays; + uint8_t *chunk; +}; + + +struct Framebuffer { + Framebuffer(int width, int height); + ~Framebuffer(); + + void clear(); + + uint8_t *r, *g, *b; + +private: + int nPixels; + Framebuffer(const Framebuffer &); + Framebuffer &operator=(const Framebuffer *); +}; + + +InputData *CreateInputDataFromFile(const char *path); +void DeleteInputData(InputData *input); +void WriteFrame(const char *filename, const InputData *input, + const Framebuffer &framebuffer); +void InitDynamicC(InputData *input); +void InitDynamicCilk(InputData *input); +void DispatchDynamicC(InputData *input, Framebuffer *framebuffer); +void DispatchDynamicCilk(InputData *input, Framebuffer *framebuffer); + +#endif // !ISPC + +#endif // DEFERRED_H diff --git a/examples_ptx/deferred/deferred_shading.vcxproj b/examples_ptx/deferred/deferred_shading.vcxproj new file mode 100644 index 00000000..974e870b --- /dev/null +++ b/examples_ptx/deferred/deferred_shading.vcxproj @@ -0,0 +1,36 @@ + + + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + + + {87f53c53-957e-4e91-878a-bc27828fb9eb} + Win32Proj + deferred + kernels + sse2,sse4-x2,avx1-x2 + + + + + + + + + + diff --git a/examples_ptx/deferred/dynamic_c.cpp b/examples_ptx/deferred/dynamic_c.cpp new file mode 100644 index 00000000..8ed9a648 --- /dev/null +++ b/examples_ptx/deferred/dynamic_c.cpp @@ -0,0 +1,870 @@ +/* + Copyright (c) 2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include "deferred.h" +#include "kernels_ispc.h" +#include +#include +#include +#include + +#ifdef _MSC_VER +#define ISPC_IS_WINDOWS +#elif defined(__linux__) +#define ISPC_IS_LINUX +#elif defined(__APPLE__) +#define ISPC_IS_APPLE +#endif + +#ifdef ISPC_IS_LINUX +#include +#endif // ISPC_IS_LINUX + +// Currently tile widths must be a multiple of SIMD width (i.e. 8 for ispc sse4x2)! +#define MIN_TILE_WIDTH 16 +#define MIN_TILE_HEIGHT 16 + + +#define DYNAMIC_TREE_LEVELS 5 +// If this is set to 1 then the result will be identical to the static version +#define DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE 1 + +static void * +lAlignedMalloc(size_t size, int32_t alignment) { +#ifdef ISPC_IS_WINDOWS + return _aligned_malloc(size, alignment); +#endif +#ifdef ISPC_IS_LINUX + return memalign(alignment, size); +#endif +#ifdef ISPC_IS_APPLE + void *mem = malloc(size + (alignment-1) + sizeof(void*)); + char *amem = ((char*)mem) + sizeof(void*); + amem = amem + uint32_t(alignment - (reinterpret_cast(amem) & + (alignment - 1))); + ((void**)amem)[-1] = mem; + return amem; +#endif +} + + +static void +lAlignedFree(void *ptr) { +#ifdef ISPC_IS_WINDOWS + _aligned_free(ptr); +#endif +#ifdef ISPC_IS_LINUX + free(ptr); +#endif +#ifdef ISPC_IS_APPLE + free(((void**)ptr)[-1]); +#endif +} + + +static void +ComputeZBounds(int tileStartX, int tileEndX, + int tileStartY, int tileEndY, + // G-buffer data + float zBuffer[], + int gBufferWidth, + // Camera data + float cameraProj_33, float cameraProj_43, + float cameraNear, float cameraFar, + // Output + float *minZ, float *maxZ) +{ + // Find Z bounds + float laneMinZ = cameraFar; + float laneMaxZ = cameraNear; + for (int y = tileStartY; y < tileEndY; ++y) { + for (int x = tileStartX; x < tileEndX; ++x) { + // Unproject depth buffer Z value into view space + float z = zBuffer[(y * gBufferWidth + x)]; + float viewSpaceZ = cameraProj_43 / (z - cameraProj_33); + + // Work out Z bounds for our samples + // Avoid considering skybox/background or otherwise invalid pixels + if ((viewSpaceZ < cameraFar) && (viewSpaceZ >= cameraNear)) { + laneMinZ = std::min(laneMinZ, viewSpaceZ); + laneMaxZ = std::max(laneMaxZ, viewSpaceZ); + } + } + } + *minZ = laneMinZ; + *maxZ = laneMaxZ; +} + + +static void +ComputeZBoundsRow(int tileY, int tileWidth, int tileHeight, + int numTilesX, int numTilesY, + // G-buffer data + float zBuffer[], + int gBufferWidth, + // Camera data + float cameraProj_33, float cameraProj_43, + float cameraNear, float cameraFar, + // Output + float minZArray[], + float maxZArray[]) +{ + for (int tileX = 0; tileX < numTilesX; ++tileX) { + float minZ, maxZ; + ComputeZBounds(tileX * tileWidth, tileX * tileWidth + tileWidth, + tileY * tileHeight, tileY * tileHeight + tileHeight, + zBuffer, gBufferWidth, cameraProj_33, cameraProj_43, + cameraNear, cameraFar, &minZ, &maxZ); + minZArray[tileX] = minZ; + maxZArray[tileX] = maxZ; + } +} + + +class MinMaxZTree +{ +public: + // Currently (min) tile dimensions must divide gBuffer dimensions evenly + // Levels must be small enough that neither dimension goes below one tile + MinMaxZTree( + int tileWidth, int tileHeight, int levels, + int gBufferWidth, int gBufferHeight) + : mTileWidth(tileWidth), mTileHeight(tileHeight), mLevels(levels) + { + mNumTilesX = gBufferWidth / mTileWidth; + mNumTilesY = gBufferHeight / mTileHeight; + + // Allocate arrays + mMinZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16); + mMaxZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16); + for (int i = 0; i < mLevels; ++i) { + int x = NumTilesX(i); + int y = NumTilesY(i); + assert(x > 0); + assert(y > 0); + // NOTE: If the following two asserts fire it probably means that + // the base tile dimensions do not evenly divide the G-buffer dimensions + assert(x * (mTileWidth << i) >= gBufferWidth); + assert(y * (mTileHeight << i) >= gBufferHeight); + mMinZArrays[i] = (float *)lAlignedMalloc(sizeof(float) * x * y, 16); + mMaxZArrays[i] = (float *)lAlignedMalloc(sizeof(float) * x * y, 16); + } + } + + void Update(float *zBuffer, int gBufferPitchInElements, + float cameraProj_33, float cameraProj_43, + float cameraNear, float cameraFar) + { + for (int tileY = 0; tileY < mNumTilesY; ++tileY) { + ComputeZBoundsRow(tileY, mTileWidth, mTileHeight, mNumTilesX, mNumTilesY, + zBuffer, gBufferPitchInElements, + cameraProj_33, cameraProj_43, cameraNear, cameraFar, + mMinZArrays[0] + (tileY * mNumTilesX), + mMaxZArrays[0] + (tileY * mNumTilesX)); + } + + // Generate other levels + for (int level = 1; level < mLevels; ++level) { + int destTilesX = NumTilesX(level); + int destTilesY = NumTilesY(level); + int srcLevel = level - 1; + int srcTilesX = NumTilesX(srcLevel); + int srcTilesY = NumTilesY(srcLevel); + for (int y = 0; y < destTilesY; ++y) { + for (int x = 0; x < destTilesX; ++x) { + int srcX = x << 1; + int srcY = y << 1; + // NOTE: Ugly branches to deal with non-multiple dimensions at some levels + // TODO: SSE branchless min/max is probably better... + float minZ = mMinZArrays[srcLevel][(srcY) * srcTilesX + (srcX)]; + float maxZ = mMaxZArrays[srcLevel][(srcY) * srcTilesX + (srcX)]; + if (srcX + 1 < srcTilesX) { + minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY) * srcTilesX + + (srcX + 1)]); + maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY) * srcTilesX + + (srcX + 1)]); + if (srcY + 1 < srcTilesY) { + minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY + 1) * srcTilesX + + (srcX + 1)]); + maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY + 1) * srcTilesX + + (srcX + 1)]); + } + } + if (srcY + 1 < srcTilesY) { + minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY + 1) * srcTilesX + + (srcX )]); + maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY + 1) * srcTilesX + + (srcX )]); + } + mMinZArrays[level][y * destTilesX + x] = minZ; + mMaxZArrays[level][y * destTilesX + x] = maxZ; + } + } + } + } + + ~MinMaxZTree() { + for (int i = 0; i < mLevels; ++i) { + lAlignedFree(mMinZArrays[i]); + lAlignedFree(mMaxZArrays[i]); + } + lAlignedFree(mMinZArrays); + lAlignedFree(mMaxZArrays); + } + + int Levels() const { return mLevels; } + + // These round UP, so beware that the last tile for a given level may not be completely full + // TODO: Verify this... + int NumTilesX(int level = 0) const { return (mNumTilesX + (1 << level) - 1) >> level; } + int NumTilesY(int level = 0) const { return (mNumTilesY + (1 << level) - 1) >> level; } + int TileWidth(int level = 0) const { return (mTileWidth << level); } + int TileHeight(int level = 0) const { return (mTileHeight << level); } + + float MinZ(int level, int tileX, int tileY) const { + return mMinZArrays[level][tileY * NumTilesX(level) + tileX]; + } + float MaxZ(int level, int tileX, int tileY) const { + return mMaxZArrays[level][tileY * NumTilesX(level) + tileX]; + } + +private: + int mTileWidth; + int mTileHeight; + int mLevels; + int mNumTilesX; + int mNumTilesY; + + // One array for each "level" in the tree + float **mMinZArrays; + float **mMaxZArrays; +}; + +static MinMaxZTree *gMinMaxZTree = 0; + +void InitDynamicC(InputData *input) { + gMinMaxZTree = + new MinMaxZTree(MIN_TILE_WIDTH, MIN_TILE_HEIGHT, DYNAMIC_TREE_LEVELS, + input->header.framebufferWidth, + input->header.framebufferHeight); +} + + +/* We're going to split a tile into 4 sub-tiles. This function + reclassifies the tile's lights with respect to the sub-tiles. */ +static void +SplitTileMinMax( + int tileMidX, int tileMidY, + // Subtile data (00, 10, 01, 11) + float subtileMinZ[], + float subtileMaxZ[], + // G-buffer data + int gBufferWidth, int gBufferHeight, + // Camera data + float cameraProj_11, float cameraProj_22, + // Light Data + int lightIndices[], + int numLights, + float light_positionView_x_array[], + float light_positionView_y_array[], + float light_positionView_z_array[], + float light_attenuationEnd_array[], + // Outputs + int subtileIndices[], + int subtileIndicesPitch, + int subtileNumLights[] + ) +{ + float gBufferScale_x = 0.5f * (float)gBufferWidth; + float gBufferScale_y = 0.5f * (float)gBufferHeight; + + float frustumPlanes_xy[2] = { -(cameraProj_11 * gBufferScale_x), + (cameraProj_22 * gBufferScale_y) }; + float frustumPlanes_z[2] = { tileMidX - gBufferScale_x, + tileMidY - gBufferScale_y }; + + for (int i = 0; i < 2; ++i) { + // Normalize + float norm = 1.f / sqrtf(frustumPlanes_xy[i] * frustumPlanes_xy[i] + + frustumPlanes_z[i] * frustumPlanes_z[i]); + frustumPlanes_xy[i] *= norm; + frustumPlanes_z[i] *= norm; + } + + // Initialize + int subtileLightOffset[4]; + subtileLightOffset[0] = 0 * subtileIndicesPitch; + subtileLightOffset[1] = 1 * subtileIndicesPitch; + subtileLightOffset[2] = 2 * subtileIndicesPitch; + subtileLightOffset[3] = 3 * subtileIndicesPitch; + + for (int i = 0; i < numLights; ++i) { + int lightIndex = lightIndices[i]; + + float light_positionView_x = light_positionView_x_array[lightIndex]; + float light_positionView_y = light_positionView_y_array[lightIndex]; + float light_positionView_z = light_positionView_z_array[lightIndex]; + float light_attenuationEnd = light_attenuationEnd_array[lightIndex]; + float light_attenuationEndNeg = -light_attenuationEnd; + + // Test lights again against subtile z bounds + bool inFrustum[4]; + inFrustum[0] = (light_positionView_z - subtileMinZ[0] >= light_attenuationEndNeg) && + (subtileMaxZ[0] - light_positionView_z >= light_attenuationEndNeg); + inFrustum[1] = (light_positionView_z - subtileMinZ[1] >= light_attenuationEndNeg) && + (subtileMaxZ[1] - light_positionView_z >= light_attenuationEndNeg); + inFrustum[2] = (light_positionView_z - subtileMinZ[2] >= light_attenuationEndNeg) && + (subtileMaxZ[2] - light_positionView_z >= light_attenuationEndNeg); + inFrustum[3] = (light_positionView_z - subtileMinZ[3] >= light_attenuationEndNeg) && + (subtileMaxZ[3] - light_positionView_z >= light_attenuationEndNeg); + + float dx = light_positionView_z * frustumPlanes_z[0] + + light_positionView_x * frustumPlanes_xy[0]; + float dy = light_positionView_z * frustumPlanes_z[1] + + light_positionView_y * frustumPlanes_xy[1]; + + if (fabsf(dx) > light_attenuationEnd) { + bool positiveX = dx > 0.0f; + inFrustum[0] = inFrustum[0] && positiveX; // 00 subtile + inFrustum[1] = inFrustum[1] && !positiveX; // 10 subtile + inFrustum[2] = inFrustum[2] && positiveX; // 01 subtile + inFrustum[3] = inFrustum[3] && !positiveX; // 11 subtile + } + if (fabsf(dy) > light_attenuationEnd) { + bool positiveY = dy > 0.0f; + inFrustum[0] = inFrustum[0] && positiveY; // 00 subtile + inFrustum[1] = inFrustum[1] && positiveY; // 10 subtile + inFrustum[2] = inFrustum[2] && !positiveY; // 01 subtile + inFrustum[3] = inFrustum[3] && !positiveY; // 11 subtile + } + + if (inFrustum[0]) + subtileIndices[subtileLightOffset[0]++] = lightIndex; + if (inFrustum[1]) + subtileIndices[subtileLightOffset[1]++] = lightIndex; + if (inFrustum[2]) + subtileIndices[subtileLightOffset[2]++] = lightIndex; + if (inFrustum[3]) + subtileIndices[subtileLightOffset[3]++] = lightIndex; + } + + subtileNumLights[0] = subtileLightOffset[0] - 0 * subtileIndicesPitch; + subtileNumLights[1] = subtileLightOffset[1] - 1 * subtileIndicesPitch; + subtileNumLights[2] = subtileLightOffset[2] - 2 * subtileIndicesPitch; + subtileNumLights[3] = subtileLightOffset[3] - 3 * subtileIndicesPitch; +} + + +static inline float +dot3(float x, float y, float z, float a, float b, float c) { + return (x*a + y*b + z*c); +} + + +static inline void +normalize3(float x, float y, float z, float &ox, float &oy, float &oz) { + float n = 1.f / sqrtf(x*x + y*y + z*z); + ox = x * n; + oy = y * n; + oz = z * n; +} + + +static inline float +Unorm8ToFloat32(uint8_t u) { + return (float)u * (1.0f / 255.0f); +} + + +static inline uint8_t +Float32ToUnorm8(float f) { + return (uint8_t)(f * 255.0f); +} + + +static inline float +half_to_float_fast(uint16_t h) { + uint32_t hs = h & (int32_t)0x8000u; // Pick off sign bit + uint32_t he = h & (int32_t)0x7C00u; // Pick off exponent bits + uint32_t hm = h & (int32_t)0x03FFu; // Pick off mantissa bits + + // sign + uint32_t xs = ((uint32_t) hs) << 16; + // Exponent: unbias the halfp, then bias the single + int32_t xes = ((int32_t) (he >> 10)) - 15 + 127; + // Exponent + uint32_t xe = (uint32_t) (xes << 23); + // Mantissa + uint32_t xm = ((uint32_t) hm) << 13; + + uint32_t bits = (xs | xe | xm); + float *fp = reinterpret_cast(&bits); + return *fp; +} + + +static void +ShadeTileC( + int32_t tileStartX, int32_t tileEndX, + int32_t tileStartY, int32_t tileEndY, + int32_t gBufferWidth, int32_t gBufferHeight, + const ispc::InputDataArrays &inputData, + // Camera data + float cameraProj_11, float cameraProj_22, + float cameraProj_33, float cameraProj_43, + // Light list + int32_t tileLightIndices[], + int32_t tileNumLights, + // UI + bool visualizeLightCount, + // Output + uint8_t framebuffer_r[], + uint8_t framebuffer_g[], + uint8_t framebuffer_b[] + ) +{ + if (tileNumLights == 0 || visualizeLightCount) { + uint8_t c = (uint8_t)(std::min(tileNumLights << 2, 255)); + for (int32_t y = tileStartY; y < tileEndY; ++y) { + for (int32_t x = tileStartX; x < tileEndX; ++x) { + int32_t framebufferIndex = (y * gBufferWidth + x); + framebuffer_r[framebufferIndex] = c; + framebuffer_g[framebufferIndex] = c; + framebuffer_b[framebufferIndex] = c; + } + } + } else { + float twoOverGBufferWidth = 2.0f / gBufferWidth; + float twoOverGBufferHeight = 2.0f / gBufferHeight; + + for (int32_t y = tileStartY; y < tileEndY; ++y) { + float positionScreen_y = -(((0.5f + y) * twoOverGBufferHeight) - 1.f); + + for (int32_t x = tileStartX; x < tileEndX; ++x) { + int32_t gBufferOffset = y * gBufferWidth + x; + + // Reconstruct position and (negative) view vector from G-buffer + float surface_positionView_x, surface_positionView_y, surface_positionView_z; + float Vneg_x, Vneg_y, Vneg_z; + + float z = inputData.zBuffer[gBufferOffset]; + + // Compute screen/clip-space position + // NOTE: Mind DX11 viewport transform and pixel center! + float positionScreen_x = (0.5f + (float)(x)) * + twoOverGBufferWidth - 1.0f; + + // Unproject depth buffer Z value into view space + surface_positionView_z = cameraProj_43 / (z - cameraProj_33); + surface_positionView_x = positionScreen_x * surface_positionView_z / + cameraProj_11; + surface_positionView_y = positionScreen_y * surface_positionView_z / + cameraProj_22; + + // We actually end up with a vector pointing *at* the + // surface (i.e. the negative view vector) + normalize3(surface_positionView_x, surface_positionView_y, + surface_positionView_z, Vneg_x, Vneg_y, Vneg_z); + + // Reconstruct normal from G-buffer + float surface_normal_x, surface_normal_y, surface_normal_z; + float normal_x = half_to_float_fast(inputData.normalEncoded_x[gBufferOffset]); + float normal_y = half_to_float_fast(inputData.normalEncoded_y[gBufferOffset]); + + float f = (normal_x - normal_x * normal_x) + (normal_y - normal_y * normal_y); + float m = sqrtf(4.0f * f - 1.0f); + + surface_normal_x = m * (4.0f * normal_x - 2.0f); + surface_normal_y = m * (4.0f * normal_y - 2.0f); + surface_normal_z = 3.0f - 8.0f * f; + + // Load other G-buffer parameters + float surface_specularAmount = + half_to_float_fast(inputData.specularAmount[gBufferOffset]); + float surface_specularPower = + half_to_float_fast(inputData.specularPower[gBufferOffset]); + float surface_albedo_x = Unorm8ToFloat32(inputData.albedo_x[gBufferOffset]); + float surface_albedo_y = Unorm8ToFloat32(inputData.albedo_y[gBufferOffset]); + float surface_albedo_z = Unorm8ToFloat32(inputData.albedo_z[gBufferOffset]); + + float lit_x = 0.0f; + float lit_y = 0.0f; + float lit_z = 0.0f; + for (int32_t tileLightIndex = 0; tileLightIndex < tileNumLights; + ++tileLightIndex) { + int32_t lightIndex = tileLightIndices[tileLightIndex]; + + // Gather light data relevant to initial culling + float light_positionView_x = + inputData.lightPositionView_x[lightIndex]; + float light_positionView_y = + inputData.lightPositionView_y[lightIndex]; + float light_positionView_z = + inputData.lightPositionView_z[lightIndex]; + float light_attenuationEnd = + inputData.lightAttenuationEnd[lightIndex]; + + // Compute light vector + float L_x = light_positionView_x - surface_positionView_x; + float L_y = light_positionView_y - surface_positionView_y; + float L_z = light_positionView_z - surface_positionView_z; + + float distanceToLight2 = dot3(L_x, L_y, L_z, L_x, L_y, L_z); + + // Clip at end of attenuation + float light_attenutaionEnd2 = light_attenuationEnd * light_attenuationEnd; + + if (distanceToLight2 < light_attenutaionEnd2) { + float distanceToLight = sqrtf(distanceToLight2); + + float distanceToLightRcp = 1.f / distanceToLight; + L_x *= distanceToLightRcp; + L_y *= distanceToLightRcp; + L_z *= distanceToLightRcp; + + // Start computing brdf + float NdotL = dot3(surface_normal_x, surface_normal_y, + surface_normal_z, L_x, L_y, L_z); + + // Clip back facing + if (NdotL > 0.0f) { + float light_attenuationBegin = + inputData.lightAttenuationBegin[lightIndex]; + + // Light distance attenuation (linstep) + float lightRange = (light_attenuationEnd - light_attenuationBegin); + float falloffPosition = (light_attenuationEnd - distanceToLight); + float attenuation = std::min(falloffPosition / lightRange, 1.0f); + + float H_x = (L_x - Vneg_x); + float H_y = (L_y - Vneg_y); + float H_z = (L_z - Vneg_z); + normalize3(H_x, H_y, H_z, H_x, H_y, H_z); + + float NdotH = dot3(surface_normal_x, surface_normal_y, + surface_normal_z, H_x, H_y, H_z); + NdotH = std::max(NdotH, 0.0f); + + float specular = powf(NdotH, surface_specularPower); + float specularNorm = (surface_specularPower + 2.0f) * + (1.0f / 8.0f); + float specularContrib = surface_specularAmount * + specularNorm * specular; + + float k = attenuation * NdotL * (1.0f + specularContrib); + + float light_color_x = inputData.lightColor_x[lightIndex]; + float light_color_y = inputData.lightColor_y[lightIndex]; + float light_color_z = inputData.lightColor_z[lightIndex]; + + float lightContrib_x = surface_albedo_x * light_color_x; + float lightContrib_y = surface_albedo_y * light_color_y; + float lightContrib_z = surface_albedo_z * light_color_z; + + lit_x += lightContrib_x * k; + lit_y += lightContrib_y * k; + lit_z += lightContrib_z * k; + } + } + } + + // Gamma correct + float gamma = 1.0 / 2.2f; + lit_x = powf(std::min(std::max(lit_x, 0.0f), 1.0f), gamma); + lit_y = powf(std::min(std::max(lit_y, 0.0f), 1.0f), gamma); + lit_z = powf(std::min(std::max(lit_z, 0.0f), 1.0f), gamma); + + framebuffer_r[gBufferOffset] = Float32ToUnorm8(lit_x); + framebuffer_g[gBufferOffset] = Float32ToUnorm8(lit_y); + framebuffer_b[gBufferOffset] = Float32ToUnorm8(lit_z); + } + } + } +} + + +void +ShadeDynamicTileRecurse(InputData *input, int level, int tileX, int tileY, + int *lightIndices, int numLights, + Framebuffer *framebuffer) { + const MinMaxZTree *minMaxZTree = gMinMaxZTree; + + // If we few enough lights or this is the base case (last level), shade + // this full tile directly + if (level == 0 || numLights < DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE) { + int width = minMaxZTree->TileWidth(level); + int height = minMaxZTree->TileHeight(level); + int startX = tileX * width; + int startY = tileY * height; + int endX = std::min(input->header.framebufferWidth, startX + width); + int endY = std::min(input->header.framebufferHeight, startY + height); + + // Skip entirely offscreen tiles + if (endX > startX && endY > startY) { + ShadeTileC(startX, endX, startY, endY, + input->header.framebufferWidth, input->header.framebufferHeight, + input->arrays, + input->header.cameraProj[0][0], input->header.cameraProj[1][1], + input->header.cameraProj[2][2], input->header.cameraProj[3][2], + lightIndices, numLights, VISUALIZE_LIGHT_COUNT, + framebuffer->r, framebuffer->g, framebuffer->b); + } + } + else { + // Otherwise, subdivide and 4-way recurse using X and Y splitting planes + // Move down a level in the tree + --level; + tileX <<= 1; + tileY <<= 1; + int width = minMaxZTree->TileWidth(level); + int height = minMaxZTree->TileHeight(level); + + // Work out splitting coords + int midX = (tileX + 1) * width; + int midY = (tileY + 1) * height; + + // Read subtile min/max data + // NOTE: We must be sure to handle out-of-bounds access here since + // sometimes we'll only have 1 or 2 subtiles for non-pow-2 + // framebuffer sizes. + bool rightTileExists = (tileX + 1 < minMaxZTree->NumTilesX(level)); + bool bottomTileExists = (tileY + 1 < minMaxZTree->NumTilesY(level)); + + // NOTE: Order is 00, 10, 01, 11 + // Set defaults up to cull all lights if the tile doesn't exist (offscreen) + float minZ[4] = {input->header.cameraFar, input->header.cameraFar, + input->header.cameraFar, input->header.cameraFar}; + float maxZ[4] = {input->header.cameraNear, input->header.cameraNear, + input->header.cameraNear, input->header.cameraNear}; + + minZ[0] = minMaxZTree->MinZ(level, tileX, tileY); + maxZ[0] = minMaxZTree->MaxZ(level, tileX, tileY); + if (rightTileExists) { + minZ[1] = minMaxZTree->MinZ(level, tileX + 1, tileY); + maxZ[1] = minMaxZTree->MaxZ(level, tileX + 1, tileY); + if (bottomTileExists) { + minZ[3] = minMaxZTree->MinZ(level, tileX + 1, tileY + 1); + maxZ[3] = minMaxZTree->MaxZ(level, tileX + 1, tileY + 1); + } + } + if (bottomTileExists) { + minZ[2] = minMaxZTree->MinZ(level, tileX, tileY + 1); + maxZ[2] = minMaxZTree->MaxZ(level, tileX, tileY + 1); + } + + // Cull lights into subtile lists +#ifdef ISPC_IS_WINDOWS + __declspec(align(ALIGNMENT_BYTES)) +#endif + int subtileLightIndices[4][MAX_LIGHTS] +#ifndef ISPC_IS_WINDOWS + __attribute__ ((aligned(ALIGNMENT_BYTES))) +#endif +; + int subtileNumLights[4]; + SplitTileMinMax(midX, midY, minZ, maxZ, + input->header.framebufferWidth, input->header.framebufferHeight, + input->header.cameraProj[0][0], input->header.cameraProj[1][1], + lightIndices, numLights, input->arrays.lightPositionView_x, + input->arrays.lightPositionView_y, input->arrays.lightPositionView_z, + input->arrays.lightAttenuationEnd, + subtileLightIndices[0], MAX_LIGHTS, subtileNumLights); + + // Recurse into subtiles + ShadeDynamicTileRecurse(input, level, tileX , tileY, + subtileLightIndices[0], subtileNumLights[0], + framebuffer); + ShadeDynamicTileRecurse(input, level, tileX + 1, tileY, + subtileLightIndices[1], subtileNumLights[1], + framebuffer); + ShadeDynamicTileRecurse(input, level, tileX , tileY + 1, + subtileLightIndices[2], subtileNumLights[2], + framebuffer); + ShadeDynamicTileRecurse(input, level, tileX + 1, tileY + 1, + subtileLightIndices[3], subtileNumLights[3], + framebuffer); + } +} + + +static int +IntersectLightsWithTileMinMax( + int tileStartX, int tileEndX, + int tileStartY, int tileEndY, + // Tile data + float minZ, + float maxZ, + // G-buffer data + int gBufferWidth, int gBufferHeight, + // Camera data + float cameraProj_11, float cameraProj_22, + // Light Data + int numLights, + float light_positionView_x_array[], + float light_positionView_y_array[], + float light_positionView_z_array[], + float light_attenuationEnd_array[], + // Output + int tileLightIndices[] + ) +{ + float gBufferScale_x = 0.5f * (float)gBufferWidth; + float gBufferScale_y = 0.5f * (float)gBufferHeight; + + float frustumPlanes_xy[4]; + float frustumPlanes_z[4]; + + // This one is totally constant over the whole screen... worth pulling it up at all? + float frustumPlanes_xy_v[4] = { -(cameraProj_11 * gBufferScale_x), + (cameraProj_11 * gBufferScale_x), + (cameraProj_22 * gBufferScale_y), + -(cameraProj_22 * gBufferScale_y) }; + + float frustumPlanes_z_v[4] = { tileEndX - gBufferScale_x, + -tileStartX + gBufferScale_x, + tileEndY - gBufferScale_y, + -tileStartY + gBufferScale_y }; + + for (int i = 0; i < 4; ++i) { + float norm = 1.f / sqrtf(frustumPlanes_xy_v[i] * frustumPlanes_xy_v[i] + + frustumPlanes_z_v[i] * frustumPlanes_z_v[i]); + frustumPlanes_xy_v[i] *= norm; + frustumPlanes_z_v[i] *= norm; + + frustumPlanes_xy[i] = frustumPlanes_xy_v[i]; + frustumPlanes_z[i] = frustumPlanes_z_v[i]; + } + + int tileNumLights = 0; + + for (int lightIndex = 0; lightIndex < numLights; ++lightIndex) { + float light_positionView_z = light_positionView_z_array[lightIndex]; + float light_attenuationEnd = light_attenuationEnd_array[lightIndex]; + float light_attenuationEndNeg = -light_attenuationEnd; + + float d = light_positionView_z - minZ; + bool inFrustum = (d >= light_attenuationEndNeg); + + d = maxZ - light_positionView_z; + inFrustum = inFrustum && (d >= light_attenuationEndNeg); + + if (!inFrustum) + continue; + + float light_positionView_x = light_positionView_x_array[lightIndex]; + float light_positionView_y = light_positionView_y_array[lightIndex]; + + d = light_positionView_z * frustumPlanes_z[0] + + light_positionView_x * frustumPlanes_xy[0]; + inFrustum = inFrustum && (d >= light_attenuationEndNeg); + + d = light_positionView_z * frustumPlanes_z[1] + + light_positionView_x * frustumPlanes_xy[1]; + inFrustum = inFrustum && (d >= light_attenuationEndNeg); + + d = light_positionView_z * frustumPlanes_z[2] + + light_positionView_y * frustumPlanes_xy[2]; + inFrustum = inFrustum && (d >= light_attenuationEndNeg); + + d = light_positionView_z * frustumPlanes_z[3] + + light_positionView_y * frustumPlanes_xy[3]; + inFrustum = inFrustum && (d >= light_attenuationEndNeg); + + // Pack and store intersecting lights + if (inFrustum) + tileLightIndices[tileNumLights++] = lightIndex; + } + + return tileNumLights; +} + + +void +ShadeDynamicTile(InputData *input, int level, int tileX, int tileY, + Framebuffer *framebuffer) { + const MinMaxZTree *minMaxZTree = gMinMaxZTree; + + // Get Z min/max for this tile + int width = minMaxZTree->TileWidth(level); + int height = minMaxZTree->TileHeight(level); + float minZ = minMaxZTree->MinZ(level, tileX, tileY); + float maxZ = minMaxZTree->MaxZ(level, tileX, tileY); + + int startX = tileX * width; + int startY = tileY * height; + int endX = std::min(input->header.framebufferWidth, startX + width); + int endY = std::min(input->header.framebufferHeight, startY + height); + + // This is a root tile, so first do a full 6-plane cull +#ifdef ISPC_IS_WINDOWS + __declspec(align(ALIGNMENT_BYTES)) +#endif + int lightIndices[MAX_LIGHTS] +#ifndef ISPC_IS_WINDOWS + __attribute__ ((aligned(ALIGNMENT_BYTES))) +#endif +; + int numLights = IntersectLightsWithTileMinMax( + startX, endX, startY, endY, minZ, maxZ, + input->header.framebufferWidth, input->header.framebufferHeight, + input->header.cameraProj[0][0], input->header.cameraProj[1][1], + MAX_LIGHTS, input->arrays.lightPositionView_x, + input->arrays.lightPositionView_y, input->arrays.lightPositionView_z, + input->arrays.lightAttenuationEnd, lightIndices); + + // Now kick off the recursive process for this tile + ShadeDynamicTileRecurse(input, level, tileX, tileY, lightIndices, + numLights, framebuffer); +} + + +void +DispatchDynamicC(InputData *input, Framebuffer *framebuffer) +{ + MinMaxZTree *minMaxZTree = gMinMaxZTree; + + // Update min/max Z tree + minMaxZTree->Update(input->arrays.zBuffer, input->header.framebufferWidth, + input->header.cameraProj[2][2], input->header.cameraProj[3][2], + input->header.cameraNear, input->header.cameraFar); + + int rootLevel = minMaxZTree->Levels() - 1; + int rootTilesX = minMaxZTree->NumTilesX(rootLevel); + int rootTilesY = minMaxZTree->NumTilesY(rootLevel); + int rootTiles = rootTilesX * rootTilesY; + for (int g = 0; g < rootTiles; ++g) { + uint32_t tileY = g / rootTilesX; + uint32_t tileX = g % rootTilesX; + ShadeDynamicTile(input, rootLevel, tileX, tileY, framebuffer); + } +} diff --git a/examples_ptx/deferred/dynamic_cilk.cpp b/examples_ptx/deferred/dynamic_cilk.cpp new file mode 100644 index 00000000..87a0c7da --- /dev/null +++ b/examples_ptx/deferred/dynamic_cilk.cpp @@ -0,0 +1,398 @@ +/* + Copyright (c) 2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifdef __cilk + +#include "deferred.h" +#include "kernels_ispc.h" +#include +#include + +#ifdef _MSC_VER +#define ISPC_IS_WINDOWS +#elif defined(__linux__) +#define ISPC_IS_LINUX +#elif defined(__APPLE__) +#define ISPC_IS_APPLE +#endif + +#ifdef ISPC_IS_LINUX +#include +#endif // ISPC_IS_LINUX + +// Currently tile widths must be a multiple of SIMD width (i.e. 8 for ispc sse4x2)! +#define MIN_TILE_WIDTH 16 +#define MIN_TILE_HEIGHT 16 + + +#define DYNAMIC_TREE_LEVELS 5 +// If this is set to 1 then the result will be identical to the static version +#define DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE 1 + +static void * +lAlignedMalloc(size_t size, int32_t alignment) { +#ifdef ISPC_IS_WINDOWS + return _aligned_malloc(size, alignment); +#endif +#ifdef ISPC_IS_LINUX + return memalign(alignment, size); +#endif +#ifdef ISPC_IS_APPLE + void *mem = malloc(size + (alignment-1) + sizeof(void*)); + char *amem = ((char*)mem) + sizeof(void*); + amem = amem + uint32_t(alignment - (reinterpret_cast(amem) & + (alignment - 1))); + ((void**)amem)[-1] = mem; + return amem; +#endif +} + + +static void +lAlignedFree(void *ptr) { +#ifdef ISPC_IS_WINDOWS + _aligned_free(ptr); +#endif +#ifdef ISPC_IS_LINUX + free(ptr); +#endif +#ifdef ISPC_IS_APPLE + free(((void**)ptr)[-1]); +#endif +} + + +class MinMaxZTreeCilk +{ +public: + // Currently (min) tile dimensions must divide gBuffer dimensions evenly + // Levels must be small enough that neither dimension goes below one tile + MinMaxZTreeCilk( + int tileWidth, int tileHeight, int levels, + int gBufferWidth, int gBufferHeight) + : mTileWidth(tileWidth), mTileHeight(tileHeight), mLevels(levels) + { + mNumTilesX = gBufferWidth / mTileWidth; + mNumTilesY = gBufferHeight / mTileHeight; + + // Allocate arrays + mMinZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16); + mMaxZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16); + for (int i = 0; i < mLevels; ++i) { + int x = NumTilesX(i); + int y = NumTilesY(i); + assert(x > 0); + assert(y > 0); + // NOTE: If the following two asserts fire it probably means that + // the base tile dimensions do not evenly divide the G-buffer dimensions + assert(x * (mTileWidth << i) >= gBufferWidth); + assert(y * (mTileHeight << i) >= gBufferHeight); + mMinZArrays[i] = (float *)lAlignedMalloc(sizeof(float) * x * y, 16); + mMaxZArrays[i] = (float *)lAlignedMalloc(sizeof(float) * x * y, 16); + } + } + + void Update(float *zBuffer, int gBufferPitchInElements, + float cameraProj_33, float cameraProj_43, + float cameraNear, float cameraFar) + { + // Compute level 0 in parallel. Outer loops is here since we use Cilk + _Cilk_for (int tileY = 0; tileY < mNumTilesY; ++tileY) { + ispc::ComputeZBoundsRow(tileY, + mTileWidth, mTileHeight, mNumTilesX, mNumTilesY, + zBuffer, gBufferPitchInElements, + cameraProj_33, cameraProj_43, cameraNear, cameraFar, + mMinZArrays[0] + (tileY * mNumTilesX), + mMaxZArrays[0] + (tileY * mNumTilesX)); + } + + // Generate other levels + // NOTE: We currently don't use ispc here since it's sort of an + // awkward gather-based reduction Using SSE odd pack/unpack + // instructions might actually work here when we need to optimize + for (int level = 1; level < mLevels; ++level) { + int destTilesX = NumTilesX(level); + int destTilesY = NumTilesY(level); + int srcLevel = level - 1; + int srcTilesX = NumTilesX(srcLevel); + int srcTilesY = NumTilesY(srcLevel); + _Cilk_for (int y = 0; y < destTilesY; ++y) { + for (int x = 0; x < destTilesX; ++x) { + int srcX = x << 1; + int srcY = y << 1; + // NOTE: Ugly branches to deal with non-multiple dimensions at some levels + // TODO: SSE branchless min/max is probably better... + float minZ = mMinZArrays[srcLevel][(srcY) * srcTilesX + (srcX)]; + float maxZ = mMaxZArrays[srcLevel][(srcY) * srcTilesX + (srcX)]; + if (srcX + 1 < srcTilesX) { + minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY) * srcTilesX + + (srcX + 1)]); + maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY) * srcTilesX + + (srcX + 1)]); + if (srcY + 1 < srcTilesY) { + minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY + 1) * srcTilesX + + (srcX + 1)]); + maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY + 1) * srcTilesX + + (srcX + 1)]); + } + } + if (srcY + 1 < srcTilesY) { + minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY + 1) * srcTilesX + + (srcX )]); + maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY + 1) * srcTilesX + + (srcX )]); + } + mMinZArrays[level][y * destTilesX + x] = minZ; + mMaxZArrays[level][y * destTilesX + x] = maxZ; + } + } + } + } + + ~MinMaxZTreeCilk() { + for (int i = 0; i < mLevels; ++i) { + lAlignedFree(mMinZArrays[i]); + lAlignedFree(mMaxZArrays[i]); + } + lAlignedFree(mMinZArrays); + lAlignedFree(mMaxZArrays); + } + + int Levels() const { return mLevels; } + + // These round UP, so beware that the last tile for a given level may not be completely full + // TODO: Verify this... + int NumTilesX(int level = 0) const { return (mNumTilesX + (1 << level) - 1) >> level; } + int NumTilesY(int level = 0) const { return (mNumTilesY + (1 << level) - 1) >> level; } + int TileWidth(int level = 0) const { return (mTileWidth << level); } + int TileHeight(int level = 0) const { return (mTileHeight << level); } + + float MinZ(int level, int tileX, int tileY) const { + return mMinZArrays[level][tileY * NumTilesX(level) + tileX]; + } + float MaxZ(int level, int tileX, int tileY) const { + return mMaxZArrays[level][tileY * NumTilesX(level) + tileX]; + } + +private: + int mTileWidth; + int mTileHeight; + int mLevels; + int mNumTilesX; + int mNumTilesY; + + // One array for each "level" in the tree + float **mMinZArrays; + float **mMaxZArrays; +}; + +static MinMaxZTreeCilk *gMinMaxZTreeCilk = 0; + +void InitDynamicCilk(InputData *input) { + gMinMaxZTreeCilk = + new MinMaxZTreeCilk(MIN_TILE_WIDTH, MIN_TILE_HEIGHT, DYNAMIC_TREE_LEVELS, + input->header.framebufferWidth, + input->header.framebufferHeight); +} + + +static void +ShadeDynamicTileRecurse(InputData *input, int level, int tileX, int tileY, + int *lightIndices, int numLights, + Framebuffer *framebuffer) { + const MinMaxZTreeCilk *minMaxZTree = gMinMaxZTreeCilk; + + // If we few enough lights or this is the base case (last level), shade + // this full tile directly + if (level == 0 || numLights < DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE) { + int width = minMaxZTree->TileWidth(level); + int height = minMaxZTree->TileHeight(level); + int startX = tileX * width; + int startY = tileY * height; + int endX = std::min(input->header.framebufferWidth, startX + width); + int endY = std::min(input->header.framebufferHeight, startY + height); + + // Skip entirely offscreen tiles + if (endX > startX && endY > startY) { + ispc::ShadeTile( + startX, endX, startY, endY, + input->header.framebufferWidth, input->header.framebufferHeight, + &input->arrays, + input->header.cameraProj[0][0], input->header.cameraProj[1][1], + input->header.cameraProj[2][2], input->header.cameraProj[3][2], + lightIndices, numLights, VISUALIZE_LIGHT_COUNT, + framebuffer->r, framebuffer->g, framebuffer->b); + } + } + else { + // Otherwise, subdivide and 4-way recurse using X and Y splitting planes + // Move down a level in the tree + --level; + tileX <<= 1; + tileY <<= 1; + int width = minMaxZTree->TileWidth(level); + int height = minMaxZTree->TileHeight(level); + + // Work out splitting coords + int midX = (tileX + 1) * width; + int midY = (tileY + 1) * height; + + // Read subtile min/max data + // NOTE: We must be sure to handle out-of-bounds access here since + // sometimes we'll only have 1 or 2 subtiles for non-pow-2 + // framebuffer sizes. + bool rightTileExists = (tileX + 1 < minMaxZTree->NumTilesX(level)); + bool bottomTileExists = (tileY + 1 < minMaxZTree->NumTilesY(level)); + + // NOTE: Order is 00, 10, 01, 11 + // Set defaults up to cull all lights if the tile doesn't exist (offscreen) + float minZ[4] = {input->header.cameraFar, input->header.cameraFar, + input->header.cameraFar, input->header.cameraFar}; + float maxZ[4] = {input->header.cameraNear, input->header.cameraNear, + input->header.cameraNear, input->header.cameraNear}; + + minZ[0] = minMaxZTree->MinZ(level, tileX, tileY); + maxZ[0] = minMaxZTree->MaxZ(level, tileX, tileY); + if (rightTileExists) { + minZ[1] = minMaxZTree->MinZ(level, tileX + 1, tileY); + maxZ[1] = minMaxZTree->MaxZ(level, tileX + 1, tileY); + if (bottomTileExists) { + minZ[3] = minMaxZTree->MinZ(level, tileX + 1, tileY + 1); + maxZ[3] = minMaxZTree->MaxZ(level, tileX + 1, tileY + 1); + } + } + if (bottomTileExists) { + minZ[2] = minMaxZTree->MinZ(level, tileX, tileY + 1); + maxZ[2] = minMaxZTree->MaxZ(level, tileX, tileY + 1); + } + + // Cull lights into subtile lists +#ifdef ISPC_IS_WINDOWS + __declspec(align(ALIGNMENT_BYTES)) +#endif + int subtileLightIndices[4][MAX_LIGHTS] +#ifndef ISPC_IS_WINDOWS + __attribute__ ((aligned(ALIGNMENT_BYTES))) +#endif +; + int subtileNumLights[4]; + ispc::SplitTileMinMax(midX, midY, minZ, maxZ, + input->header.framebufferWidth, input->header.framebufferHeight, + input->header.cameraProj[0][0], input->header.cameraProj[1][1], + lightIndices, numLights, input->arrays.lightPositionView_x, + input->arrays.lightPositionView_y, input->arrays.lightPositionView_z, + input->arrays.lightAttenuationEnd, + subtileLightIndices[0], MAX_LIGHTS, subtileNumLights); + + // Recurse into subtiles + _Cilk_spawn ShadeDynamicTileRecurse(input, level, tileX , tileY, + subtileLightIndices[0], subtileNumLights[0], + framebuffer); + _Cilk_spawn ShadeDynamicTileRecurse(input, level, tileX + 1, tileY, + subtileLightIndices[1], subtileNumLights[1], + framebuffer); + _Cilk_spawn ShadeDynamicTileRecurse(input, level, tileX , tileY + 1, + subtileLightIndices[2], subtileNumLights[2], + framebuffer); + ShadeDynamicTileRecurse(input, level, tileX + 1, tileY + 1, + subtileLightIndices[3], subtileNumLights[3], + framebuffer); + } +} + + +static void +ShadeDynamicTile(InputData *input, int level, int tileX, int tileY, + Framebuffer *framebuffer) { + const MinMaxZTreeCilk *minMaxZTree = gMinMaxZTreeCilk; + + // Get Z min/max for this tile + int width = minMaxZTree->TileWidth(level); + int height = minMaxZTree->TileHeight(level); + float minZ = minMaxZTree->MinZ(level, tileX, tileY); + float maxZ = minMaxZTree->MaxZ(level, tileX, tileY); + + int startX = tileX * width; + int startY = tileY * height; + int endX = std::min(input->header.framebufferWidth, startX + width); + int endY = std::min(input->header.framebufferHeight, startY + height); + + // This is a root tile, so first do a full 6-plane cull +#ifdef ISPC_IS_WINDOWS + __declspec(align(ALIGNMENT_BYTES)) +#endif + int lightIndices[MAX_LIGHTS] +#ifndef ISPC_IS_WINDOWS + __attribute__ ((aligned(ALIGNMENT_BYTES))) +#endif +; + int numLights = ispc::IntersectLightsWithTileMinMax( + startX, endX, startY, endY, minZ, maxZ, + input->header.framebufferWidth, input->header.framebufferHeight, + input->header.cameraProj[0][0], input->header.cameraProj[1][1], + MAX_LIGHTS, input->arrays.lightPositionView_x, + input->arrays.lightPositionView_y, input->arrays.lightPositionView_z, + input->arrays.lightAttenuationEnd, lightIndices); + + // Now kick off the recursive process for this tile + ShadeDynamicTileRecurse(input, level, tileX, tileY, lightIndices, + numLights, framebuffer); +} + + +void +DispatchDynamicCilk(InputData *input, Framebuffer *framebuffer) +{ + MinMaxZTreeCilk *minMaxZTree = gMinMaxZTreeCilk; + + // Update min/max Z tree + minMaxZTree->Update(input->arrays.zBuffer, input->header.framebufferWidth, + input->header.cameraProj[2][2], input->header.cameraProj[3][2], + input->header.cameraNear, input->header.cameraFar); + + // Launch the "root" tiles. Ideally these should at least fill the + // machine... at the moment we have a static number of "levels" to the + // mip tree but it might make sense to compute it based on the width of + // the machine. + int rootLevel = minMaxZTree->Levels() - 1; + int rootTilesX = minMaxZTree->NumTilesX(rootLevel); + int rootTilesY = minMaxZTree->NumTilesY(rootLevel); + int rootTiles = rootTilesX * rootTilesY; + _Cilk_for (int g = 0; g < rootTiles; ++g) { + uint32_t tileY = g / rootTilesX; + uint32_t tileX = g % rootTilesX; + ShadeDynamicTile(input, rootLevel, tileX, tileY, framebuffer); + } +} + +#endif // __cilk diff --git a/examples_ptx/deferred/kernels.cu b/examples_ptx/deferred/kernels.cu new file mode 100644 index 00000000..2530532a --- /dev/null +++ b/examples_ptx/deferred/kernels.cu @@ -0,0 +1,761 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + + +#include "deferred.h" +#include +#include + +#define programCount 32 +#define programIndex (threadIdx.x & 31) +#define taskIndex (blockIdx.x*4 + (threadIdx.x >> 5)) +#define taskCount (gridDim.x*4) +#define warpIdx (threadIdx.x >> 5) + +#define int32 int +#define int16 short +#define int8 char + +__device__ static inline float clamp(float v, float low, float high) +{ + return min(max(v, low), high); +} + +struct InputDataArrays +{ + float *zBuffer; + unsigned int16 *normalEncoded_x; // half float + unsigned int16 *normalEncoded_y; // half float + unsigned int16 *specularAmount; // half float + unsigned int16 *specularPower; // half float + unsigned int8 *albedo_x; // unorm8 + unsigned int8 *albedo_y; // unorm8 + unsigned int8 *albedo_z; // unorm8 + float *lightPositionView_x; + float *lightPositionView_y; + float *lightPositionView_z; + float *lightAttenuationBegin; + float *lightColor_x; + float *lightColor_y; + float *lightColor_z; + float *lightAttenuationEnd; +}; + +struct InputHeader +{ + float cameraProj[4][4]; + float cameraNear; + float cameraFar; + + int32 framebufferWidth; + int32 framebufferHeight; + int32 numLights; + int32 inputDataChunkSize; + int32 inputDataArrayOffsets[idaNum]; +}; + + +/////////////////////////////////////////////////////////////////////////// +// Common utility routines + +__device__ +static inline float +dot3(float x, float y, float z, float a, float b, float c) { + return (x*a + y*b + z*c); +} + + +#if 0 +static __shared__ int shdata_full[128]; +template +struct Uniform +{ + T data[(N+programCount-1)/programCount]; + volatile T *shdata; + + __device__ inline Uniform() + { + shdata = ((T*)shdata_full) + warpIdx*32; + } + + __device__ inline int2 get_chunk(const int i) const + { + const int elem = i & (programCount - 1); + const int chunk = i >> 5; + shdata[programIndex] = chunk; + shdata[ elem] = chunk; + return make_int2(shdata[programIndex], elem); + } + + __device__ inline const T get(const int i) const + { + const int2 idx = get_chunk(i); + return __shfl(data[idx.x], idx.y); + } + + __device__ inline void set(const bool active, const int i, T value) + { + const int2 idx = get_chunk(i); + const int chunkIdx = idx.x; + const int elemIdx = idx.y; + shdata[programIndex] = data[chunkIdx]; + if (active) shdata[elemIdx] = value; + data[chunkIdx] = shdata[programIndex]; + } +}; +#elif 1 +template +struct Uniform +{ + union + { + T *data; + int32_t ptr[2]; + }; + + __device__ inline Uniform() + { + if (programIndex == 0) + data = (T*)malloc(N*sizeof(T)); + ptr[0] = __shfl(ptr[0], 0); + ptr[1] = __shfl(ptr[1], 0); + } + __device__ inline ~Uniform() + { + if (programIndex == 0) + free(data); + } + + __device__ inline const T get(const int i) const + { + return data[i]; + } + + __device__ inline T* get_ptr(const int i) {return &data[i]; } + __device__ inline void set(const bool active, const int i, T value) + { + if (active) + data[i] = value; + } +}; + +#else +__shared__ int shdata_full[4*MAX_LIGHTS]; +template +struct Uniform +{ + volatile T *shdata; + + __device__ Uniform() + { + shdata = (T*)&shdata_full[warpIdx*MAX_LIGHTS]; + } + + __device__ inline const T get(const int i) const + { + return shdata[i]; + } + + __device__ inline void set(const bool active, const int i, T value) + { + if (active) + shdata[i] = value; + } +}; +#endif + + +__device__ +static inline void +normalize3(float x, float y, float z, float &ox, float &oy, float &oz) { + float n = rsqrt(x*x + y*y + z*z); + ox = x * n; + oy = y * n; + oz = z * n; +} + +__device__ inline +static float reduce_min(float value) +{ +#pragma unroll + for (int i = 4; i >=0; i--) + value = fminf(value, __shfl_xor(value, 1<=0; i--) + value = fmaxf(value, __shfl_xor(value, 1<=0; i--) + value += __shfl_xor(value, 1<= tileEndX) break; + // Unproject depth buffer Z value into view space + float z = zBuffer[y * gBufferWidth + x]; + float viewSpaceZ = cameraProj_43 / (z - cameraProj_33); + + // Work out Z bounds for our samples + // Avoid considering skybox/background or otherwise invalid pixels + if ((viewSpaceZ < cameraFar) && (viewSpaceZ >= cameraNear)) { + laneMinZ = min(laneMinZ, viewSpaceZ); + laneMaxZ = max(laneMaxZ, viewSpaceZ); + } + } + } + minZ = reduce_min(laneMinZ); + maxZ = reduce_max(laneMaxZ); +} + + +__device__ +static inline int32 +IntersectLightsWithTileMinMax( + int32 tileStartX, int32 tileEndX, + int32 tileStartY, int32 tileEndY, + // Tile data + float minZ, + float maxZ, + // G-buffer data + int32 gBufferWidth, int32 gBufferHeight, + // Camera data + float cameraProj_11, float cameraProj_22, + // Light Data + int32 numLights, + float light_positionView_x_array[], + float light_positionView_y_array[], + float light_positionView_z_array[], + float light_attenuationEnd_array[], + // Output + Uniform &tileLightIndices + ) +{ + float gBufferScale_x = 0.5f * (float)gBufferWidth; + float gBufferScale_y = 0.5f * (float)gBufferHeight; + + float frustumPlanes_xy[4] = { + -(cameraProj_11 * gBufferScale_x), + (cameraProj_11 * gBufferScale_x), + (cameraProj_22 * gBufferScale_y), + -(cameraProj_22 * gBufferScale_y) }; + float frustumPlanes_z[4] = { + tileEndX - gBufferScale_x, + -tileStartX + gBufferScale_x, + tileEndY - gBufferScale_y, + -tileStartY + gBufferScale_y }; + + for ( int i = 0; i < 4; ++i) { + float norm = rsqrt(frustumPlanes_xy[i] * frustumPlanes_xy[i] + + frustumPlanes_z[i] * frustumPlanes_z[i]); + frustumPlanes_xy[i] *= norm; + frustumPlanes_z[i] *= norm; + } + + int32 tileNumLights = 0; + + for ( int lightIndexB = 0; lightIndexB < numLights; lightIndexB += programCount) + { + const int lightIndex = lightIndexB + programIndex; + if (lightIndex >= numLights) break; + + float light_positionView_z = light_positionView_z_array[lightIndex]; + float light_attenuationEnd = light_attenuationEnd_array[lightIndex]; + float light_attenuationEndNeg = -light_attenuationEnd; + + float d = light_positionView_z - minZ; + bool inFrustum = (d >= light_attenuationEndNeg); + + d = maxZ - light_positionView_z; + inFrustum = inFrustum && (d >= light_attenuationEndNeg); + + // This seems better than cif(!inFrustum) ccontinue; here since we + // don't actually need to mask the rest of this function - this is + // just a greedy early-out. Could also structure all of this as + // nested if() statements, but this a bit easier to read + if (__ballot(inFrustum) > 0) + { + float light_positionView_x = light_positionView_x_array[lightIndex]; + float light_positionView_y = light_positionView_y_array[lightIndex]; + + d = light_positionView_z * frustumPlanes_z[0] + + light_positionView_x * frustumPlanes_xy[0]; + inFrustum = inFrustum && (d >= light_attenuationEndNeg); + + d = light_positionView_z * frustumPlanes_z[1] + + light_positionView_x * frustumPlanes_xy[1]; + inFrustum = inFrustum && (d >= light_attenuationEndNeg); + + d = light_positionView_z * frustumPlanes_z[2] + + light_positionView_y * frustumPlanes_xy[2]; + inFrustum = inFrustum && (d >= light_attenuationEndNeg); + + d = light_positionView_z * frustumPlanes_z[3] + + light_positionView_y * frustumPlanes_xy[3]; + inFrustum = inFrustum && (d >= light_attenuationEndNeg); + + // Pack and store intersecting lights + const bool active = inFrustum && lightIndex < numLights; +#if 0 + if (__ballot(active) > 0) + tileNumLights += packed_store_active(active, tileLightIndices.get_ptr(tileNumLights), lightIndex); +#else + if (__ballot(active) > 0) + { + const int2 res = warpBinExclusiveScan(active); + const int idx = tileNumLights + res.y; + const int nactive = res.x; + tileLightIndices.set(active, idx, lightIndex); + tileNumLights += nactive; + } +#endif + } + } + + return tileNumLights; +} + + +__device__ +static inline int32 +IntersectLightsWithTile( + int32 tileStartX, int32 tileEndX, + int32 tileStartY, int32 tileEndY, + int32 gBufferWidth, int32 gBufferHeight, + // G-buffer data + float zBuffer[], + // Camera data + float cameraProj_11, float cameraProj_22, + float cameraProj_33, float cameraProj_43, + float cameraNear, float cameraFar, + // Light Data + int32 numLights, + float light_positionView_x_array[], + float light_positionView_y_array[], + float light_positionView_z_array[], + float light_attenuationEnd_array[], + // Output + Uniform &tileLightIndices + ) +{ + float minZ, maxZ; + ComputeZBounds(tileStartX, tileEndX, tileStartY, tileEndY, + zBuffer, gBufferWidth, cameraProj_33, cameraProj_43, cameraNear, cameraFar, + minZ, maxZ); + + + int32 tileNumLights = IntersectLightsWithTileMinMax( + tileStartX, tileEndX, tileStartY, tileEndY, minZ, maxZ, + gBufferWidth, gBufferHeight, cameraProj_11, cameraProj_22, + MAX_LIGHTS, light_positionView_x_array, light_positionView_y_array, + light_positionView_z_array, light_attenuationEnd_array, + tileLightIndices); + + return tileNumLights; +} + + +__device__ +static inline void +ShadeTile( + int32 tileStartX, int32 tileEndX, + int32 tileStartY, int32 tileEndY, + int32 gBufferWidth, int32 gBufferHeight, + const InputDataArrays &inputData, + // Camera data + float cameraProj_11, float cameraProj_22, + float cameraProj_33, float cameraProj_43, + // Light list + Uniform &tileLightIndices, + int32 tileNumLights, + // UI + bool visualizeLightCount, + // Output + unsigned int8 framebuffer_r[], + unsigned int8 framebuffer_g[], + unsigned int8 framebuffer_b[] + ) +{ + if (tileNumLights == 0 || visualizeLightCount) { + unsigned int8 c = (unsigned int8)(min(tileNumLights << 2, 255)); + for ( int32 y = tileStartY; y < tileEndY; ++y) { + for ( int xb = tileStartX ; xb < tileEndX; xb += programCount) + { + const int x = xb + programIndex; + if (x >= tileEndX) continue; + int32 framebufferIndex = (y * gBufferWidth + x); + framebuffer_r[framebufferIndex] = c; + framebuffer_g[framebufferIndex] = c; + framebuffer_b[framebufferIndex] = c; + } + } + } else { + float twoOverGBufferWidth = 2.0f / gBufferWidth; + float twoOverGBufferHeight = 2.0f / gBufferHeight; + + for ( int32 y = tileStartY; y < tileEndY; ++y) { + float positionScreen_y = -(((0.5f + y) * twoOverGBufferHeight) - 1.f); + + for ( int xb = tileStartX ; xb < tileEndX; xb += programCount) + { + const int x = xb + programIndex; +// if (x >= tileEndX) break; + int32 gBufferOffset = y * gBufferWidth + x; + + // Reconstruct position and (negative) view vector from G-buffer + float surface_positionView_x, surface_positionView_y, surface_positionView_z; + float Vneg_x, Vneg_y, Vneg_z; + + float z = inputData.zBuffer[gBufferOffset]; + + // Compute screen/clip-space position + // NOTE: Mind DX11 viewport transform and pixel center! + float positionScreen_x = (0.5f + (float)(x)) * + twoOverGBufferWidth - 1.0f; + + // Unproject depth buffer Z value into view space + surface_positionView_z = cameraProj_43 / (z - cameraProj_33); + surface_positionView_x = positionScreen_x * surface_positionView_z / + cameraProj_11; + surface_positionView_y = positionScreen_y * surface_positionView_z / + cameraProj_22; + + // We actually end up with a vector pointing *at* the + // surface (i.e. the negative view vector) + normalize3(surface_positionView_x, surface_positionView_y, + surface_positionView_z, Vneg_x, Vneg_y, Vneg_z); + + // Reconstruct normal from G-buffer + float surface_normal_x, surface_normal_y, surface_normal_z; + asm("// half2float //"); + float normal_x = __half2float(inputData.normalEncoded_x[gBufferOffset]); + float normal_y = __half2float(inputData.normalEncoded_y[gBufferOffset]); + asm("// half2float //"); + + float f = (normal_x - normal_x * normal_x) + (normal_y - normal_y * normal_y); + float m = sqrt(4.0f * f - 1.0f); + + surface_normal_x = m * (4.0f * normal_x - 2.0f); + surface_normal_y = m * (4.0f * normal_y - 2.0f); + surface_normal_z = 3.0f - 8.0f * f; + + // Load other G-buffer parameters + float surface_specularAmount = + __half2float(inputData.specularAmount[gBufferOffset]); + float surface_specularPower = + __half2float(inputData.specularPower[gBufferOffset]); + float surface_albedo_x = Unorm8ToFloat32(inputData.albedo_x[gBufferOffset]); + float surface_albedo_y = Unorm8ToFloat32(inputData.albedo_y[gBufferOffset]); + float surface_albedo_z = Unorm8ToFloat32(inputData.albedo_z[gBufferOffset]); + + float lit_x = 0.0f; + float lit_y = 0.0f; + float lit_z = 0.0f; + for ( int32 tileLightIndex = 0; tileLightIndex < tileNumLights; + ++tileLightIndex) { + int32 lightIndex = tileLightIndices.get(tileLightIndex); + + // Gather light data relevant to initial culling + float light_positionView_x = + __ldg(&inputData.lightPositionView_x[lightIndex]); + float light_positionView_y = + __ldg(&inputData.lightPositionView_y[lightIndex]); + float light_positionView_z = + __ldg(&inputData.lightPositionView_z[lightIndex]); + float light_attenuationEnd = + __ldg(&inputData.lightAttenuationEnd[lightIndex]); + + // Compute light vector + float L_x = light_positionView_x - surface_positionView_x; + float L_y = light_positionView_y - surface_positionView_y; + float L_z = light_positionView_z - surface_positionView_z; + + float distanceToLight2 = dot3(L_x, L_y, L_z, L_x, L_y, L_z); + + // Clip at end of attenuation + float light_attenutaionEnd2 = light_attenuationEnd * light_attenuationEnd; + + if (distanceToLight2 < light_attenutaionEnd2) { + float distanceToLight = sqrt(distanceToLight2); + + // HLSL "rcp" is allowed to be fairly inaccurate + float distanceToLightRcp = 1.0f/distanceToLight; + L_x *= distanceToLightRcp; + L_y *= distanceToLightRcp; + L_z *= distanceToLightRcp; + + // Start computing brdf + float NdotL = dot3(surface_normal_x, surface_normal_y, + surface_normal_z, L_x, L_y, L_z); + + // Clip back facing + if (NdotL > 0.0f) { + float light_attenuationBegin = + inputData.lightAttenuationBegin[lightIndex]; + + // Light distance attenuation (linstep) + float lightRange = (light_attenuationEnd - light_attenuationBegin); + float falloffPosition = (light_attenuationEnd - distanceToLight); + float attenuation = min(falloffPosition / lightRange, 1.0f); + + float H_x = (L_x - Vneg_x); + float H_y = (L_y - Vneg_y); + float H_z = (L_z - Vneg_z); + normalize3(H_x, H_y, H_z, H_x, H_y, H_z); + + float NdotH = dot3(surface_normal_x, surface_normal_y, + surface_normal_z, H_x, H_y, H_z); + NdotH = max(NdotH, 0.0f); + + float specular = pow(NdotH, surface_specularPower); + float specularNorm = (surface_specularPower + 2.0f) * + (1.0f / 8.0f); + float specularContrib = surface_specularAmount * + specularNorm * specular; + + float k = attenuation * NdotL * (1.0f + specularContrib); + + float light_color_x = inputData.lightColor_x[lightIndex]; + float light_color_y = inputData.lightColor_y[lightIndex]; + float light_color_z = inputData.lightColor_z[lightIndex]; + + float lightContrib_x = surface_albedo_x * light_color_x; + float lightContrib_y = surface_albedo_y * light_color_y; + float lightContrib_z = surface_albedo_z * light_color_z; + + lit_x += lightContrib_x * k; + lit_y += lightContrib_y * k; + lit_z += lightContrib_z * k; + } + } + } + + // Gamma correct + // These pows are pretty slow right now, but we can do + // something faster if really necessary to squeeze every + // last bit of performance out of it + float gamma = 1.0 / 2.2f; + lit_x = pow(clamp(lit_x, 0.0f, 1.0f), gamma); + lit_y = pow(clamp(lit_y, 0.0f, 1.0f), gamma); + lit_z = pow(clamp(lit_z, 0.0f, 1.0f), gamma); + + framebuffer_r[gBufferOffset] = Float32ToUnorm8(lit_x); + framebuffer_g[gBufferOffset] = Float32ToUnorm8(lit_y); + framebuffer_b[gBufferOffset] = Float32ToUnorm8(lit_z); + } + } + } +} + + +/////////////////////////////////////////////////////////////////////////// +// Static decomposition + +__global__ void +RenderTile( int num_groups_x, int num_groups_y, + const InputHeader *inputHeaderPtr, + const InputDataArrays *inputDataPtr, + int visualizeLightCount, + // Output + unsigned int8 framebuffer_r[], + unsigned int8 framebuffer_g[], + unsigned int8 framebuffer_b[]) { + if (taskIndex >= taskCount) return; + + const InputHeader inputHeader = *inputHeaderPtr; + const InputDataArrays inputData = *inputDataPtr; + int32 group_y = taskIndex / num_groups_x; + int32 group_x = taskIndex % num_groups_x; + + int32 tile_start_x = group_x * MIN_TILE_WIDTH; + int32 tile_start_y = group_y * MIN_TILE_HEIGHT; + int32 tile_end_x = tile_start_x + MIN_TILE_WIDTH; + int32 tile_end_y = tile_start_y + MIN_TILE_HEIGHT; + + int framebufferWidth = inputHeader.framebufferWidth; + int framebufferHeight = inputHeader.framebufferHeight; + float cameraProj_00 = inputHeader.cameraProj[0][0]; + float cameraProj_11 = inputHeader.cameraProj[1][1]; + float cameraProj_22 = inputHeader.cameraProj[2][2]; + float cameraProj_32 = inputHeader.cameraProj[3][2]; + + // Light intersection: figure out which lights illuminate this tile. + Uniform tileLightIndices; // Light list for the tile +#if 1 + int numTileLights = + IntersectLightsWithTile(tile_start_x, tile_end_x, + tile_start_y, tile_end_y, + framebufferWidth, framebufferHeight, + inputData.zBuffer, + cameraProj_00, cameraProj_11, + cameraProj_22, cameraProj_32, + inputHeader.cameraNear, inputHeader.cameraFar, + MAX_LIGHTS, + inputData.lightPositionView_x, + inputData.lightPositionView_y, + inputData.lightPositionView_z, + inputData.lightAttenuationEnd, + tileLightIndices); + + // And now shade the tile, using the lights in tileLightIndices + ShadeTile(tile_start_x, tile_end_x, tile_start_y, tile_end_y, + framebufferWidth, framebufferHeight, inputData, + cameraProj_00, cameraProj_11, cameraProj_22, cameraProj_32, + tileLightIndices, numTileLights, visualizeLightCount, + framebuffer_r, framebuffer_g, framebuffer_b); +#endif +} + + +extern "C" __global__ void +RenderStatic( InputHeader inputHeaderPtr[], + InputDataArrays inputDataPtr[], + int visualizeLightCount, + // Output + unsigned int8 framebuffer_r[], + unsigned int8 framebuffer_g[], + unsigned int8 framebuffer_b[]) { + + const InputHeader inputHeader = *inputHeaderPtr; + const InputDataArrays inputData = *inputDataPtr; + + + int num_groups_x = (inputHeader.framebufferWidth + + MIN_TILE_WIDTH - 1) / MIN_TILE_WIDTH; + int num_groups_y = (inputHeader.framebufferHeight + + MIN_TILE_HEIGHT - 1) / MIN_TILE_HEIGHT; + int num_groups = num_groups_x * num_groups_y; + + // Launch a task to render each tile, each of which is MIN_TILE_WIDTH + // by MIN_TILE_HEIGHT pixels. + if (programIndex == 0) + RenderTile<<<(num_groups+4-1)/4,128>>>(num_groups_x, num_groups_y, + inputHeaderPtr, inputDataPtr, visualizeLightCount, + framebuffer_r, framebuffer_g, framebuffer_b); + cudaDeviceSynchronize(); +} diff --git a/examples_ptx/deferred/kernels.ispc b/examples_ptx/deferred/kernels.ispc new file mode 100644 index 00000000..6d2a8cc9 --- /dev/null +++ b/examples_ptx/deferred/kernels.ispc @@ -0,0 +1,672 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include "deferred.h" + +struct InputDataArrays +{ + float *zBuffer; + unsigned int16 *normalEncoded_x; // half float + unsigned int16 *normalEncoded_y; // half float + unsigned int16 *specularAmount; // half float + unsigned int16 *specularPower; // half float + unsigned int8 *albedo_x; // unorm8 + unsigned int8 *albedo_y; // unorm8 + unsigned int8 *albedo_z; // unorm8 + float *lightPositionView_x; + float *lightPositionView_y; + float *lightPositionView_z; + float *lightAttenuationBegin; + float *lightColor_x; + float *lightColor_y; + float *lightColor_z; + float *lightAttenuationEnd; +}; + +struct InputHeader +{ + float cameraProj[4][4]; + float cameraNear; + float cameraFar; + + int32 framebufferWidth; + int32 framebufferHeight; + int32 numLights; + int32 inputDataChunkSize; + int32 inputDataArrayOffsets[idaNum]; +}; + + +/////////////////////////////////////////////////////////////////////////// +// Common utility routines + +static inline float +dot3(float x, float y, float z, float a, float b, float c) { + return (x*a + y*b + z*c); +} + + +static inline void +normalize3(float x, float y, float z, float &ox, float &oy, float &oz) { + float n = rsqrt(x*x + y*y + z*z); + ox = x * n; + oy = y * n; + oz = z * n; +} + + +static inline float +Unorm8ToFloat32(unsigned int8 u) { + return (float)u * (1.0f / 255.0f); +} + + +static inline unsigned int8 +Float32ToUnorm8(float f) { + return (unsigned int8)(f * 255.0f); +} + + +static void +ComputeZBounds( + uniform int32 tileStartX, uniform int32 tileEndX, + uniform int32 tileStartY, uniform int32 tileEndY, + // G-buffer data + uniform float zBuffer[], + uniform int32 gBufferWidth, + // Camera data + uniform float cameraProj_33, uniform float cameraProj_43, + uniform float cameraNear, uniform float cameraFar, + // Output + uniform float &minZ, + uniform float &maxZ + ) +{ + // Find Z bounds + float laneMinZ = cameraFar; + float laneMaxZ = cameraNear; + for (uniform int32 y = tileStartY; y < tileEndY; ++y) { + foreach (x = tileStartX ... tileEndX) { + // Unproject depth buffer Z value into view space + float z = zBuffer[y * gBufferWidth + x]; + float viewSpaceZ = cameraProj_43 / (z - cameraProj_33); + + // Work out Z bounds for our samples + // Avoid considering skybox/background or otherwise invalid pixels + if ((viewSpaceZ < cameraFar) && (viewSpaceZ >= cameraNear)) { + laneMinZ = min(laneMinZ, viewSpaceZ); + laneMaxZ = max(laneMaxZ, viewSpaceZ); + } + } + } + minZ = reduce_min(laneMinZ); + maxZ = reduce_max(laneMaxZ); +} + + +export uniform int32 +IntersectLightsWithTileMinMax( + uniform int32 tileStartX, uniform int32 tileEndX, + uniform int32 tileStartY, uniform int32 tileEndY, + // Tile data + uniform float minZ, + uniform float maxZ, + // G-buffer data + uniform int32 gBufferWidth, uniform int32 gBufferHeight, + // Camera data + uniform float cameraProj_11, uniform float cameraProj_22, + // Light Data + uniform int32 numLights, + uniform float light_positionView_x_array[], + uniform float light_positionView_y_array[], + uniform float light_positionView_z_array[], + uniform float light_attenuationEnd_array[], + // Output + uniform int32 tileLightIndices[] + ) +{ + uniform float gBufferScale_x = 0.5f * (float)gBufferWidth; + uniform float gBufferScale_y = 0.5f * (float)gBufferHeight; + + uniform float frustumPlanes_xy[4] = { + -(cameraProj_11 * gBufferScale_x), + (cameraProj_11 * gBufferScale_x), + (cameraProj_22 * gBufferScale_y), + -(cameraProj_22 * gBufferScale_y) }; + uniform float frustumPlanes_z[4] = { + tileEndX - gBufferScale_x, + -tileStartX + gBufferScale_x, + tileEndY - gBufferScale_y, + -tileStartY + gBufferScale_y }; + + for (uniform int i = 0; i < 4; ++i) { + uniform float norm = rsqrt(frustumPlanes_xy[i] * frustumPlanes_xy[i] + + frustumPlanes_z[i] * frustumPlanes_z[i]); + frustumPlanes_xy[i] *= norm; + frustumPlanes_z[i] *= norm; + } + + uniform int32 tileNumLights = 0; + + foreach (lightIndex = 0 ... numLights) { + float light_positionView_z = light_positionView_z_array[lightIndex]; + float light_attenuationEnd = light_attenuationEnd_array[lightIndex]; + float light_attenuationEndNeg = -light_attenuationEnd; + + float d = light_positionView_z - minZ; + bool inFrustum = (d >= light_attenuationEndNeg); + + d = maxZ - light_positionView_z; + inFrustum = inFrustum && (d >= light_attenuationEndNeg); + + // This seems better than cif(!inFrustum) ccontinue; here since we + // don't actually need to mask the rest of this function - this is + // just a greedy early-out. Could also structure all of this as + // nested if() statements, but this a bit easier to read + if (any(inFrustum)) { + float light_positionView_x = light_positionView_x_array[lightIndex]; + float light_positionView_y = light_positionView_y_array[lightIndex]; + + d = light_positionView_z * frustumPlanes_z[0] + + light_positionView_x * frustumPlanes_xy[0]; + inFrustum = inFrustum && (d >= light_attenuationEndNeg); + + d = light_positionView_z * frustumPlanes_z[1] + + light_positionView_x * frustumPlanes_xy[1]; + inFrustum = inFrustum && (d >= light_attenuationEndNeg); + + d = light_positionView_z * frustumPlanes_z[2] + + light_positionView_y * frustumPlanes_xy[2]; + inFrustum = inFrustum && (d >= light_attenuationEndNeg); + + d = light_positionView_z * frustumPlanes_z[3] + + light_positionView_y * frustumPlanes_xy[3]; + inFrustum = inFrustum && (d >= light_attenuationEndNeg); + + // Pack and store intersecting lights + cif (inFrustum) { + tileNumLights += packed_store_active(&tileLightIndices[tileNumLights], + lightIndex); + } + } + } + + return tileNumLights; +} + + +static uniform int32 +IntersectLightsWithTile( + uniform int32 tileStartX, uniform int32 tileEndX, + uniform int32 tileStartY, uniform int32 tileEndY, + uniform int32 gBufferWidth, uniform int32 gBufferHeight, + // G-buffer data + uniform float zBuffer[], + // Camera data + uniform float cameraProj_11, uniform float cameraProj_22, + uniform float cameraProj_33, uniform float cameraProj_43, + uniform float cameraNear, uniform float cameraFar, + // Light Data + uniform int32 numLights, + uniform float light_positionView_x_array[], + uniform float light_positionView_y_array[], + uniform float light_positionView_z_array[], + uniform float light_attenuationEnd_array[], + // Output + uniform int32 tileLightIndices[] + ) +{ + uniform float minZ, maxZ; + ComputeZBounds(tileStartX, tileEndX, tileStartY, tileEndY, + zBuffer, gBufferWidth, cameraProj_33, cameraProj_43, cameraNear, cameraFar, + minZ, maxZ); + + uniform int32 tileNumLights = IntersectLightsWithTileMinMax( + tileStartX, tileEndX, tileStartY, tileEndY, minZ, maxZ, + gBufferWidth, gBufferHeight, cameraProj_11, cameraProj_22, + MAX_LIGHTS, light_positionView_x_array, light_positionView_y_array, + light_positionView_z_array, light_attenuationEnd_array, + tileLightIndices); + + return tileNumLights; +} + + +export void +ShadeTile( + uniform int32 tileStartX, uniform int32 tileEndX, + uniform int32 tileStartY, uniform int32 tileEndY, + uniform int32 gBufferWidth, uniform int32 gBufferHeight, + uniform InputDataArrays &inputData, + // Camera data + uniform float cameraProj_11, uniform float cameraProj_22, + uniform float cameraProj_33, uniform float cameraProj_43, + // Light list + uniform int32 tileLightIndices[], + uniform int32 tileNumLights, + // UI + uniform bool visualizeLightCount, + // Output + uniform unsigned int8 framebuffer_r[], + uniform unsigned int8 framebuffer_g[], + uniform unsigned int8 framebuffer_b[] + ) +{ + if (tileNumLights == 0 || visualizeLightCount) { + uniform unsigned int8 c = (unsigned int8)(min(tileNumLights << 2, 255)); + for (uniform int32 y = tileStartY; y < tileEndY; ++y) { + foreach (x = tileStartX ... tileEndX) { + int32 framebufferIndex = (y * gBufferWidth + x); + framebuffer_r[framebufferIndex] = c; + framebuffer_g[framebufferIndex] = c; + framebuffer_b[framebufferIndex] = c; + } + } + } else { + uniform float twoOverGBufferWidth = 2.0f / gBufferWidth; + uniform float twoOverGBufferHeight = 2.0f / gBufferHeight; + + for (uniform int32 y = tileStartY; y < tileEndY; ++y) { + uniform float positionScreen_y = -(((0.5f + y) * twoOverGBufferHeight) - 1.f); + + foreach (x = tileStartX ... tileEndX) { + int32 gBufferOffset = y * gBufferWidth + x; + + // Reconstruct position and (negative) view vector from G-buffer + float surface_positionView_x, surface_positionView_y, surface_positionView_z; + float Vneg_x, Vneg_y, Vneg_z; + + float z = inputData.zBuffer[gBufferOffset]; + + // Compute screen/clip-space position + // NOTE: Mind DX11 viewport transform and pixel center! + float positionScreen_x = (0.5f + (float)(x)) * + twoOverGBufferWidth - 1.0f; + + // Unproject depth buffer Z value into view space + surface_positionView_z = cameraProj_43 / (z - cameraProj_33); + surface_positionView_x = positionScreen_x * surface_positionView_z / + cameraProj_11; + surface_positionView_y = positionScreen_y * surface_positionView_z / + cameraProj_22; + + // We actually end up with a vector pointing *at* the + // surface (i.e. the negative view vector) + normalize3(surface_positionView_x, surface_positionView_y, + surface_positionView_z, Vneg_x, Vneg_y, Vneg_z); + + // Reconstruct normal from G-buffer + float surface_normal_x, surface_normal_y, surface_normal_z; + float normal_x = half_to_float(inputData.normalEncoded_x[gBufferOffset]); + float normal_y = half_to_float(inputData.normalEncoded_y[gBufferOffset]); + + float f = (normal_x - normal_x * normal_x) + (normal_y - normal_y * normal_y); + float m = sqrt(4.0f * f - 1.0f); + + surface_normal_x = m * (4.0f * normal_x - 2.0f); + surface_normal_y = m * (4.0f * normal_y - 2.0f); + surface_normal_z = 3.0f - 8.0f * f; + + // Load other G-buffer parameters + float surface_specularAmount = + half_to_float(inputData.specularAmount[gBufferOffset]); + float surface_specularPower = + half_to_float(inputData.specularPower[gBufferOffset]); + float surface_albedo_x = Unorm8ToFloat32(inputData.albedo_x[gBufferOffset]); + float surface_albedo_y = Unorm8ToFloat32(inputData.albedo_y[gBufferOffset]); + float surface_albedo_z = Unorm8ToFloat32(inputData.albedo_z[gBufferOffset]); + + float lit_x = 0.0f; + float lit_y = 0.0f; + float lit_z = 0.0f; + for (uniform int32 tileLightIndex = 0; tileLightIndex < tileNumLights; + ++tileLightIndex) { + uniform int32 lightIndex = tileLightIndices[tileLightIndex]; + + // Gather light data relevant to initial culling + uniform float light_positionView_x = + inputData.lightPositionView_x[lightIndex]; + uniform float light_positionView_y = + inputData.lightPositionView_y[lightIndex]; + uniform float light_positionView_z = + inputData.lightPositionView_z[lightIndex]; + uniform float light_attenuationEnd = + inputData.lightAttenuationEnd[lightIndex]; + + // Compute light vector + float L_x = light_positionView_x - surface_positionView_x; + float L_y = light_positionView_y - surface_positionView_y; + float L_z = light_positionView_z - surface_positionView_z; + + float distanceToLight2 = dot3(L_x, L_y, L_z, L_x, L_y, L_z); + + // Clip at end of attenuation + float light_attenutaionEnd2 = light_attenuationEnd * light_attenuationEnd; + + cif (distanceToLight2 < light_attenutaionEnd2) { + float distanceToLight = sqrt(distanceToLight2); + + // HLSL "rcp" is allowed to be fairly inaccurate + float distanceToLightRcp = rcp(distanceToLight); + L_x *= distanceToLightRcp; + L_y *= distanceToLightRcp; + L_z *= distanceToLightRcp; + + // Start computing brdf + float NdotL = dot3(surface_normal_x, surface_normal_y, + surface_normal_z, L_x, L_y, L_z); + + // Clip back facing + cif (NdotL > 0.0f) { + uniform float light_attenuationBegin = + inputData.lightAttenuationBegin[lightIndex]; + + // Light distance attenuation (linstep) + float lightRange = (light_attenuationEnd - light_attenuationBegin); + float falloffPosition = (light_attenuationEnd - distanceToLight); + float attenuation = min(falloffPosition / lightRange, 1.0f); + + float H_x = (L_x - Vneg_x); + float H_y = (L_y - Vneg_y); + float H_z = (L_z - Vneg_z); + normalize3(H_x, H_y, H_z, H_x, H_y, H_z); + + float NdotH = dot3(surface_normal_x, surface_normal_y, + surface_normal_z, H_x, H_y, H_z); + NdotH = max(NdotH, 0.0f); + + float specular = pow(NdotH, surface_specularPower); + float specularNorm = (surface_specularPower + 2.0f) * + (1.0f / 8.0f); + float specularContrib = surface_specularAmount * + specularNorm * specular; + + float k = attenuation * NdotL * (1.0f + specularContrib); + + uniform float light_color_x = inputData.lightColor_x[lightIndex]; + uniform float light_color_y = inputData.lightColor_y[lightIndex]; + uniform float light_color_z = inputData.lightColor_z[lightIndex]; + + float lightContrib_x = surface_albedo_x * light_color_x; + float lightContrib_y = surface_albedo_y * light_color_y; + float lightContrib_z = surface_albedo_z * light_color_z; + + lit_x += lightContrib_x * k; + lit_y += lightContrib_y * k; + lit_z += lightContrib_z * k; + } + } + } + + // Gamma correct + // These pows are pretty slow right now, but we can do + // something faster if really necessary to squeeze every + // last bit of performance out of it + float gamma = 1.0 / 2.2f; + lit_x = pow(clamp(lit_x, 0.0f, 1.0f), gamma); + lit_y = pow(clamp(lit_y, 0.0f, 1.0f), gamma); + lit_z = pow(clamp(lit_z, 0.0f, 1.0f), gamma); + + framebuffer_r[gBufferOffset] = Float32ToUnorm8(lit_x); + framebuffer_g[gBufferOffset] = Float32ToUnorm8(lit_y); + framebuffer_b[gBufferOffset] = Float32ToUnorm8(lit_z); + } + } + } +} + + +/////////////////////////////////////////////////////////////////////////// +// Static decomposition + +task void +RenderTile(uniform int num_groups_x, uniform int num_groups_y, + uniform InputHeader &inputHeader, + uniform InputDataArrays &inputData, + uniform int visualizeLightCount, + // Output + uniform unsigned int8 framebuffer_r[], + uniform unsigned int8 framebuffer_g[], + uniform unsigned int8 framebuffer_b[]) { + uniform int32 group_y = taskIndex / num_groups_x; + uniform int32 group_x = taskIndex % num_groups_x; + uniform int32 tile_start_x = group_x * MIN_TILE_WIDTH; + uniform int32 tile_start_y = group_y * MIN_TILE_HEIGHT; + uniform int32 tile_end_x = tile_start_x + MIN_TILE_WIDTH; + uniform int32 tile_end_y = tile_start_y + MIN_TILE_HEIGHT; + + uniform int framebufferWidth = inputHeader.framebufferWidth; + uniform int framebufferHeight = inputHeader.framebufferHeight; + uniform float cameraProj_00 = inputHeader.cameraProj[0][0]; + uniform float cameraProj_11 = inputHeader.cameraProj[1][1]; + uniform float cameraProj_22 = inputHeader.cameraProj[2][2]; + uniform float cameraProj_32 = inputHeader.cameraProj[3][2]; + + // Light intersection: figure out which lights illuminate this tile. + uniform int tileLightIndices[MAX_LIGHTS]; // Light list for the tile + uniform int numTileLights = + IntersectLightsWithTile(tile_start_x, tile_end_x, + tile_start_y, tile_end_y, + framebufferWidth, framebufferHeight, + inputData.zBuffer, + cameraProj_00, cameraProj_11, + cameraProj_22, cameraProj_32, + inputHeader.cameraNear, inputHeader.cameraFar, + MAX_LIGHTS, + inputData.lightPositionView_x, + inputData.lightPositionView_y, + inputData.lightPositionView_z, + inputData.lightAttenuationEnd, + tileLightIndices); + + // And now shade the tile, using the lights in tileLightIndices + ShadeTile(tile_start_x, tile_end_x, tile_start_y, tile_end_y, + framebufferWidth, framebufferHeight, inputData, + cameraProj_00, cameraProj_11, cameraProj_22, cameraProj_32, + tileLightIndices, numTileLights, visualizeLightCount, + framebuffer_r, framebuffer_g, framebuffer_b); +} + + +export void +RenderStatic(uniform InputHeader &inputHeader, + uniform InputDataArrays &inputData, + uniform int visualizeLightCount, + // Output + uniform unsigned int8 framebuffer_r[], + uniform unsigned int8 framebuffer_g[], + uniform unsigned int8 framebuffer_b[]) { + uniform int num_groups_x = (inputHeader.framebufferWidth + + MIN_TILE_WIDTH - 1) / MIN_TILE_WIDTH; + uniform int num_groups_y = (inputHeader.framebufferHeight + + MIN_TILE_HEIGHT - 1) / MIN_TILE_HEIGHT; + uniform int num_groups = num_groups_x * num_groups_y; + + // Launch a task to render each tile, each of which is MIN_TILE_WIDTH + // by MIN_TILE_HEIGHT pixels. + launch[num_groups] RenderTile(num_groups_x, num_groups_y, + inputHeader, inputData, visualizeLightCount, + framebuffer_r, framebuffer_g, framebuffer_b); +} + + +/////////////////////////////////////////////////////////////////////////// +// Routines for dynamic decomposition path + +// This computes the z min/max range for a whole row worth of tiles. +export void +ComputeZBoundsRow( + uniform int32 tileY, + uniform int32 tileWidth, uniform int32 tileHeight, + uniform int32 numTilesX, uniform int32 numTilesY, + // G-buffer data + uniform float zBuffer[], + uniform int32 gBufferWidth, + // Camera data + uniform float cameraProj_33, uniform float cameraProj_43, + uniform float cameraNear, uniform float cameraFar, + // Output + uniform float minZArray[], + uniform float maxZArray[] + ) +{ + for (uniform int32 tileX = 0; tileX < numTilesX; ++tileX) { + uniform float minZ, maxZ; + ComputeZBounds( + tileX * tileWidth, tileX * tileWidth + tileWidth, + tileY * tileHeight, tileY * tileHeight + tileHeight, + zBuffer, gBufferWidth, + cameraProj_33, cameraProj_43, cameraNear, cameraFar, + minZ, maxZ); + minZArray[tileX] = minZ; + maxZArray[tileX] = maxZ; + } +} + + +// Reclassifies the lights with respect to four sub-tiles when we refine a tile. +// numLights need not be a multiple of programCount here, but the input and output arrays +// should be able to handle programCount-sized load/stores. +export void +SplitTileMinMax( + uniform int32 tileMidX, uniform int32 tileMidY, + // Subtile data (00, 10, 01, 11) + uniform float subtileMinZ[], + uniform float subtileMaxZ[], + // G-buffer data + uniform int32 gBufferWidth, uniform int32 gBufferHeight, + // Camera data + uniform float cameraProj_11, uniform float cameraProj_22, + // Light Data + uniform int32 lightIndices[], + uniform int32 numLights, + uniform float light_positionView_x_array[], + uniform float light_positionView_y_array[], + uniform float light_positionView_z_array[], + uniform float light_attenuationEnd_array[], + // Outputs + uniform int32 subtileIndices[], + uniform int32 subtileIndicesPitch, + uniform int32 subtileNumLights[] + ) +{ + uniform float gBufferScale_x = 0.5f * (float)gBufferWidth; + uniform float gBufferScale_y = 0.5f * (float)gBufferHeight; + + uniform float frustumPlanes_xy[2] = { -(cameraProj_11 * gBufferScale_x), + (cameraProj_22 * gBufferScale_y) }; + uniform float frustumPlanes_z[2] = { tileMidX - gBufferScale_x, + tileMidY - gBufferScale_y }; + + // Normalize + uniform float norm[2] = { rsqrt(frustumPlanes_xy[0] * frustumPlanes_xy[0] + + frustumPlanes_z[0] * frustumPlanes_z[0]), + rsqrt(frustumPlanes_xy[1] * frustumPlanes_xy[1] + + frustumPlanes_z[1] * frustumPlanes_z[1]) }; + frustumPlanes_xy[0] *= norm[0]; + frustumPlanes_xy[1] *= norm[1]; + frustumPlanes_z[0] *= norm[0]; + frustumPlanes_z[1] *= norm[1]; + + // Initialize + uniform int32 subtileLightOffset[4]; + subtileLightOffset[0] = 0 * subtileIndicesPitch; + subtileLightOffset[1] = 1 * subtileIndicesPitch; + subtileLightOffset[2] = 2 * subtileIndicesPitch; + subtileLightOffset[3] = 3 * subtileIndicesPitch; + + foreach (i = 0 ... numLights) { + int32 lightIndex = lightIndices[i]; + + float light_positionView_x = light_positionView_x_array[lightIndex]; + float light_positionView_y = light_positionView_y_array[lightIndex]; + float light_positionView_z = light_positionView_z_array[lightIndex]; + float light_attenuationEnd = light_attenuationEnd_array[lightIndex]; + float light_attenuationEndNeg = -light_attenuationEnd; + + // Test lights again subtile z bounds + bool inFrustum[4]; + inFrustum[0] = (light_positionView_z - subtileMinZ[0] >= light_attenuationEndNeg) && + (subtileMaxZ[0] - light_positionView_z >= light_attenuationEndNeg); + inFrustum[1] = (light_positionView_z - subtileMinZ[1] >= light_attenuationEndNeg) && + (subtileMaxZ[1] - light_positionView_z >= light_attenuationEndNeg); + inFrustum[2] = (light_positionView_z - subtileMinZ[2] >= light_attenuationEndNeg) && + (subtileMaxZ[2] - light_positionView_z >= light_attenuationEndNeg); + inFrustum[3] = (light_positionView_z - subtileMinZ[3] >= light_attenuationEndNeg) && + (subtileMaxZ[3] - light_positionView_z >= light_attenuationEndNeg); + + float dx = light_positionView_z * frustumPlanes_z[0] + + light_positionView_x * frustumPlanes_xy[0]; + float dy = light_positionView_z * frustumPlanes_z[1] + + light_positionView_y * frustumPlanes_xy[1]; + + cif (abs(dx) > light_attenuationEnd) { + bool positiveX = dx > 0.0f; + inFrustum[0] = inFrustum[0] && positiveX; // 00 subtile + inFrustum[1] = inFrustum[1] && !positiveX; // 10 subtile + inFrustum[2] = inFrustum[2] && positiveX; // 01 subtile + inFrustum[3] = inFrustum[3] && !positiveX; // 11 subtile + } + cif (abs(dy) > light_attenuationEnd) { + bool positiveY = dy > 0.0f; + inFrustum[0] = inFrustum[0] && positiveY; // 00 subtile + inFrustum[1] = inFrustum[1] && positiveY; // 10 subtile + inFrustum[2] = inFrustum[2] && !positiveY; // 01 subtile + inFrustum[3] = inFrustum[3] && !positiveY; // 11 subtile + } + + // Pack and store intersecting lights + // TODO: Experiment with a loop here instead + cif (inFrustum[0]) + subtileLightOffset[0] += + packed_store_active(&subtileIndices[subtileLightOffset[0]], + lightIndex); + cif (inFrustum[1]) + subtileLightOffset[1] += + packed_store_active(&subtileIndices[subtileLightOffset[1]], + lightIndex); + cif (inFrustum[2]) + subtileLightOffset[2] += + packed_store_active(&subtileIndices[subtileLightOffset[2]], + lightIndex); + cif (inFrustum[3]) + subtileLightOffset[3] += + packed_store_active(&subtileIndices[subtileLightOffset[3]], + lightIndex); + } + + subtileNumLights[0] = subtileLightOffset[0] - 0 * subtileIndicesPitch; + subtileNumLights[1] = subtileLightOffset[1] - 1 * subtileIndicesPitch; + subtileNumLights[2] = subtileLightOffset[2] - 2 * subtileIndicesPitch; + subtileNumLights[3] = subtileLightOffset[3] - 3 * subtileIndicesPitch; +} diff --git a/examples_ptx/deferred/kernels1.ispc b/examples_ptx/deferred/kernels1.ispc new file mode 100644 index 00000000..1c0962cc --- /dev/null +++ b/examples_ptx/deferred/kernels1.ispc @@ -0,0 +1,556 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifdef __NVPTX__ +#warning "emitting DEVICE code" +#define programCount warpSize() +#define programIndex laneIndex() +#define taskIndex blockIndex0() +#define taskCount blockCount0() +#define cif if +#else +#warning "emitting HOST code" +#endif + + +#include "deferred.h" + +struct InputDataArrays +{ + float *zBuffer; + unsigned int16 *normalEncoded_x; // half float + unsigned int16 *normalEncoded_y; // half float + unsigned int16 *specularAmount; // half float + unsigned int16 *specularPower; // half float + unsigned int8 *albedo_x; // unorm8 + unsigned int8 *albedo_y; // unorm8 + unsigned int8 *albedo_z; // unorm8 + float *lightPositionView_x; + float *lightPositionView_y; + float *lightPositionView_z; + float *lightAttenuationBegin; + float *lightColor_x; + float *lightColor_y; + float *lightColor_z; + float *lightAttenuationEnd; +}; + +struct InputHeader +{ + float cameraProj[4][4]; + float cameraNear; + float cameraFar; + + int32 framebufferWidth; + int32 framebufferHeight; + int32 numLights; + int32 inputDataChunkSize; + int32 inputDataArrayOffsets[idaNum]; +}; + + +/////////////////////////////////////////////////////////////////////////// +// Common utility routines + +static inline float +dot3(float x, float y, float z, float a, float b, float c) { + return (x*a + y*b + z*c); +} + + +static inline void +normalize3(float x, float y, float z, float &ox, float &oy, float &oz) { + float n = rsqrt(x*x + y*y + z*z); + ox = x * n; + oy = y * n; + oz = z * n; +} + + +static inline float +Unorm8ToFloat32(unsigned int8 u) { + return (float)u * (1.0f / 255.0f); +} + + +static inline unsigned int8 +Float32ToUnorm8(float f) { + return (unsigned int8)(f * 255.0f); +} + + +static inline void +ComputeZBounds( + uniform int32 tileStartX, uniform int32 tileEndX, + uniform int32 tileStartY, uniform int32 tileEndY, + // G-buffer data + uniform float zBuffer[], + uniform int32 gBufferWidth, + // Camera data + uniform float cameraProj_33, uniform float cameraProj_43, + uniform float cameraNear, uniform float cameraFar, + // Output + uniform float &minZ, + uniform float &maxZ + ) +{ + // Find Z bounds + float laneMinZ = cameraFar; + float laneMaxZ = cameraNear; + for (uniform int32 y = tileStartY; y < tileEndY; ++y) + foreach (x = tileStartX ... tileEndX) + { + // Unproject depth buffer Z value into view space + float z = zBuffer[y * gBufferWidth + x]; + float viewSpaceZ = cameraProj_43 / (z - cameraProj_33); + + // Work out Z bounds for our samples + // Avoid considering skybox/background or otherwise invalid pixels + if ((viewSpaceZ < cameraFar) && (viewSpaceZ >= cameraNear)) { + laneMinZ = min(laneMinZ, viewSpaceZ); + laneMaxZ = max(laneMaxZ, viewSpaceZ); + } + } + minZ = reduce_min(laneMinZ); + maxZ = reduce_max(laneMaxZ); +} + + +static inline uniform int32 +IntersectLightsWithTileMinMax( + uniform int32 tileStartX, uniform int32 tileEndX, + uniform int32 tileStartY, uniform int32 tileEndY, + // Tile data + uniform float minZ, + uniform float maxZ, + // G-buffer data + uniform int32 gBufferWidth, uniform int32 gBufferHeight, + // Camera data + uniform float cameraProj_11, uniform float cameraProj_22, + // Light Data + uniform int32 numLights, + uniform float light_positionView_x_array[], + uniform float light_positionView_y_array[], + uniform float light_positionView_z_array[], + uniform float light_attenuationEnd_array[], + // Output + uniform int32 tileLightIndices[] + ) +{ + uniform float gBufferScale_x = 0.5f * (float)gBufferWidth; + uniform float gBufferScale_y = 0.5f * (float)gBufferHeight; + + uniform float frustumPlanes_xy[4] = { + -(cameraProj_11 * gBufferScale_x), + (cameraProj_11 * gBufferScale_x), + (cameraProj_22 * gBufferScale_y), + -(cameraProj_22 * gBufferScale_y) }; + uniform float frustumPlanes_z[4] = { + tileEndX - gBufferScale_x, + -tileStartX + gBufferScale_x, + tileEndY - gBufferScale_y, + -tileStartY + gBufferScale_y }; + + for (uniform int i = 0; i < 4; ++i) { + uniform float norm = rsqrt(frustumPlanes_xy[i] * frustumPlanes_xy[i] + + frustumPlanes_z[i] * frustumPlanes_z[i]); + frustumPlanes_xy[i] *= norm; + frustumPlanes_z[i] *= norm; + } + + uniform int32 tileNumLights = 0; + + foreach (lightIndex = 0 ... numLights) + { + float light_positionView_z = light_positionView_z_array[lightIndex]; + float light_attenuationEnd = light_attenuationEnd_array[lightIndex]; + float light_attenuationEndNeg = -light_attenuationEnd; + + float d = light_positionView_z - minZ; + bool inFrustum = (d >= light_attenuationEndNeg); + + d = maxZ - light_positionView_z; + inFrustum = inFrustum && (d >= light_attenuationEndNeg); + + // This seems better than cif(!inFrustum) ccontinue; here since we + // don't actually need to mask the rest of this function - this is + // just a greedy early-out. Could also structure all of this as + // nested if() statements, but this a bit easier to read + if (any(inFrustum)) + { + float light_positionView_x = light_positionView_x_array[lightIndex]; + float light_positionView_y = light_positionView_y_array[lightIndex]; + + d = light_positionView_z * frustumPlanes_z[0] + + light_positionView_x * frustumPlanes_xy[0]; + inFrustum = inFrustum && (d >= light_attenuationEndNeg); + + d = light_positionView_z * frustumPlanes_z[1] + + light_positionView_x * frustumPlanes_xy[1]; + inFrustum = inFrustum && (d >= light_attenuationEndNeg); + + d = light_positionView_z * frustumPlanes_z[2] + + light_positionView_y * frustumPlanes_xy[2]; + inFrustum = inFrustum && (d >= light_attenuationEndNeg); + + d = light_positionView_z * frustumPlanes_z[3] + + light_positionView_y * frustumPlanes_xy[3]; + inFrustum = inFrustum && (d >= light_attenuationEndNeg); + + // Pack and store intersecting lights + const bool active = inFrustum && lightIndex < numLights; + + if(any(active)) + tileNumLights += packed_store_active(active, &tileLightIndices[tileNumLights], lightIndex); + } + } + + return tileNumLights; +} + + +static inline uniform int32 +IntersectLightsWithTile( + uniform int32 tileStartX, uniform int32 tileEndX, + uniform int32 tileStartY, uniform int32 tileEndY, + uniform int32 gBufferWidth, uniform int32 gBufferHeight, + // G-buffer data + uniform float zBuffer[], + // Camera data + uniform float cameraProj_11, uniform float cameraProj_22, + uniform float cameraProj_33, uniform float cameraProj_43, + uniform float cameraNear, uniform float cameraFar, + // Light Data + uniform int32 numLights, + uniform float light_positionView_x_array[], + uniform float light_positionView_y_array[], + uniform float light_positionView_z_array[], + uniform float light_attenuationEnd_array[], + // Output + uniform int32 tileLightIndices[] + ) +{ + uniform float minZ, maxZ; + ComputeZBounds(tileStartX, tileEndX, tileStartY, tileEndY, + zBuffer, gBufferWidth, cameraProj_33, cameraProj_43, cameraNear, cameraFar, + minZ, maxZ); + + uniform int32 tileNumLights = IntersectLightsWithTileMinMax( + tileStartX, tileEndX, tileStartY, tileEndY, minZ, maxZ, + gBufferWidth, gBufferHeight, cameraProj_11, cameraProj_22, + MAX_LIGHTS, light_positionView_x_array, light_positionView_y_array, + light_positionView_z_array, light_attenuationEnd_array, + tileLightIndices); + + return tileNumLights; +} + + +static inline void +ShadeTile( + uniform int32 tileStartX, uniform int32 tileEndX, + uniform int32 tileStartY, uniform int32 tileEndY, + uniform int32 gBufferWidth, uniform int32 gBufferHeight, + const uniform InputDataArrays &inputData, + // Camera data + uniform float cameraProj_11, uniform float cameraProj_22, + uniform float cameraProj_33, uniform float cameraProj_43, + // Light list + uniform int32 tileLightIndices[], + uniform int32 tileNumLights, + // UI + uniform bool visualizeLightCount, + // Output + uniform unsigned int8 framebuffer_r[], + uniform unsigned int8 framebuffer_g[], + uniform unsigned int8 framebuffer_b[] + ) +{ + if (tileNumLights == 0 || visualizeLightCount) { + uniform unsigned int8 c = (unsigned int8)(min(tileNumLights << 2, 255)); + for (uniform int32 y = tileStartY; y < tileEndY; ++y) + foreach (x = tileStartX ... tileEndX) + { + int32 framebufferIndex = (y * gBufferWidth + x); + framebuffer_r[framebufferIndex] = c; + framebuffer_g[framebufferIndex] = c; + framebuffer_b[framebufferIndex] = c; + } + } else { + uniform float twoOverGBufferWidth = 2.0f / gBufferWidth; + uniform float twoOverGBufferHeight = 2.0f / gBufferHeight; + + for (uniform int32 y = tileStartY; y < tileEndY; ++y) { + uniform float positionScreen_y = -(((0.5f + y) * twoOverGBufferHeight) - 1.f); + + foreach (x = tileStartX ... tileEndX) { + int32 gBufferOffset = y * gBufferWidth + x; + + // Reconstruct position and (negative) view vector from G-buffer + float surface_positionView_x, surface_positionView_y, surface_positionView_z; + float Vneg_x, Vneg_y, Vneg_z; + + float z = inputData.zBuffer[gBufferOffset]; + + // Compute screen/clip-space position + // NOTE: Mind DX11 viewport transform and pixel center! + float positionScreen_x = (0.5f + (float)(x)) * + twoOverGBufferWidth - 1.0f; + + // Unproject depth buffer Z value into view space + surface_positionView_z = cameraProj_43 / (z - cameraProj_33); + surface_positionView_x = positionScreen_x * surface_positionView_z / + cameraProj_11; + surface_positionView_y = positionScreen_y * surface_positionView_z / + cameraProj_22; + + // We actually end up with a vector pointing *at* the + // surface (i.e. the negative view vector) + normalize3(surface_positionView_x, surface_positionView_y, + surface_positionView_z, Vneg_x, Vneg_y, Vneg_z); + + // Reconstruct normal from G-buffer + float surface_normal_x, surface_normal_y, surface_normal_z; + float normal_x = half_to_float(inputData.normalEncoded_x[gBufferOffset]); + float normal_y = half_to_float(inputData.normalEncoded_y[gBufferOffset]); + + float f = (normal_x - normal_x * normal_x) + (normal_y - normal_y * normal_y); + float m = sqrt(4.0f * f - 1.0f); + + surface_normal_x = m * (4.0f * normal_x - 2.0f); + surface_normal_y = m * (4.0f * normal_y - 2.0f); + surface_normal_z = 3.0f - 8.0f * f; + + // Load other G-buffer parameters + float surface_specularAmount = + half_to_float(inputData.specularAmount[gBufferOffset]); + float surface_specularPower = + half_to_float(inputData.specularPower[gBufferOffset]); + float surface_albedo_x = Unorm8ToFloat32(inputData.albedo_x[gBufferOffset]); + float surface_albedo_y = Unorm8ToFloat32(inputData.albedo_y[gBufferOffset]); + float surface_albedo_z = Unorm8ToFloat32(inputData.albedo_z[gBufferOffset]); + + float lit_x = 0.0f; + float lit_y = 0.0f; + float lit_z = 0.0f; + for (uniform int32 tileLightIndex = 0; tileLightIndex < tileNumLights; + ++tileLightIndex) { + uniform int32 lightIndex = tileLightIndices[tileLightIndex]; + + // Gather light data relevant to initial culling + uniform float light_positionView_x = + inputData.lightPositionView_x[lightIndex]; + uniform float light_positionView_y = + inputData.lightPositionView_y[lightIndex]; + uniform float light_positionView_z = + inputData.lightPositionView_z[lightIndex]; + uniform float light_attenuationEnd = + inputData.lightAttenuationEnd[lightIndex]; + + // Compute light vector + float L_x = light_positionView_x - surface_positionView_x; + float L_y = light_positionView_y - surface_positionView_y; + float L_z = light_positionView_z - surface_positionView_z; + + float distanceToLight2 = dot3(L_x, L_y, L_z, L_x, L_y, L_z); + + // Clip at end of attenuation + float light_attenutaionEnd2 = light_attenuationEnd * light_attenuationEnd; + + cif (distanceToLight2 < light_attenutaionEnd2) { + float distanceToLight = sqrt(distanceToLight2); + + // HLSL "rcp" is allowed to be fairly inaccurate + float distanceToLightRcp = rcp(distanceToLight); + L_x *= distanceToLightRcp; + L_y *= distanceToLightRcp; + L_z *= distanceToLightRcp; + + // Start computing brdf + float NdotL = dot3(surface_normal_x, surface_normal_y, + surface_normal_z, L_x, L_y, L_z); + + // Clip back facing + cif (NdotL > 0.0f) { + uniform float light_attenuationBegin = + inputData.lightAttenuationBegin[lightIndex]; + + // Light distance attenuation (linstep) + float lightRange = (light_attenuationEnd - light_attenuationBegin); + float falloffPosition = (light_attenuationEnd - distanceToLight); + float attenuation = min(falloffPosition / lightRange, 1.0f); + + float H_x = (L_x - Vneg_x); + float H_y = (L_y - Vneg_y); + float H_z = (L_z - Vneg_z); + normalize3(H_x, H_y, H_z, H_x, H_y, H_z); + + float NdotH = dot3(surface_normal_x, surface_normal_y, + surface_normal_z, H_x, H_y, H_z); + NdotH = max(NdotH, 0.0f); + + float specular = pow(NdotH, surface_specularPower); + float specularNorm = (surface_specularPower + 2.0f) * + (1.0f / 8.0f); + float specularContrib = surface_specularAmount * + specularNorm * specular; + + float k = attenuation * NdotL * (1.0f + specularContrib); + + uniform float light_color_x = inputData.lightColor_x[lightIndex]; + uniform float light_color_y = inputData.lightColor_y[lightIndex]; + uniform float light_color_z = inputData.lightColor_z[lightIndex]; + + float lightContrib_x = surface_albedo_x * light_color_x; + float lightContrib_y = surface_albedo_y * light_color_y; + float lightContrib_z = surface_albedo_z * light_color_z; + + lit_x += lightContrib_x * k; + lit_y += lightContrib_y * k; + lit_z += lightContrib_z * k; + } + } + } + + // Gamma correct + // These pows are pretty slow right now, but we can do + // something faster if really necessary to squeeze every + // last bit of performance out of it + float gamma = 1.0 / 2.2f; + lit_x = pow(clamp(lit_x, 0.0f, 1.0f), gamma); + lit_y = pow(clamp(lit_y, 0.0f, 1.0f), gamma); + lit_z = pow(clamp(lit_z, 0.0f, 1.0f), gamma); + + framebuffer_r[gBufferOffset] = Float32ToUnorm8(lit_x); + framebuffer_g[gBufferOffset] = Float32ToUnorm8(lit_y); + framebuffer_b[gBufferOffset] = Float32ToUnorm8(lit_z); + } + } + } +} + + +/////////////////////////////////////////////////////////////////////////// +// Static decomposition + +void task +RenderTile(uniform int num_groups_x, uniform int num_groups_y, + const uniform InputHeader inputHeaderPtr[], + const uniform InputDataArrays inputDataPtr[], + uniform int visualizeLightCount, + // Output + uniform unsigned int8 framebuffer_r[], + uniform unsigned int8 framebuffer_g[], + uniform unsigned int8 framebuffer_b[]) { + if (taskIndex >= taskCount) return; + const uniform InputHeader inputHeader = *inputHeaderPtr; + const uniform InputDataArrays inputData = *inputDataPtr; + + uniform int32 group_y = taskIndex / num_groups_x; + uniform int32 group_x = taskIndex % num_groups_x; + uniform int32 tile_start_x = group_x * MIN_TILE_WIDTH; + uniform int32 tile_start_y = group_y * MIN_TILE_HEIGHT; + uniform int32 tile_end_x = tile_start_x + MIN_TILE_WIDTH; + uniform int32 tile_end_y = tile_start_y + MIN_TILE_HEIGHT; + + uniform int framebufferWidth = inputHeader.framebufferWidth; + uniform int framebufferHeight = inputHeader.framebufferHeight; + uniform float cameraProj_00 = inputHeader.cameraProj[0][0]; + uniform float cameraProj_11 = inputHeader.cameraProj[1][1]; + uniform float cameraProj_22 = inputHeader.cameraProj[2][2]; + uniform float cameraProj_32 = inputHeader.cameraProj[3][2]; + + // Light intersection: figure out which lights illuminate this tile. +#if 1 + uniform int * uniform tileLightIndices = uniform new uniform int [MAX_LIGHTS]; +#else + uniform int tileLightIndices[MAX_LIGHTS]; // Light list for the tile +#endif +#if 1 + uniform int numTileLights = + IntersectLightsWithTile(tile_start_x, tile_end_x, + tile_start_y, tile_end_y, + framebufferWidth, framebufferHeight, + inputData.zBuffer, + cameraProj_00, cameraProj_11, + cameraProj_22, cameraProj_32, + inputHeader.cameraNear, inputHeader.cameraFar, + MAX_LIGHTS, + inputData.lightPositionView_x, + inputData.lightPositionView_y, + inputData.lightPositionView_z, + inputData.lightAttenuationEnd, + tileLightIndices); + + // And now shade the tile, using the lights in tileLightIndices + ShadeTile(tile_start_x, tile_end_x, tile_start_y, tile_end_y, + framebufferWidth, framebufferHeight, inputData, + cameraProj_00, cameraProj_11, cameraProj_22, cameraProj_32, + tileLightIndices, numTileLights, visualizeLightCount, + framebuffer_r, framebuffer_g, framebuffer_b); +#endif +#if 1 + delete tileLightIndices; +#endif +} + + +export void +RenderStatic(uniform InputHeader inputHeaderPtr[], + uniform InputDataArrays inputDataPtr[], + uniform int visualizeLightCount, + // Output + uniform unsigned int8 framebuffer_r[], + uniform unsigned int8 framebuffer_g[], + uniform unsigned int8 framebuffer_b[]) { + + const uniform InputHeader inputHeader = *inputHeaderPtr; + const uniform InputDataArrays inputData = *inputDataPtr; + + + uniform int num_groups_x = (inputHeader.framebufferWidth + + MIN_TILE_WIDTH - 1) / MIN_TILE_WIDTH; + uniform int num_groups_y = (inputHeader.framebufferHeight + + MIN_TILE_HEIGHT - 1) / MIN_TILE_HEIGHT; + uniform int num_groups = num_groups_x * num_groups_y; + + // Launch a task to render each tile, each of which is MIN_TILE_WIDTH + // by MIN_TILE_HEIGHT pixels. + launch[num_groups] RenderTile(num_groups_x, num_groups_y, + inputHeaderPtr, inputDataPtr, visualizeLightCount, + framebuffer_r, framebuffer_g, framebuffer_b); + sync; +} + + + diff --git a/examples_ptx/deferred/main.cpp b/examples_ptx/deferred/main.cpp new file mode 100644 index 00000000..d7f62f50 --- /dev/null +++ b/examples_ptx/deferred/main.cpp @@ -0,0 +1,149 @@ +/* + Copyright (c) 2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifdef _MSC_VER +#define ISPC_IS_WINDOWS +#define NOMINMAX +#elif defined(__linux__) +#define ISPC_IS_LINUX +#elif defined(__APPLE__) +#define ISPC_IS_APPLE +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef ISPC_IS_WINDOWS + #define WIN32_LEAN_AND_MEAN + #include +#endif +#include "deferred.h" +#include "kernels_ispc.h" +#include "../timing.h" + +/////////////////////////////////////////////////////////////////////////// + +int main(int argc, char** argv) { + if (argc < 2) { + printf("usage: deferred_shading [tasks iterations] [serial iterations]\n"); + return 1; + } + static unsigned int test_iterations[] = {5, 3, 500}; //last value is for nframes, it is scale. + if (argc == 5) { + for (int i = 0; i < 3; i++) { + test_iterations[i] = atoi(argv[2 + i]); + } + } + + InputData *input = CreateInputDataFromFile(argv[1]); + if (!input) { + printf("Failed to load input file \"%s\"!\n", argv[1]); + return 1; + } + + Framebuffer framebuffer(input->header.framebufferWidth, + input->header.framebufferHeight); + + InitDynamicC(input); +#ifdef __cilk + InitDynamicCilk(input); +#endif // __cilk + + int nframes = test_iterations[2]; + double ispcCycles = 1e30; + for (int i = 0; i < test_iterations[0]; ++i) { + framebuffer.clear(); + reset_and_start_timer(); + for (int j = 0; j < nframes; ++j) + ispc::RenderStatic(input->header, input->arrays, + VISUALIZE_LIGHT_COUNT, + framebuffer.r, framebuffer.g, framebuffer.b); + double mcycles = get_elapsed_mcycles() / nframes; + printf("@time of ISPC + TASKS run:\t\t\t[%.3f] million cycles\n", mcycles); + ispcCycles = std::min(ispcCycles, mcycles); + } + printf("[ispc static + tasks]:\t\t[%.3f] million cycles to render " + "%d x %d image\n", ispcCycles, + input->header.framebufferWidth, input->header.framebufferHeight); + WriteFrame("deferred-ispc-static.ppm", input, framebuffer); + + nframes = 3; +#ifdef __cilk + double dynamicCilkCycles = 1e30; + for (int i = 0; i < test_iterations[1]; ++i) { + framebuffer.clear(); + reset_and_start_timer(); + for (int j = 0; j < nframes; ++j) + DispatchDynamicCilk(input, &framebuffer); + double mcycles = get_elapsed_mcycles() / nframes; + printf("@time of serial run:\t\t\t[%.3f] million cycles\n", mcycles); + dynamicCilkCycles = std::min(dynamicCilkCycles, mcycles); + } + printf("[ispc + Cilk dynamic]:\t\t[%.3f] million cycles to render image\n", + dynamicCilkCycles); + WriteFrame("deferred-ispc-dynamic.ppm", input, framebuffer); +#endif // __cilk + + double serialCycles = 1e30; + for (int i = 0; i < test_iterations[1]; ++i) { + framebuffer.clear(); + reset_and_start_timer(); + for (int j = 0; j < nframes; ++j) + DispatchDynamicC(input, &framebuffer); + double mcycles = get_elapsed_mcycles() / nframes; + printf("@time of serial run:\t\t\t[%.3f] million cycles\n", mcycles); + serialCycles = std::min(serialCycles, mcycles); + } + printf("[C++ serial dynamic, 1 core]:\t[%.3f] million cycles to render image\n", + serialCycles); + WriteFrame("deferred-serial-dynamic.ppm", input, framebuffer); + +#ifdef __cilk + printf("\t\t\t\t(%.2fx speedup from static ISPC, %.2fx from Cilk+ISPC)\n", + serialCycles/ispcCycles, serialCycles/dynamicCilkCycles); +#else + printf("\t\t\t\t(%.2fx speedup from ISPC + tasks)\n", serialCycles/ispcCycles); +#endif // __cilk + + DeleteInputData(input); + + return 0; +} diff --git a/examples_ptx/deferred/main_cu.cpp b/examples_ptx/deferred/main_cu.cpp new file mode 100644 index 00000000..4f2be879 --- /dev/null +++ b/examples_ptx/deferred/main_cu.cpp @@ -0,0 +1,139 @@ +/* + Copyright (c) 2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifdef _MSC_VER +#define ISPC_IS_WINDOWS +#define NOMINMAX +#elif defined(__linux__) +#define ISPC_IS_LINUX +#elif defined(__APPLE__) +#define ISPC_IS_APPLE +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef ISPC_IS_WINDOWS + #define WIN32_LEAN_AND_MEAN + #include +#endif +#include "deferred.h" +#include "kernels_ispc.h" +#include "../timing.h" + +/////////////////////////////////////////////////////////////////////////// + +int main(int argc, char** argv) { + if (argc != 2) { + printf("usage: deferred_shading \n"); + return 1; + } + + InputData *input = CreateInputDataFromFile(argv[1]); + if (!input) { + printf("Failed to load input file \"%s\"!\n", argv[1]); + return 1; + } + + Framebuffer framebuffer(input->header.framebufferWidth, + input->header.framebufferHeight); + + InitDynamicC(input); +#ifdef __cilk + InitDynamicCilk(input); +#endif // __cilk + + int nframes = 5; + double ispcCycles = 1e30; + for (int i = 0; i < 5; ++i) { + framebuffer.clear(); + reset_and_start_timer(); + for (int j = 0; j < nframes; ++j) + ispc::RenderStatic(input->header, input->arrays, + VISUALIZE_LIGHT_COUNT, + framebuffer.r, framebuffer.g, framebuffer.b); + double mcycles = get_elapsed_mcycles() / nframes; + ispcCycles = std::min(ispcCycles, mcycles); + } + printf("[ispc static + tasks]:\t\t[%.3f] million cycles to render " + "%d x %d image\n", ispcCycles, + input->header.framebufferWidth, input->header.framebufferHeight); + WriteFrame("deferred-ispc-static.ppm", input, framebuffer); + +#ifdef __cilk + double dynamicCilkCycles = 1e30; + for (int i = 0; i < 5; ++i) { + framebuffer.clear(); + reset_and_start_timer(); + for (int j = 0; j < nframes; ++j) + DispatchDynamicCilk(input, &framebuffer); + double mcycles = get_elapsed_mcycles() / nframes; + dynamicCilkCycles = std::min(dynamicCilkCycles, mcycles); + } + printf("[ispc + Cilk dynamic]:\t\t[%.3f] million cycles to render image\n", + dynamicCilkCycles); + WriteFrame("deferred-ispc-dynamic.ppm", input, framebuffer); +#endif // __cilk + + double serialCycles = 1e30; + for (int i = 0; i < 5; ++i) { + framebuffer.clear(); + reset_and_start_timer(); + for (int j = 0; j < nframes; ++j) + DispatchDynamicC(input, &framebuffer); + double mcycles = get_elapsed_mcycles() / nframes; + serialCycles = std::min(serialCycles, mcycles); + } + printf("[C++ serial dynamic, 1 core]:\t[%.3f] million cycles to render image\n", + serialCycles); + WriteFrame("deferred-serial-dynamic.ppm", input, framebuffer); + +#ifdef __cilk + printf("\t\t\t\t(%.2fx speedup from static ISPC, %.2fx from Cilk+ISPC)\n", + serialCycles/ispcCycles, serialCycles/dynamicCilkCycles); +#else + printf("\t\t\t\t(%.2fx speedup from ISPC + tasks)\n", serialCycles/ispcCycles); +#endif // __cilk + + DeleteInputData(input); + + return 0; +}