+CDP works with deferred shading

This commit is contained in:
Evghenii
2013-11-13 11:57:37 +01:00
parent 268be7f0b5
commit 62bc39e600
3 changed files with 159 additions and 52 deletions

View File

@@ -184,7 +184,7 @@ struct Uniform
__device__ inline Uniform() __device__ inline Uniform()
{ {
#if 1 #if 0
if (programIndex == 0) if (programIndex == 0)
data = new T[N]; data = new T[N];
ptr[0] = __shfl(ptr[0], 0); ptr[0] = __shfl(ptr[0], 0);
@@ -200,7 +200,7 @@ struct Uniform
} }
__device__ inline ~Uniform() __device__ inline ~Uniform()
{ {
#if 1 #if 0
if (programIndex == 0) if (programIndex == 0)
delete data; delete data;
#else #else
@@ -730,20 +730,21 @@ ShadeTile(
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
// Static decomposition // Static decomposition
extern "C" __global__ void
__global__ void
RenderTile( int num_groups_x, int num_groups_y, RenderTile( int num_groups_x, int num_groups_y,
const InputHeader *inputHeaderPtr, const InputHeader inputHeaderPtr[],
const InputDataArrays *inputDataPtr, const InputDataArrays inputDataPtr[],
int visualizeLightCount, int visualizeLightCount,
// Output // Output
unsigned int8 framebuffer_r[], unsigned int8 framebuffer_r[],
unsigned int8 framebuffer_g[], unsigned int8 framebuffer_g[],
unsigned int8 framebuffer_b[]) { unsigned int8 framebuffer_b[]) {
if (taskIndex >= taskCount) return; if (taskIndex >= taskCount) return;
const InputHeader &inputHeader = *inputHeaderPtr;
const InputDataArrays &inputData = *inputDataPtr;
#if 1 #if 1
const InputHeader inputHeader = *inputHeaderPtr;
const InputDataArrays inputData = *inputDataPtr;
int32 group_y = taskIndex / num_groups_x; int32 group_y = taskIndex / num_groups_x;
int32 group_x = taskIndex % num_groups_x; int32 group_x = taskIndex % num_groups_x;
@@ -794,3 +795,28 @@ RenderTile( int num_groups_x, int num_groups_y,
} }
extern "C"
__global__ void
RenderStatic(InputHeader inputHeaderPtr[],
InputDataArrays inputDataPtr[],
int visualizeLightCount,
// Output
unsigned int8 framebuffer_r[],
unsigned int8 framebuffer_g[],
unsigned int8 framebuffer_b[]) {
const InputHeader &inputHeader = *inputHeaderPtr;
const InputDataArrays &inputData = *inputDataPtr;
int num_groups_x = (inputHeader.framebufferWidth +
MIN_TILE_WIDTH - 1) / MIN_TILE_WIDTH;
int num_groups_y = (inputHeader.framebufferHeight +
MIN_TILE_HEIGHT - 1) / MIN_TILE_HEIGHT;
int num_groups = num_groups_x * num_groups_y;
// Launch a task to render each tile, each of which is MIN_TILE_WIDTH
// by MIN_TILE_HEIGHT pixels.
RenderTile<<<dim3(num_groups_x,num_groups_y), 128>>>(num_groups_x, num_groups_y,
inputHeaderPtr, inputDataPtr, visualizeLightCount,
framebuffer_r, framebuffer_g, framebuffer_b);
}

View File

@@ -477,7 +477,7 @@ ShadeTile(
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
// Static decomposition // Static decomposition
task void void task
RenderTile(uniform int num_groups_x, uniform int num_groups_y, RenderTile(uniform int num_groups_x, uniform int num_groups_y,
const uniform InputHeader inputHeaderPtr[], const uniform InputHeader inputHeaderPtr[],
const uniform InputDataArrays inputDataPtr[], const uniform InputDataArrays inputDataPtr[],
@@ -487,7 +487,6 @@ RenderTile(uniform int num_groups_x, uniform int num_groups_y,
uniform unsigned int8 framebuffer_g[], uniform unsigned int8 framebuffer_g[],
uniform unsigned int8 framebuffer_b[]) { uniform unsigned int8 framebuffer_b[]) {
if (taskIndex >= taskCount) return; if (taskIndex >= taskCount) return;
const uniform InputHeader inputHeader = *inputHeaderPtr; const uniform InputHeader inputHeader = *inputHeaderPtr;
const uniform InputDataArrays inputData = *inputDataPtr; const uniform InputDataArrays inputData = *inputDataPtr;
@@ -543,13 +542,16 @@ RenderTile(uniform int num_groups_x, uniform int num_groups_y,
export void export void
RenderStatic(uniform InputHeader inputHeaderPtr[], RenderStatic(uniform InputHeader inputHeaderPtr[],
uniform InputDataArrays inputDataPtr[], uniform InputDataArrays inputDataPtr[],
uniform InputHeader &inputHeader,
uniform int visualizeLightCount, uniform int visualizeLightCount,
// Output // Output
uniform unsigned int8 framebuffer_r[], uniform unsigned int8 framebuffer_r[],
uniform unsigned int8 framebuffer_g[], uniform unsigned int8 framebuffer_g[],
uniform unsigned int8 framebuffer_b[]) { uniform unsigned int8 framebuffer_b[]) {
const uniform InputHeader inputHeader = *inputHeaderPtr;
const uniform InputDataArrays inputData = *inputDataPtr;
uniform int num_groups_x = (inputHeader.framebufferWidth + uniform int num_groups_x = (inputHeader.framebufferWidth +
MIN_TILE_WIDTH - 1) / MIN_TILE_WIDTH; MIN_TILE_WIDTH - 1) / MIN_TILE_WIDTH;
uniform int num_groups_y = (inputHeader.framebufferHeight + uniform int num_groups_y = (inputHeader.framebufferHeight +

View File

@@ -124,10 +124,12 @@ void destroyContext()
CUmodule loadModule(const char * module) CUmodule loadModule(const char * module)
{ {
const double t0 = rtc();
CUmodule cudaModule; CUmodule cudaModule;
// in this branch we use compilation with parameters // in this branch we use compilation with parameters
const unsigned int jitNumOptions = 1; #if 0
unsigned int jitNumOptions = 1;
CUjit_option *jitOptions = new CUjit_option[jitNumOptions]; CUjit_option *jitOptions = new CUjit_option[jitNumOptions];
void **jitOptVals = new void*[jitNumOptions]; void **jitOptVals = new void*[jitNumOptions];
// set up pointer to set the Maximum # of registers for a particular kernel // set up pointer to set the Maximum # of registers for a particular kernel
@@ -136,6 +138,8 @@ CUmodule loadModule(const char * module)
jitOptVals[0] = (void *)(size_t)jitRegCount; jitOptVals[0] = (void *)(size_t)jitRegCount;
#if 0 #if 0
{
jitNumOptions = 3;
// set up size of compilation log buffer // set up size of compilation log buffer
jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
int jitLogBufferSize = 1024; int jitLogBufferSize = 1024;
@@ -150,9 +154,90 @@ CUmodule loadModule(const char * module)
jitOptions[2] = CU_JIT_MAX_REGISTERS; jitOptions[2] = CU_JIT_MAX_REGISTERS;
int jitRegCount = 32; int jitRegCount = 32;
jitOptVals[2] = (void *)(size_t)jitRegCount; jitOptVals[2] = (void *)(size_t)jitRegCount;
}
#endif #endif
checkCudaErrors(cuModuleLoadDataEx(&cudaModule, module,jitNumOptions, jitOptions, (void **)jitOptVals)); checkCudaErrors(cuModuleLoadDataEx(&cudaModule, module,jitNumOptions, jitOptions, (void **)jitOptVals));
#else
CUlinkState CUState;
CUlinkState *lState = &CUState;
const int nOptions = 7;
CUjit_option options[nOptions];
void* optionVals[nOptions];
float walltime;
const unsigned int logSize = 32768;
char error_log[logSize],
info_log[logSize];
void *cuOut;
size_t outSize;
int myErr = 0;
// Setup linker options
// Return walltime from JIT compilation
options[0] = CU_JIT_WALL_TIME;
optionVals[0] = (void*) &walltime;
// Pass a buffer for info messages
options[1] = CU_JIT_INFO_LOG_BUFFER;
optionVals[1] = (void*) info_log;
// Pass the size of the info buffer
options[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
optionVals[2] = (void*) logSize;
// Pass a buffer for error message
options[3] = CU_JIT_ERROR_LOG_BUFFER;
optionVals[3] = (void*) error_log;
// Pass the size of the error buffer
options[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
optionVals[4] = (void*) logSize;
// Make the linker verbose
options[5] = CU_JIT_LOG_VERBOSE;
optionVals[5] = (void*) 1;
// Max # of registers/pthread
options[6] = CU_JIT_MAX_REGISTERS;
int jitRegCount = 48;
optionVals[6] = (void *)(size_t)jitRegCount;
// Create a pending linker invocation
checkCudaErrors(cuLinkCreate(nOptions,options, optionVals, lState));
#if 0
if (sizeof(void *)==4)
{
// Load the PTX from the string myPtx32
printf("Loading myPtx32[] program\n");
// PTX May also be loaded from file, as per below.
myErr = cuLinkAddData(*lState, CU_JIT_INPUT_PTX, (void*)myPtx32, strlen(myPtx32)+1, 0, 0, 0, 0);
}
else
#endif
{
// Load the PTX from the string myPtx (64-bit)
fprintf(stderr, "Loading ptx..\n");
myErr = cuLinkAddData(*lState, CU_JIT_INPUT_PTX, (void*)module, strlen(module)+1, 0, 0, 0, 0);
myErr = cuLinkAddFile(*lState, CU_JIT_INPUT_LIBRARY, "libcudadevrt.a", 0,0,0);
// PTX May also be loaded from file, as per below.
// myErr = cuLinkAddFile(*lState, CU_JIT_INPUT_PTX, "myPtx64.ptx",0,0,0);
}
// Complete the linker step
myErr = cuLinkComplete(*lState, &cuOut, &outSize);
if ( myErr != CUDA_SUCCESS )
{
// Errors will be put in error_log, per CU_JIT_ERROR_LOG_BUFFER option above.
fprintf(stderr,"PTX Linker Error:\n%s\n",error_log);
assert(0);
}
// Linker walltime and info_log were requested in options above.
fprintf(stderr, "CUDA Link Completed in %fms [ %g ms]. Linker Output:\n%s\n",walltime,info_log,1e3*(rtc() - t0));
// Load resulting cuBin into module
checkCudaErrors(cuModuleLoadData(&cudaModule, cuOut));
// Destroy the linker invocation
checkCudaErrors(cuLinkDestroy(*lState));
#endif
fprintf(stderr, " loadModule took %g ms \n", 1e3*(rtc() - t0));
return cudaModule; return cudaModule;
} }
void unloadModule(CUmodule &cudaModule) void unloadModule(CUmodule &cudaModule)
@@ -185,16 +270,17 @@ void memcpyH2D(CUdeviceptr d_buf, void * h_buf, const size_t size)
{ {
checkCudaErrors(cuMemcpyHtoD(d_buf, h_buf, size)); checkCudaErrors(cuMemcpyHtoD(d_buf, h_buf, size));
} }
#define deviceLaunch(func,nbx,nby,nbz,params) \ #define deviceLaunch(func,params) \
checkCudaErrors(cuFuncSetCacheConfig((func), CU_FUNC_CACHE_PREFER_SHARED)); \ checkCudaErrors(cuFuncSetCacheConfig((func), CU_FUNC_CACHE_PREFER_SHARED)); \
checkCudaErrors( \ checkCudaErrors( \
cuLaunchKernel( \ cuLaunchKernel( \
(func), \ (func), \
((nbx-1)/(128/32)+1), (nby), (nbz), \ 1,1,1, \
128, 1, 1, \ 32, 1, 1, \
0, NULL, (params), NULL \ 0, NULL, (params), NULL \
)); ));
typedef CUdeviceptr devicePtr; typedef CUdeviceptr devicePtr;
@@ -241,25 +327,14 @@ extern "C"
} }
void CUDALaunch( void CUDALaunch(
void **handlePtr, void **handlePtr,
const char * module_name,
const char * module_1,
const char * func_name, const char * func_name,
void **func_args, void **func_args)
int countx, int county, int countz)
{ {
assert(module_name != NULL); const std::vector<char> module_str = readBinary("kernel.ptx");
assert(module_1 != NULL);
assert(func_name != NULL);
assert(func_args != NULL);
#if 0
const char * module = module_1;
#else
const std::vector<char> module_str = readBinary("kernel.cubin");
const char * module = &module_str[0]; const char * module = &module_str[0];
#endif
CUmodule cudaModule = loadModule(module); CUmodule cudaModule = loadModule(module);
CUfunction cudaFunction = getFunction(cudaModule, func_name); CUfunction cudaFunction = getFunction(cudaModule, func_name);
deviceLaunch(cudaFunction, countx, county, countz, func_args); deviceLaunch(cudaFunction, func_args);
unloadModule(cudaModule); unloadModule(cudaModule);
} }
void CUDASync(void *handle) void CUDASync(void *handle)
@@ -372,14 +447,18 @@ int main(int argc, char** argv) {
framebuffer.clear(); framebuffer.clear();
const double t0 = rtc(); const double t0 = rtc();
for (int j = 0; j < nframes; ++j) for (int j = 0; j < nframes; ++j)
ispc::RenderStatic( {
(ispc::InputHeader*)d_header, const char * func_name = "RenderStatic";
(ispc::InputDataArrays*)d_arrays, int light_count = VISUALIZE_LIGHT_COUNT;
input->header, void *func_args[] = {
VISUALIZE_LIGHT_COUNT, &d_header,
(uint8_t*)d_r, &d_arrays,
(uint8_t*)d_g, &light_count,
(uint8_t*)d_b); &d_r,
&d_g,
&d_b};
CUDALaunch(NULL, func_name, func_args);
}
double mcycles = 1000*(rtc() - t0) / nframes; double mcycles = 1000*(rtc() - t0) / nframes;
fprintf(stderr, "dt= %g\n", mcycles); fprintf(stderr, "dt= %g\n", mcycles);
ispcCycles = std::min(ispcCycles, mcycles); ispcCycles = std::min(ispcCycles, mcycles);