+CDP works with deferred shading
This commit is contained in:
@@ -184,7 +184,7 @@ struct Uniform
|
|||||||
|
|
||||||
__device__ inline Uniform()
|
__device__ inline Uniform()
|
||||||
{
|
{
|
||||||
#if 1
|
#if 0
|
||||||
if (programIndex == 0)
|
if (programIndex == 0)
|
||||||
data = new T[N];
|
data = new T[N];
|
||||||
ptr[0] = __shfl(ptr[0], 0);
|
ptr[0] = __shfl(ptr[0], 0);
|
||||||
@@ -200,7 +200,7 @@ struct Uniform
|
|||||||
}
|
}
|
||||||
__device__ inline ~Uniform()
|
__device__ inline ~Uniform()
|
||||||
{
|
{
|
||||||
#if 1
|
#if 0
|
||||||
if (programIndex == 0)
|
if (programIndex == 0)
|
||||||
delete data;
|
delete data;
|
||||||
#else
|
#else
|
||||||
@@ -730,20 +730,21 @@ ShadeTile(
|
|||||||
///////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////
|
||||||
// Static decomposition
|
// Static decomposition
|
||||||
|
|
||||||
extern "C" __global__ void
|
|
||||||
|
__global__ void
|
||||||
RenderTile( int num_groups_x, int num_groups_y,
|
RenderTile( int num_groups_x, int num_groups_y,
|
||||||
const InputHeader *inputHeaderPtr,
|
const InputHeader inputHeaderPtr[],
|
||||||
const InputDataArrays *inputDataPtr,
|
const InputDataArrays inputDataPtr[],
|
||||||
int visualizeLightCount,
|
int visualizeLightCount,
|
||||||
// Output
|
// Output
|
||||||
unsigned int8 framebuffer_r[],
|
unsigned int8 framebuffer_r[],
|
||||||
unsigned int8 framebuffer_g[],
|
unsigned int8 framebuffer_g[],
|
||||||
unsigned int8 framebuffer_b[]) {
|
unsigned int8 framebuffer_b[]) {
|
||||||
if (taskIndex >= taskCount) return;
|
if (taskIndex >= taskCount) return;
|
||||||
|
const InputHeader &inputHeader = *inputHeaderPtr;
|
||||||
|
const InputDataArrays &inputData = *inputDataPtr;
|
||||||
|
|
||||||
#if 1
|
#if 1
|
||||||
const InputHeader inputHeader = *inputHeaderPtr;
|
|
||||||
const InputDataArrays inputData = *inputDataPtr;
|
|
||||||
int32 group_y = taskIndex / num_groups_x;
|
int32 group_y = taskIndex / num_groups_x;
|
||||||
int32 group_x = taskIndex % num_groups_x;
|
int32 group_x = taskIndex % num_groups_x;
|
||||||
|
|
||||||
@@ -794,3 +795,28 @@ RenderTile( int num_groups_x, int num_groups_y,
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
extern "C"
|
||||||
|
__global__ void
|
||||||
|
RenderStatic(InputHeader inputHeaderPtr[],
|
||||||
|
InputDataArrays inputDataPtr[],
|
||||||
|
int visualizeLightCount,
|
||||||
|
// Output
|
||||||
|
unsigned int8 framebuffer_r[],
|
||||||
|
unsigned int8 framebuffer_g[],
|
||||||
|
unsigned int8 framebuffer_b[]) {
|
||||||
|
|
||||||
|
const InputHeader &inputHeader = *inputHeaderPtr;
|
||||||
|
const InputDataArrays &inputData = *inputDataPtr;
|
||||||
|
|
||||||
|
int num_groups_x = (inputHeader.framebufferWidth +
|
||||||
|
MIN_TILE_WIDTH - 1) / MIN_TILE_WIDTH;
|
||||||
|
int num_groups_y = (inputHeader.framebufferHeight +
|
||||||
|
MIN_TILE_HEIGHT - 1) / MIN_TILE_HEIGHT;
|
||||||
|
int num_groups = num_groups_x * num_groups_y;
|
||||||
|
|
||||||
|
// Launch a task to render each tile, each of which is MIN_TILE_WIDTH
|
||||||
|
// by MIN_TILE_HEIGHT pixels.
|
||||||
|
RenderTile<<<dim3(num_groups_x,num_groups_y), 128>>>(num_groups_x, num_groups_y,
|
||||||
|
inputHeaderPtr, inputDataPtr, visualizeLightCount,
|
||||||
|
framebuffer_r, framebuffer_g, framebuffer_b);
|
||||||
|
}
|
||||||
|
|||||||
@@ -477,7 +477,7 @@ ShadeTile(
|
|||||||
///////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////
|
||||||
// Static decomposition
|
// Static decomposition
|
||||||
|
|
||||||
task void
|
void task
|
||||||
RenderTile(uniform int num_groups_x, uniform int num_groups_y,
|
RenderTile(uniform int num_groups_x, uniform int num_groups_y,
|
||||||
const uniform InputHeader inputHeaderPtr[],
|
const uniform InputHeader inputHeaderPtr[],
|
||||||
const uniform InputDataArrays inputDataPtr[],
|
const uniform InputDataArrays inputDataPtr[],
|
||||||
@@ -487,7 +487,6 @@ RenderTile(uniform int num_groups_x, uniform int num_groups_y,
|
|||||||
uniform unsigned int8 framebuffer_g[],
|
uniform unsigned int8 framebuffer_g[],
|
||||||
uniform unsigned int8 framebuffer_b[]) {
|
uniform unsigned int8 framebuffer_b[]) {
|
||||||
if (taskIndex >= taskCount) return;
|
if (taskIndex >= taskCount) return;
|
||||||
|
|
||||||
const uniform InputHeader inputHeader = *inputHeaderPtr;
|
const uniform InputHeader inputHeader = *inputHeaderPtr;
|
||||||
const uniform InputDataArrays inputData = *inputDataPtr;
|
const uniform InputDataArrays inputData = *inputDataPtr;
|
||||||
|
|
||||||
@@ -543,13 +542,16 @@ RenderTile(uniform int num_groups_x, uniform int num_groups_y,
|
|||||||
export void
|
export void
|
||||||
RenderStatic(uniform InputHeader inputHeaderPtr[],
|
RenderStatic(uniform InputHeader inputHeaderPtr[],
|
||||||
uniform InputDataArrays inputDataPtr[],
|
uniform InputDataArrays inputDataPtr[],
|
||||||
uniform InputHeader &inputHeader,
|
|
||||||
uniform int visualizeLightCount,
|
uniform int visualizeLightCount,
|
||||||
// Output
|
// Output
|
||||||
uniform unsigned int8 framebuffer_r[],
|
uniform unsigned int8 framebuffer_r[],
|
||||||
uniform unsigned int8 framebuffer_g[],
|
uniform unsigned int8 framebuffer_g[],
|
||||||
uniform unsigned int8 framebuffer_b[]) {
|
uniform unsigned int8 framebuffer_b[]) {
|
||||||
|
|
||||||
|
const uniform InputHeader inputHeader = *inputHeaderPtr;
|
||||||
|
const uniform InputDataArrays inputData = *inputDataPtr;
|
||||||
|
|
||||||
|
|
||||||
uniform int num_groups_x = (inputHeader.framebufferWidth +
|
uniform int num_groups_x = (inputHeader.framebufferWidth +
|
||||||
MIN_TILE_WIDTH - 1) / MIN_TILE_WIDTH;
|
MIN_TILE_WIDTH - 1) / MIN_TILE_WIDTH;
|
||||||
uniform int num_groups_y = (inputHeader.framebufferHeight +
|
uniform int num_groups_y = (inputHeader.framebufferHeight +
|
||||||
|
|||||||
@@ -124,10 +124,12 @@ void destroyContext()
|
|||||||
|
|
||||||
CUmodule loadModule(const char * module)
|
CUmodule loadModule(const char * module)
|
||||||
{
|
{
|
||||||
|
const double t0 = rtc();
|
||||||
CUmodule cudaModule;
|
CUmodule cudaModule;
|
||||||
// in this branch we use compilation with parameters
|
// in this branch we use compilation with parameters
|
||||||
|
|
||||||
const unsigned int jitNumOptions = 1;
|
#if 0
|
||||||
|
unsigned int jitNumOptions = 1;
|
||||||
CUjit_option *jitOptions = new CUjit_option[jitNumOptions];
|
CUjit_option *jitOptions = new CUjit_option[jitNumOptions];
|
||||||
void **jitOptVals = new void*[jitNumOptions];
|
void **jitOptVals = new void*[jitNumOptions];
|
||||||
// set up pointer to set the Maximum # of registers for a particular kernel
|
// set up pointer to set the Maximum # of registers for a particular kernel
|
||||||
@@ -136,6 +138,8 @@ CUmodule loadModule(const char * module)
|
|||||||
jitOptVals[0] = (void *)(size_t)jitRegCount;
|
jitOptVals[0] = (void *)(size_t)jitRegCount;
|
||||||
#if 0
|
#if 0
|
||||||
|
|
||||||
|
{
|
||||||
|
jitNumOptions = 3;
|
||||||
// set up size of compilation log buffer
|
// set up size of compilation log buffer
|
||||||
jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
|
jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
|
||||||
int jitLogBufferSize = 1024;
|
int jitLogBufferSize = 1024;
|
||||||
@@ -150,9 +154,90 @@ CUmodule loadModule(const char * module)
|
|||||||
jitOptions[2] = CU_JIT_MAX_REGISTERS;
|
jitOptions[2] = CU_JIT_MAX_REGISTERS;
|
||||||
int jitRegCount = 32;
|
int jitRegCount = 32;
|
||||||
jitOptVals[2] = (void *)(size_t)jitRegCount;
|
jitOptVals[2] = (void *)(size_t)jitRegCount;
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
checkCudaErrors(cuModuleLoadDataEx(&cudaModule, module,jitNumOptions, jitOptions, (void **)jitOptVals));
|
checkCudaErrors(cuModuleLoadDataEx(&cudaModule, module,jitNumOptions, jitOptions, (void **)jitOptVals));
|
||||||
|
#else
|
||||||
|
CUlinkState CUState;
|
||||||
|
CUlinkState *lState = &CUState;
|
||||||
|
const int nOptions = 7;
|
||||||
|
CUjit_option options[nOptions];
|
||||||
|
void* optionVals[nOptions];
|
||||||
|
float walltime;
|
||||||
|
const unsigned int logSize = 32768;
|
||||||
|
char error_log[logSize],
|
||||||
|
info_log[logSize];
|
||||||
|
void *cuOut;
|
||||||
|
size_t outSize;
|
||||||
|
int myErr = 0;
|
||||||
|
|
||||||
|
// Setup linker options
|
||||||
|
// Return walltime from JIT compilation
|
||||||
|
options[0] = CU_JIT_WALL_TIME;
|
||||||
|
optionVals[0] = (void*) &walltime;
|
||||||
|
// Pass a buffer for info messages
|
||||||
|
options[1] = CU_JIT_INFO_LOG_BUFFER;
|
||||||
|
optionVals[1] = (void*) info_log;
|
||||||
|
// Pass the size of the info buffer
|
||||||
|
options[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
|
||||||
|
optionVals[2] = (void*) logSize;
|
||||||
|
// Pass a buffer for error message
|
||||||
|
options[3] = CU_JIT_ERROR_LOG_BUFFER;
|
||||||
|
optionVals[3] = (void*) error_log;
|
||||||
|
// Pass the size of the error buffer
|
||||||
|
options[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
|
||||||
|
optionVals[4] = (void*) logSize;
|
||||||
|
// Make the linker verbose
|
||||||
|
options[5] = CU_JIT_LOG_VERBOSE;
|
||||||
|
optionVals[5] = (void*) 1;
|
||||||
|
// Max # of registers/pthread
|
||||||
|
options[6] = CU_JIT_MAX_REGISTERS;
|
||||||
|
int jitRegCount = 48;
|
||||||
|
optionVals[6] = (void *)(size_t)jitRegCount;
|
||||||
|
|
||||||
|
// Create a pending linker invocation
|
||||||
|
checkCudaErrors(cuLinkCreate(nOptions,options, optionVals, lState));
|
||||||
|
|
||||||
|
#if 0
|
||||||
|
if (sizeof(void *)==4)
|
||||||
|
{
|
||||||
|
// Load the PTX from the string myPtx32
|
||||||
|
printf("Loading myPtx32[] program\n");
|
||||||
|
// PTX May also be loaded from file, as per below.
|
||||||
|
myErr = cuLinkAddData(*lState, CU_JIT_INPUT_PTX, (void*)myPtx32, strlen(myPtx32)+1, 0, 0, 0, 0);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
#endif
|
||||||
|
{
|
||||||
|
// Load the PTX from the string myPtx (64-bit)
|
||||||
|
fprintf(stderr, "Loading ptx..\n");
|
||||||
|
myErr = cuLinkAddData(*lState, CU_JIT_INPUT_PTX, (void*)module, strlen(module)+1, 0, 0, 0, 0);
|
||||||
|
myErr = cuLinkAddFile(*lState, CU_JIT_INPUT_LIBRARY, "libcudadevrt.a", 0,0,0);
|
||||||
|
// PTX May also be loaded from file, as per below.
|
||||||
|
// myErr = cuLinkAddFile(*lState, CU_JIT_INPUT_PTX, "myPtx64.ptx",0,0,0);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Complete the linker step
|
||||||
|
myErr = cuLinkComplete(*lState, &cuOut, &outSize);
|
||||||
|
|
||||||
|
if ( myErr != CUDA_SUCCESS )
|
||||||
|
{
|
||||||
|
// Errors will be put in error_log, per CU_JIT_ERROR_LOG_BUFFER option above.
|
||||||
|
fprintf(stderr,"PTX Linker Error:\n%s\n",error_log);
|
||||||
|
assert(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Linker walltime and info_log were requested in options above.
|
||||||
|
fprintf(stderr, "CUDA Link Completed in %fms [ %g ms]. Linker Output:\n%s\n",walltime,info_log,1e3*(rtc() - t0));
|
||||||
|
|
||||||
|
// Load resulting cuBin into module
|
||||||
|
checkCudaErrors(cuModuleLoadData(&cudaModule, cuOut));
|
||||||
|
|
||||||
|
// Destroy the linker invocation
|
||||||
|
checkCudaErrors(cuLinkDestroy(*lState));
|
||||||
|
#endif
|
||||||
|
fprintf(stderr, " loadModule took %g ms \n", 1e3*(rtc() - t0));
|
||||||
return cudaModule;
|
return cudaModule;
|
||||||
}
|
}
|
||||||
void unloadModule(CUmodule &cudaModule)
|
void unloadModule(CUmodule &cudaModule)
|
||||||
@@ -185,16 +270,17 @@ void memcpyH2D(CUdeviceptr d_buf, void * h_buf, const size_t size)
|
|||||||
{
|
{
|
||||||
checkCudaErrors(cuMemcpyHtoD(d_buf, h_buf, size));
|
checkCudaErrors(cuMemcpyHtoD(d_buf, h_buf, size));
|
||||||
}
|
}
|
||||||
#define deviceLaunch(func,nbx,nby,nbz,params) \
|
#define deviceLaunch(func,params) \
|
||||||
checkCudaErrors(cuFuncSetCacheConfig((func), CU_FUNC_CACHE_PREFER_SHARED)); \
|
checkCudaErrors(cuFuncSetCacheConfig((func), CU_FUNC_CACHE_PREFER_SHARED)); \
|
||||||
checkCudaErrors( \
|
checkCudaErrors( \
|
||||||
cuLaunchKernel( \
|
cuLaunchKernel( \
|
||||||
(func), \
|
(func), \
|
||||||
((nbx-1)/(128/32)+1), (nby), (nbz), \
|
1,1,1, \
|
||||||
128, 1, 1, \
|
32, 1, 1, \
|
||||||
0, NULL, (params), NULL \
|
0, NULL, (params), NULL \
|
||||||
));
|
));
|
||||||
|
|
||||||
|
|
||||||
typedef CUdeviceptr devicePtr;
|
typedef CUdeviceptr devicePtr;
|
||||||
|
|
||||||
|
|
||||||
@@ -241,25 +327,14 @@ extern "C"
|
|||||||
}
|
}
|
||||||
void CUDALaunch(
|
void CUDALaunch(
|
||||||
void **handlePtr,
|
void **handlePtr,
|
||||||
const char * module_name,
|
|
||||||
const char * module_1,
|
|
||||||
const char * func_name,
|
const char * func_name,
|
||||||
void **func_args,
|
void **func_args)
|
||||||
int countx, int county, int countz)
|
|
||||||
{
|
{
|
||||||
assert(module_name != NULL);
|
const std::vector<char> module_str = readBinary("kernel.ptx");
|
||||||
assert(module_1 != NULL);
|
|
||||||
assert(func_name != NULL);
|
|
||||||
assert(func_args != NULL);
|
|
||||||
#if 0
|
|
||||||
const char * module = module_1;
|
|
||||||
#else
|
|
||||||
const std::vector<char> module_str = readBinary("kernel.cubin");
|
|
||||||
const char * module = &module_str[0];
|
const char * module = &module_str[0];
|
||||||
#endif
|
|
||||||
CUmodule cudaModule = loadModule(module);
|
CUmodule cudaModule = loadModule(module);
|
||||||
CUfunction cudaFunction = getFunction(cudaModule, func_name);
|
CUfunction cudaFunction = getFunction(cudaModule, func_name);
|
||||||
deviceLaunch(cudaFunction, countx, county, countz, func_args);
|
deviceLaunch(cudaFunction, func_args);
|
||||||
unloadModule(cudaModule);
|
unloadModule(cudaModule);
|
||||||
}
|
}
|
||||||
void CUDASync(void *handle)
|
void CUDASync(void *handle)
|
||||||
@@ -372,14 +447,18 @@ int main(int argc, char** argv) {
|
|||||||
framebuffer.clear();
|
framebuffer.clear();
|
||||||
const double t0 = rtc();
|
const double t0 = rtc();
|
||||||
for (int j = 0; j < nframes; ++j)
|
for (int j = 0; j < nframes; ++j)
|
||||||
ispc::RenderStatic(
|
{
|
||||||
(ispc::InputHeader*)d_header,
|
const char * func_name = "RenderStatic";
|
||||||
(ispc::InputDataArrays*)d_arrays,
|
int light_count = VISUALIZE_LIGHT_COUNT;
|
||||||
input->header,
|
void *func_args[] = {
|
||||||
VISUALIZE_LIGHT_COUNT,
|
&d_header,
|
||||||
(uint8_t*)d_r,
|
&d_arrays,
|
||||||
(uint8_t*)d_g,
|
&light_count,
|
||||||
(uint8_t*)d_b);
|
&d_r,
|
||||||
|
&d_g,
|
||||||
|
&d_b};
|
||||||
|
CUDALaunch(NULL, func_name, func_args);
|
||||||
|
}
|
||||||
double mcycles = 1000*(rtc() - t0) / nframes;
|
double mcycles = 1000*(rtc() - t0) / nframes;
|
||||||
fprintf(stderr, "dt= %g\n", mcycles);
|
fprintf(stderr, "dt= %g\n", mcycles);
|
||||||
ispcCycles = std::min(ispcCycles, mcycles);
|
ispcCycles = std::min(ispcCycles, mcycles);
|
||||||
|
|||||||
Reference in New Issue
Block a user