fixed cuda kernel

This commit is contained in:
Evghenii
2013-11-13 12:54:52 +01:00
parent 74db8cbab3
commit ca1dbc3d3b
2 changed files with 21 additions and 11 deletions

View File

@@ -805,8 +805,8 @@ RenderStatic(InputHeader inputHeaderPtr[],
unsigned int8 framebuffer_g[], unsigned int8 framebuffer_g[],
unsigned int8 framebuffer_b[]) { unsigned int8 framebuffer_b[]) {
const InputHeader &inputHeader = *inputHeaderPtr; const InputHeader inputHeader = *inputHeaderPtr;
const InputDataArrays &inputData = *inputDataPtr; const InputDataArrays inputData = *inputDataPtr;
int num_groups_x = (inputHeader.framebufferWidth + int num_groups_x = (inputHeader.framebufferWidth +
MIN_TILE_WIDTH - 1) / MIN_TILE_WIDTH; MIN_TILE_WIDTH - 1) / MIN_TILE_WIDTH;
@@ -816,7 +816,9 @@ RenderStatic(InputHeader inputHeaderPtr[],
// Launch a task to render each tile, each of which is MIN_TILE_WIDTH // Launch a task to render each tile, each of which is MIN_TILE_WIDTH
// by MIN_TILE_HEIGHT pixels. // by MIN_TILE_HEIGHT pixels.
RenderTile<<<dim3(num_groups_x,num_groups_y), 128>>>(num_groups_x, num_groups_y, if (programIndex == 0)
inputHeaderPtr, inputDataPtr, visualizeLightCount, RenderTile<<<num_groups, 128>>>(num_groups_x, num_groups_y,
framebuffer_r, framebuffer_g, framebuffer_b); inputHeaderPtr, inputDataPtr, visualizeLightCount,
framebuffer_r, framebuffer_g, framebuffer_b);
cudaDeviceSynchronize();
} }

View File

@@ -161,7 +161,7 @@ CUmodule loadModule(const char * module)
#else #else
CUlinkState CUState; CUlinkState CUState;
CUlinkState *lState = &CUState; CUlinkState *lState = &CUState;
const int nOptions = 7; const int nOptions = 8;
CUjit_option options[nOptions]; CUjit_option options[nOptions];
void* optionVals[nOptions]; void* optionVals[nOptions];
float walltime; float walltime;
@@ -195,7 +195,9 @@ CUmodule loadModule(const char * module)
options[6] = CU_JIT_MAX_REGISTERS; options[6] = CU_JIT_MAX_REGISTERS;
int jitRegCount = 48; int jitRegCount = 48;
optionVals[6] = (void *)(size_t)jitRegCount; optionVals[6] = (void *)(size_t)jitRegCount;
// Caching
options[7] = CU_JIT_CACHE_MODE;
optionVals[7] = (void *)CU_JIT_CACHE_OPTION_CA;
// Create a pending linker invocation // Create a pending linker invocation
checkCudaErrors(cuLinkCreate(nOptions,options, optionVals, lState)); checkCudaErrors(cuLinkCreate(nOptions,options, optionVals, lState));
@@ -271,7 +273,7 @@ void memcpyH2D(CUdeviceptr d_buf, void * h_buf, const size_t size)
checkCudaErrors(cuMemcpyHtoD(d_buf, h_buf, size)); checkCudaErrors(cuMemcpyHtoD(d_buf, h_buf, size));
} }
#define deviceLaunch(func,params) \ #define deviceLaunch(func,params) \
checkCudaErrors(cuFuncSetCacheConfig((func), CU_FUNC_CACHE_PREFER_SHARED)); \ checkCudaErrors(cuFuncSetCacheConfig((func), CU_FUNC_CACHE_PREFER_L1)); \
checkCudaErrors( \ checkCudaErrors( \
cuLaunchKernel( \ cuLaunchKernel( \
(func), \ (func), \
@@ -320,7 +322,7 @@ std::vector<char> readBinary(const char * filename)
extern "C" extern "C"
{ {
void CUDALaunch( double CUDALaunch(
void **handlePtr, void **handlePtr,
const char * func_name, const char * func_name,
void **func_args) void **func_args)
@@ -329,8 +331,12 @@ extern "C"
const char * module = &module_str[0]; const char * module = &module_str[0];
CUmodule cudaModule = loadModule(module); CUmodule cudaModule = loadModule(module);
CUfunction cudaFunction = getFunction(cudaModule, func_name); CUfunction cudaFunction = getFunction(cudaModule, func_name);
const double t0 = rtc();
deviceLaunch(cudaFunction, func_args); deviceLaunch(cudaFunction, func_args);
checkCudaErrors(cuStreamSynchronize(0));
const double dt = rtc() - t0;
unloadModule(cudaModule); unloadModule(cudaModule);
return dt;
} }
} }
/******************************/ /******************************/
@@ -430,6 +436,7 @@ int main(int argc, char** argv) {
for (int i = 0; i < 5; ++i) { for (int i = 0; i < 5; ++i) {
framebuffer.clear(); framebuffer.clear();
const double t0 = rtc(); const double t0 = rtc();
double dt = 0.0;
for (int j = 0; j < nframes; ++j) for (int j = 0; j < nframes; ++j)
{ {
const char * func_name = "RenderStatic"; const char * func_name = "RenderStatic";
@@ -441,9 +448,10 @@ int main(int argc, char** argv) {
&d_r, &d_r,
&d_g, &d_g,
&d_b}; &d_b};
CUDALaunch(NULL, func_name, func_args); dt += CUDALaunch(NULL, func_name, func_args);
} }
double mcycles = 1000*(rtc() - t0) / nframes; //double mcycles = 1000*(rtc() - t0) / nframes;
double mcycles = 1000*dt / nframes;
fprintf(stderr, "dt= %g\n", mcycles); fprintf(stderr, "dt= %g\n", mcycles);
ispcCycles = std::min(ispcCycles, mcycles); ispcCycles = std::min(ispcCycles, mcycles);
} }