This commit is contained in:
Evghenii
2013-11-13 16:32:56 +01:00
parent 780e9f31fe
commit 525eacd035
4 changed files with 521 additions and 55 deletions

View File

@@ -109,34 +109,88 @@ void destroyContext()
CUmodule loadModule(const char * module)
{
const double t0 = rtc();
CUmodule cudaModule;
// in this branch we use compilation with parameters
const unsigned int jitNumOptions = 1;
CUjit_option *jitOptions = new CUjit_option[jitNumOptions];
void **jitOptVals = new void*[jitNumOptions];
// set up pointer to set the Maximum # of registers for a particular kernel
jitOptions[0] = CU_JIT_MAX_REGISTERS;
CUlinkState CUState;
CUlinkState *lState = &CUState;
const int nOptions = 7;
CUjit_option options[nOptions];
void* optionVals[nOptions];
float walltime;
const unsigned int logSize = 32768;
char error_log[logSize],
info_log[logSize];
void *cuOut;
size_t outSize;
int myErr = 0;
// Setup linker options
// Return walltime from JIT compilation
options[0] = CU_JIT_WALL_TIME;
optionVals[0] = (void*) &walltime;
// Pass a buffer for info messages
options[1] = CU_JIT_INFO_LOG_BUFFER;
optionVals[1] = (void*) info_log;
// Pass the size of the info buffer
options[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
optionVals[2] = (void*) logSize;
// Pass a buffer for error message
options[3] = CU_JIT_ERROR_LOG_BUFFER;
optionVals[3] = (void*) error_log;
// Pass the size of the error buffer
options[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
optionVals[4] = (void*) logSize;
// Make the linker verbose
options[5] = CU_JIT_LOG_VERBOSE;
optionVals[5] = (void*) 1;
// Max # of registers/pthread
options[6] = CU_JIT_MAX_REGISTERS;
int jitRegCount = 64;
jitOptVals[0] = (void *)(size_t)jitRegCount;
optionVals[6] = (void *)(size_t)jitRegCount;
// Create a pending linker invocation
checkCudaErrors(cuLinkCreate(nOptions,options, optionVals, lState));
#if 0
// set up size of compilation log buffer
jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
int jitLogBufferSize = 1024;
jitOptVals[0] = (void *)(size_t)jitLogBufferSize;
// set up pointer to the compilation log buffer
jitOptions[1] = CU_JIT_INFO_LOG_BUFFER;
char *jitLogBuffer = new char[jitLogBufferSize];
jitOptVals[1] = jitLogBuffer;
// set up pointer to set the Maximum # of registers for a particular kernel
jitOptions[2] = CU_JIT_MAX_REGISTERS;
int jitRegCount = 32;
jitOptVals[2] = (void *)(size_t)jitRegCount;
if (sizeof(void *)==4)
{
// Load the PTX from the string myPtx32
printf("Loading myPtx32[] program\n");
// PTX May also be loaded from file, as per below.
myErr = cuLinkAddData(*lState, CU_JIT_INPUT_PTX, (void*)myPtx32, strlen(myPtx32)+1, 0, 0, 0, 0);
}
else
#endif
{
// Load the PTX from the string myPtx (64-bit)
fprintf(stderr, "Loading ptx..\n");
myErr = cuLinkAddData(*lState, CU_JIT_INPUT_PTX, (void*)module, strlen(module)+1, 0, 0, 0, 0);
myErr = cuLinkAddFile(*lState, CU_JIT_INPUT_LIBRARY, "libcudadevrt.a", 0,0,0);
// PTX May also be loaded from file, as per below.
// myErr = cuLinkAddFile(*lState, CU_JIT_INPUT_PTX, "myPtx64.ptx",0,0,0);
}
checkCudaErrors(cuModuleLoadDataEx(&cudaModule, module,jitNumOptions, jitOptions, (void **)jitOptVals));
// Complete the linker step
myErr = cuLinkComplete(*lState, &cuOut, &outSize);
if ( myErr != CUDA_SUCCESS )
{
// Errors will be put in error_log, per CU_JIT_ERROR_LOG_BUFFER option above.
fprintf(stderr,"PTX Linker Error:\n%s\n",error_log);
assert(0);
}
// Linker walltime and info_log were requested in options above.
fprintf(stderr, "CUDA Link Completed in %fms [ %g ms]. Linker Output:\n%s\n",walltime,info_log,1e3*(rtc() - t0));
// Load resulting cuBin into module
checkCudaErrors(cuModuleLoadData(&cudaModule, cuOut));
// Destroy the linker invocation
checkCudaErrors(cuLinkDestroy(*lState));
fprintf(stderr, " loadModule took %g ms \n", 1e3*(rtc() - t0));
return cudaModule;
}
void unloadModule(CUmodule &cudaModule)
@@ -169,16 +223,17 @@ void memcpyH2D(CUdeviceptr d_buf, void * h_buf, const size_t size)
{
checkCudaErrors(cuMemcpyHtoD(d_buf, h_buf, size));
}
#define deviceLaunch(func,nbx,nby,nbz,params) \
#define deviceLaunch(func,params) \
checkCudaErrors(cuFuncSetCacheConfig((func), CU_FUNC_CACHE_PREFER_L1)); \
checkCudaErrors( \
cuLaunchKernel( \
(func), \
((nbx-1)/(128/32)+1), (nby), (nbz), \
128, 1, 1, \
1,1,1, \
32, 1, 1, \
0, NULL, (params), NULL \
));
typedef CUdeviceptr devicePtr;
@@ -219,43 +274,21 @@ std::vector<char> readBinary(const char * filename)
extern "C"
{
void *CUDAAlloc(void **handlePtr, int64_t size, int32_t alignment)
{
return NULL;
}
void CUDALaunch(
double CUDALaunch(
void **handlePtr,
const char * module_name,
const char * module_1,
const char * func_name,
void **func_args,
int countx, int county, int countz)
void **func_args)
{
assert(module_name != NULL);
assert(module_1 != NULL);
assert(func_name != NULL);
assert(func_args != NULL);
#if 1
const char * module = module_1;
#else
const std::vector<char> module_str = readBinary("kernel.cubin");
const std::vector<char> module_str = readBinary("__kernels.ptx");
const char * module = &module_str[0];
#endif
CUmodule cudaModule = loadModule(module);
CUfunction cudaFunction = getFunction(cudaModule, func_name);
deviceLaunch(cudaFunction, countx, county, countz, func_args);
const double t0 = rtc();
deviceLaunch(cudaFunction, func_args);
checkCudaErrors(cuStreamSynchronize(0));
const double dt = rtc() - t0;
unloadModule(cudaModule);
}
void CUDASync(void *handle)
{
checkCudaErrors(cuStreamSynchronize(0));
}
void ISPCSync(void *handle)
{
checkCudaErrors(cuStreamSynchronize(0));
}
void CUDAFree(void *handle)
{
return dt;
}
}
@@ -426,6 +459,7 @@ int main(int argc, char *argv[]) {
//
double minISPCtasks = 1e30;
for (int i = 0; i < 3; ++i) {
#if 0
reset_and_start_timer();
const double t0 = rtc();
volume_ispc_tasks(
@@ -436,6 +470,16 @@ int main(int argc, char *argv[]) {
width, height,
(float*)d_image);
double dt = rtc() - t0; //get_elapsed_mcycles();
#else
const char * func_name = "volume_ispc_tasks";
void *func_args[] = {
&d_density,
&d_n,
&d_raster2camera, &d_camera2world,
&width, &height,
&d_image};
const double dt = CUDALaunch(NULL, func_name, func_args);
#endif
minISPCtasks = std::min(minISPCtasks, dt);
}