diff --git a/examples_cuda/aobench/ao.cu b/examples_cuda/aobench/ao.cu index 8a80281f..2ebcb011 100644 --- a/examples_cuda/aobench/ao.cu +++ b/examples_cuda/aobench/ao.cu @@ -403,3 +403,20 @@ void ao_task( int width, int height, const int y1 = min(y0 + TILEY, height); ao_tile(x0,x1,y0,y1, width, height, nsubsamples, image); } + +#if 1 +extern "C" +__global__ +void ao_ispc_tasks( + int w, int h, int nsubsamples, + float image[]) +{ + const int ntilex = (w+TILEX-1)/TILEX; + const int ntiley = (h+TILEY-1)/TILEY; + const int nbx = (ntilex-1)/4 + 1; + const int nby = ntiley; + const int nbz = 1; + const dim3 blocks (nbx, nby, nbz); + ao_task<<>>(w,h,nsubsamples,image); +} +#endif diff --git a/examples_cuda/aobench/ao_cu.cpp b/examples_cuda/aobench/ao_cu.cpp index 992ed232..b0aa607d 100755 --- a/examples_cuda/aobench/ao_cu.cpp +++ b/examples_cuda/aobench/ao_cu.cpp @@ -122,10 +122,12 @@ void destroyContext() CUmodule loadModule(const char * module) { + const double t0 = rtc(); CUmodule cudaModule; // in this branch we use compilation with parameters - const unsigned int jitNumOptions = 1; +#if 0 + unsigned int jitNumOptions = 1; CUjit_option *jitOptions = new CUjit_option[jitNumOptions]; void **jitOptVals = new void*[jitNumOptions]; // set up pointer to set the Maximum # of registers for a particular kernel @@ -134,23 +136,106 @@ CUmodule loadModule(const char * module) jitOptVals[0] = (void *)(size_t)jitRegCount; #if 0 - // set up size of compilation log buffer - jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; - int jitLogBufferSize = 1024; - jitOptVals[0] = (void *)(size_t)jitLogBufferSize; + { + jitNumOptions = 3; + // set up size of compilation log buffer + jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; + int jitLogBufferSize = 1024; + jitOptVals[0] = (void *)(size_t)jitLogBufferSize; - // set up pointer to the compilation log buffer - jitOptions[1] = CU_JIT_INFO_LOG_BUFFER; - char *jitLogBuffer = new char[jitLogBufferSize]; - jitOptVals[1] = jitLogBuffer; + // set up pointer to the compilation log buffer + jitOptions[1] = CU_JIT_INFO_LOG_BUFFER; + char *jitLogBuffer = new char[jitLogBufferSize]; + jitOptVals[1] = jitLogBuffer; - // set up pointer to set the Maximum # of registers for a particular kernel - jitOptions[2] = CU_JIT_MAX_REGISTERS; - int jitRegCount = 32; - jitOptVals[2] = (void *)(size_t)jitRegCount; + // set up pointer to set the Maximum # of registers for a particular kernel + jitOptions[2] = CU_JIT_MAX_REGISTERS; + int jitRegCount = 32; + jitOptVals[2] = (void *)(size_t)jitRegCount; + } #endif checkCudaErrors(cuModuleLoadDataEx(&cudaModule, module,jitNumOptions, jitOptions, (void **)jitOptVals)); +#else + CUlinkState CUState; + CUlinkState *lState = &CUState; + const int nOptions = 7; + CUjit_option options[nOptions]; + void* optionVals[nOptions]; + float walltime; + const unsigned int logSize = 8192; + char error_log[logSize], + info_log[logSize]; + void *cuOut; + size_t outSize; + int myErr = 0; + + // Setup linker options + // Return walltime from JIT compilation + options[0] = CU_JIT_WALL_TIME; + optionVals[0] = (void*) &walltime; + // Pass a buffer for info messages + options[1] = CU_JIT_INFO_LOG_BUFFER; + optionVals[1] = (void*) info_log; + // Pass the size of the info buffer + options[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; + optionVals[2] = (void*) logSize; + // Pass a buffer for error message + options[3] = CU_JIT_ERROR_LOG_BUFFER; + optionVals[3] = (void*) error_log; + // Pass the size of the error buffer + options[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES; + optionVals[4] = (void*) logSize; + // Make the linker verbose + options[5] = CU_JIT_LOG_VERBOSE; + optionVals[5] = (void*) 1; + // Max # of registers/pthread + options[6] = CU_JIT_MAX_REGISTERS; + int jitRegCount = 64; + optionVals[6] = (void *)(size_t)jitRegCount; + + // Create a pending linker invocation + checkCudaErrors(cuLinkCreate(nOptions,options, optionVals, lState)); + +#if 0 + if (sizeof(void *)==4) + { + // Load the PTX from the string myPtx32 + printf("Loading myPtx32[] program\n"); + // PTX May also be loaded from file, as per below. + myErr = cuLinkAddData(*lState, CU_JIT_INPUT_PTX, (void*)myPtx32, strlen(myPtx32)+1, 0, 0, 0, 0); + } + else +#endif + { + // Load the PTX from the string myPtx (64-bit) + fprintf(stderr, "Loading ptx..\n"); + myErr = cuLinkAddData(*lState, CU_JIT_INPUT_PTX, (void*)module, strlen(module)+1, 0, 0, 0, 0); + myErr = cuLinkAddFile(*lState, CU_JIT_INPUT_LIBRARY, "libcudadevrt.a", 0,0,0); + // PTX May also be loaded from file, as per below. + // myErr = cuLinkAddFile(*lState, CU_JIT_INPUT_PTX, "myPtx64.ptx",0,0,0); + } + + // Complete the linker step + myErr = cuLinkComplete(*lState, &cuOut, &outSize); + + if ( myErr != CUDA_SUCCESS ) + { + // Errors will be put in error_log, per CU_JIT_ERROR_LOG_BUFFER option above. + fprintf(stderr,"PTX Linker Error:\n%s\n",error_log); + assert(0); + } + + // Linker walltime and info_log were requested in options above. + fprintf(stderr, "CUDA Link Completed in %fms [ %g ms]. Linker Output:\n%s\n",walltime,info_log,1e3*(rtc() - t0)); + + // Load resulting cuBin into module + checkCudaErrors(cuModuleLoadData(&cudaModule, cuOut)); + + // Destroy the linker invocation + checkCudaErrors(cuLinkDestroy(*lState)); +#endif + fprintf(stderr, " loadModule took %g ms \n", 1e3*(rtc() - t0)); return cudaModule; } void unloadModule(CUmodule &cudaModule) @@ -251,7 +336,7 @@ extern "C" #if 0 const char * module = module_1; #else - const std::vector module_str = readBinary("kernel.cubin"); + const std::vector module_str = readBinary("kernel.ptx"); const char * module = &module_str[0]; #endif CUmodule cudaModule = loadModule(module);