diff --git a/examples_cuda/volume_rendering/Makefile_gpu b/examples_cuda/volume_rendering/Makefile_gpu new file mode 100644 index 00000000..80cbcd1b --- /dev/null +++ b/examples_cuda/volume_rendering/Makefile_gpu @@ -0,0 +1,53 @@ +PROG=volume_cu +ISPC_SRC=volume1.ispc +CXX_SRC=volume_cu.cpp + +CXX=g++ +CXXFLAGS=-O3 -I$(CUDATK)/include +LD=g++ +LDFLAGS=-lcuda + +ISPC=ispc +ISPCFLAGS=-O3 --math-lib=default --target=nvptx64,avx + +LLVM32 = $(HOME)/usr/local/llvm/bin-3.2 +LLVM = $(HOME)/usr/local/llvm/bin-3.3 +PTXGEN = $(HOME)/ptxgen + +LLVM32DIS=$(LLVM32)/bin/llvm-dis + +.SUFFIXES: .bc .o .ptx .cu _ispc_nvptx64.bc + + +ISPC_OBJ=$(ISPC_SRC:%.ispc=%_ispc.o) +ISPC_BC=$(ISPC_SRC:%.ispc=%_ispc_nvptx64.bc) +PTXSRC=$(ISPC_SRC:%.ispc=%_ispc_nvptx64.ptx) +CXX_OBJ=$(CXX_SRC:%.cpp=%.o) + +all: $(PROG) + + +$(CXX_OBJ) : kernel.ptx +$(PROG): $(CXX_OBJ) kernel.ptx + /bin/cp kernel.ptx __kernels.ptx + $(LD) -o $@ $(CXX_OBJ) $(LDFLAGS) + +%.o: %.cpp + $(CXX) $(CXXFLAGS) -o $@ -c $< + + +%_ispc_nvptx64.bc: %.ispc + $(ISPC) $(ISPCFLAGS) --emit-llvm -o `basename $< .ispc`_ispc.bc -h `basename $< .ispc`_ispc.h $< + +%.ptx: %.bc + $(LLVM32DIS) $< + $(PTXGEN) `basename $< .bc`.ll > $@ + +kernel.ptx: $(PTXSRC) # concatenate every generated PTX file into the single module copied to __kernels.ptx + cat $^ > kernel.ptx + +clean: # also removes the host objects so the next build starts from scratch + /bin/rm -rf *.ptx *.bc *.ll $(CXX_OBJ) $(PROG) + + + diff --git a/examples_cuda/volume_rendering/drvapi_error_string.h b/examples_cuda/volume_rendering/drvapi_error_string.h new file mode 100644 index 00000000..ce85f152 --- /dev/null +++ b/examples_cuda/volume_rendering/drvapi_error_string.h @@ -0,0 +1,370 @@ +/* + * Copyright 1993-2012 NVIDIA Corporation. All rights reserved. + * + * Please refer to the NVIDIA end user license agreement (EULA) associated + * with this source code for terms and conditions that govern your use of + * this software. Any use, reproduction, disclosure, or distribution of + * this software and related documentation outside the terms of the EULA + * is strictly prohibited. 
+ * + */ + +#ifndef _DRVAPI_ERROR_STRING_H_ +#define _DRVAPI_ERROR_STRING_H_ + +#include <stdio.h> +#include <string.h> +#include <stdlib.h> + +// Pairs the printable name of a CUDA driver error (error_string) with its numeric code (error_id) +typedef struct +{ + char const *error_string; + int error_id; +} s_CudaErrorStr; + +/** + * Error codes + */ +static s_CudaErrorStr sCudaDrvErrorString[] = +{ + /** + * The API call returned with no errors. In the case of query calls, this + * can also mean that the operation being queried is complete (see + * ::cuEventQuery() and ::cuStreamQuery()). + */ + { "CUDA_SUCCESS", 0 }, + + /** + * This indicates that one or more of the parameters passed to the API call + * is not within an acceptable range of values. + */ + { "CUDA_ERROR_INVALID_VALUE", 1 }, + + /** + * The API call failed because it was unable to allocate enough memory to + * perform the requested operation. + */ + { "CUDA_ERROR_OUT_OF_MEMORY", 2 }, + + /** + * This indicates that the CUDA driver has not been initialized with + * ::cuInit() or that initialization has failed. + */ + { "CUDA_ERROR_NOT_INITIALIZED", 3 }, + + /** + * This indicates that the CUDA driver is in the process of shutting down. + */ + { "CUDA_ERROR_DEINITIALIZED", 4 }, + + /** + * This indicates profiling APIs are called while application is running + * in visual profiler mode. + */ + { "CUDA_ERROR_PROFILER_DISABLED", 5 }, + /** + * This indicates profiling has not been initialized for this context. + * Call cuProfilerInitialize() to resolve this. + */ + { "CUDA_ERROR_PROFILER_NOT_INITIALIZED", 6 }, + /** + * This indicates profiler has already been started and probably + * cuProfilerStart() is incorrectly called. + */ + { "CUDA_ERROR_PROFILER_ALREADY_STARTED", 7 }, + /** + * This indicates profiler has already been stopped and probably + * cuProfilerStop() is incorrectly called. + */ + { "CUDA_ERROR_PROFILER_ALREADY_STOPPED", 8 }, + /** + * This indicates that no CUDA-capable devices were detected by the installed + * CUDA driver. 
+ */ + { "CUDA_ERROR_NO_DEVICE (no CUDA-capable devices were detected)", 100 }, + + /** + * This indicates that the device ordinal supplied by the user does not + * correspond to a valid CUDA device. + */ + { "CUDA_ERROR_INVALID_DEVICE (device specified is not a valid CUDA device)", 101 }, + + + /** + * This indicates that the device kernel image is invalid. This can also + * indicate an invalid CUDA module. + */ + { "CUDA_ERROR_INVALID_IMAGE", 200 }, + + /** + * This most frequently indicates that there is no context bound to the + * current thread. This can also be returned if the context passed to an + * API call is not a valid handle (such as a context that has had + * ::cuCtxDestroy() invoked on it). This can also be returned if a user + * mixes different API versions (i.e. 3010 context with 3020 API calls). + * See ::cuCtxGetApiVersion() for more details. + */ + { "CUDA_ERROR_INVALID_CONTEXT", 201 }, + + /** + * This indicated that the context being supplied as a parameter to the + * API call was already the active context. + * \deprecated + * This error return is deprecated as of CUDA 3.2. It is no longer an + * error to attempt to push the active context via ::cuCtxPushCurrent(). + */ + { "CUDA_ERROR_CONTEXT_ALREADY_CURRENT", 202 }, + + /** + * This indicates that a map or register operation has failed. + */ + { "CUDA_ERROR_MAP_FAILED", 205 }, + + /** + * This indicates that an unmap or unregister operation has failed. + */ + { "CUDA_ERROR_UNMAP_FAILED", 206 }, + + /** + * This indicates that the specified array is currently mapped and thus + * cannot be destroyed. + */ + { "CUDA_ERROR_ARRAY_IS_MAPPED", 207 }, + + /** + * This indicates that the resource is already mapped. + */ + { "CUDA_ERROR_ALREADY_MAPPED", 208 }, + + /** + * This indicates that there is no kernel image available that is suitable + * for the device. 
This can occur when a user specifies code generation + * options for a particular CUDA source file that do not include the + * corresponding device configuration. + */ + { "CUDA_ERROR_NO_BINARY_FOR_GPU", 209 }, + + /** + * This indicates that a resource has already been acquired. + */ + { "CUDA_ERROR_ALREADY_ACQUIRED", 210 }, + + /** + * This indicates that a resource is not mapped. + */ + { "CUDA_ERROR_NOT_MAPPED", 211 }, + + /** + * This indicates that a mapped resource is not available for access as an + * array. + */ + { "CUDA_ERROR_NOT_MAPPED_AS_ARRAY", 212 }, + + /** + * This indicates that a mapped resource is not available for access as a + * pointer. + */ + { "CUDA_ERROR_NOT_MAPPED_AS_POINTER", 213 }, + + /** + * This indicates that an uncorrectable ECC error was detected during + * execution. + */ + { "CUDA_ERROR_ECC_UNCORRECTABLE", 214 }, + + /** + * This indicates that the ::CUlimit passed to the API call is not + * supported by the active device. + */ + { "CUDA_ERROR_UNSUPPORTED_LIMIT", 215 }, + + /** + * This indicates that the ::CUcontext passed to the API call can + * only be bound to a single CPU thread at a time but is already + * bound to a CPU thread. + */ + { "CUDA_ERROR_CONTEXT_ALREADY_IN_USE", 216 }, + + /** + * This indicates that peer access is not supported across the given + * devices. + */ + { "CUDA_ERROR_PEER_ACCESS_UNSUPPORTED", 217}, + + /** + * This indicates that the device kernel source is invalid. + */ + { "CUDA_ERROR_INVALID_SOURCE", 300 }, + + /** + * This indicates that the file specified was not found. + */ + { "CUDA_ERROR_FILE_NOT_FOUND", 301 }, + + /** + * This indicates that a link to a shared object failed to resolve. + */ + { "CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND", 302 }, + + /** + * This indicates that initialization of a shared object failed. + */ + { "CUDA_ERROR_SHARED_OBJECT_INIT_FAILED", 303 }, + + /** + * This indicates that an OS call failed. 
+ */ + { "CUDA_ERROR_OPERATING_SYSTEM", 304 }, + + + /** + * This indicates that a resource handle passed to the API call was not + * valid. Resource handles are opaque types like ::CUstream and ::CUevent. + */ + { "CUDA_ERROR_INVALID_HANDLE", 400 }, + + + /** + * This indicates that a named symbol was not found. Examples of symbols + * are global/constant variable names, texture names, and surface names. + */ + { "CUDA_ERROR_NOT_FOUND", 500 }, + + + /** + * This indicates that asynchronous operations issued previously have not + * completed yet. This result is not actually an error, but must be indicated + * differently than ::CUDA_SUCCESS (which indicates completion). Calls that + * may return this value include ::cuEventQuery() and ::cuStreamQuery(). + */ + { "CUDA_ERROR_NOT_READY", 600 }, + + + /** + * An exception occurred on the device while executing a kernel. Common + * causes include dereferencing an invalid device pointer and accessing + * out of bounds shared memory. The context cannot be used, so it must + * be destroyed (and a new one should be created). All existing device + * memory allocations from this context are invalid and must be + * reconstructed if the program is to continue using CUDA. + */ + { "CUDA_ERROR_LAUNCH_FAILED", 700 }, + + /** + * This indicates that a launch did not occur because it did not have + * appropriate resources. This error usually indicates that the user has + * attempted to pass too many arguments to the device kernel, or the + * kernel launch specifies too many threads for the kernel's register + * count. Passing arguments of the wrong size (i.e. a 64-bit pointer + * when a 32-bit int is expected) is equivalent to passing too many + * arguments and can also result in this error. + */ + { "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES", 701 }, + + /** + * This indicates that the device kernel took too long to execute. 
This can + * only occur if timeouts are enabled - see the device attribute + * ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT for more information. The + * context cannot be used (and must be destroyed similar to + * ::CUDA_ERROR_LAUNCH_FAILED). All existing device memory allocations from + * this context are invalid and must be reconstructed if the program is to + * continue using CUDA. + */ + { "CUDA_ERROR_LAUNCH_TIMEOUT", 702 }, + + /** + * This error indicates a kernel launch that uses an incompatible texturing + * mode. + */ + { "CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING", 703 }, + + /** + * This error indicates that a call to ::cuCtxEnablePeerAccess() is + * trying to re-enable peer access to a context which has already + * had peer access to it enabled. + */ + { "CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED", 704 }, + + /** + * This error indicates that ::cuCtxDisablePeerAccess() is + * trying to disable peer access which has not been enabled yet + * via ::cuCtxEnablePeerAccess(). + */ + { "CUDA_ERROR_PEER_ACCESS_NOT_ENABLED", 705 }, + + /** + * This error indicates that the primary context for the specified device + * has already been initialized. + */ + { "CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE", 708 }, + + /** + * This error indicates that the context current to the calling thread + * has been destroyed using ::cuCtxDestroy, or is a primary context which + * has not yet been initialized. + */ + { "CUDA_ERROR_CONTEXT_IS_DESTROYED", 709 }, + + /** + * A device-side assert triggered during kernel execution. The context + * cannot be used anymore, and must be destroyed. All existing device + * memory allocations from this context are invalid and must be + * reconstructed if the program is to continue using CUDA. + */ + { "CUDA_ERROR_ASSERT", 710 }, + + /** + * This error indicates that the hardware resources required to enable + * peer access have been exhausted for one or more of the devices + * passed to ::cuCtxEnablePeerAccess(). 
+ */ + { "CUDA_ERROR_TOO_MANY_PEERS", 711 }, + + /** + * This error indicates that the memory range passed to ::cuMemHostRegister() + * has already been registered. + */ + { "CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED", 712 }, + + /** + * This error indicates that the pointer passed to ::cuMemHostUnregister() + * does not correspond to any currently registered memory region. + */ + { "CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED", 713 }, + + /** + * This error indicates that the attempted operation is not permitted. + */ + { "CUDA_ERROR_NOT_PERMITTED", 800 }, + + /** + * This error indicates that the attempted operation is not supported + * on the current system or device. + */ + { "CUDA_ERROR_NOT_SUPPORTED", 801 }, + + /** + * This indicates that an unknown internal error has occurred. + */ + { "CUDA_ERROR_UNKNOWN", 999 }, + { NULL, -1 } +}; + +// Returns the name of error_id, or "CUDA_ERROR not found!" for unknown codes. This is a linear search, since the error_id's are not +// always occurring consecutively. static inline: the function is defined in a header, so it must not have external linkage. +static inline const char * getCudaDrvErrorString(CUresult error_id) +{ + int index = 0; + while (sCudaDrvErrorString[index].error_id != error_id && + sCudaDrvErrorString[index].error_id != -1) + { + index++; + } + if (sCudaDrvErrorString[index].error_id == error_id && sCudaDrvErrorString[index].error_string != NULL) // an error_id of -1 lands on the { NULL, -1 } sentinel; never return its NULL string + return (const char *)sCudaDrvErrorString[index].error_string; + else + return (const char *)"CUDA_ERROR not found!"; +} + +#endif diff --git a/examples_cuda/volume_rendering/volume1.ispc b/examples_cuda/volume_rendering/volume1.ispc index 88423ad5..4dd7029f 100644 --- a/examples_cuda/volume_rendering/volume1.ispc +++ b/examples_cuda/volume_rendering/volume1.ispc @@ -407,7 +407,6 @@ volume_ispc_tasks(uniform float density[], uniform int nVoxels[3], // Launch tasks to work on (dx,dy)-sized tiles of the image uniform int dx = 8, dy = 8; uniform int nTasks = ((width+(dx-1))/dx) * ((height+(dy-1))/dy); - print("nTasks= %\n", nTasks); launch[nTasks] volume_task(density, nVoxels, raster2camera, camera2world, width, height, image); } diff --git 
a/examples_cuda/volume_rendering/volume_cu.cpp b/examples_cuda/volume_rendering/volume_cu.cpp index b0d52374..41a184c9 100644 --- a/examples_cuda/volume_rendering/volume_cu.cpp +++ b/examples_cuda/volume_rendering/volume_cu.cpp @@ -109,34 +109,88 @@ void destroyContext() CUmodule loadModule(const char * module) { + const double t0 = rtc(); CUmodule cudaModule; // in this branch we use compilation with parameters - const unsigned int jitNumOptions = 1; - CUjit_option *jitOptions = new CUjit_option[jitNumOptions]; - void **jitOptVals = new void*[jitNumOptions]; - // set up pointer to set the Maximum # of registers for a particular kernel - jitOptions[0] = CU_JIT_MAX_REGISTERS; + + CUlinkState CUState; + CUlinkState *lState = &CUState; + const int nOptions = 7; + CUjit_option options[nOptions]; + void* optionVals[nOptions]; + float walltime; + const unsigned int logSize = 32768; + char error_log[logSize] = "", // zero-filled so both logs are valid C strings even if the linker writes nothing into them + info_log[logSize] = ""; + void *cuOut; + size_t outSize; + int myErr = 0; + + // Setup linker options + // Return walltime from JIT compilation + options[0] = CU_JIT_WALL_TIME; + optionVals[0] = (void*) &walltime; + // Pass a buffer for info messages + options[1] = CU_JIT_INFO_LOG_BUFFER; + optionVals[1] = (void*) info_log; + // Pass the size of the info buffer + options[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; + optionVals[2] = (void*)(size_t) logSize; // widen to pointer size first, as done for jitRegCount below + // Pass a buffer for error message + options[3] = CU_JIT_ERROR_LOG_BUFFER; + optionVals[3] = (void*) error_log; + // Pass the size of the error buffer + options[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES; + optionVals[4] = (void*)(size_t) logSize; + // Make the linker verbose + options[5] = CU_JIT_LOG_VERBOSE; + optionVals[5] = (void*) 1; + // Max # of registers per thread + options[6] = CU_JIT_MAX_REGISTERS; int jitRegCount = 64; - jitOptVals[0] = (void *)(size_t)jitRegCount; + optionVals[6] = (void *)(size_t)jitRegCount; + + // Create a pending linker invocation + checkCudaErrors(cuLinkCreate(nOptions,options, optionVals, lState)); #if 0 - 
// set up size of compilation log buffer - jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; - int jitLogBufferSize = 1024; - jitOptVals[0] = (void *)(size_t)jitLogBufferSize; - - // set up pointer to the compilation log buffer - jitOptions[1] = CU_JIT_INFO_LOG_BUFFER; - char *jitLogBuffer = new char[jitLogBufferSize]; - jitOptVals[1] = jitLogBuffer; - - // set up pointer to set the Maximum # of registers for a particular kernel - jitOptions[2] = CU_JIT_MAX_REGISTERS; - int jitRegCount = 32; - jitOptVals[2] = (void *)(size_t)jitRegCount; + if (sizeof(void *)==4) + { + // Load the PTX from the string myPtx32 + printf("Loading myPtx32[] program\n"); + // PTX May also be loaded from file, as per below. + myErr = cuLinkAddData(*lState, CU_JIT_INPUT_PTX, (void*)myPtx32, strlen(myPtx32)+1, 0, 0, 0, 0); + } + else #endif + { + // Load the PTX from the string myPtx (64-bit) + fprintf(stderr, "Loading ptx..\n"); + myErr = cuLinkAddData(*lState, CU_JIT_INPUT_PTX, (void*)module, strlen(module)+1, 0, 0, 0, 0); + if (myErr == CUDA_SUCCESS) myErr = cuLinkAddFile(*lState, CU_JIT_INPUT_LIBRARY, "libcudadevrt.a", 0,0,0); // keep the first failure instead of overwriting it + // PTX May also be loaded from file, as per below. + // myErr = cuLinkAddFile(*lState, CU_JIT_INPUT_PTX, "myPtx64.ptx",0,0,0); + } - checkCudaErrors(cuModuleLoadDataEx(&cudaModule, module,jitNumOptions, jitOptions, (void **)jitOptVals)); + // Complete the linker step (skipped when adding an input already failed, so that failure is what gets reported) + if (myErr == CUDA_SUCCESS) myErr = cuLinkComplete(*lState, &cuOut, &outSize); + + if ( myErr != CUDA_SUCCESS ) + { + // Errors will be put in error_log, per CU_JIT_ERROR_LOG_BUFFER option above. + fprintf(stderr,"PTX Linker Error:\n%s\n",error_log); + assert(0); + } + + // Linker walltime and info_log were requested in options above. + fprintf(stderr, "CUDA Link Completed in %fms [ %g ms]. 
Linker Output:\n%s\n",walltime,1e3*(rtc() - t0),info_log); // arguments in format order: %f linker walltime, %g elapsed ms, %s info_log + + // Load resulting cuBin into module + checkCudaErrors(cuModuleLoadData(&cudaModule, cuOut)); + + // Destroy the linker invocation + checkCudaErrors(cuLinkDestroy(*lState)); + fprintf(stderr, " loadModule took %g ms \n", 1e3*(rtc() - t0)); return cudaModule; } void unloadModule(CUmodule &cudaModule) @@ -169,16 +223,17 @@ void memcpyH2D(CUdeviceptr d_buf, void * h_buf, const size_t size) { checkCudaErrors(cuMemcpyHtoD(d_buf, h_buf, size)); } -#define deviceLaunch(func,nbx,nby,nbz,params) \ +#define deviceLaunch(func,params) \ checkCudaErrors(cuFuncSetCacheConfig((func), CU_FUNC_CACHE_PREFER_L1)); \ checkCudaErrors( \ cuLaunchKernel( \ (func), \ - ((nbx-1)/(128/32)+1), (nby), (nbz), \ - 128, 1, 1, \ + 1,1,1, \ + 32, 1, 1, \ 0, NULL, (params), NULL \ )); + typedef CUdeviceptr devicePtr; @@ -219,43 +274,21 @@ std::vector readBinary(const char * filename) extern "C" { - void *CUDAAlloc(void **handlePtr, int64_t size, int32_t alignment) - { - return NULL; - } - void CUDALaunch( + double CUDALaunch( void **handlePtr, - const char * module_name, - const char * module_1, const char * func_name, - void **func_args, - int countx, int county, int countz) + void **func_args) { - assert(module_name != NULL); - assert(module_1 != NULL); - assert(func_name != NULL); - assert(func_args != NULL); -#if 1 - const char * module = module_1; -#else - const std::vector module_str = readBinary("kernel.cubin"); + const std::vector module_str = readBinary("__kernels.ptx"); const char * module = &module_str[0]; -#endif CUmodule cudaModule = loadModule(module); CUfunction cudaFunction = getFunction(cudaModule, func_name); - deviceLaunch(cudaFunction, countx, county, countz, func_args); + const double t0 = rtc(); + deviceLaunch(cudaFunction, func_args); + checkCudaErrors(cuStreamSynchronize(0)); + const double dt = rtc() - t0; unloadModule(cudaModule); - } - void CUDASync(void *handle) - { - checkCudaErrors(cuStreamSynchronize(0)); - 
} - void ISPCSync(void *handle) - { - checkCudaErrors(cuStreamSynchronize(0)); - } - void CUDAFree(void *handle) - { + return dt; } } @@ -426,6 +459,7 @@ int main(int argc, char *argv[]) { // double minISPCtasks = 1e30; for (int i = 0; i < 3; ++i) { +#if 0 reset_and_start_timer(); const double t0 = rtc(); volume_ispc_tasks( @@ -436,6 +470,16 @@ int main(int argc, char *argv[]) { width, height, (float*)d_image); double dt = rtc() - t0; //get_elapsed_mcycles(); +#else + const char * func_name = "volume_ispc_tasks"; + void *func_args[] = { // one pointer per kernel argument; deviceLaunch hands this array to cuLaunchKernel + &d_density, + &d_n, + &d_raster2camera, &d_camera2world, + &width, &height, + &d_image}; + const double dt = CUDALaunch(NULL, func_name, func_args); // dt spans the kernel launch and stream sync only, not the module load; handlePtr is unused by CUDALaunch +#endif minISPCtasks = std::min(minISPCtasks, dt); }