From c0c1cc1ba778552112685a577cdd6776ab2eabf1 Mon Sep 17 00:00:00 2001 From: Evghenii Date: Wed, 13 Nov 2013 14:16:48 +0100 Subject: [PATCH] +added Makefile and some fixes --- examples_cuda/aobench/Makefile_gpu | 2 +- examples_cuda/aobench/ao_cu.cpp | 2 +- examples_cuda/common.mk | 3 +- examples_cuda/mandelbrot_tasks3d/Makefile_gpu | 53 +++ .../mandelbrot_tasks3d/drvapi_error_string.h | 370 ++++++++++++++++++ .../mandelbrot_tasks3d/mandel_cu.cpp | 234 ++++++----- .../mandelbrot_tasks3d/mandelbrot_tasks3d.cpp | 17 +- .../mandelbrot_tasks3d.ispc | 30 +- examples_cuda/tasksys.cpp | 2 +- 9 files changed, 596 insertions(+), 117 deletions(-) create mode 100644 examples_cuda/mandelbrot_tasks3d/Makefile_gpu create mode 100644 examples_cuda/mandelbrot_tasks3d/drvapi_error_string.h diff --git a/examples_cuda/aobench/Makefile_gpu b/examples_cuda/aobench/Makefile_gpu index 619dfcb8..e217f388 100644 --- a/examples_cuda/aobench/Makefile_gpu +++ b/examples_cuda/aobench/Makefile_gpu @@ -1,4 +1,4 @@ -PROG=aob_cu +PROG=ao_cu ISPC_SRC=ao1.ispc CXX_SRC=ao_cu.cpp diff --git a/examples_cuda/aobench/ao_cu.cpp b/examples_cuda/aobench/ao_cu.cpp index 046985f1..7ec9c1a3 100755 --- a/examples_cuda/aobench/ao_cu.cpp +++ b/examples_cuda/aobench/ao_cu.cpp @@ -326,7 +326,7 @@ extern "C" const char * func_name, void **func_args) { - const std::vector module_str = readBinary("kernel.ptx"); + const std::vector module_str = readBinary("__kernels.ptx"); const char * module = &module_str[0]; CUmodule cudaModule = loadModule(module); CUfunction cudaFunction = getFunction(cudaModule, func_name); diff --git a/examples_cuda/common.mk b/examples_cuda/common.mk index fc956329..24581dd8 100644 --- a/examples_cuda/common.mk +++ b/examples_cuda/common.mk @@ -16,8 +16,9 @@ ISPC_HEADER=objs/$(ISPC_SRC:.ispc=_ispc.h) ARCH:=$(shell uname -m | sed -e s/x86_64/x86/ -e s/i686/x86/ -e s/arm.*/arm/ -e s/sa110/arm/) ifeq ($(ARCH),x86) - ISPC_OBJS=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc.o $(ISPC_SRC:.ispc=)_ispc_sse2.o \ +# ISPC_OBJS=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc.o $(ISPC_SRC:.ispc=)_ispc_sse2.o \ $(ISPC_SRC:.ispc=)_ispc_sse4.o $(ISPC_SRC:.ispc=)_ispc_avx.o) + ISPC_OBJS=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc.o ) ISPC_TARGETS=$(ISPC_IA_TARGETS) ARCH_BIT:=$(shell getconf LONG_BIT) ifeq ($(ARCH_BIT),32) diff --git a/examples_cuda/mandelbrot_tasks3d/Makefile_gpu b/examples_cuda/mandelbrot_tasks3d/Makefile_gpu new file mode 100644 index 00000000..73f0523f --- /dev/null +++ b/examples_cuda/mandelbrot_tasks3d/Makefile_gpu @@ -0,0 +1,53 @@ +PROG=mandel_cu +ISPC_SRC=mandelbrot_tasks3d.ispc +CXX_SRC=mandel_cu.cpp mandelbrot_tasks_serial.cpp + +CXX=g++ +CXXFLAGS=-O3 -I$(CUDATK)/include +LD=g++ +LDFLAGS=-lcuda + +ISPC=ispc +ISPCFLAGS=-O3 --math-lib=default --target=nvptx64,avx + +LLVM32 = $(HOME)/usr/local/llvm/bin-3.2 +LLVM = $(HOME)/usr/local/llvm/bin-3.3 +PTXGEN = $(HOME)/ptxgen + +LLVM32DIS=$(LLVM32)/bin/llvm-dis + +.SUFFIXES: .bc .o .ptx .cu _ispc_nvptx64.bc + + +ISPC_OBJ=$(ISPC_SRC:%.ispc=%_ispc.o) +ISPC_BC=$(ISPC_SRC:%.ispc=%_ispc_nvptx64.bc) +PTXSRC=$(ISPC_SRC:%.ispc=%_ispc_nvptx64.ptx) +CXX_OBJ=$(CXX_SRC:%.cpp=%.o) + +all: $(PROG) + + +$(CXX_OBJ) : kernel.ptx +$(PROG): $(CXX_OBJ) kernel.ptx + /bin/cp kernel.ptx __kernels.ptx + $(LD) -o $@ $(CXX_OBJ) $(LDFLAGS) + +%.o: %.cpp + $(CXX) $(CXXFLAGS) -o $@ -c $< + + +%_ispc_nvptx64.bc: %.ispc + $(ISPC) $(ISPCFLAGS) --emit-llvm -o `basename $< .ispc`_ispc.bc -h `basename $< .ispc`_ispc.h $< --emit-llvm + +%.ptx: %.bc + $(LLVM32DIS) $< + $(PTXGEN) `basename $< .bc`.ll > $@ + 
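+# How the device code is built: ispc emits LLVM bitcode for the nvptx64
+# target, llvm-dis (from the LLVM 3.2 install) disassembles the .bc to .ll,
+# and ptxgen lowers the .ll to PTX.  The per-source PTX files are then
+# concatenated into kernel.ptx, which is copied to __kernels.ptx and
+# JIT-linked by the host program at run time.
+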
+kernel.ptx: $(PTXSRC) + cat $^ > kernel.ptx + +clean: + /bin/rm -rf *.ptx *.bc *.ll $(PROG) + + + diff --git a/examples_cuda/mandelbrot_tasks3d/drvapi_error_string.h b/examples_cuda/mandelbrot_tasks3d/drvapi_error_string.h new file mode 100644 index 00000000..ce85f152 --- /dev/null +++ b/examples_cuda/mandelbrot_tasks3d/drvapi_error_string.h @@ -0,0 +1,370 @@ +/* + * Copyright 1993-2012 NVIDIA Corporation. All rights reserved. + * + * Please refer to the NVIDIA end user license agreement (EULA) associated + * with this source code for terms and conditions that govern your use of + * this software. Any use, reproduction, disclosure, or distribution of + * this software and related documentation outside the terms of the EULA + * is strictly prohibited. + * + */ + +#ifndef _DRVAPI_ERROR_STRING_H_ +#define _DRVAPI_ERROR_STRING_H_ + +#include +#include +#include + +// Error Code string definitions here +typedef struct +{ + char const *error_string; + int error_id; +} s_CudaErrorStr; + +/** + * Error codes + */ +static s_CudaErrorStr sCudaDrvErrorString[] = +{ + /** + * The API call returned with no errors. In the case of query calls, this + * can also mean that the operation being queried is complete (see + * ::cuEventQuery() and ::cuStreamQuery()). + */ + { "CUDA_SUCCESS", 0 }, + + /** + * This indicates that one or more of the parameters passed to the API call + * is not within an acceptable range of values. + */ + { "CUDA_ERROR_INVALID_VALUE", 1 }, + + /** + * The API call failed because it was unable to allocate enough memory to + * perform the requested operation. + */ + { "CUDA_ERROR_OUT_OF_MEMORY", 2 }, + + /** + * This indicates that the CUDA driver has not been initialized with + * ::cuInit() or that initialization has failed. + */ + { "CUDA_ERROR_NOT_INITIALIZED", 3 }, + + /** + * This indicates that the CUDA driver is in the process of shutting down. + */ + { "CUDA_ERROR_DEINITIALIZED", 4 }, + + /** + * This indicates profiling APIs are called while application is running + * in visual profiler mode. + */ + { "CUDA_ERROR_PROFILER_DISABLED", 5 }, + /** + * This indicates profiling has not been initialized for this context. + * Call cuProfilerInitialize() to resolve this. + */ + { "CUDA_ERROR_PROFILER_NOT_INITIALIZED", 6 }, + /** + * This indicates profiler has already been started and probably + * cuProfilerStart() is incorrectly called. + */ + { "CUDA_ERROR_PROFILER_ALREADY_STARTED", 7 }, + /** + * This indicates profiler has already been stopped and probably + * cuProfilerStop() is incorrectly called. + */ + { "CUDA_ERROR_PROFILER_ALREADY_STOPPED", 8 }, + /** + * This indicates that no CUDA-capable devices were detected by the installed + * CUDA driver. + */ + { "CUDA_ERROR_NO_DEVICE (no CUDA-capable devices were detected)", 100 }, + + /** + * This indicates that the device ordinal supplied by the user does not + * correspond to a valid CUDA device. + */ + { "CUDA_ERROR_INVALID_DEVICE (device specified is not a valid CUDA device)", 101 }, + + + /** + * This indicates that the device kernel image is invalid. This can also + * indicate an invalid CUDA module. + */ + { "CUDA_ERROR_INVALID_IMAGE", 200 }, + + /** + * This most frequently indicates that there is no context bound to the + * current thread. This can also be returned if the context passed to an + * API call is not a valid handle (such as a context that has had + * ::cuCtxDestroy() invoked on it). This can also be returned if a user + * mixes different API versions (i.e. 3010 context with 3020 API calls). 
+ * See ::cuCtxGetApiVersion() for more details. + */ + { "CUDA_ERROR_INVALID_CONTEXT", 201 }, + + /** + * This indicated that the context being supplied as a parameter to the + * API call was already the active context. + * \deprecated + * This error return is deprecated as of CUDA 3.2. It is no longer an + * error to attempt to push the active context via ::cuCtxPushCurrent(). + */ + { "CUDA_ERROR_CONTEXT_ALREADY_CURRENT", 202 }, + + /** + * This indicates that a map or register operation has failed. + */ + { "CUDA_ERROR_MAP_FAILED", 205 }, + + /** + * This indicates that an unmap or unregister operation has failed. + */ + { "CUDA_ERROR_UNMAP_FAILED", 206 }, + + /** + * This indicates that the specified array is currently mapped and thus + * cannot be destroyed. + */ + { "CUDA_ERROR_ARRAY_IS_MAPPED", 207 }, + + /** + * This indicates that the resource is already mapped. + */ + { "CUDA_ERROR_ALREADY_MAPPED", 208 }, + + /** + * This indicates that there is no kernel image available that is suitable + * for the device. This can occur when a user specifies code generation + * options for a particular CUDA source file that do not include the + * corresponding device configuration. + */ + { "CUDA_ERROR_NO_BINARY_FOR_GPU", 209 }, + + /** + * This indicates that a resource has already been acquired. + */ + { "CUDA_ERROR_ALREADY_ACQUIRED", 210 }, + + /** + * This indicates that a resource is not mapped. + */ + { "CUDA_ERROR_NOT_MAPPED", 211 }, + + /** + * This indicates that a mapped resource is not available for access as an + * array. + */ + { "CUDA_ERROR_NOT_MAPPED_AS_ARRAY", 212 }, + + /** + * This indicates that a mapped resource is not available for access as a + * pointer. + */ + { "CUDA_ERROR_NOT_MAPPED_AS_POINTER", 213 }, + + /** + * This indicates that an uncorrectable ECC error was detected during + * execution. + */ + { "CUDA_ERROR_ECC_UNCORRECTABLE", 214 }, + + /** + * This indicates that the ::CUlimit passed to the API call is not + * supported by the active device. + */ + { "CUDA_ERROR_UNSUPPORTED_LIMIT", 215 }, + + /** + * This indicates that the ::CUcontext passed to the API call can + * only be bound to a single CPU thread at a time but is already + * bound to a CPU thread. + */ + { "CUDA_ERROR_CONTEXT_ALREADY_IN_USE", 216 }, + + /** + * This indicates that peer access is not supported across the given + * devices. + */ + { "CUDA_ERROR_PEER_ACCESS_UNSUPPORTED", 217}, + + /** + * This indicates that the device kernel source is invalid. + */ + { "CUDA_ERROR_INVALID_SOURCE", 300 }, + + /** + * This indicates that the file specified was not found. + */ + { "CUDA_ERROR_FILE_NOT_FOUND", 301 }, + + /** + * This indicates that a link to a shared object failed to resolve. + */ + { "CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND", 302 }, + + /** + * This indicates that initialization of a shared object failed. + */ + { "CUDA_ERROR_SHARED_OBJECT_INIT_FAILED", 303 }, + + /** + * This indicates that an OS call failed. + */ + { "CUDA_ERROR_OPERATING_SYSTEM", 304 }, + + + /** + * This indicates that a resource handle passed to the API call was not + * valid. Resource handles are opaque types like ::CUstream and ::CUevent. + */ + { "CUDA_ERROR_INVALID_HANDLE", 400 }, + + + /** + * This indicates that a named symbol was not found. Examples of symbols + * are global/constant variable names, texture names }, and surface names. + */ + { "CUDA_ERROR_NOT_FOUND", 500 }, + + + /** + * This indicates that asynchronous operations issued previously have not + * completed yet. 
This result is not actually an error, but must be indicated + * differently than ::CUDA_SUCCESS (which indicates completion). Calls that + * may return this value include ::cuEventQuery() and ::cuStreamQuery(). + */ + { "CUDA_ERROR_NOT_READY", 600 }, + + + /** + * An exception occurred on the device while executing a kernel. Common + * causes include dereferencing an invalid device pointer and accessing + * out of bounds shared memory. The context cannot be used }, so it must + * be destroyed (and a new one should be created). All existing device + * memory allocations from this context are invalid and must be + * reconstructed if the program is to continue using CUDA. + */ + { "CUDA_ERROR_LAUNCH_FAILED", 700 }, + + /** + * This indicates that a launch did not occur because it did not have + * appropriate resources. This error usually indicates that the user has + * attempted to pass too many arguments to the device kernel, or the + * kernel launch specifies too many threads for the kernel's register + * count. Passing arguments of the wrong size (i.e. a 64-bit pointer + * when a 32-bit int is expected) is equivalent to passing too many + * arguments and can also result in this error. + */ + { "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES", 701 }, + + /** + * This indicates that the device kernel took too long to execute. This can + * only occur if timeouts are enabled - see the device attribute + * ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT for more information. The + * context cannot be used (and must be destroyed similar to + * ::CUDA_ERROR_LAUNCH_FAILED). All existing device memory allocations from + * this context are invalid and must be reconstructed if the program is to + * continue using CUDA. + */ + { "CUDA_ERROR_LAUNCH_TIMEOUT", 702 }, + + /** + * This error indicates a kernel launch that uses an incompatible texturing + * mode. + */ + { "CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING", 703 }, + + /** + * This error indicates that a call to ::cuCtxEnablePeerAccess() is + * trying to re-enable peer access to a context which has already + * had peer access to it enabled. + */ + { "CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED", 704 }, + + /** + * This error indicates that ::cuCtxDisablePeerAccess() is + * trying to disable peer access which has not been enabled yet + * via ::cuCtxEnablePeerAccess(). + */ + { "CUDA_ERROR_PEER_ACCESS_NOT_ENABLED", 705 }, + + /** + * This error indicates that the primary context for the specified device + * has already been initialized. + */ + { "CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE", 708 }, + + /** + * This error indicates that the context current to the calling thread + * has been destroyed using ::cuCtxDestroy }, or is a primary context which + * has not yet been initialized. + */ + { "CUDA_ERROR_CONTEXT_IS_DESTROYED", 709 }, + + /** + * A device-side assert triggered during kernel execution. The context + * cannot be used anymore, and must be destroyed. All existing device + * memory allocations from this context are invalid and must be + * reconstructed if the program is to continue using CUDA. + */ + { "CUDA_ERROR_ASSERT", 710 }, + + /** + * This error indicates that the hardware resources required to enable + * peer access have been exhausted for one or more of the devices + * passed to ::cuCtxEnablePeerAccess(). + */ + { "CUDA_ERROR_TOO_MANY_PEERS", 711 }, + + /** + * This error indicates that the memory range passed to ::cuMemHostRegister() + * has already been registered. 
+ */ + { "CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED", 712 }, + + /** + * This error indicates that the pointer passed to ::cuMemHostUnregister() + * does not correspond to any currently registered memory region. + */ + { "CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED", 713 }, + + /** + * This error indicates that the attempted operation is not permitted. + */ + { "CUDA_ERROR_NOT_PERMITTED", 800 }, + + /** + * This error indicates that the attempted operation is not supported + * on the current system or device. + */ + { "CUDA_ERROR_NOT_SUPPORTED", 801 }, + + /** + * This indicates that an unknown internal error has occurred. + */ + { "CUDA_ERROR_UNKNOWN", 999 }, + { NULL, -1 } +}; + +// This is just a linear search through the array, since the error_id's are not +// always ocurring consecutively +const char * getCudaDrvErrorString(CUresult error_id) +{ + int index = 0; + while (sCudaDrvErrorString[index].error_id != error_id && + sCudaDrvErrorString[index].error_id != -1) + { + index++; + } + if (sCudaDrvErrorString[index].error_id == error_id) + return (const char *)sCudaDrvErrorString[index].error_string; + else + return (const char *)"CUDA_ERROR not found!"; +} + +#endif diff --git a/examples_cuda/mandelbrot_tasks3d/mandel_cu.cpp b/examples_cuda/mandelbrot_tasks3d/mandel_cu.cpp index 57ce9ac7..98ed0bb7 100644 --- a/examples_cuda/mandelbrot_tasks3d/mandel_cu.cpp +++ b/examples_cuda/mandelbrot_tasks3d/mandel_cu.cpp @@ -44,8 +44,6 @@ #include "../timing.h" #include - - double rtc(void) { struct timeval Tvalue; @@ -74,12 +72,6 @@ void __checkCudaErrors(CUresult err, const char *file, const int line) { exit(-1); } } -extern "C" -void mandelbrot_ispc( - float x0, float y0, - float x1, float y1, - int width, int height, - int maxIterations, int output[]) ; /**********************/ @@ -118,8 +110,120 @@ void destroyContext() CUmodule loadModule(const char * module) { + const double t0 = rtc(); CUmodule cudaModule; - checkCudaErrors(cuModuleLoadData(&cudaModule, module)); + // in this branch we use compilation with parameters + +#if 0 + unsigned int jitNumOptions = 1; + CUjit_option *jitOptions = new CUjit_option[jitNumOptions]; + void **jitOptVals = new void*[jitNumOptions]; + // set up pointer to set the Maximum # of registers for a particular kernel + jitOptions[0] = CU_JIT_MAX_REGISTERS; + int jitRegCount = 64; + jitOptVals[0] = (void *)(size_t)jitRegCount; +#if 0 + + { + jitNumOptions = 3; + // set up size of compilation log buffer + jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; + int jitLogBufferSize = 1024; + jitOptVals[0] = (void *)(size_t)jitLogBufferSize; + + // set up pointer to the compilation log buffer + jitOptions[1] = CU_JIT_INFO_LOG_BUFFER; + char *jitLogBuffer = new char[jitLogBufferSize]; + jitOptVals[1] = jitLogBuffer; + + // set up pointer to set the Maximum # of registers for a particular kernel + jitOptions[2] = CU_JIT_MAX_REGISTERS; + int jitRegCount = 32; + jitOptVals[2] = (void *)(size_t)jitRegCount; + } +#endif + + checkCudaErrors(cuModuleLoadDataEx(&cudaModule, module,jitNumOptions, jitOptions, (void **)jitOptVals)); +#else + CUlinkState CUState; + CUlinkState *lState = &CUState; + const int nOptions = 7; + CUjit_option options[nOptions]; + void* optionVals[nOptions]; + float walltime; + const unsigned int logSize = 32768; + char error_log[logSize], + info_log[logSize]; + void *cuOut; + size_t outSize; + int myErr = 0; + + // Setup linker options + // Return walltime from JIT compilation + options[0] = CU_JIT_WALL_TIME; + optionVals[0] = (void*) &walltime; + // Pass a 
buffer for info messages + options[1] = CU_JIT_INFO_LOG_BUFFER; + optionVals[1] = (void*) info_log; + // Pass the size of the info buffer + options[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; + optionVals[2] = (void*) logSize; + // Pass a buffer for error message + options[3] = CU_JIT_ERROR_LOG_BUFFER; + optionVals[3] = (void*) error_log; + // Pass the size of the error buffer + options[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES; + optionVals[4] = (void*) logSize; + // Make the linker verbose + options[5] = CU_JIT_LOG_VERBOSE; + optionVals[5] = (void*) 1; + // Max # of registers/pthread + options[6] = CU_JIT_MAX_REGISTERS; + int jitRegCount = 48; + optionVals[6] = (void *)(size_t)jitRegCount; + + // Create a pending linker invocation + checkCudaErrors(cuLinkCreate(nOptions,options, optionVals, lState)); + +#if 0 + if (sizeof(void *)==4) + { + // Load the PTX from the string myPtx32 + printf("Loading myPtx32[] program\n"); + // PTX May also be loaded from file, as per below. + myErr = cuLinkAddData(*lState, CU_JIT_INPUT_PTX, (void*)myPtx32, strlen(myPtx32)+1, 0, 0, 0, 0); + } + else +#endif + { + // Load the PTX from the string myPtx (64-bit) + fprintf(stderr, "Loading ptx..\n"); + myErr = cuLinkAddData(*lState, CU_JIT_INPUT_PTX, (void*)module, strlen(module)+1, 0, 0, 0, 0); + myErr = cuLinkAddFile(*lState, CU_JIT_INPUT_LIBRARY, "libcudadevrt.a", 0,0,0); + // PTX May also be loaded from file, as per below. + // myErr = cuLinkAddFile(*lState, CU_JIT_INPUT_PTX, "myPtx64.ptx",0,0,0); + } + + // Complete the linker step + myErr = cuLinkComplete(*lState, &cuOut, &outSize); + + if ( myErr != CUDA_SUCCESS ) + { + // Errors will be put in error_log, per CU_JIT_ERROR_LOG_BUFFER option above. + fprintf(stderr,"PTX Linker Error:\n%s\n",error_log); + assert(0); + } + + // Linker walltime and info_log were requested in options above. + fprintf(stderr, "CUDA Link Completed in %fms [ %g ms]. 
Linker Output:\n%s\n",walltime,info_log,1e3*(rtc() - t0)); + + // Load resulting cuBin into module + checkCudaErrors(cuModuleLoadData(&cudaModule, cuOut)); + + // Destroy the linker invocation + checkCudaErrors(cuLinkDestroy(*lState)); +#endif + fprintf(stderr, " loadModule took %g ms \n", 1e3*(rtc() - t0)); return cudaModule; } void unloadModule(CUmodule &cudaModule) @@ -152,12 +256,13 @@ void memcpyH2D(CUdeviceptr d_buf, void * h_buf, const size_t size) { checkCudaErrors(cuMemcpyHtoD(d_buf, h_buf, size)); } -#define deviceLaunch(func,nbx,nby,nbz,params) \ +#define deviceLaunch(func,params) \ + checkCudaErrors(cuFuncSetCacheConfig((func), CU_FUNC_CACHE_PREFER_EQUAL)); \ checkCudaErrors( \ cuLaunchKernel( \ (func), \ - ((nbx-1)/(128/32)+1), (nby), (nbz), \ - 128, 1, 1, \ + 1,1,1, \ + 32, 1, 1, \ 0, NULL, (params), NULL \ )); @@ -200,104 +305,23 @@ std::vector readBinary(const char * filename) extern "C" { -#if 0 - struct ModuleManager - { - private: - typedef std::pair ModulePair; - typedef std::map ModuleMap; - ModuleMap module_list; - - ModuleMap::iterator findModule(const char * module_name) - { - return module_list.find(std::string(module_name)); - } - - public: - - CUmodule loadModule(const char * module_name, const char * module_data) - { - const ModuleMap::iterator it = findModule(module_name) - if (it != ModuleMap::end) - { - CUmodule cudaModule = loadModule(module); - module_list.insert(std::make_pair(std::string(module_name), cudaModule)); - return cudaModule - } - return it->second; - } - void unloadModule(const char * module_name) - { - ModuleMap::iterator it = findModule(module_name) - if (it != ModuleMap::end) - module_list.erase(it); - } - }; -#endif - - void *CUDAAlloc(void **handlePtr, int64_t size, int32_t alignment) - { -#if 0 - fprintf(stderr, " ptr= %p\n", *handlePtr); - fprintf(stderr, " size= %d\n", (int)size); - fprintf(stderr, " alignment= %d\n", (int)alignment); - fprintf(stderr, " ------- \n\n"); -#endif - return NULL; - } - void CUDALaunch( + double CUDALaunch( void **handlePtr, - const char * module_name, - const char * module_1, const char * func_name, - void **func_args, - int countx, int county, int countz) + void **func_args) { - assert(module_name != NULL); - assert(module_1 != NULL); - assert(func_name != NULL); - assert(func_args != NULL); -#if 1 - const char * module = module_1; -#else - const std::vector module_str = readBinary("kernel.cubin"); + const std::vector module_str = readBinary("__kernels.ptx"); const char * module = &module_str[0]; -#endif -#if 1 CUmodule cudaModule = loadModule(module); CUfunction cudaFunction = getFunction(cudaModule, func_name); - deviceLaunch(cudaFunction, countx, county, countz, func_args); - unloadModule(cudaModule); -#else - fprintf(stderr, " handle= %p\n", *handlePtr); - fprintf(stderr, " count= %d %d %d\n", countx, county, countz); - - fprintf(stderr, " module_name= %s \n", module_name); - fprintf(stderr, " func_name= %s \n", func_name); - // fprintf(stderr, " ptx= %s \n", module); - fprintf(stderr, " x0= %g \n", *((float*)(func_args[0]))); - fprintf(stderr, " dx= %g \n", *((float*)(func_args[1]))); - fprintf(stderr, " y0= %g \n", *((float*)(func_args[2]))); - fprintf(stderr, " dy= %g \n", *((float*)(func_args[3]))); - fprintf(stderr, " w= %d \n", *((int*)(func_args[4]))); - fprintf(stderr, " h= %d \n", *((int*)(func_args[5]))); - fprintf(stderr, " xs= %d \n", *((int*)(func_args[6]))); - fprintf(stderr, " ys= %d \n", *((int*)(func_args[7]))); - fprintf(stderr, " maxit= %d \n", *((int*)(func_args[8]))); - 
fprintf(stderr, " ptr= %p \n", *((int**)(func_args[9]))); - fprintf(stderr, " ------- \n\n"); -#endif - } - void CUDASync(void *handle) - { + const double t0 = rtc(); + deviceLaunch(cudaFunction, func_args); checkCudaErrors(cuStreamSynchronize(0)); + const double dt = rtc() - t0; + unloadModule(cudaModule); + return dt; } - void ISPCSync(void *handle) - { - } - void CUDAFree(void *handle) - { - } + } /********************/ @@ -382,9 +406,15 @@ int main(int argc, char *argv[]) { for (unsigned int i = 0; i < width * height; ++i) buf[i] = 0; reset_and_start_timer(); +#if 0 const double t0 = rtc(); mandelbrot_ispc(x0, y0, x1, y1, width, height, maxIterations, (int*)d_buf); double dt = rtc() - t0; //get_elapsed_mcycles(); +#else + const char * func_name = "mandelbrot_ispc"; + void *func_args[] = {&x0, &y0, &x1, &y1, &width, &height, &maxIterations, &d_buf}; + const double dt = CUDALaunch(NULL, func_name, func_args); +#endif minISPC = std::min(minISPC, dt); } #endif diff --git a/examples_cuda/mandelbrot_tasks3d/mandelbrot_tasks3d.cpp b/examples_cuda/mandelbrot_tasks3d/mandelbrot_tasks3d.cpp index ffad92d0..f6741cbd 100644 --- a/examples_cuda/mandelbrot_tasks3d/mandelbrot_tasks3d.cpp +++ b/examples_cuda/mandelbrot_tasks3d/mandelbrot_tasks3d.cpp @@ -42,8 +42,20 @@ #include #include #include "../timing.h" -#include "mandelbrot_ispc.h" +#include "mandelbrot_tasks3d_ispc.h" using namespace ispc; +#include +double rtc(void) +{ + struct timeval Tvalue; + double etime; + struct timezone dummy; + + gettimeofday(&Tvalue,&dummy); + etime = (double) Tvalue.tv_sec + + 1.e-6*((double) Tvalue.tv_usec); + return etime; +} extern void mandelbrot_serial(float x0, float y0, float x1, float y1, int width, int height, int maxIterations, @@ -113,8 +125,9 @@ int main(int argc, char *argv[]) { for (unsigned int i = 0; i < width * height; ++i) buf[i] = 0; reset_and_start_timer(); + const double t0 = rtc(); mandelbrot_ispc(x0, y0, x1, y1, width, height, maxIterations, buf); - double dt = get_elapsed_mcycles(); + double dt = rtc() - t0; //get_elapsed_mcycles(); minISPC = std::min(minISPC, dt); } diff --git a/examples_cuda/mandelbrot_tasks3d/mandelbrot_tasks3d.ispc b/examples_cuda/mandelbrot_tasks3d/mandelbrot_tasks3d.ispc index 395bdca4..aeff4773 100644 --- a/examples_cuda/mandelbrot_tasks3d/mandelbrot_tasks3d.ispc +++ b/examples_cuda/mandelbrot_tasks3d/mandelbrot_tasks3d.ispc @@ -31,12 +31,21 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +#ifdef __NVPTX__ +#define taskIndex0 blockIndex0() +#define taskIndex1 blockIndex1() +#define taskCount0 blockCount0() +#define taskCount1 blockCount1() +#define programCount warpSize() +#define programIndex laneIndex() +#endif + static inline int mandel(float c_re, float c_im, int count) { float z_re = c_re, z_im = c_im; int i; for (i = 0; i < count; ++i) { - if (z_re * z_re + z_im * z_im > 4.) + if (z_re * z_re + z_im * z_im > 4.0f) break; float new_re = z_re*z_re - z_im*z_im; @@ -65,13 +74,16 @@ mandelbrot_scanline(uniform float x0, uniform float dx, const uniform int ystart = taskIndex1 * yspan; const uniform int yend = min(ystart + yspan, height); + for (uniform int yi = ystart; yi < yend; yi++) + for (uniform int xi = xstart; xi < xend; xi += programCount) + { + const float x = x0 + (xi + programIndex) * dx; + const float y = y0 + yi * dy; - foreach (yi = ystart ... yend, xi = xstart ... 
xend) { - float x = x0 + xi * dx; - float y = y0 + yi * dy; - - int index = yi * width + xi; - output[index] = mandel(x, y, maxIterations); + const int res = mandel(x,y,maxIterations); + const int index = yi * width + (xi + programIndex); + if (xi + programIndex < xend) + output[index] = res; } } @@ -84,8 +96,8 @@ mandelbrot_ispc(uniform float x0, uniform float y0, uniform int maxIterations, uniform int output[]) { uniform float dx = (x1 - x0) / width; uniform float dy = (y1 - y0) / height; - const uniform int xspan = 16; /* make sure it is big enough to avoid false-sharing */ - const uniform int yspan = 16; + const uniform int xspan = 32; /* make sure it is big enough to avoid false-sharing */ + const uniform int yspan = 4; #if 1 diff --git a/examples_cuda/tasksys.cpp b/examples_cuda/tasksys.cpp index 55cbccd5..c46b7352 100644 --- a/examples_cuda/tasksys.cpp +++ b/examples_cuda/tasksys.cpp @@ -214,7 +214,7 @@ extern "C" { /////////////////////////////////////////////////////////////////////////// // TaskGroupBase -#define LOG_TASK_QUEUE_CHUNK_SIZE 14 +#define LOG_TASK_QUEUE_CHUNK_SIZE 16 #define MAX_TASK_QUEUE_CHUNKS 8 #define TASK_QUEUE_CHUNK_SIZE (1<
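
For reference, the host-side path that this patch adds to mandel_cu.cpp (read the PTX emitted by Makefile_gpu, JIT-link it against the device runtime, fetch the exported kernel, launch one 32-thread block, and synchronize) can be condensed into the following standalone sketch. It is an illustration, not part of the patch: it assumes a __kernels.ptx in the working directory, reuses the kernel name mandelbrot_ispc and the 1-block/32-thread launch shape from the patch, substitutes a bare-bones CHECK macro for the patch's checkCudaErrors/getCudaDrvErrorString helpers, and uses placeholder values for the image size and coordinate bounds.

// Minimal sketch of the JIT-link-and-launch flow used in mandel_cu.cpp.
// Assumes __kernels.ptx (produced by Makefile_gpu) is in the working directory.
#include <cuda.h>
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <iterator>
#include <vector>

#define CHECK(call) do { CUresult err_ = (call); \
    if (err_ != CUDA_SUCCESS) { \
        std::fprintf(stderr, "CUDA driver error %d at %s:%d\n", \
                     (int)err_, __FILE__, __LINE__); \
        std::exit(1); } } while (0)

int main() {
    // Read the PTX produced by the build into a NUL-terminated buffer.
    std::ifstream in("__kernels.ptx", std::ios::binary);
    std::vector<char> ptx((std::istreambuf_iterator<char>(in)),
                          std::istreambuf_iterator<char>());
    ptx.push_back('\0');

    CHECK(cuInit(0));
    CUdevice dev;   CHECK(cuDeviceGet(&dev, 0));
    CUcontext ctx;  CHECK(cuCtxCreate(&ctx, 0, dev));

    // JIT-link the PTX against the device runtime library, as loadModule() does.
    CUlinkState link;
    CHECK(cuLinkCreate(0, NULL, NULL, &link));
    CHECK(cuLinkAddData(link, CU_JIT_INPUT_PTX, ptx.data(), ptx.size(),
                        "__kernels.ptx", 0, NULL, NULL));
    CHECK(cuLinkAddFile(link, CU_JIT_INPUT_LIBRARY, "libcudadevrt.a",
                        0, NULL, NULL));
    void *cubin; size_t cubinSize;
    CHECK(cuLinkComplete(link, &cubin, &cubinSize));

    CUmodule mod;  CHECK(cuModuleLoadData(&mod, cubin));
    CUfunction fn; CHECK(cuModuleGetFunction(&fn, mod, "mandelbrot_ispc"));

    // Kernel arguments, mirroring the func_args[] array built in main();
    // the numeric values here are placeholders.
    float x0 = -2.f, y0 = -1.f, x1 = 1.f, y1 = 1.f;
    int width = 768, height = 512, maxIterations = 256;
    CUdeviceptr d_buf;
    CHECK(cuMemAlloc(&d_buf, width * height * sizeof(int)));
    void *args[] = { &x0, &y0, &x1, &y1, &width, &height,
                     &maxIterations, &d_buf };

    // One block of 32 threads, as in the deviceLaunch() macro after this patch;
    // the kernel spawns the rest of the work through the device runtime.
    CHECK(cuLaunchKernel(fn, 1, 1, 1, 32, 1, 1, 0, NULL, args, NULL));
    CHECK(cuStreamSynchronize(0));

    CHECK(cuMemFree(d_buf));
    CHECK(cuModuleUnload(mod));
    CHECK(cuLinkDestroy(link));
    CHECK(cuCtxDestroy(ctx));
    return 0;
}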