diff --git a/examples_cuda/sort/Makefile b/examples_cuda/sort/Makefile index cf6bffa4..09ae50e5 100644 --- a/examples_cuda/sort/Makefile +++ b/examples_cuda/sort/Makefile @@ -2,7 +2,7 @@ EXAMPLE=sort CPP_SRC=sort.cpp sort_serial.cpp ISPC_SRC=sort.ispc -ISPC_IA_TARGETS=sse2,sse4-x2,avx +ISPC_IA_TARGETS=avx ISPC_ARM_TARGETS=neon #ISPC_FLAGS=-DDEBUG diff --git a/examples_cuda/sort/Makefile_gpu b/examples_cuda/sort/Makefile_gpu new file mode 100644 index 00000000..af65c30f --- /dev/null +++ b/examples_cuda/sort/Makefile_gpu @@ -0,0 +1,52 @@ +PROG=sort_cu +ISPC_SRC=sort1.ispc +CXX_SRC=sort_cu.cpp sort_serial.cpp + +CXX=g++ +CXXFLAGS=-O3 -I$(CUDATK)/include +LD=g++ +LDFLAGS=-lcuda + +ISPC=ispc +ISPCFLAGS=-O3 --math-lib=default --target=nvptx64,avx + +LLVM32 = $(HOME)/usr/local/llvm/bin-3.2 +LLVM = $(HOME)/usr/local/llvm/bin-3.3 +PTXGEN = $(HOME)/ptxgen + +LLVM32DIS=$(LLVM32)/bin/llvm-dis + +.SUFFIXES: .bc .o .ptx .cu _ispc_nvptx64.bc + + +ISPC_OBJ=$(ISPC_SRC:%.ispc=%_ispc.o) +ISPC_BC=$(ISPC_SRC:%.ispc=%_ispc_nvptx64.bc) +PTXSRC=$(ISPC_SRC:%.ispc=%_ispc_nvptx64.ptx) +CXX_OBJ=$(CXX_SRC:%.cpp=%.o) + +all: $(PROG) + + +$(PROG): $(CXX_OBJ) kernel.ptx + /bin/cp kernel.ptx __kernels.ptx + $(LD) -o $@ $(CXX_OBJ) $(LDFLAGS) + +%.o: %.cpp + $(CXX) $(CXXFLAGS) -o $@ -c $< + + +%_ispc_nvptx64.bc: %.ispc + $(ISPC) $(ISPCFLAGS) --emit-llvm -o `basename $< .ispc`_ispc.bc -h `basename $< .ispc`_ispc.h $< --emit-llvm + +%.ptx: %.bc + $(LLVM32DIS) $< + $(PTXGEN) `basename $< .bc`.ll > $@ + +kernel.ptx: $(PTXSRC) + cat $^ > kernel.ptx + +clean: + /bin/rm -rf *.ptx *.bc *.ll $(PROG) + + + diff --git a/examples_cuda/sort/drvapi_error_string.h b/examples_cuda/sort/drvapi_error_string.h new file mode 100644 index 00000000..ce85f152 --- /dev/null +++ b/examples_cuda/sort/drvapi_error_string.h @@ -0,0 +1,370 @@ +/* + * Copyright 1993-2012 NVIDIA Corporation. All rights reserved. + * + * Please refer to the NVIDIA end user license agreement (EULA) associated + * with this source code for terms and conditions that govern your use of + * this software. Any use, reproduction, disclosure, or distribution of + * this software and related documentation outside the terms of the EULA + * is strictly prohibited. + * + */ + +#ifndef _DRVAPI_ERROR_STRING_H_ +#define _DRVAPI_ERROR_STRING_H_ + +#include +#include +#include + +// Error Code string definitions here +typedef struct +{ + char const *error_string; + int error_id; +} s_CudaErrorStr; + +/** + * Error codes + */ +static s_CudaErrorStr sCudaDrvErrorString[] = +{ + /** + * The API call returned with no errors. In the case of query calls, this + * can also mean that the operation being queried is complete (see + * ::cuEventQuery() and ::cuStreamQuery()). + */ + { "CUDA_SUCCESS", 0 }, + + /** + * This indicates that one or more of the parameters passed to the API call + * is not within an acceptable range of values. + */ + { "CUDA_ERROR_INVALID_VALUE", 1 }, + + /** + * The API call failed because it was unable to allocate enough memory to + * perform the requested operation. + */ + { "CUDA_ERROR_OUT_OF_MEMORY", 2 }, + + /** + * This indicates that the CUDA driver has not been initialized with + * ::cuInit() or that initialization has failed. + */ + { "CUDA_ERROR_NOT_INITIALIZED", 3 }, + + /** + * This indicates that the CUDA driver is in the process of shutting down. + */ + { "CUDA_ERROR_DEINITIALIZED", 4 }, + + /** + * This indicates profiling APIs are called while application is running + * in visual profiler mode. + */ + { "CUDA_ERROR_PROFILER_DISABLED", 5 }, + /** + * This indicates profiling has not been initialized for this context. + * Call cuProfilerInitialize() to resolve this. + */ + { "CUDA_ERROR_PROFILER_NOT_INITIALIZED", 6 }, + /** + * This indicates profiler has already been started and probably + * cuProfilerStart() is incorrectly called. + */ + { "CUDA_ERROR_PROFILER_ALREADY_STARTED", 7 }, + /** + * This indicates profiler has already been stopped and probably + * cuProfilerStop() is incorrectly called. + */ + { "CUDA_ERROR_PROFILER_ALREADY_STOPPED", 8 }, + /** + * This indicates that no CUDA-capable devices were detected by the installed + * CUDA driver. + */ + { "CUDA_ERROR_NO_DEVICE (no CUDA-capable devices were detected)", 100 }, + + /** + * This indicates that the device ordinal supplied by the user does not + * correspond to a valid CUDA device. + */ + { "CUDA_ERROR_INVALID_DEVICE (device specified is not a valid CUDA device)", 101 }, + + + /** + * This indicates that the device kernel image is invalid. This can also + * indicate an invalid CUDA module. + */ + { "CUDA_ERROR_INVALID_IMAGE", 200 }, + + /** + * This most frequently indicates that there is no context bound to the + * current thread. This can also be returned if the context passed to an + * API call is not a valid handle (such as a context that has had + * ::cuCtxDestroy() invoked on it). This can also be returned if a user + * mixes different API versions (i.e. 3010 context with 3020 API calls). + * See ::cuCtxGetApiVersion() for more details. + */ + { "CUDA_ERROR_INVALID_CONTEXT", 201 }, + + /** + * This indicated that the context being supplied as a parameter to the + * API call was already the active context. + * \deprecated + * This error return is deprecated as of CUDA 3.2. It is no longer an + * error to attempt to push the active context via ::cuCtxPushCurrent(). + */ + { "CUDA_ERROR_CONTEXT_ALREADY_CURRENT", 202 }, + + /** + * This indicates that a map or register operation has failed. + */ + { "CUDA_ERROR_MAP_FAILED", 205 }, + + /** + * This indicates that an unmap or unregister operation has failed. + */ + { "CUDA_ERROR_UNMAP_FAILED", 206 }, + + /** + * This indicates that the specified array is currently mapped and thus + * cannot be destroyed. + */ + { "CUDA_ERROR_ARRAY_IS_MAPPED", 207 }, + + /** + * This indicates that the resource is already mapped. + */ + { "CUDA_ERROR_ALREADY_MAPPED", 208 }, + + /** + * This indicates that there is no kernel image available that is suitable + * for the device. This can occur when a user specifies code generation + * options for a particular CUDA source file that do not include the + * corresponding device configuration. + */ + { "CUDA_ERROR_NO_BINARY_FOR_GPU", 209 }, + + /** + * This indicates that a resource has already been acquired. + */ + { "CUDA_ERROR_ALREADY_ACQUIRED", 210 }, + + /** + * This indicates that a resource is not mapped. + */ + { "CUDA_ERROR_NOT_MAPPED", 211 }, + + /** + * This indicates that a mapped resource is not available for access as an + * array. + */ + { "CUDA_ERROR_NOT_MAPPED_AS_ARRAY", 212 }, + + /** + * This indicates that a mapped resource is not available for access as a + * pointer. + */ + { "CUDA_ERROR_NOT_MAPPED_AS_POINTER", 213 }, + + /** + * This indicates that an uncorrectable ECC error was detected during + * execution. + */ + { "CUDA_ERROR_ECC_UNCORRECTABLE", 214 }, + + /** + * This indicates that the ::CUlimit passed to the API call is not + * supported by the active device. + */ + { "CUDA_ERROR_UNSUPPORTED_LIMIT", 215 }, + + /** + * This indicates that the ::CUcontext passed to the API call can + * only be bound to a single CPU thread at a time but is already + * bound to a CPU thread. + */ + { "CUDA_ERROR_CONTEXT_ALREADY_IN_USE", 216 }, + + /** + * This indicates that peer access is not supported across the given + * devices. + */ + { "CUDA_ERROR_PEER_ACCESS_UNSUPPORTED", 217}, + + /** + * This indicates that the device kernel source is invalid. + */ + { "CUDA_ERROR_INVALID_SOURCE", 300 }, + + /** + * This indicates that the file specified was not found. + */ + { "CUDA_ERROR_FILE_NOT_FOUND", 301 }, + + /** + * This indicates that a link to a shared object failed to resolve. + */ + { "CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND", 302 }, + + /** + * This indicates that initialization of a shared object failed. + */ + { "CUDA_ERROR_SHARED_OBJECT_INIT_FAILED", 303 }, + + /** + * This indicates that an OS call failed. + */ + { "CUDA_ERROR_OPERATING_SYSTEM", 304 }, + + + /** + * This indicates that a resource handle passed to the API call was not + * valid. Resource handles are opaque types like ::CUstream and ::CUevent. + */ + { "CUDA_ERROR_INVALID_HANDLE", 400 }, + + + /** + * This indicates that a named symbol was not found. Examples of symbols + * are global/constant variable names, texture names }, and surface names. + */ + { "CUDA_ERROR_NOT_FOUND", 500 }, + + + /** + * This indicates that asynchronous operations issued previously have not + * completed yet. This result is not actually an error, but must be indicated + * differently than ::CUDA_SUCCESS (which indicates completion). Calls that + * may return this value include ::cuEventQuery() and ::cuStreamQuery(). + */ + { "CUDA_ERROR_NOT_READY", 600 }, + + + /** + * An exception occurred on the device while executing a kernel. Common + * causes include dereferencing an invalid device pointer and accessing + * out of bounds shared memory. The context cannot be used }, so it must + * be destroyed (and a new one should be created). All existing device + * memory allocations from this context are invalid and must be + * reconstructed if the program is to continue using CUDA. + */ + { "CUDA_ERROR_LAUNCH_FAILED", 700 }, + + /** + * This indicates that a launch did not occur because it did not have + * appropriate resources. This error usually indicates that the user has + * attempted to pass too many arguments to the device kernel, or the + * kernel launch specifies too many threads for the kernel's register + * count. Passing arguments of the wrong size (i.e. a 64-bit pointer + * when a 32-bit int is expected) is equivalent to passing too many + * arguments and can also result in this error. + */ + { "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES", 701 }, + + /** + * This indicates that the device kernel took too long to execute. This can + * only occur if timeouts are enabled - see the device attribute + * ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT for more information. The + * context cannot be used (and must be destroyed similar to + * ::CUDA_ERROR_LAUNCH_FAILED). All existing device memory allocations from + * this context are invalid and must be reconstructed if the program is to + * continue using CUDA. + */ + { "CUDA_ERROR_LAUNCH_TIMEOUT", 702 }, + + /** + * This error indicates a kernel launch that uses an incompatible texturing + * mode. + */ + { "CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING", 703 }, + + /** + * This error indicates that a call to ::cuCtxEnablePeerAccess() is + * trying to re-enable peer access to a context which has already + * had peer access to it enabled. + */ + { "CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED", 704 }, + + /** + * This error indicates that ::cuCtxDisablePeerAccess() is + * trying to disable peer access which has not been enabled yet + * via ::cuCtxEnablePeerAccess(). + */ + { "CUDA_ERROR_PEER_ACCESS_NOT_ENABLED", 705 }, + + /** + * This error indicates that the primary context for the specified device + * has already been initialized. + */ + { "CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE", 708 }, + + /** + * This error indicates that the context current to the calling thread + * has been destroyed using ::cuCtxDestroy }, or is a primary context which + * has not yet been initialized. + */ + { "CUDA_ERROR_CONTEXT_IS_DESTROYED", 709 }, + + /** + * A device-side assert triggered during kernel execution. The context + * cannot be used anymore, and must be destroyed. All existing device + * memory allocations from this context are invalid and must be + * reconstructed if the program is to continue using CUDA. + */ + { "CUDA_ERROR_ASSERT", 710 }, + + /** + * This error indicates that the hardware resources required to enable + * peer access have been exhausted for one or more of the devices + * passed to ::cuCtxEnablePeerAccess(). + */ + { "CUDA_ERROR_TOO_MANY_PEERS", 711 }, + + /** + * This error indicates that the memory range passed to ::cuMemHostRegister() + * has already been registered. + */ + { "CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED", 712 }, + + /** + * This error indicates that the pointer passed to ::cuMemHostUnregister() + * does not correspond to any currently registered memory region. + */ + { "CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED", 713 }, + + /** + * This error indicates that the attempted operation is not permitted. + */ + { "CUDA_ERROR_NOT_PERMITTED", 800 }, + + /** + * This error indicates that the attempted operation is not supported + * on the current system or device. + */ + { "CUDA_ERROR_NOT_SUPPORTED", 801 }, + + /** + * This indicates that an unknown internal error has occurred. + */ + { "CUDA_ERROR_UNKNOWN", 999 }, + { NULL, -1 } +}; + +// This is just a linear search through the array, since the error_id's are not +// always ocurring consecutively +const char * getCudaDrvErrorString(CUresult error_id) +{ + int index = 0; + while (sCudaDrvErrorString[index].error_id != error_id && + sCudaDrvErrorString[index].error_id != -1) + { + index++; + } + if (sCudaDrvErrorString[index].error_id == error_id) + return (const char *)sCudaDrvErrorString[index].error_string; + else + return (const char *)"CUDA_ERROR not found!"; +} + +#endif diff --git a/examples_cuda/sort/sort b/examples_cuda/sort/sort index 51e36ea7..d148e465 100755 Binary files a/examples_cuda/sort/sort and b/examples_cuda/sort/sort differ diff --git a/examples_cuda/sort/sort.cpp b/examples_cuda/sort/sort.cpp index f5e4264a..37a5c289 100644 --- a/examples_cuda/sort/sort.cpp +++ b/examples_cuda/sort/sort.cpp @@ -41,6 +41,20 @@ #include "../timing.h" #include "sort_ispc.h" +#include +static inline double rtc(void) +{ + struct timeval Tvalue; + double etime; + struct timezone dummy; + + gettimeofday(&Tvalue,&dummy); + etime = (double) Tvalue.tv_sec + + 1.e-6*((double) Tvalue.tv_usec); + return etime; +} + + using namespace ispc; extern void sort_serial (int n, unsigned int code[], int order[]); @@ -71,26 +85,30 @@ int main (int argc, char *argv[]) { int i, j, n = argc == 1 ? 1000000 : atoi(argv[1]), m = n < 100 ? 1 : 50, l = n < 100 ? n : RAND_MAX; double tISPC1 = 0.0, tISPC2 = 0.0, tSerial = 0.0; + printf("n= %d \n", n); unsigned int *code = new unsigned int [n]; int *order = new int [n]; srand (0); +#if 0 for (i = 0; i < m; i ++) { for (j = 0; j < n; j ++) code [j] = random() % l; reset_and_start_timer(); + const double t0 = rtc(); sort_ispc (n, code, order, 1); - tISPC1 += get_elapsed_mcycles(); + tISPC1 += (rtc() - t0); //get_elapsed_mcycles(); if (argc != 3) progressbar (i, m); } printf("[sort ispc]:\t[%.3f] million cycles\n", tISPC1); +#endif srand (0); @@ -100,9 +118,10 @@ int main (int argc, char *argv[]) reset_and_start_timer(); + const double t0 = rtc(); sort_ispc (n, code, order, 0); - tISPC2 += get_elapsed_mcycles(); + tISPC2 += (rtc() - t0); // get_elapsed_mcycles(); if (argc != 3) progressbar (i, m); @@ -118,9 +137,10 @@ int main (int argc, char *argv[]) reset_and_start_timer(); + const double t0 = rtc(); sort_serial (n, code, order); - tSerial += get_elapsed_mcycles(); + tSerial += (rtc() - t0);//get_elapsed_mcycles(); if (argc != 3) progressbar (i, m); diff --git a/examples_cuda/sort/sort1.cu b/examples_cuda/sort/sort1.cu new file mode 100644 index 00000000..b35ba506 --- /dev/null +++ b/examples_cuda/sort/sort1.cu @@ -0,0 +1,265 @@ +/* + Copyright (c) 2013, Durham University + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Durham University nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* Author: Tomasz Koziara */ + +#define programCount 32 +#define programIndex (threadIdx.x & 31) +#define taskIndex (blockIdx.x*4 + (threadIdx.x >> 5)) +#define taskCount (gridDim.x*4) +#define cfor for +#define cif if + +#define int8 char +#define int64 long +#define sync cudaDeviceSynchronize(); + +__device__ inline int nbx(const int n) { return (n - 1) / 4 + 1; } + +__global__ void histogram ( int span, int n, int64 code[], int pass, int hist[]) +{ + if (taskIndex >= taskCount) return; + int start = taskIndex*span; + int end = taskIndex == taskCount-1 ? n : start+span; + int strip = (end-start)/programCount; + int tail = (end-start)%programCount; + int i = programCount*taskIndex + programIndex; + int g [256]; + + cfor (int j = 0; j < 256; j ++) + { + g[j] = 0; + } + + cfor (int k = start+programIndex*strip; k < start+(programIndex+1)*strip; k ++) + { + unsigned int8 *c = (unsigned int8*) &code[k]; + + g[c[pass]] ++; + } + + if (programIndex == programCount-1) /* remainder is processed by the last lane */ + { + for (int k = start+programCount*strip; k < start+programCount*strip+tail; k ++) + { + unsigned int8 *c = (unsigned int8*) &code[k]; + + g[c[pass]] ++; + } + } + + cfor (int j = 0; j < 256; j ++) + { + hist[j*programCount*taskCount+i] = g[j]; + } +} + +__global__ void permutation ( int span, int n, int64 code[], int pass, int hist[], int64 perm[]) +{ + if (taskIndex >= taskCount) return; + int start = taskIndex*span; + int end = taskIndex == taskCount-1 ? n : start+span; + int strip = (end-start)/programCount; + int tail = (end-start)%programCount; + int i = programCount*taskIndex + programIndex; + int g [256]; + + cfor (int j = 0; j < 256; j ++) + { + g[j] = hist[j*programCount*taskCount+i]; + } + + cfor (int k = start+programIndex*strip; k < start+(programIndex+1)*strip; k ++) + { + unsigned int8 *c = (unsigned int8*) &code[k]; + + int l = g[c[pass]]; + + perm[l] = code[k]; + + g[c[pass]] = l+1; + } + + if (programIndex == programCount-1) /* remainder is processed by the last lane */ + { + for (int k = start+programCount*strip; k < start+programCount*strip+tail; k ++) + { + unsigned int8 *c = (unsigned int8*) &code[k]; + + int l = g[c[pass]]; + + perm[l] = code[k]; + + g[c[pass]] = l+1; + } + } +} + +__global__ void copy ( int span, int n, int64 from[], int64 to[]) +{ + if (taskIndex >= taskCount) return; + int start = taskIndex*span; + int end = taskIndex == taskCount-1 ? n : start+span; + + for (int i = programIndex + start; i < end; i += programCount) + if (i < end) + { + to[i] = from[i]; + } +} + +__global__ void pack ( int span, int n, unsigned int code[], int64 pair[]) +{ + if (taskIndex >= taskCount) return; + int start = taskIndex*span; + int end = taskIndex == taskCount-1 ? n : start+span; + + for (int i = programIndex + start; i < end; i += programCount) + if (i < end) + { + pair[i] = ((int64)i<<32)+code[i]; + } +} + +__global__ void unpack ( int span, int n, int64 pair[], int unsigned code[], int order[]) +{ + if (taskIndex >= taskCount) return; + int start = taskIndex*span; + int end = taskIndex == taskCount-1 ? n : start+span; + + for (int i = programIndex + start; i < end; i += programCount) + if (i < end) + { + code[i] = pair[i]; + order[i] = pair[i]>>32; + } +} + +__global__ void addup ( int h[], int g[]) +{ + if (taskIndex >= taskCount) return; + int * u = &h[256*programCount*taskIndex]; + int i, x, y = 0; + + for (i = 0; i < 256*programCount; i ++) + { + x = u[i]; + u[i] = y; + y += x; + } + + g[taskIndex] = y; +} + +__global__ void bumpup ( int h[], int g[]) +{ + if (taskIndex >= taskCount) return; + int * u = &h[256*programCount*taskIndex]; + int z = g[taskIndex]; + + for (int i = programIndex; i < 256*programCount; i += programCount) + { + u[i] += z; + } +} + +__device__ +static void prefix_sum ( int num, int h[]) +{ + int * g = new int [num+1]; + int i; + +// launch[num] addup (h, g+1); + if(programIndex == 0) + addup<<>>(h,g+1); + sync; + + for (g[0] = 0, i = 1; i < num; i ++) g[i] += g[i-1]; + +// launch[num] bumpup (h, g); + if(programIndex == 0) + bumpup<<>>(h,g); + sync; + + delete g; +} + +extern "C" __global__ +void sort_ispc ( int n, unsigned int code[], int order[], int ntasks) +{ + int num = ntasks < 1 ? 13*4*8 : ntasks; + int span = n / num; + int hsize = 256*programCount*num; + int * hist = new int [hsize]; + int64 * pair = new int64 [n]; + int64 * temp = new int64 [n]; + int pass, i; + + +// launch[num] pack (span, n, code, pair); + if(programIndex == 0) + pack<<>>(span, n, code, pair); + sync; + +#if 0 + for (pass = 0; pass < 4; pass ++) + { +// launch[num] histogram (span, n, pair, pass, hist); + if(programIndex == 0) + histogram<<>>(span, n, pair, pass, hist); + sync; + + prefix_sum (num, hist); + +// launch[num] permutation (span, n, pair, pass, hist, temp); + if(programIndex == 0) + permutation<<>> (span, n, pair, pass, hist, temp); + sync; + +/// launch[num] copy (span, n, temp, pair); + if(programIndex == 0) + copy<<>> (span, n, temp, pair); + sync; + } + +/// launch[num] unpack (span, n, pair, code, order); + if(programIndex == 0) + unpack<<>> (span, n, pair, code, order); + sync; +#endif + + + delete hist; + delete pair; + delete temp; +} diff --git a/examples_cuda/sort/sort1.ispc b/examples_cuda/sort/sort1.ispc new file mode 100644 index 00000000..0c500ed4 --- /dev/null +++ b/examples_cuda/sort/sort1.ispc @@ -0,0 +1,275 @@ +/* + Copyright (c) 2013, Durham University + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Durham University nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* Author: Tomasz Koziara */ + +#ifdef __NVPTX__ +#warning "emitting DEVICE code" +#define programCount warpSize() +#define programIndex laneIndex() +#define taskIndex blockIndex0() +#define taskCount blockCount0() +#define cfor for +#define cif if +#else +#warning "emitting HOST code" +#endif + +task void histogram (uniform int span, uniform int n, uniform int64 code[], uniform int pass, uniform int hist[]) +{ + if (taskIndex >= taskCount) return; + uniform int start = taskIndex*span; + uniform int end = taskIndex == taskCount-1 ? n : start+span; + uniform int strip = (end-start)/programCount; + uniform int tail = (end-start)%programCount; + int i = programCount*taskIndex + programIndex; + int g [256]; + + cfor (int j = 0; j < 256; j ++) + { + g[j] = 0; + } + + cfor (int k = start+programIndex*strip; k < start+(programIndex+1)*strip; k ++) + { + unsigned int8 *c = (unsigned int8*) &code[k]; + + g[c[pass]] ++; + } + + if (programIndex == programCount-1) /* remainder is processed by the last lane */ + { + for (int k = start+programCount*strip; k < start+programCount*strip+tail; k ++) + { + unsigned int8 *c = (unsigned int8*) &code[k]; + + g[c[pass]] ++; + } + } + + cfor (int j = 0; j < 256; j ++) + { + hist[j*programCount*taskCount+i] = g[j]; + } +} + +task void permutation (uniform int span, uniform int n, uniform int64 code[], uniform int pass, uniform int hist[], uniform int64 perm[]) +{ + if (taskIndex >= taskCount) return; + uniform int start = taskIndex*span; + uniform int end = taskIndex == taskCount-1 ? n : start+span; + uniform int strip = (end-start)/programCount; + uniform int tail = (end-start)%programCount; + int i = programCount*taskIndex + programIndex; + int g [256]; + + cfor (int j = 0; j < 256; j ++) + { + g[j] = hist[j*programCount*taskCount+i]; + } + + cfor (int k = start+programIndex*strip; k < start+(programIndex+1)*strip; k ++) + { + unsigned int8 *c = (unsigned int8*) &code[k]; + + int l = g[c[pass]]; + + perm[l] = code[k]; + + g[c[pass]] = l+1; + } + + if (programIndex == programCount-1) /* remainder is processed by the last lane */ + { + for (int k = start+programCount*strip; k < start+programCount*strip+tail; k ++) + { + unsigned int8 *c = (unsigned int8*) &code[k]; + + int l = g[c[pass]]; + + perm[l] = code[k]; + + g[c[pass]] = l+1; + } + } +} + +task void copy (uniform int span, uniform int n, uniform int64 from[], uniform int64 to[]) +{ + if (taskIndex >= taskCount) return; + uniform int start = taskIndex*span; + uniform int end = taskIndex == taskCount-1 ? n : start+span; + + for (int i = programIndex + start; i < end; i += programCount) + if (i < end) + { + to[i] = from[i]; + } +} + +task void pack (uniform int span, uniform int n, uniform unsigned int code[], uniform int64 pair[]) +{ +#if 0 + if (taskIndex >= taskCount) return; + uniform int start = taskIndex*span; + uniform int end = taskIndex == taskCount-1 ? n : start+span; + + for (int i = programIndex + start; i < end; i += programCount) + if (i < end) + { + pair[i] = ((int64)i<<32)+code[i]; + } +#endif +} + +task void unpack (uniform int span, uniform int n, uniform int64 pair[], uniform int unsigned code[], uniform int order[]) +{ + if (taskIndex >= taskCount) return; + uniform int start = taskIndex*span; + uniform int end = taskIndex == taskCount-1 ? n : start+span; + + for (int i = programIndex + start; i < end; i += programCount) + if (i < end) + { + code[i] = pair[i]; + order[i] = pair[i]>>32; + } +} + +task void addup (uniform int h[], uniform int g[]) +{ + if (taskIndex >= taskCount) return; + uniform int * uniform u = &h[256*programCount*taskIndex]; + uniform int i, x, y = 0; + + for (i = 0; i < 256*programCount; i ++) + { + x = u[i]; + u[i] = y; + y += x; + } + + g[taskIndex] = y; +} + +task void bumpup (uniform int h[], uniform int g[]) +{ + if (taskIndex >= taskCount) return; + uniform int * uniform u = &h[256*programCount*taskIndex]; + uniform int z = g[taskIndex]; + + for (int i = programIndex; i < 256*programCount; i += programCount) + { + u[i] += z; + } +} + +static void prefix_sum (uniform int num, uniform int h[]) +{ + uniform int * uniform g = uniform new uniform int [num+1]; + uniform int i; + + launch[num] addup (h, g+1); + sync; + + for (g[0] = 0, i = 1; i < num; i ++) g[i] += g[i-1]; + + launch[num] bumpup (h, g); + sync; + + delete g; +} + +export void sort_ispc (uniform int n, uniform unsigned int code[], uniform int order[], uniform int ntasks) +{ + uniform int num = ntasks < 1 ? 13*4*8 : ntasks; + uniform int span = n / num; + uniform int hsize = 256*programCount*num; + uniform int * uniform hist = uniform new uniform int [hsize]; + uniform int64 * uniform pair = uniform new uniform int64 [n]; + uniform int64 * uniform temp = uniform new uniform int64 [n]; + uniform int pass, i; + +#if DEBUG + if (n < 100) + { + print ("input: "); + for (i = 0; i < n; i ++) print ("%, ", code[i]); + print ("\n"); + } +#endif + + launch[num] pack (span, n, code, pair); + sync; +#if 0 + + for (pass = 0; pass < 4; pass ++) + { + launch[num] histogram (span, n, pair, pass, hist); + sync; + + prefix_sum (num, hist); + + launch[num] permutation (span, n, pair, pass, hist, temp); + sync; + + launch[num] copy (span, n, temp, pair); + sync; + } + + launch[num] unpack (span, n, pair, code, order); + sync; + +#if DEBUG + for (i = 0; i < n; i ++) + { + if (i > 0 && code[i-1] > code[i]) + print ("ERR at % => % > %; ", i, code[i-1], code[i]); + } + + if (n < 100) + { + print ("output: "); + for (i = 0; i < n; i ++) print ("%, ", code[i]); + print ("\n"); + print ("order: "); + for (i = 0; i < n; i ++) print ("%, ", order[i]); + print ("\n"); + } +#endif +#endif + + delete hist; + delete pair; + delete temp; +} diff --git a/examples_cuda/sort/sort_cu.cpp b/examples_cuda/sort/sort_cu.cpp new file mode 100644 index 00000000..616be83f --- /dev/null +++ b/examples_cuda/sort/sort_cu.cpp @@ -0,0 +1,405 @@ +/* + Copyright (c) 2013, Durham University + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Durham University nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* Author: Tomasz Koziara */ + +#include +#include +#include +#include +#include +#include "../timing.h" +//#include "sort_ispc.h" +//using namespace ispc; + +#include +static inline double rtc(void) +{ + struct timeval Tvalue; + double etime; + struct timezone dummy; + + gettimeofday(&Tvalue,&dummy); + etime = (double) Tvalue.tv_sec + + 1.e-6*((double) Tvalue.tv_usec); + return etime; +} +/******************************/ +#include +#include +#include +#include "drvapi_error_string.h" + +#define checkCudaErrors(err) __checkCudaErrors (err, __FILE__, __LINE__) +// These are the inline versions for all of the SDK helper functions +void __checkCudaErrors(CUresult err, const char *file, const int line) { + if(CUDA_SUCCESS != err) { + std::cerr << "checkCudeErrors() Driver API error = " << err << "\"" + << getCudaDrvErrorString(err) << "\" from file <" << file + << ", line " << line << "\n"; + exit(-1); + } +} + +/**********************/ +/* Basic CUDriver API */ +CUcontext context; + +void createContext(const int deviceId = 0) +{ + CUdevice device; + int devCount; + checkCudaErrors(cuInit(0)); + checkCudaErrors(cuDeviceGetCount(&devCount)); + assert(devCount > 0); + checkCudaErrors(cuDeviceGet(&device, deviceId < devCount ? deviceId : 0)); + + char name[128]; + checkCudaErrors(cuDeviceGetName(name, 128, device)); + std::cout << "Using CUDA Device [0]: " << name << "\n"; + + int devMajor, devMinor; + checkCudaErrors(cuDeviceComputeCapability(&devMajor, &devMinor, device)); + std::cout << "Device Compute Capability: " + << devMajor << "." << devMinor << "\n"; + if (devMajor < 2) { + std::cerr << "ERROR: Device 0 is not SM 2.0 or greater\n"; + exit(1); + } + + // Create driver context + checkCudaErrors(cuCtxCreate(&context, 0, device)); +} +void destroyContext() +{ + checkCudaErrors(cuCtxDestroy(context)); +} + +CUmodule loadModule(const char * module) +{ + const double t0 = rtc(); + CUmodule cudaModule; + // in this branch we use compilation with parameters + + CUlinkState CUState; + CUlinkState *lState = &CUState; + const int nOptions = 7; + CUjit_option options[nOptions]; + void* optionVals[nOptions]; + float walltime; + const unsigned int logSize = 32768; + char error_log[logSize], + info_log[logSize]; + void *cuOut; + size_t outSize; + int myErr = 0; + + // Setup linker options + // Return walltime from JIT compilation + options[0] = CU_JIT_WALL_TIME; + optionVals[0] = (void*) &walltime; + // Pass a buffer for info messages + options[1] = CU_JIT_INFO_LOG_BUFFER; + optionVals[1] = (void*) info_log; + // Pass the size of the info buffer + options[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; + optionVals[2] = (void*) logSize; + // Pass a buffer for error message + options[3] = CU_JIT_ERROR_LOG_BUFFER; + optionVals[3] = (void*) error_log; + // Pass the size of the error buffer + options[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES; + optionVals[4] = (void*) logSize; + // Make the linker verbose + options[5] = CU_JIT_LOG_VERBOSE; + optionVals[5] = (void*) 1; + // Max # of registers/pthread + options[6] = CU_JIT_MAX_REGISTERS; + int jitRegCount = 32; + optionVals[6] = (void *)(size_t)jitRegCount; + + // Create a pending linker invocation + checkCudaErrors(cuLinkCreate(nOptions,options, optionVals, lState)); + +#if 0 + if (sizeof(void *)==4) + { + // Load the PTX from the string myPtx32 + printf("Loading myPtx32[] program\n"); + // PTX May also be loaded from file, as per below. + myErr = cuLinkAddData(*lState, CU_JIT_INPUT_PTX, (void*)myPtx32, strlen(myPtx32)+1, 0, 0, 0, 0); + } + else +#endif + { + // Load the PTX from the string myPtx (64-bit) + fprintf(stderr, "Loading ptx..\n"); + myErr = cuLinkAddData(*lState, CU_JIT_INPUT_PTX, (void*)module, strlen(module)+1, 0, 0, 0, 0); + myErr = cuLinkAddFile(*lState, CU_JIT_INPUT_LIBRARY, "libcudadevrt.a", 0,0,0); + // PTX May also be loaded from file, as per below. + // myErr = cuLinkAddFile(*lState, CU_JIT_INPUT_PTX, "myPtx64.ptx",0,0,0); + } + + // Complete the linker step + myErr = cuLinkComplete(*lState, &cuOut, &outSize); + + if ( myErr != CUDA_SUCCESS ) + { + // Errors will be put in error_log, per CU_JIT_ERROR_LOG_BUFFER option above. + fprintf(stderr,"PTX Linker Error:\n%s\n",error_log); + assert(0); + } + + // Linker walltime and info_log were requested in options above. + fprintf(stderr, "CUDA Link Completed in %fms [ %g ms]. Linker Output:\n%s\n",walltime,info_log,1e3*(rtc() - t0)); + + // Load resulting cuBin into module + checkCudaErrors(cuModuleLoadData(&cudaModule, cuOut)); + + // Destroy the linker invocation + checkCudaErrors(cuLinkDestroy(*lState)); + fprintf(stderr, " loadModule took %g ms \n", 1e3*(rtc() - t0)); + return cudaModule; +} +void unloadModule(CUmodule &cudaModule) +{ + checkCudaErrors(cuModuleUnload(cudaModule)); +} + +CUfunction getFunction(CUmodule &cudaModule, const char * function) +{ + CUfunction cudaFunction; + checkCudaErrors(cuModuleGetFunction(&cudaFunction, cudaModule, function)); + return cudaFunction; +} + +CUdeviceptr deviceMalloc(const size_t size) +{ + CUdeviceptr d_buf; + checkCudaErrors(cuMemAlloc(&d_buf, size)); + return d_buf; +} +void deviceFree(CUdeviceptr d_buf) +{ + checkCudaErrors(cuMemFree(d_buf)); +} +void memcpyD2H(void * h_buf, CUdeviceptr d_buf, const size_t size) +{ + checkCudaErrors(cuMemcpyDtoH(h_buf, d_buf, size)); +} +void memcpyH2D(CUdeviceptr d_buf, void * h_buf, const size_t size) +{ + checkCudaErrors(cuMemcpyHtoD(d_buf, h_buf, size)); +} +#define deviceLaunch(func,params) \ + checkCudaErrors(cuFuncSetCacheConfig((func), CU_FUNC_CACHE_PREFER_L1)); \ + checkCudaErrors( \ + cuLaunchKernel( \ + (func), \ + 1,1,1, \ + 32, 1, 1, \ + 0, NULL, (params), NULL \ + )); + +typedef CUdeviceptr devicePtr; + + +/**************/ +#include +std::vector readBinary(const char * filename) +{ + std::vector buffer; + FILE *fp = fopen(filename, "rb"); + if (!fp ) + { + fprintf(stderr, "file %s not found\n", filename); + assert(0); + } +#if 0 + char c; + while ((c = fgetc(fp)) != EOF) + buffer.push_back(c); +#else + fseek(fp, 0, SEEK_END); + const unsigned long long size = ftell(fp); /*calc the size needed*/ + fseek(fp, 0, SEEK_SET); + buffer.resize(size); + + if (fp == NULL){ /*ERROR detection if file == empty*/ + fprintf(stderr, "Error: There was an Error reading the file %s \n",filename); + exit(1); + } + else if (fread(&buffer[0], sizeof(char), size, fp) != size){ /* if count of read bytes != calculated size of .bin file -> ERROR*/ + fprintf(stderr, "Error: There was an Error reading the file %s \n", filename); + exit(1); + } +#endif + fprintf(stderr, " read buffer of size= %d bytes \n", (int)buffer.size()); + return buffer; +} + +extern "C" +{ + double CUDALaunch( + void **handlePtr, + const char * func_name, + void **func_args) + { + const std::vector module_str = readBinary("__kernels.ptx"); + const char * module = &module_str[0]; + CUmodule cudaModule = loadModule(module); + CUfunction cudaFunction = getFunction(cudaModule, func_name); + const double t0 = rtc(); + deviceLaunch(cudaFunction, func_args); + checkCudaErrors(cuStreamSynchronize(0)); + const double dt = rtc() - t0; + unloadModule(cudaModule); + return dt; + } +} +/******************************/ + + + +extern void sort_serial (int n, unsigned int code[], int order[]); + +/* progress bar by Ross Hemsley; + * http://www.rosshemsley.co.uk/2011/02/creating-a-progress-bar-in-c-or-any-other-console-app/ */ +static inline void progressbar (unsigned int x, unsigned int n, unsigned int w = 50) +{ + if (n < 100) + { + x *= 100/n; + n = 100; + } + + if ((x != n) && (x % (n/100) != 0)) return; + + using namespace std; + float ratio = x/(float)n; + int c = ratio * w; + + cout << setw(3) << (int)(ratio*100) << "% ["; + for (int x=0; x