From 463ddf170718e026fc0a9896c908b0cbd0873634 Mon Sep 17 00:00:00 2001 From: Evghenii Date: Thu, 20 Feb 2014 13:49:58 +0100 Subject: [PATCH] added radixSort --- examples/portable/radixSort/Makefile_cpu | 9 + examples/portable/radixSort/Makefile_ptx | 15 + examples/portable/radixSort/radixSort.cpp | 119 +++++++ examples/portable/radixSort/radixSort.cu | 365 +++++++++++++++++++++ examples/portable/radixSort/radixSort.ispc | 301 +++++++++++++++++ 5 files changed, 809 insertions(+) create mode 100644 examples/portable/radixSort/Makefile_cpu create mode 100644 examples/portable/radixSort/Makefile_ptx create mode 100644 examples/portable/radixSort/radixSort.cpp create mode 100644 examples/portable/radixSort/radixSort.cu create mode 100644 examples/portable/radixSort/radixSort.ispc diff --git a/examples/portable/radixSort/Makefile_cpu b/examples/portable/radixSort/Makefile_cpu new file mode 100644 index 00000000..1d3808dc --- /dev/null +++ b/examples/portable/radixSort/Makefile_cpu @@ -0,0 +1,9 @@ + +EXAMPLE=radixSort +CPP_SRC=radixSort.cpp +ISPC_SRC=radixSort.ispc +ISPC_IA_TARGETS=avx1-i32x8 +ISPC_ARM_TARGETS=neon +#ISPC_FLAGS=-DDEBUG -g + +include ../common_cpu.mk diff --git a/examples/portable/radixSort/Makefile_ptx b/examples/portable/radixSort/Makefile_ptx new file mode 100644 index 00000000..da7494e4 --- /dev/null +++ b/examples/portable/radixSort/Makefile_ptx @@ -0,0 +1,15 @@ +PROG=radixSort +ISPC_SRC=radixSort.ispc + +CU_SRC=radixSort.cu +# NVCC_FLAGS=-Xptxas=-O1 +CXX_SRC=radixSort.cpp radixSort.cpp +PTXCC_REGMAX=64 + +LLVM_GPU=1 +NVVM_GPU=1 + +include ../common_ptx.mk + + + diff --git a/examples/portable/radixSort/radixSort.cpp b/examples/portable/radixSort/radixSort.cpp new file mode 100644 index 00000000..4e42476e --- /dev/null +++ b/examples/portable/radixSort/radixSort.cpp @@ -0,0 +1,119 @@ +#include +#include +#include +#include +#include +#include +#include "timing.h" +#include "ispc_malloc.h" +#include "radixSort_ispc.h" + +/* progress bar by Ross Hemsley; + * http://www.rosshemsley.co.uk/2011/02/creating-a-progress-bar-in-c-or-any-other-console-app/ */ +static inline void progressbar (unsigned int x, unsigned int n, unsigned int w = 50) +{ + if (n < 100) + { + x *= 100/n; + n = 100; + } + + if ((x != n) && (x % (n/100) != 0)) return; + + using namespace std; + float ratio = x/(float)n; + int c = ratio * w; + + cout << setw(3) << (int)(ratio*100) << "% ["; + for (int x=0; x + +#define NUMBITS 8 +#define NUMDIGITS (1<> bit); + atomic_add_global(&counts[key], 1); + } + +#pragma unroll 8 + for (int digit = programIndex; digit < NUMDIGITS; digit += programCount) + atomic_add_global(&countsGlobal[digit], counts[digit]); +} + +__global__ +void sortPass( + Key keysAll[], + Key sorted[], + int bit, + int numElements, + int digitOffsetsAll[]) +{ + const int blkIdx = taskIndex; + const int numBlocks = taskCount; + + const int blkDim = (numElements + numBlocks - 1) / numBlocks; + + + const int keyIndex = blkIdx * blkDim; + Key * keys = keysAll + keyIndex; + + + const int nloc = min(numElements - keyIndex, blkDim); + + const int mask = (1 << NUMBITS) - 1; + + /* copy digit offset from Gmem to Lmem */ +#if 1 + __shared__ int digitOffsets_sh[NUMDIGITS*4]; + volatile int *digitOffsets = digitOffsets_sh + warpIdx*NUMDIGITS; + for (int digit = programIndex; digit < NUMDIGITS; digit += programCount) + digitOffsets[digit] = digitOffsetsAll[blkIdx*NUMDIGITS + digit]; +#else + int *digitOffsets = &digitOffsetsAll[blkIdx*NUMDIGITS]; +#endif + + + for (int i = programIndex; i < nloc; i += programCount) + if (i < nloc) + { + const int key = mask & ((unsigned int)keys[i] >> bit); + int scatter; + /* not a vector friendly loop */ +#pragma unroll 1 /* needed, otherwise compiler unroll and optimizes the result :S */ + for (int iv = 0; iv < programCount; iv++) + if (programIndex == iv) + scatter = digitOffsets[key]++; + sorted [scatter] = keys[i]; + } +} + +__global__ +void partialScanLocal( + int numBlocks, + int excScanAll[], + int countsAll[], + int partialSumAll[]) +{ + const int blkIdx = taskIndex; + + const int blkDim = (numBlocks+taskCount-1)/taskCount; + const int bbeg = blkIdx * blkDim; + const int bend = min(bbeg + blkDim, numBlocks); + + int (* countsBlock)[NUMDIGITS] = ( int (*)[NUMDIGITS])countsAll; + int (* excScanBlock)[NUMDIGITS] = ( int (*)[NUMDIGITS])excScanAll; + int (* partialSum)[NUMDIGITS] = ( int (*)[NUMDIGITS])partialSumAll; + +#pragma unroll 8 + for (int digit = programIndex; digit < NUMDIGITS; digit += programCount) + { + int prev = bbeg == 0 ? excScanBlock[0][digit] : 0; + for ( int block = bbeg; block < bend; block++) + { + const int y = countsBlock[block][digit]; + excScanBlock[block][digit] = prev; + prev += y; + } + partialSum[blkIdx][digit] = excScanBlock[bend-1][digit] + countsBlock[bend-1][digit]; + } +} + +__global__ +void partialScanGlobal( + const int numBlocks, + int partialSumAll[], + int prefixSumAll[]) +{ + int (* partialSum)[NUMDIGITS] = ( int (*)[NUMDIGITS])partialSumAll; + int (* prefixSum)[NUMDIGITS] = ( int (*)[NUMDIGITS]) prefixSumAll; + const int digit = taskIndex; + int carry = 0; + for (int block = programIndex; block < numBlocks; block += programCount) + { + const int value = partialSum[block][digit]; + const int scan = exclusive_scan_add(value); + if (block < numBlocks) + prefixSum[block][digit] = scan + carry; + carry += __shfl(scan+value, programCount-1); + } +} + +__global__ +void completeScanGlobal( + int numBlocks, + int excScanAll[], + int carryValueAll[]) +{ + const int blkIdx = taskIndex; + const int blkDim = (numBlocks+taskCount-1)/taskCount; + const int bbeg = blkIdx * blkDim; + const int bend = min(bbeg + blkDim, numBlocks); + + int (* excScanBlock)[NUMDIGITS] = ( int (*)[NUMDIGITS])excScanAll; + int (* carryValue)[NUMDIGITS] = ( int (*)[NUMDIGITS])carryValueAll; + +#pragma unroll 8 + for (int digit = programIndex; digit < NUMDIGITS; digit += programCount) + { + const int carry = carryValue[blkIdx][digit]; + for ( int block = bbeg; block < bend; block++) + excScanBlock[block][digit] += carry; + } +} + +__device__ static +inline void radixExclusiveScan( + const int numBlocks, + int excScanPtr[], + int countsPtr[], + int partialSum[], + int prefixSum[]) +{ + const int scale = 8; + launch (numBlocks/scale, 1,1, partialScanLocal)(numBlocks, excScanPtr, countsPtr, partialSum); + sync; + + launch (NUMDIGITS,1,1,partialScanGlobal) (numBlocks/scale, partialSum, prefixSum); + sync; + + launch (numBlocks/scale,1,1, completeScanGlobal) (numBlocks, excScanPtr, prefixSum); + sync; +} + +__device__ static int * memoryPool = NULL; +__device__ static int numBlocks; +__device__ static int nSharedCounts; +__device__ static int nCountsGlobal; +__device__ static int nExcScan; +__device__ static int nCountsBlock; +__device__ static int nPartialSum; +__device__ static int nPrefixSum; + +__device__ static int * sharedCounts; +__device__ static int * countsGlobal; +__device__ static int * excScan; +__device__ static int * counts; +__device__ static int * partialSum; +__device__ static int * prefixSum; + +__device__ static int numElementsBuf = 0; +__device__ static Key * bufKeys; + +__global__ +void radixSort_alloc___export(const int n) +{ + assert(memoryPool == NULL); + numBlocks = 13*32*4; + nSharedCounts = NUMDIGITS*numBlocks; + nCountsGlobal = NUMDIGITS; + nExcScan = NUMDIGITS*numBlocks; + nCountsBlock = NUMDIGITS*numBlocks; + nPartialSum = NUMDIGITS*numBlocks; + nPrefixSum = NUMDIGITS*numBlocks; + + + const int nalloc = + nSharedCounts + + nCountsGlobal + + nExcScan + + nCountsBlock + + nPartialSum + + nPrefixSum; + + if (programIndex == 0) + memoryPool = new int[nalloc]; + + sharedCounts = memoryPool; + countsGlobal = sharedCounts + nSharedCounts; + excScan = countsGlobal + nCountsGlobal; + counts = excScan + nExcScan; + partialSum = counts + nCountsBlock; + prefixSum = partialSum + nPartialSum; +} + +extern "C" +void radixSort_alloc(const int n) +{ + radixSort_alloc___export<<<1,32>>>(n); + sync; +} + + +__device__ static +void radixSort_freeBufKeys() +{ + if (numElementsBuf > 0) + { + if (programIndex == 0) + delete bufKeys; + numElementsBuf = 0; + } +} + +__global__ void radixSort_free___export() +{ + assert(memoryPool != NULL); + if (programIndex == 0) + delete memoryPool; + memoryPool = NULL; + + radixSort_freeBufKeys(); +} +extern "C" +void radixSort_free() +{ + radixSort_free___export<<<1,32>>>(); + sync; +} + +__global__ void radixSort___export( + const int numElements, + Key keys[], + const int nBits) +{ +#ifdef __NVPTX__ + assert((numBlocks & 3) == 0); /* task granularity on Kepler is 4 */ +#endif + + if (numElementsBuf < numElements) + radixSort_freeBufKeys(); + if (numElementsBuf == 0) + { + numElementsBuf = numElements; + if (programIndex == 0) + bufKeys = new Key[numElementsBuf]; + } + + const int blkDim = (numElements + numBlocks - 1) / numBlocks; + + for ( int bit = 0; bit < nBits; bit += NUMBITS) + { + /* initialize histogram for each digit */ + for (int digit = programIndex; digit < NUMDIGITS; digit += programCount) + countsGlobal[digit] = 0; + + /* compute histogram for each digit */ + launch (numBlocks,1,1, countPass)(keys, bufKeys, bit, numElements, counts, countsGlobal); + sync; + + /* exclusive scan on global histogram */ + int carry = 0; + excScan[0] = 0; +#pragma unroll 8 + for (int digit = programIndex; digit < NUMDIGITS; digit += programCount) + { + const int value = countsGlobal[digit]; + const int scan = exclusive_scan_add(value); + excScan[digit] = scan + carry; + carry += __shfl(scan+value, programCount-1); + } + + /* computing offsets for each digit */ + radixExclusiveScan(numBlocks, excScan, counts, partialSum, prefixSum); + + /* sorting */ + launch (numBlocks,1,1, + sortPass)( + bufKeys, + keys, + bit, + numElements, + excScan); + sync; + } +} + +extern "C" +void radixSort( + const int numElements, + Key keys[], + const int nBits) +{ + cudaDeviceSetCacheConfig ( cudaFuncCachePreferEqual ); + radixSort___export<<<1,32>>>(numElements, keys, nBits); + sync; +} diff --git a/examples/portable/radixSort/radixSort.ispc b/examples/portable/radixSort/radixSort.ispc new file mode 100644 index 00000000..d2dce2ac --- /dev/null +++ b/examples/portable/radixSort/radixSort.ispc @@ -0,0 +1,301 @@ +#define NUMBITS 8 +#define NUMDIGITS (1<> bit); +#ifdef __NVPTX__ + atomic_add_global(&counts[key], 1); +#else + atomic_add_local(&counts[key], 1); +#endif + } + + foreach (digit = 0 ... NUMDIGITS) + atomic_add_global(&countsGlobal[digit], counts[digit]); +} + +task +void sortPass( + uniform Key keysAll[], + uniform Key sorted[], + uniform int bit, + uniform int numElements, + uniform int digitOffsetsAll[]) +{ + const uniform int blockIdx = taskIndex; + const uniform int numBlocks = taskCount; + + const uniform int blockDim = (numElements + numBlocks - 1) / numBlocks; + + + const uniform int keyIndex = blockIdx * blockDim; + uniform Key * uniform keys = keysAll + keyIndex; + + + const uniform int nloc = min(numElements - keyIndex, blockDim); + + const uniform int mask = (1 << NUMBITS) - 1; + + /* copy digit offset from Gmem to Lmem */ +#if 1 + uniform int digitOffsets[NUMDIGITS]; + foreach (digit = 0 ... NUMDIGITS) + digitOffsets[digit] = digitOffsetsAll[blockIdx*NUMDIGITS + digit]; +#else + uniform int * uniform digitOffsets = &digitOffsetsAll[blockIdx*NUMDIGITS]; +#endif + + foreach (i = 0 ... nloc) + { + const int key = mask & ((unsigned int)keys[i] >> bit); + int scatter; + /* not a vector friendly loop */ + foreach_active(iv) + scatter = digitOffsets[key]++; + sorted[scatter] = keys[i]; + } +} + +task +void partialScanLocal( + uniform int numBlocks, + uniform int excScanAll[], + uniform int countsAll[], + uniform int partialSumAll[]) +{ + const uniform int blockIdx = taskIndex; + + const uniform int blockDim = (numBlocks+taskCount-1)/taskCount; + const uniform int bbeg = blockIdx * blockDim; + const uniform int bend = min(bbeg + blockDim, numBlocks); + + uniform int (* uniform countsBlock)[NUMDIGITS] = (uniform int (*)[NUMDIGITS])countsAll; + uniform int (* uniform excScanBlock)[NUMDIGITS] = (uniform int (*)[NUMDIGITS])excScanAll; + uniform int (* uniform partialSum)[NUMDIGITS] = (uniform int (*)[NUMDIGITS])partialSumAll; + + foreach (digit = 0 ... NUMDIGITS) + { + int prev = bbeg == 0 ? excScanBlock[0][digit] : 0; + for (uniform int block = bbeg; block < bend; block++) + { + const int y = countsBlock[block][digit]; + excScanBlock[block][digit] = prev; + prev += y; + } + partialSum[blockIdx][digit] = excScanBlock[bend-1][digit] + countsBlock[bend-1][digit]; + } +} + +task +void partialScanGlobal( + const uniform int numBlocks, + uniform int partialSumAll[], + uniform int prefixSumAll[]) +{ + uniform int (* uniform partialSum)[NUMDIGITS] = (uniform int (*)[NUMDIGITS])partialSumAll; + uniform int (* uniform prefixSum)[NUMDIGITS] = (uniform int (*)[NUMDIGITS]) prefixSumAll; + const uniform int digit = taskIndex; + int carry = 0; + foreach (block = 0 ... numBlocks) + { + const int value = partialSum[block][digit]; + const int scan = exclusive_scan_add(value); + prefixSum[block][digit] = scan + carry; + carry += broadcast(scan+value, programCount-1); + } +} + +task +void completeScanGlobal( + uniform int numBlocks, + uniform int excScanAll[], + uniform int carryValueAll[]) +{ + const uniform int blockIdx = taskIndex; + const uniform int blockDim = (numBlocks+taskCount-1)/taskCount; + const uniform int bbeg = blockIdx * blockDim; + const uniform int bend = min(bbeg + blockDim, numBlocks); + + uniform int (* uniform excScanBlock)[NUMDIGITS] = (uniform int (*)[NUMDIGITS])excScanAll; + uniform int (* uniform carryValue)[NUMDIGITS] = (uniform int (*)[NUMDIGITS])carryValueAll; + + foreach (digit = 0 ... NUMDIGITS) + { + const int carry = carryValue[blockIdx][digit]; + for (uniform int block = bbeg; block < bend; block++) + excScanBlock[block][digit] += carry; + } +} + +static +inline void radixExclusiveScan( + const uniform int numBlocks, + uniform int excScanPtr[], + uniform int countsPtr[], + uniform int partialSum[], + uniform int prefixSum[]) +{ + const uniform int scale = 8; + launch [numBlocks/scale] partialScanLocal(numBlocks, excScanPtr, countsPtr, partialSum); + sync; + + launch [NUMDIGITS] partialScanGlobal(numBlocks/scale, partialSum, prefixSum); + sync; + + launch [numBlocks/scale] completeScanGlobal(numBlocks, excScanPtr, prefixSum); + sync; +} + +static uniform int * uniform memoryPool = NULL; +static uniform int numBlocks; +static uniform int nSharedCounts; +static uniform int nCountsGlobal; +static uniform int nExcScan; +static uniform int nCountsBlock; +static uniform int nPartialSum; +static uniform int nPrefixSum; + +static uniform int * uniform sharedCounts; +static uniform int * uniform countsGlobal; +static uniform int * uniform excScan; +static uniform int * uniform counts; +static uniform int * uniform partialSum; +static uniform int * uniform prefixSum; + +static uniform int numElementsBuf = 0; +static uniform Key * uniform bufKeys; + +export void radixSort_alloc(const uniform int n) +{ + assert(memoryPool == NULL); + numBlocks = num_cores()*4; +#ifdef __NVPTX__ + numBlocks = 13*32*4; //num_cores()*4; +#endif + nSharedCounts = NUMDIGITS*numBlocks; + nCountsGlobal = NUMDIGITS; + nExcScan = NUMDIGITS*numBlocks; + nCountsBlock = NUMDIGITS*numBlocks; + nPartialSum = NUMDIGITS*numBlocks; + nPrefixSum = NUMDIGITS*numBlocks; + + + const uniform int nalloc = + nSharedCounts + + nCountsGlobal + + nExcScan + + nCountsBlock + + nPartialSum + + nPrefixSum; + + memoryPool = uniform new uniform int[nalloc]; + + sharedCounts = memoryPool; + countsGlobal = sharedCounts + nSharedCounts; + excScan = countsGlobal + nCountsGlobal; + counts = excScan + nExcScan; + partialSum = counts + nCountsBlock; + prefixSum = partialSum + nPartialSum; +} + +static +void radixSort_freeBufKeys() +{ + if (numElementsBuf > 0) + { + delete bufKeys; + numElementsBuf = 0; + } +} + +export void radixSort_free() +{ + assert(memoryPool != NULL); + delete memoryPool; + memoryPool = NULL; + + radixSort_freeBufKeys(); +} + +export void radixSort( + const uniform int numElements, + uniform Key keys[], + const uniform int nBits) +{ +#ifdef __NVPTX__ + assert((numBlocks & 3) == 0); /* task granularity on Kepler is 4 */ +#endif + + if (numElementsBuf < numElements) + radixSort_freeBufKeys(); + if (numElementsBuf == 0) + { + numElementsBuf = numElements; + bufKeys = uniform new uniform Key[numElementsBuf]; + } + + const uniform int blockDim = (numElements + numBlocks - 1) / numBlocks; + + for (uniform int bit = 0; bit < nBits; bit += NUMBITS) + { + /* initialize histogram for each digit */ + foreach (digit = 0 ... NUMDIGITS) + countsGlobal[digit] = 0; + + /* compute histogram for each digit */ + launch [numBlocks] countPass(keys, bufKeys, bit, numElements, counts, countsGlobal); + sync; + + /* exclusive scan on global histogram */ + int carry = 0; + excScan[0] = 0; + foreach (digit = 0 ... NUMDIGITS) + { + const int value = countsGlobal[digit]; + const int scan = exclusive_scan_add(value); + excScan[digit] = scan + carry; + carry += broadcast(scan+value, programCount-1); + } + + /* computing offsets for each digit */ + radixExclusiveScan(numBlocks, excScan, counts, partialSum, prefixSum); + + /* sorting */ + launch [numBlocks] + sortPass( + bufKeys, + keys, + bit, + numElements, + excScan); + sync; + } + +}