diff --git a/examples_ptx/radixSort/radixSort.cu b/examples_ptx/radixSort/radixSort.cu new file mode 100644 index 00000000..7748b5e2 --- /dev/null +++ b/examples_ptx/radixSort/radixSort.cu @@ -0,0 +1,370 @@ +#include "cuda_helpers.cuh" +#include + +#define NUMBITS 8 +#define NUMDIGITS (1<> bit); + atomic_add_global(&counts[key], 1); + } + + for (int digit = programIndex; digit < NUMDIGITS; digit += programCount) + atomic_add_global(&countsGlobal[digit], counts[digit]); +} + +__global__ +void sortPass( + Key keysAll[], + Key sorted[], + int bit, + int numElements, + int digitOffsetsAll[]) +{ + const int blkIdx = taskIndex; + const int numBlocks = taskCount; + + const int blkDim = (numElements + numBlocks - 1) / numBlocks; + + + const int keyIndex = blkIdx * blkDim; + Key * keys = keysAll + keyIndex; + + + const int nloc = min(numElements - keyIndex, blkDim); + + const int mask = (1 << NUMBITS) - 1; + + const int unitScan = exclusive_scan_add(1); + + /* copy digit offset from Gmem to Lmem */ + __shared__ int digitOffsets[NUMDIGITS]; + for (int digit = programIndex; digit < NUMDIGITS; digit += programCount) + digitOffsets[digit] = digitOffsetsAll[blkIdx*NUMDIGITS + digit]; + + + for (int i = programIndex; i < nloc; i += programCount) + if (i < nloc) + { + const int key = mask & ((unsigned int)keys[i] >> bit); + int scatter; + /* not a vector friendly loop */ + for (int lane = 0; lane < programCount; lane++) + if (programIndex == lane) + scatter = digitOffsets[key]++; + sorted [scatter] = keys[i]; + } +} + +__global__ +void partialScanLocal( + int numBlocks, + int excScanAll[], + int countsAll[], + int partialSumAll[]) +{ + const int blkIdx = taskIndex; + + const int blkDim = (numBlocks+taskCount-1)/taskCount; + const int bbeg = blkIdx * blkDim; + const int bend = min(bbeg + blkDim, numBlocks); + + int (* countsBlock)[NUMDIGITS] = ( int (*)[NUMDIGITS])countsAll; + int (* excScanBlock)[NUMDIGITS] = ( int (*)[NUMDIGITS])excScanAll; + int (* partialSum)[NUMDIGITS] = ( int (*)[NUMDIGITS])partialSumAll; + + for (int digit = programIndex; digit < NUMDIGITS; digit += programCount) + { + int prev = bbeg == 0 ? excScanBlock[0][digit] : 0; + for ( int block = bbeg; block < bend; block++) + { + const int y = countsBlock[block][digit]; + excScanBlock[block][digit] = prev; + prev += y; + } + partialSum[blkIdx][digit] = excScanBlock[bend-1][digit] + countsBlock[bend-1][digit]; + } +} + +__global__ +void partialScanGlobal( + const int numBlocks, + int partialSumAll[], + int prefixSumAll[]) +{ + int (* partialSum)[NUMDIGITS] = ( int (*)[NUMDIGITS])partialSumAll; + int (* prefixSum)[NUMDIGITS] = ( int (*)[NUMDIGITS]) prefixSumAll; + const int digit = taskIndex; + int carry = 0; + for (int block = programIndex; block < numBlocks; block += programCount) + if (block < numBlocks) + { + const int value = partialSum[block][digit]; + const int scan = exclusive_scan_add(value); + prefixSum[block][digit] = scan + carry; + carry += __shfl(scan+value, programCount-1); + } +} + +__global__ +void completeScanGlobal( + int numBlocks, + int excScanAll[], + int carryValueAll[]) +{ + const int blkIdx = taskIndex; + const int blkDim = (numBlocks+taskCount-1)/taskCount; + const int bbeg = blkIdx * blkDim; + const int bend = min(bbeg + blkDim, numBlocks); + + int (* excScanBlock)[NUMDIGITS] = ( int (*)[NUMDIGITS])excScanAll; + int (* carryValue)[NUMDIGITS] = ( int (*)[NUMDIGITS])carryValueAll; + + for (int digit = programIndex; digit < NUMDIGITS; digit += programCount) + { + const int carry = carryValue[blkIdx][digit]; + for ( int block = bbeg; block < bend; block++) + excScanBlock[block][digit] += carry; + } +} + +__device__ static +inline void radixExclusiveScan( + const int numBlocks, + int excScanPtr[], + int countsPtr[], + int partialSum[], + int prefixSum[]) +{ + const int scale = 8; + launch (numBlocks/scale, 1,1, partialScanLocal)(numBlocks, excScanPtr, countsPtr, partialSum); + sync; + + launch (NUMDIGITS,1,1,partialScanGlobal) (numBlocks/scale, partialSum, prefixSum); + sync; + + launch (numBlocks/scale,1,1, completeScanGlobal) (numBlocks, excScanPtr, prefixSum); + sync; +} + +__device__ static int * memoryPool = NULL; +__device__ static int numBlocks; +__device__ static int nSharedCounts; +__device__ static int nCountsGlobal; +__device__ static int nExcScan; +__device__ static int nCountsBlock; +__device__ static int nPartialSum; +__device__ static int nPrefixSum; + +__device__ static int * sharedCounts; +__device__ static int * countsGlobal; +__device__ static int * excScan; +__device__ static int * counts; +__device__ static int * partialSum; +__device__ static int * prefixSum; + +__device__ static int numElementsBuf = 0; +__device__ static Key * bufKeys; + +__global__ +void radixSort_alloc___export(const int n) +{ + assert(memoryPool == NULL); + numBlocks = 13*32*4; + nSharedCounts = NUMDIGITS*numBlocks; + nCountsGlobal = NUMDIGITS; + nExcScan = NUMDIGITS*numBlocks; + nCountsBlock = NUMDIGITS*numBlocks; + nPartialSum = NUMDIGITS*numBlocks; + nPrefixSum = NUMDIGITS*numBlocks; + + + const int nalloc = + nSharedCounts + + nCountsGlobal + + nExcScan + + nCountsBlock + + nPartialSum + + nPrefixSum; + + if (programIndex == 0) + memoryPool = new int[nalloc]; + + union {int* ptr; int val[2];} t; + t.ptr = memoryPool; + t.val[0] = __shfl(t.val[0], 0); + t.val[1] = __shfl(t.val[1], 0); + memoryPool = t.ptr; + + + + + sharedCounts = memoryPool; + countsGlobal = sharedCounts + nSharedCounts; + excScan = countsGlobal + nCountsGlobal; + counts = excScan + nExcScan; + partialSum = counts + nCountsBlock; + prefixSum = partialSum + nPartialSum; +} + +extern "C" __global__ +void radixSort_alloc(const int n) +{ + radixSort_alloc___export<<<1,32>>>(n); + sync; +} + + +__device__ static +void radixSort_freeBufKeys() +{ + if (numElementsBuf > 0) + { + delete bufKeys; + numElementsBuf = 0; + } +} + +__global__ void radixSort_free___export() +{ + assert(memoryPool != NULL); + if (programIndex == 0) + delete memoryPool; + memoryPool = NULL; + + radixSort_freeBufKeys(); +} +extern "C" __global__ +void radixSort_free() +{ + radixSort_free___export<<<1,32>>>(); + sync; +} + +__global__ void radixSort___export( + const int numElements, + Key keys[], + const int nBits) +{ +#ifdef __NVPTX__ + assert((numBlocks & 3) == 0); /* task granularity on Kepler is 4 */ +#endif + + if (numElementsBuf < numElements) + radixSort_freeBufKeys(); + if (numElementsBuf == 0) + { + numElementsBuf = numElements; + if (programIndex == 0) + bufKeys = new Key[numElementsBuf]; + union {Key* ptr; int val[2];} t; + t.ptr = bufKeys; + t.val[0] = __shfl(t.val[0], 0); + t.val[1] = __shfl(t.val[1], 0); + bufKeys = t.ptr; + + + } + + const int blkDim = (numElements + numBlocks - 1) / numBlocks; + + for ( int bit = 0; bit < nBits; bit += NUMBITS) + { + /* initialize histogram for each digit */ + for (int digit = programIndex; digit < NUMDIGITS; digit += programCount) + countsGlobal[digit] = 0; + + /* compute histogram for each digit */ + launch (numBlocks,1,1, countPass)(keys, bufKeys, bit, numElements, counts, countsGlobal); + sync; + + /* exclusive scan on global histogram */ + int carry = 0; + excScan[0] = 0; + for (int digit = programIndex; digit < NUMDIGITS; digit += programCount) + { + const int value = countsGlobal[digit]; + const int scan = exclusive_scan_add(value); + excScan[digit] = scan + carry; + carry += __shfl(scan+value, programCount-1); + } + + /* computing offsets for each digit */ + radixExclusiveScan(numBlocks, excScan, counts, partialSum, prefixSum); + + /* sorting */ + launch (numBlocks,1,1, + sortPass)( + bufKeys, + keys, + bit, + numElements, + excScan); + sync; + } + +} +extern "C" __global__ +void radixSort( + const int numElements, + Key keys[], + const int nBits) +{ + radixSort___export<<<1,32>>>(numElements, keys, nBits); + sync; +}