#include "cuda_helpers.cuh" #include #define NUMBITS 8 #define NUMDIGITS (1<> bit); atomic_add_global(&counts[key], 1); } for (int digit = programIndex; digit < NUMDIGITS; digit += programCount) atomic_add_global(&countsGlobal[digit], counts[digit]); } __global__ void sortPass( Key keysAll[], Key sorted[], int bit, int numElements, int digitOffsetsAll[]) { const int blkIdx = taskIndex; const int numBlocks = taskCount; const int blkDim = (numElements + numBlocks - 1) / numBlocks; const int keyIndex = blkIdx * blkDim; Key * keys = keysAll + keyIndex; const int nloc = min(numElements - keyIndex, blkDim); const int mask = (1 << NUMBITS) - 1; const int unitScan = exclusive_scan_add(1); /* copy digit offset from Gmem to Lmem */ __shared__ int digitOffsets[NUMDIGITS]; for (int digit = programIndex; digit < NUMDIGITS; digit += programCount) digitOffsets[digit] = digitOffsetsAll[blkIdx*NUMDIGITS + digit]; for (int i = programIndex; i < nloc; i += programCount) if (i < nloc) { const int key = mask & ((unsigned int)keys[i] >> bit); int scatter; /* not a vector friendly loop */ for (int lane = 0; lane < programCount; lane++) if (programIndex == lane) scatter = digitOffsets[key]++; sorted [scatter] = keys[i]; } } __global__ void partialScanLocal( int numBlocks, int excScanAll[], int countsAll[], int partialSumAll[]) { const int blkIdx = taskIndex; const int blkDim = (numBlocks+taskCount-1)/taskCount; const int bbeg = blkIdx * blkDim; const int bend = min(bbeg + blkDim, numBlocks); int (* countsBlock)[NUMDIGITS] = ( int (*)[NUMDIGITS])countsAll; int (* excScanBlock)[NUMDIGITS] = ( int (*)[NUMDIGITS])excScanAll; int (* partialSum)[NUMDIGITS] = ( int (*)[NUMDIGITS])partialSumAll; for (int digit = programIndex; digit < NUMDIGITS; digit += programCount) { int prev = bbeg == 0 ? excScanBlock[0][digit] : 0; for ( int block = bbeg; block < bend; block++) { const int y = countsBlock[block][digit]; excScanBlock[block][digit] = prev; prev += y; } partialSum[blkIdx][digit] = excScanBlock[bend-1][digit] + countsBlock[bend-1][digit]; } } __global__ void partialScanGlobal( const int numBlocks, int partialSumAll[], int prefixSumAll[]) { int (* partialSum)[NUMDIGITS] = ( int (*)[NUMDIGITS])partialSumAll; int (* prefixSum)[NUMDIGITS] = ( int (*)[NUMDIGITS]) prefixSumAll; const int digit = taskIndex; int carry = 0; for (int block = programIndex; block < numBlocks; block += programCount) if (block < numBlocks) { const int value = partialSum[block][digit]; const int scan = exclusive_scan_add(value); prefixSum[block][digit] = scan + carry; carry += __shfl(scan+value, programCount-1); } } __global__ void completeScanGlobal( int numBlocks, int excScanAll[], int carryValueAll[]) { const int blkIdx = taskIndex; const int blkDim = (numBlocks+taskCount-1)/taskCount; const int bbeg = blkIdx * blkDim; const int bend = min(bbeg + blkDim, numBlocks); int (* excScanBlock)[NUMDIGITS] = ( int (*)[NUMDIGITS])excScanAll; int (* carryValue)[NUMDIGITS] = ( int (*)[NUMDIGITS])carryValueAll; for (int digit = programIndex; digit < NUMDIGITS; digit += programCount) { const int carry = carryValue[blkIdx][digit]; for ( int block = bbeg; block < bend; block++) excScanBlock[block][digit] += carry; } } __device__ static inline void radixExclusiveScan( const int numBlocks, int excScanPtr[], int countsPtr[], int partialSum[], int prefixSum[]) { const int scale = 8; launch (numBlocks/scale, 1,1, partialScanLocal)(numBlocks, excScanPtr, countsPtr, partialSum); sync; launch (NUMDIGITS,1,1,partialScanGlobal) (numBlocks/scale, partialSum, prefixSum); sync; launch (numBlocks/scale,1,1, completeScanGlobal) (numBlocks, excScanPtr, prefixSum); sync; } __device__ static int * memoryPool = NULL; __device__ static int numBlocks; __device__ static int nSharedCounts; __device__ static int nCountsGlobal; __device__ static int nExcScan; __device__ static int nCountsBlock; __device__ static int nPartialSum; __device__ static int nPrefixSum; __device__ static int * sharedCounts; __device__ static int * countsGlobal; __device__ static int * excScan; __device__ static int * counts; __device__ static int * partialSum; __device__ static int * prefixSum; __device__ static int numElementsBuf = 0; __device__ static Key * bufKeys; __global__ void radixSort_alloc___export(const int n) { assert(memoryPool == NULL); numBlocks = 13*32*4; nSharedCounts = NUMDIGITS*numBlocks; nCountsGlobal = NUMDIGITS; nExcScan = NUMDIGITS*numBlocks; nCountsBlock = NUMDIGITS*numBlocks; nPartialSum = NUMDIGITS*numBlocks; nPrefixSum = NUMDIGITS*numBlocks; const int nalloc = nSharedCounts + nCountsGlobal + nExcScan + nCountsBlock + nPartialSum + nPrefixSum; if (programIndex == 0) memoryPool = new int[nalloc]; union {int* ptr; int val[2];} t; t.ptr = memoryPool; t.val[0] = __shfl(t.val[0], 0); t.val[1] = __shfl(t.val[1], 0); memoryPool = t.ptr; sharedCounts = memoryPool; countsGlobal = sharedCounts + nSharedCounts; excScan = countsGlobal + nCountsGlobal; counts = excScan + nExcScan; partialSum = counts + nCountsBlock; prefixSum = partialSum + nPartialSum; } extern "C" __global__ void radixSort_alloc(const int n) { radixSort_alloc___export<<<1,32>>>(n); sync; } __device__ static void radixSort_freeBufKeys() { if (numElementsBuf > 0) { delete bufKeys; numElementsBuf = 0; } } __global__ void radixSort_free___export() { assert(memoryPool != NULL); if (programIndex == 0) delete memoryPool; memoryPool = NULL; radixSort_freeBufKeys(); } extern "C" __global__ void radixSort_free() { radixSort_free___export<<<1,32>>>(); sync; } __global__ void radixSort___export( const int numElements, Key keys[], const int nBits) { #ifdef __NVPTX__ assert((numBlocks & 3) == 0); /* task granularity on Kepler is 4 */ #endif if (numElementsBuf < numElements) radixSort_freeBufKeys(); if (numElementsBuf == 0) { numElementsBuf = numElements; if (programIndex == 0) bufKeys = new Key[numElementsBuf]; union {Key* ptr; int val[2];} t; t.ptr = bufKeys; t.val[0] = __shfl(t.val[0], 0); t.val[1] = __shfl(t.val[1], 0); bufKeys = t.ptr; } const int blkDim = (numElements + numBlocks - 1) / numBlocks; for ( int bit = 0; bit < nBits; bit += NUMBITS) { /* initialize histogram for each digit */ for (int digit = programIndex; digit < NUMDIGITS; digit += programCount) countsGlobal[digit] = 0; /* compute histogram for each digit */ launch (numBlocks,1,1, countPass)(keys, bufKeys, bit, numElements, counts, countsGlobal); sync; /* exclusive scan on global histogram */ int carry = 0; excScan[0] = 0; for (int digit = programIndex; digit < NUMDIGITS; digit += programCount) { const int value = countsGlobal[digit]; const int scan = exclusive_scan_add(value); excScan[digit] = scan + carry; carry += __shfl(scan+value, programCount-1); } /* computing offsets for each digit */ radixExclusiveScan(numBlocks, excScan, counts, partialSum, prefixSum); /* sorting */ launch (numBlocks,1,1, sortPass)( bufKeys, keys, bit, numElements, excScan); sync; } } extern "C" __global__ void radixSort( const int numElements, Key keys[], const int nBits) { radixSort___export<<<1,32>>>(numElements, keys, nBits); sync; }