#include "cuda_helpers.cuh" #include #define NUMBITS 8 #define NUMDIGITS (1<> bit); atomic_add_global(&counts[key], 1); } #pragma unroll 8 for (int digit = programIndex; digit < NUMDIGITS; digit += programCount) atomic_add_global(&countsGlobal[digit], counts[digit]); } __global__ void sortPass( Key keysAll[], Key sorted[], int bit, int numElements, int digitOffsetsAll[]) { const int blkIdx = taskIndex; const int numBlocks = taskCount; const int blkDim = (numElements + numBlocks - 1) / numBlocks; const int keyIndex = blkIdx * blkDim; Key * keys = keysAll + keyIndex; const int nloc = min(numElements - keyIndex, blkDim); const int mask = (1 << NUMBITS) - 1; /* copy digit offset from Gmem to Lmem */ #if 1 __shared__ int digitOffsets_sh[NUMDIGITS*4]; volatile int *digitOffsets = digitOffsets_sh + warpIdx*NUMDIGITS; for (int digit = programIndex; digit < NUMDIGITS; digit += programCount) digitOffsets[digit] = digitOffsetsAll[blkIdx*NUMDIGITS + digit]; #else int *digitOffsets = &digitOffsetsAll[blkIdx*NUMDIGITS]; #endif for (int i = programIndex; i < nloc; i += programCount) if (i < nloc) { const int key = mask & ((unsigned int)keys[i] >> bit); int scatter; /* not a vector friendly loop */ #pragma unroll 1 /* needed, otherwise compiler unroll and optimizes the result :S */ for (int iv = 0; iv < programCount; iv++) if (programIndex == iv) scatter = digitOffsets[key]++; sorted [scatter] = keys[i]; } } __global__ void partialScanLocal( int numBlocks, int excScanAll[], int countsAll[], int partialSumAll[]) { const int blkIdx = taskIndex; const int blkDim = (numBlocks+taskCount-1)/taskCount; const int bbeg = blkIdx * blkDim; const int bend = min(bbeg + blkDim, numBlocks); int (* countsBlock)[NUMDIGITS] = ( int (*)[NUMDIGITS])countsAll; int (* excScanBlock)[NUMDIGITS] = ( int (*)[NUMDIGITS])excScanAll; int (* partialSum)[NUMDIGITS] = ( int (*)[NUMDIGITS])partialSumAll; #pragma unroll 8 for (int digit = programIndex; digit < NUMDIGITS; digit += programCount) { int prev = bbeg == 0 ? 
__global__ void partialScanLocal(
    int numBlocks,
    int excScanAll[],
    int countsAll[],
    int partialSumAll[])
{
  const int blkIdx = taskIndex;
  const int blkDim = (numBlocks + taskCount - 1) / taskCount;
  const int bbeg   = blkIdx * blkDim;
  const int bend   = min(bbeg + blkDim, numBlocks);

  int (* countsBlock)[NUMDIGITS] = ( int (*)[NUMDIGITS])countsAll;
  int (*excScanBlock)[NUMDIGITS] = ( int (*)[NUMDIGITS])excScanAll;
  int (*  partialSum)[NUMDIGITS] = ( int (*)[NUMDIGITS])partialSumAll;

#pragma unroll 8
  for (int digit = programIndex; digit < NUMDIGITS; digit += programCount)
  {
    /* the first chunk starts from the global digit base already stored in
       excScanBlock[0][digit]; later chunks get their carry added afterwards */
    int prev = bbeg == 0 ? excScanBlock[0][digit] : 0;
    for (int block = bbeg; block < bend; block++)
    {
      const int y = countsBlock[block][digit];
      excScanBlock[block][digit] = prev;
      prev += y;
    }
    partialSum[blkIdx][digit] =
      excScanBlock[bend-1][digit] + countsBlock[bend-1][digit];
  }
}

__global__ void partialScanGlobal(
    const int numBlocks,
    int partialSumAll[],
    int prefixSumAll[])
{
  int (*partialSum)[NUMDIGITS] = ( int (*)[NUMDIGITS])partialSumAll;
  int (* prefixSum)[NUMDIGITS] = ( int (*)[NUMDIGITS])prefixSumAll;

  const int digit = taskIndex;
  int carry = 0;
  for (int block = programIndex; block < numBlocks; block += programCount)
  {
    const int value = partialSum[block][digit];
    const int scan  = exclusive_scan_add(value);
    if (block < numBlocks)
      prefixSum[block][digit] = scan + carry;
    carry += __shfl(scan + value, programCount - 1);
  }
}

__global__ void completeScanGlobal(
    int numBlocks,
    int excScanAll[],
    int carryValueAll[])
{
  const int blkIdx = taskIndex;
  const int blkDim = (numBlocks + taskCount - 1) / taskCount;
  const int bbeg   = blkIdx * blkDim;
  const int bend   = min(bbeg + blkDim, numBlocks);

  int (*excScanBlock)[NUMDIGITS] = ( int (*)[NUMDIGITS])excScanAll;
  int (*  carryValue)[NUMDIGITS] = ( int (*)[NUMDIGITS])carryValueAll;

#pragma unroll 8
  for (int digit = programIndex; digit < NUMDIGITS; digit += programCount)
  {
    const int carry = carryValue[blkIdx][digit];
    for (int block = bbeg; block < bend; block++)
      excScanBlock[block][digit] += carry;
  }
}

__device__ static inline void radixExclusiveScan(
    const int numBlocks,
    int excScanPtr[],
    int countsPtr[],
    int partialSum[],
    int prefixSum[])
{
  const int scale = 8;

  launch (numBlocks/scale, 1,1, partialScanLocal)
    (numBlocks, excScanPtr, countsPtr, partialSum);
  sync;

  launch (NUMDIGITS, 1,1, partialScanGlobal)
    (numBlocks/scale, partialSum, prefixSum);
  sync;

  launch (numBlocks/scale, 1,1, completeScanGlobal)
    (numBlocks, excScanPtr, prefixSum);
  sync;
}

__device__ static int *memoryPool = NULL;

__device__ static int numBlocks;
__device__ static int nSharedCounts;
__device__ static int nCountsGlobal;
__device__ static int nExcScan;
__device__ static int nCountsBlock;
__device__ static int nPartialSum;
__device__ static int nPrefixSum;

__device__ static int *sharedCounts;
__device__ static int *countsGlobal;
__device__ static int *excScan;
__device__ static int *counts;
__device__ static int *partialSum;
__device__ static int *prefixSum;

__device__ static int  numElementsBuf = 0;
__device__ static Key *bufKeys;

__global__ void radixSort_alloc___export(const int n)
{
  assert(memoryPool == NULL);

  numBlocks     = 13*32*4;
  nSharedCounts = NUMDIGITS*numBlocks;
  nCountsGlobal = NUMDIGITS;
  nExcScan      = NUMDIGITS*numBlocks;
  nCountsBlock  = NUMDIGITS*numBlocks;
  nPartialSum   = NUMDIGITS*numBlocks;
  nPrefixSum    = NUMDIGITS*numBlocks;

  const int nalloc =
    nSharedCounts + nCountsGlobal + nExcScan +
    nCountsBlock  + nPartialSum   + nPrefixSum;

  /* carve all scratch arrays out of a single device-side allocation */
  if (programIndex == 0)
    memoryPool = new int[nalloc];

  sharedCounts = memoryPool;
  countsGlobal = sharedCounts + nSharedCounts;
  excScan      = countsGlobal + nCountsGlobal;
  counts       = excScan      + nExcScan;
  partialSum   = counts       + nCountsBlock;
  prefixSum    = partialSum   + nPartialSum;
}
extern "C" void radixSort_alloc(const int n)
{
  radixSort_alloc___export<<<1,32>>>(n);
  sync;
}

__device__ static void radixSort_freeBufKeys()
{
  if (numElementsBuf > 0)
  {
    if (programIndex == 0)
      delete [] bufKeys;
    numElementsBuf = 0;
  }
}

__global__ void radixSort_free___export()
{
  assert(memoryPool != NULL);
  if (programIndex == 0)
    delete [] memoryPool;
  memoryPool = NULL;
  radixSort_freeBufKeys();
}
extern "C" void radixSort_free()
{
  radixSort_free___export<<<1,32>>>();
  sync;
}
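/* One sorting pass, illustrated on a toy example (a sketch for clarity only,
 * using NUMBITS = 2, i.e. 4 digit values, and 2 blocks of 4 keys):
 *
 *   keys                      = [3 1 0 2 | 2 1 1 0]
 *   per-block digit counts      block0: {0:1, 1:1, 2:1, 3:1}
 *                               block1: {0:1, 1:2, 2:1, 3:0}
 *   exclusive scan of totals  = {0:0, 1:2, 2:5, 3:7}
 *   per-block scatter offsets   block0: {0:0, 1:2, 2:5, 3:7}
 *                               block1: {0:1, 1:3, 2:6, 3:8}
 *   scatter                     block0 writes its keys to 7, 2, 0, 5
 *                               block1 writes its keys to 6, 3, 4, 1
 *   sorted keys               = [0 0 1 1 1 2 2 3]
 *
 * radixSort___export below repeats this pass for every NUMBITS-wide digit,
 * from the least significant upwards, staging the keys through the bufKeys
 * scratch buffer on each pass.
 */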
__global__ void radixSort___export(
    const int numElements,
    Key keys[],
    const int nBits)
{
#ifdef __NVPTX__
  assert((numBlocks & 3) == 0);  /* task granularity on Kepler is 4 */
#endif

  /* (re)allocate the key staging buffer if it is missing or too small */
  if (numElementsBuf < numElements)
    radixSort_freeBufKeys();
  if (numElementsBuf == 0)
  {
    numElementsBuf = numElements;
    if (programIndex == 0)
      bufKeys = new Key[numElementsBuf];
  }

  const int blkDim = (numElements + numBlocks - 1) / numBlocks;

  for (int bit = 0; bit < nBits; bit += NUMBITS)
  {
    /* initialize the histogram for each digit */
    for (int digit = programIndex; digit < NUMDIGITS; digit += programCount)
      countsGlobal[digit] = 0;

    /* compute the histogram for each digit */
    launch (numBlocks,1,1, countPass)
      (keys, bufKeys, bit, numElements, counts, countsGlobal);
    sync;

    /* exclusive scan on the global histogram */
    int carry = 0;
    excScan[0] = 0;
#pragma unroll 8
    for (int digit = programIndex; digit < NUMDIGITS; digit += programCount)
    {
      const int value = countsGlobal[digit];
      const int scan  = exclusive_scan_add(value);
      excScan[digit] = scan + carry;
      carry += __shfl(scan + value, programCount - 1);
    }

    /* compute scatter offsets for each digit of each block */
    radixExclusiveScan(numBlocks, excScan, counts, partialSum, prefixSum);

    /* sorting: scatter the keys for this digit */
    launch (numBlocks,1,1, sortPass)
      (bufKeys, keys, bit, numElements, excScan);
    sync;
  }
}
extern "C" void radixSort(
    const int numElements,
    Key keys[],
    const int nBits)
{
  cudaDeviceSetCacheConfig(cudaFuncCachePreferEqual);
  radixSort___export<<<1,32>>>(numElements, keys, nBits);
  sync;
}
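/* Minimal host-side usage sketch (illustration only; assumes Key is a 32-bit
 * unsigned key type, a GPU/toolchain that supports the device-side kernel
 * launches used above, and unified (managed) memory for the key array):
 *
 *   #include <cstdio>
 *   #include <cstdlib>
 *
 *   int main()
 *   {
 *     const int n = 1 << 20;
 *     Key *keys;
 *     cudaMallocManaged(&keys, n * sizeof(Key));
 *     for (int i = 0; i < n; i++)
 *       keys[i] = (Key)rand();
 *
 *     radixSort_alloc(n);      // allocate device-side scratch storage
 *     radixSort(n, keys, 32);  // 32-bit keys -> 32/NUMBITS = 4 passes
 *     radixSort_free();        // release scratch + key staging buffer
 *
 *     for (int i = 1; i < n; i++)      // verify non-decreasing order
 *       if (keys[i-1] > keys[i]) { printf("FAILED at %d\n", i); return 1; }
 *     printf("sorted %d keys\n", n);
 *
 *     cudaFree(keys);
 *     return 0;
 *   }
 */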