diff --git a/examples_ptx/radixSort/radixSort.cu b/examples_ptx/radixSort/radixSort.cu index 7748b5e2..5a56602a 100644 --- a/examples_ptx/radixSort/radixSort.cu +++ b/examples_ptx/radixSort/radixSort.cu @@ -6,7 +6,7 @@ typedef long long Key; -__device__ int atomic_add_global(int* ptr, int value) +__forceinline__ __device__ int atomic_add_global(int* ptr, int value) { return atomicAdd(ptr, value); } @@ -24,13 +24,13 @@ static __device__ __forceinline__ int shfl_scan_add_step(int partial, int up_off return result; } -__device__ int exclusive_scan_add(int value) +__forceinline__ __device__ int exclusive_scan_add(int value) { int mysum = value; #pragma unroll for(int i = 0; i < 5; ++i) mysum = shfl_scan_add_step(mysum, 1 << i); - return mysum; + return mysum - value; } __global__ @@ -53,6 +53,7 @@ void countPass( int * counts = countsAll + blkIdx*NUMDIGITS; const int nloc = min(numElements - blkIdx*blkDim, blkDim); +#pragma unroll 8 for (int digit = programIndex; digit < NUMDIGITS; digit += programCount) counts[digit] = 0; @@ -64,6 +65,7 @@ void countPass( atomic_add_global(&counts[key], 1); } +#pragma unroll 8 for (int digit = programIndex; digit < NUMDIGITS; digit += programCount) atomic_add_global(&countsGlobal[digit], counts[digit]); } @@ -90,12 +92,15 @@ void sortPass( const int mask = (1 << NUMBITS) - 1; - const int unitScan = exclusive_scan_add(1); - /* copy digit offset from Gmem to Lmem */ - __shared__ int digitOffsets[NUMDIGITS]; +#if 1 + __shared__ int digitOffsets_sh[NUMDIGITS*4]; + int *digitOffsets = digitOffsets_sh + warpIdx*NUMDIGITS; for (int digit = programIndex; digit < NUMDIGITS; digit += programCount) digitOffsets[digit] = digitOffsetsAll[blkIdx*NUMDIGITS + digit]; +#else + int *digitOffsets = &digitOffsetsAll[blkIdx*NUMDIGITS]; +#endif for (int i = programIndex; i < nloc; i += programCount) @@ -104,8 +109,8 @@ void sortPass( const int key = mask & ((unsigned int)keys[i] >> bit); int scatter; /* not a vector friendly loop */ - for (int lane = 0; lane < programCount; lane++) - if (programIndex == lane) + for (int iv = 0; iv < programCount; iv++) + if (programIndex == iv) scatter = digitOffsets[key]++; sorted [scatter] = keys[i]; } @@ -128,6 +133,7 @@ void partialScanLocal( int (* excScanBlock)[NUMDIGITS] = ( int (*)[NUMDIGITS])excScanAll; int (* partialSum)[NUMDIGITS] = ( int (*)[NUMDIGITS])partialSumAll; +#pragma unroll 8 for (int digit = programIndex; digit < NUMDIGITS; digit += programCount) { int prev = bbeg == 0 ? excScanBlock[0][digit] : 0; @@ -152,13 +158,13 @@ void partialScanGlobal( const int digit = taskIndex; int carry = 0; for (int block = programIndex; block < numBlocks; block += programCount) + { + const int value = partialSum[block][digit]; + const int scan = exclusive_scan_add(value); if (block < numBlocks) - { - const int value = partialSum[block][digit]; - const int scan = exclusive_scan_add(value); prefixSum[block][digit] = scan + carry; - carry += __shfl(scan+value, programCount-1); - } + carry += __shfl(scan+value, programCount-1); + } } __global__ @@ -175,6 +181,7 @@ void completeScanGlobal( int (* excScanBlock)[NUMDIGITS] = ( int (*)[NUMDIGITS])excScanAll; int (* carryValue)[NUMDIGITS] = ( int (*)[NUMDIGITS])carryValueAll; +#pragma unroll 8 for (int digit = programIndex; digit < NUMDIGITS; digit += programCount) { const int carry = carryValue[blkIdx][digit]; @@ -245,15 +252,6 @@ void radixSort_alloc___export(const int n) if (programIndex == 0) memoryPool = new int[nalloc]; - union {int* ptr; int val[2];} t; - t.ptr = memoryPool; - t.val[0] = __shfl(t.val[0], 0); - t.val[1] = __shfl(t.val[1], 0); - memoryPool = t.ptr; - - - - sharedCounts = memoryPool; countsGlobal = sharedCounts + nSharedCounts; excScan = countsGlobal + nCountsGlobal; @@ -262,7 +260,7 @@ void radixSort_alloc___export(const int n) prefixSum = partialSum + nPartialSum; } -extern "C" __global__ +extern "C" void radixSort_alloc(const int n) { radixSort_alloc___export<<<1,32>>>(n); @@ -275,7 +273,8 @@ void radixSort_freeBufKeys() { if (numElementsBuf > 0) { - delete bufKeys; + if (programIndex == 0) + delete bufKeys; numElementsBuf = 0; } } @@ -289,7 +288,7 @@ __global__ void radixSort_free___export() radixSort_freeBufKeys(); } -extern "C" __global__ +extern "C" void radixSort_free() { radixSort_free___export<<<1,32>>>(); @@ -312,13 +311,6 @@ __global__ void radixSort___export( numElementsBuf = numElements; if (programIndex == 0) bufKeys = new Key[numElementsBuf]; - union {Key* ptr; int val[2];} t; - t.ptr = bufKeys; - t.val[0] = __shfl(t.val[0], 0); - t.val[1] = __shfl(t.val[1], 0); - bufKeys = t.ptr; - - } const int blkDim = (numElements + numBlocks - 1) / numBlocks; @@ -336,6 +328,7 @@ __global__ void radixSort___export( /* exclusive scan on global histogram */ int carry = 0; excScan[0] = 0; +#pragma unroll 8 for (int digit = programIndex; digit < NUMDIGITS; digit += programCount) { const int value = countsGlobal[digit]; @@ -357,14 +350,15 @@ __global__ void radixSort___export( excScan); sync; } - } -extern "C" __global__ + +extern "C" void radixSort( const int numElements, Key keys[], const int nBits) { + cudaDeviceSetCacheConfig ( cudaFuncCachePreferEqual ); radixSort___export<<<1,32>>>(numElements, keys, nBits); sync; } diff --git a/examples_ptx/radixSort/radixSort.ispc b/examples_ptx/radixSort/radixSort.ispc index 66e2defa..2abd11af 100644 --- a/examples_ptx/radixSort/radixSort.ispc +++ b/examples_ptx/radixSort/radixSort.ispc @@ -63,14 +63,11 @@ void sortPass( const uniform int mask = (1 << NUMBITS) - 1; - const int unitScan = exclusive_scan_add(1); - - /* copy digit offset from Gmem to Lmem */ + /* copy digit offset from Gmem to Lmem */ uniform int digitOffsets[NUMDIGITS]; foreach (digit = 0 ... NUMDIGITS) digitOffsets[digit] = digitOffsetsAll[blockIdx*NUMDIGITS + digit]; - foreach (i = 0 ... nloc) { const int key = mask & ((unsigned int)keys[i] >> bit); @@ -78,7 +75,7 @@ void sortPass( /* not a vector friendly loop */ foreach_active(iv) scatter = digitOffsets[key]++; - sorted [scatter] = keys[i]; + sorted[scatter] = keys[i]; } } @@ -237,7 +234,7 @@ export void radixSort_free() delete memoryPool; memoryPool = NULL; - radixSort_freeBufKeys; + radixSort_freeBufKeys(); } export void radixSort(