From 659573338c7adc3af4a2d90f14a8bbdc8a70af69 Mon Sep 17 00:00:00 2001 From: Evghenii Date: Tue, 28 Jan 2014 16:43:00 +0100 Subject: [PATCH] +1 --- examples_ptx/radixSort/radixSort.ispc | 58 ++++++++++++++++----------- 1 file changed, 34 insertions(+), 24 deletions(-) diff --git a/examples_ptx/radixSort/radixSort.ispc b/examples_ptx/radixSort/radixSort.ispc index 3522c33e..9d3b3780 100644 --- a/examples_ptx/radixSort/radixSort.ispc +++ b/examples_ptx/radixSort/radixSort.ispc @@ -19,30 +19,37 @@ void countPass( uniform int * uniform counts = countsAll + blockIdx*NUMDIGITS; const uniform int nloc = min(numElements - blockIdx*blockDim, blockDim); - foreach (digit = 0 ... NUMDIGITS) - counts[digit] = 0; -#if 1 +#if 0 + uniform int shcounts[NUMDIGITS]; +#else +#define shcounts counts +#endif + + foreach (digit = 0 ... NUMDIGITS) + shcounts[digit] = 0; + foreach (i = 0 ... nloc) { const int key = mask & ((unsigned int)keys[i] >> bit); uniform int skey; if (reduce_equal(key, &skey) == true) - counts[skey] += reduce_add(1); + shcounts[skey] += reduce_add(1); else { #ifdef __NVPTX__ - atomic_add_global(&counts[key], 1); + atomic_add_global(&shcounts[key], 1); #else - atomic_add_local(&counts[key], 1); + atomic_add_local(&shcounts[key], 1); #endif } } -#else -#endif foreach (digit = 0 ... NUMDIGITS) - atomic_add_global(&countsGlobal[digit], counts[digit]); + { + counts[digit] = shcounts[digit]; + atomic_add_global(&countsGlobal[digit], shcounts[digit]); + } } task @@ -51,22 +58,23 @@ void sortPass( uniform int sorted[], uniform int bit, uniform int numElements, - uniform int digitOffsetsAll[], - uniform int sharedCounts[]) + uniform int digitOffsetsAll[]) { const uniform int blockIdx = taskIndex; const uniform int numBlocks = taskCount; const uniform int blockDim = (numElements + numBlocks - 1) / numBlocks; - - uniform int * uniform localCounts = sharedCounts + blockIdx*NUMDIGITS; + uniform int localCounts[NUMDIGITS]; const uniform int keyIndex = blockIdx * blockDim; uniform int * uniform keys = keysAll + keyIndex; uniform int * uniform digitOffsets = digitOffsetsAll + blockIdx*NUMDIGITS; const uniform int nloc = min(numElements - keyIndex, blockDim); + const int unitScan = exclusive_scan_add(1); + + const uniform int mask = (1 << NUMBITS) - 1; foreach (i = 0 ... NUMDIGITS) localCounts[i] = 0; @@ -75,10 +83,18 @@ void sortPass( { const int key = mask & ((unsigned int)keys[i] >> bit); int rel; - foreach_active(iv) + if (reduce_equal(key) == true) { - rel = localCounts[key]; - localCounts[key]++; + rel = localCounts[key] + unitScan; + localCounts[key] = rel+1; + } + else + { +#ifdef __NVPTX__ + rel = atomic_add_global(&localCounts[key],1); +#else + rel = atomic_add_local(&localCounts[key],1); +#endif } const int scatter = rel + digitOffsets[key]; sorted [scatter] = keys[i]; @@ -177,14 +193,12 @@ inline void radixExclusiveScan( static uniform int * uniform memoryPool = NULL; static uniform int numBlocks; -static uniform int nSharedCounts; static uniform int nCountsGlobal; static uniform int nExcScan; static uniform int nCountsBlock; static uniform int nPartialSum; static uniform int nPrefixSum; -static uniform int * uniform sharedCounts; static uniform int * uniform countsGlobal; static uniform int * uniform excScan; static uniform int * uniform counts; @@ -198,7 +212,6 @@ export void radixSort_alloc(const uniform int n) { assert(memoryPool == NULL); numBlocks = num_cores()*4; - nSharedCounts = NUMDIGITS*numBlocks; nCountsGlobal = NUMDIGITS; nExcScan = NUMDIGITS*numBlocks; nCountsBlock = NUMDIGITS*numBlocks; @@ -207,7 +220,6 @@ export void radixSort_alloc(const uniform int n) const uniform int nalloc = - nSharedCounts + nCountsGlobal + nExcScan + nCountsBlock + @@ -216,8 +228,7 @@ export void radixSort_alloc(const uniform int n) memoryPool = uniform new uniform int[nalloc]; - sharedCounts = memoryPool; - countsGlobal = sharedCounts + nSharedCounts; + countsGlobal = memoryPool; excScan = countsGlobal + nCountsGlobal; counts = excScan + nExcScan; partialSum = counts + nCountsBlock; @@ -292,8 +303,7 @@ export void radixSort( bufKeys, bit, numElements, - excScan, - sharedCounts); + excScan); sync; uniform int * uniform tmp = keys;