This commit is contained in:
Evghenii
2014-01-29 11:32:53 +01:00
parent 922018ac2d
commit 2634ae65fd

View File

@@ -1,4 +1,4 @@
#define NUMBITS 8 #define NUMBITS 4
#define NUMDIGITS (1<<NUMBITS) #define NUMDIGITS (1<<NUMBITS)
typedef int64 Key; typedef int64 Key;
@@ -26,26 +26,16 @@ void countPass(
foreach (digit = 0 ... NUMDIGITS) foreach (digit = 0 ... NUMDIGITS)
counts[digit] = 0; counts[digit] = 0;
#if 1
foreach (i = 0 ... nloc) foreach (i = 0 ... nloc)
{ {
sorted[i] = keys[i]; sorted[i] = keys[i];
const int key = mask & ((unsigned int)keys[i] >> bit); const int key = mask & ((unsigned int)keys[i] >> bit);
uniform int skey;
if (reduce_equal(key, &skey) == true)
counts[skey] += reduce_add(1);
else
{
#ifdef __NVPTX__ #ifdef __NVPTX__
atomic_add_global(&counts[key], 1); atomic_add_global(&counts[key], 1);
#else #else
atomic_add_local(&counts[key], 1); atomic_add_local(&counts[key], 1);
#endif #endif
}
} }
#else
#endif
foreach (digit = 0 ... NUMDIGITS) foreach (digit = 0 ... NUMDIGITS)
atomic_add_global(&countsGlobal[digit], counts[digit]); atomic_add_global(&countsGlobal[digit], counts[digit]);
@@ -74,6 +64,8 @@ void sortPass(
const uniform int mask = (1 << NUMBITS) - 1; const uniform int mask = (1 << NUMBITS) - 1;
const int unitScan = exclusive_scan_add(1); const int unitScan = exclusive_scan_add(1);
int lkeys[NUMDIGITS] = {0};
/* copy digit offset from Gmem to Lmem */ /* copy digit offset from Gmem to Lmem */
uniform int digitOffsets[NUMDIGITS]; uniform int digitOffsets[NUMDIGITS];
@@ -85,26 +77,8 @@ void sortPass(
{ {
const int key = mask & ((unsigned int)keys[i] >> bit); const int key = mask & ((unsigned int)keys[i] >> bit);
int scatter; int scatter;
#if 0
/* serialize execution to test correctness of the algorithm */
foreach_active(iv) foreach_active(iv)
scattter = digitOffsets[key]++; scatter = digitOffsets[key]++;
#else
if (reduce_equal(key) == true)
digitOffsets[key] = (scatter = digitOffsets[key] + unitScan) + 1;
else
{
#ifdef __NVPTX__
/* There is a bug here, and it is not clear exactly where (perhaps due to optimizations).
* This complex code restored correctness. */
/* :S */
if (programIndex < 16) {if (key < NUMDIGITS) scatter = atomic_add_global(&digitOffsets[key],1);}
else {if (key < NUMDIGITS) scatter = atomic_add_global(&digitOffsets[key],1);}
#else
scatter = atomic_add_local(&digitOffsets[key],1);
#endif
}
#endif
sorted [scatter] = keys[i]; sorted [scatter] = keys[i];
} }
} }
@@ -126,6 +100,7 @@ void partialScanLocal(
uniform int (* uniform excScanBlock)[NUMDIGITS] = (uniform int (*)[NUMDIGITS])excScanAll; uniform int (* uniform excScanBlock)[NUMDIGITS] = (uniform int (*)[NUMDIGITS])excScanAll;
uniform int (* uniform partialSum)[NUMDIGITS] = (uniform int (*)[NUMDIGITS])partialSumAll; uniform int (* uniform partialSum)[NUMDIGITS] = (uniform int (*)[NUMDIGITS])partialSumAll;
#if 0
foreach (digit = 0 ... NUMDIGITS) foreach (digit = 0 ... NUMDIGITS)
{ {
int prev = bbeg == 0 ? excScanBlock[0][digit] : 0; int prev = bbeg == 0 ? excScanBlock[0][digit] : 0;
@@ -137,6 +112,21 @@ void partialScanLocal(
} }
partialSum[blockIdx][digit] = excScanBlock[bend-1][digit] + countsBlock[bend-1][digit]; partialSum[blockIdx][digit] = excScanBlock[bend-1][digit] + countsBlock[bend-1][digit];
} }
#else
int prev[NUMDIGITS];
for (int digit = 0; digit < NUMDIGITS; digit++)
prev[digit] = bbeg == 0 ? excScanBlock[0][digit] : 0;
foreach_tiled (block = bbeg ... bend, digit = 0 ... NUMDIGITS)
{
const int y = countsBlock[block][digit];
excScanBlock[block][digit] = prev[digit];
prev[digit] += y;
}
foreach (digit = 0 ... NUMDIGITS)
partialSum[blockIdx][digit] = excScanBlock[bend-1][digit] + countsBlock[bend-1][digit];
#endif
} }
task task
@@ -310,6 +300,7 @@ export void radixSort(
/* computing offsets for each digit */ /* computing offsets for each digit */
radixExclusiveScan(numBlocks, excScan, counts, partialSum, prefixSum); radixExclusiveScan(numBlocks, excScan, counts, partialSum, prefixSum);
#if 1
/* sorting */ /* sorting */
launch [numBlocks] launch [numBlocks]
sortPass( sortPass(
@@ -319,6 +310,7 @@ export void radixSort(
numElements, numElements,
excScan); excScan);
sync; sync;
#endif
} }
} }