This commit is contained in:
Evghenii
2014-01-28 16:43:00 +01:00
parent 5a6b650d8b
commit 659573338c

View File

@@ -19,30 +19,37 @@ void countPass(
uniform int * uniform counts = countsAll + blockIdx*NUMDIGITS; uniform int * uniform counts = countsAll + blockIdx*NUMDIGITS;
const uniform int nloc = min(numElements - blockIdx*blockDim, blockDim); const uniform int nloc = min(numElements - blockIdx*blockDim, blockDim);
foreach (digit = 0 ... NUMDIGITS)
counts[digit] = 0;
#if 1 #if 0
uniform int shcounts[NUMDIGITS];
#else
#define shcounts counts
#endif
foreach (digit = 0 ... NUMDIGITS)
shcounts[digit] = 0;
foreach (i = 0 ... nloc) foreach (i = 0 ... nloc)
{ {
const int key = mask & ((unsigned int)keys[i] >> bit); const int key = mask & ((unsigned int)keys[i] >> bit);
uniform int skey; uniform int skey;
if (reduce_equal(key, &skey) == true) if (reduce_equal(key, &skey) == true)
counts[skey] += reduce_add(1); shcounts[skey] += reduce_add(1);
else else
{ {
#ifdef __NVPTX__ #ifdef __NVPTX__
atomic_add_global(&counts[key], 1); atomic_add_global(&shcounts[key], 1);
#else #else
atomic_add_local(&counts[key], 1); atomic_add_local(&shcounts[key], 1);
#endif #endif
} }
} }
#else
#endif
foreach (digit = 0 ... NUMDIGITS) foreach (digit = 0 ... NUMDIGITS)
atomic_add_global(&countsGlobal[digit], counts[digit]); {
counts[digit] = shcounts[digit];
atomic_add_global(&countsGlobal[digit], shcounts[digit]);
}
} }
task task
@@ -51,22 +58,23 @@ void sortPass(
uniform int sorted[], uniform int sorted[],
uniform int bit, uniform int bit,
uniform int numElements, uniform int numElements,
uniform int digitOffsetsAll[], uniform int digitOffsetsAll[])
uniform int sharedCounts[])
{ {
const uniform int blockIdx = taskIndex; const uniform int blockIdx = taskIndex;
const uniform int numBlocks = taskCount; const uniform int numBlocks = taskCount;
const uniform int blockDim = (numElements + numBlocks - 1) / numBlocks; const uniform int blockDim = (numElements + numBlocks - 1) / numBlocks;
uniform int localCounts[NUMDIGITS];
uniform int * uniform localCounts = sharedCounts + blockIdx*NUMDIGITS;
const uniform int keyIndex = blockIdx * blockDim; const uniform int keyIndex = blockIdx * blockDim;
uniform int * uniform keys = keysAll + keyIndex; uniform int * uniform keys = keysAll + keyIndex;
uniform int * uniform digitOffsets = digitOffsetsAll + blockIdx*NUMDIGITS; uniform int * uniform digitOffsets = digitOffsetsAll + blockIdx*NUMDIGITS;
const uniform int nloc = min(numElements - keyIndex, blockDim); const uniform int nloc = min(numElements - keyIndex, blockDim);
const int unitScan = exclusive_scan_add(1);
const uniform int mask = (1 << NUMBITS) - 1; const uniform int mask = (1 << NUMBITS) - 1;
foreach (i = 0 ... NUMDIGITS) foreach (i = 0 ... NUMDIGITS)
localCounts[i] = 0; localCounts[i] = 0;
@@ -75,10 +83,18 @@ void sortPass(
{ {
const int key = mask & ((unsigned int)keys[i] >> bit); const int key = mask & ((unsigned int)keys[i] >> bit);
int rel; int rel;
foreach_active(iv) if (reduce_equal(key) == true)
{ {
rel = localCounts[key]; rel = localCounts[key] + unitScan;
localCounts[key]++; localCounts[key] = rel+1;
}
else
{
#ifdef __NVPTX__
rel = atomic_add_global(&localCounts[key],1);
#else
rel = atomic_add_local(&localCounts[key],1);
#endif
} }
const int scatter = rel + digitOffsets[key]; const int scatter = rel + digitOffsets[key];
sorted [scatter] = keys[i]; sorted [scatter] = keys[i];
@@ -177,14 +193,12 @@ inline void radixExclusiveScan(
static uniform int * uniform memoryPool = NULL; static uniform int * uniform memoryPool = NULL;
static uniform int numBlocks; static uniform int numBlocks;
static uniform int nSharedCounts;
static uniform int nCountsGlobal; static uniform int nCountsGlobal;
static uniform int nExcScan; static uniform int nExcScan;
static uniform int nCountsBlock; static uniform int nCountsBlock;
static uniform int nPartialSum; static uniform int nPartialSum;
static uniform int nPrefixSum; static uniform int nPrefixSum;
static uniform int * uniform sharedCounts;
static uniform int * uniform countsGlobal; static uniform int * uniform countsGlobal;
static uniform int * uniform excScan; static uniform int * uniform excScan;
static uniform int * uniform counts; static uniform int * uniform counts;
@@ -198,7 +212,6 @@ export void radixSort_alloc(const uniform int n)
{ {
assert(memoryPool == NULL); assert(memoryPool == NULL);
numBlocks = num_cores()*4; numBlocks = num_cores()*4;
nSharedCounts = NUMDIGITS*numBlocks;
nCountsGlobal = NUMDIGITS; nCountsGlobal = NUMDIGITS;
nExcScan = NUMDIGITS*numBlocks; nExcScan = NUMDIGITS*numBlocks;
nCountsBlock = NUMDIGITS*numBlocks; nCountsBlock = NUMDIGITS*numBlocks;
@@ -207,7 +220,6 @@ export void radixSort_alloc(const uniform int n)
const uniform int nalloc = const uniform int nalloc =
nSharedCounts +
nCountsGlobal + nCountsGlobal +
nExcScan + nExcScan +
nCountsBlock + nCountsBlock +
@@ -216,8 +228,7 @@ export void radixSort_alloc(const uniform int n)
memoryPool = uniform new uniform int[nalloc]; memoryPool = uniform new uniform int[nalloc];
sharedCounts = memoryPool; countsGlobal = memoryPool;
countsGlobal = sharedCounts + nSharedCounts;
excScan = countsGlobal + nCountsGlobal; excScan = countsGlobal + nCountsGlobal;
counts = excScan + nExcScan; counts = excScan + nExcScan;
partialSum = counts + nCountsBlock; partialSum = counts + nCountsBlock;
@@ -292,8 +303,7 @@ export void radixSort(
bufKeys, bufKeys,
bit, bit,
numElements, numElements,
excScan, excScan);
sharedCounts);
sync; sync;
uniform int * uniform tmp = keys; uniform int * uniform tmp = keys;