+1
This commit is contained in:
@@ -19,30 +19,37 @@ void countPass(
|
|||||||
uniform int * uniform counts = countsAll + blockIdx*NUMDIGITS;
|
uniform int * uniform counts = countsAll + blockIdx*NUMDIGITS;
|
||||||
const uniform int nloc = min(numElements - blockIdx*blockDim, blockDim);
|
const uniform int nloc = min(numElements - blockIdx*blockDim, blockDim);
|
||||||
|
|
||||||
foreach (digit = 0 ... NUMDIGITS)
|
|
||||||
counts[digit] = 0;
|
|
||||||
|
|
||||||
#if 1
|
#if 0
|
||||||
|
uniform int shcounts[NUMDIGITS];
|
||||||
|
#else
|
||||||
|
#define shcounts counts
|
||||||
|
#endif
|
||||||
|
|
||||||
|
foreach (digit = 0 ... NUMDIGITS)
|
||||||
|
shcounts[digit] = 0;
|
||||||
|
|
||||||
foreach (i = 0 ... nloc)
|
foreach (i = 0 ... nloc)
|
||||||
{
|
{
|
||||||
const int key = mask & ((unsigned int)keys[i] >> bit);
|
const int key = mask & ((unsigned int)keys[i] >> bit);
|
||||||
uniform int skey;
|
uniform int skey;
|
||||||
if (reduce_equal(key, &skey) == true)
|
if (reduce_equal(key, &skey) == true)
|
||||||
counts[skey] += reduce_add(1);
|
shcounts[skey] += reduce_add(1);
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
#ifdef __NVPTX__
|
#ifdef __NVPTX__
|
||||||
atomic_add_global(&counts[key], 1);
|
atomic_add_global(&shcounts[key], 1);
|
||||||
#else
|
#else
|
||||||
atomic_add_local(&counts[key], 1);
|
atomic_add_local(&shcounts[key], 1);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#else
|
|
||||||
#endif
|
|
||||||
|
|
||||||
foreach (digit = 0 ... NUMDIGITS)
|
foreach (digit = 0 ... NUMDIGITS)
|
||||||
atomic_add_global(&countsGlobal[digit], counts[digit]);
|
{
|
||||||
|
counts[digit] = shcounts[digit];
|
||||||
|
atomic_add_global(&countsGlobal[digit], shcounts[digit]);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
task
|
task
|
||||||
@@ -51,22 +58,23 @@ void sortPass(
|
|||||||
uniform int sorted[],
|
uniform int sorted[],
|
||||||
uniform int bit,
|
uniform int bit,
|
||||||
uniform int numElements,
|
uniform int numElements,
|
||||||
uniform int digitOffsetsAll[],
|
uniform int digitOffsetsAll[])
|
||||||
uniform int sharedCounts[])
|
|
||||||
{
|
{
|
||||||
const uniform int blockIdx = taskIndex;
|
const uniform int blockIdx = taskIndex;
|
||||||
const uniform int numBlocks = taskCount;
|
const uniform int numBlocks = taskCount;
|
||||||
|
|
||||||
const uniform int blockDim = (numElements + numBlocks - 1) / numBlocks;
|
const uniform int blockDim = (numElements + numBlocks - 1) / numBlocks;
|
||||||
|
|
||||||
|
uniform int localCounts[NUMDIGITS];
|
||||||
uniform int * uniform localCounts = sharedCounts + blockIdx*NUMDIGITS;
|
|
||||||
|
|
||||||
const uniform int keyIndex = blockIdx * blockDim;
|
const uniform int keyIndex = blockIdx * blockDim;
|
||||||
uniform int * uniform keys = keysAll + keyIndex;
|
uniform int * uniform keys = keysAll + keyIndex;
|
||||||
uniform int * uniform digitOffsets = digitOffsetsAll + blockIdx*NUMDIGITS;
|
uniform int * uniform digitOffsets = digitOffsetsAll + blockIdx*NUMDIGITS;
|
||||||
const uniform int nloc = min(numElements - keyIndex, blockDim);
|
const uniform int nloc = min(numElements - keyIndex, blockDim);
|
||||||
|
|
||||||
|
const int unitScan = exclusive_scan_add(1);
|
||||||
|
|
||||||
|
|
||||||
const uniform int mask = (1 << NUMBITS) - 1;
|
const uniform int mask = (1 << NUMBITS) - 1;
|
||||||
foreach (i = 0 ... NUMDIGITS)
|
foreach (i = 0 ... NUMDIGITS)
|
||||||
localCounts[i] = 0;
|
localCounts[i] = 0;
|
||||||
@@ -75,10 +83,18 @@ void sortPass(
|
|||||||
{
|
{
|
||||||
const int key = mask & ((unsigned int)keys[i] >> bit);
|
const int key = mask & ((unsigned int)keys[i] >> bit);
|
||||||
int rel;
|
int rel;
|
||||||
foreach_active(iv)
|
if (reduce_equal(key) == true)
|
||||||
{
|
{
|
||||||
rel = localCounts[key];
|
rel = localCounts[key] + unitScan;
|
||||||
localCounts[key]++;
|
localCounts[key] = rel+1;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
#ifdef __NVPTX__
|
||||||
|
rel = atomic_add_global(&localCounts[key],1);
|
||||||
|
#else
|
||||||
|
rel = atomic_add_local(&localCounts[key],1);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
const int scatter = rel + digitOffsets[key];
|
const int scatter = rel + digitOffsets[key];
|
||||||
sorted [scatter] = keys[i];
|
sorted [scatter] = keys[i];
|
||||||
@@ -177,14 +193,12 @@ inline void radixExclusiveScan(
|
|||||||
|
|
||||||
static uniform int * uniform memoryPool = NULL;
|
static uniform int * uniform memoryPool = NULL;
|
||||||
static uniform int numBlocks;
|
static uniform int numBlocks;
|
||||||
static uniform int nSharedCounts;
|
|
||||||
static uniform int nCountsGlobal;
|
static uniform int nCountsGlobal;
|
||||||
static uniform int nExcScan;
|
static uniform int nExcScan;
|
||||||
static uniform int nCountsBlock;
|
static uniform int nCountsBlock;
|
||||||
static uniform int nPartialSum;
|
static uniform int nPartialSum;
|
||||||
static uniform int nPrefixSum;
|
static uniform int nPrefixSum;
|
||||||
|
|
||||||
static uniform int * uniform sharedCounts;
|
|
||||||
static uniform int * uniform countsGlobal;
|
static uniform int * uniform countsGlobal;
|
||||||
static uniform int * uniform excScan;
|
static uniform int * uniform excScan;
|
||||||
static uniform int * uniform counts;
|
static uniform int * uniform counts;
|
||||||
@@ -198,7 +212,6 @@ export void radixSort_alloc(const uniform int n)
|
|||||||
{
|
{
|
||||||
assert(memoryPool == NULL);
|
assert(memoryPool == NULL);
|
||||||
numBlocks = num_cores()*4;
|
numBlocks = num_cores()*4;
|
||||||
nSharedCounts = NUMDIGITS*numBlocks;
|
|
||||||
nCountsGlobal = NUMDIGITS;
|
nCountsGlobal = NUMDIGITS;
|
||||||
nExcScan = NUMDIGITS*numBlocks;
|
nExcScan = NUMDIGITS*numBlocks;
|
||||||
nCountsBlock = NUMDIGITS*numBlocks;
|
nCountsBlock = NUMDIGITS*numBlocks;
|
||||||
@@ -207,7 +220,6 @@ export void radixSort_alloc(const uniform int n)
|
|||||||
|
|
||||||
|
|
||||||
const uniform int nalloc =
|
const uniform int nalloc =
|
||||||
nSharedCounts +
|
|
||||||
nCountsGlobal +
|
nCountsGlobal +
|
||||||
nExcScan +
|
nExcScan +
|
||||||
nCountsBlock +
|
nCountsBlock +
|
||||||
@@ -216,8 +228,7 @@ export void radixSort_alloc(const uniform int n)
|
|||||||
|
|
||||||
memoryPool = uniform new uniform int[nalloc];
|
memoryPool = uniform new uniform int[nalloc];
|
||||||
|
|
||||||
sharedCounts = memoryPool;
|
countsGlobal = memoryPool;
|
||||||
countsGlobal = sharedCounts + nSharedCounts;
|
|
||||||
excScan = countsGlobal + nCountsGlobal;
|
excScan = countsGlobal + nCountsGlobal;
|
||||||
counts = excScan + nExcScan;
|
counts = excScan + nExcScan;
|
||||||
partialSum = counts + nCountsBlock;
|
partialSum = counts + nCountsBlock;
|
||||||
@@ -292,8 +303,7 @@ export void radixSort(
|
|||||||
bufKeys,
|
bufKeys,
|
||||||
bit,
|
bit,
|
||||||
numElements,
|
numElements,
|
||||||
excScan,
|
excScan);
|
||||||
sharedCounts);
|
|
||||||
sync;
|
sync;
|
||||||
|
|
||||||
uniform int * uniform tmp = keys;
|
uniform int * uniform tmp = keys;
|
||||||
|
|||||||
Reference in New Issue
Block a user