compiles
@@ -1,255 +1,242 @@
-#if 1
-struct int2 { int x,y; };
-struct int4 { int x,y,z,w; };
-#else
-typedef int<2> int2;
-typedef int<4> int4;
+#define NUMBITS 8
+#define NUMDIGITS (1<<NUMBITS)
+
+task
+void computeHistogram(
+    const uniform int keysAll[],
+    const uniform int bit,
+    const uniform int numElements,
+    uniform int countsAll[],
+    uniform int countsGlobal[])
+{
+  const uniform int blockIdx  = taskIndex;
+  const uniform int numBlocks = taskCount;
+  const uniform int blockDim  = (numElements + numBlocks - 1) / numBlocks;
+
+  const uniform int mask = (1 << NUMBITS) - 1;
+
+  const uniform int * uniform keys = keysAll + blockIdx*blockDim;
+  uniform int * uniform counts = countsAll + blockIdx*NUMDIGITS;
+  const uniform int nloc = min(numElements - blockIdx*blockDim, blockDim);
+
+  foreach (digit = 0 ... NUMDIGITS)
+    counts[digit] = 0;
+
+  foreach (i = 0 ... nloc)
+  {
+    const int key = mask & ((unsigned int)keys[i] >> bit);
+    atomic_add_local(&counts[key], 1);
+  }
+
+  foreach (digit = 0 ... NUMDIGITS)
+    atomic_add_global(&countsGlobal[digit], counts[digit]);
+}
+
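Note (editor's sketch, not part of the commit): computeHistogram needs the atomic increment because several lanes of the gang can map to the same digit within one foreach iteration. The same pattern in isolation, with a hypothetical helper name:

    static inline void histogramGang(const uniform int keys[], uniform int n,
                                     uniform int bit, uniform int counts[])
    {
        const uniform int mask = (1 << NUMBITS) - 1;
        foreach (i = 0 ... n) {
            const int d = mask & ((unsigned int)keys[i] >> bit);
            // lanes that extract the same digit d would race without the atomic
            atomic_add_local(&counts[d], 1);
        }
    }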
+task
+void sortPass(
+    uniform int keysAll[],
+    uniform int sorted[],
+    uniform int bit,
+    uniform int numElements,
+    uniform int digitOffsetsAll[],
+    uniform int sharedCounts[])
+{
+  const uniform int blockIdx  = taskIndex;
+  const uniform int numBlocks = taskCount;
+
+  const uniform int blockDim = (numElements + numBlocks - 1) / numBlocks;
+
+  const uniform int mask = (1 << NUMBITS) - 1;
+
+  uniform int * uniform localCounts = sharedCounts + blockIdx*NUMDIGITS;
+
+  const uniform int keyIndex = blockIdx * blockDim;
+  uniform int * uniform keys = keysAll + keyIndex;
+  uniform int * uniform digitOffsets = digitOffsetsAll + blockIdx*NUMDIGITS;
+  const uniform int nloc = min(numElements - keyIndex, blockDim);
+
+  foreach (i = 0 ... NUMDIGITS)
+    localCounts[i] = 0;
+
+  foreach (i = 0 ... nloc)
+  {
+    const int key = mask & ((unsigned int)keys[i] >> bit);
+    const int rel = localCounts[key];
+    const int scatter = rel + digitOffsets[key];
+    sorted[scatter] = keys[i];
+    localCounts[key] = 1 + rel;
+  }
+}
+
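Note (editor's sketch, not part of the commit): the scatter in sortPass must place equal digits in input order for each pass to be stable, which is what makes LSD radix sort correct across passes. A scalar reference variant of the same step, with hypothetical names:

    static inline void scatterBlockScalar(const uniform int keys[], uniform int n,
                                          uniform int bit,
                                          const uniform int digitOffsets[],
                                          uniform int localCounts[],
                                          uniform int sorted[])
    {
        const uniform int mask = (1 << NUMBITS) - 1;
        for (uniform int i = 0; i < n; i++) {
            const uniform int d = mask & ((unsigned int)keys[i] >> bit);
            sorted[digitOffsets[d] + localCounts[d]] = keys[i];
            localCounts[d]++;   // equal digits keep their input order (stability)
        }
    }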
+task
+void partialScanLocal(
+    uniform int excScanPtr[],
+    uniform int countsPtr[],
+    uniform int partialSum[])
+{
+  const uniform int numBlocks = taskCount;
+  const uniform int blockIdx  = taskIndex;
+
+  const uniform int blockDim = (numBlocks+taskCount-1)/taskCount;
+  const uniform int bbeg = blockIdx * blockDim;
+  const uniform int bend = min(bbeg + blockDim, numBlocks);
+
+  foreach (digit = 0 ... NUMDIGITS)
+  {
+    uniform int * uniform excScanBlock = excScanPtr + bbeg*NUMDIGITS;
+    uniform int * uniform countsBlock  = countsPtr  + bbeg*NUMDIGITS;
+
+    int prev = bbeg == 0 ? excScanBlock[digit] : 0;
+    for (uniform int block = bbeg; block < bend; block++)
+    {
+      const int y = countsBlock[digit];
+      excScanBlock[digit] = prev;
+      prev += y;
+
+      excScanBlock += NUMDIGITS;
+      countsBlock  += NUMDIGITS;
+    }
+
+    excScanBlock -= NUMDIGITS;
+    countsBlock  -= NUMDIGITS;
+
+    partialSum[blockIdx*NUMDIGITS + digit] = excScanBlock[digit] + countsBlock[digit];
+  }
+}
+
+task
+void partialScanGlobal(
+    const uniform int numBlocks,
+    uniform int partialSum[],
+    uniform int prefixSum[])
+{
+  const int digit = taskIndex;
+  int carry = 0;
+  foreach (block = 0 ... numBlocks)
+  {
+    const int value = partialSum[block*NUMDIGITS + digit];
+    const int scan = exclusive_scan_add(value);
+    prefixSum[block*NUMDIGITS + digit] = value + carry;
+    carry = broadcast(scan+value, programCount-1);
+  }
+}
+
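Note (editor's sketch, not part of the commit): the exclusive_scan_add/broadcast pairing above is the usual gang-wide scan idiom. Written out on a flat array (a hypothetical helper, assuming n is a multiple of programCount so every lane is active in each chunk):

    static inline void exclusiveScanArray(const uniform int in[],
                                          uniform int out[], uniform int n)
    {
        int carry = 0;
        foreach (i = 0 ... n) {
            const int v = in[i];
            const int s = exclusive_scan_add(v);         // scan within the gang
            out[i] = carry + s;                          // exclusive prefix of in[i]
            carry += broadcast(s + v, programCount - 1); // add this chunk's total
        }
    }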
+task
+void completeScanGlobal(
+    uniform int excScanAll[],
+    uniform int carryValue[])
+{
+  const uniform int numBlocks = taskCount;
+  const uniform int blockIdx  = taskIndex;
+  const uniform int blockDim  = (numBlocks+taskCount-1)/taskCount;
+  const uniform int bbeg = blockIdx * blockDim;
+  const uniform int bend = min(bbeg + blockDim, numBlocks);
+
+  carryValue += blockIdx*NUMDIGITS;
+  foreach (digit = 0 ... NUMDIGITS)
+  {
+    const int carry = carryValue[digit];
+    uniform int * uniform excScanBlock = excScanAll + bbeg*NUMDIGITS;
+    for (uniform int block = bbeg; block < bend; block++, excScanBlock += NUMDIGITS)
+      excScanBlock[digit] += carry;
+  }
+}
+
+static
+inline void radixExclusiveScan(
+    const uniform int numBlocks,
+    uniform int excScanPtr[],
+    uniform int countsPtr[],
+    uniform int partialSum[],
+    uniform int prefixSum[])
+{
+  launch [numBlocks] partialScanLocal(excScanPtr, countsPtr, partialSum);
+  sync;
+
+  launch [NUMDIGITS] partialScanGlobal(numBlocks, partialSum, prefixSum);
+  sync;
+
+  launch [numBlocks] completeScanGlobal(excScanPtr, prefixSum);
+  sync;
+}
+
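Note (editor's sketch, not part of the commit): folded into one scalar loop, the three launches of radixExclusiveScan are intended to compute, for every digit, an exclusive scan down that digit's column of the numBlocks x NUMDIGITS counts table, seeded with the digit's global offset already stored in row 0 of excScan:

    static inline void radixExclusiveScanScalar(uniform int numBlocks,
                                                uniform int excScan[],
                                                const uniform int counts[])
    {
        for (uniform int digit = 0; digit < NUMDIGITS; digit++) {
            uniform int prev = excScan[digit];   // global offset of this digit
            for (uniform int block = 0; block < numBlocks; block++) {
                const uniform int c = counts[block*NUMDIGITS + digit];
                excScan[block*NUMDIGITS + digit] = prev;
                prev += c;
            }
        }
    }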
+export void radixSort(
+    const uniform int numElements,
+    uniform int keys[],
+    uniform int sorted[])
+{
+  const uniform int numBlocks = num_cores()*2;
+
+#ifdef __NVPTX__
+  assert((numBlocks & 3) == 0);  /* task granularity on Kepler is 4 */
 #endif
 
-static int4 scan4(const int4 idata)
-{
-    const int idx = programIndex;
-
-    int4 val4 = idata;
-    int sum[3];
-    sum[0] = val4.x;
-    sum[1] = val4.y + sum[0];
-    sum[2] = val4.z + sum[1];
-
-    int val = val4.w + sum[2];
-    val = exclusive_scan_add(val);
-
-    val4.x = val;
-    val4.y = val + sum[0];
-    val4.z = val + sum[1];
-    val4.w = val + sum[2];
-
-    return val4;
-}
+  const uniform int blockDim = (numElements + numBlocks - 1) / numBlocks;
+
+  const uniform int nSharedCounts = NUMDIGITS*numBlocks;
+  const uniform int nCountsGlobal = NUMDIGITS;
+  const uniform int nExcScan      = NUMDIGITS*numBlocks;
+  const uniform int nCountsBlock  = NUMDIGITS*numBlocks;
+  const uniform int nPartialSum   = NUMDIGITS*numBlocks;
+  const uniform int nPrefixSum    = NUMDIGITS*numBlocks;
+
+  const uniform int nalloc =
+      nSharedCounts +
+      nCountsGlobal +
+      nExcScan +
+      nCountsBlock +
+      nPartialSum +
+      nPrefixSum;
+
+  uniform int * uniform mem_pool = uniform new uniform int[nalloc];
+
+  uniform int * uniform sharedCounts = mem_pool;
+  uniform int * uniform countsGlobal = sharedCounts + nSharedCounts;
+  uniform int * uniform excScan      = countsGlobal + nCountsGlobal;
+  uniform int * uniform countsBlock  = excScan      + nExcScan;
+  uniform int * uniform partialSum   = countsBlock  + nCountsBlock;
+  uniform int * uniform prefixSum    = partialSum   + nPartialSum;
-static int4 rank4(int4 preds)
-{
-    const int localId = programIndex;
-    const uniform int localSize = programCount;
-
-    const int4 address = scan4(preds);
-
-    const int numtrue = broadcast(address.w + preds.w, localSize-1);
-
-    int4 rank;
-    const int idx = localId*4;
-    rank.x = (preds.x) ? address.x : numtrue + idx     - address.x;
-    rank.y = (preds.y) ? address.y : numtrue + idx + 1 - address.y;
-    rank.z = (preds.z) ? address.z : numtrue + idx + 2 - address.z;
-    rank.w = (preds.w) ? address.w : numtrue + idx + 3 - address.w;
-
-    return rank;
-}
-
-static void radixSortBlockKeysOnly(int4 &key_inout, uniform int nbits, uniform int startbit)
-{
-    const int localId = programIndex;
-    const uniform int localSize = programCount;
-
-    uniform int sMem[programCount*4];
-
-    int4 key = key_inout;
-    for (uniform int shift = startbit; shift < (startbit + nbits); ++shift)
+  for (uniform int bit = 0; bit < 32; bit += NUMBITS)
   {
-        int4 lsb;
-        lsb.x = !((key.x >> shift) & 0x1);
-        lsb.y = !((key.y >> shift) & 0x1);
-        lsb.z = !((key.z >> shift) & 0x1);
-        lsb.w = !((key.w >> shift) & 0x1);
-
-        const int4 r = rank4(lsb);
-
-        // This arithmetic strides the ranks across 4 CTA_SIZE regions
-        sMem[(r.x & 3) * localSize + (r.x >> 2)] = key.x;
-        sMem[(r.y & 3) * localSize + (r.y >> 2)] = key.y;
-        sMem[(r.z & 3) * localSize + (r.z >> 2)] = key.z;
-        sMem[(r.w & 3) * localSize + (r.w >> 2)] = key.w;
-
-        // The above allows us to read without 4-way bank conflicts:
-        key.x = sMem[localId];
-        key.y = sMem[localId +     localSize];
-        key.z = sMem[localId + 2 * localSize];
-        key.w = sMem[localId + 3 * localSize];
-    }
-}
+    /* initialize histogram for each digit */
+    foreach (digit = 0 ... NUMDIGITS)
+      countsGlobal[digit] = 0;
+
+    /* compute histogram for each digit */
+    launch [numBlocks] computeHistogram(keys, bit, numElements, countsBlock, countsGlobal);
+    sync;
+
+    /* exclusive scan on global histogram */
+    int carry = 0;
+    excScan[0] = 0;
+    foreach (digit = 0 ... NUMDIGITS)
-
-task void radixSortBlocksKeysOnly(
-    uniform int keysIn[],
-    uniform int keysOut[],
-    uniform int nbits,
-    uniform int startbit,
-    uniform int numElements,
-    uniform int totalBlocks)
-{
-    const int globalId = taskIndex * programCount + programIndex;
-
-    int4 key;
-    key.x = keysIn[4*globalId + 0];
-    key.y = keysIn[4*globalId + 1];
-    key.z = keysIn[4*globalId + 2];
-    key.w = keysIn[4*globalId + 3];
-
-    radixSortBlockKeysOnly(key, nbits, startbit);
-
-    keysOut[4*globalId+0] = key.x;
-    keysOut[4*globalId+1] = key.y;
-    keysOut[4*globalId+2] = key.z;
-    keysOut[4*globalId+3] = key.w;
-}
-
-//----------------------------------------------------------------------------
-// Given an array with blocks sorted according to a 4-bit radix group, each
-// block counts the number of keys that fall into each radix in the group, and
-// finds the starting offset of each radix in the block. It then writes the radix
-// counts to the counters array, and the starting offsets to the blockOffsets array.
-//
-// Template parameters are used to generate efficient code for various special cases.
-// For example, we have to handle arrays that are a multiple of the block size
-// (fullBlocks) differently than arrays that are not. "loop" is used when persistent
-// CTAs are used.
-//
-// By persistent CTAs we mean that we launch only as many thread blocks as can
-// be resident in the GPU and no more, rather than launching as many threads as
-// we have elements. Persistent CTAs loop over blocks of elements until all work
-// is complete. This can be faster in some cases. In our tests it is faster
-// for large sorts (and the threshold is higher on compute version 1.1 and earlier
-// GPUs than it is on compute version 1.2 GPUs).
-//
-//----------------------------------------------------------------------------
-task void findRadixOffsets(
-    uniform int keys[],
-    uniform int counters[],
-    uniform int blockOffsets[],
-    uniform int startbit,
-    uniform int numElements,
-    uniform int totalBlocks)
-{
-    uniform int sStartPointers[16];
-
-    const uniform int groupId = taskIndex;
-    const int localId = programIndex;
-    const uniform int groupSize = programCount;
-    const int globalId = taskIndex * programCount + programIndex;
-
-    int2 radix2;
-
-    radix2.x = keys[2*globalId + 0];
-    radix2.y = keys[2*globalId + 1];
-
-    uniform int sRadix1[4*programCount];
-
-    sRadix1[2 * localId]     = (radix2.x >> startbit) & 0xF;
-    sRadix1[2 * localId + 1] = (radix2.y >> startbit) & 0xF;
-
-    // Finds the position where the sRadix1 entries differ and stores start
-    // index for each radix.
-    if(localId < 16)
-        sStartPointers[localId] = 0;
-
-    if((localId > 0) && (sRadix1[localId] != sRadix1[localId - 1]) )
-        sStartPointers[sRadix1[localId]] = localId;
-
-    if(sRadix1[localId + groupSize] != sRadix1[localId + groupSize - 1])
-        sStartPointers[sRadix1[localId + groupSize]] = localId + groupSize;
-
-    if(localId < 16)
-        blockOffsets[groupId*16 + localId] = sStartPointers[localId];
-
-    // Compute the sizes of each block.
-    if((localId > 0) && (sRadix1[localId] != sRadix1[localId - 1]) )
-        sStartPointers[sRadix1[localId - 1]] =
-            localId - sStartPointers[sRadix1[localId - 1]];
-    if(sRadix1[localId + groupSize] != sRadix1[localId + groupSize - 1] )
-        sStartPointers[sRadix1[localId + groupSize - 1]] =
-            localId + groupSize - sStartPointers[sRadix1[localId + groupSize - 1]];
-
-    if(localId == groupSize - 1)
-        sStartPointers[sRadix1[2 * groupSize - 1]] =
-            2 * groupSize - sStartPointers[sRadix1[2 * groupSize - 1]];
-
-    if(localId < 16)
-        counters[localId * totalBlocks + groupId] = sStartPointers[localId];
-}
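Note (editor's sketch, not part of the commit): the "persistent CTAs" idea described in the deleted comment maps naturally onto ispc tasks as a grid-stride loop, launching a fixed task count and letting each task walk the blocks; a hypothetical example:

    task void scaleBlocksPersistent(uniform int numBlocks, uniform int blockSize,
                                    uniform float data[])
    {
        // taskCount tasks cooperatively cover all numBlocks blocks
        for (uniform int block = taskIndex; block < numBlocks; block += taskCount)
        {
            uniform float * uniform p = data + block*blockSize;
            foreach (i = 0 ... blockSize)
                p[i] *= 2.0f;
        }
    }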
-
-// a naive scan routine that works only for arrays that
-// can fit into a single block; just for debugging purposes,
-// not used in the sort now
-task void scanNaive(
-    uniform int odata[],
-    uniform int idata[],
-    uniform int n)
-{
-    if (programIndex < n)
-        odata[programIndex] = exclusive_scan_add(idata[programIndex]);
-}
-
-//----------------------------------------------------------------------------
-// reorderData shuffles data in the array globally after the radix offsets
-// have been found. On compute version 1.1 and earlier GPUs, this code depends
-// on RadixSort::CTA_SIZE being 16 * number of radices (i.e. 16 * 2^nbits).
-//
-// On compute version 1.1 GPUs ("manualCoalesce=true") this function ensures
-// that all writes are coalesced using extra work in the kernel. On later
-// GPUs coalescing rules have been relaxed, so this extra overhead hurts
-// performance. On these GPUs we set manualCoalesce=false and directly store
-// the results.
-//
-// Template parameters are used to generate efficient code for various special cases.
-// For example, we have to handle arrays that are a multiple of the block size
-// (fullBlocks) differently than arrays that are not. "loop" is used when persistent
-// CTAs are used.
-//
-// By persistent CTAs we mean that we launch only as many thread blocks as can
-// be resident in the GPU and no more, rather than launching as many threads as
-// we have elements. Persistent CTAs loop over blocks of elements until all work
-// is complete. This can be faster in some cases. In our tests it is faster
-// for large sorts (and the threshold is higher on compute version 1.1 and earlier
-// GPUs than it is on compute version 1.2 GPUs).
-//----------------------------------------------------------------------------
-task void reorderDataKeysOnly(
-    uniform int outKeys[],
-    uniform int keys[],
-    uniform int blockOffsets[],
-    uniform int offsets[],
-    uniform int sizes[],
-    uniform int startbit,
-    uniform int numElements,
-    uniform int totalBlocks)
-{
-    uniform int sOffsets[16];
-    uniform int sBlockOffsets[16];
-    uniform int2 sKeys2[programCount];
-    uniform int * uniform sKeys1 = (uniform int * uniform)&sKeys2[0];
-
-    const uniform int groupId = taskIndex;
-    const int globalId = taskIndex*programCount + programIndex;
-    const int localId = programIndex;
-    const uniform int groupSize = programCount;
-
-    sKeys2[localId].x = keys[2*globalId + 0];
-    sKeys2[localId].y = keys[2*globalId + 1];
-
-    if(localId < 16)
     {
-      sOffsets[localId] = offsets[localId * totalBlocks + groupId];
-      sBlockOffsets[localId] = blockOffsets[groupId * 16 + localId];
+      const int value = countsGlobal[digit];
+      const int scan = exclusive_scan_add(value);
+      excScan[digit] = value + carry;
+      carry += broadcast(scan+value, programCount-1);
     }
 
-    int radix = (sKeys1[localId] >> startbit) & 0xF;
-    int globalOffset = sOffsets[radix] + localId - sBlockOffsets[radix];
-
-    if (globalOffset < numElements)
-        outKeys[globalOffset] = sKeys1[localId];
-
-    radix = (sKeys1[localId + groupSize] >> startbit) & 0xF;
-    globalOffset = sOffsets[radix] + localId + groupSize - sBlockOffsets[radix];
-
-    if (globalOffset < numElements)
-        outKeys[globalOffset] = sKeys1[localId + groupSize];
+    /* computing offsets for each digit */
+    radixExclusiveScan(numBlocks, excScan, countsBlock, partialSum, prefixSum);
+
+    /* sorting */
+    launch [numBlocks]
+      sortPass(
+          keys,
+          sorted,
+          bit,
+          numElements,
+          excScan,
+          sharedCounts);
+    sync;
+
+    uniform int * uniform tmp = keys;
+    keys = sorted;
+    sorted = tmp;
+  }
+
+  delete mem_pool;
 }
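Note (editor's sketch, not part of the commit): one full pass of the new pipeline, written as plain uniform code, is useful as a reference when validating the task-parallel version. With NUMBITS = 8 the 32-bit sort runs four such passes, and the even pass count means the fully sorted data ends up back in the caller's keys[] array after the final swap:

    static void referencePass(const uniform int keys[], uniform int sorted[],
                              uniform int n, uniform int bit)
    {
        const uniform int mask = (1 << NUMBITS) - 1;
        uniform int counts[NUMDIGITS];
        for (uniform int d = 0; d < NUMDIGITS; d++)
            counts[d] = 0;
        for (uniform int i = 0; i < n; i++)                // histogram
            counts[mask & ((unsigned int)keys[i] >> bit)]++;
        uniform int offset = 0;                            // exclusive scan
        for (uniform int d = 0; d < NUMDIGITS; d++) {
            const uniform int c = counts[d];
            counts[d] = offset;
            offset += c;
        }
        for (uniform int i = 0; i < n; i++) {              // stable scatter
            const uniform int d = mask & ((unsigned int)keys[i] >> bit);
            sorted[counts[d]++] = keys[i];
        }
    }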
@@ -1,25 +1,29 @@
 #define NUMBITS 8
-#define NUMBUCKETS (1<<NUMBITS)
+#define NUMDIGITS (1<<NUMBITS)
 
 task
 void localHistogram(
-    uniform unsigned int32 keys_all[],
-    uniform int32 bit,
-    uniform int32 count_all,
-    uniform int32 counts_all[])
+    uniform int blockSize,
+    uniform int numBlocks,
+    uniform int keys_all[],
+    uniform int bit,
+    uniform int count_all,
+    uniform int counts_all[])
 {
-  const uniform unsigned int mask = (1 << NUMBITS) - 1;
+  const uniform int mask = (1 << NUMBITS) - 1;
   for (uniform int block = taskIndex; block < numBlocks; block += taskCount)
+    if (block < numBlocks)
     {
-      uniform unsigned int32 * uniform keys = keys_all + block*blockSize;
-      uniform int32 * uniform keys = counts_all + block*NUMBUCKETS;
-      uniform int32 count = min(count_all - block*blockSize, blockSize);
+      uniform int * uniform keys = keys_all + block*blockSize;
+      uniform int * uniform counts = counts_all + block*NUMDIGITS;
+      uniform int count = min(count_all - block*blockSize, blockSize);
 
-      foreach (i = 0 ... NUMBUCKETS)
+      foreach (i = 0 ... NUMDIGITS)
         counts[i] = 0;
 
       foreach (i = 0 ... count)
       {
-        const int key = mask & (keys[i] >> bit);
+        const int key = mask & ((unsigned int)keys[i] >> bit);
        atomic_add_local(&counts[key], 1);
       }
     }
 }
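Note (editor's sketch, not part of the commit): the (unsigned int) cast added to the key extraction matters because, as in C, >> on a signed int is an arithmetic (sign-extending) shift, so negative keys would smear the sign bit into the digit; the cast makes it a logical shift:

    static inline int digitOf(int key, uniform int bit)
    {
        return ((1 << NUMBITS) - 1) & ((unsigned int)key >> bit);
    }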
@@ -27,47 +31,215 @@ void localHistogram(
 
 task
 void globalHistogram(
-    uniform int32 counts_all[],
-    uniform int32 countsGlobal[])
+    uniform int blockSize,
+    uniform int numBlocks,
+    uniform int counts_all[],
+    uniform int countsGlobal[])
 {
-  uniform int32 (* uniform countsBlock)[NUMBUCKETS] = (uniform int (*)[NUMBUCKETS]) counts;
-  for (uniform int digit = taskIndex; digit < NUMBUCKETS; digit += taskCount)
+  uniform int (* uniform countsBlock)[NUMDIGITS] = (uniform int (*)[NUMDIGITS]) counts;
+  for (uniform int digit = taskIndex; digit < NUMDIGITS; digit += taskCount)
   {
     int sum = 0;
     foreach (block = 0 ... numBlocks)
       sum += counts[block][digit];
     countsGlobal[digit] = reduce_add(sum);
   }
 
+  int sum[NUMDIGITS/programCount] = {0};
+  for (uniform int block = taskIndex; block < numBlocks; block += taskCount)
+    if (block < numBlocks)
+      for (int digit = programIndex; digit < NUMDIGITS; digit += programCount)
+        sum[digit/programCount] += countsBlock[block][digit];
+
+  for (int digit = programIndex; digit < NUMDIGITS; digit += programCount)
+    atomic_add_global(&countsGlobal[digit], sum[digit/programCount]);
+}
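Note (editor's sketch, not part of the commit): the strided accumulation added to globalHistogram assigns each lane the digits d with d % programCount == programIndex. The same pattern in isolation (hypothetical helper; assumes NUMDIGITS is a multiple of programCount):

    static void accumulateDigits(const uniform int countsBlock[],
                                 uniform int numBlocks,
                                 uniform int countsGlobal[])
    {
        int sum[NUMDIGITS/programCount];
        for (uniform int s = 0; s < NUMDIGITS/programCount; s++)
            sum[s] = 0;
        for (uniform int block = 0; block < numBlocks; block++)
            for (int digit = programIndex; digit < NUMDIGITS; digit += programCount)
                sum[digit/programCount] += countsBlock[block*NUMDIGITS + digit];
        // one atomic per digit merges the gang's partial sums
        for (int digit = programIndex; digit < NUMDIGITS; digit += programCount)
            atomic_add_global(&countsGlobal[digit], sum[digit/programCount]);
    }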
 
+task
+void sortPass(
+    uniform int blockSize,
+    uniform int numBlocks,
+    uniform int keys_all[],
+    uniform int sorted[],
+    uniform int bit,
+    uniform int count_all,
+    uniform int digitOffsets_all[],
+    uniform int shared_counts[])
+{
+  const uniform int mask = (1 << NUMBITS) - 1;
+
+  uniform int * uniform local_counts = shared_counts + taskIndex*NUMDIGITS;
+
+  for (uniform int block = taskIndex; block < numBlocks; block += taskCount)
+    if (block < numBlocks)
+    {
+      const uniform int keyIndex = block * blockSize;
+      uniform int * uniform keys = keys_all + keyIndex;
+      uniform int * uniform digitOffsets = digitOffsets_all + block*NUMDIGITS;
+      const uniform int count = min(count_all - keyIndex, blockSize);
+
+      foreach (i = 0 ... count)
+        local_counts[i] = 0;
+
+      foreach (i = 0 ... count)
+      {
+        const int key = mask & (keys[i] >> bit);
+        const int rel = local_counts[key];
+        const int scatter = rel + digitOffsets[key];
+        sorted[scatter] = keys[i];
+        local_counts[key] = 1 + rel;
+      }
+    }
+}
 
+task
+void partialScanLocal(
+    const uniform int numBlocks,
+    uniform int excScanPtr[],
+    uniform int countsPtr[],
+    uniform int partialSum[])
+{
+  const uniform int blockDim = (numBlocks+taskCount-1)/taskCount;
+  const uniform int bbeg = taskIndex * blockDim;
+  const uniform int bend = min(bbeg + blockDim, numBlocks);
+
+  if (bbeg >= numBlocks)
+    return;
+
+  foreach (digit = 0 ... NUMDIGITS)
+  {
+    uniform int * uniform excScanBlock = excScanPtr + bbeg*NUMDIGITS;
+    uniform int * uniform countsBlock = countsPtr + bbeg*NUMDIGITS;
+
+    int prev = bbeg == 0 ? excScanBlock[digit] : 0;
+    for (uniform int block = bbeg; block < bend; block++)
+    {
+      const int y = countsBlock[digit];
+      excScanBlock[digit] = prev;
+      prev += y;
+
+      excScanBlock += NUMDIGITS;
+      countsBlock += NUMDIGITS;
+    }
+
+    excScanBlock -= NUMDIGITS;
+    countsBlock -= NUMDIGITS;
+
+    partialSum[taskIndex*NUMDIGITS + digit] = excScanBlock[digit] + countsBlock[digit];
+  }
+}
 
+task
+void partialScanGlobal(
+    const uniform int nBlocks,
+    uniform int partialSum[],
+    uniform int prefixSum[])
+{
+  const int digit = taskIndex;
+  if (digit >= NUMDIGITS)
+    return;
+
+  int carry = 0;
+  foreach (block = 0 ... nBlocks)
+  {
+    const int value = partialSum[block*NUMDIGITS + digit];
+    const int scan = exclusive_scan_add(value);
+    prefixSum[block*NUMDIGITS + digit] = value + carry;
+    carry = broadcast(scan+value, programCount-1);
+  }
+}
+
+task
+void completeScanGlobal(
+    const uniform int numBlocks,
+    uniform int excScanPtr[],
+    uniform int carryValue[])
+{
+  const uniform int blockDim = (numBlocks+taskCount-1)/taskCount;
+  const uniform int bbeg = taskIndex * blockDim;
+  const uniform int bend = min(bbeg + blockDim, numBlocks);
+
+  if (bbeg >= numBlocks)
+    return;
+
+  carryValue += taskIndex*NUMDIGITS;
+  foreach (digit = 0 ... NUMDIGITS)
+  {
+    const int carry = carryValue[digit];
+    uniform int * uniform excScanBlock = excScanPtr + bbeg*NUMDIGITS;
+    for (uniform int block = bbeg; block < bend; block++, excScanBlock += NUMDIGITS)
+      excScanBlock[digit] += carry;
+  }
+}
 
+static
+inline void exclusiveScan(
+    const uniform int nTasks,
+    const uniform int numBlocks,
+    uniform int excScanPtr[],
+    uniform int countsPtr[],
+    uniform int partialSum[],
+    uniform int prefixSum[])
+{
+  launch [nTasks] partialScanLocal(numBlocks, excScanPtr, countsPtr, partialSum);
+  sync;
+
+  launch [NUMDIGITS] partialScanGlobal(nTasks, partialSum, prefixSum);
+  sync;
+
+  launch [nTasks] completeScanGlobal(numBlocks, excScanPtr, prefixSum);
+  sync;
 }
 
 export void radixSort()
 {
+  const uniform int nTasks = num_cores()*4;
+  uniform int * uniform sharedCounts = uniform new uniform int[NUMDIGITS*(nTasks+1)];
+  uniform int * uniform countsGlobal = sharedCounts + NUMDIGITS*nTasks;
+
   for (uniform int bit = 0; bit < 32; bit += NUMBITS)
   {
     /* histogramming each of the block */
-    launch [nBlocks] localHistogram(keys, bit, count, counts);
+    launch [nTasks] localHistogram(blockSize, keys, bit, count, counts);
+
+    foreach (digit = 0 ... NUMDIGITS)
+      countsGlobal[digit] = 0;
     sync;
 
-    /* compute global histogram */
-    launch [nBlocks] globalHistogram(counts, countsGlobal);
+    /* computing global histogram */
+    launch [nTasks] globalHistogram(count, countsGlobal);
     sync;
 
     /* exclusive scan on global histogram */
     int carry = 0;
-    foreach (i = 0 ... NUMBUCKETS)
+    foreach (digit = 0 ... NUMDIGITS)
     {
-      const int value = countsGlobal[i];
+      const int value = countsGlobal[digit];
       const int scan = exclusive_scan_add(value);
-      scanGlobal[i] = value + carry;
+      excScanBlockPtr[digit] = value + carry;
       carry = broadcast(scan+value, programCount-1);
     }
 
     /* computing offsets for each digit */
-    launch [nBlocks] computeGlobalOffset();
-    sync;
+    exclusiveScan(nTasks, excScanBlockPtr, countsBlockPtr, numBlocks);
 
     /* sorting */
-    launch [nBlocks] sort()
+    launch [nBlocks]
+      sortPass(
+          blockSize,
+          numBlocks,
+          keys,
+          sorted,
+          bit,
+          count,
+          excScanBlockPtr,
+          shared_counts);
+    sync;
+
+    uniform int * uniform tmp = keys;
+    keys = sorted;
+    sorted = tmp;
   }
 
+  delete shared_counts;
 }
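Note (editor's sketch, not part of the commit): both files lean on the exclusive-scan invariant excScan[i] == sum of counts[0..i-1]; a hypothetical checker for unit-testing any of the scan paths above:

    static uniform bool isExclusiveScan(const uniform int counts[],
                                        const uniform int excScan[],
                                        uniform int n)
    {
        uniform int sum = 0;
        for (uniform int i = 0; i < n; i++) {
            if (excScan[i] != sum)
                return false;
            sum += counts[i];
        }
        return true;
    }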