runs but incorrectly
This commit is contained in:
@@ -4,6 +4,6 @@ CPP_SRC=radixSort.cpp
|
|||||||
ISPC_SRC=radixSort.ispc
|
ISPC_SRC=radixSort.ispc
|
||||||
ISPC_IA_TARGETS=avx1-i32x8
|
ISPC_IA_TARGETS=avx1-i32x8
|
||||||
ISPC_ARM_TARGETS=neon
|
ISPC_ARM_TARGETS=neon
|
||||||
#ISPC_FLAGS=-DDEBUG
|
#ISPC_FLAGS=-DDEBUG -g
|
||||||
|
|
||||||
include ../common.mk
|
include ../common.mk
|
||||||
|
|||||||
@@ -38,14 +38,16 @@ int main (int argc, char *argv[])
|
|||||||
unsigned int *tmpv = new unsigned int [n];
|
unsigned int *tmpv = new unsigned int [n];
|
||||||
unsigned int *keys_orig = new unsigned int [n];
|
unsigned int *keys_orig = new unsigned int [n];
|
||||||
|
|
||||||
srand48(rtc()*65536);
|
// srand48(rtc()*65536);
|
||||||
|
srand48(1234);
|
||||||
|
|
||||||
#pragma omp parallel for
|
#pragma omp parallel for
|
||||||
for (int i = 0; i < n; i++)
|
for (int i = 0; i < n; i++)
|
||||||
{
|
{
|
||||||
keys[i] = drand48() * (1<<30);
|
keys[i] = 4*n-3*i; //drand48() * (1<<30);
|
||||||
tmpv[i] = 0;
|
tmpv[i] = keys[i];
|
||||||
}
|
}
|
||||||
|
|
||||||
std::random_shuffle(keys, keys + n);
|
std::random_shuffle(keys, keys + n);
|
||||||
|
|
||||||
#pragma omp parallel for
|
#pragma omp parallel for
|
||||||
@@ -67,6 +69,7 @@ int main (int argc, char *argv[])
|
|||||||
printf("[sort ispc + tasks]:\t[%.3f] msec [%.3f Mpair/s]\n", tISPC2, 1.0e-3*n*m/tISPC2);
|
printf("[sort ispc + tasks]:\t[%.3f] msec [%.3f Mpair/s]\n", tISPC2, 1.0e-3*n*m/tISPC2);
|
||||||
|
|
||||||
std::sort(keys_orig, keys_orig + n);
|
std::sort(keys_orig, keys_orig + n);
|
||||||
|
std::sort(keys, keys+ n);
|
||||||
for (int i = 0; i < n; i++)
|
for (int i = 0; i < n; i++)
|
||||||
assert(keys[i] == keys_orig[i]);
|
assert(keys[i] == keys_orig[i]);
|
||||||
|
|
||||||
@@ -96,7 +99,7 @@ int main (int argc, char *argv[])
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
delete keys;
|
delete keys;
|
||||||
delete keys;
|
delete keys_orig;
|
||||||
delete tmpv;
|
delete tmpv;
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -2,7 +2,7 @@
|
|||||||
#define NUMDIGITS (1<<NUMBITS)
|
#define NUMDIGITS (1<<NUMBITS)
|
||||||
|
|
||||||
task
|
task
|
||||||
void computeHistogram(
|
void countPass(
|
||||||
const uniform int keysAll[],
|
const uniform int keysAll[],
|
||||||
const uniform int bit,
|
const uniform int bit,
|
||||||
const uniform int numElements,
|
const uniform int numElements,
|
||||||
@@ -46,7 +46,6 @@ void sortPass(
|
|||||||
|
|
||||||
const uniform int blockDim = (numElements + numBlocks - 1) / numBlocks;
|
const uniform int blockDim = (numElements + numBlocks - 1) / numBlocks;
|
||||||
|
|
||||||
const uniform int mask = (1 << NUMBITS) - 1;
|
|
||||||
|
|
||||||
uniform int * uniform localCounts = sharedCounts + blockIdx*NUMDIGITS;
|
uniform int * uniform localCounts = sharedCounts + blockIdx*NUMDIGITS;
|
||||||
|
|
||||||
@@ -55,6 +54,7 @@ void sortPass(
|
|||||||
uniform int * uniform digitOffsets = digitOffsetsAll + blockIdx*NUMDIGITS;
|
uniform int * uniform digitOffsets = digitOffsetsAll + blockIdx*NUMDIGITS;
|
||||||
const uniform int nloc = min(numElements - keyIndex, blockDim);
|
const uniform int nloc = min(numElements - keyIndex, blockDim);
|
||||||
|
|
||||||
|
const uniform int mask = (1 << NUMBITS) - 1;
|
||||||
foreach (i = 0 ... NUMDIGITS)
|
foreach (i = 0 ... NUMDIGITS)
|
||||||
localCounts[i] = 0;
|
localCounts[i] = 0;
|
||||||
|
|
||||||
@@ -63,82 +63,79 @@ void sortPass(
|
|||||||
const int key = mask & ((unsigned int)keys[i] >> bit);
|
const int key = mask & ((unsigned int)keys[i] >> bit);
|
||||||
const int rel = localCounts[key];
|
const int rel = localCounts[key];
|
||||||
const int scatter = rel + digitOffsets[key];
|
const int scatter = rel + digitOffsets[key];
|
||||||
sorted [scatter] = keys[i];
|
sorted [scatter] = keys[i];
|
||||||
localCounts[key] = 1 + rel;
|
localCounts[key] = 1 + rel;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
task
|
task
|
||||||
void partialScanLocal(
|
void partialScanLocal(
|
||||||
uniform int excScanPtr[],
|
uniform int excScanAll[],
|
||||||
uniform int countsPtr[],
|
uniform int countsAll[],
|
||||||
uniform int partialSum[])
|
uniform int partialSumAll[])
|
||||||
{
|
{
|
||||||
const uniform int numBlocks = taskCount;
|
const uniform int numBlocks = taskCount;
|
||||||
const uniform int blockIdx = taskIndex;
|
const uniform int blockIdx = taskIndex;
|
||||||
|
|
||||||
const uniform int blockDim = (numBlocks+taskCount-1)/taskCount;
|
const uniform int blockDim = (numBlocks+numBlocks-1)/numBlocks;
|
||||||
const uniform int bbeg = blockIdx * blockDim;
|
const uniform int bbeg = blockIdx * blockDim;
|
||||||
const uniform int bend = min(bbeg + blockDim, numBlocks);
|
const uniform int bend = min(bbeg + blockDim, numBlocks);
|
||||||
|
|
||||||
|
uniform int (* uniform countsBlock)[NUMDIGITS] = (uniform int (*)[NUMDIGITS])countsAll;
|
||||||
|
uniform int (* uniform excScanBlock)[NUMDIGITS] = (uniform int (*)[NUMDIGITS])excScanAll;
|
||||||
|
uniform int (* uniform partialSum)[NUMDIGITS] = (uniform int (*)[NUMDIGITS])partialSumAll;
|
||||||
|
|
||||||
foreach (digit = 0 ... NUMDIGITS)
|
foreach (digit = 0 ... NUMDIGITS)
|
||||||
{
|
{
|
||||||
uniform int * uniform excScanBlock = excScanPtr + bbeg*NUMDIGITS;
|
int prev = bbeg == 0 ? excScanBlock[0][digit] : 0;
|
||||||
uniform int * uniform countsBlock = countsPtr + bbeg*NUMDIGITS;
|
|
||||||
|
|
||||||
int prev = bbeg == 0 ? excScanBlock[digit] : 0;
|
|
||||||
for (uniform int block = bbeg; block < bend; block++)
|
for (uniform int block = bbeg; block < bend; block++)
|
||||||
{
|
{
|
||||||
const int y = countsBlock[digit];
|
const int y = countsBlock[block][digit];
|
||||||
excScanBlock[digit] = prev;
|
excScanBlock[block][digit] = prev;
|
||||||
prev += y;
|
prev += y;
|
||||||
|
|
||||||
excScanBlock += NUMDIGITS;
|
|
||||||
countsBlock += NUMDIGITS;
|
|
||||||
}
|
}
|
||||||
|
partialSum[blockIdx][digit] = excScanBlock[bend-1][digit] + countsBlock[bend-1][digit];
|
||||||
excScanBlock -= NUMDIGITS;
|
|
||||||
countsBlock -= NUMDIGITS;
|
|
||||||
|
|
||||||
partialSum[blockIdx*NUMDIGITS + digit] = excScanBlock[digit] + countsBlock[digit];
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
task
|
task
|
||||||
void partialScanGlobal(
|
void partialScanGlobal(
|
||||||
const uniform int numBlocks,
|
const uniform int numBlocks,
|
||||||
uniform int partialSum[],
|
uniform int partialSumAll[],
|
||||||
uniform int prefixSum[])
|
uniform int prefixSumAll[])
|
||||||
{
|
{
|
||||||
const int digit = taskIndex;
|
uniform int (* uniform partialSum)[NUMDIGITS] = (uniform int (*)[NUMDIGITS])partialSumAll;
|
||||||
|
uniform int (* uniform prefixSum)[NUMDIGITS] = (uniform int (*)[NUMDIGITS]) prefixSumAll;
|
||||||
|
const uniform int digit = taskIndex;
|
||||||
int carry = 0;
|
int carry = 0;
|
||||||
foreach (block = 0 ... numBlocks)
|
foreach (block = 0 ... numBlocks)
|
||||||
{
|
{
|
||||||
const int value = partialSum[block*NUMDIGITS + digit];
|
const int value = partialSum[block][digit];
|
||||||
const int scan = exclusive_scan_add(value);
|
const int scan = exclusive_scan_add(value);
|
||||||
prefixSum[block*NUMDIGITS + digit] = value + carry;
|
prefixSum[block][digit] = scan + carry;
|
||||||
carry = broadcast(scan+value, programCount-1);
|
carry += broadcast(scan+value, programCount-1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
task
|
task
|
||||||
void completeScanGlobal(
|
void completeScanGlobal(
|
||||||
uniform int excScanAll[],
|
uniform int excScanAll[],
|
||||||
uniform int carryValue[])
|
uniform int carryValueAll[])
|
||||||
{
|
{
|
||||||
const uniform int numBlocks = taskCount;
|
const uniform int numBlocks = taskCount;
|
||||||
const uniform int blockIdx = taskIndex;
|
const uniform int blockIdx = taskIndex;
|
||||||
const uniform int blockDim = (numBlocks+taskCount-1)/taskCount;
|
const uniform int blockDim = (numBlocks+numBlocks-1)/numBlocks;
|
||||||
const uniform int bbeg = blockIdx * blockDim;
|
const uniform int bbeg = blockIdx * blockDim;
|
||||||
const uniform int bend = min(bbeg + blockDim, numBlocks);
|
const uniform int bend = min(bbeg + blockDim, numBlocks);
|
||||||
|
|
||||||
|
uniform int (* uniform excScanBlock)[NUMDIGITS] = (uniform int (*)[NUMDIGITS])excScanAll;
|
||||||
|
uniform int (* uniform carryValue)[NUMDIGITS] = (uniform int (*)[NUMDIGITS])carryValueAll;
|
||||||
|
|
||||||
carryValue += blockIdx*NUMDIGITS;
|
|
||||||
foreach (digit = 0 ... NUMDIGITS)
|
foreach (digit = 0 ... NUMDIGITS)
|
||||||
{
|
{
|
||||||
const int carry = carryValue[digit];
|
const int carry = carryValue[blockIdx][digit];
|
||||||
uniform int * uniform excScanBlock = excScanAll + bbeg*NUMDIGITS;
|
for (uniform int block = bbeg; block < bend; block++)
|
||||||
for (uniform int block = bbeg; block < bend; block++, excScanBlock += NUMDIGITS)
|
excScanBlock[block][digit] += carry;
|
||||||
excScanBlock[digit] += carry;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -165,7 +162,7 @@ export void radixSort(
|
|||||||
uniform int keys[],
|
uniform int keys[],
|
||||||
uniform int sorted[])
|
uniform int sorted[])
|
||||||
{
|
{
|
||||||
const uniform int numBlocks = num_cores()*2;
|
const uniform int numBlocks = num_cores()*4;
|
||||||
|
|
||||||
#ifdef __NVPTX__
|
#ifdef __NVPTX__
|
||||||
assert((numBlocks & 3) == 0); /* task granularity on Kepler is 4 */
|
assert((numBlocks & 3) == 0); /* task granularity on Kepler is 4 */
|
||||||
@@ -204,7 +201,7 @@ export void radixSort(
|
|||||||
countsGlobal[digit] = 0;
|
countsGlobal[digit] = 0;
|
||||||
|
|
||||||
/* compute histogram for each digit */
|
/* compute histogram for each digit */
|
||||||
launch [numBlocks] computeHistogram(keys, bit, numElements, countsBlock, countsGlobal);
|
launch [numBlocks] countPass(keys, bit, numElements, countsBlock, countsGlobal);
|
||||||
sync;
|
sync;
|
||||||
|
|
||||||
/* exclusive scan on global histogram */
|
/* exclusive scan on global histogram */
|
||||||
@@ -214,10 +211,9 @@ export void radixSort(
|
|||||||
{
|
{
|
||||||
const int value = countsGlobal[digit];
|
const int value = countsGlobal[digit];
|
||||||
const int scan = exclusive_scan_add(value);
|
const int scan = exclusive_scan_add(value);
|
||||||
excScan[digit] = value + carry;
|
excScan[digit] = scan + carry;
|
||||||
carry += broadcast(scan+value, programCount-1);
|
carry += broadcast(scan+value, programCount-1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/* computing offsets for each digit */
|
/* computing offsets for each digit */
|
||||||
radixExclusiveScan(numBlocks, excScan, countsBlock, partialSum, prefixSum);
|
radixExclusiveScan(numBlocks, excScan, countsBlock, partialSum, prefixSum);
|
||||||
|
|||||||
Reference in New Issue
Block a user