working. next step tuning
This commit is contained in:
@@ -35,18 +35,13 @@ int main (int argc, char *argv[])
|
|||||||
int i, j, n = argc == 1 ? 1000000 : atoi(argv[1]), m = n < 100 ? 1 : 50, l = n < 100 ? n : RAND_MAX;
|
int i, j, n = argc == 1 ? 1000000 : atoi(argv[1]), m = n < 100 ? 1 : 50, l = n < 100 ? n : RAND_MAX;
|
||||||
double tISPC1 = 0.0, tISPC2 = 0.0, tSerial = 0.0;
|
double tISPC1 = 0.0, tISPC2 = 0.0, tSerial = 0.0;
|
||||||
unsigned int *keys = new unsigned int [n];
|
unsigned int *keys = new unsigned int [n];
|
||||||
unsigned int *tmpv = new unsigned int [n];
|
|
||||||
unsigned int *keys_orig = new unsigned int [n];
|
unsigned int *keys_orig = new unsigned int [n];
|
||||||
|
|
||||||
// srand48(rtc()*65536);
|
srand48(rtc()*65536);
|
||||||
srand48(1234);
|
|
||||||
|
|
||||||
#pragma omp parallel for
|
#pragma omp parallel for
|
||||||
for (int i = 0; i < n; i++)
|
for (int i = 0; i < n; i++)
|
||||||
{
|
keys[i] = 10*i; //drand48() * (1<<30);
|
||||||
keys[i] = 4*n-3*i; //drand48() * (1<<30);
|
|
||||||
tmpv[i] = keys[i];
|
|
||||||
}
|
|
||||||
|
|
||||||
std::random_shuffle(keys, keys + n);
|
std::random_shuffle(keys, keys + n);
|
||||||
|
|
||||||
@@ -56,16 +51,20 @@ int main (int argc, char *argv[])
|
|||||||
|
|
||||||
ispcSetMallocHeapLimit(1024*1024*1024);
|
ispcSetMallocHeapLimit(1024*1024*1024);
|
||||||
|
|
||||||
|
ispc::radixSort_alloc(n);
|
||||||
|
|
||||||
for (i = 0; i < m; i ++)
|
for (i = 0; i < m; i ++)
|
||||||
{
|
{
|
||||||
ispcMemcpy(keys, keys_orig, n*sizeof(unsigned int));
|
ispcMemcpy(keys, keys_orig, n*sizeof(unsigned int));
|
||||||
reset_and_start_timer();
|
reset_and_start_timer();
|
||||||
ispc::radixSort(n, (int*)keys, (int*)tmpv);
|
ispc::radixSort(n, (int*)keys);
|
||||||
tISPC2 += get_elapsed_msec();
|
tISPC2 += get_elapsed_msec();
|
||||||
if (argc != 3)
|
if (argc != 3)
|
||||||
progressbar (i, m);
|
progressbar (i, m);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ispc::radixSort_free();
|
||||||
|
|
||||||
printf("[sort ispc + tasks]:\t[%.3f] msec [%.3f Mpair/s]\n", tISPC2, 1.0e-3*n*m/tISPC2);
|
printf("[sort ispc + tasks]:\t[%.3f] msec [%.3f Mpair/s]\n", tISPC2, 1.0e-3*n*m/tISPC2);
|
||||||
|
|
||||||
std::sort(keys_orig, keys_orig + n);
|
std::sort(keys_orig, keys_orig + n);
|
||||||
@@ -100,6 +99,5 @@ int main (int argc, char *argv[])
|
|||||||
|
|
||||||
delete keys;
|
delete keys;
|
||||||
delete keys_orig;
|
delete keys_orig;
|
||||||
delete tmpv;
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -158,7 +158,7 @@ inline void radixExclusiveScan(
|
|||||||
uniform int partialSum[],
|
uniform int partialSum[],
|
||||||
uniform int prefixSum[])
|
uniform int prefixSum[])
|
||||||
{
|
{
|
||||||
const uniform int scale = 4;
|
const uniform int scale = 2;
|
||||||
launch [numBlocks/scale] partialScanLocal(numBlocks, excScanPtr, countsPtr, partialSum);
|
launch [numBlocks/scale] partialScanLocal(numBlocks, excScanPtr, countsPtr, partialSum);
|
||||||
sync;
|
sync;
|
||||||
|
|
||||||
@@ -169,25 +169,36 @@ inline void radixExclusiveScan(
|
|||||||
sync;
|
sync;
|
||||||
}
|
}
|
||||||
|
|
||||||
export void radixSort(
|
static uniform int * uniform memoryPool = NULL;
|
||||||
const uniform int numElements,
|
static uniform int numBlocks;
|
||||||
uniform int keys[],
|
static uniform int nSharedCounts;
|
||||||
uniform int sorted[])
|
static uniform int nCountsGlobal;
|
||||||
|
static uniform int nExcScan;
|
||||||
|
static uniform int nCountsBlock;
|
||||||
|
static uniform int nPartialSum;
|
||||||
|
static uniform int nPrefixSum;
|
||||||
|
|
||||||
|
static uniform int * uniform sharedCounts;
|
||||||
|
static uniform int * uniform countsGlobal;
|
||||||
|
static uniform int * uniform excScan;
|
||||||
|
static uniform int * uniform counts;
|
||||||
|
static uniform int * uniform partialSum;
|
||||||
|
static uniform int * uniform prefixSum;
|
||||||
|
|
||||||
|
static uniform int numElementsBuf = 0;
|
||||||
|
static uniform int * uniform bufKeys;
|
||||||
|
|
||||||
|
export void radixSort_alloc(const uniform int n)
|
||||||
{
|
{
|
||||||
const uniform int numBlocks = num_cores()*4;
|
assert(memoryPool == NULL);
|
||||||
|
numBlocks = num_cores()*4;
|
||||||
|
nSharedCounts = NUMDIGITS*numBlocks;
|
||||||
|
nCountsGlobal = NUMDIGITS;
|
||||||
|
nExcScan = NUMDIGITS*numBlocks;
|
||||||
|
nCountsBlock = NUMDIGITS*numBlocks;
|
||||||
|
nPartialSum = NUMDIGITS*numBlocks;
|
||||||
|
nPrefixSum = NUMDIGITS*numBlocks;
|
||||||
|
|
||||||
#ifdef __NVPTX__
|
|
||||||
assert((numBlocks & 3) == 0); /* task granularity on Kepler is 4 */
|
|
||||||
#endif
|
|
||||||
|
|
||||||
const uniform int blockDim = (numElements + numBlocks - 1) / numBlocks;
|
|
||||||
|
|
||||||
const uniform int nSharedCounts = NUMDIGITS*numBlocks;
|
|
||||||
const uniform int nCountsGlobal = NUMDIGITS;
|
|
||||||
const uniform int nExcScan = NUMDIGITS*numBlocks;
|
|
||||||
const uniform int nCountsBlock = NUMDIGITS*numBlocks;
|
|
||||||
const uniform int nPartialSum = NUMDIGITS*numBlocks;
|
|
||||||
const uniform int nPrefixSum = NUMDIGITS*numBlocks;
|
|
||||||
|
|
||||||
const uniform int nalloc =
|
const uniform int nalloc =
|
||||||
nSharedCounts +
|
nSharedCounts +
|
||||||
@@ -197,14 +208,52 @@ export void radixSort(
|
|||||||
nPartialSum +
|
nPartialSum +
|
||||||
nPrefixSum;
|
nPrefixSum;
|
||||||
|
|
||||||
uniform int * uniform mem_pool = uniform new uniform int[nalloc];
|
memoryPool = uniform new uniform int[nalloc];
|
||||||
|
|
||||||
uniform int * uniform sharedCounts = mem_pool;
|
sharedCounts = memoryPool;
|
||||||
uniform int * uniform countsGlobal = sharedCounts + nSharedCounts;
|
countsGlobal = sharedCounts + nSharedCounts;
|
||||||
uniform int * uniform excScan = countsGlobal + nCountsGlobal;
|
excScan = countsGlobal + nCountsGlobal;
|
||||||
uniform int * uniform counts = excScan + nExcScan;
|
counts = excScan + nExcScan;
|
||||||
uniform int * uniform partialSum = counts + nCountsBlock;
|
partialSum = counts + nCountsBlock;
|
||||||
uniform int * uniform prefixSum = partialSum + nPartialSum;
|
prefixSum = partialSum + nPartialSum;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Releases the internal key buffer `bufKeys`, if one is currently recorded.
 * `numElementsBuf` holds the element count the buffer was allocated for;
 * zero means there is nothing to free. After freeing, the count is reset
 * to zero so that radixSort() allocates a fresh buffer on its next call. */
static
void radixSort_freeBufKeys()
{
  /* No buffer recorded: nothing to do. */
  if (!(numElementsBuf > 0))
    return;

  delete bufKeys;
  numElementsBuf = 0;
}
|
||||||
|
|
||||||
|
/* Releases the memory held by the radix sort module: the scratch pool
 * created by radixSort_alloc() and the key buffer that radixSort()
 * allocates on demand.
 *
 * Asserts that a pool is currently allocated, i.e. that radixSort_alloc()
 * was called and not yet matched by a radixSort_free(). On return
 * `memoryPool` is NULL again, so radixSort_alloc() may be called anew. */
export void radixSort_free()
{
  assert(memoryPool != NULL);
  delete memoryPool;
  memoryPool = NULL;

  /* This must be an actual call. The bare name `radixSort_freeBufKeys;`
   * is an expression statement with no effect, which left bufKeys
   * allocated and numElementsBuf stale. */
  radixSort_freeBufKeys();
}
|
||||||
|
|
||||||
|
export void radixSort(
|
||||||
|
const uniform int numElements,
|
||||||
|
uniform int keys[])
|
||||||
|
{
|
||||||
|
#ifdef __NVPTX__
|
||||||
|
assert((numBlocks & 3) == 0); /* task granularity on Kepler is 4 */
|
||||||
|
#endif
|
||||||
|
|
||||||
|
if (numElementsBuf < numElements)
|
||||||
|
radixSort_freeBufKeys();
|
||||||
|
if (numElementsBuf == 0)
|
||||||
|
{
|
||||||
|
numElementsBuf = numElements;
|
||||||
|
bufKeys = uniform new uniform int[numElementsBuf];
|
||||||
|
}
|
||||||
|
|
||||||
|
const uniform int blockDim = (numElements + numBlocks - 1) / numBlocks;
|
||||||
|
|
||||||
for (uniform int bit = 0; bit < 32; bit += NUMBITS)
|
for (uniform int bit = 0; bit < 32; bit += NUMBITS)
|
||||||
{
|
{
|
||||||
@@ -234,7 +283,7 @@ export void radixSort(
|
|||||||
launch [numBlocks]
|
launch [numBlocks]
|
||||||
sortPass(
|
sortPass(
|
||||||
keys,
|
keys,
|
||||||
sorted,
|
bufKeys,
|
||||||
bit,
|
bit,
|
||||||
numElements,
|
numElements,
|
||||||
excScan,
|
excScan,
|
||||||
@@ -242,9 +291,8 @@ export void radixSort(
|
|||||||
sync;
|
sync;
|
||||||
|
|
||||||
uniform int * uniform tmp = keys;
|
uniform int * uniform tmp = keys;
|
||||||
keys = sorted;
|
keys = bufKeys;
|
||||||
sorted = tmp;
|
bufKeys = tmp;
|
||||||
}
|
}
|
||||||
|
|
||||||
delete mem_pool;
|
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user