+sometuning
This commit is contained in:
@@ -45,10 +45,13 @@ int main (int argc, char *argv[])
|
|||||||
|
|
||||||
srand48(rtc()*65536);
|
srand48(rtc()*65536);
|
||||||
|
|
||||||
|
int sortBits = 32;
|
||||||
|
assert(sortBits <= 32);
|
||||||
|
|
||||||
#pragma omp parallel for
|
#pragma omp parallel for
|
||||||
for (int i = 0; i < n; i++)
|
for (int i = 0; i < n; i++)
|
||||||
{
|
{
|
||||||
keys[i].key = ((int)(drand48() * (1<<30))) & 0x00FFFFFF;
|
keys[i].key = ((int)(drand48() * (1<<30))) & ((1ULL << sortBits) - 1);
|
||||||
keys[i].val = i;
|
keys[i].val = i;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -70,7 +73,7 @@ int main (int argc, char *argv[])
|
|||||||
{
|
{
|
||||||
ispcMemcpy(keys, keys_orig, n*sizeof(Key));
|
ispcMemcpy(keys, keys_orig, n*sizeof(Key));
|
||||||
reset_and_start_timer();
|
reset_and_start_timer();
|
||||||
ispc::radixSort(n, (int64_t*)keys, 32);
|
ispc::radixSort(n, (int64_t*)keys, sortBits);
|
||||||
tISPC2 = std::min(tISPC2, get_elapsed_msec());
|
tISPC2 = std::min(tISPC2, get_elapsed_msec());
|
||||||
if (argc != 3)
|
if (argc != 3)
|
||||||
progressbar (i, m);
|
progressbar (i, m);
|
||||||
|
|||||||
@@ -26,6 +26,7 @@ void countPass(
|
|||||||
foreach (digit = 0 ... NUMDIGITS)
|
foreach (digit = 0 ... NUMDIGITS)
|
||||||
counts[digit] = 0;
|
counts[digit] = 0;
|
||||||
|
|
||||||
|
|
||||||
#if 1
|
#if 1
|
||||||
foreach (i = 0 ... nloc)
|
foreach (i = 0 ... nloc)
|
||||||
{
|
{
|
||||||
@@ -56,8 +57,7 @@ void sortPass(
|
|||||||
uniform Key sorted[],
|
uniform Key sorted[],
|
||||||
uniform int bit,
|
uniform int bit,
|
||||||
uniform int numElements,
|
uniform int numElements,
|
||||||
uniform int digitOffsetsAll[],
|
uniform int digitOffsetsAll[])
|
||||||
uniform int sharedCounts[])
|
|
||||||
{
|
{
|
||||||
const uniform int blockIdx = taskIndex;
|
const uniform int blockIdx = taskIndex;
|
||||||
const uniform int numBlocks = taskCount;
|
const uniform int numBlocks = taskCount;
|
||||||
@@ -65,70 +65,46 @@ void sortPass(
|
|||||||
const uniform int blockDim = (numElements + numBlocks - 1) / numBlocks;
|
const uniform int blockDim = (numElements + numBlocks - 1) / numBlocks;
|
||||||
|
|
||||||
|
|
||||||
#if 0
|
|
||||||
uniform int * uniform localCounts = sharedCounts + blockIdx*NUMDIGITS;
|
|
||||||
#else
|
|
||||||
uniform int localCounts[NUMDIGITS];
|
|
||||||
#endif
|
|
||||||
|
|
||||||
const uniform int keyIndex = blockIdx * blockDim;
|
const uniform int keyIndex = blockIdx * blockDim;
|
||||||
uniform Key * uniform keys = keysAll + keyIndex;
|
uniform Key * uniform keys = keysAll + keyIndex;
|
||||||
uniform int * uniform digitOffsets = digitOffsetsAll + blockIdx*NUMDIGITS;
|
|
||||||
|
|
||||||
const uniform int nloc = min(numElements - keyIndex, blockDim);
|
const uniform int nloc = min(numElements - keyIndex, blockDim);
|
||||||
|
|
||||||
const uniform int mask = (1 << NUMBITS) - 1;
|
const uniform int mask = (1 << NUMBITS) - 1;
|
||||||
foreach (i = 0 ... NUMDIGITS)
|
|
||||||
localCounts[i] = 0;
|
|
||||||
|
|
||||||
const int unitScan = exclusive_scan_add(1);
|
const int unitScan = exclusive_scan_add(1);
|
||||||
const int unitSum = exclusive_scan_add((int)(programIndex < programCount/2));
|
const int unitScanHalf = exclusive_scan_add((int)(programIndex >= programCount/2));
|
||||||
|
|
||||||
|
uniform int digitOffsets[NUMDIGITS];
|
||||||
|
foreach (digit = 0 ... NUMDIGITS)
|
||||||
|
digitOffsets[digit] = digitOffsetsAll[blockIdx*NUMDIGITS + digit];
|
||||||
|
|
||||||
|
|
||||||
foreach (i = 0 ... nloc)
|
foreach (i = 0 ... nloc)
|
||||||
{
|
{
|
||||||
const int key = mask & ((unsigned int)keys[i] >> bit);
|
const int key = mask & ((unsigned int)keys[i] >> bit);
|
||||||
int rel;
|
int scatter;
|
||||||
#if 0
|
#if 0
|
||||||
|
/* serialize execution to test correctness of algorithm */
|
||||||
foreach_active(iv)
|
foreach_active(iv)
|
||||||
{
|
scattter = digitOffsets[key]++;
|
||||||
rel = localCounts[key];
|
|
||||||
localCounts[key]++;
|
|
||||||
}
|
|
||||||
#else
|
#else
|
||||||
if (reduce_equal(key) == true)
|
if (reduce_equal(key) == true)
|
||||||
{
|
digitOffsets[key] = (scatter = digitOffsets[key] + unitScan) + 1;
|
||||||
rel = localCounts[key] + unitScan;
|
|
||||||
localCounts[key] = rel+1;
|
|
||||||
}
|
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
#ifdef __NVPTX__
|
#ifdef __NVPTX__
|
||||||
if (programIndex < 16)
|
/* there is a bug, not clear where exactly (perhaps due to optimizations),
|
||||||
{
|
* This complex code restored correctness */
|
||||||
if (reduce_equal(key) == true)
|
/* :S */
|
||||||
{
|
if (programIndex < 16) {if (key < NUMDIGITS) scatter = atomic_add_global(&digitOffsets[key],1);}
|
||||||
rel = localCounts[key] + unitScan;
|
else {if (key < NUMDIGITS) scatter = atomic_add_global(&digitOffsets[key],1);}
|
||||||
localCounts[key] = rel+1;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
rel = atomic_add_global(&localCounts[key],1);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
if (reduce_equal(key) == true)
|
|
||||||
{
|
|
||||||
rel = localCounts[key] + unitScan - unitSum;
|
|
||||||
localCounts[key] = rel+1;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
rel = atomic_add_global(&localCounts[key],1);
|
|
||||||
}
|
|
||||||
#else
|
#else
|
||||||
rel = atomic_add_local(&localCounts[key],1);
|
scatter = atomic_add_local(&digitOffsets[key],1);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
const int scatter = rel + digitOffsets[key];
|
|
||||||
sorted [scatter] = keys[i];
|
sorted [scatter] = keys[i];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -341,8 +317,7 @@ export void radixSort(
|
|||||||
keys,
|
keys,
|
||||||
bit,
|
bit,
|
||||||
numElements,
|
numElements,
|
||||||
excScan,
|
excScan);
|
||||||
sharedCounts);
|
|
||||||
sync;
|
sync;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user