+some tuning

This commit is contained in:
Evghenii
2014-01-29 10:40:48 +01:00
parent 1f7b994232
commit fbcadf3d4d
2 changed files with 27 additions and 49 deletions

View File

@@ -45,10 +45,13 @@ int main (int argc, char *argv[])
srand48(rtc()*65536); srand48(rtc()*65536);
int sortBits = 32;
assert(sortBits <= 32);
#pragma omp parallel for #pragma omp parallel for
for (int i = 0; i < n; i++) for (int i = 0; i < n; i++)
{ {
keys[i].key = ((int)(drand48() * (1<<30))) & 0x00FFFFFF; keys[i].key = ((int)(drand48() * (1<<30))) & ((1ULL << sortBits) - 1);
keys[i].val = i; keys[i].val = i;
} }
@@ -70,7 +73,7 @@ int main (int argc, char *argv[])
{ {
ispcMemcpy(keys, keys_orig, n*sizeof(Key)); ispcMemcpy(keys, keys_orig, n*sizeof(Key));
reset_and_start_timer(); reset_and_start_timer();
ispc::radixSort(n, (int64_t*)keys, 32); ispc::radixSort(n, (int64_t*)keys, sortBits);
tISPC2 = std::min(tISPC2, get_elapsed_msec()); tISPC2 = std::min(tISPC2, get_elapsed_msec());
if (argc != 3) if (argc != 3)
progressbar (i, m); progressbar (i, m);

View File

@@ -26,6 +26,7 @@ void countPass(
foreach (digit = 0 ... NUMDIGITS) foreach (digit = 0 ... NUMDIGITS)
counts[digit] = 0; counts[digit] = 0;
#if 1 #if 1
foreach (i = 0 ... nloc) foreach (i = 0 ... nloc)
{ {
@@ -56,8 +57,7 @@ void sortPass(
uniform Key sorted[], uniform Key sorted[],
uniform int bit, uniform int bit,
uniform int numElements, uniform int numElements,
uniform int digitOffsetsAll[], uniform int digitOffsetsAll[])
uniform int sharedCounts[])
{ {
const uniform int blockIdx = taskIndex; const uniform int blockIdx = taskIndex;
const uniform int numBlocks = taskCount; const uniform int numBlocks = taskCount;
@@ -65,70 +65,46 @@ void sortPass(
const uniform int blockDim = (numElements + numBlocks - 1) / numBlocks; const uniform int blockDim = (numElements + numBlocks - 1) / numBlocks;
#if 0
uniform int * uniform localCounts = sharedCounts + blockIdx*NUMDIGITS;
#else
uniform int localCounts[NUMDIGITS];
#endif
const uniform int keyIndex = blockIdx * blockDim; const uniform int keyIndex = blockIdx * blockDim;
uniform Key * uniform keys = keysAll + keyIndex; uniform Key * uniform keys = keysAll + keyIndex;
uniform int * uniform digitOffsets = digitOffsetsAll + blockIdx*NUMDIGITS;
const uniform int nloc = min(numElements - keyIndex, blockDim); const uniform int nloc = min(numElements - keyIndex, blockDim);
const uniform int mask = (1 << NUMBITS) - 1; const uniform int mask = (1 << NUMBITS) - 1;
foreach (i = 0 ... NUMDIGITS)
localCounts[i] = 0;
const int unitScan = exclusive_scan_add(1); const int unitScan = exclusive_scan_add(1);
const int unitSum = exclusive_scan_add((int)(programIndex < programCount/2)); const int unitScanHalf = exclusive_scan_add((int)(programIndex >= programCount/2));
uniform int digitOffsets[NUMDIGITS];
foreach (digit = 0 ... NUMDIGITS)
digitOffsets[digit] = digitOffsetsAll[blockIdx*NUMDIGITS + digit];
foreach (i = 0 ... nloc) foreach (i = 0 ... nloc)
{ {
const int key = mask & ((unsigned int)keys[i] >> bit); const int key = mask & ((unsigned int)keys[i] >> bit);
int rel; int scatter;
#if 0 #if 0
/* serialize execution to test correctness of algorithm */
foreach_active(iv) foreach_active(iv)
{ scattter = digitOffsets[key]++;
rel = localCounts[key];
localCounts[key]++;
}
#else #else
if (reduce_equal(key) == true) if (reduce_equal(key) == true)
{ digitOffsets[key] = (scatter = digitOffsets[key] + unitScan) + 1;
rel = localCounts[key] + unitScan;
localCounts[key] = rel+1;
}
else else
{ {
#ifdef __NVPTX__ #ifdef __NVPTX__
if (programIndex < 16) /* there is a bug, not clear where exactly (perhaps due to optimizations),
{ * This complex code restored correctness */
if (reduce_equal(key) == true) /* :S */
{ if (programIndex < 16) {if (key < NUMDIGITS) scatter = atomic_add_global(&digitOffsets[key],1);}
rel = localCounts[key] + unitScan; else {if (key < NUMDIGITS) scatter = atomic_add_global(&digitOffsets[key],1);}
localCounts[key] = rel+1;
}
else
rel = atomic_add_global(&localCounts[key],1);
}
else
{
if (reduce_equal(key) == true)
{
rel = localCounts[key] + unitScan - unitSum;
localCounts[key] = rel+1;
}
else
rel = atomic_add_global(&localCounts[key],1);
}
#else #else
rel = atomic_add_local(&localCounts[key],1); scatter = atomic_add_local(&digitOffsets[key],1);
#endif #endif
} }
#endif #endif
const int scatter = rel + digitOffsets[key];
sorted [scatter] = keys[i]; sorted [scatter] = keys[i];
} }
} }
@@ -341,8 +317,7 @@ export void radixSort(
keys, keys,
bit, bit,
numElements, numElements,
excScan, excScan);
sharedCounts);
sync; sync;
} }