From fbcadf3d4d48102f4a805b2f1c855d458de9c373 Mon Sep 17 00:00:00 2001 From: Evghenii Date: Wed, 29 Jan 2014 10:40:48 +0100 Subject: [PATCH] +sometuning --- examples_ptx/radixSort/radixSort.cpp | 7 ++- examples_ptx/radixSort/radixSort.ispc | 69 +++++++++------------------ 2 files changed, 27 insertions(+), 49 deletions(-) diff --git a/examples_ptx/radixSort/radixSort.cpp b/examples_ptx/radixSort/radixSort.cpp index f474ff2d..1570a621 100644 --- a/examples_ptx/radixSort/radixSort.cpp +++ b/examples_ptx/radixSort/radixSort.cpp @@ -45,10 +45,13 @@ int main (int argc, char *argv[]) srand48(rtc()*65536); + int sortBits = 32; + assert(sortBits <= 32); + #pragma omp parallel for for (int i = 0; i < n; i++) { - keys[i].key = ((int)(drand48() * (1<<30))) & 0x00FFFFFF; + keys[i].key = ((int)(drand48() * (1<<30))) & ((1ULL << sortBits) - 1); keys[i].val = i; } @@ -70,7 +73,7 @@ int main (int argc, char *argv[]) { ispcMemcpy(keys, keys_orig, n*sizeof(Key)); reset_and_start_timer(); - ispc::radixSort(n, (int64_t*)keys, 32); + ispc::radixSort(n, (int64_t*)keys, sortBits); tISPC2 = std::min(tISPC2, get_elapsed_msec()); if (argc != 3) progressbar (i, m); diff --git a/examples_ptx/radixSort/radixSort.ispc b/examples_ptx/radixSort/radixSort.ispc index ff17f741..8678be65 100644 --- a/examples_ptx/radixSort/radixSort.ispc +++ b/examples_ptx/radixSort/radixSort.ispc @@ -26,6 +26,7 @@ void countPass( foreach (digit = 0 ... NUMDIGITS) counts[digit] = 0; + #if 1 foreach (i = 0 ... 
nloc) { @@ -56,8 +57,7 @@ void sortPass( uniform Key sorted[], uniform int bit, uniform int numElements, - uniform int digitOffsetsAll[], - uniform int sharedCounts[]) + uniform int digitOffsetsAll[]) { const uniform int blockIdx = taskIndex; const uniform int numBlocks = taskCount; @@ -65,70 +65,46 @@ void sortPass( const uniform int blockDim = (numElements + numBlocks - 1) / numBlocks; -#if 0 - uniform int * uniform localCounts = sharedCounts + blockIdx*NUMDIGITS; -#else - uniform int localCounts[NUMDIGITS]; -#endif - const uniform int keyIndex = blockIdx * blockDim; uniform Key * uniform keys = keysAll + keyIndex; - uniform int * uniform digitOffsets = digitOffsetsAll + blockIdx*NUMDIGITS; + + const uniform int nloc = min(numElements - keyIndex, blockDim); const uniform int mask = (1 << NUMBITS) - 1; - foreach (i = 0 ... NUMDIGITS) - localCounts[i] = 0; - const int unitScan = exclusive_scan_add(1); - const int unitSum = exclusive_scan_add((int)(programIndex < programCount/2)); + const int unitScan = exclusive_scan_add(1); + const int unitScanHalf = exclusive_scan_add((int)(programIndex >= programCount/2)); + + uniform int digitOffsets[NUMDIGITS]; + foreach (digit = 0 ... NUMDIGITS) + digitOffsets[digit] = digitOffsetsAll[blockIdx*NUMDIGITS + digit]; foreach (i = 0 ... 
nloc) { const int key = mask & ((unsigned int)keys[i] >> bit); - int rel; -#if 0 + int scatter; +#if 0 + /* serialize execution to test correctness of algorithm */ foreach_active(iv) - { - rel = localCounts[key]; - localCounts[key]++; - } + scattter = digitOffsets[key]++; #else if (reduce_equal(key) == true) - { - rel = localCounts[key] + unitScan; - localCounts[key] = rel+1; - } + digitOffsets[key] = (scatter = digitOffsets[key] + unitScan) + 1; else { #ifdef __NVPTX__ - if (programIndex < 16) - { - if (reduce_equal(key) == true) - { - rel = localCounts[key] + unitScan; - localCounts[key] = rel+1; - } - else - rel = atomic_add_global(&localCounts[key],1); - } - else - { - if (reduce_equal(key) == true) - { - rel = localCounts[key] + unitScan - unitSum; - localCounts[key] = rel+1; - } - else - rel = atomic_add_global(&localCounts[key],1); - } + /* there is a bug, not clear where exactly (perhaps due to optimizations), + * This complex code restored correctness */ + /* :S */ + if (programIndex < 16) {if (key < NUMDIGITS) scatter = atomic_add_global(&digitOffsets[key],1);} + else {if (key < NUMDIGITS) scatter = atomic_add_global(&digitOffsets[key],1);} #else - rel = atomic_add_local(&localCounts[key],1); + scatter = atomic_add_local(&digitOffsets[key],1); #endif } #endif - const int scatter = rel + digitOffsets[key]; sorted [scatter] = keys[i]; } } @@ -341,8 +317,7 @@ export void radixSort( keys, bit, numElements, - excScan, - sharedCounts); + excScan); sync; }