diff --git a/examples_ptx/radixSort/radixSort.cpp b/examples_ptx/radixSort/radixSort.cpp index 8b6ddbc7..f28a370e 100644 --- a/examples_ptx/radixSort/radixSort.cpp +++ b/examples_ptx/radixSort/radixSort.cpp @@ -35,18 +35,13 @@ int main (int argc, char *argv[]) int i, j, n = argc == 1 ? 1000000 : atoi(argv[1]), m = n < 100 ? 1 : 50, l = n < 100 ? n : RAND_MAX; double tISPC1 = 0.0, tISPC2 = 0.0, tSerial = 0.0; unsigned int *keys = new unsigned int [n]; - unsigned int *tmpv = new unsigned int [n]; unsigned int *keys_orig = new unsigned int [n]; -// srand48(rtc()*65536); - srand48(1234); + srand48(rtc()*65536); #pragma omp parallel for for (int i = 0; i < n; i++) - { - keys[i] = 4*n-3*i; //drand48() * (1<<30); - tmpv[i] = keys[i]; - } + keys[i] = 10*i; //drand48() * (1<<30); std::random_shuffle(keys, keys + n); @@ -56,16 +51,20 @@ int main (int argc, char *argv[]) ispcSetMallocHeapLimit(1024*1024*1024); + ispc::radixSort_alloc(n); + for (i = 0; i < m; i ++) { ispcMemcpy(keys, keys_orig, n*sizeof(unsigned int)); reset_and_start_timer(); - ispc::radixSort(n, (int*)keys, (int*)tmpv); + ispc::radixSort(n, (int*)keys); tISPC2 += get_elapsed_msec(); if (argc != 3) progressbar (i, m); } + ispc::radixSort_free(); + printf("[sort ispc + tasks]:\t[%.3f] msec [%.3f Mpair/s]\n", tISPC2, 1.0e-3*n*m/tISPC2); std::sort(keys_orig, keys_orig + n); @@ -100,6 +99,5 @@ int main (int argc, char *argv[]) delete keys; delete keys_orig; - delete tmpv; return 0; } diff --git a/examples_ptx/radixSort/radixSort.ispc b/examples_ptx/radixSort/radixSort.ispc index 026ae4e5..bb1202ad 100644 --- a/examples_ptx/radixSort/radixSort.ispc +++ b/examples_ptx/radixSort/radixSort.ispc @@ -158,7 +158,7 @@ inline void radixExclusiveScan( uniform int partialSum[], uniform int prefixSum[]) { - const uniform int scale = 4; + const uniform int scale = 2; launch [numBlocks/scale] partialScanLocal(numBlocks, excScanPtr, countsPtr, partialSum); sync; @@ -169,25 +169,36 @@ inline void radixExclusiveScan( sync; } 
-export void radixSort(
-    const uniform int numElements,
-    uniform int keys[],
-    uniform int sorted[])
+static uniform int * uniform memoryPool = NULL; /* one allocation backing the six counter arrays below; NULL while unallocated */
+static uniform int numBlocks;
+static uniform int nSharedCounts;
+static uniform int nCountsGlobal;
+static uniform int nExcScan;
+static uniform int nCountsBlock;
+static uniform int nPartialSum;
+static uniform int nPrefixSum;
+
+static uniform int * uniform sharedCounts;
+static uniform int * uniform countsGlobal;
+static uniform int * uniform excScan;
+static uniform int * uniform counts;
+static uniform int * uniform partialSum;
+static uniform int * uniform prefixSum;
+
+static uniform int numElementsBuf = 0; /* capacity of bufKeys in elements; 0 means bufKeys is not allocated */
+static uniform int * uniform bufKeys;  /* scratch key buffer, allocated on demand by radixSort() */
+
+export void radixSort_alloc(const uniform int n) /* NOTE(review): n is not referenced in the lines of this function shown in the patch -- confirm */
 {
-  const uniform int numBlocks = num_cores()*4;
+  assert(memoryPool == NULL);
+  numBlocks = num_cores()*4;
+  nSharedCounts = NUMDIGITS*numBlocks;
+  nCountsGlobal = NUMDIGITS;
+  nExcScan = NUMDIGITS*numBlocks;
+  nCountsBlock = NUMDIGITS*numBlocks;
+  nPartialSum = NUMDIGITS*numBlocks;
+  nPrefixSum = NUMDIGITS*numBlocks;
 
-#ifdef __NVPTX__
-  assert((numBlocks & 3) == 0); /* task granularity on Kepler is 4 */
-#endif
-
-  const uniform int blockDim = (numElements + numBlocks - 1) / numBlocks;
-
-  const uniform int nSharedCounts = NUMDIGITS*numBlocks;
-  const uniform int nCountsGlobal = NUMDIGITS;
-  const uniform int nExcScan = NUMDIGITS*numBlocks;
-  const uniform int nCountsBlock = NUMDIGITS*numBlocks;
-  const uniform int nPartialSum = NUMDIGITS*numBlocks;
-  const uniform int nPrefixSum = NUMDIGITS*numBlocks;
 
   const uniform int nalloc =
     nSharedCounts +
@@ -197,14 +208,68 @@ export void radixSort(
     nPartialSum +
     nPrefixSum;
 
-  uniform int * uniform mem_pool = uniform new uniform int[nalloc];
+  memoryPool = uniform new uniform int[nalloc];
 
-  uniform int * uniform sharedCounts = mem_pool;
-  uniform int * uniform countsGlobal = sharedCounts + nSharedCounts;
-  uniform int * uniform excScan = countsGlobal + nCountsGlobal;
-  uniform int * uniform counts = excScan + nExcScan;
-  uniform int * uniform partialSum = counts + nCountsBlock;
-  uniform int * uniform prefixSum = partialSum + nPartialSum;
+  sharedCounts = memoryPool;
+  countsGlobal = sharedCounts + nSharedCounts;
+  excScan = countsGlobal + nCountsGlobal;
+  counts = excScan + nExcScan;
+  partialSum = counts + nCountsBlock;
+  prefixSum = partialSum + nPartialSum;
+}
+
+/* Frees the scratch key buffer if one is allocated (numElementsBuf > 0)
+ * and resets numElementsBuf to 0. */
+static
+void radixSort_freeBufKeys()
+{
+  if (numElementsBuf > 0)
+  {
+    delete bufKeys;
+    numElementsBuf = 0;
+  }
+}
+
+/* Releases the counter pool allocated by radixSort_alloc() and the scratch
+ * key buffer allocated by radixSort().  Asserts that the pool is currently
+ * allocated. */
+export void radixSort_free()
+{
+  assert(memoryPool != NULL);
+  delete memoryPool;
+  memoryPool = NULL;
+
+  /* Must be a call: a bare function name is an expression statement with no
+   * effect, so bufKeys would never be released. */
+  radixSort_freeBufKeys();
+}
+
+/* Radix-sorts keys[0 .. numElements) NUMBITS bits per pass, using bufKeys
+ * as the second buffer; the two pointers are swapped after every pass.
+ * radixSort_alloc() must have been called first.
+ * NOTE(review): the swap exchanges the local keys pointer with the global
+ * bufKeys, so the result is back in the caller's array (and bufKeys owns
+ * its own allocation again) only when 32/NUMBITS is even -- confirm. */
+export void radixSort(
+    const uniform int numElements,
+    uniform int keys[])
+{
+  assert(memoryPool != NULL); /* counter arrays come from radixSort_alloc() */
+#ifdef __NVPTX__
+  assert((numBlocks & 3) == 0); /* task granularity on Kepler is 4 */
+#endif
+
+  /* (Re)allocate the scratch buffer only when it is too small.  Keeping the
+   * allocation inside this branch avoids leaking a zero-length allocation
+   * on every call made with numElements == 0. */
+  if (numElementsBuf < numElements)
+  {
+    radixSort_freeBufKeys();
+    numElementsBuf = numElements;
+    bufKeys = uniform new uniform int[numElementsBuf];
+  }
+
+  const uniform int blockDim = (numElements + numBlocks - 1) / numBlocks;
 
   for (uniform int bit = 0; bit < 32; bit += NUMBITS)
   {
@@ -234,7 +299,7 @@ export void radixSort(
 
     launch [numBlocks] sortPass(
         keys,
-        sorted,
+        bufKeys,
         bit,
         numElements,
         excScan,
@@ -242,9 +307,8 @@ export void radixSort(
     sync;
 
     uniform int * uniform tmp = keys;
-    keys = sorted;
-    sorted = tmp;
+    keys = bufKeys;
+    bufKeys = tmp;
   }
 
-  delete mem_pool;
 }