tuning radixSort

This commit is contained in:
Evghenii
2014-01-28 15:00:43 +01:00
parent d4dd945828
commit 1b993e167f
2 changed files with 17 additions and 9 deletions

View File

@@ -4,7 +4,7 @@ ISPC_SRC=radixSort.ispc
CXX_SRC=radixSort.cpp radixSort.cpp
PTXCC_REGMAX=64
LLVM_GPU=1
# LLVM_GPU=1
NVVM_GPU=1
include ../common_gpu.mk

View File

@@ -22,11 +22,18 @@ void countPass(
foreach (digit = 0 ... NUMDIGITS)
counts[digit] = 0;
#if 1
foreach (i = 0 ... nloc)
{
const int key = mask & ((unsigned int)keys[i] >> bit);
atomic_add_local(&counts[key], 1);
uniform int skey;
if (reduce_equal(key, &skey) == true)
counts[skey] += reduce_add(1);
else
atomic_add_local(&counts[key], 1);
}
#else
#endif
foreach (digit = 0 ... NUMDIGITS)
atomic_add_global(&countsGlobal[digit], counts[digit]);
@@ -74,14 +81,14 @@ void sortPass(
task
void partialScanLocal(
uniform int numBlocks,
uniform int excScanAll[],
uniform int countsAll[],
uniform int partialSumAll[])
{
const uniform int blockIdx = taskIndex;
const uniform int numBlocks = taskCount;
const uniform int blockDim = (numBlocks+numBlocks-1)/numBlocks;
const uniform int blockDim = (numBlocks+taskCount-1)/taskCount;
const uniform int bbeg = blockIdx * blockDim;
const uniform int bend = min(bbeg + blockDim, numBlocks);
@@ -123,12 +130,12 @@ void partialScanGlobal(
task
void completeScanGlobal(
uniform int numBlocks,
uniform int excScanAll[],
uniform int carryValueAll[])
{
const uniform int numBlocks = taskCount;
const uniform int blockIdx = taskIndex;
const uniform int blockDim = (numBlocks+numBlocks-1)/numBlocks;
const uniform int blockDim = (numBlocks+taskCount-1)/taskCount;
const uniform int bbeg = blockIdx * blockDim;
const uniform int bend = min(bbeg + blockDim, numBlocks);
@@ -151,13 +158,14 @@ inline void radixExclusiveScan(
uniform int partialSum[],
uniform int prefixSum[])
{
launch [numBlocks] partialScanLocal(excScanPtr, countsPtr, partialSum);
const uniform int scale = 4;
launch [numBlocks/scale] partialScanLocal(numBlocks, excScanPtr, countsPtr, partialSum);
sync;
launch [NUMDIGITS] partialScanGlobal(numBlocks, partialSum, prefixSum);
launch [NUMDIGITS] partialScanGlobal(numBlocks/scale, partialSum, prefixSum);
sync;
launch [numBlocks] completeScanGlobal(excScanPtr, prefixSum);
launch [numBlocks/scale] completeScanGlobal(numBlocks, excScanPtr, prefixSum);
sync;
}