tuning radixSort
This commit is contained in:
@@ -4,7 +4,7 @@ ISPC_SRC=radixSort.ispc
|
|||||||
CXX_SRC=radixSort.cpp radixSort.cpp
|
CXX_SRC=radixSort.cpp radixSort.cpp
|
||||||
PTXCC_REGMAX=64
|
PTXCC_REGMAX=64
|
||||||
|
|
||||||
LLVM_GPU=1
|
# LLVM_GPU=1
|
||||||
NVVM_GPU=1
|
NVVM_GPU=1
|
||||||
|
|
||||||
include ../common_gpu.mk
|
include ../common_gpu.mk
|
||||||
|
|||||||
@@ -22,11 +22,18 @@ void countPass(
|
|||||||
foreach (digit = 0 ... NUMDIGITS)
|
foreach (digit = 0 ... NUMDIGITS)
|
||||||
counts[digit] = 0;
|
counts[digit] = 0;
|
||||||
|
|
||||||
|
#if 1
|
||||||
foreach (i = 0 ... nloc)
|
foreach (i = 0 ... nloc)
|
||||||
{
|
{
|
||||||
const int key = mask & ((unsigned int)keys[i] >> bit);
|
const int key = mask & ((unsigned int)keys[i] >> bit);
|
||||||
atomic_add_local(&counts[key], 1);
|
uniform int skey;
|
||||||
|
if (reduce_equal(key, &skey) == true)
|
||||||
|
counts[skey] += reduce_add(1);
|
||||||
|
else
|
||||||
|
atomic_add_local(&counts[key], 1);
|
||||||
}
|
}
|
||||||
|
#else
|
||||||
|
#endif
|
||||||
|
|
||||||
foreach (digit = 0 ... NUMDIGITS)
|
foreach (digit = 0 ... NUMDIGITS)
|
||||||
atomic_add_global(&countsGlobal[digit], counts[digit]);
|
atomic_add_global(&countsGlobal[digit], counts[digit]);
|
||||||
@@ -74,14 +81,14 @@ void sortPass(
|
|||||||
|
|
||||||
task
|
task
|
||||||
void partialScanLocal(
|
void partialScanLocal(
|
||||||
|
uniform int numBlocks,
|
||||||
uniform int excScanAll[],
|
uniform int excScanAll[],
|
||||||
uniform int countsAll[],
|
uniform int countsAll[],
|
||||||
uniform int partialSumAll[])
|
uniform int partialSumAll[])
|
||||||
{
|
{
|
||||||
const uniform int blockIdx = taskIndex;
|
const uniform int blockIdx = taskIndex;
|
||||||
const uniform int numBlocks = taskCount;
|
|
||||||
|
|
||||||
const uniform int blockDim = (numBlocks+numBlocks-1)/numBlocks;
|
const uniform int blockDim = (numBlocks+taskCount-1)/taskCount;
|
||||||
const uniform int bbeg = blockIdx * blockDim;
|
const uniform int bbeg = blockIdx * blockDim;
|
||||||
const uniform int bend = min(bbeg + blockDim, numBlocks);
|
const uniform int bend = min(bbeg + blockDim, numBlocks);
|
||||||
|
|
||||||
@@ -123,12 +130,12 @@ void partialScanGlobal(
|
|||||||
|
|
||||||
task
|
task
|
||||||
void completeScanGlobal(
|
void completeScanGlobal(
|
||||||
|
uniform int numBlocks,
|
||||||
uniform int excScanAll[],
|
uniform int excScanAll[],
|
||||||
uniform int carryValueAll[])
|
uniform int carryValueAll[])
|
||||||
{
|
{
|
||||||
const uniform int numBlocks = taskCount;
|
|
||||||
const uniform int blockIdx = taskIndex;
|
const uniform int blockIdx = taskIndex;
|
||||||
const uniform int blockDim = (numBlocks+numBlocks-1)/numBlocks;
|
const uniform int blockDim = (numBlocks+taskCount-1)/taskCount;
|
||||||
const uniform int bbeg = blockIdx * blockDim;
|
const uniform int bbeg = blockIdx * blockDim;
|
||||||
const uniform int bend = min(bbeg + blockDim, numBlocks);
|
const uniform int bend = min(bbeg + blockDim, numBlocks);
|
||||||
|
|
||||||
@@ -151,13 +158,14 @@ inline void radixExclusiveScan(
|
|||||||
uniform int partialSum[],
|
uniform int partialSum[],
|
||||||
uniform int prefixSum[])
|
uniform int prefixSum[])
|
||||||
{
|
{
|
||||||
launch [numBlocks] partialScanLocal(excScanPtr, countsPtr, partialSum);
|
const uniform int scale = 4;
|
||||||
|
launch [numBlocks/scale] partialScanLocal(numBlocks, excScanPtr, countsPtr, partialSum);
|
||||||
sync;
|
sync;
|
||||||
|
|
||||||
launch [NUMDIGITS] partialScanGlobal(numBlocks, partialSum, prefixSum);
|
launch [NUMDIGITS] partialScanGlobal(numBlocks/scale, partialSum, prefixSum);
|
||||||
sync;
|
sync;
|
||||||
|
|
||||||
launch [numBlocks] completeScanGlobal(excScanPtr, prefixSum);
|
launch [numBlocks/scale] completeScanGlobal(numBlocks, excScanPtr, prefixSum);
|
||||||
sync;
|
sync;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user