some tuning
This commit is contained in:
@@ -233,7 +233,7 @@ void mergeSortGang(
|
|||||||
uniform Val_t srcVal[],
|
uniform Val_t srcVal[],
|
||||||
uniform int batchSize)
|
uniform int batchSize)
|
||||||
{
|
{
|
||||||
uniform int nTasks = batchSize/4;
|
uniform int nTasks = batchSize;
|
||||||
launch (nTasks,1,1,mergeSortGangKernel)(batchSize, dstKey, dstVal, srcKey, srcVal);
|
launch (nTasks,1,1,mergeSortGangKernel)(batchSize, dstKey, dstVal, srcKey, srcVal);
|
||||||
sync;
|
sync;
|
||||||
}
|
}
|
||||||
@@ -306,7 +306,7 @@ void generateSampleRanks(
|
|||||||
(N - lastSegmentElements) / (2 * SAMPLE_STRIDE);
|
(N - lastSegmentElements) / (2 * SAMPLE_STRIDE);
|
||||||
|
|
||||||
uniform int nBlocks = iDivUp(threadCount, SAMPLE_STRIDE);
|
uniform int nBlocks = iDivUp(threadCount, SAMPLE_STRIDE);
|
||||||
uniform int nTasks = nBlocks/4;
|
uniform int nTasks = nBlocks;
|
||||||
|
|
||||||
launch (nTasks,1,1, generateSampleRanksKernel)(nBlocks, ranksA, ranksB, srcKey, stride, N, threadCount);
|
launch (nTasks,1,1, generateSampleRanksKernel)(nBlocks, ranksA, ranksB, srcKey, stride, N, threadCount);
|
||||||
sync;
|
sync;
|
||||||
@@ -372,7 +372,7 @@ void mergeRanksAndIndices(
|
|||||||
(N - lastSegmentElements) / (2 * SAMPLE_STRIDE);
|
(N - lastSegmentElements) / (2 * SAMPLE_STRIDE);
|
||||||
|
|
||||||
const uniform int nBlocks = iDivUp(threadCount, SAMPLE_STRIDE);
|
const uniform int nBlocks = iDivUp(threadCount, SAMPLE_STRIDE);
|
||||||
uniform int nTasks = nBlocks/4;
|
uniform int nTasks = nBlocks;
|
||||||
|
|
||||||
launch (nTasks,1,1,mergeRanksAndIndicesKernel)(
|
launch (nTasks,1,1,mergeRanksAndIndicesKernel)(
|
||||||
nBlocks,
|
nBlocks,
|
||||||
@@ -448,20 +448,21 @@ void mergeElementaryIntervalsKernel(
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Compute destination addresses for merge data
|
// Compute destination addresses for merge data
|
||||||
int dstPosA = binarySearchExclusive1(keyA, keyB, lenSrcB, SAMPLE_STRIDE) + programIndex;
|
int dstPosA, dstPosB, dstA = -1, dstB = -1;
|
||||||
int dstPosB = binarySearchInclusive1(keyB, keyA, lenSrcA, SAMPLE_STRIDE) + programIndex;
|
if (any(programIndex < lenSrcA))
|
||||||
|
dstPosA = binarySearchExclusive1(keyA, keyB, lenSrcB, SAMPLE_STRIDE) + programIndex;
|
||||||
|
if (any(programIndex < lenSrcB))
|
||||||
|
dstPosB = binarySearchInclusive1(keyB, keyA, lenSrcA, SAMPLE_STRIDE) + programIndex;
|
||||||
|
|
||||||
|
|
||||||
int dstA = -1, dstB = -1;
|
|
||||||
if (programIndex < lenSrcA && dstPosA < lenSrcA)
|
if (programIndex < lenSrcA && dstPosA < lenSrcA)
|
||||||
dstA = segmentBase + startDstA + dstPosA;
|
dstA = segmentBase + startDstA + dstPosA;
|
||||||
if (programIndex < lenSrcB && dstPosB < lenSrcA)
|
|
||||||
dstB = segmentBase + startDstA + dstPosB;
|
|
||||||
|
|
||||||
dstPosA -= lenSrcA;
|
dstPosA -= lenSrcA;
|
||||||
dstPosB -= lenSrcA;
|
|
||||||
if (programIndex < lenSrcA && dstPosA < lenSrcB)
|
if (programIndex < lenSrcA && dstPosA < lenSrcB)
|
||||||
dstA = segmentBase + startDstB + dstPosA;
|
dstA = segmentBase + startDstB + dstPosA;
|
||||||
|
|
||||||
|
if (programIndex < lenSrcB && dstPosB < lenSrcA)
|
||||||
|
dstB = segmentBase + startDstA + dstPosB;
|
||||||
|
dstPosB -= lenSrcA;
|
||||||
if (programIndex < lenSrcB && dstPosB < lenSrcB)
|
if (programIndex < lenSrcB && dstPosB < lenSrcB)
|
||||||
dstB = segmentBase + startDstB + dstPosB;
|
dstB = segmentBase + startDstB + dstPosB;
|
||||||
|
|
||||||
@@ -499,7 +500,7 @@ void mergeElementaryIntervals(
|
|||||||
const uniform int mergePairs = (lastSegmentElements > stride) ? getSampleCount(N) : (N - lastSegmentElements) / SAMPLE_STRIDE;
|
const uniform int mergePairs = (lastSegmentElements > stride) ? getSampleCount(N) : (N - lastSegmentElements) / SAMPLE_STRIDE;
|
||||||
|
|
||||||
|
|
||||||
nTasks = mergePairs/(4*programCount);
|
nTasks = mergePairs/(programCount);
|
||||||
|
|
||||||
launch (nTasks,1,1, mergeElementaryIntervalsKernel)(
|
launch (nTasks,1,1, mergeElementaryIntervalsKernel)(
|
||||||
mergePairs,
|
mergePairs,
|
||||||
|
|||||||
@@ -221,7 +221,7 @@ void mergeSortGang(
|
|||||||
{
|
{
|
||||||
uniform int nTasks = num_cores()*4;
|
uniform int nTasks = num_cores()*4;
|
||||||
#ifdef __NVPTX__
|
#ifdef __NVPTX__
|
||||||
nTasks = iDivUp(batchSize,4);
|
nTasks = iDivUp(batchSize,1);
|
||||||
#endif
|
#endif
|
||||||
launch [nTasks] mergeSortGangKernel(batchSize, dstKey, dstVal, srcKey, srcVal, 2*programCount);
|
launch [nTasks] mergeSortGangKernel(batchSize, dstKey, dstVal, srcKey, srcVal, 2*programCount);
|
||||||
sync;
|
sync;
|
||||||
@@ -297,7 +297,7 @@ void generateSampleRanks(
|
|||||||
uniform int nBlocks = iDivUp(threadCount, SAMPLE_STRIDE);
|
uniform int nBlocks = iDivUp(threadCount, SAMPLE_STRIDE);
|
||||||
uniform int nTasks = num_cores()*4;
|
uniform int nTasks = num_cores()*4;
|
||||||
#ifdef __NVPTX__
|
#ifdef __NVPTX__
|
||||||
nTasks = iDivUp(nBlocks,4);
|
nTasks = iDivUp(nBlocks,1);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
launch [nTasks] generateSampleRanksKernel(nBlocks, ranksA, ranksB, srcKey, stride, N, threadCount);
|
launch [nTasks] generateSampleRanksKernel(nBlocks, ranksA, ranksB, srcKey, stride, N, threadCount);
|
||||||
@@ -367,7 +367,7 @@ void mergeRanksAndIndices(
|
|||||||
uniform int nTasks = num_cores()*4;
|
uniform int nTasks = num_cores()*4;
|
||||||
|
|
||||||
#ifdef __NVPTX__
|
#ifdef __NVPTX__
|
||||||
nTasks = iDivUp(nBlocks,4);
|
nTasks = iDivUp(nBlocks,1);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
launch [nTasks] mergeRanksAndIndicesKernel(
|
launch [nTasks] mergeRanksAndIndicesKernel(
|
||||||
@@ -492,7 +492,7 @@ void mergeElementaryIntervals(
|
|||||||
|
|
||||||
uniform int nTasks = num_cores()*4;
|
uniform int nTasks = num_cores()*4;
|
||||||
#ifdef __NVPTX__
|
#ifdef __NVPTX__
|
||||||
nTasks = iDivUp(mergePairs,4*programCount);
|
nTasks = iDivUp(mergePairs,1*programCount);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
launch [nTasks] mergeElementaryIntervalsKernel(
|
launch [nTasks] mergeElementaryIntervalsKernel(
|
||||||
|
|||||||
Reference in New Issue
Block a user