This commit is contained in:
Evghenii
2014-01-30 10:33:49 +01:00
parent 4e26a1b700
commit bf245d3b98

View File

@@ -232,6 +232,7 @@ void mergeSortGang(
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
task task
void generateSampleRanksKernel( void generateSampleRanksKernel(
uniform int nBlocks,
uniform int in_ranksA[], uniform int in_ranksA[],
uniform int in_ranksB[], uniform int in_ranksB[],
uniform Key_t in_srcKey[], uniform Key_t in_srcKey[],
@@ -239,7 +240,14 @@ void generateSampleRanksKernel(
uniform int N, uniform int N,
uniform int totalProgramCount) uniform int totalProgramCount)
{ {
const int pos = taskIndex * programCount + programIndex; const uniform int blockIdx = taskIndex;
const uniform int blockDim = (nBlocks + taskCount - 1)/taskCount;
const uniform int blockBeg = blockIdx * blockDim;
const uniform int blockEnd = min(blockBeg + blockDim, nBlocks);
for (uniform int block = blockBeg; block < blockEnd; block++)
{
const int pos = block * programCount + programIndex;
cif (pos >= totalProgramCount) cif (pos >= totalProgramCount)
return; return;
@@ -270,6 +278,7 @@ void generateSampleRanksKernel(
srcKey[stride + i * SAMPLE_STRIDE], srcKey + 0, srcKey[stride + i * SAMPLE_STRIDE], srcKey + 0,
segmentElementsA, nextPowerOfTwo(segmentElementsA)); segmentElementsA, nextPowerOfTwo(segmentElementsA));
} }
}
} }
static inline static inline
@@ -285,9 +294,13 @@ void generateSampleRanks(
(N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) : (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) :
(N - lastSegmentElements) / (2 * SAMPLE_STRIDE); (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);
uniform int nTasks = iDivUp(threadCount, SAMPLE_STRIDE); uniform int nBlocks = iDivUp(threadCount, SAMPLE_STRIDE);
uniform int nTasks = num_cores()*4;
#ifdef __NVPTX__
nTasks = nBlocks/4;
#endif
launch [nTasks] generateSampleRanksKernel(ranksA, ranksB, srcKey, stride, N, threadCount); launch [nTasks] generateSampleRanksKernel(nBlocks, ranksA, ranksB, srcKey, stride, N, threadCount);
sync; sync;
} }
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
@@ -295,13 +308,21 @@ void generateSampleRanks(
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
task task
void mergeRanksAndIndicesKernel( void mergeRanksAndIndicesKernel(
uniform int nBlocks,
uniform int in_Limits[], uniform int in_Limits[],
uniform int in_Ranks[], uniform int in_Ranks[],
uniform int stride, uniform int stride,
uniform int N, uniform int N,
uniform int totalProgramCount) uniform int totalProgramCount)
{ {
int pos = taskIndex * programCount + programIndex; const uniform int blockIdx = taskIndex;
const uniform int blockDim = (nBlocks + taskCount - 1)/taskCount;
const uniform int blockBeg = blockIdx * blockDim;
const uniform int blockEnd = min(blockBeg + blockDim, nBlocks);
for (uniform int block = blockBeg; block < blockEnd; block++)
{
int pos = block * programCount + programIndex;
cif (pos >= totalProgramCount) cif (pos >= totalProgramCount)
return; return;
@@ -326,6 +347,7 @@ void mergeRanksAndIndicesKernel(
int dstPos = binarySearchInclusiveRanks(ranks[segmentSamplesA + i], ranks, segmentSamplesA, nextPowerOfTwo(segmentSamplesA)) + i; int dstPos = binarySearchInclusiveRanks(ranks[segmentSamplesA + i], ranks, segmentSamplesA, nextPowerOfTwo(segmentSamplesA)) + i;
limits[dstPos] = ranks[segmentSamplesA + i]; limits[dstPos] = ranks[segmentSamplesA + i];
} }
}
} }
static inline static inline
void mergeRanksAndIndices( void mergeRanksAndIndices(
@@ -341,17 +363,22 @@ void mergeRanksAndIndices(
(N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) : (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) :
(N - lastSegmentElements) / (2 * SAMPLE_STRIDE); (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);
const uniform int nTasks = iDivUp(threadCount, SAMPLE_STRIDE); const uniform int nBlocks = iDivUp(threadCount, SAMPLE_STRIDE);
uniform int nTasks = num_cores()*4;
#ifdef __NVPTX__
nTasks = nBlocks/4;
#endif
launch [nTasks] mergeRanksAndIndicesKernel( launch [nTasks] mergeRanksAndIndicesKernel(
nBlocks,
limitsA, limitsA,
ranksA, ranksA,
stride, stride,
N, N,
threadCount); threadCount);
sync;
launch [nTasks] mergeRanksAndIndicesKernel( launch [nTasks] mergeRanksAndIndicesKernel(
nBlocks,
limitsB, limitsB,
ranksB, ranksB,
stride, stride,
@@ -380,6 +407,7 @@ void merge(
} }
#if 0
task task
void mergeElementaryIntervalsKernel( void mergeElementaryIntervalsKernel(
uniform Key_t dstKey[], uniform Key_t dstKey[],
@@ -437,11 +465,6 @@ void mergeElementaryIntervalsKernel(
lenSrcA, SAMPLE_STRIDE, lenSrcA, SAMPLE_STRIDE,
lenSrcB, SAMPLE_STRIDE lenSrcB, SAMPLE_STRIDE
); );
//Store merged data
#if 0
uniform Key_t s_key[2 * SAMPLE_STRIDE]; uniform Key_t s_key[2 * SAMPLE_STRIDE];
uniform Val_t s_val[2 * SAMPLE_STRIDE]; uniform Val_t s_val[2 * SAMPLE_STRIDE];
@@ -455,6 +478,8 @@ void mergeElementaryIntervalsKernel(
s_key[dstPosB] = keyB; s_key[dstPosB] = keyB;
s_val[dstPosB] = valB; s_val[dstPosB] = valB;
} }
// Coalesced writes
if (programIndex < lenSrcA) if (programIndex < lenSrcA)
{ {
dstKey[segmentBase + startDstA + programIndex] = s_key[programIndex]; dstKey[segmentBase + startDstA + programIndex] = s_key[programIndex];
@@ -466,35 +491,8 @@ void mergeElementaryIntervalsKernel(
dstKey[segmentBase + startDstB + programIndex] = s_key[lenSrcA + programIndex]; dstKey[segmentBase + startDstB + programIndex] = s_key[lenSrcA + programIndex];
dstVal[segmentBase + startDstB + programIndex] = s_val[lenSrcA + programIndex]; dstVal[segmentBase + startDstB + programIndex] = s_val[lenSrcA + programIndex];
} }
#else
int dstA, dstB;
dstA = dstB = -1;
if (programIndex < lenSrcA && dstPosA < lenSrcA)
dstA = segmentBase + startDstA + dstPosA;
if (programIndex < lenSrcB && dstPosB < lenSrcA)
dstB = segmentBase + startDstA + dstPosB;
dstPosA -= lenSrcA;
dstPosB -= lenSrcA;
dstA = dstB = -1;
if (programIndex < lenSrcA && dstPosA < lenSrcB)
dstA = segmentBase + startDstB + dstPosA;
if (programIndex < lenSrcB && dstPosB < lenSrcB)
dstB = segmentBase + startDstB + dstPosB;
if (dstA >= 0)
{
dstKey[dstA] = keyA;
dstVal[dstA] = valA;
}
if (dstB >= 0)
{
dstKey[dstB] = keyB;
dstVal[dstB] = valB;
}
#endif
} }
#endif
task task
void mergeElementaryIntervalsKernel( void mergeElementaryIntervalsKernel(
@@ -606,20 +604,10 @@ void mergeElementaryIntervals(
const uniform int mergePairs = (lastSegmentElements > stride) ? getSampleCount(N) : (N - lastSegmentElements) / SAMPLE_STRIDE; const uniform int mergePairs = (lastSegmentElements > stride) ? getSampleCount(N) : (N - lastSegmentElements) / SAMPLE_STRIDE;
#if 0
launch [mergePairs] mergeElementaryIntervalsKernel(
dstKey,
dstVal,
srcKey,
srcVal,
limitsA,
limitsB,
stride,
N);
#else
#ifdef __NVPTX__ #ifdef __NVPTX__
nTasks = mergePairs/(4*programCount); nTasks = mergePairs/(4*programCount);
#endif #endif
launch [nTasks] mergeElementaryIntervalsKernel( launch [nTasks] mergeElementaryIntervalsKernel(
mergePairs, mergePairs,
dstKey, dstKey,
@@ -630,7 +618,6 @@ void mergeElementaryIntervals(
limitsB, limitsB,
stride, stride,
N); N);
#endif
sync; sync;
} }