+1
This commit is contained in:
@@ -232,6 +232,7 @@ void mergeSortGang(
|
|||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
task
|
task
|
||||||
void generateSampleRanksKernel(
|
void generateSampleRanksKernel(
|
||||||
|
uniform int nBlocks,
|
||||||
uniform int in_ranksA[],
|
uniform int in_ranksA[],
|
||||||
uniform int in_ranksB[],
|
uniform int in_ranksB[],
|
||||||
uniform Key_t in_srcKey[],
|
uniform Key_t in_srcKey[],
|
||||||
@@ -239,36 +240,44 @@ void generateSampleRanksKernel(
|
|||||||
uniform int N,
|
uniform int N,
|
||||||
uniform int totalProgramCount)
|
uniform int totalProgramCount)
|
||||||
{
|
{
|
||||||
const int pos = taskIndex * programCount + programIndex;
|
const uniform int blockIdx = taskIndex;
|
||||||
cif (pos >= totalProgramCount)
|
const uniform int blockDim = (nBlocks + taskCount - 1)/taskCount;
|
||||||
return;
|
const uniform int blockBeg = blockIdx * blockDim;
|
||||||
|
const uniform int blockEnd = min(blockBeg + blockDim, nBlocks);
|
||||||
const int i = pos & ((stride / SAMPLE_STRIDE) - 1);
|
|
||||||
const int segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);
|
for (uniform int block = blockBeg; block < blockEnd; block++)
|
||||||
|
|
||||||
uniform Key_t * srcKey = in_srcKey + segmentBase;
|
|
||||||
uniform int * ranksA = in_ranksA + segmentBase / SAMPLE_STRIDE;
|
|
||||||
uniform int * ranksB = in_ranksB + segmentBase / SAMPLE_STRIDE;
|
|
||||||
|
|
||||||
const int segmentElementsA = stride;
|
|
||||||
const int segmentElementsB = min(stride, N - segmentBase - stride);
|
|
||||||
const int segmentSamplesA = getSampleCount(segmentElementsA);
|
|
||||||
const int segmentSamplesB = getSampleCount(segmentElementsB);
|
|
||||||
|
|
||||||
if (i < segmentSamplesA)
|
|
||||||
{
|
{
|
||||||
ranksA[i] = i * SAMPLE_STRIDE;
|
const int pos = block * programCount + programIndex;
|
||||||
ranksB[i] = binarySearchExclusive(
|
cif (pos >= totalProgramCount)
|
||||||
srcKey[i * SAMPLE_STRIDE], srcKey + stride,
|
return;
|
||||||
segmentElementsB, nextPowerOfTwo(segmentElementsB));
|
|
||||||
}
|
|
||||||
|
|
||||||
if (i < segmentSamplesB)
|
const int i = pos & ((stride / SAMPLE_STRIDE) - 1);
|
||||||
{
|
const int segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);
|
||||||
ranksB[(stride / SAMPLE_STRIDE) + i] = i * SAMPLE_STRIDE;
|
|
||||||
ranksA[(stride / SAMPLE_STRIDE) + i] = binarySearchInclusive(
|
uniform Key_t * srcKey = in_srcKey + segmentBase;
|
||||||
srcKey[stride + i * SAMPLE_STRIDE], srcKey + 0,
|
uniform int * ranksA = in_ranksA + segmentBase / SAMPLE_STRIDE;
|
||||||
segmentElementsA, nextPowerOfTwo(segmentElementsA));
|
uniform int * ranksB = in_ranksB + segmentBase / SAMPLE_STRIDE;
|
||||||
|
|
||||||
|
const int segmentElementsA = stride;
|
||||||
|
const int segmentElementsB = min(stride, N - segmentBase - stride);
|
||||||
|
const int segmentSamplesA = getSampleCount(segmentElementsA);
|
||||||
|
const int segmentSamplesB = getSampleCount(segmentElementsB);
|
||||||
|
|
||||||
|
if (i < segmentSamplesA)
|
||||||
|
{
|
||||||
|
ranksA[i] = i * SAMPLE_STRIDE;
|
||||||
|
ranksB[i] = binarySearchExclusive(
|
||||||
|
srcKey[i * SAMPLE_STRIDE], srcKey + stride,
|
||||||
|
segmentElementsB, nextPowerOfTwo(segmentElementsB));
|
||||||
|
}
|
||||||
|
|
||||||
|
if (i < segmentSamplesB)
|
||||||
|
{
|
||||||
|
ranksB[(stride / SAMPLE_STRIDE) + i] = i * SAMPLE_STRIDE;
|
||||||
|
ranksA[(stride / SAMPLE_STRIDE) + i] = binarySearchInclusive(
|
||||||
|
srcKey[stride + i * SAMPLE_STRIDE], srcKey + 0,
|
||||||
|
segmentElementsA, nextPowerOfTwo(segmentElementsA));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -285,9 +294,13 @@ void generateSampleRanks(
|
|||||||
(N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) :
|
(N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) :
|
||||||
(N - lastSegmentElements) / (2 * SAMPLE_STRIDE);
|
(N - lastSegmentElements) / (2 * SAMPLE_STRIDE);
|
||||||
|
|
||||||
uniform int nTasks = iDivUp(threadCount, SAMPLE_STRIDE);
|
uniform int nBlocks = iDivUp(threadCount, SAMPLE_STRIDE);
|
||||||
|
uniform int nTasks = num_cores()*4;
|
||||||
|
#ifdef __NVPTX__
|
||||||
|
nTasks = nBlocks/4;
|
||||||
|
#endif
|
||||||
|
|
||||||
launch [nTasks] generateSampleRanksKernel(ranksA, ranksB, srcKey, stride, N, threadCount);
|
launch [nTasks] generateSampleRanksKernel(nBlocks, ranksA, ranksB, srcKey, stride, N, threadCount);
|
||||||
sync;
|
sync;
|
||||||
}
|
}
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
@@ -295,36 +308,45 @@ void generateSampleRanks(
|
|||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
task
|
task
|
||||||
void mergeRanksAndIndicesKernel(
|
void mergeRanksAndIndicesKernel(
|
||||||
|
uniform int nBlocks,
|
||||||
uniform int in_Limits[],
|
uniform int in_Limits[],
|
||||||
uniform int in_Ranks[],
|
uniform int in_Ranks[],
|
||||||
uniform int stride,
|
uniform int stride,
|
||||||
uniform int N,
|
uniform int N,
|
||||||
uniform int totalProgramCount)
|
uniform int totalProgramCount)
|
||||||
{
|
{
|
||||||
int pos = taskIndex * programCount + programIndex;
|
const uniform int blockIdx = taskIndex;
|
||||||
cif (pos >= totalProgramCount)
|
const uniform int blockDim = (nBlocks + taskCount - 1)/taskCount;
|
||||||
return;
|
const uniform int blockBeg = blockIdx * blockDim;
|
||||||
|
const uniform int blockEnd = min(blockBeg + blockDim, nBlocks);
|
||||||
const int i = pos & ((stride / SAMPLE_STRIDE) - 1);
|
|
||||||
const int segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);
|
for (uniform int block = blockBeg; block < blockEnd; block++)
|
||||||
uniform int * ranks = in_Ranks + (pos - i) * 2;
|
|
||||||
uniform int * limits = in_Limits + (pos - i) * 2;
|
|
||||||
|
|
||||||
const int segmentElementsA = stride;
|
|
||||||
const int segmentElementsB = min(stride, N - segmentBase - stride);
|
|
||||||
const int segmentSamplesA = getSampleCount(segmentElementsA);
|
|
||||||
const int segmentSamplesB = getSampleCount(segmentElementsB);
|
|
||||||
|
|
||||||
if (i < segmentSamplesA)
|
|
||||||
{
|
{
|
||||||
int dstPos = binarySearchExclusiveRanks(ranks[i], ranks + segmentSamplesA, segmentSamplesB, nextPowerOfTwo(segmentSamplesB)) + i;
|
int pos = block * programCount + programIndex;
|
||||||
limits[dstPos] = ranks[i];
|
cif (pos >= totalProgramCount)
|
||||||
}
|
return;
|
||||||
|
|
||||||
if (i < segmentSamplesB)
|
const int i = pos & ((stride / SAMPLE_STRIDE) - 1);
|
||||||
{
|
const int segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);
|
||||||
int dstPos = binarySearchInclusiveRanks(ranks[segmentSamplesA + i], ranks, segmentSamplesA, nextPowerOfTwo(segmentSamplesA)) + i;
|
uniform int * ranks = in_Ranks + (pos - i) * 2;
|
||||||
limits[dstPos] = ranks[segmentSamplesA + i];
|
uniform int * limits = in_Limits + (pos - i) * 2;
|
||||||
|
|
||||||
|
const int segmentElementsA = stride;
|
||||||
|
const int segmentElementsB = min(stride, N - segmentBase - stride);
|
||||||
|
const int segmentSamplesA = getSampleCount(segmentElementsA);
|
||||||
|
const int segmentSamplesB = getSampleCount(segmentElementsB);
|
||||||
|
|
||||||
|
if (i < segmentSamplesA)
|
||||||
|
{
|
||||||
|
int dstPos = binarySearchExclusiveRanks(ranks[i], ranks + segmentSamplesA, segmentSamplesB, nextPowerOfTwo(segmentSamplesB)) + i;
|
||||||
|
limits[dstPos] = ranks[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
if (i < segmentSamplesB)
|
||||||
|
{
|
||||||
|
int dstPos = binarySearchInclusiveRanks(ranks[segmentSamplesA + i], ranks, segmentSamplesA, nextPowerOfTwo(segmentSamplesA)) + i;
|
||||||
|
limits[dstPos] = ranks[segmentSamplesA + i];
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
static inline
|
static inline
|
||||||
@@ -341,17 +363,22 @@ void mergeRanksAndIndices(
|
|||||||
(N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) :
|
(N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) :
|
||||||
(N - lastSegmentElements) / (2 * SAMPLE_STRIDE);
|
(N - lastSegmentElements) / (2 * SAMPLE_STRIDE);
|
||||||
|
|
||||||
const uniform int nTasks = iDivUp(threadCount, SAMPLE_STRIDE);
|
const uniform int nBlocks = iDivUp(threadCount, SAMPLE_STRIDE);
|
||||||
|
uniform int nTasks = num_cores()*4;
|
||||||
|
|
||||||
|
#ifdef __NVPTX__
|
||||||
|
nTasks = nBlocks/4;
|
||||||
|
#endif
|
||||||
|
|
||||||
launch [nTasks] mergeRanksAndIndicesKernel(
|
launch [nTasks] mergeRanksAndIndicesKernel(
|
||||||
|
nBlocks,
|
||||||
limitsA,
|
limitsA,
|
||||||
ranksA,
|
ranksA,
|
||||||
stride,
|
stride,
|
||||||
N,
|
N,
|
||||||
threadCount);
|
threadCount);
|
||||||
sync;
|
|
||||||
|
|
||||||
launch [nTasks] mergeRanksAndIndicesKernel(
|
launch [nTasks] mergeRanksAndIndicesKernel(
|
||||||
|
nBlocks,
|
||||||
limitsB,
|
limitsB,
|
||||||
ranksB,
|
ranksB,
|
||||||
stride,
|
stride,
|
||||||
@@ -380,6 +407,7 @@ void merge(
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#if 0
|
||||||
task
|
task
|
||||||
void mergeElementaryIntervalsKernel(
|
void mergeElementaryIntervalsKernel(
|
||||||
uniform Key_t dstKey[],
|
uniform Key_t dstKey[],
|
||||||
@@ -437,11 +465,6 @@ void mergeElementaryIntervalsKernel(
|
|||||||
lenSrcA, SAMPLE_STRIDE,
|
lenSrcA, SAMPLE_STRIDE,
|
||||||
lenSrcB, SAMPLE_STRIDE
|
lenSrcB, SAMPLE_STRIDE
|
||||||
);
|
);
|
||||||
|
|
||||||
|
|
||||||
//Store merged data
|
|
||||||
|
|
||||||
#if 0
|
|
||||||
uniform Key_t s_key[2 * SAMPLE_STRIDE];
|
uniform Key_t s_key[2 * SAMPLE_STRIDE];
|
||||||
uniform Val_t s_val[2 * SAMPLE_STRIDE];
|
uniform Val_t s_val[2 * SAMPLE_STRIDE];
|
||||||
|
|
||||||
@@ -455,6 +478,8 @@ void mergeElementaryIntervalsKernel(
|
|||||||
s_key[dstPosB] = keyB;
|
s_key[dstPosB] = keyB;
|
||||||
s_val[dstPosB] = valB;
|
s_val[dstPosB] = valB;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Coalesced writes
|
||||||
if (programIndex < lenSrcA)
|
if (programIndex < lenSrcA)
|
||||||
{
|
{
|
||||||
dstKey[segmentBase + startDstA + programIndex] = s_key[programIndex];
|
dstKey[segmentBase + startDstA + programIndex] = s_key[programIndex];
|
||||||
@@ -466,35 +491,8 @@ void mergeElementaryIntervalsKernel(
|
|||||||
dstKey[segmentBase + startDstB + programIndex] = s_key[lenSrcA + programIndex];
|
dstKey[segmentBase + startDstB + programIndex] = s_key[lenSrcA + programIndex];
|
||||||
dstVal[segmentBase + startDstB + programIndex] = s_val[lenSrcA + programIndex];
|
dstVal[segmentBase + startDstB + programIndex] = s_val[lenSrcA + programIndex];
|
||||||
}
|
}
|
||||||
#else
|
|
||||||
int dstA, dstB;
|
|
||||||
dstA = dstB = -1;
|
|
||||||
if (programIndex < lenSrcA && dstPosA < lenSrcA)
|
|
||||||
dstA = segmentBase + startDstA + dstPosA;
|
|
||||||
if (programIndex < lenSrcB && dstPosB < lenSrcA)
|
|
||||||
dstB = segmentBase + startDstA + dstPosB;
|
|
||||||
|
|
||||||
dstPosA -= lenSrcA;
|
|
||||||
dstPosB -= lenSrcA;
|
|
||||||
dstA = dstB = -1;
|
|
||||||
if (programIndex < lenSrcA && dstPosA < lenSrcB)
|
|
||||||
dstA = segmentBase + startDstB + dstPosA;
|
|
||||||
if (programIndex < lenSrcB && dstPosB < lenSrcB)
|
|
||||||
dstB = segmentBase + startDstB + dstPosB;
|
|
||||||
|
|
||||||
if (dstA >= 0)
|
|
||||||
{
|
|
||||||
dstKey[dstA] = keyA;
|
|
||||||
dstVal[dstA] = valA;
|
|
||||||
}
|
|
||||||
if (dstB >= 0)
|
|
||||||
{
|
|
||||||
dstKey[dstB] = keyB;
|
|
||||||
dstVal[dstB] = valB;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
task
|
task
|
||||||
void mergeElementaryIntervalsKernel(
|
void mergeElementaryIntervalsKernel(
|
||||||
@@ -606,20 +604,10 @@ void mergeElementaryIntervals(
|
|||||||
const uniform int mergePairs = (lastSegmentElements > stride) ? getSampleCount(N) : (N - lastSegmentElements) / SAMPLE_STRIDE;
|
const uniform int mergePairs = (lastSegmentElements > stride) ? getSampleCount(N) : (N - lastSegmentElements) / SAMPLE_STRIDE;
|
||||||
|
|
||||||
|
|
||||||
#if 0
|
|
||||||
launch [mergePairs] mergeElementaryIntervalsKernel(
|
|
||||||
dstKey,
|
|
||||||
dstVal,
|
|
||||||
srcKey,
|
|
||||||
srcVal,
|
|
||||||
limitsA,
|
|
||||||
limitsB,
|
|
||||||
stride,
|
|
||||||
N);
|
|
||||||
#else
|
|
||||||
#ifdef __NVPTX__
|
#ifdef __NVPTX__
|
||||||
nTasks = mergePairs/(4*programCount);
|
nTasks = mergePairs/(4*programCount);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
launch [nTasks] mergeElementaryIntervalsKernel(
|
launch [nTasks] mergeElementaryIntervalsKernel(
|
||||||
mergePairs,
|
mergePairs,
|
||||||
dstKey,
|
dstKey,
|
||||||
@@ -630,7 +618,6 @@ void mergeElementaryIntervals(
|
|||||||
limitsB,
|
limitsB,
|
||||||
stride,
|
stride,
|
||||||
N);
|
N);
|
||||||
#endif
|
|
||||||
sync;
|
sync;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user