added performance data
This commit is contained in:
@@ -583,40 +583,56 @@ void mergeSort___export(
|
|||||||
|
|
||||||
assert(N <= SAMPLE_STRIDE * MAX_SAMPLE_COUNT);
|
assert(N <= SAMPLE_STRIDE * MAX_SAMPLE_COUNT);
|
||||||
assert(N % (programCount*2) == 0);
|
assert(N % (programCount*2) == 0);
|
||||||
mergeSortGang(iKey, iVal, srcKey, srcVal, N/(2*programCount));
|
|
||||||
|
// k20m: 140 M/s
|
||||||
|
{
|
||||||
|
// k20m: 2367 M/s
|
||||||
|
mergeSortGang(iKey, iVal, srcKey, srcVal, N/(2*programCount));
|
||||||
|
|
||||||
#if 1
|
#if 1
|
||||||
for (uniform int stride = 2*programCount; stride < N; stride <<= 1)
|
for (uniform int stride = 2*programCount; stride < N; stride <<= 1)
|
||||||
{
|
{
|
||||||
const uniform int lastSegmentElements = N % (2 * stride);
|
const uniform int lastSegmentElements = N % (2 * stride);
|
||||||
|
|
||||||
//Find sample ranks and prepare for limiters merge
|
// k20m: 271 M/s
|
||||||
generateSampleRanks(ranksA, ranksB, iKey, stride, N);
|
{
|
||||||
|
#if 1
|
||||||
//Merge ranks and indices
|
// k20m: 944 M/s
|
||||||
mergeRanksAndIndices(limitsA, limitsB, ranksA, ranksB, stride, N);
|
|
||||||
|
|
||||||
//Merge elementary intervals
|
|
||||||
mergeElementaryIntervals(nTasks, oKey, oVal, iKey, iVal, limitsA, limitsB, stride, N);
|
|
||||||
|
|
||||||
if (lastSegmentElements <= stride)
|
|
||||||
for (int i = programIndex; i < lastSegmentElements; i += programCount)
|
|
||||||
if (i < lastSegmentElements)
|
|
||||||
{
|
{
|
||||||
oKey[N-lastSegmentElements+i] = iKey[N-lastSegmentElements+i];
|
// k20m: 1396 M/s
|
||||||
oVal[N-lastSegmentElements+i] = iVal[N-lastSegmentElements+i];
|
//Find sample ranks and prepare for limiters merge
|
||||||
|
generateSampleRanks(ranksA, ranksB, iKey, stride, N);
|
||||||
|
|
||||||
|
// k20m: 2379 M/s
|
||||||
|
//Merge ranks and indices
|
||||||
|
mergeRanksAndIndices(limitsA, limitsB, ranksA, ranksB, stride, N);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// k20m: 371 M/s
|
||||||
|
//Merge elementary intervals
|
||||||
|
mergeElementaryIntervals(nTasks, oKey, oVal, iKey, iVal, limitsA, limitsB, stride, N);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (lastSegmentElements <= stride)
|
||||||
|
for (int i = programIndex; i < lastSegmentElements; i += programCount)
|
||||||
|
if (i < lastSegmentElements)
|
||||||
|
{
|
||||||
|
oKey[N-lastSegmentElements+i] = iKey[N-lastSegmentElements+i];
|
||||||
|
oVal[N-lastSegmentElements+i] = iVal[N-lastSegmentElements+i];
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
{
|
{
|
||||||
uniform Key_t * uniform tmpKey = iKey;
|
uniform Key_t * uniform tmpKey = iKey;
|
||||||
iKey = oKey;
|
iKey = oKey;
|
||||||
oKey = tmpKey;
|
oKey = tmpKey;
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
uniform Val_t * uniform tmpVal = iVal;
|
uniform Val_t * uniform tmpVal = iVal;
|
||||||
iVal = oVal;
|
iVal = oVal;
|
||||||
oVal = tmpVal;
|
oVal = tmpVal;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@@ -687,40 +687,56 @@ void mergeSort(
|
|||||||
|
|
||||||
assert(N <= SAMPLE_STRIDE * MAX_SAMPLE_COUNT);
|
assert(N <= SAMPLE_STRIDE * MAX_SAMPLE_COUNT);
|
||||||
assert(N % (programCount*2) == 0);
|
assert(N % (programCount*2) == 0);
|
||||||
mergeSortGang(iKey, iVal, srcKey, srcVal, N/(2*programCount));
|
|
||||||
|
// cpu: 28 gpu: 74 M/s
|
||||||
|
{
|
||||||
|
// cpu: 356 gpu: 534 M/s
|
||||||
|
mergeSortGang(iKey, iVal, srcKey, srcVal, N/(2*programCount));
|
||||||
|
|
||||||
#if 1
|
#if 1
|
||||||
for (uniform int stride = 2*programCount; stride < N; stride <<= 1)
|
for (uniform int stride = 2*programCount; stride < N; stride <<= 1)
|
||||||
{
|
{
|
||||||
const uniform int lastSegmentElements = N % (2 * stride);
|
const uniform int lastSegmentElements = N % (2 * stride);
|
||||||
|
|
||||||
//Find sample ranks and prepare for limiters merge
|
// cpu: 30 gpu: 112 M/s
|
||||||
generateSampleRanks(ranksA, ranksB, iKey, stride, N);
|
|
||||||
|
|
||||||
//Merge ranks and indices
|
|
||||||
mergeRanksAndIndices(limitsA, limitsB, ranksA, ranksB, stride, N);
|
|
||||||
|
|
||||||
//Merge elementary intervals
|
|
||||||
mergeElementaryIntervals(nTasks, oKey, oVal, iKey, iVal, limitsA, limitsB, stride, N);
|
|
||||||
|
|
||||||
if (lastSegmentElements <= stride)
|
|
||||||
foreach (i = 0 ... lastSegmentElements)
|
|
||||||
{
|
{
|
||||||
oKey[N-lastSegmentElements+i] = iKey[N-lastSegmentElements+i];
|
#if 1
|
||||||
oVal[N-lastSegmentElements+i] = iVal[N-lastSegmentElements+i];
|
// cpu: 121 gpu: 460 M/s
|
||||||
|
{
|
||||||
|
// cpu: 190 gpu: 600 M/s
|
||||||
|
//Find sample ranks and prepare for limiters merge
|
||||||
|
generateSampleRanks(ranksA, ranksB, iKey, stride, N);
|
||||||
|
|
||||||
|
// cpu: 120 gpu: 457 M/s
|
||||||
|
//Merge ranks and indices
|
||||||
|
mergeRanksAndIndices(limitsA, limitsB, ranksA, ranksB, stride, N);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// cpu: 287 gpu: 194 M/s
|
||||||
|
//Merge elementary intervals
|
||||||
|
mergeElementaryIntervals(nTasks, oKey, oVal, iKey, iVal, limitsA, limitsB, stride, N);
|
||||||
}
|
}
|
||||||
memory_barrier();
|
|
||||||
|
if (lastSegmentElements <= stride)
|
||||||
|
foreach (i = 0 ... lastSegmentElements)
|
||||||
|
{
|
||||||
|
oKey[N-lastSegmentElements+i] = iKey[N-lastSegmentElements+i];
|
||||||
|
oVal[N-lastSegmentElements+i] = iVal[N-lastSegmentElements+i];
|
||||||
|
}
|
||||||
|
memory_barrier();
|
||||||
|
|
||||||
|
|
||||||
{
|
{
|
||||||
uniform Key_t * uniform tmpKey = iKey;
|
uniform Key_t * uniform tmpKey = iKey;
|
||||||
iKey = oKey;
|
iKey = oKey;
|
||||||
oKey = tmpKey;
|
oKey = tmpKey;
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
uniform Val_t * uniform tmpVal = iVal;
|
uniform Val_t * uniform tmpVal = iVal;
|
||||||
iVal = oVal;
|
iVal = oVal;
|
||||||
oVal = tmpVal;
|
oVal = tmpVal;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
Reference in New Issue
Block a user