From 97971bef0cd77e86989631890a61e877f247ba44 Mon Sep 17 00:00:00 2001 From: Evghenii Date: Thu, 30 Jan 2014 11:45:23 +0100 Subject: [PATCH] added performance data --- examples_ptx/mergeSort/mergeSort.cu | 70 ++++++++++++++++----------- examples_ptx/mergeSort/mergeSort.ispc | 70 ++++++++++++++++----------- 2 files changed, 86 insertions(+), 54 deletions(-) diff --git a/examples_ptx/mergeSort/mergeSort.cu b/examples_ptx/mergeSort/mergeSort.cu index 92c0770c..9f3e6793 100644 --- a/examples_ptx/mergeSort/mergeSort.cu +++ b/examples_ptx/mergeSort/mergeSort.cu @@ -583,40 +583,56 @@ void mergeSort___export( assert(N <= SAMPLE_STRIDE * MAX_SAMPLE_COUNT); assert(N % (programCount*2) == 0); - mergeSortGang(iKey, iVal, srcKey, srcVal, N/(2*programCount)); + + // k20m: 140 M/s + { + // k20m: 2367 M/s + mergeSortGang(iKey, iVal, srcKey, srcVal, N/(2*programCount)); #if 1 - for (uniform int stride = 2*programCount; stride < N; stride <<= 1) - { - const uniform int lastSegmentElements = N % (2 * stride); + for (uniform int stride = 2*programCount; stride < N; stride <<= 1) + { + const uniform int lastSegmentElements = N % (2 * stride); - //Find sample ranks and prepare for limiters merge - generateSampleRanks(ranksA, ranksB, iKey, stride, N); - - //Merge ranks and indices - mergeRanksAndIndices(limitsA, limitsB, ranksA, ranksB, stride, N); - - //Merge elementary intervals - mergeElementaryIntervals(nTasks, oKey, oVal, iKey, iVal, limitsA, limitsB, stride, N); - - if (lastSegmentElements <= stride) - for (int i = programIndex; i < lastSegmentElements; i += programCount) - if (i < lastSegmentElements) + // k20m: 271 M/s + { +#if 1 + // k20m: 944 M/s { - oKey[N-lastSegmentElements+i] = iKey[N-lastSegmentElements+i]; - oVal[N-lastSegmentElements+i] = iVal[N-lastSegmentElements+i]; + // k20m: 1396 M/s + //Find sample ranks and prepare for limiters merge + generateSampleRanks(ranksA, ranksB, iKey, stride, N); + + // k20m: 2379 M/s + //Merge ranks and indices + 
mergeRanksAndIndices(limitsA, limitsB, ranksA, ranksB, stride, N); } +#endif + + // k20m: 371 M/s + //Merge elementary intervals + mergeElementaryIntervals(nTasks, oKey, oVal, iKey, iVal, limitsA, limitsB, stride, N); + } + + if (lastSegmentElements <= stride) + for (int i = programIndex; i < lastSegmentElements; i += programCount) + if (i < lastSegmentElements) + { + oKey[N-lastSegmentElements+i] = iKey[N-lastSegmentElements+i]; + oVal[N-lastSegmentElements+i] = iVal[N-lastSegmentElements+i]; + } - { - uniform Key_t * uniform tmpKey = iKey; - iKey = oKey; - oKey = tmpKey; - } - { - uniform Val_t * uniform tmpVal = iVal; - iVal = oVal; - oVal = tmpVal; + { + uniform Key_t * uniform tmpKey = iKey; + iKey = oKey; + oKey = tmpKey; + } + { + uniform Val_t * uniform tmpVal = iVal; + iVal = oVal; + oVal = tmpVal; + } } } #endif diff --git a/examples_ptx/mergeSort/mergeSort.ispc b/examples_ptx/mergeSort/mergeSort.ispc index a85c5cc8..10e0f6a2 100644 --- a/examples_ptx/mergeSort/mergeSort.ispc +++ b/examples_ptx/mergeSort/mergeSort.ispc @@ -687,40 +687,56 @@ void mergeSort( assert(N <= SAMPLE_STRIDE * MAX_SAMPLE_COUNT); assert(N % (programCount*2) == 0); - mergeSortGang(iKey, iVal, srcKey, srcVal, N/(2*programCount)); + + // cpu: 28 gpu: 74 M/s + { + // cpu: 356 gpu: 534 M/s + mergeSortGang(iKey, iVal, srcKey, srcVal, N/(2*programCount)); #if 1 - for (uniform int stride = 2*programCount; stride < N; stride <<= 1) - { - const uniform int lastSegmentElements = N % (2 * stride); + for (uniform int stride = 2*programCount; stride < N; stride <<= 1) + { + const uniform int lastSegmentElements = N % (2 * stride); - //Find sample ranks and prepare for limiters merge - generateSampleRanks(ranksA, ranksB, iKey, stride, N); - - //Merge ranks and indices - mergeRanksAndIndices(limitsA, limitsB, ranksA, ranksB, stride, N); - - //Merge elementary intervals - mergeElementaryIntervals(nTasks, oKey, oVal, iKey, iVal, limitsA, limitsB, stride, N); - - if (lastSegmentElements <= stride) - 
foreach (i = 0 ... lastSegmentElements) + // cpu: 30 gpu: 112 M/s { - oKey[N-lastSegmentElements+i] = iKey[N-lastSegmentElements+i]; - oVal[N-lastSegmentElements+i] = iVal[N-lastSegmentElements+i]; +#if 1 + // cpu: 121 gpu: 460 M/s + { + // cpu: 190 gpu: 600 M/s + //Find sample ranks and prepare for limiters merge + generateSampleRanks(ranksA, ranksB, iKey, stride, N); + + // cpu: 120 gpu: 457 M/s + //Merge ranks and indices + mergeRanksAndIndices(limitsA, limitsB, ranksA, ranksB, stride, N); + } +#endif + + // cpu: 287 gpu: 194 M/s + //Merge elementary intervals + mergeElementaryIntervals(nTasks, oKey, oVal, iKey, iVal, limitsA, limitsB, stride, N); } - memory_barrier(); + + if (lastSegmentElements <= stride) + foreach (i = 0 ... lastSegmentElements) + { + oKey[N-lastSegmentElements+i] = iKey[N-lastSegmentElements+i]; + oVal[N-lastSegmentElements+i] = iVal[N-lastSegmentElements+i]; + } + memory_barrier(); - { - uniform Key_t * uniform tmpKey = iKey; - iKey = oKey; - oKey = tmpKey; - } - { - uniform Val_t * uniform tmpVal = iVal; - iVal = oVal; - oVal = tmpVal; + { + uniform Key_t * uniform tmpKey = iKey; + iKey = oKey; + oKey = tmpKey; + } + { + uniform Val_t * uniform tmpVal = iVal; + iVal = oVal; + oVal = tmpVal; + } } } #endif