From 23c6325cb6871b3f5586500068d8fd180ff02eeb Mon Sep 17 00:00:00 2001 From: Evghenii Date: Thu, 30 Jan 2014 08:27:43 +0100 Subject: [PATCH] some tuning --- examples_ptx/mergeSort/mergeSort.ispc | 90 +++++++++++++++++---------- 1 file changed, 58 insertions(+), 32 deletions(-) diff --git a/examples_ptx/mergeSort/mergeSort.ispc b/examples_ptx/mergeSort/mergeSort.ispc index 8f7eb5c7..8abdac1e 100644 --- a/examples_ptx/mergeSort/mergeSort.ispc +++ b/examples_ptx/mergeSort/mergeSort.ispc @@ -225,7 +225,6 @@ void generateSampleRanksKernel( uniform int totalProgramCount) { const int pos = taskIndex * programCount + programIndex; - assert(pos < totalProgramCount); const int i = pos & ((stride / SAMPLE_STRIDE) - 1); const int segmentBase = (pos - i) * (2 * SAMPLE_STRIDE); @@ -265,7 +264,6 @@ void generateSampleRanks( uniform int N) { uniform int lastSegmentElements = N % (2 * stride); - assert(lastSegmentElements == 0); uniform int threadCount = (lastSegmentElements > stride) ? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) : (N - lastSegmentElements) / (2 * SAMPLE_STRIDE); @@ -287,7 +285,6 @@ void mergeRanksAndIndicesKernel( uniform int totalProgramCount) { int pos = taskIndex * programCount + programIndex; - assert(pos < totalProgramCount); const int i = pos & ((stride / SAMPLE_STRIDE) - 1); const int segmentBase = (pos - i) * (2 * SAMPLE_STRIDE); @@ -321,7 +318,6 @@ void mergeRanksAndIndices( uniform int N) { const uniform int lastSegmentElements = N % (2 * stride); - assert(lastSegmentElements == 0); const uniform int threadCount = (lastSegmentElements > stride) ? 
(N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) : (N - lastSegmentElements) / (2 * SAMPLE_STRIDE); @@ -348,8 +344,8 @@ void mergeRanksAndIndices( static inline void merge( - uniform Key_t dstKey[], - uniform Val_t dstVal[], + int &dstPosA, + int &dstPosB, Key_t keyA, Val_t valA, Key_t keyB, Val_t valB, uniform int lenA, @@ -358,18 +354,10 @@ void merge( uniform int nPowTwoLenB) { if (programIndex < lenA) - { - const int dstPosA = binarySearchExclusive1(keyA, keyB, lenB, nPowTwoLenB) + programIndex; - dstKey[dstPosA] = keyA; - dstVal[dstPosA] = valA; - } + dstPosA = binarySearchExclusive1(keyA, keyB, lenB, nPowTwoLenB) + programIndex; if (programIndex < lenB) - { - const int dstPosB = binarySearchInclusive1(keyB, keyA, lenA, nPowTwoLenA) + programIndex; - dstKey[dstPosB] = keyB; - dstVal[dstPosB] = valB; - } + dstPosB = binarySearchInclusive1(keyB, keyA, lenA, nPowTwoLenA) + programIndex; } @@ -384,9 +372,6 @@ void mergeElementaryIntervalsKernel( uniform int stride, uniform int N) { - uniform Key_t s_key[2 * SAMPLE_STRIDE]; - uniform Val_t s_val[2 * SAMPLE_STRIDE]; - const int uniform intervalI = taskIndex & ((2 * stride) / SAMPLE_STRIDE - 1); const int uniform segmentBase = (taskIndex - intervalI) * SAMPLE_STRIDE; @@ -424,17 +409,33 @@ void mergeElementaryIntervalsKernel( } //Merge data in shared memory + int dstPosA, dstPosB; merge( - s_key, - s_val, + dstPosA, + dstPosB, keyA, valA, keyB, valB, lenSrcA, SAMPLE_STRIDE, lenSrcB, SAMPLE_STRIDE ); + //Store merged data +#if 0 + uniform Key_t s_key[2 * SAMPLE_STRIDE]; + uniform Val_t s_val[2 * SAMPLE_STRIDE]; + + if (programIndex < lenSrcA) + { + s_key[dstPosA] = keyA; + s_val[dstPosA] = valA; + } + if (programIndex < lenSrcB) + { + s_key[dstPosB] = keyB; + s_val[dstPosB] = valB; + } if (programIndex < lenSrcA) { dstKey[segmentBase + startDstA + programIndex] = s_key[programIndex]; @@ -446,6 +447,34 @@ void mergeElementaryIntervalsKernel( dstKey[segmentBase + startDstB + programIndex] = s_key[lenSrcA + 
programIndex]; dstVal[segmentBase + startDstB + programIndex] = s_val[lenSrcA + programIndex]; } +#else + int dstA, dstB; + dstA = dstB = -1; + if (programIndex < lenSrcA && dstPosA < lenSrcA) + dstA = segmentBase + startDstA + dstPosA; + if (programIndex < lenSrcB && dstPosB < lenSrcA) + dstB = segmentBase + startDstA + dstPosB; + + dstPosA -= lenSrcA; + dstPosB -= lenSrcA; + dstA = dstB = -1; + if (programIndex < lenSrcA && dstPosA < lenSrcB) + dstA = segmentBase + startDstB + dstPosA; + if (programIndex < lenSrcB && dstPosB < lenSrcB) + dstB = segmentBase + startDstB + dstPosB; + + if (dstA >= 0) + { + dstKey[dstA] = keyA; + dstVal[dstA] = valA; + } + if (dstB >= 0) + { + dstKey[dstB] = keyB; + dstVal[dstB] = valB; + } +#endif + } static inline @@ -544,7 +573,7 @@ void mergeSort( for (uniform int stride = 2*programCount; stride < N; stride <<= 1) { - uniform int lastSegmentElements = N % (2 * stride); + const uniform int lastSegmentElements = N % (2 * stride); //Find sample ranks and prepare for limiters merge generateSampleRanks(ranksA, ranksB, iKey, stride, N); @@ -556,26 +585,23 @@ void mergeSort( mergeElementaryIntervals(oKey, oVal, iKey, iVal, limitsA, limitsB, stride, N); if (lastSegmentElements <= stride) - { -#if 0 - //Last merge segment consists of a single array which just needs to be passed through - copyKernel(oKey + (N - lastSegmentElements), iKey + (N - lastSegmentElements), lastSegmentElements); - copyKernel(oVal + (N - lastSegmentElements), iVal + (N - lastSegmentElements), lastSegmentElements); -#endif - } + foreach (i = 0 ... lastSegmentElements) + { + oKey[N-lastSegmentElements+i] = iKey[N-lastSegmentElements+i]; + oVal[N-lastSegmentElements+i] = iVal[N-lastSegmentElements+i]; + } + memory_barrier(); + -#if 1 { uniform Key_t * uniform tmpKey = iKey; iKey = oKey; oKey = tmpKey; } - { uniform Val_t * uniform tmpVal = iVal; iVal = oVal; oVal = tmpVal; } -#endif } }