diff --git a/examples_ptx/mergeSort/mergeSort.cu b/examples_ptx/mergeSort/mergeSort.cu index 39032279..b694e7b4 100644 --- a/examples_ptx/mergeSort/mergeSort.cu +++ b/examples_ptx/mergeSort/mergeSort.cu @@ -233,7 +233,7 @@ void mergeSortGang( uniform Val_t srcVal[], uniform int batchSize) { - uniform int nTasks = batchSize/4; + uniform int nTasks = batchSize; launch (nTasks,1,1,mergeSortGangKernel)(batchSize, dstKey, dstVal, srcKey, srcVal); sync; } @@ -306,7 +306,7 @@ void generateSampleRanks( (N - lastSegmentElements) / (2 * SAMPLE_STRIDE); uniform int nBlocks = iDivUp(threadCount, SAMPLE_STRIDE); - uniform int nTasks = nBlocks/4; + uniform int nTasks = nBlocks; launch (nTasks,1,1, generateSampleRanksKernel)(nBlocks, ranksA, ranksB, srcKey, stride, N, threadCount); sync; @@ -372,7 +372,7 @@ void mergeRanksAndIndices( (N - lastSegmentElements) / (2 * SAMPLE_STRIDE); const uniform int nBlocks = iDivUp(threadCount, SAMPLE_STRIDE); - uniform int nTasks = nBlocks/4; + uniform int nTasks = nBlocks; launch (nTasks,1,1,mergeRanksAndIndicesKernel)( nBlocks, @@ -448,20 +448,21 @@ void mergeElementaryIntervalsKernel( } // Compute destination addresses for merge data - int dstPosA = binarySearchExclusive1(keyA, keyB, lenSrcB, SAMPLE_STRIDE) + programIndex; - int dstPosB = binarySearchInclusive1(keyB, keyA, lenSrcA, SAMPLE_STRIDE) + programIndex; + int dstPosA, dstPosB, dstA = -1, dstB = -1; + if (any(programIndex < lenSrcA)) + dstPosA = binarySearchExclusive1(keyA, keyB, lenSrcB, SAMPLE_STRIDE) + programIndex; + if (any(programIndex < lenSrcB)) + dstPosB = binarySearchInclusive1(keyB, keyA, lenSrcA, SAMPLE_STRIDE) + programIndex; - - int dstA = -1, dstB = -1; if (programIndex < lenSrcA && dstPosA < lenSrcA) dstA = segmentBase + startDstA + dstPosA; - if (programIndex < lenSrcB && dstPosB < lenSrcA) - dstB = segmentBase + startDstA + dstPosB; - dstPosA -= lenSrcA; - dstPosB -= lenSrcA; if (programIndex < lenSrcA && dstPosA < lenSrcB) dstA = segmentBase + startDstB + dstPosA; + + if (programIndex < lenSrcB && dstPosB < lenSrcA) + dstB = segmentBase + startDstA + dstPosB; + dstPosB -= lenSrcA; if (programIndex < lenSrcB && dstPosB < lenSrcB) dstB = segmentBase + startDstB + dstPosB; @@ -499,7 +500,7 @@ void mergeElementaryIntervals( const uniform int mergePairs = (lastSegmentElements > stride) ? getSampleCount(N) : (N - lastSegmentElements) / SAMPLE_STRIDE; - nTasks = mergePairs/(4*programCount); + nTasks = mergePairs/(programCount); launch (nTasks,1,1, mergeElementaryIntervalsKernel)( mergePairs, diff --git a/examples_ptx/mergeSort/mergeSort.ispc b/examples_ptx/mergeSort/mergeSort.ispc index 04510c3d..cdbd27b4 100644 --- a/examples_ptx/mergeSort/mergeSort.ispc +++ b/examples_ptx/mergeSort/mergeSort.ispc @@ -221,7 +221,7 @@ void mergeSortGang( { uniform int nTasks = num_cores()*4; #ifdef __NVPTX__ - nTasks = iDivUp(batchSize,4); + nTasks = iDivUp(batchSize,1); #endif launch [nTasks] mergeSortGangKernel(batchSize, dstKey, dstVal, srcKey, srcVal, 2*programCount); sync; @@ -297,7 +297,7 @@ void generateSampleRanks( uniform int nBlocks = iDivUp(threadCount, SAMPLE_STRIDE); uniform int nTasks = num_cores()*4; #ifdef __NVPTX__ - nTasks = iDivUp(nBlocks,4); + nTasks = iDivUp(nBlocks,1); #endif launch [nTasks] generateSampleRanksKernel(nBlocks, ranksA, ranksB, srcKey, stride, N, threadCount); @@ -367,7 +367,7 @@ void mergeRanksAndIndices( uniform int nTasks = num_cores()*4; #ifdef __NVPTX__ - nTasks = iDivUp(nBlocks,4); + nTasks = iDivUp(nBlocks,1); #endif launch [nTasks] mergeRanksAndIndicesKernel( @@ -492,7 +492,7 @@ void mergeElementaryIntervals( uniform int nTasks = num_cores()*4; #ifdef __NVPTX__ - nTasks = iDivUp(mergePairs,4*programCount); + nTasks = iDivUp(mergePairs,1*programCount); #endif launch [nTasks] mergeElementaryIntervalsKernel(