/* Copyright (c) 2014, Evghenii Gaburov All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Based on mergeSort from CUDA SDK */ #include "keyType.h" #define SAMPLE_STRIDE programCount #define iDivUp(a,b) (((a) + (b) - 1)/(b)) #define getSampleCount(dividend) (iDivUp((dividend), (SAMPLE_STRIDE))) #define W (/*sizeof(int)=*/4 * 8) static inline int nextPowerOfTwo(int x) { #if 0 --x; x |= x >> 1; x |= x >> 2; x |= x >> 4; x |= x >> 8; x |= x >> 16; return ++x; #else return 1U << (W - count_leading_zeros(x - 1)); #endif } static inline int binarySearchInclusiveRanks( const int val, uniform int *data, const int L, int stride) { cif (L == 0) return 0; int pos = 0; cfor (; stride > 0; stride >>= 1) { int newPos = min(pos + stride, L); cif (data[newPos - 1] <= val) pos = newPos; } return pos; } static inline int binarySearchExclusiveRanks( const int val, uniform int *data, const int L, int stride) { cif (L == 0) return 0; int pos = 0; cfor (; stride > 0; stride >>= 1) { int newPos = min(pos + stride, L); if (data[newPos - 1] < val) pos = newPos; } return pos; } static inline int binarySearchInclusive( const Key_t val, uniform Key_t *data, const int L, int stride) { cif (L == 0) return 0; int pos = 0; cfor (; stride > 0; stride >>= 1) { int newPos = min(pos + stride, L); if (data[newPos - 1] <= val) pos = newPos; } return pos; } static inline int binarySearchExclusive( const Key_t val, uniform Key_t *data, const int L, int stride) { cif (L == 0) return 0; int pos = 0; cfor (; stride > 0; stride >>= 1) { int newPos = min(pos + stride, L); if (data[newPos - 1] < val) pos = newPos; } return pos; } static inline int binarySearchInclusive1( const Key_t val, Key_t data, const uniform int L, uniform int stride) { if (L == 0) return 0; int pos = 0; for (; stride > 0; stride >>= 1) { int newPos = min(pos + stride, L); if (shuffle(data,newPos - 1) <= val) pos = newPos; } return pos; } static inline int binarySearchExclusive1( const Key_t val, Key_t data, const uniform int L, uniform int stride) { if (L == 0) return 0; int pos = 0; for (; stride > 0; stride >>= 1) { int newPos = min(pos + stride, L); if (shuffle(data,newPos - 1) < val) pos = newPos; } return pos; } //////////////////////////////////////////////////////////////////////////////// // Bottom-level merge sort (binary search-based) //////////////////////////////////////////////////////////////////////////////// task void mergeSortGangKernel( uniform int batchSize, uniform Key_t dstKey[], uniform Val_t dstVal[], uniform Key_t srcKey[], uniform Val_t srcVal[], uniform int arrayLength) { const uniform int blockIdx = taskIndex; const uniform int blockDim = (batchSize + taskCount - 1)/taskCount; const uniform int blockBeg = blockIdx * blockDim; const uniform int blockEnd = min(blockBeg + blockDim, batchSize); uniform Key_t s_key[2*programCount]; uniform Val_t s_val[2*programCount]; for (uniform int block = blockBeg; block < blockEnd; block++) { const uniform int base = block * (programCount*2); s_key[programIndex + 0] = srcKey[base + programIndex + 0]; s_val[programIndex + 0] = srcVal[base + programIndex + 0]; s_key[programIndex + programCount] = srcKey[base + programIndex + programCount]; s_val[programIndex + programCount] = srcVal[base + programIndex + programCount]; for (uniform int stride = 1; stride < arrayLength; stride <<= 1) { const int lPos = programIndex & (stride - 1); const int offset = 2 * (programIndex - lPos); uniform Key_t *baseKey = s_key + 2 * (programIndex - lPos); uniform Val_t *baseVal = s_val + 2 * (programIndex - lPos); Key_t keyA = baseKey[lPos + 0]; Val_t valA = baseVal[lPos + 0]; Key_t keyB = baseKey[lPos + stride]; Val_t valB = baseVal[lPos + stride]; int posA = binarySearchExclusive(keyA, baseKey + stride, stride, stride) + lPos; int posB = binarySearchInclusive(keyB, baseKey + 0, stride, stride) + lPos; baseKey[posA] = keyA; baseVal[posA] = valA; baseKey[posB] = keyB; baseVal[posB] = valB; } dstKey[base + programIndex + 0] = s_key[programIndex + 0]; dstVal[base + programIndex + 0] = s_val[programIndex + 0]; dstKey[base + programIndex + programCount] = s_key[programIndex + programCount]; dstVal[base + programIndex + programCount] = s_val[programIndex + programCount]; } } static inline void mergeSortGang( uniform Key_t dstKey[], uniform Val_t dstVal[], uniform Key_t srcKey[], uniform Val_t srcVal[], uniform int batchSize) { uniform int nTasks = num_cores()*4; #ifdef __NVPTX__ nTasks = iDivUp(batchSize,1); #endif launch [nTasks] mergeSortGangKernel(batchSize, dstKey, dstVal, srcKey, srcVal, 2*programCount); sync; } //////////////////////////////////////////////////////////////////////////////// // Merge step 1: generate sample ranks //////////////////////////////////////////////////////////////////////////////// task void generateSampleRanksKernel( uniform int nBlocks, uniform int in_ranksA[], uniform int in_ranksB[], uniform Key_t in_srcKey[], uniform int stride, uniform int N, uniform int totalProgramCount) { const uniform int blockIdx = taskIndex; const uniform int blockDim = (nBlocks + taskCount - 1)/taskCount; const uniform int blockBeg = blockIdx * blockDim; const uniform int blockEnd = min(blockBeg + blockDim, nBlocks); for (uniform int block = blockBeg; block < blockEnd; block++) { const int pos = block * programCount + programIndex; cif (pos >= totalProgramCount) return; const int i = pos & ((stride / SAMPLE_STRIDE) - 1); const int segmentBase = (pos - i) * (2 * SAMPLE_STRIDE); uniform Key_t * srcKey = in_srcKey + segmentBase; uniform int * ranksA = in_ranksA + segmentBase / SAMPLE_STRIDE; uniform int * ranksB = in_ranksB + segmentBase / SAMPLE_STRIDE; const int segmentElementsA = stride; const int segmentElementsB = min(stride, N - segmentBase - stride); const int segmentSamplesA = getSampleCount(segmentElementsA); const int segmentSamplesB = getSampleCount(segmentElementsB); if (i < segmentSamplesA) { ranksA[i] = i * SAMPLE_STRIDE; ranksB[i] = binarySearchExclusive( srcKey[i * SAMPLE_STRIDE], srcKey + stride, segmentElementsB, nextPowerOfTwo(segmentElementsB)); } if (i < segmentSamplesB) { ranksB[(stride / SAMPLE_STRIDE) + i] = i * SAMPLE_STRIDE; ranksA[(stride / SAMPLE_STRIDE) + i] = binarySearchInclusive( srcKey[stride + i * SAMPLE_STRIDE], srcKey + 0, segmentElementsA, nextPowerOfTwo(segmentElementsA)); } } } static inline void generateSampleRanks( uniform int ranksA[], uniform int ranksB[], uniform Key_t srcKey[], uniform int stride, uniform int N) { uniform int lastSegmentElements = N % (2 * stride); uniform int threadCount = (lastSegmentElements > stride) ? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) : (N - lastSegmentElements) / (2 * SAMPLE_STRIDE); uniform int nBlocks = iDivUp(threadCount, SAMPLE_STRIDE); uniform int nTasks = num_cores()*4; #ifdef __NVPTX__ nTasks = iDivUp(nBlocks,1); #endif launch [nTasks] generateSampleRanksKernel(nBlocks, ranksA, ranksB, srcKey, stride, N, threadCount); sync; } //////////////////////////////////////////////////////////////////////////////// // Merge step 2: generate sample ranks and indices //////////////////////////////////////////////////////////////////////////////// task void mergeRanksAndIndicesKernel( uniform int nBlocks, uniform int in_Limits[], uniform int in_Ranks[], uniform int stride, uniform int N, uniform int totalProgramCount) { const uniform int blockIdx = taskIndex; const uniform int blockDim = (nBlocks + taskCount - 1)/taskCount; const uniform int blockBeg = blockIdx * blockDim; const uniform int blockEnd = min(blockBeg + blockDim, nBlocks); for (uniform int block = blockBeg; block < blockEnd; block++) { int pos = block * programCount + programIndex; cif (pos >= totalProgramCount) return; const int i = pos & ((stride / SAMPLE_STRIDE) - 1); const int segmentBase = (pos - i) * (2 * SAMPLE_STRIDE); uniform int * ranks = in_Ranks + (pos - i) * 2; uniform int * limits = in_Limits + (pos - i) * 2; const int segmentElementsA = stride; const int segmentElementsB = min(stride, N - segmentBase - stride); const int segmentSamplesA = getSampleCount(segmentElementsA); const int segmentSamplesB = getSampleCount(segmentElementsB); if (i < segmentSamplesA) { int dstPos = binarySearchExclusiveRanks(ranks[i], ranks + segmentSamplesA, segmentSamplesB, nextPowerOfTwo(segmentSamplesB)) + i; limits[dstPos] = ranks[i]; } if (i < segmentSamplesB) { int dstPos = binarySearchInclusiveRanks(ranks[segmentSamplesA + i], ranks, segmentSamplesA, nextPowerOfTwo(segmentSamplesA)) + i; limits[dstPos] = ranks[segmentSamplesA + i]; } } } static inline void mergeRanksAndIndices( uniform int limitsA[], uniform int limitsB[], uniform int ranksA[], uniform int ranksB[], uniform int stride, uniform int N) { const uniform int lastSegmentElements = N % (2 * stride); const uniform int threadCount = (lastSegmentElements > stride) ? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) : (N - lastSegmentElements) / (2 * SAMPLE_STRIDE); const uniform int nBlocks = iDivUp(threadCount, SAMPLE_STRIDE); uniform int nTasks = num_cores()*4; #ifdef __NVPTX__ nTasks = iDivUp(nBlocks,1); #endif launch [nTasks] mergeRanksAndIndicesKernel( nBlocks, limitsA, ranksA, stride, N, threadCount); launch [nTasks] mergeRanksAndIndicesKernel( nBlocks, limitsB, ranksB, stride, N, threadCount); sync; } task void mergeElementaryIntervalsKernel( uniform int mergePairs, uniform Key_t dstKey[], uniform Val_t dstVal[], uniform Key_t srcKey[], uniform Val_t srcVal[], uniform int limitsA[], uniform int limitsB[], uniform int stride, uniform int N) { const uniform int blockIdx = taskIndex; const uniform int blockDim = (mergePairs + taskCount - 1)/taskCount; const uniform int blockBeg = blockIdx * blockDim; const uniform int blockEnd = min(blockBeg + blockDim, mergePairs); for (uniform int block = blockBeg; block < blockEnd; block++) { const int uniform intervalI = block & ((2 * stride) / SAMPLE_STRIDE - 1); const int uniform segmentBase = (block - intervalI) * SAMPLE_STRIDE; //Set up threadblock-wide parameters const uniform int segmentElementsA = stride; const uniform int segmentElementsB = min(stride, N - segmentBase - stride); const uniform int segmentSamplesA = getSampleCount(segmentElementsA); const uniform int segmentSamplesB = getSampleCount(segmentElementsB); const uniform int segmentSamples = segmentSamplesA + segmentSamplesB; const uniform int startSrcA = limitsA[block]; const uniform int startSrcB = limitsB[block]; const uniform int endSrcA = (intervalI + 1 < segmentSamples) ? limitsA[block + 1] : segmentElementsA; const uniform int endSrcB = (intervalI + 1 < segmentSamples) ? limitsB[block + 1] : segmentElementsB; const uniform int lenSrcA = endSrcA - startSrcA; const uniform int lenSrcB = endSrcB - startSrcB; const uniform int startDstA = startSrcA + startSrcB; const uniform int startDstB = startDstA + lenSrcA; //Load main input data Key_t keyA, keyB; Val_t valA, valB; if (programIndex < lenSrcA) { keyA = srcKey[segmentBase + startSrcA + programIndex]; valA = srcVal[segmentBase + startSrcA + programIndex]; } if (programIndex < lenSrcB) { keyB = srcKey[segmentBase + stride + startSrcB + programIndex]; valB = srcVal[segmentBase + stride + startSrcB + programIndex]; } // Compute destination addresses for merge data int dstPosA, dstPosB, dstA = -1, dstB = -1; if (programIndex < lenSrcA) dstPosA = binarySearchExclusive1(keyA, keyB, lenSrcB, SAMPLE_STRIDE) + programIndex; if (programIndex < lenSrcB) dstPosB = binarySearchInclusive1(keyB, keyA, lenSrcA, SAMPLE_STRIDE) + programIndex; if (programIndex < lenSrcA && dstPosA < lenSrcA) dstA = segmentBase + startDstA + dstPosA; dstPosA -= lenSrcA; if (programIndex < lenSrcA && dstPosA < lenSrcB) dstA = segmentBase + startDstB + dstPosA; if (programIndex < lenSrcB && dstPosB < lenSrcA) dstB = segmentBase + startDstA + dstPosB; dstPosB -= lenSrcA; if (programIndex < lenSrcB && dstPosB < lenSrcB) dstB = segmentBase + startDstB + dstPosB; if (dstA >= 0) { dstKey[dstA] = keyA; dstVal[dstA] = valA; } if (dstB >= 0) { dstKey[dstB] = keyB; dstVal[dstB] = valB; } } } static inline void mergeElementaryIntervals( uniform Key_t dstKey[], uniform Val_t dstVal[], uniform Key_t srcKey[], uniform Val_t srcVal[], uniform int limitsA[], uniform int limitsB[], uniform int stride, uniform int N) { const uniform int lastSegmentElements = N % (2 * stride); const uniform int mergePairs = (lastSegmentElements > stride) ? getSampleCount(N) : (N - lastSegmentElements) / SAMPLE_STRIDE; uniform int nTasks = num_cores()*4; #ifdef __NVPTX__ nTasks = iDivUp(mergePairs,1*programCount); #endif launch [nTasks] mergeElementaryIntervalsKernel( mergePairs, dstKey, dstVal, srcKey, srcVal, limitsA, limitsB, stride, N); if (lastSegmentElements <= stride) foreach (i = 0 ... lastSegmentElements) { dstKey[N-lastSegmentElements+i] = srcKey[N-lastSegmentElements+i]; dstVal[N-lastSegmentElements+i] = srcVal[N-lastSegmentElements+i]; } sync; } static uniform int * uniform memPool = NULL; static uniform int * uniform ranksA; static uniform int * uniform ranksB; static uniform int * uniform limitsA; static uniform int * uniform limitsB; static uniform int MAX_SAMPLE_COUNT = 0; export void openMergeSort() { MAX_SAMPLE_COUNT = 8*32 * 131072 / programCount; assert(memPool == NULL); const uniform int nalloc = MAX_SAMPLE_COUNT * 4; memPool = uniform new uniform int[nalloc]; ranksA = memPool; ranksB = ranksA + MAX_SAMPLE_COUNT; limitsA = ranksB + MAX_SAMPLE_COUNT; limitsB = limitsA + MAX_SAMPLE_COUNT; } export void closeMergeSort() { assert(memPool != NULL); delete memPool; memPool = NULL; } export void mergeSort( uniform Key_t dstKey[], uniform Val_t dstVal[], uniform Key_t bufKey[], uniform Val_t bufVal[], uniform Key_t srcKey[], uniform Val_t srcVal[], uniform int N) { uniform int stageCount = 0; for (uniform int stride = 2*programCount; stride < N; stride <<= 1, stageCount++); uniform Key_t * uniform iKey, * uniform oKey; uniform Val_t * uniform iVal, * uniform oVal; if (stageCount & 1) { iKey = bufKey; iVal = bufVal; oKey = dstKey; oVal = dstVal; } else { iKey = dstKey; iVal = dstVal; oKey = bufKey; oVal = bufVal; } assert(N <= SAMPLE_STRIDE * MAX_SAMPLE_COUNT); assert(N % (programCount*2) == 0); // cpu: 28 gpu: 74 M/s { // cpu: 356 gpu: 534 M/s mergeSortGang(iKey, iVal, srcKey, srcVal, N/(2*programCount)); #if 1 for (uniform int stride = 2*programCount; stride < N; stride <<= 1) { // cpu: 30 gpu: 112 M/s { #if 1 // cpu: 121 gpu: 460 M/s { // cpu: 190 gpu: 600 M/s //Find sample ranks and prepare for limiters merge generateSampleRanks(ranksA, ranksB, iKey, stride, N); // cpu: 120 gpu: 457 M/s //Merge ranks and indices mergeRanksAndIndices(limitsA, limitsB, ranksA, ranksB, stride, N); } #endif // cpu: 287 gpu: 194 M/s //Merge elementary intervals mergeElementaryIntervals(oKey, oVal, iKey, iVal, limitsA, limitsB, stride, N); } { uniform Key_t * uniform tmpKey = iKey; iKey = oKey; oKey = tmpKey; } { uniform Val_t * uniform tmpVal = iVal; iVal = oVal; oVal = tmpVal; } } #endif } }