From eb57852f2e06fc09b0e92d27a0516e44129fb10d Mon Sep 17 00:00:00 2001 From: Evghenii Date: Thu, 30 Jan 2014 11:32:23 +0100 Subject: [PATCH] added cuda version --- examples_ptx/mergeSort/mergeSort.cu | 643 ++++++++++++++++++++++++++++ 1 file changed, 643 insertions(+) create mode 100644 examples_ptx/mergeSort/mergeSort.cu diff --git a/examples_ptx/mergeSort/mergeSort.cu b/examples_ptx/mergeSort/mergeSort.cu new file mode 100644 index 00000000..92c0770c --- /dev/null +++ b/examples_ptx/mergeSort/mergeSort.cu @@ -0,0 +1,643 @@ +#include "keyType.h" +#include "cuda_helpers.cuh" +#include + +#define uniform + +#define SAMPLE_STRIDE programCount + +#define iDivUp(a,b) (((a) + (b) - 1)/(b)) +#define getSampleCount(dividend) (iDivUp((dividend), (SAMPLE_STRIDE))) + +#define W (/*sizeof(int)=*/4 * 8) + +__device__ static inline +int nextPowerOfTwo(int x) +{ +#if 0 + --x; + x |= x >> 1; + x |= x >> 2; + x |= x >> 4; + x |= x >> 8; + x |= x >> 16; + return ++x; +#else + return 1U << (W - __clz(x - 1)); +#endif +} + + +__device__ static inline +int binarySearchInclusiveRanks( + const int val, + uniform int *data, + const int L, + int stride) +{ + if (L == 0) + return 0; + + int pos = 0; + for (; stride > 0; stride >>= 1) + { + int newPos = min(pos + stride, L); + + if (data[newPos - 1] <= val) + pos = newPos; + } + + return pos; +} + +__device__ static inline +int binarySearchExclusiveRanks( + const int val, + uniform int *data, + const int L, + int stride) +{ + if (L == 0) + return 0; + + int pos = 0; + for (; stride > 0; stride >>= 1) + { + int newPos = min(pos + stride, L); + + if (data[newPos - 1] < val) + pos = newPos; + } + + return pos; +} + +__device__ static inline +int binarySearchInclusive( + const Key_t val, + uniform Key_t *data, + const int L, + int stride) +{ + if (L == 0) + return 0; + + int pos = 0; + for (; stride > 0; stride >>= 1) + { + int newPos = min(pos + stride, L); + + if (data[newPos - 1] <= val) + pos = newPos; + } + + return pos; +} + +__device__ static inline +int binarySearchExclusive( + const Key_t val, + uniform Key_t *data, + const int L, + int stride) +{ + if (L == 0) + return 0; + + int pos = 0; + for (; stride > 0; stride >>= 1) + { + int newPos = min(pos + stride, L); + + if (data[newPos - 1] < val) + pos = newPos; + } + + return pos; +} + +__device__ static inline +int binarySearchInclusive1( + const Key_t val, + Key_t data, + const uniform int L, + uniform int stride) +{ + if (L == 0) + return 0; + + int pos = 0; + for (; stride > 0; stride >>= 1) + { + int newPos = min(pos + stride, L); + + if (shuffle(data,newPos - 1) <= val) + pos = newPos; + } + + return pos; +} + +__device__ static inline +int binarySearchExclusive1( + const Key_t val, + Key_t data, + const uniform int L, + uniform int stride) +{ + if (L == 0) + return 0; + + int pos = 0; + for (; stride > 0; stride >>= 1) + { + int newPos = min(pos + stride, L); + + if (shuffle(data,newPos - 1) < val) + pos = newPos; + } + + return pos; +} + +//////////////////////////////////////////////////////////////////////////////// +// Bottom-level merge sort (binary search-based) +//////////////////////////////////////////////////////////////////////////////// +__global__ +void mergeSortGangKernel( + uniform int batchSize, + uniform Key_t dstKey[], + uniform Val_t dstVal[], + uniform Key_t srcKey[], + uniform Val_t srcVal[]) +{ + const uniform int blkIdx = taskIndex; + const uniform int blkDim = (batchSize + taskCount - 1)/taskCount; + const uniform int blkBeg = blkIdx * blkDim; + const uniform int blkEnd = min(blkBeg + blkDim, batchSize); + + __shared__ Key_t s_key_tmp[2*programCount*4]; + __shared__ Val_t s_val_tmp[2*programCount*4]; + Key_t *s_key = s_key_tmp + warpIdx*(2*programCount); + Val_t *s_val = s_val_tmp + warpIdx*(2*programCount); + + for (uniform int blk = blkBeg; blk < blkEnd; blk++) + { + const uniform int base = blk * (programCount*2); + s_key[programIndex + 0] = srcKey[base + programIndex + 0]; + s_val[programIndex + 0] = srcVal[base + programIndex + 0]; + s_key[programIndex + programCount] = srcKey[base + programIndex + programCount]; + s_val[programIndex + programCount] = srcVal[base + programIndex + programCount]; + +#if 1 + for (uniform int stride = 1; stride < 2*programCount; stride <<= 1) + { + const int lPos = programIndex & (stride - 1); + uniform Key_t *baseKey = s_key + 2 * (programIndex - lPos); + uniform Val_t *baseVal = s_val + 2 * (programIndex - lPos); + + Key_t keyA = baseKey[lPos + 0]; + Val_t valA = baseVal[lPos + 0]; + Key_t keyB = baseKey[lPos + stride]; + Val_t valB = baseVal[lPos + stride]; + int posA = binarySearchExclusive(keyA, baseKey + stride, stride, stride) + lPos; + int posB = binarySearchInclusive(keyB, baseKey + 0, stride, stride) + lPos; + + baseKey[posA] = keyA; + baseVal[posA] = valA; + baseKey[posB] = keyB; + baseVal[posB] = valB; + } +#endif + + dstKey[base + programIndex + 0] = s_key[programIndex + 0]; + dstVal[base + programIndex + 0] = s_val[programIndex + 0]; + dstKey[base + programIndex + programCount] = s_key[programIndex + programCount]; + dstVal[base + programIndex + programCount] = s_val[programIndex + programCount]; + } +} + +__device__ static inline +void mergeSortGang( + uniform Key_t dstKey[], + uniform Val_t dstVal[], + uniform Key_t srcKey[], + uniform Val_t srcVal[], + uniform int batchSize) +{ + uniform int nTasks = batchSize/4; + launch (nTasks,1,1,mergeSortGangKernel)(batchSize, dstKey, dstVal, srcKey, srcVal); + sync; +} + +//////////////////////////////////////////////////////////////////////////////// +// Merge step 1: generate sample ranks +//////////////////////////////////////////////////////////////////////////////// +__global__ +void generateSampleRanksKernel( + uniform int nBlocks, + uniform int in_ranksA[], + uniform int in_ranksB[], + uniform Key_t in_srcKey[], + uniform int stride, + uniform int N, + uniform int totalProgramCount) +{ + const uniform int blkIdx = taskIndex; + const uniform int blkDim = (nBlocks + taskCount - 1)/taskCount; + const uniform int blkBeg = blkIdx * blkDim; + const uniform int blkEnd = min(blkBeg + blkDim, nBlocks); + + for (uniform int blk = blkBeg; blk < blkEnd; blk++) + { + const int pos = blk * programCount + programIndex; + cif (pos >= totalProgramCount) + return; + + const int i = pos & ((stride / SAMPLE_STRIDE) - 1); + const int segmentBase = (pos - i) * (2 * SAMPLE_STRIDE); + + uniform Key_t * srcKey = in_srcKey + segmentBase; + uniform int * ranksA = in_ranksA + segmentBase / SAMPLE_STRIDE; + uniform int * ranksB = in_ranksB + segmentBase / SAMPLE_STRIDE; + + const int segmentElementsA = stride; + const int segmentElementsB = min(stride, N - segmentBase - stride); + const int segmentSamplesA = getSampleCount(segmentElementsA); + const int segmentSamplesB = getSampleCount(segmentElementsB); + + if (i < segmentSamplesA) + { + ranksA[i] = i * SAMPLE_STRIDE; + ranksB[i] = binarySearchExclusive( + srcKey[i * SAMPLE_STRIDE], srcKey + stride, + segmentElementsB, nextPowerOfTwo(segmentElementsB)); + } + + if (i < segmentSamplesB) + { + ranksB[(stride / SAMPLE_STRIDE) + i] = i * SAMPLE_STRIDE; + ranksA[(stride / SAMPLE_STRIDE) + i] = binarySearchInclusive( + srcKey[stride + i * SAMPLE_STRIDE], srcKey + 0, + segmentElementsA, nextPowerOfTwo(segmentElementsA)); + } + } +} + +__device__ static inline +void generateSampleRanks( + uniform int ranksA[], + uniform int ranksB[], + uniform Key_t srcKey[], + uniform int stride, + uniform int N) +{ + uniform int lastSegmentElements = N % (2 * stride); + uniform int threadCount = (lastSegmentElements > stride) ? + (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) : + (N - lastSegmentElements) / (2 * SAMPLE_STRIDE); + + uniform int nBlocks = iDivUp(threadCount, SAMPLE_STRIDE); + uniform int nTasks = nBlocks/4; + + launch (nTasks,1,1, generateSampleRanksKernel)(nBlocks, ranksA, ranksB, srcKey, stride, N, threadCount); + sync; +} +//////////////////////////////////////////////////////////////////////////////// +// Merge step 2: generate sample ranks and indices +//////////////////////////////////////////////////////////////////////////////// +__global__ +void mergeRanksAndIndicesKernel( + uniform int nBlocks, + uniform int in_Limits[], + uniform int in_Ranks[], + uniform int stride, + uniform int N, + uniform int totalProgramCount) +{ + const uniform int blkIdx = taskIndex; + const uniform int blkDim = (nBlocks + taskCount - 1)/taskCount; + const uniform int blkBeg = blkIdx * blkDim; + const uniform int blkEnd = min(blkBeg + blkDim, nBlocks); + + for (uniform int blk = blkBeg; blk < blkEnd; blk++) + { + int pos = blk * programCount + programIndex; + cif (pos >= totalProgramCount) + return; + + const int i = pos & ((stride / SAMPLE_STRIDE) - 1); + const int segmentBase = (pos - i) * (2 * SAMPLE_STRIDE); + uniform int * ranks = in_Ranks + (pos - i) * 2; + uniform int * limits = in_Limits + (pos - i) * 2; + + const int segmentElementsA = stride; + const int segmentElementsB = min(stride, N - segmentBase - stride); + const int segmentSamplesA = getSampleCount(segmentElementsA); + const int segmentSamplesB = getSampleCount(segmentElementsB); + + if (i < segmentSamplesA) + { + int dstPos = binarySearchExclusiveRanks(ranks[i], ranks + segmentSamplesA, segmentSamplesB, nextPowerOfTwo(segmentSamplesB)) + i; + limits[dstPos] = ranks[i]; + } + + if (i < segmentSamplesB) + { + int dstPos = binarySearchInclusiveRanks(ranks[segmentSamplesA + i], ranks, segmentSamplesA, nextPowerOfTwo(segmentSamplesA)) + i; + limits[dstPos] = ranks[segmentSamplesA + i]; + } + } +} +__device__ static inline +void mergeRanksAndIndices( + uniform int limitsA[], + uniform int limitsB[], + uniform int ranksA[], + uniform int ranksB[], + uniform int stride, + uniform int N) +{ + const uniform int lastSegmentElements = N % (2 * stride); + const uniform int threadCount = (lastSegmentElements > stride) ? + (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) : + (N - lastSegmentElements) / (2 * SAMPLE_STRIDE); + + const uniform int nBlocks = iDivUp(threadCount, SAMPLE_STRIDE); + uniform int nTasks = nBlocks/4; + + launch (nTasks,1,1,mergeRanksAndIndicesKernel)( + nBlocks, + limitsA, + ranksA, + stride, + N, + threadCount); + launch (nTasks,1,1, mergeRanksAndIndicesKernel)( + nBlocks, + limitsB, + ranksB, + stride, + N, + threadCount); + sync; +} + + +__global__ +void mergeElementaryIntervalsKernel( + uniform int mergePairs, + uniform Key_t dstKey[], + uniform Val_t dstVal[], + uniform Key_t srcKey[], + uniform Val_t srcVal[], + uniform int limitsA[], + uniform int limitsB[], + uniform int stride, + uniform int N) +{ + const uniform int blkIdx = taskIndex; + const uniform int blkDim = (mergePairs + taskCount - 1)/taskCount; + const uniform int blkBeg = blkIdx * blkDim; + const uniform int blkEnd = min(blkBeg + blkDim, mergePairs); + + for (uniform int blk = blkBeg; blk < blkEnd; blk++) + { + const int uniform intervalI = blk & ((2 * stride) / SAMPLE_STRIDE - 1); + const int uniform segmentBase = (blk - intervalI) * SAMPLE_STRIDE; + + //Set up threadblk-wide parameters + + const uniform int segmentElementsA = stride; + const uniform int segmentElementsB = min(stride, N - segmentBase - stride); + const uniform int segmentSamplesA = getSampleCount(segmentElementsA); + const uniform int segmentSamplesB = getSampleCount(segmentElementsB); + const uniform int segmentSamples = segmentSamplesA + segmentSamplesB; + + const uniform int startSrcA = limitsA[blk]; + const uniform int startSrcB = limitsB[blk]; + const uniform int endSrcA = (intervalI + 1 < segmentSamples) ? limitsA[blk + 1] : segmentElementsA; + const uniform int endSrcB = (intervalI + 1 < segmentSamples) ? limitsB[blk + 1] : segmentElementsB; + const uniform int lenSrcA = endSrcA - startSrcA; + const uniform int lenSrcB = endSrcB - startSrcB; + const uniform int startDstA = startSrcA + startSrcB; + const uniform int startDstB = startDstA + lenSrcA; + + //Load main input data + + Key_t keyA, keyB; + Val_t valA, valB; + if (programIndex < lenSrcA) + { + keyA = srcKey[segmentBase + startSrcA + programIndex]; + valA = srcVal[segmentBase + startSrcA + programIndex]; + } + + if (programIndex < lenSrcB) + { + keyB = srcKey[segmentBase + stride + startSrcB + programIndex]; + valB = srcVal[segmentBase + stride + startSrcB + programIndex]; + } + + // Compute destination addresses for merge data + int dstPosA = binarySearchExclusive1(keyA, keyB, lenSrcB, SAMPLE_STRIDE) + programIndex; + int dstPosB = binarySearchInclusive1(keyB, keyA, lenSrcA, SAMPLE_STRIDE) + programIndex; + + + int dstA = -1, dstB = -1; + if (programIndex < lenSrcA && dstPosA < lenSrcA) + dstA = segmentBase + startDstA + dstPosA; + if (programIndex < lenSrcB && dstPosB < lenSrcA) + dstB = segmentBase + startDstA + dstPosB; + + dstPosA -= lenSrcA; + dstPosB -= lenSrcA; + if (programIndex < lenSrcA && dstPosA < lenSrcB) + dstA = segmentBase + startDstB + dstPosA; + if (programIndex < lenSrcB && dstPosB < lenSrcB) + dstB = segmentBase + startDstB + dstPosB; + + // store merge data + if (dstA >= 0) + { + // int dstA = segmentBase + startSrcA + programIndex; + dstKey[dstA] = keyA; + dstVal[dstA] = valA; + } + if (dstB >= 0) + { +// int dstB = segmentBase + stride + startSrcB + programIndex; + dstKey[dstB] = keyB; + dstVal[dstB] = valB; + } + } + +} + + +__device__ static inline +void mergeElementaryIntervals( + uniform int nTasks, + uniform Key_t dstKey[], + uniform Val_t dstVal[], + uniform Key_t srcKey[], + uniform Val_t srcVal[], + uniform int limitsA[], + uniform int limitsB[], + uniform int stride, + uniform int N) +{ + const uniform int lastSegmentElements = N % (2 * stride); + const uniform int mergePairs = (lastSegmentElements > stride) ? getSampleCount(N) : (N - lastSegmentElements) / SAMPLE_STRIDE; + + + nTasks = mergePairs/(4*programCount); + + launch (nTasks,1,1, mergeElementaryIntervalsKernel)( + mergePairs, + dstKey, + dstVal, + srcKey, + srcVal, + limitsA, + limitsB, + stride, + N); + sync; +} + +__device__ static uniform int * uniform memPool = NULL; +__device__ static uniform int * uniform ranksA; +__device__ static uniform int * uniform ranksB; +__device__ static uniform int * uniform limitsA; +__device__ static uniform int * uniform limitsB; +__device__ static uniform int nTasks; +__device__ static uniform int MAX_SAMPLE_COUNT = 0; + +__global__ +void openMergeSort___export() +{ + nTasks = 13*32*13; + MAX_SAMPLE_COUNT = 8*32 * 131072 / programCount; + assert(memPool == NULL); + const uniform int nalloc = MAX_SAMPLE_COUNT * 4; + memPool = uniform new uniform int[nalloc]; + ranksA = memPool; + ranksB = ranksA + MAX_SAMPLE_COUNT; + limitsA = ranksB + MAX_SAMPLE_COUNT; + limitsB = limitsA + MAX_SAMPLE_COUNT; +} +extern "C" +void openMergeSort() +{ + openMergeSort___export<<<1,1>>>(); + sync; +} + +__global__ +void closeMergeSort___export() +{ + assert(memPool != NULL); + delete memPool; + memPool = NULL; +} +extern "C" +void closeMergeSort() +{ + closeMergeSort___export<<<1,1>>>(); + sync; +} + +__global__ +void mergeSort___export( + uniform Key_t dstKey[], + uniform Val_t dstVal[], + uniform Key_t bufKey[], + uniform Val_t bufVal[], + uniform Key_t srcKey[], + uniform Val_t srcVal[], + uniform int N) +{ + uniform int stageCount = 0; + for (uniform int stride = 2*programCount; stride < N; stride <<= 1, stageCount++); + + uniform Key_t * uniform iKey, * uniform oKey; + uniform Val_t * uniform iVal, * uniform oVal; + + if (stageCount & 1) + { + iKey = bufKey; + iVal = bufVal; + oKey = dstKey; + oVal = dstVal; + } + else + { + iKey = dstKey; + iVal = dstVal; + oKey = bufKey; + oVal = bufVal; + } + + + + assert(N <= SAMPLE_STRIDE * MAX_SAMPLE_COUNT); + assert(N % (programCount*2) == 0); + mergeSortGang(iKey, iVal, srcKey, srcVal, N/(2*programCount)); + +#if 1 + for (uniform int stride = 2*programCount; stride < N; stride <<= 1) + { + const uniform int lastSegmentElements = N % (2 * stride); + + //Find sample ranks and prepare for limiters merge + generateSampleRanks(ranksA, ranksB, iKey, stride, N); + + //Merge ranks and indices + mergeRanksAndIndices(limitsA, limitsB, ranksA, ranksB, stride, N); + + //Merge elementary intervals + mergeElementaryIntervals(nTasks, oKey, oVal, iKey, iVal, limitsA, limitsB, stride, N); + + if (lastSegmentElements <= stride) + for (int i = programIndex; i < lastSegmentElements; i += programCount) + if (i < lastSegmentElements) + { + oKey[N-lastSegmentElements+i] = iKey[N-lastSegmentElements+i]; + oVal[N-lastSegmentElements+i] = iVal[N-lastSegmentElements+i]; + } + + + { + uniform Key_t * uniform tmpKey = iKey; + iKey = oKey; + oKey = tmpKey; + } + { + uniform Val_t * uniform tmpVal = iVal; + iVal = oVal; + oVal = tmpVal; + } + } +#endif +} +extern "C" +void mergeSort( + uniform Key_t dstKey[], + uniform Val_t dstVal[], + uniform Key_t bufKey[], + uniform Val_t bufVal[], + uniform Key_t srcKey[], + uniform Val_t srcVal[], + uniform int N) +{ + mergeSort___export<<<1,32>>>( + dstKey, + dstVal, + bufKey, + bufVal, + srcKey, + srcVal, + N); + sync; +}