added cuda version

This commit is contained in:
Evghenii
2014-01-30 11:32:23 +01:00
parent 6457241e66
commit eb57852f2e

View File

@@ -0,0 +1,643 @@
#include "keyType.h"
#include "cuda_helpers.cuh"
#include <cassert>
#define uniform
#define SAMPLE_STRIDE programCount
#define iDivUp(a,b) (((a) + (b) - 1)/(b))
#define getSampleCount(dividend) (iDivUp((dividend), (SAMPLE_STRIDE)))
#define W (/*sizeof(int)=*/4 * 8)
__device__ static inline
int nextPowerOfTwo(int x)
{
#if 0
--x;
x |= x >> 1;
x |= x >> 2;
x |= x >> 4;
x |= x >> 8;
x |= x >> 16;
return ++x;
#else
return 1U << (W - __clz(x - 1));
#endif
}
__device__ static inline
int binarySearchInclusiveRanks(
const int val,
uniform int *data,
const int L,
int stride)
{
if (L == 0)
return 0;
int pos = 0;
for (; stride > 0; stride >>= 1)
{
int newPos = min(pos + stride, L);
if (data[newPos - 1] <= val)
pos = newPos;
}
return pos;
}
__device__ static inline
int binarySearchExclusiveRanks(
const int val,
uniform int *data,
const int L,
int stride)
{
if (L == 0)
return 0;
int pos = 0;
for (; stride > 0; stride >>= 1)
{
int newPos = min(pos + stride, L);
if (data[newPos - 1] < val)
pos = newPos;
}
return pos;
}
__device__ static inline
int binarySearchInclusive(
const Key_t val,
uniform Key_t *data,
const int L,
int stride)
{
if (L == 0)
return 0;
int pos = 0;
for (; stride > 0; stride >>= 1)
{
int newPos = min(pos + stride, L);
if (data[newPos - 1] <= val)
pos = newPos;
}
return pos;
}
__device__ static inline
int binarySearchExclusive(
const Key_t val,
uniform Key_t *data,
const int L,
int stride)
{
if (L == 0)
return 0;
int pos = 0;
for (; stride > 0; stride >>= 1)
{
int newPos = min(pos + stride, L);
if (data[newPos - 1] < val)
pos = newPos;
}
return pos;
}
__device__ static inline
int binarySearchInclusive1(
const Key_t val,
Key_t data,
const uniform int L,
uniform int stride)
{
if (L == 0)
return 0;
int pos = 0;
for (; stride > 0; stride >>= 1)
{
int newPos = min(pos + stride, L);
if (shuffle(data,newPos - 1) <= val)
pos = newPos;
}
return pos;
}
__device__ static inline
int binarySearchExclusive1(
const Key_t val,
Key_t data,
const uniform int L,
uniform int stride)
{
if (L == 0)
return 0;
int pos = 0;
for (; stride > 0; stride >>= 1)
{
int newPos = min(pos + stride, L);
if (shuffle(data,newPos - 1) < val)
pos = newPos;
}
return pos;
}
////////////////////////////////////////////////////////////////////////////////
// Bottom-level merge sort (binary search-based)
////////////////////////////////////////////////////////////////////////////////
__global__
void mergeSortGangKernel(
uniform int batchSize,
uniform Key_t dstKey[],
uniform Val_t dstVal[],
uniform Key_t srcKey[],
uniform Val_t srcVal[])
{
const uniform int blkIdx = taskIndex;
const uniform int blkDim = (batchSize + taskCount - 1)/taskCount;
const uniform int blkBeg = blkIdx * blkDim;
const uniform int blkEnd = min(blkBeg + blkDim, batchSize);
__shared__ Key_t s_key_tmp[2*programCount*4];
__shared__ Val_t s_val_tmp[2*programCount*4];
Key_t *s_key = s_key_tmp + warpIdx*(2*programCount);
Val_t *s_val = s_val_tmp + warpIdx*(2*programCount);
for (uniform int blk = blkBeg; blk < blkEnd; blk++)
{
const uniform int base = blk * (programCount*2);
s_key[programIndex + 0] = srcKey[base + programIndex + 0];
s_val[programIndex + 0] = srcVal[base + programIndex + 0];
s_key[programIndex + programCount] = srcKey[base + programIndex + programCount];
s_val[programIndex + programCount] = srcVal[base + programIndex + programCount];
#if 1
for (uniform int stride = 1; stride < 2*programCount; stride <<= 1)
{
const int lPos = programIndex & (stride - 1);
uniform Key_t *baseKey = s_key + 2 * (programIndex - lPos);
uniform Val_t *baseVal = s_val + 2 * (programIndex - lPos);
Key_t keyA = baseKey[lPos + 0];
Val_t valA = baseVal[lPos + 0];
Key_t keyB = baseKey[lPos + stride];
Val_t valB = baseVal[lPos + stride];
int posA = binarySearchExclusive(keyA, baseKey + stride, stride, stride) + lPos;
int posB = binarySearchInclusive(keyB, baseKey + 0, stride, stride) + lPos;
baseKey[posA] = keyA;
baseVal[posA] = valA;
baseKey[posB] = keyB;
baseVal[posB] = valB;
}
#endif
dstKey[base + programIndex + 0] = s_key[programIndex + 0];
dstVal[base + programIndex + 0] = s_val[programIndex + 0];
dstKey[base + programIndex + programCount] = s_key[programIndex + programCount];
dstVal[base + programIndex + programCount] = s_val[programIndex + programCount];
}
}
__device__ static inline
void mergeSortGang(
uniform Key_t dstKey[],
uniform Val_t dstVal[],
uniform Key_t srcKey[],
uniform Val_t srcVal[],
uniform int batchSize)
{
uniform int nTasks = batchSize/4;
launch (nTasks,1,1,mergeSortGangKernel)(batchSize, dstKey, dstVal, srcKey, srcVal);
sync;
}
////////////////////////////////////////////////////////////////////////////////
// Merge step 1: generate sample ranks
////////////////////////////////////////////////////////////////////////////////
__global__
void generateSampleRanksKernel(
uniform int nBlocks,
uniform int in_ranksA[],
uniform int in_ranksB[],
uniform Key_t in_srcKey[],
uniform int stride,
uniform int N,
uniform int totalProgramCount)
{
const uniform int blkIdx = taskIndex;
const uniform int blkDim = (nBlocks + taskCount - 1)/taskCount;
const uniform int blkBeg = blkIdx * blkDim;
const uniform int blkEnd = min(blkBeg + blkDim, nBlocks);
for (uniform int blk = blkBeg; blk < blkEnd; blk++)
{
const int pos = blk * programCount + programIndex;
cif (pos >= totalProgramCount)
return;
const int i = pos & ((stride / SAMPLE_STRIDE) - 1);
const int segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);
uniform Key_t * srcKey = in_srcKey + segmentBase;
uniform int * ranksA = in_ranksA + segmentBase / SAMPLE_STRIDE;
uniform int * ranksB = in_ranksB + segmentBase / SAMPLE_STRIDE;
const int segmentElementsA = stride;
const int segmentElementsB = min(stride, N - segmentBase - stride);
const int segmentSamplesA = getSampleCount(segmentElementsA);
const int segmentSamplesB = getSampleCount(segmentElementsB);
if (i < segmentSamplesA)
{
ranksA[i] = i * SAMPLE_STRIDE;
ranksB[i] = binarySearchExclusive(
srcKey[i * SAMPLE_STRIDE], srcKey + stride,
segmentElementsB, nextPowerOfTwo(segmentElementsB));
}
if (i < segmentSamplesB)
{
ranksB[(stride / SAMPLE_STRIDE) + i] = i * SAMPLE_STRIDE;
ranksA[(stride / SAMPLE_STRIDE) + i] = binarySearchInclusive(
srcKey[stride + i * SAMPLE_STRIDE], srcKey + 0,
segmentElementsA, nextPowerOfTwo(segmentElementsA));
}
}
}
__device__ static inline
void generateSampleRanks(
uniform int ranksA[],
uniform int ranksB[],
uniform Key_t srcKey[],
uniform int stride,
uniform int N)
{
uniform int lastSegmentElements = N % (2 * stride);
uniform int threadCount = (lastSegmentElements > stride) ?
(N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) :
(N - lastSegmentElements) / (2 * SAMPLE_STRIDE);
uniform int nBlocks = iDivUp(threadCount, SAMPLE_STRIDE);
uniform int nTasks = nBlocks/4;
launch (nTasks,1,1, generateSampleRanksKernel)(nBlocks, ranksA, ranksB, srcKey, stride, N, threadCount);
sync;
}
////////////////////////////////////////////////////////////////////////////////
// Merge step 2: generate sample ranks and indices
////////////////////////////////////////////////////////////////////////////////
__global__
void mergeRanksAndIndicesKernel(
uniform int nBlocks,
uniform int in_Limits[],
uniform int in_Ranks[],
uniform int stride,
uniform int N,
uniform int totalProgramCount)
{
const uniform int blkIdx = taskIndex;
const uniform int blkDim = (nBlocks + taskCount - 1)/taskCount;
const uniform int blkBeg = blkIdx * blkDim;
const uniform int blkEnd = min(blkBeg + blkDim, nBlocks);
for (uniform int blk = blkBeg; blk < blkEnd; blk++)
{
int pos = blk * programCount + programIndex;
cif (pos >= totalProgramCount)
return;
const int i = pos & ((stride / SAMPLE_STRIDE) - 1);
const int segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);
uniform int * ranks = in_Ranks + (pos - i) * 2;
uniform int * limits = in_Limits + (pos - i) * 2;
const int segmentElementsA = stride;
const int segmentElementsB = min(stride, N - segmentBase - stride);
const int segmentSamplesA = getSampleCount(segmentElementsA);
const int segmentSamplesB = getSampleCount(segmentElementsB);
if (i < segmentSamplesA)
{
int dstPos = binarySearchExclusiveRanks(ranks[i], ranks + segmentSamplesA, segmentSamplesB, nextPowerOfTwo(segmentSamplesB)) + i;
limits[dstPos] = ranks[i];
}
if (i < segmentSamplesB)
{
int dstPos = binarySearchInclusiveRanks(ranks[segmentSamplesA + i], ranks, segmentSamplesA, nextPowerOfTwo(segmentSamplesA)) + i;
limits[dstPos] = ranks[segmentSamplesA + i];
}
}
}
__device__ static inline
void mergeRanksAndIndices(
uniform int limitsA[],
uniform int limitsB[],
uniform int ranksA[],
uniform int ranksB[],
uniform int stride,
uniform int N)
{
const uniform int lastSegmentElements = N % (2 * stride);
const uniform int threadCount = (lastSegmentElements > stride) ?
(N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) :
(N - lastSegmentElements) / (2 * SAMPLE_STRIDE);
const uniform int nBlocks = iDivUp(threadCount, SAMPLE_STRIDE);
uniform int nTasks = nBlocks/4;
launch (nTasks,1,1,mergeRanksAndIndicesKernel)(
nBlocks,
limitsA,
ranksA,
stride,
N,
threadCount);
launch (nTasks,1,1, mergeRanksAndIndicesKernel)(
nBlocks,
limitsB,
ranksB,
stride,
N,
threadCount);
sync;
}
__global__
void mergeElementaryIntervalsKernel(
uniform int mergePairs,
uniform Key_t dstKey[],
uniform Val_t dstVal[],
uniform Key_t srcKey[],
uniform Val_t srcVal[],
uniform int limitsA[],
uniform int limitsB[],
uniform int stride,
uniform int N)
{
const uniform int blkIdx = taskIndex;
const uniform int blkDim = (mergePairs + taskCount - 1)/taskCount;
const uniform int blkBeg = blkIdx * blkDim;
const uniform int blkEnd = min(blkBeg + blkDim, mergePairs);
for (uniform int blk = blkBeg; blk < blkEnd; blk++)
{
const int uniform intervalI = blk & ((2 * stride) / SAMPLE_STRIDE - 1);
const int uniform segmentBase = (blk - intervalI) * SAMPLE_STRIDE;
//Set up threadblk-wide parameters
const uniform int segmentElementsA = stride;
const uniform int segmentElementsB = min(stride, N - segmentBase - stride);
const uniform int segmentSamplesA = getSampleCount(segmentElementsA);
const uniform int segmentSamplesB = getSampleCount(segmentElementsB);
const uniform int segmentSamples = segmentSamplesA + segmentSamplesB;
const uniform int startSrcA = limitsA[blk];
const uniform int startSrcB = limitsB[blk];
const uniform int endSrcA = (intervalI + 1 < segmentSamples) ? limitsA[blk + 1] : segmentElementsA;
const uniform int endSrcB = (intervalI + 1 < segmentSamples) ? limitsB[blk + 1] : segmentElementsB;
const uniform int lenSrcA = endSrcA - startSrcA;
const uniform int lenSrcB = endSrcB - startSrcB;
const uniform int startDstA = startSrcA + startSrcB;
const uniform int startDstB = startDstA + lenSrcA;
//Load main input data
Key_t keyA, keyB;
Val_t valA, valB;
if (programIndex < lenSrcA)
{
keyA = srcKey[segmentBase + startSrcA + programIndex];
valA = srcVal[segmentBase + startSrcA + programIndex];
}
if (programIndex < lenSrcB)
{
keyB = srcKey[segmentBase + stride + startSrcB + programIndex];
valB = srcVal[segmentBase + stride + startSrcB + programIndex];
}
// Compute destination addresses for merge data
int dstPosA = binarySearchExclusive1(keyA, keyB, lenSrcB, SAMPLE_STRIDE) + programIndex;
int dstPosB = binarySearchInclusive1(keyB, keyA, lenSrcA, SAMPLE_STRIDE) + programIndex;
int dstA = -1, dstB = -1;
if (programIndex < lenSrcA && dstPosA < lenSrcA)
dstA = segmentBase + startDstA + dstPosA;
if (programIndex < lenSrcB && dstPosB < lenSrcA)
dstB = segmentBase + startDstA + dstPosB;
dstPosA -= lenSrcA;
dstPosB -= lenSrcA;
if (programIndex < lenSrcA && dstPosA < lenSrcB)
dstA = segmentBase + startDstB + dstPosA;
if (programIndex < lenSrcB && dstPosB < lenSrcB)
dstB = segmentBase + startDstB + dstPosB;
// store merge data
if (dstA >= 0)
{
// int dstA = segmentBase + startSrcA + programIndex;
dstKey[dstA] = keyA;
dstVal[dstA] = valA;
}
if (dstB >= 0)
{
// int dstB = segmentBase + stride + startSrcB + programIndex;
dstKey[dstB] = keyB;
dstVal[dstB] = valB;
}
}
}
__device__ static inline
void mergeElementaryIntervals(
uniform int nTasks,
uniform Key_t dstKey[],
uniform Val_t dstVal[],
uniform Key_t srcKey[],
uniform Val_t srcVal[],
uniform int limitsA[],
uniform int limitsB[],
uniform int stride,
uniform int N)
{
const uniform int lastSegmentElements = N % (2 * stride);
const uniform int mergePairs = (lastSegmentElements > stride) ? getSampleCount(N) : (N - lastSegmentElements) / SAMPLE_STRIDE;
nTasks = mergePairs/(4*programCount);
launch (nTasks,1,1, mergeElementaryIntervalsKernel)(
mergePairs,
dstKey,
dstVal,
srcKey,
srcVal,
limitsA,
limitsB,
stride,
N);
sync;
}
__device__ static uniform int * uniform memPool = NULL;
__device__ static uniform int * uniform ranksA;
__device__ static uniform int * uniform ranksB;
__device__ static uniform int * uniform limitsA;
__device__ static uniform int * uniform limitsB;
__device__ static uniform int nTasks;
__device__ static uniform int MAX_SAMPLE_COUNT = 0;
__global__
void openMergeSort___export()
{
nTasks = 13*32*13;
MAX_SAMPLE_COUNT = 8*32 * 131072 / programCount;
assert(memPool == NULL);
const uniform int nalloc = MAX_SAMPLE_COUNT * 4;
memPool = uniform new uniform int[nalloc];
ranksA = memPool;
ranksB = ranksA + MAX_SAMPLE_COUNT;
limitsA = ranksB + MAX_SAMPLE_COUNT;
limitsB = limitsA + MAX_SAMPLE_COUNT;
}
extern "C"
void openMergeSort()
{
openMergeSort___export<<<1,1>>>();
sync;
}
__global__
void closeMergeSort___export()
{
assert(memPool != NULL);
delete memPool;
memPool = NULL;
}
extern "C"
void closeMergeSort()
{
closeMergeSort___export<<<1,1>>>();
sync;
}
__global__
void mergeSort___export(
uniform Key_t dstKey[],
uniform Val_t dstVal[],
uniform Key_t bufKey[],
uniform Val_t bufVal[],
uniform Key_t srcKey[],
uniform Val_t srcVal[],
uniform int N)
{
uniform int stageCount = 0;
for (uniform int stride = 2*programCount; stride < N; stride <<= 1, stageCount++);
uniform Key_t * uniform iKey, * uniform oKey;
uniform Val_t * uniform iVal, * uniform oVal;
if (stageCount & 1)
{
iKey = bufKey;
iVal = bufVal;
oKey = dstKey;
oVal = dstVal;
}
else
{
iKey = dstKey;
iVal = dstVal;
oKey = bufKey;
oVal = bufVal;
}
assert(N <= SAMPLE_STRIDE * MAX_SAMPLE_COUNT);
assert(N % (programCount*2) == 0);
mergeSortGang(iKey, iVal, srcKey, srcVal, N/(2*programCount));
#if 1
for (uniform int stride = 2*programCount; stride < N; stride <<= 1)
{
const uniform int lastSegmentElements = N % (2 * stride);
//Find sample ranks and prepare for limiters merge
generateSampleRanks(ranksA, ranksB, iKey, stride, N);
//Merge ranks and indices
mergeRanksAndIndices(limitsA, limitsB, ranksA, ranksB, stride, N);
//Merge elementary intervals
mergeElementaryIntervals(nTasks, oKey, oVal, iKey, iVal, limitsA, limitsB, stride, N);
if (lastSegmentElements <= stride)
for (int i = programIndex; i < lastSegmentElements; i += programCount)
if (i < lastSegmentElements)
{
oKey[N-lastSegmentElements+i] = iKey[N-lastSegmentElements+i];
oVal[N-lastSegmentElements+i] = iVal[N-lastSegmentElements+i];
}
{
uniform Key_t * uniform tmpKey = iKey;
iKey = oKey;
oKey = tmpKey;
}
{
uniform Val_t * uniform tmpVal = iVal;
iVal = oVal;
oVal = tmpVal;
}
}
#endif
}
extern "C"
void mergeSort(
uniform Key_t dstKey[],
uniform Val_t dstVal[],
uniform Key_t bufKey[],
uniform Val_t bufVal[],
uniform Key_t srcKey[],
uniform Val_t srcVal[],
uniform int N)
{
mergeSort___export<<<1,32>>>(
dstKey,
dstVal,
bufKey,
bufVal,
srcKey,
srcVal,
N);
sync;
}