// Source: ispc/examples/portable/mergeSort/mergeSort.ispc
// (2014-07-09 08:26:54 +02:00, 659 lines, 18 KiB, plain text)

/*
Copyright (c) 2014, Evghenii Gaburov
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
Based on mergeSort from CUDA SDK
*/
#include "keyType.h"
#define SAMPLE_STRIDE programCount
#define iDivUp(a,b) (((a) + (b) - 1)/(b))
#define getSampleCount(dividend) (iDivUp((dividend), (SAMPLE_STRIDE)))
#define W (/*sizeof(int)=*/4 * 8)
static inline
int nextPowerOfTwo(int x)
{
#if 0
--x;
x |= x >> 1;
x |= x >> 2;
x |= x >> 4;
x |= x >> 8;
x |= x >> 16;
return ++x;
#else
return 1U << (W - count_leading_zeros(x - 1));
#endif
}
static inline
int binarySearchInclusiveRanks(
const int val,
uniform int *data,
const int L,
int stride)
{
cif (L == 0)
return 0;
int pos = 0;
cfor (; stride > 0; stride >>= 1)
{
int newPos = min(pos + stride, L);
cif (data[newPos - 1] <= val)
pos = newPos;
}
return pos;
}
static inline
int binarySearchExclusiveRanks(
const int val,
uniform int *data,
const int L,
int stride)
{
cif (L == 0)
return 0;
int pos = 0;
cfor (; stride > 0; stride >>= 1)
{
int newPos = min(pos + stride, L);
if (data[newPos - 1] < val)
pos = newPos;
}
return pos;
}
static inline
int binarySearchInclusive(
const Key_t val,
uniform Key_t *data,
const int L,
int stride)
{
cif (L == 0)
return 0;
int pos = 0;
cfor (; stride > 0; stride >>= 1)
{
int newPos = min(pos + stride, L);
if (data[newPos - 1] <= val)
pos = newPos;
}
return pos;
}
static inline
int binarySearchExclusive(
const Key_t val,
uniform Key_t *data,
const int L,
int stride)
{
cif (L == 0)
return 0;
int pos = 0;
cfor (; stride > 0; stride >>= 1)
{
int newPos = min(pos + stride, L);
if (data[newPos - 1] < val)
pos = newPos;
}
return pos;
}
static inline
int binarySearchInclusive1(
const Key_t val,
Key_t data,
const uniform int L,
uniform int stride)
{
if (L == 0)
return 0;
int pos = 0;
for (; stride > 0; stride >>= 1)
{
int newPos = min(pos + stride, L);
if (shuffle(data,newPos - 1) <= val)
pos = newPos;
}
return pos;
}
static inline
int binarySearchExclusive1(
const Key_t val,
Key_t data,
const uniform int L,
uniform int stride)
{
if (L == 0)
return 0;
int pos = 0;
for (; stride > 0; stride >>= 1)
{
int newPos = min(pos + stride, L);
if (shuffle(data,newPos - 1) < val)
pos = newPos;
}
return pos;
}
////////////////////////////////////////////////////////////////////////////////
// Bottom-level merge sort (binary search-based)
////////////////////////////////////////////////////////////////////////////////
task
void mergeSortGangKernel(
uniform int batchSize,
uniform Key_t dstKey[],
uniform Val_t dstVal[],
uniform Key_t srcKey[],
uniform Val_t srcVal[],
uniform int arrayLength)
{
const uniform int blockIdx = taskIndex;
const uniform int blockDim = (batchSize + taskCount - 1)/taskCount;
const uniform int blockBeg = blockIdx * blockDim;
const uniform int blockEnd = min(blockBeg + blockDim, batchSize);
uniform Key_t s_key[2*programCount];
uniform Val_t s_val[2*programCount];
for (uniform int block = blockBeg; block < blockEnd; block++)
{
const uniform int base = block * (programCount*2);
s_key[programIndex + 0] = srcKey[base + programIndex + 0];
s_val[programIndex + 0] = srcVal[base + programIndex + 0];
s_key[programIndex + programCount] = srcKey[base + programIndex + programCount];
s_val[programIndex + programCount] = srcVal[base + programIndex + programCount];
for (uniform int stride = 1; stride < arrayLength; stride <<= 1)
{
const int lPos = programIndex & (stride - 1);
const int offset = 2 * (programIndex - lPos);
uniform Key_t *baseKey = s_key + 2 * (programIndex - lPos);
uniform Val_t *baseVal = s_val + 2 * (programIndex - lPos);
Key_t keyA = baseKey[lPos + 0];
Val_t valA = baseVal[lPos + 0];
Key_t keyB = baseKey[lPos + stride];
Val_t valB = baseVal[lPos + stride];
int posA = binarySearchExclusive(keyA, baseKey + stride, stride, stride) + lPos;
int posB = binarySearchInclusive(keyB, baseKey + 0, stride, stride) + lPos;
baseKey[posA] = keyA;
baseVal[posA] = valA;
baseKey[posB] = keyB;
baseVal[posB] = valB;
}
dstKey[base + programIndex + 0] = s_key[programIndex + 0];
dstVal[base + programIndex + 0] = s_val[programIndex + 0];
dstKey[base + programIndex + programCount] = s_key[programIndex + programCount];
dstVal[base + programIndex + programCount] = s_val[programIndex + programCount];
}
}
static inline
void mergeSortGang(
uniform Key_t dstKey[],
uniform Val_t dstVal[],
uniform Key_t srcKey[],
uniform Val_t srcVal[],
uniform int batchSize)
{
uniform int nTasks = num_cores()*4;
#ifdef __NVPTX__
nTasks = iDivUp(batchSize,1);
#endif
launch [nTasks] mergeSortGangKernel(batchSize, dstKey, dstVal, srcKey, srcVal, 2*programCount);
sync;
}
////////////////////////////////////////////////////////////////////////////////
// Merge step 1: generate sample ranks
////////////////////////////////////////////////////////////////////////////////
task
void generateSampleRanksKernel(
uniform int nBlocks,
uniform int in_ranksA[],
uniform int in_ranksB[],
uniform Key_t in_srcKey[],
uniform int stride,
uniform int N,
uniform int totalProgramCount)
{
const uniform int blockIdx = taskIndex;
const uniform int blockDim = (nBlocks + taskCount - 1)/taskCount;
const uniform int blockBeg = blockIdx * blockDim;
const uniform int blockEnd = min(blockBeg + blockDim, nBlocks);
for (uniform int block = blockBeg; block < blockEnd; block++)
{
const int pos = block * programCount + programIndex;
cif (pos >= totalProgramCount)
return;
const int i = pos & ((stride / SAMPLE_STRIDE) - 1);
const int segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);
uniform Key_t * srcKey = in_srcKey + segmentBase;
uniform int * ranksA = in_ranksA + segmentBase / SAMPLE_STRIDE;
uniform int * ranksB = in_ranksB + segmentBase / SAMPLE_STRIDE;
const int segmentElementsA = stride;
const int segmentElementsB = min(stride, N - segmentBase - stride);
const int segmentSamplesA = getSampleCount(segmentElementsA);
const int segmentSamplesB = getSampleCount(segmentElementsB);
if (i < segmentSamplesA)
{
ranksA[i] = i * SAMPLE_STRIDE;
ranksB[i] = binarySearchExclusive(
srcKey[i * SAMPLE_STRIDE], srcKey + stride,
segmentElementsB, nextPowerOfTwo(segmentElementsB));
}
if (i < segmentSamplesB)
{
ranksB[(stride / SAMPLE_STRIDE) + i] = i * SAMPLE_STRIDE;
ranksA[(stride / SAMPLE_STRIDE) + i] = binarySearchInclusive(
srcKey[stride + i * SAMPLE_STRIDE], srcKey + 0,
segmentElementsA, nextPowerOfTwo(segmentElementsA));
}
}
}
static inline
void generateSampleRanks(
uniform int ranksA[],
uniform int ranksB[],
uniform Key_t srcKey[],
uniform int stride,
uniform int N)
{
uniform int lastSegmentElements = N % (2 * stride);
uniform int threadCount = (lastSegmentElements > stride) ?
(N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) :
(N - lastSegmentElements) / (2 * SAMPLE_STRIDE);
uniform int nBlocks = iDivUp(threadCount, SAMPLE_STRIDE);
uniform int nTasks = num_cores()*4;
#ifdef __NVPTX__
nTasks = iDivUp(nBlocks,1);
#endif
launch [nTasks] generateSampleRanksKernel(nBlocks, ranksA, ranksB, srcKey, stride, N, threadCount);
sync;
}
////////////////////////////////////////////////////////////////////////////////
// Merge step 2: generate sample ranks and indices
////////////////////////////////////////////////////////////////////////////////
task
void mergeRanksAndIndicesKernel(
uniform int nBlocks,
uniform int in_Limits[],
uniform int in_Ranks[],
uniform int stride,
uniform int N,
uniform int totalProgramCount)
{
const uniform int blockIdx = taskIndex;
const uniform int blockDim = (nBlocks + taskCount - 1)/taskCount;
const uniform int blockBeg = blockIdx * blockDim;
const uniform int blockEnd = min(blockBeg + blockDim, nBlocks);
for (uniform int block = blockBeg; block < blockEnd; block++)
{
int pos = block * programCount + programIndex;
cif (pos >= totalProgramCount)
return;
const int i = pos & ((stride / SAMPLE_STRIDE) - 1);
const int segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);
uniform int * ranks = in_Ranks + (pos - i) * 2;
uniform int * limits = in_Limits + (pos - i) * 2;
const int segmentElementsA = stride;
const int segmentElementsB = min(stride, N - segmentBase - stride);
const int segmentSamplesA = getSampleCount(segmentElementsA);
const int segmentSamplesB = getSampleCount(segmentElementsB);
if (i < segmentSamplesA)
{
int dstPos = binarySearchExclusiveRanks(ranks[i], ranks + segmentSamplesA, segmentSamplesB, nextPowerOfTwo(segmentSamplesB)) + i;
limits[dstPos] = ranks[i];
}
if (i < segmentSamplesB)
{
int dstPos = binarySearchInclusiveRanks(ranks[segmentSamplesA + i], ranks, segmentSamplesA, nextPowerOfTwo(segmentSamplesA)) + i;
limits[dstPos] = ranks[segmentSamplesA + i];
}
}
}
static inline
void mergeRanksAndIndices(
uniform int limitsA[],
uniform int limitsB[],
uniform int ranksA[],
uniform int ranksB[],
uniform int stride,
uniform int N)
{
const uniform int lastSegmentElements = N % (2 * stride);
const uniform int threadCount = (lastSegmentElements > stride) ?
(N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) :
(N - lastSegmentElements) / (2 * SAMPLE_STRIDE);
const uniform int nBlocks = iDivUp(threadCount, SAMPLE_STRIDE);
uniform int nTasks = num_cores()*4;
#ifdef __NVPTX__
nTasks = iDivUp(nBlocks,1);
#endif
launch [nTasks] mergeRanksAndIndicesKernel(
nBlocks,
limitsA,
ranksA,
stride,
N,
threadCount);
launch [nTasks] mergeRanksAndIndicesKernel(
nBlocks,
limitsB,
ranksB,
stride,
N,
threadCount);
sync;
}
task
void mergeElementaryIntervalsKernel(
uniform int mergePairs,
uniform Key_t dstKey[],
uniform Val_t dstVal[],
uniform Key_t srcKey[],
uniform Val_t srcVal[],
uniform int limitsA[],
uniform int limitsB[],
uniform int stride,
uniform int N)
{
const uniform int blockIdx = taskIndex;
const uniform int blockDim = (mergePairs + taskCount - 1)/taskCount;
const uniform int blockBeg = blockIdx * blockDim;
const uniform int blockEnd = min(blockBeg + blockDim, mergePairs);
for (uniform int block = blockBeg; block < blockEnd; block++)
{
const int uniform intervalI = block & ((2 * stride) / SAMPLE_STRIDE - 1);
const int uniform segmentBase = (block - intervalI) * SAMPLE_STRIDE;
//Set up threadblock-wide parameters
const uniform int segmentElementsA = stride;
const uniform int segmentElementsB = min(stride, N - segmentBase - stride);
const uniform int segmentSamplesA = getSampleCount(segmentElementsA);
const uniform int segmentSamplesB = getSampleCount(segmentElementsB);
const uniform int segmentSamples = segmentSamplesA + segmentSamplesB;
const uniform int startSrcA = limitsA[block];
const uniform int startSrcB = limitsB[block];
const uniform int endSrcA = (intervalI + 1 < segmentSamples) ? limitsA[block + 1] : segmentElementsA;
const uniform int endSrcB = (intervalI + 1 < segmentSamples) ? limitsB[block + 1] : segmentElementsB;
const uniform int lenSrcA = endSrcA - startSrcA;
const uniform int lenSrcB = endSrcB - startSrcB;
const uniform int startDstA = startSrcA + startSrcB;
const uniform int startDstB = startDstA + lenSrcA;
//Load main input data
Key_t keyA, keyB;
Val_t valA, valB;
if (programIndex < lenSrcA)
{
keyA = srcKey[segmentBase + startSrcA + programIndex];
valA = srcVal[segmentBase + startSrcA + programIndex];
}
if (programIndex < lenSrcB)
{
keyB = srcKey[segmentBase + stride + startSrcB + programIndex];
valB = srcVal[segmentBase + stride + startSrcB + programIndex];
}
// Compute destination addresses for merge data
int dstPosA, dstPosB, dstA = -1, dstB = -1;
if (programIndex < lenSrcA)
dstPosA = binarySearchExclusive1(keyA, keyB, lenSrcB, SAMPLE_STRIDE) + programIndex;
if (programIndex < lenSrcB)
dstPosB = binarySearchInclusive1(keyB, keyA, lenSrcA, SAMPLE_STRIDE) + programIndex;
if (programIndex < lenSrcA && dstPosA < lenSrcA)
dstA = segmentBase + startDstA + dstPosA;
dstPosA -= lenSrcA;
if (programIndex < lenSrcA && dstPosA < lenSrcB)
dstA = segmentBase + startDstB + dstPosA;
if (programIndex < lenSrcB && dstPosB < lenSrcA)
dstB = segmentBase + startDstA + dstPosB;
dstPosB -= lenSrcA;
if (programIndex < lenSrcB && dstPosB < lenSrcB)
dstB = segmentBase + startDstB + dstPosB;
if (dstA >= 0)
{
dstKey[dstA] = keyA;
dstVal[dstA] = valA;
}
if (dstB >= 0)
{
dstKey[dstB] = keyB;
dstVal[dstB] = valB;
}
}
}
static inline
void mergeElementaryIntervals(
uniform Key_t dstKey[],
uniform Val_t dstVal[],
uniform Key_t srcKey[],
uniform Val_t srcVal[],
uniform int limitsA[],
uniform int limitsB[],
uniform int stride,
uniform int N)
{
const uniform int lastSegmentElements = N % (2 * stride);
const uniform int mergePairs = (lastSegmentElements > stride) ? getSampleCount(N) : (N - lastSegmentElements) / SAMPLE_STRIDE;
uniform int nTasks = num_cores()*4;
#ifdef __NVPTX__
nTasks = iDivUp(mergePairs,1*programCount);
#endif
launch [nTasks] mergeElementaryIntervalsKernel(
mergePairs,
dstKey,
dstVal,
srcKey,
srcVal,
limitsA,
limitsB,
stride,
N);
if (lastSegmentElements <= stride)
foreach (i = 0 ... lastSegmentElements)
{
dstKey[N-lastSegmentElements+i] = srcKey[N-lastSegmentElements+i];
dstVal[N-lastSegmentElements+i] = srcVal[N-lastSegmentElements+i];
}
sync;
}
// Scratch storage shared by the merge steps. memPool is the single
// allocation (made in openMergeSort, released in closeMergeSort); NULL means
// no pool is currently allocated.
static uniform int * uniform memPool = NULL;
// Four consecutive MAX_SAMPLE_COUNT-sized slices of memPool, assigned in
// openMergeSort: sample ranks for run A / run B, and the merged limits.
static uniform int * uniform ranksA;
static uniform int * uniform ranksB;
static uniform int * uniform limitsA;
static uniform int * uniform limitsB;
// Capacity, in samples, of each slice above; set by openMergeSort.
static uniform int MAX_SAMPLE_COUNT = 0;
export
void openMergeSort()
{
MAX_SAMPLE_COUNT = 8*32 * 131072 / programCount;
assert(memPool == NULL);
const uniform int nalloc = MAX_SAMPLE_COUNT * 4;
memPool = uniform new uniform int[nalloc];
ranksA = memPool;
ranksB = ranksA + MAX_SAMPLE_COUNT;
limitsA = ranksB + MAX_SAMPLE_COUNT;
limitsB = limitsA + MAX_SAMPLE_COUNT;
}
export
void closeMergeSort()
{
assert(memPool != NULL);
delete memPool;
memPool = NULL;
}
export
void mergeSort(
uniform Key_t dstKey[],
uniform Val_t dstVal[],
uniform Key_t bufKey[],
uniform Val_t bufVal[],
uniform Key_t srcKey[],
uniform Val_t srcVal[],
uniform int N)
{
uniform int stageCount = 0;
for (uniform int stride = 2*programCount; stride < N; stride <<= 1, stageCount++);
uniform Key_t * uniform iKey, * uniform oKey;
uniform Val_t * uniform iVal, * uniform oVal;
if (stageCount & 1)
{
iKey = bufKey;
iVal = bufVal;
oKey = dstKey;
oVal = dstVal;
}
else
{
iKey = dstKey;
iVal = dstVal;
oKey = bufKey;
oVal = bufVal;
}
assert(N <= SAMPLE_STRIDE * MAX_SAMPLE_COUNT);
assert(N % (programCount*2) == 0);
// cpu: 28 gpu: 74 M/s
{
// cpu: 356 gpu: 534 M/s
mergeSortGang(iKey, iVal, srcKey, srcVal, N/(2*programCount));
#if 1
for (uniform int stride = 2*programCount; stride < N; stride <<= 1)
{
// cpu: 30 gpu: 112 M/s
{
#if 1
// cpu: 121 gpu: 460 M/s
{
// cpu: 190 gpu: 600 M/s
//Find sample ranks and prepare for limiters merge
generateSampleRanks(ranksA, ranksB, iKey, stride, N);
// cpu: 120 gpu: 457 M/s
//Merge ranks and indices
mergeRanksAndIndices(limitsA, limitsB, ranksA, ranksB, stride, N);
}
#endif
// cpu: 287 gpu: 194 M/s
//Merge elementary intervals
mergeElementaryIntervals(oKey, oVal, iKey, iVal, limitsA, limitsB, stride, N);
}
{
uniform Key_t * uniform tmpKey = iKey;
iKey = oKey;
oKey = tmpKey;
}
{
uniform Val_t * uniform tmpVal = iVal;
iVal = oVal;
oVal = tmpVal;
}
}
#endif
}
}