added cuda version

2014-01-30 11:32:23 +01:00
parent 6457241e66
commit eb57852f2e
1 changed files with 643 additions and 0 deletions
--- a/examples_ptx/mergeSort/mergeSort.cu
+++ b/examples_ptx/mergeSort/mergeSort.cu
@@ -0,0 +1,643 @@
 #include "keyType.h"
 #include "cuda_helpers.cuh"
 #include <cassert>
 #define uniform
 #define SAMPLE_STRIDE programCount
 #define iDivUp(a,b) (((a) + (b) - 1)/(b))
 #define getSampleCount(dividend) (iDivUp((dividend), (SAMPLE_STRIDE)))
 #define W (/*sizeof(int)=*/4 * 8)
 __device__ static inline
 int nextPowerOfTwo(int x)
 {
 #if 0
  --x;
  x |= x >> 1;
  x |= x >> 2;
  x |= x >> 4;
  x |= x >> 8;
  x |= x >> 16;
  return ++x;
 #else
  return 1U << (W - __clz(x - 1));
 #endif
 }
 __device__ static inline
 int binarySearchInclusiveRanks(
    const int val, 
    uniform int *data,
    const int L, 
    int stride)
 {
  if (L == 0)
    return 0;
  int pos = 0;
  for (; stride > 0; stride >>= 1)
  {
    int newPos = min(pos + stride, L);
    if (data[newPos - 1] <= val)
      pos = newPos;
  }
  return pos;
 }
 __device__ static inline
 int binarySearchExclusiveRanks(
    const int val, 
    uniform int *data, 
    const int L, 
    int stride)
 {
  if (L == 0)
    return 0;
  int pos = 0;
  for (; stride > 0; stride >>= 1)
  {
    int newPos = min(pos + stride, L);
    if (data[newPos - 1] < val)
      pos = newPos;
  }
  return pos;
 }
 __device__ static inline
 int binarySearchInclusive(
    const Key_t val, 
    uniform Key_t *data,
    const int L, 
    int stride)
 {
  if (L == 0)
    return 0;
  int pos = 0;
  for (; stride > 0; stride >>= 1)
  {
    int newPos = min(pos + stride, L);
    if (data[newPos - 1] <= val)
      pos = newPos;
  }
  return pos;
 }
 __device__ static inline
 int binarySearchExclusive(
    const Key_t val, 
    uniform Key_t *data, 
    const int L, 
    int stride)
 {
  if (L == 0)
    return 0;
  int pos = 0;
  for (; stride > 0; stride >>= 1)
  {
    int newPos = min(pos + stride, L);
    if (data[newPos - 1] < val)
      pos = newPos;
  }
  return pos;
 }
 __device__ static inline
 int binarySearchInclusive1(
    const Key_t val, 
    Key_t data,
    const uniform int L, 
    uniform int stride)
 {
  if (L == 0)
    return 0;
  int pos = 0;
  for (; stride > 0; stride >>= 1)
  {
    int newPos = min(pos + stride, L);
    if (shuffle(data,newPos - 1) <= val)
      pos = newPos;
  }
  return pos;
 }
 __device__ static inline
 int binarySearchExclusive1(
    const Key_t val, 
    Key_t data, 
    const uniform int L, 
    uniform int stride)
 {
  if (L == 0)
    return 0;
  int pos = 0;
  for (; stride > 0; stride >>= 1)
  {
    int newPos = min(pos + stride, L);
    if (shuffle(data,newPos - 1) < val)
      pos = newPos;
  }
  return pos;
 }
 ////////////////////////////////////////////////////////////////////////////////
 // Bottom-level merge sort (binary search-based)
 ////////////////////////////////////////////////////////////////////////////////
 __global__
 void mergeSortGangKernel(
    uniform int batchSize,
    uniform Key_t dstKey[],
    uniform Val_t dstVal[],
    uniform Key_t srcKey[],
    uniform Val_t srcVal[])
 {
  const uniform int blkIdx = taskIndex;
  const uniform int blkDim = (batchSize + taskCount - 1)/taskCount;
  const uniform int blkBeg =     blkIdx * blkDim;
  const uniform int blkEnd = min(blkBeg + blkDim, batchSize);
  __shared__ Key_t s_key_tmp[2*programCount*4];
  __shared__ Val_t s_val_tmp[2*programCount*4];
  Key_t *s_key = s_key_tmp + warpIdx*(2*programCount);
  Val_t *s_val = s_val_tmp + warpIdx*(2*programCount);
  for (uniform int blk = blkBeg; blk < blkEnd; blk++)
  {
    const uniform int base = blk * (programCount*2);
    s_key[programIndex +            0] = srcKey[base + programIndex +            0];
    s_val[programIndex +            0] = srcVal[base + programIndex +            0];
    s_key[programIndex + programCount] = srcKey[base + programIndex + programCount];
    s_val[programIndex + programCount] = srcVal[base + programIndex + programCount];
 #if 1
    for (uniform int stride = 1; stride < 2*programCount; stride <<= 1)
    {
      const int lPos = programIndex & (stride - 1);
      uniform Key_t *baseKey = s_key + 2 * (programIndex - lPos);
      uniform Val_t *baseVal = s_val + 2 * (programIndex - lPos);
      Key_t keyA = baseKey[lPos +      0];
      Val_t valA = baseVal[lPos +      0];
      Key_t keyB = baseKey[lPos + stride];
      Val_t valB = baseVal[lPos + stride];
      int posA = binarySearchExclusive(keyA, baseKey + stride, stride, stride) + lPos;
      int posB = binarySearchInclusive(keyB, baseKey +      0, stride, stride) + lPos;
      baseKey[posA] = keyA;
      baseVal[posA] = valA;
      baseKey[posB] = keyB;
      baseVal[posB] = valB;
    }
 #endif
    dstKey[base + programIndex +            0] = s_key[programIndex +            0];
    dstVal[base + programIndex +            0] = s_val[programIndex +            0];
    dstKey[base + programIndex + programCount] = s_key[programIndex + programCount];
    dstVal[base + programIndex + programCount] = s_val[programIndex + programCount];
  }
 }
 __device__ static inline
 void mergeSortGang(
    uniform Key_t dstKey[],
    uniform Val_t dstVal[],
    uniform Key_t srcKey[],
    uniform Val_t srcVal[],
    uniform int batchSize)
 {
  uniform int nTasks = batchSize/4;
  launch (nTasks,1,1,mergeSortGangKernel)(batchSize, dstKey, dstVal, srcKey, srcVal);
  sync;
 }
 ////////////////////////////////////////////////////////////////////////////////
 // Merge step 1: generate sample ranks
 ////////////////////////////////////////////////////////////////////////////////
 __global__
 void generateSampleRanksKernel(
    uniform int nBlocks,
    uniform int in_ranksA[],
    uniform int in_ranksB[],
    uniform Key_t in_srcKey[],
    uniform int stride,
    uniform int N,
    uniform int totalProgramCount)
 {
  const uniform int blkIdx = taskIndex;
  const uniform int blkDim = (nBlocks + taskCount - 1)/taskCount;
  const uniform int blkBeg =     blkIdx * blkDim;
  const uniform int blkEnd = min(blkBeg + blkDim, nBlocks);
  for (uniform int blk = blkBeg; blk < blkEnd; blk++)
  {
    const int pos = blk * programCount + programIndex;
    cif (pos >= totalProgramCount)
      return;
    const int           i = pos & ((stride / SAMPLE_STRIDE) - 1);
    const int segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);
    uniform Key_t * srcKey = in_srcKey + segmentBase;
    uniform int * ranksA = in_ranksA + segmentBase / SAMPLE_STRIDE;
    uniform int * ranksB = in_ranksB + segmentBase / SAMPLE_STRIDE;
    const int segmentElementsA = stride;
    const int segmentElementsB = min(stride, N - segmentBase - stride);
    const int  segmentSamplesA = getSampleCount(segmentElementsA);
    const int  segmentSamplesB = getSampleCount(segmentElementsB);
    if (i < segmentSamplesA)
    {
      ranksA[i] = i * SAMPLE_STRIDE;
      ranksB[i] = binarySearchExclusive(
          srcKey[i * SAMPLE_STRIDE], srcKey + stride,
          segmentElementsB, nextPowerOfTwo(segmentElementsB));
    }
    if (i < segmentSamplesB)
    {
      ranksB[(stride / SAMPLE_STRIDE) + i] = i * SAMPLE_STRIDE;
      ranksA[(stride / SAMPLE_STRIDE) + i] = binarySearchInclusive(
          srcKey[stride + i * SAMPLE_STRIDE], srcKey + 0,
          segmentElementsA, nextPowerOfTwo(segmentElementsA));
    }
  }
 }
 __device__ static inline
 void generateSampleRanks(
    uniform int ranksA[],
    uniform int ranksB[],
    uniform Key_t srcKey[],
    uniform int stride,
    uniform int N)
 {
  uniform int lastSegmentElements = N % (2 * stride);
  uniform int threadCount = (lastSegmentElements > stride) ? 
    (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) : 
    (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);
  uniform int nBlocks = iDivUp(threadCount, SAMPLE_STRIDE);
  uniform int nTasks = nBlocks/4;
  launch (nTasks,1,1, generateSampleRanksKernel)(nBlocks, ranksA, ranksB, srcKey, stride, N, threadCount);
  sync;
 }
 ////////////////////////////////////////////////////////////////////////////////
 // Merge step 2: generate sample ranks and indices
 ////////////////////////////////////////////////////////////////////////////////
 __global__ 
 void mergeRanksAndIndicesKernel(
    uniform int nBlocks,
    uniform int in_Limits[],
    uniform int in_Ranks[],
    uniform int stride,
    uniform int N,
    uniform int totalProgramCount)
 {
  const uniform int blkIdx = taskIndex;
  const uniform int blkDim = (nBlocks + taskCount - 1)/taskCount;
  const uniform int blkBeg =     blkIdx * blkDim;
  const uniform int blkEnd = min(blkBeg + blkDim, nBlocks);
  for (uniform int blk = blkBeg; blk < blkEnd; blk++)
  {
    int pos = blk * programCount + programIndex;
    cif (pos >= totalProgramCount)
      return;
    const int           i = pos & ((stride / SAMPLE_STRIDE) - 1);
    const int segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);
    uniform int *  ranks = in_Ranks  + (pos - i) * 2;
    uniform int * limits = in_Limits + (pos - i) * 2;
    const int segmentElementsA = stride;
    const int segmentElementsB = min(stride, N - segmentBase - stride);
    const int  segmentSamplesA = getSampleCount(segmentElementsA);
    const int  segmentSamplesB = getSampleCount(segmentElementsB);
    if (i < segmentSamplesA)
    {
      int dstPos = binarySearchExclusiveRanks(ranks[i], ranks + segmentSamplesA, segmentSamplesB, nextPowerOfTwo(segmentSamplesB)) + i;
      limits[dstPos] = ranks[i];
    }
    if (i < segmentSamplesB)
    {
      int dstPos = binarySearchInclusiveRanks(ranks[segmentSamplesA + i], ranks, segmentSamplesA, nextPowerOfTwo(segmentSamplesA)) + i;
      limits[dstPos] = ranks[segmentSamplesA + i];
    }
  }
 }
 __device__ static inline
 void mergeRanksAndIndices(
    uniform int limitsA[],
    uniform int limitsB[],
    uniform int ranksA[],
    uniform int ranksB[],
    uniform int stride,
    uniform int N)
 {
  const uniform int lastSegmentElements = N % (2 * stride);
  const uniform int threadCount = (lastSegmentElements > stride) ? 
    (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) : 
    (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);
  const uniform int nBlocks = iDivUp(threadCount, SAMPLE_STRIDE);
  uniform int nTasks = nBlocks/4;
  launch (nTasks,1,1,mergeRanksAndIndicesKernel)(
      nBlocks,
      limitsA,
      ranksA,
      stride,
      N,
      threadCount);
  launch (nTasks,1,1, mergeRanksAndIndicesKernel)(
      nBlocks,
      limitsB,
      ranksB,
      stride,
      N,
      threadCount);
  sync;
 }
 __global__
 void mergeElementaryIntervalsKernel(
    uniform int mergePairs,
    uniform Key_t dstKey[],
    uniform Val_t dstVal[],
    uniform Key_t srcKey[],
    uniform Val_t srcVal[],
    uniform int limitsA[],
    uniform int limitsB[],
    uniform int stride,
    uniform int N)
 {
  const uniform int blkIdx = taskIndex;
  const uniform int blkDim = (mergePairs + taskCount - 1)/taskCount;
  const uniform int blkBeg =     blkIdx * blkDim;
  const uniform int blkEnd = min(blkBeg + blkDim, mergePairs);
  for (uniform int blk = blkBeg; blk < blkEnd; blk++)
  {
    const int uniform   intervalI =  blk & ((2 * stride) / SAMPLE_STRIDE - 1);
    const int uniform segmentBase = (blk - intervalI) * SAMPLE_STRIDE;
    //Set up threadblk-wide parameters
    const uniform int segmentElementsA = stride;
    const uniform int segmentElementsB = min(stride, N - segmentBase - stride);
    const uniform int  segmentSamplesA = getSampleCount(segmentElementsA);
    const uniform int  segmentSamplesB = getSampleCount(segmentElementsB);
    const uniform int   segmentSamples = segmentSamplesA + segmentSamplesB;
    const uniform int startSrcA = limitsA[blk];
    const uniform int startSrcB = limitsB[blk];
    const uniform int endSrcA   = (intervalI + 1 < segmentSamples) ? limitsA[blk + 1] : segmentElementsA;
    const uniform int endSrcB   = (intervalI + 1 < segmentSamples) ? limitsB[blk + 1] : segmentElementsB;
    const uniform int lenSrcA   = endSrcA - startSrcA;
    const uniform int lenSrcB   = endSrcB - startSrcB;
    const uniform int startDstA = startSrcA + startSrcB;
    const uniform int startDstB = startDstA + lenSrcA;
    //Load main input data
    Key_t keyA, keyB;
    Val_t valA, valB;
    if (programIndex < lenSrcA)
    {
      keyA = srcKey[segmentBase + startSrcA + programIndex];
      valA = srcVal[segmentBase + startSrcA + programIndex];
    }
    if (programIndex < lenSrcB)
    {
      keyB = srcKey[segmentBase + stride + startSrcB + programIndex];
      valB = srcVal[segmentBase + stride + startSrcB + programIndex];
    }
    // Compute destination addresses for merge data
    int dstPosA = binarySearchExclusive1(keyA, keyB, lenSrcB, SAMPLE_STRIDE) + programIndex;
    int dstPosB = binarySearchInclusive1(keyB, keyA, lenSrcA, SAMPLE_STRIDE) + programIndex;
    int dstA = -1, dstB = -1;
    if (programIndex < lenSrcA && dstPosA < lenSrcA)
      dstA = segmentBase + startDstA + dstPosA;
    if (programIndex < lenSrcB && dstPosB < lenSrcA)
      dstB = segmentBase + startDstA + dstPosB;
    dstPosA -= lenSrcA;
    dstPosB -= lenSrcA;
    if (programIndex < lenSrcA && dstPosA < lenSrcB)
      dstA = segmentBase + startDstB + dstPosA;
    if (programIndex < lenSrcB && dstPosB < lenSrcB)
      dstB = segmentBase + startDstB + dstPosB;
    // store merge data
    if (dstA >= 0) 
    {
 //     int dstA = segmentBase + startSrcA + programIndex;
      dstKey[dstA] = keyA;
      dstVal[dstA] = valA;
    }
    if (dstB >= 0) 
    {
 //      int dstB = segmentBase + stride + startSrcB + programIndex;
      dstKey[dstB] = keyB;
      dstVal[dstB] = valB;
    }
  }
 }
 __device__ static inline
 void mergeElementaryIntervals(
    uniform int nTasks,
    uniform Key_t dstKey[],
    uniform Val_t dstVal[],
    uniform Key_t srcKey[],
    uniform Val_t srcVal[],
    uniform int limitsA[],
    uniform int limitsB[],
    uniform int stride,
    uniform int N)
 {
  const uniform int lastSegmentElements = N % (2 * stride);
  const uniform int mergePairs = (lastSegmentElements > stride) ? getSampleCount(N) : (N - lastSegmentElements) / SAMPLE_STRIDE;
  nTasks = mergePairs/(4*programCount);
  launch (nTasks,1,1, mergeElementaryIntervalsKernel)(
      mergePairs,
      dstKey,
      dstVal,
      srcKey,
      srcVal,
      limitsA,
      limitsB,
      stride,
      N);
  sync;
 }
 __device__ static uniform int * uniform memPool = NULL;
 __device__ static uniform int * uniform ranksA;
 __device__ static uniform int * uniform ranksB;
 __device__ static uniform int * uniform limitsA;
 __device__ static uniform int * uniform limitsB;
 __device__ static uniform int nTasks;
 __device__ static uniform int MAX_SAMPLE_COUNT = 0;
 __global__ 
 void openMergeSort___export()
 {
  nTasks = 13*32*13;
  MAX_SAMPLE_COUNT = 8*32 * 131072 / programCount;
  assert(memPool == NULL);
  const uniform int nalloc = MAX_SAMPLE_COUNT * 4;
  memPool = uniform new uniform int[nalloc];
  ranksA  = memPool;
  ranksB  =  ranksA + MAX_SAMPLE_COUNT;
  limitsA =  ranksB + MAX_SAMPLE_COUNT;
  limitsB = limitsA + MAX_SAMPLE_COUNT;
 }
 extern "C"
 void openMergeSort()
 {
  openMergeSort___export<<<1,1>>>();
  sync;
 }
 __global__
 void closeMergeSort___export()
 {
  assert(memPool != NULL);
  delete memPool;
  memPool = NULL;
 }
 extern "C"
 void closeMergeSort()
 {
  closeMergeSort___export<<<1,1>>>();
  sync;
 }
 __global__
 void mergeSort___export(
    uniform Key_t dstKey[],
    uniform Val_t dstVal[],
    uniform Key_t bufKey[],
    uniform Val_t bufVal[],
    uniform Key_t srcKey[],
    uniform Val_t srcVal[],
    uniform int N)
 {
  uniform int stageCount = 0;
  for (uniform int stride = 2*programCount; stride < N; stride <<= 1, stageCount++);
  uniform Key_t * uniform iKey, * uniform oKey;
  uniform Val_t * uniform iVal, * uniform oVal;
  if (stageCount & 1)
  {
    iKey = bufKey;
    iVal = bufVal;
    oKey = dstKey;
    oVal = dstVal;
  }
  else
  {
    iKey = dstKey;
    iVal = dstVal;
    oKey = bufKey;
    oVal = bufVal;
  }
  assert(N <= SAMPLE_STRIDE * MAX_SAMPLE_COUNT);
  assert(N % (programCount*2) == 0);
  mergeSortGang(iKey, iVal, srcKey, srcVal, N/(2*programCount));
 #if 1
  for (uniform int stride = 2*programCount; stride < N; stride <<= 1)
  {
    const uniform int lastSegmentElements = N % (2 * stride);
    //Find sample ranks and prepare for limiters merge
    generateSampleRanks(ranksA, ranksB, iKey, stride, N);
    //Merge ranks and indices
    mergeRanksAndIndices(limitsA, limitsB, ranksA, ranksB, stride, N);
    //Merge elementary intervals
    mergeElementaryIntervals(nTasks, oKey, oVal, iKey, iVal, limitsA, limitsB, stride, N);
    if (lastSegmentElements <= stride)
      for (int i = programIndex; i < lastSegmentElements; i += programCount)
        if (i < lastSegmentElements)
        {
          oKey[N-lastSegmentElements+i] = iKey[N-lastSegmentElements+i];
          oVal[N-lastSegmentElements+i] = iVal[N-lastSegmentElements+i];
        }
    {
      uniform Key_t * uniform tmpKey = iKey;
      iKey = oKey;
      oKey = tmpKey;
    }
    {
      uniform Val_t * uniform tmpVal = iVal;
      iVal = oVal;
      oVal = tmpVal;
    }
  }
 #endif
 }
 extern "C"
 void mergeSort(
    uniform Key_t dstKey[],
    uniform Val_t dstVal[],
    uniform Key_t bufKey[],
    uniform Val_t bufVal[],
    uniform Key_t srcKey[],
    uniform Val_t srcVal[],
    uniform int N)
 {
  mergeSort___export<<<1,32>>>(
      dstKey,
      dstVal,
      bufKey,
      bufVal,
      srcKey,
      srcVal,
      N);
  sync;
 }