#include "keyType.h"

#define SAMPLE_STRIDE programCount

#define iDivUp(a,b) (((a) + (b) - 1)/(b))
#define getSampleCount(dividend) (iDivUp((dividend), (SAMPLE_STRIDE)))

#define W (/*sizeof(int)=*/4 * 8)

static inline
int nextPowerOfTwo(int x)
{
#if 0
  --x;
  x |= x >> 1;
  x |= x >> 2;
  x |= x >> 4;
  x |= x >> 8;
  x |= x >> 16;
  return ++x;
#else
  return 1U << (W - count_leading_zeros(x - 1));
#endif
}

static inline
int binarySearchInclusiveRanks(
    const int val, 
    uniform int *data,
    const int L, 
    int stride)
{
  if (L == 0)
    return 0;

  int pos = 0;
  for (; stride > 0; stride >>= 1)
  {
    int newPos = min(pos + stride, L);

    if (data[newPos - 1] <= val)
      pos = newPos;
  }

  return pos;
}

static inline
int binarySearchExclusiveRanks(
    const int val, 
    uniform int *data, 
    const int L, 
    int stride)
{
  if (L == 0)
    return 0;

  int pos = 0;
  for (; stride > 0; stride >>= 1)
  {
    int newPos = min(pos + stride, L);

    if (data[newPos - 1] < val)
      pos = newPos;
  }

  return pos;
}

static inline
int binarySearchInclusive(
    const Key_t val, 
    uniform Key_t *data,
    const int L, 
    int stride)
{
  if (L == 0)
    return 0;

  int pos = 0;
  for (; stride > 0; stride >>= 1)
  {
    int newPos = min(pos + stride, L);

    if (data[newPos - 1] <= val)
      pos = newPos;
  }

  return pos;
}

static inline
int binarySearchExclusive(
    const Key_t val, 
    uniform Key_t *data, 
    const int L, 
    int stride)
{
  if (L == 0)
    return 0;

  int pos = 0;
  for (; stride > 0; stride >>= 1)
  {
    int newPos = min(pos + stride, L);

    if (data[newPos - 1] < val)
      pos = newPos;
  }

  return pos;
}

static inline
int binarySearchInclusive1(
    const Key_t val, 
    Key_t data,
    const uniform int L, 
    uniform int stride)
{
  if (L == 0)
    return 0;

  int pos = 0;
  for (; stride > 0; stride >>= 1)
  {
    int newPos = min(pos + stride, L);

    if (shuffle(data,newPos - 1) <= val)
      pos = newPos;
  }

  return pos;
}

static inline
int binarySearchExclusive1(
    const Key_t val, 
    Key_t data, 
    const uniform int L, 
    uniform int stride)
{
  if (L == 0)
    return 0;

  int pos = 0;
  for (; stride > 0; stride >>= 1)
  {
    int newPos = min(pos + stride, L);

    if (shuffle(data,newPos - 1) < val)
      pos = newPos;
  }

  return pos;
}

////////////////////////////////////////////////////////////////////////////////
// Bottom-level merge sort (binary search-based)
////////////////////////////////////////////////////////////////////////////////
task 
void mergeSortGangKernel(
    uniform int batchSize,
    uniform Key_t dstKey[],
    uniform Val_t dstVal[],
    uniform Key_t srcKey[],
    uniform Val_t srcVal[],
    uniform int arrayLength)
{
  const uniform int blockIdx = taskIndex;
  const uniform int blockDim = (batchSize + taskCount - 1)/taskCount;
  const uniform int blockBeg =     blockIdx * blockDim;
  const uniform int blockEnd = min(blockBeg + blockDim, batchSize);

  uniform Key_t s_key[2*programCount];
  uniform Val_t s_val[2*programCount];

  for (uniform int block = blockBeg; block < blockEnd; block++)
  {
    const uniform int base = block * (programCount*2);
    s_key[programIndex +            0] = srcKey[base + programIndex +            0];
    s_val[programIndex +            0] = srcVal[base + programIndex +            0];
    s_key[programIndex + programCount] = srcKey[base + programIndex + programCount];
    s_val[programIndex + programCount] = srcVal[base + programIndex + programCount];

#define STEP(stride) {\
  const int lPos = programIndex & (stride - 1); \
  const int offset = 2 * (programIndex - lPos); \
  Key_t keyA = s_key[lPos +      0]; \
  Val_t valA = s_val[lPos +      0]; \
  Key_t keyB = s_key[lPos + stride]; \
  Val_t valB = s_val[lPos + stride]; \
  s_key[programIndex] = keyA; \
  s_val[programIndex] = valA; \
  s_key[programCount+programIndex] = keyB; \
  s_val[programCount+programIndex] = valB;  \
}

#if 1
    for (uniform int stride = 1; stride < arrayLength; stride <<= 1)
    {
      const int lPos = programIndex & (stride - 1);
      const int offset = 2 * (programIndex - lPos);
      uniform Key_t *baseKey = s_key + 2 * (programIndex - lPos);
      uniform Val_t *baseVal = s_val + 2 * (programIndex - lPos);
      
      Key_t keyA = baseKey[lPos +      0];
      Val_t valA = baseVal[lPos +      0];
      Key_t keyB = baseKey[lPos + stride];
      Val_t valB = baseVal[lPos + stride];

#if 1
      int posA = binarySearchExclusive(keyA, baseKey + stride, stride, stride) + lPos;
      int posB = binarySearchInclusive(keyB, baseKey +      0, stride, stride) + lPos;

      baseKey[posA] = keyA;
      baseVal[posA] = valA;
      baseKey[posB] = keyB;
      baseVal[posB] = valB;
#else
      s_key[programIndex] = keyA;
      s_val[programIndex] = valA;
      s_key[programCount+programIndex] = keyB;
      s_val[programCount+programIndex] = valB;
#endif
    }
#endif

    dstKey[base + programIndex +            0] = s_key[programIndex +            0];
    dstVal[base + programIndex +            0] = s_val[programIndex +            0];
    dstKey[base + programIndex + programCount] = s_key[programIndex + programCount];
    dstVal[base + programIndex + programCount] = s_val[programIndex + programCount];
  }
}

static inline
void mergeSortGang(
    uniform Key_t dstKey[],
    uniform Val_t dstVal[],
    uniform Key_t srcKey[],
    uniform Val_t srcVal[],
    uniform int batchSize)
{
  uniform int nTasks = num_cores()*4;
#ifdef __NVPTX__
  nTasks = batchSize/4;
#endif
  launch [nTasks] mergeSortGangKernel(batchSize, dstKey, dstVal, srcKey, srcVal, 2*programCount);
  sync;
}

////////////////////////////////////////////////////////////////////////////////
// Merge step 1: generate sample ranks
////////////////////////////////////////////////////////////////////////////////
task
void generateSampleRanksKernel(
    uniform int nBlocks,
    uniform int in_ranksA[],
    uniform int in_ranksB[],
    uniform Key_t in_srcKey[],
    uniform int stride,
    uniform int N,
    uniform int totalProgramCount)
{
  const uniform int blockIdx = taskIndex;
  const uniform int blockDim = (nBlocks + taskCount - 1)/taskCount;
  const uniform int blockBeg =     blockIdx * blockDim;
  const uniform int blockEnd = min(blockBeg + blockDim, nBlocks);
  
  for (uniform int block = blockBeg; block < blockEnd; block++)
  {
    const int pos = block * programCount + programIndex;
    cif (pos >= totalProgramCount)
      return;

    const int           i = pos & ((stride / SAMPLE_STRIDE) - 1);
    const int segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);

    uniform Key_t * srcKey = in_srcKey + segmentBase;
    uniform int * ranksA = in_ranksA + segmentBase / SAMPLE_STRIDE;
    uniform int * ranksB = in_ranksB + segmentBase / SAMPLE_STRIDE;

    const int segmentElementsA = stride;
    const int segmentElementsB = min(stride, N - segmentBase - stride);
    const int  segmentSamplesA = getSampleCount(segmentElementsA);
    const int  segmentSamplesB = getSampleCount(segmentElementsB);

    if (i < segmentSamplesA)
    {
      ranksA[i] = i * SAMPLE_STRIDE;
      ranksB[i] = binarySearchExclusive(
          srcKey[i * SAMPLE_STRIDE], srcKey + stride,
          segmentElementsB, nextPowerOfTwo(segmentElementsB));
    }

    if (i < segmentSamplesB)
    {
      ranksB[(stride / SAMPLE_STRIDE) + i] = i * SAMPLE_STRIDE;
      ranksA[(stride / SAMPLE_STRIDE) + i] = binarySearchInclusive(
          srcKey[stride + i * SAMPLE_STRIDE], srcKey + 0,
          segmentElementsA, nextPowerOfTwo(segmentElementsA));
    }
  }
}

static inline
void generateSampleRanks(
    uniform int ranksA[],
    uniform int ranksB[],
    uniform Key_t srcKey[],
    uniform int stride,
    uniform int N)
{
  uniform int lastSegmentElements = N % (2 * stride);
  uniform int threadCount = (lastSegmentElements > stride) ? 
    (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) : 
    (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);

  uniform int nBlocks = iDivUp(threadCount, SAMPLE_STRIDE);
  uniform int nTasks = num_cores()*4;
#ifdef __NVPTX__
  nTasks = nBlocks/4;
#endif

  launch [nTasks] generateSampleRanksKernel(nBlocks, ranksA, ranksB, srcKey, stride, N, threadCount);
  sync;
}
////////////////////////////////////////////////////////////////////////////////
// Merge step 2: generate sample ranks and indices
////////////////////////////////////////////////////////////////////////////////
task 
void mergeRanksAndIndicesKernel(
    uniform int nBlocks,
    uniform int in_Limits[],
    uniform int in_Ranks[],
    uniform int stride,
    uniform int N,
    uniform int totalProgramCount)
{
  const uniform int blockIdx = taskIndex;
  const uniform int blockDim = (nBlocks + taskCount - 1)/taskCount;
  const uniform int blockBeg =     blockIdx * blockDim;
  const uniform int blockEnd = min(blockBeg + blockDim, nBlocks);
  
  for (uniform int block = blockBeg; block < blockEnd; block++)
  {
    int pos = block * programCount + programIndex;
    cif (pos >= totalProgramCount)
      return;

    const int           i = pos & ((stride / SAMPLE_STRIDE) - 1);
    const int segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);
    uniform int *  ranks = in_Ranks  + (pos - i) * 2;
    uniform int * limits = in_Limits + (pos - i) * 2;

    const int segmentElementsA = stride;
    const int segmentElementsB = min(stride, N - segmentBase - stride);
    const int  segmentSamplesA = getSampleCount(segmentElementsA);
    const int  segmentSamplesB = getSampleCount(segmentElementsB);

    if (i < segmentSamplesA)
    {
      int dstPos = binarySearchExclusiveRanks(ranks[i], ranks + segmentSamplesA, segmentSamplesB, nextPowerOfTwo(segmentSamplesB)) + i;
      limits[dstPos] = ranks[i];
    }

    if (i < segmentSamplesB)
    {
      int dstPos = binarySearchInclusiveRanks(ranks[segmentSamplesA + i], ranks, segmentSamplesA, nextPowerOfTwo(segmentSamplesA)) + i;
      limits[dstPos] = ranks[segmentSamplesA + i];
    }
  }
}
static inline
void mergeRanksAndIndices(
    uniform int limitsA[],
    uniform int limitsB[],
    uniform int ranksA[],
    uniform int ranksB[],
    uniform int stride,
    uniform int N)
{
  const uniform int lastSegmentElements = N % (2 * stride);
  const uniform int threadCount = (lastSegmentElements > stride) ? 
    (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) : 
    (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);

  const uniform int nBlocks = iDivUp(threadCount, SAMPLE_STRIDE);
  uniform int nTasks = num_cores()*4;

#ifdef __NVPTX__
  nTasks = nBlocks/4;
#endif

  launch [nTasks] mergeRanksAndIndicesKernel(
      nBlocks,
      limitsA,
      ranksA,
      stride,
      N,
      threadCount);
  launch [nTasks] mergeRanksAndIndicesKernel(
      nBlocks,
      limitsB,
      ranksB,
      stride,
      N,
      threadCount);
  sync;
}


#if 0
static inline
void merge(
    int &dstPosA,
    int &dstPosB,
    Key_t keyA, Val_t valA,
    Key_t keyB, Val_t valB,
    uniform int lenA,
    uniform int nPowTwoLenA,
    uniform int lenB,
    uniform int nPowTwoLenB)
{
  if (programIndex < lenA)
    dstPosA = binarySearchExclusive1(keyA, keyB, lenB, nPowTwoLenB) + programIndex;

  if (programIndex < lenB)
    dstPosB = binarySearchInclusive1(keyB, keyA, lenA, nPowTwoLenA) + programIndex;
}
#endif


#if 0
task
void mergeElementaryIntervalsKernel(
    uniform Key_t dstKey[],
    uniform Val_t dstVal[],
    uniform Key_t srcKey[],
    uniform Val_t srcVal[],
    uniform int limitsA[],
    uniform int limitsB[],
    uniform int stride,
    uniform int N)
{
  const int uniform   intervalI =  taskIndex & ((2 * stride) / SAMPLE_STRIDE - 1);
  const int uniform segmentBase = (taskIndex - intervalI) * SAMPLE_STRIDE;

  //Set up threadblock-wide parameters

  const uniform int segmentElementsA = stride;
  const uniform int segmentElementsB = min(stride, N - segmentBase - stride);
  const uniform int  segmentSamplesA = getSampleCount(segmentElementsA);
  const uniform int  segmentSamplesB = getSampleCount(segmentElementsB);
  const uniform int   segmentSamples = segmentSamplesA + segmentSamplesB;

  const uniform int startSrcA = limitsA[taskIndex];
  const uniform int startSrcB = limitsB[taskIndex];
  const uniform int endSrcA   = (intervalI + 1 < segmentSamples) ? limitsA[taskIndex + 1] : segmentElementsA;
  const uniform int endSrcB   = (intervalI + 1 < segmentSamples) ? limitsB[taskIndex + 1] : segmentElementsB;
  const uniform int lenSrcA   = endSrcA - startSrcA;
  const uniform int lenSrcB   = endSrcB - startSrcB;
  const uniform int startDstA = startSrcA + startSrcB;
  const uniform int startDstB = startDstA + lenSrcA;

  //Load main input data

  Key_t keyA, keyB;
  Val_t valA, valB;
  if (programIndex < lenSrcA)
  {
    keyA = srcKey[segmentBase + startSrcA + programIndex];
    valA = srcVal[segmentBase + startSrcA + programIndex];
  }

  if (programIndex < lenSrcB)
  {
    keyB = srcKey[segmentBase + stride + startSrcB + programIndex];
    valB = srcVal[segmentBase + stride + startSrcB + programIndex];
  }

  //Merge data in shared memory
  int dstPosA, dstPosB;
  merge(
      dstPosA,
      dstPosB,
      keyA, valA,
      keyB, valB,
      lenSrcA, SAMPLE_STRIDE,
      lenSrcB, SAMPLE_STRIDE
      );
  uniform Key_t s_key[2 * SAMPLE_STRIDE];
  uniform Val_t s_val[2 * SAMPLE_STRIDE];

  if (programIndex < lenSrcA)
  {
    s_key[dstPosA] = keyA;
    s_val[dstPosA] = valA;
  }
  if (programIndex < lenSrcB)
  {
    s_key[dstPosB] = keyB;
    s_val[dstPosB] = valB;
  }

  // Coalesced writes
  if (programIndex < lenSrcA)
  {
    dstKey[segmentBase + startDstA + programIndex] = s_key[programIndex];
    dstVal[segmentBase + startDstA + programIndex] = s_val[programIndex];
  }

  if (programIndex < lenSrcB)
  {
    dstKey[segmentBase + startDstB + programIndex] = s_key[lenSrcA + programIndex];
    dstVal[segmentBase + startDstB + programIndex] = s_val[lenSrcA + programIndex];
  }
}
#endif

task
void mergeElementaryIntervalsKernel(
    uniform int mergePairs,
    uniform Key_t dstKey[],
    uniform Val_t dstVal[],
    uniform Key_t srcKey[],
    uniform Val_t srcVal[],
    uniform int limitsA[],
    uniform int limitsB[],
    uniform int stride,
    uniform int N)
{
  const uniform int blockIdx = taskIndex;
  const uniform int blockDim = (mergePairs + taskCount - 1)/taskCount;
  const uniform int blockBeg =     blockIdx * blockDim;
  const uniform int blockEnd = min(blockBeg + blockDim, mergePairs);

  for (uniform int block = blockBeg; block < blockEnd; block++)
  {
    const int uniform   intervalI =  block & ((2 * stride) / SAMPLE_STRIDE - 1);
    const int uniform segmentBase = (block - intervalI) * SAMPLE_STRIDE;

    //Set up threadblock-wide parameters

    const uniform int segmentElementsA = stride;
    const uniform int segmentElementsB = min(stride, N - segmentBase - stride);
    const uniform int  segmentSamplesA = getSampleCount(segmentElementsA);
    const uniform int  segmentSamplesB = getSampleCount(segmentElementsB);
    const uniform int   segmentSamples = segmentSamplesA + segmentSamplesB;

    const uniform int startSrcA = limitsA[block];
    const uniform int startSrcB = limitsB[block];
    const uniform int endSrcA   = (intervalI + 1 < segmentSamples) ? limitsA[block + 1] : segmentElementsA;
    const uniform int endSrcB   = (intervalI + 1 < segmentSamples) ? limitsB[block + 1] : segmentElementsB;
    const uniform int lenSrcA   = endSrcA - startSrcA;
    const uniform int lenSrcB   = endSrcB - startSrcB;
    const uniform int startDstA = startSrcA + startSrcB;
    const uniform int startDstB = startDstA + lenSrcA;

    //Load main input data

    Key_t keyA, keyB;
    Val_t valA, valB;
    if (programIndex < lenSrcA)
    {
      keyA = srcKey[segmentBase + startSrcA + programIndex];
      valA = srcVal[segmentBase + startSrcA + programIndex];
    }

    if (programIndex < lenSrcB)
    {
      keyB = srcKey[segmentBase + stride + startSrcB + programIndex];
      valB = srcVal[segmentBase + stride + startSrcB + programIndex];
    }

    // Compute destination addresses for merge data
    int dstPosA = binarySearchExclusive1(keyA, keyB, lenSrcB, SAMPLE_STRIDE) + programIndex;
    int dstPosB = binarySearchInclusive1(keyB, keyA, lenSrcA, SAMPLE_STRIDE) + programIndex;


    int dstA = -1, dstB = -1;
    if (programIndex < lenSrcA && dstPosA < lenSrcA)
      dstA = segmentBase + startDstA + dstPosA;
    if (programIndex < lenSrcB && dstPosB < lenSrcA)
      dstB = segmentBase + startDstA + dstPosB;

    dstPosA -= lenSrcA;
    dstPosB -= lenSrcA;
    if (programIndex < lenSrcA && dstPosA < lenSrcB)
      dstA = segmentBase + startDstB + dstPosA;
    if (programIndex < lenSrcB && dstPosB < lenSrcB)
      dstB = segmentBase + startDstB + dstPosB;

    // store merge data
    if (dstA >= 0) 
    {
 //     int dstA = segmentBase + startSrcA + programIndex;
      dstKey[dstA] = keyA;
      dstVal[dstA] = valA;
    }
    if (dstB >= 0) 
    {
//      int dstB = segmentBase + stride + startSrcB + programIndex;
      dstKey[dstB] = keyB;
      dstVal[dstB] = valB;
    }
  }

}


static inline
void mergeElementaryIntervals(
    uniform int nTasks,
    uniform Key_t dstKey[],
    uniform Val_t dstVal[],
    uniform Key_t srcKey[],
    uniform Val_t srcVal[],
    uniform int limitsA[],
    uniform int limitsB[],
    uniform int stride,
    uniform int N)
{
  const uniform int lastSegmentElements = N % (2 * stride);
  const uniform int mergePairs = (lastSegmentElements > stride) ? getSampleCount(N) : (N - lastSegmentElements) / SAMPLE_STRIDE;


#ifdef __NVPTX__
  nTasks = mergePairs/(4*programCount);
#endif

  launch [nTasks] mergeElementaryIntervalsKernel(
      mergePairs,
      dstKey,
      dstVal,
      srcKey,
      srcVal,
      limitsA,
      limitsB,
      stride,
      N);
  sync;
}

static uniform int * uniform memPool = NULL;
static uniform int * uniform ranksA;
static uniform int * uniform ranksB;
static uniform int * uniform limitsA;
static uniform int * uniform limitsB;
static uniform int nTasks;
static uniform int MAX_SAMPLE_COUNT = 0;

export 
void openMergeSort()
{
  nTasks = num_cores()*4;
#ifdef __NVPTX__
  nTasks = num_cores()*13;
#endif
  MAX_SAMPLE_COUNT = 8*32 * 131072 / programCount;
  assert(memPool == NULL);
  const uniform int nalloc = MAX_SAMPLE_COUNT * 4;
  memPool = uniform new uniform int[nalloc];
  ranksA  = memPool;
  ranksB  =  ranksA + MAX_SAMPLE_COUNT;
  limitsA =  ranksB + MAX_SAMPLE_COUNT;
  limitsB = limitsA + MAX_SAMPLE_COUNT;
}

export
void closeMergeSort()
{
  assert(memPool != NULL);
  delete memPool;
  memPool = NULL;
}

export 
void mergeSort(
    uniform Key_t dstKey[],
    uniform Val_t dstVal[],
    uniform Key_t bufKey[],
    uniform Val_t bufVal[],
    uniform Key_t srcKey[],
    uniform Val_t srcVal[],
    uniform int N)
{
  uniform int stageCount = 0;
  for (uniform int stride = 2*programCount; stride < N; stride <<= 1, stageCount++);

  uniform Key_t * uniform iKey, * uniform oKey;
  uniform Val_t * uniform iVal, * uniform oVal;

  if (stageCount & 1)
  {
    iKey = bufKey;
    iVal = bufVal;
    oKey = dstKey;
    oVal = dstVal;
  }
  else
  {
    iKey = dstKey;
    iVal = dstVal;
    oKey = bufKey;
    oVal = bufVal;
  }


  assert(N <= SAMPLE_STRIDE * MAX_SAMPLE_COUNT);
  assert(N % (programCount*2) == 0);

  // cpu: 28  gpu: 74 M/s
  {
    // cpu: 356   gpu: 534 M/s
    mergeSortGang(iKey, iVal, srcKey, srcVal, N/(2*programCount));

#if 1
    for (uniform int stride = 2*programCount; stride < N; stride <<= 1)
    {
      const uniform int lastSegmentElements = N % (2 * stride);

      // cpu: 30  gpu: 112 M/s
      {
#if 1
        // cpu: 121  gpu: 460 M/s
        {
          // cpu: 190  gpu: 600 M/s
          //Find sample ranks and prepare for limiters merge
          generateSampleRanks(ranksA, ranksB, iKey, stride, N);

          // cpu: 120 gpu: 457 M/s
          //Merge ranks and indices
          mergeRanksAndIndices(limitsA, limitsB, ranksA, ranksB, stride, N);
        }
#endif

        // cpu: 287  gpu: 194 M/s 
        //Merge elementary intervals
        mergeElementaryIntervals(nTasks, oKey, oVal, iKey, iVal, limitsA, limitsB, stride, N);
      }

      if (lastSegmentElements <= stride)
        foreach (i = 0 ... lastSegmentElements)
        {
          oKey[N-lastSegmentElements+i] = iKey[N-lastSegmentElements+i];
          oVal[N-lastSegmentElements+i] = iVal[N-lastSegmentElements+i];
        }
      memory_barrier();


      {
        uniform Key_t * uniform tmpKey = iKey;
        iKey = oKey;
        oKey = tmpKey;
      }
      {
        uniform Val_t * uniform tmpVal = iVal;
        iVal = oVal;
        oVal = tmpVal;
      }
    }
#endif
  }
}