added mergeSort

2014-02-20 13:58:55 +01:00
parent a169ff636b
commit 607e010874
6 changed files with 1445 additions and 0 deletions
--- a/examples/portable/mergeSort/Makefile_cpu
+++ b/examples/portable/mergeSort/Makefile_cpu
@@ -0,0 +1,12 @@
+
+EXAMPLE=mergeSort
+CPP_SRC=mergeSort.cpp 
+ISPC_SRC=mergeSort.ispc
+ISPC_IA_TARGETS=avx1-i32x8
+ISPC_ARM_TARGETS=neon
+#ISPC_FLAGS=-DDEBUG -g
+CXXFLAGS=-g
+CCFLAGS=-g
+#NVCC_FLAGS=-Xptxas=-O0
+
+include ../common_cpu.mk
--- a/examples/portable/mergeSort/Makefile_ptx
+++ b/examples/portable/mergeSort/Makefile_ptx
@@ -0,0 +1,15 @@
+PROG=mergeSort
+ISPC_SRC=mergeSort.ispc
+CU_SRC=mergeSort.cu
+CXX_SRC=mergeSort.cpp  mergeSort.cpp
+PTXCC_REGMAX=64
+#PTXCC_FLAGS= -Xptxas=-O3
+#NVCC_FLAGS=-Xptxas=-O0
+
+LLVM_GPU=1
+NVVM_GPU=1
+
+include ../common_ptx.mk
+
+
+
--- a/examples/portable/mergeSort/keyType.h
+++ b/examples/portable/mergeSort/keyType.h
@@ -0,0 +1,3 @@
+#pragma once
+typedef float Key_t;
+typedef int   Val_t;
--- a/examples/portable/mergeSort/mergeSort.cpp
+++ b/examples/portable/mergeSort/mergeSort.cpp
@@ -0,0 +1,135 @@
+#include <cstdio>
+#include <cstdlib>
+#include <algorithm>
+#include <iostream>
+#include <cassert>
+#include <iomanip>
+#include "timing.h"
+#include "ispc_malloc.h"
+#include "mergeSort_ispc.h"
+
+/* progress bar by Ross Hemsley;
+ * http://www.rosshemsley.co.uk/2011/02/creating-a-progress-bar-in-c-or-any-other-console-app/ */
+static inline void progressbar (unsigned int x, unsigned int n, unsigned int w = 50)
+{
+  if (n < 100)
+  {
+    x *= 100/n;
+    n = 100;
+  }
+
+  if ((x != n) && (x % (n/100) != 0)) return;
+
+  using namespace std;
+  float ratio  =  x/(float)n;
+  int c =  ratio * w;
+
+  cout << setw(3) << (int)(ratio*100) << "% [";
+  for (int x=0; x<c; x++) cout << "=";
+  for (int x=c; x<w; x++) cout << " ";
+  cout << "]\r" << flush;
+}
+
+#include "keyType.h"
+struct Key
+{
+  Key_t key;
+  Val_t val;
+};
+
+
+int main (int argc, char *argv[])
+{
+  int i, j, n = argc == 1 ? 1024*1024: atoi(argv[1]), m = n < 100 ? 1 : 50, l = n < 100 ? n : RAND_MAX;
+  double tISPC1 = 0.0, tISPC2 = 0.0, tSerial = 0.0;
+
+  Key *keys = new Key[n];
+  srand48(rtc()*65536);
+#pragma omp parallel for
+  for (int i = 0; i < n; i++)
+  {
+    keys[i].key = i; //((int)(drand48() * (1<<30)));
+    keys[i].val = i;
+  }
+  std::random_shuffle(keys, keys + n);
+
+  Key_t *keysSrc = new Key_t[n];
+  Val_t *valsSrc = new Val_t[n];
+  Key_t *keysBuf = new Key_t[n];
+  Val_t *valsBuf = new Val_t[n];
+  Key_t *keysDst = new Key_t[n];
+  Val_t *valsDst = new Val_t[n];
+  Key_t *keysGld = new Key_t[n];
+  Val_t *valsGld = new Val_t[n];
+#pragma omp parallel for
+  for (int i = 0; i < n; i++)
+  {
+    keysSrc[i] = keys[i].key;
+    valsSrc[i] = keys[i].val;
+
+    keysGld[i] = keysSrc[i];
+    valsGld[i] = valsSrc[i];
+  }
+  delete keys;
+    
+  ispcSetMallocHeapLimit(1024*1024*1024);
+
+  ispc::openMergeSort();
+
+  tISPC2 = 1e30;
+  for (i = 0; i < m; i ++)
+  {
+    ispcMemcpy(keysSrc, keysGld, n*sizeof(Key_t));
+    ispcMemcpy(valsSrc, valsGld, n*sizeof(Val_t));
+
+    reset_and_start_timer();
+    ispc::mergeSort(keysDst, valsDst, keysBuf, valsBuf, keysSrc, valsSrc, n);
+    tISPC2 = std::min(tISPC2, get_elapsed_msec());
+
+    if (argc != 3)
+        progressbar (i, m);
+  }
+
+  ispc::closeMergeSort();
+
+  printf("[sort ispc + tasks]:\t[%.3f] msec [%.3f Mpair/s]\n", tISPC2, 1.0e-3*n/tISPC2);
+
+#if 0
+  printf("\n---\n");
+  for (int i = 0; i < 128; i++)
+  {
+    if ((i%32) == 0) printf("\n");
+    printf("%d ", (int)keysSrc[i]);
+  }
+  printf("\n---\n");
+  for (int i = 0; i < 128; i++)
+  {
+    if ((i%32) == 0) printf("\n");
+    printf("%d ", (int)keysBuf[i]);
+  }
+  printf("\n---\n");
+  for (int i = 0; i < 128; i++)
+  {
+    if ((i%32) == 0) printf("\n");
+    printf("%d ", (int)keysDst[i]);
+  }
+  printf("\n---\n");
+#endif
+    
+
+
+  std::sort(keysGld, keysGld + n);
+  for (int i = 0; i < n; i++)
+    assert(keysDst[i] == keysGld[i]);
+
+  delete keysSrc;
+  delete valsSrc;
+  delete keysDst;
+  delete valsDst;
+  delete keysBuf;
+  delete valsBuf;
+  delete keysGld;
+  delete valsGld;
+
+  return 0;
+}
--- a/examples/portable/mergeSort/mergeSort.cu
+++ b/examples/portable/mergeSort/mergeSort.cu
@@ -0,0 +1,658 @@
+#include "keyType.h"
+#include "cuda_helpers.cuh"
+#include <cassert>
+
+#define uniform
+
+#define SAMPLE_STRIDE programCount
+
+#define iDivUp(a,b) (((a) + (b) - 1)/(b))
+#define getSampleCount(dividend) (iDivUp((dividend), (SAMPLE_STRIDE)))
+
+#define W (/*sizeof(int)=*/4 * 8)
+
+__device__ static inline
+int nextPowerOfTwo(int x)
+{
+#if 0
+  --x;
+  x |= x >> 1;
+  x |= x >> 2;
+  x |= x >> 4;
+  x |= x >> 8;
+  x |= x >> 16;
+  return ++x;
+#else
+  return 1U << (W - __clz(x - 1));
+#endif
+}
+
+
+__device__ static inline
+int binarySearchInclusiveRanks(
+    const int val, 
+    uniform int *data,
+    const int L, 
+    int stride)
+{
+  if (L == 0)
+    return 0;
+
+  int pos = 0;
+  for (; stride > 0; stride >>= 1)
+  {
+    int newPos = min(pos + stride, L);
+
+    if (data[newPos - 1] <= val)
+      pos = newPos;
+  }
+
+  return pos;
+}
+
+__device__ static inline
+int binarySearchExclusiveRanks(
+    const int val, 
+    uniform int *data, 
+    const int L, 
+    int stride)
+{
+  if (L == 0)
+    return 0;
+
+  int pos = 0;
+  for (; stride > 0; stride >>= 1)
+  {
+    int newPos = min(pos + stride, L);
+
+    if (data[newPos - 1] < val)
+      pos = newPos;
+  }
+
+  return pos;
+}
+
+__device__ static inline
+int binarySearchInclusive(
+    const Key_t val, 
+    uniform Key_t *data,
+    const int L, 
+    int stride)
+{
+  if (L == 0)
+    return 0;
+
+  int pos = 0;
+  for (; stride > 0; stride >>= 1)
+  {
+    int newPos = min(pos + stride, L);
+
+    if (data[newPos - 1] <= val)
+      pos = newPos;
+  }
+
+  return pos;
+}
+
+__device__ static inline
+int binarySearchExclusive(
+    const Key_t val, 
+    uniform Key_t *data, 
+    const int L, 
+    int stride)
+{
+  if (L == 0)
+    return 0;
+
+  int pos = 0;
+  for (; stride > 0; stride >>= 1)
+  {
+    int newPos = min(pos + stride, L);
+
+    if (data[newPos - 1] < val)
+      pos = newPos;
+  }
+
+  return pos;
+}
+
+__device__ static inline
+int binarySearchInclusive1(
+    const Key_t val, 
+    Key_t data,
+    const uniform int L, 
+    uniform int stride)
+{
+  if (L == 0)
+    return 0;
+
+  int pos = 0;
+  for (; stride > 0; stride >>= 1)
+  {
+    int newPos = min(pos + stride, L);
+
+    if (shuffle(data,newPos - 1) <= val)
+      pos = newPos;
+  }
+
+  return pos;
+}
+
+__device__ static inline
+int binarySearchExclusive1(
+    const Key_t val, 
+    Key_t data, 
+    const uniform int L, 
+    uniform int stride)
+{
+  if (L == 0)
+    return 0;
+
+  int pos = 0;
+  for (; stride > 0; stride >>= 1)
+  {
+    int newPos = min(pos + stride, L);
+
+    if (shuffle(data,newPos - 1) < val)
+      pos = newPos;
+  }
+
+  return pos;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Bottom-level merge sort (binary search-based)
+////////////////////////////////////////////////////////////////////////////////
+__global__
+void mergeSortGangKernel(
+    uniform int batchSize,
+    uniform Key_t dstKey[],
+    uniform Val_t dstVal[],
+    uniform Key_t srcKey[],
+    uniform Val_t srcVal[])
+{
+  const uniform int blkIdx = taskIndex;
+  const uniform int blkDim = (batchSize + taskCount - 1)/taskCount;
+  const uniform int blkBeg =     blkIdx * blkDim;
+  const uniform int blkEnd = min(blkBeg + blkDim, batchSize);
+
+  __shared__ Key_t s_key_tmp[2*programCount*4];
+  __shared__ Val_t s_val_tmp[2*programCount*4];
+  Key_t *s_key = s_key_tmp + warpIdx*(2*programCount);
+  Val_t *s_val = s_val_tmp + warpIdx*(2*programCount);
+
+  for (uniform int blk = blkBeg; blk < blkEnd; blk++)
+  {
+    const uniform int base = blk * (programCount*2);
+    s_key[programIndex +            0] = srcKey[base + programIndex +            0];
+    s_val[programIndex +            0] = srcVal[base + programIndex +            0];
+    s_key[programIndex + programCount] = srcKey[base + programIndex + programCount];
+    s_val[programIndex + programCount] = srcVal[base + programIndex + programCount];
+
+    for (uniform int stride = 1; stride < 2*programCount; stride <<= 1)
+    {
+      const int lPos = programIndex & (stride - 1);
+      uniform Key_t *baseKey = s_key + 2 * (programIndex - lPos);
+      uniform Val_t *baseVal = s_val + 2 * (programIndex - lPos);
+
+      Key_t keyA = baseKey[lPos +      0];
+      Val_t valA = baseVal[lPos +      0];
+      Key_t keyB = baseKey[lPos + stride];
+      Val_t valB = baseVal[lPos + stride];
+      int posA = binarySearchExclusive(keyA, baseKey + stride, stride, stride) + lPos;
+      int posB = binarySearchInclusive(keyB, baseKey +      0, stride, stride) + lPos;
+
+      baseKey[posA] = keyA;
+      baseVal[posA] = valA;
+      baseKey[posB] = keyB;
+      baseVal[posB] = valB;
+    }
+
+    dstKey[base + programIndex +            0] = s_key[programIndex +            0];
+    dstVal[base + programIndex +            0] = s_val[programIndex +            0];
+    dstKey[base + programIndex + programCount] = s_key[programIndex + programCount];
+    dstVal[base + programIndex + programCount] = s_val[programIndex + programCount];
+  }
+}
+
+__device__ static inline
+void mergeSortGang(
+    uniform Key_t dstKey[],
+    uniform Val_t dstVal[],
+    uniform Key_t srcKey[],
+    uniform Val_t srcVal[],
+    uniform int batchSize)
+{
+  uniform int nTasks = batchSize;
+  launch (nTasks,1,1,mergeSortGangKernel)(batchSize, dstKey, dstVal, srcKey, srcVal);
+  sync;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Merge step 1: generate sample ranks
+////////////////////////////////////////////////////////////////////////////////
+__global__
+void generateSampleRanksKernel(
+    uniform int nBlocks,
+    uniform int in_ranksA[],
+    uniform int in_ranksB[],
+    uniform Key_t in_srcKey[],
+    uniform int stride,
+    uniform int N,
+    uniform int totalProgramCount)
+{
+  const uniform int blkIdx = taskIndex;
+  const uniform int blkDim = (nBlocks + taskCount - 1)/taskCount;
+  const uniform int blkBeg =     blkIdx * blkDim;
+  const uniform int blkEnd = min(blkBeg + blkDim, nBlocks);
+  
+  for (uniform int blk = blkBeg; blk < blkEnd; blk++)
+  {
+    const int pos = blk * programCount + programIndex;
+    cif (pos >= totalProgramCount)
+      return;
+
+    const int           i = pos & ((stride / SAMPLE_STRIDE) - 1);
+    const int segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);
+
+    uniform Key_t * srcKey = in_srcKey + segmentBase;
+    uniform int * ranksA = in_ranksA + segmentBase / SAMPLE_STRIDE;
+    uniform int * ranksB = in_ranksB + segmentBase / SAMPLE_STRIDE;
+
+    const int segmentElementsA = stride;
+    const int segmentElementsB = min(stride, N - segmentBase - stride);
+    const int  segmentSamplesA = getSampleCount(segmentElementsA);
+    const int  segmentSamplesB = getSampleCount(segmentElementsB);
+
+    if (i < segmentSamplesA)
+    {
+      ranksA[i] = i * SAMPLE_STRIDE;
+      ranksB[i] = binarySearchExclusive(
+          srcKey[i * SAMPLE_STRIDE], srcKey + stride,
+          segmentElementsB, nextPowerOfTwo(segmentElementsB));
+    }
+
+    if (i < segmentSamplesB)
+    {
+      ranksB[(stride / SAMPLE_STRIDE) + i] = i * SAMPLE_STRIDE;
+      ranksA[(stride / SAMPLE_STRIDE) + i] = binarySearchInclusive(
+          srcKey[stride + i * SAMPLE_STRIDE], srcKey + 0,
+          segmentElementsA, nextPowerOfTwo(segmentElementsA));
+    }
+  }
+}
+
+__device__ static inline
+void generateSampleRanks(
+    uniform int ranksA[],
+    uniform int ranksB[],
+    uniform Key_t srcKey[],
+    uniform int stride,
+    uniform int N)
+{
+  uniform int lastSegmentElements = N % (2 * stride);
+  uniform int threadCount = (lastSegmentElements > stride) ? 
+    (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) : 
+    (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);
+
+  uniform int nBlocks = iDivUp(threadCount, SAMPLE_STRIDE);
+  uniform int nTasks = nBlocks;
+
+  launch (nTasks,1,1, generateSampleRanksKernel)(nBlocks, ranksA, ranksB, srcKey, stride, N, threadCount);
+  sync;
+}
+////////////////////////////////////////////////////////////////////////////////
+// Merge step 2: generate sample ranks and indices
+////////////////////////////////////////////////////////////////////////////////
+__global__ 
+void mergeRanksAndIndicesKernel(
+    uniform int nBlocks,
+    uniform int in_Limits[],
+    uniform int in_Ranks[],
+    uniform int stride,
+    uniform int N,
+    uniform int totalProgramCount)
+{
+  const uniform int blkIdx = taskIndex;
+  const uniform int blkDim = (nBlocks + taskCount - 1)/taskCount;
+  const uniform int blkBeg =     blkIdx * blkDim;
+  const uniform int blkEnd = min(blkBeg + blkDim, nBlocks);
+  
+  for (uniform int blk = blkBeg; blk < blkEnd; blk++)
+  {
+    int pos = blk * programCount + programIndex;
+    cif (pos >= totalProgramCount)
+      return;
+
+    const int           i = pos & ((stride / SAMPLE_STRIDE) - 1);
+    const int segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);
+    uniform int *  ranks = in_Ranks  + (pos - i) * 2;
+    uniform int * limits = in_Limits + (pos - i) * 2;
+
+    const int segmentElementsA = stride;
+    const int segmentElementsB = min(stride, N - segmentBase - stride);
+    const int  segmentSamplesA = getSampleCount(segmentElementsA);
+    const int  segmentSamplesB = getSampleCount(segmentElementsB);
+
+    if (i < segmentSamplesA)
+    {
+      int dstPos = binarySearchExclusiveRanks(ranks[i], ranks + segmentSamplesA, segmentSamplesB, nextPowerOfTwo(segmentSamplesB)) + i;
+      limits[dstPos] = ranks[i];
+    }
+
+    if (i < segmentSamplesB)
+    {
+      int dstPos = binarySearchInclusiveRanks(ranks[segmentSamplesA + i], ranks, segmentSamplesA, nextPowerOfTwo(segmentSamplesA)) + i;
+      limits[dstPos] = ranks[segmentSamplesA + i];
+    }
+  }
+}
+__device__ static inline
+void mergeRanksAndIndices(
+    uniform int limitsA[],
+    uniform int limitsB[],
+    uniform int ranksA[],
+    uniform int ranksB[],
+    uniform int stride,
+    uniform int N)
+{
+  const uniform int lastSegmentElements = N % (2 * stride);
+  const uniform int threadCount = (lastSegmentElements > stride) ? 
+    (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) : 
+    (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);
+
+  const uniform int nBlocks = iDivUp(threadCount, SAMPLE_STRIDE);
+  uniform int nTasks = nBlocks;
+
+  launch (nTasks,1,1,mergeRanksAndIndicesKernel)(
+      nBlocks,
+      limitsA,
+      ranksA,
+      stride,
+      N,
+      threadCount);
+  launch (nTasks,1,1, mergeRanksAndIndicesKernel)(
+      nBlocks,
+      limitsB,
+      ranksB,
+      stride,
+      N,
+      threadCount);
+  sync;
+}
+
+
+__global__
+void mergeElementaryIntervalsKernel(
+    uniform int mergePairs,
+    uniform Key_t dstKey[],
+    uniform Val_t dstVal[],
+    uniform Key_t srcKey[],
+    uniform Val_t srcVal[],
+    uniform int limitsA[],
+    uniform int limitsB[],
+    uniform int stride,
+    uniform int N)
+{
+  const uniform int blkIdx = taskIndex;
+  const uniform int blkDim = (mergePairs + taskCount - 1)/taskCount;
+  const uniform int blkBeg =     blkIdx * blkDim;
+  const uniform int blkEnd = min(blkBeg + blkDim, mergePairs);
+
+  for (uniform int blk = blkBeg; blk < blkEnd; blk++)
+  {
+    const int uniform   intervalI =  blk & ((2 * stride) / SAMPLE_STRIDE - 1);
+    const int uniform segmentBase = (blk - intervalI) * SAMPLE_STRIDE;
+
+    //Set up threadblk-wide parameters
+
+    const uniform int segmentElementsA = stride;
+    const uniform int segmentElementsB = min(stride, N - segmentBase - stride);
+    const uniform int  segmentSamplesA = getSampleCount(segmentElementsA);
+    const uniform int  segmentSamplesB = getSampleCount(segmentElementsB);
+    const uniform int   segmentSamples = segmentSamplesA + segmentSamplesB;
+
+    const uniform int startSrcA = limitsA[blk];
+    const uniform int startSrcB = limitsB[blk];
+    const uniform int endSrcA   = (intervalI + 1 < segmentSamples) ? limitsA[blk + 1] : segmentElementsA;
+    const uniform int endSrcB   = (intervalI + 1 < segmentSamples) ? limitsB[blk + 1] : segmentElementsB;
+    const uniform int lenSrcA   = endSrcA - startSrcA;
+    const uniform int lenSrcB   = endSrcB - startSrcB;
+    const uniform int startDstA = startSrcA + startSrcB;
+    const uniform int startDstB = startDstA + lenSrcA;
+
+    //Load main input data
+
+    Key_t keyA, keyB;
+    Val_t valA, valB;
+    if (programIndex < lenSrcA)
+    {
+      keyA = srcKey[segmentBase + startSrcA + programIndex];
+      valA = srcVal[segmentBase + startSrcA + programIndex];
+    }
+
+    if (programIndex < lenSrcB)
+    {
+      keyB = srcKey[segmentBase + stride + startSrcB + programIndex];
+      valB = srcVal[segmentBase + stride + startSrcB + programIndex];
+    }
+
+    // Compute destination addresses for merge data
+    int dstPosA, dstPosB, dstA = -1, dstB = -1;
+    if (any(programIndex < lenSrcA))
+      dstPosA = binarySearchExclusive1(keyA, keyB, lenSrcB, SAMPLE_STRIDE) + programIndex;
+    if (any(programIndex < lenSrcB))
+      dstPosB = binarySearchInclusive1(keyB, keyA, lenSrcA, SAMPLE_STRIDE) + programIndex;
+
+    if (programIndex < lenSrcA && dstPosA < lenSrcA)
+      dstA = segmentBase + startDstA + dstPosA;
+    dstPosA -= lenSrcA;
+    if (programIndex < lenSrcA && dstPosA < lenSrcB)
+      dstA = segmentBase + startDstB + dstPosA;
+
+    if (programIndex < lenSrcB && dstPosB < lenSrcA)
+      dstB = segmentBase + startDstA + dstPosB;
+    dstPosB -= lenSrcA;
+    if (programIndex < lenSrcB && dstPosB < lenSrcB)
+      dstB = segmentBase + startDstB + dstPosB;
+
+    // store merge data
+    if (dstA >= 0) 
+    {
+ //     int dstA = segmentBase + startSrcA + programIndex;
+      dstKey[dstA] = keyA;
+      dstVal[dstA] = valA;
+    }
+    if (dstB >= 0) 
+    {
+//      int dstB = segmentBase + stride + startSrcB + programIndex;
+      dstKey[dstB] = keyB;
+      dstVal[dstB] = valB;
+    }
+  }
+
+}
+
+
+__device__ static inline
+void mergeElementaryIntervals(
+    uniform int nTasks,
+    uniform Key_t dstKey[],
+    uniform Val_t dstVal[],
+    uniform Key_t srcKey[],
+    uniform Val_t srcVal[],
+    uniform int limitsA[],
+    uniform int limitsB[],
+    uniform int stride,
+    uniform int N)
+{
+  const uniform int lastSegmentElements = N % (2 * stride);
+  const uniform int mergePairs = (lastSegmentElements > stride) ? getSampleCount(N) : (N - lastSegmentElements) / SAMPLE_STRIDE;
+
+
+  nTasks = mergePairs/(programCount);
+
+  launch (nTasks,1,1, mergeElementaryIntervalsKernel)(
+      mergePairs,
+      dstKey,
+      dstVal,
+      srcKey,
+      srcVal,
+      limitsA,
+      limitsB,
+      stride,
+      N);
+  sync;
+}
+
+__device__ static uniform int * uniform memPool = NULL;
+__device__ static uniform int * uniform ranksA;
+__device__ static uniform int * uniform ranksB;
+__device__ static uniform int * uniform limitsA;
+__device__ static uniform int * uniform limitsB;
+__device__ static uniform int nTasks;
+__device__ static uniform int MAX_SAMPLE_COUNT = 0;
+
+__global__ 
+void openMergeSort___export()
+{
+  nTasks = 13*32*13;
+  MAX_SAMPLE_COUNT = 8*32 * 131072 / programCount;
+  assert(memPool == NULL);
+  const uniform int nalloc = MAX_SAMPLE_COUNT * 4;
+  memPool = uniform new uniform int[nalloc];
+  ranksA  = memPool;
+  ranksB  =  ranksA + MAX_SAMPLE_COUNT;
+  limitsA =  ranksB + MAX_SAMPLE_COUNT;
+  limitsB = limitsA + MAX_SAMPLE_COUNT;
+}
+extern "C"
+void openMergeSort()
+{
+  openMergeSort___export<<<1,1>>>();
+  sync;
+}
+
+__global__
+void closeMergeSort___export()
+{
+  assert(memPool != NULL);
+  delete memPool;
+  memPool = NULL;
+}
+extern "C"
+void closeMergeSort()
+{
+  closeMergeSort___export<<<1,1>>>();
+  sync;
+}
+
+__global__
+void mergeSort___export(
+    uniform Key_t dstKey[],
+    uniform Val_t dstVal[],
+    uniform Key_t bufKey[],
+    uniform Val_t bufVal[],
+    uniform Key_t srcKey[],
+    uniform Val_t srcVal[],
+    uniform int N)
+{
+  uniform int stageCount = 0;
+  for (uniform int stride = 2*programCount; stride < N; stride <<= 1, stageCount++);
+
+  uniform Key_t * uniform iKey, * uniform oKey;
+  uniform Val_t * uniform iVal, * uniform oVal;
+
+  if (stageCount & 1)
+  {
+    iKey = bufKey;
+    iVal = bufVal;
+    oKey = dstKey;
+    oVal = dstVal;
+  }
+  else
+  {
+    iKey = dstKey;
+    iVal = dstVal;
+    oKey = bufKey;
+    oVal = bufVal;
+  }
+
+
+
+  assert(N <= SAMPLE_STRIDE * MAX_SAMPLE_COUNT);
+  assert(N % (programCount*2) == 0);
+
+  // k20m: 140 M/s
+  {
+    // k20m:  2367 M/s
+    mergeSortGang(iKey, iVal, srcKey, srcVal, N/(2*programCount));
+
+#if 1
+    for (uniform int stride = 2*programCount; stride < N; stride <<= 1)
+    {
+      const uniform int lastSegmentElements = N % (2 * stride);
+
+      // k20m: 271 M/s
+      {
+#if 1
+        // k20m: 944 M/s
+        {
+          // k20m:  1396 M/s
+          //Find sample ranks and prepare for limiters merge
+          generateSampleRanks(ranksA, ranksB, iKey, stride, N);
+
+          // k20m: 2379 M/s
+          //Merge ranks and indices
+          mergeRanksAndIndices(limitsA, limitsB, ranksA, ranksB, stride, N);
+        }
+#endif
+
+        // k20m: 371 M/s
+        //Merge elementary intervals
+        mergeElementaryIntervals(nTasks, oKey, oVal, iKey, iVal, limitsA, limitsB, stride, N);
+      }
+
+      if (lastSegmentElements <= stride)
+        for (int i = programIndex; i < lastSegmentElements; i += programCount)
+          if (i < lastSegmentElements)
+          {
+            oKey[N-lastSegmentElements+i] = iKey[N-lastSegmentElements+i];
+            oVal[N-lastSegmentElements+i] = iVal[N-lastSegmentElements+i];
+          }
+
+
+      {
+        uniform Key_t * uniform tmpKey = iKey;
+        iKey = oKey;
+        oKey = tmpKey;
+      }
+      {
+        uniform Val_t * uniform tmpVal = iVal;
+        iVal = oVal;
+        oVal = tmpVal;
+      }
+    }
+#endif
+  }
+}
+extern "C"
+void mergeSort(
+    uniform Key_t dstKey[],
+    uniform Val_t dstVal[],
+    uniform Key_t bufKey[],
+    uniform Val_t bufVal[],
+    uniform Key_t srcKey[],
+    uniform Val_t srcVal[],
+    uniform int N)
+{
+  mergeSort___export<<<1,32>>>(
+      dstKey,
+      dstVal,
+      bufKey,
+      bufVal,
+      srcKey,
+      srcVal,
+      N);
+  sync;
+}
--- a/examples/portable/mergeSort/mergeSort.ispc
+++ b/examples/portable/mergeSort/mergeSort.ispc
@@ -0,0 +1,622 @@
+#include "keyType.h"
+
+#define SAMPLE_STRIDE programCount
+
+#define iDivUp(a,b) (((a) + (b) - 1)/(b))
+#define getSampleCount(dividend) (iDivUp((dividend), (SAMPLE_STRIDE)))
+
+#define W (/*sizeof(int)=*/4 * 8)
+
+static inline
+int nextPowerOfTwo(int x)
+{
+#if 0
+  --x;
+  x |= x >> 1;
+  x |= x >> 2;
+  x |= x >> 4;
+  x |= x >> 8;
+  x |= x >> 16;
+  return ++x;
+#else
+  return 1U << (W - count_leading_zeros(x - 1));
+#endif
+}
+
+static inline
+int binarySearchInclusiveRanks(
+    const int val, 
+    uniform int *data,
+    const int L, 
+    int stride)
+{
+  cif (L == 0)
+    return 0;
+
+  int pos = 0;
+  cfor (; stride > 0; stride >>= 1)
+  {
+    int newPos = min(pos + stride, L);
+
+    cif (data[newPos - 1] <= val)
+      pos = newPos;
+  }
+
+  return pos;
+}
+
+static inline
+int binarySearchExclusiveRanks(
+    const int val, 
+    uniform int *data, 
+    const int L, 
+    int stride)
+{
+  cif (L == 0)
+    return 0;
+
+  int pos = 0;
+  cfor (; stride > 0; stride >>= 1)
+  {
+    int newPos = min(pos + stride, L);
+
+    if (data[newPos - 1] < val)
+      pos = newPos;
+  }
+
+  return pos;
+}
+
+static inline
+int binarySearchInclusive(
+    const Key_t val, 
+    uniform Key_t *data,
+    const int L, 
+    int stride)
+{
+  cif (L == 0)
+    return 0;
+
+  int pos = 0;
+  cfor (; stride > 0; stride >>= 1)
+  {
+    int newPos = min(pos + stride, L);
+
+    if (data[newPos - 1] <= val)
+      pos = newPos;
+  }
+
+  return pos;
+}
+
+static inline
+int binarySearchExclusive(
+    const Key_t val, 
+    uniform Key_t *data, 
+    const int L, 
+    int stride)
+{
+  cif (L == 0)
+    return 0;
+
+  int pos = 0;
+  cfor (; stride > 0; stride >>= 1)
+  {
+    int newPos = min(pos + stride, L);
+
+    if (data[newPos - 1] < val)
+      pos = newPos;
+  }
+
+  return pos;
+}
+
+static inline
+int binarySearchInclusive1(
+    const Key_t val, 
+    Key_t data,
+    const uniform int L, 
+    uniform int stride)
+{
+  if (L == 0)
+    return 0;
+
+  int pos = 0;
+  for (; stride > 0; stride >>= 1)
+  {
+    int newPos = min(pos + stride, L);
+
+    if (shuffle(data,newPos - 1) <= val)
+      pos = newPos;
+  }
+
+  return pos;
+}
+
+static inline
+int binarySearchExclusive1(
+    const Key_t val, 
+    Key_t data, 
+    const uniform int L, 
+    uniform int stride)
+{
+  if (L == 0)
+    return 0;
+
+  int pos = 0;
+  for (; stride > 0; stride >>= 1)
+  {
+    int newPos = min(pos + stride, L);
+
+    if (shuffle(data,newPos - 1) < val)
+      pos = newPos;
+  }
+
+  return pos;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Bottom-level merge sort (binary search-based)
+////////////////////////////////////////////////////////////////////////////////
+task 
+void mergeSortGangKernel(
+    uniform int batchSize,
+    uniform Key_t dstKey[],
+    uniform Val_t dstVal[],
+    uniform Key_t srcKey[],
+    uniform Val_t srcVal[],
+    uniform int arrayLength)
+{
+  const uniform int blockIdx = taskIndex;
+  const uniform int blockDim = (batchSize + taskCount - 1)/taskCount;
+  const uniform int blockBeg =     blockIdx * blockDim;
+  const uniform int blockEnd = min(blockBeg + blockDim, batchSize);
+
+  uniform Key_t s_key[2*programCount];
+  uniform Val_t s_val[2*programCount];
+
+  for (uniform int block = blockBeg; block < blockEnd; block++)
+  {
+    const uniform int base = block * (programCount*2);
+    s_key[programIndex +            0] = srcKey[base + programIndex +            0];
+    s_val[programIndex +            0] = srcVal[base + programIndex +            0];
+    s_key[programIndex + programCount] = srcKey[base + programIndex + programCount];
+    s_val[programIndex + programCount] = srcVal[base + programIndex + programCount];
+
+    for (uniform int stride = 1; stride < arrayLength; stride <<= 1)
+    {
+      const int lPos = programIndex & (stride - 1);
+      const int offset = 2 * (programIndex - lPos);
+      uniform Key_t *baseKey = s_key + 2 * (programIndex - lPos);
+      uniform Val_t *baseVal = s_val + 2 * (programIndex - lPos);
+      
+      Key_t keyA = baseKey[lPos +      0];
+      Val_t valA = baseVal[lPos +      0];
+      Key_t keyB = baseKey[lPos + stride];
+      Val_t valB = baseVal[lPos + stride];
+
+      int posA = binarySearchExclusive(keyA, baseKey + stride, stride, stride) + lPos;
+      int posB = binarySearchInclusive(keyB, baseKey +      0, stride, stride) + lPos;
+
+      baseKey[posA] = keyA;
+      baseVal[posA] = valA;
+      baseKey[posB] = keyB;
+      baseVal[posB] = valB;
+    }
+
+    dstKey[base + programIndex +            0] = s_key[programIndex +            0];
+    dstVal[base + programIndex +            0] = s_val[programIndex +            0];
+    dstKey[base + programIndex + programCount] = s_key[programIndex + programCount];
+    dstVal[base + programIndex + programCount] = s_val[programIndex + programCount];
+  }
+}
+
+static inline
+void mergeSortGang(
+    uniform Key_t dstKey[],
+    uniform Val_t dstVal[],
+    uniform Key_t srcKey[],
+    uniform Val_t srcVal[],
+    uniform int batchSize)
+{
+  uniform int nTasks = num_cores()*4;
+#ifdef __NVPTX__
+  nTasks = iDivUp(batchSize,1);
+#endif
+  launch [nTasks] mergeSortGangKernel(batchSize, dstKey, dstVal, srcKey, srcVal, 2*programCount);
+  sync;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Merge step 1: generate sample ranks
+////////////////////////////////////////////////////////////////////////////////
+task
+void generateSampleRanksKernel(
+    uniform int nBlocks,
+    uniform int in_ranksA[],
+    uniform int in_ranksB[],
+    uniform Key_t in_srcKey[],
+    uniform int stride,
+    uniform int N,
+    uniform int totalProgramCount)
+{
+  const uniform int blockIdx = taskIndex;
+  const uniform int blockDim = (nBlocks + taskCount - 1)/taskCount;
+  const uniform int blockBeg =     blockIdx * blockDim;
+  const uniform int blockEnd = min(blockBeg + blockDim, nBlocks);
+  
+  for (uniform int block = blockBeg; block < blockEnd; block++)
+  {
+    const int pos = block * programCount + programIndex;
+    cif (pos >= totalProgramCount)
+      return;
+
+    const int           i = pos & ((stride / SAMPLE_STRIDE) - 1);
+    const int segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);
+
+    uniform Key_t * srcKey = in_srcKey + segmentBase;
+    uniform int * ranksA = in_ranksA + segmentBase / SAMPLE_STRIDE;
+    uniform int * ranksB = in_ranksB + segmentBase / SAMPLE_STRIDE;
+
+    const int segmentElementsA = stride;
+    const int segmentElementsB = min(stride, N - segmentBase - stride);
+    const int  segmentSamplesA = getSampleCount(segmentElementsA);
+    const int  segmentSamplesB = getSampleCount(segmentElementsB);
+
+    if (i < segmentSamplesA)
+    {
+      ranksA[i] = i * SAMPLE_STRIDE;
+      ranksB[i] = binarySearchExclusive(
+          srcKey[i * SAMPLE_STRIDE], srcKey + stride,
+          segmentElementsB, nextPowerOfTwo(segmentElementsB));
+    }
+
+    if (i < segmentSamplesB)
+    {
+      ranksB[(stride / SAMPLE_STRIDE) + i] = i * SAMPLE_STRIDE;
+      ranksA[(stride / SAMPLE_STRIDE) + i] = binarySearchInclusive(
+          srcKey[stride + i * SAMPLE_STRIDE], srcKey + 0,
+          segmentElementsA, nextPowerOfTwo(segmentElementsA));
+    }
+  }
+}
+
+static inline
+void generateSampleRanks(
+    uniform int ranksA[],
+    uniform int ranksB[],
+    uniform Key_t srcKey[],
+    uniform int stride,
+    uniform int N)
+{
+  uniform int lastSegmentElements = N % (2 * stride);
+  uniform int threadCount = (lastSegmentElements > stride) ? 
+    (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) : 
+    (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);
+
+  uniform int nBlocks = iDivUp(threadCount, SAMPLE_STRIDE);
+  uniform int nTasks = num_cores()*4;
+#ifdef __NVPTX__
+  nTasks = iDivUp(nBlocks,1);
+#endif
+
+  launch [nTasks] generateSampleRanksKernel(nBlocks, ranksA, ranksB, srcKey, stride, N, threadCount);
+  sync;
+}
+////////////////////////////////////////////////////////////////////////////////
+// Merge step 2: generate sample ranks and indices
+////////////////////////////////////////////////////////////////////////////////
+task 
+void mergeRanksAndIndicesKernel(
+    uniform int nBlocks,
+    uniform int in_Limits[],
+    uniform int in_Ranks[],
+    uniform int stride,
+    uniform int N,
+    uniform int totalProgramCount)
+{
+  const uniform int blockIdx = taskIndex;
+  const uniform int blockDim = (nBlocks + taskCount - 1)/taskCount;
+  const uniform int blockBeg =     blockIdx * blockDim;
+  const uniform int blockEnd = min(blockBeg + blockDim, nBlocks);
+  
+  for (uniform int block = blockBeg; block < blockEnd; block++)
+  {
+    int pos = block * programCount + programIndex;
+    cif (pos >= totalProgramCount)
+      return;
+
+    const int           i = pos & ((stride / SAMPLE_STRIDE) - 1);
+    const int segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);
+    uniform int *  ranks = in_Ranks  + (pos - i) * 2;
+    uniform int * limits = in_Limits + (pos - i) * 2;
+
+    const int segmentElementsA = stride;
+    const int segmentElementsB = min(stride, N - segmentBase - stride);
+    const int  segmentSamplesA = getSampleCount(segmentElementsA);
+    const int  segmentSamplesB = getSampleCount(segmentElementsB);
+
+    if (i < segmentSamplesA)
+    {
+      int dstPos = binarySearchExclusiveRanks(ranks[i], ranks + segmentSamplesA, segmentSamplesB, nextPowerOfTwo(segmentSamplesB)) + i;
+      limits[dstPos] = ranks[i];
+    }
+
+    if (i < segmentSamplesB)
+    {
+      int dstPos = binarySearchInclusiveRanks(ranks[segmentSamplesA + i], ranks, segmentSamplesA, nextPowerOfTwo(segmentSamplesA)) + i;
+      limits[dstPos] = ranks[segmentSamplesA + i];
+    }
+  }
+}
+static inline
+void mergeRanksAndIndices(
+    uniform int limitsA[],
+    uniform int limitsB[],
+    uniform int ranksA[],
+    uniform int ranksB[],
+    uniform int stride,
+    uniform int N)
+{
+  const uniform int lastSegmentElements = N % (2 * stride);
+  const uniform int threadCount = (lastSegmentElements > stride) ? 
+    (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) : 
+    (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);
+
+  const uniform int nBlocks = iDivUp(threadCount, SAMPLE_STRIDE);
+  uniform int nTasks = num_cores()*4;
+
+#ifdef __NVPTX__
+  nTasks = iDivUp(nBlocks,1);
+#endif
+
+  launch [nTasks] mergeRanksAndIndicesKernel(
+      nBlocks,
+      limitsA,
+      ranksA,
+      stride,
+      N,
+      threadCount);
+  launch [nTasks] mergeRanksAndIndicesKernel(
+      nBlocks,
+      limitsB,
+      ranksB,
+      stride,
+      N,
+      threadCount);
+  sync;
+}
+
+
+task
+void mergeElementaryIntervalsKernel(
+    uniform int mergePairs,
+    uniform Key_t dstKey[],
+    uniform Val_t dstVal[],
+    uniform Key_t srcKey[],
+    uniform Val_t srcVal[],
+    uniform int limitsA[],
+    uniform int limitsB[],
+    uniform int stride,
+    uniform int N)
+{
+  const uniform int blockIdx = taskIndex;
+  const uniform int blockDim = (mergePairs + taskCount - 1)/taskCount;
+  const uniform int blockBeg =     blockIdx * blockDim;
+  const uniform int blockEnd = min(blockBeg + blockDim, mergePairs);
+
+  for (uniform int block = blockBeg; block < blockEnd; block++)
+  {
+    const int uniform   intervalI =  block & ((2 * stride) / SAMPLE_STRIDE - 1);
+    const int uniform segmentBase = (block - intervalI) * SAMPLE_STRIDE;
+
+    //Set up threadblock-wide parameters
+
+    const uniform int segmentElementsA = stride;
+    const uniform int segmentElementsB = min(stride, N - segmentBase - stride);
+    const uniform int  segmentSamplesA = getSampleCount(segmentElementsA);
+    const uniform int  segmentSamplesB = getSampleCount(segmentElementsB);
+    const uniform int   segmentSamples = segmentSamplesA + segmentSamplesB;
+
+    const uniform int startSrcA = limitsA[block];
+    const uniform int startSrcB = limitsB[block];
+    const uniform int endSrcA   = (intervalI + 1 < segmentSamples) ? limitsA[block + 1] : segmentElementsA;
+    const uniform int endSrcB   = (intervalI + 1 < segmentSamples) ? limitsB[block + 1] : segmentElementsB;
+    const uniform int lenSrcA   = endSrcA - startSrcA;
+    const uniform int lenSrcB   = endSrcB - startSrcB;
+    const uniform int startDstA = startSrcA + startSrcB;
+    const uniform int startDstB = startDstA + lenSrcA;
+
+    //Load main input data
+
+    Key_t keyA, keyB;
+    Val_t valA, valB;
+    if (programIndex < lenSrcA)
+    {
+      keyA = srcKey[segmentBase + startSrcA + programIndex];
+      valA = srcVal[segmentBase + startSrcA + programIndex];
+    }
+
+    if (programIndex < lenSrcB)
+    {
+      keyB = srcKey[segmentBase + stride + startSrcB + programIndex];
+      valB = srcVal[segmentBase + stride + startSrcB + programIndex];
+    }
+
+    // Compute destination addresses for merge data
+    int dstPosA, dstPosB, dstA = -1, dstB = -1;
+    if (programIndex < lenSrcA)
+      dstPosA = binarySearchExclusive1(keyA, keyB, lenSrcB, SAMPLE_STRIDE) + programIndex;
+    if (programIndex < lenSrcB)
+      dstPosB = binarySearchInclusive1(keyB, keyA, lenSrcA, SAMPLE_STRIDE) + programIndex;
+
+    if (programIndex < lenSrcA && dstPosA < lenSrcA)
+      dstA = segmentBase + startDstA + dstPosA;
+    dstPosA -= lenSrcA;
+    if (programIndex < lenSrcA && dstPosA < lenSrcB)
+      dstA = segmentBase + startDstB + dstPosA;
+
+    if (programIndex < lenSrcB && dstPosB < lenSrcA)
+      dstB = segmentBase + startDstA + dstPosB;
+    dstPosB -= lenSrcA;
+    if (programIndex < lenSrcB && dstPosB < lenSrcB)
+      dstB = segmentBase + startDstB + dstPosB;
+
+    if (dstA >= 0) 
+    {
+      dstKey[dstA] = keyA;
+      dstVal[dstA] = valA;
+    }
+    if (dstB >= 0) 
+    {
+      dstKey[dstB] = keyB;
+      dstVal[dstB] = valB;
+    }
+  }
+}
+
+static inline
+void mergeElementaryIntervals(
+    uniform Key_t dstKey[],
+    uniform Val_t dstVal[],
+    uniform Key_t srcKey[],
+    uniform Val_t srcVal[],
+    uniform int limitsA[],
+    uniform int limitsB[],
+    uniform int stride,
+    uniform int N)
+{
+  const uniform int lastSegmentElements = N % (2 * stride);
+  const uniform int mergePairs = (lastSegmentElements > stride) ? getSampleCount(N) : (N - lastSegmentElements) / SAMPLE_STRIDE;
+
+
+  uniform int nTasks = num_cores()*4;
+#ifdef __NVPTX__
+  nTasks = iDivUp(mergePairs,1*programCount);
+#endif
+
+  launch [nTasks] mergeElementaryIntervalsKernel(
+      mergePairs,
+      dstKey,
+      dstVal,
+      srcKey,
+      srcVal,
+      limitsA,
+      limitsB,
+      stride,
+      N);
+  if (lastSegmentElements <= stride)
+    foreach (i = 0 ... lastSegmentElements)
+    {
+      dstKey[N-lastSegmentElements+i] = srcKey[N-lastSegmentElements+i];
+      dstVal[N-lastSegmentElements+i] = srcVal[N-lastSegmentElements+i];
+    }
+  sync;
+}
+
+static uniform int * uniform memPool = NULL;
+static uniform int * uniform ranksA;
+static uniform int * uniform ranksB;
+static uniform int * uniform limitsA;
+static uniform int * uniform limitsB;
+static uniform int MAX_SAMPLE_COUNT = 0;
+
+export 
+void openMergeSort()
+{
+  MAX_SAMPLE_COUNT = 8*32 * 131072 / programCount;
+  assert(memPool == NULL);
+  const uniform int nalloc = MAX_SAMPLE_COUNT * 4;
+  memPool = uniform new uniform int[nalloc];
+  ranksA  = memPool;
+  ranksB  =  ranksA + MAX_SAMPLE_COUNT;
+  limitsA =  ranksB + MAX_SAMPLE_COUNT;
+  limitsB = limitsA + MAX_SAMPLE_COUNT;
+}
+
+export
+void closeMergeSort()
+{
+  assert(memPool != NULL);
+  delete memPool;
+  memPool = NULL;
+}
+
+export 
+void mergeSort(
+    uniform Key_t dstKey[],
+    uniform Val_t dstVal[],
+    uniform Key_t bufKey[],
+    uniform Val_t bufVal[],
+    uniform Key_t srcKey[],
+    uniform Val_t srcVal[],
+    uniform int N)
+{
+  uniform int stageCount = 0;
+  for (uniform int stride = 2*programCount; stride < N; stride <<= 1, stageCount++);
+
+  uniform Key_t * uniform iKey, * uniform oKey;
+  uniform Val_t * uniform iVal, * uniform oVal;
+
+  if (stageCount & 1)
+  {
+    iKey = bufKey;
+    iVal = bufVal;
+    oKey = dstKey;
+    oVal = dstVal;
+  }
+  else
+  {
+    iKey = dstKey;
+    iVal = dstVal;
+    oKey = bufKey;
+    oVal = bufVal;
+  }
+
+
+
+  assert(N <= SAMPLE_STRIDE * MAX_SAMPLE_COUNT);
+  assert(N % (programCount*2) == 0);
+
+  // cpu: 28  gpu: 74 M/s
+  {
+    // cpu: 356   gpu: 534 M/s
+    mergeSortGang(iKey, iVal, srcKey, srcVal, N/(2*programCount));
+
+#if 1
+    for (uniform int stride = 2*programCount; stride < N; stride <<= 1)
+    {
+      // cpu: 30  gpu: 112 M/s
+      {
+#if 1
+        // cpu: 121  gpu: 460 M/s
+        {
+          // cpu: 190  gpu: 600 M/s
+          //Find sample ranks and prepare for limiters merge
+          generateSampleRanks(ranksA, ranksB, iKey, stride, N);
+
+          // cpu: 120 gpu: 457 M/s
+          //Merge ranks and indices
+          mergeRanksAndIndices(limitsA, limitsB, ranksA, ranksB, stride, N);
+        }
+#endif
+
+        // cpu: 287  gpu: 194 M/s 
+        //Merge elementary intervals
+        mergeElementaryIntervals(oKey, oVal, iKey, iVal, limitsA, limitsB, stride, N);
+      }
+
+      {
+        uniform Key_t * uniform tmpKey = iKey;
+        iKey = oKey;
+        oKey = tmpKey;
+      }
+      {
+        uniform Val_t * uniform tmpVal = iVal;
+        iVal = oVal;
+        oVal = tmpVal;
+      }
+    }
+#endif
+  }
+}