first commit bitonicSort

2014-01-27 14:02:42 +01:00
parent 673d814a45
commit 3ae4a7e660
6 changed files with 764 additions and 0 deletions
--- a/ctx.cpp
+++ b/ctx.cpp
@@ -1967,6 +1967,7 @@ static llvm::Value* lConvertGepToGenericPtr(FunctionEmitContext *ctx, llvm::Valu
 */
 static llvm::Value* lCorrectLocalPtr(FunctionEmitContext *ctx, llvm::Value* value)
 {
 // return value;
  assert(value->getType()->isPointerTy());
  llvm::PointerType *pt = llvm::dyn_cast<llvm::PointerType>(value->getType());
  if (g->target->getISA() != Target::NVPTX || pt->getAddressSpace() != 3) return value;
@@ -1981,6 +1982,7 @@ static llvm::Value* lCorrectLocalPtr(FunctionEmitContext *ctx, llvm::Value* valu
 */
 static llvm::Value* lConvertToGenericPtr(FunctionEmitContext *ctx, llvm::Value *value, const SourcePos &currentPos)
 {
 //  return value;
  if (!value->getType()->isPointerTy() || g->target->getISA() != Target::NVPTX) return value;
  llvm::PointerType *pt = llvm::dyn_cast<llvm::PointerType>(value->getType());
--- a/examples_ptx/bitonicSort/Makefile_cpu
+++ b/examples_ptx/bitonicSort/Makefile_cpu
@@ -0,0 +1,9 @@
 EXAMPLE=sort
 CPP_SRC=sort.cpp sort_serial.cpp
 ISPC_SRC=sort.ispc
 ISPC_IA_TARGETS=avx1-i32x8
 ISPC_ARM_TARGETS=neon
 #ISPC_FLAGS=-DDEBUG
 include ../common.mk
--- a/examples_ptx/bitonicSort/Makefile_gpu
+++ b/examples_ptx/bitonicSort/Makefile_gpu
@@ -0,0 +1,13 @@
 PROG=sort
 ISPC_SRC=sort.ispc
 CU_SRC=sort.cu
 CXX_SRC=sort.cpp  sort_serial.cpp
 PTXCC_REGMAX=32
 LLVM_GPU=1
 NVVM_GPU=1
 include ../common_gpu.mk
--- a/examples_ptx/bitonicSort/bitonicSort.cpp
+++ b/examples_ptx/bitonicSort/bitonicSort.cpp
@@ -0,0 +1,157 @@
 /*
  Copyright (c) 2013, Durham University
  All rights reserved.
  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions are
  met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the name of Durham University nor the names of its
      contributors may be used to endorse or promote products derived from
      this software without specific prior written permission.
   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
 */
 /* Author: Tomasz Koziara */
 #include <cstdio>
 #include <cstdlib>
 #include <algorithm>
 #include <iostream>
 #include <cassert>
 #include <iomanip>
 #include "../timing.h"
 #include "../ispc_malloc.h"
 #include "sort_ispc.h"
 using namespace ispc;
 extern void sort_serial (int n, unsigned int code[], int order[]);
 /* progress bar by Ross Hemsley;
 * http://www.rosshemsley.co.uk/2011/02/creating-a-progress-bar-in-c-or-any-other-console-app/ */
 static inline void progressbar (unsigned int x, unsigned int n, unsigned int w = 50)
 {
  if (n < 100)
  {
    x *= 100/n;
    n = 100;
  }
  if ((x != n) && (x % (n/100) != 0)) return;
  using namespace std;
  float ratio  =  x/(float)n;
  int c =  ratio * w;
  cout << setw(3) << (int)(ratio*100) << "% [";
  for (int x=0; x<c; x++) cout << "=";
  for (int x=c; x<w; x++) cout << " ";
  cout << "]\r" << flush;
 }
 int main (int argc, char *argv[])
 {
  int i, j, n = argc == 1 ? 1000000 : atoi(argv[1]), m = n < 100 ? 1 : 50, l = n < 100 ? n : RAND_MAX;
  double tISPC1 = 0.0, tISPC2 = 0.0, tSerial = 0.0;
  unsigned int *code      = new unsigned int [n];
  unsigned int *code_orig = new unsigned int [n];
  int *order = new int [n];
  for (j = 0; j < n; j ++) code_orig[j] = rand() % l;
  ispcSetMallocHeapLimit(1024*1024*1024);
  srand (0);
 #ifndef _CUDA_
  for (i = 0; i < m; i ++)
  {
    ispcMemcpy(code, code_orig, n*sizeof(unsigned int));
    reset_and_start_timer();
    sort_ispc (n, code, order, 1);
    tISPC1 += get_elapsed_msec();
    if (argc != 3)
        progressbar (i, m);
  }
  printf("[sort ispc]:\t[%.3f] msec [%.3f Mpair/s]\n", tISPC1, 1.0e-3*n*m/tISPC1);
 #endif
  srand (0);
  const int ntask = 13*8;
  for (i = 0; i < m; i ++)
  {
    ispcMemcpy(code, code_orig, n*sizeof(unsigned int));
    reset_and_start_timer();
    sort_ispc (n, code, order, ntask);
    tISPC2 += get_elapsed_msec();
    if (argc != 3)
        progressbar (i, m);
  }
  printf("[sort ispc + tasks]:\t[%.3f] msec [%.3f Mpair/s]\n", tISPC2, 1.0e-3*n*m/tISPC2);
  unsigned int *code1 =  new unsigned int [n];
  for (int i = 0; i < n; i++) 
    code1[i] = code[i];
  std::sort(code1, code1+n);
  for (int i = 0; i < n; i++)
    assert(code1[i] == code[i]);
  srand (0);
  for (i = 0; i < m; i ++)
  {
    ispcMemcpy(code, code_orig, n*sizeof(unsigned int));
    reset_and_start_timer();
    sort_serial (n, code, order);
    tSerial += get_elapsed_msec();
    if (argc != 3)
        progressbar (i, m);
  }
  printf("[sort serial]:\t\t[%.3f] msec [%.3f Mpair/s]\n", tSerial, 1.0e-3*n*m/tSerial);
 #ifndef _CUDA_
  printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n", tSerial/tISPC1, tSerial/tISPC2);
 #else
  printf("\t\t\t\t(%.2fx speedup from ISPC + tasks)\n", tSerial/tISPC2);
 #endif
  delete code;
  delete code_orig;
  delete order;
  return 0;
 }
--- a/examples_ptx/bitonicSort/bitonicSort.cu
+++ b/examples_ptx/bitonicSort/bitonicSort.cu
@@ -0,0 +1,294 @@
 /*
 * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 *
 */
 //Based on http://www.iti.fh-flensburg.de/lang/algorithmen/sortieren/bitonic/bitonicen.htm
 #include <assert.h>
 #include <cutil_inline.h>
 #include "sortingNetworks_common.h"
 #include "sortingNetworks_common.cuh"
 ////////////////////////////////////////////////////////////////////////////////
 // Monolithic bitonic sort kernel for short arrays fitting into shared memory
 ////////////////////////////////////////////////////////////////////////////////
 __global__ void bitonicSortShared(
    uint *d_DstKey,
    uint *d_DstVal,
    uint *d_SrcKey,
    uint *d_SrcVal,
    uint arrayLength,
    uint dir
 ){
    //Shared memory storage for one or more short vectors
    __shared__ uint s_key[SHARED_SIZE_LIMIT];
    __shared__ uint s_val[SHARED_SIZE_LIMIT];
    //Offset to the beginning of subbatch and load data
    d_SrcKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
    d_SrcVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
    d_DstKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
    d_DstVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
    s_key[threadIdx.x +                       0] = d_SrcKey[                      0];
    s_val[threadIdx.x +                       0] = d_SrcVal[                      0];
    s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcKey[(SHARED_SIZE_LIMIT / 2)];
    s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcVal[(SHARED_SIZE_LIMIT / 2)];
    for(uint size = 2; size < arrayLength; size <<= 1){
        //Bitonic merge
        uint ddd = dir ^ ( (threadIdx.x & (size / 2)) != 0 );
        for(uint stride = size / 2; stride > 0; stride >>= 1){
            __syncthreads();
            uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
            Comparator(
                s_key[pos +      0], s_val[pos +      0],
                s_key[pos + stride], s_val[pos + stride],
                ddd
            );
        }
    }
    //ddd == dir for the last bitonic merge step
    {
        for(uint stride = arrayLength / 2; stride > 0; stride >>= 1){
            __syncthreads();
            uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
            Comparator(
                s_key[pos +      0], s_val[pos +      0],
                s_key[pos + stride], s_val[pos + stride],
                dir
            );
        }
    }
    __syncthreads();
    d_DstKey[                      0] = s_key[threadIdx.x +                       0];
    d_DstVal[                      0] = s_val[threadIdx.x +                       0];
    d_DstKey[(SHARED_SIZE_LIMIT / 2)] = s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
    d_DstVal[(SHARED_SIZE_LIMIT / 2)] = s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
 }
 ////////////////////////////////////////////////////////////////////////////////
 // Bitonic sort kernel for large arrays (not fitting into shared memory)
 ////////////////////////////////////////////////////////////////////////////////
 //Bottom-level bitonic sort
 //Almost the same as bitonicSortShared with the exception of
 //even / odd subarrays being sorted in opposite directions
 //Bitonic merge accepts both
 //Ascending | descending or descending | ascending sorted pairs
 __global__ void bitonicSortShared1(
    uint *d_DstKey,
    uint *d_DstVal,
    uint *d_SrcKey,
    uint *d_SrcVal
 ){
    //Shared memory storage for current subarray
    __shared__ uint s_key[SHARED_SIZE_LIMIT];
    __shared__ uint s_val[SHARED_SIZE_LIMIT];
    //Offset to the beginning of subarray and load data
    d_SrcKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
    d_SrcVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
    d_DstKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
    d_DstVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
    s_key[threadIdx.x +                       0] = d_SrcKey[                      0];
    s_val[threadIdx.x +                       0] = d_SrcVal[                      0];
    s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcKey[(SHARED_SIZE_LIMIT / 2)];
    s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcVal[(SHARED_SIZE_LIMIT / 2)];
    for(uint size = 2; size < SHARED_SIZE_LIMIT; size <<= 1){
        //Bitonic merge
        uint ddd = (threadIdx.x & (size / 2)) != 0;
        for(uint stride = size / 2; stride > 0; stride >>= 1){
            __syncthreads();
            uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
            Comparator(
                s_key[pos +      0], s_val[pos +      0],
                s_key[pos + stride], s_val[pos + stride],
                ddd
            );
        }
    }
    //Odd / even arrays of SHARED_SIZE_LIMIT elements
    //sorted in opposite directions
    uint ddd = blockIdx.x & 1;
    {
        for(uint stride = SHARED_SIZE_LIMIT / 2; stride > 0; stride >>= 1){
            __syncthreads();
            uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
            Comparator(
                s_key[pos +      0], s_val[pos +      0],
                s_key[pos + stride], s_val[pos + stride],
                ddd
            );
        }
    }
    __syncthreads();
    d_DstKey[                      0] = s_key[threadIdx.x +                       0];
    d_DstVal[                      0] = s_val[threadIdx.x +                       0];
    d_DstKey[(SHARED_SIZE_LIMIT / 2)] = s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
    d_DstVal[(SHARED_SIZE_LIMIT / 2)] = s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
 }
 //Bitonic merge iteration for stride >= SHARED_SIZE_LIMIT
 __global__ void bitonicMergeGlobal(
    uint *d_DstKey,
    uint *d_DstVal,
    uint *d_SrcKey,
    uint *d_SrcVal,
    uint arrayLength,
    uint size,
    uint stride,
    uint dir
 ){
    uint global_comparatorI = blockIdx.x * blockDim.x + threadIdx.x;
    uint        comparatorI = global_comparatorI & (arrayLength / 2 - 1);
    //Bitonic merge
    uint ddd = dir ^ ( (comparatorI & (size / 2)) != 0 );
    uint pos = 2 * global_comparatorI - (global_comparatorI & (stride - 1));
    uint keyA = d_SrcKey[pos +      0];
    uint valA = d_SrcVal[pos +      0];
    uint keyB = d_SrcKey[pos + stride];
    uint valB = d_SrcVal[pos + stride];
    Comparator(
        keyA, valA,
        keyB, valB,
        ddd
    );
    d_DstKey[pos +      0] = keyA;
    d_DstVal[pos +      0] = valA;
    d_DstKey[pos + stride] = keyB;
    d_DstVal[pos + stride] = valB;
 }
 //Combined bitonic merge steps for
 //size > SHARED_SIZE_LIMIT and stride = [1 .. SHARED_SIZE_LIMIT / 2]
 __global__ void bitonicMergeShared(
    uint *d_DstKey,
    uint *d_DstVal,
    uint *d_SrcKey,
    uint *d_SrcVal,
    uint arrayLength,
    uint size,
    uint dir
 ){
    //Shared memory storage for current subarray
    __shared__ uint s_key[SHARED_SIZE_LIMIT];
    __shared__ uint s_val[SHARED_SIZE_LIMIT];
    d_SrcKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
    d_SrcVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
    d_DstKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
    d_DstVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
    s_key[threadIdx.x +                       0] = d_SrcKey[                      0];
    s_val[threadIdx.x +                       0] = d_SrcVal[                      0];
    s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcKey[(SHARED_SIZE_LIMIT / 2)];
    s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcVal[(SHARED_SIZE_LIMIT / 2)];
    //Bitonic merge
    uint comparatorI = UMAD(blockIdx.x, blockDim.x, threadIdx.x) & ((arrayLength / 2) - 1);
    uint ddd = dir ^ ( (comparatorI & (size / 2)) != 0 );
    for(uint stride = SHARED_SIZE_LIMIT / 2; stride > 0; stride >>= 1){
        __syncthreads();
        uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
        Comparator(
            s_key[pos +      0], s_val[pos +      0],
            s_key[pos + stride], s_val[pos + stride],
            ddd
        );
    }
    __syncthreads();
    d_DstKey[                      0] = s_key[threadIdx.x +                       0];
    d_DstVal[                      0] = s_val[threadIdx.x +                       0];
    d_DstKey[(SHARED_SIZE_LIMIT / 2)] = s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
    d_DstVal[(SHARED_SIZE_LIMIT / 2)] = s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
 }
 ////////////////////////////////////////////////////////////////////////////////
 // Interface function
 ////////////////////////////////////////////////////////////////////////////////
 //Helper function (also used by odd-even merge sort)
 extern "C" uint factorRadix2(uint *log2L, uint L){
    if(!L){
        *log2L = 0;
        return 0;
    }else{
        for(*log2L = 0; (L & 1) == 0; L >>= 1, *log2L++);
        return L;
    }
 }
 extern "C" uint bitonicSort(
    uint *d_DstKey,
    uint *d_DstVal,
    uint *d_SrcKey,
    uint *d_SrcVal,
    uint batchSize,
    uint arrayLength,
    uint dir
 ){
    //Nothing to sort
    if(arrayLength < 2)
        return 0;
    //Only power-of-two array lengths are supported by this implementation
    uint log2L;
    uint factorizationRemainder = factorRadix2(&log2L, arrayLength);
    assert( factorizationRemainder == 1 );
    dir = (dir != 0);
    uint  blockCount = batchSize * arrayLength / SHARED_SIZE_LIMIT;
    uint threadCount = SHARED_SIZE_LIMIT / 2;
    if(arrayLength <= SHARED_SIZE_LIMIT){
        assert( (batchSize * arrayLength) % SHARED_SIZE_LIMIT == 0 );
        bitonicSortShared<<<blockCount, threadCount>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, arrayLength, dir);
    }else{
        bitonicSortShared1<<<blockCount, threadCount>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal);
        for(uint size = 2 * SHARED_SIZE_LIMIT; size <= arrayLength; size <<= 1)
            for(unsigned stride = size / 2; stride > 0; stride >>= 1)
                if(stride >= SHARED_SIZE_LIMIT){
                    bitonicMergeGlobal<<<(batchSize * arrayLength) / 512, 256>>>(d_DstKey, d_DstVal, d_DstKey, d_DstVal, arrayLength, size, stride, dir);
                }else{
                    bitonicMergeShared<<<blockCount, threadCount>>>(d_DstKey, d_DstVal, d_DstKey, d_DstVal, arrayLength, size, dir);
                    break;
                }
    }
    return threadCount;   
 }
 extern "C" int isDeviceEmulation(void){
    #ifdef __DEVICE_EMULATION__
        return 1;
    #else
        return 0;
    #endif
 }
--- a/examples_ptx/bitonicSort/bitonicSort.ispc
+++ b/examples_ptx/bitonicSort/bitonicSort.ispc
@@ -0,0 +1,289 @@
 #define SIZE_LIMIT (2*programCount)
 static inline void Comparator(
    int &keyA,
    int &valA,
    int &keyB,
    int &valB,
    const int dir)
 {
  if ((keyA > keyB) == dir)
  {
    int t;
    t = keyA; keyA = keyB; keyB = t;
    t = valA; valA = valB; valB = t;
  }
 }
 ////////////////////////////////////////////////////////////////////////////////
 // Monolithic bitonic sort kernel for short arrays fitting into local memory
 ////////////////////////////////////////////////////////////////////////////////
 task
 void bitonicSortLocal(
    uniform int dstKey[],
    uniform int dstVal[],
    uniform int srcKey[],
    uniform int srcVal[],
    const uniform int arrayLength,
    const uniform int dir)
 {
  uniform int l_key[SIZE_LIMIT];
  uniform int l_val[SIZE_LIMIT];
  //Offset to the beginning of subbatch and load data
  const int offset = taskIndex0 * SIZE_LIMIT + programIndex;
  l_key[programIndex +              0] = srcKey[offset];
  l_val[programIndex +              0] = srcVal[offset];
  l_key[programIndex + (SIZE_LIMIT/2)] = srcKey[offset +(SIZE_LIMIT/2)];
  l_val[programIndex + (SIZE_LIMIT/2)] = srcVal[offset +(SIZE_LIMIT/2)];
  for (uniform int size = 2; size < arrayLength; size <<= 1)
  {
    //Bitonic merge
    const int ddd = dir ^ ( (programIndex & (size / 2)) != 0 );
    for (uniform int stride = size / 2; stride > 0; stride >>= 1)
    {
      const int pos = 2 * programIndex - (programIndex & (stride - 1));
      int key_a = l_key[pos];
      int val_a = l_val[pos];
      int key_b = l_key[pos + stride];
      int val_b = l_val[pos + stride];
      Comparator(key_a, val_a, key_b, val_b, ddd);
      l_key[pos] = key_a;
      l_val[pos] = val_a;
      l_key[pos + stride] = key_b;
      l_val[pos + stride] = val_b;
    }
  }
  //ddd == dir for the last bitonic merge step
  {
    for (int stride = arrayLength / 2; stride > 0; stride >>= 1)
    {
      const int pos = 2 * programIndex - (programIndex & (stride - 1));
      int key_a = l_key[pos];
      int val_a = l_val[pos];
      int key_b = l_key[pos + stride];
      int val_b = l_val[pos + stride];
      Comparator(key_a, val_a, key_b, val_b, dir);
      l_key[pos] = key_a;
      l_val[pos] = val_a;
      l_key[pos + stride] = key_b;
      l_val[pos + stride] = val_b;
    }
  }
  dstKey[offset]                 = l_key[programIndex +              0];
  dstVal[offset]                 = l_val[programIndex +              0];
  dstKey[offset +(SIZE_LIMIT/2)] = l_key[programIndex + (SIZE_LIMIT/2)];
  dstVal[offset +(SIZE_LIMIT/2)] = l_val[programIndex + (SIZE_LIMIT/2)];
 }
 ////////////////////////////////////////////////////////////////////////////////
 // Bitonic sort kernel for large arrays (not fitting into local memory)
 ////////////////////////////////////////////////////////////////////////////////
 //Bottom-level bitonic sort
 //Almost the same as bitonicSortLocal with the only exception
 //of even / odd subarrays (of LOCAL_SIZE_LIMIT points) being
 //sorted in opposite directions
 task
 void bitonicSortLocal1(
    uniform int dstKey[],
    uniform int dstVal[],
    uniform int srcKey[],
    uniform int srcVal[])
 {
  uniform int l_key[SIZE_LIMIT];
  uniform int l_val[SIZE_LIMIT];
  //Offset to the beginning of subarray and load data
  const int offset = taskIndex0 * SIZE_LIMIT + programIndex;
  l_key[programIndex +              0] = srcKey[offset];
  l_val[programIndex +              0] = srcVal[offset];
  l_key[programIndex + (SIZE_LIMIT/2)] = srcKey[offset + (SIZE_LIMIT/2)];
  l_val[programIndex + (SIZE_LIMIT/2)] = srcVal[offset + (SIZE_LIMIT/2)];
  for (int size = 2; size < SIZE_LIMIT; size <<= 1)
  {
    //Bitonic merge
    const int ddd = (programIndex & (size / 2)) != 0;
    for (int stride = size / 2; stride > 0; stride >>= 1)
    {
      const int pos = 2 * programIndex - (programIndex & (stride - 1));
      int key_a = l_key[pos];
      int val_a = l_val[pos];
      int key_b = l_key[pos + stride];
      int val_b = l_val[pos + stride];
      Comparator(key_a, val_a, key_b, val_b, ddd);
      l_key[pos] = key_a;
      l_val[pos] = val_a;
      l_key[pos + stride] = key_b;
      l_val[pos + stride] = val_b;
    }
  }
  //Odd / even arrays of LOCAL_SIZE_LIMIT elements
  //sorted in opposite directions
  {
    const int ddd = taskIndex0 & 1;
    for (int stride = SIZE_LIMIT/2; stride > 0; stride >>= 1)
    {
      const int pos = 2 * programIndex - (programIndex & (stride - 1));
      int key_a = l_key[pos];
      int val_a = l_val[pos];
      int key_b = l_key[pos + stride];
      int val_b = l_val[pos + stride];
      Comparator(key_a, val_a, key_b, val_b, ddd);
      l_key[pos] = key_a;
      l_val[pos] = val_a;
      l_key[pos + stride] = key_b;
      l_val[pos + stride] = val_b;
    }
  }
  dstKey[offset]                  = l_key[programIndex +              0];
  dstVal[offset]                  = l_val[programIndex +              0];
  dstKey[offset + (SIZE_LIMIT/2)] = l_key[programIndex + (SIZE_LIMIT/2)];
  dstVal[offset + (SIZE_LIMIT/2)] = l_val[programIndex + (SIZE_LIMIT/2)];
 }
 //Bitonic merge iteration for 'stride' >= LOCAL_SIZE_LIMIT
 task
 void bitonicMergeGlobal(
    uniform int dstKey[],
    uniform int dstVal[],
    uniform int srcKey[],
    uniform int srcVal[],
    const uniform int arrayLength,
    const uniform int size,
    const uniform int stride,
    const uniform int dir)
 {
  const int global_comparatorI = taskIndex0*programCount + programIndex;
  const int        comparatorI = global_comparatorI & (arrayLength / 2 - 1);
  //Bitonic merge
  const int ddd = dir ^ ( (comparatorI & (size / 2)) != 0 );
  const int pos = 2 * global_comparatorI - (global_comparatorI & (stride - 1));
  int keyA = srcKey[pos +      0];
  int valA = srcVal[pos +      0];
  int keyB = srcKey[pos + stride];
  int valB = srcVal[pos + stride];
  Comparator(
      keyA, valA,
      keyB, valB,
      ddd
      );
  dstKey[pos +      0] = keyA;
  dstVal[pos +      0] = valA;
  dstKey[pos + stride] = keyB;
  dstVal[pos + stride] = valB;
 }
 //Combined bitonic merge steps for
 //'size' > LOCAL_SIZE_LIMIT and 'stride' = [1 .. LOCAL_SIZE_LIMIT / 2]
 task
 void bitonicMergeLocal(
    uniform int dstKey[],
    uniform int dstVal[],
    uniform int srcKey[],
    uniform int srcVal[],
    const uniform int arrayLength,
    const uniform int size,
    const uniform int dir)
 {
  uniform int l_key[SIZE_LIMIT];
  uniform int l_val[SIZE_LIMIT];
  const int offset = taskIndex0 * SIZE_LIMIT + programIndex;
  l_key[programIndex +              0] = srcKey[offset];
  l_val[programIndex +              0] = srcVal[offset];
  l_key[programIndex + (SIZE_LIMIT/2)] = srcKey[offset +(SIZE_LIMIT/2)];
  l_val[programIndex + (SIZE_LIMIT/2)] = srcVal[offset +(SIZE_LIMIT/2)];
  //Bitonic merge
  const int   global_id = taskIndex0*programCount + programIndex;
  const int comparatorI = global_id & ((arrayLength / 2) - 1);
  const int         ddd = dir ^ ( (comparatorI & (size / 2)) != 0 );
  for (int stride = SIZE_LIMIT / 2; stride > 0; stride >>= 1)
  {
      const int pos = 2 * programIndex - (programIndex & (stride - 1));
      int key_a = l_key[pos];
      int val_a = l_val[pos];
      int key_b = l_key[pos + stride];
      int val_b = l_val[pos + stride];
      Comparator(key_a, val_a, key_b, val_b, ddd);
      l_key[pos] = key_a;
      l_val[pos] = val_a;
      l_key[pos + stride] = key_b;
      l_val[pos + stride] = val_b;
  }
  dstKey[offset]                 = l_key[programIndex +              0];
  dstVal[offset]                 = l_val[programIndex +              0];
  dstKey[offset +(SIZE_LIMIT/2)] = l_key[programIndex + (SIZE_LIMIT/2)];
  dstVal[offset +(SIZE_LIMIT/2)] = l_val[programIndex + (SIZE_LIMIT/2)];
 }
 static inline int factorRadix2(int &log2L, int L){
  if(!L){
    log2L = 0;
    return 0;
  }else{
    int val;
    for(val = 0; (L & 1) == 0; L >>= 1, val++);
    log2L = val;
    return L;
  }
 }
 export
 void bitoniSort(
    uniform int dstKey[],
    uniform int dstVal[],
    uniform int srcKey[],
    uniform int srcVal[],
    const uniform int batchSize,
    const uniform int arrayLength,
    const uniform int dir)
 {
  //Nothing to sort
  if (arrayLength < 2)
    return;
  //Only power-of-two array lengths are supported by this implementation
  int log2L;
  const int factorizationRemainder = factorRadix2(log2L, arrayLength);
  assert( factorizationRemainder == 1 );
  const uniform int  blockCount = batchSize * arrayLength / SIZE_LIMIT;
  const uniform int threadCount = SIZE_LIMIT / 2;
  if (arrayLength <= SIZE_LIMIT)
  {
    assert( (batchSize * arrayLength) % SIZE_LIMIT == 0 );
    launch [blockCount] bitonicSortLocal(dstKey, dstVal, srcKey, srcVal, arrayLength, dir);
    sync;
  }
  else
  {
    launch [blockCount] bitonicSortLocal1(dstKey, dstVal, srcKey, srcVal);
    sync;
    for(uniform int size = 2 * SIZE_LIMIT; size <= arrayLength; size <<= 1)
      for(uniform int stride = size / 2; stride > 0; stride >>= 1)
        if (stride >= SIZE_LIMIT)
        {
          launch [blockCount] bitonicMergeGlobal(dstKey, dstVal, dstKey, dstVal, arrayLength, size, stride, dir);
          sync;
        }
        else
        {
          launch [blockCount] bitonicMergeLocal(dstKey, dstVal, dstKey, dstVal, arrayLength, size, dir);
          sync;
        }
  }
 }