diff --git a/ctx.cpp b/ctx.cpp index 6f7413b7..39f56885 100644 --- a/ctx.cpp +++ b/ctx.cpp @@ -1967,6 +1967,7 @@ static llvm::Value* lConvertGepToGenericPtr(FunctionEmitContext *ctx, llvm::Valu */ static llvm::Value* lCorrectLocalPtr(FunctionEmitContext *ctx, llvm::Value* value) { + // return value; assert(value->getType()->isPointerTy()); llvm::PointerType *pt = llvm::dyn_cast(value->getType()); if (g->target->getISA() != Target::NVPTX || pt->getAddressSpace() != 3) return value; @@ -1981,6 +1982,7 @@ static llvm::Value* lCorrectLocalPtr(FunctionEmitContext *ctx, llvm::Value* valu */ static llvm::Value* lConvertToGenericPtr(FunctionEmitContext *ctx, llvm::Value *value, const SourcePos ¤tPos) { +// return value; if (!value->getType()->isPointerTy() || g->target->getISA() != Target::NVPTX) return value; llvm::PointerType *pt = llvm::dyn_cast(value->getType()); diff --git a/examples_ptx/bitonicSort/Makefile_cpu b/examples_ptx/bitonicSort/Makefile_cpu new file mode 100644 index 00000000..a70ba750 --- /dev/null +++ b/examples_ptx/bitonicSort/Makefile_cpu @@ -0,0 +1,9 @@ + +EXAMPLE=sort +CPP_SRC=sort.cpp sort_serial.cpp +ISPC_SRC=sort.ispc +ISPC_IA_TARGETS=avx1-i32x8 +ISPC_ARM_TARGETS=neon +#ISPC_FLAGS=-DDEBUG + +include ../common.mk diff --git a/examples_ptx/bitonicSort/Makefile_gpu b/examples_ptx/bitonicSort/Makefile_gpu new file mode 100644 index 00000000..97a51f26 --- /dev/null +++ b/examples_ptx/bitonicSort/Makefile_gpu @@ -0,0 +1,13 @@ +PROG=sort +ISPC_SRC=sort.ispc +CU_SRC=sort.cu +CXX_SRC=sort.cpp sort_serial.cpp +PTXCC_REGMAX=32 + +LLVM_GPU=1 +NVVM_GPU=1 + +include ../common_gpu.mk + + + diff --git a/examples_ptx/bitonicSort/bitonicSort.cpp b/examples_ptx/bitonicSort/bitonicSort.cpp new file mode 100644 index 00000000..21778d18 --- /dev/null +++ b/examples_ptx/bitonicSort/bitonicSort.cpp @@ -0,0 +1,157 @@ +/* + Copyright (c) 2013, Durham University + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Durham University nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* Author: Tomasz Koziara */ + +#include +#include +#include +#include +#include +#include +#include "../timing.h" +#include "../ispc_malloc.h" +#include "sort_ispc.h" + +using namespace ispc; + +extern void sort_serial (int n, unsigned int code[], int order[]); + +/* progress bar by Ross Hemsley; + * http://www.rosshemsley.co.uk/2011/02/creating-a-progress-bar-in-c-or-any-other-console-app/ */ +static inline void progressbar (unsigned int x, unsigned int n, unsigned int w = 50) +{ + if (n < 100) + { + x *= 100/n; + n = 100; + } + + if ((x != n) && (x % (n/100) != 0)) return; + + using namespace std; + float ratio = x/(float)n; + int c = ratio * w; + + cout << setw(3) << (int)(ratio*100) << "% ["; + for (int x=0; x +#include +#include "sortingNetworks_common.h" +#include "sortingNetworks_common.cuh" + + + +//////////////////////////////////////////////////////////////////////////////// +// Monolithic bitonic sort kernel for short arrays fitting into shared memory +//////////////////////////////////////////////////////////////////////////////// +__global__ void bitonicSortShared( + uint *d_DstKey, + uint *d_DstVal, + uint *d_SrcKey, + uint *d_SrcVal, + uint arrayLength, + uint dir +){ + //Shared memory storage for one or more short vectors + __shared__ uint s_key[SHARED_SIZE_LIMIT]; + __shared__ uint s_val[SHARED_SIZE_LIMIT]; + + //Offset to the beginning of subbatch and load data + d_SrcKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; + d_SrcVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; + d_DstKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; + d_DstVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; + s_key[threadIdx.x + 0] = d_SrcKey[ 0]; + s_val[threadIdx.x + 0] = d_SrcVal[ 0]; + s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcKey[(SHARED_SIZE_LIMIT / 2)]; + s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcVal[(SHARED_SIZE_LIMIT / 2)]; + + for(uint size = 2; size < arrayLength; size <<= 1){ + //Bitonic merge + uint ddd = dir ^ ( (threadIdx.x & (size / 2)) != 0 ); + for(uint stride = size / 2; stride > 0; stride >>= 1){ + __syncthreads(); + uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); + Comparator( + s_key[pos + 0], s_val[pos + 0], + s_key[pos + stride], s_val[pos + stride], + ddd + ); + } + } + + //ddd == dir for the last bitonic merge step + { + for(uint stride = arrayLength / 2; stride > 0; stride >>= 1){ + __syncthreads(); + uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); + Comparator( + s_key[pos + 0], s_val[pos + 0], + s_key[pos + stride], s_val[pos + stride], + dir + ); + } + } + + __syncthreads(); + d_DstKey[ 0] = s_key[threadIdx.x + 0]; + d_DstVal[ 0] = s_val[threadIdx.x + 0]; + d_DstKey[(SHARED_SIZE_LIMIT / 2)] = s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)]; + d_DstVal[(SHARED_SIZE_LIMIT / 2)] = s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)]; +} + + + +//////////////////////////////////////////////////////////////////////////////// +// Bitonic sort kernel for large arrays (not fitting into shared memory) +//////////////////////////////////////////////////////////////////////////////// +//Bottom-level bitonic sort +//Almost the same as bitonicSortShared with the exception of +//even / odd subarrays being sorted in opposite directions +//Bitonic merge accepts both +//Ascending | descending or descending | ascending sorted pairs +__global__ void bitonicSortShared1( + uint *d_DstKey, + uint *d_DstVal, + uint *d_SrcKey, + uint *d_SrcVal +){ + //Shared memory storage for current subarray + __shared__ uint s_key[SHARED_SIZE_LIMIT]; + __shared__ uint s_val[SHARED_SIZE_LIMIT]; + + //Offset to the beginning of subarray and load data + d_SrcKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; + d_SrcVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; + d_DstKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; + d_DstVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; + s_key[threadIdx.x + 0] = d_SrcKey[ 0]; + s_val[threadIdx.x + 0] = d_SrcVal[ 0]; + s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcKey[(SHARED_SIZE_LIMIT / 2)]; + s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcVal[(SHARED_SIZE_LIMIT / 2)]; + + for(uint size = 2; size < SHARED_SIZE_LIMIT; size <<= 1){ + //Bitonic merge + uint ddd = (threadIdx.x & (size / 2)) != 0; + for(uint stride = size / 2; stride > 0; stride >>= 1){ + __syncthreads(); + uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); + Comparator( + s_key[pos + 0], s_val[pos + 0], + s_key[pos + stride], s_val[pos + stride], + ddd + ); + } + } + + //Odd / even arrays of SHARED_SIZE_LIMIT elements + //sorted in opposite directions + uint ddd = blockIdx.x & 1; + { + for(uint stride = SHARED_SIZE_LIMIT / 2; stride > 0; stride >>= 1){ + __syncthreads(); + uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); + Comparator( + s_key[pos + 0], s_val[pos + 0], + s_key[pos + stride], s_val[pos + stride], + ddd + ); + } + } + + + __syncthreads(); + d_DstKey[ 0] = s_key[threadIdx.x + 0]; + d_DstVal[ 0] = s_val[threadIdx.x + 0]; + d_DstKey[(SHARED_SIZE_LIMIT / 2)] = s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)]; + d_DstVal[(SHARED_SIZE_LIMIT / 2)] = s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)]; +} + +//Bitonic merge iteration for stride >= SHARED_SIZE_LIMIT +__global__ void bitonicMergeGlobal( + uint *d_DstKey, + uint *d_DstVal, + uint *d_SrcKey, + uint *d_SrcVal, + uint arrayLength, + uint size, + uint stride, + uint dir +){ + uint global_comparatorI = blockIdx.x * blockDim.x + threadIdx.x; + uint comparatorI = global_comparatorI & (arrayLength / 2 - 1); + + //Bitonic merge + uint ddd = dir ^ ( (comparatorI & (size / 2)) != 0 ); + uint pos = 2 * global_comparatorI - (global_comparatorI & (stride - 1)); + + uint keyA = d_SrcKey[pos + 0]; + uint valA = d_SrcVal[pos + 0]; + uint keyB = d_SrcKey[pos + stride]; + uint valB = d_SrcVal[pos + stride]; + + Comparator( + keyA, valA, + keyB, valB, + ddd + ); + + d_DstKey[pos + 0] = keyA; + d_DstVal[pos + 0] = valA; + d_DstKey[pos + stride] = keyB; + d_DstVal[pos + stride] = valB; +} + +//Combined bitonic merge steps for +//size > SHARED_SIZE_LIMIT and stride = [1 .. SHARED_SIZE_LIMIT / 2] +__global__ void bitonicMergeShared( + uint *d_DstKey, + uint *d_DstVal, + uint *d_SrcKey, + uint *d_SrcVal, + uint arrayLength, + uint size, + uint dir +){ + //Shared memory storage for current subarray + __shared__ uint s_key[SHARED_SIZE_LIMIT]; + __shared__ uint s_val[SHARED_SIZE_LIMIT]; + + d_SrcKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; + d_SrcVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; + d_DstKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; + d_DstVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; + s_key[threadIdx.x + 0] = d_SrcKey[ 0]; + s_val[threadIdx.x + 0] = d_SrcVal[ 0]; + s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcKey[(SHARED_SIZE_LIMIT / 2)]; + s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcVal[(SHARED_SIZE_LIMIT / 2)]; + + //Bitonic merge + uint comparatorI = UMAD(blockIdx.x, blockDim.x, threadIdx.x) & ((arrayLength / 2) - 1); + uint ddd = dir ^ ( (comparatorI & (size / 2)) != 0 ); + for(uint stride = SHARED_SIZE_LIMIT / 2; stride > 0; stride >>= 1){ + __syncthreads(); + uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); + Comparator( + s_key[pos + 0], s_val[pos + 0], + s_key[pos + stride], s_val[pos + stride], + ddd + ); + } + + __syncthreads(); + d_DstKey[ 0] = s_key[threadIdx.x + 0]; + d_DstVal[ 0] = s_val[threadIdx.x + 0]; + d_DstKey[(SHARED_SIZE_LIMIT / 2)] = s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)]; + d_DstVal[(SHARED_SIZE_LIMIT / 2)] = s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)]; +} + + + +//////////////////////////////////////////////////////////////////////////////// +// Interface function +//////////////////////////////////////////////////////////////////////////////// +//Helper function (also used by odd-even merge sort) +extern "C" uint factorRadix2(uint *log2L, uint L){ + if(!L){ + *log2L = 0; + return 0; + }else{ + for(*log2L = 0; (L & 1) == 0; L >>= 1, *log2L++); + return L; + } +} + +extern "C" uint bitonicSort( + uint *d_DstKey, + uint *d_DstVal, + uint *d_SrcKey, + uint *d_SrcVal, + uint batchSize, + uint arrayLength, + uint dir +){ + //Nothing to sort + if(arrayLength < 2) + return 0; + + //Only power-of-two array lengths are supported by this implementation + uint log2L; + uint factorizationRemainder = factorRadix2(&log2L, arrayLength); + assert( factorizationRemainder == 1 ); + + dir = (dir != 0); + + uint blockCount = batchSize * arrayLength / SHARED_SIZE_LIMIT; + uint threadCount = SHARED_SIZE_LIMIT / 2; + + if(arrayLength <= SHARED_SIZE_LIMIT){ + assert( (batchSize * arrayLength) % SHARED_SIZE_LIMIT == 0 ); + bitonicSortShared<<>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, arrayLength, dir); + }else{ + bitonicSortShared1<<>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal); + + for(uint size = 2 * SHARED_SIZE_LIMIT; size <= arrayLength; size <<= 1) + for(unsigned stride = size / 2; stride > 0; stride >>= 1) + if(stride >= SHARED_SIZE_LIMIT){ + bitonicMergeGlobal<<<(batchSize * arrayLength) / 512, 256>>>(d_DstKey, d_DstVal, d_DstKey, d_DstVal, arrayLength, size, stride, dir); + }else{ + bitonicMergeShared<<>>(d_DstKey, d_DstVal, d_DstKey, d_DstVal, arrayLength, size, dir); + break; + } + } + return threadCount; +} + + +extern "C" int isDeviceEmulation(void){ + #ifdef __DEVICE_EMULATION__ + return 1; + #else + return 0; + #endif +} diff --git a/examples_ptx/bitonicSort/bitonicSort.ispc b/examples_ptx/bitonicSort/bitonicSort.ispc new file mode 100644 index 00000000..c1facd27 --- /dev/null +++ b/examples_ptx/bitonicSort/bitonicSort.ispc @@ -0,0 +1,289 @@ +#define SIZE_LIMIT (2*programCount) + +static inline void Comparator( + int &keyA, + int &valA, + int &keyB, + int &valB, + const int dir) +{ + if ((keyA > keyB) == dir) + { + int t; + t = keyA; keyA = keyB; keyB = t; + t = valA; valA = valB; valB = t; + } +} + +//////////////////////////////////////////////////////////////////////////////// +// Monolithic bitonic sort kernel for short arrays fitting into local memory +//////////////////////////////////////////////////////////////////////////////// +task +void bitonicSortLocal( + uniform int dstKey[], + uniform int dstVal[], + uniform int srcKey[], + uniform int srcVal[], + const uniform int arrayLength, + const uniform int dir) +{ + uniform int l_key[SIZE_LIMIT]; + uniform int l_val[SIZE_LIMIT]; + + //Offset to the beginning of subbatch and load data + const int offset = taskIndex0 * SIZE_LIMIT + programIndex; + l_key[programIndex + 0] = srcKey[offset]; + l_val[programIndex + 0] = srcVal[offset]; + l_key[programIndex + (SIZE_LIMIT/2)] = srcKey[offset +(SIZE_LIMIT/2)]; + l_val[programIndex + (SIZE_LIMIT/2)] = srcVal[offset +(SIZE_LIMIT/2)]; + + for (uniform int size = 2; size < arrayLength; size <<= 1) + { + //Bitonic merge + const int ddd = dir ^ ( (programIndex & (size / 2)) != 0 ); + for (uniform int stride = size / 2; stride > 0; stride >>= 1) + { + const int pos = 2 * programIndex - (programIndex & (stride - 1)); + int key_a = l_key[pos]; + int val_a = l_val[pos]; + int key_b = l_key[pos + stride]; + int val_b = l_val[pos + stride]; + Comparator(key_a, val_a, key_b, val_b, ddd); + l_key[pos] = key_a; + l_val[pos] = val_a; + l_key[pos + stride] = key_b; + l_val[pos + stride] = val_b; + } + } + + //ddd == dir for the last bitonic merge step + { + for (int stride = arrayLength / 2; stride > 0; stride >>= 1) + { + const int pos = 2 * programIndex - (programIndex & (stride - 1)); + int key_a = l_key[pos]; + int val_a = l_val[pos]; + int key_b = l_key[pos + stride]; + int val_b = l_val[pos + stride]; + Comparator(key_a, val_a, key_b, val_b, dir); + l_key[pos] = key_a; + l_val[pos] = val_a; + l_key[pos + stride] = key_b; + l_val[pos + stride] = val_b; + } + } + + dstKey[offset] = l_key[programIndex + 0]; + dstVal[offset] = l_val[programIndex + 0]; + dstKey[offset +(SIZE_LIMIT/2)] = l_key[programIndex + (SIZE_LIMIT/2)]; + dstVal[offset +(SIZE_LIMIT/2)] = l_val[programIndex + (SIZE_LIMIT/2)]; +} + +//////////////////////////////////////////////////////////////////////////////// +// Bitonic sort kernel for large arrays (not fitting into local memory) +//////////////////////////////////////////////////////////////////////////////// +//Bottom-level bitonic sort +//Almost the same as bitonicSortLocal with the only exception +//of even / odd subarrays (of LOCAL_SIZE_LIMIT points) being +//sorted in opposite directions +task +void bitonicSortLocal1( + uniform int dstKey[], + uniform int dstVal[], + uniform int srcKey[], + uniform int srcVal[]) +{ + uniform int l_key[SIZE_LIMIT]; + uniform int l_val[SIZE_LIMIT]; + + //Offset to the beginning of subarray and load data + const int offset = taskIndex0 * SIZE_LIMIT + programIndex; + l_key[programIndex + 0] = srcKey[offset]; + l_val[programIndex + 0] = srcVal[offset]; + l_key[programIndex + (SIZE_LIMIT/2)] = srcKey[offset + (SIZE_LIMIT/2)]; + l_val[programIndex + (SIZE_LIMIT/2)] = srcVal[offset + (SIZE_LIMIT/2)]; + + for (int size = 2; size < SIZE_LIMIT; size <<= 1) + { + //Bitonic merge + const int ddd = (programIndex & (size / 2)) != 0; + for (int stride = size / 2; stride > 0; stride >>= 1) + { + const int pos = 2 * programIndex - (programIndex & (stride - 1)); + int key_a = l_key[pos]; + int val_a = l_val[pos]; + int key_b = l_key[pos + stride]; + int val_b = l_val[pos + stride]; + Comparator(key_a, val_a, key_b, val_b, ddd); + l_key[pos] = key_a; + l_val[pos] = val_a; + l_key[pos + stride] = key_b; + l_val[pos + stride] = val_b; + } + } + + //Odd / even arrays of LOCAL_SIZE_LIMIT elements + //sorted in opposite directions + { + const int ddd = taskIndex0 & 1; + for (int stride = SIZE_LIMIT/2; stride > 0; stride >>= 1) + { + const int pos = 2 * programIndex - (programIndex & (stride - 1)); + int key_a = l_key[pos]; + int val_a = l_val[pos]; + int key_b = l_key[pos + stride]; + int val_b = l_val[pos + stride]; + Comparator(key_a, val_a, key_b, val_b, ddd); + l_key[pos] = key_a; + l_val[pos] = val_a; + l_key[pos + stride] = key_b; + l_val[pos + stride] = val_b; + } + } + + dstKey[offset] = l_key[programIndex + 0]; + dstVal[offset] = l_val[programIndex + 0]; + dstKey[offset + (SIZE_LIMIT/2)] = l_key[programIndex + (SIZE_LIMIT/2)]; + dstVal[offset + (SIZE_LIMIT/2)] = l_val[programIndex + (SIZE_LIMIT/2)]; +} + +//Bitonic merge iteration for 'stride' >= LOCAL_SIZE_LIMIT +task +void bitonicMergeGlobal( + uniform int dstKey[], + uniform int dstVal[], + uniform int srcKey[], + uniform int srcVal[], + const uniform int arrayLength, + const uniform int size, + const uniform int stride, + const uniform int dir) +{ + const int global_comparatorI = taskIndex0*programCount + programIndex; + const int comparatorI = global_comparatorI & (arrayLength / 2 - 1); + + //Bitonic merge + const int ddd = dir ^ ( (comparatorI & (size / 2)) != 0 ); + const int pos = 2 * global_comparatorI - (global_comparatorI & (stride - 1)); + + int keyA = srcKey[pos + 0]; + int valA = srcVal[pos + 0]; + int keyB = srcKey[pos + stride]; + int valB = srcVal[pos + stride]; + + Comparator( + keyA, valA, + keyB, valB, + ddd + ); + + dstKey[pos + 0] = keyA; + dstVal[pos + 0] = valA; + dstKey[pos + stride] = keyB; + dstVal[pos + stride] = valB; +} + +//Combined bitonic merge steps for +//'size' > LOCAL_SIZE_LIMIT and 'stride' = [1 .. LOCAL_SIZE_LIMIT / 2] +task +void bitonicMergeLocal( + uniform int dstKey[], + uniform int dstVal[], + uniform int srcKey[], + uniform int srcVal[], + const uniform int arrayLength, + const uniform int size, + const uniform int dir) +{ + uniform int l_key[SIZE_LIMIT]; + uniform int l_val[SIZE_LIMIT]; + + const int offset = taskIndex0 * SIZE_LIMIT + programIndex; + l_key[programIndex + 0] = srcKey[offset]; + l_val[programIndex + 0] = srcVal[offset]; + l_key[programIndex + (SIZE_LIMIT/2)] = srcKey[offset +(SIZE_LIMIT/2)]; + l_val[programIndex + (SIZE_LIMIT/2)] = srcVal[offset +(SIZE_LIMIT/2)]; + + //Bitonic merge + const int global_id = taskIndex0*programCount + programIndex; + const int comparatorI = global_id & ((arrayLength / 2) - 1); + const int ddd = dir ^ ( (comparatorI & (size / 2)) != 0 ); + for (int stride = SIZE_LIMIT / 2; stride > 0; stride >>= 1) + { + const int pos = 2 * programIndex - (programIndex & (stride - 1)); + int key_a = l_key[pos]; + int val_a = l_val[pos]; + int key_b = l_key[pos + stride]; + int val_b = l_val[pos + stride]; + Comparator(key_a, val_a, key_b, val_b, ddd); + l_key[pos] = key_a; + l_val[pos] = val_a; + l_key[pos + stride] = key_b; + l_val[pos + stride] = val_b; + } + + dstKey[offset] = l_key[programIndex + 0]; + dstVal[offset] = l_val[programIndex + 0]; + dstKey[offset +(SIZE_LIMIT/2)] = l_key[programIndex + (SIZE_LIMIT/2)]; + dstVal[offset +(SIZE_LIMIT/2)] = l_val[programIndex + (SIZE_LIMIT/2)]; +} + +static inline int factorRadix2(int &log2L, int L){ + if(!L){ + log2L = 0; + return 0; + }else{ + int val; + for(val = 0; (L & 1) == 0; L >>= 1, val++); + log2L = val; + return L; + } +} + +export +void bitoniSort( + uniform int dstKey[], + uniform int dstVal[], + uniform int srcKey[], + uniform int srcVal[], + const uniform int batchSize, + const uniform int arrayLength, + const uniform int dir) +{ + //Nothing to sort + if (arrayLength < 2) + return; + + //Only power-of-two array lengths are supported by this implementation + int log2L; + const int factorizationRemainder = factorRadix2(log2L, arrayLength); + assert( factorizationRemainder == 1 ); + + const uniform int blockCount = batchSize * arrayLength / SIZE_LIMIT; + const uniform int threadCount = SIZE_LIMIT / 2; + + if (arrayLength <= SIZE_LIMIT) + { + assert( (batchSize * arrayLength) % SIZE_LIMIT == 0 ); + launch [blockCount] bitonicSortLocal(dstKey, dstVal, srcKey, srcVal, arrayLength, dir); + sync; + } + else + { + launch [blockCount] bitonicSortLocal1(dstKey, dstVal, srcKey, srcVal); + sync; + + for(uniform int size = 2 * SIZE_LIMIT; size <= arrayLength; size <<= 1) + for(uniform int stride = size / 2; stride > 0; stride >>= 1) + if (stride >= SIZE_LIMIT) + { + launch [blockCount] bitonicMergeGlobal(dstKey, dstVal, dstKey, dstVal, arrayLength, size, stride, dir); + sync; + } + else + { + launch [blockCount] bitonicMergeLocal(dstKey, dstVal, dstKey, dstVal, arrayLength, size, dir); + sync; + } + } +}