first commit bitonicSort

This commit is contained in:
Evghenii
2014-01-27 14:02:42 +01:00
parent 673d814a45
commit 3ae4a7e660
6 changed files with 764 additions and 0 deletions

View File

@@ -1967,6 +1967,7 @@ static llvm::Value* lConvertGepToGenericPtr(FunctionEmitContext *ctx, llvm::Valu
*/
static llvm::Value* lCorrectLocalPtr(FunctionEmitContext *ctx, llvm::Value* value)
{
// return value;
assert(value->getType()->isPointerTy());
llvm::PointerType *pt = llvm::dyn_cast<llvm::PointerType>(value->getType());
if (g->target->getISA() != Target::NVPTX || pt->getAddressSpace() != 3) return value;
@@ -1981,6 +1982,7 @@ static llvm::Value* lCorrectLocalPtr(FunctionEmitContext *ctx, llvm::Value* valu
*/
static llvm::Value* lConvertToGenericPtr(FunctionEmitContext *ctx, llvm::Value *value, const SourcePos &currentPos)
{
// return value;
if (!value->getType()->isPointerTy() || g->target->getISA() != Target::NVPTX) return value;
llvm::PointerType *pt = llvm::dyn_cast<llvm::PointerType>(value->getType());

View File

@@ -0,0 +1,9 @@
EXAMPLE=sort
CPP_SRC=sort.cpp sort_serial.cpp
ISPC_SRC=sort.ispc
ISPC_IA_TARGETS=avx1-i32x8
ISPC_ARM_TARGETS=neon
#ISPC_FLAGS=-DDEBUG
include ../common.mk

View File

@@ -0,0 +1,13 @@
PROG=sort
ISPC_SRC=sort.ispc
CU_SRC=sort.cu
CXX_SRC=sort.cpp sort_serial.cpp
PTXCC_REGMAX=32
LLVM_GPU=1
NVVM_GPU=1
include ../common_gpu.mk

View File

@@ -0,0 +1,157 @@
/*
Copyright (c) 2013, Durham University
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Durham University nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Author: Tomasz Koziara */
#include <cstdio>
#include <cstdlib>
#include <algorithm>
#include <iostream>
#include <cassert>
#include <iomanip>
#include "../timing.h"
#include "../ispc_malloc.h"
#include "sort_ispc.h"
using namespace ispc;
extern void sort_serial (int n, unsigned int code[], int order[]);
/* progress bar by Ross Hemsley;
* http://www.rosshemsley.co.uk/2011/02/creating-a-progress-bar-in-c-or-any-other-console-app/ */
static inline void progressbar (unsigned int x, unsigned int n, unsigned int w = 50)
{
if (n < 100)
{
x *= 100/n;
n = 100;
}
if ((x != n) && (x % (n/100) != 0)) return;
using namespace std;
float ratio = x/(float)n;
int c = ratio * w;
cout << setw(3) << (int)(ratio*100) << "% [";
for (int x=0; x<c; x++) cout << "=";
for (int x=c; x<w; x++) cout << " ";
cout << "]\r" << flush;
}
int main (int argc, char *argv[])
{
int i, j, n = argc == 1 ? 1000000 : atoi(argv[1]), m = n < 100 ? 1 : 50, l = n < 100 ? n : RAND_MAX;
double tISPC1 = 0.0, tISPC2 = 0.0, tSerial = 0.0;
unsigned int *code = new unsigned int [n];
unsigned int *code_orig = new unsigned int [n];
int *order = new int [n];
for (j = 0; j < n; j ++) code_orig[j] = rand() % l;
ispcSetMallocHeapLimit(1024*1024*1024);
srand (0);
#ifndef _CUDA_
for (i = 0; i < m; i ++)
{
ispcMemcpy(code, code_orig, n*sizeof(unsigned int));
reset_and_start_timer();
sort_ispc (n, code, order, 1);
tISPC1 += get_elapsed_msec();
if (argc != 3)
progressbar (i, m);
}
printf("[sort ispc]:\t[%.3f] msec [%.3f Mpair/s]\n", tISPC1, 1.0e-3*n*m/tISPC1);
#endif
srand (0);
const int ntask = 13*8;
for (i = 0; i < m; i ++)
{
ispcMemcpy(code, code_orig, n*sizeof(unsigned int));
reset_and_start_timer();
sort_ispc (n, code, order, ntask);
tISPC2 += get_elapsed_msec();
if (argc != 3)
progressbar (i, m);
}
printf("[sort ispc + tasks]:\t[%.3f] msec [%.3f Mpair/s]\n", tISPC2, 1.0e-3*n*m/tISPC2);
unsigned int *code1 = new unsigned int [n];
for (int i = 0; i < n; i++)
code1[i] = code[i];
std::sort(code1, code1+n);
for (int i = 0; i < n; i++)
assert(code1[i] == code[i]);
srand (0);
for (i = 0; i < m; i ++)
{
ispcMemcpy(code, code_orig, n*sizeof(unsigned int));
reset_and_start_timer();
sort_serial (n, code, order);
tSerial += get_elapsed_msec();
if (argc != 3)
progressbar (i, m);
}
printf("[sort serial]:\t\t[%.3f] msec [%.3f Mpair/s]\n", tSerial, 1.0e-3*n*m/tSerial);
#ifndef _CUDA_
printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n", tSerial/tISPC1, tSerial/tISPC2);
#else
printf("\t\t\t\t(%.2fx speedup from ISPC + tasks)\n", tSerial/tISPC2);
#endif
delete code;
delete code_orig;
delete order;
return 0;
}

View File

@@ -0,0 +1,294 @@
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
//Based on http://www.iti.fh-flensburg.de/lang/algorithmen/sortieren/bitonic/bitonicen.htm
#include <assert.h>
#include <cutil_inline.h>
#include "sortingNetworks_common.h"
#include "sortingNetworks_common.cuh"
////////////////////////////////////////////////////////////////////////////////
// Monolithic bitonic sort kernel for short arrays fitting into shared memory
////////////////////////////////////////////////////////////////////////////////
__global__ void bitonicSortShared(
uint *d_DstKey,
uint *d_DstVal,
uint *d_SrcKey,
uint *d_SrcVal,
uint arrayLength,
uint dir
){
//Shared memory storage for one or more short vectors
__shared__ uint s_key[SHARED_SIZE_LIMIT];
__shared__ uint s_val[SHARED_SIZE_LIMIT];
//Offset to the beginning of subbatch and load data
d_SrcKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
d_SrcVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
d_DstKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
d_DstVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
s_key[threadIdx.x + 0] = d_SrcKey[ 0];
s_val[threadIdx.x + 0] = d_SrcVal[ 0];
s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcKey[(SHARED_SIZE_LIMIT / 2)];
s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcVal[(SHARED_SIZE_LIMIT / 2)];
for(uint size = 2; size < arrayLength; size <<= 1){
//Bitonic merge
uint ddd = dir ^ ( (threadIdx.x & (size / 2)) != 0 );
for(uint stride = size / 2; stride > 0; stride >>= 1){
__syncthreads();
uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
Comparator(
s_key[pos + 0], s_val[pos + 0],
s_key[pos + stride], s_val[pos + stride],
ddd
);
}
}
//ddd == dir for the last bitonic merge step
{
for(uint stride = arrayLength / 2; stride > 0; stride >>= 1){
__syncthreads();
uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
Comparator(
s_key[pos + 0], s_val[pos + 0],
s_key[pos + stride], s_val[pos + stride],
dir
);
}
}
__syncthreads();
d_DstKey[ 0] = s_key[threadIdx.x + 0];
d_DstVal[ 0] = s_val[threadIdx.x + 0];
d_DstKey[(SHARED_SIZE_LIMIT / 2)] = s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
d_DstVal[(SHARED_SIZE_LIMIT / 2)] = s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
}
////////////////////////////////////////////////////////////////////////////////
// Bitonic sort kernel for large arrays (not fitting into shared memory)
////////////////////////////////////////////////////////////////////////////////
//Bottom-level bitonic sort
//Almost the same as bitonicSortShared with the exception of
//even / odd subarrays being sorted in opposite directions
//Bitonic merge accepts both
//Ascending | descending or descending | ascending sorted pairs
__global__ void bitonicSortShared1(
uint *d_DstKey,
uint *d_DstVal,
uint *d_SrcKey,
uint *d_SrcVal
){
//Shared memory storage for current subarray
__shared__ uint s_key[SHARED_SIZE_LIMIT];
__shared__ uint s_val[SHARED_SIZE_LIMIT];
//Offset to the beginning of subarray and load data
d_SrcKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
d_SrcVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
d_DstKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
d_DstVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
s_key[threadIdx.x + 0] = d_SrcKey[ 0];
s_val[threadIdx.x + 0] = d_SrcVal[ 0];
s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcKey[(SHARED_SIZE_LIMIT / 2)];
s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcVal[(SHARED_SIZE_LIMIT / 2)];
for(uint size = 2; size < SHARED_SIZE_LIMIT; size <<= 1){
//Bitonic merge
uint ddd = (threadIdx.x & (size / 2)) != 0;
for(uint stride = size / 2; stride > 0; stride >>= 1){
__syncthreads();
uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
Comparator(
s_key[pos + 0], s_val[pos + 0],
s_key[pos + stride], s_val[pos + stride],
ddd
);
}
}
//Odd / even arrays of SHARED_SIZE_LIMIT elements
//sorted in opposite directions
uint ddd = blockIdx.x & 1;
{
for(uint stride = SHARED_SIZE_LIMIT / 2; stride > 0; stride >>= 1){
__syncthreads();
uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
Comparator(
s_key[pos + 0], s_val[pos + 0],
s_key[pos + stride], s_val[pos + stride],
ddd
);
}
}
__syncthreads();
d_DstKey[ 0] = s_key[threadIdx.x + 0];
d_DstVal[ 0] = s_val[threadIdx.x + 0];
d_DstKey[(SHARED_SIZE_LIMIT / 2)] = s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
d_DstVal[(SHARED_SIZE_LIMIT / 2)] = s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
}
//Bitonic merge iteration for stride >= SHARED_SIZE_LIMIT
__global__ void bitonicMergeGlobal(
uint *d_DstKey,
uint *d_DstVal,
uint *d_SrcKey,
uint *d_SrcVal,
uint arrayLength,
uint size,
uint stride,
uint dir
){
uint global_comparatorI = blockIdx.x * blockDim.x + threadIdx.x;
uint comparatorI = global_comparatorI & (arrayLength / 2 - 1);
//Bitonic merge
uint ddd = dir ^ ( (comparatorI & (size / 2)) != 0 );
uint pos = 2 * global_comparatorI - (global_comparatorI & (stride - 1));
uint keyA = d_SrcKey[pos + 0];
uint valA = d_SrcVal[pos + 0];
uint keyB = d_SrcKey[pos + stride];
uint valB = d_SrcVal[pos + stride];
Comparator(
keyA, valA,
keyB, valB,
ddd
);
d_DstKey[pos + 0] = keyA;
d_DstVal[pos + 0] = valA;
d_DstKey[pos + stride] = keyB;
d_DstVal[pos + stride] = valB;
}
//Combined bitonic merge steps for
//size > SHARED_SIZE_LIMIT and stride = [1 .. SHARED_SIZE_LIMIT / 2]
__global__ void bitonicMergeShared(
uint *d_DstKey,
uint *d_DstVal,
uint *d_SrcKey,
uint *d_SrcVal,
uint arrayLength,
uint size,
uint dir
){
//Shared memory storage for current subarray
__shared__ uint s_key[SHARED_SIZE_LIMIT];
__shared__ uint s_val[SHARED_SIZE_LIMIT];
d_SrcKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
d_SrcVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
d_DstKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
d_DstVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
s_key[threadIdx.x + 0] = d_SrcKey[ 0];
s_val[threadIdx.x + 0] = d_SrcVal[ 0];
s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcKey[(SHARED_SIZE_LIMIT / 2)];
s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcVal[(SHARED_SIZE_LIMIT / 2)];
//Bitonic merge
uint comparatorI = UMAD(blockIdx.x, blockDim.x, threadIdx.x) & ((arrayLength / 2) - 1);
uint ddd = dir ^ ( (comparatorI & (size / 2)) != 0 );
for(uint stride = SHARED_SIZE_LIMIT / 2; stride > 0; stride >>= 1){
__syncthreads();
uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
Comparator(
s_key[pos + 0], s_val[pos + 0],
s_key[pos + stride], s_val[pos + stride],
ddd
);
}
__syncthreads();
d_DstKey[ 0] = s_key[threadIdx.x + 0];
d_DstVal[ 0] = s_val[threadIdx.x + 0];
d_DstKey[(SHARED_SIZE_LIMIT / 2)] = s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
d_DstVal[(SHARED_SIZE_LIMIT / 2)] = s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
}
////////////////////////////////////////////////////////////////////////////////
// Interface function
////////////////////////////////////////////////////////////////////////////////
//Helper function (also used by odd-even merge sort)
extern "C" uint factorRadix2(uint *log2L, uint L){
if(!L){
*log2L = 0;
return 0;
}else{
for(*log2L = 0; (L & 1) == 0; L >>= 1, *log2L++);
return L;
}
}
extern "C" uint bitonicSort(
uint *d_DstKey,
uint *d_DstVal,
uint *d_SrcKey,
uint *d_SrcVal,
uint batchSize,
uint arrayLength,
uint dir
){
//Nothing to sort
if(arrayLength < 2)
return 0;
//Only power-of-two array lengths are supported by this implementation
uint log2L;
uint factorizationRemainder = factorRadix2(&log2L, arrayLength);
assert( factorizationRemainder == 1 );
dir = (dir != 0);
uint blockCount = batchSize * arrayLength / SHARED_SIZE_LIMIT;
uint threadCount = SHARED_SIZE_LIMIT / 2;
if(arrayLength <= SHARED_SIZE_LIMIT){
assert( (batchSize * arrayLength) % SHARED_SIZE_LIMIT == 0 );
bitonicSortShared<<<blockCount, threadCount>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, arrayLength, dir);
}else{
bitonicSortShared1<<<blockCount, threadCount>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal);
for(uint size = 2 * SHARED_SIZE_LIMIT; size <= arrayLength; size <<= 1)
for(unsigned stride = size / 2; stride > 0; stride >>= 1)
if(stride >= SHARED_SIZE_LIMIT){
bitonicMergeGlobal<<<(batchSize * arrayLength) / 512, 256>>>(d_DstKey, d_DstVal, d_DstKey, d_DstVal, arrayLength, size, stride, dir);
}else{
bitonicMergeShared<<<blockCount, threadCount>>>(d_DstKey, d_DstVal, d_DstKey, d_DstVal, arrayLength, size, dir);
break;
}
}
return threadCount;
}
extern "C" int isDeviceEmulation(void){
#ifdef __DEVICE_EMULATION__
return 1;
#else
return 0;
#endif
}

View File

@@ -0,0 +1,289 @@
#define SIZE_LIMIT (2*programCount)
static inline void Comparator(
int &keyA,
int &valA,
int &keyB,
int &valB,
const int dir)
{
if ((keyA > keyB) == dir)
{
int t;
t = keyA; keyA = keyB; keyB = t;
t = valA; valA = valB; valB = t;
}
}
////////////////////////////////////////////////////////////////////////////////
// Monolithic bitonic sort kernel for short arrays fitting into local memory
////////////////////////////////////////////////////////////////////////////////
task
void bitonicSortLocal(
uniform int dstKey[],
uniform int dstVal[],
uniform int srcKey[],
uniform int srcVal[],
const uniform int arrayLength,
const uniform int dir)
{
uniform int l_key[SIZE_LIMIT];
uniform int l_val[SIZE_LIMIT];
//Offset to the beginning of subbatch and load data
const int offset = taskIndex0 * SIZE_LIMIT + programIndex;
l_key[programIndex + 0] = srcKey[offset];
l_val[programIndex + 0] = srcVal[offset];
l_key[programIndex + (SIZE_LIMIT/2)] = srcKey[offset +(SIZE_LIMIT/2)];
l_val[programIndex + (SIZE_LIMIT/2)] = srcVal[offset +(SIZE_LIMIT/2)];
for (uniform int size = 2; size < arrayLength; size <<= 1)
{
//Bitonic merge
const int ddd = dir ^ ( (programIndex & (size / 2)) != 0 );
for (uniform int stride = size / 2; stride > 0; stride >>= 1)
{
const int pos = 2 * programIndex - (programIndex & (stride - 1));
int key_a = l_key[pos];
int val_a = l_val[pos];
int key_b = l_key[pos + stride];
int val_b = l_val[pos + stride];
Comparator(key_a, val_a, key_b, val_b, ddd);
l_key[pos] = key_a;
l_val[pos] = val_a;
l_key[pos + stride] = key_b;
l_val[pos + stride] = val_b;
}
}
//ddd == dir for the last bitonic merge step
{
for (int stride = arrayLength / 2; stride > 0; stride >>= 1)
{
const int pos = 2 * programIndex - (programIndex & (stride - 1));
int key_a = l_key[pos];
int val_a = l_val[pos];
int key_b = l_key[pos + stride];
int val_b = l_val[pos + stride];
Comparator(key_a, val_a, key_b, val_b, dir);
l_key[pos] = key_a;
l_val[pos] = val_a;
l_key[pos + stride] = key_b;
l_val[pos + stride] = val_b;
}
}
dstKey[offset] = l_key[programIndex + 0];
dstVal[offset] = l_val[programIndex + 0];
dstKey[offset +(SIZE_LIMIT/2)] = l_key[programIndex + (SIZE_LIMIT/2)];
dstVal[offset +(SIZE_LIMIT/2)] = l_val[programIndex + (SIZE_LIMIT/2)];
}
////////////////////////////////////////////////////////////////////////////////
// Bitonic sort kernel for large arrays (not fitting into local memory)
////////////////////////////////////////////////////////////////////////////////
//Bottom-level bitonic sort
//Almost the same as bitonicSortLocal with the only exception
//of even / odd subarrays (of LOCAL_SIZE_LIMIT points) being
//sorted in opposite directions
task
void bitonicSortLocal1(
uniform int dstKey[],
uniform int dstVal[],
uniform int srcKey[],
uniform int srcVal[])
{
uniform int l_key[SIZE_LIMIT];
uniform int l_val[SIZE_LIMIT];
//Offset to the beginning of subarray and load data
const int offset = taskIndex0 * SIZE_LIMIT + programIndex;
l_key[programIndex + 0] = srcKey[offset];
l_val[programIndex + 0] = srcVal[offset];
l_key[programIndex + (SIZE_LIMIT/2)] = srcKey[offset + (SIZE_LIMIT/2)];
l_val[programIndex + (SIZE_LIMIT/2)] = srcVal[offset + (SIZE_LIMIT/2)];
for (int size = 2; size < SIZE_LIMIT; size <<= 1)
{
//Bitonic merge
const int ddd = (programIndex & (size / 2)) != 0;
for (int stride = size / 2; stride > 0; stride >>= 1)
{
const int pos = 2 * programIndex - (programIndex & (stride - 1));
int key_a = l_key[pos];
int val_a = l_val[pos];
int key_b = l_key[pos + stride];
int val_b = l_val[pos + stride];
Comparator(key_a, val_a, key_b, val_b, ddd);
l_key[pos] = key_a;
l_val[pos] = val_a;
l_key[pos + stride] = key_b;
l_val[pos + stride] = val_b;
}
}
//Odd / even arrays of LOCAL_SIZE_LIMIT elements
//sorted in opposite directions
{
const int ddd = taskIndex0 & 1;
for (int stride = SIZE_LIMIT/2; stride > 0; stride >>= 1)
{
const int pos = 2 * programIndex - (programIndex & (stride - 1));
int key_a = l_key[pos];
int val_a = l_val[pos];
int key_b = l_key[pos + stride];
int val_b = l_val[pos + stride];
Comparator(key_a, val_a, key_b, val_b, ddd);
l_key[pos] = key_a;
l_val[pos] = val_a;
l_key[pos + stride] = key_b;
l_val[pos + stride] = val_b;
}
}
dstKey[offset] = l_key[programIndex + 0];
dstVal[offset] = l_val[programIndex + 0];
dstKey[offset + (SIZE_LIMIT/2)] = l_key[programIndex + (SIZE_LIMIT/2)];
dstVal[offset + (SIZE_LIMIT/2)] = l_val[programIndex + (SIZE_LIMIT/2)];
}
//Bitonic merge iteration for 'stride' >= LOCAL_SIZE_LIMIT
task
void bitonicMergeGlobal(
uniform int dstKey[],
uniform int dstVal[],
uniform int srcKey[],
uniform int srcVal[],
const uniform int arrayLength,
const uniform int size,
const uniform int stride,
const uniform int dir)
{
const int global_comparatorI = taskIndex0*programCount + programIndex;
const int comparatorI = global_comparatorI & (arrayLength / 2 - 1);
//Bitonic merge
const int ddd = dir ^ ( (comparatorI & (size / 2)) != 0 );
const int pos = 2 * global_comparatorI - (global_comparatorI & (stride - 1));
int keyA = srcKey[pos + 0];
int valA = srcVal[pos + 0];
int keyB = srcKey[pos + stride];
int valB = srcVal[pos + stride];
Comparator(
keyA, valA,
keyB, valB,
ddd
);
dstKey[pos + 0] = keyA;
dstVal[pos + 0] = valA;
dstKey[pos + stride] = keyB;
dstVal[pos + stride] = valB;
}
//Combined bitonic merge steps for
//'size' > LOCAL_SIZE_LIMIT and 'stride' = [1 .. LOCAL_SIZE_LIMIT / 2]
task
void bitonicMergeLocal(
uniform int dstKey[],
uniform int dstVal[],
uniform int srcKey[],
uniform int srcVal[],
const uniform int arrayLength,
const uniform int size,
const uniform int dir)
{
uniform int l_key[SIZE_LIMIT];
uniform int l_val[SIZE_LIMIT];
const int offset = taskIndex0 * SIZE_LIMIT + programIndex;
l_key[programIndex + 0] = srcKey[offset];
l_val[programIndex + 0] = srcVal[offset];
l_key[programIndex + (SIZE_LIMIT/2)] = srcKey[offset +(SIZE_LIMIT/2)];
l_val[programIndex + (SIZE_LIMIT/2)] = srcVal[offset +(SIZE_LIMIT/2)];
//Bitonic merge
const int global_id = taskIndex0*programCount + programIndex;
const int comparatorI = global_id & ((arrayLength / 2) - 1);
const int ddd = dir ^ ( (comparatorI & (size / 2)) != 0 );
for (int stride = SIZE_LIMIT / 2; stride > 0; stride >>= 1)
{
const int pos = 2 * programIndex - (programIndex & (stride - 1));
int key_a = l_key[pos];
int val_a = l_val[pos];
int key_b = l_key[pos + stride];
int val_b = l_val[pos + stride];
Comparator(key_a, val_a, key_b, val_b, ddd);
l_key[pos] = key_a;
l_val[pos] = val_a;
l_key[pos + stride] = key_b;
l_val[pos + stride] = val_b;
}
dstKey[offset] = l_key[programIndex + 0];
dstVal[offset] = l_val[programIndex + 0];
dstKey[offset +(SIZE_LIMIT/2)] = l_key[programIndex + (SIZE_LIMIT/2)];
dstVal[offset +(SIZE_LIMIT/2)] = l_val[programIndex + (SIZE_LIMIT/2)];
}
static inline int factorRadix2(int &log2L, int L){
if(!L){
log2L = 0;
return 0;
}else{
int val;
for(val = 0; (L & 1) == 0; L >>= 1, val++);
log2L = val;
return L;
}
}
export
void bitoniSort(
uniform int dstKey[],
uniform int dstVal[],
uniform int srcKey[],
uniform int srcVal[],
const uniform int batchSize,
const uniform int arrayLength,
const uniform int dir)
{
//Nothing to sort
if (arrayLength < 2)
return;
//Only power-of-two array lengths are supported by this implementation
int log2L;
const int factorizationRemainder = factorRadix2(log2L, arrayLength);
assert( factorizationRemainder == 1 );
const uniform int blockCount = batchSize * arrayLength / SIZE_LIMIT;
const uniform int threadCount = SIZE_LIMIT / 2;
if (arrayLength <= SIZE_LIMIT)
{
assert( (batchSize * arrayLength) % SIZE_LIMIT == 0 );
launch [blockCount] bitonicSortLocal(dstKey, dstVal, srcKey, srcVal, arrayLength, dir);
sync;
}
else
{
launch [blockCount] bitonicSortLocal1(dstKey, dstVal, srcKey, srcVal);
sync;
for(uniform int size = 2 * SIZE_LIMIT; size <= arrayLength; size <<= 1)
for(uniform int stride = size / 2; stride > 0; stride >>= 1)
if (stride >= SIZE_LIMIT)
{
launch [blockCount] bitonicMergeGlobal(dstKey, dstVal, dstKey, dstVal, arrayLength, size, stride, dir);
sync;
}
else
{
launch [blockCount] bitonicMergeLocal(dstKey, dstVal, dstKey, dstVal, arrayLength, size, dir);
sync;
}
}
}