// Bitonic sorting network for key/value pairs, written as ISPC tasks.
// Number of elements each task keeps in its local key/value buffers: two per
// program instance, i.e. one compare-exchange pair per instance and step.
#define SIZE_LIMIT (2 * programCount)

// Compare-exchange step of the sorting network.
//
// Exchanges the (key, value) pair A with the pair B when the result of
// (keyA > keyB) equals dir; otherwise both pairs are left untouched.
//   keyA, valA - first pair, updated in place
//   keyB, valB - second pair, updated in place
//   dir        - compared against the boolean (keyA > keyB); presumably
//                always 0 or 1 -- confirm against callers
static inline void Comparator(
    int &keyA,
    int &valA,
    int &keyB,
    int &valB,
    const int dir)
{
    if ((keyA > keyB) == dir)
    {
        const int heldKey = keyA;
        const int heldVal = valA;

        keyA = keyB;
        valA = valB;

        keyB = heldKey;
        valB = heldVal;
    }
}

////////////////////////////////////////////////////////////////////////////////
// Monolithic bitonic sort kernel for short arrays fitting into local memory
////////////////////////////////////////////////////////////////////////////////
// Each task copies SIZE_LIMIT consecutive key/value pairs, starting at element
// taskIndex0 * SIZE_LIMIT, into task-local buffers, runs the bitonic network on
// them in runs of arrayLength elements, and stores the result at the same
// positions of the destination arrays.
//   dstKey, dstVal - output keys / values
//   srcKey, srcVal - input keys / values
//   arrayLength    - length of each run that is sorted independently
//   dir            - direction flag handed to Comparator in the final merge
//                    (and xor-ed with the per-instance flag in earlier steps)
task
void bitonicSortLocal(
    uniform int dstKey[],
    uniform int dstVal[],
    uniform int srcKey[],
    uniform int srcVal[],
    const uniform int arrayLength,
    const uniform int dir)
{
    uniform int l_key[SIZE_LIMIT];
    uniform int l_val[SIZE_LIMIT];

    //Offset to the beginning of subbatch and load data
    //(each program instance loads one element from each half of the buffer)
    const int offset = taskIndex0 * SIZE_LIMIT + programIndex;
    l_key[programIndex + 0] = srcKey[offset];
    l_val[programIndex + 0] = srcVal[offset];
    l_key[programIndex + (SIZE_LIMIT/2)] = srcKey[offset + (SIZE_LIMIT/2)];
    l_val[programIndex + (SIZE_LIMIT/2)] = srcVal[offset + (SIZE_LIMIT/2)];

    for (uniform int size = 2; size < arrayLength; size <<= 1)
    {
        //Bitonic merge: the direction alternates between neighbouring
        //groups of size/2 comparators
        const int ddd = dir ^ ( (programIndex & (size / 2)) != 0 );
        for (uniform int stride = size / 2; stride > 0; stride >>= 1)
        {
            //Lower index of the pair handled by this program instance
            const int pos = 2 * programIndex - (programIndex & (stride - 1));
            int key_a = l_key[pos];
            int val_a = l_val[pos];
            int key_b = l_key[pos + stride];
            int val_b = l_val[pos + stride];
            Comparator(key_a, val_a, key_b, val_b, ddd);
            l_key[pos] = key_a;
            l_val[pos] = val_a;
            l_key[pos + stride] = key_b;
            l_val[pos + stride] = val_b;
        }
    }

    //ddd == dir for the last bitonic merge step.
    //stride depends only on the uniform arrayLength, so the loop counter is
    //declared uniform like the counters of the loop nest above.
    for (uniform int stride = arrayLength / 2; stride > 0; stride >>= 1)
    {
        const int pos = 2 * programIndex - (programIndex & (stride - 1));
        int key_a = l_key[pos];
        int val_a = l_val[pos];
        int key_b = l_key[pos + stride];
        int val_b = l_val[pos + stride];
        Comparator(key_a, val_a, key_b, val_b, dir);
        l_key[pos] = key_a;
        l_val[pos] = val_a;
        l_key[pos + stride] = key_b;
        l_val[pos + stride] = val_b;
    }

    //Store the local buffers back, mirroring the load pattern
    dstKey[offset] = l_key[programIndex + 0];
    dstVal[offset] = l_val[programIndex + 0];
    dstKey[offset + (SIZE_LIMIT/2)] = l_key[programIndex + (SIZE_LIMIT/2)];
    dstVal[offset + (SIZE_LIMIT/2)] = l_val[programIndex + (SIZE_LIMIT/2)];
}

////////////////////////////////////////////////////////////////////////////////
// Bitonic sort kernel for large arrays (not fitting into local memory)
////////////////////////////////////////////////////////////////////////////////
//Bottom-level bitonic sort
//Almost the same as bitonicSortLocal with the only exception
//of even / odd subarrays (of SIZE_LIMIT points) being
//sorted in opposite directions
//
// Each task copies SIZE_LIMIT consecutive key/value pairs, starting at element
// taskIndex0 * SIZE_LIMIT, into task-local buffers, sorts them there and
// stores the result at the same positions of the destination arrays.
//   dstKey, dstVal - output keys / values
//   srcKey, srcVal - input keys / values
task
void bitonicSortLocal1(
    uniform int dstKey[],
    uniform int dstVal[],
    uniform int srcKey[],
    uniform int srcVal[])
{
    uniform int l_key[SIZE_LIMIT];
    uniform int l_val[SIZE_LIMIT];

    //Offset to the beginning of subarray and load data
    const int offset = taskIndex0 * SIZE_LIMIT + programIndex;
    l_key[programIndex + 0] = srcKey[offset];
    l_val[programIndex + 0] = srcVal[offset];
    l_key[programIndex + (SIZE_LIMIT/2)] = srcKey[offset + (SIZE_LIMIT/2)];
    l_val[programIndex + (SIZE_LIMIT/2)] = srcVal[offset + (SIZE_LIMIT/2)];

    //size and stride depend only on SIZE_LIMIT, so both loop counters are
    //uniform (same as in bitonicSortLocal)
    for (uniform int size = 2; size < SIZE_LIMIT; size <<= 1)
    {
        //Bitonic merge: the direction alternates between neighbouring
        //groups of size/2 comparators
        const int ddd = (programIndex & (size / 2)) != 0;
        for (uniform int stride = size / 2; stride > 0; stride >>= 1)
        {
            //Lower index of the pair handled by this program instance
            const int pos = 2 * programIndex - (programIndex & (stride - 1));
            int key_a = l_key[pos];
            int val_a = l_val[pos];
            int key_b = l_key[pos + stride];
            int val_b = l_val[pos + stride];
            Comparator(key_a, val_a, key_b, val_b, ddd);
            l_key[pos] = key_a;
            l_val[pos] = val_a;
            l_key[pos + stride] = key_b;
            l_val[pos + stride] = val_b;
        }
    }

    //Odd / even arrays of SIZE_LIMIT elements
    //sorted in opposite directions
    {
        //The direction is the low bit of the task index, identical for all
        //program instances of the task
        const uniform int taskDir = taskIndex0 & 1;
        for (uniform int stride = SIZE_LIMIT/2; stride > 0; stride >>= 1)
        {
            const int pos = 2 * programIndex - (programIndex & (stride - 1));
            int key_a = l_key[pos];
            int val_a = l_val[pos];
            int key_b = l_key[pos + stride];
            int val_b = l_val[pos + stride];
            Comparator(key_a, val_a, key_b, val_b, taskDir);
            l_key[pos] = key_a;
            l_val[pos] = val_a;
            l_key[pos + stride] = key_b;
            l_val[pos + stride] = val_b;
        }
    }

    //Store the local buffers back, mirroring the load pattern
    dstKey[offset] = l_key[programIndex + 0];
    dstVal[offset] = l_val[programIndex + 0];
    dstKey[offset + (SIZE_LIMIT/2)] = l_key[programIndex + (SIZE_LIMIT/2)];
    dstVal[offset + (SIZE_LIMIT/2)] = l_val[programIndex + (SIZE_LIMIT/2)];
}

//Single bitonic merge step performed directly on the source/destination
//arrays; the caller in this file uses it for 'stride' >= SIZE_LIMIT.
//
// Every program instance of every task handles exactly one comparator: it
// reads one pair of elements 'stride' apart, compare-exchanges them and writes
// both to the same positions of the destination arrays.
//   dstKey, dstVal - output keys / values
//   srcKey, srcVal - input keys / values
//   arrayLength    - length of one array of the batch
//   size           - size of the bitonic sequences currently being merged
//   stride         - distance between the two elements of a pair
//   dir            - direction flag, xor-ed with the per-comparator flag
task
void bitonicMergeGlobal(
    uniform int dstKey[],
    uniform int dstVal[],
    uniform int srcKey[],
    uniform int srcVal[],
    const uniform int arrayLength,
    const uniform int size,
    const uniform int stride,
    const uniform int dir)
{
    //Comparator index across the whole launch
    const int comparatorId = taskIndex0*programCount + programIndex;

    //Positions of the two elements this comparator works on
    const int lo = 2 * comparatorId - (comparatorId & (stride - 1));
    const int hi = lo + stride;

    //Comparator index inside its own array selects the merge direction
    const int arrayComparatorId = comparatorId & (arrayLength / 2 - 1);
    const int mergeDir = dir ^ ( (arrayComparatorId & (size / 2)) != 0 );

    int loKey = srcKey[lo];
    int loVal = srcVal[lo];
    int hiKey = srcKey[hi];
    int hiVal = srcVal[hi];

    Comparator(loKey, loVal, hiKey, hiVal, mergeDir);

    dstKey[lo] = loKey;
    dstVal[lo] = loVal;
    dstKey[hi] = hiKey;
    dstVal[hi] = hiVal;
}

//Combined bitonic merge steps for
//'size' > SIZE_LIMIT and 'stride' = [1 .. SIZE_LIMIT / 2]
//
// Each task copies SIZE_LIMIT consecutive key/value pairs, starting at element
// taskIndex0 * SIZE_LIMIT, into task-local buffers, applies every merge step
// with stride SIZE_LIMIT/2 down to 1 there, and stores the result at the same
// positions of the destination arrays.
//   dstKey, dstVal - output keys / values
//   srcKey, srcVal - input keys / values
//   arrayLength    - length of one array of the batch
//   size           - size of the bitonic sequences currently being merged
//   dir            - direction flag, xor-ed with the per-comparator flag
task
void bitonicMergeLocal(
    uniform int dstKey[],
    uniform int dstVal[],
    uniform int srcKey[],
    uniform int srcVal[],
    const uniform int arrayLength,
    const uniform int size,
    const uniform int dir)
{
    uniform int l_key[SIZE_LIMIT];
    uniform int l_val[SIZE_LIMIT];

    //Load one element from each half of the local buffer per program instance
    const int offset = taskIndex0 * SIZE_LIMIT + programIndex;
    l_key[programIndex + 0] = srcKey[offset];
    l_val[programIndex + 0] = srcVal[offset];
    l_key[programIndex + (SIZE_LIMIT/2)] = srcKey[offset + (SIZE_LIMIT/2)];
    l_val[programIndex + (SIZE_LIMIT/2)] = srcVal[offset + (SIZE_LIMIT/2)];

    //Bitonic merge
    //Comparator index inside its own array selects the merge direction
    const int global_id = taskIndex0*programCount + programIndex;
    const int comparatorI = global_id & ((arrayLength / 2) - 1);
    const int ddd = dir ^ ( (comparatorI & (size / 2)) != 0 );

    //stride depends only on SIZE_LIMIT, so the loop counter is uniform
    for (uniform int stride = SIZE_LIMIT / 2; stride > 0; stride >>= 1)
    {
        //Lower index of the pair handled by this program instance
        const int pos = 2 * programIndex - (programIndex & (stride - 1));
        int key_a = l_key[pos];
        int val_a = l_val[pos];
        int key_b = l_key[pos + stride];
        int val_b = l_val[pos + stride];
        Comparator(key_a, val_a, key_b, val_b, ddd);
        l_key[pos] = key_a;
        l_val[pos] = val_a;
        l_key[pos + stride] = key_b;
        l_val[pos + stride] = val_b;
    }

    //Store the local buffers back, mirroring the load pattern
    dstKey[offset] = l_key[programIndex + 0];
    dstVal[offset] = l_val[programIndex + 0];
    dstKey[offset + (SIZE_LIMIT/2)] = l_key[programIndex + (SIZE_LIMIT/2)];
    dstVal[offset + (SIZE_LIMIT/2)] = l_val[programIndex + (SIZE_LIMIT/2)];
}

// Splits L into a power of two and an odd remainder.
//
// Stores in log2L how many times L can be halved before its lowest bit is
// set, and returns what is left of L after those halvings.  For L == 0 both
// the stored count and the returned value are 0.
//   log2L - receives the number of factors of two
//   L     - value to factor
static inline int factorRadix2(int &log2L, int L)
{
    int remainder = L;
    int twos = 0;

    if (remainder != 0)
    {
        // Strip trailing zero bits, counting each one
        while ((remainder & 1) == 0)
        {
            remainder >>= 1;
            twos++;
        }
    }

    log2L = twos;
    return remainder;
}

// Exported entry point: bitonic sort of a batch of key/value arrays.
//
// Launches the task kernels above over batchSize * arrayLength / SIZE_LIMIT
// tasks and waits for each launch with 'sync' before issuing the next one.
// Arrays no longer than SIZE_LIMIT are sorted by a single bitonicSortLocal
// launch; longer arrays are first sorted in SIZE_LIMIT-sized pieces and then
// merged with bitonicMergeGlobal / bitonicMergeLocal.
//   dstKey, dstVal - output keys / values
//   srcKey, srcVal - input keys / values
//   batchSize      - number of arrays in the batch
//   arrayLength    - length of each array; asserted to be a power of two
//   dir            - direction flag forwarded to the kernels (with dir == 1
//                    Comparator exchanges a pair whose first key is greater)
// Returns without touching the destination arrays when arrayLength < 2.
export
void bitoniSort(
    uniform int dstKey[],
    uniform int dstVal[],
    uniform int srcKey[],
    uniform int srcVal[],
    const uniform int batchSize,
    const uniform int arrayLength,
    const uniform int dir)
{
    //Nothing to sort
    if (arrayLength < 2)
        return;

    //Only power-of-two array lengths are supported by this implementation
    int log2L;
    const int factorizationRemainder = factorRadix2(log2L, arrayLength);
    assert( factorizationRemainder == 1 );

    const uniform int blockCount = batchSize * arrayLength / SIZE_LIMIT;

    if (arrayLength <= SIZE_LIMIT)
    {
        //Every task processes a full SIZE_LIMIT chunk
        assert( (batchSize * arrayLength) % SIZE_LIMIT == 0 );
        launch [blockCount] bitonicSortLocal(dstKey, dstVal, srcKey, srcVal, arrayLength, dir);
        sync;
    }
    else
    {
        launch [blockCount] bitonicSortLocal1(dstKey, dstVal, srcKey, srcVal);
        sync;

        for (uniform int size = 2 * SIZE_LIMIT; size <= arrayLength; size <<= 1)
        {
            for (uniform int stride = size / 2; stride > 0; stride >>= 1)
            {
                if (stride >= SIZE_LIMIT)
                {
                    launch [blockCount] bitonicMergeGlobal(dstKey, dstVal, dstKey, dstVal, arrayLength, size, stride, dir);
                    sync;
                }
                else
                {
                    //bitonicMergeLocal performs every remaining step
                    //(stride = SIZE_LIMIT/2 .. 1) in one launch, so the
                    //stride loop is finished for this 'size'
                    launch [blockCount] bitonicMergeLocal(dstKey, dstVal, dstKey, dstVal, arrayLength, size, dir);
                    sync;
                    break;
                }
            }
        }
    }
}