Files
ispc/examples_ptx/bitonicSort/bitonicSort.ispc
2014-01-27 14:02:42 +01:00

290 lines
9.2 KiB
Plaintext

//Number of elements handled by one task: each program instance loads and
//stores two elements, so a task covers 2*programCount of them. Also the
//size of the per-task l_key / l_val scratch arrays in the *Local kernels.
#define SIZE_LIMIT (2*programCount)
//Compare-and-exchange step of the sorting network.
//Exchanges the (keyA, valA) and (keyB, valB) pairs in place when the result
//of 'keyA > keyB' equals 'dir'; otherwise leaves both pairs untouched.
//  dir == 1: exchange when keyA >  keyB (smaller key ends up in A)
//  dir == 0: exchange when keyA <= keyB (larger key ends up in A)
static inline void Comparator(
    int &keyA,
    int &valA,
    int &keyB,
    int &valB,
    const int dir)
{
    const bool exchange = ((keyA > keyB) == dir);
    if (exchange)
    {
        const int savedKey = keyA;
        const int savedVal = valA;
        keyA = keyB;
        valA = valB;
        keyB = savedKey;
        valB = savedVal;
    }
}
////////////////////////////////////////////////////////////////////////////////
// Monolithic bitonic sort kernel for short arrays fitting into local memory
////////////////////////////////////////////////////////////////////////////////
//Sorts the SIZE_LIMIT-element chunk of (srcKey, srcVal) owned by this task
//and writes the result to the same positions of (dstKey, dstVal).
//The chunk is processed as consecutive sub-arrays of 'arrayLength' elements,
//each of which ends up ordered according to 'dir'.
//  dstKey, dstVal - output keys / values
//  srcKey, srcVal - input keys / values
//  arrayLength    - elements per sub-array; the host wrapper in this file
//                   only launches this task with arrayLength <= SIZE_LIMIT
//  dir            - direction flag forwarded to Comparator (expected 0 or 1)
task
void bitonicSortLocal(
    uniform int dstKey[],
    uniform int dstVal[],
    uniform int srcKey[],
    uniform int srcVal[],
    const uniform int arrayLength,
    const uniform int dir)
{
    //Per-task scratch copy of the chunk being sorted
    uniform int l_key[SIZE_LIMIT];
    uniform int l_val[SIZE_LIMIT];

    //Offset to the beginning of subbatch and load data:
    //every program instance moves two elements that are SIZE_LIMIT/2 apart
    const int offset = taskIndex0 * SIZE_LIMIT + programIndex;
    l_key[programIndex + 0] = srcKey[offset];
    l_val[programIndex + 0] = srcVal[offset];
    l_key[programIndex + (SIZE_LIMIT/2)] = srcKey[offset + (SIZE_LIMIT/2)];
    l_val[programIndex + (SIZE_LIMIT/2)] = srcVal[offset + (SIZE_LIMIT/2)];

    for (uniform int size = 2; size < arrayLength; size <<= 1)
    {
        //Bitonic merge: neighbouring runs of 'size' elements are merged in
        //opposite directions so that together they form a bitonic sequence
        const int ddd = dir ^ ( (programIndex & (size / 2)) != 0 );
        for (uniform int stride = size / 2; stride > 0; stride >>= 1)
        {
            //Lower index of the pair this program instance compares
            const int pos = 2 * programIndex - (programIndex & (stride - 1));
            int key_a = l_key[pos];
            int val_a = l_val[pos];
            int key_b = l_key[pos + stride];
            int val_b = l_val[pos + stride];
            Comparator(key_a, val_a, key_b, val_b, ddd);
            l_key[pos] = key_a;
            l_val[pos] = val_a;
            l_key[pos + stride] = key_b;
            l_val[pos + stride] = val_b;
        }
    }

    //ddd == dir for the last bitonic merge step
    {
        //'stride' depends only on the uniform arrayLength, so it is kept
        //uniform like the loop counters above (no per-lane loop control)
        for (uniform int stride = arrayLength / 2; stride > 0; stride >>= 1)
        {
            const int pos = 2 * programIndex - (programIndex & (stride - 1));
            int key_a = l_key[pos];
            int val_a = l_val[pos];
            int key_b = l_key[pos + stride];
            int val_b = l_val[pos + stride];
            Comparator(key_a, val_a, key_b, val_b, dir);
            l_key[pos] = key_a;
            l_val[pos] = val_a;
            l_key[pos + stride] = key_b;
            l_val[pos + stride] = val_b;
        }
    }

    //Write the sorted chunk back using the same layout as the load
    dstKey[offset] = l_key[programIndex + 0];
    dstVal[offset] = l_val[programIndex + 0];
    dstKey[offset + (SIZE_LIMIT/2)] = l_key[programIndex + (SIZE_LIMIT/2)];
    dstVal[offset + (SIZE_LIMIT/2)] = l_val[programIndex + (SIZE_LIMIT/2)];
}
////////////////////////////////////////////////////////////////////////////////
// Bitonic sort kernel for large arrays (not fitting into local memory)
////////////////////////////////////////////////////////////////////////////////
//Bottom-level bitonic sort.
//Almost the same as bitonicSortLocal, except that the sub-array length is
//fixed to SIZE_LIMIT and even / odd tasks (taskIndex0 & 1) sort their
//SIZE_LIMIT-element chunk in opposite directions.
//  dstKey, dstVal - output keys / values
//  srcKey, srcVal - input keys / values
task
void bitonicSortLocal1(
    uniform int dstKey[],
    uniform int dstVal[],
    uniform int srcKey[],
    uniform int srcVal[])
{
    //Per-task scratch copy of the chunk being sorted
    uniform int l_key[SIZE_LIMIT];
    uniform int l_val[SIZE_LIMIT];

    //Offset to the beginning of subarray and load data:
    //every program instance moves two elements that are SIZE_LIMIT/2 apart
    const int offset = taskIndex0 * SIZE_LIMIT + programIndex;
    l_key[programIndex + 0] = srcKey[offset];
    l_val[programIndex + 0] = srcVal[offset];
    l_key[programIndex + (SIZE_LIMIT/2)] = srcKey[offset + (SIZE_LIMIT/2)];
    l_val[programIndex + (SIZE_LIMIT/2)] = srcVal[offset + (SIZE_LIMIT/2)];

    //Loop counters depend only on SIZE_LIMIT, so they are declared uniform
    //to keep the loop control identical for all program instances
    for (uniform int size = 2; size < SIZE_LIMIT; size <<= 1)
    {
        //Bitonic merge
        const int ddd = (programIndex & (size / 2)) != 0;
        for (uniform int stride = size / 2; stride > 0; stride >>= 1)
        {
            //Lower index of the pair this program instance compares
            const int pos = 2 * programIndex - (programIndex & (stride - 1));
            int key_a = l_key[pos];
            int val_a = l_val[pos];
            int key_b = l_key[pos + stride];
            int val_b = l_val[pos + stride];
            Comparator(key_a, val_a, key_b, val_b, ddd);
            l_key[pos] = key_a;
            l_val[pos] = val_a;
            l_key[pos + stride] = key_b;
            l_val[pos + stride] = val_b;
        }
    }

    //Odd / even arrays of SIZE_LIMIT elements
    //sorted in opposite directions
    {
        const int ddd = taskIndex0 & 1;
        for (uniform int stride = SIZE_LIMIT/2; stride > 0; stride >>= 1)
        {
            const int pos = 2 * programIndex - (programIndex & (stride - 1));
            int key_a = l_key[pos];
            int val_a = l_val[pos];
            int key_b = l_key[pos + stride];
            int val_b = l_val[pos + stride];
            Comparator(key_a, val_a, key_b, val_b, ddd);
            l_key[pos] = key_a;
            l_val[pos] = val_a;
            l_key[pos + stride] = key_b;
            l_val[pos + stride] = val_b;
        }
    }

    //Write the sorted chunk back using the same layout as the load
    dstKey[offset] = l_key[programIndex + 0];
    dstVal[offset] = l_val[programIndex + 0];
    dstKey[offset + (SIZE_LIMIT/2)] = l_key[programIndex + (SIZE_LIMIT/2)];
    dstVal[offset + (SIZE_LIMIT/2)] = l_val[programIndex + (SIZE_LIMIT/2)];
}
//Bitonic merge iteration for 'stride' >= SIZE_LIMIT.
//Each program instance performs exactly one compare-and-exchange directly
//on the src / dst arrays, without a per-task scratch copy.
//  dstKey, dstVal - output keys / values
//  srcKey, srcVal - input keys / values
//  arrayLength    - elements per array in the batch
//  size           - length of the bitonic sequences currently being merged
//  stride         - distance between the two elements of a compared pair
//  dir            - direction flag (expected 0 or 1)
task
void bitonicMergeGlobal(
    uniform int dstKey[],
    uniform int dstVal[],
    uniform int srcKey[],
    uniform int srcVal[],
    const uniform int arrayLength,
    const uniform int size,
    const uniform int stride,
    const uniform int dir)
{
    //Comparator number across all tasks, and within its own array
    const int batchComparator = taskIndex0*programCount + programIndex;
    const int arrayComparator = batchComparator & (arrayLength / 2 - 1);

    //Direction flips for every other run of 'size' elements
    const int ddd = dir ^ ( (arrayComparator & (size / 2)) != 0 );

    //Indices of the two elements compared by this program instance
    const int lower = 2 * batchComparator - (batchComparator & (stride - 1));
    const int upper = lower + stride;

    //All loads happen before any store
    int lowerKey = srcKey[lower];
    int lowerVal = srcVal[lower];
    int upperKey = srcKey[upper];
    int upperVal = srcVal[upper];

    Comparator(lowerKey, lowerVal, upperKey, upperVal, ddd);

    dstKey[lower] = lowerKey;
    dstVal[lower] = lowerVal;
    dstKey[upper] = upperKey;
    dstVal[upper] = upperVal;
}
//Combined bitonic merge steps for
//'size' > SIZE_LIMIT and 'stride' = [1 .. SIZE_LIMIT / 2].
//Loads the task's SIZE_LIMIT-element chunk into scratch arrays, runs every
//remaining stride of the merge on it and writes the chunk back.
//  dstKey, dstVal - output keys / values
//  srcKey, srcVal - input keys / values
//  arrayLength    - elements per array in the batch
//  size           - length of the bitonic sequences currently being merged
//  dir            - direction flag (expected 0 or 1)
task
void bitonicMergeLocal(
    uniform int dstKey[],
    uniform int dstVal[],
    uniform int srcKey[],
    uniform int srcVal[],
    const uniform int arrayLength,
    const uniform int size,
    const uniform int dir)
{
    //Per-task scratch copy of the chunk being merged
    uniform int l_key[SIZE_LIMIT];
    uniform int l_val[SIZE_LIMIT];

    //Every program instance moves two elements that are SIZE_LIMIT/2 apart
    const int offset = taskIndex0 * SIZE_LIMIT + programIndex;
    l_key[programIndex + 0] = srcKey[offset];
    l_val[programIndex + 0] = srcVal[offset];
    l_key[programIndex + (SIZE_LIMIT/2)] = srcKey[offset + (SIZE_LIMIT/2)];
    l_val[programIndex + (SIZE_LIMIT/2)] = srcVal[offset + (SIZE_LIMIT/2)];

    //Bitonic merge: the direction is fixed for all strides of this call
    const int global_id = taskIndex0*programCount + programIndex;
    const int comparatorI = global_id & ((arrayLength / 2) - 1);
    const int ddd = dir ^ ( (comparatorI & (size / 2)) != 0 );

    //'stride' depends only on SIZE_LIMIT, so it is declared uniform to keep
    //the loop control identical for all program instances
    for (uniform int stride = SIZE_LIMIT / 2; stride > 0; stride >>= 1)
    {
        //Lower index of the pair this program instance compares
        const int pos = 2 * programIndex - (programIndex & (stride - 1));
        int key_a = l_key[pos];
        int val_a = l_val[pos];
        int key_b = l_key[pos + stride];
        int val_b = l_val[pos + stride];
        Comparator(key_a, val_a, key_b, val_b, ddd);
        l_key[pos] = key_a;
        l_val[pos] = val_a;
        l_key[pos + stride] = key_b;
        l_val[pos + stride] = val_b;
    }

    //Write the merged chunk back using the same layout as the load
    dstKey[offset] = l_key[programIndex + 0];
    dstVal[offset] = l_val[programIndex + 0];
    dstKey[offset + (SIZE_LIMIT/2)] = l_key[programIndex + (SIZE_LIMIT/2)];
    dstVal[offset + (SIZE_LIMIT/2)] = l_val[programIndex + (SIZE_LIMIT/2)];
}
//Splits L into a power of two and an odd remainder: L == remainder << log2L.
//  log2L - receives the number of trailing zero bits of L (0 when L == 0)
//  L     - value to factor
//Returns the odd remainder, or 0 when L == 0. A return value of 1 therefore
//means that L is an exact power of two.
static inline int factorRadix2(int &log2L, int L)
{
    int shifts = 0;

    //L == 0 has no lowest set bit; report 0 / 0 instead of looping forever
    if (L != 0)
    {
        while ((L & 1) == 0)
        {
            L >>= 1;
            shifts++;
        }
    }

    log2L = shifts;
    return L;
}
//Exported entry point: bitonic sort of a batch of key/value arrays.
//  dstKey, dstVal - output keys / values
//  srcKey, srcVal - input keys / values
//  batchSize      - number of arrays; batchSize * arrayLength elements are
//                   processed in total
//  arrayLength    - elements per array; must be a power of two (asserted)
//  dir            - non-zero: ascending keys, zero: descending keys
//Returns without launching anything when arrayLength < 2.
export
void bitoniSort(
    uniform int dstKey[],
    uniform int dstVal[],
    uniform int srcKey[],
    uniform int srcVal[],
    const uniform int batchSize,
    const uniform int arrayLength,
    const uniform int dir)
{
    //Nothing to sort
    if (arrayLength < 2)
        return;

    //Only power-of-two array lengths are supported by this implementation
    int log2L;
    const int factorizationRemainder = factorRadix2(log2L, arrayLength);
    assert( factorizationRemainder == 1 );

    //Comparator matches the direction against a 0/1 comparison result, so
    //collapse every non-zero 'dir' to 1; otherwise e.g. dir == 2 would never
    //trigger a swap and the data would come back unsorted
    const uniform int sortDir = (dir != 0) ? 1 : 0;

    //One task per SIZE_LIMIT elements
    const uniform int blockCount = batchSize * arrayLength / SIZE_LIMIT;

    if (arrayLength <= SIZE_LIMIT)
    {
        //Short arrays: a single kernel sorts every array completely
        assert( (batchSize * arrayLength) % SIZE_LIMIT == 0 );
        launch [blockCount] bitonicSortLocal(dstKey, dstVal, srcKey, srcVal, arrayLength, sortDir);
        sync;
    }
    else
    {
        //Sort SIZE_LIMIT-element chunks in alternating directions first ...
        launch [blockCount] bitonicSortLocal1(dstKey, dstVal, srcKey, srcVal);
        sync;

        //... then merge them in place (dst is both source and destination)
        for (uniform int size = 2 * SIZE_LIMIT; size <= arrayLength; size <<= 1)
        {
            for (uniform int stride = size / 2; stride > 0; stride >>= 1)
            {
                if (stride >= SIZE_LIMIT)
                {
                    launch [blockCount] bitonicMergeGlobal(dstKey, dstVal, dstKey, dstVal, arrayLength, size, stride, sortDir);
                    sync;
                }
                else
                {
                    //bitonicMergeLocal runs all strides from SIZE_LIMIT/2
                    //down to 1 by itself, so this 'size' is finished;
                    //continuing the stride loop would only relaunch it
                    launch [blockCount] bitonicMergeLocal(dstKey, dstVal, dstKey, dstVal, arrayLength, size, sortDir);
                    sync;
                    break;
                }
            }
        }
    }
}