first commit bitonicSort
This commit is contained in:
289
examples_ptx/bitonicSort/bitonicSort.ispc
Normal file
289
examples_ptx/bitonicSort/bitonicSort.ispc
Normal file
@@ -0,0 +1,289 @@
|
||||
// Number of key/value pairs held in each task's local l_key / l_val buffers:
// two elements per program instance of the gang.
#define SIZE_LIMIT (2*programCount)
|
||||
|
||||
// Compare-exchange of two key/value pairs.
//
// The pairs (keyA, valA) and (keyB, valB) are exchanged whenever the result of
// (keyA > keyB) equals 'dir':
//   dir == 1 : afterwards keyA <= keyB
//   dir == 0 : afterwards keyA >= keyB (pairs with equal keys are exchanged too)
// Values always travel together with their keys.
static inline void Comparator(
    int &keyA,
    int &valA,
    int &keyB,
    int &valB,
    const int dir)
{
    const bool exchange = ((keyA > keyB) == dir);

    if (exchange)
    {
        // Remember pair A, overwrite it with pair B, then restore into B.
        const int savedKey = keyA;
        const int savedVal = valA;

        keyA = keyB;
        valA = valB;

        keyB = savedKey;
        valB = savedVal;
    }
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Monolithic bitonic sort kernel for short arrays fitting into local memory
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Sorts key/value pairs inside task-local buffers.
//
// Each task copies SIZE_LIMIT consecutive elements, starting at
// taskIndex0 * SIZE_LIMIT, from srcKey/srcVal into l_key/l_val, runs a bitonic
// sorting network over them and writes the result to the same positions of
// dstKey/dstVal. Every program instance owns one comparator per step.
//
//   dstKey, dstVal : output keys / values
//   srcKey, srcVal : input keys / values
//   arrayLength    : length of each independently sorted sub-array; the caller
//                    in this file only launches this task with a power of two
//                    that is <= SIZE_LIMIT
//   dir            : direction flag handed to Comparator for the final merge
//                    (1: smaller key ends up at the lower index, 0: the opposite)
task
void bitonicSortLocal(
    uniform int dstKey[],
    uniform int dstVal[],
    uniform int srcKey[],
    uniform int srcVal[],
    const uniform int arrayLength,
    const uniform int dir)
{
    uniform int l_key[SIZE_LIMIT];
    uniform int l_val[SIZE_LIMIT];

    //Offset to the beginning of subbatch and load data
    const int offset = taskIndex0 * SIZE_LIMIT + programIndex;
    l_key[programIndex + 0]              = srcKey[offset];
    l_val[programIndex + 0]              = srcVal[offset];
    l_key[programIndex + (SIZE_LIMIT/2)] = srcKey[offset + (SIZE_LIMIT/2)];
    l_val[programIndex + (SIZE_LIMIT/2)] = srcVal[offset + (SIZE_LIMIT/2)];

    //Build bitonic sequences of growing length 'size'
    for (uniform int size = 2; size < arrayLength; size <<= 1)
    {
        //Neighbouring groups of size/2 comparators merge in opposite directions
        const int ddd = dir ^ ( (programIndex & (size / 2)) != 0 );

        for (uniform int stride = size / 2; stride > 0; stride >>= 1)
        {
            //Lower element of the pair handled by this program instance
            const int pos = 2 * programIndex - (programIndex & (stride - 1));

            int key_a = l_key[pos];
            int val_a = l_val[pos];
            int key_b = l_key[pos + stride];
            int val_b = l_val[pos + stride];

            Comparator(key_a, val_a, key_b, val_b, ddd);

            l_key[pos]          = key_a;
            l_val[pos]          = val_a;
            l_key[pos + stride] = key_b;
            l_val[pos + stride] = val_b;
        }
    }

    //ddd == dir for the last bitonic merge step.
    //The trip count depends only on the uniform arrayLength, so the counter is
    //declared uniform like the loops above.
    for (uniform int stride = arrayLength / 2; stride > 0; stride >>= 1)
    {
        const int pos = 2 * programIndex - (programIndex & (stride - 1));

        int key_a = l_key[pos];
        int val_a = l_val[pos];
        int key_b = l_key[pos + stride];
        int val_b = l_val[pos + stride];

        Comparator(key_a, val_a, key_b, val_b, dir);

        l_key[pos]          = key_a;
        l_val[pos]          = val_a;
        l_key[pos + stride] = key_b;
        l_val[pos + stride] = val_b;
    }

    //Write the sorted chunk back
    dstKey[offset]                  = l_key[programIndex + 0];
    dstVal[offset]                  = l_val[programIndex + 0];
    dstKey[offset + (SIZE_LIMIT/2)] = l_key[programIndex + (SIZE_LIMIT/2)];
    dstVal[offset + (SIZE_LIMIT/2)] = l_val[programIndex + (SIZE_LIMIT/2)];
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Bitonic sort kernel for large arrays (not fitting into local memory)
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//Bottom-level bitonic sort
|
||||
//Almost the same as bitonicSortLocal with the only exception
|
||||
//of even / odd subarrays (of SIZE_LIMIT elements) being
|
||||
//sorted in opposite directions
|
||||
// Bottom-level pass of the large-array sort.
//
// Each task copies SIZE_LIMIT consecutive elements, starting at
// taskIndex0 * SIZE_LIMIT, into task-local buffers, sorts them with a bitonic
// network and writes them to the same positions of dstKey/dstVal. The final
// merge uses (taskIndex0 & 1) as direction flag, so chunks handled by even and
// odd tasks end up sorted in opposite directions.
//
//   dstKey, dstVal : output keys / values
//   srcKey, srcVal : input keys / values
task
void bitonicSortLocal1(
    uniform int dstKey[],
    uniform int dstVal[],
    uniform int srcKey[],
    uniform int srcVal[])
{
    uniform int l_key[SIZE_LIMIT];
    uniform int l_val[SIZE_LIMIT];

    //Offset to the beginning of subarray and load data
    const int offset = taskIndex0 * SIZE_LIMIT + programIndex;
    l_key[programIndex + 0]              = srcKey[offset];
    l_val[programIndex + 0]              = srcVal[offset];
    l_key[programIndex + (SIZE_LIMIT/2)] = srcKey[offset + (SIZE_LIMIT/2)];
    l_val[programIndex + (SIZE_LIMIT/2)] = srcVal[offset + (SIZE_LIMIT/2)];

    //Loop bounds depend only on SIZE_LIMIT, so the counters are uniform
    for (uniform int size = 2; size < SIZE_LIMIT; size <<= 1)
    {
        //Bitonic merge: neighbouring groups of size/2 comparators alternate direction
        const int ddd = (programIndex & (size / 2)) != 0;

        for (uniform int stride = size / 2; stride > 0; stride >>= 1)
        {
            //Lower element of the pair handled by this program instance
            const int pos = 2 * programIndex - (programIndex & (stride - 1));

            int key_a = l_key[pos];
            int val_a = l_val[pos];
            int key_b = l_key[pos + stride];
            int val_b = l_val[pos + stride];

            Comparator(key_a, val_a, key_b, val_b, ddd);

            l_key[pos]          = key_a;
            l_val[pos]          = val_a;
            l_key[pos + stride] = key_b;
            l_val[pos + stride] = val_b;
        }
    }

    //Odd / even arrays of SIZE_LIMIT elements
    //sorted in opposite directions
    {
        //Identical for the whole gang: derived from the task index only
        const uniform int ddd = taskIndex0 & 1;

        for (uniform int stride = SIZE_LIMIT/2; stride > 0; stride >>= 1)
        {
            const int pos = 2 * programIndex - (programIndex & (stride - 1));

            int key_a = l_key[pos];
            int val_a = l_val[pos];
            int key_b = l_key[pos + stride];
            int val_b = l_val[pos + stride];

            Comparator(key_a, val_a, key_b, val_b, ddd);

            l_key[pos]          = key_a;
            l_val[pos]          = val_a;
            l_key[pos + stride] = key_b;
            l_val[pos + stride] = val_b;
        }
    }

    //Write the sorted chunk back
    dstKey[offset]                  = l_key[programIndex + 0];
    dstVal[offset]                  = l_val[programIndex + 0];
    dstKey[offset + (SIZE_LIMIT/2)] = l_key[programIndex + (SIZE_LIMIT/2)];
    dstVal[offset + (SIZE_LIMIT/2)] = l_val[programIndex + (SIZE_LIMIT/2)];
}
|
||||
|
||||
//Bitonic merge iteration for 'stride' >= SIZE_LIMIT
|
||||
// One compare-exchange step of a bitonic merge, performed directly on the
// srcKey/srcVal arrays and written to dstKey/dstVal.
//
// Every program instance of every task handles exactly one comparator; the
// comparator number is taskIndex0 * programCount + programIndex.
//
//   arrayLength : length of each independently sorted array (the index mask
//                 below assumes a power of two)
//   size        : length of the bitonic sequences being merged
//   stride      : distance between the two elements of a comparator
//   dir         : overall sort direction flag
task
void bitonicMergeGlobal(
    uniform int dstKey[],
    uniform int dstVal[],
    uniform int srcKey[],
    uniform int srcVal[],
    const uniform int arrayLength,
    const uniform int size,
    const uniform int stride,
    const uniform int dir)
{
    // Comparator number across the whole launch ...
    const int launchWideId = taskIndex0*programCount + programIndex;
    // ... and within its own array (arrayLength / 2 comparators per array).
    const int inArrayId = launchWideId & (arrayLength / 2 - 1);

    // The direction flips wherever bit (size / 2) of the in-array id is set.
    const int mergeDir = dir ^ ( (inArrayId & (size / 2)) != 0 );

    // Indices of the two elements this comparator works on.
    const int lowerIdx = 2 * launchWideId - (launchWideId & (stride - 1));
    const int upperIdx = lowerIdx + stride;

    int lowerKey = srcKey[lowerIdx];
    int lowerVal = srcVal[lowerIdx];
    int upperKey = srcKey[upperIdx];
    int upperVal = srcVal[upperIdx];

    Comparator(lowerKey, lowerVal, upperKey, upperVal, mergeDir);

    dstKey[lowerIdx] = lowerKey;
    dstVal[lowerIdx] = lowerVal;
    dstKey[upperIdx] = upperKey;
    dstVal[upperIdx] = upperVal;
}
|
||||
|
||||
//Combined bitonic merge steps for
|
||||
//'size' > SIZE_LIMIT and 'stride' = [1 .. SIZE_LIMIT / 2]
|
||||
// Runs all remaining bitonic merge steps with stride SIZE_LIMIT / 2 .. 1 for
// one SIZE_LIMIT-element chunk inside task-local buffers.
//
// Each task copies SIZE_LIMIT consecutive elements, starting at
// taskIndex0 * SIZE_LIMIT, from srcKey/srcVal, applies the merge steps and
// writes them to the same positions of dstKey/dstVal.
//
//   arrayLength : length of each independently sorted array (the index mask
//                 below assumes a power of two)
//   size        : length of the bitonic sequences being merged
//   dir         : overall sort direction flag
task
void bitonicMergeLocal(
    uniform int dstKey[],
    uniform int dstVal[],
    uniform int srcKey[],
    uniform int srcVal[],
    const uniform int arrayLength,
    const uniform int size,
    const uniform int dir)
{
    uniform int l_key[SIZE_LIMIT];
    uniform int l_val[SIZE_LIMIT];

    //Offset to the beginning of the chunk and load data
    const int offset = taskIndex0 * SIZE_LIMIT + programIndex;
    l_key[programIndex + 0]              = srcKey[offset];
    l_val[programIndex + 0]              = srcVal[offset];
    l_key[programIndex + (SIZE_LIMIT/2)] = srcKey[offset + (SIZE_LIMIT/2)];
    l_val[programIndex + (SIZE_LIMIT/2)] = srcVal[offset + (SIZE_LIMIT/2)];

    //Bitonic merge
    //Comparator number across the launch, then within its own array
    const int global_id   = taskIndex0*programCount + programIndex;
    const int comparatorI = global_id & ((arrayLength / 2) - 1);
    //Direction flips wherever bit (size / 2) of the in-array index is set
    const int ddd = dir ^ ( (comparatorI & (size / 2)) != 0 );

    //Trip count depends only on SIZE_LIMIT, so the counter is uniform
    for (uniform int stride = SIZE_LIMIT / 2; stride > 0; stride >>= 1)
    {
        //Lower element of the pair handled by this program instance
        const int pos = 2 * programIndex - (programIndex & (stride - 1));

        int key_a = l_key[pos];
        int val_a = l_val[pos];
        int key_b = l_key[pos + stride];
        int val_b = l_val[pos + stride];

        Comparator(key_a, val_a, key_b, val_b, ddd);

        l_key[pos]          = key_a;
        l_val[pos]          = val_a;
        l_key[pos + stride] = key_b;
        l_val[pos + stride] = val_b;
    }

    //Write the merged chunk back
    dstKey[offset]                  = l_key[programIndex + 0];
    dstVal[offset]                  = l_val[programIndex + 0];
    dstKey[offset + (SIZE_LIMIT/2)] = l_key[programIndex + (SIZE_LIMIT/2)];
    dstVal[offset + (SIZE_LIMIT/2)] = l_val[programIndex + (SIZE_LIMIT/2)];
}
|
||||
|
||||
// Splits L into a power of two and the remaining factor: L == remainder << log2L.
//
//   log2L  : receives the number of trailing zero bits of L (0 when L == 0)
//   L      : value to factor
//   return : L with all trailing zero bits shifted out; 0 when L == 0.
//            A return value of 1 therefore means L is a power of two.
static inline int factorRadix2(int &log2L, int L){
    int shifts = 0;
    int remainder = 0;

    if (L != 0) {
        // Strip factors of two until the lowest bit is set.
        while ((L & 1) == 0) {
            L >>= 1;
            shifts++;
        }
        remainder = L;
    }

    log2L = shifts;
    return remainder;
}
|
||||
|
||||
// Entry point: sorts 'batchSize' arrays of 'arrayLength' key/value pairs each,
// reading from srcKey/srcVal and writing to dstKey/dstVal.
//
//   dstKey, dstVal : output keys / values
//   srcKey, srcVal : input keys / values
//   batchSize      : number of arrays stored back to back
//   arrayLength    : elements per array; must be a power of two (asserted).
//                    When arrayLength < 2 the function returns immediately and
//                    does not write to dstKey/dstVal.
//   dir            : direction flag forwarded to the sort / merge tasks
//
// NOTE: the exported symbol is spelled 'bitoniSort'; the name is kept as is so
// that existing callers continue to link.
export
void bitoniSort(
    uniform int dstKey[],
    uniform int dstVal[],
    uniform int srcKey[],
    uniform int srcVal[],
    const uniform int batchSize,
    const uniform int arrayLength,
    const uniform int dir)
{
    //Nothing to sort
    if (arrayLength < 2)
        return;

    //Only power-of-two array lengths are supported by this implementation
    int log2L;
    const int factorizationRemainder = factorRadix2(log2L, arrayLength);
    assert( factorizationRemainder == 1 );

    //One task per SIZE_LIMIT elements (equivalently per programCount comparators)
    const uniform int blockCount = batchSize * arrayLength / SIZE_LIMIT;

    if (arrayLength <= SIZE_LIMIT)
    {
        //Whole arrays fit into the task-local buffers
        assert( (batchSize * arrayLength) % SIZE_LIMIT == 0 );
        launch [blockCount] bitonicSortLocal(dstKey, dstVal, srcKey, srcVal, arrayLength, dir);
        sync;
    }
    else
    {
        //Sort SIZE_LIMIT-sized chunks in alternating directions first
        launch [blockCount] bitonicSortLocal1(dstKey, dstVal, srcKey, srcVal);
        sync;

        //Then merge in place (dst is both input and output from here on)
        for (uniform int size = 2 * SIZE_LIMIT; size <= arrayLength; size <<= 1)
        {
            for (uniform int stride = size / 2; stride > 0; stride >>= 1)
            {
                if (stride >= SIZE_LIMIT)
                {
                    launch [blockCount] bitonicMergeGlobal(dstKey, dstVal, dstKey, dstVal, arrayLength, size, stride, dir);
                    sync;
                }
                else
                {
                    //bitonicMergeLocal itself iterates over every stride from
                    //SIZE_LIMIT / 2 down to 1, so one launch finishes this
                    //'size'; relaunching it for each smaller stride would only
                    //repeat work that is already done.
                    launch [blockCount] bitonicMergeLocal(dstKey, dstVal, dstKey, dstVal, arrayLength, size, dir);
                    sync;
                    break;
                }
            }
        }
    }
}
|
||||
Reference in New Issue
Block a user