diff --git a/examples_ptx/radixSort/radix.ispc b/examples_ptx/radixSort/radix.ispc new file mode 100644 index 00000000..1d438440 --- /dev/null +++ b/examples_ptx/radixSort/radix.ispc @@ -0,0 +1,265 @@ +//---------------------------------------------------------------------------- +// scan4 scans 4*RadixSort::CTA_SIZE numElements in a block (4 per thread), using +// a warp-scan algorithm +//---------------------------------------------------------------------------- +struct int4 { int x,y,z,w; }; +struct int2 { int x,y; }; + +static int4 scan4(int4 idata) +{ + int idx = programIndex; + + int4 val4 = idata; + int sum[3]; + sum[0] = val4.x; + sum[1] = val4.y + sum[0]; + sum[2] = val4.z + sum[1]; + + int val = val4.w + sum[2]; + + val = exclusive_scan_add(val); + + val4.x = val; + val4.y = val + sum[0]; + val4.z = val + sum[1]; + val4.w = val + sum[2]; + + return val4; +} + +static int4 rank4(int4 preds) +{ + int localId = programIndex; + uniform int localSize = programCount; + + int4 address = scan4(preds); + + const int numtrue = broadcast(address.w + preds.w, localSize - 1); + + int4 rank; + int idx = localId*4; + rank.x = (preds.x) ? address.x : numtrue + idx - address.x; + rank.y = (preds.y) ? address.y : numtrue + idx + 1 - address.y; + rank.z = (preds.z) ? address.z : numtrue + idx + 2 - address.z; + rank.w = (preds.w) ? address.w : numtrue + idx + 3 - address.w; + + return rank; +} + +static int4 radixSortBlockKeysOnly( + int4 key, + uniform int nbits, + uniform int startbit, + uniform int sMem[], + uniform int numtrue[]) +{ + int localId = programIndex; + uniform int localSize = programCount; + + for (uniform int shift = startbit; shift < (startbit + nbits); ++shift) + { + int4 lsb; + lsb.x = !(((key).x >> shift) & 0x1); + lsb.y = !(((key).y >> shift) & 0x1); + lsb.z = !(((key).z >> shift) & 0x1); + lsb.w = !(((key).w >> shift) & 0x1); + + int4 r; + + r = rank4(lsb); + + // This arithmetic strides the ranks across 4 CTA_SIZE regions + sMem[(r.x & 3) * localSize + (r.x >> 2)] = (key).x; + sMem[(r.y & 3) * localSize + (r.y >> 2)] = (key).y; + sMem[(r.z & 3) * localSize + (r.z >> 2)] = (key).z; + sMem[(r.w & 3) * localSize + (r.w >> 2)] = (key).w; + + // The above allows us to read without 4-way bank conflicts: + (key).x = sMem[localId]; + (key).y = sMem[localId + localSize]; + (key).z = sMem[localId + 2 * localSize]; + (key).w = sMem[localId + 3 * localSize]; + } + return key; +} + +task +void radixSortBlocksKeysOnly( + uniform int4 keysIn[], + uniform int4 keysOut[], + uniform int nbits, + uniform int startbit, + uniform int numElements, + uniform int totalBlocks, + uniform int sMem[]) +{ + int globalId = taskIndex*programCount + programIndex; + uniform int numtrue[1]; + + int4 key; + key = keysIn[globalId]; + + + key = radixSortBlockKeysOnly(key, nbits, startbit, sMem, numtrue); + + keysOut[globalId] = key; +} + +//---------------------------------------------------------------------------- +// Given an array with blocks sorted according to a 4-bit radix group, each +// block counts the number of keys that fall into each radix in the group, and +// finds the starting offset of each radix in the block. It then writes the radix +// counts to the counters array, and the starting offsets to the blockOffsets array. +// +// Template parameters are used to generate efficient code for various special cases +// For example, we have to handle arrays that are a multiple of the block size +// (fullBlocks) differently than arrays that are not. "loop" is used when persistent +// CTAs are used. +// +// By persistent CTAs we mean that we launch only as many thread blocks as can +// be resident in the GPU and no more, rather than launching as many threads as +// we have elements. Persistent CTAs loop over blocks of elements until all work +// is complete. This can be faster in some cases. In our tests it is faster +// for large sorts (and the threshold is higher on compute version 1.1 and earlier +// GPUs than it is on compute version 1.2 GPUs. +// +//---------------------------------------------------------------------------- +task +void findRadixOffsets( + uniform int2 keys[], + uniform int counters[], + uniform int blockOffsets[], + uniform int startbit, + uniform int numElements, + uniform int totalBlocks, + uniform int sRadix1[]) +{ + uniform int sStartPointers[16]; + + uniform int groupId = taskIndex; + uniform int groupSize = programCount; + + int localId = programIndex; + + int2 radix2; + + int globalId = taskIndex*programCount + programIndex; + radix2 = keys[globalId]; + + + sRadix1[2 * localId] = (radix2.x >> startbit) & 0xF; + sRadix1[2 * localId + 1] = (radix2.y >> startbit) & 0xF; + + // Finds the position where the sRadix1 entries differ and stores start + // index for each radix. + if(localId < 16) + sStartPointers[localId] = 0; + + if((localId > 0) && (sRadix1[localId] != sRadix1[localId - 1]) ) + sStartPointers[sRadix1[localId]] = localId; + if(sRadix1[localId + groupSize] != sRadix1[localId + groupSize - 1]) + sStartPointers[sRadix1[localId + groupSize]] = localId + groupSize; + + + if(localId < 16) + blockOffsets[groupId*16 + localId] = sStartPointers[localId]; + + // Compute the sizes of each block. + if((localId > 0) && (sRadix1[localId] != sRadix1[localId - 1]) ) + sStartPointers[sRadix1[localId - 1]] = + localId - sStartPointers[sRadix1[localId - 1]]; + + if(sRadix1[localId + groupSize] != sRadix1[localId + groupSize - 1] ) + sStartPointers[sRadix1[localId + groupSize - 1]] = + localId + groupSize - sStartPointers[sRadix1[localId + groupSize - 1]]; + + + if(localId == groupSize - 1) + sStartPointers[sRadix1[2 * groupSize - 1]] = + 2 * groupSize - sStartPointers[sRadix1[2 * groupSize - 1]]; + + if(localId < 16) + counters[localId * totalBlocks + groupId] = sStartPointers[localId]; +} + +// a naive scan routine that works only for array that +// can fit into a single block, just for debugging purpose, +// not used in the sort now +task +void scanNaive( + uniform int g_odata[], + uniform int g_idata[], + uniform int n, + uniform int temp[]) +{ + if (programIndex < n) + g_odata[programIndex] = exclusive_scan_add(g_idata[programIndex]); +} + +//---------------------------------------------------------------------------- +// reorderData shuffles data in the array globally after the radix offsets +// have been found. On compute version 1.1 and earlier GPUs, this code depends +// on RadixSort::CTA_SIZE being 16 * number of radices (i.e. 16 * 2^nbits). +// +// On compute version 1.1 GPUs ("manualCoalesce=true") this function ensures +// that all writes are coalesced using extra work in the kernel. On later +// GPUs coalescing rules have been relaxed, so this extra overhead hurts +// performance. On these GPUs we set manualCoalesce=false and directly store +// the results. +// +// Template parameters are used to generate efficient code for various special cases +// For example, we have to handle arrays that are a multiple of the block size +// (fullBlocks) differently than arrays that are not. "loop" is used when persistent +// CTAs are used. +// +// By persistent CTAs we mean that we launch only as many thread blocks as can +// be resident in the GPU and no more, rather than launching as many threads as +// we have elements. Persistent CTAs loop over blocks of elements until all work +// is complete. This can be faster in some cases. In our tests it is faster +// for large sorts (and the threshold is higher on compute version 1.1 and earlier +// GPUs than it is on compute version 1.2 GPUs. +//---------------------------------------------------------------------------- +task +void reorderDataKeysOnly( + uniform int outKeys[], + uniform int2 keys[], + uniform int blockOffsets[], + uniform int offsets[], + uniform int sizes[], + uniform int startbit, + uniform int numElements, + uniform int totalBlocks, + uniform int2 sKeys2[]) +{ + uniform int sOffsets[16]; + uniform int sBlockOffsets[16]; + + uniform int * uniform sKeys1 = (uniform int* uniform)sKeys2; + + uniform int groupId = taskIndex; + + uniform int groupSize = programCount; + + int localId = programIndex; + int globalId = taskIndex*programCount + programIndex; + + sKeys2[localId] = keys[globalId]; + + if(localId < 16) + { + sOffsets[localId] = offsets[localId * totalBlocks + groupId]; + sBlockOffsets[localId] = blockOffsets[groupId * 16 + localId]; + } + + int radix = (sKeys1[localId] >> startbit) & 0xF; + int globalOffset = sOffsets[radix] + localId - sBlockOffsets[radix]; + + if (globalOffset < numElements) + outKeys[globalOffset] = sKeys1[localId]; + + radix = (sKeys1[localId + groupSize] >> startbit) & 0xF; + globalOffset = sOffsets[radix] + localId + groupSize - sBlockOffsets[radix]; + + if (globalOffset < numElements) + outKeys[globalOffset] = sKeys1[localId + groupSize]; +}