diff --git a/examples_ptx/radixSort/radixSort.ispc b/examples_ptx/radixSort/radixSort.ispc new file mode 100644 index 00000000..1d3f7248 --- /dev/null +++ b/examples_ptx/radixSort/radixSort.ispc @@ -0,0 +1,255 @@ +#if 1 +struct int2 { int x,y; }; +struct int4 { int x,y,z,w; }; +#else +typedef int<2> int2; +typedef int<4> int4; +#endif + +static int4 scan4(const int4 idata) +{ + const int idx = programIndex; + + int4 val4 = idata; + int sum[3]; + sum[0] = val4.x; + sum[1] = val4.y + sum[0]; + sum[2] = val4.z + sum[1]; + + int val = val4.w + sum[2]; + val = exclusive_scan_add(val); + + val4.x = val; + val4.y = val + sum[0]; + val4.z = val + sum[1]; + val4.w = val + sum[2]; + + return val4; +} + +static int4 rank4(int4 preds) +{ + const int localId = programIndex; + const uniform int localSize = programCount; + + const int4 address = scan4(preds); + + const int numtrue = broadcast(address.w + preds.w, localSize-1); + + int4 rank; + const int idx = localId*4; + rank.x = (preds.x) ? address.x : numtrue + idx - address.x; + rank.y = (preds.y) ? address.y : numtrue + idx + 1 - address.y; + rank.z = (preds.z) ? address.z : numtrue + idx + 2 - address.z; + rank.w = (preds.w) ? address.w : numtrue + idx + 3 - address.w; + + return rank; +} + +static void radixSortBlockKeysOnly(int4 &key_inout, uniform int nbits, uniform int startbit) +{ + const int localId = programIndex; + const uniform int localSize = programCount; + + uniform int sMem[programCount*4]; + + int4 key = key_inout; + for (uniform int shift = startbit; shift < (startbit + nbits); ++shift) + { + int4 lsb; + lsb.x = !((key.x >> shift) & 0x1); + lsb.y = !((key.y >> shift) & 0x1); + lsb.z = !((key.z >> shift) & 0x1); + lsb.w = !((key.w >> shift) & 0x1); + + const int4 r = rank4(lsb); + + // This arithmetic strides the ranks across 4 CTA_SIZE regions + sMem[(r.x & 3) * localSize + (r.x >> 2)] = key.x; + sMem[(r.y & 3) * localSize + (r.y >> 2)] = key.y; + sMem[(r.z & 3) * localSize + (r.z >> 2)] = key.z; + sMem[(r.w & 3) * localSize + (r.w >> 2)] = key.w; + + // The above allows us to read without 4-way bank conflicts: + key.x = sMem[localId ]; + key.y = sMem[localId + localSize]; + key.z = sMem[localId + 2 * localSize]; + key.w = sMem[localId + 3 * localSize]; + } +} + +task void radixSortBlocksKeysOnly( + uniform int keysIn[], + uniform int keysOut[], + uniform int nbits, + uniform int startbit, + uniform int numElements, + uniform int totalBlocks) +{ + const int globalId = taskIndex * programCount + programIndex; + + int4 key; + key.x = keysIn[4*globalId + 0]; + key.y = keysIn[4*globalId + 1]; + key.z = keysIn[4*globalId + 2]; + key.w = keysIn[4*globalId + 3]; + + radixSortBlockKeysOnly(key, nbits, startbit); + + keysOut[4*globalId+0] = key.x; + keysOut[4*globalId+1] = key.y; + keysOut[4*globalId+2] = key.z; + keysOut[4*globalId+3] = key.w; +} + +//---------------------------------------------------------------------------- +// Given an array with blocks sorted according to a 4-bit radix group, each +// block counts the number of keys that fall into each radix in the group, and +// finds the starting offset of each radix in the block. It then writes the radix +// counts to the counters array, and the starting offsets to the blockOffsets array. +// +// Template parameters are used to generate efficient code for various special cases +// For example, we have to handle arrays that are a multiple of the block size +// (fullBlocks) differently than arrays that are not. "loop" is used when persistent +// CTAs are used. +// +// By persistent CTAs we mean that we launch only as many thread blocks as can +// be resident in the GPU and no more, rather than launching as many threads as +// we have elements. Persistent CTAs loop over blocks of elements until all work +// is complete. This can be faster in some cases. In our tests it is faster +// for large sorts (and the threshold is higher on compute version 1.1 and earlier +// GPUs than it is on compute version 1.2 GPUs. +// +//---------------------------------------------------------------------------- +task void findRadixOffsets( + uniform int keys[], + uniform int counters[], + uniform int blockOffsets[], + uniform int startbit, + uniform int numElements, + uniform int totalBlocks) +{ + uniform int sStartPointers[16]; + + const uniform int groupId = taskIndex; + const int localId = programIndex; + const uniform int groupSize = programCount; + const int globalId = taskIndex * programCount + programIndex; + + int2 radix2; + + radix2.x = keys[2*globalId + 0]; + radix2.y = keys[2*globalId + 1]; + + uniform int sRadix1[4*programCount]; + + sRadix1[2 * localId] = (radix2.x >> startbit) & 0xF; + sRadix1[2 * localId + 1] = (radix2.y >> startbit) & 0xF; + + // Finds the position where the sRadix1 entries differ and stores start + // index for each radix. + if(localId < 16) + sStartPointers[localId] = 0; + + if((localId > 0) && (sRadix1[localId] != sRadix1[localId - 1]) ) + sStartPointers[sRadix1[localId]] = localId; + + if(sRadix1[localId + groupSize] != sRadix1[localId + groupSize - 1]) + sStartPointers[sRadix1[localId + groupSize]] = localId + groupSize; + + if(localId < 16) + blockOffsets[groupId*16 + localId] = sStartPointers[localId]; + + // Compute the sizes of each block. + if((localId > 0) && (sRadix1[localId] != sRadix1[localId - 1]) ) + sStartPointers[sRadix1[localId - 1]] = + localId - sStartPointers[sRadix1[localId - 1]]; + if(sRadix1[localId + groupSize] != sRadix1[localId + groupSize - 1] ) + sStartPointers[sRadix1[localId + groupSize - 1]] = + localId + groupSize - sStartPointers[sRadix1[localId + groupSize - 1]]; + + + if(localId == groupSize - 1) + sStartPointers[sRadix1[2 * groupSize - 1]] = + 2 * groupSize - sStartPointers[sRadix1[2 * groupSize - 1]]; + + if(localId < 16) + counters[localId * totalBlocks + groupId] = sStartPointers[localId]; +} + +// a naive scan routine that works only for array that +// can fit into a single block, just for debugging purpose, +// not used in the sort now +task void scanNaive( + uniform int odata[], + uniform int idata[], + uniform int n) +{ + if (programIndex < n) + odata[programIndex] = exclusive_scan_add(idata[programIndex]); +} + +//---------------------------------------------------------------------------- +// reorderData shuffles data in the array globally after the radix offsets +// have been found. On compute version 1.1 and earlier GPUs, this code depends +// on RadixSort::CTA_SIZE being 16 * number of radices (i.e. 16 * 2^nbits). +// +// On compute version 1.1 GPUs ("manualCoalesce=true") this function ensures +// that all writes are coalesced using extra work in the kernel. On later +// GPUs coalescing rules have been relaxed, so this extra overhead hurts +// performance. On these GPUs we set manualCoalesce=false and directly store +// the results. +// +// Template parameters are used to generate efficient code for various special cases +// For example, we have to handle arrays that are a multiple of the block size +// (fullBlocks) differently than arrays that are not. "loop" is used when persistent +// CTAs are used. +// +// By persistent CTAs we mean that we launch only as many thread blocks as can +// be resident in the GPU and no more, rather than launching as many threads as +// we have elements. Persistent CTAs loop over blocks of elements until all work +// is complete. This can be faster in some cases. In our tests it is faster +// for large sorts (and the threshold is higher on compute version 1.1 and earlier +// GPUs than it is on compute version 1.2 GPUs. +//---------------------------------------------------------------------------- +task void reorderDataKeysOnly( + uniform int outKeys[], + uniform int keys[], + uniform int blockOffsets[], + uniform int offsets[], + uniform int sizes[], + uniform int startbit, + uniform int numElements, + uniform int totalBlocks) +{ + uniform int sOffsets[16]; + uniform int sBlockOffsets[16]; + uniform int2 sKeys2[programCount]; + uniform int * uniform sKeys1 = (uniform int * uniform)&sKeys2[0]; + + const uniform int groupId = taskIndex; + const int globalId = taskIndex*programCount + programIndex; + const int localId = programIndex; + const uniform int groupSize = programCount; + + sKeys2[localId].x = keys[2*globalId + 0]; + sKeys2[localId].y = keys[2*globalId + 1]; + + if(localId < 16) + { + sOffsets[localId] = offsets[localId * totalBlocks + groupId]; + sBlockOffsets[localId] = blockOffsets[groupId * 16 + localId]; + } + + int radix = (sKeys1[localId] >> startbit) & 0xF; + int globalOffset = sOffsets[radix] + localId - sBlockOffsets[radix]; + + if (globalOffset < numElements) + outKeys[globalOffset] = sKeys1[localId]; + + radix = (sKeys1[localId + groupSize] >> startbit) & 0xF; + globalOffset = sOffsets[radix] + localId + groupSize - sBlockOffsets[radix]; + + if (globalOffset < numElements) + outKeys[globalOffset] = sKeys1[localId + groupSize]; +}