first commit alternative radix

2014-01-28 15:39:27 +01:00
parent f343e4cb0e
commit 585afa09e5
1 changed files with 265 additions and 0 deletions
--- a/examples_ptx/radixSort/radix.ispc
+++ b/examples_ptx/radixSort/radix.ispc
@@ -0,0 +1,265 @@
+//----------------------------------------------------------------------------
+// scan4 scans 4*RadixSort::CTA_SIZE numElements in a block (4 per thread), using 
+// a warp-scan algorithm
+//----------------------------------------------------------------------------
+struct int4 { int x,y,z,w; };
+struct int2 { int x,y; };
+
+static int4 scan4(int4 idata)
+{    
+  int idx = programIndex;
+
+  int4 val4 = idata;
+  int sum[3];
+  sum[0] = val4.x;
+  sum[1] = val4.y + sum[0];
+  sum[2] = val4.z + sum[1];
+
+  int val = val4.w + sum[2];
+
+  val = exclusive_scan_add(val);
+
+  val4.x = val;
+  val4.y = val + sum[0];
+  val4.z = val + sum[1];
+  val4.w = val + sum[2];
+
+  return val4;
+}
+
+static int4 rank4(int4 preds)
+{
+	int localId = programIndex;
+  uniform int localSize = programCount;
+
+	int4 address = scan4(preds);
+
+  const int numtrue = broadcast(address.w + preds.w, localSize - 1);
+	
+	int4 rank;
+	int idx = localId*4;
+	rank.x = (preds.x) ? address.x : numtrue + idx - address.x;
+	rank.y = (preds.y) ? address.y : numtrue + idx + 1 - address.y;
+	rank.z = (preds.z) ? address.z : numtrue + idx + 2 - address.z;
+	rank.w = (preds.w) ? address.w : numtrue + idx + 3 - address.w;
+	
+	return rank;
+}
+
+static int4 radixSortBlockKeysOnly(
+    int4 key,
+    uniform int nbits, 
+    uniform int startbit, 
+    uniform int sMem[], 
+    uniform int numtrue[])
+{
+  int localId = programIndex;
+  uniform int localSize = programCount;
+	
+	for (uniform int shift = startbit; shift < (startbit + nbits); ++shift)
+  {
+    int4 lsb;
+    lsb.x = !(((key).x >> shift) & 0x1);
+    lsb.y = !(((key).y >> shift) & 0x1);
+    lsb.z = !(((key).z >> shift) & 0x1);
+    lsb.w = !(((key).w >> shift) & 0x1);
+
+    int4 r;
+
+    r = rank4(lsb);
+
+    // This arithmetic strides the ranks across 4 CTA_SIZE regions
+    sMem[(r.x & 3) * localSize + (r.x >> 2)] = (key).x;
+    sMem[(r.y & 3) * localSize + (r.y >> 2)] = (key).y;
+    sMem[(r.z & 3) * localSize + (r.z >> 2)] = (key).z;
+    sMem[(r.w & 3) * localSize + (r.w >> 2)] = (key).w;
+
+    // The above allows us to read without 4-way bank conflicts:
+    (key).x = sMem[localId];
+    (key).y = sMem[localId +     localSize];
+    (key).z = sMem[localId + 2 * localSize];
+    (key).w = sMem[localId + 3 * localSize];
+  }
+  return key;
+}
+
+task 
+void radixSortBlocksKeysOnly(
+    uniform int4 keysIn[],
+    uniform int4 keysOut[],
+    uniform int nbits,
+    uniform int startbit,
+    uniform int numElements, 
+    uniform int totalBlocks,
+    uniform int sMem[])
+{
+  int globalId  = taskIndex*programCount + programIndex;
+  uniform int numtrue[1];
+
+  int4 key;
+  key = keysIn[globalId];
+
+
+  key = radixSortBlockKeysOnly(key, nbits, startbit, sMem, numtrue);
+
+  keysOut[globalId] = key;
+}
+
+//----------------------------------------------------------------------------
+// Given an array with blocks sorted according to a 4-bit radix group, each 
+// block counts the number of keys that fall into each radix in the group, and 
+// finds the starting offset of each radix in the block.  It then writes the radix 
+// counts to the counters array, and the starting offsets to the blockOffsets array.
+//
+// Template parameters are used to generate efficient code for various special cases
+// For example, we have to handle arrays that are a multiple of the block size 
+// (fullBlocks) differently than arrays that are not. "loop" is used when persistent 
+// CTAs are used. 
+//
+// By persistent CTAs we mean that we launch only as many thread blocks as can 
+// be resident in the GPU and no more, rather than launching as many threads as
+// we have elements. Persistent CTAs loop over blocks of elements until all work
+// is complete.  This can be faster in some cases.  In our tests it is faster
+// for large sorts (and the threshold is higher on compute version 1.1 and earlier
+// GPUs than it is on compute version 1.2 GPUs.
+//                                
+//----------------------------------------------------------------------------
+task
+void findRadixOffsets(
+    uniform int2 keys[],
+    uniform int counters[],
+    uniform int blockOffsets[],
+    uniform int startbit,
+    uniform int numElements,
+    uniform int totalBlocks,
+    uniform int sRadix1[])
+{
+  uniform int  sStartPointers[16];
+
+  uniform int groupId   = taskIndex;
+  uniform int groupSize = programCount;
+
+  int localId = programIndex;
+
+  int2 radix2;
+
+  int globalId  = taskIndex*programCount + programIndex;
+  radix2 = keys[globalId];
+
+
+  sRadix1[2 * localId]     = (radix2.x >> startbit) & 0xF;
+  sRadix1[2 * localId + 1] = (radix2.y >> startbit) & 0xF;
+
+  // Finds the position where the sRadix1 entries differ and stores start 
+  // index for each radix.
+  if(localId < 16) 
+    sStartPointers[localId] = 0; 
+
+  if((localId > 0) && (sRadix1[localId] != sRadix1[localId - 1]) ) 
+    sStartPointers[sRadix1[localId]] = localId;
+  if(sRadix1[localId + groupSize] != sRadix1[localId + groupSize - 1]) 
+    sStartPointers[sRadix1[localId + groupSize]] = localId + groupSize;
+
+
+  if(localId < 16) 
+    blockOffsets[groupId*16 + localId] = sStartPointers[localId];
+
+  // Compute the sizes of each block.
+  if((localId > 0) && (sRadix1[localId] != sRadix1[localId - 1]) ) 
+    sStartPointers[sRadix1[localId - 1]] = 
+      localId - sStartPointers[sRadix1[localId - 1]];
+
+  if(sRadix1[localId + groupSize] != sRadix1[localId + groupSize - 1] ) 
+    sStartPointers[sRadix1[localId + groupSize - 1]] = 
+      localId + groupSize - sStartPointers[sRadix1[localId + groupSize - 1]];
+
+
+  if(localId == groupSize - 1) 
+    sStartPointers[sRadix1[2 * groupSize - 1]] = 
+      2 * groupSize - sStartPointers[sRadix1[2 * groupSize - 1]];
+
+  if(localId < 16) 
+    counters[localId * totalBlocks + groupId] = sStartPointers[localId];
+}
+
+// a naive scan routine that works only for array that
+// can fit into a single block, just for debugging purpose,
+// not used in the sort now
+task
+void scanNaive(
+    uniform int g_odata[],
+    uniform int g_idata[],
+    uniform int n,
+    uniform int temp[])
+{
+  if (programIndex < n)
+    g_odata[programIndex] = exclusive_scan_add(g_idata[programIndex]);
+}
+
+//----------------------------------------------------------------------------
+// reorderData shuffles data in the array globally after the radix offsets 
+// have been found. On compute version 1.1 and earlier GPUs, this code depends 
+// on RadixSort::CTA_SIZE being 16 * number of radices (i.e. 16 * 2^nbits).
+// 
+// On compute version 1.1 GPUs ("manualCoalesce=true") this function ensures
+// that all writes are coalesced using extra work in the kernel.  On later
+// GPUs coalescing rules have been relaxed, so this extra overhead hurts 
+// performance.  On these GPUs we set manualCoalesce=false and directly store
+// the results.
+//
+// Template parameters are used to generate efficient code for various special cases
+// For example, we have to handle arrays that are a multiple of the block size 
+// (fullBlocks) differently than arrays that are not.  "loop" is used when persistent 
+// CTAs are used. 
+//
+// By persistent CTAs we mean that we launch only as many thread blocks as can 
+// be resident in the GPU and no more, rather than launching as many threads as
+// we have elements. Persistent CTAs loop over blocks of elements until all work
+// is complete.  This can be faster in some cases.  In our tests it is faster
+// for large sorts (and the threshold is higher on compute version 1.1 and earlier
+// GPUs than it is on compute version 1.2 GPUs.
+//----------------------------------------------------------------------------
+task 
+void reorderDataKeysOnly(
+    uniform int  outKeys[],
+    uniform int2 keys[],
+    uniform int  blockOffsets[],
+    uniform int  offsets[],
+    uniform int  sizes[],
+    uniform int startbit,
+    uniform int numElements,
+    uniform int totalBlocks,
+    uniform int2 sKeys2[])
+{
+  uniform int sOffsets[16];
+  uniform int sBlockOffsets[16];
+
+  uniform int * uniform sKeys1 = (uniform int* uniform)sKeys2; 
+
+  uniform int groupId = taskIndex;
+
+  uniform int groupSize = programCount;
+  
+  int localId   = programIndex;
+  int globalId  = taskIndex*programCount + programIndex;
+
+  sKeys2[localId]   = keys[globalId];
+
+  if(localId < 16)  
+  {
+    sOffsets[localId]      = offsets[localId * totalBlocks + groupId];
+    sBlockOffsets[localId] = blockOffsets[groupId * 16 + localId];
+  }
+
+  int radix = (sKeys1[localId] >> startbit) & 0xF;
+  int globalOffset = sOffsets[radix] + localId - sBlockOffsets[radix];
+
+  if (globalOffset < numElements)
+    outKeys[globalOffset] = sKeys1[localId];
+
+  radix = (sKeys1[localId + groupSize] >> startbit) & 0xF;
+  globalOffset = sOffsets[radix] + localId + groupSize - sBlockOffsets[radix];
+
+  if (globalOffset < numElements)
+    outKeys[globalOffset]   = sKeys1[localId + groupSize];
+}