partiall workin cuda version

2014-01-29 13:20:41 +01:00
parent 2f5eb9f6d3
commit 97253354ac
2 changed files with 31 additions and 40 deletions
--- a/examples_ptx/radixSort/radixSort.cu
+++ b/examples_ptx/radixSort/radixSort.cu
@@ -6,7 +6,7 @@

 typedef long long Key;

-__device__ int atomic_add_global(int* ptr, int value)
+__forceinline__ __device__ int atomic_add_global(int* ptr, int value)
 {
  return atomicAdd(ptr, value);
 } 
@@ -24,13 +24,13 @@ static __device__ __forceinline__ int shfl_scan_add_step(int partial, int up_off
  return result;
 }

-__device__ int exclusive_scan_add(int value)
+__forceinline__ __device__ int exclusive_scan_add(int value)
 {
  int mysum = value;
 #pragma unroll
  for(int i = 0; i < 5; ++i)
    mysum = shfl_scan_add_step(mysum, 1 << i);
-  return mysum;
+  return mysum - value;
 }

 __global__
@@ -53,6 +53,7 @@ void countPass(
  int *      counts = countsAll + blkIdx*NUMDIGITS;
  const  int           nloc = min(numElements - blkIdx*blkDim, blkDim);

+#pragma unroll 8
  for (int digit = programIndex; digit < NUMDIGITS; digit += programCount)
    counts[digit] = 0;

@@ -64,6 +65,7 @@ void countPass(
      atomic_add_global(&counts[key], 1);
    }

+#pragma unroll 8
  for (int digit = programIndex; digit < NUMDIGITS; digit += programCount)
    atomic_add_global(&countsGlobal[digit], counts[digit]);
 }
@@ -90,12 +92,15 @@ void sortPass(

  const  int mask = (1 << NUMBITS) - 1;

-  const int unitScan     = exclusive_scan_add(1);
-
  /* copy digit offset from Gmem to Lmem */ 
-  __shared__ int digitOffsets[NUMDIGITS];
+#if 1
+  __shared__ int digitOffsets_sh[NUMDIGITS*4];
+  int *digitOffsets = digitOffsets_sh + warpIdx*NUMDIGITS;
  for (int digit = programIndex; digit < NUMDIGITS; digit += programCount)
    digitOffsets[digit] = digitOffsetsAll[blkIdx*NUMDIGITS + digit];
+#else
+  int *digitOffsets = &digitOffsetsAll[blkIdx*NUMDIGITS];
+#endif


  for (int i = programIndex; i < nloc; i += programCount)
@@ -104,8 +109,8 @@ void sortPass(
      const int key = mask & ((unsigned int)keys[i] >> bit);
      int scatter;
      /* not a vector friendly loop */
-      for (int lane = 0; lane < programCount; lane++)
-        if (programIndex == lane)
+      for (int iv = 0; iv < programCount; iv++)
+        if (programIndex == iv)
          scatter = digitOffsets[key]++;
      sorted [scatter] = keys[i];
    }
@@ -128,6 +133,7 @@ void partialScanLocal(
  int (*  excScanBlock)[NUMDIGITS] = ( int (*)[NUMDIGITS])excScanAll;
  int (*    partialSum)[NUMDIGITS] = ( int (*)[NUMDIGITS])partialSumAll;

+#pragma unroll 8
  for (int digit = programIndex; digit < NUMDIGITS; digit += programCount)
  {
    int prev = bbeg == 0 ? excScanBlock[0][digit] : 0;
@@ -152,13 +158,13 @@ void partialScanGlobal(
  const  int digit = taskIndex;
  int carry = 0;
  for (int block = programIndex;  block < numBlocks; block += programCount)
+  {
+    const int value = partialSum[block][digit];
+    const int scan  = exclusive_scan_add(value);
    if (block < numBlocks)
-    {
-      const int value = partialSum[block][digit];
-      const int scan  = exclusive_scan_add(value);
      prefixSum[block][digit] = scan + carry;
-      carry += __shfl(scan+value, programCount-1);
-    }
+    carry += __shfl(scan+value, programCount-1);
+  }
 }

 __global__
@@ -175,6 +181,7 @@ void completeScanGlobal(
  int (*  excScanBlock)[NUMDIGITS] = ( int (*)[NUMDIGITS])excScanAll;
  int (*    carryValue)[NUMDIGITS] = ( int (*)[NUMDIGITS])carryValueAll;

+#pragma unroll 8
  for (int digit = programIndex; digit < NUMDIGITS; digit += programCount)
  {
    const int carry = carryValue[blkIdx][digit];
@@ -245,15 +252,6 @@ void radixSort_alloc___export(const  int n)
  if (programIndex == 0)
    memoryPool =  new  int[nalloc];

-  union {int* ptr;  int val[2];} t;
-  t.ptr = memoryPool;
-  t.val[0] = __shfl(t.val[0], 0);
-  t.val[1] = __shfl(t.val[1], 0);
-  memoryPool = t.ptr;
-
-
-
-
  sharedCounts = memoryPool;
  countsGlobal = sharedCounts + nSharedCounts;
  excScan      = countsGlobal + nCountsGlobal;
@@ -262,7 +260,7 @@ void radixSort_alloc___export(const  int n)
  prefixSum    = partialSum   + nPartialSum;
 }

-extern "C" __global__
+extern "C" 
 void radixSort_alloc(const  int n)
 {
  radixSort_alloc___export<<<1,32>>>(n);
@@ -275,7 +273,8 @@ void radixSort_freeBufKeys()
 {
  if (numElementsBuf > 0)
  {
-    delete bufKeys;
+    if (programIndex == 0)
+      delete bufKeys;
    numElementsBuf = 0;
  }
 }
@@ -289,7 +288,7 @@ __global__ void radixSort_free___export()

  radixSort_freeBufKeys();
 }
-extern "C" __global__
+extern "C"
 void radixSort_free()
 {
  radixSort_free___export<<<1,32>>>();
@@ -312,13 +311,6 @@ __global__ void radixSort___export(
    numElementsBuf = numElements;
    if (programIndex == 0)
      bufKeys =  new  Key[numElementsBuf];
-    union {Key* ptr;  int val[2];} t;
-    t.ptr = bufKeys;
-    t.val[0] = __shfl(t.val[0], 0);
-    t.val[1] = __shfl(t.val[1], 0);
-    bufKeys = t.ptr;
-
-
  }

  const  int blkDim  = (numElements + numBlocks - 1) / numBlocks;
@@ -336,6 +328,7 @@ __global__ void radixSort___export(
    /* exclusive scan on global histogram */
    int carry = 0;
    excScan[0] = 0;
+#pragma unroll 8
    for (int digit = programIndex; digit < NUMDIGITS; digit += programCount)
    {
      const int value = countsGlobal[digit];
@@ -357,14 +350,15 @@ __global__ void radixSort___export(
          excScan);
    sync;
  }
-
 }
-extern "C" __global__
+
+extern "C"
 void radixSort(
    const  int numElements,
    Key keys[],
    const  int nBits)
 {
+  cudaDeviceSetCacheConfig ( cudaFuncCachePreferEqual );
  radixSort___export<<<1,32>>>(numElements, keys, nBits);
  sync;
 }
--- a/examples_ptx/radixSort/radixSort.ispc
+++ b/examples_ptx/radixSort/radixSort.ispc
@@ -63,14 +63,11 @@ void sortPass(

  const uniform int mask = (1 << NUMBITS) - 1;

-  const int unitScan     = exclusive_scan_add(1);
-
- /* copy digit offset from Gmem to Lmem */ 
+  /* copy digit offset from Gmem to Lmem */ 
  uniform int digitOffsets[NUMDIGITS];
  foreach (digit = 0 ... NUMDIGITS)
    digitOffsets[digit] = digitOffsetsAll[blockIdx*NUMDIGITS + digit];

-
  foreach (i = 0 ... nloc)
  {
    const int key = mask & ((unsigned int)keys[i] >> bit);
@@ -78,7 +75,7 @@ void sortPass(
    /* not a vector friendly loop */
    foreach_active(iv)
      scatter = digitOffsets[key]++;
-    sorted [scatter] = keys[i];
+    sorted[scatter] = keys[i];
  }
 }

@@ -237,7 +234,7 @@ export void radixSort_free()
  delete memoryPool;
  memoryPool = NULL;

-  radixSort_freeBufKeys;
+  radixSort_freeBufKeys();
 }

 export void radixSort(