added radixSort

2014-02-20 13:49:58 +01:00
parent 0c0655c0f2
commit 463ddf1707
5 changed files with 809 additions and 0 deletions
--- a/examples/portable/radixSort/Makefile_cpu
+++ b/examples/portable/radixSort/Makefile_cpu
@@ -0,0 +1,9 @@
+
+EXAMPLE=radixSort
+CPP_SRC=radixSort.cpp 
+ISPC_SRC=radixSort.ispc
+ISPC_IA_TARGETS=avx1-i32x8
+ISPC_ARM_TARGETS=neon
+#ISPC_FLAGS=-DDEBUG -g
+
+include ../common_cpu.mk
--- a/examples/portable/radixSort/Makefile_ptx
+++ b/examples/portable/radixSort/Makefile_ptx
@@ -0,0 +1,15 @@
+PROG=radixSort
+ISPC_SRC=radixSort.ispc
+
+CU_SRC=radixSort.cu
+# NVCC_FLAGS=-Xptxas=-O1
+CXX_SRC=radixSort.cpp  radixSort.cpp
+PTXCC_REGMAX=64
+
+LLVM_GPU=1
+NVVM_GPU=1
+
+include ../common_ptx.mk
+
+
+
--- a/examples/portable/radixSort/radixSort.cpp
+++ b/examples/portable/radixSort/radixSort.cpp
@@ -0,0 +1,119 @@
+#include <cstdio>
+#include <cstdlib>
+#include <algorithm>
+#include <iostream>
+#include <cassert>
+#include <iomanip>
+#include "timing.h"
+#include "ispc_malloc.h"
+#include "radixSort_ispc.h"
+
+/* progress bar by Ross Hemsley;
+ * http://www.rosshemsley.co.uk/2011/02/creating-a-progress-bar-in-c-or-any-other-console-app/ */
+static inline void progressbar (unsigned int x, unsigned int n, unsigned int w = 50)
+{
+  if (n < 100)
+  {
+    x *= 100/n;
+    n = 100;
+  }
+
+  if ((x != n) && (x % (n/100) != 0)) return;
+
+  using namespace std;
+  float ratio  =  x/(float)n;
+  int c =  ratio * w;
+
+  cout << setw(3) << (int)(ratio*100) << "% [";
+  for (int x=0; x<c; x++) cout << "=";
+  for (int x=c; x<w; x++) cout << " ";
+  cout << "]\r" << flush;
+}
+
+struct Key
+{
+  int32_t key,val;
+};
+
+int main (int argc, char *argv[])
+{
+  int i, j, n = argc == 1 ? 1000000 : atoi(argv[1]), m = n < 100 ? 1 : 50, l = n < 100 ? n : RAND_MAX;
+  double tISPC1 = 0.0, tISPC2 = 0.0, tSerial = 0.0;
+  Key *keys = new Key [n];
+  Key *keys_orig = new Key [n];
+  unsigned int *keys_gold = new unsigned int [n];
+  
+  srand48(rtc()*65536);
+
+  int sortBits = 32;
+  assert(sortBits <= 32);
+
+#pragma omp parallel for
+  for (int i = 0; i < n; i++)
+  {
+    keys[i].key = ((int)(drand48() * (1<<30))) & ((1ULL << sortBits) - 1);
+    keys[i].val = i;
+  }
+
+  std::random_shuffle(keys, keys + n);
+
+#pragma omp parallel for
+  for (int i = 0; i < n; i++)
+  {
+    keys_gold[i] = keys[i].key;
+    keys_orig[i] = keys[i];
+  }
+    
+  ispcSetMallocHeapLimit(1024*1024*1024);
+
+  ispc::radixSort_alloc(n);
+
+  tISPC2 = 1e30;
+  for (i = 0; i < m; i ++)
+  {
+    ispcMemcpy(keys, keys_orig, n*sizeof(Key));
+    reset_and_start_timer();
+    ispc::radixSort(n, (int64_t*)keys, sortBits);
+    tISPC2 = std::min(tISPC2, get_elapsed_msec());
+    if (argc != 3)
+        progressbar (i, m);
+  }
+
+  ispc::radixSort_free();
+
+  printf("[sort ispc + tasks]:\t[%.3f] msec [%.3f Mpair/s]\n", tISPC2, 1.0e-3*n/tISPC2);
+
+  std::sort(keys_gold, keys_gold + n);
+  for (int i = 0; i < n; i++)
+    assert(keys[i].key == keys_gold[i]);
+
+
+#if 0
+  for (i = 0; i < m; i ++)
+  {
+    ispcMemcpy(code, code_orig, n*sizeof(unsigned int));
+
+    reset_and_start_timer();
+
+    sort_serial (n, code, order);
+
+    tSerial += get_elapsed_msec();
+
+    if (argc != 3)
+        progressbar (i, m);
+  }
+
+  printf("[sort serial]:\t\t[%.3f] msec [%.3f Mpair/s]\n", tSerial, 1.0e-3*n*m/tSerial);
+
+#ifndef _CUDA_
+  printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n", tSerial/tISPC1, tSerial/tISPC2);
+#else
+  printf("\t\t\t\t(%.2fx speedup from ISPC + tasks)\n", tSerial/tISPC2);
+#endif
+#endif
+
+  delete keys;
+  delete keys_orig;
+  delete keys_gold;
+  return 0;
+}
--- a/examples/portable/radixSort/radixSort.cu
+++ b/examples/portable/radixSort/radixSort.cu
@@ -0,0 +1,365 @@
+#include "cuda_helpers.cuh"
+#include <cassert>
+
+#define NUMBITS 8
+#define NUMDIGITS (1<<NUMBITS)
+
+typedef long long Key;
+
+__forceinline__ __device__ int atomic_add_global(int* ptr, int value)
+{
+  return atomicAdd(ptr, value);
+} 
+
+static __device__ __forceinline__ int shfl_scan_add_step(int partial, int up_offset)
+{
+  int result;
+  asm(
+      "{.reg .u32 r0;"
+      ".reg .pred p;"
+      "shfl.up.b32 r0|p, %1, %2, 0;"
+      "@p add.u32 r0, r0, %3;"
+      "mov.u32 %0, r0;}"
+      : "=r"(result) : "r"(partial), "r"(up_offset), "r"(partial));
+  return result;
+}
+
+__forceinline__ __device__ int exclusive_scan_add(int value)
+{
+  int mysum = value;
+#pragma unroll
+  for(int i = 0; i < 5; ++i)
+    mysum = shfl_scan_add_step(mysum, 1 << i);
+  return mysum - value;
+}
+
+__global__
+void countPass(
+    const  Key keysAll[],
+    Key sortedAll[],
+    const  int bit,
+    const  int numElements,
+    int countsAll[],
+    int countsGlobal[])
+{
+  const  int  blkIdx = taskIndex;
+  const  int numBlocks = taskCount;
+  const  int  blkDim = (numElements + numBlocks - 1) / numBlocks;
+
+  const  int mask = (1 << NUMBITS) - 1;
+
+  const  Key *  keys   =   keysAll + blkIdx*blkDim;
+  Key *  sorted = sortedAll + blkIdx*blkDim;
+  int *      counts = countsAll + blkIdx*NUMDIGITS;
+  const  int           nloc = min(numElements - blkIdx*blkDim, blkDim);
+
+#pragma unroll 8
+  for (int digit = programIndex; digit < NUMDIGITS; digit += programCount)
+    counts[digit] = 0;
+
+  for (int i = programIndex; i < nloc; i += programCount)
+    if (i < nloc)
+    {
+      sorted[i] = keys[i];
+      const int key = mask & ((unsigned int)keys[i] >> bit);
+      atomic_add_global(&counts[key], 1);
+    }
+
+#pragma unroll 8
+  for (int digit = programIndex; digit < NUMDIGITS; digit += programCount)
+    atomic_add_global(&countsGlobal[digit], counts[digit]);
+}
+
+__global__
+void sortPass(
+    Key keysAll[],
+    Key sorted[],
+    int bit,
+    int numElements,
+    int digitOffsetsAll[])
+{
+  const  int  blkIdx = taskIndex;
+  const  int numBlocks = taskCount;
+
+  const  int  blkDim = (numElements + numBlocks - 1) / numBlocks;
+
+
+  const  int keyIndex = blkIdx * blkDim;
+  Key *  keys = keysAll + keyIndex;
+
+
+  const  int nloc = min(numElements - keyIndex, blkDim);
+
+  const  int mask = (1 << NUMBITS) - 1;
+
+  /* copy digit offset from Gmem to Lmem */ 
+#if 1
+  __shared__ int digitOffsets_sh[NUMDIGITS*4];
+  volatile int *digitOffsets = digitOffsets_sh + warpIdx*NUMDIGITS;
+  for (int digit = programIndex; digit < NUMDIGITS; digit += programCount)
+    digitOffsets[digit] = digitOffsetsAll[blkIdx*NUMDIGITS + digit];
+#else
+  int *digitOffsets = &digitOffsetsAll[blkIdx*NUMDIGITS];
+#endif
+
+
+  for (int i = programIndex; i < nloc; i += programCount)
+    if (i < nloc)
+    {
+      const int key = mask & ((unsigned int)keys[i] >> bit);
+      int scatter;
+      /* not a vector friendly loop */
+#pragma unroll 1  /* needed, otherwise compiler unroll and optimizes the result :S */
+      for (int iv = 0; iv < programCount; iv++)
+        if (programIndex == iv)
+          scatter = digitOffsets[key]++;
+      sorted [scatter] = keys[i];
+    }
+}
+
+__global__
+void partialScanLocal(
+    int numBlocks,
+    int excScanAll[],
+    int  countsAll[],
+    int partialSumAll[])
+{
+  const  int  blkIdx = taskIndex;
+
+  const  int  blkDim = (numBlocks+taskCount-1)/taskCount;
+  const  int      bbeg = blkIdx * blkDim;
+  const  int      bend = min(bbeg + blkDim, numBlocks);
+
+  int (*   countsBlock)[NUMDIGITS] = ( int (*)[NUMDIGITS])countsAll;
+  int (*  excScanBlock)[NUMDIGITS] = ( int (*)[NUMDIGITS])excScanAll;
+  int (*    partialSum)[NUMDIGITS] = ( int (*)[NUMDIGITS])partialSumAll;
+
+#pragma unroll 8
+  for (int digit = programIndex; digit < NUMDIGITS; digit += programCount)
+  {
+    int prev = bbeg == 0 ? excScanBlock[0][digit] : 0;
+    for ( int block = bbeg; block < bend; block++)
+    {
+      const int y = countsBlock[block][digit];
+      excScanBlock[block][digit] = prev;
+      prev += y;
+    }
+    partialSum[blkIdx][digit] = excScanBlock[bend-1][digit] + countsBlock[bend-1][digit];
+  }
+}
+
+__global__
+void partialScanGlobal(
+    const  int numBlocks,
+    int partialSumAll[],
+    int prefixSumAll[])
+{
+  int (*  partialSum)[NUMDIGITS] = ( int (*)[NUMDIGITS])partialSumAll;
+  int (*   prefixSum)[NUMDIGITS] = ( int (*)[NUMDIGITS]) prefixSumAll;
+  const  int digit = taskIndex;
+  int carry = 0;
+  for (int block = programIndex;  block < numBlocks; block += programCount)
+  {
+    const int value = partialSum[block][digit];
+    const int scan  = exclusive_scan_add(value);
+    if (block < numBlocks)
+      prefixSum[block][digit] = scan + carry;
+    carry += __shfl(scan+value, programCount-1);
+  }
+}
+
+__global__
+void completeScanGlobal(
+    int numBlocks,
+    int excScanAll[],
+    int carryValueAll[])
+{
+  const  int  blkIdx = taskIndex;
+  const  int  blkDim = (numBlocks+taskCount-1)/taskCount;
+  const  int      bbeg = blkIdx * blkDim;
+  const  int      bend = min(bbeg  + blkDim, numBlocks);
+
+  int (*  excScanBlock)[NUMDIGITS] = ( int (*)[NUMDIGITS])excScanAll;
+  int (*    carryValue)[NUMDIGITS] = ( int (*)[NUMDIGITS])carryValueAll;
+
+#pragma unroll 8
+  for (int digit = programIndex; digit < NUMDIGITS; digit += programCount)
+  {
+    const int carry = carryValue[blkIdx][digit];
+    for ( int block = bbeg; block < bend; block++)
+      excScanBlock[block][digit] += carry;
+  }
+}
+
+__device__ static 
+inline void radixExclusiveScan(
+    const  int numBlocks,
+    int excScanPtr[],
+    int  countsPtr[],
+    int partialSum[],
+    int  prefixSum[])
+{
+  const  int scale = 8;
+  launch (numBlocks/scale, 1,1, partialScanLocal)(numBlocks, excScanPtr, countsPtr, partialSum);
+  sync;
+
+  launch (NUMDIGITS,1,1,partialScanGlobal) (numBlocks/scale, partialSum, prefixSum);
+  sync;
+
+  launch (numBlocks/scale,1,1, completeScanGlobal) (numBlocks, excScanPtr, prefixSum);
+  sync;
+}
+
+__device__ static  int *  memoryPool = NULL;
+__device__ static  int numBlocks;
+__device__ static  int nSharedCounts;
+__device__ static  int nCountsGlobal;
+__device__ static  int nExcScan;
+__device__ static  int nCountsBlock;
+__device__ static  int nPartialSum;
+__device__ static  int nPrefixSum;
+
+__device__ static  int *  sharedCounts;
+__device__ static  int *  countsGlobal;
+__device__ static  int *  excScan;
+__device__ static  int *  counts;
+__device__ static  int *  partialSum;
+__device__ static  int *  prefixSum;
+
+__device__ static  int numElementsBuf = 0;
+__device__ static  Key *  bufKeys;
+
+__global__
+void radixSort_alloc___export(const  int n)
+{
+  assert(memoryPool == NULL);
+  numBlocks     = 13*32*4;
+  nSharedCounts = NUMDIGITS*numBlocks;
+  nCountsGlobal = NUMDIGITS;
+  nExcScan      = NUMDIGITS*numBlocks;
+  nCountsBlock  = NUMDIGITS*numBlocks;
+  nPartialSum   = NUMDIGITS*numBlocks;
+  nPrefixSum    = NUMDIGITS*numBlocks;
+
+
+  const  int nalloc = 
+    nSharedCounts +
+    nCountsGlobal +
+    nExcScan +
+    nCountsBlock + 
+    nPartialSum +
+    nPrefixSum;
+
+  if (programIndex == 0)
+    memoryPool =  new  int[nalloc];
+
+  sharedCounts = memoryPool;
+  countsGlobal = sharedCounts + nSharedCounts;
+  excScan      = countsGlobal + nCountsGlobal;
+  counts       = excScan      + nExcScan;
+  partialSum   = counts       + nCountsBlock;
+  prefixSum    = partialSum   + nPartialSum;
+}
+
+extern "C" 
+void radixSort_alloc(const  int n)
+{
+  radixSort_alloc___export<<<1,32>>>(n);
+  sync;
+}
+
+
+__device__  static 
+void radixSort_freeBufKeys()
+{
+  if (numElementsBuf > 0)
+  {
+    if (programIndex == 0)
+      delete bufKeys;
+    numElementsBuf = 0;
+  }
+}
+
+__global__ void radixSort_free___export()
+{
+  assert(memoryPool != NULL);
+  if (programIndex == 0)
+    delete memoryPool;
+  memoryPool = NULL;
+
+  radixSort_freeBufKeys();
+}
+extern "C"
+void radixSort_free()
+{
+  radixSort_free___export<<<1,32>>>();
+  sync;
+}
+
+__global__ void radixSort___export(
+    const  int numElements,
+    Key keys[],
+    const  int nBits)
+{
+#ifdef __NVPTX__
+  assert((numBlocks & 3) == 0);  /* task granularity on Kepler is 4 */
+#endif
+
+  if (numElementsBuf < numElements)
+    radixSort_freeBufKeys();
+  if (numElementsBuf == 0)
+  {
+    numElementsBuf = numElements;
+    if (programIndex == 0)
+      bufKeys =  new  Key[numElementsBuf];
+  }
+
+  const  int blkDim  = (numElements + numBlocks - 1) / numBlocks;
+
+  for ( int bit = 0; bit < nBits; bit += NUMBITS)
+  {
+    /* initialize histogram for each digit */
+    for (int digit = programIndex; digit < NUMDIGITS; digit += programCount)
+      countsGlobal[digit] = 0;
+
+    /* compute histogram for each digit */
+    launch (numBlocks,1,1, countPass)(keys, bufKeys, bit, numElements, counts, countsGlobal);
+    sync;
+
+    /* exclusive scan on global histogram */
+    int carry = 0;
+    excScan[0] = 0;
+#pragma unroll 8
+    for (int digit = programIndex; digit < NUMDIGITS; digit += programCount)
+    {
+      const int value = countsGlobal[digit];
+      const int scan  = exclusive_scan_add(value);
+      excScan[digit] = scan + carry;
+      carry += __shfl(scan+value, programCount-1);
+    }
+
+    /* computing offsets for each digit */
+    radixExclusiveScan(numBlocks, excScan, counts, partialSum, prefixSum);
+
+    /* sorting */
+    launch (numBlocks,1,1,
+      sortPass)(
+          bufKeys, 
+          keys, 
+          bit, 
+          numElements,
+          excScan);
+    sync;
+  }
+}
+
+extern "C"
+void radixSort(
+    const  int numElements,
+    Key keys[],
+    const  int nBits)
+{
+  cudaDeviceSetCacheConfig ( cudaFuncCachePreferEqual );
+  radixSort___export<<<1,32>>>(numElements, keys, nBits);
+  sync;
+}
--- a/examples/portable/radixSort/radixSort.ispc
+++ b/examples/portable/radixSort/radixSort.ispc
@@ -0,0 +1,301 @@
+#define NUMBITS 8
+#define NUMDIGITS (1<<NUMBITS)
+
+typedef int64 Key;
+
+task
+void countPass(
+    const uniform Key keysAll[],
+    uniform Key sortedAll[],
+    const uniform int bit,
+    const uniform int numElements,
+    uniform int countsAll[],
+    uniform int countsGlobal[])
+{
+  const uniform int  blockIdx = taskIndex;
+  const uniform int numBlocks = taskCount;
+  const uniform int  blockDim = (numElements + numBlocks - 1) / numBlocks;
+
+  const uniform int mask = (1 << NUMBITS) - 1;
+
+  const uniform Key * uniform keys   =   keysAll + blockIdx*blockDim;
+  uniform Key * uniform sorted = sortedAll + blockIdx*blockDim;
+  uniform int * uniform     counts = countsAll + blockIdx*NUMDIGITS;
+  const uniform int           nloc = min(numElements - blockIdx*blockDim, blockDim);
+
+  foreach (digit = 0 ... NUMDIGITS)
+    counts[digit] = 0;
+
+  foreach (i = 0 ... nloc)
+  {
+    sorted[i] = keys[i];
+    const int key = mask & ((unsigned int)keys[i] >> bit);
+#ifdef __NVPTX__
+    atomic_add_global(&counts[key], 1);
+#else
+    atomic_add_local(&counts[key], 1);
+#endif
+  }
+
+  foreach (digit = 0 ... NUMDIGITS)
+    atomic_add_global(&countsGlobal[digit], counts[digit]);
+}
+
+task
+void sortPass(
+    uniform Key keysAll[],
+    uniform Key sorted[],
+    uniform int bit,
+    uniform int numElements,
+    uniform int digitOffsetsAll[])
+{
+  const uniform int  blockIdx = taskIndex;
+  const uniform int numBlocks = taskCount;
+
+  const uniform int  blockDim = (numElements + numBlocks - 1) / numBlocks;
+
+
+  const uniform int keyIndex = blockIdx * blockDim;
+  uniform Key * uniform keys = keysAll + keyIndex;
+
+
+  const uniform int nloc = min(numElements - keyIndex, blockDim);
+
+  const uniform int mask = (1 << NUMBITS) - 1;
+
+  /* copy digit offset from Gmem to Lmem */ 
+#if 1
+  uniform int digitOffsets[NUMDIGITS];
+  foreach (digit = 0 ... NUMDIGITS)
+    digitOffsets[digit] = digitOffsetsAll[blockIdx*NUMDIGITS + digit];
+#else
+  uniform int * uniform digitOffsets = &digitOffsetsAll[blockIdx*NUMDIGITS];
+#endif
+
+  foreach (i = 0 ... nloc)
+  {
+    const int key = mask & ((unsigned int)keys[i] >> bit);
+    int scatter;
+    /* not a vector friendly loop */
+    foreach_active(iv)
+      scatter = digitOffsets[key]++;
+    sorted[scatter] = keys[i];
+  }
+}
+
+task
+void partialScanLocal(
+    uniform int numBlocks,
+    uniform int excScanAll[],
+    uniform int  countsAll[],
+    uniform int partialSumAll[])
+{
+  const uniform int  blockIdx = taskIndex;
+
+  const uniform int  blockDim = (numBlocks+taskCount-1)/taskCount;
+  const uniform int      bbeg = blockIdx * blockDim;
+  const uniform int      bend = min(bbeg + blockDim, numBlocks);
+    
+  uniform int (* uniform  countsBlock)[NUMDIGITS] = (uniform int (*)[NUMDIGITS])countsAll;
+  uniform int (* uniform excScanBlock)[NUMDIGITS] = (uniform int (*)[NUMDIGITS])excScanAll;
+  uniform int (* uniform   partialSum)[NUMDIGITS] = (uniform int (*)[NUMDIGITS])partialSumAll;
+
+  foreach (digit = 0 ... NUMDIGITS)
+  {
+    int prev = bbeg == 0 ? excScanBlock[0][digit] : 0;
+    for (uniform int block = bbeg; block < bend; block++)
+    {
+      const int y = countsBlock[block][digit];
+      excScanBlock[block][digit] = prev;
+      prev += y;
+    }
+    partialSum[blockIdx][digit] = excScanBlock[bend-1][digit] + countsBlock[bend-1][digit];
+  }
+}
+
+task
+void partialScanGlobal(
+    const uniform int numBlocks,
+    uniform int partialSumAll[],
+    uniform int prefixSumAll[])
+{
+  uniform int (* uniform partialSum)[NUMDIGITS] = (uniform int (*)[NUMDIGITS])partialSumAll;
+  uniform int (* uniform  prefixSum)[NUMDIGITS] = (uniform int (*)[NUMDIGITS]) prefixSumAll;
+  const uniform int digit = taskIndex;
+  int carry = 0;
+  foreach (block = 0 ... numBlocks)
+  {
+    const int value = partialSum[block][digit];
+    const int scan  = exclusive_scan_add(value);
+    prefixSum[block][digit] = scan + carry;
+    carry += broadcast(scan+value, programCount-1);
+  }
+}
+
+task
+void completeScanGlobal(
+    uniform int numBlocks,
+    uniform int excScanAll[],
+    uniform int carryValueAll[])
+{
+  const uniform int  blockIdx = taskIndex;
+  const uniform int  blockDim = (numBlocks+taskCount-1)/taskCount;
+  const uniform int      bbeg = blockIdx * blockDim;
+  const uniform int      bend = min(bbeg  + blockDim, numBlocks);
+  
+  uniform int (* uniform excScanBlock)[NUMDIGITS] = (uniform int (*)[NUMDIGITS])excScanAll;
+  uniform int (* uniform   carryValue)[NUMDIGITS] = (uniform int (*)[NUMDIGITS])carryValueAll;
+
+  foreach (digit = 0 ... NUMDIGITS)
+  {
+    const int carry = carryValue[blockIdx][digit];
+    for (uniform int block = bbeg; block < bend; block++)
+      excScanBlock[block][digit] += carry;
+  }
+}
+
+static 
+inline void radixExclusiveScan(
+    const uniform int numBlocks,
+    uniform int excScanPtr[],
+    uniform int  countsPtr[],
+    uniform int partialSum[],
+    uniform int  prefixSum[])
+{
+  const uniform int scale = 8;
+  launch [numBlocks/scale] partialScanLocal(numBlocks, excScanPtr, countsPtr, partialSum);
+  sync;
+
+  launch [NUMDIGITS] partialScanGlobal(numBlocks/scale, partialSum, prefixSum);
+  sync;
+
+  launch [numBlocks/scale] completeScanGlobal(numBlocks, excScanPtr, prefixSum);
+  sync;
+}
+
+static uniform int * uniform memoryPool = NULL;
+static uniform int numBlocks;
+static uniform int nSharedCounts;
+static uniform int nCountsGlobal;
+static uniform int nExcScan;
+static uniform int nCountsBlock;
+static uniform int nPartialSum;
+static uniform int nPrefixSum;
+
+static uniform int * uniform sharedCounts;
+static uniform int * uniform countsGlobal;
+static uniform int * uniform excScan;
+static uniform int * uniform counts;
+static uniform int * uniform partialSum;
+static uniform int * uniform prefixSum;
+
+static uniform int numElementsBuf = 0;
+static uniform Key * uniform bufKeys;
+
+export void radixSort_alloc(const uniform int n)
+{
+  assert(memoryPool == NULL);
+  numBlocks     = num_cores()*4;
+#ifdef __NVPTX__
+  numBlocks     = 13*32*4; //num_cores()*4;
+#endif
+  nSharedCounts = NUMDIGITS*numBlocks;
+  nCountsGlobal = NUMDIGITS;
+  nExcScan      = NUMDIGITS*numBlocks;
+  nCountsBlock  = NUMDIGITS*numBlocks;
+  nPartialSum   = NUMDIGITS*numBlocks;
+  nPrefixSum    = NUMDIGITS*numBlocks;
+
+
+  const uniform int nalloc = 
+    nSharedCounts +
+    nCountsGlobal +
+    nExcScan +
+    nCountsBlock + 
+    nPartialSum +
+    nPrefixSum;
+
+  memoryPool = uniform new uniform int[nalloc];
+
+  sharedCounts = memoryPool;
+  countsGlobal = sharedCounts + nSharedCounts;
+  excScan      = countsGlobal + nCountsGlobal;
+  counts       = excScan      + nExcScan;
+  partialSum   = counts       + nCountsBlock;
+  prefixSum    = partialSum   + nPartialSum;
+}
+
+static 
+void radixSort_freeBufKeys()
+{
+  if (numElementsBuf > 0)
+  {
+    delete bufKeys;
+    numElementsBuf = 0;
+  }
+}
+
+export void radixSort_free()
+{
+  assert(memoryPool != NULL);
+  delete memoryPool;
+  memoryPool = NULL;
+
+  radixSort_freeBufKeys();
+}
+
+export void radixSort(
+    const uniform int numElements,
+    uniform Key keys[],
+    const uniform int nBits)
+{
+#ifdef __NVPTX__
+  assert((numBlocks & 3) == 0);  /* task granularity on Kepler is 4 */
+#endif
+
+  if (numElementsBuf < numElements)
+    radixSort_freeBufKeys();
+  if (numElementsBuf == 0)
+  {
+    numElementsBuf = numElements;
+    bufKeys = uniform new uniform Key[numElementsBuf];
+  }
+
+  const uniform int blockDim  = (numElements + numBlocks - 1) / numBlocks;
+
+  for (uniform int bit = 0; bit < nBits; bit += NUMBITS)
+  {
+    /* initialize histogram for each digit */
+    foreach (digit = 0 ... NUMDIGITS)
+      countsGlobal[digit] = 0;
+
+    /* compute histogram for each digit */
+    launch [numBlocks] countPass(keys, bufKeys, bit, numElements, counts, countsGlobal);
+    sync;
+
+    /* exclusive scan on global histogram */
+    int carry = 0;
+    excScan[0] = 0;
+    foreach (digit = 0 ... NUMDIGITS)
+    {
+      const int value = countsGlobal[digit];
+      const int scan  = exclusive_scan_add(value);
+      excScan[digit] = scan + carry;
+      carry += broadcast(scan+value, programCount-1);
+    }
+          
+    /* computing offsets for each digit */
+    radixExclusiveScan(numBlocks, excScan, counts, partialSum, prefixSum);
+
+    /* sorting */
+    launch [numBlocks] 
+      sortPass(
+          bufKeys, 
+          keys, 
+          bit, 
+          numElements,
+          excScan);
+    sync;
+  }
+
+}