added radixSort
This commit is contained in:
9
examples/portable/radixSort/Makefile_cpu
Normal file
9
examples/portable/radixSort/Makefile_cpu
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
|
||||||
|
EXAMPLE=radixSort
|
||||||
|
CPP_SRC=radixSort.cpp
|
||||||
|
ISPC_SRC=radixSort.ispc
|
||||||
|
ISPC_IA_TARGETS=avx1-i32x8
|
||||||
|
ISPC_ARM_TARGETS=neon
|
||||||
|
#ISPC_FLAGS=-DDEBUG -g
|
||||||
|
|
||||||
|
include ../common_cpu.mk
|
||||||
15
examples/portable/radixSort/Makefile_ptx
Normal file
15
examples/portable/radixSort/Makefile_ptx
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
PROG=radixSort
|
||||||
|
ISPC_SRC=radixSort.ispc
|
||||||
|
|
||||||
|
CU_SRC=radixSort.cu
|
||||||
|
# NVCC_FLAGS=-Xptxas=-O1
|
||||||
|
CXX_SRC=radixSort.cpp radixSort.cpp
|
||||||
|
PTXCC_REGMAX=64
|
||||||
|
|
||||||
|
LLVM_GPU=1
|
||||||
|
NVVM_GPU=1
|
||||||
|
|
||||||
|
include ../common_ptx.mk
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
119
examples/portable/radixSort/radixSort.cpp
Normal file
119
examples/portable/radixSort/radixSort.cpp
Normal file
@@ -0,0 +1,119 @@
|
|||||||
|
#include <cstdio>
|
||||||
|
#include <cstdlib>
|
||||||
|
#include <algorithm>
|
||||||
|
#include <iostream>
|
||||||
|
#include <cassert>
|
||||||
|
#include <iomanip>
|
||||||
|
#include "timing.h"
|
||||||
|
#include "ispc_malloc.h"
|
||||||
|
#include "radixSort_ispc.h"
|
||||||
|
|
||||||
|
/* progress bar by Ross Hemsley;
|
||||||
|
* http://www.rosshemsley.co.uk/2011/02/creating-a-progress-bar-in-c-or-any-other-console-app/ */
|
||||||
|
static inline void progressbar (unsigned int x, unsigned int n, unsigned int w = 50)
|
||||||
|
{
|
||||||
|
if (n < 100)
|
||||||
|
{
|
||||||
|
x *= 100/n;
|
||||||
|
n = 100;
|
||||||
|
}
|
||||||
|
|
||||||
|
if ((x != n) && (x % (n/100) != 0)) return;
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
float ratio = x/(float)n;
|
||||||
|
int c = ratio * w;
|
||||||
|
|
||||||
|
cout << setw(3) << (int)(ratio*100) << "% [";
|
||||||
|
for (int x=0; x<c; x++) cout << "=";
|
||||||
|
for (int x=c; x<w; x++) cout << " ";
|
||||||
|
cout << "]\r" << flush;
|
||||||
|
}
|
||||||
|
|
||||||
|
struct Key
|
||||||
|
{
|
||||||
|
int32_t key,val;
|
||||||
|
};
|
||||||
|
|
||||||
|
int main (int argc, char *argv[])
|
||||||
|
{
|
||||||
|
int i, j, n = argc == 1 ? 1000000 : atoi(argv[1]), m = n < 100 ? 1 : 50, l = n < 100 ? n : RAND_MAX;
|
||||||
|
double tISPC1 = 0.0, tISPC2 = 0.0, tSerial = 0.0;
|
||||||
|
Key *keys = new Key [n];
|
||||||
|
Key *keys_orig = new Key [n];
|
||||||
|
unsigned int *keys_gold = new unsigned int [n];
|
||||||
|
|
||||||
|
srand48(rtc()*65536);
|
||||||
|
|
||||||
|
int sortBits = 32;
|
||||||
|
assert(sortBits <= 32);
|
||||||
|
|
||||||
|
#pragma omp parallel for
|
||||||
|
for (int i = 0; i < n; i++)
|
||||||
|
{
|
||||||
|
keys[i].key = ((int)(drand48() * (1<<30))) & ((1ULL << sortBits) - 1);
|
||||||
|
keys[i].val = i;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::random_shuffle(keys, keys + n);
|
||||||
|
|
||||||
|
#pragma omp parallel for
|
||||||
|
for (int i = 0; i < n; i++)
|
||||||
|
{
|
||||||
|
keys_gold[i] = keys[i].key;
|
||||||
|
keys_orig[i] = keys[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
ispcSetMallocHeapLimit(1024*1024*1024);
|
||||||
|
|
||||||
|
ispc::radixSort_alloc(n);
|
||||||
|
|
||||||
|
tISPC2 = 1e30;
|
||||||
|
for (i = 0; i < m; i ++)
|
||||||
|
{
|
||||||
|
ispcMemcpy(keys, keys_orig, n*sizeof(Key));
|
||||||
|
reset_and_start_timer();
|
||||||
|
ispc::radixSort(n, (int64_t*)keys, sortBits);
|
||||||
|
tISPC2 = std::min(tISPC2, get_elapsed_msec());
|
||||||
|
if (argc != 3)
|
||||||
|
progressbar (i, m);
|
||||||
|
}
|
||||||
|
|
||||||
|
ispc::radixSort_free();
|
||||||
|
|
||||||
|
printf("[sort ispc + tasks]:\t[%.3f] msec [%.3f Mpair/s]\n", tISPC2, 1.0e-3*n/tISPC2);
|
||||||
|
|
||||||
|
std::sort(keys_gold, keys_gold + n);
|
||||||
|
for (int i = 0; i < n; i++)
|
||||||
|
assert(keys[i].key == keys_gold[i]);
|
||||||
|
|
||||||
|
|
||||||
|
#if 0
|
||||||
|
for (i = 0; i < m; i ++)
|
||||||
|
{
|
||||||
|
ispcMemcpy(code, code_orig, n*sizeof(unsigned int));
|
||||||
|
|
||||||
|
reset_and_start_timer();
|
||||||
|
|
||||||
|
sort_serial (n, code, order);
|
||||||
|
|
||||||
|
tSerial += get_elapsed_msec();
|
||||||
|
|
||||||
|
if (argc != 3)
|
||||||
|
progressbar (i, m);
|
||||||
|
}
|
||||||
|
|
||||||
|
printf("[sort serial]:\t\t[%.3f] msec [%.3f Mpair/s]\n", tSerial, 1.0e-3*n*m/tSerial);
|
||||||
|
|
||||||
|
#ifndef _CUDA_
|
||||||
|
printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n", tSerial/tISPC1, tSerial/tISPC2);
|
||||||
|
#else
|
||||||
|
printf("\t\t\t\t(%.2fx speedup from ISPC + tasks)\n", tSerial/tISPC2);
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
delete keys;
|
||||||
|
delete keys_orig;
|
||||||
|
delete keys_gold;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
365
examples/portable/radixSort/radixSort.cu
Normal file
365
examples/portable/radixSort/radixSort.cu
Normal file
@@ -0,0 +1,365 @@
|
|||||||
|
#include "cuda_helpers.cuh"
|
||||||
|
#include <cassert>
|
||||||
|
|
||||||
|
#define NUMBITS 8
|
||||||
|
#define NUMDIGITS (1<<NUMBITS)
|
||||||
|
|
||||||
|
typedef long long Key;
|
||||||
|
|
||||||
|
__forceinline__ __device__ int atomic_add_global(int* ptr, int value)
|
||||||
|
{
|
||||||
|
return atomicAdd(ptr, value);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __device__ __forceinline__ int shfl_scan_add_step(int partial, int up_offset)
|
||||||
|
{
|
||||||
|
int result;
|
||||||
|
asm(
|
||||||
|
"{.reg .u32 r0;"
|
||||||
|
".reg .pred p;"
|
||||||
|
"shfl.up.b32 r0|p, %1, %2, 0;"
|
||||||
|
"@p add.u32 r0, r0, %3;"
|
||||||
|
"mov.u32 %0, r0;}"
|
||||||
|
: "=r"(result) : "r"(partial), "r"(up_offset), "r"(partial));
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
__forceinline__ __device__ int exclusive_scan_add(int value)
|
||||||
|
{
|
||||||
|
int mysum = value;
|
||||||
|
#pragma unroll
|
||||||
|
for(int i = 0; i < 5; ++i)
|
||||||
|
mysum = shfl_scan_add_step(mysum, 1 << i);
|
||||||
|
return mysum - value;
|
||||||
|
}
|
||||||
|
|
||||||
|
__global__
|
||||||
|
void countPass(
|
||||||
|
const Key keysAll[],
|
||||||
|
Key sortedAll[],
|
||||||
|
const int bit,
|
||||||
|
const int numElements,
|
||||||
|
int countsAll[],
|
||||||
|
int countsGlobal[])
|
||||||
|
{
|
||||||
|
const int blkIdx = taskIndex;
|
||||||
|
const int numBlocks = taskCount;
|
||||||
|
const int blkDim = (numElements + numBlocks - 1) / numBlocks;
|
||||||
|
|
||||||
|
const int mask = (1 << NUMBITS) - 1;
|
||||||
|
|
||||||
|
const Key * keys = keysAll + blkIdx*blkDim;
|
||||||
|
Key * sorted = sortedAll + blkIdx*blkDim;
|
||||||
|
int * counts = countsAll + blkIdx*NUMDIGITS;
|
||||||
|
const int nloc = min(numElements - blkIdx*blkDim, blkDim);
|
||||||
|
|
||||||
|
#pragma unroll 8
|
||||||
|
for (int digit = programIndex; digit < NUMDIGITS; digit += programCount)
|
||||||
|
counts[digit] = 0;
|
||||||
|
|
||||||
|
for (int i = programIndex; i < nloc; i += programCount)
|
||||||
|
if (i < nloc)
|
||||||
|
{
|
||||||
|
sorted[i] = keys[i];
|
||||||
|
const int key = mask & ((unsigned int)keys[i] >> bit);
|
||||||
|
atomic_add_global(&counts[key], 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
#pragma unroll 8
|
||||||
|
for (int digit = programIndex; digit < NUMDIGITS; digit += programCount)
|
||||||
|
atomic_add_global(&countsGlobal[digit], counts[digit]);
|
||||||
|
}
|
||||||
|
|
||||||
|
__global__
|
||||||
|
void sortPass(
|
||||||
|
Key keysAll[],
|
||||||
|
Key sorted[],
|
||||||
|
int bit,
|
||||||
|
int numElements,
|
||||||
|
int digitOffsetsAll[])
|
||||||
|
{
|
||||||
|
const int blkIdx = taskIndex;
|
||||||
|
const int numBlocks = taskCount;
|
||||||
|
|
||||||
|
const int blkDim = (numElements + numBlocks - 1) / numBlocks;
|
||||||
|
|
||||||
|
|
||||||
|
const int keyIndex = blkIdx * blkDim;
|
||||||
|
Key * keys = keysAll + keyIndex;
|
||||||
|
|
||||||
|
|
||||||
|
const int nloc = min(numElements - keyIndex, blkDim);
|
||||||
|
|
||||||
|
const int mask = (1 << NUMBITS) - 1;
|
||||||
|
|
||||||
|
/* copy digit offset from Gmem to Lmem */
|
||||||
|
#if 1
|
||||||
|
__shared__ int digitOffsets_sh[NUMDIGITS*4];
|
||||||
|
volatile int *digitOffsets = digitOffsets_sh + warpIdx*NUMDIGITS;
|
||||||
|
for (int digit = programIndex; digit < NUMDIGITS; digit += programCount)
|
||||||
|
digitOffsets[digit] = digitOffsetsAll[blkIdx*NUMDIGITS + digit];
|
||||||
|
#else
|
||||||
|
int *digitOffsets = &digitOffsetsAll[blkIdx*NUMDIGITS];
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
for (int i = programIndex; i < nloc; i += programCount)
|
||||||
|
if (i < nloc)
|
||||||
|
{
|
||||||
|
const int key = mask & ((unsigned int)keys[i] >> bit);
|
||||||
|
int scatter;
|
||||||
|
/* not a vector friendly loop */
|
||||||
|
#pragma unroll 1 /* needed, otherwise compiler unroll and optimizes the result :S */
|
||||||
|
for (int iv = 0; iv < programCount; iv++)
|
||||||
|
if (programIndex == iv)
|
||||||
|
scatter = digitOffsets[key]++;
|
||||||
|
sorted [scatter] = keys[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
__global__
|
||||||
|
void partialScanLocal(
|
||||||
|
int numBlocks,
|
||||||
|
int excScanAll[],
|
||||||
|
int countsAll[],
|
||||||
|
int partialSumAll[])
|
||||||
|
{
|
||||||
|
const int blkIdx = taskIndex;
|
||||||
|
|
||||||
|
const int blkDim = (numBlocks+taskCount-1)/taskCount;
|
||||||
|
const int bbeg = blkIdx * blkDim;
|
||||||
|
const int bend = min(bbeg + blkDim, numBlocks);
|
||||||
|
|
||||||
|
int (* countsBlock)[NUMDIGITS] = ( int (*)[NUMDIGITS])countsAll;
|
||||||
|
int (* excScanBlock)[NUMDIGITS] = ( int (*)[NUMDIGITS])excScanAll;
|
||||||
|
int (* partialSum)[NUMDIGITS] = ( int (*)[NUMDIGITS])partialSumAll;
|
||||||
|
|
||||||
|
#pragma unroll 8
|
||||||
|
for (int digit = programIndex; digit < NUMDIGITS; digit += programCount)
|
||||||
|
{
|
||||||
|
int prev = bbeg == 0 ? excScanBlock[0][digit] : 0;
|
||||||
|
for ( int block = bbeg; block < bend; block++)
|
||||||
|
{
|
||||||
|
const int y = countsBlock[block][digit];
|
||||||
|
excScanBlock[block][digit] = prev;
|
||||||
|
prev += y;
|
||||||
|
}
|
||||||
|
partialSum[blkIdx][digit] = excScanBlock[bend-1][digit] + countsBlock[bend-1][digit];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
__global__
|
||||||
|
void partialScanGlobal(
|
||||||
|
const int numBlocks,
|
||||||
|
int partialSumAll[],
|
||||||
|
int prefixSumAll[])
|
||||||
|
{
|
||||||
|
int (* partialSum)[NUMDIGITS] = ( int (*)[NUMDIGITS])partialSumAll;
|
||||||
|
int (* prefixSum)[NUMDIGITS] = ( int (*)[NUMDIGITS]) prefixSumAll;
|
||||||
|
const int digit = taskIndex;
|
||||||
|
int carry = 0;
|
||||||
|
for (int block = programIndex; block < numBlocks; block += programCount)
|
||||||
|
{
|
||||||
|
const int value = partialSum[block][digit];
|
||||||
|
const int scan = exclusive_scan_add(value);
|
||||||
|
if (block < numBlocks)
|
||||||
|
prefixSum[block][digit] = scan + carry;
|
||||||
|
carry += __shfl(scan+value, programCount-1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
__global__
|
||||||
|
void completeScanGlobal(
|
||||||
|
int numBlocks,
|
||||||
|
int excScanAll[],
|
||||||
|
int carryValueAll[])
|
||||||
|
{
|
||||||
|
const int blkIdx = taskIndex;
|
||||||
|
const int blkDim = (numBlocks+taskCount-1)/taskCount;
|
||||||
|
const int bbeg = blkIdx * blkDim;
|
||||||
|
const int bend = min(bbeg + blkDim, numBlocks);
|
||||||
|
|
||||||
|
int (* excScanBlock)[NUMDIGITS] = ( int (*)[NUMDIGITS])excScanAll;
|
||||||
|
int (* carryValue)[NUMDIGITS] = ( int (*)[NUMDIGITS])carryValueAll;
|
||||||
|
|
||||||
|
#pragma unroll 8
|
||||||
|
for (int digit = programIndex; digit < NUMDIGITS; digit += programCount)
|
||||||
|
{
|
||||||
|
const int carry = carryValue[blkIdx][digit];
|
||||||
|
for ( int block = bbeg; block < bend; block++)
|
||||||
|
excScanBlock[block][digit] += carry;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
__device__ static
|
||||||
|
inline void radixExclusiveScan(
|
||||||
|
const int numBlocks,
|
||||||
|
int excScanPtr[],
|
||||||
|
int countsPtr[],
|
||||||
|
int partialSum[],
|
||||||
|
int prefixSum[])
|
||||||
|
{
|
||||||
|
const int scale = 8;
|
||||||
|
launch (numBlocks/scale, 1,1, partialScanLocal)(numBlocks, excScanPtr, countsPtr, partialSum);
|
||||||
|
sync;
|
||||||
|
|
||||||
|
launch (NUMDIGITS,1,1,partialScanGlobal) (numBlocks/scale, partialSum, prefixSum);
|
||||||
|
sync;
|
||||||
|
|
||||||
|
launch (numBlocks/scale,1,1, completeScanGlobal) (numBlocks, excScanPtr, prefixSum);
|
||||||
|
sync;
|
||||||
|
}
|
||||||
|
|
||||||
|
__device__ static int * memoryPool = NULL;
|
||||||
|
__device__ static int numBlocks;
|
||||||
|
__device__ static int nSharedCounts;
|
||||||
|
__device__ static int nCountsGlobal;
|
||||||
|
__device__ static int nExcScan;
|
||||||
|
__device__ static int nCountsBlock;
|
||||||
|
__device__ static int nPartialSum;
|
||||||
|
__device__ static int nPrefixSum;
|
||||||
|
|
||||||
|
__device__ static int * sharedCounts;
|
||||||
|
__device__ static int * countsGlobal;
|
||||||
|
__device__ static int * excScan;
|
||||||
|
__device__ static int * counts;
|
||||||
|
__device__ static int * partialSum;
|
||||||
|
__device__ static int * prefixSum;
|
||||||
|
|
||||||
|
__device__ static int numElementsBuf = 0;
|
||||||
|
__device__ static Key * bufKeys;
|
||||||
|
|
||||||
|
__global__
|
||||||
|
void radixSort_alloc___export(const int n)
|
||||||
|
{
|
||||||
|
assert(memoryPool == NULL);
|
||||||
|
numBlocks = 13*32*4;
|
||||||
|
nSharedCounts = NUMDIGITS*numBlocks;
|
||||||
|
nCountsGlobal = NUMDIGITS;
|
||||||
|
nExcScan = NUMDIGITS*numBlocks;
|
||||||
|
nCountsBlock = NUMDIGITS*numBlocks;
|
||||||
|
nPartialSum = NUMDIGITS*numBlocks;
|
||||||
|
nPrefixSum = NUMDIGITS*numBlocks;
|
||||||
|
|
||||||
|
|
||||||
|
const int nalloc =
|
||||||
|
nSharedCounts +
|
||||||
|
nCountsGlobal +
|
||||||
|
nExcScan +
|
||||||
|
nCountsBlock +
|
||||||
|
nPartialSum +
|
||||||
|
nPrefixSum;
|
||||||
|
|
||||||
|
if (programIndex == 0)
|
||||||
|
memoryPool = new int[nalloc];
|
||||||
|
|
||||||
|
sharedCounts = memoryPool;
|
||||||
|
countsGlobal = sharedCounts + nSharedCounts;
|
||||||
|
excScan = countsGlobal + nCountsGlobal;
|
||||||
|
counts = excScan + nExcScan;
|
||||||
|
partialSum = counts + nCountsBlock;
|
||||||
|
prefixSum = partialSum + nPartialSum;
|
||||||
|
}
|
||||||
|
|
||||||
|
extern "C"
|
||||||
|
void radixSort_alloc(const int n)
|
||||||
|
{
|
||||||
|
radixSort_alloc___export<<<1,32>>>(n);
|
||||||
|
sync;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
__device__ static
|
||||||
|
void radixSort_freeBufKeys()
|
||||||
|
{
|
||||||
|
if (numElementsBuf > 0)
|
||||||
|
{
|
||||||
|
if (programIndex == 0)
|
||||||
|
delete bufKeys;
|
||||||
|
numElementsBuf = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
__global__ void radixSort_free___export()
|
||||||
|
{
|
||||||
|
assert(memoryPool != NULL);
|
||||||
|
if (programIndex == 0)
|
||||||
|
delete memoryPool;
|
||||||
|
memoryPool = NULL;
|
||||||
|
|
||||||
|
radixSort_freeBufKeys();
|
||||||
|
}
|
||||||
|
extern "C"
|
||||||
|
void radixSort_free()
|
||||||
|
{
|
||||||
|
radixSort_free___export<<<1,32>>>();
|
||||||
|
sync;
|
||||||
|
}
|
||||||
|
|
||||||
|
__global__ void radixSort___export(
|
||||||
|
const int numElements,
|
||||||
|
Key keys[],
|
||||||
|
const int nBits)
|
||||||
|
{
|
||||||
|
#ifdef __NVPTX__
|
||||||
|
assert((numBlocks & 3) == 0); /* task granularity on Kepler is 4 */
|
||||||
|
#endif
|
||||||
|
|
||||||
|
if (numElementsBuf < numElements)
|
||||||
|
radixSort_freeBufKeys();
|
||||||
|
if (numElementsBuf == 0)
|
||||||
|
{
|
||||||
|
numElementsBuf = numElements;
|
||||||
|
if (programIndex == 0)
|
||||||
|
bufKeys = new Key[numElementsBuf];
|
||||||
|
}
|
||||||
|
|
||||||
|
const int blkDim = (numElements + numBlocks - 1) / numBlocks;
|
||||||
|
|
||||||
|
for ( int bit = 0; bit < nBits; bit += NUMBITS)
|
||||||
|
{
|
||||||
|
/* initialize histogram for each digit */
|
||||||
|
for (int digit = programIndex; digit < NUMDIGITS; digit += programCount)
|
||||||
|
countsGlobal[digit] = 0;
|
||||||
|
|
||||||
|
/* compute histogram for each digit */
|
||||||
|
launch (numBlocks,1,1, countPass)(keys, bufKeys, bit, numElements, counts, countsGlobal);
|
||||||
|
sync;
|
||||||
|
|
||||||
|
/* exclusive scan on global histogram */
|
||||||
|
int carry = 0;
|
||||||
|
excScan[0] = 0;
|
||||||
|
#pragma unroll 8
|
||||||
|
for (int digit = programIndex; digit < NUMDIGITS; digit += programCount)
|
||||||
|
{
|
||||||
|
const int value = countsGlobal[digit];
|
||||||
|
const int scan = exclusive_scan_add(value);
|
||||||
|
excScan[digit] = scan + carry;
|
||||||
|
carry += __shfl(scan+value, programCount-1);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* computing offsets for each digit */
|
||||||
|
radixExclusiveScan(numBlocks, excScan, counts, partialSum, prefixSum);
|
||||||
|
|
||||||
|
/* sorting */
|
||||||
|
launch (numBlocks,1,1,
|
||||||
|
sortPass)(
|
||||||
|
bufKeys,
|
||||||
|
keys,
|
||||||
|
bit,
|
||||||
|
numElements,
|
||||||
|
excScan);
|
||||||
|
sync;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
extern "C"
|
||||||
|
void radixSort(
|
||||||
|
const int numElements,
|
||||||
|
Key keys[],
|
||||||
|
const int nBits)
|
||||||
|
{
|
||||||
|
cudaDeviceSetCacheConfig ( cudaFuncCachePreferEqual );
|
||||||
|
radixSort___export<<<1,32>>>(numElements, keys, nBits);
|
||||||
|
sync;
|
||||||
|
}
|
||||||
301
examples/portable/radixSort/radixSort.ispc
Normal file
301
examples/portable/radixSort/radixSort.ispc
Normal file
@@ -0,0 +1,301 @@
|
|||||||
|
#define NUMBITS 8
|
||||||
|
#define NUMDIGITS (1<<NUMBITS)
|
||||||
|
|
||||||
|
typedef int64 Key;
|
||||||
|
|
||||||
|
task
|
||||||
|
void countPass(
|
||||||
|
const uniform Key keysAll[],
|
||||||
|
uniform Key sortedAll[],
|
||||||
|
const uniform int bit,
|
||||||
|
const uniform int numElements,
|
||||||
|
uniform int countsAll[],
|
||||||
|
uniform int countsGlobal[])
|
||||||
|
{
|
||||||
|
const uniform int blockIdx = taskIndex;
|
||||||
|
const uniform int numBlocks = taskCount;
|
||||||
|
const uniform int blockDim = (numElements + numBlocks - 1) / numBlocks;
|
||||||
|
|
||||||
|
const uniform int mask = (1 << NUMBITS) - 1;
|
||||||
|
|
||||||
|
const uniform Key * uniform keys = keysAll + blockIdx*blockDim;
|
||||||
|
uniform Key * uniform sorted = sortedAll + blockIdx*blockDim;
|
||||||
|
uniform int * uniform counts = countsAll + blockIdx*NUMDIGITS;
|
||||||
|
const uniform int nloc = min(numElements - blockIdx*blockDim, blockDim);
|
||||||
|
|
||||||
|
foreach (digit = 0 ... NUMDIGITS)
|
||||||
|
counts[digit] = 0;
|
||||||
|
|
||||||
|
foreach (i = 0 ... nloc)
|
||||||
|
{
|
||||||
|
sorted[i] = keys[i];
|
||||||
|
const int key = mask & ((unsigned int)keys[i] >> bit);
|
||||||
|
#ifdef __NVPTX__
|
||||||
|
atomic_add_global(&counts[key], 1);
|
||||||
|
#else
|
||||||
|
atomic_add_local(&counts[key], 1);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
foreach (digit = 0 ... NUMDIGITS)
|
||||||
|
atomic_add_global(&countsGlobal[digit], counts[digit]);
|
||||||
|
}
|
||||||
|
|
||||||
|
task
|
||||||
|
void sortPass(
|
||||||
|
uniform Key keysAll[],
|
||||||
|
uniform Key sorted[],
|
||||||
|
uniform int bit,
|
||||||
|
uniform int numElements,
|
||||||
|
uniform int digitOffsetsAll[])
|
||||||
|
{
|
||||||
|
const uniform int blockIdx = taskIndex;
|
||||||
|
const uniform int numBlocks = taskCount;
|
||||||
|
|
||||||
|
const uniform int blockDim = (numElements + numBlocks - 1) / numBlocks;
|
||||||
|
|
||||||
|
|
||||||
|
const uniform int keyIndex = blockIdx * blockDim;
|
||||||
|
uniform Key * uniform keys = keysAll + keyIndex;
|
||||||
|
|
||||||
|
|
||||||
|
const uniform int nloc = min(numElements - keyIndex, blockDim);
|
||||||
|
|
||||||
|
const uniform int mask = (1 << NUMBITS) - 1;
|
||||||
|
|
||||||
|
/* copy digit offset from Gmem to Lmem */
|
||||||
|
#if 1
|
||||||
|
uniform int digitOffsets[NUMDIGITS];
|
||||||
|
foreach (digit = 0 ... NUMDIGITS)
|
||||||
|
digitOffsets[digit] = digitOffsetsAll[blockIdx*NUMDIGITS + digit];
|
||||||
|
#else
|
||||||
|
uniform int * uniform digitOffsets = &digitOffsetsAll[blockIdx*NUMDIGITS];
|
||||||
|
#endif
|
||||||
|
|
||||||
|
foreach (i = 0 ... nloc)
|
||||||
|
{
|
||||||
|
const int key = mask & ((unsigned int)keys[i] >> bit);
|
||||||
|
int scatter;
|
||||||
|
/* not a vector friendly loop */
|
||||||
|
foreach_active(iv)
|
||||||
|
scatter = digitOffsets[key]++;
|
||||||
|
sorted[scatter] = keys[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
task
|
||||||
|
void partialScanLocal(
|
||||||
|
uniform int numBlocks,
|
||||||
|
uniform int excScanAll[],
|
||||||
|
uniform int countsAll[],
|
||||||
|
uniform int partialSumAll[])
|
||||||
|
{
|
||||||
|
const uniform int blockIdx = taskIndex;
|
||||||
|
|
||||||
|
const uniform int blockDim = (numBlocks+taskCount-1)/taskCount;
|
||||||
|
const uniform int bbeg = blockIdx * blockDim;
|
||||||
|
const uniform int bend = min(bbeg + blockDim, numBlocks);
|
||||||
|
|
||||||
|
uniform int (* uniform countsBlock)[NUMDIGITS] = (uniform int (*)[NUMDIGITS])countsAll;
|
||||||
|
uniform int (* uniform excScanBlock)[NUMDIGITS] = (uniform int (*)[NUMDIGITS])excScanAll;
|
||||||
|
uniform int (* uniform partialSum)[NUMDIGITS] = (uniform int (*)[NUMDIGITS])partialSumAll;
|
||||||
|
|
||||||
|
foreach (digit = 0 ... NUMDIGITS)
|
||||||
|
{
|
||||||
|
int prev = bbeg == 0 ? excScanBlock[0][digit] : 0;
|
||||||
|
for (uniform int block = bbeg; block < bend; block++)
|
||||||
|
{
|
||||||
|
const int y = countsBlock[block][digit];
|
||||||
|
excScanBlock[block][digit] = prev;
|
||||||
|
prev += y;
|
||||||
|
}
|
||||||
|
partialSum[blockIdx][digit] = excScanBlock[bend-1][digit] + countsBlock[bend-1][digit];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
task
|
||||||
|
void partialScanGlobal(
|
||||||
|
const uniform int numBlocks,
|
||||||
|
uniform int partialSumAll[],
|
||||||
|
uniform int prefixSumAll[])
|
||||||
|
{
|
||||||
|
uniform int (* uniform partialSum)[NUMDIGITS] = (uniform int (*)[NUMDIGITS])partialSumAll;
|
||||||
|
uniform int (* uniform prefixSum)[NUMDIGITS] = (uniform int (*)[NUMDIGITS]) prefixSumAll;
|
||||||
|
const uniform int digit = taskIndex;
|
||||||
|
int carry = 0;
|
||||||
|
foreach (block = 0 ... numBlocks)
|
||||||
|
{
|
||||||
|
const int value = partialSum[block][digit];
|
||||||
|
const int scan = exclusive_scan_add(value);
|
||||||
|
prefixSum[block][digit] = scan + carry;
|
||||||
|
carry += broadcast(scan+value, programCount-1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
task
|
||||||
|
void completeScanGlobal(
|
||||||
|
uniform int numBlocks,
|
||||||
|
uniform int excScanAll[],
|
||||||
|
uniform int carryValueAll[])
|
||||||
|
{
|
||||||
|
const uniform int blockIdx = taskIndex;
|
||||||
|
const uniform int blockDim = (numBlocks+taskCount-1)/taskCount;
|
||||||
|
const uniform int bbeg = blockIdx * blockDim;
|
||||||
|
const uniform int bend = min(bbeg + blockDim, numBlocks);
|
||||||
|
|
||||||
|
uniform int (* uniform excScanBlock)[NUMDIGITS] = (uniform int (*)[NUMDIGITS])excScanAll;
|
||||||
|
uniform int (* uniform carryValue)[NUMDIGITS] = (uniform int (*)[NUMDIGITS])carryValueAll;
|
||||||
|
|
||||||
|
foreach (digit = 0 ... NUMDIGITS)
|
||||||
|
{
|
||||||
|
const int carry = carryValue[blockIdx][digit];
|
||||||
|
for (uniform int block = bbeg; block < bend; block++)
|
||||||
|
excScanBlock[block][digit] += carry;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static
|
||||||
|
inline void radixExclusiveScan(
|
||||||
|
const uniform int numBlocks,
|
||||||
|
uniform int excScanPtr[],
|
||||||
|
uniform int countsPtr[],
|
||||||
|
uniform int partialSum[],
|
||||||
|
uniform int prefixSum[])
|
||||||
|
{
|
||||||
|
const uniform int scale = 8;
|
||||||
|
launch [numBlocks/scale] partialScanLocal(numBlocks, excScanPtr, countsPtr, partialSum);
|
||||||
|
sync;
|
||||||
|
|
||||||
|
launch [NUMDIGITS] partialScanGlobal(numBlocks/scale, partialSum, prefixSum);
|
||||||
|
sync;
|
||||||
|
|
||||||
|
launch [numBlocks/scale] completeScanGlobal(numBlocks, excScanPtr, prefixSum);
|
||||||
|
sync;
|
||||||
|
}
|
||||||
|
|
||||||
|
static uniform int * uniform memoryPool = NULL;
|
||||||
|
static uniform int numBlocks;
|
||||||
|
static uniform int nSharedCounts;
|
||||||
|
static uniform int nCountsGlobal;
|
||||||
|
static uniform int nExcScan;
|
||||||
|
static uniform int nCountsBlock;
|
||||||
|
static uniform int nPartialSum;
|
||||||
|
static uniform int nPrefixSum;
|
||||||
|
|
||||||
|
static uniform int * uniform sharedCounts;
|
||||||
|
static uniform int * uniform countsGlobal;
|
||||||
|
static uniform int * uniform excScan;
|
||||||
|
static uniform int * uniform counts;
|
||||||
|
static uniform int * uniform partialSum;
|
||||||
|
static uniform int * uniform prefixSum;
|
||||||
|
|
||||||
|
static uniform int numElementsBuf = 0;
|
||||||
|
static uniform Key * uniform bufKeys;
|
||||||
|
|
||||||
|
export void radixSort_alloc(const uniform int n)
|
||||||
|
{
|
||||||
|
assert(memoryPool == NULL);
|
||||||
|
numBlocks = num_cores()*4;
|
||||||
|
#ifdef __NVPTX__
|
||||||
|
numBlocks = 13*32*4; //num_cores()*4;
|
||||||
|
#endif
|
||||||
|
nSharedCounts = NUMDIGITS*numBlocks;
|
||||||
|
nCountsGlobal = NUMDIGITS;
|
||||||
|
nExcScan = NUMDIGITS*numBlocks;
|
||||||
|
nCountsBlock = NUMDIGITS*numBlocks;
|
||||||
|
nPartialSum = NUMDIGITS*numBlocks;
|
||||||
|
nPrefixSum = NUMDIGITS*numBlocks;
|
||||||
|
|
||||||
|
|
||||||
|
const uniform int nalloc =
|
||||||
|
nSharedCounts +
|
||||||
|
nCountsGlobal +
|
||||||
|
nExcScan +
|
||||||
|
nCountsBlock +
|
||||||
|
nPartialSum +
|
||||||
|
nPrefixSum;
|
||||||
|
|
||||||
|
memoryPool = uniform new uniform int[nalloc];
|
||||||
|
|
||||||
|
sharedCounts = memoryPool;
|
||||||
|
countsGlobal = sharedCounts + nSharedCounts;
|
||||||
|
excScan = countsGlobal + nCountsGlobal;
|
||||||
|
counts = excScan + nExcScan;
|
||||||
|
partialSum = counts + nCountsBlock;
|
||||||
|
prefixSum = partialSum + nPartialSum;
|
||||||
|
}
|
||||||
|
|
||||||
|
static
|
||||||
|
void radixSort_freeBufKeys()
|
||||||
|
{
|
||||||
|
if (numElementsBuf > 0)
|
||||||
|
{
|
||||||
|
delete bufKeys;
|
||||||
|
numElementsBuf = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
export void radixSort_free()
|
||||||
|
{
|
||||||
|
assert(memoryPool != NULL);
|
||||||
|
delete memoryPool;
|
||||||
|
memoryPool = NULL;
|
||||||
|
|
||||||
|
radixSort_freeBufKeys();
|
||||||
|
}
|
||||||
|
|
||||||
|
export void radixSort(
|
||||||
|
const uniform int numElements,
|
||||||
|
uniform Key keys[],
|
||||||
|
const uniform int nBits)
|
||||||
|
{
|
||||||
|
#ifdef __NVPTX__
|
||||||
|
assert((numBlocks & 3) == 0); /* task granularity on Kepler is 4 */
|
||||||
|
#endif
|
||||||
|
|
||||||
|
if (numElementsBuf < numElements)
|
||||||
|
radixSort_freeBufKeys();
|
||||||
|
if (numElementsBuf == 0)
|
||||||
|
{
|
||||||
|
numElementsBuf = numElements;
|
||||||
|
bufKeys = uniform new uniform Key[numElementsBuf];
|
||||||
|
}
|
||||||
|
|
||||||
|
const uniform int blockDim = (numElements + numBlocks - 1) / numBlocks;
|
||||||
|
|
||||||
|
for (uniform int bit = 0; bit < nBits; bit += NUMBITS)
|
||||||
|
{
|
||||||
|
/* initialize histogram for each digit */
|
||||||
|
foreach (digit = 0 ... NUMDIGITS)
|
||||||
|
countsGlobal[digit] = 0;
|
||||||
|
|
||||||
|
/* compute histogram for each digit */
|
||||||
|
launch [numBlocks] countPass(keys, bufKeys, bit, numElements, counts, countsGlobal);
|
||||||
|
sync;
|
||||||
|
|
||||||
|
/* exclusive scan on global histogram */
|
||||||
|
int carry = 0;
|
||||||
|
excScan[0] = 0;
|
||||||
|
foreach (digit = 0 ... NUMDIGITS)
|
||||||
|
{
|
||||||
|
const int value = countsGlobal[digit];
|
||||||
|
const int scan = exclusive_scan_add(value);
|
||||||
|
excScan[digit] = scan + carry;
|
||||||
|
carry += broadcast(scan+value, programCount-1);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* computing offsets for each digit */
|
||||||
|
radixExclusiveScan(numBlocks, excScan, counts, partialSum, prefixSum);
|
||||||
|
|
||||||
|
/* sorting */
|
||||||
|
launch [numBlocks]
|
||||||
|
sortPass(
|
||||||
|
bufKeys,
|
||||||
|
keys,
|
||||||
|
bit,
|
||||||
|
numElements,
|
||||||
|
excScan);
|
||||||
|
sync;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user