This commit is contained in:
Evghenii
2014-01-29 18:40:45 +01:00
parent 3f641f487d
commit ac4d847eac
3 changed files with 20 additions and 22 deletions

View File

@@ -7,5 +7,6 @@ ISPC_ARM_TARGETS=neon
#ISPC_FLAGS=-DDEBUG -g #ISPC_FLAGS=-DDEBUG -g
CXXFLAGS=-g CXXFLAGS=-g
CCFLAGS=-g CCFLAGS=-g
#NVCC_FLAGS=-Xptxas=-O0
include ../common.mk include ../common.mk

View File

@@ -1,7 +1,7 @@
PROG=radixSort PROG=mergeSort
ISPC_SRC=radixSort.ispc ISPC_SRC=mergeSort.ispc
#CU_SRC=radixSort.cu #CU_SRC=mergeSort.cu
CXX_SRC=radixSort.cpp radixSort.cpp CXX_SRC=mergeSort.cpp mergeSort.cpp
PTXCC_REGMAX=64 PTXCC_REGMAX=64
# LLVM_GPU=1 # LLVM_GPU=1

View File

@@ -1,7 +1,7 @@
#define SAMPLE_STRIDE programCount #define SAMPLE_STRIDE programCount
#define iDivUp(a,b) ((a) + (b) - 1)/(b) #define iDivUp(a,b) (((a) + (b) - 1)/(b))
#define getSampleCount(dividend) iDivUp((dividend), SAMPLE_STRIDE) #define getSampleCount(dividend) (iDivUp((dividend), (SAMPLE_STRIDE)))
#define W (/*sizeof(int)=*/4 * 8) #define W (/*sizeof(int)=*/4 * 8)
@@ -24,7 +24,7 @@ int nextPowerOfTwo(int x)
static inline static inline
int binarySearchInclusive( int binarySearchInclusive(
const int val, const int val,
int *data, uniform int *data,
const int L, const int L,
int stride) int stride)
{ {
@@ -32,7 +32,6 @@ int binarySearchInclusive(
return 0; return 0;
int pos = 0; int pos = 0;
for (; stride > 0; stride >>= 1) for (; stride > 0; stride >>= 1)
{ {
int newPos = min(pos + stride, L); int newPos = min(pos + stride, L);
@@ -47,7 +46,7 @@ int binarySearchInclusive(
static inline static inline
int binarySearchExclusive( int binarySearchExclusive(
const int val, const int val,
int *data, uniform int *data,
const int L, const int L,
int stride) int stride)
{ {
@@ -55,7 +54,6 @@ int binarySearchExclusive(
return 0; return 0;
int pos = 0; int pos = 0;
for (; stride > 0; stride >>= 1) for (; stride > 0; stride >>= 1)
{ {
int newPos = min(pos + stride, L); int newPos = min(pos + stride, L);
@@ -89,8 +87,8 @@ void mergeSortGangKernel(
for (uniform int stride = 1; stride < programCount; stride <<= 1) for (uniform int stride = 1; stride < programCount; stride <<= 1)
{ {
const int lPos = programIndex & (stride - 1); const int lPos = programIndex & (stride - 1);
int *baseKey = s_key + 2 * (programIndex - lPos); uniform int *baseKey = s_key + 2 * (programIndex - lPos);
int *baseVal = s_val + 2 * (programIndex - lPos); uniform int *baseVal = s_val + 2 * (programIndex - lPos);
int keyA = baseKey[lPos + 0]; int keyA = baseKey[lPos + 0];
int valA = baseVal[lPos + 0]; int valA = baseVal[lPos + 0];
@@ -141,9 +139,9 @@ void generateSampleRanksKernel(
const int i = pos & ((stride / SAMPLE_STRIDE) - 1); const int i = pos & ((stride / SAMPLE_STRIDE) - 1);
const int segmentBase = (pos - i) * (2 * SAMPLE_STRIDE); const int segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);
int * srcKey = in_srcKey + segmentBase; uniform int * srcKey = in_srcKey + segmentBase;
int * ranksA = in_ranksA + segmentBase / SAMPLE_STRIDE; uniform int * ranksA = in_ranksA + segmentBase / SAMPLE_STRIDE;
int * ranksB = in_ranksB + segmentBase / SAMPLE_STRIDE; uniform int * ranksB = in_ranksB + segmentBase / SAMPLE_STRIDE;
const int segmentElementsA = stride; const int segmentElementsA = stride;
const int segmentElementsB = min(stride, N - segmentBase - stride); const int segmentElementsB = min(stride, N - segmentBase - stride);
@@ -181,7 +179,7 @@ void generateSampleRanks(
(N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) : (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) :
(N - lastSegmentElements) / (2 * SAMPLE_STRIDE); (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);
uniform int nTasks = (threadCount + programCount - 1) / programCount; uniform int nTasks = iDivUp(threadCount, SAMPLE_STRIDE);
launch [nTasks] generateSampleRanksKernel(ranksA, ranksB, srcKey, stride, N, threadCount); launch [nTasks] generateSampleRanksKernel(ranksA, ranksB, srcKey, stride, N, threadCount);
sync; sync;
@@ -202,8 +200,8 @@ void mergeRanksAndIndicesKernel(
const int i = pos & ((stride / SAMPLE_STRIDE) - 1); const int i = pos & ((stride / SAMPLE_STRIDE) - 1);
const int segmentBase = (pos - i) * (2 * SAMPLE_STRIDE); const int segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);
int * ranks = in_Ranks + (pos - i) * 2; uniform int * ranks = in_Ranks + (pos - i) * 2;
int * limits = in_Limits + (pos - i) * 2; uniform int * limits = in_Limits + (pos - i) * 2;
const int segmentElementsA = stride; const int segmentElementsA = stride;
const int segmentElementsB = min(stride, N - segmentBase - stride); const int segmentElementsB = min(stride, N - segmentBase - stride);
@@ -237,7 +235,7 @@ void mergeRanksAndIndices(
(N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) : (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) :
(N - lastSegmentElements) / (2 * SAMPLE_STRIDE); (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);
const uniform int nTasks = (threadCount + programCount - 1 ) / programCount; const uniform int nTasks = iDivUp(threadCount, SAMPLE_STRIDE);
launch [nTasks] mergeRanksAndIndicesKernel( launch [nTasks] mergeRanksAndIndicesKernel(
limitsA, limitsA,
@@ -367,8 +365,6 @@ void mergeElementaryIntervalsKernel(
//Store merged data //Store merged data
assert(startDstA < N);
assert(startDstB < N);
if (programIndex < lenSrcA) if (programIndex < lenSrcA)
{ {
dstKey[startDstA + programIndex] = s_key[programIndex]; dstKey[startDstA + programIndex] = s_key[programIndex];
@@ -444,6 +440,7 @@ void copyKernel(uniform int dst[], uniform int src[], uniform int size)
dst[i] = src[i]; dst[i] = src[i];
} }
export export
void mergeSort( void mergeSort(
uniform int dstKey[], uniform int dstKey[],
@@ -483,7 +480,7 @@ void mergeSort(
for (uniform int stride = 2*programCount; stride < N; stride <<= 1) for (uniform int stride = 2*programCount; stride < N; stride <<= 1)
{ {
print ("stride= % N= % \n", stride, N); // print ("stride= % N= % \n", stride, N);
uniform int lastSegmentElements = N % (2 * stride); uniform int lastSegmentElements = N % (2 * stride);
//Find sample ranks and prepare for limiters merge //Find sample ranks and prepare for limiters merge