This commit is contained in:
Evghenii
2014-01-29 16:01:23 +01:00
parent 1c8517947b
commit 3f641f487d
3 changed files with 150 additions and 142 deletions

View File

@@ -1,9 +1,11 @@
EXAMPLE=radixSort EXAMPLE=mergeSort
CPP_SRC=radixSort.cpp CPP_SRC=mergeSort.cpp
ISPC_SRC=radixSort.ispc ISPC_SRC=mergeSort.ispc
ISPC_IA_TARGETS=avx1-i32x8 ISPC_IA_TARGETS=avx1-i32x16
ISPC_ARM_TARGETS=neon ISPC_ARM_TARGETS=neon
#ISPC_FLAGS=-DDEBUG -g #ISPC_FLAGS=-DDEBUG -g
CXXFLAGS=-g
CCFLAGS=-g
include ../common.mk include ../common.mk

View File

@@ -38,7 +38,7 @@ struct Key
int main (int argc, char *argv[]) int main (int argc, char *argv[])
{ {
int i, j, n = argc == 1 ? 1000000 : atoi(argv[1]), m = n < 100 ? 1 : 50, l = n < 100 ? n : RAND_MAX; int i, j, n = argc == 1 ? 1024*1024: atoi(argv[1]), m = n < 100 ? 1 : 50, l = n < 100 ? n : RAND_MAX;
double tISPC1 = 0.0, tISPC2 = 0.0, tSerial = 0.0; double tISPC1 = 0.0, tISPC2 = 0.0, tSerial = 0.0;
Key *keys = new Key[n]; Key *keys = new Key[n];
@@ -46,7 +46,7 @@ int main (int argc, char *argv[])
#pragma omp parallel for #pragma omp parallel for
for (int i = 0; i < n; i++) for (int i = 0; i < n; i++)
{ {
keys[i].key = ((int)(drand48() * (1<<30))); keys[i].key = i; //((int)(drand48() * (1<<30)));
keys[i].val = i; keys[i].val = i;
} }
std::random_shuffle(keys, keys + n); std::random_shuffle(keys, keys + n);
@@ -57,8 +57,8 @@ int main (int argc, char *argv[])
int *valsBuf = new int[n]; int *valsBuf = new int[n];
int *keysDst = new int[n]; int *keysDst = new int[n];
int *valsDst = new int[n]; int *valsDst = new int[n];
int *keysGld = new int [n]; int *keysGld = new int[n];
int *valsGld = new int [n]; int *valsGld = new int[n];
#pragma omp parallel for #pragma omp parallel for
for (int i = 0; i < n; i++) for (int i = 0; i < n; i++)
{ {
@@ -77,8 +77,8 @@ int main (int argc, char *argv[])
tISPC2 = 1e30; tISPC2 = 1e30;
for (i = 0; i < m; i ++) for (i = 0; i < m; i ++)
{ {
ispcMemcpy(keysSrc, keysGld, n*sizeof(Key)); ispcMemcpy(keysSrc, keysGld, n*sizeof(int));
ispcMemcpy(valsSrc, keysGld, n*sizeof(Key)); ispcMemcpy(valsSrc, keysGld, n*sizeof(int));
reset_and_start_timer(); reset_and_start_timer();
ispc::mergeSort(keysDst, valsDst, keysBuf, valsBuf, keysSrc, valsSrc, n); ispc::mergeSort(keysDst, valsDst, keysBuf, valsBuf, keysSrc, valsSrc, n);
@@ -92,6 +92,22 @@ int main (int argc, char *argv[])
printf("[sort ispc + tasks]:\t[%.3f] msec [%.3f Mpair/s]\n", tISPC2, 1.0e-3*n/tISPC2); printf("[sort ispc + tasks]:\t[%.3f] msec [%.3f Mpair/s]\n", tISPC2, 1.0e-3*n/tISPC2);
printf("\n Buf: \n");
for (int i = 0 ; i < 128; i++)
{
if ((i % 16) == 0)
printf("\n");
printf(" %d ", keysBuf[i]);
}
printf("\n Dst: \n");
for (int i = 0 ; i < 128; i++)
{
if ((i % 16) == 0)
printf("\n");
printf(" %d ", keysDst[i]);
}
printf("\n");
std::sort(keysGld, keysGld + n); std::sort(keysGld, keysGld + n);
for (int i = 0; i < n; i++) for (int i = 0; i < n; i++)
assert(keysDst[i] == keysGld[i]); assert(keysDst[i] == keysGld[i]);

View File

@@ -1,45 +1,24 @@
#define SAMPLE_STRIDE programCount #define SAMPLE_STRIDE programCount
static inline #define iDivUp(a,b) ((a) + (b) - 1)/(b)
int iDivUp(int a, int b) #define getSampleCount(dividend) iDivUp((dividend), SAMPLE_STRIDE)
{
int div = a/b;
return ((a % b) == 0) ? div : (div + 1);
}
static inline
uniform int iDivUp(uniform int a, uniform int b)
{
uniform int div = a/b;
return ((a % b) == 0) ? div : (div + 1);
}
static inline
int getSampleCount(int dividend)
{
return iDivUp(dividend, SAMPLE_STRIDE);
}
static inline
uniform int getSampleCount(uniform int dividend)
{
return iDivUp(dividend, SAMPLE_STRIDE);
}
#define W (/*sizeof(int)=*/4 * 8) #define W (/*sizeof(int)=*/4 * 8)
static inline static inline
int nextPowerOfTwo(int x) int nextPowerOfTwo(int x)
{ {
/* #if 1
--x; --x;
x |= x >> 1; x |= x >> 1;
x |= x >> 2; x |= x >> 2;
x |= x >> 4; x |= x >> 4;
x |= x >> 8; x |= x >> 8;
x |= x >> 16; x |= x >> 16;
return ++x; return ++x;
*/ #else
return 1U << (W - count_leading_zeros(x - 1)); return 1U << (W - count_leading_zeros(x - 1));
#endif
} }
static inline static inline
@@ -157,9 +136,7 @@ void generateSampleRanksKernel(
uniform int totalProgramCount) uniform int totalProgramCount)
{ {
const int pos = taskIndex * programCount + programIndex; const int pos = taskIndex * programCount + programIndex;
assert(pos < totalProgramCount);
if (pos >= totalProgramCount)
return;
const int i = pos & ((stride / SAMPLE_STRIDE) - 1); const int i = pos & ((stride / SAMPLE_STRIDE) - 1);
const int segmentBase = (pos - i) * (2 * SAMPLE_STRIDE); const int segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);
@@ -199,7 +176,11 @@ void generateSampleRanks(
uniform int N) uniform int N)
{ {
uniform int lastSegmentElements = N % (2 * stride); uniform int lastSegmentElements = N % (2 * stride);
uniform int threadCount = (lastSegmentElements > stride) ? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) : (N - lastSegmentElements) / (2 * SAMPLE_STRIDE); assert(lastSegmentElements == 0);
uniform int threadCount = (lastSegmentElements > stride) ?
(N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) :
(N - lastSegmentElements) / (2 * SAMPLE_STRIDE);
uniform int nTasks = (threadCount + programCount - 1) / programCount; uniform int nTasks = (threadCount + programCount - 1) / programCount;
launch [nTasks] generateSampleRanksKernel(ranksA, ranksB, srcKey, stride, N, threadCount); launch [nTasks] generateSampleRanksKernel(ranksA, ranksB, srcKey, stride, N, threadCount);
@@ -216,32 +197,30 @@ void mergeRanksAndIndicesKernel(
uniform int N, uniform int N,
uniform int totalProgramCount) uniform int totalProgramCount)
{ {
int pos = taskIndex * programCount + programIndex; int pos = taskIndex * programCount + programIndex;
assert(pos < totalProgramCount);
if (pos >= totalProgramCount) const int i = pos & ((stride / SAMPLE_STRIDE) - 1);
return; const int segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);
int * ranks = in_Ranks + (pos - i) * 2;
int * limits = in_Limits + (pos - i) * 2;
const int i = pos & ((stride / SAMPLE_STRIDE) - 1); const int segmentElementsA = stride;
const int segmentBase = (pos - i) * (2 * SAMPLE_STRIDE); const int segmentElementsB = min(stride, N - segmentBase - stride);
int * ranks = in_Ranks + (pos - i) * 2; const int segmentSamplesA = getSampleCount(segmentElementsA);
int * limits = in_Limits + (pos - i) * 2; const int segmentSamplesB = getSampleCount(segmentElementsB);
const int segmentElementsA = stride; if (i < segmentSamplesA)
const int segmentElementsB = min(stride, N - segmentBase - stride); {
const int segmentSamplesA = getSampleCount(segmentElementsA); int dstPos = binarySearchExclusive(ranks[i], ranks + segmentSamplesA, segmentSamplesB, nextPowerOfTwo(segmentSamplesB)) + i;
const int segmentSamplesB = getSampleCount(segmentElementsB); limits[dstPos] = ranks[i];
}
if (i < segmentSamplesA) if (i < segmentSamplesB)
{ {
int dstPos = binarySearchExclusive(ranks[i], ranks + segmentSamplesA, segmentSamplesB, nextPowerOfTwo(segmentSamplesB)) + i; int dstPos = binarySearchInclusive(ranks[segmentSamplesA + i], ranks, segmentSamplesA, nextPowerOfTwo(segmentSamplesA)) + i;
limits[dstPos] = ranks[i]; limits[dstPos] = ranks[segmentSamplesA + i];
} }
if (i < segmentSamplesB)
{
int dstPos = binarySearchInclusive(ranks[segmentSamplesA + i], ranks, segmentSamplesA, nextPowerOfTwo(segmentSamplesA)) + i;
limits[dstPos] = ranks[segmentSamplesA + i];
}
} }
static inline static inline
void mergeRanksAndIndices( void mergeRanksAndIndices(
@@ -253,8 +232,12 @@ void mergeRanksAndIndices(
uniform int N) uniform int N)
{ {
const uniform int lastSegmentElements = N % (2 * stride); const uniform int lastSegmentElements = N % (2 * stride);
const uniform int threadCount = (lastSegmentElements > stride) ? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) : (N - lastSegmentElements) / (2 * SAMPLE_STRIDE); assert(lastSegmentElements == 0);
const uniform int nTasks = (threadCount + programCount -1 ) / programCount; const uniform int threadCount = (lastSegmentElements > stride) ?
(N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) :
(N - lastSegmentElements) / (2 * SAMPLE_STRIDE);
const uniform int nTasks = (threadCount + programCount - 1 ) / programCount;
launch [nTasks] mergeRanksAndIndicesKernel( launch [nTasks] mergeRanksAndIndicesKernel(
limitsA, limitsA,
@@ -324,78 +307,79 @@ void mergeElementaryIntervalsKernel(
uniform int limitsA[], uniform int limitsA[],
uniform int limitsB[], uniform int limitsB[],
uniform int stride, uniform int stride,
uniform int N uniform int N)
)
{ {
uniform int s_key[2 * SAMPLE_STRIDE]; uniform int s_key[2 * SAMPLE_STRIDE];
uniform int s_val[2 * SAMPLE_STRIDE]; uniform int s_val[2 * SAMPLE_STRIDE];
const int uniform intervalI = taskIndex & ((2 * stride) / SAMPLE_STRIDE - 1); const int uniform intervalI = taskIndex & ((2 * stride) / SAMPLE_STRIDE - 1);
const int uniform segmentBase = (taskIndex - intervalI) * SAMPLE_STRIDE; const int uniform segmentBase = (taskIndex - intervalI) * SAMPLE_STRIDE;
srcKey += segmentBase; srcKey += segmentBase;
srcVal += segmentBase; srcVal += segmentBase;
dstKey += segmentBase; dstKey += segmentBase;
dstVal += segmentBase; dstVal += segmentBase;
//Set up threadblock-wide parameters //Set up threadblock-wide parameters
uniform int startSrcA, startSrcB, lenSrcA, lenSrcB, startDstA, startDstB; uniform int startSrcA, startSrcB, lenSrcA, lenSrcB, startDstA, startDstB;
{ {
uniform int segmentElementsA = stride; uniform int segmentElementsA = stride;
uniform int segmentElementsB = min(stride, N - segmentBase - stride); uniform int segmentElementsB = min(stride, N - segmentBase - stride);
uniform int segmentSamplesA = getSampleCount(segmentElementsA); uniform int segmentSamplesA = getSampleCount(segmentElementsA);
uniform int segmentSamplesB = getSampleCount(segmentElementsB); uniform int segmentSamplesB = getSampleCount(segmentElementsB);
uniform int segmentSamples = segmentSamplesA + segmentSamplesB; uniform int segmentSamples = segmentSamplesA + segmentSamplesB;
startSrcA = limitsA[taskIndex]; startSrcA = limitsA[taskIndex];
startSrcB = limitsB[taskIndex]; startSrcB = limitsB[taskIndex];
uniform int endSrcA = (intervalI + 1 < segmentSamples) ? limitsA[taskIndex+ 1] : segmentElementsA; uniform int endSrcA = (intervalI + 1 < segmentSamples) ? limitsA[taskIndex + 1] : segmentElementsA;
uniform int endSrcB = (intervalI + 1 < segmentSamples) ? limitsB[taskIndex + 1] : segmentElementsB; uniform int endSrcB = (intervalI + 1 < segmentSamples) ? limitsB[taskIndex + 1] : segmentElementsB;
lenSrcA = endSrcA - startSrcA; lenSrcA = endSrcA - startSrcA;
lenSrcB = endSrcB - startSrcB; lenSrcB = endSrcB - startSrcB;
startDstA = startSrcA + startSrcB; startDstA = startSrcA + startSrcB;
startDstB = startDstA + lenSrcA; startDstB = startDstA + lenSrcA;
} }
//Load main input data
if (programIndex < lenSrcA) //Load main input data
{
s_key[programIndex + 0] = srcKey[0 + startSrcA + programIndex];
s_val[programIndex + 0] = srcVal[0 + startSrcA + programIndex];
}
if (programIndex < lenSrcB) if (programIndex < lenSrcA)
{ {
s_key[programIndex + SAMPLE_STRIDE] = srcKey[stride + startSrcB + programIndex]; s_key[programIndex + 0] = srcKey[0 + startSrcA + programIndex];
s_val[programIndex + SAMPLE_STRIDE] = srcVal[stride + startSrcB + programIndex]; s_val[programIndex + 0] = srcVal[0 + startSrcA + programIndex];
} }
//Merge data in shared memory if (programIndex < lenSrcB)
merge( {
s_key, s_key[programIndex + SAMPLE_STRIDE] = srcKey[stride + startSrcB + programIndex];
s_val, s_val[programIndex + SAMPLE_STRIDE] = srcVal[stride + startSrcB + programIndex];
s_key + 0, }
s_val + 0,
s_key + SAMPLE_STRIDE,
s_val + SAMPLE_STRIDE,
lenSrcA, SAMPLE_STRIDE,
lenSrcB, SAMPLE_STRIDE
);
//Store merged data
if (programIndex < lenSrcA) //Merge data in shared memory
{ merge(
dstKey[startDstA + programIndex] = s_key[programIndex]; s_key,
dstVal[startDstA + programIndex] = s_val[programIndex]; s_val,
} s_key + 0,
s_val + 0,
s_key + SAMPLE_STRIDE,
s_val + SAMPLE_STRIDE,
lenSrcA, SAMPLE_STRIDE,
lenSrcB, SAMPLE_STRIDE
);
if (programIndex < lenSrcB) //Store merged data
{
dstKey[startDstB + programIndex] = s_key[lenSrcA + programIndex]; assert(startDstA < N);
dstVal[startDstB + programIndex] = s_val[lenSrcA + programIndex]; assert(startDstB < N);
} if (programIndex < lenSrcA)
{
dstKey[startDstA + programIndex] = s_key[programIndex];
dstVal[startDstA + programIndex] = s_val[programIndex];
}
if (programIndex < lenSrcB)
{
dstKey[startDstB + programIndex] = s_key[lenSrcA + programIndex];
dstVal[startDstB + programIndex] = s_val[lenSrcA + programIndex];
}
} }
static inline static inline
void mergeElementaryIntervals( void mergeElementaryIntervals(
@@ -455,6 +439,7 @@ void closeMergeSort()
export export
void copyKernel(uniform int dst[], uniform int src[], uniform int size) void copyKernel(uniform int dst[], uniform int src[], uniform int size)
{ {
assert(size == 0);
foreach (i = 0 ... size) foreach (i = 0 ... size)
dst[i] = src[i]; dst[i] = src[i];
} }
@@ -490,12 +475,15 @@ void mergeSort(
oVal = bufVal; oVal = bufVal;
} }
assert(N <= SAMPLE_STRIDE * MAX_SAMPLE_COUNT); assert(N <= SAMPLE_STRIDE * MAX_SAMPLE_COUNT);
assert(N % (programCount*2) == 0); assert(N % (programCount*2) == 0);
mergeSortGang(iKey, iVal, srcKey, srcVal, N/(2*programCount)); mergeSortGang(iKey, iVal, srcKey, srcVal, N/(2*programCount));
for (uniform int stride = 2*programCount; stride < N; stride <<= 1) for (uniform int stride = 2*programCount; stride < N; stride <<= 1)
{ {
print ("stride= % N= % \n", stride, N);
uniform int lastSegmentElements = N % (2 * stride); uniform int lastSegmentElements = N % (2 * stride);
//Find sample ranks and prepare for limiters merge //Find sample ranks and prepare for limiters merge
@@ -509,19 +497,21 @@ void mergeSort(
if (lastSegmentElements <= stride) if (lastSegmentElements <= stride)
{ {
assert(0);
//Last merge segment consists of a single array which just needs to be passed through //Last merge segment consists of a single array which just needs to be passed through
copyKernel(oKey + (N - lastSegmentElements), iKey + (N - lastSegmentElements), lastSegmentElements); copyKernel(oKey + (N - lastSegmentElements), iKey + (N - lastSegmentElements), lastSegmentElements);
copyKernel(oVal + (N - lastSegmentElements), iVal + (N - lastSegmentElements), lastSegmentElements); copyKernel(oVal + (N - lastSegmentElements), iVal + (N - lastSegmentElements), lastSegmentElements);
} }
uniform int * uniform tmpKey = iKey; {
iKey = oKey; uniform int * uniform tmpKey = iKey;
oKey = tmpKey; iKey = oKey;
oKey = tmpKey;
}
uniform int * uniform tmpVal = iVal; {
iVal = oVal; uniform int * uniform tmpVal = iVal;
oVal = tmpVal; iVal = oVal;
oVal = tmpVal;
}
} }
} }