some tuning

This commit is contained in:
Evghenii
2014-01-30 08:27:43 +01:00
parent 0e4af8c057
commit 23c6325cb6

View File

@@ -225,7 +225,6 @@ void generateSampleRanksKernel(
uniform int totalProgramCount) uniform int totalProgramCount)
{ {
const int pos = taskIndex * programCount + programIndex; const int pos = taskIndex * programCount + programIndex;
assert(pos < totalProgramCount);
const int i = pos & ((stride / SAMPLE_STRIDE) - 1); const int i = pos & ((stride / SAMPLE_STRIDE) - 1);
const int segmentBase = (pos - i) * (2 * SAMPLE_STRIDE); const int segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);
@@ -265,7 +264,6 @@ void generateSampleRanks(
uniform int N) uniform int N)
{ {
uniform int lastSegmentElements = N % (2 * stride); uniform int lastSegmentElements = N % (2 * stride);
assert(lastSegmentElements == 0);
uniform int threadCount = (lastSegmentElements > stride) ? uniform int threadCount = (lastSegmentElements > stride) ?
(N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) : (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) :
(N - lastSegmentElements) / (2 * SAMPLE_STRIDE); (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);
@@ -287,7 +285,6 @@ void mergeRanksAndIndicesKernel(
uniform int totalProgramCount) uniform int totalProgramCount)
{ {
int pos = taskIndex * programCount + programIndex; int pos = taskIndex * programCount + programIndex;
assert(pos < totalProgramCount);
const int i = pos & ((stride / SAMPLE_STRIDE) - 1); const int i = pos & ((stride / SAMPLE_STRIDE) - 1);
const int segmentBase = (pos - i) * (2 * SAMPLE_STRIDE); const int segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);
@@ -321,7 +318,6 @@ void mergeRanksAndIndices(
uniform int N) uniform int N)
{ {
const uniform int lastSegmentElements = N % (2 * stride); const uniform int lastSegmentElements = N % (2 * stride);
assert(lastSegmentElements == 0);
const uniform int threadCount = (lastSegmentElements > stride) ? const uniform int threadCount = (lastSegmentElements > stride) ?
(N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) : (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) :
(N - lastSegmentElements) / (2 * SAMPLE_STRIDE); (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);
@@ -348,8 +344,8 @@ void mergeRanksAndIndices(
static inline static inline
void merge( void merge(
uniform Key_t dstKey[], int &dstPosA,
uniform Val_t dstVal[], int &dstPosB,
Key_t keyA, Val_t valA, Key_t keyA, Val_t valA,
Key_t keyB, Val_t valB, Key_t keyB, Val_t valB,
uniform int lenA, uniform int lenA,
@@ -358,18 +354,10 @@ void merge(
uniform int nPowTwoLenB) uniform int nPowTwoLenB)
{ {
if (programIndex < lenA) if (programIndex < lenA)
{ dstPosA = binarySearchExclusive1(keyA, keyB, lenB, nPowTwoLenB) + programIndex;
const int dstPosA = binarySearchExclusive1(keyA, keyB, lenB, nPowTwoLenB) + programIndex;
dstKey[dstPosA] = keyA;
dstVal[dstPosA] = valA;
}
if (programIndex < lenB) if (programIndex < lenB)
{ dstPosB = binarySearchInclusive1(keyB, keyA, lenA, nPowTwoLenA) + programIndex;
const int dstPosB = binarySearchInclusive1(keyB, keyA, lenA, nPowTwoLenA) + programIndex;
dstKey[dstPosB] = keyB;
dstVal[dstPosB] = valB;
}
} }
@@ -384,9 +372,6 @@ void mergeElementaryIntervalsKernel(
uniform int stride, uniform int stride,
uniform int N) uniform int N)
{ {
uniform Key_t s_key[2 * SAMPLE_STRIDE];
uniform Val_t s_val[2 * SAMPLE_STRIDE];
const int uniform intervalI = taskIndex & ((2 * stride) / SAMPLE_STRIDE - 1); const int uniform intervalI = taskIndex & ((2 * stride) / SAMPLE_STRIDE - 1);
const int uniform segmentBase = (taskIndex - intervalI) * SAMPLE_STRIDE; const int uniform segmentBase = (taskIndex - intervalI) * SAMPLE_STRIDE;
@@ -424,17 +409,33 @@ void mergeElementaryIntervalsKernel(
} }
//Merge data in shared memory //Merge data in shared memory
int dstPosA, dstPosB;
merge( merge(
s_key, dstPosA,
s_val, dstPosB,
keyA, valA, keyA, valA,
keyB, valB, keyB, valB,
lenSrcA, SAMPLE_STRIDE, lenSrcA, SAMPLE_STRIDE,
lenSrcB, SAMPLE_STRIDE lenSrcB, SAMPLE_STRIDE
); );
//Store merged data //Store merged data
#if 0
uniform Key_t s_key[2 * SAMPLE_STRIDE];
uniform Val_t s_val[2 * SAMPLE_STRIDE];
if (programIndex < lenSrcA)
{
s_key[dstPosA] = keyA;
s_val[dstPosA] = valA;
}
if (programIndex < lenSrcB)
{
s_key[dstPosB] = keyB;
s_val[dstPosB] = valB;
}
if (programIndex < lenSrcA) if (programIndex < lenSrcA)
{ {
dstKey[segmentBase + startDstA + programIndex] = s_key[programIndex]; dstKey[segmentBase + startDstA + programIndex] = s_key[programIndex];
@@ -446,6 +447,34 @@ void mergeElementaryIntervalsKernel(
dstKey[segmentBase + startDstB + programIndex] = s_key[lenSrcA + programIndex]; dstKey[segmentBase + startDstB + programIndex] = s_key[lenSrcA + programIndex];
dstVal[segmentBase + startDstB + programIndex] = s_val[lenSrcA + programIndex]; dstVal[segmentBase + startDstB + programIndex] = s_val[lenSrcA + programIndex];
} }
#else
int dstA, dstB;
dstA = dstB = -1;
if (programIndex < lenSrcA && dstPosA < lenSrcA)
dstA = segmentBase + startDstA + dstPosA;
if (programIndex < lenSrcB && dstPosB < lenSrcA)
dstB = segmentBase + startDstA + dstPosB;
dstPosA -= lenSrcA;
dstPosB -= lenSrcA;
dstA = dstB = -1;
if (programIndex < lenSrcA && dstPosA < lenSrcB)
dstA = segmentBase + startDstB + dstPosA;
if (programIndex < lenSrcB && dstPosB < lenSrcB)
dstB = segmentBase + startDstB + dstPosB;
if (dstA >= 0)
{
dstKey[dstA] = keyA;
dstVal[dstA] = valA;
}
if (dstB >= 0)
{
dstKey[dstB] = keyB;
dstVal[dstB] = valB;
}
#endif
} }
static inline static inline
@@ -544,7 +573,7 @@ void mergeSort(
for (uniform int stride = 2*programCount; stride < N; stride <<= 1) for (uniform int stride = 2*programCount; stride < N; stride <<= 1)
{ {
uniform int lastSegmentElements = N % (2 * stride); const uniform int lastSegmentElements = N % (2 * stride);
//Find sample ranks and prepare for limiters merge //Find sample ranks and prepare for limiters merge
generateSampleRanks(ranksA, ranksB, iKey, stride, N); generateSampleRanks(ranksA, ranksB, iKey, stride, N);
@@ -556,26 +585,23 @@ void mergeSort(
mergeElementaryIntervals(oKey, oVal, iKey, iVal, limitsA, limitsB, stride, N); mergeElementaryIntervals(oKey, oVal, iKey, iVal, limitsA, limitsB, stride, N);
if (lastSegmentElements <= stride) if (lastSegmentElements <= stride)
{ foreach (i = 0 ... lastSegmentElements)
#if 0 {
//Last merge segment consists of a single array which just needs to be passed through oKey[N-lastSegmentElements+i] = iKey[N-lastSegmentElements+i];
copyKernel(oKey + (N - lastSegmentElements), iKey + (N - lastSegmentElements), lastSegmentElements); oVal[N-lastSegmentElements+i] = iVal[N-lastSegmentElements+i];
copyKernel(oVal + (N - lastSegmentElements), iVal + (N - lastSegmentElements), lastSegmentElements); }
#endif memory_barrier();
}
#if 1
{ {
uniform Key_t * uniform tmpKey = iKey; uniform Key_t * uniform tmpKey = iKey;
iKey = oKey; iKey = oKey;
oKey = tmpKey; oKey = tmpKey;
} }
{ {
uniform Val_t * uniform tmpVal = iVal; uniform Val_t * uniform tmpVal = iVal;
iVal = oVal; iVal = oVal;
oVal = tmpVal; oVal = tmpVal;
} }
#endif
} }
} }