diff --git a/examples_ptx/mergeSort/mergeSort.ispc b/examples_ptx/mergeSort/mergeSort.ispc
new file mode 100644
index 00000000..81d0f7b6
--- /dev/null
+++ b/examples_ptx/mergeSort/mergeSort.ispc
@@ -0,0 +1,443 @@
+// Merge sort on key/value pairs, written as ISPC tasks.
+//
+// SAMPLE_STRIDE is the spacing between samples. It is tied to the gang width
+// (programCount), which lets the kernels below address the elements of one
+// sample interval with programIndex.
+#define SAMPLE_STRIDE programCount
+
+// Divides a by b and adds one to the quotient whenever the remainder is
+// non-zero (varying overload). For non-negative a and positive b this is the
+// rounded-up quotient.
+static inline
+int iDivUp(int a, int b)
+{
+    int div = a/b;
+    return ((a % b) == 0) ? div : (div + 1);
+}
+
+// Uniform overload of iDivUp: identical arithmetic on uniform operands.
+static inline
+uniform int iDivUp(uniform int a, uniform int b)
+{
+    uniform int div = a/b;
+    return ((a % b) == 0) ? div : (div + 1);
+}
+
+// Number of SAMPLE_STRIDE-sized steps needed to cover dividend elements,
+// rounded up (varying overload).
+static inline
+int getSampleCount(int dividend)
+{
+    return iDivUp(dividend, SAMPLE_STRIDE);
+}
+
+// Uniform overload of getSampleCount.
+static inline
+uniform int getSampleCount(uniform int dividend)
+{
+    return iDivUp(dividend, SAMPLE_STRIDE);
+}
+
+// Bit width of int, assuming sizeof(int) == 4.
+#define W (/*sizeof(int)=*/4 * 8)
+
+// Returns 1 << (W - count_leading_zeros(x - 1)); for x >= 2 this is the
+// smallest power of two that is >= x. It replaces the usual sequence of
+// decrement, x |= x >> {1, 2, 4, 8, 16}, increment.
+//
+// NOTE(review): x == 1 relies on count_leading_zeros(0) returning W, and for
+// x == 0 the shift count equals W. Every call in this file feeds the result
+// to a binary search whose length is that same x, and both searches return
+// before using their stride argument when the length is 0 - confirm before
+// reusing this helper elsewhere.
+static inline
+int nextPowerOfTwo(int x)
+{
+    return 1U << (W - count_leading_zeros(x - 1));
+}
+
+// Rank search that counts elements <= val.
+//
+// Starting from pos = 0 the search tries steps of stride, stride/2, ..., 1
+// (each clamped to L) and accepts a step when the element just before the
+// new position is <= val. For ascending data and a power-of-two stride >= L
+// the result is the number of elements in data[0 .. L) that are <= val.
+//   val    - key to rank
+//   data   - elements to search
+//   L      - element count; 0 returns 0 without reading data
+//   stride - first step size; callers in this file pass L itself,
+//            nextPowerOfTwo(L) or SAMPLE_STRIDE
+static inline
+int binarySearchInclusive(
+    const int val,
+    int *data,
+    const int L,
+    int stride)
+{
+    if (L == 0)
+        return 0;
+
+    int pos = 0;
+
+    for (; stride > 0; stride >>= 1)
+    {
+        int newPos = min(pos + stride, L);
+
+        if (data[newPos - 1] <= val)
+            pos = newPos;
+    }
+
+    return pos;
+}
+
+// Same search as binarySearchInclusive with a strict comparison: a step is
+// accepted only when the element just before the new position is < val, so
+// elements equal to val are not counted.
+static inline
+int binarySearchExclusive(
+    const int val,
+    int *data,
+    const int L,
+    int stride)
+{
+    if (L == 0)
+        return 0;
+
+    int pos = 0;
+
+    for (; stride > 0; stride >>= 1)
+    {
+        int newPos = min(pos + stride, L);
+
+        if (data[newPos - 1] < val)
+            pos = newPos;
+    }
+
+    return pos;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Bottom-level merge sort (binary search-based)
+//
+// Each task copies a tile of 2 * programCount key/value pairs, starting at
+// taskIndex * 2 * programCount, into local arrays, runs merge passes with
+// stride = 1, 2, 4, ... while stride < programCount, and writes the tile to
+// the same offsets of dstKey / dstVal.
+//
+// In one pass every program instance owns one element of run A and one of run
+// B (the stride elements that follow A) of its 2 * stride group, and scatters
+// each to lPos + its rank in the other run. A uses the exclusive search and B
+// the inclusive one, so equal keys from A are placed before equal keys from B.
+//
+// NOTE(review): the last pass has stride == programCount / 2 (for a
+// power-of-two programCount), which leaves each half of the tile as its own
+// sorted run; the halves are not merged with each other here. Confirm the host
+// starts the merge steps at stride == programCount.
+////////////////////////////////////////////////////////////////////////////////
+task
+void mergeSortSharedKernel(
+    uniform int dstKey[],
+    uniform int dstVal[],
+    uniform int srcKey[],
+    uniform int srcVal[])
+{
+    uniform int s_key[2*programCount];
+    uniform int s_val[2*programCount];
+
+    const uniform int base = taskIndex * (programCount*2);
+    s_key[programIndex + 0] = srcKey[base + programIndex + 0];
+    s_val[programIndex + 0] = srcVal[base + programIndex + 0];
+    s_key[programIndex + programCount] = srcKey[base + programIndex + programCount];
+    s_val[programIndex + programCount] = srcVal[base + programIndex + programCount];
+
+    for (uniform int stride = 1; stride < programCount; stride <<= 1)
+    {
+        // Position of this instance inside its stride-wide run.
+        const int lPos = programIndex & (stride - 1);
+        // Start of the 2 * stride group: run A followed by run B.
+        int *baseKey = s_key + 2 * (programIndex - lPos);
+        int *baseVal = s_val + 2 * (programIndex - lPos);
+
+        int keyA = baseKey[lPos + 0];
+        int valA = baseVal[lPos + 0];
+        int keyB = baseKey[lPos + stride];
+        int valB = baseVal[lPos + stride];
+        int posA = binarySearchExclusive(keyA, baseKey + stride, stride, stride) + lPos;
+        int posB = binarySearchInclusive(keyB, baseKey + 0, stride, stride) + lPos;
+
+        // NOTE(review): these stores overwrite the arrays searched above;
+        // presumably this relies on every program instance finishing its loads
+        // and searches before any instance stores - confirm for the target.
+        baseKey[posA] = keyA;
+        baseVal[posA] = valA;
+        baseKey[posB] = keyB;
+        baseVal[posB] = valB;
+    }
+
+    dstKey[base + programIndex + 0] = s_key[programIndex + 0];
+    dstVal[base + programIndex + 0] = s_val[programIndex + 0];
+    dstKey[base + programIndex + programCount] = s_key[programIndex + programCount];
+    dstVal[base + programIndex + programCount] = s_val[programIndex + programCount];
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Merge step 1: generate sample ranks
+//
+// Works on pairs of adjacent runs: A is the stride elements starting at
+// segmentBase, B the following min(stride, N - segmentBase - stride) elements.
+// One program instance handles sample i of a pair. For a sample taken from A
+// it stores the sample's own offset in ranksA and its exclusive rank within B
+// in ranksB; for a sample taken from B it stores the own offset in ranksB and
+// the inclusive rank within A in ranksA, both stride / SAMPLE_STRIDE slots
+// further on.
+//   in_ranksA, in_ranksB - output rank arrays
+//   in_srcKey            - input keys
+//   stride               - length of run A
+//   N                    - total key count, used to clip run B
+//   totalProgramCount    - instances whose global index is >= this value return
+//                          immediately
+//
+// NOTE(review): totalProgramCount is declared without 'uniform' here, unlike in
+// mergeRanksAndIndicesKernel; kept as is so the task signature is unchanged.
+////////////////////////////////////////////////////////////////////////////////
+task
+void generateSampleRanksKernel(
+    uniform int in_ranksA[],
+    uniform int in_ranksB[],
+    uniform int in_srcKey[],
+    const uniform int stride,
+    const uniform int N,
+    const int totalProgramCount)
+{
+    const int pos = taskIndex * programCount + programIndex;
+
+    if (pos >= totalProgramCount)
+        return;
+
+    // Sample index inside the pair; the mask form assumes
+    // stride / SAMPLE_STRIDE is a power of two.
+    const int i = pos & ((stride / SAMPLE_STRIDE) - 1);
+    // pos - i is a multiple of stride / SAMPLE_STRIDE, so this is a multiple
+    // of 2 * stride: the first element of the pair.
+    const int segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);
+
+    int * srcKey = in_srcKey + segmentBase;
+    int * ranksA = in_ranksA + segmentBase / SAMPLE_STRIDE;
+    int * ranksB = in_ranksB + segmentBase / SAMPLE_STRIDE;
+
+    const int segmentElementsA = stride;
+    const int segmentElementsB = min(stride, N - segmentBase - stride);
+    const int segmentSamplesA = getSampleCount(segmentElementsA);
+    const int segmentSamplesB = getSampleCount(segmentElementsB);
+
+    if (i < segmentSamplesA)
+    {
+        ranksA[i] = i * SAMPLE_STRIDE;
+        ranksB[i] = binarySearchExclusive(
+            srcKey[i * SAMPLE_STRIDE], srcKey + stride,
+            segmentElementsB, nextPowerOfTwo(segmentElementsB));
+    }
+
+    if (i < segmentSamplesB)
+    {
+        ranksB[(stride / SAMPLE_STRIDE) + i] = i * SAMPLE_STRIDE;
+        ranksA[(stride / SAMPLE_STRIDE) + i] = binarySearchInclusive(
+            srcKey[stride + i * SAMPLE_STRIDE], srcKey + 0,
+            segmentElementsA, nextPowerOfTwo(segmentElementsA));
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Merge step 2: generate sample ranks and indices
+//
+// For each pair of runs, combines the pair's two rank lists from in_Ranks -
+// the first segmentSamplesA entries and the segmentSamplesB entries after
+// them - into in_Limits. Entry i of the first list goes to slot i + (number of
+// second-list entries < it); entry i of the second list goes to slot i +
+// (number of first-list entries <= it).
+//   in_Limits         - output
+//   in_Ranks          - input ranks
+//   stride, N         - run length and total key count, as in step 1
+//   totalProgramCount - instances whose global index is >= this value return
+//                       immediately
+////////////////////////////////////////////////////////////////////////////////
+task
+void mergeRanksAndIndicesKernel(
+    uniform int in_Limits[],
+    uniform int in_Ranks[],
+    uniform int stride,
+    uniform int N,
+    uniform int totalProgramCount)
+{
+    int pos = taskIndex * programCount + programIndex;
+
+    if (pos >= totalProgramCount)
+        return;
+
+    const int i = pos & ((stride / SAMPLE_STRIDE) - 1);
+    const int segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);
+    // Each pair owns 2 * (stride / SAMPLE_STRIDE) consecutive slots.
+    int * ranks = in_Ranks + (pos - i) * 2;
+    int * limits = in_Limits + (pos - i) * 2;
+
+    const int segmentElementsA = stride;
+    const int segmentElementsB = min(stride, N - segmentBase - stride);
+    const int segmentSamplesA = getSampleCount(segmentElementsA);
+    const int segmentSamplesB = getSampleCount(segmentElementsB);
+
+    if (i < segmentSamplesA)
+    {
+        int dstPos = binarySearchExclusive(ranks[i], ranks + segmentSamplesA, segmentSamplesB, nextPowerOfTwo(segmentSamplesB)) + i;
+        limits[dstPos] = ranks[i];
+    }
+
+    if (i < segmentSamplesB)
+    {
+        int dstPos = binarySearchInclusive(ranks[segmentSamplesA + i], ranks, segmentSamplesA, nextPowerOfTwo(segmentSamplesA)) + i;
+        limits[dstPos] = ranks[segmentSamplesA + i];
+    }
+}
+
+// Merges srcA (lenA elements) and srcB (lenB elements) into dst by rank:
+// program instance p places srcA[p] at p + (number of srcB keys < it) and
+// srcB[p] at p + (number of srcA keys <= it); values follow their keys.
+// Elements at index >= programCount are not visited.
+//   nPowTwoLenA, nPowTwoLenB - first step sizes for the searches over srcA
+//                              and srcB
+// Both sources are read and ranked before anything is stored; the caller in
+// this file passes the same buffers as sources and destination.
+static inline
+void merge(
+    uniform int dstKey[],
+    uniform int dstVal[],
+    uniform int srcAKey[],
+    uniform int srcAVal[],
+    uniform int srcBKey[],
+    uniform int srcBVal[],
+    uniform int lenA,
+    uniform int nPowTwoLenA,
+    uniform int lenB,
+    uniform int nPowTwoLenB)
+{
+    int keyA, valA, keyB, valB, dstPosA, dstPosB;
+
+    if (programIndex < lenA)
+    {
+        keyA = srcAKey[programIndex];
+        valA = srcAVal[programIndex];
+        dstPosA = binarySearchExclusive(keyA, srcBKey, lenB, nPowTwoLenB) + programIndex;
+    }
+
+    if (programIndex < lenB)
+    {
+        keyB = srcBKey[programIndex];
+        valB = srcBVal[programIndex];
+        dstPosB = binarySearchInclusive(keyB, srcAKey, lenA, nPowTwoLenA) + programIndex;
+    }
+
+    if (programIndex < lenA)
+    {
+        dstKey[dstPosA] = keyA;
+        dstVal[dstPosA] = valA;
+    }
+
+    if (programIndex < lenB)
+    {
+        dstKey[dstPosB] = keyB;
+        dstVal[dstPosB] = valB;
+    }
+}
+
+// Merges one elementary interval per task.
+//
+// limitsA[taskIndex] and limitsB[taskIndex] give where this task's slices
+// start inside run A and run B of its pair; a slice ends at the next limits
+// entry, or at the end of the run for the last interval of the pair. The two
+// slices are loaded into local arrays, combined with merge(), and the
+// lenSrcA + lenSrcB results are written contiguously to dstKey / dstVal
+// starting at offset startSrcA + startSrcB from the pair's first element.
+//   stride - length of run A; run B starts stride elements after A
+//   N      - total key count, used to clip run B
+task
+void mergeElementaryIntervalsKernel(
+    uniform int dstKey[],
+    uniform int dstVal[],
+    uniform int srcKey[],
+    uniform int srcVal[],
+    uniform int limitsA[],
+    uniform int limitsB[],
+    uniform int stride,
+    uniform int N
+)
+{
+    uniform int s_key[2 * SAMPLE_STRIDE];
+    uniform int s_val[2 * SAMPLE_STRIDE];
+
+    // Interval index inside the pair, and the pair's first element.
+    const uniform int intervalI = taskIndex & ((2 * stride) / SAMPLE_STRIDE - 1);
+    const uniform int segmentBase = (taskIndex - intervalI) * SAMPLE_STRIDE;
+    srcKey += segmentBase;
+    srcVal += segmentBase;
+    dstKey += segmentBase;
+    dstVal += segmentBase;
+
+    //Set up task-wide (uniform) parameters
+    uniform int startSrcA, startSrcB, lenSrcA, lenSrcB, startDstA, startDstB;
+
+    {
+        uniform int segmentElementsA = stride;
+        uniform int segmentElementsB = min(stride, N - segmentBase - stride);
+        uniform int segmentSamplesA = getSampleCount(segmentElementsA);
+        uniform int segmentSamplesB = getSampleCount(segmentElementsB);
+        uniform int segmentSamples = segmentSamplesA + segmentSamplesB;
+
+        startSrcA = limitsA[taskIndex];
+        startSrcB = limitsB[taskIndex];
+        uniform int endSrcA = (intervalI + 1 < segmentSamples) ? limitsA[taskIndex + 1] : segmentElementsA;
+        uniform int endSrcB = (intervalI + 1 < segmentSamples) ? limitsB[taskIndex + 1] : segmentElementsB;
+        lenSrcA = endSrcA - startSrcA;
+        lenSrcB = endSrcB - startSrcB;
+        startDstA = startSrcA + startSrcB;
+        startDstB = startDstA + lenSrcA;
+    }
+
+    //Load main input data
+
+    if (programIndex < lenSrcA)
+    {
+        s_key[programIndex + 0] = srcKey[0 + startSrcA + programIndex];
+        s_val[programIndex + 0] = srcVal[0 + startSrcA + programIndex];
+    }
+
+    if (programIndex < lenSrcB)
+    {
+        s_key[programIndex + SAMPLE_STRIDE] = srcKey[stride + startSrcB + programIndex];
+        s_val[programIndex + SAMPLE_STRIDE] = srcVal[stride + startSrcB + programIndex];
+    }
+
+    //Merge data in the local arrays
+    merge(
+        s_key,
+        s_val,
+        s_key + 0,
+        s_val + 0,
+        s_key + SAMPLE_STRIDE,
+        s_val + SAMPLE_STRIDE,
+        lenSrcA, SAMPLE_STRIDE,
+        lenSrcB, SAMPLE_STRIDE
+    );
+
+    //Store merged data
+
+    if (programIndex < lenSrcA)
+    {
+        dstKey[startDstA + programIndex] = s_key[programIndex];
+        dstVal[startDstA + programIndex] = s_val[programIndex];
+    }
+
+    if (programIndex < lenSrcB)
+    {
+        dstKey[startDstB + programIndex] = s_key[lenSrcA + programIndex];
+        dstVal[startDstB + programIndex] = s_val[lenSrcA + programIndex];
+    }
+}