This commit is contained in:
Evghenii
2014-01-06 11:50:18 +01:00
parent 27d957f0cb
commit 774e907ecf
13 changed files with 850 additions and 6 deletions

View File

@@ -170,9 +170,8 @@ task void bumpup (uniform int h[], uniform int g[])
} }
} }
static void prefix_sum (uniform int num, uniform int h[]) static void prefix_sum (uniform int num, uniform int h[], uniform int g[])
{ {
uniform int * uniform g = uniform new uniform int [num+1];
uniform int i; uniform int i;
launch[num] addup (h, g+1); launch[num] addup (h, g+1);
@@ -183,10 +182,9 @@ static void prefix_sum (uniform int num, uniform int h[])
launch[num] bumpup (h, g); launch[num] bumpup (h, g);
sync; sync;
delete g;
} }
export void sort_ispc (uniform int n, uniform unsigned int code[], uniform int order[], uniform int ntasks) export void sort_ispc(uniform int n, uniform unsigned int code[], uniform int order[], uniform int ntasks)
{ {
uniform int num = ntasks ; //< 1 ? num_cores () : ntasks; uniform int num = ntasks ; //< 1 ? num_cores () : ntasks;
uniform int span = n / num; uniform int span = n / num;
@@ -194,6 +192,7 @@ export void sort_ispc (uniform int n, uniform unsigned int code[], uniform int o
uniform int * uniform hist = uniform new uniform int [hsize]; uniform int * uniform hist = uniform new uniform int [hsize];
uniform int64 * uniform pair = uniform new uniform int64 [n]; uniform int64 * uniform pair = uniform new uniform int64 [n];
uniform int64 * uniform temp = uniform new uniform int64 [n]; uniform int64 * uniform temp = uniform new uniform int64 [n];
uniform int * uniform g = uniform new uniform int [num+1];
uniform int pass, i; uniform int pass, i;
#if DEBUG #if DEBUG
@@ -213,7 +212,7 @@ export void sort_ispc (uniform int n, uniform unsigned int code[], uniform int o
launch[num] histogram (span, n, pair, pass, hist); launch[num] histogram (span, n, pair, pass, hist);
sync; sync;
prefix_sum (num, hist); prefix_sum (num, hist,g);
launch[num] permutation (span, n, pair, pass, hist, temp); launch[num] permutation (span, n, pair, pass, hist, temp);
sync; sync;
@@ -246,4 +245,5 @@ export void sort_ispc (uniform int n, uniform unsigned int code[], uniform int o
delete hist; delete hist;
delete pair; delete pair;
delete temp; delete temp;
delete g;
} }

View File

@@ -128,7 +128,7 @@ int main (int argc, char *argv[])
tISPC2 += (rtc() - t0); // get_elapsed_mcycles(); tISPC2 += (rtc() - t0); // get_elapsed_mcycles();
#else #else
const char * func_name = "sort_ispc"; const char * func_name = "sort_ispc___export";
#if 0 #if 0
void *func_args[] = {&n, &d_code, &d_order, &ntask}; void *func_args[] = {&n, &d_code, &d_order, &ntask};
#else #else

View File

@@ -12,3 +12,4 @@
#define taskCount (taskCount0*taskCount1*taskCount2) #define taskCount (taskCount0*taskCount1*taskCount2)
#define warpIdx (threadIdx.x >> 5) #define warpIdx (threadIdx.x >> 5)
#define launch(ntx,nty,ntz,func) if (programIndex==0) func<<<dim3(((ntx)+4-1)/4,nty,ntz),128>>> #define launch(ntx,nty,ntz,func) if (programIndex==0) func<<<dim3(((ntx)+4-1)/4,nty,ntz),128>>>
#define sync cudaDeviceSynchronize()

View File

@@ -31,5 +31,24 @@ void ispc_memset(void *ptr, int value, size_t size)
{ {
memset(ptr, value, size); memset(ptr, value, size);
} }
void ispcSetMallocHeapLimit(size_t value)
{
}
void ispcSetStackLimit(size_t value)
{
}
unsigned long long ispcGetMallocHeapLimit()
{
return -1;
}
unsigned long long ispcGetStackLimit()
{
return -1;
}
void * ispcMemcpy(void *dest, void *src, size_t num)
{
memcpy(dest, src, num);
return dest;
}
#endif #endif

View File

@@ -3,3 +3,8 @@
extern void ispc_malloc(void **ptr, const size_t size); extern void ispc_malloc(void **ptr, const size_t size);
extern void ispc_free(void *ptr); extern void ispc_free(void *ptr);
extern void ispc_memset(void *ptr, int value, size_t size); extern void ispc_memset(void *ptr, int value, size_t size);
extern void ispcSetMallocHeapLimit(size_t value);
extern void ispcSetStackLimit(size_t value);
extern unsigned long long ispcGetMallocHeapLimit();
extern unsigned long long ispcGetStackLimit();
extern void * ispcMemcpy(void *dest, void *src, size_t num);

View File

@@ -14,5 +14,30 @@ void ispc_memset(void *ptr, int value, size_t size)
{ {
cudaMemset(ptr, value, size); cudaMemset(ptr, value, size);
} }
void ispcSetMallocHeapLimit(size_t value)
{
cudaDeviceSetLimit(cudaLimitMallocHeapSize,value);
}
void ispcSetStackLimit(size_t value)
{
cudaDeviceSetLimit(cudaLimitStackSize,value);
}
unsigned long long ispcGetMallocHeapLimit()
{
size_t value;
cudaDeviceGetLimit(&value, cudaLimitMallocHeapSize);
return value;
}
unsigned long long ispcGetStackLimit()
{
size_t value;
cudaDeviceGetLimit(&value, cudaLimitStackSize);
return value;
}
void * ispcMemcpy(void *dest, void *src, size_t num)
{
cudaMemcpy(dest, src, num, cudaMemcpyDefault);
return dest;
}

View File

@@ -0,0 +1,9 @@
EXAMPLE=sort
CPP_SRC=sort.cpp sort_serial.cpp
ISPC_SRC=sort.ispc
ISPC_IA_TARGETS=avx1-i32x8
ISPC_ARM_TARGETS=neon
#ISPC_FLAGS=-DDEBUG
include ../common.mk

View File

@@ -0,0 +1,13 @@
PROG=sort
ISPC_SRC=sort.ispc
CU_SRC=sort.cu
CXX_SRC=sort.cpp sort_serial.cpp
PTXCC_REGMAX=32
LLVM_GPU=1
NVVM_GPU=1
include ../common_gpu.mk

157
examples_ptx/sort/sort.cpp Normal file
View File

@@ -0,0 +1,157 @@
/*
Copyright (c) 2013, Durham University
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Durham University nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Author: Tomasz Koziara */
#include <cstdio>
#include <cstdlib>
#include <algorithm>
#include <iostream>
#include <cassert>
#include <iomanip>
#include "../timing.h"
#include "../ispc_malloc.h"
#include "sort_ispc.h"
using namespace ispc;
extern void sort_serial (int n, unsigned int code[], int order[]);
/* progress bar by Ross Hemsley;
* http://www.rosshemsley.co.uk/2011/02/creating-a-progress-bar-in-c-or-any-other-console-app/ */
static inline void progressbar (unsigned int x, unsigned int n, unsigned int w = 50)
{
if (n < 100)
{
x *= 100/n;
n = 100;
}
if ((x != n) && (x % (n/100) != 0)) return;
using namespace std;
float ratio = x/(float)n;
int c = ratio * w;
cout << setw(3) << (int)(ratio*100) << "% [";
for (int x=0; x<c; x++) cout << "=";
for (int x=c; x<w; x++) cout << " ";
cout << "]\r" << flush;
}
int main (int argc, char *argv[])
{
int i, j, n = argc == 1 ? 1000000 : atoi(argv[1]), m = n < 100 ? 1 : 50, l = n < 100 ? n : RAND_MAX;
double tISPC1 = 0.0, tISPC2 = 0.0, tSerial = 0.0;
unsigned int *code = new unsigned int [n];
unsigned int *code_orig = new unsigned int [n];
int *order = new int [n];
for (j = 0; j < n; j ++) code_orig[j] = rand() % l;
ispcSetMallocHeapLimit(1024*1024*1024);
srand (0);
#ifndef _CUDA_
for (i = 0; i < m; i ++)
{
ispcMemcpy(code, code_orig, n*sizeof(unsigned int));
reset_and_start_timer();
sort_ispc (n, code, order, 1);
tISPC1 += get_elapsed_msec();
if (argc != 3)
progressbar (i, m);
}
printf("[sort ispc]:\t[%.3f] msec [%.3f Mpair/s]\n", tISPC1, 1.0e-3*n*m/tISPC1);
#endif
srand (0);
const int ntask = 13*8;
for (i = 0; i < m; i ++)
{
ispcMemcpy(code, code_orig, n*sizeof(unsigned int));
reset_and_start_timer();
sort_ispc (n, code, order, ntask);
tISPC2 += get_elapsed_msec();
if (argc != 3)
progressbar (i, m);
}
printf("[sort ispc + tasks]:\t[%.3f] msec [%.3f Mpair/s]\n", tISPC2, 1.0e-3*n*m/tISPC2);
unsigned int *code1 = new unsigned int [n];
for (int i = 0; i < n; i++)
code1[i] = code[i];
std::sort(code1, code1+n);
for (int i = 0; i < n; i++)
assert(code1[i] == code[i]);
srand (0);
for (i = 0; i < m; i ++)
{
ispcMemcpy(code, code_orig, n*sizeof(unsigned int));
reset_and_start_timer();
sort_serial (n, code, order);
tSerial += get_elapsed_msec();
if (argc != 3)
progressbar (i, m);
}
printf("[sort serial]:\t\t[%.3f] msec [%.3f Mpair/s]\n", tSerial, 1.0e-3*n*m/tSerial);
#ifndef _CUDA_
printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n", tSerial/tISPC1, tSerial/tISPC2);
#else
printf("\t\t\t\t(%.2fx speedup from ISPC + tasks)\n", tSerial/tISPC2);
#endif
delete code;
delete code_orig;
delete order;
return 0;
}

272
examples_ptx/sort/sort.cu Normal file
View File

@@ -0,0 +1,272 @@
/*
Copyright (c) 2013, Durham University
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Durham University nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Author: Tomasz Koziara */
#include "cuda_helpers.cuh"
#define cfor for
#define cif if
#define int8 char
#define int64 long
template<typename T>
__device__ inline T* __new(const int n)
{
union
{
T* ptr;
int v[2];
} val;
if (programIndex == 0)
val.ptr = new T[n];
val.v[0] = __shfl(val.v[0],0);
val.v[1] = __shfl(val.v[1],0);
return val.ptr;
};
template<typename T>
__device__ inline void __delete(T* ptr)
{
if (programIndex == 0)
delete ptr;
};
__global__ void histogram ( int span, int n, int64 code[], int pass, int hist[])
{
if (taskIndex >= taskCount) return;
int start = taskIndex*span;
int end = taskIndex == taskCount-1 ? n : start+span;
int strip = (end-start)/programCount;
int tail = (end-start)%programCount;
int i = programCount*taskIndex + programIndex;
int g [256];
cfor (int j = 0; j < 256; j ++)
{
g[j] = 0;
}
cfor (int k = start+programIndex*strip; k < start+(programIndex+1)*strip; k ++)
{
unsigned int8 *c = (unsigned int8*) &code[k];
g[c[pass]] ++;
}
if (programIndex == programCount-1) /* remainder is processed by the last lane */
{
for (int k = start+programCount*strip; k < start+programCount*strip+tail; k ++)
{
unsigned int8 *c = (unsigned int8*) &code[k];
g[c[pass]] ++;
}
}
cfor (int j = 0; j < 256; j ++)
{
hist[j*programCount*taskCount+i] = g[j];
}
}
__global__ void permutation ( int span, int n, int64 code[], int pass, int hist[], int64 perm[])
{
if (taskIndex >= taskCount) return;
int start = taskIndex*span;
int end = taskIndex == taskCount-1 ? n : start+span;
int strip = (end-start)/programCount;
int tail = (end-start)%programCount;
int i = programCount*taskIndex + programIndex;
int g [256];
cfor (int j = 0; j < 256; j ++)
{
g[j] = hist[j*programCount*taskCount+i];
}
cfor (int k = start+programIndex*strip; k < start+(programIndex+1)*strip; k ++)
{
unsigned int8 *c = (unsigned int8*) &code[k];
int l = g[c[pass]];
perm[l] = code[k];
g[c[pass]] = l+1;
}
if (programIndex == programCount-1) /* remainder is processed by the last lane */
{
for (int k = start+programCount*strip; k < start+programCount*strip+tail; k ++)
{
unsigned int8 *c = (unsigned int8*) &code[k];
int l = g[c[pass]];
perm[l] = code[k];
g[c[pass]] = l+1;
}
}
}
__global__ void copy ( int span, int n, int64 from[], int64 to[])
{
if (taskIndex >= taskCount) return;
int start = taskIndex*span;
int end = taskIndex == taskCount-1 ? n : start+span;
for (int i = programIndex + start; i < end; i += programCount)
if (i < end)
{
to[i] = from[i];
}
}
__global__ void pack ( int span, int n, unsigned int code[], int64 pair[])
{
if (taskIndex >= taskCount) return;
int start = taskIndex*span;
int end = taskIndex == taskCount-1 ? n : start+span;
for (int i = programIndex + start; i < end; i += programCount)
if (i < end)
{
pair[i] = ((int64)i<<32)+code[i];
}
}
__global__ void unpack ( int span, int n, int64 pair[], int unsigned code[], int order[])
{
if (taskIndex >= taskCount) return;
int start = taskIndex*span;
int end = taskIndex == taskCount-1 ? n : start+span;
for (int i = programIndex + start; i < end; i += programCount)
if (i < end)
{
code[i] = pair[i];
order[i] = pair[i]>>32;
}
}
__global__ void addup ( int h[], int g[])
{
if (taskIndex >= taskCount) return;
int * u = &h[256*programCount*taskIndex];
int i, x, y = 0;
for (i = 0; i < 256*programCount; i ++)
{
x = u[i];
u[i] = y;
y += x;
}
g[taskIndex] = y;
}
__global__ void bumpup ( int h[], int g[])
{
if (taskIndex >= taskCount) return;
int * u = &h[256*programCount*taskIndex];
int z = g[taskIndex];
for (int i = programIndex; i < 256*programCount; i += programCount)
{
u[i] += z;
}
}
inline __device__
static void prefix_sum ( int num, int h[], int * g)
{
int i;
launch(num,1,1,addup)(h,g+1);
sync;
if (programIndex == 0)
for (g[0] = 0, i = 1; i < num; i ++) g[i] += g[i-1];
launch(num,1,1,bumpup)(h,g);
sync;
}
extern "C" __global__
void sort_ispc___export ( int n, unsigned int code[], int order[], int ntasks)
{
int num = ntasks;
int span = n / num;
int hsize = 256*programCount*num;
int * hist = __new< int>(hsize);
int64 * pair = __new< int64>(n);
int64 * temp = __new< int64>(n);
int * g = __new<int>(num+1);
int pass;
launch(num,1,1,pack)(span, n, code, pair);
sync;
for (pass = 0; pass < 4; pass ++)
{
launch(num,1,1,histogram)(span, n, pair, pass, hist);
sync;
prefix_sum (num, hist,g);
launch(num,1,1,permutation)(span, n, pair, pass, hist, temp);
sync;
launch(num,1,1,copy)(span, n, temp, pair);
sync;
}
launch(num,1,1,unpack)(span, n, pair, code, order);
sync;
__delete(g);
__delete(hist);
__delete(pair);
__delete(temp);
}
extern "C" __host__
void sort_ispc( int n, unsigned int code[], int order[], int ntasks)
{
sort_ispc___export<<<1,32>>>(n,code,order,ntasks);
sync;
}

249
examples_ptx/sort/sort.ispc Normal file
View File

@@ -0,0 +1,249 @@
/*
Copyright (c) 2013, Durham University
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Durham University nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Author: Tomasz Koziara */
task void histogram (uniform int span, uniform int n, uniform int64 code[], uniform int pass, uniform int hist[])
{
uniform int start = taskIndex*span;
uniform int end = taskIndex == taskCount-1 ? n : start+span;
uniform int strip = (end-start)/programCount;
uniform int tail = (end-start)%programCount;
int i = programCount*taskIndex + programIndex;
int g [256];
cfor (int j = 0; j < 256; j ++)
{
g[j] = 0;
}
cfor (int k = start+programIndex*strip; k < start+(programIndex+1)*strip; k ++)
{
unsigned int8 *c = (unsigned int8*) &code[k];
g[c[pass]] ++;
}
if (programIndex == programCount-1) /* remainder is processed by the last lane */
{
for (int k = start+programCount*strip; k < start+programCount*strip+tail; k ++)
{
unsigned int8 *c = (unsigned int8*) &code[k];
g[c[pass]] ++;
}
}
cfor (int j = 0; j < 256; j ++)
{
hist[j*programCount*taskCount+i] = g[j];
}
}
task void permutation (uniform int span, uniform int n, uniform int64 code[], uniform int pass, uniform int hist[], uniform int64 perm[])
{
uniform int start = taskIndex*span;
uniform int end = taskIndex == taskCount-1 ? n : start+span;
uniform int strip = (end-start)/programCount;
uniform int tail = (end-start)%programCount;
int i = programCount*taskIndex + programIndex;
int g [256];
cfor (int j = 0; j < 256; j ++)
{
g[j] = hist[j*programCount*taskCount+i];
}
cfor (int k = start+programIndex*strip; k < start+(programIndex+1)*strip; k ++)
{
unsigned int8 *c = (unsigned int8*) &code[k];
int l = g[c[pass]];
perm[l] = code[k];
g[c[pass]] = l+1;
}
if (programIndex == programCount-1) /* remainder is processed by the last lane */
{
for (int k = start+programCount*strip; k < start+programCount*strip+tail; k ++)
{
unsigned int8 *c = (unsigned int8*) &code[k];
int l = g[c[pass]];
perm[l] = code[k];
g[c[pass]] = l+1;
}
}
}
task void copy (uniform int span, uniform int n, uniform int64 from[], uniform int64 to[])
{
uniform int start = taskIndex*span;
uniform int end = taskIndex == taskCount-1 ? n : start+span;
foreach (i = start ... end)
{
to[i] = from[i];
}
}
task void pack (uniform int span, uniform int n, uniform unsigned int code[], uniform int64 pair[])
{
uniform int start = taskIndex*span;
uniform int end = taskIndex == taskCount-1 ? n : start+span;
foreach (i = start ... end)
{
pair[i] = ((int64)i<<32)+code[i];
}
}
task void unpack (uniform int span, uniform int n, uniform int64 pair[], uniform int unsigned code[], uniform int order[])
{
uniform int start = taskIndex*span;
uniform int end = taskIndex == taskCount-1 ? n : start+span;
foreach (i = start ... end)
{
code[i] = pair[i];
order[i] = pair[i]>>32;
}
}
task void addup (uniform int h[], uniform int g[])
{
uniform int * uniform u = &h[256*programCount*taskIndex];
uniform int i, x, y = 0;
for (i = 0; i < 256*programCount; i ++)
{
x = u[i];
u[i] = y;
y += x;
}
g[taskIndex] = y;
}
task void bumpup (uniform int h[], uniform int g[])
{
uniform int * uniform u = &h[256*programCount*taskIndex];
uniform int z = g[taskIndex];
foreach (i = 0 ... 256*programCount)
{
u[i] += z;
}
}
static void prefix_sum (uniform int num, uniform int h[], uniform int g[])
{
uniform int i;
launch[num] addup (h, g+1);
sync;
for (g[0] = 0, i = 1; i < num; i ++) g[i] += g[i-1];
launch[num] bumpup (h, g);
sync;
}
export void sort_ispc (uniform int n, uniform unsigned int code[], uniform int order[], uniform int ntasks)
{
uniform int num = ntasks; // < 1 ? num_cores () : ntasks;
uniform int span = n / num;
uniform int hsize = 256*programCount*num;
uniform int * uniform hist = uniform new uniform int [hsize];
uniform int64 * uniform pair = uniform new uniform int64 [n];
uniform int64 * uniform temp = uniform new uniform int64 [n];
uniform int * uniform g = uniform new uniform int [num+1];
uniform int pass, i;
#if DEBUG
if (n < 100)
{
print ("input: ");
for (i = 0; i < n; i ++) print ("%, ", code[i]);
print ("\n");
}
#endif
launch[num] pack (span, n, code, pair);
sync;
for (pass = 0; pass < 4; pass ++)
{
launch[num] histogram (span, n, pair, pass, hist);
sync;
prefix_sum (num, hist, g);
launch[num] permutation (span, n, pair, pass, hist, temp);
sync;
launch[num] copy (span, n, temp, pair);
sync;
}
launch[num] unpack (span, n, pair, code, order);
sync;
#if DEBUG
for (i = 0; i < n; i ++)
{
if (i > 0 && code[i-1] > code[i])
print ("ERR at % => % > %; ", i, code[i-1], code[i]);
}
if (n < 100)
{
print ("output: ");
for (i = 0; i < n; i ++) print ("%, ", code[i]);
print ("\n");
print ("order: ");
for (i = 0; i < n; i ++) print ("%, ", order[i]);
print ("\n");
}
#endif
delete hist;
delete pair;
delete temp;
delete g;
}

View File

@@ -0,0 +1,34 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|Win32">
<Configuration>Debug</Configuration>
<Platform>Win32</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|Win32">
<Configuration>Release</Configuration>
<Platform>Win32</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{6D3EF8C5-AE26-407B-9ECE-C27CB988D9C2}</ProjectGuid>
<Keyword>Win32Proj</Keyword>
<RootNamespace>sort</RootNamespace>
<ISPC_file>sort</ISPC_file>
<default_targets>sse2,sse4-x2,avx1-x2</default_targets>
</PropertyGroup>
<Import Project="..\common.props" />
<ItemGroup>
<ClCompile Include="sort.cpp" />
<ClCompile Include="sort_serial.cpp" />
<ClCompile Include="../tasksys.cpp" />
</ItemGroup>
</Project>

View File

@@ -0,0 +1,60 @@
/*
Copyright (c) 2013, Durham University
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Durham University nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Author: Tomasz Koziara */
#include <vector>
#include <algorithm>
#include <utility>
typedef std::pair<double,int> pair;
struct cmp
{
bool operator() (const pair& a, const pair& b) { return a.first < b.first; }
};
void sort_serial (int n, unsigned int code[], int order[])
{
std::vector<pair> pairs;
pairs.reserve (n);
for (int i = 0; i < n; i++) pairs.push_back (pair(code[i], i));
std::sort (pairs.begin(), pairs.end(), cmp());
int *o = order;
for (std::vector<pair>::const_iterator p = pairs.begin(); p != pairs.end(); ++p, ++o) *o = p->second;
}