diff --git a/examples_cuda/sort/sort1.cu b/examples_cuda/sort/sort1.cu index 8b2c8fc5..49886fdb 100644 --- a/examples_cuda/sort/sort1.cu +++ b/examples_cuda/sort/sort1.cu @@ -239,20 +239,26 @@ static void prefix_sum ( int num, int h[], int * g) } extern "C" __global__ -void sort_ispc ( int n, unsigned int code[], int order[], int ntasks) +void sort_ispc ( int n, unsigned int code[], int order[], int ntasks, + int _hist[], + int64 _pair[], + int64 _temp[], + int _g[]) { - int num = ntasks < 1 ? 13*4 : ntasks; + int num = ntasks; int span = n / num; - int hsize = 256*programCount*num; #if 0 - int * hist = new int [hsize]; - int64 * pair = new int64 [n]; - int64 * temp = new int64 [n]; -#else + int hsize = 256*programCount*num; int * hist = __new< int>(hsize); int64 * pair = __new< int64>(n); int64 * temp = __new< int64>(n); int * g = __new(num+1); +#define ALLOCATED +#else + int * hist = _hist; + int64 * pair = _pair; + int64 * temp = _temp; + int * g = _g; #endif int pass, i; @@ -262,7 +268,6 @@ void sort_ispc ( int n, unsigned int code[], int order[], int ntasks) pack<<>>(span, n, code, pair); sync; -#if 1 for (pass = 0; pass < 4; pass ++) { // launch[num] histogram (span, n, pair, pass, hist); @@ -287,15 +292,9 @@ void sort_ispc ( int n, unsigned int code[], int order[], int ntasks) if(programIndex == 0) unpack<<>> (span, n, pair, code, order); sync; -#endif - -#if 0 - delete hist; - delete pair; - delete temp; -#else - __delete(g); +#if ALLOCATED + __delete(g); __delete(hist); __delete(pair); __delete(temp); diff --git a/examples_cuda/sort/sort1.ispc b/examples_cuda/sort/sort1.ispc index 0c500ed4..e3acb4f0 100644 --- a/examples_cuda/sort/sort1.ispc +++ b/examples_cuda/sort/sort1.ispc @@ -194,51 +194,56 @@ task void bumpup (uniform int h[], uniform int g[]) } } -static void prefix_sum (uniform int num, uniform int h[]) +static void prefix_sum (uniform int num, uniform int h[], uniform int g[]) { - uniform int * uniform g = uniform new uniform int [num+1]; uniform int i; launch[num] addup (h, g+1); sync; - for (g[0] = 0, i = 1; i < num; i ++) g[i] += g[i-1]; + g[0] = 0; + for (i = 1; i < num; i ++) + g[i] += g[i-1]; launch[num] bumpup (h, g); sync; - delete g; } -export void sort_ispc (uniform int n, uniform unsigned int code[], uniform int order[], uniform int ntasks) +export void sort_ispc (uniform int n, + uniform unsigned int code[], + uniform int order[], + uniform int ntasks, + uniform int _hist[], + uniform int64 _pair[], + uniform int64 _temp[], + uniform int _g[]) { - uniform int num = ntasks < 1 ? 13*4*8 : ntasks; + uniform int num = ntasks; uniform int span = n / num; +#if 0 uniform int hsize = 256*programCount*num; uniform int * uniform hist = uniform new uniform int [hsize]; uniform int64 * uniform pair = uniform new uniform int64 [n]; uniform int64 * uniform temp = uniform new uniform int64 [n]; - uniform int pass, i; - -#if DEBUG - if (n < 100) - { - print ("input: "); - for (i = 0; i < n; i ++) print ("%, ", code[i]); - print ("\n"); - } + uniform int * uniform g = uniform new uniform int [num+1]; +#else + uniform int * uniform hist = _hist; + uniform int64 * uniform pair = _pair; + uniform int64 * uniform temp = _temp; + uniform int * uniform g = _g; #endif + uniform int pass, i; launch[num] pack (span, n, code, pair); sync; -#if 0 for (pass = 0; pass < 4; pass ++) { launch[num] histogram (span, n, pair, pass, hist); sync; - prefix_sum (num, hist); + prefix_sum (num, hist, g); launch[num] permutation (span, n, pair, pass, hist, temp); sync; @@ -250,26 +255,11 @@ export void sort_ispc (uniform int n, uniform unsigned int code[], uniform int o launch[num] unpack (span, n, pair, code, order); sync; -#if DEBUG - for (i = 0; i < n; i ++) - { - if (i > 0 && code[i-1] > code[i]) - print ("ERR at % => % > %; ", i, code[i-1], code[i]); - } - - if (n < 100) - { - print ("output: "); - for (i = 0; i < n; i ++) print ("%, ", code[i]); - print ("\n"); - print ("order: "); - for (i = 0; i < n; i ++) print ("%, ", order[i]); - print ("\n"); - } -#endif -#endif - +#if ALLOCATED + delete g; delete hist; delete pair; delete temp; +#endif } + diff --git a/examples_cuda/sort/sort_cu.cpp b/examples_cuda/sort/sort_cu.cpp index f749d343..ec295434 100644 --- a/examples_cuda/sort/sort_cu.cpp +++ b/examples_cuda/sort/sort_cu.cpp @@ -351,8 +351,13 @@ int main (int argc, char *argv[]) createContext(); /*******************/ + int ntask = 13*4*8; devicePtr d_code = deviceMalloc(n*sizeof(int)); devicePtr d_order = deviceMalloc(n*sizeof(int)); + devicePtr d_pair = deviceMalloc(n*2*sizeof(int)); + devicePtr d_temp = deviceMalloc(n*2*sizeof(int)); + devicePtr d_hist = deviceMalloc(256*32 * ntask * sizeof(int)); + devicePtr d_g = deviceMalloc((ntask + 1) * sizeof(int)); for (i = 0; i < m; i ++) { @@ -368,8 +373,11 @@ int main (int argc, char *argv[]) tISPC2 += (rtc() - t0); // get_elapsed_mcycles(); #else const char * func_name = "sort_ispc"; - int ntask = 0; +#if 0 void *func_args[] = {&n, &d_code, &d_order, &ntask}; +#else + void *func_args[] = {&n, &d_code, &d_order, &ntask, &d_hist, &d_pair, &d_temp, &d_g}; +#endif const double dt = CUDALaunch(NULL, func_name, func_args); tISPC2 += dt; #endif