diff --git a/examples_cuda/sort/sort.cpp b/examples_cuda/sort/sort.cpp index 1221013f..c439fdd0 100644 --- a/examples_cuda/sort/sort.cpp +++ b/examples_cuda/sort/sort.cpp @@ -108,7 +108,7 @@ int main (int argc, char *argv[]) progressbar (i, m); } - printf("[sort ispc + task]:\t[%.3f] million cycles\n", tISPC2); + printf("[sort ispc + task]:\t[%.3f] million cycles :: rate= %g Mel/sec\n", tISPC2, 1.0e-6*n*m/tISPC2); for (int i = 0; i < n-1; i++) { @@ -133,7 +133,7 @@ int main (int argc, char *argv[]) progressbar (i, m); } - printf("[sort ispc]:\t[%.3f] million cycles\n", tISPC1); + printf("[sort ispc]:\t[%.3f] million cycles :: rate= %g Mel/sec\n", tISPC1, 1.0e-6*n*m/tISPC1); srand (0); @@ -153,7 +153,7 @@ int main (int argc, char *argv[]) progressbar (i, m); } - printf("[sort serial]:\t\t[%.3f] million cycles\n", tSerial); + printf("[sort serial]:\t[%.3f] million cycles :: rate= %g Mel/sec\n", tSerial, 1.0e-6*n*m/tSerial); printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n", tSerial/tISPC1, tSerial/tISPC2); diff --git a/examples_cuda/sort/sort1.ispc b/examples_cuda/sort/sort1.ispc index 855aa194..608d851a 100644 --- a/examples_cuda/sort/sort1.ispc +++ b/examples_cuda/sort/sort1.ispc @@ -219,12 +219,16 @@ export void sort_ispc (uniform int n, { uniform int num = ntasks; uniform int span = n / num; -#if 0 +#if 1 + /* if fails, change to 0. 
there are some issues with the stack size/heap limits inside the CUDA context; + * grep for cuCtxSetLimit in sort_cu.cpp + */ uniform int hsize = 256*programCount*num; uniform int * uniform hist = uniform new uniform int [hsize]; uniform int64 * uniform pair = uniform new uniform int64 [n]; uniform int64 * uniform temp = uniform new uniform int64 [n]; uniform int * uniform g = uniform new uniform int [num+1]; +#define ALLOCATED #else uniform int * uniform hist = _hist; uniform int64 * uniform pair = _pair; @@ -253,7 +257,7 @@ export void sort_ispc (uniform int n, launch[num] unpack (span, n, pair, code, order); sync; -#if ALLOCATED +#ifdef ALLOCATED delete g; delete hist; delete pair; diff --git a/examples_cuda/sort/sort_cu.cpp b/examples_cuda/sort/sort_cu.cpp index ed206499..d5d77b6e 100644 --- a/examples_cuda/sort/sort_cu.cpp +++ b/examples_cuda/sort/sort_cu.cpp @@ -99,7 +99,13 @@ void createContext(const int deviceId = 0) // Create driver context checkCudaErrors(cuCtxCreate(&context, 0, device)); + size_t limit; + checkCudaErrors(cuCtxGetLimit(&limit, CU_LIMIT_STACK_SIZE)); + fprintf(stderr, " stack_limit= %llu KB\n", limit/1024); + checkCudaErrors(cuCtxGetLimit(&limit, CU_LIMIT_MALLOC_HEAP_SIZE)); + fprintf(stderr, " heap= %llu KB\n", limit/1024); checkCudaErrors(cuCtxSetLimit(CU_LIMIT_MALLOC_HEAP_SIZE,1024*1024*1024)); + checkCudaErrors(cuCtxSetLimit(CU_LIMIT_STACK_SIZE,1024*4)); } void destroyContext() { @@ -386,7 +392,7 @@ int main (int argc, char *argv[]) progressbar (i, m); } - printf("[sort cuda]:\t[%.3f] million cycles\n", tISPC2); + printf("[sort cuda]:\t[%.3f] million cycles :: rate= %g Mel/sec\n", tISPC2, 1.0e-6*n*m/tISPC2); memcpyD2H(code, d_code, n*sizeof(int)); memcpyD2H(order, d_order, n*sizeof(int)); for (int i = 0; i < n-1; i++)