Fixed a problem with new/delete and added a Mel/sec (million elements per second) counter
This commit is contained in:
@@ -108,7 +108,7 @@ int main (int argc, char *argv[])
|
|||||||
progressbar (i, m);
|
progressbar (i, m);
|
||||||
}
|
}
|
||||||
|
|
||||||
printf("[sort ispc + task]:\t[%.3f] million cycles\n", tISPC2);
|
printf("[sort ispc + task]:\t[%.3f] million cycles :: rate= %g Mel/sec\n", tISPC2, 1.0e-6*n*m/tISPC2);
|
||||||
|
|
||||||
for (int i = 0; i < n-1; i++)
|
for (int i = 0; i < n-1; i++)
|
||||||
{
|
{
|
||||||
@@ -133,7 +133,7 @@ int main (int argc, char *argv[])
|
|||||||
progressbar (i, m);
|
progressbar (i, m);
|
||||||
}
|
}
|
||||||
|
|
||||||
printf("[sort ispc]:\t[%.3f] million cycles\n", tISPC1);
|
printf("[sort ispc]:\t[%.3f] million cycles :: rate= %g Mel/sec\n", tISPC1, 1.0e-6*n*m/tISPC1);
|
||||||
|
|
||||||
|
|
||||||
srand (0);
|
srand (0);
|
||||||
@@ -153,7 +153,7 @@ int main (int argc, char *argv[])
|
|||||||
progressbar (i, m);
|
progressbar (i, m);
|
||||||
}
|
}
|
||||||
|
|
||||||
printf("[sort serial]:\t\t[%.3f] million cycles\n", tSerial);
|
printf("[sort serial]:\t[%.3f] million cycles :: rate= %g Mel/sec\n", tSerial, 1.0e-6*n*m/tSerial);
|
||||||
|
|
||||||
printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n", tSerial/tISPC1, tSerial/tISPC2);
|
printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n", tSerial/tISPC1, tSerial/tISPC2);
|
||||||
|
|
||||||
|
|||||||
@@ -219,12 +219,16 @@ export void sort_ispc (uniform int n,
|
|||||||
{
|
{
|
||||||
uniform int num = ntasks;
|
uniform int num = ntasks;
|
||||||
uniform int span = n / num;
|
uniform int span = n / num;
|
||||||
#if 0
|
#if 1
|
||||||
|
/* If this fails, change the condition to 0: there are some issues with stack/heap size inside the CUDA context.
|
||||||
|
* grep for cuCtxSetLimit in sort_cu.cpp
|
||||||
|
*/
|
||||||
uniform int hsize = 256*programCount*num;
|
uniform int hsize = 256*programCount*num;
|
||||||
uniform int * uniform hist = uniform new uniform int [hsize];
|
uniform int * uniform hist = uniform new uniform int [hsize];
|
||||||
uniform int64 * uniform pair = uniform new uniform int64 [n];
|
uniform int64 * uniform pair = uniform new uniform int64 [n];
|
||||||
uniform int64 * uniform temp = uniform new uniform int64 [n];
|
uniform int64 * uniform temp = uniform new uniform int64 [n];
|
||||||
uniform int * uniform g = uniform new uniform int [num+1];
|
uniform int * uniform g = uniform new uniform int [num+1];
|
||||||
|
#define ALLOCATED
|
||||||
#else
|
#else
|
||||||
uniform int * uniform hist = _hist;
|
uniform int * uniform hist = _hist;
|
||||||
uniform int64 * uniform pair = _pair;
|
uniform int64 * uniform pair = _pair;
|
||||||
@@ -253,7 +257,7 @@ export void sort_ispc (uniform int n,
|
|||||||
launch[num] unpack (span, n, pair, code, order);
|
launch[num] unpack (span, n, pair, code, order);
|
||||||
sync;
|
sync;
|
||||||
|
|
||||||
#if ALLOCATED
|
#ifdef ALLOCATED
|
||||||
delete g;
|
delete g;
|
||||||
delete hist;
|
delete hist;
|
||||||
delete pair;
|
delete pair;
|
||||||
|
|||||||
@@ -99,7 +99,13 @@ void createContext(const int deviceId = 0)
|
|||||||
|
|
||||||
// Create driver context
|
// Create driver context
|
||||||
checkCudaErrors(cuCtxCreate(&context, 0, device));
|
checkCudaErrors(cuCtxCreate(&context, 0, device));
|
||||||
|
size_t limit;
|
||||||
|
checkCudaErrors(cuCtxGetLimit(&limit, CU_LIMIT_STACK_SIZE));
|
||||||
|
fprintf(stderr, " stack_limit= %llu KB\n", limit/1024);
|
||||||
|
checkCudaErrors(cuCtxGetLimit(&limit, CU_LIMIT_MALLOC_HEAP_SIZE));
|
||||||
|
fprintf(stderr, " heap= %llu KB\n", limit/1024);
|
||||||
checkCudaErrors(cuCtxSetLimit(CU_LIMIT_MALLOC_HEAP_SIZE,1024*1024*1024));
|
checkCudaErrors(cuCtxSetLimit(CU_LIMIT_MALLOC_HEAP_SIZE,1024*1024*1024));
|
||||||
|
checkCudaErrors(cuCtxSetLimit(CU_LIMIT_STACK_SIZE,1024*4));
|
||||||
}
|
}
|
||||||
void destroyContext()
|
void destroyContext()
|
||||||
{
|
{
|
||||||
@@ -386,7 +392,7 @@ int main (int argc, char *argv[])
|
|||||||
progressbar (i, m);
|
progressbar (i, m);
|
||||||
}
|
}
|
||||||
|
|
||||||
printf("[sort cuda]:\t[%.3f] million cycles\n", tISPC2);
|
printf("[sort cuda]:\t[%.3f] million cycles :: rate= %g Mel/sec\n", tISPC2, 1.0e-6*n*m/tISPC2);
|
||||||
memcpyD2H(code, d_code, n*sizeof(int));
|
memcpyD2H(code, d_code, n*sizeof(int));
|
||||||
memcpyD2H(order, d_order, n*sizeof(int));
|
memcpyD2H(order, d_order, n*sizeof(int));
|
||||||
for (int i = 0; i < n-1; i++)
|
for (int i = 0; i < n-1; i++)
|
||||||
|
|||||||
Reference in New Issue
Block a user