diff --git a/examples/portable/inplaceTraspose/Makefile_ptx b/examples/portable/inplaceTraspose/Makefile_ptx index da7494e4..3f3d0044 100644 --- a/examples/portable/inplaceTraspose/Makefile_ptx +++ b/examples/portable/inplaceTraspose/Makefile_ptx @@ -1,12 +1,12 @@ -PROG=radixSort -ISPC_SRC=radixSort.ispc +PROG=inplaceTranspose +ISPC_SRC=inplaceTranspose.ispc -CU_SRC=radixSort.cu +#CU_SRC=inplaceTranspose.cu # NVCC_FLAGS=-Xptxas=-O1 -CXX_SRC=radixSort.cpp radixSort.cpp -PTXCC_REGMAX=64 +CXX_SRC=inplaceTranspose.cpp +PTXCC_REGMAX=32 -LLVM_GPU=1 +# LLVM_GPU=1 NVVM_GPU=1 include ../common_ptx.mk diff --git a/examples/portable/inplaceTraspose/inplaceTranspose.cpp b/examples/portable/inplaceTraspose/inplaceTranspose.cpp index 08e0c806..7e2a00c9 100644 --- a/examples/portable/inplaceTraspose/inplaceTranspose.cpp +++ b/examples/portable/inplaceTraspose/inplaceTranspose.cpp @@ -43,38 +43,44 @@ int main(int argc, char * argv[]) m*n*sizeof(int)*2/1e6); - std::vector< std::pair > A(m*n); + std::pair *A = new std::pair[m*n]; + std::pair *Acopy = new std::pair[m*n]; + + for (int j = 0; j < n; j++) + for (int i = 0; i < m; i++) + A[j*m+i] = std::make_pair(i,j); + + if (verbose) + { + fprintf(stderr, "Original: \n"); + for (int j = 0; j < n; j++) + { + for (int i = 0; i < m; i++) + { + fprintf(stderr, "(%2d,%2d) ", A[j*m+i].first, A[j*m+i].second); + } + fprintf(stderr, "\n"); + } + fprintf(stderr, "\n"); + for (int i = 0; i < m*n; i++) + fprintf(stderr, "(%2d,%2d) ", A[i].first, A[i].second); + fprintf(stderr, "\n"); + fprintf(stderr, "\n"); + } + + + for (int j = 0; j < n; j++) + for (int i = 0; i < m; i++) + assert(A[j*m+i].first == i && A[j*m+i].second == j); + + ispcSetMallocHeapLimit(1024ull*1024*1024*8); + ispcMemcpy(&Acopy[0], &A[0], sizeof(T)*m*n); int nrep = 10; double dt = 1e10; for (int r = 0; r < nrep; r++) { - for (int j = 0; j < n; j++) - for (int i = 0; i < m; i++) - A[j*m+i] = std::make_pair(i,j); - - if (r == 0 && verbose) - { - fprintf(stderr, "Original: \n"); - for (int j = 0; j < n; j++) - { - for (int i = 0; i < m; i++) - { - fprintf(stderr, "(%2d,%2d) ", A[j*m+i].first, A[j*m+i].second); - } - fprintf(stderr, "\n"); - } - fprintf(stderr, "\n"); - for (int i = 0; i < m*n; i++) - fprintf(stderr, "(%2d,%2d) ", A[i].first, A[i].second); - fprintf(stderr, "\n"); - fprintf(stderr, "\n"); - } - - for (int j = 0; j < n; j++) - for (int i = 0; i < m; i++) - assert(A[j*m+i].first == i && A[j*m+i].second == j); - + ispcMemcpy(&A[0], &Acopy[0], sizeof(T)*m*n); reset_and_start_timer(); ispc::transpose((T*)&A[0], n, m); const double t1 = rtc(); diff --git a/examples/portable/inplaceTraspose/inplaceTranspose.ispc b/examples/portable/inplaceTraspose/inplaceTranspose.ispc index 76e5ad60..c3b66205 100644 --- a/examples/portable/inplaceTraspose/inplaceTranspose.ispc +++ b/examples/portable/inplaceTraspose/inplaceTranspose.ispc @@ -31,6 +31,7 @@ int __sj(const int i, const uniform int j, const uniform int m, const uniform in } +#if 0 static inline void transpose_serial(uniform T A[], const uniform int m, const uniform int n) { @@ -83,6 +84,7 @@ void transpose_serial(uniform T A[], const uniform int m, const uniform int n) delete joverb; delete tmp; } +#endif static uniform T * uniform tmpAll = NULL; static uniform int * uniform joverb = NULL; @@ -90,9 +92,9 @@ static uniform int * uniform iovera = NULL; static uniform int a,b,c; static -void transpose_init(const uniform int m, const uniform int n, const uniform int taskCount) +void transpose_init(const uniform int m, const uniform int n, const uniform int nTask) { - const uniform int tmpSize = max(m,n) * programCount * taskCount; + const uniform int tmpSize = max(m,n) * programCount * nTask; tmpAll = uniform new uniform T [tmpSize]; joverb = uniform new uniform int[n]; iovera = uniform new uniform int[m]; @@ -178,7 +180,7 @@ void transpose(uniform T A[], const uniform int m, const uniform int n) #if 0 transpose_serial(A, m, n); #else - const uniform int nTask = num_cores()*4; + const uniform int nTask = 32*8; //num_cores()*4; transpose_init(m,n,nTask); launch [nTask] transpose_step1(A, m, n); @@ -192,5 +194,6 @@ void transpose(uniform T A[], const uniform int m, const uniform int n) transpose_finalize(); #endif + sync; }