diff --git a/examples/portable/inplaceTraspose/Makefile_cpu b/examples/portable/inplaceTraspose/Makefile_cpu
new file mode 100644
index 00000000..f50b2d45
--- /dev/null
+++ b/examples/portable/inplaceTraspose/Makefile_cpu
@@ -0,0 +1,9 @@
+
+EXAMPLE=inplaceTranspose
+CPP_SRC=inplaceTranspose.cpp
+ISPC_SRC=inplaceTranspose.ispc
+ISPC_IA_TARGETS=avx1-i32x8
+ISPC_ARM_TARGETS=neon
+#ISPC_FLAGS=-DDEBUG -g
+
+include ../common_cpu.mk
diff --git a/examples/portable/inplaceTraspose/Makefile_knc b/examples/portable/inplaceTraspose/Makefile_knc
new file mode 100644
index 00000000..1204364f
--- /dev/null
+++ b/examples/portable/inplaceTraspose/Makefile_knc
@@ -0,0 +1,7 @@
+EXAMPLE=inplaceTranspose
+CXX_SRC=inplaceTranspose.cpp
+ISPC_SRC=inplaceTranspose.ispc
+ISPC_INTRINSICS=../../intrinsics/knc-i1x16.h
+ISPC_TARGET=generic-16
+
+include ../common_knc.mk
diff --git a/examples/portable/inplaceTraspose/Makefile_ptx b/examples/portable/inplaceTraspose/Makefile_ptx
new file mode 100644
index 00000000..da7494e4
--- /dev/null
+++ b/examples/portable/inplaceTraspose/Makefile_ptx
@@ -0,0 +1,15 @@
+PROG=inplaceTranspose
+ISPC_SRC=inplaceTranspose.ispc
+
+# NOTE(review): this patch adds no inplaceTranspose.cu -- confirm it exists.
+CU_SRC=inplaceTranspose.cu
+# NVCC_FLAGS=-Xptxas=-O1
+CXX_SRC=inplaceTranspose.cpp
+PTXCC_REGMAX=64
+
+LLVM_GPU=1
+NVVM_GPU=1
+
+include ../common_ptx.mk
+
+
diff --git a/examples/portable/inplaceTraspose/inplaceTranspose.cpp b/examples/portable/inplaceTraspose/inplaceTranspose.cpp
new file mode 100644
index 00000000..08e0c806
--- /dev/null
+++ b/examples/portable/inplaceTraspose/inplaceTranspose.cpp
@@ -0,0 +1,141 @@
+#include <cstdio>
+#include <cstdlib>
+#include <algorithm>
+#include <iostream>
+#include <cassert>
+#include <iomanip>
+#include <vector>
+#include "timing.h"
+#include "ispc_malloc.h"
+#include "inplaceTranspose_ispc.h"
+#include "typeT.h"
+
+/* NOTE(review): the system header names above, the two bar-drawing loops in
+ * progressbar(), the signature of main() and the pair's template arguments
+ * were missing from the reviewed copy of this patch and are restored from
+ * how the code uses them -- confirm against the author's original. */
+
+/* progress bar by Ross Hemsley;
+ * http://www.rosshemsley.co.uk/2011/02/creating-a-progress-bar-in-c-or-any-other-console-app/ */
+/* Draws a text progress bar of width w on stderr for step x of n.  The bar
+ * is redrawn only on whole-percent steps and when x == n. */
+static inline void progressbar (unsigned int x, unsigned int n, unsigned int w = 50)
+{
+  /* nothing to report, and 100/n below would divide by zero */
+  if (n == 0) return;
+
+  if (n < 100)
+  {
+    x *= 100/n;
+    n = 100;
+  }
+
+  if ((x != n) && (x % (n/100) != 0)) return;
+
+  using namespace std;
+  const float ratio = x/(float)n;
+  const int c = ratio * w;
+
+  cerr << setw(3) << (int)(ratio*100) << "% [";
+  for (int k = 0; k < c; k++)      cerr << "=";
+  for (int k = c; k < (int)w; k++) cerr << " ";
+  cerr << "]\r" << flush;
+}
+
+/* Benchmark driver: fills an array of (i,j) pairs, transposes it in place
+ * with ispc::transpose() nrep times, checks the result with assert() and
+ * reports the best time.  Usage: prog [m [n [verbose]]]. */
+int main (int argc, char *argv[])
+{
+  typedef std::pair<int,int> Cell;
+
+  const int m = argc > 1 ? atoi(argv[1]) : 8;
+  const int n = argc > 2 ? atoi(argv[2]) : 12;
+  const bool verbose = argc > 3;
+
+  /* atoi() returns 0 for non-numeric text; a non-positive extent would turn
+   * m*n into a bogus vector size and make &A[0] invalid. */
+  if (m <= 0 || n <= 0)
+  {
+    fprintf(stderr, "usage: %s [m > 0] [n > 0] [verbose]\n", argv[0]);
+    return 1;
+  }
+
+  /* A is handed to ispc::transpose() through a T* cast, so every Cell must
+   * occupy exactly one T. */
+  assert(sizeof(T) == sizeof(Cell));
+
+  fprintf(stderr, " m= %d n= %d :: storage= %g MB\n", m, n,
+      m*n*sizeof(int)*2/1e6);
+
+
+  std::vector<Cell> A(m*n);
+
+  const int nrep = 10;
+  double dt = 1e10;
+  for (int r = 0; r < nrep; r++)
+  {
+    for (int j = 0; j < n; j++)
+      for (int i = 0; i < m; i++)
+        A[j*m+i] = std::make_pair(i,j);
+
+    if (r == 0 && verbose)
+    {
+      fprintf(stderr, "Original: \n");
+      for (int j = 0; j < n; j++)
+      {
+        for (int i = 0; i < m; i++)
+        {
+          fprintf(stderr, "(%2d,%2d) ", A[j*m+i].first, A[j*m+i].second);
+        }
+        fprintf(stderr, "\n");
+      }
+      fprintf(stderr, "\n");
+      for (int i = 0; i < m*n; i++)
+        fprintf(stderr, "(%2d,%2d) ", A[i].first, A[i].second);
+      fprintf(stderr, "\n");
+      fprintf(stderr, "\n");
+    }
+
+    for (int j = 0; j < n; j++)
+      for (int i = 0; i < m; i++)
+        assert(A[j*m+i].first == i && A[j*m+i].second == j);
+
+    reset_and_start_timer();
+    ispc::transpose((T*)&A[0], n, m);
+    dt = std::min(dt, get_elapsed_msec());
+    progressbar (r, nrep);
+  }
+  progressbar (nrep, nrep);
+  fprintf(stderr, "\n");
+
+  if (verbose)
+  {
+    fprintf(stderr, "Transposed: \n");
+    for (int j = 0; j < m; j++)
+    {
+      for (int i = 0; i < n; i++)
+      {
+        fprintf(stderr, "(%2d,%2d) ", A[j*n+i].first, A[j*n+i].second);
+      }
+      fprintf(stderr, "\n");
+    }
+    fprintf(stderr, "\n");
+    for (int i = 0; i < m*n; i++)
+      fprintf(stderr, "(%2d,%2d) ", A[i].first, A[i].second);
+    fprintf(stderr, "\n");
+    fprintf(stderr, "\n");
+  }
+
+  for (int j = 0; j < m; j++)
+    for (int i = 0; i < n; i++)
+      assert(A[j*n+i].first == j && A[j*n+i].second == i);
+
+  /* "%g GB/s": the original "% GB/s" parsed as a "% G" conversion and
+   * printed the value followed by "B/s". */
+  fprintf(stderr, " tranpose done in %g msec :: BW= %g GB/s\n",
+      dt , 2*m*n*sizeof(int)*2/dt*1e3/1e9);
+
+
+  return 0;
+}
diff --git
a/examples/portable/inplaceTraspose/inplaceTranspose.ispc b/examples/portable/inplaceTraspose/inplaceTranspose.ispc
new file mode 100644
index 00000000..08f1d017
--- /dev/null
+++ b/examples/portable/inplaceTraspose/inplaceTranspose.ispc
@@ -0,0 +1,113 @@
+#include "typeT.h"
+
+/* Greatest common divisor of a and b by Euclid's algorithm; gcd(0, b) is b. */
+static inline
+uniform int gcd(uniform int a, uniform int b)
+{
+  while ( a != 0 )
+  {
+    const uniform int r = b%a;
+    b = a;
+    a = r;
+  }
+  return b;
+}
+
+/* Index (i + joverb) mod m; read by the first pass of transpose_serial().
+ * The parameter b is not used. */
+static inline
+int __rj(const int i, const uniform int joverb, const uniform int m, const uniform int b)
+{
+  return (i + joverb) % m;
+}
+
+/* Index (((i + joverb) mod m) + j*m) mod n; selects the scratch row that the
+ * second pass of transpose_serial() writes element (i, j) to. */
+static inline
+int __di(const int i, const uniform int j, const uniform int m, const uniform int n, const uniform int joverb)
+{
+  return (((i+joverb) % m) + j*m) % n;
+}
+
+/* Index (j + i*n - iovera) mod m; read by the last pass of transpose_serial(). */
+static inline
+int __sj(const int i, const uniform int j, const uniform int m, const uniform int n, const int iovera)
+{
+  return (j + i*n - iovera) % m;
+}
+
+/* Permutes the m*n elements of A in place in three passes over the layout
+ * A[j*m + i], 0 <= i < m, 0 <= j < n:
+ *   1. only when gcd(m,n) > 1: rotate each block A[j*m .. j*m+m-1],
+ *   2. for each i, reorder the n elements A[j*m + i] among themselves,
+ *   3. permute each block A[j*m .. j*m+m-1] once more.
+ * Scratch space of max(m,n)*programCount elements is allocated per call.
+ * Does nothing when m or n is not positive. */
+static inline
+void transpose_serial(uniform T A[], const uniform int m, const uniform int n)
+{
+  /* An empty matrix needs no work; with m == n == 0 the gcd is 0 and the
+   * divisions below would fault. */
+  if (m <= 0 || n <= 0)
+    return;
+
+  const uniform int tmpSize = max(m,n) * programCount;
+  uniform T   * uniform tmp    = uniform new uniform T [tmpSize];
+  uniform int * uniform joverb = uniform new uniform int[n];
+  uniform int * uniform iovera = uniform new uniform int[m];
+
+  /* tmp viewed as rows of programCount elements: one column per program
+   * instance, used by the second pass. */
+  uniform T (*uniform tmp2D)[programCount] = (uniform T (*uniform)[programCount])tmp;
+
+  const uniform int c = gcd(m,n);
+  const uniform int a = m/c;
+  const uniform int b = n/c;
+  foreach (j = 0 ... n)
+    joverb[j] = j/b;
+  foreach (i = 0 ... m)
+    iovera[i] = i/a;
+
+  if (c > 1)
+  {
+    for (uniform int j = 0; j < n; j++)
+    {
+      const uniform int base = j*m;
+      const uniform int __joverb = joverb[j];
+      foreach (i = 0 ... m)
+        tmp[i] = A[base + __rj(i,__joverb,m,b)];
+      foreach (i = 0 ... m)
+        A[base + i] = tmp[i];
+    }
+  }
+
+  foreach (i = 0 ... m)
+  {
+    for (uniform int j = 0; j < n; j++)
+      tmp2D[__di(i,j,m,n,joverb[j])][programIndex] = A[j*m + i];
+    for (uniform int j = 0; j < n; j++)
+      A[j*m + i] = tmp2D[j][programIndex];
+  }
+
+  for (uniform int j = 0; j < n; j++)
+  {
+    const uniform int base = j*m;
+    foreach (i = 0 ... m)
+      tmp[i] = A[base + __sj(i,j,m,n,iovera[i])];
+    foreach (i = 0 ... m)
+      A[base + i] = tmp[i];
+  }
+
+  /* all three buffers were allocated as arrays */
+  delete[] iovera;
+  delete[] joverb;
+  delete[] tmp;
+}
+
+/* Exported entry point; forwards to transpose_serial(). */
+export
+void transpose(uniform T A[], const uniform int m, const uniform int n)
+{
+  transpose_serial(A, m, n);
+}
+
diff --git a/examples/portable/inplaceTraspose/typeT.h b/examples/portable/inplaceTraspose/typeT.h
new file mode 100644
index 00000000..a0aa630a
--- /dev/null
+++ b/examples/portable/inplaceTraspose/typeT.h
@@ -0,0 +1,3 @@
+#pragma once
+/* Element type of the array that transpose() permutes. */
+typedef double T;