runs on GPU but needs further tuning
This commit is contained in:
@@ -1,12 +1,12 @@
|
|||||||
PROG=radixSort
|
PROG=inplaceTranspose
|
||||||
ISPC_SRC=radixSort.ispc
|
ISPC_SRC=inplaceTranspose.ispc
|
||||||
|
|
||||||
CU_SRC=radixSort.cu
|
#CU_SRC=inplaceTranspose.cu
|
||||||
# NVCC_FLAGS=-Xptxas=-O1
|
# NVCC_FLAGS=-Xptxas=-O1
|
||||||
CXX_SRC=radixSort.cpp radixSort.cpp
|
CXX_SRC=inplaceTranspose.cpp
|
||||||
PTXCC_REGMAX=64
|
PTXCC_REGMAX=32
|
||||||
|
|
||||||
LLVM_GPU=1
|
# LLVM_GPU=1
|
||||||
NVVM_GPU=1
|
NVVM_GPU=1
|
||||||
|
|
||||||
include ../common_ptx.mk
|
include ../common_ptx.mk
|
||||||
|
|||||||
@@ -43,38 +43,44 @@ int main(int argc, char * argv[])
|
|||||||
m*n*sizeof(int)*2/1e6);
|
m*n*sizeof(int)*2/1e6);
|
||||||
|
|
||||||
|
|
||||||
std::vector< std::pair<int,int> > A(m*n);
|
std::pair<int,int> *A = new std::pair<int,int>[m*n];
|
||||||
|
std::pair<int,int> *Acopy = new std::pair<int,int>[m*n];
|
||||||
|
|
||||||
|
for (int j = 0; j < n; j++)
|
||||||
|
for (int i = 0; i < m; i++)
|
||||||
|
A[j*m+i] = std::make_pair(i,j);
|
||||||
|
|
||||||
|
if (verbose)
|
||||||
|
{
|
||||||
|
fprintf(stderr, "Original: \n");
|
||||||
|
for (int j = 0; j < n; j++)
|
||||||
|
{
|
||||||
|
for (int i = 0; i < m; i++)
|
||||||
|
{
|
||||||
|
fprintf(stderr, "(%2d,%2d) ", A[j*m+i].first, A[j*m+i].second);
|
||||||
|
}
|
||||||
|
fprintf(stderr, "\n");
|
||||||
|
}
|
||||||
|
fprintf(stderr, "\n");
|
||||||
|
for (int i = 0; i < m*n; i++)
|
||||||
|
fprintf(stderr, "(%2d,%2d) ", A[i].first, A[i].second);
|
||||||
|
fprintf(stderr, "\n");
|
||||||
|
fprintf(stderr, "\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
for (int j = 0; j < n; j++)
|
||||||
|
for (int i = 0; i < m; i++)
|
||||||
|
assert(A[j*m+i].first == i && A[j*m+i].second == j);
|
||||||
|
|
||||||
|
ispcSetMallocHeapLimit(1024ull*1024*1024*8);
|
||||||
|
ispcMemcpy(&Acopy[0], &A[0], sizeof(T)*m*n);
|
||||||
|
|
||||||
int nrep = 10;
|
int nrep = 10;
|
||||||
double dt = 1e10;
|
double dt = 1e10;
|
||||||
for (int r = 0; r < nrep; r++)
|
for (int r = 0; r < nrep; r++)
|
||||||
{
|
{
|
||||||
for (int j = 0; j < n; j++)
|
ispcMemcpy(&A[0], &Acopy[0], sizeof(T)*m*n);
|
||||||
for (int i = 0; i < m; i++)
|
|
||||||
A[j*m+i] = std::make_pair(i,j);
|
|
||||||
|
|
||||||
if (r == 0 && verbose)
|
|
||||||
{
|
|
||||||
fprintf(stderr, "Original: \n");
|
|
||||||
for (int j = 0; j < n; j++)
|
|
||||||
{
|
|
||||||
for (int i = 0; i < m; i++)
|
|
||||||
{
|
|
||||||
fprintf(stderr, "(%2d,%2d) ", A[j*m+i].first, A[j*m+i].second);
|
|
||||||
}
|
|
||||||
fprintf(stderr, "\n");
|
|
||||||
}
|
|
||||||
fprintf(stderr, "\n");
|
|
||||||
for (int i = 0; i < m*n; i++)
|
|
||||||
fprintf(stderr, "(%2d,%2d) ", A[i].first, A[i].second);
|
|
||||||
fprintf(stderr, "\n");
|
|
||||||
fprintf(stderr, "\n");
|
|
||||||
}
|
|
||||||
|
|
||||||
for (int j = 0; j < n; j++)
|
|
||||||
for (int i = 0; i < m; i++)
|
|
||||||
assert(A[j*m+i].first == i && A[j*m+i].second == j);
|
|
||||||
|
|
||||||
reset_and_start_timer();
|
reset_and_start_timer();
|
||||||
ispc::transpose((T*)&A[0], n, m);
|
ispc::transpose((T*)&A[0], n, m);
|
||||||
const double t1 = rtc();
|
const double t1 = rtc();
|
||||||
|
|||||||
@@ -31,6 +31,7 @@ int __sj(const int i, const uniform int j, const uniform int m, const uniform in
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#if 0
|
||||||
static inline
|
static inline
|
||||||
void transpose_serial(uniform T A[], const uniform int m, const uniform int n)
|
void transpose_serial(uniform T A[], const uniform int m, const uniform int n)
|
||||||
{
|
{
|
||||||
@@ -83,6 +84,7 @@ void transpose_serial(uniform T A[], const uniform int m, const uniform int n)
|
|||||||
delete joverb;
|
delete joverb;
|
||||||
delete tmp;
|
delete tmp;
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
static uniform T * uniform tmpAll = NULL;
|
static uniform T * uniform tmpAll = NULL;
|
||||||
static uniform int * uniform joverb = NULL;
|
static uniform int * uniform joverb = NULL;
|
||||||
@@ -90,9 +92,9 @@ static uniform int * uniform iovera = NULL;
|
|||||||
static uniform int a,b,c;
|
static uniform int a,b,c;
|
||||||
|
|
||||||
static
|
static
|
||||||
void transpose_init(const uniform int m, const uniform int n, const uniform int taskCount)
|
void transpose_init(const uniform int m, const uniform int n, const uniform int nTask)
|
||||||
{
|
{
|
||||||
const uniform int tmpSize = max(m,n) * programCount * taskCount;
|
const uniform int tmpSize = max(m,n) * programCount * nTask;
|
||||||
tmpAll = uniform new uniform T [tmpSize];
|
tmpAll = uniform new uniform T [tmpSize];
|
||||||
joverb = uniform new uniform int[n];
|
joverb = uniform new uniform int[n];
|
||||||
iovera = uniform new uniform int[m];
|
iovera = uniform new uniform int[m];
|
||||||
@@ -178,7 +180,7 @@ void transpose(uniform T A[], const uniform int m, const uniform int n)
|
|||||||
#if 0
|
#if 0
|
||||||
transpose_serial(A, m, n);
|
transpose_serial(A, m, n);
|
||||||
#else
|
#else
|
||||||
const uniform int nTask = num_cores()*4;
|
const uniform int nTask = 32*8; //num_cores()*4;
|
||||||
transpose_init(m,n,nTask);
|
transpose_init(m,n,nTask);
|
||||||
|
|
||||||
launch [nTask] transpose_step1(A, m, n);
|
launch [nTask] transpose_step1(A, m, n);
|
||||||
@@ -192,5 +194,6 @@ void transpose(uniform T A[], const uniform int m, const uniform int n)
|
|||||||
|
|
||||||
transpose_finalize();
|
transpose_finalize();
|
||||||
#endif
|
#endif
|
||||||
|
sync;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user