runs on GPU but needs further tuning

This commit is contained in:
Evghenii
2014-03-05 11:49:15 +01:00
parent f086b7ff9b
commit 644118cd17
3 changed files with 45 additions and 36 deletions

View File

@@ -1,12 +1,12 @@
PROG=radixSort
ISPC_SRC=radixSort.ispc
PROG=inplaceTranspose
ISPC_SRC=inplaceTranspose.ispc
CU_SRC=radixSort.cu
#CU_SRC=inplaceTranspose.cu
# NVCC_FLAGS=-Xptxas=-O1
CXX_SRC=radixSort.cpp radixSort.cpp
PTXCC_REGMAX=64
CXX_SRC=inplaceTranspose.cpp
PTXCC_REGMAX=32
LLVM_GPU=1
# LLVM_GPU=1
NVVM_GPU=1
include ../common_ptx.mk

View File

@@ -43,38 +43,44 @@ int main(int argc, char * argv[])
m*n*sizeof(int)*2/1e6);
std::vector< std::pair<int,int> > A(m*n);
std::pair<int,int> *A = new std::pair<int,int>[m*n];
std::pair<int,int> *Acopy = new std::pair<int,int>[m*n];
for (int j = 0; j < n; j++)
for (int i = 0; i < m; i++)
A[j*m+i] = std::make_pair(i,j);
if (verbose)
{
fprintf(stderr, "Original: \n");
for (int j = 0; j < n; j++)
{
for (int i = 0; i < m; i++)
{
fprintf(stderr, "(%2d,%2d) ", A[j*m+i].first, A[j*m+i].second);
}
fprintf(stderr, "\n");
}
fprintf(stderr, "\n");
for (int i = 0; i < m*n; i++)
fprintf(stderr, "(%2d,%2d) ", A[i].first, A[i].second);
fprintf(stderr, "\n");
fprintf(stderr, "\n");
}
for (int j = 0; j < n; j++)
for (int i = 0; i < m; i++)
assert(A[j*m+i].first == i && A[j*m+i].second == j);
ispcSetMallocHeapLimit(1024ull*1024*1024*8);
ispcMemcpy(&Acopy[0], &A[0], sizeof(T)*m*n);
int nrep = 10;
double dt = 1e10;
for (int r = 0; r < nrep; r++)
{
for (int j = 0; j < n; j++)
for (int i = 0; i < m; i++)
A[j*m+i] = std::make_pair(i,j);
if (r == 0 && verbose)
{
fprintf(stderr, "Original: \n");
for (int j = 0; j < n; j++)
{
for (int i = 0; i < m; i++)
{
fprintf(stderr, "(%2d,%2d) ", A[j*m+i].first, A[j*m+i].second);
}
fprintf(stderr, "\n");
}
fprintf(stderr, "\n");
for (int i = 0; i < m*n; i++)
fprintf(stderr, "(%2d,%2d) ", A[i].first, A[i].second);
fprintf(stderr, "\n");
fprintf(stderr, "\n");
}
for (int j = 0; j < n; j++)
for (int i = 0; i < m; i++)
assert(A[j*m+i].first == i && A[j*m+i].second == j);
ispcMemcpy(&A[0], &Acopy[0], sizeof(T)*m*n);
reset_and_start_timer();
ispc::transpose((T*)&A[0], n, m);
const double t1 = rtc();

View File

@@ -31,6 +31,7 @@ int __sj(const int i, const uniform int j, const uniform int m, const uniform in
}
#if 0
static inline
void transpose_serial(uniform T A[], const uniform int m, const uniform int n)
{
@@ -83,6 +84,7 @@ void transpose_serial(uniform T A[], const uniform int m, const uniform int n)
delete joverb;
delete tmp;
}
#endif
static uniform T * uniform tmpAll = NULL;
static uniform int * uniform joverb = NULL;
@@ -90,9 +92,9 @@ static uniform int * uniform iovera = NULL;
static uniform int a,b,c;
static
void transpose_init(const uniform int m, const uniform int n, const uniform int taskCount)
void transpose_init(const uniform int m, const uniform int n, const uniform int nTask)
{
const uniform int tmpSize = max(m,n) * programCount * taskCount;
const uniform int tmpSize = max(m,n) * programCount * nTask;
tmpAll = uniform new uniform T [tmpSize];
joverb = uniform new uniform int[n];
iovera = uniform new uniform int[m];
@@ -178,7 +180,7 @@ void transpose(uniform T A[], const uniform int m, const uniform int n)
#if 0
transpose_serial(A, m, n);
#else
const uniform int nTask = num_cores()*4;
const uniform int nTask = 32*8; //num_cores()*4;
transpose_init(m,n,nTask);
launch [nTask] transpose_step1(A, m, n);
@@ -192,5 +194,6 @@ void transpose(uniform T A[], const uniform int m, const uniform int n)
transpose_finalize();
#endif
sync;
}