runs on GPU but needs further tuning
This commit is contained in:
@@ -1,12 +1,12 @@
|
||||
PROG=radixSort
|
||||
ISPC_SRC=radixSort.ispc
|
||||
PROG=inplaceTranspose
|
||||
ISPC_SRC=inplaceTranspose.ispc
|
||||
|
||||
CU_SRC=radixSort.cu
|
||||
#CU_SRC=inplaceTranspose.cu
|
||||
# NVCC_FLAGS=-Xptxas=-O1
|
||||
CXX_SRC=radixSort.cpp radixSort.cpp
|
||||
PTXCC_REGMAX=64
|
||||
CXX_SRC=inplaceTranspose.cpp
|
||||
PTXCC_REGMAX=32
|
||||
|
||||
LLVM_GPU=1
|
||||
# LLVM_GPU=1
|
||||
NVVM_GPU=1
|
||||
|
||||
include ../common_ptx.mk
|
||||
|
||||
@@ -43,17 +43,14 @@ int main(int argc, char * argv[])
|
||||
m*n*sizeof(int)*2/1e6);
|
||||
|
||||
|
||||
std::vector< std::pair<int,int> > A(m*n);
|
||||
std::pair<int,int> *A = new std::pair<int,int>[m*n];
|
||||
std::pair<int,int> *Acopy = new std::pair<int,int>[m*n];
|
||||
|
||||
int nrep = 10;
|
||||
double dt = 1e10;
|
||||
for (int r = 0; r < nrep; r++)
|
||||
{
|
||||
for (int j = 0; j < n; j++)
|
||||
for (int i = 0; i < m; i++)
|
||||
A[j*m+i] = std::make_pair(i,j);
|
||||
|
||||
if (r == 0 && verbose)
|
||||
if (verbose)
|
||||
{
|
||||
fprintf(stderr, "Original: \n");
|
||||
for (int j = 0; j < n; j++)
|
||||
@@ -71,10 +68,19 @@ int main(int argc, char * argv[])
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
|
||||
|
||||
for (int j = 0; j < n; j++)
|
||||
for (int i = 0; i < m; i++)
|
||||
assert(A[j*m+i].first == i && A[j*m+i].second == j);
|
||||
|
||||
ispcSetMallocHeapLimit(1024ull*1024*1024*8);
|
||||
ispcMemcpy(&Acopy[0], &A[0], sizeof(T)*m*n);
|
||||
|
||||
int nrep = 10;
|
||||
double dt = 1e10;
|
||||
for (int r = 0; r < nrep; r++)
|
||||
{
|
||||
ispcMemcpy(&A[0], &Acopy[0], sizeof(T)*m*n);
|
||||
reset_and_start_timer();
|
||||
ispc::transpose((T*)&A[0], n, m);
|
||||
const double t1 = rtc();
|
||||
|
||||
@@ -31,6 +31,7 @@ int __sj(const int i, const uniform int j, const uniform int m, const uniform in
|
||||
}
|
||||
|
||||
|
||||
#if 0
|
||||
static inline
|
||||
void transpose_serial(uniform T A[], const uniform int m, const uniform int n)
|
||||
{
|
||||
@@ -83,6 +84,7 @@ void transpose_serial(uniform T A[], const uniform int m, const uniform int n)
|
||||
delete joverb;
|
||||
delete tmp;
|
||||
}
|
||||
#endif
|
||||
|
||||
static uniform T * uniform tmpAll = NULL;
|
||||
static uniform int * uniform joverb = NULL;
|
||||
@@ -90,9 +92,9 @@ static uniform int * uniform iovera = NULL;
|
||||
static uniform int a,b,c;
|
||||
|
||||
static
|
||||
void transpose_init(const uniform int m, const uniform int n, const uniform int taskCount)
|
||||
void transpose_init(const uniform int m, const uniform int n, const uniform int nTask)
|
||||
{
|
||||
const uniform int tmpSize = max(m,n) * programCount * taskCount;
|
||||
const uniform int tmpSize = max(m,n) * programCount * nTask;
|
||||
tmpAll = uniform new uniform T [tmpSize];
|
||||
joverb = uniform new uniform int[n];
|
||||
iovera = uniform new uniform int[m];
|
||||
@@ -178,7 +180,7 @@ void transpose(uniform T A[], const uniform int m, const uniform int n)
|
||||
#if 0
|
||||
transpose_serial(A, m, n);
|
||||
#else
|
||||
const uniform int nTask = num_cores()*4;
|
||||
const uniform int nTask = 32*8; //num_cores()*4;
|
||||
transpose_init(m,n,nTask);
|
||||
|
||||
launch [nTask] transpose_step1(A, m, n);
|
||||
@@ -192,5 +194,6 @@ void transpose(uniform T A[], const uniform int m, const uniform int n)
|
||||
|
||||
transpose_finalize();
|
||||
#endif
|
||||
sync;
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user