From 5e373706183ca3f28b156e288d8f0772c0fb0130 Mon Sep 17 00:00:00 2001 From: Evghenii Date: Thu, 30 Jan 2014 14:47:18 +0100 Subject: [PATCH] ispc generates slooow code .. --- examples_ptx/nbody/Makefile_cpu | 2 +- examples_ptx/nbody/Makefile_gpu | 2 +- examples_ptx/nbody/nbody.cpp | 6 ++- examples_ptx/nbody/nbody.ispc | 92 +++++++++++++++++++-------------- 4 files changed, 60 insertions(+), 42 deletions(-) diff --git a/examples_ptx/nbody/Makefile_cpu b/examples_ptx/nbody/Makefile_cpu index da6fcac9..087e5d0c 100644 --- a/examples_ptx/nbody/Makefile_cpu +++ b/examples_ptx/nbody/Makefile_cpu @@ -1,6 +1,6 @@ EXAMPLE=nbody -CPP_SRC=nbody.cpp nbody_serial.cpp +CPP_SRC=nbody.cpp ISPC_SRC=nbody.ispc ISPC_IA_TARGETS=avx1-i32x8 ISPC_ARM_TARGETS=neon diff --git a/examples_ptx/nbody/Makefile_gpu b/examples_ptx/nbody/Makefile_gpu index 99b49c81..78ee0aaf 100644 --- a/examples_ptx/nbody/Makefile_gpu +++ b/examples_ptx/nbody/Makefile_gpu @@ -2,7 +2,7 @@ PROG=nbody ISPC_SRC=nbody.ispc #CU_SRC=nbody.cu CXX_SRC=nbody.cpp -PTXCC_REGMAX=32 +PTXCC_REGMAX=128 LLVM_GPU=1 NVVM_GPU=1 diff --git a/examples_ptx/nbody/nbody.cpp b/examples_ptx/nbody/nbody.cpp index a32507a1..2c3b4201 100644 --- a/examples_ptx/nbody/nbody.cpp +++ b/examples_ptx/nbody/nbody.cpp @@ -13,9 +13,11 @@ typedef double real; int main (int argc, char *argv[]) { - int i, j, n = argc == 1 ? 1024*1024: atoi(argv[1]), m = n < 100 ? 1 : 50, l = n < 100 ? n : RAND_MAX; + int i, j, n = argc == 1 ? 2048: atoi(argv[1]), m = n < 100 ? 1 : 50, l = n < 100 ? n : RAND_MAX; double tISPC1 = 0.0, tISPC2 = 0.0, tSerial = 0.0; + printf(" nbodies= %d\n", n); + Plummer plummer(n); real *posx = new real[n]; @@ -55,7 +57,7 @@ int main (int argc, char *argv[]) tISPC2 = get_elapsed_msec(); fprintf(stderr, " %d iterations took %g sec; perf= %g GFlops\n", nSteps, tISPC2/1e3, - 20*n*n/(tISPC2/1e3/1e9)); + nSteps * 22.0*n*n/(tISPC2/1e3)/1e9); } ispc::closeNbody(); diff --git a/examples_ptx/nbody/nbody.ispc b/examples_ptx/nbody/nbody.ispc index 77862ef3..cbe82eee 100644 --- a/examples_ptx/nbody/nbody.ispc +++ b/examples_ptx/nbody/nbody.ispc @@ -1,7 +1,5 @@ typedef double real; -typedef real<3> real3; -typedef real<4> real4; static uniform real * uniform accx = NULL; static uniform real * uniform accy; @@ -29,24 +27,6 @@ void closeNbody() } -static inline -real4 ppForce(real3 ipos, real3 jpos, real jmass) -{ - const real3 dr = jpos - ipos; - const real r2 = dr.x*dr.x + dr.y*dr.y + dr.z*dr.z; - const real rinv = r2 > 0 ? rsqrt(r2) : 0; - const real mrinv = jmass * rinv; - const real mrinv3 = mrinv * rinv*rinv; - - real4 acc; - acc.x = -mrinv3 * dr.x; - acc.y = -mrinv3 * dr.y; - acc.z = -mrinv3 * dr.z; - acc.w = -mrinv; - - return acc; -} - task void computeForces( uniform int nbodies, @@ -61,39 +41,75 @@ void computeForces( const uniform int blockEnd = min(blockBeg + blockDim, nbodies); #if 0 - real gpotLoc = 0; + uniform real gpotLoc = 0; for (uniform int i = blockBeg; i < blockEnd; i++) { - const real3 ipos = {posx[i], posy[i], posz[i]}; - real4 iacc = 0; + const real iposx = posx[i]; + const real iposy = posy[i]; + const real iposz = posz[i]; + real iaccx = 0; + real iaccy = 0; + real iaccz = 0; + real igpot = 0; foreach (j = 0 ... nbodies) { - const real3 jpos = {posx[j], posy[j], posz[j]}; + const real jposx = posx[j]; + const real jposy = posy[j]; + const real jposz = posz[j]; const real jmass = mass[j]; - iacc += ppForce(ipos, jpos, jmass); + const real dx = jposx - iposx; + const real dy = jposy - iposy; + const real dz = jposz - iposz; + const real r2 = dx*dx + dy*dy + dz*dz; + const real rinv = r2 > 0.0d ? rsqrt((float)r2) : 0; + const real mrinv = -jmass * rinv; + const real mrinv3 = mrinv * rinv*rinv; + + iaccx += mrinv3 * dx; + iaccy += mrinv3 * dy; + iaccz += mrinv3 * dz; + igpot += mrinv; } - accx[i] = reduce_add(iacc.x); - accy[i] = reduce_add(iacc.y); - accz[i] = reduce_add(iacc.z); - gpotLoc += reduce_add(iacc.w); + accx[i] = reduce_add(iaccx); + accy[i] = reduce_add(iaccy); + accz[i] = reduce_add(iaccz); + gpotLoc += reduce_add(igpot); } - atomic_add_global(&gpot, gpotLoc); + gpotList[taskIndex] = gpotLoc; #else real gpotLoc = 0; foreach (i = blockBeg ... blockEnd) { - const real3 ipos = {posx[i], posy[i], posz[i]}; - real4 iacc = 0; + const real iposx = posx[i]; + const real iposy = posy[i]; + const real iposz = posz[i]; + real iaccx = 0; + real iaccy = 0; + real iaccz = 0; + real igpot = 0; for (uniform int j = 0; j < nbodies; j++) { - const real3 jpos = {posx[j], posy[j], posz[j]}; + const real jposx = posx[j]; + const real jposy = posy[j]; + const real jposz = posz[j]; const real jmass = mass[j]; - iacc += ppForce(ipos, jpos, jmass); + const real dx = jposx - iposx; + const real dy = jposy - iposy; + const real dz = jposz - iposz; + const real r2 = dx*dx + dy*dy + dz*dz; + const real rinv = r2 > 0.0d ? rsqrt((float)r2) : 0; + const real mrinv = -jmass * rinv; + const real mrinv3 = mrinv * rinv*rinv; + + iaccx += mrinv3 * dx; + iaccy += mrinv3 * dy; + iaccz += mrinv3 * dz; + igpot += mrinv; } - accx[i] = iacc.x; - accy[i] = iacc.y; - accz[i] = iacc.z; - gpotLoc += iacc.w; + accx[i] = iaccx; + accy[i] = iaccy; + accz[i] = iaccz; + gpotLoc += igpot; } gpotList[taskIndex] = reduce_add(gpotLoc); #endif