ispc generates slow code.

This commit is contained in:
Evghenii
2014-01-30 14:47:18 +01:00
parent f90688b089
commit 5e37370618
4 changed files with 60 additions and 42 deletions

View File

@@ -1,6 +1,6 @@
EXAMPLE=nbody
CPP_SRC=nbody.cpp nbody_serial.cpp
CPP_SRC=nbody.cpp
ISPC_SRC=nbody.ispc
ISPC_IA_TARGETS=avx1-i32x8
ISPC_ARM_TARGETS=neon

View File

@@ -2,7 +2,7 @@ PROG=nbody
ISPC_SRC=nbody.ispc
#CU_SRC=nbody.cu
CXX_SRC=nbody.cpp
PTXCC_REGMAX=32
PTXCC_REGMAX=128
LLVM_GPU=1
NVVM_GPU=1

View File

@@ -13,9 +13,11 @@ typedef double real;
int main (int argc, char *argv[])
{
int i, j, n = argc == 1 ? 1024*1024: atoi(argv[1]), m = n < 100 ? 1 : 50, l = n < 100 ? n : RAND_MAX;
int i, j, n = argc == 1 ? 2048: atoi(argv[1]), m = n < 100 ? 1 : 50, l = n < 100 ? n : RAND_MAX;
double tISPC1 = 0.0, tISPC2 = 0.0, tSerial = 0.0;
printf(" nbodies= %d\n", n);
Plummer plummer(n);
real *posx = new real[n];
@@ -55,7 +57,7 @@ int main (int argc, char *argv[])
tISPC2 = get_elapsed_msec();
fprintf(stderr, " %d iterations took %g sec; perf= %g GFlops\n",
nSteps, tISPC2/1e3,
20*n*n/(tISPC2/1e3/1e9));
nSteps * 22.0*n*n/(tISPC2/1e3)/1e9);
}
ispc::closeNbody();

View File

@@ -1,7 +1,5 @@
typedef double real;
typedef real<3> real3;
typedef real<4> real4;
static uniform real * uniform accx = NULL;
static uniform real * uniform accy;
@@ -29,24 +27,6 @@ void closeNbody()
}
/*
 * ppForce: contribution of one source point (jpos, jmass) evaluated at ipos.
 *
 * With dr = jpos - ipos and r = |dr|, the returned real4 holds
 *   x, y, z : -jmass * dr / r^3
 *   w       : -jmass / r
 * When r2 is not strictly positive (e.g. ipos == jpos), rinv is forced to 0,
 * so all four components come out as zero instead of dividing by zero.
 *
 * NOTE(review): presumably x/y/z feed an acceleration accumulator and w a
 * potential accumulator -- confirm against the caller and its sign
 * convention.
 */
static inline
real4 ppForce(real3 ipos, real3 jpos, real jmass)
{
// Separation vector from i to j, and its squared length.
const real3 dr = jpos - ipos;
const real r2 = dr.x*dr.x + dr.y*dr.y + dr.z*dr.z;
// rsqrt is taken to be the reciprocal square root (1/sqrt(r2)) -- confirm
// against the ISPC standard library; the r2 > 0 guard skips the
// zero-distance case.
const real rinv = r2 > 0 ? rsqrt(r2) : 0;
// jmass / r
const real mrinv = jmass * rinv;
// jmass / r^3
const real mrinv3 = mrinv * rinv*rinv;
real4 acc;
// Vector part: each component of dr scaled by -jmass / r^3.
acc.x = -mrinv3 * dr.x;
acc.y = -mrinv3 * dr.y;
acc.z = -mrinv3 * dr.z;
// Scalar part: -jmass / r.
acc.w = -mrinv;
return acc;
}
task
void computeForces(
uniform int nbodies,
@@ -61,39 +41,75 @@ void computeForces(
const uniform int blockEnd = min(blockBeg + blockDim, nbodies);
#if 0
real gpotLoc = 0;
uniform real gpotLoc = 0;
for (uniform int i = blockBeg; i < blockEnd; i++)
{
const real3 ipos = {posx[i], posy[i], posz[i]};
real4 iacc = 0;
const real iposx = posx[i];
const real iposy = posy[i];
const real iposz = posz[i];
real iaccx = 0;
real iaccy = 0;
real iaccz = 0;
real igpot = 0;
foreach (j = 0 ... nbodies)
{
const real3 jpos = {posx[j], posy[j], posz[j]};
const real jposx = posx[j];
const real jposy = posy[j];
const real jposz = posz[j];
const real jmass = mass[j];
iacc += ppForce(ipos, jpos, jmass);
const real dx = jposx - iposx;
const real dy = jposy - iposy;
const real dz = jposz - iposz;
const real r2 = dx*dx + dy*dy + dz*dz;
const real rinv = r2 > 0.0d ? rsqrt((float)r2) : 0;
const real mrinv = -jmass * rinv;
const real mrinv3 = mrinv * rinv*rinv;
iaccx += mrinv3 * dx;
iaccy += mrinv3 * dy;
iaccz += mrinv3 * dz;
igpot += mrinv;
}
accx[i] = reduce_add(iacc.x);
accy[i] = reduce_add(iacc.y);
accz[i] = reduce_add(iacc.z);
gpotLoc += reduce_add(iacc.w);
accx[i] = reduce_add(iaccx);
accy[i] = reduce_add(iaccy);
accz[i] = reduce_add(iaccz);
gpotLoc += reduce_add(igpot);
}
atomic_add_global(&gpot, gpotLoc);
gpotList[taskIndex] = gpotLoc;
#else
real gpotLoc = 0;
foreach (i = blockBeg ... blockEnd)
{
const real3 ipos = {posx[i], posy[i], posz[i]};
real4 iacc = 0;
const real iposx = posx[i];
const real iposy = posy[i];
const real iposz = posz[i];
real iaccx = 0;
real iaccy = 0;
real iaccz = 0;
real igpot = 0;
for (uniform int j = 0; j < nbodies; j++)
{
const real3 jpos = {posx[j], posy[j], posz[j]};
const real jposx = posx[j];
const real jposy = posy[j];
const real jposz = posz[j];
const real jmass = mass[j];
iacc += ppForce(ipos, jpos, jmass);
const real dx = jposx - iposx;
const real dy = jposy - iposy;
const real dz = jposz - iposz;
const real r2 = dx*dx + dy*dy + dz*dz;
const real rinv = r2 > 0.0d ? rsqrt((float)r2) : 0;
const real mrinv = -jmass * rinv;
const real mrinv3 = mrinv * rinv*rinv;
iaccx += mrinv3 * dx;
iaccy += mrinv3 * dy;
iaccz += mrinv3 * dz;
igpot += mrinv;
}
accx[i] = iacc.x;
accy[i] = iacc.y;
accz[i] = iacc.z;
gpotLoc += iacc.w;
accx[i] = iaccx;
accy[i] = iaccy;
accz[i] = iaccz;
gpotLoc += igpot;
}
gpotList[taskIndex] = reduce_add(gpotLoc);
#endif