+1
This commit is contained in:
@@ -22,7 +22,7 @@ endif
|
|||||||
|
|
||||||
#
|
#
|
||||||
ISPC=ispc
|
ISPC=ispc
|
||||||
ISPC_FLAGS=-O3 --math-lib=default --target=nvptx --opt=fast-math
|
ISPC_FLAGS=-O3 --math-lib=fast --target=nvptx --opt=fast-math
|
||||||
#
|
#
|
||||||
#
|
#
|
||||||
#
|
#
|
||||||
|
|||||||
@@ -43,7 +43,7 @@ int main (int argc, char *argv[])
|
|||||||
ispcSetMallocHeapLimit(1024*1024*1024);
|
ispcSetMallocHeapLimit(1024*1024*1024);
|
||||||
ispc::openNbody(n);
|
ispc::openNbody(n);
|
||||||
|
|
||||||
const int nSteps = 10;
|
const int nSteps = 1;
|
||||||
const real dt = 0;
|
const real dt = 0;
|
||||||
tISPC2 = 1e30;
|
tISPC2 = 1e30;
|
||||||
for (i = 0; i < m; i ++)
|
for (i = 0; i < m; i ++)
|
||||||
@@ -57,7 +57,7 @@ int main (int argc, char *argv[])
|
|||||||
tISPC2 = get_elapsed_msec();
|
tISPC2 = get_elapsed_msec();
|
||||||
fprintf(stderr, " %d iterations took %g sec; perf= %g GFlops\n",
|
fprintf(stderr, " %d iterations took %g sec; perf= %g GFlops\n",
|
||||||
nSteps, tISPC2/1e3,
|
nSteps, tISPC2/1e3,
|
||||||
nSteps * 22.0*n*n/(tISPC2/1e3)/1e9);
|
nSteps * 20.0*n*n/(tISPC2/1e3)/1e9);
|
||||||
}
|
}
|
||||||
|
|
||||||
ispc::closeNbody();
|
ispc::closeNbody();
|
||||||
|
|||||||
@@ -54,44 +54,6 @@ void computeForces(
|
|||||||
const uniform int blkBeg = blkIdx * blkDim;
|
const uniform int blkBeg = blkIdx * blkDim;
|
||||||
const uniform int blkEnd = min(blkBeg + blkDim, nbodies);
|
const uniform int blkEnd = min(blkBeg + blkDim, nbodies);
|
||||||
|
|
||||||
#if 0
|
|
||||||
uniform real gpotLoc = 0;
|
|
||||||
for (uniform int i = blkBeg; i < blkEnd; i++)
|
|
||||||
{
|
|
||||||
const real iposx = posx[i];
|
|
||||||
const real iposy = posy[i];
|
|
||||||
const real iposz = posz[i];
|
|
||||||
real iaccx = 0;
|
|
||||||
real iaccy = 0;
|
|
||||||
real iaccz = 0;
|
|
||||||
real igpot = 0;
|
|
||||||
foreach (j = 0 ... nbodies)
|
|
||||||
{
|
|
||||||
const real jposx = posx[j];
|
|
||||||
const real jposy = posy[j];
|
|
||||||
const real jposz = posz[j];
|
|
||||||
const real jmass = mass[j];
|
|
||||||
const real dx = jposx - iposx;
|
|
||||||
const real dy = jposy - iposy;
|
|
||||||
const real dz = jposz - iposz;
|
|
||||||
const real r2 = dx*dx + dy*dy + dz*dz;
|
|
||||||
const real rinv = r2 > 0.0d ? rsqrt((float)r2) : 0;
|
|
||||||
const real mrinv = -jmass * rinv;
|
|
||||||
const real mrinv3 = mrinv * rinv*rinv;
|
|
||||||
|
|
||||||
iaccx += mrinv3 * dx;
|
|
||||||
iaccy += mrinv3 * dy;
|
|
||||||
iaccz += mrinv3 * dz;
|
|
||||||
igpot += mrinv;
|
|
||||||
}
|
|
||||||
accx[i] = reduce_add(iaccx);
|
|
||||||
accy[i] = reduce_add(iaccy);
|
|
||||||
accz[i] = reduce_add(iaccz);
|
|
||||||
gpotLoc += reduce_add(igpot);
|
|
||||||
}
|
|
||||||
gpotList[taskIndex] = gpotLoc;
|
|
||||||
#else
|
|
||||||
real gpotLoc = 0;
|
|
||||||
for (int i = programIndex + blkBeg; i < blkEnd; i += programCount)
|
for (int i = programIndex + blkBeg; i < blkEnd; i += programCount)
|
||||||
if (i < blkEnd)
|
if (i < blkEnd)
|
||||||
{
|
{
|
||||||
@@ -102,6 +64,7 @@ void computeForces(
|
|||||||
real iaccy = 0;
|
real iaccy = 0;
|
||||||
real iaccz = 0;
|
real iaccz = 0;
|
||||||
real igpot = 0;
|
real igpot = 0;
|
||||||
|
#if 0
|
||||||
for (uniform int j = 0; j < nbodies; j++)
|
for (uniform int j = 0; j < nbodies; j++)
|
||||||
{
|
{
|
||||||
const real jposx = posx[j];
|
const real jposx = posx[j];
|
||||||
@@ -112,7 +75,7 @@ void computeForces(
|
|||||||
const real dy = jposy - iposy;
|
const real dy = jposy - iposy;
|
||||||
const real dz = jposz - iposz;
|
const real dz = jposz - iposz;
|
||||||
const real r2 = dx*dx + dy*dy + dz*dz;
|
const real r2 = dx*dx + dy*dy + dz*dz;
|
||||||
const real rinv = r2 > 0.0 ? rsqrt((float)r2) : 0;
|
const real rinv = r2; // > 0.0 ? rsqrt((float)r2) : 0;
|
||||||
const real mrinv = -jmass * rinv;
|
const real mrinv = -jmass * rinv;
|
||||||
const real mrinv3 = mrinv * rinv*rinv;
|
const real mrinv3 = mrinv * rinv*rinv;
|
||||||
iaccx += mrinv3 * dx;
|
iaccx += mrinv3 * dx;
|
||||||
@@ -120,13 +83,41 @@ void computeForces(
|
|||||||
iaccz += mrinv3 * dz;
|
iaccz += mrinv3 * dz;
|
||||||
igpot += mrinv;
|
igpot += mrinv;
|
||||||
}
|
}
|
||||||
|
#else
|
||||||
|
for (uniform int j = 0; j < nbodies; j += programCount)
|
||||||
|
{
|
||||||
|
__shared__ real shdata[4][programCount*4];
|
||||||
|
real (* shmem)[programCount] = (real (*)[programCount])shdata[warpIdx];
|
||||||
|
shmem[0][programIndex] = posx[j+programIndex];
|
||||||
|
shmem[1][programIndex] = posy[j+programIndex];
|
||||||
|
shmem[2][programIndex] = posz[j+programIndex];
|
||||||
|
shmem[3][programIndex] = mass[j+programIndex];
|
||||||
|
|
||||||
|
#pragma unroll 1
|
||||||
|
for (int jb = 0; jb < programCount; jb++)
|
||||||
|
{
|
||||||
|
const real jposx = shmem[0][jb];
|
||||||
|
const real jposy = shmem[1][jb];
|
||||||
|
const real jposz = shmem[2][jb];
|
||||||
|
const real jmass = shmem[3][jb];
|
||||||
|
const real dx = jposx - iposx;
|
||||||
|
const real dy = jposy - iposy;
|
||||||
|
const real dz = jposz - iposz;
|
||||||
|
const real r2 = dx*dx + dy*dy + dz*dz;
|
||||||
|
const real rinv = r2 ; //> 0.0 ? rsqrt((float)r2) : 0;
|
||||||
|
const real mrinv = -jmass * rinv;
|
||||||
|
const real mrinv3 = mrinv * rinv*rinv;
|
||||||
|
iaccx += mrinv3 * dx;
|
||||||
|
iaccy += mrinv3 * dy;
|
||||||
|
iaccz += mrinv3 * dz;
|
||||||
|
igpot += mrinv;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
accx[i] = iaccx;
|
accx[i] = iaccx;
|
||||||
accy[i] = iaccy;
|
accy[i] = iaccy;
|
||||||
accz[i] = iaccz;
|
accz[i] = iaccz;
|
||||||
gpotLoc += igpot;
|
|
||||||
}
|
}
|
||||||
// gpotList[taskIndex] = reduce_add(gpotLoc);
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
__global__
|
__global__
|
||||||
@@ -191,17 +182,16 @@ void nbodyIntegrate___export(
|
|||||||
uniform real energies[])
|
uniform real energies[])
|
||||||
{
|
{
|
||||||
uniform int nTasks ;
|
uniform int nTasks ;
|
||||||
nTasks = nbodies/(4*programCount);
|
nTasks = (nbodies+1*programCount - 1)/(1*programCount);
|
||||||
assert((nbodies % nTasks) == 0);
|
|
||||||
|
|
||||||
for (uniform int step = 0; step < nSteps; step++)
|
for (uniform int step = 0; step < nSteps; step++)
|
||||||
{
|
{
|
||||||
launch (nTasks,1,1, updatePositions)(nbodies, posx, posy, posz, velx, vely, velz,dt);
|
// launch (nTasks,1,1, updatePositions)(nbodies, posx, posy, posz, velx, vely, velz,dt);
|
||||||
sync;
|
// sync;
|
||||||
launch (nTasks,1,1, computeForces)(nbodies, posx, posy, posz, mass);
|
launch (nTasks,1,1, computeForces)(nbodies, posx, posy, posz, mass);
|
||||||
sync;
|
sync;
|
||||||
launch (nTasks,1,1, updateVelocities)(nbodies, posx, posy, posz, dt);
|
// launch (nTasks,1,1, updateVelocities)(nbodies, posx, posy, posz, dt);
|
||||||
sync;
|
//sync;
|
||||||
}
|
}
|
||||||
|
|
||||||
#if 0
|
#if 0
|
||||||
|
|||||||
@@ -26,6 +26,8 @@ void closeNbody()
|
|||||||
delete gpotList;
|
delete gpotList;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
uniform int nn = programCount;
|
||||||
|
|
||||||
|
|
||||||
task
|
task
|
||||||
void computeForces(
|
void computeForces(
|
||||||
@@ -39,10 +41,10 @@ void computeForces(
|
|||||||
const uniform int blockDim = (nbodies + taskCount - 1)/taskCount;
|
const uniform int blockDim = (nbodies + taskCount - 1)/taskCount;
|
||||||
const uniform int blockBeg = blockIdx * blockDim;
|
const uniform int blockBeg = blockIdx * blockDim;
|
||||||
const uniform int blockEnd = min(blockBeg + blockDim, nbodies);
|
const uniform int blockEnd = min(blockBeg + blockDim, nbodies);
|
||||||
|
uniform real shmem[4*programCount];
|
||||||
|
|
||||||
#if 0
|
//real gpotLoc = 0;
|
||||||
uniform real gpotLoc = 0;
|
foreach (i = blockBeg ... blockEnd)
|
||||||
for (uniform int i = blockBeg; i < blockEnd; i++)
|
|
||||||
{
|
{
|
||||||
const real iposx = posx[i];
|
const real iposx = posx[i];
|
||||||
const real iposy = posy[i];
|
const real iposy = posy[i];
|
||||||
@@ -51,7 +53,8 @@ void computeForces(
|
|||||||
real iaccy = 0;
|
real iaccy = 0;
|
||||||
real iaccz = 0;
|
real iaccz = 0;
|
||||||
real igpot = 0;
|
real igpot = 0;
|
||||||
foreach (j = 0 ... nbodies)
|
#if 0
|
||||||
|
for (uniform int j = 0; j < nbodies; j++)
|
||||||
{
|
{
|
||||||
const real jposx = posx[j];
|
const real jposx = posx[j];
|
||||||
const real jposy = posy[j];
|
const real jposy = posy[j];
|
||||||
@@ -70,52 +73,40 @@ void computeForces(
|
|||||||
iaccz += mrinv3 * dz;
|
iaccz += mrinv3 * dz;
|
||||||
igpot += mrinv;
|
igpot += mrinv;
|
||||||
}
|
}
|
||||||
accx[i] = reduce_add(iaccx);
|
|
||||||
accy[i] = reduce_add(iaccy);
|
|
||||||
accz[i] = reduce_add(iaccz);
|
|
||||||
gpotLoc += reduce_add(igpot);
|
|
||||||
}
|
|
||||||
gpotList[taskIndex] = gpotLoc;
|
|
||||||
#else
|
#else
|
||||||
real gpotLoc = 0;
|
for (uniform int j = 0; j < nbodies; j += programCount)
|
||||||
foreach (i = blockBeg ... blockEnd)
|
|
||||||
{
|
{
|
||||||
const real iposx = posx[i];
|
shmem[0*programCount + programIndex] = posx[j+programIndex];
|
||||||
const real iposy = posy[i];
|
shmem[1*programCount + programIndex] = posy[j+programIndex];
|
||||||
const real iposz = posz[i];
|
shmem[2*programCount + programIndex] = posz[j+programIndex];
|
||||||
real iaccx = 0;
|
shmem[3*programCount + programIndex] = mass[j+programIndex];
|
||||||
real iaccy = 0;
|
for (uniform int jb = 0; jb < programCount; jb++)
|
||||||
real iaccz = 0;
|
|
||||||
real igpot = 0;
|
|
||||||
for (uniform int j = 0; j < nbodies; j += 1)
|
|
||||||
{
|
{
|
||||||
#define STEP(jk) {\
|
const real jposx = shmem[0*programCount + jb];
|
||||||
const real jposx = posx[j+jk]; \
|
const real jposy = shmem[1*programCount + jb];
|
||||||
const real jposy = posy[j+jk]; \
|
const real jposz = shmem[2*programCount + jb];
|
||||||
const real jposz = posz[j+jk]; \
|
const real jmass = shmem[3*programCount + jb];
|
||||||
const real jmass = mass[j+jk]; \
|
const real dx = jposx - iposx;
|
||||||
const real dx = jposx - iposx; \
|
const real dy = jposy - iposy;
|
||||||
const real dy = jposy - iposy; \
|
const real dz = jposz - iposz;
|
||||||
const real dz = jposz - iposz; \
|
const real r2 = dx*dx + dy*dy + dz*dz;
|
||||||
const real r2 = dx*dx + dy*dy + dz*dz; \
|
const real rinv = r2; // > 0.0d ? rsqrt((float)r2) : 0;
|
||||||
const real rinv = r2 > 0.0d ? rsqrt((float)r2) : 0; \
|
const real mrinv = -jmass * rinv;
|
||||||
const real mrinv = -jmass * rinv; \
|
const real mrinv3 = mrinv * rinv*rinv;
|
||||||
const real mrinv3 = mrinv * rinv*rinv; \
|
|
||||||
\
|
iaccx += mrinv3 * dx;
|
||||||
iaccx += mrinv3 * dx; \
|
iaccy += mrinv3 * dy;
|
||||||
iaccy += mrinv3 * dy; \
|
iaccz += mrinv3 * dz;
|
||||||
iaccz += mrinv3 * dz; \
|
igpot += mrinv;
|
||||||
igpot += mrinv; \
|
|
||||||
}
|
|
||||||
STEP(0)
|
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
accx[i] = iaccx;
|
accx[i] = iaccx;
|
||||||
accy[i] = iaccy;
|
accy[i] = iaccy;
|
||||||
accz[i] = iaccz;
|
accz[i] = iaccz;
|
||||||
gpotLoc += igpot;
|
// gpotLoc += igpot;
|
||||||
}
|
}
|
||||||
gpotList[taskIndex] = reduce_add(gpotLoc);
|
// gpotList[taskIndex] = reduce_add(gpotLoc);
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
task
|
task
|
||||||
@@ -179,20 +170,21 @@ void nbodyIntegrate(
|
|||||||
{
|
{
|
||||||
uniform int nTasks = num_cores()*4;
|
uniform int nTasks = num_cores()*4;
|
||||||
#ifdef __NVPTX__
|
#ifdef __NVPTX__
|
||||||
nTasks = nbodies/(4*programCount);
|
nTasks = (nbodies + 4*programCount - 1)/(4*programCount);
|
||||||
#endif
|
#endif
|
||||||
assert((nbodies % nTasks) == 0);
|
assert((nbodies % nTasks) == 0);
|
||||||
|
|
||||||
for (uniform int step = 0; step < nSteps; step++)
|
for (uniform int step = 0; step < nSteps; step++)
|
||||||
{
|
{
|
||||||
launch [nTasks] updatePositions(nbodies, posx, posy, posz, velx, vely, velz,dt);
|
// launch [nTasks] updatePositions(nbodies, posx, posy, posz, velx, vely, velz,dt);
|
||||||
sync;
|
// sync;
|
||||||
launch [nTasks] computeForces(nbodies, posx, posy, posz, mass);
|
launch [nTasks] computeForces(nbodies, posx, posy, posz, mass);
|
||||||
sync;
|
sync;
|
||||||
launch [nTasks] updateVelocities(nbodies, posx, posy, posz, dt);
|
// launch [nTasks] updateVelocities(nbodies, posx, posy, posz, dt);
|
||||||
sync;
|
// sync;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if 0
|
||||||
if (energies != NULL)
|
if (energies != NULL)
|
||||||
{
|
{
|
||||||
real gpotLoc = 0;
|
real gpotLoc = 0;
|
||||||
@@ -200,6 +192,7 @@ void nbodyIntegrate(
|
|||||||
gpotLoc += gpotList[i];
|
gpotLoc += gpotList[i];
|
||||||
energies[0] = reduce_add(gpotLoc);
|
energies[0] = reduce_add(gpotLoc);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user