From eb3277587aa3163e1023f44668fd9603e0b5886a Mon Sep 17 00:00:00 2001 From: Evghenii Date: Thu, 30 Jan 2014 20:01:34 +0100 Subject: [PATCH] +1 --- examples_ptx/common_gpu.mk | 2 +- examples_ptx/nbody/nbody.cpp | 4 +- examples_ptx/nbody/nbody.cu | 94 ++++++++++++------------- examples_ptx/nbody/nbody.ispc | 125 ++++++++++++++++------------------ 4 files changed, 104 insertions(+), 121 deletions(-) diff --git a/examples_ptx/common_gpu.mk b/examples_ptx/common_gpu.mk index 8173e437..41997d93 100644 --- a/examples_ptx/common_gpu.mk +++ b/examples_ptx/common_gpu.mk @@ -22,7 +22,7 @@ endif # ISPC=ispc -ISPC_FLAGS=-O3 --math-lib=default --target=nvptx --opt=fast-math +ISPC_FLAGS=-O3 --math-lib=fast --target=nvptx --opt=fast-math # # # diff --git a/examples_ptx/nbody/nbody.cpp b/examples_ptx/nbody/nbody.cpp index 2c3b4201..ee5dffbf 100644 --- a/examples_ptx/nbody/nbody.cpp +++ b/examples_ptx/nbody/nbody.cpp @@ -43,7 +43,7 @@ int main (int argc, char *argv[]) ispcSetMallocHeapLimit(1024*1024*1024); ispc::openNbody(n); - const int nSteps = 10; + const int nSteps = 1; const real dt = 0; tISPC2 = 1e30; for (i = 0; i < m; i ++) @@ -57,7 +57,7 @@ int main (int argc, char *argv[]) tISPC2 = get_elapsed_msec(); fprintf(stderr, " %d iterations took %g sec; perf= %g GFlops\n", nSteps, tISPC2/1e3, - nSteps * 22.0*n*n/(tISPC2/1e3)/1e9); + nSteps * 20.0*n*n/(tISPC2/1e3)/1e9); } ispc::closeNbody(); diff --git a/examples_ptx/nbody/nbody.cu b/examples_ptx/nbody/nbody.cu index 6fdbfaae..901c465d 100644 --- a/examples_ptx/nbody/nbody.cu +++ b/examples_ptx/nbody/nbody.cu @@ -54,44 +54,6 @@ void computeForces( const uniform int blkBeg = blkIdx * blkDim; const uniform int blkEnd = min(blkBeg + blkDim, nbodies); -#if 0 - uniform real gpotLoc = 0; - for (uniform int i = blkBeg; i < blkEnd; i++) - { - const real iposx = posx[i]; - const real iposy = posy[i]; - const real iposz = posz[i]; - real iaccx = 0; - real iaccy = 0; - real iaccz = 0; - real igpot = 0; - foreach (j = 0 ... nbodies) - { - const real jposx = posx[j]; - const real jposy = posy[j]; - const real jposz = posz[j]; - const real jmass = mass[j]; - const real dx = jposx - iposx; - const real dy = jposy - iposy; - const real dz = jposz - iposz; - const real r2 = dx*dx + dy*dy + dz*dz; - const real rinv = r2 > 0.0d ? rsqrt((float)r2) : 0; - const real mrinv = -jmass * rinv; - const real mrinv3 = mrinv * rinv*rinv; - - iaccx += mrinv3 * dx; - iaccy += mrinv3 * dy; - iaccz += mrinv3 * dz; - igpot += mrinv; - } - accx[i] = reduce_add(iaccx); - accy[i] = reduce_add(iaccy); - accz[i] = reduce_add(iaccz); - gpotLoc += reduce_add(igpot); - } - gpotList[taskIndex] = gpotLoc; -#else - real gpotLoc = 0; for (int i = programIndex + blkBeg; i < blkEnd; i += programCount) if (i < blkEnd) { @@ -102,17 +64,18 @@ void computeForces( real iaccy = 0; real iaccz = 0; real igpot = 0; +#if 0 for (uniform int j = 0; j < nbodies; j++) { - const real jposx = posx[j]; - const real jposy = posy[j]; - const real jposz = posz[j]; - const real jmass = mass[j]; + const real jposx = posx[j]; + const real jposy = posy[j]; + const real jposz = posz[j]; + const real jmass = mass[j]; const real dx = jposx - iposx; const real dy = jposy - iposy; const real dz = jposz - iposz; const real r2 = dx*dx + dy*dy + dz*dz; - const real rinv = r2 > 0.0 ? rsqrt((float)r2) : 0; + const real rinv = r2; // > 0.0 ? rsqrt((float)r2) : 0; const real mrinv = -jmass * rinv; const real mrinv3 = mrinv * rinv*rinv; iaccx += mrinv3 * dx; @@ -120,13 +83,41 @@ void computeForces( iaccz += mrinv3 * dz; igpot += mrinv; } +#else + for (uniform int j = 0; j < nbodies; j += programCount) + { + __shared__ real shdata[4][programCount*4]; + real (* shmem)[programCount] = (real (*)[programCount])shdata[warpIdx]; + shmem[0][programIndex] = posx[j+programIndex]; + shmem[1][programIndex] = posy[j+programIndex]; + shmem[2][programIndex] = posz[j+programIndex]; + shmem[3][programIndex] = mass[j+programIndex]; + +#pragma unroll 1 + for (int jb = 0; jb < programCount; jb++) + { + const real jposx = shmem[0][jb]; + const real jposy = shmem[1][jb]; + const real jposz = shmem[2][jb]; + const real jmass = shmem[3][jb]; + const real dx = jposx - iposx; + const real dy = jposy - iposy; + const real dz = jposz - iposz; + const real r2 = dx*dx + dy*dy + dz*dz; + const real rinv = r2 ; //> 0.0 ? rsqrt((float)r2) : 0; + const real mrinv = -jmass * rinv; + const real mrinv3 = mrinv * rinv*rinv; + iaccx += mrinv3 * dx; + iaccy += mrinv3 * dy; + iaccz += mrinv3 * dz; + igpot += mrinv; + } + } +#endif accx[i] = iaccx; accy[i] = iaccy; accz[i] = iaccz; - gpotLoc += igpot; } -// gpotList[taskIndex] = reduce_add(gpotLoc); -#endif } __global__ @@ -191,17 +182,16 @@ void nbodyIntegrate___export( uniform real energies[]) { uniform int nTasks ; - nTasks = nbodies/(4*programCount); - assert((nbodies % nTasks) == 0); + nTasks = (nbodies+1*programCount - 1)/(1*programCount); for (uniform int step = 0; step < nSteps; step++) { - launch (nTasks,1,1, updatePositions)(nbodies, posx, posy, posz, velx, vely, velz,dt); - sync; + // launch (nTasks,1,1, updatePositions)(nbodies, posx, posy, posz, velx, vely, velz,dt); + // sync; launch (nTasks,1,1, computeForces)(nbodies, posx, posy, posz, mass); sync; - launch (nTasks,1,1, updateVelocities)(nbodies, posx, posy, posz, dt); - sync; + // launch (nTasks,1,1, updateVelocities)(nbodies, posx, posy, posz, dt); + //sync; } #if 0 diff --git a/examples_ptx/nbody/nbody.ispc b/examples_ptx/nbody/nbody.ispc index 38899916..6c88708b 100644 --- a/examples_ptx/nbody/nbody.ispc +++ b/examples_ptx/nbody/nbody.ispc @@ -26,6 +26,8 @@ void closeNbody() delete gpotList; } +uniform int nn = programCount; + task void computeForces( @@ -39,45 +41,9 @@ void computeForces( const uniform int blockDim = (nbodies + taskCount - 1)/taskCount; const uniform int blockBeg = blockIdx * blockDim; const uniform int blockEnd = min(blockBeg + blockDim, nbodies); + uniform real shmem[4*programCount]; -#if 0 - uniform real gpotLoc = 0; - for (uniform int i = blockBeg; i < blockEnd; i++) - { - const real iposx = posx[i]; - const real iposy = posy[i]; - const real iposz = posz[i]; - real iaccx = 0; - real iaccy = 0; - real iaccz = 0; - real igpot = 0; - foreach (j = 0 ... nbodies) - { - const real jposx = posx[j]; - const real jposy = posy[j]; - const real jposz = posz[j]; - const real jmass = mass[j]; - const real dx = jposx - iposx; - const real dy = jposy - iposy; - const real dz = jposz - iposz; - const real r2 = dx*dx + dy*dy + dz*dz; - const real rinv = r2 > 0.0d ? rsqrt((float)r2) : 0; - const real mrinv = -jmass * rinv; - const real mrinv3 = mrinv * rinv*rinv; - - iaccx += mrinv3 * dx; - iaccy += mrinv3 * dy; - iaccz += mrinv3 * dz; - igpot += mrinv; - } - accx[i] = reduce_add(iaccx); - accy[i] = reduce_add(iaccy); - accz[i] = reduce_add(iaccz); - gpotLoc += reduce_add(igpot); - } - gpotList[taskIndex] = gpotLoc; -#else - real gpotLoc = 0; + //real gpotLoc = 0; foreach (i = blockBeg ... blockEnd) { const real iposx = posx[i]; @@ -87,35 +53,60 @@ void computeForces( real iaccy = 0; real iaccz = 0; real igpot = 0; - for (uniform int j = 0; j < nbodies; j += 1) +#if 0 + for (uniform int j = 0; j < nbodies; j++) { -#define STEP(jk) {\ - const real jposx = posx[j+jk]; \ - const real jposy = posy[j+jk]; \ - const real jposz = posz[j+jk]; \ - const real jmass = mass[j+jk]; \ - const real dx = jposx - iposx; \ - const real dy = jposy - iposy; \ - const real dz = jposz - iposz; \ - const real r2 = dx*dx + dy*dy + dz*dz; \ - const real rinv = r2 > 0.0d ? rsqrt((float)r2) : 0; \ - const real mrinv = -jmass * rinv; \ - const real mrinv3 = mrinv * rinv*rinv; \ - \ - iaccx += mrinv3 * dx; \ - iaccy += mrinv3 * dy; \ - iaccz += mrinv3 * dz; \ - igpot += mrinv; \ -} - STEP(0) + const real jposx = posx[j]; + const real jposy = posy[j]; + const real jposz = posz[j]; + const real jmass = mass[j]; + const real dx = jposx - iposx; + const real dy = jposy - iposy; + const real dz = jposz - iposz; + const real r2 = dx*dx + dy*dy + dz*dz; + const real rinv = r2 > 0.0d ? rsqrt((float)r2) : 0; + const real mrinv = -jmass * rinv; + const real mrinv3 = mrinv * rinv*rinv; + + iaccx += mrinv3 * dx; + iaccy += mrinv3 * dy; + iaccz += mrinv3 * dz; + igpot += mrinv; } +#else + for (uniform int j = 0; j < nbodies; j += programCount) + { + shmem[0*programCount + programIndex] = posx[j+programIndex]; + shmem[1*programCount + programIndex] = posy[j+programIndex]; + shmem[2*programCount + programIndex] = posz[j+programIndex]; + shmem[3*programCount + programIndex] = mass[j+programIndex]; + for (uniform int jb = 0; jb < programCount; jb++) + { + const real jposx = shmem[0*programCount + jb]; + const real jposy = shmem[1*programCount + jb]; + const real jposz = shmem[2*programCount + jb]; + const real jmass = shmem[3*programCount + jb]; + const real dx = jposx - iposx; + const real dy = jposy - iposy; + const real dz = jposz - iposz; + const real r2 = dx*dx + dy*dy + dz*dz; + const real rinv = r2; // > 0.0d ? rsqrt((float)r2) : 0; + const real mrinv = -jmass * rinv; + const real mrinv3 = mrinv * rinv*rinv; + + iaccx += mrinv3 * dx; + iaccy += mrinv3 * dy; + iaccz += mrinv3 * dz; + igpot += mrinv; + } + } +#endif accx[i] = iaccx; accy[i] = iaccy; accz[i] = iaccz; - gpotLoc += igpot; +// gpotLoc += igpot; } - gpotList[taskIndex] = reduce_add(gpotLoc); -#endif +// gpotList[taskIndex] = reduce_add(gpotLoc); } task @@ -179,20 +170,21 @@ void nbodyIntegrate( { uniform int nTasks = num_cores()*4; #ifdef __NVPTX__ - nTasks = nbodies/(4*programCount); + nTasks = (nbodies + 4*programCount - 1)/(4*programCount); #endif assert((nbodies % nTasks) == 0); for (uniform int step = 0; step < nSteps; step++) { - launch [nTasks] updatePositions(nbodies, posx, posy, posz, velx, vely, velz,dt); - sync; +// launch [nTasks] updatePositions(nbodies, posx, posy, posz, velx, vely, velz,dt); + // sync; launch [nTasks] computeForces(nbodies, posx, posy, posz, mass); sync; - launch [nTasks] updateVelocities(nbodies, posx, posy, posz, dt); - sync; + // launch [nTasks] updateVelocities(nbodies, posx, posy, posz, dt); + // sync; } +#if 0 if (energies != NULL) { real gpotLoc = 0; @@ -200,6 +192,7 @@ void nbodyIntegrate( gpotLoc += gpotList[i]; energies[0] = reduce_add(gpotLoc); } +#endif }