diff --git a/examples/portable/nbody_hermite4/Makefile_cpu b/examples/portable/nbody_hermite4/Makefile_cpu new file mode 100644 index 00000000..b3751669 --- /dev/null +++ b/examples/portable/nbody_hermite4/Makefile_cpu @@ -0,0 +1,8 @@ + +EXAMPLE=hermite4 +CPP_SRC=hermite4.cpp +ISPC_SRC=hermite4.ispc +ISPC_IA_TARGETS=avx1-i32x8 +ISPC_ARM_TARGETS=neon + +include ../common_cpu.mk diff --git a/examples/portable/nbody_hermite4/Makefile_ptx b/examples/portable/nbody_hermite4/Makefile_ptx new file mode 100644 index 00000000..da8b268b --- /dev/null +++ b/examples/portable/nbody_hermite4/Makefile_ptx @@ -0,0 +1,14 @@ +PROG=hermite4 +ISPC_SRC=hermite4.ispc +#CU_SRC=hermite4.cu +CXX_SRC=hermite4.cpp +PTXCC_REGMAX=64 +#ISPC_FLAGS= --opt=disable-uniform-control-flow + +#LLVM_GPU=1 +NVVM_GPU=1 + +include ../common_ptx.mk + + + diff --git a/examples/portable/nbody_hermite4/hermite4.cpp b/examples/portable/nbody_hermite4/hermite4.cpp new file mode 100644 index 00000000..ed7ff3b1 --- /dev/null +++ b/examples/portable/nbody_hermite4/hermite4.cpp @@ -0,0 +1,323 @@ +/* Hermite4 N-body integrator */ +/* Makino and Aarseth, 1992 */ +/* http://adsabs.harvard.edu/abs/1992PASJ...44..141M and references there in*/ + +#include +#include +#include +#include +#include +#include + +#include "timing.h" +#include "ispc_malloc.h" + +#include "typeReal.h" +#include "hermite4_ispc.h" + +struct Hermite4 +{ + enum {PP_FLOP=44}; + const int n; + const real eta; + real *g_mass, *g_gpot; + real *g_posx, *g_posy, *g_posz; + real *g_velx, *g_vely, *g_velz; + real *g_accx, *g_accy, *g_accz; + real *g_jrkx, *g_jrky, *g_jrkz; + + std::vector accx0, accy0, accz0; + std::vector jrkx0, jrky0, jrkz0; + + Hermite4(const int _n = 8192, const real _eta = 0.1) : n(_n), eta(_eta) + { + g_mass = new real[n]; + g_gpot = new real[n]; + g_posx = new real[n]; + g_posy = new real[n]; + g_posz = new real[n]; + g_velx = new real[n]; + g_vely = new real[n]; + g_velz = new real[n]; + g_accx = new real[n]; + g_accy = new real[n]; + g_accz = new real[n]; + g_jrkx = new real[n]; + g_jrky = new real[n]; + g_jrkz = new real[n]; + + accx0.resize(n); + accy0.resize(n); + accz0.resize(n); + jrkx0.resize(n); + jrky0.resize(n); + jrkz0.resize(n); + + printf("---Intializing nbody--- \n"); + + const real R0 = 1; + const real mp = 1.0/n; +#pragma omp parallel for schedule(runtime) + for (int i = 0; i < n; i++) + { + real xp, yp, zp, s2 = 2*R0; + real vx, vy, vz; + while (s2 > R0*R0) { + xp = (1.0 - 2.0*drand48())*R0; + yp = (1.0 - 2.0*drand48())*R0; + zp = (1.0 - 2.0*drand48())*R0; + s2 = xp*xp + yp*yp + zp*zp; + vx = drand48() * 0.1; + vy = drand48() * 0.1; + vz = drand48() * 0.1; + } + g_posx[i] = xp; + g_posy[i] = yp; + g_posz[i] = zp; + g_velx[i] = vx; + g_vely[i] = vy; + g_velz[i] = vz; + g_mass[i] = mp; + } + } + + ~Hermite4() + { + delete g_mass; + delete g_gpot; + delete g_posx; + delete g_posy; + delete g_posz; + delete g_velx; + delete g_vely; + delete g_velz; + delete g_accx; + delete g_accy; + delete g_accz; + delete g_jrkx; + delete g_jrky; + delete g_jrkz; + } + + void forces(); + + real step(const real dt) + { + const real dt2 = dt*real(1.0/2.0); + const real dt3 = dt*real(1.0/3.0); + + real dt_min = HUGE; + +#pragma omp parallel for schedule(runtime) + for (int i = 0; i < n; i++) + { + accx0[i] = g_accx[i]; + accy0[i] = g_accy[i]; + accz0[i] = g_accz[i]; + jrkx0[i] = g_jrkx[i]; + jrky0[i] = g_jrky[i]; + jrkz0[i] = g_jrkz[i]; + + g_posx[i] += dt*(g_velx[i] + dt2*(g_accx[i] + dt3*g_jrkx[i])); + g_posy[i] += dt*(g_vely[i] + dt2*(g_accy[i] + dt3*g_jrky[i])); + g_posz[i] += dt*(g_velz[i] + dt2*(g_accz[i] + dt3*g_jrkz[i])); + + g_velx[i] += dt*(g_accx[i] + dt2*g_jrkx[i]); + g_vely[i] += dt*(g_accy[i] + dt2*g_jrky[i]); + g_velz[i] += dt*(g_accz[i] + dt2*g_jrkz[i]); + } + + forces(); + + if (dt > 0.0) + { + const real h = dt*real(0.5); + const real hinv = real(1.0)/h; + const real f1 = real(0.5)*hinv*hinv; + const real f2 = real(3.0)*hinv*f1; + + const real dt2 = dt *dt * real(1.0/2.0); + const real dt3 = dt2*dt * real(1.0/3.0); + const real dt4 = dt3*dt * real(1.0/4.0); + const real dt5 = dt4*dt * real(1.0/5.0); + +#pragma omp parallel for schedule(runtime) reduction(min:dt_min) + for (int i = 0; i < n; i++) + { + /* compute snp & crk */ + + const real Amx = g_accx[i] - accx0[i]; + const real Amy = g_accy[i] - accy0[i]; + const real Amz = g_accz[i] - accz0[i]; + + const real Jmx = h*(g_jrkx[i] - jrkx0[i]); + const real Jmy = h*(g_jrky[i] - jrky0[i]); + const real Jmz = h*(g_jrkz[i] - jrkz0[i]); + + const real Jpx = h*(g_jrkx[i] + jrkx0[i]); + const real Jpy = h*(g_jrky[i] + jrky0[i]); + const real Jpz = h*(g_jrkz[i] + jrkz0[i]); + + + real snpx = f1*Jmx; + real snpy = f1*Jmy; + real snpz = f1*Jmz; + + real crkx = f2*(Jpx - Amx); + real crky = f2*(Jpy - Amy); + real crkz = f2*(Jpz - Amz); + + snpx -= h*crkx; + snpy -= h*crky; + snpz -= h*crkz; + + /* correct */ + + g_posx[i] += dt4*snpx + dt5*crkx; + g_posy[i] += dt4*snpy + dt5*crky; + g_posz[i] += dt4*snpz + dt5*crkz; + + g_velx[i] += dt3*snpx + dt4*crkx; + g_vely[i] += dt3*snpy + dt4*crky; + g_velz[i] += dt3*snpz + dt4*crkz; + + /* compute new timestep */ + + const real s0 = g_accx[i]*g_accx[i] + g_accy[i]*g_accy[i] + g_accz[i]*g_accz[i]; + const real s1 = g_jrkx[i]*g_jrkx[i] + g_jrky[i]*g_jrky[i] + g_jrkz[i]*g_jrkz[i]; + const real s2 = snpx*snpx + snpy*snpy + snpz*snpz; + const real s3 = crkx*crkx + crky*crky + crkz*crkz; + + const double u = std::sqrt(s0*s2) + s1; + const double l = std::sqrt(s1*s3) + s2; + assert(l > 0.0f); + const real dt_loc = eta *std::sqrt(u/l); + dt_min = std::min(dt_min, dt_loc); + } + } + + if (dt_min == HUGE) + return dt; + else + return dt_min; + } + + void energy(real &Ekin, real &Epot) + { + real ekin = 0, epot = 0; + +#pragma omp parallel for reduction(+:ekin,epot) + for (int i = 0; i < n; i++) + { + ekin += g_mass[i] * (g_velx[i]*g_velx[i] + g_vely[i]*g_vely[i] + g_velz[i]*g_velz[i]) * real(0.5f); + epot += real(0.5f)*g_mass[i] * g_gpot[i]; + } + Ekin = ekin; + Epot = epot; + } + + void integrate(const int niter, const real t_end = HUGE) + { + const double tin = rtc(); + forces(); + const double fn = n; + printf(" mean flop rate in %g sec [%g GFLOP/s]\n", rtc() - tin, + fn*fn*PP_FLOP/(rtc() - tin)/1e9); + + real Epot0, Ekin0; + energy(Ekin0, Epot0); + const real Etot0 = Epot0 + Ekin0; + printf(" E: %g %g %g \n", Epot0, Ekin0, Etot0); + + ///////// + + real t_global = 0; + double t0 = 0; + int iter = 0; + int ntime = 10; + real dt = 1.0/131072; + real Epot, Ekin, Etot = Etot0; + while (t_global < t_end) { + if (iter % ntime == 0) + t0 = rtc(); + + if (iter >= niter) return; + + dt = step(dt); + iter++; + t_global += dt; + + const real Etot_pre = Etot; + energy(Ekin, Epot); + Etot = Ekin + Epot; + + if (iter % 1 == 0) { + const real Etot = Ekin + Epot; + printf("iter= %d: t= %g dt= %g Ekin= %g Epot= %g Etot= %g , dE = %g d(dE)= %g \n", + iter, t_global, dt, Ekin, Epot, Etot, (Etot - Etot0)/std::abs(Etot0), + (Etot - Etot_pre)/std::abs(Etot_pre) ); + } + + if (iter % ntime == 0) { + printf(" mean flop rate in %g sec [%g GFLOP/s]\n", rtc() - t0, + fn*fn*PP_FLOP/(rtc() - t0)/1e9*ntime); + } + + fflush(stdout); + + } + } + +}; + + + +void Hermite4::forces() +{ + ispc::compute_forces( + n, + g_mass, + g_posx, + g_posy, + g_posz, + g_velx, + g_vely, + g_velz, + g_accx, + g_accy, + g_accz, + g_jrkx, + g_jrky, + g_jrkz, + g_gpot); +} + +void run(const int nbodies, const real eta, const int nstep) +{ + Hermite4 h4(nbodies, eta); + h4.integrate(nstep); +} + +int main(int argc, char *argv[]) +{ + printf(" Usage: %s [nbodies=8192] [nsteps=40] [eta=0.1] \n", argv[0]); + + int nbodies = 8192; + if (argc > 1) nbodies = atoi(argv[1]); + + int nstep = 40; + if (argc > 2) nstep = atoi(argv[2]); + + float eta = 0.1; + if (argc > 3) eta = atof(argv[3]); + + + printf("nbodies= %d\n", nbodies); + printf("nstep= %d\n", nstep); + printf(" eta= %g \n", eta); + + run(nbodies, eta, nstep); + + return 0; +} + diff --git a/examples/portable/nbody_hermite4/hermite4.ispc b/examples/portable/nbody_hermite4/hermite4.ispc new file mode 100644 index 00000000..95992e05 --- /dev/null +++ b/examples/portable/nbody_hermite4/hermite4.ispc @@ -0,0 +1,157 @@ +#include "typeReal.h" + +typedef real<3> vec3; +struct Force +{ + vec3 acc, jrk; + real pot, null; +}; + +struct Predictor +{ + vec3 pos, vel; +}; + +static inline +void body_body_force( + Force &fi, + const Predictor &pi, + const Predictor &pj, + const real mj) +{ + const real dx = pj.pos.x - pi.pos.x; + const real dy = pj.pos.y - pi.pos.y; + const real dz = pj.pos.z - pi.pos.z; + + const real ds2 = dx*dx + dy*dy + dz*dz; + + const real inv_ds = ds2 > 1.0d-10 ? rsqrt((float)ds2) : 0.0; + const real inv_ds2 = inv_ds*inv_ds; + const real minv_ds = inv_ds * mj; + const real minv_ds3 = inv_ds2 * minv_ds; + + + fi.acc.x += minv_ds3 * dx; + fi.acc.y += minv_ds3 * dy; + fi.acc.z += minv_ds3 * dz; + fi.pot -= minv_ds; + + const real dvx = pj.vel.x - pi.vel.x; + const real dvy = pj.vel.y - pi.vel.y; + const real dvz = pj.vel.z - pi.vel.z; + const real rv = dx*dvx + dy*dvy + dz*dvz; + + const real Jij = (real)(-3.0) * (rv * inv_ds2 * minv_ds3); + + fi.jrk.x += minv_ds3*dvx + Jij*dx; + fi.jrk.y += minv_ds3*dvy + Jij*dy; + fi.jrk.z += minv_ds3*dvz + Jij*dz; +} + +task void compute_forces_task( + uniform const int n, + uniform const int nPerTask, + uniform const real mass[], + uniform const real posx[], + uniform const real posy[], + uniform const real posz[], + uniform const real velx[], + uniform const real vely[], + uniform const real velz[], + uniform real accx[], + uniform real accy[], + uniform real accz[], + uniform real jrkx[], + uniform real jrky[], + uniform real jrkz[], + uniform real gpot[]) +{ + const uniform int nibeg = taskIndex * nPerTask; + const uniform int niend = min(n, nibeg + nPerTask); + + if (nibeg >= n) + return; + + uniform real shdata[7][programCount]; + + assert((n%programCount) == 0); + + foreach (i = nibeg ... niend) + { + Force fi; + fi.acc = (real)0.0; + fi.jrk = (real)0.0; + fi.pot = (real)0.0; + + Predictor pi; + pi.pos.x = posx[i]; + pi.pos.y = posy[i]; + pi.pos.z = posz[i]; + pi.vel.x = velx[i]; + pi.vel.y = vely[i]; + pi.vel.z = velz[i]; + + for (uniform int jb = 0; jb < n; jb += programCount) + { + const int jp = jb + programIndex; + shdata[0][programIndex] = posx[jp]; + shdata[1][programIndex] = posy[jp]; + shdata[2][programIndex] = posz[jp]; + shdata[3][programIndex] = mass[jp]; + shdata[4][programIndex] = velx[jp]; + shdata[5][programIndex] = vely[jp]; + shdata[6][programIndex] = velz[jp]; + + for (uniform int j = 0; j < programCount; j++) + { + Predictor pj; + pj.pos.x = shdata[0][j]; + pj.pos.y = shdata[1][j]; + pj.pos.z = shdata[2][j]; + pj.vel.x = shdata[4][j]; + pj.vel.y = shdata[5][j]; + pj.vel.z = shdata[6][j]; + const real jmass = shdata[3][j]; + body_body_force(fi,pi,pj,jmass); + } + } + + accx[i] = fi.acc.x; + accy[i] = fi.acc.y; + accz[i] = fi.acc.z; + jrkx[i] = fi.jrk.x; + jrky[i] = fi.jrk.y; + jrkz[i] = fi.jrk.z; + gpot[i] = fi.pot; + } +} + +export void compute_forces( + uniform const int n, + uniform const real mass[], + uniform const real posx[], + uniform const real posy[], + uniform const real posz[], + uniform const real velx[], + uniform const real vely[], + uniform const real velz[], + uniform real accx[], + uniform real accy[], + uniform real accz[], + uniform real jrkx[], + uniform real jrky[], + uniform real jrkz[], + uniform real gpot[]) +{ + const uniform int nPerTask = min(128,programCount*8); + const uniform int nTask = (n+nPerTask-1)/nPerTask; + + launch [nTask] compute_forces_task( + n, nPerTask, + mass, + posx,posy,posz, + velx,vely,velz, + accx,accy,accz, + jrkx,jrky,jrkz, + gpot); +} diff --git a/examples/portable/nbody_hermite4/typeReal.h b/examples/portable/nbody_hermite4/typeReal.h new file mode 100644 index 00000000..064a6867 --- /dev/null +++ b/examples/portable/nbody_hermite4/typeReal.h @@ -0,0 +1,2 @@ +#pragma once +typedef double real;