added nbody_hermite4

2014-02-21 14:07:51 +01:00
parent 42c4d3246c
commit f09c74db21
5 changed files with 504 additions and 0 deletions
--- a/examples/portable/nbody_hermite4/Makefile_cpu
+++ b/examples/portable/nbody_hermite4/Makefile_cpu
@@ -0,0 +1,8 @@
+
+EXAMPLE=hermite4
+CPP_SRC=hermite4.cpp 
+ISPC_SRC=hermite4.ispc
+ISPC_IA_TARGETS=avx1-i32x8
+ISPC_ARM_TARGETS=neon
+
+include ../common_cpu.mk
--- a/examples/portable/nbody_hermite4/Makefile_ptx
+++ b/examples/portable/nbody_hermite4/Makefile_ptx
@@ -0,0 +1,14 @@
+PROG=hermite4
+ISPC_SRC=hermite4.ispc
+#CU_SRC=hermite4.cu
+CXX_SRC=hermite4.cpp 
+PTXCC_REGMAX=64
+#ISPC_FLAGS= --opt=disable-uniform-control-flow
+
+#LLVM_GPU=1
+NVVM_GPU=1
+
+include ../common_ptx.mk
+
+
+
--- a/examples/portable/nbody_hermite4/hermite4.cpp
+++ b/examples/portable/nbody_hermite4/hermite4.cpp
@@ -0,0 +1,323 @@
+/* Hermite4 N-body integrator */
+/* Makino and Aarseth, 1992 */
+/* http://adsabs.harvard.edu/abs/1992PASJ...44..141M and references there in*/
+
+#include <cstdlib>
+#include <cmath>
+#include <cstdio>
+#include <algorithm>
+#include <vector>
+#include <cassert>
+
+#include "timing.h"
+#include "ispc_malloc.h"
+
+#include "typeReal.h"
+#include "hermite4_ispc.h"
+
+struct Hermite4
+{
+  enum {PP_FLOP=44};
+  const int n;
+  const real eta;
+  real *g_mass, *g_gpot;
+  real *g_posx, *g_posy, *g_posz;
+  real *g_velx, *g_vely, *g_velz;
+  real *g_accx, *g_accy, *g_accz;
+  real *g_jrkx, *g_jrky, *g_jrkz;
+
+  std::vector<real> accx0, accy0, accz0;
+  std::vector<real> jrkx0, jrky0, jrkz0;
+
+  Hermite4(const int _n = 8192, const real _eta = 0.1) : n(_n), eta(_eta)
+  {
+    g_mass = new real[n];
+    g_gpot = new real[n];
+    g_posx = new real[n];
+    g_posy = new real[n];
+    g_posz = new real[n];
+    g_velx = new real[n];
+    g_vely = new real[n];
+    g_velz = new real[n];
+    g_accx = new real[n];
+    g_accy = new real[n];
+    g_accz = new real[n];
+    g_jrkx = new real[n];
+    g_jrky = new real[n];
+    g_jrkz = new real[n];
+
+    accx0.resize(n);
+    accy0.resize(n);
+    accz0.resize(n);
+    jrkx0.resize(n);
+    jrky0.resize(n);
+    jrkz0.resize(n);
+
+    printf("---Intializing nbody--- \n");
+
+    const real R0 = 1;
+    const real mp = 1.0/n;
+#pragma omp parallel for schedule(runtime)
+    for (int i = 0; i < n; i++) 
+    {
+      real xp, yp, zp, s2 = 2*R0;
+      real vx, vy, vz;
+      while (s2 > R0*R0) {
+        xp = (1.0 - 2.0*drand48())*R0;
+        yp = (1.0 - 2.0*drand48())*R0;
+        zp = (1.0 - 2.0*drand48())*R0;
+        s2 = xp*xp + yp*yp + zp*zp;
+        vx = drand48() * 0.1;
+        vy = drand48() * 0.1;
+        vz = drand48() * 0.1;
+      } 
+      g_posx[i] = xp;
+      g_posy[i] = yp;
+      g_posz[i] = zp;
+      g_velx[i] = vx;
+      g_vely[i] = vy;
+      g_velz[i] = vz;
+      g_mass[i] = mp;
+    }
+  }
+
+  ~Hermite4()
+  {
+    delete g_mass;
+    delete g_gpot;
+    delete g_posx;
+    delete g_posy;
+    delete g_posz;
+    delete g_velx;
+    delete g_vely;
+    delete g_velz;
+    delete g_accx;
+    delete g_accy;
+    delete g_accz;
+    delete g_jrkx;
+    delete g_jrky;
+    delete g_jrkz;
+  }
+
+  void forces();
+
+  real step(const real dt) 
+  {
+    const real dt2 = dt*real(1.0/2.0);
+    const real dt3 = dt*real(1.0/3.0);
+
+    real dt_min = HUGE;
+
+#pragma omp parallel for schedule(runtime)
+    for (int i = 0; i < n; i++)
+    {
+      accx0[i] = g_accx[i];
+      accy0[i] = g_accy[i];
+      accz0[i] = g_accz[i];
+      jrkx0[i] = g_jrkx[i];
+      jrky0[i] = g_jrky[i];
+      jrkz0[i] = g_jrkz[i];
+
+      g_posx[i] += dt*(g_velx[i] + dt2*(g_accx[i] + dt3*g_jrkx[i]));
+      g_posy[i] += dt*(g_vely[i] + dt2*(g_accy[i] + dt3*g_jrky[i]));
+      g_posz[i] += dt*(g_velz[i] + dt2*(g_accz[i] + dt3*g_jrkz[i]));
+
+      g_velx[i] += dt*(g_accx[i] + dt2*g_jrkx[i]);
+      g_vely[i] += dt*(g_accy[i] + dt2*g_jrky[i]);
+      g_velz[i] += dt*(g_accz[i] + dt2*g_jrkz[i]);
+    }
+
+    forces();
+
+    if (dt > 0.0)
+    {
+      const real h    = dt*real(0.5);
+      const real hinv = real(1.0)/h;
+      const real f1   = real(0.5)*hinv*hinv;
+      const real f2   = real(3.0)*hinv*f1;
+
+      const real dt2  = dt *dt * real(1.0/2.0);
+      const real dt3  = dt2*dt * real(1.0/3.0);
+      const real dt4  = dt3*dt * real(1.0/4.0);
+      const real dt5  = dt4*dt * real(1.0/5.0);
+
+#pragma omp parallel for schedule(runtime) reduction(min:dt_min)
+      for (int i = 0; i < n; i++)
+      {
+        /* compute snp & crk */
+
+        const real Amx = g_accx[i] - accx0[i]; 
+        const real Amy = g_accy[i] - accy0[i]; 
+        const real Amz = g_accz[i] - accz0[i]; 
+
+        const real Jmx = h*(g_jrkx[i] - jrkx0[i]);
+        const real Jmy = h*(g_jrky[i] - jrky0[i]);
+        const real Jmz = h*(g_jrkz[i] - jrkz0[i]);
+
+        const real Jpx = h*(g_jrkx[i] + jrkx0[i]);
+        const real Jpy = h*(g_jrky[i] + jrky0[i]);
+        const real Jpz = h*(g_jrkz[i] + jrkz0[i]);
+
+
+        real snpx = f1*Jmx;
+        real snpy = f1*Jmy;
+        real snpz = f1*Jmz;
+
+        real crkx = f2*(Jpx - Amx);
+        real crky = f2*(Jpy - Amy);
+        real crkz = f2*(Jpz - Amz);
+
+        snpx -= h*crkx;
+        snpy -= h*crky;
+        snpz -= h*crkz;
+
+        /* correct */
+
+        g_posx[i] += dt4*snpx + dt5*crkx;
+        g_posy[i] += dt4*snpy + dt5*crky;
+        g_posz[i] += dt4*snpz + dt5*crkz;
+
+        g_velx[i] += dt3*snpx + dt4*crkx;
+        g_vely[i] += dt3*snpy + dt4*crky;
+        g_velz[i] += dt3*snpz + dt4*crkz;
+
+        /* compute new timestep */
+
+        const real s0 = g_accx[i]*g_accx[i] + g_accy[i]*g_accy[i] + g_accz[i]*g_accz[i];
+        const real s1 = g_jrkx[i]*g_jrkx[i] + g_jrky[i]*g_jrky[i] + g_jrkz[i]*g_jrkz[i];
+        const real s2 = snpx*snpx + snpy*snpy + snpz*snpz;
+        const real s3 = crkx*crkx + crky*crky + crkz*crkz;
+
+        const double u = std::sqrt(s0*s2) + s1;
+        const double l = std::sqrt(s1*s3) + s2;
+        assert(l > 0.0f);
+        const real dt_loc = eta *std::sqrt(u/l);
+        dt_min = std::min(dt_min, dt_loc);
+      }
+    }
+
+    if (dt_min == HUGE) 
+      return dt;
+    else 
+      return dt_min;
+  }
+
+  void energy(real &Ekin, real &Epot) 
+  {
+    real ekin = 0, epot = 0;
+
+#pragma omp parallel for reduction(+:ekin,epot)
+    for (int i = 0; i < n; i++) 
+    {
+      ekin += g_mass[i] * (g_velx[i]*g_velx[i] + g_vely[i]*g_vely[i] + g_velz[i]*g_velz[i]) * real(0.5f);
+      epot += real(0.5f)*g_mass[i] * g_gpot[i];
+    }
+    Ekin = ekin;
+    Epot = epot;
+  }
+
+  void integrate(const int niter, const real t_end = HUGE)
+  {
+    const double tin = rtc();
+    forces();
+    const double fn = n;
+    printf(" mean flop rate in %g sec [%g GFLOP/s]\n", rtc() - tin,
+        fn*fn*PP_FLOP/(rtc() - tin)/1e9);
+
+    real Epot0, Ekin0;
+    energy(Ekin0, Epot0);
+    const real Etot0 = Epot0 + Ekin0;
+    printf(" E: %g %g %g \n", Epot0, Ekin0, Etot0);
+
+    /////////
+
+    real t_global = 0;
+    double t0 = 0;
+    int iter = 0;
+    int ntime = 10;
+    real dt = 1.0/131072;
+    real Epot, Ekin, Etot = Etot0;
+    while (t_global < t_end) {
+      if (iter % ntime == 0) 
+        t0 = rtc();
+
+      if (iter >= niter) return;
+
+      dt = step(dt);
+      iter++;
+      t_global += dt;
+
+      const real Etot_pre = Etot;
+      energy(Ekin, Epot);
+      Etot = Ekin + Epot;
+
+      if (iter % 1 == 0) {
+        const real Etot = Ekin + Epot;
+        printf("iter= %d: t= %g  dt= %g Ekin= %g  Epot= %g  Etot= %g , dE = %g d(dE)= %g \n",
+            iter, t_global, dt, Ekin, Epot, Etot, (Etot - Etot0)/std::abs(Etot0),
+            (Etot - Etot_pre)/std::abs(Etot_pre)   );
+      }
+
+      if (iter % ntime == 0) {
+        printf(" mean flop rate in %g sec [%g GFLOP/s]\n", rtc() - t0,
+            fn*fn*PP_FLOP/(rtc() - t0)/1e9*ntime);
+      }
+
+      fflush(stdout);
+
+    }
+  }
+
+};
+
+
+
+void Hermite4::forces()
+{
+  ispc::compute_forces(
+      n,
+      g_mass,
+      g_posx,
+      g_posy,
+      g_posz,
+      g_velx,
+      g_vely,
+      g_velz,
+      g_accx,
+      g_accy,
+      g_accz,
+      g_jrkx,
+      g_jrky,
+      g_jrkz,
+      g_gpot);
+}
+
+void run(const int nbodies, const real eta, const int nstep)
+{
+  Hermite4 h4(nbodies, eta);
+  h4.integrate(nstep);
+}
+
+int main(int argc, char *argv[]) 
+{
+  printf("  Usage: %s [nbodies=8192] [nsteps=40] [eta=0.1] \n", argv[0]);
+
+  int nbodies = 8192;
+  if (argc > 1) nbodies = atoi(argv[1]);
+
+  int nstep = 40;
+  if (argc > 2) nstep = atoi(argv[2]);
+
+  float eta = 0.1;
+  if (argc > 3) eta = atof(argv[3]);
+
+
+  printf("nbodies= %d\n", nbodies);
+  printf("nstep= %d\n", nstep);
+  printf(" eta= %g \n", eta);
+
+  run(nbodies, eta, nstep);
+
+  return 0;
+}
+
--- a/examples/portable/nbody_hermite4/hermite4.ispc
+++ b/examples/portable/nbody_hermite4/hermite4.ispc
@@ -0,0 +1,157 @@
+#include "typeReal.h"
+
+typedef real<3> vec3;
+struct Force
+{
+  vec3 acc, jrk;
+  real pot, null;
+};
+
+struct Predictor
+{
+  vec3 pos, vel;
+};
+
+static inline 
+void body_body_force(
+    Force &fi,
+    const Predictor &pi,
+    const Predictor &pj,
+    const real mj)
+{
+  const real dx = pj.pos.x - pi.pos.x;
+  const real dy = pj.pos.y - pi.pos.y;
+  const real dz = pj.pos.z - pi.pos.z;
+
+  const real ds2 = dx*dx + dy*dy + dz*dz;
+
+  const real  inv_ds  = ds2 > 1.0d-10 ? rsqrt((float)ds2) : 0.0;
+  const real  inv_ds2 = inv_ds*inv_ds;
+  const real minv_ds  = inv_ds  * mj;
+  const real minv_ds3 = inv_ds2 * minv_ds;
+
+
+  fi.acc.x += minv_ds3 * dx;
+  fi.acc.y += minv_ds3 * dy;
+  fi.acc.z += minv_ds3 * dz;
+  fi.pot   -= minv_ds;
+  
+  const real dvx = pj.vel.x - pi.vel.x;
+  const real dvy = pj.vel.y - pi.vel.y;
+  const real dvz = pj.vel.z - pi.vel.z;
+  const real rv  = dx*dvx + dy*dvy + dz*dvz;
+  
+  const real Jij = (real)(-3.0) * (rv * inv_ds2 * minv_ds3);
+  
+  fi.jrk.x += minv_ds3*dvx + Jij*dx;
+  fi.jrk.y += minv_ds3*dvy + Jij*dy;
+  fi.jrk.z += minv_ds3*dvz + Jij*dz;
+}
+
+task void compute_forces_task(
+    uniform const int     n,
+    uniform const int nPerTask,
+    uniform const real mass[],
+    uniform const real posx[],
+    uniform const real posy[],
+    uniform const real posz[],
+    uniform const real velx[],
+    uniform const real vely[],
+    uniform const real velz[],
+    uniform       real accx[],
+    uniform       real accy[],
+    uniform       real accz[],
+    uniform       real jrkx[],
+    uniform       real jrky[],
+    uniform       real jrkz[],
+    uniform       real gpot[])
+{
+  const uniform int nibeg = taskIndex * nPerTask;
+  const uniform int niend = min(n, nibeg + nPerTask);
+
+  if (nibeg >= n) 
+    return;
+
+  uniform real shdata[7][programCount];
+
+  assert((n%programCount) == 0);
+
+  foreach (i = nibeg ... niend)
+  {
+    Force fi;
+    fi.acc = (real)0.0;
+    fi.jrk = (real)0.0;
+    fi.pot = (real)0.0;
+   
+    Predictor pi;
+    pi.pos.x = posx[i];
+    pi.pos.y = posy[i];
+    pi.pos.z = posz[i];
+    pi.vel.x = velx[i];
+    pi.vel.y = vely[i];
+    pi.vel.z = velz[i];
+
+    for (uniform int jb = 0; jb < n; jb += programCount)
+    {
+      const int jp = jb + programIndex;
+      shdata[0][programIndex] = posx[jp];
+      shdata[1][programIndex] = posy[jp];
+      shdata[2][programIndex] = posz[jp];
+      shdata[3][programIndex] = mass[jp];
+      shdata[4][programIndex] = velx[jp];
+      shdata[5][programIndex] = vely[jp];
+      shdata[6][programIndex] = velz[jp];
+
+      for (uniform int j = 0; j < programCount; j++)
+      {
+        Predictor pj;
+        pj.pos.x = shdata[0][j];
+        pj.pos.y = shdata[1][j];
+        pj.pos.z = shdata[2][j];
+        pj.vel.x = shdata[4][j];
+        pj.vel.y = shdata[5][j];
+        pj.vel.z = shdata[6][j];
+        const real jmass  = shdata[3][j];
+        body_body_force(fi,pi,pj,jmass);
+      }
+    }
+
+    accx[i] = fi.acc.x;
+    accy[i] = fi.acc.y;
+    accz[i] = fi.acc.z;
+    jrkx[i] = fi.jrk.x;
+    jrky[i] = fi.jrk.y;
+    jrkz[i] = fi.jrk.z;
+    gpot[i] = fi.pot;
+  }
+}
+
+export void compute_forces(
+    uniform const int     n,
+    uniform const real mass[],
+    uniform const real posx[],
+    uniform const real posy[],
+    uniform const real posz[],
+    uniform const real velx[],
+    uniform const real vely[],
+    uniform const real velz[],
+    uniform       real accx[],
+    uniform       real accy[],
+    uniform       real accz[],
+    uniform       real jrkx[],
+    uniform       real jrky[],
+    uniform       real jrkz[],
+    uniform       real gpot[])
+{
+  const uniform int nPerTask = min(128,programCount*8);
+  const uniform int nTask = (n+nPerTask-1)/nPerTask;
+
+  launch [nTask]  compute_forces_task(
+      n, nPerTask,
+      mass, 
+      posx,posy,posz,
+      velx,vely,velz,
+      accx,accy,accz,
+      jrkx,jrky,jrkz,
+      gpot);
+}
--- a/examples/portable/nbody_hermite4/typeReal.h
+++ b/examples/portable/nbody_hermite4/typeReal.h
@@ -0,0 +1,2 @@
+#pragma once
+typedef double real;