added nbody_hermite4
This commit is contained in:
8
examples/portable/nbody_hermite4/Makefile_cpu
Normal file
8
examples/portable/nbody_hermite4/Makefile_cpu
Normal file
@@ -0,0 +1,8 @@
|
||||
|
||||
EXAMPLE=hermite4
|
||||
CPP_SRC=hermite4.cpp
|
||||
ISPC_SRC=hermite4.ispc
|
||||
ISPC_IA_TARGETS=avx1-i32x8
|
||||
ISPC_ARM_TARGETS=neon
|
||||
|
||||
include ../common_cpu.mk
|
||||
14
examples/portable/nbody_hermite4/Makefile_ptx
Normal file
14
examples/portable/nbody_hermite4/Makefile_ptx
Normal file
@@ -0,0 +1,14 @@
|
||||
PROG=hermite4
|
||||
ISPC_SRC=hermite4.ispc
|
||||
#CU_SRC=hermite4.cu
|
||||
CXX_SRC=hermite4.cpp
|
||||
PTXCC_REGMAX=64
|
||||
#ISPC_FLAGS= --opt=disable-uniform-control-flow
|
||||
|
||||
#LLVM_GPU=1
|
||||
NVVM_GPU=1
|
||||
|
||||
include ../common_ptx.mk
|
||||
|
||||
|
||||
|
||||
323
examples/portable/nbody_hermite4/hermite4.cpp
Normal file
323
examples/portable/nbody_hermite4/hermite4.cpp
Normal file
@@ -0,0 +1,323 @@
|
||||
/* Hermite4 N-body integrator */
|
||||
/* Makino and Aarseth, 1992 */
|
||||
/* http://adsabs.harvard.edu/abs/1992PASJ...44..141M and references there in*/
|
||||
|
||||
#include <cstdlib>
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
#include <algorithm>
|
||||
#include <vector>
|
||||
#include <cassert>
|
||||
|
||||
#include "timing.h"
|
||||
#include "ispc_malloc.h"
|
||||
|
||||
#include "typeReal.h"
|
||||
#include "hermite4_ispc.h"
|
||||
|
||||
struct Hermite4
|
||||
{
|
||||
enum {PP_FLOP=44};
|
||||
const int n;
|
||||
const real eta;
|
||||
real *g_mass, *g_gpot;
|
||||
real *g_posx, *g_posy, *g_posz;
|
||||
real *g_velx, *g_vely, *g_velz;
|
||||
real *g_accx, *g_accy, *g_accz;
|
||||
real *g_jrkx, *g_jrky, *g_jrkz;
|
||||
|
||||
std::vector<real> accx0, accy0, accz0;
|
||||
std::vector<real> jrkx0, jrky0, jrkz0;
|
||||
|
||||
Hermite4(const int _n = 8192, const real _eta = 0.1) : n(_n), eta(_eta)
|
||||
{
|
||||
g_mass = new real[n];
|
||||
g_gpot = new real[n];
|
||||
g_posx = new real[n];
|
||||
g_posy = new real[n];
|
||||
g_posz = new real[n];
|
||||
g_velx = new real[n];
|
||||
g_vely = new real[n];
|
||||
g_velz = new real[n];
|
||||
g_accx = new real[n];
|
||||
g_accy = new real[n];
|
||||
g_accz = new real[n];
|
||||
g_jrkx = new real[n];
|
||||
g_jrky = new real[n];
|
||||
g_jrkz = new real[n];
|
||||
|
||||
accx0.resize(n);
|
||||
accy0.resize(n);
|
||||
accz0.resize(n);
|
||||
jrkx0.resize(n);
|
||||
jrky0.resize(n);
|
||||
jrkz0.resize(n);
|
||||
|
||||
printf("---Intializing nbody--- \n");
|
||||
|
||||
const real R0 = 1;
|
||||
const real mp = 1.0/n;
|
||||
#pragma omp parallel for schedule(runtime)
|
||||
for (int i = 0; i < n; i++)
|
||||
{
|
||||
real xp, yp, zp, s2 = 2*R0;
|
||||
real vx, vy, vz;
|
||||
while (s2 > R0*R0) {
|
||||
xp = (1.0 - 2.0*drand48())*R0;
|
||||
yp = (1.0 - 2.0*drand48())*R0;
|
||||
zp = (1.0 - 2.0*drand48())*R0;
|
||||
s2 = xp*xp + yp*yp + zp*zp;
|
||||
vx = drand48() * 0.1;
|
||||
vy = drand48() * 0.1;
|
||||
vz = drand48() * 0.1;
|
||||
}
|
||||
g_posx[i] = xp;
|
||||
g_posy[i] = yp;
|
||||
g_posz[i] = zp;
|
||||
g_velx[i] = vx;
|
||||
g_vely[i] = vy;
|
||||
g_velz[i] = vz;
|
||||
g_mass[i] = mp;
|
||||
}
|
||||
}
|
||||
|
||||
~Hermite4()
|
||||
{
|
||||
delete g_mass;
|
||||
delete g_gpot;
|
||||
delete g_posx;
|
||||
delete g_posy;
|
||||
delete g_posz;
|
||||
delete g_velx;
|
||||
delete g_vely;
|
||||
delete g_velz;
|
||||
delete g_accx;
|
||||
delete g_accy;
|
||||
delete g_accz;
|
||||
delete g_jrkx;
|
||||
delete g_jrky;
|
||||
delete g_jrkz;
|
||||
}
|
||||
|
||||
void forces();
|
||||
|
||||
real step(const real dt)
|
||||
{
|
||||
const real dt2 = dt*real(1.0/2.0);
|
||||
const real dt3 = dt*real(1.0/3.0);
|
||||
|
||||
real dt_min = HUGE;
|
||||
|
||||
#pragma omp parallel for schedule(runtime)
|
||||
for (int i = 0; i < n; i++)
|
||||
{
|
||||
accx0[i] = g_accx[i];
|
||||
accy0[i] = g_accy[i];
|
||||
accz0[i] = g_accz[i];
|
||||
jrkx0[i] = g_jrkx[i];
|
||||
jrky0[i] = g_jrky[i];
|
||||
jrkz0[i] = g_jrkz[i];
|
||||
|
||||
g_posx[i] += dt*(g_velx[i] + dt2*(g_accx[i] + dt3*g_jrkx[i]));
|
||||
g_posy[i] += dt*(g_vely[i] + dt2*(g_accy[i] + dt3*g_jrky[i]));
|
||||
g_posz[i] += dt*(g_velz[i] + dt2*(g_accz[i] + dt3*g_jrkz[i]));
|
||||
|
||||
g_velx[i] += dt*(g_accx[i] + dt2*g_jrkx[i]);
|
||||
g_vely[i] += dt*(g_accy[i] + dt2*g_jrky[i]);
|
||||
g_velz[i] += dt*(g_accz[i] + dt2*g_jrkz[i]);
|
||||
}
|
||||
|
||||
forces();
|
||||
|
||||
if (dt > 0.0)
|
||||
{
|
||||
const real h = dt*real(0.5);
|
||||
const real hinv = real(1.0)/h;
|
||||
const real f1 = real(0.5)*hinv*hinv;
|
||||
const real f2 = real(3.0)*hinv*f1;
|
||||
|
||||
const real dt2 = dt *dt * real(1.0/2.0);
|
||||
const real dt3 = dt2*dt * real(1.0/3.0);
|
||||
const real dt4 = dt3*dt * real(1.0/4.0);
|
||||
const real dt5 = dt4*dt * real(1.0/5.0);
|
||||
|
||||
#pragma omp parallel for schedule(runtime) reduction(min:dt_min)
|
||||
for (int i = 0; i < n; i++)
|
||||
{
|
||||
/* compute snp & crk */
|
||||
|
||||
const real Amx = g_accx[i] - accx0[i];
|
||||
const real Amy = g_accy[i] - accy0[i];
|
||||
const real Amz = g_accz[i] - accz0[i];
|
||||
|
||||
const real Jmx = h*(g_jrkx[i] - jrkx0[i]);
|
||||
const real Jmy = h*(g_jrky[i] - jrky0[i]);
|
||||
const real Jmz = h*(g_jrkz[i] - jrkz0[i]);
|
||||
|
||||
const real Jpx = h*(g_jrkx[i] + jrkx0[i]);
|
||||
const real Jpy = h*(g_jrky[i] + jrky0[i]);
|
||||
const real Jpz = h*(g_jrkz[i] + jrkz0[i]);
|
||||
|
||||
|
||||
real snpx = f1*Jmx;
|
||||
real snpy = f1*Jmy;
|
||||
real snpz = f1*Jmz;
|
||||
|
||||
real crkx = f2*(Jpx - Amx);
|
||||
real crky = f2*(Jpy - Amy);
|
||||
real crkz = f2*(Jpz - Amz);
|
||||
|
||||
snpx -= h*crkx;
|
||||
snpy -= h*crky;
|
||||
snpz -= h*crkz;
|
||||
|
||||
/* correct */
|
||||
|
||||
g_posx[i] += dt4*snpx + dt5*crkx;
|
||||
g_posy[i] += dt4*snpy + dt5*crky;
|
||||
g_posz[i] += dt4*snpz + dt5*crkz;
|
||||
|
||||
g_velx[i] += dt3*snpx + dt4*crkx;
|
||||
g_vely[i] += dt3*snpy + dt4*crky;
|
||||
g_velz[i] += dt3*snpz + dt4*crkz;
|
||||
|
||||
/* compute new timestep */
|
||||
|
||||
const real s0 = g_accx[i]*g_accx[i] + g_accy[i]*g_accy[i] + g_accz[i]*g_accz[i];
|
||||
const real s1 = g_jrkx[i]*g_jrkx[i] + g_jrky[i]*g_jrky[i] + g_jrkz[i]*g_jrkz[i];
|
||||
const real s2 = snpx*snpx + snpy*snpy + snpz*snpz;
|
||||
const real s3 = crkx*crkx + crky*crky + crkz*crkz;
|
||||
|
||||
const double u = std::sqrt(s0*s2) + s1;
|
||||
const double l = std::sqrt(s1*s3) + s2;
|
||||
assert(l > 0.0f);
|
||||
const real dt_loc = eta *std::sqrt(u/l);
|
||||
dt_min = std::min(dt_min, dt_loc);
|
||||
}
|
||||
}
|
||||
|
||||
if (dt_min == HUGE)
|
||||
return dt;
|
||||
else
|
||||
return dt_min;
|
||||
}
|
||||
|
||||
void energy(real &Ekin, real &Epot)
|
||||
{
|
||||
real ekin = 0, epot = 0;
|
||||
|
||||
#pragma omp parallel for reduction(+:ekin,epot)
|
||||
for (int i = 0; i < n; i++)
|
||||
{
|
||||
ekin += g_mass[i] * (g_velx[i]*g_velx[i] + g_vely[i]*g_vely[i] + g_velz[i]*g_velz[i]) * real(0.5f);
|
||||
epot += real(0.5f)*g_mass[i] * g_gpot[i];
|
||||
}
|
||||
Ekin = ekin;
|
||||
Epot = epot;
|
||||
}
|
||||
|
||||
void integrate(const int niter, const real t_end = HUGE)
|
||||
{
|
||||
const double tin = rtc();
|
||||
forces();
|
||||
const double fn = n;
|
||||
printf(" mean flop rate in %g sec [%g GFLOP/s]\n", rtc() - tin,
|
||||
fn*fn*PP_FLOP/(rtc() - tin)/1e9);
|
||||
|
||||
real Epot0, Ekin0;
|
||||
energy(Ekin0, Epot0);
|
||||
const real Etot0 = Epot0 + Ekin0;
|
||||
printf(" E: %g %g %g \n", Epot0, Ekin0, Etot0);
|
||||
|
||||
/////////
|
||||
|
||||
real t_global = 0;
|
||||
double t0 = 0;
|
||||
int iter = 0;
|
||||
int ntime = 10;
|
||||
real dt = 1.0/131072;
|
||||
real Epot, Ekin, Etot = Etot0;
|
||||
while (t_global < t_end) {
|
||||
if (iter % ntime == 0)
|
||||
t0 = rtc();
|
||||
|
||||
if (iter >= niter) return;
|
||||
|
||||
dt = step(dt);
|
||||
iter++;
|
||||
t_global += dt;
|
||||
|
||||
const real Etot_pre = Etot;
|
||||
energy(Ekin, Epot);
|
||||
Etot = Ekin + Epot;
|
||||
|
||||
if (iter % 1 == 0) {
|
||||
const real Etot = Ekin + Epot;
|
||||
printf("iter= %d: t= %g dt= %g Ekin= %g Epot= %g Etot= %g , dE = %g d(dE)= %g \n",
|
||||
iter, t_global, dt, Ekin, Epot, Etot, (Etot - Etot0)/std::abs(Etot0),
|
||||
(Etot - Etot_pre)/std::abs(Etot_pre) );
|
||||
}
|
||||
|
||||
if (iter % ntime == 0) {
|
||||
printf(" mean flop rate in %g sec [%g GFLOP/s]\n", rtc() - t0,
|
||||
fn*fn*PP_FLOP/(rtc() - t0)/1e9*ntime);
|
||||
}
|
||||
|
||||
fflush(stdout);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
|
||||
|
||||
void Hermite4::forces()
|
||||
{
|
||||
ispc::compute_forces(
|
||||
n,
|
||||
g_mass,
|
||||
g_posx,
|
||||
g_posy,
|
||||
g_posz,
|
||||
g_velx,
|
||||
g_vely,
|
||||
g_velz,
|
||||
g_accx,
|
||||
g_accy,
|
||||
g_accz,
|
||||
g_jrkx,
|
||||
g_jrky,
|
||||
g_jrkz,
|
||||
g_gpot);
|
||||
}
|
||||
|
||||
void run(const int nbodies, const real eta, const int nstep)
|
||||
{
|
||||
Hermite4 h4(nbodies, eta);
|
||||
h4.integrate(nstep);
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
printf(" Usage: %s [nbodies=8192] [nsteps=40] [eta=0.1] \n", argv[0]);
|
||||
|
||||
int nbodies = 8192;
|
||||
if (argc > 1) nbodies = atoi(argv[1]);
|
||||
|
||||
int nstep = 40;
|
||||
if (argc > 2) nstep = atoi(argv[2]);
|
||||
|
||||
float eta = 0.1;
|
||||
if (argc > 3) eta = atof(argv[3]);
|
||||
|
||||
|
||||
printf("nbodies= %d\n", nbodies);
|
||||
printf("nstep= %d\n", nstep);
|
||||
printf(" eta= %g \n", eta);
|
||||
|
||||
run(nbodies, eta, nstep);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
157
examples/portable/nbody_hermite4/hermite4.ispc
Normal file
157
examples/portable/nbody_hermite4/hermite4.ispc
Normal file
@@ -0,0 +1,157 @@
|
||||
#include "typeReal.h"
|
||||
|
||||
typedef real<3> vec3;
|
||||
struct Force
|
||||
{
|
||||
vec3 acc, jrk;
|
||||
real pot, null;
|
||||
};
|
||||
|
||||
struct Predictor
|
||||
{
|
||||
vec3 pos, vel;
|
||||
};
|
||||
|
||||
static inline
|
||||
void body_body_force(
|
||||
Force &fi,
|
||||
const Predictor &pi,
|
||||
const Predictor &pj,
|
||||
const real mj)
|
||||
{
|
||||
const real dx = pj.pos.x - pi.pos.x;
|
||||
const real dy = pj.pos.y - pi.pos.y;
|
||||
const real dz = pj.pos.z - pi.pos.z;
|
||||
|
||||
const real ds2 = dx*dx + dy*dy + dz*dz;
|
||||
|
||||
const real inv_ds = ds2 > 1.0d-10 ? rsqrt((float)ds2) : 0.0;
|
||||
const real inv_ds2 = inv_ds*inv_ds;
|
||||
const real minv_ds = inv_ds * mj;
|
||||
const real minv_ds3 = inv_ds2 * minv_ds;
|
||||
|
||||
|
||||
fi.acc.x += minv_ds3 * dx;
|
||||
fi.acc.y += minv_ds3 * dy;
|
||||
fi.acc.z += minv_ds3 * dz;
|
||||
fi.pot -= minv_ds;
|
||||
|
||||
const real dvx = pj.vel.x - pi.vel.x;
|
||||
const real dvy = pj.vel.y - pi.vel.y;
|
||||
const real dvz = pj.vel.z - pi.vel.z;
|
||||
const real rv = dx*dvx + dy*dvy + dz*dvz;
|
||||
|
||||
const real Jij = (real)(-3.0) * (rv * inv_ds2 * minv_ds3);
|
||||
|
||||
fi.jrk.x += minv_ds3*dvx + Jij*dx;
|
||||
fi.jrk.y += minv_ds3*dvy + Jij*dy;
|
||||
fi.jrk.z += minv_ds3*dvz + Jij*dz;
|
||||
}
|
||||
|
||||
task void compute_forces_task(
|
||||
uniform const int n,
|
||||
uniform const int nPerTask,
|
||||
uniform const real mass[],
|
||||
uniform const real posx[],
|
||||
uniform const real posy[],
|
||||
uniform const real posz[],
|
||||
uniform const real velx[],
|
||||
uniform const real vely[],
|
||||
uniform const real velz[],
|
||||
uniform real accx[],
|
||||
uniform real accy[],
|
||||
uniform real accz[],
|
||||
uniform real jrkx[],
|
||||
uniform real jrky[],
|
||||
uniform real jrkz[],
|
||||
uniform real gpot[])
|
||||
{
|
||||
const uniform int nibeg = taskIndex * nPerTask;
|
||||
const uniform int niend = min(n, nibeg + nPerTask);
|
||||
|
||||
if (nibeg >= n)
|
||||
return;
|
||||
|
||||
uniform real shdata[7][programCount];
|
||||
|
||||
assert((n%programCount) == 0);
|
||||
|
||||
foreach (i = nibeg ... niend)
|
||||
{
|
||||
Force fi;
|
||||
fi.acc = (real)0.0;
|
||||
fi.jrk = (real)0.0;
|
||||
fi.pot = (real)0.0;
|
||||
|
||||
Predictor pi;
|
||||
pi.pos.x = posx[i];
|
||||
pi.pos.y = posy[i];
|
||||
pi.pos.z = posz[i];
|
||||
pi.vel.x = velx[i];
|
||||
pi.vel.y = vely[i];
|
||||
pi.vel.z = velz[i];
|
||||
|
||||
for (uniform int jb = 0; jb < n; jb += programCount)
|
||||
{
|
||||
const int jp = jb + programIndex;
|
||||
shdata[0][programIndex] = posx[jp];
|
||||
shdata[1][programIndex] = posy[jp];
|
||||
shdata[2][programIndex] = posz[jp];
|
||||
shdata[3][programIndex] = mass[jp];
|
||||
shdata[4][programIndex] = velx[jp];
|
||||
shdata[5][programIndex] = vely[jp];
|
||||
shdata[6][programIndex] = velz[jp];
|
||||
|
||||
for (uniform int j = 0; j < programCount; j++)
|
||||
{
|
||||
Predictor pj;
|
||||
pj.pos.x = shdata[0][j];
|
||||
pj.pos.y = shdata[1][j];
|
||||
pj.pos.z = shdata[2][j];
|
||||
pj.vel.x = shdata[4][j];
|
||||
pj.vel.y = shdata[5][j];
|
||||
pj.vel.z = shdata[6][j];
|
||||
const real jmass = shdata[3][j];
|
||||
body_body_force(fi,pi,pj,jmass);
|
||||
}
|
||||
}
|
||||
|
||||
accx[i] = fi.acc.x;
|
||||
accy[i] = fi.acc.y;
|
||||
accz[i] = fi.acc.z;
|
||||
jrkx[i] = fi.jrk.x;
|
||||
jrky[i] = fi.jrk.y;
|
||||
jrkz[i] = fi.jrk.z;
|
||||
gpot[i] = fi.pot;
|
||||
}
|
||||
}
|
||||
|
||||
export void compute_forces(
|
||||
uniform const int n,
|
||||
uniform const real mass[],
|
||||
uniform const real posx[],
|
||||
uniform const real posy[],
|
||||
uniform const real posz[],
|
||||
uniform const real velx[],
|
||||
uniform const real vely[],
|
||||
uniform const real velz[],
|
||||
uniform real accx[],
|
||||
uniform real accy[],
|
||||
uniform real accz[],
|
||||
uniform real jrkx[],
|
||||
uniform real jrky[],
|
||||
uniform real jrkz[],
|
||||
uniform real gpot[])
|
||||
{
|
||||
const uniform int nPerTask = min(128,programCount*8);
|
||||
const uniform int nTask = (n+nPerTask-1)/nPerTask;
|
||||
|
||||
launch [nTask] compute_forces_task(
|
||||
n, nPerTask,
|
||||
mass,
|
||||
posx,posy,posz,
|
||||
velx,vely,velz,
|
||||
accx,accy,accz,
|
||||
jrkx,jrky,jrkz,
|
||||
gpot);
|
||||
}
|
||||
2
examples/portable/nbody_hermite4/typeReal.h
Normal file
2
examples/portable/nbody_hermite4/typeReal.h
Normal file
@@ -0,0 +1,2 @@
|
||||
#pragma once
|
||||
typedef double real;
|
||||
Reference in New Issue
Block a user