problem solved
This commit is contained in:
@@ -14,4 +14,12 @@
|
|||||||
#define launch(ntx,nty,ntz,func) if (programIndex==0) func<<<dim3(((ntx)+4-1)/4,nty,ntz),128>>>
|
#define launch(ntx,nty,ntz,func) if (programIndex==0) func<<<dim3(((ntx)+4-1)/4,nty,ntz),128>>>
|
||||||
#define sync cudaDeviceSynchronize()
|
#define sync cudaDeviceSynchronize()
|
||||||
#define cif if
|
#define cif if
|
||||||
|
__device__ __forceinline__ static double __shfl(double x, int lane)
|
||||||
|
{
|
||||||
|
return __hiloint2double(
|
||||||
|
__shfl_xor(__double2hiint(x), lane),
|
||||||
|
__shfl_xor(__double2loint(x), lane));
|
||||||
|
|
||||||
|
}
|
||||||
#define shuffle(x,y) __shfl(x,y)
|
#define shuffle(x,y) __shfl(x,y)
|
||||||
|
#define broadcast(x,y) __shfl(x,y)
|
||||||
|
|||||||
@@ -9,7 +9,7 @@
|
|||||||
#include "nbody_ispc.h"
|
#include "nbody_ispc.h"
|
||||||
#include "plummer.h"
|
#include "plummer.h"
|
||||||
|
|
||||||
typedef double real;
|
#include "realType.h"
|
||||||
|
|
||||||
int main (int argc, char *argv[])
|
int main (int argc, char *argv[])
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
typedef double real;
|
#include "realType.h"
|
||||||
#include "cuda_helpers.cuh"
|
#include "cuda_helpers.cuh"
|
||||||
#include <cassert>
|
#include <cassert>
|
||||||
|
|
||||||
@@ -86,25 +86,39 @@ void computeForces(
|
|||||||
#else
|
#else
|
||||||
for (uniform int j = 0; j < nbodies; j += programCount)
|
for (uniform int j = 0; j < nbodies; j += programCount)
|
||||||
{
|
{
|
||||||
|
#if 1
|
||||||
__shared__ real shdata[4][programCount*4];
|
__shared__ real shdata[4][programCount*4];
|
||||||
real (* shmem)[programCount] = (real (*)[programCount])shdata[warpIdx];
|
real (* shmem)[programCount] = (real (*)[programCount])shdata[warpIdx];
|
||||||
shmem[0][programIndex] = posx[j+programIndex];
|
shmem[0][programIndex] = posx[j+programIndex];
|
||||||
shmem[1][programIndex] = posy[j+programIndex];
|
shmem[1][programIndex] = posy[j+programIndex];
|
||||||
shmem[2][programIndex] = posz[j+programIndex];
|
shmem[2][programIndex] = posz[j+programIndex];
|
||||||
shmem[3][programIndex] = mass[j+programIndex];
|
shmem[3][programIndex] = mass[j+programIndex];
|
||||||
|
#else
|
||||||
|
const real jPosx = posx[j+programIndex];
|
||||||
|
const real jPosy = posy[j+programIndex];
|
||||||
|
const real jPosz = posz[j+programIndex];
|
||||||
|
const real jMass = mass[j+programIndex];
|
||||||
|
#endif
|
||||||
|
|
||||||
#pragma unroll 1
|
#pragma unroll 1
|
||||||
for (int jb = 0; jb < programCount; jb++)
|
for (int jb = 0; jb < programCount; jb++)
|
||||||
{
|
{
|
||||||
|
#if 1
|
||||||
const real jposx = shmem[0][jb];
|
const real jposx = shmem[0][jb];
|
||||||
const real jposy = shmem[1][jb];
|
const real jposy = shmem[1][jb];
|
||||||
const real jposz = shmem[2][jb];
|
const real jposz = shmem[2][jb];
|
||||||
const real jmass = shmem[3][jb];
|
const real jmass = shmem[3][jb];
|
||||||
|
#else
|
||||||
|
const real jposx = broadcast(jPosx, jb);
|
||||||
|
const real jposy = broadcast(jPosy, jb);
|
||||||
|
const real jposz = broadcast(jPosz, jb);
|
||||||
|
const real jmass = broadcast(jMass, jb);
|
||||||
|
#endif
|
||||||
const real dx = jposx - iposx;
|
const real dx = jposx - iposx;
|
||||||
const real dy = jposy - iposy;
|
const real dy = jposy - iposy;
|
||||||
const real dz = jposz - iposz;
|
const real dz = jposz - iposz;
|
||||||
const real r2 = dx*dx + dy*dy + dz*dz;
|
const real r2 = dx*dx + dy*dy + dz*dz;
|
||||||
const real rinv = r2 ; //> 0.0 ? rsqrt((float)r2) : 0;
|
const real rinv = r2 > 0.0 ? rsqrt((float)r2) : 0;
|
||||||
const real mrinv = -jmass * rinv;
|
const real mrinv = -jmass * rinv;
|
||||||
const real mrinv3 = mrinv * rinv*rinv;
|
const real mrinv3 = mrinv * rinv*rinv;
|
||||||
iaccx += mrinv3 * dx;
|
iaccx += mrinv3 * dx;
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
typedef double real;
|
#include "realType.h"
|
||||||
|
|
||||||
|
|
||||||
static uniform real * uniform accx = NULL;
|
static uniform real * uniform accx = NULL;
|
||||||
@@ -30,7 +30,7 @@ uniform int nn = programCount;
|
|||||||
|
|
||||||
|
|
||||||
task
|
task
|
||||||
unmasked void computeForces(
|
void computeForces(
|
||||||
uniform int nbodies,
|
uniform int nbodies,
|
||||||
uniform real posx[],
|
uniform real posx[],
|
||||||
uniform real posy[],
|
uniform real posy[],
|
||||||
@@ -41,7 +41,6 @@ unmasked void computeForces(
|
|||||||
const uniform int blockDim = (nbodies + taskCount - 1)/taskCount;
|
const uniform int blockDim = (nbodies + taskCount - 1)/taskCount;
|
||||||
const uniform int blockBeg = blockIdx * blockDim;
|
const uniform int blockBeg = blockIdx * blockDim;
|
||||||
const uniform int blockEnd = min(blockBeg + blockDim, nbodies);
|
const uniform int blockEnd = min(blockBeg + blockDim, nbodies);
|
||||||
uniform real shmem[4*programCount];
|
|
||||||
|
|
||||||
//real gpotLoc = 0;
|
//real gpotLoc = 0;
|
||||||
foreach (i = blockBeg ... blockEnd)
|
foreach (i = blockBeg ... blockEnd)
|
||||||
@@ -53,7 +52,7 @@ unmasked void computeForces(
|
|||||||
real iaccy = 0;
|
real iaccy = 0;
|
||||||
real iaccz = 0;
|
real iaccz = 0;
|
||||||
real igpot = 0;
|
real igpot = 0;
|
||||||
#if 0
|
#ifndef __NVPTX__
|
||||||
for (uniform int j = 0; j < nbodies; j++)
|
for (uniform int j = 0; j < nbodies; j++)
|
||||||
{
|
{
|
||||||
const real jposx = posx[j];
|
const real jposx = posx[j];
|
||||||
@@ -64,7 +63,7 @@ unmasked void computeForces(
|
|||||||
const real dy = jposy - iposy;
|
const real dy = jposy - iposy;
|
||||||
const real dz = jposz - iposz;
|
const real dz = jposz - iposz;
|
||||||
const real r2 = dx*dx + dy*dy + dz*dz;
|
const real r2 = dx*dx + dy*dy + dz*dz;
|
||||||
const real rinv = r2 > 0.0d ? rsqrt((float)r2) : 0;
|
const real rinv = r2> 0.0d ? rsqrt((float)r2) : 0;
|
||||||
const real mrinv = -jmass * rinv;
|
const real mrinv = -jmass * rinv;
|
||||||
const real mrinv3 = mrinv * rinv*rinv;
|
const real mrinv3 = mrinv * rinv*rinv;
|
||||||
|
|
||||||
@@ -76,21 +75,36 @@ unmasked void computeForces(
|
|||||||
#else
|
#else
|
||||||
for (uniform int j = 0; j < nbodies; j += programCount)
|
for (uniform int j = 0; j < nbodies; j += programCount)
|
||||||
{
|
{
|
||||||
shmem[0*programCount + programIndex] = posx[j+programIndex];
|
#if 1
|
||||||
shmem[1*programCount + programIndex] = posy[j+programIndex];
|
uniform real shmem[4][programCount];
|
||||||
shmem[2*programCount + programIndex] = posz[j+programIndex];
|
shmem[0][programIndex] = posx[j+programIndex];
|
||||||
shmem[3*programCount + programIndex] = mass[j+programIndex];
|
shmem[1][programIndex] = posy[j+programIndex];
|
||||||
|
shmem[2][programIndex] = posz[j+programIndex];
|
||||||
|
shmem[3][programIndex] = mass[j+programIndex];
|
||||||
|
#else
|
||||||
|
const real jPosx = posx[j+programIndex];
|
||||||
|
const real jPosy = posy[j+programIndex];
|
||||||
|
const real jPosz = posz[j+programIndex];
|
||||||
|
const real jMass = mass[j+programIndex];
|
||||||
|
#endif
|
||||||
for (uniform int jb = 0; jb < programCount; jb++)
|
for (uniform int jb = 0; jb < programCount; jb++)
|
||||||
{
|
{
|
||||||
const real jposx = shmem[0*programCount + jb];
|
#if 1
|
||||||
const real jposy = shmem[1*programCount + jb];
|
const real jposx = shmem[0][jb];
|
||||||
const real jposz = shmem[2*programCount + jb];
|
const real jposy = shmem[1][jb];
|
||||||
const real jmass = shmem[3*programCount + jb];
|
const real jposz = shmem[2][jb];
|
||||||
|
const real jmass = shmem[3][jb];
|
||||||
|
#else
|
||||||
|
const real jposx = broadcast(jPosx, jb);
|
||||||
|
const real jposy = broadcast(jPosy, jb);
|
||||||
|
const real jposz = broadcast(jPosz, jb);
|
||||||
|
const real jmass = broadcast(jMass, jb);
|
||||||
|
#endif
|
||||||
const real dx = jposx - iposx;
|
const real dx = jposx - iposx;
|
||||||
const real dy = jposy - iposy;
|
const real dy = jposy - iposy;
|
||||||
const real dz = jposz - iposz;
|
const real dz = jposz - iposz;
|
||||||
const real r2 = dx*dx + dy*dy + dz*dz;
|
const real r2 = dx*dx + dy*dy + dz*dz;
|
||||||
const real rinv = r2; // > 0.0d ? rsqrt((float)r2) : 0;
|
const real rinv = r2 > 0.0d ? rsqrt((float)r2) : 0;
|
||||||
const real mrinv = -jmass * rinv;
|
const real mrinv = -jmass * rinv;
|
||||||
const real mrinv3 = mrinv * rinv*rinv;
|
const real mrinv3 = mrinv * rinv*rinv;
|
||||||
|
|
||||||
|
|||||||
@@ -11,7 +11,7 @@ NVCC=nvcc
|
|||||||
|
|
||||||
$(cat $1 | sed 's/\.b0/\.b32/g' > $PTXSRC) &&
|
$(cat $1 | sed 's/\.b0/\.b32/g' > $PTXSRC) &&
|
||||||
$DEPTX < $PTXSRC > $PTXCU &&
|
$DEPTX < $PTXSRC > $PTXCU &&
|
||||||
$NVCC -arch=sm_35 -G -dc $NVCCPARM -dryrun $PTXCU 2>&1 | \
|
$NVCC -arch=sm_35 -dc $NVCCPARM -dryrun $PTXCU 2>&1 | \
|
||||||
sed 's/\#\$//g'| \
|
sed 's/\#\$//g'| \
|
||||||
awk '{ if ($1 == "LIBRARIES=") print $1$2; else if ($1 == "cicc") print "cp '$PTXSRC'", $NF; else print $0 }' > $PTXSH &&
|
awk '{ if ($1 == "LIBRARIES=") print $1$2; else if ($1 == "cicc") print "cp '$PTXSRC'", $NF; else print $0 }' > $PTXSH &&
|
||||||
sh $PTXSH
|
sh $PTXSH
|
||||||
|
|||||||
Reference in New Issue
Block a user