From 86a6cfc1d0b3169ed55709a4b26668a20c7cddfe Mon Sep 17 00:00:00 2001 From: Evghenii Date: Fri, 31 Jan 2014 10:36:30 +0100 Subject: [PATCH] problem solved --- examples_ptx/cuda_helpers.cuh | 8 +++++++ examples_ptx/nbody/nbody.cpp | 2 +- examples_ptx/nbody/nbody.cu | 18 +++++++++++++-- examples_ptx/nbody/nbody.ispc | 42 +++++++++++++++++++++++------------ examples_ptx/ptxcc/ptxcc | 2 +- 5 files changed, 54 insertions(+), 18 deletions(-) diff --git a/examples_ptx/cuda_helpers.cuh b/examples_ptx/cuda_helpers.cuh index a28b5a51..2c6ec6dc 100644 --- a/examples_ptx/cuda_helpers.cuh +++ b/examples_ptx/cuda_helpers.cuh @@ -14,4 +14,12 @@ #define launch(ntx,nty,ntz,func) if (programIndex==0) func<<>> #define sync cudaDeviceSynchronize() #define cif if +__device__ __forceinline__ static double __shfl(double x, int lane) +{ + return __hiloint2double( + __shfl_xor(__double2hiint(x), lane), + __shfl_xor(__double2loint(x), lane)); + +} #define shuffle(x,y) __shfl(x,y) +#define broadcast(x,y) __shfl(x,y) diff --git a/examples_ptx/nbody/nbody.cpp b/examples_ptx/nbody/nbody.cpp index ee5dffbf..275fb3b6 100644 --- a/examples_ptx/nbody/nbody.cpp +++ b/examples_ptx/nbody/nbody.cpp @@ -9,7 +9,7 @@ #include "nbody_ispc.h" #include "plummer.h" -typedef double real; +#include "realType.h" int main (int argc, char *argv[]) { diff --git a/examples_ptx/nbody/nbody.cu b/examples_ptx/nbody/nbody.cu index 901c465d..f19742e5 100644 --- a/examples_ptx/nbody/nbody.cu +++ b/examples_ptx/nbody/nbody.cu @@ -1,4 +1,4 @@ -typedef double real; +#include "realType.h" #include "cuda_helpers.cuh" #include @@ -86,25 +86,39 @@ void computeForces( #else for (uniform int j = 0; j < nbodies; j += programCount) { +#if 1 __shared__ real shdata[4][programCount*4]; real (* shmem)[programCount] = (real (*)[programCount])shdata[warpIdx]; shmem[0][programIndex] = posx[j+programIndex]; shmem[1][programIndex] = posy[j+programIndex]; shmem[2][programIndex] = posz[j+programIndex]; shmem[3][programIndex] = mass[j+programIndex]; +#else + const real jPosx = posx[j+programIndex]; + const real jPosy = posy[j+programIndex]; + const real jPosz = posz[j+programIndex]; + const real jMass = mass[j+programIndex]; +#endif #pragma unroll 1 for (int jb = 0; jb < programCount; jb++) { +#if 1 const real jposx = shmem[0][jb]; const real jposy = shmem[1][jb]; const real jposz = shmem[2][jb]; const real jmass = shmem[3][jb]; +#else + const real jposx = broadcast(jPosx, jb); + const real jposy = broadcast(jPosy, jb); + const real jposz = broadcast(jPosz, jb); + const real jmass = broadcast(jMass, jb); +#endif const real dx = jposx - iposx; const real dy = jposy - iposy; const real dz = jposz - iposz; const real r2 = dx*dx + dy*dy + dz*dz; - const real rinv = r2 ; //> 0.0 ? rsqrt((float)r2) : 0; + const real rinv = r2 > 0.0 ? rsqrt((float)r2) : 0; const real mrinv = -jmass * rinv; const real mrinv3 = mrinv * rinv*rinv; iaccx += mrinv3 * dx; diff --git a/examples_ptx/nbody/nbody.ispc b/examples_ptx/nbody/nbody.ispc index 217a2680..3b988a6a 100644 --- a/examples_ptx/nbody/nbody.ispc +++ b/examples_ptx/nbody/nbody.ispc @@ -1,4 +1,4 @@ -typedef double real; +#include "realType.h" static uniform real * uniform accx = NULL; @@ -30,7 +30,7 @@ uniform int nn = programCount; task -unmasked void computeForces( +void computeForces( uniform int nbodies, uniform real posx[], uniform real posy[], @@ -41,7 +41,6 @@ unmasked void computeForces( const uniform int blockDim = (nbodies + taskCount - 1)/taskCount; const uniform int blockBeg = blockIdx * blockDim; const uniform int blockEnd = min(blockBeg + blockDim, nbodies); - uniform real shmem[4*programCount]; //real gpotLoc = 0; foreach (i = blockBeg ... blockEnd) @@ -53,7 +52,7 @@ unmasked void computeForces( real iaccy = 0; real iaccz = 0; real igpot = 0; -#if 0 +#ifndef __NVPTX__ for (uniform int j = 0; j < nbodies; j++) { const real jposx = posx[j]; @@ -64,7 +63,7 @@ unmasked void computeForces( const real dy = jposy - iposy; const real dz = jposz - iposz; const real r2 = dx*dx + dy*dy + dz*dz; - const real rinv = r2 > 0.0d ? rsqrt((float)r2) : 0; + const real rinv = r2> 0.0d ? rsqrt((float)r2) : 0; const real mrinv = -jmass * rinv; const real mrinv3 = mrinv * rinv*rinv; @@ -76,21 +75,36 @@ unmasked void computeForces( #else for (uniform int j = 0; j < nbodies; j += programCount) { - shmem[0*programCount + programIndex] = posx[j+programIndex]; - shmem[1*programCount + programIndex] = posy[j+programIndex]; - shmem[2*programCount + programIndex] = posz[j+programIndex]; - shmem[3*programCount + programIndex] = mass[j+programIndex]; +#if 1 + uniform real shmem[4][programCount]; + shmem[0][programIndex] = posx[j+programIndex]; + shmem[1][programIndex] = posy[j+programIndex]; + shmem[2][programIndex] = posz[j+programIndex]; + shmem[3][programIndex] = mass[j+programIndex]; +#else + const real jPosx = posx[j+programIndex]; + const real jPosy = posy[j+programIndex]; + const real jPosz = posz[j+programIndex]; + const real jMass = mass[j+programIndex]; +#endif for (uniform int jb = 0; jb < programCount; jb++) { - const real jposx = shmem[0*programCount + jb]; - const real jposy = shmem[1*programCount + jb]; - const real jposz = shmem[2*programCount + jb]; - const real jmass = shmem[3*programCount + jb]; +#if 1 + const real jposx = shmem[0][jb]; + const real jposy = shmem[1][jb]; + const real jposz = shmem[2][jb]; + const real jmass = shmem[3][jb]; +#else + const real jposx = broadcast(jPosx, jb); + const real jposy = broadcast(jPosy, jb); + const real jposz = broadcast(jPosz, jb); + const real jmass = broadcast(jMass, jb); +#endif const real dx = jposx - iposx; const real dy = jposy - iposy; const real dz = jposz - iposz; const real r2 = dx*dx + dy*dy + dz*dz; - const real rinv = r2; // > 0.0d ? rsqrt((float)r2) : 0; + const real rinv = r2 > 0.0d ? rsqrt((float)r2) : 0; const real mrinv = -jmass * rinv; const real mrinv3 = mrinv * rinv*rinv; diff --git a/examples_ptx/ptxcc/ptxcc b/examples_ptx/ptxcc/ptxcc index 73964be7..25840e68 100755 --- a/examples_ptx/ptxcc/ptxcc +++ b/examples_ptx/ptxcc/ptxcc @@ -11,7 +11,7 @@ NVCC=nvcc $(cat $1 | sed 's/\.b0/\.b32/g' > $PTXSRC) && $DEPTX < $PTXSRC > $PTXCU && -$NVCC -arch=sm_35 -G -dc $NVCCPARM -dryrun $PTXCU 2>&1 | \ +$NVCC -arch=sm_35 -dc $NVCCPARM -dryrun $PTXCU 2>&1 | \ sed 's/\#\$//g'| \ awk '{ if ($1 == "LIBRARIES=") print $1$2; else if ($1 == "cicc") print "cp '$PTXSRC'", $NF; else print $0 }' > $PTXSH && sh $PTXSH