diff --git a/docs/perf.txt b/docs/perf.txt index 5c2a203b..62ee1907 100644 --- a/docs/perf.txt +++ b/docs/perf.txt @@ -46,8 +46,8 @@ also included in the ``examples/`` directory.) - 4.05x - 15.53x * - `Volume Rendering`_ - - 3.11x - - 15.80x + - 3.60x + - 17.53x .. _AOBench: https://github.com/ispc/ispc/tree/master/examples/aobench diff --git a/examples/volume_rendering/Makefile b/examples/volume_rendering/Makefile index fa8ff753..2c7bcf2e 100644 --- a/examples/volume_rendering/Makefile +++ b/examples/volume_rendering/Makefile @@ -8,10 +8,10 @@ TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o))) CXX=g++ CXXFLAGS=-Iobjs/ -O3 -Wall -m64 ISPC=ispc -ISPCFLAGS=-O2 --target=sse2,sse4-x2 --arch=x86-64 +ISPCFLAGS=-O2 --target=sse2,sse4-x2,avx --arch=x86-64 OBJS=objs/volume.o objs/volume_serial.o $(TASK_OBJ) objs/volume_ispc.o \ - objs/volume_ispc_sse2.o objs/volume_ispc_sse4.o + objs/volume_ispc_sse2.o objs/volume_ispc_sse4.o objs/volume_ispc_avx.o default: volume @@ -34,5 +34,5 @@ objs/%.o: ../%.cpp objs/volume.o: objs/volume_ispc.h -objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o: %.ispc +objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc $(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h diff --git a/examples/volume_rendering/volume.ispc b/examples/volume_rendering/volume.ispc index 229f510b..7f3367f1 100644 --- a/examples/volume_rendering/volume.ispc +++ b/examples/volume_rendering/volume.ispc @@ -124,24 +124,13 @@ static inline float D(int x, int y, int z, uniform int nVoxels[3], } -static inline float Du(uniform int x, uniform int y, uniform int z, - uniform int nVoxels[3], uniform float density[]) { - x = clamp(x, 0, nVoxels[0]-1); - y = clamp(y, 0, nVoxels[1]-1); - z = clamp(z, 0, nVoxels[2]-1); - - return density[z*nVoxels[0]*nVoxels[1] + y*nVoxels[0] + x]; -} - - static inline float3 Offset(float3 p, float3 pMin, float3 pMax) { return (p - pMin) / (pMax - pMin); } static inline float Density(float3 Pobj, float3 pMin, float3 pMax, - uniform float density[], uniform int nVoxels[3], - uniform bool &checkForSameVoxel) { + uniform float density[], uniform int nVoxels[3]) { if (!Inside(Pobj, pMin, pMax)) return 0; // Compute voxel coordinates and offsets for _Pobj_ @@ -153,39 +142,14 @@ static inline float Density(float3 Pobj, float3 pMin, float3 pMax, float dx = vox.x - vx, dy = vox.y - vy, dz = vox.z - vz; // Trilinearly interpolate density values to compute local density - float d00, d10, d01, d11; - uniform int uvx, uvy, uvz; - if (checkForSameVoxel && reduce_equal(vx, &uvx) && reduce_equal(vy, &uvy) && - reduce_equal(vz, &uvz)) { - // If all of the program instances are inside the same voxel, then - // we'll call the 'uniform' variant of the voxel density lookup - // function, thus doing a single load for each value rather than a - // gather. - d00 = Lerp(dx, Du(uvx, uvy, uvz, nVoxels, density), - Du(uvx+1, uvy, uvz, nVoxels, density)); - d10 = Lerp(dx, Du(uvx, uvy+1, uvz, nVoxels, density), - Du(uvx+1, uvy+1, uvz, nVoxels, density)); - d01 = Lerp(dx, Du(uvx, uvy, uvz+1, nVoxels, density), - Du(uvx+1, uvy, uvz+1, nVoxels, density)); - d11 = Lerp(dx, Du(uvx, uvy+1, uvz+1, nVoxels, density), - Du(uvx+1, uvy+1, uvz+1, nVoxels, density)); - } - else { - // Otherwise, we have to do an actual gather in the more general - // D() function. Once the reduce_equal tests above fail, we stop - // checking in subsequent steps, since it's unlikely that this will - // be true in the future once they've diverged into different - // voxels. - checkForSameVoxel = false; - d00 = Lerp(dx, D(vx, vy, vz, nVoxels, density), - D(vx+1, vy, vz, nVoxels, density)); - d10 = Lerp(dx, D(vx, vy+1, vz, nVoxels, density), - D(vx+1, vy+1, vz, nVoxels, density)); - d01 = Lerp(dx, D(vx, vy, vz+1, nVoxels, density), - D(vx+1, vy, vz+1, nVoxels, density)); - d11 = Lerp(dx, D(vx, vy+1, vz+1, nVoxels, density), - D(vx+1, vy+1, vz+1, nVoxels, density)); - } + float d00 = Lerp(dx, D(vx, vy, vz, nVoxels, density), + D(vx+1, vy, vz, nVoxels, density)); + float d10 = Lerp(dx, D(vx, vy+1, vz, nVoxels, density), + D(vx+1, vy+1, vz, nVoxels, density)); + float d01 = Lerp(dx, D(vx, vy, vz+1, nVoxels, density), + D(vx+1, vy, vz+1, nVoxels, density)); + float d11 = Lerp(dx, D(vx, vy+1, vz+1, nVoxels, density), + D(vx+1, vy+1, vz+1, nVoxels, density)); float d0 = Lerp(dy, d00, d10); float d1 = Lerp(dy, d01, d11); return Lerp(dz, d0, d1); @@ -221,10 +185,8 @@ transmittance(uniform float3 p0, float3 p1, uniform float3 pMin, float t = rayT0; float3 pos = ray.origin + ray.dir * rayT0; float3 dirStep = ray.dir * stepT; - uniform bool checkForSameVoxel = true; while (t < rayT1) { - tau += stepDist * sigma_t * Density(pos, pMin, pMax, density, nVoxels, - checkForSameVoxel); + tau += stepDist * sigma_t * Density(pos, pMin, pMax, density, nVoxels); pos = pos + dirStep; t += stepT; } @@ -268,9 +230,8 @@ raymarch(uniform float density[], uniform int nVoxels[3], Ray ray) { float t = rayT0; float3 pos = ray.origin + ray.dir * rayT0; float3 dirStep = ray.dir * stepT; - uniform bool checkForSameVoxel = true; cwhile (t < rayT1) { - float d = Density(pos, pMin, pMax, density, nVoxels, checkForSameVoxel); + float d = Density(pos, pMin, pMax, density, nVoxels); // terminate once attenuation is high float atten = exp(-tau); diff --git a/examples/volume_rendering/volume.vcxproj b/examples/volume_rendering/volume.vcxproj index 04ae8335..908cf734 100644 --- a/examples/volume_rendering/volume.vcxproj +++ b/examples/volume_rendering/volume.vcxproj @@ -156,18 +156,18 @@ Document - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2 + ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2 + ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_ispc.h - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_ispc.h - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2 + $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h + $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h + ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2 + ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_ispc.h - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_ispc.h + $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h + $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h