diff --git a/docs/perf.txt b/docs/perf.txt
index 5c2a203b..62ee1907 100644
--- a/docs/perf.txt
+++ b/docs/perf.txt
@@ -46,8 +46,8 @@ also included in the ``examples/`` directory.)
- 4.05x
- 15.53x
* - `Volume Rendering`_
- - 3.11x
- - 15.80x
+ - 3.60x
+ - 17.53x
.. _AOBench: https://github.com/ispc/ispc/tree/master/examples/aobench
diff --git a/examples/volume_rendering/Makefile b/examples/volume_rendering/Makefile
index fa8ff753..2c7bcf2e 100644
--- a/examples/volume_rendering/Makefile
+++ b/examples/volume_rendering/Makefile
@@ -8,10 +8,10 @@ TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
CXX=g++
CXXFLAGS=-Iobjs/ -O3 -Wall -m64
ISPC=ispc
-ISPCFLAGS=-O2 --target=sse2,sse4-x2 --arch=x86-64
+ISPCFLAGS=-O2 --target=sse2,sse4-x2,avx --arch=x86-64
OBJS=objs/volume.o objs/volume_serial.o $(TASK_OBJ) objs/volume_ispc.o \
- objs/volume_ispc_sse2.o objs/volume_ispc_sse4.o
+ objs/volume_ispc_sse2.o objs/volume_ispc_sse4.o objs/volume_ispc_avx.o
default: volume
@@ -34,5 +34,5 @@ objs/%.o: ../%.cpp
objs/volume.o: objs/volume_ispc.h
-objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o: %.ispc
+objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
diff --git a/examples/volume_rendering/volume.ispc b/examples/volume_rendering/volume.ispc
index 229f510b..7f3367f1 100644
--- a/examples/volume_rendering/volume.ispc
+++ b/examples/volume_rendering/volume.ispc
@@ -124,24 +124,13 @@ static inline float D(int x, int y, int z, uniform int nVoxels[3],
}
-static inline float Du(uniform int x, uniform int y, uniform int z,
- uniform int nVoxels[3], uniform float density[]) {
- x = clamp(x, 0, nVoxels[0]-1);
- y = clamp(y, 0, nVoxels[1]-1);
- z = clamp(z, 0, nVoxels[2]-1);
-
- return density[z*nVoxels[0]*nVoxels[1] + y*nVoxels[0] + x];
-}
-
-
static inline float3 Offset(float3 p, float3 pMin, float3 pMax) {
return (p - pMin) / (pMax - pMin);
}
static inline float Density(float3 Pobj, float3 pMin, float3 pMax,
- uniform float density[], uniform int nVoxels[3],
- uniform bool &checkForSameVoxel) {
+ uniform float density[], uniform int nVoxels[3]) {
if (!Inside(Pobj, pMin, pMax))
return 0;
// Compute voxel coordinates and offsets for _Pobj_
@@ -153,39 +142,14 @@ static inline float Density(float3 Pobj, float3 pMin, float3 pMax,
float dx = vox.x - vx, dy = vox.y - vy, dz = vox.z - vz;
// Trilinearly interpolate density values to compute local density
- float d00, d10, d01, d11;
- uniform int uvx, uvy, uvz;
- if (checkForSameVoxel && reduce_equal(vx, &uvx) && reduce_equal(vy, &uvy) &&
- reduce_equal(vz, &uvz)) {
- // If all of the program instances are inside the same voxel, then
- // we'll call the 'uniform' variant of the voxel density lookup
- // function, thus doing a single load for each value rather than a
- // gather.
- d00 = Lerp(dx, Du(uvx, uvy, uvz, nVoxels, density),
- Du(uvx+1, uvy, uvz, nVoxels, density));
- d10 = Lerp(dx, Du(uvx, uvy+1, uvz, nVoxels, density),
- Du(uvx+1, uvy+1, uvz, nVoxels, density));
- d01 = Lerp(dx, Du(uvx, uvy, uvz+1, nVoxels, density),
- Du(uvx+1, uvy, uvz+1, nVoxels, density));
- d11 = Lerp(dx, Du(uvx, uvy+1, uvz+1, nVoxels, density),
- Du(uvx+1, uvy+1, uvz+1, nVoxels, density));
- }
- else {
- // Otherwise, we have to do an actual gather in the more general
- // D() function. Once the reduce_equal tests above fail, we stop
- // checking in subsequent steps, since it's unlikely that this will
- // be true in the future once they've diverged into different
- // voxels.
- checkForSameVoxel = false;
- d00 = Lerp(dx, D(vx, vy, vz, nVoxels, density),
- D(vx+1, vy, vz, nVoxels, density));
- d10 = Lerp(dx, D(vx, vy+1, vz, nVoxels, density),
- D(vx+1, vy+1, vz, nVoxels, density));
- d01 = Lerp(dx, D(vx, vy, vz+1, nVoxels, density),
- D(vx+1, vy, vz+1, nVoxels, density));
- d11 = Lerp(dx, D(vx, vy+1, vz+1, nVoxels, density),
- D(vx+1, vy+1, vz+1, nVoxels, density));
- }
+ float d00 = Lerp(dx, D(vx, vy, vz, nVoxels, density),
+ D(vx+1, vy, vz, nVoxels, density));
+ float d10 = Lerp(dx, D(vx, vy+1, vz, nVoxels, density),
+ D(vx+1, vy+1, vz, nVoxels, density));
+ float d01 = Lerp(dx, D(vx, vy, vz+1, nVoxels, density),
+ D(vx+1, vy, vz+1, nVoxels, density));
+ float d11 = Lerp(dx, D(vx, vy+1, vz+1, nVoxels, density),
+ D(vx+1, vy+1, vz+1, nVoxels, density));
float d0 = Lerp(dy, d00, d10);
float d1 = Lerp(dy, d01, d11);
return Lerp(dz, d0, d1);
@@ -221,10 +185,8 @@ transmittance(uniform float3 p0, float3 p1, uniform float3 pMin,
float t = rayT0;
float3 pos = ray.origin + ray.dir * rayT0;
float3 dirStep = ray.dir * stepT;
- uniform bool checkForSameVoxel = true;
while (t < rayT1) {
- tau += stepDist * sigma_t * Density(pos, pMin, pMax, density, nVoxels,
- checkForSameVoxel);
+ tau += stepDist * sigma_t * Density(pos, pMin, pMax, density, nVoxels);
pos = pos + dirStep;
t += stepT;
}
@@ -268,9 +230,8 @@ raymarch(uniform float density[], uniform int nVoxels[3], Ray ray) {
float t = rayT0;
float3 pos = ray.origin + ray.dir * rayT0;
float3 dirStep = ray.dir * stepT;
- uniform bool checkForSameVoxel = true;
cwhile (t < rayT1) {
- float d = Density(pos, pMin, pMax, density, nVoxels, checkForSameVoxel);
+ float d = Density(pos, pMin, pMax, density, nVoxels);
// terminate once attenuation is high
float atten = exp(-tau);
diff --git a/examples/volume_rendering/volume.vcxproj b/examples/volume_rendering/volume.vcxproj
index 04ae8335..908cf734 100644
--- a/examples/volume_rendering/volume.vcxproj
+++ b/examples/volume_rendering/volume.vcxproj
@@ -156,18 +156,18 @@
Document
- ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2
+ ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx
- ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2
+ ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx
- $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_ispc.h
- $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_ispc.h
- ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2
+ $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h
+ $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h
+ ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx
- ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2
+ ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx
- $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_ispc.h
- $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_ispc.h
+ $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h
+ $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h