Use reduce_equal() in volume rendering example to avoid some gathers.

Modified this example to use reduce_equal() to check whether all of the program instances want to load the 8 sample values around the same voxel. When this is the case, we can just do 8 scalar loads rather than a fully general gather. Once this check fails, it isn't done again for the remaining steps, since it's not likely to start succeeding again in the future. This gives a ~10% speedup with the low-res data set, and basically no performance difference with the high-res one. (This makes sense: the lower the resolution of the voxel sampling, the longer all of the rays will stay in the same set of voxels.)
2011-08-17 12:37:07 +01:00
parent ecaa57c7c6
commit d7662b3eb9
1 changed files with 51 additions and 18 deletions
--- a/examples/volume_rendering/volume.ispc
+++ b/examples/volume_rendering/volume.ispc
@@ -120,13 +120,17 @@ static inline float D(int x, int y, int z, uniform int nVoxels[3],
    y = clamp(y, 0, nVoxels[1]-1);
    z = clamp(z, 0, nVoxels[2]-1);

-#if 0
-    uniform int ux, uy, uz;
-    if (reduce_equal(x, ux) && reduce_equal(y, uy) && reduce_equal(z, uz))
-        return density[uz*nVoxels[0]*nVoxels[1] + uy*nVoxels[0] + ux];
-    else
-#endif
-        return density[z*nVoxels[0]*nVoxels[1] + y*nVoxels[0] + x];
+    return density[z*nVoxels[0]*nVoxels[1] + y*nVoxels[0] + x];
+}
+
+
+static inline float Du(uniform int x, uniform int y, uniform int z, 
+                       uniform int nVoxels[3], uniform float density[]) {
+    x = clamp(x, 0, nVoxels[0]-1);
+    y = clamp(y, 0, nVoxels[1]-1);
+    z = clamp(z, 0, nVoxels[2]-1);
+
+    return density[z*nVoxels[0]*nVoxels[1] + y*nVoxels[0] + x];
 }


@@ -136,7 +140,8 @@ static inline float3 Offset(float3 p, float3 pMin, float3 pMax) {


 static inline float Density(float3 Pobj, float3 pMin, float3 pMax, 
-                            uniform float density[], uniform int nVoxels[3]) {
+                            uniform float density[], uniform int nVoxels[3],
+                            reference uniform bool checkForSameVoxel) {
    if (!Inside(Pobj, pMin, pMax)) 
        return 0;
    // Compute voxel coordinates and offsets for _Pobj_
@@ -148,14 +153,39 @@ static inline float Density(float3 Pobj, float3 pMin, float3 pMax,
    float dx = vox.x - vx, dy = vox.y - vy, dz = vox.z - vz;

    // Trilinearly interpolate density values to compute local density
-    float d00 = Lerp(dx, D(vx, vy, vz, nVoxels, density),     
-                         D(vx+1, vy, vz, nVoxels, density));
-    float d10 = Lerp(dx, D(vx, vy+1, vz, nVoxels, density),   
-                         D(vx+1, vy+1, vz, nVoxels, density));
-    float d01 = Lerp(dx, D(vx, vy, vz+1, nVoxels, density),   
-                         D(vx+1, vy, vz+1, nVoxels, density));
-    float d11 = Lerp(dx, D(vx, vy+1, vz+1, nVoxels, density), 
-                         D(vx+1, vy+1, vz+1, nVoxels, density));
+    float d00, d10, d01, d11;
+    uniform int uvx, uvy, uvz;
+    if (checkForSameVoxel && reduce_equal(vx, uvx) && reduce_equal(vy, uvy) &&
+        reduce_equal(vz, uvz)) {
+        // If all of the program instances are inside the same voxel, then
+        // we'll call the 'uniform' variant of the voxel density lookup
+        // function, thus doing a single load for each value rather than a
+        // gather.
+        d00 = Lerp(dx, Du(uvx, uvy, uvz, nVoxels, density),     
+                       Du(uvx+1, uvy, uvz, nVoxels, density));
+        d10 = Lerp(dx, Du(uvx, uvy+1, uvz, nVoxels, density),   
+                       Du(uvx+1, uvy+1, uvz, nVoxels, density));
+        d01 = Lerp(dx, Du(uvx, uvy, uvz+1, nVoxels, density),   
+                       Du(uvx+1, uvy, uvz+1, nVoxels, density));
+        d11 = Lerp(dx, Du(uvx, uvy+1, uvz+1, nVoxels, density), 
+                       Du(uvx+1, uvy+1, uvz+1, nVoxels, density));
+    }
+    else {
+        // Otherwise, we have to do an actual gather in the more general
+        // D() function.  Once the reduce_equal tests above fail, we stop
+        // checking in subsequent steps, since it's unlikely that this will
+        // be true in the future once they've diverged into different
+        // voxels.
+        checkForSameVoxel = false;
+        d00 = Lerp(dx, D(vx, vy, vz, nVoxels, density),     
+                       D(vx+1, vy, vz, nVoxels, density));
+        d10 = Lerp(dx, D(vx, vy+1, vz, nVoxels, density),   
+                       D(vx+1, vy+1, vz, nVoxels, density));
+        d01 = Lerp(dx, D(vx, vy, vz+1, nVoxels, density),   
+                       D(vx+1, vy, vz+1, nVoxels, density));
+        d11 = Lerp(dx, D(vx, vy+1, vz+1, nVoxels, density), 
+                       D(vx+1, vy+1, vz+1, nVoxels, density));
+    }
    float d0 = Lerp(dy, d00, d10);
    float d1 = Lerp(dy, d01, d11);
    return Lerp(dz, d0, d1);
@@ -191,8 +221,10 @@ transmittance(uniform float3 p0, float3 p1, uniform float3 pMin,
    float t = rayT0;
    float3 pos = ray.origin + ray.dir * rayT0;
    float3 dirStep = ray.dir * stepT;
+    uniform bool checkForSameVoxel = true;
    while (t < rayT1) {
-        tau += stepDist * sigma_t * Density(pos, pMin, pMax, density, nVoxels);
+        tau += stepDist * sigma_t * Density(pos, pMin, pMax, density, nVoxels,
+                                            checkForSameVoxel);
        pos = pos + dirStep;
        t += stepT;
    }
@@ -236,8 +268,9 @@ raymarch(uniform float density[], uniform int nVoxels[3], Ray ray) {
    float t = rayT0;
    float3 pos = ray.origin + ray.dir * rayT0;
    float3 dirStep = ray.dir * stepT;
+    uniform bool checkForSameVoxel = true;
    cwhile (t < rayT1) {
-        float d = Density(pos, pMin, pMax, density, nVoxels);
+        float d = Density(pos, pMin, pMax, density, nVoxels, checkForSameVoxel);

        // terminate once attenuation is high
        float atten = exp(-tau);