Use reduce_equal() in volume rendering example to avoid some gathers.

Modified this example to use reduce_equal() to see if all of the program
instances want to load the 8 sample values around the same voxel.  When
this is the case, we can just do 8 scalar loads, rather than needing to
do a fully general gather.  Once this check fails, it isn't done again,
since it's not likely to start succeeding in the future.  This gives
a ~10% speedup with the low-res data set, and basically no performance
difference with the high-res one.  (It makes sense that the lower-resolution
the voxel sampling, the longer all of the rays will stay in the same set
of voxels.)
This commit is contained in:
Matt Pharr
2011-08-17 12:37:07 +01:00
parent ecaa57c7c6
commit d7662b3eb9

View File

@@ -120,13 +120,17 @@ static inline float D(int x, int y, int z, uniform int nVoxels[3],
y = clamp(y, 0, nVoxels[1]-1);
z = clamp(z, 0, nVoxels[2]-1);
#if 0
uniform int ux, uy, uz;
if (reduce_equal(x, ux) && reduce_equal(y, uy) && reduce_equal(z, uz))
return density[uz*nVoxels[0]*nVoxels[1] + uy*nVoxels[0] + ux];
else
#endif
return density[z*nVoxels[0]*nVoxels[1] + y*nVoxels[0] + x];
return density[z*nVoxels[0]*nVoxels[1] + y*nVoxels[0] + x];
}
static inline float Du(uniform int x, uniform int y, uniform int z,
uniform int nVoxels[3], uniform float density[]) {
x = clamp(x, 0, nVoxels[0]-1);
y = clamp(y, 0, nVoxels[1]-1);
z = clamp(z, 0, nVoxels[2]-1);
return density[z*nVoxels[0]*nVoxels[1] + y*nVoxels[0] + x];
}
@@ -136,7 +140,8 @@ static inline float3 Offset(float3 p, float3 pMin, float3 pMax) {
static inline float Density(float3 Pobj, float3 pMin, float3 pMax,
uniform float density[], uniform int nVoxels[3]) {
uniform float density[], uniform int nVoxels[3],
reference uniform bool checkForSameVoxel) {
if (!Inside(Pobj, pMin, pMax))
return 0;
// Compute voxel coordinates and offsets for _Pobj_
@@ -148,14 +153,39 @@ static inline float Density(float3 Pobj, float3 pMin, float3 pMax,
float dx = vox.x - vx, dy = vox.y - vy, dz = vox.z - vz;
// Trilinearly interpolate density values to compute local density
float d00 = Lerp(dx, D(vx, vy, vz, nVoxels, density),
D(vx+1, vy, vz, nVoxels, density));
float d10 = Lerp(dx, D(vx, vy+1, vz, nVoxels, density),
D(vx+1, vy+1, vz, nVoxels, density));
float d01 = Lerp(dx, D(vx, vy, vz+1, nVoxels, density),
D(vx+1, vy, vz+1, nVoxels, density));
float d11 = Lerp(dx, D(vx, vy+1, vz+1, nVoxels, density),
D(vx+1, vy+1, vz+1, nVoxels, density));
float d00, d10, d01, d11;
uniform int uvx, uvy, uvz;
if (checkForSameVoxel && reduce_equal(vx, uvx) && reduce_equal(vy, uvy) &&
reduce_equal(vz, uvz)) {
// If all of the program instances are inside the same voxel, then
// we'll call the 'uniform' variant of the voxel density lookup
// function, thus doing a single load for each value rather than a
// gather.
d00 = Lerp(dx, Du(uvx, uvy, uvz, nVoxels, density),
Du(uvx+1, uvy, uvz, nVoxels, density));
d10 = Lerp(dx, Du(uvx, uvy+1, uvz, nVoxels, density),
Du(uvx+1, uvy+1, uvz, nVoxels, density));
d01 = Lerp(dx, Du(uvx, uvy, uvz+1, nVoxels, density),
Du(uvx+1, uvy, uvz+1, nVoxels, density));
d11 = Lerp(dx, Du(uvx, uvy+1, uvz+1, nVoxels, density),
Du(uvx+1, uvy+1, uvz+1, nVoxels, density));
}
else {
// Otherwise, we have to do an actual gather in the more general
// D() function. Once the reduce_equal tests above fail, we stop
// checking in subsequent steps, since it's unlikely that this will
// be true in the future once they've diverged into different
// voxels.
checkForSameVoxel = false;
d00 = Lerp(dx, D(vx, vy, vz, nVoxels, density),
D(vx+1, vy, vz, nVoxels, density));
d10 = Lerp(dx, D(vx, vy+1, vz, nVoxels, density),
D(vx+1, vy+1, vz, nVoxels, density));
d01 = Lerp(dx, D(vx, vy, vz+1, nVoxels, density),
D(vx+1, vy, vz+1, nVoxels, density));
d11 = Lerp(dx, D(vx, vy+1, vz+1, nVoxels, density),
D(vx+1, vy+1, vz+1, nVoxels, density));
}
float d0 = Lerp(dy, d00, d10);
float d1 = Lerp(dy, d01, d11);
return Lerp(dz, d0, d1);
@@ -191,8 +221,10 @@ transmittance(uniform float3 p0, float3 p1, uniform float3 pMin,
float t = rayT0;
float3 pos = ray.origin + ray.dir * rayT0;
float3 dirStep = ray.dir * stepT;
uniform bool checkForSameVoxel = true;
while (t < rayT1) {
tau += stepDist * sigma_t * Density(pos, pMin, pMax, density, nVoxels);
tau += stepDist * sigma_t * Density(pos, pMin, pMax, density, nVoxels,
checkForSameVoxel);
pos = pos + dirStep;
t += stepT;
}
@@ -236,8 +268,9 @@ raymarch(uniform float density[], uniform int nVoxels[3], Ray ray) {
float t = rayT0;
float3 pos = ray.origin + ray.dir * rayT0;
float3 dirStep = ray.dir * stepT;
uniform bool checkForSameVoxel = true;
cwhile (t < rayT1) {
float d = Density(pos, pMin, pMax, density, nVoxels);
float d = Density(pos, pMin, pMax, density, nVoxels, checkForSameVoxel);
// terminate once attenuation is high
float atten = exp(-tau);