Add foreach and foreach_tiled looping constructs

These make it easier to iterate over arbitrary amounts of data
elements; specifically, they automatically handle the "ragged
extra bits" that come up when the number of elements to be
processed isn't evenly divided by programCount.

TODO: documentation
This commit is contained in:
Matt Pharr
2011-11-30 13:17:31 -08:00
parent b48775a549
commit 8bc7367109
32 changed files with 1120 additions and 78 deletions

View File

@@ -60,16 +60,16 @@ export void mandelbrot_ispc(uniform float x0, uniform float y0,
// Note that we'll be doing programCount computations in parallel,
// so increment i by that much. This assumes that width evenly
// divides programCount.
for (uniform int i = 0; i < width; i += programCount) {
foreach (i = 0 ... width) {
// Figure out the position on the complex plane to compute the
// number of iterations at. Note that the x values are
// different across different program instances, since its
// initializer incorporates the value of the programIndex
// variable.
float x = x0 + (programIndex + i) * dx;
float x = x0 + i * dx;
float y = y0 + j * dy;
int index = j * width + i + programIndex;
int index = j * width + i;
output[index] = mandel(x, y, maxIterations);
}
}

View File

@@ -61,14 +61,12 @@ mandelbrot_scanlines(uniform int ybase, uniform int span,
uniform int ystart = ybase + taskIndex * span;
uniform int yend = ystart + span;
for (uniform int j = ystart; j < yend; ++j) {
for (uniform int i = 0; i < width; i += programCount) {
float x = x0 + (programIndex + i) * dx;
float y = y0 + j * dy;
foreach (yi = ystart ... yend, xi = 0 ... width) {
float x = x0 + xi * dx;
float y = y0 + yi * dy;
int index = j * width + i + programIndex;
output[index] = mandel(x, y, maxIterations);
}
int index = yi * width + xi;
output[index] = mandel(x, y, maxIterations);
}
}

View File

@@ -59,15 +59,13 @@ export void
black_scholes_ispc(uniform float Sa[], uniform float Xa[], uniform float Ta[],
uniform float ra[], uniform float va[],
uniform float result[], uniform int count) {
for (uniform int i = 0; i < count; i += programCount) {
float S = Sa[i + programIndex], X = Xa[i + programIndex];
float T = Ta[i + programIndex], r = ra[i + programIndex];
float v = va[i + programIndex];
foreach (i = 0 ... count) {
float S = Sa[i], X = Xa[i], T = Ta[i], r = ra[i], v = va[i];
float d1 = (log(S/X) + (r + v * v * .5f) * T) / (v * sqrt(T));
float d2 = d1 - v * sqrt(T);
result[i + programIndex] = S * CND(d1) - X * exp(-r * T) * CND(d2);
result[i] = S * CND(d1) - X * exp(-r * T) * CND(d2);
}
}
@@ -78,10 +76,8 @@ binomial_put_ispc(uniform float Sa[], uniform float Xa[], uniform float Ta[],
uniform float result[], uniform int count) {
float V[BINOMIAL_NUM];
for (uniform int i = 0; i < count; i += programCount) {
float S = Sa[i + programIndex], X = Xa[i + programIndex];
float T = Ta[i + programIndex], r = ra[i + programIndex];
float v = va[i + programIndex];
foreach (i = 0 ... count) {
float S = Sa[i], X = Xa[i], T = Ta[i], r = ra[i], v = va[i];
float dt = T / BINOMIAL_NUM;
float u = exp(v * sqrt(dt));
@@ -98,6 +94,6 @@ binomial_put_ispc(uniform float Sa[], uniform float Xa[], uniform float Ta[],
for (uniform int k = 0; k < j; ++k)
V[k] = ((1 - Pu) * V[k] + Pu * V[k + 1]) / disc;
result[i + programIndex] = V[0];
result[i] = V[0];
}
}

View File

@@ -199,10 +199,8 @@ int main(int argc, char *argv[]) {
}
fclose(f);
// round image resolution up to multiple of 16 to make things easy for
// the code that assigns pixels to ispc program instances
int height = (int(baseHeight * scale) + 0xf) & ~0xf;
int width = (int(baseWidth * scale) + 0xf) & ~0xf;
int height = int(baseHeight * scale);
int width = int(baseWidth * scale);
// allocate images; one to hold hit object ids, one to hold depth to
// the first interseciton

View File

@@ -244,34 +244,15 @@ static void raytrace_tile(uniform int x0, uniform int x1,
uniform float widthScale = (float)(baseWidth) / (float)(width);
uniform float heightScale = (float)(baseHeight) / (float)(height);
static const uniform float udx[16] = { 0, 1, 0, 1, 2, 3, 2, 3,
0, 1, 0, 1, 2, 3, 2, 3 };
static const uniform float udy[16] = { 0, 0, 1, 1, 0, 0, 1, 1,
2, 2, 3, 3, 2, 2, 3, 3 };
foreach_tiled (y = y0 ... y1, x = x0 ... x1) {
Ray ray;
generateRay(raster2camera, camera2world, x*widthScale,
y*heightScale, ray);
BVHIntersect(nodes, triangles, ray);
// The outer loops are always over blocks of 4x4 pixels
for (uniform int y = y0; y < y1; y += 4) {
for (uniform int x = x0; x < x1; x += 4) {
// Now we have a block of 4x4=16 pixels to process; it will
// take 16/programCount iterations of this loop to process
// them.
for (uniform int o = 0; o < 16 / programCount; ++o) {
// Map program instances to samples in the udx/udy arrays
// to figure out which pixel each program instance is
// responsible for
const float dx = udx[o * programCount + programIndex];
const float dy = udy[o * programCount + programIndex];
Ray ray;
generateRay(raster2camera, camera2world, (x+dx)*widthScale,
(y+dy)*heightScale, ray);
BVHIntersect(nodes, triangles, ray);
int offset = (y + (int)dy) * width + (x + (int)dx);
image[offset] = ray.maxt;
id[offset] = ray.hitId;
}
}
int offset = y * width + x;
image[offset] = ray.maxt;
id[offset] = ray.hitId;
}
}

View File

@@ -43,9 +43,8 @@ stencil_step(uniform int x0, uniform int x1,
for (uniform int z = z0; z < z1; ++z) {
for (uniform int y = y0; y < y1; ++y) {
// Assumes that (x1-x0) % programCount == 0
for (uniform int x = x0; x < x1; x += programCount) {
int index = (z * Nxy) + (y * Nx) + x + programIndex;
foreach (x = x0 ... x1) {
int index = (z * Nxy) + (y * Nx) + x;
#define A_cur(x, y, z) Ain[index + (x) + ((y) * Nx) + ((z) * Nxy)]
#define A_next(x, y, z) Aout[index + (x) + ((y) * Nx) + ((z) * Nxy)]
float div = coef[0] * A_cur(0, 0, 0) +

View File

@@ -310,11 +310,7 @@ volume_tile(uniform int x0, uniform int y0, uniform int x1,
// by 4.
for (uniform int y = y0; y < y1; y += 4) {
for (uniform int x = x0; x < x1; x += 4) {
// For each such tile, process programCount pixels at a time,
// until we've done all 16 of them. Thus, we're also assuming
// that programCount <= 16 and that 16 is evenly dividible by
// programCount.
for (uniform int o = 0; o < 16; o += programCount) {
foreach (o = 0 ... 16) {
// These two arrays encode the mapping from [0,15] to
// offsets within the 4x4 pixel block so that we render
// each pixel inside the block
@@ -324,8 +320,7 @@ volume_tile(uniform int x0, uniform int y0, uniform int x1,
2, 2, 3, 3, 2, 2, 3, 3 };
// Figure out the pixel to render for this program instance
int xo = x + xoffsets[o + programIndex];
int yo = y + yoffsets[o + programIndex];
int xo = x + xoffsets[o], yo = y + yoffsets[o];
// Use viewing parameters to compute the corresponding ray
// for the pixel