Add foreach and foreach_tiled looping constructs

These make it easier to iterate over an arbitrary number of data elements; specifically, they automatically handle the "ragged extra bits" that come up when the number of elements to be processed isn't an exact multiple of programCount. TODO: documentation
2011-11-30 13:17:31 -08:00
parent b48775a549
commit 8bc7367109
32 changed files with 1120 additions and 78 deletions
--- a/examples/mandelbrot/mandelbrot.ispc
+++ b/examples/mandelbrot/mandelbrot.ispc
@@ -60,16 +60,16 @@ export void mandelbrot_ispc(uniform float x0, uniform float y0,
        // Note that we'll be doing programCount computations in parallel,
        // so increment i by that much.  This assumes that programCount
        // evenly divides width.
-        for (uniform int i = 0; i < width; i += programCount) {
+        foreach (i = 0 ... width) {
            // Figure out the position on the complex plane to compute the
            // number of iterations at.  Note that the x values are
            // different across different program instances, since its
            // initializer incorporates the value of the programIndex
            // variable.
-            float x = x0 + (programIndex + i) * dx;
+            float x = x0 + i * dx;
            float y = y0 + j * dy;

-            int index = j * width + i + programIndex;
+            int index = j * width + i;
            output[index] = mandel(x, y, maxIterations);
        }
    }
--- a/examples/mandelbrot_tasks/mandelbrot.ispc
+++ b/examples/mandelbrot_tasks/mandelbrot.ispc
@@ -61,14 +61,12 @@ mandelbrot_scanlines(uniform int ybase, uniform int span,
    uniform int ystart = ybase + taskIndex * span;
    uniform int yend = ystart + span;

-    for (uniform int j = ystart; j < yend; ++j) {
-        for (uniform int i = 0; i < width; i += programCount) {
-            float x = x0 + (programIndex + i) * dx;
-            float y = y0 + j * dy;
+    foreach (yi = ystart ... yend, xi = 0 ... width) {
+        float x = x0 + xi * dx;
+        float y = y0 + yi * dy;

-            int index = j * width + i + programIndex;
-            output[index] = mandel(x, y, maxIterations);
-        }
+        int index = yi * width + xi;
+        output[index] = mandel(x, y, maxIterations);
    }
 }
                               
--- a/examples/options/options.ispc
+++ b/examples/options/options.ispc
@@ -59,15 +59,13 @@ export void
 black_scholes_ispc(uniform float Sa[], uniform float Xa[], uniform float Ta[],
                   uniform float ra[], uniform float va[], 
                   uniform float result[], uniform int count) {
-    for (uniform int i = 0; i < count; i += programCount) {
-        float S = Sa[i + programIndex], X = Xa[i + programIndex];
-        float T = Ta[i + programIndex], r = ra[i + programIndex];
-        float v = va[i + programIndex];
+    foreach (i = 0 ... count) {
+        float S = Sa[i], X = Xa[i], T = Ta[i], r = ra[i], v = va[i];

        float d1 = (log(S/X) + (r + v * v * .5f) * T) / (v * sqrt(T));
        float d2 = d1 - v * sqrt(T);

-        result[i + programIndex] = S * CND(d1) - X * exp(-r * T) * CND(d2);
+        result[i] = S * CND(d1) - X * exp(-r * T) * CND(d2);
    }
 }

@@ -78,10 +76,8 @@ binomial_put_ispc(uniform float Sa[], uniform float Xa[], uniform float Ta[],
                  uniform float result[], uniform int count) {
    float V[BINOMIAL_NUM];

-    for (uniform int i = 0; i < count; i += programCount) {
-        float S = Sa[i + programIndex], X = Xa[i + programIndex];
-        float T = Ta[i + programIndex], r = ra[i + programIndex];
-        float v = va[i + programIndex];
+    foreach (i = 0 ... count) {
+        float S = Sa[i], X = Xa[i], T = Ta[i], r = ra[i], v = va[i];

        float dt = T / BINOMIAL_NUM;
        float u = exp(v * sqrt(dt));
@@ -98,6 +94,6 @@ binomial_put_ispc(uniform float Sa[], uniform float Xa[], uniform float Ta[],
            for (uniform int k = 0; k < j; ++k)
                V[k] = ((1 - Pu) * V[k] + Pu * V[k + 1]) / disc;

-        result[i + programIndex] = V[0];
+        result[i] = V[0];
    }
 }
--- a/examples/rt/rt.cpp
+++ b/examples/rt/rt.cpp
@@ -199,10 +199,8 @@ int main(int argc, char *argv[]) {
    }
    fclose(f);

-    // round image resolution up to multiple of 16 to make things easy for
-    // the code that assigns pixels to ispc program instances
-    int height = (int(baseHeight * scale) + 0xf) & ~0xf;
-    int width = (int(baseWidth * scale) + 0xf) & ~0xf;
+    int height = int(baseHeight * scale);
+    int width = int(baseWidth * scale);

    // allocate images; one to hold hit object ids, one to hold depth to
    // the first intersection
--- a/examples/rt/rt.ispc
+++ b/examples/rt/rt.ispc
@@ -244,34 +244,15 @@ static void raytrace_tile(uniform int x0, uniform int x1,
    uniform float widthScale = (float)(baseWidth) / (float)(width);
    uniform float heightScale = (float)(baseHeight) / (float)(height);

-    static const uniform float udx[16] = { 0, 1, 0, 1, 2, 3, 2, 3, 
-                                           0, 1, 0, 1, 2, 3, 2, 3 };
-    static const uniform float udy[16] = { 0, 0, 1, 1, 0, 0, 1, 1, 
-                                           2, 2, 3, 3, 2, 2, 3, 3 };
+    foreach_tiled (y = y0 ... y1, x = x0 ... x1) {
+        Ray ray;
+        generateRay(raster2camera, camera2world, x*widthScale,
+                    y*heightScale, ray);
+        BVHIntersect(nodes, triangles, ray);

-    // The outer loops are always over blocks of 4x4 pixels
-    for (uniform int y = y0; y < y1; y += 4) {
-        for (uniform int x = x0; x < x1; x += 4) {
-            // Now we have a block of 4x4=16 pixels to process; it will
-            // take 16/programCount iterations of this loop to process
-            // them.
-            for (uniform int o = 0; o < 16 / programCount; ++o) {
-                // Map program instances to samples in the udx/udy arrays
-                // to figure out which pixel each program instance is
-                // responsible for
-                const float dx = udx[o * programCount + programIndex];
-                const float dy = udy[o * programCount + programIndex];
-
-                Ray ray;
-                generateRay(raster2camera, camera2world, (x+dx)*widthScale,
-                            (y+dy)*heightScale, ray);
-                BVHIntersect(nodes, triangles, ray);
-
-                int offset = (y + (int)dy) * width + (x + (int)dx);
-                image[offset] = ray.maxt;
-                id[offset] = ray.hitId;
-            }
-        }
+        int offset = y * width + x;
+        image[offset] = ray.maxt;
+        id[offset] = ray.hitId;
    }
 }

--- a/examples/stencil/stencil.ispc
+++ b/examples/stencil/stencil.ispc
@@ -43,9 +43,8 @@ stencil_step(uniform int x0, uniform int x1,

    for (uniform int z = z0; z < z1; ++z) {
        for (uniform int y = y0; y < y1; ++y) {
-            // Assumes that (x1-x0) % programCount == 0
-            for (uniform int x = x0; x < x1; x += programCount) {
-                int index = (z * Nxy) + (y * Nx) + x + programIndex;
+            foreach (x = x0 ... x1) {
+                int index = (z * Nxy) + (y * Nx) + x;
 #define A_cur(x, y, z) Ain[index + (x) + ((y) * Nx) + ((z) * Nxy)]
 #define A_next(x, y, z) Aout[index + (x) + ((y) * Nx) + ((z) * Nxy)]
                float div = coef[0] * A_cur(0, 0, 0) +
--- a/examples/volume_rendering/volume.ispc
+++ b/examples/volume_rendering/volume.ispc
@@ -310,11 +310,7 @@ volume_tile(uniform int x0, uniform int y0, uniform int x1,
    // by 4.
    for (uniform int y = y0; y < y1; y += 4) {
        for (uniform int x = x0; x < x1; x += 4) {
-            // For each such tile, process programCount pixels at a time,
-            // until we've done all 16 of them.  Thus, we're also assuming
-            // that programCount <= 16 and that 16 is evenly dividible by
-            // programCount.
-            for (uniform int o = 0; o < 16; o += programCount) {
+            foreach (o = 0 ... 16) {
                // These two arrays encode the mapping from [0,15] to
                // offsets within the 4x4 pixel block so that we render
                // each pixel inside the block
@@ -324,8 +320,7 @@ volume_tile(uniform int x0, uniform int y0, uniform int x1,
                                                   2, 2, 3, 3, 2, 2, 3, 3 };

                // Figure out the pixel to render for this program instance
-                int xo = x + xoffsets[o + programIndex];
-                int yo = y + yoffsets[o + programIndex];
+                int xo = x + xoffsets[o], yo = y + yoffsets[o];

                // Use viewing parameters to compute the corresponding ray
                // for the pixel