Add foreach and foreach_tiled looping constructs
These make it easier to iterate over an arbitrary number of data elements; specifically, they automatically handle the "ragged extra bits" that come up when the number of elements to be processed isn't evenly divisible by programCount. TODO: documentation
This commit is contained in:
@@ -60,16 +60,16 @@ export void mandelbrot_ispc(uniform float x0, uniform float y0,
|
||||
// Note that we'll be doing programCount computations in parallel,
|
||||
// so increment i by that much. This assumes that programCount evenly
|
||||
// divides width.
|
||||
for (uniform int i = 0; i < width; i += programCount) {
|
||||
foreach (i = 0 ... width) {
|
||||
// Figure out the position on the complex plane to compute the
|
||||
// number of iterations at. Note that the x values are
|
||||
// different across different program instances, since its
|
||||
// initializer incorporates the value of the programIndex
|
||||
// variable.
|
||||
float x = x0 + (programIndex + i) * dx;
|
||||
float x = x0 + i * dx;
|
||||
float y = y0 + j * dy;
|
||||
|
||||
int index = j * width + i + programIndex;
|
||||
int index = j * width + i;
|
||||
output[index] = mandel(x, y, maxIterations);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -61,14 +61,12 @@ mandelbrot_scanlines(uniform int ybase, uniform int span,
|
||||
uniform int ystart = ybase + taskIndex * span;
|
||||
uniform int yend = ystart + span;
|
||||
|
||||
for (uniform int j = ystart; j < yend; ++j) {
|
||||
for (uniform int i = 0; i < width; i += programCount) {
|
||||
float x = x0 + (programIndex + i) * dx;
|
||||
float y = y0 + j * dy;
|
||||
foreach (yi = ystart ... yend, xi = 0 ... width) {
|
||||
float x = x0 + xi * dx;
|
||||
float y = y0 + yi * dy;
|
||||
|
||||
int index = j * width + i + programIndex;
|
||||
output[index] = mandel(x, y, maxIterations);
|
||||
}
|
||||
int index = yi * width + xi;
|
||||
output[index] = mandel(x, y, maxIterations);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -59,15 +59,13 @@ export void
|
||||
black_scholes_ispc(uniform float Sa[], uniform float Xa[], uniform float Ta[],
|
||||
uniform float ra[], uniform float va[],
|
||||
uniform float result[], uniform int count) {
|
||||
for (uniform int i = 0; i < count; i += programCount) {
|
||||
float S = Sa[i + programIndex], X = Xa[i + programIndex];
|
||||
float T = Ta[i + programIndex], r = ra[i + programIndex];
|
||||
float v = va[i + programIndex];
|
||||
foreach (i = 0 ... count) {
|
||||
float S = Sa[i], X = Xa[i], T = Ta[i], r = ra[i], v = va[i];
|
||||
|
||||
float d1 = (log(S/X) + (r + v * v * .5f) * T) / (v * sqrt(T));
|
||||
float d2 = d1 - v * sqrt(T);
|
||||
|
||||
result[i + programIndex] = S * CND(d1) - X * exp(-r * T) * CND(d2);
|
||||
result[i] = S * CND(d1) - X * exp(-r * T) * CND(d2);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -78,10 +76,8 @@ binomial_put_ispc(uniform float Sa[], uniform float Xa[], uniform float Ta[],
|
||||
uniform float result[], uniform int count) {
|
||||
float V[BINOMIAL_NUM];
|
||||
|
||||
for (uniform int i = 0; i < count; i += programCount) {
|
||||
float S = Sa[i + programIndex], X = Xa[i + programIndex];
|
||||
float T = Ta[i + programIndex], r = ra[i + programIndex];
|
||||
float v = va[i + programIndex];
|
||||
foreach (i = 0 ... count) {
|
||||
float S = Sa[i], X = Xa[i], T = Ta[i], r = ra[i], v = va[i];
|
||||
|
||||
float dt = T / BINOMIAL_NUM;
|
||||
float u = exp(v * sqrt(dt));
|
||||
@@ -98,6 +94,6 @@ binomial_put_ispc(uniform float Sa[], uniform float Xa[], uniform float Ta[],
|
||||
for (uniform int k = 0; k < j; ++k)
|
||||
V[k] = ((1 - Pu) * V[k] + Pu * V[k + 1]) / disc;
|
||||
|
||||
result[i + programIndex] = V[0];
|
||||
result[i] = V[0];
|
||||
}
|
||||
}
|
||||
|
||||
@@ -199,10 +199,8 @@ int main(int argc, char *argv[]) {
|
||||
}
|
||||
fclose(f);
|
||||
|
||||
// round image resolution up to multiple of 16 to make things easy for
|
||||
// the code that assigns pixels to ispc program instances
|
||||
int height = (int(baseHeight * scale) + 0xf) & ~0xf;
|
||||
int width = (int(baseWidth * scale) + 0xf) & ~0xf;
|
||||
int height = int(baseHeight * scale);
|
||||
int width = int(baseWidth * scale);
|
||||
|
||||
// allocate images; one to hold hit object ids, one to hold depth to
|
||||
// the first intersection
|
||||
|
||||
@@ -244,34 +244,15 @@ static void raytrace_tile(uniform int x0, uniform int x1,
|
||||
uniform float widthScale = (float)(baseWidth) / (float)(width);
|
||||
uniform float heightScale = (float)(baseHeight) / (float)(height);
|
||||
|
||||
static const uniform float udx[16] = { 0, 1, 0, 1, 2, 3, 2, 3,
|
||||
0, 1, 0, 1, 2, 3, 2, 3 };
|
||||
static const uniform float udy[16] = { 0, 0, 1, 1, 0, 0, 1, 1,
|
||||
2, 2, 3, 3, 2, 2, 3, 3 };
|
||||
foreach_tiled (y = y0 ... y1, x = x0 ... x1) {
|
||||
Ray ray;
|
||||
generateRay(raster2camera, camera2world, x*widthScale,
|
||||
y*heightScale, ray);
|
||||
BVHIntersect(nodes, triangles, ray);
|
||||
|
||||
// The outer loops are always over blocks of 4x4 pixels
|
||||
for (uniform int y = y0; y < y1; y += 4) {
|
||||
for (uniform int x = x0; x < x1; x += 4) {
|
||||
// Now we have a block of 4x4=16 pixels to process; it will
|
||||
// take 16/programCount iterations of this loop to process
|
||||
// them.
|
||||
for (uniform int o = 0; o < 16 / programCount; ++o) {
|
||||
// Map program instances to samples in the udx/udy arrays
|
||||
// to figure out which pixel each program instance is
|
||||
// responsible for
|
||||
const float dx = udx[o * programCount + programIndex];
|
||||
const float dy = udy[o * programCount + programIndex];
|
||||
|
||||
Ray ray;
|
||||
generateRay(raster2camera, camera2world, (x+dx)*widthScale,
|
||||
(y+dy)*heightScale, ray);
|
||||
BVHIntersect(nodes, triangles, ray);
|
||||
|
||||
int offset = (y + (int)dy) * width + (x + (int)dx);
|
||||
image[offset] = ray.maxt;
|
||||
id[offset] = ray.hitId;
|
||||
}
|
||||
}
|
||||
int offset = y * width + x;
|
||||
image[offset] = ray.maxt;
|
||||
id[offset] = ray.hitId;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -43,9 +43,8 @@ stencil_step(uniform int x0, uniform int x1,
|
||||
|
||||
for (uniform int z = z0; z < z1; ++z) {
|
||||
for (uniform int y = y0; y < y1; ++y) {
|
||||
// Assumes that (x1-x0) % programCount == 0
|
||||
for (uniform int x = x0; x < x1; x += programCount) {
|
||||
int index = (z * Nxy) + (y * Nx) + x + programIndex;
|
||||
foreach (x = x0 ... x1) {
|
||||
int index = (z * Nxy) + (y * Nx) + x;
|
||||
#define A_cur(x, y, z) Ain[index + (x) + ((y) * Nx) + ((z) * Nxy)]
|
||||
#define A_next(x, y, z) Aout[index + (x) + ((y) * Nx) + ((z) * Nxy)]
|
||||
float div = coef[0] * A_cur(0, 0, 0) +
|
||||
|
||||
@@ -310,11 +310,7 @@ volume_tile(uniform int x0, uniform int y0, uniform int x1,
|
||||
// by 4.
|
||||
for (uniform int y = y0; y < y1; y += 4) {
|
||||
for (uniform int x = x0; x < x1; x += 4) {
|
||||
// For each such tile, process programCount pixels at a time,
|
||||
// until we've done all 16 of them. Thus, we're also assuming
|
||||
// that programCount <= 16 and that 16 is evenly divisible by
|
||||
// programCount.
|
||||
for (uniform int o = 0; o < 16; o += programCount) {
|
||||
foreach (o = 0 ... 16) {
|
||||
// These two arrays encode the mapping from [0,15] to
|
||||
// offsets within the 4x4 pixel block so that we render
|
||||
// each pixel inside the block
|
||||
@@ -324,8 +320,7 @@ volume_tile(uniform int x0, uniform int y0, uniform int x1,
|
||||
2, 2, 3, 3, 2, 2, 3, 3 };
|
||||
|
||||
// Figure out the pixel to render for this program instance
|
||||
int xo = x + xoffsets[o + programIndex];
|
||||
int yo = y + yoffsets[o + programIndex];
|
||||
int xo = x + xoffsets[o], yo = y + yoffsets[o];
|
||||
|
||||
// Use viewing parameters to compute the corresponding ray
|
||||
// for the pixel
|
||||
|
||||
Reference in New Issue
Block a user