diff --git a/examples/mandelbrot_tasks/mandelbrot.ispc b/examples/mandelbrot_tasks/mandelbrot.ispc
index d8ba195b..acf22adf 100644
--- a/examples/mandelbrot_tasks/mandelbrot.ispc
+++ b/examples/mandelbrot_tasks/mandelbrot.ispc
@@ -49,19 +49,19 @@ mandel(float c_re, float c_im, int count) {
 }
 
 
-/* Task to compute the Mandelbrot iterations for a span of scanlines from
-   [ystart,yend).
+/* Task to compute the Mandelbrot iterations for one span of scanlines:
+   task taskIndex covers rows [taskIndex*span, min((taskIndex+1)*span, height)).
  */
 task void
-mandelbrot_scanlines(uniform int ybase, uniform int span,
-                     uniform float x0, uniform float dx,
-                     uniform float y0, uniform float dy,
-                     uniform int width, uniform int maxIterations,
-                     uniform int output[]) {
-    uniform int ystart = ybase + taskIndex * span;
-    uniform int yend = ystart + span;
+mandelbrot_scanline(uniform float x0, uniform float dx,
+                    uniform float y0, uniform float dy,
+                    uniform int width, uniform int height,
+                    uniform int span,
+                    uniform int maxIterations, uniform int output[]) {
+    uniform int ystart = taskIndex * span;
+    uniform int yend = min((taskIndex+1) * span, height);
 
     foreach (yi = ystart ... yend, xi = 0 ... width) {
         float x = x0 + xi * dx;
         float y = y0 + yi * dy;
 
@@ -71,20 +71,6 @@ mandelbrot_scanlines(uniform int ybase, uniform int span,
 }
 
 
-task void
-mandelbrot_chunk(uniform float x0, uniform float dx,
-                 uniform float y0, uniform float dy,
-                 uniform int width, uniform int height,
-                 uniform int maxIterations, uniform int output[]) {
-    uniform int ystart = taskIndex * (height/taskCount);
-    uniform int yend = (taskIndex+1) * (height/taskCount);
-    uniform int span = 1;
-
-    launch[(yend-ystart)/span] < mandelbrot_scanlines(ystart, span, x0, dx, y0, dy,
-                                                      width, maxIterations, output) >;
-}
-
-
 export void
 mandelbrot_ispc(uniform float x0, uniform float y0,
                 uniform float x1, uniform float y1,
@@ -92,7 +78,9 @@ mandelbrot_ispc(uniform float x0, uniform float y0,
                 uniform int maxIterations, uniform int output[]) {
     uniform float dx = (x1 - x0) / width;
     uniform float dy = (y1 - y0) / height;
+    uniform int span = 4;
 
-    launch[32] < mandelbrot_chunk(x0, dx, y0, dy, width, height,
-                                  maxIterations, output) >;
+    // Round the task count up so a trailing partial span is still rendered.
+    launch[(height + span - 1) / span] < mandelbrot_scanline(x0, dx, y0, dy, width, height, span,
+                                                             maxIterations, output) >;
 }
diff --git a/examples/stencil/stencil.ispc b/examples/stencil/stencil.ispc
index 10b7b6a3..0d3c2435 100644
--- a/examples/stencil/stencil.ispc
+++ b/examples/stencil/stencil.ispc
@@ -41,27 +41,23 @@ stencil_step(uniform int x0, uniform int x1,
              uniform const float Ain[], uniform float Aout[]) {
     const uniform int Nxy = Nx * Ny;
 
-    for (uniform int z = z0; z < z1; ++z) {
-        for (uniform int y = y0; y < y1; ++y) {
-            foreach (x = x0 ... x1) {
-                int index = (z * Nxy) + (y * Nx) + x;
+    foreach (z = z0 ... z1, y = y0 ... y1, x = x0 ... x1) {
+        int index = (z * Nxy) + (y * Nx) + x;
 #define A_cur(x, y, z) Ain[index + (x) + ((y) * Nx) + ((z) * Nxy)]
 #define A_next(x, y, z) Aout[index + (x) + ((y) * Nx) + ((z) * Nxy)]
-                float div = coef[0] * A_cur(0, 0, 0) +
-                            coef[1] * (A_cur(+1, 0, 0) + A_cur(-1, 0, 0) +
-                                       A_cur(0, +1, 0) + A_cur(0, -1, 0) +
-                                       A_cur(0, 0, +1) + A_cur(0, 0, -1)) +
-                            coef[2] * (A_cur(+2, 0, 0) + A_cur(-2, 0, 0) +
-                                       A_cur(0, +2, 0) + A_cur(0, -2, 0) +
-                                       A_cur(0, 0, +2) + A_cur(0, 0, -2)) +
-                            coef[3] * (A_cur(+3, 0, 0) + A_cur(-3, 0, 0) +
-                                       A_cur(0, +3, 0) + A_cur(0, -3, 0) +
-                                       A_cur(0, 0, +3) + A_cur(0, 0, -3));
+        float div = coef[0] * A_cur(0, 0, 0) +
+                    coef[1] * (A_cur(+1, 0, 0) + A_cur(-1, 0, 0) +
+                               A_cur(0, +1, 0) + A_cur(0, -1, 0) +
+                               A_cur(0, 0, +1) + A_cur(0, 0, -1)) +
+                    coef[2] * (A_cur(+2, 0, 0) + A_cur(-2, 0, 0) +
+                               A_cur(0, +2, 0) + A_cur(0, -2, 0) +
+                               A_cur(0, 0, +2) + A_cur(0, 0, -2)) +
+                    coef[3] * (A_cur(+3, 0, 0) + A_cur(-3, 0, 0) +
+                               A_cur(0, +3, 0) + A_cur(0, -3, 0) +
+                               A_cur(0, 0, +3) + A_cur(0, 0, -3));
 
-                A_next(0, 0, 0) = 2 * A_cur(0, 0, 0) - A_next(0, 0, 0) +
-                    vsq[index] * div;
-            }
-        }
+        A_next(0, 0, 0) = 2 * A_cur(0, 0, 0) - A_next(0, 0, 0) +
+            vsq[index] * div;
     }
 }
 
@@ -69,11 +65,13 @@ stencil_step(uniform int x0, uniform int x1,
 static task void
 stencil_step_task(uniform int x0, uniform int x1,
                   uniform int y0, uniform int y1,
-                  uniform int z0, uniform int z1,
+                  uniform int z0,
                   uniform int Nx, uniform int Ny, uniform int Nz,
                   uniform const float coef[4], uniform const float vsq[],
                   uniform const float Ain[], uniform float Aout[]) {
-    stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq, Ain, Aout);
+    // Each task processes the single z-slice [z0+taskIndex, z0+taskIndex+1).
+    stencil_step(x0, x1, y0, y1, z0+taskIndex, z0+taskIndex+1,
+                 Nx, Ny, Nz, coef, vsq, Ain, Aout);
 }
 
 
@@ -89,17 +87,14 @@ loop_stencil_ispc_tasks(uniform int t0, uniform int t1,
 {
     for (uniform int t = t0; t < t1; ++t) {
         // Parallelize across cores as well: each task will work on a slice
-        // of "dz" in the z extent of the volume. (dz=1 seems to work
-        // better than any larger values.)
-        uniform int dz = 1;
-        for (uniform int z = z0; z < z1; z += dz) {
-            if ((t & 1) == 0)
-                launch < stencil_step_task(x0, x1, y0, y1, z, z+dz, Nx, Ny, Nz,
-                                           coef, vsq, Aeven, Aodd) >;
-            else
-                launch < stencil_step_task(x0, x1, y0, y1, z, z+dz, Nx, Ny, Nz,
-                                           coef, vsq, Aodd, Aeven) >;
-        }
+        // of thickness 1 in the z extent of the volume.
+        if ((t & 1) == 0)
+            launch[z1-z0] < stencil_step_task(x0, x1, y0, y1, z0, Nx, Ny, Nz,
+                                              coef, vsq, Aeven, Aodd) >;
+        else
+            launch[z1-z0] < stencil_step_task(x0, x1, y0, y1, z0, Nx, Ny, Nz,
+                                              coef, vsq, Aodd, Aeven) >;
+
         // We need to wait for all of the launched tasks to finish before
         // starting the next iteration.
         sync;