diff --git a/examples/stencil/stencil.cpp b/examples/stencil/stencil.cpp index a3ce5712..98ff937a 100644 --- a/examples/stencil/stencil.cpp +++ b/examples/stencil/stencil.cpp @@ -116,20 +116,38 @@ int main() { InitData(Nx, Ny, Nz, Aispc, vsq); // - // Compute the image using the ispc implementation; report the minimum - // time of three runs. + // Compute the image using the ispc implementation on one core; report + // the minimum time of three runs. // - double minISPC = 1e30; + double minTimeISPC = 1e30; for (int i = 0; i < 3; ++i) { reset_and_start_timer(); loop_stencil_ispc(0, 6, width, Nx - width, width, Ny - width, width, Nz - width, Nx, Ny, Nz, coeff, vsq, Aispc[0], Aispc[1]); double dt = get_elapsed_mcycles(); - minISPC = std::min(minISPC, dt); + minTimeISPC = std::min(minTimeISPC, dt); } - printf("[stencil ispc]:\t\t\t[%.3f] million cycles\n", minISPC); + printf("[stencil ispc 1 core]:\t\t[%.3f] million cycles\n", minTimeISPC); + + InitData(Nx, Ny, Nz, Aispc, vsq); + + // + // Compute the image using the ispc implementation with tasks; report + // the minimum time of three runs. + // + double minTimeISPCTasks = 1e30; + for (int i = 0; i < 3; ++i) { + reset_and_start_timer(); + loop_stencil_ispc_tasks(0, 6, width, Nx - width, width, Ny - width, + width, Nz - width, Nx, Ny, Nz, coeff, vsq, + Aispc[0], Aispc[1]); + double dt = get_elapsed_mcycles(); + minTimeISPCTasks = std::min(minTimeISPCTasks, dt); + } + + printf("[stencil ispc + tasks]:\t\t[%.3f] million cycles\n", minTimeISPCTasks); InitData(Nx, Ny, Nz, Aserial, vsq); @@ -137,19 +155,20 @@ int main() { // And run the serial implementation 3 times, again reporting the // minimum time. // - double minSerial = 1e30; + double minTimeSerial = 1e30; for (int i = 0; i < 3; ++i) { reset_and_start_timer(); loop_stencil_serial(0, 6, width, Nx-width, width, Ny - width, width, Nz - width, Nx, Ny, Nz, coeff, vsq, Aserial[0], Aserial[1]); double dt = get_elapsed_mcycles(); - minSerial = std::min(minSerial, dt); + minTimeSerial = std::min(minTimeSerial, dt); } - printf("[stencil serial]:\t\t[%.3f] millon cycles\n", minSerial); + printf("[stencil serial]:\t\t[%.3f] millon cycles\n", minTimeSerial); - printf("\t\t\t\t(%.2fx speedup from ISPC)\n", minSerial/minISPC); + printf("\t\t\t\t(%.2fx speedup from ISPC, %.2f from ISPC + tasks)\n", + minTimeSerial / minTimeISPC, minTimeSerial / minTimeISPCTasks); // Check for agreement int offset = 0; diff --git a/examples/stencil/stencil.ispc b/examples/stencil/stencil.ispc index bb618c2b..d707640c 100644 --- a/examples/stencil/stencil.ispc +++ b/examples/stencil/stencil.ispc @@ -32,7 +32,7 @@ */ -static task void +static void stencil_step(uniform int x0, uniform int x1, uniform int y0, uniform int y1, uniform int z0, uniform int z1, @@ -67,14 +67,26 @@ stencil_step(uniform int x0, uniform int x1, } -export void loop_stencil_ispc(uniform int t0, uniform int t1, - uniform int x0, uniform int x1, - uniform int y0, uniform int y1, - uniform int z0, uniform int z1, - uniform int Nx, uniform int Ny, uniform int Nz, - uniform const float coef[4], - uniform const float vsq[], - uniform float Aeven[], uniform float Aodd[]) +static task void +stencil_step_task(uniform int x0, uniform int x1, + uniform int y0, uniform int y1, + uniform int z0, uniform int z1, + uniform int Nx, uniform int Ny, uniform int Nz, + uniform const float coef[4], uniform const float vsq[], + uniform const float Ain[], uniform float Aout[]) { + stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq, Ain, Aout); +} + + +export void +loop_stencil_ispc_tasks(uniform int t0, uniform int t1, + uniform int x0, uniform int x1, + uniform int y0, uniform int y1, + uniform int z0, uniform int z1, + uniform int Nx, uniform int Ny, uniform int Nz, + uniform const float coef[4], + uniform const float vsq[], + uniform float Aeven[], uniform float Aodd[]) { for (uniform int t = t0; t < t1; ++t) { // Parallelize across cores as well: each task will work on a slice @@ -83,14 +95,35 @@ export void loop_stencil_ispc(uniform int t0, uniform int t1, uniform int dz = 1; for (uniform int z = z0; z < z1; z += dz) { if ((t & 1) == 0) - launch < stencil_step(x0, x1, y0, y1, z, z+dz, Nx, Ny, Nz, coef, vsq, - Aeven, Aodd) >; + launch < stencil_step_task(x0, x1, y0, y1, z, z+dz, Nx, Ny, Nz, + coef, vsq, Aeven, Aodd) >; else - launch < stencil_step(x0, x1, y0, y1, z, z+dz, Nx, Ny, Nz, coef, vsq, - Aodd, Aeven) >; + launch < stencil_step_task(x0, x1, y0, y1, z, z+dz, Nx, Ny, Nz, + coef, vsq, Aodd, Aeven) >; } // We need to wait for all of the launched tasks to finish before // starting the next iteration. sync; } } + + +export void +loop_stencil_ispc(uniform int t0, uniform int t1, + uniform int x0, uniform int x1, + uniform int y0, uniform int y1, + uniform int z0, uniform int z1, + uniform int Nx, uniform int Ny, uniform int Nz, + uniform const float coef[4], + uniform const float vsq[], + uniform float Aeven[], uniform float Aodd[]) +{ + for (uniform int t = t0; t < t1; ++t) { + if ((t & 1) == 0) + stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq, + Aeven, Aodd); + else + stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq, + Aodd, Aeven); + } +}