diff --git a/examples_ptx/stencil/.gitignore b/examples_ptx/stencil/.gitignore new file mode 100644 index 00000000..e043735b --- /dev/null +++ b/examples_ptx/stencil/.gitignore @@ -0,0 +1,2 @@ +stencil +objs diff --git a/examples_ptx/stencil/Makefile b/examples_ptx/stencil/Makefile new file mode 100644 index 00000000..1b9c2717 --- /dev/null +++ b/examples_ptx/stencil/Makefile @@ -0,0 +1,8 @@ + +EXAMPLE=stencil +CPP_SRC=stencil.cpp stencil_serial.cpp +ISPC_SRC=stencil.ispc +ISPC_IA_TARGETS=sse2-i32x4,sse4-i32x8,avx1-i32x16,avx2-i32x16 +ISPC_ARM_TARGETS=neon + +include ../common.mk diff --git a/examples_ptx/stencil/Makefile_cpu b/examples_ptx/stencil/Makefile_cpu new file mode 100644 index 00000000..18dc5ccd --- /dev/null +++ b/examples_ptx/stencil/Makefile_cpu @@ -0,0 +1,8 @@ + +EXAMPLE=stencil +CPP_SRC=stencil.cpp stencil_serial.cpp +ISPC_SRC=stencil.ispc +ISPC_IA_TARGETS=avx1-i32x16 +ISPC_ARM_TARGETS=neon + +include ../common.mk diff --git a/examples_ptx/stencil/Makefile_gpu b/examples_ptx/stencil/Makefile_gpu new file mode 100644 index 00000000..3f31b3e0 --- /dev/null +++ b/examples_ptx/stencil/Makefile_gpu @@ -0,0 +1,10 @@ +PROG=stencil +ISPC_SRC=stencil.ispc +CU_SRC=stencil.cu +CXX_SRC=stencil.cpp stencil_serial.cpp +PTXCC_REGMAX=128 + +include ../common_gpu.mk + + + diff --git a/examples_ptx/stencil/stencil.cpp b/examples_ptx/stencil/stencil.cpp new file mode 100644 index 00000000..bb1deb1e --- /dev/null +++ b/examples_ptx/stencil/stencil.cpp @@ -0,0 +1,184 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifdef _MSC_VER +#define _CRT_SECURE_NO_WARNINGS +#define NOMINMAX +#pragma warning (disable: 4244) +#pragma warning (disable: 4305) +#endif + +#include +#include +#include +#include +#include "../timing.h" +#include "../ispc_malloc.h" +#include "stencil_ispc.h" +using namespace ispc; + + +extern void loop_stencil_serial(int t0, int t1, int x0, int x1, + int y0, int y1, int z0, int z1, + int Nx, int Ny, int Nz, + const double coef[5], + const double vsq[], + double Aeven[], double Aodd[]); + + +void InitData(int Nx, int Ny, int Nz, double *A[2], double *vsq) { + int offset = 0; + for (int z = 0; z < Nz; ++z) + for (int y = 0; y < Ny; ++y) + for (int x = 0; x < Nx; ++x, ++offset) { + A[0][offset] = (x < Nx / 2) ? x / double(Nx) : y / double(Ny); + A[1][offset] = 0; + vsq[offset] = x*y*z / double(Nx * Ny * Nz); + } +} + + +int main(int argc, char *argv[]) { + static unsigned int test_iterations[] = {3, 3, 3};//the last two numbers must be equal here + int Nx = 256, Ny = 256, Nz = 256; + int width = 4; + + if (argc > 1) { + if (strncmp(argv[1], "--scale=", 8) == 0) { + double scale = atof(argv[1] + 8); + Nx *= scale; + Ny *= scale; + Nz *= scale; + } + } + if ((argc == 4) || (argc == 5)) { + for (int i = 0; i < 3; i++) { + test_iterations[i] = atoi(argv[argc - 3 + i]); + } + } + + double *Aserial[2], *Aispc[2]; + Aserial[0] = new double [Nx * Ny * Nz]; + Aserial[1] = new double [Nx * Ny * Nz]; + Aispc[0] = new double [Nx * Ny * Nz]; + Aispc[1] = new double [Nx * Ny * Nz]; + double *vsq = new double [Nx * Ny * Nz]; + double *coeff = new double[4]; + + coeff[0] = 0.5; + coeff[1] = -.25; + coeff[2] = .125; + coeff[3] = -.0625; + + InitData(Nx, Ny, Nz, Aispc, vsq); + // + // Compute the image using the ispc implementation on one core; report + // the minimum time of three runs. + // +#if 0 +//#ifndef _CUDA_ + double minTimeISPC = 1e30; + for (int i = 0; i < test_iterations[0]; ++i) { + reset_and_start_timer(); + loop_stencil_ispc(0, 6, width, Nx - width, width, Ny - width, + width, Nz - width, Nx, Ny, Nz, coeff, vsq, + Aispc[0], Aispc[1]); + double dt = get_elapsed_msec(); + printf("@time of ISPC run:\t\t\t[%.3f] msec\n", dt); + minTimeISPC = std::min(minTimeISPC, dt); + } + + printf("[stencil ispc 1 core]:\t\t[%.3f] msec\n", minTimeISPC); + + InitData(Nx, Ny, Nz, Aispc, vsq); +#endif + + // + // Compute the image using the ispc implementation with tasks; report + // the minimum time of three runs. + // + double minTimeISPCTasks = 1e30; + for (int i = 0; i < test_iterations[1]; ++i) { + reset_and_start_timer(); + loop_stencil_ispc_tasks(0, 6, width, Nx - width, width, Ny - width, + width, Nz - width, Nx, Ny, Nz, coeff, vsq, + Aispc[0], Aispc[1]); + double dt = get_elapsed_msec(); + printf("@time of ISPC + TASKS run:\t\t\t[%.3f] msec\n", dt); + minTimeISPCTasks = std::min(minTimeISPCTasks, dt); + } + + printf("[stencil ispc + tasks]:\t\t[%.3f] msec\n", minTimeISPCTasks); + + InitData(Nx, Ny, Nz, Aserial, vsq); + + // + // And run the serial implementation 3 times, again reporting the + // minimum time. + // + double minTimeSerial = 1e30; + for (int i = 0; i < test_iterations[2]; ++i) { + reset_and_start_timer(); + loop_stencil_serial(0, 6, width, Nx-width, width, Ny - width, + width, Nz - width, Nx, Ny, Nz, coeff, vsq, + Aserial[0], Aserial[1]); + double dt = get_elapsed_msec(); + printf("@time of serial run:\t\t\t[%.3f] msec\n", dt); + minTimeSerial = std::min(minTimeSerial, dt); + } + + printf("[stencil serial]:\t\t[%.3f] msec\n", minTimeSerial); + +#if 0 +//#ifndef _CUDA_ + printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n", + minTimeSerial / minTimeISPC, minTimeSerial / minTimeISPCTasks); +#else + printf("\t\t\t\t(%.2fx speedup from ISPC + tasks)\n", + minTimeSerial / minTimeISPCTasks); +#endif + + // Check for agreement + int offset = 0; + for (int z = 0; z < Nz; ++z) + for (int y = 0; y < Ny; ++y) + for (int x = 0; x < Nx; ++x, ++offset) { + double error = fabsf((Aserial[1][offset] - Aispc[1][offset]) / + Aserial[1][offset]); + if (error > 1e-4) + printf("Error @ (%d,%d,%d): ispc = %f, serial = %f\n", + x, y, z, Aispc[1][offset], Aserial[1][offset]); + } + + return 0; +} diff --git a/examples_ptx/stencil/stencil.cu b/examples_ptx/stencil/stencil.cu new file mode 100644 index 00000000..28e9bbcf --- /dev/null +++ b/examples_ptx/stencil/stencil.cu @@ -0,0 +1,129 @@ +#include "cuda_helpers.cuh" + +__device__ static void +stencil_step( int x0, int x1, + int y0, int y1, + int z0, int z1, + int Nx, int Ny, int Nz, + const double coef[4], const double vsq[], + const double Ain[], double Aout[]) +{ + const int Nxy = Nx * Ny; + + + const double coef0 = coef[0]; + const double coef1 = coef[1]; + const double coef2 = coef[2]; + const double coef3 = coef[3]; + for ( int z = z0; z < z1; z++) + for ( int y = y0 ; y < y1; y++) + for ( int xb = x0; xb < x1; xb += programCount) + { + const int x = xb + programIndex; + + int index = (z * Nxy) + (y * Nx) + x; +#define A_cur(x, y, z) __ldg(&Ain[index + (x) + ((y) * Nx) + ((z) * Nxy)]) +#define A_next(x, y, z) Aout[index + (x) + ((y) * Nx) + ((z) * Nxy)] + double div = + coef0 * A_cur(0, 0, 0) + + coef1 * (A_cur(+1, 0, 0) + A_cur(-1, 0, 0) + + A_cur(0, +1, 0) + A_cur(0, -1, 0) + + A_cur(0, 0, +1) + A_cur(0, 0, -1)) + + coef2 * (A_cur(+2, 0, 0) + A_cur(-2, 0, 0) + + A_cur(0, +2, 0) + A_cur(0, -2, 0) + + A_cur(0, 0, +2) + A_cur(0, 0, -2)) + + coef3 * (A_cur(+3, 0, 0) + A_cur(-3, 0, 0) + + A_cur(0, +3, 0) + A_cur(0, -3, 0) + + A_cur(0, 0, +3) + A_cur(0, 0, -3)); + + if (x < x1) + A_next(0, 0, 0) = 2.0 * A_cur(0, 0, 0) - A_next(0, 0, 0) + + __ldg(&vsq[index]) * div; + } +} + + +#define SPANX 32 +#define SPANY 2 +#define SPANZ 4 + +__global__ void +stencil_step_task( int x0, int x1, + int y0, int y1, + int z0, int z1, + int Nx, int Ny, int Nz, + const double coef[4], const double vsq[], + const double Ain[], double Aout[]) { + if (taskIndex0 >= taskCount0 || + taskIndex1 >= taskCount1 || + taskIndex2 >= taskCount2) + return; + + const int xfirst = x0 + taskIndex0 * SPANX; + const int xlast = min(x1, xfirst + SPANX); + + const int yfirst = y0 + taskIndex1 * SPANY; + const int ylast = min(y1, yfirst + SPANY); + + const int zfirst = z0 + taskIndex2 * SPANZ; + const int zlast = min(z1, zfirst + SPANZ); + + stencil_step(xfirst,xlast, yfirst,ylast, zfirst,zlast, + Nx, Ny, Nz, coef, vsq, Ain, Aout); +} + + + +extern "C" +__global__ void +loop_stencil_ispc_tasks___export( int t0, int t1, + int x0, int x1, + int y0, int y1, + int z0, int z1, + int Nx, int Ny, int Nz, + const double coef[4], + const double vsq[], + double Aeven[], double Aodd[]) +{ +#define NB(x,n) (((x)+(n)-1)/(n)) + + dim3 grid((NB(x1-x0,SPANX)-1)/4+1, NB(y1-y0,SPANY), NB(z1-z0,SPANZ)); + + for ( int t = t0; t < t1; ++t) + { + // Parallelize across cores as well: each task will work on a slice + // of 1 in the z extent of the volume. + if ((t & 1) == 0) + { + if (programIndex == 0) + stencil_step_task<<>>(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, + coef, vsq, Aeven, Aodd); + } + else + { + if (programIndex == 0) + stencil_step_task<<>>(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, + coef, vsq, Aodd, Aeven); + } + + // We need to wait for all of the launched tasks to finish before + // starting the next iteration + cudaDeviceSynchronize(); + } +} + +extern "C" +__host__ void +loop_stencil_ispc_tasks( int t0, int t1, + int x0, int x1, + int y0, int y1, + int z0, int z1, + int Nx, int Ny, int Nz, + const double coef[4], + const double vsq[], + double Aeven[], double Aodd[]) +{ + loop_stencil_ispc_tasks___export<<<1,32>>>(t0,t1,x0,x1,y0,y1,z0,z1,Nx,Ny,Nz,coef,vsq,Aeven,Aodd); + cudaDeviceSynchronize(); +} + diff --git a/examples_ptx/stencil/stencil.ispc b/examples_ptx/stencil/stencil.ispc new file mode 100644 index 00000000..c4746868 --- /dev/null +++ b/examples_ptx/stencil/stencil.ispc @@ -0,0 +1,164 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +static inline void +stencil_step(uniform int x0, uniform int x1, + uniform int y0, uniform int y1, + uniform int z0, uniform int z1, + uniform int Nx, uniform int Ny, uniform int Nz, + uniform const double coef[4], uniform const double vsq[], + uniform const double Ain[], uniform double Aout[]) { + const uniform int Nxy = Nx * Ny; + +#if 0 +#define VER1 +#endif + +#ifdef VER1 + const uniform int x1o = 1; + const uniform int x2o = 2; + const uniform int x3o = 3; + const uniform int y1o = Nx; + const uniform int y2o = Nx*2; + const uniform int y3o = Nx*3; + const uniform int z1o = Nxy; + const uniform int z2o = Nxy*2; + const uniform int z3o = Nxy*3; +#endif + + foreach (z = z0 ... z1, y = y0 ... y1, x = x0 ... x1) + { +#ifdef VER1 + + int index = (z * Nxy) + (y * Nx) + x; +#define A_cur(x, y, z) Ain [index + (x) + (y) + (z)] +#define A_next(x, y, z) Aout[index + (x) + (y) + (z)] + double div = coef[0] * A_cur(0, 0, 0) + + coef[1] * (A_cur(+x1o, 0, 0) + A_cur(-x1o, 0, 0) + + A_cur(0, +y1o, 0) + A_cur(0, -y1o, 0) + + A_cur(0, 0, +z1o) + A_cur(0, 0, -z1o)) + + coef[2] * (A_cur(+x2o, 0, 0) + A_cur(-x2o, 0, 0) + + A_cur(0, +y2o, 0) + A_cur(0, -y2o, 0) + + A_cur(0, 0, +z2o) + A_cur(0, 0, -z2o)) + + coef[3] * (A_cur(+x3o, 0, 0) + A_cur(-x3o, 0, 0) + + A_cur(0, +y3o, 0) + A_cur(0, -y3o, 0) + + A_cur(0, 0, +z3o) + A_cur(0, 0, -z3o)); + A_next(0, 0, 0) = 2.0d0 * A_cur(0, 0, 0) - A_next(0, 0, 0) + + vsq[index] * div; + +#else + + int index = (z * Nxy) + (y * Nx) + x; +#define A_cur(x, y, z) Ain[index + (x) + ((y) * Nx) + ((z) * Nxy)] +#define A_next(x, y, z) Aout[index + (x) + ((y) * Nx) + ((z) * Nxy)] + double div = coef[0] * A_cur(0, 0, 0) + + coef[1] * (A_cur(+1, 0, 0) + A_cur(-1, 0, 0) + + A_cur(0, +1, 0) + A_cur(0, -1, 0) + + A_cur(0, 0, +1) + A_cur(0, 0, -1)) + + coef[2] * (A_cur(+2, 0, 0) + A_cur(-2, 0, 0) + + A_cur(0, +2, 0) + A_cur(0, -2, 0) + + A_cur(0, 0, +2) + A_cur(0, 0, -2)) + + coef[3] * (A_cur(+3, 0, 0) + A_cur(-3, 0, 0) + + A_cur(0, +3, 0) + A_cur(0, -3, 0) + + A_cur(0, 0, +3) + A_cur(0, 0, -3)); + + A_next(0, 0, 0) = 2.0 * A_cur(0, 0, 0) - A_next(0, 0, 0) + + vsq[index] * div; + +#endif + + } +} + +#define SPANX 32 +#define SPANY 2 +#define SPANZ 4 + +static task void +stencil_step_task(uniform int x0, uniform int x1, + uniform int y0, uniform int y1, + uniform int z0, uniform int z1, + uniform int Nx, uniform int Ny, uniform int Nz, + uniform const double coef[4], uniform const double vsq[], + uniform const double Ain[], uniform double Aout[]) { + if (taskIndex0 >= taskCount0 || + taskIndex1 >= taskCount1 || + taskIndex2 >= taskCount2) + return; + + const uniform int xfirst = x0 + taskIndex0 * SPANX; + const uniform int xlast = min(x1, xfirst + SPANX); + + const uniform int yfirst = y0 + taskIndex1 * SPANY; + const uniform int ylast = min(y1, yfirst + SPANY); + + const uniform int zfirst = z0 + taskIndex2 * SPANZ; + const uniform int zlast = min(z1, zfirst + SPANZ); + + stencil_step(xfirst,xlast, yfirst,ylast, zfirst,zlast, + Nx, Ny, Nz, coef, vsq, Ain, Aout); +} + + + +export void +loop_stencil_ispc_tasks(uniform int t0, uniform int t1, + uniform int x0, uniform int x1, + uniform int y0, uniform int y1, + uniform int z0, uniform int z1, + uniform int Nx, uniform int Ny, uniform int Nz, + uniform const double coef[4], + uniform const double vsq[], + uniform double Aeven[], uniform double Aodd[]) +{ +#define NB(x,n) (((x)+(n)-1)/(n)) + + for (uniform int t = t0; t < t1; ++t) + { + // Parallelize across cores as well: each task will work on a slice + // of 1 in the z extent of the volume. + if ((t & 1) == 0) + launch[NB(z1-z0,SPANZ)][NB(y1-y0,SPANY)][NB(x1-x0,SPANX)] + stencil_step_task(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, + coef, vsq, Aeven, Aodd); + else + launch[NB(z1-z0,SPANZ)][NB(y1-y0,SPANY)][NB(x1-x0,SPANX)] + stencil_step_task(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, + coef, vsq, Aodd, Aeven); + + // We need to wait for all of the launched tasks to finish before + // starting the next iteration. + sync; + } +} + diff --git a/examples_ptx/stencil/stencil.orig.ispc b/examples_ptx/stencil/stencil.orig.ispc new file mode 100644 index 00000000..a8b33040 --- /dev/null +++ b/examples_ptx/stencil/stencil.orig.ispc @@ -0,0 +1,122 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + + +static void +stencil_step(uniform int x0, uniform int x1, + uniform int y0, uniform int y1, + uniform int z0, uniform int z1, + uniform int Nx, uniform int Ny, uniform int Nz, + uniform const double coef[4], uniform const double vsq[], + uniform const double Ain[], uniform double Aout[]) { + const uniform int Nxy = Nx * Ny; + + foreach (z = z0 ... z1, y = y0 ... y1, x = x0 ... x1) { + int index = (z * Nxy) + (y * Nx) + x; +#define A_cur(x, y, z) Ain[index + (x) + ((y) * Nx) + ((z) * Nxy)] +#define A_next(x, y, z) Aout[index + (x) + ((y) * Nx) + ((z) * Nxy)] + double div = coef[0] * A_cur(0, 0, 0) + + coef[1] * (A_cur(+1, 0, 0) + A_cur(-1, 0, 0) + + A_cur(0, +1, 0) + A_cur(0, -1, 0) + + A_cur(0, 0, +1) + A_cur(0, 0, -1)) + + coef[2] * (A_cur(+2, 0, 0) + A_cur(-2, 0, 0) + + A_cur(0, +2, 0) + A_cur(0, -2, 0) + + A_cur(0, 0, +2) + A_cur(0, 0, -2)) + + coef[3] * (A_cur(+3, 0, 0) + A_cur(-3, 0, 0) + + A_cur(0, +3, 0) + A_cur(0, -3, 0) + + A_cur(0, 0, +3) + A_cur(0, 0, -3)); + + A_next(0, 0, 0) = 2.0d0 * A_cur(0, 0, 0) - A_next(0, 0, 0) + + vsq[index] * div; + } +} + + +static task void +stencil_step_task(uniform int x0, uniform int x1, + uniform int y0, uniform int y1, + uniform int z0, + uniform int Nx, uniform int Ny, uniform int Nz, + uniform const double coef[4], uniform const double vsq[], + uniform const double Ain[], uniform double Aout[]) { + stencil_step(x0, x1, y0, y1, z0+taskIndex, z0+taskIndex+1, + Nx, Ny, Nz, coef, vsq, Ain, Aout); +} + + +export void +loop_stencil_ispc_tasks(uniform int t0, uniform int t1, + uniform int x0, uniform int x1, + uniform int y0, uniform int y1, + uniform int z0, uniform int z1, + uniform int Nx, uniform int Ny, uniform int Nz, + uniform const double coef[4], + uniform const double vsq[], + uniform double Aeven[], uniform double Aodd[]) +{ + for (uniform int t = t0; t < t1; ++t) { + // Parallelize across cores as well: each task will work on a slice + // of 1 in the z extent of the volume. + if ((t & 1) == 0) + launch[z1-z0] stencil_step_task(x0, x1, y0, y1, z0, Nx, Ny, Nz, + coef, vsq, Aeven, Aodd); + else + launch[z1-z0] stencil_step_task(x0, x1, y0, y1, z0, Nx, Ny, Nz, + coef, vsq, Aodd, Aeven); + + // We need to wait for all of the launched tasks to finish before + // starting the next iteration. + sync; + } +} + + +export void +loop_stencil_ispc(uniform int t0, uniform int t1, + uniform int x0, uniform int x1, + uniform int y0, uniform int y1, + uniform int z0, uniform int z1, + uniform int Nx, uniform int Ny, uniform int Nz, + uniform const double coef[4], + uniform const double vsq[], + uniform double Aeven[], uniform double Aodd[]) +{ + for (uniform int t = t0; t < t1; ++t) { + if ((t & 1) == 0) + stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq, + Aeven, Aodd); + else + stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq, + Aodd, Aeven); + } +} diff --git a/examples_ptx/stencil/stencil.vcxproj b/examples_ptx/stencil/stencil.vcxproj new file mode 100644 index 00000000..fd8564aa --- /dev/null +++ b/examples_ptx/stencil/stencil.vcxproj @@ -0,0 +1,34 @@ + + + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + + + {2ef070a1-f62f-4e6a-944b-88d140945c3c} + Win32Proj + rt + stencil + sse2,sse4-x2,avx1-i32x8 + + + + + + + + diff --git a/examples_ptx/stencil/stencil_serial.cpp b/examples_ptx/stencil/stencil_serial.cpp new file mode 100644 index 00000000..df3f38da --- /dev/null +++ b/examples_ptx/stencil/stencil_serial.cpp @@ -0,0 +1,86 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + + +static void +stencil_step(int x0, int x1, + int y0, int y1, + int z0, int z1, + int Nx, int Ny, int Nz, + const double coef[4], const double vsq[], + const double Ain[], double Aout[]) { + int Nxy = Nx * Ny; + + for (int z = z0; z < z1; ++z) { + for (int y = y0; y < y1; ++y) { + for (int x = x0; x < x1; ++x) { + int index = (z * Nxy) + (y * Nx) + x; +#define A_cur(x, y, z) Ain[index + (x) + ((y) * Nx) + ((z) * Nxy)] +#define A_next(x, y, z) Aout[index + (x) + ((y) * Nx) + ((z) * Nxy)] + double div = coef[0] * A_cur(0, 0, 0) + + coef[1] * (A_cur(+1, 0, 0) + A_cur(-1, 0, 0) + + A_cur(0, +1, 0) + A_cur(0, -1, 0) + + A_cur(0, 0, +1) + A_cur(0, 0, -1)) + + coef[2] * (A_cur(+2, 0, 0) + A_cur(-2, 0, 0) + + A_cur(0, +2, 0) + A_cur(0, -2, 0) + + A_cur(0, 0, +2) + A_cur(0, 0, -2)) + + coef[3] * (A_cur(+3, 0, 0) + A_cur(-3, 0, 0) + + A_cur(0, +3, 0) + A_cur(0, -3, 0) + + A_cur(0, 0, +3) + A_cur(0, 0, -3)); + + A_next(0, 0, 0) = 2 * A_cur(0, 0, 0) - A_next(0, 0, 0) + + vsq[index] * div; + } + } + } +} + + +void loop_stencil_serial(int t0, int t1, + int x0, int x1, + int y0, int y1, + int z0, int z1, + int Nx, int Ny, int Nz, + const double coef[4], + const double vsq[], + double Aeven[], double Aodd[]) +{ + for (int t = t0; t < t1; ++t) { + if ((t & 1) == 0) + stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq, + Aeven, Aodd); + else + stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq, + Aodd, Aeven); + } +}