added stencil

This commit is contained in:
Evghenii
2014-01-05 17:24:38 +01:00
parent 07049d6c76
commit 6e2d59e279
10 changed files with 747 additions and 0 deletions

2
examples_ptx/stencil/.gitignore vendored Normal file
View File

@@ -0,0 +1,2 @@
stencil
objs

View File

@@ -0,0 +1,8 @@
EXAMPLE=stencil
CPP_SRC=stencil.cpp stencil_serial.cpp
ISPC_SRC=stencil.ispc
ISPC_IA_TARGETS=sse2-i32x4,sse4-i32x8,avx1-i32x16,avx2-i32x16
ISPC_ARM_TARGETS=neon
include ../common.mk

View File

@@ -0,0 +1,8 @@
EXAMPLE=stencil
CPP_SRC=stencil.cpp stencil_serial.cpp
ISPC_SRC=stencil.ispc
ISPC_IA_TARGETS=avx1-i32x16
ISPC_ARM_TARGETS=neon
include ../common.mk

View File

@@ -0,0 +1,10 @@
PROG=stencil
ISPC_SRC=stencil.ispc
CU_SRC=stencil.cu
CXX_SRC=stencil.cpp stencil_serial.cpp
PTXCC_REGMAX=128
include ../common_gpu.mk

View File

@@ -0,0 +1,184 @@
/*
Copyright (c) 2010-2011, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef _MSC_VER
#define _CRT_SECURE_NO_WARNINGS
#define NOMINMAX
#pragma warning (disable: 4244)
#pragma warning (disable: 4305)
#endif
#include <cstdio>
#include <algorithm>
#include <cstring>
#include <math.h>
#include "../timing.h"
#include "../ispc_malloc.h"
#include "stencil_ispc.h"
using namespace ispc;
extern void loop_stencil_serial(int t0, int t1, int x0, int x1,
int y0, int y1, int z0, int z1,
int Nx, int Ny, int Nz,
const double coef[5],
const double vsq[],
double Aeven[], double Aodd[]);
void InitData(int Nx, int Ny, int Nz, double *A[2], double *vsq) {
int offset = 0;
for (int z = 0; z < Nz; ++z)
for (int y = 0; y < Ny; ++y)
for (int x = 0; x < Nx; ++x, ++offset) {
A[0][offset] = (x < Nx / 2) ? x / double(Nx) : y / double(Ny);
A[1][offset] = 0;
vsq[offset] = x*y*z / double(Nx * Ny * Nz);
}
}
int main(int argc, char *argv[]) {
static unsigned int test_iterations[] = {3, 3, 3};//the last two numbers must be equal here
int Nx = 256, Ny = 256, Nz = 256;
int width = 4;
if (argc > 1) {
if (strncmp(argv[1], "--scale=", 8) == 0) {
double scale = atof(argv[1] + 8);
Nx *= scale;
Ny *= scale;
Nz *= scale;
}
}
if ((argc == 4) || (argc == 5)) {
for (int i = 0; i < 3; i++) {
test_iterations[i] = atoi(argv[argc - 3 + i]);
}
}
double *Aserial[2], *Aispc[2];
Aserial[0] = new double [Nx * Ny * Nz];
Aserial[1] = new double [Nx * Ny * Nz];
Aispc[0] = new double [Nx * Ny * Nz];
Aispc[1] = new double [Nx * Ny * Nz];
double *vsq = new double [Nx * Ny * Nz];
double *coeff = new double[4];
coeff[0] = 0.5;
coeff[1] = -.25;
coeff[2] = .125;
coeff[3] = -.0625;
InitData(Nx, Ny, Nz, Aispc, vsq);
//
// Compute the image using the ispc implementation on one core; report
// the minimum time of three runs.
//
#if 0
//#ifndef _CUDA_
double minTimeISPC = 1e30;
for (int i = 0; i < test_iterations[0]; ++i) {
reset_and_start_timer();
loop_stencil_ispc(0, 6, width, Nx - width, width, Ny - width,
width, Nz - width, Nx, Ny, Nz, coeff, vsq,
Aispc[0], Aispc[1]);
double dt = get_elapsed_msec();
printf("@time of ISPC run:\t\t\t[%.3f] msec\n", dt);
minTimeISPC = std::min(minTimeISPC, dt);
}
printf("[stencil ispc 1 core]:\t\t[%.3f] msec\n", minTimeISPC);
InitData(Nx, Ny, Nz, Aispc, vsq);
#endif
//
// Compute the image using the ispc implementation with tasks; report
// the minimum time of three runs.
//
double minTimeISPCTasks = 1e30;
for (int i = 0; i < test_iterations[1]; ++i) {
reset_and_start_timer();
loop_stencil_ispc_tasks(0, 6, width, Nx - width, width, Ny - width,
width, Nz - width, Nx, Ny, Nz, coeff, vsq,
Aispc[0], Aispc[1]);
double dt = get_elapsed_msec();
printf("@time of ISPC + TASKS run:\t\t\t[%.3f] msec\n", dt);
minTimeISPCTasks = std::min(minTimeISPCTasks, dt);
}
printf("[stencil ispc + tasks]:\t\t[%.3f] msec\n", minTimeISPCTasks);
InitData(Nx, Ny, Nz, Aserial, vsq);
//
// And run the serial implementation 3 times, again reporting the
// minimum time.
//
double minTimeSerial = 1e30;
for (int i = 0; i < test_iterations[2]; ++i) {
reset_and_start_timer();
loop_stencil_serial(0, 6, width, Nx-width, width, Ny - width,
width, Nz - width, Nx, Ny, Nz, coeff, vsq,
Aserial[0], Aserial[1]);
double dt = get_elapsed_msec();
printf("@time of serial run:\t\t\t[%.3f] msec\n", dt);
minTimeSerial = std::min(minTimeSerial, dt);
}
printf("[stencil serial]:\t\t[%.3f] msec\n", minTimeSerial);
#if 0
//#ifndef _CUDA_
printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n",
minTimeSerial / minTimeISPC, minTimeSerial / minTimeISPCTasks);
#else
printf("\t\t\t\t(%.2fx speedup from ISPC + tasks)\n",
minTimeSerial / minTimeISPCTasks);
#endif
// Check for agreement
int offset = 0;
for (int z = 0; z < Nz; ++z)
for (int y = 0; y < Ny; ++y)
for (int x = 0; x < Nx; ++x, ++offset) {
double error = fabsf((Aserial[1][offset] - Aispc[1][offset]) /
Aserial[1][offset]);
if (error > 1e-4)
printf("Error @ (%d,%d,%d): ispc = %f, serial = %f\n",
x, y, z, Aispc[1][offset], Aserial[1][offset]);
}
return 0;
}

View File

@@ -0,0 +1,129 @@
#include "cuda_helpers.cuh"
__device__ static void
stencil_step( int x0, int x1,
int y0, int y1,
int z0, int z1,
int Nx, int Ny, int Nz,
const double coef[4], const double vsq[],
const double Ain[], double Aout[])
{
const int Nxy = Nx * Ny;
const double coef0 = coef[0];
const double coef1 = coef[1];
const double coef2 = coef[2];
const double coef3 = coef[3];
for ( int z = z0; z < z1; z++)
for ( int y = y0 ; y < y1; y++)
for ( int xb = x0; xb < x1; xb += programCount)
{
const int x = xb + programIndex;
int index = (z * Nxy) + (y * Nx) + x;
#define A_cur(x, y, z) __ldg(&Ain[index + (x) + ((y) * Nx) + ((z) * Nxy)])
#define A_next(x, y, z) Aout[index + (x) + ((y) * Nx) + ((z) * Nxy)]
double div =
coef0 * A_cur(0, 0, 0) +
coef1 * (A_cur(+1, 0, 0) + A_cur(-1, 0, 0) +
A_cur(0, +1, 0) + A_cur(0, -1, 0) +
A_cur(0, 0, +1) + A_cur(0, 0, -1)) +
coef2 * (A_cur(+2, 0, 0) + A_cur(-2, 0, 0) +
A_cur(0, +2, 0) + A_cur(0, -2, 0) +
A_cur(0, 0, +2) + A_cur(0, 0, -2)) +
coef3 * (A_cur(+3, 0, 0) + A_cur(-3, 0, 0) +
A_cur(0, +3, 0) + A_cur(0, -3, 0) +
A_cur(0, 0, +3) + A_cur(0, 0, -3));
if (x < x1)
A_next(0, 0, 0) = 2.0 * A_cur(0, 0, 0) - A_next(0, 0, 0) +
__ldg(&vsq[index]) * div;
}
}
#define SPANX 32
#define SPANY 2
#define SPANZ 4
__global__ void
stencil_step_task( int x0, int x1,
int y0, int y1,
int z0, int z1,
int Nx, int Ny, int Nz,
const double coef[4], const double vsq[],
const double Ain[], double Aout[]) {
if (taskIndex0 >= taskCount0 ||
taskIndex1 >= taskCount1 ||
taskIndex2 >= taskCount2)
return;
const int xfirst = x0 + taskIndex0 * SPANX;
const int xlast = min(x1, xfirst + SPANX);
const int yfirst = y0 + taskIndex1 * SPANY;
const int ylast = min(y1, yfirst + SPANY);
const int zfirst = z0 + taskIndex2 * SPANZ;
const int zlast = min(z1, zfirst + SPANZ);
stencil_step(xfirst,xlast, yfirst,ylast, zfirst,zlast,
Nx, Ny, Nz, coef, vsq, Ain, Aout);
}
extern "C"
__global__ void
loop_stencil_ispc_tasks___export( int t0, int t1,
int x0, int x1,
int y0, int y1,
int z0, int z1,
int Nx, int Ny, int Nz,
const double coef[4],
const double vsq[],
double Aeven[], double Aodd[])
{
#define NB(x,n) (((x)+(n)-1)/(n))
dim3 grid((NB(x1-x0,SPANX)-1)/4+1, NB(y1-y0,SPANY), NB(z1-z0,SPANZ));
for ( int t = t0; t < t1; ++t)
{
// Parallelize across cores as well: each task will work on a slice
// of 1 in the z extent of the volume.
if ((t & 1) == 0)
{
if (programIndex == 0)
stencil_step_task<<<grid,128>>>(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz,
coef, vsq, Aeven, Aodd);
}
else
{
if (programIndex == 0)
stencil_step_task<<<grid,128>>>(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz,
coef, vsq, Aodd, Aeven);
}
// We need to wait for all of the launched tasks to finish before
// starting the next iteration
cudaDeviceSynchronize();
}
}
extern "C"
__host__ void
loop_stencil_ispc_tasks( int t0, int t1,
int x0, int x1,
int y0, int y1,
int z0, int z1,
int Nx, int Ny, int Nz,
const double coef[4],
const double vsq[],
double Aeven[], double Aodd[])
{
loop_stencil_ispc_tasks___export<<<1,32>>>(t0,t1,x0,x1,y0,y1,z0,z1,Nx,Ny,Nz,coef,vsq,Aeven,Aodd);
cudaDeviceSynchronize();
}

View File

@@ -0,0 +1,164 @@
/*
Copyright (c) 2010-2011, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
static inline void
stencil_step(uniform int x0, uniform int x1,
uniform int y0, uniform int y1,
uniform int z0, uniform int z1,
uniform int Nx, uniform int Ny, uniform int Nz,
uniform const double coef[4], uniform const double vsq[],
uniform const double Ain[], uniform double Aout[]) {
const uniform int Nxy = Nx * Ny;
#if 0
#define VER1
#endif
#ifdef VER1
const uniform int x1o = 1;
const uniform int x2o = 2;
const uniform int x3o = 3;
const uniform int y1o = Nx;
const uniform int y2o = Nx*2;
const uniform int y3o = Nx*3;
const uniform int z1o = Nxy;
const uniform int z2o = Nxy*2;
const uniform int z3o = Nxy*3;
#endif
foreach (z = z0 ... z1, y = y0 ... y1, x = x0 ... x1)
{
#ifdef VER1
int index = (z * Nxy) + (y * Nx) + x;
#define A_cur(x, y, z) Ain [index + (x) + (y) + (z)]
#define A_next(x, y, z) Aout[index + (x) + (y) + (z)]
double div = coef[0] * A_cur(0, 0, 0) +
coef[1] * (A_cur(+x1o, 0, 0) + A_cur(-x1o, 0, 0) +
A_cur(0, +y1o, 0) + A_cur(0, -y1o, 0) +
A_cur(0, 0, +z1o) + A_cur(0, 0, -z1o)) +
coef[2] * (A_cur(+x2o, 0, 0) + A_cur(-x2o, 0, 0) +
A_cur(0, +y2o, 0) + A_cur(0, -y2o, 0) +
A_cur(0, 0, +z2o) + A_cur(0, 0, -z2o)) +
coef[3] * (A_cur(+x3o, 0, 0) + A_cur(-x3o, 0, 0) +
A_cur(0, +y3o, 0) + A_cur(0, -y3o, 0) +
A_cur(0, 0, +z3o) + A_cur(0, 0, -z3o));
A_next(0, 0, 0) = 2.0d0 * A_cur(0, 0, 0) - A_next(0, 0, 0) +
vsq[index] * div;
#else
int index = (z * Nxy) + (y * Nx) + x;
#define A_cur(x, y, z) Ain[index + (x) + ((y) * Nx) + ((z) * Nxy)]
#define A_next(x, y, z) Aout[index + (x) + ((y) * Nx) + ((z) * Nxy)]
double div = coef[0] * A_cur(0, 0, 0) +
coef[1] * (A_cur(+1, 0, 0) + A_cur(-1, 0, 0) +
A_cur(0, +1, 0) + A_cur(0, -1, 0) +
A_cur(0, 0, +1) + A_cur(0, 0, -1)) +
coef[2] * (A_cur(+2, 0, 0) + A_cur(-2, 0, 0) +
A_cur(0, +2, 0) + A_cur(0, -2, 0) +
A_cur(0, 0, +2) + A_cur(0, 0, -2)) +
coef[3] * (A_cur(+3, 0, 0) + A_cur(-3, 0, 0) +
A_cur(0, +3, 0) + A_cur(0, -3, 0) +
A_cur(0, 0, +3) + A_cur(0, 0, -3));
A_next(0, 0, 0) = 2.0 * A_cur(0, 0, 0) - A_next(0, 0, 0) +
vsq[index] * div;
#endif
}
}
#define SPANX 32
#define SPANY 2
#define SPANZ 4
static task void
stencil_step_task(uniform int x0, uniform int x1,
uniform int y0, uniform int y1,
uniform int z0, uniform int z1,
uniform int Nx, uniform int Ny, uniform int Nz,
uniform const double coef[4], uniform const double vsq[],
uniform const double Ain[], uniform double Aout[]) {
if (taskIndex0 >= taskCount0 ||
taskIndex1 >= taskCount1 ||
taskIndex2 >= taskCount2)
return;
const uniform int xfirst = x0 + taskIndex0 * SPANX;
const uniform int xlast = min(x1, xfirst + SPANX);
const uniform int yfirst = y0 + taskIndex1 * SPANY;
const uniform int ylast = min(y1, yfirst + SPANY);
const uniform int zfirst = z0 + taskIndex2 * SPANZ;
const uniform int zlast = min(z1, zfirst + SPANZ);
stencil_step(xfirst,xlast, yfirst,ylast, zfirst,zlast,
Nx, Ny, Nz, coef, vsq, Ain, Aout);
}
export void
loop_stencil_ispc_tasks(uniform int t0, uniform int t1,
uniform int x0, uniform int x1,
uniform int y0, uniform int y1,
uniform int z0, uniform int z1,
uniform int Nx, uniform int Ny, uniform int Nz,
uniform const double coef[4],
uniform const double vsq[],
uniform double Aeven[], uniform double Aodd[])
{
#define NB(x,n) (((x)+(n)-1)/(n))
for (uniform int t = t0; t < t1; ++t)
{
// Parallelize across cores as well: each task will work on a slice
// of 1 in the z extent of the volume.
if ((t & 1) == 0)
launch[NB(z1-z0,SPANZ)][NB(y1-y0,SPANY)][NB(x1-x0,SPANX)]
stencil_step_task(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz,
coef, vsq, Aeven, Aodd);
else
launch[NB(z1-z0,SPANZ)][NB(y1-y0,SPANY)][NB(x1-x0,SPANX)]
stencil_step_task(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz,
coef, vsq, Aodd, Aeven);
// We need to wait for all of the launched tasks to finish before
// starting the next iteration.
sync;
}
}

View File

@@ -0,0 +1,122 @@
/*
Copyright (c) 2010-2011, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
static void
stencil_step(uniform int x0, uniform int x1,
uniform int y0, uniform int y1,
uniform int z0, uniform int z1,
uniform int Nx, uniform int Ny, uniform int Nz,
uniform const double coef[4], uniform const double vsq[],
uniform const double Ain[], uniform double Aout[]) {
const uniform int Nxy = Nx * Ny;
foreach (z = z0 ... z1, y = y0 ... y1, x = x0 ... x1) {
int index = (z * Nxy) + (y * Nx) + x;
#define A_cur(x, y, z) Ain[index + (x) + ((y) * Nx) + ((z) * Nxy)]
#define A_next(x, y, z) Aout[index + (x) + ((y) * Nx) + ((z) * Nxy)]
double div = coef[0] * A_cur(0, 0, 0) +
coef[1] * (A_cur(+1, 0, 0) + A_cur(-1, 0, 0) +
A_cur(0, +1, 0) + A_cur(0, -1, 0) +
A_cur(0, 0, +1) + A_cur(0, 0, -1)) +
coef[2] * (A_cur(+2, 0, 0) + A_cur(-2, 0, 0) +
A_cur(0, +2, 0) + A_cur(0, -2, 0) +
A_cur(0, 0, +2) + A_cur(0, 0, -2)) +
coef[3] * (A_cur(+3, 0, 0) + A_cur(-3, 0, 0) +
A_cur(0, +3, 0) + A_cur(0, -3, 0) +
A_cur(0, 0, +3) + A_cur(0, 0, -3));
A_next(0, 0, 0) = 2.0d0 * A_cur(0, 0, 0) - A_next(0, 0, 0) +
vsq[index] * div;
}
}
static task void
stencil_step_task(uniform int x0, uniform int x1,
uniform int y0, uniform int y1,
uniform int z0,
uniform int Nx, uniform int Ny, uniform int Nz,
uniform const double coef[4], uniform const double vsq[],
uniform const double Ain[], uniform double Aout[]) {
stencil_step(x0, x1, y0, y1, z0+taskIndex, z0+taskIndex+1,
Nx, Ny, Nz, coef, vsq, Ain, Aout);
}
export void
loop_stencil_ispc_tasks(uniform int t0, uniform int t1,
uniform int x0, uniform int x1,
uniform int y0, uniform int y1,
uniform int z0, uniform int z1,
uniform int Nx, uniform int Ny, uniform int Nz,
uniform const double coef[4],
uniform const double vsq[],
uniform double Aeven[], uniform double Aodd[])
{
for (uniform int t = t0; t < t1; ++t) {
// Parallelize across cores as well: each task will work on a slice
// of 1 in the z extent of the volume.
if ((t & 1) == 0)
launch[z1-z0] stencil_step_task(x0, x1, y0, y1, z0, Nx, Ny, Nz,
coef, vsq, Aeven, Aodd);
else
launch[z1-z0] stencil_step_task(x0, x1, y0, y1, z0, Nx, Ny, Nz,
coef, vsq, Aodd, Aeven);
// We need to wait for all of the launched tasks to finish before
// starting the next iteration.
sync;
}
}
export void
loop_stencil_ispc(uniform int t0, uniform int t1,
uniform int x0, uniform int x1,
uniform int y0, uniform int y1,
uniform int z0, uniform int z1,
uniform int Nx, uniform int Ny, uniform int Nz,
uniform const double coef[4],
uniform const double vsq[],
uniform double Aeven[], uniform double Aodd[])
{
for (uniform int t = t0; t < t1; ++t) {
if ((t & 1) == 0)
stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq,
Aeven, Aodd);
else
stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq,
Aodd, Aeven);
}
}

View File

@@ -0,0 +1,34 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|Win32">
<Configuration>Debug</Configuration>
<Platform>Win32</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|Win32">
<Configuration>Release</Configuration>
<Platform>Win32</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{2ef070a1-f62f-4e6a-944b-88d140945c3c}</ProjectGuid>
<Keyword>Win32Proj</Keyword>
<RootNamespace>rt</RootNamespace>
<ISPC_file>stencil</ISPC_file>
<default_targets>sse2,sse4-x2,avx1-i32x8</default_targets>
</PropertyGroup>
<Import Project="..\common.props" />
<ItemGroup>
<ClCompile Include="stencil.cpp" />
<ClCompile Include="stencil_serial.cpp" />
<ClCompile Include="../tasksys.cpp" />
</ItemGroup>
</Project>

View File

@@ -0,0 +1,86 @@
/*
Copyright (c) 2010-2011, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
static void
stencil_step(int x0, int x1,
int y0, int y1,
int z0, int z1,
int Nx, int Ny, int Nz,
const double coef[4], const double vsq[],
const double Ain[], double Aout[]) {
int Nxy = Nx * Ny;
for (int z = z0; z < z1; ++z) {
for (int y = y0; y < y1; ++y) {
for (int x = x0; x < x1; ++x) {
int index = (z * Nxy) + (y * Nx) + x;
#define A_cur(x, y, z) Ain[index + (x) + ((y) * Nx) + ((z) * Nxy)]
#define A_next(x, y, z) Aout[index + (x) + ((y) * Nx) + ((z) * Nxy)]
double div = coef[0] * A_cur(0, 0, 0) +
coef[1] * (A_cur(+1, 0, 0) + A_cur(-1, 0, 0) +
A_cur(0, +1, 0) + A_cur(0, -1, 0) +
A_cur(0, 0, +1) + A_cur(0, 0, -1)) +
coef[2] * (A_cur(+2, 0, 0) + A_cur(-2, 0, 0) +
A_cur(0, +2, 0) + A_cur(0, -2, 0) +
A_cur(0, 0, +2) + A_cur(0, 0, -2)) +
coef[3] * (A_cur(+3, 0, 0) + A_cur(-3, 0, 0) +
A_cur(0, +3, 0) + A_cur(0, -3, 0) +
A_cur(0, 0, +3) + A_cur(0, 0, -3));
A_next(0, 0, 0) = 2 * A_cur(0, 0, 0) - A_next(0, 0, 0) +
vsq[index] * div;
}
}
}
}
void loop_stencil_serial(int t0, int t1,
int x0, int x1,
int y0, int y1,
int z0, int z1,
int Nx, int Ny, int Nz,
const double coef[4],
const double vsq[],
double Aeven[], double Aodd[])
{
for (int t = t0; t < t1; ++t) {
if ((t & 1) == 0)
stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq,
Aeven, Aodd);
else
stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq,
Aodd, Aeven);
}
}