added stencil
This commit is contained in:
2
examples_ptx/stencil/.gitignore
vendored
Normal file
2
examples_ptx/stencil/.gitignore
vendored
Normal file
@@ -0,0 +1,2 @@
|
|||||||
|
stencil
|
||||||
|
objs
|
||||||
8
examples_ptx/stencil/Makefile
Normal file
8
examples_ptx/stencil/Makefile
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
|
||||||
|
EXAMPLE=stencil
|
||||||
|
CPP_SRC=stencil.cpp stencil_serial.cpp
|
||||||
|
ISPC_SRC=stencil.ispc
|
||||||
|
ISPC_IA_TARGETS=sse2-i32x4,sse4-i32x8,avx1-i32x16,avx2-i32x16
|
||||||
|
ISPC_ARM_TARGETS=neon
|
||||||
|
|
||||||
|
include ../common.mk
|
||||||
8
examples_ptx/stencil/Makefile_cpu
Normal file
8
examples_ptx/stencil/Makefile_cpu
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
|
||||||
|
EXAMPLE=stencil
|
||||||
|
CPP_SRC=stencil.cpp stencil_serial.cpp
|
||||||
|
ISPC_SRC=stencil.ispc
|
||||||
|
ISPC_IA_TARGETS=avx1-i32x16
|
||||||
|
ISPC_ARM_TARGETS=neon
|
||||||
|
|
||||||
|
include ../common.mk
|
||||||
10
examples_ptx/stencil/Makefile_gpu
Normal file
10
examples_ptx/stencil/Makefile_gpu
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
PROG=stencil
|
||||||
|
ISPC_SRC=stencil.ispc
|
||||||
|
CU_SRC=stencil.cu
|
||||||
|
CXX_SRC=stencil.cpp stencil_serial.cpp
|
||||||
|
PTXCC_REGMAX=128
|
||||||
|
|
||||||
|
include ../common_gpu.mk
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
184
examples_ptx/stencil/stencil.cpp
Normal file
184
examples_ptx/stencil/stencil.cpp
Normal file
@@ -0,0 +1,184 @@
|
|||||||
|
/*
|
||||||
|
Copyright (c) 2010-2011, Intel Corporation
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
|
||||||
|
* Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
|
||||||
|
* Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
|
||||||
|
* Neither the name of Intel Corporation nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived from
|
||||||
|
this software without specific prior written permission.
|
||||||
|
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||||
|
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||||
|
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||||
|
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||||
|
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||||
|
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||||
|
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||||
|
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||||
|
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||||
|
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
|
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifdef _MSC_VER
|
||||||
|
#define _CRT_SECURE_NO_WARNINGS
|
||||||
|
#define NOMINMAX
|
||||||
|
#pragma warning (disable: 4244)
|
||||||
|
#pragma warning (disable: 4305)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include <cstdio>
|
||||||
|
#include <algorithm>
|
||||||
|
#include <cstring>
|
||||||
|
#include <math.h>
|
||||||
|
#include "../timing.h"
|
||||||
|
#include "../ispc_malloc.h"
|
||||||
|
#include "stencil_ispc.h"
|
||||||
|
using namespace ispc;
|
||||||
|
|
||||||
|
|
||||||
|
extern void loop_stencil_serial(int t0, int t1, int x0, int x1,
|
||||||
|
int y0, int y1, int z0, int z1,
|
||||||
|
int Nx, int Ny, int Nz,
|
||||||
|
const double coef[5],
|
||||||
|
const double vsq[],
|
||||||
|
double Aeven[], double Aodd[]);
|
||||||
|
|
||||||
|
|
||||||
|
void InitData(int Nx, int Ny, int Nz, double *A[2], double *vsq) {
|
||||||
|
int offset = 0;
|
||||||
|
for (int z = 0; z < Nz; ++z)
|
||||||
|
for (int y = 0; y < Ny; ++y)
|
||||||
|
for (int x = 0; x < Nx; ++x, ++offset) {
|
||||||
|
A[0][offset] = (x < Nx / 2) ? x / double(Nx) : y / double(Ny);
|
||||||
|
A[1][offset] = 0;
|
||||||
|
vsq[offset] = x*y*z / double(Nx * Ny * Nz);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
int main(int argc, char *argv[]) {
|
||||||
|
static unsigned int test_iterations[] = {3, 3, 3};//the last two numbers must be equal here
|
||||||
|
int Nx = 256, Ny = 256, Nz = 256;
|
||||||
|
int width = 4;
|
||||||
|
|
||||||
|
if (argc > 1) {
|
||||||
|
if (strncmp(argv[1], "--scale=", 8) == 0) {
|
||||||
|
double scale = atof(argv[1] + 8);
|
||||||
|
Nx *= scale;
|
||||||
|
Ny *= scale;
|
||||||
|
Nz *= scale;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if ((argc == 4) || (argc == 5)) {
|
||||||
|
for (int i = 0; i < 3; i++) {
|
||||||
|
test_iterations[i] = atoi(argv[argc - 3 + i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
double *Aserial[2], *Aispc[2];
|
||||||
|
Aserial[0] = new double [Nx * Ny * Nz];
|
||||||
|
Aserial[1] = new double [Nx * Ny * Nz];
|
||||||
|
Aispc[0] = new double [Nx * Ny * Nz];
|
||||||
|
Aispc[1] = new double [Nx * Ny * Nz];
|
||||||
|
double *vsq = new double [Nx * Ny * Nz];
|
||||||
|
double *coeff = new double[4];
|
||||||
|
|
||||||
|
coeff[0] = 0.5;
|
||||||
|
coeff[1] = -.25;
|
||||||
|
coeff[2] = .125;
|
||||||
|
coeff[3] = -.0625;
|
||||||
|
|
||||||
|
InitData(Nx, Ny, Nz, Aispc, vsq);
|
||||||
|
//
|
||||||
|
// Compute the image using the ispc implementation on one core; report
|
||||||
|
// the minimum time of three runs.
|
||||||
|
//
|
||||||
|
#if 0
|
||||||
|
//#ifndef _CUDA_
|
||||||
|
double minTimeISPC = 1e30;
|
||||||
|
for (int i = 0; i < test_iterations[0]; ++i) {
|
||||||
|
reset_and_start_timer();
|
||||||
|
loop_stencil_ispc(0, 6, width, Nx - width, width, Ny - width,
|
||||||
|
width, Nz - width, Nx, Ny, Nz, coeff, vsq,
|
||||||
|
Aispc[0], Aispc[1]);
|
||||||
|
double dt = get_elapsed_msec();
|
||||||
|
printf("@time of ISPC run:\t\t\t[%.3f] msec\n", dt);
|
||||||
|
minTimeISPC = std::min(minTimeISPC, dt);
|
||||||
|
}
|
||||||
|
|
||||||
|
printf("[stencil ispc 1 core]:\t\t[%.3f] msec\n", minTimeISPC);
|
||||||
|
|
||||||
|
InitData(Nx, Ny, Nz, Aispc, vsq);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
//
|
||||||
|
// Compute the image using the ispc implementation with tasks; report
|
||||||
|
// the minimum time of three runs.
|
||||||
|
//
|
||||||
|
double minTimeISPCTasks = 1e30;
|
||||||
|
for (int i = 0; i < test_iterations[1]; ++i) {
|
||||||
|
reset_and_start_timer();
|
||||||
|
loop_stencil_ispc_tasks(0, 6, width, Nx - width, width, Ny - width,
|
||||||
|
width, Nz - width, Nx, Ny, Nz, coeff, vsq,
|
||||||
|
Aispc[0], Aispc[1]);
|
||||||
|
double dt = get_elapsed_msec();
|
||||||
|
printf("@time of ISPC + TASKS run:\t\t\t[%.3f] msec\n", dt);
|
||||||
|
minTimeISPCTasks = std::min(minTimeISPCTasks, dt);
|
||||||
|
}
|
||||||
|
|
||||||
|
printf("[stencil ispc + tasks]:\t\t[%.3f] msec\n", minTimeISPCTasks);
|
||||||
|
|
||||||
|
InitData(Nx, Ny, Nz, Aserial, vsq);
|
||||||
|
|
||||||
|
//
|
||||||
|
// And run the serial implementation 3 times, again reporting the
|
||||||
|
// minimum time.
|
||||||
|
//
|
||||||
|
double minTimeSerial = 1e30;
|
||||||
|
for (int i = 0; i < test_iterations[2]; ++i) {
|
||||||
|
reset_and_start_timer();
|
||||||
|
loop_stencil_serial(0, 6, width, Nx-width, width, Ny - width,
|
||||||
|
width, Nz - width, Nx, Ny, Nz, coeff, vsq,
|
||||||
|
Aserial[0], Aserial[1]);
|
||||||
|
double dt = get_elapsed_msec();
|
||||||
|
printf("@time of serial run:\t\t\t[%.3f] msec\n", dt);
|
||||||
|
minTimeSerial = std::min(minTimeSerial, dt);
|
||||||
|
}
|
||||||
|
|
||||||
|
printf("[stencil serial]:\t\t[%.3f] msec\n", minTimeSerial);
|
||||||
|
|
||||||
|
#if 0
|
||||||
|
//#ifndef _CUDA_
|
||||||
|
printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n",
|
||||||
|
minTimeSerial / minTimeISPC, minTimeSerial / minTimeISPCTasks);
|
||||||
|
#else
|
||||||
|
printf("\t\t\t\t(%.2fx speedup from ISPC + tasks)\n",
|
||||||
|
minTimeSerial / minTimeISPCTasks);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Check for agreement
|
||||||
|
int offset = 0;
|
||||||
|
for (int z = 0; z < Nz; ++z)
|
||||||
|
for (int y = 0; y < Ny; ++y)
|
||||||
|
for (int x = 0; x < Nx; ++x, ++offset) {
|
||||||
|
double error = fabsf((Aserial[1][offset] - Aispc[1][offset]) /
|
||||||
|
Aserial[1][offset]);
|
||||||
|
if (error > 1e-4)
|
||||||
|
printf("Error @ (%d,%d,%d): ispc = %f, serial = %f\n",
|
||||||
|
x, y, z, Aispc[1][offset], Aserial[1][offset]);
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
129
examples_ptx/stencil/stencil.cu
Normal file
129
examples_ptx/stencil/stencil.cu
Normal file
@@ -0,0 +1,129 @@
|
|||||||
|
#include "cuda_helpers.cuh"
|
||||||
|
|
||||||
|
__device__ static void
|
||||||
|
stencil_step( int x0, int x1,
|
||||||
|
int y0, int y1,
|
||||||
|
int z0, int z1,
|
||||||
|
int Nx, int Ny, int Nz,
|
||||||
|
const double coef[4], const double vsq[],
|
||||||
|
const double Ain[], double Aout[])
|
||||||
|
{
|
||||||
|
const int Nxy = Nx * Ny;
|
||||||
|
|
||||||
|
|
||||||
|
const double coef0 = coef[0];
|
||||||
|
const double coef1 = coef[1];
|
||||||
|
const double coef2 = coef[2];
|
||||||
|
const double coef3 = coef[3];
|
||||||
|
for ( int z = z0; z < z1; z++)
|
||||||
|
for ( int y = y0 ; y < y1; y++)
|
||||||
|
for ( int xb = x0; xb < x1; xb += programCount)
|
||||||
|
{
|
||||||
|
const int x = xb + programIndex;
|
||||||
|
|
||||||
|
int index = (z * Nxy) + (y * Nx) + x;
|
||||||
|
#define A_cur(x, y, z) __ldg(&Ain[index + (x) + ((y) * Nx) + ((z) * Nxy)])
|
||||||
|
#define A_next(x, y, z) Aout[index + (x) + ((y) * Nx) + ((z) * Nxy)]
|
||||||
|
double div =
|
||||||
|
coef0 * A_cur(0, 0, 0) +
|
||||||
|
coef1 * (A_cur(+1, 0, 0) + A_cur(-1, 0, 0) +
|
||||||
|
A_cur(0, +1, 0) + A_cur(0, -1, 0) +
|
||||||
|
A_cur(0, 0, +1) + A_cur(0, 0, -1)) +
|
||||||
|
coef2 * (A_cur(+2, 0, 0) + A_cur(-2, 0, 0) +
|
||||||
|
A_cur(0, +2, 0) + A_cur(0, -2, 0) +
|
||||||
|
A_cur(0, 0, +2) + A_cur(0, 0, -2)) +
|
||||||
|
coef3 * (A_cur(+3, 0, 0) + A_cur(-3, 0, 0) +
|
||||||
|
A_cur(0, +3, 0) + A_cur(0, -3, 0) +
|
||||||
|
A_cur(0, 0, +3) + A_cur(0, 0, -3));
|
||||||
|
|
||||||
|
if (x < x1)
|
||||||
|
A_next(0, 0, 0) = 2.0 * A_cur(0, 0, 0) - A_next(0, 0, 0) +
|
||||||
|
__ldg(&vsq[index]) * div;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#define SPANX 32
|
||||||
|
#define SPANY 2
|
||||||
|
#define SPANZ 4
|
||||||
|
|
||||||
|
__global__ void
|
||||||
|
stencil_step_task( int x0, int x1,
|
||||||
|
int y0, int y1,
|
||||||
|
int z0, int z1,
|
||||||
|
int Nx, int Ny, int Nz,
|
||||||
|
const double coef[4], const double vsq[],
|
||||||
|
const double Ain[], double Aout[]) {
|
||||||
|
if (taskIndex0 >= taskCount0 ||
|
||||||
|
taskIndex1 >= taskCount1 ||
|
||||||
|
taskIndex2 >= taskCount2)
|
||||||
|
return;
|
||||||
|
|
||||||
|
const int xfirst = x0 + taskIndex0 * SPANX;
|
||||||
|
const int xlast = min(x1, xfirst + SPANX);
|
||||||
|
|
||||||
|
const int yfirst = y0 + taskIndex1 * SPANY;
|
||||||
|
const int ylast = min(y1, yfirst + SPANY);
|
||||||
|
|
||||||
|
const int zfirst = z0 + taskIndex2 * SPANZ;
|
||||||
|
const int zlast = min(z1, zfirst + SPANZ);
|
||||||
|
|
||||||
|
stencil_step(xfirst,xlast, yfirst,ylast, zfirst,zlast,
|
||||||
|
Nx, Ny, Nz, coef, vsq, Ain, Aout);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
extern "C"
|
||||||
|
__global__ void
|
||||||
|
loop_stencil_ispc_tasks___export( int t0, int t1,
|
||||||
|
int x0, int x1,
|
||||||
|
int y0, int y1,
|
||||||
|
int z0, int z1,
|
||||||
|
int Nx, int Ny, int Nz,
|
||||||
|
const double coef[4],
|
||||||
|
const double vsq[],
|
||||||
|
double Aeven[], double Aodd[])
|
||||||
|
{
|
||||||
|
#define NB(x,n) (((x)+(n)-1)/(n))
|
||||||
|
|
||||||
|
dim3 grid((NB(x1-x0,SPANX)-1)/4+1, NB(y1-y0,SPANY), NB(z1-z0,SPANZ));
|
||||||
|
|
||||||
|
for ( int t = t0; t < t1; ++t)
|
||||||
|
{
|
||||||
|
// Parallelize across cores as well: each task will work on a slice
|
||||||
|
// of 1 in the z extent of the volume.
|
||||||
|
if ((t & 1) == 0)
|
||||||
|
{
|
||||||
|
if (programIndex == 0)
|
||||||
|
stencil_step_task<<<grid,128>>>(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz,
|
||||||
|
coef, vsq, Aeven, Aodd);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
if (programIndex == 0)
|
||||||
|
stencil_step_task<<<grid,128>>>(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz,
|
||||||
|
coef, vsq, Aodd, Aeven);
|
||||||
|
}
|
||||||
|
|
||||||
|
// We need to wait for all of the launched tasks to finish before
|
||||||
|
// starting the next iteration
|
||||||
|
cudaDeviceSynchronize();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
extern "C"
|
||||||
|
__host__ void
|
||||||
|
loop_stencil_ispc_tasks( int t0, int t1,
|
||||||
|
int x0, int x1,
|
||||||
|
int y0, int y1,
|
||||||
|
int z0, int z1,
|
||||||
|
int Nx, int Ny, int Nz,
|
||||||
|
const double coef[4],
|
||||||
|
const double vsq[],
|
||||||
|
double Aeven[], double Aodd[])
|
||||||
|
{
|
||||||
|
loop_stencil_ispc_tasks___export<<<1,32>>>(t0,t1,x0,x1,y0,y1,z0,z1,Nx,Ny,Nz,coef,vsq,Aeven,Aodd);
|
||||||
|
cudaDeviceSynchronize();
|
||||||
|
}
|
||||||
|
|
||||||
164
examples_ptx/stencil/stencil.ispc
Normal file
164
examples_ptx/stencil/stencil.ispc
Normal file
@@ -0,0 +1,164 @@
|
|||||||
|
/*
|
||||||
|
Copyright (c) 2010-2011, Intel Corporation
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
|
||||||
|
* Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
|
||||||
|
* Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
|
||||||
|
* Neither the name of Intel Corporation nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived from
|
||||||
|
this software without specific prior written permission.
|
||||||
|
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||||
|
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||||
|
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||||
|
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||||
|
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||||
|
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||||
|
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||||
|
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||||
|
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||||
|
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
|
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
static inline void
|
||||||
|
stencil_step(uniform int x0, uniform int x1,
|
||||||
|
uniform int y0, uniform int y1,
|
||||||
|
uniform int z0, uniform int z1,
|
||||||
|
uniform int Nx, uniform int Ny, uniform int Nz,
|
||||||
|
uniform const double coef[4], uniform const double vsq[],
|
||||||
|
uniform const double Ain[], uniform double Aout[]) {
|
||||||
|
const uniform int Nxy = Nx * Ny;
|
||||||
|
|
||||||
|
#if 0
|
||||||
|
#define VER1
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef VER1
|
||||||
|
const uniform int x1o = 1;
|
||||||
|
const uniform int x2o = 2;
|
||||||
|
const uniform int x3o = 3;
|
||||||
|
const uniform int y1o = Nx;
|
||||||
|
const uniform int y2o = Nx*2;
|
||||||
|
const uniform int y3o = Nx*3;
|
||||||
|
const uniform int z1o = Nxy;
|
||||||
|
const uniform int z2o = Nxy*2;
|
||||||
|
const uniform int z3o = Nxy*3;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
foreach (z = z0 ... z1, y = y0 ... y1, x = x0 ... x1)
|
||||||
|
{
|
||||||
|
#ifdef VER1
|
||||||
|
|
||||||
|
int index = (z * Nxy) + (y * Nx) + x;
|
||||||
|
#define A_cur(x, y, z) Ain [index + (x) + (y) + (z)]
|
||||||
|
#define A_next(x, y, z) Aout[index + (x) + (y) + (z)]
|
||||||
|
double div = coef[0] * A_cur(0, 0, 0) +
|
||||||
|
coef[1] * (A_cur(+x1o, 0, 0) + A_cur(-x1o, 0, 0) +
|
||||||
|
A_cur(0, +y1o, 0) + A_cur(0, -y1o, 0) +
|
||||||
|
A_cur(0, 0, +z1o) + A_cur(0, 0, -z1o)) +
|
||||||
|
coef[2] * (A_cur(+x2o, 0, 0) + A_cur(-x2o, 0, 0) +
|
||||||
|
A_cur(0, +y2o, 0) + A_cur(0, -y2o, 0) +
|
||||||
|
A_cur(0, 0, +z2o) + A_cur(0, 0, -z2o)) +
|
||||||
|
coef[3] * (A_cur(+x3o, 0, 0) + A_cur(-x3o, 0, 0) +
|
||||||
|
A_cur(0, +y3o, 0) + A_cur(0, -y3o, 0) +
|
||||||
|
A_cur(0, 0, +z3o) + A_cur(0, 0, -z3o));
|
||||||
|
A_next(0, 0, 0) = 2.0d0 * A_cur(0, 0, 0) - A_next(0, 0, 0) +
|
||||||
|
vsq[index] * div;
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
int index = (z * Nxy) + (y * Nx) + x;
|
||||||
|
#define A_cur(x, y, z) Ain[index + (x) + ((y) * Nx) + ((z) * Nxy)]
|
||||||
|
#define A_next(x, y, z) Aout[index + (x) + ((y) * Nx) + ((z) * Nxy)]
|
||||||
|
double div = coef[0] * A_cur(0, 0, 0) +
|
||||||
|
coef[1] * (A_cur(+1, 0, 0) + A_cur(-1, 0, 0) +
|
||||||
|
A_cur(0, +1, 0) + A_cur(0, -1, 0) +
|
||||||
|
A_cur(0, 0, +1) + A_cur(0, 0, -1)) +
|
||||||
|
coef[2] * (A_cur(+2, 0, 0) + A_cur(-2, 0, 0) +
|
||||||
|
A_cur(0, +2, 0) + A_cur(0, -2, 0) +
|
||||||
|
A_cur(0, 0, +2) + A_cur(0, 0, -2)) +
|
||||||
|
coef[3] * (A_cur(+3, 0, 0) + A_cur(-3, 0, 0) +
|
||||||
|
A_cur(0, +3, 0) + A_cur(0, -3, 0) +
|
||||||
|
A_cur(0, 0, +3) + A_cur(0, 0, -3));
|
||||||
|
|
||||||
|
A_next(0, 0, 0) = 2.0 * A_cur(0, 0, 0) - A_next(0, 0, 0) +
|
||||||
|
vsq[index] * div;
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#define SPANX 32
|
||||||
|
#define SPANY 2
|
||||||
|
#define SPANZ 4
|
||||||
|
|
||||||
|
static task void
|
||||||
|
stencil_step_task(uniform int x0, uniform int x1,
|
||||||
|
uniform int y0, uniform int y1,
|
||||||
|
uniform int z0, uniform int z1,
|
||||||
|
uniform int Nx, uniform int Ny, uniform int Nz,
|
||||||
|
uniform const double coef[4], uniform const double vsq[],
|
||||||
|
uniform const double Ain[], uniform double Aout[]) {
|
||||||
|
if (taskIndex0 >= taskCount0 ||
|
||||||
|
taskIndex1 >= taskCount1 ||
|
||||||
|
taskIndex2 >= taskCount2)
|
||||||
|
return;
|
||||||
|
|
||||||
|
const uniform int xfirst = x0 + taskIndex0 * SPANX;
|
||||||
|
const uniform int xlast = min(x1, xfirst + SPANX);
|
||||||
|
|
||||||
|
const uniform int yfirst = y0 + taskIndex1 * SPANY;
|
||||||
|
const uniform int ylast = min(y1, yfirst + SPANY);
|
||||||
|
|
||||||
|
const uniform int zfirst = z0 + taskIndex2 * SPANZ;
|
||||||
|
const uniform int zlast = min(z1, zfirst + SPANZ);
|
||||||
|
|
||||||
|
stencil_step(xfirst,xlast, yfirst,ylast, zfirst,zlast,
|
||||||
|
Nx, Ny, Nz, coef, vsq, Ain, Aout);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
export void
|
||||||
|
loop_stencil_ispc_tasks(uniform int t0, uniform int t1,
|
||||||
|
uniform int x0, uniform int x1,
|
||||||
|
uniform int y0, uniform int y1,
|
||||||
|
uniform int z0, uniform int z1,
|
||||||
|
uniform int Nx, uniform int Ny, uniform int Nz,
|
||||||
|
uniform const double coef[4],
|
||||||
|
uniform const double vsq[],
|
||||||
|
uniform double Aeven[], uniform double Aodd[])
|
||||||
|
{
|
||||||
|
#define NB(x,n) (((x)+(n)-1)/(n))
|
||||||
|
|
||||||
|
for (uniform int t = t0; t < t1; ++t)
|
||||||
|
{
|
||||||
|
// Parallelize across cores as well: each task will work on a slice
|
||||||
|
// of 1 in the z extent of the volume.
|
||||||
|
if ((t & 1) == 0)
|
||||||
|
launch[NB(z1-z0,SPANZ)][NB(y1-y0,SPANY)][NB(x1-x0,SPANX)]
|
||||||
|
stencil_step_task(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz,
|
||||||
|
coef, vsq, Aeven, Aodd);
|
||||||
|
else
|
||||||
|
launch[NB(z1-z0,SPANZ)][NB(y1-y0,SPANY)][NB(x1-x0,SPANX)]
|
||||||
|
stencil_step_task(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz,
|
||||||
|
coef, vsq, Aodd, Aeven);
|
||||||
|
|
||||||
|
// We need to wait for all of the launched tasks to finish before
|
||||||
|
// starting the next iteration.
|
||||||
|
sync;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
122
examples_ptx/stencil/stencil.orig.ispc
Normal file
122
examples_ptx/stencil/stencil.orig.ispc
Normal file
@@ -0,0 +1,122 @@
|
|||||||
|
/*
|
||||||
|
Copyright (c) 2010-2011, Intel Corporation
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
|
||||||
|
* Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
|
||||||
|
* Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
|
||||||
|
* Neither the name of Intel Corporation nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived from
|
||||||
|
this software without specific prior written permission.
|
||||||
|
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||||
|
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||||
|
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||||
|
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||||
|
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||||
|
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||||
|
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||||
|
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||||
|
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||||
|
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
|
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
static void
|
||||||
|
stencil_step(uniform int x0, uniform int x1,
|
||||||
|
uniform int y0, uniform int y1,
|
||||||
|
uniform int z0, uniform int z1,
|
||||||
|
uniform int Nx, uniform int Ny, uniform int Nz,
|
||||||
|
uniform const double coef[4], uniform const double vsq[],
|
||||||
|
uniform const double Ain[], uniform double Aout[]) {
|
||||||
|
const uniform int Nxy = Nx * Ny;
|
||||||
|
|
||||||
|
foreach (z = z0 ... z1, y = y0 ... y1, x = x0 ... x1) {
|
||||||
|
int index = (z * Nxy) + (y * Nx) + x;
|
||||||
|
#define A_cur(x, y, z) Ain[index + (x) + ((y) * Nx) + ((z) * Nxy)]
|
||||||
|
#define A_next(x, y, z) Aout[index + (x) + ((y) * Nx) + ((z) * Nxy)]
|
||||||
|
double div = coef[0] * A_cur(0, 0, 0) +
|
||||||
|
coef[1] * (A_cur(+1, 0, 0) + A_cur(-1, 0, 0) +
|
||||||
|
A_cur(0, +1, 0) + A_cur(0, -1, 0) +
|
||||||
|
A_cur(0, 0, +1) + A_cur(0, 0, -1)) +
|
||||||
|
coef[2] * (A_cur(+2, 0, 0) + A_cur(-2, 0, 0) +
|
||||||
|
A_cur(0, +2, 0) + A_cur(0, -2, 0) +
|
||||||
|
A_cur(0, 0, +2) + A_cur(0, 0, -2)) +
|
||||||
|
coef[3] * (A_cur(+3, 0, 0) + A_cur(-3, 0, 0) +
|
||||||
|
A_cur(0, +3, 0) + A_cur(0, -3, 0) +
|
||||||
|
A_cur(0, 0, +3) + A_cur(0, 0, -3));
|
||||||
|
|
||||||
|
A_next(0, 0, 0) = 2.0d0 * A_cur(0, 0, 0) - A_next(0, 0, 0) +
|
||||||
|
vsq[index] * div;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
static task void
|
||||||
|
stencil_step_task(uniform int x0, uniform int x1,
|
||||||
|
uniform int y0, uniform int y1,
|
||||||
|
uniform int z0,
|
||||||
|
uniform int Nx, uniform int Ny, uniform int Nz,
|
||||||
|
uniform const double coef[4], uniform const double vsq[],
|
||||||
|
uniform const double Ain[], uniform double Aout[]) {
|
||||||
|
stencil_step(x0, x1, y0, y1, z0+taskIndex, z0+taskIndex+1,
|
||||||
|
Nx, Ny, Nz, coef, vsq, Ain, Aout);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
export void
|
||||||
|
loop_stencil_ispc_tasks(uniform int t0, uniform int t1,
|
||||||
|
uniform int x0, uniform int x1,
|
||||||
|
uniform int y0, uniform int y1,
|
||||||
|
uniform int z0, uniform int z1,
|
||||||
|
uniform int Nx, uniform int Ny, uniform int Nz,
|
||||||
|
uniform const double coef[4],
|
||||||
|
uniform const double vsq[],
|
||||||
|
uniform double Aeven[], uniform double Aodd[])
|
||||||
|
{
|
||||||
|
for (uniform int t = t0; t < t1; ++t) {
|
||||||
|
// Parallelize across cores as well: each task will work on a slice
|
||||||
|
// of 1 in the z extent of the volume.
|
||||||
|
if ((t & 1) == 0)
|
||||||
|
launch[z1-z0] stencil_step_task(x0, x1, y0, y1, z0, Nx, Ny, Nz,
|
||||||
|
coef, vsq, Aeven, Aodd);
|
||||||
|
else
|
||||||
|
launch[z1-z0] stencil_step_task(x0, x1, y0, y1, z0, Nx, Ny, Nz,
|
||||||
|
coef, vsq, Aodd, Aeven);
|
||||||
|
|
||||||
|
// We need to wait for all of the launched tasks to finish before
|
||||||
|
// starting the next iteration.
|
||||||
|
sync;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
export void
|
||||||
|
loop_stencil_ispc(uniform int t0, uniform int t1,
|
||||||
|
uniform int x0, uniform int x1,
|
||||||
|
uniform int y0, uniform int y1,
|
||||||
|
uniform int z0, uniform int z1,
|
||||||
|
uniform int Nx, uniform int Ny, uniform int Nz,
|
||||||
|
uniform const double coef[4],
|
||||||
|
uniform const double vsq[],
|
||||||
|
uniform double Aeven[], uniform double Aodd[])
|
||||||
|
{
|
||||||
|
for (uniform int t = t0; t < t1; ++t) {
|
||||||
|
if ((t & 1) == 0)
|
||||||
|
stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq,
|
||||||
|
Aeven, Aodd);
|
||||||
|
else
|
||||||
|
stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq,
|
||||||
|
Aodd, Aeven);
|
||||||
|
}
|
||||||
|
}
|
||||||
34
examples_ptx/stencil/stencil.vcxproj
Normal file
34
examples_ptx/stencil/stencil.vcxproj
Normal file
@@ -0,0 +1,34 @@
|
|||||||
|
<?xml version="1.0" encoding="utf-8"?>
|
||||||
|
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||||
|
<ItemGroup Label="ProjectConfigurations">
|
||||||
|
<ProjectConfiguration Include="Debug|Win32">
|
||||||
|
<Configuration>Debug</Configuration>
|
||||||
|
<Platform>Win32</Platform>
|
||||||
|
</ProjectConfiguration>
|
||||||
|
<ProjectConfiguration Include="Debug|x64">
|
||||||
|
<Configuration>Debug</Configuration>
|
||||||
|
<Platform>x64</Platform>
|
||||||
|
</ProjectConfiguration>
|
||||||
|
<ProjectConfiguration Include="Release|Win32">
|
||||||
|
<Configuration>Release</Configuration>
|
||||||
|
<Platform>Win32</Platform>
|
||||||
|
</ProjectConfiguration>
|
||||||
|
<ProjectConfiguration Include="Release|x64">
|
||||||
|
<Configuration>Release</Configuration>
|
||||||
|
<Platform>x64</Platform>
|
||||||
|
</ProjectConfiguration>
|
||||||
|
</ItemGroup>
|
||||||
|
<PropertyGroup Label="Globals">
|
||||||
|
<ProjectGuid>{2ef070a1-f62f-4e6a-944b-88d140945c3c}</ProjectGuid>
|
||||||
|
<Keyword>Win32Proj</Keyword>
|
||||||
|
<RootNamespace>rt</RootNamespace>
|
||||||
|
<ISPC_file>stencil</ISPC_file>
|
||||||
|
<default_targets>sse2,sse4-x2,avx1-i32x8</default_targets>
|
||||||
|
</PropertyGroup>
|
||||||
|
<Import Project="..\common.props" />
|
||||||
|
<ItemGroup>
|
||||||
|
<ClCompile Include="stencil.cpp" />
|
||||||
|
<ClCompile Include="stencil_serial.cpp" />
|
||||||
|
<ClCompile Include="../tasksys.cpp" />
|
||||||
|
</ItemGroup>
|
||||||
|
</Project>
|
||||||
86
examples_ptx/stencil/stencil_serial.cpp
Normal file
86
examples_ptx/stencil/stencil_serial.cpp
Normal file
@@ -0,0 +1,86 @@
|
|||||||
|
/*
|
||||||
|
Copyright (c) 2010-2011, Intel Corporation
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
|
||||||
|
* Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
|
||||||
|
* Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
|
||||||
|
* Neither the name of Intel Corporation nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived from
|
||||||
|
this software without specific prior written permission.
|
||||||
|
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||||
|
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||||
|
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||||
|
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||||
|
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||||
|
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||||
|
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||||
|
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||||
|
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||||
|
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
|
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
static void
|
||||||
|
stencil_step(int x0, int x1,
|
||||||
|
int y0, int y1,
|
||||||
|
int z0, int z1,
|
||||||
|
int Nx, int Ny, int Nz,
|
||||||
|
const double coef[4], const double vsq[],
|
||||||
|
const double Ain[], double Aout[]) {
|
||||||
|
int Nxy = Nx * Ny;
|
||||||
|
|
||||||
|
for (int z = z0; z < z1; ++z) {
|
||||||
|
for (int y = y0; y < y1; ++y) {
|
||||||
|
for (int x = x0; x < x1; ++x) {
|
||||||
|
int index = (z * Nxy) + (y * Nx) + x;
|
||||||
|
#define A_cur(x, y, z) Ain[index + (x) + ((y) * Nx) + ((z) * Nxy)]
|
||||||
|
#define A_next(x, y, z) Aout[index + (x) + ((y) * Nx) + ((z) * Nxy)]
|
||||||
|
double div = coef[0] * A_cur(0, 0, 0) +
|
||||||
|
coef[1] * (A_cur(+1, 0, 0) + A_cur(-1, 0, 0) +
|
||||||
|
A_cur(0, +1, 0) + A_cur(0, -1, 0) +
|
||||||
|
A_cur(0, 0, +1) + A_cur(0, 0, -1)) +
|
||||||
|
coef[2] * (A_cur(+2, 0, 0) + A_cur(-2, 0, 0) +
|
||||||
|
A_cur(0, +2, 0) + A_cur(0, -2, 0) +
|
||||||
|
A_cur(0, 0, +2) + A_cur(0, 0, -2)) +
|
||||||
|
coef[3] * (A_cur(+3, 0, 0) + A_cur(-3, 0, 0) +
|
||||||
|
A_cur(0, +3, 0) + A_cur(0, -3, 0) +
|
||||||
|
A_cur(0, 0, +3) + A_cur(0, 0, -3));
|
||||||
|
|
||||||
|
A_next(0, 0, 0) = 2 * A_cur(0, 0, 0) - A_next(0, 0, 0) +
|
||||||
|
vsq[index] * div;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void loop_stencil_serial(int t0, int t1,
|
||||||
|
int x0, int x1,
|
||||||
|
int y0, int y1,
|
||||||
|
int z0, int z1,
|
||||||
|
int Nx, int Ny, int Nz,
|
||||||
|
const double coef[4],
|
||||||
|
const double vsq[],
|
||||||
|
double Aeven[], double Aodd[])
|
||||||
|
{
|
||||||
|
for (int t = t0; t < t1; ++t) {
|
||||||
|
if ((t & 1) == 0)
|
||||||
|
stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq,
|
||||||
|
Aeven, Aodd);
|
||||||
|
else
|
||||||
|
stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq,
|
||||||
|
Aodd, Aeven);
|
||||||
|
}
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user