added stencil
This commit is contained in:
2
examples_ptx/stencil/.gitignore
vendored
Normal file
2
examples_ptx/stencil/.gitignore
vendored
Normal file
@@ -0,0 +1,2 @@
|
||||
stencil
|
||||
objs
|
||||
8
examples_ptx/stencil/Makefile
Normal file
8
examples_ptx/stencil/Makefile
Normal file
@@ -0,0 +1,8 @@
|
||||
|
||||
EXAMPLE=stencil
|
||||
CPP_SRC=stencil.cpp stencil_serial.cpp
|
||||
ISPC_SRC=stencil.ispc
|
||||
ISPC_IA_TARGETS=sse2-i32x4,sse4-i32x8,avx1-i32x16,avx2-i32x16
|
||||
ISPC_ARM_TARGETS=neon
|
||||
|
||||
include ../common.mk
|
||||
8
examples_ptx/stencil/Makefile_cpu
Normal file
8
examples_ptx/stencil/Makefile_cpu
Normal file
@@ -0,0 +1,8 @@
|
||||
|
||||
EXAMPLE=stencil
|
||||
CPP_SRC=stencil.cpp stencil_serial.cpp
|
||||
ISPC_SRC=stencil.ispc
|
||||
ISPC_IA_TARGETS=avx1-i32x16
|
||||
ISPC_ARM_TARGETS=neon
|
||||
|
||||
include ../common.mk
|
||||
10
examples_ptx/stencil/Makefile_gpu
Normal file
10
examples_ptx/stencil/Makefile_gpu
Normal file
@@ -0,0 +1,10 @@
|
||||
PROG=stencil
|
||||
ISPC_SRC=stencil.ispc
|
||||
CU_SRC=stencil.cu
|
||||
CXX_SRC=stencil.cpp stencil_serial.cpp
|
||||
PTXCC_REGMAX=128
|
||||
|
||||
include ../common_gpu.mk
|
||||
|
||||
|
||||
|
||||
184
examples_ptx/stencil/stencil.cpp
Normal file
184
examples_ptx/stencil/stencil.cpp
Normal file
@@ -0,0 +1,184 @@
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define _CRT_SECURE_NO_WARNINGS
|
||||
#define NOMINMAX
|
||||
#pragma warning (disable: 4244)
|
||||
#pragma warning (disable: 4305)
|
||||
#endif
|
||||
|
||||
#include <cstdio>
|
||||
#include <algorithm>
|
||||
#include <cstring>
|
||||
#include <math.h>
|
||||
#include "../timing.h"
|
||||
#include "../ispc_malloc.h"
|
||||
#include "stencil_ispc.h"
|
||||
using namespace ispc;
|
||||
|
||||
|
||||
extern void loop_stencil_serial(int t0, int t1, int x0, int x1,
|
||||
int y0, int y1, int z0, int z1,
|
||||
int Nx, int Ny, int Nz,
|
||||
const double coef[5],
|
||||
const double vsq[],
|
||||
double Aeven[], double Aodd[]);
|
||||
|
||||
|
||||
void InitData(int Nx, int Ny, int Nz, double *A[2], double *vsq) {
|
||||
int offset = 0;
|
||||
for (int z = 0; z < Nz; ++z)
|
||||
for (int y = 0; y < Ny; ++y)
|
||||
for (int x = 0; x < Nx; ++x, ++offset) {
|
||||
A[0][offset] = (x < Nx / 2) ? x / double(Nx) : y / double(Ny);
|
||||
A[1][offset] = 0;
|
||||
vsq[offset] = x*y*z / double(Nx * Ny * Nz);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
static unsigned int test_iterations[] = {3, 3, 3};//the last two numbers must be equal here
|
||||
int Nx = 256, Ny = 256, Nz = 256;
|
||||
int width = 4;
|
||||
|
||||
if (argc > 1) {
|
||||
if (strncmp(argv[1], "--scale=", 8) == 0) {
|
||||
double scale = atof(argv[1] + 8);
|
||||
Nx *= scale;
|
||||
Ny *= scale;
|
||||
Nz *= scale;
|
||||
}
|
||||
}
|
||||
if ((argc == 4) || (argc == 5)) {
|
||||
for (int i = 0; i < 3; i++) {
|
||||
test_iterations[i] = atoi(argv[argc - 3 + i]);
|
||||
}
|
||||
}
|
||||
|
||||
double *Aserial[2], *Aispc[2];
|
||||
Aserial[0] = new double [Nx * Ny * Nz];
|
||||
Aserial[1] = new double [Nx * Ny * Nz];
|
||||
Aispc[0] = new double [Nx * Ny * Nz];
|
||||
Aispc[1] = new double [Nx * Ny * Nz];
|
||||
double *vsq = new double [Nx * Ny * Nz];
|
||||
double *coeff = new double[4];
|
||||
|
||||
coeff[0] = 0.5;
|
||||
coeff[1] = -.25;
|
||||
coeff[2] = .125;
|
||||
coeff[3] = -.0625;
|
||||
|
||||
InitData(Nx, Ny, Nz, Aispc, vsq);
|
||||
//
|
||||
// Compute the image using the ispc implementation on one core; report
|
||||
// the minimum time of three runs.
|
||||
//
|
||||
#if 0
|
||||
//#ifndef _CUDA_
|
||||
double minTimeISPC = 1e30;
|
||||
for (int i = 0; i < test_iterations[0]; ++i) {
|
||||
reset_and_start_timer();
|
||||
loop_stencil_ispc(0, 6, width, Nx - width, width, Ny - width,
|
||||
width, Nz - width, Nx, Ny, Nz, coeff, vsq,
|
||||
Aispc[0], Aispc[1]);
|
||||
double dt = get_elapsed_msec();
|
||||
printf("@time of ISPC run:\t\t\t[%.3f] msec\n", dt);
|
||||
minTimeISPC = std::min(minTimeISPC, dt);
|
||||
}
|
||||
|
||||
printf("[stencil ispc 1 core]:\t\t[%.3f] msec\n", minTimeISPC);
|
||||
|
||||
InitData(Nx, Ny, Nz, Aispc, vsq);
|
||||
#endif
|
||||
|
||||
//
|
||||
// Compute the image using the ispc implementation with tasks; report
|
||||
// the minimum time of three runs.
|
||||
//
|
||||
double minTimeISPCTasks = 1e30;
|
||||
for (int i = 0; i < test_iterations[1]; ++i) {
|
||||
reset_and_start_timer();
|
||||
loop_stencil_ispc_tasks(0, 6, width, Nx - width, width, Ny - width,
|
||||
width, Nz - width, Nx, Ny, Nz, coeff, vsq,
|
||||
Aispc[0], Aispc[1]);
|
||||
double dt = get_elapsed_msec();
|
||||
printf("@time of ISPC + TASKS run:\t\t\t[%.3f] msec\n", dt);
|
||||
minTimeISPCTasks = std::min(minTimeISPCTasks, dt);
|
||||
}
|
||||
|
||||
printf("[stencil ispc + tasks]:\t\t[%.3f] msec\n", minTimeISPCTasks);
|
||||
|
||||
InitData(Nx, Ny, Nz, Aserial, vsq);
|
||||
|
||||
//
|
||||
// And run the serial implementation 3 times, again reporting the
|
||||
// minimum time.
|
||||
//
|
||||
double minTimeSerial = 1e30;
|
||||
for (int i = 0; i < test_iterations[2]; ++i) {
|
||||
reset_and_start_timer();
|
||||
loop_stencil_serial(0, 6, width, Nx-width, width, Ny - width,
|
||||
width, Nz - width, Nx, Ny, Nz, coeff, vsq,
|
||||
Aserial[0], Aserial[1]);
|
||||
double dt = get_elapsed_msec();
|
||||
printf("@time of serial run:\t\t\t[%.3f] msec\n", dt);
|
||||
minTimeSerial = std::min(minTimeSerial, dt);
|
||||
}
|
||||
|
||||
printf("[stencil serial]:\t\t[%.3f] msec\n", minTimeSerial);
|
||||
|
||||
#if 0
|
||||
//#ifndef _CUDA_
|
||||
printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n",
|
||||
minTimeSerial / minTimeISPC, minTimeSerial / minTimeISPCTasks);
|
||||
#else
|
||||
printf("\t\t\t\t(%.2fx speedup from ISPC + tasks)\n",
|
||||
minTimeSerial / minTimeISPCTasks);
|
||||
#endif
|
||||
|
||||
// Check for agreement
|
||||
int offset = 0;
|
||||
for (int z = 0; z < Nz; ++z)
|
||||
for (int y = 0; y < Ny; ++y)
|
||||
for (int x = 0; x < Nx; ++x, ++offset) {
|
||||
double error = fabsf((Aserial[1][offset] - Aispc[1][offset]) /
|
||||
Aserial[1][offset]);
|
||||
if (error > 1e-4)
|
||||
printf("Error @ (%d,%d,%d): ispc = %f, serial = %f\n",
|
||||
x, y, z, Aispc[1][offset], Aserial[1][offset]);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
129
examples_ptx/stencil/stencil.cu
Normal file
129
examples_ptx/stencil/stencil.cu
Normal file
@@ -0,0 +1,129 @@
|
||||
#include "cuda_helpers.cuh"
|
||||
|
||||
__device__ static void
|
||||
stencil_step( int x0, int x1,
|
||||
int y0, int y1,
|
||||
int z0, int z1,
|
||||
int Nx, int Ny, int Nz,
|
||||
const double coef[4], const double vsq[],
|
||||
const double Ain[], double Aout[])
|
||||
{
|
||||
const int Nxy = Nx * Ny;
|
||||
|
||||
|
||||
const double coef0 = coef[0];
|
||||
const double coef1 = coef[1];
|
||||
const double coef2 = coef[2];
|
||||
const double coef3 = coef[3];
|
||||
for ( int z = z0; z < z1; z++)
|
||||
for ( int y = y0 ; y < y1; y++)
|
||||
for ( int xb = x0; xb < x1; xb += programCount)
|
||||
{
|
||||
const int x = xb + programIndex;
|
||||
|
||||
int index = (z * Nxy) + (y * Nx) + x;
|
||||
#define A_cur(x, y, z) __ldg(&Ain[index + (x) + ((y) * Nx) + ((z) * Nxy)])
|
||||
#define A_next(x, y, z) Aout[index + (x) + ((y) * Nx) + ((z) * Nxy)]
|
||||
double div =
|
||||
coef0 * A_cur(0, 0, 0) +
|
||||
coef1 * (A_cur(+1, 0, 0) + A_cur(-1, 0, 0) +
|
||||
A_cur(0, +1, 0) + A_cur(0, -1, 0) +
|
||||
A_cur(0, 0, +1) + A_cur(0, 0, -1)) +
|
||||
coef2 * (A_cur(+2, 0, 0) + A_cur(-2, 0, 0) +
|
||||
A_cur(0, +2, 0) + A_cur(0, -2, 0) +
|
||||
A_cur(0, 0, +2) + A_cur(0, 0, -2)) +
|
||||
coef3 * (A_cur(+3, 0, 0) + A_cur(-3, 0, 0) +
|
||||
A_cur(0, +3, 0) + A_cur(0, -3, 0) +
|
||||
A_cur(0, 0, +3) + A_cur(0, 0, -3));
|
||||
|
||||
if (x < x1)
|
||||
A_next(0, 0, 0) = 2.0 * A_cur(0, 0, 0) - A_next(0, 0, 0) +
|
||||
__ldg(&vsq[index]) * div;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#define SPANX 32
|
||||
#define SPANY 2
|
||||
#define SPANZ 4
|
||||
|
||||
__global__ void
|
||||
stencil_step_task( int x0, int x1,
|
||||
int y0, int y1,
|
||||
int z0, int z1,
|
||||
int Nx, int Ny, int Nz,
|
||||
const double coef[4], const double vsq[],
|
||||
const double Ain[], double Aout[]) {
|
||||
if (taskIndex0 >= taskCount0 ||
|
||||
taskIndex1 >= taskCount1 ||
|
||||
taskIndex2 >= taskCount2)
|
||||
return;
|
||||
|
||||
const int xfirst = x0 + taskIndex0 * SPANX;
|
||||
const int xlast = min(x1, xfirst + SPANX);
|
||||
|
||||
const int yfirst = y0 + taskIndex1 * SPANY;
|
||||
const int ylast = min(y1, yfirst + SPANY);
|
||||
|
||||
const int zfirst = z0 + taskIndex2 * SPANZ;
|
||||
const int zlast = min(z1, zfirst + SPANZ);
|
||||
|
||||
stencil_step(xfirst,xlast, yfirst,ylast, zfirst,zlast,
|
||||
Nx, Ny, Nz, coef, vsq, Ain, Aout);
|
||||
}
|
||||
|
||||
|
||||
|
||||
extern "C"
|
||||
__global__ void
|
||||
loop_stencil_ispc_tasks___export( int t0, int t1,
|
||||
int x0, int x1,
|
||||
int y0, int y1,
|
||||
int z0, int z1,
|
||||
int Nx, int Ny, int Nz,
|
||||
const double coef[4],
|
||||
const double vsq[],
|
||||
double Aeven[], double Aodd[])
|
||||
{
|
||||
#define NB(x,n) (((x)+(n)-1)/(n))
|
||||
|
||||
dim3 grid((NB(x1-x0,SPANX)-1)/4+1, NB(y1-y0,SPANY), NB(z1-z0,SPANZ));
|
||||
|
||||
for ( int t = t0; t < t1; ++t)
|
||||
{
|
||||
// Parallelize across cores as well: each task will work on a slice
|
||||
// of 1 in the z extent of the volume.
|
||||
if ((t & 1) == 0)
|
||||
{
|
||||
if (programIndex == 0)
|
||||
stencil_step_task<<<grid,128>>>(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz,
|
||||
coef, vsq, Aeven, Aodd);
|
||||
}
|
||||
else
|
||||
{
|
||||
if (programIndex == 0)
|
||||
stencil_step_task<<<grid,128>>>(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz,
|
||||
coef, vsq, Aodd, Aeven);
|
||||
}
|
||||
|
||||
// We need to wait for all of the launched tasks to finish before
|
||||
// starting the next iteration
|
||||
cudaDeviceSynchronize();
|
||||
}
|
||||
}
|
||||
|
||||
extern "C"
|
||||
__host__ void
|
||||
loop_stencil_ispc_tasks( int t0, int t1,
|
||||
int x0, int x1,
|
||||
int y0, int y1,
|
||||
int z0, int z1,
|
||||
int Nx, int Ny, int Nz,
|
||||
const double coef[4],
|
||||
const double vsq[],
|
||||
double Aeven[], double Aodd[])
|
||||
{
|
||||
loop_stencil_ispc_tasks___export<<<1,32>>>(t0,t1,x0,x1,y0,y1,z0,z1,Nx,Ny,Nz,coef,vsq,Aeven,Aodd);
|
||||
cudaDeviceSynchronize();
|
||||
}
|
||||
|
||||
164
examples_ptx/stencil/stencil.ispc
Normal file
164
examples_ptx/stencil/stencil.ispc
Normal file
@@ -0,0 +1,164 @@
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
static inline void
|
||||
stencil_step(uniform int x0, uniform int x1,
|
||||
uniform int y0, uniform int y1,
|
||||
uniform int z0, uniform int z1,
|
||||
uniform int Nx, uniform int Ny, uniform int Nz,
|
||||
uniform const double coef[4], uniform const double vsq[],
|
||||
uniform const double Ain[], uniform double Aout[]) {
|
||||
const uniform int Nxy = Nx * Ny;
|
||||
|
||||
#if 0
|
||||
#define VER1
|
||||
#endif
|
||||
|
||||
#ifdef VER1
|
||||
const uniform int x1o = 1;
|
||||
const uniform int x2o = 2;
|
||||
const uniform int x3o = 3;
|
||||
const uniform int y1o = Nx;
|
||||
const uniform int y2o = Nx*2;
|
||||
const uniform int y3o = Nx*3;
|
||||
const uniform int z1o = Nxy;
|
||||
const uniform int z2o = Nxy*2;
|
||||
const uniform int z3o = Nxy*3;
|
||||
#endif
|
||||
|
||||
foreach (z = z0 ... z1, y = y0 ... y1, x = x0 ... x1)
|
||||
{
|
||||
#ifdef VER1
|
||||
|
||||
int index = (z * Nxy) + (y * Nx) + x;
|
||||
#define A_cur(x, y, z) Ain [index + (x) + (y) + (z)]
|
||||
#define A_next(x, y, z) Aout[index + (x) + (y) + (z)]
|
||||
double div = coef[0] * A_cur(0, 0, 0) +
|
||||
coef[1] * (A_cur(+x1o, 0, 0) + A_cur(-x1o, 0, 0) +
|
||||
A_cur(0, +y1o, 0) + A_cur(0, -y1o, 0) +
|
||||
A_cur(0, 0, +z1o) + A_cur(0, 0, -z1o)) +
|
||||
coef[2] * (A_cur(+x2o, 0, 0) + A_cur(-x2o, 0, 0) +
|
||||
A_cur(0, +y2o, 0) + A_cur(0, -y2o, 0) +
|
||||
A_cur(0, 0, +z2o) + A_cur(0, 0, -z2o)) +
|
||||
coef[3] * (A_cur(+x3o, 0, 0) + A_cur(-x3o, 0, 0) +
|
||||
A_cur(0, +y3o, 0) + A_cur(0, -y3o, 0) +
|
||||
A_cur(0, 0, +z3o) + A_cur(0, 0, -z3o));
|
||||
A_next(0, 0, 0) = 2.0d0 * A_cur(0, 0, 0) - A_next(0, 0, 0) +
|
||||
vsq[index] * div;
|
||||
|
||||
#else
|
||||
|
||||
int index = (z * Nxy) + (y * Nx) + x;
|
||||
#define A_cur(x, y, z) Ain[index + (x) + ((y) * Nx) + ((z) * Nxy)]
|
||||
#define A_next(x, y, z) Aout[index + (x) + ((y) * Nx) + ((z) * Nxy)]
|
||||
double div = coef[0] * A_cur(0, 0, 0) +
|
||||
coef[1] * (A_cur(+1, 0, 0) + A_cur(-1, 0, 0) +
|
||||
A_cur(0, +1, 0) + A_cur(0, -1, 0) +
|
||||
A_cur(0, 0, +1) + A_cur(0, 0, -1)) +
|
||||
coef[2] * (A_cur(+2, 0, 0) + A_cur(-2, 0, 0) +
|
||||
A_cur(0, +2, 0) + A_cur(0, -2, 0) +
|
||||
A_cur(0, 0, +2) + A_cur(0, 0, -2)) +
|
||||
coef[3] * (A_cur(+3, 0, 0) + A_cur(-3, 0, 0) +
|
||||
A_cur(0, +3, 0) + A_cur(0, -3, 0) +
|
||||
A_cur(0, 0, +3) + A_cur(0, 0, -3));
|
||||
|
||||
A_next(0, 0, 0) = 2.0 * A_cur(0, 0, 0) - A_next(0, 0, 0) +
|
||||
vsq[index] * div;
|
||||
|
||||
#endif
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
#define SPANX 32
|
||||
#define SPANY 2
|
||||
#define SPANZ 4
|
||||
|
||||
static task void
|
||||
stencil_step_task(uniform int x0, uniform int x1,
|
||||
uniform int y0, uniform int y1,
|
||||
uniform int z0, uniform int z1,
|
||||
uniform int Nx, uniform int Ny, uniform int Nz,
|
||||
uniform const double coef[4], uniform const double vsq[],
|
||||
uniform const double Ain[], uniform double Aout[]) {
|
||||
if (taskIndex0 >= taskCount0 ||
|
||||
taskIndex1 >= taskCount1 ||
|
||||
taskIndex2 >= taskCount2)
|
||||
return;
|
||||
|
||||
const uniform int xfirst = x0 + taskIndex0 * SPANX;
|
||||
const uniform int xlast = min(x1, xfirst + SPANX);
|
||||
|
||||
const uniform int yfirst = y0 + taskIndex1 * SPANY;
|
||||
const uniform int ylast = min(y1, yfirst + SPANY);
|
||||
|
||||
const uniform int zfirst = z0 + taskIndex2 * SPANZ;
|
||||
const uniform int zlast = min(z1, zfirst + SPANZ);
|
||||
|
||||
stencil_step(xfirst,xlast, yfirst,ylast, zfirst,zlast,
|
||||
Nx, Ny, Nz, coef, vsq, Ain, Aout);
|
||||
}
|
||||
|
||||
|
||||
|
||||
export void
|
||||
loop_stencil_ispc_tasks(uniform int t0, uniform int t1,
|
||||
uniform int x0, uniform int x1,
|
||||
uniform int y0, uniform int y1,
|
||||
uniform int z0, uniform int z1,
|
||||
uniform int Nx, uniform int Ny, uniform int Nz,
|
||||
uniform const double coef[4],
|
||||
uniform const double vsq[],
|
||||
uniform double Aeven[], uniform double Aodd[])
|
||||
{
|
||||
#define NB(x,n) (((x)+(n)-1)/(n))
|
||||
|
||||
for (uniform int t = t0; t < t1; ++t)
|
||||
{
|
||||
// Parallelize across cores as well: each task will work on a slice
|
||||
// of 1 in the z extent of the volume.
|
||||
if ((t & 1) == 0)
|
||||
launch[NB(z1-z0,SPANZ)][NB(y1-y0,SPANY)][NB(x1-x0,SPANX)]
|
||||
stencil_step_task(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz,
|
||||
coef, vsq, Aeven, Aodd);
|
||||
else
|
||||
launch[NB(z1-z0,SPANZ)][NB(y1-y0,SPANY)][NB(x1-x0,SPANX)]
|
||||
stencil_step_task(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz,
|
||||
coef, vsq, Aodd, Aeven);
|
||||
|
||||
// We need to wait for all of the launched tasks to finish before
|
||||
// starting the next iteration.
|
||||
sync;
|
||||
}
|
||||
}
|
||||
|
||||
122
examples_ptx/stencil/stencil.orig.ispc
Normal file
122
examples_ptx/stencil/stencil.orig.ispc
Normal file
@@ -0,0 +1,122 @@
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
|
||||
static void
|
||||
stencil_step(uniform int x0, uniform int x1,
|
||||
uniform int y0, uniform int y1,
|
||||
uniform int z0, uniform int z1,
|
||||
uniform int Nx, uniform int Ny, uniform int Nz,
|
||||
uniform const double coef[4], uniform const double vsq[],
|
||||
uniform const double Ain[], uniform double Aout[]) {
|
||||
const uniform int Nxy = Nx * Ny;
|
||||
|
||||
foreach (z = z0 ... z1, y = y0 ... y1, x = x0 ... x1) {
|
||||
int index = (z * Nxy) + (y * Nx) + x;
|
||||
#define A_cur(x, y, z) Ain[index + (x) + ((y) * Nx) + ((z) * Nxy)]
|
||||
#define A_next(x, y, z) Aout[index + (x) + ((y) * Nx) + ((z) * Nxy)]
|
||||
double div = coef[0] * A_cur(0, 0, 0) +
|
||||
coef[1] * (A_cur(+1, 0, 0) + A_cur(-1, 0, 0) +
|
||||
A_cur(0, +1, 0) + A_cur(0, -1, 0) +
|
||||
A_cur(0, 0, +1) + A_cur(0, 0, -1)) +
|
||||
coef[2] * (A_cur(+2, 0, 0) + A_cur(-2, 0, 0) +
|
||||
A_cur(0, +2, 0) + A_cur(0, -2, 0) +
|
||||
A_cur(0, 0, +2) + A_cur(0, 0, -2)) +
|
||||
coef[3] * (A_cur(+3, 0, 0) + A_cur(-3, 0, 0) +
|
||||
A_cur(0, +3, 0) + A_cur(0, -3, 0) +
|
||||
A_cur(0, 0, +3) + A_cur(0, 0, -3));
|
||||
|
||||
A_next(0, 0, 0) = 2.0d0 * A_cur(0, 0, 0) - A_next(0, 0, 0) +
|
||||
vsq[index] * div;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static task void
|
||||
stencil_step_task(uniform int x0, uniform int x1,
|
||||
uniform int y0, uniform int y1,
|
||||
uniform int z0,
|
||||
uniform int Nx, uniform int Ny, uniform int Nz,
|
||||
uniform const double coef[4], uniform const double vsq[],
|
||||
uniform const double Ain[], uniform double Aout[]) {
|
||||
stencil_step(x0, x1, y0, y1, z0+taskIndex, z0+taskIndex+1,
|
||||
Nx, Ny, Nz, coef, vsq, Ain, Aout);
|
||||
}
|
||||
|
||||
|
||||
export void
|
||||
loop_stencil_ispc_tasks(uniform int t0, uniform int t1,
|
||||
uniform int x0, uniform int x1,
|
||||
uniform int y0, uniform int y1,
|
||||
uniform int z0, uniform int z1,
|
||||
uniform int Nx, uniform int Ny, uniform int Nz,
|
||||
uniform const double coef[4],
|
||||
uniform const double vsq[],
|
||||
uniform double Aeven[], uniform double Aodd[])
|
||||
{
|
||||
for (uniform int t = t0; t < t1; ++t) {
|
||||
// Parallelize across cores as well: each task will work on a slice
|
||||
// of 1 in the z extent of the volume.
|
||||
if ((t & 1) == 0)
|
||||
launch[z1-z0] stencil_step_task(x0, x1, y0, y1, z0, Nx, Ny, Nz,
|
||||
coef, vsq, Aeven, Aodd);
|
||||
else
|
||||
launch[z1-z0] stencil_step_task(x0, x1, y0, y1, z0, Nx, Ny, Nz,
|
||||
coef, vsq, Aodd, Aeven);
|
||||
|
||||
// We need to wait for all of the launched tasks to finish before
|
||||
// starting the next iteration.
|
||||
sync;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
export void
|
||||
loop_stencil_ispc(uniform int t0, uniform int t1,
|
||||
uniform int x0, uniform int x1,
|
||||
uniform int y0, uniform int y1,
|
||||
uniform int z0, uniform int z1,
|
||||
uniform int Nx, uniform int Ny, uniform int Nz,
|
||||
uniform const double coef[4],
|
||||
uniform const double vsq[],
|
||||
uniform double Aeven[], uniform double Aodd[])
|
||||
{
|
||||
for (uniform int t = t0; t < t1; ++t) {
|
||||
if ((t & 1) == 0)
|
||||
stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq,
|
||||
Aeven, Aodd);
|
||||
else
|
||||
stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq,
|
||||
Aodd, Aeven);
|
||||
}
|
||||
}
|
||||
34
examples_ptx/stencil/stencil.vcxproj
Normal file
34
examples_ptx/stencil/stencil.vcxproj
Normal file
@@ -0,0 +1,34 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<ItemGroup Label="ProjectConfigurations">
|
||||
<ProjectConfiguration Include="Debug|Win32">
|
||||
<Configuration>Debug</Configuration>
|
||||
<Platform>Win32</Platform>
|
||||
</ProjectConfiguration>
|
||||
<ProjectConfiguration Include="Debug|x64">
|
||||
<Configuration>Debug</Configuration>
|
||||
<Platform>x64</Platform>
|
||||
</ProjectConfiguration>
|
||||
<ProjectConfiguration Include="Release|Win32">
|
||||
<Configuration>Release</Configuration>
|
||||
<Platform>Win32</Platform>
|
||||
</ProjectConfiguration>
|
||||
<ProjectConfiguration Include="Release|x64">
|
||||
<Configuration>Release</Configuration>
|
||||
<Platform>x64</Platform>
|
||||
</ProjectConfiguration>
|
||||
</ItemGroup>
|
||||
<PropertyGroup Label="Globals">
|
||||
<ProjectGuid>{2ef070a1-f62f-4e6a-944b-88d140945c3c}</ProjectGuid>
|
||||
<Keyword>Win32Proj</Keyword>
|
||||
<RootNamespace>rt</RootNamespace>
|
||||
<ISPC_file>stencil</ISPC_file>
|
||||
<default_targets>sse2,sse4-x2,avx1-i32x8</default_targets>
|
||||
</PropertyGroup>
|
||||
<Import Project="..\common.props" />
|
||||
<ItemGroup>
|
||||
<ClCompile Include="stencil.cpp" />
|
||||
<ClCompile Include="stencil_serial.cpp" />
|
||||
<ClCompile Include="../tasksys.cpp" />
|
||||
</ItemGroup>
|
||||
</Project>
|
||||
86
examples_ptx/stencil/stencil_serial.cpp
Normal file
86
examples_ptx/stencil/stencil_serial.cpp
Normal file
@@ -0,0 +1,86 @@
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
|
||||
static void
|
||||
stencil_step(int x0, int x1,
|
||||
int y0, int y1,
|
||||
int z0, int z1,
|
||||
int Nx, int Ny, int Nz,
|
||||
const double coef[4], const double vsq[],
|
||||
const double Ain[], double Aout[]) {
|
||||
int Nxy = Nx * Ny;
|
||||
|
||||
for (int z = z0; z < z1; ++z) {
|
||||
for (int y = y0; y < y1; ++y) {
|
||||
for (int x = x0; x < x1; ++x) {
|
||||
int index = (z * Nxy) + (y * Nx) + x;
|
||||
#define A_cur(x, y, z) Ain[index + (x) + ((y) * Nx) + ((z) * Nxy)]
|
||||
#define A_next(x, y, z) Aout[index + (x) + ((y) * Nx) + ((z) * Nxy)]
|
||||
double div = coef[0] * A_cur(0, 0, 0) +
|
||||
coef[1] * (A_cur(+1, 0, 0) + A_cur(-1, 0, 0) +
|
||||
A_cur(0, +1, 0) + A_cur(0, -1, 0) +
|
||||
A_cur(0, 0, +1) + A_cur(0, 0, -1)) +
|
||||
coef[2] * (A_cur(+2, 0, 0) + A_cur(-2, 0, 0) +
|
||||
A_cur(0, +2, 0) + A_cur(0, -2, 0) +
|
||||
A_cur(0, 0, +2) + A_cur(0, 0, -2)) +
|
||||
coef[3] * (A_cur(+3, 0, 0) + A_cur(-3, 0, 0) +
|
||||
A_cur(0, +3, 0) + A_cur(0, -3, 0) +
|
||||
A_cur(0, 0, +3) + A_cur(0, 0, -3));
|
||||
|
||||
A_next(0, 0, 0) = 2 * A_cur(0, 0, 0) - A_next(0, 0, 0) +
|
||||
vsq[index] * div;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void loop_stencil_serial(int t0, int t1,
|
||||
int x0, int x1,
|
||||
int y0, int y1,
|
||||
int z0, int z1,
|
||||
int Nx, int Ny, int Nz,
|
||||
const double coef[4],
|
||||
const double vsq[],
|
||||
double Aeven[], double Aodd[])
|
||||
{
|
||||
for (int t = t0; t < t1; ++t) {
|
||||
if ((t & 1) == 0)
|
||||
stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq,
|
||||
Aeven, Aodd);
|
||||
else
|
||||
stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq,
|
||||
Aodd, Aeven);
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user