added stencil code
This commit is contained in:
@@ -10,7 +10,8 @@ CCFLAGS+=-Iobjs/ -O2
|
||||
|
||||
LIBS=-lm $(TASK_LIB) -lstdc++
|
||||
ISPC=ispc
|
||||
ISPC_FLAGS+=-O2 --opt=fast-math --math-lib=default
|
||||
ISPC_FLAGS+=-O2
|
||||
ISPC_FLAGS+=--opt=fast-math --math-lib=default
|
||||
ISPC_HEADER=objs/$(ISPC_SRC:.ispc=_ispc.h)
|
||||
|
||||
ARCH:=$(shell uname -m | sed -e s/x86_64/x86/ -e s/i686/x86/ -e s/arm.*/arm/ -e s/sa110/arm/)
|
||||
|
||||
BIN
examples_cuda/stencil/.stencil.ispc.swn
Normal file
BIN
examples_cuda/stencil/.stencil.ispc.swn
Normal file
Binary file not shown.
File diff suppressed because it is too large
Load Diff
@@ -1,370 +0,0 @@
|
||||
/*
|
||||
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||
* with this source code for terms and conditions that govern your use of
|
||||
* this software. Any use, reproduction, disclosure, or distribution of
|
||||
* this software and related documentation outside the terms of the EULA
|
||||
* is strictly prohibited.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef _DRVAPI_ERROR_STRING_H_
|
||||
#define _DRVAPI_ERROR_STRING_H_
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
// Error Code string definitions here
|
||||
typedef struct
|
||||
{
|
||||
char const *error_string;
|
||||
int error_id;
|
||||
} s_CudaErrorStr;
|
||||
|
||||
/**
|
||||
* Error codes
|
||||
*/
|
||||
static s_CudaErrorStr sCudaDrvErrorString[] =
|
||||
{
|
||||
/**
|
||||
* The API call returned with no errors. In the case of query calls, this
|
||||
* can also mean that the operation being queried is complete (see
|
||||
* ::cuEventQuery() and ::cuStreamQuery()).
|
||||
*/
|
||||
{ "CUDA_SUCCESS", 0 },
|
||||
|
||||
/**
|
||||
* This indicates that one or more of the parameters passed to the API call
|
||||
* is not within an acceptable range of values.
|
||||
*/
|
||||
{ "CUDA_ERROR_INVALID_VALUE", 1 },
|
||||
|
||||
/**
|
||||
* The API call failed because it was unable to allocate enough memory to
|
||||
* perform the requested operation.
|
||||
*/
|
||||
{ "CUDA_ERROR_OUT_OF_MEMORY", 2 },
|
||||
|
||||
/**
|
||||
* This indicates that the CUDA driver has not been initialized with
|
||||
* ::cuInit() or that initialization has failed.
|
||||
*/
|
||||
{ "CUDA_ERROR_NOT_INITIALIZED", 3 },
|
||||
|
||||
/**
|
||||
* This indicates that the CUDA driver is in the process of shutting down.
|
||||
*/
|
||||
{ "CUDA_ERROR_DEINITIALIZED", 4 },
|
||||
|
||||
/**
|
||||
* This indicates profiling APIs are called while application is running
|
||||
* in visual profiler mode.
|
||||
*/
|
||||
{ "CUDA_ERROR_PROFILER_DISABLED", 5 },
|
||||
/**
|
||||
* This indicates profiling has not been initialized for this context.
|
||||
* Call cuProfilerInitialize() to resolve this.
|
||||
*/
|
||||
{ "CUDA_ERROR_PROFILER_NOT_INITIALIZED", 6 },
|
||||
/**
|
||||
* This indicates profiler has already been started and probably
|
||||
* cuProfilerStart() is incorrectly called.
|
||||
*/
|
||||
{ "CUDA_ERROR_PROFILER_ALREADY_STARTED", 7 },
|
||||
/**
|
||||
* This indicates profiler has already been stopped and probably
|
||||
* cuProfilerStop() is incorrectly called.
|
||||
*/
|
||||
{ "CUDA_ERROR_PROFILER_ALREADY_STOPPED", 8 },
|
||||
/**
|
||||
* This indicates that no CUDA-capable devices were detected by the installed
|
||||
* CUDA driver.
|
||||
*/
|
||||
{ "CUDA_ERROR_NO_DEVICE (no CUDA-capable devices were detected)", 100 },
|
||||
|
||||
/**
|
||||
* This indicates that the device ordinal supplied by the user does not
|
||||
* correspond to a valid CUDA device.
|
||||
*/
|
||||
{ "CUDA_ERROR_INVALID_DEVICE (device specified is not a valid CUDA device)", 101 },
|
||||
|
||||
|
||||
/**
|
||||
* This indicates that the device kernel image is invalid. This can also
|
||||
* indicate an invalid CUDA module.
|
||||
*/
|
||||
{ "CUDA_ERROR_INVALID_IMAGE", 200 },
|
||||
|
||||
/**
|
||||
* This most frequently indicates that there is no context bound to the
|
||||
* current thread. This can also be returned if the context passed to an
|
||||
* API call is not a valid handle (such as a context that has had
|
||||
* ::cuCtxDestroy() invoked on it). This can also be returned if a user
|
||||
* mixes different API versions (i.e. 3010 context with 3020 API calls).
|
||||
* See ::cuCtxGetApiVersion() for more details.
|
||||
*/
|
||||
{ "CUDA_ERROR_INVALID_CONTEXT", 201 },
|
||||
|
||||
/**
|
||||
* This indicated that the context being supplied as a parameter to the
|
||||
* API call was already the active context.
|
||||
* \deprecated
|
||||
* This error return is deprecated as of CUDA 3.2. It is no longer an
|
||||
* error to attempt to push the active context via ::cuCtxPushCurrent().
|
||||
*/
|
||||
{ "CUDA_ERROR_CONTEXT_ALREADY_CURRENT", 202 },
|
||||
|
||||
/**
|
||||
* This indicates that a map or register operation has failed.
|
||||
*/
|
||||
{ "CUDA_ERROR_MAP_FAILED", 205 },
|
||||
|
||||
/**
|
||||
* This indicates that an unmap or unregister operation has failed.
|
||||
*/
|
||||
{ "CUDA_ERROR_UNMAP_FAILED", 206 },
|
||||
|
||||
/**
|
||||
* This indicates that the specified array is currently mapped and thus
|
||||
* cannot be destroyed.
|
||||
*/
|
||||
{ "CUDA_ERROR_ARRAY_IS_MAPPED", 207 },
|
||||
|
||||
/**
|
||||
* This indicates that the resource is already mapped.
|
||||
*/
|
||||
{ "CUDA_ERROR_ALREADY_MAPPED", 208 },
|
||||
|
||||
/**
|
||||
* This indicates that there is no kernel image available that is suitable
|
||||
* for the device. This can occur when a user specifies code generation
|
||||
* options for a particular CUDA source file that do not include the
|
||||
* corresponding device configuration.
|
||||
*/
|
||||
{ "CUDA_ERROR_NO_BINARY_FOR_GPU", 209 },
|
||||
|
||||
/**
|
||||
* This indicates that a resource has already been acquired.
|
||||
*/
|
||||
{ "CUDA_ERROR_ALREADY_ACQUIRED", 210 },
|
||||
|
||||
/**
|
||||
* This indicates that a resource is not mapped.
|
||||
*/
|
||||
{ "CUDA_ERROR_NOT_MAPPED", 211 },
|
||||
|
||||
/**
|
||||
* This indicates that a mapped resource is not available for access as an
|
||||
* array.
|
||||
*/
|
||||
{ "CUDA_ERROR_NOT_MAPPED_AS_ARRAY", 212 },
|
||||
|
||||
/**
|
||||
* This indicates that a mapped resource is not available for access as a
|
||||
* pointer.
|
||||
*/
|
||||
{ "CUDA_ERROR_NOT_MAPPED_AS_POINTER", 213 },
|
||||
|
||||
/**
|
||||
* This indicates that an uncorrectable ECC error was detected during
|
||||
* execution.
|
||||
*/
|
||||
{ "CUDA_ERROR_ECC_UNCORRECTABLE", 214 },
|
||||
|
||||
/**
|
||||
* This indicates that the ::CUlimit passed to the API call is not
|
||||
* supported by the active device.
|
||||
*/
|
||||
{ "CUDA_ERROR_UNSUPPORTED_LIMIT", 215 },
|
||||
|
||||
/**
|
||||
* This indicates that the ::CUcontext passed to the API call can
|
||||
* only be bound to a single CPU thread at a time but is already
|
||||
* bound to a CPU thread.
|
||||
*/
|
||||
{ "CUDA_ERROR_CONTEXT_ALREADY_IN_USE", 216 },
|
||||
|
||||
/**
|
||||
* This indicates that peer access is not supported across the given
|
||||
* devices.
|
||||
*/
|
||||
{ "CUDA_ERROR_PEER_ACCESS_UNSUPPORTED", 217},
|
||||
|
||||
/**
|
||||
* This indicates that the device kernel source is invalid.
|
||||
*/
|
||||
{ "CUDA_ERROR_INVALID_SOURCE", 300 },
|
||||
|
||||
/**
|
||||
* This indicates that the file specified was not found.
|
||||
*/
|
||||
{ "CUDA_ERROR_FILE_NOT_FOUND", 301 },
|
||||
|
||||
/**
|
||||
* This indicates that a link to a shared object failed to resolve.
|
||||
*/
|
||||
{ "CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND", 302 },
|
||||
|
||||
/**
|
||||
* This indicates that initialization of a shared object failed.
|
||||
*/
|
||||
{ "CUDA_ERROR_SHARED_OBJECT_INIT_FAILED", 303 },
|
||||
|
||||
/**
|
||||
* This indicates that an OS call failed.
|
||||
*/
|
||||
{ "CUDA_ERROR_OPERATING_SYSTEM", 304 },
|
||||
|
||||
|
||||
/**
|
||||
* This indicates that a resource handle passed to the API call was not
|
||||
* valid. Resource handles are opaque types like ::CUstream and ::CUevent.
|
||||
*/
|
||||
{ "CUDA_ERROR_INVALID_HANDLE", 400 },
|
||||
|
||||
|
||||
/**
|
||||
* This indicates that a named symbol was not found. Examples of symbols
|
||||
* are global/constant variable names, texture names }, and surface names.
|
||||
*/
|
||||
{ "CUDA_ERROR_NOT_FOUND", 500 },
|
||||
|
||||
|
||||
/**
|
||||
* This indicates that asynchronous operations issued previously have not
|
||||
* completed yet. This result is not actually an error, but must be indicated
|
||||
* differently than ::CUDA_SUCCESS (which indicates completion). Calls that
|
||||
* may return this value include ::cuEventQuery() and ::cuStreamQuery().
|
||||
*/
|
||||
{ "CUDA_ERROR_NOT_READY", 600 },
|
||||
|
||||
|
||||
/**
|
||||
* An exception occurred on the device while executing a kernel. Common
|
||||
* causes include dereferencing an invalid device pointer and accessing
|
||||
* out of bounds shared memory. The context cannot be used }, so it must
|
||||
* be destroyed (and a new one should be created). All existing device
|
||||
* memory allocations from this context are invalid and must be
|
||||
* reconstructed if the program is to continue using CUDA.
|
||||
*/
|
||||
{ "CUDA_ERROR_LAUNCH_FAILED", 700 },
|
||||
|
||||
/**
|
||||
* This indicates that a launch did not occur because it did not have
|
||||
* appropriate resources. This error usually indicates that the user has
|
||||
* attempted to pass too many arguments to the device kernel, or the
|
||||
* kernel launch specifies too many threads for the kernel's register
|
||||
* count. Passing arguments of the wrong size (i.e. a 64-bit pointer
|
||||
* when a 32-bit int is expected) is equivalent to passing too many
|
||||
* arguments and can also result in this error.
|
||||
*/
|
||||
{ "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES", 701 },
|
||||
|
||||
/**
|
||||
* This indicates that the device kernel took too long to execute. This can
|
||||
* only occur if timeouts are enabled - see the device attribute
|
||||
* ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT for more information. The
|
||||
* context cannot be used (and must be destroyed similar to
|
||||
* ::CUDA_ERROR_LAUNCH_FAILED). All existing device memory allocations from
|
||||
* this context are invalid and must be reconstructed if the program is to
|
||||
* continue using CUDA.
|
||||
*/
|
||||
{ "CUDA_ERROR_LAUNCH_TIMEOUT", 702 },
|
||||
|
||||
/**
|
||||
* This error indicates a kernel launch that uses an incompatible texturing
|
||||
* mode.
|
||||
*/
|
||||
{ "CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING", 703 },
|
||||
|
||||
/**
|
||||
* This error indicates that a call to ::cuCtxEnablePeerAccess() is
|
||||
* trying to re-enable peer access to a context which has already
|
||||
* had peer access to it enabled.
|
||||
*/
|
||||
{ "CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED", 704 },
|
||||
|
||||
/**
|
||||
* This error indicates that ::cuCtxDisablePeerAccess() is
|
||||
* trying to disable peer access which has not been enabled yet
|
||||
* via ::cuCtxEnablePeerAccess().
|
||||
*/
|
||||
{ "CUDA_ERROR_PEER_ACCESS_NOT_ENABLED", 705 },
|
||||
|
||||
/**
|
||||
* This error indicates that the primary context for the specified device
|
||||
* has already been initialized.
|
||||
*/
|
||||
{ "CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE", 708 },
|
||||
|
||||
/**
|
||||
* This error indicates that the context current to the calling thread
|
||||
* has been destroyed using ::cuCtxDestroy }, or is a primary context which
|
||||
* has not yet been initialized.
|
||||
*/
|
||||
{ "CUDA_ERROR_CONTEXT_IS_DESTROYED", 709 },
|
||||
|
||||
/**
|
||||
* A device-side assert triggered during kernel execution. The context
|
||||
* cannot be used anymore, and must be destroyed. All existing device
|
||||
* memory allocations from this context are invalid and must be
|
||||
* reconstructed if the program is to continue using CUDA.
|
||||
*/
|
||||
{ "CUDA_ERROR_ASSERT", 710 },
|
||||
|
||||
/**
|
||||
* This error indicates that the hardware resources required to enable
|
||||
* peer access have been exhausted for one or more of the devices
|
||||
* passed to ::cuCtxEnablePeerAccess().
|
||||
*/
|
||||
{ "CUDA_ERROR_TOO_MANY_PEERS", 711 },
|
||||
|
||||
/**
|
||||
* This error indicates that the memory range passed to ::cuMemHostRegister()
|
||||
* has already been registered.
|
||||
*/
|
||||
{ "CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED", 712 },
|
||||
|
||||
/**
|
||||
* This error indicates that the pointer passed to ::cuMemHostUnregister()
|
||||
* does not correspond to any currently registered memory region.
|
||||
*/
|
||||
{ "CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED", 713 },
|
||||
|
||||
/**
|
||||
* This error indicates that the attempted operation is not permitted.
|
||||
*/
|
||||
{ "CUDA_ERROR_NOT_PERMITTED", 800 },
|
||||
|
||||
/**
|
||||
* This error indicates that the attempted operation is not supported
|
||||
* on the current system or device.
|
||||
*/
|
||||
{ "CUDA_ERROR_NOT_SUPPORTED", 801 },
|
||||
|
||||
/**
|
||||
* This indicates that an unknown internal error has occurred.
|
||||
*/
|
||||
{ "CUDA_ERROR_UNKNOWN", 999 },
|
||||
{ NULL, -1 }
|
||||
};
|
||||
|
||||
// This is just a linear search through the array, since the error_id's are not
|
||||
// always ocurring consecutively
|
||||
const char * getCudaDrvErrorString(CUresult error_id)
|
||||
{
|
||||
int index = 0;
|
||||
while (sCudaDrvErrorString[index].error_id != error_id &&
|
||||
sCudaDrvErrorString[index].error_id != -1)
|
||||
{
|
||||
index++;
|
||||
}
|
||||
if (sCudaDrvErrorString[index].error_id == error_id)
|
||||
return (const char *)sCudaDrvErrorString[index].error_string;
|
||||
else
|
||||
return (const char *)"CUDA_ERROR not found!";
|
||||
}
|
||||
|
||||
#endif
|
||||
File diff suppressed because it is too large
Load Diff
Binary file not shown.
@@ -1,6 +1,11 @@
|
||||
#define programCount 32
|
||||
#define programIndex (threadIdx.x & 31)
|
||||
#define taskIndex (blockIdx.x*4 + (threadIdx.x >> 5))
|
||||
#define taskIndex0 (blockIdx.x*4 + (threadIdx.x >> 5))
|
||||
#define taskIndex1 (blockIdx.y)
|
||||
#define taskIndex2 (blockIdx.z)
|
||||
#define taskCount0 (gridDim.x*4)
|
||||
#define taskCount1 (gridDim.y)
|
||||
#define taskCount2 (gridDim.z)
|
||||
|
||||
__device__ static void
|
||||
stencil_step( int x0, int x1,
|
||||
@@ -48,15 +53,71 @@ stencil_step( int x0, int x1,
|
||||
}
|
||||
|
||||
|
||||
extern "C"
|
||||
#define SPANX 32
|
||||
#define SPANY 8
|
||||
#define SPANZ 8
|
||||
|
||||
__global__ void
|
||||
stencil_step_task( int x0, int x1,
|
||||
int y0, int y1,
|
||||
int z0,
|
||||
int z0, int z1,
|
||||
int Nx, int Ny, int Nz,
|
||||
const double coef[4], const double vsq[],
|
||||
const double Ain[], double Aout[]) {
|
||||
stencil_step(x0, x1, y0, y1, z0+taskIndex, z0+taskIndex+1,
|
||||
Nx, Ny, Nz, coef, vsq, Ain, Aout);
|
||||
if (taskIndex0 >= taskCount0 ||
|
||||
taskIndex1 >= taskCount1 ||
|
||||
taskIndex2 >= taskCount2)
|
||||
return;
|
||||
|
||||
const int xfirst = x0 + taskIndex0 * SPANX;
|
||||
const int xlast = min(x1, xfirst + SPANX);
|
||||
|
||||
const int yfirst = y0 + taskIndex1 * SPANY;
|
||||
const int ylast = min(y1, yfirst + SPANY);
|
||||
|
||||
const int zfirst = z0 + taskIndex2 * SPANZ;
|
||||
const int zlast = min(z1, zfirst + SPANZ);
|
||||
|
||||
stencil_step(xfirst,xlast, yfirst,ylast, zfirst,zlast,
|
||||
Nx, Ny, Nz, coef, vsq, Ain, Aout);
|
||||
}
|
||||
|
||||
|
||||
|
||||
extern "C"
|
||||
__global__ void
|
||||
loop_stencil_ispc_tasks( int t0, int t1,
|
||||
int x0, int x1,
|
||||
int y0, int y1,
|
||||
int z0, int z1,
|
||||
int Nx, int Ny, int Nz,
|
||||
const double coef[4],
|
||||
const double vsq[],
|
||||
double Aeven[], double Aodd[])
|
||||
{
|
||||
#define NB(x,n) (((x)+(n)-1)/(n))
|
||||
|
||||
dim3 grid((NB(x1-x0,SPANX)-1)/4+1, NB(y1-y0,SPANY), NB(z1-z0,SPANZ));
|
||||
|
||||
for ( int t = t0; t < t1; ++t)
|
||||
{
|
||||
// Parallelize across cores as well: each task will work on a slice
|
||||
// of 1 in the z extent of the volume.
|
||||
if ((t & 1) == 0)
|
||||
{
|
||||
if (programIndex == 0)
|
||||
stencil_step_task<<<grid,128>>>(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz,
|
||||
coef, vsq, Aeven, Aodd);
|
||||
}
|
||||
else
|
||||
{
|
||||
if (programIndex == 0)
|
||||
stencil_step_task<<<grid,128>>>(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz,
|
||||
coef, vsq, Aodd, Aeven);
|
||||
}
|
||||
|
||||
// We need to wait for all of the launched tasks to finish before
|
||||
// starting the next iteration
|
||||
cudaDeviceSynchronize();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,159 +0,0 @@
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
static inline void
|
||||
stencil_step(uniform int x0, uniform int x1,
|
||||
uniform int y0, uniform int y1,
|
||||
uniform int z0, uniform int z1,
|
||||
uniform int Nx, uniform int Ny, uniform int Nz,
|
||||
uniform const double coef[4], uniform const double vsq[],
|
||||
uniform const double Ain[], uniform double Aout[]) {
|
||||
const uniform int Nxy = Nx * Ny;
|
||||
|
||||
#if 0
|
||||
#define VER1
|
||||
#endif
|
||||
|
||||
#ifdef VER1
|
||||
const uniform int x1o = 1;
|
||||
const uniform int x2o = 2;
|
||||
const uniform int x3o = 3;
|
||||
const uniform int y1o = Nx;
|
||||
const uniform int y2o = Nx*2;
|
||||
const uniform int y3o = Nx*3;
|
||||
const uniform int z1o = Nxy;
|
||||
const uniform int z2o = Nxy*2;
|
||||
const uniform int z3o = Nxy*3;
|
||||
#endif
|
||||
foreach (z = z0 ... z1, y = y0 ... y1, x = x0 ... x1)
|
||||
{
|
||||
const int index= (z * Nxy) + (y * Nx) + x;
|
||||
|
||||
#ifndef VER1
|
||||
#define A_cur(x, y, z) Ain[index + (x) + ((y) * Nx) + ((z) * Nxy)]
|
||||
#define A_next(x, y, z) Aout[index + (x) + ((y) * Nx) + ((z) * Nxy)]
|
||||
double div = coef[0] * A_cur(0, 0, 0) +
|
||||
coef[1] * (A_cur(+1, 0, 0) + A_cur(-1, 0, 0) +
|
||||
A_cur(0, +1, 0) + A_cur(0, -1, 0) +
|
||||
A_cur(0, 0, +1) + A_cur(0, 0, -1)) +
|
||||
coef[2] * (A_cur(+2, 0, 0) + A_cur(-2, 0, 0) +
|
||||
A_cur(0, +2, 0) + A_cur(0, -2, 0) +
|
||||
A_cur(0, 0, +2) + A_cur(0, 0, -2)) +
|
||||
coef[3] * (A_cur(+3, 0, 0) + A_cur(-3, 0, 0) +
|
||||
A_cur(0, +3, 0) + A_cur(0, -3, 0) +
|
||||
A_cur(0, 0, +3) + A_cur(0, 0, -3));
|
||||
|
||||
#else
|
||||
|
||||
#define A_cur(x, y, z) Ain [index + (x) + (y) + (z)]
|
||||
#define A_next(x, y, z) Aout[index + (x) + (y) + (z)]
|
||||
double div = coef[0] * A_cur(0, 0, 0) +
|
||||
coef[1] * (A_cur(+x1o, 0, 0) + A_cur(-x1o, 0, 0) +
|
||||
A_cur(0, +y1o, 0) + A_cur(0, -y1o, 0) +
|
||||
A_cur(0, 0, +z1o) + A_cur(0, 0, -z1o)) +
|
||||
coef[2] * (A_cur(+x2o, 0, 0) + A_cur(-x2o, 0, 0) +
|
||||
A_cur(0, +y2o, 0) + A_cur(0, -y2o, 0) +
|
||||
A_cur(0, 0, +z2o) + A_cur(0, 0, -z2o)) +
|
||||
coef[3] * (A_cur(+x3o, 0, 0) + A_cur(-x3o, 0, 0) +
|
||||
A_cur(0, +y3o, 0) + A_cur(0, -y3o, 0) +
|
||||
A_cur(0, 0, +z3o) + A_cur(0, 0, -z3o));
|
||||
|
||||
#endif
|
||||
|
||||
A_next(0, 0, 0) = 2.0d0 * A_cur(0, 0, 0) - A_next(0, 0, 0) +
|
||||
vsq[index] * div;
|
||||
}
|
||||
}
|
||||
|
||||
#define SPANX 32
|
||||
#define SPANY 8
|
||||
#define SPANZ 8
|
||||
|
||||
static task void
|
||||
stencil_step_task(uniform int x0, uniform int x1,
|
||||
uniform int y0, uniform int y1,
|
||||
uniform int z0, uniform int z1,
|
||||
uniform int Nx, uniform int Ny, uniform int Nz,
|
||||
uniform const double coef[4], uniform const double vsq[],
|
||||
uniform const double Ain[], uniform double Aout[]) {
|
||||
if (taskIndex0 >= taskCount0 ||
|
||||
taskIndex1 >= taskCount1 ||
|
||||
taskIndex2 >= taskCount2)
|
||||
return;
|
||||
|
||||
const uniform int xfirst = x0 + taskIndex0 * SPANX;
|
||||
const uniform int xlast = min(x1, xfirst + SPANX);
|
||||
|
||||
const uniform int yfirst = y0 + taskIndex1 * SPANY;
|
||||
const uniform int ylast = min(y1, yfirst + SPANY);
|
||||
|
||||
const uniform int zfirst = z0 + taskIndex2 * SPANZ;
|
||||
const uniform int zlast = min(z1, zfirst + SPANZ);
|
||||
|
||||
stencil_step(xfirst,xlast, yfirst,ylast, zfirst,zlast,
|
||||
Nx, Ny, Nz, coef, vsq, Ain, Aout);
|
||||
}
|
||||
|
||||
|
||||
|
||||
export void
|
||||
loop_stencil_ispc_tasks(uniform int t0, uniform int t1,
|
||||
uniform int x0, uniform int x1,
|
||||
uniform int y0, uniform int y1,
|
||||
uniform int z0, uniform int z1,
|
||||
uniform int Nx, uniform int Ny, uniform int Nz,
|
||||
uniform const double coef[4],
|
||||
uniform const double vsq[],
|
||||
uniform double Aeven[], uniform double Aodd[])
|
||||
{
|
||||
#define NB(x,n) (((x)+(n)-1)/(n))
|
||||
|
||||
for (uniform int t = t0; t < t1; ++t)
|
||||
{
|
||||
// Parallelize across cores as well: each task will work on a slice
|
||||
// of 1 in the z extent of the volume.
|
||||
if ((t & 1) == 0)
|
||||
launch[NB(z1-z0,SPANZ)][NB(y1-y0,SPANY)][NB(x1-x0,SPANX)]
|
||||
stencil_step_task(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz,
|
||||
coef, vsq, Aeven, Aodd);
|
||||
else
|
||||
launch[NB(z1-z0,SPANZ)][NB(y1-y0,SPANY)][NB(x1-x0,SPANX)]
|
||||
stencil_step_task(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz,
|
||||
coef, vsq, Aodd, Aeven);
|
||||
|
||||
// We need to wait for all of the launched tasks to finish before
|
||||
// starting the next iteration.
|
||||
sync;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,126 +0,0 @@
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
static inline void
|
||||
stencil_step(uniform int x0, uniform int x1,
|
||||
uniform int y0, uniform int y1,
|
||||
uniform int z0, uniform int z1,
|
||||
uniform int Nx, uniform int Ny, uniform int Nz,
|
||||
uniform const double coef[4], uniform const double vsq[],
|
||||
uniform const double Ain[], uniform double Aout[]) {
|
||||
const uniform int Nxy = Nx * Ny;
|
||||
|
||||
foreach (z = z0 ... z1, y = y0 ... y1, x = x0 ... x1)
|
||||
{
|
||||
int index = (z * Nxy) + (y * Nx) + x;
|
||||
#define A_cur(x, y, z) Ain[index + (x) + ((y) * Nx) + ((z) * Nxy)]
|
||||
#define A_next(x, y, z) Aout[index + (x) + ((y) * Nx) + ((z) * Nxy)]
|
||||
double div = coef[0] * A_cur(0, 0, 0) +
|
||||
coef[1] * (A_cur(+1, 0, 0) + A_cur(-1, 0, 0) +
|
||||
A_cur(0, +1, 0) + A_cur(0, -1, 0) +
|
||||
A_cur(0, 0, +1) + A_cur(0, 0, -1)) +
|
||||
coef[2] * (A_cur(+2, 0, 0) + A_cur(-2, 0, 0) +
|
||||
A_cur(0, +2, 0) + A_cur(0, -2, 0) +
|
||||
A_cur(0, 0, +2) + A_cur(0, 0, -2)) +
|
||||
coef[3] * (A_cur(+3, 0, 0) + A_cur(-3, 0, 0) +
|
||||
A_cur(0, +3, 0) + A_cur(0, -3, 0) +
|
||||
A_cur(0, 0, +3) + A_cur(0, 0, -3));
|
||||
|
||||
A_next(0, 0, 0) = 2.0 * A_cur(0, 0, 0) - A_next(0, 0, 0) +
|
||||
vsq[index] * div;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
#define SPANX 32
|
||||
#define SPANY 8
|
||||
#define SPANZ 8
|
||||
|
||||
static task void
|
||||
stencil_step_task(uniform int x0, uniform int x1,
|
||||
uniform int y0, uniform int y1,
|
||||
uniform int z0, uniform int z1,
|
||||
uniform int Nx, uniform int Ny, uniform int Nz,
|
||||
uniform const double coef[4], uniform const double vsq[],
|
||||
uniform const double Ain[], uniform double Aout[]) {
|
||||
if (taskIndex0 >= taskCount0 ||
|
||||
taskIndex1 >= taskCount1 ||
|
||||
taskIndex2 >= taskCount2)
|
||||
return;
|
||||
|
||||
const uniform int xfirst = x0 + taskIndex0 * SPANX;
|
||||
const uniform int xlast = min(x1, xfirst + SPANX);
|
||||
|
||||
const uniform int yfirst = y0 + taskIndex1 * SPANY;
|
||||
const uniform int ylast = min(y1, yfirst + SPANY);
|
||||
|
||||
const uniform int zfirst = z0 + taskIndex2 * SPANZ;
|
||||
const uniform int zlast = min(z1, zfirst + SPANZ);
|
||||
|
||||
stencil_step(xfirst,xlast, yfirst,ylast, zfirst,zlast,
|
||||
Nx, Ny, Nz, coef, vsq, Ain, Aout);
|
||||
}
|
||||
|
||||
|
||||
|
||||
export void
|
||||
loop_stencil_ispc_tasks(uniform int t0, uniform int t1,
|
||||
uniform int x0, uniform int x1,
|
||||
uniform int y0, uniform int y1,
|
||||
uniform int z0, uniform int z1,
|
||||
uniform int Nx, uniform int Ny, uniform int Nz,
|
||||
uniform const double coef[4],
|
||||
uniform const double vsq[],
|
||||
uniform double Aeven[], uniform double Aodd[])
|
||||
{
|
||||
#define NB(x,n) (((x)+(n)-1)/(n))
|
||||
|
||||
for (uniform int t = t0; t < t1; ++t)
|
||||
{
|
||||
// Parallelize across cores as well: each task will work on a slice
|
||||
// of 1 in the z extent of the volume.
|
||||
if ((t & 1) == 0)
|
||||
launch[NB(z1-z0,SPANZ)][NB(y1-y0,SPANY)][NB(x1-x0,SPANX)]
|
||||
stencil_step_task(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz,
|
||||
coef, vsq, Aeven, Aodd);
|
||||
else
|
||||
launch[NB(z1-z0,SPANZ)][NB(y1-y0,SPANY)][NB(x1-x0,SPANX)]
|
||||
stencil_step_task(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz,
|
||||
coef, vsq, Aodd, Aeven);
|
||||
|
||||
// We need to wait for all of the launched tasks to finish before
|
||||
// starting the next iteration.
|
||||
sync;
|
||||
}
|
||||
}
|
||||
|
||||
Binary file not shown.
Binary file not shown.
@@ -1,34 +0,0 @@
|
||||
//
|
||||
// stencil_ispc.h
|
||||
// (Header automatically generated by the ispc compiler.)
|
||||
// DO NOT EDIT THIS FILE.
|
||||
//
|
||||
|
||||
#ifndef ISPC_STENCIL_ISPC_H
|
||||
#define ISPC_STENCIL_ISPC_H
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
namespace ispc { /* namespace */
|
||||
#endif // __cplusplus
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// Functions exported from ispc code
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
#if defined(__cplusplus) && !defined(__ISPC_NO_EXTERN_C)
|
||||
extern "C" {
|
||||
#endif // __cplusplus
|
||||
extern void loop_stencil_ispc_tasks(int32_t t0, int32_t t1, int32_t x0, int32_t x1, int32_t y0, int32_t y1, int32_t z0, int32_t z1, int32_t Nx, int32_t Ny, int32_t Nz, const double * coef, const double * vsq, double * Aeven, double * Aodd);
|
||||
#if defined(__cplusplus) && !defined(__ISPC_NO_EXTERN_C)
|
||||
} /* end extern C */
|
||||
#endif // __cplusplus
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
} /* namespace */
|
||||
#endif // __cplusplus
|
||||
|
||||
#endif // ISPC_STENCIL_ISPC_H
|
||||
@@ -1,974 +0,0 @@
|
||||
; ModuleID = 'stencil_ispc_nvptx64.bc'
|
||||
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
|
||||
target triple = "nvptx64"
|
||||
|
||||
module asm ""
|
||||
module asm ".extern .func (.param .b32 func_retval0) cudaLaunchDevice"
|
||||
module asm "("
|
||||
module asm " .param .b64 cudaLaunchDevice_param_0,"
|
||||
module asm " .param .b64 cudaLaunchDevice_param_1,"
|
||||
module asm " .param .align 4 .b8 cudaLaunchDevice_param_2[12],"
|
||||
module asm " .param .align 4 .b8 cudaLaunchDevice_param_3[12],"
|
||||
module asm " .param .b32 cudaLaunchDevice_param_4,"
|
||||
module asm " .param .b64 cudaLaunchDevice_param_5"
|
||||
module asm ");"
|
||||
|
||||
@constDeltaForeach1 = private unnamed_addr constant [32 x i8] zeroinitializer
|
||||
@constDeltaForeach4 = private unnamed_addr constant [32 x i8] c"\00\01\02\03\04\05\06\07\08\09\0A\0B\0C\0D\0E\0F\10\11\12\13\14\15\16\17\18\19\1A\1B\1C\1D\1E\1F"
|
||||
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() nounwind readnone
|
||||
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() nounwind readnone
|
||||
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() nounwind readnone
|
||||
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.z() nounwind readnone
|
||||
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.x() nounwind readnone
|
||||
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.y() nounwind readnone
|
||||
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.z() nounwind readnone
|
||||
|
||||
define i32 @__shfl_i32(i32, i32) {
|
||||
%shfl = tail call i32 asm sideeffect "shfl.idx.b32 $0, $1, $2, 0x1f;", "=r,r,r"(i32 %0, i32 %1)
|
||||
ret i32 %shfl
|
||||
}
|
||||
|
||||
define float @__shfl_xor_float(float, i32) {
|
||||
%shfl = tail call float asm sideeffect "shfl.bfly.b32 $0, $1, $2, 0x1f;", "=f,f,r"(float %0, i32 %1)
|
||||
ret float %shfl
|
||||
}
|
||||
|
||||
define i32 @__shfl_xor_i32(i32, i32) {
|
||||
%shfl = tail call i32 asm sideeffect "shfl.bfly.b32 $0, $1, $2, 0x1f;", "=r,r,r"(i32 %0, i32 %1)
|
||||
ret i32 %shfl
|
||||
}
|
||||
|
||||
define float @__fminf(float, float) {
|
||||
%min = tail call float asm sideeffect "min.f32 $0, $1, $2;", "=f,f,f"(float %0, float %1)
|
||||
ret float %min
|
||||
}
|
||||
|
||||
define float @__fmaxf(float, float) {
|
||||
%max = tail call float asm sideeffect "max.f32 $0, $1, $2;", "=f,f,f"(float %0, float %1)
|
||||
ret float %max
|
||||
}
|
||||
|
||||
define i32 @__ballot(i1) {
|
||||
%conv = zext i1 %0 to i32
|
||||
%res = tail call i32 asm sideeffect "{ .reg .pred %p1; \0A setp.ne.u32 %p1, $1, 0; \0A vote.ballot.b32 $0, %p1; \0A }", "=r,r"(i32 %conv)
|
||||
ret i32 %res
|
||||
}
|
||||
|
||||
define i32 @__lanemask_lt() {
|
||||
%mask = tail call i32 asm sideeffect "mov.u32 $0, %lanemask_lt;", "=r"()
|
||||
ret i32 %mask
|
||||
}
|
||||
|
||||
define i8* @ISPCAlloc(i8**, i64, i32) {
|
||||
ret i8* inttoptr (i64 1 to i8*)
|
||||
}
|
||||
|
||||
declare i64 @cudaGetParameterBuffer(i64, i64)
|
||||
|
||||
define i8* @ISPCGetParamBuffer(i8**, i64 %align, i64 %size) {
|
||||
entry:
|
||||
%tid.i = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
||||
%and = and i32 %tid.i, 31
|
||||
%cmp = icmp eq i32 %and, 0
|
||||
br i1 %cmp, label %if.then, label %if.end
|
||||
|
||||
if.then: ; preds = %entry
|
||||
%ptri64tmp = tail call i64 @cudaGetParameterBuffer(i64 %align, i64 %size)
|
||||
%phitmp = inttoptr i64 %ptri64tmp to i8*
|
||||
br label %if.end
|
||||
|
||||
if.end: ; preds = %if.then, %entry
|
||||
%ptri64 = phi i8* [ %phitmp, %if.then ], [ null, %entry ]
|
||||
ret i8* %ptri64
|
||||
}
|
||||
|
||||
define void @ISPCLaunch(i8**, i8* %func_ptr, i8* %func_args, i32 %ntx, i32 %nty, i32 %ntz) {
|
||||
entry:
|
||||
%tid.i = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
||||
%and = and i32 %tid.i, 31
|
||||
%cmp = icmp eq i32 %and, 0
|
||||
br i1 %cmp, label %if.then, label %if.end
|
||||
|
||||
if.then: ; preds = %entry
|
||||
%ntxm1 = add nsw i32 %ntx, -1
|
||||
%ntxm1d4 = ashr i32 %ntxm1, 2
|
||||
%nbx = add nsw i32 %ntxm1d4, 1
|
||||
%args_i64 = ptrtoint i8* %func_args to i64
|
||||
%func_i64 = ptrtoint i8* %func_ptr to i64
|
||||
%res_tmp = tail call i32 asm sideeffect "{\0A .param .b64 param0;\0A st.param.b64\09[param0+0], $1;\0A .param .b64 param1;\0A st.param.b64\09[param1+0], $2;\0A .param .align 4 .b8 param2[12];\0A st.param.b32\09[param2+0], $3; \0A st.param.b32\09[param2+4], $4; \0A st.param.b32\09[param2+8], $5; \0A .param .align 4 .b8 param3[12];\0A st.param.b32\09[param3+0], $6; \0A st.param.b32\09[param3+4], $7; \0A st.param.b32\09[param3+8], $8; \0A .param .b32 param4;\0A st.param.b32\09[param4+0], $9; \0A .param .b64 param5;\0A st.param.b64\09[param5+0], $10; \0A\0A .param .b32 retval0;\0A call.uni (retval0), \0A cudaLaunchDevice,\0A (\0A param0, \0A param1, \0A param2, \0A param3, \0A param4, \0A param5\0A );\0A ld.param.b32\09$0, [retval0+0];\0A }\0A ", "=r, l,l, r,r,r, r,r,r, r,l"(i64 %func_i64, i64 %args_i64, i32 %nbx, i32 %nty, i32 %ntz, i32 128, i32 1, i32 1, i32 0, i64 0)
|
||||
br label %if.end
|
||||
|
||||
if.end: ; preds = %if.then, %entry
|
||||
ret void
|
||||
}
|
||||
|
||||
declare i32 @cudaDeviceSynchronize()
|
||||
|
||||
define void @ISPCSync(i8*) {
|
||||
%2 = tail call i32 @cudaDeviceSynchronize()
|
||||
ret void
|
||||
}
|
||||
|
||||
define i64 @__warpBinExclusiveScan(i1 %p) {
|
||||
entry:
|
||||
%conv.i = zext i1 %p to i32
|
||||
%res.i = tail call i32 asm sideeffect "{ .reg .pred %p1; \0A setp.ne.u32 %p1, $1, 0; \0A vote.ballot.b32 $0, %p1; \0A }", "=r,r"(i32 %conv.i)
|
||||
%res.i1 = tail call i32 asm sideeffect "popc.b32 $0, $1;", "=r,r"(i32 %res.i)
|
||||
%mask.i = tail call i32 asm sideeffect "mov.u32 $0, %lanemask_lt;", "=r"()
|
||||
%and = and i32 %mask.i, %res.i
|
||||
%res.i2 = tail call i32 asm sideeffect "popc.b32 $0, $1;", "=r,r"(i32 %and)
|
||||
%retval.sroa.1.4.insert.ext.i = zext i32 %res.i2 to i64
|
||||
%retval.sroa.1.4.insert.shift.i = shl nuw i64 %retval.sroa.1.4.insert.ext.i, 32
|
||||
%retval.sroa.0.0.insert.ext.i = zext i32 %res.i1 to i64
|
||||
%retval.sroa.0.0.insert.insert.i = or i64 %retval.sroa.1.4.insert.shift.i, %retval.sroa.0.0.insert.ext.i
|
||||
ret i64 %retval.sroa.0.0.insert.insert.i
|
||||
}
|
||||
|
||||
define internal void @stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_(i32 %x0, i32 %x1, i32 %y0, i32 %y1, i32 %z0, i32 %z1, i32 %Nx, i32 %Ny, i32 %Nz, double* %coef, double* %vsq, double* %Ain, double* %Aout) {
|
||||
allocas:
|
||||
%bid.i.i = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
|
||||
%mul_calltmp_.i = shl i32 %bid.i.i, 2
|
||||
%tid.i.i = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
||||
%bitop.i = ashr i32 %tid.i.i, 5
|
||||
%add_mul_calltmp__bitop.i = add i32 %bitop.i, %mul_calltmp_.i
|
||||
%nb.i.i = tail call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
|
||||
%mul_calltmp_.i57 = shl i32 %nb.i.i, 2
|
||||
%greaterequal_calltmp_calltmp18 = icmp sge i32 %add_mul_calltmp__bitop.i, %mul_calltmp_.i57
|
||||
%bid.i.i58 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
|
||||
%nb.i.i59 = tail call i32 @llvm.nvvm.read.ptx.sreg.nctaid.y()
|
||||
%greaterequal_calltmp21_calltmp24 = icmp sge i32 %bid.i.i58, %nb.i.i59
|
||||
%logical_or = or i1 %greaterequal_calltmp_calltmp18, %greaterequal_calltmp21_calltmp24
|
||||
%bid.i.i60 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z()
|
||||
%nb.i.i61 = tail call i32 @llvm.nvvm.read.ptx.sreg.nctaid.z()
|
||||
%greaterequal_calltmp27_calltmp30 = icmp sge i32 %bid.i.i60, %nb.i.i61
|
||||
%logical_or31 = or i1 %logical_or, %greaterequal_calltmp27_calltmp30
|
||||
br i1 %logical_or31, label %if_then, label %if_exit
|
||||
|
||||
if_then: ; preds = %foreach_reset19.i, %if_exit, %allocas
|
||||
ret void
|
||||
|
||||
if_exit: ; preds = %allocas
|
||||
%bid.i.i62 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
|
||||
%mul_calltmp_.i63 = shl i32 %bid.i.i62, 7
|
||||
%tid.i.i64 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
||||
%bitop.i657375 = add i32 %tid.i.i64, %mul_calltmp_.i63
|
||||
%mul_calltmp35_ = and i32 %bitop.i657375, -32
|
||||
%add_x0_load_mul_calltmp35_ = add i32 %mul_calltmp35_, %x0
|
||||
%add_xfirst_load_ = add i32 %add_x0_load_mul_calltmp35_, 32
|
||||
%c.i.i = icmp sgt i32 %add_xfirst_load_, %x1
|
||||
%r.i.i = select i1 %c.i.i, i32 %x1, i32 %add_xfirst_load_
|
||||
%bid.i.i67 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
|
||||
%mul_calltmp41_ = shl i32 %bid.i.i67, 3
|
||||
%add_y0_load_mul_calltmp41_ = add i32 %mul_calltmp41_, %y0
|
||||
%add_yfirst_load_ = add i32 %add_y0_load_mul_calltmp41_, 8
|
||||
%bid.i.i70 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z()
|
||||
%mul_calltmp47_ = shl i32 %bid.i.i70, 3
|
||||
%add_z0_load_mul_calltmp47_ = add i32 %mul_calltmp47_, %z0
|
||||
%add_zfirst_load_ = add i32 %add_z0_load_mul_calltmp47_, 8
|
||||
%c.i.i71 = icmp sgt i32 %add_zfirst_load_, %z1
|
||||
%r.i.i72 = select i1 %c.i.i71, i32 %z1, i32 %add_zfirst_load_
|
||||
%mul_Nx_load_Ny_load.i = mul i32 %Ny, %Nx
|
||||
%nitems29.i = sub i32 %r.i.i, %add_x0_load_mul_calltmp35_
|
||||
%nextras30.i = srem i32 %nitems29.i, 32
|
||||
%aligned_end31.i = sub i32 %r.i.i, %nextras30.i
|
||||
%tid.i4.i = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
||||
%__laneidx.i = and i32 %tid.i4.i, 31
|
||||
%0 = zext i32 %__laneidx.i to i64
|
||||
%arrayidx.i = getelementptr [32 x i8]* @constDeltaForeach1, i64 0, i64 %0
|
||||
%cmp38.i396 = icmp slt i32 %add_z0_load_mul_calltmp47_, %r.i.i72
|
||||
br i1 %cmp38.i396, label %foreach_test21.i.preheader.lr.ph, label %if_then
|
||||
|
||||
foreach_test21.i.preheader.lr.ph: ; preds = %if_exit
|
||||
%c.i.i68 = icmp sgt i32 %add_yfirst_load_, %y1
|
||||
%r.i.i69 = select i1 %c.i.i68, i32 %y1, i32 %add_yfirst_load_
|
||||
%1 = load i8* %arrayidx.i, align 1
|
||||
%_zext.i394 = zext i8 %1 to i32
|
||||
%2 = insertelement <1 x i32> undef, i32 %_zext.i394, i32 0
|
||||
%smear_counter_init.i393 = insertelement <1 x i32> undef, i32 %add_z0_load_mul_calltmp47_, i32 0
|
||||
%iter_val.i395 = add <1 x i32> %smear_counter_init.i393, %2
|
||||
%smear_counter_init44.i387 = insertelement <1 x i32> undef, i32 %add_y0_load_mul_calltmp41_, i32 0
|
||||
%cmp54.i390 = icmp slt i32 %add_y0_load_mul_calltmp41_, %r.i.i69
|
||||
%before_aligned_end73.i385 = icmp slt i32 %add_x0_load_mul_calltmp35_, %aligned_end31.i
|
||||
%smear_end_init289.i = insertelement <1 x i32> undef, i32 %r.i.i, i32 0
|
||||
%Nxy_load298_broadcast_init.i = insertelement <1 x i32> undef, i32 %mul_Nx_load_Ny_load.i, i32 0
|
||||
%Nx_load300_broadcast_init.i = insertelement <1 x i32> undef, i32 %Nx, i32 0
|
||||
%Ain_load309_ptr2int.i = ptrtoint double* %Ain to i64
|
||||
%coef_load314_offset.i = getelementptr double* %coef, i64 1
|
||||
%coef_load365_offset.i = getelementptr double* %coef, i64 2
|
||||
%mul__Nx_load385.i = shl i32 %Nx, 1
|
||||
%mul__Nx_load393.i = mul i32 %Nx, -2
|
||||
%mul__Nxy_load402.i = shl i32 %mul_Nx_load_Ny_load.i, 1
|
||||
%mul__Nxy_load410.i = mul i32 %mul_Nx_load_Ny_load.i, -2
|
||||
%coef_load416_offset.i = getelementptr double* %coef, i64 3
|
||||
%mul__Nx_load436.i = mul i32 %Nx, 3
|
||||
%mul__Nx_load444.i = mul i32 %Nx, -3
|
||||
%mul__Nxy_load453.i = mul i32 %mul_Nx_load_Ny_load.i, 3
|
||||
%mul__Nxy_load461.i = mul i32 %mul_Nx_load_Ny_load.i, -3
|
||||
%Aout_load470_ptr2int.i = ptrtoint double* %Aout to i64
|
||||
%vsq_load488_ptr2int.i = ptrtoint double* %vsq to i64
|
||||
%3 = sub i32 -9, %y0
|
||||
%4 = shl i32 %bid.i.i67, 3
|
||||
%5 = sub i32 %3, %4
|
||||
%6 = xor i32 %y1, -1
|
||||
%7 = icmp sgt i32 %5, %6
|
||||
%smax = select i1 %7, i32 %5, i32 %6
|
||||
%8 = xor i32 %smax, -1
|
||||
%9 = sub i32 -9, %z0
|
||||
%10 = shl i32 %bid.i.i70, 3
|
||||
%11 = sub i32 %9, %10
|
||||
%12 = xor i32 %z1, -1
|
||||
%13 = icmp sgt i32 %11, %12
|
||||
%smax399 = select i1 %13, i32 %11, i32 %12
|
||||
%14 = xor i32 %smax399, -1
|
||||
br label %foreach_test21.i.preheader
|
||||
|
||||
foreach_full_body.i: ; preds = %outer_not_in_extras.i.preheader, %foreach_full_body.i
|
||||
%counter32.4.i386 = phi i32 [ %new_counter279.i, %foreach_full_body.i ], [ %add_x0_load_mul_calltmp35_, %outer_not_in_extras.i.preheader ]
|
||||
%tid.i.i56 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
||||
%__laneidx80.i = and i32 %tid.i.i56, 31
|
||||
%15 = zext i32 %__laneidx80.i to i64
|
||||
%arrayidx81.i = getelementptr [32 x i8]* @constDeltaForeach4, i64 0, i64 %15
|
||||
%16 = load i8* %arrayidx81.i, align 1
|
||||
%_zext82.i = zext i8 %16 to i32
|
||||
%coef_load_offset_load.i = load double* %coef, align 8
|
||||
%.lhs362.lhs.lhs = extractelement <1 x i32> %mul_z_load297_Nxy_load298_broadcast.i, i32 0
|
||||
%.lhs362.lhs.rhs.lhs = extractelement <1 x i32> %iter_val50.i392, i32 0
|
||||
%.lhs362.lhs.rhs = mul i32 %.lhs362.lhs.rhs.lhs, %Nx
|
||||
%.lhs362.lhs = add i32 %.lhs362.lhs.lhs, %.lhs362.lhs.rhs
|
||||
%.lhs362.rhs = add i32 %counter32.4.i386, %_zext82.i
|
||||
%.lhs362 = add i32 %.lhs362.lhs, %.lhs362.rhs
|
||||
%17 = shl i32 %.lhs362, 3
|
||||
%iptr__id.i.rhs = sext i32 %17 to i64
|
||||
%iptr__id.i = add i64 %iptr__id.i.rhs, %Ain_load309_ptr2int.i
|
||||
%ptr__id.i = inttoptr i64 %iptr__id.i to double*
|
||||
%val__id.i = load double* %ptr__id.i, align 8
|
||||
%coef_load94_offset_load.i = load double* %coef_load314_offset.i, align 8
|
||||
%18 = add i32 %17, 8
|
||||
%iptr__id.i335.rhs = sext i32 %18 to i64
|
||||
%iptr__id.i335 = add i64 %iptr__id.i335.rhs, %Ain_load309_ptr2int.i
|
||||
%ptr__id.i336 = inttoptr i64 %iptr__id.i335 to double*
|
||||
%val__id.i337 = load double* %ptr__id.i336, align 8
|
||||
%19 = add i32 %17, -8
|
||||
%iptr__id.i330.rhs = sext i32 %19 to i64
|
||||
%iptr__id.i330 = add i64 %iptr__id.i330.rhs, %Ain_load309_ptr2int.i
|
||||
%ptr__id.i331 = inttoptr i64 %iptr__id.i330 to double*
|
||||
%val__id.i332 = load double* %ptr__id.i331, align 8
|
||||
%.lhs365 = add i32 %.lhs362, %Nx
|
||||
%20 = shl i32 %.lhs365, 3
|
||||
%iptr__id.i325.rhs = sext i32 %20 to i64
|
||||
%iptr__id.i325 = add i64 %iptr__id.i325.rhs, %Ain_load309_ptr2int.i
|
||||
%ptr__id.i326 = inttoptr i64 %iptr__id.i325 to double*
|
||||
%val__id.i327 = load double* %ptr__id.i326, align 8
|
||||
%.lhs366 = sub i32 %.lhs362, %Nx
|
||||
%21 = shl i32 %.lhs366, 3
|
||||
%iptr__id.i320.rhs = sext i32 %21 to i64
|
||||
%iptr__id.i320 = add i64 %iptr__id.i320.rhs, %Ain_load309_ptr2int.i
|
||||
%ptr__id.i321 = inttoptr i64 %iptr__id.i320 to double*
|
||||
%val__id.i322 = load double* %ptr__id.i321, align 8
|
||||
%.lhs367 = add i32 %.lhs362, %mul_Nx_load_Ny_load.i
|
||||
%22 = shl i32 %.lhs367, 3
|
||||
%iptr__id.i315.rhs = sext i32 %22 to i64
|
||||
%iptr__id.i315 = add i64 %iptr__id.i315.rhs, %Ain_load309_ptr2int.i
|
||||
%ptr__id.i316 = inttoptr i64 %iptr__id.i315 to double*
|
||||
%val__id.i317 = load double* %ptr__id.i316, align 8
|
||||
%.lhs368 = sub i32 %.lhs362, %mul_Nx_load_Ny_load.i
|
||||
%23 = shl i32 %.lhs368, 3
|
||||
%iptr__id.i310.rhs = sext i32 %23 to i64
|
||||
%iptr__id.i310 = add i64 %iptr__id.i310.rhs, %Ain_load309_ptr2int.i
|
||||
%ptr__id.i311 = inttoptr i64 %iptr__id.i310 to double*
|
||||
%val__id.i312 = load double* %ptr__id.i311, align 8
|
||||
%coef_load145_offset_load.i = load double* %coef_load365_offset.i, align 8
|
||||
%24 = add i32 %17, 16
|
||||
%iptr__id.i305.rhs = sext i32 %24 to i64
|
||||
%iptr__id.i305 = add i64 %iptr__id.i305.rhs, %Ain_load309_ptr2int.i
|
||||
%ptr__id.i306 = inttoptr i64 %iptr__id.i305 to double*
|
||||
%val__id.i307 = load double* %ptr__id.i306, align 8
|
||||
%25 = add i32 %17, -16
|
||||
%iptr__id.i300.rhs = sext i32 %25 to i64
|
||||
%iptr__id.i300 = add i64 %iptr__id.i300.rhs, %Ain_load309_ptr2int.i
|
||||
%ptr__id.i301 = inttoptr i64 %iptr__id.i300 to double*
|
||||
%val__id.i302 = load double* %ptr__id.i301, align 8
|
||||
%.lhs371 = add i32 %.lhs362, %mul__Nx_load385.i
|
||||
%26 = shl i32 %.lhs371, 3
|
||||
%iptr__id.i295.rhs = sext i32 %26 to i64
|
||||
%iptr__id.i295 = add i64 %iptr__id.i295.rhs, %Ain_load309_ptr2int.i
|
||||
%ptr__id.i296 = inttoptr i64 %iptr__id.i295 to double*
|
||||
%val__id.i297 = load double* %ptr__id.i296, align 8
|
||||
%.lhs372 = add i32 %.lhs362, %mul__Nx_load393.i
|
||||
%27 = shl i32 %.lhs372, 3
|
||||
%iptr__id.i290.rhs = sext i32 %27 to i64
|
||||
%iptr__id.i290 = add i64 %iptr__id.i290.rhs, %Ain_load309_ptr2int.i
|
||||
%ptr__id.i291 = inttoptr i64 %iptr__id.i290 to double*
|
||||
%val__id.i292 = load double* %ptr__id.i291, align 8
|
||||
%.lhs373 = add i32 %.lhs362, %mul__Nxy_load402.i
|
||||
%28 = shl i32 %.lhs373, 3
|
||||
%iptr__id.i285.rhs = sext i32 %28 to i64
|
||||
%iptr__id.i285 = add i64 %iptr__id.i285.rhs, %Ain_load309_ptr2int.i
|
||||
%ptr__id.i286 = inttoptr i64 %iptr__id.i285 to double*
|
||||
%val__id.i287 = load double* %ptr__id.i286, align 8
|
||||
%.lhs374 = add i32 %.lhs362, %mul__Nxy_load410.i
|
||||
%29 = shl i32 %.lhs374, 3
|
||||
%iptr__id.i280.rhs = sext i32 %29 to i64
|
||||
%iptr__id.i280 = add i64 %iptr__id.i280.rhs, %Ain_load309_ptr2int.i
|
||||
%ptr__id.i281 = inttoptr i64 %iptr__id.i280 to double*
|
||||
%val__id.i282 = load double* %ptr__id.i281, align 8
|
||||
%coef_load196_offset_load.i = load double* %coef_load416_offset.i, align 8
|
||||
%30 = add i32 %17, 24
|
||||
%iptr__id.i275.rhs = sext i32 %30 to i64
|
||||
%iptr__id.i275 = add i64 %iptr__id.i275.rhs, %Ain_load309_ptr2int.i
|
||||
%ptr__id.i276 = inttoptr i64 %iptr__id.i275 to double*
|
||||
%val__id.i277 = load double* %ptr__id.i276, align 8
|
||||
%31 = add i32 %17, -24
|
||||
%iptr__id.i270.rhs = sext i32 %31 to i64
|
||||
%iptr__id.i270 = add i64 %iptr__id.i270.rhs, %Ain_load309_ptr2int.i
|
||||
%ptr__id.i271 = inttoptr i64 %iptr__id.i270 to double*
|
||||
%val__id.i272 = load double* %ptr__id.i271, align 8
|
||||
%.lhs377 = add i32 %.lhs362, %mul__Nx_load436.i
|
||||
%32 = shl i32 %.lhs377, 3
|
||||
%iptr__id.i265.rhs = sext i32 %32 to i64
|
||||
%iptr__id.i265 = add i64 %iptr__id.i265.rhs, %Ain_load309_ptr2int.i
|
||||
%ptr__id.i266 = inttoptr i64 %iptr__id.i265 to double*
|
||||
%val__id.i267 = load double* %ptr__id.i266, align 8
|
||||
%.lhs378 = add i32 %.lhs362, %mul__Nx_load444.i
|
||||
%33 = shl i32 %.lhs378, 3
|
||||
%iptr__id.i260.rhs = sext i32 %33 to i64
|
||||
%iptr__id.i260 = add i64 %iptr__id.i260.rhs, %Ain_load309_ptr2int.i
|
||||
%ptr__id.i261 = inttoptr i64 %iptr__id.i260 to double*
|
||||
%val__id.i262 = load double* %ptr__id.i261, align 8
|
||||
%.lhs379 = add i32 %.lhs362, %mul__Nxy_load453.i
|
||||
%34 = shl i32 %.lhs379, 3
|
||||
%iptr__id.i255.rhs = sext i32 %34 to i64
|
||||
%iptr__id.i255 = add i64 %iptr__id.i255.rhs, %Ain_load309_ptr2int.i
|
||||
%ptr__id.i256 = inttoptr i64 %iptr__id.i255 to double*
|
||||
%val__id.i257 = load double* %ptr__id.i256, align 8
|
||||
%.lhs380 = add i32 %.lhs362, %mul__Nxy_load461.i
|
||||
%35 = shl i32 %.lhs380, 3
|
||||
%iptr__id.i250.rhs = sext i32 %35 to i64
|
||||
%iptr__id.i250 = add i64 %iptr__id.i250.rhs, %Ain_load309_ptr2int.i
|
||||
%ptr__id.i251 = inttoptr i64 %iptr__id.i250 to double*
|
||||
%val__id.i252 = load double* %ptr__id.i251, align 8
|
||||
%val__id.i247 = load double* %ptr__id.i, align 8
|
||||
%iptr__id.i240 = add i64 %iptr__id.i.rhs, %Aout_load470_ptr2int.i
|
||||
%ptr__id.i241 = inttoptr i64 %iptr__id.i240 to double*
|
||||
%val__id.i242 = load double* %ptr__id.i241, align 8
|
||||
%iptr__id.i235 = add i64 %iptr__id.i.rhs, %vsq_load488_ptr2int.i
|
||||
%ptr__id.i236 = inttoptr i64 %iptr__id.i235 to double*
|
||||
%val__id.i237 = load double* %ptr__id.i236, align 8
|
||||
%val__id.i233.lhs.lhs = fmul double %val__id.i247, 2.000000e+00
|
||||
%val__id.i233.lhs = fsub double %val__id.i233.lhs.lhs, %val__id.i242
|
||||
%val__id.i233.rhs.rhs.lhs.lhs.lhs = fmul double %coef_load_offset_load.i, %val__id.i
|
||||
%val__id.i233.rhs.rhs.lhs.lhs.rhs.rhs.lhs.lhs.lhs.lhs = fadd double %val__id.i337, %val__id.i332
|
||||
%val__id.i233.rhs.rhs.lhs.lhs.rhs.rhs.lhs.lhs.lhs = fadd double %val__id.i233.rhs.rhs.lhs.lhs.rhs.rhs.lhs.lhs.lhs.lhs, %val__id.i327
|
||||
%val__id.i233.rhs.rhs.lhs.lhs.rhs.rhs.lhs.lhs = fadd double %val__id.i233.rhs.rhs.lhs.lhs.rhs.rhs.lhs.lhs.lhs, %val__id.i322
|
||||
%val__id.i233.rhs.rhs.lhs.lhs.rhs.rhs.lhs = fadd double %val__id.i233.rhs.rhs.lhs.lhs.rhs.rhs.lhs.lhs, %val__id.i317
|
||||
%val__id.i233.rhs.rhs.lhs.lhs.rhs.rhs = fadd double %val__id.i233.rhs.rhs.lhs.lhs.rhs.rhs.lhs, %val__id.i312
|
||||
%val__id.i233.rhs.rhs.lhs.lhs.rhs = fmul double %coef_load94_offset_load.i, %val__id.i233.rhs.rhs.lhs.lhs.rhs.rhs
|
||||
%val__id.i233.rhs.rhs.lhs.lhs = fadd double %val__id.i233.rhs.rhs.lhs.lhs.lhs, %val__id.i233.rhs.rhs.lhs.lhs.rhs
|
||||
%val__id.i233.rhs.rhs.lhs.rhs.rhs.lhs.lhs.lhs.lhs = fadd double %val__id.i307, %val__id.i302
|
||||
%val__id.i233.rhs.rhs.lhs.rhs.rhs.lhs.lhs.lhs = fadd double %val__id.i233.rhs.rhs.lhs.rhs.rhs.lhs.lhs.lhs.lhs, %val__id.i297
|
||||
%val__id.i233.rhs.rhs.lhs.rhs.rhs.lhs.lhs = fadd double %val__id.i233.rhs.rhs.lhs.rhs.rhs.lhs.lhs.lhs, %val__id.i292
|
||||
%val__id.i233.rhs.rhs.lhs.rhs.rhs.lhs = fadd double %val__id.i233.rhs.rhs.lhs.rhs.rhs.lhs.lhs, %val__id.i287
|
||||
%val__id.i233.rhs.rhs.lhs.rhs.rhs = fadd double %val__id.i233.rhs.rhs.lhs.rhs.rhs.lhs, %val__id.i282
|
||||
%val__id.i233.rhs.rhs.lhs.rhs = fmul double %coef_load145_offset_load.i, %val__id.i233.rhs.rhs.lhs.rhs.rhs
|
||||
%val__id.i233.rhs.rhs.lhs = fadd double %val__id.i233.rhs.rhs.lhs.lhs, %val__id.i233.rhs.rhs.lhs.rhs
|
||||
%val__id.i233.rhs.rhs.rhs.rhs.lhs.lhs.lhs.lhs = fadd double %val__id.i277, %val__id.i272
|
||||
%val__id.i233.rhs.rhs.rhs.rhs.lhs.lhs.lhs = fadd double %val__id.i233.rhs.rhs.rhs.rhs.lhs.lhs.lhs.lhs, %val__id.i267
|
||||
%val__id.i233.rhs.rhs.rhs.rhs.lhs.lhs = fadd double %val__id.i233.rhs.rhs.rhs.rhs.lhs.lhs.lhs, %val__id.i262
|
||||
%val__id.i233.rhs.rhs.rhs.rhs.lhs = fadd double %val__id.i233.rhs.rhs.rhs.rhs.lhs.lhs, %val__id.i257
|
||||
%val__id.i233.rhs.rhs.rhs.rhs = fadd double %val__id.i233.rhs.rhs.rhs.rhs.lhs, %val__id.i252
|
||||
%val__id.i233.rhs.rhs.rhs = fmul double %coef_load196_offset_load.i, %val__id.i233.rhs.rhs.rhs.rhs
|
||||
%val__id.i233.rhs.rhs = fadd double %val__id.i233.rhs.rhs.lhs, %val__id.i233.rhs.rhs.rhs
|
||||
%val__id.i233.rhs = fmul double %val__id.i237, %val__id.i233.rhs.rhs
|
||||
%val__id.i233 = fadd double %val__id.i233.lhs, %val__id.i233.rhs
|
||||
store double %val__id.i233, double* %ptr__id.i241, align 8
|
||||
%new_counter279.i = add i32 %counter32.4.i386, 32
|
||||
%before_aligned_end73.i = icmp slt i32 %new_counter279.i, %aligned_end31.i
|
||||
br i1 %before_aligned_end73.i, label %foreach_full_body.i, label %partial_inner_all_outer.i
|
||||
|
||||
foreach_test21.i.preheader: ; preds = %foreach_reset19.i, %foreach_test21.i.preheader.lr.ph
|
||||
%iter_val.i398 = phi <1 x i32> [ %iter_val.i395, %foreach_test21.i.preheader.lr.ph ], [ %iter_val.i, %foreach_reset19.i ]
|
||||
%counter.0.i397 = phi i32 [ %add_z0_load_mul_calltmp47_, %foreach_test21.i.preheader.lr.ph ], [ %new_counter.i, %foreach_reset19.i ]
|
||||
%tid.i3.i = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
||||
%__laneidx47.i = and i32 %tid.i3.i, 31
|
||||
%36 = zext i32 %__laneidx47.i to i64
|
||||
%arrayidx48.i = getelementptr [32 x i8]* @constDeltaForeach1, i64 0, i64 %36
|
||||
br i1 %cmp54.i390, label %outer_not_in_extras.i.preheader.lr.ph, label %foreach_reset19.i
|
||||
|
||||
outer_not_in_extras.i.preheader.lr.ph: ; preds = %foreach_test21.i.preheader
|
||||
%37 = load i8* %arrayidx48.i, align 1
|
||||
%_zext49.i388 = zext i8 %37 to i32
|
||||
%38 = insertelement <1 x i32> undef, i32 %_zext49.i388, i32 0
|
||||
%iter_val50.i389 = add <1 x i32> %smear_counter_init44.i387, %38
|
||||
%mul_z_load297_Nxy_load298_broadcast.i = mul <1 x i32> %iter_val.i398, %Nxy_load298_broadcast_init.i
|
||||
br label %outer_not_in_extras.i.preheader
|
||||
|
||||
foreach_reset19.i: ; preds = %foreach_reset27.i, %foreach_test21.i.preheader
|
||||
%new_counter.i = add i32 %counter.0.i397, 1
|
||||
%smear_counter_init.i = insertelement <1 x i32> undef, i32 %new_counter.i, i32 0
|
||||
%39 = load i8* %arrayidx.i, align 1
|
||||
%_zext.i = zext i8 %39 to i32
|
||||
%40 = insertelement <1 x i32> undef, i32 %_zext.i, i32 0
|
||||
%iter_val.i = add <1 x i32> %smear_counter_init.i, %40
|
||||
%exitcond400 = icmp eq i32 %new_counter.i, %14
|
||||
br i1 %exitcond400, label %if_then, label %foreach_test21.i.preheader
|
||||
|
||||
outer_not_in_extras.i.preheader: ; preds = %foreach_reset27.i, %outer_not_in_extras.i.preheader.lr.ph
|
||||
%iter_val50.i392 = phi <1 x i32> [ %iter_val50.i389, %outer_not_in_extras.i.preheader.lr.ph ], [ %iter_val50.i, %foreach_reset27.i ]
|
||||
%counter25.1.i391 = phi i32 [ %add_y0_load_mul_calltmp41_, %outer_not_in_extras.i.preheader.lr.ph ], [ %new_counter35.i, %foreach_reset27.i ]
|
||||
br i1 %before_aligned_end73.i385, label %foreach_full_body.i, label %partial_inner_all_outer.i
|
||||
|
||||
foreach_reset27.i: ; preds = %pl_dolane.i, %partial_inner_only.i, %partial_inner_all_outer.i
|
||||
%new_counter35.i = add i32 %counter25.1.i391, 1
|
||||
%smear_counter_init44.i = insertelement <1 x i32> undef, i32 %new_counter35.i, i32 0
|
||||
%41 = load i8* %arrayidx48.i, align 1
|
||||
%_zext49.i = zext i8 %41 to i32
|
||||
%42 = insertelement <1 x i32> undef, i32 %_zext49.i, i32 0
|
||||
%iter_val50.i = add <1 x i32> %smear_counter_init44.i, %42
|
||||
%exitcond = icmp eq i32 %new_counter35.i, %8
|
||||
br i1 %exitcond, label %foreach_reset19.i, label %outer_not_in_extras.i.preheader
|
||||
|
||||
partial_inner_all_outer.i: ; preds = %outer_not_in_extras.i.preheader, %foreach_full_body.i
|
||||
%counter32.4.i.lcssa = phi i32 [ %add_x0_load_mul_calltmp35_, %outer_not_in_extras.i.preheader ], [ %new_counter279.i, %foreach_full_body.i ]
|
||||
%before_full_end.i = icmp slt i32 %counter32.4.i.lcssa, %r.i.i
|
||||
br i1 %before_full_end.i, label %partial_inner_only.i, label %foreach_reset27.i
|
||||
|
||||
partial_inner_only.i: ; preds = %partial_inner_all_outer.i
|
||||
%smear_counter_init282.i = insertelement <1 x i32> undef, i32 %counter32.4.i.lcssa, i32 0
|
||||
%tid.i2.i = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
||||
%__laneidx285.i = and i32 %tid.i2.i, 31
|
||||
%43 = zext i32 %__laneidx285.i to i64
|
||||
%arrayidx286.i = getelementptr [32 x i8]* @constDeltaForeach4, i64 0, i64 %43
|
||||
%44 = load i8* %arrayidx286.i, align 1
|
||||
%_zext287.i = zext i8 %44 to i32
|
||||
%45 = insertelement <1 x i32> undef, i32 %_zext287.i, i32 0
|
||||
%iter_val288.i = add <1 x i32> %smear_counter_init282.i, %45
|
||||
%cmp291.i = icmp slt <1 x i32> %iter_val288.i, %smear_end_init289.i
|
||||
%mul_y_load299_Nx_load300_broadcast.i = mul <1 x i32> %iter_val50.i392, %Nx_load300_broadcast_init.i
|
||||
%add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast.i = add <1 x i32> %mul_z_load297_Nxy_load298_broadcast.i, %mul_y_load299_Nx_load300_broadcast.i
|
||||
%add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i = add <1 x i32> %add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast.i, %iter_val288.i
|
||||
%v.i.i224 = extractelement <1 x i1> %cmp291.i, i32 0
|
||||
br i1 %v.i.i224, label %pl_dolane.i, label %foreach_reset27.i
|
||||
|
||||
pl_dolane.i: ; preds = %partial_inner_only.i
|
||||
%coef_load303_offset_load.i = load double* %coef, align 8
|
||||
%.lhs361 = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0
|
||||
%46 = shl i32 %.lhs361, 3
|
||||
%iptr__id.i225.rhs = sext i32 %46 to i64
|
||||
%iptr__id.i225 = add i64 %iptr__id.i225.rhs, %Ain_load309_ptr2int.i
|
||||
%ptr__id.i226 = inttoptr i64 %iptr__id.i225 to double*
|
||||
%val__id.i227 = load double* %ptr__id.i226, align 8
|
||||
%coef_load314_offset_load.i401 = load double* %coef_load314_offset.i, align 8
|
||||
%.lhs360.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0
|
||||
%.lhs360 = shl i32 %.lhs360.lhs, 3
|
||||
%47 = add i32 %.lhs360, 8
|
||||
%iptr__id.i218.rhs = sext i32 %47 to i64
|
||||
%iptr__id.i218 = add i64 %iptr__id.i218.rhs, %Ain_load309_ptr2int.i
|
||||
%ptr__id.i219 = inttoptr i64 %iptr__id.i218 to double*
|
||||
%val__id.i220 = load double* %ptr__id.i219, align 8
|
||||
%.lhs359.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0
|
||||
%.lhs359 = shl i32 %.lhs359.lhs, 3
|
||||
%48 = add i32 %.lhs359, -8
|
||||
%iptr__id.i211.rhs = sext i32 %48 to i64
|
||||
%iptr__id.i211 = add i64 %iptr__id.i211.rhs, %Ain_load309_ptr2int.i
|
||||
%ptr__id.i212 = inttoptr i64 %iptr__id.i211 to double*
|
||||
%val__id.i213 = load double* %ptr__id.i212, align 8
|
||||
%.lhs358.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0
|
||||
%.lhs358 = add i32 %.lhs358.lhs, %Nx
|
||||
%49 = shl i32 %.lhs358, 3
|
||||
%iptr__id.i204.rhs = sext i32 %49 to i64
|
||||
%iptr__id.i204 = add i64 %iptr__id.i204.rhs, %Ain_load309_ptr2int.i
|
||||
%ptr__id.i205 = inttoptr i64 %iptr__id.i204 to double*
|
||||
%val__id.i206 = load double* %ptr__id.i205, align 8
|
||||
%.lhs357.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0
|
||||
%.lhs357 = sub i32 %.lhs357.lhs, %Nx
|
||||
%50 = shl i32 %.lhs357, 3
|
||||
%iptr__id.i197.rhs = sext i32 %50 to i64
|
||||
%iptr__id.i197 = add i64 %iptr__id.i197.rhs, %Ain_load309_ptr2int.i
|
||||
%ptr__id.i198 = inttoptr i64 %iptr__id.i197 to double*
|
||||
%val__id.i199 = load double* %ptr__id.i198, align 8
|
||||
%.lhs356.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0
|
||||
%.lhs356 = add i32 %.lhs356.lhs, %mul_Nx_load_Ny_load.i
|
||||
%51 = shl i32 %.lhs356, 3
|
||||
%iptr__id.i190.rhs = sext i32 %51 to i64
|
||||
%iptr__id.i190 = add i64 %iptr__id.i190.rhs, %Ain_load309_ptr2int.i
|
||||
%ptr__id.i191 = inttoptr i64 %iptr__id.i190 to double*
|
||||
%val__id.i192 = load double* %ptr__id.i191, align 8
|
||||
%.lhs355.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0
|
||||
%.lhs355 = sub i32 %.lhs355.lhs, %mul_Nx_load_Ny_load.i
|
||||
%52 = shl i32 %.lhs355, 3
|
||||
%iptr__id.i183.rhs = sext i32 %52 to i64
|
||||
%iptr__id.i183 = add i64 %iptr__id.i183.rhs, %Ain_load309_ptr2int.i
|
||||
%ptr__id.i184 = inttoptr i64 %iptr__id.i183 to double*
|
||||
%val__id.i185 = load double* %ptr__id.i184, align 8
|
||||
%coef_load365_offset_load.i457 = load double* %coef_load365_offset.i, align 8
|
||||
%.lhs354.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0
|
||||
%.lhs354 = shl i32 %.lhs354.lhs, 3
|
||||
%53 = add i32 %.lhs354, 16
|
||||
%iptr__id.i176.rhs = sext i32 %53 to i64
|
||||
%iptr__id.i176 = add i64 %iptr__id.i176.rhs, %Ain_load309_ptr2int.i
|
||||
%ptr__id.i177 = inttoptr i64 %iptr__id.i176 to double*
|
||||
%val__id.i178 = load double* %ptr__id.i177, align 8
|
||||
%.lhs353.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0
|
||||
%.lhs353 = shl i32 %.lhs353.lhs, 3
|
||||
%54 = add i32 %.lhs353, -16
|
||||
%iptr__id.i169.rhs = sext i32 %54 to i64
|
||||
%iptr__id.i169 = add i64 %iptr__id.i169.rhs, %Ain_load309_ptr2int.i
|
||||
%ptr__id.i170 = inttoptr i64 %iptr__id.i169 to double*
|
||||
%val__id.i171 = load double* %ptr__id.i170, align 8
|
||||
%.lhs352.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0
|
||||
%.lhs352 = add i32 %.lhs352.lhs, %mul__Nx_load385.i
|
||||
%55 = shl i32 %.lhs352, 3
|
||||
%iptr__id.i162.rhs = sext i32 %55 to i64
|
||||
%iptr__id.i162 = add i64 %iptr__id.i162.rhs, %Ain_load309_ptr2int.i
|
||||
%ptr__id.i163 = inttoptr i64 %iptr__id.i162 to double*
|
||||
%val__id.i164 = load double* %ptr__id.i163, align 8
|
||||
%.lhs351.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0
|
||||
%.lhs351 = add i32 %.lhs351.lhs, %mul__Nx_load393.i
|
||||
%56 = shl i32 %.lhs351, 3
|
||||
%iptr__id.i155.rhs = sext i32 %56 to i64
|
||||
%iptr__id.i155 = add i64 %iptr__id.i155.rhs, %Ain_load309_ptr2int.i
|
||||
%ptr__id.i156 = inttoptr i64 %iptr__id.i155 to double*
|
||||
%val__id.i157 = load double* %ptr__id.i156, align 8
|
||||
%.lhs350.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0
|
||||
%.lhs350 = add i32 %.lhs350.lhs, %mul__Nxy_load402.i
|
||||
%57 = shl i32 %.lhs350, 3
|
||||
%iptr__id.i148.rhs = sext i32 %57 to i64
|
||||
%iptr__id.i148 = add i64 %iptr__id.i148.rhs, %Ain_load309_ptr2int.i
|
||||
%ptr__id.i149 = inttoptr i64 %iptr__id.i148 to double*
|
||||
%val__id.i150 = load double* %ptr__id.i149, align 8
|
||||
%.lhs349.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0
|
||||
%.lhs349 = add i32 %.lhs349.lhs, %mul__Nxy_load410.i
|
||||
%58 = shl i32 %.lhs349, 3
|
||||
%iptr__id.i141.rhs = sext i32 %58 to i64
|
||||
%iptr__id.i141 = add i64 %iptr__id.i141.rhs, %Ain_load309_ptr2int.i
|
||||
%ptr__id.i142 = inttoptr i64 %iptr__id.i141 to double*
|
||||
%val__id.i143 = load double* %ptr__id.i142, align 8
|
||||
%coef_load416_offset_load.i544 = load double* %coef_load416_offset.i, align 8
|
||||
%.lhs348.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0
|
||||
%.lhs348 = shl i32 %.lhs348.lhs, 3
|
||||
%59 = add i32 %.lhs348, 24
|
||||
%iptr__id.i134.rhs = sext i32 %59 to i64
|
||||
%iptr__id.i134 = add i64 %iptr__id.i134.rhs, %Ain_load309_ptr2int.i
|
||||
%ptr__id.i135 = inttoptr i64 %iptr__id.i134 to double*
|
||||
%val__id.i136 = load double* %ptr__id.i135, align 8
|
||||
%.lhs347.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0
|
||||
%.lhs347 = shl i32 %.lhs347.lhs, 3
|
||||
%60 = add i32 %.lhs347, -24
|
||||
%iptr__id.i127.rhs = sext i32 %60 to i64
|
||||
%iptr__id.i127 = add i64 %iptr__id.i127.rhs, %Ain_load309_ptr2int.i
|
||||
%ptr__id.i128 = inttoptr i64 %iptr__id.i127 to double*
|
||||
%val__id.i129 = load double* %ptr__id.i128, align 8
|
||||
%.lhs346.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0
|
||||
%.lhs346 = add i32 %.lhs346.lhs, %mul__Nx_load436.i
|
||||
%61 = shl i32 %.lhs346, 3
|
||||
%iptr__id.i120.rhs = sext i32 %61 to i64
|
||||
%iptr__id.i120 = add i64 %iptr__id.i120.rhs, %Ain_load309_ptr2int.i
|
||||
%ptr__id.i121 = inttoptr i64 %iptr__id.i120 to double*
|
||||
%val__id.i122 = load double* %ptr__id.i121, align 8
|
||||
%.lhs345.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0
|
||||
%.lhs345 = add i32 %.lhs345.lhs, %mul__Nx_load444.i
|
||||
%62 = shl i32 %.lhs345, 3
|
||||
%iptr__id.i113.rhs = sext i32 %62 to i64
|
||||
%iptr__id.i113 = add i64 %iptr__id.i113.rhs, %Ain_load309_ptr2int.i
|
||||
%ptr__id.i114 = inttoptr i64 %iptr__id.i113 to double*
|
||||
%val__id.i115 = load double* %ptr__id.i114, align 8
|
||||
%.lhs344.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0
|
||||
%.lhs344 = add i32 %.lhs344.lhs, %mul__Nxy_load453.i
|
||||
%63 = shl i32 %.lhs344, 3
|
||||
%iptr__id.i106.rhs = sext i32 %63 to i64
|
||||
%iptr__id.i106 = add i64 %iptr__id.i106.rhs, %Ain_load309_ptr2int.i
|
||||
%ptr__id.i107 = inttoptr i64 %iptr__id.i106 to double*
|
||||
%val__id.i108 = load double* %ptr__id.i107, align 8
|
||||
%.lhs343.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0
|
||||
%.lhs343 = add i32 %.lhs343.lhs, %mul__Nxy_load461.i
|
||||
%64 = shl i32 %.lhs343, 3
|
||||
%iptr__id.i99.rhs = sext i32 %64 to i64
|
||||
%iptr__id.i99 = add i64 %iptr__id.i99.rhs, %Ain_load309_ptr2int.i
|
||||
%ptr__id.i100 = inttoptr i64 %iptr__id.i99 to double*
|
||||
%val__id.i101 = load double* %ptr__id.i100, align 8
|
||||
%.lhs342 = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0
|
||||
%65 = shl i32 %.lhs342, 3
|
||||
%iptr__id.i92.rhs = sext i32 %65 to i64
|
||||
%iptr__id.i92 = add i64 %iptr__id.i92.rhs, %Ain_load309_ptr2int.i
|
||||
%ptr__id.i93 = inttoptr i64 %iptr__id.i92 to double*
|
||||
%val__id.i94 = load double* %ptr__id.i93, align 8
|
||||
%.lhs341 = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0
|
||||
%66 = shl i32 %.lhs341, 3
|
||||
%iptr__id.i85.rhs = sext i32 %66 to i64
|
||||
%iptr__id.i85 = add i64 %iptr__id.i85.rhs, %Aout_load470_ptr2int.i
|
||||
%ptr__id.i86 = inttoptr i64 %iptr__id.i85 to double*
|
||||
%val__id.i87 = load double* %ptr__id.i86, align 8
|
||||
%.lhs340 = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0
|
||||
%67 = shl i32 %.lhs340, 3
|
||||
%iptr__id.i80.rhs = sext i32 %67 to i64
|
||||
%iptr__id.i80 = add i64 %iptr__id.i80.rhs, %vsq_load488_ptr2int.i
|
||||
%ptr__id.i81 = inttoptr i64 %iptr__id.i80 to double*
|
||||
%val__id.i82 = load double* %ptr__id.i81, align 8
|
||||
%.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0
|
||||
%68 = shl i32 %.lhs, 3
|
||||
%iptr__id.i76.rhs = sext i32 %68 to i64
|
||||
%iptr__id.i76 = add i64 %iptr__id.i76.rhs, %Aout_load470_ptr2int.i
|
||||
%ptr__id.i77 = inttoptr i64 %iptr__id.i76 to double*
|
||||
%val__id.i78.lhs.lhs = fmul double %val__id.i94, 2.000000e+00
|
||||
%val__id.i78.lhs = fsub double %val__id.i78.lhs.lhs, %val__id.i87
|
||||
%val__id.i78.rhs.rhs.lhs.lhs.lhs = fmul double %coef_load303_offset_load.i, %val__id.i227
|
||||
%val__id.i78.rhs.rhs.lhs.lhs.rhs.rhs.lhs.lhs.lhs.lhs = fadd double %val__id.i220, %val__id.i213
|
||||
%val__id.i78.rhs.rhs.lhs.lhs.rhs.rhs.lhs.lhs.lhs = fadd double %val__id.i78.rhs.rhs.lhs.lhs.rhs.rhs.lhs.lhs.lhs.lhs, %val__id.i206
|
||||
%val__id.i78.rhs.rhs.lhs.lhs.rhs.rhs.lhs.lhs = fadd double %val__id.i78.rhs.rhs.lhs.lhs.rhs.rhs.lhs.lhs.lhs, %val__id.i199
|
||||
%val__id.i78.rhs.rhs.lhs.lhs.rhs.rhs.lhs = fadd double %val__id.i78.rhs.rhs.lhs.lhs.rhs.rhs.lhs.lhs, %val__id.i192
|
||||
%val__id.i78.rhs.rhs.lhs.lhs.rhs.rhs = fadd double %val__id.i78.rhs.rhs.lhs.lhs.rhs.rhs.lhs, %val__id.i185
|
||||
%val__id.i78.rhs.rhs.lhs.lhs.rhs = fmul double %coef_load314_offset_load.i401, %val__id.i78.rhs.rhs.lhs.lhs.rhs.rhs
|
||||
%val__id.i78.rhs.rhs.lhs.lhs = fadd double %val__id.i78.rhs.rhs.lhs.lhs.lhs, %val__id.i78.rhs.rhs.lhs.lhs.rhs
|
||||
%val__id.i78.rhs.rhs.lhs.rhs.rhs.lhs.lhs.lhs.lhs = fadd double %val__id.i178, %val__id.i171
|
||||
%val__id.i78.rhs.rhs.lhs.rhs.rhs.lhs.lhs.lhs = fadd double %val__id.i78.rhs.rhs.lhs.rhs.rhs.lhs.lhs.lhs.lhs, %val__id.i164
|
||||
%val__id.i78.rhs.rhs.lhs.rhs.rhs.lhs.lhs = fadd double %val__id.i78.rhs.rhs.lhs.rhs.rhs.lhs.lhs.lhs, %val__id.i157
|
||||
%val__id.i78.rhs.rhs.lhs.rhs.rhs.lhs = fadd double %val__id.i78.rhs.rhs.lhs.rhs.rhs.lhs.lhs, %val__id.i150
|
||||
%val__id.i78.rhs.rhs.lhs.rhs.rhs = fadd double %val__id.i78.rhs.rhs.lhs.rhs.rhs.lhs, %val__id.i143
|
||||
%val__id.i78.rhs.rhs.lhs.rhs = fmul double %coef_load365_offset_load.i457, %val__id.i78.rhs.rhs.lhs.rhs.rhs
|
||||
%val__id.i78.rhs.rhs.lhs = fadd double %val__id.i78.rhs.rhs.lhs.lhs, %val__id.i78.rhs.rhs.lhs.rhs
|
||||
%val__id.i78.rhs.rhs.rhs.rhs.lhs.lhs.lhs.lhs = fadd double %val__id.i136, %val__id.i129
|
||||
%val__id.i78.rhs.rhs.rhs.rhs.lhs.lhs.lhs = fadd double %val__id.i78.rhs.rhs.rhs.rhs.lhs.lhs.lhs.lhs, %val__id.i122
|
||||
%val__id.i78.rhs.rhs.rhs.rhs.lhs.lhs = fadd double %val__id.i78.rhs.rhs.rhs.rhs.lhs.lhs.lhs, %val__id.i115
|
||||
%val__id.i78.rhs.rhs.rhs.rhs.lhs = fadd double %val__id.i78.rhs.rhs.rhs.rhs.lhs.lhs, %val__id.i108
|
||||
%val__id.i78.rhs.rhs.rhs.rhs = fadd double %val__id.i78.rhs.rhs.rhs.rhs.lhs, %val__id.i101
|
||||
%val__id.i78.rhs.rhs.rhs = fmul double %coef_load416_offset_load.i544, %val__id.i78.rhs.rhs.rhs.rhs
|
||||
%val__id.i78.rhs.rhs = fadd double %val__id.i78.rhs.rhs.lhs, %val__id.i78.rhs.rhs.rhs
|
||||
%val__id.i78.rhs = fmul double %val__id.i78.rhs.rhs, %val__id.i82
|
||||
%val__id.i78 = fadd double %val__id.i78.lhs, %val__id.i78.rhs
|
||||
store double %val__id.i78, double* %ptr__id.i77, align 8
|
||||
br label %foreach_reset27.i
|
||||
}
|
||||
|
||||
define void @loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E_(i32 %t0, i32 %t1, i32 %x0, i32 %x1, i32 %y0, i32 %y1, i32 %z0, i32 %z1, i32 %Nx, i32 %Ny, i32 %Nz, double* %coef, double* %vsq, double* %Aeven, double* %Aodd, <1 x i1> %__mask) {
|
||||
allocas:
|
||||
%less_t_load_t1_load94 = icmp slt i32 %t0, %t1
|
||||
br i1 %less_t_load_t1_load94, label %for_loop.lr.ph, label %for_exit
|
||||
|
||||
for_loop.lr.ph: ; preds = %allocas
|
||||
%add_sub_x1_load21_x0_load22_ = sub i32 31, %x0
|
||||
%sub_add_sub_x1_load21_x0_load22__ = add i32 %add_sub_x1_load21_x0_load22_, %x1
|
||||
%div_sub_add_sub_x1_load21_x0_load22___ = sdiv i32 %sub_add_sub_x1_load21_x0_load22__, 32
|
||||
%add_sub_y1_load23_y0_load24_ = sub i32 7, %y0
|
||||
%sub_add_sub_y1_load23_y0_load24__ = add i32 %add_sub_y1_load23_y0_load24_, %y1
|
||||
%div_sub_add_sub_y1_load23_y0_load24___ = sdiv i32 %sub_add_sub_y1_load23_y0_load24__, 8
|
||||
%add_sub_z1_load25_z0_load26_ = sub i32 7, %z0
|
||||
%sub_add_sub_z1_load25_z0_load26__ = add i32 %add_sub_z1_load25_z0_load26_, %z1
|
||||
%div_sub_add_sub_z1_load25_z0_load26___ = sdiv i32 %sub_add_sub_z1_load25_z0_load26__, 8
|
||||
%ntxm1.i = add nsw i32 %div_sub_add_sub_x1_load21_x0_load22___, -1
|
||||
%ntxm1d4.i = ashr i32 %ntxm1.i, 2
|
||||
%nbx.i = add nsw i32 %ntxm1d4.i, 1
|
||||
br label %for_loop
|
||||
|
||||
for_loop: ; preds = %if_exit, %for_loop.lr.ph
|
||||
%t.095 = phi i32 [ %t0, %for_loop.lr.ph ], [ %t_load78_plus1, %if_exit ]
|
||||
%bitop = and i32 %t.095, 1
|
||||
%equal_bitop_ = icmp eq i32 %bitop, 0
|
||||
%tid.i.i = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
||||
%and.i = and i32 %tid.i.i, 31
|
||||
%cmp.i = icmp eq i32 %and.i, 0
|
||||
br i1 %cmp.i, label %if.then.i, label %ISPCGetParamBuffer.exit
|
||||
|
||||
if.then.i: ; preds = %for_loop
|
||||
%ptri64tmp.i = tail call i64 @cudaGetParameterBuffer(i64 8, i64 72)
|
||||
%phitmp.i = inttoptr i64 %ptri64tmp.i to i8*
|
||||
br label %ISPCGetParamBuffer.exit
|
||||
|
||||
ISPCGetParamBuffer.exit: ; preds = %if.then.i, %for_loop
|
||||
%ptri64.i = phi i8* [ %phitmp.i, %if.then.i ], [ null, %for_loop ]
|
||||
%cmp1 = icmp eq i8* %ptri64.i, null
|
||||
br i1 %equal_bitop_, label %if_then, label %if_else
|
||||
|
||||
for_exit: ; preds = %if_exit, %allocas
|
||||
%0 = tail call i32 @cudaDeviceSynchronize()
|
||||
ret void
|
||||
|
||||
if_then: ; preds = %ISPCGetParamBuffer.exit
|
||||
br i1 %cmp1, label %if_false, label %if_true
|
||||
|
||||
if_else: ; preds = %ISPCGetParamBuffer.exit
|
||||
br i1 %cmp1, label %if_false62, label %if_true61
|
||||
|
||||
if_exit: ; preds = %if.then.i92, %if_false62, %if.then.i83, %if_false
|
||||
%1 = tail call i32 @cudaDeviceSynchronize()
|
||||
%t_load78_plus1 = add i32 %t.095, 1
|
||||
%exitcond = icmp eq i32 %t_load78_plus1, %t1
|
||||
br i1 %exitcond, label %for_exit, label %for_loop
|
||||
|
||||
if_true: ; preds = %if_then
|
||||
%funarg = bitcast i8* %ptri64.i to i32*
|
||||
store i32 %x0, i32* %funarg, align 4
|
||||
%funarg27 = getelementptr i8* %ptri64.i, i64 4
|
||||
%2 = bitcast i8* %funarg27 to i32*
|
||||
store i32 %x1, i32* %2, align 4
|
||||
%funarg28 = getelementptr i8* %ptri64.i, i64 8
|
||||
%3 = bitcast i8* %funarg28 to i32*
|
||||
store i32 %y0, i32* %3, align 4
|
||||
%funarg29 = getelementptr i8* %ptri64.i, i64 12
|
||||
%4 = bitcast i8* %funarg29 to i32*
|
||||
store i32 %y1, i32* %4, align 4
|
||||
%funarg30 = getelementptr i8* %ptri64.i, i64 16
|
||||
%5 = bitcast i8* %funarg30 to i32*
|
||||
store i32 %z0, i32* %5, align 4
|
||||
%funarg31 = getelementptr i8* %ptri64.i, i64 20
|
||||
%6 = bitcast i8* %funarg31 to i32*
|
||||
store i32 %z1, i32* %6, align 4
|
||||
%funarg32 = getelementptr i8* %ptri64.i, i64 24
|
||||
%7 = bitcast i8* %funarg32 to i32*
|
||||
store i32 %Nx, i32* %7, align 4
|
||||
%funarg33 = getelementptr i8* %ptri64.i, i64 28
|
||||
%8 = bitcast i8* %funarg33 to i32*
|
||||
store i32 %Ny, i32* %8, align 4
|
||||
%funarg34 = getelementptr i8* %ptri64.i, i64 32
|
||||
%9 = bitcast i8* %funarg34 to i32*
|
||||
store i32 %Nz, i32* %9, align 4
|
||||
%funarg35 = getelementptr i8* %ptri64.i, i64 40
|
||||
%10 = bitcast i8* %funarg35 to double**
|
||||
store double* %coef, double** %10, align 8
|
||||
%funarg36 = getelementptr i8* %ptri64.i, i64 48
|
||||
%11 = bitcast i8* %funarg36 to double**
|
||||
store double* %vsq, double** %11, align 8
|
||||
%funarg37 = getelementptr i8* %ptri64.i, i64 56
|
||||
%12 = bitcast i8* %funarg37 to double**
|
||||
store double* %Aeven, double** %12, align 8
|
||||
%funarg38 = getelementptr i8* %ptri64.i, i64 64
|
||||
%13 = bitcast i8* %funarg38 to double**
|
||||
store double* %Aodd, double** %13, align 8
|
||||
br label %if_false
|
||||
|
||||
if_false: ; preds = %if_true, %if_then
|
||||
%tid.i.i80 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
||||
%and.i81 = and i32 %tid.i.i80, 31
|
||||
%cmp.i82 = icmp eq i32 %and.i81, 0
|
||||
br i1 %cmp.i82, label %if.then.i83, label %if_exit
|
||||
|
||||
if.then.i83: ; preds = %if_false
|
||||
%args_i64.i = ptrtoint i8* %ptri64.i to i64
|
||||
%res_tmp.i = tail call i32 asm sideeffect "{\0A .param .b64 param0;\0A st.param.b64\09[param0+0], $1;\0A .param .b64 param1;\0A st.param.b64\09[param1+0], $2;\0A .param .align 4 .b8 param2[12];\0A st.param.b32\09[param2+0], $3; \0A st.param.b32\09[param2+4], $4; \0A st.param.b32\09[param2+8], $5; \0A .param .align 4 .b8 param3[12];\0A st.param.b32\09[param3+0], $6; \0A st.param.b32\09[param3+4], $7; \0A st.param.b32\09[param3+8], $8; \0A .param .b32 param4;\0A st.param.b32\09[param4+0], $9; \0A .param .b64 param5;\0A st.param.b64\09[param5+0], $10; \0A\0A .param .b32 retval0;\0A call.uni (retval0), \0A cudaLaunchDevice,\0A (\0A param0, \0A param1, \0A param2, \0A param3, \0A param4, \0A param5\0A );\0A ld.param.b32\09$0, [retval0+0];\0A }\0A ", "=r, l,l, r,r,r, r,r,r, r,l"(i64 ptrtoint (void (i32, i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*)* @stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_ to i64), i64 %args_i64.i, i32 %nbx.i, i32 %div_sub_add_sub_y1_load23_y0_load24___, i32 %div_sub_add_sub_z1_load25_z0_load26___, i32 128, i32 1, i32 1, i32 0, i64 0)
|
||||
br label %if_exit
|
||||
|
||||
if_true61: ; preds = %if_else
|
||||
%funarg64 = bitcast i8* %ptri64.i to i32*
|
||||
store i32 %x0, i32* %funarg64, align 4
|
||||
%funarg65 = getelementptr i8* %ptri64.i, i64 4
|
||||
%14 = bitcast i8* %funarg65 to i32*
|
||||
store i32 %x1, i32* %14, align 4
|
||||
%funarg66 = getelementptr i8* %ptri64.i, i64 8
|
||||
%15 = bitcast i8* %funarg66 to i32*
|
||||
store i32 %y0, i32* %15, align 4
|
||||
%funarg67 = getelementptr i8* %ptri64.i, i64 12
|
||||
%16 = bitcast i8* %funarg67 to i32*
|
||||
store i32 %y1, i32* %16, align 4
|
||||
%funarg68 = getelementptr i8* %ptri64.i, i64 16
|
||||
%17 = bitcast i8* %funarg68 to i32*
|
||||
store i32 %z0, i32* %17, align 4
|
||||
%funarg69 = getelementptr i8* %ptri64.i, i64 20
|
||||
%18 = bitcast i8* %funarg69 to i32*
|
||||
store i32 %z1, i32* %18, align 4
|
||||
%funarg70 = getelementptr i8* %ptri64.i, i64 24
|
||||
%19 = bitcast i8* %funarg70 to i32*
|
||||
store i32 %Nx, i32* %19, align 4
|
||||
%funarg71 = getelementptr i8* %ptri64.i, i64 28
|
||||
%20 = bitcast i8* %funarg71 to i32*
|
||||
store i32 %Ny, i32* %20, align 4
|
||||
%funarg72 = getelementptr i8* %ptri64.i, i64 32
|
||||
%21 = bitcast i8* %funarg72 to i32*
|
||||
store i32 %Nz, i32* %21, align 4
|
||||
%funarg73 = getelementptr i8* %ptri64.i, i64 40
|
||||
%22 = bitcast i8* %funarg73 to double**
|
||||
store double* %coef, double** %22, align 8
|
||||
%funarg74 = getelementptr i8* %ptri64.i, i64 48
|
||||
%23 = bitcast i8* %funarg74 to double**
|
||||
store double* %vsq, double** %23, align 8
|
||||
%funarg75 = getelementptr i8* %ptri64.i, i64 56
|
||||
%24 = bitcast i8* %funarg75 to double**
|
||||
store double* %Aodd, double** %24, align 8
|
||||
%funarg76 = getelementptr i8* %ptri64.i, i64 64
|
||||
%25 = bitcast i8* %funarg76 to double**
|
||||
store double* %Aeven, double** %25, align 8
|
||||
br label %if_false62
|
||||
|
||||
if_false62: ; preds = %if_true61, %if_else
|
||||
%tid.i.i84 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
||||
%and.i85 = and i32 %tid.i.i84, 31
|
||||
%cmp.i86 = icmp eq i32 %and.i85, 0
|
||||
br i1 %cmp.i86, label %if.then.i92, label %if_exit
|
||||
|
||||
if.then.i92: ; preds = %if_false62
|
||||
%args_i64.i90 = ptrtoint i8* %ptri64.i to i64
|
||||
%res_tmp.i91 = tail call i32 asm sideeffect "{\0A .param .b64 param0;\0A st.param.b64\09[param0+0], $1;\0A .param .b64 param1;\0A st.param.b64\09[param1+0], $2;\0A .param .align 4 .b8 param2[12];\0A st.param.b32\09[param2+0], $3; \0A st.param.b32\09[param2+4], $4; \0A st.param.b32\09[param2+8], $5; \0A .param .align 4 .b8 param3[12];\0A st.param.b32\09[param3+0], $6; \0A st.param.b32\09[param3+4], $7; \0A st.param.b32\09[param3+8], $8; \0A .param .b32 param4;\0A st.param.b32\09[param4+0], $9; \0A .param .b64 param5;\0A st.param.b64\09[param5+0], $10; \0A\0A .param .b32 retval0;\0A call.uni (retval0), \0A cudaLaunchDevice,\0A (\0A param0, \0A param1, \0A param2, \0A param3, \0A param4, \0A param5\0A );\0A ld.param.b32\09$0, [retval0+0];\0A }\0A ", "=r, l,l, r,r,r, r,r,r, r,l"(i64 ptrtoint (void (i32, i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*)* @stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_ to i64), i64 %args_i64.i90, i32 %nbx.i, i32 %div_sub_add_sub_y1_load23_y0_load24___, i32 %div_sub_add_sub_z1_load25_z0_load26___, i32 128, i32 1, i32 1, i32 0, i64 0)
|
||||
br label %if_exit
|
||||
}
|
||||
|
||||
define void @loop_stencil_ispc_tasks(i32 %t0, i32 %t1, i32 %x0, i32 %x1, i32 %y0, i32 %y1, i32 %z0, i32 %z1, i32 %Nx, i32 %Ny, i32 %Nz, double* %coef, double* %vsq, double* %Aeven, double* %Aodd) {
|
||||
allocas:
|
||||
%less_t_load_t1_load94 = icmp slt i32 %t0, %t1
|
||||
br i1 %less_t_load_t1_load94, label %for_loop.lr.ph, label %for_exit
|
||||
|
||||
for_loop.lr.ph: ; preds = %allocas
|
||||
%add_sub_x1_load21_x0_load22_ = sub i32 31, %x0
|
||||
%sub_add_sub_x1_load21_x0_load22__ = add i32 %add_sub_x1_load21_x0_load22_, %x1
|
||||
%div_sub_add_sub_x1_load21_x0_load22___ = sdiv i32 %sub_add_sub_x1_load21_x0_load22__, 32
|
||||
%add_sub_y1_load23_y0_load24_ = sub i32 7, %y0
|
||||
%sub_add_sub_y1_load23_y0_load24__ = add i32 %add_sub_y1_load23_y0_load24_, %y1
|
||||
%div_sub_add_sub_y1_load23_y0_load24___ = sdiv i32 %sub_add_sub_y1_load23_y0_load24__, 8
|
||||
%add_sub_z1_load25_z0_load26_ = sub i32 7, %z0
|
||||
%sub_add_sub_z1_load25_z0_load26__ = add i32 %add_sub_z1_load25_z0_load26_, %z1
|
||||
%div_sub_add_sub_z1_load25_z0_load26___ = sdiv i32 %sub_add_sub_z1_load25_z0_load26__, 8
|
||||
%ntxm1.i = add nsw i32 %div_sub_add_sub_x1_load21_x0_load22___, -1
|
||||
%ntxm1d4.i = ashr i32 %ntxm1.i, 2
|
||||
%nbx.i = add nsw i32 %ntxm1d4.i, 1
|
||||
br label %for_loop
|
||||
|
||||
for_loop: ; preds = %if_exit, %for_loop.lr.ph
|
||||
%t.095 = phi i32 [ %t0, %for_loop.lr.ph ], [ %t_load78_plus1, %if_exit ]
|
||||
%bitop = and i32 %t.095, 1
|
||||
%equal_bitop_ = icmp eq i32 %bitop, 0
|
||||
%tid.i.i = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
||||
%and.i = and i32 %tid.i.i, 31
|
||||
%cmp.i = icmp eq i32 %and.i, 0
|
||||
br i1 %cmp.i, label %if.then.i, label %ISPCGetParamBuffer.exit
|
||||
|
||||
if.then.i: ; preds = %for_loop
|
||||
%ptri64tmp.i = tail call i64 @cudaGetParameterBuffer(i64 8, i64 72)
|
||||
%phitmp.i = inttoptr i64 %ptri64tmp.i to i8*
|
||||
br label %ISPCGetParamBuffer.exit
|
||||
|
||||
ISPCGetParamBuffer.exit: ; preds = %if.then.i, %for_loop
|
||||
%ptri64.i = phi i8* [ %phitmp.i, %if.then.i ], [ null, %for_loop ]
|
||||
%cmp1 = icmp eq i8* %ptri64.i, null
|
||||
br i1 %equal_bitop_, label %if_then, label %if_else
|
||||
|
||||
for_exit: ; preds = %if_exit, %allocas
|
||||
%0 = tail call i32 @cudaDeviceSynchronize()
|
||||
ret void
|
||||
|
||||
if_then: ; preds = %ISPCGetParamBuffer.exit
|
||||
br i1 %cmp1, label %if_false, label %if_true
|
||||
|
||||
if_else: ; preds = %ISPCGetParamBuffer.exit
|
||||
br i1 %cmp1, label %if_false62, label %if_true61
|
||||
|
||||
if_exit: ; preds = %if.then.i92, %if_false62, %if.then.i83, %if_false
|
||||
%1 = tail call i32 @cudaDeviceSynchronize()
|
||||
%t_load78_plus1 = add i32 %t.095, 1
|
||||
%exitcond = icmp eq i32 %t_load78_plus1, %t1
|
||||
br i1 %exitcond, label %for_exit, label %for_loop
|
||||
|
||||
if_true: ; preds = %if_then
|
||||
%funarg = bitcast i8* %ptri64.i to i32*
|
||||
store i32 %x0, i32* %funarg, align 4
|
||||
%funarg27 = getelementptr i8* %ptri64.i, i64 4
|
||||
%2 = bitcast i8* %funarg27 to i32*
|
||||
store i32 %x1, i32* %2, align 4
|
||||
%funarg28 = getelementptr i8* %ptri64.i, i64 8
|
||||
%3 = bitcast i8* %funarg28 to i32*
|
||||
store i32 %y0, i32* %3, align 4
|
||||
%funarg29 = getelementptr i8* %ptri64.i, i64 12
|
||||
%4 = bitcast i8* %funarg29 to i32*
|
||||
store i32 %y1, i32* %4, align 4
|
||||
%funarg30 = getelementptr i8* %ptri64.i, i64 16
|
||||
%5 = bitcast i8* %funarg30 to i32*
|
||||
store i32 %z0, i32* %5, align 4
|
||||
%funarg31 = getelementptr i8* %ptri64.i, i64 20
|
||||
%6 = bitcast i8* %funarg31 to i32*
|
||||
store i32 %z1, i32* %6, align 4
|
||||
%funarg32 = getelementptr i8* %ptri64.i, i64 24
|
||||
%7 = bitcast i8* %funarg32 to i32*
|
||||
store i32 %Nx, i32* %7, align 4
|
||||
%funarg33 = getelementptr i8* %ptri64.i, i64 28
|
||||
%8 = bitcast i8* %funarg33 to i32*
|
||||
store i32 %Ny, i32* %8, align 4
|
||||
%funarg34 = getelementptr i8* %ptri64.i, i64 32
|
||||
%9 = bitcast i8* %funarg34 to i32*
|
||||
store i32 %Nz, i32* %9, align 4
|
||||
%funarg35 = getelementptr i8* %ptri64.i, i64 40
|
||||
%10 = bitcast i8* %funarg35 to double**
|
||||
store double* %coef, double** %10, align 8
|
||||
%funarg36 = getelementptr i8* %ptri64.i, i64 48
|
||||
%11 = bitcast i8* %funarg36 to double**
|
||||
store double* %vsq, double** %11, align 8
|
||||
%funarg37 = getelementptr i8* %ptri64.i, i64 56
|
||||
%12 = bitcast i8* %funarg37 to double**
|
||||
store double* %Aeven, double** %12, align 8
|
||||
%funarg38 = getelementptr i8* %ptri64.i, i64 64
|
||||
%13 = bitcast i8* %funarg38 to double**
|
||||
store double* %Aodd, double** %13, align 8
|
||||
br label %if_false
|
||||
|
||||
if_false: ; preds = %if_true, %if_then
|
||||
%tid.i.i80 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
||||
%and.i81 = and i32 %tid.i.i80, 31
|
||||
%cmp.i82 = icmp eq i32 %and.i81, 0
|
||||
br i1 %cmp.i82, label %if.then.i83, label %if_exit
|
||||
|
||||
if.then.i83: ; preds = %if_false
|
||||
%args_i64.i = ptrtoint i8* %ptri64.i to i64
|
||||
%res_tmp.i = tail call i32 asm sideeffect "{\0A .param .b64 param0;\0A st.param.b64\09[param0+0], $1;\0A .param .b64 param1;\0A st.param.b64\09[param1+0], $2;\0A .param .align 4 .b8 param2[12];\0A st.param.b32\09[param2+0], $3; \0A st.param.b32\09[param2+4], $4; \0A st.param.b32\09[param2+8], $5; \0A .param .align 4 .b8 param3[12];\0A st.param.b32\09[param3+0], $6; \0A st.param.b32\09[param3+4], $7; \0A st.param.b32\09[param3+8], $8; \0A .param .b32 param4;\0A st.param.b32\09[param4+0], $9; \0A .param .b64 param5;\0A st.param.b64\09[param5+0], $10; \0A\0A .param .b32 retval0;\0A call.uni (retval0), \0A cudaLaunchDevice,\0A (\0A param0, \0A param1, \0A param2, \0A param3, \0A param4, \0A param5\0A );\0A ld.param.b32\09$0, [retval0+0];\0A }\0A ", "=r, l,l, r,r,r, r,r,r, r,l"(i64 ptrtoint (void (i32, i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*)* @stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_ to i64), i64 %args_i64.i, i32 %nbx.i, i32 %div_sub_add_sub_y1_load23_y0_load24___, i32 %div_sub_add_sub_z1_load25_z0_load26___, i32 128, i32 1, i32 1, i32 0, i64 0)
|
||||
br label %if_exit
|
||||
|
||||
if_true61: ; preds = %if_else
|
||||
%funarg64 = bitcast i8* %ptri64.i to i32*
|
||||
store i32 %x0, i32* %funarg64, align 4
|
||||
%funarg65 = getelementptr i8* %ptri64.i, i64 4
|
||||
%14 = bitcast i8* %funarg65 to i32*
|
||||
store i32 %x1, i32* %14, align 4
|
||||
%funarg66 = getelementptr i8* %ptri64.i, i64 8
|
||||
%15 = bitcast i8* %funarg66 to i32*
|
||||
store i32 %y0, i32* %15, align 4
|
||||
%funarg67 = getelementptr i8* %ptri64.i, i64 12
|
||||
%16 = bitcast i8* %funarg67 to i32*
|
||||
store i32 %y1, i32* %16, align 4
|
||||
%funarg68 = getelementptr i8* %ptri64.i, i64 16
|
||||
%17 = bitcast i8* %funarg68 to i32*
|
||||
store i32 %z0, i32* %17, align 4
|
||||
%funarg69 = getelementptr i8* %ptri64.i, i64 20
|
||||
%18 = bitcast i8* %funarg69 to i32*
|
||||
store i32 %z1, i32* %18, align 4
|
||||
%funarg70 = getelementptr i8* %ptri64.i, i64 24
|
||||
%19 = bitcast i8* %funarg70 to i32*
|
||||
store i32 %Nx, i32* %19, align 4
|
||||
%funarg71 = getelementptr i8* %ptri64.i, i64 28
|
||||
%20 = bitcast i8* %funarg71 to i32*
|
||||
store i32 %Ny, i32* %20, align 4
|
||||
%funarg72 = getelementptr i8* %ptri64.i, i64 32
|
||||
%21 = bitcast i8* %funarg72 to i32*
|
||||
store i32 %Nz, i32* %21, align 4
|
||||
%funarg73 = getelementptr i8* %ptri64.i, i64 40
|
||||
%22 = bitcast i8* %funarg73 to double**
|
||||
store double* %coef, double** %22, align 8
|
||||
%funarg74 = getelementptr i8* %ptri64.i, i64 48
|
||||
%23 = bitcast i8* %funarg74 to double**
|
||||
store double* %vsq, double** %23, align 8
|
||||
%funarg75 = getelementptr i8* %ptri64.i, i64 56
|
||||
%24 = bitcast i8* %funarg75 to double**
|
||||
store double* %Aodd, double** %24, align 8
|
||||
%funarg76 = getelementptr i8* %ptri64.i, i64 64
|
||||
%25 = bitcast i8* %funarg76 to double**
|
||||
store double* %Aeven, double** %25, align 8
|
||||
br label %if_false62
|
||||
|
||||
if_false62: ; preds = %if_true61, %if_else
|
||||
%tid.i.i84 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
||||
%and.i85 = and i32 %tid.i.i84, 31
|
||||
%cmp.i86 = icmp eq i32 %and.i85, 0
|
||||
br i1 %cmp.i86, label %if.then.i92, label %if_exit
|
||||
|
||||
if.then.i92: ; preds = %if_false62
|
||||
%args_i64.i90 = ptrtoint i8* %ptri64.i to i64
|
||||
%res_tmp.i91 = tail call i32 asm sideeffect "{\0A .param .b64 param0;\0A st.param.b64\09[param0+0], $1;\0A .param .b64 param1;\0A st.param.b64\09[param1+0], $2;\0A .param .align 4 .b8 param2[12];\0A st.param.b32\09[param2+0], $3; \0A st.param.b32\09[param2+4], $4; \0A st.param.b32\09[param2+8], $5; \0A .param .align 4 .b8 param3[12];\0A st.param.b32\09[param3+0], $6; \0A st.param.b32\09[param3+4], $7; \0A st.param.b32\09[param3+8], $8; \0A .param .b32 param4;\0A st.param.b32\09[param4+0], $9; \0A .param .b64 param5;\0A st.param.b64\09[param5+0], $10; \0A\0A .param .b32 retval0;\0A call.uni (retval0), \0A cudaLaunchDevice,\0A (\0A param0, \0A param1, \0A param2, \0A param3, \0A param4, \0A param5\0A );\0A ld.param.b32\09$0, [retval0+0];\0A }\0A ", "=r, l,l, r,r,r, r,r,r, r,l"(i64 ptrtoint (void (i32, i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*)* @stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_ to i64), i64 %args_i64.i90, i32 %nbx.i, i32 %div_sub_add_sub_y1_load23_y0_load24___, i32 %div_sub_add_sub_z1_load25_z0_load26___, i32 128, i32 1, i32 1, i32 0, i64 0)
|
||||
br label %if_exit
|
||||
}
|
||||
|
||||
!llvm.ident = !{!0}
|
||||
!nvvm.annotations = !{!1, !2}
|
||||
|
||||
!0 = metadata !{metadata !"clang version 3.4 (trunk 194723)"}
|
||||
!1 = metadata !{void (i32, i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*)* @stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_, metadata !"kernel", i32 1}
|
||||
!2 = metadata !{void (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*)* @loop_stencil_ispc_tasks, metadata !"kernel", i32 1}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,172 +0,0 @@
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define _CRT_SECURE_NO_WARNINGS
|
||||
#define NOMINMAX
|
||||
#pragma warning (disable: 4244)
|
||||
#pragma warning (disable: 4305)
|
||||
#endif
|
||||
|
||||
#include <stdio.h>
|
||||
#include <algorithm>
|
||||
#include <math.h>
|
||||
#include "../timing.h"
|
||||
#include "stencil_ispc.h"
|
||||
using namespace ispc;
|
||||
|
||||
#include <sys/time.h>
|
||||
|
||||
|
||||
double rtc(void)
|
||||
{
|
||||
struct timeval Tvalue;
|
||||
double etime;
|
||||
struct timezone dummy;
|
||||
|
||||
gettimeofday(&Tvalue,&dummy);
|
||||
etime = (double) Tvalue.tv_sec +
|
||||
1.e-6*((double) Tvalue.tv_usec);
|
||||
return etime;
|
||||
}
|
||||
|
||||
|
||||
extern void loop_stencil_serial(int t0, int t1, int x0, int x1,
|
||||
int y0, int y1, int z0, int z1,
|
||||
int Nx, int Ny, int Nz,
|
||||
const double coef[5],
|
||||
const double vsq[],
|
||||
double Aeven[], double Aodd[]);
|
||||
|
||||
|
||||
void InitData(int Nx, int Ny, int Nz, double *A[2], double *vsq) {
|
||||
int offset = 0;
|
||||
for (int z = 0; z < Nz; ++z)
|
||||
for (int y = 0; y < Ny; ++y)
|
||||
for (int x = 0; x < Nx; ++x, ++offset) {
|
||||
A[0][offset] = (x < Nx / 2) ? x / double(Nx) : y / double(Ny);
|
||||
A[1][offset] = 0;
|
||||
vsq[offset] = x*y*z / double(Nx * Ny * Nz);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
int main() {
|
||||
int Nx = 256, Ny = 256, Nz = 256;
|
||||
int width = 4;
|
||||
double *Aserial[2], *Aispc[2];
|
||||
Aserial[0] = new double [Nx * Ny * Nz];
|
||||
Aserial[1] = new double [Nx * Ny * Nz];
|
||||
Aispc[0] = new double [Nx * Ny * Nz];
|
||||
Aispc[1] = new double [Nx * Ny * Nz];
|
||||
double *vsq = new double [Nx * Ny * Nz];
|
||||
|
||||
double coeff[4] = { 0.5, -.25, .125, -.0625 };
|
||||
|
||||
// InitData(Nx, Ny, Nz, Aispc, vsq);
|
||||
|
||||
//
|
||||
// Compute the image using the ispc implementation on one core; report
|
||||
// the minimum time of three runs.
|
||||
//
|
||||
double minTimeISPC = 1e30;
|
||||
#if 0
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
reset_and_start_timer();
|
||||
loop_stencil_ispc(0, 6, width, Nx - width, width, Ny - width,
|
||||
width, Nz - width, Nx, Ny, Nz, coeff, vsq,
|
||||
Aispc[0], Aispc[1]);
|
||||
double dt = get_elapsed_mcycles();
|
||||
minTimeISPC = std::min(minTimeISPC, dt);
|
||||
}
|
||||
|
||||
printf("[stencil ispc 1 core]:\t\t[%.3f] million cycles\n", minTimeISPC);
|
||||
#endif
|
||||
|
||||
fprintf(stderr, " -- init -- \n");
|
||||
InitData(Nx, Ny, Nz, Aispc, vsq);
|
||||
fprintf(stderr, " -- done init -- \n");
|
||||
|
||||
//
|
||||
// Compute the image using the ispc implementation with tasks; report
|
||||
// the minimum time of three runs.
|
||||
//
|
||||
double minTimeISPCTasks = 1e30;
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
reset_and_start_timer();
|
||||
const double t0 = rtc();
|
||||
loop_stencil_ispc_tasks(0, 6, width, Nx - width, width, Ny - width,
|
||||
width, Nz - width, Nx, Ny, Nz, coeff, vsq,
|
||||
Aispc[0], Aispc[1]);
|
||||
double dt = 1e3*(rtc() - t0); //get_elapsed_mcycles();
|
||||
minTimeISPCTasks = std::min(minTimeISPCTasks, dt);
|
||||
}
|
||||
|
||||
fprintf(stderr, "[stencil ispc + tasks]:\t\t[%.3f] million cycles\n", minTimeISPCTasks);
|
||||
|
||||
|
||||
InitData(Nx, Ny, Nz, Aserial, vsq);
|
||||
|
||||
//
|
||||
// And run the serial implementation 3 times, again reporting the
|
||||
// minimum time.
|
||||
//
|
||||
double minTimeSerial = 1e30;
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
reset_and_start_timer();
|
||||
loop_stencil_serial(0, 6, width, Nx-width, width, Ny - width,
|
||||
width, Nz - width, Nx, Ny, Nz, coeff, vsq,
|
||||
Aserial[0], Aserial[1]);
|
||||
double dt = get_elapsed_mcycles();
|
||||
minTimeSerial = std::min(minTimeSerial, dt);
|
||||
}
|
||||
|
||||
printf("[stencil serial]:\t\t[%.3f] million cycles\n", minTimeSerial);
|
||||
|
||||
printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n",
|
||||
minTimeSerial / minTimeISPC, minTimeSerial / minTimeISPCTasks);
|
||||
|
||||
// Check for agreement
|
||||
int offset = 0;
|
||||
for (int z = 0; z < Nz; ++z)
|
||||
for (int y = 0; y < Ny; ++y)
|
||||
for (int x = 0; x < Nx; ++x, ++offset) {
|
||||
double error = fabsf((Aserial[1][offset] - Aispc[1][offset]) /
|
||||
Aserial[1][offset]);
|
||||
if (error > 1e-4)
|
||||
printf("Error @ (%d,%d,%d): ispc = %f, serial = %f\n",
|
||||
x, y, z, Aispc[1][offset], Aserial[1][offset]);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -1,172 +0,0 @@
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifdef __NVPTX__
|
||||
#warning "emitting DEVICE code"
|
||||
#define taskIndex blockIndex0()
|
||||
#define taskCount blockCount0()
|
||||
#define programIndex laneIndex()
|
||||
#define programCount warpSize()
|
||||
#else
|
||||
#warning "emitting HOST code"
|
||||
#endif
|
||||
|
||||
// One time step of a 4-coefficient (radius-3, 25-point) 3D stencil over the
// sub-volume [x0,x1) x [y0,y1) x [z0,z1) of an Nx x Ny x Nz grid stored
// x-fastest.  The update
//     Aout = 2*Ain - Aout + vsq*div
// reads the PREVIOUS time step from Aout before overwriting it, i.e. a
// leapfrog-style second-order time integration -- callers must ping-pong
// the two buffers between steps.
static inline void
stencil_step(uniform int x0, uniform int x1,
             uniform int y0, uniform int y1,
             uniform int z0, uniform int z1,
             uniform int Nx, uniform int Ny, uniform int Nz,
             uniform const double coef[4], uniform const double vsq[],
             uniform const double Ain[], uniform double Aout[]) {
    const uniform int Nxy = Nx * Ny;    // stride of one z-plane

    // foreach (z = z0 ... z1, y = y0 ... y1, x = x0 ... x1)
// Flip to 1 to use the VER1 variant below, which indexes through
// precomputed linear offsets instead of (x,y,z) deltas.
#if 0
#define VER1
#endif

#ifdef VER1
    // Linear offsets for the +/-1, +/-2, +/-3 neighbors along each axis.
    const uniform long x1o = 1;
    const uniform long x2o = 2;
    const uniform long x3o = 3;
    const uniform long y1o = Nx;
    const uniform long y2o = Nx*2;
    const uniform long y3o = Nx*3;
    const uniform long z1o = Nxy;
    const uniform long z2o = Nxy*2;
    const uniform long z3o = Nxy*3;
#endif
    for (uniform int z = z0; z < z1; z++)
        for (uniform int y = y0; y < y1; y++)
        {
            const int index_base = (z * Nxy) + (y * Nx);
            // Hand-vectorized inner loop: each iteration covers one gang of
            // programCount consecutive x values.
            for (uniform int xb = x0; xb < x1; xb += programCount)
            {
                const int x = xb + programIndex;
                int index = index_base + x;
#ifndef VER1
#define A_cur(x, y, z) Ain[index + (x) + ((y) * Nx) + ((z) * Nxy)]
#define A_next(x, y, z) Aout[index + (x) + ((y) * Nx) + ((z) * Nxy)]
                // Weighted sum of the center and the 6 axial neighbors at
                // distances 1, 2 and 3.
                // NOTE(review): these gathers run for ALL lanes, including
                // those with x >= x1 in the final partial gang; presumably
                // the `width` halo around the volume keeps them in bounds --
                // confirm against the caller's x1 <= Nx - width.
                double div = coef[0] * A_cur(0, 0, 0) +
                    coef[1] * (A_cur(+1, 0, 0) + A_cur(-1, 0, 0) +
                               A_cur(0, +1, 0) + A_cur(0, -1, 0) +
                               A_cur(0, 0, +1) + A_cur(0, 0, -1)) +
                    coef[2] * (A_cur(+2, 0, 0) + A_cur(-2, 0, 0) +
                               A_cur(0, +2, 0) + A_cur(0, -2, 0) +
                               A_cur(0, 0, +2) + A_cur(0, 0, -2)) +
                    coef[3] * (A_cur(+3, 0, 0) + A_cur(-3, 0, 0) +
                               A_cur(0, +3, 0) + A_cur(0, -3, 0) +
                               A_cur(0, 0, +3) + A_cur(0, 0, -3));
#else
                // VER1: same sum expressed through the precomputed offsets.
#define A_cur(x, y, z) Ain [index + (x) + (y) + (z)]
#define A_next(x, y, z) Aout[index + (x) + (y) + (z)]
                double div = coef[0] * A_cur(0, 0, 0) +
                    coef[1] * (A_cur(+x1o, 0, 0) + A_cur(-x1o, 0, 0) +
                               A_cur(0, +y1o, 0) + A_cur(0, -y1o, 0) +
                               A_cur(0, 0, +z1o) + A_cur(0, 0, -z1o)) +
                    coef[2] * (A_cur(+x2o, 0, 0) + A_cur(-x2o, 0, 0) +
                               A_cur(0, +y2o, 0) + A_cur(0, -y2o, 0) +
                               A_cur(0, 0, +z2o) + A_cur(0, 0, -z2o)) +
                    coef[3] * (A_cur(+x3o, 0, 0) + A_cur(-x3o, 0, 0) +
                               A_cur(0, +y3o, 0) + A_cur(0, -y3o, 0) +
                               A_cur(0, 0, +z3o) + A_cur(0, 0, -z3o));
#endif

                // Only in-range lanes write back; 2.0d0 is an ispc
                // Fortran-style double-precision constant (2.0 x 10^0).
                if (x < x1)
                    A_next(0, 0, 0) = 2.0d0 * A_cur(0, 0, 0) - A_next(0, 0, 0) +
                        vsq[index] * div;
            }
        }
}
|
||||
|
||||
|
||||
// Task body: each task updates exactly one z-plane, z0 + taskIndex, of the
// volume (z extent of 1), so a launch of z1-z0 tasks covers [z0, z1).
static task void
stencil_step_task(uniform int x0, uniform int x1,
                  uniform int y0, uniform int y1,
                  uniform int z0,
                  uniform int Nx, uniform int Ny, uniform int Nz,
                  uniform const double coef[4], uniform const double vsq[],
                  uniform const double Ain[], uniform double Aout[]) {
    // Ignore any task indices beyond the requested launch count.
    if (taskIndex < taskCount) {
        uniform int zSlice = z0 + taskIndex;
        stencil_step(x0, x1, y0, y1, zSlice, zSlice + 1,
                     Nx, Ny, Nz, coef, vsq, Ain, Aout);
    }
}
|
||||
|
||||
|
||||
// Task-parallel driver: advances the field t1-t0 time steps, ping-ponging
// between the Aeven and Aodd buffers, with one task per z-plane per step.
export void
loop_stencil_ispc_tasks(uniform int t0, uniform int t1,
                        uniform int x0, uniform int x1,
                        uniform int y0, uniform int y1,
                        uniform int z0, uniform int z1,
                        uniform int Nx, uniform int Ny, uniform int Nz,
                        uniform const double coef[4],
                        uniform const double vsq[],
                        uniform double Aeven[], uniform double Aodd[])
{
    for (uniform int t = t0; t < t1; ++t) {
        // Even steps read Aeven and write Aodd; odd steps the reverse.
        uniform double * uniform src = ((t & 1) == 0) ? Aeven : Aodd;
        uniform double * uniform dst = ((t & 1) == 0) ? Aodd : Aeven;

        // Parallelize across cores as well: each task works on a slice
        // of 1 in the z extent of the volume.
        launch[z1-z0] stencil_step_task(x0, x1, y0, y1, z0, Nx, Ny, Nz,
                                        coef, vsq, src, dst);

        // Every slice of step t must finish before step t+1 starts.
        sync;
    }
}
|
||||
|
||||
|
||||
// Single-core driver: advances the field t1-t0 time steps on the calling
// gang only, ping-ponging between the Aeven and Aodd buffers.
export void
loop_stencil_ispc(uniform int t0, uniform int t1,
                  uniform int x0, uniform int x1,
                  uniform int y0, uniform int y1,
                  uniform int z0, uniform int z1,
                  uniform int Nx, uniform int Ny, uniform int Nz,
                  uniform const double coef[4],
                  uniform const double vsq[],
                  uniform double Aeven[], uniform double Aodd[])
{
    for (uniform int t = t0; t < t1; ++t) {
        // Even steps read Aeven and write Aodd; odd steps the reverse.
        uniform double * uniform src = ((t & 1) == 0) ? Aeven : Aodd;
        uniform double * uniform dst = ((t & 1) == 0) ? Aodd : Aeven;
        stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq,
                     src, dst);
    }
}
|
||||
Binary file not shown.
Reference in New Issue
Block a user