added examples
This commit is contained in:
@@ -2,7 +2,7 @@
|
||||
EXAMPLE=ao
|
||||
CPP_SRC=ao.cpp ao_serial.cpp
|
||||
ISPC_SRC=ao.ispc
|
||||
ISPC_IA_TARGETS=avx1-i32x8
|
||||
ISPC_IA_TARGETS=sse2-i32x4,sse4-i32x4,avx1-i32x8,avx2-i32x8
|
||||
ISPC_ARM_TARGETS=neon
|
||||
|
||||
include ../common.mk
|
||||
|
||||
Binary file not shown.
@@ -1,139 +0,0 @@
|
||||
/*
|
||||
Copyright (c) 2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define ISPC_IS_WINDOWS
|
||||
#define NOMINMAX
|
||||
#elif defined(__linux__)
|
||||
#define ISPC_IS_LINUX
|
||||
#elif defined(__APPLE__)
|
||||
#define ISPC_IS_APPLE
|
||||
#endif
|
||||
|
||||
#include <fcntl.h>
|
||||
#include <float.h>
|
||||
#include <math.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <sys/types.h>
|
||||
#include <stdint.h>
|
||||
#include <algorithm>
|
||||
#include <assert.h>
|
||||
#include <vector>
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
#include <windows.h>
|
||||
#endif
|
||||
#include "deferred.h"
|
||||
#include "kernels_ispc.h"
|
||||
#include "../timing.h"
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
if (argc != 2) {
|
||||
printf("usage: deferred_shading <input_file (e.g. data/pp1280x720.bin)>\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
InputData *input = CreateInputDataFromFile(argv[1]);
|
||||
if (!input) {
|
||||
printf("Failed to load input file \"%s\"!\n", argv[1]);
|
||||
return 1;
|
||||
}
|
||||
|
||||
Framebuffer framebuffer(input->header.framebufferWidth,
|
||||
input->header.framebufferHeight);
|
||||
|
||||
InitDynamicC(input);
|
||||
#ifdef __cilk
|
||||
InitDynamicCilk(input);
|
||||
#endif // __cilk
|
||||
|
||||
int nframes = 5;
|
||||
double ispcCycles = 1e30;
|
||||
for (int i = 0; i < 5; ++i) {
|
||||
framebuffer.clear();
|
||||
reset_and_start_timer();
|
||||
for (int j = 0; j < nframes; ++j)
|
||||
ispc::RenderStatic(input->header, input->arrays,
|
||||
VISUALIZE_LIGHT_COUNT,
|
||||
framebuffer.r, framebuffer.g, framebuffer.b);
|
||||
double mcycles = get_elapsed_mcycles() / nframes;
|
||||
ispcCycles = std::min(ispcCycles, mcycles);
|
||||
}
|
||||
printf("[ispc static + tasks]:\t\t[%.3f] million cycles to render "
|
||||
"%d x %d image\n", ispcCycles,
|
||||
input->header.framebufferWidth, input->header.framebufferHeight);
|
||||
WriteFrame("deferred-ispc-static.ppm", input, framebuffer);
|
||||
|
||||
#ifdef __cilk
|
||||
double dynamicCilkCycles = 1e30;
|
||||
for (int i = 0; i < 5; ++i) {
|
||||
framebuffer.clear();
|
||||
reset_and_start_timer();
|
||||
for (int j = 0; j < nframes; ++j)
|
||||
DispatchDynamicCilk(input, &framebuffer);
|
||||
double mcycles = get_elapsed_mcycles() / nframes;
|
||||
dynamicCilkCycles = std::min(dynamicCilkCycles, mcycles);
|
||||
}
|
||||
printf("[ispc + Cilk dynamic]:\t\t[%.3f] million cycles to render image\n",
|
||||
dynamicCilkCycles);
|
||||
WriteFrame("deferred-ispc-dynamic.ppm", input, framebuffer);
|
||||
#endif // __cilk
|
||||
|
||||
double serialCycles = 1e30;
|
||||
for (int i = 0; i < 5; ++i) {
|
||||
framebuffer.clear();
|
||||
reset_and_start_timer();
|
||||
for (int j = 0; j < nframes; ++j)
|
||||
DispatchDynamicC(input, &framebuffer);
|
||||
double mcycles = get_elapsed_mcycles() / nframes;
|
||||
serialCycles = std::min(serialCycles, mcycles);
|
||||
}
|
||||
printf("[C++ serial dynamic, 1 core]:\t[%.3f] million cycles to render image\n",
|
||||
serialCycles);
|
||||
WriteFrame("deferred-serial-dynamic.ppm", input, framebuffer);
|
||||
|
||||
#ifdef __cilk
|
||||
printf("\t\t\t\t(%.2fx speedup from static ISPC, %.2fx from Cilk+ISPC)\n",
|
||||
serialCycles/ispcCycles, serialCycles/dynamicCilkCycles);
|
||||
#else
|
||||
printf("\t\t\t\t(%.2fx speedup from ISPC + tasks)\n", serialCycles/ispcCycles);
|
||||
#endif // __cilk
|
||||
|
||||
DeleteInputData(input);
|
||||
|
||||
return 0;
|
||||
}
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -1,843 +0,0 @@
|
||||
//
|
||||
// Generated by LLVM NVPTX Back-End
|
||||
//
|
||||
|
||||
.version 3.1
|
||||
.target sm_35, texmode_independent
|
||||
.address_size 64
|
||||
|
||||
// .globl __vselect_i8
|
||||
// @__vselect_i8
|
||||
.func (.param .align 1 .b8 func_retval0[1]) __vselect_i8(
|
||||
.param .align 1 .b8 __vselect_i8_param_0[1],
|
||||
.param .align 1 .b8 __vselect_i8_param_1[1],
|
||||
.param .align 4 .b8 __vselect_i8_param_2[4]
|
||||
)
|
||||
{
|
||||
.reg .pred %p<396>;
|
||||
.reg .s16 %rc<396>;
|
||||
.reg .s16 %rs<396>;
|
||||
.reg .s32 %r<396>;
|
||||
.reg .s64 %rl<396>;
|
||||
.reg .f32 %f<396>;
|
||||
.reg .f64 %fl<396>;
|
||||
|
||||
// BB#0:
|
||||
ld.param.u32 %r0, [__vselect_i8_param_2];
|
||||
setp.eq.s32 %p0, %r0, 0;
|
||||
ld.param.u8 %rc0, [__vselect_i8_param_0];
|
||||
ld.param.u8 %rc1, [__vselect_i8_param_1];
|
||||
selp.b16 %rc0, %rc0, %rc1, %p0;
|
||||
st.param.b8 [func_retval0+0], %rc0;
|
||||
ret;
|
||||
}
|
||||
|
||||
// .globl __vselect_i16
|
||||
.func (.param .align 2 .b8 func_retval0[2]) __vselect_i16(
|
||||
.param .align 2 .b8 __vselect_i16_param_0[2],
|
||||
.param .align 2 .b8 __vselect_i16_param_1[2],
|
||||
.param .align 4 .b8 __vselect_i16_param_2[4]
|
||||
) // @__vselect_i16
|
||||
{
|
||||
.reg .pred %p<396>;
|
||||
.reg .s16 %rc<396>;
|
||||
.reg .s16 %rs<396>;
|
||||
.reg .s32 %r<396>;
|
||||
.reg .s64 %rl<396>;
|
||||
.reg .f32 %f<396>;
|
||||
.reg .f64 %fl<396>;
|
||||
|
||||
// BB#0:
|
||||
ld.param.u32 %r0, [__vselect_i16_param_2];
|
||||
setp.eq.s32 %p0, %r0, 0;
|
||||
ld.param.u16 %rs0, [__vselect_i16_param_0];
|
||||
ld.param.u16 %rs1, [__vselect_i16_param_1];
|
||||
selp.b16 %rs0, %rs0, %rs1, %p0;
|
||||
st.param.b16 [func_retval0+0], %rs0;
|
||||
ret;
|
||||
}
|
||||
|
||||
// .globl __vselect_i64
|
||||
.func (.param .align 8 .b8 func_retval0[8]) __vselect_i64(
|
||||
.param .align 8 .b8 __vselect_i64_param_0[8],
|
||||
.param .align 8 .b8 __vselect_i64_param_1[8],
|
||||
.param .align 4 .b8 __vselect_i64_param_2[4]
|
||||
) // @__vselect_i64
|
||||
{
|
||||
.reg .pred %p<396>;
|
||||
.reg .s16 %rc<396>;
|
||||
.reg .s16 %rs<396>;
|
||||
.reg .s32 %r<396>;
|
||||
.reg .s64 %rl<396>;
|
||||
.reg .f32 %f<396>;
|
||||
.reg .f64 %fl<396>;
|
||||
|
||||
// BB#0:
|
||||
ld.param.u32 %r0, [__vselect_i64_param_2];
|
||||
setp.eq.s32 %p0, %r0, 0;
|
||||
ld.param.u64 %rl0, [__vselect_i64_param_0];
|
||||
ld.param.u64 %rl1, [__vselect_i64_param_1];
|
||||
selp.b64 %rl0, %rl0, %rl1, %p0;
|
||||
st.param.b64 [func_retval0+0], %rl0;
|
||||
ret;
|
||||
}
|
||||
|
||||
// .globl __aos_to_soa4_float1
|
||||
.func __aos_to_soa4_float1(
|
||||
.param .align 4 .b8 __aos_to_soa4_float1_param_0[4],
|
||||
.param .align 4 .b8 __aos_to_soa4_float1_param_1[4],
|
||||
.param .align 4 .b8 __aos_to_soa4_float1_param_2[4],
|
||||
.param .align 4 .b8 __aos_to_soa4_float1_param_3[4],
|
||||
.param .b64 __aos_to_soa4_float1_param_4,
|
||||
.param .b64 __aos_to_soa4_float1_param_5,
|
||||
.param .b64 __aos_to_soa4_float1_param_6,
|
||||
.param .b64 __aos_to_soa4_float1_param_7
|
||||
) // @__aos_to_soa4_float1
|
||||
{
|
||||
.reg .pred %p<396>;
|
||||
.reg .s16 %rc<396>;
|
||||
.reg .s16 %rs<396>;
|
||||
.reg .s32 %r<396>;
|
||||
.reg .s64 %rl<396>;
|
||||
.reg .f32 %f<396>;
|
||||
.reg .f64 %fl<396>;
|
||||
|
||||
// BB#0:
|
||||
ld.param.u64 %rl0, [__aos_to_soa4_float1_param_4];
|
||||
ld.param.u64 %rl1, [__aos_to_soa4_float1_param_5];
|
||||
ld.param.u64 %rl2, [__aos_to_soa4_float1_param_6];
|
||||
ld.param.u64 %rl3, [__aos_to_soa4_float1_param_7];
|
||||
ld.param.f32 %f0, [__aos_to_soa4_float1_param_0];
|
||||
ld.param.f32 %f1, [__aos_to_soa4_float1_param_1];
|
||||
ld.param.f32 %f2, [__aos_to_soa4_float1_param_2];
|
||||
ld.param.f32 %f3, [__aos_to_soa4_float1_param_3];
|
||||
st.f32 [%rl0], %f0;
|
||||
st.f32 [%rl1], %f1;
|
||||
st.f32 [%rl2], %f2;
|
||||
st.f32 [%rl3], %f3;
|
||||
ret;
|
||||
}
|
||||
|
||||
// .globl __soa_to_aos4_float1
|
||||
.func __soa_to_aos4_float1(
|
||||
.param .align 4 .b8 __soa_to_aos4_float1_param_0[4],
|
||||
.param .align 4 .b8 __soa_to_aos4_float1_param_1[4],
|
||||
.param .align 4 .b8 __soa_to_aos4_float1_param_2[4],
|
||||
.param .align 4 .b8 __soa_to_aos4_float1_param_3[4],
|
||||
.param .b64 __soa_to_aos4_float1_param_4,
|
||||
.param .b64 __soa_to_aos4_float1_param_5,
|
||||
.param .b64 __soa_to_aos4_float1_param_6,
|
||||
.param .b64 __soa_to_aos4_float1_param_7
|
||||
) // @__soa_to_aos4_float1
|
||||
{
|
||||
.reg .pred %p<396>;
|
||||
.reg .s16 %rc<396>;
|
||||
.reg .s16 %rs<396>;
|
||||
.reg .s32 %r<396>;
|
||||
.reg .s64 %rl<396>;
|
||||
.reg .f32 %f<396>;
|
||||
.reg .f64 %fl<396>;
|
||||
|
||||
// BB#0:
|
||||
ld.param.u64 %rl0, [__soa_to_aos4_float1_param_4];
|
||||
ld.param.u64 %rl1, [__soa_to_aos4_float1_param_5];
|
||||
ld.param.u64 %rl2, [__soa_to_aos4_float1_param_6];
|
||||
ld.param.u64 %rl3, [__soa_to_aos4_float1_param_7];
|
||||
ld.param.f32 %f0, [__soa_to_aos4_float1_param_0];
|
||||
ld.param.f32 %f1, [__soa_to_aos4_float1_param_1];
|
||||
ld.param.f32 %f2, [__soa_to_aos4_float1_param_2];
|
||||
ld.param.f32 %f3, [__soa_to_aos4_float1_param_3];
|
||||
st.f32 [%rl0], %f0;
|
||||
st.f32 [%rl1], %f1;
|
||||
st.f32 [%rl2], %f2;
|
||||
st.f32 [%rl3], %f3;
|
||||
ret;
|
||||
}
|
||||
|
||||
// .globl __aos_to_soa3_float1
|
||||
.func __aos_to_soa3_float1(
|
||||
.param .align 4 .b8 __aos_to_soa3_float1_param_0[4],
|
||||
.param .align 4 .b8 __aos_to_soa3_float1_param_1[4],
|
||||
.param .align 4 .b8 __aos_to_soa3_float1_param_2[4],
|
||||
.param .b64 __aos_to_soa3_float1_param_3,
|
||||
.param .b64 __aos_to_soa3_float1_param_4,
|
||||
.param .b64 __aos_to_soa3_float1_param_5
|
||||
) // @__aos_to_soa3_float1
|
||||
{
|
||||
.reg .pred %p<396>;
|
||||
.reg .s16 %rc<396>;
|
||||
.reg .s16 %rs<396>;
|
||||
.reg .s32 %r<396>;
|
||||
.reg .s64 %rl<396>;
|
||||
.reg .f32 %f<396>;
|
||||
.reg .f64 %fl<396>;
|
||||
|
||||
// BB#0:
|
||||
ld.param.u64 %rl0, [__aos_to_soa3_float1_param_3];
|
||||
ld.param.u64 %rl1, [__aos_to_soa3_float1_param_4];
|
||||
ld.param.u64 %rl2, [__aos_to_soa3_float1_param_5];
|
||||
ld.param.f32 %f0, [__aos_to_soa3_float1_param_0];
|
||||
ld.param.f32 %f1, [__aos_to_soa3_float1_param_1];
|
||||
ld.param.f32 %f2, [__aos_to_soa3_float1_param_2];
|
||||
st.f32 [%rl0], %f0;
|
||||
st.f32 [%rl1], %f1;
|
||||
st.f32 [%rl2], %f2;
|
||||
ret;
|
||||
}
|
||||
|
||||
// .globl __soa_to_aos3_float1
|
||||
.func __soa_to_aos3_float1(
|
||||
.param .align 4 .b8 __soa_to_aos3_float1_param_0[4],
|
||||
.param .align 4 .b8 __soa_to_aos3_float1_param_1[4],
|
||||
.param .align 4 .b8 __soa_to_aos3_float1_param_2[4],
|
||||
.param .b64 __soa_to_aos3_float1_param_3,
|
||||
.param .b64 __soa_to_aos3_float1_param_4,
|
||||
.param .b64 __soa_to_aos3_float1_param_5
|
||||
) // @__soa_to_aos3_float1
|
||||
{
|
||||
.reg .pred %p<396>;
|
||||
.reg .s16 %rc<396>;
|
||||
.reg .s16 %rs<396>;
|
||||
.reg .s32 %r<396>;
|
||||
.reg .s64 %rl<396>;
|
||||
.reg .f32 %f<396>;
|
||||
.reg .f64 %fl<396>;
|
||||
|
||||
// BB#0:
|
||||
ld.param.u64 %rl0, [__soa_to_aos3_float1_param_3];
|
||||
ld.param.u64 %rl1, [__soa_to_aos3_float1_param_4];
|
||||
ld.param.u64 %rl2, [__soa_to_aos3_float1_param_5];
|
||||
ld.param.f32 %f0, [__soa_to_aos3_float1_param_0];
|
||||
ld.param.f32 %f1, [__soa_to_aos3_float1_param_1];
|
||||
ld.param.f32 %f2, [__soa_to_aos3_float1_param_2];
|
||||
st.f32 [%rl0], %f0;
|
||||
st.f32 [%rl1], %f1;
|
||||
st.f32 [%rl2], %f2;
|
||||
ret;
|
||||
}
|
||||
|
||||
// .globl __rsqrt_varying_double
|
||||
.func (.param .align 8 .b8 func_retval0[8]) __rsqrt_varying_double(
|
||||
.param .align 8 .b8 __rsqrt_varying_double_param_0[8]
|
||||
) // @__rsqrt_varying_double
|
||||
{
|
||||
.reg .pred %p<396>;
|
||||
.reg .s16 %rc<396>;
|
||||
.reg .s16 %rs<396>;
|
||||
.reg .s32 %r<396>;
|
||||
.reg .s64 %rl<396>;
|
||||
.reg .f32 %f<396>;
|
||||
.reg .f64 %fl<396>;
|
||||
|
||||
// BB#0:
|
||||
ld.param.f64 %fl0, [__rsqrt_varying_double_param_0];
|
||||
rsqrt.approx.f64 %fl0, %fl0;
|
||||
st.param.f64 [func_retval0+0], %fl0;
|
||||
ret;
|
||||
}
|
||||
|
||||
// .globl mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E_
|
||||
.func mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E_(
|
||||
.param .b32 mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_0,
|
||||
.param .b32 mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_1,
|
||||
.param .b32 mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_2,
|
||||
.param .b32 mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_3,
|
||||
.param .b32 mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_4,
|
||||
.param .b32 mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_5,
|
||||
.param .b32 mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_6,
|
||||
.param .b64 mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_7,
|
||||
.param .align 4 .b8 mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_8[4]
|
||||
) // @mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E_
|
||||
{
|
||||
.reg .pred %p<396>;
|
||||
.reg .s16 %rc<396>;
|
||||
.reg .s16 %rs<396>;
|
||||
.reg .s32 %r<396>;
|
||||
.reg .s64 %rl<396>;
|
||||
.reg .f32 %f<396>;
|
||||
.reg .f64 %fl<396>;
|
||||
|
||||
// BB#0: // %allocas
|
||||
ld.param.f32 %f0, [mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_0];
|
||||
ld.param.f32 %f1, [mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_1];
|
||||
ld.param.f32 %f3, [mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_2];
|
||||
ld.param.f32 %f2, [mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_3];
|
||||
ld.param.u32 %r0, [mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_4];
|
||||
ld.param.u32 %r1, [mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_5];
|
||||
ld.param.u32 %r2, [mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_6];
|
||||
ld.param.u64 %rl0, [mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_7];
|
||||
ld.param.u32 %r3, [mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_8];
|
||||
setp.lt.s32 %p0, %r3, 0;
|
||||
sub.f32 %f3, %f3, %f0;
|
||||
cvt.rn.f32.s32 %f4, %r0;
|
||||
sub.f32 %f2, %f2, %f1;
|
||||
cvt.rn.f32.s32 %f5, %r1;
|
||||
div.rn.f32 %f2, %f2, %f5;
|
||||
div.rn.f32 %f3, %f3, %f4;
|
||||
@%p0 bra BB8_9;
|
||||
// BB#1: // %for_test110.preheader
|
||||
setp.lt.s32 %p0, %r1, 1;
|
||||
@%p0 bra BB8_45;
|
||||
// BB#2: // %outer_not_in_extras140.preheader.lr.ph
|
||||
setp.gt.s32 %p0, %r2, 0;
|
||||
mov.u32 %r3, 0;
|
||||
selp.b32 %r4, -1, 0, %p0;
|
||||
shl.b32 %r5, %r0, 2;
|
||||
mov.u32 %r6, %r3;
|
||||
BB8_3: // %outer_not_in_extras140.preheader
|
||||
// =>This Loop Header: Depth=1
|
||||
// Child Loop BB8_41 Depth 2
|
||||
// Child Loop BB8_43 Depth 2
|
||||
// Child Loop BB8_38 Depth 2
|
||||
// Child Loop BB8_33 Depth 3
|
||||
setp.lt.s32 %p0, %r0, 1;
|
||||
@%p0 bra BB8_4;
|
||||
// BB#31: // %foreach_full_body120.lr.ph
|
||||
// in Loop: Header=BB8_3 Depth=1
|
||||
setp.lt.s32 %p0, %r4, 0;
|
||||
mov.u32 %r7, %r0;
|
||||
mov.u32 %r8, %r3;
|
||||
@%p0 bra BB8_32;
|
||||
bra.uni BB8_43;
|
||||
BB8_32: // in Loop: Header=BB8_3 Depth=1
|
||||
mov.u64 %rl1, 0;
|
||||
cvt.rn.f32.s32 %f4, %r6;
|
||||
fma.rn.f32 %f4, %f2, %f4, %f1;
|
||||
mul.lo.s32 %r7, %r6, %r0;
|
||||
BB8_38: // %for_loop.i380.lr.ph.us
|
||||
// Parent Loop BB8_3 Depth=1
|
||||
// => This Loop Header: Depth=2
|
||||
// Child Loop BB8_33 Depth 3
|
||||
cvt.u32.u64 %r8, %rl1;
|
||||
cvt.rn.f32.s32 %f5, %r8;
|
||||
fma.rn.f32 %f5, %f3, %f5, %f0;
|
||||
mov.u32 %r10, 0;
|
||||
mov.u32 %r12, %r4;
|
||||
mov.u32 %r11, %r10;
|
||||
mov.u32 %r9, %r10;
|
||||
mov.f32 %f7, %f5;
|
||||
mov.f32 %f6, %f4;
|
||||
BB8_33: // %for_loop.i380.us
|
||||
// Parent Loop BB8_3 Depth=1
|
||||
// Parent Loop BB8_38 Depth=2
|
||||
// => This Inner Loop Header: Depth=3
|
||||
mul.f32 %f8, %f7, %f7;
|
||||
fma.rn.f32 %f9, %f6, %f6, %f8;
|
||||
setp.gtu.f32 %p0, %f9, 0f40800000;
|
||||
selp.b32 %r13, %r12, 0, %p0;
|
||||
or.b32 %r11, %r13, %r11;
|
||||
shr.u32 %r13, %r11, 31;
|
||||
shr.u32 %r14, %r12, 31;
|
||||
setp.eq.s32 %p0, %r13, %r14;
|
||||
@%p0 bra BB8_34;
|
||||
bra.uni BB8_35;
|
||||
BB8_34: // in Loop: Header=BB8_33 Depth=3
|
||||
mov.u32 %r12, %r10;
|
||||
bra.uni BB8_36;
|
||||
BB8_35: // %not_all_continued_or_breaked.i394.us
|
||||
// in Loop: Header=BB8_33 Depth=3
|
||||
mul.f32 %f9, %f6, %f6;
|
||||
not.b32 %r13, %r11;
|
||||
and.b32 %r12, %r12, %r13;
|
||||
sub.f32 %f8, %f8, %f9;
|
||||
add.f32 %f8, %f5, %f8;
|
||||
add.f32 %f7, %f7, %f7;
|
||||
fma.rn.f32 %f6, %f6, %f7, %f4;
|
||||
mov.f32 %f7, %f8;
|
||||
BB8_36: // %for_step.i363.us
|
||||
// in Loop: Header=BB8_33 Depth=3
|
||||
setp.ne.s32 %p0, %r12, 0;
|
||||
selp.u32 %r13, 1, 0, %p0;
|
||||
add.s32 %r9, %r9, %r13;
|
||||
setp.lt.s32 %p0, %r9, %r2;
|
||||
selp.b32 %r12, %r12, 0, %p0;
|
||||
setp.lt.s32 %p0, %r12, 0;
|
||||
@%p0 bra BB8_33;
|
||||
// BB#37: // %mandel___vyfvyfvyi.exit395.us
|
||||
// in Loop: Header=BB8_38 Depth=2
|
||||
add.s32 %r8, %r8, %r7;
|
||||
shl.b32 %r8, %r8, 2;
|
||||
cvt.s64.s32 %rl2, %r8;
|
||||
add.s64 %rl2, %rl2, %rl0;
|
||||
st.u32 [%rl2], %r9;
|
||||
add.s64 %rl1, %rl1, 1;
|
||||
cvt.u32.u64 %r8, %rl1;
|
||||
setp.eq.s32 %p0, %r8, %r0;
|
||||
@%p0 bra BB8_44;
|
||||
bra.uni BB8_38;
|
||||
BB8_43: // %mandel___vyfvyfvyi.exit395
|
||||
// Parent Loop BB8_3 Depth=1
|
||||
// => This Inner Loop Header: Depth=2
|
||||
cvt.s64.s32 %rl1, %r8;
|
||||
add.s64 %rl1, %rl1, %rl0;
|
||||
mov.u32 %r9, 0;
|
||||
st.u32 [%rl1], %r9;
|
||||
add.s32 %r8, %r8, 4;
|
||||
add.s32 %r7, %r7, -1;
|
||||
setp.eq.s32 %p0, %r7, 0;
|
||||
@%p0 bra BB8_44;
|
||||
bra.uni BB8_43;
|
||||
BB8_4: // %partial_inner_all_outer156
|
||||
// in Loop: Header=BB8_3 Depth=1
|
||||
@%p0 bra BB8_44;
|
||||
// BB#5: // %partial_inner_only197
|
||||
// in Loop: Header=BB8_3 Depth=1
|
||||
setp.gt.s32 %p0, %r0, 0;
|
||||
mov.u32 %r8, 0;
|
||||
fma.rn.f32 %f4, %f3, 0f00000000, %f0;
|
||||
cvt.rn.f32.s32 %f5, %r6;
|
||||
fma.rn.f32 %f5, %f2, %f5, %f1;
|
||||
selp.b32 %r7, %r4, 0, %p0;
|
||||
setp.lt.s32 %p1, %r7, 0;
|
||||
mov.u32 %r10, %r4;
|
||||
mov.u32 %r9, %r8;
|
||||
mov.u32 %r7, %r8;
|
||||
mov.f32 %f7, %f4;
|
||||
mov.f32 %f6, %f5;
|
||||
@%p1 bra BB8_41;
|
||||
bra.uni BB8_6;
|
||||
BB8_41: // %for_loop.i
|
||||
// Parent Loop BB8_3 Depth=1
|
||||
// => This Inner Loop Header: Depth=2
|
||||
selp.b32 %r11, %r10, 0, %p0;
|
||||
mul.f32 %f8, %f7, %f7;
|
||||
fma.rn.f32 %f9, %f6, %f6, %f8;
|
||||
setp.gtu.f32 %p1, %f9, 0f40800000;
|
||||
selp.b32 %r12, %r10, 0, %p1;
|
||||
or.b32 %r9, %r12, %r9;
|
||||
selp.b32 %r12, %r9, 0, %p0;
|
||||
shr.u32 %r12, %r12, 31;
|
||||
shr.u32 %r11, %r11, 31;
|
||||
setp.eq.s32 %p1, %r12, %r11;
|
||||
@%p1 bra BB8_42;
|
||||
bra.uni BB8_39;
|
||||
BB8_42: // in Loop: Header=BB8_41 Depth=2
|
||||
mov.u32 %r10, %r8;
|
||||
bra.uni BB8_40;
|
||||
BB8_39: // %not_all_continued_or_breaked.i
|
||||
// in Loop: Header=BB8_41 Depth=2
|
||||
mul.f32 %f9, %f6, %f6;
|
||||
not.b32 %r11, %r9;
|
||||
and.b32 %r10, %r10, %r11;
|
||||
sub.f32 %f8, %f8, %f9;
|
||||
add.f32 %f8, %f4, %f8;
|
||||
add.f32 %f7, %f7, %f7;
|
||||
fma.rn.f32 %f6, %f6, %f7, %f5;
|
||||
mov.f32 %f7, %f8;
|
||||
BB8_40: // %for_step.i
|
||||
// in Loop: Header=BB8_41 Depth=2
|
||||
setp.ne.s32 %p1, %r10, 0;
|
||||
selp.u32 %r11, 1, 0, %p1;
|
||||
add.s32 %r7, %r7, %r11;
|
||||
setp.lt.s32 %p1, %r7, %r2;
|
||||
selp.b32 %r10, %r10, 0, %p1;
|
||||
selp.b32 %r11, %r10, 0, %p0;
|
||||
setp.gt.s32 %p1, %r11, -1;
|
||||
@%p1 bra BB8_7;
|
||||
bra.uni BB8_41;
|
||||
BB8_6: // in Loop: Header=BB8_3 Depth=1
|
||||
mov.u32 %r7, %r8;
|
||||
BB8_7: // %mandel___vyfvyfvyi.exit
|
||||
// in Loop: Header=BB8_3 Depth=1
|
||||
setp.lt.s32 %p0, %r0, 1;
|
||||
@%p0 bra BB8_44;
|
||||
// BB#8: // %pl_dolane.i
|
||||
// in Loop: Header=BB8_3 Depth=1
|
||||
mul.lo.s32 %r8, %r6, %r0;
|
||||
shl.b32 %r8, %r8, 2;
|
||||
cvt.s64.s32 %rl1, %r8;
|
||||
add.s64 %rl1, %rl1, %rl0;
|
||||
st.u32 [%rl1], %r7;
|
||||
BB8_44: // %foreach_reset128
|
||||
// in Loop: Header=BB8_3 Depth=1
|
||||
add.s32 %r6, %r6, 1;
|
||||
add.s32 %r3, %r3, %r5;
|
||||
setp.eq.s32 %p0, %r6, %r1;
|
||||
@%p0 bra BB8_45;
|
||||
bra.uni BB8_3;
|
||||
BB8_9: // %for_test.preheader
|
||||
setp.lt.s32 %p0, %r1, 1;
|
||||
@%p0 bra BB8_45;
|
||||
// BB#10: // %outer_not_in_extras.preheader.lr.ph
|
||||
setp.gt.s32 %p0, %r2, 0;
|
||||
mov.u32 %r3, 0;
|
||||
selp.b32 %r4, -1, 0, %p0;
|
||||
shl.b32 %r5, %r0, 2;
|
||||
mov.u32 %r6, %r3;
|
||||
BB8_11: // %outer_not_in_extras.preheader
|
||||
// =>This Loop Header: Depth=1
|
||||
// Child Loop BB8_23 Depth 2
|
||||
// Child Loop BB8_20 Depth 2
|
||||
// Child Loop BB8_19 Depth 2
|
||||
// Child Loop BB8_14 Depth 3
|
||||
setp.lt.s32 %p0, %r0, 1;
|
||||
@%p0 bra BB8_28;
|
||||
// BB#12: // %foreach_full_body.lr.ph
|
||||
// in Loop: Header=BB8_11 Depth=1
|
||||
setp.lt.s32 %p0, %r4, 0;
|
||||
mov.u32 %r7, %r0;
|
||||
mov.u32 %r8, %r3;
|
||||
@%p0 bra BB8_13;
|
||||
bra.uni BB8_20;
|
||||
BB8_13: // in Loop: Header=BB8_11 Depth=1
|
||||
mov.u64 %rl1, 0;
|
||||
cvt.rn.f32.s32 %f4, %r6;
|
||||
fma.rn.f32 %f4, %f2, %f4, %f1;
|
||||
mul.lo.s32 %r7, %r6, %r0;
|
||||
BB8_19: // %for_loop.i281.lr.ph.us
|
||||
// Parent Loop BB8_11 Depth=1
|
||||
// => This Loop Header: Depth=2
|
||||
// Child Loop BB8_14 Depth 3
|
||||
cvt.u32.u64 %r8, %rl1;
|
||||
cvt.rn.f32.s32 %f5, %r8;
|
||||
fma.rn.f32 %f5, %f3, %f5, %f0;
|
||||
mov.u32 %r10, 0;
|
||||
mov.u32 %r12, %r4;
|
||||
mov.u32 %r11, %r10;
|
||||
mov.u32 %r9, %r10;
|
||||
mov.f32 %f7, %f5;
|
||||
mov.f32 %f6, %f4;
|
||||
BB8_14: // %for_loop.i281.us
|
||||
// Parent Loop BB8_11 Depth=1
|
||||
// Parent Loop BB8_19 Depth=2
|
||||
// => This Inner Loop Header: Depth=3
|
||||
mul.f32 %f8, %f7, %f7;
|
||||
fma.rn.f32 %f9, %f6, %f6, %f8;
|
||||
setp.gtu.f32 %p0, %f9, 0f40800000;
|
||||
selp.b32 %r13, %r12, 0, %p0;
|
||||
or.b32 %r11, %r13, %r11;
|
||||
shr.u32 %r13, %r11, 31;
|
||||
shr.u32 %r14, %r12, 31;
|
||||
setp.eq.s32 %p0, %r13, %r14;
|
||||
@%p0 bra BB8_15;
|
||||
bra.uni BB8_16;
|
||||
BB8_15: // in Loop: Header=BB8_14 Depth=3
|
||||
mov.u32 %r12, %r10;
|
||||
bra.uni BB8_17;
|
||||
BB8_16: // %not_all_continued_or_breaked.i295.us
|
||||
// in Loop: Header=BB8_14 Depth=3
|
||||
mul.f32 %f9, %f6, %f6;
|
||||
not.b32 %r13, %r11;
|
||||
and.b32 %r12, %r12, %r13;
|
||||
sub.f32 %f8, %f8, %f9;
|
||||
add.f32 %f8, %f5, %f8;
|
||||
add.f32 %f7, %f7, %f7;
|
||||
fma.rn.f32 %f6, %f6, %f7, %f4;
|
||||
mov.f32 %f7, %f8;
|
||||
BB8_17: // %for_step.i264.us
|
||||
// in Loop: Header=BB8_14 Depth=3
|
||||
setp.ne.s32 %p0, %r12, 0;
|
||||
selp.u32 %r13, 1, 0, %p0;
|
||||
add.s32 %r9, %r9, %r13;
|
||||
setp.lt.s32 %p0, %r9, %r2;
|
||||
selp.b32 %r12, %r12, 0, %p0;
|
||||
setp.lt.s32 %p0, %r12, 0;
|
||||
@%p0 bra BB8_14;
|
||||
// BB#18: // %mandel___vyfvyfvyi.exit296.us
|
||||
// in Loop: Header=BB8_19 Depth=2
|
||||
add.s32 %r8, %r8, %r7;
|
||||
shl.b32 %r8, %r8, 2;
|
||||
cvt.s64.s32 %rl2, %r8;
|
||||
add.s64 %rl2, %rl2, %rl0;
|
||||
st.u32 [%rl2], %r9;
|
||||
add.s64 %rl1, %rl1, 1;
|
||||
cvt.u32.u64 %r8, %rl1;
|
||||
setp.eq.s32 %p0, %r8, %r0;
|
||||
@%p0 bra BB8_27;
|
||||
bra.uni BB8_19;
|
||||
BB8_20: // %mandel___vyfvyfvyi.exit296
|
||||
// Parent Loop BB8_11 Depth=1
|
||||
// => This Inner Loop Header: Depth=2
|
||||
cvt.s64.s32 %rl1, %r8;
|
||||
add.s64 %rl1, %rl1, %rl0;
|
||||
mov.u32 %r9, 0;
|
||||
st.u32 [%rl1], %r9;
|
||||
add.s32 %r8, %r8, 4;
|
||||
add.s32 %r7, %r7, -1;
|
||||
setp.eq.s32 %p0, %r7, 0;
|
||||
@%p0 bra BB8_27;
|
||||
bra.uni BB8_20;
|
||||
BB8_28: // %partial_inner_all_outer
|
||||
// in Loop: Header=BB8_11 Depth=1
|
||||
@%p0 bra BB8_27;
|
||||
// BB#29: // %partial_inner_only
|
||||
// in Loop: Header=BB8_11 Depth=1
|
||||
setp.gt.s32 %p0, %r0, 0;
|
||||
mov.u32 %r8, 0;
|
||||
fma.rn.f32 %f4, %f3, 0f00000000, %f0;
|
||||
cvt.rn.f32.s32 %f5, %r6;
|
||||
fma.rn.f32 %f5, %f2, %f5, %f1;
|
||||
selp.b32 %r7, %r4, 0, %p0;
|
||||
setp.lt.s32 %p1, %r7, 0;
|
||||
mov.u32 %r10, %r4;
|
||||
mov.u32 %r9, %r8;
|
||||
mov.u32 %r7, %r8;
|
||||
mov.f32 %f7, %f4;
|
||||
mov.f32 %f6, %f5;
|
||||
@%p1 bra BB8_23;
|
||||
bra.uni BB8_30;
|
||||
BB8_23: // %for_loop.i332
|
||||
// Parent Loop BB8_11 Depth=1
|
||||
// => This Inner Loop Header: Depth=2
|
||||
selp.b32 %r11, %r10, 0, %p0;
|
||||
mul.f32 %f8, %f7, %f7;
|
||||
fma.rn.f32 %f9, %f6, %f6, %f8;
|
||||
setp.gtu.f32 %p1, %f9, 0f40800000;
|
||||
selp.b32 %r12, %r10, 0, %p1;
|
||||
or.b32 %r9, %r12, %r9;
|
||||
selp.b32 %r12, %r9, 0, %p0;
|
||||
shr.u32 %r12, %r12, 31;
|
||||
shr.u32 %r11, %r11, 31;
|
||||
setp.eq.s32 %p1, %r12, %r11;
|
||||
@%p1 bra BB8_24;
|
||||
bra.uni BB8_21;
|
||||
BB8_24: // in Loop: Header=BB8_23 Depth=2
|
||||
mov.u32 %r10, %r8;
|
||||
bra.uni BB8_22;
|
||||
BB8_21: // %not_all_continued_or_breaked.i346
|
||||
// in Loop: Header=BB8_23 Depth=2
|
||||
mul.f32 %f9, %f6, %f6;
|
||||
not.b32 %r11, %r9;
|
||||
and.b32 %r10, %r10, %r11;
|
||||
sub.f32 %f8, %f8, %f9;
|
||||
add.f32 %f8, %f4, %f8;
|
||||
add.f32 %f7, %f7, %f7;
|
||||
fma.rn.f32 %f6, %f6, %f7, %f5;
|
||||
mov.f32 %f7, %f8;
|
||||
BB8_22: // %for_step.i313
|
||||
// in Loop: Header=BB8_23 Depth=2
|
||||
setp.ne.s32 %p1, %r10, 0;
|
||||
selp.u32 %r11, 1, 0, %p1;
|
||||
add.s32 %r7, %r7, %r11;
|
||||
setp.lt.s32 %p1, %r7, %r2;
|
||||
selp.b32 %r10, %r10, 0, %p1;
|
||||
selp.b32 %r11, %r10, 0, %p0;
|
||||
setp.gt.s32 %p1, %r11, -1;
|
||||
@%p1 bra BB8_25;
|
||||
bra.uni BB8_23;
|
||||
BB8_30: // in Loop: Header=BB8_11 Depth=1
|
||||
mov.u32 %r7, %r8;
|
||||
BB8_25: // %mandel___vyfvyfvyi.exit347
|
||||
// in Loop: Header=BB8_11 Depth=1
|
||||
setp.lt.s32 %p0, %r0, 1;
|
||||
@%p0 bra BB8_27;
|
||||
// BB#26: // %pl_dolane.i452
|
||||
// in Loop: Header=BB8_11 Depth=1
|
||||
mul.lo.s32 %r8, %r6, %r0;
|
||||
shl.b32 %r8, %r8, 2;
|
||||
cvt.s64.s32 %rl1, %r8;
|
||||
add.s64 %rl1, %rl1, %rl0;
|
||||
st.u32 [%rl1], %r7;
|
||||
BB8_27: // %foreach_reset
|
||||
// in Loop: Header=BB8_11 Depth=1
|
||||
add.s32 %r6, %r6, 1;
|
||||
add.s32 %r3, %r3, %r5;
|
||||
setp.eq.s32 %p0, %r6, %r1;
|
||||
@%p0 bra BB8_45;
|
||||
bra.uni BB8_11;
|
||||
BB8_45: // %for_exit
|
||||
ret;
|
||||
}
|
||||
|
||||
// .globl mandelbrot_ispc
|
||||
.func mandelbrot_ispc(
|
||||
.param .b32 mandelbrot_ispc_param_0,
|
||||
.param .b32 mandelbrot_ispc_param_1,
|
||||
.param .b32 mandelbrot_ispc_param_2,
|
||||
.param .b32 mandelbrot_ispc_param_3,
|
||||
.param .b32 mandelbrot_ispc_param_4,
|
||||
.param .b32 mandelbrot_ispc_param_5,
|
||||
.param .b32 mandelbrot_ispc_param_6,
|
||||
.param .b64 mandelbrot_ispc_param_7
|
||||
) // @mandelbrot_ispc
|
||||
{
|
||||
.reg .pred %p<396>;
|
||||
.reg .s16 %rc<396>;
|
||||
.reg .s16 %rs<396>;
|
||||
.reg .s32 %r<396>;
|
||||
.reg .s64 %rl<396>;
|
||||
.reg .f32 %f<396>;
|
||||
.reg .f64 %fl<396>;
|
||||
|
||||
// BB#0: // %allocas
|
||||
ld.param.u32 %r0, [mandelbrot_ispc_param_5];
|
||||
setp.lt.s32 %p0, %r0, 1;
|
||||
@%p0 bra BB9_18;
|
||||
// BB#1: // %outer_not_in_extras.preheader.lr.ph
|
||||
ld.param.f32 %f0, [mandelbrot_ispc_param_0];
|
||||
ld.param.f32 %f1, [mandelbrot_ispc_param_1];
|
||||
ld.param.f32 %f3, [mandelbrot_ispc_param_2];
|
||||
ld.param.f32 %f2, [mandelbrot_ispc_param_3];
|
||||
ld.param.u32 %r1, [mandelbrot_ispc_param_4];
|
||||
ld.param.u32 %r2, [mandelbrot_ispc_param_6];
|
||||
ld.param.u64 %rl0, [mandelbrot_ispc_param_7];
|
||||
sub.f32 %f3, %f3, %f0;
|
||||
cvt.rn.f32.s32 %f4, %r1;
|
||||
sub.f32 %f2, %f2, %f1;
|
||||
cvt.rn.f32.s32 %f5, %r0;
|
||||
div.rn.f32 %f2, %f2, %f5;
|
||||
div.rn.f32 %f3, %f3, %f4;
|
||||
setp.gt.s32 %p0, %r2, 0;
|
||||
mov.u32 %r3, 0;
|
||||
selp.b32 %r4, -1, 0, %p0;
|
||||
BB9_2: // %outer_not_in_extras.preheader
|
||||
// =>This Loop Header: Depth=1
|
||||
// Child Loop BB9_13 Depth 2
|
||||
// Child Loop BB9_4 Depth 2
|
||||
// Child Loop BB9_9 Depth 3
|
||||
setp.lt.s32 %p0, %r1, 1;
|
||||
@%p0 bra BB9_19;
|
||||
// BB#3: // %foreach_full_body.lr.ph
|
||||
// in Loop: Header=BB9_2 Depth=1
|
||||
mov.u64 %rl1, 0;
|
||||
cvt.rn.f32.s32 %f4, %r3;
|
||||
fma.rn.f32 %f4, %f2, %f4, %f1;
|
||||
mul.lo.s32 %r5, %r3, %r1;
|
||||
BB9_4: // %foreach_full_body
|
||||
// Parent Loop BB9_2 Depth=1
|
||||
// => This Loop Header: Depth=2
|
||||
// Child Loop BB9_9 Depth 3
|
||||
setp.lt.s32 %p0, %r4, 0;
|
||||
cvt.u32.u64 %r6, %rl1;
|
||||
cvt.rn.f32.s32 %f5, %r6;
|
||||
fma.rn.f32 %f5, %f3, %f5, %f0;
|
||||
mov.u32 %r8, 0;
|
||||
mov.u32 %r10, %r4;
|
||||
mov.u32 %r9, %r8;
|
||||
mov.u32 %r7, %r8;
|
||||
mov.f32 %f7, %f5;
|
||||
mov.f32 %f6, %f4;
|
||||
@%p0 bra BB9_9;
|
||||
bra.uni BB9_5;
|
||||
BB9_9: // %for_loop.i281
|
||||
// Parent Loop BB9_2 Depth=1
|
||||
// Parent Loop BB9_4 Depth=2
|
||||
// => This Inner Loop Header: Depth=3
|
||||
mul.f32 %f8, %f7, %f7;
|
||||
fma.rn.f32 %f9, %f6, %f6, %f8;
|
||||
setp.gtu.f32 %p0, %f9, 0f40800000;
|
||||
selp.b32 %r11, %r10, 0, %p0;
|
||||
or.b32 %r9, %r11, %r9;
|
||||
shr.u32 %r11, %r9, 31;
|
||||
shr.u32 %r12, %r10, 31;
|
||||
setp.eq.s32 %p0, %r11, %r12;
|
||||
@%p0 bra BB9_10;
|
||||
bra.uni BB9_7;
|
||||
BB9_10: // in Loop: Header=BB9_9 Depth=3
|
||||
mov.u32 %r10, %r8;
|
||||
bra.uni BB9_8;
|
||||
BB9_7: // %not_all_continued_or_breaked.i295
|
||||
// in Loop: Header=BB9_9 Depth=3
|
||||
mul.f32 %f9, %f6, %f6;
|
||||
not.b32 %r11, %r9;
|
||||
and.b32 %r10, %r10, %r11;
|
||||
sub.f32 %f8, %f8, %f9;
|
||||
add.f32 %f8, %f5, %f8;
|
||||
add.f32 %f7, %f7, %f7;
|
||||
fma.rn.f32 %f6, %f6, %f7, %f4;
|
||||
mov.f32 %f7, %f8;
|
||||
BB9_8: // %for_step.i264
|
||||
// in Loop: Header=BB9_9 Depth=3
|
||||
setp.ne.s32 %p0, %r10, 0;
|
||||
selp.u32 %r11, 1, 0, %p0;
|
||||
add.s32 %r7, %r7, %r11;
|
||||
setp.lt.s32 %p0, %r7, %r2;
|
||||
selp.b32 %r10, %r10, 0, %p0;
|
||||
setp.gt.s32 %p0, %r10, -1;
|
||||
@%p0 bra BB9_6;
|
||||
bra.uni BB9_9;
|
||||
BB9_5: // in Loop: Header=BB9_4 Depth=2
|
||||
mov.u32 %r7, %r8;
|
||||
BB9_6: // %mandel___vyfvyfvyi.exit296
|
||||
// in Loop: Header=BB9_4 Depth=2
|
||||
add.s32 %r6, %r6, %r5;
|
||||
shl.b32 %r6, %r6, 2;
|
||||
cvt.s64.s32 %rl2, %r6;
|
||||
add.s64 %rl2, %rl2, %rl0;
|
||||
st.u32 [%rl2], %r7;
|
||||
add.s64 %rl1, %rl1, 1;
|
||||
cvt.u32.u64 %r6, %rl1;
|
||||
setp.eq.s32 %p0, %r6, %r1;
|
||||
@%p0 bra BB9_17;
|
||||
bra.uni BB9_4;
|
||||
BB9_19: // %partial_inner_all_outer
|
||||
// in Loop: Header=BB9_2 Depth=1
|
||||
@%p0 bra BB9_17;
|
||||
// BB#20: // %partial_inner_only
|
||||
// in Loop: Header=BB9_2 Depth=1
|
||||
setp.gt.s32 %p0, %r1, 0;
|
||||
mov.u32 %r6, 0;
|
||||
fma.rn.f32 %f4, %f3, 0f00000000, %f0;
|
||||
cvt.rn.f32.s32 %f5, %r3;
|
||||
fma.rn.f32 %f5, %f2, %f5, %f1;
|
||||
selp.b32 %r5, %r4, 0, %p0;
|
||||
setp.lt.s32 %p1, %r5, 0;
|
||||
mov.u32 %r8, %r4;
|
||||
mov.u32 %r7, %r6;
|
||||
mov.u32 %r5, %r6;
|
||||
mov.f32 %f7, %f4;
|
||||
mov.f32 %f6, %f5;
|
||||
@%p1 bra BB9_13;
|
||||
bra.uni BB9_21;
|
||||
BB9_13: // %for_loop.i332
|
||||
// Parent Loop BB9_2 Depth=1
|
||||
// => This Inner Loop Header: Depth=2
|
||||
selp.b32 %r9, %r8, 0, %p0;
|
||||
mul.f32 %f8, %f7, %f7;
|
||||
fma.rn.f32 %f9, %f6, %f6, %f8;
|
||||
setp.gtu.f32 %p1, %f9, 0f40800000;
|
||||
selp.b32 %r10, %r8, 0, %p1;
|
||||
or.b32 %r7, %r10, %r7;
|
||||
selp.b32 %r10, %r7, 0, %p0;
|
||||
shr.u32 %r10, %r10, 31;
|
||||
shr.u32 %r9, %r9, 31;
|
||||
setp.eq.s32 %p1, %r10, %r9;
|
||||
@%p1 bra BB9_14;
|
||||
bra.uni BB9_11;
|
||||
BB9_14: // in Loop: Header=BB9_13 Depth=2
|
||||
mov.u32 %r8, %r6;
|
||||
bra.uni BB9_12;
|
||||
BB9_11: // %not_all_continued_or_breaked.i346
|
||||
// in Loop: Header=BB9_13 Depth=2
|
||||
mul.f32 %f9, %f6, %f6;
|
||||
not.b32 %r9, %r7;
|
||||
and.b32 %r8, %r8, %r9;
|
||||
sub.f32 %f8, %f8, %f9;
|
||||
add.f32 %f8, %f4, %f8;
|
||||
add.f32 %f7, %f7, %f7;
|
||||
fma.rn.f32 %f6, %f6, %f7, %f5;
|
||||
mov.f32 %f7, %f8;
|
||||
BB9_12: // %for_step.i313
|
||||
// in Loop: Header=BB9_13 Depth=2
|
||||
setp.ne.s32 %p1, %r8, 0;
|
||||
selp.u32 %r9, 1, 0, %p1;
|
||||
add.s32 %r5, %r5, %r9;
|
||||
setp.lt.s32 %p1, %r5, %r2;
|
||||
selp.b32 %r8, %r8, 0, %p1;
|
||||
selp.b32 %r9, %r8, 0, %p0;
|
||||
setp.gt.s32 %p1, %r9, -1;
|
||||
@%p1 bra BB9_15;
|
||||
bra.uni BB9_13;
|
||||
BB9_21: // in Loop: Header=BB9_2 Depth=1
|
||||
mov.u32 %r5, %r6;
|
||||
BB9_15: // %mandel___vyfvyfvyi.exit347
|
||||
// in Loop: Header=BB9_2 Depth=1
|
||||
setp.lt.s32 %p0, %r1, 1;
|
||||
@%p0 bra BB9_17;
|
||||
// BB#16: // %pl_dolane.i
|
||||
// in Loop: Header=BB9_2 Depth=1
|
||||
mul.lo.s32 %r6, %r3, %r1;
|
||||
shl.b32 %r6, %r6, 2;
|
||||
cvt.s64.s32 %rl1, %r6;
|
||||
add.s64 %rl1, %rl1, %rl0;
|
||||
st.u32 [%rl1], %r5;
|
||||
BB9_17: // %foreach_reset
|
||||
// in Loop: Header=BB9_2 Depth=1
|
||||
add.s32 %r3, %r3, 1;
|
||||
setp.eq.s32 %p0, %r3, %r0;
|
||||
@%p0 bra BB9_18;
|
||||
bra.uni BB9_2;
|
||||
BB9_18: // %for_exit
|
||||
ret;
|
||||
}
|
||||
|
||||
Binary file not shown.
Binary file not shown.
@@ -2,7 +2,7 @@
|
||||
EXAMPLE=mandelbrot_tasks
|
||||
CPP_SRC=mandelbrot_tasks.cpp mandelbrot_tasks_serial.cpp
|
||||
ISPC_SRC=mandelbrot_tasks.ispc
|
||||
ISPC_IA_TARGETS=avx1-i32x16
|
||||
ISPC_IA_TARGETS=sse2-i32x4,sse4-i32x8,avx1-i32x16,avx2-i32x16
|
||||
ISPC_ARM_TARGETS=neon
|
||||
|
||||
include ../common.mk
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
EXAMPLE=rt
|
||||
CPP_SRC=rt.cpp rt_serial.cpp
|
||||
ISPC_SRC=rt.ispc
|
||||
ISPC_IA_TARGETS=avx1-i32x8
|
||||
ISPC_IA_TARGETS=sse2-i32x4,sse4-i32x8,avx1-i32x8,avx2-i32x8
|
||||
ISPC_ARM_TARGETS=neon
|
||||
|
||||
include ../common.mk
|
||||
|
||||
Binary file not shown.
Binary file not shown.
@@ -1,370 +0,0 @@
|
||||
/*
|
||||
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||
* with this source code for terms and conditions that govern your use of
|
||||
* this software. Any use, reproduction, disclosure, or distribution of
|
||||
* this software and related documentation outside the terms of the EULA
|
||||
* is strictly prohibited.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef _DRVAPI_ERROR_STRING_H_
|
||||
#define _DRVAPI_ERROR_STRING_H_
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
// Error Code string definitions here
|
||||
typedef struct
|
||||
{
|
||||
char const *error_string;
|
||||
int error_id;
|
||||
} s_CudaErrorStr;
|
||||
|
||||
/**
|
||||
* Error codes
|
||||
*/
|
||||
static s_CudaErrorStr sCudaDrvErrorString[] =
|
||||
{
|
||||
/**
|
||||
* The API call returned with no errors. In the case of query calls, this
|
||||
* can also mean that the operation being queried is complete (see
|
||||
* ::cuEventQuery() and ::cuStreamQuery()).
|
||||
*/
|
||||
{ "CUDA_SUCCESS", 0 },
|
||||
|
||||
/**
|
||||
* This indicates that one or more of the parameters passed to the API call
|
||||
* is not within an acceptable range of values.
|
||||
*/
|
||||
{ "CUDA_ERROR_INVALID_VALUE", 1 },
|
||||
|
||||
/**
|
||||
* The API call failed because it was unable to allocate enough memory to
|
||||
* perform the requested operation.
|
||||
*/
|
||||
{ "CUDA_ERROR_OUT_OF_MEMORY", 2 },
|
||||
|
||||
/**
|
||||
* This indicates that the CUDA driver has not been initialized with
|
||||
* ::cuInit() or that initialization has failed.
|
||||
*/
|
||||
{ "CUDA_ERROR_NOT_INITIALIZED", 3 },
|
||||
|
||||
/**
|
||||
* This indicates that the CUDA driver is in the process of shutting down.
|
||||
*/
|
||||
{ "CUDA_ERROR_DEINITIALIZED", 4 },
|
||||
|
||||
/**
|
||||
* This indicates profiling APIs are called while application is running
|
||||
* in visual profiler mode.
|
||||
*/
|
||||
{ "CUDA_ERROR_PROFILER_DISABLED", 5 },
|
||||
/**
|
||||
* This indicates profiling has not been initialized for this context.
|
||||
* Call cuProfilerInitialize() to resolve this.
|
||||
*/
|
||||
{ "CUDA_ERROR_PROFILER_NOT_INITIALIZED", 6 },
|
||||
/**
|
||||
* This indicates profiler has already been started and probably
|
||||
* cuProfilerStart() is incorrectly called.
|
||||
*/
|
||||
{ "CUDA_ERROR_PROFILER_ALREADY_STARTED", 7 },
|
||||
/**
|
||||
* This indicates profiler has already been stopped and probably
|
||||
* cuProfilerStop() is incorrectly called.
|
||||
*/
|
||||
{ "CUDA_ERROR_PROFILER_ALREADY_STOPPED", 8 },
|
||||
/**
|
||||
* This indicates that no CUDA-capable devices were detected by the installed
|
||||
* CUDA driver.
|
||||
*/
|
||||
{ "CUDA_ERROR_NO_DEVICE (no CUDA-capable devices were detected)", 100 },
|
||||
|
||||
/**
|
||||
* This indicates that the device ordinal supplied by the user does not
|
||||
* correspond to a valid CUDA device.
|
||||
*/
|
||||
{ "CUDA_ERROR_INVALID_DEVICE (device specified is not a valid CUDA device)", 101 },
|
||||
|
||||
|
||||
/**
|
||||
* This indicates that the device kernel image is invalid. This can also
|
||||
* indicate an invalid CUDA module.
|
||||
*/
|
||||
{ "CUDA_ERROR_INVALID_IMAGE", 200 },
|
||||
|
||||
/**
|
||||
* This most frequently indicates that there is no context bound to the
|
||||
* current thread. This can also be returned if the context passed to an
|
||||
* API call is not a valid handle (such as a context that has had
|
||||
* ::cuCtxDestroy() invoked on it). This can also be returned if a user
|
||||
* mixes different API versions (i.e. 3010 context with 3020 API calls).
|
||||
* See ::cuCtxGetApiVersion() for more details.
|
||||
*/
|
||||
{ "CUDA_ERROR_INVALID_CONTEXT", 201 },
|
||||
|
||||
/**
|
||||
* This indicated that the context being supplied as a parameter to the
|
||||
* API call was already the active context.
|
||||
* \deprecated
|
||||
* This error return is deprecated as of CUDA 3.2. It is no longer an
|
||||
* error to attempt to push the active context via ::cuCtxPushCurrent().
|
||||
*/
|
||||
{ "CUDA_ERROR_CONTEXT_ALREADY_CURRENT", 202 },
|
||||
|
||||
/**
|
||||
* This indicates that a map or register operation has failed.
|
||||
*/
|
||||
{ "CUDA_ERROR_MAP_FAILED", 205 },
|
||||
|
||||
/**
|
||||
* This indicates that an unmap or unregister operation has failed.
|
||||
*/
|
||||
{ "CUDA_ERROR_UNMAP_FAILED", 206 },
|
||||
|
||||
/**
|
||||
* This indicates that the specified array is currently mapped and thus
|
||||
* cannot be destroyed.
|
||||
*/
|
||||
{ "CUDA_ERROR_ARRAY_IS_MAPPED", 207 },
|
||||
|
||||
/**
|
||||
* This indicates that the resource is already mapped.
|
||||
*/
|
||||
{ "CUDA_ERROR_ALREADY_MAPPED", 208 },
|
||||
|
||||
/**
|
||||
* This indicates that there is no kernel image available that is suitable
|
||||
* for the device. This can occur when a user specifies code generation
|
||||
* options for a particular CUDA source file that do not include the
|
||||
* corresponding device configuration.
|
||||
*/
|
||||
{ "CUDA_ERROR_NO_BINARY_FOR_GPU", 209 },
|
||||
|
||||
/**
|
||||
* This indicates that a resource has already been acquired.
|
||||
*/
|
||||
{ "CUDA_ERROR_ALREADY_ACQUIRED", 210 },
|
||||
|
||||
/**
|
||||
* This indicates that a resource is not mapped.
|
||||
*/
|
||||
{ "CUDA_ERROR_NOT_MAPPED", 211 },
|
||||
|
||||
/**
|
||||
* This indicates that a mapped resource is not available for access as an
|
||||
* array.
|
||||
*/
|
||||
{ "CUDA_ERROR_NOT_MAPPED_AS_ARRAY", 212 },
|
||||
|
||||
/**
|
||||
* This indicates that a mapped resource is not available for access as a
|
||||
* pointer.
|
||||
*/
|
||||
{ "CUDA_ERROR_NOT_MAPPED_AS_POINTER", 213 },
|
||||
|
||||
/**
|
||||
* This indicates that an uncorrectable ECC error was detected during
|
||||
* execution.
|
||||
*/
|
||||
{ "CUDA_ERROR_ECC_UNCORRECTABLE", 214 },
|
||||
|
||||
/**
|
||||
* This indicates that the ::CUlimit passed to the API call is not
|
||||
* supported by the active device.
|
||||
*/
|
||||
{ "CUDA_ERROR_UNSUPPORTED_LIMIT", 215 },
|
||||
|
||||
/**
|
||||
* This indicates that the ::CUcontext passed to the API call can
|
||||
* only be bound to a single CPU thread at a time but is already
|
||||
* bound to a CPU thread.
|
||||
*/
|
||||
{ "CUDA_ERROR_CONTEXT_ALREADY_IN_USE", 216 },
|
||||
|
||||
/**
|
||||
* This indicates that peer access is not supported across the given
|
||||
* devices.
|
||||
*/
|
||||
{ "CUDA_ERROR_PEER_ACCESS_UNSUPPORTED", 217},
|
||||
|
||||
/**
|
||||
* This indicates that the device kernel source is invalid.
|
||||
*/
|
||||
{ "CUDA_ERROR_INVALID_SOURCE", 300 },
|
||||
|
||||
/**
|
||||
* This indicates that the file specified was not found.
|
||||
*/
|
||||
{ "CUDA_ERROR_FILE_NOT_FOUND", 301 },
|
||||
|
||||
/**
|
||||
* This indicates that a link to a shared object failed to resolve.
|
||||
*/
|
||||
{ "CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND", 302 },
|
||||
|
||||
/**
|
||||
* This indicates that initialization of a shared object failed.
|
||||
*/
|
||||
{ "CUDA_ERROR_SHARED_OBJECT_INIT_FAILED", 303 },
|
||||
|
||||
/**
|
||||
* This indicates that an OS call failed.
|
||||
*/
|
||||
{ "CUDA_ERROR_OPERATING_SYSTEM", 304 },
|
||||
|
||||
|
||||
/**
|
||||
* This indicates that a resource handle passed to the API call was not
|
||||
* valid. Resource handles are opaque types like ::CUstream and ::CUevent.
|
||||
*/
|
||||
{ "CUDA_ERROR_INVALID_HANDLE", 400 },
|
||||
|
||||
|
||||
/**
|
||||
* This indicates that a named symbol was not found. Examples of symbols
|
||||
* are global/constant variable names, texture names }, and surface names.
|
||||
*/
|
||||
{ "CUDA_ERROR_NOT_FOUND", 500 },
|
||||
|
||||
|
||||
/**
|
||||
* This indicates that asynchronous operations issued previously have not
|
||||
* completed yet. This result is not actually an error, but must be indicated
|
||||
* differently than ::CUDA_SUCCESS (which indicates completion). Calls that
|
||||
* may return this value include ::cuEventQuery() and ::cuStreamQuery().
|
||||
*/
|
||||
{ "CUDA_ERROR_NOT_READY", 600 },
|
||||
|
||||
|
||||
/**
|
||||
* An exception occurred on the device while executing a kernel. Common
|
||||
* causes include dereferencing an invalid device pointer and accessing
|
||||
* out of bounds shared memory. The context cannot be used }, so it must
|
||||
* be destroyed (and a new one should be created). All existing device
|
||||
* memory allocations from this context are invalid and must be
|
||||
* reconstructed if the program is to continue using CUDA.
|
||||
*/
|
||||
{ "CUDA_ERROR_LAUNCH_FAILED", 700 },
|
||||
|
||||
/**
|
||||
* This indicates that a launch did not occur because it did not have
|
||||
* appropriate resources. This error usually indicates that the user has
|
||||
* attempted to pass too many arguments to the device kernel, or the
|
||||
* kernel launch specifies too many threads for the kernel's register
|
||||
* count. Passing arguments of the wrong size (i.e. a 64-bit pointer
|
||||
* when a 32-bit int is expected) is equivalent to passing too many
|
||||
* arguments and can also result in this error.
|
||||
*/
|
||||
{ "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES", 701 },
|
||||
|
||||
/**
|
||||
* This indicates that the device kernel took too long to execute. This can
|
||||
* only occur if timeouts are enabled - see the device attribute
|
||||
* ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT for more information. The
|
||||
* context cannot be used (and must be destroyed similar to
|
||||
* ::CUDA_ERROR_LAUNCH_FAILED). All existing device memory allocations from
|
||||
* this context are invalid and must be reconstructed if the program is to
|
||||
* continue using CUDA.
|
||||
*/
|
||||
{ "CUDA_ERROR_LAUNCH_TIMEOUT", 702 },
|
||||
|
||||
/**
|
||||
* This error indicates a kernel launch that uses an incompatible texturing
|
||||
* mode.
|
||||
*/
|
||||
{ "CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING", 703 },
|
||||
|
||||
/**
|
||||
* This error indicates that a call to ::cuCtxEnablePeerAccess() is
|
||||
* trying to re-enable peer access to a context which has already
|
||||
* had peer access to it enabled.
|
||||
*/
|
||||
{ "CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED", 704 },
|
||||
|
||||
/**
|
||||
* This error indicates that ::cuCtxDisablePeerAccess() is
|
||||
* trying to disable peer access which has not been enabled yet
|
||||
* via ::cuCtxEnablePeerAccess().
|
||||
*/
|
||||
{ "CUDA_ERROR_PEER_ACCESS_NOT_ENABLED", 705 },
|
||||
|
||||
/**
|
||||
* This error indicates that the primary context for the specified device
|
||||
* has already been initialized.
|
||||
*/
|
||||
{ "CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE", 708 },
|
||||
|
||||
/**
|
||||
* This error indicates that the context current to the calling thread
|
||||
* has been destroyed using ::cuCtxDestroy }, or is a primary context which
|
||||
* has not yet been initialized.
|
||||
*/
|
||||
{ "CUDA_ERROR_CONTEXT_IS_DESTROYED", 709 },
|
||||
|
||||
/**
|
||||
* A device-side assert triggered during kernel execution. The context
|
||||
* cannot be used anymore, and must be destroyed. All existing device
|
||||
* memory allocations from this context are invalid and must be
|
||||
* reconstructed if the program is to continue using CUDA.
|
||||
*/
|
||||
{ "CUDA_ERROR_ASSERT", 710 },
|
||||
|
||||
/**
|
||||
* This error indicates that the hardware resources required to enable
|
||||
* peer access have been exhausted for one or more of the devices
|
||||
* passed to ::cuCtxEnablePeerAccess().
|
||||
*/
|
||||
{ "CUDA_ERROR_TOO_MANY_PEERS", 711 },
|
||||
|
||||
/**
|
||||
* This error indicates that the memory range passed to ::cuMemHostRegister()
|
||||
* has already been registered.
|
||||
*/
|
||||
{ "CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED", 712 },
|
||||
|
||||
/**
|
||||
* This error indicates that the pointer passed to ::cuMemHostUnregister()
|
||||
* does not correspond to any currently registered memory region.
|
||||
*/
|
||||
{ "CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED", 713 },
|
||||
|
||||
/**
|
||||
* This error indicates that the attempted operation is not permitted.
|
||||
*/
|
||||
{ "CUDA_ERROR_NOT_PERMITTED", 800 },
|
||||
|
||||
/**
|
||||
* This error indicates that the attempted operation is not supported
|
||||
* on the current system or device.
|
||||
*/
|
||||
{ "CUDA_ERROR_NOT_SUPPORTED", 801 },
|
||||
|
||||
/**
|
||||
* This indicates that an unknown internal error has occurred.
|
||||
*/
|
||||
{ "CUDA_ERROR_UNKNOWN", 999 },
|
||||
{ NULL, -1 }
|
||||
};
|
||||
|
||||
// This is just a linear search through the array, since the error_id's are not
|
||||
// always ocurring consecutively
|
||||
const char * getCudaDrvErrorString(CUresult error_id)
|
||||
{
|
||||
int index = 0;
|
||||
while (sCudaDrvErrorString[index].error_id != error_id &&
|
||||
sCudaDrvErrorString[index].error_id != -1)
|
||||
{
|
||||
index++;
|
||||
}
|
||||
if (sCudaDrvErrorString[index].error_id == error_id)
|
||||
return (const char *)sCudaDrvErrorString[index].error_string;
|
||||
else
|
||||
return (const char *)"CUDA_ERROR not found!";
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -1,5 +0,0 @@
|
||||
I have been working with sort example, attempting to use ISPC_USE_OMP for tasking and adding example for sort_paralle.cpp which uses __gnu_parallel::sort to compare apples with apples, but clang has no support for OpenMP.
|
||||
|
||||
The reason to use ISPC_USE_OMP is to control thread-affinity on multi-socket systems. For bandwidth bound throughput, the tasking system based on pthread make it messy to control thread-affinity and w/o this for bandwidth bound work-loads performance may suffer..
|
||||
|
||||
I used sort example to begin with
|
||||
@@ -1,62 +0,0 @@
|
||||
#define programCount 32
|
||||
#define programIndex threadIdx.x
|
||||
#define taskIndex blockIdx.x
|
||||
|
||||
__device__ static void
|
||||
stencil_step( int x0, int x1,
|
||||
int y0, int y1,
|
||||
int z0, int z1,
|
||||
int Nx, int Ny, int Nz,
|
||||
const double coef[4], const double vsq[],
|
||||
const double Ain[], double Aout[]) {
|
||||
const int Nxy = Nx * Ny;
|
||||
|
||||
|
||||
#if 0
|
||||
foreach (z = z0 ... z1, y = y0 ... y1, x = x0 ... x1) {
|
||||
#else
|
||||
const double coef0 = coef[0];
|
||||
const double coef1 = coef[1];
|
||||
const double coef2 = coef[2];
|
||||
const double coef3 = coef[3];
|
||||
for ( int z = z0; z < z1; z++)
|
||||
for ( int y = y0 ; y < y1; y++)
|
||||
for ( int xb = x0; xb < x1; xb += programCount)
|
||||
{
|
||||
const int x = xb + programIndex;
|
||||
|
||||
#endif
|
||||
int index = (z * Nxy) + (y * Nx) + x;
|
||||
#define A_cur(x, y, z) Ain[index + (x) + ((y) * Nx) + ((z) * Nxy)]
|
||||
#define A_next(x, y, z) Aout[index + (x) + ((y) * Nx) + ((z) * Nxy)]
|
||||
double div =
|
||||
coef0 * A_cur(0, 0, 0) +
|
||||
coef1 * (A_cur(+1, 0, 0) + A_cur(-1, 0, 0) +
|
||||
A_cur(0, +1, 0) + A_cur(0, -1, 0) +
|
||||
A_cur(0, 0, +1) + A_cur(0, 0, -1)) +
|
||||
coef2 * (A_cur(+2, 0, 0) + A_cur(-2, 0, 0) +
|
||||
A_cur(0, +2, 0) + A_cur(0, -2, 0) +
|
||||
A_cur(0, 0, +2) + A_cur(0, 0, -2)) +
|
||||
coef3 * (A_cur(+3, 0, 0) + A_cur(-3, 0, 0) +
|
||||
A_cur(0, +3, 0) + A_cur(0, -3, 0) +
|
||||
A_cur(0, 0, +3) + A_cur(0, 0, -3));
|
||||
|
||||
if (x < x1)
|
||||
A_next(0, 0, 0) = 2.0 * A_cur(0, 0, 0) - A_next(0, 0, 0) +
|
||||
vsq[index] * div;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
extern "C"
|
||||
__global__ void
|
||||
stencil_step_task( int x0, int x1,
|
||||
int y0, int y1,
|
||||
int z0,
|
||||
int Nx, int Ny, int Nz,
|
||||
const double coef[4], const double vsq[],
|
||||
const double Ain[], double Aout[]) {
|
||||
stencil_step(x0, x1, y0, y1, z0+taskIndex, z0+taskIndex+1,
|
||||
Nx, Ny, Nz, coef, vsq, Ain, Aout);
|
||||
}
|
||||
|
||||
@@ -1,267 +0,0 @@
|
||||
//
|
||||
// Generated by NVIDIA NVVM Compiler
|
||||
// Compiler built on Thu Jul 18 02:37:37 2013 (1374107857)
|
||||
// Cuda compilation tools, release 5.5, V5.5.0
|
||||
//
|
||||
|
||||
.version 3.2
|
||||
.target sm_35
|
||||
.address_size 64
|
||||
|
||||
.file 1 "/home/evghenii/soft/ispc-code/ispc/examples/stencil/stencil.cu", 1383254912, 2112
|
||||
.file 2 "/usr/local/cuda-5.5/bin/..//include/cuda_device_runtime_api.h", 1375338991, 7655
|
||||
|
||||
.weak .func (.param .b32 func_retval0) cudaMalloc(
|
||||
.param .b64 cudaMalloc_param_0,
|
||||
.param .b64 cudaMalloc_param_1
|
||||
)
|
||||
{
|
||||
.reg .s32 %r<2>;
|
||||
|
||||
|
||||
mov.u32 %r1, 30;
|
||||
st.param.b32 [func_retval0+0], %r1;
|
||||
.loc 2 66 3
|
||||
ret;
|
||||
}
|
||||
|
||||
.weak .func (.param .b32 func_retval0) cudaFuncGetAttributes(
|
||||
.param .b64 cudaFuncGetAttributes_param_0,
|
||||
.param .b64 cudaFuncGetAttributes_param_1
|
||||
)
|
||||
{
|
||||
.reg .s32 %r<2>;
|
||||
|
||||
|
||||
mov.u32 %r1, 30;
|
||||
st.param.b32 [func_retval0+0], %r1;
|
||||
.loc 2 71 3
|
||||
ret;
|
||||
}
|
||||
|
||||
.visible .entry stencil_step_task(
|
||||
.param .u32 stencil_step_task_param_0,
|
||||
.param .u32 stencil_step_task_param_1,
|
||||
.param .u32 stencil_step_task_param_2,
|
||||
.param .u32 stencil_step_task_param_3,
|
||||
.param .u32 stencil_step_task_param_4,
|
||||
.param .u32 stencil_step_task_param_5,
|
||||
.param .u32 stencil_step_task_param_6,
|
||||
.param .u32 stencil_step_task_param_7,
|
||||
.param .u64 stencil_step_task_param_8,
|
||||
.param .u64 stencil_step_task_param_9,
|
||||
.param .u64 stencil_step_task_param_10,
|
||||
.param .u64 stencil_step_task_param_11
|
||||
)
|
||||
{
|
||||
.reg .pred %p<8>;
|
||||
.reg .s32 %r<54>;
|
||||
.reg .s64 %rd<36>;
|
||||
.reg .f64 %fd<48>;
|
||||
|
||||
|
||||
ld.param.u32 %r19, [stencil_step_task_param_0];
|
||||
ld.param.u32 %r20, [stencil_step_task_param_1];
|
||||
ld.param.u32 %r21, [stencil_step_task_param_2];
|
||||
ld.param.u32 %r22, [stencil_step_task_param_3];
|
||||
ld.param.u32 %r23, [stencil_step_task_param_4];
|
||||
ld.param.u32 %r24, [stencil_step_task_param_5];
|
||||
ld.param.u32 %r25, [stencil_step_task_param_6];
|
||||
ld.param.u64 %rd4, [stencil_step_task_param_8];
|
||||
ld.param.u64 %rd1, [stencil_step_task_param_9];
|
||||
ld.param.u64 %rd2, [stencil_step_task_param_10];
|
||||
ld.param.u64 %rd3, [stencil_step_task_param_11];
|
||||
cvta.to.global.u64 %rd5, %rd4;
|
||||
.loc 1 59 1
|
||||
mov.u32 %r26, %ctaid.x;
|
||||
add.s32 %r51, %r26, %r23;
|
||||
add.s32 %r27, %r51, 1;
|
||||
.loc 1 18 1
|
||||
ld.global.f64 %fd1, [%rd5];
|
||||
.loc 1 19 1
|
||||
ld.global.f64 %fd2, [%rd5+8];
|
||||
.loc 1 20 1
|
||||
ld.global.f64 %fd3, [%rd5+16];
|
||||
.loc 1 21 1
|
||||
ld.global.f64 %fd4, [%rd5+24];
|
||||
.loc 1 22 1
|
||||
setp.ge.s32 %p1, %r51, %r27;
|
||||
@%p1 bra BB2_11;
|
||||
|
||||
mul.lo.s32 %r28, %r25, %r24;
|
||||
shl.b32 %r29, %r28, 1;
|
||||
neg.s32 %r30, %r29;
|
||||
shl.b32 %r2, %r30, 3;
|
||||
cvta.to.global.u64 %rd6, %rd2;
|
||||
cvta.to.global.u64 %rd31, %rd3;
|
||||
cvta.to.global.u64 %rd32, %rd1;
|
||||
|
||||
BB2_2:
|
||||
.loc 1 23 1
|
||||
setp.ge.s32 %p2, %r21, %r22;
|
||||
@%p2 bra BB2_10;
|
||||
|
||||
mov.u32 %r52, %r21;
|
||||
|
||||
BB2_4:
|
||||
.loc 1 24 1
|
||||
mov.u32 %r4, %r52;
|
||||
setp.ge.s32 %p3, %r19, %r20;
|
||||
@%p3 bra BB2_9;
|
||||
|
||||
.loc 1 29 1
|
||||
mul.lo.s32 %r32, %r51, %r28;
|
||||
mad.lo.s32 %r5, %r4, %r24, %r32;
|
||||
.loc 1 32 1
|
||||
add.s32 %r6, %r24, %r5;
|
||||
add.s32 %r7, %r5, %r28;
|
||||
shl.b32 %r33, %r24, 1;
|
||||
add.s32 %r8, %r5, %r33;
|
||||
mad.lo.s32 %r9, %r24, -2, %r5;
|
||||
add.s32 %r10, %r5, %r29;
|
||||
mad.lo.s32 %r11, %r28, -2, %r5;
|
||||
add.s32 %r12, %r24, %r8;
|
||||
mad.lo.s32 %r13, %r28, 3, %r5;
|
||||
mov.u32 %r53, %r19;
|
||||
|
||||
BB2_6:
|
||||
.loc 1 26 1
|
||||
mov.u32 %r14, %r53;
|
||||
mov.u32 %r35, %tid.x;
|
||||
add.s32 %r36, %r35, %r14;
|
||||
.loc 1 29 1
|
||||
add.s32 %r15, %r36, %r5;
|
||||
mul.wide.s32 %rd7, %r15, 8;
|
||||
add.s64 %rd8, %rd6, %rd7;
|
||||
.loc 1 32 1
|
||||
ld.global.f64 %fd5, [%rd8];
|
||||
ld.global.f64 %fd7, [%rd8+-8];
|
||||
ld.global.f64 %fd8, [%rd8+8];
|
||||
add.f64 %fd9, %fd8, %fd7;
|
||||
add.s32 %r37, %r6, %r36;
|
||||
mul.wide.s32 %rd9, %r37, 8;
|
||||
add.s64 %rd10, %rd6, %rd9;
|
||||
.loc 1 32 1
|
||||
ld.global.f64 %fd10, [%rd10];
|
||||
add.f64 %fd11, %fd9, %fd10;
|
||||
.loc 1 22 1
|
||||
neg.s32 %r39, %r33;
|
||||
shl.b32 %r40, %r39, 3;
|
||||
cvt.s64.s32 %rd11, %r40;
|
||||
add.s64 %rd12, %rd10, %rd11;
|
||||
.loc 1 32 1
|
||||
ld.global.f64 %fd12, [%rd12];
|
||||
add.f64 %fd13, %fd11, %fd12;
|
||||
add.s32 %r41, %r7, %r36;
|
||||
mul.wide.s32 %rd13, %r41, 8;
|
||||
add.s64 %rd14, %rd6, %rd13;
|
||||
.loc 1 32 1
|
||||
ld.global.f64 %fd14, [%rd14];
|
||||
add.f64 %fd15, %fd13, %fd14;
|
||||
cvt.s64.s32 %rd15, %r2;
|
||||
add.s64 %rd16, %rd14, %rd15;
|
||||
.loc 1 32 1
|
||||
ld.global.f64 %fd16, [%rd16];
|
||||
add.f64 %fd17, %fd15, %fd16;
|
||||
mul.f64 %fd18, %fd2, %fd17;
|
||||
fma.rn.f64 %fd19, %fd1, %fd5, %fd18;
|
||||
ld.global.f64 %fd20, [%rd8+-16];
|
||||
ld.global.f64 %fd21, [%rd8+16];
|
||||
add.f64 %fd22, %fd21, %fd20;
|
||||
add.s32 %r42, %r8, %r36;
|
||||
mul.wide.s32 %rd17, %r42, 8;
|
||||
add.s64 %rd18, %rd6, %rd17;
|
||||
.loc 1 32 1
|
||||
ld.global.f64 %fd23, [%rd18];
|
||||
add.f64 %fd24, %fd22, %fd23;
|
||||
add.s32 %r43, %r9, %r36;
|
||||
mul.wide.s32 %rd19, %r43, 8;
|
||||
add.s64 %rd20, %rd6, %rd19;
|
||||
.loc 1 32 1
|
||||
ld.global.f64 %fd25, [%rd20];
|
||||
add.f64 %fd26, %fd24, %fd25;
|
||||
add.s32 %r44, %r10, %r36;
|
||||
mul.wide.s32 %rd21, %r44, 8;
|
||||
add.s64 %rd22, %rd6, %rd21;
|
||||
.loc 1 32 1
|
||||
ld.global.f64 %fd27, [%rd22];
|
||||
add.f64 %fd28, %fd26, %fd27;
|
||||
add.s32 %r45, %r11, %r36;
|
||||
mul.wide.s32 %rd23, %r45, 8;
|
||||
add.s64 %rd24, %rd6, %rd23;
|
||||
.loc 1 32 1
|
||||
ld.global.f64 %fd29, [%rd24];
|
||||
add.f64 %fd30, %fd28, %fd29;
|
||||
fma.rn.f64 %fd31, %fd3, %fd30, %fd19;
|
||||
ld.global.f64 %fd32, [%rd8+-24];
|
||||
ld.global.f64 %fd33, [%rd8+24];
|
||||
add.f64 %fd34, %fd33, %fd32;
|
||||
add.s32 %r46, %r12, %r36;
|
||||
mul.wide.s32 %rd25, %r46, 8;
|
||||
add.s64 %rd26, %rd6, %rd25;
|
||||
.loc 1 32 1
|
||||
ld.global.f64 %fd35, [%rd26];
|
||||
add.f64 %fd36, %fd34, %fd35;
|
||||
add.s64 %rd27, %rd12, %rd11;
|
||||
.loc 1 32 1
|
||||
ld.global.f64 %fd37, [%rd27];
|
||||
add.f64 %fd38, %fd36, %fd37;
|
||||
add.s32 %r47, %r13, %r36;
|
||||
mul.wide.s32 %rd28, %r47, 8;
|
||||
add.s64 %rd29, %rd6, %rd28;
|
||||
.loc 1 32 1
|
||||
ld.global.f64 %fd39, [%rd29];
|
||||
add.f64 %fd40, %fd38, %fd39;
|
||||
add.s64 %rd30, %rd16, %rd15;
|
||||
.loc 1 32 1
|
||||
ld.global.f64 %fd41, [%rd30];
|
||||
add.f64 %fd42, %fd40, %fd41;
|
||||
fma.rn.f64 %fd6, %fd4, %fd42, %fd31;
|
||||
.loc 1 44 1
|
||||
setp.ge.s32 %p4, %r36, %r20;
|
||||
@%p4 bra BB2_8;
|
||||
|
||||
mul.wide.s32 %rd33, %r15, 8;
|
||||
add.s64 %rd34, %rd31, %rd33;
|
||||
.loc 1 45 1
|
||||
ld.global.f64 %fd43, [%rd34];
|
||||
add.f64 %fd44, %fd5, %fd5;
|
||||
sub.f64 %fd45, %fd44, %fd43;
|
||||
add.s64 %rd35, %rd32, %rd33;
|
||||
.loc 1 45 1
|
||||
ld.global.f64 %fd46, [%rd35];
|
||||
fma.rn.f64 %fd47, %fd46, %fd6, %fd45;
|
||||
st.global.f64 [%rd34], %fd47;
|
||||
|
||||
BB2_8:
|
||||
.loc 1 24 19
|
||||
add.s32 %r16, %r14, 32;
|
||||
.loc 1 24 1
|
||||
setp.lt.s32 %p5, %r16, %r20;
|
||||
mov.u32 %r53, %r16;
|
||||
@%p5 bra BB2_6;
|
||||
|
||||
BB2_9:
|
||||
.loc 1 23 18
|
||||
add.s32 %r17, %r4, 1;
|
||||
.loc 1 23 1
|
||||
setp.lt.s32 %p6, %r17, %r22;
|
||||
mov.u32 %r52, %r17;
|
||||
@%p6 bra BB2_4;
|
||||
|
||||
BB2_10:
|
||||
.loc 1 22 18
|
||||
add.s32 %r51, %r51, 1;
|
||||
.loc 1 59 1
|
||||
add.s32 %r49, %r23, %r26;
|
||||
add.s32 %r50, %r49, 1;
|
||||
.loc 1 22 1
|
||||
setp.lt.s32 %p7, %r51, %r50;
|
||||
@%p7 bra BB2_2;
|
||||
|
||||
BB2_11:
|
||||
.loc 1 61 2
|
||||
ret;
|
||||
}
|
||||
|
||||
|
||||
@@ -1,224 +0,0 @@
|
||||
//
|
||||
// Generated by NVIDIA NVVM Compiler
|
||||
// Compiler built on Thu Jul 18 02:37:37 2013 (1374107857)
|
||||
// Cuda compilation tools, release 5.5, V5.5.0
|
||||
//
|
||||
|
||||
.version 3.2
|
||||
.target sm_35
|
||||
.address_size 64
|
||||
|
||||
.file 1 "/home/evghenii/soft/ispc-code/ispc/examples/stencil/stencil.cu", 1383254912, 2112
|
||||
|
||||
)
|
||||
{
|
||||
.reg .s32 %r<2>;
|
||||
|
||||
|
||||
mov.u32 %r1, 30;
|
||||
st.param.b32 [func_retval0+0], %r1;
|
||||
ret;
|
||||
}
|
||||
|
||||
.weak .func (.param .b32 func_retval0) cudaFuncGetAttributes(
|
||||
.param .b64 cudaFuncGetAttributes_param_0,
|
||||
.param .b64 cudaFuncGetAttributes_param_1
|
||||
)
|
||||
{
|
||||
.reg .s32 %r<2>;
|
||||
|
||||
|
||||
mov.u32 %r1, 30;
|
||||
st.param.b32 [func_retval0+0], %r1;
|
||||
ret;
|
||||
}
|
||||
|
||||
.visible .entry stencil_step_task(
|
||||
.param .u32 stencil_step_task_param_0,
|
||||
.param .u32 stencil_step_task_param_1,
|
||||
.param .u32 stencil_step_task_param_2,
|
||||
.param .u32 stencil_step_task_param_3,
|
||||
.param .u32 stencil_step_task_param_4,
|
||||
.param .u32 stencil_step_task_param_5,
|
||||
.param .u32 stencil_step_task_param_6,
|
||||
.param .u32 stencil_step_task_param_7,
|
||||
.param .u64 stencil_step_task_param_8,
|
||||
.param .u64 stencil_step_task_param_9,
|
||||
.param .u64 stencil_step_task_param_10,
|
||||
.param .u64 stencil_step_task_param_11
|
||||
)
|
||||
{
|
||||
.reg .pred %p<8>;
|
||||
.reg .s32 %r<54>;
|
||||
.reg .s64 %rd<36>;
|
||||
.reg .f64 %fd<48>;
|
||||
|
||||
|
||||
ld.param.u32 %r19, [stencil_step_task_param_0];
|
||||
ld.param.u32 %r20, [stencil_step_task_param_1];
|
||||
ld.param.u32 %r21, [stencil_step_task_param_2];
|
||||
ld.param.u32 %r22, [stencil_step_task_param_3];
|
||||
ld.param.u32 %r23, [stencil_step_task_param_4];
|
||||
ld.param.u32 %r24, [stencil_step_task_param_5];
|
||||
ld.param.u32 %r25, [stencil_step_task_param_6];
|
||||
ld.param.u64 %rd4, [stencil_step_task_param_8];
|
||||
ld.param.u64 %rd1, [stencil_step_task_param_9];
|
||||
ld.param.u64 %rd2, [stencil_step_task_param_10];
|
||||
ld.param.u64 %rd3, [stencil_step_task_param_11];
|
||||
cvta.to.global.u64 %rd5, %rd4;
|
||||
mov.u32 %r26, %ctaid.x;
|
||||
add.s32 %r51, %r26, %r23;
|
||||
add.s32 %r27, %r51, 1;
|
||||
ld.global.f64 %fd1, [%rd5];
|
||||
ld.global.f64 %fd2, [%rd5+8];
|
||||
ld.global.f64 %fd3, [%rd5+16];
|
||||
ld.global.f64 %fd4, [%rd5+24];
|
||||
setp.ge.s32 %p1, %r51, %r27;
|
||||
@%p1 bra BB2_11;
|
||||
|
||||
mul.lo.s32 %r28, %r25, %r24;
|
||||
shl.b32 %r29, %r28, 1;
|
||||
neg.s32 %r30, %r29;
|
||||
shl.b32 %r2, %r30, 3;
|
||||
cvta.to.global.u64 %rd6, %rd2;
|
||||
cvta.to.global.u64 %rd31, %rd3;
|
||||
cvta.to.global.u64 %rd32, %rd1;
|
||||
|
||||
BB2_2:
|
||||
setp.ge.s32 %p2, %r21, %r22;
|
||||
@%p2 bra BB2_10;
|
||||
|
||||
mov.u32 %r52, %r21;
|
||||
|
||||
BB2_4:
|
||||
mov.u32 %r4, %r52;
|
||||
setp.ge.s32 %p3, %r19, %r20;
|
||||
@%p3 bra BB2_9;
|
||||
|
||||
mul.lo.s32 %r32, %r51, %r28;
|
||||
mad.lo.s32 %r5, %r4, %r24, %r32;
|
||||
add.s32 %r6, %r24, %r5;
|
||||
add.s32 %r7, %r5, %r28;
|
||||
shl.b32 %r33, %r24, 1;
|
||||
add.s32 %r8, %r5, %r33;
|
||||
mad.lo.s32 %r9, %r24, -2, %r5;
|
||||
add.s32 %r10, %r5, %r29;
|
||||
mad.lo.s32 %r11, %r28, -2, %r5;
|
||||
add.s32 %r12, %r24, %r8;
|
||||
mad.lo.s32 %r13, %r28, 3, %r5;
|
||||
mov.u32 %r53, %r19;
|
||||
|
||||
BB2_6:
|
||||
mov.u32 %r14, %r53;
|
||||
mov.u32 %r35, %tid.x;
|
||||
add.s32 %r36, %r35, %r14;
|
||||
add.s32 %r15, %r36, %r5;
|
||||
mul.wide.s32 %rd7, %r15, 8;
|
||||
add.s64 %rd8, %rd6, %rd7;
|
||||
ld.global.f64 %fd5, [%rd8];
|
||||
ld.global.f64 %fd7, [%rd8+-8];
|
||||
ld.global.f64 %fd8, [%rd8+8];
|
||||
add.f64 %fd9, %fd8, %fd7;
|
||||
add.s32 %r37, %r6, %r36;
|
||||
mul.wide.s32 %rd9, %r37, 8;
|
||||
add.s64 %rd10, %rd6, %rd9;
|
||||
ld.global.f64 %fd10, [%rd10];
|
||||
add.f64 %fd11, %fd9, %fd10;
|
||||
neg.s32 %r39, %r33;
|
||||
shl.b32 %r40, %r39, 3;
|
||||
cvt.s64.s32 %rd11, %r40;
|
||||
add.s64 %rd12, %rd10, %rd11;
|
||||
ld.global.f64 %fd12, [%rd12];
|
||||
add.f64 %fd13, %fd11, %fd12;
|
||||
add.s32 %r41, %r7, %r36;
|
||||
mul.wide.s32 %rd13, %r41, 8;
|
||||
add.s64 %rd14, %rd6, %rd13;
|
||||
ld.global.f64 %fd14, [%rd14];
|
||||
add.f64 %fd15, %fd13, %fd14;
|
||||
cvt.s64.s32 %rd15, %r2;
|
||||
add.s64 %rd16, %rd14, %rd15;
|
||||
ld.global.f64 %fd16, [%rd16];
|
||||
add.f64 %fd17, %fd15, %fd16;
|
||||
mul.f64 %fd18, %fd2, %fd17;
|
||||
fma.rn.f64 %fd19, %fd1, %fd5, %fd18;
|
||||
ld.global.f64 %fd20, [%rd8+-16];
|
||||
ld.global.f64 %fd21, [%rd8+16];
|
||||
add.f64 %fd22, %fd21, %fd20;
|
||||
add.s32 %r42, %r8, %r36;
|
||||
mul.wide.s32 %rd17, %r42, 8;
|
||||
add.s64 %rd18, %rd6, %rd17;
|
||||
ld.global.f64 %fd23, [%rd18];
|
||||
add.f64 %fd24, %fd22, %fd23;
|
||||
add.s32 %r43, %r9, %r36;
|
||||
mul.wide.s32 %rd19, %r43, 8;
|
||||
add.s64 %rd20, %rd6, %rd19;
|
||||
ld.global.f64 %fd25, [%rd20];
|
||||
add.f64 %fd26, %fd24, %fd25;
|
||||
add.s32 %r44, %r10, %r36;
|
||||
mul.wide.s32 %rd21, %r44, 8;
|
||||
add.s64 %rd22, %rd6, %rd21;
|
||||
ld.global.f64 %fd27, [%rd22];
|
||||
add.f64 %fd28, %fd26, %fd27;
|
||||
add.s32 %r45, %r11, %r36;
|
||||
mul.wide.s32 %rd23, %r45, 8;
|
||||
add.s64 %rd24, %rd6, %rd23;
|
||||
ld.global.f64 %fd29, [%rd24];
|
||||
add.f64 %fd30, %fd28, %fd29;
|
||||
fma.rn.f64 %fd31, %fd3, %fd30, %fd19;
|
||||
ld.global.f64 %fd32, [%rd8+-24];
|
||||
ld.global.f64 %fd33, [%rd8+24];
|
||||
add.f64 %fd34, %fd33, %fd32;
|
||||
add.s32 %r46, %r12, %r36;
|
||||
mul.wide.s32 %rd25, %r46, 8;
|
||||
add.s64 %rd26, %rd6, %rd25;
|
||||
ld.global.f64 %fd35, [%rd26];
|
||||
add.f64 %fd36, %fd34, %fd35;
|
||||
add.s64 %rd27, %rd12, %rd11;
|
||||
ld.global.f64 %fd37, [%rd27];
|
||||
add.f64 %fd38, %fd36, %fd37;
|
||||
add.s32 %r47, %r13, %r36;
|
||||
mul.wide.s32 %rd28, %r47, 8;
|
||||
add.s64 %rd29, %rd6, %rd28;
|
||||
ld.global.f64 %fd39, [%rd29];
|
||||
add.f64 %fd40, %fd38, %fd39;
|
||||
add.s64 %rd30, %rd16, %rd15;
|
||||
ld.global.f64 %fd41, [%rd30];
|
||||
add.f64 %fd42, %fd40, %fd41;
|
||||
fma.rn.f64 %fd6, %fd4, %fd42, %fd31;
|
||||
setp.ge.s32 %p4, %r36, %r20;
|
||||
@%p4 bra BB2_8;
|
||||
|
||||
mul.wide.s32 %rd33, %r15, 8;
|
||||
add.s64 %rd34, %rd31, %rd33;
|
||||
ld.global.f64 %fd43, [%rd34];
|
||||
add.f64 %fd44, %fd5, %fd5;
|
||||
sub.f64 %fd45, %fd44, %fd43;
|
||||
add.s64 %rd35, %rd32, %rd33;
|
||||
ld.global.f64 %fd46, [%rd35];
|
||||
fma.rn.f64 %fd47, %fd46, %fd6, %fd45;
|
||||
st.global.f64 [%rd34], %fd47;
|
||||
|
||||
BB2_8:
|
||||
add.s32 %r16, %r14, 32;
|
||||
setp.lt.s32 %p5, %r16, %r20;
|
||||
mov.u32 %r53, %r16;
|
||||
@%p5 bra BB2_6;
|
||||
|
||||
BB2_9:
|
||||
add.s32 %r17, %r4, 1;
|
||||
setp.lt.s32 %p6, %r17, %r22;
|
||||
mov.u32 %r52, %r17;
|
||||
@%p6 bra BB2_4;
|
||||
|
||||
BB2_10:
|
||||
add.s32 %r51, %r51, 1;
|
||||
add.s32 %r49, %r23, %r26;
|
||||
add.s32 %r50, %r49, 1;
|
||||
setp.lt.s32 %p7, %r51, %r50;
|
||||
@%p7 bra BB2_2;
|
||||
|
||||
BB2_11:
|
||||
ret;
|
||||
}
|
||||
|
||||
|
||||
@@ -1,247 +0,0 @@
|
||||
//
|
||||
// Generated by LLVM NVPTX Back-End
|
||||
//
|
||||
|
||||
.version 3.1
|
||||
.target sm_20, texmode_independent
|
||||
.address_size 64
|
||||
|
||||
// .globl stencil_step_task
|
||||
// @stencil_step_task
|
||||
.entry stencil_step_task(
|
||||
.param .u32 stencil_step_task_param_0,
|
||||
.param .u32 stencil_step_task_param_1,
|
||||
.param .u32 stencil_step_task_param_2,
|
||||
.param .u32 stencil_step_task_param_3,
|
||||
.param .u32 stencil_step_task_param_4,
|
||||
.param .u32 stencil_step_task_param_5,
|
||||
.param .u32 stencil_step_task_param_6,
|
||||
.param .u32 stencil_step_task_param_7,
|
||||
.param .u64 .ptr .align 8 stencil_step_task_param_8,
|
||||
.param .u64 .ptr .align 8 stencil_step_task_param_9,
|
||||
.param .u64 .ptr .align 8 stencil_step_task_param_10,
|
||||
.param .u64 .ptr .align 8 stencil_step_task_param_11
|
||||
)
|
||||
{
|
||||
.reg .pred %p<396>;
|
||||
.reg .s16 %rc<396>;
|
||||
.reg .s16 %rs<396>;
|
||||
.reg .s32 %r<396>;
|
||||
.reg .s64 %rl<396>;
|
||||
.reg .f32 %f<396>;
|
||||
.reg .f64 %fl<396>;
|
||||
|
||||
// BB#0: // %allocas
|
||||
mov.u32 %r12, %ctaid.x;
|
||||
ld.param.u32 %r13, [stencil_step_task_param_4];
|
||||
add.s32 %r16, %r12, %r13;
|
||||
add.s32 %r0, %r16, 1;
|
||||
setp.ge.s32 %p0, %r16, %r0;
|
||||
@%p0 bra BB0_11;
|
||||
// BB#1: // %for_test28.i.preheader.lr.ph
|
||||
ld.param.u32 %r0, [stencil_step_task_param_0];
|
||||
ld.param.u32 %r1, [stencil_step_task_param_1];
|
||||
ld.param.u32 %r2, [stencil_step_task_param_2];
|
||||
ld.param.u32 %r3, [stencil_step_task_param_3];
|
||||
ld.param.u32 %r4, [stencil_step_task_param_5];
|
||||
ld.param.u32 %r5, [stencil_step_task_param_6];
|
||||
mul.lo.s32 %r5, %r5, %r4;
|
||||
ld.param.u64 %rl3, [stencil_step_task_param_8];
|
||||
ld.f64 %fl0, [%rl3];
|
||||
ld.f64 %fl1, [%rl3+8];
|
||||
ld.param.u64 %rl0, [stencil_step_task_param_9];
|
||||
ld.f64 %fl2, [%rl3+16];
|
||||
ld.param.u64 %rl1, [stencil_step_task_param_10];
|
||||
ld.param.u64 %rl2, [stencil_step_task_param_11];
|
||||
ld.f64 %fl3, [%rl3+24];
|
||||
shl.b32 %r6, %r4, 1;
|
||||
mul.lo.s32 %r7, %r4, 3;
|
||||
mul.lo.s32 %r8, %r4, -3;
|
||||
shl.b32 %r9, %r5, 1;
|
||||
mul.lo.s32 %r10, %r5, 3;
|
||||
mul.lo.s32 %r11, %r5, -3;
|
||||
add.s32 %r12, %r12, %r13;
|
||||
neg.s32 %r13, %r9;
|
||||
neg.s32 %r14, %r6;
|
||||
mov.u32 %r32, WARP_SZ;
|
||||
BB0_2: // %for_test28.i.preheader
|
||||
// =>This Loop Header: Depth=1
|
||||
// Child Loop BB0_9 Depth 2
|
||||
// Child Loop BB0_5 Depth 3
|
||||
mov.u32 %r15, %r16;
|
||||
setp.ge.s32 %p0, %r2, %r3;
|
||||
@%p0 bra BB0_10;
|
||||
// BB#3: // %for_test35.i.preheader.lr.ph
|
||||
// in Loop: Header=BB0_2 Depth=1
|
||||
setp.lt.s32 %p0, %r0, %r1;
|
||||
@%p0 bra BB0_4;
|
||||
bra.uni BB0_10;
|
||||
BB0_4: // in Loop: Header=BB0_2 Depth=1
|
||||
mul.lo.s32 %r16, %r15, %r5;
|
||||
mov.u32 %r17, %r2;
|
||||
BB0_9: // %for_loop37.i.lr.ph.us
|
||||
// Parent Loop BB0_2 Depth=1
|
||||
// => This Loop Header: Depth=2
|
||||
// Child Loop BB0_5 Depth 3
|
||||
mad.lo.s32 %r18, %r17, %r4, %r16;
|
||||
add.s32 %r19, %r18, %r4;
|
||||
add.s32 %r20, %r18, %r6;
|
||||
sub.s32 %r21, %r18, %r4;
|
||||
add.s32 %r22, %r18, %r7;
|
||||
add.s32 %r23, %r18, %r14;
|
||||
add.s32 %r24, %r18, %r5;
|
||||
add.s32 %r25, %r18, %r8;
|
||||
add.s32 %r26, %r18, %r9;
|
||||
sub.s32 %r27, %r18, %r5;
|
||||
add.s32 %r28, %r18, %r10;
|
||||
add.s32 %r29, %r18, %r13;
|
||||
add.s32 %r30, %r18, %r11;
|
||||
mov.u32 %r31, %r0;
|
||||
BB0_5: // %for_loop37.i.us
|
||||
// Parent Loop BB0_2 Depth=1
|
||||
// Parent Loop BB0_9 Depth=2
|
||||
// => This Inner Loop Header: Depth=3
|
||||
mov.u32 %r33, %tid.x;
|
||||
add.s32 %r34, %r32, -1;
|
||||
and.b32 %r33, %r34, %r33;
|
||||
add.s32 %r33, %r33, %r31;
|
||||
setp.ge.s32 %p0, %r33, %r1;
|
||||
@%p0 bra BB0_7;
|
||||
// BB#6: // %pl_dolane.i.us
|
||||
// in Loop: Header=BB0_5 Depth=3
|
||||
add.s32 %r34, %r18, %r33;
|
||||
shl.b32 %r34, %r34, 3;
|
||||
add.s32 %r35, %r34, -8;
|
||||
cvt.s64.s32 %rl3, %r35;
|
||||
add.s64 %rl3, %rl3, %rl1;
|
||||
ld.f64 %fl4, [%rl3];
|
||||
add.s32 %r35, %r34, 8;
|
||||
cvt.s64.s32 %rl3, %r35;
|
||||
add.s64 %rl3, %rl3, %rl1;
|
||||
ld.f64 %fl5, [%rl3];
|
||||
add.s32 %r35, %r34, -16;
|
||||
cvt.s64.s32 %rl3, %r35;
|
||||
add.s64 %rl3, %rl3, %rl1;
|
||||
ld.f64 %fl6, [%rl3];
|
||||
add.s32 %r35, %r34, 16;
|
||||
cvt.s64.s32 %rl3, %r35;
|
||||
add.s64 %rl3, %rl3, %rl1;
|
||||
ld.f64 %fl9, [%rl3];
|
||||
add.s32 %r35, %r19, %r33;
|
||||
shl.b32 %r35, %r35, 3;
|
||||
cvt.s64.s32 %rl3, %r35;
|
||||
add.s64 %rl3, %rl3, %rl1;
|
||||
ld.f64 %fl8, [%rl3];
|
||||
add.s32 %r35, %r34, -24;
|
||||
cvt.s64.s32 %rl3, %r35;
|
||||
add.s64 %rl3, %rl3, %rl1;
|
||||
ld.f64 %fl7, [%rl3];
|
||||
add.s32 %r35, %r34, 24;
|
||||
cvt.s64.s32 %rl3, %r35;
|
||||
add.s64 %rl3, %rl3, %rl1;
|
||||
ld.f64 %fl10, [%rl3];
|
||||
add.s32 %r35, %r20, %r33;
|
||||
shl.b32 %r35, %r35, 3;
|
||||
cvt.s64.s32 %rl3, %r35;
|
||||
add.s64 %rl3, %rl3, %rl1;
|
||||
ld.f64 %fl13, [%rl3];
|
||||
add.s32 %r35, %r21, %r33;
|
||||
shl.b32 %r35, %r35, 3;
|
||||
cvt.s64.s32 %rl3, %r35;
|
||||
add.s64 %rl3, %rl3, %rl1;
|
||||
ld.f64 %fl12, [%rl3];
|
||||
add.s32 %r35, %r22, %r33;
|
||||
shl.b32 %r35, %r35, 3;
|
||||
cvt.s64.s32 %rl3, %r35;
|
||||
add.s64 %rl3, %rl3, %rl1;
|
||||
ld.f64 %fl11, [%rl3];
|
||||
add.s32 %r35, %r23, %r33;
|
||||
shl.b32 %r35, %r35, 3;
|
||||
cvt.s64.s32 %rl3, %r35;
|
||||
add.s64 %rl3, %rl3, %rl1;
|
||||
ld.f64 %fl16, [%rl3];
|
||||
add.s32 %r35, %r24, %r33;
|
||||
shl.b32 %r35, %r35, 3;
|
||||
cvt.s64.s32 %rl3, %r35;
|
||||
add.s64 %rl3, %rl3, %rl1;
|
||||
ld.f64 %fl15, [%rl3];
|
||||
add.s32 %r35, %r25, %r33;
|
||||
shl.b32 %r35, %r35, 3;
|
||||
cvt.s64.s32 %rl3, %r35;
|
||||
add.s64 %rl3, %rl3, %rl1;
|
||||
ld.f64 %fl14, [%rl3];
|
||||
add.s32 %r35, %r26, %r33;
|
||||
shl.b32 %r35, %r35, 3;
|
||||
cvt.s64.s32 %rl3, %r35;
|
||||
add.s64 %rl3, %rl3, %rl1;
|
||||
ld.f64 %fl19, [%rl3];
|
||||
add.s32 %r35, %r27, %r33;
|
||||
shl.b32 %r35, %r35, 3;
|
||||
cvt.s64.s32 %rl3, %r35;
|
||||
add.s64 %rl3, %rl3, %rl1;
|
||||
ld.f64 %fl18, [%rl3];
|
||||
add.s32 %r35, %r28, %r33;
|
||||
shl.b32 %r35, %r35, 3;
|
||||
cvt.s64.s32 %rl3, %r35;
|
||||
add.s64 %rl3, %rl3, %rl1;
|
||||
ld.f64 %fl17, [%rl3];
|
||||
add.s32 %r35, %r29, %r33;
|
||||
shl.b32 %r35, %r35, 3;
|
||||
cvt.s64.s32 %rl3, %r35;
|
||||
add.s64 %rl3, %rl3, %rl1;
|
||||
ld.f64 %fl24, [%rl3];
|
||||
cvt.s64.s32 %rl4, %r34;
|
||||
add.s64 %rl3, %rl4, %rl1;
|
||||
ld.f64 %fl21, [%rl3];
|
||||
add.s32 %r33, %r30, %r33;
|
||||
shl.b32 %r33, %r33, 3;
|
||||
cvt.s64.s32 %rl3, %r33;
|
||||
add.s64 %rl3, %rl3, %rl1;
|
||||
ld.f64 %fl20, [%rl3];
|
||||
add.s64 %rl3, %rl4, %rl2;
|
||||
ld.f64 %fl23, [%rl3];
|
||||
add.s64 %rl4, %rl4, %rl0;
|
||||
ld.f64 %fl22, [%rl4];
|
||||
add.f64 %fl25, %fl21, %fl21;
|
||||
sub.f64 %fl23, %fl25, %fl23;
|
||||
add.f64 %fl6, %fl6, %fl9;
|
||||
add.f64 %fl6, %fl6, %fl13;
|
||||
add.f64 %fl6, %fl6, %fl16;
|
||||
add.f64 %fl6, %fl6, %fl19;
|
||||
add.f64 %fl6, %fl6, %fl24;
|
||||
add.f64 %fl4, %fl4, %fl5;
|
||||
add.f64 %fl4, %fl4, %fl8;
|
||||
add.f64 %fl4, %fl4, %fl12;
|
||||
add.f64 %fl4, %fl4, %fl15;
|
||||
add.f64 %fl4, %fl4, %fl18;
|
||||
mul.f64 %fl5, %fl0, %fl21;
|
||||
fma.rn.f64 %fl4, %fl1, %fl4, %fl5;
|
||||
fma.rn.f64 %fl4, %fl2, %fl6, %fl4;
|
||||
add.f64 %fl5, %fl7, %fl10;
|
||||
add.f64 %fl5, %fl5, %fl11;
|
||||
add.f64 %fl5, %fl5, %fl14;
|
||||
add.f64 %fl5, %fl5, %fl17;
|
||||
add.f64 %fl5, %fl5, %fl20;
|
||||
fma.rn.f64 %fl4, %fl3, %fl5, %fl4;
|
||||
fma.rn.f64 %fl4, %fl4, %fl22, %fl23;
|
||||
st.f64 [%rl3], %fl4;
|
||||
BB0_7: // %safe_if_after_true.i.us
|
||||
// in Loop: Header=BB0_5 Depth=3
|
||||
add.s32 %r31, %r32, %r31;
|
||||
setp.lt.s32 %p0, %r31, %r1;
|
||||
@%p0 bra BB0_5;
|
||||
// BB#8: // %for_exit38.i.us
|
||||
// in Loop: Header=BB0_9 Depth=2
|
||||
add.s32 %r17, %r17, 1;
|
||||
setp.eq.s32 %p0, %r17, %r3;
|
||||
@%p0 bra BB0_10;
|
||||
bra.uni BB0_9;
|
||||
BB0_10: // %for_exit31.i
|
||||
// in Loop: Header=BB0_2 Depth=1
|
||||
add.s32 %r16, %r15, 1;
|
||||
setp.ne.s32 %p0, %r15, %r12;
|
||||
@%p0 bra BB0_2;
|
||||
BB0_11: // %stencil_step___uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_.exit
|
||||
ret;
|
||||
}
|
||||
|
||||
@@ -1,317 +0,0 @@
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define _CRT_SECURE_NO_WARNINGS
|
||||
#define NOMINMAX
|
||||
#pragma warning (disable: 4244)
|
||||
#pragma warning (disable: 4305)
|
||||
#endif
|
||||
|
||||
#include <stdio.h>
|
||||
#include <algorithm>
|
||||
#include <math.h>
|
||||
#include "../timing.h"
|
||||
#include "stencil_ispc.h"
|
||||
using namespace ispc;
|
||||
|
||||
#include <cassert>
|
||||
#include <iostream>
|
||||
#include <cuda.h>
|
||||
#include "drvapi_error_string.h"
|
||||
|
||||
#define checkCudaErrors(err) __checkCudaErrors (err, __FILE__, __LINE__)
|
||||
// These are the inline versions for all of the SDK helper functions
|
||||
void __checkCudaErrors(CUresult err, const char *file, const int line) {
|
||||
if(CUDA_SUCCESS != err) {
|
||||
std::cerr << "checkCudeErrors() Driver API error = " << err << "\""
|
||||
<< getCudaDrvErrorString(err) << "\" from file <" << file
|
||||
<< ", line " << line << "\n";
|
||||
exit(-1);
|
||||
}
|
||||
}
|
||||
|
||||
/**********************/
|
||||
/* Basic CUDriver API */
|
||||
CUcontext context;
|
||||
|
||||
void createContext(const int deviceId = 0)
|
||||
{
|
||||
CUdevice device;
|
||||
int devCount;
|
||||
checkCudaErrors(cuInit(0));
|
||||
checkCudaErrors(cuDeviceGetCount(&devCount));
|
||||
assert(devCount > 0);
|
||||
checkCudaErrors(cuDeviceGet(&device, deviceId < devCount ? deviceId : 0));
|
||||
|
||||
char name[128];
|
||||
checkCudaErrors(cuDeviceGetName(name, 128, device));
|
||||
std::cout << "Using CUDA Device [0]: " << name << "\n";
|
||||
|
||||
int devMajor, devMinor;
|
||||
checkCudaErrors(cuDeviceComputeCapability(&devMajor, &devMinor, device));
|
||||
std::cout << "Device Compute Capability: "
|
||||
<< devMajor << "." << devMinor << "\n";
|
||||
if (devMajor < 2) {
|
||||
std::cerr << "ERROR: Device 0 is not SM 2.0 or greater\n";
|
||||
exit(1);
|
||||
}
|
||||
|
||||
// Create driver context
|
||||
checkCudaErrors(cuCtxCreate(&context, 0, device));
|
||||
}
|
||||
void destroyContext()
|
||||
{
|
||||
checkCudaErrors(cuCtxDestroy(context));
|
||||
}
|
||||
|
||||
CUmodule loadModule(const char * module)
|
||||
{
|
||||
CUmodule cudaModule;
|
||||
checkCudaErrors(cuModuleLoadData(&cudaModule, module));
|
||||
return cudaModule;
|
||||
}
|
||||
void unloadModule(CUmodule &cudaModule)
|
||||
{
|
||||
checkCudaErrors(cuModuleUnload(cudaModule));
|
||||
}
|
||||
|
||||
CUfunction getFunction(CUmodule &cudaModule, const char * function)
|
||||
{
|
||||
CUfunction cudaFunction;
|
||||
checkCudaErrors(cuModuleGetFunction(&cudaFunction, cudaModule, function));
|
||||
return cudaFunction;
|
||||
}
|
||||
|
||||
CUdeviceptr deviceMalloc(const size_t size)
|
||||
{
|
||||
CUdeviceptr d_buf;
|
||||
checkCudaErrors(cuMemAlloc(&d_buf, size));
|
||||
return d_buf;
|
||||
}
|
||||
void deviceFree(CUdeviceptr d_buf)
|
||||
{
|
||||
checkCudaErrors(cuMemFree(d_buf));
|
||||
}
|
||||
void memcpyD2H(void * h_buf, CUdeviceptr d_buf, const size_t size)
|
||||
{
|
||||
checkCudaErrors(cuMemcpyDtoH(h_buf, d_buf, size));
|
||||
}
|
||||
void memcpyH2D(CUdeviceptr d_buf, void * h_buf, const size_t size)
|
||||
{
|
||||
checkCudaErrors(cuMemcpyHtoD(d_buf, h_buf, size));
|
||||
}
|
||||
#define deviceLaunch(func,nbx,nby,nbz,params) \
|
||||
checkCudaErrors( \
|
||||
cuLaunchKernel( \
|
||||
(func), \
|
||||
(nbx), (nby), (nbz), \
|
||||
32, 1, 1, \
|
||||
0, NULL, (params), NULL \
|
||||
));
|
||||
|
||||
typedef CUdeviceptr devicePtr;
|
||||
|
||||
|
||||
/**************/
|
||||
|
||||
extern "C"
|
||||
{
|
||||
|
||||
void *CUDAAlloc(void **handlePtr, int64_t size, int32_t alignment)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
void CUDALaunch(
|
||||
void **handlePtr,
|
||||
const char * module_name,
|
||||
const char * module,
|
||||
const char * func_name,
|
||||
void **func_args,
|
||||
int countx, int county, int countz)
|
||||
{
|
||||
assert(module_name != NULL);
|
||||
assert(module != NULL);
|
||||
assert(func_name != NULL);
|
||||
assert(func_args != NULL);
|
||||
CUmodule cudaModule = loadModule(module);
|
||||
CUfunction cudaFunction = getFunction(cudaModule, func_name);
|
||||
deviceLaunch(cudaFunction, countx, county, countz, func_args);
|
||||
unloadModule(cudaModule);
|
||||
}
|
||||
void CUDASync(void *handle)
|
||||
{
|
||||
checkCudaErrors(cuStreamSynchronize(0));
|
||||
}
|
||||
void ISPCSync(void *handle)
|
||||
{
|
||||
checkCudaErrors(cuStreamSynchronize(0));
|
||||
}
|
||||
void CUDAFree(void *handle)
|
||||
{
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
extern void loop_stencil_serial(int t0, int t1, int x0, int x1,
|
||||
int y0, int y1, int z0, int z1,
|
||||
int Nx, int Ny, int Nz,
|
||||
const double coef[5],
|
||||
const double vsq[],
|
||||
double Aeven[], double Aodd[]);
|
||||
|
||||
|
||||
void InitData(int Nx, int Ny, int Nz, double *A[2], double *vsq) {
|
||||
int offset = 0;
|
||||
for (int z = 0; z < Nz; ++z)
|
||||
for (int y = 0; y < Ny; ++y)
|
||||
for (int x = 0; x < Nx; ++x, ++offset) {
|
||||
A[0][offset] = (x < Nx / 2) ? x / double(Nx) : y / double(Ny);
|
||||
A[1][offset] = 0;
|
||||
vsq[offset] = x*y*z / double(Nx * Ny * Nz);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
int main() {
|
||||
int Nx = 256, Ny = 256, Nz = 256;
|
||||
int width = 4;
|
||||
double *Aserial[2], *Aispc[2];
|
||||
Aserial[0] = new double [Nx * Ny * Nz];
|
||||
Aserial[1] = new double [Nx * Ny * Nz];
|
||||
Aispc[0] = new double [Nx * Ny * Nz];
|
||||
Aispc[1] = new double [Nx * Ny * Nz];
|
||||
double *vsq = new double [Nx * Ny * Nz];
|
||||
|
||||
double coeff[4] = { 0.5, -.25, .125, -.0625 };
|
||||
|
||||
/*******************/
|
||||
createContext();
|
||||
/*******************/
|
||||
|
||||
const size_t bufsize = sizeof(double)*Nx*Ny*Nz;
|
||||
devicePtr d_Aispc0 = deviceMalloc(bufsize);
|
||||
devicePtr d_Aispc1 = deviceMalloc(bufsize);
|
||||
devicePtr d_vsq = deviceMalloc(bufsize);
|
||||
devicePtr d_coeff = deviceMalloc(4*sizeof(double));
|
||||
|
||||
|
||||
InitData(Nx, Ny, Nz, Aispc, vsq);
|
||||
|
||||
//
|
||||
// Compute the image using the ispc implementation on one core; report
|
||||
// the minimum time of three runs.
|
||||
//
|
||||
double minTimeISPC = 1e30;
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
reset_and_start_timer();
|
||||
loop_stencil_ispc(0, 6, width, Nx - width, width, Ny - width,
|
||||
width, Nz - width, Nx, Ny, Nz, coeff, vsq,
|
||||
Aispc[0], Aispc[1]);
|
||||
double dt = get_elapsed_mcycles();
|
||||
minTimeISPC = std::min(minTimeISPC, dt);
|
||||
}
|
||||
|
||||
printf("[stencil ispc 1 core]:\t\t[%.3f] million cycles\n", minTimeISPC);
|
||||
|
||||
InitData(Nx, Ny, Nz, Aispc, vsq);
|
||||
|
||||
memcpyH2D(d_Aispc0, Aispc[0], bufsize);
|
||||
memcpyH2D(d_Aispc1, Aispc[1], bufsize);
|
||||
memcpyH2D(d_vsq, vsq, bufsize);
|
||||
memcpyH2D(d_coeff, coeff, 4*sizeof(double));
|
||||
//
|
||||
// Compute the image using the ispc implementation with tasks; report
|
||||
// the minimum time of three runs.
|
||||
//
|
||||
double minTimeISPCTasks = 1e30;
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
reset_and_start_timer();
|
||||
loop_stencil_ispc_tasks(0, 6, width, Nx - width, width, Ny - width,
|
||||
width, Nz - width, Nx, Ny, Nz, (double*)d_coeff, (double*)d_vsq,
|
||||
(double*)d_Aispc0, (double*)d_Aispc1);
|
||||
double dt = get_elapsed_mcycles();
|
||||
minTimeISPCTasks = std::min(minTimeISPCTasks, dt);
|
||||
}
|
||||
memcpyD2H(Aispc[1], d_Aispc1, bufsize);
|
||||
//memcpyD2H(Aispc[1], d_vsq, bufsize);
|
||||
|
||||
printf("[stencil ispc + tasks]:\t\t[%.3f] million cycles\n", minTimeISPCTasks);
|
||||
|
||||
InitData(Nx, Ny, Nz, Aserial, vsq);
|
||||
|
||||
//
|
||||
// And run the serial implementation 3 times, again reporting the
|
||||
// minimum time.
|
||||
//
|
||||
double minTimeSerial = 1e30;
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
reset_and_start_timer();
|
||||
loop_stencil_serial(0, 6, width, Nx-width, width, Ny - width,
|
||||
width, Nz - width, Nx, Ny, Nz, coeff, vsq,
|
||||
Aserial[0], Aserial[1]);
|
||||
double dt = get_elapsed_mcycles();
|
||||
minTimeSerial = std::min(minTimeSerial, dt);
|
||||
}
|
||||
|
||||
printf("[stencil serial]:\t\t[%.3f] million cycles\n", minTimeSerial);
|
||||
|
||||
printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n",
|
||||
minTimeSerial / minTimeISPC, minTimeSerial / minTimeISPCTasks);
|
||||
|
||||
// Check for agreement
|
||||
int offset = 0;
|
||||
int nerr = 0;
|
||||
for (int z = 0; z < Nz; ++z)
|
||||
for (int y = 0; y < Ny; ++y)
|
||||
for (int x = 0; x < Nx; ++x, ++offset) {
|
||||
|
||||
double error = fabsf((Aserial[1][offset] - Aispc[1][offset]) /
|
||||
Aserial[1][offset]);
|
||||
if (error > 1e-3)
|
||||
{
|
||||
if (nerr < 100)
|
||||
printf("Error @ (%d,%d,%d): ispc = %g, serial = %g error= %g\n",
|
||||
x, y, z, Aispc[1][offset], Aserial[1][offset], error);
|
||||
nerr++;
|
||||
}
|
||||
}
|
||||
|
||||
fprintf(stderr, " nerr= %d frac= %g \n", nerr, 1.0*nerr/(1.0*Nx*Ny*Nz));
|
||||
|
||||
/*******************/
|
||||
destroyContext();
|
||||
/*******************/
|
||||
|
||||
return 0;
|
||||
}
|
||||
Binary file not shown.
@@ -1,87 +0,0 @@
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
|
||||
static void
|
||||
stencil_step(int x0, int x1,
|
||||
int y0, int y1,
|
||||
int z0, int z1,
|
||||
int Nx, int Ny, int Nz,
|
||||
const float coef[4], const float vsq[],
|
||||
const float Ain[], float Aout[]) {
|
||||
int Nxy = Nx * Ny;
|
||||
|
||||
#pragma omp parallel for
|
||||
for (int z = z0; z < z1; ++z) {
|
||||
for (int y = y0; y < y1; ++y) {
|
||||
for (int x = x0; x < x1; ++x) {
|
||||
int index = (z * Nxy) + (y * Nx) + x;
|
||||
#define A_cur(x, y, z) Ain[index + (x) + ((y) * Nx) + ((z) * Nxy)]
|
||||
#define A_next(x, y, z) Aout[index + (x) + ((y) * Nx) + ((z) * Nxy)]
|
||||
float div = coef[0] * A_cur(0, 0, 0) +
|
||||
coef[1] * (A_cur(+1, 0, 0) + A_cur(-1, 0, 0) +
|
||||
A_cur(0, +1, 0) + A_cur(0, -1, 0) +
|
||||
A_cur(0, 0, +1) + A_cur(0, 0, -1)) +
|
||||
coef[2] * (A_cur(+2, 0, 0) + A_cur(-2, 0, 0) +
|
||||
A_cur(0, +2, 0) + A_cur(0, -2, 0) +
|
||||
A_cur(0, 0, +2) + A_cur(0, 0, -2)) +
|
||||
coef[3] * (A_cur(+3, 0, 0) + A_cur(-3, 0, 0) +
|
||||
A_cur(0, +3, 0) + A_cur(0, -3, 0) +
|
||||
A_cur(0, 0, +3) + A_cur(0, 0, -3));
|
||||
|
||||
A_next(0, 0, 0) = 2 * A_cur(0, 0, 0) - A_next(0, 0, 0) +
|
||||
vsq[index] * div;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void loop_stencil_parallel(int t0, int t1,
|
||||
int x0, int x1,
|
||||
int y0, int y1,
|
||||
int z0, int z1,
|
||||
int Nx, int Ny, int Nz,
|
||||
const float coef[4],
|
||||
const float vsq[],
|
||||
float Aeven[], float Aodd[])
|
||||
{
|
||||
for (int t = t0; t < t1; ++t) {
|
||||
if ((t & 1) == 0)
|
||||
stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq,
|
||||
Aeven, Aodd);
|
||||
else
|
||||
stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq,
|
||||
Aodd, Aeven);
|
||||
}
|
||||
}
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Reference in New Issue
Block a user