diff --git a/examples/aobench/Makefile b/examples/aobench/Makefile index 28f0f051..c8122c07 100644 --- a/examples/aobench/Makefile +++ b/examples/aobench/Makefile @@ -2,7 +2,7 @@ EXAMPLE=ao CPP_SRC=ao.cpp ao_serial.cpp ISPC_SRC=ao.ispc -ISPC_IA_TARGETS=avx1-i32x8 +ISPC_IA_TARGETS=sse2-i32x4,sse4-i32x4,avx1-i32x8,avx2-i32x8 ISPC_ARM_TARGETS=neon include ../common.mk diff --git a/examples/deferred/deferred_shading b/examples/deferred/deferred_shading deleted file mode 100755 index 8cecdec6..00000000 Binary files a/examples/deferred/deferred_shading and /dev/null differ diff --git a/examples/deferred/main_cu.cpp b/examples/deferred/main_cu.cpp deleted file mode 100644 index 4f2be879..00000000 --- a/examples/deferred/main_cu.cpp +++ /dev/null @@ -1,139 +0,0 @@ -/* - Copyright (c) 2011, Intel Corporation - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of Intel Corporation nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS - IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER - OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*/ - -#ifdef _MSC_VER -#define ISPC_IS_WINDOWS -#define NOMINMAX -#elif defined(__linux__) -#define ISPC_IS_LINUX -#elif defined(__APPLE__) -#define ISPC_IS_APPLE -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#ifdef ISPC_IS_WINDOWS - #define WIN32_LEAN_AND_MEAN - #include -#endif -#include "deferred.h" -#include "kernels_ispc.h" -#include "../timing.h" - -/////////////////////////////////////////////////////////////////////////// - -int main(int argc, char** argv) { - if (argc != 2) { - printf("usage: deferred_shading \n"); - return 1; - } - - InputData *input = CreateInputDataFromFile(argv[1]); - if (!input) { - printf("Failed to load input file \"%s\"!\n", argv[1]); - return 1; - } - - Framebuffer framebuffer(input->header.framebufferWidth, - input->header.framebufferHeight); - - InitDynamicC(input); -#ifdef __cilk - InitDynamicCilk(input); -#endif // __cilk - - int nframes = 5; - double ispcCycles = 1e30; - for (int i = 0; i < 5; ++i) { - framebuffer.clear(); - reset_and_start_timer(); - for (int j = 0; j < nframes; ++j) - ispc::RenderStatic(input->header, input->arrays, - VISUALIZE_LIGHT_COUNT, - framebuffer.r, framebuffer.g, framebuffer.b); - double mcycles = get_elapsed_mcycles() / nframes; - ispcCycles = std::min(ispcCycles, mcycles); - } - printf("[ispc static + tasks]:\t\t[%.3f] million cycles to render " - "%d x %d image\n", ispcCycles, - input->header.framebufferWidth, input->header.framebufferHeight); - WriteFrame("deferred-ispc-static.ppm", input, framebuffer); - -#ifdef __cilk - double dynamicCilkCycles = 1e30; - for (int i = 0; i < 5; ++i) { - framebuffer.clear(); - reset_and_start_timer(); - for (int j = 0; j < nframes; ++j) - DispatchDynamicCilk(input, &framebuffer); - double mcycles = get_elapsed_mcycles() / nframes; - dynamicCilkCycles = std::min(dynamicCilkCycles, mcycles); - } - printf("[ispc + Cilk dynamic]:\t\t[%.3f] million cycles to render image\n", - dynamicCilkCycles); - WriteFrame("deferred-ispc-dynamic.ppm", input, framebuffer); -#endif // __cilk - - double serialCycles = 1e30; - for (int i = 0; i < 5; ++i) { - framebuffer.clear(); - reset_and_start_timer(); - for (int j = 0; j < nframes; ++j) - DispatchDynamicC(input, &framebuffer); - double mcycles = get_elapsed_mcycles() / nframes; - serialCycles = std::min(serialCycles, mcycles); - } - printf("[C++ serial dynamic, 1 core]:\t[%.3f] million cycles to render image\n", - serialCycles); - WriteFrame("deferred-serial-dynamic.ppm", input, framebuffer); - -#ifdef __cilk - printf("\t\t\t\t(%.2fx speedup from static ISPC, %.2fx from Cilk+ISPC)\n", - serialCycles/ispcCycles, serialCycles/dynamicCilkCycles); -#else - printf("\t\t\t\t(%.2fx speedup from ISPC + tasks)\n", serialCycles/ispcCycles); -#endif // __cilk - - DeleteInputData(input); - - return 0; -} diff --git a/examples/mandelbrot/avx.out b/examples/mandelbrot/avx.out deleted file mode 100755 index 5434d8a8..00000000 Binary files a/examples/mandelbrot/avx.out and /dev/null differ diff --git a/examples/mandelbrot/avx1.out b/examples/mandelbrot/avx1.out deleted file mode 100755 index 59cfd29f..00000000 Binary files a/examples/mandelbrot/avx1.out and /dev/null differ diff --git a/examples/mandelbrot/out.o b/examples/mandelbrot/out.o deleted file mode 100644 index 986ac38c..00000000 Binary files a/examples/mandelbrot/out.o and /dev/null differ diff --git a/examples/mandelbrot/out.ptx b/examples/mandelbrot/out.ptx deleted file mode 100644 index d14a7ee8..00000000 --- a/examples/mandelbrot/out.ptx +++ /dev/null @@ -1,843 +0,0 @@ -// -// Generated by LLVM NVPTX Back-End -// - -.version 3.1 -.target sm_35, texmode_independent -.address_size 64 - - // .globl __vselect_i8 - // @__vselect_i8 -.func (.param .align 1 .b8 func_retval0[1]) __vselect_i8( - .param .align 1 .b8 __vselect_i8_param_0[1], - .param .align 1 .b8 __vselect_i8_param_1[1], - .param .align 4 .b8 __vselect_i8_param_2[4] -) -{ - .reg .pred %p<396>; - .reg .s16 %rc<396>; - .reg .s16 %rs<396>; - .reg .s32 %r<396>; - .reg .s64 %rl<396>; - .reg .f32 %f<396>; - .reg .f64 %fl<396>; - -// BB#0: - ld.param.u32 %r0, [__vselect_i8_param_2]; - setp.eq.s32 %p0, %r0, 0; - ld.param.u8 %rc0, [__vselect_i8_param_0]; - ld.param.u8 %rc1, [__vselect_i8_param_1]; - selp.b16 %rc0, %rc0, %rc1, %p0; - st.param.b8 [func_retval0+0], %rc0; - ret; -} - - // .globl __vselect_i16 -.func (.param .align 2 .b8 func_retval0[2]) __vselect_i16( - .param .align 2 .b8 __vselect_i16_param_0[2], - .param .align 2 .b8 __vselect_i16_param_1[2], - .param .align 4 .b8 __vselect_i16_param_2[4] -) // @__vselect_i16 -{ - .reg .pred %p<396>; - .reg .s16 %rc<396>; - .reg .s16 %rs<396>; - .reg .s32 %r<396>; - .reg .s64 %rl<396>; - .reg .f32 %f<396>; - .reg .f64 %fl<396>; - -// BB#0: - ld.param.u32 %r0, [__vselect_i16_param_2]; - setp.eq.s32 %p0, %r0, 0; - ld.param.u16 %rs0, [__vselect_i16_param_0]; - ld.param.u16 %rs1, [__vselect_i16_param_1]; - selp.b16 %rs0, %rs0, %rs1, %p0; - st.param.b16 [func_retval0+0], %rs0; - ret; -} - - // .globl __vselect_i64 -.func (.param .align 8 .b8 func_retval0[8]) __vselect_i64( - .param .align 8 .b8 __vselect_i64_param_0[8], - .param .align 8 .b8 __vselect_i64_param_1[8], - .param .align 4 .b8 __vselect_i64_param_2[4] -) // @__vselect_i64 -{ - .reg .pred %p<396>; - .reg .s16 %rc<396>; - .reg .s16 %rs<396>; - .reg .s32 %r<396>; - .reg .s64 %rl<396>; - .reg .f32 %f<396>; - .reg .f64 %fl<396>; - -// BB#0: - ld.param.u32 %r0, [__vselect_i64_param_2]; - setp.eq.s32 %p0, %r0, 0; - ld.param.u64 %rl0, [__vselect_i64_param_0]; - ld.param.u64 %rl1, [__vselect_i64_param_1]; - selp.b64 %rl0, %rl0, %rl1, %p0; - st.param.b64 [func_retval0+0], %rl0; - ret; -} - - // .globl __aos_to_soa4_float1 -.func __aos_to_soa4_float1( - .param .align 4 .b8 __aos_to_soa4_float1_param_0[4], - .param .align 4 .b8 __aos_to_soa4_float1_param_1[4], - .param .align 4 .b8 __aos_to_soa4_float1_param_2[4], - .param .align 4 .b8 __aos_to_soa4_float1_param_3[4], - .param .b64 __aos_to_soa4_float1_param_4, - .param .b64 __aos_to_soa4_float1_param_5, - .param .b64 __aos_to_soa4_float1_param_6, - .param .b64 __aos_to_soa4_float1_param_7 -) // @__aos_to_soa4_float1 -{ - .reg .pred %p<396>; - .reg .s16 %rc<396>; - .reg .s16 %rs<396>; - .reg .s32 %r<396>; - .reg .s64 %rl<396>; - .reg .f32 %f<396>; - .reg .f64 %fl<396>; - -// BB#0: - ld.param.u64 %rl0, [__aos_to_soa4_float1_param_4]; - ld.param.u64 %rl1, [__aos_to_soa4_float1_param_5]; - ld.param.u64 %rl2, [__aos_to_soa4_float1_param_6]; - ld.param.u64 %rl3, [__aos_to_soa4_float1_param_7]; - ld.param.f32 %f0, [__aos_to_soa4_float1_param_0]; - ld.param.f32 %f1, [__aos_to_soa4_float1_param_1]; - ld.param.f32 %f2, [__aos_to_soa4_float1_param_2]; - ld.param.f32 %f3, [__aos_to_soa4_float1_param_3]; - st.f32 [%rl0], %f0; - st.f32 [%rl1], %f1; - st.f32 [%rl2], %f2; - st.f32 [%rl3], %f3; - ret; -} - - // .globl __soa_to_aos4_float1 -.func __soa_to_aos4_float1( - .param .align 4 .b8 __soa_to_aos4_float1_param_0[4], - .param .align 4 .b8 __soa_to_aos4_float1_param_1[4], - .param .align 4 .b8 __soa_to_aos4_float1_param_2[4], - .param .align 4 .b8 __soa_to_aos4_float1_param_3[4], - .param .b64 __soa_to_aos4_float1_param_4, - .param .b64 __soa_to_aos4_float1_param_5, - .param .b64 __soa_to_aos4_float1_param_6, - .param .b64 __soa_to_aos4_float1_param_7 -) // @__soa_to_aos4_float1 -{ - .reg .pred %p<396>; - .reg .s16 %rc<396>; - .reg .s16 %rs<396>; - .reg .s32 %r<396>; - .reg .s64 %rl<396>; - .reg .f32 %f<396>; - .reg .f64 %fl<396>; - -// BB#0: - ld.param.u64 %rl0, [__soa_to_aos4_float1_param_4]; - ld.param.u64 %rl1, [__soa_to_aos4_float1_param_5]; - ld.param.u64 %rl2, [__soa_to_aos4_float1_param_6]; - ld.param.u64 %rl3, [__soa_to_aos4_float1_param_7]; - ld.param.f32 %f0, [__soa_to_aos4_float1_param_0]; - ld.param.f32 %f1, [__soa_to_aos4_float1_param_1]; - ld.param.f32 %f2, [__soa_to_aos4_float1_param_2]; - ld.param.f32 %f3, [__soa_to_aos4_float1_param_3]; - st.f32 [%rl0], %f0; - st.f32 [%rl1], %f1; - st.f32 [%rl2], %f2; - st.f32 [%rl3], %f3; - ret; -} - - // .globl __aos_to_soa3_float1 -.func __aos_to_soa3_float1( - .param .align 4 .b8 __aos_to_soa3_float1_param_0[4], - .param .align 4 .b8 __aos_to_soa3_float1_param_1[4], - .param .align 4 .b8 __aos_to_soa3_float1_param_2[4], - .param .b64 __aos_to_soa3_float1_param_3, - .param .b64 __aos_to_soa3_float1_param_4, - .param .b64 __aos_to_soa3_float1_param_5 -) // @__aos_to_soa3_float1 -{ - .reg .pred %p<396>; - .reg .s16 %rc<396>; - .reg .s16 %rs<396>; - .reg .s32 %r<396>; - .reg .s64 %rl<396>; - .reg .f32 %f<396>; - .reg .f64 %fl<396>; - -// BB#0: - ld.param.u64 %rl0, [__aos_to_soa3_float1_param_3]; - ld.param.u64 %rl1, [__aos_to_soa3_float1_param_4]; - ld.param.u64 %rl2, [__aos_to_soa3_float1_param_5]; - ld.param.f32 %f0, [__aos_to_soa3_float1_param_0]; - ld.param.f32 %f1, [__aos_to_soa3_float1_param_1]; - ld.param.f32 %f2, [__aos_to_soa3_float1_param_2]; - st.f32 [%rl0], %f0; - st.f32 [%rl1], %f1; - st.f32 [%rl2], %f2; - ret; -} - - // .globl __soa_to_aos3_float1 -.func __soa_to_aos3_float1( - .param .align 4 .b8 __soa_to_aos3_float1_param_0[4], - .param .align 4 .b8 __soa_to_aos3_float1_param_1[4], - .param .align 4 .b8 __soa_to_aos3_float1_param_2[4], - .param .b64 __soa_to_aos3_float1_param_3, - .param .b64 __soa_to_aos3_float1_param_4, - .param .b64 __soa_to_aos3_float1_param_5 -) // @__soa_to_aos3_float1 -{ - .reg .pred %p<396>; - .reg .s16 %rc<396>; - .reg .s16 %rs<396>; - .reg .s32 %r<396>; - .reg .s64 %rl<396>; - .reg .f32 %f<396>; - .reg .f64 %fl<396>; - -// BB#0: - ld.param.u64 %rl0, [__soa_to_aos3_float1_param_3]; - ld.param.u64 %rl1, [__soa_to_aos3_float1_param_4]; - ld.param.u64 %rl2, [__soa_to_aos3_float1_param_5]; - ld.param.f32 %f0, [__soa_to_aos3_float1_param_0]; - ld.param.f32 %f1, [__soa_to_aos3_float1_param_1]; - ld.param.f32 %f2, [__soa_to_aos3_float1_param_2]; - st.f32 [%rl0], %f0; - st.f32 [%rl1], %f1; - st.f32 [%rl2], %f2; - ret; -} - - // .globl __rsqrt_varying_double -.func (.param .align 8 .b8 func_retval0[8]) __rsqrt_varying_double( - .param .align 8 .b8 __rsqrt_varying_double_param_0[8] -) // @__rsqrt_varying_double -{ - .reg .pred %p<396>; - .reg .s16 %rc<396>; - .reg .s16 %rs<396>; - .reg .s32 %r<396>; - .reg .s64 %rl<396>; - .reg .f32 %f<396>; - .reg .f64 %fl<396>; - -// BB#0: - ld.param.f64 %fl0, [__rsqrt_varying_double_param_0]; - rsqrt.approx.f64 %fl0, %fl0; - st.param.f64 [func_retval0+0], %fl0; - ret; -} - - // .globl mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E_ -.func mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E_( - .param .b32 mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_0, - .param .b32 mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_1, - .param .b32 mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_2, - .param .b32 mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_3, - .param .b32 mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_4, - .param .b32 mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_5, - .param .b32 mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_6, - .param .b64 mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_7, - .param .align 4 .b8 mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_8[4] -) // @mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E_ -{ - .reg .pred %p<396>; - .reg .s16 %rc<396>; - .reg .s16 %rs<396>; - .reg .s32 %r<396>; - .reg .s64 %rl<396>; - .reg .f32 %f<396>; - .reg .f64 %fl<396>; - -// BB#0: // %allocas - ld.param.f32 %f0, [mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_0]; - ld.param.f32 %f1, [mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_1]; - ld.param.f32 %f3, [mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_2]; - ld.param.f32 %f2, [mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_3]; - ld.param.u32 %r0, [mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_4]; - ld.param.u32 %r1, [mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_5]; - ld.param.u32 %r2, [mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_6]; - ld.param.u64 %rl0, [mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_7]; - ld.param.u32 %r3, [mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_8]; - setp.lt.s32 %p0, %r3, 0; - sub.f32 %f3, %f3, %f0; - cvt.rn.f32.s32 %f4, %r0; - sub.f32 %f2, %f2, %f1; - cvt.rn.f32.s32 %f5, %r1; - div.rn.f32 %f2, %f2, %f5; - div.rn.f32 %f3, %f3, %f4; - @%p0 bra BB8_9; -// BB#1: // %for_test110.preheader - setp.lt.s32 %p0, %r1, 1; - @%p0 bra BB8_45; -// BB#2: // %outer_not_in_extras140.preheader.lr.ph - setp.gt.s32 %p0, %r2, 0; - mov.u32 %r3, 0; - selp.b32 %r4, -1, 0, %p0; - shl.b32 %r5, %r0, 2; - mov.u32 %r6, %r3; -BB8_3: // %outer_not_in_extras140.preheader - // =>This Loop Header: Depth=1 - // Child Loop BB8_41 Depth 2 - // Child Loop BB8_43 Depth 2 - // Child Loop BB8_38 Depth 2 - // Child Loop BB8_33 Depth 3 - setp.lt.s32 %p0, %r0, 1; - @%p0 bra BB8_4; -// BB#31: // %foreach_full_body120.lr.ph - // in Loop: Header=BB8_3 Depth=1 - setp.lt.s32 %p0, %r4, 0; - mov.u32 %r7, %r0; - mov.u32 %r8, %r3; - @%p0 bra BB8_32; - bra.uni BB8_43; -BB8_32: // in Loop: Header=BB8_3 Depth=1 - mov.u64 %rl1, 0; - cvt.rn.f32.s32 %f4, %r6; - fma.rn.f32 %f4, %f2, %f4, %f1; - mul.lo.s32 %r7, %r6, %r0; -BB8_38: // %for_loop.i380.lr.ph.us - // Parent Loop BB8_3 Depth=1 - // => This Loop Header: Depth=2 - // Child Loop BB8_33 Depth 3 - cvt.u32.u64 %r8, %rl1; - cvt.rn.f32.s32 %f5, %r8; - fma.rn.f32 %f5, %f3, %f5, %f0; - mov.u32 %r10, 0; - mov.u32 %r12, %r4; - mov.u32 %r11, %r10; - mov.u32 %r9, %r10; - mov.f32 %f7, %f5; - mov.f32 %f6, %f4; -BB8_33: // %for_loop.i380.us - // Parent Loop BB8_3 Depth=1 - // Parent Loop BB8_38 Depth=2 - // => This Inner Loop Header: Depth=3 - mul.f32 %f8, %f7, %f7; - fma.rn.f32 %f9, %f6, %f6, %f8; - setp.gtu.f32 %p0, %f9, 0f40800000; - selp.b32 %r13, %r12, 0, %p0; - or.b32 %r11, %r13, %r11; - shr.u32 %r13, %r11, 31; - shr.u32 %r14, %r12, 31; - setp.eq.s32 %p0, %r13, %r14; - @%p0 bra BB8_34; - bra.uni BB8_35; -BB8_34: // in Loop: Header=BB8_33 Depth=3 - mov.u32 %r12, %r10; - bra.uni BB8_36; -BB8_35: // %not_all_continued_or_breaked.i394.us - // in Loop: Header=BB8_33 Depth=3 - mul.f32 %f9, %f6, %f6; - not.b32 %r13, %r11; - and.b32 %r12, %r12, %r13; - sub.f32 %f8, %f8, %f9; - add.f32 %f8, %f5, %f8; - add.f32 %f7, %f7, %f7; - fma.rn.f32 %f6, %f6, %f7, %f4; - mov.f32 %f7, %f8; -BB8_36: // %for_step.i363.us - // in Loop: Header=BB8_33 Depth=3 - setp.ne.s32 %p0, %r12, 0; - selp.u32 %r13, 1, 0, %p0; - add.s32 %r9, %r9, %r13; - setp.lt.s32 %p0, %r9, %r2; - selp.b32 %r12, %r12, 0, %p0; - setp.lt.s32 %p0, %r12, 0; - @%p0 bra BB8_33; -// BB#37: // %mandel___vyfvyfvyi.exit395.us - // in Loop: Header=BB8_38 Depth=2 - add.s32 %r8, %r8, %r7; - shl.b32 %r8, %r8, 2; - cvt.s64.s32 %rl2, %r8; - add.s64 %rl2, %rl2, %rl0; - st.u32 [%rl2], %r9; - add.s64 %rl1, %rl1, 1; - cvt.u32.u64 %r8, %rl1; - setp.eq.s32 %p0, %r8, %r0; - @%p0 bra BB8_44; - bra.uni BB8_38; -BB8_43: // %mandel___vyfvyfvyi.exit395 - // Parent Loop BB8_3 Depth=1 - // => This Inner Loop Header: Depth=2 - cvt.s64.s32 %rl1, %r8; - add.s64 %rl1, %rl1, %rl0; - mov.u32 %r9, 0; - st.u32 [%rl1], %r9; - add.s32 %r8, %r8, 4; - add.s32 %r7, %r7, -1; - setp.eq.s32 %p0, %r7, 0; - @%p0 bra BB8_44; - bra.uni BB8_43; -BB8_4: // %partial_inner_all_outer156 - // in Loop: Header=BB8_3 Depth=1 - @%p0 bra BB8_44; -// BB#5: // %partial_inner_only197 - // in Loop: Header=BB8_3 Depth=1 - setp.gt.s32 %p0, %r0, 0; - mov.u32 %r8, 0; - fma.rn.f32 %f4, %f3, 0f00000000, %f0; - cvt.rn.f32.s32 %f5, %r6; - fma.rn.f32 %f5, %f2, %f5, %f1; - selp.b32 %r7, %r4, 0, %p0; - setp.lt.s32 %p1, %r7, 0; - mov.u32 %r10, %r4; - mov.u32 %r9, %r8; - mov.u32 %r7, %r8; - mov.f32 %f7, %f4; - mov.f32 %f6, %f5; - @%p1 bra BB8_41; - bra.uni BB8_6; -BB8_41: // %for_loop.i - // Parent Loop BB8_3 Depth=1 - // => This Inner Loop Header: Depth=2 - selp.b32 %r11, %r10, 0, %p0; - mul.f32 %f8, %f7, %f7; - fma.rn.f32 %f9, %f6, %f6, %f8; - setp.gtu.f32 %p1, %f9, 0f40800000; - selp.b32 %r12, %r10, 0, %p1; - or.b32 %r9, %r12, %r9; - selp.b32 %r12, %r9, 0, %p0; - shr.u32 %r12, %r12, 31; - shr.u32 %r11, %r11, 31; - setp.eq.s32 %p1, %r12, %r11; - @%p1 bra BB8_42; - bra.uni BB8_39; -BB8_42: // in Loop: Header=BB8_41 Depth=2 - mov.u32 %r10, %r8; - bra.uni BB8_40; -BB8_39: // %not_all_continued_or_breaked.i - // in Loop: Header=BB8_41 Depth=2 - mul.f32 %f9, %f6, %f6; - not.b32 %r11, %r9; - and.b32 %r10, %r10, %r11; - sub.f32 %f8, %f8, %f9; - add.f32 %f8, %f4, %f8; - add.f32 %f7, %f7, %f7; - fma.rn.f32 %f6, %f6, %f7, %f5; - mov.f32 %f7, %f8; -BB8_40: // %for_step.i - // in Loop: Header=BB8_41 Depth=2 - setp.ne.s32 %p1, %r10, 0; - selp.u32 %r11, 1, 0, %p1; - add.s32 %r7, %r7, %r11; - setp.lt.s32 %p1, %r7, %r2; - selp.b32 %r10, %r10, 0, %p1; - selp.b32 %r11, %r10, 0, %p0; - setp.gt.s32 %p1, %r11, -1; - @%p1 bra BB8_7; - bra.uni BB8_41; -BB8_6: // in Loop: Header=BB8_3 Depth=1 - mov.u32 %r7, %r8; -BB8_7: // %mandel___vyfvyfvyi.exit - // in Loop: Header=BB8_3 Depth=1 - setp.lt.s32 %p0, %r0, 1; - @%p0 bra BB8_44; -// BB#8: // %pl_dolane.i - // in Loop: Header=BB8_3 Depth=1 - mul.lo.s32 %r8, %r6, %r0; - shl.b32 %r8, %r8, 2; - cvt.s64.s32 %rl1, %r8; - add.s64 %rl1, %rl1, %rl0; - st.u32 [%rl1], %r7; -BB8_44: // %foreach_reset128 - // in Loop: Header=BB8_3 Depth=1 - add.s32 %r6, %r6, 1; - add.s32 %r3, %r3, %r5; - setp.eq.s32 %p0, %r6, %r1; - @%p0 bra BB8_45; - bra.uni BB8_3; -BB8_9: // %for_test.preheader - setp.lt.s32 %p0, %r1, 1; - @%p0 bra BB8_45; -// BB#10: // %outer_not_in_extras.preheader.lr.ph - setp.gt.s32 %p0, %r2, 0; - mov.u32 %r3, 0; - selp.b32 %r4, -1, 0, %p0; - shl.b32 %r5, %r0, 2; - mov.u32 %r6, %r3; -BB8_11: // %outer_not_in_extras.preheader - // =>This Loop Header: Depth=1 - // Child Loop BB8_23 Depth 2 - // Child Loop BB8_20 Depth 2 - // Child Loop BB8_19 Depth 2 - // Child Loop BB8_14 Depth 3 - setp.lt.s32 %p0, %r0, 1; - @%p0 bra BB8_28; -// BB#12: // %foreach_full_body.lr.ph - // in Loop: Header=BB8_11 Depth=1 - setp.lt.s32 %p0, %r4, 0; - mov.u32 %r7, %r0; - mov.u32 %r8, %r3; - @%p0 bra BB8_13; - bra.uni BB8_20; -BB8_13: // in Loop: Header=BB8_11 Depth=1 - mov.u64 %rl1, 0; - cvt.rn.f32.s32 %f4, %r6; - fma.rn.f32 %f4, %f2, %f4, %f1; - mul.lo.s32 %r7, %r6, %r0; -BB8_19: // %for_loop.i281.lr.ph.us - // Parent Loop BB8_11 Depth=1 - // => This Loop Header: Depth=2 - // Child Loop BB8_14 Depth 3 - cvt.u32.u64 %r8, %rl1; - cvt.rn.f32.s32 %f5, %r8; - fma.rn.f32 %f5, %f3, %f5, %f0; - mov.u32 %r10, 0; - mov.u32 %r12, %r4; - mov.u32 %r11, %r10; - mov.u32 %r9, %r10; - mov.f32 %f7, %f5; - mov.f32 %f6, %f4; -BB8_14: // %for_loop.i281.us - // Parent Loop BB8_11 Depth=1 - // Parent Loop BB8_19 Depth=2 - // => This Inner Loop Header: Depth=3 - mul.f32 %f8, %f7, %f7; - fma.rn.f32 %f9, %f6, %f6, %f8; - setp.gtu.f32 %p0, %f9, 0f40800000; - selp.b32 %r13, %r12, 0, %p0; - or.b32 %r11, %r13, %r11; - shr.u32 %r13, %r11, 31; - shr.u32 %r14, %r12, 31; - setp.eq.s32 %p0, %r13, %r14; - @%p0 bra BB8_15; - bra.uni BB8_16; -BB8_15: // in Loop: Header=BB8_14 Depth=3 - mov.u32 %r12, %r10; - bra.uni BB8_17; -BB8_16: // %not_all_continued_or_breaked.i295.us - // in Loop: Header=BB8_14 Depth=3 - mul.f32 %f9, %f6, %f6; - not.b32 %r13, %r11; - and.b32 %r12, %r12, %r13; - sub.f32 %f8, %f8, %f9; - add.f32 %f8, %f5, %f8; - add.f32 %f7, %f7, %f7; - fma.rn.f32 %f6, %f6, %f7, %f4; - mov.f32 %f7, %f8; -BB8_17: // %for_step.i264.us - // in Loop: Header=BB8_14 Depth=3 - setp.ne.s32 %p0, %r12, 0; - selp.u32 %r13, 1, 0, %p0; - add.s32 %r9, %r9, %r13; - setp.lt.s32 %p0, %r9, %r2; - selp.b32 %r12, %r12, 0, %p0; - setp.lt.s32 %p0, %r12, 0; - @%p0 bra BB8_14; -// BB#18: // %mandel___vyfvyfvyi.exit296.us - // in Loop: Header=BB8_19 Depth=2 - add.s32 %r8, %r8, %r7; - shl.b32 %r8, %r8, 2; - cvt.s64.s32 %rl2, %r8; - add.s64 %rl2, %rl2, %rl0; - st.u32 [%rl2], %r9; - add.s64 %rl1, %rl1, 1; - cvt.u32.u64 %r8, %rl1; - setp.eq.s32 %p0, %r8, %r0; - @%p0 bra BB8_27; - bra.uni BB8_19; -BB8_20: // %mandel___vyfvyfvyi.exit296 - // Parent Loop BB8_11 Depth=1 - // => This Inner Loop Header: Depth=2 - cvt.s64.s32 %rl1, %r8; - add.s64 %rl1, %rl1, %rl0; - mov.u32 %r9, 0; - st.u32 [%rl1], %r9; - add.s32 %r8, %r8, 4; - add.s32 %r7, %r7, -1; - setp.eq.s32 %p0, %r7, 0; - @%p0 bra BB8_27; - bra.uni BB8_20; -BB8_28: // %partial_inner_all_outer - // in Loop: Header=BB8_11 Depth=1 - @%p0 bra BB8_27; -// BB#29: // %partial_inner_only - // in Loop: Header=BB8_11 Depth=1 - setp.gt.s32 %p0, %r0, 0; - mov.u32 %r8, 0; - fma.rn.f32 %f4, %f3, 0f00000000, %f0; - cvt.rn.f32.s32 %f5, %r6; - fma.rn.f32 %f5, %f2, %f5, %f1; - selp.b32 %r7, %r4, 0, %p0; - setp.lt.s32 %p1, %r7, 0; - mov.u32 %r10, %r4; - mov.u32 %r9, %r8; - mov.u32 %r7, %r8; - mov.f32 %f7, %f4; - mov.f32 %f6, %f5; - @%p1 bra BB8_23; - bra.uni BB8_30; -BB8_23: // %for_loop.i332 - // Parent Loop BB8_11 Depth=1 - // => This Inner Loop Header: Depth=2 - selp.b32 %r11, %r10, 0, %p0; - mul.f32 %f8, %f7, %f7; - fma.rn.f32 %f9, %f6, %f6, %f8; - setp.gtu.f32 %p1, %f9, 0f40800000; - selp.b32 %r12, %r10, 0, %p1; - or.b32 %r9, %r12, %r9; - selp.b32 %r12, %r9, 0, %p0; - shr.u32 %r12, %r12, 31; - shr.u32 %r11, %r11, 31; - setp.eq.s32 %p1, %r12, %r11; - @%p1 bra BB8_24; - bra.uni BB8_21; -BB8_24: // in Loop: Header=BB8_23 Depth=2 - mov.u32 %r10, %r8; - bra.uni BB8_22; -BB8_21: // %not_all_continued_or_breaked.i346 - // in Loop: Header=BB8_23 Depth=2 - mul.f32 %f9, %f6, %f6; - not.b32 %r11, %r9; - and.b32 %r10, %r10, %r11; - sub.f32 %f8, %f8, %f9; - add.f32 %f8, %f4, %f8; - add.f32 %f7, %f7, %f7; - fma.rn.f32 %f6, %f6, %f7, %f5; - mov.f32 %f7, %f8; -BB8_22: // %for_step.i313 - // in Loop: Header=BB8_23 Depth=2 - setp.ne.s32 %p1, %r10, 0; - selp.u32 %r11, 1, 0, %p1; - add.s32 %r7, %r7, %r11; - setp.lt.s32 %p1, %r7, %r2; - selp.b32 %r10, %r10, 0, %p1; - selp.b32 %r11, %r10, 0, %p0; - setp.gt.s32 %p1, %r11, -1; - @%p1 bra BB8_25; - bra.uni BB8_23; -BB8_30: // in Loop: Header=BB8_11 Depth=1 - mov.u32 %r7, %r8; -BB8_25: // %mandel___vyfvyfvyi.exit347 - // in Loop: Header=BB8_11 Depth=1 - setp.lt.s32 %p0, %r0, 1; - @%p0 bra BB8_27; -// BB#26: // %pl_dolane.i452 - // in Loop: Header=BB8_11 Depth=1 - mul.lo.s32 %r8, %r6, %r0; - shl.b32 %r8, %r8, 2; - cvt.s64.s32 %rl1, %r8; - add.s64 %rl1, %rl1, %rl0; - st.u32 [%rl1], %r7; -BB8_27: // %foreach_reset - // in Loop: Header=BB8_11 Depth=1 - add.s32 %r6, %r6, 1; - add.s32 %r3, %r3, %r5; - setp.eq.s32 %p0, %r6, %r1; - @%p0 bra BB8_45; - bra.uni BB8_11; -BB8_45: // %for_exit - ret; -} - - // .globl mandelbrot_ispc -.func mandelbrot_ispc( - .param .b32 mandelbrot_ispc_param_0, - .param .b32 mandelbrot_ispc_param_1, - .param .b32 mandelbrot_ispc_param_2, - .param .b32 mandelbrot_ispc_param_3, - .param .b32 mandelbrot_ispc_param_4, - .param .b32 mandelbrot_ispc_param_5, - .param .b32 mandelbrot_ispc_param_6, - .param .b64 mandelbrot_ispc_param_7 -) // @mandelbrot_ispc -{ - .reg .pred %p<396>; - .reg .s16 %rc<396>; - .reg .s16 %rs<396>; - .reg .s32 %r<396>; - .reg .s64 %rl<396>; - .reg .f32 %f<396>; - .reg .f64 %fl<396>; - -// BB#0: // %allocas - ld.param.u32 %r0, [mandelbrot_ispc_param_5]; - setp.lt.s32 %p0, %r0, 1; - @%p0 bra BB9_18; -// BB#1: // %outer_not_in_extras.preheader.lr.ph - ld.param.f32 %f0, [mandelbrot_ispc_param_0]; - ld.param.f32 %f1, [mandelbrot_ispc_param_1]; - ld.param.f32 %f3, [mandelbrot_ispc_param_2]; - ld.param.f32 %f2, [mandelbrot_ispc_param_3]; - ld.param.u32 %r1, [mandelbrot_ispc_param_4]; - ld.param.u32 %r2, [mandelbrot_ispc_param_6]; - ld.param.u64 %rl0, [mandelbrot_ispc_param_7]; - sub.f32 %f3, %f3, %f0; - cvt.rn.f32.s32 %f4, %r1; - sub.f32 %f2, %f2, %f1; - cvt.rn.f32.s32 %f5, %r0; - div.rn.f32 %f2, %f2, %f5; - div.rn.f32 %f3, %f3, %f4; - setp.gt.s32 %p0, %r2, 0; - mov.u32 %r3, 0; - selp.b32 %r4, -1, 0, %p0; -BB9_2: // %outer_not_in_extras.preheader - // =>This Loop Header: Depth=1 - // Child Loop BB9_13 Depth 2 - // Child Loop BB9_4 Depth 2 - // Child Loop BB9_9 Depth 3 - setp.lt.s32 %p0, %r1, 1; - @%p0 bra BB9_19; -// BB#3: // %foreach_full_body.lr.ph - // in Loop: Header=BB9_2 Depth=1 - mov.u64 %rl1, 0; - cvt.rn.f32.s32 %f4, %r3; - fma.rn.f32 %f4, %f2, %f4, %f1; - mul.lo.s32 %r5, %r3, %r1; -BB9_4: // %foreach_full_body - // Parent Loop BB9_2 Depth=1 - // => This Loop Header: Depth=2 - // Child Loop BB9_9 Depth 3 - setp.lt.s32 %p0, %r4, 0; - cvt.u32.u64 %r6, %rl1; - cvt.rn.f32.s32 %f5, %r6; - fma.rn.f32 %f5, %f3, %f5, %f0; - mov.u32 %r8, 0; - mov.u32 %r10, %r4; - mov.u32 %r9, %r8; - mov.u32 %r7, %r8; - mov.f32 %f7, %f5; - mov.f32 %f6, %f4; - @%p0 bra BB9_9; - bra.uni BB9_5; -BB9_9: // %for_loop.i281 - // Parent Loop BB9_2 Depth=1 - // Parent Loop BB9_4 Depth=2 - // => This Inner Loop Header: Depth=3 - mul.f32 %f8, %f7, %f7; - fma.rn.f32 %f9, %f6, %f6, %f8; - setp.gtu.f32 %p0, %f9, 0f40800000; - selp.b32 %r11, %r10, 0, %p0; - or.b32 %r9, %r11, %r9; - shr.u32 %r11, %r9, 31; - shr.u32 %r12, %r10, 31; - setp.eq.s32 %p0, %r11, %r12; - @%p0 bra BB9_10; - bra.uni BB9_7; -BB9_10: // in Loop: Header=BB9_9 Depth=3 - mov.u32 %r10, %r8; - bra.uni BB9_8; -BB9_7: // %not_all_continued_or_breaked.i295 - // in Loop: Header=BB9_9 Depth=3 - mul.f32 %f9, %f6, %f6; - not.b32 %r11, %r9; - and.b32 %r10, %r10, %r11; - sub.f32 %f8, %f8, %f9; - add.f32 %f8, %f5, %f8; - add.f32 %f7, %f7, %f7; - fma.rn.f32 %f6, %f6, %f7, %f4; - mov.f32 %f7, %f8; -BB9_8: // %for_step.i264 - // in Loop: Header=BB9_9 Depth=3 - setp.ne.s32 %p0, %r10, 0; - selp.u32 %r11, 1, 0, %p0; - add.s32 %r7, %r7, %r11; - setp.lt.s32 %p0, %r7, %r2; - selp.b32 %r10, %r10, 0, %p0; - setp.gt.s32 %p0, %r10, -1; - @%p0 bra BB9_6; - bra.uni BB9_9; -BB9_5: // in Loop: Header=BB9_4 Depth=2 - mov.u32 %r7, %r8; -BB9_6: // %mandel___vyfvyfvyi.exit296 - // in Loop: Header=BB9_4 Depth=2 - add.s32 %r6, %r6, %r5; - shl.b32 %r6, %r6, 2; - cvt.s64.s32 %rl2, %r6; - add.s64 %rl2, %rl2, %rl0; - st.u32 [%rl2], %r7; - add.s64 %rl1, %rl1, 1; - cvt.u32.u64 %r6, %rl1; - setp.eq.s32 %p0, %r6, %r1; - @%p0 bra BB9_17; - bra.uni BB9_4; -BB9_19: // %partial_inner_all_outer - // in Loop: Header=BB9_2 Depth=1 - @%p0 bra BB9_17; -// BB#20: // %partial_inner_only - // in Loop: Header=BB9_2 Depth=1 - setp.gt.s32 %p0, %r1, 0; - mov.u32 %r6, 0; - fma.rn.f32 %f4, %f3, 0f00000000, %f0; - cvt.rn.f32.s32 %f5, %r3; - fma.rn.f32 %f5, %f2, %f5, %f1; - selp.b32 %r5, %r4, 0, %p0; - setp.lt.s32 %p1, %r5, 0; - mov.u32 %r8, %r4; - mov.u32 %r7, %r6; - mov.u32 %r5, %r6; - mov.f32 %f7, %f4; - mov.f32 %f6, %f5; - @%p1 bra BB9_13; - bra.uni BB9_21; -BB9_13: // %for_loop.i332 - // Parent Loop BB9_2 Depth=1 - // => This Inner Loop Header: Depth=2 - selp.b32 %r9, %r8, 0, %p0; - mul.f32 %f8, %f7, %f7; - fma.rn.f32 %f9, %f6, %f6, %f8; - setp.gtu.f32 %p1, %f9, 0f40800000; - selp.b32 %r10, %r8, 0, %p1; - or.b32 %r7, %r10, %r7; - selp.b32 %r10, %r7, 0, %p0; - shr.u32 %r10, %r10, 31; - shr.u32 %r9, %r9, 31; - setp.eq.s32 %p1, %r10, %r9; - @%p1 bra BB9_14; - bra.uni BB9_11; -BB9_14: // in Loop: Header=BB9_13 Depth=2 - mov.u32 %r8, %r6; - bra.uni BB9_12; -BB9_11: // %not_all_continued_or_breaked.i346 - // in Loop: Header=BB9_13 Depth=2 - mul.f32 %f9, %f6, %f6; - not.b32 %r9, %r7; - and.b32 %r8, %r8, %r9; - sub.f32 %f8, %f8, %f9; - add.f32 %f8, %f4, %f8; - add.f32 %f7, %f7, %f7; - fma.rn.f32 %f6, %f6, %f7, %f5; - mov.f32 %f7, %f8; -BB9_12: // %for_step.i313 - // in Loop: Header=BB9_13 Depth=2 - setp.ne.s32 %p1, %r8, 0; - selp.u32 %r9, 1, 0, %p1; - add.s32 %r5, %r5, %r9; - setp.lt.s32 %p1, %r5, %r2; - selp.b32 %r8, %r8, 0, %p1; - selp.b32 %r9, %r8, 0, %p0; - setp.gt.s32 %p1, %r9, -1; - @%p1 bra BB9_15; - bra.uni BB9_13; -BB9_21: // in Loop: Header=BB9_2 Depth=1 - mov.u32 %r5, %r6; -BB9_15: // %mandel___vyfvyfvyi.exit347 - // in Loop: Header=BB9_2 Depth=1 - setp.lt.s32 %p0, %r1, 1; - @%p0 bra BB9_17; -// BB#16: // %pl_dolane.i - // in Loop: Header=BB9_2 Depth=1 - mul.lo.s32 %r6, %r3, %r1; - shl.b32 %r6, %r6, 2; - cvt.s64.s32 %rl1, %r6; - add.s64 %rl1, %rl1, %rl0; - st.u32 [%rl1], %r5; -BB9_17: // %foreach_reset - // in Loop: Header=BB9_2 Depth=1 - add.s32 %r3, %r3, 1; - setp.eq.s32 %p0, %r3, %r0; - @%p0 bra BB9_18; - bra.uni BB9_2; -BB9_18: // %for_exit - ret; -} - diff --git a/examples/mandelbrot/out.s b/examples/mandelbrot/out.s deleted file mode 100644 index 434a1fb5..00000000 Binary files a/examples/mandelbrot/out.s and /dev/null differ diff --git a/examples/mandelbrot/out1.o b/examples/mandelbrot/out1.o deleted file mode 100644 index 61d76b88..00000000 Binary files a/examples/mandelbrot/out1.o and /dev/null differ diff --git a/examples/mandelbrot_tasks/Makefile b/examples/mandelbrot_tasks/Makefile index cfbad4c1..51866b32 100644 --- a/examples/mandelbrot_tasks/Makefile +++ b/examples/mandelbrot_tasks/Makefile @@ -2,7 +2,7 @@ EXAMPLE=mandelbrot_tasks CPP_SRC=mandelbrot_tasks.cpp mandelbrot_tasks_serial.cpp ISPC_SRC=mandelbrot_tasks.ispc -ISPC_IA_TARGETS=avx1-i32x16 +ISPC_IA_TARGETS=sse2-i32x4,sse4-i32x8,avx1-i32x16,avx2-i32x16 ISPC_ARM_TARGETS=neon include ../common.mk diff --git a/examples/rt/Makefile b/examples/rt/Makefile index 0c72f104..e52b02e4 100644 --- a/examples/rt/Makefile +++ b/examples/rt/Makefile @@ -2,7 +2,7 @@ EXAMPLE=rt CPP_SRC=rt.cpp rt_serial.cpp ISPC_SRC=rt.ispc -ISPC_IA_TARGETS=avx1-i32x8 +ISPC_IA_TARGETS=sse2-i32x4,sse4-i32x8,avx1-i32x8,avx2-i32x8 ISPC_ARM_TARGETS=neon include ../common.mk diff --git a/examples/sort/sort b/examples/sort/sort deleted file mode 100755 index 51e36ea7..00000000 Binary files a/examples/sort/sort and /dev/null differ diff --git a/examples/stencil/a.out b/examples/stencil/a.out deleted file mode 100755 index db6400a6..00000000 Binary files a/examples/stencil/a.out and /dev/null differ diff --git a/examples/stencil/drvapi_error_string.h b/examples/stencil/drvapi_error_string.h deleted file mode 100644 index ce85f152..00000000 --- a/examples/stencil/drvapi_error_string.h +++ /dev/null @@ -1,370 +0,0 @@ -/* - * Copyright 1993-2012 NVIDIA Corporation. All rights reserved. - * - * Please refer to the NVIDIA end user license agreement (EULA) associated - * with this source code for terms and conditions that govern your use of - * this software. Any use, reproduction, disclosure, or distribution of - * this software and related documentation outside the terms of the EULA - * is strictly prohibited. - * - */ - -#ifndef _DRVAPI_ERROR_STRING_H_ -#define _DRVAPI_ERROR_STRING_H_ - -#include -#include -#include - -// Error Code string definitions here -typedef struct -{ - char const *error_string; - int error_id; -} s_CudaErrorStr; - -/** - * Error codes - */ -static s_CudaErrorStr sCudaDrvErrorString[] = -{ - /** - * The API call returned with no errors. In the case of query calls, this - * can also mean that the operation being queried is complete (see - * ::cuEventQuery() and ::cuStreamQuery()). - */ - { "CUDA_SUCCESS", 0 }, - - /** - * This indicates that one or more of the parameters passed to the API call - * is not within an acceptable range of values. - */ - { "CUDA_ERROR_INVALID_VALUE", 1 }, - - /** - * The API call failed because it was unable to allocate enough memory to - * perform the requested operation. - */ - { "CUDA_ERROR_OUT_OF_MEMORY", 2 }, - - /** - * This indicates that the CUDA driver has not been initialized with - * ::cuInit() or that initialization has failed. - */ - { "CUDA_ERROR_NOT_INITIALIZED", 3 }, - - /** - * This indicates that the CUDA driver is in the process of shutting down. - */ - { "CUDA_ERROR_DEINITIALIZED", 4 }, - - /** - * This indicates profiling APIs are called while application is running - * in visual profiler mode. - */ - { "CUDA_ERROR_PROFILER_DISABLED", 5 }, - /** - * This indicates profiling has not been initialized for this context. - * Call cuProfilerInitialize() to resolve this. - */ - { "CUDA_ERROR_PROFILER_NOT_INITIALIZED", 6 }, - /** - * This indicates profiler has already been started and probably - * cuProfilerStart() is incorrectly called. - */ - { "CUDA_ERROR_PROFILER_ALREADY_STARTED", 7 }, - /** - * This indicates profiler has already been stopped and probably - * cuProfilerStop() is incorrectly called. - */ - { "CUDA_ERROR_PROFILER_ALREADY_STOPPED", 8 }, - /** - * This indicates that no CUDA-capable devices were detected by the installed - * CUDA driver. - */ - { "CUDA_ERROR_NO_DEVICE (no CUDA-capable devices were detected)", 100 }, - - /** - * This indicates that the device ordinal supplied by the user does not - * correspond to a valid CUDA device. - */ - { "CUDA_ERROR_INVALID_DEVICE (device specified is not a valid CUDA device)", 101 }, - - - /** - * This indicates that the device kernel image is invalid. This can also - * indicate an invalid CUDA module. - */ - { "CUDA_ERROR_INVALID_IMAGE", 200 }, - - /** - * This most frequently indicates that there is no context bound to the - * current thread. This can also be returned if the context passed to an - * API call is not a valid handle (such as a context that has had - * ::cuCtxDestroy() invoked on it). This can also be returned if a user - * mixes different API versions (i.e. 3010 context with 3020 API calls). - * See ::cuCtxGetApiVersion() for more details. - */ - { "CUDA_ERROR_INVALID_CONTEXT", 201 }, - - /** - * This indicated that the context being supplied as a parameter to the - * API call was already the active context. - * \deprecated - * This error return is deprecated as of CUDA 3.2. It is no longer an - * error to attempt to push the active context via ::cuCtxPushCurrent(). - */ - { "CUDA_ERROR_CONTEXT_ALREADY_CURRENT", 202 }, - - /** - * This indicates that a map or register operation has failed. - */ - { "CUDA_ERROR_MAP_FAILED", 205 }, - - /** - * This indicates that an unmap or unregister operation has failed. - */ - { "CUDA_ERROR_UNMAP_FAILED", 206 }, - - /** - * This indicates that the specified array is currently mapped and thus - * cannot be destroyed. - */ - { "CUDA_ERROR_ARRAY_IS_MAPPED", 207 }, - - /** - * This indicates that the resource is already mapped. - */ - { "CUDA_ERROR_ALREADY_MAPPED", 208 }, - - /** - * This indicates that there is no kernel image available that is suitable - * for the device. This can occur when a user specifies code generation - * options for a particular CUDA source file that do not include the - * corresponding device configuration. - */ - { "CUDA_ERROR_NO_BINARY_FOR_GPU", 209 }, - - /** - * This indicates that a resource has already been acquired. - */ - { "CUDA_ERROR_ALREADY_ACQUIRED", 210 }, - - /** - * This indicates that a resource is not mapped. - */ - { "CUDA_ERROR_NOT_MAPPED", 211 }, - - /** - * This indicates that a mapped resource is not available for access as an - * array. - */ - { "CUDA_ERROR_NOT_MAPPED_AS_ARRAY", 212 }, - - /** - * This indicates that a mapped resource is not available for access as a - * pointer. - */ - { "CUDA_ERROR_NOT_MAPPED_AS_POINTER", 213 }, - - /** - * This indicates that an uncorrectable ECC error was detected during - * execution. - */ - { "CUDA_ERROR_ECC_UNCORRECTABLE", 214 }, - - /** - * This indicates that the ::CUlimit passed to the API call is not - * supported by the active device. - */ - { "CUDA_ERROR_UNSUPPORTED_LIMIT", 215 }, - - /** - * This indicates that the ::CUcontext passed to the API call can - * only be bound to a single CPU thread at a time but is already - * bound to a CPU thread. - */ - { "CUDA_ERROR_CONTEXT_ALREADY_IN_USE", 216 }, - - /** - * This indicates that peer access is not supported across the given - * devices. - */ - { "CUDA_ERROR_PEER_ACCESS_UNSUPPORTED", 217}, - - /** - * This indicates that the device kernel source is invalid. - */ - { "CUDA_ERROR_INVALID_SOURCE", 300 }, - - /** - * This indicates that the file specified was not found. - */ - { "CUDA_ERROR_FILE_NOT_FOUND", 301 }, - - /** - * This indicates that a link to a shared object failed to resolve. - */ - { "CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND", 302 }, - - /** - * This indicates that initialization of a shared object failed. - */ - { "CUDA_ERROR_SHARED_OBJECT_INIT_FAILED", 303 }, - - /** - * This indicates that an OS call failed. - */ - { "CUDA_ERROR_OPERATING_SYSTEM", 304 }, - - - /** - * This indicates that a resource handle passed to the API call was not - * valid. Resource handles are opaque types like ::CUstream and ::CUevent. - */ - { "CUDA_ERROR_INVALID_HANDLE", 400 }, - - - /** - * This indicates that a named symbol was not found. Examples of symbols - * are global/constant variable names, texture names }, and surface names. - */ - { "CUDA_ERROR_NOT_FOUND", 500 }, - - - /** - * This indicates that asynchronous operations issued previously have not - * completed yet. This result is not actually an error, but must be indicated - * differently than ::CUDA_SUCCESS (which indicates completion). Calls that - * may return this value include ::cuEventQuery() and ::cuStreamQuery(). - */ - { "CUDA_ERROR_NOT_READY", 600 }, - - - /** - * An exception occurred on the device while executing a kernel. Common - * causes include dereferencing an invalid device pointer and accessing - * out of bounds shared memory. The context cannot be used }, so it must - * be destroyed (and a new one should be created). All existing device - * memory allocations from this context are invalid and must be - * reconstructed if the program is to continue using CUDA. - */ - { "CUDA_ERROR_LAUNCH_FAILED", 700 }, - - /** - * This indicates that a launch did not occur because it did not have - * appropriate resources. This error usually indicates that the user has - * attempted to pass too many arguments to the device kernel, or the - * kernel launch specifies too many threads for the kernel's register - * count. Passing arguments of the wrong size (i.e. a 64-bit pointer - * when a 32-bit int is expected) is equivalent to passing too many - * arguments and can also result in this error. - */ - { "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES", 701 }, - - /** - * This indicates that the device kernel took too long to execute. This can - * only occur if timeouts are enabled - see the device attribute - * ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT for more information. The - * context cannot be used (and must be destroyed similar to - * ::CUDA_ERROR_LAUNCH_FAILED). All existing device memory allocations from - * this context are invalid and must be reconstructed if the program is to - * continue using CUDA. - */ - { "CUDA_ERROR_LAUNCH_TIMEOUT", 702 }, - - /** - * This error indicates a kernel launch that uses an incompatible texturing - * mode. - */ - { "CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING", 703 }, - - /** - * This error indicates that a call to ::cuCtxEnablePeerAccess() is - * trying to re-enable peer access to a context which has already - * had peer access to it enabled. - */ - { "CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED", 704 }, - - /** - * This error indicates that ::cuCtxDisablePeerAccess() is - * trying to disable peer access which has not been enabled yet - * via ::cuCtxEnablePeerAccess(). - */ - { "CUDA_ERROR_PEER_ACCESS_NOT_ENABLED", 705 }, - - /** - * This error indicates that the primary context for the specified device - * has already been initialized. - */ - { "CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE", 708 }, - - /** - * This error indicates that the context current to the calling thread - * has been destroyed using ::cuCtxDestroy }, or is a primary context which - * has not yet been initialized. - */ - { "CUDA_ERROR_CONTEXT_IS_DESTROYED", 709 }, - - /** - * A device-side assert triggered during kernel execution. The context - * cannot be used anymore, and must be destroyed. All existing device - * memory allocations from this context are invalid and must be - * reconstructed if the program is to continue using CUDA. - */ - { "CUDA_ERROR_ASSERT", 710 }, - - /** - * This error indicates that the hardware resources required to enable - * peer access have been exhausted for one or more of the devices - * passed to ::cuCtxEnablePeerAccess(). - */ - { "CUDA_ERROR_TOO_MANY_PEERS", 711 }, - - /** - * This error indicates that the memory range passed to ::cuMemHostRegister() - * has already been registered. - */ - { "CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED", 712 }, - - /** - * This error indicates that the pointer passed to ::cuMemHostUnregister() - * does not correspond to any currently registered memory region. - */ - { "CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED", 713 }, - - /** - * This error indicates that the attempted operation is not permitted. - */ - { "CUDA_ERROR_NOT_PERMITTED", 800 }, - - /** - * This error indicates that the attempted operation is not supported - * on the current system or device. - */ - { "CUDA_ERROR_NOT_SUPPORTED", 801 }, - - /** - * This indicates that an unknown internal error has occurred. - */ - { "CUDA_ERROR_UNKNOWN", 999 }, - { NULL, -1 } -}; - -// This is just a linear search through the array, since the error_id's are not -// always ocurring consecutively -const char * getCudaDrvErrorString(CUresult error_id) -{ - int index = 0; - while (sCudaDrvErrorString[index].error_id != error_id && - sCudaDrvErrorString[index].error_id != -1) - { - index++; - } - if (sCudaDrvErrorString[index].error_id == error_id) - return (const char *)sCudaDrvErrorString[index].error_string; - else - return (const char *)"CUDA_ERROR not found!"; -} - -#endif diff --git a/examples/stencil/err b/examples/stencil/err deleted file mode 100644 index e69de29b..00000000 diff --git a/examples/stencil/info b/examples/stencil/info deleted file mode 100644 index 4fc9105f..00000000 --- a/examples/stencil/info +++ /dev/null @@ -1,5 +0,0 @@ -I have been working with sort example, attempting to use ISPC_USE_OMP for tasking and adding example for sort_paralle.cpp which uses __gnu_parallel::sort to compare apples with apples, but clang has no support for OpenMP. - -The reason to use ISPC_USE_OMP is to control thread-affinity on multi-socket systems. For bandwidth bound throughput, the tasking system based on pthread make it messy to control thread-affinity and w/o this for bandwidth bound work-loads performance may suffer.. - -I used sort example to begin with diff --git a/examples/stencil/stencil.cu b/examples/stencil/stencil.cu deleted file mode 100644 index 63b051e8..00000000 --- a/examples/stencil/stencil.cu +++ /dev/null @@ -1,62 +0,0 @@ -#define programCount 32 -#define programIndex threadIdx.x -#define taskIndex blockIdx.x - -__device__ static void -stencil_step( int x0, int x1, - int y0, int y1, - int z0, int z1, - int Nx, int Ny, int Nz, - const double coef[4], const double vsq[], - const double Ain[], double Aout[]) { - const int Nxy = Nx * Ny; - - -#if 0 - foreach (z = z0 ... z1, y = y0 ... y1, x = x0 ... x1) { -#else - const double coef0 = coef[0]; - const double coef1 = coef[1]; - const double coef2 = coef[2]; - const double coef3 = coef[3]; - for ( int z = z0; z < z1; z++) - for ( int y = y0 ; y < y1; y++) - for ( int xb = x0; xb < x1; xb += programCount) - { - const int x = xb + programIndex; - -#endif - int index = (z * Nxy) + (y * Nx) + x; -#define A_cur(x, y, z) Ain[index + (x) + ((y) * Nx) + ((z) * Nxy)] -#define A_next(x, y, z) Aout[index + (x) + ((y) * Nx) + ((z) * Nxy)] - double div = - coef0 * A_cur(0, 0, 0) + - coef1 * (A_cur(+1, 0, 0) + A_cur(-1, 0, 0) + - A_cur(0, +1, 0) + A_cur(0, -1, 0) + - A_cur(0, 0, +1) + A_cur(0, 0, -1)) + - coef2 * (A_cur(+2, 0, 0) + A_cur(-2, 0, 0) + - A_cur(0, +2, 0) + A_cur(0, -2, 0) + - A_cur(0, 0, +2) + A_cur(0, 0, -2)) + - coef3 * (A_cur(+3, 0, 0) + A_cur(-3, 0, 0) + - A_cur(0, +3, 0) + A_cur(0, -3, 0) + - A_cur(0, 0, +3) + A_cur(0, 0, -3)); - - if (x < x1) - A_next(0, 0, 0) = 2.0 * A_cur(0, 0, 0) - A_next(0, 0, 0) + - vsq[index] * div; - } -} - - -extern "C" -__global__ void -stencil_step_task( int x0, int x1, - int y0, int y1, - int z0, - int Nx, int Ny, int Nz, - const double coef[4], const double vsq[], - const double Ain[], double Aout[]) { - stencil_step(x0, x1, y0, y1, z0+taskIndex, z0+taskIndex+1, - Nx, Ny, Nz, coef, vsq, Ain, Aout); -} - diff --git a/examples/stencil/stencil.ptx b/examples/stencil/stencil.ptx deleted file mode 100644 index e3dcd1ca..00000000 --- a/examples/stencil/stencil.ptx +++ /dev/null @@ -1,267 +0,0 @@ -// -// Generated by NVIDIA NVVM Compiler -// Compiler built on Thu Jul 18 02:37:37 2013 (1374107857) -// Cuda compilation tools, release 5.5, V5.5.0 -// - -.version 3.2 -.target sm_35 -.address_size 64 - - .file 1 "/home/evghenii/soft/ispc-code/ispc/examples/stencil/stencil.cu", 1383254912, 2112 - .file 2 "/usr/local/cuda-5.5/bin/..//include/cuda_device_runtime_api.h", 1375338991, 7655 - -.weak .func (.param .b32 func_retval0) cudaMalloc( - .param .b64 cudaMalloc_param_0, - .param .b64 cudaMalloc_param_1 -) -{ - .reg .s32 %r<2>; - - - mov.u32 %r1, 30; - st.param.b32 [func_retval0+0], %r1; - .loc 2 66 3 - ret; -} - -.weak .func (.param .b32 func_retval0) cudaFuncGetAttributes( - .param .b64 cudaFuncGetAttributes_param_0, - .param .b64 cudaFuncGetAttributes_param_1 -) -{ - .reg .s32 %r<2>; - - - mov.u32 %r1, 30; - st.param.b32 [func_retval0+0], %r1; - .loc 2 71 3 - ret; -} - -.visible .entry stencil_step_task( - .param .u32 stencil_step_task_param_0, - .param .u32 stencil_step_task_param_1, - .param .u32 stencil_step_task_param_2, - .param .u32 stencil_step_task_param_3, - .param .u32 stencil_step_task_param_4, - .param .u32 stencil_step_task_param_5, - .param .u32 stencil_step_task_param_6, - .param .u32 stencil_step_task_param_7, - .param .u64 stencil_step_task_param_8, - .param .u64 stencil_step_task_param_9, - .param .u64 stencil_step_task_param_10, - .param .u64 stencil_step_task_param_11 -) -{ - .reg .pred %p<8>; - .reg .s32 %r<54>; - .reg .s64 %rd<36>; - .reg .f64 %fd<48>; - - - ld.param.u32 %r19, [stencil_step_task_param_0]; - ld.param.u32 %r20, [stencil_step_task_param_1]; - ld.param.u32 %r21, [stencil_step_task_param_2]; - ld.param.u32 %r22, [stencil_step_task_param_3]; - ld.param.u32 %r23, [stencil_step_task_param_4]; - ld.param.u32 %r24, [stencil_step_task_param_5]; - ld.param.u32 %r25, [stencil_step_task_param_6]; - ld.param.u64 %rd4, [stencil_step_task_param_8]; - ld.param.u64 %rd1, [stencil_step_task_param_9]; - ld.param.u64 %rd2, [stencil_step_task_param_10]; - ld.param.u64 %rd3, [stencil_step_task_param_11]; - cvta.to.global.u64 %rd5, %rd4; - .loc 1 59 1 - mov.u32 %r26, %ctaid.x; - add.s32 %r51, %r26, %r23; - add.s32 %r27, %r51, 1; - .loc 1 18 1 - ld.global.f64 %fd1, [%rd5]; - .loc 1 19 1 - ld.global.f64 %fd2, [%rd5+8]; - .loc 1 20 1 - ld.global.f64 %fd3, [%rd5+16]; - .loc 1 21 1 - ld.global.f64 %fd4, [%rd5+24]; - .loc 1 22 1 - setp.ge.s32 %p1, %r51, %r27; - @%p1 bra BB2_11; - - mul.lo.s32 %r28, %r25, %r24; - shl.b32 %r29, %r28, 1; - neg.s32 %r30, %r29; - shl.b32 %r2, %r30, 3; - cvta.to.global.u64 %rd6, %rd2; - cvta.to.global.u64 %rd31, %rd3; - cvta.to.global.u64 %rd32, %rd1; - -BB2_2: - .loc 1 23 1 - setp.ge.s32 %p2, %r21, %r22; - @%p2 bra BB2_10; - - mov.u32 %r52, %r21; - -BB2_4: - .loc 1 24 1 - mov.u32 %r4, %r52; - setp.ge.s32 %p3, %r19, %r20; - @%p3 bra BB2_9; - - .loc 1 29 1 - mul.lo.s32 %r32, %r51, %r28; - mad.lo.s32 %r5, %r4, %r24, %r32; - .loc 1 32 1 - add.s32 %r6, %r24, %r5; - add.s32 %r7, %r5, %r28; - shl.b32 %r33, %r24, 1; - add.s32 %r8, %r5, %r33; - mad.lo.s32 %r9, %r24, -2, %r5; - add.s32 %r10, %r5, %r29; - mad.lo.s32 %r11, %r28, -2, %r5; - add.s32 %r12, %r24, %r8; - mad.lo.s32 %r13, %r28, 3, %r5; - mov.u32 %r53, %r19; - -BB2_6: - .loc 1 26 1 - mov.u32 %r14, %r53; - mov.u32 %r35, %tid.x; - add.s32 %r36, %r35, %r14; - .loc 1 29 1 - add.s32 %r15, %r36, %r5; - mul.wide.s32 %rd7, %r15, 8; - add.s64 %rd8, %rd6, %rd7; - .loc 1 32 1 - ld.global.f64 %fd5, [%rd8]; - ld.global.f64 %fd7, [%rd8+-8]; - ld.global.f64 %fd8, [%rd8+8]; - add.f64 %fd9, %fd8, %fd7; - add.s32 %r37, %r6, %r36; - mul.wide.s32 %rd9, %r37, 8; - add.s64 %rd10, %rd6, %rd9; - .loc 1 32 1 - ld.global.f64 %fd10, [%rd10]; - add.f64 %fd11, %fd9, %fd10; - .loc 1 22 1 - neg.s32 %r39, %r33; - shl.b32 %r40, %r39, 3; - cvt.s64.s32 %rd11, %r40; - add.s64 %rd12, %rd10, %rd11; - .loc 1 32 1 - ld.global.f64 %fd12, [%rd12]; - add.f64 %fd13, %fd11, %fd12; - add.s32 %r41, %r7, %r36; - mul.wide.s32 %rd13, %r41, 8; - add.s64 %rd14, %rd6, %rd13; - .loc 1 32 1 - ld.global.f64 %fd14, [%rd14]; - add.f64 %fd15, %fd13, %fd14; - cvt.s64.s32 %rd15, %r2; - add.s64 %rd16, %rd14, %rd15; - .loc 1 32 1 - ld.global.f64 %fd16, [%rd16]; - add.f64 %fd17, %fd15, %fd16; - mul.f64 %fd18, %fd2, %fd17; - fma.rn.f64 %fd19, %fd1, %fd5, %fd18; - ld.global.f64 %fd20, [%rd8+-16]; - ld.global.f64 %fd21, [%rd8+16]; - add.f64 %fd22, %fd21, %fd20; - add.s32 %r42, %r8, %r36; - mul.wide.s32 %rd17, %r42, 8; - add.s64 %rd18, %rd6, %rd17; - .loc 1 32 1 - ld.global.f64 %fd23, [%rd18]; - add.f64 %fd24, %fd22, %fd23; - add.s32 %r43, %r9, %r36; - mul.wide.s32 %rd19, %r43, 8; - add.s64 %rd20, %rd6, %rd19; - .loc 1 32 1 - ld.global.f64 %fd25, [%rd20]; - add.f64 %fd26, %fd24, %fd25; - add.s32 %r44, %r10, %r36; - mul.wide.s32 %rd21, %r44, 8; - add.s64 %rd22, %rd6, %rd21; - .loc 1 32 1 - ld.global.f64 %fd27, [%rd22]; - add.f64 %fd28, %fd26, %fd27; - add.s32 %r45, %r11, %r36; - mul.wide.s32 %rd23, %r45, 8; - add.s64 %rd24, %rd6, %rd23; - .loc 1 32 1 - ld.global.f64 %fd29, [%rd24]; - add.f64 %fd30, %fd28, %fd29; - fma.rn.f64 %fd31, %fd3, %fd30, %fd19; - ld.global.f64 %fd32, [%rd8+-24]; - ld.global.f64 %fd33, [%rd8+24]; - add.f64 %fd34, %fd33, %fd32; - add.s32 %r46, %r12, %r36; - mul.wide.s32 %rd25, %r46, 8; - add.s64 %rd26, %rd6, %rd25; - .loc 1 32 1 - ld.global.f64 %fd35, [%rd26]; - add.f64 %fd36, %fd34, %fd35; - add.s64 %rd27, %rd12, %rd11; - .loc 1 32 1 - ld.global.f64 %fd37, [%rd27]; - add.f64 %fd38, %fd36, %fd37; - add.s32 %r47, %r13, %r36; - mul.wide.s32 %rd28, %r47, 8; - add.s64 %rd29, %rd6, %rd28; - .loc 1 32 1 - ld.global.f64 %fd39, [%rd29]; - add.f64 %fd40, %fd38, %fd39; - add.s64 %rd30, %rd16, %rd15; - .loc 1 32 1 - ld.global.f64 %fd41, [%rd30]; - add.f64 %fd42, %fd40, %fd41; - fma.rn.f64 %fd6, %fd4, %fd42, %fd31; - .loc 1 44 1 - setp.ge.s32 %p4, %r36, %r20; - @%p4 bra BB2_8; - - mul.wide.s32 %rd33, %r15, 8; - add.s64 %rd34, %rd31, %rd33; - .loc 1 45 1 - ld.global.f64 %fd43, [%rd34]; - add.f64 %fd44, %fd5, %fd5; - sub.f64 %fd45, %fd44, %fd43; - add.s64 %rd35, %rd32, %rd33; - .loc 1 45 1 - ld.global.f64 %fd46, [%rd35]; - fma.rn.f64 %fd47, %fd46, %fd6, %fd45; - st.global.f64 [%rd34], %fd47; - -BB2_8: - .loc 1 24 19 - add.s32 %r16, %r14, 32; - .loc 1 24 1 - setp.lt.s32 %p5, %r16, %r20; - mov.u32 %r53, %r16; - @%p5 bra BB2_6; - -BB2_9: - .loc 1 23 18 - add.s32 %r17, %r4, 1; - .loc 1 23 1 - setp.lt.s32 %p6, %r17, %r22; - mov.u32 %r52, %r17; - @%p6 bra BB2_4; - -BB2_10: - .loc 1 22 18 - add.s32 %r51, %r51, 1; - .loc 1 59 1 - add.s32 %r49, %r23, %r26; - add.s32 %r50, %r49, 1; - .loc 1 22 1 - setp.lt.s32 %p7, %r51, %r50; - @%p7 bra BB2_2; - -BB2_11: - .loc 1 61 2 - ret; -} - - diff --git a/examples/stencil/stencil0.ptx b/examples/stencil/stencil0.ptx deleted file mode 100644 index f06a11d9..00000000 --- a/examples/stencil/stencil0.ptx +++ /dev/null @@ -1,224 +0,0 @@ -// -// Generated by NVIDIA NVVM Compiler -// Compiler built on Thu Jul 18 02:37:37 2013 (1374107857) -// Cuda compilation tools, release 5.5, V5.5.0 -// - -.version 3.2 -.target sm_35 -.address_size 64 - - .file 1 "/home/evghenii/soft/ispc-code/ispc/examples/stencil/stencil.cu", 1383254912, 2112 - -) -{ - .reg .s32 %r<2>; - - - mov.u32 %r1, 30; - st.param.b32 [func_retval0+0], %r1; - ret; -} - -.weak .func (.param .b32 func_retval0) cudaFuncGetAttributes( - .param .b64 cudaFuncGetAttributes_param_0, - .param .b64 cudaFuncGetAttributes_param_1 -) -{ - .reg .s32 %r<2>; - - - mov.u32 %r1, 30; - st.param.b32 [func_retval0+0], %r1; - ret; -} - -.visible .entry stencil_step_task( - .param .u32 stencil_step_task_param_0, - .param .u32 stencil_step_task_param_1, - .param .u32 stencil_step_task_param_2, - .param .u32 stencil_step_task_param_3, - .param .u32 stencil_step_task_param_4, - .param .u32 stencil_step_task_param_5, - .param .u32 stencil_step_task_param_6, - .param .u32 stencil_step_task_param_7, - .param .u64 stencil_step_task_param_8, - .param .u64 stencil_step_task_param_9, - .param .u64 stencil_step_task_param_10, - .param .u64 stencil_step_task_param_11 -) -{ - .reg .pred %p<8>; - .reg .s32 %r<54>; - .reg .s64 %rd<36>; - .reg .f64 %fd<48>; - - - ld.param.u32 %r19, [stencil_step_task_param_0]; - ld.param.u32 %r20, [stencil_step_task_param_1]; - ld.param.u32 %r21, [stencil_step_task_param_2]; - ld.param.u32 %r22, [stencil_step_task_param_3]; - ld.param.u32 %r23, [stencil_step_task_param_4]; - ld.param.u32 %r24, [stencil_step_task_param_5]; - ld.param.u32 %r25, [stencil_step_task_param_6]; - ld.param.u64 %rd4, [stencil_step_task_param_8]; - ld.param.u64 %rd1, [stencil_step_task_param_9]; - ld.param.u64 %rd2, [stencil_step_task_param_10]; - ld.param.u64 %rd3, [stencil_step_task_param_11]; - cvta.to.global.u64 %rd5, %rd4; - mov.u32 %r26, %ctaid.x; - add.s32 %r51, %r26, %r23; - add.s32 %r27, %r51, 1; - ld.global.f64 %fd1, [%rd5]; - ld.global.f64 %fd2, [%rd5+8]; - ld.global.f64 %fd3, [%rd5+16]; - ld.global.f64 %fd4, [%rd5+24]; - setp.ge.s32 %p1, %r51, %r27; - @%p1 bra BB2_11; - - mul.lo.s32 %r28, %r25, %r24; - shl.b32 %r29, %r28, 1; - neg.s32 %r30, %r29; - shl.b32 %r2, %r30, 3; - cvta.to.global.u64 %rd6, %rd2; - cvta.to.global.u64 %rd31, %rd3; - cvta.to.global.u64 %rd32, %rd1; - -BB2_2: - setp.ge.s32 %p2, %r21, %r22; - @%p2 bra BB2_10; - - mov.u32 %r52, %r21; - -BB2_4: - mov.u32 %r4, %r52; - setp.ge.s32 %p3, %r19, %r20; - @%p3 bra BB2_9; - - mul.lo.s32 %r32, %r51, %r28; - mad.lo.s32 %r5, %r4, %r24, %r32; - add.s32 %r6, %r24, %r5; - add.s32 %r7, %r5, %r28; - shl.b32 %r33, %r24, 1; - add.s32 %r8, %r5, %r33; - mad.lo.s32 %r9, %r24, -2, %r5; - add.s32 %r10, %r5, %r29; - mad.lo.s32 %r11, %r28, -2, %r5; - add.s32 %r12, %r24, %r8; - mad.lo.s32 %r13, %r28, 3, %r5; - mov.u32 %r53, %r19; - -BB2_6: - mov.u32 %r14, %r53; - mov.u32 %r35, %tid.x; - add.s32 %r36, %r35, %r14; - add.s32 %r15, %r36, %r5; - mul.wide.s32 %rd7, %r15, 8; - add.s64 %rd8, %rd6, %rd7; - ld.global.f64 %fd5, [%rd8]; - ld.global.f64 %fd7, [%rd8+-8]; - ld.global.f64 %fd8, [%rd8+8]; - add.f64 %fd9, %fd8, %fd7; - add.s32 %r37, %r6, %r36; - mul.wide.s32 %rd9, %r37, 8; - add.s64 %rd10, %rd6, %rd9; - ld.global.f64 %fd10, [%rd10]; - add.f64 %fd11, %fd9, %fd10; - neg.s32 %r39, %r33; - shl.b32 %r40, %r39, 3; - cvt.s64.s32 %rd11, %r40; - add.s64 %rd12, %rd10, %rd11; - ld.global.f64 %fd12, [%rd12]; - add.f64 %fd13, %fd11, %fd12; - add.s32 %r41, %r7, %r36; - mul.wide.s32 %rd13, %r41, 8; - add.s64 %rd14, %rd6, %rd13; - ld.global.f64 %fd14, [%rd14]; - add.f64 %fd15, %fd13, %fd14; - cvt.s64.s32 %rd15, %r2; - add.s64 %rd16, %rd14, %rd15; - ld.global.f64 %fd16, [%rd16]; - add.f64 %fd17, %fd15, %fd16; - mul.f64 %fd18, %fd2, %fd17; - fma.rn.f64 %fd19, %fd1, %fd5, %fd18; - ld.global.f64 %fd20, [%rd8+-16]; - ld.global.f64 %fd21, [%rd8+16]; - add.f64 %fd22, %fd21, %fd20; - add.s32 %r42, %r8, %r36; - mul.wide.s32 %rd17, %r42, 8; - add.s64 %rd18, %rd6, %rd17; - ld.global.f64 %fd23, [%rd18]; - add.f64 %fd24, %fd22, %fd23; - add.s32 %r43, %r9, %r36; - mul.wide.s32 %rd19, %r43, 8; - add.s64 %rd20, %rd6, %rd19; - ld.global.f64 %fd25, [%rd20]; - add.f64 %fd26, %fd24, %fd25; - add.s32 %r44, %r10, %r36; - mul.wide.s32 %rd21, %r44, 8; - add.s64 %rd22, %rd6, %rd21; - ld.global.f64 %fd27, [%rd22]; - add.f64 %fd28, %fd26, %fd27; - add.s32 %r45, %r11, %r36; - mul.wide.s32 %rd23, %r45, 8; - add.s64 %rd24, %rd6, %rd23; - ld.global.f64 %fd29, [%rd24]; - add.f64 %fd30, %fd28, %fd29; - fma.rn.f64 %fd31, %fd3, %fd30, %fd19; - ld.global.f64 %fd32, [%rd8+-24]; - ld.global.f64 %fd33, [%rd8+24]; - add.f64 %fd34, %fd33, %fd32; - add.s32 %r46, %r12, %r36; - mul.wide.s32 %rd25, %r46, 8; - add.s64 %rd26, %rd6, %rd25; - ld.global.f64 %fd35, [%rd26]; - add.f64 %fd36, %fd34, %fd35; - add.s64 %rd27, %rd12, %rd11; - ld.global.f64 %fd37, [%rd27]; - add.f64 %fd38, %fd36, %fd37; - add.s32 %r47, %r13, %r36; - mul.wide.s32 %rd28, %r47, 8; - add.s64 %rd29, %rd6, %rd28; - ld.global.f64 %fd39, [%rd29]; - add.f64 %fd40, %fd38, %fd39; - add.s64 %rd30, %rd16, %rd15; - ld.global.f64 %fd41, [%rd30]; - add.f64 %fd42, %fd40, %fd41; - fma.rn.f64 %fd6, %fd4, %fd42, %fd31; - setp.ge.s32 %p4, %r36, %r20; - @%p4 bra BB2_8; - - mul.wide.s32 %rd33, %r15, 8; - add.s64 %rd34, %rd31, %rd33; - ld.global.f64 %fd43, [%rd34]; - add.f64 %fd44, %fd5, %fd5; - sub.f64 %fd45, %fd44, %fd43; - add.s64 %rd35, %rd32, %rd33; - ld.global.f64 %fd46, [%rd35]; - fma.rn.f64 %fd47, %fd46, %fd6, %fd45; - st.global.f64 [%rd34], %fd47; - -BB2_8: - add.s32 %r16, %r14, 32; - setp.lt.s32 %p5, %r16, %r20; - mov.u32 %r53, %r16; - @%p5 bra BB2_6; - -BB2_9: - add.s32 %r17, %r4, 1; - setp.lt.s32 %p6, %r17, %r22; - mov.u32 %r52, %r17; - @%p6 bra BB2_4; - -BB2_10: - add.s32 %r51, %r51, 1; - add.s32 %r49, %r23, %r26; - add.s32 %r50, %r49, 1; - setp.lt.s32 %p7, %r51, %r50; - @%p7 bra BB2_2; - -BB2_11: - ret; -} - - diff --git a/examples/stencil/stencil2.ptx b/examples/stencil/stencil2.ptx deleted file mode 100644 index 3e5dfd92..00000000 --- a/examples/stencil/stencil2.ptx +++ /dev/null @@ -1,247 +0,0 @@ -// -// Generated by LLVM NVPTX Back-End -// - -.version 3.1 -.target sm_20, texmode_independent -.address_size 64 - - // .globl stencil_step_task - // @stencil_step_task -.entry stencil_step_task( - .param .u32 stencil_step_task_param_0, - .param .u32 stencil_step_task_param_1, - .param .u32 stencil_step_task_param_2, - .param .u32 stencil_step_task_param_3, - .param .u32 stencil_step_task_param_4, - .param .u32 stencil_step_task_param_5, - .param .u32 stencil_step_task_param_6, - .param .u32 stencil_step_task_param_7, - .param .u64 .ptr .align 8 stencil_step_task_param_8, - .param .u64 .ptr .align 8 stencil_step_task_param_9, - .param .u64 .ptr .align 8 stencil_step_task_param_10, - .param .u64 .ptr .align 8 stencil_step_task_param_11 -) -{ - .reg .pred %p<396>; - .reg .s16 %rc<396>; - .reg .s16 %rs<396>; - .reg .s32 %r<396>; - .reg .s64 %rl<396>; - .reg .f32 %f<396>; - .reg .f64 %fl<396>; - -// BB#0: // %allocas - mov.u32 %r12, %ctaid.x; - ld.param.u32 %r13, [stencil_step_task_param_4]; - add.s32 %r16, %r12, %r13; - add.s32 %r0, %r16, 1; - setp.ge.s32 %p0, %r16, %r0; - @%p0 bra BB0_11; -// BB#1: // %for_test28.i.preheader.lr.ph - ld.param.u32 %r0, [stencil_step_task_param_0]; - ld.param.u32 %r1, [stencil_step_task_param_1]; - ld.param.u32 %r2, [stencil_step_task_param_2]; - ld.param.u32 %r3, [stencil_step_task_param_3]; - ld.param.u32 %r4, [stencil_step_task_param_5]; - ld.param.u32 %r5, [stencil_step_task_param_6]; - mul.lo.s32 %r5, %r5, %r4; - ld.param.u64 %rl3, [stencil_step_task_param_8]; - ld.f64 %fl0, [%rl3]; - ld.f64 %fl1, [%rl3+8]; - ld.param.u64 %rl0, [stencil_step_task_param_9]; - ld.f64 %fl2, [%rl3+16]; - ld.param.u64 %rl1, [stencil_step_task_param_10]; - ld.param.u64 %rl2, [stencil_step_task_param_11]; - ld.f64 %fl3, [%rl3+24]; - shl.b32 %r6, %r4, 1; - mul.lo.s32 %r7, %r4, 3; - mul.lo.s32 %r8, %r4, -3; - shl.b32 %r9, %r5, 1; - mul.lo.s32 %r10, %r5, 3; - mul.lo.s32 %r11, %r5, -3; - add.s32 %r12, %r12, %r13; - neg.s32 %r13, %r9; - neg.s32 %r14, %r6; - mov.u32 %r32, WARP_SZ; -BB0_2: // %for_test28.i.preheader - // =>This Loop Header: Depth=1 - // Child Loop BB0_9 Depth 2 - // Child Loop BB0_5 Depth 3 - mov.u32 %r15, %r16; - setp.ge.s32 %p0, %r2, %r3; - @%p0 bra BB0_10; -// BB#3: // %for_test35.i.preheader.lr.ph - // in Loop: Header=BB0_2 Depth=1 - setp.lt.s32 %p0, %r0, %r1; - @%p0 bra BB0_4; - bra.uni BB0_10; -BB0_4: // in Loop: Header=BB0_2 Depth=1 - mul.lo.s32 %r16, %r15, %r5; - mov.u32 %r17, %r2; -BB0_9: // %for_loop37.i.lr.ph.us - // Parent Loop BB0_2 Depth=1 - // => This Loop Header: Depth=2 - // Child Loop BB0_5 Depth 3 - mad.lo.s32 %r18, %r17, %r4, %r16; - add.s32 %r19, %r18, %r4; - add.s32 %r20, %r18, %r6; - sub.s32 %r21, %r18, %r4; - add.s32 %r22, %r18, %r7; - add.s32 %r23, %r18, %r14; - add.s32 %r24, %r18, %r5; - add.s32 %r25, %r18, %r8; - add.s32 %r26, %r18, %r9; - sub.s32 %r27, %r18, %r5; - add.s32 %r28, %r18, %r10; - add.s32 %r29, %r18, %r13; - add.s32 %r30, %r18, %r11; - mov.u32 %r31, %r0; -BB0_5: // %for_loop37.i.us - // Parent Loop BB0_2 Depth=1 - // Parent Loop BB0_9 Depth=2 - // => This Inner Loop Header: Depth=3 - mov.u32 %r33, %tid.x; - add.s32 %r34, %r32, -1; - and.b32 %r33, %r34, %r33; - add.s32 %r33, %r33, %r31; - setp.ge.s32 %p0, %r33, %r1; - @%p0 bra BB0_7; -// BB#6: // %pl_dolane.i.us - // in Loop: Header=BB0_5 Depth=3 - add.s32 %r34, %r18, %r33; - shl.b32 %r34, %r34, 3; - add.s32 %r35, %r34, -8; - cvt.s64.s32 %rl3, %r35; - add.s64 %rl3, %rl3, %rl1; - ld.f64 %fl4, [%rl3]; - add.s32 %r35, %r34, 8; - cvt.s64.s32 %rl3, %r35; - add.s64 %rl3, %rl3, %rl1; - ld.f64 %fl5, [%rl3]; - add.s32 %r35, %r34, -16; - cvt.s64.s32 %rl3, %r35; - add.s64 %rl3, %rl3, %rl1; - ld.f64 %fl6, [%rl3]; - add.s32 %r35, %r34, 16; - cvt.s64.s32 %rl3, %r35; - add.s64 %rl3, %rl3, %rl1; - ld.f64 %fl9, [%rl3]; - add.s32 %r35, %r19, %r33; - shl.b32 %r35, %r35, 3; - cvt.s64.s32 %rl3, %r35; - add.s64 %rl3, %rl3, %rl1; - ld.f64 %fl8, [%rl3]; - add.s32 %r35, %r34, -24; - cvt.s64.s32 %rl3, %r35; - add.s64 %rl3, %rl3, %rl1; - ld.f64 %fl7, [%rl3]; - add.s32 %r35, %r34, 24; - cvt.s64.s32 %rl3, %r35; - add.s64 %rl3, %rl3, %rl1; - ld.f64 %fl10, [%rl3]; - add.s32 %r35, %r20, %r33; - shl.b32 %r35, %r35, 3; - cvt.s64.s32 %rl3, %r35; - add.s64 %rl3, %rl3, %rl1; - ld.f64 %fl13, [%rl3]; - add.s32 %r35, %r21, %r33; - shl.b32 %r35, %r35, 3; - cvt.s64.s32 %rl3, %r35; - add.s64 %rl3, %rl3, %rl1; - ld.f64 %fl12, [%rl3]; - add.s32 %r35, %r22, %r33; - shl.b32 %r35, %r35, 3; - cvt.s64.s32 %rl3, %r35; - add.s64 %rl3, %rl3, %rl1; - ld.f64 %fl11, [%rl3]; - add.s32 %r35, %r23, %r33; - shl.b32 %r35, %r35, 3; - cvt.s64.s32 %rl3, %r35; - add.s64 %rl3, %rl3, %rl1; - ld.f64 %fl16, [%rl3]; - add.s32 %r35, %r24, %r33; - shl.b32 %r35, %r35, 3; - cvt.s64.s32 %rl3, %r35; - add.s64 %rl3, %rl3, %rl1; - ld.f64 %fl15, [%rl3]; - add.s32 %r35, %r25, %r33; - shl.b32 %r35, %r35, 3; - cvt.s64.s32 %rl3, %r35; - add.s64 %rl3, %rl3, %rl1; - ld.f64 %fl14, [%rl3]; - add.s32 %r35, %r26, %r33; - shl.b32 %r35, %r35, 3; - cvt.s64.s32 %rl3, %r35; - add.s64 %rl3, %rl3, %rl1; - ld.f64 %fl19, [%rl3]; - add.s32 %r35, %r27, %r33; - shl.b32 %r35, %r35, 3; - cvt.s64.s32 %rl3, %r35; - add.s64 %rl3, %rl3, %rl1; - ld.f64 %fl18, [%rl3]; - add.s32 %r35, %r28, %r33; - shl.b32 %r35, %r35, 3; - cvt.s64.s32 %rl3, %r35; - add.s64 %rl3, %rl3, %rl1; - ld.f64 %fl17, [%rl3]; - add.s32 %r35, %r29, %r33; - shl.b32 %r35, %r35, 3; - cvt.s64.s32 %rl3, %r35; - add.s64 %rl3, %rl3, %rl1; - ld.f64 %fl24, [%rl3]; - cvt.s64.s32 %rl4, %r34; - add.s64 %rl3, %rl4, %rl1; - ld.f64 %fl21, [%rl3]; - add.s32 %r33, %r30, %r33; - shl.b32 %r33, %r33, 3; - cvt.s64.s32 %rl3, %r33; - add.s64 %rl3, %rl3, %rl1; - ld.f64 %fl20, [%rl3]; - add.s64 %rl3, %rl4, %rl2; - ld.f64 %fl23, [%rl3]; - add.s64 %rl4, %rl4, %rl0; - ld.f64 %fl22, [%rl4]; - add.f64 %fl25, %fl21, %fl21; - sub.f64 %fl23, %fl25, %fl23; - add.f64 %fl6, %fl6, %fl9; - add.f64 %fl6, %fl6, %fl13; - add.f64 %fl6, %fl6, %fl16; - add.f64 %fl6, %fl6, %fl19; - add.f64 %fl6, %fl6, %fl24; - add.f64 %fl4, %fl4, %fl5; - add.f64 %fl4, %fl4, %fl8; - add.f64 %fl4, %fl4, %fl12; - add.f64 %fl4, %fl4, %fl15; - add.f64 %fl4, %fl4, %fl18; - mul.f64 %fl5, %fl0, %fl21; - fma.rn.f64 %fl4, %fl1, %fl4, %fl5; - fma.rn.f64 %fl4, %fl2, %fl6, %fl4; - add.f64 %fl5, %fl7, %fl10; - add.f64 %fl5, %fl5, %fl11; - add.f64 %fl5, %fl5, %fl14; - add.f64 %fl5, %fl5, %fl17; - add.f64 %fl5, %fl5, %fl20; - fma.rn.f64 %fl4, %fl3, %fl5, %fl4; - fma.rn.f64 %fl4, %fl4, %fl22, %fl23; - st.f64 [%rl3], %fl4; -BB0_7: // %safe_if_after_true.i.us - // in Loop: Header=BB0_5 Depth=3 - add.s32 %r31, %r32, %r31; - setp.lt.s32 %p0, %r31, %r1; - @%p0 bra BB0_5; -// BB#8: // %for_exit38.i.us - // in Loop: Header=BB0_9 Depth=2 - add.s32 %r17, %r17, 1; - setp.eq.s32 %p0, %r17, %r3; - @%p0 bra BB0_10; - bra.uni BB0_9; -BB0_10: // %for_exit31.i - // in Loop: Header=BB0_2 Depth=1 - add.s32 %r16, %r15, 1; - setp.ne.s32 %p0, %r15, %r12; - @%p0 bra BB0_2; -BB0_11: // %stencil_step___uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_.exit - ret; -} - diff --git a/examples/stencil/stencil_cu.cpp b/examples/stencil/stencil_cu.cpp deleted file mode 100644 index 3f06e841..00000000 --- a/examples/stencil/stencil_cu.cpp +++ /dev/null @@ -1,317 +0,0 @@ -/* - Copyright (c) 2010-2011, Intel Corporation - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of Intel Corporation nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS - IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER - OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*/ - -#ifdef _MSC_VER -#define _CRT_SECURE_NO_WARNINGS -#define NOMINMAX -#pragma warning (disable: 4244) -#pragma warning (disable: 4305) -#endif - -#include -#include -#include -#include "../timing.h" -#include "stencil_ispc.h" -using namespace ispc; - -#include -#include -#include -#include "drvapi_error_string.h" - -#define checkCudaErrors(err) __checkCudaErrors (err, __FILE__, __LINE__) -// These are the inline versions for all of the SDK helper functions -void __checkCudaErrors(CUresult err, const char *file, const int line) { - if(CUDA_SUCCESS != err) { - std::cerr << "checkCudeErrors() Driver API error = " << err << "\"" - << getCudaDrvErrorString(err) << "\" from file <" << file - << ", line " << line << "\n"; - exit(-1); - } -} - -/**********************/ -/* Basic CUDriver API */ -CUcontext context; - -void createContext(const int deviceId = 0) -{ - CUdevice device; - int devCount; - checkCudaErrors(cuInit(0)); - checkCudaErrors(cuDeviceGetCount(&devCount)); - assert(devCount > 0); - checkCudaErrors(cuDeviceGet(&device, deviceId < devCount ? deviceId : 0)); - - char name[128]; - checkCudaErrors(cuDeviceGetName(name, 128, device)); - std::cout << "Using CUDA Device [0]: " << name << "\n"; - - int devMajor, devMinor; - checkCudaErrors(cuDeviceComputeCapability(&devMajor, &devMinor, device)); - std::cout << "Device Compute Capability: " - << devMajor << "." << devMinor << "\n"; - if (devMajor < 2) { - std::cerr << "ERROR: Device 0 is not SM 2.0 or greater\n"; - exit(1); - } - - // Create driver context - checkCudaErrors(cuCtxCreate(&context, 0, device)); -} -void destroyContext() -{ - checkCudaErrors(cuCtxDestroy(context)); -} - -CUmodule loadModule(const char * module) -{ - CUmodule cudaModule; - checkCudaErrors(cuModuleLoadData(&cudaModule, module)); - return cudaModule; -} -void unloadModule(CUmodule &cudaModule) -{ - checkCudaErrors(cuModuleUnload(cudaModule)); -} - -CUfunction getFunction(CUmodule &cudaModule, const char * function) -{ - CUfunction cudaFunction; - checkCudaErrors(cuModuleGetFunction(&cudaFunction, cudaModule, function)); - return cudaFunction; -} - -CUdeviceptr deviceMalloc(const size_t size) -{ - CUdeviceptr d_buf; - checkCudaErrors(cuMemAlloc(&d_buf, size)); - return d_buf; -} -void deviceFree(CUdeviceptr d_buf) -{ - checkCudaErrors(cuMemFree(d_buf)); -} -void memcpyD2H(void * h_buf, CUdeviceptr d_buf, const size_t size) -{ - checkCudaErrors(cuMemcpyDtoH(h_buf, d_buf, size)); -} -void memcpyH2D(CUdeviceptr d_buf, void * h_buf, const size_t size) -{ - checkCudaErrors(cuMemcpyHtoD(d_buf, h_buf, size)); -} -#define deviceLaunch(func,nbx,nby,nbz,params) \ - checkCudaErrors( \ - cuLaunchKernel( \ - (func), \ - (nbx), (nby), (nbz), \ - 32, 1, 1, \ - 0, NULL, (params), NULL \ - )); - -typedef CUdeviceptr devicePtr; - - -/**************/ - -extern "C" -{ - - void *CUDAAlloc(void **handlePtr, int64_t size, int32_t alignment) - { - return NULL; - } - void CUDALaunch( - void **handlePtr, - const char * module_name, - const char * module, - const char * func_name, - void **func_args, - int countx, int county, int countz) - { - assert(module_name != NULL); - assert(module != NULL); - assert(func_name != NULL); - assert(func_args != NULL); - CUmodule cudaModule = loadModule(module); - CUfunction cudaFunction = getFunction(cudaModule, func_name); - deviceLaunch(cudaFunction, countx, county, countz, func_args); - unloadModule(cudaModule); - } - void CUDASync(void *handle) - { - checkCudaErrors(cuStreamSynchronize(0)); - } - void ISPCSync(void *handle) - { - checkCudaErrors(cuStreamSynchronize(0)); - } - void CUDAFree(void *handle) - { - } -} - - -extern void loop_stencil_serial(int t0, int t1, int x0, int x1, - int y0, int y1, int z0, int z1, - int Nx, int Ny, int Nz, - const double coef[5], - const double vsq[], - double Aeven[], double Aodd[]); - - -void InitData(int Nx, int Ny, int Nz, double *A[2], double *vsq) { - int offset = 0; - for (int z = 0; z < Nz; ++z) - for (int y = 0; y < Ny; ++y) - for (int x = 0; x < Nx; ++x, ++offset) { - A[0][offset] = (x < Nx / 2) ? x / double(Nx) : y / double(Ny); - A[1][offset] = 0; - vsq[offset] = x*y*z / double(Nx * Ny * Nz); - } -} - - -int main() { - int Nx = 256, Ny = 256, Nz = 256; - int width = 4; - double *Aserial[2], *Aispc[2]; - Aserial[0] = new double [Nx * Ny * Nz]; - Aserial[1] = new double [Nx * Ny * Nz]; - Aispc[0] = new double [Nx * Ny * Nz]; - Aispc[1] = new double [Nx * Ny * Nz]; - double *vsq = new double [Nx * Ny * Nz]; - - double coeff[4] = { 0.5, -.25, .125, -.0625 }; - - /*******************/ - createContext(); - /*******************/ - - const size_t bufsize = sizeof(double)*Nx*Ny*Nz; - devicePtr d_Aispc0 = deviceMalloc(bufsize); - devicePtr d_Aispc1 = deviceMalloc(bufsize); - devicePtr d_vsq = deviceMalloc(bufsize); - devicePtr d_coeff = deviceMalloc(4*sizeof(double)); - - - InitData(Nx, Ny, Nz, Aispc, vsq); - - // - // Compute the image using the ispc implementation on one core; report - // the minimum time of three runs. - // - double minTimeISPC = 1e30; - for (int i = 0; i < 3; ++i) { - reset_and_start_timer(); - loop_stencil_ispc(0, 6, width, Nx - width, width, Ny - width, - width, Nz - width, Nx, Ny, Nz, coeff, vsq, - Aispc[0], Aispc[1]); - double dt = get_elapsed_mcycles(); - minTimeISPC = std::min(minTimeISPC, dt); - } - - printf("[stencil ispc 1 core]:\t\t[%.3f] million cycles\n", minTimeISPC); - - InitData(Nx, Ny, Nz, Aispc, vsq); - - memcpyH2D(d_Aispc0, Aispc[0], bufsize); - memcpyH2D(d_Aispc1, Aispc[1], bufsize); - memcpyH2D(d_vsq, vsq, bufsize); - memcpyH2D(d_coeff, coeff, 4*sizeof(double)); - // - // Compute the image using the ispc implementation with tasks; report - // the minimum time of three runs. - // - double minTimeISPCTasks = 1e30; - for (int i = 0; i < 3; ++i) { - reset_and_start_timer(); - loop_stencil_ispc_tasks(0, 6, width, Nx - width, width, Ny - width, - width, Nz - width, Nx, Ny, Nz, (double*)d_coeff, (double*)d_vsq, - (double*)d_Aispc0, (double*)d_Aispc1); - double dt = get_elapsed_mcycles(); - minTimeISPCTasks = std::min(minTimeISPCTasks, dt); - } - memcpyD2H(Aispc[1], d_Aispc1, bufsize); - //memcpyD2H(Aispc[1], d_vsq, bufsize); - - printf("[stencil ispc + tasks]:\t\t[%.3f] million cycles\n", minTimeISPCTasks); - - InitData(Nx, Ny, Nz, Aserial, vsq); - - // - // And run the serial implementation 3 times, again reporting the - // minimum time. - // - double minTimeSerial = 1e30; - for (int i = 0; i < 3; ++i) { - reset_and_start_timer(); - loop_stencil_serial(0, 6, width, Nx-width, width, Ny - width, - width, Nz - width, Nx, Ny, Nz, coeff, vsq, - Aserial[0], Aserial[1]); - double dt = get_elapsed_mcycles(); - minTimeSerial = std::min(minTimeSerial, dt); - } - - printf("[stencil serial]:\t\t[%.3f] million cycles\n", minTimeSerial); - - printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n", - minTimeSerial / minTimeISPC, minTimeSerial / minTimeISPCTasks); - - // Check for agreement - int offset = 0; - int nerr = 0; - for (int z = 0; z < Nz; ++z) - for (int y = 0; y < Ny; ++y) - for (int x = 0; x < Nx; ++x, ++offset) { - - double error = fabsf((Aserial[1][offset] - Aispc[1][offset]) / - Aserial[1][offset]); - if (error > 1e-3) - { - if (nerr < 100) - printf("Error @ (%d,%d,%d): ispc = %g, serial = %g error= %g\n", - x, y, z, Aispc[1][offset], Aserial[1][offset], error); - nerr++; - } - } - - fprintf(stderr, " nerr= %d frac= %g \n", nerr, 1.0*nerr/(1.0*Nx*Ny*Nz)); - - /*******************/ - destroyContext(); - /*******************/ - - return 0; -} diff --git a/examples/stencil/stencil_cu.o b/examples/stencil/stencil_cu.o deleted file mode 100644 index 90b014c6..00000000 Binary files a/examples/stencil/stencil_cu.o and /dev/null differ diff --git a/examples/stencil/stencil_parallel.cpp b/examples/stencil/stencil_parallel.cpp deleted file mode 100644 index 30ded2cd..00000000 --- a/examples/stencil/stencil_parallel.cpp +++ /dev/null @@ -1,87 +0,0 @@ -/* - Copyright (c) 2010-2011, Intel Corporation - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of Intel Corporation nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS - IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER - OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*/ - - -static void -stencil_step(int x0, int x1, - int y0, int y1, - int z0, int z1, - int Nx, int Ny, int Nz, - const float coef[4], const float vsq[], - const float Ain[], float Aout[]) { - int Nxy = Nx * Ny; - -#pragma omp parallel for - for (int z = z0; z < z1; ++z) { - for (int y = y0; y < y1; ++y) { - for (int x = x0; x < x1; ++x) { - int index = (z * Nxy) + (y * Nx) + x; -#define A_cur(x, y, z) Ain[index + (x) + ((y) * Nx) + ((z) * Nxy)] -#define A_next(x, y, z) Aout[index + (x) + ((y) * Nx) + ((z) * Nxy)] - float div = coef[0] * A_cur(0, 0, 0) + - coef[1] * (A_cur(+1, 0, 0) + A_cur(-1, 0, 0) + - A_cur(0, +1, 0) + A_cur(0, -1, 0) + - A_cur(0, 0, +1) + A_cur(0, 0, -1)) + - coef[2] * (A_cur(+2, 0, 0) + A_cur(-2, 0, 0) + - A_cur(0, +2, 0) + A_cur(0, -2, 0) + - A_cur(0, 0, +2) + A_cur(0, 0, -2)) + - coef[3] * (A_cur(+3, 0, 0) + A_cur(-3, 0, 0) + - A_cur(0, +3, 0) + A_cur(0, -3, 0) + - A_cur(0, 0, +3) + A_cur(0, 0, -3)); - - A_next(0, 0, 0) = 2 * A_cur(0, 0, 0) - A_next(0, 0, 0) + - vsq[index] * div; - } - } - } -} - - -void loop_stencil_parallel(int t0, int t1, - int x0, int x1, - int y0, int y1, - int z0, int z1, - int Nx, int Ny, int Nz, - const float coef[4], - const float vsq[], - float Aeven[], float Aodd[]) -{ - for (int t = t0; t < t1; ++t) { - if ((t & 1) == 0) - stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq, - Aeven, Aodd); - else - stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq, - Aodd, Aeven); - } -} diff --git a/examples/volume_rendering/volume_ispc.bc b/examples/volume_rendering/volume_ispc.bc deleted file mode 100644 index 4a7c47b9..00000000 Binary files a/examples/volume_rendering/volume_ispc.bc and /dev/null differ diff --git a/examples/volume_rendering/volume_ispc_avx.bc b/examples/volume_rendering/volume_ispc_avx.bc deleted file mode 100644 index 9b0e4e5f..00000000 Binary files a/examples/volume_rendering/volume_ispc_avx.bc and /dev/null differ diff --git a/examples/volume_rendering/volume_ispc_nvptx64.bc b/examples/volume_rendering/volume_ispc_nvptx64.bc deleted file mode 100644 index 62a9265e..00000000 Binary files a/examples/volume_rendering/volume_ispc_nvptx64.bc and /dev/null differ