From 589538bf39d9c3eca8be2c76024e7b29b4340ab5 Mon Sep 17 00:00:00 2001 From: Evghenii Date: Mon, 18 Nov 2013 12:04:00 +0100 Subject: [PATCH] added stencil code --- examples/stencil/1.s | 175 --- examples/stencil/2.s | 239 ---- examples/stencil/3.s | 239 ---- examples/stencil/Makefile | 2 +- examples/stencil/stencil.cpp | 2 + examples/stencil/stencil.cubin | Bin 3156 -> 0 bytes examples/stencil/stencil1.cubin | Bin 3604 -> 0 bytes examples/stencil/stencil2.cubin | Bin 3668 -> 0 bytes examples/stencil/stencil_avx.bc | Bin 12836 -> 0 bytes examples/stencil/stencil_cu | Bin 39703 -> 0 bytes examples/stencil/stencil_cu.bc | Bin 22616 -> 0 bytes examples/stencil/stencil_cu.ll | 762 ---------- examples/stencil/stencil_cu.s | 1134 --------------- examples/stencil/stencil_cu_avx.bc | Bin 9820 -> 0 bytes examples/stencil/stencil_cu_avx.s | 214 --- examples/stencil/stencil_cu_nvptx64.bc | Bin 5256 -> 0 bytes examples/stencil/stencil_cu_nvptx64.cubin | Bin 3668 -> 0 bytes examples/stencil/stencil_cu_nvptx64.ll | 269 ---- examples/stencil/stencil_ispc.h | 35 - examples/stencil/stencil_nvptx64.bc | Bin 8500 -> 0 bytes examples_cuda/common.mk | 3 +- examples_cuda/stencil/.stencil.ispc.swn | Bin 0 -> 16384 bytes examples_cuda/stencil/__kernels.ptx | 1246 ----------------- examples_cuda/stencil/drvapi_error_string.h | 370 ----- examples_cuda/stencil/kernel.ptx | 1246 ----------------- examples_cuda/stencil/libcudadevrt.a | Bin 137338 -> 0 bytes examples_cuda/stencil/stencil.cu | 71 +- examples_cuda/stencil/stencilX.ispc | 159 --- examples_cuda/stencil/stencilY.ispc | 126 -- examples_cuda/stencil/stencil_cu | Bin 25158 -> 0 bytes examples_cuda/stencil/stencil_cu.o | Bin 21784 -> 0 bytes examples_cuda/stencil/stencil_ispc.h | 34 - examples_cuda/stencil/stencil_ispc_nvptx64.ll | 974 ------------- .../stencil/stencil_ispc_nvptx64.ptx | 1246 ----------------- examples_cuda/stencil/stencil_orig.cpp | 172 --- examples_cuda/stencil/stencil_orig.ispc | 172 --- examples_cuda/stencil/stencil_serial.o | Bin 2360 -> 0 bytes stdlib.ispc | 2 +- 38 files changed, 72 insertions(+), 8820 deletions(-) delete mode 100644 examples/stencil/1.s delete mode 100644 examples/stencil/2.s delete mode 100644 examples/stencil/3.s delete mode 100644 examples/stencil/stencil.cubin delete mode 100644 examples/stencil/stencil1.cubin delete mode 100644 examples/stencil/stencil2.cubin delete mode 100644 examples/stencil/stencil_avx.bc delete mode 100755 examples/stencil/stencil_cu delete mode 100644 examples/stencil/stencil_cu.bc delete mode 100644 examples/stencil/stencil_cu.ll delete mode 100644 examples/stencil/stencil_cu.s delete mode 100644 examples/stencil/stencil_cu_avx.bc delete mode 100644 examples/stencil/stencil_cu_avx.s delete mode 100644 examples/stencil/stencil_cu_nvptx64.bc delete mode 100644 examples/stencil/stencil_cu_nvptx64.cubin delete mode 100644 examples/stencil/stencil_cu_nvptx64.ll delete mode 100644 examples/stencil/stencil_ispc.h delete mode 100644 examples/stencil/stencil_nvptx64.bc create mode 100644 examples_cuda/stencil/.stencil.ispc.swn delete mode 100644 examples_cuda/stencil/__kernels.ptx delete mode 100644 examples_cuda/stencil/drvapi_error_string.h delete mode 100644 examples_cuda/stencil/kernel.ptx delete mode 100644 examples_cuda/stencil/libcudadevrt.a delete mode 100644 examples_cuda/stencil/stencilX.ispc delete mode 100644 examples_cuda/stencil/stencilY.ispc delete mode 100755 examples_cuda/stencil/stencil_cu delete mode 100644 examples_cuda/stencil/stencil_cu.o delete mode 100644 examples_cuda/stencil/stencil_ispc.h delete mode 100644 examples_cuda/stencil/stencil_ispc_nvptx64.ll delete mode 100644 examples_cuda/stencil/stencil_ispc_nvptx64.ptx delete mode 100644 examples_cuda/stencil/stencil_orig.cpp delete mode 100644 examples_cuda/stencil/stencil_orig.ispc delete mode 100644 examples_cuda/stencil/stencil_serial.o diff --git a/examples/stencil/1.s b/examples/stencil/1.s deleted file mode 100644 index d59cb1f9..00000000 --- a/examples/stencil/1.s +++ /dev/null @@ -1,175 +0,0 @@ - - code for sm_35 - Function : stencil_step_task - .headerflags @"EF_CUDA_SM35 EF_CUDA_PTX_SM(EF_CUDA_SM35)" - /* 0x0880a010a0a01000 */ - /*0008*/ MOV R1, c[0x0][0x44]; /* 0x64c03c00089c0006 */ - /*0010*/ S2R R10, SR_CTAID.X; /* 0x86400000129c002a */ - /*0018*/ MOV R12, c[0x0][0x160]; /* 0x64c03c002c1c0032 */ - /*0020*/ IADD R0, R10, c[0x0][0x150]; /* 0x608000002a1c2802 */ - /*0028*/ IADD R11, R0, 0x1; /* 0xc0800000009c002d */ - /*0030*/ MOV R13, c[0x0][0x164]; /* 0x64c03c002c9c0036 */ - /*0038*/ ISETP.GE.AND P0, PT, R0, R11, PT; /* 0xdb681c00059c001e */ - /* 0x08a0a1ac118d8d8c */ - /*0048*/ LD.E.64 R8, [R12]; /* 0xc5800000001c3020 */ - /*0050*/ LD.E.64 R6, [R12+0x8]; /* 0xc5800000041c3018 */ - /*0058*/ LD.E.64 R4, [R12+0x10]; /* 0xc5800000081c3010 */ - /*0060*/ LD.E.64 R2, [R12+0x18]; /* 0xc58000000c1c3008 */ - /*0068*/ @P0 EXIT ; /* 0x180000000000003c */ - /*0070*/ MOV R11, c[0x0][0x158]; /* 0x64c03c002b1c002e */ - /*0078*/ IMUL R41, R11, c[0x0][0x154]; /* 0x61c018002a9c2ca6 */ - /* 0x08b0a000a010a010 */ - /*0088*/ IADD R11, R10, c[0x0][0x150]; /* 0x608000002a1c282e */ - /*0090*/ SHF.L R40, RZ, 0x1, R41; /* 0xb7c0a400009ffca1 */ - /*0098*/ I2I.S32.S32 R10, -R40; /* 0xe6010000141ce82a */ - /*00a0*/ IADD R49, R11, 0x1; /* 0xc0800000009c2cc5 */ - /*00a8*/ SHF.L R28, RZ, 0x3, R10; /* 0xb7c02800019ffc71 */ - /*00b0*/ MOV R10, c[0x0][0x148]; /* 0x64c03c00291c002a */ - /*00b8*/ ISETP.GE.AND P0, PT, R10, c[0x0][0x14c], PT; /* 0x5b681c00299c281e */ - /* 0x0880acb0a00010ac */ - /*00c8*/ @P0 BRA 0x4f0; /* 0x120000021000003c */ - /*00d0*/ MOV R29, c[0x0][0x148]; /* 0x64c03c00291c0076 */ - /*00d8*/ IMUL R42, R0, R41; /* 0xe1c01800149c00aa */ - /*00e0*/ MOV R10, c[0x0][0x140]; /* 0x64c03c00281c002a */ - /*00e8*/ ISETP.GE.AND P0, PT, R10, c[0x0][0x144], PT; /* 0x5b681c00289c281e */ - /*00f0*/ @P0 BRA 0x4d8; /* 0x12000001f000003c */ - /*00f8*/ MOV R10, c[0x0][0x154]; /* 0x64c03c002a9c002a */ - /* 0x0880888010a0109c */ - /*0108*/ IMAD R44, R29, c[0x0][0x154], R42; /* 0x5108a8002a9c74b2 */ - /*0110*/ SHF.L R11, RZ, 0x1, R10; /* 0xb7c02800009ffc2d */ - /*0118*/ MOV R39, c[0x0][0x140]; /* 0x64c03c00281c009e */ - /*0120*/ IMAD R34, R10, -0x2, R44; /* 0xa908b3ffff1c2889 */ - /*0128*/ IADD R43, R44, R11; /* 0xe0800000059cb0ae */ - /*0130*/ I2I.S32.S32 R10, -R11; /* 0xe6010000059ce82a */ - /*0138*/ IMAD R36, R41, -0x2, R44; /* 0xa908b3ffff1ca491 */ - /* 0x08a0001084108480 */ - /*0148*/ IADD R32, R44, c[0x0][0x154]; /* 0x608000002a9cb082 */ - /*0150*/ IADD R33, R44, R41; /* 0xe0800000149cb086 */ - /*0158*/ IADD R35, R44, R40; /* 0xe0800000141cb08e */ - /*0160*/ IMAD R38, R41, 0x3, R44; /* 0xa108b000019ca499 */ - /*0168*/ SHF.L R47, RZ, 0x3, R10; /* 0xb7c02800019ffcbd */ - /*0170*/ IADD R37, R43, c[0x0][0x154]; /* 0x608000002a9cac96 */ - /*0178*/ S2R R10, SR_TID.X; /* 0x86400000109c002a */ - /* 0x08a0b0a010908c10 */ - /*0188*/ MOV32I R48, 0x8; /* 0x74000000041fc0c2 */ - /*0190*/ IADD R45, R10, R39; /* 0xe0800000139c28b6 */ - /*0198*/ BFE R30, R47, 0x11f; /* 0xc00800008f9cbc79 */ - /*01a0*/ IADD R46, R45, R44; /* 0xe0800000161cb4ba */ - /*01a8*/ IADD R14, R32, R45; /* 0xe0800000169c803a */ - /*01b0*/ IMAD R10.CC, R46, R48, c[0x0][0x170]; /* 0x910cc0002e1cb82a */ - /*01b8*/ IMAD.HI.X R11, R46, R48, c[0x0][0x174]; /* 0x9318c0002e9cb82e */ - /* 0x0881cc118c118c10 */ - /*01c8*/ IADD R27, R37, R45; /* 0xe0800000169c946e */ - /*01d0*/ LD.E.64 R12, [R10+-0x8]; /* 0xc5fffffffc1c2830 */ - /*01d8*/ BFE R50, R28, 0x11f; /* 0xc00800008f9c70c9 */ - /*01e0*/ LD.E.64 R24, [R10+0x8]; /* 0xc5800000041c2860 */ - /*01e8*/ ISETP.GE.AND P0, PT, R45, c[0x0][0x144], PT; /* 0x5b681c00289cb41e */ - /*01f0*/ LD.E.64 R18, [R10+-0x18]; /* 0xc5fffffff41c2848 */ - /*01f8*/ DADD R20, R24, R12; /* 0xe3800000061c6052 */ - /* 0x098c10a011ac8188 */ - /*0208*/ LD.E.64 R22, [R10+0x18]; /* 0xc58000000c1c2858 */ - /*0210*/ IMAD R16.CC, R14, R48, c[0x0][0x170]; /* 0x910cc0002e1c3842 */ - /*0218*/ LD.E.64 R12, [R10+-0x10]; /* 0xc5fffffff81c2830 */ - /*0220*/ IMAD.HI.X R17, R14, R48, c[0x0][0x174]; /* 0x9318c0002e9c3846 */ - /*0228*/ IADD R25, R43, R45; /* 0xe0800000169cac66 */ - /*0230*/ LD.E.64 R14, [R16]; /* 0xc5800000001c4038 */ - /*0238*/ DADD R22, R22, R18; /* 0xe3800000091c585a */ - /* 0x0994808c848cb180 */ - /*0248*/ LD.E.64 R18, [R10+0x10]; /* 0xc5800000081c2848 */ - /*0250*/ IMAD R26.CC, R27, R48, c[0x0][0x170]; /* 0x910cc0002e1c6c6a */ - /*0258*/ IMAD.HI.X R27, R27, R48, c[0x0][0x174]; /* 0x9318c0002e9c6c6e */ - /*0260*/ IMAD R24.CC, R25, R48, c[0x0][0x170]; /* 0x910cc0002e1c6462 */ - /*0268*/ DADD R14, R20, R14; /* 0xe3800000071c503a */ - /*0270*/ DADD R20, R18, R12; /* 0xe3800000061c4852 */ - /*0278*/ LD.E.64 R12, [R26]; /* 0xc5800000001c6830 */ - /* 0x08b080118010c080 */ - /*0288*/ IMAD.HI.X R25, R25, R48, c[0x0][0x174]; /* 0x9318c0002e9c6466 */ - /*0290*/ IADD R16.CC, R16, R47; /* 0xe0840000179c4042 */ - /*0298*/ LD.E.64 R18, [R24]; /* 0xc5800000001c6048 */ - /*02a0*/ DADD R12, R22, R12; /* 0xe3800000061c5832 */ - /*02a8*/ IADD.X R17, R17, R30; /* 0xe08040000f1c4446 */ - /*02b0*/ IADD R31, R34, R45; /* 0xe0800000169c887e */ - /*02b8*/ IADD R22.CC, R16, R47; /* 0xe0840000179c405a */ - /* 0x089980818880a010 */ - /*02c8*/ IADD.X R23, R17, R30; /* 0xe08040000f1c445e */ - /*02d0*/ IMAD R26.CC, R31, R48, c[0x0][0x170]; /* 0x910cc0002e1c7c6a */ - /*02d8*/ DADD R20, R20, R18; /* 0xe3800000091c5052 */ - /*02e0*/ LD.E.64 R18, [R16]; /* 0xc5800000001c4048 */ - /*02e8*/ IMAD.HI.X R27, R31, R48, c[0x0][0x174]; /* 0x9318c0002e9c7c6e */ - /*02f0*/ LD.E.64 R24, [R22]; /* 0xc5800000001c5860 */ - /*02f8*/ IADD R51, R33, R45; /* 0xe0800000169c84ce */ - /* 0x088880ac818c11b8 */ - /*0308*/ LD.E.64 R30, [R26]; /* 0xc5800000001c6878 */ - /*0310*/ LD.E.64 R26, [R10]; /* 0xc5800000001c2868 */ - /*0318*/ DADD R14, R14, R18; /* 0xe3800000091c383a */ - /*0320*/ IMAD R18.CC, R51, R48, c[0x0][0x170]; /* 0x910cc0002e1ccc4a */ - /*0328*/ IADD R17, R35, R45; /* 0xe0800000169c8c46 */ - /*0330*/ IMAD.HI.X R19, R51, R48, c[0x0][0x174]; /* 0x9318c0002e9ccc4e */ - /*0338*/ DADD R22, R20, R30; /* 0xe38000000f1c505a */ - /* 0x098c10a0999c1090 */ - /*0348*/ IMAD R16.CC, R17, R48, c[0x0][0x170]; /* 0x910cc0002e1c4442 */ - /*0350*/ LD.E.64 R20, [R18]; /* 0xc5800000001c4850 */ - /*0358*/ DADD R12, R12, R24; /* 0xe38000000c1c3032 */ - /*0360*/ IMAD.HI.X R17, R17, R48, c[0x0][0x174]; /* 0x9318c0002e9c4446 */ - /*0368*/ IADD R18.CC, R18, R28; /* 0xe08400000e1c484a */ - /*0370*/ LD.E.64 R24, [R16]; /* 0xc5800000001c4060 */ - /*0378*/ DADD R20, R14, R20; /* 0xe38000000a1c3852 */ - /* 0x088080b4a18010cc */ - /*0388*/ IADD.X R19, R19, R50; /* 0xe0804000191c4c4e */ - /*0390*/ LD.E.64 R14, [R18]; /* 0xc5800000001c4838 */ - /*0398*/ DADD R22, R22, R24; /* 0xe38000000c1c585a */ - /*03a0*/ IADD R25, R36, R45; /* 0xe0800000169c9066 */ - /*03a8*/ IMAD R16.CC, R25, R48, c[0x0][0x170]; /* 0x910cc0002e1c6442 */ - /*03b0*/ DADD R20, R20, R14; /* 0xe3800000071c5052 */ - /*03b8*/ IADD R15, R38, R45; /* 0xe0800000169c983e */ - /* 0x09a010b081ac809c */ - /*03c8*/ IMAD.HI.X R17, R25, R48, c[0x0][0x174]; /* 0x9318c0002e9c6446 */ - /*03d0*/ IMAD R14.CC, R15, R48, c[0x0][0x170]; /* 0x910cc0002e1c3c3a */ - /*03d8*/ LD.E.64 R24, [R16]; /* 0xc5800000001c4060 */ - /*03e0*/ IMAD.HI.X R15, R15, R48, c[0x0][0x174]; /* 0x9318c0002e9c3c3e */ - /*03e8*/ IADD R18.CC, R18, R28; /* 0xe08400000e1c484a */ - /*03f0*/ LD.E.64 R30, [R14]; /* 0xc5800000001c3878 */ - /*03f8*/ IADD.X R19, R19, R50; /* 0xe0804000191c4c4e */ - /* 0x08a480a480b58010 */ - /*0408*/ LD.E.64 R50, [R18]; /* 0xc5800000001c48c8 */ - /*0410*/ DMUL R20, R6, R20; /* 0xe40000000a1c1852 */ - /*0418*/ DADD R22, R22, R24; /* 0xe38000000c1c585a */ - /*0420*/ DADD R12, R12, R30; /* 0xe38000000f1c3032 */ - /*0428*/ DFMA R24, R8, R26, R20; /* 0xdb8050000d1c2062 */ - /*0430*/ DFMA R16, R4, R22, R24; /* 0xdb8060000b1c1042 */ - /*0438*/ DADD R12, R12, R50; /* 0xe3800000191c3032 */ - /* 0x08908cb0a010ac80 */ - /*0448*/ DFMA R10, R2, R12, R16; /* 0xdb804000061c082a */ - /*0450*/ @P0 BRA.U 0x4b8; /* 0x120000003000023c */ - /*0458*/ @!P0 MOV32I R17, 0x8; /* 0x740000000423c046 */ - /*0460*/ @!P0 DADD R18, R26, R26; /* 0xe38000000d20684a */ - /*0468*/ @!P0 IMAD R14.CC, R46, R17, c[0x0][0x178]; /* 0x910c44002f20b83a */ - /*0470*/ @!P0 IMAD.HI.X R15, R46, R17, c[0x0][0x17c]; /* 0x931844002fa0b83e */ - /*0478*/ @!P0 IMAD R16.CC, R46, R17, c[0x0][0x168]; /* 0x910c44002d20b842 */ - /* 0x08a180a5dc10bd9c */ - /*0488*/ @!P0 LD.E.64 R12, [R14]; /* 0xc580000000203830 */ - /*0490*/ @!P0 IMAD.HI.X R17, R46, R17, c[0x0][0x16c]; /* 0x931844002da0b846 */ - /*0498*/ @!P0 LD.E.64 R20, [R16]; /* 0xc580000000204050 */ - /*04a0*/ @!P0 DADD R22, R18, -R12; /* 0xe38100000620485a */ - /*04a8*/ @!P0 DFMA R10, R20, R10, R22; /* 0xdb8058000520502a */ - /*04b0*/ @!P0 ST.E.64 [R14], R10; /* 0xe580000000203828 */ - /*04b8*/ IADD R39, R39, 0x20; /* 0xc0800000101c9c9d */ - /* 0x08b0a0b8b0a0b8b0 */ - /*04c8*/ ISETP.LT.AND P0, PT, R39, c[0x0][0x144], PT; /* 0x5b181c00289c9c1e */ - /*04d0*/ @P0 BRA 0x178; /* 0x12007ffe5000003c */ - /*04d8*/ IADD R29, R29, 0x1; /* 0xc0800000009c7475 */ - /*04e0*/ ISETP.LT.AND P0, PT, R29, c[0x0][0x14c], PT; /* 0x5b181c00299c741e */ - /*04e8*/ @P0 BRA 0xe0; /* 0x12007ffdf800003c */ - /*04f0*/ IADD R0, R0, 0x1; /* 0xc0800000009c0001 */ - /*04f8*/ ISETP.LT.AND P0, PT, R0, R49, PT; /* 0xdb181c00189c001e */ - /* 0x0800000000b810b8 */ - /*0508*/ @P0 BRA 0xb0; /* 0x12007ffdd000003c */ - /*0510*/ MOV RZ, RZ; /* 0xe4c03c007f9c03fe */ - /*0518*/ EXIT ; /* 0x18000000001c003c */ - /*0520*/ BRA 0x520; /* 0x12007ffffc1c003c */ - /*0528*/ NOP; /* 0x85800000001c3c02 */ - /*0530*/ NOP; /* 0x85800000001c3c02 */ - /*0538*/ NOP; /* 0x85800000001c3c02 */ - .................................. - - diff --git a/examples/stencil/2.s b/examples/stencil/2.s deleted file mode 100644 index 76476d03..00000000 --- a/examples/stencil/2.s +++ /dev/null @@ -1,239 +0,0 @@ - - code for sm_35 - Function : stencil_step_task - .headerflags @"EF_CUDA_SM35 EF_CUDA_PTX_SM(EF_CUDA_SM35)" - /* 0x0880acb0a0a0a000 */ - /*0008*/ MOV R1, c[0x0][0x44]; /* 0x64c03c00089c0006 */ - /*0010*/ S2R R10, SR_CTAID.X; /* 0x86400000129c002a */ - /*0018*/ IADD R44, R10, c[0x0][0x150]; /* 0x608000002a1c28b2 */ - /*0020*/ IADD R0, R44, 0x1; /* 0xc0800000009cb001 */ - /*0028*/ ISETP.GE.AND P0, PT, R44, R0, PT; /* 0xdb681c00001cb01e */ - /*0030*/ @P0 EXIT ; /* 0x180000000000003c */ - /*0038*/ MOV R11, c[0x0][0x154]; /* 0x64c03c002a9c002e */ - /* 0x0888108010a01080 */ - /*0048*/ IADD R41, R10, c[0x0][0x150]; /* 0x608000002a1c28a6 */ - /*0050*/ MOV R12, c[0x0][0x160]; /* 0x64c03c002c1c0032 */ - /*0058*/ MOV R13, c[0x0][0x164]; /* 0x64c03c002c9c0036 */ - /*0060*/ IMUL R35, R11, c[0x0][0x158]; /* 0x61c018002b1c2c8e */ - /*0068*/ LD.E.64 R8, [R12]; /* 0xc5800000001c3020 */ - /*0070*/ SHF.L R36, RZ, 0x1, R11; /* 0xb7c02c00009ffc91 */ - /*0078*/ MOV R42, c[0x0][0x148]; /* 0x64c03c00291c00aa */ - /* 0x088c80108c108c10 */ - /*0088*/ LD.E.64 R6, [R12+0x8]; /* 0xc5800000041c3018 */ - /*0090*/ IMUL R0, R11, 0x3; /* 0xc1c01800019c2c01 */ - /*0098*/ LD.E.64 R4, [R12+0x10]; /* 0xc5800000081c3010 */ - /*00a0*/ IMUL R18, R11, -0x3; /* 0xc9c01bfffe9c2c49 */ - /*00a8*/ SHF.L R37, RZ, 0x1, R35; /* 0xb7c08c00009ffc95 */ - /*00b0*/ LD.E.64 R2, [R12+0x18]; /* 0xc58000000c1c3008 */ - /*00b8*/ IMUL R19, R35, 0x3; /* 0xc1c01800019c8c4d */ - /* 0x0880acb0a0acb000 */ - /*00c8*/ IMUL R20, R35, -0x3; /* 0xc9c01bfffe9c8c51 */ - /*00d0*/ ISETP.GE.AND P0, PT, R42, c[0x0][0x14c], PT; /* 0x5b681c00299ca81e */ - /*00d8*/ @P0 BRA 0x6d8; /* 0x12000002fc00003c */ - /*00e0*/ MOV R10, c[0x0][0x140]; /* 0x64c03c00281c002a */ - /*00e8*/ ISETP.LT.AND P0, PT, R10, c[0x0][0x144], PT; /* 0x5b181c00289c281e */ - /*00f0*/ @!P0 BRA 0x6d8; /* 0x12000002f020003c */ - /*00f8*/ IMUL R40, R44, R35; /* 0xe1c01800119cb0a2 */ - /* 0x088880108c10a000 */ - /*0108*/ MOV R21, c[0x0][0x148]; /* 0x64c03c00291c0056 */ - /*0110*/ IMAD R39, R21, c[0x0][0x154], R40; /* 0x5108a0002a9c549e */ - /*0118*/ MOV R34, c[0x0][0x140]; /* 0x64c03c00281c008a */ - /*0120*/ IADD R29, R39, R37; /* 0xe0800000129c9c76 */ - /*0128*/ IADD R22, R39, c[0x0][0x154]; /* 0x608000002a9c9c5a */ - /*0130*/ ISUB R32, R39, R37; /* 0xe0880000129c9c82 */ - /*0138*/ IADD R23, R39, R36; /* 0xe0800000121c9c5e */ - /* 0x0880808080108c10 */ - /*0148*/ ISUB R24, R39, c[0x0][0x154]; /* 0x608800002a9c9c62 */ - /*0150*/ IADD R25, R39, R0; /* 0xe0800000001c9c66 */ - /*0158*/ ISUB R26, R39, R36; /* 0xe0880000121c9c6a */ - /*0160*/ IADD R27, R39, R35; /* 0xe0800000119c9c6e */ - /*0168*/ IADD R28, R39, R18; /* 0xe0800000091c9c72 */ - /*0170*/ ISUB R30, R39, R35; /* 0xe0880000119c9c7a */ - /*0178*/ IADD R33, R39, R20; /* 0xe08000000a1c9c86 */ - /* 0x08a0acb0a0a0a000 */ - /*0188*/ IADD R31, R39, R19; /* 0xe0800000099c9c7e */ - /*0190*/ S2R R10, SR_TID.X; /* 0x86400000109c002a */ - /*0198*/ LOP.AND R11, R10, 0x1f; /* 0xc20000000f9c282d */ - /*01a0*/ IADD R43, R11, R34; /* 0xe0800000111c2cae */ - /*01a8*/ ISETP.GE.AND P0, PT, R43, c[0x0][0x144], PT; /* 0x5b681c00289cac1e */ - /*01b0*/ @P0 BRA.U 0x6a0; /* 0x120000027400023c */ - /*01b8*/ @!P0 IADD R10, R39, R43; /* 0xe080000015a09c2a */ - /* 0x08a0108c109c80a0 */ - /*01c8*/ @!P0 SHF.L R38, RZ, 0x3, R10; /* 0xb7c0280001a3fc99 */ - /*01d0*/ @!P0 IADD R10, R38, -0x8; /* 0xc88003fffc209829 */ - /*01d8*/ @!P0 IADD R11, R38, 0x8; /* 0xc08000000420982d */ - /*01e0*/ @!P0 BFE R12, R10, 0x11f; /* 0xc00800008fa02831 */ - /*01e8*/ @!P0 IADD R54.CC, R10, c[0x0][0x170]; /* 0x608400002e2028da */ - /*01f0*/ @!P0 IADD R10, R38, -0x10; /* 0xc88003fff8209829 */ - /*01f8*/ @!P0 BFE R13, R11, 0x11f; /* 0xc00800008fa02c35 */ - /* 0x08808080a0108c10 */ - /*0208*/ @!P0 IADD.X R55, R12, c[0x0][0x174]; /* 0x608040002ea030de */ - /*0210*/ @!P0 IADD R46.CC, R11, c[0x0][0x170]; /* 0x608400002e202cba */ - /*0218*/ @!P0 IADD R11, R38, 0x10; /* 0xc08000000820982d */ - /*0220*/ @!P0 BFE R14, R10, 0x11f; /* 0xc00800008fa02839 */ - /*0228*/ @!P0 IADD.X R47, R13, c[0x0][0x174]; /* 0x608040002ea034be */ - /*0230*/ @!P0 IADD R48.CC, R10, c[0x0][0x170]; /* 0x608400002e2028c2 */ - /*0238*/ @!P0 IADD R10, R22, R43; /* 0xe080000015a0582a */ - /* 0x08ac108080909410 */ - /*0248*/ @!P0 LD.E.64 R12, [R54]; /* 0xc58000000020d830 */ - /*0250*/ @!P0 BFE R15, R11, 0x11f; /* 0xc00800008fa02c3d */ - /*0258*/ @!P0 LD.E.64 R16, [R46]; /* 0xc58000000020b840 */ - /*0260*/ @!P0 IADD.X R49, R14, c[0x0][0x174]; /* 0x608040002ea038c6 */ - /*0268*/ @!P0 IADD R52.CC, R11, c[0x0][0x170]; /* 0x608400002e202cd2 */ - /*0270*/ @!P0 SHF.L R50, RZ, 0x3, R10; /* 0xb7c0280001a3fcc9 */ - /*0278*/ @!P0 IADD R14, R23, R43; /* 0xe080000015a05c3a */ - /* 0x08908c108c108010 */ - /*0288*/ @!P0 IADD.X R53, R15, c[0x0][0x174]; /* 0x608040002ea03cd6 */ - /*0290*/ @!P0 BFE R51, R50, 0x11f; /* 0xc00800008fa0c8cd */ - /*0298*/ @!P0 IADD R50.CC, R50, c[0x0][0x170]; /* 0x608400002e20c8ca */ - /*02a0*/ @!P0 SHF.L R45, RZ, 0x3, R14; /* 0xb7c0380001a3fcb5 */ - /*02a8*/ @!P0 LD.E.64 R10, [R48]; /* 0xc58000000020c028 */ - /*02b0*/ @!P0 DADD R12, R12, R16; /* 0xe380000008203032 */ - /*02b8*/ @!P0 LD.E.64 R14, [R52]; /* 0xc58000000020d038 */ - /* 0x089c8010b0108c10 */ - /*02c8*/ @!P0 IADD.X R51, R51, c[0x0][0x174]; /* 0x608040002ea0ccce */ - /*02d0*/ @!P0 BFE R17, R45, 0x11f; /* 0xc00800008fa0b445 */ - /*02d8*/ @!P0 IADD R16, R24, R43; /* 0xe080000015a06042 */ - /*02e0*/ @!P0 IADD R46.CC, R45, c[0x0][0x170]; /* 0x608400002e20b4ba */ - /*02e8*/ @!P0 SHF.L R45, RZ, 0x3, R16; /* 0xb7c0400001a3fcb5 */ - /*02f0*/ @!P0 IADD.X R47, R17, c[0x0][0x174]; /* 0x608040002ea044be */ - /*02f8*/ @!P0 LD.E.64 R16, [R50]; /* 0xc58000000020c840 */ - /* 0x08848010a8108080 */ - /*0308*/ @!P0 IADD R54.CC, R45, c[0x0][0x170]; /* 0x608400002e20b4da */ - /*0310*/ @!P0 DADD R48, R10, R14; /* 0xe3800000072028c2 */ - /*0318*/ @!P0 BFE R11, R45, 0x11f; /* 0xc00800008fa0b42d */ - /*0320*/ @!P0 IADD R10, R26, R43; /* 0xe080000015a0682a */ - /*0328*/ @!P0 IADD.X R55, R11, c[0x0][0x174]; /* 0x608040002ea02cde */ - /*0330*/ @!P0 SHF.L R45, RZ, 0x3, R10; /* 0xb7c0280001a3fcb5 */ - /*0338*/ @!P0 LD.E.64 R14, [R46]; /* 0xc58000000020b838 */ - /* 0x0890988010801094 */ - /*0348*/ @!P0 DADD R16, R12, R16; /* 0xe380000008203042 */ - /*0350*/ @!P0 IADD R13, R27, R43; /* 0xe080000015a06c36 */ - /*0358*/ @!P0 LD.E.64 R10, [R54]; /* 0xc58000000020d828 */ - /*0360*/ @!P0 BFE R53, R45, 0x11f; /* 0xc00800008fa0b4d5 */ - /*0368*/ @!P0 IADD R52.CC, R45, c[0x0][0x170]; /* 0x608400002e20b4d2 */ - /*0370*/ @!P0 IADD R12, R29, R43; /* 0xe080000015a07432 */ - /*0378*/ @!P0 SHF.L R13, RZ, 0x3, R13; /* 0xb7c0340001a3fc35 */ - /* 0x0894801094108c10 */ - /*0388*/ @!P0 IADD.X R53, R53, c[0x0][0x174]; /* 0x608040002ea0d4d6 */ - /*0390*/ @!P0 SHF.L R45, RZ, 0x3, R12; /* 0xb7c0300001a3fcb5 */ - /*0398*/ @!P0 BFE R46, R13, 0x11f; /* 0xc00800008fa034b9 */ - /*03a0*/ @!P0 IADD R50.CC, R13, c[0x0][0x170]; /* 0x608400002e2034ca */ - /*03a8*/ @!P0 LD.E.64 R12, [R52]; /* 0xc58000000020d030 */ - /*03b0*/ @!P0 DADD R16, R16, R10; /* 0xe380000005204042 */ - /*03b8*/ @!P0 BFE R10, R45, 0x11f; /* 0xc00800008fa0b429 */ - /* 0x08a0108c109c8010 */ - /*03c8*/ @!P0 IADD.X R51, R46, c[0x0][0x174]; /* 0x608040002ea0b8ce */ - /*03d0*/ @!P0 IADD R54.CC, R45, c[0x0][0x170]; /* 0x608400002e20b4da */ - /*03d8*/ @!P0 IADD R45, R30, R43; /* 0xe080000015a078b6 */ - /*03e0*/ @!P0 LD.E.64 R46, [R50]; /* 0xc58000000020c8b8 */ - /*03e8*/ @!P0 DADD R14, R48, R14; /* 0xe38000000720c03a */ - /*03f0*/ @!P0 IADD.X R55, R10, c[0x0][0x174]; /* 0x608040002ea028de */ - /*03f8*/ @!P0 SHF.L R48, RZ, 0x3, R45; /* 0xb7c0b40001a3fcc1 */ - /* 0x088480a080108010 */ - /*0408*/ @!P0 IADD R45, R32, R43; /* 0xe080000015a080b6 */ - /*0410*/ @!P0 LD.E.64 R10, [R54]; /* 0xc58000000020d828 */ - /*0418*/ @!P0 BFE R49, R48, 0x11f; /* 0xc00800008fa0c0c5 */ - /*0420*/ @!P0 IADD R48.CC, R48, c[0x0][0x170]; /* 0x608400002e20c0c2 */ - /*0428*/ @!P0 DADD R14, R14, R12; /* 0xe38000000620383a */ - /*0430*/ @!P0 DADD R12, R16, R46; /* 0xe380000017204032 */ - /*0438*/ @!P0 SHF.L R46, RZ, 0x3, R45; /* 0xb7c0b40001a3fcb9 */ - /* 0x0880808010b08010 */ - /*0448*/ @!P0 IADD.X R49, R49, c[0x0][0x174]; /* 0x608040002ea0c4c6 */ - /*0450*/ @!P0 BFE R45, R38, 0x11f; /* 0xc00800008fa098b5 */ - /*0458*/ @!P0 IADD R16.CC, R38, c[0x0][0x170]; /* 0x608400002e209842 */ - /*0460*/ @!P0 IADD.X R17, R45, c[0x0][0x174]; /* 0x608040002ea0b446 */ - /*0468*/ @!P0 LD.E.64 R50, [R48]; /* 0xc58000000020c0c8 */ - /*0470*/ @!P0 DADD R14, R14, R10; /* 0xe38000000520383a */ - /*0478*/ @!P0 BFE R10, R46, 0x11f; /* 0xc00800008fa0b829 */ - /* 0x0880bc109c1080b0 */ - /*0488*/ @!P0 IADD R46.CC, R46, c[0x0][0x170]; /* 0x608400002e20b8ba */ - /*0490*/ @!P0 IADD.X R47, R10, c[0x0][0x174]; /* 0x608040002ea028be */ - /*0498*/ @!P0 LD.E.64 R10, [R16]; /* 0xc580000000204028 */ - /*04a0*/ @!P0 IADD R48, R38, -0x18; /* 0xc88003fff42098c1 */ - /*04a8*/ @!P0 LD.E.64 R52, [R46]; /* 0xc58000000020b8d0 */ - /*04b0*/ @!P0 DADD R12, R12, R50; /* 0xe380000019203032 */ - /*04b8*/ @!P0 DMUL R50, R8, R10; /* 0xe4000000052020ca */ - /* 0x08b08010b01080a0 */ - /*04c8*/ @!P0 IADD R46, R38, 0x18; /* 0xc08000000c2098b9 */ - /*04d0*/ @!P0 DFMA R16, R6, R12, R50; /* 0xdb80c80006201842 */ - /*04d8*/ @!P0 BFE R13, R48, 0x11f; /* 0xc00800008fa0c035 */ - /*04e0*/ @!P0 IADD R12.CC, R48, c[0x0][0x170]; /* 0x608400002e20c032 */ - /*04e8*/ @!P0 BFE R47, R46, 0x11f; /* 0xc00800008fa0b8bd */ - /*04f0*/ @!P0 IADD.X R13, R13, c[0x0][0x174]; /* 0x608040002ea03436 */ - /*04f8*/ @!P0 IADD R46.CC, R46, c[0x0][0x170]; /* 0x608400002e20b8ba */ - /* 0x08a0a080dc109c80 */ - /*0508*/ @!P0 IADD.X R47, R47, c[0x0][0x174]; /* 0x608040002ea0bcbe */ - /*0510*/ @!P0 LD.E.64 R48, [R12]; /* 0xc5800000002030c0 */ - /*0518*/ @!P0 LD.E.64 R50, [R46]; /* 0xc58000000020b8c8 */ - /*0520*/ @!P0 DADD R14, R14, R52; /* 0xe38000001a20383a */ - /*0528*/ @!P0 DADD R12, R48, R50; /* 0xe38000001920c032 */ - /*0530*/ @!P0 IADD R48, R25, R43; /* 0xe080000015a064c2 */ - /*0538*/ @!P0 SHF.L R46, RZ, 0x3, R48; /* 0xb7c0c00001a3fcb9 */ - /* 0x08a080dc10a0b010 */ - /*0548*/ @!P0 BFE R47, R46, 0x11f; /* 0xc00800008fa0b8bd */ - /*0550*/ @!P0 IADD R46.CC, R46, c[0x0][0x170]; /* 0x608400002e20b8ba */ - /*0558*/ @!P0 IADD.X R47, R47, c[0x0][0x174]; /* 0x608040002ea0bcbe */ - /*0560*/ @!P0 LD.E.64 R48, [R46]; /* 0xc58000000020b8c0 */ - /*0568*/ @!P0 DADD R10, R10, R10; /* 0xe38000000520282a */ - /*0570*/ @!P0 DADD R12, R12, R48; /* 0xe380000018203032 */ - /*0578*/ @!P0 IADD R48, R28, R43; /* 0xe080000015a070c2 */ - /* 0x08a080dca0b010a0 */ - /*0588*/ @!P0 SHF.L R46, RZ, 0x3, R48; /* 0xb7c0c00001a3fcb9 */ - /*0590*/ @!P0 BFE R47, R46, 0x11f; /* 0xc00800008fa0b8bd */ - /*0598*/ @!P0 IADD R46.CC, R46, c[0x0][0x170]; /* 0x608400002e20b8ba */ - /*05a0*/ @!P0 IADD.X R47, R47, c[0x0][0x174]; /* 0x608040002ea0bcbe */ - /*05a8*/ @!P0 LD.E.64 R48, [R46]; /* 0xc58000000020b8c0 */ - /*05b0*/ @!P0 DADD R12, R12, R48; /* 0xe380000018203032 */ - /*05b8*/ @!P0 IADD R48, R31, R43; /* 0xe080000015a07cc2 */ - /* 0x0880a010b010a010 */ - /*05c8*/ @!P0 IADD R43, R33, R43; /* 0xe080000015a084ae */ - /*05d0*/ @!P0 SHF.L R46, RZ, 0x3, R48; /* 0xb7c0c00001a3fcb9 */ - /*05d8*/ @!P0 BFE R47, R46, 0x11f; /* 0xc00800008fa0b8bd */ - /*05e0*/ @!P0 IADD R48.CC, R46, c[0x0][0x170]; /* 0x608400002e20b8c2 */ - /*05e8*/ @!P0 IADD.X R49, R47, c[0x0][0x174]; /* 0x608040002ea0bcc6 */ - /*05f0*/ @!P0 SHF.L R43, RZ, 0x3, R43; /* 0xb7c0ac0001a3fcad */ - /*05f8*/ @!P0 LD.E.64 R46, [R48]; /* 0xc58000000020c0b8 */ - /* 0x0880909c80a080d8 */ - /*0608*/ @!P0 IADD R52.CC, R43, c[0x0][0x170]; /* 0x608400002e20acd2 */ - /*0610*/ @!P0 DADD R46, R12, R46; /* 0xe3800000172030ba */ - /*0618*/ @!P0 BFE R12, R43, 0x11f; /* 0xc00800008fa0ac31 */ - /*0620*/ @!P0 IADD.X R53, R12, c[0x0][0x174]; /* 0x608040002ea030d6 */ - /*0628*/ @!P0 IADD R12.CC, R38, c[0x0][0x178]; /* 0x608400002f209832 */ - /*0630*/ @!P0 LD.E.64 R48, [R52]; /* 0xc58000000020d0c0 */ - /*0638*/ @!P0 IADD.X R13, R45, c[0x0][0x17c]; /* 0x608040002fa0b436 */ - /* 0x08cc8c10a48090b0 */ - /*0648*/ @!P0 IADD R50.CC, R38, c[0x0][0x168]; /* 0x608400002d2098ca */ - /*0650*/ @!P0 IADD.X R51, R45, c[0x0][0x16c]; /* 0x608040002da0b4ce */ - /*0658*/ @!P0 DADD R46, R46, R48; /* 0xe38000001820b8ba */ - /*0660*/ @!P0 DFMA R48, R4, R14, R16; /* 0xdb804000072010c2 */ - /*0668*/ @!P0 LD.E.64 R16, [R12]; /* 0xc580000000203040 */ - /*0670*/ @!P0 DFMA R48, R2, R46, R48; /* 0xdb80c000172008c2 */ - /*0678*/ @!P0 LD.E.64 R14, [R50]; /* 0xc58000000020c838 */ - /* 0x08a0b8b0a000a4a4 */ - /*0688*/ @!P0 DADD R10, R10, -R16; /* 0xe38100000820282a */ - /*0690*/ @!P0 DFMA R10, R48, R14, R10; /* 0xdb8028000720c02a */ - /*0698*/ @!P0 ST.E.64 [R12], R10; /* 0xe580000000203028 */ - /*06a0*/ IADD R34, R34, 0x20; /* 0xc0800000101c8889 */ - /*06a8*/ ISETP.LT.AND P0, PT, R34, c[0x0][0x144], PT; /* 0x5b181c00289c881e */ - /*06b0*/ @P0 BRA 0x190; /* 0x12007ffd6c00003c */ - /*06b8*/ IADD R21, R21, 0x1; /* 0xc0800000009c5455 */ - /* 0x08b810b8b010b8b0 */ - /*06c8*/ ISETP.EQ.AND P0, PT, R21, c[0x0][0x14c], PT; /* 0x5b281c00299c541e */ - /*06d0*/ @!P0 BRA 0x110; /* 0x12007ffd1c20003c */ - /*06d8*/ ISETP.NE.AND P0, PT, R44, R41, PT; /* 0xdb581c00149cb01e */ - /*06e0*/ IADD R44, R44, 0x1; /* 0xc0800000009cb0b1 */ - /*06e8*/ @P0 BRA 0xd0; /* 0x12007ffcf000003c */ - /*06f0*/ MOV RZ, RZ; /* 0xe4c03c007f9c03fe */ - /*06f8*/ EXIT ; /* 0x18000000001c003c */ - /*0700*/ BRA 0x700; /* 0x12007ffffc1c003c */ - /*0708*/ NOP; /* 0x85800000001c3c02 */ - /*0710*/ NOP; /* 0x85800000001c3c02 */ - /*0718*/ NOP; /* 0x85800000001c3c02 */ - /*0720*/ NOP; /* 0x85800000001c3c02 */ - /*0728*/ NOP; /* 0x85800000001c3c02 */ - /*0730*/ NOP; /* 0x85800000001c3c02 */ - /*0738*/ NOP; /* 0x85800000001c3c02 */ - .................................. - - diff --git a/examples/stencil/3.s b/examples/stencil/3.s deleted file mode 100644 index 76476d03..00000000 --- a/examples/stencil/3.s +++ /dev/null @@ -1,239 +0,0 @@ - - code for sm_35 - Function : stencil_step_task - .headerflags @"EF_CUDA_SM35 EF_CUDA_PTX_SM(EF_CUDA_SM35)" - /* 0x0880acb0a0a0a000 */ - /*0008*/ MOV R1, c[0x0][0x44]; /* 0x64c03c00089c0006 */ - /*0010*/ S2R R10, SR_CTAID.X; /* 0x86400000129c002a */ - /*0018*/ IADD R44, R10, c[0x0][0x150]; /* 0x608000002a1c28b2 */ - /*0020*/ IADD R0, R44, 0x1; /* 0xc0800000009cb001 */ - /*0028*/ ISETP.GE.AND P0, PT, R44, R0, PT; /* 0xdb681c00001cb01e */ - /*0030*/ @P0 EXIT ; /* 0x180000000000003c */ - /*0038*/ MOV R11, c[0x0][0x154]; /* 0x64c03c002a9c002e */ - /* 0x0888108010a01080 */ - /*0048*/ IADD R41, R10, c[0x0][0x150]; /* 0x608000002a1c28a6 */ - /*0050*/ MOV R12, c[0x0][0x160]; /* 0x64c03c002c1c0032 */ - /*0058*/ MOV R13, c[0x0][0x164]; /* 0x64c03c002c9c0036 */ - /*0060*/ IMUL R35, R11, c[0x0][0x158]; /* 0x61c018002b1c2c8e */ - /*0068*/ LD.E.64 R8, [R12]; /* 0xc5800000001c3020 */ - /*0070*/ SHF.L R36, RZ, 0x1, R11; /* 0xb7c02c00009ffc91 */ - /*0078*/ MOV R42, c[0x0][0x148]; /* 0x64c03c00291c00aa */ - /* 0x088c80108c108c10 */ - /*0088*/ LD.E.64 R6, [R12+0x8]; /* 0xc5800000041c3018 */ - /*0090*/ IMUL R0, R11, 0x3; /* 0xc1c01800019c2c01 */ - /*0098*/ LD.E.64 R4, [R12+0x10]; /* 0xc5800000081c3010 */ - /*00a0*/ IMUL R18, R11, -0x3; /* 0xc9c01bfffe9c2c49 */ - /*00a8*/ SHF.L R37, RZ, 0x1, R35; /* 0xb7c08c00009ffc95 */ - /*00b0*/ LD.E.64 R2, [R12+0x18]; /* 0xc58000000c1c3008 */ - /*00b8*/ IMUL R19, R35, 0x3; /* 0xc1c01800019c8c4d */ - /* 0x0880acb0a0acb000 */ - /*00c8*/ IMUL R20, R35, -0x3; /* 0xc9c01bfffe9c8c51 */ - /*00d0*/ ISETP.GE.AND P0, PT, R42, c[0x0][0x14c], PT; /* 0x5b681c00299ca81e */ - /*00d8*/ @P0 BRA 0x6d8; /* 0x12000002fc00003c */ - /*00e0*/ MOV R10, c[0x0][0x140]; /* 0x64c03c00281c002a */ - /*00e8*/ ISETP.LT.AND P0, PT, R10, c[0x0][0x144], PT; /* 0x5b181c00289c281e */ - /*00f0*/ @!P0 BRA 0x6d8; /* 0x12000002f020003c */ - /*00f8*/ IMUL R40, R44, R35; /* 0xe1c01800119cb0a2 */ - /* 0x088880108c10a000 */ - /*0108*/ MOV R21, c[0x0][0x148]; /* 0x64c03c00291c0056 */ - /*0110*/ IMAD R39, R21, c[0x0][0x154], R40; /* 0x5108a0002a9c549e */ - /*0118*/ MOV R34, c[0x0][0x140]; /* 0x64c03c00281c008a */ - /*0120*/ IADD R29, R39, R37; /* 0xe0800000129c9c76 */ - /*0128*/ IADD R22, R39, c[0x0][0x154]; /* 0x608000002a9c9c5a */ - /*0130*/ ISUB R32, R39, R37; /* 0xe0880000129c9c82 */ - /*0138*/ IADD R23, R39, R36; /* 0xe0800000121c9c5e */ - /* 0x0880808080108c10 */ - /*0148*/ ISUB R24, R39, c[0x0][0x154]; /* 0x608800002a9c9c62 */ - /*0150*/ IADD R25, R39, R0; /* 0xe0800000001c9c66 */ - /*0158*/ ISUB R26, R39, R36; /* 0xe0880000121c9c6a */ - /*0160*/ IADD R27, R39, R35; /* 0xe0800000119c9c6e */ - /*0168*/ IADD R28, R39, R18; /* 0xe0800000091c9c72 */ - /*0170*/ ISUB R30, R39, R35; /* 0xe0880000119c9c7a */ - /*0178*/ IADD R33, R39, R20; /* 0xe08000000a1c9c86 */ - /* 0x08a0acb0a0a0a000 */ - /*0188*/ IADD R31, R39, R19; /* 0xe0800000099c9c7e */ - /*0190*/ S2R R10, SR_TID.X; /* 0x86400000109c002a */ - /*0198*/ LOP.AND R11, R10, 0x1f; /* 0xc20000000f9c282d */ - /*01a0*/ IADD R43, R11, R34; /* 0xe0800000111c2cae */ - /*01a8*/ ISETP.GE.AND P0, PT, R43, c[0x0][0x144], PT; /* 0x5b681c00289cac1e */ - /*01b0*/ @P0 BRA.U 0x6a0; /* 0x120000027400023c */ - /*01b8*/ @!P0 IADD R10, R39, R43; /* 0xe080000015a09c2a */ - /* 0x08a0108c109c80a0 */ - /*01c8*/ @!P0 SHF.L R38, RZ, 0x3, R10; /* 0xb7c0280001a3fc99 */ - /*01d0*/ @!P0 IADD R10, R38, -0x8; /* 0xc88003fffc209829 */ - /*01d8*/ @!P0 IADD R11, R38, 0x8; /* 0xc08000000420982d */ - /*01e0*/ @!P0 BFE R12, R10, 0x11f; /* 0xc00800008fa02831 */ - /*01e8*/ @!P0 IADD R54.CC, R10, c[0x0][0x170]; /* 0x608400002e2028da */ - /*01f0*/ @!P0 IADD R10, R38, -0x10; /* 0xc88003fff8209829 */ - /*01f8*/ @!P0 BFE R13, R11, 0x11f; /* 0xc00800008fa02c35 */ - /* 0x08808080a0108c10 */ - /*0208*/ @!P0 IADD.X R55, R12, c[0x0][0x174]; /* 0x608040002ea030de */ - /*0210*/ @!P0 IADD R46.CC, R11, c[0x0][0x170]; /* 0x608400002e202cba */ - /*0218*/ @!P0 IADD R11, R38, 0x10; /* 0xc08000000820982d */ - /*0220*/ @!P0 BFE R14, R10, 0x11f; /* 0xc00800008fa02839 */ - /*0228*/ @!P0 IADD.X R47, R13, c[0x0][0x174]; /* 0x608040002ea034be */ - /*0230*/ @!P0 IADD R48.CC, R10, c[0x0][0x170]; /* 0x608400002e2028c2 */ - /*0238*/ @!P0 IADD R10, R22, R43; /* 0xe080000015a0582a */ - /* 0x08ac108080909410 */ - /*0248*/ @!P0 LD.E.64 R12, [R54]; /* 0xc58000000020d830 */ - /*0250*/ @!P0 BFE R15, R11, 0x11f; /* 0xc00800008fa02c3d */ - /*0258*/ @!P0 LD.E.64 R16, [R46]; /* 0xc58000000020b840 */ - /*0260*/ @!P0 IADD.X R49, R14, c[0x0][0x174]; /* 0x608040002ea038c6 */ - /*0268*/ @!P0 IADD R52.CC, R11, c[0x0][0x170]; /* 0x608400002e202cd2 */ - /*0270*/ @!P0 SHF.L R50, RZ, 0x3, R10; /* 0xb7c0280001a3fcc9 */ - /*0278*/ @!P0 IADD R14, R23, R43; /* 0xe080000015a05c3a */ - /* 0x08908c108c108010 */ - /*0288*/ @!P0 IADD.X R53, R15, c[0x0][0x174]; /* 0x608040002ea03cd6 */ - /*0290*/ @!P0 BFE R51, R50, 0x11f; /* 0xc00800008fa0c8cd */ - /*0298*/ @!P0 IADD R50.CC, R50, c[0x0][0x170]; /* 0x608400002e20c8ca */ - /*02a0*/ @!P0 SHF.L R45, RZ, 0x3, R14; /* 0xb7c0380001a3fcb5 */ - /*02a8*/ @!P0 LD.E.64 R10, [R48]; /* 0xc58000000020c028 */ - /*02b0*/ @!P0 DADD R12, R12, R16; /* 0xe380000008203032 */ - /*02b8*/ @!P0 LD.E.64 R14, [R52]; /* 0xc58000000020d038 */ - /* 0x089c8010b0108c10 */ - /*02c8*/ @!P0 IADD.X R51, R51, c[0x0][0x174]; /* 0x608040002ea0ccce */ - /*02d0*/ @!P0 BFE R17, R45, 0x11f; /* 0xc00800008fa0b445 */ - /*02d8*/ @!P0 IADD R16, R24, R43; /* 0xe080000015a06042 */ - /*02e0*/ @!P0 IADD R46.CC, R45, c[0x0][0x170]; /* 0x608400002e20b4ba */ - /*02e8*/ @!P0 SHF.L R45, RZ, 0x3, R16; /* 0xb7c0400001a3fcb5 */ - /*02f0*/ @!P0 IADD.X R47, R17, c[0x0][0x174]; /* 0x608040002ea044be */ - /*02f8*/ @!P0 LD.E.64 R16, [R50]; /* 0xc58000000020c840 */ - /* 0x08848010a8108080 */ - /*0308*/ @!P0 IADD R54.CC, R45, c[0x0][0x170]; /* 0x608400002e20b4da */ - /*0310*/ @!P0 DADD R48, R10, R14; /* 0xe3800000072028c2 */ - /*0318*/ @!P0 BFE R11, R45, 0x11f; /* 0xc00800008fa0b42d */ - /*0320*/ @!P0 IADD R10, R26, R43; /* 0xe080000015a0682a */ - /*0328*/ @!P0 IADD.X R55, R11, c[0x0][0x174]; /* 0x608040002ea02cde */ - /*0330*/ @!P0 SHF.L R45, RZ, 0x3, R10; /* 0xb7c0280001a3fcb5 */ - /*0338*/ @!P0 LD.E.64 R14, [R46]; /* 0xc58000000020b838 */ - /* 0x0890988010801094 */ - /*0348*/ @!P0 DADD R16, R12, R16; /* 0xe380000008203042 */ - /*0350*/ @!P0 IADD R13, R27, R43; /* 0xe080000015a06c36 */ - /*0358*/ @!P0 LD.E.64 R10, [R54]; /* 0xc58000000020d828 */ - /*0360*/ @!P0 BFE R53, R45, 0x11f; /* 0xc00800008fa0b4d5 */ - /*0368*/ @!P0 IADD R52.CC, R45, c[0x0][0x170]; /* 0x608400002e20b4d2 */ - /*0370*/ @!P0 IADD R12, R29, R43; /* 0xe080000015a07432 */ - /*0378*/ @!P0 SHF.L R13, RZ, 0x3, R13; /* 0xb7c0340001a3fc35 */ - /* 0x0894801094108c10 */ - /*0388*/ @!P0 IADD.X R53, R53, c[0x0][0x174]; /* 0x608040002ea0d4d6 */ - /*0390*/ @!P0 SHF.L R45, RZ, 0x3, R12; /* 0xb7c0300001a3fcb5 */ - /*0398*/ @!P0 BFE R46, R13, 0x11f; /* 0xc00800008fa034b9 */ - /*03a0*/ @!P0 IADD R50.CC, R13, c[0x0][0x170]; /* 0x608400002e2034ca */ - /*03a8*/ @!P0 LD.E.64 R12, [R52]; /* 0xc58000000020d030 */ - /*03b0*/ @!P0 DADD R16, R16, R10; /* 0xe380000005204042 */ - /*03b8*/ @!P0 BFE R10, R45, 0x11f; /* 0xc00800008fa0b429 */ - /* 0x08a0108c109c8010 */ - /*03c8*/ @!P0 IADD.X R51, R46, c[0x0][0x174]; /* 0x608040002ea0b8ce */ - /*03d0*/ @!P0 IADD R54.CC, R45, c[0x0][0x170]; /* 0x608400002e20b4da */ - /*03d8*/ @!P0 IADD R45, R30, R43; /* 0xe080000015a078b6 */ - /*03e0*/ @!P0 LD.E.64 R46, [R50]; /* 0xc58000000020c8b8 */ - /*03e8*/ @!P0 DADD R14, R48, R14; /* 0xe38000000720c03a */ - /*03f0*/ @!P0 IADD.X R55, R10, c[0x0][0x174]; /* 0x608040002ea028de */ - /*03f8*/ @!P0 SHF.L R48, RZ, 0x3, R45; /* 0xb7c0b40001a3fcc1 */ - /* 0x088480a080108010 */ - /*0408*/ @!P0 IADD R45, R32, R43; /* 0xe080000015a080b6 */ - /*0410*/ @!P0 LD.E.64 R10, [R54]; /* 0xc58000000020d828 */ - /*0418*/ @!P0 BFE R49, R48, 0x11f; /* 0xc00800008fa0c0c5 */ - /*0420*/ @!P0 IADD R48.CC, R48, c[0x0][0x170]; /* 0x608400002e20c0c2 */ - /*0428*/ @!P0 DADD R14, R14, R12; /* 0xe38000000620383a */ - /*0430*/ @!P0 DADD R12, R16, R46; /* 0xe380000017204032 */ - /*0438*/ @!P0 SHF.L R46, RZ, 0x3, R45; /* 0xb7c0b40001a3fcb9 */ - /* 0x0880808010b08010 */ - /*0448*/ @!P0 IADD.X R49, R49, c[0x0][0x174]; /* 0x608040002ea0c4c6 */ - /*0450*/ @!P0 BFE R45, R38, 0x11f; /* 0xc00800008fa098b5 */ - /*0458*/ @!P0 IADD R16.CC, R38, c[0x0][0x170]; /* 0x608400002e209842 */ - /*0460*/ @!P0 IADD.X R17, R45, c[0x0][0x174]; /* 0x608040002ea0b446 */ - /*0468*/ @!P0 LD.E.64 R50, [R48]; /* 0xc58000000020c0c8 */ - /*0470*/ @!P0 DADD R14, R14, R10; /* 0xe38000000520383a */ - /*0478*/ @!P0 BFE R10, R46, 0x11f; /* 0xc00800008fa0b829 */ - /* 0x0880bc109c1080b0 */ - /*0488*/ @!P0 IADD R46.CC, R46, c[0x0][0x170]; /* 0x608400002e20b8ba */ - /*0490*/ @!P0 IADD.X R47, R10, c[0x0][0x174]; /* 0x608040002ea028be */ - /*0498*/ @!P0 LD.E.64 R10, [R16]; /* 0xc580000000204028 */ - /*04a0*/ @!P0 IADD R48, R38, -0x18; /* 0xc88003fff42098c1 */ - /*04a8*/ @!P0 LD.E.64 R52, [R46]; /* 0xc58000000020b8d0 */ - /*04b0*/ @!P0 DADD R12, R12, R50; /* 0xe380000019203032 */ - /*04b8*/ @!P0 DMUL R50, R8, R10; /* 0xe4000000052020ca */ - /* 0x08b08010b01080a0 */ - /*04c8*/ @!P0 IADD R46, R38, 0x18; /* 0xc08000000c2098b9 */ - /*04d0*/ @!P0 DFMA R16, R6, R12, R50; /* 0xdb80c80006201842 */ - /*04d8*/ @!P0 BFE R13, R48, 0x11f; /* 0xc00800008fa0c035 */ - /*04e0*/ @!P0 IADD R12.CC, R48, c[0x0][0x170]; /* 0x608400002e20c032 */ - /*04e8*/ @!P0 BFE R47, R46, 0x11f; /* 0xc00800008fa0b8bd */ - /*04f0*/ @!P0 IADD.X R13, R13, c[0x0][0x174]; /* 0x608040002ea03436 */ - /*04f8*/ @!P0 IADD R46.CC, R46, c[0x0][0x170]; /* 0x608400002e20b8ba */ - /* 0x08a0a080dc109c80 */ - /*0508*/ @!P0 IADD.X R47, R47, c[0x0][0x174]; /* 0x608040002ea0bcbe */ - /*0510*/ @!P0 LD.E.64 R48, [R12]; /* 0xc5800000002030c0 */ - /*0518*/ @!P0 LD.E.64 R50, [R46]; /* 0xc58000000020b8c8 */ - /*0520*/ @!P0 DADD R14, R14, R52; /* 0xe38000001a20383a */ - /*0528*/ @!P0 DADD R12, R48, R50; /* 0xe38000001920c032 */ - /*0530*/ @!P0 IADD R48, R25, R43; /* 0xe080000015a064c2 */ - /*0538*/ @!P0 SHF.L R46, RZ, 0x3, R48; /* 0xb7c0c00001a3fcb9 */ - /* 0x08a080dc10a0b010 */ - /*0548*/ @!P0 BFE R47, R46, 0x11f; /* 0xc00800008fa0b8bd */ - /*0550*/ @!P0 IADD R46.CC, R46, c[0x0][0x170]; /* 0x608400002e20b8ba */ - /*0558*/ @!P0 IADD.X R47, R47, c[0x0][0x174]; /* 0x608040002ea0bcbe */ - /*0560*/ @!P0 LD.E.64 R48, [R46]; /* 0xc58000000020b8c0 */ - /*0568*/ @!P0 DADD R10, R10, R10; /* 0xe38000000520282a */ - /*0570*/ @!P0 DADD R12, R12, R48; /* 0xe380000018203032 */ - /*0578*/ @!P0 IADD R48, R28, R43; /* 0xe080000015a070c2 */ - /* 0x08a080dca0b010a0 */ - /*0588*/ @!P0 SHF.L R46, RZ, 0x3, R48; /* 0xb7c0c00001a3fcb9 */ - /*0590*/ @!P0 BFE R47, R46, 0x11f; /* 0xc00800008fa0b8bd */ - /*0598*/ @!P0 IADD R46.CC, R46, c[0x0][0x170]; /* 0x608400002e20b8ba */ - /*05a0*/ @!P0 IADD.X R47, R47, c[0x0][0x174]; /* 0x608040002ea0bcbe */ - /*05a8*/ @!P0 LD.E.64 R48, [R46]; /* 0xc58000000020b8c0 */ - /*05b0*/ @!P0 DADD R12, R12, R48; /* 0xe380000018203032 */ - /*05b8*/ @!P0 IADD R48, R31, R43; /* 0xe080000015a07cc2 */ - /* 0x0880a010b010a010 */ - /*05c8*/ @!P0 IADD R43, R33, R43; /* 0xe080000015a084ae */ - /*05d0*/ @!P0 SHF.L R46, RZ, 0x3, R48; /* 0xb7c0c00001a3fcb9 */ - /*05d8*/ @!P0 BFE R47, R46, 0x11f; /* 0xc00800008fa0b8bd */ - /*05e0*/ @!P0 IADD R48.CC, R46, c[0x0][0x170]; /* 0x608400002e20b8c2 */ - /*05e8*/ @!P0 IADD.X R49, R47, c[0x0][0x174]; /* 0x608040002ea0bcc6 */ - /*05f0*/ @!P0 SHF.L R43, RZ, 0x3, R43; /* 0xb7c0ac0001a3fcad */ - /*05f8*/ @!P0 LD.E.64 R46, [R48]; /* 0xc58000000020c0b8 */ - /* 0x0880909c80a080d8 */ - /*0608*/ @!P0 IADD R52.CC, R43, c[0x0][0x170]; /* 0x608400002e20acd2 */ - /*0610*/ @!P0 DADD R46, R12, R46; /* 0xe3800000172030ba */ - /*0618*/ @!P0 BFE R12, R43, 0x11f; /* 0xc00800008fa0ac31 */ - /*0620*/ @!P0 IADD.X R53, R12, c[0x0][0x174]; /* 0x608040002ea030d6 */ - /*0628*/ @!P0 IADD R12.CC, R38, c[0x0][0x178]; /* 0x608400002f209832 */ - /*0630*/ @!P0 LD.E.64 R48, [R52]; /* 0xc58000000020d0c0 */ - /*0638*/ @!P0 IADD.X R13, R45, c[0x0][0x17c]; /* 0x608040002fa0b436 */ - /* 0x08cc8c10a48090b0 */ - /*0648*/ @!P0 IADD R50.CC, R38, c[0x0][0x168]; /* 0x608400002d2098ca */ - /*0650*/ @!P0 IADD.X R51, R45, c[0x0][0x16c]; /* 0x608040002da0b4ce */ - /*0658*/ @!P0 DADD R46, R46, R48; /* 0xe38000001820b8ba */ - /*0660*/ @!P0 DFMA R48, R4, R14, R16; /* 0xdb804000072010c2 */ - /*0668*/ @!P0 LD.E.64 R16, [R12]; /* 0xc580000000203040 */ - /*0670*/ @!P0 DFMA R48, R2, R46, R48; /* 0xdb80c000172008c2 */ - /*0678*/ @!P0 LD.E.64 R14, [R50]; /* 0xc58000000020c838 */ - /* 0x08a0b8b0a000a4a4 */ - /*0688*/ @!P0 DADD R10, R10, -R16; /* 0xe38100000820282a */ - /*0690*/ @!P0 DFMA R10, R48, R14, R10; /* 0xdb8028000720c02a */ - /*0698*/ @!P0 ST.E.64 [R12], R10; /* 0xe580000000203028 */ - /*06a0*/ IADD R34, R34, 0x20; /* 0xc0800000101c8889 */ - /*06a8*/ ISETP.LT.AND P0, PT, R34, c[0x0][0x144], PT; /* 0x5b181c00289c881e */ - /*06b0*/ @P0 BRA 0x190; /* 0x12007ffd6c00003c */ - /*06b8*/ IADD R21, R21, 0x1; /* 0xc0800000009c5455 */ - /* 0x08b810b8b010b8b0 */ - /*06c8*/ ISETP.EQ.AND P0, PT, R21, c[0x0][0x14c], PT; /* 0x5b281c00299c541e */ - /*06d0*/ @!P0 BRA 0x110; /* 0x12007ffd1c20003c */ - /*06d8*/ ISETP.NE.AND P0, PT, R44, R41, PT; /* 0xdb581c00149cb01e */ - /*06e0*/ IADD R44, R44, 0x1; /* 0xc0800000009cb0b1 */ - /*06e8*/ @P0 BRA 0xd0; /* 0x12007ffcf000003c */ - /*06f0*/ MOV RZ, RZ; /* 0xe4c03c007f9c03fe */ - /*06f8*/ EXIT ; /* 0x18000000001c003c */ - /*0700*/ BRA 0x700; /* 0x12007ffffc1c003c */ - /*0708*/ NOP; /* 0x85800000001c3c02 */ - /*0710*/ NOP; /* 0x85800000001c3c02 */ - /*0718*/ NOP; /* 0x85800000001c3c02 */ - /*0720*/ NOP; /* 0x85800000001c3c02 */ - /*0728*/ NOP; /* 0x85800000001c3c02 */ - /*0730*/ NOP; /* 0x85800000001c3c02 */ - /*0738*/ NOP; /* 0x85800000001c3c02 */ - .................................. - - diff --git a/examples/stencil/Makefile b/examples/stencil/Makefile index 097cd597..47cbf5d5 100644 --- a/examples/stencil/Makefile +++ b/examples/stencil/Makefile @@ -2,7 +2,7 @@ EXAMPLE=stencil CPP_SRC=stencil.cpp stencil_serial.cpp ISPC_SRC=stencil.ispc -ISPC_IA_TARGETS=sse2,sse4-x2,avx-x2 +ISPC_IA_TARGETS=avx ISPC_ARM_TARGETS=neon include ../common.mk diff --git a/examples/stencil/stencil.cpp b/examples/stencil/stencil.cpp index 593d901f..93d11b7e 100644 --- a/examples/stencil/stencil.cpp +++ b/examples/stencil/stencil.cpp @@ -85,6 +85,7 @@ int main() { // the minimum time of three runs. // double minTimeISPC = 1e30; +#if 0 for (int i = 0; i < 3; ++i) { reset_and_start_timer(); loop_stencil_ispc(0, 6, width, Nx - width, width, Ny - width, @@ -95,6 +96,7 @@ int main() { } printf("[stencil ispc 1 core]:\t\t[%.3f] million cycles\n", minTimeISPC); +#endif InitData(Nx, Ny, Nz, Aispc, vsq); diff --git a/examples/stencil/stencil.cubin b/examples/stencil/stencil.cubin deleted file mode 100644 index db1b1bca7c1d6730958b2611b10f4ee84930169f..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3156 zcmeHJUuc_E6hBGR?)K00OB^o6YmM~fTWn+KL_!HAlo~=ABlf9Ouu0dJZ0oYx#?isB zH&RQ5Oj}yS!lz52%9J>wRt@4L2nB^AKKNE2DxISbd)R{yd#J1DeD{9sm)QE^!(O~3 z-?``9-#O=Z?>*oB?(|D%qE3h7^f8dl3EzW5^+^;OrBbMuvbDAqA`pTad@@1?CR3A{ ziA-_`dzQR<$<#9=QyDalq|con8$>xim`P4vgh2XA;Cy=Y@)1LmspQ1S@DYRLX9hTT zIX#(4rZd4qfd9hj5H^pB<=}X7B6&%!>e0`39I63pcMfo~roHaKpn6ZC^k@h8NUa=j zc^Jk~eYdUOz`%*-h$8|u%@J^&Y%)QOxW{n+*?qkMf)#o_=oLBxUxj`QIO^_w|2pti z=(TX7LU(~yq1OQNu=zW&+U&XmTot-Wn(!?c2`A)w+KdP;&LD+@UbPM;?*GT5SphAd z*YbkZfx#eL9`=Lb!OENM0NzHrQ|EpFk!S}4`HAw2V3_t6FlvFPHgngny9+CM<_ltN zfmrtjITnFf*MhjoC|QtW(TMf1oSioS{5gvIahu@>Z#k)9cE}f5K;>E4y#cUV{>Hz< zu^1h9*V|;?*LH6>Ku0_9Q{>;G8>cY;D+LX`3xZ*MDlyL$HU23WCoul+h}RWfCp_Zr z;|b}(dHTt2AOSHa*cTbpZ?c!$DL>}7hB7#h69Rh<+kGgUP0n8KfYX%SuHoO<%@atL-`q8H^_Xg z8&8nm)$fdV0a*FlHU1>&ox(7szr+JjZdwSG43yqQE$7a;H?l0fB*%j~-`(5WTmNZX z`X_X`A1!N!Iscf>|D^H01jmC)Y|mv2?!1<(@7Lu%b%$hpjPrnS(U8nDT9ENW5pM4D zKtIQ?kJuKU=Vo$ZsUG8+`+Z?d`lrXF-_WqsJL4P=K(XBylX-(F8Ap^g;TAQ$?1qdp5Jx^ZpLxw#ruFOYk+^tF@s@SJ z9qp0$-WY$3=9N%<`a-;c{5LeQvL*yOdxkgD{6fkf*O%U#?a_Iew_B|v&dXfKuJ@(g z5QS_}%hyNM`gMjS4=YbsxHA7#NbViWDn#7`8kd~@^6&v2h2Vl(Wdj}myIomLVkNaw&Uu^0U_s0hY$M;8+i={mM zSc^gR?WOlS{%{&RuV)O3S8mDBIbqr#{DpRhI>+$*@}hmSq?Hf{bbkCo;}2_$X*?J1 zgn`xxyeN0~(C!SAF5aq^1o|%x&!YW1fIUt`DNA-E_6cuqbQQGj5O({oO8Lxg#D29u z9yaD(qY4Id3Ybs>bRVHVzT{@Zn}gXdN-?g~eX*nbEu6)Xio)JbOXVN_7O8s*O#XC( zs`k&IjZzUed%LF$!ZzW<{ON8+km_68m`Y^b)hIvyU#QMcesqno?&R35Xg8H+No)Nm ZC;3@CE1Xa%D?*mE`&DzT_OwLHGaw diff --git a/examples/stencil/stencil1.cubin b/examples/stencil/stencil1.cubin deleted file mode 100644 index 8b7d18d97f867ec965d35f0c90002d0f94ea1dad..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3604 zcmeHKO>A355FR^jyKxg(n*c_b=pim?p=qRYaHCMYR#2r;TcuH{ObXOalge#VHzcNN zr9u}-BMZ4;B&&+hJ1nJ#A{ogy=Y*b+B5~xv0VzEMrAR&Wz<~pY&@l66H?iG@3ldyd z$!~XN=jYp*dGprS-Z*?P9188(20X&>9fZu9rpZnwCb#qa(v~i8Aq8#t6eQU@lbb0_ z7qVlxvh$S-0bQ85TtMbT{`A?&Q#enZDr9HQLvQ|4@7esBi<@lCa34LoT#E=R)M!jzDNT<`{l^ah;ArUlZL9Jx#O#r-|MMNu*=z`dcB< zL~ntvCOQJLiEabL(V!p3YHOrJ5NVJJYSEFHWSkQ+iAAFp2lz;dC&0s^a_2rZc0t#wQrE(fNcdSiPYmZ@=8XQwSCKEo zv746(hgv4@7y{8<@dA4TVW%Ae>%+*$m44L2^%bF#n*`z2K@$e+Tc&+e%4wM;t0U6fggn(cjm_ za_5YlqlY!-MJ?Z&M_dtqS-K8JMlDLzmQ^Jt2HAdonDxd6*}h1z+%dv`*sl+1wsU5j ztI(@D{qmXv|pc|Vw|Z#mU}17_(s!= zFH3S$ejT%i?v*&9bd3CslsG=`zRmt7GOUN^2;w)EVSVdQWBfc>68*!?aS7se+^^r# zod0#)X~tvYCGW_ZQuE?0;Yn@8$Ho!kgt&^!HjYE){O{)&msBO`NhPLpj7yrh^qY9> zFLC^AJTDc@xyr1M)khH!=gO>q(B*UNH*w6HJc}uV+hx5Gm*oR>Srwy$4wk)I3G)29 zU$=N$^SX{N#d)mCyXf8%HowB;E1qV1wvI-OJiZUV)BCWu>`-6uss0q!{}EI7LxOR= zmSUXNZhwmTW2Syb`{;dvd#EiJFn;?6bWmSGU1a?FXb<{Nt0ighV7}@7TCwlL5mOhn z+a;|L|B$H*>(A!9eO^DL`i%;X>t2d0_5piN#9yqSe~Z`R%$9Trl=#Zxvi7Y1LnXHB zDP2}YEyglz|K=di_8ij&m#v#lgS(w@RD$}iNnH|`I{5q-OdQXdI7*ZE{3P%9mz~|z zmxOu0v3*sPyesyN>)q6s#>x&}Z)P)bQxDiDb`jtDm!_YLI8HQ(ulo5m_HZFgUx>Su z_m5T%B%F2_R!dPA>qM$0bM9ZS&nBs_`rWhCZxbgMbZtS_7N~!}`sX!>e{j{qwBr6% z>i=o#|6RoIhGY!srI7g!A^-j=^5p`GW5a#e=3T?Sd~bm6H~f%v=Xshk5dItKK(@+fuew;qt ztx2TyDmiH#nY%Xm$KMXiMlu+rw*`OpKNWK~o_EaB&NhDJlk_ak4Gmr?BWUMFy;iKP Jx{b8e`wOT8vg-f< diff --git a/examples/stencil/stencil2.cubin b/examples/stencil/stencil2.cubin deleted file mode 100644 index 64a9d3ea9dc5ff0d9485c61330de6e2206b6b2d2..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3668 zcmeHKZ-`V?6hAXNu2(I3ZiZvNh8KN#JI$`obzLUe;IX6-)Ca*rDsskuthuhs?jRPC zx7d~`hT33*2`=?035hYLSe>NvjWA3?NI%n5xL+BCWomw`c?GzTgeLsSD6)O1 zcPKNMNpHueV19LhYv(-{ z2`345?C&4Sr28|@0)c;`X#q9=mC9`c>B00q)zy3dT#ZX%BiJ1PS3kC)zX|^E2Mi<9 z-yE3LInp9`OWKj;HP1^K@*r=>g{D<5L{{D>W0uh#|N3avUJ6c~y#(w!n}JnlF9s9f z=JG1O8KQOeB3MypN5H7Fn*epx;KO*@YV8n2>TD^R;$z4Z28cW&86jdaR1R?K?v_Z% z|3Ci44tSndEO6EgfxGY+B!Pu|GawiyTDsZdd%81Bc4cy`R*nQkR5+72R4@!aqX zZe-c#I=%zU1o^u_$oaj5`8Y)VwB;Zk_|ymUp8zEM(idR1Qk;>)e>}&cj){kdPlXa9 zgz={g2M-C_Pbb9fJFERO9}+I-mzZPXA>>of=2RRM3sV2)oQjJb5v{26MbtTuN3JCf zrcHTbwy?pBqTjkn{!fcyIp+BhJj$PC&ucv&Z^61ew)tI+J0PeZeD(!_e(c9v($3Dl zEKKPqG!C|1u4hxbg{*V-6}g_B?H8B_@q@x#Li20V&d!bqb1B&%`|q(ggpljlfBdC$ zN7`)h05+`hWx|yBSb_9oU17+`x`g)-;XN);g2z6Z|11QyUuw-;^LLu~bX(q{e)?-| z?|lH~U$w1vfZ-Ib-=g*P3FRj+>A&W9?cnm%L_)-xlwkxTg81ht#qB8RyvMRe9|x3zT)d} zq^`N-1!W_;XmG*KpHuja;|kxCQv4I+H9W3gntV#dxzh@d;*S`7m|fLzSRY>Ce6CIN z#CsLbwS>Y+J=o`@u5S!aKhKY{9<8gZPsQ81AKz=9i<-Y9qt@3`U7daj^&`a?yL_pJ z-|=bP zSpRWeuY1Z@`76H4zaeSiZO!L8Tx%uGC#raCKc1h{-zwf?dENiK!fhH?{%O5`$fNX^ zRa}u9!Z^vJAl|5cR#tVKL<-Q%?XMRp=@(Y@gJL{aB7 zpINW?FrS9vQtRi9-2KK8HZ5djPM z1Kuj1=84;?Zq$!2T|QSCP&^(I5I*4hE1oL;b%n?4w}`*N?91VLT>(!;_w}@{?j-Ab0 z`1v!OIzR5U$$O-%>ooA(xqQaeeXTI$_xVgxpUcCC!7G+MhW)|&=FPb{ol~37*ed>~ zcd{nlhxq=<2KP7g{dNdip4%dD`?+5%8~DG>Vq56mx9NU2X+2gHE5O`3^Ob&o;rrm` zJo;}pi9`54xZX|QLI6>m=kK6D+%50zxBu=;B>!ZD5KP~Cjl8nrfJYwZB{JKjGSOaK4? diff --git a/examples/stencil/stencil_avx.bc b/examples/stencil/stencil_avx.bc deleted file mode 100644 index 7a63ccce3094f36eb8f1674fbe86d04a3ae31cd9..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 12836 zcmds730PCtwhkdA3?T$Xfim2nz|}T|Fo+;v!zfnd;*%;=z%bLw5CMZwsz6Xs!2%+K zfTmW(ck6^8gJ6+Clm;23Eg(`rCDtpb1+;a3Yo8P5LBIOGe&2h1a5!h5v)5XC?KS*s zonvit`ByC}fv_9?=1gegdw5ld)pLeX19nm^n z&iWYLP{~wUWworKt+s<~s@CGV|B9umYK3W2t~n@rpA?*A*FTm3I_p#jgb(1qT2k*D z_@S_GKBm;5Glf{&78}!RwPqQTmc`Ykmu)NwcOKJ*oN4#DlwdmTX{-XlN}-RTaL<`G z(n1zF)B4y7PZ$dQEQQM-B=9YJCDl$zSX4?{B=gN%;Jn(jxYoQwlCT>1p*75p=U6k~ z{~kD~!hgd>4l)eiU8DnxjwQh1PQtgTO7k4|Cd(xpbEI>_Ne(6i*@kSXz+)tVA(1VR zrmS0Jyow+jPU1LA7rQRrm|jG;{{DhJmmKRG=d|VYg-IIh1HRg(slE30#IVDA6oRn1BK&a49B& zQ48iwewZ`)Va~LN8HnB1v11GQ>14s4VuAs85D!1X?r;SZ_yxYfECu`oUxg?ZjZdWZ z5Ss;(BbgE9VfJp4p*4{t&|SSD{Ri!@8?^QkNo>#_fx1o;w?9tOJP8Rp8nVfdB@mDq zT_eKrM;X{h&1|}=qGM{ZjP9y>l8vm;VScOhwcCqtEN*Ez=OOMrrTYoKXz-$^WS-I z#PH=Bh30lyx-^Lx$7am&PtYz$%4-uo+Gju5 z_x=9N(mZ^2-QEN_&CJ9u#>TI8C8yl0dVJ2}*6SMAw?0z;_D6nv`y)+nf8@s7ANl3& zkKBCwBfpM6(rqtU-Wy?dZGnG5a|m7DxFDyH(c6D`T~j;6eX-AH={&vd zu9_iy_ibwOcjMdo85^_nS5{aVx8!;9zG)kHl;^p7=l#BC&*c<^wS%Gik2{q0dPSAV zhy5IEoGRy(#dr7OJs%*qvcsc0EuD(0!lPARA5XmZK;MF?P+Dc6vL=mN4UE+8{<|xX zp`52jDX$qA>6AYjH^nqIBO!MB7}$MUxKZ1EXh`*Lyin-tEYpVbh zY2UPNj*fE5y5jzYvb(2!R_)#Mx2|MX^ak3+ujQeQ>-ua|h~RsZof+RTukQRs)w&hP zCIk0&|Qj3o_Qm@Paf{fet!JdJeHudNz+9P?^dVt?P$ z(qFol&I>w^MGf(enJf$hf0*tTTOK@A+QZ8q^se$4#%Kn^u7vo+eL8e=!Wx2&VGRMW z4>RxWnF*4CbAe=?2D9+X1%5Inol@Xq!5hJEbv_L%NtU_^v2J3>!^2uVXQIlx0zu_X zhi6e_k1JA-D6nLW1v0Q_DPUk*n9e~34vnorS=)6HBXv2K^e$IHhf`W$d#B#hF0s{J z1A{QH{tumMvVO3+^<0u|vEtf3%G_qY{`v zJ1{J_m8b_f@EkAm#iNo%XS>qQa@6 z;JdF3q~>Ick#2b)1>i)Wwc@%iKl{;gb!`yW7@KZo_|DTA^uQ#iq6UWOg*-)O*SWLu zw#?f5fh~?{*;)6w?d1cQTgcFmB2~cy+IV|O=SYchbwjKrNIhlwx1SJYc72yan`5=G z%!9vfi*|dxt?GVH@rzCMZT)@a9}Pmn{4FfoJ+q=COj-RjVUFN0#$b>G7|u84k+L+T z-Dn4dLx-KEU;N0fGtArn@XF)>=x2uPvk$D`777WW72jhi>lqcfOi{k|l7z3nSTP5# z!!~CwsXv+%7UH?sQy)0U3xy#olAi6TwX)V4_()Pw$X#(!d425Lr(g0E99AG5J=zvd zAAYr)pW4qVXST;B`I%~TA6WYK>2Vl$P@PT0p|xa#o={Tu1mqLs|I^rtD+yuIym+DiMsoJ&~%f?W)&`uiWNyjs@Y zIdJyRwM`W_^h$0?t$J52W$=&*yHl()A){CRbIg*zGPFCYp*tS7v49i+z(xBk3e-$M zNaNYU5!+i8Iqiwg9D|1v%$ZHz*_lNva$hZ|YKYqSxx`xv0N7~=0&GU0t0%9`s__ps z+B{jW&8kVRyW3hXWxrbxK&EfZ)UUWd>W2E58~%3K@!hybuLL_ohT?onfhp3R zvM94JA|mtzf_KA?fcY9(3j;T$7au0nA)2BjV!^X_G(t?oFG=tTF9 zpge{mG>#zJ;mN$UC`Oh*Vi+S0U;#*P!LxlM!hD88S3&UVrroClr}I04vPFyu@C!HY z(m9NfV~jl(W>Tcm2bK1)a`4!*dgb@9)6bHjj^pjE>u`z$A;htqhhlla1$-%hnh|1I z2I2-_0)7mngU$;vATbQ!B=hdT=K#|Z8hpih8cQf>?RL`^ji?+MO(+uA|Y~|%lB#Tk!dq@pRo>^*lG7>Nc95#6{qWzYAI1Z38Uy0cO9)JqC z(pGRoY$;*5@aM5=3D@;ie`ZSwIM+pUyG4$=V)WGfC93~~NUe^b)_UGWP-{6>@j)zr zdn>;eJ2Q^_(FWiu4}xVvc_l*iN{-L0#{Lb;;qT=~25NG}A2)Pfz{@!?KiqG=d>EAIK>k0I3V}A0Fxo6c+%Pg*1ODUc1Q$P$gsQ!%WI-6Rf?%H!BL&7NU-n zxuVU2_aUvae>T!S%Tdf~`0Ft;oOzWPye5n{sl>+>0vdElZ}v5}^A0Mg5Sbhjm;adO z6hVb4eV_q+47C(+KSKN7lnFf#lTJS@vb%xtmB7BEigv6~5QD*US1gEwo&8mJV$SML zQIIs=&z&--xD%cVptcw1zo!%PpWz3XofU(f%@A~wXA3$$6M{}Agx?A7i$|hN6m-mB zDUHxMF*v(V>@t3kxqPvEZuYqQ;<}DPPeOI(djfYnJm^?}_lp3*8@A)A948HRxDMpT z4|76syT%>88TesTAjqNAgS$)s)42ho0znm3-Z-8xL_#1cpo#1*udG{xI6o@z^9NLE zh)|^)zIHqpha-MI0O5T1i_1rGzy&J!JurrGwm!gC9SPq793Xj%eQcDpnK*ReYsa)? zfJk>jH?3TQ*U&wNdrtb6jB@arxLPEj84qz#37L^Y=0xp|amWN@j4;s@;r(EbrVHo+oNOf-d5IYl!C1bf@#uma~16rp42 zH;s_l%Sv_CBWbnys~37?Ruls%7=ZALh;kD9Je|gvj>1vYGe4?VnSL1TS95@9E7Lr?>hB< zN9Dj>BUIEk`(nMB$+!RoIV|l?!38FWU*{xTFc!1MAPy4?_i80aC!;4ET;divLnZ-0 z@YYNMacO#z0@H^$uPXP`(V4S0H2*NfGFR62$D=nw2}lUM>sN%S2kF{VjF7PIGG0?+ zZ7qo2-uwM4>aezXvp?Du@Wz?bStlOB0DPf7<}B#YeLPc+^#E~xSnaz-yp+lObfSRt zSc`NBlMFoE0k(ezM3yTK9&;ix=#5SXd)WFQ-|Cc$fiB)m9pwjAB#<^cG&m1yrXo#X zacLG*0{u@ED(9@+jK%@acpnqz2XS7zbjCP{y<;#~RpfCSYldU7?#Ojl#>~JvXtvBe zu^ea`!|Y?A<{4&+CPBL<19@!Gj`hU8nW5ObEIOPzr-@V6EF6NzED@y$b;a&XY%7Bip{4~0I0Ww>#=|HcC;Vp!qo@^u zbvdSl(HY9CFX#5SMzNXVeM~ImVQjC0ovB~v?U|rK5O5Z-{w^Wb%zr!tjW$rilf}lq z;K0lw-TOU4o_fWYb4@6S(VLiOj_~Sc7Sae(p=QpU(Nw*5R?_!_CpX8iJ_AIzsm-zJeWF73u_AOly#P(fz9_wD@+Udo=-Q_SGAyw-%ofN{0shv$ zM&^ut4Gb$H#~X4Ls}+yzIT%(%Y$r$dQ2}R(>|z8zXJjve4VdnD*sfrXcNI){LZ1Nx zJSg={PIYdWfCjUrx_x4`&Y9}`496NvQ}Yp4>RH0Rk2*q8@?@#;9#Sr{r68<_G($Oe zLhLelsqlA{a{%O#>i&P1b3cGFUp_6RpsYV0Y0Ui+afOK;N{R)ZF9c^gO50T zcL;@j2G2jzZx%*$UO%|uon9j`=t~CC3!pMu0u$a$M*R!D{KDP?{oy;Yxe1XWJq9{8 z-r|C>zieW`h+euBOeq-QMe4+Yu^1~Dx6bvd2!F5E7#d|}VZXD#KPpF4SpL^5#j{lO zy7D(q{B`909f0h*X7+{3jdpvRbO0 zS0Xo?68)zX&w&!TBvG}ohStYY7zEU#xc!x@u`#F+(H#?_1UbEh16XEtJZBN#$_a*MZM|{b$_jsSnIHzz` zBo>`tdeu9HL)h1BDtUokWo!pz30@2Wp#i=poKITAM4uTdz~4`@fg}MbX#tn*mQ2WC zD`r?}Wmp?z*jQ!QuA_hOBa1BZ`Qclt4QHiD?MDU?yO2N^=;K5qML9`wonzgV*f@jv zaRwY#qb-N3LN!uUktEkTlImltS+fnN8F1K*w!Tyqx{;zPB4S$~D`7Bys{x1e?nH2w z(}*D2RQn)x3F`8fasLohGi?NhIcA|9b=cdBoGU5mOai50AgD5PSr{@c_4Pm>chO6<3Mb; ziK~>0=`tYpn0z`_wiFS&-Mhf0uLFqHs@K(g=&lCDPV(J2yTTE%NN2gz+)zYJA*6Z% zjRwR9tn?~tt{`G7JES~K&DiE^7*rSI+lsahQhkD2`D)YH>ZCXc!gGS$ZD(2$)mF=y zncBCTv+yz}(`t2mb&~7JLY@-<(n>nI~!BcMJ#_g zy0ak(U2yp(XwHW1=z_yvhVFdEMHkL|V>IWpKy<<88=*Tc912k-nZ-9mb6i}}1%q#Z z?yPr47j!-g%~|h&E(rW@2=>&4gx*BAlTw??q|D1C!&!-LL?2eX(to@N>?9nX3(znz z8iwN)I^tj?uO1FhIT~oh4Gz`{H_&j#F*ppd(J;|Wnp4M-AneHA_4zp|_s0(}t2cUO zUKY49YUQmyj;&@&hLIMNNrc!0mdZN05;mcTE%0dB5@o3#w=W>SKI%2v5DbW?V> zB=V)(@K+z;xH9-&v8B#IKDAk?^P-*j(`sB;3re)%j|zl{h%>!tl&=jNS+>*weu znJzR#i}e`~L|5r_Ddx^t$EM+2AQc(xLMpsfrbxDEt1`Xp+&=P#nc4`t$o!fp z)m>cg4O)3QQe+k4@J|~hngoGQ*zPq(q`gff^Z0_^GUEDF6mbx0r-h|D2G&d_o;?9@f@=%VOhMC7E*Iv zm##g1&EZ@+rmdJAM29}xH@8i{`S`PW3tqat(x)s zCy(qg!Pafwq=-?o46(<=eo}D>-=!zTSHvf!*Tp9ne-@uyN>i*me`gh!ape;YTOWT? z;XuU`Hd{$T#oR7~b$oqs*KT{d+`n?Av`lTif=GprFaEb;T}_V&279{VS*`U9#|ul{ z0^yV1of!1l_et)zb`X0|^^goKd_ zN6K5_?r_QI-O4K}NJYM1+reh~tX+lKQN1>OpA4BJ$6i+RyIPu!cqE#ZW#xytr+xC-ZT zDZyp2dX=KtPs9IU8!_uJL%HzAwoX;u;rIEO5O4;^6bYq;1gX8T!!C*( zPG7@O2^B?U1gb~2<_do9nr23qu#61Cu_LZrWjL8f^)yjW6ZIrKAqFhvD zL7=UpXLZ5qvbhCwOAEU@3yYck7AT}HE~;)|s--_J6b16>_MQ#QA8F`^)6kzzV<*kYRQ6no@v0F!A^g)X^vE1FJB|E>Y3zSHjh&QtQuS*p zi`yy6<{UdD3uW0KI`Gb`KZ+Ex9 z8|pmXo(|v28~wo!e}J?zQyMw~Fvi$$WoJ*vSnFD-)5)7~ceD{123H3C-jHaRYk|EB zp*+;q*#VKd5Cq!m)}XauXGh!ZFi2m~-r1pdhrGd%uE$%e>}>Dq3Hd9%UEbwwfws^Z z1@7t&wfL^Nh8#9m(U;VP0^RPqkj>^>=?&_kptmj5?e@7`wV;(O_jb4W^iH@B?YVvR zx~p!EUUQ?5)hqS+gF%!!yzPqauCLNt{6XQ9`l`x6XNSMuyF38fh?knPq`EFt*4Ejr zqalAOIlyUitzz13b9LFU-qz9W4~E>nb2GDEuP=h3b75#*i5`HqajjI!R!V)HJs~bp z&N~ZhQZ_Wl7w~(7E|^fPyJ0atAq*0$+ZpI~wUKMvxmQtI0nhW8B*f>_yW?0_XeHd) zLU<8X^6^T#PH}|cYxTClXx+EktD}L{2nviCxw^G0*wzth1&?Zp!EX1qbtrmkTSuD$ zKS5acw@~Ek`tt5>E(20Wi?Yb$UQpTCsFxI$7M2?^Ue=2XiwozliIu^25ps@YVqcMg zwLBC5FlljcJ|-j%{>}FmNNvFBo9{(TOrGOZE<%yk@gjV}-eaKLpqQSB|ni-w;L-dh{T&HMnLAgX8GM^Kj_Bis8d5q}B zeMF@d*fSP*ZgfiCx=L?1NJjd}c_fiCxaB!AdIH}3C_ z8tAmgmB%pyo%V+Eh#Kg$hm^+&1Kqf#^N+8fIw+d!waS{@byU6n+@ zTm$_I16?)HuQbr}4fH$%y~sePeYrf!4D@_S1k`%p&W=8d$c#RMFpU19j7Y=T`(ktM z4X8>i?>>B+FSnya_)4lddMt(`?yvZXNAn_;SK2#7CPMzJzcJ?a>;B*Ah-4J!)t84TMuD zj}|d}0pWDoFsd^AI>Ij?+`{l;!YKueDh$7xa0>0w6K6rlyMk~E>Cs~hzl?AS<j1Y&+xYhr%)d4V)(0s z=Mvt`@aG7pkRGjJ_!ERvD3971{s`d|!lOkD-%B`!?x+gf(eRK~V^>_ZK}WqK{8WP$ z{-`eeSxuj*{AiD*$~Wj9I+j({x7`kz<2hRIkR>+Qv)1vO`xe=i-K%f3D|;w$MxTME z@Jm|wK=kyPSZwz$a0%H1vM2m56(2-#_rrqzFF-U--$wLD1pS9VoZ&Z_Jn8v6jqAA$ z^xogaqJIR@5gQ!6730v~-*0=z9X@ia7JgOhef>mDz5Br6z(Q0wFtme#0|S@ZmFRKQ z`LMsA;#=#RG}5Lj+NQx!M(i*P@BW=P!~O=XfcD8+@2MOu{Kjn=%l7Srf0*Eh%k~f} zcFcVFdZO(lOv(?P%7nZY-m4A0^)1bGSPSphj-2g5{aqxhZF(Xk>%S`Mub+Pp#$V~t z;psQ({;O;an%!``hyb^+4{UAdvrMh`xJfHN+HGw$U+#pi-a(TVp41L*Z2Ab3@0+BP zZ28gPtI;{|$o@$Sft#pbw=CNy+S7XHA8kd0p|8ceG_7Heo!Se(5j(tW^cvJ0y^1>& zKIK-NuVt=|_T8rTK5mNM4hJ${F*|IBHpO-}A{s(@4DYk_4w=fI4P62QExn({dfweh zv4Q3{k_!~gcSsw0B}?nuKq0iRe^aa{F@b-2HN6(&)#WoF^ zH~k*@M2oDoTB2J77N%p_!dWltHccxlC>5bJ1&{WEYexELyqZOF)AWS|nthLEXrW ze)K8X*t4#0sWo>OO~$@}b;j=V5Z}DU8=uhx;^l`R%QR}?eY>v^Q(%|`Fq2-6{zS6l z(_=e$y=cXBXCYd(o!qk>Ct1<&gNi0gwGG4MTZF`}S~Bwq^dA#46E?ERyL$lBI658D zd#D@HzbE~-7$koW)~;#;;UE#{#8&G1PY@pJszz%-?h|4^(Vbxn|**^#=KZl=6!b^wY$YBXi(R zriGtzg&&LF$qdK*Mnz9w3+SBroG1Kd^u3SC3+7G#3TB_jx*F5H%kdD!q}E3&4mhnU z5XG^R0NIpFG`72RhTlaoo7e}ktW+oZD)01S55}#Pmy0#bKTX5jtD)8#Jt#FJR95bL zXn>;k=cva#v&V{Q7wv~ZyNsdnJE+<9D=CC<-c1B!2>crnjA6G81g`v-vfdt@l$Ss` zXZ#}0xRDr_Nye}56ONl44UT$b>U!&GPuOWaVZQ50Xb684J{wsSbApSJk&nsp@OUuJZlXJ@iWE`wsKf zzq37I8>})tT4j2~lj*UZIDVa2zuq@rJ&4b<@0)Mk3&Ckw``ibz_ClWm>RzX5FBn6N zF+@vb`GGqxF5myXLtyU1=V_{T`~c>CAIxz@TCz;N`%Es=iSpy-P2a_$=jh#Ua?T4` zPn$Q@0lj6JW2s}A<5q{h4{ISEj+^(Ot!(K6EL2BG`8!42OqZfdpks=(2_qZS8r5$^;-Mn<{TW+>);{@qjTKHirAy_WM4_bGi;<8Y1L@qvlWMU)-J_G2f8 zJ)tAf``()cv1uO6 zKSe_r^1A@cn}3AhkNfOVTx+9X{x3io%>RyZaxibPu7}}9&FMNd^Na6P>*a|}^G+Sq zS*IQXFmL|(Btxf8ma^gB$ZSR^^TN3pAAqtictn2nH>r&O3OD}rtz`Ufk(1&hQV!!O zqIlT7%fbwkGO07s1z3VHaGbnH@rbC8?s^viiTq2eh&!B|f1`IKg(gq3JS%NQ6`7TM zFbqM*;xQhb6J;f8@AIOQ=iA)8R1W!c85_;x0Orm2WO9c*f_!^HehC}R3vvOCW0=nL z?NITBo6hoiGw;C7^B#+w&Nm!wAhINF1m>t=`%ko%c2i zOE_asGYP^2lvjqZbK)rk1*|O5iy-%b*^>7is6@+OzY&nO->V^yz5H71YG+@_y4r<{ zGDCm32q$M7iGlDugU#Ua84}0#%h5ZGixeZ-jt!?Nd4|@~E`UNAYNzJQ%>-;X0zy{) z`{t#v`BvCG{fSodk|)gdPoacDtGV#WRln~2%=f+j=?uT;48Mg-M|%dS4+usmo3_+i zW1iW9KHhvBo9BF#pb*5-+pyIW`%6dfh{?U-0Pd$z92?G%J?2ds*jg{TX3{pGUz~U; zh_tKJ`sN>9qAKC%c6Vc6E*7q)NzNkdk$wmtMX#fkYs2tS*kBtpY$&zx;oetG+L5R> z`7jO8;QQvr!O^AIey8SC;G$P*j!su(Qfm7bw%Da==kXNnV9(QdX!Hhl6(m)i%W08K z*^WkMx+ytL{Dd;$e16a1oy|(L_AQ(mX=|I6-GuI;AL!}Ce)M+G`Cgm88yq+Dy>=b8 z+6NJgVUKk?w!6brTx-3b!b8RQj-0MJ=)_u|jq-+1VsN&3(?4Nv;VM7q4DUBb9>o>g z$em}Lk&cW=b<7#LGt(9Rz-4~mGl%(sN5hYyble$!-u%Ezz&~?F7M=xVpEG<2&E4&c z+++g!B+hsG9i}H7ro*OZvG*Q;3C>8%WhQ4NbXkVO^t9>e@cq_iP#vacBZ13ek+qj) zVso%HQhHg$bJ-c}$mu$Gr2ewA;k&I}D2{y6v61z}*|)_?KY^j~_i&nfe`Lnf=z)Sp zM?v}ojjboOK+(c#6v07+zX$M2-`&g-Fpoej83pPPKA(_3$Q%;J=SbiE%m7k@<%tX+ z<&pXiPN8QNfZ2d$fF}S;0WE+OT*n*_hZ;m*?;PN5sHncMkejLpETl+C& zLul%tGVxgm|ltEAq4Zqft8-+PFPt!$_-Pk0+Yh}(z zRxw~{WCj8Fvl!#H9+?K%bdK3Mmc_9_j%7O|v#5r}$!pF47Z)sYosqS3==xz1;*b?o zXT-w_70yUK3bU*vfgoxoMQg36A{EyHQwa-LrwXRR(OHoIs;sr1=0XF!%7+{~2F!^o zC019&TI%+lqEk>Bp^-C|tTMOnQ^IM`N6s`7{t4j}5(39%IUaYApIWB*J|mpw1n}vQ zM-G9JQhwx}#zUI#2@T$9B-%%fQ!Q@alZ30l-M%vd zEFRze%*Dg^tg}qd_xMJJ?`dzGPC6g;__kRUCclNbghXaoP|I58@nHgi1yW;H5rlsW zo1SMnvNdQN;5XDIuYl;pJRW%+a1zGB_V_p7{;h}X+d#U|W zb>z&_VOTRgclgzHmtHY)X4-wsoRJrs-W++l@pag8oD6y%1~ud!H&HXoEh9f|Iz-)| zI!kg|LhJyNUqA{39o)}rzI689L;>=Ys z$u@HT0?p^B%KWi5&Q>ID;rwHmzQgx`U&Qf2;N%B3hEOBK2Ba8Mi!IOaH`ZB}Dk_0c zwmN)oLxGdow2{?hxJou|w62#`fm0U16@|N5G{EO;Eh=FYI~d&%$5>>YvD9;nCFL`g zr5qEH3A3nPGq3027>nw4WHRvvh_}unc(+ATbAK580d$N&w+M@=599YTjd z5LV)--~!!!NSoVY`*G!LJ2KQ(^QK|W+XCKZi6T8~2&hSa|16VL2y44wRoZ+x3gTpn{*w#o}CRVL?v7XTqP4Fdr z_S0f;H{-j{g8w-3Jl5*Nk+sDpL>BJ&hF{>-iC6d$BS+$kt~xhVI=Tna-AHT+Vz6WJKYk+sidW4luZcml8#&;nQl z7zLaKm;=J4ik&0I#Gac{=Knu9k8P3HzUGWyE zSf`m@tkcj@Vdd1*k*SZswo~x>-3ubq_rm2L(%{Q=8a+5U^d{|quu8v)RXT)K`gHy= zOw1Mp?k8POQGyzxMf=iMOep@I@Yg@Z*CAvrf9@gE@N4b4PvJZDs$P^Q)^m*Yyn0&C zAH}jw12pe$zMj|1_59ahjIHPO)&VE3{NIo3z{5jpI`*N(qsquDXD9tTP7JZ2w_rj4 zed{17U%#IyXF(YR<$1CS`-`>p$SlJ{YikZ}I%M9MM~!+gHnXMyU=^+)m>Zco4U
Hu1D4IPV;q~|1PW?CHV6b~VZA3aXnDMp50Z7+zg z;}ot$m{EqDGAI^riWNabTB9F#5%m56g5ma^pbOfH)KWxMXF(UM+9=4S(8aSN)jI~mNvt9p$_~K5muL}m`#zMr0NMbM zdpy4T*lIs~FLbK3z^hahSZNWm^P8ypBID&VnuRGXBBiw8H9C2Z z|3%Y@;UQPk5G~7WC$SgxXCt*AU{>W1FQ?%d9%^We((;}O9auTg_c725=$HETY$1}~uZSg}Yq z960=HPl2;cWjJ#O;n^IwGZz5I!cFq3#IfEJt`JW00;d7dkTQ=+y3j*7(Rm1r=_j1% z6eUcLkRK$w?j4g`FL7)r&R`eN%!H*;RGEZTEZtk!)^d1xPr=e`hVoGDDOk(VW)cRs zg`+I8fd)9rq8}(WC8TOVm?gOD1xj0%z4Hr}viAAH!|>T;92Q+BHou^S_*jGo$oRbl zvuDi45WYjB`!0RGGxF2Mk7+;n0my?G5_AjSILU%M+W`!+0`?3FG-8lXVhSMW;Y(iN zay$Wyj5NK2N__qUL0+LsPdEM%`;?|vMqXUX4MSURpcOuJte}!UqS$4jnH4O2gFW&z zwkR)>flY`P(urh&?#_13R(to_9liUqo%6P_>}=lhJ=!E78E4ifY#^|n;O95a+XCKZ ztSnn-Wr^*j4a8({a6#oDl;0wMo?A9@>NebCg8N5s8;jwR!RyE9+RcWy2O&%g=?vOu z>dD@_}uP1jtzk1M$jj*HqBXgt%m)o7!^jNeAAH0O$!yVy| zaTCVwX4GfD9>9+1K{V<(Ibt$zx)ZK(hVQdhpm{B_kwg4~W(SU6Ag7~PCRhvt*ElN| z#Ztjy5O@t|%|Nk8uowhRS+UP$L1~uYFbKSvaa5G@1&2Z4D;XyrC9B{t2%N&a&sBty zDmV-Rr-;}3T=@MSNkZ@#1iqg0?8LJO9)rL)bDl=LY{6p?ct7XW5Kj?227zw{j{QV7 z{3w6OCwH%}$FKAK_gK=>Txy{jGwC?ZN*e`-LEwiOhrGXD za2N!BlyKqw)q=+$c*i)8dB02W7z8iMdCbEr1&=}SPH-L@nP$Ob5WG{sC-nYd;r#&! zy29^8UwRR8i=O|j6l=$4m3F-$T?Y{VHp>nwlQylDs*8b?3CxUpIG778@WJP{-% zl*RSf@CW!gFL8Rcp%Fg4I?*f*@M%N`4KRbSTLE1R@F~G!5Ugy@q5=L;uowhx;Vc^9 z6N1Gc@LXcC>GihYFbIyyI5faf!C?@1KI6~;zb-fo0xx158sKAs!yxc7aO42f1!L+I zgW%aYj}7o)!DA3Sjq}(b9uzzV!K>jsHu!^r#~^r(;3W+(xqEDSoe%>|9v!_1Urzjv zzf_Aj@r6v<%)IQhHWH6@WgXScfljQmttq3$b(U>c?g2r8jpGAnua?=2(Lc)n#5I`-W`rA2`8Ie!a ze4_5)RJ7`}szgiu&eu>5XOuAfem?YC_&1gvy z*CPI-iK`pGewlDRoP5MCbHIkjUEu?c@MGbp9pMkdpV3+-*0Gs1YUQutv6X3SCdq9A{tB7b}>r9X~e%I?4VYuVczeecb{ z)HUvZ!cWrmc(Da)2Ad9+iF2^=p?#3EZ=J<8^DN~c;`8`(hlg8B722bnL_>p8p4vvC%m-#$??1G*7S; zwR{(H9)gs1A@MlJE~GqQwRpnE%$sCB9ut4|nMCo#WK4HN#(&QomyZpZ7#A@%Vp4@a zEC0Y8nE_**efO~(I8hc(%7>JHzLPTl{Dn!;6+V`firc=de3smlJ$luvl*-DtG5?r1 zU6MAP;eJbbt0lG(b`Q=18*w&x08fz!-) z)ng-xt7;cUN)<rM^E4tjjJcmzWu_Lycmphc#Skf_IWzyq-wkLh>r$l6aW?wN-*G#tg#E*j{~ z+Q%goEm>$K2tg?vH=({um0^ zTDNMZQ<~{h&2*OH19wa?odYOnr*Ol7{RF==qY-y!G);bgfw|Fn3iPEHvf@D!aUMOz zkU=0#j%-kHr-8*z1$&fnPH=%JlRkBdjNZ-|$5<7MtU?9*F?>0tD3cQU593bsL1=Aq zeQ0HCi4*jBTPPlnl3VB+b{cc*ULBTKUIteJKL6jN+(*LeuWV){i!_ z^y~aqiUa&Czn#JR*jCgWp=BL`<_v$Pfj0>3_$_ijLaAUUD0J@>cCXm1(k!MLw82^r zh&yT!{B~Gg0n5wK%Tl^0ESGNgv(QEOY-hN_8vQxWzUD2pIzwMaiV1Bu>c##S`@ZA9 zWci?XSTpptKly%evg0V}Kc+WWotiSs-JK#$Umcz8qrzc|xl19-yf?K02IH?Ze$ zhfiulXENR4XQTgk07J3qozM*1^VFUv{HZ5=(i#4vBR2g74Smtdj|bnP`}DKvK79+_ zr=Lsr>8l>}E}w2RD$;s~%jEsp{4pwZO|y8iW74BG6~$Ar&~&z5;xVdhJnPkm=c!J` zdfr1-dN_(d=M?RH1h#;(4{l6zm~dsHvcc)lUA47~Yt@dt;J$jjTBI~|w{@&gNuSE!C!pR^ zwCp-n5r}x~ULAYx9=t-OoxKo1$Mf=j^&9h5UGL}#1O(MUXvKs6$wG|g4X)@m$f+H8 zD15$}*P_CHuMeda>J;9|x`xV1S6!W<$GW@vCWpuE)bZWmGBOr7)a#2E>Q%0)#kEaF zM)l%))N2lxhgC5hU&`2wY9Fwdh&r$T|;$ULru-%+Ip8$_qeOv34N7r zs9W4nTbWRee5zA(3l}$3CwO9UjjPsC@2*~?*EQAEyQ++hNqw55+L>rZN>xN;J~Ha? zIM5d@#SjQay}QbV2r^c2)YZ9a6DakI7sFB2O?nNzSHdVuZK4No1U!<=h`Y+jXsEui zdhyNGa+>>A`hDN2>}m14g2B#Ucm7P(8EjkS52}tDw~G1E8C2)15N6ZoQY+XyH-b}y;#jf0EmCSL+RRuJLj-sdNWR_((islO|bTo#sFS(-$we z0f`CeqH4ink0GgyYd=w*23)2Ah#Kq^nLw-~wJx5%>c;64a&F|tsv6`hq(+_6eM2qM z7ne@ZYwVZwxvQG5D#rGWa^%~%4Cyr3SyPLJ2U$Mu*UEbLO)2~-`B+AruDbf##Z8H+ zNn2cp0cOi(JQiwQi`+=Rv_SB~(5_LQ>>DRbHNu*mR}|kr7C1Kh?07)nI5A?!L4o6% z4?A8Hc$UCV3w)BmkK%r6I`EqqKdS5u;#OOB)00J#0P_4p9}m$f$tP}qX>Y<1pctVPYC_L5%`;ezgytBi|wqPJp$JRJ}B@m zfe#6MtH9}gZ8`=8J|b`hPkYg^U*H!C{D8oZ3HgHpx72X?Ljs>ATE*GlkT6z~pIC@|Sew6Mps8v#_prbca-W z_tk`m6h|AgRQW}%y|rbayy^Ox3wet>u0Q>&_nxR)Uh^%EcU>ti_KvOSSmB{P^k! zO)2!Yv;_V1aw)udOr2XgWy)kURk$M1xjZmgo;3k>>9h%&qMqXc9rp3H3(;b5jXI8z zKV@=ZmpABbR||VeipTSH#?*^uotvEP9Oa77QLf}1Z8vAXwP7a(w^LblUC}QTN6#M(Y%oeWAlP@$=tD7UIHyQJ8En6x+U|b z(5x!{+_|KxlrmD!raxuzoqv7(N?aB3bar;B8k>uBxdbn(oNxP6)KGq_T-g?A;hLxq z*eRxIt$tW;;nctT*?xEVNds;79=&^Aa;mju~r=SqjdS_K@7?K z>s2*f@PDyEi`Tx6No!ldNJxOJ@BoTO@e?lD$b5{vIL7>jaWx4}dfExfU0g-l-9LAwlNrw#r{ z-nWz(970B&cGlJk8m}O-$4XRGz2je(uOo zp6&|hEu8^xhaWlRPaU>YnOUqk^Wh+4j|@|yBxz$Id;p1LH?YY<>;jrAzEw!_^hYB2 z-X_3(8=ntif5fPq7eIccjczMrwXz)@Hg?I|dpdE)c%b4J(hBVYCv2sF<*mQCHwdbU z+xcRefpgL~V`CX++<C%&k#rYjT^n835!m+J&N zJDb1+?(r|#1nfg5FoErfzT{w}ZTbWzu)WupZGth!8CEPh6DHG&u8~(9<=sY9R(De3n(n{(N>+|H%q(OFSK ztA=_u+Uuod(kWj$X0aRt*(41n+brUndSa|-p0%Q5Vr zEF?VJ+U_k3cEkmY&9WKku~g0?k~>@Z;k(aY69&Fdj~f;$0IdXnfqH;z zcf?{31K$65EcO)O(j&3h4BYfk@dWsQ9>9%&(}llHx7RAB)fUs&rk!O*z@@Z#0zxwfqxayH4~4IhIJq4LP}6GMzc<53-y&`Mr}WbBd_`Qhahz zU)+pA(oAm%&<^@kA}q?W5SWF}7JR6^Ec8opriT%fP>i^DQ+=e0uaq1HVYr zUy{VH0Kahp{WSh7!M{@IuS%-F7W~x{@V9}#ISrru`!M)h!N>5%4=YMff!+@~#huca z)73la2U%M(BN-Wwj~PJzISu|P#AP1UhW{3iv6=tqZ@y+D4pZ@|eR?}XF8DND@^GR+ zPY+o@|AdH$j~`|@b8_#>T#%zivKHjzZ<*xCDf+>Aj-0aI^D|z{DRSiGgS8+h7orOg zV(jqXdmHo|gq~6waHeNbIz5YW`ZF@WIb}*t8DcLIiEG#-z3vUL2<%l`x;^13hJr#bPfLVPZYbob{%R{W)duM3S{s7YEt94f+m3 zADJZ&8s7&&9|S#2#BXnz^n>&C9DDEiKg_r*Gm@3jC|$^^#KdvTYG=6ol5~knNFhT< zGL-kpNqV0!O5*=53BNZ9-;spN``)JV=Hh=>@&bX&zmJ+FaC?TGmFS)=I#!==XOQmS zqGNNWok6+I)WAn{Q0UN+%fYz6VJpNG`yI_pLYT?l-wO(j10C`nG5XUtbR7PQok2{j;%HCy`Vkk8xnQZ1;q~4^FX*ntvT2{|mre+al=z-I(}Q^1n~o@Wv53wV`) za|K)|;7tOy3b;zZUO} zM8?08VF!jo5NMq;9x~rZ+Lb8K9F+BEWY~dW{TBq4VPju1pGXP@i`@q*>)SKzpeTx6 z5NIz)hpZ>(x5V*>BUp}+^<^H~EE3BJ0V!{hDp_CV1KJyrh^#O3+G>$lWFCVEJCggq z2vo`;vcAk42Sp;R7xkpQQcgnRQ%;h&%uA|BoHe38*-tXkenD1zD3EShU*^FsQNLLT z%JyY_+5h&W`ZBK`7WH=u#Z;e2$@OmojeHVc2Sh&FoC6{b*$&6X#!=!y6pZx;MEwC# z|5%d4Wk1-a6?NAo)n702$$BAhYOZXFLs?H!dz0$Rb#FBdEDm7?NANA%mxM41WTN!{ zz#= z3Mj!#C{X)Fk|3b0KR=01vUEr|4Q1IjV~DSaX#WpJBFeI!Bom%wB4h2`$R+xxaySE^ z0w3u&nOA6km#Te@CKit0k$~YyZr{KfSIi`9Gwdno8Cc&^(KE4brJ`rW_s5Fi$V61i z^~OL(f6i0nI+92~KfZ5Gq-QHK?2fUQMK5pvZig zNY7DZ9!#X0k=N`-5-VpS%4Oa%GBcHF$V1IWl90a;`6U(oBIJ!!^y!Ms3r1y`I2)1k zJ(2#^`1wd8{Sw74=4T>3SCR8Bk$$NWWn~`kQ}LH}?Fl@ZM_I~yu>>LzihENEKhQN7 zlQP~9WsE-F4=&I%6pUk1yGsOpM-rX(1tfnWiQdcf=PFz8Y({BpKL3c7bCm%>m-BQ0 zbkfs&p&g~MBVB**hY0mIKWQ8vVswjQSz{klg3t1M1$}1{{WZ|Z9(g_CDxrt=+a|O|f0Kj`h_T}u&{Ord40O_C$+I)LSz@92 zNgDZolk$~Zo?WX!&F4TjDdXqM3D6acXxN;@?#%ll>T_6VVi!}6K zgRUCnWN|1B{pmD%M$^zwr=jOy1DVPXGZ}q+KA)XNzMj#iDr$wDnSQx&+#u-4+VOm7 z5rOz@8a*e|(5GO-b|K<(hnSDw5)LXAbmb!6F1uC@-Xca9LI!Yi8v1tyym9noj`Of7RYlndRM0Ic;<*5H>Tkb@D-0hyt`+n_m!07+ zbF$(W^sb9J{VGA<06O`f{&X@OG7pEDJR>E1{eY;ufg6usGI`7kp}$BlhlPIn z!_#!g@qIpx{!yXlgcwJ$ol`D4pSMeYNXdApjDDd)e^A_b6oAg+VWFLo!-6?q&z;$P^Od3X_{OQ|teSyvn zf47o?q~lI;ot|6v;+}E31gU#_Rx7x?*cHILL|O{x7R@cejvaRvw6&}zY>vXOVz%_O zx37`*xT>AZoQ1WHDi_{9!7(yNmd9FQ*E?A&@&{(|dMSNLT`17)t_#_0^e{T^&G5E` zy4^l{VNS?avfSI<=F>ZI5!df+cl+>?i@G^_&5b^K2sDH%zWNf|wYn}}hU4Wg!?Bh6 zdRll{_jmNPLmp3BQiWnv@Yu?`@!}k>Gq{S0*M)+3sTCMYY$XW~*0;HAYU(j0i^d+QPb=k_!$%TLmf#U`MPKAuyujhnb=*; zb*<4tolbpKvC@UX3$@zqi#+ZHm3nbuap4^JtGcc;(CumyVZxQf+4-t+ZIS-aof?wP+41Nw~iU& zFLlG0l{VL^M7{Kg89futUgt(HNcg+};kTlk<9#Olh7@tr35(HEA6}u9XoB!L8&AF6 z+t!iT8D9@(A7*ylnhxK}U}pzjtN=~tJ};7IY<%qC@oN0+fO#CR8v_n>o} z2X%LSl|GhL>#Hg$r`FRum{^bhjLenfANOL6P)l2f-qY>JB%t#R^qdqRWZP-B4K0ZE)hz|6u1D;fiAZQm!iY=7VBCI=nr=I16a#2 xnPrL+tA$MWNExt7tOd?ut!5Q|64w3f_t(}`%Z$wK9?T~SJ%#d{!j>`Re*@Hp=|lhk diff --git a/examples/stencil/stencil_cu.bc b/examples/stencil/stencil_cu.bc deleted file mode 100644 index 5d9aecbe48bcf5764a18d325e4867aef3145094f..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 22616 zcmdVC3tWub_dov3rJ1S5G?OqG}}yrrQe_atG|>#j_RDK z?VQiG*0W>r-L>;R8#q>2y;EbBN1OaezAbVaI%9Fkl{l{@u|jv7-%6BWDI73A3`(Z-yKuupQaD|Q)E;9`H9mdU6V z>om(5zhFAGn9lPmP6J#EIUnjcjo4ygBIYL50N&hzNA$O6q}W+7nH8g{;l*cKJGsSc zQmC5+81@6)vo?;^6a;Lf_cTe1hOg(*2jcZ~C&$HUN9RxGX5<)9oH30RbeV7h#@NV; zj*$7&Wd@tLbh;DQJK99BfFb5n{I-Z}Ua%=_6RxNJL4olR?gZ{QSDsCrlzYvd6)Av^ zRpF^2xszSE@D$x|X7b8}>AU8e5Vu&*v_kN5;?}sQ22Mf^VVqQy$71o>^mw~C5!W?- zTpwT+>nU6(r;Tpb7%hvk&0wzbWv*JiX{}_Ft$0(2WYZe?CR;n^s*FuN7J6}lF`YGH zW{8X#Dq^CGV#aAMbGeim9M4%MW3GlzG2@Mtxd#8BLNRj~RaresN?v=&gR2HD-mZ`pr8I9c;Ys49Y)`30ZJDAIn$e8efc7ugF#0Ald zS5j2omD)%ADc~IO43veg!=xAsge5haNTrLA*w|jZ3H=wxe#R&QD8XEkG+9!l#{7dm zB8(0fa;}++Gy>g=F*n56NNt3!E${^wQY$WQJJ$Vo@%4F=|8-A2@EHZS()Sdj>B z^)=?+<3{X)TO(Wr|1zTb_29++5-SqJt$xPb$J~fAxHZyMa1ow@TT5d_sHdpS!O-S# zmee4>+8|JmGA7o=E;=vHS;`Ds&0Mi-8T%Ec^MT80kaC8zo%`qJ7~VZ#5Gd5Zt3~y@ zO#{mt6Gt0!dv#ohgt?Z*40GEgkucXtXrZe&Nd%kpZ~AfG$T`rJ>SE4I;e4mtMuso^ z`oFI3Q)XmX4y0^0+owf20G+m6yUVoE_gzM3<*~>4ZIbL_u4&c?bz`nF-=u##ms2g& zX{ghFUdMUwsq>+V13WJiV3;fXonAb6(zWJ3Ct5`-Ti61NdoNt073`~f-^EAw*+ z=W0Ve)fU^pQqbMD1Tdcqrns623WrX~ulJCqwJAJCupS6}qC9&zcU~y&v>QBND?YUQ z)K``ldEKe2eCjcxTcO8@ybWu!DW=A$%O9Vzi{wcPy@z#NdpFi|&RCk;t!G#i&Va3~ zld-sI!xEo4CN8bHs9_FInTbZ^`Z20InNc`?!v%}m^%se$#dUAVt8EJSAd+8`D^<{BcG%x-Q zwwxvRWnuIP0Xxr^<)pXK&)|k{0YYRQ^yS!wzU$Ce9{N%*q@~Q~x%wH*=ZOSt(`_52 z2;tC2S9uXhCrcs)6yjE-!0mro?Kbq`@QBg=B9@Z{J+FrojbX-^FEC{Uu>9Dgv4i&vF1RDPsk@)F3uT;3A!)!JIGY#S9df zF#`l6;D4{ZIXluA-aY%ec`X!7VCHbFZ%6Q@IRctt9EHD!`$6;1yTew`Ft|Nz-ER1j z=8RxQv0eDG96_Fdu}FAB$k=7bw&n(8GNRbd`k6Tq&Y2^%a_;Kdon`|EBacN|%5o%8 zKb)cVy*;cT;)e#{0T}W;TE(1kjH~7SM;+7qIQ(BM(3%nKspY*ysO7y>sI~kA_q~CO zVHU;m_OLQ1gS1?OJj1LAOSUz8bscKIzckk%s{duDAE4>A0<9=EbwFHhg>&X%ZD?AL z){DN7TYvAiHNVGn%76l^ynA93ZdM13B=Sl^IqNW@TSj#X45eMfXFXaLqLI`2ohwgfuEnk{j< zpcikQPZ`1wU>UI^wQn7g-uLF<_X4@IJ%?zd$wH*s41H(csH({276KD)BhJbI&zv#k z8c}Scd?%d5DP!|({i`CKVz?g#S+dz$ZX2C^<>-DLai7Q94oweW*Z$pL&*9vYqa#vwKb-Jq%S4W7JGuh(>0npk8K^vAKANUbb7aQT!7xju0RUyq5 zB~d+{JaA6RopDb7WyO!EYk^x4VhtJfR2E{sZX?e8k28f-r$)F%8>TJN_+?oiwso{8 zCr9I@x9DG|#B}EmaEs~ON2>MW8InhYAubP}1vxBwIo#55wZnS7xHd;vz|D1__eq*W zIdUtK$GQZ!2CzafOKdCNv2dbeJ2XtV>)@6nw-)n}^u;@7#KT2usr@#h>aiba{&u{s zHFeons7X7iBhomHb5*R1a}{(@)|Yw9tgppR&$L+TEi;ydzjV!Jry#D3N@3_v&b3`Q zKfc;@NtEr>*v9CE6WCh>vV=()i^qsr6st&Hlrd|Gseqv$TQ$;B-b3HHOmk_zj1_69 znP?Qf&u%=_ew`rF58y|BHaCeiqGt-C&?C8~t7EwN2la=UM~Wv?7smUs9{zqz^p*u< zX8v0}KY)ANWXyPW946Xcqf7j~t`(@6WUSpPt&Wk7D;6hH>?0$zsM9$H)|PI(BjUlb z$X*e;&a|L~3Fgi=GOa(Q){kj#`8~`7Y-GC4vEFX9Afq1UF*f34Ea#SXuZZdx!34QD zIl=;{os0Svn~TWR^|*2&^7t6(Qk?vy7T~!d(k8GQ_e-p_%2Gt)E{VJKJE^XVrIxD z9H$yYAVt~rn;3~`hUT;18N(xt=&dXyruZG=u^>j#1WV1HSs;h#cDy)ihrlsN(v$C8 zhd)J~quDq=K7y5>c5paOjGurLo2TiV#uyF6dTJtK+?`@qL~M%X3W8WKyx4B1U|dig z5R>3fQCDhC4*+5>0_gq^5R;KTq*@&>aJ3wd4{eoMd_v!;8Zq1@rhGJ3bE$px0|b#{ zEQS@!9#(nawzvkrC72g!c!V+?lwvaeWV6}p1Sf{EV8`yLRXC&O+i^zYHLs^JW&@)( zI7fmM*<@f;`r~*(K|8X-$&?Ji4BQIKak`=fRNEs)v&AfRA-HaXnG;1j@ zHWb(%!1Bc``8#pDaKNd`Qsh&EeEaa7%ZNr(HS176*B>g7vBmVL*U1RuXdeRcA| zDX$xd?dba!`a01J^;NK#>V^6mVllif^$~w2;z#_M;(?w!`gXuGbMa@y z!PM#KnJ|mmC>|>b=WP{c$x6X_n}zchpDLXmEb1&UdIv+-6t24Xuw26ic*$MPeJaTwxi$|84PK-%TG6OpBs zXcixe*%x6H9g4T=wpTO<R13LosbI9c(a_?><4G5}qP&V*9e zpw8^YJ97^2Oe$vSmWp>~15UPhuXJ8eY(Ks;D+TY&Q_VKinOo_oGjH+E$f+6PC3t5% z2O_yj*HO10@mmU=S%9~CcaSIrI0J}O&?7b!*{0sxXarL}3wtv3qeY>XzJ~hF z=KY2t{^c7Z=d-F8=i*Z4YsEXo7|HiqNAjIL``O_Ai-`r)%k&plFYq`s8*Orm@m@)k#6;4qzV^~@YOIC42TkR3) zhP22}Qmr=T#MOowl<0%3vIt8t^AI6+m90quFKH>JnD(%?UR*!7`3)f-dKS7u>H4H9?zr}$9ATT&3oHtA+aAo!pb6QI z_$NEYM)3@DxvH4z<(A)oD)qhB22HK$7(LF2SCVZfulvQV9?IW0)#`0@oeY)pP8i*R za(QX?G(Fv6h_A6`GuMhfPfob{WUN_A(phd|!_TPR{s-_K^+(ABS+A)L9?>=xXB4ja zP8f~XGUmBZLw)r?ZFI zuRM_~#HH;jDHM4Y&T}tD0#Zpz$L$M{ZcYmD)o~fD*f1AZp{*}%98%>Yw{DL8?f&T6 zX^CN4rbT)m&VQVixM&PCr3ga;Y_6!xn{M}~Bkx{7{l%_A8m4W!1n z8gy<0>ReUJhPh>k0}#M`)Yx%F4dS4xwOe1Sn#rGUn%ki!QRn{Jj&1^YV2XkK9{l>@$-QS3rX@XvP$Fo(}$jZtq7dkDNj(<;Xb-=m%m$QqqX`Id4kk2Clta zfZc1DmxH9%)J8J!DXWu#@-gk*7)ZXZ$?+;jy_lue8SK$!RWD~ck$M{dL0LthG_f_1HWWwK$gfzvheu`1b; zW<}dajCIy9W2JVHTJi%7SQ}192kT9@Qm9WfQXa;>5y(#|6jA` zVbv3;KG3ltIz1AMatc#ir1!>FF%Awl>i9?4JBiyHE0830K|d_FIIpl9HDf9 z*cYQnh@G^fihB*7Q$c|^jUI-v^Zs|n0-b7=&QyuLt5%n-%!CC%!3 zR37$iiHb64%}i}l#<5dC#*Yrmr%|-W0A!Wf)=u-@%8*8ku_`)TQS+er!@iR2rKiK@ zrWgHkcGuC{>3OriTzI6_KqdUi-^Ha+QjVMG>uE}q8EWV1q(DIPhf>W5zauGa=!(Hm z%?njqwO3hC}EzFR5=ovSKoPVylG^cqY?iJ_}I2<1Tm8uBdCcVz6OjAInNkfXCx z{k0BTo4!YN{sL%( zd6!|mKKD66JlvwVZb;!|)VssSorSO@xCv$38Iu_A4Kbw zDH&lAvE_CO^YwSf=`X_-syO|9I|#g}N(~)u<+Cq!hkE)phC- zL7mEvfutDM5ek8#+s;WnA>WSTAKN90B4_urPFly9;1XO0XIF7Gx=bre2=}83fzYHx zTg~?X$ann^fZw7XzPwgn2)CTuUA6=qk&JiL(p_v%dSZ8 z0fKH6$EQ&53?FytN~`WQ7{~o|n3N#v-}@YvGD6kjAiEFK?wL*2_2fEwxA7jd5p&Q_ zayOM3lJlgzzqR~u`6=P1$O#*%4diU;ho~5vP~$y253mLWKBnFTj!{PssDbhGa~|&3 z@sO^xRp~RL<{}^ql79DWZrZ+g#B2_^{<+=UsJEZ?Urh{_o=YVs1`jx}*73<6CRm%% z91ag^=D^C0zCuJgRWvzXCjWsD<=;?w7`%P|*y2P}=i)@ws_qf0#t@9rnCY-TbpMR! z=hseET)p!uEM;tV#p2`EshkxLYR21vSI94gP%3$hOjiXpa6zEWstOfukR>n!S6gyx z+N)kN@*B`zHCI&!y8ey^dW)Q3Bm@XEXHCbKqohsEipdFEZAzBT;8^oba`Lc5VkHM` zU?m>ejciv%c9ozywu6;HjqP60uD3|fMxTJIopiVwgETIQszYvb@jh9*57}TKUnwz123gX2I9C zn4=eNEgp6I$Nf)QU?I`VqY{*4P()wh{m(OW(PX4*OsA6|R@th;L9JS}Z9u_nK2Fkm zJ}r^Q@ZCX2j!H3nobA2I*>mg*#5ZZsg|_3s*iq^v4XZO?XaohHNW({;&8ZRF`81`q5zdOSAfb%x(Ay1ycnTy)HZwY&{CibPQ=d}V5V%L z9qsgZ%Fx0JzsB>1`%X-Gb#d)vkL-%d(A8JdD*}#xEkBlVw4)D`FKO?HDxfeZ@dH!! z77h~g*At(ZpVsFXX?>k%AZ{jZr_XcQyX)1_v_~*C;WLu?Au; zw&-hwUvt;bh_n`H;B&j`k<$2AK(M&J(n4$Vt`2yRpy`*r;jp=>|5vEIzQ-acXI$;r zdoVus4Qibr(%hj&P#^CYpks52dcS|>E zyruCcg}%r;wi*Ol_RG`5ragPdXTlg7ooG_bG(Xa4adS3v&(B+e27F4`{V;susZW$u zm{Nj-?Bws%`z-&^*=Mp&t-3VjpH$M!Ic!=uz-^z!+YgTqo6b+9Z!$;YzZ>WMW`E}H zpSP@j(rC8jlf}&?8fz&0m;wA47+1$u7}UBp$D!|$D(DBboCK~ZrHXo_D|PKU#!Nn4 zw7cVnX80*-BY!qLY23WvVE9BxusBTZy{3*g;K+ya!=|YN7Vfk7S9tz}B^8HF-}MJx zo~?!AUv;XOU9`RfwIr`sTt@OiI(`G`<7`FIJ5=7J)btZo=;tT$=exktz0UDDF?5f3*YR&ZsMtFAm?Nx=aa$E$LGC4&W~3<-D!UP zu&IgO_e=Njn~qNDzJ02MD_pv!J@vBoS$x$1CCJ^3G-@x>-?d7Hj(mZ!HtE06Qmnr& zf5K|e{3z3Aeo)H>0%rCc?$W1IRnp12(vAEMtyp2&u#mn@gc5y@NKn}Vuu+5>NljPO z=i(Q~H^A5+mC0JvijP=zdTloy$z0n(s%7(B1t>7_AY;!{1p!qz2PJO?`+MC#=BEzD zWkmDw)8G(USbFS@Me`2w_b0OFicF?uKa^tTTuYE$4x@z)ozgo2a+!wIHcZGll?@`A{ z*4L*w4V>Qx=}}k(u16(#Hn=lsXxorpFn;%&^FJ{4-;b1YKr)1b2paVb41VnjgH!*R zJ_VC~a#P~vMWk;-Vv)h(ASHiO5y~tl#0MBgI?dy)DsM%Woa^*oWXV-^w^)6>R{0cT zf8uR>@c?x_pcIjMs8j+;JFbM3N^*3GF`?Wbf4ZMX>Ow-xIVG5VCKUaK3N344NM`j1 z=|5XVNPn6VIwfFE+B))m7WYF8eUkS2a%Z}Z0jWUoCuQ%Z2GT``9fAAkiz<=<pl>eY7aug*RUIe#-(weIStF{@TRZp`|m9yeyyceYOQQ{PFM;3pHYJxhsf|AwEHollW;R(-=y*f;#FRp&>( zNSz<~qE5;LKggmOmsQTN>+}PziPpb6idNG3)Y&>oxn9*Th_q)J6qr(cW2;X{SuTB} z&MzWDrCDYIGSY=GfQVVX5>6}Y%~$9IigaUA&k<(0ILY5vq1X%f@Hm^GCuGbc-ScN#uf za)a9YnY2+#*P?|wg~X@9I8ODr(nu(_=!GgsvgKLPS?@AX=P z=m;si|8|aWYefYC4R>kBf8Mg55}%$eIf&e6DRQ4-azmAKCjy!!dsV)M3~@frQ7>82 zU%lkx)2=~}97T0#RyVuJ;>cW@T;4yqrsQLNP+Mrxo67T#9EO(EoK3y`V_=CNtOq

d>65I3bn{6Pz^%;(VVjH zpV|F_EQ7@1FUkq$Iy{(QCy4+`v#FO(ASkK?Z>`s?O{|-=0&FYSO&|*F`XL=cx_Y#C z;P^K#;@meb;+)DwD4oVtb(eVHx{5Nm5%zLiIwBLG*x&^DPqMF;Z=KbKu8MG*~}aUeTdkKPXrU*6@UlJoi$V?L`?vM1m$)LCCrO z6sLf&@MJZGoDfz+^kMZ*<;k+vKF*}UP&Hg?vUp>>2hG@)^-ryV0d0UM@Tq#v3J6hb zyZi0WhCuqVDx~=cM30U72cqHZ4|p_;$L&?hX$u(*xBgSpzOQ||O#2*4mBBHA1#C^% zD17zm88W9j73OwT7}YF-azcHqDd$yVP3i+BTV0DNH`KL=azkB9!B`K^m3ZMrKV-hU&?OCKvH%i8vtuzRY`@lA?<**lM$Q) zN+w)<1>!7<=(6t7oJt0YwMvV;2^PuTE+qj1hH&~nGF@oiOHK^Rm0BIN$6lQHMYzKR`U;W0unK+{JI&U zXrrw;&o2?V8tpn6ohe8{HnE*eo7=gCNtP^8-}0ud|7P2GTbGPpJn@;ftx2^N3R_V> zQ0%*BEj{u-<=`sQBON-oj>yd2E9$8aRW{$2s1gdQrg<;; z>kP>lano6kCtqR9xm}ywXkLl-ovw7v$*)`Uk;uckjCPd5_QU->-8anQ zQM&uNA<7-OFZ-ZM0(C;gHB^Lh2e1ZG_d`_`4%eY-7B2Y|C9oTq7o9UNfS8*A~ zj!30Fi;|*lbnEbFvcvb$RwdHegRuWQv!=UY{Fhm-(EebpT^g#qMErtKL6<4&DBvd!Wt)g?R(+~+oHoz+F95(==x>qmF~%U|836N zC=xj;q;SjwP6NCk6SQWC)J{-5debEnz3Pj)hF68ENo2H7Q6GNwA${OY-u&komsK>&nr_G ziDnu6c-W^dM=;nWch}Yc<)4*6!d&3cDq>sx{zd4F1t+%CO@0^Zm$u+M;iqQm8@pwn z8EX89A3vI3e>vN=V+EujoTQy1RQ_+JHABLWtqSW#AUMpTL0%4|;<(`n2$cI>vkpO( z7wHO#vjDJ--+RKGFCcTzCS``)qs7Q5uU4vzvIWpHgz*!cFLHG&xzg(4YCsoq6ADyFQO6c>n9?3!gFp>CiG`WIl|N5-$VAoxPE_?`f;Kz$f<<>_T*ylx)imHzTa#L0*<1@2#2TeSTm*VyOVN9Wp`#f zxolNBcQV^4jv7Yzx&II3#Q|h2?G70)u7H(|VC>34!b7;TvPdxl<;~p>R~;oY)6WL! z^E?pZTp|;Py=%c%ZLbZEHf={~%i{XR-*8yEQ+aM%Y8F)g^D06vQzsa<4Un|;=$!KK z#r<$-@d}rAeBTaqjG_bn7bRD*kOe6yV%1@-t|t!SJHeNLb8L{w)YEJwy0Y%D?fa z{}*vG=})J2M&O_~mys?#Md8(9`3*R0K?J*HD^-IE__|oF$`tcasuXiaa9j6FLE`7|Wb=1O z_B(I@m%dXS&4jZlfp*1sCf!i4I7mkyBkWA3BYmu-q=7hECMiKjN^mzjGMG5jSl*ya z+anGGgcgyS)3d0S$YR7#4m?+$MFu9Th&CH;QqNyvaxbh5cCPc{{U!x_4*z8nrD1a>(c{N<0-C0&m& zgl!hiu;gnz>{=vKi)bekHLybQI4nX)VXIOSE}I9(!sKYht`7u*7IxjEuRZZ(Bgk& zkNE8`WDd!AdYr8;$V%9>_}vtemCz1<0$Rt!lQ3EcCfA}n_)0reo8QY9 zs=-1atjGCA*u!2IY*V1zq<^0p0Uh|*xk-26)v_P@$CF!HL^jlA^-M8Fxq7A;qwT6{ zCk|THHTr$ARGJ^B^g6=$_Xwhy`C@ zzaPUW@Y1Msc(1^$-m~p6S2xBu-`K&w4l%%tO{0vh@{w zfE#y#Z{py>(FT~cX{@!?&Ey6BIha?XlUHy5LT??W)cmH*T>IpL93Y#N=9zS@?9^)@ zJJH+xmjB|JZ9vw{FKGeqoRwf1R*)#Xb&54BIT4ZVyTEtP;6isqcB{43-QqDNZCviCBN`Y)_8pCem+GoX}NhJHF##oGc10fUs52Dbyx;( zKoCwm4P-Ozuov!}mosI93w;f+&!(F`TNNj34CY{)b;oZ8vIl!JrLLO4y8_v;UumM< zFFbbx+5Br1!Ib6ZcYy5sXBgMQPj3;BUA+v8O%iT717xq-VWtx~WKX1RLkc*yyQs&68{9rMJL~kH6;Zzj&1y=E8nl;wg;bXR!{7 z*~3=%bj{)WV^NmqM{JIn7u+cjq?+?x*nyc|dT~Y!o7a}`7P6?m>w0Sh#76YUtvV=W z56jBoi{Y(g3_J6vIO+^{M>5+N+| zs!b1I!J)6qJf8lXlF*mRXT`PXMSW%LeDr#XvdcgALa#>n!k!VHS|oX32O_}#`3W$r z8m@ZMC*fn?3$8=9Zu}o!QP<_YbzCozjoxeuBS}PI>Nz{q9(#t1;hZIP! zER<;i??vS0ikSIgR{>Wpq3Op_`oeHc^nBXqG(A_Ok)xGkDd;5_Rn8TNh5lkpYRUgG zH0MGTn4&W#k$k$e=p$`~|Zv}kspJ)`-&V>uYxh+n2r!x>LsEW}(`ZbbEdsQN*? zdNHalUyN7JLe=fc@#;%abA+Ya$8sMBV#d^*CWSLnq!qGpqnu0SZ364{sG1>BJ4C>D~1Nvy9~x_XC_O2 zs!jJH>r$|3+PMj8*Wgk`1IH|qJ&T(1YG_VM=Y!PRdH_9rT2MTBX?n-f3v zOK=&ZpxER&qXpH`9~^mpi|^tm(c2YPNj~#(BJ>F#qIhqxpbQ zu4`(n@WJ!|6i4Fk&GEojr`}yW%iclP_vQ(<*Yh9Y9f$R)El2#ivxK_LlwgI=Ske$0 zwCf0yi$8i$>&s##e~veveb&NA6yOF2Tc#U}&!n(?!zoh<-q2#!9wZs4zn7&SZ0TmZb{kkNWh0kbcFp9*#{|D*s B_wN7z diff --git a/examples/stencil/stencil_cu.ll b/examples/stencil/stencil_cu.ll deleted file mode 100644 index 6ea8748c..00000000 --- a/examples/stencil/stencil_cu.ll +++ /dev/null @@ -1,762 +0,0 @@ -; ModuleID = 'stencil_cu.bc' -target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" -target triple = "x86_64-unknown-linux-gnu" - -; Function Attrs: nounwind -declare i8* @ISPCAlloc(i8**, i64, i32) #0 - -; Function Attrs: nounwind -declare void @ISPCLaunch(i8**, i8*, i8*, i32, i32, i32) #0 - -; Function Attrs: nounwind -declare void @ISPCSync(i8*) #0 - -; Function Attrs: nounwind readnone -declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) #1 - -; Function Attrs: nounwind readonly -declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8*, <4 x double>) #2 - -; Function Attrs: nounwind -declare void @llvm.x86.avx.maskstore.pd.256(i8*, <4 x double>, <4 x double>) #0 - -; Function Attrs: nounwind -define internal fastcc void @stencil_step___uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_(i32 %x0, i32 %x1, i32 %y0, i32 %y1, i32 %z0, i32 %z1, i32 %Nx, i32 %Ny, double* noalias nocapture %coef, double* noalias %vsq, double* noalias %Ain, double* noalias %Aout, <8 x i32> %__mask) #3 { -allocas: - %floatmask.i = bitcast <8 x i32> %__mask to <8 x float> - %v.i = tail call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask.i) #1 - %cmp.i = icmp eq i32 %v.i, 255 - %mul_Nx_load_Ny_load = mul i32 %Ny, %Nx - %coef_load_offset_load = load double* %coef, align 8 - %coef_load18_offset = getelementptr double* %coef, i64 1 - %coef_load18_offset_load = load double* %coef_load18_offset, align 8 - %coef_load21_offset = getelementptr double* %coef, i64 2 - %coef_load21_offset_load = load double* %coef_load21_offset, align 8 - %coef_load24_offset = getelementptr double* %coef, i64 3 - %coef_load24_offset_load = load double* %coef_load24_offset, align 8 - %less_z_load_z1_load260 = icmp slt i32 %z0, %z1 - br i1 %cmp.i, label %for_test.preheader, label %for_test264.preheader - -for_test264.preheader: ; preds = %allocas - br i1 %less_z_load_z1_load260, label %for_test275.preheader.lr.ph, label %for_exit - -for_test275.preheader.lr.ph: ; preds = %for_test264.preheader - %less_y_load282_y1_load283264 = icmp slt i32 %y0, %y1 - %less_xb_load293_x1_load294262 = icmp slt i32 %x0, %x1 - %x1_load463_broadcast_init = insertelement <8 x i32> undef, i32 %x1, i32 0 - %x1_load463_broadcast = shufflevector <8 x i32> %x1_load463_broadcast_init, <8 x i32> undef, <8 x i32> zeroinitializer - %mul__Nx_load382 = shl i32 %Nx, 1 - %mul__Nx_load431 = mul i32 %Nx, 3 - %mul__Nx_load390 = mul i32 %Nx, -2 - %mul__Nx_load439 = mul i32 %Nx, -3 - %mul__Nxy_load399 = shl i32 %mul_Nx_load_Ny_load, 1 - %mul__Nxy_load448 = mul i32 %mul_Nx_load_Ny_load, 3 - %mul__Nxy_load407 = mul i32 %mul_Nx_load_Ny_load, -2 - %mul__Nxy_load456 = mul i32 %mul_Nx_load_Ny_load, -3 - %Ain_load327_ptr2int_2void = bitcast double* %Ain to i8* - %mask0.i.i201 = shufflevector <8 x i32> %__mask, <8 x i32> undef, <8 x i32> - %mask1.i.i202 = shufflevector <8 x i32> %__mask, <8 x i32> undef, <8 x i32> - %mask0d.i.i203 = bitcast <8 x i32> %mask0.i.i201 to <4 x double> - %mask1d.i.i204 = bitcast <8 x i32> %mask1.i.i202 to <4 x double> - %coef1_load315_broadcast_init = insertelement <8 x double> undef, double %coef_load18_offset_load, i32 0 - %coef0_load306_broadcast_init = insertelement <8 x double> undef, double %coef_load_offset_load, i32 0 - %coef2_load364_broadcast_init = insertelement <8 x double> undef, double %coef_load21_offset_load, i32 0 - %coef1_load315_broadcast = shufflevector <8 x double> %coef1_load315_broadcast_init, <8 x double> undef, <8 x i32> zeroinitializer - %coef0_load306_broadcast = shufflevector <8 x double> %coef0_load306_broadcast_init, <8 x double> undef, <8 x i32> zeroinitializer - %coef3_load413_broadcast_init = insertelement <8 x double> undef, double %coef_load24_offset_load, i32 0 - %coef2_load364_broadcast = shufflevector <8 x double> %coef2_load364_broadcast_init, <8 x double> undef, <8 x i32> zeroinitializer - %coef3_load413_broadcast = shufflevector <8 x double> %coef3_load413_broadcast_init, <8 x double> undef, <8 x i32> zeroinitializer - %Aout_load488_ptr2int_2void = bitcast double* %Aout to i8* - %vsq_load494_ptr2int_2void = bitcast double* %vsq to i8* - br label %for_test275.preheader - -for_test.preheader: ; preds = %allocas - br i1 %less_z_load_z1_load260, label %for_test30.preheader.lr.ph, label %for_exit - -for_test30.preheader.lr.ph: ; preds = %for_test.preheader - %less_y_load_y1_load258 = icmp slt i32 %y0, %y1 - %less_xb_load_x1_load256 = icmp slt i32 %x0, %x1 - %x1_load199_broadcast_init = insertelement <8 x i32> undef, i32 %x1, i32 0 - %x1_load199_broadcast = shufflevector <8 x i32> %x1_load199_broadcast_init, <8 x i32> undef, <8 x i32> zeroinitializer - %mul__Nx_load119 = shl i32 %Nx, 1 - %mul__Nx_load167 = mul i32 %Nx, 3 - %mul__Nx_load127 = mul i32 %Nx, -2 - %mul__Nx_load175 = mul i32 %Nx, -3 - %mul__Nxy_load136 = shl i32 %mul_Nx_load_Ny_load, 1 - %mul__Nxy_load184 = mul i32 %mul_Nx_load_Ny_load, 3 - %mul__Nxy_load144 = mul i32 %mul_Nx_load_Ny_load, -2 - %mul__Nxy_load192 = mul i32 %mul_Nx_load_Ny_load, -3 - %Ain_load65_ptr2int_2void = bitcast double* %Ain to i8* - %coef1_load_broadcast_init = insertelement <8 x double> undef, double %coef_load18_offset_load, i32 0 - %coef0_load_broadcast_init = insertelement <8 x double> undef, double %coef_load_offset_load, i32 0 - %coef2_load_broadcast_init = insertelement <8 x double> undef, double %coef_load21_offset_load, i32 0 - %coef1_load_broadcast = shufflevector <8 x double> %coef1_load_broadcast_init, <8 x double> undef, <8 x i32> zeroinitializer - %coef0_load_broadcast = shufflevector <8 x double> %coef0_load_broadcast_init, <8 x double> undef, <8 x i32> zeroinitializer - %coef3_load_broadcast_init = insertelement <8 x double> undef, double %coef_load24_offset_load, i32 0 - %coef2_load_broadcast = shufflevector <8 x double> %coef2_load_broadcast_init, <8 x double> undef, <8 x i32> zeroinitializer - %coef3_load_broadcast = shufflevector <8 x double> %coef3_load_broadcast_init, <8 x double> undef, <8 x i32> zeroinitializer - %Aout_load219_ptr2int_2void = bitcast double* %Aout to i8* - %vsq_load_ptr2int_2void = bitcast double* %vsq to i8* - br label %for_test30.preheader - -for_test30.preheader: ; preds = %for_exit33, %for_test30.preheader.lr.ph - %z.0261 = phi i32 [ %z0, %for_test30.preheader.lr.ph ], [ %z_load242_plus1, %for_exit33 ] - br i1 %less_y_load_y1_load258, label %for_test37.preheader.lr.ph, label %for_exit33 - -for_test37.preheader.lr.ph: ; preds = %for_test30.preheader - %mul_z_load45_Nxy_load = mul i32 %z.0261, %mul_Nx_load_Ny_load - br i1 %less_xb_load_x1_load256, label %for_loop39.lr.ph.us, label %for_exit33 - -for_exit40.us: ; preds = %safe_if_after_true.us - %y_load241_plus1.us = add i32 %y.0259.us, 1 - %exitcond = icmp eq i32 %y_load241_plus1.us, %y1 - br i1 %exitcond, label %for_exit33, label %for_loop39.lr.ph.us - -for_loop39.us: ; preds = %for_loop39.lr.ph.us, %safe_if_after_true.us - %xb.0257.us = phi i32 [ %x0, %for_loop39.lr.ph.us ], [ %add_xb_load240_.us, %safe_if_after_true.us ] - %xb_load44_broadcast_init.us = insertelement <8 x i32> undef, i32 %xb.0257.us, i32 0 - %xb_load44_broadcast.us = shufflevector <8 x i32> %xb_load44_broadcast_init.us, <8 x i32> undef, <8 x i32> zeroinitializer - %add_xb_load44_broadcast_.us = add <8 x i32> %xb_load44_broadcast.us, - %less_x_load198_x1_load199_broadcast.us = icmp slt <8 x i32> %add_xb_load44_broadcast_.us, %x1_load199_broadcast - %"oldMask&test.us" = select <8 x i1> %less_x_load198_x1_load199_broadcast.us, <8 x i32> , <8 x i32> zeroinitializer - %floatmask.i244.us = bitcast <8 x i32> %"oldMask&test.us" to <8 x float> - %v.i245.us = tail call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask.i244.us) #1 - %cmp.i246.us = icmp eq i32 %v.i245.us, 0 - br i1 %cmp.i246.us, label %safe_if_after_true.us, label %safe_if_run_true.us - -safe_if_run_true.us: ; preds = %for_loop39.us - %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast.elt0.us = add i32 %xb.0257.us, %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.us - %scaled_varying.elt0.us = shl i32 %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast.elt0.us, 3 - %"varying+const_offsets.elt0.us" = add i32 %scaled_varying.elt0.us, -8 - %0 = sext i32 %"varying+const_offsets.elt0.us" to i64 - %ptr.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %0, !filename !0, !first_line !1, !first_column !2, !last_line !1, !last_column !3 - %ptr_cast_for_load.us = bitcast i8* %ptr.us to <8 x double>* - %ptr_masked_load521.us = load <8 x double>* %ptr_cast_for_load.us, align 8, !filename !0, !first_line !1, !first_column !2, !last_line !1, !last_column !3 - %"varying+const_offsets529.elt0.us" = add i32 %scaled_varying.elt0.us, 8 - %1 = sext i32 %"varying+const_offsets529.elt0.us" to i64 - %ptr530.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %1, !filename !0, !first_line !1, !first_column !4, !last_line !1, !last_column !5 - %ptr_cast_for_load531.us = bitcast i8* %ptr530.us to <8 x double>* - %ptr530_masked_load532.us = load <8 x double>* %ptr_cast_for_load531.us, align 8, !filename !0, !first_line !1, !first_column !4, !last_line !1, !last_column !5 - %"varying+const_offsets540.elt0.us" = add i32 %scaled_varying.elt0.us, -16 - %2 = sext i32 %"varying+const_offsets540.elt0.us" to i64 - %ptr541.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %2, !filename !0, !first_line !6, !first_column !2, !last_line !6, !last_column !3 - %ptr_cast_for_load542.us = bitcast i8* %ptr541.us to <8 x double>* - %ptr541_masked_load543.us = load <8 x double>* %ptr_cast_for_load542.us, align 8, !filename !0, !first_line !6, !first_column !2, !last_line !6, !last_column !3 - %"varying+const_offsets551.elt0.us" = add i32 %scaled_varying.elt0.us, 16 - %3 = sext i32 %"varying+const_offsets551.elt0.us" to i64 - %ptr552.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %3, !filename !0, !first_line !6, !first_column !4, !last_line !6, !last_column !5 - %ptr_cast_for_load553.us = bitcast i8* %ptr552.us to <8 x double>* - %ptr552_masked_load554.us = load <8 x double>* %ptr_cast_for_load553.us, align 8, !filename !0, !first_line !6, !first_column !4, !last_line !6, !last_column !5 - %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast556_mul__Nx_load71_broadcast.elt0.us = add i32 %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast556.elt0.us, %xb.0257.us - %scaled_varying560.elt0.us = shl i32 %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast556_mul__Nx_load71_broadcast.elt0.us, 3 - %4 = sext i32 %scaled_varying560.elt0.us to i64 - %ptr562.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %4, !filename !0, !first_line !2, !first_column !7, !last_line !2, !last_column !8 - %ptr_cast_for_load563.us = bitcast i8* %ptr562.us to <8 x double>* - %ptr562_masked_load564.us = load <8 x double>* %ptr_cast_for_load563.us, align 8, !filename !0, !first_line !2, !first_column !7, !last_line !2, !last_column !8 - %add_Ain_load57_offset_load_Ain_load65_offset_load.us = fadd <8 x double> %ptr_masked_load521.us, %ptr530_masked_load532.us - %"varying+const_offsets572.elt0.us" = add i32 %scaled_varying.elt0.us, -24 - %5 = sext i32 %"varying+const_offsets572.elt0.us" to i64 - %ptr573.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %5, !filename !0, !first_line !9, !first_column !2, !last_line !9, !last_column !3 - %ptr_cast_for_load574.us = bitcast i8* %ptr573.us to <8 x double>* - %ptr573_masked_load575.us = load <8 x double>* %ptr_cast_for_load574.us, align 8, !filename !0, !first_line !9, !first_column !2, !last_line !9, !last_column !3 - %"varying+const_offsets583.elt0.us" = add i32 %scaled_varying.elt0.us, 24 - %6 = sext i32 %"varying+const_offsets583.elt0.us" to i64 - %ptr584.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %6, !filename !0, !first_line !9, !first_column !4, !last_line !9, !last_column !5 - %ptr_cast_for_load585.us = bitcast i8* %ptr584.us to <8 x double>* - %ptr584_masked_load586.us = load <8 x double>* %ptr_cast_for_load585.us, align 8, !filename !0, !first_line !9, !first_column !4, !last_line !9, !last_column !5 - %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast588_mul__Nx_load119_broadcast.elt0.us = add i32 %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast588.elt0.us, %xb.0257.us - %scaled_varying593.elt0.us = shl i32 %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast588_mul__Nx_load119_broadcast.elt0.us, 3 - %7 = sext i32 %scaled_varying593.elt0.us to i64 - %ptr595.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %7, !filename !0, !first_line !10, !first_column !11, !last_line !10, !last_column !1 - %ptr_cast_for_load596.us = bitcast i8* %ptr595.us to <8 x double>* - %ptr595_masked_load597.us = load <8 x double>* %ptr_cast_for_load596.us, align 8, !filename !0, !first_line !10, !first_column !11, !last_line !10, !last_column !1 - %add_Ain_load105_offset_load_Ain_load113_offset_load.us = fadd <8 x double> %ptr541_masked_load543.us, %ptr552_masked_load554.us - %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast599_mul__Nx_load79_broadcast.elt0.us = add i32 %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast599.elt0.us, %xb.0257.us - %scaled_varying604.elt0.us = shl i32 %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast599_mul__Nx_load79_broadcast.elt0.us, 3 - %8 = sext i32 %scaled_varying604.elt0.us to i64 - %ptr606.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %8, !filename !0, !first_line !2, !first_column !12, !last_line !2, !last_column !13 - %ptr_cast_for_load607.us = bitcast i8* %ptr606.us to <8 x double>* - %ptr606_masked_load608.us = load <8 x double>* %ptr_cast_for_load607.us, align 8, !filename !0, !first_line !2, !first_column !12, !last_line !2, !last_column !13 - %add_add_Ain_load57_offset_load_Ain_load65_offset_load_Ain_load73_offset_load.us = fadd <8 x double> %add_Ain_load57_offset_load_Ain_load65_offset_load.us, %ptr562_masked_load564.us - %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast610_mul__Nx_load167_broadcast.elt0.us = add i32 %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast610.elt0.us, %xb.0257.us - %scaled_varying615.elt0.us = shl i32 %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast610_mul__Nx_load167_broadcast.elt0.us, 3 - %9 = sext i32 %scaled_varying615.elt0.us to i64 - %ptr617.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %9, !filename !0, !first_line !14, !first_column !11, !last_line !14, !last_column !1 - %ptr_cast_for_load618.us = bitcast i8* %ptr617.us to <8 x double>* - %ptr617_masked_load619.us = load <8 x double>* %ptr_cast_for_load618.us, align 8, !filename !0, !first_line !14, !first_column !11, !last_line !14, !last_column !1 - %add_Ain_load153_offset_load_Ain_load161_offset_load.us = fadd <8 x double> %ptr573_masked_load575.us, %ptr584_masked_load586.us - %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast621_mul__Nx_load127_broadcast.elt0.us = add i32 %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast621.elt0.us, %xb.0257.us - %scaled_varying626.elt0.us = shl i32 %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast621_mul__Nx_load127_broadcast.elt0.us, 3 - %10 = sext i32 %scaled_varying626.elt0.us to i64 - %ptr628.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %10, !filename !0, !first_line !10, !first_column !6, !last_line !10, !last_column !15 - %ptr_cast_for_load629.us = bitcast i8* %ptr628.us to <8 x double>* - %ptr628_masked_load630.us = load <8 x double>* %ptr_cast_for_load629.us, align 8, !filename !0, !first_line !10, !first_column !6, !last_line !10, !last_column !15 - %add_add_Ain_load105_offset_load_Ain_load113_offset_load_Ain_load121_offset_load.us = fadd <8 x double> %add_Ain_load105_offset_load_Ain_load113_offset_load.us, %ptr595_masked_load597.us - %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast632_mul__Nxy_load88_broadcast.elt0.us = add i32 %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast632.elt0.us, %xb.0257.us - %scaled_varying637.elt0.us = shl i32 %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast632_mul__Nxy_load88_broadcast.elt0.us, 3 - %11 = sext i32 %scaled_varying637.elt0.us to i64 - %ptr639.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %11, !filename !0, !first_line !12, !first_column !11, !last_line !12, !last_column !1 - %ptr_cast_for_load640.us = bitcast i8* %ptr639.us to <8 x double>* - %ptr639_masked_load641.us = load <8 x double>* %ptr_cast_for_load640.us, align 8, !filename !0, !first_line !12, !first_column !11, !last_line !12, !last_column !1 - %add_add_add_Ain_load57_offset_load_Ain_load65_offset_load_Ain_load73_offset_load_Ain_load81_offset_load.us = fadd <8 x double> %add_add_Ain_load57_offset_load_Ain_load65_offset_load_Ain_load73_offset_load.us, %ptr606_masked_load608.us - %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast643_mul__Nx_load175_broadcast.elt0.us = add i32 %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast643.elt0.us, %xb.0257.us - %scaled_varying648.elt0.us = shl i32 %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast643_mul__Nx_load175_broadcast.elt0.us, 3 - %12 = sext i32 %scaled_varying648.elt0.us to i64 - %ptr650.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %12, !filename !0, !first_line !14, !first_column !6, !last_line !14, !last_column !15 - %ptr_cast_for_load651.us = bitcast i8* %ptr650.us to <8 x double>* - %ptr650_masked_load652.us = load <8 x double>* %ptr_cast_for_load651.us, align 8, !filename !0, !first_line !14, !first_column !6, !last_line !14, !last_column !15 - %add_add_Ain_load153_offset_load_Ain_load161_offset_load_Ain_load169_offset_load.us = fadd <8 x double> %add_Ain_load153_offset_load_Ain_load161_offset_load.us, %ptr617_masked_load619.us - %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast654_mul__Nxy_load136_broadcast.elt0.us = add i32 %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast654.elt0.us, %xb.0257.us - %scaled_varying659.elt0.us = shl i32 %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast654_mul__Nxy_load136_broadcast.elt0.us, 3 - %13 = sext i32 %scaled_varying659.elt0.us to i64 - %ptr661.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %13, !filename !0, !first_line !16, !first_column !11, !last_line !16, !last_column !1 - %ptr_cast_for_load662.us = bitcast i8* %ptr661.us to <8 x double>* - %ptr661_masked_load663.us = load <8 x double>* %ptr_cast_for_load662.us, align 8, !filename !0, !first_line !16, !first_column !11, !last_line !16, !last_column !1 - %add_add_add_Ain_load105_offset_load_Ain_load113_offset_load_Ain_load121_offset_load_Ain_load129_offset_load.us = fadd <8 x double> %add_add_Ain_load105_offset_load_Ain_load113_offset_load_Ain_load121_offset_load.us, %ptr628_masked_load630.us - %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast665_mul__Nxy_load96_broadcast.elt0.us = add i32 %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast665.elt0.us, %xb.0257.us - %scaled_varying670.elt0.us = shl i32 %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast665_mul__Nxy_load96_broadcast.elt0.us, 3 - %14 = sext i32 %scaled_varying670.elt0.us to i64 - %ptr672.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %14, !filename !0, !first_line !12, !first_column !6, !last_line !12, !last_column !15 - %ptr_cast_for_load673.us = bitcast i8* %ptr672.us to <8 x double>* - %ptr672_masked_load674.us = load <8 x double>* %ptr_cast_for_load673.us, align 8, !filename !0, !first_line !12, !first_column !6, !last_line !12, !last_column !15 - %add_add_add_add_Ain_load57_offset_load_Ain_load65_offset_load_Ain_load73_offset_load_Ain_load81_offset_load_Ain_load89_offset_load.us = fadd <8 x double> %add_add_add_Ain_load57_offset_load_Ain_load65_offset_load_Ain_load73_offset_load_Ain_load81_offset_load.us, %ptr639_masked_load641.us - %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast676_mul__Nxy_load184_broadcast.elt0.us = add i32 %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast676.elt0.us, %xb.0257.us - %scaled_varying681.elt0.us = shl i32 %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast676_mul__Nxy_load184_broadcast.elt0.us, 3 - %15 = sext i32 %scaled_varying681.elt0.us to i64 - %ptr683.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %15, !filename !0, !first_line !17, !first_column !11, !last_line !17, !last_column !1 - %ptr_cast_for_load684.us = bitcast i8* %ptr683.us to <8 x double>* - %ptr683_masked_load685.us = load <8 x double>* %ptr_cast_for_load684.us, align 8, !filename !0, !first_line !17, !first_column !11, !last_line !17, !last_column !1 - %add_add_add_Ain_load153_offset_load_Ain_load161_offset_load_Ain_load169_offset_load_Ain_load177_offset_load.us = fadd <8 x double> %add_add_Ain_load153_offset_load_Ain_load161_offset_load_Ain_load169_offset_load.us, %ptr650_masked_load652.us - %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast687_mul__Nxy_load144_broadcast.elt0.us = add i32 %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast687.elt0.us, %xb.0257.us - %scaled_varying692.elt0.us = shl i32 %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast687_mul__Nxy_load144_broadcast.elt0.us, 3 - %16 = sext i32 %scaled_varying692.elt0.us to i64 - %ptr694.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %16, !filename !0, !first_line !16, !first_column !6, !last_line !16, !last_column !15 - %ptr_cast_for_load695.us = bitcast i8* %ptr694.us to <8 x double>* - %ptr694_masked_load696.us = load <8 x double>* %ptr_cast_for_load695.us, align 8, !filename !0, !first_line !16, !first_column !6, !last_line !16, !last_column !15 - %add_add_add_add_Ain_load105_offset_load_Ain_load113_offset_load_Ain_load121_offset_load_Ain_load129_offset_load_Ain_load137_offset_load.us = fadd <8 x double> %add_add_add_Ain_load105_offset_load_Ain_load113_offset_load_Ain_load121_offset_load_Ain_load129_offset_load.us, %ptr661_masked_load663.us - %add_add_add_add_add_Ain_load57_offset_load_Ain_load65_offset_load_Ain_load73_offset_load_Ain_load81_offset_load_Ain_load89_offset_load_Ain_load97_offset_load.us = fadd <8 x double> %add_add_add_add_Ain_load57_offset_load_Ain_load65_offset_load_Ain_load73_offset_load_Ain_load81_offset_load_Ain_load89_offset_load.us, %ptr672_masked_load674.us - %17 = sext i32 %scaled_varying.elt0.us to i64 - %ptr705.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %17, !filename !0, !first_line !8, !first_column !18, !last_line !8, !last_column !19 - %ptr_cast_for_load706.us = bitcast i8* %ptr705.us to <8 x double>* - %ptr705_masked_load707.us = load <8 x double>* %ptr_cast_for_load706.us, align 8, !filename !0, !first_line !8, !first_column !18, !last_line !8, !last_column !19 - %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast709_mul__Nxy_load192_broadcast.elt0.us = add i32 %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast709.elt0.us, %xb.0257.us - %scaled_varying714.elt0.us = shl i32 %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast709_mul__Nxy_load192_broadcast.elt0.us, 3 - %18 = sext i32 %scaled_varying714.elt0.us to i64 - %ptr716.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %18, !filename !0, !first_line !17, !first_column !6, !last_line !17, !last_column !15 - %ptr_cast_for_load717.us = bitcast i8* %ptr716.us to <8 x double>* - %ptr716_masked_load718.us = load <8 x double>* %ptr_cast_for_load717.us, align 8, !filename !0, !first_line !17, !first_column !6, !last_line !17, !last_column !15 - %add_add_add_add_Ain_load153_offset_load_Ain_load161_offset_load_Ain_load169_offset_load_Ain_load177_offset_load_Ain_load185_offset_load.us = fadd <8 x double> %add_add_add_Ain_load153_offset_load_Ain_load161_offset_load_Ain_load169_offset_load_Ain_load177_offset_load.us, %ptr683_masked_load685.us - %add_add_add_add_add_Ain_load105_offset_load_Ain_load113_offset_load_Ain_load121_offset_load_Ain_load129_offset_load_Ain_load137_offset_load_Ain_load145_offset_load.us = fadd <8 x double> %add_add_add_add_Ain_load105_offset_load_Ain_load113_offset_load_Ain_load121_offset_load_Ain_load129_offset_load_Ain_load137_offset_load.us, %ptr694_masked_load696.us - %mul_coef1_load_broadcast_add_add_add_add_add_Ain_load57_offset_load_Ain_load65_offset_load_Ain_load73_offset_load_Ain_load81_offset_load_Ain_load89_offset_load_Ain_load97_offset_load.us = fmul <8 x double> %coef1_load_broadcast, %add_add_add_add_add_Ain_load57_offset_load_Ain_load65_offset_load_Ain_load73_offset_load_Ain_load81_offset_load_Ain_load89_offset_load_Ain_load97_offset_load.us - %mul_coef0_load_broadcast_Ain_load_offset_load.us = fmul <8 x double> %coef0_load_broadcast, %ptr705_masked_load707.us - %add_add_add_add_add_Ain_load153_offset_load_Ain_load161_offset_load_Ain_load169_offset_load_Ain_load177_offset_load_Ain_load185_offset_load_Ain_load193_offset_load.us = fadd <8 x double> %add_add_add_add_Ain_load153_offset_load_Ain_load161_offset_load_Ain_load169_offset_load_Ain_load177_offset_load_Ain_load185_offset_load.us, %ptr716_masked_load718.us - %mul_coef2_load_broadcast_add_add_add_add_add_Ain_load105_offset_load_Ain_load113_offset_load_Ain_load121_offset_load_Ain_load129_offset_load_Ain_load137_offset_load_Ain_load145_offset_load.us = fmul <8 x double> %coef2_load_broadcast, %add_add_add_add_add_Ain_load105_offset_load_Ain_load113_offset_load_Ain_load121_offset_load_Ain_load129_offset_load_Ain_load137_offset_load_Ain_load145_offset_load.us - %add_mul_coef0_load_broadcast_Ain_load_offset_load_mul_coef1_load_broadcast_add_add_add_add_add_Ain_load57_offset_load_Ain_load65_offset_load_Ain_load73_offset_load_Ain_load81_offset_load_Ain_load89_offset_load_Ain_load97_offset_load.us = fadd <8 x double> %mul_coef1_load_broadcast_add_add_add_add_add_Ain_load57_offset_load_Ain_load65_offset_load_Ain_load73_offset_load_Ain_load81_offset_load_Ain_load89_offset_load_Ain_load97_offset_load.us, %mul_coef0_load_broadcast_Ain_load_offset_load.us - %mul_coef3_load_broadcast_add_add_add_add_add_Ain_load153_offset_load_Ain_load161_offset_load_Ain_load169_offset_load_Ain_load177_offset_load_Ain_load185_offset_load_Ain_load193_offset_load.us = fmul <8 x double> %coef3_load_broadcast, %add_add_add_add_add_Ain_load153_offset_load_Ain_load161_offset_load_Ain_load169_offset_load_Ain_load177_offset_load_Ain_load185_offset_load_Ain_load193_offset_load.us - %add_add_mul_coef0_load_broadcast_Ain_load_offset_load_mul_coef1_load_broadcast_add_add_add_add_add_Ain_load57_offset_load_Ain_load65_offset_load_Ain_load73_offset_load_Ain_load81_offset_load_Ain_load89_offset_load_Ain_load97_offset_load_mul_coef2_load_broadcast_add_add_add_add_add_Ain_load105_offset_load_Ain_load113_offset_load_Ain_load121_offset_load_Ain_load129_offset_load_Ain_load137_offset_load_Ain_load145_offset_load.us = fadd <8 x double> %mul_coef2_load_broadcast_add_add_add_add_add_Ain_load105_offset_load_Ain_load113_offset_load_Ain_load121_offset_load_Ain_load129_offset_load_Ain_load137_offset_load_Ain_load145_offset_load.us, %add_mul_coef0_load_broadcast_Ain_load_offset_load_mul_coef1_load_broadcast_add_add_add_add_add_Ain_load57_offset_load_Ain_load65_offset_load_Ain_load73_offset_load_Ain_load81_offset_load_Ain_load89_offset_load_Ain_load97_offset_load.us - %add_add_add_mul_coef0_load_broadcast_Ain_load_offset_load_mul_coef1_load_broadcast_add_add_add_add_add_Ain_load57_offset_load_Ain_load65_offset_load_Ain_load73_offset_load_Ain_load81_offset_load_Ain_load89_offset_load_Ain_load97_offset_load_mul_coef2_load_broadcast_add_add_add_add_add_Ain_load105_offset_load_Ain_load113_offset_load_Ain_load121_offset_load_Ain_load129_offset_load_Ain_load137_offset_load_Ain_load145_offset_load_mul_coef3_load_broadcast_add_add_add_add_add_Ain_load153_offset_load_Ain_load161_offset_load_Ain_load169_offset_load_Ain_load177_offset_load_Ain_load185_offset_load_Ain_load193_offset_load.us = fadd <8 x double> %add_add_mul_coef0_load_broadcast_Ain_load_offset_load_mul_coef1_load_broadcast_add_add_add_add_add_Ain_load57_offset_load_Ain_load65_offset_load_Ain_load73_offset_load_Ain_load81_offset_load_Ain_load89_offset_load_Ain_load97_offset_load_mul_coef2_load_broadcast_add_add_add_add_add_Ain_load105_offset_load_Ain_load113_offset_load_Ain_load121_offset_load_Ain_load129_offset_load_Ain_load137_offset_load_Ain_load145_offset_load.us, %mul_coef3_load_broadcast_add_add_add_add_add_Ain_load153_offset_load_Ain_load161_offset_load_Ain_load169_offset_load_Ain_load177_offset_load_Ain_load185_offset_load_Ain_load193_offset_load.us - %mask0.i.i234.us = shufflevector <8 x i32> %"oldMask&test.us", <8 x i32> undef, <8 x i32> - %mask1.i.i235.us = shufflevector <8 x i32> %"oldMask&test.us", <8 x i32> undef, <8 x i32> - %mask0d.i.i236.us = bitcast <8 x i32> %mask0.i.i234.us to <4 x double> - %mask1d.i.i237.us = bitcast <8 x i32> %mask1.i.i235.us to <4 x double> - %val0d.i.i238.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr705.us, <4 x double> %mask0d.i.i236.us) #0 - %ptr727.sum.us = add i64 %17, 32 - %ptr1.i.i239.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %ptr727.sum.us - %val1d.i.i240.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i239.us, <4 x double> %mask1d.i.i237.us) #0 - %vald.i.i241.us = shufflevector <4 x double> %val0d.i.i238.us, <4 x double> %val1d.i.i240.us, <8 x i32> - %mul__Ain_load211_offset_load.us = fmul <8 x double> %vald.i.i241.us, - %ptr736.us = getelementptr i8* %Aout_load219_ptr2int_2void, i64 %17, !filename !0, !first_line !20, !first_column !21, !last_line !20, !last_column !22 - %val0d.i.i228.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr736.us, <4 x double> %mask0d.i.i236.us) #0 - %ptr1.i.i229.us = getelementptr i8* %Aout_load219_ptr2int_2void, i64 %ptr727.sum.us - %val1d.i.i230.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i229.us, <4 x double> %mask1d.i.i237.us) #0 - %vald.i.i231.us = shufflevector <4 x double> %val0d.i.i228.us, <4 x double> %val1d.i.i230.us, <8 x i32> - %sub_mul__Ain_load211_offset_load_Aout_load219_offset_load.us = fsub <8 x double> %mul__Ain_load211_offset_load.us, %vald.i.i231.us - %ptr745.us = getelementptr i8* %vsq_load_ptr2int_2void, i64 %17, !filename !0, !first_line !23, !first_column !24, !last_line !23, !last_column !7 - %val0d.i.i218.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr745.us, <4 x double> %mask0d.i.i236.us) #0 - %ptr1.i.i219.us = getelementptr i8* %vsq_load_ptr2int_2void, i64 %ptr727.sum.us - %val1d.i.i220.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i219.us, <4 x double> %mask1d.i.i237.us) #0 - %vald.i.i221.us = shufflevector <4 x double> %val0d.i.i218.us, <4 x double> %val1d.i.i220.us, <8 x i32> - %mul_vsq_load_offset_load_div_load.us = fmul <8 x double> %add_add_add_mul_coef0_load_broadcast_Ain_load_offset_load_mul_coef1_load_broadcast_add_add_add_add_add_Ain_load57_offset_load_Ain_load65_offset_load_Ain_load73_offset_load_Ain_load81_offset_load_Ain_load89_offset_load_Ain_load97_offset_load_mul_coef2_load_broadcast_add_add_add_add_add_Ain_load105_offset_load_Ain_load113_offset_load_Ain_load121_offset_load_Ain_load129_offset_load_Ain_load137_offset_load_Ain_load145_offset_load_mul_coef3_load_broadcast_add_add_add_add_add_Ain_load153_offset_load_Ain_load161_offset_load_Ain_load169_offset_load_Ain_load177_offset_load_Ain_load185_offset_load_Ain_load193_offset_load.us, %vald.i.i221.us - %add_sub_mul__Ain_load211_offset_load_Aout_load219_offset_load_mul_vsq_load_offset_load_div_load.us = fadd <8 x double> %sub_mul__Ain_load211_offset_load_Aout_load219_offset_load.us, %mul_vsq_load_offset_load_div_load.us - %val0.i.i.us = shufflevector <8 x double> %add_sub_mul__Ain_load211_offset_load_Aout_load219_offset_load_mul_vsq_load_offset_load_div_load.us, <8 x double> undef, <4 x i32> - %val1.i.i.us = shufflevector <8 x double> %add_sub_mul__Ain_load211_offset_load_Aout_load219_offset_load_mul_vsq_load_offset_load_div_load.us, <8 x double> undef, <4 x i32> - call void @llvm.x86.avx.maskstore.pd.256(i8* %ptr736.us, <4 x double> %mask0d.i.i236.us, <4 x double> %val0.i.i.us) #0 - call void @llvm.x86.avx.maskstore.pd.256(i8* %ptr1.i.i229.us, <4 x double> %mask1d.i.i237.us, <4 x double> %val1.i.i.us) #0 - br label %safe_if_after_true.us - -safe_if_after_true.us: ; preds = %safe_if_run_true.us, %for_loop39.us - %add_xb_load240_.us = add i32 %xb.0257.us, 8 - %less_xb_load_x1_load.us = icmp slt i32 %add_xb_load240_.us, %x1 - br i1 %less_xb_load_x1_load.us, label %for_loop39.us, label %for_exit40.us - -for_loop39.lr.ph.us: ; preds = %for_exit40.us, %for_test37.preheader.lr.ph - %y.0259.us = phi i32 [ %y_load241_plus1.us, %for_exit40.us ], [ %y0, %for_test37.preheader.lr.ph ] - %mul_y_load46_Nx_load47.us = mul i32 %y.0259.us, %Nx - %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.us = add i32 %mul_y_load46_Nx_load47.us, %mul_z_load45_Nxy_load - %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast556.elt0.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.us, %Nx - %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast588.elt0.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.us, %mul__Nx_load119 - %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast599.elt0.us = sub i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.us, %Nx - %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast610.elt0.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.us, %mul__Nx_load167 - %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast621.elt0.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.us, %mul__Nx_load127 - %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast632.elt0.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.us, %mul_Nx_load_Ny_load - %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast643.elt0.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.us, %mul__Nx_load175 - %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast654.elt0.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.us, %mul__Nxy_load136 - %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast665.elt0.us = sub i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.us, %mul_Nx_load_Ny_load - %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast676.elt0.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.us, %mul__Nxy_load184 - %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast687.elt0.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.us, %mul__Nxy_load144 - %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast709.elt0.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.us, %mul__Nxy_load192 - br label %for_loop39.us - -for_exit: ; preds = %for_exit278, %for_exit33, %for_test.preheader, %for_test264.preheader - ret void - -for_exit33: ; preds = %for_exit40.us, %for_test37.preheader.lr.ph, %for_test30.preheader - %z_load242_plus1 = add i32 %z.0261, 1 - %exitcond269 = icmp eq i32 %z_load242_plus1, %z1 - br i1 %exitcond269, label %for_exit, label %for_test30.preheader - -for_test275.preheader: ; preds = %for_exit278, %for_test275.preheader.lr.ph - %z269.0268 = phi i32 [ %z0, %for_test275.preheader.lr.ph ], [ %z_load518_plus1, %for_exit278 ] - br i1 %less_y_load282_y1_load283264, label %for_test286.preheader.lr.ph, label %for_exit278 - -for_test286.preheader.lr.ph: ; preds = %for_test275.preheader - %mul_z_load300_Nxy_load301 = mul i32 %z269.0268, %mul_Nx_load_Ny_load - br i1 %less_xb_load293_x1_load294262, label %for_loop288.lr.ph.us, label %for_exit278 - -for_exit289.us: ; preds = %safe_if_after_true466.us - %y_load517_plus1.us = add i32 %y280.0265.us, 1 - %exitcond271 = icmp eq i32 %y_load517_plus1.us, %y1 - br i1 %exitcond271, label %for_exit278, label %for_loop288.lr.ph.us - -for_loop288.us: ; preds = %for_loop288.lr.ph.us, %safe_if_after_true466.us - %xb291.0263.us = phi i32 [ %x0, %for_loop288.lr.ph.us ], [ %add_xb291_load_.us, %safe_if_after_true466.us ] - %xb_load298_broadcast_init.us = insertelement <8 x i32> undef, i32 %xb291.0263.us, i32 0 - %xb_load298_broadcast.us = shufflevector <8 x i32> %xb_load298_broadcast_init.us, <8 x i32> undef, <8 x i32> zeroinitializer - %add_xb_load298_broadcast_.us = add <8 x i32> %xb_load298_broadcast.us, - %less_x_load462_x1_load463_broadcast.us = icmp slt <8 x i32> %add_xb_load298_broadcast_.us, %x1_load463_broadcast - %"oldMask&test468.us" = select <8 x i1> %less_x_load462_x1_load463_broadcast.us, <8 x i32> , <8 x i32> zeroinitializer - %"internal_mask&function_mask472.us" = and <8 x i32> %"oldMask&test468.us", %__mask - %floatmask.i211.us = bitcast <8 x i32> %"internal_mask&function_mask472.us" to <8 x float> - %v.i212.us = tail call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask.i211.us) #1 - %cmp.i213.us = icmp eq i32 %v.i212.us, 0 - br i1 %cmp.i213.us, label %safe_if_after_true466.us, label %safe_if_run_true467.us - -safe_if_run_true467.us: ; preds = %for_loop288.us - %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast.elt0.us = add i32 %xb291.0263.us, %add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303.us - %scaled_varying757.elt0.us = shl i32 %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast.elt0.us, 3 - %"varying+const_offsets.elt0758.us" = add i32 %scaled_varying757.elt0.us, -8 - %19 = sext i32 %"varying+const_offsets.elt0758.us" to i64 - %ptr759.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %19, !filename !0, !first_line !1, !first_column !2, !last_line !1, !last_column !3 - %val0d.i.i205.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr759.us, <4 x double> %mask0d.i.i203) #0 - %ptr759.sum.us = add i64 %19, 32 - %ptr1.i.i206.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr759.sum.us - %val1d.i.i207.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i206.us, <4 x double> %mask1d.i.i204) #0 - %vald.i.i208.us = shufflevector <4 x double> %val0d.i.i205.us, <4 x double> %val1d.i.i207.us, <8 x i32> - %"varying+const_offsets767.elt0.us" = add i32 %scaled_varying757.elt0.us, 8 - %20 = sext i32 %"varying+const_offsets767.elt0.us" to i64 - %ptr768.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %20, !filename !0, !first_line !1, !first_column !4, !last_line !1, !last_column !5 - %val0d.i.i195.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr768.us, <4 x double> %mask0d.i.i203) #0 - %ptr768.sum.us = add i64 %20, 32 - %ptr1.i.i196.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr768.sum.us - %val1d.i.i197.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i196.us, <4 x double> %mask1d.i.i204) #0 - %vald.i.i198.us = shufflevector <4 x double> %val0d.i.i195.us, <4 x double> %val1d.i.i197.us, <8 x i32> - %"varying+const_offsets776.elt0.us" = add i32 %scaled_varying757.elt0.us, -16 - %21 = sext i32 %"varying+const_offsets776.elt0.us" to i64 - %ptr777.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %21, !filename !0, !first_line !6, !first_column !2, !last_line !6, !last_column !3 - %val0d.i.i185.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr777.us, <4 x double> %mask0d.i.i203) #0 - %ptr777.sum.us = add i64 %21, 32 - %ptr1.i.i186.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr777.sum.us - %val1d.i.i187.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i186.us, <4 x double> %mask1d.i.i204) #0 - %vald.i.i188.us = shufflevector <4 x double> %val0d.i.i185.us, <4 x double> %val1d.i.i187.us, <8 x i32> - %"varying+const_offsets785.elt0.us" = add i32 %scaled_varying757.elt0.us, 16 - %22 = sext i32 %"varying+const_offsets785.elt0.us" to i64 - %ptr786.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %22, !filename !0, !first_line !6, !first_column !4, !last_line !6, !last_column !5 - %val0d.i.i175.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr786.us, <4 x double> %mask0d.i.i203) #0 - %ptr786.sum.us = add i64 %22, 32 - %ptr1.i.i176.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr786.sum.us - %val1d.i.i177.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i176.us, <4 x double> %mask1d.i.i204) #0 - %vald.i.i178.us = shufflevector <4 x double> %val0d.i.i175.us, <4 x double> %val1d.i.i177.us, <8 x i32> - %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast788_mul__Nx_load333_broadcast.elt0.us = add i32 %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast788.elt0.us, %xb291.0263.us - %scaled_varying793.elt0.us = shl i32 %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast788_mul__Nx_load333_broadcast.elt0.us, 3 - %23 = sext i32 %scaled_varying793.elt0.us to i64 - %ptr795.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %23, !filename !0, !first_line !2, !first_column !7, !last_line !2, !last_column !8 - %val0d.i.i165.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr795.us, <4 x double> %mask0d.i.i203) #0 - %ptr795.sum.us = add i64 %23, 32 - %ptr1.i.i166.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr795.sum.us - %val1d.i.i167.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i166.us, <4 x double> %mask1d.i.i204) #0 - %vald.i.i168.us = shufflevector <4 x double> %val0d.i.i165.us, <4 x double> %val1d.i.i167.us, <8 x i32> - %add_Ain_load319_offset_load_Ain_load327_offset_load.us = fadd <8 x double> %vald.i.i208.us, %vald.i.i198.us - %"varying+const_offsets803.elt0.us" = add i32 %scaled_varying757.elt0.us, -24 - %24 = sext i32 %"varying+const_offsets803.elt0.us" to i64 - %ptr804.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %24, !filename !0, !first_line !9, !first_column !2, !last_line !9, !last_column !3 - %val0d.i.i155.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr804.us, <4 x double> %mask0d.i.i203) #0 - %ptr804.sum.us = add i64 %24, 32 - %ptr1.i.i156.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr804.sum.us - %val1d.i.i157.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i156.us, <4 x double> %mask1d.i.i204) #0 - %vald.i.i158.us = shufflevector <4 x double> %val0d.i.i155.us, <4 x double> %val1d.i.i157.us, <8 x i32> - %"varying+const_offsets812.elt0.us" = add i32 %scaled_varying757.elt0.us, 24 - %25 = sext i32 %"varying+const_offsets812.elt0.us" to i64 - %ptr813.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %25, !filename !0, !first_line !9, !first_column !4, !last_line !9, !last_column !5 - %val0d.i.i145.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr813.us, <4 x double> %mask0d.i.i203) #0 - %ptr813.sum.us = add i64 %25, 32 - %ptr1.i.i146.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr813.sum.us - %val1d.i.i147.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i146.us, <4 x double> %mask1d.i.i204) #0 - %vald.i.i148.us = shufflevector <4 x double> %val0d.i.i145.us, <4 x double> %val1d.i.i147.us, <8 x i32> - %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast815_mul__Nx_load382_broadcast.elt0.us = add i32 %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast815.elt0.us, %xb291.0263.us - %scaled_varying820.elt0.us = shl i32 %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast815_mul__Nx_load382_broadcast.elt0.us, 3 - %26 = sext i32 %scaled_varying820.elt0.us to i64 - %ptr822.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %26, !filename !0, !first_line !10, !first_column !11, !last_line !10, !last_column !1 - %val0d.i.i135.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr822.us, <4 x double> %mask0d.i.i203) #0 - %ptr822.sum.us = add i64 %26, 32 - %ptr1.i.i136.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr822.sum.us - %val1d.i.i137.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i136.us, <4 x double> %mask1d.i.i204) #0 - %vald.i.i138.us = shufflevector <4 x double> %val0d.i.i135.us, <4 x double> %val1d.i.i137.us, <8 x i32> - %add_Ain_load368_offset_load_Ain_load376_offset_load.us = fadd <8 x double> %vald.i.i188.us, %vald.i.i178.us - %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast824_mul__Nx_load341_broadcast.elt0.us = add i32 %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast824.elt0.us, %xb291.0263.us - %scaled_varying829.elt0.us = shl i32 %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast824_mul__Nx_load341_broadcast.elt0.us, 3 - %27 = sext i32 %scaled_varying829.elt0.us to i64 - %ptr831.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %27, !filename !0, !first_line !2, !first_column !12, !last_line !2, !last_column !13 - %val0d.i.i125.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr831.us, <4 x double> %mask0d.i.i203) #0 - %ptr831.sum.us = add i64 %27, 32 - %ptr1.i.i126.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr831.sum.us - %val1d.i.i127.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i126.us, <4 x double> %mask1d.i.i204) #0 - %vald.i.i128.us = shufflevector <4 x double> %val0d.i.i125.us, <4 x double> %val1d.i.i127.us, <8 x i32> - %add_add_Ain_load319_offset_load_Ain_load327_offset_load_Ain_load335_offset_load.us = fadd <8 x double> %add_Ain_load319_offset_load_Ain_load327_offset_load.us, %vald.i.i168.us - %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast833_mul__Nx_load431_broadcast.elt0.us = add i32 %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast833.elt0.us, %xb291.0263.us - %scaled_varying838.elt0.us = shl i32 %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast833_mul__Nx_load431_broadcast.elt0.us, 3 - %28 = sext i32 %scaled_varying838.elt0.us to i64 - %ptr840.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %28, !filename !0, !first_line !14, !first_column !11, !last_line !14, !last_column !1 - %val0d.i.i115.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr840.us, <4 x double> %mask0d.i.i203) #0 - %ptr840.sum.us = add i64 %28, 32 - %ptr1.i.i116.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr840.sum.us - %val1d.i.i117.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i116.us, <4 x double> %mask1d.i.i204) #0 - %vald.i.i118.us = shufflevector <4 x double> %val0d.i.i115.us, <4 x double> %val1d.i.i117.us, <8 x i32> - %add_Ain_load417_offset_load_Ain_load425_offset_load.us = fadd <8 x double> %vald.i.i158.us, %vald.i.i148.us - %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast842_mul__Nx_load390_broadcast.elt0.us = add i32 %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast842.elt0.us, %xb291.0263.us - %scaled_varying847.elt0.us = shl i32 %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast842_mul__Nx_load390_broadcast.elt0.us, 3 - %29 = sext i32 %scaled_varying847.elt0.us to i64 - %ptr849.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %29, !filename !0, !first_line !10, !first_column !6, !last_line !10, !last_column !15 - %val0d.i.i105.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr849.us, <4 x double> %mask0d.i.i203) #0 - %ptr849.sum.us = add i64 %29, 32 - %ptr1.i.i106.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr849.sum.us - %val1d.i.i107.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i106.us, <4 x double> %mask1d.i.i204) #0 - %vald.i.i108.us = shufflevector <4 x double> %val0d.i.i105.us, <4 x double> %val1d.i.i107.us, <8 x i32> - %add_add_Ain_load368_offset_load_Ain_load376_offset_load_Ain_load384_offset_load.us = fadd <8 x double> %add_Ain_load368_offset_load_Ain_load376_offset_load.us, %vald.i.i138.us - %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast851_mul__Nxy_load350_broadcast.elt0.us = add i32 %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast851.elt0.us, %xb291.0263.us - %scaled_varying856.elt0.us = shl i32 %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast851_mul__Nxy_load350_broadcast.elt0.us, 3 - %30 = sext i32 %scaled_varying856.elt0.us to i64 - %ptr858.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %30, !filename !0, !first_line !12, !first_column !11, !last_line !12, !last_column !1 - %val0d.i.i95.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr858.us, <4 x double> %mask0d.i.i203) #0 - %ptr858.sum.us = add i64 %30, 32 - %ptr1.i.i96.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr858.sum.us - %val1d.i.i97.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i96.us, <4 x double> %mask1d.i.i204) #0 - %vald.i.i98.us = shufflevector <4 x double> %val0d.i.i95.us, <4 x double> %val1d.i.i97.us, <8 x i32> - %add_add_add_Ain_load319_offset_load_Ain_load327_offset_load_Ain_load335_offset_load_Ain_load343_offset_load.us = fadd <8 x double> %add_add_Ain_load319_offset_load_Ain_load327_offset_load_Ain_load335_offset_load.us, %vald.i.i128.us - %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast860_mul__Nx_load439_broadcast.elt0.us = add i32 %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast860.elt0.us, %xb291.0263.us - %scaled_varying865.elt0.us = shl i32 %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast860_mul__Nx_load439_broadcast.elt0.us, 3 - %31 = sext i32 %scaled_varying865.elt0.us to i64 - %ptr867.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %31, !filename !0, !first_line !14, !first_column !6, !last_line !14, !last_column !15 - %val0d.i.i85.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr867.us, <4 x double> %mask0d.i.i203) #0 - %ptr867.sum.us = add i64 %31, 32 - %ptr1.i.i86.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr867.sum.us - %val1d.i.i87.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i86.us, <4 x double> %mask1d.i.i204) #0 - %vald.i.i88.us = shufflevector <4 x double> %val0d.i.i85.us, <4 x double> %val1d.i.i87.us, <8 x i32> - %add_add_Ain_load417_offset_load_Ain_load425_offset_load_Ain_load433_offset_load.us = fadd <8 x double> %add_Ain_load417_offset_load_Ain_load425_offset_load.us, %vald.i.i118.us - %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast869_mul__Nxy_load399_broadcast.elt0.us = add i32 %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast869.elt0.us, %xb291.0263.us - %scaled_varying874.elt0.us = shl i32 %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast869_mul__Nxy_load399_broadcast.elt0.us, 3 - %32 = sext i32 %scaled_varying874.elt0.us to i64 - %ptr876.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %32, !filename !0, !first_line !16, !first_column !11, !last_line !16, !last_column !1 - %val0d.i.i75.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr876.us, <4 x double> %mask0d.i.i203) #0 - %ptr876.sum.us = add i64 %32, 32 - %ptr1.i.i76.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr876.sum.us - %val1d.i.i77.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i76.us, <4 x double> %mask1d.i.i204) #0 - %vald.i.i78.us = shufflevector <4 x double> %val0d.i.i75.us, <4 x double> %val1d.i.i77.us, <8 x i32> - %add_add_add_Ain_load368_offset_load_Ain_load376_offset_load_Ain_load384_offset_load_Ain_load392_offset_load.us = fadd <8 x double> %add_add_Ain_load368_offset_load_Ain_load376_offset_load_Ain_load384_offset_load.us, %vald.i.i108.us - %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast878_mul__Nxy_load358_broadcast.elt0.us = add i32 %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast878.elt0.us, %xb291.0263.us - %scaled_varying883.elt0.us = shl i32 %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast878_mul__Nxy_load358_broadcast.elt0.us, 3 - %33 = sext i32 %scaled_varying883.elt0.us to i64 - %ptr885.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %33, !filename !0, !first_line !12, !first_column !6, !last_line !12, !last_column !15 - %val0d.i.i65.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr885.us, <4 x double> %mask0d.i.i203) #0 - %ptr885.sum.us = add i64 %33, 32 - %ptr1.i.i66.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr885.sum.us - %val1d.i.i67.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i66.us, <4 x double> %mask1d.i.i204) #0 - %vald.i.i68.us = shufflevector <4 x double> %val0d.i.i65.us, <4 x double> %val1d.i.i67.us, <8 x i32> - %add_add_add_add_Ain_load319_offset_load_Ain_load327_offset_load_Ain_load335_offset_load_Ain_load343_offset_load_Ain_load351_offset_load.us = fadd <8 x double> %add_add_add_Ain_load319_offset_load_Ain_load327_offset_load_Ain_load335_offset_load_Ain_load343_offset_load.us, %vald.i.i98.us - %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast887_mul__Nxy_load448_broadcast.elt0.us = add i32 %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast887.elt0.us, %xb291.0263.us - %scaled_varying892.elt0.us = shl i32 %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast887_mul__Nxy_load448_broadcast.elt0.us, 3 - %34 = sext i32 %scaled_varying892.elt0.us to i64 - %ptr894.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %34, !filename !0, !first_line !17, !first_column !11, !last_line !17, !last_column !1 - %val0d.i.i55.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr894.us, <4 x double> %mask0d.i.i203) #0 - %ptr894.sum.us = add i64 %34, 32 - %ptr1.i.i56.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr894.sum.us - %val1d.i.i57.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i56.us, <4 x double> %mask1d.i.i204) #0 - %vald.i.i58.us = shufflevector <4 x double> %val0d.i.i55.us, <4 x double> %val1d.i.i57.us, <8 x i32> - %add_add_add_Ain_load417_offset_load_Ain_load425_offset_load_Ain_load433_offset_load_Ain_load441_offset_load.us = fadd <8 x double> %add_add_Ain_load417_offset_load_Ain_load425_offset_load_Ain_load433_offset_load.us, %vald.i.i88.us - %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast896_mul__Nxy_load407_broadcast.elt0.us = add i32 %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast896.elt0.us, %xb291.0263.us - %scaled_varying901.elt0.us = shl i32 %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast896_mul__Nxy_load407_broadcast.elt0.us, 3 - %35 = sext i32 %scaled_varying901.elt0.us to i64 - %ptr903.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %35, !filename !0, !first_line !16, !first_column !6, !last_line !16, !last_column !15 - %val0d.i.i45.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr903.us, <4 x double> %mask0d.i.i203) #0 - %ptr903.sum.us = add i64 %35, 32 - %ptr1.i.i46.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr903.sum.us - %val1d.i.i47.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i46.us, <4 x double> %mask1d.i.i204) #0 - %vald.i.i48.us = shufflevector <4 x double> %val0d.i.i45.us, <4 x double> %val1d.i.i47.us, <8 x i32> - %add_add_add_add_Ain_load368_offset_load_Ain_load376_offset_load_Ain_load384_offset_load_Ain_load392_offset_load_Ain_load400_offset_load.us = fadd <8 x double> %add_add_add_Ain_load368_offset_load_Ain_load376_offset_load_Ain_load384_offset_load_Ain_load392_offset_load.us, %vald.i.i78.us - %add_add_add_add_add_Ain_load319_offset_load_Ain_load327_offset_load_Ain_load335_offset_load_Ain_load343_offset_load_Ain_load351_offset_load_Ain_load359_offset_load.us = fadd <8 x double> %add_add_add_add_Ain_load319_offset_load_Ain_load327_offset_load_Ain_load335_offset_load_Ain_load343_offset_load_Ain_load351_offset_load.us, %vald.i.i68.us - %36 = sext i32 %scaled_varying757.elt0.us to i64 - %ptr912.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %36, !filename !0, !first_line !8, !first_column !18, !last_line !8, !last_column !19 - %val0d.i.i35.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr912.us, <4 x double> %mask0d.i.i203) #0 - %ptr912.sum.us = add i64 %36, 32 - %ptr1.i.i36.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr912.sum.us - %val1d.i.i37.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i36.us, <4 x double> %mask1d.i.i204) #0 - %vald.i.i38.us = shufflevector <4 x double> %val0d.i.i35.us, <4 x double> %val1d.i.i37.us, <8 x i32> - %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast914_mul__Nxy_load456_broadcast.elt0.us = add i32 %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast914.elt0.us, %xb291.0263.us - %scaled_varying919.elt0.us = shl i32 %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast914_mul__Nxy_load456_broadcast.elt0.us, 3 - %37 = sext i32 %scaled_varying919.elt0.us to i64 - %ptr921.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %37, !filename !0, !first_line !17, !first_column !6, !last_line !17, !last_column !15 - %val0d.i.i25.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr921.us, <4 x double> %mask0d.i.i203) #0 - %ptr921.sum.us = add i64 %37, 32 - %ptr1.i.i26.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr921.sum.us - %val1d.i.i27.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i26.us, <4 x double> %mask1d.i.i204) #0 - %vald.i.i28.us = shufflevector <4 x double> %val0d.i.i25.us, <4 x double> %val1d.i.i27.us, <8 x i32> - %add_add_add_add_Ain_load417_offset_load_Ain_load425_offset_load_Ain_load433_offset_load_Ain_load441_offset_load_Ain_load449_offset_load.us = fadd <8 x double> %add_add_add_Ain_load417_offset_load_Ain_load425_offset_load_Ain_load433_offset_load_Ain_load441_offset_load.us, %vald.i.i58.us - %add_add_add_add_add_Ain_load368_offset_load_Ain_load376_offset_load_Ain_load384_offset_load_Ain_load392_offset_load_Ain_load400_offset_load_Ain_load408_offset_load.us = fadd <8 x double> %add_add_add_add_Ain_load368_offset_load_Ain_load376_offset_load_Ain_load384_offset_load_Ain_load392_offset_load_Ain_load400_offset_load.us, %vald.i.i48.us - %mul_coef1_load315_broadcast_add_add_add_add_add_Ain_load319_offset_load_Ain_load327_offset_load_Ain_load335_offset_load_Ain_load343_offset_load_Ain_load351_offset_load_Ain_load359_offset_load.us = fmul <8 x double> %coef1_load315_broadcast, %add_add_add_add_add_Ain_load319_offset_load_Ain_load327_offset_load_Ain_load335_offset_load_Ain_load343_offset_load_Ain_load351_offset_load_Ain_load359_offset_load.us - %mul_coef0_load306_broadcast_Ain_load310_offset_load.us = fmul <8 x double> %coef0_load306_broadcast, %vald.i.i38.us - %add_add_add_add_add_Ain_load417_offset_load_Ain_load425_offset_load_Ain_load433_offset_load_Ain_load441_offset_load_Ain_load449_offset_load_Ain_load457_offset_load.us = fadd <8 x double> %add_add_add_add_Ain_load417_offset_load_Ain_load425_offset_load_Ain_load433_offset_load_Ain_load441_offset_load_Ain_load449_offset_load.us, %vald.i.i28.us - %mul_coef2_load364_broadcast_add_add_add_add_add_Ain_load368_offset_load_Ain_load376_offset_load_Ain_load384_offset_load_Ain_load392_offset_load_Ain_load400_offset_load_Ain_load408_offset_load.us = fmul <8 x double> %coef2_load364_broadcast, %add_add_add_add_add_Ain_load368_offset_load_Ain_load376_offset_load_Ain_load384_offset_load_Ain_load392_offset_load_Ain_load400_offset_load_Ain_load408_offset_load.us - %add_mul_coef0_load306_broadcast_Ain_load310_offset_load_mul_coef1_load315_broadcast_add_add_add_add_add_Ain_load319_offset_load_Ain_load327_offset_load_Ain_load335_offset_load_Ain_load343_offset_load_Ain_load351_offset_load_Ain_load359_offset_load.us = fadd <8 x double> %mul_coef1_load315_broadcast_add_add_add_add_add_Ain_load319_offset_load_Ain_load327_offset_load_Ain_load335_offset_load_Ain_load343_offset_load_Ain_load351_offset_load_Ain_load359_offset_load.us, %mul_coef0_load306_broadcast_Ain_load310_offset_load.us - %mul_coef3_load413_broadcast_add_add_add_add_add_Ain_load417_offset_load_Ain_load425_offset_load_Ain_load433_offset_load_Ain_load441_offset_load_Ain_load449_offset_load_Ain_load457_offset_load.us = fmul <8 x double> %coef3_load413_broadcast, %add_add_add_add_add_Ain_load417_offset_load_Ain_load425_offset_load_Ain_load433_offset_load_Ain_load441_offset_load_Ain_load449_offset_load_Ain_load457_offset_load.us - %add_add_mul_coef0_load306_broadcast_Ain_load310_offset_load_mul_coef1_load315_broadcast_add_add_add_add_add_Ain_load319_offset_load_Ain_load327_offset_load_Ain_load335_offset_load_Ain_load343_offset_load_Ain_load351_offset_load_Ain_load359_offset_load_mul_coef2_load364_broadcast_add_add_add_add_add_Ain_load368_offset_load_Ain_load376_offset_load_Ain_load384_offset_load_Ain_load392_offset_load_Ain_load400_offset_load_Ain_load408_offset_load.us = fadd <8 x double> %mul_coef2_load364_broadcast_add_add_add_add_add_Ain_load368_offset_load_Ain_load376_offset_load_Ain_load384_offset_load_Ain_load392_offset_load_Ain_load400_offset_load_Ain_load408_offset_load.us, %add_mul_coef0_load306_broadcast_Ain_load310_offset_load_mul_coef1_load315_broadcast_add_add_add_add_add_Ain_load319_offset_load_Ain_load327_offset_load_Ain_load335_offset_load_Ain_load343_offset_load_Ain_load351_offset_load_Ain_load359_offset_load.us - %add_add_add_mul_coef0_load306_broadcast_Ain_load310_offset_load_mul_coef1_load315_broadcast_add_add_add_add_add_Ain_load319_offset_load_Ain_load327_offset_load_Ain_load335_offset_load_Ain_load343_offset_load_Ain_load351_offset_load_Ain_load359_offset_load_mul_coef2_load364_broadcast_add_add_add_add_add_Ain_load368_offset_load_Ain_load376_offset_load_Ain_load384_offset_load_Ain_load392_offset_load_Ain_load400_offset_load_Ain_load408_offset_load_mul_coef3_load413_broadcast_add_add_add_add_add_Ain_load417_offset_load_Ain_load425_offset_load_Ain_load433_offset_load_Ain_load441_offset_load_Ain_load449_offset_load_Ain_load457_offset_load.us = fadd <8 x double> %add_add_mul_coef0_load306_broadcast_Ain_load310_offset_load_mul_coef1_load315_broadcast_add_add_add_add_add_Ain_load319_offset_load_Ain_load327_offset_load_Ain_load335_offset_load_Ain_load343_offset_load_Ain_load351_offset_load_Ain_load359_offset_load_mul_coef2_load364_broadcast_add_add_add_add_add_Ain_load368_offset_load_Ain_load376_offset_load_Ain_load384_offset_load_Ain_load392_offset_load_Ain_load400_offset_load_Ain_load408_offset_load.us, %mul_coef3_load413_broadcast_add_add_add_add_add_Ain_load417_offset_load_Ain_load425_offset_load_Ain_load433_offset_load_Ain_load441_offset_load_Ain_load449_offset_load_Ain_load457_offset_load.us - %mask0.i.i11.us = shufflevector <8 x i32> %"internal_mask&function_mask472.us", <8 x i32> undef, <8 x i32> - %mask1.i.i12.us = shufflevector <8 x i32> %"internal_mask&function_mask472.us", <8 x i32> undef, <8 x i32> - %mask0d.i.i13.us = bitcast <8 x i32> %mask0.i.i11.us to <4 x double> - %mask1d.i.i14.us = bitcast <8 x i32> %mask1.i.i12.us to <4 x double> - %val0d.i.i15.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr912.us, <4 x double> %mask0d.i.i13.us) #0 - %val1d.i.i17.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i36.us, <4 x double> %mask1d.i.i14.us) #0 - %vald.i.i18.us = shufflevector <4 x double> %val0d.i.i15.us, <4 x double> %val1d.i.i17.us, <8 x i32> - %mul__Ain_load480_offset_load.us = fmul <8 x double> %vald.i.i18.us, - %ptr939.us = getelementptr i8* %Aout_load488_ptr2int_2void, i64 %36, !filename !0, !first_line !20, !first_column !21, !last_line !20, !last_column !22 - %val0d.i.i5.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr939.us, <4 x double> %mask0d.i.i13.us) #0 - %ptr1.i.i6.us = getelementptr i8* %Aout_load488_ptr2int_2void, i64 %ptr912.sum.us - %val1d.i.i7.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i6.us, <4 x double> %mask1d.i.i14.us) #0 - %vald.i.i8.us = shufflevector <4 x double> %val0d.i.i5.us, <4 x double> %val1d.i.i7.us, <8 x i32> - %sub_mul__Ain_load480_offset_load_Aout_load488_offset_load.us = fsub <8 x double> %mul__Ain_load480_offset_load.us, %vald.i.i8.us - %ptr948.us = getelementptr i8* %vsq_load494_ptr2int_2void, i64 %36, !filename !0, !first_line !23, !first_column !24, !last_line !23, !last_column !7 - %val0d.i.i.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr948.us, <4 x double> %mask0d.i.i13.us) #0 - %ptr1.i.i.us = getelementptr i8* %vsq_load494_ptr2int_2void, i64 %ptr912.sum.us - %val1d.i.i.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i.us, <4 x double> %mask1d.i.i14.us) #0 - %vald.i.i.us = shufflevector <4 x double> %val0d.i.i.us, <4 x double> %val1d.i.i.us, <8 x i32> - %mul_vsq_load494_offset_load_div_load499.us = fmul <8 x double> %add_add_add_mul_coef0_load306_broadcast_Ain_load310_offset_load_mul_coef1_load315_broadcast_add_add_add_add_add_Ain_load319_offset_load_Ain_load327_offset_load_Ain_load335_offset_load_Ain_load343_offset_load_Ain_load351_offset_load_Ain_load359_offset_load_mul_coef2_load364_broadcast_add_add_add_add_add_Ain_load368_offset_load_Ain_load376_offset_load_Ain_load384_offset_load_Ain_load392_offset_load_Ain_load400_offset_load_Ain_load408_offset_load_mul_coef3_load413_broadcast_add_add_add_add_add_Ain_load417_offset_load_Ain_load425_offset_load_Ain_load433_offset_load_Ain_load441_offset_load_Ain_load449_offset_load_Ain_load457_offset_load.us, %vald.i.i.us - %add_sub_mul__Ain_load480_offset_load_Aout_load488_offset_load_mul_vsq_load494_offset_load_div_load499.us = fadd <8 x double> %sub_mul__Ain_load480_offset_load_Aout_load488_offset_load.us, %mul_vsq_load494_offset_load_div_load499.us - %val0.i.i253.us = shufflevector <8 x double> %add_sub_mul__Ain_load480_offset_load_Aout_load488_offset_load_mul_vsq_load494_offset_load_div_load499.us, <8 x double> undef, <4 x i32> - %val1.i.i254.us = shufflevector <8 x double> %add_sub_mul__Ain_load480_offset_load_Aout_load488_offset_load_mul_vsq_load494_offset_load_div_load499.us, <8 x double> undef, <4 x i32> - call void @llvm.x86.avx.maskstore.pd.256(i8* %ptr939.us, <4 x double> %mask0d.i.i13.us, <4 x double> %val0.i.i253.us) #0 - call void @llvm.x86.avx.maskstore.pd.256(i8* %ptr1.i.i6.us, <4 x double> %mask1d.i.i14.us, <4 x double> %val1.i.i254.us) #0 - br label %safe_if_after_true466.us - -safe_if_after_true466.us: ; preds = %safe_if_run_true467.us, %for_loop288.us - %add_xb291_load_.us = add i32 %xb291.0263.us, 8 - %less_xb_load293_x1_load294.us = icmp slt i32 %add_xb291_load_.us, %x1 - br i1 %less_xb_load293_x1_load294.us, label %for_loop288.us, label %for_exit289.us - -for_loop288.lr.ph.us: ; preds = %for_exit289.us, %for_test286.preheader.lr.ph - %y280.0265.us = phi i32 [ %y_load517_plus1.us, %for_exit289.us ], [ %y0, %for_test286.preheader.lr.ph ] - %mul_y_load302_Nx_load303.us = mul i32 %y280.0265.us, %Nx - %add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303.us = add i32 %mul_y_load302_Nx_load303.us, %mul_z_load300_Nxy_load301 - %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast788.elt0.us = add i32 %add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303.us, %Nx - %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast815.elt0.us = add i32 %add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303.us, %mul__Nx_load382 - %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast824.elt0.us = sub i32 %add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303.us, %Nx - %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast833.elt0.us = add i32 %add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303.us, %mul__Nx_load431 - %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast842.elt0.us = add i32 %add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303.us, %mul__Nx_load390 - %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast851.elt0.us = add i32 %add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303.us, %mul_Nx_load_Ny_load - %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast860.elt0.us = add i32 %add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303.us, %mul__Nx_load439 - %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast869.elt0.us = add i32 %add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303.us, %mul__Nxy_load399 - %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast878.elt0.us = sub i32 %add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303.us, %mul_Nx_load_Ny_load - %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast887.elt0.us = add i32 %add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303.us, %mul__Nxy_load448 - %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast896.elt0.us = add i32 %add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303.us, %mul__Nxy_load407 - %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast914.elt0.us = add i32 %add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303.us, %mul__Nxy_load456 - br label %for_loop288.us - -for_exit278: ; preds = %for_exit289.us, %for_test286.preheader.lr.ph, %for_test275.preheader - %z_load518_plus1 = add i32 %z269.0268, 1 - %exitcond272 = icmp eq i32 %z_load518_plus1, %z1 - br i1 %exitcond272, label %for_exit, label %for_test275.preheader -} - -; Function Attrs: nounwind -define internal void @stencil_step_task___uniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_({ i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*, <8 x i32> }* noalias nocapture, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) #3 { -allocas: - %x01 = getelementptr { i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*, <8 x i32> }* %0, i64 0, i32 0 - %x02 = load i32* %x01, align 4 - %x13 = getelementptr { i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*, <8 x i32> }* %0, i64 0, i32 1 - %x14 = load i32* %x13, align 4 - %y05 = getelementptr { i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*, <8 x i32> }* %0, i64 0, i32 2 - %y06 = load i32* %y05, align 4 - %y17 = getelementptr { i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*, <8 x i32> }* %0, i64 0, i32 3 - %y18 = load i32* %y17, align 4 - %z09 = getelementptr { i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*, <8 x i32> }* %0, i64 0, i32 4 - %z010 = load i32* %z09, align 4 - %Nx11 = getelementptr { i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*, <8 x i32> }* %0, i64 0, i32 5 - %Nx12 = load i32* %Nx11, align 4 - %Ny13 = getelementptr { i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*, <8 x i32> }* %0, i64 0, i32 6 - %Ny14 = load i32* %Ny13, align 4 - %coef17 = getelementptr { i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*, <8 x i32> }* %0, i64 0, i32 8 - %coef18 = load double** %coef17, align 8 - %vsq19 = getelementptr { i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*, <8 x i32> }* %0, i64 0, i32 9 - %vsq20 = load double** %vsq19, align 8 - %Ain21 = getelementptr { i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*, <8 x i32> }* %0, i64 0, i32 10 - %Ain22 = load double** %Ain21, align 8 - %Aout23 = getelementptr { i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*, <8 x i32> }* %0, i64 0, i32 11 - %Aout24 = load double** %Aout23, align 8 - %task_struct_mask = getelementptr { i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*, <8 x i32> }* %0, i64 0, i32 12 - %mask = load <8 x i32>* %task_struct_mask, align 32 - %floatmask.i = bitcast <8 x i32> %mask to <8 x float> - %v.i = tail call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask.i) #1 - %cmp.i = icmp eq i32 %v.i, 255 - %add_z0_load_taskIndex_load = add i32 %z010, %3 - %add_z0_load27_taskIndex_load28 = add i32 %3, 1 - %add_add_z0_load27_taskIndex_load28_ = add i32 %add_z0_load27_taskIndex_load28, %z010 - br i1 %cmp.i, label %all_on, label %some_on - -all_on: ; preds = %allocas - tail call fastcc void @stencil_step___uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_(i32 %x02, i32 %x14, i32 %y06, i32 %y18, i32 %add_z0_load_taskIndex_load, i32 %add_add_z0_load27_taskIndex_load28_, i32 %Nx12, i32 %Ny14, double* %coef18, double* %vsq20, double* %Ain22, double* %Aout24, <8 x i32> ) - ret void - -some_on: ; preds = %allocas - tail call fastcc void @stencil_step___uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_(i32 %x02, i32 %x14, i32 %y06, i32 %y18, i32 %add_z0_load_taskIndex_load, i32 %add_add_z0_load27_taskIndex_load28_, i32 %Nx12, i32 %Ny14, double* %coef18, double* %vsq20, double* %Ain22, double* %Aout24, <8 x i32> %mask) - ret void -} - -; Function Attrs: nounwind -define void @loop_stencil_ispc_tasks(i32 %t0, i32 %t1, i32 %x0, i32 %x1, i32 %y0, i32 %y1, i32 %z0, i32 %z1, i32 %Nx, i32 %Ny, i32 %Nz, double* %coef, double* %vsq, double* %Aeven, double* %Aodd) #3 { -allocas: - %launch_group_handle = alloca i8*, align 8 - store i8* null, i8** %launch_group_handle, align 8 - %less_t_load_t1_load166 = icmp slt i32 %t0, %t1 - br i1 %less_t_load_t1_load166, label %for_loop.lr.ph, label %post_sync73 - -for_loop.lr.ph: ; preds = %allocas - %sub_z1_load_z0_load23 = sub i32 %z1, %z0 - br label %for_loop - -for_loop: ; preds = %post_sync, %for_loop.lr.ph - %t.0167 = phi i32 [ %t0, %for_loop.lr.ph ], [ %t_load69_plus1, %post_sync ] - %bitop = and i32 %t.0167, 1 - %equal_bitop_ = icmp eq i32 %bitop, 0 - %args_ptr = call i8* @ISPCAlloc(i8** %launch_group_handle, i64 96, i32 32) - %funarg = bitcast i8* %args_ptr to i32* - store i32 %x0, i32* %funarg, align 4 - %funarg24 = getelementptr i8* %args_ptr, i64 4 - %0 = bitcast i8* %funarg24 to i32* - store i32 %x1, i32* %0, align 4 - %funarg25 = getelementptr i8* %args_ptr, i64 8 - %1 = bitcast i8* %funarg25 to i32* - store i32 %y0, i32* %1, align 4 - %funarg26 = getelementptr i8* %args_ptr, i64 12 - %2 = bitcast i8* %funarg26 to i32* - store i32 %y1, i32* %2, align 4 - %funarg27 = getelementptr i8* %args_ptr, i64 16 - %3 = bitcast i8* %funarg27 to i32* - store i32 %z0, i32* %3, align 4 - %funarg28 = getelementptr i8* %args_ptr, i64 20 - %4 = bitcast i8* %funarg28 to i32* - store i32 %Nx, i32* %4, align 4 - %funarg29 = getelementptr i8* %args_ptr, i64 24 - %5 = bitcast i8* %funarg29 to i32* - store i32 %Ny, i32* %5, align 4 - %funarg30 = getelementptr i8* %args_ptr, i64 28 - %6 = bitcast i8* %funarg30 to i32* - store i32 %Nz, i32* %6, align 4 - %funarg31 = getelementptr i8* %args_ptr, i64 32 - %7 = bitcast i8* %funarg31 to double** - store double* %coef, double** %7, align 8 - %funarg32 = getelementptr i8* %args_ptr, i64 40 - %8 = bitcast i8* %funarg32 to double** - store double* %vsq, double** %8, align 8 - %funarg33 = getelementptr i8* %args_ptr, i64 48 - %9 = bitcast i8* %funarg33 to double** - br i1 %equal_bitop_, label %if_then, label %if_else - -for_exit: ; preds = %post_sync - %launch_group_handle_load70.pre = load i8** %launch_group_handle, align 8 - %cmp71 = icmp eq i8* %launch_group_handle_load70.pre, null - br i1 %cmp71, label %post_sync73, label %call_sync72 - -if_then: ; preds = %for_loop - store double* %Aeven, double** %9, align 8 - %funarg34 = getelementptr i8* %args_ptr, i64 56 - %10 = bitcast i8* %funarg34 to double** - store double* %Aodd, double** %10, align 8 - %funarg_mask = getelementptr i8* %args_ptr, i64 64 - %11 = bitcast i8* %funarg_mask to <8 x i32>* - store <8 x i32> , <8 x i32>* %11, align 32 - call void @ISPCLaunch(i8** %launch_group_handle, i8* bitcast (void ({ i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*, <8 x i32> }*, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)* @stencil_step_task___uniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_ to i8*), i8* %args_ptr, i32 %sub_z1_load_z0_load23, i32 1, i32 1) - br label %if_exit - -if_else: ; preds = %for_loop - store double* %Aodd, double** %9, align 8 - %funarg64 = getelementptr i8* %args_ptr, i64 56 - %12 = bitcast i8* %funarg64 to double** - store double* %Aeven, double** %12, align 8 - %funarg_mask67 = getelementptr i8* %args_ptr, i64 64 - %13 = bitcast i8* %funarg_mask67 to <8 x i32>* - store <8 x i32> , <8 x i32>* %13, align 32 - call void @ISPCLaunch(i8** %launch_group_handle, i8* bitcast (void ({ i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*, <8 x i32> }*, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)* @stencil_step_task___uniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_ to i8*), i8* %args_ptr, i32 %sub_z1_load_z0_load23, i32 1, i32 1) - br label %if_exit - -if_exit: ; preds = %if_else, %if_then - %launch_group_handle_load = load i8** %launch_group_handle, align 8 - %cmp = icmp eq i8* %launch_group_handle_load, null - br i1 %cmp, label %post_sync, label %call_sync - -call_sync: ; preds = %if_exit - call void @ISPCSync(i8* %launch_group_handle_load) - store i8* null, i8** %launch_group_handle, align 8 - br label %post_sync - -post_sync: ; preds = %call_sync, %if_exit - %t_load69_plus1 = add i32 %t.0167, 1 - %exitcond = icmp eq i32 %t_load69_plus1, %t1 - br i1 %exitcond, label %for_exit, label %for_loop - -call_sync72: ; preds = %for_exit - call void @ISPCSync(i8* %launch_group_handle_load70.pre) - store i8* null, i8** %launch_group_handle, align 8 - br label %post_sync73 - -post_sync73: ; preds = %call_sync72, %for_exit, %allocas - ret void -} - -attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } -attributes #2 = { nounwind readonly } -attributes #3 = { nounwind "target-cpu"="corei7-avx" "target-features"="+avx,+popcnt,+cmov" } - -!0 = metadata !{metadata !"stencil.ispc"} -!1 = metadata !{i32 68} -!2 = metadata !{i32 69} -!3 = metadata !{i32 113} -!4 = metadata !{i32 22} -!5 = metadata !{i32 66} -!6 = metadata !{i32 71} -!7 = metadata !{i32 23} -!8 = metadata !{i32 67} -!9 = metadata !{i32 74} -!10 = metadata !{i32 72} -!11 = metadata !{i32 24} -!12 = metadata !{i32 70} -!13 = metadata !{i32 114} -!14 = metadata !{i32 75} -!15 = metadata !{i32 115} -!16 = metadata !{i32 73} -!17 = metadata !{i32 76} -!18 = metadata !{i32 21} -!19 = metadata !{i32 64} -!20 = metadata !{i32 79} -!21 = metadata !{i32 112} -!22 = metadata !{i32 156} -!23 = metadata !{i32 80} -!24 = metadata !{i32 13} diff --git a/examples/stencil/stencil_cu.s b/examples/stencil/stencil_cu.s deleted file mode 100644 index a10402a9..00000000 --- a/examples/stencil/stencil_cu.s +++ /dev/null @@ -1,1134 +0,0 @@ - .file "stencil_cu.ll" - .section .rodata.cst16,"aM",@progbits,16 - .align 16 -.LCPI0_0: - .long 4 # 0x4 - .long 5 # 0x5 - .long 6 # 0x6 - .long 7 # 0x7 -.LCPI0_1: - .long 0 # 0x0 - .long 1 # 0x1 - .long 2 # 0x2 - .long 3 # 0x3 - .section .rodata,"a",@progbits - .align 32 -.LCPI0_2: - .quad 4611686018427387904 # double 2.000000e+00 - .quad 4611686018427387904 # double 2.000000e+00 - .quad 4611686018427387904 # double 2.000000e+00 - .quad 4611686018427387904 # double 2.000000e+00 - .text - .align 16, 0x90 - .type stencil_step___uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_,@function -stencil_step___uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_: # @stencil_step___uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_ -# BB#0: # %allocas - pushq %rbp - pushq %r15 - pushq %r14 - pushq %r13 - pushq %r12 - pushq %rbx - subq $1384, %rsp # imm = 0x568 - movl %ecx, -72(%rsp) # 4-byte Spill - movl %esi, 1308(%rsp) # 4-byte Spill - movl %edi, -68(%rsp) # 4-byte Spill - movq 1456(%rsp), %rcx - vmovsd 24(%rcx), %xmm1 - vmovsd 16(%rcx), %xmm3 - movq 1472(%rsp), %rax - vmovsd (%rcx), %xmm2 - vmovsd 8(%rcx), %xmm4 - movl 1448(%rsp), %esi - vmovmskps %ymm0, %ecx - cmpl $255, %ecx - jne .LBB0_1 -# BB#7: # %for_test.preheader - cmpl %r9d, %r8d - jge .LBB0_6 -# BB#8: # %for_test30.preheader.lr.ph - leal -3(%r8), %ecx - leal 2(%r8), %r13d - leal -1(%r8), %edi - leal 3(%r8), %ebp - movl %esi, %r11d - imull %r11d, %ebp - movl %ebp, %ebx - imull %r11d, %edi - movl %edi, %ebp - imull %r11d, %r13d - imull %r8d, %esi - imull %r11d, %ecx - leal -2(%r8), %r10d - imull %r11d, %r10d - leal 1(%r8), %r14d - imull %r11d, %r14d - movl %edx, -96(%rsp) # 4-byte Spill - addl %edx, %r14d - addl %edx, %r10d - addl %edx, %ecx - movl %ecx, 1344(%rsp) # 4-byte Spill - movl %r9d, -92(%rsp) # 4-byte Spill - leal 1(%rdx,%rsi), %r15d - leal 2(%rdx,%rsi), %edi - addl %edx, %r13d - addl %edx, %ebp - movl %ebp, 1216(%rsp) # 4-byte Spill - addl %edx, %ebx - movl %ebx, 1152(%rsp) # 4-byte Spill - leal -1(%rdx,%rsi), %ebp - leal 3(%rdx,%rsi), %ecx - leal (%rdx,%rsi), %r12d - leal -3(%rdx,%rsi), %ebx - movl %ebx, 1184(%rsp) # 4-byte Spill - movl %r8d, -88(%rsp) # 4-byte Spill - leal -2(%rdx,%rsi), %edx - vmovd 1308(%rsp), %xmm0 # 4-byte Folded Reload - movl 1440(%rsp), %r9d - imull %r9d, %r13d - imull %r9d, %ecx - movl %ecx, 1312(%rsp) # 4-byte Spill - imull %r9d, %ebp - movl %ebp, 1248(%rsp) # 4-byte Spill - imull %r9d, %edi - imull %r9d, %r15d - movl 1344(%rsp), %ecx # 4-byte Reload - imull %r9d, %ecx - movl %ecx, 1344(%rsp) # 4-byte Spill - imull %r9d, %r10d - movl 1152(%rsp), %ebx # 4-byte Reload - imull %r9d, %ebx - movl 1216(%rsp), %ebp # 4-byte Reload - imull %r9d, %ebp - imull %r9d, %r14d - movl 1184(%rsp), %r8d # 4-byte Reload - imull %r9d, %r8d - imull %r9d, %edx - movl %edx, 1216(%rsp) # 4-byte Spill - imull %r9d, %r12d - movl -68(%rsp), %edx # 4-byte Reload - leal (,%rdx,8), %edx - leal -16(%rdx,%r12,8), %esi - movl %esi, 76(%rsp) # 4-byte Spill - leal (%rdx,%r12,8), %ecx - movl %ecx, 72(%rsp) # 4-byte Spill - leal (%rdx,%r15,8), %ecx - movl %ecx, 68(%rsp) # 4-byte Spill - movl -92(%rsp), %ecx # 4-byte Reload - leal (%rdx,%rdi,8), %esi - movl %esi, 64(%rsp) # 4-byte Spill - movl 1248(%rsp), %esi # 4-byte Reload - leal (%rdx,%rsi,8), %esi - movl %esi, 60(%rsp) # 4-byte Spill - movl 1312(%rsp), %esi # 4-byte Reload - leal (%rdx,%rsi,8), %esi - movl %esi, 56(%rsp) # 4-byte Spill - movl 1216(%rsp), %esi # 4-byte Reload - leal (%rdx,%rsi,8), %esi - movl %esi, 52(%rsp) # 4-byte Spill - movl -88(%rsp), %esi # 4-byte Reload - leal (%rdx,%r8,8), %edi - movl %edi, 48(%rsp) # 4-byte Spill - leal (%rdx,%r14,8), %edi - movl %edi, 44(%rsp) # 4-byte Spill - leal (%rdx,%r13,8), %edi - movl %edi, 40(%rsp) # 4-byte Spill - leal (%rdx,%rbp,8), %edi - movl %edi, 36(%rsp) # 4-byte Spill - leal (%rdx,%rbx,8), %edi - movl %edi, 32(%rsp) # 4-byte Spill - leal (%rdx,%r10,8), %edi - movl %edi, 28(%rsp) # 4-byte Spill - movl 1344(%rsp), %edi # 4-byte Reload - leal (%rdx,%rdi,8), %edx - movl %edx, 24(%rsp) # 4-byte Spill - movl $0, -100(%rsp) # 4-byte Folded Spill - imull %r9d, %r11d - shll $3, %r9d - movl %r9d, -76(%rsp) # 4-byte Spill - shll $3, %r11d - movl %r11d, -104(%rsp) # 4-byte Spill - vpermilpd $0, %xmm3, %xmm3 # xmm3 = xmm3[0,0] - vpermilpd $0, %xmm2, %xmm2 # xmm2 = xmm2[0,0] - vpermilpd $0, %xmm1, %xmm1 # xmm1 = xmm1[0,0] - vpshufd $0, %xmm0, %xmm0 # xmm0 = xmm0[0,0,0,0] - vinsertf128 $1, %xmm1, %ymm1, %ymm1 - vmovupd %ymm1, 1312(%rsp) # 32-byte Folded Spill - vinsertf128 $1, %xmm3, %ymm3, %ymm1 - vmovupd %ymm1, 1344(%rsp) # 32-byte Folded Spill - vinsertf128 $1, %xmm2, %ymm2, %ymm15 - vmovupd %ymm15, -32(%rsp) # 32-byte Folded Spill - vpermilpd $0, %xmm4, %xmm1 # xmm1 = xmm4[0,0] - vinsertf128 $1, %xmm1, %ymm1, %ymm14 - vmovupd %ymm14, -64(%rsp) # 32-byte Folded Spill - vinsertf128 $1, %xmm0, %ymm0, %ymm0 - vmovups %ymm0, 1248(%rsp) # 32-byte Folded Spill - vmovapd .LCPI0_2(%rip), %ymm13 - .align 16, 0x90 -.LBB0_9: # %for_test30.preheader - # =>This Loop Header: Depth=1 - # Child Loop BB0_16 Depth 2 - # Child Loop BB0_12 Depth 3 - movl %esi, -88(%rsp) # 4-byte Spill - movl -96(%rsp), %edx # 4-byte Reload - cmpl -72(%rsp), %edx # 4-byte Folded Reload - jge .LBB0_11 -# BB#10: # %for_test37.preheader.lr.ph - # in Loop: Header=BB0_9 Depth=1 - movl -68(%rsp), %edx # 4-byte Reload - cmpl 1308(%rsp), %edx # 4-byte Folded Reload - movl -100(%rsp), %edx # 4-byte Reload - movl -96(%rsp), %edi # 4-byte Reload - jge .LBB0_11 - .align 16, 0x90 -.LBB0_16: # %for_loop39.lr.ph.us - # Parent Loop BB0_9 Depth=1 - # => This Loop Header: Depth=2 - # Child Loop BB0_12 Depth 3 - movl %edi, -84(%rsp) # 4-byte Spill - movl %edx, -80(%rsp) # 4-byte Spill - movl %edx, %r13d - movl -68(%rsp), %ecx # 4-byte Reload - .align 16, 0x90 -.LBB0_12: # %for_loop39.us - # Parent Loop BB0_9 Depth=1 - # Parent Loop BB0_16 Depth=2 - # => This Inner Loop Header: Depth=3 - movl %ecx, 1216(%rsp) # 4-byte Spill - vmovups 1248(%rsp), %ymm3 # 32-byte Folded Reload - vmovups %ymm3, 1248(%rsp) # 32-byte Folded Spill - vextractf128 $1, %ymm3, %xmm0 - vmovd %ecx, %xmm1 - vpshufd $0, %xmm1, %xmm1 # xmm1 = xmm1[0,0,0,0] - vpaddd .LCPI0_0(%rip), %xmm1, %xmm2 - vpcmpgtd %xmm2, %xmm0, %xmm0 - vpaddd .LCPI0_1(%rip), %xmm1, %xmm1 - vpcmpgtd %xmm1, %xmm3, %xmm1 - vinsertf128 $1, %xmm0, %ymm1, %ymm8 - vmovmskps %ymm8, %ecx - testl %ecx, %ecx - je .LBB0_14 -# BB#13: # %safe_if_run_true.us - # in Loop: Header=BB0_12 Depth=3 - movl 76(%rsp), %esi # 4-byte Reload - leal 8(%rsi,%r13), %edx - movl 68(%rsp), %ecx # 4-byte Reload - leal (%rcx,%r13), %ecx - movl 72(%rsp), %r12d # 4-byte Reload - leal 24(%r12,%r13), %r14d - leal -8(%rsi,%r13), %r8d - movl 52(%rsp), %edi # 4-byte Reload - leal (%rdi,%r13), %edi - leal 8(%r12,%r13), %ebp - leal (%rsi,%r13), %esi - leal 16(%r12,%r13), %r11d - movl 64(%rsp), %ebx # 4-byte Reload - leal (%rbx,%r13), %r9d - movl 44(%rsp), %ebx # 4-byte Reload - leal (%rbx,%r13), %r15d - movl 60(%rsp), %ebx # 4-byte Reload - leal (%rbx,%r13), %r10d - movl 40(%rsp), %ebx # 4-byte Reload - leal (%rbx,%r13), %ebx - movl %ebx, 832(%rsp) # 4-byte Spill - movl 56(%rsp), %ebx # 4-byte Reload - leal (%rbx,%r13), %ebx - movl %ebx, 800(%rsp) # 4-byte Spill - movl 36(%rsp), %ebx # 4-byte Reload - leal (%rbx,%r13), %ebx - movl %ebx, 768(%rsp) # 4-byte Spill - movl 28(%rsp), %ebx # 4-byte Reload - leal (%rbx,%r13), %ebx - movl %ebx, 736(%rsp) # 4-byte Spill - leal (%r12,%r13), %ebx - movl %ebx, 960(%rsp) # 4-byte Spill - movl 48(%rsp), %ebx # 4-byte Reload - leal (%rbx,%r13), %ebx - movl %ebx, 896(%rsp) # 4-byte Spill - movl 32(%rsp), %ebx # 4-byte Reload - leal (%rbx,%r13), %r12d - movl 24(%rsp), %ebx # 4-byte Reload - leal (%rbx,%r13), %ebx - movl %ebx, 992(%rsp) # 4-byte Spill - movslq %edx, %rdx - movq %rdx, 1184(%rsp) # 8-byte Spill - movslq %ecx, %rbx - movq %rbx, 1056(%rsp) # 8-byte Spill - movslq %esi, %rcx - movq %rcx, 1120(%rsp) # 8-byte Spill - vmovupd (%rax,%rbx), %xmm0 - movq %rbx, %rsi - vmovupd 16(%rax,%rdx), %xmm2 - vmovupd (%rax,%rdx), %xmm3 - movslq %ebp, %rdx - movq %rdx, 1152(%rsp) # 8-byte Spill - vmovupd 16(%rax,%rdx), %xmm1 - vmovupd (%rax,%rdx), %xmm4 - vinsertf128 $1, %xmm1, %ymm4, %ymm1 - vinsertf128 $1, %xmm2, %ymm3, %ymm2 - movslq %edi, %rdx - movq %rdx, 928(%rsp) # 8-byte Spill - movslq %r8d, %rbx - movslq %r14d, %r14 - vmovupd 16(%rax,%rsi), %xmm3 - vmovupd 16(%rax,%rcx), %xmm4 - vmovupd (%rax,%rcx), %xmm5 - movslq %r11d, %rcx - movq %rcx, 1088(%rsp) # 8-byte Spill - vmovupd 16(%rax,%rcx), %xmm6 - vmovupd (%rax,%rcx), %xmm7 - vinsertf128 $1, %xmm6, %ymm7, %ymm6 - vinsertf128 $1, %xmm4, %ymm5, %ymm7 - vaddpd %ymm1, %ymm2, %ymm1 - vinsertf128 $1, %xmm3, %ymm0, %ymm3 - movslq %r10d, %rsi - movq %rsi, 864(%rsp) # 8-byte Spill - vmovupd (%rax,%r14), %xmm5 - vmovupd (%rax,%rbx), %xmm4 - vmovupd (%rax,%rdx), %xmm2 - movslq %r15d, %rbp - movslq %r9d, %rcx - movq %rcx, 1048(%rsp) # 8-byte Spill - vmovupd 16(%rax,%rcx), %xmm0 - vmovupd (%rax,%rcx), %xmm9 - vaddpd %ymm6, %ymm7, %ymm7 - vinsertf128 $1, %xmm0, %ymm9, %ymm9 - vmovupd (%rax,%rbp), %xmm12 - vmovupd (%rax,%rsi), %xmm6 - vmovupd 16(%rax,%rdx), %xmm0 - vaddpd %ymm3, %ymm1, %ymm3 - vinsertf128 $1, 16(%rax,%rsi), %ymm6, %ymm6 - vinsertf128 $1, 16(%rax,%r14), %ymm5, %ymm5 - vinsertf128 $1, 16(%rax,%rbx), %ymm4, %ymm4 - vaddpd %ymm9, %ymm7, %ymm1 - vinsertf128 $1, %xmm0, %ymm2, %ymm2 - movslq 736(%rsp), %r8 # 4-byte Folded Reload - movslq 768(%rsp), %rdx # 4-byte Folded Reload - movslq 800(%rsp), %rdi # 4-byte Folded Reload - vmovupd (%rax,%rdi), %xmm10 - movslq 832(%rsp), %r15 # 4-byte Folded Reload - vmovupd (%rax,%r15), %xmm9 - vmovupd (%rax,%rdx), %xmm7 - vaddpd %ymm5, %ymm4, %ymm4 - vmovupd (%rax,%r8), %xmm11 - vaddpd %ymm6, %ymm3, %ymm5 - vinsertf128 $1, 16(%rax,%rdi), %ymm10, %ymm3 - vinsertf128 $1, 16(%rax,%rbp), %ymm12, %ymm10 - vinsertf128 $1, 16(%rax,%r15), %ymm9, %ymm0 - movslq 896(%rsp), %r11 # 4-byte Folded Reload - vaddpd %ymm2, %ymm1, %ymm1 - movslq 960(%rsp), %rcx # 4-byte Folded Reload - vmovupd (%rax,%rcx), %xmm6 - vaddpd %ymm0, %ymm1, %ymm1 - vinsertf128 $1, 16(%rax,%r8), %ymm11, %ymm2 - vinsertf128 $1, 16(%rax,%rdx), %ymm7, %ymm0 - movslq %r12d, %r12 - vaddpd %ymm10, %ymm5, %ymm7 - vmovupd (%rax,%r11), %xmm5 - vaddpd %ymm3, %ymm4, %ymm3 - vinsertf128 $1, 16(%rax,%r11), %ymm5, %ymm4 - vinsertf128 $1, 16(%rax,%rcx), %ymm6, %ymm9 - vmovupd (%rax,%r12), %xmm5 - movslq 992(%rsp), %rsi # 4-byte Folded Reload - vaddpd %ymm0, %ymm7, %ymm10 - vextractf128 $1, %ymm8, %xmm6 - vaddpd %ymm2, %ymm1, %ymm2 - vpshufd $80, %xmm6, %xmm7 # xmm7 = xmm6[0,0,1,1] - vmulpd %ymm9, %ymm15, %ymm1 - vmovupd (%rax,%rsi), %xmm9 - vaddpd %ymm4, %ymm3, %ymm3 - vinsertf128 $1, 16(%rax,%r12), %ymm5, %ymm4 - vpshufd $80, %xmm8, %xmm5 # xmm5 = xmm8[0,0,1,1] - vpshufd $-6, %xmm6, %xmm0 # xmm0 = xmm6[2,2,3,3] - vpshufd $-6, %xmm8, %xmm6 # xmm6 = xmm8[2,2,3,3] - vinsertf128 $1, %xmm6, %ymm5, %ymm6 - vinsertf128 $1, 16(%rax,%rsi), %ymm9, %ymm5 - vinsertf128 $1, %xmm0, %ymm7, %ymm8 - vmovupd %ymm8, 96(%rsp) # 32-byte Folded Spill - vmovupd 1344(%rsp), %ymm0 # 32-byte Folded Reload - vmovupd %ymm0, 1344(%rsp) # 32-byte Folded Spill - vmovupd %ymm0, 1344(%rsp) # 32-byte Folded Spill - vmulpd %ymm2, %ymm0, %ymm0 - vmulpd %ymm10, %ymm14, %ymm2 - movq 1480(%rsp), %r9 - vmaskmovpd (%r9,%rcx), %ymm6, %ymm7 - vaddpd %ymm1, %ymm2, %ymm1 - vaddpd %ymm1, %ymm0, %ymm0 - vaddpd %ymm4, %ymm3, %ymm3 - vmaskmovpd (%rax,%rcx), %ymm6, %ymm1 - vmulpd %ymm13, %ymm1, %ymm1 - movq 1464(%rsp), %r10 - vmaskmovpd (%r10,%rcx), %ymm6, %ymm2 - vsubpd %ymm7, %ymm1, %ymm1 - vmaskmovpd 32(%r10,%rcx), %ymm8, %ymm4 - vmovupd %ymm4, 992(%rsp) # 32-byte Folded Spill - vaddpd %ymm5, %ymm3, %ymm3 - vmovups 48(%rax,%rsi), %xmm4 - vmovaps %xmm4, 960(%rsp) # 16-byte Spill - vmovupd 1312(%rsp), %ymm4 # 32-byte Folded Reload - vmovupd %ymm4, 1312(%rsp) # 32-byte Folded Spill - vmovupd %ymm4, 1312(%rsp) # 32-byte Folded Spill - vmulpd %ymm3, %ymm4, %ymm3 - vmovups 32(%rax,%rsi), %xmm4 - vmovups %ymm4, 896(%rsp) # 32-byte Folded Spill - vaddpd %ymm3, %ymm0, %ymm0 - vmovups 48(%rax,%r12), %xmm3 - vmovaps %xmm3, 832(%rsp) # 16-byte Spill - vmulpd %ymm2, %ymm0, %ymm0 - vmovups 32(%rax,%r12), %xmm2 - vmovups %ymm2, 800(%rsp) # 32-byte Folded Spill - vaddpd %ymm0, %ymm1, %ymm0 - vmovupd %ymm0, 128(%rsp) # 32-byte Folded Spill - vmovups 48(%rax,%r11), %xmm0 - vmovaps %xmm0, 768(%rsp) # 16-byte Spill - vmovups 32(%rax,%r11), %xmm0 - vmovups %ymm0, 736(%rsp) # 32-byte Folded Spill - vmovups 48(%rax,%rdi), %xmm0 - vmovaps %xmm0, 704(%rsp) # 16-byte Spill - vmovups 32(%rax,%rdi), %xmm0 - vmovups %ymm0, 640(%rsp) # 32-byte Folded Spill - vmovups 48(%rax,%rbx), %xmm0 - vmovaps %xmm0, 592(%rsp) # 16-byte Spill - vmovups 32(%rax,%rbx), %xmm0 - vmovups %ymm0, 544(%rsp) # 32-byte Folded Spill - vmovups 48(%rax,%r14), %xmm0 - vmovaps %xmm0, 464(%rsp) # 16-byte Spill - vmovups 32(%rax,%r14), %xmm0 - vmovups %ymm0, 416(%rsp) # 32-byte Folded Spill - vmovups 48(%rax,%rdx), %xmm0 - vmovaps %xmm0, 400(%rsp) # 16-byte Spill - vmovups 32(%rax,%rdx), %xmm0 - vmovups %ymm0, 352(%rsp) # 32-byte Folded Spill - vmovups 48(%rax,%r8), %xmm0 - vmovaps %xmm0, 336(%rsp) # 16-byte Spill - vmovups 32(%rax,%r8), %xmm0 - vmovups %ymm0, 288(%rsp) # 32-byte Folded Spill - vmovups 48(%rax,%rbp), %xmm0 - vmovaps %xmm0, 272(%rsp) # 16-byte Spill - vmovups 32(%rax,%rbp), %xmm0 - vmovups %ymm0, 224(%rsp) # 32-byte Folded Spill - vmaskmovpd 32(%r9,%rcx), %ymm8, %ymm0 - vmovupd %ymm0, 672(%rsp) # 32-byte Folded Spill - vmaskmovpd 32(%rax,%rcx), %ymm8, %ymm0 - vmovupd %ymm0, 608(%rsp) # 32-byte Folded Spill - vmovups 48(%rax,%rcx), %xmm0 - vmovaps %xmm0, 528(%rsp) # 16-byte Spill - vmovups 32(%rax,%rcx), %xmm0 - vmovups %ymm0, 480(%rsp) # 32-byte Folded Spill - vmovups 48(%rax,%r15), %xmm0 - vmovaps %xmm0, 208(%rsp) # 16-byte Spill - vmovups 32(%rax,%r15), %xmm0 - vmovups %ymm0, 160(%rsp) # 32-byte Folded Spill - movq 864(%rsp), %rdx # 8-byte Reload - vmovups 48(%rax,%rdx), %xmm0 - vmovaps %xmm0, 80(%rsp) # 16-byte Spill - vmovups 32(%rax,%rdx), %xmm0 - vmovups %ymm0, 864(%rsp) # 32-byte Folded Spill - movq 928(%rsp), %rdx # 8-byte Reload - vmovupd 48(%rax,%rdx), %xmm4 - vmovupd 32(%rax,%rdx), %xmm9 - movq 1056(%rsp), %rdx # 8-byte Reload - vmovupd 48(%rax,%rdx), %xmm5 - vmovupd 32(%rax,%rdx), %xmm11 - movq 1048(%rsp), %rdx # 8-byte Reload - vmovupd 48(%rax,%rdx), %xmm13 - vmovupd 32(%rax,%rdx), %xmm7 - movq 1184(%rsp), %rdx # 8-byte Reload - vmovupd 48(%rax,%rdx), %xmm15 - vmovupd 32(%rax,%rdx), %xmm10 - movq 1152(%rsp), %rdx # 8-byte Reload - vmovupd 48(%rax,%rdx), %xmm12 - vmovupd 32(%rax,%rdx), %xmm14 - movq 1120(%rsp), %rdx # 8-byte Reload - vmovupd 48(%rax,%rdx), %xmm0 - vmovupd 32(%rax,%rdx), %xmm1 - movq 1088(%rsp), %rdx # 8-byte Reload - vmovupd 48(%rax,%rdx), %xmm2 - vmovupd 32(%rax,%rdx), %xmm3 - vmovupd 128(%rsp), %ymm8 # 32-byte Folded Reload - vmaskmovpd %ymm8, %ymm6, (%r9,%rcx) - vinsertf128 $1, %xmm2, %ymm3, %ymm2 - vinsertf128 $1, %xmm0, %ymm1, %ymm0 - vaddpd %ymm2, %ymm0, %ymm1 - vinsertf128 $1, %xmm12, %ymm14, %ymm0 - vinsertf128 $1, %xmm15, %ymm10, %ymm2 - vaddpd %ymm0, %ymm2, %ymm0 - vinsertf128 $1, %xmm13, %ymm7, %ymm2 - vinsertf128 $1, %xmm5, %ymm11, %ymm3 - vaddpd %ymm3, %ymm0, %ymm5 - vaddpd %ymm2, %ymm1, %ymm0 - vinsertf128 $1, %xmm4, %ymm9, %ymm1 - vaddpd %ymm1, %ymm0, %ymm0 - vmovupd 864(%rsp), %ymm1 # 32-byte Folded Reload - vinsertf128 $1, 80(%rsp), %ymm1, %ymm1 # 16-byte Folded Reload - vmovupd 160(%rsp), %ymm2 # 32-byte Folded Reload - vinsertf128 $1, 208(%rsp), %ymm2, %ymm2 # 16-byte Folded Reload - vaddpd %ymm2, %ymm0, %ymm0 - vaddpd %ymm1, %ymm5, %ymm1 - vmovupd 224(%rsp), %ymm2 # 32-byte Folded Reload - vinsertf128 $1, 272(%rsp), %ymm2, %ymm2 # 16-byte Folded Reload - vaddpd %ymm2, %ymm1, %ymm1 - vmovupd 288(%rsp), %ymm2 # 32-byte Folded Reload - vinsertf128 $1, 336(%rsp), %ymm2, %ymm2 # 16-byte Folded Reload - vmovupd 352(%rsp), %ymm3 # 32-byte Folded Reload - vinsertf128 $1, 400(%rsp), %ymm3, %ymm3 # 16-byte Folded Reload - vaddpd %ymm3, %ymm1, %ymm1 - vaddpd %ymm2, %ymm0, %ymm2 - vmovupd 416(%rsp), %ymm0 # 32-byte Folded Reload - vinsertf128 $1, 464(%rsp), %ymm0, %ymm0 # 16-byte Folded Reload - vmovupd 544(%rsp), %ymm3 # 32-byte Folded Reload - vinsertf128 $1, 592(%rsp), %ymm3, %ymm3 # 16-byte Folded Reload - vaddpd %ymm0, %ymm3, %ymm0 - vmovupd 640(%rsp), %ymm3 # 32-byte Folded Reload - vinsertf128 $1, 704(%rsp), %ymm3, %ymm3 # 16-byte Folded Reload - vaddpd %ymm3, %ymm0, %ymm0 - vmovupd 1344(%rsp), %ymm3 # 32-byte Folded Reload - vmulpd %ymm2, %ymm3, %ymm2 - vmovupd -64(%rsp), %ymm3 # 32-byte Folded Reload - vmulpd %ymm1, %ymm3, %ymm1 - vmovapd %ymm3, %ymm14 - vmovupd 736(%rsp), %ymm3 # 32-byte Folded Reload - vinsertf128 $1, 768(%rsp), %ymm3, %ymm3 # 16-byte Folded Reload - vmovupd 480(%rsp), %ymm4 # 32-byte Folded Reload - vinsertf128 $1, 528(%rsp), %ymm4, %ymm4 # 16-byte Folded Reload - vmovupd -32(%rsp), %ymm5 # 32-byte Folded Reload - vmulpd %ymm4, %ymm5, %ymm4 - vmovapd %ymm5, %ymm15 - vaddpd %ymm4, %ymm1, %ymm1 - vmovapd .LCPI0_2(%rip), %ymm5 - vmovupd 608(%rsp), %ymm4 # 32-byte Folded Reload - vmulpd %ymm5, %ymm4, %ymm4 - vmovapd %ymm5, %ymm13 - vaddpd %ymm1, %ymm2, %ymm2 - vsubpd 672(%rsp), %ymm4, %ymm1 # 32-byte Folded Reload - vaddpd %ymm3, %ymm0, %ymm0 - vmovupd 800(%rsp), %ymm3 # 32-byte Folded Reload - vinsertf128 $1, 832(%rsp), %ymm3, %ymm3 # 16-byte Folded Reload - vaddpd %ymm3, %ymm0, %ymm0 - vmovupd 896(%rsp), %ymm3 # 32-byte Folded Reload - vinsertf128 $1, 960(%rsp), %ymm3, %ymm3 # 16-byte Folded Reload - vaddpd %ymm3, %ymm0, %ymm0 - vmovupd 1312(%rsp), %ymm3 # 32-byte Folded Reload - vmulpd %ymm0, %ymm3, %ymm0 - vaddpd %ymm0, %ymm2, %ymm0 - vmulpd 992(%rsp), %ymm0, %ymm0 # 32-byte Folded Reload - vaddpd %ymm0, %ymm1, %ymm0 - vmovupd 96(%rsp), %ymm1 # 32-byte Folded Reload - vmaskmovpd %ymm0, %ymm1, 32(%r9,%rcx) -.LBB0_14: # %safe_if_after_true.us - # in Loop: Header=BB0_12 Depth=3 - addl $64, %r13d - movl 1216(%rsp), %ecx # 4-byte Reload - addl $8, %ecx - cmpl 1308(%rsp), %ecx # 4-byte Folded Reload - jl .LBB0_12 -# BB#15: # %for_exit40.us - # in Loop: Header=BB0_16 Depth=2 - movl -80(%rsp), %edx # 4-byte Reload - addl -76(%rsp), %edx # 4-byte Folded Reload - movl -84(%rsp), %edi # 4-byte Reload - incl %edi - cmpl -72(%rsp), %edi # 4-byte Folded Reload - movl -92(%rsp), %ecx # 4-byte Reload - movl -88(%rsp), %esi # 4-byte Reload - jne .LBB0_16 -.LBB0_11: # %for_exit33 - # in Loop: Header=BB0_9 Depth=1 - movl -100(%rsp), %edx # 4-byte Reload - addl -104(%rsp), %edx # 4-byte Folded Reload - movl %edx, -100(%rsp) # 4-byte Spill - incl %esi - cmpl %ecx, %esi - jne .LBB0_9 - jmp .LBB0_6 -.LBB0_1: # %for_test264.preheader - cmpl %r9d, %r8d - jge .LBB0_6 -# BB#2: # %for_test275.preheader.lr.ph - leal 2(%r8), %r13d - movl %esi, %r10d - imull %r10d, %r13d - movl %r10d, %ecx - imull %r8d, %ecx - movl %edx, %esi - movl %esi, -96(%rsp) # 4-byte Spill - leal (%rsi,%rcx), %r15d - movl %r9d, -92(%rsp) # 4-byte Spill - leal 2(%rsi,%rcx), %edx - movl %edx, 1248(%rsp) # 4-byte Spill - leal -1(%rsi,%rcx), %edx - movl %edx, 1344(%rsp) # 4-byte Spill - leal 3(%rsi,%rcx), %r12d - leal -2(%rsi,%rcx), %edx - movl %edx, 1312(%rsp) # 4-byte Spill - leal -3(%rsi,%rcx), %edi - addl %esi, %r13d - leal 1(%rsi,%rcx), %ecx - leal -3(%r8), %r14d - imull %r10d, %r14d - leal -2(%r8), %r9d - imull %r10d, %r9d - leal 3(%r8), %ebx - imull %r10d, %ebx - leal -1(%r8), %ebp - imull %r10d, %ebp - leal 1(%r8), %edx - imull %r10d, %edx - addl %esi, %edx - addl %esi, %ebp - addl %esi, %ebx - addl %esi, %r9d - addl %esi, %r14d - vmovd 1308(%rsp), %xmm5 # 4-byte Folded Reload - movl 1440(%rsp), %r11d - imull %r11d, %ecx - movl %ecx, 1184(%rsp) # 4-byte Spill - imull %r11d, %r13d - imull %r11d, %edi - movl %edi, 1216(%rsp) # 4-byte Spill - movl 1312(%rsp), %ecx # 4-byte Reload - imull %r11d, %ecx - movl %ecx, 1312(%rsp) # 4-byte Spill - imull %r11d, %r12d - movl 1344(%rsp), %esi # 4-byte Reload - imull %r11d, %esi - movl %esi, 1344(%rsp) # 4-byte Spill - movl 1248(%rsp), %ecx # 4-byte Reload - imull %r11d, %ecx - imull %r11d, %r15d - movl -68(%rsp), %esi # 4-byte Reload - leal (,%rsi,8), %esi - imull %r11d, %r14d - imull %r11d, %r9d - imull %r11d, %ebx - imull %r11d, %ebp - imull %r11d, %edx - leal -16(%rsi,%r15,8), %edi - movl %edi, 672(%rsp) # 4-byte Spill - leal (%rsi,%r15,8), %edi - movl %edi, 640(%rsp) # 4-byte Spill - movl 1184(%rsp), %edi # 4-byte Reload - leal (%rsi,%rdi,8), %edi - movl %edi, 608(%rsp) # 4-byte Spill - movl %r8d, %edi - leal (%rsi,%rcx,8), %ecx - movl %ecx, 592(%rsp) # 4-byte Spill - movl 1344(%rsp), %ecx # 4-byte Reload - leal (%rsi,%rcx,8), %ecx - movl %ecx, 544(%rsp) # 4-byte Spill - leal (%rsi,%r12,8), %ecx - movl %ecx, 528(%rsp) # 4-byte Spill - movl 1312(%rsp), %ecx # 4-byte Reload - leal (%rsi,%rcx,8), %ecx - movl %ecx, 480(%rsp) # 4-byte Spill - movl 1216(%rsp), %ecx # 4-byte Reload - leal (%rsi,%rcx,8), %ecx - movl %ecx, 464(%rsp) # 4-byte Spill - leal (%rsi,%rdx,8), %ecx - movl %ecx, 416(%rsp) # 4-byte Spill - leal (%rsi,%r13,8), %ecx - movl %ecx, 400(%rsp) # 4-byte Spill - leal (%rsi,%rbp,8), %ecx - movl %ecx, 352(%rsp) # 4-byte Spill - leal (%rsi,%rbx,8), %ecx - movl %ecx, 336(%rsp) # 4-byte Spill - leal (%rsi,%r9,8), %ecx - movl %ecx, 288(%rsp) # 4-byte Spill - leal (%rsi,%r14,8), %ecx - movl %ecx, 272(%rsp) # 4-byte Spill - movl $0, 160(%rsp) # 4-byte Folded Spill - imull %r11d, %r10d - shll $3, %r11d - movl %r11d, -76(%rsp) # 4-byte Spill - shll $3, %r10d - movl %r10d, -104(%rsp) # 4-byte Spill - vpermilpd $0, %xmm1, %xmm6 # xmm6 = xmm1[0,0] - vpermilpd $0, %xmm3, %xmm3 # xmm3 = xmm3[0,0] - vpermilpd $0, %xmm2, %xmm1 # xmm1 = xmm2[0,0] - vmovaps %ymm0, %ymm8 - vmovups %ymm8, 704(%rsp) # 32-byte Folded Spill - vextractf128 $1, %ymm8, %xmm7 - vpshufd $80, %xmm8, %xmm0 # xmm0 = xmm8[0,0,1,1] - vinsertf128 $1, %xmm6, %ymm6, %ymm13 - vpshufd $80, %xmm7, %xmm2 # xmm2 = xmm7[0,0,1,1] - vinsertf128 $1, %xmm3, %ymm3, %ymm15 - vpshufd $-6, %xmm7, %xmm3 # xmm3 = xmm7[2,2,3,3] - vinsertf128 $1, %xmm1, %ymm1, %ymm10 - vpshufd $-6, %xmm8, %xmm1 # xmm1 = xmm8[2,2,3,3] - vpshufd $0, %xmm5, %xmm7 # xmm7 = xmm5[0,0,0,0] - vpermilpd $0, %xmm4, %xmm4 # xmm4 = xmm4[0,0] - vinsertf128 $1, %xmm4, %ymm4, %ymm4 - vmovupd %ymm4, 1344(%rsp) # 32-byte Folded Spill - vinsertf128 $1, %xmm3, %ymm2, %ymm5 - vinsertf128 $1, %xmm1, %ymm0, %ymm6 - vinsertf128 $1, %xmm7, %ymm7, %ymm0 - vmovups %ymm0, 1312(%rsp) # 32-byte Folded Spill - vmovapd .LCPI0_2(%rip), %ymm14 - .align 16, 0x90 -.LBB0_3: # %for_test275.preheader - # =>This Loop Header: Depth=1 - # Child Loop BB0_21 Depth 2 - # Child Loop BB0_17 Depth 3 - movl %edi, -88(%rsp) # 4-byte Spill - movl -96(%rsp), %ecx # 4-byte Reload - cmpl -72(%rsp), %ecx # 4-byte Folded Reload - jge .LBB0_5 -# BB#4: # %for_test286.preheader.lr.ph - # in Loop: Header=BB0_3 Depth=1 - movl -68(%rsp), %ecx # 4-byte Reload - cmpl 1308(%rsp), %ecx # 4-byte Folded Reload - movl 160(%rsp), %ecx # 4-byte Reload - movl -96(%rsp), %edx # 4-byte Reload - jge .LBB0_5 - .align 16, 0x90 -.LBB0_21: # %for_loop288.lr.ph.us - # Parent Loop BB0_3 Depth=1 - # => This Loop Header: Depth=2 - # Child Loop BB0_17 Depth 3 - movl %edx, 208(%rsp) # 4-byte Spill - movl %ecx, 224(%rsp) # 4-byte Spill - movl %ecx, %r9d - movl -68(%rsp), %r15d # 4-byte Reload - .align 16, 0x90 -.LBB0_17: # %for_loop288.us - # Parent Loop BB0_3 Depth=1 - # Parent Loop BB0_21 Depth=2 - # => This Inner Loop Header: Depth=3 - vmovups 1312(%rsp), %ymm3 # 32-byte Folded Reload - vmovups %ymm3, 1312(%rsp) # 32-byte Folded Spill - vextractf128 $1, %ymm3, %xmm0 - vmovd %r15d, %xmm1 - vpshufd $0, %xmm1, %xmm1 # xmm1 = xmm1[0,0,0,0] - vpaddd .LCPI0_0(%rip), %xmm1, %xmm2 - vpcmpgtd %xmm2, %xmm0, %xmm0 - vpaddd .LCPI0_1(%rip), %xmm1, %xmm1 - vpcmpgtd %xmm1, %xmm3, %xmm1 - vinsertf128 $1, %xmm0, %ymm1, %ymm0 - vandps 704(%rsp), %ymm0, %ymm11 # 32-byte Folded Reload - vmovmskps %ymm11, %ecx - testl %ecx, %ecx - je .LBB0_19 -# BB#18: # %safe_if_run_true467.us - # in Loop: Header=BB0_17 Depth=3 - movl 640(%rsp), %r11d # 4-byte Reload - leal 24(%r11,%r9), %ecx - movl 528(%rsp), %edx # 4-byte Reload - leal (%rdx,%r9), %ebx - leal 8(%r11,%r9), %edx - movl %edx, 1088(%rsp) # 4-byte Spill - movl 608(%rsp), %edx # 4-byte Reload - leal (%rdx,%r9), %edx - movl %edx, 1048(%rsp) # 4-byte Spill - movl 480(%rsp), %edx # 4-byte Reload - leal (%rdx,%r9), %edx - movl %edx, 768(%rsp) # 4-byte Spill - movl 464(%rsp), %edx # 4-byte Reload - leal (%rdx,%r9), %r14d - movl 592(%rsp), %esi # 4-byte Reload - leal (%rsi,%r9), %esi - movl 672(%rsp), %edi # 4-byte Reload - leal (%rdi,%r9), %ebp - leal 16(%r11,%r9), %r12d - leal -8(%rdi,%r9), %r13d - movl 336(%rsp), %edx # 4-byte Reload - leal (%rdx,%r9), %edx - movl %edx, 832(%rsp) # 4-byte Spill - movl 272(%rsp), %edx # 4-byte Reload - leal (%rdx,%r9), %edx - movl %edx, 800(%rsp) # 4-byte Spill - leal 8(%rdi,%r9), %r10d - movl 544(%rsp), %r8d # 4-byte Reload - leal (%r8,%r9), %edx - movl %edx, 960(%rsp) # 4-byte Spill - leal (%r11,%r9), %edx - movl %edx, 928(%rsp) # 4-byte Spill - movl 416(%rsp), %edi # 4-byte Reload - leal (%rdi,%r9), %edx - movl %edx, 896(%rsp) # 4-byte Spill - movl 400(%rsp), %edi # 4-byte Reload - leal (%rdi,%r9), %edx - movl %edx, 864(%rsp) # 4-byte Spill - movl 288(%rsp), %edx # 4-byte Reload - leal (%rdx,%r9), %edx - movl %edx, 992(%rsp) # 4-byte Spill - movl 352(%rsp), %edi # 4-byte Reload - leal (%rdi,%r9), %r8d - movslq %ecx, %rcx - movq %rcx, 1184(%rsp) # 8-byte Spill - vmaskmovpd (%rax,%rcx), %ymm6, %ymm0 - movslq %r13d, %rcx - movq %rcx, 1152(%rsp) # 8-byte Spill - vmaskmovpd (%rax,%rcx), %ymm6, %ymm1 - vaddpd %ymm0, %ymm1, %ymm0 - movslq %r12d, %rcx - movq %rcx, 1248(%rsp) # 8-byte Spill - movslq %ebx, %rdx - movq %rdx, 1120(%rsp) # 8-byte Spill - vmaskmovpd (%rax,%rdx), %ymm6, %ymm1 - vaddpd %ymm1, %ymm0, %ymm0 - vmaskmovpd (%rax,%rcx), %ymm6, %ymm1 - movslq %ebp, %rcx - movq %rcx, 1216(%rsp) # 8-byte Spill - vmaskmovpd (%rax,%rcx), %ymm6, %ymm2 - vaddpd %ymm1, %ymm2, %ymm1 - movslq %esi, %rsi - movq %rsi, 1056(%rsp) # 8-byte Spill - movslq %r14d, %rdx - vmaskmovpd (%rax,%rdx), %ymm6, %ymm2 - vaddpd %ymm2, %ymm0, %ymm0 - movslq 768(%rsp), %rcx # 4-byte Folded Reload - movslq 1048(%rsp), %rdi # 4-byte Folded Reload - movq %rdi, 1048(%rsp) # 8-byte Spill - vmaskmovpd (%rax,%rsi), %ymm6, %ymm2 - movslq 1088(%rsp), %rsi # 4-byte Folded Reload - movq %rsi, 1088(%rsp) # 8-byte Spill - vmaskmovpd (%rax,%rsi), %ymm6, %ymm3 - movslq %r10d, %r11 - vmaskmovpd (%rax,%r11), %ymm6, %ymm4 - vaddpd %ymm3, %ymm4, %ymm3 - vaddpd %ymm2, %ymm1, %ymm1 - movslq 800(%rsp), %rsi # 4-byte Folded Reload - vmaskmovpd (%rax,%rdi), %ymm6, %ymm7 - vmaskmovpd (%rax,%rcx), %ymm6, %ymm2 - movslq 832(%rsp), %rdi # 4-byte Folded Reload - vmaskmovpd (%rax,%rdi), %ymm6, %ymm8 - vpshufd $80, %xmm11, %xmm4 # xmm4 = xmm11[0,0,1,1] - vaddpd %ymm8, %ymm0, %ymm0 - vaddpd %ymm2, %ymm1, %ymm2 - vaddpd %ymm7, %ymm3, %ymm3 - vmaskmovpd (%rax,%rsi), %ymm6, %ymm1 - movslq 864(%rsp), %r12 # 4-byte Folded Reload - movslq 896(%rsp), %rbx # 4-byte Folded Reload - vpshufd $-6, %xmm11, %xmm7 # xmm7 = xmm11[2,2,3,3] - vinsertf128 $1, %xmm7, %ymm4, %ymm12 - movslq 928(%rsp), %r13 # 4-byte Folded Reload - movslq 960(%rsp), %r10 # 4-byte Folded Reload - vmaskmovpd (%rax,%r10), %ymm6, %ymm4 - vaddpd %ymm4, %ymm3, %ymm4 - vmaskmovpd (%rax,%r13), %ymm12, %ymm7 - vmaskmovpd (%rax,%rbx), %ymm6, %ymm8 - vextractf128 $1, %ymm11, %xmm3 - vmaskmovpd (%rax,%r12), %ymm6, %ymm9 - vaddpd %ymm9, %ymm2, %ymm2 - movslq 992(%rsp), %rbp # 4-byte Folded Reload - vmaskmovpd (%rax,%rbp), %ymm6, %ymm9 - vaddpd %ymm9, %ymm2, %ymm2 - vaddpd %ymm1, %ymm0, %ymm1 - vmulpd %ymm14, %ymm7, %ymm0 - vaddpd %ymm8, %ymm4, %ymm4 - vmaskmovpd (%rax,%r13), %ymm6, %ymm7 - movslq %r8d, %r8 - vmaskmovpd (%rax,%r8), %ymm6, %ymm8 - vaddpd %ymm8, %ymm4, %ymm8 - vmovapd %ymm10, %ymm14 - vmulpd %ymm7, %ymm14, %ymm7 - vpshufd $-6, %xmm3, %xmm4 # xmm4 = xmm3[2,2,3,3] - vpshufd $80, %xmm3, %xmm3 # xmm3 = xmm3[0,0,1,1] - movq 1480(%rsp), %r14 - vmaskmovpd (%r14,%r13), %ymm12, %ymm9 - vsubpd %ymm9, %ymm0, %ymm0 - vmulpd %ymm1, %ymm13, %ymm1 - vmulpd %ymm2, %ymm15, %ymm2 - vmovupd 1344(%rsp), %ymm9 # 32-byte Folded Reload - vmovupd %ymm9, 1344(%rsp) # 32-byte Folded Spill - vmulpd %ymm8, %ymm9, %ymm8 - vaddpd %ymm7, %ymm8, %ymm7 - vmaskmovpd 32(%rax,%rsi), %ymm5, %ymm8 - vmovupd %ymm8, 992(%rsp) # 32-byte Folded Spill - vinsertf128 $1, %xmm4, %ymm3, %ymm11 - vmaskmovpd 32(%rax,%rdi), %ymm5, %ymm3 - vmovupd %ymm3, 960(%rsp) # 32-byte Folded Spill - vaddpd %ymm7, %ymm2, %ymm2 - vmaskmovpd 32(%rax,%rdx), %ymm5, %ymm3 - vmovupd %ymm3, 928(%rsp) # 32-byte Folded Spill - vaddpd %ymm1, %ymm2, %ymm1 - movq 1464(%rsp), %rdx - vmaskmovpd (%rdx,%r13), %ymm12, %ymm2 - vmulpd %ymm2, %ymm1, %ymm1 - movq 1120(%rsp), %rsi # 8-byte Reload - vmaskmovpd 32(%rax,%rsi), %ymm5, %ymm2 - vmovupd %ymm2, 1120(%rsp) # 32-byte Folded Spill - vaddpd %ymm1, %ymm0, %ymm0 - vmovupd %ymm0, 736(%rsp) # 32-byte Folded Spill - vmaskmovpd 32(%rax,%r8), %ymm5, %ymm0 - vmovupd %ymm0, 896(%rsp) # 32-byte Folded Spill - movq 1184(%rsp), %rsi # 8-byte Reload - vmaskmovpd 32(%rax,%rsi), %ymm5, %ymm0 - vmovupd %ymm0, 1184(%rsp) # 32-byte Folded Spill - movq 1152(%rsp), %rsi # 8-byte Reload - vmaskmovpd 32(%rax,%rsi), %ymm5, %ymm0 - vmovupd %ymm0, 1152(%rsp) # 32-byte Folded Spill - vmaskmovpd 32(%rax,%rbp), %ymm5, %ymm0 - vmovupd %ymm0, 832(%rsp) # 32-byte Folded Spill - vmaskmovpd 32(%rax,%r12), %ymm5, %ymm0 - vmovupd %ymm0, 800(%rsp) # 32-byte Folded Spill - vmaskmovpd 32(%rax,%rcx), %ymm5, %ymm0 - vmovupd %ymm0, 768(%rsp) # 32-byte Folded Spill - vmaskmovpd 32(%rax,%r13), %ymm5, %ymm0 - vmovupd %ymm0, 864(%rsp) # 32-byte Folded Spill - movq 1056(%rsp), %rcx # 8-byte Reload - vmaskmovpd 32(%rax,%rcx), %ymm5, %ymm0 - vmovupd %ymm0, 1056(%rsp) # 32-byte Folded Spill - vmaskmovpd 32(%rax,%rbx), %ymm5, %ymm7 - vmaskmovpd 32(%rax,%r10), %ymm5, %ymm10 - movq 1048(%rsp), %rcx # 8-byte Reload - vmaskmovpd 32(%rax,%rcx), %ymm5, %ymm0 - movq 1088(%rsp), %rcx # 8-byte Reload - vmaskmovpd 32(%rax,%rcx), %ymm5, %ymm1 - vmaskmovpd 32(%rax,%r11), %ymm5, %ymm2 - movq 1248(%rsp), %rcx # 8-byte Reload - vmaskmovpd 32(%rax,%rcx), %ymm5, %ymm3 - movq 1216(%rsp), %rcx # 8-byte Reload - vmaskmovpd 32(%rax,%rcx), %ymm5, %ymm4 - vmaskmovpd 32(%rdx,%r13), %ymm11, %ymm8 - vmovupd %ymm8, 1248(%rsp) # 32-byte Folded Spill - vmaskmovpd 32(%r14,%r13), %ymm11, %ymm8 - vmovupd %ymm8, 1216(%rsp) # 32-byte Folded Spill - vmaskmovpd 32(%rax,%r13), %ymm11, %ymm8 - vmovupd %ymm8, 1088(%rsp) # 32-byte Folded Spill - vmovupd 736(%rsp), %ymm8 # 32-byte Folded Reload - vmaskmovpd %ymm8, %ymm12, (%r14,%r13) - vaddpd %ymm3, %ymm4, %ymm3 - vaddpd %ymm1, %ymm2, %ymm1 - vaddpd %ymm0, %ymm1, %ymm0 - vaddpd %ymm10, %ymm0, %ymm0 - vaddpd %ymm7, %ymm0, %ymm1 - vaddpd 1056(%rsp), %ymm3, %ymm0 # 32-byte Folded Reload - vaddpd 768(%rsp), %ymm0, %ymm0 # 32-byte Folded Reload - vaddpd 800(%rsp), %ymm0, %ymm0 # 32-byte Folded Reload - vaddpd 832(%rsp), %ymm0, %ymm2 # 32-byte Folded Reload - vmovupd 1152(%rsp), %ymm0 # 32-byte Folded Reload - vaddpd 1184(%rsp), %ymm0, %ymm0 # 32-byte Folded Reload - vmulpd %ymm2, %ymm15, %ymm2 - vaddpd 896(%rsp), %ymm1, %ymm1 # 32-byte Folded Reload - vmulpd %ymm1, %ymm9, %ymm1 - vmulpd 864(%rsp), %ymm14, %ymm3 # 32-byte Folded Reload - vmovapd %ymm14, %ymm10 - vaddpd %ymm3, %ymm1, %ymm3 - vmovapd .LCPI0_2(%rip), %ymm4 - vmovupd 1088(%rsp), %ymm1 # 32-byte Folded Reload - vmulpd %ymm4, %ymm1, %ymm1 - vmovapd %ymm4, %ymm14 - vsubpd 1216(%rsp), %ymm1, %ymm1 # 32-byte Folded Reload - vaddpd %ymm3, %ymm2, %ymm2 - vaddpd 1120(%rsp), %ymm0, %ymm0 # 32-byte Folded Reload - vaddpd 928(%rsp), %ymm0, %ymm0 # 32-byte Folded Reload - vaddpd 960(%rsp), %ymm0, %ymm0 # 32-byte Folded Reload - vaddpd 992(%rsp), %ymm0, %ymm0 # 32-byte Folded Reload - vmulpd %ymm0, %ymm13, %ymm0 - vaddpd %ymm0, %ymm2, %ymm0 - vmulpd 1248(%rsp), %ymm0, %ymm0 # 32-byte Folded Reload - vaddpd %ymm0, %ymm1, %ymm0 - vmaskmovpd %ymm0, %ymm11, 32(%r14,%r13) -.LBB0_19: # %safe_if_after_true466.us - # in Loop: Header=BB0_17 Depth=3 - addl $64, %r9d - addl $8, %r15d - cmpl 1308(%rsp), %r15d # 4-byte Folded Reload - jl .LBB0_17 -# BB#20: # %for_exit289.us - # in Loop: Header=BB0_21 Depth=2 - movl 224(%rsp), %ecx # 4-byte Reload - addl -76(%rsp), %ecx # 4-byte Folded Reload - movl 208(%rsp), %edx # 4-byte Reload - incl %edx - cmpl -72(%rsp), %edx # 4-byte Folded Reload - jne .LBB0_21 -.LBB0_5: # %for_exit278 - # in Loop: Header=BB0_3 Depth=1 - movl 160(%rsp), %ecx # 4-byte Reload - addl -104(%rsp), %ecx # 4-byte Folded Reload - movl %ecx, 160(%rsp) # 4-byte Spill - movl -88(%rsp), %edi # 4-byte Reload - incl %edi - movl -92(%rsp), %ecx # 4-byte Reload - cmpl %ecx, %edi - jne .LBB0_3 -.LBB0_6: # %for_exit - addq $1384, %rsp # imm = 0x568 - popq %rbx - popq %r12 - popq %r13 - popq %r14 - popq %r15 - popq %rbp - vzeroupper - ret -.Ltmp0: - .size stencil_step___uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_, .Ltmp0-stencil_step___uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_ - - .align 16, 0x90 - .type stencil_step_task___uniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_,@function -stencil_step_task___uniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_: # @stencil_step_task___uniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_ -# BB#0: # %allocas - pushq %rbp - pushq %r15 - pushq %r14 - pushq %rbx - subq $56, %rsp - movq %rdi, %rax - movl 16(%rax), %r8d - movq 56(%rax), %rbx - movq 48(%rax), %r15 - movq 40(%rax), %r14 - movq 32(%rax), %r11 - leal 1(%r8,%rcx), %r9d - movl 24(%rax), %r10d - vmovaps 64(%rax), %ymm0 - addl %ecx, %r8d - movl 20(%rax), %ebp - movl 12(%rax), %ecx - movl 8(%rax), %edx - movl (%rax), %edi - movl 4(%rax), %esi - vmovmskps %ymm0, %eax - cmpl $255, %eax - jne .LBB1_2 -# BB#1: # %all_on - vpcmpeqd %xmm0, %xmm0, %xmm0 - movq %rbx, 40(%rsp) - movq %r15, 32(%rsp) - movq %r14, 24(%rsp) - movq %r11, 16(%rsp) - movl %r10d, 8(%rsp) - movl %ebp, (%rsp) - vinsertf128 $1, %xmm0, %ymm0, %ymm0 - jmp .LBB1_3 -.LBB1_2: # %some_on - movq %rbx, 40(%rsp) - movq %r15, 32(%rsp) - movq %r14, 24(%rsp) - movq %r11, 16(%rsp) - movl %r10d, 8(%rsp) - movl %ebp, (%rsp) -.LBB1_3: # %some_on - callq stencil_step___uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_ - addq $56, %rsp - popq %rbx - popq %r14 - popq %r15 - popq %rbp - vzeroupper - ret -.Ltmp1: - .size stencil_step_task___uniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_, .Ltmp1-stencil_step_task___uniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_ - - .globl loop_stencil_ispc_tasks - .align 16, 0x90 - .type loop_stencil_ispc_tasks,@function -loop_stencil_ispc_tasks: # @loop_stencil_ispc_tasks -# BB#0: # %allocas - pushq %rbp - pushq %r15 - pushq %r14 - pushq %r13 - pushq %r12 - pushq %rbx - subq $104, %rsp - movl %r9d, 92(%rsp) # 4-byte Spill - movl %r8d, 88(%rsp) # 4-byte Spill - movl %ecx, 84(%rsp) # 4-byte Spill - movl %edx, 80(%rsp) # 4-byte Spill - movl %esi, %ebx - movl %edi, %ebp - movq $0, 96(%rsp) - cmpl %ebx, %ebp - jge .LBB2_10 -# BB#1: # %for_loop.lr.ph - movq 216(%rsp), %r13 - movl 168(%rsp), %r14d - movl 160(%rsp), %r12d - subl %r12d, %r14d - leaq 96(%rsp), %r15 - vpcmpeqd %xmm0, %xmm0, %xmm0 - vinsertf128 $1, %xmm0, %ymm0, %ymm1 - vmovups %ymm1, 32(%rsp) # 32-byte Folded Spill - vinsertf128 $1, %xmm0, %ymm0, %ymm0 - vmovups %ymm0, (%rsp) # 32-byte Folded Spill - .align 16, 0x90 -.LBB2_2: # %for_loop - # =>This Inner Loop Header: Depth=1 - movq %r15, %rdi - movl $96, %esi - movl $32, %edx - vzeroupper - callq ISPCAlloc - movq %rax, %rdx - movl 80(%rsp), %eax # 4-byte Reload - movl %eax, (%rdx) - movl 84(%rsp), %eax # 4-byte Reload - movl %eax, 4(%rdx) - movl 88(%rsp), %eax # 4-byte Reload - movl %eax, 8(%rdx) - movl 92(%rsp), %eax # 4-byte Reload - movl %eax, 12(%rdx) - movl %r12d, 16(%rdx) - movl 176(%rsp), %eax - movl %eax, 20(%rdx) - movl 184(%rsp), %eax - movl %eax, 24(%rdx) - testb $1, %bpl - movl 192(%rsp), %eax - movl %eax, 28(%rdx) - movq 200(%rsp), %rax - movq %rax, 32(%rdx) - movq 208(%rsp), %rax - movq %rax, 40(%rdx) - jne .LBB2_4 -# BB#3: # %if_then - # in Loop: Header=BB2_2 Depth=1 - movq %r13, 48(%rdx) - movq 224(%rsp), %rax - movq %rax, 56(%rdx) - vmovups 32(%rsp), %ymm0 # 32-byte Folded Reload - jmp .LBB2_5 - .align 16, 0x90 -.LBB2_4: # %if_else - # in Loop: Header=BB2_2 Depth=1 - movq 224(%rsp), %rax - movq %rax, 48(%rdx) - movq %r13, 56(%rdx) - vmovups (%rsp), %ymm0 # 32-byte Folded Reload -.LBB2_5: # %if_else - # in Loop: Header=BB2_2 Depth=1 - vmovaps %ymm0, 64(%rdx) - movq %r15, %rdi - movl $stencil_step_task___uniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_, %esi - movl %r14d, %ecx - movl $1, %r8d - movl $1, %r9d - vzeroupper - callq ISPCLaunch - movq 96(%rsp), %rdi - testq %rdi, %rdi - je .LBB2_7 -# BB#6: # %call_sync - # in Loop: Header=BB2_2 Depth=1 - callq ISPCSync - movq $0, 96(%rsp) -.LBB2_7: # %post_sync - # in Loop: Header=BB2_2 Depth=1 - incl %ebp - cmpl %ebp, %ebx - jne .LBB2_2 -# BB#8: # %for_exit - movq 96(%rsp), %rdi - testq %rdi, %rdi - je .LBB2_10 -# BB#9: # %call_sync72 - callq ISPCSync - movq $0, 96(%rsp) -.LBB2_10: # %post_sync73 - addq $104, %rsp - popq %rbx - popq %r12 - popq %r13 - popq %r14 - popq %r15 - popq %rbp - ret -.Ltmp2: - .size loop_stencil_ispc_tasks, .Ltmp2-loop_stencil_ispc_tasks - - - .section ".note.GNU-stack","",@progbits diff --git a/examples/stencil/stencil_cu_avx.bc b/examples/stencil/stencil_cu_avx.bc deleted file mode 100644 index d9338e7cfdf08cacd887729e1069415814417945..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 9820 zcmcIp3p~{6`X4i6MrMp5h0qy=t!>IR(Zr5xiX0AGY-rqd)@8zu*e)2Q`$lq!YW8f! z|7=@nq(m#1lpWW!MY2&SC)$d%N_C$1_xmYoQ~&*Zj?ehb`+I-C_j%s;dA`s0dER$y z?JoYN%OsI9rjSUCWDbc;B9T<#pPn>3&E^Vq8I5gXnJ$;Y)R$M#oJrjokf`QBwoPZ) z9$}lPStw{MQ#7&Hb5h)^JG=THiA+u1Xg&HBCzXH#Nddp+ksQcz*C3H*z_-@6&e!lq z=ihupuS6~7MEfQ?iz`)@85FCc3X6+&R;0U+SWnzoeL{K!n>C!MOtMk#;way9V+}RY zdfixEeC4Mcu+uqdkHwNgmSKp$Ge)MSAz2fnw% zMH9YFW;spa1Z|=2n{^}wE_YMD->atTvSSw`<%kPg7(;b3Cn?VDql6uyN=(Rni2~hy zmf2F0VhmN_rcmTHdsVuWZTsUHN1;MuP?GD{Kb^V$)z=XXmfH;|vz&MLYTFpegN~_2 z8!l3Kf5`JBV)FueoEJTAed@QuRC7UUtS~j&`GFALs#nS<1=dvk4~4NsidFg7vJ4%5T0;2oia4j<4PVWf;V@Ky9CqWrcrWh=(q1q|RoTfG_T)8v*>g=W_jW7HRFFlZ7oY@f^pY3NLu8wE# zTN{y_w{p%jhU(VK4^}p>SXt($xJBld+1L`bD6DtKT%Vy+qdv#Py#v1=uOT_=Ov|6%eo=@;1-$V5JzTjsT^{*QGg8#^x zdFZ73jughs%+z+ys*MJecb=6$I<@51t4gnrzEbn?SAO~UD-9oi@x&HB2Zhrig z-`>8`;YcazjJ3QnEhN7&lFc&9&+52q_JWt6)wL#J{m-u|tDXp7>U%cxB0h-gQOv!AutCq_%j zGhG@l&8t7_fr~b243ebhERwZt;5k(48y>byzuG&d;r#adcv6I2{ zq1HIuq&mnn*2;r>#OK|`8^t>8!#J!2!g9GgAj)Su*3`!A% z2mXzj+%GVx_)oE0{LnQm>_*#y;dTV6?dfI6D+|F7`2fDLH9(BP8iqeKvpe|Aaw5#! z{uLBb#J;v}Pa(fOJS~ee|OAHv29&7Y2=WT z{0!Y{M>99~^;r6DjXCNojMwyK?UO~ze>BH&njXmiXQbf%kaqQk@-9vginwn$0h+E$ zp9Oqkii9j`7kgj@&G>edh0~mrjWO78anPp^=QObohU~Mcd#1%w1-h8A&1-8*vm|F$ z^QQJO&O%`Su1&O~y38fDv?2J$1{APD;$08_>a$ke$ImgVs{8b5J8Xj}1lf|Cfxgrj8F*OyyV7Uu`890#~*W7eXW6P>eoaZSmSJ-kYidpOd zUxq8_lNKiz{5b461IXm(SSj4p+Ti%VgXMz>$GrnuEwIw%e{$WH`61|^?ZA0yi3r6* zo2#x`<s5)9R!>;ULi1FZkqLzvvXXeb zc}SC_0%Glu5v=~EVDFHBkJI}?iLNX2kM=%6QIQE3pa0C`Q?)~txAneRE;Hcg+rqtW z3NNnnt4;Fu(CXE;i?aIz%6$j%q-cQv)`4@u{%+&;@fojB8(B~Y7IXy*`kS=jEbK!= zVXv_VoEib-;ipb8iYx%$oLPk&9+B^qQX7P$H97}#KT=dao+lRMBA#Gg21+*uFuYB) zCNsv=BFDW|yYh|Nbv%bzE(T+2!^hOdN9H@#JqRGwwjnwUL~2h?P>Uu=sD)JiR_*CG ztDTuUet@<3&45A}@V3^FqA;uyDMH}=)Hf+6prXbiZuo`L;SQg*t2UnaG!$^yW?aM4 zJ`4p@wwyZ2wC7g!-S=~mn>0u;fB|;VlqO(s5giNBzaRTF?SMfl*}b{Xa!#KptLHr^ zE*ekU;gFf{<5gE>(CM#x&5k*K^$QH!}W-|9FaHMAXmKs%9Az( zlu%Rf*kQ;(?^9l52=X*w;-acHO;uJUNda8{Mo?b($0#pZTHd{g*T10j>WSw*pJ+iz zkrv7ii`l2O4YGQykf)Qj1zYHJSEQi|Z+ z7{+yDKrLJ&I{$U#4<&d?mWIOmtbZiHhl&#@RZuv6iwxNh;CowgnBgS=)KV%2`-or; zMcD%Mac;S$X|(s9DB0bMmjIZQk>ijpkaO@_1DAV$m0(9C$|Z}w`(Bw#x2Jfyc`K_M9W-BkPcn}qDi3Q z64VxAb!(mZj5+Qm(skUXV!hBtC>h@}Q6ow=);|>T=WD{O|FY<)$wZp~RsLlMTo!yt z^a4b3BjJFMH|69AoHKyw@X9fe*Rk+yvk#s$Fb5p|6^cZJ5L8%$B~S{%j3olIo~lLL zx#byk{jE@!XLk2{$BhqkZ$!<(#t5zT5a`juZKq|(p zUH=f;#B9Lw7?=Q6&4mXa9%v6E3S?|WjsGMdg>VKk(fTkF;DK{mgRvn1JcCg*zqU;Zo1eFN8Vg?;O8V9)Pg@=sfV=`!iN!ktErL|5Ya9TRs06U^4LyTX zB^*VdZ5hTP7-)6CIpL>e;b;eD%FBcu9}q6uPrs==FpouJ^~5^_j}b1|1^DH!2c__X zL5WJ0|4C5p{$NmA{%?YEKCt3gP)@*nIGzpP$NVvr_$FY!nEPj#566lOREoWWc~mS| zLyh_8m>;V=CeJIG6%#Q(j_X#S#85n*>%yUa8P9dNXmiAMZKx6mts0-QGHWr>$KVtI z(I}C9WspK@>oP9^o94{l3Gj~h9m{c@gv_!%zMpD|rbanmNc!nj21($-2yBv*sg~)gjXlB^B)s7RiBTIy! zW#+UIF-B;IMF)knAnm(9G{6`l2SS_376=R`QaHkZLm48y5O%?&V2VUcD4TA6Q%nSG zDOy7~xZgw?3<=slGSxTwEb%NCTOalelX(KtL=3M#y>#r{2Ad$-NatV{G@oDz5w@a5 zXbqPI6D;vHn&1g!LEiKU3-d*mz{e3=nz1y9EespCrvuNMZ_BOp9Bf=MY=QOa22MFY z90LQkXp~b4nMHrLKHO%(CTN~A#QLsrNOSUkMB?pq}KOD-5gcQ?F$~s zT5vMU_y4g`al5I6HkvvL-f;f%fO=&ycN>W}+ij0{ z$4hH(dp}+sw>h*bs$<2gh8=5GX&yc{PxXn(#{B#9)I8184gRIFweVr5&OsXSL1FBBg4VZ6ZB>Jv}CS9kZ9DJiulRC9+JJwlga$bQ-?b z4l$C)UK4B8QLL`mS?!`&)vezZE2Mv`K#zAy6$U4Fw{b%0qQg7AOxdo!kwp$(%F9A8Dl@?_b-ZeYkxhkfEf2uH>p74jf$cY{| zCsnhqlGSCTJoJbGzZv(tDGzkAV9x6}B+_*F776F?863T8Z$>^nx@}0|HPvZHko=T; zq65X;Jp#&(HEz2fIeU1nCf)g><*Bnr5P3V^6GT(IC;DoN+iszT+^o}cF9+-vkajj& zP|mQ+%^W~1$)7_a)xz(PQ>o5e^z)=L{9jWHqe`gTrV07pyGR*)l?)r*3|r$2JDUu9 zclL~5jA*@qKY!1(6Il0Z{lXz*H*)9({itcG!l2NqU8*l7CK*pnG8P!s*$bE&Oj8vN z3a!e8T9a5|^r7xFV*$U;K8UHoHdWC?LhNf2<&3AUGZqLwT?nr7Jt0Uo(=l9Ij=5+w z>2Hz>uAPLsAAb9_WBO}*=b#5#|(0#uM-Lq zMcXQv5BaPGOn6CRGqbK@yWCA(+QL@K;&aA)UiC%VMmPFW8VtPEsodZB3u%B&(N3w< z$NbGGm(AS~2P@u)Bqgy|}|R;aO*K-h7c&+{seg0QXTUh6SlFxJIFg?xgsMtHncS)+0Dl8H!wk64to|71N+*neky{;Q~Z`W4A^oxQ_n`Oxr z+q?==2;NqAxlO`}qZdVW(*{b!T z?xN#xv9&?n1^=qmmL+j5ORC#e9;K>ZE(01P`s@VHF2^xGlw4V;$ohdZ@USf@0|o{MUrKSV!C| zveMr#W;Uv|op&G_7n(Wzpd}u$Ei1*Ow8t*Z8`o4MseMuCloHH7V9E@^JAXP%9<;bR zkMOOY9G~ zwHW&E7OcTvwv5o&;^=7~=#SryL>RV^fwV>)yWJW$lj={g%`-h&CQCdr9`s&jv*o#) z6I~hxb3s&GgeSG|);Sg0l9qCdi*7fDUUM@%bEnyGpG8isjMv;E3|Og~mm^QpAP-(U z6X8WI>?~TVQk{Ie7q`$REp;32xyUcDnGT$fheyw7DVRln%k{P4F7(y%gX`{GbiJ^K zJewqYg3S3eA{d|Pw~?WEgBK;B7IqKo=#fux%TpF6%j+UDSGOBDvYij*vN3CI*hF^f zvVVQY;=B1@s!kjAerd=u8r4MC*0W0Z`-7SILaT2S6OFsqiN^1a*u>n|tX9FKSw%Wx z(|%fd0r%3=qDw^MY(3F9|0~hBkfu_7`tDL8=h7EC_JJYP!v4~ye7>5T2G1eG=%`0g z`&LIbLnY1dbg+2rY1t(WWTK#r z4&cnd6-zN3)#$NzPN4}H{DqZ|wbJaYUdx$P*^cg{FzG^)?2bZWECnq1#X8-6N;D>U z&`T_A?&bKLc?rgIdTr`3%n=mXD0pp@bH&YEZvQGqLvZmd$#b@Lxr*hr!N2iMxz!j@ w&b+p-*3@=-JXPJqJ7sUPfo{;5S`6bscg-This Inner Loop Header: Depth=1 - movq %rax, %r12 - movq %r12, %rdi - movl $96, %esi - movl $32, %edx - callq CUDAAlloc - testb $1, %r13b - jne .LBB0_4 -# BB#3: # %if_then - # in Loop: Header=BB0_2 Depth=1 - movl %ebx, 252(%rsp) - leaq 252(%rsp), %rax - movq %rax, 256(%rsp) - movl 20(%rsp), %eax # 4-byte Reload - movl %eax, 248(%rsp) - leaq 248(%rsp), %rax - movq %rax, 264(%rsp) - movl 24(%rsp), %eax # 4-byte Reload - movl %eax, 244(%rsp) - leaq 244(%rsp), %rax - movq %rax, 272(%rsp) - movl 28(%rsp), %eax # 4-byte Reload - movl %eax, 240(%rsp) - leaq 240(%rsp), %rax - movq %rax, 280(%rsp) - movl %r15d, 236(%rsp) - leaq 236(%rsp), %rax - movq %rax, 288(%rsp) - movl 32(%rbp), %eax - movl %eax, 232(%rsp) - leaq 232(%rsp), %rax - movq %rax, 296(%rsp) - movl 40(%rbp), %eax - movl %eax, 228(%rsp) - leaq 228(%rsp), %rax - movq %rax, 304(%rsp) - movl 48(%rbp), %eax - movl %eax, 224(%rsp) - leaq 224(%rsp), %rax - movq %rax, 312(%rsp) - movq 56(%rbp), %rax - movq %rax, 216(%rsp) - leaq 216(%rsp), %rax - movq %rax, 320(%rsp) - movq 64(%rbp), %rax - movq %rax, 208(%rsp) - leaq 208(%rsp), %rax - movq %rax, 328(%rsp) - movq 72(%rbp), %rax - movq %rax, 200(%rsp) - leaq 200(%rsp), %rax - movq %rax, 336(%rsp) - movq 80(%rbp), %rax - movq %rax, 192(%rsp) - leaq 192(%rsp), %rax - movq %rax, 344(%rsp) - movl $1, 8(%rsp) - movl $1, (%rsp) - movq %r12, %rdi - movl $.L.module_str, %esi - movl $.L.ptx_str, %edx - movl $.L.func_str, %ecx - leaq 256(%rsp), %r8 - jmp .LBB0_5 - .align 16, 0x90 -.LBB0_4: # %if_else - # in Loop: Header=BB0_2 Depth=1 - movl %ebx, 92(%rsp) - leaq 92(%rsp), %rax - movq %rax, 96(%rsp) - movl 20(%rsp), %eax # 4-byte Reload - movl %eax, 88(%rsp) - leaq 88(%rsp), %rax - movq %rax, 104(%rsp) - movl 24(%rsp), %eax # 4-byte Reload - movl %eax, 84(%rsp) - leaq 84(%rsp), %rax - movq %rax, 112(%rsp) - movl 28(%rsp), %eax # 4-byte Reload - movl %eax, 80(%rsp) - leaq 80(%rsp), %rax - movq %rax, 120(%rsp) - movl %r15d, 76(%rsp) - leaq 76(%rsp), %rax - movq %rax, 128(%rsp) - movl 32(%rbp), %eax - movl %eax, 72(%rsp) - leaq 72(%rsp), %rax - movq %rax, 136(%rsp) - movl 40(%rbp), %eax - movl %eax, 68(%rsp) - leaq 68(%rsp), %rax - movq %rax, 144(%rsp) - movl 48(%rbp), %eax - movl %eax, 64(%rsp) - leaq 64(%rsp), %rax - movq %rax, 152(%rsp) - movq 56(%rbp), %rax - movq %rax, 56(%rsp) - leaq 56(%rsp), %rax - movq %rax, 160(%rsp) - movq 64(%rbp), %rax - movq %rax, 48(%rsp) - leaq 48(%rsp), %rax - movq %rax, 168(%rsp) - movq 80(%rbp), %rax - movq %rax, 40(%rsp) - leaq 40(%rsp), %rax - movq %rax, 176(%rsp) - movq 72(%rbp), %rax - movq %rax, 32(%rsp) - leaq 32(%rsp), %rax - movq %rax, 184(%rsp) - movl $1, 8(%rsp) - movl $1, (%rsp) - movq %r12, %rdi - movl $.L.module_str, %esi - movl $.L.ptx_str, %edx - movl $.L.func_str1, %ecx - leaq 96(%rsp), %r8 -.LBB0_5: # %if_else - # in Loop: Header=BB0_2 Depth=1 - movl %r14d, %r9d - callq CUDALaunch - movq 352(%rsp), %rdi - testq %rdi, %rdi - je .LBB0_7 -# BB#6: # %call_sync - # in Loop: Header=BB0_2 Depth=1 - callq ISPCSync - movq $0, 352(%rsp) -.LBB0_7: # %post_sync - # in Loop: Header=BB0_2 Depth=1 - incl %r13d - cmpl %r13d, 16(%rsp) # 4-byte Folded Reload - movq %r12, %rax - jne .LBB0_2 -# BB#8: # %for_exit - movq 352(%rsp), %rdi - testq %rdi, %rdi - je .LBB0_10 -# BB#9: # %call_sync113 - callq ISPCSync - movq $0, 352(%rsp) -.LBB0_10: # %post_sync114 - leaq -40(%rbp), %rsp - popq %rbx - popq %r12 - popq %r13 - popq %r14 - popq %r15 - popq %rbp - ret -.Ltmp0: - .size loop_stencil_ispc_tasks, .Ltmp0-loop_stencil_ispc_tasks - - .type .L.module_str,@object # @.module_str - .section .rodata,"a",@progbits -.L.module_str: - .asciz "stencil.ispc" - .size .L.module_str, 13 - - .type .L.ptx_str,@object # @.ptx_str - .align 16 -.L.ptx_str: - .asciz "//\n// Generated by LLVM NVPTX Back-End\n//\n\n.version 3.1\n.target sm_35, texmode_independent\n.address_size 64\n\n\t// .globl\tstencil_step_task\n // @stencil_step_task\n.entry stencil_step_task(\n\t.param .u32 stencil_step_task_param_0,\n\t.param .u32 stencil_step_task_param_1,\n\t.param .u32 stencil_step_task_param_2,\n\t.param .u32 stencil_step_task_param_3,\n\t.param .u32 stencil_step_task_param_4,\n\t.param .u32 stencil_step_task_param_5,\n\t.param .u32 stencil_step_task_param_6,\n\t.param .u32 stencil_step_task_param_7,\n\t.param .u64 .ptr .align 8 stencil_step_task_param_8,\n\t.param .u64 .ptr .align 8 stencil_step_task_param_9,\n\t.param .u64 .ptr .align 8 stencil_step_task_param_10,\n\t.param .u64 .ptr .align 8 stencil_step_task_param_11\n)\n{\n\t.reg .pred %p<396>;\n\t.reg .s16 %rc<396>;\n\t.reg .s16 %rs<396>;\n\t.reg .s32 %r<396>;\n\t.reg .s64 %rl<396>;\n\t.reg .f32 %f<396>;\n\t.reg .f64 %fl<396>;\n\n// BB#0: // %allocas\n\tmov.u32 \t%r12, %ctaid.x;\n\tld.param.u32 \t%r13, [stencil_step_task_param_4];\n\tadd.s32 \t%r16, %r12, %r13;\n\tadd.s32 \t%r0, %r16, 1;\n\tsetp.ge.s32 \t%p0, %r16, %r0;\n\t@%p0 bra \tBB0_11;\n// BB#1: // %for_test28.i.preheader.lr.ph\n\tld.param.u32 \t%r0, [stencil_step_task_param_0];\n\tld.param.u32 \t%r1, [stencil_step_task_param_1];\n\tld.param.u32 \t%r2, [stencil_step_task_param_2];\n\tld.param.u32 \t%r3, [stencil_step_task_param_3];\n\tld.param.u32 \t%r4, [stencil_step_task_param_5];\n\tld.param.u32 \t%r5, [stencil_step_task_param_6];\n\tmul.lo.s32 \t%r5, %r5, %r4;\n\tld.param.u64 \t%rl3, [stencil_step_task_param_8];\n\tld.f64 \t%fl0, [%rl3];\n\tld.f64 \t%fl1, [%rl3+8];\n\tld.param.u64 \t%rl0, [stencil_step_task_param_9];\n\tld.f64 \t%fl2, [%rl3+16];\n\tld.param.u64 \t%rl1, [stencil_step_task_param_10];\n\tld.param.u64 \t%rl2, [stencil_step_task_param_11];\n\tld.f64 \t%fl3, [%rl3+24];\n\tshl.b32 \t%r6, %r4, 1;\n\tmul.lo.s32 \t%r7, %r4, 3;\n\tmul.lo.s32 \t%r8, %r4, -3;\n\tshl.b32 \t%r9, %r5, 1;\n\tmul.lo.s32 \t%r10, %r5, 3;\n\tmul.lo.s32 \t%r11, %r5, -3;\n\tadd.s32 \t%r12, %r12, %r13;\n\tneg.s32 \t%r13, %r9;\n\tneg.s32 \t%r14, %r6;\n\tmov.u32 \t%r32, WARP_SZ;\nBB0_2: // %for_test28.i.preheader\n // =>This Loop Header: Depth=1\n // Child Loop BB0_9 Depth 2\n // Child Loop BB0_5 Depth 3\n\tmov.u32 \t%r15, %r16;\n\tsetp.ge.s32 \t%p0, %r2, %r3;\n\t@%p0 bra \tBB0_10;\n// BB#3: // %for_test35.i.preheader.lr.ph\n // in Loop: Header=BB0_2 Depth=1\n\tsetp.lt.s32 \t%p0, %r0, %r1;\n\t@%p0 bra \tBB0_4;\n\tbra.uni \tBB0_10;\nBB0_4: // in Loop: Header=BB0_2 Depth=1\n\tmul.lo.s32 \t%r16, %r15, %r5;\n\tmov.u32 \t%r17, %r2;\nBB0_9: // %for_loop37.i.lr.ph.us\n // Parent Loop BB0_2 Depth=1\n // => This Loop Header: Depth=2\n // Child Loop BB0_5 Depth 3\n\tmad.lo.s32 \t%r18, %r17, %r4, %r16;\n\tadd.s32 \t%r19, %r18, %r4;\n\tadd.s32 \t%r20, %r18, %r6;\n\tsub.s32 \t%r21, %r18, %r4;\n\tadd.s32 \t%r22, %r18, %r7;\n\tadd.s32 \t%r23, %r18, %r14;\n\tadd.s32 \t%r24, %r18, %r5;\n\tadd.s32 \t%r25, %r18, %r8;\n\tadd.s32 \t%r26, %r18, %r9;\n\tsub.s32 \t%r27, %r18, %r5;\n\tadd.s32 \t%r28, %r18, %r10;\n\tadd.s32 \t%r29, %r18, %r13;\n\tadd.s32 \t%r30, %r18, %r11;\n\tmov.u32 \t%r31, %r0;\nBB0_5: // %for_loop37.i.us\n // Parent Loop BB0_2 Depth=1\n // Parent Loop BB0_9 Depth=2\n // => This Inner Loop Header: Depth=3\n\tmov.u32 \t%r33, %tid.x;\n\tadd.s32 \t%r34, %r32, -1;\n\tand.b32 \t%r33, %r34, %r33;\n\tadd.s32 \t%r33, %r33, %r31;\n\tsetp.ge.s32 \t%p0, %r33, %r1;\n\t@%p0 bra \tBB0_7;\n// BB#6: // %pl_dolane.i.us\n // in Loop: Header=BB0_5 Depth=3\n\tadd.s32 \t%r34, %r18, %r33;\n\tshl.b32 \t%r34, %r34, 3;\n\tadd.s32 \t%r35, %r34, -8;\n\tcvt.s64.s32 \t%rl3, %r35;\n\tadd.s64 \t%rl3, %rl3, %rl1;\n\tld.f64 \t%fl4, [%rl3];\n\tadd.s32 \t%r35, %r34, 8;\n\tcvt.s64.s32 \t%rl3, %r35;\n\tadd.s64 \t%rl3, %rl3, %rl1;\n\tld.f64 \t%fl5, [%rl3];\n\tadd.s32 \t%r35, %r34, -16;\n\tcvt.s64.s32 \t%rl3, %r35;\n\tadd.s64 \t%rl3, %rl3, %rl1;\n\tld.f64 \t%fl6, [%rl3];\n\tadd.s32 \t%r35, %r34, 16;\n\tcvt.s64.s32 \t%rl3, %r35;\n\tadd.s64 \t%rl3, %rl3, %rl1;\n\tld.f64 \t%fl9, [%rl3];\n\tadd.s32 \t%r35, %r19, %r33;\n\tshl.b32 \t%r35, %r35, 3;\n\tcvt.s64.s32 \t%rl3, %r35;\n\tadd.s64 \t%rl3, %rl3, %rl1;\n\tld.f64 \t%fl8, [%rl3];\n\tadd.s32 \t%r35, %r34, -24;\n\tcvt.s64.s32 \t%rl3, %r35;\n\tadd.s64 \t%rl3, %rl3, %rl1;\n\tld.f64 \t%fl7, [%rl3];\n\tadd.s32 \t%r35, %r34, 24;\n\tcvt.s64.s32 \t%rl3, %r35;\n\tadd.s64 \t%rl3, %rl3, %rl1;\n\tld.f64 \t%fl10, [%rl3];\n\tadd.s32 \t%r35, %r20, %r33;\n\tshl.b32 \t%r35, %r35, 3;\n\tcvt.s64.s32 \t%rl3, %r35;\n\tadd.s64 \t%rl3, %rl3, %rl1;\n\tld.f64 \t%fl13, [%rl3];\n\tadd.s32 \t%r35, %r21, %r33;\n\tshl.b32 \t%r35, %r35, 3;\n\tcvt.s64.s32 \t%rl3, %r35;\n\tadd.s64 \t%rl3, %rl3, %rl1;\n\tld.f64 \t%fl12, [%rl3];\n\tadd.s32 \t%r35, %r22, %r33;\n\tshl.b32 \t%r35, %r35, 3;\n\tcvt.s64.s32 \t%rl3, %r35;\n\tadd.s64 \t%rl3, %rl3, %rl1;\n\tld.f64 \t%fl11, [%rl3];\n\tadd.s32 \t%r35, %r23, %r33;\n\tshl.b32 \t%r35, %r35, 3;\n\tcvt.s64.s32 \t%rl3, %r35;\n\tadd.s64 \t%rl3, %rl3, %rl1;\n\tld.f64 \t%fl16, [%rl3];\n\tadd.s32 \t%r35, %r24, %r33;\n\tshl.b32 \t%r35, %r35, 3;\n\tcvt.s64.s32 \t%rl3, %r35;\n\tadd.s64 \t%rl3, %rl3, %rl1;\n\tld.f64 \t%fl15, [%rl3];\n\tadd.s32 \t%r35, %r25, %r33;\n\tshl.b32 \t%r35, %r35, 3;\n\tcvt.s64.s32 \t%rl3, %r35;\n\tadd.s64 \t%rl3, %rl3, %rl1;\n\tld.f64 \t%fl14, [%rl3];\n\tadd.s32 \t%r35, %r26, %r33;\n\tshl.b32 \t%r35, %r35, 3;\n\tcvt.s64.s32 \t%rl3, %r35;\n\tadd.s64 \t%rl3, %rl3, %rl1;\n\tld.f64 \t%fl19, [%rl3];\n\tadd.s32 \t%r35, %r27, %r33;\n\tshl.b32 \t%r35, %r35, 3;\n\tcvt.s64.s32 \t%rl3, %r35;\n\tadd.s64 \t%rl3, %rl3, %rl1;\n\tld.f64 \t%fl18, [%rl3];\n\tadd.s32 \t%r35, %r28, %r33;\n\tshl.b32 \t%r35, %r35, 3;\n\tcvt.s64.s32 \t%rl3, %r35;\n\tadd.s64 \t%rl3, %rl3, %rl1;\n\tld.f64 \t%fl17, [%rl3];\n\tadd.s32 \t%r35, %r29, %r33;\n\tshl.b32 \t%r35, %r35, 3;\n\tcvt.s64.s32 \t%rl3, %r35;\n\tadd.s64 \t%rl3, %rl3, %rl1;\n\tld.f64 \t%fl24, [%rl3];\n\tcvt.s64.s32 \t%rl4, %r34;\n\tadd.s64 \t%rl3, %rl4, %rl1;\n\tld.f64 \t%fl21, [%rl3];\n\tadd.s32 \t%r33, %r30, %r33;\n\tshl.b32 \t%r33, %r33, 3;\n\tcvt.s64.s32 \t%rl3, %r33;\n\tadd.s64 \t%rl3, %rl3, %rl1;\n\tld.f64 \t%fl20, [%rl3];\n\tadd.s64 \t%rl3, %rl4, %rl2;\n\tld.f64 \t%fl23, [%rl3];\n\tadd.s64 \t%rl4, %rl4, %rl0;\n\tld.f64 \t%fl22, [%rl4];\n\tadd.f64 \t%fl25, %fl21, %fl21;\n\tsub.f64 \t%fl23, %fl25, %fl23;\n\tadd.f64 \t%fl6, %fl6, %fl9;\n\tadd.f64 \t%fl6, %fl6, %fl13;\n\tadd.f64 \t%fl6, %fl6, %fl16;\n\tadd.f64 \t%fl6, %fl6, %fl19;\n\tadd.f64 \t%fl6, %fl6, %fl24;\n\tadd.f64 \t%fl4, %fl4, %fl5;\n\tadd.f64 \t%fl4, %fl4, %fl8;\n\tadd.f64 \t%fl4, %fl4, %fl12;\n\tadd.f64 \t%fl4, %fl4, %fl15;\n\tadd.f64 \t%fl4, %fl4, %fl18;\n\tmul.f64 \t%fl5, %fl0, %fl21;\n\tfma.rn.f64 \t%fl4, %fl1, %fl4, %fl5;\n\tfma.rn.f64 \t%fl4, %fl2, %fl6, %fl4;\n\tadd.f64 \t%fl5, %fl7, %fl10;\n\tadd.f64 \t%fl5, %fl5, %fl11;\n\tadd.f64 \t%fl5, %fl5, %fl14;\n\tadd.f64 \t%fl5, %fl5, %fl17;\n\tadd.f64 \t%fl5, %fl5, %fl20;\n\tfma.rn.f64 \t%fl4, %fl3, %fl5, %fl4;\n\tfma.rn.f64 \t%fl4, %fl4, %fl22, %fl23;\n\tst.f64 \t[%rl3], %fl4;\nBB0_7: // %safe_if_after_true.i.us\n // in Loop: Header=BB0_5 Depth=3\n\tadd.s32 \t%r31, %r32, %r31;\n\tsetp.lt.s32 \t%p0, %r31, %r1;\n\t@%p0 bra \tBB0_5;\n// BB#8: // %for_exit38.i.us\n // in Loop: Header=BB0_9 Depth=2\n\tadd.s32 \t%r17, %r17, 1;\n\tsetp.eq.s32 \t%p0, %r17, %r3;\n\t@%p0 bra \tBB0_10;\n\tbra.uni \tBB0_9;\nBB0_10: // %for_exit31.i\n // in Loop: Header=BB0_2 Depth=1\n\tadd.s32 \t%r16, %r15, 1;\n\tsetp.ne.s32 \t%p0, %r15, %r12;\n\t@%p0 bra \tBB0_2;\nBB0_11: // %stencil_step___uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_.exit\n\tret;\n}\n\n" - .size .L.ptx_str, 7954 - - .type .L.func_str,@object # @.func_str - .align 16 -.L.func_str: - .asciz "stencil_step_task" - .size .L.func_str, 18 - - .type .L.func_str1,@object # @.func_str1 - .align 16 -.L.func_str1: - .asciz "stencil_step_task" - .size .L.func_str1, 18 - - - .section ".note.GNU-stack","",@progbits diff --git a/examples/stencil/stencil_cu_nvptx64.bc b/examples/stencil/stencil_cu_nvptx64.bc deleted file mode 100644 index 2f3c05dadda86c8f7536698a5ab102c3fd7fb8a6..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 5256 zcma)A3sh5Qwmv7X6Cs2|h!6~zG~!hkYUCwWg@gnf6igt&$`oxtWcm;%A|PNbhleFj z@q#q%)w{-N?9yIcYwWdJ7;J5M>d*m25f!dfgJ7AiisBoFM`r)$BvEu`)@2<|4*!3$ z|GoG3?Qd_5RX5(0@i1%-8^b(Hr5FRlu$k~nl*sq1+F0=%LFn=tyqp)z=8A(@A7_w) zI7V!ZXKcA3)HjOjAMX*WnXC1vmdRU_NuF4iGnZ)N3Nwz`jw#QN;9w`opSCx<((ZsC zLdiJhRbGv?bGD75zwv5r5tk^8s>rcbL@FwBxJ1!0qHumiA$H9a^nd++Kil}E7wZ9f zl_^E`;=vvhK7C3}rBa>`1NpmGieW`AVY5MC4Kr+132*HuOfpCC)D-QAAy6(L@vphDN&>#a%==Vz@O7B zn5OTVUfT7ZX3FF}&6LS|x+(ANQsm6@`z6jj$_?}_@L8b|k{B1FC1tH0S_NEKI1boH6f>^DK2K;z1$g8|Zo|{>j~JmNVp!oREc9=7wr=j~Dq^MlWM z7^gjX>4%1t$~nSDd(4$PYlZ2fvjnSV)(B^fELwYRG%;oAhZ?NRk?>zj;>!CyeMsKa z7UuqN;XX2?AzauQgV$tWKb2q!()ERf0tQ&SE{3>KhQh4ir(h|c0^`Zf6R6C!)5)#Gm%`m(QoD+8>m zf*RRJeXE6*&)~Q@SOm6^Tk##(iE-Kqh2a5k$ZD76_yNeyO1>OUb z>v3eVQaLh#oF2jTO)Jr{Z~`1FF5#kBZ~@y^`a!*C^1^y{z0Jx9Wu!laWhUvNWIN1$ zM~s&|&8h|LOjgD)EEA0)(qql-#oFU0iM=%0qm9ezgJo}Q5hnUm>qc<=b*JOs!txw! zBeu}87dgJu>39(=$6$7F+y^=SgmV0^lJ&4$OnMYimgmrxHNQ}nS74&bbI7t5Sr)`< z7dfaeEe=XvD42mSk1gaGFJhW+ShR#E6aAQOVYNYQ${VpghfT$%ozGGR}0rv4!YnW zI{09AW0})N!97FG_p6-8CH@*6EKaBs)I5H>sUfOyc+2ak>;gR7h8$sW zkzEPV8Jh$T6D~XCWmVmg!-kp%%jsaW4G>Hv?(N(g`9>FC7bs0>e2Co5zelJPw zLD^Sl-;d7N==(fDDDEzqagWdB#{(;$-IGtv9P8{i)GQ~Cy^0Zy6nzJz-g6^W2!Tze z@oFb}0SI!)QJ9V}uR4&>s&99EejOI|*T1jY(EG@VOD(*gq-vvylY9%Hn3cl<7@{-o z<4GtRJq1%Sp21h6Ge|dJkKwJX0K1U}>|0302~L!g9AAxFb2%4JPn4a;y!z!plp=Cq zNriFXOoIf{mmetQxHAT%bzc+&+IKNtOEDYeew%}FGQ{W6#EWo)u~~bZ_|+bw)48|@ zVE-J!Uc4SLBs%%mTDIGnw}CpHc$OY-M;z=ykiZ3=xbFZk$AwVh5R}b3jjpy4KEbEywFA>>rI$s^E`mAOKKWPG-tDRaA zClg+ws--UMU48`e+f^#tf8QWU-gh02$RZ82!*}a+@vDRE6vu~`pd#Mw3e5s}&P+<6 zYZvH!h%+C!oVN~E7h3Ita|q{W4F=ztOUEoY;S>?ePq5vK|7Lo{Q?;JDi-KwB8w#dt zVwbj9{z&K9;O6NvIxR3>J2`Xxj4pnCpzUAc3eZyM>b#s(AGK!gq=LNx!DhOdyX&5z zYsLm(zmF_2^1gh^M@Zm2O=dm5j`p|oKERVpu73qgP>-k1oaaH#b7v&C7syNH2&bcxi{C34l>rV!31m)XaFbDhLQyre#W6OGx6$Wh?+;G15l6Dlt8 zeit#IJ>|7jTERcsB19mzlG!1zt)(QlE?(-8_k_vqeHA3uK=H_aQ#{ z((Z?jp*={co}3uB9H$HF2fCo-@gy($wmd)k0>16vh%614F8<4BQfHU=9ALG%3i}Hv z?4O!~arOiGId5Md;iN$$M4paG59-FXCvHSaOzzo$$^zI+UF94lc7urYZX=zl-1ldw zW*5inCXw8JQsuwZtuf1ic*k7?*$4zl&7d&?4a=9jqG5C`yQyK#9Kk)e%RaG(t54kG z1f4WN<-CAA&HR9gN1a{{WD|=W)>xvpX8rQ`a4CEny_Hl z03YeQp=K}9jVpLD>P5xW05n#-sRug9NLcs7&8QoHHn)tt(o`m|3rd*16m(s>38nd{gj>|_q!$Liz@)kbyw@o2Nv9M&6nIw0AM&RPNs}S!GN^R!BLvu0 zcPW=TOW8Roou#e$yIL^UIXS-LZov_B3r?YWe#8&pf7i=m<(}btowG#u6KJovr14Y~ zLn_WAXTQV+AOg(=n!6NzMO?pZPIJmz0WSH^ON65=uCi?gqKSQ-IR#!EaO>7x!2U~# z-CeXV8aLFWxaw~=)L(B2dHEx$sTNN{gfk6{Izp^QBu8JqAUt`$8@G(PvCKf{?sVyT z4^-~c!0JWib0^;%0ai^=vpaYHuNpbvi74+VbMv~%vgZ<-m`B7*wT zRrbDf?)m-B@0@%0zPE=rJ-ay^3awuZa)elu{#dFQo zxRzyKsQ3=h6Xfp#N6hah^d}(drwt47z^6W({}dqM7rz9(mEyD<{^L0owRAi@d@7W1 zLKuHWv+$6h{Y=8Sb9c3W?jyqG{35e-JcNAextxpxuOReq$;r6bQKuDkzJxmG@yNBr zK{qK+%ydl9qv$v0$p4w=EXO=Qf=l_c>;fni`VyL77j|}L#L<_M4Wj=pd(&~mI`$ub z>Fkj<(|HIRRz9B41wK|F{a9BRGNLZWeT48XcTj@MKA!(P1g2kX%^UM~n|P%yZ%{w| zb*B3P0Q0Y#MmxZ83fFJb`i6w`6PWbhu-taAd1@jd;}B2%_n%bw$xgXmRy^@-GQT!9 z%DGlx)-^-y?^BA`E=l`rQsOQv-g8Rl#+MbIqk`NhyV@<|zN<)G=POdj35rkpMbnpj z9hT5Fo4h1#M0*-{*u@JHziC|Jds32rV!VdO_KTBG%Q(9t@hJYN#)sK;6^HfV1WY^4v&BoX~@PPOAFG@bvTiDC<$Wy82|isrvD~;<>E&J2G;81J%{(7g0Y_jIpa% zYWOX`c&bzNp;PL&RJ}d6v554wDqgIewvb&cNqkVp$wM+vN!7X0mv!ARW!xFcKaTlP zzKivrZ^2RT8UN5PBF-ebzft+R)Hx3FVf%jIuFED*OP#(Y^Hx+o)cdHY+n0IEyJVk4{XX{3bBTZj z`~h#3Pw~V}SvTs(moA?z4@e#t2?!tX{S{9Y|Axfl_3PnpFgre6uPfjwtG=F9^&O*q zEBM8d+K+3v`_3q5mkYB06PDcHbp^RUS1s9>j=Zs!?hjv`!>)06E!`L1C_kd*F0k`? z13!O;Qy0hGHgS)XRGk{0JDX42s;^~+{63#fs&jed2)JI!W!N9QZ{C`X(>XQyv?=3% zdN-@%eTeU$Y;b==-*1PZ<@v1+Za??El7|0FEVh;IeUt8Yoz`QXvjX&O(_g9g7rqbX z=Fxwr$vKSggPYy-Ed&t7dHyc?!`qBLNSyFH%SaU#A57|(||Io)6WVY#047hrBpZ8OY0td zAnKNiCOmyvH{vtsqg0JG+)@r+Q}9A~`j)4V#!t~cmB=O#4$6;l$;SMIql+l`n+zZ8 h#!+PjbI?D^Nw`2~9S99y1qO31u2E|vw$=`CzXP&N7+U}U diff --git a/examples/stencil/stencil_cu_nvptx64.ll b/examples/stencil/stencil_cu_nvptx64.ll deleted file mode 100644 index d0c5e824..00000000 --- a/examples/stencil/stencil_cu_nvptx64.ll +++ /dev/null @@ -1,269 +0,0 @@ -; ModuleID = 'stencil_cu_nvptx64.bc' -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" -target triple = "nvptx64" - -; Function Attrs: nounwind readnone -declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0 - -; Function Attrs: nounwind readnone -declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #0 - -; Function Attrs: nounwind readnone -declare i32 @llvm.nvvm.read.ptx.sreg.warpsize() #0 - -; Function Attrs: nounwind -define void @stencil_step_task(i32 %x0, i32 %x1, i32 %y0, i32 %y1, i32 %z0, i32 %Nx, i32 %Ny, i32 %Nz, double* nocapture %coef, double* %vsq, double* %Ain, double* %Aout) #1 { -allocas: - %bid.i.i = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2 - %add_z0_load_calltmp = add i32 %bid.i.i, %z0 - %bid.i.i21 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2 - %add_z0_load15_calltmp18 = add i32 %z0, 1 - %add_add_z0_load15_calltmp18_ = add i32 %add_z0_load15_calltmp18, %bid.i.i21 - %mul_Nx_load_Ny_load.i = mul i32 %Ny, %Nx - %coef_load_offset_load.i = load double* %coef, align 8 - %coef_load16_offset.i = getelementptr double* %coef, i64 1 - %coef_load16_offset_load.i = load double* %coef_load16_offset.i, align 8 - %coef_load19_offset.i = getelementptr double* %coef, i64 2 - %coef_load19_offset_load.i = load double* %coef_load19_offset.i, align 8 - %coef_load22_offset.i = getelementptr double* %coef, i64 3 - %coef_load22_offset_load.i = load double* %coef_load22_offset.i, align 8 - %less_z_load_z1_load.i161 = icmp slt i32 %add_z0_load_calltmp, %add_add_z0_load15_calltmp18_ - br i1 %less_z_load_z1_load.i161, label %for_test28.i.preheader.lr.ph, label %stencil_step___uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_.exit - -for_test28.i.preheader.lr.ph: ; preds = %allocas - %less_y_load_y1_load.i159 = icmp slt i32 %y0, %y1 - %less_xb_load_x1_load.i157 = icmp slt i32 %x0, %x1 - %x1_load199_broadcast_init.i = insertelement <1 x i32> undef, i32 %x1, i32 0 - %mul__Nx_load119.i = shl i32 %Nx, 1 - %mul__Nx_load167.i = mul i32 %Nx, 3 - %mul__Nx_load127.i = mul i32 %Nx, -2 - %Ain_load65_ptr2int.i = ptrtoint double* %Ain to i64 - %mul__Nx_load175.i = mul i32 %Nx, -3 - %mul__Nxy_load136.i = shl i32 %mul_Nx_load_Ny_load.i, 1 - %mul__Nxy_load184.i = mul i32 %mul_Nx_load_Ny_load.i, 3 - %mul__Nxy_load144.i = mul i32 %mul_Nx_load_Ny_load.i, -2 - %mul__Nxy_load192.i = mul i32 %mul_Nx_load_Ny_load.i, -3 - %Aout_load_ptr2int.i = ptrtoint double* %Aout to i64 - %vsq_load_ptr2int.i = ptrtoint double* %vsq to i64 - %0 = add i32 %bid.i.i21, %z0 - br label %for_test28.i.preheader - -for_test28.i.preheader: ; preds = %for_exit31.i, %for_test28.i.preheader.lr.ph - %z.0.i162 = phi i32 [ %add_z0_load_calltmp, %for_test28.i.preheader.lr.ph ], [ %z_load245_plus1.i, %for_exit31.i ] - br i1 %less_y_load_y1_load.i159, label %for_test35.i.preheader.lr.ph, label %for_exit31.i - -for_test35.i.preheader.lr.ph: ; preds = %for_test28.i.preheader - %mul_z_load45_Nxy_load.i = mul i32 %z.0.i162, %mul_Nx_load_Ny_load.i - br i1 %less_xb_load_x1_load.i157, label %for_loop37.i.lr.ph.us, label %for_exit31.i - -for_exit38.i.us: ; preds = %safe_if_after_true.i.us - %y_load244_plus1.i.us = add i32 %y.0.i160.us, 1 - %exitcond = icmp eq i32 %y_load244_plus1.i.us, %y1 - br i1 %exitcond, label %for_exit31.i, label %for_loop37.i.lr.ph.us - -for_loop37.i.us: ; preds = %for_loop37.i.lr.ph.us, %safe_if_after_true.i.us - %xb.0.i158.us = phi i32 [ %x0, %for_loop37.i.lr.ph.us ], [ %add_xb_load243_calltmp241.i.us, %safe_if_after_true.i.us ] - %tid.i.i.i.us = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2 - %tid.i.i.i.i.us = tail call i32 @llvm.nvvm.read.ptx.sreg.warpsize() #2 - %sub_calltmp3_.i.i.us = add i32 %tid.i.i.i.i.us, -1 - %bitop.i.i.us = and i32 %sub_calltmp3_.i.i.us, %tid.i.i.i.us - %add_xb_load42_calltmp.i.us = add i32 %bitop.i.i.us, %xb.0.i158.us - %add_xb_load42_calltmp_broadcast_init.i.us = insertelement <1 x i32> undef, i32 %add_xb_load42_calltmp.i.us, i32 0 - %less_x_load198_x1_load199_broadcast.i.us = icmp slt <1 x i32> %add_xb_load42_calltmp_broadcast_init.i.us, %x1_load199_broadcast_init.i - %v.i.i.us = extractelement <1 x i1> %less_x_load198_x1_load199_broadcast.i.us, i32 0 - br i1 %v.i.i.us, label %pl_dolane.i.us, label %safe_if_after_true.i.us - -pl_dolane.i.us: ; preds = %for_loop37.i.us - %.lhs.lhs.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.i.us, %add_xb_load42_calltmp.i.us - %.lhs.us = shl i32 %.lhs.lhs.us, 3 - %1 = add i32 %.lhs.us, -8 - %iptr__id.i.rhs.us = sext i32 %1 to i64 - %iptr__id.i.us = add i64 %iptr__id.i.rhs.us, %Ain_load65_ptr2int.i - %ptr__id.i.us = inttoptr i64 %iptr__id.i.us to double* - %val__id.i.us = load double* %ptr__id.i.us, align 8 - %2 = add i32 %.lhs.us, 8 - %iptr__id.i130.rhs.us = sext i32 %2 to i64 - %iptr__id.i130.us = add i64 %iptr__id.i130.rhs.us, %Ain_load65_ptr2int.i - %ptr__id.i131.us = inttoptr i64 %iptr__id.i130.us to double* - %val__id.i132.us = load double* %ptr__id.i131.us, align 8 - %3 = add i32 %.lhs.us, -16 - %iptr__id.i125.rhs.us = sext i32 %3 to i64 - %iptr__id.i125.us = add i64 %iptr__id.i125.rhs.us, %Ain_load65_ptr2int.i - %ptr__id.i126.us = inttoptr i64 %iptr__id.i125.us to double* - %val__id.i127.us = load double* %ptr__id.i126.us, align 8 - %4 = add i32 %.lhs.us, 16 - %iptr__id.i120.rhs.us = sext i32 %4 to i64 - %iptr__id.i120.us = add i64 %iptr__id.i120.rhs.us, %Ain_load65_ptr2int.i - %ptr__id.i121.us = inttoptr i64 %iptr__id.i120.us to double* - %val__id.i122.us = load double* %ptr__id.i121.us, align 8 - %.lhs138.us = add i32 %.lhs138.lhs.us, %add_xb_load42_calltmp.i.us - %5 = shl i32 %.lhs138.us, 3 - %iptr__id.i115.rhs.us = sext i32 %5 to i64 - %iptr__id.i115.us = add i64 %iptr__id.i115.rhs.us, %Ain_load65_ptr2int.i - %ptr__id.i116.us = inttoptr i64 %iptr__id.i115.us to double* - %val__id.i117.us = load double* %ptr__id.i116.us, align 8 - %6 = add i32 %.lhs.us, -24 - %iptr__id.i110.rhs.us = sext i32 %6 to i64 - %iptr__id.i110.us = add i64 %iptr__id.i110.rhs.us, %Ain_load65_ptr2int.i - %ptr__id.i111.us = inttoptr i64 %iptr__id.i110.us to double* - %val__id.i112.us = load double* %ptr__id.i111.us, align 8 - %7 = add i32 %.lhs.us, 24 - %iptr__id.i105.rhs.us = sext i32 %7 to i64 - %iptr__id.i105.us = add i64 %iptr__id.i105.rhs.us, %Ain_load65_ptr2int.i - %ptr__id.i106.us = inttoptr i64 %iptr__id.i105.us to double* - %val__id.i107.us = load double* %ptr__id.i106.us, align 8 - %.lhs141.us = add i32 %.lhs141.lhs.us, %add_xb_load42_calltmp.i.us - %8 = shl i32 %.lhs141.us, 3 - %iptr__id.i100.rhs.us = sext i32 %8 to i64 - %iptr__id.i100.us = add i64 %iptr__id.i100.rhs.us, %Ain_load65_ptr2int.i - %ptr__id.i101.us = inttoptr i64 %iptr__id.i100.us to double* - %val__id.i102.us = load double* %ptr__id.i101.us, align 8 - %.lhs142.us = add i32 %.lhs142.lhs.us, %add_xb_load42_calltmp.i.us - %9 = shl i32 %.lhs142.us, 3 - %iptr__id.i95.rhs.us = sext i32 %9 to i64 - %iptr__id.i95.us = add i64 %iptr__id.i95.rhs.us, %Ain_load65_ptr2int.i - %ptr__id.i96.us = inttoptr i64 %iptr__id.i95.us to double* - %val__id.i97.us = load double* %ptr__id.i96.us, align 8 - %.lhs143.us = add i32 %.lhs143.lhs.us, %add_xb_load42_calltmp.i.us - %10 = shl i32 %.lhs143.us, 3 - %iptr__id.i90.rhs.us = sext i32 %10 to i64 - %iptr__id.i90.us = add i64 %iptr__id.i90.rhs.us, %Ain_load65_ptr2int.i - %ptr__id.i91.us = inttoptr i64 %iptr__id.i90.us to double* - %val__id.i92.us = load double* %ptr__id.i91.us, align 8 - %.lhs144.us = add i32 %.lhs144.lhs.us, %add_xb_load42_calltmp.i.us - %11 = shl i32 %.lhs144.us, 3 - %iptr__id.i85.rhs.us = sext i32 %11 to i64 - %iptr__id.i85.us = add i64 %iptr__id.i85.rhs.us, %Ain_load65_ptr2int.i - %ptr__id.i86.us = inttoptr i64 %iptr__id.i85.us to double* - %val__id.i87.us = load double* %ptr__id.i86.us, align 8 - %.lhs145.us = add i32 %.lhs145.lhs.us, %add_xb_load42_calltmp.i.us - %12 = shl i32 %.lhs145.us, 3 - %iptr__id.i80.rhs.us = sext i32 %12 to i64 - %iptr__id.i80.us = add i64 %iptr__id.i80.rhs.us, %Ain_load65_ptr2int.i - %ptr__id.i81.us = inttoptr i64 %iptr__id.i80.us to double* - %val__id.i82.us = load double* %ptr__id.i81.us, align 8 - %.lhs146.us = add i32 %.lhs146.lhs.us, %add_xb_load42_calltmp.i.us - %13 = shl i32 %.lhs146.us, 3 - %iptr__id.i75.rhs.us = sext i32 %13 to i64 - %iptr__id.i75.us = add i64 %iptr__id.i75.rhs.us, %Ain_load65_ptr2int.i - %ptr__id.i76.us = inttoptr i64 %iptr__id.i75.us to double* - %val__id.i77.us = load double* %ptr__id.i76.us, align 8 - %.lhs147.us = add i32 %.lhs147.lhs.us, %add_xb_load42_calltmp.i.us - %14 = shl i32 %.lhs147.us, 3 - %iptr__id.i70.rhs.us = sext i32 %14 to i64 - %iptr__id.i70.us = add i64 %iptr__id.i70.rhs.us, %Ain_load65_ptr2int.i - %ptr__id.i71.us = inttoptr i64 %iptr__id.i70.us to double* - %val__id.i72.us = load double* %ptr__id.i71.us, align 8 - %.lhs148.us = add i32 %.lhs148.lhs.us, %add_xb_load42_calltmp.i.us - %15 = shl i32 %.lhs148.us, 3 - %iptr__id.i65.rhs.us = sext i32 %15 to i64 - %iptr__id.i65.us = add i64 %iptr__id.i65.rhs.us, %Ain_load65_ptr2int.i - %ptr__id.i66.us = inttoptr i64 %iptr__id.i65.us to double* - %val__id.i67.us = load double* %ptr__id.i66.us, align 8 - %.lhs149.us = add i32 %.lhs149.lhs.us, %add_xb_load42_calltmp.i.us - %16 = shl i32 %.lhs149.us, 3 - %iptr__id.i60.rhs.us = sext i32 %16 to i64 - %iptr__id.i60.us = add i64 %iptr__id.i60.rhs.us, %Ain_load65_ptr2int.i - %ptr__id.i61.us = inttoptr i64 %iptr__id.i60.us to double* - %val__id.i62.us = load double* %ptr__id.i61.us, align 8 - %.lhs150.us = add i32 %.lhs150.lhs.us, %add_xb_load42_calltmp.i.us - %17 = shl i32 %.lhs150.us, 3 - %iptr__id.i55.rhs.us = sext i32 %17 to i64 - %iptr__id.i55.us = add i64 %iptr__id.i55.rhs.us, %Ain_load65_ptr2int.i - %ptr__id.i56.us = inttoptr i64 %iptr__id.i55.us to double* - %val__id.i57.us = load double* %ptr__id.i56.us, align 8 - %.lhs151.us = add i32 %add_xb_load42_calltmp.i.us, %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.i.us - %18 = shl i32 %.lhs151.us, 3 - %iptr__id.i50.rhs.us = sext i32 %18 to i64 - %iptr__id.i50.us = add i64 %iptr__id.i50.rhs.us, %Ain_load65_ptr2int.i - %ptr__id.i51.us = inttoptr i64 %iptr__id.i50.us to double* - %val__id.i52.us = load double* %ptr__id.i51.us, align 8 - %.lhs152.us = add i32 %.lhs152.lhs.us, %add_xb_load42_calltmp.i.us - %19 = shl i32 %.lhs152.us, 3 - %iptr__id.i45.rhs.us = sext i32 %19 to i64 - %iptr__id.i45.us = add i64 %iptr__id.i45.rhs.us, %Ain_load65_ptr2int.i - %ptr__id.i46.us = inttoptr i64 %iptr__id.i45.us to double* - %val__id.i47.us = load double* %ptr__id.i46.us, align 8 - %val__id.i41.us = load double* %ptr__id.i51.us, align 8 - %iptr__id.i32.us = add i64 %iptr__id.i50.rhs.us, %Aout_load_ptr2int.i - %ptr__id.i33.us = inttoptr i64 %iptr__id.i32.us to double* - %val__id.i34.us = load double* %ptr__id.i33.us, align 8 - %iptr__id.i27.rhs.us = sext i32 %.lhs.us to i64 - %iptr__id.i27.us = add i64 %iptr__id.i27.rhs.us, %vsq_load_ptr2int.i - %ptr__id.i28.us = inttoptr i64 %iptr__id.i27.us to double* - %val__id.i29.us = load double* %ptr__id.i28.us, align 8 - %iptr__id.i23.us = add i64 %iptr__id.i50.rhs.us, %Aout_load_ptr2int.i - %ptr__id.i24.us = inttoptr i64 %iptr__id.i23.us to double* - %val__id.i25.lhs.us.lhs = fmul double %val__id.i41.us, 2.000000e+00 - %val__id.i25.lhs.us = fsub double %val__id.i25.lhs.us.lhs, %val__id.i34.us - %val__id.i25.rhs.rhs.lhs.lhs.rhs.lhs.lhs.lhs.lhs.us = fadd double %val__id.i127.us, %val__id.i122.us - %val__id.i25.rhs.rhs.lhs.lhs.rhs.lhs.lhs.lhs.us = fadd double %val__id.i25.rhs.rhs.lhs.lhs.rhs.lhs.lhs.lhs.lhs.us, %val__id.i102.us - %val__id.i25.rhs.rhs.lhs.lhs.rhs.lhs.lhs.us = fadd double %val__id.i25.rhs.rhs.lhs.lhs.rhs.lhs.lhs.lhs.us, %val__id.i87.us - %val__id.i25.rhs.rhs.lhs.lhs.rhs.lhs.us = fadd double %val__id.i25.rhs.rhs.lhs.lhs.rhs.lhs.lhs.us, %val__id.i72.us - %val__id.i25.rhs.rhs.lhs.lhs.rhs.us = fadd double %val__id.i25.rhs.rhs.lhs.lhs.rhs.lhs.us, %val__id.i57.us - %val__id.i25.rhs.rhs.lhs.lhs.us = fmul double %coef_load19_offset_load.i, %val__id.i25.rhs.rhs.lhs.lhs.rhs.us - %val__id.i25.rhs.rhs.lhs.rhs.lhs.rhs.lhs.lhs.lhs.lhs.us = fadd double %val__id.i.us, %val__id.i132.us - %val__id.i25.rhs.rhs.lhs.rhs.lhs.rhs.lhs.lhs.lhs.us = fadd double %val__id.i25.rhs.rhs.lhs.rhs.lhs.rhs.lhs.lhs.lhs.lhs.us, %val__id.i117.us - %val__id.i25.rhs.rhs.lhs.rhs.lhs.rhs.lhs.lhs.us = fadd double %val__id.i25.rhs.rhs.lhs.rhs.lhs.rhs.lhs.lhs.lhs.us, %val__id.i97.us - %val__id.i25.rhs.rhs.lhs.rhs.lhs.rhs.lhs.us = fadd double %val__id.i25.rhs.rhs.lhs.rhs.lhs.rhs.lhs.lhs.us, %val__id.i82.us - %val__id.i25.rhs.rhs.lhs.rhs.lhs.rhs.us = fadd double %val__id.i25.rhs.rhs.lhs.rhs.lhs.rhs.lhs.us, %val__id.i67.us - %val__id.i25.rhs.rhs.lhs.rhs.lhs.us = fmul double %coef_load16_offset_load.i, %val__id.i25.rhs.rhs.lhs.rhs.lhs.rhs.us - %val__id.i25.rhs.rhs.lhs.rhs.rhs.us = fmul double %coef_load_offset_load.i, %val__id.i52.us - %val__id.i25.rhs.rhs.lhs.rhs.us = fadd double %val__id.i25.rhs.rhs.lhs.rhs.lhs.us, %val__id.i25.rhs.rhs.lhs.rhs.rhs.us - %val__id.i25.rhs.rhs.lhs.us = fadd double %val__id.i25.rhs.rhs.lhs.lhs.us, %val__id.i25.rhs.rhs.lhs.rhs.us - %val__id.i25.rhs.rhs.rhs.rhs.lhs.lhs.lhs.lhs.us = fadd double %val__id.i112.us, %val__id.i107.us - %val__id.i25.rhs.rhs.rhs.rhs.lhs.lhs.lhs.us = fadd double %val__id.i25.rhs.rhs.rhs.rhs.lhs.lhs.lhs.lhs.us, %val__id.i92.us - %val__id.i25.rhs.rhs.rhs.rhs.lhs.lhs.us = fadd double %val__id.i25.rhs.rhs.rhs.rhs.lhs.lhs.lhs.us, %val__id.i77.us - %val__id.i25.rhs.rhs.rhs.rhs.lhs.us = fadd double %val__id.i25.rhs.rhs.rhs.rhs.lhs.lhs.us, %val__id.i62.us - %val__id.i25.rhs.rhs.rhs.rhs.us = fadd double %val__id.i25.rhs.rhs.rhs.rhs.lhs.us, %val__id.i47.us - %val__id.i25.rhs.rhs.rhs.us = fmul double %coef_load22_offset_load.i, %val__id.i25.rhs.rhs.rhs.rhs.us - %val__id.i25.rhs.rhs.us = fadd double %val__id.i25.rhs.rhs.lhs.us, %val__id.i25.rhs.rhs.rhs.us - %val__id.i25.rhs.us = fmul double %val__id.i25.rhs.rhs.us, %val__id.i29.us - %val__id.i25.us = fadd double %val__id.i25.lhs.us, %val__id.i25.rhs.us - store double %val__id.i25.us, double* %ptr__id.i24.us, align 8 - br label %safe_if_after_true.i.us - -safe_if_after_true.i.us: ; preds = %pl_dolane.i.us, %for_loop37.i.us - %tid.i.i1.i.us = tail call i32 @llvm.nvvm.read.ptx.sreg.warpsize() #2 - %add_xb_load243_calltmp241.i.us = add i32 %tid.i.i1.i.us, %xb.0.i158.us - %less_xb_load_x1_load.i.us = icmp slt i32 %add_xb_load243_calltmp241.i.us, %x1 - br i1 %less_xb_load_x1_load.i.us, label %for_loop37.i.us, label %for_exit38.i.us - -for_loop37.i.lr.ph.us: ; preds = %for_exit38.i.us, %for_test35.i.preheader.lr.ph - %y.0.i160.us = phi i32 [ %y_load244_plus1.i.us, %for_exit38.i.us ], [ %y0, %for_test35.i.preheader.lr.ph ] - %mul_y_load46_Nx_load47.i.us = mul i32 %y.0.i160.us, %Nx - %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.i.us = add i32 %mul_y_load46_Nx_load47.i.us, %mul_z_load45_Nxy_load.i - %.lhs138.lhs.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.i.us, %Nx - %.lhs141.lhs.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.i.us, %mul__Nx_load119.i - %.lhs142.lhs.us = sub i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.i.us, %Nx - %.lhs143.lhs.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.i.us, %mul__Nx_load167.i - %.lhs144.lhs.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.i.us, %mul__Nx_load127.i - %.lhs145.lhs.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.i.us, %mul_Nx_load_Ny_load.i - %.lhs146.lhs.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.i.us, %mul__Nx_load175.i - %.lhs147.lhs.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.i.us, %mul__Nxy_load136.i - %.lhs148.lhs.us = sub i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.i.us, %mul_Nx_load_Ny_load.i - %.lhs149.lhs.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.i.us, %mul__Nxy_load184.i - %.lhs150.lhs.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.i.us, %mul__Nxy_load144.i - %.lhs152.lhs.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.i.us, %mul__Nxy_load192.i - br label %for_loop37.i.us - -for_exit31.i: ; preds = %for_exit38.i.us, %for_test35.i.preheader.lr.ph, %for_test28.i.preheader - %z_load245_plus1.i = add i32 %z.0.i162, 1 - %exitcond163 = icmp eq i32 %z.0.i162, %0 - br i1 %exitcond163, label %stencil_step___uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_.exit, label %for_test28.i.preheader - -stencil_step___uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_.exit: ; preds = %for_exit31.i, %allocas - ret void -} - -attributes #0 = { nounwind readnone } -attributes #1 = { nounwind "target-features"="+sm_35" } -attributes #2 = { nounwind } - -!nvvm.annotations = !{!0} - -!0 = metadata !{void (i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*)* @stencil_step_task, metadata !"kernel", i32 1} -!1 = metadata !{ } -!2 = metadata !{ metadata !"output", metadata !0 } -!3 = metadata !{ metadata !"input1", metadata !0 } -!4 = metadata !{ metadata !"input2", metadata !0 } diff --git a/examples/stencil/stencil_ispc.h b/examples/stencil/stencil_ispc.h deleted file mode 100644 index ebf29582..00000000 --- a/examples/stencil/stencil_ispc.h +++ /dev/null @@ -1,35 +0,0 @@ -// -// stencil_ispc.h -// (Header automatically generated by the ispc compiler.) -// DO NOT EDIT THIS FILE. -// - -#ifndef ISPC_STENCIL_ISPC_H -#define ISPC_STENCIL_ISPC_H - -#include - - - -#ifdef __cplusplus -namespace ispc { /* namespace */ -#endif // __cplusplus - -/////////////////////////////////////////////////////////////////////////// -// Functions exported from ispc code -/////////////////////////////////////////////////////////////////////////// -#if defined(__cplusplus) && !defined(__ISPC_NO_EXTERN_C) -extern "C" { -#endif // __cplusplus - extern void loop_stencil_ispc(int32_t t0, int32_t t1, int32_t x0, int32_t x1, int32_t y0, int32_t y1, int32_t z0, int32_t z1, int32_t Nx, int32_t Ny, int32_t Nz, const double * coef, const double * vsq, double * Aeven, double * Aodd); - extern void loop_stencil_ispc_tasks(int32_t t0, int32_t t1, int32_t x0, int32_t x1, int32_t y0, int32_t y1, int32_t z0, int32_t z1, int32_t Nx, int32_t Ny, int32_t Nz, const double * coef, const double * vsq, double * Aeven, double * Aodd); -#if defined(__cplusplus) && !defined(__ISPC_NO_EXTERN_C) -} /* end extern C */ -#endif // __cplusplus - - -#ifdef __cplusplus -} /* namespace */ -#endif // __cplusplus - -#endif // ISPC_STENCIL_ISPC_H diff --git a/examples/stencil/stencil_nvptx64.bc b/examples/stencil/stencil_nvptx64.bc deleted file mode 100644 index b77be1e37199eec8aefe33226d8f35d06caeff37..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 8500 zcma)C30PCdzCS0sA%p~pK-ofRt5TF;0imb_SsGi=h|y{R8xXzreGLX>Q7W3S`-%^= z*5y*4vDJQfuUcx^kxK-mw19%3-~$8%Q7KUt8f+gOE(KO!MOpdiZfnss&8&wSb`sX$ppu;=U?=F<-~n`ZiP=dK_D&reJb>;j3d-4U zm9M-tn?*`#Hj9+fY&I#gSTSVA_}$EUg0TTt0iGoRLJb0Ji)b8wMG*~@J6a3AnBcR- z{y4{47@V6u=baDg82%~H?t6fK0UQwjY$qvAx>0%F-@885=Lp>=Thn#L{?Tc>F0pOb z6}vlH`)aNIjXt}sNc-;@#JWY`_dSlrwzki9n+3W&%ik-lCVQoZy=OOj*YQ~1AY%4d zz#r4_-{Y;cufAe;?X>MS2@{)^YSTzxUhls=o4&kpcXphb{RIGpEK{v!C#mTPyEE0| zOzU4F`49E{d%gU=X8t|N7QaSEyMaji2Yr?uj&@yul=}hiGpWk}ofp+!m+3xj{ywEO zyuipz^>KF6GiHK}owy*=y0MzyCovlvG3g)SKa!h0?&SlXW5odb;FmSB=%cI8Le{(O z3qVp0!N*(!)A~&2YIrZVqKDjt2Q@BL?!xQ7=wXPtNU&xv8{m);;vxWjF8DA4B{Vs6 ztDK!KpTk@$OkZnVyH)B#mxj{l(UK?$&(Erj#gc20qSz%} za?W;1gFte{wkB%hE?%}(c#Y{6F>AY|OUc@*VeR7OSa09OYyPpxEg)KNXQk#lW^3Kk zq^!`>TAT2*sFloHtsc+P@*K}-H>f16&=~MpXDwU1Rb0aQ=05A2OtwqWO|;RRM;Gy8 zFCS2&NXwN4eeJ-r*!fca6WUR(OR4TiieHD-7OzP#8d;XgwaAy6U8MPFOg1_{>rDye z23aQ|mIVqaOYOIixrMRv4qUCLIZM(jG-qiZ8j}wfS~&?OUm6U)h^-ubw~Of_(xc(> z53g8hdm&L&K-bYWl>&MNbhw)vP52ZH(={w9vs4X8>)DWW>hDSm>n>WjK2*Cv&pd09 z{b6bZWWtL{EEIzX({S(fEmodW&VxMHacZxDVKeDMMATLj zp%3+2jFA*_3c#>~W3O5q}8 zGvcf~0A=1y$Q%!beUZC}LOeGS{fIe%SRM(L1c}FhD)uAIXg)F@YWNvOGF$p$8cat@ z>2mEFsG=+eRFOE*N@Vssw(z3eM&}1?DC=6z8W}T4;oFtk=95oX$oX>AOUkT3L>)8> zUqo=$kaPEAMW zDJ2bh9++CaIKWy=7iqc#k($OOwg)1PNGt&_TR487D|9yU>X-mi9U7Pxq=>YXXGBQt z643MxrXti*{$Yf4Q9OFJjoE0*-Jyw0|Cr!G|{7a0Iw_iAyRm> zLv9&5wlY+EJe;|V20*HQ69XyF(c;>f6!$W2Ed%VT55#v}>0G*s=}Hrg%~6Qw!(BaT zIG0gZRy!96^K}XxZ8_X^M2PR2YqBrYFGtF}Koe#C3U<|=!0_porj~rot!Fsri}I!G zA>_Ik5XHHF&vr|T+x;~+>YlSqR0@FWqlM18g)^djSJeCK`VnWSn`C0wk<%YL*U!z@ zMCek~aMuMQd{@N$UMQL`WscEAXBwa@{S)7nab){3?rNrUuVub?Gu-vRm~hu@M}0u0 zqSslD{vu^g?S!8CF0KT0{_kf~zSeT3gFp$Ia9j!1YjT5`OX#9+8lf70AOyO!@dWpi znbT-twZTkZx~LgWLo2|vz_Zzk-hiv=pU4g}*GQQjsHl{oP@Bt9nY1!c9M6zHmzv4_ ze5A}lMD&8D(3r^Q3uSg_n@a&rA?M1qz;6-}5UAQ*p+|}KhHgYsg<%rpCk7{jjv^=v zK{kBLY1}osVrS3g2osl=J7o!{o;Nt&wjT{{F*v2>bd;(xXpQd$8gh6mLWjyW1=i!w zQNu+Jc87oCe=aI`;k)En*1_j3j#Fg2{b|~=bz_G5UZ;r*VB9(m*?=}cYHeQ(e~>wV zwdn8Z4Fsv44L=jpJ^z)rSO2yfLol78Yi9Hm`el{vjgVm=&^%Yfkyw5aax(&Si*eIYs^z;7{5RRyk`kAx zUZSML!xO?ITK|xMv8C6>oSN2zuVcto-1Qa43iQ%|m*!K~BY&d*O-@n+3IVOk7JlOo z;91BGAXFAQR0>AVxe>ga96st@K*(L9@(}tjGVI@MxQozZT31AQnPc6KDlfgJzbt3t zT7u={1Nx`9VY(_mBbl#H4tuQs_%FD)VBEj666_z#BKe7GhbBBk@^hgx!$wc=7<~K=0kQ@XYT%*4(Rv=qYK3aSf9scdKX#ft zm}fx_K94D&_@AsWCHau;>--7ZaIzd)iQG{3Kb>1N?|23L=E|3}Eqmz`=qPLjM(B0A zRZC)Yhr>#Rmk{Or>>xCEyA32gkIBHpj)nbXO?q# zJ#m^$4oj5&arA;s6$$ePMNKm>Pz-F|VnV|HTG(Exg&=*daKGQ8hBfZM-$O~)_52iQ z`*t|hg}xKvXO#SAWzNP5eLpbSWHFE&v}9WpxM^eqqv`SW~Rcu7y323 zaF)&^+@Yg5mQ+3Gdft5`m9SQE%t1Pc1c#eOfetq;24xr)J1(mZp+a(L!s8Dzqr1ty zt0xETkU9LD!2!|t_W>xK4EqToklZ$=sz#NBzf-Y{j6czZ^ewN61zXuiQ2#~yCjhWI z_D-LzkBvk%dEG;zp^sN<9@T}5k2S!Ad33K7x-%R#`4o}vi|z6YQ?2l6FEc-c9c z-~v^7dm3uyCMnL1p$#s2LhAUCupcN{#$&z&B`O|F+NL1s8>k#tf9NnIJ;e{69o((| zfdjn==+ki##AG+XT!lfab!^xoJs@~RR0GYF+$2uL^MBjV)1_IA zhd^1#=!Q2)-aMP~E>OIA#{3fv#(9ZqEjfufKq7R26TQ@CWvbt4r1b|XA9LNwFxy^s zn#xl5%Yh(eTnR3SEV+Hcn%poU*(MgTeSK&F(FL?k+epSbJa)Wi40Gzq(wyVx$aif^ zcRZySdQkr3XDR*`z`OmEd#cu1n7cs-UlEmogK-il@8}z(98(DS;X=iT~F+$2X24LE%*GUlHin13D`V^ez>=!Y2l_X|Nko<$lKD8W_!*GRm z!a_o11ou#Biw9YC_m`V4B%jMD=|)Swb|Zwl(DS{68q1t9MaK7+Z{wx?r?7^wBo*}j zj!OE+M3n8PAQTi2m&dCL=wY+p0scW_CBWC?s%@E|zK-Zmco&Nm$-}@U60xD-;yMC- z3D?4UpqAS&e+Gq3RM~t?rm|vKP23J6Wlz|gr^EG9ZpwlNN$d{WPldSg&3UWWuWE*A zWq866eCVM2S2-IW{n+U&vm0)#ZU)bkl6m46+<1-3| zWiji>L^OOkl8aSh$v}&nqA-3t3tQN^Ag%ji+5E-ysdm7na`c%)0AIqgcP*X0d4us*?$Ar;1&R8AI% z9ThM!ISg!};yKAOHXX<7n$p(TYKWro4~yjKJBc)2Z~SP1N&`C@qyAveBhK#AwE!! z;se)hF5&}~n~bj14qeG^IUtFZ!O#;g;j=2rC?CHMef%81@%26qXy7fu8Ox1v#Hr_3 zBRb+BT3m_hTHQZQRM!{(?8pIB0ColKAwT<-jVX5o`)yl{h3;vvSseWn%xs1L*;f(I zvkU>Xeh6K7`=MSj;|LGP#%(}Po5c}q8UCL^I!SvhS!DwDKcL3$abn9xa}m>br6 z`5wZS9(t|VeVYH{~`r44+dhV?8Ff>iu z-n1}0o>t5rRIGbZGWi_HwE+)g9`OA1kUC~PDgm1G<4Xzn?HmOpJx5#sJ; z=7|tktCqSqYpwi^P2ReMVi#uzja6apz>^ z6iffQq6;V(Yd4r0zdy0=)JFrpnbup0nskR@^;%i&7-ocX*m?B!;gX^##2nt5#pd&XM?<2zfwN%yN)PJS+1a>xo;* z4tOsI8qgN_9%nWGCt8KGBiUZqJ>^tkPFLcNiW$de5FJrl{UtA&*!m%rT8g2Xw0P(o z0`eZqT`qNZ)IW6!@elAVaH#_Ki~~*^Z1sPl8cJwfO`hWLLLuj4)ddP-J%y(!4~%Ms zQ74i`$ek52yuq22d%QC2!+EdX&P+z+{f#tiN0DkAKOw-{AGCx&o;AQI#n84zSo}{W z%K2l|v9b#=abuCo^`~anXzpn_9YcR|MTu)%#IaJbXd61CDISVdtK%H$-GkUEHBz7%ku&KDE96h{~S$Upx|P{!X;8U@`9pMuvHeIVOx zI!nshwnvpo0luf0Nf|zQ_|^i_@bTC3R8q41_JDVL)_aka;q`BMVgF?S{`D3?{vV_x B|2zNy diff --git a/examples_cuda/common.mk b/examples_cuda/common.mk index c11f22dc..b5100169 100644 --- a/examples_cuda/common.mk +++ b/examples_cuda/common.mk @@ -10,7 +10,8 @@ CCFLAGS+=-Iobjs/ -O2 LIBS=-lm $(TASK_LIB) -lstdc++ ISPC=ispc -ISPC_FLAGS+=-O2 --opt=fast-math --math-lib=default +ISPC_FLAGS+=-O2 +ISPC_FLAGS+=--opt=fast-math --math-lib=default ISPC_HEADER=objs/$(ISPC_SRC:.ispc=_ispc.h) ARCH:=$(shell uname -m | sed -e s/x86_64/x86/ -e s/i686/x86/ -e s/arm.*/arm/ -e s/sa110/arm/) diff --git a/examples_cuda/stencil/.stencil.ispc.swn b/examples_cuda/stencil/.stencil.ispc.swn new file mode 100644 index 0000000000000000000000000000000000000000..ad3f6c7805836bce454e0ea3c69a06f27d7ae0cb GIT binary patch literal 16384 zcmeHOU2No56?PX|3bYh02&obehus#f5=GAV&c<4g zJ=mUPCbZ=R34v4|SSnHe9*__(JRk%Tg%>0qkXj)j0pg(`fdHxei}D0&f$v<~i6@!e zY`ZTA*~&K_U!QxtMPk4^Q zw{CS@7Xw!Wm2epJLMyN;o;zLo=~Urd==Y*ICmeLkUEdb++ng9#edq~yxLF%~o>0>%{p@$@Rm@`H2(vlgs>+GmtZoGmtZoGmtZoGmtZo zGmtZoGmtZoGmtazKgfV(E6Tg!*XMY_k3Ii%{r}nnit-cSdEmQ12+RN%feXN|-k~VZ z0iOq2z&*hC?^l%X0b`&7{OvwP`77`j;H$tE@E+iWw=2riz$3ul-=-*E2X=r5fnU5; zQGN#e6!-yf1Ly&d0k7VxDBl4>paHA{=YjixU%y3Bz6@*vGr&LYQIsD7p9QW0v%tUZ zMj3De=mNU{0rvoR1OIumqPzyY3j7Ip33w5B0eBwxB+vma0>42*K9l91uXo>ocS2(Pe z=&n6q$ zp(Pqm7^^16wr*l4sX5OI1`I8re`Au^wc#CncD(=LoV)b@wFt{~O%u5Oo(x;HKL zhtbvTB7F>&P99U$bbZmg`hkq12_yUAM`7d^x;UyDqhmMZ_4vf~;aM6E=O=3*G|#X( zK2Zblr6fQ1*}-|R!xR_N7PD8Q6EzSds9{A;SN0g~`;LuaX^WoYi8!$7(TVG=6W5!x zh+jNCB|UwL$3T2AXATdLZG@5S?5A;~?s#Gp72pQ{6)_4IDjaDleykK67ne~mDL6~V z%GpG=!t7!ZV=`M(UM$LJPrS*>&n`;C8ToQjqNLI?92VL0LabEQnW!e~#3e`7xO&!! z*@aVXEfi5Sy~6CmgaOb~PD+mCrC$qKzgS|Tnyhn{U)587Rd3-J<`HyMman8_(ywaP zFP507Rz5CCswHC;v4J3vEt_WzSpwPYQSc=JfnkP`byy^o2e}L}_*NLPhlRqp2o;Uw z0z!N-E^cR(aF`wI!1QsuDo7qYh~gScR5(oZv+41X;ZYpx@#KT2jQ(K|t zWooZ!)M}chzO2{v_7)eJo!T0$sO##gRv~`cQA1m;>#JI$rcu+NCh8a)P=FvdbW?Av zlHMS-u|*py6m;6D-i`@*HldrWrLL>7krHjJX{?0n)tZgAq1M_ZGTVk;YiGq^tlc!) zRL~o>dS?ZeSWnAnUe?>@CH7^x1N)k3LQ`8aj83bqHycG-Yi^*e2Gvyfuma!E*4Rau zZ5rInL{qI}XzN--`fPTV&9>g|v^837HdjnGU~0w{y{5rh9V~Hkps@sc+G+{+S&O!* zt+(o`vBd@49#i{p2P$>7p3rD!T2O+|ZE7^Rq3e1Z-nET0P*Td#HC6ty}A`d33&X;5I+>Z-o3tyGx$ zMpH&ao6I$}UYDUzYqquweRU0dY&104Jh&%s>1d=5cM(d4iq>h8jdjhat>H51T1ho} zNA`*In;Fv03-SgxLfdQ^++k?bX^q^rq^D2u#ELkSj^Y_=ITo&fffZWbI^q9Xv%aE% zM(%qVV=1o}R1>e-P=+Jgb3)?ZARkzPpaUn`!>ml{JtCo&h9YsgOK{xn05sXLt#_olkjYm3W9<`;P?GV@Yx=AqmXP7IQtOm z;f7R&`B1>P2;h+*&+4OIk902*F4g>C;0Jie@jddOEtLpa#EHVREw%VU~DiBoNIeg z+PXV&DekTrI_V4=75c-V%jRrBi8GNEw}7U2(ihR=Xj$L4IX)!QCF0PLv4JSDj~R~k ztmp(wSqcNDXwoCeE8>S)N*rURGlYeuxZ^k~<6vceRM#a*RtPRb5g@d<$+eV2Ds+pq zFkhW7;{#9YQ*out%_zM7e+=vHKLEV`e;6k|euOptSAZ`8Uj)7YJO?}jw1M{mzr&jU z8-NXzfq!Dn|1IE~z-NGGfL-7tzJSo{AN_%xsc4*?GXKSv(m2G9pw;344O@aGla z&%hr6_U~mNKXL|g266^+266^+266^+266^&kAXDVOY#WEUN=oe&CL%_2sez^bg9Oy*X(cX{Djwc z==2@?%)2y&h~7^~NiHoR3#`P>qXW@V)nuhghlFm%!a41|ec#1GI@8-N{4cZX3fDOl zWOV~S4AEy;X^|DufpFc&N#;nov#~5g7##S)b>c-hg)SD-((F+_l_pYo5fY-&{3VL$ zgA`RS(d=x*`O|a@1N}29c}27XDdO3JS1ebHb2t^1SYO&3*>jp!Om~yGcc$(#**12W zq;uKo>GxF4L^}1Rdr5JvsdbK0uxFZjLn+tPUz}6(tmn+^>rLuSOT=bGC%$IwP1~As zeSg3^S^N*|JN%F9az`ti3;>MVteyyCSP>(ne4-?an{H1b51Sd=+41Md(Kn3b=u7S4 dJSfDQqb0esig`lrr; - - - ld.param.u32 %r2, [__shfl_i32_param_0]; - ld.param.u32 %r3, [__shfl_i32_param_1]; - // inline asm - shfl.idx.b32 %r1, %r2, %r3, 0x1f; - // inline asm - st.param.b32 [func_retval0+0], %r1; - ret; -} - -.visible .func (.param .b32 func_retval0) __shfl_xor_float( - .param .b32 __shfl_xor_float_param_0, - .param .b32 __shfl_xor_float_param_1 -) -{ - .reg .s32 %r<2>; - .reg .f32 %f<3>; - - - ld.param.f32 %f2, [__shfl_xor_float_param_0]; - ld.param.u32 %r1, [__shfl_xor_float_param_1]; - // inline asm - shfl.bfly.b32 %f1, %f2, %r1, 0x1f; - // inline asm - st.param.f32 [func_retval0+0], %f1; - ret; -} - -.visible .func (.param .b32 func_retval0) __shfl_xor_i32( - .param .b32 __shfl_xor_i32_param_0, - .param .b32 __shfl_xor_i32_param_1 -) -{ - .reg .s32 %r<4>; - - - ld.param.u32 %r2, [__shfl_xor_i32_param_0]; - ld.param.u32 %r3, [__shfl_xor_i32_param_1]; - // inline asm - shfl.bfly.b32 %r1, %r2, %r3, 0x1f; - // inline asm - st.param.b32 [func_retval0+0], %r1; - ret; -} - -.visible .func (.param .b32 func_retval0) __fminf( - .param .b32 __fminf_param_0, - .param .b32 __fminf_param_1 -) -{ - .reg .f32 %f<4>; - - - ld.param.f32 %f2, [__fminf_param_0]; - ld.param.f32 %f3, [__fminf_param_1]; - // inline asm - min.f32 %f1, %f2, %f3; - // inline asm - st.param.f32 [func_retval0+0], %f1; - ret; -} - -.visible .func (.param .b32 func_retval0) __fmaxf( - .param .b32 __fmaxf_param_0, - .param .b32 __fmaxf_param_1 -) -{ - .reg .f32 %f<4>; - - - ld.param.f32 %f2, [__fmaxf_param_0]; - ld.param.f32 %f3, [__fmaxf_param_1]; - // inline asm - max.f32 %f1, %f2, %f3; - // inline asm - st.param.f32 [func_retval0+0], %f1; - ret; -} - -.visible .func (.param .b32 func_retval0) __ballot( - .param .b32 __ballot_param_0 -) -{ - .reg .s32 %r<3>; - - - ld.param.u8 %r2, [__ballot_param_0]; - // inline asm - { .reg .pred %p1; - setp.ne.u32 %p1, %r2, 0; - vote.ballot.b32 %r1, %p1; - } - // inline asm - st.param.b32 [func_retval0+0], %r1; - ret; -} - -.visible .func (.param .b32 func_retval0) __lanemask_lt( - -) -{ - .reg .s32 %r<2>; - - - // inline asm - mov.u32 %r1, %lanemask_lt; - // inline asm - st.param.b32 [func_retval0+0], %r1; - ret; -} - -.visible .func (.param .b64 func_retval0) ISPCAlloc( - .param .b64 ISPCAlloc_param_0, - .param .b64 ISPCAlloc_param_1, - .param .b32 ISPCAlloc_param_2 -) -{ - .reg .s64 %rd<2>; - - - mov.u64 %rd1, 1; - st.param.b64 [func_retval0+0], %rd1; - ret; -} - -.visible .func (.param .b64 func_retval0) ISPCGetParamBuffer( - .param .b64 ISPCGetParamBuffer_param_0, - .param .b64 ISPCGetParamBuffer_param_1, - .param .b64 ISPCGetParamBuffer_param_2 -) -{ - .reg .pred %p<2>; - .reg .s32 %r<3>; - .reg .s64 %rd<7>; - - - ld.param.u64 %rd3, [ISPCGetParamBuffer_param_1]; - ld.param.u64 %rd4, [ISPCGetParamBuffer_param_2]; - mov.u32 %r1, %tid.x; - and.b32 %r2, %r1, 31; - setp.ne.s32 %p1, %r2, 0; - mov.u64 %rd6, 0; - @%p1 bra BB8_2; - - // Callseq Start 0 - { - .reg .b32 temp_param_reg; - .param .b64 param0; - st.param.b64 [param0+0], %rd3; - .param .b64 param1; - st.param.b64 [param1+0], %rd4; - .param .b64 retval0; - call.uni (retval0), - cudaGetParameterBuffer, - ( - param0, - param1 - ); - ld.param.b64 %rd6, [retval0+0]; - } - // Callseq End 0 - -BB8_2: - st.param.b64 [func_retval0+0], %rd6; - ret; -} - -.visible .func ISPCLaunch( - .param .b64 ISPCLaunch_param_0, - .param .b64 ISPCLaunch_param_1, - .param .b64 ISPCLaunch_param_2, - .param .b32 ISPCLaunch_param_3, - .param .b32 ISPCLaunch_param_4, - .param .b32 ISPCLaunch_param_5 -) -{ - .reg .pred %p<2>; - .reg .s32 %r<16>; - .reg .s64 %rd<6>; - - - ld.param.u64 %rd1, [ISPCLaunch_param_1]; - ld.param.u64 %rd2, [ISPCLaunch_param_2]; - ld.param.u32 %r1, [ISPCLaunch_param_3]; - ld.param.u32 %r2, [ISPCLaunch_param_4]; - ld.param.u32 %r3, [ISPCLaunch_param_5]; - mov.u32 %r4, %tid.x; - and.b32 %r5, %r4, 31; - setp.ne.s32 %p1, %r5, 0; - @%p1 bra BB9_2; - - add.s32 %r14, %r1, -1; - shr.s32 %r15, %r14, 2; - add.s32 %r7, %r15, 1; - mov.u32 %r12, 1; - mov.u32 %r10, 128; - mov.u32 %r13, 0; - mov.u64 %rd5, 0; - // inline asm - { - .param .b64 param0; - st.param.b64 [param0+0], %rd1; - .param .b64 param1; - st.param.b64 [param1+0], %rd2; - .param .align 4 .b8 param2[12]; - st.param.b32 [param2+0], %r7; - st.param.b32 [param2+4], %r2; - st.param.b32 [param2+8], %r3; - .param .align 4 .b8 param3[12]; - st.param.b32 [param3+0], %r10; - st.param.b32 [param3+4], %r12; - st.param.b32 [param3+8], %r12; - .param .b32 param4; - st.param.b32 [param4+0], %r13; - .param .b64 param5; - st.param.b64 [param5+0], %rd5; - - .param .b32 retval0; - call.uni (retval0), - cudaLaunchDevice, - ( - param0, - param1, - param2, - param3, - param4, - param5 - ); - ld.param.b32 %r6, [retval0+0]; - } - - // inline asm - -BB9_2: - ret; -} - -.visible .func ISPCSync( - .param .b64 ISPCSync_param_0 -) -{ - .reg .s32 %r<2>; - - - // Callseq Start 1 - { - .reg .b32 temp_param_reg; - .param .b32 retval0; - call.uni (retval0), - cudaDeviceSynchronize, - ( - ); - ld.param.b32 %r1, [retval0+0]; - } - // Callseq End 1 - ret; -} - -.visible .func (.param .b64 func_retval0) __warpBinExclusiveScan( - .param .b32 __warpBinExclusiveScan_param_0 -) -{ - .reg .s32 %r<8>; - .reg .s64 %rd<5>; - - - ld.param.u8 %r2, [__warpBinExclusiveScan_param_0]; - // inline asm - { .reg .pred %p1; - setp.ne.u32 %p1, %r2, 0; - vote.ballot.b32 %r1, %p1; - } - // inline asm - // inline asm - popc.b32 %r3, %r1; - // inline asm - // inline asm - mov.u32 %r5, %lanemask_lt; - // inline asm - and.b32 %r7, %r5, %r1; - // inline asm - popc.b32 %r6, %r7; - // inline asm - cvt.u64.u32 %rd1, %r6; - shl.b64 %rd2, %rd1, 32; - cvt.u64.u32 %rd3, %r3; - or.b64 %rd4, %rd2, %rd3; - st.param.b64 [func_retval0+0], %rd4; - ret; -} - -.entry stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_( - .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_0, - .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_1, - .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_2, - .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_3, - .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_4, - .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_5, - .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_6, - .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_7, - .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_8, - .param .u64 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_9, - .param .u64 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_10, - .param .u64 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_11, - .param .u64 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_12 -) -{ - .reg .pred %p<14>; - .reg .s32 %r<178>; - .reg .s64 %rd<96>; - .reg .f64 %fd<95>; - - - ld.param.u32 %r42, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_0]; - ld.param.u32 %r43, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_1]; - ld.param.u32 %r44, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_2]; - ld.param.u32 %r45, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_3]; - ld.param.u32 %r46, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_4]; - ld.param.u32 %r47, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_5]; - ld.param.u32 %r48, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_6]; - ld.param.u32 %r49, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_7]; - ld.param.u64 %rd2, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_9]; - ld.param.u64 %rd3, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_10]; - ld.param.u64 %rd4, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_11]; - ld.param.u64 %rd5, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_12]; - mov.u32 %r1, %ctaid.x; - shl.b32 %r50, %r1, 2; - mov.u32 %r2, %tid.x; - shr.s32 %r51, %r2, 5; - add.s32 %r52, %r51, %r50; - mov.u32 %r53, %nctaid.x; - shl.b32 %r54, %r53, 2; - setp.ge.s32 %p1, %r52, %r54; - mov.u32 %r55, %nctaid.y; - mov.u32 %r3, %ctaid.y; - setp.ge.s32 %p2, %r3, %r55; - or.pred %p3, %p1, %p2; - mov.u32 %r56, %nctaid.z; - mov.u32 %r4, %ctaid.z; - setp.ge.s32 %p4, %r4, %r56; - or.pred %p5, %p3, %p4; - @%p5 bra BB12_13; - - shl.b32 %r57, %r1, 7; - add.s32 %r58, %r2, %r57; - and.b32 %r59, %r58, -32; - add.s32 %r60, %r59, %r42; - add.s32 %r61, %r60, 32; - min.s32 %r5, %r43, %r61; - shl.b32 %r6, %r3, 3; - add.s32 %r62, %r6, %r44; - add.s32 %r7, %r62, 8; - shl.b32 %r8, %r4, 3; - add.s32 %r172, %r8, %r46; - add.s32 %r63, %r172, 8; - min.s32 %r64, %r47, %r63; - mul.lo.s32 %r10, %r49, %r48; - sub.s32 %r65, %r5, %r60; - shr.s32 %r66, %r65, 31; - shr.u32 %r67, %r66, 27; - add.s32 %r68, %r65, %r67; - and.b32 %r69, %r68, -32; - sub.s32 %r70, %r65, %r69; - sub.s32 %r11, %r5, %r70; - and.b32 %r71, %r2, 31; - cvt.u64.u32 %rd6, %r71; - mov.u64 %rd7, constDeltaForeach1; - add.s64 %rd1, %rd7, %rd6; - setp.ge.s32 %p6, %r172, %r64; - @%p6 bra BB12_13; - - min.s32 %r12, %r45, %r7; - shl.b32 %r15, %r10, 1; - neg.s32 %r16, %r15; - mul.lo.s32 %r17, %r10, 3; - mul.lo.s32 %r18, %r10, -3; - mov.u32 %r72, -9; - sub.s32 %r73, %r72, %r44; - sub.s32 %r74, %r73, %r6; - not.b32 %r75, %r45; - max.s32 %r76, %r74, %r75; - not.b32 %r19, %r76; - sub.s32 %r77, %r72, %r46; - sub.s32 %r78, %r77, %r8; - not.b32 %r79, %r47; - max.s32 %r80, %r78, %r79; - not.b32 %r20, %r80; - ld.global.u8 %r13, [%rd1]; - mov.u32 %r171, %r172; - -BB12_3: - mov.u32 %r21, %r171; - add.s32 %r23, %r21, %r13; - setp.ge.s32 %p7, %r62, %r12; - @%p7 bra BB12_12; - - mul.lo.s32 %r24, %r23, %r10; - mov.u32 %r174, %r62; - mov.u32 %r173, %r62; - -BB12_5: - mov.u32 %r27, %r173; - add.s32 %r30, %r27, %r13; - setp.ge.s32 %p8, %r60, %r11; - mov.u32 %r176, %r60; - @%p8 bra BB12_8; - - mov.u64 %rd9, constDeltaForeach4; - add.s64 %rd10, %rd9, %rd6; - ld.global.u8 %r31, [%rd10]; - mad.lo.s32 %r32, %r30, %r48, %r24; - add.s32 %r177, %r59, %r42; - -BB12_7: - cvta.to.global.u64 %rd11, %rd2; - add.s32 %r98, %r32, %r177; - add.s32 %r99, %r98, %r31; - shl.b32 %r100, %r99, 3; - cvt.s64.s32 %rd12, %r100; - add.s64 %rd13, %rd12, %rd4; - add.s32 %r101, %r100, 8; - cvt.s64.s32 %rd14, %r101; - add.s64 %rd15, %rd14, %rd4; - add.s32 %r102, %r100, -8; - cvt.s64.s32 %rd16, %r102; - add.s64 %rd17, %rd16, %rd4; - add.s32 %r103, %r99, %r48; - shl.b32 %r104, %r103, 3; - cvt.s64.s32 %rd18, %r104; - add.s64 %rd19, %rd18, %rd4; - sub.s32 %r105, %r99, %r48; - shl.b32 %r106, %r105, 3; - cvt.s64.s32 %rd20, %r106; - add.s64 %rd21, %rd20, %rd4; - add.s32 %r108, %r99, %r10; - shl.b32 %r109, %r108, 3; - cvt.s64.s32 %rd22, %r109; - add.s64 %rd23, %rd22, %rd4; - sub.s32 %r110, %r99, %r10; - shl.b32 %r111, %r110, 3; - cvt.s64.s32 %rd24, %r111; - add.s64 %rd25, %rd24, %rd4; - add.s32 %r112, %r100, 16; - cvt.s64.s32 %rd26, %r112; - add.s64 %rd27, %rd26, %rd4; - add.s32 %r113, %r100, -16; - cvt.s64.s32 %rd28, %r113; - add.s64 %rd29, %rd28, %rd4; - shl.b32 %r114, %r48, 1; - add.s32 %r115, %r99, %r114; - shl.b32 %r116, %r115, 3; - cvt.s64.s32 %rd30, %r116; - add.s64 %rd31, %rd30, %rd4; - mad.lo.s32 %r117, %r48, -2, %r99; - shl.b32 %r118, %r117, 3; - cvt.s64.s32 %rd32, %r118; - add.s64 %rd33, %rd32, %rd4; - add.s32 %r119, %r99, %r15; - shl.b32 %r120, %r119, 3; - cvt.s64.s32 %rd34, %r120; - add.s64 %rd35, %rd34, %rd4; - add.s32 %r121, %r99, %r16; - shl.b32 %r122, %r121, 3; - cvt.s64.s32 %rd36, %r122; - add.s64 %rd37, %rd36, %rd4; - add.s32 %r123, %r100, 24; - cvt.s64.s32 %rd38, %r123; - add.s64 %rd39, %rd38, %rd4; - add.s32 %r124, %r100, -24; - cvt.s64.s32 %rd40, %r124; - add.s64 %rd41, %rd40, %rd4; - mad.lo.s32 %r125, %r48, 3, %r99; - shl.b32 %r126, %r125, 3; - cvt.s64.s32 %rd42, %r126; - add.s64 %rd43, %rd42, %rd4; - mad.lo.s32 %r127, %r48, -3, %r99; - shl.b32 %r128, %r127, 3; - cvt.s64.s32 %rd44, %r128; - add.s64 %rd45, %rd44, %rd4; - add.s32 %r129, %r99, %r17; - shl.b32 %r130, %r129, 3; - cvt.s64.s32 %rd46, %r130; - add.s64 %rd47, %rd46, %rd4; - add.s32 %r131, %r99, %r18; - shl.b32 %r132, %r131, 3; - cvt.s64.s32 %rd48, %r132; - add.s64 %rd49, %rd48, %rd4; - add.s64 %rd50, %rd12, %rd5; - add.s64 %rd51, %rd12, %rd3; - ld.f64 %fd1, [%rd13]; - add.f64 %fd2, %fd1, %fd1; - ld.f64 %fd3, [%rd50]; - sub.f64 %fd4, %fd2, %fd3; - ld.global.f64 %fd5, [%rd11]; - ld.f64 %fd6, [%rd17]; - ld.f64 %fd7, [%rd15]; - add.f64 %fd8, %fd7, %fd6; - ld.f64 %fd9, [%rd19]; - add.f64 %fd10, %fd8, %fd9; - ld.f64 %fd11, [%rd21]; - add.f64 %fd12, %fd10, %fd11; - ld.f64 %fd13, [%rd23]; - add.f64 %fd14, %fd12, %fd13; - ld.f64 %fd15, [%rd25]; - add.f64 %fd16, %fd14, %fd15; - ld.global.f64 %fd17, [%rd11+8]; - mul.f64 %fd18, %fd17, %fd16; - fma.rn.f64 %fd19, %fd5, %fd1, %fd18; - ld.f64 %fd20, [%rd29]; - ld.f64 %fd21, [%rd27]; - add.f64 %fd22, %fd21, %fd20; - ld.f64 %fd23, [%rd31]; - add.f64 %fd24, %fd22, %fd23; - ld.f64 %fd25, [%rd33]; - add.f64 %fd26, %fd24, %fd25; - ld.f64 %fd27, [%rd35]; - add.f64 %fd28, %fd26, %fd27; - ld.f64 %fd29, [%rd37]; - add.f64 %fd30, %fd28, %fd29; - ld.global.f64 %fd31, [%rd11+16]; - fma.rn.f64 %fd32, %fd31, %fd30, %fd19; - ld.f64 %fd33, [%rd41]; - ld.f64 %fd34, [%rd39]; - add.f64 %fd35, %fd34, %fd33; - ld.f64 %fd36, [%rd43]; - add.f64 %fd37, %fd35, %fd36; - ld.f64 %fd38, [%rd45]; - add.f64 %fd39, %fd37, %fd38; - ld.f64 %fd40, [%rd47]; - add.f64 %fd41, %fd39, %fd40; - ld.f64 %fd42, [%rd49]; - add.f64 %fd43, %fd41, %fd42; - ld.global.f64 %fd44, [%rd11+24]; - fma.rn.f64 %fd45, %fd44, %fd43, %fd32; - ld.f64 %fd46, [%rd51]; - fma.rn.f64 %fd47, %fd46, %fd45, %fd4; - st.f64 [%rd50], %fd47; - add.s32 %r177, %r177, 32; - setp.lt.s32 %p9, %r177, %r11; - mov.u32 %r175, %r177; - mov.u32 %r176, %r175; - @%p9 bra BB12_7; - -BB12_8: - mov.u32 %r36, %r176; - setp.ge.s32 %p10, %r36, %r5; - @%p10 bra BB12_11; - - mov.u64 %rd53, constDeltaForeach4; - add.s64 %rd54, %rd53, %rd6; - ld.global.u8 %r135, [%rd54]; - add.s32 %r37, %r36, %r135; - setp.ge.s32 %p11, %r37, %r5; - @%p11 bra BB12_11; - - cvta.to.global.u64 %rd55, %rd2; - mad.lo.s32 %r136, %r30, %r48, %r24; - add.s32 %r137, %r136, %r37; - shl.b32 %r138, %r137, 3; - cvt.s64.s32 %rd56, %r138; - add.s64 %rd57, %rd56, %rd4; - add.s32 %r139, %r138, 8; - cvt.s64.s32 %rd58, %r139; - add.s64 %rd59, %rd58, %rd4; - add.s32 %r140, %r138, -8; - cvt.s64.s32 %rd60, %r140; - add.s64 %rd61, %rd60, %rd4; - add.s32 %r141, %r137, %r48; - shl.b32 %r142, %r141, 3; - cvt.s64.s32 %rd62, %r142; - add.s64 %rd63, %rd62, %rd4; - sub.s32 %r143, %r137, %r48; - shl.b32 %r144, %r143, 3; - cvt.s64.s32 %rd64, %r144; - add.s64 %rd65, %rd64, %rd4; - add.s32 %r146, %r137, %r10; - shl.b32 %r147, %r146, 3; - cvt.s64.s32 %rd66, %r147; - add.s64 %rd67, %rd66, %rd4; - sub.s32 %r148, %r137, %r10; - shl.b32 %r149, %r148, 3; - cvt.s64.s32 %rd68, %r149; - add.s64 %rd69, %rd68, %rd4; - add.s32 %r150, %r138, 16; - cvt.s64.s32 %rd70, %r150; - add.s64 %rd71, %rd70, %rd4; - add.s32 %r151, %r138, -16; - cvt.s64.s32 %rd72, %r151; - add.s64 %rd73, %rd72, %rd4; - shl.b32 %r152, %r48, 1; - add.s32 %r153, %r137, %r152; - shl.b32 %r154, %r153, 3; - cvt.s64.s32 %rd74, %r154; - add.s64 %rd75, %rd74, %rd4; - mad.lo.s32 %r155, %r48, -2, %r137; - shl.b32 %r156, %r155, 3; - cvt.s64.s32 %rd76, %r156; - add.s64 %rd77, %rd76, %rd4; - add.s32 %r157, %r137, %r15; - shl.b32 %r158, %r157, 3; - cvt.s64.s32 %rd78, %r158; - add.s64 %rd79, %rd78, %rd4; - add.s32 %r159, %r137, %r16; - shl.b32 %r160, %r159, 3; - cvt.s64.s32 %rd80, %r160; - add.s64 %rd81, %rd80, %rd4; - add.s32 %r161, %r138, 24; - cvt.s64.s32 %rd82, %r161; - add.s64 %rd83, %rd82, %rd4; - add.s32 %r162, %r138, -24; - cvt.s64.s32 %rd84, %r162; - add.s64 %rd85, %rd84, %rd4; - mad.lo.s32 %r163, %r48, 3, %r137; - shl.b32 %r164, %r163, 3; - cvt.s64.s32 %rd86, %r164; - add.s64 %rd87, %rd86, %rd4; - mad.lo.s32 %r165, %r48, -3, %r137; - shl.b32 %r166, %r165, 3; - cvt.s64.s32 %rd88, %r166; - add.s64 %rd89, %rd88, %rd4; - add.s32 %r167, %r137, %r17; - shl.b32 %r168, %r167, 3; - cvt.s64.s32 %rd90, %r168; - add.s64 %rd91, %rd90, %rd4; - add.s32 %r169, %r137, %r18; - shl.b32 %r170, %r169, 3; - cvt.s64.s32 %rd92, %r170; - add.s64 %rd93, %rd92, %rd4; - add.s64 %rd94, %rd56, %rd5; - add.s64 %rd95, %rd56, %rd3; - ld.f64 %fd48, [%rd57]; - add.f64 %fd49, %fd48, %fd48; - ld.f64 %fd50, [%rd94]; - sub.f64 %fd51, %fd49, %fd50; - ld.global.f64 %fd52, [%rd55]; - ld.f64 %fd53, [%rd61]; - ld.f64 %fd54, [%rd59]; - add.f64 %fd55, %fd54, %fd53; - ld.f64 %fd56, [%rd63]; - add.f64 %fd57, %fd55, %fd56; - ld.f64 %fd58, [%rd65]; - add.f64 %fd59, %fd57, %fd58; - ld.f64 %fd60, [%rd67]; - add.f64 %fd61, %fd59, %fd60; - ld.f64 %fd62, [%rd69]; - add.f64 %fd63, %fd61, %fd62; - ld.global.f64 %fd64, [%rd55+8]; - mul.f64 %fd65, %fd64, %fd63; - fma.rn.f64 %fd66, %fd52, %fd48, %fd65; - ld.f64 %fd67, [%rd73]; - ld.f64 %fd68, [%rd71]; - add.f64 %fd69, %fd68, %fd67; - ld.f64 %fd70, [%rd75]; - add.f64 %fd71, %fd69, %fd70; - ld.f64 %fd72, [%rd77]; - add.f64 %fd73, %fd71, %fd72; - ld.f64 %fd74, [%rd79]; - add.f64 %fd75, %fd73, %fd74; - ld.f64 %fd76, [%rd81]; - add.f64 %fd77, %fd75, %fd76; - ld.global.f64 %fd78, [%rd55+16]; - fma.rn.f64 %fd79, %fd78, %fd77, %fd66; - ld.f64 %fd80, [%rd85]; - ld.f64 %fd81, [%rd83]; - add.f64 %fd82, %fd81, %fd80; - ld.f64 %fd83, [%rd87]; - add.f64 %fd84, %fd82, %fd83; - ld.f64 %fd85, [%rd89]; - add.f64 %fd86, %fd84, %fd85; - ld.f64 %fd87, [%rd91]; - add.f64 %fd88, %fd86, %fd87; - ld.f64 %fd89, [%rd93]; - add.f64 %fd90, %fd88, %fd89; - ld.global.f64 %fd91, [%rd55+24]; - fma.rn.f64 %fd92, %fd91, %fd90, %fd79; - ld.f64 %fd93, [%rd95]; - fma.rn.f64 %fd94, %fd92, %fd93, %fd51; - st.f64 [%rd94], %fd94; - -BB12_11: - add.s32 %r39, %r174, 1; - setp.ne.s32 %p12, %r39, %r19; - mov.u32 %r174, %r39; - mov.u32 %r173, %r39; - @%p12 bra BB12_5; - -BB12_12: - add.s32 %r171, %r172, 1; - setp.ne.s32 %p13, %r171, %r20; - mov.u32 %r172, %r171; - @%p13 bra BB12_3; - -BB12_13: - ret; -} - -.visible .func loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E_( - .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_0, - .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_1, - .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_2, - .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_3, - .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_4, - .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_5, - .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_6, - .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_7, - .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_8, - .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_9, - .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_10, - .param .b64 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_11, - .param .b64 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_12, - .param .b64 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_13, - .param .b64 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_14, - .param .align 1 .b8 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_15[1] -) -{ - .reg .pred %p<9>; - .reg .s32 %r<63>; - .reg .s64 %rd<18>; - - - ld.param.u32 %r62, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_0]; - ld.param.u32 %r12, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_1]; - ld.param.u32 %r13, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_2]; - ld.param.u32 %r14, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_3]; - ld.param.u32 %r15, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_4]; - ld.param.u32 %r16, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_5]; - ld.param.u32 %r17, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_6]; - ld.param.u32 %r18, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_7]; - ld.param.u32 %r19, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_8]; - ld.param.u32 %r20, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_9]; - ld.param.u32 %r21, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_10]; - ld.param.u64 %rd4, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_11]; - ld.param.u64 %rd5, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_12]; - ld.param.u64 %rd6, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_13]; - ld.param.u64 %rd7, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_14]; - setp.ge.s32 %p1, %r62, %r12; - @%p1 bra BB13_14; - - mov.u32 %r22, 31; - sub.s32 %r23, %r22, %r13; - add.s32 %r24, %r23, %r14; - shr.s32 %r25, %r24, 31; - shr.u32 %r26, %r25, 27; - add.s32 %r27, %r24, %r26; - shr.s32 %r28, %r27, 5; - mov.u32 %r29, 7; - sub.s32 %r30, %r29, %r15; - add.s32 %r31, %r30, %r16; - shr.s32 %r32, %r31, 31; - shr.u32 %r33, %r32, 29; - add.s32 %r34, %r31, %r33; - shr.s32 %r1, %r34, 3; - sub.s32 %r35, %r29, %r17; - add.s32 %r36, %r35, %r18; - shr.s32 %r37, %r36, 31; - shr.u32 %r38, %r37, 29; - add.s32 %r39, %r36, %r38; - shr.s32 %r2, %r39, 3; - add.s32 %r40, %r28, -1; - shr.s32 %r41, %r40, 2; - add.s32 %r3, %r41, 1; - mov.u32 %r42, %tid.x; - and.b32 %r4, %r42, 31; - sub.s32 %r61, %r62, %r12; - -BB13_2: - and.b32 %r8, %r62, 1; - setp.ne.s32 %p2, %r4, 0; - mov.u64 %rd17, 0; - @%p2 bra BB13_4; - - mov.u64 %rd9, 8; - mov.u64 %rd10, 72; - // Callseq Start 2 - { - .reg .b32 temp_param_reg; - .param .b64 param0; - st.param.b64 [param0+0], %rd9; - .param .b64 param1; - st.param.b64 [param1+0], %rd10; - .param .b64 retval0; - call.uni (retval0), - cudaGetParameterBuffer, - ( - param0, - param1 - ); - ld.param.b64 %rd17, [retval0+0]; - } - // Callseq End 2 - -BB13_4: - setp.eq.s32 %p3, %r8, 0; - @%p3 bra BB13_9; - - setp.eq.s64 %p4, %rd17, 0; - @%p4 bra BB13_7; - - st.u32 [%rd17], %r13; - st.u32 [%rd17+4], %r14; - st.u32 [%rd17+8], %r15; - st.u32 [%rd17+12], %r16; - st.u32 [%rd17+16], %r17; - st.u32 [%rd17+20], %r18; - st.u32 [%rd17+24], %r19; - st.u32 [%rd17+28], %r20; - st.u32 [%rd17+32], %r21; - st.u64 [%rd17+40], %rd4; - st.u64 [%rd17+48], %rd5; - st.u64 [%rd17+56], %rd7; - st.u64 [%rd17+64], %rd6; - -BB13_7: - @%p2 bra BB13_13; - - mov.u32 %r47, 128; - mov.u32 %r49, 1; - mov.u32 %r50, 0; - mov.u64 %rd13, 0; - mov.u64 %rd11, stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_; - // inline asm - { - .param .b64 param0; - st.param.b64 [param0+0], %rd11; - .param .b64 param1; - st.param.b64 [param1+0], %rd17; - .param .align 4 .b8 param2[12]; - st.param.b32 [param2+0], %r3; - st.param.b32 [param2+4], %r1; - st.param.b32 [param2+8], %r2; - .param .align 4 .b8 param3[12]; - st.param.b32 [param3+0], %r47; - st.param.b32 [param3+4], %r49; - st.param.b32 [param3+8], %r49; - .param .b32 param4; - st.param.b32 [param4+0], %r50; - .param .b64 param5; - st.param.b64 [param5+0], %rd13; - - .param .b32 retval0; - call.uni (retval0), - cudaLaunchDevice, - ( - param0, - param1, - param2, - param3, - param4, - param5 - ); - ld.param.b32 %r43, [retval0+0]; - } - - // inline asm - bra.uni BB13_13; - -BB13_9: - setp.eq.s64 %p6, %rd17, 0; - @%p6 bra BB13_11; - - st.u32 [%rd17], %r13; - st.u32 [%rd17+4], %r14; - st.u32 [%rd17+8], %r15; - st.u32 [%rd17+12], %r16; - st.u32 [%rd17+16], %r17; - st.u32 [%rd17+20], %r18; - st.u32 [%rd17+24], %r19; - st.u32 [%rd17+28], %r20; - st.u32 [%rd17+32], %r21; - st.u64 [%rd17+40], %rd4; - st.u64 [%rd17+48], %rd5; - st.u64 [%rd17+56], %rd6; - st.u64 [%rd17+64], %rd7; - -BB13_11: - @%p2 bra BB13_13; - - mov.u32 %r55, 128; - mov.u32 %r57, 1; - mov.u32 %r58, 0; - mov.u64 %rd16, 0; - mov.u64 %rd14, stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_; - // inline asm - { - .param .b64 param0; - st.param.b64 [param0+0], %rd14; - .param .b64 param1; - st.param.b64 [param1+0], %rd17; - .param .align 4 .b8 param2[12]; - st.param.b32 [param2+0], %r3; - st.param.b32 [param2+4], %r1; - st.param.b32 [param2+8], %r2; - .param .align 4 .b8 param3[12]; - st.param.b32 [param3+0], %r55; - st.param.b32 [param3+4], %r57; - st.param.b32 [param3+8], %r57; - .param .b32 param4; - st.param.b32 [param4+0], %r58; - .param .b64 param5; - st.param.b64 [param5+0], %rd16; - - .param .b32 retval0; - call.uni (retval0), - cudaLaunchDevice, - ( - param0, - param1, - param2, - param3, - param4, - param5 - ); - ld.param.b32 %r51, [retval0+0]; - } - - // inline asm - -BB13_13: - // Callseq Start 3 - { - .reg .b32 temp_param_reg; - .param .b32 retval0; - call.uni (retval0), - cudaDeviceSynchronize, - ( - ); - ld.param.b32 %r59, [retval0+0]; - } - // Callseq End 3 - add.s32 %r62, %r62, 1; - add.s32 %r61, %r61, 1; - setp.ne.s32 %p8, %r61, 0; - @%p8 bra BB13_2; - -BB13_14: - // Callseq Start 4 - { - .reg .b32 temp_param_reg; - .param .b32 retval0; - call.uni (retval0), - cudaDeviceSynchronize, - ( - ); - ld.param.b32 %r60, [retval0+0]; - } - // Callseq End 4 - ret; -} - -.visible .entry loop_stencil_ispc_tasks( - .param .u32 loop_stencil_ispc_tasks_param_0, - .param .u32 loop_stencil_ispc_tasks_param_1, - .param .u32 loop_stencil_ispc_tasks_param_2, - .param .u32 loop_stencil_ispc_tasks_param_3, - .param .u32 loop_stencil_ispc_tasks_param_4, - .param .u32 loop_stencil_ispc_tasks_param_5, - .param .u32 loop_stencil_ispc_tasks_param_6, - .param .u32 loop_stencil_ispc_tasks_param_7, - .param .u32 loop_stencil_ispc_tasks_param_8, - .param .u32 loop_stencil_ispc_tasks_param_9, - .param .u32 loop_stencil_ispc_tasks_param_10, - .param .u64 loop_stencil_ispc_tasks_param_11, - .param .u64 loop_stencil_ispc_tasks_param_12, - .param .u64 loop_stencil_ispc_tasks_param_13, - .param .u64 loop_stencil_ispc_tasks_param_14 -) -{ - .reg .pred %p<9>; - .reg .s32 %r<63>; - .reg .s64 %rd<18>; - - - ld.param.u32 %r62, [loop_stencil_ispc_tasks_param_0]; - ld.param.u32 %r12, [loop_stencil_ispc_tasks_param_1]; - ld.param.u32 %r13, [loop_stencil_ispc_tasks_param_2]; - ld.param.u32 %r14, [loop_stencil_ispc_tasks_param_3]; - ld.param.u32 %r15, [loop_stencil_ispc_tasks_param_4]; - ld.param.u32 %r16, [loop_stencil_ispc_tasks_param_5]; - ld.param.u32 %r17, [loop_stencil_ispc_tasks_param_6]; - ld.param.u32 %r18, [loop_stencil_ispc_tasks_param_7]; - ld.param.u32 %r19, [loop_stencil_ispc_tasks_param_8]; - ld.param.u32 %r20, [loop_stencil_ispc_tasks_param_9]; - ld.param.u32 %r21, [loop_stencil_ispc_tasks_param_10]; - ld.param.u64 %rd4, [loop_stencil_ispc_tasks_param_11]; - ld.param.u64 %rd5, [loop_stencil_ispc_tasks_param_12]; - ld.param.u64 %rd6, [loop_stencil_ispc_tasks_param_13]; - ld.param.u64 %rd7, [loop_stencil_ispc_tasks_param_14]; - setp.ge.s32 %p1, %r62, %r12; - @%p1 bra BB14_14; - - mov.u32 %r22, 31; - sub.s32 %r23, %r22, %r13; - add.s32 %r24, %r23, %r14; - shr.s32 %r25, %r24, 31; - shr.u32 %r26, %r25, 27; - add.s32 %r27, %r24, %r26; - shr.s32 %r28, %r27, 5; - mov.u32 %r29, 7; - sub.s32 %r30, %r29, %r15; - add.s32 %r31, %r30, %r16; - shr.s32 %r32, %r31, 31; - shr.u32 %r33, %r32, 29; - add.s32 %r34, %r31, %r33; - shr.s32 %r1, %r34, 3; - sub.s32 %r35, %r29, %r17; - add.s32 %r36, %r35, %r18; - shr.s32 %r37, %r36, 31; - shr.u32 %r38, %r37, 29; - add.s32 %r39, %r36, %r38; - shr.s32 %r2, %r39, 3; - add.s32 %r40, %r28, -1; - shr.s32 %r41, %r40, 2; - add.s32 %r3, %r41, 1; - mov.u32 %r42, %tid.x; - and.b32 %r4, %r42, 31; - sub.s32 %r61, %r62, %r12; - -BB14_2: - and.b32 %r8, %r62, 1; - setp.ne.s32 %p2, %r4, 0; - mov.u64 %rd17, 0; - @%p2 bra BB14_4; - - mov.u64 %rd9, 8; - mov.u64 %rd10, 72; - // Callseq Start 5 - { - .reg .b32 temp_param_reg; - .param .b64 param0; - st.param.b64 [param0+0], %rd9; - .param .b64 param1; - st.param.b64 [param1+0], %rd10; - .param .b64 retval0; - call.uni (retval0), - cudaGetParameterBuffer, - ( - param0, - param1 - ); - ld.param.b64 %rd17, [retval0+0]; - } - // Callseq End 5 - -BB14_4: - setp.eq.s32 %p3, %r8, 0; - @%p3 bra BB14_9; - - setp.eq.s64 %p4, %rd17, 0; - @%p4 bra BB14_7; - - st.u32 [%rd17], %r13; - st.u32 [%rd17+4], %r14; - st.u32 [%rd17+8], %r15; - st.u32 [%rd17+12], %r16; - st.u32 [%rd17+16], %r17; - st.u32 [%rd17+20], %r18; - st.u32 [%rd17+24], %r19; - st.u32 [%rd17+28], %r20; - st.u32 [%rd17+32], %r21; - st.u64 [%rd17+40], %rd4; - st.u64 [%rd17+48], %rd5; - st.u64 [%rd17+56], %rd7; - st.u64 [%rd17+64], %rd6; - -BB14_7: - @%p2 bra BB14_13; - - mov.u32 %r47, 128; - mov.u32 %r49, 1; - mov.u32 %r50, 0; - mov.u64 %rd13, 0; - mov.u64 %rd11, stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_; - // inline asm - { - .param .b64 param0; - st.param.b64 [param0+0], %rd11; - .param .b64 param1; - st.param.b64 [param1+0], %rd17; - .param .align 4 .b8 param2[12]; - st.param.b32 [param2+0], %r3; - st.param.b32 [param2+4], %r1; - st.param.b32 [param2+8], %r2; - .param .align 4 .b8 param3[12]; - st.param.b32 [param3+0], %r47; - st.param.b32 [param3+4], %r49; - st.param.b32 [param3+8], %r49; - .param .b32 param4; - st.param.b32 [param4+0], %r50; - .param .b64 param5; - st.param.b64 [param5+0], %rd13; - - .param .b32 retval0; - call.uni (retval0), - cudaLaunchDevice, - ( - param0, - param1, - param2, - param3, - param4, - param5 - ); - ld.param.b32 %r43, [retval0+0]; - } - - // inline asm - bra.uni BB14_13; - -BB14_9: - setp.eq.s64 %p6, %rd17, 0; - @%p6 bra BB14_11; - - st.u32 [%rd17], %r13; - st.u32 [%rd17+4], %r14; - st.u32 [%rd17+8], %r15; - st.u32 [%rd17+12], %r16; - st.u32 [%rd17+16], %r17; - st.u32 [%rd17+20], %r18; - st.u32 [%rd17+24], %r19; - st.u32 [%rd17+28], %r20; - st.u32 [%rd17+32], %r21; - st.u64 [%rd17+40], %rd4; - st.u64 [%rd17+48], %rd5; - st.u64 [%rd17+56], %rd6; - st.u64 [%rd17+64], %rd7; - -BB14_11: - @%p2 bra BB14_13; - - mov.u32 %r55, 128; - mov.u32 %r57, 1; - mov.u32 %r58, 0; - mov.u64 %rd16, 0; - mov.u64 %rd14, stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_; - // inline asm - { - .param .b64 param0; - st.param.b64 [param0+0], %rd14; - .param .b64 param1; - st.param.b64 [param1+0], %rd17; - .param .align 4 .b8 param2[12]; - st.param.b32 [param2+0], %r3; - st.param.b32 [param2+4], %r1; - st.param.b32 [param2+8], %r2; - .param .align 4 .b8 param3[12]; - st.param.b32 [param3+0], %r55; - st.param.b32 [param3+4], %r57; - st.param.b32 [param3+8], %r57; - .param .b32 param4; - st.param.b32 [param4+0], %r58; - .param .b64 param5; - st.param.b64 [param5+0], %rd16; - - .param .b32 retval0; - call.uni (retval0), - cudaLaunchDevice, - ( - param0, - param1, - param2, - param3, - param4, - param5 - ); - ld.param.b32 %r51, [retval0+0]; - } - - // inline asm - -BB14_13: - // Callseq Start 6 - { - .reg .b32 temp_param_reg; - .param .b32 retval0; - call.uni (retval0), - cudaDeviceSynchronize, - ( - ); - ld.param.b32 %r59, [retval0+0]; - } - // Callseq End 6 - add.s32 %r62, %r62, 1; - add.s32 %r61, %r61, 1; - setp.ne.s32 %p8, %r61, 0; - @%p8 bra BB14_2; - -BB14_14: - // Callseq Start 7 - { - .reg .b32 temp_param_reg; - .param .b32 retval0; - call.uni (retval0), - cudaDeviceSynchronize, - ( - ); - ld.param.b32 %r60, [retval0+0]; - } - // Callseq End 7 - ret; -} - - - diff --git a/examples_cuda/stencil/drvapi_error_string.h b/examples_cuda/stencil/drvapi_error_string.h deleted file mode 100644 index ce85f152..00000000 --- a/examples_cuda/stencil/drvapi_error_string.h +++ /dev/null @@ -1,370 +0,0 @@ -/* - * Copyright 1993-2012 NVIDIA Corporation. All rights reserved. - * - * Please refer to the NVIDIA end user license agreement (EULA) associated - * with this source code for terms and conditions that govern your use of - * this software. Any use, reproduction, disclosure, or distribution of - * this software and related documentation outside the terms of the EULA - * is strictly prohibited. - * - */ - -#ifndef _DRVAPI_ERROR_STRING_H_ -#define _DRVAPI_ERROR_STRING_H_ - -#include -#include -#include - -// Error Code string definitions here -typedef struct -{ - char const *error_string; - int error_id; -} s_CudaErrorStr; - -/** - * Error codes - */ -static s_CudaErrorStr sCudaDrvErrorString[] = -{ - /** - * The API call returned with no errors. In the case of query calls, this - * can also mean that the operation being queried is complete (see - * ::cuEventQuery() and ::cuStreamQuery()). - */ - { "CUDA_SUCCESS", 0 }, - - /** - * This indicates that one or more of the parameters passed to the API call - * is not within an acceptable range of values. - */ - { "CUDA_ERROR_INVALID_VALUE", 1 }, - - /** - * The API call failed because it was unable to allocate enough memory to - * perform the requested operation. - */ - { "CUDA_ERROR_OUT_OF_MEMORY", 2 }, - - /** - * This indicates that the CUDA driver has not been initialized with - * ::cuInit() or that initialization has failed. - */ - { "CUDA_ERROR_NOT_INITIALIZED", 3 }, - - /** - * This indicates that the CUDA driver is in the process of shutting down. - */ - { "CUDA_ERROR_DEINITIALIZED", 4 }, - - /** - * This indicates profiling APIs are called while application is running - * in visual profiler mode. - */ - { "CUDA_ERROR_PROFILER_DISABLED", 5 }, - /** - * This indicates profiling has not been initialized for this context. - * Call cuProfilerInitialize() to resolve this. - */ - { "CUDA_ERROR_PROFILER_NOT_INITIALIZED", 6 }, - /** - * This indicates profiler has already been started and probably - * cuProfilerStart() is incorrectly called. - */ - { "CUDA_ERROR_PROFILER_ALREADY_STARTED", 7 }, - /** - * This indicates profiler has already been stopped and probably - * cuProfilerStop() is incorrectly called. - */ - { "CUDA_ERROR_PROFILER_ALREADY_STOPPED", 8 }, - /** - * This indicates that no CUDA-capable devices were detected by the installed - * CUDA driver. - */ - { "CUDA_ERROR_NO_DEVICE (no CUDA-capable devices were detected)", 100 }, - - /** - * This indicates that the device ordinal supplied by the user does not - * correspond to a valid CUDA device. - */ - { "CUDA_ERROR_INVALID_DEVICE (device specified is not a valid CUDA device)", 101 }, - - - /** - * This indicates that the device kernel image is invalid. This can also - * indicate an invalid CUDA module. - */ - { "CUDA_ERROR_INVALID_IMAGE", 200 }, - - /** - * This most frequently indicates that there is no context bound to the - * current thread. This can also be returned if the context passed to an - * API call is not a valid handle (such as a context that has had - * ::cuCtxDestroy() invoked on it). This can also be returned if a user - * mixes different API versions (i.e. 3010 context with 3020 API calls). - * See ::cuCtxGetApiVersion() for more details. - */ - { "CUDA_ERROR_INVALID_CONTEXT", 201 }, - - /** - * This indicated that the context being supplied as a parameter to the - * API call was already the active context. - * \deprecated - * This error return is deprecated as of CUDA 3.2. It is no longer an - * error to attempt to push the active context via ::cuCtxPushCurrent(). - */ - { "CUDA_ERROR_CONTEXT_ALREADY_CURRENT", 202 }, - - /** - * This indicates that a map or register operation has failed. - */ - { "CUDA_ERROR_MAP_FAILED", 205 }, - - /** - * This indicates that an unmap or unregister operation has failed. - */ - { "CUDA_ERROR_UNMAP_FAILED", 206 }, - - /** - * This indicates that the specified array is currently mapped and thus - * cannot be destroyed. - */ - { "CUDA_ERROR_ARRAY_IS_MAPPED", 207 }, - - /** - * This indicates that the resource is already mapped. - */ - { "CUDA_ERROR_ALREADY_MAPPED", 208 }, - - /** - * This indicates that there is no kernel image available that is suitable - * for the device. This can occur when a user specifies code generation - * options for a particular CUDA source file that do not include the - * corresponding device configuration. - */ - { "CUDA_ERROR_NO_BINARY_FOR_GPU", 209 }, - - /** - * This indicates that a resource has already been acquired. - */ - { "CUDA_ERROR_ALREADY_ACQUIRED", 210 }, - - /** - * This indicates that a resource is not mapped. - */ - { "CUDA_ERROR_NOT_MAPPED", 211 }, - - /** - * This indicates that a mapped resource is not available for access as an - * array. - */ - { "CUDA_ERROR_NOT_MAPPED_AS_ARRAY", 212 }, - - /** - * This indicates that a mapped resource is not available for access as a - * pointer. - */ - { "CUDA_ERROR_NOT_MAPPED_AS_POINTER", 213 }, - - /** - * This indicates that an uncorrectable ECC error was detected during - * execution. - */ - { "CUDA_ERROR_ECC_UNCORRECTABLE", 214 }, - - /** - * This indicates that the ::CUlimit passed to the API call is not - * supported by the active device. - */ - { "CUDA_ERROR_UNSUPPORTED_LIMIT", 215 }, - - /** - * This indicates that the ::CUcontext passed to the API call can - * only be bound to a single CPU thread at a time but is already - * bound to a CPU thread. - */ - { "CUDA_ERROR_CONTEXT_ALREADY_IN_USE", 216 }, - - /** - * This indicates that peer access is not supported across the given - * devices. - */ - { "CUDA_ERROR_PEER_ACCESS_UNSUPPORTED", 217}, - - /** - * This indicates that the device kernel source is invalid. - */ - { "CUDA_ERROR_INVALID_SOURCE", 300 }, - - /** - * This indicates that the file specified was not found. - */ - { "CUDA_ERROR_FILE_NOT_FOUND", 301 }, - - /** - * This indicates that a link to a shared object failed to resolve. - */ - { "CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND", 302 }, - - /** - * This indicates that initialization of a shared object failed. - */ - { "CUDA_ERROR_SHARED_OBJECT_INIT_FAILED", 303 }, - - /** - * This indicates that an OS call failed. - */ - { "CUDA_ERROR_OPERATING_SYSTEM", 304 }, - - - /** - * This indicates that a resource handle passed to the API call was not - * valid. Resource handles are opaque types like ::CUstream and ::CUevent. - */ - { "CUDA_ERROR_INVALID_HANDLE", 400 }, - - - /** - * This indicates that a named symbol was not found. Examples of symbols - * are global/constant variable names, texture names }, and surface names. - */ - { "CUDA_ERROR_NOT_FOUND", 500 }, - - - /** - * This indicates that asynchronous operations issued previously have not - * completed yet. This result is not actually an error, but must be indicated - * differently than ::CUDA_SUCCESS (which indicates completion). Calls that - * may return this value include ::cuEventQuery() and ::cuStreamQuery(). - */ - { "CUDA_ERROR_NOT_READY", 600 }, - - - /** - * An exception occurred on the device while executing a kernel. Common - * causes include dereferencing an invalid device pointer and accessing - * out of bounds shared memory. The context cannot be used }, so it must - * be destroyed (and a new one should be created). All existing device - * memory allocations from this context are invalid and must be - * reconstructed if the program is to continue using CUDA. - */ - { "CUDA_ERROR_LAUNCH_FAILED", 700 }, - - /** - * This indicates that a launch did not occur because it did not have - * appropriate resources. This error usually indicates that the user has - * attempted to pass too many arguments to the device kernel, or the - * kernel launch specifies too many threads for the kernel's register - * count. Passing arguments of the wrong size (i.e. a 64-bit pointer - * when a 32-bit int is expected) is equivalent to passing too many - * arguments and can also result in this error. - */ - { "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES", 701 }, - - /** - * This indicates that the device kernel took too long to execute. This can - * only occur if timeouts are enabled - see the device attribute - * ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT for more information. The - * context cannot be used (and must be destroyed similar to - * ::CUDA_ERROR_LAUNCH_FAILED). All existing device memory allocations from - * this context are invalid and must be reconstructed if the program is to - * continue using CUDA. - */ - { "CUDA_ERROR_LAUNCH_TIMEOUT", 702 }, - - /** - * This error indicates a kernel launch that uses an incompatible texturing - * mode. - */ - { "CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING", 703 }, - - /** - * This error indicates that a call to ::cuCtxEnablePeerAccess() is - * trying to re-enable peer access to a context which has already - * had peer access to it enabled. - */ - { "CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED", 704 }, - - /** - * This error indicates that ::cuCtxDisablePeerAccess() is - * trying to disable peer access which has not been enabled yet - * via ::cuCtxEnablePeerAccess(). - */ - { "CUDA_ERROR_PEER_ACCESS_NOT_ENABLED", 705 }, - - /** - * This error indicates that the primary context for the specified device - * has already been initialized. - */ - { "CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE", 708 }, - - /** - * This error indicates that the context current to the calling thread - * has been destroyed using ::cuCtxDestroy }, or is a primary context which - * has not yet been initialized. - */ - { "CUDA_ERROR_CONTEXT_IS_DESTROYED", 709 }, - - /** - * A device-side assert triggered during kernel execution. The context - * cannot be used anymore, and must be destroyed. All existing device - * memory allocations from this context are invalid and must be - * reconstructed if the program is to continue using CUDA. - */ - { "CUDA_ERROR_ASSERT", 710 }, - - /** - * This error indicates that the hardware resources required to enable - * peer access have been exhausted for one or more of the devices - * passed to ::cuCtxEnablePeerAccess(). - */ - { "CUDA_ERROR_TOO_MANY_PEERS", 711 }, - - /** - * This error indicates that the memory range passed to ::cuMemHostRegister() - * has already been registered. - */ - { "CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED", 712 }, - - /** - * This error indicates that the pointer passed to ::cuMemHostUnregister() - * does not correspond to any currently registered memory region. - */ - { "CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED", 713 }, - - /** - * This error indicates that the attempted operation is not permitted. - */ - { "CUDA_ERROR_NOT_PERMITTED", 800 }, - - /** - * This error indicates that the attempted operation is not supported - * on the current system or device. - */ - { "CUDA_ERROR_NOT_SUPPORTED", 801 }, - - /** - * This indicates that an unknown internal error has occurred. - */ - { "CUDA_ERROR_UNKNOWN", 999 }, - { NULL, -1 } -}; - -// This is just a linear search through the array, since the error_id's are not -// always ocurring consecutively -const char * getCudaDrvErrorString(CUresult error_id) -{ - int index = 0; - while (sCudaDrvErrorString[index].error_id != error_id && - sCudaDrvErrorString[index].error_id != -1) - { - index++; - } - if (sCudaDrvErrorString[index].error_id == error_id) - return (const char *)sCudaDrvErrorString[index].error_string; - else - return (const char *)"CUDA_ERROR not found!"; -} - -#endif diff --git a/examples_cuda/stencil/kernel.ptx b/examples_cuda/stencil/kernel.ptx deleted file mode 100644 index b0339cbf..00000000 --- a/examples_cuda/stencil/kernel.ptx +++ /dev/null @@ -1,1246 +0,0 @@ -// -// Generated by NVIDIA NVVM Compiler -// Compiler built on Thu Jul 18 02:37:37 2013 (1374107857) -// Cuda compilation tools, release 5.5, V5.5.0 -// - -.version 3.2 -.target sm_35 -.address_size 64 - - -.extern .func (.param .b32 func_retval0) cudaLaunchDevice -( - .param .b64 cudaLaunchDevice_param_0, - .param .b64 cudaLaunchDevice_param_1, - .param .align 4 .b8 cudaLaunchDevice_param_2[12], - .param .align 4 .b8 cudaLaunchDevice_param_3[12], - .param .b32 cudaLaunchDevice_param_4, - .param .b64 cudaLaunchDevice_param_5 -); - - -.extern .func (.param .b64 func_retval0) cudaGetParameterBuffer -( - .param .b64 cudaGetParameterBuffer_param_0, - .param .b64 cudaGetParameterBuffer_param_1 -) -; -.extern .func (.param .b32 func_retval0) cudaDeviceSynchronize -( - -) -; -.global .align 1 .b8 constDeltaForeach1[32]; -.global .align 1 .b8 constDeltaForeach4[32] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}; - -.visible .func (.param .b32 func_retval0) __shfl_i32( - .param .b32 __shfl_i32_param_0, - .param .b32 __shfl_i32_param_1 -) -{ - .reg .s32 %r<4>; - - - ld.param.u32 %r2, [__shfl_i32_param_0]; - ld.param.u32 %r3, [__shfl_i32_param_1]; - // inline asm - shfl.idx.b32 %r1, %r2, %r3, 0x1f; - // inline asm - st.param.b32 [func_retval0+0], %r1; - ret; -} - -.visible .func (.param .b32 func_retval0) __shfl_xor_float( - .param .b32 __shfl_xor_float_param_0, - .param .b32 __shfl_xor_float_param_1 -) -{ - .reg .s32 %r<2>; - .reg .f32 %f<3>; - - - ld.param.f32 %f2, [__shfl_xor_float_param_0]; - ld.param.u32 %r1, [__shfl_xor_float_param_1]; - // inline asm - shfl.bfly.b32 %f1, %f2, %r1, 0x1f; - // inline asm - st.param.f32 [func_retval0+0], %f1; - ret; -} - -.visible .func (.param .b32 func_retval0) __shfl_xor_i32( - .param .b32 __shfl_xor_i32_param_0, - .param .b32 __shfl_xor_i32_param_1 -) -{ - .reg .s32 %r<4>; - - - ld.param.u32 %r2, [__shfl_xor_i32_param_0]; - ld.param.u32 %r3, [__shfl_xor_i32_param_1]; - // inline asm - shfl.bfly.b32 %r1, %r2, %r3, 0x1f; - // inline asm - st.param.b32 [func_retval0+0], %r1; - ret; -} - -.visible .func (.param .b32 func_retval0) __fminf( - .param .b32 __fminf_param_0, - .param .b32 __fminf_param_1 -) -{ - .reg .f32 %f<4>; - - - ld.param.f32 %f2, [__fminf_param_0]; - ld.param.f32 %f3, [__fminf_param_1]; - // inline asm - min.f32 %f1, %f2, %f3; - // inline asm - st.param.f32 [func_retval0+0], %f1; - ret; -} - -.visible .func (.param .b32 func_retval0) __fmaxf( - .param .b32 __fmaxf_param_0, - .param .b32 __fmaxf_param_1 -) -{ - .reg .f32 %f<4>; - - - ld.param.f32 %f2, [__fmaxf_param_0]; - ld.param.f32 %f3, [__fmaxf_param_1]; - // inline asm - max.f32 %f1, %f2, %f3; - // inline asm - st.param.f32 [func_retval0+0], %f1; - ret; -} - -.visible .func (.param .b32 func_retval0) __ballot( - .param .b32 __ballot_param_0 -) -{ - .reg .s32 %r<3>; - - - ld.param.u8 %r2, [__ballot_param_0]; - // inline asm - { .reg .pred %p1; - setp.ne.u32 %p1, %r2, 0; - vote.ballot.b32 %r1, %p1; - } - // inline asm - st.param.b32 [func_retval0+0], %r1; - ret; -} - -.visible .func (.param .b32 func_retval0) __lanemask_lt( - -) -{ - .reg .s32 %r<2>; - - - // inline asm - mov.u32 %r1, %lanemask_lt; - // inline asm - st.param.b32 [func_retval0+0], %r1; - ret; -} - -.visible .func (.param .b64 func_retval0) ISPCAlloc( - .param .b64 ISPCAlloc_param_0, - .param .b64 ISPCAlloc_param_1, - .param .b32 ISPCAlloc_param_2 -) -{ - .reg .s64 %rd<2>; - - - mov.u64 %rd1, 1; - st.param.b64 [func_retval0+0], %rd1; - ret; -} - -.visible .func (.param .b64 func_retval0) ISPCGetParamBuffer( - .param .b64 ISPCGetParamBuffer_param_0, - .param .b64 ISPCGetParamBuffer_param_1, - .param .b64 ISPCGetParamBuffer_param_2 -) -{ - .reg .pred %p<2>; - .reg .s32 %r<3>; - .reg .s64 %rd<7>; - - - ld.param.u64 %rd3, [ISPCGetParamBuffer_param_1]; - ld.param.u64 %rd4, [ISPCGetParamBuffer_param_2]; - mov.u32 %r1, %tid.x; - and.b32 %r2, %r1, 31; - setp.ne.s32 %p1, %r2, 0; - mov.u64 %rd6, 0; - @%p1 bra BB8_2; - - // Callseq Start 0 - { - .reg .b32 temp_param_reg; - .param .b64 param0; - st.param.b64 [param0+0], %rd3; - .param .b64 param1; - st.param.b64 [param1+0], %rd4; - .param .b64 retval0; - call.uni (retval0), - cudaGetParameterBuffer, - ( - param0, - param1 - ); - ld.param.b64 %rd6, [retval0+0]; - } - // Callseq End 0 - -BB8_2: - st.param.b64 [func_retval0+0], %rd6; - ret; -} - -.visible .func ISPCLaunch( - .param .b64 ISPCLaunch_param_0, - .param .b64 ISPCLaunch_param_1, - .param .b64 ISPCLaunch_param_2, - .param .b32 ISPCLaunch_param_3, - .param .b32 ISPCLaunch_param_4, - .param .b32 ISPCLaunch_param_5 -) -{ - .reg .pred %p<2>; - .reg .s32 %r<16>; - .reg .s64 %rd<6>; - - - ld.param.u64 %rd1, [ISPCLaunch_param_1]; - ld.param.u64 %rd2, [ISPCLaunch_param_2]; - ld.param.u32 %r1, [ISPCLaunch_param_3]; - ld.param.u32 %r2, [ISPCLaunch_param_4]; - ld.param.u32 %r3, [ISPCLaunch_param_5]; - mov.u32 %r4, %tid.x; - and.b32 %r5, %r4, 31; - setp.ne.s32 %p1, %r5, 0; - @%p1 bra BB9_2; - - add.s32 %r14, %r1, -1; - shr.s32 %r15, %r14, 2; - add.s32 %r7, %r15, 1; - mov.u32 %r12, 1; - mov.u32 %r10, 128; - mov.u32 %r13, 0; - mov.u64 %rd5, 0; - // inline asm - { - .param .b64 param0; - st.param.b64 [param0+0], %rd1; - .param .b64 param1; - st.param.b64 [param1+0], %rd2; - .param .align 4 .b8 param2[12]; - st.param.b32 [param2+0], %r7; - st.param.b32 [param2+4], %r2; - st.param.b32 [param2+8], %r3; - .param .align 4 .b8 param3[12]; - st.param.b32 [param3+0], %r10; - st.param.b32 [param3+4], %r12; - st.param.b32 [param3+8], %r12; - .param .b32 param4; - st.param.b32 [param4+0], %r13; - .param .b64 param5; - st.param.b64 [param5+0], %rd5; - - .param .b32 retval0; - call.uni (retval0), - cudaLaunchDevice, - ( - param0, - param1, - param2, - param3, - param4, - param5 - ); - ld.param.b32 %r6, [retval0+0]; - } - - // inline asm - -BB9_2: - ret; -} - -.visible .func ISPCSync( - .param .b64 ISPCSync_param_0 -) -{ - .reg .s32 %r<2>; - - - // Callseq Start 1 - { - .reg .b32 temp_param_reg; - .param .b32 retval0; - call.uni (retval0), - cudaDeviceSynchronize, - ( - ); - ld.param.b32 %r1, [retval0+0]; - } - // Callseq End 1 - ret; -} - -.visible .func (.param .b64 func_retval0) __warpBinExclusiveScan( - .param .b32 __warpBinExclusiveScan_param_0 -) -{ - .reg .s32 %r<8>; - .reg .s64 %rd<5>; - - - ld.param.u8 %r2, [__warpBinExclusiveScan_param_0]; - // inline asm - { .reg .pred %p1; - setp.ne.u32 %p1, %r2, 0; - vote.ballot.b32 %r1, %p1; - } - // inline asm - // inline asm - popc.b32 %r3, %r1; - // inline asm - // inline asm - mov.u32 %r5, %lanemask_lt; - // inline asm - and.b32 %r7, %r5, %r1; - // inline asm - popc.b32 %r6, %r7; - // inline asm - cvt.u64.u32 %rd1, %r6; - shl.b64 %rd2, %rd1, 32; - cvt.u64.u32 %rd3, %r3; - or.b64 %rd4, %rd2, %rd3; - st.param.b64 [func_retval0+0], %rd4; - ret; -} - -.entry stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_( - .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_0, - .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_1, - .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_2, - .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_3, - .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_4, - .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_5, - .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_6, - .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_7, - .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_8, - .param .u64 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_9, - .param .u64 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_10, - .param .u64 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_11, - .param .u64 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_12 -) -{ - .reg .pred %p<14>; - .reg .s32 %r<178>; - .reg .s64 %rd<96>; - .reg .f64 %fd<95>; - - - ld.param.u32 %r42, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_0]; - ld.param.u32 %r43, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_1]; - ld.param.u32 %r44, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_2]; - ld.param.u32 %r45, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_3]; - ld.param.u32 %r46, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_4]; - ld.param.u32 %r47, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_5]; - ld.param.u32 %r48, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_6]; - ld.param.u32 %r49, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_7]; - ld.param.u64 %rd2, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_9]; - ld.param.u64 %rd3, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_10]; - ld.param.u64 %rd4, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_11]; - ld.param.u64 %rd5, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_12]; - mov.u32 %r1, %ctaid.x; - shl.b32 %r50, %r1, 2; - mov.u32 %r2, %tid.x; - shr.s32 %r51, %r2, 5; - add.s32 %r52, %r51, %r50; - mov.u32 %r53, %nctaid.x; - shl.b32 %r54, %r53, 2; - setp.ge.s32 %p1, %r52, %r54; - mov.u32 %r55, %nctaid.y; - mov.u32 %r3, %ctaid.y; - setp.ge.s32 %p2, %r3, %r55; - or.pred %p3, %p1, %p2; - mov.u32 %r56, %nctaid.z; - mov.u32 %r4, %ctaid.z; - setp.ge.s32 %p4, %r4, %r56; - or.pred %p5, %p3, %p4; - @%p5 bra BB12_13; - - shl.b32 %r57, %r1, 7; - add.s32 %r58, %r2, %r57; - and.b32 %r59, %r58, -32; - add.s32 %r60, %r59, %r42; - add.s32 %r61, %r60, 32; - min.s32 %r5, %r43, %r61; - shl.b32 %r6, %r3, 3; - add.s32 %r62, %r6, %r44; - add.s32 %r7, %r62, 8; - shl.b32 %r8, %r4, 3; - add.s32 %r172, %r8, %r46; - add.s32 %r63, %r172, 8; - min.s32 %r64, %r47, %r63; - mul.lo.s32 %r10, %r49, %r48; - sub.s32 %r65, %r5, %r60; - shr.s32 %r66, %r65, 31; - shr.u32 %r67, %r66, 27; - add.s32 %r68, %r65, %r67; - and.b32 %r69, %r68, -32; - sub.s32 %r70, %r65, %r69; - sub.s32 %r11, %r5, %r70; - and.b32 %r71, %r2, 31; - cvt.u64.u32 %rd6, %r71; - mov.u64 %rd7, constDeltaForeach1; - add.s64 %rd1, %rd7, %rd6; - setp.ge.s32 %p6, %r172, %r64; - @%p6 bra BB12_13; - - min.s32 %r12, %r45, %r7; - shl.b32 %r15, %r10, 1; - neg.s32 %r16, %r15; - mul.lo.s32 %r17, %r10, 3; - mul.lo.s32 %r18, %r10, -3; - mov.u32 %r72, -9; - sub.s32 %r73, %r72, %r44; - sub.s32 %r74, %r73, %r6; - not.b32 %r75, %r45; - max.s32 %r76, %r74, %r75; - not.b32 %r19, %r76; - sub.s32 %r77, %r72, %r46; - sub.s32 %r78, %r77, %r8; - not.b32 %r79, %r47; - max.s32 %r80, %r78, %r79; - not.b32 %r20, %r80; - ld.global.u8 %r13, [%rd1]; - mov.u32 %r171, %r172; - -BB12_3: - mov.u32 %r21, %r171; - add.s32 %r23, %r21, %r13; - setp.ge.s32 %p7, %r62, %r12; - @%p7 bra BB12_12; - - mul.lo.s32 %r24, %r23, %r10; - mov.u32 %r174, %r62; - mov.u32 %r173, %r62; - -BB12_5: - mov.u32 %r27, %r173; - add.s32 %r30, %r27, %r13; - setp.ge.s32 %p8, %r60, %r11; - mov.u32 %r176, %r60; - @%p8 bra BB12_8; - - mov.u64 %rd9, constDeltaForeach4; - add.s64 %rd10, %rd9, %rd6; - ld.global.u8 %r31, [%rd10]; - mad.lo.s32 %r32, %r30, %r48, %r24; - add.s32 %r177, %r59, %r42; - -BB12_7: - cvta.to.global.u64 %rd11, %rd2; - add.s32 %r98, %r32, %r177; - add.s32 %r99, %r98, %r31; - shl.b32 %r100, %r99, 3; - cvt.s64.s32 %rd12, %r100; - add.s64 %rd13, %rd12, %rd4; - add.s32 %r101, %r100, 8; - cvt.s64.s32 %rd14, %r101; - add.s64 %rd15, %rd14, %rd4; - add.s32 %r102, %r100, -8; - cvt.s64.s32 %rd16, %r102; - add.s64 %rd17, %rd16, %rd4; - add.s32 %r103, %r99, %r48; - shl.b32 %r104, %r103, 3; - cvt.s64.s32 %rd18, %r104; - add.s64 %rd19, %rd18, %rd4; - sub.s32 %r105, %r99, %r48; - shl.b32 %r106, %r105, 3; - cvt.s64.s32 %rd20, %r106; - add.s64 %rd21, %rd20, %rd4; - add.s32 %r108, %r99, %r10; - shl.b32 %r109, %r108, 3; - cvt.s64.s32 %rd22, %r109; - add.s64 %rd23, %rd22, %rd4; - sub.s32 %r110, %r99, %r10; - shl.b32 %r111, %r110, 3; - cvt.s64.s32 %rd24, %r111; - add.s64 %rd25, %rd24, %rd4; - add.s32 %r112, %r100, 16; - cvt.s64.s32 %rd26, %r112; - add.s64 %rd27, %rd26, %rd4; - add.s32 %r113, %r100, -16; - cvt.s64.s32 %rd28, %r113; - add.s64 %rd29, %rd28, %rd4; - shl.b32 %r114, %r48, 1; - add.s32 %r115, %r99, %r114; - shl.b32 %r116, %r115, 3; - cvt.s64.s32 %rd30, %r116; - add.s64 %rd31, %rd30, %rd4; - mad.lo.s32 %r117, %r48, -2, %r99; - shl.b32 %r118, %r117, 3; - cvt.s64.s32 %rd32, %r118; - add.s64 %rd33, %rd32, %rd4; - add.s32 %r119, %r99, %r15; - shl.b32 %r120, %r119, 3; - cvt.s64.s32 %rd34, %r120; - add.s64 %rd35, %rd34, %rd4; - add.s32 %r121, %r99, %r16; - shl.b32 %r122, %r121, 3; - cvt.s64.s32 %rd36, %r122; - add.s64 %rd37, %rd36, %rd4; - add.s32 %r123, %r100, 24; - cvt.s64.s32 %rd38, %r123; - add.s64 %rd39, %rd38, %rd4; - add.s32 %r124, %r100, -24; - cvt.s64.s32 %rd40, %r124; - add.s64 %rd41, %rd40, %rd4; - mad.lo.s32 %r125, %r48, 3, %r99; - shl.b32 %r126, %r125, 3; - cvt.s64.s32 %rd42, %r126; - add.s64 %rd43, %rd42, %rd4; - mad.lo.s32 %r127, %r48, -3, %r99; - shl.b32 %r128, %r127, 3; - cvt.s64.s32 %rd44, %r128; - add.s64 %rd45, %rd44, %rd4; - add.s32 %r129, %r99, %r17; - shl.b32 %r130, %r129, 3; - cvt.s64.s32 %rd46, %r130; - add.s64 %rd47, %rd46, %rd4; - add.s32 %r131, %r99, %r18; - shl.b32 %r132, %r131, 3; - cvt.s64.s32 %rd48, %r132; - add.s64 %rd49, %rd48, %rd4; - add.s64 %rd50, %rd12, %rd5; - add.s64 %rd51, %rd12, %rd3; - ld.f64 %fd1, [%rd13]; - add.f64 %fd2, %fd1, %fd1; - ld.f64 %fd3, [%rd50]; - sub.f64 %fd4, %fd2, %fd3; - ld.global.f64 %fd5, [%rd11]; - ld.f64 %fd6, [%rd17]; - ld.f64 %fd7, [%rd15]; - add.f64 %fd8, %fd7, %fd6; - ld.f64 %fd9, [%rd19]; - add.f64 %fd10, %fd8, %fd9; - ld.f64 %fd11, [%rd21]; - add.f64 %fd12, %fd10, %fd11; - ld.f64 %fd13, [%rd23]; - add.f64 %fd14, %fd12, %fd13; - ld.f64 %fd15, [%rd25]; - add.f64 %fd16, %fd14, %fd15; - ld.global.f64 %fd17, [%rd11+8]; - mul.f64 %fd18, %fd17, %fd16; - fma.rn.f64 %fd19, %fd5, %fd1, %fd18; - ld.f64 %fd20, [%rd29]; - ld.f64 %fd21, [%rd27]; - add.f64 %fd22, %fd21, %fd20; - ld.f64 %fd23, [%rd31]; - add.f64 %fd24, %fd22, %fd23; - ld.f64 %fd25, [%rd33]; - add.f64 %fd26, %fd24, %fd25; - ld.f64 %fd27, [%rd35]; - add.f64 %fd28, %fd26, %fd27; - ld.f64 %fd29, [%rd37]; - add.f64 %fd30, %fd28, %fd29; - ld.global.f64 %fd31, [%rd11+16]; - fma.rn.f64 %fd32, %fd31, %fd30, %fd19; - ld.f64 %fd33, [%rd41]; - ld.f64 %fd34, [%rd39]; - add.f64 %fd35, %fd34, %fd33; - ld.f64 %fd36, [%rd43]; - add.f64 %fd37, %fd35, %fd36; - ld.f64 %fd38, [%rd45]; - add.f64 %fd39, %fd37, %fd38; - ld.f64 %fd40, [%rd47]; - add.f64 %fd41, %fd39, %fd40; - ld.f64 %fd42, [%rd49]; - add.f64 %fd43, %fd41, %fd42; - ld.global.f64 %fd44, [%rd11+24]; - fma.rn.f64 %fd45, %fd44, %fd43, %fd32; - ld.f64 %fd46, [%rd51]; - fma.rn.f64 %fd47, %fd46, %fd45, %fd4; - st.f64 [%rd50], %fd47; - add.s32 %r177, %r177, 32; - setp.lt.s32 %p9, %r177, %r11; - mov.u32 %r175, %r177; - mov.u32 %r176, %r175; - @%p9 bra BB12_7; - -BB12_8: - mov.u32 %r36, %r176; - setp.ge.s32 %p10, %r36, %r5; - @%p10 bra BB12_11; - - mov.u64 %rd53, constDeltaForeach4; - add.s64 %rd54, %rd53, %rd6; - ld.global.u8 %r135, [%rd54]; - add.s32 %r37, %r36, %r135; - setp.ge.s32 %p11, %r37, %r5; - @%p11 bra BB12_11; - - cvta.to.global.u64 %rd55, %rd2; - mad.lo.s32 %r136, %r30, %r48, %r24; - add.s32 %r137, %r136, %r37; - shl.b32 %r138, %r137, 3; - cvt.s64.s32 %rd56, %r138; - add.s64 %rd57, %rd56, %rd4; - add.s32 %r139, %r138, 8; - cvt.s64.s32 %rd58, %r139; - add.s64 %rd59, %rd58, %rd4; - add.s32 %r140, %r138, -8; - cvt.s64.s32 %rd60, %r140; - add.s64 %rd61, %rd60, %rd4; - add.s32 %r141, %r137, %r48; - shl.b32 %r142, %r141, 3; - cvt.s64.s32 %rd62, %r142; - add.s64 %rd63, %rd62, %rd4; - sub.s32 %r143, %r137, %r48; - shl.b32 %r144, %r143, 3; - cvt.s64.s32 %rd64, %r144; - add.s64 %rd65, %rd64, %rd4; - add.s32 %r146, %r137, %r10; - shl.b32 %r147, %r146, 3; - cvt.s64.s32 %rd66, %r147; - add.s64 %rd67, %rd66, %rd4; - sub.s32 %r148, %r137, %r10; - shl.b32 %r149, %r148, 3; - cvt.s64.s32 %rd68, %r149; - add.s64 %rd69, %rd68, %rd4; - add.s32 %r150, %r138, 16; - cvt.s64.s32 %rd70, %r150; - add.s64 %rd71, %rd70, %rd4; - add.s32 %r151, %r138, -16; - cvt.s64.s32 %rd72, %r151; - add.s64 %rd73, %rd72, %rd4; - shl.b32 %r152, %r48, 1; - add.s32 %r153, %r137, %r152; - shl.b32 %r154, %r153, 3; - cvt.s64.s32 %rd74, %r154; - add.s64 %rd75, %rd74, %rd4; - mad.lo.s32 %r155, %r48, -2, %r137; - shl.b32 %r156, %r155, 3; - cvt.s64.s32 %rd76, %r156; - add.s64 %rd77, %rd76, %rd4; - add.s32 %r157, %r137, %r15; - shl.b32 %r158, %r157, 3; - cvt.s64.s32 %rd78, %r158; - add.s64 %rd79, %rd78, %rd4; - add.s32 %r159, %r137, %r16; - shl.b32 %r160, %r159, 3; - cvt.s64.s32 %rd80, %r160; - add.s64 %rd81, %rd80, %rd4; - add.s32 %r161, %r138, 24; - cvt.s64.s32 %rd82, %r161; - add.s64 %rd83, %rd82, %rd4; - add.s32 %r162, %r138, -24; - cvt.s64.s32 %rd84, %r162; - add.s64 %rd85, %rd84, %rd4; - mad.lo.s32 %r163, %r48, 3, %r137; - shl.b32 %r164, %r163, 3; - cvt.s64.s32 %rd86, %r164; - add.s64 %rd87, %rd86, %rd4; - mad.lo.s32 %r165, %r48, -3, %r137; - shl.b32 %r166, %r165, 3; - cvt.s64.s32 %rd88, %r166; - add.s64 %rd89, %rd88, %rd4; - add.s32 %r167, %r137, %r17; - shl.b32 %r168, %r167, 3; - cvt.s64.s32 %rd90, %r168; - add.s64 %rd91, %rd90, %rd4; - add.s32 %r169, %r137, %r18; - shl.b32 %r170, %r169, 3; - cvt.s64.s32 %rd92, %r170; - add.s64 %rd93, %rd92, %rd4; - add.s64 %rd94, %rd56, %rd5; - add.s64 %rd95, %rd56, %rd3; - ld.f64 %fd48, [%rd57]; - add.f64 %fd49, %fd48, %fd48; - ld.f64 %fd50, [%rd94]; - sub.f64 %fd51, %fd49, %fd50; - ld.global.f64 %fd52, [%rd55]; - ld.f64 %fd53, [%rd61]; - ld.f64 %fd54, [%rd59]; - add.f64 %fd55, %fd54, %fd53; - ld.f64 %fd56, [%rd63]; - add.f64 %fd57, %fd55, %fd56; - ld.f64 %fd58, [%rd65]; - add.f64 %fd59, %fd57, %fd58; - ld.f64 %fd60, [%rd67]; - add.f64 %fd61, %fd59, %fd60; - ld.f64 %fd62, [%rd69]; - add.f64 %fd63, %fd61, %fd62; - ld.global.f64 %fd64, [%rd55+8]; - mul.f64 %fd65, %fd64, %fd63; - fma.rn.f64 %fd66, %fd52, %fd48, %fd65; - ld.f64 %fd67, [%rd73]; - ld.f64 %fd68, [%rd71]; - add.f64 %fd69, %fd68, %fd67; - ld.f64 %fd70, [%rd75]; - add.f64 %fd71, %fd69, %fd70; - ld.f64 %fd72, [%rd77]; - add.f64 %fd73, %fd71, %fd72; - ld.f64 %fd74, [%rd79]; - add.f64 %fd75, %fd73, %fd74; - ld.f64 %fd76, [%rd81]; - add.f64 %fd77, %fd75, %fd76; - ld.global.f64 %fd78, [%rd55+16]; - fma.rn.f64 %fd79, %fd78, %fd77, %fd66; - ld.f64 %fd80, [%rd85]; - ld.f64 %fd81, [%rd83]; - add.f64 %fd82, %fd81, %fd80; - ld.f64 %fd83, [%rd87]; - add.f64 %fd84, %fd82, %fd83; - ld.f64 %fd85, [%rd89]; - add.f64 %fd86, %fd84, %fd85; - ld.f64 %fd87, [%rd91]; - add.f64 %fd88, %fd86, %fd87; - ld.f64 %fd89, [%rd93]; - add.f64 %fd90, %fd88, %fd89; - ld.global.f64 %fd91, [%rd55+24]; - fma.rn.f64 %fd92, %fd91, %fd90, %fd79; - ld.f64 %fd93, [%rd95]; - fma.rn.f64 %fd94, %fd92, %fd93, %fd51; - st.f64 [%rd94], %fd94; - -BB12_11: - add.s32 %r39, %r174, 1; - setp.ne.s32 %p12, %r39, %r19; - mov.u32 %r174, %r39; - mov.u32 %r173, %r39; - @%p12 bra BB12_5; - -BB12_12: - add.s32 %r171, %r172, 1; - setp.ne.s32 %p13, %r171, %r20; - mov.u32 %r172, %r171; - @%p13 bra BB12_3; - -BB12_13: - ret; -} - -.visible .func loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E_( - .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_0, - .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_1, - .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_2, - .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_3, - .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_4, - .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_5, - .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_6, - .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_7, - .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_8, - .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_9, - .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_10, - .param .b64 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_11, - .param .b64 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_12, - .param .b64 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_13, - .param .b64 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_14, - .param .align 1 .b8 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_15[1] -) -{ - .reg .pred %p<9>; - .reg .s32 %r<63>; - .reg .s64 %rd<18>; - - - ld.param.u32 %r62, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_0]; - ld.param.u32 %r12, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_1]; - ld.param.u32 %r13, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_2]; - ld.param.u32 %r14, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_3]; - ld.param.u32 %r15, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_4]; - ld.param.u32 %r16, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_5]; - ld.param.u32 %r17, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_6]; - ld.param.u32 %r18, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_7]; - ld.param.u32 %r19, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_8]; - ld.param.u32 %r20, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_9]; - ld.param.u32 %r21, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_10]; - ld.param.u64 %rd4, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_11]; - ld.param.u64 %rd5, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_12]; - ld.param.u64 %rd6, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_13]; - ld.param.u64 %rd7, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_14]; - setp.ge.s32 %p1, %r62, %r12; - @%p1 bra BB13_14; - - mov.u32 %r22, 31; - sub.s32 %r23, %r22, %r13; - add.s32 %r24, %r23, %r14; - shr.s32 %r25, %r24, 31; - shr.u32 %r26, %r25, 27; - add.s32 %r27, %r24, %r26; - shr.s32 %r28, %r27, 5; - mov.u32 %r29, 7; - sub.s32 %r30, %r29, %r15; - add.s32 %r31, %r30, %r16; - shr.s32 %r32, %r31, 31; - shr.u32 %r33, %r32, 29; - add.s32 %r34, %r31, %r33; - shr.s32 %r1, %r34, 3; - sub.s32 %r35, %r29, %r17; - add.s32 %r36, %r35, %r18; - shr.s32 %r37, %r36, 31; - shr.u32 %r38, %r37, 29; - add.s32 %r39, %r36, %r38; - shr.s32 %r2, %r39, 3; - add.s32 %r40, %r28, -1; - shr.s32 %r41, %r40, 2; - add.s32 %r3, %r41, 1; - mov.u32 %r42, %tid.x; - and.b32 %r4, %r42, 31; - sub.s32 %r61, %r62, %r12; - -BB13_2: - and.b32 %r8, %r62, 1; - setp.ne.s32 %p2, %r4, 0; - mov.u64 %rd17, 0; - @%p2 bra BB13_4; - - mov.u64 %rd9, 8; - mov.u64 %rd10, 72; - // Callseq Start 2 - { - .reg .b32 temp_param_reg; - .param .b64 param0; - st.param.b64 [param0+0], %rd9; - .param .b64 param1; - st.param.b64 [param1+0], %rd10; - .param .b64 retval0; - call.uni (retval0), - cudaGetParameterBuffer, - ( - param0, - param1 - ); - ld.param.b64 %rd17, [retval0+0]; - } - // Callseq End 2 - -BB13_4: - setp.eq.s32 %p3, %r8, 0; - @%p3 bra BB13_9; - - setp.eq.s64 %p4, %rd17, 0; - @%p4 bra BB13_7; - - st.u32 [%rd17], %r13; - st.u32 [%rd17+4], %r14; - st.u32 [%rd17+8], %r15; - st.u32 [%rd17+12], %r16; - st.u32 [%rd17+16], %r17; - st.u32 [%rd17+20], %r18; - st.u32 [%rd17+24], %r19; - st.u32 [%rd17+28], %r20; - st.u32 [%rd17+32], %r21; - st.u64 [%rd17+40], %rd4; - st.u64 [%rd17+48], %rd5; - st.u64 [%rd17+56], %rd7; - st.u64 [%rd17+64], %rd6; - -BB13_7: - @%p2 bra BB13_13; - - mov.u32 %r47, 128; - mov.u32 %r49, 1; - mov.u32 %r50, 0; - mov.u64 %rd13, 0; - mov.u64 %rd11, stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_; - // inline asm - { - .param .b64 param0; - st.param.b64 [param0+0], %rd11; - .param .b64 param1; - st.param.b64 [param1+0], %rd17; - .param .align 4 .b8 param2[12]; - st.param.b32 [param2+0], %r3; - st.param.b32 [param2+4], %r1; - st.param.b32 [param2+8], %r2; - .param .align 4 .b8 param3[12]; - st.param.b32 [param3+0], %r47; - st.param.b32 [param3+4], %r49; - st.param.b32 [param3+8], %r49; - .param .b32 param4; - st.param.b32 [param4+0], %r50; - .param .b64 param5; - st.param.b64 [param5+0], %rd13; - - .param .b32 retval0; - call.uni (retval0), - cudaLaunchDevice, - ( - param0, - param1, - param2, - param3, - param4, - param5 - ); - ld.param.b32 %r43, [retval0+0]; - } - - // inline asm - bra.uni BB13_13; - -BB13_9: - setp.eq.s64 %p6, %rd17, 0; - @%p6 bra BB13_11; - - st.u32 [%rd17], %r13; - st.u32 [%rd17+4], %r14; - st.u32 [%rd17+8], %r15; - st.u32 [%rd17+12], %r16; - st.u32 [%rd17+16], %r17; - st.u32 [%rd17+20], %r18; - st.u32 [%rd17+24], %r19; - st.u32 [%rd17+28], %r20; - st.u32 [%rd17+32], %r21; - st.u64 [%rd17+40], %rd4; - st.u64 [%rd17+48], %rd5; - st.u64 [%rd17+56], %rd6; - st.u64 [%rd17+64], %rd7; - -BB13_11: - @%p2 bra BB13_13; - - mov.u32 %r55, 128; - mov.u32 %r57, 1; - mov.u32 %r58, 0; - mov.u64 %rd16, 0; - mov.u64 %rd14, stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_; - // inline asm - { - .param .b64 param0; - st.param.b64 [param0+0], %rd14; - .param .b64 param1; - st.param.b64 [param1+0], %rd17; - .param .align 4 .b8 param2[12]; - st.param.b32 [param2+0], %r3; - st.param.b32 [param2+4], %r1; - st.param.b32 [param2+8], %r2; - .param .align 4 .b8 param3[12]; - st.param.b32 [param3+0], %r55; - st.param.b32 [param3+4], %r57; - st.param.b32 [param3+8], %r57; - .param .b32 param4; - st.param.b32 [param4+0], %r58; - .param .b64 param5; - st.param.b64 [param5+0], %rd16; - - .param .b32 retval0; - call.uni (retval0), - cudaLaunchDevice, - ( - param0, - param1, - param2, - param3, - param4, - param5 - ); - ld.param.b32 %r51, [retval0+0]; - } - - // inline asm - -BB13_13: - // Callseq Start 3 - { - .reg .b32 temp_param_reg; - .param .b32 retval0; - call.uni (retval0), - cudaDeviceSynchronize, - ( - ); - ld.param.b32 %r59, [retval0+0]; - } - // Callseq End 3 - add.s32 %r62, %r62, 1; - add.s32 %r61, %r61, 1; - setp.ne.s32 %p8, %r61, 0; - @%p8 bra BB13_2; - -BB13_14: - // Callseq Start 4 - { - .reg .b32 temp_param_reg; - .param .b32 retval0; - call.uni (retval0), - cudaDeviceSynchronize, - ( - ); - ld.param.b32 %r60, [retval0+0]; - } - // Callseq End 4 - ret; -} - -.visible .entry loop_stencil_ispc_tasks( - .param .u32 loop_stencil_ispc_tasks_param_0, - .param .u32 loop_stencil_ispc_tasks_param_1, - .param .u32 loop_stencil_ispc_tasks_param_2, - .param .u32 loop_stencil_ispc_tasks_param_3, - .param .u32 loop_stencil_ispc_tasks_param_4, - .param .u32 loop_stencil_ispc_tasks_param_5, - .param .u32 loop_stencil_ispc_tasks_param_6, - .param .u32 loop_stencil_ispc_tasks_param_7, - .param .u32 loop_stencil_ispc_tasks_param_8, - .param .u32 loop_stencil_ispc_tasks_param_9, - .param .u32 loop_stencil_ispc_tasks_param_10, - .param .u64 loop_stencil_ispc_tasks_param_11, - .param .u64 loop_stencil_ispc_tasks_param_12, - .param .u64 loop_stencil_ispc_tasks_param_13, - .param .u64 loop_stencil_ispc_tasks_param_14 -) -{ - .reg .pred %p<9>; - .reg .s32 %r<63>; - .reg .s64 %rd<18>; - - - ld.param.u32 %r62, [loop_stencil_ispc_tasks_param_0]; - ld.param.u32 %r12, [loop_stencil_ispc_tasks_param_1]; - ld.param.u32 %r13, [loop_stencil_ispc_tasks_param_2]; - ld.param.u32 %r14, [loop_stencil_ispc_tasks_param_3]; - ld.param.u32 %r15, [loop_stencil_ispc_tasks_param_4]; - ld.param.u32 %r16, [loop_stencil_ispc_tasks_param_5]; - ld.param.u32 %r17, [loop_stencil_ispc_tasks_param_6]; - ld.param.u32 %r18, [loop_stencil_ispc_tasks_param_7]; - ld.param.u32 %r19, [loop_stencil_ispc_tasks_param_8]; - ld.param.u32 %r20, [loop_stencil_ispc_tasks_param_9]; - ld.param.u32 %r21, [loop_stencil_ispc_tasks_param_10]; - ld.param.u64 %rd4, [loop_stencil_ispc_tasks_param_11]; - ld.param.u64 %rd5, [loop_stencil_ispc_tasks_param_12]; - ld.param.u64 %rd6, [loop_stencil_ispc_tasks_param_13]; - ld.param.u64 %rd7, [loop_stencil_ispc_tasks_param_14]; - setp.ge.s32 %p1, %r62, %r12; - @%p1 bra BB14_14; - - mov.u32 %r22, 31; - sub.s32 %r23, %r22, %r13; - add.s32 %r24, %r23, %r14; - shr.s32 %r25, %r24, 31; - shr.u32 %r26, %r25, 27; - add.s32 %r27, %r24, %r26; - shr.s32 %r28, %r27, 5; - mov.u32 %r29, 7; - sub.s32 %r30, %r29, %r15; - add.s32 %r31, %r30, %r16; - shr.s32 %r32, %r31, 31; - shr.u32 %r33, %r32, 29; - add.s32 %r34, %r31, %r33; - shr.s32 %r1, %r34, 3; - sub.s32 %r35, %r29, %r17; - add.s32 %r36, %r35, %r18; - shr.s32 %r37, %r36, 31; - shr.u32 %r38, %r37, 29; - add.s32 %r39, %r36, %r38; - shr.s32 %r2, %r39, 3; - add.s32 %r40, %r28, -1; - shr.s32 %r41, %r40, 2; - add.s32 %r3, %r41, 1; - mov.u32 %r42, %tid.x; - and.b32 %r4, %r42, 31; - sub.s32 %r61, %r62, %r12; - -BB14_2: - and.b32 %r8, %r62, 1; - setp.ne.s32 %p2, %r4, 0; - mov.u64 %rd17, 0; - @%p2 bra BB14_4; - - mov.u64 %rd9, 8; - mov.u64 %rd10, 72; - // Callseq Start 5 - { - .reg .b32 temp_param_reg; - .param .b64 param0; - st.param.b64 [param0+0], %rd9; - .param .b64 param1; - st.param.b64 [param1+0], %rd10; - .param .b64 retval0; - call.uni (retval0), - cudaGetParameterBuffer, - ( - param0, - param1 - ); - ld.param.b64 %rd17, [retval0+0]; - } - // Callseq End 5 - -BB14_4: - setp.eq.s32 %p3, %r8, 0; - @%p3 bra BB14_9; - - setp.eq.s64 %p4, %rd17, 0; - @%p4 bra BB14_7; - - st.u32 [%rd17], %r13; - st.u32 [%rd17+4], %r14; - st.u32 [%rd17+8], %r15; - st.u32 [%rd17+12], %r16; - st.u32 [%rd17+16], %r17; - st.u32 [%rd17+20], %r18; - st.u32 [%rd17+24], %r19; - st.u32 [%rd17+28], %r20; - st.u32 [%rd17+32], %r21; - st.u64 [%rd17+40], %rd4; - st.u64 [%rd17+48], %rd5; - st.u64 [%rd17+56], %rd7; - st.u64 [%rd17+64], %rd6; - -BB14_7: - @%p2 bra BB14_13; - - mov.u32 %r47, 128; - mov.u32 %r49, 1; - mov.u32 %r50, 0; - mov.u64 %rd13, 0; - mov.u64 %rd11, stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_; - // inline asm - { - .param .b64 param0; - st.param.b64 [param0+0], %rd11; - .param .b64 param1; - st.param.b64 [param1+0], %rd17; - .param .align 4 .b8 param2[12]; - st.param.b32 [param2+0], %r3; - st.param.b32 [param2+4], %r1; - st.param.b32 [param2+8], %r2; - .param .align 4 .b8 param3[12]; - st.param.b32 [param3+0], %r47; - st.param.b32 [param3+4], %r49; - st.param.b32 [param3+8], %r49; - .param .b32 param4; - st.param.b32 [param4+0], %r50; - .param .b64 param5; - st.param.b64 [param5+0], %rd13; - - .param .b32 retval0; - call.uni (retval0), - cudaLaunchDevice, - ( - param0, - param1, - param2, - param3, - param4, - param5 - ); - ld.param.b32 %r43, [retval0+0]; - } - - // inline asm - bra.uni BB14_13; - -BB14_9: - setp.eq.s64 %p6, %rd17, 0; - @%p6 bra BB14_11; - - st.u32 [%rd17], %r13; - st.u32 [%rd17+4], %r14; - st.u32 [%rd17+8], %r15; - st.u32 [%rd17+12], %r16; - st.u32 [%rd17+16], %r17; - st.u32 [%rd17+20], %r18; - st.u32 [%rd17+24], %r19; - st.u32 [%rd17+28], %r20; - st.u32 [%rd17+32], %r21; - st.u64 [%rd17+40], %rd4; - st.u64 [%rd17+48], %rd5; - st.u64 [%rd17+56], %rd6; - st.u64 [%rd17+64], %rd7; - -BB14_11: - @%p2 bra BB14_13; - - mov.u32 %r55, 128; - mov.u32 %r57, 1; - mov.u32 %r58, 0; - mov.u64 %rd16, 0; - mov.u64 %rd14, stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_; - // inline asm - { - .param .b64 param0; - st.param.b64 [param0+0], %rd14; - .param .b64 param1; - st.param.b64 [param1+0], %rd17; - .param .align 4 .b8 param2[12]; - st.param.b32 [param2+0], %r3; - st.param.b32 [param2+4], %r1; - st.param.b32 [param2+8], %r2; - .param .align 4 .b8 param3[12]; - st.param.b32 [param3+0], %r55; - st.param.b32 [param3+4], %r57; - st.param.b32 [param3+8], %r57; - .param .b32 param4; - st.param.b32 [param4+0], %r58; - .param .b64 param5; - st.param.b64 [param5+0], %rd16; - - .param .b32 retval0; - call.uni (retval0), - cudaLaunchDevice, - ( - param0, - param1, - param2, - param3, - param4, - param5 - ); - ld.param.b32 %r51, [retval0+0]; - } - - // inline asm - -BB14_13: - // Callseq Start 6 - { - .reg .b32 temp_param_reg; - .param .b32 retval0; - call.uni (retval0), - cudaDeviceSynchronize, - ( - ); - ld.param.b32 %r59, [retval0+0]; - } - // Callseq End 6 - add.s32 %r62, %r62, 1; - add.s32 %r61, %r61, 1; - setp.ne.s32 %p8, %r61, 0; - @%p8 bra BB14_2; - -BB14_14: - // Callseq Start 7 - { - .reg .b32 temp_param_reg; - .param .b32 retval0; - call.uni (retval0), - cudaDeviceSynchronize, - ( - ); - ld.param.b32 %r60, [retval0+0]; - } - // Callseq End 7 - ret; -} - - - diff --git a/examples_cuda/stencil/libcudadevrt.a b/examples_cuda/stencil/libcudadevrt.a deleted file mode 100644 index 6cf40658ca04fd19e705db4b142117eea50c64f6..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 137338 zcmeFZc|25q^gm9rWSdm>P)P_;6tWD-UQ`sa3<-&{@57KSC1kBEBazA)BI{T}C`3`n zj4itv>+ExXSM^?dKYowz_xtJlczpjbkDB|s=e*9j=XLIDUgx}?_qh92?L6(Tsk6%N z`{(DVypp`4+zCa+eP{jsd*8mJN0ojG*s{{m(QTunqvzcG4}&?{d0%n9>E~(Z z4m+*@^S&&?O<0Oe4XtbV4glVy`673!0g@KkHVat zVaFX$C@af5IMTsjT1MwCn8Ge;Uog=(Hi8*lJbOy}0t`k+C;M-c|2cqP&WxZGx!qjGX56hM7PK_AI|cZ4k~Li>!)HhOx{ z89zSsbn0|}o&7)fL;t)FMMI2UQFlDEBtY2 zKYftzy8zhO`A380M`QcJ&Ew4XlRwp72JbMaRxbwqQ}dr1K;8eVO7g%<{g0L4cKrq= zDxgRI$CgC#{stu<_#@B_Wc(K}ogL zZ&0!(1FjJ<=)cy7|C3*FkL+)-1q8hx_ z;4iTeT$6reA~Buq+YfFgO#jM7{+|>0nXPS(Vf%lLVbkCrV+en<&$7ia0Zhu2^kU!# z<@D}te#Lknl+cSYJ-Dx7bKpn&^kU8&Oa{JHrdxU*(~7KpJNsI$+#n52nL+3c>O)@ZpG`( zh+B6s8w|8t@%rz%jL%m7xH%`b^2dGOXT{cj=Fi-6Yd`a+E4KDCTl?#OZ}+$M*IRhc z_uO&|?}_}sy*`3~BZw7l0%+jlVef-{ApE%JXSfD2$DOA^^pXAa-_b`l5Pg(rafFC~ zIQXU*->>-N*G>AKANb?na6F&ZP5RE7&8OFY$M<}SHtFm?aLK>-PuZli)PN5)|Ir^3 z8SoPil>32M{!IoFY5kL@RCDkbInAFujUQ|7Kjg%I^7Lw?f05hqlXvb1-uVx?#qWR| z{GfS_!Y^_ke)4R8V4?qzd-aodtp-=k$U@2_IKRUd{g;9*i%1{NANNBFUVu(59CpU zZuagkc;nXm10A-Bvu(}4t@-!Qb--t9{o7jqw&E|iHg3gV&*hEB}KZ+{*v9 z_8*&hSobU1Ew}%=9l+iGNie%KgWa6WC9rPU$}?6aT0_)c6lH?vH9Oz5hVt{&-sVvug1l zXxtywhvxr*#{E%!X!{=(jek@hUi%Lm?niYE{Gj`PR5T*KgK(SK->(Rq{ZD`{6!h

%fP!eQq){|;LDmP}QzhiVi@ZIo*zu|LR^8 z^ZlZ?3O3(j@W=Z_E5Nhw?-$(+&VT!yTfA69Pq&lKo&U%COSjR{(?0vxKmto==QPw(%9f4x8Uh`kS;f5t*Rhpp+L&pwJH zA7+1iRJ(rya{wQcMXE!r!~Px8yU$#WjX5QE-{IELL;K_o+`>I!d3fbhQ&N<|b*b9x z^Gkh;6%S7?rB+s`#C(-qeYJGy?1F}+li}Bjd8W&EKi+IW^_0IU+(IQw2bg(Nh@ilNhFsr~?t-pI;Z;SDrkp|IdJjAtPwauO$?>WluhISCQpRo{QfBi$ z1ygg%)r{A8_vrYA*j25eA+F+SH0C&;dAH6ai)?+d!S1HD2gwpv&7a_72bLDP zm)hgk7G-)X-r*`GyheI&t$9!Tr>|V_3V)zy%GsY)qfr^8y82B;^eX;wdl0e?`|QEl zie=OH?vNLkQ`+PAst=*a?uc@!Pm?kQuX-y+t1Q9mwqJjBD)S5Ra=jz4^r(HNI3Q!` z5u1;n>A6~4C!?s*l!80?qT*%+6S}957igH-?>hf}aE{1z{{7+JuMbX}t}X^T@}$GNq^&*Q7o~$1m=Rm_q}E;){OR zp;u)NX29do2{Os=_#O0DnV^j^4vKHlfyf})0rV%`o`(75(NXT&4J`VVC;COulJnAN zmCN7;cLC@AdcWh?sXh_4QfI^j!wNRcUPw)ixpE=UKXMhICu!z*q`jb}Nr&j3UzBuU z&tq3el}>COZ{il~4&U8c;;*W7p8alT`0i6>e5$W=-n2J$V9fRkXRt?UPc-ciwU8E; zp7L-%M&OQTHVoODA(uFq&TN+aT3yM4QVfL{@wDXNFu4iRJdo5mC6IQfJzPwU(}=a> z3eHw?K0K+lL`l_rh4iz_ijw&d33K(R^kGQqsFMG)H`H*GM(oK|%pSQqnyQk;!Mm;D zV#=IG^r-P=+u6AAByG?r^D+`<4^D~`l3EE~b+b#JDrah`x)S52-C+0B3WPf=!5Sef z9SmTu-v42X`8cgf($?HmSo(`cY4Up0E)NVSfk|YgldH7xJL2h~7 z%BP{%m#?~%z2z~fe5&y|r*q)yvJ8{8*}IGYmpjjRK>jV)msefxM94gHJ(kO}-UL?y7uNc5!ZX;Ev>6v35d{b9Nxr!R5tU9<9o!nxAvt4wx>>@K`%F z)*a_Cy(?}hZ#Vjg$?opq?~VN%0}AJ4;#}YF333tGRR4Z^kW1?xpTPs21F;XQ9PiX0 z=V<1+SZNSA+Q+MtR_b~d@@Zb$vR`APrfXo0B?!Eb0DU z!R639OZleJN5U^!?-%jgxt)zK)lMo(tg2dLB*|-4I=-trF1k=O%k|UrGclX;^Rkxm zhZ>HnHS=g!I-Z6bqR$_gm0`<_d!|h0o z=AEWqYgr0shJLE5tztF-mqJqf8uZJs+CYBJp$(#V&-9bxiHDbylzX1BTswPJ3hkrg zl2dDSH}q1>d6#2OqRBHJkHi9-ufzz~nB`2Drp$b>5(~U+5+jT;$#JSlo;fNa7Pz7n zBg}M;Wck7nVSjzr4J$=Gdm#_?DR+%`A&pix-3L>>Oftk7AoiQg$}&q*vjb#TUn~C-FWNq9c|b*>sNQrF|w*l3_-_9$+8SqXq?#R{oS63mb6g! z+Q%i94qRuJv>wcde`N7MgbQdC^LheH?LjqaB2Q&AtZ|$I9-!D9LHNwj_hfbNw12XFNmbKYnJt?#h^ArFpL?yN&R0$L{OffV{LX5*By;DKybj zFFndrrL^p2j>m8BxOmw~%FN?p@zOkRUFCF42$u4#TUz3SQG>?gb4AT%g{CGirPgrj zJ&S=?QTeR{krF;wb#D|QA4R@TrK$;XF3~=o&l5+&Ukx^EYcQG|Q#c5p`>Z@)%gDAj zpfCJ32mZjm@`bp?GRDA4w&4U_Un~}ZF|WGYuBtq0lX<2VmIn{ZN@3j=M=zHBYIjc||RpOI`JqVC$^FGuAd`WhD>JADLY=46YW8 z`1ayXvccDF0Y@C(B*~Bl!fGnVix>iTKC;FOakCb_IQ#runn9k!+J;EJ1MfW!-TKa# zHbWNK%k+HrZPh|*YO(HkO`G)nK`twpoV@2(O|>bQIFB4vEYExV30nTjE|O&XUL7*90NB2_S2bT%0-FJ)-(CtA1{5;8Y#>UX zobs-VpVD|0?RjiXx6DhO?oxQ^(C1^e_~JaMEx-FtmrIhD&oP<4PJ7XCEc+s;{}H=V z!nVTk_on8p>CaypX!4&M8)aTip)b1p7*weG0LAVv{}u90oFQ}VX6v4^SGCv~nrL)N zv{XXAx^0;2pjP0jh2X~#WBZd0thM!RUV-=KS5}nj;3L+w?jv769gkw3Cn4_o9Aopk zvn(^vG44l2wVt$Kzq=wcpkIWVW85JjER{Ox1brF`TQ+~`4J981V^mYP7@7D)ax;=j zFZPI><>|PxY%3WbE=B>V-=k6o7*t)ZgB_Dvv82__n8*s*44ccg&%wC28JErbCN{$+ z6C|~4Guqy12^TB**#*>;SM)vjuGr4r3r`wBnZ+8HrM^ygXXWFWN-9h`Rc5Ygo;a}? zH-#aoOJHRFZc_mm15x8RTW2sbYj1YxoUq+7560lqXPez^^~ucnq3HGLramxQezzGk zMr;X;mQmVMO^HIxh87~y9sn4wr3E0Vh3e*gV35U_sR~OIJxWd2np}L0jE<^*QUrt6 zs`HxUKFeCe3jed|aNAbaK$3NkA09jUs^Iz5O`lr-*`(ZSu`0*M? zIK5M(e;VfYa^|F#b4Ltoi@A*sr1FA``KjjlF{M$Y^n|vR~dkcRDLoZPJ z(EDaFH#TEx5Pd7*&D6UXHe(?R7+02Ok}#LetLqm%8giANj5qf;R+)RttXud8emCHg zwCdER0YjJBTzxY;B|yLc3hjWnN$@U$L5uX)<14 zTv~i=oV6}A3+)50x>X%*!#XaT*~2@G^25FB4Jq$0U{(etX}1P$X9|z4pmhn@VcdzR zdc`+pcS@UWUh%N?1_jUqe6%){|8VbB*rujr5{csP@ zrqy%P0lS%nbGkAF>1Z|moGr$p6k{PJ=wO>%BWf{VRAe8eM-}HJMNgaV_BtvgAW8>d zg$eU4h?cOP$1VUo6o_PSQZ0}UMYPYRbxm`HG_BSR^U_grn0usH07D3c_X!;!1*u!q z2}R9@p%9{P4F{ME1FDZQY43eaDbgm$24K({gjHex=~JPIFARV(Jw)J1RtUh~g~CW8 zDp)b7g)l&{87OC_4V|R}xFL0iIdT2M{*Y$GCk6n@0vv}>oOtOegK9YZ&X6_+w--zV zF$g>})a>aIenbbB%o*h;Pm#~Cc{^wZBW=%!h7~KI9a*{FqZ<#+1&;Rhg)#W#DH{unnP5{)biuhF;pAwItPtA z3fq^Z*n>NOOVfv(}YI4={Wb>e#Jf*6ddLjyzFh#7+%?L>wgLZpg zvnAlcD@Fkh`iKzJadyBg3iYh1#6T(EZG3)VdlPU*O2mS*l7$Ds2CF~!x3jO#be@S)9kiN^Js-INFs!EnGO<)YU z-stp6lgw#ViM;ZfR);@kg&5WH>7{jLZGc1D0ihhIoM?1V0n*0;Q5UPYtgwC)H+pH< zK(aOgvEGO+U?yM#FsM)*7liWMfetam01ITstX6zzTM?V4l2o!O*r3jy`rR03q;FrA zZ?`S6B&ZX3V@(HS7^@&xEBZgPOrgdkp{N}g=y-p3y#G+ZjxN-=Bp@$C2h@RG@D=?v zyE;*Gl8FAVbkN!UZhd1FS$m>eU?)IPqXTXktH5v-{gncp$n|7c850IN7iOXYopYY% zmPT}&(7IgcY1&BNhHF**Ew~Mc77>a|MlGs`LP!K2EP_eo@%eOLkPbZ?wgtin<4}Y7Vx!+iY z_az0fnhazz)6-&&ReJn~W{23PfX^nh79l#sKz^o^+G1VgR8LSR`s)rHHW1^s!{1@p zZ*jSIG93}j*9qhrqoB_C;Z_{%%eE=tjR_5{PLD%vG$I_N*2sfdAqX;;1azSvTbhq% z7|y25gC>(D8qq6_5ZCAs^vW9fOM5oOn7BqZ$qK=a&`$v$P1LFSp{T_$boFjh@NmD| z@DT~1U!6*;$VYdX(}uHF@K`1S+8>kVr-edH%=!&XoTE9xNmPDglLi>M%`RGAp=t@4Eq&78+*NxL8V*NXTFA#bP zvoIlkZ*&L(RP2O|Sbh`>&9CJuQW>ZnRpCo@U_W?3s~A$u@sYB(np3)N<0jcgcxj>Zc*lhGIjq4Fgo`b}t&BFqG6 zAO=F+zDADE3c+(~5@BnzcwRMXTUOZ0ldKhvo^9}O<_4xQk>y5g?2a!cJ#dp;De06FZd}GYPj{fdT)q6%pOnyA}Dz=1WM)fK&wVD4#;oj_g8k zju~ZWkFQpNLee)@XGt2{FyK!UIFsW}mK7aYhUvo#TfuJI$ADH7^+gj}8duh8GUO0j z8Y~i50^W;KV`4uG@FJfAB`23yZ^QAbO&Lo|TB%3l(p0CwTMt)d#GzM`(OHMBXlyKI z6t{(TsuyyF+mL>VFx6+*hGN3p5s76miyztM-fG+$x88O+R3#)=K}7>Gc_~yXL|AA1 z<;mAR6uDW!*&$ECMj>bi3k<b<)o`jBnLm==Nwt~^iSvY7R-4U24Xe@o`$W`6uq;Zr z_GI$nct`?Pgd1NEN1Yw1TBVAP(-hEd-1rta5JN{az|h9%h*D#;{dv@QDLmvbUc_Vx z`eFoex*DNTje=F9jH}&1O5q5)U<9F8jnb_~S&pQ`Q5V7#rSMY8bcA>Oo|%@~sYROaFZWJ&Xzm)SDCcJD5Kw2Qcz z^Wpui592ErRMbRt!n@cOytn6!7}V^_d1-Lcwc>T3BHz7YZ^pvw=lWJ{ubT87Ivt;~ zX4QB}zlUDmq_14FFWH;r;932iZ8uH&G!qoWgy{5ZRfmlxj8%ccbd$BX?Dwq-X8XSG zvfJwv&vyAH+b$tQ{h4;jey+k#e%nsBqow;jtWKUdAk?C#`@sJ4{nT5<^f~tXPFGM2 z9;PNeb-3fag9~F5nJSy&tgLzY(FvKbxSdevt6ylBA4c^*9dk=_R?-wGi<6wGI%{Qe z28EXCce}hMs7YF9A+1ZS&G*ezy-$e?b(qaTAG#N^C_o)>S##;1snU4@E6_pdm;qi1 zqlVn1JgeEAeO}LXPQ7l+@ zZWwXtDY{|?FO&;Q%Z8;{13LF?&gZ5j*<^p{>YF(Zdt!F_iCLabNvTeWph9r{ss6!f zXHw1c*?F(y!NGA)OL_*@))T-(-`aZi(~{lwX{5f|uHN+=K9W>atJTdNR8Motl)rFb z`5C~)2;dYUX=N;OZ0vxPWdWBS(nNMrg^)sRD!Wp-Ac^XNycKR7$~B+yU0BQ`RQQQS zp>Zc1gs=~E_q11pD8WP_xCU0^6IHq*ex7l$sZ!8UkcjN6kQG=2olIlMssE@b z6PpxnJdsRL>GnpHoWbYxs`1g)+tcKs%qTByV&!uN=7yYT;GqfI&Ta}gNN%I1Ey=ipLd*-{Wj1)UJ_FI zz-xLZbvktx@qQMjht?&rWkF?VbCFs7ErCX_&j66yv|N|wV7JgV5v7*NGQvEg@`J^h zM{J!x#x^C?7`WTntEu+2G%YZ4@8q6f)C&rydbS^?#LF>&v!VJ7Ip>4=#ivx})8yvB z{EG6MGW}Fie`Z%BHh9^x%_SjiIdD=X;Ks4SS8kJ1Z_Z%HDctmzlNxS~N%3*IY*-Rl z-OYO-F8yYn0t2L$nzh<$)dhx7pV?AHTMpRck!U564M^dXR3p}b5X}$!99|k%k^mv^ zwmnOt8VGc@2uE?i83_#R(ZQUHU_+OWGid*b6>f}(gVI9U=Qn1dh}f~x{44S>dfL7g zJG3@4*zkyg?#W)CgB+#ok7}XalL6&g*n#a(ta!3F15A*}I%Nmb3oWU_anIeD3bd>q zy;NM4GOyOWE<6?(?_RI_dQ{X3%T)oSderx?rhNv4e8ZMQZK3H)tJzz02N73$m>^P&jgblsS?8Z)E8YhI zHrUs9m^~|ButWv!I-??KJhn!+yzzY93Y7n_UQp>D z8`xsq;E_0$-qzv9FD^ou;3z`Xr7m6UdkZxPTtB#ME#I-?ay)QbaL+Q6>gz^H#{eTU zPf|r9(H?5O- zlOo&*4>6Ixr-^8B+GvsY%6_U*UJ}HqD()~bLA_+MrB)?>QtM>?qzHYWBuD`Tn7%iP zCq-_ZNP+A5NfMdEYz)+Mhf^rV4E4!yy+J7=bLb{TMhpS%HSCC9n<`C$%v+D{P$4FS zXmtR4Hi_Og9cbpY3oH9GFRqMb<4nN*%qegsm9^KP(ZXVYM|K+6;1okZzW08Th!UL1 z11%|@^rPRr3Mwc(3B@+!6P^T_JufEGeJNNAyppmQkfb3Cjwc_~)umW4sTFA6fneX1 ztTF3Ab8cR6(-C{VQ!aM2uOBi{FE+ch@?&@>QYb|lC9L8ieOOZ^B9iX=NJ}Q&zK^|I zk_h=eK;(s$u?2!1c$Gy1nij4OG+nmGYsIcO&?yVFCLn1O*Kj7~)HlT1L57SB#X zS*Q<<)N_rch|MpKmVpvM$#5mXn(yAJHlpb?B8oxL!bITw&vB!KXy7O<4b(HXUD>=9 zYV94-{aR&S8E`$k1QEhF2}>g;#7Pn7A45Sy+}`=FV0jVKPN0jI&aI3UPWggVY*Yq? z9BQ*498Z1<b33O|Q_Dfg2_era&SQGRb zS?_u8F^#;?7wT4$$OJF?aue>gpBbj~b+uwcy*624E$zY#BtMChd1Q7pa*330zvLaC zdNbB}(S?}i<_+x$5aa0S7xaO~^ZWE8yu)-vz>5f(_2DG2=1Xgm>I>UPAsIUSW_`BX zP=dZy$icpF0|Cg+1p-G_Nq%#HxiDo3_?4R{Ib?H2T ziz0XLsS2=WH60K+>ozZVOsz04-eq=G097$Z@hTStY-BP^eDgt)FKEZ`v#ItJg=S*25e-^J1 z|2-FH6GOwGGf$aQme~kXE3SS+rUd@roFRY9k~~?d2}xD-vl9*9k`;Qr%=H=6qPgLr zS@*p51~$*euMhcC;%>>ViKC#~dYD~AYaDC4@+;O#} zzi`e5_ro(PntnkRpo8^;jrz#RZHQ?Zzd&3v$OQ+HVXI1P1$_4EYDyOtcq*Q%0p29QNhoRLvp3 z6)L+6}#<6$J=NkKKU4Qf;`w(^*UAf zQ(M)l0K(zb^5W|g4bLf&YbI@qnE2JzuZtX@?@J8zTb^VLw$(NnOb3ja8Yquy3K+do z&6`aduJAU{pMXD}ioZ^Bu<4f#+7r@5bj|`jFY;uF;A_w?yG~RsSnhI&ge5=~X!j@h zRk5-$sASNBisyOj#E?RN`ZxTepH?m+RC}i@5~GwN_M)AJxnUsCoj-rB&;62hZDN$y{7|yOvlrOI z&gYKHN;xNl`GKdYPYeW1+ZDl9+;qz3*Jex9Kp<-9k{E~mDe#UwF0;|^@_Ben4ej0_ z$T)j@?5$|7-Hd#S0wma1AgZhm>>-t(YxnGAQSRd+@XkJ-Nd~{{&GlPM<)7t-z3Ml( zoJA&J$!W)K>Fnbev>1Oe8lP!KOF>M*i2qj(mr=ZBUl!QD?3e%i=z+rnQ8(s$+ZS%D z^2_bQq5vAD(l)qRuJ4B1pfvBGJ899@Aerr+736PF5<>5~9Nq{#Tl9M1a>=ykY=#&K6==Dw;WYiacJ=4^ zoQ!8_!W^fR=}Y9Fq4geMjz`U)F3qB|X)evj}39VMFZAtzHqa3Cq4M zD@hMfjc@A!nowI6&yoCjfsu7*Y1ZN}7ZTfn;3;bUy=P~PH!goe&8{M^*@BS6G2lzj z<7FgV_#HECV*0iot@a*Kwr~|DAlzitMKLoor10+84SXQG&+SL>w+$adcYf=it*|?Q z{CbEWea0-ahf>@>g>oJ3v}}n|R!QFJ&k@I00S(gBW?3m`&^J^y%b}fO>_xn1fu|ll zG}k)NyMcaoUvXt0^gKrX?KJ7Ul8kxo^UOSkk0F7Rm%EIG--^$OlQSj07Fz(9y5CM` z(qe3ekMOYje*BaIfc z-Kz~>mI%AX9eekHP^8EC0H6%IOp`gf{XrR2`H1mv4A-5PsyIf#fkRk!H(!oq``%!l&!Fa0i3*0pf(NlFb;8lIzA%|czCzy z%{I{wv>bmL@AfCU+C!5HdqofTo#K?tJSrg+s&G;9-EnV)9gQ4E(r#6p8YEb$pll}d zcO_Kz1r*9o?#ik~-6`2uVye!Sz0#)QwGi@>mZEGwh)C;Vz6d`ZqH(Z=ydR8(ybn@2v;EsAG}K-#q;l z9$rSBYCs@K${`Jcrf|>GbpC)9nxOgiNc18HsX|d{_&wBWly8h=XN2VZvX+B~24&!T zdiXv1kD5GR*2>$U1)lO%UT~_0jCMWb#6fJ;NBxJNxbLabk= zO6#jmF%ESNQ|GRA!9@jqQ1A~=XTUw~@Or|~>BJkv6*vq*hdzTmk=3m&rUCcLR2 zxoy8e>Luf!i#Wn>9##bozEDuDRy!x{pk9ga82F4eZU7GJG@Xh;0d%jwP6s78PHxTP|DhWtGPDEJ* zA@Uh~l>-5F>Jg`!gB7H92456t#9Z`4&bz^rh?g0PAUhGjD6RBh6(y(77R&ZCbzyfA zY2g!fh?{QD4-HBpmk}|0oiaQXXuQpWZ3sNM$2~x`ES8*y$C{wTuRUAU&C!`KDUL@n z##~UZ9DG`7W=k{I>d72i!FjlzL74ZY3=7sBeaX=-(g$p9aTB$l7BNo%y4^N(AVXNr~THN-ccu#&TFyd^n=O zxPEE^gTy^|x)+WbGr_`Qo86UIJ)|jU+;jY+qbF#x z&bV-Z6B#HpfkA&WDQRW4fpXG3kb!pGH%qC7lP&8cBJA!WkUmR`KSpz)bP>{)WhbCw zlp-sEr|u?gW{%MjU+I>4QUA_&kzs))=v5Dm*j53I1@C;@?KE1toSW6D( z#C#%pUvZYu=iAFk3!$-WQ!-k0w<;tjPI2$IT0mU)(lWf} zh|cnA1|yL7p1ILYVny*7MH7dJS@ShQ?4q1K@b%3)UT(y|(y7pUXn_d@DeS^f9N_hy zqJ6~07Au-O#*b@i?t#&O^TF(Z&@Quh_>NIOlw8Q+gb*v^3I>lSRjr>QTtvsyLWb!W zPp!e?sdp>P;$O3PKf6WPQ)#?h@a%pgp;DXU!8VZv;|H^R08>(Z%tQ% zSaxL|8+(cCIM49(mYGb|?y=YQx#xnzL<+s=0ZrkVk_m3B{>T-{G3HnyzQHpaxIm`K z)jWo9Edm2^c>BaXVYh<;7M&~Q-l3Czvmm0^NB<)0gnoBw*@%pTveiiq&+6nUU&*fJ zKKjJ@JcdS|OX@C6gBAN5#sj1-lJx4tGA4y5vae+5$7ho=b%2fs6C=zgXbwe>`nBKtoA7tft&*OczIWS9T_{hTKva?FOv8+WYotWEFn2+W-v2w-#Rp!Kp zOx2L3$=7R+!cY}WB=4-MDep?{{q`yj)rO*p!sqr0PhqBB*KXB!6RY&XNc=LPum=`KdiM$Ml-DB@ zwJd5^FL6~iQxuR)5e(QJ_a2+J8k3=Gwd{PUq1STDV)ciM=l7^6f!{01n$Lemqdq;yWpSrgu{PKbUJZfvXz6rFJ3vT?DG z?YXB;c*BCU;*op%@T5%$L63GC@xTr)L>MeB`?ly|WsF~ME#-8NUca)Beq#-%=b*Nx zsCr;X6P2EUN;FH=*t@PWaPEYRd+53oOS_LZuU6WdRmNBn3A+&P6Q~kvxa}sL>VsR; z$Bu~ES)Ea^k?T`t7(f>7=&On8tbcwYVZNZs0U5PIzF;8@P1rTXI4bULJ%X9hxh>8w zu3NOe423Na^VT>fD6ON{C7Jrch=1-WGr8S$`3wtY%V}>9^ny zn8A|n12uQoT%X*WbY1C8b5|`b%SD)8Q97h{3oGkT7pN3F(e*6Is)V_IXIuRPjHvz1 zoWd5O$2*(RhSI4N(hhqX(+tT4ma>@fdc1jzjvkxZ z3(om+nq=QPJFIPR%Kct>7Pk}iFsBDk&C{I-EA`|PEthh3bc$O*xJVXkePQc`$tl-l za*>%_Zctt9$yoT~_vB#DQS7Tlh85ULoBPZg<$K9mL)?r6PF&nYIdqZ70Mw?!b96nv zOlZ74k3m7$=ksFWW8;F;vd!B%_wPr&MHw>Z$i)`~rUr1$%e*dgQIB&`U3$B$&ED{( z5QA=!>psXX0NbV1z*2pq_FO=(Mzv~&foR?C@s?YVxID~BrAJ&0wkI-=GLI)1#p|4m z7-gJb>AHba%9CnQ2zk3~2ZOGQzR2&nO}e5Q1{t?{v7Pv84rMT5IUHG}YO|KNOXFpC zqfPU4dHq4G9?f-VWYEo)x;#mxn`hKa_4|nI<3k9Tj7D1ZIH|EtQMRX*8{GkH-lp$P z@w4vMBx%z21R%~P;kgwDBru zeN3b@aNoX*51tZqD#%>E%F_+a@$#o#PIR*jadF;z%fp_h4v>piypY8xtQ5jGLnn7; zKYBfKa@e@oxYTMol>W`CII?WYa;1rKYUSDLAb%E=-Da@xt)E?p=cK^YI>+h8(s@H= zzq#7=0P8H7O6Y>mfGeWVo@~!BEF$=Y_W9k0ZhjYy?^snC^=0N_e5Na6!H#S%GR(pJ znN#_l@9k%7tc73NH{KqtysL+{KB+;%Ua*W$>Cq&G?D#BFN4tpWkK1UE*5i~i2aJmC zsMTMh*i=eF_I)`OsPG7K3(>l&m~Gt@*w734Rwm+jq$H@MWGB|F_etbp*r89rfti7~ zw1-o!%+mHM*;&+MJ_~8j9mx4QMW~d;*!Qk5*Q?|!48z>>pNm<>y`C~1aD{b{xLwV( zzZAc?qVnR=V*XW$mz>4NU7@s{5u^ExD~z{Ke+b;Q zf_U{1Z&4B2@b%EB-<1>jPJ%b^l8cMCdhUGj)o0p(?4XJ)b8YBHEGA`Fq}}Y;?hc%j znQ2*O)48*dR8W;*ohbY+Y{!+rg`}b8jl?TyGaJJZ3u0f5$`-433>h*mRE?N#SYEBp zp60fz`p8C77hB*PnBa}>FzSr?q9Q(r!tZ}DNT61{vNj)i)U+YE?*^Zn8J(qIiPaG6 zpp;_0a0K;?W#KBnrs~n?M`mu>zTvjf0&T=rrdP5!Wybk?SIOf@9$b<^b!Dt84`Eur zQt&i!Jrp|b!jDezjenEAz?mVA>G#I~0^6yYa}4tYaP1N#);uiXhJTe@XTFVOrG06c z4esO2sI@=-RzGy{%xsUw)5sO?}6I|m#2dEdSuFOQ6c_XU1zG{zvVX}gbjM}7}G z;}eP2zUxcu-$aS*c8C)-iw4fdOu1?Sio;3wBnx3*^)_--coxz+9^K#bB}U;0XI!y8 z`PgSB&lKT$>!B1wSC#WaJ+_*Ug2r9VqMp}X;6y5R67{WkB^NpZ%f$Ob8~kY;an}0& zen&?QS+Hsz&9#TKH0l`auJdN}Tq5OKWbUgIbrlItTu+mjmlD*J<>}c?=f7TEE}lz0 zv~y>?Z8rn)xv+9=^`+RLQmphFY{~sK+w&WY7?t6;8++DZ$31yQ(6jPU)v1dX&h4xr z-2Fp4zm&f^wR9ggd}Oa}tuIILo%;kgB{!xoB_kd1mnkK~#J~q7?dHgxs~S>7tmd}z zZ*#qI7ZXN@7`kd;)yb19ntE&jcdaZAU|&iu&C0a8IT+-Nf5Z=~*y7p22IG^>%~tgeS8w4$2Z&V^CQa{EeVesw7LP_jgE?of&Gn03G)vunZ}M&WryXI~Pi}RR%dz7jZAnkI z1vQ0OWJBk^e0tQtKeMkc7y7D9v4wE8RHUiiq7?A1Q*R=;wmNQCw*-@%x$teA8?xg` zS8HnyyG~53;Qh)*mp&Wgwq+7BAAbo7j;pWAqMo$7DBELCXPUX zcfwf-J%nO-d|!i#V?3%PuPJ1s9DhJV{5-6?>uvkk{qxSujy5{X2@g-Ji;{&Fy0(R6 z`zh4!oG>^AgeXIk6$>m%jLWiMErD>M`E`#<@{2F(dO0?grZaB2K>p{3*mBsjsKw}$ zABR(KsYh>|Vwejv)RbkvPm}$KS~Cnq2<9?=y(Iee^G&^%(C1lHojbv?W~L&MwC98X zD_W*-B{^|n@N=6lTC)uA3fwRC`ruX(x$w0r7ybM$^{mQg=+!dVvrMXHfH#KEQIK5n z;>(gg@y#lGoRneEu6kVq3h}`j?|RUJ{-Dyn#YT9y01wB58f(c#y_PSsmc3aDg?B?k zpMOr7mppPhBxl=3E*r!Xwu5@HT&?b2?ZghD>Fj_tXm@OdBhcp8_21>)$b8WVD%io5Y=3b-3 zMrf-rEeYzXO+K+(sbvYyz1XjNB24|jw-B~*WNyzO;$j^hk2I>N_s`~9T<#VR)Dy92 znZJ{l^`YHY=DNr!VD{r0hc%L;Eadzr*|}ETBQi7j0gNNj<4#Ohazw(WFJxgc`|tI> zy*b;?sew2kGQIv8Bl|dns~9+ZvFA~6A@7cc*hXcRf@0|yzM?B5KAF*SY28|U1}GtV zWjUQAnhr{+eJh-GRgLKF9yane?W=UQNNDoiydJat(7gRw`!%e2)%l+U7=4VC?2U~W z^BN2v56-`DLuNN0&1m>2RL~v(VuyDlw5*m6rj|!V_48{2kwMNOg)Ur#^`{NHow@vP z7hurYy;A7HJ5KHm>^Lu7nGIxS6aPr^YEn@&9rQLH!M>;@&Us*ZRm~Hsej2?UFdYhH zdOuHs(2;BC z>RXQuER(+~#y?gKX!`;i)?QV7hx)XD&;gcD+@-Xue8$#AQs0|f&d8?wXSq!izg|V$ z1b+^BXyuN;$c%h$K$i4Sd=6Obr=sdf@JtCM7)%Sm&czTB@=5DJ+h@SCm4;R#BMQC% z>9&B|g;n4wNNb^?FK++?F9C4O$p$})whEF-%tW6RhlyO6

@$opGiK(#zh3Xp@A3QNoO9jJ`?{|C+>iU**E#onu5+&OtaALlS)H>A1J+dPsYFm@ zLeT{c71;)krNzYXAU3N-Nmp&Vw%qD!hP1ZMPI03PV=j7VAy-!0N1Brf$d`TzAA-^I zzXAoRSVDiXb<#x{_Bw_@&)6|Wdlr{QhCG&F=4h56zi=(9vT?;Bxli5$NsVrips&~z z08kAoEZ(1HVgS~7^-p}y5=?Fw{^>8?K)$>AVW16Pfr*X%P&mn8~ za}ba8H(CN3<1xwD{+lrSW`!lxh2Q5dyM&vhg-s-CdUXJ`cMyt+S!akR?pXA|Umu9| zz$z|WasQ+Vmr%{UzLk`{@=!GpRy5A+u#! zShFSUD)1?8EB!Xx1wM*U@OZWw^33asGhdsOqJRXttePM%XC3~!P1C~;v@b4gh}$b^(Ba&xZi#Et zBm>jIf9a<>JNd0G7M<8pC(yLba>D7bYnx*#MYI#8VkF4Cwhs$y)p>e<5_K#b{SmK2 z2YLT}h0Z!^sk$~%ANlkQ~{yXTwZ>?tTXTy(jFrnhr zR^kRao4RFa&q_!8Ki9V8zA1NdR6Z5_{qB{IIbRjG^|{wj1;M!t)V#Cd7mN zZ3hSav%Xa9uOxG>%(?jid9;8bsCL0HRJb+hb+8PU9dzw8<+!i1TButK^5g>Z^q+h# z-=Ibh=MzkMt>#l3pNc6;&8J-X$5mAlQZT0v?xHL~X=na}q2ow*ltB1X`!)(gR>tk| zs~KvXM;LpaOTzmTZc9P^&0_KGSk2MbO6HZy=3A$2SjdA5wF&bUkHU>XcIm~L7lOZ2 zKKR;CIbx6J2aDE0I=C~h#jSR44|!YlFTVG6#M+qp7CF7zxgsdj(({u2hJ;OobyMSO z$)TM#vGb4BY-`2KwScr}u@VXOk9ovZ7p8KL?>B=X&;^}k@>x1z&EvhiyE9c|0(wye z!_lLN%PFCBlRvVScZ+isFMo$WP=qt4`FFgV-?F>23u0q@{REu@3 zSURoolRm85S;UnkLeE5ODYjfqfDP&&KMiPD{B~C-_&!=NTes*0zruHThT%yxQ&`PT zsxaAq=`UEFf=4IgmT$1&Cz=5CfG->FaD4naXi`4DixjdY4izq>N;=Xm&+HjWjEh#q z9Qg1ZTl;<;B_W^d)~5Ap6WzGRf=!d){fLrN%<89iAVwH#e*48MM)B~bc%bMov*C6k z;n~t&RvkiT9Nz+{#cNtp8&vHwm=1sNdWFlKj+%_epDSx{s3JAzZ;G5*xG zGKim)@_=u76;*T-nY@;t>IWXf{#?^mdC-E6-fyXmPyYa=YP4R((l}n4^<7r?1;vKv zw(szc9n>jD(6-1z>UJ4E*S!~&iDa=n<8;=wY`EyunZQi(HFz2&%N};rl_9q0FvUZ? zs%y>5!YyM4cl5O#p9Su;KyHFjWaDEy7~N&0pbaQkKBKtQaQRa1u{Xi5f3IPM>Jhr` z&o3+Go?g%G2nxD!<_jcM1i6d+&yXb@AklJ+W^|V?L83gr}okQs`Med+k*^4p!jfgS4eU(a~U;_Rd?B|F$W)gZV|s<{*_nee$HNs9Jvcz3I@{v|D; zH)2lg0&Nt=XV@&IAMy@1{O@Fh$yV|u)<3B^hzFgQyzN?}?h8=gBSyc4OUI!>BN z7lq)%7MuVjz8C!gb_6!vt2)*>S@Oi;mxLSq^%fQ7cmWhaV=C!5<_%qYc1mUZ`%`Ad z)I^!L3$r1L$Q?(6_C};kO$a+4WSa6D4Q%h>YQ1^c5wp_|^|8DI{vY!vaem1Bx;)i5;s^U2O);@nb?`7jbq^j4kf)GD zO;`%JoTvRaBCbNe6xIDH)uHy};@t^zC(ciI_4kK61amE-UDx2iQG0&luT4kSKT|Ve zLSPqE@Zvbk=tJ*!ej^dd^e^7VZQgJnu6-{e<1%l&x{P!-yQR+JJ|4t)zu#7W$p%%A zZw5H4X#aG~qmEagIAEH+y>?zD#L{uPdWHPUUGtd=ws8y5n^qyCz<7`$?whbx9lMBq z#PHoX@nNl#>lvg-gMDu`%a8q`H57ki+ANhwaCJq0d{NQZ<0#i0!bZ)DkLd#DJcv$^r zrvwrTmQKTUPHd#LicA|Dt(85|Z;slo-Dc(3o3Dvp1#g{WTGwvZ>*Q@w3xi=(4ExUq z9ns3;&fU}pJ;0PeY3=*9lE4Z_*|q(DFf|{k)d?lA=T1#t(raFxAr6)rr`SQRV2R)E zzkxZb$v5L3s-jTI`00meyUx&4DySApt(D7a$S;HT&HEz+x z&i7;Y%-`S}=jV}}V|3^aUti-QH$3Y+m+_RWccJ1sh}#BYl$jBg zjWV*mfX=TnfkG$Qc)k*idAV=iIU*#q?O_SFQ60=BcwyC{Q~8SVGG)0)Pi;FS+&gA% z_#4q^bvT{HHkf$n!M7C_UGA-(Kn%Mvbs+MFkhVm-E@c`dzjjN5}2Jy`qblKS>xkLwmFWmee(saAB2)aN{ zH`UQL7k{nsJEuD-)U8{}#`>|F8kZLqPNTNd_}&N&ohDBd{m!R_5C+=R+)X~iDJkY;vZx{%4ACP@t||E=c&%+Go=}n!kyY%jBU#J zDqtgpcmv#PX_j)<6UOzYG0zm_oS|7c?K~JkJ4f8ctA6iDbKS=h9qzNld18Z`rF(wc^xq|F)&IEm{M_r zl{ceB>Q#x^C=dGblpo%!qJ6A#gA`;NjED(CZQdZ%yCRlYIJh6Z{OcXvj2iE3%e4la z#3-RY;V3Sh5`4v&f{o0s{o+6f)B8J`Jhd%IX%x+M5ZA^^S={Q(EgJtu*-^_y zF0zz~ZW9>3t=b6cQK%S2waoJfMLl=?=t781-@A##Y-1V+RSxiQ;=YbJqzsxIC%tYu zO|;%>JZ_w}OW6&r$TG|mqk&rh(O_zt@))Iyc~3NxupXhg57D`qwNGYBuAEuD#v*^# zK`hszlXmfLW*Bw^VPh?I%lJRy_Ypa|5DrZuHCV?f21?3h2f5mBeX%*8h}V|nFq3e% z@7mIoFi!I|tVdgH5>uJXj^zw}Zai$39kI>2Dq)f4YZL*;JI{>A6c@FC1^rf85%X{T z9B)kmI+^lCU&kSq zR*_IW5=Q!mA=RQRHVJ~;tfW_yV98#Xc|6IqZb&RY;uTwV6q$8r;{#bc7(T z!3G?2VZW-_VP5O1c`J%u$1;QRRvVA({zYldFydks8~y+2k#2(Xm$Z-~#||fa1@FaI zC%&DA87u`;66a|@wPeKavtcJD;FT%tOId&6!`;BA()ICw1Bib)>*MKEY}_#8H_?gW z=ok;Xm2HkM8o%-Zwk)}h>4hI9j6aK~2qhkv-42lj*J&%RNXSOmcm5_%J^MxA__QA7 z?&P!g#EPv@_K|h>6yJQ#U8a+i#l#|Z0;GsCVPC=TN$jiJ;kE7SkSi^~=ZJO4Wc~(b zsvGfTV?g`a=sJIFCG=Y?`|2{}$DaZCS{iU}ruIQQ@c9aHT4M(+nP+|Q4FWTWa(HI#>St2UWkKYTOu<&C|C0b4ap z_5L1zfHQcbshC|183uB5>|V1mGORmPUiKY{Y5mm8K~H^qM)ip~Id28@47LmgcO}|c zhM8dhh|6*G(eWc{lbV&oHp(x~?HYXSEY|UYrFAo*&Yu)|DBnhq zsX`0RBgaBg++8q54;j!rFD{RKaI%12IZ)ugj8XezE=oATtX^RLj;EQ z9?GBnp3QegZ%8Jci1cX`$LL=pW?0&%5$d)ZAtAZgll3Uq2m9ZRBk*Gn9nS?)^u4ec*J&Gl?{b4K_4Q&Wq>Za7->@HO zc#N&69R_kRd`kqGS~KBK(52%v3Lv=SDLu=ux>v+bm0rM0Chy@s@a)wVIF8295{rFD zVKy878D@gX&;@%5;Jm*@-Q zn5s1i;pZm|*A6Z2b-egMPKrd9w(J?Kpv@~lgGsre=AXF+f*sl4Ft>jilN&{*kz0O~ zBe*P;@?48toeHCBW5;=F45y6yQRK(p)Nw!dg>ksv03aB(R_&)3BK)wGEq)id>dY*# zwu7CanXdbw&gJdlqC2-yCSfJElbp{WW@AVNc6GN`GoWrg9efd}j2nB+a`$>OCU%!@ zuLtU?$$@{H>cAS{xBB&vu0e((_Dd5xG4~F7Rp2P5xR&M(tsx>uIBq+@$hrJWWBdgu zl_VmJ6Zq~;0Pg2zKq{qf>RlaRW`#s_Uy0j=ex@mTwy1U{l5dRrF1BDNjL&W^@=b~&MiS71>Ly!z3*rW`hpcmrq7870irK*1F z$D)7Dx0zQeju@!^itRYOpJL84(6C_j%~o-hGZW7m`1_vnb%eAKu02S%Bp%X%MU)-Dxm?S|9K~e+rikp#xw4E0P!8)cQ*DSkul=HfVy=VpFQl;fw2`Cc zK0vIw8!&`HK5znlWUsO={i7$<08jG}S{b01G!wr+f8iy+UsQsnscdh%zhI361+(D_ zPRZ5_2;4F`_1?zgPqxo07FZmsch?RN?n;h_)s_UAPy4u5`4c)%01=>p|p0F zxh3KgS-#{q@BXlgUcC-n34kZ26jWGyKx zIAZE)B6rD~kj_nGIM~oX2yV*)A?|DpF6b`1IVrZ6pO?MDP;7oq{?vp#;Ivq=B(*2A z5G!5VnmW`B(`Gtp|Oz>a(i*n>OlwColB<44ajuuiPwo3@?wkvCtOQXP&Wq%`E; z_4zF;J~Sl~6@8)4*>8RetQK(%U5CVWtV~Zu@7>m;8~g4K%rT8Ij;G8Qd2+5BJ>?nT z{1RM|a@%2>k?CweD=eT()TA?})kdwrd8IKQw4@`gr0U#lSQERmAeT#tq;!Nkt z=*JAE&Ehj>@yEBUG)~heE<)e4j#j@#6=x>#vU!=atbef-UW&xtcOFnHTz*M&5G}9x zPOJ;#vBmKKkz0y!WDFe)R_(HwyHoRtA;9RW$W77-AK_##zJ1x}s_ZLXqslmQc_55P z%W3Do3dWQqt!bP))M}iEKd2s-5uB^yD|BpLad6+Opnb;^sf+k@89X${Q9!ofV<@(T zpP~z3@xmsXBW9E~|;&*ez)0Ur6Z53;~Z--DgdVLjl6mBwbb?->I; zGE4mX*R9*z;@e4LJrA@4Q`(K4-F178aKB_vClW*_?q6u;U17a`qNQUhos@U1iFj69 zhLc+WP*ax45Pionj@>%*DRS$~zx&Q#$@M%?-dk}FS8Z2ih@-|5g55G)o#FU3gz1`M2 zT>V7qt<7JK1n!ZtH>!UwU{AQSW(d{ENVPY?OOS|}!-U$kCd2G4oeX18Bm#E;OZwTl zk#hPoNHq;<8)Tm2TSq;Vn;9-o(;RHRX$LFqV|rH6qIP+21AHYYAxc>Nwp(t79!cKv ziT%Ju_DaN=os5mN6%f44ez_NI03IxCfHr;2J+~4+5<_3%St%Ws_zoeCne>6k~9)wsh>1^c0lB7pq$fm7M%pUf7SRm;wIO>Wp=r=T&~rGNICro3uieteDZUx zp>eY{Z{U-iY1*r~+xee^+|vlMqvfz$3&wK|XHUXnm^3nLe+5CV(K3H;f^=ep9Nxvu zenPtQTtim;p&@mFm2;W_!i#A2}iI6Rt3UyZ`g{fqAA$T(gyfB4+AvKHXPCZe3yUx!BZP&~~VBEPkQy zjata22iKO}Q36%IKkr}M7t{MbfW_^JMd}NLhm&^cTGt$n6fKL0P@K;a9j_a;jN0Av zceoBZJgyQ7x?d@cPdgl#(_*KqIPWY(5KrzuK>q%5^JZJ~$(N*x#RY7sQ++GjgDXkL zjdkw!cKYQZ_p9LZ$N+gR^d%O%?M#6)`#_1Yh~I(& zK%*nPJ06A*(_agaW{l&6zUe2441H;}uMx;zsWIdnhfU>`br)iLeZf5_!RjnU^MdYWZ!99r9l{ra7ve9c+=Nne1 zL5@h=LhZo?>jeT$R{j*G96?+)Q^w4-ZQF^3U)w_Md}}u&^8TLqFfDM4{6!wN`#4L^50asIx`y5 z^7(&A$qz-uD3Q~u=ZzbV`;@pYWOfI_6t`3va%lDGkp(Q|^rE7}91rj6t{^{w<5dbp zwGKLL?HG~;5eaJ^XGo>-vk)AVkYq+<_mcm7{sBtSNbeg&uwoT=jByJqe{JP{OOcwJ z6m{~eR^pNeN5DKZRmwOej0X&%pI(oX&3FGbk^h#z1DkKf2dIgSaK+lfm_$)XA{>EJ(c?3$hzla(+iM{&iZ=1{>CI+s+F;|^&aFW$htxZ;!9Zu{*nt!1cM=D@Z>G#VNPoX&MSo@9nYMQ2!Phg* zlU3PU&*XGdePfFrXbuHj%D^t3Xanlg$3v9oN&Mq4VahM@S*=tVlk#?6{kW$Bs9%{c zedP3l0j(A0b=B9Qow9Vs9miwXGr%uC47U*L0HUOI@W6aP#kr^@Oft%7wYKfH({xSQ z{jRuK%(CBX>X*T<3f~sAfU{n#myA=(ZQA&Km^(*-YM7eS zEj@qBvw~Ek2OU-u_*-?czz^f+t%0j2EOQ+l6n6TOzoB}I;4i6DBbZ7Rg%;xP{1c2v zQ{o>+xnR^o%57|=77H%NOWRG}?c@2StVY@O-tSjvdJAYbd2fX|=A@9kw^6J26A8Q_ zQ)Hkaurq_W%%Gs#_D(b*8LSHA*BM`_jQJK+6~IRzl_>HP7GQ7vbp0i!Ood9|#srPc zv=nYjjGxIkdqPauqOOCak}+wJnNb5-ZA)SKh3Bc(qq^_{! z3{~ga3M}~J80!8O{mdl0<|X2sE!K*2t)kWY@DWPh=e3hcSnKAI+rYR+F8-3L`L?~l zcqTP`yGK-wGzuGw#GlaqTfN$o@#!<`ZWmmpXXfDeu{B2Osg}Yt_g2L`$1DQMR0P`V z?<779Oo^ukn@@FpGvv}${UjUm!8hw__5wX{VqAHz;Ye8?Mz+^CmQK4bpMm6M5ijv| zt5ZpTta~i1UMyKk?bB~+kcZW(l8E}$0pE+;?N?cC2{BE%Nn2fVTz639 z6hA%75S1DL+&=t7w!L}H{p>o`de-|IMz05Mn7MG?lz+AjfMt{308x|(HyZ@<&A8he z6KIO>_XBj=7He7=fsyYCHF-UrV03=WfUKkBC4TmNE-geiHrfRG1&49#Xa z{R-Pr1IdRFkMkE0tu$EYkb!7P%&D7@pIEdljb3fDD);W_ZmsD4Qz|HoIbIN%*(<7x za0ZWz8w}xVu&N`*3dJ#J2;=1x);@G0oQsfAxtU0isoESEGiduH^XRUA0z6t`X{*-) zBA~NIu@5~U3@5W8jz!|F0qs%wLrGGj)We(b&WGj5bKd5G)9(qRJ6Mz)ook?AoYcMb z*cS ztVHE>fP8xyv1>BM{o{>H^3YBs%lb9PSK>-2`vx^q_bf*_m`>e$Z3p?Y_$_+z+rV8S zK3i!~Df`nmsT7$DHxEWCEq1@NelK&lm0Ec`M&Rgs#kj-eY}yc7%`)U{4aj!Wg6)5C zuVK;wVQt8L9mKcsCxGZ^j9)o;Y|(Tts{rE zsI(m}_MFgH&}C6qO5RBB&s>cj87-km*ZZ-WuUdlMn zw-#S^(wz0gLBsDlUjZmgAN`H$t}9g~?4`^0#w5oW*}{}g&7P#=yMcsHwy-`LVk3ir z=>y#W)srAMt${#mbnI;>?OVKm;fd3zfvh8oL2q#ULZhJeHN<{rT?buHdbiy$K&$~> zU$FqrJqv|Sk}%3!{TbM8=%2^hoNJ$!ErsUFV%GNMRo7$QZ8I;`LVH+tOZbm{7=3{D zbk=~j_xJHM_B5OH;dAAp11yes;D%ANwGaTYft@(QIw2TfG%qn@Y`W&YKeByj1L(Ut zHoY0gNmd~qG-G0qmS#EhuZhSj*}~4PlXRTe>EE(?zw-D0I*LY_4Tho%uC2Le-StuF zICog{8BZ`bJtV_Uy2`@u$%|!evXjuq*0l1eb|d49=^l~PEs=7aIh{_0TmufKq>ps? z1@pDG)%hx!LpbPB3exOWBW=;hm>XMd-5R=UTbw^o5K!Ms*Zmkf+2L>p(+MEXhH~^D zV4f!}2Bi5R8|+CKE#>iH#p^o`qSr7Na*@JkW$EZa%$fDj9lW9KN_gj2f$cCyuYtbk zP?N+41v^7+EsSM6%)r1yy*zr%^$FMqgd(6IfO*gAe3h#9) zbQ-(!wyi$D1%wt-vxm^<*2b%cnqRpE54`^dx7e{+u>#HGmU)Vd=Nacpt2WjDw7uW} zyxId&$hB$;o0k#r=UWLK6m+;5WM8tI?zpK(0TuDa+bZQ|zs9XRz?h5(cbbjG3U*%w+F z#fe1p+iZ-JgXP3W%#UpPmhvA1A3!wpZ6@)PQ*N=}68bQMfq5Mon~8BcC>aF&m=7sF zxKx^%DIVP1Gsmam46}GKQ}JY`jRI|3Nd%{6>N$)9a)MX~PuU1a9FyzKRQ5T4ZppcY z>vap?`tP^o3&Qix@th~h0?`G64Ply0j_=N&Hh5|^PA3liQA_<{m?(W>sv^Zp^yVY1 zX`V(6D6s9ba>Ip$HRpwE+ z_pXPKlgaRtkmeyW9o=gGxm-nsWi1C8SXiUX{$?3^58q@6I>>~-W*NeNvh(-1OJ(6k zW`4JC&sqrr#cf&qIO^^NFu-9BI5vlg4I#&-Y|owwf6NFHMQRF!tT)4O%{dDAX6f7{ zam2Goo$|;X_kc(E>&{Kz$B0;0$L!6Zd|iL1b%sd(FVf;R1Fe4y4Bf=I0;Zi20h~Wk zR{GXgTG%&){$ZVX?EG61zj&4JKxcvLWp3k9V0QksqM)54_46JLHRWi1=)+3yZ8;nLdgt+yzc81zm|l6h`-{2@l{B8gJ#xqv z>aTPYPXoJrwqU3lv|Q^$2m0C=r!klMH`G&E#M3liWjru;B6(~531LFNwKq7h%2%1m zxV8k9z#;BQog-LF;Ff|p{*S@NfbKmHSFwCv)PKvPmEww18aj~g~=S3 zW&fuH`;a3ml?FYOa5<661oyRqYXBx`f4>RLJopt)V}@Z+immcEIGxqFL%P(4XVSdQ zk1+oRvo0M@Mj(vN2+I^)V5F{`*9nZe{PyzdX*%JC*Y6vf1{xu_b#r&t;!eXYBj zC->|RR37K=VP^l`)%=BQeus@3{vBa>DIr&@+_vN8wM)W%LgX#Xw-4B;`~>brI+W0x z`azO^dkc0vd299w;p(plszwCpGS~ce@b^o(gaSY&pyZkRpFg=yp!+jjXD0OT#exXJvRU4 z*{$qXYTF2tRb->D(@)3t&D1m+%=*&yn&%DnDpxxR12p{uSxh+SwX`M>-k7Xa@vuZu znOX{}NH+s<*j_gq(CgTmK>ycLaNak+3{2F0j)w0!LDJnDTbRpZ1ZgQOhllgJxry4W^hk!3iD$!Ouixr zlT%42o(u!7o9u4ZQotA*_)S71p@Z%{tTOelJ&vn zj3(pQmexw{7vQl7=*NY}FatcPYfldOYDsJ*>|Y11X+CUk(y0dd`qVzoj8v+-==<;9R{iyG;e+Og$9tc>p*=ro$iD1T16It2NlI=q#Ha#|9|hi9 zOBl6X+PK$aCh`2T%dd^c(}@k=pUY=WpnM<-9YL1N$Rj`dUuUUV)rf6eE-7%Su`^fB zs_4Ib&g+Eve59CvUSaxbSLeo!LZ7^0GxxrNwNn=ZW3yje6xL=NKj`#zo7=u6cj}+C zx4#Qe`TXqaDdoi%Kb;U)R$Nc!JUx9kaI(L!i<bNPMtQq{|HWsa)F`SZ?+7~S8YM6gRz+_p+Ax-;J&_f_ZJ8(7t`eem)7vy+4l!G6;q+l7B$v^$(+Pzv38bUY7L7vYOU@Gm`|^kdnmoSIRD_{%Bd?) zMkgya3eLG+a{c$Mu(JD=gN}gu`vltxfkwZI!yc9%!<(yB6&~L!@5O$|G@H8Dm#eq( z{L$5%DzTNn{?V7v!?!AQhf|orsF^X}D{h{tGB7es5 zta0`3Q09@Ki+63WWZnMtLN=>F@}v3F>F+m-3`Wh2eO@23zj81Z(Ry~|^st9pG`04c z9cO6r2a{>v%m2)#zkW2c;eY#dnN~s9kXYc?$#HggF48aMH0e@(tI+;wZo#&|)UA&l z;wg@g%p}JSOr93dRKw&P3cR@2km}g))0D+ulGgsTX!K8EX`^x7&p)>sOzh&HmQvl@R6so`?)Y;`@}1i|jz?>1^Hs~B2=DN# z{@-zZmlX|+3-MuMS5j7{H!I{H7D5X}{-#8y9S-pF_~%M5OfB0HR%&QBsq{VfI5@D= z?6}GN&J$^mu($J1KK@QoRIL`xyVxdY1u^IpqL3f?$$~>wj(u~5@1}fD3j6kyjI}@J z^8Iq{m&ymTBZ2xs$wLZX59Nui8oleb=@6@_{GOUA+3?_3s7`Kzum8J-7Z-0Gy`^+E zHbS27YL-`Lj>=tsh2)Ugd)YzXyUiY6qWq3qN-g^`_B^yCCrGDy;eP4bJHm^hz9_t_ zXn(o$Rl`&T#pTod(vQ-+xvm|Ot!W|GsXmSpiMuTRQ1en{!pO&}3H}loav_PA*gj99 zGDp^$uslK-xX+2r6CY96fdo$0UkC&LeVmNh`K0Z}!iwvt0xEOJ_XvPBYCm+(Qu`s+ z4o197vR$fG)C%0PeF%Wwt`B!jZEn7p|#PVn$7 z^7|Y=4|InN=Vy(nW#YI2=N8PjIb75pjEi3JGZP6}22ah7KcT}p8A)7#fRK0J9R`*I zZkNcIlz7U9@`o2iLpYnEWK|D(w8T?Tk2AQEi(>YfZNa5qJU7k>9G&IA#CqNW;Q%Im zGcC9Pr51M$mU9bCh>m(nCqyFo$o`!!`|?Gcs4;R#3zC~0B?;_rLoYxt=FY^Nq9@0| zxal%NaDA&q92cM|x$o{-%K$5ho*<(==+cp(@T}dB3n)%jWh5+rF6{9SfS}!C z@iCG?hy+z;X`b}vk3ZPD5PJG(ChR;tz|Bw}LiazS272zRDq^;-*qAI%xlvAsBZLAaNoK1Sr*JQhgzGN+ib^rB1Q^0QDb7F{h9-|)~29Op!$qj8-7HO%}u=2bS7la==K6EIb>y5BN& zG+2j|!85b)QF80~v*7Vr@eeNgjfSNlAre+NXU7XWdS;s)2`UQV>r6K=iX70gKVA5* zHK56V?}v5^ii@SH!Qz}IF{S#1mgS9q?;iZRN z3io_XF|lA{gy^(O;reGgQN}f!Irm>FRl`ec415tOO#kpd7KawFHh%@#dO`*q=cOlbHW#kK z4ZIQro1f7ck)&TaSM*!2kx0^MFz`z>UV;7;r=14#F6&@9w&b%I0zy!3sxgDIdbUsT zW%xc%Ni>Wnc)y$$%Y>bz%UFPmoUqGp8Q)FPVf$e-id$LTxNo{WFpD}-O7GCwq-`Ah z*FlP^>phu8RMG8rRwDO*7%k=ka^-VRK8vFG_)&AQ-J&J%aV8;w-r}wuJk-7qR|{oA zwFsh3##03)qU}zvhfK#>c7jVI)2`QqpU;Hkov=K84yVEe;7QT z(G0u)A2u1!2r$-~AkJ9=jG+WcCfh&Vm_huIE(ub^|EouYa~}@K`%gLA+Ordk_z%+= zd(>QMyk8^64qrc=Y5JeUY|*7%rr3Yf#A7oM6yvAJ?B5T3qVPM&{!d0oM`I6GwqM=s zsjoj#=pXyFfbTyR!0|s3H?L2-VCjDsQL`Hcj{L_@3VJ#TM(#uE4nRRhJS7LG*VUS> z2*|rDL`RHo43RtA&mzyyR$gJW`7S1$gw^(aBU6YP;dDzmd^=86y^KsjDYu6MmiGZ6 zH=~yQAMWIO>^+3sH}IUKKC8|a{O#5m48vbAvO>0~Dyjcqu^tLRH;IiwY(I~WmINYb z)=td5vxv8O98aN)CyY+uQhxv+xB)@QeG8i&391UAbiR&($NnI*dM;6+hxfAXEwo<+ ztoD^Ro<)kyVjn!21RE!T(H6vcAyjWMOHvk;6N2g&ZF|0Cn;$P4#MAB}^pY`F?LXG0d)l4VN0XxYO~6CXsjBiM-zTjxtK!IuF5Mmp{+J zMR%wdK3(rdrxi8s#tqXN`A&3h=UkVSFAX1D{@Vt8QO)eJ`JwFFjqPwJP?eMm~-lO1I7H%t+G zLDCS(ZT)ni_S>Gk!Tl9X;de<6Y2So(9 zADz9$8oi5N0jKn&6@h!)z#Bdx{EuyG4(61WX}kCR4fm7DNJL^Gd+I*(BhOB)SFRC| zb9UbyI7?829h5PNOy$IceJKPQ?ltp~4dPBir2_S9?amCSgUMeC=}BUyml{vgw@<3{ z{-iPd&W1N%GQQ5P9bdGIzx^4LAEv>|xWC`uf4wY(yuFC#3^#3v-nU3=oOt3B7S8%N zee0mn)(U(7Th&L=snnf~vLR^`wC1C&a?QvOp?XZL?6 z=!3ieC-QhhIfqsT?(R`$G?bFiPEo4bp5!1EV+>KA1>M7jPGP<`U<<)->5z~h;#mz#(G3pc<2J5cp= zd#38{ql?V+-gu`))*M;PA#d<*4uj!^bG|wxme;WPK6egaMn| zpeGSBc$|ZSHnlDyyFW2IyUhybZY^I`)9^=h$)d>YYH|(_qDJZJ)&1>``dU3^IRgqvhf8ZU^+FbCM>>x8id)pGY}|6u>JBYMrw?z_h0;d~21l4W1HlpSk-(R&WP-xXbLxx2W&-Z@^0ysvK6*_TdX63~ ze|6nIUjC0;|EK(wla)bjhx-~Q_zG9h(rRH~Lya&1I)?C|5E-Bh+b0WCfPPlh$VYvs zU7k*gs9hcglO8Z>0SF-df00Ve-^_4K9d$Lf7%Z>c@s-Dc3**?}ij3htPeL?8OI8z9zrG7510^K!GM<10-Xg-A?j^iZff(v>T;`;rh!zZephN|j zr3By^p$OxI#jlv4%!op9q15EyrAyUR%@T=Y17cbIW7+*{$7_-Tl-T@hqvWFk{)g0K zqBFr=fXx&O=9vF7&aWB3Ob14AlVCF`j-eas0nqx(2vrN1t9<8`TmMfvApctmHq(8g zFHgmPIW4U=Fq?px9Qx67e86TR$s>ZdP|GIb&&zNGo9QvZcM)u+q+qInF0h$^3f5pV z)u1Ru!PJ$=kBdqP5X{G6mUoGQhxlRoAki07AFe7RE?r$i5AMPM&b(jUQ~P@W)j%Xw z0X_it42-;tkObeB+6=&C2nzocT)f%{j}gj6R}u#bc(2+J4%9xCb|^sQhNMgPRl)iH zkda~_B@HJ3TlSw6j)eecHrVRaDgt2sFBAGNW0sv^o=3#nYw;sH0}2cWpf9id&ZYMM z%Ks||&|r94cPUSe>Xb^7O1H|RRQ0t9{KEye{c%-{RgL16^;J`PWXT!!4<-4 zGXOjN7fZ~H05~<&TY$wF-h)nNC=1s9{{s=%?8THhwUhugfH+kE$^ha&OoGWE zm~??j6PQ#1qy=2%zodTocfAkD253N8_ucM>m3?kW2lQz7%AUHK4`NCXxZZzA4x*() zN%(xnzH@PfPqkjW))JU3g2@7y%!A1sn9KsC7F^{&1QefK#b(2IB0Be{>y@hv{^N81 z_^a@hOtRTX^4M^#np>_zyR2KzZ^vK%<5>UU@mK%y-@)?7?YtJmWfG*6$mWTFDhRv= z5dscEr)&!&C^nl=9-zSsHX99|3US*-IHZ)I`UXUISjWcNC*&hr$p_;3ZRKmBiXqyg zAl-P4Iiyay`VvB4tx;$`$mSGqMNYiid0Pb_AJ{zMP?dq`j^LC{Z6ttqiw%rp(3kCu zae3Qv-JoG;p`+g>zRpKu!v}PxZ19e~?;W7W)66;y22l0bwTg`dXuC6NWhA&oJ_34V zz;jml+HqVYuugbzZ!LHLb-EN%H{{Dc|p)(Ajc*5x@Po;Lz)vE>7F zw7G))Rp)|HYp3%%FowkrM4(X(tHH$%S|9;To}k)1v=z|cYhM9dhrTMU^;rsq5$)$- zHGIcD@a~_IyWCa*f`)&3$`27QURC*OtwLKx!TeXlA2`c(OqX@cEuTmLAf*66tfK!T zBw;&)$zHYchZ*hDn1$wpORjmJT=zb?_WgtT*LpC*Km6dQw_G4U5-h%#pAWEH&#r)9 z0B*=63$mT*Ed%-P25|ldR+SIZ7M3f zsOaXg0S3I(5#+KSs0&yL3DkkuN!MONxZ{KO+UpP)z7{Z~lQ+Q%t>3Ztf^`DJ`3dt+ zEdUcL0zhdDn+@}M#{wA4W2i{W$VXLdXKI^ZExpj zVpl&N8w;Rzd@=wC0IL7KS?{Cq*C!eH=Ii)h@!Zhrtov)E34?$G$@&K#03cg7015!` zfbIcy(~YkNBfIV&}O$mZoD&sXTaRs>)MKpDXafHL+8@WuZNvDL4WSxI9G00s>E zz~&mn<=1tBi`VdfU=?WX`+DCi&gc)rfj)!Hjt4-2mK_kxI)j(M(*K}V;~z#U!59gL z0hHz-E`Q^(Az1wktms$tTKorifGz)_j_weQUa;A%7zn^?4oJL0x9Wd!s{t$~U>){W zRUj_wgV###_jSOm|1<)V{9iP?;5Ga&nh*YiW?*3~b`H2taJXXw|m@!2t9!v{>oN<FV>zNq1jIXzHzP z+RKB>?DO>$OaIqQF5u_?z{zq-B!s)$IV{^ zv(b!gR8&>!PfuTN&pFf6aAkOewbAaiM*YzF8x~cL-W|zVhs;2=wH5c%Y|ommy5Ha4 z{XutYNNBvdJK5h^%hz9PHMzVxTLHb?+<9q^7AO*Ggbng1uB#I%ueCl*@sAm2-=#B> z=fCV(0nZ_u*XkAd+#~Y2l>VEZN3_HPR$+}7m@TZeKUeoGQ{Bbxjv~d5rar&5jom0J zvwjG{r0!DMUfzcxQwl8ZdsfV*b?gf%% zXP-*>Off$Nx@BZ~sC^mQ&F`Pv;yFj1Y;~4_bvaL&*r&&rf9X9Y+~IG2S$4*78CiI& zl5cS?dzR(Od&XWz+`5df4AL3qy|+6+%A%vf-YR$|nK|w9Y?-Prvbe?v5#B?A!gn0% zXh0gn21B8jH(a4|UWv)JPAeh4lXLz;r6>;#EQAziZxJnPthI;==;duUNsm}R-IF%i zq|)XU;^Y%dpZ@NWrB;fcD|lo*&Me9E*!fA`(_&kup_M3Eo8&r8$c7l1+^{7BI_!d7 z(vkaRNrIlY6f3-UDF_<91KkJa(%$YPSz#)-?ck#;cRh{X$o|&f$Cjx-TJ{WWx-c!a zYy5-IZd4*ZUCpI21KM@7@Bia7vo&cZ)`KGfPiF3^-A9YkfW7=Xov35z$?=FglC)y< zuYKGsp0>h9q!|+=smq}SNB{PjuV=GK6j-&AP=TUPxvW0zi({H6q-c?B{P9_tM`kJWM?YS%ocbLwhc`ryE znSOXq!l86LI4jEV>k6d4G2Oi#{ZR8k8p2poh>O`|{=B}<_$V1F#CfElJ&RQt5@(^+ z($UZ8D~TqgA=S(ULCs&wt-Ucmk&lopv~QWrS{9uY9tsC$yBAtFOVjY1>#&z@`*Kf@ zA1?ls*5g;on26>wQ@0fP+D$VaAEi0G*v>9M6<1S^ux#BROi`tb;C~+e5Zt3|ckr{g z&5)gcl*tI!$N~*{Wrzq~g#UY6rBX#JjC>Qj9f@M8|Z0 z(roHY9Y#dPd*vrN;?Ry-D4}54&EK0;5DG0eF+W)~OWe<)h9y@R#~&Nzpy4z#ZjJ@V znj@Zym`h8&5N9;?<>rzSt^_dF$^^|#(x6dD3oxJP4Y6|&M$$n!7A&ZgV?RCcNq;}B z$Nt9ep%3v9=2HI9tU_m%aaU?7nQmS^yRYj53Ultj{v?Bkl+2C9UkM^ps*7dn{#oc* z+K4`=?~&9d!mOeBqaMj9qHzJsEr$DCl}aQZ%@@CP55<I z;DGbwOs>~?yIMInU7zU@?tr@?U9we-o1*k)dA;*^^>+wWaAID>cj>nyTmYuDm=VZAC#ptMc+e_f3R&J7=RN`ZQskg!h7Er<~5BUq!{ z+}33mxw~cIYcAtjeu)ZvZ2joanOo^Ii}`hzVv!WzvTsP*mO6Ek+iR2Q`9jlz+8!bN$ z4`b>t(s{Kr)KkovFDbgt{%mL%c`RTgE;e%T!&`f>%qGwd<*SnBr07|>YTlufbTBWG zxpTXpadZgJ%=TEwRfWeZ&$PgI8=60FJ>Iyir|tQ{##0A<4x`Hza@3@vE+@YrHY|_}SFsfdPmLUs20YFqo&4&Joyjfn{gaxyJf|bzAoRWi+mHM|HjW ztEAtj&zPdrg6jQY+ohA1b3|^mSISNm@4D|Fms^c1BnoGQd8e4CG2&9K>@8Tlhsi)ql3GF_QMpRcXL^b1OAW zgqzk>GH_K1NT?X6?YyZdzVC57zl^~&%hIZ)Xr9xzNqS+~7++XGt~W7stK1~86CRIH zt_JNo)ZBCNCN(P69K4k@4O(x7o^~9Go)bh$@pQfEsN1=gmX#mIuC|M$UASI!qaBN) zI&d5TmLAO%sB`6-6RzIU-r*FNv83VqUwF==U!h9QgG6g(soGgqt@&#!D|_D^H*qTpw*C(*(XHxlz&HE9!$K;Z# z2k~JKc(zX+RY-DR<>r08=sdGlqwQnoxI|82`vxXlgjVFr9TCQHDzuOH(wTKz7k}`Z zbhy@%dX9QunfKqPye+nbwOB)*El#y@f-X_vo;~?!(K$0(#SJqbr(AZJs_yimg0@sE zC$~3L^Z3o2oplu# z$0SVTz?~A>!BV~T$ZSQrBvboQsr%ItsQIfx-$p>I474*=_QAnr8P3eo{_aww^=@{0 z-U}}9uCJ5`_ustu0Z7)*qmMK=@0`u!b4~qPwNNWdy)z@W;iQ1>^f z4ZfP15BM$Q`CAv6cWjGp)<#XPI}c*>ggaxEA5_xHeD6H0&E)u-%x9W+l&5tjP7_@+ z2+U7Tyr^LnM)obH(o>hB7o>h8S_v#XiuYYU9k8AfPQy&A#A^NxXx!Cny4`ptZ4tjQ zTj3))A_Pgco@BbwRf^b`2|OI~xAx~KFgAq$^(BVTvvLSo%Y}h!nf%b4o3zp8UF714 z@=I<+5@Qc(7|ZX#q~T#6TJBBSB@5G)v9TM7WpafL=cw2XTlC&(1@xOk@ad5N`Jyp8 z4;MG1=6(*?M&dRxvAmP%dpL>#f%iiDS^v;dlk`>9D3}#Atoybd$-ZluF#GBIbn_$HZ&cab@BK@iKD7>-EKfKZ>V=M9#X^HYox zRTfDz$NCV_Ez_7&q%Pby`vD^}n(a+LRhI&eb%L(~pj1>0?CGk7nNWglXJgx#hV9Yx zMNypH8boyh6f3-~5gDJ;Xdpd*7n~X>p0Qx4=6PoNzL)DH%kix}>KLH=c%9VKpv%+{ z``#vV2nmP~v@?P;2;(@`-2CmYS)uyIR#+J=5@FUi4A^&FzWZ8^qMVo3ik)VOP${i% zWSfiiyKk_^Ch_O5&TF~dpMTa^hxW2e9+Ql;nSr_TZ23iP;-*_Gn-wFfqq^!$&zmC!NQ&VgD82D`6n(!R)68u z+!gfBUC*LmHh4XHo>(}xjR&Th-AQhJuV?h@wQlrXREQDJ&Gik~~sW!*0CqZhT{!&eK)n2X>zJfpbY~;hytmf;8x5 z=*atuQh08g7~g!{l$@!rbNKBDcODMWDc3RAeZ*bvV$MR-K{5|2m8SLc4a`oCioogo9qIawjVbD7tLMwRAn3}DBpl~7fH5X zeUvZ_aBzZxOu!zS&wEH?uU=i?Ce+)y6n>s}mSy~!fzLL$S3x=p-=ixd=}(ZxOzSqE z&%(kzet0?eLAN&~vzZvvpX7}sZiF_Q*%{vlCtnZV<=W;wqFJf$+;hi<(#KU8I(CVm z#v%WeLqv-UivY!aJZrFB+cK}x(2YFYR3$@+q?QCVTp&gk^HOx(6#p#Hnc*cT84evU zzgHseg&bt)>?|L=c~Ej0F6Z*MQc`ik`?W$cL%Vj$zTLKFk0NZ)PDPfkdY*h>g+2l$ zR6@aV8IBs(1h-~7qP_tcR(mU|F%Aa|*jl$P149PzX}4GkP^SQ%Q0p3~QX9a7)i!}y z(*?NXT-Hp{)HnWlXKzK5$Kg!8X;1N5eU%#=*GwhVH^PjI``8y_lsOFibi%D`sNP$Q zA^?fY48Rv^eHn&2OYWYXKm zqQw0&;N3+WR-W9=GoZE5u8ZMg2i$OLrqug(!2Dc=g9`x1-XY$1at|x$`M9{UuC@u| z$z5Y_+Cw~Eq5EatHhJYr>zdljMw}7l0P9Pe`(+3q#Btva{#I1M~9*!PUhEc|&TQ;}4ikyJMijN6|c6KvZF74AKy zsy6$l=}fEk`@=*ZPOl9HxgHa0EqKsCVzJi-by??9Qg}1Svln%`)VO27GJHxXHRVC8 zwqo_KZ6U122shaTJj;;_VSUUfmXeYxP@coaENOrr{(-SPk@y zms(y|+3L5H*-GT<1}9{J#J0{_js3u1obRdqQa z4^x*LTl|}l8OahTdUXhY^Bm@>VB|OUT8tF$R(0SLeMJB=|Iqw3=%CIip}PxPN#&(4huRtMi_NGop5T0wBC0^f zmnl;seBU?I&~H(DU2ie`VK$L*a=t>2*f4u66MTYM%RG7dxhePYjiv!zpqR&D4r2ME z3fv9jY3yX7+GpSUsrI4u*w)a=Vk!@JJn!7r7n+aI`KK4QB68~rl$#!y9QW0Vj7o%S zNy4Pp3Cvl1f%{A=P%x473atU-8?izaU$QMCXpYdNrVNW#PFML`9z6p=aa1HSDrNU` zez;aQgwplnN|G59u)2ZT#gZH=UHb!H7p73={OPEwF*Y>yhf8w^YInWcLP7}hlHmym z83^`BKblWa1p<$f(AnoIu0_ea#oR`U1Y4cvVpe1ZGm6(0oz829Xm35dxd& zAl!mEb0M)whFQT@WkSK2fLflbBrL&i+_M*}hb_o4K-faQb}&rS1hQ{)$q#f;m58GFzA{iFXIFGa{{QXhJb_Rx@(tjf`Zo^-An}*dX;_ zF7f$86=7S7qkGZUX=;U-8wD z#h;3u$x5p3F)HnY_4A-mG*AyuXXP2cJw4qk{?4MH%&_opj-9VKdtc;qIZavj1F>^5W3>=kg@&lzeX`!Her5Xv zg&+>s7o79O&+maT6s`--H~>SQO1OX$wg)*P%e(0(L{%1LMkjURJKLVDH;i zc;l&a=e?Q#n_gJTV`Vx5b-Kqe97lhW_{A7k1j*a740`6%Ve4JW@Jv6gvB6)CJZ@C!T;3p}5zTxd6yNL<~+eheZg zk%!4vQPM$}K{~)He`FQq>RY~j!uQ6zQyQN`>$sgLw zS`C|RIgM~enKqCxL|KiU4Lr7DeKX8|UI2rHSTO807z zI945E8kGK61|WbYd%0)hvx|j*nrLHqx6(yNoYC(k4$V(UmXe_=dZKp-LB&V9ze!*}5t@KEnAvnA>`?`63`I`{e1>}GD{aUMh z*qPz@cyjeQ9>e+v@+suAepe~hvcum&tP@@zFI4w$OgmNYC>S6o<<>OdRNUBC&8}~I zvQu{k&;3%nR4JZnfAs3`VN=xM?dL3KtiT^idlFF!i#=F%DsW{X6EEVZoJ(Y&L5?lJ z!5W-o*xIZo+3*$ejGq|v-B{xLfB1;TgVo0=PJt@SnV6dIx;0@pR6(s};Cqv3Z`UT8 zPeiE>$-S^r5lbKsWB(%vrc)MKCdXI*8h=_^**AUm!+~SU0wia3e%Dp>9>uZk2mHyV zu5b;m%dbd>&wB)q6Ml!0fn$df*V)0kiq)oM@ns()G-swHNfwA}e=UPn9O-bF`X1-| zlJV_UvUtJ{ir?C=%;QCC2ouq>Df*u}=ArD72QqXGtcwZ$Tv;3FUD?{r(J(}zuDblG z$#Y{kW3mU#Ua!`Q#2dwD9Ijd?pNo?NWr&Q6vGGD62*=5R`mLOFO6ISC+Lq>~#5V6kNWGc%VYW99>v(b+&qWq9+mZ)FAwvQ;`Y^Vgi^jB{~hFFywCAg67x zOyzBU53Lz~v~Y;q)MsDY2tpObCwHuHr~=RH;6RA!H31nB2(h`$6>DS$a>g~=n~L6c z`wj``M|gb*9(>+2>K=~I*+-$$agHYnaj=Oi*Yot;>gi9L#*yZI#7>((jZ;cY`{F_I zqEf_B;qMZNs6HC&AY>cyHns(&^+cL5Sz_`Kd`|pxk0K2@YD%&fMcz?HZU2-&?qddp z3hMbaARsLOCmwIL_p!cavmBzHC z&U%%cB0eaqeq|JCeLUxit|D{?Ar$S5=TcX$n`jJi*FE_{w7X+KBwqHOIup`}68b?cidmWa) z(9e6Wavf)Qv=!>M$(&T$K*48VpmV!-+Jf)-v4g}FDL!rXS}{zCgWMf}Cz zOZE17Lp3d-7wz#^zN*d;>bpfJxz5P1k991Y^-wU(LJ=CEZ$}Nep*?Z_5gM+8KZ+r* z%XZ_eoUL-t$sa}+oVg-5dN1S>h75RPVGJ+6gz&tHFDqNoR>Qp-6t#89NfRkwZCL_I z7^JD_pV(KgGt9AnOe@ZAT_&(aJAUhVR^YVE)!(fzUaqR-tYR_CNugWP+q~#I$_ZfV z)Eu5`7cd0kG^YJ1U<)@Djd4!E5~o_<1$I+_o%u}}Tc+LU8;#ZGMgLJwAYd$11!{GM zdN0M|rmQZ!!-OmXVT+idycf zl5tBhO=t5U-Hw*g`&Fgc2&dTCo3h!oOOCe&h^sToF$6%nhg#o1?f>AKt%5eE7LNq) zM9b=8E;)d$=_$B?dDgdTb;dY`0GJt|D*65oU~3A5)UD1ilku)@X&ryQE~|?JabzzP zos|)cxoCDYyOQnxpgJ`_nAQ$(Vs-dg-rbzUH8;1RC7pVy&{AwNh}uE=Eaw)tzWD)P ze|3D~em$@Qi!04G;CdcykowBscI*B2`!qREMp;VPGye-mp3IgHvZlveQ$0MIxxAUB zds1jqRv7i9>Jy21z&}K(x-4dCDeYNccbYb?Y@Y}Gdvp^_o=7AI|BWx|KwAtOWdSy& zfDzNisrRHTZv5-=n58Z2%y}|L_NCCAZfBeYlcy5%WX`Xg1((2ydnlgFEL7Qof@!qR zVHApF&Vs$YJeee&XrC9){eKp_3a;@y{tcVqPqf7AP=AbcYWHGCRNi?xl{Fr&azU6X z<`fx&WZxwG4tqUAGYXDYJ9Ytr*s2A`;FIg$Yt+2gr~_hvR%gMItt%j~ZO4xnWNXLj zbo8HSHgPv_5ZmvZiQE{LCgm7WEXhfBb2^0ZO#;#guda7v{2<;JTLhjLLEh^lYi&MGw#TVASMTS*7hSGq$#?&{2s=fp8X zu8CvI@q%8=jU^^8!*zC$FGgBpCTv-xxBY=p9HF*1bThB&pGD}ZD|Kt735T0+4$nA0Za~6OGk^omz_D=Hpwo-0$KPs~1=CvVQH6g5r73f%U`cT;nVO$68u(MEA<= zQkT2N3aDeeJ{R;|kL&E0f-TL1l~Monm=gtUuY;9mGWkyt@b_nw8ob44DD8-&zXJGJZ&Rf^{F$0krH6%N zw33>4zj+9%EI-N!j0auPoH<@`yP|&co4y;>WIaHA{%OB*7`=LdnwTwXY zzLq-kVr6iqpt;d1^(sVuN`^P4iGjmWjU6p~FK~T%En~pWC=3KyF_4vWBA87P7Nc&+ ztkaWtxq~W}%s7U{vj0bOC16furIq!IwLY8KV{j{nl1IDZQ-(Q#L+M6Gt9nH zzQs`QXQjJhtc_kO8&<>*QS4b3G9>ysWszhCylGpt)x6Y|ckNE=&106sE&SpSxzdHKl2MA^k>Xm(l9FS{Rx`1uUZrG_cp^oSC{uUmu@eyA{_p1D6&zc<4?SZ3EWKyAEWjZ`tsePiI62q z`jzFB$O=At)#H4it3#kuG$;-p;3AoXilV%<%Kk<+Eg&oukg?jm&Nr;LWa1Qd!n!}4 z{L%W&<%eFLAPPD@4m;ldt**hR-5W0~6lB46RtQ0*-x+_0<20#B-@`9K6jo|noKQ|J zYe$dH2v1_Q*Gl1kW#${uh38xh+ zsf3yem;P=5!u^|-L)&PIyKMe9ORZvPNViHiUZvBplaAgOILEL3?iUbG3rP;|=mI5$ zd30erK{LO-{aR536qgL-d4ONKUl;=-;5gfF%0yTr|Ke*kS&*Du8chjKUaQDAnmVb| z4Vn>QZC`k6vLHTbH<}U`3Z!Lz{1u8BGqin`D|aX_RvK%<6?6vQ2)dBkbL}y9sk0Am zkD6s2%ZpI-A}-ZT`+?u7BlbDo#Aq&CX^gr4aH42;Vdctnh8gk9wWlmG&Y4$#PU=?+ zL8V3BcWv1gp}c5q*$$z+T5Z`Lp}Ylc*#V)vCxUAIq@=(=)%QtBL468I4xWP5qfwAr zjHmJ=O{hjaEQg_Syk%;0X60WR2@b|g3dJ(ibkO}I0`O-V6+&3G#E2!xW=s5>g+o`G zF-GpaS$rZ!J+!LJbU0oF1dj`6y$u>61C-ys|2)fuBhXyN*_5T0kC$}y)?)S{!Pujs zo|KhI3$E9PmOungV^rj>ZNSwxx;?{0r4V6aRvj-VIpZ}{;Pu~AH-C7b8w-hY&- zH&MC{HBA<<7588*W1SKDc5f?Ey2Ipp8a$ayH@BSPzJv*i*i6w7ZiC8$i0OFQ z7|(!2A!@3UPRP(T=UFqFs^IEg&hYb78A)qd6$v?1X8kOSBswnBt@^j%_mwx$>felW z654+M?j)MEo7$5QC%))i7BJ`!P`=$RaU@ z8KHc1gk&eL1BYS)q04l#Tk045RxW$FPtRR><(ps7$h5C9>aE&jBK)w!rsH}JBsd&; z?}}i5Rhw$PC+j+*M%G*W)whVOm)Bh?=%0w#T+%>>=cx8wob9M;GRAV=DE8Ar#Soli zm0tWztYJ4Db=Pp9mhx*wSZ1$ADj?s#vd5*S>YoS%aK~0vz%su^Z|k8L^PN2Y53#2nMonyMI5-UKU5qqy8QR}D zYU49Cx%-`#?C%V8jlV!WMO2Rv2Wl%oN6~dVXQ4b5aRRFxlunqktM=%U+5+=Us&`*w zGdJC@#V+}M|3HJl3(Wg|DG09L2(YHtbV)V{1y8c#LR0 z^Pi|T!neYTXB+ul@Q{S(n$Cu#SZN9_Pw=6>dTptTWXk8KZ;V7IoM`$(+91Tw8#d?- zaCR(Z=C1X(^1bLc^3`*0>Jqc3amrhXv$4IDm62RcGQES&|9Eg|whG%> zg(O*t#GaiwUpd`<38Pv4B`dR0WOr@LGmA0G2{NSb%O^b+qT-Ogg0mr`1e=_v^}HIf z`?un6h4zDFpqvEnQG#KYX$;!1;ZgS(-6nGPg6?tzmh)YQ0+@z%kk}=D{EjoEQF-_= z{rTr7oF#Ox5(*q<$fjs71W6BAvX}L|9p!C8cj@Q11EZf`IMf4B?)s4l^f_HyjV@h7 zolpyx2kD_>ko6}|IjhBq-cS7bY5%!MPY4Co7)vMyIgRE$V`B@CG%UJGWT7(&TYg*| zk7cpnfvGJc);PATlQCpE<s+{K@TF+Qa!Nq2k>tSX|F-biB1 z`ot`icrMd7F&d|2bYeT{ya-ujM=NYZo5iRWYTk-#NQ=0QDH)^@UXt6ntPcRY+5vF~ zb0|*^+~`jy!gRr9A^HPZ;9_7hCib+6UA%2yU3ci$IVLmB;!uh#Ly z#Im6lOZF$9wi1L8W10Z&(X7N+b(qobtV%zlA4LMRDk0zatkRPNAMz?8Fz&*VD1~8u z-?!M?6D}1xoKR*L$|}Vk>ErBCna+)kX=W!GZ&qvI|$Gvh!JI_-)DL;Hq+U zB<0iZY-XI&QA30P`pIGh7xVd#BLSOP?-l7Ke=_*Qem77AK^!d4yVMdn2b(t*%-9C( zW?$^tP0zGMIgChR{SsjK$wF@+466FLOIzuSXg;*Bu8sv)&qcbRBoRcNauSus72JG4 zL~jyiGb6AnUNxx%Q<<Jz@$3EtALaPt>lQaXd<~FaC2WjF<~nO{(e>Iq2Le7>ZOtWqlEQQp zpTMbPHa5=VvF$?;KzDHN8{Qi~`uiqGx|#NaM!ezjSJq@ZQRJBtolN=w#p2$fvvaGV~{hZA2<*OKr)PxI-d$)A#*)d6%1XPIh6`7-4Z0Tos zW}xztzBdaO7|R*kv`-An3uizD%Z-4b&D31`T(67paRp!TVMUpKQQ;)Sm!@r!cIzg~npm&`UJ-rGZMw<89gcZc@}Jqo5CceB1y>Mbyf zHzw?HRO`T@pVgy2CMaM=SL0Vy^SpscMr#i@g5#)L{!QVjF@$dO?-)>zwM8Pt>$H}5 z-+rf6=m5@Zgsc@C!*Q^myCq7!4G3NX8NpEsKe8oJttV`2BAzdq7hHA*)Bz37$nE>V zTZ{CZ-U;TyqqJlF{QZ1LRj?$u#TVN zdrwR$na`~j9$SB~4%U$8X8iQm#~Ad@cz47e`aO9$+Q$^sBKvlPx$PrXwWKA##sY~FOJyDM$p@Nni)@nT8ijX;)${Jnc-Br}^s6G)HG$)zHrZ6y$Q zv){G<5fgV$b}UL_`iuSPC)x4DeP~n-i%(8&BH8yS*u@bEFYg?qU1GDsPoicJ;>~Q9 z5z#etaZhBlrID{%5XBZ_?MH3MyCk?wh|ENAV&aqqC)v7pgQfYx!7V{wjE%CnW!4tue$`Q&eSAk$q}Qx z6`1v6lU3~yF>$$3HAhWk-)Ew0N<3q(hofs~d~%F*D8z?o7za_s7HjqF!g3rck0{L4 zJjiGziOm+~zD!odkR8MP(rFDBTeNI25?{=&5Kpc!ORi8$u5d}N5J|2uO0Ix!OpWZK z6P_dp&m(v^Ue_%>q$I!ddytP z-WK0#gAfl?-KDP#~g9wYYR5!&X>fr=U?JHtK=Fv z9xkMMo-x?9kNH6~2?WB`Pht(h-@@g7-Dk_)tr%Bl$~={mT7Tpdo!Q>^_=0Lj=ueK$ zN=++-+$5GjYm{^ncp4518CAjsa(gk)6u^KjRrYZ!=s7_5%vLuO%{Ry53rHjC$mwQ1 z=+GrAaWF2hJxyKqh}dvyB4gb0{AA7|1zG-^PAbqsvY7P^?XTzRd-(=Q*7}=+P7hXc z=Y?Qx-rKRR_KE+RDZc9qU?oxQR+siAFuC7ke|8fwso-2e6BWkV zZ}e?w#V!6Ndc#-}h&+9KMw*0LQ#SHj&|YBsV1j7ebUd7Lr|j7CYy7yt^i->aD*9qSN2Tsdk?cvSPv%u@_{pZ}cS(3`cJd@>9Z%Ir{7^uD13hEW z6mcVOrYI>BAJc@_F&(PnEQqfc(Np~EuND3ziRynZ`1tnBJYe4?A5>646K<9gr-gk*#>`>h%lsu$Mzr0J_J?ru%##dJB zTqXHU3KvCM6+EplTFZ9-O_sAnoOkT6XYW*AWK(!Zlox?gx(uH<*3XMLXseXKC7jzJ z6zN}BgldkQR~N-giD$*yHQl}7hYQ?nNpR(GlLIcH@CryKNanDPE~O(IM*_ehPd8+*K}SL41V9rp1zvTe}rEH3RleZJ|hMce*1`= zr%mP|M~bXgZ(XID)NPGBTry5zhG$f9PF28q3egQ758+%rPE!>^wi zBER6as*acb>$L1xPT6(nUI;50;G9a2p;w4pIP{k!$6e!N`Q=~)w087`{Errr2hz=3 zwS82qev5O|9G>J_2<7;bkgbqQ`~4{WQA$>KwIwa$;FA|c)S>=>PE@6ON;W1q%MR_k8 zyVn}i7%BfKaiT~MZ#Y2|zS~J15Qry-`XrU9;8PWlkM9SaUN%Tn6M|nHejMp)KRks4_=I!P80h3kWJ{qg zV$-v;Jur#vmnMT*}`AXwAMXDf6LLt};KEAxXwQai`_01QDYSj(ss? z6-7@|_a-0%DSq1wy=OG=)fuO!ey<_CA>Lk{MO$*lvM(0ir9g8<9NqCNsV`)Y+hO=^ zt^lg?K3i(=05@mwL&I>CQyNuzKAeSfp5#Yqz5Pqf{i`?(M2a9Yy^}afdnwMOO8+Uu z3|ta887ro=Z#tdtqhXf=RcYa5qW(X+-ZCtXpz9XJ-Q6`f!QI^*f;+(p5Zv9}-Q6`f zgIjQSC%Eh2oO$oL?|1KybN+O9t=`@Bbk|JPu2sF)zKEuBqM%4yeC^}C=|h(n`}=Sc zSvKc#RxHfxwaVtDWW^1RVp8oW_IPQdt568N?Pm<;T3(_p6N-yGK#W|}C;AOd4SdGa zgkXM-455*J6HA~>6pM$rA_C4@TJMVFys5DB-3jPe-At6tf*kRFqE$reXfBEdEx>S- zWc}xdO0Hk+80*uhqD|(wd~)-EcMB=ohF;n-CR&!|e`Z<>RER?*K1?hAU=ro(KH)DyhBSb*G5q!){wN+{1ei6DVSuk8XF zBiFU2Cf=rd(fUe!#Fd9yYl+FYEu^@AW_z|KYl z);3TqLlHTt)^lwUYj`_w+W2(^Qq($|jGI`bRp<46@n_5kzK3u&!DDk};kPIgdb$s< zGjnxyHJE^!I&b*h43UIjnBeU}g;wHhu;}j!WM&!TY1~-Nja>J-jNoaS0xCI*$crd2 z%eqP2Fm@*o!(`xTY5Xkla0~IsiFt?wkSB#a9T_g9(d9UQIy8vZ5pXGF;4M*3#e+qg z7ev`OiwU$n%Bi~W%QdxkkJW-}oB^ps9!u>vH&<;U?`$JIRE&YwzOtQ02<%P+A!HwZ zCHUsP?-Zl-u%3gzh0h)h7<67-5L8i@c8d@ex_(dap4$%v+yD9Z%AYHxST4$Dy*Q+X zV!D9HB=9nT6=rv|Ij;ThoBrRF6N(~C8+<1q>zPJC`K<2PKeF)kO%g)hMXcTnTH7Mt zMKaYxT@=pFEOABlwL~JD*NudAG*oej1GA13da@FbrbV^wMMoW1uln`ojx;tz-|n>6 zE7;Q}mnZ^fqnGYgrox1F9&a)-tCv2LNr7QOPoG5q243J)gT`2dGl9Li$^w!QB3lJG zCaNM9KdV-NpTva_D^S7pgw}4ziaL1Lc$xKtwRpuWsi|C+yO?M?ZzSl#+MasJ>WM>E z;f&SDn=&@Z#1C>j&*C4FqqP2o*sg~;qgx=x;E9|ZXB%I4JA1bhts_{|jp>1~RA zakHfkTc;v0S|z54ia5{Z=U+j5JBkwT*188Sbif-?UQda4#-ErR6~tHua}7Q=GW0n5 zajAVCZHx4w-(}Z)9o*9H#~5`je7HsF8Yq2(b>Q_X3AAWe{jSUG<oO!aese}EZ7iY zRQ-bDFKF$Mx%#7Klihr%#8BGGYjpKTOfL8yRQQI=XLM92=2UoZenk%|=pEImITeEP zO%ktKXePFu^tm-hbxXpw<~E}1lO0b6B3so!K_@kktVU@j4L$UgJ=A~_K(^M}ybVae zO~`hjo9FXNk74eBSGExSJ&*dAa5j^ZN(lExmLni@Z0Zt-R(2J69}tMea2=*BKKs>P z6%tc<#-%=_P2G~;0>KbFr?>2O2ty`j*7Thq`B&iXEKw)HkLAZ|d{Ybfp@B7N_8~kB zZ=6~veA6+TFudACIJTko@?*AmD&M`72v?qaCI1Cj44kKZUb)}>Y1x*4Rr?xHMJT2J zF*wRpGaIRP21l>kpi?FD9;6Q~G90*TPK5knAlfCoL@+vo$RCyEN*e5A9T#=NLyHov zZj5jV?LUo*v6Ru8P9c)2{$JA z*83IHu!tqYzfRS#cw8YIURu@%j{?v5*zi{^7(*(IkrP$hCbu}WzHt$28tl|JjlkM1 zCCkdBa!|m4oSCuU0^)Wc_%JjaD$VQHD$+#58g*r8s!EOV+KeiyYMkywG|MGHjy8@3 zRxyZ7T2Xkz6PNq_8cZH)OH!oYoVIIRm)WsqY2~3ncw*UL2NlAYyVTr{o2gD7Gx9*a zXsA~hqr=e#J}4cDl&Ufk`;TiZSP$8JQ`ndP+bE51-$zSCw@7CJm&HC z#!Z+}(8^@WZ`%&!xIHFi{I;}>UA(LR0-zzkCV8MB|FaP;`2%uy0J|@+Nv-f*jOIDW z|Ji^#(b5=@5A%Q9{O8A8_FsczwUG;-l;3>MQFI4B{2dtMJiQD{qWH(z(?3*3U^Q3i zX%8S7eqpEF24c5ij6e6S>%p7*7-QFkun9Gg7^SiaDJ$768|3WAlJpDdqL{%wZ!I0H z%*bCWN$x0LD{&jeZVPr+c5(ptmqMklvwZf?AM73fy+WUE6+hz6cU28}pBgNvC!KEX z8oToNeltCaRrn?9GX!L%NgSXK2 zT?*>>0A$$;vILP)!%YxgZNm|}KAWktT}Fx$jy<{h=zt9;(7~xb@_!C)0zr-K)pi-N z3u)^h`Cz|+-VeKcfOBeaw<3R?gyog4pT1qo>*Zb*eRjHaFLhmoab4#AcSZIZL%uQz z%FF%NDp9e!0pr`Lff^}5UhTx-PD|t(L*e?<Eyu-GDK?K5SH5UUBa^tNU*E z_zSy_LjB}7fwr~|V-i>C1&TEs0D+y|b$ zZY^vb)c3yM3~n9t&W1*5&D3mRv+b_a3|QumMV61*iuF!-CGFGvpv>_I;{714vhj88#f0IIYETW`%fU(FGWju^Ba(}%}@ ze3;{BBIrE8!x)|a9=^(a38r=_8#=A;q@OgZctIR&KD3jn9OlsfrjjjH_P5PtQ~=+A z*Wa7}a;0jj@15=6 z8imrD&Ks7u$LoTgy3?tn_v!!x+L@q*^hPXeLq1Kto0jIIh`pP(Gi{X=S>Ljb zjw^EB@>n5-*@hFy$$DTMhYDU_4FE1L?BQ`1}Xo88(F z`=MJW)!D;N?B@erRGW$qTTUI>PkUQ}F!RjrW`q7-UbB??6?uCejAh(tj}|`nR`zM1 z1&umyf06@KV2w*|kgkqi?TA3g&`h)p&$G^WjWxJ6RyCo_C;vE{rHgc#pp7L@luvsK&vBn99&deH!l5b$` zh8rm`+^~{H_7?q9=f#OIzj}TTe>RHl(yuLMnbO zZz3Juq_n+3z6A=~uog&!hb(|E$1kk)oM*40Bd8tihn|cnd=LM(zM~RQuoTf;=QUm2 zMG3Snl{7DW zfLq#EZjJk0nJyL06ItFLxpu0~O>-PQTns4&d++QJu>Osn*5x)A;9I(ScH$7@woxYy%vE(MoHxIHy7wTA1n;WXoyq2l?g zgizG?m9Rzu*-N&p%51~w{po3{tII*y_kHg(p75#Ic}1%D()gRZZg}Fg!TQmLSwC+> ztF-P_Sa>ofQ_(35U{BS(wP!DKM!G8EOVtwnUeWiKP;10AZw8Gd5WFr*Z-U0*`U0Do?@l0huD)tO-r8*cU`pvi;Ne~eUfXGV$M&S5} zJdIakvZBKheW_z?Mo=*CFxbCHI7jL2NY6w;OQ=u}37FV*2d?!Mq(BCP+kI`_~JdKf>4%+b6lMp2GKE4#VpayvI2bQkoDpSE4VW^wb_FkUU<>kVDrEY@crUm`(3(IHMr1EI@PIXB%W?6?=)ul^=Pu z+dNp%s$?{66_O3O_Le_C0@$%UR)y2q#xL2zmYP!h`bH9BGvJ^(vxRP0vl>h_kzk)xgrx}8JFsjmi5Xz!t?)FjQV4Y^V}!-YUCWTNwgG+3XmWRyyn9JiV0vN}>G3d2uqQsxd* zrRDC0n<$ng8zpMo-NlZ9u3C$KW6X(Q**6iZBW*v!$GG{IIrO4f*WMhR<)TJaQ;~MR z9ENS>hZymgxK*cBg0Ybk^d+;i zjeU%9tFN*$#pKrUjpIaL2++$?B#!Vc=eKwMAUAa)>*_HJ`-hVt1B<)*^x$dkS}FX% zug#W7&_lCBYM+2l=70;>Yik{}x>9jHAuV9w$D*rUeP$7IyhYhF4kiMtfUMwn4*p>ef#wB7L4V|5eLMd`t+_)7cC-cgU1|pt`$4LTv1< z%kQonJtkEQyV%G~4k{J!E2gke26$G6NGR$)16l3%_IVUk$}{^*3KGiloQ*emai&OHnWw`rtE_@8!Bd$HdM8Af5^7dAzIy5N~s|UP$HzrO;bR|!oOtD!j z({u|ttb7*`OZ#qX*sodwmP-jD`EBRE?+4nmqdJ*ciddobT30)WsK(q+p|g!`#ty0; zvq=p&^1nOnKWTSzLe5>{%h)qEg}`i(vhT?gx$_mHtZ1JnK*2`v8?f>l=-`LA9#-ch z(-y3xSyzA4bDo4}{yjwEmM1tT&m};pu5K^f#rf5Y={&WZHQI_#-_dezOtV}wb(gn# zY4^z0^UEQ!mTP~5JzbS7%@2_%pW?^YUJ6cF)hLZDF~Sv_AftUcT%~B8ZP<@2Bp0{FtDger=^J_s&+& zLoCnP_e)Ca&yP|_!uiDhW{tle>l<%^nCx}y&^!Eq8oZc=d z+p44K_x!7)crD!LM+`X=>x#ZzyfWnw_pt)|Z449G~7aDz&1`wWOHQX3Vd0Ir_<<#~3I(_Ssy0>D!zqh=9 zE8=f#&a|IhC^%WqhdHnX(C_!;PPtg#^Ar9ElfSY%y7HbMiP~zhH~qEgi;ifExb^zp z651sCHcmGkI>oS89)dtko{S;q15H<`8Fr{ESK)}!DI!->ameulB!NeMpLS|Ob-^rLJzZ_kp)q2z`ActUb9g032iXaaycmqSS>poK2@^+HG?rtbF@I7 zt2*xBA`<=@osFRl-L$76c@{UlX*&;l4iQW3-ynOqb#84dF&eiahV*O*N;lt?oz?YY z5vwzIZ1rPuvmOi3PDHQ~@A4_z8(hV1^(Ahk>HkUNh1+&bFnc}lK5g|ZC)H+1cUe3i zT24{N%Z#m;wn8-sn_^!8&?cu49H^JybrP3O}d9{ zmO0KJ<=q;1u%0rFo617gh&67_$Br{%d)|Gcc_cY7Q{h&*Sj4!8H{S+28Z*2WWUJdV zMYgW}5Qp+Z=HZO-h(k8FwYr8`i9>D-NZbw&ePI$J>`D%_9E7Gju6*p+SRgSJ^mqGu zxgWgD?t+C2^7#aNE|bp(#i(bvX&k*ny$ruk`V0Bv?6k6z*li5wv~7p4)ejIQ8M~1v z{6;wHzBQ*sTaQjdAeZW=Q>*fP=<9Id~4cE*2q4&azQQvj5?OKQH;xxVY!6t*N8*db_ z?5yO$gBnLr$+A7^9Gw$<>0@_Emo3dxK;hC{Y3=-YnGIE_BI?EQYfgzvK6je?two%# zoh?MfB8JFc>DTS9^V!{CM2}SD8|V6q(Zbrpc2%TE+dJ_jr5ADdPjBMK<-5vy+Uq_x zDp3rjP+xmy)D9)OQYZfQRlEVI6`LTo4N=!kzA8*amK%5sQy;;=>Liz=`AsAJ`Vwor z_Y^-$PMoJrhrJ{>j%gw7rP@u1v&6!kYBk=M!(avse|c0?lDD)geyp&_`_j3Od9*)l z8vpSAG}+_k+EkkH%q3Mh2I^(@$KGl#!^8Y|LWRwm9)q3Lo&{H=VlgGuVa2^Bxx8QG zLhO$n*p=n$G6!IYKkmJnZT3XF(ep>OX%8a4u$8&~ZO;u}QmoG*2sB~qO}}n)0NVWk z5@m|FDLuF>_sb?g0TL3)=$qqu7KCObP` zc-sfmM|S!M!wpXu^&*oGw|%$C}% zyQG@TdwuS+1}D#kbS+^eEn#~`|I}MrFVFh}j=SF*(mD_57IGV&D6@j%_rSBKuJW!? zBmbAZ@UBt+!t76&dt4m;H14F=C-6aE>%IZe@gGPc{b@*L?uOW&Va6Q_(iHz_x z^U$0Q;ZJ>u;FNBo^Ex5dUJRHFNDS~gTM&IN2O6AALLA(MD&~B0JaJ`Ud3iE+P;%)C zj4ne&9-I1m#c(@Movw~@34{~r@7^vi)8iesPr+`VOE5NIFK!IZa$=*e=Nm14$I)|^ zySTBt*VpBh6g0=*kN-qHlAzUBa zzrlIM@W#&__9@*APis92Cbe4>3yI5MG!*CJA{>Vj?agkJ`UWq0vEp20J}|~FXm~|V&HjPL8?@{RuvJ!-fz9(rZbwCND*~vR3ajHkx z-r}~w!FBgP>=Ky=6YdZF8fO@|AE`6 zEJ4aqrb%~!IT!96IF{kfuyIniM0Bh3I%x!Gn)Hb^;CZm_2z~&Nm@S^Oz8a#k3`!q7 zX3)=df#E#v3X8mqUk{E)_t9Vo^ey*HrZe*^x%*1-)TlrE9n{|R_Jx}V6d(rG86+^^ z{%ms#)QiRcH1N-UHSU*Xf4&B%nVNmOv?v>z>t)eqeetB($K9pz7-(J zV$@5KS!AWX#g9FMwP8e)nyy`uW4oO>xQ(zJ$eqAn)!H?0nKt+C&dyAnxwu8c-H?h+ z1mmUU>TMkjmRhBA5UxH|vw@tw@U%l{7_!zreE~Kf9KJ)C{bWZ)Chxs%ySo3Y0F9kw zO{MopRduhXDM&KOr=jDKMwW24d=Jv}RcB*#`ULmjm2+c~==NHiW2BKxW^bjel6?hc zcgnW=$ni!x=q)MjZ=CdOXg(%CbM;(vHBzq)wf*&VIHznT9?Kix*_E7a*gEbUYdPV7mif{v|M54^=X6!by=D z*Cno2ikn2QC{r~=UitH1PtK3C!No^;eO?A0nPMkRdw~ipX?QuK>8hUNzdBo}Fiy*o zE!Y}St&HU9g>k!+lngP7M#+nRO1K)LpPlvh2bUv!SFw&;4N#svQu&^0ZwV@f5qg-236q$yI zK`YE>0wU}66ja~f>461#SQj?Anfm{Trf;%bQ{hz<)q+cNJ1#lKP2Ars`E3LE8rq1# z+_}&)D>n2S>+h^G?28_2*iprx(hAv*o9|r@3EuBML8_6O*WlMiDW2OvP8QefH{YTD zaY7%{Qa?vW`5HZllJ+<|hZ}9l%0q_*vOWTwazFh*!~k}I&~UJS>23@T@g$u2za_qf zoKFe7!k1L#c=fv9?yQ}B)bYzRr*jW7XYrpj7UN`uZRfYrwt$|Edb&!5#PSCP-*sL_ z;f0VWa_RnZQL98?=7%>>L{R9&N-iZ#10 zwi&JW*#S--86Z8Za&9ppUKdXcrZ5fXvHMB;6P*_y`Aj5d2e>>Hk=l#6#Ey$h`cTc+ z7m?EYaWmkOlX0i@#_0tfO<=G%f&*I$U}fBukA zuTL#_OM6v|Dyzt&ZXvS&9@5p7If!>H>_~D&cM;Qr2PIiY_xHjth)NtZu)MpMjF3Pr zAy*-fnp>oQYL{PqJ*hSV|kDbkbyNvS`dhqa6 zAb9aBa7n+x$5HKsR44zaSCglwPg=d4b|(J7iAIB!fD4L}S+|Dzs!rQ;=lPL0Rm+zYr( z=gq%N`2&pk-0TNS+ik$x6xa@4(T@V}VU{9@_*^~XoF{9jOw8FyV)~nkY^vi49Z3tZZ@|hkqwUB%DtqaXWCiI+QTOGnQtqz_kn5&p=sa|$QVQ_iE(U09iGn%_N&i(^c*BoE}3f=QbezAA#x(Q-jc z9He=j{t2*(@!y1tkm{f#BQXE@GZzgF`Qe8wI65ye5_DtTH!*GbsRRCXIEX}HEE?)> zTo2(+UqO){g~9&AA+HsogSkDaClSxw**B!s);dg?U}WpCT49HX&r$H;MFYJQynZl0 zzy16U7s041;(HST4|x{8ix8m~w)=}3ZA|?!feNGIxF0)MmUaO|5u@qVnP(mj_{kKj?AMNiX=n#$MdB?k{{B zTeYu)`B=ubG@F}YVdmuBnz5l1ajhJg6Jf1@YDXPunc+M#E1G|N)Elzp+<(F({zx`+qqZn}nW;kuX1X6h{9(9pm z>iuwlnJL;1?<@2-4&%&-)(QB)2U<#$Q3|3AeBT{N@jyw5g-WC=I2p<@V+n}-@-uA` zq#F}CkAYxTZMyX;aw2m50}U=T*qd!So3wb8ADy0A%uL%OsS_WtUf^P!77D_8k`=g0VNn z3Usbt% z{q=m!#1<}p*k9uIvFcam+S8#{PFp_VC}2$FSC z_>oI7;x}ukN^_gIFj}+4+MC$CHHR@vzC2OTlKHI4Qh!!~ee93i z%wDsL6djU`(KvPS&A%Q?nGZn#EoI0$nti~MSl|YXzVf8TfR!B9GUgwsd-qQ923G?< zNgn=2R)aLGSy(~y#XpTZ{1KgtJJF58_G}yo9hoh|zV`a#M85aqqRfqmkN;d}Yc0|phIEpe z%Bz3eipV7&n9-1y`1ZqqAhm5JaJ+SX_;S}Wti{eHf!RK`7refOC#^Ov;?l+S{*9Z5 z;Nb)C4S8R6jAYcTFb#4=QduXZq;QhPI)EyJ2~2j7?+c>+6(62y z_sOwd#N+D32n>P72RDRaVfTm6OK0=gL{lDDF5#GTYXb9inFkl$S(Z$Xs=i@40@$iJFs4dmeW9z>$w9deamE}U`{{`eVr1b{mP2U{Q6(U*C%btDTxgsuYR$08 zqnly#eVYHFi=`Ja(l7l9Go=_20AMND!Bx1=HVyAv?L3wjd@$Sys#tmVX5pMICLnt< z^MMnx$O-A^O#=vlA^$xvx4iz$%m>$Iwilu8f>&Yx8d0S zu@zYg54R~LkF1N`bk^7M9Z!?^8U7+_i{ChNM{U=i{p;%TcowcO_k{@l4f>|jz){S& zUOD|HYUbincp*{s-0@qpOtR>K0d$9OTiOVYuE|_c zoYF5nNl>u385+O1LfLBOLtzC{3kH}KNhB#&pu(h^xzAz!GyekAOttzHN$W-_j>5%Ci%ybG(0-iC_PB8+# zpfBz8FmaidSht+Rz!2~uDw_S&PX`Ua97(|>b@A|xMShpmszrKIjDNP<3Nd4}j#fMy~G&!ylEwr5l_ea8=k7FQ+ ztF9HJHvg&Or0EG{?I*i>rqIgBcGWEaYbBkQk|gRm!m^xvO387!xWVa@ zIG{ute@VjKKo8;dx67DT(E64A3Z6_5GjR^CTk;xNs_anep4RwZ>WV6|bzx6u(rBQy zWSk#z&eUkmRZTmj23KoJY;n`y{>PXk>-O&0>m=<^sBF_F5sAZjI=JVeE60-17~T@T z$00m`Qd&=g2AYYDcRHbK2ULcXMyL-wUFdiId!eE)u3c@;hU+=yzIeK7+z%x!Kbq}p zFKzVWshen}GR5wG5CKb~ZMz5$NYTG(GYva1bwC00E6lAg~3 z-9A}Fu)xZOWn|HD2pcgw$pAgeKacz&MaQf*#F_B!;T5ihFsLnp1;2Yf`+nzznut^k zO0j@fTX3;Q;`|}hH7X17$~!Tvu1F(3hs*Sdy;&R;>a`q-#2G;`L4?p&jE!54IowN( z^a{;D?;K_2)mwU@wwuaeMJWXaP5! zqF`l;Yr(?hZ~w+V9thBatc7rBl?t$mk+KZ9Q-ov!SZ0ugTEK+Fe}}Bu9wJE+pYy84 z{1RCWVzOp1&9s+5t3gOz>rxM|tD4gSsMWp@iO<=ttaUze)F*gu*a`C~6PPbU9Q2=1 zWUa)l^GP^JZHbUF9Ddti*{Vi#&7^GWoFvg8pKvB_^Iev2W2G77Y=eU5R!Fcvj($#O zMU(F^(XdE3UQnHjjVqX;A74#!7WY?i72qx~h9CICAEGb%h*7nud`o#%aWD_*qNtk2 zpgj?3CcY?{q^q3(z7xijAroYI@T$hdWHiXo$4EYnx_=vN=xo%=T ziW1NmG1$sxjyC{^+weg{5F^4aNk0kCR^#92|8@>3jccSBBRYxR15?-3o4bjpnS5dh zj54d6MdNqn${x5kesPi{DC1`Hl5uEh7;wi(rieRQ%_OgqSlj{t8+i6dBG4MyqeLUK zqn`@H7OXJIv4&P0ykRp=I7SdIXldIHRea;%sKc-2_HsN(V(mf$uPm$XYo zW=s|rYFYImw=XQq9cjtU43T-7+VY2sDl5=8*#fz*{4l_(q|a6lNTt;^fzlC!H&T=y z7Le+FSR3+uOip~HE!Ar|TDS@V6nt~m{XszXpXr%=1 zk*fJm&rSHLM_iFRwrWyUZ|^MFPFpVI)GL3Jwwt0O)H(`nd4`J?nJ7f5K6xdmgRUlv zG_I_enj zilf~WjpZxqFFl(#G61JRM>2+Mxx=D43Qw}udCCG?^)sBE)N5HfkGD`aS6rnoLGvf4 zc>JN7OPq=g<^!BG8oA>>n>=;!_&A%Y85pCV?R)~tJ z?HkUXZo@9?=-QEzda6xTPGnToX_CX zDG3-9GEnU5YleAP$>relg4#)F82D((1rqTBr-wmc<-(g{5Z?mWBF=xLHHSlD2#8>W z_YaaY#_33X18kyW?6IVjOeKPy<}2DmN4-B#-v;g7Z30nF0D=&bid9klLiIgZYLx3P zg?@Z!CPS7iiy^oe_F)Bc6bOL~b}kKbW2&bknPiS(#5A@LN3mqFQJq)l(g_rPQWvb! z13}^b-gP^TKm`0I3ki{PkG*rLSKBku5IX<9Zqi<3UGW9$91>ljlgh&b<6x6VXN~TWgc$VT1!$b^H3OY85 zX-@)DdVM z+t%gSFs_Vc8ym|mz@<$q6~zEyt&fo(25KlD$QXqR(<&u60oqXvcq>i8Lh6O&N#FPH z!ls|4E!~VnVRLrh|KVngL4_F&98|Vaia~=Bw!-ID3S}MGO=6>pIx7g8Evr(eo+}+l zqtMTHEYdaNJ?WtpGR@Sq%3cW(A{FFN=ACX>NWRG-7xq!+60i)7!t(L#pm(Q$;Z>u- zy$v0i9V|xYcP}$}-cD3*#!nh;t2{~Guv$^7ePYxT8tYopv0j?&EVxA7V;BsD}u^AKU?fA}qtm9++u|EEb zl@lr5SVto)*rvvltlpK4gYcqxmL7%aBoXd4IH3s-txXARYQdqW@S_3Pt*G3uJct`| zUC(UUZRb1pdVB@jw{|)e|E_x|&93b{-m(+UED+wJIT@wmLq=4~WF-HUSNZW7Rlni@ zNlJ+c!zV*s@R^__(vYX5od~h8KI@jT&gv27ixbzMVmE3mH_jSCQqA!b?!8QH_8Q6- zL%IKV+qt*nkb`Vl0sEYTPN@oVltQFN@c6WXcixQTK2B8@J&ZxJP{k41Of0a9_oU?l z7AY?&Qq^#YY@w*&y6})}EJchnf*#tI>;X@W&%^$yDCxu%D6*AaS9;3t0>>vi^GGsi zr!tKgPkwQA*N~g`_womiYA&j%H_}8S{si+A-ck8jfZ;wXlKhMg-Mkr`74})mvg;4V zR%T6`_JouYCT(tA@om6xtB1!woGyQoySaPyS$}MDdrpIf;W}mB=6oMq`8Hiwqkn-# zs^Nf)H;di|>xmNpD4A!N`B2I-ttB}TEZo?(HVQSOfiGZ3NdH*nGFF8$ze{e^Mvq-9 zK-pN@mvwH&b#T*;k*iifR$Q|=zb7VH_C>#ShDSzYsLSgzR1V-jNWoD2l&cB0x zqW2r%I<9k=-Zy7~ZbZ|%T%|D3@;gQ|aP7h9$8YU{V zyRN&ychfX*bC#z|zCIxTKr15}f=CkWX+$acMrl_4dAWAS*AcQIgaM}X_3y#Zz|*VZ z&cbF%yg$+)g-RnAQ;jtenni5?V4R1w5xZ}Rc^UIb@hiC4yk0g)+E009-N0RPy2pxxdd8O^|5EvS3xgt2lzqU)+3AAP>mWB9($ z_ZZi%l-5bRE@{DcEjgfM9IM5QvDSW=cYC!qdwES+*<;%Y31KW*t|h#bmi91XYsuhK z+p)XF)!E%Z{zp^p0PN!EPK3Bak&&>zMN23e!qYUk!Tv#=CsxaD{tEU1GPOVA*nOdK z`nFlIz3+>-&e=aa6xh{r9=)2-nSRe?Wa$|dmJhb^Bh#QRfABA(yOWb!6dj2SZ5f&< zW2RV2;G~)-cK9M`y5_|UL6*+GN@QUJk)Bama{e*6@ux>tA?yk4yRVRRQ9UL}c}@x5 z+V~PHdEF9%?Qq3u-XsMt>vrnDqc~}*PWxdqzL>)#>3*G_m~%xjAG{PCc2v`kzK1qa zm>K9(P|x>7#GsgmB(sT%#;k3<@5-R4U&lf69QM*!r|(;XXh?FNG!)$SY~)$vU9~_e zxSUGLw@i-Y>RKoN#8B>Ru_SN@;2l%8>YEYCYv0wS<0UpMMx2o?VN$iX?| zs7;QmO-5tO3lg~daXxlDoLE1f7%xq;`1@b=9UsD_-9DdG_%~NUraG-RUJs&wAsT3k zgsBOtrN)Ul!;kH2OQRlWzJT?Wg(5{_nHx$Ux(LWhTBteCpDC5WLyyZqUcXvau0`kwl-q(zH-O<)Imt0@LKpU3-epp1hHLD`DaDmP&|bGijv64Na1`^j+n@gz+Oy zQAB|aHi|$?#CY@QcyoEF5OE)}fqts*3IFuD%!pq1ey@dVZi)$Y^y_e^BohX?iVJp- z>vCVvaF>LVBvEE$X~H?FAwZ6uYzWIp3PVghDKi;1_!n6pKLrmHQ&EvML0&QmNWfU2 zv1y$?UQn<>ie+xy{gqoUilL@w`{Ls(>$Q~nn04jU9dAQ0XdvYrFM3e*P&05;TX;|L z5!+Y&c3L+Nxmt@-cse))@YgJspJ*!j0TSjLwlcJV67i9`7NQB_iQsQ3k`UoXact7Y zp!|IRh}cJuzDQ;4D|B@xG?n}d(e6n*xqZ*FZZ<1pd;xccsp!R8fVFTOg{fIa{J*L@ z7q}X>_y6w{;#86%DmJ1xNjC{mR0`cFMA${8(q(rexrGo?NeFf19y;V&NQx*Fkwa93 z5OPFODE@2qUdvg{Z$IC2+B*NQ?|L+15~FWJOON|}_(O1w{WY@cnl!hBj-x0L6PO*pfp>gEo7 zFnhY!rYr4oZ6}W8o_G?PRuDdYOjp@*%PCuLREK3c+^LZ+?Xh-TdCalS(UwgOXG*et zujJlcXIE8q^q^)wS_^w6K3eaNlUzjs7Q&D6CxRk{5Ea-PP_Rrj$NU?^4@t?ckFM z9pgG))5qS z)};2;lQzpw*{aToiPDPKNOkhldwV8yan1(qY}e;e+S!vMf*#M_dCFkVFD~}avU^xR zSy|+A#CdJf>JT6ILfI>Mou}PdSKEK?9Hpx{POAeArfxC5yD_&UJ962#riW*X<+ang zaW7p->(=14OWWt@^xt-OU0M)se`L-KAN5XKBwXLF+f*B->##Z{W{`rB;;62j1`J-k zN8>@q1)4qsybf(#Q(R>C%5m{d`y&!XH*f7K8_=mX>Va?ai(gi}_$BYgR?WniK|6DA z=kb@()}cj#UIM*enEdj%Pqz-{0c!4#S9QGEXV>lA zxTK-nN6tzEzC_+H=bbmMzA~tOarDg|r!U-mk)iHoVs5r^nz!|;s|9`3SNR9cQ5g|? zK>lKk(dggS?OD9)X6JbMf=(Ha;!FE3vF3H#Z@GTLm3fQ$&RX+j*0E33Gha0nn%|mm zWW}Br%Qb_u+cvpH)|M@uF_^>o<2A=LB@ve|1|A&wWO`Xh%3h1C-zOJjE6eq`{BdaX;Z%)FPG@$=oU$)bKjL)vbMFscCwt1I zO%9ZrYBNQqGty_?lvS;Ae z(AgIzj#4nV5Lz&}bIjwFhqq|tN|`KpxT`iL-)FS%?s*$o(MAE5)*XFW4#D4V7s#w7cE%sKYOAMICVdWOV56v$MI8*%yaAeQCID z@anrh-1rxR)zAE)G+g7N13%fG7kXvEy5;qj+f#lyblU5z!_Z+}+Wa!tvPa;G4xVQn z>@~VR?5X~4o>x0pjlj*5*B?ExJ#4kr_HqT;7OV_6v zkt*Kp9MYw`&yG94dW8PluAzex=H4^(s#C1~F!fZm&)tj1woSPCJoTBn^7V1bv&y>q z8o2I}3#=Ksr{?XYnjDAjo!+nW(s=m1&nBNUE1uq&nIGWY*7V6Q}PF?l*tE*(N zRd}DAdM&QCN{896_{Q5ZZ^YZ=ck>J0F=eLT^`^@@@{gT0+^c$*`p75GFC3d+*unmM zoMK%MvnRKeJZ7XfX|8j#;IETj;@mE_I%DR*hW0XNX396@#=UMF78kMkTgsrJO*5R2 zjF`EFD2Wp?9U#x33qbudaLhN36=*E|OA?i!?Ti9lb30A zee~x**)thA!)I$;(w?%uTi!>WRmID0dpG{x`|at+JBjq`-f8;azYd8({!J3~FyzF&;-{cpS8 zcy~>Z|Lu?2=L^0{SX-^_(M2J1ZHITN;l0Cly~#Fccj-m(qC;&fx^wg4lTfFIN&zPD>=g7zo?`^Ma zD=tzGzcg{;;B%`Iv);%p@6_YQ`_c{NL+7ksY@=UewXRq(vGmTpolBplFP*GZeJr@cpukb^0RqcxW-6Z{9e~vVlZs&L)-TSxtZM}?a52trlHL>qdV50wc zdenJsMsNvND~uuClAjWw+S@HLkKb zdl#SQ=xLT|jOx=gXzJj89j(X2YX{spAypG@eqk5Ve=iYd*etcxftglmk-TX~r*tA8uvpdvQb&`=w z8@SsouI`#dWNKtnj_vPrHdRS;=j`+~nKI|-=GE_fEk^cQSa>|XOTTyTw2n-v^?p2c z;f9!%$xj#kYTW*F-phlt+{T8~7p5qy>fcav9i@K&HiC%ZGzar+(^8=s#!&@*7R ze1eDU*k#gh7pZ-jt^8$Uzb6*g96oi?O;x}CX6*X9g&IB`ch8FE^nDZLKhZ2{rtjRI z^{2Vg^Rqe9XLSB_e9Ze-G~(|?{TeZ9`V*96|N5BD}U zrXC%6j<@X4(WX%WedCm@*WV0we53Ai;>4CTqXYQ^rutUJtIUWx&npRPAA0S&R2#dt z+f3~bYIczx`0T{^-m}7vO_>v)Ud`X@6sOzy$X#TAgJp4jW`Ia2{Y>ZL_V{lA?`SP8kHy^yeeBX81?VU1T zXJ^@ngaPz$&H(;$9}kXLc?|8+mADuCVp4vA+^4J|zt%>axOQUt?=sC3@eg4bORSb7}owKiOSsgGHu#K?tHQ_MT=dHCzwJjv^Qb={1@qz9>cPkWwQ zpjtZ*@>ymL z$w|Ez%PUnz?OT;RWP8Yl%p!he-Xwc1sorLa(K;1g(l7WjJGb**BEIj;2=7U9C^`)J|m#*TeD#Z8<&wCf`9Kp4Wkyu{3OLoAd&cO}`+ZC6Z zdw8a+BpABfdNplIg5h3i?_ops%f8z_DCWw^ja9$gDQrK)@ZHsk8w#_$ZQ^;e7m7KyLU;^TC~EfBiVAB-f#X zwn4zi!g6!XJsa&D=5;-k5b8eqcjqIT^IX+dnjiJocAhsy_ik$Jh9TF_bgu3)O{?3O zQvF2=?c$y{&d)4ca&a;L(G^#9tKzQ)fo4nE^D~Vf4x3@A^=R9)Hnwmj8*<829n19l|#$xU+(QP2M9VWs-6`y%gaex^@8A5#kqvYO%iydOu~aoEds z7uqPixM?u2SK6()J7z}t)V)i5;3Mb!t*7~nG>L_hozuCqR3eQAs5C@w&T81{y|2E$ zb3oPfk#5h6gLiMZINqe{imgPzQi;gg_S_>?ic-1m+f2hVb}J<7WgUv}9$nSAe9eTi zv2S1OS&^~Qy7Ew#ROIz@<8$7{PrfWIKNbCeROGKa#^>~Kn=BdFr80R^N;{<0Fil)G z?AW$%a;9aAeCoW$sfPGud|99&`y|D{GH8Oz64m*?HF&F>SL82Rxnb#On+K%>eCw2Z zPh5S_Na>hUX2Od?UiyHwujgMVxAI@E@T^^Dqgu-{|Ls}<h_jBj=&b3#Vv2V0p|9K9bPAz(;m045R zO(nWi<-Iib$RBUMa00%`ji@iJZoFyo?L+;y&u=cA%8a`&eZpn_+OWW#Rf>VKbp{=lgtz_p;@~Bd1NYrK zr8++jSK zIT|#f!T%=j?x&TTFC3|L=Dtk3@zCwn1J}mv;q{};4&60rs(rL{$GO_0JWKV4V23L^ zPG27BqF8W9>C3di7Ui!zw(=fw?rLb(FKcY5*k4-tjhFFZO2s#uZDZw5ZmIfwW|OW$ zLC4>s;?h!Faysr-%01-Td+E2W&WaZI-y?VDrazWhV+5u;4 zUevdLn;f1#H2$%JT2buQ1*X9uqpRK)wZH33bpQycvT|Z-c z{fzyKu}L_g@$6@bBXwRS_O&ZUc z8N&_mdT%^uW{lPvuOI1v7Dmg-uWU0*P6sn4pATLRXoD8a$0zh90=o?J)=RGXQ-sppOFddjWkWpuYp?t1+J^Y=1#$ zn0A(8)=^^ntBN^WZvg160lgcbUkvEu0evc<&js{lfW8jU%SZ~_f$gt4=Ir({2K0`A z-Uo9Trhaqbt|93Em@~T#NrL_l=uZOr0zh90=o6uw*45VI>gnhh=?v8|;A&2CcjH>Q z1`GKh1ET>Pj&{&O-(c6-XdWCWoO_a`KzAQk?O^xNV2-w%Yp^Rvdv;Ke@bmy}@o){E z?d8YO7Kr8q`v(T0Wq(G;l%%U|)O=MJEB)q=bqyZl<>%l&*Vo-ISXi8|zZ<&0o9E?* zMed$Fk3d&ncaHWPe_w%2+s{APT^so{2>oYu&OF*(ahJI$BQt#CCj4HEDPG#>_#Fur z`>jToWD?vMq#I0nHdfj`_-1-5+y2WMInS++o#AHn8i`LK~Pe+4(DQ zejPj}ll)w=fakhs&e+A0aY+8~?m|CEHex?nXWU~H?D}2&g!$Nd_PQy`%k~K&PO`kv zBGOJw`!O*h`3bj1jHyWvO?2|a^@l6b^soHHD`&J>RQ{@!LPhuH*#%k#k^KI6neD%H zgit~3B*hT>$+|aM5aoY{2>&}GEy+*H4M&Wri5)*KuHPJOqZ5~2LCf)I5|!T*=MQJ? z7^HquObl8OWq%R2-v#Gq;wOtHlAqZB8)Bp_(c<@1wN6O2zZ2mCTYCUuHe-}s*NdY6 zSKAqB*|DSZr!y;5VwcLSv)i7L7cnRMC}npN%s5@pEE!E~|CzC0RQ<8Z!u<9Fk$_6< z`l(IL78#us=iRqv5U)szY%#_TQI;@VRD>Ch4|r51#)9Sl%NZ;w*n=rCTC_w{8)Oe3 zk5|3%W@d0_I7y`4A`y^4_K)k=cEjG z>D!ziCi;HO`DdcnY|h6a0|e#ceoyR4z}#g}bNvgVAKaW9qH#xH4?ZxEa#mp;ZXnc? zh+f4!*|0g6Z7(b*+o(C8heayKg&AA%t7>u2!qnmSA%w5cy^L?0y zTQuh{F;BK^&Icg}newy8HRl1CS4?QmPhrlnZO%Vo&b4pON1}t0!2U_i`Etx%@P3W5 z3({r8nf;fG*G=*KHs&rY{s{AA7O%#K)K$?(Wg!O8o{ewxUxdf>m zaYi4G_g|t9LhAxMS^5u%GkSTvpAvllI=C@7_D@6cVDuR*eJSFC^6~ga^iz8X^=T~qeZ&Rj<8e=L zO#zuU2E~slKZT_~hB%{-H4xfQ^gYmpM@FB_(yv9F(Wl{YlIY)KeF97GswUJI;qjB` zuOZHqk57}n_qSoF-w6E0<0{ebMx4<{vdZt=OQ@H}<1NvLA7qMyVS z>O)!j9K;!W>_<0mpMkxFdOl0P4RJ=#$K$r(J3q?WrjJnX$Wu-=8G zABXxqqff)*JkcLVoblI*rSGjS)EA8t+E4WB5NGuEEd3{}uffL!qMwcSA4YG*(%(Xy zu}2;sFNl74f1%!#r9X%`qu0U55u#Vn5bBLs`sIi-dV73)A^KNXufx(aUP^M%K|n;3 zq|O8B&60v|CNvQ)OLEX)$x71%;{q8S|JpbU^OV1kU}+BKq@BrTPs|&z{e=5so{YzB z!UHk4k|abpUx0ZHi!a7J0UZM9v>bEOwlw8%R%2ep;_ESwMdwF!+K9PPdzx}MTQRR- z@g&S6rJL*bV6H<>5b)-Mn3u5lQOv_xAwGqC6gg0Ss zgpSX2k{rOSNpp%=yaVQ*%okTGb;4Zm4=4@n|R`hea7;2h|w zkU{#Qo(j=so)qzL*VRT(hj4@!U(K0|vOG&`yD_7!dAxBv9uN4>CvUo*9@@%0nBo@V z>${N0cSV<1d4cX8_VYQs>HlG?#Sk7NL!x22JoHqH7gG#l$t3r=UO~a`funtVTHzn3 z$O_)~i@>OwKvbbVst_5>RAzymiwRyxsx));@tKV-0^19WZ`CJbP+Rq}z#Ldy72 zJASl1nEb*?%r>P460!RSRfgE@fy?-z_Mpo65j%u7{Ife3MfOu<2Pxx6?fB95VDby6 zzwaM^-8qQvO#EvF-G=wqJ$Y!$Un|TFdeeuYvr+uab5L-|>{fohS@0yZuP~8^pMB9I zDCofyRQG?catnPPPjDvPLhl9r;U6DxTG{(nt6UuKzxU+-hWD*nxwzi9TJyKG=YJmn zTBGS(>b~U zYUSd3-)hbOkN11_7oe6M?@5!Bxv=S5>b&5 z|35W}>zANm|MS(tR`N@{n#A$z`|k3u`z3x&;`$|CD>A!WOZuU3(D~bIYa)6^OZ^hI z;6J)5DWbNP`1NBoiR;(DA1~sUcr}URm#{{0dPhs+C4Nof`Xyc~isP4fHMPhu_E*Ng z9}mens$kxNO{rdOgMf?)4CUN`{zt^{aKwc zTfg+NA%1g$z5M-Hl}>R*_dY`RaWd99xCe*uM+eRg5j+>q5NlUdlqb6PU6`2L$C-Ol z$$hEhHr@`1?Lt%IH(~0HmYKxp7odr7a|WWs@GvwH?#@7zNcpPx{&gQA{?64gkH?&` zV-K20IXfu*B8uO~oUN~*^ouDy^BgCW*!q5$Gj=XP6RCF&p!cTqODR2hUWQ%%QA)pz z(!T-9ucP$glwOZ95G8i`!!T!N%h5#aUjXQrQu-B?J`>PiqVy{%y#($+YkA9Hn1P>5BmUJxafZ(kn5Bqr|pf6?4Y^NHmf5oDS$+Dg9bXzZ1~! zr}XP6eI=lOP3hND`T_X)3uf~kitAv`*dIl4KR_Qs={HdN6M+6KrH`ic4S@a|rH`TX zBk^-0Z2QeIXY7xq^eX`UT1vl>(&qvC8tzyK??maD zF^%+}Er33W(r>2prGWkkrQbs7xzcC_CAK{pm^1coMH8{d9nkww`UFaU2+$v+^xG)? zTR{Jj(r>5qO%!K*K#vIu5|=p;p~SGgXd?D&V9xl(93u$VrZ|&l2AY^O5h%xr(kCJv zDThaKrW|G)lU4xbtfllxNJq+vr8rYg8B+*KCxCL!Qu^PJj+B#wIWyZw@k*eaULBDL zC8o~(Xd>m90^AAUe1Jy+JQ?7b051V}9l+)B^I2@av;b}ea5sR51AHgIvjAQW@J4{E z;OEHLehmk>1LjQsXY!~s3n)2J`U7a4#QiA6>Hbgz@G5|lMVHUXRUf0qzU%D1aXZ_%(n(#hmm9^4oz`l%DPnCioFK zw*4*u4+VHEz*7KDe&>c=ZyBI(0JtK4*p6L}4#2Ge?g{V+fF}d|0>B>vyb0i{_&H6s zUn2lM9pFKjlm4KH$A?f#PxptL6sO0B8h}gV=RnzZasf`BCuQsH0lg2vBLTh_;Mo8# z19$_#74a8iw*5K)w*$Bzz&8Lq72wwa{tDod__uL0f>{Z5Ric=g4c^oK@#9ZHkZ)BRx?#p(W#4Dbwq7XiEq;2iY3 zHKOcS1-KEw9Rbb0G&06&2_=?~<(WCo?D z`$H#H(RS(p+zQ|>01pLtEWlF%o)7R!fOF9A4~c4jRe&1-+!5e>fX4v*2*3*gUJdYH z(C;LP@@pW#%`hkZf&7-a6{V;9LlVX5@gW=FB>=AhxHS5GCQwx0`dV}LsWJQ(1y08a&YKENvh&KW4$FI9jW0emXJ=L38*z)u3a z5OdNWa&dpSP3h_Wp!Tb1zf1w{1aLmUBLSWa@JxW00K5+1@>-(((gL^@z})~I4)8>P zX8`;jz&`?9X^?2Y3;=F}Iq46XWPG6Xbbm;qI6V$z1H1&_H2{~^7HvNl;Kl%V0(daM zV*#ED@O*$*0-Q5gv|p+KHv;%nfX@f`W`Lgrcp>JbKb$1v1Er_?gPM+Lzf1w{1aLmU zBLSWa@JxW00K5+1^17n^(gL^@z})~I4)8>PX8`;jz&`?9Nl&z21^~Cgob-oOGCojx zx<4dQyxI5w@DhO609;yMwEaB+J{;gv03HbNXn-FA_!WRZ0r*#d_b?FcFZzWhL6SnB zbURN5xF6;uj(c$&gD5>6$MY1Y%dY@<1Hk2nh_+J`;G+OO9pLi;9tZFv0KW$C=KybG zDB7={03QzUDF6=vcr@lDj)^#qnu^m@K}JS06Yue4*>oV;0mKe+pi7q@c{P#_zHmk2Jo{0zYFjW0GBfn`bFZn7{{>- z=FIsL9mkmzr^nTBfF}Sv4dD3ze+cl;0PkulI$nAJw*|O2z*hr&H^8$1{s7<~0j@Nf zBS`G?SZ&Nn9D{Kj^(j3a$0Zb}IM&UW7S` zqcx7>AC#Vsqnfp7zf1w{1aLmUBLSWa@JxW00K5+1@;0LV(gL^@z})~I4)8>PX8`;j zz-urkaWuzq{6OjHIF7Ux?XL^KLjfKO@DzY&1H2624FFfP6K%f^z^wu93GfJjCj&ea z;3WXB2e`bw&@U24V;n~%%%#v~I*v0bPLHeM08apT8o=`bUIFk%fGbZF9WMib+XLJO z;E@2|3-D}!mjS#1;N2#P_DctI5=R3Z$03xSj^h%F({W4&cm}|W0A2-fj)Q3XRRL}U za7Tdi0UibL6oBUfyaM2j09SSt?e9>4+hb1RsDQnvE#YZhg?blOIpqD>p)79NU8s*`aq|9b^7}BP{2~>hKAWW{@6Rq_ zahIM#eGQ9~_h(Bp1*1gDuiy$+Ib0U6!Q6<&UHS_3_AJiF+>^yC`U&;nEM9|o0*kw7 z3iW9$&c{5T#VZC1^%X2$gLwmsyWky2UQ(Ek+00CQ%r#lOVz5wW%HlPcJF&Qno>0$c zaX#jeEFNwk)F-nzd4F~Wiznm%kCNZ9A@-B^XIHWG*<{CMP6SXQdh-5kWfrf%{|C}x z@fysnSe%2e*SoMdd4F~&i*xby;#d|Z@6S$QansR4yR%uGyg$2y#a+yV`WhA|@6VRT zgB@vya0{WH%i`qy*+wj$Y$?>+vp9KwwkL~c> 5)) +#define taskIndex0 (blockIdx.x*4 + (threadIdx.x >> 5)) +#define taskIndex1 (blockIdx.y) +#define taskIndex2 (blockIdx.z) +#define taskCount0 (gridDim.x*4) +#define taskCount1 (gridDim.y) +#define taskCount2 (gridDim.z) __device__ static void stencil_step( int x0, int x1, @@ -48,15 +53,71 @@ stencil_step( int x0, int x1, } -extern "C" +#define SPANX 32 +#define SPANY 8 +#define SPANZ 8 + __global__ void stencil_step_task( int x0, int x1, int y0, int y1, - int z0, + int z0, int z1, int Nx, int Ny, int Nz, const double coef[4], const double vsq[], const double Ain[], double Aout[]) { - stencil_step(x0, x1, y0, y1, z0+taskIndex, z0+taskIndex+1, - Nx, Ny, Nz, coef, vsq, Ain, Aout); + if (taskIndex0 >= taskCount0 || + taskIndex1 >= taskCount1 || + taskIndex2 >= taskCount2) + return; + + const int xfirst = x0 + taskIndex0 * SPANX; + const int xlast = min(x1, xfirst + SPANX); + + const int yfirst = y0 + taskIndex1 * SPANY; + const int ylast = min(y1, yfirst + SPANY); + + const int zfirst = z0 + taskIndex2 * SPANZ; + const int zlast = min(z1, zfirst + SPANZ); + + stencil_step(xfirst,xlast, yfirst,ylast, zfirst,zlast, + Nx, Ny, Nz, coef, vsq, Ain, Aout); } + + +extern "C" +__global__ void +loop_stencil_ispc_tasks( int t0, int t1, + int x0, int x1, + int y0, int y1, + int z0, int z1, + int Nx, int Ny, int Nz, + const double coef[4], + const double vsq[], + double Aeven[], double Aodd[]) +{ +#define NB(x,n) (((x)+(n)-1)/(n)) + + dim3 grid((NB(x1-x0,SPANX)-1)/4+1, NB(y1-y0,SPANY), NB(z1-z0,SPANZ)); + + for ( int t = t0; t < t1; ++t) + { + // Parallelize across cores as well: each task will work on a slice + // of 1 in the z extent of the volume. + if ((t & 1) == 0) + { + if (programIndex == 0) + stencil_step_task<<>>(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, + coef, vsq, Aeven, Aodd); + } + else + { + if (programIndex == 0) + stencil_step_task<<>>(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, + coef, vsq, Aodd, Aeven); + } + + // We need to wait for all of the launched tasks to finish before + // starting the next iteration + cudaDeviceSynchronize(); + } +} diff --git a/examples_cuda/stencil/stencilX.ispc b/examples_cuda/stencil/stencilX.ispc deleted file mode 100644 index 36d9d521..00000000 --- a/examples_cuda/stencil/stencilX.ispc +++ /dev/null @@ -1,159 +0,0 @@ -/* - Copyright (c) 2010-2011, Intel Corporation - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of Intel Corporation nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS - IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER - OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*/ - -static inline void -stencil_step(uniform int x0, uniform int x1, - uniform int y0, uniform int y1, - uniform int z0, uniform int z1, - uniform int Nx, uniform int Ny, uniform int Nz, - uniform const double coef[4], uniform const double vsq[], - uniform const double Ain[], uniform double Aout[]) { - const uniform int Nxy = Nx * Ny; - -#if 0 -#define VER1 -#endif - -#ifdef VER1 - const uniform int x1o = 1; - const uniform int x2o = 2; - const uniform int x3o = 3; - const uniform int y1o = Nx; - const uniform int y2o = Nx*2; - const uniform int y3o = Nx*3; - const uniform int z1o = Nxy; - const uniform int z2o = Nxy*2; - const uniform int z3o = Nxy*3; -#endif - foreach (z = z0 ... z1, y = y0 ... y1, x = x0 ... x1) - { - const int index= (z * Nxy) + (y * Nx) + x; - -#ifndef VER1 -#define A_cur(x, y, z) Ain[index + (x) + ((y) * Nx) + ((z) * Nxy)] -#define A_next(x, y, z) Aout[index + (x) + ((y) * Nx) + ((z) * Nxy)] - double div = coef[0] * A_cur(0, 0, 0) + - coef[1] * (A_cur(+1, 0, 0) + A_cur(-1, 0, 0) + - A_cur(0, +1, 0) + A_cur(0, -1, 0) + - A_cur(0, 0, +1) + A_cur(0, 0, -1)) + - coef[2] * (A_cur(+2, 0, 0) + A_cur(-2, 0, 0) + - A_cur(0, +2, 0) + A_cur(0, -2, 0) + - A_cur(0, 0, +2) + A_cur(0, 0, -2)) + - coef[3] * (A_cur(+3, 0, 0) + A_cur(-3, 0, 0) + - A_cur(0, +3, 0) + A_cur(0, -3, 0) + - A_cur(0, 0, +3) + A_cur(0, 0, -3)); - -#else - -#define A_cur(x, y, z) Ain [index + (x) + (y) + (z)] -#define A_next(x, y, z) Aout[index + (x) + (y) + (z)] - double div = coef[0] * A_cur(0, 0, 0) + - coef[1] * (A_cur(+x1o, 0, 0) + A_cur(-x1o, 0, 0) + - A_cur(0, +y1o, 0) + A_cur(0, -y1o, 0) + - A_cur(0, 0, +z1o) + A_cur(0, 0, -z1o)) + - coef[2] * (A_cur(+x2o, 0, 0) + A_cur(-x2o, 0, 0) + - A_cur(0, +y2o, 0) + A_cur(0, -y2o, 0) + - A_cur(0, 0, +z2o) + A_cur(0, 0, -z2o)) + - coef[3] * (A_cur(+x3o, 0, 0) + A_cur(-x3o, 0, 0) + - A_cur(0, +y3o, 0) + A_cur(0, -y3o, 0) + - A_cur(0, 0, +z3o) + A_cur(0, 0, -z3o)); - -#endif - - A_next(0, 0, 0) = 2.0d0 * A_cur(0, 0, 0) - A_next(0, 0, 0) + - vsq[index] * div; - } -} - -#define SPANX 32 -#define SPANY 8 -#define SPANZ 8 - -static task void -stencil_step_task(uniform int x0, uniform int x1, - uniform int y0, uniform int y1, - uniform int z0, uniform int z1, - uniform int Nx, uniform int Ny, uniform int Nz, - uniform const double coef[4], uniform const double vsq[], - uniform const double Ain[], uniform double Aout[]) { - if (taskIndex0 >= taskCount0 || - taskIndex1 >= taskCount1 || - taskIndex2 >= taskCount2) - return; - - const uniform int xfirst = x0 + taskIndex0 * SPANX; - const uniform int xlast = min(x1, xfirst + SPANX); - - const uniform int yfirst = y0 + taskIndex1 * SPANY; - const uniform int ylast = min(y1, yfirst + SPANY); - - const uniform int zfirst = z0 + taskIndex2 * SPANZ; - const uniform int zlast = min(z1, zfirst + SPANZ); - - stencil_step(xfirst,xlast, yfirst,ylast, zfirst,zlast, - Nx, Ny, Nz, coef, vsq, Ain, Aout); -} - - - -export void -loop_stencil_ispc_tasks(uniform int t0, uniform int t1, - uniform int x0, uniform int x1, - uniform int y0, uniform int y1, - uniform int z0, uniform int z1, - uniform int Nx, uniform int Ny, uniform int Nz, - uniform const double coef[4], - uniform const double vsq[], - uniform double Aeven[], uniform double Aodd[]) -{ -#define NB(x,n) (((x)+(n)-1)/(n)) - - for (uniform int t = t0; t < t1; ++t) - { - // Parallelize across cores as well: each task will work on a slice - // of 1 in the z extent of the volume. - if ((t & 1) == 0) - launch[NB(z1-z0,SPANZ)][NB(y1-y0,SPANY)][NB(x1-x0,SPANX)] - stencil_step_task(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, - coef, vsq, Aeven, Aodd); - else - launch[NB(z1-z0,SPANZ)][NB(y1-y0,SPANY)][NB(x1-x0,SPANX)] - stencil_step_task(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, - coef, vsq, Aodd, Aeven); - - // We need to wait for all of the launched tasks to finish before - // starting the next iteration. - sync; - } -} - diff --git a/examples_cuda/stencil/stencilY.ispc b/examples_cuda/stencil/stencilY.ispc deleted file mode 100644 index 72c28ef6..00000000 --- a/examples_cuda/stencil/stencilY.ispc +++ /dev/null @@ -1,126 +0,0 @@ -/* - Copyright (c) 2010-2011, Intel Corporation - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of Intel Corporation nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS - IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER - OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*/ - -static inline void -stencil_step(uniform int x0, uniform int x1, - uniform int y0, uniform int y1, - uniform int z0, uniform int z1, - uniform int Nx, uniform int Ny, uniform int Nz, - uniform const double coef[4], uniform const double vsq[], - uniform const double Ain[], uniform double Aout[]) { - const uniform int Nxy = Nx * Ny; - - foreach (z = z0 ... z1, y = y0 ... y1, x = x0 ... x1) - { - int index = (z * Nxy) + (y * Nx) + x; -#define A_cur(x, y, z) Ain[index + (x) + ((y) * Nx) + ((z) * Nxy)] -#define A_next(x, y, z) Aout[index + (x) + ((y) * Nx) + ((z) * Nxy)] - double div = coef[0] * A_cur(0, 0, 0) + - coef[1] * (A_cur(+1, 0, 0) + A_cur(-1, 0, 0) + - A_cur(0, +1, 0) + A_cur(0, -1, 0) + - A_cur(0, 0, +1) + A_cur(0, 0, -1)) + - coef[2] * (A_cur(+2, 0, 0) + A_cur(-2, 0, 0) + - A_cur(0, +2, 0) + A_cur(0, -2, 0) + - A_cur(0, 0, +2) + A_cur(0, 0, -2)) + - coef[3] * (A_cur(+3, 0, 0) + A_cur(-3, 0, 0) + - A_cur(0, +3, 0) + A_cur(0, -3, 0) + - A_cur(0, 0, +3) + A_cur(0, 0, -3)); - - A_next(0, 0, 0) = 2.0 * A_cur(0, 0, 0) - A_next(0, 0, 0) + - vsq[index] * div; - - } -} - -#define SPANX 32 -#define SPANY 8 -#define SPANZ 8 - -static task void -stencil_step_task(uniform int x0, uniform int x1, - uniform int y0, uniform int y1, - uniform int z0, uniform int z1, - uniform int Nx, uniform int Ny, uniform int Nz, - uniform const double coef[4], uniform const double vsq[], - uniform const double Ain[], uniform double Aout[]) { - if (taskIndex0 >= taskCount0 || - taskIndex1 >= taskCount1 || - taskIndex2 >= taskCount2) - return; - - const uniform int xfirst = x0 + taskIndex0 * SPANX; - const uniform int xlast = min(x1, xfirst + SPANX); - - const uniform int yfirst = y0 + taskIndex1 * SPANY; - const uniform int ylast = min(y1, yfirst + SPANY); - - const uniform int zfirst = z0 + taskIndex2 * SPANZ; - const uniform int zlast = min(z1, zfirst + SPANZ); - - stencil_step(xfirst,xlast, yfirst,ylast, zfirst,zlast, - Nx, Ny, Nz, coef, vsq, Ain, Aout); -} - - - -export void -loop_stencil_ispc_tasks(uniform int t0, uniform int t1, - uniform int x0, uniform int x1, - uniform int y0, uniform int y1, - uniform int z0, uniform int z1, - uniform int Nx, uniform int Ny, uniform int Nz, - uniform const double coef[4], - uniform const double vsq[], - uniform double Aeven[], uniform double Aodd[]) -{ -#define NB(x,n) (((x)+(n)-1)/(n)) - - for (uniform int t = t0; t < t1; ++t) - { - // Parallelize across cores as well: each task will work on a slice - // of 1 in the z extent of the volume. - if ((t & 1) == 0) - launch[NB(z1-z0,SPANZ)][NB(y1-y0,SPANY)][NB(x1-x0,SPANX)] - stencil_step_task(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, - coef, vsq, Aeven, Aodd); - else - launch[NB(z1-z0,SPANZ)][NB(y1-y0,SPANY)][NB(x1-x0,SPANX)] - stencil_step_task(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, - coef, vsq, Aodd, Aeven); - - // We need to wait for all of the launched tasks to finish before - // starting the next iteration. - sync; - } -} - diff --git a/examples_cuda/stencil/stencil_cu b/examples_cuda/stencil/stencil_cu deleted file mode 100755 index 28fe453a9caad27179226202fbe71ab1fc0a3b49..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 25158 zcmch93w&F}mG_nH5Qm551OkTexKIb|5QLqCUm=y5$KXYf~ zD@B5S-}l>{pRdlG|C~89b7tn;x#N51UYBQCj>RI0kRx3u5%>9Qg+a!zB#k;5%drY6 zUs9wCr8&|mK&IhRSPD@iCg>1XEoeTM%LUD&cn$}06eGgv`35HmDr!iS%9{!bj~evo z^#&`bB*_e*d{oz|ih79y?`32wRR|%U2~kX@A67kr$`s;=dQ<2qDMk=fw2#_G*ZI7C z=jSNkh*yYk?NgN50(jj!;TDSbUrSO%l@z*>agrpc$afR+C9BS5zKV!?UfJHZdhwzw z+gq+^Z|m$?f5rNW#aAp|WD5mt3t0UNp@f>ayrzk%qvlXKvlu5yb3vEjIvtmdV>3A= zGM~pQKo{dWm-Cl^UWuy|mlzAw*9&n~;#!1@%BC>gAkyWaCAefwG*A=)#9}<==Oh9res)9~z^X z@cb-xzMA1@NX=I&$T=1KS;`$p{byoU4LH0^xDY&rp)B%qvy^MkVuyc(u1xj+Qx^R< za(w@p3RPn&0GFq@Wb6l7%6%RZnaX`NOS$A{GwD~e=sA`J{|@w!J%^lLNWcf7eiXig zD^q>m%Yt8+rCsC?GwHb?OTXk~!EXRwDE(z}tHTS8Q{kK}<+eg3Q@J;DJ^9PLQ%VrK zIg9+EEcjo!-9}wrNKE-Ax{YNj_ZL}kN0xRi=KUT&OF_=5FguHV$p2<4H_I&XJ$2YlfG7)N-$ zGY|@Q2RCTz7JzpK*0uQq%L8F9Xm4p**49pv)qxIw*9K=es2Y;XdOH0Lfw05pUmI`) zJ6qe<7(({;_MqPop^~70mzz{wE540Lw}+Nmtou%^y-G!1N0y(8GuN!2$htcHV5qc-l&HiFS&qxA-AXhDOi zJ>j-sCqx>;Fi%Ir29(wv>}ixt?&q;p_6PZfkD~Z;)X8 zP`Jf^`Q>E(#gewN^NtSuHiX+l?uKxAxqq#%TMKvl+QK2X-{q(^^tf$mVS(?>1Q)|EkFO+&b%Ef~^J#lRvmptIaX zMV^E^47?${$RF$pGl^h($kj#$=`d`8*c!iI3mF4bYwEnCt+NGVAk>55K!es0YYlb< zI;GZdpuHVtr-1>JxBA-JB^U@c0O()u(=dG3qdz+W9U%Tc@{w~}~Aa2@q070COu!`B9_t!q{wpY2ea@#@!Zf=GMtr2O3p35#wR31{1T*-oT z!VC$LHrDkb2k}NA8N=ZlzlMYbD`AS|U=?TJv?jJl|AdURMwM~ZUAh6w#A(tkTq&hL zZA$MsuMNw`T*;qI&yntdGO@;e@rm29M9hD!) zOE6ns(%M%9asQ%q=QjihD*Q9yWhPwa`lka!wUVL4giqNCc}=*eB4w&D;o{kZaK(gE z*&?VWT&#&HtzjFCrW=;pXR*F%vGHX(&(Jgp21j!pBYc#RTv$VZtvFAW?^xe%Z62cvd1l z-{B>~@VP8z{1iI8EJ9<4o}EM}a(G#k@XJ`t_>mo679qTZ#f+a4hnGbNFJ&>~r_AAH z5yI)YRD=qLmqiH|&%OjG4lfZ_Pw|{gyz1~0VK_Yxi%{qAvMA{h&wT_mJG?}g`~ns; ze!LDZi;%qG7a1$9b$D5XaC$x#q08ZAQNkCqnDMjT;bjrR>A6~jUWb=O38!ay5!Aj9 z^5bv91H=zvy2pQmXLz;mm-%}qulo6wvNSpON&H$bQV=74EoDoLO(M*FoOlcIqm2Iv z@j1i~G5#Uq$+aZ5GyVbM$)zL)82^3Z$(1Df8UH=v$%Q0(8Gk47RNX`uqPF`Wc@? zJh_NOFXK{l8bTR%Y@#GQ`UdF#qJh_5I9pm33{ygFp#=lBDP5nd};|~%~TQmuo z@h=chE+A3J_&vna)K5r^-$^`8`Na4a)c&6nFB3n;_{WK-sh=2S{7;C#g!m!GKSVs; zx)R$N{{Znc^%Db(|32|FTX-X$#@SrT%o1RfC6v$1eh-8u(d+KhE*#V4V89B>(K^!28P#Js&}7-|r^lmx0uOGcwXON0H)wxoH*);Fp@gvl~Nxfn#h}t(%pz813k+W*=4l)i2ezIyeB~6Z5FX|<12XR7vcp?|_ zs=h}Ze*b#aGOFtP)B|7kApcI1RU-$J`QPICdzW5}aW3^}u=G0(zmOFa``yDuM2s4q zhGcd4ueoa9VT)RMC{)~Py~qh&eM1&ipLPVDIC<7{Og=j8O>mIuek*%VbYpP00K=_)^V06%X-ZUV@O$Pw$7e}FV*MEPs2(0|X})zMW_b+jqJ+W%|!@Sk#z z+O6|`SAMX3sM_+YYRe0rTu<@%;U(PON3HXQaJ_QWdfOfd&dl5E9?9DSeGe9-QduCVeyZ3n{?#1;IWjj1#y-0#3CtR_XJWJnRi_0=zdDt5HHY{xK+h=iJ z6E6PT8fgG~%PRY=_Eq-V?Al)R9R)Rdy&9Pe=O+$-Mh1~S770djzpCF_jB(mtd`Q(_ z^yn?cV;(SZj~*@__vm*OPoRTU3|#BR>u_)LM1z<$%L`Tg3%CA>r!=ls?zyAATK~{# zeY(pC zNr|w~<0_xT41lMzZhQlnTOUabgYF)t*?$h|!^e#vO}jCa^Wi(}-!!IKgZkYMCa~-o ziO_9}p1npQbVD;@13VVLmYSwVdZ9bYB&|;{F%Xhbx2mygpQC9KpJhmH`g+OuF{ZQs z>m>&b$*o^6`S&DAZA2Km-p!mQo+fs?pE0nZ>)p(^OVak;s=jL!8q3k|kwb`h^w<~( z2AW5IFpgOKU9@G8{v`wl`4bvcv0b#dh~GvyEpvf?AGkC6FyXGqYt}6X&cNu7?HY$w z%8$k0CP`Z0LedM#=!2AibXxQc_97wvV^n#t(hyk+k->M+cf08UI{q}ZCiWD`(oOYI zk`y=9hrkGJPsu19N@*n&|MsVlT59M!g2Cd9K0vPy zjm5z6PfX>_1Kh_-uvoX;h=r(8c=@s2aTluk9V8DfGfKUKMz5cZ50C#pXk2-$PsD-H zP;n2xlS?cO|4s;kVqjHJ&9x+}5L*X!w+xj?R|&WtK=eUU5`Buss2Yp0!Ly5ssMg_Q zB6TG9Utc2^{|5-D&;emJx{Jz&--{O-D2+0pR0Tp$FgLS{DnQOrKNgQ02o;6S83ox% zwTzDvKKTCTfyx!vl`b2JG@@3ytolziBU(OW{fV_)A5w?klF?VA)PNa#jM+3ehZ%qN zA1{?Lcpe<0G<_F6C_qv|2_up1z@QA%lTztHqc5JP>JK(EAvUb^$QY8MhT&t6Y^Mt9 zyVe_8$VL$FGQ?>tj6|poc}(Xxk1?Hh_tOA{gdU-?qsA21n>L|p15C}`+aN&}8i^F* zmvpG8BBF>H6>A!RG?`Y9P+@9}wE#|WX!nJzu}wRWWe3kfUB_z1nt+6%sf4vg0u_Wi zQ7<(0vr4$&blAWEx;;McV|a8-}D$Ji7Q~skCh)} zw}O%Qw<$IH6x9UXPHos1xfcaM!?;>H;jSF9{@olrxP?x;A(nSND#|4FKX~*LgK(FU zQ2Q7vK*EtIl^%Plj1{z7sL~5DOQsyPmQw_ zCm{X7LZi=!>_$Z}1SpjnHxl^-F=LQLijY1MnE=51gB}#AppnSu0O?uPqd!u@r05xu z#txASO^vA*bp(q`$U%EvF;Q3f^H`+ezR9c@pNH(QKZzSD<IPA81^v9L3};P&K8vB(LY!9H{wgj zjhtnW!u=uo2nodBHE)n~`)>wa$xBcOSLRjCe7E7_f~2E2e!5&$`BV(ce32 zZ8ikBU_8bf@fP|7gSrkjC40$aCyKDGJz+rDkGjVG4E?HD_!ilB$cQq2CxN1zcoWxq z9&1PZQr>{`Sp$rrX|yB$tB=weur|E`kKy4R{YrIbY1UhlZ6H40#+7$tpz(=hEUjcx zZ7pQE`u>BwqUES{#WWxE9ks5CkZ&Z55w%8MrrT`vDFs%8lC2=*gztv#!2@U+?tYrsY8x-Il#E z^Nq$_z#3Bmf5^M7aQ64GN4C2{j8AmrD4KQv9f(ey`5=`LzxObkAj~kk*ODkXREP}Y zc`|0qkK3kQ3`Oxqh#L1}II%4sW5`Dzq`P4>Mq~G5YvJ5^5QCL3m<2k*<{Wgv*&bt$ z3Pc-=K<*%w@dDmEDI>gjy zgr>~VDr@2CG8melsOJF!^5mn|ThX}N(73Y>wpv#nv^M@8F+^IewinlJxA%SNzxUsx zG4d8p{eAt7-P@?GboYWO9xelHEq#Ppq~m?jBs9fHDPbuIKU^a{U$a)*^$~mDa~6B$ zOV-T`kYHG*kG0xT$%u9HbR^)}{CW6{+b)n%)Vza(cfr-uKEOVweiU8u0>Dw`KMdx! z;rJ~dFfZdoj`$NW`d|a93GegI9|DgB4wVnxBA(q>?Zr|6cX3)Y(35fLo_G_y5ch$J z1MGolwC^p8dLXXO7^RUh1TQ+2xD@*@s-7ujFH~htc^!F-&NfVdj!j&M7Vi87>c=*H zP|0^uVYs2~=pC$w;*Y`s(;fN00Poyvk{cws=+foq%ee8qHCO|yln=$dK%W`9Mv-~Yo|8b5i?W9HI;C!RTKLh@Zd@X8xM3imR_&L%MUD}$= za0q3fiFYG|QOXY>ky4+>k+gG`se=DR%6rKiS0$AnB6F-I<=FMxEiLm(@rOuJzr!mH z61$sTK%&jE#6jTV>ASyQJa#uVV24QYL>r5n;bq4`E3n=1B<>$O;C`QV!sotiePWNR za@Yy{6WRe$ty>!57o4$0vto-RyZ%enf5_uMVb@=>$L62yjGcdq{rDfPPavcIkxM^h zKmPjhH|@s{xhzMZbsKhN#$A@TN%5HKKUPhDM^uzGQ0;%)hU+RNASf2#;RI6>cM6WeoL_kzpHq^+7~ZW``$04vPMJB+d`(9fLT6KEse$T$}q-Qg>L`p9x?m`&+3Tl2WZ7-sgi)i1oi1s}TozR=_)UPd$|113BHMbT! z!xtReYYA^tO?qY0TXOi{slRDs($w1ikN2jwmu;=GeN{Dfg*Fp@F*diENZKwd+lKA? zpu$u7j-57FkEz37|`3v|madVy-msdGaR3I@CIf={5+--i03 z+FIBbx;-Sxop_IEsXVtuMnipm#Ma2C8rEoNayVQK4JOlQ?wXtJ9=B7&Zu)6i5`S6koc za%vuTwL7J)!VC?yP4$kHY-Cf7id$CORFh(f+B#Rgz0qB>Tx(d-(CDf*7bf(n_8Mnu zH8QFqklDzr!(&HV)C^s~6B^ysE{q^^7JEa3t3Cy3tgVHiYF21<^fIAYmdZp6UH0m=0 zp=7RYLF;-s)D;M{^mG}{&D~Jvm@j8Z5cMcM)o72b$R%@I=Ci-jB}P@rYs{C8TEJ9r zAuTNbdG&^t6uI>B+%;NLgNqBA z?T9(0*_pqG-J#J={bHg=O*TH4MGiv1Ro9f}c4CxfT6bNCVG=?Rj;4$jkt3WWolr%eY6A8?*_=O|1oR}9L2@jpcDP*9*fI|}(%DU4!g zi-j`IV=s?|X3n>8ek+ot=MQlHWzHYu{87%o$oc%m3XA=o^Cg^rNu)0@(qHEMT+YA3`8qCti1X_?{|C-j zarybYgKp(KeNRPUfXmxBKg#(lIlqp}mvR09&i8Zvr=0(U^TV8P<^$km&Ogif39f$^ z=Re}H^-_QByI6njX4;1M8GzxP#Kf?Jqm*3C% zq6&q{Jp$IO8mjjP*|pqAm+`NnYt82=w2NRwrAID zl!BN2mn<=!F|H+r*Rl0ORa>X8dqZkDhBoov*VvJ#29EFBejU@I+^-ngwqN)8V}ChV zy}IstgJ*WRSHy{0=Hy_)bpzyvN^^8;M?$6zl7n`>E zl)mY?*Pd}Iy1-5^}iEfJ{g?EvzD>jd7kdD?xaXa!2& z^!szRt z=S-8N*!j8z@qr62k))R{Sd7FUUwAfv)IffT!q7Y=CBZ;X_E}_d3hDG74hZ~y-blgI zw=(8@^lc0UHg-XkSqibq@2kc4uXzkbdXBGZaR}x?q2@PcWaQ4s5ma|~9>FHl%a;M>kHcs_k3FH4mq^b8B4d~Egw z38>)muruRDJ@9`E#t)vZuzdehKjK`2^zu4+zud#=7EXW2>CZSF2;jeaC#f3ot)mo=@w3Z$m!2G9pv=4oW8;7hn#-NX#t!Mg$p^gae5u6 zHJskYX(y-maJq%lA9DILP6s*tEvIjA`XQ%Za#}E(m(Qt3+0k}Jo2f&uz_E(u#`TSo07d6pQB2)(1wn%Z#8JR+n{Sj49kH) zcb8(`f@_&p-8+m~upYS(=zZzoX z^F)5(7x!}q)5RSq5mF`c3;#rG3=$Rjg`ch9j!gJfh_H~}{^dZ)r-}T+j}3B1FZxK- zSI7z4i2URu1uy($#TiCLK`UM9le{x!-K&`Nu@Vcm|Qbkn?w^ ziiP4*vWrN{G+_VVjU*(i+a6+yyo(yDlZU(DuW1lgPWQ^0)HI| z=KNw^me2F|^76Cge$S*e)s~!>ET@1 zU5ra|^O?#oS%CzE3rWBf(#tm`ofKu?*5@c0@SNniSms3f8ehp#%n*y`!t03pnmI8S zcLj03O2to+#C<3gpPpQYr{ejNxNoH5Gm`86R6N7^AP~sK9YXl;RQb~-;g?hKlh6I3 z^jw@KCH!cryft|~SSns93BQ+$&x9XS%n*y`(n%?bHv@hq{7xqPEcla5`0V8QXl7-( z(%G2*ndHxr#QaW`FOtN(O~t3qIZDOTZfZz0=E3Pw$=Qlo6leR;J?k%%DIgeH*I6>y zVIIRzwnGKOPqu>#cn+Rwm9%nKa=a@Ir}YNOZ%@PfnErF6LoP33Mtg`pz~bjhV;mRr zcK|r)kOn+n<5`%{QedkfS0k@mDK@RHlo#>oi7Pi`;mTeEWceI6_e3H)ZV^8|2a zAHFW3@+g#|!z>u*^i?8-vRp=_D&Qnv$J@pB(vUj)G;RO=ChDU+lexxyu)5~rYgr7eRPM7LW@2OEJVZV6J2Tt`A`zx1n z{M$_aTr4QO41rn}dVy!M+Yh*&@p%f9V|!bO4KY0@&!d-tvlEcg`uDvo_z9-x@0X`7JO+IyfzE|tt|NZEO;af{v(E;d_Q_5i~I|~ zGo3^Cr!4ZHWWmqG!ZMTHE@L?65uew5IWFClMc$VM{|?8~&!-yzo~hrT&7#M03bl9Y zzKrIuPzXFz{V&RbF9d!G{6pEf3d_&-)Di0gE;A(c=V9QuyPtG_e4Ob&T^hJTVHw`! z*$y#0RR?kJ0;m3>ZwD!e`^bk`5!z{x&^+&)5QFOxr0q5}ZTA;$I75e5`=_A5ybbG)!fVaR73 z|4)W<#zcm);J?d)znKO9Bnv(r2G4YU-yfX{l0G#?! zoDT(-g*92^Ls{_e0?*X0O9};-k`v#b(-c(;cjpwlcvs#)50B^ zpH7SmN#dY56QpS^L2XTYaJ8>pV;iBGuV=l4O-^=3UyE&V+2Vx~C1`Ce>xsJxr?JuQ zYD-T?#|BX!SB;ZZXIZ_y+J(1D434TJ;!~xtT~}6$cu&*cWAk@);fT5Nt8o+@e@o40 zyrovYh~7YDah%xJ(}8pH+H74ov=4^{Yb!nFl_9**=5uzhV$*|3VY{XLVP1 zu*QOfJKXIyJ zLz&jG+Ereu;Z-nKVwa2UA7aiG`^UKV{G#>fxsg8ub=u zRi1o$WZE&4na{{P`NT?g@MCHX*@1{@wOQGM^CDBl=nTa)Df5wwsY%Asp;HHXnhp{* zAGVm9iMKrSiIJ(5p}t}#S*F75K*%(u>Bl#wCTBds^KTyCqI+;AsBu!M>AcC*ax$EQ zN#|pxR{tx{<3x-99}WghFQ68KA8*cTJ)r=dgv$Cw9Ghw!nwg>Ai;Z_@-2Mh1okUu` zDD^#kR}H)o{>_HQuO~;_8SL(W({IE7=zQ?+?mB-%x#n_p)YCbobmXYGTcjR1s+r~{ zJa%Qu8~29G5WWG_a1V7YP|e$sJb%`B1u1#*DmuqF=T*~Cptx_a0fh5MQw_v?Lh@MJ zzxloaAJh^GYHNL+Ep&7$jo>t!8Hb@7cU5ucV&{s|jAzbKTUp_zQ&ZufsCj4^HO!d7 z$&+i-PNYpel9!)Yn%aTvgws^Wd|0W!hYCeO8P4cs&Qy5GsndK@&ek=~QBBovniafj z#5ujm(^5}9K~_;r$D{HySX0$zH0uJKF)OT;Jh+%0rOPvOkDYpkEYkS-tj2+^tONcJ DN)M?J diff --git a/examples_cuda/stencil/stencil_cu.o b/examples_cuda/stencil/stencil_cu.o deleted file mode 100644 index dcd38c9fd49fc3e9836c18ebd5de5a730e5452cd..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 21784 zcmc&+4|G)3nSYZ6h=`eBtEHmliJ?spV+K$O{>vmYDml^IZ`uKp~7}|@Zw6A z@bIvfbybvVJ-?W8ppEc3?`6h=*3y&T9g*iaoEhGQg3~kX(_D&@E$`{K)xMhkiq_Lt z-t*aT_lKu-5=xwD)#qihWw0;=Chs#JR{BbhZWqMr*}6ui>gV zW#9A?kd4#E`CQ|$)A*?W!)YGlmoEF0!?yEo$C(%5}g5A2)%9@uV2pX2p5 zkpIp5o3hK5rfjgZDg2+F{@)f4IqfrFa2#>;HQAnPvOVi9_EwxaIp02Q9rRL$>@)k| z@6{puUHgGJx#WOnpkzO9+3&LLhlYOE&<`WD)ed~~lG=k09EL6j;O}MZc5=WzO#x4y zb>|`_ww?nvx9wEzNqgpRz{Ad-gErTV>56mqOaQ7&mphj^mpkur>IYB-q=ul&sk{CD z7QY%#q}8@WcRX^XGF_Rjwj~ptYFn%$s@|l`QafVtsH#jIQ=v=WJ~ZU9MX^%k{838rH4RVpe{UD4ICa8zAdvwXg)@Jd6X zv#UEDRU1NGp_Q?YSb8nARV&Grimu-sO~#`gsp_uu8ikoMBQ;{?M5RXQh^<6(KwmOl z9a4O)i&bwd4&79DG6{xFoRONSbR-g8dMX`_hhtzN7_O&7sk>8(8jmKEH>)!uDhxIh zhSDl^qUn)9u%W>n2oU$^p5{ePug9gsXV6U>T7s>5%L2X0-PGb=LK~V}TA`Px)dT(R zbh~Ji%Wcu%^S3PUc-?;8eWSZLy?(dTwWL5xbBkW@X?FUT=nGo>dZRB$^^0!KhTDT4|7elu zps|YF4Gns*xuM1HcQ>@652(0cb0FyRwfI}zF5T;C@{Ag*a6_Oa=x-R+4SlMkx&S=Pr3Zh+{ zfq>gT3TbU=0Y^12(S2^Wf7Gf(oiGA$1bBoB!G^r6XhX1hVROqJ&F1!3!tK%U-3{H5 zDBCZo$|}{BjID@Sg*rZmX&Q6^q{dbI<;~}WET5ZHQzK9qsA=Nv;tfOA%uAiOX!P{_^Ri(Hck!Q{%Bz#p4d8p zRwu4^HjvNfwXjVq$%F0baxd~UxYf#dLPY`BgfZ4aWQVv3@i?X4gR>NgrXiX~BUMs= zu?vI{vlF^89BYe3BVsLt)YYMmScGZdlB?*rcyQ9iuo{A}!<$>#uqY4RV@3tt8PHio z8s(qS?Yhfx;u1uQ zrur5yi78{oF`ADPE}{TXiCi8C_+`ZJHltTyj6BZsKxl08LCAv0sAIGb`XPFm>``(t zR)}#4pSzpQiwf0GdFb0b2W1jFeSWy^K#5r#844STJ}~gGn_6xfi*bCeJl=UX0xIh%PIynqod-eX1wGzA{EEH zP&w{3(iNw?#(foM;ef4anIraxZh<4YH=8KcvW?}M@h6Y*iMQ&wR=fY6jwa(nm;I?) zPt_T3?U22v7Ki|OjpLrGSG>l1-l`#cW*PQ2hBVtzZ`E6x;V!jj)&is%uQ}~ced;yd z)w0VfrfAuY3e}s9R8)GQRO8L2E9QB#_f^!vFmShEe8TXfVU+cAjw}u$u{?Zc% z&JPc>`xLKn2KwbKJ+YZJ)S=J4#yfdQC;Rw)0gwULs$OMHC(=TW+c6AxAX=u+{?HqM zd5nP*{g96=WWH`h7Q~iP&3M~+hk2V5&>npMY>}c2WY+x&)*-llAhS7N+RjU+K}KdB z$j&mR{YgfH3_03XEqBv1s6;R~d|@UI{fo)m`URP6^onxqRu8-RQh6nMpP#*}dV2s66{8=&0>t9|b}XfP?YBNQnTX}O)qEf_|Gw4Eww2chM8 z<8B_2=gRItJi84^l5nf|WD)*)mba>&k0? z@;D&4R6PA}7h_c{f5Gf!ATx}(Acxal!872dY4JZwJ?xD8%mhW zQ@q4v-oF_aC@_qS0-srPiyOfWVA^&j=KjZkfgT#jl*1>=(7;3}Ldi5NxEY1&1jRnKU^FAR3%cy$U2y7{Wo!u0&^OZ1ZZ@DIxDu@cNt>AwZU{PTVF21|dOy>r z83*hS9lZb-&A@JmYYN|l5cj>NOBmB}#&LvQ3e1bVmfeY#z^vjJ4rU$z1t8&+R&~}> zJ7E9T)KW!BU3gor**44ed| zG85FGh^DZS!F4F}-3cJUc+(tdb`OqC%V{=k03PlE52Tk=VD5GthJzYhH<=eH-E46j zW|4X?4u-bEvg%?oceX>uT2Sp=RGVcOczrQ#(pVF-Bu{rUY#Z``z6LU;&1;;~j5qSE z8eqZoA-e+!dGu=RJ8FVi(~xGLnZfBEvQN&efEC1m#~82#y|0+-#dvqfzStyiVqWrF zFek96*;W95sn`h}SB?eYK=2<`0gQ|9=lA^S>NSCB#EMZRT{bD;`^)Fp-0>=wE~hgh`#e1vQv^k2}VsVn78- zKRnORnfv1|7C_A9&927~^EQ19OFp{=uZGziuHBFAaCgDrzI>$3e&-Q;>#v{$g*JQjORIaGJ?Fy@{Bt&k z(ZXfCXS`vK6|Y|4ifuIj+pBiuJz@?ChGMoDF=Ag?Mf+=|x+6Tq_K;^D}=4jr7mG|NU z7!iGP(W8#OrQ+Sm@&j-bfU_tb8t~4zYCrGI4ihI{{1jr0G1&8#O?&aUcEKR7j6R5> zeJ8$t&Z^}Ah$`IYH_*rQjOhZH*b`ra5$=8(>|^7~8_CqedC0VeUy*ljlzFCGyvAFs zx1*2I+3nKq#|4+o6l>W1uL9$GcK=KU`encl%aoe7d$I&D@r=QA?1N^0$v42a{Lqv? zQn1i(o(+JfjI#Ixp?kz)K8s^IjA$8S&+dyQ15I~aS3r2r`=ukQMszR?Usj$#6XA+i zf5YPt_kVrCdfSs%D6dV*l)N`pd1~^*a8!KfvT*>tJ#`!)=cd9o&s}cVpuGKx>5B65 z71u-K559IO0DnnB57LAIg(lo|G~v~WCKhc7fY&UV-~cfym5!<@N<^whrTQVMepsqM zE7gA~)t{H@FG%%4sg7p{nvT#4l<@^GO~0ZQD8DS#UyU*X7KB?X()%&ITFQoc2QvIM*ACT&Yr21j0u1X8| zN~!*?REG#TEKLEcu1>%kiZFXu)sTp%qifQYv3MF@;Kc5Y>iN8?P*;XE1UnNEc$qv? z?MQ?oO^HZ%N3=4Wh^NwOxIL7dIZI{YMj=A>%0!~WTC%A!m5$7xZ@yK(2?gH5-j~(K z;-Tc)(Rxg2;^{-~gPq$yJ=l8-lT!C^ohhyNmUEB);Yia;->qgHBh&B$qRjHfhKBiS zWn*)&N}XGML$!l9ewpXCd;N-SO}TCAqzR?C<|+WcoVT$cSzlJ3E4r<0%En?>nYzBj zRaV(EuA!_3Z(v-U_a)35%gRyaVvc%}sp|Hcon>ER+**_2Ji^<`?Vq`s_j<2Ywo&HC}qvUxodihfsC<1DL$ruwoe zK&=N=OxGva7t+OMrp$u8lnE0#x-Tqx1G4%H z^M0}2>+>?NO<;?%Sop36M4XoUKQk_g- z1?di$1q97tbpE=x~15b}iGZr5!0>OaSE%pLv35O;H} zuCI!B>)|zP95?8CJdx}S!FMv?#c>F}N8<5?0}kEo?({DVQ_-n(D4Fi=QmR*l!+La0 zINFtluW-N%Oise@px_8`4CXMQVh%T)O5^DA702~TbvV)48I6N?;2ST|YFPT$fP(Pd zTtKv4hi`*)MwNntz77YyDwzOl{wgU9b33_!q&kklOtXZ?dpzp1HAC|IEc$D#7pz~N zV}E?GE%#Re!{sGk?|pz_V$T@NBKs^SWh~+;MaZO0TW>ULeH=vnxEq8$Q#g)m8ud%z zTbU%qe@-8xi|x?@{mXg(uZoU57y1Q^@ecLR768vh|IZ}Dkc&y=b-X;LKX)j5nuQ&B z68c5|R=}|=`~P|VMG6*UMQ6?>RiyjBl+W}+xC4X2VxCYOt_9Iw;8VDM+5SGRzg!Zof()9z zWTC%?>+j|LQ5;?iW&Jq6@NdjI9lX-iQfX%SPtjlGcn&YrAHD*~l04s{F7g|B|C+o8 z-jDZ}_YdmABytz;Z=xfg(|P|9#4L$^0$;-W%l@zA{kKaU#JENOR%n-|h&3I<#~+a9 z@&4%l%ivG+7xRzTE_v+%q#U{mKjGDoq+hBsKghOYzw&GF6A}h3NMD0LJTuDrvAK}` za(o91X}xq~E9*B|V?U8sqWocaYzyiuq!DpR8Sh*?Cc2vq#ma<2;z5oW<>=;O;S2)f zIxHf%xL!B`x3L{#l54ohcQXEn{p@TG*xzwng&Y(58q*KquLvyiv6`Z=^EodUvxv?s zBr9g$3@1RI&x0XpdJ4)m@Q)a=$on`h_L;yBG5pKQIzEJYsAD<{e>mP_92fiXDws&b z>o{J?*ycxZfs>aH_(c$@@qCU6&nK7+CTDy;9zr#ripNABv#FT)KTn22@hC5jv=l2} z8L5y`u_B*h%*JB2BMA|TTbW^(=KU#J|B2(mp923O$HjgY_>VX)>LP!V;bZOG$@!`v zlDs(YH3CAh7-ptWDjst+RFtQID%*~IpRvIIWPwkBU{gpAo;M5Oms#L50lyeV(8UKJ z?oV4e-pg@zj|$~)a=ee@|IPD90H03wk+@#`81O>&{F?>;IfnB?3&WPEbwXz{3Z*$6>-?}atc}S*IM9;_+3o6yE@#} z1rI&&xs3Q#K!|{5-KeAzEJsR)s^eqg>s=MHsO!aK&;C0ld;1>iTS8_F@1ky?P ze*a$}i9IasN+!Ch=hQNJv(2xVH-$Pn65&pSYCEHy;jXpXY}Z=fYKC0L9tvUGoM=1( zIoMG%@=O*@4AkhIE8PxwW_u5&*A7QuYB3jEKd&KdR0IM&%| zCAZPrQt$NYda8S+?$Px-XFFC!(_`HuyA%uxq`mWWJ(h^YWAJRs+c5hE_6d)#ybW_| z@UxZ`=djPYtjF9i{Mvw(I#QlM+Tp-w#(Fv#iltNdkj@X-oRy(eEUYKs$zL?o=?TL( zyaU(iFktA{97uGe+%YA(26TqIo1&dg^trxzwpl};rxPx$|0P5Df@=t5&hx-%UGQKs z7K{0OkpRw&#j{5J&Pa!Eb>VggJeDie@&EDw5|&&lng+4)dpZ@UbS75gL%~3Lc8#v5 z+mnfV^p0qJRk~e=IY=bYp`D>v9DcU8YRwwGE1FCt;_!or^jdv&4N3`hf=_X{kh$0o zFB)NG;LqeJK+DbMW5_^yZaC3B!s5S}8~MSU0y~I3?+tf1q}MdCw;MQRZHcaETxm;1 zqjxK9>1aoX(uONuffbpKbw(3yk;AUv+g7A}W1CKjuNB#28{EZH* zGm(4?BM>5B(K&R5cVZ6YwzEYajNcQBM|6KQ)s2sY^-v0b;HbBSVjZ9XR_eT1BBjF) zi_XQVZD0#gkE9ZMdng_OcNHMHOj3AFNQa+H;m%Xwmz8G196~kxw9!oBjtN6O-Yvqn z0ay~Z$B17|@F0{iA&$>R1-~0XNGKogCd7IF+XzAudd-@m{D>FuF&w@7*u`<#9yp$` zB;Gs?%WZebGhDDLB~A$(k`!u}^1d+d1U5qv)3ue8ANxi=I6i_G^1n}T$~T`s zjQBr{LLoiQ`8Y3PP7wbokwe$>9t(Uwk%MtY?2BhD_`f4~9q@(y9}%3|Z^H^Cv3~y6 ztSaLPPVK*t`-Psn+j@UJC$$MXvkHdFj^f@7N({xHq9 zz+D#jG7Eew$HjX12UEQAeZq%Z$^0|-3|1fs{`btPg7=%4(8dP%6aG9+a2LT#aB)CF zz8n4o|4Rg?{G|lPYpCEqNbp92FXb0wl&=x|Zi0IV{#}CKM)03^VYkoGXEUK=3Al+whPC3GHl#Kf%Xuz+)16e{WWmZxDV9V1j=$!SO61@WljgB>24q z$9ENi{~dzgPVkQi?kD(pf>XU;Ed>%J99JX!3Hdh?oa$XlaD0^`_-TUE>ui?b(+L0H z3EoQZ0|ZB#g#2d-j%O`_zshm+KTh2S_`~#D!e0b+!7t(m5tL2$@6`myt$=@+@HaS^ zkbejK2{}I?IC3Y!AEw77T*!Hv;41KhoC5^ElHe~$_#gTCd>jrG652!U>?Am~GfQx4 z=OYp>?A$?c6?6zYj}V;dosS1_NV0zx6P)VRC0yuD5?lp&Lhn5sNA)$f?q}OO?&|eJMp*2nEr+1=pQQoJi+ODo^TQC zmX9ar%&Jnwag;;Xa}B}qdMfO^j^IlO?j$(M68u#JpF{9BIgaDP5H9W?-ywVqEke#| zg3l)Sp9qfc3I%^c8B`$2^2<1mHd6VQ5S;q$3Q5lMOe%aYj_|49st6zNL)Y=^=p2I6 z@y?U*JzW0HME+91#JCzIzVKT!;nQ&~w&3d){Fnv5%YuKe1^+<{e%6Bjhz0*Ug#Qih z8_d2(aNNI@yzK`>4r0P@KPLE0fbg46Z$o`j3_^AV9x*UuS(Q~UoY$r1L8S7kUZ^fSg9OcU&8F2{|@ z1gCzU%5mWzv2LdmK3%u7EckOQ`11%K{Z_~QR!?y1w?+xaIl$y4@~Pi~gpb#jDU7Ws zcM|?Gz{Gmf34b}~ViqAdl@lZQU4)+`@^!$4d_lp>0aCkg)IQrog&OasL?{l09fHtfmzl6&v=j{TY#&K1` zYdKyc;fpw4C*ets`v{KqZ{YX}34e^^T?9uv1%I7{@8JB+1V{OU9N#YC_+NEk>LobJ z-_P+r3I7ep2MLbyRla|ZNw|;WrwESn&u}^HzzB70M*L#V7w;YgK8xcj?{f z5y$HYj&`o$xR2ndH^K201V{dMj(17;Ajj89_*sr`mT;9HaJNgi#_?W)qdf+f(dEyFTaGcdLzg2dRNO3 d^8^3iB238Vp`F#EgntP`%nz~e1YgAY{{efPftdgR diff --git a/examples_cuda/stencil/stencil_ispc.h b/examples_cuda/stencil/stencil_ispc.h deleted file mode 100644 index 10b0d713..00000000 --- a/examples_cuda/stencil/stencil_ispc.h +++ /dev/null @@ -1,34 +0,0 @@ -// -// stencil_ispc.h -// (Header automatically generated by the ispc compiler.) -// DO NOT EDIT THIS FILE. -// - -#ifndef ISPC_STENCIL_ISPC_H -#define ISPC_STENCIL_ISPC_H - -#include - - - -#ifdef __cplusplus -namespace ispc { /* namespace */ -#endif // __cplusplus - -/////////////////////////////////////////////////////////////////////////// -// Functions exported from ispc code -/////////////////////////////////////////////////////////////////////////// -#if defined(__cplusplus) && !defined(__ISPC_NO_EXTERN_C) -extern "C" { -#endif // __cplusplus - extern void loop_stencil_ispc_tasks(int32_t t0, int32_t t1, int32_t x0, int32_t x1, int32_t y0, int32_t y1, int32_t z0, int32_t z1, int32_t Nx, int32_t Ny, int32_t Nz, const double * coef, const double * vsq, double * Aeven, double * Aodd); -#if defined(__cplusplus) && !defined(__ISPC_NO_EXTERN_C) -} /* end extern C */ -#endif // __cplusplus - - -#ifdef __cplusplus -} /* namespace */ -#endif // __cplusplus - -#endif // ISPC_STENCIL_ISPC_H diff --git a/examples_cuda/stencil/stencil_ispc_nvptx64.ll b/examples_cuda/stencil/stencil_ispc_nvptx64.ll deleted file mode 100644 index 51c0d95a..00000000 --- a/examples_cuda/stencil/stencil_ispc_nvptx64.ll +++ /dev/null @@ -1,974 +0,0 @@ -; ModuleID = 'stencil_ispc_nvptx64.bc' -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" -target triple = "nvptx64" - -module asm "" -module asm ".extern .func (.param .b32 func_retval0) cudaLaunchDevice" -module asm "(" -module asm " .param .b64 cudaLaunchDevice_param_0," -module asm " .param .b64 cudaLaunchDevice_param_1," -module asm " .param .align 4 .b8 cudaLaunchDevice_param_2[12]," -module asm " .param .align 4 .b8 cudaLaunchDevice_param_3[12]," -module asm " .param .b32 cudaLaunchDevice_param_4," -module asm " .param .b64 cudaLaunchDevice_param_5" -module asm ");" - -@constDeltaForeach1 = private unnamed_addr constant [32 x i8] zeroinitializer -@constDeltaForeach4 = private unnamed_addr constant [32 x i8] c"\00\01\02\03\04\05\06\07\08\09\0A\0B\0C\0D\0E\0F\10\11\12\13\14\15\16\17\18\19\1A\1B\1C\1D\1E\1F" - -declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() nounwind readnone - -declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() nounwind readnone - -declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() nounwind readnone - -declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.z() nounwind readnone - -declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.x() nounwind readnone - -declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.y() nounwind readnone - -declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.z() nounwind readnone - -define i32 @__shfl_i32(i32, i32) { - %shfl = tail call i32 asm sideeffect "shfl.idx.b32 $0, $1, $2, 0x1f;", "=r,r,r"(i32 %0, i32 %1) - ret i32 %shfl -} - -define float @__shfl_xor_float(float, i32) { - %shfl = tail call float asm sideeffect "shfl.bfly.b32 $0, $1, $2, 0x1f;", "=f,f,r"(float %0, i32 %1) - ret float %shfl -} - -define i32 @__shfl_xor_i32(i32, i32) { - %shfl = tail call i32 asm sideeffect "shfl.bfly.b32 $0, $1, $2, 0x1f;", "=r,r,r"(i32 %0, i32 %1) - ret i32 %shfl -} - -define float @__fminf(float, float) { - %min = tail call float asm sideeffect "min.f32 $0, $1, $2;", "=f,f,f"(float %0, float %1) - ret float %min -} - -define float @__fmaxf(float, float) { - %max = tail call float asm sideeffect "max.f32 $0, $1, $2;", "=f,f,f"(float %0, float %1) - ret float %max -} - -define i32 @__ballot(i1) { - %conv = zext i1 %0 to i32 - %res = tail call i32 asm sideeffect "{ .reg .pred %p1; \0A setp.ne.u32 %p1, $1, 0; \0A vote.ballot.b32 $0, %p1; \0A }", "=r,r"(i32 %conv) - ret i32 %res -} - -define i32 @__lanemask_lt() { - %mask = tail call i32 asm sideeffect "mov.u32 $0, %lanemask_lt;", "=r"() - ret i32 %mask -} - -define i8* @ISPCAlloc(i8**, i64, i32) { - ret i8* inttoptr (i64 1 to i8*) -} - -declare i64 @cudaGetParameterBuffer(i64, i64) - -define i8* @ISPCGetParamBuffer(i8**, i64 %align, i64 %size) { -entry: - %tid.i = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() - %and = and i32 %tid.i, 31 - %cmp = icmp eq i32 %and, 0 - br i1 %cmp, label %if.then, label %if.end - -if.then: ; preds = %entry - %ptri64tmp = tail call i64 @cudaGetParameterBuffer(i64 %align, i64 %size) - %phitmp = inttoptr i64 %ptri64tmp to i8* - br label %if.end - -if.end: ; preds = %if.then, %entry - %ptri64 = phi i8* [ %phitmp, %if.then ], [ null, %entry ] - ret i8* %ptri64 -} - -define void @ISPCLaunch(i8**, i8* %func_ptr, i8* %func_args, i32 %ntx, i32 %nty, i32 %ntz) { -entry: - %tid.i = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() - %and = and i32 %tid.i, 31 - %cmp = icmp eq i32 %and, 0 - br i1 %cmp, label %if.then, label %if.end - -if.then: ; preds = %entry - %ntxm1 = add nsw i32 %ntx, -1 - %ntxm1d4 = ashr i32 %ntxm1, 2 - %nbx = add nsw i32 %ntxm1d4, 1 - %args_i64 = ptrtoint i8* %func_args to i64 - %func_i64 = ptrtoint i8* %func_ptr to i64 - %res_tmp = tail call i32 asm sideeffect "{\0A .param .b64 param0;\0A st.param.b64\09[param0+0], $1;\0A .param .b64 param1;\0A st.param.b64\09[param1+0], $2;\0A .param .align 4 .b8 param2[12];\0A st.param.b32\09[param2+0], $3; \0A st.param.b32\09[param2+4], $4; \0A st.param.b32\09[param2+8], $5; \0A .param .align 4 .b8 param3[12];\0A st.param.b32\09[param3+0], $6; \0A st.param.b32\09[param3+4], $7; \0A st.param.b32\09[param3+8], $8; \0A .param .b32 param4;\0A st.param.b32\09[param4+0], $9; \0A .param .b64 param5;\0A st.param.b64\09[param5+0], $10; \0A\0A .param .b32 retval0;\0A call.uni (retval0), \0A cudaLaunchDevice,\0A (\0A param0, \0A param1, \0A param2, \0A param3, \0A param4, \0A param5\0A );\0A ld.param.b32\09$0, [retval0+0];\0A }\0A ", "=r, l,l, r,r,r, r,r,r, r,l"(i64 %func_i64, i64 %args_i64, i32 %nbx, i32 %nty, i32 %ntz, i32 128, i32 1, i32 1, i32 0, i64 0) - br label %if.end - -if.end: ; preds = %if.then, %entry - ret void -} - -declare i32 @cudaDeviceSynchronize() - -define void @ISPCSync(i8*) { - %2 = tail call i32 @cudaDeviceSynchronize() - ret void -} - -define i64 @__warpBinExclusiveScan(i1 %p) { -entry: - %conv.i = zext i1 %p to i32 - %res.i = tail call i32 asm sideeffect "{ .reg .pred %p1; \0A setp.ne.u32 %p1, $1, 0; \0A vote.ballot.b32 $0, %p1; \0A }", "=r,r"(i32 %conv.i) - %res.i1 = tail call i32 asm sideeffect "popc.b32 $0, $1;", "=r,r"(i32 %res.i) - %mask.i = tail call i32 asm sideeffect "mov.u32 $0, %lanemask_lt;", "=r"() - %and = and i32 %mask.i, %res.i - %res.i2 = tail call i32 asm sideeffect "popc.b32 $0, $1;", "=r,r"(i32 %and) - %retval.sroa.1.4.insert.ext.i = zext i32 %res.i2 to i64 - %retval.sroa.1.4.insert.shift.i = shl nuw i64 %retval.sroa.1.4.insert.ext.i, 32 - %retval.sroa.0.0.insert.ext.i = zext i32 %res.i1 to i64 - %retval.sroa.0.0.insert.insert.i = or i64 %retval.sroa.1.4.insert.shift.i, %retval.sroa.0.0.insert.ext.i - ret i64 %retval.sroa.0.0.insert.insert.i -} - -define internal void @stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_(i32 %x0, i32 %x1, i32 %y0, i32 %y1, i32 %z0, i32 %z1, i32 %Nx, i32 %Ny, i32 %Nz, double* %coef, double* %vsq, double* %Ain, double* %Aout) { -allocas: - %bid.i.i = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() - %mul_calltmp_.i = shl i32 %bid.i.i, 2 - %tid.i.i = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() - %bitop.i = ashr i32 %tid.i.i, 5 - %add_mul_calltmp__bitop.i = add i32 %bitop.i, %mul_calltmp_.i - %nb.i.i = tail call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x() - %mul_calltmp_.i57 = shl i32 %nb.i.i, 2 - %greaterequal_calltmp_calltmp18 = icmp sge i32 %add_mul_calltmp__bitop.i, %mul_calltmp_.i57 - %bid.i.i58 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() - %nb.i.i59 = tail call i32 @llvm.nvvm.read.ptx.sreg.nctaid.y() - %greaterequal_calltmp21_calltmp24 = icmp sge i32 %bid.i.i58, %nb.i.i59 - %logical_or = or i1 %greaterequal_calltmp_calltmp18, %greaterequal_calltmp21_calltmp24 - %bid.i.i60 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z() - %nb.i.i61 = tail call i32 @llvm.nvvm.read.ptx.sreg.nctaid.z() - %greaterequal_calltmp27_calltmp30 = icmp sge i32 %bid.i.i60, %nb.i.i61 - %logical_or31 = or i1 %logical_or, %greaterequal_calltmp27_calltmp30 - br i1 %logical_or31, label %if_then, label %if_exit - -if_then: ; preds = %foreach_reset19.i, %if_exit, %allocas - ret void - -if_exit: ; preds = %allocas - %bid.i.i62 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() - %mul_calltmp_.i63 = shl i32 %bid.i.i62, 7 - %tid.i.i64 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() - %bitop.i657375 = add i32 %tid.i.i64, %mul_calltmp_.i63 - %mul_calltmp35_ = and i32 %bitop.i657375, -32 - %add_x0_load_mul_calltmp35_ = add i32 %mul_calltmp35_, %x0 - %add_xfirst_load_ = add i32 %add_x0_load_mul_calltmp35_, 32 - %c.i.i = icmp sgt i32 %add_xfirst_load_, %x1 - %r.i.i = select i1 %c.i.i, i32 %x1, i32 %add_xfirst_load_ - %bid.i.i67 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() - %mul_calltmp41_ = shl i32 %bid.i.i67, 3 - %add_y0_load_mul_calltmp41_ = add i32 %mul_calltmp41_, %y0 - %add_yfirst_load_ = add i32 %add_y0_load_mul_calltmp41_, 8 - %bid.i.i70 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z() - %mul_calltmp47_ = shl i32 %bid.i.i70, 3 - %add_z0_load_mul_calltmp47_ = add i32 %mul_calltmp47_, %z0 - %add_zfirst_load_ = add i32 %add_z0_load_mul_calltmp47_, 8 - %c.i.i71 = icmp sgt i32 %add_zfirst_load_, %z1 - %r.i.i72 = select i1 %c.i.i71, i32 %z1, i32 %add_zfirst_load_ - %mul_Nx_load_Ny_load.i = mul i32 %Ny, %Nx - %nitems29.i = sub i32 %r.i.i, %add_x0_load_mul_calltmp35_ - %nextras30.i = srem i32 %nitems29.i, 32 - %aligned_end31.i = sub i32 %r.i.i, %nextras30.i - %tid.i4.i = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() - %__laneidx.i = and i32 %tid.i4.i, 31 - %0 = zext i32 %__laneidx.i to i64 - %arrayidx.i = getelementptr [32 x i8]* @constDeltaForeach1, i64 0, i64 %0 - %cmp38.i396 = icmp slt i32 %add_z0_load_mul_calltmp47_, %r.i.i72 - br i1 %cmp38.i396, label %foreach_test21.i.preheader.lr.ph, label %if_then - -foreach_test21.i.preheader.lr.ph: ; preds = %if_exit - %c.i.i68 = icmp sgt i32 %add_yfirst_load_, %y1 - %r.i.i69 = select i1 %c.i.i68, i32 %y1, i32 %add_yfirst_load_ - %1 = load i8* %arrayidx.i, align 1 - %_zext.i394 = zext i8 %1 to i32 - %2 = insertelement <1 x i32> undef, i32 %_zext.i394, i32 0 - %smear_counter_init.i393 = insertelement <1 x i32> undef, i32 %add_z0_load_mul_calltmp47_, i32 0 - %iter_val.i395 = add <1 x i32> %smear_counter_init.i393, %2 - %smear_counter_init44.i387 = insertelement <1 x i32> undef, i32 %add_y0_load_mul_calltmp41_, i32 0 - %cmp54.i390 = icmp slt i32 %add_y0_load_mul_calltmp41_, %r.i.i69 - %before_aligned_end73.i385 = icmp slt i32 %add_x0_load_mul_calltmp35_, %aligned_end31.i - %smear_end_init289.i = insertelement <1 x i32> undef, i32 %r.i.i, i32 0 - %Nxy_load298_broadcast_init.i = insertelement <1 x i32> undef, i32 %mul_Nx_load_Ny_load.i, i32 0 - %Nx_load300_broadcast_init.i = insertelement <1 x i32> undef, i32 %Nx, i32 0 - %Ain_load309_ptr2int.i = ptrtoint double* %Ain to i64 - %coef_load314_offset.i = getelementptr double* %coef, i64 1 - %coef_load365_offset.i = getelementptr double* %coef, i64 2 - %mul__Nx_load385.i = shl i32 %Nx, 1 - %mul__Nx_load393.i = mul i32 %Nx, -2 - %mul__Nxy_load402.i = shl i32 %mul_Nx_load_Ny_load.i, 1 - %mul__Nxy_load410.i = mul i32 %mul_Nx_load_Ny_load.i, -2 - %coef_load416_offset.i = getelementptr double* %coef, i64 3 - %mul__Nx_load436.i = mul i32 %Nx, 3 - %mul__Nx_load444.i = mul i32 %Nx, -3 - %mul__Nxy_load453.i = mul i32 %mul_Nx_load_Ny_load.i, 3 - %mul__Nxy_load461.i = mul i32 %mul_Nx_load_Ny_load.i, -3 - %Aout_load470_ptr2int.i = ptrtoint double* %Aout to i64 - %vsq_load488_ptr2int.i = ptrtoint double* %vsq to i64 - %3 = sub i32 -9, %y0 - %4 = shl i32 %bid.i.i67, 3 - %5 = sub i32 %3, %4 - %6 = xor i32 %y1, -1 - %7 = icmp sgt i32 %5, %6 - %smax = select i1 %7, i32 %5, i32 %6 - %8 = xor i32 %smax, -1 - %9 = sub i32 -9, %z0 - %10 = shl i32 %bid.i.i70, 3 - %11 = sub i32 %9, %10 - %12 = xor i32 %z1, -1 - %13 = icmp sgt i32 %11, %12 - %smax399 = select i1 %13, i32 %11, i32 %12 - %14 = xor i32 %smax399, -1 - br label %foreach_test21.i.preheader - -foreach_full_body.i: ; preds = %outer_not_in_extras.i.preheader, %foreach_full_body.i - %counter32.4.i386 = phi i32 [ %new_counter279.i, %foreach_full_body.i ], [ %add_x0_load_mul_calltmp35_, %outer_not_in_extras.i.preheader ] - %tid.i.i56 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() - %__laneidx80.i = and i32 %tid.i.i56, 31 - %15 = zext i32 %__laneidx80.i to i64 - %arrayidx81.i = getelementptr [32 x i8]* @constDeltaForeach4, i64 0, i64 %15 - %16 = load i8* %arrayidx81.i, align 1 - %_zext82.i = zext i8 %16 to i32 - %coef_load_offset_load.i = load double* %coef, align 8 - %.lhs362.lhs.lhs = extractelement <1 x i32> %mul_z_load297_Nxy_load298_broadcast.i, i32 0 - %.lhs362.lhs.rhs.lhs = extractelement <1 x i32> %iter_val50.i392, i32 0 - %.lhs362.lhs.rhs = mul i32 %.lhs362.lhs.rhs.lhs, %Nx - %.lhs362.lhs = add i32 %.lhs362.lhs.lhs, %.lhs362.lhs.rhs - %.lhs362.rhs = add i32 %counter32.4.i386, %_zext82.i - %.lhs362 = add i32 %.lhs362.lhs, %.lhs362.rhs - %17 = shl i32 %.lhs362, 3 - %iptr__id.i.rhs = sext i32 %17 to i64 - %iptr__id.i = add i64 %iptr__id.i.rhs, %Ain_load309_ptr2int.i - %ptr__id.i = inttoptr i64 %iptr__id.i to double* - %val__id.i = load double* %ptr__id.i, align 8 - %coef_load94_offset_load.i = load double* %coef_load314_offset.i, align 8 - %18 = add i32 %17, 8 - %iptr__id.i335.rhs = sext i32 %18 to i64 - %iptr__id.i335 = add i64 %iptr__id.i335.rhs, %Ain_load309_ptr2int.i - %ptr__id.i336 = inttoptr i64 %iptr__id.i335 to double* - %val__id.i337 = load double* %ptr__id.i336, align 8 - %19 = add i32 %17, -8 - %iptr__id.i330.rhs = sext i32 %19 to i64 - %iptr__id.i330 = add i64 %iptr__id.i330.rhs, %Ain_load309_ptr2int.i - %ptr__id.i331 = inttoptr i64 %iptr__id.i330 to double* - %val__id.i332 = load double* %ptr__id.i331, align 8 - %.lhs365 = add i32 %.lhs362, %Nx - %20 = shl i32 %.lhs365, 3 - %iptr__id.i325.rhs = sext i32 %20 to i64 - %iptr__id.i325 = add i64 %iptr__id.i325.rhs, %Ain_load309_ptr2int.i - %ptr__id.i326 = inttoptr i64 %iptr__id.i325 to double* - %val__id.i327 = load double* %ptr__id.i326, align 8 - %.lhs366 = sub i32 %.lhs362, %Nx - %21 = shl i32 %.lhs366, 3 - %iptr__id.i320.rhs = sext i32 %21 to i64 - %iptr__id.i320 = add i64 %iptr__id.i320.rhs, %Ain_load309_ptr2int.i - %ptr__id.i321 = inttoptr i64 %iptr__id.i320 to double* - %val__id.i322 = load double* %ptr__id.i321, align 8 - %.lhs367 = add i32 %.lhs362, %mul_Nx_load_Ny_load.i - %22 = shl i32 %.lhs367, 3 - %iptr__id.i315.rhs = sext i32 %22 to i64 - %iptr__id.i315 = add i64 %iptr__id.i315.rhs, %Ain_load309_ptr2int.i - %ptr__id.i316 = inttoptr i64 %iptr__id.i315 to double* - %val__id.i317 = load double* %ptr__id.i316, align 8 - %.lhs368 = sub i32 %.lhs362, %mul_Nx_load_Ny_load.i - %23 = shl i32 %.lhs368, 3 - %iptr__id.i310.rhs = sext i32 %23 to i64 - %iptr__id.i310 = add i64 %iptr__id.i310.rhs, %Ain_load309_ptr2int.i - %ptr__id.i311 = inttoptr i64 %iptr__id.i310 to double* - %val__id.i312 = load double* %ptr__id.i311, align 8 - %coef_load145_offset_load.i = load double* %coef_load365_offset.i, align 8 - %24 = add i32 %17, 16 - %iptr__id.i305.rhs = sext i32 %24 to i64 - %iptr__id.i305 = add i64 %iptr__id.i305.rhs, %Ain_load309_ptr2int.i - %ptr__id.i306 = inttoptr i64 %iptr__id.i305 to double* - %val__id.i307 = load double* %ptr__id.i306, align 8 - %25 = add i32 %17, -16 - %iptr__id.i300.rhs = sext i32 %25 to i64 - %iptr__id.i300 = add i64 %iptr__id.i300.rhs, %Ain_load309_ptr2int.i - %ptr__id.i301 = inttoptr i64 %iptr__id.i300 to double* - %val__id.i302 = load double* %ptr__id.i301, align 8 - %.lhs371 = add i32 %.lhs362, %mul__Nx_load385.i - %26 = shl i32 %.lhs371, 3 - %iptr__id.i295.rhs = sext i32 %26 to i64 - %iptr__id.i295 = add i64 %iptr__id.i295.rhs, %Ain_load309_ptr2int.i - %ptr__id.i296 = inttoptr i64 %iptr__id.i295 to double* - %val__id.i297 = load double* %ptr__id.i296, align 8 - %.lhs372 = add i32 %.lhs362, %mul__Nx_load393.i - %27 = shl i32 %.lhs372, 3 - %iptr__id.i290.rhs = sext i32 %27 to i64 - %iptr__id.i290 = add i64 %iptr__id.i290.rhs, %Ain_load309_ptr2int.i - %ptr__id.i291 = inttoptr i64 %iptr__id.i290 to double* - %val__id.i292 = load double* %ptr__id.i291, align 8 - %.lhs373 = add i32 %.lhs362, %mul__Nxy_load402.i - %28 = shl i32 %.lhs373, 3 - %iptr__id.i285.rhs = sext i32 %28 to i64 - %iptr__id.i285 = add i64 %iptr__id.i285.rhs, %Ain_load309_ptr2int.i - %ptr__id.i286 = inttoptr i64 %iptr__id.i285 to double* - %val__id.i287 = load double* %ptr__id.i286, align 8 - %.lhs374 = add i32 %.lhs362, %mul__Nxy_load410.i - %29 = shl i32 %.lhs374, 3 - %iptr__id.i280.rhs = sext i32 %29 to i64 - %iptr__id.i280 = add i64 %iptr__id.i280.rhs, %Ain_load309_ptr2int.i - %ptr__id.i281 = inttoptr i64 %iptr__id.i280 to double* - %val__id.i282 = load double* %ptr__id.i281, align 8 - %coef_load196_offset_load.i = load double* %coef_load416_offset.i, align 8 - %30 = add i32 %17, 24 - %iptr__id.i275.rhs = sext i32 %30 to i64 - %iptr__id.i275 = add i64 %iptr__id.i275.rhs, %Ain_load309_ptr2int.i - %ptr__id.i276 = inttoptr i64 %iptr__id.i275 to double* - %val__id.i277 = load double* %ptr__id.i276, align 8 - %31 = add i32 %17, -24 - %iptr__id.i270.rhs = sext i32 %31 to i64 - %iptr__id.i270 = add i64 %iptr__id.i270.rhs, %Ain_load309_ptr2int.i - %ptr__id.i271 = inttoptr i64 %iptr__id.i270 to double* - %val__id.i272 = load double* %ptr__id.i271, align 8 - %.lhs377 = add i32 %.lhs362, %mul__Nx_load436.i - %32 = shl i32 %.lhs377, 3 - %iptr__id.i265.rhs = sext i32 %32 to i64 - %iptr__id.i265 = add i64 %iptr__id.i265.rhs, %Ain_load309_ptr2int.i - %ptr__id.i266 = inttoptr i64 %iptr__id.i265 to double* - %val__id.i267 = load double* %ptr__id.i266, align 8 - %.lhs378 = add i32 %.lhs362, %mul__Nx_load444.i - %33 = shl i32 %.lhs378, 3 - %iptr__id.i260.rhs = sext i32 %33 to i64 - %iptr__id.i260 = add i64 %iptr__id.i260.rhs, %Ain_load309_ptr2int.i - %ptr__id.i261 = inttoptr i64 %iptr__id.i260 to double* - %val__id.i262 = load double* %ptr__id.i261, align 8 - %.lhs379 = add i32 %.lhs362, %mul__Nxy_load453.i - %34 = shl i32 %.lhs379, 3 - %iptr__id.i255.rhs = sext i32 %34 to i64 - %iptr__id.i255 = add i64 %iptr__id.i255.rhs, %Ain_load309_ptr2int.i - %ptr__id.i256 = inttoptr i64 %iptr__id.i255 to double* - %val__id.i257 = load double* %ptr__id.i256, align 8 - %.lhs380 = add i32 %.lhs362, %mul__Nxy_load461.i - %35 = shl i32 %.lhs380, 3 - %iptr__id.i250.rhs = sext i32 %35 to i64 - %iptr__id.i250 = add i64 %iptr__id.i250.rhs, %Ain_load309_ptr2int.i - %ptr__id.i251 = inttoptr i64 %iptr__id.i250 to double* - %val__id.i252 = load double* %ptr__id.i251, align 8 - %val__id.i247 = load double* %ptr__id.i, align 8 - %iptr__id.i240 = add i64 %iptr__id.i.rhs, %Aout_load470_ptr2int.i - %ptr__id.i241 = inttoptr i64 %iptr__id.i240 to double* - %val__id.i242 = load double* %ptr__id.i241, align 8 - %iptr__id.i235 = add i64 %iptr__id.i.rhs, %vsq_load488_ptr2int.i - %ptr__id.i236 = inttoptr i64 %iptr__id.i235 to double* - %val__id.i237 = load double* %ptr__id.i236, align 8 - %val__id.i233.lhs.lhs = fmul double %val__id.i247, 2.000000e+00 - %val__id.i233.lhs = fsub double %val__id.i233.lhs.lhs, %val__id.i242 - %val__id.i233.rhs.rhs.lhs.lhs.lhs = fmul double %coef_load_offset_load.i, %val__id.i - %val__id.i233.rhs.rhs.lhs.lhs.rhs.rhs.lhs.lhs.lhs.lhs = fadd double %val__id.i337, %val__id.i332 - %val__id.i233.rhs.rhs.lhs.lhs.rhs.rhs.lhs.lhs.lhs = fadd double %val__id.i233.rhs.rhs.lhs.lhs.rhs.rhs.lhs.lhs.lhs.lhs, %val__id.i327 - %val__id.i233.rhs.rhs.lhs.lhs.rhs.rhs.lhs.lhs = fadd double %val__id.i233.rhs.rhs.lhs.lhs.rhs.rhs.lhs.lhs.lhs, %val__id.i322 - %val__id.i233.rhs.rhs.lhs.lhs.rhs.rhs.lhs = fadd double %val__id.i233.rhs.rhs.lhs.lhs.rhs.rhs.lhs.lhs, %val__id.i317 - %val__id.i233.rhs.rhs.lhs.lhs.rhs.rhs = fadd double %val__id.i233.rhs.rhs.lhs.lhs.rhs.rhs.lhs, %val__id.i312 - %val__id.i233.rhs.rhs.lhs.lhs.rhs = fmul double %coef_load94_offset_load.i, %val__id.i233.rhs.rhs.lhs.lhs.rhs.rhs - %val__id.i233.rhs.rhs.lhs.lhs = fadd double %val__id.i233.rhs.rhs.lhs.lhs.lhs, %val__id.i233.rhs.rhs.lhs.lhs.rhs - %val__id.i233.rhs.rhs.lhs.rhs.rhs.lhs.lhs.lhs.lhs = fadd double %val__id.i307, %val__id.i302 - %val__id.i233.rhs.rhs.lhs.rhs.rhs.lhs.lhs.lhs = fadd double %val__id.i233.rhs.rhs.lhs.rhs.rhs.lhs.lhs.lhs.lhs, %val__id.i297 - %val__id.i233.rhs.rhs.lhs.rhs.rhs.lhs.lhs = fadd double %val__id.i233.rhs.rhs.lhs.rhs.rhs.lhs.lhs.lhs, %val__id.i292 - %val__id.i233.rhs.rhs.lhs.rhs.rhs.lhs = fadd double %val__id.i233.rhs.rhs.lhs.rhs.rhs.lhs.lhs, %val__id.i287 - %val__id.i233.rhs.rhs.lhs.rhs.rhs = fadd double %val__id.i233.rhs.rhs.lhs.rhs.rhs.lhs, %val__id.i282 - %val__id.i233.rhs.rhs.lhs.rhs = fmul double %coef_load145_offset_load.i, %val__id.i233.rhs.rhs.lhs.rhs.rhs - %val__id.i233.rhs.rhs.lhs = fadd double %val__id.i233.rhs.rhs.lhs.lhs, %val__id.i233.rhs.rhs.lhs.rhs - %val__id.i233.rhs.rhs.rhs.rhs.lhs.lhs.lhs.lhs = fadd double %val__id.i277, %val__id.i272 - %val__id.i233.rhs.rhs.rhs.rhs.lhs.lhs.lhs = fadd double %val__id.i233.rhs.rhs.rhs.rhs.lhs.lhs.lhs.lhs, %val__id.i267 - %val__id.i233.rhs.rhs.rhs.rhs.lhs.lhs = fadd double %val__id.i233.rhs.rhs.rhs.rhs.lhs.lhs.lhs, %val__id.i262 - %val__id.i233.rhs.rhs.rhs.rhs.lhs = fadd double %val__id.i233.rhs.rhs.rhs.rhs.lhs.lhs, %val__id.i257 - %val__id.i233.rhs.rhs.rhs.rhs = fadd double %val__id.i233.rhs.rhs.rhs.rhs.lhs, %val__id.i252 - %val__id.i233.rhs.rhs.rhs = fmul double %coef_load196_offset_load.i, %val__id.i233.rhs.rhs.rhs.rhs - %val__id.i233.rhs.rhs = fadd double %val__id.i233.rhs.rhs.lhs, %val__id.i233.rhs.rhs.rhs - %val__id.i233.rhs = fmul double %val__id.i237, %val__id.i233.rhs.rhs - %val__id.i233 = fadd double %val__id.i233.lhs, %val__id.i233.rhs - store double %val__id.i233, double* %ptr__id.i241, align 8 - %new_counter279.i = add i32 %counter32.4.i386, 32 - %before_aligned_end73.i = icmp slt i32 %new_counter279.i, %aligned_end31.i - br i1 %before_aligned_end73.i, label %foreach_full_body.i, label %partial_inner_all_outer.i - -foreach_test21.i.preheader: ; preds = %foreach_reset19.i, %foreach_test21.i.preheader.lr.ph - %iter_val.i398 = phi <1 x i32> [ %iter_val.i395, %foreach_test21.i.preheader.lr.ph ], [ %iter_val.i, %foreach_reset19.i ] - %counter.0.i397 = phi i32 [ %add_z0_load_mul_calltmp47_, %foreach_test21.i.preheader.lr.ph ], [ %new_counter.i, %foreach_reset19.i ] - %tid.i3.i = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() - %__laneidx47.i = and i32 %tid.i3.i, 31 - %36 = zext i32 %__laneidx47.i to i64 - %arrayidx48.i = getelementptr [32 x i8]* @constDeltaForeach1, i64 0, i64 %36 - br i1 %cmp54.i390, label %outer_not_in_extras.i.preheader.lr.ph, label %foreach_reset19.i - -outer_not_in_extras.i.preheader.lr.ph: ; preds = %foreach_test21.i.preheader - %37 = load i8* %arrayidx48.i, align 1 - %_zext49.i388 = zext i8 %37 to i32 - %38 = insertelement <1 x i32> undef, i32 %_zext49.i388, i32 0 - %iter_val50.i389 = add <1 x i32> %smear_counter_init44.i387, %38 - %mul_z_load297_Nxy_load298_broadcast.i = mul <1 x i32> %iter_val.i398, %Nxy_load298_broadcast_init.i - br label %outer_not_in_extras.i.preheader - -foreach_reset19.i: ; preds = %foreach_reset27.i, %foreach_test21.i.preheader - %new_counter.i = add i32 %counter.0.i397, 1 - %smear_counter_init.i = insertelement <1 x i32> undef, i32 %new_counter.i, i32 0 - %39 = load i8* %arrayidx.i, align 1 - %_zext.i = zext i8 %39 to i32 - %40 = insertelement <1 x i32> undef, i32 %_zext.i, i32 0 - %iter_val.i = add <1 x i32> %smear_counter_init.i, %40 - %exitcond400 = icmp eq i32 %new_counter.i, %14 - br i1 %exitcond400, label %if_then, label %foreach_test21.i.preheader - -outer_not_in_extras.i.preheader: ; preds = %foreach_reset27.i, %outer_not_in_extras.i.preheader.lr.ph - %iter_val50.i392 = phi <1 x i32> [ %iter_val50.i389, %outer_not_in_extras.i.preheader.lr.ph ], [ %iter_val50.i, %foreach_reset27.i ] - %counter25.1.i391 = phi i32 [ %add_y0_load_mul_calltmp41_, %outer_not_in_extras.i.preheader.lr.ph ], [ %new_counter35.i, %foreach_reset27.i ] - br i1 %before_aligned_end73.i385, label %foreach_full_body.i, label %partial_inner_all_outer.i - -foreach_reset27.i: ; preds = %pl_dolane.i, %partial_inner_only.i, %partial_inner_all_outer.i - %new_counter35.i = add i32 %counter25.1.i391, 1 - %smear_counter_init44.i = insertelement <1 x i32> undef, i32 %new_counter35.i, i32 0 - %41 = load i8* %arrayidx48.i, align 1 - %_zext49.i = zext i8 %41 to i32 - %42 = insertelement <1 x i32> undef, i32 %_zext49.i, i32 0 - %iter_val50.i = add <1 x i32> %smear_counter_init44.i, %42 - %exitcond = icmp eq i32 %new_counter35.i, %8 - br i1 %exitcond, label %foreach_reset19.i, label %outer_not_in_extras.i.preheader - -partial_inner_all_outer.i: ; preds = %outer_not_in_extras.i.preheader, %foreach_full_body.i - %counter32.4.i.lcssa = phi i32 [ %add_x0_load_mul_calltmp35_, %outer_not_in_extras.i.preheader ], [ %new_counter279.i, %foreach_full_body.i ] - %before_full_end.i = icmp slt i32 %counter32.4.i.lcssa, %r.i.i - br i1 %before_full_end.i, label %partial_inner_only.i, label %foreach_reset27.i - -partial_inner_only.i: ; preds = %partial_inner_all_outer.i - %smear_counter_init282.i = insertelement <1 x i32> undef, i32 %counter32.4.i.lcssa, i32 0 - %tid.i2.i = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() - %__laneidx285.i = and i32 %tid.i2.i, 31 - %43 = zext i32 %__laneidx285.i to i64 - %arrayidx286.i = getelementptr [32 x i8]* @constDeltaForeach4, i64 0, i64 %43 - %44 = load i8* %arrayidx286.i, align 1 - %_zext287.i = zext i8 %44 to i32 - %45 = insertelement <1 x i32> undef, i32 %_zext287.i, i32 0 - %iter_val288.i = add <1 x i32> %smear_counter_init282.i, %45 - %cmp291.i = icmp slt <1 x i32> %iter_val288.i, %smear_end_init289.i - %mul_y_load299_Nx_load300_broadcast.i = mul <1 x i32> %iter_val50.i392, %Nx_load300_broadcast_init.i - %add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast.i = add <1 x i32> %mul_z_load297_Nxy_load298_broadcast.i, %mul_y_load299_Nx_load300_broadcast.i - %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i = add <1 x i32> %add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast.i, %iter_val288.i - %v.i.i224 = extractelement <1 x i1> %cmp291.i, i32 0 - br i1 %v.i.i224, label %pl_dolane.i, label %foreach_reset27.i - -pl_dolane.i: ; preds = %partial_inner_only.i - %coef_load303_offset_load.i = load double* %coef, align 8 - %.lhs361 = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 - %46 = shl i32 %.lhs361, 3 - %iptr__id.i225.rhs = sext i32 %46 to i64 - %iptr__id.i225 = add i64 %iptr__id.i225.rhs, %Ain_load309_ptr2int.i - %ptr__id.i226 = inttoptr i64 %iptr__id.i225 to double* - %val__id.i227 = load double* %ptr__id.i226, align 8 - %coef_load314_offset_load.i401 = load double* %coef_load314_offset.i, align 8 - %.lhs360.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 - %.lhs360 = shl i32 %.lhs360.lhs, 3 - %47 = add i32 %.lhs360, 8 - %iptr__id.i218.rhs = sext i32 %47 to i64 - %iptr__id.i218 = add i64 %iptr__id.i218.rhs, %Ain_load309_ptr2int.i - %ptr__id.i219 = inttoptr i64 %iptr__id.i218 to double* - %val__id.i220 = load double* %ptr__id.i219, align 8 - %.lhs359.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 - %.lhs359 = shl i32 %.lhs359.lhs, 3 - %48 = add i32 %.lhs359, -8 - %iptr__id.i211.rhs = sext i32 %48 to i64 - %iptr__id.i211 = add i64 %iptr__id.i211.rhs, %Ain_load309_ptr2int.i - %ptr__id.i212 = inttoptr i64 %iptr__id.i211 to double* - %val__id.i213 = load double* %ptr__id.i212, align 8 - %.lhs358.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 - %.lhs358 = add i32 %.lhs358.lhs, %Nx - %49 = shl i32 %.lhs358, 3 - %iptr__id.i204.rhs = sext i32 %49 to i64 - %iptr__id.i204 = add i64 %iptr__id.i204.rhs, %Ain_load309_ptr2int.i - %ptr__id.i205 = inttoptr i64 %iptr__id.i204 to double* - %val__id.i206 = load double* %ptr__id.i205, align 8 - %.lhs357.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 - %.lhs357 = sub i32 %.lhs357.lhs, %Nx - %50 = shl i32 %.lhs357, 3 - %iptr__id.i197.rhs = sext i32 %50 to i64 - %iptr__id.i197 = add i64 %iptr__id.i197.rhs, %Ain_load309_ptr2int.i - %ptr__id.i198 = inttoptr i64 %iptr__id.i197 to double* - %val__id.i199 = load double* %ptr__id.i198, align 8 - %.lhs356.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 - %.lhs356 = add i32 %.lhs356.lhs, %mul_Nx_load_Ny_load.i - %51 = shl i32 %.lhs356, 3 - %iptr__id.i190.rhs = sext i32 %51 to i64 - %iptr__id.i190 = add i64 %iptr__id.i190.rhs, %Ain_load309_ptr2int.i - %ptr__id.i191 = inttoptr i64 %iptr__id.i190 to double* - %val__id.i192 = load double* %ptr__id.i191, align 8 - %.lhs355.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 - %.lhs355 = sub i32 %.lhs355.lhs, %mul_Nx_load_Ny_load.i - %52 = shl i32 %.lhs355, 3 - %iptr__id.i183.rhs = sext i32 %52 to i64 - %iptr__id.i183 = add i64 %iptr__id.i183.rhs, %Ain_load309_ptr2int.i - %ptr__id.i184 = inttoptr i64 %iptr__id.i183 to double* - %val__id.i185 = load double* %ptr__id.i184, align 8 - %coef_load365_offset_load.i457 = load double* %coef_load365_offset.i, align 8 - %.lhs354.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 - %.lhs354 = shl i32 %.lhs354.lhs, 3 - %53 = add i32 %.lhs354, 16 - %iptr__id.i176.rhs = sext i32 %53 to i64 - %iptr__id.i176 = add i64 %iptr__id.i176.rhs, %Ain_load309_ptr2int.i - %ptr__id.i177 = inttoptr i64 %iptr__id.i176 to double* - %val__id.i178 = load double* %ptr__id.i177, align 8 - %.lhs353.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 - %.lhs353 = shl i32 %.lhs353.lhs, 3 - %54 = add i32 %.lhs353, -16 - %iptr__id.i169.rhs = sext i32 %54 to i64 - %iptr__id.i169 = add i64 %iptr__id.i169.rhs, %Ain_load309_ptr2int.i - %ptr__id.i170 = inttoptr i64 %iptr__id.i169 to double* - %val__id.i171 = load double* %ptr__id.i170, align 8 - %.lhs352.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 - %.lhs352 = add i32 %.lhs352.lhs, %mul__Nx_load385.i - %55 = shl i32 %.lhs352, 3 - %iptr__id.i162.rhs = sext i32 %55 to i64 - %iptr__id.i162 = add i64 %iptr__id.i162.rhs, %Ain_load309_ptr2int.i - %ptr__id.i163 = inttoptr i64 %iptr__id.i162 to double* - %val__id.i164 = load double* %ptr__id.i163, align 8 - %.lhs351.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 - %.lhs351 = add i32 %.lhs351.lhs, %mul__Nx_load393.i - %56 = shl i32 %.lhs351, 3 - %iptr__id.i155.rhs = sext i32 %56 to i64 - %iptr__id.i155 = add i64 %iptr__id.i155.rhs, %Ain_load309_ptr2int.i - %ptr__id.i156 = inttoptr i64 %iptr__id.i155 to double* - %val__id.i157 = load double* %ptr__id.i156, align 8 - %.lhs350.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 - %.lhs350 = add i32 %.lhs350.lhs, %mul__Nxy_load402.i - %57 = shl i32 %.lhs350, 3 - %iptr__id.i148.rhs = sext i32 %57 to i64 - %iptr__id.i148 = add i64 %iptr__id.i148.rhs, %Ain_load309_ptr2int.i - %ptr__id.i149 = inttoptr i64 %iptr__id.i148 to double* - %val__id.i150 = load double* %ptr__id.i149, align 8 - %.lhs349.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 - %.lhs349 = add i32 %.lhs349.lhs, %mul__Nxy_load410.i - %58 = shl i32 %.lhs349, 3 - %iptr__id.i141.rhs = sext i32 %58 to i64 - %iptr__id.i141 = add i64 %iptr__id.i141.rhs, %Ain_load309_ptr2int.i - %ptr__id.i142 = inttoptr i64 %iptr__id.i141 to double* - %val__id.i143 = load double* %ptr__id.i142, align 8 - %coef_load416_offset_load.i544 = load double* %coef_load416_offset.i, align 8 - %.lhs348.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 - %.lhs348 = shl i32 %.lhs348.lhs, 3 - %59 = add i32 %.lhs348, 24 - %iptr__id.i134.rhs = sext i32 %59 to i64 - %iptr__id.i134 = add i64 %iptr__id.i134.rhs, %Ain_load309_ptr2int.i - %ptr__id.i135 = inttoptr i64 %iptr__id.i134 to double* - %val__id.i136 = load double* %ptr__id.i135, align 8 - %.lhs347.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 - %.lhs347 = shl i32 %.lhs347.lhs, 3 - %60 = add i32 %.lhs347, -24 - %iptr__id.i127.rhs = sext i32 %60 to i64 - %iptr__id.i127 = add i64 %iptr__id.i127.rhs, %Ain_load309_ptr2int.i - %ptr__id.i128 = inttoptr i64 %iptr__id.i127 to double* - %val__id.i129 = load double* %ptr__id.i128, align 8 - %.lhs346.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 - %.lhs346 = add i32 %.lhs346.lhs, %mul__Nx_load436.i - %61 = shl i32 %.lhs346, 3 - %iptr__id.i120.rhs = sext i32 %61 to i64 - %iptr__id.i120 = add i64 %iptr__id.i120.rhs, %Ain_load309_ptr2int.i - %ptr__id.i121 = inttoptr i64 %iptr__id.i120 to double* - %val__id.i122 = load double* %ptr__id.i121, align 8 - %.lhs345.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 - %.lhs345 = add i32 %.lhs345.lhs, %mul__Nx_load444.i - %62 = shl i32 %.lhs345, 3 - %iptr__id.i113.rhs = sext i32 %62 to i64 - %iptr__id.i113 = add i64 %iptr__id.i113.rhs, %Ain_load309_ptr2int.i - %ptr__id.i114 = inttoptr i64 %iptr__id.i113 to double* - %val__id.i115 = load double* %ptr__id.i114, align 8 - %.lhs344.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 - %.lhs344 = add i32 %.lhs344.lhs, %mul__Nxy_load453.i - %63 = shl i32 %.lhs344, 3 - %iptr__id.i106.rhs = sext i32 %63 to i64 - %iptr__id.i106 = add i64 %iptr__id.i106.rhs, %Ain_load309_ptr2int.i - %ptr__id.i107 = inttoptr i64 %iptr__id.i106 to double* - %val__id.i108 = load double* %ptr__id.i107, align 8 - %.lhs343.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 - %.lhs343 = add i32 %.lhs343.lhs, %mul__Nxy_load461.i - %64 = shl i32 %.lhs343, 3 - %iptr__id.i99.rhs = sext i32 %64 to i64 - %iptr__id.i99 = add i64 %iptr__id.i99.rhs, %Ain_load309_ptr2int.i - %ptr__id.i100 = inttoptr i64 %iptr__id.i99 to double* - %val__id.i101 = load double* %ptr__id.i100, align 8 - %.lhs342 = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 - %65 = shl i32 %.lhs342, 3 - %iptr__id.i92.rhs = sext i32 %65 to i64 - %iptr__id.i92 = add i64 %iptr__id.i92.rhs, %Ain_load309_ptr2int.i - %ptr__id.i93 = inttoptr i64 %iptr__id.i92 to double* - %val__id.i94 = load double* %ptr__id.i93, align 8 - %.lhs341 = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 - %66 = shl i32 %.lhs341, 3 - %iptr__id.i85.rhs = sext i32 %66 to i64 - %iptr__id.i85 = add i64 %iptr__id.i85.rhs, %Aout_load470_ptr2int.i - %ptr__id.i86 = inttoptr i64 %iptr__id.i85 to double* - %val__id.i87 = load double* %ptr__id.i86, align 8 - %.lhs340 = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 - %67 = shl i32 %.lhs340, 3 - %iptr__id.i80.rhs = sext i32 %67 to i64 - %iptr__id.i80 = add i64 %iptr__id.i80.rhs, %vsq_load488_ptr2int.i - %ptr__id.i81 = inttoptr i64 %iptr__id.i80 to double* - %val__id.i82 = load double* %ptr__id.i81, align 8 - %.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 - %68 = shl i32 %.lhs, 3 - %iptr__id.i76.rhs = sext i32 %68 to i64 - %iptr__id.i76 = add i64 %iptr__id.i76.rhs, %Aout_load470_ptr2int.i - %ptr__id.i77 = inttoptr i64 %iptr__id.i76 to double* - %val__id.i78.lhs.lhs = fmul double %val__id.i94, 2.000000e+00 - %val__id.i78.lhs = fsub double %val__id.i78.lhs.lhs, %val__id.i87 - %val__id.i78.rhs.rhs.lhs.lhs.lhs = fmul double %coef_load303_offset_load.i, %val__id.i227 - %val__id.i78.rhs.rhs.lhs.lhs.rhs.rhs.lhs.lhs.lhs.lhs = fadd double %val__id.i220, %val__id.i213 - %val__id.i78.rhs.rhs.lhs.lhs.rhs.rhs.lhs.lhs.lhs = fadd double %val__id.i78.rhs.rhs.lhs.lhs.rhs.rhs.lhs.lhs.lhs.lhs, %val__id.i206 - %val__id.i78.rhs.rhs.lhs.lhs.rhs.rhs.lhs.lhs = fadd double %val__id.i78.rhs.rhs.lhs.lhs.rhs.rhs.lhs.lhs.lhs, %val__id.i199 - %val__id.i78.rhs.rhs.lhs.lhs.rhs.rhs.lhs = fadd double %val__id.i78.rhs.rhs.lhs.lhs.rhs.rhs.lhs.lhs, %val__id.i192 - %val__id.i78.rhs.rhs.lhs.lhs.rhs.rhs = fadd double %val__id.i78.rhs.rhs.lhs.lhs.rhs.rhs.lhs, %val__id.i185 - %val__id.i78.rhs.rhs.lhs.lhs.rhs = fmul double %coef_load314_offset_load.i401, %val__id.i78.rhs.rhs.lhs.lhs.rhs.rhs - %val__id.i78.rhs.rhs.lhs.lhs = fadd double %val__id.i78.rhs.rhs.lhs.lhs.lhs, %val__id.i78.rhs.rhs.lhs.lhs.rhs - %val__id.i78.rhs.rhs.lhs.rhs.rhs.lhs.lhs.lhs.lhs = fadd double %val__id.i178, %val__id.i171 - %val__id.i78.rhs.rhs.lhs.rhs.rhs.lhs.lhs.lhs = fadd double %val__id.i78.rhs.rhs.lhs.rhs.rhs.lhs.lhs.lhs.lhs, %val__id.i164 - %val__id.i78.rhs.rhs.lhs.rhs.rhs.lhs.lhs = fadd double %val__id.i78.rhs.rhs.lhs.rhs.rhs.lhs.lhs.lhs, %val__id.i157 - %val__id.i78.rhs.rhs.lhs.rhs.rhs.lhs = fadd double %val__id.i78.rhs.rhs.lhs.rhs.rhs.lhs.lhs, %val__id.i150 - %val__id.i78.rhs.rhs.lhs.rhs.rhs = fadd double %val__id.i78.rhs.rhs.lhs.rhs.rhs.lhs, %val__id.i143 - %val__id.i78.rhs.rhs.lhs.rhs = fmul double %coef_load365_offset_load.i457, %val__id.i78.rhs.rhs.lhs.rhs.rhs - %val__id.i78.rhs.rhs.lhs = fadd double %val__id.i78.rhs.rhs.lhs.lhs, %val__id.i78.rhs.rhs.lhs.rhs - %val__id.i78.rhs.rhs.rhs.rhs.lhs.lhs.lhs.lhs = fadd double %val__id.i136, %val__id.i129 - %val__id.i78.rhs.rhs.rhs.rhs.lhs.lhs.lhs = fadd double %val__id.i78.rhs.rhs.rhs.rhs.lhs.lhs.lhs.lhs, %val__id.i122 - %val__id.i78.rhs.rhs.rhs.rhs.lhs.lhs = fadd double %val__id.i78.rhs.rhs.rhs.rhs.lhs.lhs.lhs, %val__id.i115 - %val__id.i78.rhs.rhs.rhs.rhs.lhs = fadd double %val__id.i78.rhs.rhs.rhs.rhs.lhs.lhs, %val__id.i108 - %val__id.i78.rhs.rhs.rhs.rhs = fadd double %val__id.i78.rhs.rhs.rhs.rhs.lhs, %val__id.i101 - %val__id.i78.rhs.rhs.rhs = fmul double %coef_load416_offset_load.i544, %val__id.i78.rhs.rhs.rhs.rhs - %val__id.i78.rhs.rhs = fadd double %val__id.i78.rhs.rhs.lhs, %val__id.i78.rhs.rhs.rhs - %val__id.i78.rhs = fmul double %val__id.i78.rhs.rhs, %val__id.i82 - %val__id.i78 = fadd double %val__id.i78.lhs, %val__id.i78.rhs - store double %val__id.i78, double* %ptr__id.i77, align 8 - br label %foreach_reset27.i -} - -define void @loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E_(i32 %t0, i32 %t1, i32 %x0, i32 %x1, i32 %y0, i32 %y1, i32 %z0, i32 %z1, i32 %Nx, i32 %Ny, i32 %Nz, double* %coef, double* %vsq, double* %Aeven, double* %Aodd, <1 x i1> %__mask) { -allocas: - %less_t_load_t1_load94 = icmp slt i32 %t0, %t1 - br i1 %less_t_load_t1_load94, label %for_loop.lr.ph, label %for_exit - -for_loop.lr.ph: ; preds = %allocas - %add_sub_x1_load21_x0_load22_ = sub i32 31, %x0 - %sub_add_sub_x1_load21_x0_load22__ = add i32 %add_sub_x1_load21_x0_load22_, %x1 - %div_sub_add_sub_x1_load21_x0_load22___ = sdiv i32 %sub_add_sub_x1_load21_x0_load22__, 32 - %add_sub_y1_load23_y0_load24_ = sub i32 7, %y0 - %sub_add_sub_y1_load23_y0_load24__ = add i32 %add_sub_y1_load23_y0_load24_, %y1 - %div_sub_add_sub_y1_load23_y0_load24___ = sdiv i32 %sub_add_sub_y1_load23_y0_load24__, 8 - %add_sub_z1_load25_z0_load26_ = sub i32 7, %z0 - %sub_add_sub_z1_load25_z0_load26__ = add i32 %add_sub_z1_load25_z0_load26_, %z1 - %div_sub_add_sub_z1_load25_z0_load26___ = sdiv i32 %sub_add_sub_z1_load25_z0_load26__, 8 - %ntxm1.i = add nsw i32 %div_sub_add_sub_x1_load21_x0_load22___, -1 - %ntxm1d4.i = ashr i32 %ntxm1.i, 2 - %nbx.i = add nsw i32 %ntxm1d4.i, 1 - br label %for_loop - -for_loop: ; preds = %if_exit, %for_loop.lr.ph - %t.095 = phi i32 [ %t0, %for_loop.lr.ph ], [ %t_load78_plus1, %if_exit ] - %bitop = and i32 %t.095, 1 - %equal_bitop_ = icmp eq i32 %bitop, 0 - %tid.i.i = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() - %and.i = and i32 %tid.i.i, 31 - %cmp.i = icmp eq i32 %and.i, 0 - br i1 %cmp.i, label %if.then.i, label %ISPCGetParamBuffer.exit - -if.then.i: ; preds = %for_loop - %ptri64tmp.i = tail call i64 @cudaGetParameterBuffer(i64 8, i64 72) - %phitmp.i = inttoptr i64 %ptri64tmp.i to i8* - br label %ISPCGetParamBuffer.exit - -ISPCGetParamBuffer.exit: ; preds = %if.then.i, %for_loop - %ptri64.i = phi i8* [ %phitmp.i, %if.then.i ], [ null, %for_loop ] - %cmp1 = icmp eq i8* %ptri64.i, null - br i1 %equal_bitop_, label %if_then, label %if_else - -for_exit: ; preds = %if_exit, %allocas - %0 = tail call i32 @cudaDeviceSynchronize() - ret void - -if_then: ; preds = %ISPCGetParamBuffer.exit - br i1 %cmp1, label %if_false, label %if_true - -if_else: ; preds = %ISPCGetParamBuffer.exit - br i1 %cmp1, label %if_false62, label %if_true61 - -if_exit: ; preds = %if.then.i92, %if_false62, %if.then.i83, %if_false - %1 = tail call i32 @cudaDeviceSynchronize() - %t_load78_plus1 = add i32 %t.095, 1 - %exitcond = icmp eq i32 %t_load78_plus1, %t1 - br i1 %exitcond, label %for_exit, label %for_loop - -if_true: ; preds = %if_then - %funarg = bitcast i8* %ptri64.i to i32* - store i32 %x0, i32* %funarg, align 4 - %funarg27 = getelementptr i8* %ptri64.i, i64 4 - %2 = bitcast i8* %funarg27 to i32* - store i32 %x1, i32* %2, align 4 - %funarg28 = getelementptr i8* %ptri64.i, i64 8 - %3 = bitcast i8* %funarg28 to i32* - store i32 %y0, i32* %3, align 4 - %funarg29 = getelementptr i8* %ptri64.i, i64 12 - %4 = bitcast i8* %funarg29 to i32* - store i32 %y1, i32* %4, align 4 - %funarg30 = getelementptr i8* %ptri64.i, i64 16 - %5 = bitcast i8* %funarg30 to i32* - store i32 %z0, i32* %5, align 4 - %funarg31 = getelementptr i8* %ptri64.i, i64 20 - %6 = bitcast i8* %funarg31 to i32* - store i32 %z1, i32* %6, align 4 - %funarg32 = getelementptr i8* %ptri64.i, i64 24 - %7 = bitcast i8* %funarg32 to i32* - store i32 %Nx, i32* %7, align 4 - %funarg33 = getelementptr i8* %ptri64.i, i64 28 - %8 = bitcast i8* %funarg33 to i32* - store i32 %Ny, i32* %8, align 4 - %funarg34 = getelementptr i8* %ptri64.i, i64 32 - %9 = bitcast i8* %funarg34 to i32* - store i32 %Nz, i32* %9, align 4 - %funarg35 = getelementptr i8* %ptri64.i, i64 40 - %10 = bitcast i8* %funarg35 to double** - store double* %coef, double** %10, align 8 - %funarg36 = getelementptr i8* %ptri64.i, i64 48 - %11 = bitcast i8* %funarg36 to double** - store double* %vsq, double** %11, align 8 - %funarg37 = getelementptr i8* %ptri64.i, i64 56 - %12 = bitcast i8* %funarg37 to double** - store double* %Aeven, double** %12, align 8 - %funarg38 = getelementptr i8* %ptri64.i, i64 64 - %13 = bitcast i8* %funarg38 to double** - store double* %Aodd, double** %13, align 8 - br label %if_false - -if_false: ; preds = %if_true, %if_then - %tid.i.i80 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() - %and.i81 = and i32 %tid.i.i80, 31 - %cmp.i82 = icmp eq i32 %and.i81, 0 - br i1 %cmp.i82, label %if.then.i83, label %if_exit - -if.then.i83: ; preds = %if_false - %args_i64.i = ptrtoint i8* %ptri64.i to i64 - %res_tmp.i = tail call i32 asm sideeffect "{\0A .param .b64 param0;\0A st.param.b64\09[param0+0], $1;\0A .param .b64 param1;\0A st.param.b64\09[param1+0], $2;\0A .param .align 4 .b8 param2[12];\0A st.param.b32\09[param2+0], $3; \0A st.param.b32\09[param2+4], $4; \0A st.param.b32\09[param2+8], $5; \0A .param .align 4 .b8 param3[12];\0A st.param.b32\09[param3+0], $6; \0A st.param.b32\09[param3+4], $7; \0A st.param.b32\09[param3+8], $8; \0A .param .b32 param4;\0A st.param.b32\09[param4+0], $9; \0A .param .b64 param5;\0A st.param.b64\09[param5+0], $10; \0A\0A .param .b32 retval0;\0A call.uni (retval0), \0A cudaLaunchDevice,\0A (\0A param0, \0A param1, \0A param2, \0A param3, \0A param4, \0A param5\0A );\0A ld.param.b32\09$0, [retval0+0];\0A }\0A ", "=r, l,l, r,r,r, r,r,r, r,l"(i64 ptrtoint (void (i32, i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*)* @stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_ to i64), i64 %args_i64.i, i32 %nbx.i, i32 %div_sub_add_sub_y1_load23_y0_load24___, i32 %div_sub_add_sub_z1_load25_z0_load26___, i32 128, i32 1, i32 1, i32 0, i64 0) - br label %if_exit - -if_true61: ; preds = %if_else - %funarg64 = bitcast i8* %ptri64.i to i32* - store i32 %x0, i32* %funarg64, align 4 - %funarg65 = getelementptr i8* %ptri64.i, i64 4 - %14 = bitcast i8* %funarg65 to i32* - store i32 %x1, i32* %14, align 4 - %funarg66 = getelementptr i8* %ptri64.i, i64 8 - %15 = bitcast i8* %funarg66 to i32* - store i32 %y0, i32* %15, align 4 - %funarg67 = getelementptr i8* %ptri64.i, i64 12 - %16 = bitcast i8* %funarg67 to i32* - store i32 %y1, i32* %16, align 4 - %funarg68 = getelementptr i8* %ptri64.i, i64 16 - %17 = bitcast i8* %funarg68 to i32* - store i32 %z0, i32* %17, align 4 - %funarg69 = getelementptr i8* %ptri64.i, i64 20 - %18 = bitcast i8* %funarg69 to i32* - store i32 %z1, i32* %18, align 4 - %funarg70 = getelementptr i8* %ptri64.i, i64 24 - %19 = bitcast i8* %funarg70 to i32* - store i32 %Nx, i32* %19, align 4 - %funarg71 = getelementptr i8* %ptri64.i, i64 28 - %20 = bitcast i8* %funarg71 to i32* - store i32 %Ny, i32* %20, align 4 - %funarg72 = getelementptr i8* %ptri64.i, i64 32 - %21 = bitcast i8* %funarg72 to i32* - store i32 %Nz, i32* %21, align 4 - %funarg73 = getelementptr i8* %ptri64.i, i64 40 - %22 = bitcast i8* %funarg73 to double** - store double* %coef, double** %22, align 8 - %funarg74 = getelementptr i8* %ptri64.i, i64 48 - %23 = bitcast i8* %funarg74 to double** - store double* %vsq, double** %23, align 8 - %funarg75 = getelementptr i8* %ptri64.i, i64 56 - %24 = bitcast i8* %funarg75 to double** - store double* %Aodd, double** %24, align 8 - %funarg76 = getelementptr i8* %ptri64.i, i64 64 - %25 = bitcast i8* %funarg76 to double** - store double* %Aeven, double** %25, align 8 - br label %if_false62 - -if_false62: ; preds = %if_true61, %if_else - %tid.i.i84 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() - %and.i85 = and i32 %tid.i.i84, 31 - %cmp.i86 = icmp eq i32 %and.i85, 0 - br i1 %cmp.i86, label %if.then.i92, label %if_exit - -if.then.i92: ; preds = %if_false62 - %args_i64.i90 = ptrtoint i8* %ptri64.i to i64 - %res_tmp.i91 = tail call i32 asm sideeffect "{\0A .param .b64 param0;\0A st.param.b64\09[param0+0], $1;\0A .param .b64 param1;\0A st.param.b64\09[param1+0], $2;\0A .param .align 4 .b8 param2[12];\0A st.param.b32\09[param2+0], $3; \0A st.param.b32\09[param2+4], $4; \0A st.param.b32\09[param2+8], $5; \0A .param .align 4 .b8 param3[12];\0A st.param.b32\09[param3+0], $6; \0A st.param.b32\09[param3+4], $7; \0A st.param.b32\09[param3+8], $8; \0A .param .b32 param4;\0A st.param.b32\09[param4+0], $9; \0A .param .b64 param5;\0A st.param.b64\09[param5+0], $10; \0A\0A .param .b32 retval0;\0A call.uni (retval0), \0A cudaLaunchDevice,\0A (\0A param0, \0A param1, \0A param2, \0A param3, \0A param4, \0A param5\0A );\0A ld.param.b32\09$0, [retval0+0];\0A }\0A ", "=r, l,l, r,r,r, r,r,r, r,l"(i64 ptrtoint (void (i32, i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*)* @stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_ to i64), i64 %args_i64.i90, i32 %nbx.i, i32 %div_sub_add_sub_y1_load23_y0_load24___, i32 %div_sub_add_sub_z1_load25_z0_load26___, i32 128, i32 1, i32 1, i32 0, i64 0) - br label %if_exit -} - -define void @loop_stencil_ispc_tasks(i32 %t0, i32 %t1, i32 %x0, i32 %x1, i32 %y0, i32 %y1, i32 %z0, i32 %z1, i32 %Nx, i32 %Ny, i32 %Nz, double* %coef, double* %vsq, double* %Aeven, double* %Aodd) { -allocas: - %less_t_load_t1_load94 = icmp slt i32 %t0, %t1 - br i1 %less_t_load_t1_load94, label %for_loop.lr.ph, label %for_exit - -for_loop.lr.ph: ; preds = %allocas - %add_sub_x1_load21_x0_load22_ = sub i32 31, %x0 - %sub_add_sub_x1_load21_x0_load22__ = add i32 %add_sub_x1_load21_x0_load22_, %x1 - %div_sub_add_sub_x1_load21_x0_load22___ = sdiv i32 %sub_add_sub_x1_load21_x0_load22__, 32 - %add_sub_y1_load23_y0_load24_ = sub i32 7, %y0 - %sub_add_sub_y1_load23_y0_load24__ = add i32 %add_sub_y1_load23_y0_load24_, %y1 - %div_sub_add_sub_y1_load23_y0_load24___ = sdiv i32 %sub_add_sub_y1_load23_y0_load24__, 8 - %add_sub_z1_load25_z0_load26_ = sub i32 7, %z0 - %sub_add_sub_z1_load25_z0_load26__ = add i32 %add_sub_z1_load25_z0_load26_, %z1 - %div_sub_add_sub_z1_load25_z0_load26___ = sdiv i32 %sub_add_sub_z1_load25_z0_load26__, 8 - %ntxm1.i = add nsw i32 %div_sub_add_sub_x1_load21_x0_load22___, -1 - %ntxm1d4.i = ashr i32 %ntxm1.i, 2 - %nbx.i = add nsw i32 %ntxm1d4.i, 1 - br label %for_loop - -for_loop: ; preds = %if_exit, %for_loop.lr.ph - %t.095 = phi i32 [ %t0, %for_loop.lr.ph ], [ %t_load78_plus1, %if_exit ] - %bitop = and i32 %t.095, 1 - %equal_bitop_ = icmp eq i32 %bitop, 0 - %tid.i.i = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() - %and.i = and i32 %tid.i.i, 31 - %cmp.i = icmp eq i32 %and.i, 0 - br i1 %cmp.i, label %if.then.i, label %ISPCGetParamBuffer.exit - -if.then.i: ; preds = %for_loop - %ptri64tmp.i = tail call i64 @cudaGetParameterBuffer(i64 8, i64 72) - %phitmp.i = inttoptr i64 %ptri64tmp.i to i8* - br label %ISPCGetParamBuffer.exit - -ISPCGetParamBuffer.exit: ; preds = %if.then.i, %for_loop - %ptri64.i = phi i8* [ %phitmp.i, %if.then.i ], [ null, %for_loop ] - %cmp1 = icmp eq i8* %ptri64.i, null - br i1 %equal_bitop_, label %if_then, label %if_else - -for_exit: ; preds = %if_exit, %allocas - %0 = tail call i32 @cudaDeviceSynchronize() - ret void - -if_then: ; preds = %ISPCGetParamBuffer.exit - br i1 %cmp1, label %if_false, label %if_true - -if_else: ; preds = %ISPCGetParamBuffer.exit - br i1 %cmp1, label %if_false62, label %if_true61 - -if_exit: ; preds = %if.then.i92, %if_false62, %if.then.i83, %if_false - %1 = tail call i32 @cudaDeviceSynchronize() - %t_load78_plus1 = add i32 %t.095, 1 - %exitcond = icmp eq i32 %t_load78_plus1, %t1 - br i1 %exitcond, label %for_exit, label %for_loop - -if_true: ; preds = %if_then - %funarg = bitcast i8* %ptri64.i to i32* - store i32 %x0, i32* %funarg, align 4 - %funarg27 = getelementptr i8* %ptri64.i, i64 4 - %2 = bitcast i8* %funarg27 to i32* - store i32 %x1, i32* %2, align 4 - %funarg28 = getelementptr i8* %ptri64.i, i64 8 - %3 = bitcast i8* %funarg28 to i32* - store i32 %y0, i32* %3, align 4 - %funarg29 = getelementptr i8* %ptri64.i, i64 12 - %4 = bitcast i8* %funarg29 to i32* - store i32 %y1, i32* %4, align 4 - %funarg30 = getelementptr i8* %ptri64.i, i64 16 - %5 = bitcast i8* %funarg30 to i32* - store i32 %z0, i32* %5, align 4 - %funarg31 = getelementptr i8* %ptri64.i, i64 20 - %6 = bitcast i8* %funarg31 to i32* - store i32 %z1, i32* %6, align 4 - %funarg32 = getelementptr i8* %ptri64.i, i64 24 - %7 = bitcast i8* %funarg32 to i32* - store i32 %Nx, i32* %7, align 4 - %funarg33 = getelementptr i8* %ptri64.i, i64 28 - %8 = bitcast i8* %funarg33 to i32* - store i32 %Ny, i32* %8, align 4 - %funarg34 = getelementptr i8* %ptri64.i, i64 32 - %9 = bitcast i8* %funarg34 to i32* - store i32 %Nz, i32* %9, align 4 - %funarg35 = getelementptr i8* %ptri64.i, i64 40 - %10 = bitcast i8* %funarg35 to double** - store double* %coef, double** %10, align 8 - %funarg36 = getelementptr i8* %ptri64.i, i64 48 - %11 = bitcast i8* %funarg36 to double** - store double* %vsq, double** %11, align 8 - %funarg37 = getelementptr i8* %ptri64.i, i64 56 - %12 = bitcast i8* %funarg37 to double** - store double* %Aeven, double** %12, align 8 - %funarg38 = getelementptr i8* %ptri64.i, i64 64 - %13 = bitcast i8* %funarg38 to double** - store double* %Aodd, double** %13, align 8 - br label %if_false - -if_false: ; preds = %if_true, %if_then - %tid.i.i80 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() - %and.i81 = and i32 %tid.i.i80, 31 - %cmp.i82 = icmp eq i32 %and.i81, 0 - br i1 %cmp.i82, label %if.then.i83, label %if_exit - -if.then.i83: ; preds = %if_false - %args_i64.i = ptrtoint i8* %ptri64.i to i64 - %res_tmp.i = tail call i32 asm sideeffect "{\0A .param .b64 param0;\0A st.param.b64\09[param0+0], $1;\0A .param .b64 param1;\0A st.param.b64\09[param1+0], $2;\0A .param .align 4 .b8 param2[12];\0A st.param.b32\09[param2+0], $3; \0A st.param.b32\09[param2+4], $4; \0A st.param.b32\09[param2+8], $5; \0A .param .align 4 .b8 param3[12];\0A st.param.b32\09[param3+0], $6; \0A st.param.b32\09[param3+4], $7; \0A st.param.b32\09[param3+8], $8; \0A .param .b32 param4;\0A st.param.b32\09[param4+0], $9; \0A .param .b64 param5;\0A st.param.b64\09[param5+0], $10; \0A\0A .param .b32 retval0;\0A call.uni (retval0), \0A cudaLaunchDevice,\0A (\0A param0, \0A param1, \0A param2, \0A param3, \0A param4, \0A param5\0A );\0A ld.param.b32\09$0, [retval0+0];\0A }\0A ", "=r, l,l, r,r,r, r,r,r, r,l"(i64 ptrtoint (void (i32, i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*)* @stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_ to i64), i64 %args_i64.i, i32 %nbx.i, i32 %div_sub_add_sub_y1_load23_y0_load24___, i32 %div_sub_add_sub_z1_load25_z0_load26___, i32 128, i32 1, i32 1, i32 0, i64 0) - br label %if_exit - -if_true61: ; preds = %if_else - %funarg64 = bitcast i8* %ptri64.i to i32* - store i32 %x0, i32* %funarg64, align 4 - %funarg65 = getelementptr i8* %ptri64.i, i64 4 - %14 = bitcast i8* %funarg65 to i32* - store i32 %x1, i32* %14, align 4 - %funarg66 = getelementptr i8* %ptri64.i, i64 8 - %15 = bitcast i8* %funarg66 to i32* - store i32 %y0, i32* %15, align 4 - %funarg67 = getelementptr i8* %ptri64.i, i64 12 - %16 = bitcast i8* %funarg67 to i32* - store i32 %y1, i32* %16, align 4 - %funarg68 = getelementptr i8* %ptri64.i, i64 16 - %17 = bitcast i8* %funarg68 to i32* - store i32 %z0, i32* %17, align 4 - %funarg69 = getelementptr i8* %ptri64.i, i64 20 - %18 = bitcast i8* %funarg69 to i32* - store i32 %z1, i32* %18, align 4 - %funarg70 = getelementptr i8* %ptri64.i, i64 24 - %19 = bitcast i8* %funarg70 to i32* - store i32 %Nx, i32* %19, align 4 - %funarg71 = getelementptr i8* %ptri64.i, i64 28 - %20 = bitcast i8* %funarg71 to i32* - store i32 %Ny, i32* %20, align 4 - %funarg72 = getelementptr i8* %ptri64.i, i64 32 - %21 = bitcast i8* %funarg72 to i32* - store i32 %Nz, i32* %21, align 4 - %funarg73 = getelementptr i8* %ptri64.i, i64 40 - %22 = bitcast i8* %funarg73 to double** - store double* %coef, double** %22, align 8 - %funarg74 = getelementptr i8* %ptri64.i, i64 48 - %23 = bitcast i8* %funarg74 to double** - store double* %vsq, double** %23, align 8 - %funarg75 = getelementptr i8* %ptri64.i, i64 56 - %24 = bitcast i8* %funarg75 to double** - store double* %Aodd, double** %24, align 8 - %funarg76 = getelementptr i8* %ptri64.i, i64 64 - %25 = bitcast i8* %funarg76 to double** - store double* %Aeven, double** %25, align 8 - br label %if_false62 - -if_false62: ; preds = %if_true61, %if_else - %tid.i.i84 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() - %and.i85 = and i32 %tid.i.i84, 31 - %cmp.i86 = icmp eq i32 %and.i85, 0 - br i1 %cmp.i86, label %if.then.i92, label %if_exit - -if.then.i92: ; preds = %if_false62 - %args_i64.i90 = ptrtoint i8* %ptri64.i to i64 - %res_tmp.i91 = tail call i32 asm sideeffect "{\0A .param .b64 param0;\0A st.param.b64\09[param0+0], $1;\0A .param .b64 param1;\0A st.param.b64\09[param1+0], $2;\0A .param .align 4 .b8 param2[12];\0A st.param.b32\09[param2+0], $3; \0A st.param.b32\09[param2+4], $4; \0A st.param.b32\09[param2+8], $5; \0A .param .align 4 .b8 param3[12];\0A st.param.b32\09[param3+0], $6; \0A st.param.b32\09[param3+4], $7; \0A st.param.b32\09[param3+8], $8; \0A .param .b32 param4;\0A st.param.b32\09[param4+0], $9; \0A .param .b64 param5;\0A st.param.b64\09[param5+0], $10; \0A\0A .param .b32 retval0;\0A call.uni (retval0), \0A cudaLaunchDevice,\0A (\0A param0, \0A param1, \0A param2, \0A param3, \0A param4, \0A param5\0A );\0A ld.param.b32\09$0, [retval0+0];\0A }\0A ", "=r, l,l, r,r,r, r,r,r, r,l"(i64 ptrtoint (void (i32, i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*)* @stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_ to i64), i64 %args_i64.i90, i32 %nbx.i, i32 %div_sub_add_sub_y1_load23_y0_load24___, i32 %div_sub_add_sub_z1_load25_z0_load26___, i32 128, i32 1, i32 1, i32 0, i64 0) - br label %if_exit -} - -!llvm.ident = !{!0} -!nvvm.annotations = !{!1, !2} - -!0 = metadata !{metadata !"clang version 3.4 (trunk 194723)"} -!1 = metadata !{void (i32, i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*)* @stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_, metadata !"kernel", i32 1} -!2 = metadata !{void (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*)* @loop_stencil_ispc_tasks, metadata !"kernel", i32 1} diff --git a/examples_cuda/stencil/stencil_ispc_nvptx64.ptx b/examples_cuda/stencil/stencil_ispc_nvptx64.ptx deleted file mode 100644 index b0339cbf..00000000 --- a/examples_cuda/stencil/stencil_ispc_nvptx64.ptx +++ /dev/null @@ -1,1246 +0,0 @@ -// -// Generated by NVIDIA NVVM Compiler -// Compiler built on Thu Jul 18 02:37:37 2013 (1374107857) -// Cuda compilation tools, release 5.5, V5.5.0 -// - -.version 3.2 -.target sm_35 -.address_size 64 - - -.extern .func (.param .b32 func_retval0) cudaLaunchDevice -( - .param .b64 cudaLaunchDevice_param_0, - .param .b64 cudaLaunchDevice_param_1, - .param .align 4 .b8 cudaLaunchDevice_param_2[12], - .param .align 4 .b8 cudaLaunchDevice_param_3[12], - .param .b32 cudaLaunchDevice_param_4, - .param .b64 cudaLaunchDevice_param_5 -); - - -.extern .func (.param .b64 func_retval0) cudaGetParameterBuffer -( - .param .b64 cudaGetParameterBuffer_param_0, - .param .b64 cudaGetParameterBuffer_param_1 -) -; -.extern .func (.param .b32 func_retval0) cudaDeviceSynchronize -( - -) -; -.global .align 1 .b8 constDeltaForeach1[32]; -.global .align 1 .b8 constDeltaForeach4[32] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}; - -.visible .func (.param .b32 func_retval0) __shfl_i32( - .param .b32 __shfl_i32_param_0, - .param .b32 __shfl_i32_param_1 -) -{ - .reg .s32 %r<4>; - - - ld.param.u32 %r2, [__shfl_i32_param_0]; - ld.param.u32 %r3, [__shfl_i32_param_1]; - // inline asm - shfl.idx.b32 %r1, %r2, %r3, 0x1f; - // inline asm - st.param.b32 [func_retval0+0], %r1; - ret; -} - -.visible .func (.param .b32 func_retval0) __shfl_xor_float( - .param .b32 __shfl_xor_float_param_0, - .param .b32 __shfl_xor_float_param_1 -) -{ - .reg .s32 %r<2>; - .reg .f32 %f<3>; - - - ld.param.f32 %f2, [__shfl_xor_float_param_0]; - ld.param.u32 %r1, [__shfl_xor_float_param_1]; - // inline asm - shfl.bfly.b32 %f1, %f2, %r1, 0x1f; - // inline asm - st.param.f32 [func_retval0+0], %f1; - ret; -} - -.visible .func (.param .b32 func_retval0) __shfl_xor_i32( - .param .b32 __shfl_xor_i32_param_0, - .param .b32 __shfl_xor_i32_param_1 -) -{ - .reg .s32 %r<4>; - - - ld.param.u32 %r2, [__shfl_xor_i32_param_0]; - ld.param.u32 %r3, [__shfl_xor_i32_param_1]; - // inline asm - shfl.bfly.b32 %r1, %r2, %r3, 0x1f; - // inline asm - st.param.b32 [func_retval0+0], %r1; - ret; -} - -.visible .func (.param .b32 func_retval0) __fminf( - .param .b32 __fminf_param_0, - .param .b32 __fminf_param_1 -) -{ - .reg .f32 %f<4>; - - - ld.param.f32 %f2, [__fminf_param_0]; - ld.param.f32 %f3, [__fminf_param_1]; - // inline asm - min.f32 %f1, %f2, %f3; - // inline asm - st.param.f32 [func_retval0+0], %f1; - ret; -} - -.visible .func (.param .b32 func_retval0) __fmaxf( - .param .b32 __fmaxf_param_0, - .param .b32 __fmaxf_param_1 -) -{ - .reg .f32 %f<4>; - - - ld.param.f32 %f2, [__fmaxf_param_0]; - ld.param.f32 %f3, [__fmaxf_param_1]; - // inline asm - max.f32 %f1, %f2, %f3; - // inline asm - st.param.f32 [func_retval0+0], %f1; - ret; -} - -.visible .func (.param .b32 func_retval0) __ballot( - .param .b32 __ballot_param_0 -) -{ - .reg .s32 %r<3>; - - - ld.param.u8 %r2, [__ballot_param_0]; - // inline asm - { .reg .pred %p1; - setp.ne.u32 %p1, %r2, 0; - vote.ballot.b32 %r1, %p1; - } - // inline asm - st.param.b32 [func_retval0+0], %r1; - ret; -} - -.visible .func (.param .b32 func_retval0) __lanemask_lt( - -) -{ - .reg .s32 %r<2>; - - - // inline asm - mov.u32 %r1, %lanemask_lt; - // inline asm - st.param.b32 [func_retval0+0], %r1; - ret; -} - -.visible .func (.param .b64 func_retval0) ISPCAlloc( - .param .b64 ISPCAlloc_param_0, - .param .b64 ISPCAlloc_param_1, - .param .b32 ISPCAlloc_param_2 -) -{ - .reg .s64 %rd<2>; - - - mov.u64 %rd1, 1; - st.param.b64 [func_retval0+0], %rd1; - ret; -} - -.visible .func (.param .b64 func_retval0) ISPCGetParamBuffer( - .param .b64 ISPCGetParamBuffer_param_0, - .param .b64 ISPCGetParamBuffer_param_1, - .param .b64 ISPCGetParamBuffer_param_2 -) -{ - .reg .pred %p<2>; - .reg .s32 %r<3>; - .reg .s64 %rd<7>; - - - ld.param.u64 %rd3, [ISPCGetParamBuffer_param_1]; - ld.param.u64 %rd4, [ISPCGetParamBuffer_param_2]; - mov.u32 %r1, %tid.x; - and.b32 %r2, %r1, 31; - setp.ne.s32 %p1, %r2, 0; - mov.u64 %rd6, 0; - @%p1 bra BB8_2; - - // Callseq Start 0 - { - .reg .b32 temp_param_reg; - .param .b64 param0; - st.param.b64 [param0+0], %rd3; - .param .b64 param1; - st.param.b64 [param1+0], %rd4; - .param .b64 retval0; - call.uni (retval0), - cudaGetParameterBuffer, - ( - param0, - param1 - ); - ld.param.b64 %rd6, [retval0+0]; - } - // Callseq End 0 - -BB8_2: - st.param.b64 [func_retval0+0], %rd6; - ret; -} - -.visible .func ISPCLaunch( - .param .b64 ISPCLaunch_param_0, - .param .b64 ISPCLaunch_param_1, - .param .b64 ISPCLaunch_param_2, - .param .b32 ISPCLaunch_param_3, - .param .b32 ISPCLaunch_param_4, - .param .b32 ISPCLaunch_param_5 -) -{ - .reg .pred %p<2>; - .reg .s32 %r<16>; - .reg .s64 %rd<6>; - - - ld.param.u64 %rd1, [ISPCLaunch_param_1]; - ld.param.u64 %rd2, [ISPCLaunch_param_2]; - ld.param.u32 %r1, [ISPCLaunch_param_3]; - ld.param.u32 %r2, [ISPCLaunch_param_4]; - ld.param.u32 %r3, [ISPCLaunch_param_5]; - mov.u32 %r4, %tid.x; - and.b32 %r5, %r4, 31; - setp.ne.s32 %p1, %r5, 0; - @%p1 bra BB9_2; - - add.s32 %r14, %r1, -1; - shr.s32 %r15, %r14, 2; - add.s32 %r7, %r15, 1; - mov.u32 %r12, 1; - mov.u32 %r10, 128; - mov.u32 %r13, 0; - mov.u64 %rd5, 0; - // inline asm - { - .param .b64 param0; - st.param.b64 [param0+0], %rd1; - .param .b64 param1; - st.param.b64 [param1+0], %rd2; - .param .align 4 .b8 param2[12]; - st.param.b32 [param2+0], %r7; - st.param.b32 [param2+4], %r2; - st.param.b32 [param2+8], %r3; - .param .align 4 .b8 param3[12]; - st.param.b32 [param3+0], %r10; - st.param.b32 [param3+4], %r12; - st.param.b32 [param3+8], %r12; - .param .b32 param4; - st.param.b32 [param4+0], %r13; - .param .b64 param5; - st.param.b64 [param5+0], %rd5; - - .param .b32 retval0; - call.uni (retval0), - cudaLaunchDevice, - ( - param0, - param1, - param2, - param3, - param4, - param5 - ); - ld.param.b32 %r6, [retval0+0]; - } - - // inline asm - -BB9_2: - ret; -} - -.visible .func ISPCSync( - .param .b64 ISPCSync_param_0 -) -{ - .reg .s32 %r<2>; - - - // Callseq Start 1 - { - .reg .b32 temp_param_reg; - .param .b32 retval0; - call.uni (retval0), - cudaDeviceSynchronize, - ( - ); - ld.param.b32 %r1, [retval0+0]; - } - // Callseq End 1 - ret; -} - -.visible .func (.param .b64 func_retval0) __warpBinExclusiveScan( - .param .b32 __warpBinExclusiveScan_param_0 -) -{ - .reg .s32 %r<8>; - .reg .s64 %rd<5>; - - - ld.param.u8 %r2, [__warpBinExclusiveScan_param_0]; - // inline asm - { .reg .pred %p1; - setp.ne.u32 %p1, %r2, 0; - vote.ballot.b32 %r1, %p1; - } - // inline asm - // inline asm - popc.b32 %r3, %r1; - // inline asm - // inline asm - mov.u32 %r5, %lanemask_lt; - // inline asm - and.b32 %r7, %r5, %r1; - // inline asm - popc.b32 %r6, %r7; - // inline asm - cvt.u64.u32 %rd1, %r6; - shl.b64 %rd2, %rd1, 32; - cvt.u64.u32 %rd3, %r3; - or.b64 %rd4, %rd2, %rd3; - st.param.b64 [func_retval0+0], %rd4; - ret; -} - -.entry stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_( - .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_0, - .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_1, - .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_2, - .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_3, - .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_4, - .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_5, - .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_6, - .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_7, - .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_8, - .param .u64 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_9, - .param .u64 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_10, - .param .u64 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_11, - .param .u64 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_12 -) -{ - .reg .pred %p<14>; - .reg .s32 %r<178>; - .reg .s64 %rd<96>; - .reg .f64 %fd<95>; - - - ld.param.u32 %r42, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_0]; - ld.param.u32 %r43, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_1]; - ld.param.u32 %r44, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_2]; - ld.param.u32 %r45, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_3]; - ld.param.u32 %r46, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_4]; - ld.param.u32 %r47, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_5]; - ld.param.u32 %r48, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_6]; - ld.param.u32 %r49, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_7]; - ld.param.u64 %rd2, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_9]; - ld.param.u64 %rd3, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_10]; - ld.param.u64 %rd4, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_11]; - ld.param.u64 %rd5, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_12]; - mov.u32 %r1, %ctaid.x; - shl.b32 %r50, %r1, 2; - mov.u32 %r2, %tid.x; - shr.s32 %r51, %r2, 5; - add.s32 %r52, %r51, %r50; - mov.u32 %r53, %nctaid.x; - shl.b32 %r54, %r53, 2; - setp.ge.s32 %p1, %r52, %r54; - mov.u32 %r55, %nctaid.y; - mov.u32 %r3, %ctaid.y; - setp.ge.s32 %p2, %r3, %r55; - or.pred %p3, %p1, %p2; - mov.u32 %r56, %nctaid.z; - mov.u32 %r4, %ctaid.z; - setp.ge.s32 %p4, %r4, %r56; - or.pred %p5, %p3, %p4; - @%p5 bra BB12_13; - - shl.b32 %r57, %r1, 7; - add.s32 %r58, %r2, %r57; - and.b32 %r59, %r58, -32; - add.s32 %r60, %r59, %r42; - add.s32 %r61, %r60, 32; - min.s32 %r5, %r43, %r61; - shl.b32 %r6, %r3, 3; - add.s32 %r62, %r6, %r44; - add.s32 %r7, %r62, 8; - shl.b32 %r8, %r4, 3; - add.s32 %r172, %r8, %r46; - add.s32 %r63, %r172, 8; - min.s32 %r64, %r47, %r63; - mul.lo.s32 %r10, %r49, %r48; - sub.s32 %r65, %r5, %r60; - shr.s32 %r66, %r65, 31; - shr.u32 %r67, %r66, 27; - add.s32 %r68, %r65, %r67; - and.b32 %r69, %r68, -32; - sub.s32 %r70, %r65, %r69; - sub.s32 %r11, %r5, %r70; - and.b32 %r71, %r2, 31; - cvt.u64.u32 %rd6, %r71; - mov.u64 %rd7, constDeltaForeach1; - add.s64 %rd1, %rd7, %rd6; - setp.ge.s32 %p6, %r172, %r64; - @%p6 bra BB12_13; - - min.s32 %r12, %r45, %r7; - shl.b32 %r15, %r10, 1; - neg.s32 %r16, %r15; - mul.lo.s32 %r17, %r10, 3; - mul.lo.s32 %r18, %r10, -3; - mov.u32 %r72, -9; - sub.s32 %r73, %r72, %r44; - sub.s32 %r74, %r73, %r6; - not.b32 %r75, %r45; - max.s32 %r76, %r74, %r75; - not.b32 %r19, %r76; - sub.s32 %r77, %r72, %r46; - sub.s32 %r78, %r77, %r8; - not.b32 %r79, %r47; - max.s32 %r80, %r78, %r79; - not.b32 %r20, %r80; - ld.global.u8 %r13, [%rd1]; - mov.u32 %r171, %r172; - -BB12_3: - mov.u32 %r21, %r171; - add.s32 %r23, %r21, %r13; - setp.ge.s32 %p7, %r62, %r12; - @%p7 bra BB12_12; - - mul.lo.s32 %r24, %r23, %r10; - mov.u32 %r174, %r62; - mov.u32 %r173, %r62; - -BB12_5: - mov.u32 %r27, %r173; - add.s32 %r30, %r27, %r13; - setp.ge.s32 %p8, %r60, %r11; - mov.u32 %r176, %r60; - @%p8 bra BB12_8; - - mov.u64 %rd9, constDeltaForeach4; - add.s64 %rd10, %rd9, %rd6; - ld.global.u8 %r31, [%rd10]; - mad.lo.s32 %r32, %r30, %r48, %r24; - add.s32 %r177, %r59, %r42; - -BB12_7: - cvta.to.global.u64 %rd11, %rd2; - add.s32 %r98, %r32, %r177; - add.s32 %r99, %r98, %r31; - shl.b32 %r100, %r99, 3; - cvt.s64.s32 %rd12, %r100; - add.s64 %rd13, %rd12, %rd4; - add.s32 %r101, %r100, 8; - cvt.s64.s32 %rd14, %r101; - add.s64 %rd15, %rd14, %rd4; - add.s32 %r102, %r100, -8; - cvt.s64.s32 %rd16, %r102; - add.s64 %rd17, %rd16, %rd4; - add.s32 %r103, %r99, %r48; - shl.b32 %r104, %r103, 3; - cvt.s64.s32 %rd18, %r104; - add.s64 %rd19, %rd18, %rd4; - sub.s32 %r105, %r99, %r48; - shl.b32 %r106, %r105, 3; - cvt.s64.s32 %rd20, %r106; - add.s64 %rd21, %rd20, %rd4; - add.s32 %r108, %r99, %r10; - shl.b32 %r109, %r108, 3; - cvt.s64.s32 %rd22, %r109; - add.s64 %rd23, %rd22, %rd4; - sub.s32 %r110, %r99, %r10; - shl.b32 %r111, %r110, 3; - cvt.s64.s32 %rd24, %r111; - add.s64 %rd25, %rd24, %rd4; - add.s32 %r112, %r100, 16; - cvt.s64.s32 %rd26, %r112; - add.s64 %rd27, %rd26, %rd4; - add.s32 %r113, %r100, -16; - cvt.s64.s32 %rd28, %r113; - add.s64 %rd29, %rd28, %rd4; - shl.b32 %r114, %r48, 1; - add.s32 %r115, %r99, %r114; - shl.b32 %r116, %r115, 3; - cvt.s64.s32 %rd30, %r116; - add.s64 %rd31, %rd30, %rd4; - mad.lo.s32 %r117, %r48, -2, %r99; - shl.b32 %r118, %r117, 3; - cvt.s64.s32 %rd32, %r118; - add.s64 %rd33, %rd32, %rd4; - add.s32 %r119, %r99, %r15; - shl.b32 %r120, %r119, 3; - cvt.s64.s32 %rd34, %r120; - add.s64 %rd35, %rd34, %rd4; - add.s32 %r121, %r99, %r16; - shl.b32 %r122, %r121, 3; - cvt.s64.s32 %rd36, %r122; - add.s64 %rd37, %rd36, %rd4; - add.s32 %r123, %r100, 24; - cvt.s64.s32 %rd38, %r123; - add.s64 %rd39, %rd38, %rd4; - add.s32 %r124, %r100, -24; - cvt.s64.s32 %rd40, %r124; - add.s64 %rd41, %rd40, %rd4; - mad.lo.s32 %r125, %r48, 3, %r99; - shl.b32 %r126, %r125, 3; - cvt.s64.s32 %rd42, %r126; - add.s64 %rd43, %rd42, %rd4; - mad.lo.s32 %r127, %r48, -3, %r99; - shl.b32 %r128, %r127, 3; - cvt.s64.s32 %rd44, %r128; - add.s64 %rd45, %rd44, %rd4; - add.s32 %r129, %r99, %r17; - shl.b32 %r130, %r129, 3; - cvt.s64.s32 %rd46, %r130; - add.s64 %rd47, %rd46, %rd4; - add.s32 %r131, %r99, %r18; - shl.b32 %r132, %r131, 3; - cvt.s64.s32 %rd48, %r132; - add.s64 %rd49, %rd48, %rd4; - add.s64 %rd50, %rd12, %rd5; - add.s64 %rd51, %rd12, %rd3; - ld.f64 %fd1, [%rd13]; - add.f64 %fd2, %fd1, %fd1; - ld.f64 %fd3, [%rd50]; - sub.f64 %fd4, %fd2, %fd3; - ld.global.f64 %fd5, [%rd11]; - ld.f64 %fd6, [%rd17]; - ld.f64 %fd7, [%rd15]; - add.f64 %fd8, %fd7, %fd6; - ld.f64 %fd9, [%rd19]; - add.f64 %fd10, %fd8, %fd9; - ld.f64 %fd11, [%rd21]; - add.f64 %fd12, %fd10, %fd11; - ld.f64 %fd13, [%rd23]; - add.f64 %fd14, %fd12, %fd13; - ld.f64 %fd15, [%rd25]; - add.f64 %fd16, %fd14, %fd15; - ld.global.f64 %fd17, [%rd11+8]; - mul.f64 %fd18, %fd17, %fd16; - fma.rn.f64 %fd19, %fd5, %fd1, %fd18; - ld.f64 %fd20, [%rd29]; - ld.f64 %fd21, [%rd27]; - add.f64 %fd22, %fd21, %fd20; - ld.f64 %fd23, [%rd31]; - add.f64 %fd24, %fd22, %fd23; - ld.f64 %fd25, [%rd33]; - add.f64 %fd26, %fd24, %fd25; - ld.f64 %fd27, [%rd35]; - add.f64 %fd28, %fd26, %fd27; - ld.f64 %fd29, [%rd37]; - add.f64 %fd30, %fd28, %fd29; - ld.global.f64 %fd31, [%rd11+16]; - fma.rn.f64 %fd32, %fd31, %fd30, %fd19; - ld.f64 %fd33, [%rd41]; - ld.f64 %fd34, [%rd39]; - add.f64 %fd35, %fd34, %fd33; - ld.f64 %fd36, [%rd43]; - add.f64 %fd37, %fd35, %fd36; - ld.f64 %fd38, [%rd45]; - add.f64 %fd39, %fd37, %fd38; - ld.f64 %fd40, [%rd47]; - add.f64 %fd41, %fd39, %fd40; - ld.f64 %fd42, [%rd49]; - add.f64 %fd43, %fd41, %fd42; - ld.global.f64 %fd44, [%rd11+24]; - fma.rn.f64 %fd45, %fd44, %fd43, %fd32; - ld.f64 %fd46, [%rd51]; - fma.rn.f64 %fd47, %fd46, %fd45, %fd4; - st.f64 [%rd50], %fd47; - add.s32 %r177, %r177, 32; - setp.lt.s32 %p9, %r177, %r11; - mov.u32 %r175, %r177; - mov.u32 %r176, %r175; - @%p9 bra BB12_7; - -BB12_8: - mov.u32 %r36, %r176; - setp.ge.s32 %p10, %r36, %r5; - @%p10 bra BB12_11; - - mov.u64 %rd53, constDeltaForeach4; - add.s64 %rd54, %rd53, %rd6; - ld.global.u8 %r135, [%rd54]; - add.s32 %r37, %r36, %r135; - setp.ge.s32 %p11, %r37, %r5; - @%p11 bra BB12_11; - - cvta.to.global.u64 %rd55, %rd2; - mad.lo.s32 %r136, %r30, %r48, %r24; - add.s32 %r137, %r136, %r37; - shl.b32 %r138, %r137, 3; - cvt.s64.s32 %rd56, %r138; - add.s64 %rd57, %rd56, %rd4; - add.s32 %r139, %r138, 8; - cvt.s64.s32 %rd58, %r139; - add.s64 %rd59, %rd58, %rd4; - add.s32 %r140, %r138, -8; - cvt.s64.s32 %rd60, %r140; - add.s64 %rd61, %rd60, %rd4; - add.s32 %r141, %r137, %r48; - shl.b32 %r142, %r141, 3; - cvt.s64.s32 %rd62, %r142; - add.s64 %rd63, %rd62, %rd4; - sub.s32 %r143, %r137, %r48; - shl.b32 %r144, %r143, 3; - cvt.s64.s32 %rd64, %r144; - add.s64 %rd65, %rd64, %rd4; - add.s32 %r146, %r137, %r10; - shl.b32 %r147, %r146, 3; - cvt.s64.s32 %rd66, %r147; - add.s64 %rd67, %rd66, %rd4; - sub.s32 %r148, %r137, %r10; - shl.b32 %r149, %r148, 3; - cvt.s64.s32 %rd68, %r149; - add.s64 %rd69, %rd68, %rd4; - add.s32 %r150, %r138, 16; - cvt.s64.s32 %rd70, %r150; - add.s64 %rd71, %rd70, %rd4; - add.s32 %r151, %r138, -16; - cvt.s64.s32 %rd72, %r151; - add.s64 %rd73, %rd72, %rd4; - shl.b32 %r152, %r48, 1; - add.s32 %r153, %r137, %r152; - shl.b32 %r154, %r153, 3; - cvt.s64.s32 %rd74, %r154; - add.s64 %rd75, %rd74, %rd4; - mad.lo.s32 %r155, %r48, -2, %r137; - shl.b32 %r156, %r155, 3; - cvt.s64.s32 %rd76, %r156; - add.s64 %rd77, %rd76, %rd4; - add.s32 %r157, %r137, %r15; - shl.b32 %r158, %r157, 3; - cvt.s64.s32 %rd78, %r158; - add.s64 %rd79, %rd78, %rd4; - add.s32 %r159, %r137, %r16; - shl.b32 %r160, %r159, 3; - cvt.s64.s32 %rd80, %r160; - add.s64 %rd81, %rd80, %rd4; - add.s32 %r161, %r138, 24; - cvt.s64.s32 %rd82, %r161; - add.s64 %rd83, %rd82, %rd4; - add.s32 %r162, %r138, -24; - cvt.s64.s32 %rd84, %r162; - add.s64 %rd85, %rd84, %rd4; - mad.lo.s32 %r163, %r48, 3, %r137; - shl.b32 %r164, %r163, 3; - cvt.s64.s32 %rd86, %r164; - add.s64 %rd87, %rd86, %rd4; - mad.lo.s32 %r165, %r48, -3, %r137; - shl.b32 %r166, %r165, 3; - cvt.s64.s32 %rd88, %r166; - add.s64 %rd89, %rd88, %rd4; - add.s32 %r167, %r137, %r17; - shl.b32 %r168, %r167, 3; - cvt.s64.s32 %rd90, %r168; - add.s64 %rd91, %rd90, %rd4; - add.s32 %r169, %r137, %r18; - shl.b32 %r170, %r169, 3; - cvt.s64.s32 %rd92, %r170; - add.s64 %rd93, %rd92, %rd4; - add.s64 %rd94, %rd56, %rd5; - add.s64 %rd95, %rd56, %rd3; - ld.f64 %fd48, [%rd57]; - add.f64 %fd49, %fd48, %fd48; - ld.f64 %fd50, [%rd94]; - sub.f64 %fd51, %fd49, %fd50; - ld.global.f64 %fd52, [%rd55]; - ld.f64 %fd53, [%rd61]; - ld.f64 %fd54, [%rd59]; - add.f64 %fd55, %fd54, %fd53; - ld.f64 %fd56, [%rd63]; - add.f64 %fd57, %fd55, %fd56; - ld.f64 %fd58, [%rd65]; - add.f64 %fd59, %fd57, %fd58; - ld.f64 %fd60, [%rd67]; - add.f64 %fd61, %fd59, %fd60; - ld.f64 %fd62, [%rd69]; - add.f64 %fd63, %fd61, %fd62; - ld.global.f64 %fd64, [%rd55+8]; - mul.f64 %fd65, %fd64, %fd63; - fma.rn.f64 %fd66, %fd52, %fd48, %fd65; - ld.f64 %fd67, [%rd73]; - ld.f64 %fd68, [%rd71]; - add.f64 %fd69, %fd68, %fd67; - ld.f64 %fd70, [%rd75]; - add.f64 %fd71, %fd69, %fd70; - ld.f64 %fd72, [%rd77]; - add.f64 %fd73, %fd71, %fd72; - ld.f64 %fd74, [%rd79]; - add.f64 %fd75, %fd73, %fd74; - ld.f64 %fd76, [%rd81]; - add.f64 %fd77, %fd75, %fd76; - ld.global.f64 %fd78, [%rd55+16]; - fma.rn.f64 %fd79, %fd78, %fd77, %fd66; - ld.f64 %fd80, [%rd85]; - ld.f64 %fd81, [%rd83]; - add.f64 %fd82, %fd81, %fd80; - ld.f64 %fd83, [%rd87]; - add.f64 %fd84, %fd82, %fd83; - ld.f64 %fd85, [%rd89]; - add.f64 %fd86, %fd84, %fd85; - ld.f64 %fd87, [%rd91]; - add.f64 %fd88, %fd86, %fd87; - ld.f64 %fd89, [%rd93]; - add.f64 %fd90, %fd88, %fd89; - ld.global.f64 %fd91, [%rd55+24]; - fma.rn.f64 %fd92, %fd91, %fd90, %fd79; - ld.f64 %fd93, [%rd95]; - fma.rn.f64 %fd94, %fd92, %fd93, %fd51; - st.f64 [%rd94], %fd94; - -BB12_11: - add.s32 %r39, %r174, 1; - setp.ne.s32 %p12, %r39, %r19; - mov.u32 %r174, %r39; - mov.u32 %r173, %r39; - @%p12 bra BB12_5; - -BB12_12: - add.s32 %r171, %r172, 1; - setp.ne.s32 %p13, %r171, %r20; - mov.u32 %r172, %r171; - @%p13 bra BB12_3; - -BB12_13: - ret; -} - -.visible .func loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E_( - .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_0, - .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_1, - .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_2, - .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_3, - .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_4, - .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_5, - .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_6, - .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_7, - .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_8, - .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_9, - .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_10, - .param .b64 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_11, - .param .b64 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_12, - .param .b64 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_13, - .param .b64 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_14, - .param .align 1 .b8 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_15[1] -) -{ - .reg .pred %p<9>; - .reg .s32 %r<63>; - .reg .s64 %rd<18>; - - - ld.param.u32 %r62, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_0]; - ld.param.u32 %r12, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_1]; - ld.param.u32 %r13, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_2]; - ld.param.u32 %r14, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_3]; - ld.param.u32 %r15, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_4]; - ld.param.u32 %r16, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_5]; - ld.param.u32 %r17, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_6]; - ld.param.u32 %r18, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_7]; - ld.param.u32 %r19, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_8]; - ld.param.u32 %r20, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_9]; - ld.param.u32 %r21, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_10]; - ld.param.u64 %rd4, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_11]; - ld.param.u64 %rd5, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_12]; - ld.param.u64 %rd6, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_13]; - ld.param.u64 %rd7, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_14]; - setp.ge.s32 %p1, %r62, %r12; - @%p1 bra BB13_14; - - mov.u32 %r22, 31; - sub.s32 %r23, %r22, %r13; - add.s32 %r24, %r23, %r14; - shr.s32 %r25, %r24, 31; - shr.u32 %r26, %r25, 27; - add.s32 %r27, %r24, %r26; - shr.s32 %r28, %r27, 5; - mov.u32 %r29, 7; - sub.s32 %r30, %r29, %r15; - add.s32 %r31, %r30, %r16; - shr.s32 %r32, %r31, 31; - shr.u32 %r33, %r32, 29; - add.s32 %r34, %r31, %r33; - shr.s32 %r1, %r34, 3; - sub.s32 %r35, %r29, %r17; - add.s32 %r36, %r35, %r18; - shr.s32 %r37, %r36, 31; - shr.u32 %r38, %r37, 29; - add.s32 %r39, %r36, %r38; - shr.s32 %r2, %r39, 3; - add.s32 %r40, %r28, -1; - shr.s32 %r41, %r40, 2; - add.s32 %r3, %r41, 1; - mov.u32 %r42, %tid.x; - and.b32 %r4, %r42, 31; - sub.s32 %r61, %r62, %r12; - -BB13_2: - and.b32 %r8, %r62, 1; - setp.ne.s32 %p2, %r4, 0; - mov.u64 %rd17, 0; - @%p2 bra BB13_4; - - mov.u64 %rd9, 8; - mov.u64 %rd10, 72; - // Callseq Start 2 - { - .reg .b32 temp_param_reg; - .param .b64 param0; - st.param.b64 [param0+0], %rd9; - .param .b64 param1; - st.param.b64 [param1+0], %rd10; - .param .b64 retval0; - call.uni (retval0), - cudaGetParameterBuffer, - ( - param0, - param1 - ); - ld.param.b64 %rd17, [retval0+0]; - } - // Callseq End 2 - -BB13_4: - setp.eq.s32 %p3, %r8, 0; - @%p3 bra BB13_9; - - setp.eq.s64 %p4, %rd17, 0; - @%p4 bra BB13_7; - - st.u32 [%rd17], %r13; - st.u32 [%rd17+4], %r14; - st.u32 [%rd17+8], %r15; - st.u32 [%rd17+12], %r16; - st.u32 [%rd17+16], %r17; - st.u32 [%rd17+20], %r18; - st.u32 [%rd17+24], %r19; - st.u32 [%rd17+28], %r20; - st.u32 [%rd17+32], %r21; - st.u64 [%rd17+40], %rd4; - st.u64 [%rd17+48], %rd5; - st.u64 [%rd17+56], %rd7; - st.u64 [%rd17+64], %rd6; - -BB13_7: - @%p2 bra BB13_13; - - mov.u32 %r47, 128; - mov.u32 %r49, 1; - mov.u32 %r50, 0; - mov.u64 %rd13, 0; - mov.u64 %rd11, stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_; - // inline asm - { - .param .b64 param0; - st.param.b64 [param0+0], %rd11; - .param .b64 param1; - st.param.b64 [param1+0], %rd17; - .param .align 4 .b8 param2[12]; - st.param.b32 [param2+0], %r3; - st.param.b32 [param2+4], %r1; - st.param.b32 [param2+8], %r2; - .param .align 4 .b8 param3[12]; - st.param.b32 [param3+0], %r47; - st.param.b32 [param3+4], %r49; - st.param.b32 [param3+8], %r49; - .param .b32 param4; - st.param.b32 [param4+0], %r50; - .param .b64 param5; - st.param.b64 [param5+0], %rd13; - - .param .b32 retval0; - call.uni (retval0), - cudaLaunchDevice, - ( - param0, - param1, - param2, - param3, - param4, - param5 - ); - ld.param.b32 %r43, [retval0+0]; - } - - // inline asm - bra.uni BB13_13; - -BB13_9: - setp.eq.s64 %p6, %rd17, 0; - @%p6 bra BB13_11; - - st.u32 [%rd17], %r13; - st.u32 [%rd17+4], %r14; - st.u32 [%rd17+8], %r15; - st.u32 [%rd17+12], %r16; - st.u32 [%rd17+16], %r17; - st.u32 [%rd17+20], %r18; - st.u32 [%rd17+24], %r19; - st.u32 [%rd17+28], %r20; - st.u32 [%rd17+32], %r21; - st.u64 [%rd17+40], %rd4; - st.u64 [%rd17+48], %rd5; - st.u64 [%rd17+56], %rd6; - st.u64 [%rd17+64], %rd7; - -BB13_11: - @%p2 bra BB13_13; - - mov.u32 %r55, 128; - mov.u32 %r57, 1; - mov.u32 %r58, 0; - mov.u64 %rd16, 0; - mov.u64 %rd14, stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_; - // inline asm - { - .param .b64 param0; - st.param.b64 [param0+0], %rd14; - .param .b64 param1; - st.param.b64 [param1+0], %rd17; - .param .align 4 .b8 param2[12]; - st.param.b32 [param2+0], %r3; - st.param.b32 [param2+4], %r1; - st.param.b32 [param2+8], %r2; - .param .align 4 .b8 param3[12]; - st.param.b32 [param3+0], %r55; - st.param.b32 [param3+4], %r57; - st.param.b32 [param3+8], %r57; - .param .b32 param4; - st.param.b32 [param4+0], %r58; - .param .b64 param5; - st.param.b64 [param5+0], %rd16; - - .param .b32 retval0; - call.uni (retval0), - cudaLaunchDevice, - ( - param0, - param1, - param2, - param3, - param4, - param5 - ); - ld.param.b32 %r51, [retval0+0]; - } - - // inline asm - -BB13_13: - // Callseq Start 3 - { - .reg .b32 temp_param_reg; - .param .b32 retval0; - call.uni (retval0), - cudaDeviceSynchronize, - ( - ); - ld.param.b32 %r59, [retval0+0]; - } - // Callseq End 3 - add.s32 %r62, %r62, 1; - add.s32 %r61, %r61, 1; - setp.ne.s32 %p8, %r61, 0; - @%p8 bra BB13_2; - -BB13_14: - // Callseq Start 4 - { - .reg .b32 temp_param_reg; - .param .b32 retval0; - call.uni (retval0), - cudaDeviceSynchronize, - ( - ); - ld.param.b32 %r60, [retval0+0]; - } - // Callseq End 4 - ret; -} - -.visible .entry loop_stencil_ispc_tasks( - .param .u32 loop_stencil_ispc_tasks_param_0, - .param .u32 loop_stencil_ispc_tasks_param_1, - .param .u32 loop_stencil_ispc_tasks_param_2, - .param .u32 loop_stencil_ispc_tasks_param_3, - .param .u32 loop_stencil_ispc_tasks_param_4, - .param .u32 loop_stencil_ispc_tasks_param_5, - .param .u32 loop_stencil_ispc_tasks_param_6, - .param .u32 loop_stencil_ispc_tasks_param_7, - .param .u32 loop_stencil_ispc_tasks_param_8, - .param .u32 loop_stencil_ispc_tasks_param_9, - .param .u32 loop_stencil_ispc_tasks_param_10, - .param .u64 loop_stencil_ispc_tasks_param_11, - .param .u64 loop_stencil_ispc_tasks_param_12, - .param .u64 loop_stencil_ispc_tasks_param_13, - .param .u64 loop_stencil_ispc_tasks_param_14 -) -{ - .reg .pred %p<9>; - .reg .s32 %r<63>; - .reg .s64 %rd<18>; - - - ld.param.u32 %r62, [loop_stencil_ispc_tasks_param_0]; - ld.param.u32 %r12, [loop_stencil_ispc_tasks_param_1]; - ld.param.u32 %r13, [loop_stencil_ispc_tasks_param_2]; - ld.param.u32 %r14, [loop_stencil_ispc_tasks_param_3]; - ld.param.u32 %r15, [loop_stencil_ispc_tasks_param_4]; - ld.param.u32 %r16, [loop_stencil_ispc_tasks_param_5]; - ld.param.u32 %r17, [loop_stencil_ispc_tasks_param_6]; - ld.param.u32 %r18, [loop_stencil_ispc_tasks_param_7]; - ld.param.u32 %r19, [loop_stencil_ispc_tasks_param_8]; - ld.param.u32 %r20, [loop_stencil_ispc_tasks_param_9]; - ld.param.u32 %r21, [loop_stencil_ispc_tasks_param_10]; - ld.param.u64 %rd4, [loop_stencil_ispc_tasks_param_11]; - ld.param.u64 %rd5, [loop_stencil_ispc_tasks_param_12]; - ld.param.u64 %rd6, [loop_stencil_ispc_tasks_param_13]; - ld.param.u64 %rd7, [loop_stencil_ispc_tasks_param_14]; - setp.ge.s32 %p1, %r62, %r12; - @%p1 bra BB14_14; - - mov.u32 %r22, 31; - sub.s32 %r23, %r22, %r13; - add.s32 %r24, %r23, %r14; - shr.s32 %r25, %r24, 31; - shr.u32 %r26, %r25, 27; - add.s32 %r27, %r24, %r26; - shr.s32 %r28, %r27, 5; - mov.u32 %r29, 7; - sub.s32 %r30, %r29, %r15; - add.s32 %r31, %r30, %r16; - shr.s32 %r32, %r31, 31; - shr.u32 %r33, %r32, 29; - add.s32 %r34, %r31, %r33; - shr.s32 %r1, %r34, 3; - sub.s32 %r35, %r29, %r17; - add.s32 %r36, %r35, %r18; - shr.s32 %r37, %r36, 31; - shr.u32 %r38, %r37, 29; - add.s32 %r39, %r36, %r38; - shr.s32 %r2, %r39, 3; - add.s32 %r40, %r28, -1; - shr.s32 %r41, %r40, 2; - add.s32 %r3, %r41, 1; - mov.u32 %r42, %tid.x; - and.b32 %r4, %r42, 31; - sub.s32 %r61, %r62, %r12; - -BB14_2: - and.b32 %r8, %r62, 1; - setp.ne.s32 %p2, %r4, 0; - mov.u64 %rd17, 0; - @%p2 bra BB14_4; - - mov.u64 %rd9, 8; - mov.u64 %rd10, 72; - // Callseq Start 5 - { - .reg .b32 temp_param_reg; - .param .b64 param0; - st.param.b64 [param0+0], %rd9; - .param .b64 param1; - st.param.b64 [param1+0], %rd10; - .param .b64 retval0; - call.uni (retval0), - cudaGetParameterBuffer, - ( - param0, - param1 - ); - ld.param.b64 %rd17, [retval0+0]; - } - // Callseq End 5 - -BB14_4: - setp.eq.s32 %p3, %r8, 0; - @%p3 bra BB14_9; - - setp.eq.s64 %p4, %rd17, 0; - @%p4 bra BB14_7; - - st.u32 [%rd17], %r13; - st.u32 [%rd17+4], %r14; - st.u32 [%rd17+8], %r15; - st.u32 [%rd17+12], %r16; - st.u32 [%rd17+16], %r17; - st.u32 [%rd17+20], %r18; - st.u32 [%rd17+24], %r19; - st.u32 [%rd17+28], %r20; - st.u32 [%rd17+32], %r21; - st.u64 [%rd17+40], %rd4; - st.u64 [%rd17+48], %rd5; - st.u64 [%rd17+56], %rd7; - st.u64 [%rd17+64], %rd6; - -BB14_7: - @%p2 bra BB14_13; - - mov.u32 %r47, 128; - mov.u32 %r49, 1; - mov.u32 %r50, 0; - mov.u64 %rd13, 0; - mov.u64 %rd11, stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_; - // inline asm - { - .param .b64 param0; - st.param.b64 [param0+0], %rd11; - .param .b64 param1; - st.param.b64 [param1+0], %rd17; - .param .align 4 .b8 param2[12]; - st.param.b32 [param2+0], %r3; - st.param.b32 [param2+4], %r1; - st.param.b32 [param2+8], %r2; - .param .align 4 .b8 param3[12]; - st.param.b32 [param3+0], %r47; - st.param.b32 [param3+4], %r49; - st.param.b32 [param3+8], %r49; - .param .b32 param4; - st.param.b32 [param4+0], %r50; - .param .b64 param5; - st.param.b64 [param5+0], %rd13; - - .param .b32 retval0; - call.uni (retval0), - cudaLaunchDevice, - ( - param0, - param1, - param2, - param3, - param4, - param5 - ); - ld.param.b32 %r43, [retval0+0]; - } - - // inline asm - bra.uni BB14_13; - -BB14_9: - setp.eq.s64 %p6, %rd17, 0; - @%p6 bra BB14_11; - - st.u32 [%rd17], %r13; - st.u32 [%rd17+4], %r14; - st.u32 [%rd17+8], %r15; - st.u32 [%rd17+12], %r16; - st.u32 [%rd17+16], %r17; - st.u32 [%rd17+20], %r18; - st.u32 [%rd17+24], %r19; - st.u32 [%rd17+28], %r20; - st.u32 [%rd17+32], %r21; - st.u64 [%rd17+40], %rd4; - st.u64 [%rd17+48], %rd5; - st.u64 [%rd17+56], %rd6; - st.u64 [%rd17+64], %rd7; - -BB14_11: - @%p2 bra BB14_13; - - mov.u32 %r55, 128; - mov.u32 %r57, 1; - mov.u32 %r58, 0; - mov.u64 %rd16, 0; - mov.u64 %rd14, stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_; - // inline asm - { - .param .b64 param0; - st.param.b64 [param0+0], %rd14; - .param .b64 param1; - st.param.b64 [param1+0], %rd17; - .param .align 4 .b8 param2[12]; - st.param.b32 [param2+0], %r3; - st.param.b32 [param2+4], %r1; - st.param.b32 [param2+8], %r2; - .param .align 4 .b8 param3[12]; - st.param.b32 [param3+0], %r55; - st.param.b32 [param3+4], %r57; - st.param.b32 [param3+8], %r57; - .param .b32 param4; - st.param.b32 [param4+0], %r58; - .param .b64 param5; - st.param.b64 [param5+0], %rd16; - - .param .b32 retval0; - call.uni (retval0), - cudaLaunchDevice, - ( - param0, - param1, - param2, - param3, - param4, - param5 - ); - ld.param.b32 %r51, [retval0+0]; - } - - // inline asm - -BB14_13: - // Callseq Start 6 - { - .reg .b32 temp_param_reg; - .param .b32 retval0; - call.uni (retval0), - cudaDeviceSynchronize, - ( - ); - ld.param.b32 %r59, [retval0+0]; - } - // Callseq End 6 - add.s32 %r62, %r62, 1; - add.s32 %r61, %r61, 1; - setp.ne.s32 %p8, %r61, 0; - @%p8 bra BB14_2; - -BB14_14: - // Callseq Start 7 - { - .reg .b32 temp_param_reg; - .param .b32 retval0; - call.uni (retval0), - cudaDeviceSynchronize, - ( - ); - ld.param.b32 %r60, [retval0+0]; - } - // Callseq End 7 - ret; -} - - - diff --git a/examples_cuda/stencil/stencil_orig.cpp b/examples_cuda/stencil/stencil_orig.cpp deleted file mode 100644 index 015f2b80..00000000 --- a/examples_cuda/stencil/stencil_orig.cpp +++ /dev/null @@ -1,172 +0,0 @@ -/* - Copyright (c) 2010-2011, Intel Corporation - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of Intel Corporation nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS - IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER - OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*/ - -#ifdef _MSC_VER -#define _CRT_SECURE_NO_WARNINGS -#define NOMINMAX -#pragma warning (disable: 4244) -#pragma warning (disable: 4305) -#endif - -#include -#include -#include -#include "../timing.h" -#include "stencil_ispc.h" -using namespace ispc; - -#include - - -double rtc(void) -{ - struct timeval Tvalue; - double etime; - struct timezone dummy; - - gettimeofday(&Tvalue,&dummy); - etime = (double) Tvalue.tv_sec + - 1.e-6*((double) Tvalue.tv_usec); - return etime; -} - - -extern void loop_stencil_serial(int t0, int t1, int x0, int x1, - int y0, int y1, int z0, int z1, - int Nx, int Ny, int Nz, - const double coef[5], - const double vsq[], - double Aeven[], double Aodd[]); - - -void InitData(int Nx, int Ny, int Nz, double *A[2], double *vsq) { - int offset = 0; - for (int z = 0; z < Nz; ++z) - for (int y = 0; y < Ny; ++y) - for (int x = 0; x < Nx; ++x, ++offset) { - A[0][offset] = (x < Nx / 2) ? x / double(Nx) : y / double(Ny); - A[1][offset] = 0; - vsq[offset] = x*y*z / double(Nx * Ny * Nz); - } -} - - -int main() { - int Nx = 256, Ny = 256, Nz = 256; - int width = 4; - double *Aserial[2], *Aispc[2]; - Aserial[0] = new double [Nx * Ny * Nz]; - Aserial[1] = new double [Nx * Ny * Nz]; - Aispc[0] = new double [Nx * Ny * Nz]; - Aispc[1] = new double [Nx * Ny * Nz]; - double *vsq = new double [Nx * Ny * Nz]; - - double coeff[4] = { 0.5, -.25, .125, -.0625 }; - -// InitData(Nx, Ny, Nz, Aispc, vsq); - - // - // Compute the image using the ispc implementation on one core; report - // the minimum time of three runs. - // - double minTimeISPC = 1e30; -#if 0 - for (int i = 0; i < 3; ++i) { - reset_and_start_timer(); - loop_stencil_ispc(0, 6, width, Nx - width, width, Ny - width, - width, Nz - width, Nx, Ny, Nz, coeff, vsq, - Aispc[0], Aispc[1]); - double dt = get_elapsed_mcycles(); - minTimeISPC = std::min(minTimeISPC, dt); - } - - printf("[stencil ispc 1 core]:\t\t[%.3f] million cycles\n", minTimeISPC); -#endif - - fprintf(stderr, " -- init -- \n"); - InitData(Nx, Ny, Nz, Aispc, vsq); - fprintf(stderr, " -- done init -- \n"); - - // - // Compute the image using the ispc implementation with tasks; report - // the minimum time of three runs. - // - double minTimeISPCTasks = 1e30; - for (int i = 0; i < 3; ++i) { - reset_and_start_timer(); - const double t0 = rtc(); - loop_stencil_ispc_tasks(0, 6, width, Nx - width, width, Ny - width, - width, Nz - width, Nx, Ny, Nz, coeff, vsq, - Aispc[0], Aispc[1]); - double dt = 1e3*(rtc() - t0); //get_elapsed_mcycles(); - minTimeISPCTasks = std::min(minTimeISPCTasks, dt); - } - - fprintf(stderr, "[stencil ispc + tasks]:\t\t[%.3f] million cycles\n", minTimeISPCTasks); - - - InitData(Nx, Ny, Nz, Aserial, vsq); - - // - // And run the serial implementation 3 times, again reporting the - // minimum time. - // - double minTimeSerial = 1e30; - for (int i = 0; i < 3; ++i) { - reset_and_start_timer(); - loop_stencil_serial(0, 6, width, Nx-width, width, Ny - width, - width, Nz - width, Nx, Ny, Nz, coeff, vsq, - Aserial[0], Aserial[1]); - double dt = get_elapsed_mcycles(); - minTimeSerial = std::min(minTimeSerial, dt); - } - - printf("[stencil serial]:\t\t[%.3f] million cycles\n", minTimeSerial); - - printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n", - minTimeSerial / minTimeISPC, minTimeSerial / minTimeISPCTasks); - - // Check for agreement - int offset = 0; - for (int z = 0; z < Nz; ++z) - for (int y = 0; y < Ny; ++y) - for (int x = 0; x < Nx; ++x, ++offset) { - double error = fabsf((Aserial[1][offset] - Aispc[1][offset]) / - Aserial[1][offset]); - if (error > 1e-4) - printf("Error @ (%d,%d,%d): ispc = %f, serial = %f\n", - x, y, z, Aispc[1][offset], Aserial[1][offset]); - } - - return 0; -} diff --git a/examples_cuda/stencil/stencil_orig.ispc b/examples_cuda/stencil/stencil_orig.ispc deleted file mode 100644 index d2e095b3..00000000 --- a/examples_cuda/stencil/stencil_orig.ispc +++ /dev/null @@ -1,172 +0,0 @@ -/* - Copyright (c) 2010-2011, Intel Corporation - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of Intel Corporation nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS - IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER - OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*/ - -#ifdef __NVPTX__ -#warning "emitting DEVICE code" -#define taskIndex blockIndex0() -#define taskCount blockCount0() -#define programIndex laneIndex() -#define programCount warpSize() -#else -#warning "emitting HOST code" -#endif - -static inline void -stencil_step(uniform int x0, uniform int x1, - uniform int y0, uniform int y1, - uniform int z0, uniform int z1, - uniform int Nx, uniform int Ny, uniform int Nz, - uniform const double coef[4], uniform const double vsq[], - uniform const double Ain[], uniform double Aout[]) { - const uniform int Nxy = Nx * Ny; - -// foreach (z = z0 ... z1, y = y0 ... y1, x = x0 ... x1) -#if 0 -#define VER1 -#endif - -#ifdef VER1 - const uniform long x1o = 1; - const uniform long x2o = 2; - const uniform long x3o = 3; - const uniform long y1o = Nx; - const uniform long y2o = Nx*2; - const uniform long y3o = Nx*3; - const uniform long z1o = Nxy; - const uniform long z2o = Nxy*2; - const uniform long z3o = Nxy*3; -#endif - for (uniform int z = z0; z < z1; z++) - for (uniform int y = y0; y < y1; y++) - { - const int index_base = (z * Nxy) + (y * Nx); - for (uniform int xb = x0; xb < x1; xb += programCount) - { - const int x = xb + programIndex; - int index = index_base + x; -#ifndef VER1 -#define A_cur(x, y, z) Ain[index + (x) + ((y) * Nx) + ((z) * Nxy)] -#define A_next(x, y, z) Aout[index + (x) + ((y) * Nx) + ((z) * Nxy)] - double div = coef[0] * A_cur(0, 0, 0) + - coef[1] * (A_cur(+1, 0, 0) + A_cur(-1, 0, 0) + - A_cur(0, +1, 0) + A_cur(0, -1, 0) + - A_cur(0, 0, +1) + A_cur(0, 0, -1)) + - coef[2] * (A_cur(+2, 0, 0) + A_cur(-2, 0, 0) + - A_cur(0, +2, 0) + A_cur(0, -2, 0) + - A_cur(0, 0, +2) + A_cur(0, 0, -2)) + - coef[3] * (A_cur(+3, 0, 0) + A_cur(-3, 0, 0) + - A_cur(0, +3, 0) + A_cur(0, -3, 0) + - A_cur(0, 0, +3) + A_cur(0, 0, -3)); -#else -#define A_cur(x, y, z) Ain [index + (x) + (y) + (z)] -#define A_next(x, y, z) Aout[index + (x) + (y) + (z)] - double div = coef[0] * A_cur(0, 0, 0) + - coef[1] * (A_cur(+x1o, 0, 0) + A_cur(-x1o, 0, 0) + - A_cur(0, +y1o, 0) + A_cur(0, -y1o, 0) + - A_cur(0, 0, +z1o) + A_cur(0, 0, -z1o)) + - coef[2] * (A_cur(+x2o, 0, 0) + A_cur(-x2o, 0, 0) + - A_cur(0, +y2o, 0) + A_cur(0, -y2o, 0) + - A_cur(0, 0, +z2o) + A_cur(0, 0, -z2o)) + - coef[3] * (A_cur(+x3o, 0, 0) + A_cur(-x3o, 0, 0) + - A_cur(0, +y3o, 0) + A_cur(0, -y3o, 0) + - A_cur(0, 0, +z3o) + A_cur(0, 0, -z3o)); -#endif - - if (x < x1) - A_next(0, 0, 0) = 2.0d0 * A_cur(0, 0, 0) - A_next(0, 0, 0) + - vsq[index] * div; - } - } -} - - -static task void -stencil_step_task(uniform int x0, uniform int x1, - uniform int y0, uniform int y1, - uniform int z0, - uniform int Nx, uniform int Ny, uniform int Nz, - uniform const double coef[4], uniform const double vsq[], - uniform const double Ain[], uniform double Aout[]) { - if(taskIndex >= taskCount) return; - - stencil_step(x0, x1, y0, y1, z0+taskIndex, z0+taskIndex+1, - Nx, Ny, Nz, coef, vsq, Ain, Aout); -} - - -export void -loop_stencil_ispc_tasks(uniform int t0, uniform int t1, - uniform int x0, uniform int x1, - uniform int y0, uniform int y1, - uniform int z0, uniform int z1, - uniform int Nx, uniform int Ny, uniform int Nz, - uniform const double coef[4], - uniform const double vsq[], - uniform double Aeven[], uniform double Aodd[]) -{ - for (uniform int t = t0; t < t1; ++t) { - // Parallelize across cores as well: each task will work on a slice - // of 1 in the z extent of the volume. - if ((t & 1) == 0) - launch[z1-z0] stencil_step_task(x0, x1, y0, y1, z0, Nx, Ny, Nz, - coef, vsq, Aeven, Aodd); - else - launch[z1-z0] stencil_step_task(x0, x1, y0, y1, z0, Nx, Ny, Nz, - coef, vsq, Aodd, Aeven); - - // We need to wait for all of the launched tasks to finish before - // starting the next iteration. - sync; - } -} - - -export void -loop_stencil_ispc(uniform int t0, uniform int t1, - uniform int x0, uniform int x1, - uniform int y0, uniform int y1, - uniform int z0, uniform int z1, - uniform int Nx, uniform int Ny, uniform int Nz, - uniform const double coef[4], - uniform const double vsq[], - uniform double Aeven[], uniform double Aodd[]) -{ - for (uniform int t = t0; t < t1; ++t) { - if ((t & 1) == 0) - stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq, - Aeven, Aodd); - else - stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq, - Aodd, Aeven); - } -} diff --git a/examples_cuda/stencil/stencil_serial.o b/examples_cuda/stencil/stencil_serial.o deleted file mode 100644 index 1fd32c299aeb42b3c7b58279918024d18b00f656..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2360 zcmbtUUrZZy9RFR*pi^_#5n@O-Lrs2(CC&>NRO*Pi>#pv{JPeVd;Fw)^Eg6x2q4zKi zi6kAyOLvS%9(s#!0MXSu48L@ArBKrJ^tUrM>Up z&*%I7eSg2Zk3#C}E{-G4;>c^{z;e8VDzWB_YOJd8Z2k^&Hdr#}WyxQ?SS!Oc zg{HAk7R@w_CLajXMPViYgJ!Ug9K;rcrYWon%u4}iPME<%!9p7h5ScFHPnOat3@A*u zI*n{+;m6Jb5KhuL#uXX2$GAsAlSFm}uf=&26jt07`lS4z;QtmB`1c>*4GMiq5&Q%7 zmb!r{O15YjD`51?9`I}cT0%w2S&@7>A!lV5i`aD93T#;ruLD6v+JcATq*_@vf2PQi zE5O<>HTHx?+s=-c2z zw&OkOz^f@Ksbx9~nM!Jr&Vf`@U(vKBzn}{s1@jYl!bMYzzZk?;z`A$A;b*C9bO}7n z7^Z$aZ=aI7P7$F%WC~<*jf2ZCLFXONc{}W~O*LY)Q3Zro0bY+{bkN-Af0s&U2B@Mx z^I(nB)0MnF1y+F&5RMM7*3x z5K$W&8HtaAZ*)wLi?HVx5_(j7pNJFj;iwqD5*eI`j>PTG9#5PEIDQvvt7s(KzTrL` z@E$hFvtT2>Dve~S9X77wb8C5?gQw>)@TUQb^;_Uyxt#j-l}^wx7ndVns?e=~{rKfS z`gd?n?s*%JSFCd7Y<#-X2|Dhza)rRCo(aZ1$IDsHzR<)oa`27iFgXG8s8(lf(y*Q9 zi0b~)uI?Rk({*J;zAVTSAhCz)`sh2V{+yjSX5$@EUElFnOU$bwf0U~IXuJ1M?{L(8 zW*y+W0t$ysNgpxbN5F<3rBsek0b?5`kZLOdLu~M8y_JZVe2Njg3e40Tz*i09gig Y$9h^Lg!r%~ENr9loMV>v!0+_=5ANaM@c;k- diff --git a/stdlib.ispc b/stdlib.ispc index 25728ed0..fcb61eb4 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -93,7 +93,7 @@ __declspec(safe,cost0) __declspec(safe,cost0) static inline uniform int blockIndex2() { - return __ctaid_y(); + return __ctaid_z(); } /***************/ __declspec(safe,cost0)