From 8d4dd137508ffdbe5b77e88fae3312eed6a5aa0b Mon Sep 17 00:00:00 2001 From: Evghenii Date: Mon, 18 Nov 2013 11:58:19 +0100 Subject: [PATCH] changes --- examples_cuda/stencil/1.s | 175 --- examples_cuda/stencil/2.s | 239 ---- examples_cuda/stencil/3.s | 239 ---- examples_cuda/stencil/Makefile_gpu | 55 + examples_cuda/stencil/__kernels.ptx | 1246 +++++++++++++++++ examples_cuda/stencil/err | 0 examples_cuda/stencil/info | 5 - examples_cuda/stencil/kernel.ptx | 1246 +++++++++++++++++ examples_cuda/stencil/libcudadevrt.a | Bin 0 -> 137338 bytes examples_cuda/stencil/stencil.cpp | 3 +- examples_cuda/stencil/stencil.ispc | 154 +- examples_cuda/stencil/stencil.ptx | 267 ---- examples_cuda/stencil/stencil0.ptx | 224 --- examples_cuda/stencil/stencil1.cubin | Bin 3604 -> 0 bytes examples_cuda/stencil/stencil2.cubin | Bin 3668 -> 0 bytes examples_cuda/stencil/stencil2.ptx | 247 ---- examples_cuda/stencil/stencilX.ispc | 159 +++ examples_cuda/stencil/stencilY.ispc | 126 ++ examples_cuda/stencil/stencil_avx.bc | Bin 12836 -> 0 bytes examples_cuda/stencil/stencil_cu | Bin 50485 -> 25158 bytes examples_cuda/stencil/stencil_cu.bc | Bin 22616 -> 0 bytes examples_cuda/stencil/stencil_cu.cpp | 213 +-- examples_cuda/stencil/stencil_cu.ll | 762 ---------- examples_cuda/stencil/stencil_cu.o | Bin 18464 -> 21784 bytes examples_cuda/stencil/stencil_cu.s | 1134 --------------- examples_cuda/stencil/stencil_cu_avx.bc | Bin 9820 -> 0 bytes examples_cuda/stencil/stencil_cu_avx.s | 214 --- examples_cuda/stencil/stencil_cu_nvptx64.bc | Bin 5256 -> 0 bytes .../stencil/stencil_cu_nvptx64.cubin | Bin 3668 -> 0 bytes examples_cuda/stencil/stencil_cu_nvptx64.ll | 269 ---- examples_cuda/stencil/stencil_ispc.h | 1 - examples_cuda/stencil/stencil_ispc_nvptx64.ll | 974 +++++++++++++ .../stencil/stencil_ispc_nvptx64.ptx | 1246 +++++++++++++++++ examples_cuda/stencil/stencil_nvptx64.bc | Bin 8500 -> 0 bytes examples_cuda/stencil/stencil_orig.cpp | 172 +++ examples_cuda/stencil/stencil_orig.ispc | 172 +++ examples_cuda/stencil/stencil_parallel.cpp | 12 +- examples_cuda/stencil/stencil_serial.o | Bin 0 -> 2360 bytes 38 files changed, 5481 insertions(+), 4073 deletions(-) delete mode 100644 examples_cuda/stencil/1.s delete mode 100644 examples_cuda/stencil/2.s delete mode 100644 examples_cuda/stencil/3.s create mode 100644 examples_cuda/stencil/Makefile_gpu create mode 100644 examples_cuda/stencil/__kernels.ptx delete mode 100644 examples_cuda/stencil/err delete mode 100644 examples_cuda/stencil/info create mode 100644 examples_cuda/stencil/kernel.ptx create mode 100644 examples_cuda/stencil/libcudadevrt.a delete mode 100644 examples_cuda/stencil/stencil.ptx delete mode 100644 examples_cuda/stencil/stencil0.ptx delete mode 100644 examples_cuda/stencil/stencil1.cubin delete mode 100644 examples_cuda/stencil/stencil2.cubin delete mode 100644 examples_cuda/stencil/stencil2.ptx create mode 100644 examples_cuda/stencil/stencilX.ispc create mode 100644 examples_cuda/stencil/stencilY.ispc delete mode 100644 examples_cuda/stencil/stencil_avx.bc delete mode 100644 examples_cuda/stencil/stencil_cu.bc delete mode 100644 examples_cuda/stencil/stencil_cu.ll delete mode 100644 examples_cuda/stencil/stencil_cu.s delete mode 100644 examples_cuda/stencil/stencil_cu_avx.bc delete mode 100644 examples_cuda/stencil/stencil_cu_avx.s delete mode 100644 examples_cuda/stencil/stencil_cu_nvptx64.bc delete mode 100644 examples_cuda/stencil/stencil_cu_nvptx64.cubin delete mode 100644 examples_cuda/stencil/stencil_cu_nvptx64.ll create mode 100644 
examples_cuda/stencil/stencil_ispc_nvptx64.ll create mode 100644 examples_cuda/stencil/stencil_ispc_nvptx64.ptx delete mode 100644 examples_cuda/stencil/stencil_nvptx64.bc create mode 100644 examples_cuda/stencil/stencil_orig.cpp create mode 100644 examples_cuda/stencil/stencil_orig.ispc create mode 100644 examples_cuda/stencil/stencil_serial.o diff --git a/examples_cuda/stencil/1.s b/examples_cuda/stencil/1.s deleted file mode 100644 index d59cb1f9..00000000 --- a/examples_cuda/stencil/1.s +++ /dev/null @@ -1,175 +0,0 @@ - - code for sm_35 - Function : stencil_step_task - .headerflags @"EF_CUDA_SM35 EF_CUDA_PTX_SM(EF_CUDA_SM35)" - /* 0x0880a010a0a01000 */ - /*0008*/ MOV R1, c[0x0][0x44]; /* 0x64c03c00089c0006 */ - /*0010*/ S2R R10, SR_CTAID.X; /* 0x86400000129c002a */ - /*0018*/ MOV R12, c[0x0][0x160]; /* 0x64c03c002c1c0032 */ - /*0020*/ IADD R0, R10, c[0x0][0x150]; /* 0x608000002a1c2802 */ - /*0028*/ IADD R11, R0, 0x1; /* 0xc0800000009c002d */ - /*0030*/ MOV R13, c[0x0][0x164]; /* 0x64c03c002c9c0036 */ - /*0038*/ ISETP.GE.AND P0, PT, R0, R11, PT; /* 0xdb681c00059c001e */ - /* 0x08a0a1ac118d8d8c */ - /*0048*/ LD.E.64 R8, [R12]; /* 0xc5800000001c3020 */ - /*0050*/ LD.E.64 R6, [R12+0x8]; /* 0xc5800000041c3018 */ - /*0058*/ LD.E.64 R4, [R12+0x10]; /* 0xc5800000081c3010 */ - /*0060*/ LD.E.64 R2, [R12+0x18]; /* 0xc58000000c1c3008 */ - /*0068*/ @P0 EXIT ; /* 0x180000000000003c */ - /*0070*/ MOV R11, c[0x0][0x158]; /* 0x64c03c002b1c002e */ - /*0078*/ IMUL R41, R11, c[0x0][0x154]; /* 0x61c018002a9c2ca6 */ - /* 0x08b0a000a010a010 */ - /*0088*/ IADD R11, R10, c[0x0][0x150]; /* 0x608000002a1c282e */ - /*0090*/ SHF.L R40, RZ, 0x1, R41; /* 0xb7c0a400009ffca1 */ - /*0098*/ I2I.S32.S32 R10, -R40; /* 0xe6010000141ce82a */ - /*00a0*/ IADD R49, R11, 0x1; /* 0xc0800000009c2cc5 */ - /*00a8*/ SHF.L R28, RZ, 0x3, R10; /* 0xb7c02800019ffc71 */ - /*00b0*/ MOV R10, c[0x0][0x148]; /* 0x64c03c00291c002a */ - /*00b8*/ ISETP.GE.AND P0, PT, R10, c[0x0][0x14c], PT; /* 0x5b681c00299c281e */ - /* 0x0880acb0a00010ac */ - /*00c8*/ @P0 BRA 0x4f0; /* 0x120000021000003c */ - /*00d0*/ MOV R29, c[0x0][0x148]; /* 0x64c03c00291c0076 */ - /*00d8*/ IMUL R42, R0, R41; /* 0xe1c01800149c00aa */ - /*00e0*/ MOV R10, c[0x0][0x140]; /* 0x64c03c00281c002a */ - /*00e8*/ ISETP.GE.AND P0, PT, R10, c[0x0][0x144], PT; /* 0x5b681c00289c281e */ - /*00f0*/ @P0 BRA 0x4d8; /* 0x12000001f000003c */ - /*00f8*/ MOV R10, c[0x0][0x154]; /* 0x64c03c002a9c002a */ - /* 0x0880888010a0109c */ - /*0108*/ IMAD R44, R29, c[0x0][0x154], R42; /* 0x5108a8002a9c74b2 */ - /*0110*/ SHF.L R11, RZ, 0x1, R10; /* 0xb7c02800009ffc2d */ - /*0118*/ MOV R39, c[0x0][0x140]; /* 0x64c03c00281c009e */ - /*0120*/ IMAD R34, R10, -0x2, R44; /* 0xa908b3ffff1c2889 */ - /*0128*/ IADD R43, R44, R11; /* 0xe0800000059cb0ae */ - /*0130*/ I2I.S32.S32 R10, -R11; /* 0xe6010000059ce82a */ - /*0138*/ IMAD R36, R41, -0x2, R44; /* 0xa908b3ffff1ca491 */ - /* 0x08a0001084108480 */ - /*0148*/ IADD R32, R44, c[0x0][0x154]; /* 0x608000002a9cb082 */ - /*0150*/ IADD R33, R44, R41; /* 0xe0800000149cb086 */ - /*0158*/ IADD R35, R44, R40; /* 0xe0800000141cb08e */ - /*0160*/ IMAD R38, R41, 0x3, R44; /* 0xa108b000019ca499 */ - /*0168*/ SHF.L R47, RZ, 0x3, R10; /* 0xb7c02800019ffcbd */ - /*0170*/ IADD R37, R43, c[0x0][0x154]; /* 0x608000002a9cac96 */ - /*0178*/ S2R R10, SR_TID.X; /* 0x86400000109c002a */ - /* 0x08a0b0a010908c10 */ - /*0188*/ MOV32I R48, 0x8; /* 0x74000000041fc0c2 */ - /*0190*/ IADD R45, R10, R39; /* 0xe0800000139c28b6 */ - /*0198*/ BFE R30, R47, 0x11f; /* 0xc00800008f9cbc79 */ - /*01a0*/ IADD 
R46, R45, R44; /* 0xe0800000161cb4ba */ - /*01a8*/ IADD R14, R32, R45; /* 0xe0800000169c803a */ - /*01b0*/ IMAD R10.CC, R46, R48, c[0x0][0x170]; /* 0x910cc0002e1cb82a */ - /*01b8*/ IMAD.HI.X R11, R46, R48, c[0x0][0x174]; /* 0x9318c0002e9cb82e */ - /* 0x0881cc118c118c10 */ - /*01c8*/ IADD R27, R37, R45; /* 0xe0800000169c946e */ - /*01d0*/ LD.E.64 R12, [R10+-0x8]; /* 0xc5fffffffc1c2830 */ - /*01d8*/ BFE R50, R28, 0x11f; /* 0xc00800008f9c70c9 */ - /*01e0*/ LD.E.64 R24, [R10+0x8]; /* 0xc5800000041c2860 */ - /*01e8*/ ISETP.GE.AND P0, PT, R45, c[0x0][0x144], PT; /* 0x5b681c00289cb41e */ - /*01f0*/ LD.E.64 R18, [R10+-0x18]; /* 0xc5fffffff41c2848 */ - /*01f8*/ DADD R20, R24, R12; /* 0xe3800000061c6052 */ - /* 0x098c10a011ac8188 */ - /*0208*/ LD.E.64 R22, [R10+0x18]; /* 0xc58000000c1c2858 */ - /*0210*/ IMAD R16.CC, R14, R48, c[0x0][0x170]; /* 0x910cc0002e1c3842 */ - /*0218*/ LD.E.64 R12, [R10+-0x10]; /* 0xc5fffffff81c2830 */ - /*0220*/ IMAD.HI.X R17, R14, R48, c[0x0][0x174]; /* 0x9318c0002e9c3846 */ - /*0228*/ IADD R25, R43, R45; /* 0xe0800000169cac66 */ - /*0230*/ LD.E.64 R14, [R16]; /* 0xc5800000001c4038 */ - /*0238*/ DADD R22, R22, R18; /* 0xe3800000091c585a */ - /* 0x0994808c848cb180 */ - /*0248*/ LD.E.64 R18, [R10+0x10]; /* 0xc5800000081c2848 */ - /*0250*/ IMAD R26.CC, R27, R48, c[0x0][0x170]; /* 0x910cc0002e1c6c6a */ - /*0258*/ IMAD.HI.X R27, R27, R48, c[0x0][0x174]; /* 0x9318c0002e9c6c6e */ - /*0260*/ IMAD R24.CC, R25, R48, c[0x0][0x170]; /* 0x910cc0002e1c6462 */ - /*0268*/ DADD R14, R20, R14; /* 0xe3800000071c503a */ - /*0270*/ DADD R20, R18, R12; /* 0xe3800000061c4852 */ - /*0278*/ LD.E.64 R12, [R26]; /* 0xc5800000001c6830 */ - /* 0x08b080118010c080 */ - /*0288*/ IMAD.HI.X R25, R25, R48, c[0x0][0x174]; /* 0x9318c0002e9c6466 */ - /*0290*/ IADD R16.CC, R16, R47; /* 0xe0840000179c4042 */ - /*0298*/ LD.E.64 R18, [R24]; /* 0xc5800000001c6048 */ - /*02a0*/ DADD R12, R22, R12; /* 0xe3800000061c5832 */ - /*02a8*/ IADD.X R17, R17, R30; /* 0xe08040000f1c4446 */ - /*02b0*/ IADD R31, R34, R45; /* 0xe0800000169c887e */ - /*02b8*/ IADD R22.CC, R16, R47; /* 0xe0840000179c405a */ - /* 0x089980818880a010 */ - /*02c8*/ IADD.X R23, R17, R30; /* 0xe08040000f1c445e */ - /*02d0*/ IMAD R26.CC, R31, R48, c[0x0][0x170]; /* 0x910cc0002e1c7c6a */ - /*02d8*/ DADD R20, R20, R18; /* 0xe3800000091c5052 */ - /*02e0*/ LD.E.64 R18, [R16]; /* 0xc5800000001c4048 */ - /*02e8*/ IMAD.HI.X R27, R31, R48, c[0x0][0x174]; /* 0x9318c0002e9c7c6e */ - /*02f0*/ LD.E.64 R24, [R22]; /* 0xc5800000001c5860 */ - /*02f8*/ IADD R51, R33, R45; /* 0xe0800000169c84ce */ - /* 0x088880ac818c11b8 */ - /*0308*/ LD.E.64 R30, [R26]; /* 0xc5800000001c6878 */ - /*0310*/ LD.E.64 R26, [R10]; /* 0xc5800000001c2868 */ - /*0318*/ DADD R14, R14, R18; /* 0xe3800000091c383a */ - /*0320*/ IMAD R18.CC, R51, R48, c[0x0][0x170]; /* 0x910cc0002e1ccc4a */ - /*0328*/ IADD R17, R35, R45; /* 0xe0800000169c8c46 */ - /*0330*/ IMAD.HI.X R19, R51, R48, c[0x0][0x174]; /* 0x9318c0002e9ccc4e */ - /*0338*/ DADD R22, R20, R30; /* 0xe38000000f1c505a */ - /* 0x098c10a0999c1090 */ - /*0348*/ IMAD R16.CC, R17, R48, c[0x0][0x170]; /* 0x910cc0002e1c4442 */ - /*0350*/ LD.E.64 R20, [R18]; /* 0xc5800000001c4850 */ - /*0358*/ DADD R12, R12, R24; /* 0xe38000000c1c3032 */ - /*0360*/ IMAD.HI.X R17, R17, R48, c[0x0][0x174]; /* 0x9318c0002e9c4446 */ - /*0368*/ IADD R18.CC, R18, R28; /* 0xe08400000e1c484a */ - /*0370*/ LD.E.64 R24, [R16]; /* 0xc5800000001c4060 */ - /*0378*/ DADD R20, R14, R20; /* 0xe38000000a1c3852 */ - /* 0x088080b4a18010cc */ - /*0388*/ IADD.X R19, R19, R50; /* 
0xe0804000191c4c4e */ - /*0390*/ LD.E.64 R14, [R18]; /* 0xc5800000001c4838 */ - /*0398*/ DADD R22, R22, R24; /* 0xe38000000c1c585a */ - /*03a0*/ IADD R25, R36, R45; /* 0xe0800000169c9066 */ - /*03a8*/ IMAD R16.CC, R25, R48, c[0x0][0x170]; /* 0x910cc0002e1c6442 */ - /*03b0*/ DADD R20, R20, R14; /* 0xe3800000071c5052 */ - /*03b8*/ IADD R15, R38, R45; /* 0xe0800000169c983e */ - /* 0x09a010b081ac809c */ - /*03c8*/ IMAD.HI.X R17, R25, R48, c[0x0][0x174]; /* 0x9318c0002e9c6446 */ - /*03d0*/ IMAD R14.CC, R15, R48, c[0x0][0x170]; /* 0x910cc0002e1c3c3a */ - /*03d8*/ LD.E.64 R24, [R16]; /* 0xc5800000001c4060 */ - /*03e0*/ IMAD.HI.X R15, R15, R48, c[0x0][0x174]; /* 0x9318c0002e9c3c3e */ - /*03e8*/ IADD R18.CC, R18, R28; /* 0xe08400000e1c484a */ - /*03f0*/ LD.E.64 R30, [R14]; /* 0xc5800000001c3878 */ - /*03f8*/ IADD.X R19, R19, R50; /* 0xe0804000191c4c4e */ - /* 0x08a480a480b58010 */ - /*0408*/ LD.E.64 R50, [R18]; /* 0xc5800000001c48c8 */ - /*0410*/ DMUL R20, R6, R20; /* 0xe40000000a1c1852 */ - /*0418*/ DADD R22, R22, R24; /* 0xe38000000c1c585a */ - /*0420*/ DADD R12, R12, R30; /* 0xe38000000f1c3032 */ - /*0428*/ DFMA R24, R8, R26, R20; /* 0xdb8050000d1c2062 */ - /*0430*/ DFMA R16, R4, R22, R24; /* 0xdb8060000b1c1042 */ - /*0438*/ DADD R12, R12, R50; /* 0xe3800000191c3032 */ - /* 0x08908cb0a010ac80 */ - /*0448*/ DFMA R10, R2, R12, R16; /* 0xdb804000061c082a */ - /*0450*/ @P0 BRA.U 0x4b8; /* 0x120000003000023c */ - /*0458*/ @!P0 MOV32I R17, 0x8; /* 0x740000000423c046 */ - /*0460*/ @!P0 DADD R18, R26, R26; /* 0xe38000000d20684a */ - /*0468*/ @!P0 IMAD R14.CC, R46, R17, c[0x0][0x178]; /* 0x910c44002f20b83a */ - /*0470*/ @!P0 IMAD.HI.X R15, R46, R17, c[0x0][0x17c]; /* 0x931844002fa0b83e */ - /*0478*/ @!P0 IMAD R16.CC, R46, R17, c[0x0][0x168]; /* 0x910c44002d20b842 */ - /* 0x08a180a5dc10bd9c */ - /*0488*/ @!P0 LD.E.64 R12, [R14]; /* 0xc580000000203830 */ - /*0490*/ @!P0 IMAD.HI.X R17, R46, R17, c[0x0][0x16c]; /* 0x931844002da0b846 */ - /*0498*/ @!P0 LD.E.64 R20, [R16]; /* 0xc580000000204050 */ - /*04a0*/ @!P0 DADD R22, R18, -R12; /* 0xe38100000620485a */ - /*04a8*/ @!P0 DFMA R10, R20, R10, R22; /* 0xdb8058000520502a */ - /*04b0*/ @!P0 ST.E.64 [R14], R10; /* 0xe580000000203828 */ - /*04b8*/ IADD R39, R39, 0x20; /* 0xc0800000101c9c9d */ - /* 0x08b0a0b8b0a0b8b0 */ - /*04c8*/ ISETP.LT.AND P0, PT, R39, c[0x0][0x144], PT; /* 0x5b181c00289c9c1e */ - /*04d0*/ @P0 BRA 0x178; /* 0x12007ffe5000003c */ - /*04d8*/ IADD R29, R29, 0x1; /* 0xc0800000009c7475 */ - /*04e0*/ ISETP.LT.AND P0, PT, R29, c[0x0][0x14c], PT; /* 0x5b181c00299c741e */ - /*04e8*/ @P0 BRA 0xe0; /* 0x12007ffdf800003c */ - /*04f0*/ IADD R0, R0, 0x1; /* 0xc0800000009c0001 */ - /*04f8*/ ISETP.LT.AND P0, PT, R0, R49, PT; /* 0xdb181c00189c001e */ - /* 0x0800000000b810b8 */ - /*0508*/ @P0 BRA 0xb0; /* 0x12007ffdd000003c */ - /*0510*/ MOV RZ, RZ; /* 0xe4c03c007f9c03fe */ - /*0518*/ EXIT ; /* 0x18000000001c003c */ - /*0520*/ BRA 0x520; /* 0x12007ffffc1c003c */ - /*0528*/ NOP; /* 0x85800000001c3c02 */ - /*0530*/ NOP; /* 0x85800000001c3c02 */ - /*0538*/ NOP; /* 0x85800000001c3c02 */ - .................................. 
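The deleted listings in this patch (and the PTX added further down) all encode the same stencil_step_task kernel. Judging from the four coefficient loads and the +/-1, +/-2, +/-3 neighbour offsets along x, along the row stride, and along the plane stride, the update is a 19-point leapfrog stencil. A minimal scalar CUDA sketch of that update, using assumed names (A0, A1, vsq, coef, Nx, Ny, Nz) rather than the patch's actual identifiers, is:

// Hypothetical reconstruction of the update performed by stencil_step_task;
// array and parameter names are assumptions, not the patch's identifiers.
__global__ void stencil_step_sketch(int Nx, int Ny, int Nz,
                                    const double *coef, const double *vsq,
                                    const double *A0, double *A1)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    int z = blockIdx.z * blockDim.z + threadIdx.z;
    // Radius-3 neighbours must stay in bounds, so skip the halo cells.
    if (x < 3 || x >= Nx - 3 || y < 3 || y >= Ny - 3 || z < 3 || z >= Nz - 3)
        return;

    const int Nxy = Nx * Ny;
    const int i   = (z * Ny + y) * Nx + x;

    double div = coef[0] * A0[i];
    for (int r = 1; r <= 3; ++r)               // three rings of six axis neighbours
        div += coef[r] * (A0[i + r]       + A0[i - r] +
                          A0[i + r * Nx]  + A0[i - r * Nx] +
                          A0[i + r * Nxy] + A0[i - r * Nxy]);

    // Leapfrog update: A1 holds the previous time step on entry.
    A1[i] = 2.0 * A0[i] - A1[i] + vsq[i] * div;
}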
- - diff --git a/examples_cuda/stencil/2.s b/examples_cuda/stencil/2.s deleted file mode 100644 index 76476d03..00000000 --- a/examples_cuda/stencil/2.s +++ /dev/null @@ -1,239 +0,0 @@ - - code for sm_35 - Function : stencil_step_task - .headerflags @"EF_CUDA_SM35 EF_CUDA_PTX_SM(EF_CUDA_SM35)" - /* 0x0880acb0a0a0a000 */ - /*0008*/ MOV R1, c[0x0][0x44]; /* 0x64c03c00089c0006 */ - /*0010*/ S2R R10, SR_CTAID.X; /* 0x86400000129c002a */ - /*0018*/ IADD R44, R10, c[0x0][0x150]; /* 0x608000002a1c28b2 */ - /*0020*/ IADD R0, R44, 0x1; /* 0xc0800000009cb001 */ - /*0028*/ ISETP.GE.AND P0, PT, R44, R0, PT; /* 0xdb681c00001cb01e */ - /*0030*/ @P0 EXIT ; /* 0x180000000000003c */ - /*0038*/ MOV R11, c[0x0][0x154]; /* 0x64c03c002a9c002e */ - /* 0x0888108010a01080 */ - /*0048*/ IADD R41, R10, c[0x0][0x150]; /* 0x608000002a1c28a6 */ - /*0050*/ MOV R12, c[0x0][0x160]; /* 0x64c03c002c1c0032 */ - /*0058*/ MOV R13, c[0x0][0x164]; /* 0x64c03c002c9c0036 */ - /*0060*/ IMUL R35, R11, c[0x0][0x158]; /* 0x61c018002b1c2c8e */ - /*0068*/ LD.E.64 R8, [R12]; /* 0xc5800000001c3020 */ - /*0070*/ SHF.L R36, RZ, 0x1, R11; /* 0xb7c02c00009ffc91 */ - /*0078*/ MOV R42, c[0x0][0x148]; /* 0x64c03c00291c00aa */ - /* 0x088c80108c108c10 */ - /*0088*/ LD.E.64 R6, [R12+0x8]; /* 0xc5800000041c3018 */ - /*0090*/ IMUL R0, R11, 0x3; /* 0xc1c01800019c2c01 */ - /*0098*/ LD.E.64 R4, [R12+0x10]; /* 0xc5800000081c3010 */ - /*00a0*/ IMUL R18, R11, -0x3; /* 0xc9c01bfffe9c2c49 */ - /*00a8*/ SHF.L R37, RZ, 0x1, R35; /* 0xb7c08c00009ffc95 */ - /*00b0*/ LD.E.64 R2, [R12+0x18]; /* 0xc58000000c1c3008 */ - /*00b8*/ IMUL R19, R35, 0x3; /* 0xc1c01800019c8c4d */ - /* 0x0880acb0a0acb000 */ - /*00c8*/ IMUL R20, R35, -0x3; /* 0xc9c01bfffe9c8c51 */ - /*00d0*/ ISETP.GE.AND P0, PT, R42, c[0x0][0x14c], PT; /* 0x5b681c00299ca81e */ - /*00d8*/ @P0 BRA 0x6d8; /* 0x12000002fc00003c */ - /*00e0*/ MOV R10, c[0x0][0x140]; /* 0x64c03c00281c002a */ - /*00e8*/ ISETP.LT.AND P0, PT, R10, c[0x0][0x144], PT; /* 0x5b181c00289c281e */ - /*00f0*/ @!P0 BRA 0x6d8; /* 0x12000002f020003c */ - /*00f8*/ IMUL R40, R44, R35; /* 0xe1c01800119cb0a2 */ - /* 0x088880108c10a000 */ - /*0108*/ MOV R21, c[0x0][0x148]; /* 0x64c03c00291c0056 */ - /*0110*/ IMAD R39, R21, c[0x0][0x154], R40; /* 0x5108a0002a9c549e */ - /*0118*/ MOV R34, c[0x0][0x140]; /* 0x64c03c00281c008a */ - /*0120*/ IADD R29, R39, R37; /* 0xe0800000129c9c76 */ - /*0128*/ IADD R22, R39, c[0x0][0x154]; /* 0x608000002a9c9c5a */ - /*0130*/ ISUB R32, R39, R37; /* 0xe0880000129c9c82 */ - /*0138*/ IADD R23, R39, R36; /* 0xe0800000121c9c5e */ - /* 0x0880808080108c10 */ - /*0148*/ ISUB R24, R39, c[0x0][0x154]; /* 0x608800002a9c9c62 */ - /*0150*/ IADD R25, R39, R0; /* 0xe0800000001c9c66 */ - /*0158*/ ISUB R26, R39, R36; /* 0xe0880000121c9c6a */ - /*0160*/ IADD R27, R39, R35; /* 0xe0800000119c9c6e */ - /*0168*/ IADD R28, R39, R18; /* 0xe0800000091c9c72 */ - /*0170*/ ISUB R30, R39, R35; /* 0xe0880000119c9c7a */ - /*0178*/ IADD R33, R39, R20; /* 0xe08000000a1c9c86 */ - /* 0x08a0acb0a0a0a000 */ - /*0188*/ IADD R31, R39, R19; /* 0xe0800000099c9c7e */ - /*0190*/ S2R R10, SR_TID.X; /* 0x86400000109c002a */ - /*0198*/ LOP.AND R11, R10, 0x1f; /* 0xc20000000f9c282d */ - /*01a0*/ IADD R43, R11, R34; /* 0xe0800000111c2cae */ - /*01a8*/ ISETP.GE.AND P0, PT, R43, c[0x0][0x144], PT; /* 0x5b681c00289cac1e */ - /*01b0*/ @P0 BRA.U 0x6a0; /* 0x120000027400023c */ - /*01b8*/ @!P0 IADD R10, R39, R43; /* 0xe080000015a09c2a */ - /* 0x08a0108c109c80a0 */ - /*01c8*/ @!P0 SHF.L R38, RZ, 0x3, R10; /* 0xb7c0280001a3fc99 */ - /*01d0*/ @!P0 IADD R10, R38, -0x8; /* 
0xc88003fffc209829 */ - /*01d8*/ @!P0 IADD R11, R38, 0x8; /* 0xc08000000420982d */ - /*01e0*/ @!P0 BFE R12, R10, 0x11f; /* 0xc00800008fa02831 */ - /*01e8*/ @!P0 IADD R54.CC, R10, c[0x0][0x170]; /* 0x608400002e2028da */ - /*01f0*/ @!P0 IADD R10, R38, -0x10; /* 0xc88003fff8209829 */ - /*01f8*/ @!P0 BFE R13, R11, 0x11f; /* 0xc00800008fa02c35 */ - /* 0x08808080a0108c10 */ - /*0208*/ @!P0 IADD.X R55, R12, c[0x0][0x174]; /* 0x608040002ea030de */ - /*0210*/ @!P0 IADD R46.CC, R11, c[0x0][0x170]; /* 0x608400002e202cba */ - /*0218*/ @!P0 IADD R11, R38, 0x10; /* 0xc08000000820982d */ - /*0220*/ @!P0 BFE R14, R10, 0x11f; /* 0xc00800008fa02839 */ - /*0228*/ @!P0 IADD.X R47, R13, c[0x0][0x174]; /* 0x608040002ea034be */ - /*0230*/ @!P0 IADD R48.CC, R10, c[0x0][0x170]; /* 0x608400002e2028c2 */ - /*0238*/ @!P0 IADD R10, R22, R43; /* 0xe080000015a0582a */ - /* 0x08ac108080909410 */ - /*0248*/ @!P0 LD.E.64 R12, [R54]; /* 0xc58000000020d830 */ - /*0250*/ @!P0 BFE R15, R11, 0x11f; /* 0xc00800008fa02c3d */ - /*0258*/ @!P0 LD.E.64 R16, [R46]; /* 0xc58000000020b840 */ - /*0260*/ @!P0 IADD.X R49, R14, c[0x0][0x174]; /* 0x608040002ea038c6 */ - /*0268*/ @!P0 IADD R52.CC, R11, c[0x0][0x170]; /* 0x608400002e202cd2 */ - /*0270*/ @!P0 SHF.L R50, RZ, 0x3, R10; /* 0xb7c0280001a3fcc9 */ - /*0278*/ @!P0 IADD R14, R23, R43; /* 0xe080000015a05c3a */ - /* 0x08908c108c108010 */ - /*0288*/ @!P0 IADD.X R53, R15, c[0x0][0x174]; /* 0x608040002ea03cd6 */ - /*0290*/ @!P0 BFE R51, R50, 0x11f; /* 0xc00800008fa0c8cd */ - /*0298*/ @!P0 IADD R50.CC, R50, c[0x0][0x170]; /* 0x608400002e20c8ca */ - /*02a0*/ @!P0 SHF.L R45, RZ, 0x3, R14; /* 0xb7c0380001a3fcb5 */ - /*02a8*/ @!P0 LD.E.64 R10, [R48]; /* 0xc58000000020c028 */ - /*02b0*/ @!P0 DADD R12, R12, R16; /* 0xe380000008203032 */ - /*02b8*/ @!P0 LD.E.64 R14, [R52]; /* 0xc58000000020d038 */ - /* 0x089c8010b0108c10 */ - /*02c8*/ @!P0 IADD.X R51, R51, c[0x0][0x174]; /* 0x608040002ea0ccce */ - /*02d0*/ @!P0 BFE R17, R45, 0x11f; /* 0xc00800008fa0b445 */ - /*02d8*/ @!P0 IADD R16, R24, R43; /* 0xe080000015a06042 */ - /*02e0*/ @!P0 IADD R46.CC, R45, c[0x0][0x170]; /* 0x608400002e20b4ba */ - /*02e8*/ @!P0 SHF.L R45, RZ, 0x3, R16; /* 0xb7c0400001a3fcb5 */ - /*02f0*/ @!P0 IADD.X R47, R17, c[0x0][0x174]; /* 0x608040002ea044be */ - /*02f8*/ @!P0 LD.E.64 R16, [R50]; /* 0xc58000000020c840 */ - /* 0x08848010a8108080 */ - /*0308*/ @!P0 IADD R54.CC, R45, c[0x0][0x170]; /* 0x608400002e20b4da */ - /*0310*/ @!P0 DADD R48, R10, R14; /* 0xe3800000072028c2 */ - /*0318*/ @!P0 BFE R11, R45, 0x11f; /* 0xc00800008fa0b42d */ - /*0320*/ @!P0 IADD R10, R26, R43; /* 0xe080000015a0682a */ - /*0328*/ @!P0 IADD.X R55, R11, c[0x0][0x174]; /* 0x608040002ea02cde */ - /*0330*/ @!P0 SHF.L R45, RZ, 0x3, R10; /* 0xb7c0280001a3fcb5 */ - /*0338*/ @!P0 LD.E.64 R14, [R46]; /* 0xc58000000020b838 */ - /* 0x0890988010801094 */ - /*0348*/ @!P0 DADD R16, R12, R16; /* 0xe380000008203042 */ - /*0350*/ @!P0 IADD R13, R27, R43; /* 0xe080000015a06c36 */ - /*0358*/ @!P0 LD.E.64 R10, [R54]; /* 0xc58000000020d828 */ - /*0360*/ @!P0 BFE R53, R45, 0x11f; /* 0xc00800008fa0b4d5 */ - /*0368*/ @!P0 IADD R52.CC, R45, c[0x0][0x170]; /* 0x608400002e20b4d2 */ - /*0370*/ @!P0 IADD R12, R29, R43; /* 0xe080000015a07432 */ - /*0378*/ @!P0 SHF.L R13, RZ, 0x3, R13; /* 0xb7c0340001a3fc35 */ - /* 0x0894801094108c10 */ - /*0388*/ @!P0 IADD.X R53, R53, c[0x0][0x174]; /* 0x608040002ea0d4d6 */ - /*0390*/ @!P0 SHF.L R45, RZ, 0x3, R12; /* 0xb7c0300001a3fcb5 */ - /*0398*/ @!P0 BFE R46, R13, 0x11f; /* 0xc00800008fa034b9 */ - /*03a0*/ @!P0 IADD R50.CC, R13, c[0x0][0x170]; /* 
0x608400002e2034ca */ - /*03a8*/ @!P0 LD.E.64 R12, [R52]; /* 0xc58000000020d030 */ - /*03b0*/ @!P0 DADD R16, R16, R10; /* 0xe380000005204042 */ - /*03b8*/ @!P0 BFE R10, R45, 0x11f; /* 0xc00800008fa0b429 */ - /* 0x08a0108c109c8010 */ - /*03c8*/ @!P0 IADD.X R51, R46, c[0x0][0x174]; /* 0x608040002ea0b8ce */ - /*03d0*/ @!P0 IADD R54.CC, R45, c[0x0][0x170]; /* 0x608400002e20b4da */ - /*03d8*/ @!P0 IADD R45, R30, R43; /* 0xe080000015a078b6 */ - /*03e0*/ @!P0 LD.E.64 R46, [R50]; /* 0xc58000000020c8b8 */ - /*03e8*/ @!P0 DADD R14, R48, R14; /* 0xe38000000720c03a */ - /*03f0*/ @!P0 IADD.X R55, R10, c[0x0][0x174]; /* 0x608040002ea028de */ - /*03f8*/ @!P0 SHF.L R48, RZ, 0x3, R45; /* 0xb7c0b40001a3fcc1 */ - /* 0x088480a080108010 */ - /*0408*/ @!P0 IADD R45, R32, R43; /* 0xe080000015a080b6 */ - /*0410*/ @!P0 LD.E.64 R10, [R54]; /* 0xc58000000020d828 */ - /*0418*/ @!P0 BFE R49, R48, 0x11f; /* 0xc00800008fa0c0c5 */ - /*0420*/ @!P0 IADD R48.CC, R48, c[0x0][0x170]; /* 0x608400002e20c0c2 */ - /*0428*/ @!P0 DADD R14, R14, R12; /* 0xe38000000620383a */ - /*0430*/ @!P0 DADD R12, R16, R46; /* 0xe380000017204032 */ - /*0438*/ @!P0 SHF.L R46, RZ, 0x3, R45; /* 0xb7c0b40001a3fcb9 */ - /* 0x0880808010b08010 */ - /*0448*/ @!P0 IADD.X R49, R49, c[0x0][0x174]; /* 0x608040002ea0c4c6 */ - /*0450*/ @!P0 BFE R45, R38, 0x11f; /* 0xc00800008fa098b5 */ - /*0458*/ @!P0 IADD R16.CC, R38, c[0x0][0x170]; /* 0x608400002e209842 */ - /*0460*/ @!P0 IADD.X R17, R45, c[0x0][0x174]; /* 0x608040002ea0b446 */ - /*0468*/ @!P0 LD.E.64 R50, [R48]; /* 0xc58000000020c0c8 */ - /*0470*/ @!P0 DADD R14, R14, R10; /* 0xe38000000520383a */ - /*0478*/ @!P0 BFE R10, R46, 0x11f; /* 0xc00800008fa0b829 */ - /* 0x0880bc109c1080b0 */ - /*0488*/ @!P0 IADD R46.CC, R46, c[0x0][0x170]; /* 0x608400002e20b8ba */ - /*0490*/ @!P0 IADD.X R47, R10, c[0x0][0x174]; /* 0x608040002ea028be */ - /*0498*/ @!P0 LD.E.64 R10, [R16]; /* 0xc580000000204028 */ - /*04a0*/ @!P0 IADD R48, R38, -0x18; /* 0xc88003fff42098c1 */ - /*04a8*/ @!P0 LD.E.64 R52, [R46]; /* 0xc58000000020b8d0 */ - /*04b0*/ @!P0 DADD R12, R12, R50; /* 0xe380000019203032 */ - /*04b8*/ @!P0 DMUL R50, R8, R10; /* 0xe4000000052020ca */ - /* 0x08b08010b01080a0 */ - /*04c8*/ @!P0 IADD R46, R38, 0x18; /* 0xc08000000c2098b9 */ - /*04d0*/ @!P0 DFMA R16, R6, R12, R50; /* 0xdb80c80006201842 */ - /*04d8*/ @!P0 BFE R13, R48, 0x11f; /* 0xc00800008fa0c035 */ - /*04e0*/ @!P0 IADD R12.CC, R48, c[0x0][0x170]; /* 0x608400002e20c032 */ - /*04e8*/ @!P0 BFE R47, R46, 0x11f; /* 0xc00800008fa0b8bd */ - /*04f0*/ @!P0 IADD.X R13, R13, c[0x0][0x174]; /* 0x608040002ea03436 */ - /*04f8*/ @!P0 IADD R46.CC, R46, c[0x0][0x170]; /* 0x608400002e20b8ba */ - /* 0x08a0a080dc109c80 */ - /*0508*/ @!P0 IADD.X R47, R47, c[0x0][0x174]; /* 0x608040002ea0bcbe */ - /*0510*/ @!P0 LD.E.64 R48, [R12]; /* 0xc5800000002030c0 */ - /*0518*/ @!P0 LD.E.64 R50, [R46]; /* 0xc58000000020b8c8 */ - /*0520*/ @!P0 DADD R14, R14, R52; /* 0xe38000001a20383a */ - /*0528*/ @!P0 DADD R12, R48, R50; /* 0xe38000001920c032 */ - /*0530*/ @!P0 IADD R48, R25, R43; /* 0xe080000015a064c2 */ - /*0538*/ @!P0 SHF.L R46, RZ, 0x3, R48; /* 0xb7c0c00001a3fcb9 */ - /* 0x08a080dc10a0b010 */ - /*0548*/ @!P0 BFE R47, R46, 0x11f; /* 0xc00800008fa0b8bd */ - /*0550*/ @!P0 IADD R46.CC, R46, c[0x0][0x170]; /* 0x608400002e20b8ba */ - /*0558*/ @!P0 IADD.X R47, R47, c[0x0][0x174]; /* 0x608040002ea0bcbe */ - /*0560*/ @!P0 LD.E.64 R48, [R46]; /* 0xc58000000020b8c0 */ - /*0568*/ @!P0 DADD R10, R10, R10; /* 0xe38000000520282a */ - /*0570*/ @!P0 DADD R12, R12, R48; /* 0xe380000018203032 */ - /*0578*/ @!P0 
IADD R48, R28, R43; /* 0xe080000015a070c2 */ - /* 0x08a080dca0b010a0 */ - /*0588*/ @!P0 SHF.L R46, RZ, 0x3, R48; /* 0xb7c0c00001a3fcb9 */ - /*0590*/ @!P0 BFE R47, R46, 0x11f; /* 0xc00800008fa0b8bd */ - /*0598*/ @!P0 IADD R46.CC, R46, c[0x0][0x170]; /* 0x608400002e20b8ba */ - /*05a0*/ @!P0 IADD.X R47, R47, c[0x0][0x174]; /* 0x608040002ea0bcbe */ - /*05a8*/ @!P0 LD.E.64 R48, [R46]; /* 0xc58000000020b8c0 */ - /*05b0*/ @!P0 DADD R12, R12, R48; /* 0xe380000018203032 */ - /*05b8*/ @!P0 IADD R48, R31, R43; /* 0xe080000015a07cc2 */ - /* 0x0880a010b010a010 */ - /*05c8*/ @!P0 IADD R43, R33, R43; /* 0xe080000015a084ae */ - /*05d0*/ @!P0 SHF.L R46, RZ, 0x3, R48; /* 0xb7c0c00001a3fcb9 */ - /*05d8*/ @!P0 BFE R47, R46, 0x11f; /* 0xc00800008fa0b8bd */ - /*05e0*/ @!P0 IADD R48.CC, R46, c[0x0][0x170]; /* 0x608400002e20b8c2 */ - /*05e8*/ @!P0 IADD.X R49, R47, c[0x0][0x174]; /* 0x608040002ea0bcc6 */ - /*05f0*/ @!P0 SHF.L R43, RZ, 0x3, R43; /* 0xb7c0ac0001a3fcad */ - /*05f8*/ @!P0 LD.E.64 R46, [R48]; /* 0xc58000000020c0b8 */ - /* 0x0880909c80a080d8 */ - /*0608*/ @!P0 IADD R52.CC, R43, c[0x0][0x170]; /* 0x608400002e20acd2 */ - /*0610*/ @!P0 DADD R46, R12, R46; /* 0xe3800000172030ba */ - /*0618*/ @!P0 BFE R12, R43, 0x11f; /* 0xc00800008fa0ac31 */ - /*0620*/ @!P0 IADD.X R53, R12, c[0x0][0x174]; /* 0x608040002ea030d6 */ - /*0628*/ @!P0 IADD R12.CC, R38, c[0x0][0x178]; /* 0x608400002f209832 */ - /*0630*/ @!P0 LD.E.64 R48, [R52]; /* 0xc58000000020d0c0 */ - /*0638*/ @!P0 IADD.X R13, R45, c[0x0][0x17c]; /* 0x608040002fa0b436 */ - /* 0x08cc8c10a48090b0 */ - /*0648*/ @!P0 IADD R50.CC, R38, c[0x0][0x168]; /* 0x608400002d2098ca */ - /*0650*/ @!P0 IADD.X R51, R45, c[0x0][0x16c]; /* 0x608040002da0b4ce */ - /*0658*/ @!P0 DADD R46, R46, R48; /* 0xe38000001820b8ba */ - /*0660*/ @!P0 DFMA R48, R4, R14, R16; /* 0xdb804000072010c2 */ - /*0668*/ @!P0 LD.E.64 R16, [R12]; /* 0xc580000000203040 */ - /*0670*/ @!P0 DFMA R48, R2, R46, R48; /* 0xdb80c000172008c2 */ - /*0678*/ @!P0 LD.E.64 R14, [R50]; /* 0xc58000000020c838 */ - /* 0x08a0b8b0a000a4a4 */ - /*0688*/ @!P0 DADD R10, R10, -R16; /* 0xe38100000820282a */ - /*0690*/ @!P0 DFMA R10, R48, R14, R10; /* 0xdb8028000720c02a */ - /*0698*/ @!P0 ST.E.64 [R12], R10; /* 0xe580000000203028 */ - /*06a0*/ IADD R34, R34, 0x20; /* 0xc0800000101c8889 */ - /*06a8*/ ISETP.LT.AND P0, PT, R34, c[0x0][0x144], PT; /* 0x5b181c00289c881e */ - /*06b0*/ @P0 BRA 0x190; /* 0x12007ffd6c00003c */ - /*06b8*/ IADD R21, R21, 0x1; /* 0xc0800000009c5455 */ - /* 0x08b810b8b010b8b0 */ - /*06c8*/ ISETP.EQ.AND P0, PT, R21, c[0x0][0x14c], PT; /* 0x5b281c00299c541e */ - /*06d0*/ @!P0 BRA 0x110; /* 0x12007ffd1c20003c */ - /*06d8*/ ISETP.NE.AND P0, PT, R44, R41, PT; /* 0xdb581c00149cb01e */ - /*06e0*/ IADD R44, R44, 0x1; /* 0xc0800000009cb0b1 */ - /*06e8*/ @P0 BRA 0xd0; /* 0x12007ffcf000003c */ - /*06f0*/ MOV RZ, RZ; /* 0xe4c03c007f9c03fe */ - /*06f8*/ EXIT ; /* 0x18000000001c003c */ - /*0700*/ BRA 0x700; /* 0x12007ffffc1c003c */ - /*0708*/ NOP; /* 0x85800000001c3c02 */ - /*0710*/ NOP; /* 0x85800000001c3c02 */ - /*0718*/ NOP; /* 0x85800000001c3c02 */ - /*0720*/ NOP; /* 0x85800000001c3c02 */ - /*0728*/ NOP; /* 0x85800000001c3c02 */ - /*0730*/ NOP; /* 0x85800000001c3c02 */ - /*0738*/ NOP; /* 0x85800000001c3c02 */ - .................................. 
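All three deleted .s files are cuobjdump disassemblies of the same ISPC kernel built for the nvptx64 target; they differ only in scheduling and register allocation (2.s and 3.s carry identical contents, as their matching blob hash shows). The execution-model mapping is visible in the code: the task index comes from SR_CTAID.X, the 32-wide gang maps onto a warp (2.s masks TID.X with 0x1f to get the program index), and the innermost foreach dimension advances in steps of 0x20, i.e. by programCount. A rough CUDA analogue of that strided-row pattern, with made-up names and a deliberately simple loop body, is:

// Illustrative only: shows the warp-as-gang mapping used by the nvptx64 target,
// not the patch's stencil kernel.  Names (in, out, Nx, x0/x1, y0/y1) are hypothetical.
__global__ void gang_strided_rows(int x0, int x1, int y0, int y1,
                                  int Nx, const double *in, double *out)
{
    int lane = threadIdx.x & 31;                  // programIndex within the 32-wide gang
    for (int y = y0; y < y1; ++y)                 // uniform outer loop, executed per warp
        for (int x = x0 + lane; x < x1; x += 32)  // foreach() advances by programCount
            out[y * Nx + x] = 0.5 * (in[y * Nx + x - 1] + in[y * Nx + x + 1]);
}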
- - diff --git a/examples_cuda/stencil/3.s b/examples_cuda/stencil/3.s deleted file mode 100644 index 76476d03..00000000 --- a/examples_cuda/stencil/3.s +++ /dev/null @@ -1,239 +0,0 @@ - - code for sm_35 - Function : stencil_step_task - .headerflags @"EF_CUDA_SM35 EF_CUDA_PTX_SM(EF_CUDA_SM35)" - /* 0x0880acb0a0a0a000 */ - /*0008*/ MOV R1, c[0x0][0x44]; /* 0x64c03c00089c0006 */ - /*0010*/ S2R R10, SR_CTAID.X; /* 0x86400000129c002a */ - /*0018*/ IADD R44, R10, c[0x0][0x150]; /* 0x608000002a1c28b2 */ - /*0020*/ IADD R0, R44, 0x1; /* 0xc0800000009cb001 */ - /*0028*/ ISETP.GE.AND P0, PT, R44, R0, PT; /* 0xdb681c00001cb01e */ - /*0030*/ @P0 EXIT ; /* 0x180000000000003c */ - /*0038*/ MOV R11, c[0x0][0x154]; /* 0x64c03c002a9c002e */ - /* 0x0888108010a01080 */ - /*0048*/ IADD R41, R10, c[0x0][0x150]; /* 0x608000002a1c28a6 */ - /*0050*/ MOV R12, c[0x0][0x160]; /* 0x64c03c002c1c0032 */ - /*0058*/ MOV R13, c[0x0][0x164]; /* 0x64c03c002c9c0036 */ - /*0060*/ IMUL R35, R11, c[0x0][0x158]; /* 0x61c018002b1c2c8e */ - /*0068*/ LD.E.64 R8, [R12]; /* 0xc5800000001c3020 */ - /*0070*/ SHF.L R36, RZ, 0x1, R11; /* 0xb7c02c00009ffc91 */ - /*0078*/ MOV R42, c[0x0][0x148]; /* 0x64c03c00291c00aa */ - /* 0x088c80108c108c10 */ - /*0088*/ LD.E.64 R6, [R12+0x8]; /* 0xc5800000041c3018 */ - /*0090*/ IMUL R0, R11, 0x3; /* 0xc1c01800019c2c01 */ - /*0098*/ LD.E.64 R4, [R12+0x10]; /* 0xc5800000081c3010 */ - /*00a0*/ IMUL R18, R11, -0x3; /* 0xc9c01bfffe9c2c49 */ - /*00a8*/ SHF.L R37, RZ, 0x1, R35; /* 0xb7c08c00009ffc95 */ - /*00b0*/ LD.E.64 R2, [R12+0x18]; /* 0xc58000000c1c3008 */ - /*00b8*/ IMUL R19, R35, 0x3; /* 0xc1c01800019c8c4d */ - /* 0x0880acb0a0acb000 */ - /*00c8*/ IMUL R20, R35, -0x3; /* 0xc9c01bfffe9c8c51 */ - /*00d0*/ ISETP.GE.AND P0, PT, R42, c[0x0][0x14c], PT; /* 0x5b681c00299ca81e */ - /*00d8*/ @P0 BRA 0x6d8; /* 0x12000002fc00003c */ - /*00e0*/ MOV R10, c[0x0][0x140]; /* 0x64c03c00281c002a */ - /*00e8*/ ISETP.LT.AND P0, PT, R10, c[0x0][0x144], PT; /* 0x5b181c00289c281e */ - /*00f0*/ @!P0 BRA 0x6d8; /* 0x12000002f020003c */ - /*00f8*/ IMUL R40, R44, R35; /* 0xe1c01800119cb0a2 */ - /* 0x088880108c10a000 */ - /*0108*/ MOV R21, c[0x0][0x148]; /* 0x64c03c00291c0056 */ - /*0110*/ IMAD R39, R21, c[0x0][0x154], R40; /* 0x5108a0002a9c549e */ - /*0118*/ MOV R34, c[0x0][0x140]; /* 0x64c03c00281c008a */ - /*0120*/ IADD R29, R39, R37; /* 0xe0800000129c9c76 */ - /*0128*/ IADD R22, R39, c[0x0][0x154]; /* 0x608000002a9c9c5a */ - /*0130*/ ISUB R32, R39, R37; /* 0xe0880000129c9c82 */ - /*0138*/ IADD R23, R39, R36; /* 0xe0800000121c9c5e */ - /* 0x0880808080108c10 */ - /*0148*/ ISUB R24, R39, c[0x0][0x154]; /* 0x608800002a9c9c62 */ - /*0150*/ IADD R25, R39, R0; /* 0xe0800000001c9c66 */ - /*0158*/ ISUB R26, R39, R36; /* 0xe0880000121c9c6a */ - /*0160*/ IADD R27, R39, R35; /* 0xe0800000119c9c6e */ - /*0168*/ IADD R28, R39, R18; /* 0xe0800000091c9c72 */ - /*0170*/ ISUB R30, R39, R35; /* 0xe0880000119c9c7a */ - /*0178*/ IADD R33, R39, R20; /* 0xe08000000a1c9c86 */ - /* 0x08a0acb0a0a0a000 */ - /*0188*/ IADD R31, R39, R19; /* 0xe0800000099c9c7e */ - /*0190*/ S2R R10, SR_TID.X; /* 0x86400000109c002a */ - /*0198*/ LOP.AND R11, R10, 0x1f; /* 0xc20000000f9c282d */ - /*01a0*/ IADD R43, R11, R34; /* 0xe0800000111c2cae */ - /*01a8*/ ISETP.GE.AND P0, PT, R43, c[0x0][0x144], PT; /* 0x5b681c00289cac1e */ - /*01b0*/ @P0 BRA.U 0x6a0; /* 0x120000027400023c */ - /*01b8*/ @!P0 IADD R10, R39, R43; /* 0xe080000015a09c2a */ - /* 0x08a0108c109c80a0 */ - /*01c8*/ @!P0 SHF.L R38, RZ, 0x3, R10; /* 0xb7c0280001a3fc99 */ - /*01d0*/ @!P0 IADD R10, R38, -0x8; /* 
0xc88003fffc209829 */ - /*01d8*/ @!P0 IADD R11, R38, 0x8; /* 0xc08000000420982d */ - /*01e0*/ @!P0 BFE R12, R10, 0x11f; /* 0xc00800008fa02831 */ - /*01e8*/ @!P0 IADD R54.CC, R10, c[0x0][0x170]; /* 0x608400002e2028da */ - /*01f0*/ @!P0 IADD R10, R38, -0x10; /* 0xc88003fff8209829 */ - /*01f8*/ @!P0 BFE R13, R11, 0x11f; /* 0xc00800008fa02c35 */ - /* 0x08808080a0108c10 */ - /*0208*/ @!P0 IADD.X R55, R12, c[0x0][0x174]; /* 0x608040002ea030de */ - /*0210*/ @!P0 IADD R46.CC, R11, c[0x0][0x170]; /* 0x608400002e202cba */ - /*0218*/ @!P0 IADD R11, R38, 0x10; /* 0xc08000000820982d */ - /*0220*/ @!P0 BFE R14, R10, 0x11f; /* 0xc00800008fa02839 */ - /*0228*/ @!P0 IADD.X R47, R13, c[0x0][0x174]; /* 0x608040002ea034be */ - /*0230*/ @!P0 IADD R48.CC, R10, c[0x0][0x170]; /* 0x608400002e2028c2 */ - /*0238*/ @!P0 IADD R10, R22, R43; /* 0xe080000015a0582a */ - /* 0x08ac108080909410 */ - /*0248*/ @!P0 LD.E.64 R12, [R54]; /* 0xc58000000020d830 */ - /*0250*/ @!P0 BFE R15, R11, 0x11f; /* 0xc00800008fa02c3d */ - /*0258*/ @!P0 LD.E.64 R16, [R46]; /* 0xc58000000020b840 */ - /*0260*/ @!P0 IADD.X R49, R14, c[0x0][0x174]; /* 0x608040002ea038c6 */ - /*0268*/ @!P0 IADD R52.CC, R11, c[0x0][0x170]; /* 0x608400002e202cd2 */ - /*0270*/ @!P0 SHF.L R50, RZ, 0x3, R10; /* 0xb7c0280001a3fcc9 */ - /*0278*/ @!P0 IADD R14, R23, R43; /* 0xe080000015a05c3a */ - /* 0x08908c108c108010 */ - /*0288*/ @!P0 IADD.X R53, R15, c[0x0][0x174]; /* 0x608040002ea03cd6 */ - /*0290*/ @!P0 BFE R51, R50, 0x11f; /* 0xc00800008fa0c8cd */ - /*0298*/ @!P0 IADD R50.CC, R50, c[0x0][0x170]; /* 0x608400002e20c8ca */ - /*02a0*/ @!P0 SHF.L R45, RZ, 0x3, R14; /* 0xb7c0380001a3fcb5 */ - /*02a8*/ @!P0 LD.E.64 R10, [R48]; /* 0xc58000000020c028 */ - /*02b0*/ @!P0 DADD R12, R12, R16; /* 0xe380000008203032 */ - /*02b8*/ @!P0 LD.E.64 R14, [R52]; /* 0xc58000000020d038 */ - /* 0x089c8010b0108c10 */ - /*02c8*/ @!P0 IADD.X R51, R51, c[0x0][0x174]; /* 0x608040002ea0ccce */ - /*02d0*/ @!P0 BFE R17, R45, 0x11f; /* 0xc00800008fa0b445 */ - /*02d8*/ @!P0 IADD R16, R24, R43; /* 0xe080000015a06042 */ - /*02e0*/ @!P0 IADD R46.CC, R45, c[0x0][0x170]; /* 0x608400002e20b4ba */ - /*02e8*/ @!P0 SHF.L R45, RZ, 0x3, R16; /* 0xb7c0400001a3fcb5 */ - /*02f0*/ @!P0 IADD.X R47, R17, c[0x0][0x174]; /* 0x608040002ea044be */ - /*02f8*/ @!P0 LD.E.64 R16, [R50]; /* 0xc58000000020c840 */ - /* 0x08848010a8108080 */ - /*0308*/ @!P0 IADD R54.CC, R45, c[0x0][0x170]; /* 0x608400002e20b4da */ - /*0310*/ @!P0 DADD R48, R10, R14; /* 0xe3800000072028c2 */ - /*0318*/ @!P0 BFE R11, R45, 0x11f; /* 0xc00800008fa0b42d */ - /*0320*/ @!P0 IADD R10, R26, R43; /* 0xe080000015a0682a */ - /*0328*/ @!P0 IADD.X R55, R11, c[0x0][0x174]; /* 0x608040002ea02cde */ - /*0330*/ @!P0 SHF.L R45, RZ, 0x3, R10; /* 0xb7c0280001a3fcb5 */ - /*0338*/ @!P0 LD.E.64 R14, [R46]; /* 0xc58000000020b838 */ - /* 0x0890988010801094 */ - /*0348*/ @!P0 DADD R16, R12, R16; /* 0xe380000008203042 */ - /*0350*/ @!P0 IADD R13, R27, R43; /* 0xe080000015a06c36 */ - /*0358*/ @!P0 LD.E.64 R10, [R54]; /* 0xc58000000020d828 */ - /*0360*/ @!P0 BFE R53, R45, 0x11f; /* 0xc00800008fa0b4d5 */ - /*0368*/ @!P0 IADD R52.CC, R45, c[0x0][0x170]; /* 0x608400002e20b4d2 */ - /*0370*/ @!P0 IADD R12, R29, R43; /* 0xe080000015a07432 */ - /*0378*/ @!P0 SHF.L R13, RZ, 0x3, R13; /* 0xb7c0340001a3fc35 */ - /* 0x0894801094108c10 */ - /*0388*/ @!P0 IADD.X R53, R53, c[0x0][0x174]; /* 0x608040002ea0d4d6 */ - /*0390*/ @!P0 SHF.L R45, RZ, 0x3, R12; /* 0xb7c0300001a3fcb5 */ - /*0398*/ @!P0 BFE R46, R13, 0x11f; /* 0xc00800008fa034b9 */ - /*03a0*/ @!P0 IADD R50.CC, R13, c[0x0][0x170]; /* 
0x608400002e2034ca */ - /*03a8*/ @!P0 LD.E.64 R12, [R52]; /* 0xc58000000020d030 */ - /*03b0*/ @!P0 DADD R16, R16, R10; /* 0xe380000005204042 */ - /*03b8*/ @!P0 BFE R10, R45, 0x11f; /* 0xc00800008fa0b429 */ - /* 0x08a0108c109c8010 */ - /*03c8*/ @!P0 IADD.X R51, R46, c[0x0][0x174]; /* 0x608040002ea0b8ce */ - /*03d0*/ @!P0 IADD R54.CC, R45, c[0x0][0x170]; /* 0x608400002e20b4da */ - /*03d8*/ @!P0 IADD R45, R30, R43; /* 0xe080000015a078b6 */ - /*03e0*/ @!P0 LD.E.64 R46, [R50]; /* 0xc58000000020c8b8 */ - /*03e8*/ @!P0 DADD R14, R48, R14; /* 0xe38000000720c03a */ - /*03f0*/ @!P0 IADD.X R55, R10, c[0x0][0x174]; /* 0x608040002ea028de */ - /*03f8*/ @!P0 SHF.L R48, RZ, 0x3, R45; /* 0xb7c0b40001a3fcc1 */ - /* 0x088480a080108010 */ - /*0408*/ @!P0 IADD R45, R32, R43; /* 0xe080000015a080b6 */ - /*0410*/ @!P0 LD.E.64 R10, [R54]; /* 0xc58000000020d828 */ - /*0418*/ @!P0 BFE R49, R48, 0x11f; /* 0xc00800008fa0c0c5 */ - /*0420*/ @!P0 IADD R48.CC, R48, c[0x0][0x170]; /* 0x608400002e20c0c2 */ - /*0428*/ @!P0 DADD R14, R14, R12; /* 0xe38000000620383a */ - /*0430*/ @!P0 DADD R12, R16, R46; /* 0xe380000017204032 */ - /*0438*/ @!P0 SHF.L R46, RZ, 0x3, R45; /* 0xb7c0b40001a3fcb9 */ - /* 0x0880808010b08010 */ - /*0448*/ @!P0 IADD.X R49, R49, c[0x0][0x174]; /* 0x608040002ea0c4c6 */ - /*0450*/ @!P0 BFE R45, R38, 0x11f; /* 0xc00800008fa098b5 */ - /*0458*/ @!P0 IADD R16.CC, R38, c[0x0][0x170]; /* 0x608400002e209842 */ - /*0460*/ @!P0 IADD.X R17, R45, c[0x0][0x174]; /* 0x608040002ea0b446 */ - /*0468*/ @!P0 LD.E.64 R50, [R48]; /* 0xc58000000020c0c8 */ - /*0470*/ @!P0 DADD R14, R14, R10; /* 0xe38000000520383a */ - /*0478*/ @!P0 BFE R10, R46, 0x11f; /* 0xc00800008fa0b829 */ - /* 0x0880bc109c1080b0 */ - /*0488*/ @!P0 IADD R46.CC, R46, c[0x0][0x170]; /* 0x608400002e20b8ba */ - /*0490*/ @!P0 IADD.X R47, R10, c[0x0][0x174]; /* 0x608040002ea028be */ - /*0498*/ @!P0 LD.E.64 R10, [R16]; /* 0xc580000000204028 */ - /*04a0*/ @!P0 IADD R48, R38, -0x18; /* 0xc88003fff42098c1 */ - /*04a8*/ @!P0 LD.E.64 R52, [R46]; /* 0xc58000000020b8d0 */ - /*04b0*/ @!P0 DADD R12, R12, R50; /* 0xe380000019203032 */ - /*04b8*/ @!P0 DMUL R50, R8, R10; /* 0xe4000000052020ca */ - /* 0x08b08010b01080a0 */ - /*04c8*/ @!P0 IADD R46, R38, 0x18; /* 0xc08000000c2098b9 */ - /*04d0*/ @!P0 DFMA R16, R6, R12, R50; /* 0xdb80c80006201842 */ - /*04d8*/ @!P0 BFE R13, R48, 0x11f; /* 0xc00800008fa0c035 */ - /*04e0*/ @!P0 IADD R12.CC, R48, c[0x0][0x170]; /* 0x608400002e20c032 */ - /*04e8*/ @!P0 BFE R47, R46, 0x11f; /* 0xc00800008fa0b8bd */ - /*04f0*/ @!P0 IADD.X R13, R13, c[0x0][0x174]; /* 0x608040002ea03436 */ - /*04f8*/ @!P0 IADD R46.CC, R46, c[0x0][0x170]; /* 0x608400002e20b8ba */ - /* 0x08a0a080dc109c80 */ - /*0508*/ @!P0 IADD.X R47, R47, c[0x0][0x174]; /* 0x608040002ea0bcbe */ - /*0510*/ @!P0 LD.E.64 R48, [R12]; /* 0xc5800000002030c0 */ - /*0518*/ @!P0 LD.E.64 R50, [R46]; /* 0xc58000000020b8c8 */ - /*0520*/ @!P0 DADD R14, R14, R52; /* 0xe38000001a20383a */ - /*0528*/ @!P0 DADD R12, R48, R50; /* 0xe38000001920c032 */ - /*0530*/ @!P0 IADD R48, R25, R43; /* 0xe080000015a064c2 */ - /*0538*/ @!P0 SHF.L R46, RZ, 0x3, R48; /* 0xb7c0c00001a3fcb9 */ - /* 0x08a080dc10a0b010 */ - /*0548*/ @!P0 BFE R47, R46, 0x11f; /* 0xc00800008fa0b8bd */ - /*0550*/ @!P0 IADD R46.CC, R46, c[0x0][0x170]; /* 0x608400002e20b8ba */ - /*0558*/ @!P0 IADD.X R47, R47, c[0x0][0x174]; /* 0x608040002ea0bcbe */ - /*0560*/ @!P0 LD.E.64 R48, [R46]; /* 0xc58000000020b8c0 */ - /*0568*/ @!P0 DADD R10, R10, R10; /* 0xe38000000520282a */ - /*0570*/ @!P0 DADD R12, R12, R48; /* 0xe380000018203032 */ - /*0578*/ @!P0 
IADD R48, R28, R43; /* 0xe080000015a070c2 */ - /* 0x08a080dca0b010a0 */ - /*0588*/ @!P0 SHF.L R46, RZ, 0x3, R48; /* 0xb7c0c00001a3fcb9 */ - /*0590*/ @!P0 BFE R47, R46, 0x11f; /* 0xc00800008fa0b8bd */ - /*0598*/ @!P0 IADD R46.CC, R46, c[0x0][0x170]; /* 0x608400002e20b8ba */ - /*05a0*/ @!P0 IADD.X R47, R47, c[0x0][0x174]; /* 0x608040002ea0bcbe */ - /*05a8*/ @!P0 LD.E.64 R48, [R46]; /* 0xc58000000020b8c0 */ - /*05b0*/ @!P0 DADD R12, R12, R48; /* 0xe380000018203032 */ - /*05b8*/ @!P0 IADD R48, R31, R43; /* 0xe080000015a07cc2 */ - /* 0x0880a010b010a010 */ - /*05c8*/ @!P0 IADD R43, R33, R43; /* 0xe080000015a084ae */ - /*05d0*/ @!P0 SHF.L R46, RZ, 0x3, R48; /* 0xb7c0c00001a3fcb9 */ - /*05d8*/ @!P0 BFE R47, R46, 0x11f; /* 0xc00800008fa0b8bd */ - /*05e0*/ @!P0 IADD R48.CC, R46, c[0x0][0x170]; /* 0x608400002e20b8c2 */ - /*05e8*/ @!P0 IADD.X R49, R47, c[0x0][0x174]; /* 0x608040002ea0bcc6 */ - /*05f0*/ @!P0 SHF.L R43, RZ, 0x3, R43; /* 0xb7c0ac0001a3fcad */ - /*05f8*/ @!P0 LD.E.64 R46, [R48]; /* 0xc58000000020c0b8 */ - /* 0x0880909c80a080d8 */ - /*0608*/ @!P0 IADD R52.CC, R43, c[0x0][0x170]; /* 0x608400002e20acd2 */ - /*0610*/ @!P0 DADD R46, R12, R46; /* 0xe3800000172030ba */ - /*0618*/ @!P0 BFE R12, R43, 0x11f; /* 0xc00800008fa0ac31 */ - /*0620*/ @!P0 IADD.X R53, R12, c[0x0][0x174]; /* 0x608040002ea030d6 */ - /*0628*/ @!P0 IADD R12.CC, R38, c[0x0][0x178]; /* 0x608400002f209832 */ - /*0630*/ @!P0 LD.E.64 R48, [R52]; /* 0xc58000000020d0c0 */ - /*0638*/ @!P0 IADD.X R13, R45, c[0x0][0x17c]; /* 0x608040002fa0b436 */ - /* 0x08cc8c10a48090b0 */ - /*0648*/ @!P0 IADD R50.CC, R38, c[0x0][0x168]; /* 0x608400002d2098ca */ - /*0650*/ @!P0 IADD.X R51, R45, c[0x0][0x16c]; /* 0x608040002da0b4ce */ - /*0658*/ @!P0 DADD R46, R46, R48; /* 0xe38000001820b8ba */ - /*0660*/ @!P0 DFMA R48, R4, R14, R16; /* 0xdb804000072010c2 */ - /*0668*/ @!P0 LD.E.64 R16, [R12]; /* 0xc580000000203040 */ - /*0670*/ @!P0 DFMA R48, R2, R46, R48; /* 0xdb80c000172008c2 */ - /*0678*/ @!P0 LD.E.64 R14, [R50]; /* 0xc58000000020c838 */ - /* 0x08a0b8b0a000a4a4 */ - /*0688*/ @!P0 DADD R10, R10, -R16; /* 0xe38100000820282a */ - /*0690*/ @!P0 DFMA R10, R48, R14, R10; /* 0xdb8028000720c02a */ - /*0698*/ @!P0 ST.E.64 [R12], R10; /* 0xe580000000203028 */ - /*06a0*/ IADD R34, R34, 0x20; /* 0xc0800000101c8889 */ - /*06a8*/ ISETP.LT.AND P0, PT, R34, c[0x0][0x144], PT; /* 0x5b181c00289c881e */ - /*06b0*/ @P0 BRA 0x190; /* 0x12007ffd6c00003c */ - /*06b8*/ IADD R21, R21, 0x1; /* 0xc0800000009c5455 */ - /* 0x08b810b8b010b8b0 */ - /*06c8*/ ISETP.EQ.AND P0, PT, R21, c[0x0][0x14c], PT; /* 0x5b281c00299c541e */ - /*06d0*/ @!P0 BRA 0x110; /* 0x12007ffd1c20003c */ - /*06d8*/ ISETP.NE.AND P0, PT, R44, R41, PT; /* 0xdb581c00149cb01e */ - /*06e0*/ IADD R44, R44, 0x1; /* 0xc0800000009cb0b1 */ - /*06e8*/ @P0 BRA 0xd0; /* 0x12007ffcf000003c */ - /*06f0*/ MOV RZ, RZ; /* 0xe4c03c007f9c03fe */ - /*06f8*/ EXIT ; /* 0x18000000001c003c */ - /*0700*/ BRA 0x700; /* 0x12007ffffc1c003c */ - /*0708*/ NOP; /* 0x85800000001c3c02 */ - /*0710*/ NOP; /* 0x85800000001c3c02 */ - /*0718*/ NOP; /* 0x85800000001c3c02 */ - /*0720*/ NOP; /* 0x85800000001c3c02 */ - /*0728*/ NOP; /* 0x85800000001c3c02 */ - /*0730*/ NOP; /* 0x85800000001c3c02 */ - /*0738*/ NOP; /* 0x85800000001c3c02 */ - .................................. 
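On the host side, the new Makefile_gpu below links with -lcuda (the CUDA driver API), concatenates the generated stencil_ispc_nvptx64.ptx into kernel.ptx, and copies it to __kernels.ptx next to the binary; the commit also checks in libcudadevrt.a because the PTX calls into the device runtime (cudaLaunchDevice, cudaDeviceSynchronize) to launch tasks. The patch's actual loader lives in stencil_cu.cpp, which is not reproduced here; a minimal sketch of how such a PTX module might be loaded with the driver API, using an unmangled placeholder entry name, is:

// Host-side sketch only; the real logic is in the patch's stencil_cu.cpp.
// "stencil_step_task" stands in for the mangled ISPC symbol in the PTX.
#include <cuda.h>
#include <cstdio>
#include <cstdlib>

static void check(CUresult res, const char *what)
{
    if (res != CUDA_SUCCESS) { fprintf(stderr, "%s failed (%d)\n", what, (int)res); exit(1); }
}

int main()
{
    CUdevice dev;  CUcontext ctx;  CUmodule mod;  CUfunction kernel;

    check(cuInit(0), "cuInit");
    check(cuDeviceGet(&dev, 0), "cuDeviceGet");
    check(cuCtxCreate(&ctx, 0, dev), "cuCtxCreate");

    // JIT the PTX emitted by the Makefile_gpu pipeline.  Because this PTX
    // references the device runtime, a real loader would typically go through
    // cuLinkCreate/cuLinkAddFile with libcudadevrt.a rather than a plain load.
    check(cuModuleLoad(&mod, "__kernels.ptx"), "cuModuleLoad");
    check(cuModuleGetFunction(&kernel, mod, "stencil_step_task"), "cuModuleGetFunction");

    // Buffers would be created with cuMemAlloc/cuMemcpyHtoD, then launched:
    //   void *args[] = { &x0, &x1, ..., &dCoef, &dVsq, &dAeven, &dAodd };
    //   cuLaunchKernel(kernel, nTasks, 1, 1, 128, 1, 1, 0, 0, args, 0);

    cuModuleUnload(mod);
    cuCtxDestroy(ctx);
    return 0;
}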
- - diff --git a/examples_cuda/stencil/Makefile_gpu b/examples_cuda/stencil/Makefile_gpu new file mode 100644 index 00000000..ac1f3b25 --- /dev/null +++ b/examples_cuda/stencil/Makefile_gpu @@ -0,0 +1,55 @@ +PROG=stencil_cu +ISPC_SRC=stencil.ispc +CXX_SRC=stencil_cu.cpp stencil_serial.cpp + +CXX=g++ +CXXFLAGS=-O3 -I$(CUDATK)/include +LD=g++ +LDFLAGS=-lcuda + +ISPC=ispc +ISPCFLAGS=-O3 --math-lib=default --target=nvptx64 --opt=fast-math + +LLVM32 = $(HOME)/usr/local/llvm/bin-3.2 +LLVM = $(HOME)/usr/local/llvm/bin-3.3 +PTXGEN = $(HOME)/ptxgen +PTXGEN += -opt=3 +PTXGEN += -ftz=1 -prec-div=0 -prec-sqrt=0 -fma=1 + +LLVM32DIS=$(LLVM32)/bin/llvm-dis + +.SUFFIXES: .bc .o .ptx .cu _ispc_nvptx64.bc + + +ISPC_OBJ=$(ISPC_SRC:%.ispc=%_ispc.o) +ISPC_BC=$(ISPC_SRC:%.ispc=%_ispc_nvptx64.bc) +PTXSRC=$(ISPC_SRC:%.ispc=%_ispc_nvptx64.ptx) +CXX_OBJ=$(CXX_SRC:%.cpp=%.o) + +all: $(PROG) + + +$(CXX_OBJ) : kernel.ptx +$(PROG): $(CXX_OBJ) kernel.ptx + /bin/cp kernel.ptx __kernels.ptx + $(LD) -o $@ $(CXX_OBJ) $(LDFLAGS) + +%.o: %.cpp + $(CXX) $(CXXFLAGS) -o $@ -c $< + + +%_ispc_nvptx64.bc: %.ispc + $(ISPC) $(ISPCFLAGS) --emit-llvm -o `basename $< .ispc`_ispc_nvptx64.bc -h `basename $< .ispc`_ispc.h $< --emit-llvm + +%.ptx: %.bc + $(LLVM32DIS) $< + $(PTXGEN) `basename $< .bc`.ll > $@ + +kernel.ptx: $(PTXSRC) + cat $^ > kernel.ptx + +clean: + /bin/rm -rf *.ptx *.bc *.ll $(PROG) + + + diff --git a/examples_cuda/stencil/__kernels.ptx b/examples_cuda/stencil/__kernels.ptx new file mode 100644 index 00000000..b0339cbf --- /dev/null +++ b/examples_cuda/stencil/__kernels.ptx @@ -0,0 +1,1246 @@ +// +// Generated by NVIDIA NVVM Compiler +// Compiler built on Thu Jul 18 02:37:37 2013 (1374107857) +// Cuda compilation tools, release 5.5, V5.5.0 +// + +.version 3.2 +.target sm_35 +.address_size 64 + + +.extern .func (.param .b32 func_retval0) cudaLaunchDevice +( + .param .b64 cudaLaunchDevice_param_0, + .param .b64 cudaLaunchDevice_param_1, + .param .align 4 .b8 cudaLaunchDevice_param_2[12], + .param .align 4 .b8 cudaLaunchDevice_param_3[12], + .param .b32 cudaLaunchDevice_param_4, + .param .b64 cudaLaunchDevice_param_5 +); + + +.extern .func (.param .b64 func_retval0) cudaGetParameterBuffer +( + .param .b64 cudaGetParameterBuffer_param_0, + .param .b64 cudaGetParameterBuffer_param_1 +) +; +.extern .func (.param .b32 func_retval0) cudaDeviceSynchronize +( + +) +; +.global .align 1 .b8 constDeltaForeach1[32]; +.global .align 1 .b8 constDeltaForeach4[32] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}; + +.visible .func (.param .b32 func_retval0) __shfl_i32( + .param .b32 __shfl_i32_param_0, + .param .b32 __shfl_i32_param_1 +) +{ + .reg .s32 %r<4>; + + + ld.param.u32 %r2, [__shfl_i32_param_0]; + ld.param.u32 %r3, [__shfl_i32_param_1]; + // inline asm + shfl.idx.b32 %r1, %r2, %r3, 0x1f; + // inline asm + st.param.b32 [func_retval0+0], %r1; + ret; +} + +.visible .func (.param .b32 func_retval0) __shfl_xor_float( + .param .b32 __shfl_xor_float_param_0, + .param .b32 __shfl_xor_float_param_1 +) +{ + .reg .s32 %r<2>; + .reg .f32 %f<3>; + + + ld.param.f32 %f2, [__shfl_xor_float_param_0]; + ld.param.u32 %r1, [__shfl_xor_float_param_1]; + // inline asm + shfl.bfly.b32 %f1, %f2, %r1, 0x1f; + // inline asm + st.param.f32 [func_retval0+0], %f1; + ret; +} + +.visible .func (.param .b32 func_retval0) __shfl_xor_i32( + .param .b32 __shfl_xor_i32_param_0, + .param .b32 __shfl_xor_i32_param_1 +) +{ + .reg .s32 %r<4>; + + + ld.param.u32 %r2, [__shfl_xor_i32_param_0]; + 
ld.param.u32 %r3, [__shfl_xor_i32_param_1]; + // inline asm + shfl.bfly.b32 %r1, %r2, %r3, 0x1f; + // inline asm + st.param.b32 [func_retval0+0], %r1; + ret; +} + +.visible .func (.param .b32 func_retval0) __fminf( + .param .b32 __fminf_param_0, + .param .b32 __fminf_param_1 +) +{ + .reg .f32 %f<4>; + + + ld.param.f32 %f2, [__fminf_param_0]; + ld.param.f32 %f3, [__fminf_param_1]; + // inline asm + min.f32 %f1, %f2, %f3; + // inline asm + st.param.f32 [func_retval0+0], %f1; + ret; +} + +.visible .func (.param .b32 func_retval0) __fmaxf( + .param .b32 __fmaxf_param_0, + .param .b32 __fmaxf_param_1 +) +{ + .reg .f32 %f<4>; + + + ld.param.f32 %f2, [__fmaxf_param_0]; + ld.param.f32 %f3, [__fmaxf_param_1]; + // inline asm + max.f32 %f1, %f2, %f3; + // inline asm + st.param.f32 [func_retval0+0], %f1; + ret; +} + +.visible .func (.param .b32 func_retval0) __ballot( + .param .b32 __ballot_param_0 +) +{ + .reg .s32 %r<3>; + + + ld.param.u8 %r2, [__ballot_param_0]; + // inline asm + { .reg .pred %p1; + setp.ne.u32 %p1, %r2, 0; + vote.ballot.b32 %r1, %p1; + } + // inline asm + st.param.b32 [func_retval0+0], %r1; + ret; +} + +.visible .func (.param .b32 func_retval0) __lanemask_lt( + +) +{ + .reg .s32 %r<2>; + + + // inline asm + mov.u32 %r1, %lanemask_lt; + // inline asm + st.param.b32 [func_retval0+0], %r1; + ret; +} + +.visible .func (.param .b64 func_retval0) ISPCAlloc( + .param .b64 ISPCAlloc_param_0, + .param .b64 ISPCAlloc_param_1, + .param .b32 ISPCAlloc_param_2 +) +{ + .reg .s64 %rd<2>; + + + mov.u64 %rd1, 1; + st.param.b64 [func_retval0+0], %rd1; + ret; +} + +.visible .func (.param .b64 func_retval0) ISPCGetParamBuffer( + .param .b64 ISPCGetParamBuffer_param_0, + .param .b64 ISPCGetParamBuffer_param_1, + .param .b64 ISPCGetParamBuffer_param_2 +) +{ + .reg .pred %p<2>; + .reg .s32 %r<3>; + .reg .s64 %rd<7>; + + + ld.param.u64 %rd3, [ISPCGetParamBuffer_param_1]; + ld.param.u64 %rd4, [ISPCGetParamBuffer_param_2]; + mov.u32 %r1, %tid.x; + and.b32 %r2, %r1, 31; + setp.ne.s32 %p1, %r2, 0; + mov.u64 %rd6, 0; + @%p1 bra BB8_2; + + // Callseq Start 0 + { + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd3; + .param .b64 param1; + st.param.b64 [param1+0], %rd4; + .param .b64 retval0; + call.uni (retval0), + cudaGetParameterBuffer, + ( + param0, + param1 + ); + ld.param.b64 %rd6, [retval0+0]; + } + // Callseq End 0 + +BB8_2: + st.param.b64 [func_retval0+0], %rd6; + ret; +} + +.visible .func ISPCLaunch( + .param .b64 ISPCLaunch_param_0, + .param .b64 ISPCLaunch_param_1, + .param .b64 ISPCLaunch_param_2, + .param .b32 ISPCLaunch_param_3, + .param .b32 ISPCLaunch_param_4, + .param .b32 ISPCLaunch_param_5 +) +{ + .reg .pred %p<2>; + .reg .s32 %r<16>; + .reg .s64 %rd<6>; + + + ld.param.u64 %rd1, [ISPCLaunch_param_1]; + ld.param.u64 %rd2, [ISPCLaunch_param_2]; + ld.param.u32 %r1, [ISPCLaunch_param_3]; + ld.param.u32 %r2, [ISPCLaunch_param_4]; + ld.param.u32 %r3, [ISPCLaunch_param_5]; + mov.u32 %r4, %tid.x; + and.b32 %r5, %r4, 31; + setp.ne.s32 %p1, %r5, 0; + @%p1 bra BB9_2; + + add.s32 %r14, %r1, -1; + shr.s32 %r15, %r14, 2; + add.s32 %r7, %r15, 1; + mov.u32 %r12, 1; + mov.u32 %r10, 128; + mov.u32 %r13, 0; + mov.u64 %rd5, 0; + // inline asm + { + .param .b64 param0; + st.param.b64 [param0+0], %rd1; + .param .b64 param1; + st.param.b64 [param1+0], %rd2; + .param .align 4 .b8 param2[12]; + st.param.b32 [param2+0], %r7; + st.param.b32 [param2+4], %r2; + st.param.b32 [param2+8], %r3; + .param .align 4 .b8 param3[12]; + st.param.b32 [param3+0], %r10; + st.param.b32 [param3+4], %r12; + 
st.param.b32 [param3+8], %r12; + .param .b32 param4; + st.param.b32 [param4+0], %r13; + .param .b64 param5; + st.param.b64 [param5+0], %rd5; + + .param .b32 retval0; + call.uni (retval0), + cudaLaunchDevice, + ( + param0, + param1, + param2, + param3, + param4, + param5 + ); + ld.param.b32 %r6, [retval0+0]; + } + + // inline asm + +BB9_2: + ret; +} + +.visible .func ISPCSync( + .param .b64 ISPCSync_param_0 +) +{ + .reg .s32 %r<2>; + + + // Callseq Start 1 + { + .reg .b32 temp_param_reg; + .param .b32 retval0; + call.uni (retval0), + cudaDeviceSynchronize, + ( + ); + ld.param.b32 %r1, [retval0+0]; + } + // Callseq End 1 + ret; +} + +.visible .func (.param .b64 func_retval0) __warpBinExclusiveScan( + .param .b32 __warpBinExclusiveScan_param_0 +) +{ + .reg .s32 %r<8>; + .reg .s64 %rd<5>; + + + ld.param.u8 %r2, [__warpBinExclusiveScan_param_0]; + // inline asm + { .reg .pred %p1; + setp.ne.u32 %p1, %r2, 0; + vote.ballot.b32 %r1, %p1; + } + // inline asm + // inline asm + popc.b32 %r3, %r1; + // inline asm + // inline asm + mov.u32 %r5, %lanemask_lt; + // inline asm + and.b32 %r7, %r5, %r1; + // inline asm + popc.b32 %r6, %r7; + // inline asm + cvt.u64.u32 %rd1, %r6; + shl.b64 %rd2, %rd1, 32; + cvt.u64.u32 %rd3, %r3; + or.b64 %rd4, %rd2, %rd3; + st.param.b64 [func_retval0+0], %rd4; + ret; +} + +.entry stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_( + .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_0, + .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_1, + .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_2, + .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_3, + .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_4, + .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_5, + .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_6, + .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_7, + .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_8, + .param .u64 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_9, + .param .u64 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_10, + .param .u64 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_11, + .param .u64 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_12 +) +{ + .reg .pred %p<14>; + .reg .s32 %r<178>; + .reg .s64 %rd<96>; + .reg .f64 %fd<95>; + + + ld.param.u32 %r42, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_0]; + ld.param.u32 %r43, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_1]; + ld.param.u32 %r44, 
[stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_2]; + ld.param.u32 %r45, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_3]; + ld.param.u32 %r46, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_4]; + ld.param.u32 %r47, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_5]; + ld.param.u32 %r48, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_6]; + ld.param.u32 %r49, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_7]; + ld.param.u64 %rd2, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_9]; + ld.param.u64 %rd3, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_10]; + ld.param.u64 %rd4, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_11]; + ld.param.u64 %rd5, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_12]; + mov.u32 %r1, %ctaid.x; + shl.b32 %r50, %r1, 2; + mov.u32 %r2, %tid.x; + shr.s32 %r51, %r2, 5; + add.s32 %r52, %r51, %r50; + mov.u32 %r53, %nctaid.x; + shl.b32 %r54, %r53, 2; + setp.ge.s32 %p1, %r52, %r54; + mov.u32 %r55, %nctaid.y; + mov.u32 %r3, %ctaid.y; + setp.ge.s32 %p2, %r3, %r55; + or.pred %p3, %p1, %p2; + mov.u32 %r56, %nctaid.z; + mov.u32 %r4, %ctaid.z; + setp.ge.s32 %p4, %r4, %r56; + or.pred %p5, %p3, %p4; + @%p5 bra BB12_13; + + shl.b32 %r57, %r1, 7; + add.s32 %r58, %r2, %r57; + and.b32 %r59, %r58, -32; + add.s32 %r60, %r59, %r42; + add.s32 %r61, %r60, 32; + min.s32 %r5, %r43, %r61; + shl.b32 %r6, %r3, 3; + add.s32 %r62, %r6, %r44; + add.s32 %r7, %r62, 8; + shl.b32 %r8, %r4, 3; + add.s32 %r172, %r8, %r46; + add.s32 %r63, %r172, 8; + min.s32 %r64, %r47, %r63; + mul.lo.s32 %r10, %r49, %r48; + sub.s32 %r65, %r5, %r60; + shr.s32 %r66, %r65, 31; + shr.u32 %r67, %r66, 27; + add.s32 %r68, %r65, %r67; + and.b32 %r69, %r68, -32; + sub.s32 %r70, %r65, %r69; + sub.s32 %r11, %r5, %r70; + and.b32 %r71, %r2, 31; + cvt.u64.u32 %rd6, %r71; + mov.u64 %rd7, constDeltaForeach1; + add.s64 %rd1, %rd7, %rd6; + setp.ge.s32 %p6, %r172, %r64; + @%p6 bra BB12_13; + + min.s32 %r12, %r45, %r7; + shl.b32 %r15, %r10, 1; + neg.s32 %r16, %r15; + mul.lo.s32 %r17, %r10, 3; + mul.lo.s32 %r18, %r10, -3; + mov.u32 %r72, -9; + sub.s32 %r73, %r72, %r44; + sub.s32 %r74, %r73, %r6; + not.b32 %r75, %r45; + max.s32 %r76, %r74, %r75; + not.b32 %r19, %r76; + sub.s32 %r77, %r72, %r46; + sub.s32 %r78, %r77, %r8; + not.b32 %r79, %r47; + max.s32 %r80, %r78, %r79; + not.b32 %r20, %r80; + ld.global.u8 %r13, [%rd1]; + mov.u32 %r171, %r172; + +BB12_3: + mov.u32 %r21, %r171; + add.s32 %r23, %r21, %r13; + setp.ge.s32 %p7, %r62, %r12; + @%p7 bra BB12_12; + + mul.lo.s32 %r24, %r23, %r10; + mov.u32 %r174, %r62; + mov.u32 %r173, %r62; + +BB12_5: + mov.u32 %r27, %r173; + add.s32 %r30, %r27, %r13; + setp.ge.s32 %p8, %r60, %r11; + mov.u32 %r176, %r60; + @%p8 bra BB12_8; + + mov.u64 %rd9, constDeltaForeach4; + add.s64 %rd10, %rd9, %rd6; + ld.global.u8 %r31, [%rd10]; + mad.lo.s32 %r32, %r30, %r48, %r24; + add.s32 %r177, %r59, %r42; + +BB12_7: + cvta.to.global.u64 %rd11, %rd2; + add.s32 %r98, %r32, %r177; 
+ add.s32 %r99, %r98, %r31; + shl.b32 %r100, %r99, 3; + cvt.s64.s32 %rd12, %r100; + add.s64 %rd13, %rd12, %rd4; + add.s32 %r101, %r100, 8; + cvt.s64.s32 %rd14, %r101; + add.s64 %rd15, %rd14, %rd4; + add.s32 %r102, %r100, -8; + cvt.s64.s32 %rd16, %r102; + add.s64 %rd17, %rd16, %rd4; + add.s32 %r103, %r99, %r48; + shl.b32 %r104, %r103, 3; + cvt.s64.s32 %rd18, %r104; + add.s64 %rd19, %rd18, %rd4; + sub.s32 %r105, %r99, %r48; + shl.b32 %r106, %r105, 3; + cvt.s64.s32 %rd20, %r106; + add.s64 %rd21, %rd20, %rd4; + add.s32 %r108, %r99, %r10; + shl.b32 %r109, %r108, 3; + cvt.s64.s32 %rd22, %r109; + add.s64 %rd23, %rd22, %rd4; + sub.s32 %r110, %r99, %r10; + shl.b32 %r111, %r110, 3; + cvt.s64.s32 %rd24, %r111; + add.s64 %rd25, %rd24, %rd4; + add.s32 %r112, %r100, 16; + cvt.s64.s32 %rd26, %r112; + add.s64 %rd27, %rd26, %rd4; + add.s32 %r113, %r100, -16; + cvt.s64.s32 %rd28, %r113; + add.s64 %rd29, %rd28, %rd4; + shl.b32 %r114, %r48, 1; + add.s32 %r115, %r99, %r114; + shl.b32 %r116, %r115, 3; + cvt.s64.s32 %rd30, %r116; + add.s64 %rd31, %rd30, %rd4; + mad.lo.s32 %r117, %r48, -2, %r99; + shl.b32 %r118, %r117, 3; + cvt.s64.s32 %rd32, %r118; + add.s64 %rd33, %rd32, %rd4; + add.s32 %r119, %r99, %r15; + shl.b32 %r120, %r119, 3; + cvt.s64.s32 %rd34, %r120; + add.s64 %rd35, %rd34, %rd4; + add.s32 %r121, %r99, %r16; + shl.b32 %r122, %r121, 3; + cvt.s64.s32 %rd36, %r122; + add.s64 %rd37, %rd36, %rd4; + add.s32 %r123, %r100, 24; + cvt.s64.s32 %rd38, %r123; + add.s64 %rd39, %rd38, %rd4; + add.s32 %r124, %r100, -24; + cvt.s64.s32 %rd40, %r124; + add.s64 %rd41, %rd40, %rd4; + mad.lo.s32 %r125, %r48, 3, %r99; + shl.b32 %r126, %r125, 3; + cvt.s64.s32 %rd42, %r126; + add.s64 %rd43, %rd42, %rd4; + mad.lo.s32 %r127, %r48, -3, %r99; + shl.b32 %r128, %r127, 3; + cvt.s64.s32 %rd44, %r128; + add.s64 %rd45, %rd44, %rd4; + add.s32 %r129, %r99, %r17; + shl.b32 %r130, %r129, 3; + cvt.s64.s32 %rd46, %r130; + add.s64 %rd47, %rd46, %rd4; + add.s32 %r131, %r99, %r18; + shl.b32 %r132, %r131, 3; + cvt.s64.s32 %rd48, %r132; + add.s64 %rd49, %rd48, %rd4; + add.s64 %rd50, %rd12, %rd5; + add.s64 %rd51, %rd12, %rd3; + ld.f64 %fd1, [%rd13]; + add.f64 %fd2, %fd1, %fd1; + ld.f64 %fd3, [%rd50]; + sub.f64 %fd4, %fd2, %fd3; + ld.global.f64 %fd5, [%rd11]; + ld.f64 %fd6, [%rd17]; + ld.f64 %fd7, [%rd15]; + add.f64 %fd8, %fd7, %fd6; + ld.f64 %fd9, [%rd19]; + add.f64 %fd10, %fd8, %fd9; + ld.f64 %fd11, [%rd21]; + add.f64 %fd12, %fd10, %fd11; + ld.f64 %fd13, [%rd23]; + add.f64 %fd14, %fd12, %fd13; + ld.f64 %fd15, [%rd25]; + add.f64 %fd16, %fd14, %fd15; + ld.global.f64 %fd17, [%rd11+8]; + mul.f64 %fd18, %fd17, %fd16; + fma.rn.f64 %fd19, %fd5, %fd1, %fd18; + ld.f64 %fd20, [%rd29]; + ld.f64 %fd21, [%rd27]; + add.f64 %fd22, %fd21, %fd20; + ld.f64 %fd23, [%rd31]; + add.f64 %fd24, %fd22, %fd23; + ld.f64 %fd25, [%rd33]; + add.f64 %fd26, %fd24, %fd25; + ld.f64 %fd27, [%rd35]; + add.f64 %fd28, %fd26, %fd27; + ld.f64 %fd29, [%rd37]; + add.f64 %fd30, %fd28, %fd29; + ld.global.f64 %fd31, [%rd11+16]; + fma.rn.f64 %fd32, %fd31, %fd30, %fd19; + ld.f64 %fd33, [%rd41]; + ld.f64 %fd34, [%rd39]; + add.f64 %fd35, %fd34, %fd33; + ld.f64 %fd36, [%rd43]; + add.f64 %fd37, %fd35, %fd36; + ld.f64 %fd38, [%rd45]; + add.f64 %fd39, %fd37, %fd38; + ld.f64 %fd40, [%rd47]; + add.f64 %fd41, %fd39, %fd40; + ld.f64 %fd42, [%rd49]; + add.f64 %fd43, %fd41, %fd42; + ld.global.f64 %fd44, [%rd11+24]; + fma.rn.f64 %fd45, %fd44, %fd43, %fd32; + ld.f64 %fd46, [%rd51]; + fma.rn.f64 %fd47, %fd46, %fd45, %fd4; + st.f64 [%rd50], %fd47; + add.s32 %r177, %r177, 32; + setp.lt.s32 %p9, %r177, %r11; + 
mov.u32 %r175, %r177; + mov.u32 %r176, %r175; + @%p9 bra BB12_7; + +BB12_8: + mov.u32 %r36, %r176; + setp.ge.s32 %p10, %r36, %r5; + @%p10 bra BB12_11; + + mov.u64 %rd53, constDeltaForeach4; + add.s64 %rd54, %rd53, %rd6; + ld.global.u8 %r135, [%rd54]; + add.s32 %r37, %r36, %r135; + setp.ge.s32 %p11, %r37, %r5; + @%p11 bra BB12_11; + + cvta.to.global.u64 %rd55, %rd2; + mad.lo.s32 %r136, %r30, %r48, %r24; + add.s32 %r137, %r136, %r37; + shl.b32 %r138, %r137, 3; + cvt.s64.s32 %rd56, %r138; + add.s64 %rd57, %rd56, %rd4; + add.s32 %r139, %r138, 8; + cvt.s64.s32 %rd58, %r139; + add.s64 %rd59, %rd58, %rd4; + add.s32 %r140, %r138, -8; + cvt.s64.s32 %rd60, %r140; + add.s64 %rd61, %rd60, %rd4; + add.s32 %r141, %r137, %r48; + shl.b32 %r142, %r141, 3; + cvt.s64.s32 %rd62, %r142; + add.s64 %rd63, %rd62, %rd4; + sub.s32 %r143, %r137, %r48; + shl.b32 %r144, %r143, 3; + cvt.s64.s32 %rd64, %r144; + add.s64 %rd65, %rd64, %rd4; + add.s32 %r146, %r137, %r10; + shl.b32 %r147, %r146, 3; + cvt.s64.s32 %rd66, %r147; + add.s64 %rd67, %rd66, %rd4; + sub.s32 %r148, %r137, %r10; + shl.b32 %r149, %r148, 3; + cvt.s64.s32 %rd68, %r149; + add.s64 %rd69, %rd68, %rd4; + add.s32 %r150, %r138, 16; + cvt.s64.s32 %rd70, %r150; + add.s64 %rd71, %rd70, %rd4; + add.s32 %r151, %r138, -16; + cvt.s64.s32 %rd72, %r151; + add.s64 %rd73, %rd72, %rd4; + shl.b32 %r152, %r48, 1; + add.s32 %r153, %r137, %r152; + shl.b32 %r154, %r153, 3; + cvt.s64.s32 %rd74, %r154; + add.s64 %rd75, %rd74, %rd4; + mad.lo.s32 %r155, %r48, -2, %r137; + shl.b32 %r156, %r155, 3; + cvt.s64.s32 %rd76, %r156; + add.s64 %rd77, %rd76, %rd4; + add.s32 %r157, %r137, %r15; + shl.b32 %r158, %r157, 3; + cvt.s64.s32 %rd78, %r158; + add.s64 %rd79, %rd78, %rd4; + add.s32 %r159, %r137, %r16; + shl.b32 %r160, %r159, 3; + cvt.s64.s32 %rd80, %r160; + add.s64 %rd81, %rd80, %rd4; + add.s32 %r161, %r138, 24; + cvt.s64.s32 %rd82, %r161; + add.s64 %rd83, %rd82, %rd4; + add.s32 %r162, %r138, -24; + cvt.s64.s32 %rd84, %r162; + add.s64 %rd85, %rd84, %rd4; + mad.lo.s32 %r163, %r48, 3, %r137; + shl.b32 %r164, %r163, 3; + cvt.s64.s32 %rd86, %r164; + add.s64 %rd87, %rd86, %rd4; + mad.lo.s32 %r165, %r48, -3, %r137; + shl.b32 %r166, %r165, 3; + cvt.s64.s32 %rd88, %r166; + add.s64 %rd89, %rd88, %rd4; + add.s32 %r167, %r137, %r17; + shl.b32 %r168, %r167, 3; + cvt.s64.s32 %rd90, %r168; + add.s64 %rd91, %rd90, %rd4; + add.s32 %r169, %r137, %r18; + shl.b32 %r170, %r169, 3; + cvt.s64.s32 %rd92, %r170; + add.s64 %rd93, %rd92, %rd4; + add.s64 %rd94, %rd56, %rd5; + add.s64 %rd95, %rd56, %rd3; + ld.f64 %fd48, [%rd57]; + add.f64 %fd49, %fd48, %fd48; + ld.f64 %fd50, [%rd94]; + sub.f64 %fd51, %fd49, %fd50; + ld.global.f64 %fd52, [%rd55]; + ld.f64 %fd53, [%rd61]; + ld.f64 %fd54, [%rd59]; + add.f64 %fd55, %fd54, %fd53; + ld.f64 %fd56, [%rd63]; + add.f64 %fd57, %fd55, %fd56; + ld.f64 %fd58, [%rd65]; + add.f64 %fd59, %fd57, %fd58; + ld.f64 %fd60, [%rd67]; + add.f64 %fd61, %fd59, %fd60; + ld.f64 %fd62, [%rd69]; + add.f64 %fd63, %fd61, %fd62; + ld.global.f64 %fd64, [%rd55+8]; + mul.f64 %fd65, %fd64, %fd63; + fma.rn.f64 %fd66, %fd52, %fd48, %fd65; + ld.f64 %fd67, [%rd73]; + ld.f64 %fd68, [%rd71]; + add.f64 %fd69, %fd68, %fd67; + ld.f64 %fd70, [%rd75]; + add.f64 %fd71, %fd69, %fd70; + ld.f64 %fd72, [%rd77]; + add.f64 %fd73, %fd71, %fd72; + ld.f64 %fd74, [%rd79]; + add.f64 %fd75, %fd73, %fd74; + ld.f64 %fd76, [%rd81]; + add.f64 %fd77, %fd75, %fd76; + ld.global.f64 %fd78, [%rd55+16]; + fma.rn.f64 %fd79, %fd78, %fd77, %fd66; + ld.f64 %fd80, [%rd85]; + ld.f64 %fd81, [%rd83]; + add.f64 %fd82, %fd81, %fd80; + ld.f64 
%fd83, [%rd87]; + add.f64 %fd84, %fd82, %fd83; + ld.f64 %fd85, [%rd89]; + add.f64 %fd86, %fd84, %fd85; + ld.f64 %fd87, [%rd91]; + add.f64 %fd88, %fd86, %fd87; + ld.f64 %fd89, [%rd93]; + add.f64 %fd90, %fd88, %fd89; + ld.global.f64 %fd91, [%rd55+24]; + fma.rn.f64 %fd92, %fd91, %fd90, %fd79; + ld.f64 %fd93, [%rd95]; + fma.rn.f64 %fd94, %fd92, %fd93, %fd51; + st.f64 [%rd94], %fd94; + +BB12_11: + add.s32 %r39, %r174, 1; + setp.ne.s32 %p12, %r39, %r19; + mov.u32 %r174, %r39; + mov.u32 %r173, %r39; + @%p12 bra BB12_5; + +BB12_12: + add.s32 %r171, %r172, 1; + setp.ne.s32 %p13, %r171, %r20; + mov.u32 %r172, %r171; + @%p13 bra BB12_3; + +BB12_13: + ret; +} + +.visible .func loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E_( + .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_0, + .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_1, + .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_2, + .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_3, + .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_4, + .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_5, + .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_6, + .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_7, + .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_8, + .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_9, + .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_10, + .param .b64 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_11, + .param .b64 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_12, + .param .b64 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_13, + .param .b64 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_14, + .param .align 1 .b8 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_15[1] +) +{ + .reg .pred %p<9>; + .reg .s32 %r<63>; + .reg .s64 %rd<18>; + + + ld.param.u32 %r62, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_0]; + ld.param.u32 %r12, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_1]; + ld.param.u32 %r13, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_2]; + ld.param.u32 %r14, 
[loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_3]; + ld.param.u32 %r15, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_4]; + ld.param.u32 %r16, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_5]; + ld.param.u32 %r17, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_6]; + ld.param.u32 %r18, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_7]; + ld.param.u32 %r19, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_8]; + ld.param.u32 %r20, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_9]; + ld.param.u32 %r21, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_10]; + ld.param.u64 %rd4, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_11]; + ld.param.u64 %rd5, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_12]; + ld.param.u64 %rd6, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_13]; + ld.param.u64 %rd7, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_14]; + setp.ge.s32 %p1, %r62, %r12; + @%p1 bra BB13_14; + + mov.u32 %r22, 31; + sub.s32 %r23, %r22, %r13; + add.s32 %r24, %r23, %r14; + shr.s32 %r25, %r24, 31; + shr.u32 %r26, %r25, 27; + add.s32 %r27, %r24, %r26; + shr.s32 %r28, %r27, 5; + mov.u32 %r29, 7; + sub.s32 %r30, %r29, %r15; + add.s32 %r31, %r30, %r16; + shr.s32 %r32, %r31, 31; + shr.u32 %r33, %r32, 29; + add.s32 %r34, %r31, %r33; + shr.s32 %r1, %r34, 3; + sub.s32 %r35, %r29, %r17; + add.s32 %r36, %r35, %r18; + shr.s32 %r37, %r36, 31; + shr.u32 %r38, %r37, 29; + add.s32 %r39, %r36, %r38; + shr.s32 %r2, %r39, 3; + add.s32 %r40, %r28, -1; + shr.s32 %r41, %r40, 2; + add.s32 %r3, %r41, 1; + mov.u32 %r42, %tid.x; + and.b32 %r4, %r42, 31; + sub.s32 %r61, %r62, %r12; + +BB13_2: + and.b32 %r8, %r62, 1; + setp.ne.s32 %p2, %r4, 0; + mov.u64 %rd17, 0; + @%p2 bra BB13_4; + + mov.u64 %rd9, 8; + mov.u64 %rd10, 72; + // Callseq Start 2 + { + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd9; + .param .b64 param1; + st.param.b64 [param1+0], %rd10; + .param .b64 retval0; + call.uni (retval0), + cudaGetParameterBuffer, + ( + param0, + param1 + ); + ld.param.b64 %rd17, [retval0+0]; + } + // Callseq End 2 + +BB13_4: + setp.eq.s32 %p3, %r8, 0; + @%p3 bra BB13_9; + + setp.eq.s64 %p4, %rd17, 0; + @%p4 bra BB13_7; + + st.u32 [%rd17], %r13; + st.u32 [%rd17+4], %r14; + st.u32 [%rd17+8], %r15; + st.u32 [%rd17+12], %r16; + st.u32 [%rd17+16], %r17; + st.u32 [%rd17+20], %r18; + st.u32 [%rd17+24], %r19; + st.u32 [%rd17+28], %r20; + st.u32 [%rd17+32], %r21; + st.u64 [%rd17+40], %rd4; + st.u64 [%rd17+48], %rd5; + st.u64 [%rd17+56], %rd7; + st.u64 [%rd17+64], %rd6; + +BB13_7: + @%p2 bra BB13_13; + + mov.u32 %r47, 128; + mov.u32 %r49, 1; + mov.u32 %r50, 0; + mov.u64 %rd13, 0; + mov.u64 %rd11, 
stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_; + // inline asm + { + .param .b64 param0; + st.param.b64 [param0+0], %rd11; + .param .b64 param1; + st.param.b64 [param1+0], %rd17; + .param .align 4 .b8 param2[12]; + st.param.b32 [param2+0], %r3; + st.param.b32 [param2+4], %r1; + st.param.b32 [param2+8], %r2; + .param .align 4 .b8 param3[12]; + st.param.b32 [param3+0], %r47; + st.param.b32 [param3+4], %r49; + st.param.b32 [param3+8], %r49; + .param .b32 param4; + st.param.b32 [param4+0], %r50; + .param .b64 param5; + st.param.b64 [param5+0], %rd13; + + .param .b32 retval0; + call.uni (retval0), + cudaLaunchDevice, + ( + param0, + param1, + param2, + param3, + param4, + param5 + ); + ld.param.b32 %r43, [retval0+0]; + } + + // inline asm + bra.uni BB13_13; + +BB13_9: + setp.eq.s64 %p6, %rd17, 0; + @%p6 bra BB13_11; + + st.u32 [%rd17], %r13; + st.u32 [%rd17+4], %r14; + st.u32 [%rd17+8], %r15; + st.u32 [%rd17+12], %r16; + st.u32 [%rd17+16], %r17; + st.u32 [%rd17+20], %r18; + st.u32 [%rd17+24], %r19; + st.u32 [%rd17+28], %r20; + st.u32 [%rd17+32], %r21; + st.u64 [%rd17+40], %rd4; + st.u64 [%rd17+48], %rd5; + st.u64 [%rd17+56], %rd6; + st.u64 [%rd17+64], %rd7; + +BB13_11: + @%p2 bra BB13_13; + + mov.u32 %r55, 128; + mov.u32 %r57, 1; + mov.u32 %r58, 0; + mov.u64 %rd16, 0; + mov.u64 %rd14, stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_; + // inline asm + { + .param .b64 param0; + st.param.b64 [param0+0], %rd14; + .param .b64 param1; + st.param.b64 [param1+0], %rd17; + .param .align 4 .b8 param2[12]; + st.param.b32 [param2+0], %r3; + st.param.b32 [param2+4], %r1; + st.param.b32 [param2+8], %r2; + .param .align 4 .b8 param3[12]; + st.param.b32 [param3+0], %r55; + st.param.b32 [param3+4], %r57; + st.param.b32 [param3+8], %r57; + .param .b32 param4; + st.param.b32 [param4+0], %r58; + .param .b64 param5; + st.param.b64 [param5+0], %rd16; + + .param .b32 retval0; + call.uni (retval0), + cudaLaunchDevice, + ( + param0, + param1, + param2, + param3, + param4, + param5 + ); + ld.param.b32 %r51, [retval0+0]; + } + + // inline asm + +BB13_13: + // Callseq Start 3 + { + .reg .b32 temp_param_reg; + .param .b32 retval0; + call.uni (retval0), + cudaDeviceSynchronize, + ( + ); + ld.param.b32 %r59, [retval0+0]; + } + // Callseq End 3 + add.s32 %r62, %r62, 1; + add.s32 %r61, %r61, 1; + setp.ne.s32 %p8, %r61, 0; + @%p8 bra BB13_2; + +BB13_14: + // Callseq Start 4 + { + .reg .b32 temp_param_reg; + .param .b32 retval0; + call.uni (retval0), + cudaDeviceSynchronize, + ( + ); + ld.param.b32 %r60, [retval0+0]; + } + // Callseq End 4 + ret; +} + +.visible .entry loop_stencil_ispc_tasks( + .param .u32 loop_stencil_ispc_tasks_param_0, + .param .u32 loop_stencil_ispc_tasks_param_1, + .param .u32 loop_stencil_ispc_tasks_param_2, + .param .u32 loop_stencil_ispc_tasks_param_3, + .param .u32 loop_stencil_ispc_tasks_param_4, + .param .u32 loop_stencil_ispc_tasks_param_5, + .param .u32 loop_stencil_ispc_tasks_param_6, + .param .u32 loop_stencil_ispc_tasks_param_7, + .param .u32 loop_stencil_ispc_tasks_param_8, + .param .u32 loop_stencil_ispc_tasks_param_9, + .param .u32 loop_stencil_ispc_tasks_param_10, + .param .u64 loop_stencil_ispc_tasks_param_11, + .param .u64 loop_stencil_ispc_tasks_param_12, + .param .u64 loop_stencil_ispc_tasks_param_13, + .param .u64 loop_stencil_ispc_tasks_param_14 +) +{ + .reg .pred %p<9>; + .reg .s32 %r<63>; + .reg .s64 %rd<18>; + + + ld.param.u32 %r62, [loop_stencil_ispc_tasks_param_0]; 
+ ld.param.u32 %r12, [loop_stencil_ispc_tasks_param_1]; + ld.param.u32 %r13, [loop_stencil_ispc_tasks_param_2]; + ld.param.u32 %r14, [loop_stencil_ispc_tasks_param_3]; + ld.param.u32 %r15, [loop_stencil_ispc_tasks_param_4]; + ld.param.u32 %r16, [loop_stencil_ispc_tasks_param_5]; + ld.param.u32 %r17, [loop_stencil_ispc_tasks_param_6]; + ld.param.u32 %r18, [loop_stencil_ispc_tasks_param_7]; + ld.param.u32 %r19, [loop_stencil_ispc_tasks_param_8]; + ld.param.u32 %r20, [loop_stencil_ispc_tasks_param_9]; + ld.param.u32 %r21, [loop_stencil_ispc_tasks_param_10]; + ld.param.u64 %rd4, [loop_stencil_ispc_tasks_param_11]; + ld.param.u64 %rd5, [loop_stencil_ispc_tasks_param_12]; + ld.param.u64 %rd6, [loop_stencil_ispc_tasks_param_13]; + ld.param.u64 %rd7, [loop_stencil_ispc_tasks_param_14]; + setp.ge.s32 %p1, %r62, %r12; + @%p1 bra BB14_14; + + mov.u32 %r22, 31; + sub.s32 %r23, %r22, %r13; + add.s32 %r24, %r23, %r14; + shr.s32 %r25, %r24, 31; + shr.u32 %r26, %r25, 27; + add.s32 %r27, %r24, %r26; + shr.s32 %r28, %r27, 5; + mov.u32 %r29, 7; + sub.s32 %r30, %r29, %r15; + add.s32 %r31, %r30, %r16; + shr.s32 %r32, %r31, 31; + shr.u32 %r33, %r32, 29; + add.s32 %r34, %r31, %r33; + shr.s32 %r1, %r34, 3; + sub.s32 %r35, %r29, %r17; + add.s32 %r36, %r35, %r18; + shr.s32 %r37, %r36, 31; + shr.u32 %r38, %r37, 29; + add.s32 %r39, %r36, %r38; + shr.s32 %r2, %r39, 3; + add.s32 %r40, %r28, -1; + shr.s32 %r41, %r40, 2; + add.s32 %r3, %r41, 1; + mov.u32 %r42, %tid.x; + and.b32 %r4, %r42, 31; + sub.s32 %r61, %r62, %r12; + +BB14_2: + and.b32 %r8, %r62, 1; + setp.ne.s32 %p2, %r4, 0; + mov.u64 %rd17, 0; + @%p2 bra BB14_4; + + mov.u64 %rd9, 8; + mov.u64 %rd10, 72; + // Callseq Start 5 + { + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd9; + .param .b64 param1; + st.param.b64 [param1+0], %rd10; + .param .b64 retval0; + call.uni (retval0), + cudaGetParameterBuffer, + ( + param0, + param1 + ); + ld.param.b64 %rd17, [retval0+0]; + } + // Callseq End 5 + +BB14_4: + setp.eq.s32 %p3, %r8, 0; + @%p3 bra BB14_9; + + setp.eq.s64 %p4, %rd17, 0; + @%p4 bra BB14_7; + + st.u32 [%rd17], %r13; + st.u32 [%rd17+4], %r14; + st.u32 [%rd17+8], %r15; + st.u32 [%rd17+12], %r16; + st.u32 [%rd17+16], %r17; + st.u32 [%rd17+20], %r18; + st.u32 [%rd17+24], %r19; + st.u32 [%rd17+28], %r20; + st.u32 [%rd17+32], %r21; + st.u64 [%rd17+40], %rd4; + st.u64 [%rd17+48], %rd5; + st.u64 [%rd17+56], %rd7; + st.u64 [%rd17+64], %rd6; + +BB14_7: + @%p2 bra BB14_13; + + mov.u32 %r47, 128; + mov.u32 %r49, 1; + mov.u32 %r50, 0; + mov.u64 %rd13, 0; + mov.u64 %rd11, stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_; + // inline asm + { + .param .b64 param0; + st.param.b64 [param0+0], %rd11; + .param .b64 param1; + st.param.b64 [param1+0], %rd17; + .param .align 4 .b8 param2[12]; + st.param.b32 [param2+0], %r3; + st.param.b32 [param2+4], %r1; + st.param.b32 [param2+8], %r2; + .param .align 4 .b8 param3[12]; + st.param.b32 [param3+0], %r47; + st.param.b32 [param3+4], %r49; + st.param.b32 [param3+8], %r49; + .param .b32 param4; + st.param.b32 [param4+0], %r50; + .param .b64 param5; + st.param.b64 [param5+0], %rd13; + + .param .b32 retval0; + call.uni (retval0), + cudaLaunchDevice, + ( + param0, + param1, + param2, + param3, + param4, + param5 + ); + ld.param.b32 %r43, [retval0+0]; + } + + // inline asm + bra.uni BB14_13; + +BB14_9: + setp.eq.s64 %p6, %rd17, 0; + @%p6 bra BB14_11; + + st.u32 [%rd17], %r13; + st.u32 [%rd17+4], %r14; + st.u32 [%rd17+8], %r15; + st.u32 [%rd17+12], %r16; + 
st.u32 [%rd17+16], %r17; + st.u32 [%rd17+20], %r18; + st.u32 [%rd17+24], %r19; + st.u32 [%rd17+28], %r20; + st.u32 [%rd17+32], %r21; + st.u64 [%rd17+40], %rd4; + st.u64 [%rd17+48], %rd5; + st.u64 [%rd17+56], %rd6; + st.u64 [%rd17+64], %rd7; + +BB14_11: + @%p2 bra BB14_13; + + mov.u32 %r55, 128; + mov.u32 %r57, 1; + mov.u32 %r58, 0; + mov.u64 %rd16, 0; + mov.u64 %rd14, stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_; + // inline asm + { + .param .b64 param0; + st.param.b64 [param0+0], %rd14; + .param .b64 param1; + st.param.b64 [param1+0], %rd17; + .param .align 4 .b8 param2[12]; + st.param.b32 [param2+0], %r3; + st.param.b32 [param2+4], %r1; + st.param.b32 [param2+8], %r2; + .param .align 4 .b8 param3[12]; + st.param.b32 [param3+0], %r55; + st.param.b32 [param3+4], %r57; + st.param.b32 [param3+8], %r57; + .param .b32 param4; + st.param.b32 [param4+0], %r58; + .param .b64 param5; + st.param.b64 [param5+0], %rd16; + + .param .b32 retval0; + call.uni (retval0), + cudaLaunchDevice, + ( + param0, + param1, + param2, + param3, + param4, + param5 + ); + ld.param.b32 %r51, [retval0+0]; + } + + // inline asm + +BB14_13: + // Callseq Start 6 + { + .reg .b32 temp_param_reg; + .param .b32 retval0; + call.uni (retval0), + cudaDeviceSynchronize, + ( + ); + ld.param.b32 %r59, [retval0+0]; + } + // Callseq End 6 + add.s32 %r62, %r62, 1; + add.s32 %r61, %r61, 1; + setp.ne.s32 %p8, %r61, 0; + @%p8 bra BB14_2; + +BB14_14: + // Callseq Start 7 + { + .reg .b32 temp_param_reg; + .param .b32 retval0; + call.uni (retval0), + cudaDeviceSynchronize, + ( + ); + ld.param.b32 %r60, [retval0+0]; + } + // Callseq End 7 + ret; +} + + + diff --git a/examples_cuda/stencil/err b/examples_cuda/stencil/err deleted file mode 100644 index e69de29b..00000000 diff --git a/examples_cuda/stencil/info b/examples_cuda/stencil/info deleted file mode 100644 index 4fc9105f..00000000 --- a/examples_cuda/stencil/info +++ /dev/null @@ -1,5 +0,0 @@ -I have been working with sort example, attempting to use ISPC_USE_OMP for tasking and adding example for sort_paralle.cpp which uses __gnu_parallel::sort to compare apples with apples, but clang has no support for OpenMP. - -The reason to use ISPC_USE_OMP is to control thread-affinity on multi-socket systems. For bandwidth bound throughput, the tasking system based on pthread make it messy to control thread-affinity and w/o this for bandwidth bound work-loads performance may suffer.. 
- -I used sort example to begin with diff --git a/examples_cuda/stencil/kernel.ptx b/examples_cuda/stencil/kernel.ptx new file mode 100644 index 00000000..b0339cbf --- /dev/null +++ b/examples_cuda/stencil/kernel.ptx @@ -0,0 +1,1246 @@ +// +// Generated by NVIDIA NVVM Compiler +// Compiler built on Thu Jul 18 02:37:37 2013 (1374107857) +// Cuda compilation tools, release 5.5, V5.5.0 +// + +.version 3.2 +.target sm_35 +.address_size 64 + + +.extern .func (.param .b32 func_retval0) cudaLaunchDevice +( + .param .b64 cudaLaunchDevice_param_0, + .param .b64 cudaLaunchDevice_param_1, + .param .align 4 .b8 cudaLaunchDevice_param_2[12], + .param .align 4 .b8 cudaLaunchDevice_param_3[12], + .param .b32 cudaLaunchDevice_param_4, + .param .b64 cudaLaunchDevice_param_5 +); + + +.extern .func (.param .b64 func_retval0) cudaGetParameterBuffer +( + .param .b64 cudaGetParameterBuffer_param_0, + .param .b64 cudaGetParameterBuffer_param_1 +) +; +.extern .func (.param .b32 func_retval0) cudaDeviceSynchronize +( + +) +; +.global .align 1 .b8 constDeltaForeach1[32]; +.global .align 1 .b8 constDeltaForeach4[32] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}; + +.visible .func (.param .b32 func_retval0) __shfl_i32( + .param .b32 __shfl_i32_param_0, + .param .b32 __shfl_i32_param_1 +) +{ + .reg .s32 %r<4>; + + + ld.param.u32 %r2, [__shfl_i32_param_0]; + ld.param.u32 %r3, [__shfl_i32_param_1]; + // inline asm + shfl.idx.b32 %r1, %r2, %r3, 0x1f; + // inline asm + st.param.b32 [func_retval0+0], %r1; + ret; +} + +.visible .func (.param .b32 func_retval0) __shfl_xor_float( + .param .b32 __shfl_xor_float_param_0, + .param .b32 __shfl_xor_float_param_1 +) +{ + .reg .s32 %r<2>; + .reg .f32 %f<3>; + + + ld.param.f32 %f2, [__shfl_xor_float_param_0]; + ld.param.u32 %r1, [__shfl_xor_float_param_1]; + // inline asm + shfl.bfly.b32 %f1, %f2, %r1, 0x1f; + // inline asm + st.param.f32 [func_retval0+0], %f1; + ret; +} + +.visible .func (.param .b32 func_retval0) __shfl_xor_i32( + .param .b32 __shfl_xor_i32_param_0, + .param .b32 __shfl_xor_i32_param_1 +) +{ + .reg .s32 %r<4>; + + + ld.param.u32 %r2, [__shfl_xor_i32_param_0]; + ld.param.u32 %r3, [__shfl_xor_i32_param_1]; + // inline asm + shfl.bfly.b32 %r1, %r2, %r3, 0x1f; + // inline asm + st.param.b32 [func_retval0+0], %r1; + ret; +} + +.visible .func (.param .b32 func_retval0) __fminf( + .param .b32 __fminf_param_0, + .param .b32 __fminf_param_1 +) +{ + .reg .f32 %f<4>; + + + ld.param.f32 %f2, [__fminf_param_0]; + ld.param.f32 %f3, [__fminf_param_1]; + // inline asm + min.f32 %f1, %f2, %f3; + // inline asm + st.param.f32 [func_retval0+0], %f1; + ret; +} + +.visible .func (.param .b32 func_retval0) __fmaxf( + .param .b32 __fmaxf_param_0, + .param .b32 __fmaxf_param_1 +) +{ + .reg .f32 %f<4>; + + + ld.param.f32 %f2, [__fmaxf_param_0]; + ld.param.f32 %f3, [__fmaxf_param_1]; + // inline asm + max.f32 %f1, %f2, %f3; + // inline asm + st.param.f32 [func_retval0+0], %f1; + ret; +} + +.visible .func (.param .b32 func_retval0) __ballot( + .param .b32 __ballot_param_0 +) +{ + .reg .s32 %r<3>; + + + ld.param.u8 %r2, [__ballot_param_0]; + // inline asm + { .reg .pred %p1; + setp.ne.u32 %p1, %r2, 0; + vote.ballot.b32 %r1, %p1; + } + // inline asm + st.param.b32 [func_retval0+0], %r1; + ret; +} + +.visible .func (.param .b32 func_retval0) __lanemask_lt( + +) +{ + .reg .s32 %r<2>; + + + // inline asm + mov.u32 %r1, %lanemask_lt; + // inline asm + st.param.b32 [func_retval0+0], %r1; + ret; +} + +.visible .func (.param 
.b64 func_retval0) ISPCAlloc( + .param .b64 ISPCAlloc_param_0, + .param .b64 ISPCAlloc_param_1, + .param .b32 ISPCAlloc_param_2 +) +{ + .reg .s64 %rd<2>; + + + mov.u64 %rd1, 1; + st.param.b64 [func_retval0+0], %rd1; + ret; +} + +.visible .func (.param .b64 func_retval0) ISPCGetParamBuffer( + .param .b64 ISPCGetParamBuffer_param_0, + .param .b64 ISPCGetParamBuffer_param_1, + .param .b64 ISPCGetParamBuffer_param_2 +) +{ + .reg .pred %p<2>; + .reg .s32 %r<3>; + .reg .s64 %rd<7>; + + + ld.param.u64 %rd3, [ISPCGetParamBuffer_param_1]; + ld.param.u64 %rd4, [ISPCGetParamBuffer_param_2]; + mov.u32 %r1, %tid.x; + and.b32 %r2, %r1, 31; + setp.ne.s32 %p1, %r2, 0; + mov.u64 %rd6, 0; + @%p1 bra BB8_2; + + // Callseq Start 0 + { + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd3; + .param .b64 param1; + st.param.b64 [param1+0], %rd4; + .param .b64 retval0; + call.uni (retval0), + cudaGetParameterBuffer, + ( + param0, + param1 + ); + ld.param.b64 %rd6, [retval0+0]; + } + // Callseq End 0 + +BB8_2: + st.param.b64 [func_retval0+0], %rd6; + ret; +} + +.visible .func ISPCLaunch( + .param .b64 ISPCLaunch_param_0, + .param .b64 ISPCLaunch_param_1, + .param .b64 ISPCLaunch_param_2, + .param .b32 ISPCLaunch_param_3, + .param .b32 ISPCLaunch_param_4, + .param .b32 ISPCLaunch_param_5 +) +{ + .reg .pred %p<2>; + .reg .s32 %r<16>; + .reg .s64 %rd<6>; + + + ld.param.u64 %rd1, [ISPCLaunch_param_1]; + ld.param.u64 %rd2, [ISPCLaunch_param_2]; + ld.param.u32 %r1, [ISPCLaunch_param_3]; + ld.param.u32 %r2, [ISPCLaunch_param_4]; + ld.param.u32 %r3, [ISPCLaunch_param_5]; + mov.u32 %r4, %tid.x; + and.b32 %r5, %r4, 31; + setp.ne.s32 %p1, %r5, 0; + @%p1 bra BB9_2; + + add.s32 %r14, %r1, -1; + shr.s32 %r15, %r14, 2; + add.s32 %r7, %r15, 1; + mov.u32 %r12, 1; + mov.u32 %r10, 128; + mov.u32 %r13, 0; + mov.u64 %rd5, 0; + // inline asm + { + .param .b64 param0; + st.param.b64 [param0+0], %rd1; + .param .b64 param1; + st.param.b64 [param1+0], %rd2; + .param .align 4 .b8 param2[12]; + st.param.b32 [param2+0], %r7; + st.param.b32 [param2+4], %r2; + st.param.b32 [param2+8], %r3; + .param .align 4 .b8 param3[12]; + st.param.b32 [param3+0], %r10; + st.param.b32 [param3+4], %r12; + st.param.b32 [param3+8], %r12; + .param .b32 param4; + st.param.b32 [param4+0], %r13; + .param .b64 param5; + st.param.b64 [param5+0], %rd5; + + .param .b32 retval0; + call.uni (retval0), + cudaLaunchDevice, + ( + param0, + param1, + param2, + param3, + param4, + param5 + ); + ld.param.b32 %r6, [retval0+0]; + } + + // inline asm + +BB9_2: + ret; +} + +.visible .func ISPCSync( + .param .b64 ISPCSync_param_0 +) +{ + .reg .s32 %r<2>; + + + // Callseq Start 1 + { + .reg .b32 temp_param_reg; + .param .b32 retval0; + call.uni (retval0), + cudaDeviceSynchronize, + ( + ); + ld.param.b32 %r1, [retval0+0]; + } + // Callseq End 1 + ret; +} + +.visible .func (.param .b64 func_retval0) __warpBinExclusiveScan( + .param .b32 __warpBinExclusiveScan_param_0 +) +{ + .reg .s32 %r<8>; + .reg .s64 %rd<5>; + + + ld.param.u8 %r2, [__warpBinExclusiveScan_param_0]; + // inline asm + { .reg .pred %p1; + setp.ne.u32 %p1, %r2, 0; + vote.ballot.b32 %r1, %p1; + } + // inline asm + // inline asm + popc.b32 %r3, %r1; + // inline asm + // inline asm + mov.u32 %r5, %lanemask_lt; + // inline asm + and.b32 %r7, %r5, %r1; + // inline asm + popc.b32 %r6, %r7; + // inline asm + cvt.u64.u32 %rd1, %r6; + shl.b64 %rd2, %rd1, 32; + cvt.u64.u32 %rd3, %r3; + or.b64 %rd4, %rd2, %rd3; + st.param.b64 [func_retval0+0], %rd4; + ret; +} + +.entry 
stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_( + .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_0, + .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_1, + .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_2, + .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_3, + .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_4, + .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_5, + .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_6, + .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_7, + .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_8, + .param .u64 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_9, + .param .u64 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_10, + .param .u64 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_11, + .param .u64 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_12 +) +{ + .reg .pred %p<14>; + .reg .s32 %r<178>; + .reg .s64 %rd<96>; + .reg .f64 %fd<95>; + + + ld.param.u32 %r42, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_0]; + ld.param.u32 %r43, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_1]; + ld.param.u32 %r44, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_2]; + ld.param.u32 %r45, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_3]; + ld.param.u32 %r46, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_4]; + ld.param.u32 %r47, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_5]; + ld.param.u32 %r48, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_6]; + ld.param.u32 %r49, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_7]; + ld.param.u64 %rd2, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_9]; + ld.param.u64 %rd3, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_10]; + ld.param.u64 %rd4, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_11]; + ld.param.u64 %rd5, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_12]; + mov.u32 %r1, 
%ctaid.x; + shl.b32 %r50, %r1, 2; + mov.u32 %r2, %tid.x; + shr.s32 %r51, %r2, 5; + add.s32 %r52, %r51, %r50; + mov.u32 %r53, %nctaid.x; + shl.b32 %r54, %r53, 2; + setp.ge.s32 %p1, %r52, %r54; + mov.u32 %r55, %nctaid.y; + mov.u32 %r3, %ctaid.y; + setp.ge.s32 %p2, %r3, %r55; + or.pred %p3, %p1, %p2; + mov.u32 %r56, %nctaid.z; + mov.u32 %r4, %ctaid.z; + setp.ge.s32 %p4, %r4, %r56; + or.pred %p5, %p3, %p4; + @%p5 bra BB12_13; + + shl.b32 %r57, %r1, 7; + add.s32 %r58, %r2, %r57; + and.b32 %r59, %r58, -32; + add.s32 %r60, %r59, %r42; + add.s32 %r61, %r60, 32; + min.s32 %r5, %r43, %r61; + shl.b32 %r6, %r3, 3; + add.s32 %r62, %r6, %r44; + add.s32 %r7, %r62, 8; + shl.b32 %r8, %r4, 3; + add.s32 %r172, %r8, %r46; + add.s32 %r63, %r172, 8; + min.s32 %r64, %r47, %r63; + mul.lo.s32 %r10, %r49, %r48; + sub.s32 %r65, %r5, %r60; + shr.s32 %r66, %r65, 31; + shr.u32 %r67, %r66, 27; + add.s32 %r68, %r65, %r67; + and.b32 %r69, %r68, -32; + sub.s32 %r70, %r65, %r69; + sub.s32 %r11, %r5, %r70; + and.b32 %r71, %r2, 31; + cvt.u64.u32 %rd6, %r71; + mov.u64 %rd7, constDeltaForeach1; + add.s64 %rd1, %rd7, %rd6; + setp.ge.s32 %p6, %r172, %r64; + @%p6 bra BB12_13; + + min.s32 %r12, %r45, %r7; + shl.b32 %r15, %r10, 1; + neg.s32 %r16, %r15; + mul.lo.s32 %r17, %r10, 3; + mul.lo.s32 %r18, %r10, -3; + mov.u32 %r72, -9; + sub.s32 %r73, %r72, %r44; + sub.s32 %r74, %r73, %r6; + not.b32 %r75, %r45; + max.s32 %r76, %r74, %r75; + not.b32 %r19, %r76; + sub.s32 %r77, %r72, %r46; + sub.s32 %r78, %r77, %r8; + not.b32 %r79, %r47; + max.s32 %r80, %r78, %r79; + not.b32 %r20, %r80; + ld.global.u8 %r13, [%rd1]; + mov.u32 %r171, %r172; + +BB12_3: + mov.u32 %r21, %r171; + add.s32 %r23, %r21, %r13; + setp.ge.s32 %p7, %r62, %r12; + @%p7 bra BB12_12; + + mul.lo.s32 %r24, %r23, %r10; + mov.u32 %r174, %r62; + mov.u32 %r173, %r62; + +BB12_5: + mov.u32 %r27, %r173; + add.s32 %r30, %r27, %r13; + setp.ge.s32 %p8, %r60, %r11; + mov.u32 %r176, %r60; + @%p8 bra BB12_8; + + mov.u64 %rd9, constDeltaForeach4; + add.s64 %rd10, %rd9, %rd6; + ld.global.u8 %r31, [%rd10]; + mad.lo.s32 %r32, %r30, %r48, %r24; + add.s32 %r177, %r59, %r42; + +BB12_7: + cvta.to.global.u64 %rd11, %rd2; + add.s32 %r98, %r32, %r177; + add.s32 %r99, %r98, %r31; + shl.b32 %r100, %r99, 3; + cvt.s64.s32 %rd12, %r100; + add.s64 %rd13, %rd12, %rd4; + add.s32 %r101, %r100, 8; + cvt.s64.s32 %rd14, %r101; + add.s64 %rd15, %rd14, %rd4; + add.s32 %r102, %r100, -8; + cvt.s64.s32 %rd16, %r102; + add.s64 %rd17, %rd16, %rd4; + add.s32 %r103, %r99, %r48; + shl.b32 %r104, %r103, 3; + cvt.s64.s32 %rd18, %r104; + add.s64 %rd19, %rd18, %rd4; + sub.s32 %r105, %r99, %r48; + shl.b32 %r106, %r105, 3; + cvt.s64.s32 %rd20, %r106; + add.s64 %rd21, %rd20, %rd4; + add.s32 %r108, %r99, %r10; + shl.b32 %r109, %r108, 3; + cvt.s64.s32 %rd22, %r109; + add.s64 %rd23, %rd22, %rd4; + sub.s32 %r110, %r99, %r10; + shl.b32 %r111, %r110, 3; + cvt.s64.s32 %rd24, %r111; + add.s64 %rd25, %rd24, %rd4; + add.s32 %r112, %r100, 16; + cvt.s64.s32 %rd26, %r112; + add.s64 %rd27, %rd26, %rd4; + add.s32 %r113, %r100, -16; + cvt.s64.s32 %rd28, %r113; + add.s64 %rd29, %rd28, %rd4; + shl.b32 %r114, %r48, 1; + add.s32 %r115, %r99, %r114; + shl.b32 %r116, %r115, 3; + cvt.s64.s32 %rd30, %r116; + add.s64 %rd31, %rd30, %rd4; + mad.lo.s32 %r117, %r48, -2, %r99; + shl.b32 %r118, %r117, 3; + cvt.s64.s32 %rd32, %r118; + add.s64 %rd33, %rd32, %rd4; + add.s32 %r119, %r99, %r15; + shl.b32 %r120, %r119, 3; + cvt.s64.s32 %rd34, %r120; + add.s64 %rd35, %rd34, %rd4; + add.s32 %r121, %r99, %r16; + shl.b32 %r122, %r121, 3; + cvt.s64.s32 %rd36, %r122; + 
add.s64 %rd37, %rd36, %rd4; + add.s32 %r123, %r100, 24; + cvt.s64.s32 %rd38, %r123; + add.s64 %rd39, %rd38, %rd4; + add.s32 %r124, %r100, -24; + cvt.s64.s32 %rd40, %r124; + add.s64 %rd41, %rd40, %rd4; + mad.lo.s32 %r125, %r48, 3, %r99; + shl.b32 %r126, %r125, 3; + cvt.s64.s32 %rd42, %r126; + add.s64 %rd43, %rd42, %rd4; + mad.lo.s32 %r127, %r48, -3, %r99; + shl.b32 %r128, %r127, 3; + cvt.s64.s32 %rd44, %r128; + add.s64 %rd45, %rd44, %rd4; + add.s32 %r129, %r99, %r17; + shl.b32 %r130, %r129, 3; + cvt.s64.s32 %rd46, %r130; + add.s64 %rd47, %rd46, %rd4; + add.s32 %r131, %r99, %r18; + shl.b32 %r132, %r131, 3; + cvt.s64.s32 %rd48, %r132; + add.s64 %rd49, %rd48, %rd4; + add.s64 %rd50, %rd12, %rd5; + add.s64 %rd51, %rd12, %rd3; + ld.f64 %fd1, [%rd13]; + add.f64 %fd2, %fd1, %fd1; + ld.f64 %fd3, [%rd50]; + sub.f64 %fd4, %fd2, %fd3; + ld.global.f64 %fd5, [%rd11]; + ld.f64 %fd6, [%rd17]; + ld.f64 %fd7, [%rd15]; + add.f64 %fd8, %fd7, %fd6; + ld.f64 %fd9, [%rd19]; + add.f64 %fd10, %fd8, %fd9; + ld.f64 %fd11, [%rd21]; + add.f64 %fd12, %fd10, %fd11; + ld.f64 %fd13, [%rd23]; + add.f64 %fd14, %fd12, %fd13; + ld.f64 %fd15, [%rd25]; + add.f64 %fd16, %fd14, %fd15; + ld.global.f64 %fd17, [%rd11+8]; + mul.f64 %fd18, %fd17, %fd16; + fma.rn.f64 %fd19, %fd5, %fd1, %fd18; + ld.f64 %fd20, [%rd29]; + ld.f64 %fd21, [%rd27]; + add.f64 %fd22, %fd21, %fd20; + ld.f64 %fd23, [%rd31]; + add.f64 %fd24, %fd22, %fd23; + ld.f64 %fd25, [%rd33]; + add.f64 %fd26, %fd24, %fd25; + ld.f64 %fd27, [%rd35]; + add.f64 %fd28, %fd26, %fd27; + ld.f64 %fd29, [%rd37]; + add.f64 %fd30, %fd28, %fd29; + ld.global.f64 %fd31, [%rd11+16]; + fma.rn.f64 %fd32, %fd31, %fd30, %fd19; + ld.f64 %fd33, [%rd41]; + ld.f64 %fd34, [%rd39]; + add.f64 %fd35, %fd34, %fd33; + ld.f64 %fd36, [%rd43]; + add.f64 %fd37, %fd35, %fd36; + ld.f64 %fd38, [%rd45]; + add.f64 %fd39, %fd37, %fd38; + ld.f64 %fd40, [%rd47]; + add.f64 %fd41, %fd39, %fd40; + ld.f64 %fd42, [%rd49]; + add.f64 %fd43, %fd41, %fd42; + ld.global.f64 %fd44, [%rd11+24]; + fma.rn.f64 %fd45, %fd44, %fd43, %fd32; + ld.f64 %fd46, [%rd51]; + fma.rn.f64 %fd47, %fd46, %fd45, %fd4; + st.f64 [%rd50], %fd47; + add.s32 %r177, %r177, 32; + setp.lt.s32 %p9, %r177, %r11; + mov.u32 %r175, %r177; + mov.u32 %r176, %r175; + @%p9 bra BB12_7; + +BB12_8: + mov.u32 %r36, %r176; + setp.ge.s32 %p10, %r36, %r5; + @%p10 bra BB12_11; + + mov.u64 %rd53, constDeltaForeach4; + add.s64 %rd54, %rd53, %rd6; + ld.global.u8 %r135, [%rd54]; + add.s32 %r37, %r36, %r135; + setp.ge.s32 %p11, %r37, %r5; + @%p11 bra BB12_11; + + cvta.to.global.u64 %rd55, %rd2; + mad.lo.s32 %r136, %r30, %r48, %r24; + add.s32 %r137, %r136, %r37; + shl.b32 %r138, %r137, 3; + cvt.s64.s32 %rd56, %r138; + add.s64 %rd57, %rd56, %rd4; + add.s32 %r139, %r138, 8; + cvt.s64.s32 %rd58, %r139; + add.s64 %rd59, %rd58, %rd4; + add.s32 %r140, %r138, -8; + cvt.s64.s32 %rd60, %r140; + add.s64 %rd61, %rd60, %rd4; + add.s32 %r141, %r137, %r48; + shl.b32 %r142, %r141, 3; + cvt.s64.s32 %rd62, %r142; + add.s64 %rd63, %rd62, %rd4; + sub.s32 %r143, %r137, %r48; + shl.b32 %r144, %r143, 3; + cvt.s64.s32 %rd64, %r144; + add.s64 %rd65, %rd64, %rd4; + add.s32 %r146, %r137, %r10; + shl.b32 %r147, %r146, 3; + cvt.s64.s32 %rd66, %r147; + add.s64 %rd67, %rd66, %rd4; + sub.s32 %r148, %r137, %r10; + shl.b32 %r149, %r148, 3; + cvt.s64.s32 %rd68, %r149; + add.s64 %rd69, %rd68, %rd4; + add.s32 %r150, %r138, 16; + cvt.s64.s32 %rd70, %r150; + add.s64 %rd71, %rd70, %rd4; + add.s32 %r151, %r138, -16; + cvt.s64.s32 %rd72, %r151; + add.s64 %rd73, %rd72, %rd4; + shl.b32 %r152, %r48, 1; + add.s32 %r153, %r137, 
%r152; + shl.b32 %r154, %r153, 3; + cvt.s64.s32 %rd74, %r154; + add.s64 %rd75, %rd74, %rd4; + mad.lo.s32 %r155, %r48, -2, %r137; + shl.b32 %r156, %r155, 3; + cvt.s64.s32 %rd76, %r156; + add.s64 %rd77, %rd76, %rd4; + add.s32 %r157, %r137, %r15; + shl.b32 %r158, %r157, 3; + cvt.s64.s32 %rd78, %r158; + add.s64 %rd79, %rd78, %rd4; + add.s32 %r159, %r137, %r16; + shl.b32 %r160, %r159, 3; + cvt.s64.s32 %rd80, %r160; + add.s64 %rd81, %rd80, %rd4; + add.s32 %r161, %r138, 24; + cvt.s64.s32 %rd82, %r161; + add.s64 %rd83, %rd82, %rd4; + add.s32 %r162, %r138, -24; + cvt.s64.s32 %rd84, %r162; + add.s64 %rd85, %rd84, %rd4; + mad.lo.s32 %r163, %r48, 3, %r137; + shl.b32 %r164, %r163, 3; + cvt.s64.s32 %rd86, %r164; + add.s64 %rd87, %rd86, %rd4; + mad.lo.s32 %r165, %r48, -3, %r137; + shl.b32 %r166, %r165, 3; + cvt.s64.s32 %rd88, %r166; + add.s64 %rd89, %rd88, %rd4; + add.s32 %r167, %r137, %r17; + shl.b32 %r168, %r167, 3; + cvt.s64.s32 %rd90, %r168; + add.s64 %rd91, %rd90, %rd4; + add.s32 %r169, %r137, %r18; + shl.b32 %r170, %r169, 3; + cvt.s64.s32 %rd92, %r170; + add.s64 %rd93, %rd92, %rd4; + add.s64 %rd94, %rd56, %rd5; + add.s64 %rd95, %rd56, %rd3; + ld.f64 %fd48, [%rd57]; + add.f64 %fd49, %fd48, %fd48; + ld.f64 %fd50, [%rd94]; + sub.f64 %fd51, %fd49, %fd50; + ld.global.f64 %fd52, [%rd55]; + ld.f64 %fd53, [%rd61]; + ld.f64 %fd54, [%rd59]; + add.f64 %fd55, %fd54, %fd53; + ld.f64 %fd56, [%rd63]; + add.f64 %fd57, %fd55, %fd56; + ld.f64 %fd58, [%rd65]; + add.f64 %fd59, %fd57, %fd58; + ld.f64 %fd60, [%rd67]; + add.f64 %fd61, %fd59, %fd60; + ld.f64 %fd62, [%rd69]; + add.f64 %fd63, %fd61, %fd62; + ld.global.f64 %fd64, [%rd55+8]; + mul.f64 %fd65, %fd64, %fd63; + fma.rn.f64 %fd66, %fd52, %fd48, %fd65; + ld.f64 %fd67, [%rd73]; + ld.f64 %fd68, [%rd71]; + add.f64 %fd69, %fd68, %fd67; + ld.f64 %fd70, [%rd75]; + add.f64 %fd71, %fd69, %fd70; + ld.f64 %fd72, [%rd77]; + add.f64 %fd73, %fd71, %fd72; + ld.f64 %fd74, [%rd79]; + add.f64 %fd75, %fd73, %fd74; + ld.f64 %fd76, [%rd81]; + add.f64 %fd77, %fd75, %fd76; + ld.global.f64 %fd78, [%rd55+16]; + fma.rn.f64 %fd79, %fd78, %fd77, %fd66; + ld.f64 %fd80, [%rd85]; + ld.f64 %fd81, [%rd83]; + add.f64 %fd82, %fd81, %fd80; + ld.f64 %fd83, [%rd87]; + add.f64 %fd84, %fd82, %fd83; + ld.f64 %fd85, [%rd89]; + add.f64 %fd86, %fd84, %fd85; + ld.f64 %fd87, [%rd91]; + add.f64 %fd88, %fd86, %fd87; + ld.f64 %fd89, [%rd93]; + add.f64 %fd90, %fd88, %fd89; + ld.global.f64 %fd91, [%rd55+24]; + fma.rn.f64 %fd92, %fd91, %fd90, %fd79; + ld.f64 %fd93, [%rd95]; + fma.rn.f64 %fd94, %fd92, %fd93, %fd51; + st.f64 [%rd94], %fd94; + +BB12_11: + add.s32 %r39, %r174, 1; + setp.ne.s32 %p12, %r39, %r19; + mov.u32 %r174, %r39; + mov.u32 %r173, %r39; + @%p12 bra BB12_5; + +BB12_12: + add.s32 %r171, %r172, 1; + setp.ne.s32 %p13, %r171, %r20; + mov.u32 %r172, %r171; + @%p13 bra BB12_3; + +BB12_13: + ret; +} + +.visible .func loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E_( + .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_0, + .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_1, + .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_2, + .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_3, + .param .b32 
loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_4, + .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_5, + .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_6, + .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_7, + .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_8, + .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_9, + .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_10, + .param .b64 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_11, + .param .b64 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_12, + .param .b64 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_13, + .param .b64 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_14, + .param .align 1 .b8 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_15[1] +) +{ + .reg .pred %p<9>; + .reg .s32 %r<63>; + .reg .s64 %rd<18>; + + + ld.param.u32 %r62, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_0]; + ld.param.u32 %r12, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_1]; + ld.param.u32 %r13, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_2]; + ld.param.u32 %r14, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_3]; + ld.param.u32 %r15, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_4]; + ld.param.u32 %r16, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_5]; + ld.param.u32 %r17, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_6]; + ld.param.u32 %r18, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_7]; + ld.param.u32 %r19, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_8]; + ld.param.u32 %r20, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_9]; + ld.param.u32 %r21, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_10]; + ld.param.u64 %rd4, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_11]; + ld.param.u64 %rd5, 
[loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_12]; + ld.param.u64 %rd6, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_13]; + ld.param.u64 %rd7, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_14]; + setp.ge.s32 %p1, %r62, %r12; + @%p1 bra BB13_14; + + mov.u32 %r22, 31; + sub.s32 %r23, %r22, %r13; + add.s32 %r24, %r23, %r14; + shr.s32 %r25, %r24, 31; + shr.u32 %r26, %r25, 27; + add.s32 %r27, %r24, %r26; + shr.s32 %r28, %r27, 5; + mov.u32 %r29, 7; + sub.s32 %r30, %r29, %r15; + add.s32 %r31, %r30, %r16; + shr.s32 %r32, %r31, 31; + shr.u32 %r33, %r32, 29; + add.s32 %r34, %r31, %r33; + shr.s32 %r1, %r34, 3; + sub.s32 %r35, %r29, %r17; + add.s32 %r36, %r35, %r18; + shr.s32 %r37, %r36, 31; + shr.u32 %r38, %r37, 29; + add.s32 %r39, %r36, %r38; + shr.s32 %r2, %r39, 3; + add.s32 %r40, %r28, -1; + shr.s32 %r41, %r40, 2; + add.s32 %r3, %r41, 1; + mov.u32 %r42, %tid.x; + and.b32 %r4, %r42, 31; + sub.s32 %r61, %r62, %r12; + +BB13_2: + and.b32 %r8, %r62, 1; + setp.ne.s32 %p2, %r4, 0; + mov.u64 %rd17, 0; + @%p2 bra BB13_4; + + mov.u64 %rd9, 8; + mov.u64 %rd10, 72; + // Callseq Start 2 + { + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd9; + .param .b64 param1; + st.param.b64 [param1+0], %rd10; + .param .b64 retval0; + call.uni (retval0), + cudaGetParameterBuffer, + ( + param0, + param1 + ); + ld.param.b64 %rd17, [retval0+0]; + } + // Callseq End 2 + +BB13_4: + setp.eq.s32 %p3, %r8, 0; + @%p3 bra BB13_9; + + setp.eq.s64 %p4, %rd17, 0; + @%p4 bra BB13_7; + + st.u32 [%rd17], %r13; + st.u32 [%rd17+4], %r14; + st.u32 [%rd17+8], %r15; + st.u32 [%rd17+12], %r16; + st.u32 [%rd17+16], %r17; + st.u32 [%rd17+20], %r18; + st.u32 [%rd17+24], %r19; + st.u32 [%rd17+28], %r20; + st.u32 [%rd17+32], %r21; + st.u64 [%rd17+40], %rd4; + st.u64 [%rd17+48], %rd5; + st.u64 [%rd17+56], %rd7; + st.u64 [%rd17+64], %rd6; + +BB13_7: + @%p2 bra BB13_13; + + mov.u32 %r47, 128; + mov.u32 %r49, 1; + mov.u32 %r50, 0; + mov.u64 %rd13, 0; + mov.u64 %rd11, stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_; + // inline asm + { + .param .b64 param0; + st.param.b64 [param0+0], %rd11; + .param .b64 param1; + st.param.b64 [param1+0], %rd17; + .param .align 4 .b8 param2[12]; + st.param.b32 [param2+0], %r3; + st.param.b32 [param2+4], %r1; + st.param.b32 [param2+8], %r2; + .param .align 4 .b8 param3[12]; + st.param.b32 [param3+0], %r47; + st.param.b32 [param3+4], %r49; + st.param.b32 [param3+8], %r49; + .param .b32 param4; + st.param.b32 [param4+0], %r50; + .param .b64 param5; + st.param.b64 [param5+0], %rd13; + + .param .b32 retval0; + call.uni (retval0), + cudaLaunchDevice, + ( + param0, + param1, + param2, + param3, + param4, + param5 + ); + ld.param.b32 %r43, [retval0+0]; + } + + // inline asm + bra.uni BB13_13; + +BB13_9: + setp.eq.s64 %p6, %rd17, 0; + @%p6 bra BB13_11; + + st.u32 [%rd17], %r13; + st.u32 [%rd17+4], %r14; + st.u32 [%rd17+8], %r15; + st.u32 [%rd17+12], %r16; + st.u32 [%rd17+16], %r17; + st.u32 [%rd17+20], %r18; + st.u32 [%rd17+24], %r19; + st.u32 [%rd17+28], %r20; + st.u32 [%rd17+32], %r21; + st.u64 [%rd17+40], %rd4; + st.u64 [%rd17+48], %rd5; + st.u64 [%rd17+56], %rd6; + st.u64 [%rd17+64], %rd7; + +BB13_11: + @%p2 bra BB13_13; + + mov.u32 %r55, 128; + mov.u32 %r57, 1; + mov.u32 %r58, 0; + mov.u64 %rd16, 0; + mov.u64 %rd14, 
stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_; + // inline asm + { + .param .b64 param0; + st.param.b64 [param0+0], %rd14; + .param .b64 param1; + st.param.b64 [param1+0], %rd17; + .param .align 4 .b8 param2[12]; + st.param.b32 [param2+0], %r3; + st.param.b32 [param2+4], %r1; + st.param.b32 [param2+8], %r2; + .param .align 4 .b8 param3[12]; + st.param.b32 [param3+0], %r55; + st.param.b32 [param3+4], %r57; + st.param.b32 [param3+8], %r57; + .param .b32 param4; + st.param.b32 [param4+0], %r58; + .param .b64 param5; + st.param.b64 [param5+0], %rd16; + + .param .b32 retval0; + call.uni (retval0), + cudaLaunchDevice, + ( + param0, + param1, + param2, + param3, + param4, + param5 + ); + ld.param.b32 %r51, [retval0+0]; + } + + // inline asm + +BB13_13: + // Callseq Start 3 + { + .reg .b32 temp_param_reg; + .param .b32 retval0; + call.uni (retval0), + cudaDeviceSynchronize, + ( + ); + ld.param.b32 %r59, [retval0+0]; + } + // Callseq End 3 + add.s32 %r62, %r62, 1; + add.s32 %r61, %r61, 1; + setp.ne.s32 %p8, %r61, 0; + @%p8 bra BB13_2; + +BB13_14: + // Callseq Start 4 + { + .reg .b32 temp_param_reg; + .param .b32 retval0; + call.uni (retval0), + cudaDeviceSynchronize, + ( + ); + ld.param.b32 %r60, [retval0+0]; + } + // Callseq End 4 + ret; +} + +.visible .entry loop_stencil_ispc_tasks( + .param .u32 loop_stencil_ispc_tasks_param_0, + .param .u32 loop_stencil_ispc_tasks_param_1, + .param .u32 loop_stencil_ispc_tasks_param_2, + .param .u32 loop_stencil_ispc_tasks_param_3, + .param .u32 loop_stencil_ispc_tasks_param_4, + .param .u32 loop_stencil_ispc_tasks_param_5, + .param .u32 loop_stencil_ispc_tasks_param_6, + .param .u32 loop_stencil_ispc_tasks_param_7, + .param .u32 loop_stencil_ispc_tasks_param_8, + .param .u32 loop_stencil_ispc_tasks_param_9, + .param .u32 loop_stencil_ispc_tasks_param_10, + .param .u64 loop_stencil_ispc_tasks_param_11, + .param .u64 loop_stencil_ispc_tasks_param_12, + .param .u64 loop_stencil_ispc_tasks_param_13, + .param .u64 loop_stencil_ispc_tasks_param_14 +) +{ + .reg .pred %p<9>; + .reg .s32 %r<63>; + .reg .s64 %rd<18>; + + + ld.param.u32 %r62, [loop_stencil_ispc_tasks_param_0]; + ld.param.u32 %r12, [loop_stencil_ispc_tasks_param_1]; + ld.param.u32 %r13, [loop_stencil_ispc_tasks_param_2]; + ld.param.u32 %r14, [loop_stencil_ispc_tasks_param_3]; + ld.param.u32 %r15, [loop_stencil_ispc_tasks_param_4]; + ld.param.u32 %r16, [loop_stencil_ispc_tasks_param_5]; + ld.param.u32 %r17, [loop_stencil_ispc_tasks_param_6]; + ld.param.u32 %r18, [loop_stencil_ispc_tasks_param_7]; + ld.param.u32 %r19, [loop_stencil_ispc_tasks_param_8]; + ld.param.u32 %r20, [loop_stencil_ispc_tasks_param_9]; + ld.param.u32 %r21, [loop_stencil_ispc_tasks_param_10]; + ld.param.u64 %rd4, [loop_stencil_ispc_tasks_param_11]; + ld.param.u64 %rd5, [loop_stencil_ispc_tasks_param_12]; + ld.param.u64 %rd6, [loop_stencil_ispc_tasks_param_13]; + ld.param.u64 %rd7, [loop_stencil_ispc_tasks_param_14]; + setp.ge.s32 %p1, %r62, %r12; + @%p1 bra BB14_14; + + mov.u32 %r22, 31; + sub.s32 %r23, %r22, %r13; + add.s32 %r24, %r23, %r14; + shr.s32 %r25, %r24, 31; + shr.u32 %r26, %r25, 27; + add.s32 %r27, %r24, %r26; + shr.s32 %r28, %r27, 5; + mov.u32 %r29, 7; + sub.s32 %r30, %r29, %r15; + add.s32 %r31, %r30, %r16; + shr.s32 %r32, %r31, 31; + shr.u32 %r33, %r32, 29; + add.s32 %r34, %r31, %r33; + shr.s32 %r1, %r34, 3; + sub.s32 %r35, %r29, %r17; + add.s32 %r36, %r35, %r18; + shr.s32 %r37, %r36, 31; + shr.u32 %r38, %r37, 29; + add.s32 %r39, %r36, %r38; + shr.s32 %r2, %r39, 3; 
+ add.s32 %r40, %r28, -1; + shr.s32 %r41, %r40, 2; + add.s32 %r3, %r41, 1; + mov.u32 %r42, %tid.x; + and.b32 %r4, %r42, 31; + sub.s32 %r61, %r62, %r12; + +BB14_2: + and.b32 %r8, %r62, 1; + setp.ne.s32 %p2, %r4, 0; + mov.u64 %rd17, 0; + @%p2 bra BB14_4; + + mov.u64 %rd9, 8; + mov.u64 %rd10, 72; + // Callseq Start 5 + { + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd9; + .param .b64 param1; + st.param.b64 [param1+0], %rd10; + .param .b64 retval0; + call.uni (retval0), + cudaGetParameterBuffer, + ( + param0, + param1 + ); + ld.param.b64 %rd17, [retval0+0]; + } + // Callseq End 5 + +BB14_4: + setp.eq.s32 %p3, %r8, 0; + @%p3 bra BB14_9; + + setp.eq.s64 %p4, %rd17, 0; + @%p4 bra BB14_7; + + st.u32 [%rd17], %r13; + st.u32 [%rd17+4], %r14; + st.u32 [%rd17+8], %r15; + st.u32 [%rd17+12], %r16; + st.u32 [%rd17+16], %r17; + st.u32 [%rd17+20], %r18; + st.u32 [%rd17+24], %r19; + st.u32 [%rd17+28], %r20; + st.u32 [%rd17+32], %r21; + st.u64 [%rd17+40], %rd4; + st.u64 [%rd17+48], %rd5; + st.u64 [%rd17+56], %rd7; + st.u64 [%rd17+64], %rd6; + +BB14_7: + @%p2 bra BB14_13; + + mov.u32 %r47, 128; + mov.u32 %r49, 1; + mov.u32 %r50, 0; + mov.u64 %rd13, 0; + mov.u64 %rd11, stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_; + // inline asm + { + .param .b64 param0; + st.param.b64 [param0+0], %rd11; + .param .b64 param1; + st.param.b64 [param1+0], %rd17; + .param .align 4 .b8 param2[12]; + st.param.b32 [param2+0], %r3; + st.param.b32 [param2+4], %r1; + st.param.b32 [param2+8], %r2; + .param .align 4 .b8 param3[12]; + st.param.b32 [param3+0], %r47; + st.param.b32 [param3+4], %r49; + st.param.b32 [param3+8], %r49; + .param .b32 param4; + st.param.b32 [param4+0], %r50; + .param .b64 param5; + st.param.b64 [param5+0], %rd13; + + .param .b32 retval0; + call.uni (retval0), + cudaLaunchDevice, + ( + param0, + param1, + param2, + param3, + param4, + param5 + ); + ld.param.b32 %r43, [retval0+0]; + } + + // inline asm + bra.uni BB14_13; + +BB14_9: + setp.eq.s64 %p6, %rd17, 0; + @%p6 bra BB14_11; + + st.u32 [%rd17], %r13; + st.u32 [%rd17+4], %r14; + st.u32 [%rd17+8], %r15; + st.u32 [%rd17+12], %r16; + st.u32 [%rd17+16], %r17; + st.u32 [%rd17+20], %r18; + st.u32 [%rd17+24], %r19; + st.u32 [%rd17+28], %r20; + st.u32 [%rd17+32], %r21; + st.u64 [%rd17+40], %rd4; + st.u64 [%rd17+48], %rd5; + st.u64 [%rd17+56], %rd6; + st.u64 [%rd17+64], %rd7; + +BB14_11: + @%p2 bra BB14_13; + + mov.u32 %r55, 128; + mov.u32 %r57, 1; + mov.u32 %r58, 0; + mov.u64 %rd16, 0; + mov.u64 %rd14, stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_; + // inline asm + { + .param .b64 param0; + st.param.b64 [param0+0], %rd14; + .param .b64 param1; + st.param.b64 [param1+0], %rd17; + .param .align 4 .b8 param2[12]; + st.param.b32 [param2+0], %r3; + st.param.b32 [param2+4], %r1; + st.param.b32 [param2+8], %r2; + .param .align 4 .b8 param3[12]; + st.param.b32 [param3+0], %r55; + st.param.b32 [param3+4], %r57; + st.param.b32 [param3+8], %r57; + .param .b32 param4; + st.param.b32 [param4+0], %r58; + .param .b64 param5; + st.param.b64 [param5+0], %rd16; + + .param .b32 retval0; + call.uni (retval0), + cudaLaunchDevice, + ( + param0, + param1, + param2, + param3, + param4, + param5 + ); + ld.param.b32 %r51, [retval0+0]; + } + + // inline asm + +BB14_13: + // Callseq Start 6 + { + .reg .b32 temp_param_reg; + .param .b32 retval0; + call.uni (retval0), + cudaDeviceSynchronize, + ( + ); + ld.param.b32 %r59, [retval0+0]; + } + // 
Callseq End 6 + add.s32 %r62, %r62, 1; + add.s32 %r61, %r61, 1; + setp.ne.s32 %p8, %r61, 0; + @%p8 bra BB14_2; + +BB14_14: + // Callseq Start 7 + { + .reg .b32 temp_param_reg; + .param .b32 retval0; + call.uni (retval0), + cudaDeviceSynchronize, + ( + ); + ld.param.b32 %r60, [retval0+0]; + } + // Callseq End 7 + ret; +} + + + diff --git a/examples_cuda/stencil/libcudadevrt.a b/examples_cuda/stencil/libcudadevrt.a new file mode 100644 index 0000000000000000000000000000000000000000..6cf40658ca04fd19e705db4b142117eea50c64f6 GIT binary patch literal 137338 zcmeFZc|25q^gm9rWSdm>P)P_;6tWD-UQ`sa3<-&{@57KSC1kBEBazA)BI{T}C`3`n zj4itv>+ExXSM^?dKYowz_xtJlczpjbkDB|s=e*9j=XLIDUgx}?_qh92?L6(Tsk6%N z`{(DVypp`4+zCa+eP{jsd*8mJN0ojG*s{{m(QTunqvzcG4}&?{d0%n9>E~(Z z4m+*@^S&&?O<0Oe4XtbV4glVy`673!0g@KkHVat zVaFX$C@af5IMTsjT1MwCn8Ge;Uog=(Hi8*lJbOy}0t`k+C;M-c|2cqP&WxZGx!qjGX56hM7PK_AI|cZ4k~Li>!)HhOx{ z89zSsbn0|}o&7)fL;t)FMMI2UQFlDEBtY2 zKYftzy8zhO`A380M`QcJ&Ew4XlRwp72JbMaRxbwqQ}dr1K;8eVO7g%<{g0L4cKrq= zDxgRI$CgC#{stu<_#@B_Wc(K}ogL zZ&0!(1FjJ<=)cy7|C3*FkL+)-1q8hx_ z;4iTeT$6reA~Buq+YfFgO#jM7{+|>0nXPS(Vf%lLVbkCrV+en<&$7ia0Zhu2^kU!# z<@D}te#Lknl+cSYJ-Dx7bKpn&^kU8&Oa{JHrdxU*(~7KpJNsI$+#n52nL+3c>O)@ZpG`( zh+B6s8w|8t@%rz%jL%m7xH%`b^2dGOXT{cj=Fi-6Yd`a+E4KDCTl?#OZ}+$M*IRhc z_uO&|?}_}sy*`3~BZw7l0%+jlVef-{ApE%JXSfD2$DOA^^pXAa-_b`l5Pg(rafFC~ zIQXU*->>-N*G>AKANb?na6F&ZP5RE7&8OFY$M<}SHtFm?aLK>-PuZli)PN5)|Ir^3 z8SoPil>32M{!IoFY5kL@RCDkbInAFujUQ|7Kjg%I^7Lw?f05hqlXvb1-uVx?#qWR| z{GfS_!Y^_ke)4R8V4?qzd-aodtp-=k$U@2_IKRUd{g;9*i%1{NANNBFUVu(59CpU zZuagkc;nXm10A-Bvu(}4t@-!Qb--t9{o7jqw&E|iHg3gV&*hEB}KZ+{*v9 z_8*&hSobU1Ew}%=9l+iGNie%KgWa6WC9rPU$}?6aT0_)c6lH?vH9Oz5hVt{&-sVvug1l zXxtywhvxr*#{E%!X!{=(jek@hUi%Lm?niYE{Gj`PR5T*KgK(SK->(Rq{ZD`{6!h

zTR{Jj(r>5qO%!K*K#vIu5|=p;p~SGgXd?D&V9xl(93u$VrZ|&l2AY^O5h%xr(kCJv zDThaKrW|G)lU4xbtfllxNJq+vr8rYg8B+*KCxCL!Qu^PJj+B#wIWyZw@k*eaULBDL zC8o~(Xd>m90^AAUe1Jy+JQ?7b051V}9l+)B^I2@av;b}ea5sR51AHgIvjAQW@J4{E z;OEHLehmk>1LjQsXY!~s3n)2J`U7a4#QiA6>Hbgz@G5|lMVHUXRUf0qzU%D1aXZ_%(n(#hmm9^4oz`l%DPnCioFK zw*4*u4+VHEz*7KDe&>c=ZyBI(0JtK4*p6L}4#2Ge?g{V+fF}d|0>B>vyb0i{_&H6s zUn2lM9pFKjlm4KH$A?f#PxptL6sO0B8h}gV=RnzZasf`BCuQsH0lg2vBLTh_;Mo8# z19$_#74a8iw*5K)w*$Bzz&8Lq72wwa{tDod__uL0f>{Z5Ric=g4c^oK@#9ZHkZ)BRx?#p(W#4Dbwq7XiEq;2iY3 zHKOcS1-KEw9Rbb0G&06&2_=?~<(WCo?D z`$H#H(RS(p+zQ|>01pLtEWlF%o)7R!fOF9A4~c4jRe&1-+!5e>fX4v*2*3*gUJdYH z(C;LP@@pW#%`hkZf&7-a6{V;9LlVX5@gW=FB>=AhxHS5GCQwx0`dV}LsWJQ(1y08a&YKENvh&KW4$FI9jW0emXJ=L38*z)u3a z5OdNWa&dpSP3h_Wp!Tb1zf1w{1aLmUBLSWa@JxW00K5+1@>-(((gL^@z})~I4)8>P zX8`;jz&`?9X^?2Y3;=F}Iq46XWPG6Xbbm;qI6V$z1H1&_H2{~^7HvNl;Kl%V0(daM zV*#ED@O*$*0-Q5gv|p+KHv;%nfX@f`W`Lgrcp>JbKb$1v1Er_?gPM+Lzf1w{1aLmU zBLSWa@JxW00K5+1^17n^(gL^@z})~I4)8>PX8`;jz&`?9Nl&z21^~Cgob-oOGCojx zx<4dQyxI5w@DhO609;yMwEaB+J{;gv03HbNXn-FA_!WRZ0r*#d_b?FcFZzWhL6SnB zbURN5xF6;uj(c$&gD5>6$MY1Y%dY@<1Hk2nh_+J`;G+OO9pLi;9tZFv0KW$C=KybG zDB7={03QzUDF6=vcr@lDj)^#qnu^m@K}JS06Yue4*>oV;0mKe+pi7q@c{P#_zHmk2Jo{0zYFjW0GBfn`bFZn7{{>- z=FIsL9mkmzr^nTBfF}Sv4dD3ze+cl;0PkulI$nAJw*|O2z*hr&H^8$1{s7<~0j@Nf zBS`G?SZ&Nn9D{Kj^(j3a$0Zb}IM&UW7S` zqcx7>AC#Vsqnfp7zf1w{1aLmUBLSWa@JxW00K5+1@;0LV(gL^@z})~I4)8>PX8`;j zz-urkaWuzq{6OjHIF7Ux?XL^KLjfKO@DzY&1H2624FFfP6K%f^z^wu93GfJjCj&ea z;3WXB2e`bw&@U24V;n~%%%#v~I*v0bPLHeM08apT8o=`bUIFk%fGbZF9WMib+XLJO z;E@2|3-D}!mjS#1;N2#P_DctI5=R3Z$03xSj^h%F({W4&cm}|W0A2-fj)Q3XRRL}U za7Tdi0UibL6oBUfyaM2j09SSt?e9>4+hb1RsDQnvE#YZhg?blOIpqD>p)79NU8s*`aq|9b^7}BP{2~>hKAWW{@6Rq_ zahIM#eGQ9~_h(Bp1*1gDuiy$+Ib0U6!Q6<&UHS_3_AJiF+>^yC`U&;nEM9|o0*kw7 z3iW9$&c{5T#VZC1^%X2$gLwmsyWky2UQ(Ek+00CQ%r#lOVz5wW%HlPcJF&Qno>0$c zaX#jeEFNwk)F-nzd4F~Wiznm%kCNZ9A@-B^XIHWG*<{CMP6SXQdh-5kWfrf%{|C}x z@fysnSe%2e*SoMdd4F~&i*xby;#d|Z@6S$QansR4yR%uGyg$2y#a+yV`WhA|@6VRT zgB@vya0{WH%i`qy*+wj$Y$?>+vp9KwwkL~c= taskCount) return; + if (taskIndex0 >= taskCount0 || + taskIndex1 >= taskCount1 || + taskIndex2 >= taskCount2) + return; - stencil_step(x0, x1, y0, y1, z0+taskIndex, z0+taskIndex+1, - Nx, Ny, Nz, coef, vsq, Ain, Aout); + const uniform int xfirst = x0 + taskIndex0 * SPANX; + const uniform int xlast = min(x1, xfirst + SPANX); + + const uniform int yfirst = y0 + taskIndex1 * SPANY; + const uniform int ylast = min(y1, yfirst + SPANY); + + const uniform int zfirst = z0 + taskIndex2 * SPANZ; + const uniform int zlast = min(z1, zfirst + SPANZ); + + stencil_step(xfirst,xlast, yfirst,ylast, zfirst,zlast, + Nx, Ny, Nz, coef, vsq, Ain, Aout); } + export void loop_stencil_ispc_tasks(uniform int t0, uniform int t1, uniform int x0, uniform int x1, @@ -134,39 +103,24 @@ loop_stencil_ispc_tasks(uniform int t0, uniform int t1, uniform const double vsq[], uniform double Aeven[], uniform double Aodd[]) { - for (uniform int t = t0; t < t1; ++t) { - // Parallelize across cores as well: each task will work on a slice - // of 1 in the z extent of the volume. - if ((t & 1) == 0) - launch[z1-z0] stencil_step_task(x0, x1, y0, y1, z0, Nx, Ny, Nz, - coef, vsq, Aeven, Aodd); - else - launch[z1-z0] stencil_step_task(x0, x1, y0, y1, z0, Nx, Ny, Nz, - coef, vsq, Aodd, Aeven); +#define NB(x,n) (((x)+(n)-1)/(n)) - // We need to wait for all of the launched tasks to finish before - // starting the next iteration. - sync; + for (uniform int t = t0; t < t1; ++t) + { + // Parallelize across cores as well: each task will work on a slice + // of 1 in the z extent of the volume. 
+ if ((t & 1) == 0) + launch[NB(z1-z0,SPANZ)][NB(y1-y0,SPANY)][NB(x1-x0,SPANX)] + stencil_step_task(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, + coef, vsq, Aeven, Aodd); + else + launch[NB(z1-z0,SPANZ)][NB(y1-y0,SPANY)][NB(x1-x0,SPANX)] + stencil_step_task(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, + coef, vsq, Aodd, Aeven); + + // We need to wait for all of the launched tasks to finish before + // starting the next iteration. + sync; } } - -export void -loop_stencil_ispc(uniform int t0, uniform int t1, - uniform int x0, uniform int x1, - uniform int y0, uniform int y1, - uniform int z0, uniform int z1, - uniform int Nx, uniform int Ny, uniform int Nz, - uniform const double coef[4], - uniform const double vsq[], - uniform double Aeven[], uniform double Aodd[]) -{ - for (uniform int t = t0; t < t1; ++t) { - if ((t & 1) == 0) - stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq, - Aeven, Aodd); - else - stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq, - Aodd, Aeven); - } -} diff --git a/examples_cuda/stencil/stencil.ptx b/examples_cuda/stencil/stencil.ptx deleted file mode 100644 index e3dcd1ca..00000000 --- a/examples_cuda/stencil/stencil.ptx +++ /dev/null @@ -1,267 +0,0 @@ -// -// Generated by NVIDIA NVVM Compiler -// Compiler built on Thu Jul 18 02:37:37 2013 (1374107857) -// Cuda compilation tools, release 5.5, V5.5.0 -// - -.version 3.2 -.target sm_35 -.address_size 64 - - .file 1 "/home/evghenii/soft/ispc-code/ispc/examples/stencil/stencil.cu", 1383254912, 2112 - .file 2 "/usr/local/cuda-5.5/bin/..//include/cuda_device_runtime_api.h", 1375338991, 7655 - -.weak .func (.param .b32 func_retval0) cudaMalloc( - .param .b64 cudaMalloc_param_0, - .param .b64 cudaMalloc_param_1 -) -{ - .reg .s32 %r<2>; - - - mov.u32 %r1, 30; - st.param.b32 [func_retval0+0], %r1; - .loc 2 66 3 - ret; -} - -.weak .func (.param .b32 func_retval0) cudaFuncGetAttributes( - .param .b64 cudaFuncGetAttributes_param_0, - .param .b64 cudaFuncGetAttributes_param_1 -) -{ - .reg .s32 %r<2>; - - - mov.u32 %r1, 30; - st.param.b32 [func_retval0+0], %r1; - .loc 2 71 3 - ret; -} - -.visible .entry stencil_step_task( - .param .u32 stencil_step_task_param_0, - .param .u32 stencil_step_task_param_1, - .param .u32 stencil_step_task_param_2, - .param .u32 stencil_step_task_param_3, - .param .u32 stencil_step_task_param_4, - .param .u32 stencil_step_task_param_5, - .param .u32 stencil_step_task_param_6, - .param .u32 stencil_step_task_param_7, - .param .u64 stencil_step_task_param_8, - .param .u64 stencil_step_task_param_9, - .param .u64 stencil_step_task_param_10, - .param .u64 stencil_step_task_param_11 -) -{ - .reg .pred %p<8>; - .reg .s32 %r<54>; - .reg .s64 %rd<36>; - .reg .f64 %fd<48>; - - - ld.param.u32 %r19, [stencil_step_task_param_0]; - ld.param.u32 %r20, [stencil_step_task_param_1]; - ld.param.u32 %r21, [stencil_step_task_param_2]; - ld.param.u32 %r22, [stencil_step_task_param_3]; - ld.param.u32 %r23, [stencil_step_task_param_4]; - ld.param.u32 %r24, [stencil_step_task_param_5]; - ld.param.u32 %r25, [stencil_step_task_param_6]; - ld.param.u64 %rd4, [stencil_step_task_param_8]; - ld.param.u64 %rd1, [stencil_step_task_param_9]; - ld.param.u64 %rd2, [stencil_step_task_param_10]; - ld.param.u64 %rd3, [stencil_step_task_param_11]; - cvta.to.global.u64 %rd5, %rd4; - .loc 1 59 1 - mov.u32 %r26, %ctaid.x; - add.s32 %r51, %r26, %r23; - add.s32 %r27, %r51, 1; - .loc 1 18 1 - ld.global.f64 %fd1, [%rd5]; - .loc 1 19 1 - ld.global.f64 %fd2, [%rd5+8]; - .loc 1 20 1 - ld.global.f64 %fd3, [%rd5+16]; - .loc 1 21 1 - ld.global.f64 
%fd4, [%rd5+24]; - .loc 1 22 1 - setp.ge.s32 %p1, %r51, %r27; - @%p1 bra BB2_11; - - mul.lo.s32 %r28, %r25, %r24; - shl.b32 %r29, %r28, 1; - neg.s32 %r30, %r29; - shl.b32 %r2, %r30, 3; - cvta.to.global.u64 %rd6, %rd2; - cvta.to.global.u64 %rd31, %rd3; - cvta.to.global.u64 %rd32, %rd1; - -BB2_2: - .loc 1 23 1 - setp.ge.s32 %p2, %r21, %r22; - @%p2 bra BB2_10; - - mov.u32 %r52, %r21; - -BB2_4: - .loc 1 24 1 - mov.u32 %r4, %r52; - setp.ge.s32 %p3, %r19, %r20; - @%p3 bra BB2_9; - - .loc 1 29 1 - mul.lo.s32 %r32, %r51, %r28; - mad.lo.s32 %r5, %r4, %r24, %r32; - .loc 1 32 1 - add.s32 %r6, %r24, %r5; - add.s32 %r7, %r5, %r28; - shl.b32 %r33, %r24, 1; - add.s32 %r8, %r5, %r33; - mad.lo.s32 %r9, %r24, -2, %r5; - add.s32 %r10, %r5, %r29; - mad.lo.s32 %r11, %r28, -2, %r5; - add.s32 %r12, %r24, %r8; - mad.lo.s32 %r13, %r28, 3, %r5; - mov.u32 %r53, %r19; - -BB2_6: - .loc 1 26 1 - mov.u32 %r14, %r53; - mov.u32 %r35, %tid.x; - add.s32 %r36, %r35, %r14; - .loc 1 29 1 - add.s32 %r15, %r36, %r5; - mul.wide.s32 %rd7, %r15, 8; - add.s64 %rd8, %rd6, %rd7; - .loc 1 32 1 - ld.global.f64 %fd5, [%rd8]; - ld.global.f64 %fd7, [%rd8+-8]; - ld.global.f64 %fd8, [%rd8+8]; - add.f64 %fd9, %fd8, %fd7; - add.s32 %r37, %r6, %r36; - mul.wide.s32 %rd9, %r37, 8; - add.s64 %rd10, %rd6, %rd9; - .loc 1 32 1 - ld.global.f64 %fd10, [%rd10]; - add.f64 %fd11, %fd9, %fd10; - .loc 1 22 1 - neg.s32 %r39, %r33; - shl.b32 %r40, %r39, 3; - cvt.s64.s32 %rd11, %r40; - add.s64 %rd12, %rd10, %rd11; - .loc 1 32 1 - ld.global.f64 %fd12, [%rd12]; - add.f64 %fd13, %fd11, %fd12; - add.s32 %r41, %r7, %r36; - mul.wide.s32 %rd13, %r41, 8; - add.s64 %rd14, %rd6, %rd13; - .loc 1 32 1 - ld.global.f64 %fd14, [%rd14]; - add.f64 %fd15, %fd13, %fd14; - cvt.s64.s32 %rd15, %r2; - add.s64 %rd16, %rd14, %rd15; - .loc 1 32 1 - ld.global.f64 %fd16, [%rd16]; - add.f64 %fd17, %fd15, %fd16; - mul.f64 %fd18, %fd2, %fd17; - fma.rn.f64 %fd19, %fd1, %fd5, %fd18; - ld.global.f64 %fd20, [%rd8+-16]; - ld.global.f64 %fd21, [%rd8+16]; - add.f64 %fd22, %fd21, %fd20; - add.s32 %r42, %r8, %r36; - mul.wide.s32 %rd17, %r42, 8; - add.s64 %rd18, %rd6, %rd17; - .loc 1 32 1 - ld.global.f64 %fd23, [%rd18]; - add.f64 %fd24, %fd22, %fd23; - add.s32 %r43, %r9, %r36; - mul.wide.s32 %rd19, %r43, 8; - add.s64 %rd20, %rd6, %rd19; - .loc 1 32 1 - ld.global.f64 %fd25, [%rd20]; - add.f64 %fd26, %fd24, %fd25; - add.s32 %r44, %r10, %r36; - mul.wide.s32 %rd21, %r44, 8; - add.s64 %rd22, %rd6, %rd21; - .loc 1 32 1 - ld.global.f64 %fd27, [%rd22]; - add.f64 %fd28, %fd26, %fd27; - add.s32 %r45, %r11, %r36; - mul.wide.s32 %rd23, %r45, 8; - add.s64 %rd24, %rd6, %rd23; - .loc 1 32 1 - ld.global.f64 %fd29, [%rd24]; - add.f64 %fd30, %fd28, %fd29; - fma.rn.f64 %fd31, %fd3, %fd30, %fd19; - ld.global.f64 %fd32, [%rd8+-24]; - ld.global.f64 %fd33, [%rd8+24]; - add.f64 %fd34, %fd33, %fd32; - add.s32 %r46, %r12, %r36; - mul.wide.s32 %rd25, %r46, 8; - add.s64 %rd26, %rd6, %rd25; - .loc 1 32 1 - ld.global.f64 %fd35, [%rd26]; - add.f64 %fd36, %fd34, %fd35; - add.s64 %rd27, %rd12, %rd11; - .loc 1 32 1 - ld.global.f64 %fd37, [%rd27]; - add.f64 %fd38, %fd36, %fd37; - add.s32 %r47, %r13, %r36; - mul.wide.s32 %rd28, %r47, 8; - add.s64 %rd29, %rd6, %rd28; - .loc 1 32 1 - ld.global.f64 %fd39, [%rd29]; - add.f64 %fd40, %fd38, %fd39; - add.s64 %rd30, %rd16, %rd15; - .loc 1 32 1 - ld.global.f64 %fd41, [%rd30]; - add.f64 %fd42, %fd40, %fd41; - fma.rn.f64 %fd6, %fd4, %fd42, %fd31; - .loc 1 44 1 - setp.ge.s32 %p4, %r36, %r20; - @%p4 bra BB2_8; - - mul.wide.s32 %rd33, %r15, 8; - add.s64 %rd34, %rd31, %rd33; - .loc 1 45 1 - 
ld.global.f64 %fd43, [%rd34]; - add.f64 %fd44, %fd5, %fd5; - sub.f64 %fd45, %fd44, %fd43; - add.s64 %rd35, %rd32, %rd33; - .loc 1 45 1 - ld.global.f64 %fd46, [%rd35]; - fma.rn.f64 %fd47, %fd46, %fd6, %fd45; - st.global.f64 [%rd34], %fd47; - -BB2_8: - .loc 1 24 19 - add.s32 %r16, %r14, 32; - .loc 1 24 1 - setp.lt.s32 %p5, %r16, %r20; - mov.u32 %r53, %r16; - @%p5 bra BB2_6; - -BB2_9: - .loc 1 23 18 - add.s32 %r17, %r4, 1; - .loc 1 23 1 - setp.lt.s32 %p6, %r17, %r22; - mov.u32 %r52, %r17; - @%p6 bra BB2_4; - -BB2_10: - .loc 1 22 18 - add.s32 %r51, %r51, 1; - .loc 1 59 1 - add.s32 %r49, %r23, %r26; - add.s32 %r50, %r49, 1; - .loc 1 22 1 - setp.lt.s32 %p7, %r51, %r50; - @%p7 bra BB2_2; - -BB2_11: - .loc 1 61 2 - ret; -} - - diff --git a/examples_cuda/stencil/stencil0.ptx b/examples_cuda/stencil/stencil0.ptx deleted file mode 100644 index f06a11d9..00000000 --- a/examples_cuda/stencil/stencil0.ptx +++ /dev/null @@ -1,224 +0,0 @@ -// -// Generated by NVIDIA NVVM Compiler -// Compiler built on Thu Jul 18 02:37:37 2013 (1374107857) -// Cuda compilation tools, release 5.5, V5.5.0 -// - -.version 3.2 -.target sm_35 -.address_size 64 - - .file 1 "/home/evghenii/soft/ispc-code/ispc/examples/stencil/stencil.cu", 1383254912, 2112 - -) -{ - .reg .s32 %r<2>; - - - mov.u32 %r1, 30; - st.param.b32 [func_retval0+0], %r1; - ret; -} - -.weak .func (.param .b32 func_retval0) cudaFuncGetAttributes( - .param .b64 cudaFuncGetAttributes_param_0, - .param .b64 cudaFuncGetAttributes_param_1 -) -{ - .reg .s32 %r<2>; - - - mov.u32 %r1, 30; - st.param.b32 [func_retval0+0], %r1; - ret; -} - -.visible .entry stencil_step_task( - .param .u32 stencil_step_task_param_0, - .param .u32 stencil_step_task_param_1, - .param .u32 stencil_step_task_param_2, - .param .u32 stencil_step_task_param_3, - .param .u32 stencil_step_task_param_4, - .param .u32 stencil_step_task_param_5, - .param .u32 stencil_step_task_param_6, - .param .u32 stencil_step_task_param_7, - .param .u64 stencil_step_task_param_8, - .param .u64 stencil_step_task_param_9, - .param .u64 stencil_step_task_param_10, - .param .u64 stencil_step_task_param_11 -) -{ - .reg .pred %p<8>; - .reg .s32 %r<54>; - .reg .s64 %rd<36>; - .reg .f64 %fd<48>; - - - ld.param.u32 %r19, [stencil_step_task_param_0]; - ld.param.u32 %r20, [stencil_step_task_param_1]; - ld.param.u32 %r21, [stencil_step_task_param_2]; - ld.param.u32 %r22, [stencil_step_task_param_3]; - ld.param.u32 %r23, [stencil_step_task_param_4]; - ld.param.u32 %r24, [stencil_step_task_param_5]; - ld.param.u32 %r25, [stencil_step_task_param_6]; - ld.param.u64 %rd4, [stencil_step_task_param_8]; - ld.param.u64 %rd1, [stencil_step_task_param_9]; - ld.param.u64 %rd2, [stencil_step_task_param_10]; - ld.param.u64 %rd3, [stencil_step_task_param_11]; - cvta.to.global.u64 %rd5, %rd4; - mov.u32 %r26, %ctaid.x; - add.s32 %r51, %r26, %r23; - add.s32 %r27, %r51, 1; - ld.global.f64 %fd1, [%rd5]; - ld.global.f64 %fd2, [%rd5+8]; - ld.global.f64 %fd3, [%rd5+16]; - ld.global.f64 %fd4, [%rd5+24]; - setp.ge.s32 %p1, %r51, %r27; - @%p1 bra BB2_11; - - mul.lo.s32 %r28, %r25, %r24; - shl.b32 %r29, %r28, 1; - neg.s32 %r30, %r29; - shl.b32 %r2, %r30, 3; - cvta.to.global.u64 %rd6, %rd2; - cvta.to.global.u64 %rd31, %rd3; - cvta.to.global.u64 %rd32, %rd1; - -BB2_2: - setp.ge.s32 %p2, %r21, %r22; - @%p2 bra BB2_10; - - mov.u32 %r52, %r21; - -BB2_4: - mov.u32 %r4, %r52; - setp.ge.s32 %p3, %r19, %r20; - @%p3 bra BB2_9; - - mul.lo.s32 %r32, %r51, %r28; - mad.lo.s32 %r5, %r4, %r24, %r32; - add.s32 %r6, %r24, %r5; - add.s32 %r7, %r5, %r28; - shl.b32 %r33, 
%r24, 1; - add.s32 %r8, %r5, %r33; - mad.lo.s32 %r9, %r24, -2, %r5; - add.s32 %r10, %r5, %r29; - mad.lo.s32 %r11, %r28, -2, %r5; - add.s32 %r12, %r24, %r8; - mad.lo.s32 %r13, %r28, 3, %r5; - mov.u32 %r53, %r19; - -BB2_6: - mov.u32 %r14, %r53; - mov.u32 %r35, %tid.x; - add.s32 %r36, %r35, %r14; - add.s32 %r15, %r36, %r5; - mul.wide.s32 %rd7, %r15, 8; - add.s64 %rd8, %rd6, %rd7; - ld.global.f64 %fd5, [%rd8]; - ld.global.f64 %fd7, [%rd8+-8]; - ld.global.f64 %fd8, [%rd8+8]; - add.f64 %fd9, %fd8, %fd7; - add.s32 %r37, %r6, %r36; - mul.wide.s32 %rd9, %r37, 8; - add.s64 %rd10, %rd6, %rd9; - ld.global.f64 %fd10, [%rd10]; - add.f64 %fd11, %fd9, %fd10; - neg.s32 %r39, %r33; - shl.b32 %r40, %r39, 3; - cvt.s64.s32 %rd11, %r40; - add.s64 %rd12, %rd10, %rd11; - ld.global.f64 %fd12, [%rd12]; - add.f64 %fd13, %fd11, %fd12; - add.s32 %r41, %r7, %r36; - mul.wide.s32 %rd13, %r41, 8; - add.s64 %rd14, %rd6, %rd13; - ld.global.f64 %fd14, [%rd14]; - add.f64 %fd15, %fd13, %fd14; - cvt.s64.s32 %rd15, %r2; - add.s64 %rd16, %rd14, %rd15; - ld.global.f64 %fd16, [%rd16]; - add.f64 %fd17, %fd15, %fd16; - mul.f64 %fd18, %fd2, %fd17; - fma.rn.f64 %fd19, %fd1, %fd5, %fd18; - ld.global.f64 %fd20, [%rd8+-16]; - ld.global.f64 %fd21, [%rd8+16]; - add.f64 %fd22, %fd21, %fd20; - add.s32 %r42, %r8, %r36; - mul.wide.s32 %rd17, %r42, 8; - add.s64 %rd18, %rd6, %rd17; - ld.global.f64 %fd23, [%rd18]; - add.f64 %fd24, %fd22, %fd23; - add.s32 %r43, %r9, %r36; - mul.wide.s32 %rd19, %r43, 8; - add.s64 %rd20, %rd6, %rd19; - ld.global.f64 %fd25, [%rd20]; - add.f64 %fd26, %fd24, %fd25; - add.s32 %r44, %r10, %r36; - mul.wide.s32 %rd21, %r44, 8; - add.s64 %rd22, %rd6, %rd21; - ld.global.f64 %fd27, [%rd22]; - add.f64 %fd28, %fd26, %fd27; - add.s32 %r45, %r11, %r36; - mul.wide.s32 %rd23, %r45, 8; - add.s64 %rd24, %rd6, %rd23; - ld.global.f64 %fd29, [%rd24]; - add.f64 %fd30, %fd28, %fd29; - fma.rn.f64 %fd31, %fd3, %fd30, %fd19; - ld.global.f64 %fd32, [%rd8+-24]; - ld.global.f64 %fd33, [%rd8+24]; - add.f64 %fd34, %fd33, %fd32; - add.s32 %r46, %r12, %r36; - mul.wide.s32 %rd25, %r46, 8; - add.s64 %rd26, %rd6, %rd25; - ld.global.f64 %fd35, [%rd26]; - add.f64 %fd36, %fd34, %fd35; - add.s64 %rd27, %rd12, %rd11; - ld.global.f64 %fd37, [%rd27]; - add.f64 %fd38, %fd36, %fd37; - add.s32 %r47, %r13, %r36; - mul.wide.s32 %rd28, %r47, 8; - add.s64 %rd29, %rd6, %rd28; - ld.global.f64 %fd39, [%rd29]; - add.f64 %fd40, %fd38, %fd39; - add.s64 %rd30, %rd16, %rd15; - ld.global.f64 %fd41, [%rd30]; - add.f64 %fd42, %fd40, %fd41; - fma.rn.f64 %fd6, %fd4, %fd42, %fd31; - setp.ge.s32 %p4, %r36, %r20; - @%p4 bra BB2_8; - - mul.wide.s32 %rd33, %r15, 8; - add.s64 %rd34, %rd31, %rd33; - ld.global.f64 %fd43, [%rd34]; - add.f64 %fd44, %fd5, %fd5; - sub.f64 %fd45, %fd44, %fd43; - add.s64 %rd35, %rd32, %rd33; - ld.global.f64 %fd46, [%rd35]; - fma.rn.f64 %fd47, %fd46, %fd6, %fd45; - st.global.f64 [%rd34], %fd47; - -BB2_8: - add.s32 %r16, %r14, 32; - setp.lt.s32 %p5, %r16, %r20; - mov.u32 %r53, %r16; - @%p5 bra BB2_6; - -BB2_9: - add.s32 %r17, %r4, 1; - setp.lt.s32 %p6, %r17, %r22; - mov.u32 %r52, %r17; - @%p6 bra BB2_4; - -BB2_10: - add.s32 %r51, %r51, 1; - add.s32 %r49, %r23, %r26; - add.s32 %r50, %r49, 1; - setp.lt.s32 %p7, %r51, %r50; - @%p7 bra BB2_2; - -BB2_11: - ret; -} - - diff --git a/examples_cuda/stencil/stencil1.cubin b/examples_cuda/stencil/stencil1.cubin deleted file mode 100644 index 8b7d18d97f867ec965d35f0c90002d0f94ea1dad..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3604 
[GIT binary patch data omitted]
diff --git a/examples_cuda/stencil/stencil2.cubin b/examples_cuda/stencil/stencil2.cubin
deleted file mode 100644
index 64a9d3ea9dc5ff0d9485c61330de6e2206b6b2d2..0000000000000000000000000000000000000000
GIT binary patch
[binary patch data omitted]
diff --git a/examples_cuda/stencil/stencil2.ptx b/examples_cuda/stencil/stencil2.ptx deleted file mode 100644 index 3e5dfd92..00000000 --- a/examples_cuda/stencil/stencil2.ptx +++ /dev/null @@ -1,247 +0,0 @@ -// -// Generated by LLVM NVPTX Back-End -// - -.version 3.1 -.target sm_20, texmode_independent -.address_size 64 - - // .globl stencil_step_task - // @stencil_step_task -.entry stencil_step_task( - .param .u32 stencil_step_task_param_0, - .param .u32 stencil_step_task_param_1, - .param .u32 stencil_step_task_param_2, - .param .u32 stencil_step_task_param_3, - .param .u32 stencil_step_task_param_4, - .param .u32 stencil_step_task_param_5, - .param .u32 stencil_step_task_param_6, - .param .u32 stencil_step_task_param_7, - .param .u64 .ptr .align 8 stencil_step_task_param_8, - .param .u64 .ptr .align 8 stencil_step_task_param_9, - .param .u64 .ptr .align 8 stencil_step_task_param_10, - .param .u64 .ptr .align 8 stencil_step_task_param_11 -) -{ - .reg .pred %p<396>; - .reg .s16 %rc<396>; - .reg .s16 %rs<396>; - .reg .s32 %r<396>; - .reg .s64 %rl<396>; - .reg .f32 %f<396>; - .reg .f64 %fl<396>; - -// BB#0: // %allocas - mov.u32 %r12, %ctaid.x; - ld.param.u32 %r13, [stencil_step_task_param_4]; - add.s32 %r16, %r12, %r13; - add.s32 %r0, %r16, 1; - setp.ge.s32 %p0, %r16, %r0; - @%p0 bra BB0_11; -// BB#1: // %for_test28.i.preheader.lr.ph - ld.param.u32 %r0, [stencil_step_task_param_0]; - ld.param.u32 %r1, [stencil_step_task_param_1]; - ld.param.u32 %r2, [stencil_step_task_param_2]; - ld.param.u32 %r3, [stencil_step_task_param_3]; - ld.param.u32 %r4, [stencil_step_task_param_5]; - ld.param.u32 %r5, [stencil_step_task_param_6]; - mul.lo.s32 %r5, %r5, %r4; - ld.param.u64 %rl3, [stencil_step_task_param_8]; - ld.f64 %fl0, [%rl3]; - ld.f64 %fl1, [%rl3+8]; - ld.param.u64 %rl0, [stencil_step_task_param_9]; - ld.f64 %fl2, [%rl3+16]; - ld.param.u64 %rl1, [stencil_step_task_param_10]; - ld.param.u64 %rl2, [stencil_step_task_param_11]; - ld.f64 %fl3, [%rl3+24]; - shl.b32 %r6, %r4, 1; - mul.lo.s32 %r7, %r4, 3; - mul.lo.s32 %r8, %r4, -3; - shl.b32 %r9, %r5, 1; - mul.lo.s32 %r10, %r5, 3; - mul.lo.s32 %r11, %r5, -3; - add.s32 %r12, %r12, %r13; - neg.s32 %r13, %r9; - neg.s32 %r14, %r6; - mov.u32 %r32, WARP_SZ; -BB0_2: // %for_test28.i.preheader - // =>This Loop Header: Depth=1 - // Child Loop BB0_9 Depth 2 - // Child Loop BB0_5 Depth 3 - mov.u32 %r15, %r16; - setp.ge.s32 %p0, %r2, %r3; - @%p0 bra BB0_10; -// BB#3: // %for_test35.i.preheader.lr.ph - // in Loop: Header=BB0_2 Depth=1 - setp.lt.s32 %p0, %r0, %r1; - @%p0 bra BB0_4; - bra.uni BB0_10; -BB0_4: // in Loop: Header=BB0_2 Depth=1 - mul.lo.s32 %r16, %r15, %r5; - mov.u32 %r17, %r2; -BB0_9: // %for_loop37.i.lr.ph.us - // Parent Loop BB0_2 Depth=1 - // => This Loop Header: Depth=2 - // Child Loop BB0_5 Depth 3 - mad.lo.s32 %r18, %r17, %r4, %r16; - add.s32 %r19, %r18, %r4; - add.s32 %r20, %r18, %r6; - sub.s32 %r21, %r18, %r4; - add.s32 %r22, %r18, %r7; - add.s32 %r23, %r18, %r14; - add.s32 %r24, %r18, %r5; - add.s32 %r25, %r18, %r8; - add.s32 %r26, %r18, %r9; - sub.s32 %r27, %r18, %r5; - add.s32 %r28, %r18, %r10; - add.s32 %r29, %r18, %r13; - add.s32 %r30, %r18, %r11; - mov.u32 %r31, %r0; -BB0_5: // %for_loop37.i.us - // Parent Loop BB0_2 Depth=1 - // Parent Loop BB0_9 Depth=2 - // => This Inner Loop Header: Depth=3 - mov.u32 %r33, %tid.x; - add.s32 %r34, %r32, -1; - and.b32 %r33, %r34, %r33; - add.s32 %r33, %r33, %r31; - setp.ge.s32 %p0, %r33, %r1; - @%p0 bra BB0_7; -// BB#6: // %pl_dolane.i.us - // in Loop: Header=BB0_5 Depth=3 - add.s32 %r34, %r18, %r33; - 
shl.b32 %r34, %r34, 3; - add.s32 %r35, %r34, -8; - cvt.s64.s32 %rl3, %r35; - add.s64 %rl3, %rl3, %rl1; - ld.f64 %fl4, [%rl3]; - add.s32 %r35, %r34, 8; - cvt.s64.s32 %rl3, %r35; - add.s64 %rl3, %rl3, %rl1; - ld.f64 %fl5, [%rl3]; - add.s32 %r35, %r34, -16; - cvt.s64.s32 %rl3, %r35; - add.s64 %rl3, %rl3, %rl1; - ld.f64 %fl6, [%rl3]; - add.s32 %r35, %r34, 16; - cvt.s64.s32 %rl3, %r35; - add.s64 %rl3, %rl3, %rl1; - ld.f64 %fl9, [%rl3]; - add.s32 %r35, %r19, %r33; - shl.b32 %r35, %r35, 3; - cvt.s64.s32 %rl3, %r35; - add.s64 %rl3, %rl3, %rl1; - ld.f64 %fl8, [%rl3]; - add.s32 %r35, %r34, -24; - cvt.s64.s32 %rl3, %r35; - add.s64 %rl3, %rl3, %rl1; - ld.f64 %fl7, [%rl3]; - add.s32 %r35, %r34, 24; - cvt.s64.s32 %rl3, %r35; - add.s64 %rl3, %rl3, %rl1; - ld.f64 %fl10, [%rl3]; - add.s32 %r35, %r20, %r33; - shl.b32 %r35, %r35, 3; - cvt.s64.s32 %rl3, %r35; - add.s64 %rl3, %rl3, %rl1; - ld.f64 %fl13, [%rl3]; - add.s32 %r35, %r21, %r33; - shl.b32 %r35, %r35, 3; - cvt.s64.s32 %rl3, %r35; - add.s64 %rl3, %rl3, %rl1; - ld.f64 %fl12, [%rl3]; - add.s32 %r35, %r22, %r33; - shl.b32 %r35, %r35, 3; - cvt.s64.s32 %rl3, %r35; - add.s64 %rl3, %rl3, %rl1; - ld.f64 %fl11, [%rl3]; - add.s32 %r35, %r23, %r33; - shl.b32 %r35, %r35, 3; - cvt.s64.s32 %rl3, %r35; - add.s64 %rl3, %rl3, %rl1; - ld.f64 %fl16, [%rl3]; - add.s32 %r35, %r24, %r33; - shl.b32 %r35, %r35, 3; - cvt.s64.s32 %rl3, %r35; - add.s64 %rl3, %rl3, %rl1; - ld.f64 %fl15, [%rl3]; - add.s32 %r35, %r25, %r33; - shl.b32 %r35, %r35, 3; - cvt.s64.s32 %rl3, %r35; - add.s64 %rl3, %rl3, %rl1; - ld.f64 %fl14, [%rl3]; - add.s32 %r35, %r26, %r33; - shl.b32 %r35, %r35, 3; - cvt.s64.s32 %rl3, %r35; - add.s64 %rl3, %rl3, %rl1; - ld.f64 %fl19, [%rl3]; - add.s32 %r35, %r27, %r33; - shl.b32 %r35, %r35, 3; - cvt.s64.s32 %rl3, %r35; - add.s64 %rl3, %rl3, %rl1; - ld.f64 %fl18, [%rl3]; - add.s32 %r35, %r28, %r33; - shl.b32 %r35, %r35, 3; - cvt.s64.s32 %rl3, %r35; - add.s64 %rl3, %rl3, %rl1; - ld.f64 %fl17, [%rl3]; - add.s32 %r35, %r29, %r33; - shl.b32 %r35, %r35, 3; - cvt.s64.s32 %rl3, %r35; - add.s64 %rl3, %rl3, %rl1; - ld.f64 %fl24, [%rl3]; - cvt.s64.s32 %rl4, %r34; - add.s64 %rl3, %rl4, %rl1; - ld.f64 %fl21, [%rl3]; - add.s32 %r33, %r30, %r33; - shl.b32 %r33, %r33, 3; - cvt.s64.s32 %rl3, %r33; - add.s64 %rl3, %rl3, %rl1; - ld.f64 %fl20, [%rl3]; - add.s64 %rl3, %rl4, %rl2; - ld.f64 %fl23, [%rl3]; - add.s64 %rl4, %rl4, %rl0; - ld.f64 %fl22, [%rl4]; - add.f64 %fl25, %fl21, %fl21; - sub.f64 %fl23, %fl25, %fl23; - add.f64 %fl6, %fl6, %fl9; - add.f64 %fl6, %fl6, %fl13; - add.f64 %fl6, %fl6, %fl16; - add.f64 %fl6, %fl6, %fl19; - add.f64 %fl6, %fl6, %fl24; - add.f64 %fl4, %fl4, %fl5; - add.f64 %fl4, %fl4, %fl8; - add.f64 %fl4, %fl4, %fl12; - add.f64 %fl4, %fl4, %fl15; - add.f64 %fl4, %fl4, %fl18; - mul.f64 %fl5, %fl0, %fl21; - fma.rn.f64 %fl4, %fl1, %fl4, %fl5; - fma.rn.f64 %fl4, %fl2, %fl6, %fl4; - add.f64 %fl5, %fl7, %fl10; - add.f64 %fl5, %fl5, %fl11; - add.f64 %fl5, %fl5, %fl14; - add.f64 %fl5, %fl5, %fl17; - add.f64 %fl5, %fl5, %fl20; - fma.rn.f64 %fl4, %fl3, %fl5, %fl4; - fma.rn.f64 %fl4, %fl4, %fl22, %fl23; - st.f64 [%rl3], %fl4; -BB0_7: // %safe_if_after_true.i.us - // in Loop: Header=BB0_5 Depth=3 - add.s32 %r31, %r32, %r31; - setp.lt.s32 %p0, %r31, %r1; - @%p0 bra BB0_5; -// BB#8: // %for_exit38.i.us - // in Loop: Header=BB0_9 Depth=2 - add.s32 %r17, %r17, 1; - setp.eq.s32 %p0, %r17, %r3; - @%p0 bra BB0_10; - bra.uni BB0_9; -BB0_10: // %for_exit31.i - // in Loop: Header=BB0_2 Depth=1 - add.s32 %r16, %r15, 1; - setp.ne.s32 %p0, %r15, %r12; - @%p0 bra BB0_2; -BB0_11: // 
%stencil_step___uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_.exit - ret; -} - diff --git a/examples_cuda/stencil/stencilX.ispc b/examples_cuda/stencil/stencilX.ispc new file mode 100644 index 00000000..36d9d521 --- /dev/null +++ b/examples_cuda/stencil/stencilX.ispc @@ -0,0 +1,159 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +static inline void +stencil_step(uniform int x0, uniform int x1, + uniform int y0, uniform int y1, + uniform int z0, uniform int z1, + uniform int Nx, uniform int Ny, uniform int Nz, + uniform const double coef[4], uniform const double vsq[], + uniform const double Ain[], uniform double Aout[]) { + const uniform int Nxy = Nx * Ny; + +#if 0 +#define VER1 +#endif + +#ifdef VER1 + const uniform int x1o = 1; + const uniform int x2o = 2; + const uniform int x3o = 3; + const uniform int y1o = Nx; + const uniform int y2o = Nx*2; + const uniform int y3o = Nx*3; + const uniform int z1o = Nxy; + const uniform int z2o = Nxy*2; + const uniform int z3o = Nxy*3; +#endif + foreach (z = z0 ... z1, y = y0 ... y1, x = x0 ... 
x1) + { + const int index= (z * Nxy) + (y * Nx) + x; + +#ifndef VER1 +#define A_cur(x, y, z) Ain[index + (x) + ((y) * Nx) + ((z) * Nxy)] +#define A_next(x, y, z) Aout[index + (x) + ((y) * Nx) + ((z) * Nxy)] + double div = coef[0] * A_cur(0, 0, 0) + + coef[1] * (A_cur(+1, 0, 0) + A_cur(-1, 0, 0) + + A_cur(0, +1, 0) + A_cur(0, -1, 0) + + A_cur(0, 0, +1) + A_cur(0, 0, -1)) + + coef[2] * (A_cur(+2, 0, 0) + A_cur(-2, 0, 0) + + A_cur(0, +2, 0) + A_cur(0, -2, 0) + + A_cur(0, 0, +2) + A_cur(0, 0, -2)) + + coef[3] * (A_cur(+3, 0, 0) + A_cur(-3, 0, 0) + + A_cur(0, +3, 0) + A_cur(0, -3, 0) + + A_cur(0, 0, +3) + A_cur(0, 0, -3)); + +#else + +#define A_cur(x, y, z) Ain [index + (x) + (y) + (z)] +#define A_next(x, y, z) Aout[index + (x) + (y) + (z)] + double div = coef[0] * A_cur(0, 0, 0) + + coef[1] * (A_cur(+x1o, 0, 0) + A_cur(-x1o, 0, 0) + + A_cur(0, +y1o, 0) + A_cur(0, -y1o, 0) + + A_cur(0, 0, +z1o) + A_cur(0, 0, -z1o)) + + coef[2] * (A_cur(+x2o, 0, 0) + A_cur(-x2o, 0, 0) + + A_cur(0, +y2o, 0) + A_cur(0, -y2o, 0) + + A_cur(0, 0, +z2o) + A_cur(0, 0, -z2o)) + + coef[3] * (A_cur(+x3o, 0, 0) + A_cur(-x3o, 0, 0) + + A_cur(0, +y3o, 0) + A_cur(0, -y3o, 0) + + A_cur(0, 0, +z3o) + A_cur(0, 0, -z3o)); + +#endif + + A_next(0, 0, 0) = 2.0d0 * A_cur(0, 0, 0) - A_next(0, 0, 0) + + vsq[index] * div; + } +} + +#define SPANX 32 +#define SPANY 8 +#define SPANZ 8 + +static task void +stencil_step_task(uniform int x0, uniform int x1, + uniform int y0, uniform int y1, + uniform int z0, uniform int z1, + uniform int Nx, uniform int Ny, uniform int Nz, + uniform const double coef[4], uniform const double vsq[], + uniform const double Ain[], uniform double Aout[]) { + if (taskIndex0 >= taskCount0 || + taskIndex1 >= taskCount1 || + taskIndex2 >= taskCount2) + return; + + const uniform int xfirst = x0 + taskIndex0 * SPANX; + const uniform int xlast = min(x1, xfirst + SPANX); + + const uniform int yfirst = y0 + taskIndex1 * SPANY; + const uniform int ylast = min(y1, yfirst + SPANY); + + const uniform int zfirst = z0 + taskIndex2 * SPANZ; + const uniform int zlast = min(z1, zfirst + SPANZ); + + stencil_step(xfirst,xlast, yfirst,ylast, zfirst,zlast, + Nx, Ny, Nz, coef, vsq, Ain, Aout); +} + + + +export void +loop_stencil_ispc_tasks(uniform int t0, uniform int t1, + uniform int x0, uniform int x1, + uniform int y0, uniform int y1, + uniform int z0, uniform int z1, + uniform int Nx, uniform int Ny, uniform int Nz, + uniform const double coef[4], + uniform const double vsq[], + uniform double Aeven[], uniform double Aodd[]) +{ +#define NB(x,n) (((x)+(n)-1)/(n)) + + for (uniform int t = t0; t < t1; ++t) + { + // Parallelize across cores as well: each task will work on a slice + // of 1 in the z extent of the volume. + if ((t & 1) == 0) + launch[NB(z1-z0,SPANZ)][NB(y1-y0,SPANY)][NB(x1-x0,SPANX)] + stencil_step_task(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, + coef, vsq, Aeven, Aodd); + else + launch[NB(z1-z0,SPANZ)][NB(y1-y0,SPANY)][NB(x1-x0,SPANX)] + stencil_step_task(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, + coef, vsq, Aodd, Aeven); + + // We need to wait for all of the launched tasks to finish before + // starting the next iteration. + sync; + } +} + diff --git a/examples_cuda/stencil/stencilY.ispc b/examples_cuda/stencil/stencilY.ispc new file mode 100644 index 00000000..72c28ef6 --- /dev/null +++ b/examples_cuda/stencil/stencilY.ispc @@ -0,0 +1,126 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +static inline void +stencil_step(uniform int x0, uniform int x1, + uniform int y0, uniform int y1, + uniform int z0, uniform int z1, + uniform int Nx, uniform int Ny, uniform int Nz, + uniform const double coef[4], uniform const double vsq[], + uniform const double Ain[], uniform double Aout[]) { + const uniform int Nxy = Nx * Ny; + + foreach (z = z0 ... z1, y = y0 ... y1, x = x0 ... 
x1) + { + int index = (z * Nxy) + (y * Nx) + x; +#define A_cur(x, y, z) Ain[index + (x) + ((y) * Nx) + ((z) * Nxy)] +#define A_next(x, y, z) Aout[index + (x) + ((y) * Nx) + ((z) * Nxy)] + double div = coef[0] * A_cur(0, 0, 0) + + coef[1] * (A_cur(+1, 0, 0) + A_cur(-1, 0, 0) + + A_cur(0, +1, 0) + A_cur(0, -1, 0) + + A_cur(0, 0, +1) + A_cur(0, 0, -1)) + + coef[2] * (A_cur(+2, 0, 0) + A_cur(-2, 0, 0) + + A_cur(0, +2, 0) + A_cur(0, -2, 0) + + A_cur(0, 0, +2) + A_cur(0, 0, -2)) + + coef[3] * (A_cur(+3, 0, 0) + A_cur(-3, 0, 0) + + A_cur(0, +3, 0) + A_cur(0, -3, 0) + + A_cur(0, 0, +3) + A_cur(0, 0, -3)); + + A_next(0, 0, 0) = 2.0 * A_cur(0, 0, 0) - A_next(0, 0, 0) + + vsq[index] * div; + + } +} + +#define SPANX 32 +#define SPANY 8 +#define SPANZ 8 + +static task void +stencil_step_task(uniform int x0, uniform int x1, + uniform int y0, uniform int y1, + uniform int z0, uniform int z1, + uniform int Nx, uniform int Ny, uniform int Nz, + uniform const double coef[4], uniform const double vsq[], + uniform const double Ain[], uniform double Aout[]) { + if (taskIndex0 >= taskCount0 || + taskIndex1 >= taskCount1 || + taskIndex2 >= taskCount2) + return; + + const uniform int xfirst = x0 + taskIndex0 * SPANX; + const uniform int xlast = min(x1, xfirst + SPANX); + + const uniform int yfirst = y0 + taskIndex1 * SPANY; + const uniform int ylast = min(y1, yfirst + SPANY); + + const uniform int zfirst = z0 + taskIndex2 * SPANZ; + const uniform int zlast = min(z1, zfirst + SPANZ); + + stencil_step(xfirst,xlast, yfirst,ylast, zfirst,zlast, + Nx, Ny, Nz, coef, vsq, Ain, Aout); +} + + + +export void +loop_stencil_ispc_tasks(uniform int t0, uniform int t1, + uniform int x0, uniform int x1, + uniform int y0, uniform int y1, + uniform int z0, uniform int z1, + uniform int Nx, uniform int Ny, uniform int Nz, + uniform const double coef[4], + uniform const double vsq[], + uniform double Aeven[], uniform double Aodd[]) +{ +#define NB(x,n) (((x)+(n)-1)/(n)) + + for (uniform int t = t0; t < t1; ++t) + { + // Parallelize across cores as well: each task will work on a slice + // of 1 in the z extent of the volume. + if ((t & 1) == 0) + launch[NB(z1-z0,SPANZ)][NB(y1-y0,SPANY)][NB(x1-x0,SPANX)] + stencil_step_task(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, + coef, vsq, Aeven, Aodd); + else + launch[NB(z1-z0,SPANZ)][NB(y1-y0,SPANY)][NB(x1-x0,SPANX)] + stencil_step_task(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, + coef, vsq, Aodd, Aeven); + + // We need to wait for all of the launched tasks to finish before + // starting the next iteration. 
+        sync;
+    }
+}
+
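The modified stencil.ispc above and the two new variants stencilX.ispc and stencilY.ispc all replace the old one-z-slice-per-task launch with a three-dimensional launch over SPANX x SPANY x SPANZ tiles: NB() rounds the tile count up, and the min() clamp in stencil_step_task keeps the partial tiles at the domain edges inside [x0,x1), [y0,y1), [z0,z1). The stand-alone C++ sketch below is not part of the patch (its loop bounds are made up); it only mirrors that decomposition on the host side to make the tile arithmetic explicit.

// Host-side sketch of the SPANX x SPANY x SPANZ tiling used by the .ispc files above.
#include <algorithm>
#include <cstdio>

// Same ceiling division as the NB(x,n) macro in the .ispc files.
static int nb(int extent, int span) { return (extent + span - 1) / span; }

int main() {
    const int SPANX = 32, SPANY = 8, SPANZ = 8;                      // tile sizes from the patch
    const int x0 = 3, x1 = 253, y0 = 3, y1 = 253, z0 = 3, z1 = 253;  // example bounds (hypothetical)

    const int ntx = nb(x1 - x0, SPANX);   // 250 cells / 32 -> 8 tiles, the last one partial
    const int nty = nb(y1 - y0, SPANY);   // 250 cells / 8  -> 32 tiles, the last one partial
    const int ntz = nb(z1 - z0, SPANZ);
    std::printf("task grid: %d x %d x %d\n", ntx, nty, ntz);

    for (int tz = 0; tz < ntz; ++tz)
        for (int ty = 0; ty < nty; ++ty)
            for (int tx = 0; tx < ntx; ++tx) {
                // tx, ty, tz play the roles of taskIndex0/1/2 in stencil_step_task.
                const int xfirst = x0 + tx * SPANX, xlast = std::min(x1, xfirst + SPANX);
                const int yfirst = y0 + ty * SPANY, ylast = std::min(y1, yfirst + SPANY);
                const int zfirst = z0 + tz * SPANZ, zlast = std::min(z1, zfirst + SPANZ);
                // Each task would apply stencil_step over
                // [xfirst,xlast) x [yfirst,ylast) x [zfirst,zlast).
                (void)xlast; (void)ylast; (void)zlast;
            }
    return 0;
}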
diff --git a/examples_cuda/stencil/stencil_avx.bc b/examples_cuda/stencil/stencil_avx.bc
deleted file mode 100644
index 7a63ccce3094f36eb8f1674fbe86d04a3ae31cd9..0000000000000000000000000000000000000000
GIT binary patch
[binary patch data omitted]

[remaining GIT binary patch data for the other deleted/updated binary files omitted]
z)N=ktR1@Y480g}eMu$vl(~>$NQ+|bS4Jr$dqGPVtRwU$+Q^kw}BkrgLP_w0|i@&0W zqA&51Q$dYTyCEQps>Hxti}?qNQQx?lKQGEm=jn$spxc#-2@M=?(&o7VUmLD%bD`mi zy>2*X{DNrwgzik^ELA4e4b;HUgF(s}Y|*KGd>!ITgSYwHn*9NsCx&NHPF0SUY8-?A zf>v^na~A6rYjAkM>*sR;=>2QgdRy@YzLsWuel3W$FS6S)2II(s*H$=7t4}r#t8vYP p-b#iI@y&N93~Jt`&3EIoeHatlJDR9y_>lH2OfblmYdm|<{$F6h3bX(K diff --git a/examples_cuda/stencil/stencil_cu.bc b/examples_cuda/stencil/stencil_cu.bc deleted file mode 100644 index 5d9aecbe48bcf5764a18d325e4867aef3145094f..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 22616 zcmdVC3tWub_dov3rJ1S5G?OqG}}yrrQe_atG|>#j_RDK z?VQiG*0W>r-L>;R8#q>2y;EbBN1OaezAbVaI%9Fkl{l{@u|jv7-%6BWDI73A3`(Z-yKuupQaD|Q)E;9`H9mdU6V z>om(5zhFAGn9lPmP6J#EIUnjcjo4ygBIYL50N&hzNA$O6q}W+7nH8g{;l*cKJGsSc zQmC5+81@6)vo?;^6a;Lf_cTe1hOg(*2jcZ~C&$HUN9RxGX5<)9oH30RbeV7h#@NV; zj*$7&Wd@tLbh;DQJK99BfFb5n{I-Z}Ua%=_6RxNJL4olR?gZ{QSDsCrlzYvd6)Av^ zRpF^2xszSE@D$x|X7b8}>AU8e5Vu&*v_kN5;?}sQ22Mf^VVqQy$71o>^mw~C5!W?- zTpwT+>nU6(r;Tpb7%hvk&0wzbWv*JiX{}_Ft$0(2WYZe?CR;n^s*FuN7J6}lF`YGH zW{8X#Dq^CGV#aAMbGeim9M4%MW3GlzG2@Mtxd#8BLNRj~RaresN?v=&gR2HD-mZ`pr8I9c;Ys49Y)`30ZJDAIn$e8efc7ugF#0Ald zS5j2omD)%ADc~IO43veg!=xAsge5haNTrLA*w|jZ3H=wxe#R&QD8XEkG+9!l#{7dm zB8(0fa;}++Gy>g=F*n56NNt3!E${^wQY$WQJJ$Vo@%4F=|8-A2@EHZS()Sdj>B z^)=?+<3{X)TO(Wr|1zTb_29++5-SqJt$xPb$J~fAxHZyMa1ow@TT5d_sHdpS!O-S# zmee4>+8|JmGA7o=E;=vHS;`Ds&0Mi-8T%Ec^MT80kaC8zo%`qJ7~VZ#5Gd5Zt3~y@ zO#{mt6Gt0!dv#ohgt?Z*40GEgkucXtXrZe&Nd%kpZ~AfG$T`rJ>SE4I;e4mtMuso^ z`oFI3Q)XmX4y0^0+owf20G+m6yUVoE_gzM3<*~>4ZIbL_u4&c?bz`nF-=u##ms2g& zX{ghFUdMUwsq>+V13WJiV3;fXonAb6(zWJ3Ct5`-Ti61NdoNt073`~f-^EAw*+ z=W0Ve)fU^pQqbMD1Tdcqrns623WrX~ulJCqwJAJCupS6}qC9&zcU~y&v>QBND?YUQ z)K``ldEKe2eCjcxTcO8@ybWu!DW=A$%O9Vzi{wcPy@z#NdpFi|&RCk;t!G#i&Va3~ zld-sI!xEo4CN8bHs9_FInTbZ^`Z20InNc`?!v%}m^%se$#dUAVt8EJSAd+8`D^<{BcG%x-Q zwwxvRWnuIP0Xxr^<)pXK&)|k{0YYRQ^yS!wzU$Ce9{N%*q@~Q~x%wH*=ZOSt(`_52 z2;tC2S9uXhCrcs)6yjE-!0mro?Kbq`@QBg=B9@Z{J+FrojbX-^FEC{Uu>9Dgv4i&vF1RDPsk@)F3uT;3A!)!JIGY#S9df zF#`l6;D4{ZIXluA-aY%ec`X!7VCHbFZ%6Q@IRctt9EHD!`$6;1yTew`Ft|Nz-ER1j z=8RxQv0eDG96_Fdu}FAB$k=7bw&n(8GNRbd`k6Tq&Y2^%a_;Kdon`|EBacN|%5o%8 zKb)cVy*;cT;)e#{0T}W;TE(1kjH~7SM;+7qIQ(BM(3%nKspY*ysO7y>sI~kA_q~CO zVHU;m_OLQ1gS1?OJj1LAOSUz8bscKIzckk%s{duDAE4>A0<9=EbwFHhg>&X%ZD?AL z){DN7TYvAiHNVGn%76l^ynA93ZdM13B=Sl^IqNW@TSj#X45eMfXFXaLqLI`2ohwgfuEnk{j< zpcikQPZ`1wU>UI^wQn7g-uLF<_X4@IJ%?zd$wH*s41H(csH({276KD)BhJbI&zv#k z8c}Scd?%d5DP!|({i`CKVz?g#S+dz$ZX2C^<>-DLai7Q94oweW*Z$pL&*9vYqa#vwKb-Jq%S4W7JGuh(>0npk8K^vAKANUbb7aQT!7xju0RUyq5 zB~d+{JaA6RopDb7WyO!EYk^x4VhtJfR2E{sZX?e8k28f-r$)F%8>TJN_+?oiwso{8 zCr9I@x9DG|#B}EmaEs~ON2>MW8InhYAubP}1vxBwIo#55wZnS7xHd;vz|D1__eq*W zIdUtK$GQZ!2CzafOKdCNv2dbeJ2XtV>)@6nw-)n}^u;@7#KT2usr@#h>aiba{&u{s zHFeons7X7iBhomHb5*R1a}{(@)|Yw9tgppR&$L+TEi;ydzjV!Jry#D3N@3_v&b3`Q zKfc;@NtEr>*v9CE6WCh>vV=()i^qsr6st&Hlrd|Gseqv$TQ$;B-b3HHOmk_zj1_69 znP?Qf&u%=_ew`rF58y|BHaCeiqGt-C&?C8~t7EwN2la=UM~Wv?7smUs9{zqz^p*u< zX8v0}KY)ANWXyPW946Xcqf7j~t`(@6WUSpPt&Wk7D;6hH>?0$zsM9$H)|PI(BjUlb z$X*e;&a|L~3Fgi=GOa(Q){kj#`8~`7Y-GC4vEFX9Afq1UF*f34Ea#SXuZZdx!34QD zIl=;{os0Svn~TWR^|*2&^7t6(Qk?vy7T~!d(k8GQ_e-p_%2Gt)E{VJKJE^XVrIxD z9H$yYAVt~rn;3~`hUT;18N(xt=&dXyruZG=u^>j#1WV1HSs;h#cDy)ihrlsN(v$C8 zhd)J~quDq=K7y5>c5paOjGurLo2TiV#uyF6dTJtK+?`@qL~M%X3W8WKyx4B1U|dig z5R>3fQCDhC4*+5>0_gq^5R;KTq*@&>aJ3wd4{eoMd_v!;8Zq1@rhGJ3bE$px0|b#{ zEQS@!9#(nawzvkrC72g!c!V+?lwvaeWV6}p1Sf{EV8`yLRXC&O+i^zYHLs^JW&@)( zI7fmM*<@f;`r~*(K|8X-$&?Ji4BQIKak`=fRNEs)v&AfRA-HaXnG;1j@ zHWb(%!1Bc``8#pDaKNd`Qsh&EeEaa7%ZNr(HS176*B>g7vBmVL*U1RuXdeRcA| zDX$xd?dba!`a01J^;NK#>V^6mVllif^$~w2;z#_M;(?w!`gXuGbMa@y 
diff --git a/examples_cuda/stencil/stencil_cu.cpp b/examples_cuda/stencil/stencil_cu.cpp
index f23809a1..a4674f59 100644
--- a/examples_cuda/stencil/stencil_cu.cpp
+++ b/examples_cuda/stencil/stencil_cu.cpp
@@ -51,189 +51,8 @@ using namespace ispc;
 #include "drvapi_error_string.h"
 #include 
+#include "../cuda_ispc.h"
-double rtc(void)
-{
-  struct timeval Tvalue;
-  double etime;
-  struct timezone dummy;
-
-  gettimeofday(&Tvalue,&dummy);
-  etime = (double) Tvalue.tv_sec +
-      1.e-6*((double) Tvalue.tv_usec);
-  return etime;
-}
-
-#define checkCudaErrors(err) __checkCudaErrors (err, __FILE__, __LINE__)
-// These are the inline versions for all of the SDK helper functions
-void __checkCudaErrors(CUresult err, const char *file, const int line) {
-  if(CUDA_SUCCESS != err) {
-    std::cerr << "checkCudeErrors() Driver API error = " << err << "\""
-        << getCudaDrvErrorString(err) << "\" from file <" << file
-        << ", line " << line << "\n";
-    exit(-1);
-  }
-}
-
-/**********************/
-/* Basic CUDriver API */
-CUcontext context;
-
-void createContext(const int deviceId = 0)
-{
-  CUdevice device;
-  int devCount;
-  checkCudaErrors(cuInit(0));
-  checkCudaErrors(cuDeviceGetCount(&devCount));
-  assert(devCount > 0);
-  checkCudaErrors(cuDeviceGet(&device, deviceId < devCount ? deviceId : 0));
-
-  char name[128];
-  checkCudaErrors(cuDeviceGetName(name, 128, device));
-  std::cout << "Using CUDA Device [0]: " << name << "\n";
-
-  int devMajor, devMinor;
-  checkCudaErrors(cuDeviceComputeCapability(&devMajor, &devMinor, device));
-  std::cout << "Device Compute Capability: "
-      << devMajor << "." << devMinor << "\n";
-  if (devMajor < 2) {
-    std::cerr << "ERROR: Device 0 is not SM 2.0 or greater\n";
-    exit(1);
-  }
-
-  // Create driver context
-  checkCudaErrors(cuCtxCreate(&context, 0, device));
-}
-void destroyContext()
-{
-  checkCudaErrors(cuCtxDestroy(context));
-}
-
-CUmodule loadModule(const char * module)
-{
-  CUmodule cudaModule;
-  checkCudaErrors(cuModuleLoadData(&cudaModule, module));
-  return cudaModule;
-}
-void unloadModule(CUmodule &cudaModule)
-{
-  checkCudaErrors(cuModuleUnload(cudaModule));
-}
-
-CUfunction getFunction(CUmodule &cudaModule, const char * function)
-{
-  CUfunction cudaFunction;
-  checkCudaErrors(cuModuleGetFunction(&cudaFunction, cudaModule, function));
-  return cudaFunction;
-}
-
-CUdeviceptr deviceMalloc(const size_t size)
-{
-  CUdeviceptr d_buf;
-  checkCudaErrors(cuMemAlloc(&d_buf, size));
-  return d_buf;
-}
-void deviceFree(CUdeviceptr d_buf)
-{
-  checkCudaErrors(cuMemFree(d_buf));
-}
-void memcpyD2H(void * h_buf, CUdeviceptr d_buf, const size_t size)
-{
-  checkCudaErrors(cuMemcpyDtoH(h_buf, d_buf, size));
-}
-void memcpyH2D(CUdeviceptr d_buf, void * h_buf, const size_t size)
-{
-  checkCudaErrors(cuMemcpyHtoD(d_buf, h_buf, size));
-}
-#define deviceLaunch(func,nbx,nby,nbz,params) \
-  checkCudaErrors(cuFuncSetCacheConfig((func), CU_FUNC_CACHE_PREFER_L1)); \
-checkCudaErrors( \
-    cuLaunchKernel( \
-      (func), \
-      ((nbx-1)/(128/32)+1), (nby), (nbz), \
-      128, 1, 1, \
-      0, NULL, (params), NULL \
-      ));
-
-typedef CUdeviceptr devicePtr;
-
-
-/**************/
-#include 
-std::vector readBinary(const char * filename)
-{
-  std::vector buffer;
-  FILE *fp = fopen(filename, "rb");
-  if (!fp )
-  {
-    fprintf(stderr, "file %s not found\n", filename);
-    assert(0);
-  }
-#if 0
-  char c;
-  while ((c = fgetc(fp)) != EOF)
-    buffer.push_back(c);
-#else
-  fseek(fp, 0, SEEK_END);
-  const unsigned long long size = ftell(fp); /*calc the size needed*/
-  fseek(fp, 0, SEEK_SET);
-  buffer.resize(size);
-
-  if (fp == NULL){ /*ERROR detection if file == empty*/
-    fprintf(stderr, "Error: There was an Error reading the file %s \n",filename);
-    exit(1);
-  }
-  else if (fread(&buffer[0], sizeof(char), size, fp) != size){ /* if count of read bytes != calculated size of .bin file -> ERROR*/
-    fprintf(stderr, "Error: There was an Error reading the file %s \n", filename);
-    exit(1);
-  }
-#endif
-  fprintf(stderr, " read buffer of size= %d bytes \n", (int)buffer.size());
-  return buffer;
-}
-
-extern "C"
-{
-
-  void *CUDAAlloc(void **handlePtr, int64_t size, int32_t alignment)
-  {
-    return NULL;
-  }
-  void CUDALaunch(
-      void **handlePtr,
-      const char * module_name,
-      const char * module_1,
-      const char * func_name,
-      void **func_args,
-      int countx, int county, int countz)
-  {
-    assert(module_name != NULL);
-    assert(module_1 != NULL);
-    assert(func_name != NULL);
-    assert(func_args != NULL);
-#if 1
-    const char * module = module_1;
-#else
-    const std::vector module_str = readBinary("kernel.cubin");
-    const char * module = &module_str[0];
-#endif
-    CUmodule cudaModule = loadModule(module);
-    CUfunction cudaFunction = getFunction(cudaModule, func_name);
-    deviceLaunch(cudaFunction, countx, county, countz, func_args);
-    unloadModule(cudaModule);
-  }
-  void CUDASync(void *handle)
-  {
-    checkCudaErrors(cuStreamSynchronize(0));
-  }
-  void ISPCSync(void *handle)
-  {
-    checkCudaErrors(cuStreamSynchronize(0));
-  }
-  void CUDAFree(void *handle)
-  {
-  }
-}
 
 extern void loop_stencil_serial(int t0, int t1,
                                 int x0, int x1,
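The hunk above deletes the hand-rolled CUDA driver-API plumbing (checkCudaErrors, context and module management, the deviceLaunch macro, and the readBinary/cubin path) and pulls in the shared ../cuda_ispc.h header instead. That header is not part of this excerpt, so the sketch below only illustrates, as an assumption, how a CUDALaunch() with the five-argument form used later in this file (kernel name, argument-pointer array, log flag, register cap) could be assembled from the same driver-API calls that were just removed. The helper names, the global module handle, and the "returns elapsed wall-clock seconds" convention are guesses, not the actual contents of cuda_ispc.h.

    // Hypothetical sketch only, NOT the real ../cuda_ispc.h: one way the five-argument
    // CUDALaunch() used below could wrap the driver-API calls removed above.
    #include <cuda.h>
    #include <sys/time.h>
    #include <cstdio>
    #include <cstdlib>

    static CUmodule g_module;  // assumed: created elsewhere with cuModuleLoadData() on the kernel PTX

    static void checkCu(CUresult err, const char *what) {
      if (err != CUDA_SUCCESS) { fprintf(stderr, "%s failed (%d)\n", what, (int)err); exit(1); }
    }

    static double wallclock() {  // same role as the removed rtc()
      timeval tv;
      gettimeofday(&tv, NULL);
      return (double)tv.tv_sec + 1.e-6 * (double)tv.tv_usec;
    }

    // Assumed signature, matching the call site "1e3*CUDALaunch(NULL, func_name, func_args, print_log, nreg)";
    // the return value is taken to be elapsed wall-clock seconds.
    double CUDALaunch(void **handle, const char *func_name, void **func_args,
                      bool print_log, int nreg) {
      (void)handle; (void)print_log; (void)nreg;  // ignored in this sketch
      CUfunction f;
      checkCu(cuModuleGetFunction(&f, g_module, func_name), "cuModuleGetFunction");
      checkCu(cuFuncSetCacheConfig(f, CU_FUNC_CACHE_PREFER_L1), "cuFuncSetCacheConfig");
      const double t0 = wallclock();
      // Grid/block choice is a placeholder; the removed deviceLaunch macro packed
      // 128/32 = 4 ISPC tasks into each 128-thread block.
      checkCu(cuLaunchKernel(f, 1, 1, 1, 128, 1, 1, 0, NULL, func_args, NULL), "cuLaunchKernel");
      checkCu(cuStreamSynchronize(0), "cuStreamSynchronize");
      return wallclock() - t0;
    }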
@@ -295,9 +114,9 @@ int main() {
         double dt = get_elapsed_mcycles();
         minTimeISPC = std::min(minTimeISPC, dt);
     }
-#endif
     printf("[stencil ispc 1 core]:\t\t[%.3f] million cycles\n", minTimeISPC);
+#endif
 
     InitData(Nx, Ny, Nz, Aispc, vsq);
 
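In the next hunk the benchmark loop stops calling loop_stencil_ispc_tasks() on the host and instead packs every kernel parameter into a void*[] (func_args) that is handed to CUDALaunch(). This follows the standard CUDA driver-API kernelParams convention: one array entry per kernel parameter, each entry pointing at the parameter's value (scalars by address, device buffers by the address of their CUdeviceptr), in declaration order. A minimal stand-alone illustration with a hypothetical kernel signature:

    // Illustration of the kernelParams convention; the kernel name "scale_add" is hypothetical,
    // assumed to have the signature (double *in, double *out, int n, double s).
    #include <cuda.h>

    void launch_scale_add(CUfunction scale_add, CUdeviceptr d_in, CUdeviceptr d_out, int n, double s) {
      void *params[] = { &d_in, &d_out, &n, &s };         // order must match the kernel's parameter list
      cuLaunchKernel(scale_add,
                     (unsigned)((n + 127) / 128), 1, 1,   // grid
                     128, 1, 1,                           // block
                     0, NULL,                             // shared memory, stream
                     params, NULL);                       // kernelParams, extra
    }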
@@ -310,19 +129,35 @@ int main() {
     // the minimum time of three runs.
     //
     double minTimeISPCTasks = 1e30;
+    const bool print_log = false;
+    const int nreg = 128;
     for (int i = 0; i < 3; ++i) {
         reset_and_start_timer();
-        const double t0 = rtc();
-        loop_stencil_ispc_tasks(0, 6, width, Nx - width, width, Ny - width,
-                                width, Nz - width, Nx, Ny, Nz, (double*)d_coeff, (double*)d_vsq,
-                                (double*)d_Aispc0, (double*)d_Aispc1);
-        double dt = rtc() - t0; //get_elapsed_mcycles();
+        const char * func_name = "loop_stencil_ispc_tasks";
+
+        int t0 = 0;
+        int t1 = 6;
+
+        int x0 = width;
+        int x1 = Nx - width;
+
+        int y0 = width;
+        int y1 = Ny - width;
+
+        int z0 = width;
+        int z1 = Nz - width;
+
+        void *func_args[] = {
+          &t0, &t1,
+          &x0, &x1, &y0, &y1, &z0, &z1, &Nx, &Ny, &Nz,
+          &d_coeff, &d_vsq, &d_Aispc0, &d_Aispc1};
+        double dt = 1e3*CUDALaunch(NULL, func_name, func_args, print_log, nreg);
         minTimeISPCTasks = std::min(minTimeISPCTasks, dt);
     }
 
     memcpyD2H(Aispc[1], d_Aispc1, bufsize);
     //memcpyD2H(Aispc[1], d_vsq, bufsize);
-    printf("[stencil ispc + tasks]:\t\t[%.3f] million cycles\n", minTimeISPCTasks);
+    fprintf(stderr, "[stencil ispc + tasks]:\t\t[%.3f] million cycles\n", minTimeISPCTasks);
 
     InitData(Nx, Ny, Nz, Aserial, vsq);
 
diff --git a/examples_cuda/stencil/stencil_cu.ll b/examples_cuda/stencil/stencil_cu.ll
deleted file mode 100644
index 6ea8748c..00000000
--- a/examples_cuda/stencil/stencil_cu.ll
+++ /dev/null
@@ -1,762 +0,0 @@
-; ModuleID = 'stencil_cu.bc'
-target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
-target triple = "x86_64-unknown-linux-gnu"
-
-; Function Attrs: nounwind
-declare i8* @ISPCAlloc(i8**, i64, i32) #0
-
-; Function Attrs: nounwind
-declare void @ISPCLaunch(i8**, i8*, i8*, i32, i32, i32) #0
-
-; Function Attrs: nounwind
-declare void @ISPCSync(i8*) #0
-
-; Function Attrs: nounwind readnone
-declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) #1
-
-; Function Attrs: nounwind readonly
-declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8*, <4 x double>) #2
-
-; Function Attrs: nounwind
-declare void @llvm.x86.avx.maskstore.pd.256(i8*, <4 x double>, <4 x double>) #0
-
-; Function Attrs: nounwind
-define internal fastcc void @stencil_step___uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_(i32 %x0, i32 %x1, i32 %y0, i32 %y1, i32 %z0, i32 %z1, i32 %Nx, i32 %Ny, double* noalias nocapture %coef, double* noalias %vsq, double* noalias %Ain, double* noalias %Aout, <8 x i32> %__mask) #3 {
-allocas:
-  %floatmask.i = bitcast <8 x i32> %__mask to <8 x float>
-  %v.i = tail call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask.i) #1
-  %cmp.i = icmp eq i32 %v.i, 255
-  %mul_Nx_load_Ny_load = mul i32 %Ny, %Nx
-  %coef_load_offset_load = load double* %coef, align 8
-  %coef_load18_offset = getelementptr double* %coef, i64 1
-  %coef_load18_offset_load = load double* %coef_load18_offset, align 8
-  %coef_load21_offset = getelementptr double* %coef, i64 2
-  %coef_load21_offset_load = load double* %coef_load21_offset, align 8
-  %coef_load24_offset = getelementptr double* %coef, i64 3
-  %coef_load24_offset_load = load double* %coef_load24_offset, align 8
-  %less_z_load_z1_load260 = icmp slt i32 %z0, %z1
-  br i1 %cmp.i, label %for_test.preheader, label %for_test264.preheader
-
-for_test264.preheader: ; preds = %allocas - br i1 %less_z_load_z1_load260, label %for_test275.preheader.lr.ph, label %for_exit - -for_test275.preheader.lr.ph: ; preds = %for_test264.preheader - %less_y_load282_y1_load283264 = icmp slt i32 %y0, %y1 - %less_xb_load293_x1_load294262 = icmp slt i32 %x0, %x1 - %x1_load463_broadcast_init = insertelement <8 x i32> undef, i32 %x1, i32 0 - %x1_load463_broadcast = shufflevector <8 x i32> %x1_load463_broadcast_init, <8 x i32> undef, <8 x i32> zeroinitializer - %mul__Nx_load382 = shl i32 %Nx, 1 - %mul__Nx_load431 = mul i32 %Nx, 3 - %mul__Nx_load390 = mul i32 %Nx, -2 - %mul__Nx_load439 = mul i32 %Nx, -3 - %mul__Nxy_load399 = shl i32 %mul_Nx_load_Ny_load, 1 - %mul__Nxy_load448 = mul i32 %mul_Nx_load_Ny_load, 3 - %mul__Nxy_load407 = mul i32 %mul_Nx_load_Ny_load, -2 - %mul__Nxy_load456 = mul i32 %mul_Nx_load_Ny_load, -3 - %Ain_load327_ptr2int_2void = bitcast double* %Ain to i8* - %mask0.i.i201 = shufflevector <8 x i32> %__mask, <8 x i32> undef, <8 x i32> - %mask1.i.i202 = shufflevector <8 x i32> %__mask, <8 x i32> undef, <8 x i32> - %mask0d.i.i203 = bitcast <8 x i32> %mask0.i.i201 to <4 x double> - %mask1d.i.i204 = bitcast <8 x i32> %mask1.i.i202 to <4 x double> - %coef1_load315_broadcast_init = insertelement <8 x double> undef, double %coef_load18_offset_load, i32 0 - %coef0_load306_broadcast_init = insertelement <8 x double> undef, double %coef_load_offset_load, i32 0 - %coef2_load364_broadcast_init = insertelement <8 x double> undef, double %coef_load21_offset_load, i32 0 - %coef1_load315_broadcast = shufflevector <8 x double> %coef1_load315_broadcast_init, <8 x double> undef, <8 x i32> zeroinitializer - %coef0_load306_broadcast = shufflevector <8 x double> %coef0_load306_broadcast_init, <8 x double> undef, <8 x i32> zeroinitializer - %coef3_load413_broadcast_init = insertelement <8 x double> undef, double %coef_load24_offset_load, i32 0 - %coef2_load364_broadcast = shufflevector <8 x double> %coef2_load364_broadcast_init, <8 x double> undef, <8 x i32> zeroinitializer - %coef3_load413_broadcast = shufflevector <8 x double> %coef3_load413_broadcast_init, <8 x double> undef, <8 x i32> zeroinitializer - %Aout_load488_ptr2int_2void = bitcast double* %Aout to i8* - %vsq_load494_ptr2int_2void = bitcast double* %vsq to i8* - br label %for_test275.preheader - -for_test.preheader: ; preds = %allocas - br i1 %less_z_load_z1_load260, label %for_test30.preheader.lr.ph, label %for_exit - -for_test30.preheader.lr.ph: ; preds = %for_test.preheader - %less_y_load_y1_load258 = icmp slt i32 %y0, %y1 - %less_xb_load_x1_load256 = icmp slt i32 %x0, %x1 - %x1_load199_broadcast_init = insertelement <8 x i32> undef, i32 %x1, i32 0 - %x1_load199_broadcast = shufflevector <8 x i32> %x1_load199_broadcast_init, <8 x i32> undef, <8 x i32> zeroinitializer - %mul__Nx_load119 = shl i32 %Nx, 1 - %mul__Nx_load167 = mul i32 %Nx, 3 - %mul__Nx_load127 = mul i32 %Nx, -2 - %mul__Nx_load175 = mul i32 %Nx, -3 - %mul__Nxy_load136 = shl i32 %mul_Nx_load_Ny_load, 1 - %mul__Nxy_load184 = mul i32 %mul_Nx_load_Ny_load, 3 - %mul__Nxy_load144 = mul i32 %mul_Nx_load_Ny_load, -2 - %mul__Nxy_load192 = mul i32 %mul_Nx_load_Ny_load, -3 - %Ain_load65_ptr2int_2void = bitcast double* %Ain to i8* - %coef1_load_broadcast_init = insertelement <8 x double> undef, double %coef_load18_offset_load, i32 0 - %coef0_load_broadcast_init = insertelement <8 x double> undef, double %coef_load_offset_load, i32 0 - %coef2_load_broadcast_init = insertelement <8 x double> undef, double %coef_load21_offset_load, i32 0 - 
%coef1_load_broadcast = shufflevector <8 x double> %coef1_load_broadcast_init, <8 x double> undef, <8 x i32> zeroinitializer - %coef0_load_broadcast = shufflevector <8 x double> %coef0_load_broadcast_init, <8 x double> undef, <8 x i32> zeroinitializer - %coef3_load_broadcast_init = insertelement <8 x double> undef, double %coef_load24_offset_load, i32 0 - %coef2_load_broadcast = shufflevector <8 x double> %coef2_load_broadcast_init, <8 x double> undef, <8 x i32> zeroinitializer - %coef3_load_broadcast = shufflevector <8 x double> %coef3_load_broadcast_init, <8 x double> undef, <8 x i32> zeroinitializer - %Aout_load219_ptr2int_2void = bitcast double* %Aout to i8* - %vsq_load_ptr2int_2void = bitcast double* %vsq to i8* - br label %for_test30.preheader - -for_test30.preheader: ; preds = %for_exit33, %for_test30.preheader.lr.ph - %z.0261 = phi i32 [ %z0, %for_test30.preheader.lr.ph ], [ %z_load242_plus1, %for_exit33 ] - br i1 %less_y_load_y1_load258, label %for_test37.preheader.lr.ph, label %for_exit33 - -for_test37.preheader.lr.ph: ; preds = %for_test30.preheader - %mul_z_load45_Nxy_load = mul i32 %z.0261, %mul_Nx_load_Ny_load - br i1 %less_xb_load_x1_load256, label %for_loop39.lr.ph.us, label %for_exit33 - -for_exit40.us: ; preds = %safe_if_after_true.us - %y_load241_plus1.us = add i32 %y.0259.us, 1 - %exitcond = icmp eq i32 %y_load241_plus1.us, %y1 - br i1 %exitcond, label %for_exit33, label %for_loop39.lr.ph.us - -for_loop39.us: ; preds = %for_loop39.lr.ph.us, %safe_if_after_true.us - %xb.0257.us = phi i32 [ %x0, %for_loop39.lr.ph.us ], [ %add_xb_load240_.us, %safe_if_after_true.us ] - %xb_load44_broadcast_init.us = insertelement <8 x i32> undef, i32 %xb.0257.us, i32 0 - %xb_load44_broadcast.us = shufflevector <8 x i32> %xb_load44_broadcast_init.us, <8 x i32> undef, <8 x i32> zeroinitializer - %add_xb_load44_broadcast_.us = add <8 x i32> %xb_load44_broadcast.us, - %less_x_load198_x1_load199_broadcast.us = icmp slt <8 x i32> %add_xb_load44_broadcast_.us, %x1_load199_broadcast - %"oldMask&test.us" = select <8 x i1> %less_x_load198_x1_load199_broadcast.us, <8 x i32> , <8 x i32> zeroinitializer - %floatmask.i244.us = bitcast <8 x i32> %"oldMask&test.us" to <8 x float> - %v.i245.us = tail call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask.i244.us) #1 - %cmp.i246.us = icmp eq i32 %v.i245.us, 0 - br i1 %cmp.i246.us, label %safe_if_after_true.us, label %safe_if_run_true.us - -safe_if_run_true.us: ; preds = %for_loop39.us - %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast.elt0.us = add i32 %xb.0257.us, %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.us - %scaled_varying.elt0.us = shl i32 %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast.elt0.us, 3 - %"varying+const_offsets.elt0.us" = add i32 %scaled_varying.elt0.us, -8 - %0 = sext i32 %"varying+const_offsets.elt0.us" to i64 - %ptr.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %0, !filename !0, !first_line !1, !first_column !2, !last_line !1, !last_column !3 - %ptr_cast_for_load.us = bitcast i8* %ptr.us to <8 x double>* - %ptr_masked_load521.us = load <8 x double>* %ptr_cast_for_load.us, align 8, !filename !0, !first_line !1, !first_column !2, !last_line !1, !last_column !3 - %"varying+const_offsets529.elt0.us" = add i32 %scaled_varying.elt0.us, 8 - %1 = sext i32 %"varying+const_offsets529.elt0.us" to i64 - %ptr530.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %1, !filename !0, !first_line !1, !first_column !4, !last_line !1, !last_column !5 - 
%ptr_cast_for_load531.us = bitcast i8* %ptr530.us to <8 x double>* - %ptr530_masked_load532.us = load <8 x double>* %ptr_cast_for_load531.us, align 8, !filename !0, !first_line !1, !first_column !4, !last_line !1, !last_column !5 - %"varying+const_offsets540.elt0.us" = add i32 %scaled_varying.elt0.us, -16 - %2 = sext i32 %"varying+const_offsets540.elt0.us" to i64 - %ptr541.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %2, !filename !0, !first_line !6, !first_column !2, !last_line !6, !last_column !3 - %ptr_cast_for_load542.us = bitcast i8* %ptr541.us to <8 x double>* - %ptr541_masked_load543.us = load <8 x double>* %ptr_cast_for_load542.us, align 8, !filename !0, !first_line !6, !first_column !2, !last_line !6, !last_column !3 - %"varying+const_offsets551.elt0.us" = add i32 %scaled_varying.elt0.us, 16 - %3 = sext i32 %"varying+const_offsets551.elt0.us" to i64 - %ptr552.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %3, !filename !0, !first_line !6, !first_column !4, !last_line !6, !last_column !5 - %ptr_cast_for_load553.us = bitcast i8* %ptr552.us to <8 x double>* - %ptr552_masked_load554.us = load <8 x double>* %ptr_cast_for_load553.us, align 8, !filename !0, !first_line !6, !first_column !4, !last_line !6, !last_column !5 - %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast556_mul__Nx_load71_broadcast.elt0.us = add i32 %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast556.elt0.us, %xb.0257.us - %scaled_varying560.elt0.us = shl i32 %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast556_mul__Nx_load71_broadcast.elt0.us, 3 - %4 = sext i32 %scaled_varying560.elt0.us to i64 - %ptr562.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %4, !filename !0, !first_line !2, !first_column !7, !last_line !2, !last_column !8 - %ptr_cast_for_load563.us = bitcast i8* %ptr562.us to <8 x double>* - %ptr562_masked_load564.us = load <8 x double>* %ptr_cast_for_load563.us, align 8, !filename !0, !first_line !2, !first_column !7, !last_line !2, !last_column !8 - %add_Ain_load57_offset_load_Ain_load65_offset_load.us = fadd <8 x double> %ptr_masked_load521.us, %ptr530_masked_load532.us - %"varying+const_offsets572.elt0.us" = add i32 %scaled_varying.elt0.us, -24 - %5 = sext i32 %"varying+const_offsets572.elt0.us" to i64 - %ptr573.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %5, !filename !0, !first_line !9, !first_column !2, !last_line !9, !last_column !3 - %ptr_cast_for_load574.us = bitcast i8* %ptr573.us to <8 x double>* - %ptr573_masked_load575.us = load <8 x double>* %ptr_cast_for_load574.us, align 8, !filename !0, !first_line !9, !first_column !2, !last_line !9, !last_column !3 - %"varying+const_offsets583.elt0.us" = add i32 %scaled_varying.elt0.us, 24 - %6 = sext i32 %"varying+const_offsets583.elt0.us" to i64 - %ptr584.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %6, !filename !0, !first_line !9, !first_column !4, !last_line !9, !last_column !5 - %ptr_cast_for_load585.us = bitcast i8* %ptr584.us to <8 x double>* - %ptr584_masked_load586.us = load <8 x double>* %ptr_cast_for_load585.us, align 8, !filename !0, !first_line !9, !first_column !4, !last_line !9, !last_column !5 - %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast588_mul__Nx_load119_broadcast.elt0.us = add i32 %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast588.elt0.us, %xb.0257.us - %scaled_varying593.elt0.us = shl i32 
%add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast588_mul__Nx_load119_broadcast.elt0.us, 3 - %7 = sext i32 %scaled_varying593.elt0.us to i64 - %ptr595.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %7, !filename !0, !first_line !10, !first_column !11, !last_line !10, !last_column !1 - %ptr_cast_for_load596.us = bitcast i8* %ptr595.us to <8 x double>* - %ptr595_masked_load597.us = load <8 x double>* %ptr_cast_for_load596.us, align 8, !filename !0, !first_line !10, !first_column !11, !last_line !10, !last_column !1 - %add_Ain_load105_offset_load_Ain_load113_offset_load.us = fadd <8 x double> %ptr541_masked_load543.us, %ptr552_masked_load554.us - %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast599_mul__Nx_load79_broadcast.elt0.us = add i32 %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast599.elt0.us, %xb.0257.us - %scaled_varying604.elt0.us = shl i32 %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast599_mul__Nx_load79_broadcast.elt0.us, 3 - %8 = sext i32 %scaled_varying604.elt0.us to i64 - %ptr606.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %8, !filename !0, !first_line !2, !first_column !12, !last_line !2, !last_column !13 - %ptr_cast_for_load607.us = bitcast i8* %ptr606.us to <8 x double>* - %ptr606_masked_load608.us = load <8 x double>* %ptr_cast_for_load607.us, align 8, !filename !0, !first_line !2, !first_column !12, !last_line !2, !last_column !13 - %add_add_Ain_load57_offset_load_Ain_load65_offset_load_Ain_load73_offset_load.us = fadd <8 x double> %add_Ain_load57_offset_load_Ain_load65_offset_load.us, %ptr562_masked_load564.us - %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast610_mul__Nx_load167_broadcast.elt0.us = add i32 %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast610.elt0.us, %xb.0257.us - %scaled_varying615.elt0.us = shl i32 %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast610_mul__Nx_load167_broadcast.elt0.us, 3 - %9 = sext i32 %scaled_varying615.elt0.us to i64 - %ptr617.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %9, !filename !0, !first_line !14, !first_column !11, !last_line !14, !last_column !1 - %ptr_cast_for_load618.us = bitcast i8* %ptr617.us to <8 x double>* - %ptr617_masked_load619.us = load <8 x double>* %ptr_cast_for_load618.us, align 8, !filename !0, !first_line !14, !first_column !11, !last_line !14, !last_column !1 - %add_Ain_load153_offset_load_Ain_load161_offset_load.us = fadd <8 x double> %ptr573_masked_load575.us, %ptr584_masked_load586.us - %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast621_mul__Nx_load127_broadcast.elt0.us = add i32 %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast621.elt0.us, %xb.0257.us - %scaled_varying626.elt0.us = shl i32 %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast621_mul__Nx_load127_broadcast.elt0.us, 3 - %10 = sext i32 %scaled_varying626.elt0.us to i64 - %ptr628.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %10, !filename !0, !first_line !10, !first_column !6, !last_line !10, !last_column !15 - %ptr_cast_for_load629.us = bitcast i8* %ptr628.us to <8 x double>* - %ptr628_masked_load630.us = load <8 x double>* %ptr_cast_for_load629.us, align 8, !filename !0, !first_line !10, !first_column !6, !last_line !10, !last_column !15 - 
%add_add_Ain_load105_offset_load_Ain_load113_offset_load_Ain_load121_offset_load.us = fadd <8 x double> %add_Ain_load105_offset_load_Ain_load113_offset_load.us, %ptr595_masked_load597.us - %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast632_mul__Nxy_load88_broadcast.elt0.us = add i32 %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast632.elt0.us, %xb.0257.us - %scaled_varying637.elt0.us = shl i32 %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast632_mul__Nxy_load88_broadcast.elt0.us, 3 - %11 = sext i32 %scaled_varying637.elt0.us to i64 - %ptr639.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %11, !filename !0, !first_line !12, !first_column !11, !last_line !12, !last_column !1 - %ptr_cast_for_load640.us = bitcast i8* %ptr639.us to <8 x double>* - %ptr639_masked_load641.us = load <8 x double>* %ptr_cast_for_load640.us, align 8, !filename !0, !first_line !12, !first_column !11, !last_line !12, !last_column !1 - %add_add_add_Ain_load57_offset_load_Ain_load65_offset_load_Ain_load73_offset_load_Ain_load81_offset_load.us = fadd <8 x double> %add_add_Ain_load57_offset_load_Ain_load65_offset_load_Ain_load73_offset_load.us, %ptr606_masked_load608.us - %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast643_mul__Nx_load175_broadcast.elt0.us = add i32 %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast643.elt0.us, %xb.0257.us - %scaled_varying648.elt0.us = shl i32 %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast643_mul__Nx_load175_broadcast.elt0.us, 3 - %12 = sext i32 %scaled_varying648.elt0.us to i64 - %ptr650.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %12, !filename !0, !first_line !14, !first_column !6, !last_line !14, !last_column !15 - %ptr_cast_for_load651.us = bitcast i8* %ptr650.us to <8 x double>* - %ptr650_masked_load652.us = load <8 x double>* %ptr_cast_for_load651.us, align 8, !filename !0, !first_line !14, !first_column !6, !last_line !14, !last_column !15 - %add_add_Ain_load153_offset_load_Ain_load161_offset_load_Ain_load169_offset_load.us = fadd <8 x double> %add_Ain_load153_offset_load_Ain_load161_offset_load.us, %ptr617_masked_load619.us - %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast654_mul__Nxy_load136_broadcast.elt0.us = add i32 %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast654.elt0.us, %xb.0257.us - %scaled_varying659.elt0.us = shl i32 %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast654_mul__Nxy_load136_broadcast.elt0.us, 3 - %13 = sext i32 %scaled_varying659.elt0.us to i64 - %ptr661.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %13, !filename !0, !first_line !16, !first_column !11, !last_line !16, !last_column !1 - %ptr_cast_for_load662.us = bitcast i8* %ptr661.us to <8 x double>* - %ptr661_masked_load663.us = load <8 x double>* %ptr_cast_for_load662.us, align 8, !filename !0, !first_line !16, !first_column !11, !last_line !16, !last_column !1 - %add_add_add_Ain_load105_offset_load_Ain_load113_offset_load_Ain_load121_offset_load_Ain_load129_offset_load.us = fadd <8 x double> %add_add_Ain_load105_offset_load_Ain_load113_offset_load_Ain_load121_offset_load.us, %ptr628_masked_load630.us - %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast665_mul__Nxy_load96_broadcast.elt0.us = add i32 
%add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast665.elt0.us, %xb.0257.us - %scaled_varying670.elt0.us = shl i32 %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast665_mul__Nxy_load96_broadcast.elt0.us, 3 - %14 = sext i32 %scaled_varying670.elt0.us to i64 - %ptr672.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %14, !filename !0, !first_line !12, !first_column !6, !last_line !12, !last_column !15 - %ptr_cast_for_load673.us = bitcast i8* %ptr672.us to <8 x double>* - %ptr672_masked_load674.us = load <8 x double>* %ptr_cast_for_load673.us, align 8, !filename !0, !first_line !12, !first_column !6, !last_line !12, !last_column !15 - %add_add_add_add_Ain_load57_offset_load_Ain_load65_offset_load_Ain_load73_offset_load_Ain_load81_offset_load_Ain_load89_offset_load.us = fadd <8 x double> %add_add_add_Ain_load57_offset_load_Ain_load65_offset_load_Ain_load73_offset_load_Ain_load81_offset_load.us, %ptr639_masked_load641.us - %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast676_mul__Nxy_load184_broadcast.elt0.us = add i32 %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast676.elt0.us, %xb.0257.us - %scaled_varying681.elt0.us = shl i32 %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast676_mul__Nxy_load184_broadcast.elt0.us, 3 - %15 = sext i32 %scaled_varying681.elt0.us to i64 - %ptr683.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %15, !filename !0, !first_line !17, !first_column !11, !last_line !17, !last_column !1 - %ptr_cast_for_load684.us = bitcast i8* %ptr683.us to <8 x double>* - %ptr683_masked_load685.us = load <8 x double>* %ptr_cast_for_load684.us, align 8, !filename !0, !first_line !17, !first_column !11, !last_line !17, !last_column !1 - %add_add_add_Ain_load153_offset_load_Ain_load161_offset_load_Ain_load169_offset_load_Ain_load177_offset_load.us = fadd <8 x double> %add_add_Ain_load153_offset_load_Ain_load161_offset_load_Ain_load169_offset_load.us, %ptr650_masked_load652.us - %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast687_mul__Nxy_load144_broadcast.elt0.us = add i32 %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast687.elt0.us, %xb.0257.us - %scaled_varying692.elt0.us = shl i32 %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast687_mul__Nxy_load144_broadcast.elt0.us, 3 - %16 = sext i32 %scaled_varying692.elt0.us to i64 - %ptr694.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %16, !filename !0, !first_line !16, !first_column !6, !last_line !16, !last_column !15 - %ptr_cast_for_load695.us = bitcast i8* %ptr694.us to <8 x double>* - %ptr694_masked_load696.us = load <8 x double>* %ptr_cast_for_load695.us, align 8, !filename !0, !first_line !16, !first_column !6, !last_line !16, !last_column !15 - %add_add_add_add_Ain_load105_offset_load_Ain_load113_offset_load_Ain_load121_offset_load_Ain_load129_offset_load_Ain_load137_offset_load.us = fadd <8 x double> %add_add_add_Ain_load105_offset_load_Ain_load113_offset_load_Ain_load121_offset_load_Ain_load129_offset_load.us, %ptr661_masked_load663.us - %add_add_add_add_add_Ain_load57_offset_load_Ain_load65_offset_load_Ain_load73_offset_load_Ain_load81_offset_load_Ain_load89_offset_load_Ain_load97_offset_load.us = fadd <8 x double> 
%add_add_add_add_Ain_load57_offset_load_Ain_load65_offset_load_Ain_load73_offset_load_Ain_load81_offset_load_Ain_load89_offset_load.us, %ptr672_masked_load674.us - %17 = sext i32 %scaled_varying.elt0.us to i64 - %ptr705.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %17, !filename !0, !first_line !8, !first_column !18, !last_line !8, !last_column !19 - %ptr_cast_for_load706.us = bitcast i8* %ptr705.us to <8 x double>* - %ptr705_masked_load707.us = load <8 x double>* %ptr_cast_for_load706.us, align 8, !filename !0, !first_line !8, !first_column !18, !last_line !8, !last_column !19 - %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast709_mul__Nxy_load192_broadcast.elt0.us = add i32 %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast709.elt0.us, %xb.0257.us - %scaled_varying714.elt0.us = shl i32 %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast709_mul__Nxy_load192_broadcast.elt0.us, 3 - %18 = sext i32 %scaled_varying714.elt0.us to i64 - %ptr716.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %18, !filename !0, !first_line !17, !first_column !6, !last_line !17, !last_column !15 - %ptr_cast_for_load717.us = bitcast i8* %ptr716.us to <8 x double>* - %ptr716_masked_load718.us = load <8 x double>* %ptr_cast_for_load717.us, align 8, !filename !0, !first_line !17, !first_column !6, !last_line !17, !last_column !15 - %add_add_add_add_Ain_load153_offset_load_Ain_load161_offset_load_Ain_load169_offset_load_Ain_load177_offset_load_Ain_load185_offset_load.us = fadd <8 x double> %add_add_add_Ain_load153_offset_load_Ain_load161_offset_load_Ain_load169_offset_load_Ain_load177_offset_load.us, %ptr683_masked_load685.us - %add_add_add_add_add_Ain_load105_offset_load_Ain_load113_offset_load_Ain_load121_offset_load_Ain_load129_offset_load_Ain_load137_offset_load_Ain_load145_offset_load.us = fadd <8 x double> %add_add_add_add_Ain_load105_offset_load_Ain_load113_offset_load_Ain_load121_offset_load_Ain_load129_offset_load_Ain_load137_offset_load.us, %ptr694_masked_load696.us - %mul_coef1_load_broadcast_add_add_add_add_add_Ain_load57_offset_load_Ain_load65_offset_load_Ain_load73_offset_load_Ain_load81_offset_load_Ain_load89_offset_load_Ain_load97_offset_load.us = fmul <8 x double> %coef1_load_broadcast, %add_add_add_add_add_Ain_load57_offset_load_Ain_load65_offset_load_Ain_load73_offset_load_Ain_load81_offset_load_Ain_load89_offset_load_Ain_load97_offset_load.us - %mul_coef0_load_broadcast_Ain_load_offset_load.us = fmul <8 x double> %coef0_load_broadcast, %ptr705_masked_load707.us - %add_add_add_add_add_Ain_load153_offset_load_Ain_load161_offset_load_Ain_load169_offset_load_Ain_load177_offset_load_Ain_load185_offset_load_Ain_load193_offset_load.us = fadd <8 x double> %add_add_add_add_Ain_load153_offset_load_Ain_load161_offset_load_Ain_load169_offset_load_Ain_load177_offset_load_Ain_load185_offset_load.us, %ptr716_masked_load718.us - %mul_coef2_load_broadcast_add_add_add_add_add_Ain_load105_offset_load_Ain_load113_offset_load_Ain_load121_offset_load_Ain_load129_offset_load_Ain_load137_offset_load_Ain_load145_offset_load.us = fmul <8 x double> %coef2_load_broadcast, %add_add_add_add_add_Ain_load105_offset_load_Ain_load113_offset_load_Ain_load121_offset_load_Ain_load129_offset_load_Ain_load137_offset_load_Ain_load145_offset_load.us - 
%add_mul_coef0_load_broadcast_Ain_load_offset_load_mul_coef1_load_broadcast_add_add_add_add_add_Ain_load57_offset_load_Ain_load65_offset_load_Ain_load73_offset_load_Ain_load81_offset_load_Ain_load89_offset_load_Ain_load97_offset_load.us = fadd <8 x double> %mul_coef1_load_broadcast_add_add_add_add_add_Ain_load57_offset_load_Ain_load65_offset_load_Ain_load73_offset_load_Ain_load81_offset_load_Ain_load89_offset_load_Ain_load97_offset_load.us, %mul_coef0_load_broadcast_Ain_load_offset_load.us - %mul_coef3_load_broadcast_add_add_add_add_add_Ain_load153_offset_load_Ain_load161_offset_load_Ain_load169_offset_load_Ain_load177_offset_load_Ain_load185_offset_load_Ain_load193_offset_load.us = fmul <8 x double> %coef3_load_broadcast, %add_add_add_add_add_Ain_load153_offset_load_Ain_load161_offset_load_Ain_load169_offset_load_Ain_load177_offset_load_Ain_load185_offset_load_Ain_load193_offset_load.us - %add_add_mul_coef0_load_broadcast_Ain_load_offset_load_mul_coef1_load_broadcast_add_add_add_add_add_Ain_load57_offset_load_Ain_load65_offset_load_Ain_load73_offset_load_Ain_load81_offset_load_Ain_load89_offset_load_Ain_load97_offset_load_mul_coef2_load_broadcast_add_add_add_add_add_Ain_load105_offset_load_Ain_load113_offset_load_Ain_load121_offset_load_Ain_load129_offset_load_Ain_load137_offset_load_Ain_load145_offset_load.us = fadd <8 x double> %mul_coef2_load_broadcast_add_add_add_add_add_Ain_load105_offset_load_Ain_load113_offset_load_Ain_load121_offset_load_Ain_load129_offset_load_Ain_load137_offset_load_Ain_load145_offset_load.us, %add_mul_coef0_load_broadcast_Ain_load_offset_load_mul_coef1_load_broadcast_add_add_add_add_add_Ain_load57_offset_load_Ain_load65_offset_load_Ain_load73_offset_load_Ain_load81_offset_load_Ain_load89_offset_load_Ain_load97_offset_load.us - %add_add_add_mul_coef0_load_broadcast_Ain_load_offset_load_mul_coef1_load_broadcast_add_add_add_add_add_Ain_load57_offset_load_Ain_load65_offset_load_Ain_load73_offset_load_Ain_load81_offset_load_Ain_load89_offset_load_Ain_load97_offset_load_mul_coef2_load_broadcast_add_add_add_add_add_Ain_load105_offset_load_Ain_load113_offset_load_Ain_load121_offset_load_Ain_load129_offset_load_Ain_load137_offset_load_Ain_load145_offset_load_mul_coef3_load_broadcast_add_add_add_add_add_Ain_load153_offset_load_Ain_load161_offset_load_Ain_load169_offset_load_Ain_load177_offset_load_Ain_load185_offset_load_Ain_load193_offset_load.us = fadd <8 x double> %add_add_mul_coef0_load_broadcast_Ain_load_offset_load_mul_coef1_load_broadcast_add_add_add_add_add_Ain_load57_offset_load_Ain_load65_offset_load_Ain_load73_offset_load_Ain_load81_offset_load_Ain_load89_offset_load_Ain_load97_offset_load_mul_coef2_load_broadcast_add_add_add_add_add_Ain_load105_offset_load_Ain_load113_offset_load_Ain_load121_offset_load_Ain_load129_offset_load_Ain_load137_offset_load_Ain_load145_offset_load.us, %mul_coef3_load_broadcast_add_add_add_add_add_Ain_load153_offset_load_Ain_load161_offset_load_Ain_load169_offset_load_Ain_load177_offset_load_Ain_load185_offset_load_Ain_load193_offset_load.us - %mask0.i.i234.us = shufflevector <8 x i32> %"oldMask&test.us", <8 x i32> undef, <8 x i32> - %mask1.i.i235.us = shufflevector <8 x i32> %"oldMask&test.us", <8 x i32> undef, <8 x i32> - %mask0d.i.i236.us = bitcast <8 x i32> %mask0.i.i234.us to <4 x double> - %mask1d.i.i237.us = bitcast <8 x i32> %mask1.i.i235.us to <4 x double> - %val0d.i.i238.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr705.us, <4 x double> %mask0d.i.i236.us) #0 - %ptr727.sum.us = add i64 %17, 32 - 
%ptr1.i.i239.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %ptr727.sum.us - %val1d.i.i240.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i239.us, <4 x double> %mask1d.i.i237.us) #0 - %vald.i.i241.us = shufflevector <4 x double> %val0d.i.i238.us, <4 x double> %val1d.i.i240.us, <8 x i32> - %mul__Ain_load211_offset_load.us = fmul <8 x double> %vald.i.i241.us, - %ptr736.us = getelementptr i8* %Aout_load219_ptr2int_2void, i64 %17, !filename !0, !first_line !20, !first_column !21, !last_line !20, !last_column !22 - %val0d.i.i228.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr736.us, <4 x double> %mask0d.i.i236.us) #0 - %ptr1.i.i229.us = getelementptr i8* %Aout_load219_ptr2int_2void, i64 %ptr727.sum.us - %val1d.i.i230.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i229.us, <4 x double> %mask1d.i.i237.us) #0 - %vald.i.i231.us = shufflevector <4 x double> %val0d.i.i228.us, <4 x double> %val1d.i.i230.us, <8 x i32> - %sub_mul__Ain_load211_offset_load_Aout_load219_offset_load.us = fsub <8 x double> %mul__Ain_load211_offset_load.us, %vald.i.i231.us - %ptr745.us = getelementptr i8* %vsq_load_ptr2int_2void, i64 %17, !filename !0, !first_line !23, !first_column !24, !last_line !23, !last_column !7 - %val0d.i.i218.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr745.us, <4 x double> %mask0d.i.i236.us) #0 - %ptr1.i.i219.us = getelementptr i8* %vsq_load_ptr2int_2void, i64 %ptr727.sum.us - %val1d.i.i220.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i219.us, <4 x double> %mask1d.i.i237.us) #0 - %vald.i.i221.us = shufflevector <4 x double> %val0d.i.i218.us, <4 x double> %val1d.i.i220.us, <8 x i32> - %mul_vsq_load_offset_load_div_load.us = fmul <8 x double> %add_add_add_mul_coef0_load_broadcast_Ain_load_offset_load_mul_coef1_load_broadcast_add_add_add_add_add_Ain_load57_offset_load_Ain_load65_offset_load_Ain_load73_offset_load_Ain_load81_offset_load_Ain_load89_offset_load_Ain_load97_offset_load_mul_coef2_load_broadcast_add_add_add_add_add_Ain_load105_offset_load_Ain_load113_offset_load_Ain_load121_offset_load_Ain_load129_offset_load_Ain_load137_offset_load_Ain_load145_offset_load_mul_coef3_load_broadcast_add_add_add_add_add_Ain_load153_offset_load_Ain_load161_offset_load_Ain_load169_offset_load_Ain_load177_offset_load_Ain_load185_offset_load_Ain_load193_offset_load.us, %vald.i.i221.us - %add_sub_mul__Ain_load211_offset_load_Aout_load219_offset_load_mul_vsq_load_offset_load_div_load.us = fadd <8 x double> %sub_mul__Ain_load211_offset_load_Aout_load219_offset_load.us, %mul_vsq_load_offset_load_div_load.us - %val0.i.i.us = shufflevector <8 x double> %add_sub_mul__Ain_load211_offset_load_Aout_load219_offset_load_mul_vsq_load_offset_load_div_load.us, <8 x double> undef, <4 x i32> - %val1.i.i.us = shufflevector <8 x double> %add_sub_mul__Ain_load211_offset_load_Aout_load219_offset_load_mul_vsq_load_offset_load_div_load.us, <8 x double> undef, <4 x i32> - call void @llvm.x86.avx.maskstore.pd.256(i8* %ptr736.us, <4 x double> %mask0d.i.i236.us, <4 x double> %val0.i.i.us) #0 - call void @llvm.x86.avx.maskstore.pd.256(i8* %ptr1.i.i229.us, <4 x double> %mask1d.i.i237.us, <4 x double> %val1.i.i.us) #0 - br label %safe_if_after_true.us - -safe_if_after_true.us: ; preds = %safe_if_run_true.us, %for_loop39.us - %add_xb_load240_.us = add i32 %xb.0257.us, 8 - %less_xb_load_x1_load.us = icmp slt i32 %add_xb_load240_.us, %x1 - br i1 %less_xb_load_x1_load.us, label %for_loop39.us, label %for_exit40.us - 
-for_loop39.lr.ph.us: ; preds = %for_exit40.us, %for_test37.preheader.lr.ph - %y.0259.us = phi i32 [ %y_load241_plus1.us, %for_exit40.us ], [ %y0, %for_test37.preheader.lr.ph ] - %mul_y_load46_Nx_load47.us = mul i32 %y.0259.us, %Nx - %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.us = add i32 %mul_y_load46_Nx_load47.us, %mul_z_load45_Nxy_load - %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast556.elt0.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.us, %Nx - %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast588.elt0.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.us, %mul__Nx_load119 - %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast599.elt0.us = sub i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.us, %Nx - %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast610.elt0.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.us, %mul__Nx_load167 - %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast621.elt0.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.us, %mul__Nx_load127 - %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast632.elt0.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.us, %mul_Nx_load_Ny_load - %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast643.elt0.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.us, %mul__Nx_load175 - %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast654.elt0.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.us, %mul__Nxy_load136 - %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast665.elt0.us = sub i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.us, %mul_Nx_load_Ny_load - %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast676.elt0.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.us, %mul__Nxy_load184 - %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast687.elt0.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.us, %mul__Nxy_load144 - %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast709.elt0.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.us, %mul__Nxy_load192 - br label %for_loop39.us - -for_exit: ; preds = %for_exit278, %for_exit33, %for_test.preheader, %for_test264.preheader - ret void - -for_exit33: ; preds = %for_exit40.us, %for_test37.preheader.lr.ph, %for_test30.preheader - %z_load242_plus1 = add i32 %z.0261, 1 - %exitcond269 = icmp eq i32 %z_load242_plus1, %z1 - br i1 %exitcond269, label %for_exit, label %for_test30.preheader - -for_test275.preheader: ; preds = %for_exit278, %for_test275.preheader.lr.ph - %z269.0268 = phi i32 [ %z0, %for_test275.preheader.lr.ph ], [ %z_load518_plus1, %for_exit278 ] - br i1 %less_y_load282_y1_load283264, label %for_test286.preheader.lr.ph, label %for_exit278 - -for_test286.preheader.lr.ph: ; preds = %for_test275.preheader - %mul_z_load300_Nxy_load301 = mul i32 %z269.0268, %mul_Nx_load_Ny_load - br i1 %less_xb_load293_x1_load294262, label %for_loop288.lr.ph.us, label %for_exit278 - -for_exit289.us: ; preds = %safe_if_after_true466.us - %y_load517_plus1.us = add i32 %y280.0265.us, 1 - %exitcond271 = icmp eq i32 %y_load517_plus1.us, %y1 - br i1 %exitcond271, label %for_exit278, label 
%for_loop288.lr.ph.us - -for_loop288.us: ; preds = %for_loop288.lr.ph.us, %safe_if_after_true466.us - %xb291.0263.us = phi i32 [ %x0, %for_loop288.lr.ph.us ], [ %add_xb291_load_.us, %safe_if_after_true466.us ] - %xb_load298_broadcast_init.us = insertelement <8 x i32> undef, i32 %xb291.0263.us, i32 0 - %xb_load298_broadcast.us = shufflevector <8 x i32> %xb_load298_broadcast_init.us, <8 x i32> undef, <8 x i32> zeroinitializer - %add_xb_load298_broadcast_.us = add <8 x i32> %xb_load298_broadcast.us, - %less_x_load462_x1_load463_broadcast.us = icmp slt <8 x i32> %add_xb_load298_broadcast_.us, %x1_load463_broadcast - %"oldMask&test468.us" = select <8 x i1> %less_x_load462_x1_load463_broadcast.us, <8 x i32> , <8 x i32> zeroinitializer - %"internal_mask&function_mask472.us" = and <8 x i32> %"oldMask&test468.us", %__mask - %floatmask.i211.us = bitcast <8 x i32> %"internal_mask&function_mask472.us" to <8 x float> - %v.i212.us = tail call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask.i211.us) #1 - %cmp.i213.us = icmp eq i32 %v.i212.us, 0 - br i1 %cmp.i213.us, label %safe_if_after_true466.us, label %safe_if_run_true467.us - -safe_if_run_true467.us: ; preds = %for_loop288.us - %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast.elt0.us = add i32 %xb291.0263.us, %add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303.us - %scaled_varying757.elt0.us = shl i32 %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast.elt0.us, 3 - %"varying+const_offsets.elt0758.us" = add i32 %scaled_varying757.elt0.us, -8 - %19 = sext i32 %"varying+const_offsets.elt0758.us" to i64 - %ptr759.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %19, !filename !0, !first_line !1, !first_column !2, !last_line !1, !last_column !3 - %val0d.i.i205.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr759.us, <4 x double> %mask0d.i.i203) #0 - %ptr759.sum.us = add i64 %19, 32 - %ptr1.i.i206.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr759.sum.us - %val1d.i.i207.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i206.us, <4 x double> %mask1d.i.i204) #0 - %vald.i.i208.us = shufflevector <4 x double> %val0d.i.i205.us, <4 x double> %val1d.i.i207.us, <8 x i32> - %"varying+const_offsets767.elt0.us" = add i32 %scaled_varying757.elt0.us, 8 - %20 = sext i32 %"varying+const_offsets767.elt0.us" to i64 - %ptr768.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %20, !filename !0, !first_line !1, !first_column !4, !last_line !1, !last_column !5 - %val0d.i.i195.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr768.us, <4 x double> %mask0d.i.i203) #0 - %ptr768.sum.us = add i64 %20, 32 - %ptr1.i.i196.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr768.sum.us - %val1d.i.i197.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i196.us, <4 x double> %mask1d.i.i204) #0 - %vald.i.i198.us = shufflevector <4 x double> %val0d.i.i195.us, <4 x double> %val1d.i.i197.us, <8 x i32> - %"varying+const_offsets776.elt0.us" = add i32 %scaled_varying757.elt0.us, -16 - %21 = sext i32 %"varying+const_offsets776.elt0.us" to i64 - %ptr777.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %21, !filename !0, !first_line !6, !first_column !2, !last_line !6, !last_column !3 - %val0d.i.i185.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr777.us, <4 x double> %mask0d.i.i203) #0 - %ptr777.sum.us = add i64 %21, 32 - %ptr1.i.i186.us = getelementptr i8* %Ain_load327_ptr2int_2void, 
i64 %ptr777.sum.us - %val1d.i.i187.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i186.us, <4 x double> %mask1d.i.i204) #0 - %vald.i.i188.us = shufflevector <4 x double> %val0d.i.i185.us, <4 x double> %val1d.i.i187.us, <8 x i32> - %"varying+const_offsets785.elt0.us" = add i32 %scaled_varying757.elt0.us, 16 - %22 = sext i32 %"varying+const_offsets785.elt0.us" to i64 - %ptr786.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %22, !filename !0, !first_line !6, !first_column !4, !last_line !6, !last_column !5 - %val0d.i.i175.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr786.us, <4 x double> %mask0d.i.i203) #0 - %ptr786.sum.us = add i64 %22, 32 - %ptr1.i.i176.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr786.sum.us - %val1d.i.i177.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i176.us, <4 x double> %mask1d.i.i204) #0 - %vald.i.i178.us = shufflevector <4 x double> %val0d.i.i175.us, <4 x double> %val1d.i.i177.us, <8 x i32> - %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast788_mul__Nx_load333_broadcast.elt0.us = add i32 %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast788.elt0.us, %xb291.0263.us - %scaled_varying793.elt0.us = shl i32 %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast788_mul__Nx_load333_broadcast.elt0.us, 3 - %23 = sext i32 %scaled_varying793.elt0.us to i64 - %ptr795.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %23, !filename !0, !first_line !2, !first_column !7, !last_line !2, !last_column !8 - %val0d.i.i165.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr795.us, <4 x double> %mask0d.i.i203) #0 - %ptr795.sum.us = add i64 %23, 32 - %ptr1.i.i166.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr795.sum.us - %val1d.i.i167.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i166.us, <4 x double> %mask1d.i.i204) #0 - %vald.i.i168.us = shufflevector <4 x double> %val0d.i.i165.us, <4 x double> %val1d.i.i167.us, <8 x i32> - %add_Ain_load319_offset_load_Ain_load327_offset_load.us = fadd <8 x double> %vald.i.i208.us, %vald.i.i198.us - %"varying+const_offsets803.elt0.us" = add i32 %scaled_varying757.elt0.us, -24 - %24 = sext i32 %"varying+const_offsets803.elt0.us" to i64 - %ptr804.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %24, !filename !0, !first_line !9, !first_column !2, !last_line !9, !last_column !3 - %val0d.i.i155.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr804.us, <4 x double> %mask0d.i.i203) #0 - %ptr804.sum.us = add i64 %24, 32 - %ptr1.i.i156.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr804.sum.us - %val1d.i.i157.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i156.us, <4 x double> %mask1d.i.i204) #0 - %vald.i.i158.us = shufflevector <4 x double> %val0d.i.i155.us, <4 x double> %val1d.i.i157.us, <8 x i32> - %"varying+const_offsets812.elt0.us" = add i32 %scaled_varying757.elt0.us, 24 - %25 = sext i32 %"varying+const_offsets812.elt0.us" to i64 - %ptr813.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %25, !filename !0, !first_line !9, !first_column !4, !last_line !9, !last_column !5 - %val0d.i.i145.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr813.us, <4 x double> %mask0d.i.i203) #0 - %ptr813.sum.us = add i64 %25, 32 - %ptr1.i.i146.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr813.sum.us - %val1d.i.i147.us = 
tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i146.us, <4 x double> %mask1d.i.i204) #0 - %vald.i.i148.us = shufflevector <4 x double> %val0d.i.i145.us, <4 x double> %val1d.i.i147.us, <8 x i32> - %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast815_mul__Nx_load382_broadcast.elt0.us = add i32 %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast815.elt0.us, %xb291.0263.us - %scaled_varying820.elt0.us = shl i32 %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast815_mul__Nx_load382_broadcast.elt0.us, 3 - %26 = sext i32 %scaled_varying820.elt0.us to i64 - %ptr822.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %26, !filename !0, !first_line !10, !first_column !11, !last_line !10, !last_column !1 - %val0d.i.i135.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr822.us, <4 x double> %mask0d.i.i203) #0 - %ptr822.sum.us = add i64 %26, 32 - %ptr1.i.i136.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr822.sum.us - %val1d.i.i137.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i136.us, <4 x double> %mask1d.i.i204) #0 - %vald.i.i138.us = shufflevector <4 x double> %val0d.i.i135.us, <4 x double> %val1d.i.i137.us, <8 x i32> - %add_Ain_load368_offset_load_Ain_load376_offset_load.us = fadd <8 x double> %vald.i.i188.us, %vald.i.i178.us - %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast824_mul__Nx_load341_broadcast.elt0.us = add i32 %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast824.elt0.us, %xb291.0263.us - %scaled_varying829.elt0.us = shl i32 %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast824_mul__Nx_load341_broadcast.elt0.us, 3 - %27 = sext i32 %scaled_varying829.elt0.us to i64 - %ptr831.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %27, !filename !0, !first_line !2, !first_column !12, !last_line !2, !last_column !13 - %val0d.i.i125.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr831.us, <4 x double> %mask0d.i.i203) #0 - %ptr831.sum.us = add i64 %27, 32 - %ptr1.i.i126.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr831.sum.us - %val1d.i.i127.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i126.us, <4 x double> %mask1d.i.i204) #0 - %vald.i.i128.us = shufflevector <4 x double> %val0d.i.i125.us, <4 x double> %val1d.i.i127.us, <8 x i32> - %add_add_Ain_load319_offset_load_Ain_load327_offset_load_Ain_load335_offset_load.us = fadd <8 x double> %add_Ain_load319_offset_load_Ain_load327_offset_load.us, %vald.i.i168.us - %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast833_mul__Nx_load431_broadcast.elt0.us = add i32 %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast833.elt0.us, %xb291.0263.us - %scaled_varying838.elt0.us = shl i32 %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast833_mul__Nx_load431_broadcast.elt0.us, 3 - %28 = sext i32 %scaled_varying838.elt0.us to i64 - %ptr840.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %28, !filename !0, !first_line !14, !first_column !11, !last_line !14, !last_column !1 - %val0d.i.i115.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr840.us, <4 x double> %mask0d.i.i203) #0 - %ptr840.sum.us = add i64 %28, 32 - %ptr1.i.i116.us = 
getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr840.sum.us - %val1d.i.i117.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i116.us, <4 x double> %mask1d.i.i204) #0 - %vald.i.i118.us = shufflevector <4 x double> %val0d.i.i115.us, <4 x double> %val1d.i.i117.us, <8 x i32> - %add_Ain_load417_offset_load_Ain_load425_offset_load.us = fadd <8 x double> %vald.i.i158.us, %vald.i.i148.us - %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast842_mul__Nx_load390_broadcast.elt0.us = add i32 %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast842.elt0.us, %xb291.0263.us - %scaled_varying847.elt0.us = shl i32 %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast842_mul__Nx_load390_broadcast.elt0.us, 3 - %29 = sext i32 %scaled_varying847.elt0.us to i64 - %ptr849.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %29, !filename !0, !first_line !10, !first_column !6, !last_line !10, !last_column !15 - %val0d.i.i105.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr849.us, <4 x double> %mask0d.i.i203) #0 - %ptr849.sum.us = add i64 %29, 32 - %ptr1.i.i106.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr849.sum.us - %val1d.i.i107.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i106.us, <4 x double> %mask1d.i.i204) #0 - %vald.i.i108.us = shufflevector <4 x double> %val0d.i.i105.us, <4 x double> %val1d.i.i107.us, <8 x i32> - %add_add_Ain_load368_offset_load_Ain_load376_offset_load_Ain_load384_offset_load.us = fadd <8 x double> %add_Ain_load368_offset_load_Ain_load376_offset_load.us, %vald.i.i138.us - %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast851_mul__Nxy_load350_broadcast.elt0.us = add i32 %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast851.elt0.us, %xb291.0263.us - %scaled_varying856.elt0.us = shl i32 %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast851_mul__Nxy_load350_broadcast.elt0.us, 3 - %30 = sext i32 %scaled_varying856.elt0.us to i64 - %ptr858.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %30, !filename !0, !first_line !12, !first_column !11, !last_line !12, !last_column !1 - %val0d.i.i95.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr858.us, <4 x double> %mask0d.i.i203) #0 - %ptr858.sum.us = add i64 %30, 32 - %ptr1.i.i96.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr858.sum.us - %val1d.i.i97.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i96.us, <4 x double> %mask1d.i.i204) #0 - %vald.i.i98.us = shufflevector <4 x double> %val0d.i.i95.us, <4 x double> %val1d.i.i97.us, <8 x i32> - %add_add_add_Ain_load319_offset_load_Ain_load327_offset_load_Ain_load335_offset_load_Ain_load343_offset_load.us = fadd <8 x double> %add_add_Ain_load319_offset_load_Ain_load327_offset_load_Ain_load335_offset_load.us, %vald.i.i128.us - %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast860_mul__Nx_load439_broadcast.elt0.us = add i32 %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast860.elt0.us, %xb291.0263.us - %scaled_varying865.elt0.us = shl i32 %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast860_mul__Nx_load439_broadcast.elt0.us, 3 - %31 = sext i32 %scaled_varying865.elt0.us to i64 - %ptr867.us = 
getelementptr i8* %Ain_load327_ptr2int_2void, i64 %31, !filename !0, !first_line !14, !first_column !6, !last_line !14, !last_column !15 - %val0d.i.i85.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr867.us, <4 x double> %mask0d.i.i203) #0 - %ptr867.sum.us = add i64 %31, 32 - %ptr1.i.i86.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr867.sum.us - %val1d.i.i87.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i86.us, <4 x double> %mask1d.i.i204) #0 - %vald.i.i88.us = shufflevector <4 x double> %val0d.i.i85.us, <4 x double> %val1d.i.i87.us, <8 x i32> - %add_add_Ain_load417_offset_load_Ain_load425_offset_load_Ain_load433_offset_load.us = fadd <8 x double> %add_Ain_load417_offset_load_Ain_load425_offset_load.us, %vald.i.i118.us - %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast869_mul__Nxy_load399_broadcast.elt0.us = add i32 %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast869.elt0.us, %xb291.0263.us - %scaled_varying874.elt0.us = shl i32 %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast869_mul__Nxy_load399_broadcast.elt0.us, 3 - %32 = sext i32 %scaled_varying874.elt0.us to i64 - %ptr876.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %32, !filename !0, !first_line !16, !first_column !11, !last_line !16, !last_column !1 - %val0d.i.i75.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr876.us, <4 x double> %mask0d.i.i203) #0 - %ptr876.sum.us = add i64 %32, 32 - %ptr1.i.i76.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr876.sum.us - %val1d.i.i77.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i76.us, <4 x double> %mask1d.i.i204) #0 - %vald.i.i78.us = shufflevector <4 x double> %val0d.i.i75.us, <4 x double> %val1d.i.i77.us, <8 x i32> - %add_add_add_Ain_load368_offset_load_Ain_load376_offset_load_Ain_load384_offset_load_Ain_load392_offset_load.us = fadd <8 x double> %add_add_Ain_load368_offset_load_Ain_load376_offset_load_Ain_load384_offset_load.us, %vald.i.i108.us - %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast878_mul__Nxy_load358_broadcast.elt0.us = add i32 %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast878.elt0.us, %xb291.0263.us - %scaled_varying883.elt0.us = shl i32 %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast878_mul__Nxy_load358_broadcast.elt0.us, 3 - %33 = sext i32 %scaled_varying883.elt0.us to i64 - %ptr885.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %33, !filename !0, !first_line !12, !first_column !6, !last_line !12, !last_column !15 - %val0d.i.i65.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr885.us, <4 x double> %mask0d.i.i203) #0 - %ptr885.sum.us = add i64 %33, 32 - %ptr1.i.i66.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr885.sum.us - %val1d.i.i67.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i66.us, <4 x double> %mask1d.i.i204) #0 - %vald.i.i68.us = shufflevector <4 x double> %val0d.i.i65.us, <4 x double> %val1d.i.i67.us, <8 x i32> - %add_add_add_add_Ain_load319_offset_load_Ain_load327_offset_load_Ain_load335_offset_load_Ain_load343_offset_load_Ain_load351_offset_load.us = fadd <8 x double> %add_add_add_Ain_load319_offset_load_Ain_load327_offset_load_Ain_load335_offset_load_Ain_load343_offset_load.us, %vald.i.i98.us - 
%add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast887_mul__Nxy_load448_broadcast.elt0.us = add i32 %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast887.elt0.us, %xb291.0263.us - %scaled_varying892.elt0.us = shl i32 %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast887_mul__Nxy_load448_broadcast.elt0.us, 3 - %34 = sext i32 %scaled_varying892.elt0.us to i64 - %ptr894.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %34, !filename !0, !first_line !17, !first_column !11, !last_line !17, !last_column !1 - %val0d.i.i55.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr894.us, <4 x double> %mask0d.i.i203) #0 - %ptr894.sum.us = add i64 %34, 32 - %ptr1.i.i56.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr894.sum.us - %val1d.i.i57.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i56.us, <4 x double> %mask1d.i.i204) #0 - %vald.i.i58.us = shufflevector <4 x double> %val0d.i.i55.us, <4 x double> %val1d.i.i57.us, <8 x i32> - %add_add_add_Ain_load417_offset_load_Ain_load425_offset_load_Ain_load433_offset_load_Ain_load441_offset_load.us = fadd <8 x double> %add_add_Ain_load417_offset_load_Ain_load425_offset_load_Ain_load433_offset_load.us, %vald.i.i88.us - %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast896_mul__Nxy_load407_broadcast.elt0.us = add i32 %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast896.elt0.us, %xb291.0263.us - %scaled_varying901.elt0.us = shl i32 %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast896_mul__Nxy_load407_broadcast.elt0.us, 3 - %35 = sext i32 %scaled_varying901.elt0.us to i64 - %ptr903.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %35, !filename !0, !first_line !16, !first_column !6, !last_line !16, !last_column !15 - %val0d.i.i45.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr903.us, <4 x double> %mask0d.i.i203) #0 - %ptr903.sum.us = add i64 %35, 32 - %ptr1.i.i46.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr903.sum.us - %val1d.i.i47.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i46.us, <4 x double> %mask1d.i.i204) #0 - %vald.i.i48.us = shufflevector <4 x double> %val0d.i.i45.us, <4 x double> %val1d.i.i47.us, <8 x i32> - %add_add_add_add_Ain_load368_offset_load_Ain_load376_offset_load_Ain_load384_offset_load_Ain_load392_offset_load_Ain_load400_offset_load.us = fadd <8 x double> %add_add_add_Ain_load368_offset_load_Ain_load376_offset_load_Ain_load384_offset_load_Ain_load392_offset_load.us, %vald.i.i78.us - %add_add_add_add_add_Ain_load319_offset_load_Ain_load327_offset_load_Ain_load335_offset_load_Ain_load343_offset_load_Ain_load351_offset_load_Ain_load359_offset_load.us = fadd <8 x double> %add_add_add_add_Ain_load319_offset_load_Ain_load327_offset_load_Ain_load335_offset_load_Ain_load343_offset_load_Ain_load351_offset_load.us, %vald.i.i68.us - %36 = sext i32 %scaled_varying757.elt0.us to i64 - %ptr912.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %36, !filename !0, !first_line !8, !first_column !18, !last_line !8, !last_column !19 - %val0d.i.i35.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr912.us, <4 x double> %mask0d.i.i203) #0 - %ptr912.sum.us = add i64 %36, 32 - %ptr1.i.i36.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr912.sum.us - %val1d.i.i37.us = tail 
call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i36.us, <4 x double> %mask1d.i.i204) #0 - %vald.i.i38.us = shufflevector <4 x double> %val0d.i.i35.us, <4 x double> %val1d.i.i37.us, <8 x i32> - %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast914_mul__Nxy_load456_broadcast.elt0.us = add i32 %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast914.elt0.us, %xb291.0263.us - %scaled_varying919.elt0.us = shl i32 %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast914_mul__Nxy_load456_broadcast.elt0.us, 3 - %37 = sext i32 %scaled_varying919.elt0.us to i64 - %ptr921.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %37, !filename !0, !first_line !17, !first_column !6, !last_line !17, !last_column !15 - %val0d.i.i25.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr921.us, <4 x double> %mask0d.i.i203) #0 - %ptr921.sum.us = add i64 %37, 32 - %ptr1.i.i26.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr921.sum.us - %val1d.i.i27.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i26.us, <4 x double> %mask1d.i.i204) #0 - %vald.i.i28.us = shufflevector <4 x double> %val0d.i.i25.us, <4 x double> %val1d.i.i27.us, <8 x i32> - %add_add_add_add_Ain_load417_offset_load_Ain_load425_offset_load_Ain_load433_offset_load_Ain_load441_offset_load_Ain_load449_offset_load.us = fadd <8 x double> %add_add_add_Ain_load417_offset_load_Ain_load425_offset_load_Ain_load433_offset_load_Ain_load441_offset_load.us, %vald.i.i58.us - %add_add_add_add_add_Ain_load368_offset_load_Ain_load376_offset_load_Ain_load384_offset_load_Ain_load392_offset_load_Ain_load400_offset_load_Ain_load408_offset_load.us = fadd <8 x double> %add_add_add_add_Ain_load368_offset_load_Ain_load376_offset_load_Ain_load384_offset_load_Ain_load392_offset_load_Ain_load400_offset_load.us, %vald.i.i48.us - %mul_coef1_load315_broadcast_add_add_add_add_add_Ain_load319_offset_load_Ain_load327_offset_load_Ain_load335_offset_load_Ain_load343_offset_load_Ain_load351_offset_load_Ain_load359_offset_load.us = fmul <8 x double> %coef1_load315_broadcast, %add_add_add_add_add_Ain_load319_offset_load_Ain_load327_offset_load_Ain_load335_offset_load_Ain_load343_offset_load_Ain_load351_offset_load_Ain_load359_offset_load.us - %mul_coef0_load306_broadcast_Ain_load310_offset_load.us = fmul <8 x double> %coef0_load306_broadcast, %vald.i.i38.us - %add_add_add_add_add_Ain_load417_offset_load_Ain_load425_offset_load_Ain_load433_offset_load_Ain_load441_offset_load_Ain_load449_offset_load_Ain_load457_offset_load.us = fadd <8 x double> %add_add_add_add_Ain_load417_offset_load_Ain_load425_offset_load_Ain_load433_offset_load_Ain_load441_offset_load_Ain_load449_offset_load.us, %vald.i.i28.us - %mul_coef2_load364_broadcast_add_add_add_add_add_Ain_load368_offset_load_Ain_load376_offset_load_Ain_load384_offset_load_Ain_load392_offset_load_Ain_load400_offset_load_Ain_load408_offset_load.us = fmul <8 x double> %coef2_load364_broadcast, %add_add_add_add_add_Ain_load368_offset_load_Ain_load376_offset_load_Ain_load384_offset_load_Ain_load392_offset_load_Ain_load400_offset_load_Ain_load408_offset_load.us - %add_mul_coef0_load306_broadcast_Ain_load310_offset_load_mul_coef1_load315_broadcast_add_add_add_add_add_Ain_load319_offset_load_Ain_load327_offset_load_Ain_load335_offset_load_Ain_load343_offset_load_Ain_load351_offset_load_Ain_load359_offset_load.us = fadd <8 x double> 
%mul_coef1_load315_broadcast_add_add_add_add_add_Ain_load319_offset_load_Ain_load327_offset_load_Ain_load335_offset_load_Ain_load343_offset_load_Ain_load351_offset_load_Ain_load359_offset_load.us, %mul_coef0_load306_broadcast_Ain_load310_offset_load.us - %mul_coef3_load413_broadcast_add_add_add_add_add_Ain_load417_offset_load_Ain_load425_offset_load_Ain_load433_offset_load_Ain_load441_offset_load_Ain_load449_offset_load_Ain_load457_offset_load.us = fmul <8 x double> %coef3_load413_broadcast, %add_add_add_add_add_Ain_load417_offset_load_Ain_load425_offset_load_Ain_load433_offset_load_Ain_load441_offset_load_Ain_load449_offset_load_Ain_load457_offset_load.us - %add_add_mul_coef0_load306_broadcast_Ain_load310_offset_load_mul_coef1_load315_broadcast_add_add_add_add_add_Ain_load319_offset_load_Ain_load327_offset_load_Ain_load335_offset_load_Ain_load343_offset_load_Ain_load351_offset_load_Ain_load359_offset_load_mul_coef2_load364_broadcast_add_add_add_add_add_Ain_load368_offset_load_Ain_load376_offset_load_Ain_load384_offset_load_Ain_load392_offset_load_Ain_load400_offset_load_Ain_load408_offset_load.us = fadd <8 x double> %mul_coef2_load364_broadcast_add_add_add_add_add_Ain_load368_offset_load_Ain_load376_offset_load_Ain_load384_offset_load_Ain_load392_offset_load_Ain_load400_offset_load_Ain_load408_offset_load.us, %add_mul_coef0_load306_broadcast_Ain_load310_offset_load_mul_coef1_load315_broadcast_add_add_add_add_add_Ain_load319_offset_load_Ain_load327_offset_load_Ain_load335_offset_load_Ain_load343_offset_load_Ain_load351_offset_load_Ain_load359_offset_load.us - %add_add_add_mul_coef0_load306_broadcast_Ain_load310_offset_load_mul_coef1_load315_broadcast_add_add_add_add_add_Ain_load319_offset_load_Ain_load327_offset_load_Ain_load335_offset_load_Ain_load343_offset_load_Ain_load351_offset_load_Ain_load359_offset_load_mul_coef2_load364_broadcast_add_add_add_add_add_Ain_load368_offset_load_Ain_load376_offset_load_Ain_load384_offset_load_Ain_load392_offset_load_Ain_load400_offset_load_Ain_load408_offset_load_mul_coef3_load413_broadcast_add_add_add_add_add_Ain_load417_offset_load_Ain_load425_offset_load_Ain_load433_offset_load_Ain_load441_offset_load_Ain_load449_offset_load_Ain_load457_offset_load.us = fadd <8 x double> %add_add_mul_coef0_load306_broadcast_Ain_load310_offset_load_mul_coef1_load315_broadcast_add_add_add_add_add_Ain_load319_offset_load_Ain_load327_offset_load_Ain_load335_offset_load_Ain_load343_offset_load_Ain_load351_offset_load_Ain_load359_offset_load_mul_coef2_load364_broadcast_add_add_add_add_add_Ain_load368_offset_load_Ain_load376_offset_load_Ain_load384_offset_load_Ain_load392_offset_load_Ain_load400_offset_load_Ain_load408_offset_load.us, %mul_coef3_load413_broadcast_add_add_add_add_add_Ain_load417_offset_load_Ain_load425_offset_load_Ain_load433_offset_load_Ain_load441_offset_load_Ain_load449_offset_load_Ain_load457_offset_load.us - %mask0.i.i11.us = shufflevector <8 x i32> %"internal_mask&function_mask472.us", <8 x i32> undef, <8 x i32> - %mask1.i.i12.us = shufflevector <8 x i32> %"internal_mask&function_mask472.us", <8 x i32> undef, <8 x i32> - %mask0d.i.i13.us = bitcast <8 x i32> %mask0.i.i11.us to <4 x double> - %mask1d.i.i14.us = bitcast <8 x i32> %mask1.i.i12.us to <4 x double> - %val0d.i.i15.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr912.us, <4 x double> %mask0d.i.i13.us) #0 - %val1d.i.i17.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i36.us, <4 x double> %mask1d.i.i14.us) #0 - %vald.i.i18.us = shufflevector <4 x double> 
%val0d.i.i15.us, <4 x double> %val1d.i.i17.us, <8 x i32> - %mul__Ain_load480_offset_load.us = fmul <8 x double> %vald.i.i18.us, - %ptr939.us = getelementptr i8* %Aout_load488_ptr2int_2void, i64 %36, !filename !0, !first_line !20, !first_column !21, !last_line !20, !last_column !22 - %val0d.i.i5.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr939.us, <4 x double> %mask0d.i.i13.us) #0 - %ptr1.i.i6.us = getelementptr i8* %Aout_load488_ptr2int_2void, i64 %ptr912.sum.us - %val1d.i.i7.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i6.us, <4 x double> %mask1d.i.i14.us) #0 - %vald.i.i8.us = shufflevector <4 x double> %val0d.i.i5.us, <4 x double> %val1d.i.i7.us, <8 x i32> - %sub_mul__Ain_load480_offset_load_Aout_load488_offset_load.us = fsub <8 x double> %mul__Ain_load480_offset_load.us, %vald.i.i8.us - %ptr948.us = getelementptr i8* %vsq_load494_ptr2int_2void, i64 %36, !filename !0, !first_line !23, !first_column !24, !last_line !23, !last_column !7 - %val0d.i.i.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr948.us, <4 x double> %mask0d.i.i13.us) #0 - %ptr1.i.i.us = getelementptr i8* %vsq_load494_ptr2int_2void, i64 %ptr912.sum.us - %val1d.i.i.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i.us, <4 x double> %mask1d.i.i14.us) #0 - %vald.i.i.us = shufflevector <4 x double> %val0d.i.i.us, <4 x double> %val1d.i.i.us, <8 x i32> - %mul_vsq_load494_offset_load_div_load499.us = fmul <8 x double> %add_add_add_mul_coef0_load306_broadcast_Ain_load310_offset_load_mul_coef1_load315_broadcast_add_add_add_add_add_Ain_load319_offset_load_Ain_load327_offset_load_Ain_load335_offset_load_Ain_load343_offset_load_Ain_load351_offset_load_Ain_load359_offset_load_mul_coef2_load364_broadcast_add_add_add_add_add_Ain_load368_offset_load_Ain_load376_offset_load_Ain_load384_offset_load_Ain_load392_offset_load_Ain_load400_offset_load_Ain_load408_offset_load_mul_coef3_load413_broadcast_add_add_add_add_add_Ain_load417_offset_load_Ain_load425_offset_load_Ain_load433_offset_load_Ain_load441_offset_load_Ain_load449_offset_load_Ain_load457_offset_load.us, %vald.i.i.us - %add_sub_mul__Ain_load480_offset_load_Aout_load488_offset_load_mul_vsq_load494_offset_load_div_load499.us = fadd <8 x double> %sub_mul__Ain_load480_offset_load_Aout_load488_offset_load.us, %mul_vsq_load494_offset_load_div_load499.us - %val0.i.i253.us = shufflevector <8 x double> %add_sub_mul__Ain_load480_offset_load_Aout_load488_offset_load_mul_vsq_load494_offset_load_div_load499.us, <8 x double> undef, <4 x i32> - %val1.i.i254.us = shufflevector <8 x double> %add_sub_mul__Ain_load480_offset_load_Aout_load488_offset_load_mul_vsq_load494_offset_load_div_load499.us, <8 x double> undef, <4 x i32> - call void @llvm.x86.avx.maskstore.pd.256(i8* %ptr939.us, <4 x double> %mask0d.i.i13.us, <4 x double> %val0.i.i253.us) #0 - call void @llvm.x86.avx.maskstore.pd.256(i8* %ptr1.i.i6.us, <4 x double> %mask1d.i.i14.us, <4 x double> %val1.i.i254.us) #0 - br label %safe_if_after_true466.us - -safe_if_after_true466.us: ; preds = %safe_if_run_true467.us, %for_loop288.us - %add_xb291_load_.us = add i32 %xb291.0263.us, 8 - %less_xb_load293_x1_load294.us = icmp slt i32 %add_xb291_load_.us, %x1 - br i1 %less_xb_load293_x1_load294.us, label %for_loop288.us, label %for_exit289.us - -for_loop288.lr.ph.us: ; preds = %for_exit289.us, %for_test286.preheader.lr.ph - %y280.0265.us = phi i32 [ %y_load517_plus1.us, %for_exit289.us ], [ %y0, %for_test286.preheader.lr.ph ] - %mul_y_load302_Nx_load303.us = mul i32 
%y280.0265.us, %Nx - %add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303.us = add i32 %mul_y_load302_Nx_load303.us, %mul_z_load300_Nxy_load301 - %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast788.elt0.us = add i32 %add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303.us, %Nx - %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast815.elt0.us = add i32 %add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303.us, %mul__Nx_load382 - %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast824.elt0.us = sub i32 %add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303.us, %Nx - %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast833.elt0.us = add i32 %add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303.us, %mul__Nx_load431 - %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast842.elt0.us = add i32 %add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303.us, %mul__Nx_load390 - %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast851.elt0.us = add i32 %add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303.us, %mul_Nx_load_Ny_load - %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast860.elt0.us = add i32 %add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303.us, %mul__Nx_load439 - %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast869.elt0.us = add i32 %add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303.us, %mul__Nxy_load399 - %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast878.elt0.us = sub i32 %add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303.us, %mul_Nx_load_Ny_load - %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast887.elt0.us = add i32 %add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303.us, %mul__Nxy_load448 - %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast896.elt0.us = add i32 %add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303.us, %mul__Nxy_load407 - %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast914.elt0.us = add i32 %add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303.us, %mul__Nxy_load456 - br label %for_loop288.us - -for_exit278: ; preds = %for_exit289.us, %for_test286.preheader.lr.ph, %for_test275.preheader - %z_load518_plus1 = add i32 %z269.0268, 1 - %exitcond272 = icmp eq i32 %z_load518_plus1, %z1 - br i1 %exitcond272, label %for_exit, label %for_test275.preheader -} - -; Function Attrs: nounwind -define internal void @stencil_step_task___uniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_({ i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*, <8 x i32> }* noalias nocapture, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) #3 { -allocas: - %x01 = getelementptr { i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*, <8 x i32> }* %0, i64 0, i32 0 - %x02 = load i32* %x01, align 4 - %x13 = getelementptr { i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*, <8 x i32> }* %0, i64 0, i32 1 - %x14 = load i32* %x13, align 4 - %y05 = getelementptr { i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*, <8 x i32> }* %0, i64 0, i32 2 - %y06 = load i32* %y05, align 4 - %y17 = 
getelementptr { i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*, <8 x i32> }* %0, i64 0, i32 3 - %y18 = load i32* %y17, align 4 - %z09 = getelementptr { i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*, <8 x i32> }* %0, i64 0, i32 4 - %z010 = load i32* %z09, align 4 - %Nx11 = getelementptr { i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*, <8 x i32> }* %0, i64 0, i32 5 - %Nx12 = load i32* %Nx11, align 4 - %Ny13 = getelementptr { i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*, <8 x i32> }* %0, i64 0, i32 6 - %Ny14 = load i32* %Ny13, align 4 - %coef17 = getelementptr { i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*, <8 x i32> }* %0, i64 0, i32 8 - %coef18 = load double** %coef17, align 8 - %vsq19 = getelementptr { i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*, <8 x i32> }* %0, i64 0, i32 9 - %vsq20 = load double** %vsq19, align 8 - %Ain21 = getelementptr { i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*, <8 x i32> }* %0, i64 0, i32 10 - %Ain22 = load double** %Ain21, align 8 - %Aout23 = getelementptr { i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*, <8 x i32> }* %0, i64 0, i32 11 - %Aout24 = load double** %Aout23, align 8 - %task_struct_mask = getelementptr { i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*, <8 x i32> }* %0, i64 0, i32 12 - %mask = load <8 x i32>* %task_struct_mask, align 32 - %floatmask.i = bitcast <8 x i32> %mask to <8 x float> - %v.i = tail call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask.i) #1 - %cmp.i = icmp eq i32 %v.i, 255 - %add_z0_load_taskIndex_load = add i32 %z010, %3 - %add_z0_load27_taskIndex_load28 = add i32 %3, 1 - %add_add_z0_load27_taskIndex_load28_ = add i32 %add_z0_load27_taskIndex_load28, %z010 - br i1 %cmp.i, label %all_on, label %some_on - -all_on: ; preds = %allocas - tail call fastcc void @stencil_step___uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_(i32 %x02, i32 %x14, i32 %y06, i32 %y18, i32 %add_z0_load_taskIndex_load, i32 %add_add_z0_load27_taskIndex_load28_, i32 %Nx12, i32 %Ny14, double* %coef18, double* %vsq20, double* %Ain22, double* %Aout24, <8 x i32> ) - ret void - -some_on: ; preds = %allocas - tail call fastcc void @stencil_step___uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_(i32 %x02, i32 %x14, i32 %y06, i32 %y18, i32 %add_z0_load_taskIndex_load, i32 %add_add_z0_load27_taskIndex_load28_, i32 %Nx12, i32 %Ny14, double* %coef18, double* %vsq20, double* %Ain22, double* %Aout24, <8 x i32> %mask) - ret void -} - -; Function Attrs: nounwind -define void @loop_stencil_ispc_tasks(i32 %t0, i32 %t1, i32 %x0, i32 %x1, i32 %y0, i32 %y1, i32 %z0, i32 %z1, i32 %Nx, i32 %Ny, i32 %Nz, double* %coef, double* %vsq, double* %Aeven, double* %Aodd) #3 { -allocas: - %launch_group_handle = alloca i8*, align 8 - store i8* null, i8** %launch_group_handle, align 8 - %less_t_load_t1_load166 = icmp slt i32 %t0, %t1 - br i1 %less_t_load_t1_load166, label %for_loop.lr.ph, label %post_sync73 - -for_loop.lr.ph: ; preds = %allocas - %sub_z1_load_z0_load23 = sub i32 %z1, %z0 - br label %for_loop - -for_loop: ; preds = %post_sync, %for_loop.lr.ph - %t.0167 = phi i32 [ %t0, %for_loop.lr.ph ], [ %t_load69_plus1, %post_sync ] - %bitop = and i32 %t.0167, 1 - %equal_bitop_ = icmp eq i32 %bitop, 0 - %args_ptr = call i8* @ISPCAlloc(i8** %launch_group_handle, 
i64 96, i32 32) - %funarg = bitcast i8* %args_ptr to i32* - store i32 %x0, i32* %funarg, align 4 - %funarg24 = getelementptr i8* %args_ptr, i64 4 - %0 = bitcast i8* %funarg24 to i32* - store i32 %x1, i32* %0, align 4 - %funarg25 = getelementptr i8* %args_ptr, i64 8 - %1 = bitcast i8* %funarg25 to i32* - store i32 %y0, i32* %1, align 4 - %funarg26 = getelementptr i8* %args_ptr, i64 12 - %2 = bitcast i8* %funarg26 to i32* - store i32 %y1, i32* %2, align 4 - %funarg27 = getelementptr i8* %args_ptr, i64 16 - %3 = bitcast i8* %funarg27 to i32* - store i32 %z0, i32* %3, align 4 - %funarg28 = getelementptr i8* %args_ptr, i64 20 - %4 = bitcast i8* %funarg28 to i32* - store i32 %Nx, i32* %4, align 4 - %funarg29 = getelementptr i8* %args_ptr, i64 24 - %5 = bitcast i8* %funarg29 to i32* - store i32 %Ny, i32* %5, align 4 - %funarg30 = getelementptr i8* %args_ptr, i64 28 - %6 = bitcast i8* %funarg30 to i32* - store i32 %Nz, i32* %6, align 4 - %funarg31 = getelementptr i8* %args_ptr, i64 32 - %7 = bitcast i8* %funarg31 to double** - store double* %coef, double** %7, align 8 - %funarg32 = getelementptr i8* %args_ptr, i64 40 - %8 = bitcast i8* %funarg32 to double** - store double* %vsq, double** %8, align 8 - %funarg33 = getelementptr i8* %args_ptr, i64 48 - %9 = bitcast i8* %funarg33 to double** - br i1 %equal_bitop_, label %if_then, label %if_else - -for_exit: ; preds = %post_sync - %launch_group_handle_load70.pre = load i8** %launch_group_handle, align 8 - %cmp71 = icmp eq i8* %launch_group_handle_load70.pre, null - br i1 %cmp71, label %post_sync73, label %call_sync72 - -if_then: ; preds = %for_loop - store double* %Aeven, double** %9, align 8 - %funarg34 = getelementptr i8* %args_ptr, i64 56 - %10 = bitcast i8* %funarg34 to double** - store double* %Aodd, double** %10, align 8 - %funarg_mask = getelementptr i8* %args_ptr, i64 64 - %11 = bitcast i8* %funarg_mask to <8 x i32>* - store <8 x i32> , <8 x i32>* %11, align 32 - call void @ISPCLaunch(i8** %launch_group_handle, i8* bitcast (void ({ i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*, <8 x i32> }*, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)* @stencil_step_task___uniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_ to i8*), i8* %args_ptr, i32 %sub_z1_load_z0_load23, i32 1, i32 1) - br label %if_exit - -if_else: ; preds = %for_loop - store double* %Aodd, double** %9, align 8 - %funarg64 = getelementptr i8* %args_ptr, i64 56 - %12 = bitcast i8* %funarg64 to double** - store double* %Aeven, double** %12, align 8 - %funarg_mask67 = getelementptr i8* %args_ptr, i64 64 - %13 = bitcast i8* %funarg_mask67 to <8 x i32>* - store <8 x i32> , <8 x i32>* %13, align 32 - call void @ISPCLaunch(i8** %launch_group_handle, i8* bitcast (void ({ i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*, <8 x i32> }*, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)* @stencil_step_task___uniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_ to i8*), i8* %args_ptr, i32 %sub_z1_load_z0_load23, i32 1, i32 1) - br label %if_exit - -if_exit: ; preds = %if_else, %if_then - %launch_group_handle_load = load i8** %launch_group_handle, align 8 - %cmp = icmp eq i8* %launch_group_handle_load, null - br i1 %cmp, label %post_sync, label %call_sync - -call_sync: ; preds = %if_exit - call void @ISPCSync(i8* %launch_group_handle_load) - store i8* null, i8** %launch_group_handle, align 8 - br label %post_sync - -post_sync: ; preds = %call_sync, %if_exit - %t_load69_plus1 = add i32 
%t.0167, 1 - %exitcond = icmp eq i32 %t_load69_plus1, %t1 - br i1 %exitcond, label %for_exit, label %for_loop - -call_sync72: ; preds = %for_exit - call void @ISPCSync(i8* %launch_group_handle_load70.pre) - store i8* null, i8** %launch_group_handle, align 8 - br label %post_sync73 - -post_sync73: ; preds = %call_sync72, %for_exit, %allocas - ret void -} - -attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } -attributes #2 = { nounwind readonly } -attributes #3 = { nounwind "target-cpu"="corei7-avx" "target-features"="+avx,+popcnt,+cmov" } - -!0 = metadata !{metadata !"stencil.ispc"} -!1 = metadata !{i32 68} -!2 = metadata !{i32 69} -!3 = metadata !{i32 113} -!4 = metadata !{i32 22} -!5 = metadata !{i32 66} -!6 = metadata !{i32 71} -!7 = metadata !{i32 23} -!8 = metadata !{i32 67} -!9 = metadata !{i32 74} -!10 = metadata !{i32 72} -!11 = metadata !{i32 24} -!12 = metadata !{i32 70} -!13 = metadata !{i32 114} -!14 = metadata !{i32 75} -!15 = metadata !{i32 115} -!16 = metadata !{i32 73} -!17 = metadata !{i32 76} -!18 = metadata !{i32 21} -!19 = metadata !{i32 64} -!20 = metadata !{i32 79} -!21 = metadata !{i32 112} -!22 = metadata !{i32 156} -!23 = metadata !{i32 80} -!24 = metadata !{i32 13} diff --git a/examples_cuda/stencil/stencil_cu.o b/examples_cuda/stencil/stencil_cu.o index 90b014c6a6ff571d572a9eb4be61fde79c2b2a58..dcd38c9fd49fc3e9836c18ebd5de5a730e5452cd 100644 GIT binary patch literal 21784 zcmc&+4|G)3nSYZ6h=`eBtEHmliJ?spV+K$O{>vmYDml^IZ`uKp~7}|@Zw6A z@bIvfbybvVJ-?W8ppEc3?`6h=*3y&T9g*iaoEhGQg3~kX(_D&@E$`{K)xMhkiq_Lt z-t*aT_lKu-5=xwD)#qihWw0;=Chs#JR{BbhZWqMr*}6ui>gV zW#9A?kd4#E`CQ|$)A*?W!)YGlmoEF0!?yEo$C(%5}g5A2)%9@uV2pX2p5 zkpIp5o3hK5rfjgZDg2+F{@)f4IqfrFa2#>;HQAnPvOVi9_EwxaIp02Q9rRL$>@)k| z@6{puUHgGJx#WOnpkzO9+3&LLhlYOE&<`WD)ed~~lG=k09EL6j;O}MZc5=WzO#x4y zb>|`_ww?nvx9wEzNqgpRz{Ad-gErTV>56mqOaQ7&mphj^mpkur>IYB-q=ul&sk{CD z7QY%#q}8@WcRX^XGF_Rjwj~ptYFn%$s@|l`QafVtsH#jIQ=v=WJ~ZU9MX^%k{838rH4RVpe{UD4ICa8zAdvwXg)@Jd6X zv#UEDRU1NGp_Q?YSb8nARV&Grimu-sO~#`gsp_uu8ikoMBQ;{?M5RXQh^<6(KwmOl z9a4O)i&bwd4&79DG6{xFoRONSbR-g8dMX`_hhtzN7_O&7sk>8(8jmKEH>)!uDhxIh zhSDl^qUn)9u%W>n2oU$^p5{ePug9gsXV6U>T7s>5%L2X0-PGb=LK~V}TA`Px)dT(R zbh~Ji%Wcu%^S3PUc-?;8eWSZLy?(dTwWL5xbBkW@X?FUT=nGo>dZRB$^^0!KhTDT4|7elu zps|YF4Gns*xuM1HcQ>@652(0cb0FyRwfI}zF5T;C@{Ag*a6_Oa=x-R+4SlMkx&S=Pr3Zh+{ zfq>gT3TbU=0Y^12(S2^Wf7Gf(oiGA$1bBoB!G^r6XhX1hVROqJ&F1!3!tK%U-3{H5 zDBCZo$|}{BjID@Sg*rZmX&Q6^q{dbI<;~}WET5ZHQzK9qsA=Nv;tfOA%uAiOX!P{_^Ri(Hck!Q{%Bz#p4d8p zRwu4^HjvNfwXjVq$%F0baxd~UxYf#dLPY`BgfZ4aWQVv3@i?X4gR>NgrXiX~BUMs= zu?vI{vlF^89BYe3BVsLt)YYMmScGZdlB?*rcyQ9iuo{A}!<$>#uqY4RV@3tt8PHio z8s(qS?Yhfx;u1uQ zrur5yi78{oF`ADPE}{TXiCi8C_+`ZJHltTyj6BZsKxl08LCAv0sAIGb`XPFm>``(t zR)}#4pSzpQiwf0GdFb0b2W1jFeSWy^K#5r#844STJ}~gGn_6xfi*bCeJl=UX0xIh%PIynqod-eX1wGzA{EEH zP&w{3(iNw?#(foM;ef4anIraxZh<4YH=8KcvW?}M@h6Y*iMQ&wR=fY6jwa(nm;I?) zPt_T3?U22v7Ki|OjpLrGSG>l1-l`#cW*PQ2hBVtzZ`E6x;V!jj)&is%uQ}~ced;yd z)w0VfrfAuY3e}s9R8)GQRO8L2E9QB#_f^!vFmShEe8TXfVU+cAjw}u$u{?Zc% z&JPc>`xLKn2KwbKJ+YZJ)S=J4#yfdQC;Rw)0gwULs$OMHC(=TW+c6AxAX=u+{?HqM zd5nP*{g96=WWH`h7Q~iP&3M~+hk2V5&>npMY>}c2WY+x&)*-llAhS7N+RjU+K}KdB z$j&mR{YgfH3_03XEqBv1s6;R~d|@UI{fo)m`URP6^onxqRu8-RQh6nMpP#*}dV2s66{8=&0>t9|b}XfP?YBNQnTX}O)qEf_|Gw4Eww2chM8 z<8B_2=gRItJi84^l5nf|WD)*)mba>&k0? 
[... base85-encoded binary delta data for stencil_cu.o elided ...]
literal 18464
[... base85-encoded binary delta data for stencil_cu.o elided ...]
ziDD%SlCov19t_e?JGd2&_V>iPl;(R|w%oh3CsxFs8)wNpbDwL2a*vD?0+f64ZyEUW zV+);{CjChZokGc^SI7cv*VAL6TlEZD=vF;X%U~i|{I5=C^xrBIFLfh3Rs7NCb2hqH zkQLQNZ&@oCdv6iYuIKM;bW_hi+UR{!&%a7K*-0jwei*aRDQBDXTgt___IdSxR?zMC zeAh-d?fHR?zDL@#=hotS{>DZ(_58h!{*=`7Pm*q**Uv0;>%1oCp+73~jeWk#=Zf2T tlZ|fLS!ScZB<=jX%zt*jebGiY_1tcwpOtz7l5U^xJr=rkzWwvi{~I-}qvHSo diff --git a/examples_cuda/stencil/stencil_cu.s b/examples_cuda/stencil/stencil_cu.s deleted file mode 100644 index a10402a9..00000000 --- a/examples_cuda/stencil/stencil_cu.s +++ /dev/null @@ -1,1134 +0,0 @@ - .file "stencil_cu.ll" - .section .rodata.cst16,"aM",@progbits,16 - .align 16 -.LCPI0_0: - .long 4 # 0x4 - .long 5 # 0x5 - .long 6 # 0x6 - .long 7 # 0x7 -.LCPI0_1: - .long 0 # 0x0 - .long 1 # 0x1 - .long 2 # 0x2 - .long 3 # 0x3 - .section .rodata,"a",@progbits - .align 32 -.LCPI0_2: - .quad 4611686018427387904 # double 2.000000e+00 - .quad 4611686018427387904 # double 2.000000e+00 - .quad 4611686018427387904 # double 2.000000e+00 - .quad 4611686018427387904 # double 2.000000e+00 - .text - .align 16, 0x90 - .type stencil_step___uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_,@function -stencil_step___uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_: # @stencil_step___uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_ -# BB#0: # %allocas - pushq %rbp - pushq %r15 - pushq %r14 - pushq %r13 - pushq %r12 - pushq %rbx - subq $1384, %rsp # imm = 0x568 - movl %ecx, -72(%rsp) # 4-byte Spill - movl %esi, 1308(%rsp) # 4-byte Spill - movl %edi, -68(%rsp) # 4-byte Spill - movq 1456(%rsp), %rcx - vmovsd 24(%rcx), %xmm1 - vmovsd 16(%rcx), %xmm3 - movq 1472(%rsp), %rax - vmovsd (%rcx), %xmm2 - vmovsd 8(%rcx), %xmm4 - movl 1448(%rsp), %esi - vmovmskps %ymm0, %ecx - cmpl $255, %ecx - jne .LBB0_1 -# BB#7: # %for_test.preheader - cmpl %r9d, %r8d - jge .LBB0_6 -# BB#8: # %for_test30.preheader.lr.ph - leal -3(%r8), %ecx - leal 2(%r8), %r13d - leal -1(%r8), %edi - leal 3(%r8), %ebp - movl %esi, %r11d - imull %r11d, %ebp - movl %ebp, %ebx - imull %r11d, %edi - movl %edi, %ebp - imull %r11d, %r13d - imull %r8d, %esi - imull %r11d, %ecx - leal -2(%r8), %r10d - imull %r11d, %r10d - leal 1(%r8), %r14d - imull %r11d, %r14d - movl %edx, -96(%rsp) # 4-byte Spill - addl %edx, %r14d - addl %edx, %r10d - addl %edx, %ecx - movl %ecx, 1344(%rsp) # 4-byte Spill - movl %r9d, -92(%rsp) # 4-byte Spill - leal 1(%rdx,%rsi), %r15d - leal 2(%rdx,%rsi), %edi - addl %edx, %r13d - addl %edx, %ebp - movl %ebp, 1216(%rsp) # 4-byte Spill - addl %edx, %ebx - movl %ebx, 1152(%rsp) # 4-byte Spill - leal -1(%rdx,%rsi), %ebp - leal 3(%rdx,%rsi), %ecx - leal (%rdx,%rsi), %r12d - leal -3(%rdx,%rsi), %ebx - movl %ebx, 1184(%rsp) # 4-byte Spill - movl %r8d, -88(%rsp) # 4-byte Spill - leal -2(%rdx,%rsi), %edx - vmovd 1308(%rsp), %xmm0 # 4-byte Folded Reload - movl 1440(%rsp), %r9d - imull %r9d, %r13d - imull %r9d, %ecx - movl %ecx, 1312(%rsp) # 4-byte Spill - imull %r9d, %ebp - movl %ebp, 1248(%rsp) # 4-byte Spill - imull %r9d, %edi - imull %r9d, %r15d - movl 1344(%rsp), %ecx # 4-byte Reload - imull %r9d, %ecx - movl %ecx, 1344(%rsp) # 4-byte Spill - imull %r9d, %r10d - movl 1152(%rsp), %ebx # 4-byte Reload - imull %r9d, %ebx - movl 1216(%rsp), %ebp # 4-byte Reload - imull %r9d, %ebp - imull %r9d, %r14d - movl 1184(%rsp), %r8d # 4-byte Reload - imull %r9d, %r8d - imull %r9d, %edx - movl %edx, 1216(%rsp) # 4-byte Spill - imull %r9d, %r12d - movl -68(%rsp), %edx # 4-byte Reload - leal (,%rdx,8), %edx - leal 
-16(%rdx,%r12,8), %esi - movl %esi, 76(%rsp) # 4-byte Spill - leal (%rdx,%r12,8), %ecx - movl %ecx, 72(%rsp) # 4-byte Spill - leal (%rdx,%r15,8), %ecx - movl %ecx, 68(%rsp) # 4-byte Spill - movl -92(%rsp), %ecx # 4-byte Reload - leal (%rdx,%rdi,8), %esi - movl %esi, 64(%rsp) # 4-byte Spill - movl 1248(%rsp), %esi # 4-byte Reload - leal (%rdx,%rsi,8), %esi - movl %esi, 60(%rsp) # 4-byte Spill - movl 1312(%rsp), %esi # 4-byte Reload - leal (%rdx,%rsi,8), %esi - movl %esi, 56(%rsp) # 4-byte Spill - movl 1216(%rsp), %esi # 4-byte Reload - leal (%rdx,%rsi,8), %esi - movl %esi, 52(%rsp) # 4-byte Spill - movl -88(%rsp), %esi # 4-byte Reload - leal (%rdx,%r8,8), %edi - movl %edi, 48(%rsp) # 4-byte Spill - leal (%rdx,%r14,8), %edi - movl %edi, 44(%rsp) # 4-byte Spill - leal (%rdx,%r13,8), %edi - movl %edi, 40(%rsp) # 4-byte Spill - leal (%rdx,%rbp,8), %edi - movl %edi, 36(%rsp) # 4-byte Spill - leal (%rdx,%rbx,8), %edi - movl %edi, 32(%rsp) # 4-byte Spill - leal (%rdx,%r10,8), %edi - movl %edi, 28(%rsp) # 4-byte Spill - movl 1344(%rsp), %edi # 4-byte Reload - leal (%rdx,%rdi,8), %edx - movl %edx, 24(%rsp) # 4-byte Spill - movl $0, -100(%rsp) # 4-byte Folded Spill - imull %r9d, %r11d - shll $3, %r9d - movl %r9d, -76(%rsp) # 4-byte Spill - shll $3, %r11d - movl %r11d, -104(%rsp) # 4-byte Spill - vpermilpd $0, %xmm3, %xmm3 # xmm3 = xmm3[0,0] - vpermilpd $0, %xmm2, %xmm2 # xmm2 = xmm2[0,0] - vpermilpd $0, %xmm1, %xmm1 # xmm1 = xmm1[0,0] - vpshufd $0, %xmm0, %xmm0 # xmm0 = xmm0[0,0,0,0] - vinsertf128 $1, %xmm1, %ymm1, %ymm1 - vmovupd %ymm1, 1312(%rsp) # 32-byte Folded Spill - vinsertf128 $1, %xmm3, %ymm3, %ymm1 - vmovupd %ymm1, 1344(%rsp) # 32-byte Folded Spill - vinsertf128 $1, %xmm2, %ymm2, %ymm15 - vmovupd %ymm15, -32(%rsp) # 32-byte Folded Spill - vpermilpd $0, %xmm4, %xmm1 # xmm1 = xmm4[0,0] - vinsertf128 $1, %xmm1, %ymm1, %ymm14 - vmovupd %ymm14, -64(%rsp) # 32-byte Folded Spill - vinsertf128 $1, %xmm0, %ymm0, %ymm0 - vmovups %ymm0, 1248(%rsp) # 32-byte Folded Spill - vmovapd .LCPI0_2(%rip), %ymm13 - .align 16, 0x90 -.LBB0_9: # %for_test30.preheader - # =>This Loop Header: Depth=1 - # Child Loop BB0_16 Depth 2 - # Child Loop BB0_12 Depth 3 - movl %esi, -88(%rsp) # 4-byte Spill - movl -96(%rsp), %edx # 4-byte Reload - cmpl -72(%rsp), %edx # 4-byte Folded Reload - jge .LBB0_11 -# BB#10: # %for_test37.preheader.lr.ph - # in Loop: Header=BB0_9 Depth=1 - movl -68(%rsp), %edx # 4-byte Reload - cmpl 1308(%rsp), %edx # 4-byte Folded Reload - movl -100(%rsp), %edx # 4-byte Reload - movl -96(%rsp), %edi # 4-byte Reload - jge .LBB0_11 - .align 16, 0x90 -.LBB0_16: # %for_loop39.lr.ph.us - # Parent Loop BB0_9 Depth=1 - # => This Loop Header: Depth=2 - # Child Loop BB0_12 Depth 3 - movl %edi, -84(%rsp) # 4-byte Spill - movl %edx, -80(%rsp) # 4-byte Spill - movl %edx, %r13d - movl -68(%rsp), %ecx # 4-byte Reload - .align 16, 0x90 -.LBB0_12: # %for_loop39.us - # Parent Loop BB0_9 Depth=1 - # Parent Loop BB0_16 Depth=2 - # => This Inner Loop Header: Depth=3 - movl %ecx, 1216(%rsp) # 4-byte Spill - vmovups 1248(%rsp), %ymm3 # 32-byte Folded Reload - vmovups %ymm3, 1248(%rsp) # 32-byte Folded Spill - vextractf128 $1, %ymm3, %xmm0 - vmovd %ecx, %xmm1 - vpshufd $0, %xmm1, %xmm1 # xmm1 = xmm1[0,0,0,0] - vpaddd .LCPI0_0(%rip), %xmm1, %xmm2 - vpcmpgtd %xmm2, %xmm0, %xmm0 - vpaddd .LCPI0_1(%rip), %xmm1, %xmm1 - vpcmpgtd %xmm1, %xmm3, %xmm1 - vinsertf128 $1, %xmm0, %ymm1, %ymm8 - vmovmskps %ymm8, %ecx - testl %ecx, %ecx - je .LBB0_14 -# BB#13: # %safe_if_run_true.us - # in Loop: Header=BB0_12 Depth=3 - movl 76(%rsp), %esi 
# 4-byte Reload - leal 8(%rsi,%r13), %edx - movl 68(%rsp), %ecx # 4-byte Reload - leal (%rcx,%r13), %ecx - movl 72(%rsp), %r12d # 4-byte Reload - leal 24(%r12,%r13), %r14d - leal -8(%rsi,%r13), %r8d - movl 52(%rsp), %edi # 4-byte Reload - leal (%rdi,%r13), %edi - leal 8(%r12,%r13), %ebp - leal (%rsi,%r13), %esi - leal 16(%r12,%r13), %r11d - movl 64(%rsp), %ebx # 4-byte Reload - leal (%rbx,%r13), %r9d - movl 44(%rsp), %ebx # 4-byte Reload - leal (%rbx,%r13), %r15d - movl 60(%rsp), %ebx # 4-byte Reload - leal (%rbx,%r13), %r10d - movl 40(%rsp), %ebx # 4-byte Reload - leal (%rbx,%r13), %ebx - movl %ebx, 832(%rsp) # 4-byte Spill - movl 56(%rsp), %ebx # 4-byte Reload - leal (%rbx,%r13), %ebx - movl %ebx, 800(%rsp) # 4-byte Spill - movl 36(%rsp), %ebx # 4-byte Reload - leal (%rbx,%r13), %ebx - movl %ebx, 768(%rsp) # 4-byte Spill - movl 28(%rsp), %ebx # 4-byte Reload - leal (%rbx,%r13), %ebx - movl %ebx, 736(%rsp) # 4-byte Spill - leal (%r12,%r13), %ebx - movl %ebx, 960(%rsp) # 4-byte Spill - movl 48(%rsp), %ebx # 4-byte Reload - leal (%rbx,%r13), %ebx - movl %ebx, 896(%rsp) # 4-byte Spill - movl 32(%rsp), %ebx # 4-byte Reload - leal (%rbx,%r13), %r12d - movl 24(%rsp), %ebx # 4-byte Reload - leal (%rbx,%r13), %ebx - movl %ebx, 992(%rsp) # 4-byte Spill - movslq %edx, %rdx - movq %rdx, 1184(%rsp) # 8-byte Spill - movslq %ecx, %rbx - movq %rbx, 1056(%rsp) # 8-byte Spill - movslq %esi, %rcx - movq %rcx, 1120(%rsp) # 8-byte Spill - vmovupd (%rax,%rbx), %xmm0 - movq %rbx, %rsi - vmovupd 16(%rax,%rdx), %xmm2 - vmovupd (%rax,%rdx), %xmm3 - movslq %ebp, %rdx - movq %rdx, 1152(%rsp) # 8-byte Spill - vmovupd 16(%rax,%rdx), %xmm1 - vmovupd (%rax,%rdx), %xmm4 - vinsertf128 $1, %xmm1, %ymm4, %ymm1 - vinsertf128 $1, %xmm2, %ymm3, %ymm2 - movslq %edi, %rdx - movq %rdx, 928(%rsp) # 8-byte Spill - movslq %r8d, %rbx - movslq %r14d, %r14 - vmovupd 16(%rax,%rsi), %xmm3 - vmovupd 16(%rax,%rcx), %xmm4 - vmovupd (%rax,%rcx), %xmm5 - movslq %r11d, %rcx - movq %rcx, 1088(%rsp) # 8-byte Spill - vmovupd 16(%rax,%rcx), %xmm6 - vmovupd (%rax,%rcx), %xmm7 - vinsertf128 $1, %xmm6, %ymm7, %ymm6 - vinsertf128 $1, %xmm4, %ymm5, %ymm7 - vaddpd %ymm1, %ymm2, %ymm1 - vinsertf128 $1, %xmm3, %ymm0, %ymm3 - movslq %r10d, %rsi - movq %rsi, 864(%rsp) # 8-byte Spill - vmovupd (%rax,%r14), %xmm5 - vmovupd (%rax,%rbx), %xmm4 - vmovupd (%rax,%rdx), %xmm2 - movslq %r15d, %rbp - movslq %r9d, %rcx - movq %rcx, 1048(%rsp) # 8-byte Spill - vmovupd 16(%rax,%rcx), %xmm0 - vmovupd (%rax,%rcx), %xmm9 - vaddpd %ymm6, %ymm7, %ymm7 - vinsertf128 $1, %xmm0, %ymm9, %ymm9 - vmovupd (%rax,%rbp), %xmm12 - vmovupd (%rax,%rsi), %xmm6 - vmovupd 16(%rax,%rdx), %xmm0 - vaddpd %ymm3, %ymm1, %ymm3 - vinsertf128 $1, 16(%rax,%rsi), %ymm6, %ymm6 - vinsertf128 $1, 16(%rax,%r14), %ymm5, %ymm5 - vinsertf128 $1, 16(%rax,%rbx), %ymm4, %ymm4 - vaddpd %ymm9, %ymm7, %ymm1 - vinsertf128 $1, %xmm0, %ymm2, %ymm2 - movslq 736(%rsp), %r8 # 4-byte Folded Reload - movslq 768(%rsp), %rdx # 4-byte Folded Reload - movslq 800(%rsp), %rdi # 4-byte Folded Reload - vmovupd (%rax,%rdi), %xmm10 - movslq 832(%rsp), %r15 # 4-byte Folded Reload - vmovupd (%rax,%r15), %xmm9 - vmovupd (%rax,%rdx), %xmm7 - vaddpd %ymm5, %ymm4, %ymm4 - vmovupd (%rax,%r8), %xmm11 - vaddpd %ymm6, %ymm3, %ymm5 - vinsertf128 $1, 16(%rax,%rdi), %ymm10, %ymm3 - vinsertf128 $1, 16(%rax,%rbp), %ymm12, %ymm10 - vinsertf128 $1, 16(%rax,%r15), %ymm9, %ymm0 - movslq 896(%rsp), %r11 # 4-byte Folded Reload - vaddpd %ymm2, %ymm1, %ymm1 - movslq 960(%rsp), %rcx # 4-byte Folded Reload - vmovupd (%rax,%rcx), %xmm6 - vaddpd %ymm0, 
%ymm1, %ymm1 - vinsertf128 $1, 16(%rax,%r8), %ymm11, %ymm2 - vinsertf128 $1, 16(%rax,%rdx), %ymm7, %ymm0 - movslq %r12d, %r12 - vaddpd %ymm10, %ymm5, %ymm7 - vmovupd (%rax,%r11), %xmm5 - vaddpd %ymm3, %ymm4, %ymm3 - vinsertf128 $1, 16(%rax,%r11), %ymm5, %ymm4 - vinsertf128 $1, 16(%rax,%rcx), %ymm6, %ymm9 - vmovupd (%rax,%r12), %xmm5 - movslq 992(%rsp), %rsi # 4-byte Folded Reload - vaddpd %ymm0, %ymm7, %ymm10 - vextractf128 $1, %ymm8, %xmm6 - vaddpd %ymm2, %ymm1, %ymm2 - vpshufd $80, %xmm6, %xmm7 # xmm7 = xmm6[0,0,1,1] - vmulpd %ymm9, %ymm15, %ymm1 - vmovupd (%rax,%rsi), %xmm9 - vaddpd %ymm4, %ymm3, %ymm3 - vinsertf128 $1, 16(%rax,%r12), %ymm5, %ymm4 - vpshufd $80, %xmm8, %xmm5 # xmm5 = xmm8[0,0,1,1] - vpshufd $-6, %xmm6, %xmm0 # xmm0 = xmm6[2,2,3,3] - vpshufd $-6, %xmm8, %xmm6 # xmm6 = xmm8[2,2,3,3] - vinsertf128 $1, %xmm6, %ymm5, %ymm6 - vinsertf128 $1, 16(%rax,%rsi), %ymm9, %ymm5 - vinsertf128 $1, %xmm0, %ymm7, %ymm8 - vmovupd %ymm8, 96(%rsp) # 32-byte Folded Spill - vmovupd 1344(%rsp), %ymm0 # 32-byte Folded Reload - vmovupd %ymm0, 1344(%rsp) # 32-byte Folded Spill - vmovupd %ymm0, 1344(%rsp) # 32-byte Folded Spill - vmulpd %ymm2, %ymm0, %ymm0 - vmulpd %ymm10, %ymm14, %ymm2 - movq 1480(%rsp), %r9 - vmaskmovpd (%r9,%rcx), %ymm6, %ymm7 - vaddpd %ymm1, %ymm2, %ymm1 - vaddpd %ymm1, %ymm0, %ymm0 - vaddpd %ymm4, %ymm3, %ymm3 - vmaskmovpd (%rax,%rcx), %ymm6, %ymm1 - vmulpd %ymm13, %ymm1, %ymm1 - movq 1464(%rsp), %r10 - vmaskmovpd (%r10,%rcx), %ymm6, %ymm2 - vsubpd %ymm7, %ymm1, %ymm1 - vmaskmovpd 32(%r10,%rcx), %ymm8, %ymm4 - vmovupd %ymm4, 992(%rsp) # 32-byte Folded Spill - vaddpd %ymm5, %ymm3, %ymm3 - vmovups 48(%rax,%rsi), %xmm4 - vmovaps %xmm4, 960(%rsp) # 16-byte Spill - vmovupd 1312(%rsp), %ymm4 # 32-byte Folded Reload - vmovupd %ymm4, 1312(%rsp) # 32-byte Folded Spill - vmovupd %ymm4, 1312(%rsp) # 32-byte Folded Spill - vmulpd %ymm3, %ymm4, %ymm3 - vmovups 32(%rax,%rsi), %xmm4 - vmovups %ymm4, 896(%rsp) # 32-byte Folded Spill - vaddpd %ymm3, %ymm0, %ymm0 - vmovups 48(%rax,%r12), %xmm3 - vmovaps %xmm3, 832(%rsp) # 16-byte Spill - vmulpd %ymm2, %ymm0, %ymm0 - vmovups 32(%rax,%r12), %xmm2 - vmovups %ymm2, 800(%rsp) # 32-byte Folded Spill - vaddpd %ymm0, %ymm1, %ymm0 - vmovupd %ymm0, 128(%rsp) # 32-byte Folded Spill - vmovups 48(%rax,%r11), %xmm0 - vmovaps %xmm0, 768(%rsp) # 16-byte Spill - vmovups 32(%rax,%r11), %xmm0 - vmovups %ymm0, 736(%rsp) # 32-byte Folded Spill - vmovups 48(%rax,%rdi), %xmm0 - vmovaps %xmm0, 704(%rsp) # 16-byte Spill - vmovups 32(%rax,%rdi), %xmm0 - vmovups %ymm0, 640(%rsp) # 32-byte Folded Spill - vmovups 48(%rax,%rbx), %xmm0 - vmovaps %xmm0, 592(%rsp) # 16-byte Spill - vmovups 32(%rax,%rbx), %xmm0 - vmovups %ymm0, 544(%rsp) # 32-byte Folded Spill - vmovups 48(%rax,%r14), %xmm0 - vmovaps %xmm0, 464(%rsp) # 16-byte Spill - vmovups 32(%rax,%r14), %xmm0 - vmovups %ymm0, 416(%rsp) # 32-byte Folded Spill - vmovups 48(%rax,%rdx), %xmm0 - vmovaps %xmm0, 400(%rsp) # 16-byte Spill - vmovups 32(%rax,%rdx), %xmm0 - vmovups %ymm0, 352(%rsp) # 32-byte Folded Spill - vmovups 48(%rax,%r8), %xmm0 - vmovaps %xmm0, 336(%rsp) # 16-byte Spill - vmovups 32(%rax,%r8), %xmm0 - vmovups %ymm0, 288(%rsp) # 32-byte Folded Spill - vmovups 48(%rax,%rbp), %xmm0 - vmovaps %xmm0, 272(%rsp) # 16-byte Spill - vmovups 32(%rax,%rbp), %xmm0 - vmovups %ymm0, 224(%rsp) # 32-byte Folded Spill - vmaskmovpd 32(%r9,%rcx), %ymm8, %ymm0 - vmovupd %ymm0, 672(%rsp) # 32-byte Folded Spill - vmaskmovpd 32(%rax,%rcx), %ymm8, %ymm0 - vmovupd %ymm0, 608(%rsp) # 32-byte Folded Spill - vmovups 48(%rax,%rcx), %xmm0 - 
vmovaps %xmm0, 528(%rsp) # 16-byte Spill - vmovups 32(%rax,%rcx), %xmm0 - vmovups %ymm0, 480(%rsp) # 32-byte Folded Spill - vmovups 48(%rax,%r15), %xmm0 - vmovaps %xmm0, 208(%rsp) # 16-byte Spill - vmovups 32(%rax,%r15), %xmm0 - vmovups %ymm0, 160(%rsp) # 32-byte Folded Spill - movq 864(%rsp), %rdx # 8-byte Reload - vmovups 48(%rax,%rdx), %xmm0 - vmovaps %xmm0, 80(%rsp) # 16-byte Spill - vmovups 32(%rax,%rdx), %xmm0 - vmovups %ymm0, 864(%rsp) # 32-byte Folded Spill - movq 928(%rsp), %rdx # 8-byte Reload - vmovupd 48(%rax,%rdx), %xmm4 - vmovupd 32(%rax,%rdx), %xmm9 - movq 1056(%rsp), %rdx # 8-byte Reload - vmovupd 48(%rax,%rdx), %xmm5 - vmovupd 32(%rax,%rdx), %xmm11 - movq 1048(%rsp), %rdx # 8-byte Reload - vmovupd 48(%rax,%rdx), %xmm13 - vmovupd 32(%rax,%rdx), %xmm7 - movq 1184(%rsp), %rdx # 8-byte Reload - vmovupd 48(%rax,%rdx), %xmm15 - vmovupd 32(%rax,%rdx), %xmm10 - movq 1152(%rsp), %rdx # 8-byte Reload - vmovupd 48(%rax,%rdx), %xmm12 - vmovupd 32(%rax,%rdx), %xmm14 - movq 1120(%rsp), %rdx # 8-byte Reload - vmovupd 48(%rax,%rdx), %xmm0 - vmovupd 32(%rax,%rdx), %xmm1 - movq 1088(%rsp), %rdx # 8-byte Reload - vmovupd 48(%rax,%rdx), %xmm2 - vmovupd 32(%rax,%rdx), %xmm3 - vmovupd 128(%rsp), %ymm8 # 32-byte Folded Reload - vmaskmovpd %ymm8, %ymm6, (%r9,%rcx) - vinsertf128 $1, %xmm2, %ymm3, %ymm2 - vinsertf128 $1, %xmm0, %ymm1, %ymm0 - vaddpd %ymm2, %ymm0, %ymm1 - vinsertf128 $1, %xmm12, %ymm14, %ymm0 - vinsertf128 $1, %xmm15, %ymm10, %ymm2 - vaddpd %ymm0, %ymm2, %ymm0 - vinsertf128 $1, %xmm13, %ymm7, %ymm2 - vinsertf128 $1, %xmm5, %ymm11, %ymm3 - vaddpd %ymm3, %ymm0, %ymm5 - vaddpd %ymm2, %ymm1, %ymm0 - vinsertf128 $1, %xmm4, %ymm9, %ymm1 - vaddpd %ymm1, %ymm0, %ymm0 - vmovupd 864(%rsp), %ymm1 # 32-byte Folded Reload - vinsertf128 $1, 80(%rsp), %ymm1, %ymm1 # 16-byte Folded Reload - vmovupd 160(%rsp), %ymm2 # 32-byte Folded Reload - vinsertf128 $1, 208(%rsp), %ymm2, %ymm2 # 16-byte Folded Reload - vaddpd %ymm2, %ymm0, %ymm0 - vaddpd %ymm1, %ymm5, %ymm1 - vmovupd 224(%rsp), %ymm2 # 32-byte Folded Reload - vinsertf128 $1, 272(%rsp), %ymm2, %ymm2 # 16-byte Folded Reload - vaddpd %ymm2, %ymm1, %ymm1 - vmovupd 288(%rsp), %ymm2 # 32-byte Folded Reload - vinsertf128 $1, 336(%rsp), %ymm2, %ymm2 # 16-byte Folded Reload - vmovupd 352(%rsp), %ymm3 # 32-byte Folded Reload - vinsertf128 $1, 400(%rsp), %ymm3, %ymm3 # 16-byte Folded Reload - vaddpd %ymm3, %ymm1, %ymm1 - vaddpd %ymm2, %ymm0, %ymm2 - vmovupd 416(%rsp), %ymm0 # 32-byte Folded Reload - vinsertf128 $1, 464(%rsp), %ymm0, %ymm0 # 16-byte Folded Reload - vmovupd 544(%rsp), %ymm3 # 32-byte Folded Reload - vinsertf128 $1, 592(%rsp), %ymm3, %ymm3 # 16-byte Folded Reload - vaddpd %ymm0, %ymm3, %ymm0 - vmovupd 640(%rsp), %ymm3 # 32-byte Folded Reload - vinsertf128 $1, 704(%rsp), %ymm3, %ymm3 # 16-byte Folded Reload - vaddpd %ymm3, %ymm0, %ymm0 - vmovupd 1344(%rsp), %ymm3 # 32-byte Folded Reload - vmulpd %ymm2, %ymm3, %ymm2 - vmovupd -64(%rsp), %ymm3 # 32-byte Folded Reload - vmulpd %ymm1, %ymm3, %ymm1 - vmovapd %ymm3, %ymm14 - vmovupd 736(%rsp), %ymm3 # 32-byte Folded Reload - vinsertf128 $1, 768(%rsp), %ymm3, %ymm3 # 16-byte Folded Reload - vmovupd 480(%rsp), %ymm4 # 32-byte Folded Reload - vinsertf128 $1, 528(%rsp), %ymm4, %ymm4 # 16-byte Folded Reload - vmovupd -32(%rsp), %ymm5 # 32-byte Folded Reload - vmulpd %ymm4, %ymm5, %ymm4 - vmovapd %ymm5, %ymm15 - vaddpd %ymm4, %ymm1, %ymm1 - vmovapd .LCPI0_2(%rip), %ymm5 - vmovupd 608(%rsp), %ymm4 # 32-byte Folded Reload - vmulpd %ymm5, %ymm4, %ymm4 - vmovapd %ymm5, %ymm13 - vaddpd %ymm1, %ymm2, %ymm2 - 
vsubpd 672(%rsp), %ymm4, %ymm1 # 32-byte Folded Reload - vaddpd %ymm3, %ymm0, %ymm0 - vmovupd 800(%rsp), %ymm3 # 32-byte Folded Reload - vinsertf128 $1, 832(%rsp), %ymm3, %ymm3 # 16-byte Folded Reload - vaddpd %ymm3, %ymm0, %ymm0 - vmovupd 896(%rsp), %ymm3 # 32-byte Folded Reload - vinsertf128 $1, 960(%rsp), %ymm3, %ymm3 # 16-byte Folded Reload - vaddpd %ymm3, %ymm0, %ymm0 - vmovupd 1312(%rsp), %ymm3 # 32-byte Folded Reload - vmulpd %ymm0, %ymm3, %ymm0 - vaddpd %ymm0, %ymm2, %ymm0 - vmulpd 992(%rsp), %ymm0, %ymm0 # 32-byte Folded Reload - vaddpd %ymm0, %ymm1, %ymm0 - vmovupd 96(%rsp), %ymm1 # 32-byte Folded Reload - vmaskmovpd %ymm0, %ymm1, 32(%r9,%rcx) -.LBB0_14: # %safe_if_after_true.us - # in Loop: Header=BB0_12 Depth=3 - addl $64, %r13d - movl 1216(%rsp), %ecx # 4-byte Reload - addl $8, %ecx - cmpl 1308(%rsp), %ecx # 4-byte Folded Reload - jl .LBB0_12 -# BB#15: # %for_exit40.us - # in Loop: Header=BB0_16 Depth=2 - movl -80(%rsp), %edx # 4-byte Reload - addl -76(%rsp), %edx # 4-byte Folded Reload - movl -84(%rsp), %edi # 4-byte Reload - incl %edi - cmpl -72(%rsp), %edi # 4-byte Folded Reload - movl -92(%rsp), %ecx # 4-byte Reload - movl -88(%rsp), %esi # 4-byte Reload - jne .LBB0_16 -.LBB0_11: # %for_exit33 - # in Loop: Header=BB0_9 Depth=1 - movl -100(%rsp), %edx # 4-byte Reload - addl -104(%rsp), %edx # 4-byte Folded Reload - movl %edx, -100(%rsp) # 4-byte Spill - incl %esi - cmpl %ecx, %esi - jne .LBB0_9 - jmp .LBB0_6 -.LBB0_1: # %for_test264.preheader - cmpl %r9d, %r8d - jge .LBB0_6 -# BB#2: # %for_test275.preheader.lr.ph - leal 2(%r8), %r13d - movl %esi, %r10d - imull %r10d, %r13d - movl %r10d, %ecx - imull %r8d, %ecx - movl %edx, %esi - movl %esi, -96(%rsp) # 4-byte Spill - leal (%rsi,%rcx), %r15d - movl %r9d, -92(%rsp) # 4-byte Spill - leal 2(%rsi,%rcx), %edx - movl %edx, 1248(%rsp) # 4-byte Spill - leal -1(%rsi,%rcx), %edx - movl %edx, 1344(%rsp) # 4-byte Spill - leal 3(%rsi,%rcx), %r12d - leal -2(%rsi,%rcx), %edx - movl %edx, 1312(%rsp) # 4-byte Spill - leal -3(%rsi,%rcx), %edi - addl %esi, %r13d - leal 1(%rsi,%rcx), %ecx - leal -3(%r8), %r14d - imull %r10d, %r14d - leal -2(%r8), %r9d - imull %r10d, %r9d - leal 3(%r8), %ebx - imull %r10d, %ebx - leal -1(%r8), %ebp - imull %r10d, %ebp - leal 1(%r8), %edx - imull %r10d, %edx - addl %esi, %edx - addl %esi, %ebp - addl %esi, %ebx - addl %esi, %r9d - addl %esi, %r14d - vmovd 1308(%rsp), %xmm5 # 4-byte Folded Reload - movl 1440(%rsp), %r11d - imull %r11d, %ecx - movl %ecx, 1184(%rsp) # 4-byte Spill - imull %r11d, %r13d - imull %r11d, %edi - movl %edi, 1216(%rsp) # 4-byte Spill - movl 1312(%rsp), %ecx # 4-byte Reload - imull %r11d, %ecx - movl %ecx, 1312(%rsp) # 4-byte Spill - imull %r11d, %r12d - movl 1344(%rsp), %esi # 4-byte Reload - imull %r11d, %esi - movl %esi, 1344(%rsp) # 4-byte Spill - movl 1248(%rsp), %ecx # 4-byte Reload - imull %r11d, %ecx - imull %r11d, %r15d - movl -68(%rsp), %esi # 4-byte Reload - leal (,%rsi,8), %esi - imull %r11d, %r14d - imull %r11d, %r9d - imull %r11d, %ebx - imull %r11d, %ebp - imull %r11d, %edx - leal -16(%rsi,%r15,8), %edi - movl %edi, 672(%rsp) # 4-byte Spill - leal (%rsi,%r15,8), %edi - movl %edi, 640(%rsp) # 4-byte Spill - movl 1184(%rsp), %edi # 4-byte Reload - leal (%rsi,%rdi,8), %edi - movl %edi, 608(%rsp) # 4-byte Spill - movl %r8d, %edi - leal (%rsi,%rcx,8), %ecx - movl %ecx, 592(%rsp) # 4-byte Spill - movl 1344(%rsp), %ecx # 4-byte Reload - leal (%rsi,%rcx,8), %ecx - movl %ecx, 544(%rsp) # 4-byte Spill - leal (%rsi,%r12,8), %ecx - movl %ecx, 528(%rsp) # 4-byte Spill - movl 1312(%rsp), 
%ecx # 4-byte Reload - leal (%rsi,%rcx,8), %ecx - movl %ecx, 480(%rsp) # 4-byte Spill - movl 1216(%rsp), %ecx # 4-byte Reload - leal (%rsi,%rcx,8), %ecx - movl %ecx, 464(%rsp) # 4-byte Spill - leal (%rsi,%rdx,8), %ecx - movl %ecx, 416(%rsp) # 4-byte Spill - leal (%rsi,%r13,8), %ecx - movl %ecx, 400(%rsp) # 4-byte Spill - leal (%rsi,%rbp,8), %ecx - movl %ecx, 352(%rsp) # 4-byte Spill - leal (%rsi,%rbx,8), %ecx - movl %ecx, 336(%rsp) # 4-byte Spill - leal (%rsi,%r9,8), %ecx - movl %ecx, 288(%rsp) # 4-byte Spill - leal (%rsi,%r14,8), %ecx - movl %ecx, 272(%rsp) # 4-byte Spill - movl $0, 160(%rsp) # 4-byte Folded Spill - imull %r11d, %r10d - shll $3, %r11d - movl %r11d, -76(%rsp) # 4-byte Spill - shll $3, %r10d - movl %r10d, -104(%rsp) # 4-byte Spill - vpermilpd $0, %xmm1, %xmm6 # xmm6 = xmm1[0,0] - vpermilpd $0, %xmm3, %xmm3 # xmm3 = xmm3[0,0] - vpermilpd $0, %xmm2, %xmm1 # xmm1 = xmm2[0,0] - vmovaps %ymm0, %ymm8 - vmovups %ymm8, 704(%rsp) # 32-byte Folded Spill - vextractf128 $1, %ymm8, %xmm7 - vpshufd $80, %xmm8, %xmm0 # xmm0 = xmm8[0,0,1,1] - vinsertf128 $1, %xmm6, %ymm6, %ymm13 - vpshufd $80, %xmm7, %xmm2 # xmm2 = xmm7[0,0,1,1] - vinsertf128 $1, %xmm3, %ymm3, %ymm15 - vpshufd $-6, %xmm7, %xmm3 # xmm3 = xmm7[2,2,3,3] - vinsertf128 $1, %xmm1, %ymm1, %ymm10 - vpshufd $-6, %xmm8, %xmm1 # xmm1 = xmm8[2,2,3,3] - vpshufd $0, %xmm5, %xmm7 # xmm7 = xmm5[0,0,0,0] - vpermilpd $0, %xmm4, %xmm4 # xmm4 = xmm4[0,0] - vinsertf128 $1, %xmm4, %ymm4, %ymm4 - vmovupd %ymm4, 1344(%rsp) # 32-byte Folded Spill - vinsertf128 $1, %xmm3, %ymm2, %ymm5 - vinsertf128 $1, %xmm1, %ymm0, %ymm6 - vinsertf128 $1, %xmm7, %ymm7, %ymm0 - vmovups %ymm0, 1312(%rsp) # 32-byte Folded Spill - vmovapd .LCPI0_2(%rip), %ymm14 - .align 16, 0x90 -.LBB0_3: # %for_test275.preheader - # =>This Loop Header: Depth=1 - # Child Loop BB0_21 Depth 2 - # Child Loop BB0_17 Depth 3 - movl %edi, -88(%rsp) # 4-byte Spill - movl -96(%rsp), %ecx # 4-byte Reload - cmpl -72(%rsp), %ecx # 4-byte Folded Reload - jge .LBB0_5 -# BB#4: # %for_test286.preheader.lr.ph - # in Loop: Header=BB0_3 Depth=1 - movl -68(%rsp), %ecx # 4-byte Reload - cmpl 1308(%rsp), %ecx # 4-byte Folded Reload - movl 160(%rsp), %ecx # 4-byte Reload - movl -96(%rsp), %edx # 4-byte Reload - jge .LBB0_5 - .align 16, 0x90 -.LBB0_21: # %for_loop288.lr.ph.us - # Parent Loop BB0_3 Depth=1 - # => This Loop Header: Depth=2 - # Child Loop BB0_17 Depth 3 - movl %edx, 208(%rsp) # 4-byte Spill - movl %ecx, 224(%rsp) # 4-byte Spill - movl %ecx, %r9d - movl -68(%rsp), %r15d # 4-byte Reload - .align 16, 0x90 -.LBB0_17: # %for_loop288.us - # Parent Loop BB0_3 Depth=1 - # Parent Loop BB0_21 Depth=2 - # => This Inner Loop Header: Depth=3 - vmovups 1312(%rsp), %ymm3 # 32-byte Folded Reload - vmovups %ymm3, 1312(%rsp) # 32-byte Folded Spill - vextractf128 $1, %ymm3, %xmm0 - vmovd %r15d, %xmm1 - vpshufd $0, %xmm1, %xmm1 # xmm1 = xmm1[0,0,0,0] - vpaddd .LCPI0_0(%rip), %xmm1, %xmm2 - vpcmpgtd %xmm2, %xmm0, %xmm0 - vpaddd .LCPI0_1(%rip), %xmm1, %xmm1 - vpcmpgtd %xmm1, %xmm3, %xmm1 - vinsertf128 $1, %xmm0, %ymm1, %ymm0 - vandps 704(%rsp), %ymm0, %ymm11 # 32-byte Folded Reload - vmovmskps %ymm11, %ecx - testl %ecx, %ecx - je .LBB0_19 -# BB#18: # %safe_if_run_true467.us - # in Loop: Header=BB0_17 Depth=3 - movl 640(%rsp), %r11d # 4-byte Reload - leal 24(%r11,%r9), %ecx - movl 528(%rsp), %edx # 4-byte Reload - leal (%rdx,%r9), %ebx - leal 8(%r11,%r9), %edx - movl %edx, 1088(%rsp) # 4-byte Spill - movl 608(%rsp), %edx # 4-byte Reload - leal (%rdx,%r9), %edx - movl %edx, 1048(%rsp) # 4-byte Spill - movl 480(%rsp), 
%edx # 4-byte Reload - leal (%rdx,%r9), %edx - movl %edx, 768(%rsp) # 4-byte Spill - movl 464(%rsp), %edx # 4-byte Reload - leal (%rdx,%r9), %r14d - movl 592(%rsp), %esi # 4-byte Reload - leal (%rsi,%r9), %esi - movl 672(%rsp), %edi # 4-byte Reload - leal (%rdi,%r9), %ebp - leal 16(%r11,%r9), %r12d - leal -8(%rdi,%r9), %r13d - movl 336(%rsp), %edx # 4-byte Reload - leal (%rdx,%r9), %edx - movl %edx, 832(%rsp) # 4-byte Spill - movl 272(%rsp), %edx # 4-byte Reload - leal (%rdx,%r9), %edx - movl %edx, 800(%rsp) # 4-byte Spill - leal 8(%rdi,%r9), %r10d - movl 544(%rsp), %r8d # 4-byte Reload - leal (%r8,%r9), %edx - movl %edx, 960(%rsp) # 4-byte Spill - leal (%r11,%r9), %edx - movl %edx, 928(%rsp) # 4-byte Spill - movl 416(%rsp), %edi # 4-byte Reload - leal (%rdi,%r9), %edx - movl %edx, 896(%rsp) # 4-byte Spill - movl 400(%rsp), %edi # 4-byte Reload - leal (%rdi,%r9), %edx - movl %edx, 864(%rsp) # 4-byte Spill - movl 288(%rsp), %edx # 4-byte Reload - leal (%rdx,%r9), %edx - movl %edx, 992(%rsp) # 4-byte Spill - movl 352(%rsp), %edi # 4-byte Reload - leal (%rdi,%r9), %r8d - movslq %ecx, %rcx - movq %rcx, 1184(%rsp) # 8-byte Spill - vmaskmovpd (%rax,%rcx), %ymm6, %ymm0 - movslq %r13d, %rcx - movq %rcx, 1152(%rsp) # 8-byte Spill - vmaskmovpd (%rax,%rcx), %ymm6, %ymm1 - vaddpd %ymm0, %ymm1, %ymm0 - movslq %r12d, %rcx - movq %rcx, 1248(%rsp) # 8-byte Spill - movslq %ebx, %rdx - movq %rdx, 1120(%rsp) # 8-byte Spill - vmaskmovpd (%rax,%rdx), %ymm6, %ymm1 - vaddpd %ymm1, %ymm0, %ymm0 - vmaskmovpd (%rax,%rcx), %ymm6, %ymm1 - movslq %ebp, %rcx - movq %rcx, 1216(%rsp) # 8-byte Spill - vmaskmovpd (%rax,%rcx), %ymm6, %ymm2 - vaddpd %ymm1, %ymm2, %ymm1 - movslq %esi, %rsi - movq %rsi, 1056(%rsp) # 8-byte Spill - movslq %r14d, %rdx - vmaskmovpd (%rax,%rdx), %ymm6, %ymm2 - vaddpd %ymm2, %ymm0, %ymm0 - movslq 768(%rsp), %rcx # 4-byte Folded Reload - movslq 1048(%rsp), %rdi # 4-byte Folded Reload - movq %rdi, 1048(%rsp) # 8-byte Spill - vmaskmovpd (%rax,%rsi), %ymm6, %ymm2 - movslq 1088(%rsp), %rsi # 4-byte Folded Reload - movq %rsi, 1088(%rsp) # 8-byte Spill - vmaskmovpd (%rax,%rsi), %ymm6, %ymm3 - movslq %r10d, %r11 - vmaskmovpd (%rax,%r11), %ymm6, %ymm4 - vaddpd %ymm3, %ymm4, %ymm3 - vaddpd %ymm2, %ymm1, %ymm1 - movslq 800(%rsp), %rsi # 4-byte Folded Reload - vmaskmovpd (%rax,%rdi), %ymm6, %ymm7 - vmaskmovpd (%rax,%rcx), %ymm6, %ymm2 - movslq 832(%rsp), %rdi # 4-byte Folded Reload - vmaskmovpd (%rax,%rdi), %ymm6, %ymm8 - vpshufd $80, %xmm11, %xmm4 # xmm4 = xmm11[0,0,1,1] - vaddpd %ymm8, %ymm0, %ymm0 - vaddpd %ymm2, %ymm1, %ymm2 - vaddpd %ymm7, %ymm3, %ymm3 - vmaskmovpd (%rax,%rsi), %ymm6, %ymm1 - movslq 864(%rsp), %r12 # 4-byte Folded Reload - movslq 896(%rsp), %rbx # 4-byte Folded Reload - vpshufd $-6, %xmm11, %xmm7 # xmm7 = xmm11[2,2,3,3] - vinsertf128 $1, %xmm7, %ymm4, %ymm12 - movslq 928(%rsp), %r13 # 4-byte Folded Reload - movslq 960(%rsp), %r10 # 4-byte Folded Reload - vmaskmovpd (%rax,%r10), %ymm6, %ymm4 - vaddpd %ymm4, %ymm3, %ymm4 - vmaskmovpd (%rax,%r13), %ymm12, %ymm7 - vmaskmovpd (%rax,%rbx), %ymm6, %ymm8 - vextractf128 $1, %ymm11, %xmm3 - vmaskmovpd (%rax,%r12), %ymm6, %ymm9 - vaddpd %ymm9, %ymm2, %ymm2 - movslq 992(%rsp), %rbp # 4-byte Folded Reload - vmaskmovpd (%rax,%rbp), %ymm6, %ymm9 - vaddpd %ymm9, %ymm2, %ymm2 - vaddpd %ymm1, %ymm0, %ymm1 - vmulpd %ymm14, %ymm7, %ymm0 - vaddpd %ymm8, %ymm4, %ymm4 - vmaskmovpd (%rax,%r13), %ymm6, %ymm7 - movslq %r8d, %r8 - vmaskmovpd (%rax,%r8), %ymm6, %ymm8 - vaddpd %ymm8, %ymm4, %ymm8 - vmovapd %ymm10, %ymm14 - vmulpd %ymm7, %ymm14, %ymm7 - vpshufd $-6, 
%xmm3, %xmm4 # xmm4 = xmm3[2,2,3,3] - vpshufd $80, %xmm3, %xmm3 # xmm3 = xmm3[0,0,1,1] - movq 1480(%rsp), %r14 - vmaskmovpd (%r14,%r13), %ymm12, %ymm9 - vsubpd %ymm9, %ymm0, %ymm0 - vmulpd %ymm1, %ymm13, %ymm1 - vmulpd %ymm2, %ymm15, %ymm2 - vmovupd 1344(%rsp), %ymm9 # 32-byte Folded Reload - vmovupd %ymm9, 1344(%rsp) # 32-byte Folded Spill - vmulpd %ymm8, %ymm9, %ymm8 - vaddpd %ymm7, %ymm8, %ymm7 - vmaskmovpd 32(%rax,%rsi), %ymm5, %ymm8 - vmovupd %ymm8, 992(%rsp) # 32-byte Folded Spill - vinsertf128 $1, %xmm4, %ymm3, %ymm11 - vmaskmovpd 32(%rax,%rdi), %ymm5, %ymm3 - vmovupd %ymm3, 960(%rsp) # 32-byte Folded Spill - vaddpd %ymm7, %ymm2, %ymm2 - vmaskmovpd 32(%rax,%rdx), %ymm5, %ymm3 - vmovupd %ymm3, 928(%rsp) # 32-byte Folded Spill - vaddpd %ymm1, %ymm2, %ymm1 - movq 1464(%rsp), %rdx - vmaskmovpd (%rdx,%r13), %ymm12, %ymm2 - vmulpd %ymm2, %ymm1, %ymm1 - movq 1120(%rsp), %rsi # 8-byte Reload - vmaskmovpd 32(%rax,%rsi), %ymm5, %ymm2 - vmovupd %ymm2, 1120(%rsp) # 32-byte Folded Spill - vaddpd %ymm1, %ymm0, %ymm0 - vmovupd %ymm0, 736(%rsp) # 32-byte Folded Spill - vmaskmovpd 32(%rax,%r8), %ymm5, %ymm0 - vmovupd %ymm0, 896(%rsp) # 32-byte Folded Spill - movq 1184(%rsp), %rsi # 8-byte Reload - vmaskmovpd 32(%rax,%rsi), %ymm5, %ymm0 - vmovupd %ymm0, 1184(%rsp) # 32-byte Folded Spill - movq 1152(%rsp), %rsi # 8-byte Reload - vmaskmovpd 32(%rax,%rsi), %ymm5, %ymm0 - vmovupd %ymm0, 1152(%rsp) # 32-byte Folded Spill - vmaskmovpd 32(%rax,%rbp), %ymm5, %ymm0 - vmovupd %ymm0, 832(%rsp) # 32-byte Folded Spill - vmaskmovpd 32(%rax,%r12), %ymm5, %ymm0 - vmovupd %ymm0, 800(%rsp) # 32-byte Folded Spill - vmaskmovpd 32(%rax,%rcx), %ymm5, %ymm0 - vmovupd %ymm0, 768(%rsp) # 32-byte Folded Spill - vmaskmovpd 32(%rax,%r13), %ymm5, %ymm0 - vmovupd %ymm0, 864(%rsp) # 32-byte Folded Spill - movq 1056(%rsp), %rcx # 8-byte Reload - vmaskmovpd 32(%rax,%rcx), %ymm5, %ymm0 - vmovupd %ymm0, 1056(%rsp) # 32-byte Folded Spill - vmaskmovpd 32(%rax,%rbx), %ymm5, %ymm7 - vmaskmovpd 32(%rax,%r10), %ymm5, %ymm10 - movq 1048(%rsp), %rcx # 8-byte Reload - vmaskmovpd 32(%rax,%rcx), %ymm5, %ymm0 - movq 1088(%rsp), %rcx # 8-byte Reload - vmaskmovpd 32(%rax,%rcx), %ymm5, %ymm1 - vmaskmovpd 32(%rax,%r11), %ymm5, %ymm2 - movq 1248(%rsp), %rcx # 8-byte Reload - vmaskmovpd 32(%rax,%rcx), %ymm5, %ymm3 - movq 1216(%rsp), %rcx # 8-byte Reload - vmaskmovpd 32(%rax,%rcx), %ymm5, %ymm4 - vmaskmovpd 32(%rdx,%r13), %ymm11, %ymm8 - vmovupd %ymm8, 1248(%rsp) # 32-byte Folded Spill - vmaskmovpd 32(%r14,%r13), %ymm11, %ymm8 - vmovupd %ymm8, 1216(%rsp) # 32-byte Folded Spill - vmaskmovpd 32(%rax,%r13), %ymm11, %ymm8 - vmovupd %ymm8, 1088(%rsp) # 32-byte Folded Spill - vmovupd 736(%rsp), %ymm8 # 32-byte Folded Reload - vmaskmovpd %ymm8, %ymm12, (%r14,%r13) - vaddpd %ymm3, %ymm4, %ymm3 - vaddpd %ymm1, %ymm2, %ymm1 - vaddpd %ymm0, %ymm1, %ymm0 - vaddpd %ymm10, %ymm0, %ymm0 - vaddpd %ymm7, %ymm0, %ymm1 - vaddpd 1056(%rsp), %ymm3, %ymm0 # 32-byte Folded Reload - vaddpd 768(%rsp), %ymm0, %ymm0 # 32-byte Folded Reload - vaddpd 800(%rsp), %ymm0, %ymm0 # 32-byte Folded Reload - vaddpd 832(%rsp), %ymm0, %ymm2 # 32-byte Folded Reload - vmovupd 1152(%rsp), %ymm0 # 32-byte Folded Reload - vaddpd 1184(%rsp), %ymm0, %ymm0 # 32-byte Folded Reload - vmulpd %ymm2, %ymm15, %ymm2 - vaddpd 896(%rsp), %ymm1, %ymm1 # 32-byte Folded Reload - vmulpd %ymm1, %ymm9, %ymm1 - vmulpd 864(%rsp), %ymm14, %ymm3 # 32-byte Folded Reload - vmovapd %ymm14, %ymm10 - vaddpd %ymm3, %ymm1, %ymm3 - vmovapd .LCPI0_2(%rip), %ymm4 - vmovupd 1088(%rsp), %ymm1 # 32-byte Folded Reload - vmulpd 
%ymm4, %ymm1, %ymm1 - vmovapd %ymm4, %ymm14 - vsubpd 1216(%rsp), %ymm1, %ymm1 # 32-byte Folded Reload - vaddpd %ymm3, %ymm2, %ymm2 - vaddpd 1120(%rsp), %ymm0, %ymm0 # 32-byte Folded Reload - vaddpd 928(%rsp), %ymm0, %ymm0 # 32-byte Folded Reload - vaddpd 960(%rsp), %ymm0, %ymm0 # 32-byte Folded Reload - vaddpd 992(%rsp), %ymm0, %ymm0 # 32-byte Folded Reload - vmulpd %ymm0, %ymm13, %ymm0 - vaddpd %ymm0, %ymm2, %ymm0 - vmulpd 1248(%rsp), %ymm0, %ymm0 # 32-byte Folded Reload - vaddpd %ymm0, %ymm1, %ymm0 - vmaskmovpd %ymm0, %ymm11, 32(%r14,%r13) -.LBB0_19: # %safe_if_after_true466.us - # in Loop: Header=BB0_17 Depth=3 - addl $64, %r9d - addl $8, %r15d - cmpl 1308(%rsp), %r15d # 4-byte Folded Reload - jl .LBB0_17 -# BB#20: # %for_exit289.us - # in Loop: Header=BB0_21 Depth=2 - movl 224(%rsp), %ecx # 4-byte Reload - addl -76(%rsp), %ecx # 4-byte Folded Reload - movl 208(%rsp), %edx # 4-byte Reload - incl %edx - cmpl -72(%rsp), %edx # 4-byte Folded Reload - jne .LBB0_21 -.LBB0_5: # %for_exit278 - # in Loop: Header=BB0_3 Depth=1 - movl 160(%rsp), %ecx # 4-byte Reload - addl -104(%rsp), %ecx # 4-byte Folded Reload - movl %ecx, 160(%rsp) # 4-byte Spill - movl -88(%rsp), %edi # 4-byte Reload - incl %edi - movl -92(%rsp), %ecx # 4-byte Reload - cmpl %ecx, %edi - jne .LBB0_3 -.LBB0_6: # %for_exit - addq $1384, %rsp # imm = 0x568 - popq %rbx - popq %r12 - popq %r13 - popq %r14 - popq %r15 - popq %rbp - vzeroupper - ret -.Ltmp0: - .size stencil_step___uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_, .Ltmp0-stencil_step___uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_ - - .align 16, 0x90 - .type stencil_step_task___uniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_,@function -stencil_step_task___uniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_: # @stencil_step_task___uniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_ -# BB#0: # %allocas - pushq %rbp - pushq %r15 - pushq %r14 - pushq %rbx - subq $56, %rsp - movq %rdi, %rax - movl 16(%rax), %r8d - movq 56(%rax), %rbx - movq 48(%rax), %r15 - movq 40(%rax), %r14 - movq 32(%rax), %r11 - leal 1(%r8,%rcx), %r9d - movl 24(%rax), %r10d - vmovaps 64(%rax), %ymm0 - addl %ecx, %r8d - movl 20(%rax), %ebp - movl 12(%rax), %ecx - movl 8(%rax), %edx - movl (%rax), %edi - movl 4(%rax), %esi - vmovmskps %ymm0, %eax - cmpl $255, %eax - jne .LBB1_2 -# BB#1: # %all_on - vpcmpeqd %xmm0, %xmm0, %xmm0 - movq %rbx, 40(%rsp) - movq %r15, 32(%rsp) - movq %r14, 24(%rsp) - movq %r11, 16(%rsp) - movl %r10d, 8(%rsp) - movl %ebp, (%rsp) - vinsertf128 $1, %xmm0, %ymm0, %ymm0 - jmp .LBB1_3 -.LBB1_2: # %some_on - movq %rbx, 40(%rsp) - movq %r15, 32(%rsp) - movq %r14, 24(%rsp) - movq %r11, 16(%rsp) - movl %r10d, 8(%rsp) - movl %ebp, (%rsp) -.LBB1_3: # %some_on - callq stencil_step___uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_ - addq $56, %rsp - popq %rbx - popq %r14 - popq %r15 - popq %rbp - vzeroupper - ret -.Ltmp1: - .size stencil_step_task___uniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_, .Ltmp1-stencil_step_task___uniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_ - - .globl loop_stencil_ispc_tasks - .align 16, 0x90 - .type loop_stencil_ispc_tasks,@function -loop_stencil_ispc_tasks: # @loop_stencil_ispc_tasks -# BB#0: # %allocas - pushq %rbp - pushq %r15 - pushq %r14 - pushq %r13 - pushq %r12 - pushq %rbx - subq $104, %rsp - movl 
%r9d, 92(%rsp) # 4-byte Spill - movl %r8d, 88(%rsp) # 4-byte Spill - movl %ecx, 84(%rsp) # 4-byte Spill - movl %edx, 80(%rsp) # 4-byte Spill - movl %esi, %ebx - movl %edi, %ebp - movq $0, 96(%rsp) - cmpl %ebx, %ebp - jge .LBB2_10 -# BB#1: # %for_loop.lr.ph - movq 216(%rsp), %r13 - movl 168(%rsp), %r14d - movl 160(%rsp), %r12d - subl %r12d, %r14d - leaq 96(%rsp), %r15 - vpcmpeqd %xmm0, %xmm0, %xmm0 - vinsertf128 $1, %xmm0, %ymm0, %ymm1 - vmovups %ymm1, 32(%rsp) # 32-byte Folded Spill - vinsertf128 $1, %xmm0, %ymm0, %ymm0 - vmovups %ymm0, (%rsp) # 32-byte Folded Spill - .align 16, 0x90 -.LBB2_2: # %for_loop - # =>This Inner Loop Header: Depth=1 - movq %r15, %rdi - movl $96, %esi - movl $32, %edx - vzeroupper - callq ISPCAlloc - movq %rax, %rdx - movl 80(%rsp), %eax # 4-byte Reload - movl %eax, (%rdx) - movl 84(%rsp), %eax # 4-byte Reload - movl %eax, 4(%rdx) - movl 88(%rsp), %eax # 4-byte Reload - movl %eax, 8(%rdx) - movl 92(%rsp), %eax # 4-byte Reload - movl %eax, 12(%rdx) - movl %r12d, 16(%rdx) - movl 176(%rsp), %eax - movl %eax, 20(%rdx) - movl 184(%rsp), %eax - movl %eax, 24(%rdx) - testb $1, %bpl - movl 192(%rsp), %eax - movl %eax, 28(%rdx) - movq 200(%rsp), %rax - movq %rax, 32(%rdx) - movq 208(%rsp), %rax - movq %rax, 40(%rdx) - jne .LBB2_4 -# BB#3: # %if_then - # in Loop: Header=BB2_2 Depth=1 - movq %r13, 48(%rdx) - movq 224(%rsp), %rax - movq %rax, 56(%rdx) - vmovups 32(%rsp), %ymm0 # 32-byte Folded Reload - jmp .LBB2_5 - .align 16, 0x90 -.LBB2_4: # %if_else - # in Loop: Header=BB2_2 Depth=1 - movq 224(%rsp), %rax - movq %rax, 48(%rdx) - movq %r13, 56(%rdx) - vmovups (%rsp), %ymm0 # 32-byte Folded Reload -.LBB2_5: # %if_else - # in Loop: Header=BB2_2 Depth=1 - vmovaps %ymm0, 64(%rdx) - movq %r15, %rdi - movl $stencil_step_task___uniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_, %esi - movl %r14d, %ecx - movl $1, %r8d - movl $1, %r9d - vzeroupper - callq ISPCLaunch - movq 96(%rsp), %rdi - testq %rdi, %rdi - je .LBB2_7 -# BB#6: # %call_sync - # in Loop: Header=BB2_2 Depth=1 - callq ISPCSync - movq $0, 96(%rsp) -.LBB2_7: # %post_sync - # in Loop: Header=BB2_2 Depth=1 - incl %ebp - cmpl %ebp, %ebx - jne .LBB2_2 -# BB#8: # %for_exit - movq 96(%rsp), %rdi - testq %rdi, %rdi - je .LBB2_10 -# BB#9: # %call_sync72 - callq ISPCSync - movq $0, 96(%rsp) -.LBB2_10: # %post_sync73 - addq $104, %rsp - popq %rbx - popq %r12 - popq %r13 - popq %r14 - popq %r15 - popq %rbp - ret -.Ltmp2: - .size loop_stencil_ispc_tasks, .Ltmp2-loop_stencil_ispc_tasks - - - .section ".note.GNU-stack","",@progbits diff --git a/examples_cuda/stencil/stencil_cu_avx.bc b/examples_cuda/stencil/stencil_cu_avx.bc deleted file mode 100644 index d9338e7cfdf08cacd887729e1069415814417945..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 9820 zcmcIp3p~{6`X4i6MrMp5h0qy=t!>IR(Zr5xiX0AGY-rqd)@8zu*e)2Q`$lq!YW8f! 
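
For orientation while reviewing the deleted generated code: as far as it can be read off the AVX listing above and the PTX string embedded as .L.ptx_str further down, both compute the same radius-3 finite-difference update, Aout[i] = 2*Ain[i] - Aout[i] + vsq[i] * (coef[0]*Ain[i] + coef[1]*(sum of the six +/-1 neighbours) + coef[2]*(sum of the six +/-2 neighbours) + coef[3]*(sum of the six +/-3 neighbours)), with neighbours taken along x (stride 1), y (stride Nx) and z (stride Nx*Ny). The CUDA sketch below is an illustrative equivalent only; the kernel name, parameter order and block size are assumptions made for this note and are not code from the patch.

// Illustrative CUDA equivalent of the deleted stencil kernels (a sketch, not
// part of this patch).  One thread block handles one z-slice, mirroring the
// ctaid.x mapping in the embedded PTX.
__global__ void stencil_step_slice(int x0, int x1, int y0, int y1, int z0,
                                   int Nx, int Ny,
                                   const double *coef,   // coef[0..3]
                                   const double *vsq,
                                   const double *Ain,
                                   double *Aout)
{
    const int Nxy = Nx * Ny;
    const int z = z0 + blockIdx.x;                 // one slice per block
    for (int y = y0; y < y1; ++y) {
        const int row = z * Nxy + y * Nx;
        for (int x = x0 + threadIdx.x; x < x1; x += blockDim.x) {
            const int i = row + x;
            double div =
                coef[0] *  Ain[i] +
                coef[1] * (Ain[i + 1]     + Ain[i - 1]     +
                           Ain[i + Nx]    + Ain[i - Nx]    +
                           Ain[i + Nxy]   + Ain[i - Nxy])  +
                coef[2] * (Ain[i + 2]     + Ain[i - 2]     +
                           Ain[i + 2*Nx]  + Ain[i - 2*Nx]  +
                           Ain[i + 2*Nxy] + Ain[i - 2*Nxy]) +
                coef[3] * (Ain[i + 3]     + Ain[i - 3]     +
                           Ain[i + 3*Nx]  + Ain[i - 3*Nx]  +
                           Ain[i + 3*Nxy] + Ain[i - 3*Nxy]);
            Aout[i] = 2.0 * Ain[i] - Aout[i] + vsq[i] * div;
        }
    }
}

A matching launch shape would be one block per z-slice, e.g. stencil_step_slice<<<z1 - z0, 32>>>(...), which mirrors the one-task-per-slice mapping visible in the deleted loop_stencil_ispc_tasks code above (launch count z1 - z0, buffers swapped between even and odd time steps).
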
z|7=@nq(m#1lpWW!MY2&SC)$d%N_C$1_xmYoQ~&*Zj?ehb`+I-C_j%s;dA`s0dER$y z?JoYN%OsI9rjSUCWDbc;B9T<#pPn>3&E^Vq8I5gXnJ$;Y)R$M#oJrjokf`QBwoPZ) z9$}lPStw{MQ#7&Hb5h)^JG=THiA+u1Xg&HBCzXH#Nddp+ksQcz*C3H*z_-@6&e!lq z=ihupuS6~7MEfQ?iz`)@85FCc3X6+&R;0U+SWnzoeL{K!n>C!MOtMk#;way9V+}RY zdfixEeC4Mcu+uqdkHwNgmSKp$Ge)MSAz2fnw% zMH9YFW;spa1Z|=2n{^}wE_YMD->atTvSSw`<%kPg7(;b3Cn?VDql6uyN=(Rni2~hy zmf2F0VhmN_rcmTHdsVuWZTsUHN1;MuP?GD{Kb^V$)z=XXmfH;|vz&MLYTFpegN~_2 z8!l3Kf5`JBV)FueoEJTAed@QuRC7UUtS~j&`GFALs#nS<1=dvk4~4NsidFg7vJ4%5T0;2oia4j<4PVWf;V@Ky9CqWrcrWh=(q1q|RoTfG_T)8v*>g=W_jW7HRFFlZ7oY@f^pY3NLu8wE# zTN{y_w{p%jhU(VK4^}p>SXt($xJBld+1L`bD6DtKT%Vy+qdv#Py#v1=uOT_=Ov|6%eo=@;1-$V5JzTjsT^{*QGg8#^x zdFZ73jughs%+z+ys*MJecb=6$I<@51t4gnrzEbn?SAO~UD-9oi@x&HB2Zhrig z-`>8`;YcazjJ3QnEhN7&lFc&9&+52q_JWt6)wL#J{m-u|tDXp7>U%cxB0h-gQOv!AutCq_%j zGhG@l&8t7_fr~b243ebhERwZt;5k(48y>byzuG&d;r#adcv6I2{ zq1HIuq&mnn*2;r>#OK|`8^t>8!#J!2!g9GgAj)Su*3`!A% z2mXzj+%GVx_)oE0{LnQm>_*#y;dTV6?dfI6D+|F7`2fDLH9(BP8iqeKvpe|Aaw5#! z{uLBb#J;v}Pa(fOJS~ee|OAHv29&7Y2=WT z{0!Y{M>99~^;r6DjXCNojMwyK?UO~ze>BH&njXmiXQbf%kaqQk@-9vginwn$0h+E$ zp9Oqkii9j`7kgj@&G>edh0~mrjWO78anPp^=QObohU~Mcd#1%w1-h8A&1-8*vm|F$ z^QQJO&O%`Su1&O~y38fDv?2J$1{APD;$08_>a$ke$ImgVs{8b5J8Xj}1lf|Cfxgrj8F*OyyV7Uu`890#~*W7eXW6P>eoaZSmSJ-kYidpOd zUxq8_lNKiz{5b461IXm(SSj4p+Ti%VgXMz>$GrnuEwIw%e{$WH`61|^?ZA0yi3r6* zo2#x`<s5)9R!>;ULi1FZkqLzvvXXeb zc}SC_0%Glu5v=~EVDFHBkJI}?iLNX2kM=%6QIQE3pa0C`Q?)~txAneRE;Hcg+rqtW z3NNnnt4;Fu(CXE;i?aIz%6$j%q-cQv)`4@u{%+&;@fojB8(B~Y7IXy*`kS=jEbK!= zVXv_VoEib-;ipb8iYx%$oLPk&9+B^qQX7P$H97}#KT=dao+lRMBA#Gg21+*uFuYB) zCNsv=BFDW|yYh|Nbv%bzE(T+2!^hOdN9H@#JqRGwwjnwUL~2h?P>Uu=sD)JiR_*CG ztDTuUet@<3&45A}@V3^FqA;uyDMH}=)Hf+6prXbiZuo`L;SQg*t2UnaG!$^yW?aM4 zJ`4p@wwyZ2wC7g!-S=~mn>0u;fB|;VlqO(s5giNBzaRTF?SMfl*}b{Xa!#KptLHr^ zE*ekU;gFf{<5gE>(CM#x&5k*K^$QH!}W-|9FaHMAXmKs%9Az( zlu%Rf*kQ;(?^9l52=X*w;-acHO;uJUNda8{Mo?b($0#pZTHd{g*T10j>WSw*pJ+iz zkrv7ii`l2O4YGQykf)Qj1zYHJSEQi|Z+ z7{+yDKrLJ&I{$U#4<&d?mWIOmtbZiHhl&#@RZuv6iwxNh;CowgnBgS=)KV%2`-or; zMcD%Mac;S$X|(s9DB0bMmjIZQk>ijpkaO@_1DAV$m0(9C$|Z}w`(Bw#x2Jfyc`K_M9W-BkPcn}qDi3Q z64VxAb!(mZj5+Qm(skUXV!hBtC>h@}Q6ow=);|>T=WD{O|FY<)$wZp~RsLlMTo!yt z^a4b3BjJFMH|69AoHKyw@X9fe*Rk+yvk#s$Fb5p|6^cZJ5L8%$B~S{%j3olIo~lLL zx#byk{jE@!XLk2{$BhqkZ$!<(#t5zT5a`juZKq|(p zUH=f;#B9Lw7?=Q6&4mXa9%v6E3S?|WjsGMdg>VKk(fTkF;DK{mgRvn1JcCg*zqU;Zo1eFN8Vg?;O8V9)Pg@=sfV=`!iN!ktErL|5Ya9TRs06U^4LyTX zB^*VdZ5hTP7-)6CIpL>e;b;eD%FBcu9}q6uPrs==FpouJ^~5^_j}b1|1^DH!2c__X zL5WJ0|4C5p{$NmA{%?YEKCt3gP)@*nIGzpP$NVvr_$FY!nEPj#566lOREoWWc~mS| zLyh_8m>;V=CeJIG6%#Q(j_X#S#85n*>%yUa8P9dNXmiAMZKx6mts0-QGHWr>$KVtI z(I}C9WspK@>oP9^o94{l3Gj~h9m{c@gv_!%zMpD|rbanmNc!nj21($-2yBv*sg~)gjXlB^B)s7RiBTIy! 
zW#+UIF-B;IMF)knAnm(9G{6`l2SS_376=R`QaHkZLm48y5O%?&V2VUcD4TA6Q%nSG zDOy7~xZgw?3<=slGSxTwEb%NCTOalelX(KtL=3M#y>#r{2Ad$-NatV{G@oDz5w@a5 zXbqPI6D;vHn&1g!LEiKU3-d*mz{e3=nz1y9EespCrvuNMZ_BOp9Bf=MY=QOa22MFY z90LQkXp~b4nMHrLKHO%(CTN~A#QLsrNOSUkMB?pq}KOD-5gcQ?F$~s zT5vMU_y4g`al5I6HkvvL-f;f%fO=&ycN>W}+ij0{ z$4hH(dp}+sw>h*bs$<2gh8=5GX&yc{PxXn(#{B#9)I8184gRIFweVr5&OsXSL1FBBg4VZ6ZB>Jv}CS9kZ9DJiulRC9+JJwlga$bQ-?b z4l$C)UK4B8QLL`mS?!`&)vezZE2Mv`K#zAy6$U4Fw{b%0qQg7AOxdo!kwp$(%F9A8Dl@?_b-ZeYkxhkfEf2uH>p74jf$cY{| zCsnhqlGSCTJoJbGzZv(tDGzkAV9x6}B+_*F776F?863T8Z$>^nx@}0|HPvZHko=T; zq65X;Jp#&(HEz2fIeU1nCf)g><*Bnr5P3V^6GT(IC;DoN+iszT+^o}cF9+-vkajj& zP|mQ+%^W~1$)7_a)xz(PQ>o5e^z)=L{9jWHqe`gTrV07pyGR*)l?)r*3|r$2JDUu9 zclL~5jA*@qKY!1(6Il0Z{lXz*H*)9({itcG!l2NqU8*l7CK*pnG8P!s*$bE&Oj8vN z3a!e8T9a5|^r7xFV*$U;K8UHoHdWC?LhNf2<&3AUGZqLwT?nr7Jt0Uo(=l9Ij=5+w z>2Hz>uAPLsAAb9_WBO}*=b#5#|(0#uM-Lq zMcXQv5BaPGOn6CRGqbK@yWCA(+QL@K;&aA)UiC%VMmPFW8VtPEsodZB3u%B&(N3w< z$NbGGm(AS~2P@u)Bqgy|}|R;aO*K-h7c&+{seg0QXTUh6SlFxJIFg?xgsMtHncS)+0Dl8H!wk64to|71N+*neky{;Q~Z`W4A^oxQ_n`Oxr z+q?==2;NqAxlO`}qZdVW(*{b!T z?xN#xv9&?n1^=qmmL+j5ORC#e9;K>ZE(01P`s@VHF2^xGlw4V;$ohdZ@USf@0|o{MUrKSV!C| zveMr#W;Uv|op&G_7n(Wzpd}u$Ei1*Ow8t*Z8`o4MseMuCloHH7V9E@^JAXP%9<;bR zkMOOY9G~ zwHW&E7OcTvwv5o&;^=7~=#SryL>RV^fwV>)yWJW$lj={g%`-h&CQCdr9`s&jv*o#) z6I~hxb3s&GgeSG|);Sg0l9qCdi*7fDUUM@%bEnyGpG8isjMv;E3|Og~mm^QpAP-(U z6X8WI>?~TVQk{Ie7q`$REp;32xyUcDnGT$fheyw7DVRln%k{P4F7(y%gX`{GbiJ^K zJewqYg3S3eA{d|Pw~?WEgBK;B7IqKo=#fux%TpF6%j+UDSGOBDvYij*vN3CI*hF^f zvVVQY;=B1@s!kjAerd=u8r4MC*0W0Z`-7SILaT2S6OFsqiN^1a*u>n|tX9FKSw%Wx z(|%fd0r%3=qDw^MY(3F9|0~hBkfu_7`tDL8=h7EC_JJYP!v4~ye7>5T2G1eG=%`0g z`&LIbLnY1dbg+2rY1t(WWTK#r z4&cnd6-zN3)#$NzPN4}H{DqZ|wbJaYUdx$P*^cg{FzG^)?2bZWECnq1#X8-6N;D>U z&`T_A?&bKLc?rgIdTr`3%n=mXD0pp@bH&YEZvQGqLvZmd$#b@Lxr*hr!N2iMxz!j@ w&b+p-*3@=-JXPJqJ7sUPfo{;5S`6bscg-This Inner Loop Header: Depth=1 - movq %rax, %r12 - movq %r12, %rdi - movl $96, %esi - movl $32, %edx - callq CUDAAlloc - testb $1, %r13b - jne .LBB0_4 -# BB#3: # %if_then - # in Loop: Header=BB0_2 Depth=1 - movl %ebx, 252(%rsp) - leaq 252(%rsp), %rax - movq %rax, 256(%rsp) - movl 20(%rsp), %eax # 4-byte Reload - movl %eax, 248(%rsp) - leaq 248(%rsp), %rax - movq %rax, 264(%rsp) - movl 24(%rsp), %eax # 4-byte Reload - movl %eax, 244(%rsp) - leaq 244(%rsp), %rax - movq %rax, 272(%rsp) - movl 28(%rsp), %eax # 4-byte Reload - movl %eax, 240(%rsp) - leaq 240(%rsp), %rax - movq %rax, 280(%rsp) - movl %r15d, 236(%rsp) - leaq 236(%rsp), %rax - movq %rax, 288(%rsp) - movl 32(%rbp), %eax - movl %eax, 232(%rsp) - leaq 232(%rsp), %rax - movq %rax, 296(%rsp) - movl 40(%rbp), %eax - movl %eax, 228(%rsp) - leaq 228(%rsp), %rax - movq %rax, 304(%rsp) - movl 48(%rbp), %eax - movl %eax, 224(%rsp) - leaq 224(%rsp), %rax - movq %rax, 312(%rsp) - movq 56(%rbp), %rax - movq %rax, 216(%rsp) - leaq 216(%rsp), %rax - movq %rax, 320(%rsp) - movq 64(%rbp), %rax - movq %rax, 208(%rsp) - leaq 208(%rsp), %rax - movq %rax, 328(%rsp) - movq 72(%rbp), %rax - movq %rax, 200(%rsp) - leaq 200(%rsp), %rax - movq %rax, 336(%rsp) - movq 80(%rbp), %rax - movq %rax, 192(%rsp) - leaq 192(%rsp), %rax - movq %rax, 344(%rsp) - movl $1, 8(%rsp) - movl $1, (%rsp) - movq %r12, %rdi - movl $.L.module_str, %esi - movl $.L.ptx_str, %edx - movl $.L.func_str, %ecx - leaq 256(%rsp), %r8 - jmp .LBB0_5 - .align 16, 0x90 -.LBB0_4: # %if_else - # in Loop: Header=BB0_2 Depth=1 - movl %ebx, 92(%rsp) - leaq 92(%rsp), %rax - movq %rax, 96(%rsp) - movl 20(%rsp), %eax # 4-byte Reload - movl %eax, 88(%rsp) - leaq 88(%rsp), %rax - movq %rax, 104(%rsp) - movl 
24(%rsp), %eax # 4-byte Reload - movl %eax, 84(%rsp) - leaq 84(%rsp), %rax - movq %rax, 112(%rsp) - movl 28(%rsp), %eax # 4-byte Reload - movl %eax, 80(%rsp) - leaq 80(%rsp), %rax - movq %rax, 120(%rsp) - movl %r15d, 76(%rsp) - leaq 76(%rsp), %rax - movq %rax, 128(%rsp) - movl 32(%rbp), %eax - movl %eax, 72(%rsp) - leaq 72(%rsp), %rax - movq %rax, 136(%rsp) - movl 40(%rbp), %eax - movl %eax, 68(%rsp) - leaq 68(%rsp), %rax - movq %rax, 144(%rsp) - movl 48(%rbp), %eax - movl %eax, 64(%rsp) - leaq 64(%rsp), %rax - movq %rax, 152(%rsp) - movq 56(%rbp), %rax - movq %rax, 56(%rsp) - leaq 56(%rsp), %rax - movq %rax, 160(%rsp) - movq 64(%rbp), %rax - movq %rax, 48(%rsp) - leaq 48(%rsp), %rax - movq %rax, 168(%rsp) - movq 80(%rbp), %rax - movq %rax, 40(%rsp) - leaq 40(%rsp), %rax - movq %rax, 176(%rsp) - movq 72(%rbp), %rax - movq %rax, 32(%rsp) - leaq 32(%rsp), %rax - movq %rax, 184(%rsp) - movl $1, 8(%rsp) - movl $1, (%rsp) - movq %r12, %rdi - movl $.L.module_str, %esi - movl $.L.ptx_str, %edx - movl $.L.func_str1, %ecx - leaq 96(%rsp), %r8 -.LBB0_5: # %if_else - # in Loop: Header=BB0_2 Depth=1 - movl %r14d, %r9d - callq CUDALaunch - movq 352(%rsp), %rdi - testq %rdi, %rdi - je .LBB0_7 -# BB#6: # %call_sync - # in Loop: Header=BB0_2 Depth=1 - callq ISPCSync - movq $0, 352(%rsp) -.LBB0_7: # %post_sync - # in Loop: Header=BB0_2 Depth=1 - incl %r13d - cmpl %r13d, 16(%rsp) # 4-byte Folded Reload - movq %r12, %rax - jne .LBB0_2 -# BB#8: # %for_exit - movq 352(%rsp), %rdi - testq %rdi, %rdi - je .LBB0_10 -# BB#9: # %call_sync113 - callq ISPCSync - movq $0, 352(%rsp) -.LBB0_10: # %post_sync114 - leaq -40(%rbp), %rsp - popq %rbx - popq %r12 - popq %r13 - popq %r14 - popq %r15 - popq %rbp - ret -.Ltmp0: - .size loop_stencil_ispc_tasks, .Ltmp0-loop_stencil_ispc_tasks - - .type .L.module_str,@object # @.module_str - .section .rodata,"a",@progbits -.L.module_str: - .asciz "stencil.ispc" - .size .L.module_str, 13 - - .type .L.ptx_str,@object # @.ptx_str - .align 16 -.L.ptx_str: - .asciz "//\n// Generated by LLVM NVPTX Back-End\n//\n\n.version 3.1\n.target sm_35, texmode_independent\n.address_size 64\n\n\t// .globl\tstencil_step_task\n // @stencil_step_task\n.entry stencil_step_task(\n\t.param .u32 stencil_step_task_param_0,\n\t.param .u32 stencil_step_task_param_1,\n\t.param .u32 stencil_step_task_param_2,\n\t.param .u32 stencil_step_task_param_3,\n\t.param .u32 stencil_step_task_param_4,\n\t.param .u32 stencil_step_task_param_5,\n\t.param .u32 stencil_step_task_param_6,\n\t.param .u32 stencil_step_task_param_7,\n\t.param .u64 .ptr .align 8 stencil_step_task_param_8,\n\t.param .u64 .ptr .align 8 stencil_step_task_param_9,\n\t.param .u64 .ptr .align 8 stencil_step_task_param_10,\n\t.param .u64 .ptr .align 8 stencil_step_task_param_11\n)\n{\n\t.reg .pred %p<396>;\n\t.reg .s16 %rc<396>;\n\t.reg .s16 %rs<396>;\n\t.reg .s32 %r<396>;\n\t.reg .s64 %rl<396>;\n\t.reg .f32 %f<396>;\n\t.reg .f64 %fl<396>;\n\n// BB#0: // %allocas\n\tmov.u32 \t%r12, %ctaid.x;\n\tld.param.u32 \t%r13, [stencil_step_task_param_4];\n\tadd.s32 \t%r16, %r12, %r13;\n\tadd.s32 \t%r0, %r16, 1;\n\tsetp.ge.s32 \t%p0, %r16, %r0;\n\t@%p0 bra \tBB0_11;\n// BB#1: // %for_test28.i.preheader.lr.ph\n\tld.param.u32 \t%r0, [stencil_step_task_param_0];\n\tld.param.u32 \t%r1, [stencil_step_task_param_1];\n\tld.param.u32 \t%r2, [stencil_step_task_param_2];\n\tld.param.u32 \t%r3, [stencil_step_task_param_3];\n\tld.param.u32 \t%r4, [stencil_step_task_param_5];\n\tld.param.u32 \t%r5, [stencil_step_task_param_6];\n\tmul.lo.s32 \t%r5, %r5, %r4;\n\tld.param.u64 
\t%rl3, [stencil_step_task_param_8];\n\tld.f64 \t%fl0, [%rl3];\n\tld.f64 \t%fl1, [%rl3+8];\n\tld.param.u64 \t%rl0, [stencil_step_task_param_9];\n\tld.f64 \t%fl2, [%rl3+16];\n\tld.param.u64 \t%rl1, [stencil_step_task_param_10];\n\tld.param.u64 \t%rl2, [stencil_step_task_param_11];\n\tld.f64 \t%fl3, [%rl3+24];\n\tshl.b32 \t%r6, %r4, 1;\n\tmul.lo.s32 \t%r7, %r4, 3;\n\tmul.lo.s32 \t%r8, %r4, -3;\n\tshl.b32 \t%r9, %r5, 1;\n\tmul.lo.s32 \t%r10, %r5, 3;\n\tmul.lo.s32 \t%r11, %r5, -3;\n\tadd.s32 \t%r12, %r12, %r13;\n\tneg.s32 \t%r13, %r9;\n\tneg.s32 \t%r14, %r6;\n\tmov.u32 \t%r32, WARP_SZ;\nBB0_2: // %for_test28.i.preheader\n // =>This Loop Header: Depth=1\n // Child Loop BB0_9 Depth 2\n // Child Loop BB0_5 Depth 3\n\tmov.u32 \t%r15, %r16;\n\tsetp.ge.s32 \t%p0, %r2, %r3;\n\t@%p0 bra \tBB0_10;\n// BB#3: // %for_test35.i.preheader.lr.ph\n // in Loop: Header=BB0_2 Depth=1\n\tsetp.lt.s32 \t%p0, %r0, %r1;\n\t@%p0 bra \tBB0_4;\n\tbra.uni \tBB0_10;\nBB0_4: // in Loop: Header=BB0_2 Depth=1\n\tmul.lo.s32 \t%r16, %r15, %r5;\n\tmov.u32 \t%r17, %r2;\nBB0_9: // %for_loop37.i.lr.ph.us\n // Parent Loop BB0_2 Depth=1\n // => This Loop Header: Depth=2\n // Child Loop BB0_5 Depth 3\n\tmad.lo.s32 \t%r18, %r17, %r4, %r16;\n\tadd.s32 \t%r19, %r18, %r4;\n\tadd.s32 \t%r20, %r18, %r6;\n\tsub.s32 \t%r21, %r18, %r4;\n\tadd.s32 \t%r22, %r18, %r7;\n\tadd.s32 \t%r23, %r18, %r14;\n\tadd.s32 \t%r24, %r18, %r5;\n\tadd.s32 \t%r25, %r18, %r8;\n\tadd.s32 \t%r26, %r18, %r9;\n\tsub.s32 \t%r27, %r18, %r5;\n\tadd.s32 \t%r28, %r18, %r10;\n\tadd.s32 \t%r29, %r18, %r13;\n\tadd.s32 \t%r30, %r18, %r11;\n\tmov.u32 \t%r31, %r0;\nBB0_5: // %for_loop37.i.us\n // Parent Loop BB0_2 Depth=1\n // Parent Loop BB0_9 Depth=2\n // => This Inner Loop Header: Depth=3\n\tmov.u32 \t%r33, %tid.x;\n\tadd.s32 \t%r34, %r32, -1;\n\tand.b32 \t%r33, %r34, %r33;\n\tadd.s32 \t%r33, %r33, %r31;\n\tsetp.ge.s32 \t%p0, %r33, %r1;\n\t@%p0 bra \tBB0_7;\n// BB#6: // %pl_dolane.i.us\n // in Loop: Header=BB0_5 Depth=3\n\tadd.s32 \t%r34, %r18, %r33;\n\tshl.b32 \t%r34, %r34, 3;\n\tadd.s32 \t%r35, %r34, -8;\n\tcvt.s64.s32 \t%rl3, %r35;\n\tadd.s64 \t%rl3, %rl3, %rl1;\n\tld.f64 \t%fl4, [%rl3];\n\tadd.s32 \t%r35, %r34, 8;\n\tcvt.s64.s32 \t%rl3, %r35;\n\tadd.s64 \t%rl3, %rl3, %rl1;\n\tld.f64 \t%fl5, [%rl3];\n\tadd.s32 \t%r35, %r34, -16;\n\tcvt.s64.s32 \t%rl3, %r35;\n\tadd.s64 \t%rl3, %rl3, %rl1;\n\tld.f64 \t%fl6, [%rl3];\n\tadd.s32 \t%r35, %r34, 16;\n\tcvt.s64.s32 \t%rl3, %r35;\n\tadd.s64 \t%rl3, %rl3, %rl1;\n\tld.f64 \t%fl9, [%rl3];\n\tadd.s32 \t%r35, %r19, %r33;\n\tshl.b32 \t%r35, %r35, 3;\n\tcvt.s64.s32 \t%rl3, %r35;\n\tadd.s64 \t%rl3, %rl3, %rl1;\n\tld.f64 \t%fl8, [%rl3];\n\tadd.s32 \t%r35, %r34, -24;\n\tcvt.s64.s32 \t%rl3, %r35;\n\tadd.s64 \t%rl3, %rl3, %rl1;\n\tld.f64 \t%fl7, [%rl3];\n\tadd.s32 \t%r35, %r34, 24;\n\tcvt.s64.s32 \t%rl3, %r35;\n\tadd.s64 \t%rl3, %rl3, %rl1;\n\tld.f64 \t%fl10, [%rl3];\n\tadd.s32 \t%r35, %r20, %r33;\n\tshl.b32 \t%r35, %r35, 3;\n\tcvt.s64.s32 \t%rl3, %r35;\n\tadd.s64 \t%rl3, %rl3, %rl1;\n\tld.f64 \t%fl13, [%rl3];\n\tadd.s32 \t%r35, %r21, %r33;\n\tshl.b32 \t%r35, %r35, 3;\n\tcvt.s64.s32 \t%rl3, %r35;\n\tadd.s64 \t%rl3, %rl3, %rl1;\n\tld.f64 \t%fl12, [%rl3];\n\tadd.s32 \t%r35, %r22, %r33;\n\tshl.b32 \t%r35, %r35, 3;\n\tcvt.s64.s32 \t%rl3, %r35;\n\tadd.s64 \t%rl3, %rl3, %rl1;\n\tld.f64 \t%fl11, [%rl3];\n\tadd.s32 \t%r35, %r23, %r33;\n\tshl.b32 \t%r35, %r35, 3;\n\tcvt.s64.s32 \t%rl3, %r35;\n\tadd.s64 \t%rl3, %rl3, %rl1;\n\tld.f64 \t%fl16, [%rl3];\n\tadd.s32 \t%r35, %r24, %r33;\n\tshl.b32 \t%r35, %r35, 3;\n\tcvt.s64.s32 \t%rl3, %r35;\n\tadd.s64 \t%rl3, 
%rl3, %rl1;\n\tld.f64 \t%fl15, [%rl3];\n\tadd.s32 \t%r35, %r25, %r33;\n\tshl.b32 \t%r35, %r35, 3;\n\tcvt.s64.s32 \t%rl3, %r35;\n\tadd.s64 \t%rl3, %rl3, %rl1;\n\tld.f64 \t%fl14, [%rl3];\n\tadd.s32 \t%r35, %r26, %r33;\n\tshl.b32 \t%r35, %r35, 3;\n\tcvt.s64.s32 \t%rl3, %r35;\n\tadd.s64 \t%rl3, %rl3, %rl1;\n\tld.f64 \t%fl19, [%rl3];\n\tadd.s32 \t%r35, %r27, %r33;\n\tshl.b32 \t%r35, %r35, 3;\n\tcvt.s64.s32 \t%rl3, %r35;\n\tadd.s64 \t%rl3, %rl3, %rl1;\n\tld.f64 \t%fl18, [%rl3];\n\tadd.s32 \t%r35, %r28, %r33;\n\tshl.b32 \t%r35, %r35, 3;\n\tcvt.s64.s32 \t%rl3, %r35;\n\tadd.s64 \t%rl3, %rl3, %rl1;\n\tld.f64 \t%fl17, [%rl3];\n\tadd.s32 \t%r35, %r29, %r33;\n\tshl.b32 \t%r35, %r35, 3;\n\tcvt.s64.s32 \t%rl3, %r35;\n\tadd.s64 \t%rl3, %rl3, %rl1;\n\tld.f64 \t%fl24, [%rl3];\n\tcvt.s64.s32 \t%rl4, %r34;\n\tadd.s64 \t%rl3, %rl4, %rl1;\n\tld.f64 \t%fl21, [%rl3];\n\tadd.s32 \t%r33, %r30, %r33;\n\tshl.b32 \t%r33, %r33, 3;\n\tcvt.s64.s32 \t%rl3, %r33;\n\tadd.s64 \t%rl3, %rl3, %rl1;\n\tld.f64 \t%fl20, [%rl3];\n\tadd.s64 \t%rl3, %rl4, %rl2;\n\tld.f64 \t%fl23, [%rl3];\n\tadd.s64 \t%rl4, %rl4, %rl0;\n\tld.f64 \t%fl22, [%rl4];\n\tadd.f64 \t%fl25, %fl21, %fl21;\n\tsub.f64 \t%fl23, %fl25, %fl23;\n\tadd.f64 \t%fl6, %fl6, %fl9;\n\tadd.f64 \t%fl6, %fl6, %fl13;\n\tadd.f64 \t%fl6, %fl6, %fl16;\n\tadd.f64 \t%fl6, %fl6, %fl19;\n\tadd.f64 \t%fl6, %fl6, %fl24;\n\tadd.f64 \t%fl4, %fl4, %fl5;\n\tadd.f64 \t%fl4, %fl4, %fl8;\n\tadd.f64 \t%fl4, %fl4, %fl12;\n\tadd.f64 \t%fl4, %fl4, %fl15;\n\tadd.f64 \t%fl4, %fl4, %fl18;\n\tmul.f64 \t%fl5, %fl0, %fl21;\n\tfma.rn.f64 \t%fl4, %fl1, %fl4, %fl5;\n\tfma.rn.f64 \t%fl4, %fl2, %fl6, %fl4;\n\tadd.f64 \t%fl5, %fl7, %fl10;\n\tadd.f64 \t%fl5, %fl5, %fl11;\n\tadd.f64 \t%fl5, %fl5, %fl14;\n\tadd.f64 \t%fl5, %fl5, %fl17;\n\tadd.f64 \t%fl5, %fl5, %fl20;\n\tfma.rn.f64 \t%fl4, %fl3, %fl5, %fl4;\n\tfma.rn.f64 \t%fl4, %fl4, %fl22, %fl23;\n\tst.f64 \t[%rl3], %fl4;\nBB0_7: // %safe_if_after_true.i.us\n // in Loop: Header=BB0_5 Depth=3\n\tadd.s32 \t%r31, %r32, %r31;\n\tsetp.lt.s32 \t%p0, %r31, %r1;\n\t@%p0 bra \tBB0_5;\n// BB#8: // %for_exit38.i.us\n // in Loop: Header=BB0_9 Depth=2\n\tadd.s32 \t%r17, %r17, 1;\n\tsetp.eq.s32 \t%p0, %r17, %r3;\n\t@%p0 bra \tBB0_10;\n\tbra.uni \tBB0_9;\nBB0_10: // %for_exit31.i\n // in Loop: Header=BB0_2 Depth=1\n\tadd.s32 \t%r16, %r15, 1;\n\tsetp.ne.s32 \t%p0, %r15, %r12;\n\t@%p0 bra \tBB0_2;\nBB0_11: // %stencil_step___uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_.exit\n\tret;\n}\n\n" - .size .L.ptx_str, 7954 - - .type .L.func_str,@object # @.func_str - .align 16 -.L.func_str: - .asciz "stencil_step_task" - .size .L.func_str, 18 - - .type .L.func_str1,@object # @.func_str1 - .align 16 -.L.func_str1: - .asciz "stencil_step_task" - .size .L.func_str1, 18 - - - .section ".note.GNU-stack","",@progbits diff --git a/examples_cuda/stencil/stencil_cu_nvptx64.bc b/examples_cuda/stencil/stencil_cu_nvptx64.bc deleted file mode 100644 index 2f3c05dadda86c8f7536698a5ab102c3fd7fb8a6..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 5256 zcma)A3sh5Qwmv7X6Cs2|h!6~zG~!hkYUCwWg@gnf6igt&$`oxtWcm;%A|PNbhleFj z@q#q%)w{-N?9yIcYwWdJ7;J5M>d*m25f!dfgJ7AiisBoFM`r)$BvEu`)@2<|4*!3$ z|GoG3?Qd_5RX5(0@i1%-8^b(Hr5FRlu$k~nl*sq1+F0=%LFn=tyqp)z=8A(@A7_w) zI7V!ZXKcA3)HjOjAMX*WnXC1vmdRU_NuF4iGnZ)N3Nwz`jw#QN;9w`opSCx<((ZsC zLdiJhRbGv?bGD75zwv5r5tk^8s>rcbL@FwBxJ1!0qHumiA$H9a^nd++Kil}E7wZ9f zl_^E`;=vvhK7C3}rBa>`1NpmGieW`AVY5MC4Kr+132*HuOfpCC)D-QAAy6(L@vphDN&>#a%==Vz@O7B 
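
The deleted listing above keeps the entire generated PTX module as the string .L.ptx_str and, on every time step, hands it to the example's CUDAAlloc/CUDALaunch/ISPCSync helpers together with the module name "stencil.ispc", the kernel name "stencil_step_task" and an array of twelve argument pointers, swapping the two grid buffers between even and odd steps. The helpers' implementation is not part of this diff, so the sketch below is only an assumption about the kind of CUDA driver API sequence such a launch wraps; the function name, the block size and the omitted error handling are all placeholders.

// Hedged sketch of a JIT-and-launch helper in the spirit of CUDALaunch.
// Assumes cuInit() has been called and a current context exists.
#include <cuda.h>

static void launch_ptx_kernel(const char *ptx, const char *kernelName,
                              void **kernelArgs, int nTasks)
{
    CUmodule   module;
    CUfunction func;
    cuModuleLoadData(&module, ptx);                  // JIT the embedded PTX text
    cuModuleGetFunction(&func, module, kernelName);  // e.g. "stencil_step_task"
    cuLaunchKernel(func,
                   nTasks, 1, 1,                     // one block per task / z-slice
                   32, 1, 1,                         // one warp per block (assumed)
                   0, 0,                             // shared memory, default stream
                   kernelArgs, 0);                   // the twelve argument pointers
    cuCtxSynchronize();                              // the sync step
    cuModuleUnload(module);
}

Carrying the PTX as a string and compiling it at run time keeps the host binary self-contained; a real helper would presumably cache the CUmodule/CUfunction across time steps rather than re-JIT on every launch as this minimal sketch does.
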
zn5OTVUfT7ZX3FF}&6LS|x+(ANQsm6@`z6jj$_?}_@L8b|k{B1FC1tH0S_NEKI1boH6f>^DK2K;z1$g8|Zo|{>j~JmNVp!oREc9=7wr=j~Dq^MlWM z7^gjX>4%1t$~nSDd(4$PYlZ2fvjnSV)(B^fELwYRG%;oAhZ?NRk?>zj;>!CyeMsKa z7UuqN;XX2?AzauQgV$tWKb2q!()ERf0tQ&SE{3>KhQh4ir(h|c0^`Zf6R6C!)5)#Gm%`m(QoD+8>m zf*RRJeXE6*&)~Q@SOm6^Tk##(iE-Kqh2a5k$ZD76_yNeyO1>OUb z>v3eVQaLh#oF2jTO)Jr{Z~`1FF5#kBZ~@y^`a!*C^1^y{z0Jx9Wu!laWhUvNWIN1$ zM~s&|&8h|LOjgD)EEA0)(qql-#oFU0iM=%0qm9ezgJo}Q5hnUm>qc<=b*JOs!txw! zBeu}87dgJu>39(=$6$7F+y^=SgmV0^lJ&4$OnMYimgmrxHNQ}nS74&bbI7t5Sr)`< z7dfaeEe=XvD42mSk1gaGFJhW+ShR#E6aAQOVYNYQ${VpghfT$%ozGGR}0rv4!YnW zI{09AW0})N!97FG_p6-8CH@*6EKaBs)I5H>sUfOyc+2ak>;gR7h8$sW zkzEPV8Jh$T6D~XCWmVmg!-kp%%jsaW4G>Hv?(N(g`9>FC7bs0>e2Co5zelJPw zLD^Sl-;d7N==(fDDDEzqagWdB#{(;$-IGtv9P8{i)GQ~Cy^0Zy6nzJz-g6^W2!Tze z@oFb}0SI!)QJ9V}uR4&>s&99EejOI|*T1jY(EG@VOD(*gq-vvylY9%Hn3cl<7@{-o z<4GtRJq1%Sp21h6Ge|dJkKwJX0K1U}>|0302~L!g9AAxFb2%4JPn4a;y!z!plp=Cq zNriFXOoIf{mmetQxHAT%bzc+&+IKNtOEDYeew%}FGQ{W6#EWo)u~~bZ_|+bw)48|@ zVE-J!Uc4SLBs%%mTDIGnw}CpHc$OY-M;z=ykiZ3=xbFZk$AwVh5R}b3jjpy4KEbEywFA>>rI$s^E`mAOKKWPG-tDRaA zClg+ws--UMU48`e+f^#tf8QWU-gh02$RZ82!*}a+@vDRE6vu~`pd#Mw3e5s}&P+<6 zYZvH!h%+C!oVN~E7h3Ita|q{W4F=ztOUEoY;S>?ePq5vK|7Lo{Q?;JDi-KwB8w#dt zVwbj9{z&K9;O6NvIxR3>J2`Xxj4pnCpzUAc3eZyM>b#s(AGK!gq=LNx!DhOdyX&5z zYsLm(zmF_2^1gh^M@Zm2O=dm5j`p|oKERVpu73qgP>-k1oaaH#b7v&C7syNH2&bcxi{C34l>rV!31m)XaFbDhLQyre#W6OGx6$Wh?+;G15l6Dlt8 zeit#IJ>|7jTERcsB19mzlG!1zt)(QlE?(-8_k_vqeHA3uK=H_aQ#{ z((Z?jp*={co}3uB9H$HF2fCo-@gy($wmd)k0>16vh%614F8<4BQfHU=9ALG%3i}Hv z?4O!~arOiGId5Md;iN$$M4paG59-FXCvHSaOzzo$$^zI+UF94lc7urYZX=zl-1ldw zW*5inCXw8JQsuwZtuf1ic*k7?*$4zl&7d&?4a=9jqG5C`yQyK#9Kk)e%RaG(t54kG z1f4WN<-CAA&HR9gN1a{{WD|=W)>xvpX8rQ`a4CEny_Hl z03YeQp=K}9jVpLD>P5xW05n#-sRug9NLcs7&8QoHHn)tt(o`m|3rd*16m(s>38nd{gj>|_q!$Liz@)kbyw@o2Nv9M&6nIw0AM&RPNs}S!GN^R!BLvu0 zcPW=TOW8Roou#e$yIL^UIXS-LZov_B3r?YWe#8&pf7i=m<(}btowG#u6KJovr14Y~ zLn_WAXTQV+AOg(=n!6NzMO?pZPIJmz0WSH^ON65=uCi?gqKSQ-IR#!EaO>7x!2U~# z-CeXV8aLFWxaw~=)L(B2dHEx$sTNN{gfk6{Izp^QBu8JqAUt`$8@G(PvCKf{?sVyT z4^-~c!0JWib0^;%0ai^=vpaYHuNpbvi74+VbMv~%vgZ<-m`B7*wT zRrbDf?)m-B@0@%0zPE=rJ-ay^3awuZa)elu{#dFQo zxRzyKsQ3=h6Xfp#N6hah^d}(drwt47z^6W({}dqM7rz9(mEyD<{^L0owRAi@d@7W1 zLKuHWv+$6h{Y=8Sb9c3W?jyqG{35e-JcNAextxpxuOReq$;r6bQKuDkzJxmG@yNBr zK{qK+%ydl9qv$v0$p4w=EXO=Qf=l_c>;fni`VyL77j|}L#L<_M4Wj=pd(&~mI`$ub z>Fkj<(|HIRRz9B41wK|F{a9BRGNLZWeT48XcTj@MKA!(P1g2kX%^UM~n|P%yZ%{w| zb*B3P0Q0Y#MmxZ83fFJb`i6w`6PWbhu-taAd1@jd;}B2%_n%bw$xgXmRy^@-GQT!9 z%DGlx)-^-y?^BA`E=l`rQsOQv-g8Rl#+MbIqk`NhyV@<|zN<)G=POdj35rkpMbnpj z9hT5Fo4h1#M0*-{*u@JHziC|Jds32rV!VdO_KTBG%Q(9t@hJYN#)sK;6^HfV1WY^4v&BoX~@PPOAFG@bvTiDC<$Wy82|isrvD~;<>E&J2G;81J%{(7g0Y_jIpa% zYWOX`c&bzNp;PL&RJ}d6v554wDqgIewvb&cNqkVp$wM+vN!7X0mv!ARW!xFcKaTlP zzKivrZ^2RT8UN5PBF-ebzft+R)Hx3FVf%jIuFED*OP#(Y^Hx+o)cdHY+n0IEyJVk4{XX{3bBTZj z`~h#3Pw~V}SvTs(moA?z4@e#t2?!tX{S{9Y|Axfl_3PnpFgre6uPfjwtG=F9^&O*q zEBM8d+K+3v`_3q5mkYB06PDcHbp^RUS1s9>j=Zs!?hjv`!>)06E!`L1C_kd*F0k`? 
z13!O;Qy0hGHgS)XRGk{0JDX42s;^~+{63#fs&jed2)JI!W!N9QZ{C`X(>XQyv?=3% zdN-@%eTeU$Y;b==-*1PZ<@v1+Za??El7|0FEVh;IeUt8Yoz`QXvjX&O(_g9g7rqbX z=Fxwr$vKSggPYy-Ed&t7dHyc?!`qBLNSyFH%SaU#A57|(||Io)6WVY#047hrBpZ8OY0td zAnKNiCOmyvH{vtsqg0JG+)@r+Q}9A~`j)4V#!t~cmB=O#4$6;l$;SMIql+l`n+zZ8 h#!+PjbI?D^Nw`2~9S99y1qO31u2E|vw$=`CzXP&N7+U}U diff --git a/examples_cuda/stencil/stencil_cu_nvptx64.ll b/examples_cuda/stencil/stencil_cu_nvptx64.ll deleted file mode 100644 index d0c5e824..00000000 --- a/examples_cuda/stencil/stencil_cu_nvptx64.ll +++ /dev/null @@ -1,269 +0,0 @@ -; ModuleID = 'stencil_cu_nvptx64.bc' -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" -target triple = "nvptx64" - -; Function Attrs: nounwind readnone -declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0 - -; Function Attrs: nounwind readnone -declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #0 - -; Function Attrs: nounwind readnone -declare i32 @llvm.nvvm.read.ptx.sreg.warpsize() #0 - -; Function Attrs: nounwind -define void @stencil_step_task(i32 %x0, i32 %x1, i32 %y0, i32 %y1, i32 %z0, i32 %Nx, i32 %Ny, i32 %Nz, double* nocapture %coef, double* %vsq, double* %Ain, double* %Aout) #1 { -allocas: - %bid.i.i = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2 - %add_z0_load_calltmp = add i32 %bid.i.i, %z0 - %bid.i.i21 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2 - %add_z0_load15_calltmp18 = add i32 %z0, 1 - %add_add_z0_load15_calltmp18_ = add i32 %add_z0_load15_calltmp18, %bid.i.i21 - %mul_Nx_load_Ny_load.i = mul i32 %Ny, %Nx - %coef_load_offset_load.i = load double* %coef, align 8 - %coef_load16_offset.i = getelementptr double* %coef, i64 1 - %coef_load16_offset_load.i = load double* %coef_load16_offset.i, align 8 - %coef_load19_offset.i = getelementptr double* %coef, i64 2 - %coef_load19_offset_load.i = load double* %coef_load19_offset.i, align 8 - %coef_load22_offset.i = getelementptr double* %coef, i64 3 - %coef_load22_offset_load.i = load double* %coef_load22_offset.i, align 8 - %less_z_load_z1_load.i161 = icmp slt i32 %add_z0_load_calltmp, %add_add_z0_load15_calltmp18_ - br i1 %less_z_load_z1_load.i161, label %for_test28.i.preheader.lr.ph, label %stencil_step___uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_.exit - -for_test28.i.preheader.lr.ph: ; preds = %allocas - %less_y_load_y1_load.i159 = icmp slt i32 %y0, %y1 - %less_xb_load_x1_load.i157 = icmp slt i32 %x0, %x1 - %x1_load199_broadcast_init.i = insertelement <1 x i32> undef, i32 %x1, i32 0 - %mul__Nx_load119.i = shl i32 %Nx, 1 - %mul__Nx_load167.i = mul i32 %Nx, 3 - %mul__Nx_load127.i = mul i32 %Nx, -2 - %Ain_load65_ptr2int.i = ptrtoint double* %Ain to i64 - %mul__Nx_load175.i = mul i32 %Nx, -3 - %mul__Nxy_load136.i = shl i32 %mul_Nx_load_Ny_load.i, 1 - %mul__Nxy_load184.i = mul i32 %mul_Nx_load_Ny_load.i, 3 - %mul__Nxy_load144.i = mul i32 %mul_Nx_load_Ny_load.i, -2 - %mul__Nxy_load192.i = mul i32 %mul_Nx_load_Ny_load.i, -3 - %Aout_load_ptr2int.i = ptrtoint double* %Aout to i64 - %vsq_load_ptr2int.i = ptrtoint double* %vsq to i64 - %0 = add i32 %bid.i.i21, %z0 - br label %for_test28.i.preheader - -for_test28.i.preheader: ; preds = %for_exit31.i, %for_test28.i.preheader.lr.ph - %z.0.i162 = phi i32 [ %add_z0_load_calltmp, %for_test28.i.preheader.lr.ph ], [ %z_load245_plus1.i, %for_exit31.i ] - br i1 %less_y_load_y1_load.i159, label %for_test35.i.preheader.lr.ph, label %for_exit31.i - -for_test35.i.preheader.lr.ph: ; preds = 
%for_test28.i.preheader - %mul_z_load45_Nxy_load.i = mul i32 %z.0.i162, %mul_Nx_load_Ny_load.i - br i1 %less_xb_load_x1_load.i157, label %for_loop37.i.lr.ph.us, label %for_exit31.i - -for_exit38.i.us: ; preds = %safe_if_after_true.i.us - %y_load244_plus1.i.us = add i32 %y.0.i160.us, 1 - %exitcond = icmp eq i32 %y_load244_plus1.i.us, %y1 - br i1 %exitcond, label %for_exit31.i, label %for_loop37.i.lr.ph.us - -for_loop37.i.us: ; preds = %for_loop37.i.lr.ph.us, %safe_if_after_true.i.us - %xb.0.i158.us = phi i32 [ %x0, %for_loop37.i.lr.ph.us ], [ %add_xb_load243_calltmp241.i.us, %safe_if_after_true.i.us ] - %tid.i.i.i.us = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2 - %tid.i.i.i.i.us = tail call i32 @llvm.nvvm.read.ptx.sreg.warpsize() #2 - %sub_calltmp3_.i.i.us = add i32 %tid.i.i.i.i.us, -1 - %bitop.i.i.us = and i32 %sub_calltmp3_.i.i.us, %tid.i.i.i.us - %add_xb_load42_calltmp.i.us = add i32 %bitop.i.i.us, %xb.0.i158.us - %add_xb_load42_calltmp_broadcast_init.i.us = insertelement <1 x i32> undef, i32 %add_xb_load42_calltmp.i.us, i32 0 - %less_x_load198_x1_load199_broadcast.i.us = icmp slt <1 x i32> %add_xb_load42_calltmp_broadcast_init.i.us, %x1_load199_broadcast_init.i - %v.i.i.us = extractelement <1 x i1> %less_x_load198_x1_load199_broadcast.i.us, i32 0 - br i1 %v.i.i.us, label %pl_dolane.i.us, label %safe_if_after_true.i.us - -pl_dolane.i.us: ; preds = %for_loop37.i.us - %.lhs.lhs.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.i.us, %add_xb_load42_calltmp.i.us - %.lhs.us = shl i32 %.lhs.lhs.us, 3 - %1 = add i32 %.lhs.us, -8 - %iptr__id.i.rhs.us = sext i32 %1 to i64 - %iptr__id.i.us = add i64 %iptr__id.i.rhs.us, %Ain_load65_ptr2int.i - %ptr__id.i.us = inttoptr i64 %iptr__id.i.us to double* - %val__id.i.us = load double* %ptr__id.i.us, align 8 - %2 = add i32 %.lhs.us, 8 - %iptr__id.i130.rhs.us = sext i32 %2 to i64 - %iptr__id.i130.us = add i64 %iptr__id.i130.rhs.us, %Ain_load65_ptr2int.i - %ptr__id.i131.us = inttoptr i64 %iptr__id.i130.us to double* - %val__id.i132.us = load double* %ptr__id.i131.us, align 8 - %3 = add i32 %.lhs.us, -16 - %iptr__id.i125.rhs.us = sext i32 %3 to i64 - %iptr__id.i125.us = add i64 %iptr__id.i125.rhs.us, %Ain_load65_ptr2int.i - %ptr__id.i126.us = inttoptr i64 %iptr__id.i125.us to double* - %val__id.i127.us = load double* %ptr__id.i126.us, align 8 - %4 = add i32 %.lhs.us, 16 - %iptr__id.i120.rhs.us = sext i32 %4 to i64 - %iptr__id.i120.us = add i64 %iptr__id.i120.rhs.us, %Ain_load65_ptr2int.i - %ptr__id.i121.us = inttoptr i64 %iptr__id.i120.us to double* - %val__id.i122.us = load double* %ptr__id.i121.us, align 8 - %.lhs138.us = add i32 %.lhs138.lhs.us, %add_xb_load42_calltmp.i.us - %5 = shl i32 %.lhs138.us, 3 - %iptr__id.i115.rhs.us = sext i32 %5 to i64 - %iptr__id.i115.us = add i64 %iptr__id.i115.rhs.us, %Ain_load65_ptr2int.i - %ptr__id.i116.us = inttoptr i64 %iptr__id.i115.us to double* - %val__id.i117.us = load double* %ptr__id.i116.us, align 8 - %6 = add i32 %.lhs.us, -24 - %iptr__id.i110.rhs.us = sext i32 %6 to i64 - %iptr__id.i110.us = add i64 %iptr__id.i110.rhs.us, %Ain_load65_ptr2int.i - %ptr__id.i111.us = inttoptr i64 %iptr__id.i110.us to double* - %val__id.i112.us = load double* %ptr__id.i111.us, align 8 - %7 = add i32 %.lhs.us, 24 - %iptr__id.i105.rhs.us = sext i32 %7 to i64 - %iptr__id.i105.us = add i64 %iptr__id.i105.rhs.us, %Ain_load65_ptr2int.i - %ptr__id.i106.us = inttoptr i64 %iptr__id.i105.us to double* - %val__id.i107.us = load double* %ptr__id.i106.us, align 8 - %.lhs141.us = add i32 %.lhs141.lhs.us, 
%add_xb_load42_calltmp.i.us - %8 = shl i32 %.lhs141.us, 3 - %iptr__id.i100.rhs.us = sext i32 %8 to i64 - %iptr__id.i100.us = add i64 %iptr__id.i100.rhs.us, %Ain_load65_ptr2int.i - %ptr__id.i101.us = inttoptr i64 %iptr__id.i100.us to double* - %val__id.i102.us = load double* %ptr__id.i101.us, align 8 - %.lhs142.us = add i32 %.lhs142.lhs.us, %add_xb_load42_calltmp.i.us - %9 = shl i32 %.lhs142.us, 3 - %iptr__id.i95.rhs.us = sext i32 %9 to i64 - %iptr__id.i95.us = add i64 %iptr__id.i95.rhs.us, %Ain_load65_ptr2int.i - %ptr__id.i96.us = inttoptr i64 %iptr__id.i95.us to double* - %val__id.i97.us = load double* %ptr__id.i96.us, align 8 - %.lhs143.us = add i32 %.lhs143.lhs.us, %add_xb_load42_calltmp.i.us - %10 = shl i32 %.lhs143.us, 3 - %iptr__id.i90.rhs.us = sext i32 %10 to i64 - %iptr__id.i90.us = add i64 %iptr__id.i90.rhs.us, %Ain_load65_ptr2int.i - %ptr__id.i91.us = inttoptr i64 %iptr__id.i90.us to double* - %val__id.i92.us = load double* %ptr__id.i91.us, align 8 - %.lhs144.us = add i32 %.lhs144.lhs.us, %add_xb_load42_calltmp.i.us - %11 = shl i32 %.lhs144.us, 3 - %iptr__id.i85.rhs.us = sext i32 %11 to i64 - %iptr__id.i85.us = add i64 %iptr__id.i85.rhs.us, %Ain_load65_ptr2int.i - %ptr__id.i86.us = inttoptr i64 %iptr__id.i85.us to double* - %val__id.i87.us = load double* %ptr__id.i86.us, align 8 - %.lhs145.us = add i32 %.lhs145.lhs.us, %add_xb_load42_calltmp.i.us - %12 = shl i32 %.lhs145.us, 3 - %iptr__id.i80.rhs.us = sext i32 %12 to i64 - %iptr__id.i80.us = add i64 %iptr__id.i80.rhs.us, %Ain_load65_ptr2int.i - %ptr__id.i81.us = inttoptr i64 %iptr__id.i80.us to double* - %val__id.i82.us = load double* %ptr__id.i81.us, align 8 - %.lhs146.us = add i32 %.lhs146.lhs.us, %add_xb_load42_calltmp.i.us - %13 = shl i32 %.lhs146.us, 3 - %iptr__id.i75.rhs.us = sext i32 %13 to i64 - %iptr__id.i75.us = add i64 %iptr__id.i75.rhs.us, %Ain_load65_ptr2int.i - %ptr__id.i76.us = inttoptr i64 %iptr__id.i75.us to double* - %val__id.i77.us = load double* %ptr__id.i76.us, align 8 - %.lhs147.us = add i32 %.lhs147.lhs.us, %add_xb_load42_calltmp.i.us - %14 = shl i32 %.lhs147.us, 3 - %iptr__id.i70.rhs.us = sext i32 %14 to i64 - %iptr__id.i70.us = add i64 %iptr__id.i70.rhs.us, %Ain_load65_ptr2int.i - %ptr__id.i71.us = inttoptr i64 %iptr__id.i70.us to double* - %val__id.i72.us = load double* %ptr__id.i71.us, align 8 - %.lhs148.us = add i32 %.lhs148.lhs.us, %add_xb_load42_calltmp.i.us - %15 = shl i32 %.lhs148.us, 3 - %iptr__id.i65.rhs.us = sext i32 %15 to i64 - %iptr__id.i65.us = add i64 %iptr__id.i65.rhs.us, %Ain_load65_ptr2int.i - %ptr__id.i66.us = inttoptr i64 %iptr__id.i65.us to double* - %val__id.i67.us = load double* %ptr__id.i66.us, align 8 - %.lhs149.us = add i32 %.lhs149.lhs.us, %add_xb_load42_calltmp.i.us - %16 = shl i32 %.lhs149.us, 3 - %iptr__id.i60.rhs.us = sext i32 %16 to i64 - %iptr__id.i60.us = add i64 %iptr__id.i60.rhs.us, %Ain_load65_ptr2int.i - %ptr__id.i61.us = inttoptr i64 %iptr__id.i60.us to double* - %val__id.i62.us = load double* %ptr__id.i61.us, align 8 - %.lhs150.us = add i32 %.lhs150.lhs.us, %add_xb_load42_calltmp.i.us - %17 = shl i32 %.lhs150.us, 3 - %iptr__id.i55.rhs.us = sext i32 %17 to i64 - %iptr__id.i55.us = add i64 %iptr__id.i55.rhs.us, %Ain_load65_ptr2int.i - %ptr__id.i56.us = inttoptr i64 %iptr__id.i55.us to double* - %val__id.i57.us = load double* %ptr__id.i56.us, align 8 - %.lhs151.us = add i32 %add_xb_load42_calltmp.i.us, %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.i.us - %18 = shl i32 %.lhs151.us, 3 - %iptr__id.i50.rhs.us = sext i32 %18 to i64 - %iptr__id.i50.us = add i64 
%iptr__id.i50.rhs.us, %Ain_load65_ptr2int.i - %ptr__id.i51.us = inttoptr i64 %iptr__id.i50.us to double* - %val__id.i52.us = load double* %ptr__id.i51.us, align 8 - %.lhs152.us = add i32 %.lhs152.lhs.us, %add_xb_load42_calltmp.i.us - %19 = shl i32 %.lhs152.us, 3 - %iptr__id.i45.rhs.us = sext i32 %19 to i64 - %iptr__id.i45.us = add i64 %iptr__id.i45.rhs.us, %Ain_load65_ptr2int.i - %ptr__id.i46.us = inttoptr i64 %iptr__id.i45.us to double* - %val__id.i47.us = load double* %ptr__id.i46.us, align 8 - %val__id.i41.us = load double* %ptr__id.i51.us, align 8 - %iptr__id.i32.us = add i64 %iptr__id.i50.rhs.us, %Aout_load_ptr2int.i - %ptr__id.i33.us = inttoptr i64 %iptr__id.i32.us to double* - %val__id.i34.us = load double* %ptr__id.i33.us, align 8 - %iptr__id.i27.rhs.us = sext i32 %.lhs.us to i64 - %iptr__id.i27.us = add i64 %iptr__id.i27.rhs.us, %vsq_load_ptr2int.i - %ptr__id.i28.us = inttoptr i64 %iptr__id.i27.us to double* - %val__id.i29.us = load double* %ptr__id.i28.us, align 8 - %iptr__id.i23.us = add i64 %iptr__id.i50.rhs.us, %Aout_load_ptr2int.i - %ptr__id.i24.us = inttoptr i64 %iptr__id.i23.us to double* - %val__id.i25.lhs.us.lhs = fmul double %val__id.i41.us, 2.000000e+00 - %val__id.i25.lhs.us = fsub double %val__id.i25.lhs.us.lhs, %val__id.i34.us - %val__id.i25.rhs.rhs.lhs.lhs.rhs.lhs.lhs.lhs.lhs.us = fadd double %val__id.i127.us, %val__id.i122.us - %val__id.i25.rhs.rhs.lhs.lhs.rhs.lhs.lhs.lhs.us = fadd double %val__id.i25.rhs.rhs.lhs.lhs.rhs.lhs.lhs.lhs.lhs.us, %val__id.i102.us - %val__id.i25.rhs.rhs.lhs.lhs.rhs.lhs.lhs.us = fadd double %val__id.i25.rhs.rhs.lhs.lhs.rhs.lhs.lhs.lhs.us, %val__id.i87.us - %val__id.i25.rhs.rhs.lhs.lhs.rhs.lhs.us = fadd double %val__id.i25.rhs.rhs.lhs.lhs.rhs.lhs.lhs.us, %val__id.i72.us - %val__id.i25.rhs.rhs.lhs.lhs.rhs.us = fadd double %val__id.i25.rhs.rhs.lhs.lhs.rhs.lhs.us, %val__id.i57.us - %val__id.i25.rhs.rhs.lhs.lhs.us = fmul double %coef_load19_offset_load.i, %val__id.i25.rhs.rhs.lhs.lhs.rhs.us - %val__id.i25.rhs.rhs.lhs.rhs.lhs.rhs.lhs.lhs.lhs.lhs.us = fadd double %val__id.i.us, %val__id.i132.us - %val__id.i25.rhs.rhs.lhs.rhs.lhs.rhs.lhs.lhs.lhs.us = fadd double %val__id.i25.rhs.rhs.lhs.rhs.lhs.rhs.lhs.lhs.lhs.lhs.us, %val__id.i117.us - %val__id.i25.rhs.rhs.lhs.rhs.lhs.rhs.lhs.lhs.us = fadd double %val__id.i25.rhs.rhs.lhs.rhs.lhs.rhs.lhs.lhs.lhs.us, %val__id.i97.us - %val__id.i25.rhs.rhs.lhs.rhs.lhs.rhs.lhs.us = fadd double %val__id.i25.rhs.rhs.lhs.rhs.lhs.rhs.lhs.lhs.us, %val__id.i82.us - %val__id.i25.rhs.rhs.lhs.rhs.lhs.rhs.us = fadd double %val__id.i25.rhs.rhs.lhs.rhs.lhs.rhs.lhs.us, %val__id.i67.us - %val__id.i25.rhs.rhs.lhs.rhs.lhs.us = fmul double %coef_load16_offset_load.i, %val__id.i25.rhs.rhs.lhs.rhs.lhs.rhs.us - %val__id.i25.rhs.rhs.lhs.rhs.rhs.us = fmul double %coef_load_offset_load.i, %val__id.i52.us - %val__id.i25.rhs.rhs.lhs.rhs.us = fadd double %val__id.i25.rhs.rhs.lhs.rhs.lhs.us, %val__id.i25.rhs.rhs.lhs.rhs.rhs.us - %val__id.i25.rhs.rhs.lhs.us = fadd double %val__id.i25.rhs.rhs.lhs.lhs.us, %val__id.i25.rhs.rhs.lhs.rhs.us - %val__id.i25.rhs.rhs.rhs.rhs.lhs.lhs.lhs.lhs.us = fadd double %val__id.i112.us, %val__id.i107.us - %val__id.i25.rhs.rhs.rhs.rhs.lhs.lhs.lhs.us = fadd double %val__id.i25.rhs.rhs.rhs.rhs.lhs.lhs.lhs.lhs.us, %val__id.i92.us - %val__id.i25.rhs.rhs.rhs.rhs.lhs.lhs.us = fadd double %val__id.i25.rhs.rhs.rhs.rhs.lhs.lhs.lhs.us, %val__id.i77.us - %val__id.i25.rhs.rhs.rhs.rhs.lhs.us = fadd double %val__id.i25.rhs.rhs.rhs.rhs.lhs.lhs.us, %val__id.i62.us - %val__id.i25.rhs.rhs.rhs.rhs.us = fadd double 
%val__id.i25.rhs.rhs.rhs.rhs.lhs.us, %val__id.i47.us - %val__id.i25.rhs.rhs.rhs.us = fmul double %coef_load22_offset_load.i, %val__id.i25.rhs.rhs.rhs.rhs.us - %val__id.i25.rhs.rhs.us = fadd double %val__id.i25.rhs.rhs.lhs.us, %val__id.i25.rhs.rhs.rhs.us - %val__id.i25.rhs.us = fmul double %val__id.i25.rhs.rhs.us, %val__id.i29.us - %val__id.i25.us = fadd double %val__id.i25.lhs.us, %val__id.i25.rhs.us - store double %val__id.i25.us, double* %ptr__id.i24.us, align 8 - br label %safe_if_after_true.i.us - -safe_if_after_true.i.us: ; preds = %pl_dolane.i.us, %for_loop37.i.us - %tid.i.i1.i.us = tail call i32 @llvm.nvvm.read.ptx.sreg.warpsize() #2 - %add_xb_load243_calltmp241.i.us = add i32 %tid.i.i1.i.us, %xb.0.i158.us - %less_xb_load_x1_load.i.us = icmp slt i32 %add_xb_load243_calltmp241.i.us, %x1 - br i1 %less_xb_load_x1_load.i.us, label %for_loop37.i.us, label %for_exit38.i.us - -for_loop37.i.lr.ph.us: ; preds = %for_exit38.i.us, %for_test35.i.preheader.lr.ph - %y.0.i160.us = phi i32 [ %y_load244_plus1.i.us, %for_exit38.i.us ], [ %y0, %for_test35.i.preheader.lr.ph ] - %mul_y_load46_Nx_load47.i.us = mul i32 %y.0.i160.us, %Nx - %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.i.us = add i32 %mul_y_load46_Nx_load47.i.us, %mul_z_load45_Nxy_load.i - %.lhs138.lhs.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.i.us, %Nx - %.lhs141.lhs.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.i.us, %mul__Nx_load119.i - %.lhs142.lhs.us = sub i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.i.us, %Nx - %.lhs143.lhs.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.i.us, %mul__Nx_load167.i - %.lhs144.lhs.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.i.us, %mul__Nx_load127.i - %.lhs145.lhs.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.i.us, %mul_Nx_load_Ny_load.i - %.lhs146.lhs.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.i.us, %mul__Nx_load175.i - %.lhs147.lhs.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.i.us, %mul__Nxy_load136.i - %.lhs148.lhs.us = sub i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.i.us, %mul_Nx_load_Ny_load.i - %.lhs149.lhs.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.i.us, %mul__Nxy_load184.i - %.lhs150.lhs.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.i.us, %mul__Nxy_load144.i - %.lhs152.lhs.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.i.us, %mul__Nxy_load192.i - br label %for_loop37.i.us - -for_exit31.i: ; preds = %for_exit38.i.us, %for_test35.i.preheader.lr.ph, %for_test28.i.preheader - %z_load245_plus1.i = add i32 %z.0.i162, 1 - %exitcond163 = icmp eq i32 %z.0.i162, %0 - br i1 %exitcond163, label %stencil_step___uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_.exit, label %for_test28.i.preheader - -stencil_step___uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_.exit: ; preds = %for_exit31.i, %allocas - ret void -} - -attributes #0 = { nounwind readnone } -attributes #1 = { nounwind "target-features"="+sm_35" } -attributes #2 = { nounwind } - -!nvvm.annotations = !{!0} - -!0 = metadata !{void (i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*)* @stencil_step_task, metadata !"kernel", i32 1} -!1 = metadata !{ } -!2 = metadata !{ metadata !"output", metadata !0 } -!3 = metadata !{ metadata !"input1", metadata !0 } -!4 = metadata !{ metadata !"input2", metadata !0 } diff --git a/examples_cuda/stencil/stencil_ispc.h 
b/examples_cuda/stencil/stencil_ispc.h index ebf29582..10b0d713 100644 --- a/examples_cuda/stencil/stencil_ispc.h +++ b/examples_cuda/stencil/stencil_ispc.h @@ -21,7 +21,6 @@ namespace ispc { /* namespace */ #if defined(__cplusplus) && !defined(__ISPC_NO_EXTERN_C) extern "C" { #endif // __cplusplus - extern void loop_stencil_ispc(int32_t t0, int32_t t1, int32_t x0, int32_t x1, int32_t y0, int32_t y1, int32_t z0, int32_t z1, int32_t Nx, int32_t Ny, int32_t Nz, const double * coef, const double * vsq, double * Aeven, double * Aodd); extern void loop_stencil_ispc_tasks(int32_t t0, int32_t t1, int32_t x0, int32_t x1, int32_t y0, int32_t y1, int32_t z0, int32_t z1, int32_t Nx, int32_t Ny, int32_t Nz, const double * coef, const double * vsq, double * Aeven, double * Aodd); #if defined(__cplusplus) && !defined(__ISPC_NO_EXTERN_C) } /* end extern C */ diff --git a/examples_cuda/stencil/stencil_ispc_nvptx64.ll b/examples_cuda/stencil/stencil_ispc_nvptx64.ll new file mode 100644 index 00000000..51c0d95a --- /dev/null +++ b/examples_cuda/stencil/stencil_ispc_nvptx64.ll @@ -0,0 +1,974 @@ +; ModuleID = 'stencil_ispc_nvptx64.bc' +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" +target triple = "nvptx64" + +module asm "" +module asm ".extern .func (.param .b32 func_retval0) cudaLaunchDevice" +module asm "(" +module asm " .param .b64 cudaLaunchDevice_param_0," +module asm " .param .b64 cudaLaunchDevice_param_1," +module asm " .param .align 4 .b8 cudaLaunchDevice_param_2[12]," +module asm " .param .align 4 .b8 cudaLaunchDevice_param_3[12]," +module asm " .param .b32 cudaLaunchDevice_param_4," +module asm " .param .b64 cudaLaunchDevice_param_5" +module asm ");" + +@constDeltaForeach1 = private unnamed_addr constant [32 x i8] zeroinitializer +@constDeltaForeach4 = private unnamed_addr constant [32 x i8] c"\00\01\02\03\04\05\06\07\08\09\0A\0B\0C\0D\0E\0F\10\11\12\13\14\15\16\17\18\19\1A\1B\1C\1D\1E\1F" + +declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() nounwind readnone + +declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() nounwind readnone + +declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() nounwind readnone + +declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.z() nounwind readnone + +declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.x() nounwind readnone + +declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.y() nounwind readnone + +declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.z() nounwind readnone + +define i32 @__shfl_i32(i32, i32) { + %shfl = tail call i32 asm sideeffect "shfl.idx.b32 $0, $1, $2, 0x1f;", "=r,r,r"(i32 %0, i32 %1) + ret i32 %shfl +} + +define float @__shfl_xor_float(float, i32) { + %shfl = tail call float asm sideeffect "shfl.bfly.b32 $0, $1, $2, 0x1f;", "=f,f,r"(float %0, i32 %1) + ret float %shfl +} + +define i32 @__shfl_xor_i32(i32, i32) { + %shfl = tail call i32 asm sideeffect "shfl.bfly.b32 $0, $1, $2, 0x1f;", "=r,r,r"(i32 %0, i32 %1) + ret i32 %shfl +} + +define float @__fminf(float, float) { + %min = tail call float asm sideeffect "min.f32 $0, $1, $2;", "=f,f,f"(float %0, float %1) + ret float %min +} + +define float @__fmaxf(float, float) { + %max = tail call float asm sideeffect "max.f32 $0, $1, $2;", "=f,f,f"(float %0, float %1) + ret float %max +} + +define i32 @__ballot(i1) { + %conv = zext i1 %0 to i32 + %res = tail call i32 asm sideeffect "{ .reg .pred %p1; \0A setp.ne.u32 %p1, $1, 0; \0A vote.ballot.b32 $0, %p1; \0A }", "=r,r"(i32 %conv) + ret i32 %res +} + +define i32 @__lanemask_lt() { + %mask = tail 
call i32 asm sideeffect "mov.u32 $0, %lanemask_lt;", "=r"() + ret i32 %mask +} + +define i8* @ISPCAlloc(i8**, i64, i32) { + ret i8* inttoptr (i64 1 to i8*) +} + +declare i64 @cudaGetParameterBuffer(i64, i64) + +define i8* @ISPCGetParamBuffer(i8**, i64 %align, i64 %size) { +entry: + %tid.i = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + %and = and i32 %tid.i, 31 + %cmp = icmp eq i32 %and, 0 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %ptri64tmp = tail call i64 @cudaGetParameterBuffer(i64 %align, i64 %size) + %phitmp = inttoptr i64 %ptri64tmp to i8* + br label %if.end + +if.end: ; preds = %if.then, %entry + %ptri64 = phi i8* [ %phitmp, %if.then ], [ null, %entry ] + ret i8* %ptri64 +} + +define void @ISPCLaunch(i8**, i8* %func_ptr, i8* %func_args, i32 %ntx, i32 %nty, i32 %ntz) { +entry: + %tid.i = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + %and = and i32 %tid.i, 31 + %cmp = icmp eq i32 %and, 0 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %ntxm1 = add nsw i32 %ntx, -1 + %ntxm1d4 = ashr i32 %ntxm1, 2 + %nbx = add nsw i32 %ntxm1d4, 1 + %args_i64 = ptrtoint i8* %func_args to i64 + %func_i64 = ptrtoint i8* %func_ptr to i64 + %res_tmp = tail call i32 asm sideeffect "{\0A .param .b64 param0;\0A st.param.b64\09[param0+0], $1;\0A .param .b64 param1;\0A st.param.b64\09[param1+0], $2;\0A .param .align 4 .b8 param2[12];\0A st.param.b32\09[param2+0], $3; \0A st.param.b32\09[param2+4], $4; \0A st.param.b32\09[param2+8], $5; \0A .param .align 4 .b8 param3[12];\0A st.param.b32\09[param3+0], $6; \0A st.param.b32\09[param3+4], $7; \0A st.param.b32\09[param3+8], $8; \0A .param .b32 param4;\0A st.param.b32\09[param4+0], $9; \0A .param .b64 param5;\0A st.param.b64\09[param5+0], $10; \0A\0A .param .b32 retval0;\0A call.uni (retval0), \0A cudaLaunchDevice,\0A (\0A param0, \0A param1, \0A param2, \0A param3, \0A param4, \0A param5\0A );\0A ld.param.b32\09$0, [retval0+0];\0A }\0A ", "=r, l,l, r,r,r, r,r,r, r,l"(i64 %func_i64, i64 %args_i64, i32 %nbx, i32 %nty, i32 %ntz, i32 128, i32 1, i32 1, i32 0, i64 0) + br label %if.end + +if.end: ; preds = %if.then, %entry + ret void +} + +declare i32 @cudaDeviceSynchronize() + +define void @ISPCSync(i8*) { + %2 = tail call i32 @cudaDeviceSynchronize() + ret void +} + +define i64 @__warpBinExclusiveScan(i1 %p) { +entry: + %conv.i = zext i1 %p to i32 + %res.i = tail call i32 asm sideeffect "{ .reg .pred %p1; \0A setp.ne.u32 %p1, $1, 0; \0A vote.ballot.b32 $0, %p1; \0A }", "=r,r"(i32 %conv.i) + %res.i1 = tail call i32 asm sideeffect "popc.b32 $0, $1;", "=r,r"(i32 %res.i) + %mask.i = tail call i32 asm sideeffect "mov.u32 $0, %lanemask_lt;", "=r"() + %and = and i32 %mask.i, %res.i + %res.i2 = tail call i32 asm sideeffect "popc.b32 $0, $1;", "=r,r"(i32 %and) + %retval.sroa.1.4.insert.ext.i = zext i32 %res.i2 to i64 + %retval.sroa.1.4.insert.shift.i = shl nuw i64 %retval.sroa.1.4.insert.ext.i, 32 + %retval.sroa.0.0.insert.ext.i = zext i32 %res.i1 to i64 + %retval.sroa.0.0.insert.insert.i = or i64 %retval.sroa.1.4.insert.shift.i, %retval.sroa.0.0.insert.ext.i + ret i64 %retval.sroa.0.0.insert.insert.i +} + +define internal void @stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_(i32 %x0, i32 %x1, i32 %y0, i32 %y1, i32 %z0, i32 %z1, i32 %Nx, i32 %Ny, i32 %Nz, double* %coef, double* %vsq, double* %Ain, double* %Aout) { +allocas: + %bid.i.i = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() + %mul_calltmp_.i = shl i32 %bid.i.i, 2 + %tid.i.i = tail call i32 
@llvm.nvvm.read.ptx.sreg.tid.x() + %bitop.i = ashr i32 %tid.i.i, 5 + %add_mul_calltmp__bitop.i = add i32 %bitop.i, %mul_calltmp_.i + %nb.i.i = tail call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x() + %mul_calltmp_.i57 = shl i32 %nb.i.i, 2 + %greaterequal_calltmp_calltmp18 = icmp sge i32 %add_mul_calltmp__bitop.i, %mul_calltmp_.i57 + %bid.i.i58 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() + %nb.i.i59 = tail call i32 @llvm.nvvm.read.ptx.sreg.nctaid.y() + %greaterequal_calltmp21_calltmp24 = icmp sge i32 %bid.i.i58, %nb.i.i59 + %logical_or = or i1 %greaterequal_calltmp_calltmp18, %greaterequal_calltmp21_calltmp24 + %bid.i.i60 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z() + %nb.i.i61 = tail call i32 @llvm.nvvm.read.ptx.sreg.nctaid.z() + %greaterequal_calltmp27_calltmp30 = icmp sge i32 %bid.i.i60, %nb.i.i61 + %logical_or31 = or i1 %logical_or, %greaterequal_calltmp27_calltmp30 + br i1 %logical_or31, label %if_then, label %if_exit + +if_then: ; preds = %foreach_reset19.i, %if_exit, %allocas + ret void + +if_exit: ; preds = %allocas + %bid.i.i62 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() + %mul_calltmp_.i63 = shl i32 %bid.i.i62, 7 + %tid.i.i64 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + %bitop.i657375 = add i32 %tid.i.i64, %mul_calltmp_.i63 + %mul_calltmp35_ = and i32 %bitop.i657375, -32 + %add_x0_load_mul_calltmp35_ = add i32 %mul_calltmp35_, %x0 + %add_xfirst_load_ = add i32 %add_x0_load_mul_calltmp35_, 32 + %c.i.i = icmp sgt i32 %add_xfirst_load_, %x1 + %r.i.i = select i1 %c.i.i, i32 %x1, i32 %add_xfirst_load_ + %bid.i.i67 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() + %mul_calltmp41_ = shl i32 %bid.i.i67, 3 + %add_y0_load_mul_calltmp41_ = add i32 %mul_calltmp41_, %y0 + %add_yfirst_load_ = add i32 %add_y0_load_mul_calltmp41_, 8 + %bid.i.i70 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z() + %mul_calltmp47_ = shl i32 %bid.i.i70, 3 + %add_z0_load_mul_calltmp47_ = add i32 %mul_calltmp47_, %z0 + %add_zfirst_load_ = add i32 %add_z0_load_mul_calltmp47_, 8 + %c.i.i71 = icmp sgt i32 %add_zfirst_load_, %z1 + %r.i.i72 = select i1 %c.i.i71, i32 %z1, i32 %add_zfirst_load_ + %mul_Nx_load_Ny_load.i = mul i32 %Ny, %Nx + %nitems29.i = sub i32 %r.i.i, %add_x0_load_mul_calltmp35_ + %nextras30.i = srem i32 %nitems29.i, 32 + %aligned_end31.i = sub i32 %r.i.i, %nextras30.i + %tid.i4.i = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + %__laneidx.i = and i32 %tid.i4.i, 31 + %0 = zext i32 %__laneidx.i to i64 + %arrayidx.i = getelementptr [32 x i8]* @constDeltaForeach1, i64 0, i64 %0 + %cmp38.i396 = icmp slt i32 %add_z0_load_mul_calltmp47_, %r.i.i72 + br i1 %cmp38.i396, label %foreach_test21.i.preheader.lr.ph, label %if_then + +foreach_test21.i.preheader.lr.ph: ; preds = %if_exit + %c.i.i68 = icmp sgt i32 %add_yfirst_load_, %y1 + %r.i.i69 = select i1 %c.i.i68, i32 %y1, i32 %add_yfirst_load_ + %1 = load i8* %arrayidx.i, align 1 + %_zext.i394 = zext i8 %1 to i32 + %2 = insertelement <1 x i32> undef, i32 %_zext.i394, i32 0 + %smear_counter_init.i393 = insertelement <1 x i32> undef, i32 %add_z0_load_mul_calltmp47_, i32 0 + %iter_val.i395 = add <1 x i32> %smear_counter_init.i393, %2 + %smear_counter_init44.i387 = insertelement <1 x i32> undef, i32 %add_y0_load_mul_calltmp41_, i32 0 + %cmp54.i390 = icmp slt i32 %add_y0_load_mul_calltmp41_, %r.i.i69 + %before_aligned_end73.i385 = icmp slt i32 %add_x0_load_mul_calltmp35_, %aligned_end31.i + %smear_end_init289.i = insertelement <1 x i32> undef, i32 %r.i.i, i32 0 + %Nxy_load298_broadcast_init.i = insertelement <1 x i32> undef, i32 
%mul_Nx_load_Ny_load.i, i32 0 + %Nx_load300_broadcast_init.i = insertelement <1 x i32> undef, i32 %Nx, i32 0 + %Ain_load309_ptr2int.i = ptrtoint double* %Ain to i64 + %coef_load314_offset.i = getelementptr double* %coef, i64 1 + %coef_load365_offset.i = getelementptr double* %coef, i64 2 + %mul__Nx_load385.i = shl i32 %Nx, 1 + %mul__Nx_load393.i = mul i32 %Nx, -2 + %mul__Nxy_load402.i = shl i32 %mul_Nx_load_Ny_load.i, 1 + %mul__Nxy_load410.i = mul i32 %mul_Nx_load_Ny_load.i, -2 + %coef_load416_offset.i = getelementptr double* %coef, i64 3 + %mul__Nx_load436.i = mul i32 %Nx, 3 + %mul__Nx_load444.i = mul i32 %Nx, -3 + %mul__Nxy_load453.i = mul i32 %mul_Nx_load_Ny_load.i, 3 + %mul__Nxy_load461.i = mul i32 %mul_Nx_load_Ny_load.i, -3 + %Aout_load470_ptr2int.i = ptrtoint double* %Aout to i64 + %vsq_load488_ptr2int.i = ptrtoint double* %vsq to i64 + %3 = sub i32 -9, %y0 + %4 = shl i32 %bid.i.i67, 3 + %5 = sub i32 %3, %4 + %6 = xor i32 %y1, -1 + %7 = icmp sgt i32 %5, %6 + %smax = select i1 %7, i32 %5, i32 %6 + %8 = xor i32 %smax, -1 + %9 = sub i32 -9, %z0 + %10 = shl i32 %bid.i.i70, 3 + %11 = sub i32 %9, %10 + %12 = xor i32 %z1, -1 + %13 = icmp sgt i32 %11, %12 + %smax399 = select i1 %13, i32 %11, i32 %12 + %14 = xor i32 %smax399, -1 + br label %foreach_test21.i.preheader + +foreach_full_body.i: ; preds = %outer_not_in_extras.i.preheader, %foreach_full_body.i + %counter32.4.i386 = phi i32 [ %new_counter279.i, %foreach_full_body.i ], [ %add_x0_load_mul_calltmp35_, %outer_not_in_extras.i.preheader ] + %tid.i.i56 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + %__laneidx80.i = and i32 %tid.i.i56, 31 + %15 = zext i32 %__laneidx80.i to i64 + %arrayidx81.i = getelementptr [32 x i8]* @constDeltaForeach4, i64 0, i64 %15 + %16 = load i8* %arrayidx81.i, align 1 + %_zext82.i = zext i8 %16 to i32 + %coef_load_offset_load.i = load double* %coef, align 8 + %.lhs362.lhs.lhs = extractelement <1 x i32> %mul_z_load297_Nxy_load298_broadcast.i, i32 0 + %.lhs362.lhs.rhs.lhs = extractelement <1 x i32> %iter_val50.i392, i32 0 + %.lhs362.lhs.rhs = mul i32 %.lhs362.lhs.rhs.lhs, %Nx + %.lhs362.lhs = add i32 %.lhs362.lhs.lhs, %.lhs362.lhs.rhs + %.lhs362.rhs = add i32 %counter32.4.i386, %_zext82.i + %.lhs362 = add i32 %.lhs362.lhs, %.lhs362.rhs + %17 = shl i32 %.lhs362, 3 + %iptr__id.i.rhs = sext i32 %17 to i64 + %iptr__id.i = add i64 %iptr__id.i.rhs, %Ain_load309_ptr2int.i + %ptr__id.i = inttoptr i64 %iptr__id.i to double* + %val__id.i = load double* %ptr__id.i, align 8 + %coef_load94_offset_load.i = load double* %coef_load314_offset.i, align 8 + %18 = add i32 %17, 8 + %iptr__id.i335.rhs = sext i32 %18 to i64 + %iptr__id.i335 = add i64 %iptr__id.i335.rhs, %Ain_load309_ptr2int.i + %ptr__id.i336 = inttoptr i64 %iptr__id.i335 to double* + %val__id.i337 = load double* %ptr__id.i336, align 8 + %19 = add i32 %17, -8 + %iptr__id.i330.rhs = sext i32 %19 to i64 + %iptr__id.i330 = add i64 %iptr__id.i330.rhs, %Ain_load309_ptr2int.i + %ptr__id.i331 = inttoptr i64 %iptr__id.i330 to double* + %val__id.i332 = load double* %ptr__id.i331, align 8 + %.lhs365 = add i32 %.lhs362, %Nx + %20 = shl i32 %.lhs365, 3 + %iptr__id.i325.rhs = sext i32 %20 to i64 + %iptr__id.i325 = add i64 %iptr__id.i325.rhs, %Ain_load309_ptr2int.i + %ptr__id.i326 = inttoptr i64 %iptr__id.i325 to double* + %val__id.i327 = load double* %ptr__id.i326, align 8 + %.lhs366 = sub i32 %.lhs362, %Nx + %21 = shl i32 %.lhs366, 3 + %iptr__id.i320.rhs = sext i32 %21 to i64 + %iptr__id.i320 = add i64 %iptr__id.i320.rhs, %Ain_load309_ptr2int.i + %ptr__id.i321 = inttoptr i64 
%iptr__id.i320 to double* + %val__id.i322 = load double* %ptr__id.i321, align 8 + %.lhs367 = add i32 %.lhs362, %mul_Nx_load_Ny_load.i + %22 = shl i32 %.lhs367, 3 + %iptr__id.i315.rhs = sext i32 %22 to i64 + %iptr__id.i315 = add i64 %iptr__id.i315.rhs, %Ain_load309_ptr2int.i + %ptr__id.i316 = inttoptr i64 %iptr__id.i315 to double* + %val__id.i317 = load double* %ptr__id.i316, align 8 + %.lhs368 = sub i32 %.lhs362, %mul_Nx_load_Ny_load.i + %23 = shl i32 %.lhs368, 3 + %iptr__id.i310.rhs = sext i32 %23 to i64 + %iptr__id.i310 = add i64 %iptr__id.i310.rhs, %Ain_load309_ptr2int.i + %ptr__id.i311 = inttoptr i64 %iptr__id.i310 to double* + %val__id.i312 = load double* %ptr__id.i311, align 8 + %coef_load145_offset_load.i = load double* %coef_load365_offset.i, align 8 + %24 = add i32 %17, 16 + %iptr__id.i305.rhs = sext i32 %24 to i64 + %iptr__id.i305 = add i64 %iptr__id.i305.rhs, %Ain_load309_ptr2int.i + %ptr__id.i306 = inttoptr i64 %iptr__id.i305 to double* + %val__id.i307 = load double* %ptr__id.i306, align 8 + %25 = add i32 %17, -16 + %iptr__id.i300.rhs = sext i32 %25 to i64 + %iptr__id.i300 = add i64 %iptr__id.i300.rhs, %Ain_load309_ptr2int.i + %ptr__id.i301 = inttoptr i64 %iptr__id.i300 to double* + %val__id.i302 = load double* %ptr__id.i301, align 8 + %.lhs371 = add i32 %.lhs362, %mul__Nx_load385.i + %26 = shl i32 %.lhs371, 3 + %iptr__id.i295.rhs = sext i32 %26 to i64 + %iptr__id.i295 = add i64 %iptr__id.i295.rhs, %Ain_load309_ptr2int.i + %ptr__id.i296 = inttoptr i64 %iptr__id.i295 to double* + %val__id.i297 = load double* %ptr__id.i296, align 8 + %.lhs372 = add i32 %.lhs362, %mul__Nx_load393.i + %27 = shl i32 %.lhs372, 3 + %iptr__id.i290.rhs = sext i32 %27 to i64 + %iptr__id.i290 = add i64 %iptr__id.i290.rhs, %Ain_load309_ptr2int.i + %ptr__id.i291 = inttoptr i64 %iptr__id.i290 to double* + %val__id.i292 = load double* %ptr__id.i291, align 8 + %.lhs373 = add i32 %.lhs362, %mul__Nxy_load402.i + %28 = shl i32 %.lhs373, 3 + %iptr__id.i285.rhs = sext i32 %28 to i64 + %iptr__id.i285 = add i64 %iptr__id.i285.rhs, %Ain_load309_ptr2int.i + %ptr__id.i286 = inttoptr i64 %iptr__id.i285 to double* + %val__id.i287 = load double* %ptr__id.i286, align 8 + %.lhs374 = add i32 %.lhs362, %mul__Nxy_load410.i + %29 = shl i32 %.lhs374, 3 + %iptr__id.i280.rhs = sext i32 %29 to i64 + %iptr__id.i280 = add i64 %iptr__id.i280.rhs, %Ain_load309_ptr2int.i + %ptr__id.i281 = inttoptr i64 %iptr__id.i280 to double* + %val__id.i282 = load double* %ptr__id.i281, align 8 + %coef_load196_offset_load.i = load double* %coef_load416_offset.i, align 8 + %30 = add i32 %17, 24 + %iptr__id.i275.rhs = sext i32 %30 to i64 + %iptr__id.i275 = add i64 %iptr__id.i275.rhs, %Ain_load309_ptr2int.i + %ptr__id.i276 = inttoptr i64 %iptr__id.i275 to double* + %val__id.i277 = load double* %ptr__id.i276, align 8 + %31 = add i32 %17, -24 + %iptr__id.i270.rhs = sext i32 %31 to i64 + %iptr__id.i270 = add i64 %iptr__id.i270.rhs, %Ain_load309_ptr2int.i + %ptr__id.i271 = inttoptr i64 %iptr__id.i270 to double* + %val__id.i272 = load double* %ptr__id.i271, align 8 + %.lhs377 = add i32 %.lhs362, %mul__Nx_load436.i + %32 = shl i32 %.lhs377, 3 + %iptr__id.i265.rhs = sext i32 %32 to i64 + %iptr__id.i265 = add i64 %iptr__id.i265.rhs, %Ain_load309_ptr2int.i + %ptr__id.i266 = inttoptr i64 %iptr__id.i265 to double* + %val__id.i267 = load double* %ptr__id.i266, align 8 + %.lhs378 = add i32 %.lhs362, %mul__Nx_load444.i + %33 = shl i32 %.lhs378, 3 + %iptr__id.i260.rhs = sext i32 %33 to i64 + %iptr__id.i260 = add i64 %iptr__id.i260.rhs, %Ain_load309_ptr2int.i + 
%ptr__id.i261 = inttoptr i64 %iptr__id.i260 to double* + %val__id.i262 = load double* %ptr__id.i261, align 8 + %.lhs379 = add i32 %.lhs362, %mul__Nxy_load453.i + %34 = shl i32 %.lhs379, 3 + %iptr__id.i255.rhs = sext i32 %34 to i64 + %iptr__id.i255 = add i64 %iptr__id.i255.rhs, %Ain_load309_ptr2int.i + %ptr__id.i256 = inttoptr i64 %iptr__id.i255 to double* + %val__id.i257 = load double* %ptr__id.i256, align 8 + %.lhs380 = add i32 %.lhs362, %mul__Nxy_load461.i + %35 = shl i32 %.lhs380, 3 + %iptr__id.i250.rhs = sext i32 %35 to i64 + %iptr__id.i250 = add i64 %iptr__id.i250.rhs, %Ain_load309_ptr2int.i + %ptr__id.i251 = inttoptr i64 %iptr__id.i250 to double* + %val__id.i252 = load double* %ptr__id.i251, align 8 + %val__id.i247 = load double* %ptr__id.i, align 8 + %iptr__id.i240 = add i64 %iptr__id.i.rhs, %Aout_load470_ptr2int.i + %ptr__id.i241 = inttoptr i64 %iptr__id.i240 to double* + %val__id.i242 = load double* %ptr__id.i241, align 8 + %iptr__id.i235 = add i64 %iptr__id.i.rhs, %vsq_load488_ptr2int.i + %ptr__id.i236 = inttoptr i64 %iptr__id.i235 to double* + %val__id.i237 = load double* %ptr__id.i236, align 8 + %val__id.i233.lhs.lhs = fmul double %val__id.i247, 2.000000e+00 + %val__id.i233.lhs = fsub double %val__id.i233.lhs.lhs, %val__id.i242 + %val__id.i233.rhs.rhs.lhs.lhs.lhs = fmul double %coef_load_offset_load.i, %val__id.i + %val__id.i233.rhs.rhs.lhs.lhs.rhs.rhs.lhs.lhs.lhs.lhs = fadd double %val__id.i337, %val__id.i332 + %val__id.i233.rhs.rhs.lhs.lhs.rhs.rhs.lhs.lhs.lhs = fadd double %val__id.i233.rhs.rhs.lhs.lhs.rhs.rhs.lhs.lhs.lhs.lhs, %val__id.i327 + %val__id.i233.rhs.rhs.lhs.lhs.rhs.rhs.lhs.lhs = fadd double %val__id.i233.rhs.rhs.lhs.lhs.rhs.rhs.lhs.lhs.lhs, %val__id.i322 + %val__id.i233.rhs.rhs.lhs.lhs.rhs.rhs.lhs = fadd double %val__id.i233.rhs.rhs.lhs.lhs.rhs.rhs.lhs.lhs, %val__id.i317 + %val__id.i233.rhs.rhs.lhs.lhs.rhs.rhs = fadd double %val__id.i233.rhs.rhs.lhs.lhs.rhs.rhs.lhs, %val__id.i312 + %val__id.i233.rhs.rhs.lhs.lhs.rhs = fmul double %coef_load94_offset_load.i, %val__id.i233.rhs.rhs.lhs.lhs.rhs.rhs + %val__id.i233.rhs.rhs.lhs.lhs = fadd double %val__id.i233.rhs.rhs.lhs.lhs.lhs, %val__id.i233.rhs.rhs.lhs.lhs.rhs + %val__id.i233.rhs.rhs.lhs.rhs.rhs.lhs.lhs.lhs.lhs = fadd double %val__id.i307, %val__id.i302 + %val__id.i233.rhs.rhs.lhs.rhs.rhs.lhs.lhs.lhs = fadd double %val__id.i233.rhs.rhs.lhs.rhs.rhs.lhs.lhs.lhs.lhs, %val__id.i297 + %val__id.i233.rhs.rhs.lhs.rhs.rhs.lhs.lhs = fadd double %val__id.i233.rhs.rhs.lhs.rhs.rhs.lhs.lhs.lhs, %val__id.i292 + %val__id.i233.rhs.rhs.lhs.rhs.rhs.lhs = fadd double %val__id.i233.rhs.rhs.lhs.rhs.rhs.lhs.lhs, %val__id.i287 + %val__id.i233.rhs.rhs.lhs.rhs.rhs = fadd double %val__id.i233.rhs.rhs.lhs.rhs.rhs.lhs, %val__id.i282 + %val__id.i233.rhs.rhs.lhs.rhs = fmul double %coef_load145_offset_load.i, %val__id.i233.rhs.rhs.lhs.rhs.rhs + %val__id.i233.rhs.rhs.lhs = fadd double %val__id.i233.rhs.rhs.lhs.lhs, %val__id.i233.rhs.rhs.lhs.rhs + %val__id.i233.rhs.rhs.rhs.rhs.lhs.lhs.lhs.lhs = fadd double %val__id.i277, %val__id.i272 + %val__id.i233.rhs.rhs.rhs.rhs.lhs.lhs.lhs = fadd double %val__id.i233.rhs.rhs.rhs.rhs.lhs.lhs.lhs.lhs, %val__id.i267 + %val__id.i233.rhs.rhs.rhs.rhs.lhs.lhs = fadd double %val__id.i233.rhs.rhs.rhs.rhs.lhs.lhs.lhs, %val__id.i262 + %val__id.i233.rhs.rhs.rhs.rhs.lhs = fadd double %val__id.i233.rhs.rhs.rhs.rhs.lhs.lhs, %val__id.i257 + %val__id.i233.rhs.rhs.rhs.rhs = fadd double %val__id.i233.rhs.rhs.rhs.rhs.lhs, %val__id.i252 + %val__id.i233.rhs.rhs.rhs = fmul double %coef_load196_offset_load.i, 
%val__id.i233.rhs.rhs.rhs.rhs + %val__id.i233.rhs.rhs = fadd double %val__id.i233.rhs.rhs.lhs, %val__id.i233.rhs.rhs.rhs + %val__id.i233.rhs = fmul double %val__id.i237, %val__id.i233.rhs.rhs + %val__id.i233 = fadd double %val__id.i233.lhs, %val__id.i233.rhs + store double %val__id.i233, double* %ptr__id.i241, align 8 + %new_counter279.i = add i32 %counter32.4.i386, 32 + %before_aligned_end73.i = icmp slt i32 %new_counter279.i, %aligned_end31.i + br i1 %before_aligned_end73.i, label %foreach_full_body.i, label %partial_inner_all_outer.i + +foreach_test21.i.preheader: ; preds = %foreach_reset19.i, %foreach_test21.i.preheader.lr.ph + %iter_val.i398 = phi <1 x i32> [ %iter_val.i395, %foreach_test21.i.preheader.lr.ph ], [ %iter_val.i, %foreach_reset19.i ] + %counter.0.i397 = phi i32 [ %add_z0_load_mul_calltmp47_, %foreach_test21.i.preheader.lr.ph ], [ %new_counter.i, %foreach_reset19.i ] + %tid.i3.i = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + %__laneidx47.i = and i32 %tid.i3.i, 31 + %36 = zext i32 %__laneidx47.i to i64 + %arrayidx48.i = getelementptr [32 x i8]* @constDeltaForeach1, i64 0, i64 %36 + br i1 %cmp54.i390, label %outer_not_in_extras.i.preheader.lr.ph, label %foreach_reset19.i + +outer_not_in_extras.i.preheader.lr.ph: ; preds = %foreach_test21.i.preheader + %37 = load i8* %arrayidx48.i, align 1 + %_zext49.i388 = zext i8 %37 to i32 + %38 = insertelement <1 x i32> undef, i32 %_zext49.i388, i32 0 + %iter_val50.i389 = add <1 x i32> %smear_counter_init44.i387, %38 + %mul_z_load297_Nxy_load298_broadcast.i = mul <1 x i32> %iter_val.i398, %Nxy_load298_broadcast_init.i + br label %outer_not_in_extras.i.preheader + +foreach_reset19.i: ; preds = %foreach_reset27.i, %foreach_test21.i.preheader + %new_counter.i = add i32 %counter.0.i397, 1 + %smear_counter_init.i = insertelement <1 x i32> undef, i32 %new_counter.i, i32 0 + %39 = load i8* %arrayidx.i, align 1 + %_zext.i = zext i8 %39 to i32 + %40 = insertelement <1 x i32> undef, i32 %_zext.i, i32 0 + %iter_val.i = add <1 x i32> %smear_counter_init.i, %40 + %exitcond400 = icmp eq i32 %new_counter.i, %14 + br i1 %exitcond400, label %if_then, label %foreach_test21.i.preheader + +outer_not_in_extras.i.preheader: ; preds = %foreach_reset27.i, %outer_not_in_extras.i.preheader.lr.ph + %iter_val50.i392 = phi <1 x i32> [ %iter_val50.i389, %outer_not_in_extras.i.preheader.lr.ph ], [ %iter_val50.i, %foreach_reset27.i ] + %counter25.1.i391 = phi i32 [ %add_y0_load_mul_calltmp41_, %outer_not_in_extras.i.preheader.lr.ph ], [ %new_counter35.i, %foreach_reset27.i ] + br i1 %before_aligned_end73.i385, label %foreach_full_body.i, label %partial_inner_all_outer.i + +foreach_reset27.i: ; preds = %pl_dolane.i, %partial_inner_only.i, %partial_inner_all_outer.i + %new_counter35.i = add i32 %counter25.1.i391, 1 + %smear_counter_init44.i = insertelement <1 x i32> undef, i32 %new_counter35.i, i32 0 + %41 = load i8* %arrayidx48.i, align 1 + %_zext49.i = zext i8 %41 to i32 + %42 = insertelement <1 x i32> undef, i32 %_zext49.i, i32 0 + %iter_val50.i = add <1 x i32> %smear_counter_init44.i, %42 + %exitcond = icmp eq i32 %new_counter35.i, %8 + br i1 %exitcond, label %foreach_reset19.i, label %outer_not_in_extras.i.preheader + +partial_inner_all_outer.i: ; preds = %outer_not_in_extras.i.preheader, %foreach_full_body.i + %counter32.4.i.lcssa = phi i32 [ %add_x0_load_mul_calltmp35_, %outer_not_in_extras.i.preheader ], [ %new_counter279.i, %foreach_full_body.i ] + %before_full_end.i = icmp slt i32 %counter32.4.i.lcssa, %r.i.i + br i1 %before_full_end.i, label 
%partial_inner_only.i, label %foreach_reset27.i + +partial_inner_only.i: ; preds = %partial_inner_all_outer.i + %smear_counter_init282.i = insertelement <1 x i32> undef, i32 %counter32.4.i.lcssa, i32 0 + %tid.i2.i = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + %__laneidx285.i = and i32 %tid.i2.i, 31 + %43 = zext i32 %__laneidx285.i to i64 + %arrayidx286.i = getelementptr [32 x i8]* @constDeltaForeach4, i64 0, i64 %43 + %44 = load i8* %arrayidx286.i, align 1 + %_zext287.i = zext i8 %44 to i32 + %45 = insertelement <1 x i32> undef, i32 %_zext287.i, i32 0 + %iter_val288.i = add <1 x i32> %smear_counter_init282.i, %45 + %cmp291.i = icmp slt <1 x i32> %iter_val288.i, %smear_end_init289.i + %mul_y_load299_Nx_load300_broadcast.i = mul <1 x i32> %iter_val50.i392, %Nx_load300_broadcast_init.i + %add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast.i = add <1 x i32> %mul_z_load297_Nxy_load298_broadcast.i, %mul_y_load299_Nx_load300_broadcast.i + %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i = add <1 x i32> %add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast.i, %iter_val288.i + %v.i.i224 = extractelement <1 x i1> %cmp291.i, i32 0 + br i1 %v.i.i224, label %pl_dolane.i, label %foreach_reset27.i + +pl_dolane.i: ; preds = %partial_inner_only.i + %coef_load303_offset_load.i = load double* %coef, align 8 + %.lhs361 = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 + %46 = shl i32 %.lhs361, 3 + %iptr__id.i225.rhs = sext i32 %46 to i64 + %iptr__id.i225 = add i64 %iptr__id.i225.rhs, %Ain_load309_ptr2int.i + %ptr__id.i226 = inttoptr i64 %iptr__id.i225 to double* + %val__id.i227 = load double* %ptr__id.i226, align 8 + %coef_load314_offset_load.i401 = load double* %coef_load314_offset.i, align 8 + %.lhs360.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 + %.lhs360 = shl i32 %.lhs360.lhs, 3 + %47 = add i32 %.lhs360, 8 + %iptr__id.i218.rhs = sext i32 %47 to i64 + %iptr__id.i218 = add i64 %iptr__id.i218.rhs, %Ain_load309_ptr2int.i + %ptr__id.i219 = inttoptr i64 %iptr__id.i218 to double* + %val__id.i220 = load double* %ptr__id.i219, align 8 + %.lhs359.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 + %.lhs359 = shl i32 %.lhs359.lhs, 3 + %48 = add i32 %.lhs359, -8 + %iptr__id.i211.rhs = sext i32 %48 to i64 + %iptr__id.i211 = add i64 %iptr__id.i211.rhs, %Ain_load309_ptr2int.i + %ptr__id.i212 = inttoptr i64 %iptr__id.i211 to double* + %val__id.i213 = load double* %ptr__id.i212, align 8 + %.lhs358.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 + %.lhs358 = add i32 %.lhs358.lhs, %Nx + %49 = shl i32 %.lhs358, 3 + %iptr__id.i204.rhs = sext i32 %49 to i64 + %iptr__id.i204 = add i64 %iptr__id.i204.rhs, %Ain_load309_ptr2int.i + %ptr__id.i205 = inttoptr i64 %iptr__id.i204 to double* + %val__id.i206 = load double* %ptr__id.i205, align 8 + %.lhs357.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 + %.lhs357 = sub i32 %.lhs357.lhs, %Nx + %50 = shl i32 %.lhs357, 3 + %iptr__id.i197.rhs = sext i32 %50 to i64 + %iptr__id.i197 = add i64 %iptr__id.i197.rhs, %Ain_load309_ptr2int.i + %ptr__id.i198 = inttoptr i64 %iptr__id.i197 to double* + 
%val__id.i199 = load double* %ptr__id.i198, align 8 + %.lhs356.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 + %.lhs356 = add i32 %.lhs356.lhs, %mul_Nx_load_Ny_load.i + %51 = shl i32 %.lhs356, 3 + %iptr__id.i190.rhs = sext i32 %51 to i64 + %iptr__id.i190 = add i64 %iptr__id.i190.rhs, %Ain_load309_ptr2int.i + %ptr__id.i191 = inttoptr i64 %iptr__id.i190 to double* + %val__id.i192 = load double* %ptr__id.i191, align 8 + %.lhs355.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 + %.lhs355 = sub i32 %.lhs355.lhs, %mul_Nx_load_Ny_load.i + %52 = shl i32 %.lhs355, 3 + %iptr__id.i183.rhs = sext i32 %52 to i64 + %iptr__id.i183 = add i64 %iptr__id.i183.rhs, %Ain_load309_ptr2int.i + %ptr__id.i184 = inttoptr i64 %iptr__id.i183 to double* + %val__id.i185 = load double* %ptr__id.i184, align 8 + %coef_load365_offset_load.i457 = load double* %coef_load365_offset.i, align 8 + %.lhs354.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 + %.lhs354 = shl i32 %.lhs354.lhs, 3 + %53 = add i32 %.lhs354, 16 + %iptr__id.i176.rhs = sext i32 %53 to i64 + %iptr__id.i176 = add i64 %iptr__id.i176.rhs, %Ain_load309_ptr2int.i + %ptr__id.i177 = inttoptr i64 %iptr__id.i176 to double* + %val__id.i178 = load double* %ptr__id.i177, align 8 + %.lhs353.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 + %.lhs353 = shl i32 %.lhs353.lhs, 3 + %54 = add i32 %.lhs353, -16 + %iptr__id.i169.rhs = sext i32 %54 to i64 + %iptr__id.i169 = add i64 %iptr__id.i169.rhs, %Ain_load309_ptr2int.i + %ptr__id.i170 = inttoptr i64 %iptr__id.i169 to double* + %val__id.i171 = load double* %ptr__id.i170, align 8 + %.lhs352.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 + %.lhs352 = add i32 %.lhs352.lhs, %mul__Nx_load385.i + %55 = shl i32 %.lhs352, 3 + %iptr__id.i162.rhs = sext i32 %55 to i64 + %iptr__id.i162 = add i64 %iptr__id.i162.rhs, %Ain_load309_ptr2int.i + %ptr__id.i163 = inttoptr i64 %iptr__id.i162 to double* + %val__id.i164 = load double* %ptr__id.i163, align 8 + %.lhs351.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 + %.lhs351 = add i32 %.lhs351.lhs, %mul__Nx_load393.i + %56 = shl i32 %.lhs351, 3 + %iptr__id.i155.rhs = sext i32 %56 to i64 + %iptr__id.i155 = add i64 %iptr__id.i155.rhs, %Ain_load309_ptr2int.i + %ptr__id.i156 = inttoptr i64 %iptr__id.i155 to double* + %val__id.i157 = load double* %ptr__id.i156, align 8 + %.lhs350.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 + %.lhs350 = add i32 %.lhs350.lhs, %mul__Nxy_load402.i + %57 = shl i32 %.lhs350, 3 + %iptr__id.i148.rhs = sext i32 %57 to i64 + %iptr__id.i148 = add i64 %iptr__id.i148.rhs, %Ain_load309_ptr2int.i + %ptr__id.i149 = inttoptr i64 %iptr__id.i148 to double* + %val__id.i150 = load double* %ptr__id.i149, align 8 + %.lhs349.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 + %.lhs349 = add i32 %.lhs349.lhs, %mul__Nxy_load410.i + %58 = shl i32 %.lhs349, 3 + %iptr__id.i141.rhs = sext i32 %58 to i64 + %iptr__id.i141 = add i64 %iptr__id.i141.rhs, 
%Ain_load309_ptr2int.i + %ptr__id.i142 = inttoptr i64 %iptr__id.i141 to double* + %val__id.i143 = load double* %ptr__id.i142, align 8 + %coef_load416_offset_load.i544 = load double* %coef_load416_offset.i, align 8 + %.lhs348.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 + %.lhs348 = shl i32 %.lhs348.lhs, 3 + %59 = add i32 %.lhs348, 24 + %iptr__id.i134.rhs = sext i32 %59 to i64 + %iptr__id.i134 = add i64 %iptr__id.i134.rhs, %Ain_load309_ptr2int.i + %ptr__id.i135 = inttoptr i64 %iptr__id.i134 to double* + %val__id.i136 = load double* %ptr__id.i135, align 8 + %.lhs347.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 + %.lhs347 = shl i32 %.lhs347.lhs, 3 + %60 = add i32 %.lhs347, -24 + %iptr__id.i127.rhs = sext i32 %60 to i64 + %iptr__id.i127 = add i64 %iptr__id.i127.rhs, %Ain_load309_ptr2int.i + %ptr__id.i128 = inttoptr i64 %iptr__id.i127 to double* + %val__id.i129 = load double* %ptr__id.i128, align 8 + %.lhs346.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 + %.lhs346 = add i32 %.lhs346.lhs, %mul__Nx_load436.i + %61 = shl i32 %.lhs346, 3 + %iptr__id.i120.rhs = sext i32 %61 to i64 + %iptr__id.i120 = add i64 %iptr__id.i120.rhs, %Ain_load309_ptr2int.i + %ptr__id.i121 = inttoptr i64 %iptr__id.i120 to double* + %val__id.i122 = load double* %ptr__id.i121, align 8 + %.lhs345.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 + %.lhs345 = add i32 %.lhs345.lhs, %mul__Nx_load444.i + %62 = shl i32 %.lhs345, 3 + %iptr__id.i113.rhs = sext i32 %62 to i64 + %iptr__id.i113 = add i64 %iptr__id.i113.rhs, %Ain_load309_ptr2int.i + %ptr__id.i114 = inttoptr i64 %iptr__id.i113 to double* + %val__id.i115 = load double* %ptr__id.i114, align 8 + %.lhs344.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 + %.lhs344 = add i32 %.lhs344.lhs, %mul__Nxy_load453.i + %63 = shl i32 %.lhs344, 3 + %iptr__id.i106.rhs = sext i32 %63 to i64 + %iptr__id.i106 = add i64 %iptr__id.i106.rhs, %Ain_load309_ptr2int.i + %ptr__id.i107 = inttoptr i64 %iptr__id.i106 to double* + %val__id.i108 = load double* %ptr__id.i107, align 8 + %.lhs343.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 + %.lhs343 = add i32 %.lhs343.lhs, %mul__Nxy_load461.i + %64 = shl i32 %.lhs343, 3 + %iptr__id.i99.rhs = sext i32 %64 to i64 + %iptr__id.i99 = add i64 %iptr__id.i99.rhs, %Ain_load309_ptr2int.i + %ptr__id.i100 = inttoptr i64 %iptr__id.i99 to double* + %val__id.i101 = load double* %ptr__id.i100, align 8 + %.lhs342 = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 + %65 = shl i32 %.lhs342, 3 + %iptr__id.i92.rhs = sext i32 %65 to i64 + %iptr__id.i92 = add i64 %iptr__id.i92.rhs, %Ain_load309_ptr2int.i + %ptr__id.i93 = inttoptr i64 %iptr__id.i92 to double* + %val__id.i94 = load double* %ptr__id.i93, align 8 + %.lhs341 = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 + %66 = shl i32 %.lhs341, 3 + %iptr__id.i85.rhs = sext i32 %66 to i64 + %iptr__id.i85 = add i64 %iptr__id.i85.rhs, %Aout_load470_ptr2int.i + %ptr__id.i86 = inttoptr i64 
%iptr__id.i85 to double* + %val__id.i87 = load double* %ptr__id.i86, align 8 + %.lhs340 = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 + %67 = shl i32 %.lhs340, 3 + %iptr__id.i80.rhs = sext i32 %67 to i64 + %iptr__id.i80 = add i64 %iptr__id.i80.rhs, %vsq_load488_ptr2int.i + %ptr__id.i81 = inttoptr i64 %iptr__id.i80 to double* + %val__id.i82 = load double* %ptr__id.i81, align 8 + %.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 + %68 = shl i32 %.lhs, 3 + %iptr__id.i76.rhs = sext i32 %68 to i64 + %iptr__id.i76 = add i64 %iptr__id.i76.rhs, %Aout_load470_ptr2int.i + %ptr__id.i77 = inttoptr i64 %iptr__id.i76 to double* + %val__id.i78.lhs.lhs = fmul double %val__id.i94, 2.000000e+00 + %val__id.i78.lhs = fsub double %val__id.i78.lhs.lhs, %val__id.i87 + %val__id.i78.rhs.rhs.lhs.lhs.lhs = fmul double %coef_load303_offset_load.i, %val__id.i227 + %val__id.i78.rhs.rhs.lhs.lhs.rhs.rhs.lhs.lhs.lhs.lhs = fadd double %val__id.i220, %val__id.i213 + %val__id.i78.rhs.rhs.lhs.lhs.rhs.rhs.lhs.lhs.lhs = fadd double %val__id.i78.rhs.rhs.lhs.lhs.rhs.rhs.lhs.lhs.lhs.lhs, %val__id.i206 + %val__id.i78.rhs.rhs.lhs.lhs.rhs.rhs.lhs.lhs = fadd double %val__id.i78.rhs.rhs.lhs.lhs.rhs.rhs.lhs.lhs.lhs, %val__id.i199 + %val__id.i78.rhs.rhs.lhs.lhs.rhs.rhs.lhs = fadd double %val__id.i78.rhs.rhs.lhs.lhs.rhs.rhs.lhs.lhs, %val__id.i192 + %val__id.i78.rhs.rhs.lhs.lhs.rhs.rhs = fadd double %val__id.i78.rhs.rhs.lhs.lhs.rhs.rhs.lhs, %val__id.i185 + %val__id.i78.rhs.rhs.lhs.lhs.rhs = fmul double %coef_load314_offset_load.i401, %val__id.i78.rhs.rhs.lhs.lhs.rhs.rhs + %val__id.i78.rhs.rhs.lhs.lhs = fadd double %val__id.i78.rhs.rhs.lhs.lhs.lhs, %val__id.i78.rhs.rhs.lhs.lhs.rhs + %val__id.i78.rhs.rhs.lhs.rhs.rhs.lhs.lhs.lhs.lhs = fadd double %val__id.i178, %val__id.i171 + %val__id.i78.rhs.rhs.lhs.rhs.rhs.lhs.lhs.lhs = fadd double %val__id.i78.rhs.rhs.lhs.rhs.rhs.lhs.lhs.lhs.lhs, %val__id.i164 + %val__id.i78.rhs.rhs.lhs.rhs.rhs.lhs.lhs = fadd double %val__id.i78.rhs.rhs.lhs.rhs.rhs.lhs.lhs.lhs, %val__id.i157 + %val__id.i78.rhs.rhs.lhs.rhs.rhs.lhs = fadd double %val__id.i78.rhs.rhs.lhs.rhs.rhs.lhs.lhs, %val__id.i150 + %val__id.i78.rhs.rhs.lhs.rhs.rhs = fadd double %val__id.i78.rhs.rhs.lhs.rhs.rhs.lhs, %val__id.i143 + %val__id.i78.rhs.rhs.lhs.rhs = fmul double %coef_load365_offset_load.i457, %val__id.i78.rhs.rhs.lhs.rhs.rhs + %val__id.i78.rhs.rhs.lhs = fadd double %val__id.i78.rhs.rhs.lhs.lhs, %val__id.i78.rhs.rhs.lhs.rhs + %val__id.i78.rhs.rhs.rhs.rhs.lhs.lhs.lhs.lhs = fadd double %val__id.i136, %val__id.i129 + %val__id.i78.rhs.rhs.rhs.rhs.lhs.lhs.lhs = fadd double %val__id.i78.rhs.rhs.rhs.rhs.lhs.lhs.lhs.lhs, %val__id.i122 + %val__id.i78.rhs.rhs.rhs.rhs.lhs.lhs = fadd double %val__id.i78.rhs.rhs.rhs.rhs.lhs.lhs.lhs, %val__id.i115 + %val__id.i78.rhs.rhs.rhs.rhs.lhs = fadd double %val__id.i78.rhs.rhs.rhs.rhs.lhs.lhs, %val__id.i108 + %val__id.i78.rhs.rhs.rhs.rhs = fadd double %val__id.i78.rhs.rhs.rhs.rhs.lhs, %val__id.i101 + %val__id.i78.rhs.rhs.rhs = fmul double %coef_load416_offset_load.i544, %val__id.i78.rhs.rhs.rhs.rhs + %val__id.i78.rhs.rhs = fadd double %val__id.i78.rhs.rhs.lhs, %val__id.i78.rhs.rhs.rhs + %val__id.i78.rhs = fmul double %val__id.i78.rhs.rhs, %val__id.i82 + %val__id.i78 = fadd double %val__id.i78.lhs, %val__id.i78.rhs + store double %val__id.i78, double* %ptr__id.i77, align 8 + br label %foreach_reset27.i +} + +define void 
@loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E_(i32 %t0, i32 %t1, i32 %x0, i32 %x1, i32 %y0, i32 %y1, i32 %z0, i32 %z1, i32 %Nx, i32 %Ny, i32 %Nz, double* %coef, double* %vsq, double* %Aeven, double* %Aodd, <1 x i1> %__mask) { +allocas: + %less_t_load_t1_load94 = icmp slt i32 %t0, %t1 + br i1 %less_t_load_t1_load94, label %for_loop.lr.ph, label %for_exit + +for_loop.lr.ph: ; preds = %allocas + %add_sub_x1_load21_x0_load22_ = sub i32 31, %x0 + %sub_add_sub_x1_load21_x0_load22__ = add i32 %add_sub_x1_load21_x0_load22_, %x1 + %div_sub_add_sub_x1_load21_x0_load22___ = sdiv i32 %sub_add_sub_x1_load21_x0_load22__, 32 + %add_sub_y1_load23_y0_load24_ = sub i32 7, %y0 + %sub_add_sub_y1_load23_y0_load24__ = add i32 %add_sub_y1_load23_y0_load24_, %y1 + %div_sub_add_sub_y1_load23_y0_load24___ = sdiv i32 %sub_add_sub_y1_load23_y0_load24__, 8 + %add_sub_z1_load25_z0_load26_ = sub i32 7, %z0 + %sub_add_sub_z1_load25_z0_load26__ = add i32 %add_sub_z1_load25_z0_load26_, %z1 + %div_sub_add_sub_z1_load25_z0_load26___ = sdiv i32 %sub_add_sub_z1_load25_z0_load26__, 8 + %ntxm1.i = add nsw i32 %div_sub_add_sub_x1_load21_x0_load22___, -1 + %ntxm1d4.i = ashr i32 %ntxm1.i, 2 + %nbx.i = add nsw i32 %ntxm1d4.i, 1 + br label %for_loop + +for_loop: ; preds = %if_exit, %for_loop.lr.ph + %t.095 = phi i32 [ %t0, %for_loop.lr.ph ], [ %t_load78_plus1, %if_exit ] + %bitop = and i32 %t.095, 1 + %equal_bitop_ = icmp eq i32 %bitop, 0 + %tid.i.i = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + %and.i = and i32 %tid.i.i, 31 + %cmp.i = icmp eq i32 %and.i, 0 + br i1 %cmp.i, label %if.then.i, label %ISPCGetParamBuffer.exit + +if.then.i: ; preds = %for_loop + %ptri64tmp.i = tail call i64 @cudaGetParameterBuffer(i64 8, i64 72) + %phitmp.i = inttoptr i64 %ptri64tmp.i to i8* + br label %ISPCGetParamBuffer.exit + +ISPCGetParamBuffer.exit: ; preds = %if.then.i, %for_loop + %ptri64.i = phi i8* [ %phitmp.i, %if.then.i ], [ null, %for_loop ] + %cmp1 = icmp eq i8* %ptri64.i, null + br i1 %equal_bitop_, label %if_then, label %if_else + +for_exit: ; preds = %if_exit, %allocas + %0 = tail call i32 @cudaDeviceSynchronize() + ret void + +if_then: ; preds = %ISPCGetParamBuffer.exit + br i1 %cmp1, label %if_false, label %if_true + +if_else: ; preds = %ISPCGetParamBuffer.exit + br i1 %cmp1, label %if_false62, label %if_true61 + +if_exit: ; preds = %if.then.i92, %if_false62, %if.then.i83, %if_false + %1 = tail call i32 @cudaDeviceSynchronize() + %t_load78_plus1 = add i32 %t.095, 1 + %exitcond = icmp eq i32 %t_load78_plus1, %t1 + br i1 %exitcond, label %for_exit, label %for_loop + +if_true: ; preds = %if_then + %funarg = bitcast i8* %ptri64.i to i32* + store i32 %x0, i32* %funarg, align 4 + %funarg27 = getelementptr i8* %ptri64.i, i64 4 + %2 = bitcast i8* %funarg27 to i32* + store i32 %x1, i32* %2, align 4 + %funarg28 = getelementptr i8* %ptri64.i, i64 8 + %3 = bitcast i8* %funarg28 to i32* + store i32 %y0, i32* %3, align 4 + %funarg29 = getelementptr i8* %ptri64.i, i64 12 + %4 = bitcast i8* %funarg29 to i32* + store i32 %y1, i32* %4, align 4 + %funarg30 = getelementptr i8* %ptri64.i, i64 16 + %5 = bitcast i8* %funarg30 to i32* + store i32 %z0, i32* %5, align 4 + %funarg31 = getelementptr i8* %ptri64.i, i64 20 + %6 = bitcast i8* %funarg31 to i32* + store i32 %z1, i32* %6, align 4 + %funarg32 = getelementptr i8* %ptri64.i, i64 24 + %7 = bitcast i8* %funarg32 to i32* + store i32 %Nx, i32* %7, align 4 + %funarg33 = getelementptr i8* %ptri64.i, i64 28 + %8 = bitcast i8* %funarg33 to i32* 
+ store i32 %Ny, i32* %8, align 4 + %funarg34 = getelementptr i8* %ptri64.i, i64 32 + %9 = bitcast i8* %funarg34 to i32* + store i32 %Nz, i32* %9, align 4 + %funarg35 = getelementptr i8* %ptri64.i, i64 40 + %10 = bitcast i8* %funarg35 to double** + store double* %coef, double** %10, align 8 + %funarg36 = getelementptr i8* %ptri64.i, i64 48 + %11 = bitcast i8* %funarg36 to double** + store double* %vsq, double** %11, align 8 + %funarg37 = getelementptr i8* %ptri64.i, i64 56 + %12 = bitcast i8* %funarg37 to double** + store double* %Aeven, double** %12, align 8 + %funarg38 = getelementptr i8* %ptri64.i, i64 64 + %13 = bitcast i8* %funarg38 to double** + store double* %Aodd, double** %13, align 8 + br label %if_false + +if_false: ; preds = %if_true, %if_then + %tid.i.i80 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + %and.i81 = and i32 %tid.i.i80, 31 + %cmp.i82 = icmp eq i32 %and.i81, 0 + br i1 %cmp.i82, label %if.then.i83, label %if_exit + +if.then.i83: ; preds = %if_false + %args_i64.i = ptrtoint i8* %ptri64.i to i64 + %res_tmp.i = tail call i32 asm sideeffect "{\0A .param .b64 param0;\0A st.param.b64\09[param0+0], $1;\0A .param .b64 param1;\0A st.param.b64\09[param1+0], $2;\0A .param .align 4 .b8 param2[12];\0A st.param.b32\09[param2+0], $3; \0A st.param.b32\09[param2+4], $4; \0A st.param.b32\09[param2+8], $5; \0A .param .align 4 .b8 param3[12];\0A st.param.b32\09[param3+0], $6; \0A st.param.b32\09[param3+4], $7; \0A st.param.b32\09[param3+8], $8; \0A .param .b32 param4;\0A st.param.b32\09[param4+0], $9; \0A .param .b64 param5;\0A st.param.b64\09[param5+0], $10; \0A\0A .param .b32 retval0;\0A call.uni (retval0), \0A cudaLaunchDevice,\0A (\0A param0, \0A param1, \0A param2, \0A param3, \0A param4, \0A param5\0A );\0A ld.param.b32\09$0, [retval0+0];\0A }\0A ", "=r, l,l, r,r,r, r,r,r, r,l"(i64 ptrtoint (void (i32, i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*)* @stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_ to i64), i64 %args_i64.i, i32 %nbx.i, i32 %div_sub_add_sub_y1_load23_y0_load24___, i32 %div_sub_add_sub_z1_load25_z0_load26___, i32 128, i32 1, i32 1, i32 0, i64 0) + br label %if_exit + +if_true61: ; preds = %if_else + %funarg64 = bitcast i8* %ptri64.i to i32* + store i32 %x0, i32* %funarg64, align 4 + %funarg65 = getelementptr i8* %ptri64.i, i64 4 + %14 = bitcast i8* %funarg65 to i32* + store i32 %x1, i32* %14, align 4 + %funarg66 = getelementptr i8* %ptri64.i, i64 8 + %15 = bitcast i8* %funarg66 to i32* + store i32 %y0, i32* %15, align 4 + %funarg67 = getelementptr i8* %ptri64.i, i64 12 + %16 = bitcast i8* %funarg67 to i32* + store i32 %y1, i32* %16, align 4 + %funarg68 = getelementptr i8* %ptri64.i, i64 16 + %17 = bitcast i8* %funarg68 to i32* + store i32 %z0, i32* %17, align 4 + %funarg69 = getelementptr i8* %ptri64.i, i64 20 + %18 = bitcast i8* %funarg69 to i32* + store i32 %z1, i32* %18, align 4 + %funarg70 = getelementptr i8* %ptri64.i, i64 24 + %19 = bitcast i8* %funarg70 to i32* + store i32 %Nx, i32* %19, align 4 + %funarg71 = getelementptr i8* %ptri64.i, i64 28 + %20 = bitcast i8* %funarg71 to i32* + store i32 %Ny, i32* %20, align 4 + %funarg72 = getelementptr i8* %ptri64.i, i64 32 + %21 = bitcast i8* %funarg72 to i32* + store i32 %Nz, i32* %21, align 4 + %funarg73 = getelementptr i8* %ptri64.i, i64 40 + %22 = bitcast i8* %funarg73 to double** + store double* %coef, double** %22, align 8 + %funarg74 = getelementptr i8* %ptri64.i, i64 48 + %23 = bitcast i8* %funarg74 to double** + store 
double* %vsq, double** %23, align 8 + %funarg75 = getelementptr i8* %ptri64.i, i64 56 + %24 = bitcast i8* %funarg75 to double** + store double* %Aodd, double** %24, align 8 + %funarg76 = getelementptr i8* %ptri64.i, i64 64 + %25 = bitcast i8* %funarg76 to double** + store double* %Aeven, double** %25, align 8 + br label %if_false62 + +if_false62: ; preds = %if_true61, %if_else + %tid.i.i84 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + %and.i85 = and i32 %tid.i.i84, 31 + %cmp.i86 = icmp eq i32 %and.i85, 0 + br i1 %cmp.i86, label %if.then.i92, label %if_exit + +if.then.i92: ; preds = %if_false62 + %args_i64.i90 = ptrtoint i8* %ptri64.i to i64 + %res_tmp.i91 = tail call i32 asm sideeffect "{\0A .param .b64 param0;\0A st.param.b64\09[param0+0], $1;\0A .param .b64 param1;\0A st.param.b64\09[param1+0], $2;\0A .param .align 4 .b8 param2[12];\0A st.param.b32\09[param2+0], $3; \0A st.param.b32\09[param2+4], $4; \0A st.param.b32\09[param2+8], $5; \0A .param .align 4 .b8 param3[12];\0A st.param.b32\09[param3+0], $6; \0A st.param.b32\09[param3+4], $7; \0A st.param.b32\09[param3+8], $8; \0A .param .b32 param4;\0A st.param.b32\09[param4+0], $9; \0A .param .b64 param5;\0A st.param.b64\09[param5+0], $10; \0A\0A .param .b32 retval0;\0A call.uni (retval0), \0A cudaLaunchDevice,\0A (\0A param0, \0A param1, \0A param2, \0A param3, \0A param4, \0A param5\0A );\0A ld.param.b32\09$0, [retval0+0];\0A }\0A ", "=r, l,l, r,r,r, r,r,r, r,l"(i64 ptrtoint (void (i32, i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*)* @stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_ to i64), i64 %args_i64.i90, i32 %nbx.i, i32 %div_sub_add_sub_y1_load23_y0_load24___, i32 %div_sub_add_sub_z1_load25_z0_load26___, i32 128, i32 1, i32 1, i32 0, i64 0) + br label %if_exit +} + +define void @loop_stencil_ispc_tasks(i32 %t0, i32 %t1, i32 %x0, i32 %x1, i32 %y0, i32 %y1, i32 %z0, i32 %z1, i32 %Nx, i32 %Ny, i32 %Nz, double* %coef, double* %vsq, double* %Aeven, double* %Aodd) { +allocas: + %less_t_load_t1_load94 = icmp slt i32 %t0, %t1 + br i1 %less_t_load_t1_load94, label %for_loop.lr.ph, label %for_exit + +for_loop.lr.ph: ; preds = %allocas + %add_sub_x1_load21_x0_load22_ = sub i32 31, %x0 + %sub_add_sub_x1_load21_x0_load22__ = add i32 %add_sub_x1_load21_x0_load22_, %x1 + %div_sub_add_sub_x1_load21_x0_load22___ = sdiv i32 %sub_add_sub_x1_load21_x0_load22__, 32 + %add_sub_y1_load23_y0_load24_ = sub i32 7, %y0 + %sub_add_sub_y1_load23_y0_load24__ = add i32 %add_sub_y1_load23_y0_load24_, %y1 + %div_sub_add_sub_y1_load23_y0_load24___ = sdiv i32 %sub_add_sub_y1_load23_y0_load24__, 8 + %add_sub_z1_load25_z0_load26_ = sub i32 7, %z0 + %sub_add_sub_z1_load25_z0_load26__ = add i32 %add_sub_z1_load25_z0_load26_, %z1 + %div_sub_add_sub_z1_load25_z0_load26___ = sdiv i32 %sub_add_sub_z1_load25_z0_load26__, 8 + %ntxm1.i = add nsw i32 %div_sub_add_sub_x1_load21_x0_load22___, -1 + %ntxm1d4.i = ashr i32 %ntxm1.i, 2 + %nbx.i = add nsw i32 %ntxm1d4.i, 1 + br label %for_loop + +for_loop: ; preds = %if_exit, %for_loop.lr.ph + %t.095 = phi i32 [ %t0, %for_loop.lr.ph ], [ %t_load78_plus1, %if_exit ] + %bitop = and i32 %t.095, 1 + %equal_bitop_ = icmp eq i32 %bitop, 0 + %tid.i.i = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + %and.i = and i32 %tid.i.i, 31 + %cmp.i = icmp eq i32 %and.i, 0 + br i1 %cmp.i, label %if.then.i, label %ISPCGetParamBuffer.exit + +if.then.i: ; preds = %for_loop + %ptri64tmp.i = tail call i64 @cudaGetParameterBuffer(i64 8, i64 72) + %phitmp.i = inttoptr i64 
%ptri64tmp.i to i8* + br label %ISPCGetParamBuffer.exit + +ISPCGetParamBuffer.exit: ; preds = %if.then.i, %for_loop + %ptri64.i = phi i8* [ %phitmp.i, %if.then.i ], [ null, %for_loop ] + %cmp1 = icmp eq i8* %ptri64.i, null + br i1 %equal_bitop_, label %if_then, label %if_else + +for_exit: ; preds = %if_exit, %allocas + %0 = tail call i32 @cudaDeviceSynchronize() + ret void + +if_then: ; preds = %ISPCGetParamBuffer.exit + br i1 %cmp1, label %if_false, label %if_true + +if_else: ; preds = %ISPCGetParamBuffer.exit + br i1 %cmp1, label %if_false62, label %if_true61 + +if_exit: ; preds = %if.then.i92, %if_false62, %if.then.i83, %if_false + %1 = tail call i32 @cudaDeviceSynchronize() + %t_load78_plus1 = add i32 %t.095, 1 + %exitcond = icmp eq i32 %t_load78_plus1, %t1 + br i1 %exitcond, label %for_exit, label %for_loop + +if_true: ; preds = %if_then + %funarg = bitcast i8* %ptri64.i to i32* + store i32 %x0, i32* %funarg, align 4 + %funarg27 = getelementptr i8* %ptri64.i, i64 4 + %2 = bitcast i8* %funarg27 to i32* + store i32 %x1, i32* %2, align 4 + %funarg28 = getelementptr i8* %ptri64.i, i64 8 + %3 = bitcast i8* %funarg28 to i32* + store i32 %y0, i32* %3, align 4 + %funarg29 = getelementptr i8* %ptri64.i, i64 12 + %4 = bitcast i8* %funarg29 to i32* + store i32 %y1, i32* %4, align 4 + %funarg30 = getelementptr i8* %ptri64.i, i64 16 + %5 = bitcast i8* %funarg30 to i32* + store i32 %z0, i32* %5, align 4 + %funarg31 = getelementptr i8* %ptri64.i, i64 20 + %6 = bitcast i8* %funarg31 to i32* + store i32 %z1, i32* %6, align 4 + %funarg32 = getelementptr i8* %ptri64.i, i64 24 + %7 = bitcast i8* %funarg32 to i32* + store i32 %Nx, i32* %7, align 4 + %funarg33 = getelementptr i8* %ptri64.i, i64 28 + %8 = bitcast i8* %funarg33 to i32* + store i32 %Ny, i32* %8, align 4 + %funarg34 = getelementptr i8* %ptri64.i, i64 32 + %9 = bitcast i8* %funarg34 to i32* + store i32 %Nz, i32* %9, align 4 + %funarg35 = getelementptr i8* %ptri64.i, i64 40 + %10 = bitcast i8* %funarg35 to double** + store double* %coef, double** %10, align 8 + %funarg36 = getelementptr i8* %ptri64.i, i64 48 + %11 = bitcast i8* %funarg36 to double** + store double* %vsq, double** %11, align 8 + %funarg37 = getelementptr i8* %ptri64.i, i64 56 + %12 = bitcast i8* %funarg37 to double** + store double* %Aeven, double** %12, align 8 + %funarg38 = getelementptr i8* %ptri64.i, i64 64 + %13 = bitcast i8* %funarg38 to double** + store double* %Aodd, double** %13, align 8 + br label %if_false + +if_false: ; preds = %if_true, %if_then + %tid.i.i80 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + %and.i81 = and i32 %tid.i.i80, 31 + %cmp.i82 = icmp eq i32 %and.i81, 0 + br i1 %cmp.i82, label %if.then.i83, label %if_exit + +if.then.i83: ; preds = %if_false + %args_i64.i = ptrtoint i8* %ptri64.i to i64 + %res_tmp.i = tail call i32 asm sideeffect "{\0A .param .b64 param0;\0A st.param.b64\09[param0+0], $1;\0A .param .b64 param1;\0A st.param.b64\09[param1+0], $2;\0A .param .align 4 .b8 param2[12];\0A st.param.b32\09[param2+0], $3; \0A st.param.b32\09[param2+4], $4; \0A st.param.b32\09[param2+8], $5; \0A .param .align 4 .b8 param3[12];\0A st.param.b32\09[param3+0], $6; \0A st.param.b32\09[param3+4], $7; \0A st.param.b32\09[param3+8], $8; \0A .param .b32 param4;\0A st.param.b32\09[param4+0], $9; \0A .param .b64 param5;\0A st.param.b64\09[param5+0], $10; \0A\0A .param .b32 retval0;\0A call.uni (retval0), \0A cudaLaunchDevice,\0A (\0A param0, \0A param1, \0A param2, \0A param3, \0A param4, \0A param5\0A );\0A ld.param.b32\09$0, [retval0+0];\0A }\0A ", "=r, l,l, 
r,r,r, r,r,r, r,l"(i64 ptrtoint (void (i32, i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*)* @stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_ to i64), i64 %args_i64.i, i32 %nbx.i, i32 %div_sub_add_sub_y1_load23_y0_load24___, i32 %div_sub_add_sub_z1_load25_z0_load26___, i32 128, i32 1, i32 1, i32 0, i64 0) + br label %if_exit + +if_true61: ; preds = %if_else + %funarg64 = bitcast i8* %ptri64.i to i32* + store i32 %x0, i32* %funarg64, align 4 + %funarg65 = getelementptr i8* %ptri64.i, i64 4 + %14 = bitcast i8* %funarg65 to i32* + store i32 %x1, i32* %14, align 4 + %funarg66 = getelementptr i8* %ptri64.i, i64 8 + %15 = bitcast i8* %funarg66 to i32* + store i32 %y0, i32* %15, align 4 + %funarg67 = getelementptr i8* %ptri64.i, i64 12 + %16 = bitcast i8* %funarg67 to i32* + store i32 %y1, i32* %16, align 4 + %funarg68 = getelementptr i8* %ptri64.i, i64 16 + %17 = bitcast i8* %funarg68 to i32* + store i32 %z0, i32* %17, align 4 + %funarg69 = getelementptr i8* %ptri64.i, i64 20 + %18 = bitcast i8* %funarg69 to i32* + store i32 %z1, i32* %18, align 4 + %funarg70 = getelementptr i8* %ptri64.i, i64 24 + %19 = bitcast i8* %funarg70 to i32* + store i32 %Nx, i32* %19, align 4 + %funarg71 = getelementptr i8* %ptri64.i, i64 28 + %20 = bitcast i8* %funarg71 to i32* + store i32 %Ny, i32* %20, align 4 + %funarg72 = getelementptr i8* %ptri64.i, i64 32 + %21 = bitcast i8* %funarg72 to i32* + store i32 %Nz, i32* %21, align 4 + %funarg73 = getelementptr i8* %ptri64.i, i64 40 + %22 = bitcast i8* %funarg73 to double** + store double* %coef, double** %22, align 8 + %funarg74 = getelementptr i8* %ptri64.i, i64 48 + %23 = bitcast i8* %funarg74 to double** + store double* %vsq, double** %23, align 8 + %funarg75 = getelementptr i8* %ptri64.i, i64 56 + %24 = bitcast i8* %funarg75 to double** + store double* %Aodd, double** %24, align 8 + %funarg76 = getelementptr i8* %ptri64.i, i64 64 + %25 = bitcast i8* %funarg76 to double** + store double* %Aeven, double** %25, align 8 + br label %if_false62 + +if_false62: ; preds = %if_true61, %if_else + %tid.i.i84 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + %and.i85 = and i32 %tid.i.i84, 31 + %cmp.i86 = icmp eq i32 %and.i85, 0 + br i1 %cmp.i86, label %if.then.i92, label %if_exit + +if.then.i92: ; preds = %if_false62 + %args_i64.i90 = ptrtoint i8* %ptri64.i to i64 + %res_tmp.i91 = tail call i32 asm sideeffect "{\0A .param .b64 param0;\0A st.param.b64\09[param0+0], $1;\0A .param .b64 param1;\0A st.param.b64\09[param1+0], $2;\0A .param .align 4 .b8 param2[12];\0A st.param.b32\09[param2+0], $3; \0A st.param.b32\09[param2+4], $4; \0A st.param.b32\09[param2+8], $5; \0A .param .align 4 .b8 param3[12];\0A st.param.b32\09[param3+0], $6; \0A st.param.b32\09[param3+4], $7; \0A st.param.b32\09[param3+8], $8; \0A .param .b32 param4;\0A st.param.b32\09[param4+0], $9; \0A .param .b64 param5;\0A st.param.b64\09[param5+0], $10; \0A\0A .param .b32 retval0;\0A call.uni (retval0), \0A cudaLaunchDevice,\0A (\0A param0, \0A param1, \0A param2, \0A param3, \0A param4, \0A param5\0A );\0A ld.param.b32\09$0, [retval0+0];\0A }\0A ", "=r, l,l, r,r,r, r,r,r, r,l"(i64 ptrtoint (void (i32, i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*)* @stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_ to i64), i64 %args_i64.i90, i32 %nbx.i, i32 %div_sub_add_sub_y1_load23_y0_load24___, i32 %div_sub_add_sub_z1_load25_z0_load26___, i32 128, i32 1, i32 1, 
i32 0, i64 0) + br label %if_exit +} + +!llvm.ident = !{!0} +!nvvm.annotations = !{!1, !2} + +!0 = metadata !{metadata !"clang version 3.4 (trunk 194723)"} +!1 = metadata !{void (i32, i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*)* @stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_, metadata !"kernel", i32 1} +!2 = metadata !{void (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*)* @loop_stencil_ispc_tasks, metadata !"kernel", i32 1} diff --git a/examples_cuda/stencil/stencil_ispc_nvptx64.ptx b/examples_cuda/stencil/stencil_ispc_nvptx64.ptx new file mode 100644 index 00000000..b0339cbf --- /dev/null +++ b/examples_cuda/stencil/stencil_ispc_nvptx64.ptx @@ -0,0 +1,1246 @@ +// +// Generated by NVIDIA NVVM Compiler +// Compiler built on Thu Jul 18 02:37:37 2013 (1374107857) +// Cuda compilation tools, release 5.5, V5.5.0 +// + +.version 3.2 +.target sm_35 +.address_size 64 + + +.extern .func (.param .b32 func_retval0) cudaLaunchDevice +( + .param .b64 cudaLaunchDevice_param_0, + .param .b64 cudaLaunchDevice_param_1, + .param .align 4 .b8 cudaLaunchDevice_param_2[12], + .param .align 4 .b8 cudaLaunchDevice_param_3[12], + .param .b32 cudaLaunchDevice_param_4, + .param .b64 cudaLaunchDevice_param_5 +); + + +.extern .func (.param .b64 func_retval0) cudaGetParameterBuffer +( + .param .b64 cudaGetParameterBuffer_param_0, + .param .b64 cudaGetParameterBuffer_param_1 +) +; +.extern .func (.param .b32 func_retval0) cudaDeviceSynchronize +( + +) +; +.global .align 1 .b8 constDeltaForeach1[32]; +.global .align 1 .b8 constDeltaForeach4[32] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}; + +.visible .func (.param .b32 func_retval0) __shfl_i32( + .param .b32 __shfl_i32_param_0, + .param .b32 __shfl_i32_param_1 +) +{ + .reg .s32 %r<4>; + + + ld.param.u32 %r2, [__shfl_i32_param_0]; + ld.param.u32 %r3, [__shfl_i32_param_1]; + // inline asm + shfl.idx.b32 %r1, %r2, %r3, 0x1f; + // inline asm + st.param.b32 [func_retval0+0], %r1; + ret; +} + +.visible .func (.param .b32 func_retval0) __shfl_xor_float( + .param .b32 __shfl_xor_float_param_0, + .param .b32 __shfl_xor_float_param_1 +) +{ + .reg .s32 %r<2>; + .reg .f32 %f<3>; + + + ld.param.f32 %f2, [__shfl_xor_float_param_0]; + ld.param.u32 %r1, [__shfl_xor_float_param_1]; + // inline asm + shfl.bfly.b32 %f1, %f2, %r1, 0x1f; + // inline asm + st.param.f32 [func_retval0+0], %f1; + ret; +} + +.visible .func (.param .b32 func_retval0) __shfl_xor_i32( + .param .b32 __shfl_xor_i32_param_0, + .param .b32 __shfl_xor_i32_param_1 +) +{ + .reg .s32 %r<4>; + + + ld.param.u32 %r2, [__shfl_xor_i32_param_0]; + ld.param.u32 %r3, [__shfl_xor_i32_param_1]; + // inline asm + shfl.bfly.b32 %r1, %r2, %r3, 0x1f; + // inline asm + st.param.b32 [func_retval0+0], %r1; + ret; +} + +.visible .func (.param .b32 func_retval0) __fminf( + .param .b32 __fminf_param_0, + .param .b32 __fminf_param_1 +) +{ + .reg .f32 %f<4>; + + + ld.param.f32 %f2, [__fminf_param_0]; + ld.param.f32 %f3, [__fminf_param_1]; + // inline asm + min.f32 %f1, %f2, %f3; + // inline asm + st.param.f32 [func_retval0+0], %f1; + ret; +} + +.visible .func (.param .b32 func_retval0) __fmaxf( + .param .b32 __fmaxf_param_0, + .param .b32 __fmaxf_param_1 +) +{ + .reg .f32 %f<4>; + + + ld.param.f32 %f2, [__fmaxf_param_0]; + ld.param.f32 %f3, [__fmaxf_param_1]; + // inline asm + max.f32 %f1, %f2, %f3; + // inline asm + st.param.f32 
[func_retval0+0], %f1; + ret; +} + +.visible .func (.param .b32 func_retval0) __ballot( + .param .b32 __ballot_param_0 +) +{ + .reg .s32 %r<3>; + + + ld.param.u8 %r2, [__ballot_param_0]; + // inline asm + { .reg .pred %p1; + setp.ne.u32 %p1, %r2, 0; + vote.ballot.b32 %r1, %p1; + } + // inline asm + st.param.b32 [func_retval0+0], %r1; + ret; +} + +.visible .func (.param .b32 func_retval0) __lanemask_lt( + +) +{ + .reg .s32 %r<2>; + + + // inline asm + mov.u32 %r1, %lanemask_lt; + // inline asm + st.param.b32 [func_retval0+0], %r1; + ret; +} + +.visible .func (.param .b64 func_retval0) ISPCAlloc( + .param .b64 ISPCAlloc_param_0, + .param .b64 ISPCAlloc_param_1, + .param .b32 ISPCAlloc_param_2 +) +{ + .reg .s64 %rd<2>; + + + mov.u64 %rd1, 1; + st.param.b64 [func_retval0+0], %rd1; + ret; +} + +.visible .func (.param .b64 func_retval0) ISPCGetParamBuffer( + .param .b64 ISPCGetParamBuffer_param_0, + .param .b64 ISPCGetParamBuffer_param_1, + .param .b64 ISPCGetParamBuffer_param_2 +) +{ + .reg .pred %p<2>; + .reg .s32 %r<3>; + .reg .s64 %rd<7>; + + + ld.param.u64 %rd3, [ISPCGetParamBuffer_param_1]; + ld.param.u64 %rd4, [ISPCGetParamBuffer_param_2]; + mov.u32 %r1, %tid.x; + and.b32 %r2, %r1, 31; + setp.ne.s32 %p1, %r2, 0; + mov.u64 %rd6, 0; + @%p1 bra BB8_2; + + // Callseq Start 0 + { + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd3; + .param .b64 param1; + st.param.b64 [param1+0], %rd4; + .param .b64 retval0; + call.uni (retval0), + cudaGetParameterBuffer, + ( + param0, + param1 + ); + ld.param.b64 %rd6, [retval0+0]; + } + // Callseq End 0 + +BB8_2: + st.param.b64 [func_retval0+0], %rd6; + ret; +} + +.visible .func ISPCLaunch( + .param .b64 ISPCLaunch_param_0, + .param .b64 ISPCLaunch_param_1, + .param .b64 ISPCLaunch_param_2, + .param .b32 ISPCLaunch_param_3, + .param .b32 ISPCLaunch_param_4, + .param .b32 ISPCLaunch_param_5 +) +{ + .reg .pred %p<2>; + .reg .s32 %r<16>; + .reg .s64 %rd<6>; + + + ld.param.u64 %rd1, [ISPCLaunch_param_1]; + ld.param.u64 %rd2, [ISPCLaunch_param_2]; + ld.param.u32 %r1, [ISPCLaunch_param_3]; + ld.param.u32 %r2, [ISPCLaunch_param_4]; + ld.param.u32 %r3, [ISPCLaunch_param_5]; + mov.u32 %r4, %tid.x; + and.b32 %r5, %r4, 31; + setp.ne.s32 %p1, %r5, 0; + @%p1 bra BB9_2; + + add.s32 %r14, %r1, -1; + shr.s32 %r15, %r14, 2; + add.s32 %r7, %r15, 1; + mov.u32 %r12, 1; + mov.u32 %r10, 128; + mov.u32 %r13, 0; + mov.u64 %rd5, 0; + // inline asm + { + .param .b64 param0; + st.param.b64 [param0+0], %rd1; + .param .b64 param1; + st.param.b64 [param1+0], %rd2; + .param .align 4 .b8 param2[12]; + st.param.b32 [param2+0], %r7; + st.param.b32 [param2+4], %r2; + st.param.b32 [param2+8], %r3; + .param .align 4 .b8 param3[12]; + st.param.b32 [param3+0], %r10; + st.param.b32 [param3+4], %r12; + st.param.b32 [param3+8], %r12; + .param .b32 param4; + st.param.b32 [param4+0], %r13; + .param .b64 param5; + st.param.b64 [param5+0], %rd5; + + .param .b32 retval0; + call.uni (retval0), + cudaLaunchDevice, + ( + param0, + param1, + param2, + param3, + param4, + param5 + ); + ld.param.b32 %r6, [retval0+0]; + } + + // inline asm + +BB9_2: + ret; +} + +.visible .func ISPCSync( + .param .b64 ISPCSync_param_0 +) +{ + .reg .s32 %r<2>; + + + // Callseq Start 1 + { + .reg .b32 temp_param_reg; + .param .b32 retval0; + call.uni (retval0), + cudaDeviceSynchronize, + ( + ); + ld.param.b32 %r1, [retval0+0]; + } + // Callseq End 1 + ret; +} + +.visible .func (.param .b64 func_retval0) __warpBinExclusiveScan( + .param .b32 __warpBinExclusiveScan_param_0 +) +{ + .reg .s32 %r<8>; + 
.reg .s64 %rd<5>; + + + ld.param.u8 %r2, [__warpBinExclusiveScan_param_0]; + // inline asm + { .reg .pred %p1; + setp.ne.u32 %p1, %r2, 0; + vote.ballot.b32 %r1, %p1; + } + // inline asm + // inline asm + popc.b32 %r3, %r1; + // inline asm + // inline asm + mov.u32 %r5, %lanemask_lt; + // inline asm + and.b32 %r7, %r5, %r1; + // inline asm + popc.b32 %r6, %r7; + // inline asm + cvt.u64.u32 %rd1, %r6; + shl.b64 %rd2, %rd1, 32; + cvt.u64.u32 %rd3, %r3; + or.b64 %rd4, %rd2, %rd3; + st.param.b64 [func_retval0+0], %rd4; + ret; +} + +.entry stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_( + .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_0, + .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_1, + .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_2, + .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_3, + .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_4, + .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_5, + .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_6, + .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_7, + .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_8, + .param .u64 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_9, + .param .u64 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_10, + .param .u64 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_11, + .param .u64 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_12 +) +{ + .reg .pred %p<14>; + .reg .s32 %r<178>; + .reg .s64 %rd<96>; + .reg .f64 %fd<95>; + + + ld.param.u32 %r42, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_0]; + ld.param.u32 %r43, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_1]; + ld.param.u32 %r44, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_2]; + ld.param.u32 %r45, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_3]; + ld.param.u32 %r46, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_4]; + ld.param.u32 %r47, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_5]; + ld.param.u32 %r48, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_6]; + ld.param.u32 %r49, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_7]; + ld.param.u64 %rd2, 
[stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_9]; + ld.param.u64 %rd3, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_10]; + ld.param.u64 %rd4, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_11]; + ld.param.u64 %rd5, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_12]; + mov.u32 %r1, %ctaid.x; + shl.b32 %r50, %r1, 2; + mov.u32 %r2, %tid.x; + shr.s32 %r51, %r2, 5; + add.s32 %r52, %r51, %r50; + mov.u32 %r53, %nctaid.x; + shl.b32 %r54, %r53, 2; + setp.ge.s32 %p1, %r52, %r54; + mov.u32 %r55, %nctaid.y; + mov.u32 %r3, %ctaid.y; + setp.ge.s32 %p2, %r3, %r55; + or.pred %p3, %p1, %p2; + mov.u32 %r56, %nctaid.z; + mov.u32 %r4, %ctaid.z; + setp.ge.s32 %p4, %r4, %r56; + or.pred %p5, %p3, %p4; + @%p5 bra BB12_13; + + shl.b32 %r57, %r1, 7; + add.s32 %r58, %r2, %r57; + and.b32 %r59, %r58, -32; + add.s32 %r60, %r59, %r42; + add.s32 %r61, %r60, 32; + min.s32 %r5, %r43, %r61; + shl.b32 %r6, %r3, 3; + add.s32 %r62, %r6, %r44; + add.s32 %r7, %r62, 8; + shl.b32 %r8, %r4, 3; + add.s32 %r172, %r8, %r46; + add.s32 %r63, %r172, 8; + min.s32 %r64, %r47, %r63; + mul.lo.s32 %r10, %r49, %r48; + sub.s32 %r65, %r5, %r60; + shr.s32 %r66, %r65, 31; + shr.u32 %r67, %r66, 27; + add.s32 %r68, %r65, %r67; + and.b32 %r69, %r68, -32; + sub.s32 %r70, %r65, %r69; + sub.s32 %r11, %r5, %r70; + and.b32 %r71, %r2, 31; + cvt.u64.u32 %rd6, %r71; + mov.u64 %rd7, constDeltaForeach1; + add.s64 %rd1, %rd7, %rd6; + setp.ge.s32 %p6, %r172, %r64; + @%p6 bra BB12_13; + + min.s32 %r12, %r45, %r7; + shl.b32 %r15, %r10, 1; + neg.s32 %r16, %r15; + mul.lo.s32 %r17, %r10, 3; + mul.lo.s32 %r18, %r10, -3; + mov.u32 %r72, -9; + sub.s32 %r73, %r72, %r44; + sub.s32 %r74, %r73, %r6; + not.b32 %r75, %r45; + max.s32 %r76, %r74, %r75; + not.b32 %r19, %r76; + sub.s32 %r77, %r72, %r46; + sub.s32 %r78, %r77, %r8; + not.b32 %r79, %r47; + max.s32 %r80, %r78, %r79; + not.b32 %r20, %r80; + ld.global.u8 %r13, [%rd1]; + mov.u32 %r171, %r172; + +BB12_3: + mov.u32 %r21, %r171; + add.s32 %r23, %r21, %r13; + setp.ge.s32 %p7, %r62, %r12; + @%p7 bra BB12_12; + + mul.lo.s32 %r24, %r23, %r10; + mov.u32 %r174, %r62; + mov.u32 %r173, %r62; + +BB12_5: + mov.u32 %r27, %r173; + add.s32 %r30, %r27, %r13; + setp.ge.s32 %p8, %r60, %r11; + mov.u32 %r176, %r60; + @%p8 bra BB12_8; + + mov.u64 %rd9, constDeltaForeach4; + add.s64 %rd10, %rd9, %rd6; + ld.global.u8 %r31, [%rd10]; + mad.lo.s32 %r32, %r30, %r48, %r24; + add.s32 %r177, %r59, %r42; + +BB12_7: + cvta.to.global.u64 %rd11, %rd2; + add.s32 %r98, %r32, %r177; + add.s32 %r99, %r98, %r31; + shl.b32 %r100, %r99, 3; + cvt.s64.s32 %rd12, %r100; + add.s64 %rd13, %rd12, %rd4; + add.s32 %r101, %r100, 8; + cvt.s64.s32 %rd14, %r101; + add.s64 %rd15, %rd14, %rd4; + add.s32 %r102, %r100, -8; + cvt.s64.s32 %rd16, %r102; + add.s64 %rd17, %rd16, %rd4; + add.s32 %r103, %r99, %r48; + shl.b32 %r104, %r103, 3; + cvt.s64.s32 %rd18, %r104; + add.s64 %rd19, %rd18, %rd4; + sub.s32 %r105, %r99, %r48; + shl.b32 %r106, %r105, 3; + cvt.s64.s32 %rd20, %r106; + add.s64 %rd21, %rd20, %rd4; + add.s32 %r108, %r99, %r10; + shl.b32 %r109, %r108, 3; + cvt.s64.s32 %rd22, %r109; + add.s64 %rd23, %rd22, %rd4; + sub.s32 %r110, %r99, %r10; + shl.b32 %r111, %r110, 3; + cvt.s64.s32 %rd24, %r111; + add.s64 %rd25, %rd24, %rd4; + add.s32 %r112, %r100, 16; + cvt.s64.s32 %rd26, %r112; + add.s64 %rd27, %rd26, %rd4; + 
add.s32 %r113, %r100, -16; + cvt.s64.s32 %rd28, %r113; + add.s64 %rd29, %rd28, %rd4; + shl.b32 %r114, %r48, 1; + add.s32 %r115, %r99, %r114; + shl.b32 %r116, %r115, 3; + cvt.s64.s32 %rd30, %r116; + add.s64 %rd31, %rd30, %rd4; + mad.lo.s32 %r117, %r48, -2, %r99; + shl.b32 %r118, %r117, 3; + cvt.s64.s32 %rd32, %r118; + add.s64 %rd33, %rd32, %rd4; + add.s32 %r119, %r99, %r15; + shl.b32 %r120, %r119, 3; + cvt.s64.s32 %rd34, %r120; + add.s64 %rd35, %rd34, %rd4; + add.s32 %r121, %r99, %r16; + shl.b32 %r122, %r121, 3; + cvt.s64.s32 %rd36, %r122; + add.s64 %rd37, %rd36, %rd4; + add.s32 %r123, %r100, 24; + cvt.s64.s32 %rd38, %r123; + add.s64 %rd39, %rd38, %rd4; + add.s32 %r124, %r100, -24; + cvt.s64.s32 %rd40, %r124; + add.s64 %rd41, %rd40, %rd4; + mad.lo.s32 %r125, %r48, 3, %r99; + shl.b32 %r126, %r125, 3; + cvt.s64.s32 %rd42, %r126; + add.s64 %rd43, %rd42, %rd4; + mad.lo.s32 %r127, %r48, -3, %r99; + shl.b32 %r128, %r127, 3; + cvt.s64.s32 %rd44, %r128; + add.s64 %rd45, %rd44, %rd4; + add.s32 %r129, %r99, %r17; + shl.b32 %r130, %r129, 3; + cvt.s64.s32 %rd46, %r130; + add.s64 %rd47, %rd46, %rd4; + add.s32 %r131, %r99, %r18; + shl.b32 %r132, %r131, 3; + cvt.s64.s32 %rd48, %r132; + add.s64 %rd49, %rd48, %rd4; + add.s64 %rd50, %rd12, %rd5; + add.s64 %rd51, %rd12, %rd3; + ld.f64 %fd1, [%rd13]; + add.f64 %fd2, %fd1, %fd1; + ld.f64 %fd3, [%rd50]; + sub.f64 %fd4, %fd2, %fd3; + ld.global.f64 %fd5, [%rd11]; + ld.f64 %fd6, [%rd17]; + ld.f64 %fd7, [%rd15]; + add.f64 %fd8, %fd7, %fd6; + ld.f64 %fd9, [%rd19]; + add.f64 %fd10, %fd8, %fd9; + ld.f64 %fd11, [%rd21]; + add.f64 %fd12, %fd10, %fd11; + ld.f64 %fd13, [%rd23]; + add.f64 %fd14, %fd12, %fd13; + ld.f64 %fd15, [%rd25]; + add.f64 %fd16, %fd14, %fd15; + ld.global.f64 %fd17, [%rd11+8]; + mul.f64 %fd18, %fd17, %fd16; + fma.rn.f64 %fd19, %fd5, %fd1, %fd18; + ld.f64 %fd20, [%rd29]; + ld.f64 %fd21, [%rd27]; + add.f64 %fd22, %fd21, %fd20; + ld.f64 %fd23, [%rd31]; + add.f64 %fd24, %fd22, %fd23; + ld.f64 %fd25, [%rd33]; + add.f64 %fd26, %fd24, %fd25; + ld.f64 %fd27, [%rd35]; + add.f64 %fd28, %fd26, %fd27; + ld.f64 %fd29, [%rd37]; + add.f64 %fd30, %fd28, %fd29; + ld.global.f64 %fd31, [%rd11+16]; + fma.rn.f64 %fd32, %fd31, %fd30, %fd19; + ld.f64 %fd33, [%rd41]; + ld.f64 %fd34, [%rd39]; + add.f64 %fd35, %fd34, %fd33; + ld.f64 %fd36, [%rd43]; + add.f64 %fd37, %fd35, %fd36; + ld.f64 %fd38, [%rd45]; + add.f64 %fd39, %fd37, %fd38; + ld.f64 %fd40, [%rd47]; + add.f64 %fd41, %fd39, %fd40; + ld.f64 %fd42, [%rd49]; + add.f64 %fd43, %fd41, %fd42; + ld.global.f64 %fd44, [%rd11+24]; + fma.rn.f64 %fd45, %fd44, %fd43, %fd32; + ld.f64 %fd46, [%rd51]; + fma.rn.f64 %fd47, %fd46, %fd45, %fd4; + st.f64 [%rd50], %fd47; + add.s32 %r177, %r177, 32; + setp.lt.s32 %p9, %r177, %r11; + mov.u32 %r175, %r177; + mov.u32 %r176, %r175; + @%p9 bra BB12_7; + +BB12_8: + mov.u32 %r36, %r176; + setp.ge.s32 %p10, %r36, %r5; + @%p10 bra BB12_11; + + mov.u64 %rd53, constDeltaForeach4; + add.s64 %rd54, %rd53, %rd6; + ld.global.u8 %r135, [%rd54]; + add.s32 %r37, %r36, %r135; + setp.ge.s32 %p11, %r37, %r5; + @%p11 bra BB12_11; + + cvta.to.global.u64 %rd55, %rd2; + mad.lo.s32 %r136, %r30, %r48, %r24; + add.s32 %r137, %r136, %r37; + shl.b32 %r138, %r137, 3; + cvt.s64.s32 %rd56, %r138; + add.s64 %rd57, %rd56, %rd4; + add.s32 %r139, %r138, 8; + cvt.s64.s32 %rd58, %r139; + add.s64 %rd59, %rd58, %rd4; + add.s32 %r140, %r138, -8; + cvt.s64.s32 %rd60, %r140; + add.s64 %rd61, %rd60, %rd4; + add.s32 %r141, %r137, %r48; + shl.b32 %r142, %r141, 3; + cvt.s64.s32 %rd62, %r142; + add.s64 %rd63, %rd62, %rd4; + sub.s32 %r143, 
%r137, %r48; + shl.b32 %r144, %r143, 3; + cvt.s64.s32 %rd64, %r144; + add.s64 %rd65, %rd64, %rd4; + add.s32 %r146, %r137, %r10; + shl.b32 %r147, %r146, 3; + cvt.s64.s32 %rd66, %r147; + add.s64 %rd67, %rd66, %rd4; + sub.s32 %r148, %r137, %r10; + shl.b32 %r149, %r148, 3; + cvt.s64.s32 %rd68, %r149; + add.s64 %rd69, %rd68, %rd4; + add.s32 %r150, %r138, 16; + cvt.s64.s32 %rd70, %r150; + add.s64 %rd71, %rd70, %rd4; + add.s32 %r151, %r138, -16; + cvt.s64.s32 %rd72, %r151; + add.s64 %rd73, %rd72, %rd4; + shl.b32 %r152, %r48, 1; + add.s32 %r153, %r137, %r152; + shl.b32 %r154, %r153, 3; + cvt.s64.s32 %rd74, %r154; + add.s64 %rd75, %rd74, %rd4; + mad.lo.s32 %r155, %r48, -2, %r137; + shl.b32 %r156, %r155, 3; + cvt.s64.s32 %rd76, %r156; + add.s64 %rd77, %rd76, %rd4; + add.s32 %r157, %r137, %r15; + shl.b32 %r158, %r157, 3; + cvt.s64.s32 %rd78, %r158; + add.s64 %rd79, %rd78, %rd4; + add.s32 %r159, %r137, %r16; + shl.b32 %r160, %r159, 3; + cvt.s64.s32 %rd80, %r160; + add.s64 %rd81, %rd80, %rd4; + add.s32 %r161, %r138, 24; + cvt.s64.s32 %rd82, %r161; + add.s64 %rd83, %rd82, %rd4; + add.s32 %r162, %r138, -24; + cvt.s64.s32 %rd84, %r162; + add.s64 %rd85, %rd84, %rd4; + mad.lo.s32 %r163, %r48, 3, %r137; + shl.b32 %r164, %r163, 3; + cvt.s64.s32 %rd86, %r164; + add.s64 %rd87, %rd86, %rd4; + mad.lo.s32 %r165, %r48, -3, %r137; + shl.b32 %r166, %r165, 3; + cvt.s64.s32 %rd88, %r166; + add.s64 %rd89, %rd88, %rd4; + add.s32 %r167, %r137, %r17; + shl.b32 %r168, %r167, 3; + cvt.s64.s32 %rd90, %r168; + add.s64 %rd91, %rd90, %rd4; + add.s32 %r169, %r137, %r18; + shl.b32 %r170, %r169, 3; + cvt.s64.s32 %rd92, %r170; + add.s64 %rd93, %rd92, %rd4; + add.s64 %rd94, %rd56, %rd5; + add.s64 %rd95, %rd56, %rd3; + ld.f64 %fd48, [%rd57]; + add.f64 %fd49, %fd48, %fd48; + ld.f64 %fd50, [%rd94]; + sub.f64 %fd51, %fd49, %fd50; + ld.global.f64 %fd52, [%rd55]; + ld.f64 %fd53, [%rd61]; + ld.f64 %fd54, [%rd59]; + add.f64 %fd55, %fd54, %fd53; + ld.f64 %fd56, [%rd63]; + add.f64 %fd57, %fd55, %fd56; + ld.f64 %fd58, [%rd65]; + add.f64 %fd59, %fd57, %fd58; + ld.f64 %fd60, [%rd67]; + add.f64 %fd61, %fd59, %fd60; + ld.f64 %fd62, [%rd69]; + add.f64 %fd63, %fd61, %fd62; + ld.global.f64 %fd64, [%rd55+8]; + mul.f64 %fd65, %fd64, %fd63; + fma.rn.f64 %fd66, %fd52, %fd48, %fd65; + ld.f64 %fd67, [%rd73]; + ld.f64 %fd68, [%rd71]; + add.f64 %fd69, %fd68, %fd67; + ld.f64 %fd70, [%rd75]; + add.f64 %fd71, %fd69, %fd70; + ld.f64 %fd72, [%rd77]; + add.f64 %fd73, %fd71, %fd72; + ld.f64 %fd74, [%rd79]; + add.f64 %fd75, %fd73, %fd74; + ld.f64 %fd76, [%rd81]; + add.f64 %fd77, %fd75, %fd76; + ld.global.f64 %fd78, [%rd55+16]; + fma.rn.f64 %fd79, %fd78, %fd77, %fd66; + ld.f64 %fd80, [%rd85]; + ld.f64 %fd81, [%rd83]; + add.f64 %fd82, %fd81, %fd80; + ld.f64 %fd83, [%rd87]; + add.f64 %fd84, %fd82, %fd83; + ld.f64 %fd85, [%rd89]; + add.f64 %fd86, %fd84, %fd85; + ld.f64 %fd87, [%rd91]; + add.f64 %fd88, %fd86, %fd87; + ld.f64 %fd89, [%rd93]; + add.f64 %fd90, %fd88, %fd89; + ld.global.f64 %fd91, [%rd55+24]; + fma.rn.f64 %fd92, %fd91, %fd90, %fd79; + ld.f64 %fd93, [%rd95]; + fma.rn.f64 %fd94, %fd92, %fd93, %fd51; + st.f64 [%rd94], %fd94; + +BB12_11: + add.s32 %r39, %r174, 1; + setp.ne.s32 %p12, %r39, %r19; + mov.u32 %r174, %r39; + mov.u32 %r173, %r39; + @%p12 bra BB12_5; + +BB12_12: + add.s32 %r171, %r172, 1; + setp.ne.s32 %p13, %r171, %r20; + mov.u32 %r172, %r171; + @%p13 bra BB12_3; + +BB12_13: + ret; +} + +.visible .func loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E_( + .param .b32 
loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_0, + .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_1, + .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_2, + .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_3, + .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_4, + .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_5, + .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_6, + .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_7, + .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_8, + .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_9, + .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_10, + .param .b64 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_11, + .param .b64 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_12, + .param .b64 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_13, + .param .b64 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_14, + .param .align 1 .b8 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_15[1] +) +{ + .reg .pred %p<9>; + .reg .s32 %r<63>; + .reg .s64 %rd<18>; + + + ld.param.u32 %r62, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_0]; + ld.param.u32 %r12, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_1]; + ld.param.u32 %r13, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_2]; + ld.param.u32 %r14, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_3]; + ld.param.u32 %r15, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_4]; + ld.param.u32 %r16, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_5]; + ld.param.u32 %r17, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_6]; + ld.param.u32 %r18, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_7]; + ld.param.u32 %r19, 
[loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_8]; + ld.param.u32 %r20, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_9]; + ld.param.u32 %r21, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_10]; + ld.param.u64 %rd4, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_11]; + ld.param.u64 %rd5, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_12]; + ld.param.u64 %rd6, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_13]; + ld.param.u64 %rd7, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_14]; + setp.ge.s32 %p1, %r62, %r12; + @%p1 bra BB13_14; + + mov.u32 %r22, 31; + sub.s32 %r23, %r22, %r13; + add.s32 %r24, %r23, %r14; + shr.s32 %r25, %r24, 31; + shr.u32 %r26, %r25, 27; + add.s32 %r27, %r24, %r26; + shr.s32 %r28, %r27, 5; + mov.u32 %r29, 7; + sub.s32 %r30, %r29, %r15; + add.s32 %r31, %r30, %r16; + shr.s32 %r32, %r31, 31; + shr.u32 %r33, %r32, 29; + add.s32 %r34, %r31, %r33; + shr.s32 %r1, %r34, 3; + sub.s32 %r35, %r29, %r17; + add.s32 %r36, %r35, %r18; + shr.s32 %r37, %r36, 31; + shr.u32 %r38, %r37, 29; + add.s32 %r39, %r36, %r38; + shr.s32 %r2, %r39, 3; + add.s32 %r40, %r28, -1; + shr.s32 %r41, %r40, 2; + add.s32 %r3, %r41, 1; + mov.u32 %r42, %tid.x; + and.b32 %r4, %r42, 31; + sub.s32 %r61, %r62, %r12; + +BB13_2: + and.b32 %r8, %r62, 1; + setp.ne.s32 %p2, %r4, 0; + mov.u64 %rd17, 0; + @%p2 bra BB13_4; + + mov.u64 %rd9, 8; + mov.u64 %rd10, 72; + // Callseq Start 2 + { + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd9; + .param .b64 param1; + st.param.b64 [param1+0], %rd10; + .param .b64 retval0; + call.uni (retval0), + cudaGetParameterBuffer, + ( + param0, + param1 + ); + ld.param.b64 %rd17, [retval0+0]; + } + // Callseq End 2 + +BB13_4: + setp.eq.s32 %p3, %r8, 0; + @%p3 bra BB13_9; + + setp.eq.s64 %p4, %rd17, 0; + @%p4 bra BB13_7; + + st.u32 [%rd17], %r13; + st.u32 [%rd17+4], %r14; + st.u32 [%rd17+8], %r15; + st.u32 [%rd17+12], %r16; + st.u32 [%rd17+16], %r17; + st.u32 [%rd17+20], %r18; + st.u32 [%rd17+24], %r19; + st.u32 [%rd17+28], %r20; + st.u32 [%rd17+32], %r21; + st.u64 [%rd17+40], %rd4; + st.u64 [%rd17+48], %rd5; + st.u64 [%rd17+56], %rd7; + st.u64 [%rd17+64], %rd6; + +BB13_7: + @%p2 bra BB13_13; + + mov.u32 %r47, 128; + mov.u32 %r49, 1; + mov.u32 %r50, 0; + mov.u64 %rd13, 0; + mov.u64 %rd11, stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_; + // inline asm + { + .param .b64 param0; + st.param.b64 [param0+0], %rd11; + .param .b64 param1; + st.param.b64 [param1+0], %rd17; + .param .align 4 .b8 param2[12]; + st.param.b32 [param2+0], %r3; + st.param.b32 [param2+4], %r1; + st.param.b32 [param2+8], %r2; + .param .align 4 .b8 param3[12]; + st.param.b32 [param3+0], %r47; + st.param.b32 [param3+4], %r49; + st.param.b32 [param3+8], %r49; + .param .b32 param4; + st.param.b32 [param4+0], %r50; + .param .b64 param5; + st.param.b64 [param5+0], %rd13; + + .param .b32 retval0; + call.uni (retval0), + cudaLaunchDevice, + ( + param0, + param1, + param2, + param3, + param4, + param5 + ); + ld.param.b32 %r43, 
[retval0+0]; + } + + // inline asm + bra.uni BB13_13; + +BB13_9: + setp.eq.s64 %p6, %rd17, 0; + @%p6 bra BB13_11; + + st.u32 [%rd17], %r13; + st.u32 [%rd17+4], %r14; + st.u32 [%rd17+8], %r15; + st.u32 [%rd17+12], %r16; + st.u32 [%rd17+16], %r17; + st.u32 [%rd17+20], %r18; + st.u32 [%rd17+24], %r19; + st.u32 [%rd17+28], %r20; + st.u32 [%rd17+32], %r21; + st.u64 [%rd17+40], %rd4; + st.u64 [%rd17+48], %rd5; + st.u64 [%rd17+56], %rd6; + st.u64 [%rd17+64], %rd7; + +BB13_11: + @%p2 bra BB13_13; + + mov.u32 %r55, 128; + mov.u32 %r57, 1; + mov.u32 %r58, 0; + mov.u64 %rd16, 0; + mov.u64 %rd14, stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_; + // inline asm + { + .param .b64 param0; + st.param.b64 [param0+0], %rd14; + .param .b64 param1; + st.param.b64 [param1+0], %rd17; + .param .align 4 .b8 param2[12]; + st.param.b32 [param2+0], %r3; + st.param.b32 [param2+4], %r1; + st.param.b32 [param2+8], %r2; + .param .align 4 .b8 param3[12]; + st.param.b32 [param3+0], %r55; + st.param.b32 [param3+4], %r57; + st.param.b32 [param3+8], %r57; + .param .b32 param4; + st.param.b32 [param4+0], %r58; + .param .b64 param5; + st.param.b64 [param5+0], %rd16; + + .param .b32 retval0; + call.uni (retval0), + cudaLaunchDevice, + ( + param0, + param1, + param2, + param3, + param4, + param5 + ); + ld.param.b32 %r51, [retval0+0]; + } + + // inline asm + +BB13_13: + // Callseq Start 3 + { + .reg .b32 temp_param_reg; + .param .b32 retval0; + call.uni (retval0), + cudaDeviceSynchronize, + ( + ); + ld.param.b32 %r59, [retval0+0]; + } + // Callseq End 3 + add.s32 %r62, %r62, 1; + add.s32 %r61, %r61, 1; + setp.ne.s32 %p8, %r61, 0; + @%p8 bra BB13_2; + +BB13_14: + // Callseq Start 4 + { + .reg .b32 temp_param_reg; + .param .b32 retval0; + call.uni (retval0), + cudaDeviceSynchronize, + ( + ); + ld.param.b32 %r60, [retval0+0]; + } + // Callseq End 4 + ret; +} + +.visible .entry loop_stencil_ispc_tasks( + .param .u32 loop_stencil_ispc_tasks_param_0, + .param .u32 loop_stencil_ispc_tasks_param_1, + .param .u32 loop_stencil_ispc_tasks_param_2, + .param .u32 loop_stencil_ispc_tasks_param_3, + .param .u32 loop_stencil_ispc_tasks_param_4, + .param .u32 loop_stencil_ispc_tasks_param_5, + .param .u32 loop_stencil_ispc_tasks_param_6, + .param .u32 loop_stencil_ispc_tasks_param_7, + .param .u32 loop_stencil_ispc_tasks_param_8, + .param .u32 loop_stencil_ispc_tasks_param_9, + .param .u32 loop_stencil_ispc_tasks_param_10, + .param .u64 loop_stencil_ispc_tasks_param_11, + .param .u64 loop_stencil_ispc_tasks_param_12, + .param .u64 loop_stencil_ispc_tasks_param_13, + .param .u64 loop_stencil_ispc_tasks_param_14 +) +{ + .reg .pred %p<9>; + .reg .s32 %r<63>; + .reg .s64 %rd<18>; + + + ld.param.u32 %r62, [loop_stencil_ispc_tasks_param_0]; + ld.param.u32 %r12, [loop_stencil_ispc_tasks_param_1]; + ld.param.u32 %r13, [loop_stencil_ispc_tasks_param_2]; + ld.param.u32 %r14, [loop_stencil_ispc_tasks_param_3]; + ld.param.u32 %r15, [loop_stencil_ispc_tasks_param_4]; + ld.param.u32 %r16, [loop_stencil_ispc_tasks_param_5]; + ld.param.u32 %r17, [loop_stencil_ispc_tasks_param_6]; + ld.param.u32 %r18, [loop_stencil_ispc_tasks_param_7]; + ld.param.u32 %r19, [loop_stencil_ispc_tasks_param_8]; + ld.param.u32 %r20, [loop_stencil_ispc_tasks_param_9]; + ld.param.u32 %r21, [loop_stencil_ispc_tasks_param_10]; + ld.param.u64 %rd4, [loop_stencil_ispc_tasks_param_11]; + ld.param.u64 %rd5, [loop_stencil_ispc_tasks_param_12]; + ld.param.u64 %rd6, [loop_stencil_ispc_tasks_param_13]; + ld.param.u64 %rd7, 
[loop_stencil_ispc_tasks_param_14]; + setp.ge.s32 %p1, %r62, %r12; + @%p1 bra BB14_14; + + mov.u32 %r22, 31; + sub.s32 %r23, %r22, %r13; + add.s32 %r24, %r23, %r14; + shr.s32 %r25, %r24, 31; + shr.u32 %r26, %r25, 27; + add.s32 %r27, %r24, %r26; + shr.s32 %r28, %r27, 5; + mov.u32 %r29, 7; + sub.s32 %r30, %r29, %r15; + add.s32 %r31, %r30, %r16; + shr.s32 %r32, %r31, 31; + shr.u32 %r33, %r32, 29; + add.s32 %r34, %r31, %r33; + shr.s32 %r1, %r34, 3; + sub.s32 %r35, %r29, %r17; + add.s32 %r36, %r35, %r18; + shr.s32 %r37, %r36, 31; + shr.u32 %r38, %r37, 29; + add.s32 %r39, %r36, %r38; + shr.s32 %r2, %r39, 3; + add.s32 %r40, %r28, -1; + shr.s32 %r41, %r40, 2; + add.s32 %r3, %r41, 1; + mov.u32 %r42, %tid.x; + and.b32 %r4, %r42, 31; + sub.s32 %r61, %r62, %r12; + +BB14_2: + and.b32 %r8, %r62, 1; + setp.ne.s32 %p2, %r4, 0; + mov.u64 %rd17, 0; + @%p2 bra BB14_4; + + mov.u64 %rd9, 8; + mov.u64 %rd10, 72; + // Callseq Start 5 + { + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd9; + .param .b64 param1; + st.param.b64 [param1+0], %rd10; + .param .b64 retval0; + call.uni (retval0), + cudaGetParameterBuffer, + ( + param0, + param1 + ); + ld.param.b64 %rd17, [retval0+0]; + } + // Callseq End 5 + +BB14_4: + setp.eq.s32 %p3, %r8, 0; + @%p3 bra BB14_9; + + setp.eq.s64 %p4, %rd17, 0; + @%p4 bra BB14_7; + + st.u32 [%rd17], %r13; + st.u32 [%rd17+4], %r14; + st.u32 [%rd17+8], %r15; + st.u32 [%rd17+12], %r16; + st.u32 [%rd17+16], %r17; + st.u32 [%rd17+20], %r18; + st.u32 [%rd17+24], %r19; + st.u32 [%rd17+28], %r20; + st.u32 [%rd17+32], %r21; + st.u64 [%rd17+40], %rd4; + st.u64 [%rd17+48], %rd5; + st.u64 [%rd17+56], %rd7; + st.u64 [%rd17+64], %rd6; + +BB14_7: + @%p2 bra BB14_13; + + mov.u32 %r47, 128; + mov.u32 %r49, 1; + mov.u32 %r50, 0; + mov.u64 %rd13, 0; + mov.u64 %rd11, stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_; + // inline asm + { + .param .b64 param0; + st.param.b64 [param0+0], %rd11; + .param .b64 param1; + st.param.b64 [param1+0], %rd17; + .param .align 4 .b8 param2[12]; + st.param.b32 [param2+0], %r3; + st.param.b32 [param2+4], %r1; + st.param.b32 [param2+8], %r2; + .param .align 4 .b8 param3[12]; + st.param.b32 [param3+0], %r47; + st.param.b32 [param3+4], %r49; + st.param.b32 [param3+8], %r49; + .param .b32 param4; + st.param.b32 [param4+0], %r50; + .param .b64 param5; + st.param.b64 [param5+0], %rd13; + + .param .b32 retval0; + call.uni (retval0), + cudaLaunchDevice, + ( + param0, + param1, + param2, + param3, + param4, + param5 + ); + ld.param.b32 %r43, [retval0+0]; + } + + // inline asm + bra.uni BB14_13; + +BB14_9: + setp.eq.s64 %p6, %rd17, 0; + @%p6 bra BB14_11; + + st.u32 [%rd17], %r13; + st.u32 [%rd17+4], %r14; + st.u32 [%rd17+8], %r15; + st.u32 [%rd17+12], %r16; + st.u32 [%rd17+16], %r17; + st.u32 [%rd17+20], %r18; + st.u32 [%rd17+24], %r19; + st.u32 [%rd17+28], %r20; + st.u32 [%rd17+32], %r21; + st.u64 [%rd17+40], %rd4; + st.u64 [%rd17+48], %rd5; + st.u64 [%rd17+56], %rd6; + st.u64 [%rd17+64], %rd7; + +BB14_11: + @%p2 bra BB14_13; + + mov.u32 %r55, 128; + mov.u32 %r57, 1; + mov.u32 %r58, 0; + mov.u64 %rd16, 0; + mov.u64 %rd14, stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_; + // inline asm + { + .param .b64 param0; + st.param.b64 [param0+0], %rd14; + .param .b64 param1; + st.param.b64 [param1+0], %rd17; + .param .align 4 .b8 param2[12]; + st.param.b32 [param2+0], %r3; + st.param.b32 [param2+4], %r1; + st.param.b32 [param2+8], %r2; + .param .align 
4 .b8 param3[12]; + st.param.b32 [param3+0], %r55; + st.param.b32 [param3+4], %r57; + st.param.b32 [param3+8], %r57; + .param .b32 param4; + st.param.b32 [param4+0], %r58; + .param .b64 param5; + st.param.b64 [param5+0], %rd16; + + .param .b32 retval0; + call.uni (retval0), + cudaLaunchDevice, + ( + param0, + param1, + param2, + param3, + param4, + param5 + ); + ld.param.b32 %r51, [retval0+0]; + } + + // inline asm + +BB14_13: + // Callseq Start 6 + { + .reg .b32 temp_param_reg; + .param .b32 retval0; + call.uni (retval0), + cudaDeviceSynchronize, + ( + ); + ld.param.b32 %r59, [retval0+0]; + } + // Callseq End 6 + add.s32 %r62, %r62, 1; + add.s32 %r61, %r61, 1; + setp.ne.s32 %p8, %r61, 0; + @%p8 bra BB14_2; + +BB14_14: + // Callseq Start 7 + { + .reg .b32 temp_param_reg; + .param .b32 retval0; + call.uni (retval0), + cudaDeviceSynchronize, + ( + ); + ld.param.b32 %r60, [retval0+0]; + } + // Callseq End 7 + ret; +} + + + diff --git a/examples_cuda/stencil/stencil_nvptx64.bc b/examples_cuda/stencil/stencil_nvptx64.bc deleted file mode 100644 index b77be1e37199eec8aefe33226d8f35d06caeff37..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 8500 zcma)C30PCdzCS0sA%p~pK-ofRt5TF;0imb_SsGi=h|y{R8xXzreGLX>Q7W3S`-%^= z*5y*4vDJQfuUcx^kxK-mw19%3-~$8%Q7KUt8f+gOE(KO!MOpdiZfnss&8&wSb`sX$ppu;=U?=F<-~n`ZiP=dK_D&reJb>;j3d-4U zm9M-tn?*`#Hj9+fY&I#gSTSVA_}$EUg0TTt0iGoRLJb0Ji)b8wMG*~@J6a3AnBcR- z{y4{47@V6u=baDg82%~H?t6fK0UQwjY$qvAx>0%F-@885=Lp>=Thn#L{?Tc>F0pOb z6}vlH`)aNIjXt}sNc-;@#JWY`_dSlrwzki9n+3W&%ik-lCVQoZy=OOj*YQ~1AY%4d zz#r4_-{Y;cufAe;?X>MS2@{)^YSTzxUhls=o4&kpcXphb{RIGpEK{v!C#mTPyEE0| zOzU4F`49E{d%gU=X8t|N7QaSEyMaji2Yr?uj&@yul=}hiGpWk}ofp+!m+3xj{ywEO zyuipz^>KF6GiHK}owy*=y0MzyCovlvG3g)SKa!h0?&SlXW5odb;FmSB=%cI8Le{(O z3qVp0!N*(!)A~&2YIrZVqKDjt2Q@BL?!xQ7=wXPtNU&xv8{m);;vxWjF8DA4B{Vs6 ztDK!KpTk@$OkZnVyH)B#mxj{l(UK?$&(Erj#gc20qSz%} za?W;1gFte{wkB%hE?%}(c#Y{6F>AY|OUc@*VeR7OSa09OYyPpxEg)KNXQk#lW^3Kk zq^!`>TAT2*sFloHtsc+P@*K}-H>f16&=~MpXDwU1Rb0aQ=05A2OtwqWO|;RRM;Gy8 zFCS2&NXwN4eeJ-r*!fca6WUR(OR4TiieHD-7OzP#8d;XgwaAy6U8MPFOg1_{>rDye z23aQ|mIVqaOYOIixrMRv4qUCLIZM(jG-qiZ8j}wfS~&?OUm6U)h^-ubw~Of_(xc(> z53g8hdm&L&K-bYWl>&MNbhw)vP52ZH(={w9vs4X8>)DWW>hDSm>n>WjK2*Cv&pd09 z{b6bZWWtL{EEIzX({S(fEmodW&VxMHacZxDVKeDMMATLj zp%3+2jFA*_3c#>~W3O5q}8 zGvcf~0A=1y$Q%!beUZC}LOeGS{fIe%SRM(L1c}FhD)uAIXg)F@YWNvOGF$p$8cat@ z>2mEFsG=+eRFOE*N@Vssw(z3eM&}1?DC=6z8W}T4;oFtk=95oX$oX>AOUkT3L>)8> zUqo=$kaPEAMW zDJ2bh9++CaIKWy=7iqc#k($OOwg)1PNGt&_TR487D|9yU>X-mi9U7Pxq=>YXXGBQt z643MxrXti*{$Yf4Q9OFJjoE0*-Jyw0|Cr!G|{7a0Iw_iAyRm> zLv9&5wlY+EJe;|V20*HQ69XyF(c;>f6!$W2Ed%VT55#v}>0G*s=}Hrg%~6Qw!(BaT zIG0gZRy!96^K}XxZ8_X^M2PR2YqBrYFGtF}Koe#C3U<|=!0_porj~rot!Fsri}I!G zA>_Ik5XHHF&vr|T+x;~+>YlSqR0@FWqlM18g)^djSJeCK`VnWSn`C0wk<%YL*U!z@ zMCek~aMuMQd{@N$UMQL`WscEAXBwa@{S)7nab){3?rNrUuVub?Gu-vRm~hu@M}0u0 zqSslD{vu^g?S!8CF0KT0{_kf~zSeT3gFp$Ia9j!1YjT5`OX#9+8lf70AOyO!@dWpi znbT-twZTkZx~LgWLo2|vz_Zzk-hiv=pU4g}*GQQjsHl{oP@Bt9nY1!c9M6zHmzv4_ ze5A}lMD&8D(3r^Q3uSg_n@a&rA?M1qz;6-}5UAQ*p+|}KhHgYsg<%rpCk7{jjv^=v zK{kBLY1}osVrS3g2osl=J7o!{o;Nt&wjT{{F*v2>bd;(xXpQd$8gh6mLWjyW1=i!w zQNu+Jc87oCe=aI`;k)En*1_j3j#Fg2{b|~=bz_G5UZ;r*VB9(m*?=}cYHeQ(e~>wV zwdn8Z4Fsv44L=jpJ^z)rSO2yfLol78Yi9Hm`el{vjgVm=&^%Yfkyw5aax(&Si*eIYs^z;7{5RRyk`kAx zUZSML!xO?ITK|xMv8C6>oSN2zuVcto-1Qa43iQ%|m*!K~BY&d*O-@n+3IVOk7JlOo z;91BGAXFAQR0>AVxe>ga96st@K*(L9@(}tjGVI@MxQozZT31AQnPc6KDlfgJzbt3t zT7u={1Nx`9VY(_mBbl#H4tuQs_%FD)VBEj666_z#BKe7GhbBBk@^hgx!$wc=7<~K=0kQ@XYT%*4(Rv=qYK3aSf9scdKX#ft zm}fx_K94D&_@AsWCHau;>--7ZaIzd)iQG{3Kb>1N?|23L=E|3}Eqmz`=qPLjM(B0A 
zRZC)Yhr>#Rmk{Or>>xCEyA32gkIBHpj)nbXO?q# zJ#m^$4oj5&arA;s6$$ePMNKm>Pz-F|VnV|HTG(Exg&=*daKGQ8hBfZM-$O~)_52iQ z`*t|hg}xKvXO#SAWzNP5eLpbSWHFE&v}9WpxM^eqqv`SW~Rcu7y323 zaF)&^+@Yg5mQ+3Gdft5`m9SQE%t1Pc1c#eOfetq;24xr)J1(mZp+a(L!s8Dzqr1ty zt0xETkU9LD!2!|t_W>xK4EqToklZ$=sz#NBzf-Y{j6czZ^ewN61zXuiQ2#~yCjhWI z_D-LzkBvk%dEG;zp^sN<9@T}5k2S!Ad33K7x-%R#`4o}vi|z6YQ?2l6FEc-c9c z-~v^7dm3uyCMnL1p$#s2LhAUCupcN{#$&z&B`O|F+NL1s8>k#tf9NnIJ;e{69o((| zfdjn==+ki##AG+XT!lfab!^xoJs@~RR0GYF+$2uL^MBjV)1_IA zhd^1#=!Q2)-aMP~E>OIA#{3fv#(9ZqEjfufKq7R26TQ@CWvbt4r1b|XA9LNwFxy^s zn#xl5%Yh(eTnR3SEV+Hcn%poU*(MgTeSK&F(FL?k+epSbJa)Wi40Gzq(wyVx$aif^ zcRZySdQkr3XDR*`z`OmEd#cu1n7cs-UlEmogK-il@8}z(98(DS;X=iT~F+$2X24LE%*GUlHin13D`V^ez>=!Y2l_X|Nko<$lKD8W_!*GRm z!a_o11ou#Biw9YC_m`V4B%jMD=|)Swb|Zwl(DS{68q1t9MaK7+Z{wx?r?7^wBo*}j zj!OE+M3n8PAQTi2m&dCL=wY+p0scW_CBWC?s%@E|zK-Zmco&Nm$-}@U60xD-;yMC- z3D?4UpqAS&e+Gq3RM~t?rm|vKP23J6Wlz|gr^EG9ZpwlNN$d{WPldSg&3UWWuWE*A zWq866eCVM2S2-IW{n+U&vm0)#ZU)bkl6m46+<1-3| zWiji>L^OOkl8aSh$v}&nqA-3t3tQN^Ag%ji+5E-ysdm7na`c%)0AIqgcP*X0d4us*?$Ar;1&R8AI% z9ThM!ISg!};yKAOHXX<7n$p(TYKWro4~yjKJBc)2Z~SP1N&`C@qyAveBhK#AwE!! z;se)hF5&}~n~bj14qeG^IUtFZ!O#;g;j=2rC?CHMef%81@%26qXy7fu8Ox1v#Hr_3 zBRb+BT3m_hTHQZQRM!{(?8pIB0ColKAwT<-jVX5o`)yl{h3;vvSseWn%xs1L*;f(I zvkU>Xeh6K7`=MSj;|LGP#%(}Po5c}q8UCL^I!SvhS!DwDKcL3$abn9xa}m>br6 z`5wZS9(t|VeVYH{~`r44+dhV?8Ff>iu z-n1}0o>t5rRIGbZGWi_HwE+)g9`OA1kUC~PDgm1G<4Xzn?HmOpJx5#sJ; z=7|tktCqSqYpwi^P2ReMVi#uzja6apz>^ z6iffQq6;V(Yd4r0zdy0=)JFrpnbup0nskR@^;%i&7-ocX*m?B!;gX^##2nt5#pd&XM?<2zfwN%yN)PJS+1a>xo;* z4tOsI8qgN_9%nWGCt8KGBiUZqJ>^tkPFLcNiW$de5FJrl{UtA&*!m%rT8g2Xw0P(o z0`eZqT`qNZ)IW6!@elAVaH#_Ki~~*^Z1sPl8cJwfO`hWLLLuj4)ddP-J%y(!4~%Ms zQ74i`$ek52yuq22d%QC2!+EdX&P+z+{f#tiN0DkAKOw-{AGCx&o;AQI#n84zSo}{W z%K2l|v9b#=abuCo^`~anXzpn_9YcR|MTu)%#IaJbXd61CDISVdtK%H$-GkUEHBz7%ku&KDE96h{~S$Upx|P{!X;8U@`9pMuvHeIVOx zI!nshwnvpo0luf0Nf|zQ_|^i_@bTC3R8q41_JDVL)_aka;q`BMVgF?S{`D3?{vV_x B|2zNy diff --git a/examples_cuda/stencil/stencil_orig.cpp b/examples_cuda/stencil/stencil_orig.cpp new file mode 100644 index 00000000..015f2b80 --- /dev/null +++ b/examples_cuda/stencil/stencil_orig.cpp @@ -0,0 +1,172 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifdef _MSC_VER +#define _CRT_SECURE_NO_WARNINGS +#define NOMINMAX +#pragma warning (disable: 4244) +#pragma warning (disable: 4305) +#endif + +#include <stdio.h> +#include <math.h> +#include <sys/time.h> +#include "../timing.h" +#include "stencil_ispc.h" +using namespace ispc; + +#include <algorithm> + + +double rtc(void) +{ + struct timeval Tvalue; + double etime; + struct timezone dummy; + + gettimeofday(&Tvalue,&dummy); + etime = (double) Tvalue.tv_sec + + 1.e-6*((double) Tvalue.tv_usec); + return etime; +} + + +extern void loop_stencil_serial(int t0, int t1, int x0, int x1, + int y0, int y1, int z0, int z1, + int Nx, int Ny, int Nz, + const double coef[5], + const double vsq[], + double Aeven[], double Aodd[]); + + +void InitData(int Nx, int Ny, int Nz, double *A[2], double *vsq) { + int offset = 0; + for (int z = 0; z < Nz; ++z) + for (int y = 0; y < Ny; ++y) + for (int x = 0; x < Nx; ++x, ++offset) { + A[0][offset] = (x < Nx / 2) ? x / double(Nx) : y / double(Ny); + A[1][offset] = 0; + vsq[offset] = x*y*z / double(Nx * Ny * Nz); + } +} + + +int main() { + int Nx = 256, Ny = 256, Nz = 256; + int width = 4; + double *Aserial[2], *Aispc[2]; + Aserial[0] = new double [Nx * Ny * Nz]; + Aserial[1] = new double [Nx * Ny * Nz]; + Aispc[0] = new double [Nx * Ny * Nz]; + Aispc[1] = new double [Nx * Ny * Nz]; + double *vsq = new double [Nx * Ny * Nz]; + + double coeff[4] = { 0.5, -.25, .125, -.0625 }; + +// InitData(Nx, Ny, Nz, Aispc, vsq); + + // + // Compute the image using the ispc implementation on one core; report + // the minimum time of three runs. + // + double minTimeISPC = 1e30; +#if 0 + for (int i = 0; i < 3; ++i) { + reset_and_start_timer(); + loop_stencil_ispc(0, 6, width, Nx - width, width, Ny - width, + width, Nz - width, Nx, Ny, Nz, coeff, vsq, + Aispc[0], Aispc[1]); + double dt = get_elapsed_mcycles(); + minTimeISPC = std::min(minTimeISPC, dt); + } + + printf("[stencil ispc 1 core]:\t\t[%.3f] million cycles\n", minTimeISPC); +#endif + + fprintf(stderr, " -- init -- \n"); + InitData(Nx, Ny, Nz, Aispc, vsq); + fprintf(stderr, " -- done init -- \n"); + + // + // Compute the image using the ispc implementation with tasks; report + // the minimum time of three runs. + // + double minTimeISPCTasks = 1e30; + for (int i = 0; i < 3; ++i) { + reset_and_start_timer(); + const double t0 = rtc(); + loop_stencil_ispc_tasks(0, 6, width, Nx - width, width, Ny - width, + width, Nz - width, Nx, Ny, Nz, coeff, vsq, + Aispc[0], Aispc[1]); + double dt = 1e3*(rtc() - t0); //get_elapsed_mcycles(); + minTimeISPCTasks = std::min(minTimeISPCTasks, dt); + } + + fprintf(stderr, "[stencil ispc + tasks]:\t\t[%.3f] million cycles\n", minTimeISPCTasks); + + + InitData(Nx, Ny, Nz, Aserial, vsq); + + // + // And run the serial implementation 3 times, again reporting the + // minimum time.
+ // + double minTimeSerial = 1e30; + for (int i = 0; i < 3; ++i) { + reset_and_start_timer(); + loop_stencil_serial(0, 6, width, Nx-width, width, Ny - width, + width, Nz - width, Nx, Ny, Nz, coeff, vsq, + Aserial[0], Aserial[1]); + double dt = get_elapsed_mcycles(); + minTimeSerial = std::min(minTimeSerial, dt); + } + + printf("[stencil serial]:\t\t[%.3f] million cycles\n", minTimeSerial); + + printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n", + minTimeSerial / minTimeISPC, minTimeSerial / minTimeISPCTasks); + + // Check for agreement + int offset = 0; + for (int z = 0; z < Nz; ++z) + for (int y = 0; y < Ny; ++y) + for (int x = 0; x < Nx; ++x, ++offset) { + double error = fabsf((Aserial[1][offset] - Aispc[1][offset]) / + Aserial[1][offset]); + if (error > 1e-4) + printf("Error @ (%d,%d,%d): ispc = %f, serial = %f\n", + x, y, z, Aispc[1][offset], Aserial[1][offset]); + } + + return 0; +} diff --git a/examples_cuda/stencil/stencil_orig.ispc b/examples_cuda/stencil/stencil_orig.ispc new file mode 100644 index 00000000..d2e095b3 --- /dev/null +++ b/examples_cuda/stencil/stencil_orig.ispc @@ -0,0 +1,172 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifdef __NVPTX__ +#warning "emitting DEVICE code" +#define taskIndex blockIndex0() +#define taskCount blockCount0() +#define programIndex laneIndex() +#define programCount warpSize() +#else +#warning "emitting HOST code" +#endif + +static inline void +stencil_step(uniform int x0, uniform int x1, + uniform int y0, uniform int y1, + uniform int z0, uniform int z1, + uniform int Nx, uniform int Ny, uniform int Nz, + uniform const double coef[4], uniform const double vsq[], + uniform const double Ain[], uniform double Aout[]) { + const uniform int Nxy = Nx * Ny; + +// foreach (z = z0 ... z1, y = y0 ... y1, x = x0 ... 
x1) +#if 0 +#define VER1 +#endif + +#ifdef VER1 + const uniform long x1o = 1; + const uniform long x2o = 2; + const uniform long x3o = 3; + const uniform long y1o = Nx; + const uniform long y2o = Nx*2; + const uniform long y3o = Nx*3; + const uniform long z1o = Nxy; + const uniform long z2o = Nxy*2; + const uniform long z3o = Nxy*3; +#endif + for (uniform int z = z0; z < z1; z++) + for (uniform int y = y0; y < y1; y++) + { + const int index_base = (z * Nxy) + (y * Nx); + for (uniform int xb = x0; xb < x1; xb += programCount) + { + const int x = xb + programIndex; + int index = index_base + x; +#ifndef VER1 +#define A_cur(x, y, z) Ain[index + (x) + ((y) * Nx) + ((z) * Nxy)] +#define A_next(x, y, z) Aout[index + (x) + ((y) * Nx) + ((z) * Nxy)] + double div = coef[0] * A_cur(0, 0, 0) + + coef[1] * (A_cur(+1, 0, 0) + A_cur(-1, 0, 0) + + A_cur(0, +1, 0) + A_cur(0, -1, 0) + + A_cur(0, 0, +1) + A_cur(0, 0, -1)) + + coef[2] * (A_cur(+2, 0, 0) + A_cur(-2, 0, 0) + + A_cur(0, +2, 0) + A_cur(0, -2, 0) + + A_cur(0, 0, +2) + A_cur(0, 0, -2)) + + coef[3] * (A_cur(+3, 0, 0) + A_cur(-3, 0, 0) + + A_cur(0, +3, 0) + A_cur(0, -3, 0) + + A_cur(0, 0, +3) + A_cur(0, 0, -3)); +#else +#define A_cur(x, y, z) Ain [index + (x) + (y) + (z)] +#define A_next(x, y, z) Aout[index + (x) + (y) + (z)] + double div = coef[0] * A_cur(0, 0, 0) + + coef[1] * (A_cur(+x1o, 0, 0) + A_cur(-x1o, 0, 0) + + A_cur(0, +y1o, 0) + A_cur(0, -y1o, 0) + + A_cur(0, 0, +z1o) + A_cur(0, 0, -z1o)) + + coef[2] * (A_cur(+x2o, 0, 0) + A_cur(-x2o, 0, 0) + + A_cur(0, +y2o, 0) + A_cur(0, -y2o, 0) + + A_cur(0, 0, +z2o) + A_cur(0, 0, -z2o)) + + coef[3] * (A_cur(+x3o, 0, 0) + A_cur(-x3o, 0, 0) + + A_cur(0, +y3o, 0) + A_cur(0, -y3o, 0) + + A_cur(0, 0, +z3o) + A_cur(0, 0, -z3o)); +#endif + + if (x < x1) + A_next(0, 0, 0) = 2.0d0 * A_cur(0, 0, 0) - A_next(0, 0, 0) + + vsq[index] * div; + } + } +} + + +static task void +stencil_step_task(uniform int x0, uniform int x1, + uniform int y0, uniform int y1, + uniform int z0, + uniform int Nx, uniform int Ny, uniform int Nz, + uniform const double coef[4], uniform const double vsq[], + uniform const double Ain[], uniform double Aout[]) { + if(taskIndex >= taskCount) return; + + stencil_step(x0, x1, y0, y1, z0+taskIndex, z0+taskIndex+1, + Nx, Ny, Nz, coef, vsq, Ain, Aout); +} + + +export void +loop_stencil_ispc_tasks(uniform int t0, uniform int t1, + uniform int x0, uniform int x1, + uniform int y0, uniform int y1, + uniform int z0, uniform int z1, + uniform int Nx, uniform int Ny, uniform int Nz, + uniform const double coef[4], + uniform const double vsq[], + uniform double Aeven[], uniform double Aodd[]) +{ + for (uniform int t = t0; t < t1; ++t) { + // Parallelize across cores as well: each task will work on a slice + // of 1 in the z extent of the volume. + if ((t & 1) == 0) + launch[z1-z0] stencil_step_task(x0, x1, y0, y1, z0, Nx, Ny, Nz, + coef, vsq, Aeven, Aodd); + else + launch[z1-z0] stencil_step_task(x0, x1, y0, y1, z0, Nx, Ny, Nz, + coef, vsq, Aodd, Aeven); + + // We need to wait for all of the launched tasks to finish before + // starting the next iteration. 
+ sync; + } +} + + +export void +loop_stencil_ispc(uniform int t0, uniform int t1, + uniform int x0, uniform int x1, + uniform int y0, uniform int y1, + uniform int z0, uniform int z1, + uniform int Nx, uniform int Ny, uniform int Nz, + uniform const double coef[4], + uniform const double vsq[], + uniform double Aeven[], uniform double Aodd[]) +{ + for (uniform int t = t0; t < t1; ++t) { + if ((t & 1) == 0) + stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq, + Aeven, Aodd); + else + stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq, + Aodd, Aeven); + } +} diff --git a/examples_cuda/stencil/stencil_parallel.cpp b/examples_cuda/stencil/stencil_parallel.cpp index 30ded2cd..d4e59dc8 100644 --- a/examples_cuda/stencil/stencil_parallel.cpp +++ b/examples_cuda/stencil/stencil_parallel.cpp @@ -37,8 +37,8 @@ stencil_step(int x0, int x1, int y0, int y1, int z0, int z1, int Nx, int Ny, int Nz, - const float coef[4], const float vsq[], - const float Ain[], float Aout[]) { + const double coef[4], const double vsq[], + const double Ain[], double Aout[]) { int Nxy = Nx * Ny; #pragma omp parallel for @@ -48,7 +48,7 @@ stencil_step(int x0, int x1, int index = (z * Nxy) + (y * Nx) + x; #define A_cur(x, y, z) Ain[index + (x) + ((y) * Nx) + ((z) * Nxy)] #define A_next(x, y, z) Aout[index + (x) + ((y) * Nx) + ((z) * Nxy)] - float div = coef[0] * A_cur(0, 0, 0) + + double div = coef[0] * A_cur(0, 0, 0) + coef[1] * (A_cur(+1, 0, 0) + A_cur(-1, 0, 0) + A_cur(0, +1, 0) + A_cur(0, -1, 0) + A_cur(0, 0, +1) + A_cur(0, 0, -1)) + @@ -72,9 +72,9 @@ void loop_stencil_parallel(int t0, int t1, int y0, int y1, int z0, int z1, int Nx, int Ny, int Nz, - const float coef[4], - const float vsq[], - float Aeven[], float Aodd[]) + const double coef[4], + const double vsq[], + double Aeven[], double Aodd[]) { for (int t = t0; t < t1; ++t) { if ((t & 1) == 0) diff --git a/examples_cuda/stencil/stencil_serial.o b/examples_cuda/stencil/stencil_serial.o new file mode 100644 index 0000000000000000000000000000000000000000..1fd32c299aeb42b3c7b58279918024d18b00f656 GIT binary patch literal 2360 zcmbtUUrZZy9RFR*pi^_#5n@O-Lrs2(CC&>NRO*Pi>#pv{JPeVd;Fw)^Eg6x2q4zKi zi6kAyOLvS%9(s#!0MXSu48L@ArBKrJ^tUrM>Up z&*%I7eSg2Zk3#C}E{-G4;>c^{z;e8VDzWB_YOJd8Z2k^&Hdr#}WyxQ?SS!Oc zg{HAk7R@w_CLajXMPViYgJ!Ug9K;rcrYWon%u4}iPME<%!9p7h5ScFHPnOat3@A*u zI*n{+;m6Jb5KhuL#uXX2$GAsAlSFm}uf=&26jt07`lS4z;QtmB`1c>*4GMiq5&Q%7 zmb!r{O15YjD`51?9`I}cT0%w2S&@7>A!lV5i`aD93T#;ruLD6v+JcATq*_@vf2PQi zE5O<>HTHx?+s=-c2z zw&OkOz^f@Ksbx9~nM!Jr&Vf`@U(vKBzn}{s1@jYl!bMYzzZk?;z`A$A;b*C9bO}7n z7^Z$aZ=aI7P7$F%WC~<*jf2ZCLFXONc{}W~O*LY)Q3Zro0bY+{bkN-Af0s&U2B@Mx z^I(nB)0MnF1y+F&5RMM7*3x z5K$W&8HtaAZ*)wLi?HVx5_(j7pNJFj;iwqD5*eI`j>PTG9#5PEIDQvvt7s(KzTrL` z@E$hFvtT2>Dve~S9X77wb8C5?gQw>)@TUQb^;_Uyxt#j-l}^wx7ndVns?e=~{rKfS z`gd?n?s*%JSFCd7Y<#-X2|Dhza)rRCo(aZ1$IDsHzR<)oa`27iFgXG8s8(lf(y*Q9 zi0b~)uI?Rk({*J;zAVTSAhCz)`sh2V{+yjSX5$@EUElFnOU$bwf0U~IXuJ1M?{L(8 zW*y+W0t$ysNgpxbN5F<3rBsek0b?5`kZLOdLu~M8y_JZVe2Njg3e40Tz*i09gig Y$9h^Lg!r%~ENr9loMV>v!0+_=5ANaM@c;k- literal 0 HcmV?d00001
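
Editorial note on the benchmark driver in stencil_orig.cpp: the tasks path is timed as the minimum wall-clock time of three runs, using rtc() built on gettimeofday() and reported in milliseconds. The following is a minimal, self-contained sketch of that timing pattern only; kernel_under_test() is a placeholder, not a function from this patch.

    // Sketch of the min-of-three wall-clock timing pattern used by the driver.
    // kernel_under_test() stands in for e.g. loop_stencil_ispc_tasks(...).
    #include <stdio.h>
    #include <algorithm>
    #include <sys/time.h>

    static double rtc(void) {                       // wall-clock time in seconds
        struct timeval tv;
        gettimeofday(&tv, NULL);
        return (double)tv.tv_sec + 1.e-6 * (double)tv.tv_usec;
    }

    static void kernel_under_test(void) { /* work being benchmarked goes here */ }

    int main() {
        double minTime = 1e30;                      // "infinity" sentinel, as in the driver
        for (int i = 0; i < 3; ++i) {
            const double t0 = rtc();
            kernel_under_test();
            const double dt = 1e3 * (rtc() - t0);   // elapsed time in milliseconds
            minTime = std::min(minTime, dt);
        }
        printf("[kernel]: %.3f ms\n", minTime);
        return 0;
    }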
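Editorial note on the kernel in stencil_orig.ispc: the A_cur/A_next macros flatten a 3-D index (x, y, z) to index + x + y*Nx + z*Nx*Ny and apply a four-coefficient, 25-point stencil (the centre plus the +/-1, +/-2, +/-3 neighbours along each axis), followed by the update Aout = 2*Ain - Aout + vsq*div. A plain scalar C++ transcription of one interior point is shown below as a reference for checking the vectorized code; the function name and its loop structure are illustrative and not part of the patch.

    // Scalar reference for a single stencil point; mirrors A_cur/A_next above.
    // Assumes (x, y, z) is at least 3 cells away from every boundary
    // (the driver uses width = 4, which satisfies this).
    static void stencil_point(int x, int y, int z, int Nx, int Ny,
                              const double coef[4], const double vsq[],
                              const double Ain[], double Aout[]) {
        const int Nxy = Nx * Ny;
        const int idx = x + y * Nx + z * Nxy;

        double div = coef[0] * Ain[idx];
        for (int r = 1; r <= 3; ++r)                 // shells at distance 1, 2, 3
            div += coef[r] * (Ain[idx + r]       + Ain[idx - r] +        // +/- x
                              Ain[idx + r * Nx]  + Ain[idx - r * Nx] +   // +/- y
                              Ain[idx + r * Nxy] + Ain[idx - r * Nxy]);  // +/- z

        // Same update as the ISPC kernel: Aout still holds the previous time step.
        Aout[idx] = 2.0 * Ain[idx] - Aout[idx] + vsq[idx] * div;
    }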
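Editorial note on the time-stepping structure: loop_stencil_ispc_tasks ping-pongs between Aeven and Aodd across time steps and launches one task per z-slice (launch[z1-z0], each task handling slice z0+taskIndex, with sync barriering between steps); loop_stencil_ispc does the same ping-pong without tasks. Stripped of ISPC's launch/sync machinery, the control flow is equivalent to the serial sketch below. stencil_slice() is an assumed helper (not in the patch) that updates a single z-plane, for instance by looping the per-point reference above.

    // Assumed helper: updates the plane z = zslice over [x0,x1) x [y0,y1).
    void stencil_slice(int x0, int x1, int y0, int y1, int zslice,
                       int Nx, int Ny, int Nz,
                       const double coef[4], const double vsq[],
                       const double Ain[], double Aout[]);

    // Serial sketch of the even/odd buffer ping-pong and per-z-slice decomposition.
    void loop_stencil_reference(int t0, int t1,
                                int x0, int x1, int y0, int y1, int z0, int z1,
                                int Nx, int Ny, int Nz,
                                const double coef[4], const double vsq[],
                                double Aeven[], double Aodd[]) {
        for (int t = t0; t < t1; ++t) {
            // Even steps treat Aeven as the current field and update Aodd
            // (which also holds the previous step, per the leap-frog update);
            // odd steps swap the roles, exactly as in loop_stencil_ispc_tasks.
            const double *Ain  = ((t & 1) == 0) ? Aeven : Aodd;
            double       *Aout = ((t & 1) == 0) ? Aodd  : Aeven;

            // One "task" per z-slice: these iterations are what launch[z1-z0]
            // hands out, and sync waits for all of them before the next step.
            for (int slice = 0; slice < z1 - z0; ++slice)
                stencil_slice(x0, x1, y0, y1, z0 + slice,
                              Nx, Ny, Nz, coef, vsq, Ain, Aout);
        }
    }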