diff --git a/examples_cuda/stencil/1.s b/examples_cuda/stencil/1.s deleted file mode 100644 index d59cb1f9..00000000 --- a/examples_cuda/stencil/1.s +++ /dev/null @@ -1,175 +0,0 @@ - - code for sm_35 - Function : stencil_step_task - .headerflags @"EF_CUDA_SM35 EF_CUDA_PTX_SM(EF_CUDA_SM35)" - /* 0x0880a010a0a01000 */ - /*0008*/ MOV R1, c[0x0][0x44]; /* 0x64c03c00089c0006 */ - /*0010*/ S2R R10, SR_CTAID.X; /* 0x86400000129c002a */ - /*0018*/ MOV R12, c[0x0][0x160]; /* 0x64c03c002c1c0032 */ - /*0020*/ IADD R0, R10, c[0x0][0x150]; /* 0x608000002a1c2802 */ - /*0028*/ IADD R11, R0, 0x1; /* 0xc0800000009c002d */ - /*0030*/ MOV R13, c[0x0][0x164]; /* 0x64c03c002c9c0036 */ - /*0038*/ ISETP.GE.AND P0, PT, R0, R11, PT; /* 0xdb681c00059c001e */ - /* 0x08a0a1ac118d8d8c */ - /*0048*/ LD.E.64 R8, [R12]; /* 0xc5800000001c3020 */ - /*0050*/ LD.E.64 R6, [R12+0x8]; /* 0xc5800000041c3018 */ - /*0058*/ LD.E.64 R4, [R12+0x10]; /* 0xc5800000081c3010 */ - /*0060*/ LD.E.64 R2, [R12+0x18]; /* 0xc58000000c1c3008 */ - /*0068*/ @P0 EXIT ; /* 0x180000000000003c */ - /*0070*/ MOV R11, c[0x0][0x158]; /* 0x64c03c002b1c002e */ - /*0078*/ IMUL R41, R11, c[0x0][0x154]; /* 0x61c018002a9c2ca6 */ - /* 0x08b0a000a010a010 */ - /*0088*/ IADD R11, R10, c[0x0][0x150]; /* 0x608000002a1c282e */ - /*0090*/ SHF.L R40, RZ, 0x1, R41; /* 0xb7c0a400009ffca1 */ - /*0098*/ I2I.S32.S32 R10, -R40; /* 0xe6010000141ce82a */ - /*00a0*/ IADD R49, R11, 0x1; /* 0xc0800000009c2cc5 */ - /*00a8*/ SHF.L R28, RZ, 0x3, R10; /* 0xb7c02800019ffc71 */ - /*00b0*/ MOV R10, c[0x0][0x148]; /* 0x64c03c00291c002a */ - /*00b8*/ ISETP.GE.AND P0, PT, R10, c[0x0][0x14c], PT; /* 0x5b681c00299c281e */ - /* 0x0880acb0a00010ac */ - /*00c8*/ @P0 BRA 0x4f0; /* 0x120000021000003c */ - /*00d0*/ MOV R29, c[0x0][0x148]; /* 0x64c03c00291c0076 */ - /*00d8*/ IMUL R42, R0, R41; /* 0xe1c01800149c00aa */ - /*00e0*/ MOV R10, c[0x0][0x140]; /* 0x64c03c00281c002a */ - /*00e8*/ ISETP.GE.AND P0, PT, R10, c[0x0][0x144], PT; /* 0x5b681c00289c281e */ - /*00f0*/ @P0 BRA 0x4d8; /* 0x12000001f000003c */ - /*00f8*/ MOV R10, c[0x0][0x154]; /* 0x64c03c002a9c002a */ - /* 0x0880888010a0109c */ - /*0108*/ IMAD R44, R29, c[0x0][0x154], R42; /* 0x5108a8002a9c74b2 */ - /*0110*/ SHF.L R11, RZ, 0x1, R10; /* 0xb7c02800009ffc2d */ - /*0118*/ MOV R39, c[0x0][0x140]; /* 0x64c03c00281c009e */ - /*0120*/ IMAD R34, R10, -0x2, R44; /* 0xa908b3ffff1c2889 */ - /*0128*/ IADD R43, R44, R11; /* 0xe0800000059cb0ae */ - /*0130*/ I2I.S32.S32 R10, -R11; /* 0xe6010000059ce82a */ - /*0138*/ IMAD R36, R41, -0x2, R44; /* 0xa908b3ffff1ca491 */ - /* 0x08a0001084108480 */ - /*0148*/ IADD R32, R44, c[0x0][0x154]; /* 0x608000002a9cb082 */ - /*0150*/ IADD R33, R44, R41; /* 0xe0800000149cb086 */ - /*0158*/ IADD R35, R44, R40; /* 0xe0800000141cb08e */ - /*0160*/ IMAD R38, R41, 0x3, R44; /* 0xa108b000019ca499 */ - /*0168*/ SHF.L R47, RZ, 0x3, R10; /* 0xb7c02800019ffcbd */ - /*0170*/ IADD R37, R43, c[0x0][0x154]; /* 0x608000002a9cac96 */ - /*0178*/ S2R R10, SR_TID.X; /* 0x86400000109c002a */ - /* 0x08a0b0a010908c10 */ - /*0188*/ MOV32I R48, 0x8; /* 0x74000000041fc0c2 */ - /*0190*/ IADD R45, R10, R39; /* 0xe0800000139c28b6 */ - /*0198*/ BFE R30, R47, 0x11f; /* 0xc00800008f9cbc79 */ - /*01a0*/ IADD R46, R45, R44; /* 0xe0800000161cb4ba */ - /*01a8*/ IADD R14, R32, R45; /* 0xe0800000169c803a */ - /*01b0*/ IMAD R10.CC, R46, R48, c[0x0][0x170]; /* 0x910cc0002e1cb82a */ - /*01b8*/ IMAD.HI.X R11, R46, R48, c[0x0][0x174]; /* 0x9318c0002e9cb82e */ - /* 0x0881cc118c118c10 */ - /*01c8*/ IADD R27, R37, R45; /* 0xe0800000169c946e */ - /*01d0*/ 
LD.E.64 R12, [R10+-0x8]; /* 0xc5fffffffc1c2830 */ - /*01d8*/ BFE R50, R28, 0x11f; /* 0xc00800008f9c70c9 */ - /*01e0*/ LD.E.64 R24, [R10+0x8]; /* 0xc5800000041c2860 */ - /*01e8*/ ISETP.GE.AND P0, PT, R45, c[0x0][0x144], PT; /* 0x5b681c00289cb41e */ - /*01f0*/ LD.E.64 R18, [R10+-0x18]; /* 0xc5fffffff41c2848 */ - /*01f8*/ DADD R20, R24, R12; /* 0xe3800000061c6052 */ - /* 0x098c10a011ac8188 */ - /*0208*/ LD.E.64 R22, [R10+0x18]; /* 0xc58000000c1c2858 */ - /*0210*/ IMAD R16.CC, R14, R48, c[0x0][0x170]; /* 0x910cc0002e1c3842 */ - /*0218*/ LD.E.64 R12, [R10+-0x10]; /* 0xc5fffffff81c2830 */ - /*0220*/ IMAD.HI.X R17, R14, R48, c[0x0][0x174]; /* 0x9318c0002e9c3846 */ - /*0228*/ IADD R25, R43, R45; /* 0xe0800000169cac66 */ - /*0230*/ LD.E.64 R14, [R16]; /* 0xc5800000001c4038 */ - /*0238*/ DADD R22, R22, R18; /* 0xe3800000091c585a */ - /* 0x0994808c848cb180 */ - /*0248*/ LD.E.64 R18, [R10+0x10]; /* 0xc5800000081c2848 */ - /*0250*/ IMAD R26.CC, R27, R48, c[0x0][0x170]; /* 0x910cc0002e1c6c6a */ - /*0258*/ IMAD.HI.X R27, R27, R48, c[0x0][0x174]; /* 0x9318c0002e9c6c6e */ - /*0260*/ IMAD R24.CC, R25, R48, c[0x0][0x170]; /* 0x910cc0002e1c6462 */ - /*0268*/ DADD R14, R20, R14; /* 0xe3800000071c503a */ - /*0270*/ DADD R20, R18, R12; /* 0xe3800000061c4852 */ - /*0278*/ LD.E.64 R12, [R26]; /* 0xc5800000001c6830 */ - /* 0x08b080118010c080 */ - /*0288*/ IMAD.HI.X R25, R25, R48, c[0x0][0x174]; /* 0x9318c0002e9c6466 */ - /*0290*/ IADD R16.CC, R16, R47; /* 0xe0840000179c4042 */ - /*0298*/ LD.E.64 R18, [R24]; /* 0xc5800000001c6048 */ - /*02a0*/ DADD R12, R22, R12; /* 0xe3800000061c5832 */ - /*02a8*/ IADD.X R17, R17, R30; /* 0xe08040000f1c4446 */ - /*02b0*/ IADD R31, R34, R45; /* 0xe0800000169c887e */ - /*02b8*/ IADD R22.CC, R16, R47; /* 0xe0840000179c405a */ - /* 0x089980818880a010 */ - /*02c8*/ IADD.X R23, R17, R30; /* 0xe08040000f1c445e */ - /*02d0*/ IMAD R26.CC, R31, R48, c[0x0][0x170]; /* 0x910cc0002e1c7c6a */ - /*02d8*/ DADD R20, R20, R18; /* 0xe3800000091c5052 */ - /*02e0*/ LD.E.64 R18, [R16]; /* 0xc5800000001c4048 */ - /*02e8*/ IMAD.HI.X R27, R31, R48, c[0x0][0x174]; /* 0x9318c0002e9c7c6e */ - /*02f0*/ LD.E.64 R24, [R22]; /* 0xc5800000001c5860 */ - /*02f8*/ IADD R51, R33, R45; /* 0xe0800000169c84ce */ - /* 0x088880ac818c11b8 */ - /*0308*/ LD.E.64 R30, [R26]; /* 0xc5800000001c6878 */ - /*0310*/ LD.E.64 R26, [R10]; /* 0xc5800000001c2868 */ - /*0318*/ DADD R14, R14, R18; /* 0xe3800000091c383a */ - /*0320*/ IMAD R18.CC, R51, R48, c[0x0][0x170]; /* 0x910cc0002e1ccc4a */ - /*0328*/ IADD R17, R35, R45; /* 0xe0800000169c8c46 */ - /*0330*/ IMAD.HI.X R19, R51, R48, c[0x0][0x174]; /* 0x9318c0002e9ccc4e */ - /*0338*/ DADD R22, R20, R30; /* 0xe38000000f1c505a */ - /* 0x098c10a0999c1090 */ - /*0348*/ IMAD R16.CC, R17, R48, c[0x0][0x170]; /* 0x910cc0002e1c4442 */ - /*0350*/ LD.E.64 R20, [R18]; /* 0xc5800000001c4850 */ - /*0358*/ DADD R12, R12, R24; /* 0xe38000000c1c3032 */ - /*0360*/ IMAD.HI.X R17, R17, R48, c[0x0][0x174]; /* 0x9318c0002e9c4446 */ - /*0368*/ IADD R18.CC, R18, R28; /* 0xe08400000e1c484a */ - /*0370*/ LD.E.64 R24, [R16]; /* 0xc5800000001c4060 */ - /*0378*/ DADD R20, R14, R20; /* 0xe38000000a1c3852 */ - /* 0x088080b4a18010cc */ - /*0388*/ IADD.X R19, R19, R50; /* 0xe0804000191c4c4e */ - /*0390*/ LD.E.64 R14, [R18]; /* 0xc5800000001c4838 */ - /*0398*/ DADD R22, R22, R24; /* 0xe38000000c1c585a */ - /*03a0*/ IADD R25, R36, R45; /* 0xe0800000169c9066 */ - /*03a8*/ IMAD R16.CC, R25, R48, c[0x0][0x170]; /* 0x910cc0002e1c6442 */ - /*03b0*/ DADD R20, R20, R14; /* 0xe3800000071c5052 */ - /*03b8*/ IADD R15, R38, R45; /* 
0xe0800000169c983e */ - /* 0x09a010b081ac809c */ - /*03c8*/ IMAD.HI.X R17, R25, R48, c[0x0][0x174]; /* 0x9318c0002e9c6446 */ - /*03d0*/ IMAD R14.CC, R15, R48, c[0x0][0x170]; /* 0x910cc0002e1c3c3a */ - /*03d8*/ LD.E.64 R24, [R16]; /* 0xc5800000001c4060 */ - /*03e0*/ IMAD.HI.X R15, R15, R48, c[0x0][0x174]; /* 0x9318c0002e9c3c3e */ - /*03e8*/ IADD R18.CC, R18, R28; /* 0xe08400000e1c484a */ - /*03f0*/ LD.E.64 R30, [R14]; /* 0xc5800000001c3878 */ - /*03f8*/ IADD.X R19, R19, R50; /* 0xe0804000191c4c4e */ - /* 0x08a480a480b58010 */ - /*0408*/ LD.E.64 R50, [R18]; /* 0xc5800000001c48c8 */ - /*0410*/ DMUL R20, R6, R20; /* 0xe40000000a1c1852 */ - /*0418*/ DADD R22, R22, R24; /* 0xe38000000c1c585a */ - /*0420*/ DADD R12, R12, R30; /* 0xe38000000f1c3032 */ - /*0428*/ DFMA R24, R8, R26, R20; /* 0xdb8050000d1c2062 */ - /*0430*/ DFMA R16, R4, R22, R24; /* 0xdb8060000b1c1042 */ - /*0438*/ DADD R12, R12, R50; /* 0xe3800000191c3032 */ - /* 0x08908cb0a010ac80 */ - /*0448*/ DFMA R10, R2, R12, R16; /* 0xdb804000061c082a */ - /*0450*/ @P0 BRA.U 0x4b8; /* 0x120000003000023c */ - /*0458*/ @!P0 MOV32I R17, 0x8; /* 0x740000000423c046 */ - /*0460*/ @!P0 DADD R18, R26, R26; /* 0xe38000000d20684a */ - /*0468*/ @!P0 IMAD R14.CC, R46, R17, c[0x0][0x178]; /* 0x910c44002f20b83a */ - /*0470*/ @!P0 IMAD.HI.X R15, R46, R17, c[0x0][0x17c]; /* 0x931844002fa0b83e */ - /*0478*/ @!P0 IMAD R16.CC, R46, R17, c[0x0][0x168]; /* 0x910c44002d20b842 */ - /* 0x08a180a5dc10bd9c */ - /*0488*/ @!P0 LD.E.64 R12, [R14]; /* 0xc580000000203830 */ - /*0490*/ @!P0 IMAD.HI.X R17, R46, R17, c[0x0][0x16c]; /* 0x931844002da0b846 */ - /*0498*/ @!P0 LD.E.64 R20, [R16]; /* 0xc580000000204050 */ - /*04a0*/ @!P0 DADD R22, R18, -R12; /* 0xe38100000620485a */ - /*04a8*/ @!P0 DFMA R10, R20, R10, R22; /* 0xdb8058000520502a */ - /*04b0*/ @!P0 ST.E.64 [R14], R10; /* 0xe580000000203828 */ - /*04b8*/ IADD R39, R39, 0x20; /* 0xc0800000101c9c9d */ - /* 0x08b0a0b8b0a0b8b0 */ - /*04c8*/ ISETP.LT.AND P0, PT, R39, c[0x0][0x144], PT; /* 0x5b181c00289c9c1e */ - /*04d0*/ @P0 BRA 0x178; /* 0x12007ffe5000003c */ - /*04d8*/ IADD R29, R29, 0x1; /* 0xc0800000009c7475 */ - /*04e0*/ ISETP.LT.AND P0, PT, R29, c[0x0][0x14c], PT; /* 0x5b181c00299c741e */ - /*04e8*/ @P0 BRA 0xe0; /* 0x12007ffdf800003c */ - /*04f0*/ IADD R0, R0, 0x1; /* 0xc0800000009c0001 */ - /*04f8*/ ISETP.LT.AND P0, PT, R0, R49, PT; /* 0xdb181c00189c001e */ - /* 0x0800000000b810b8 */ - /*0508*/ @P0 BRA 0xb0; /* 0x12007ffdd000003c */ - /*0510*/ MOV RZ, RZ; /* 0xe4c03c007f9c03fe */ - /*0518*/ EXIT ; /* 0x18000000001c003c */ - /*0520*/ BRA 0x520; /* 0x12007ffffc1c003c */ - /*0528*/ NOP; /* 0x85800000001c3c02 */ - /*0530*/ NOP; /* 0x85800000001c3c02 */ - /*0538*/ NOP; /* 0x85800000001c3c02 */ - .................................. 
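The dump above (1.s) is the sm_35 SASS for the stencil example's inner update: the four coefficients are loaded once through the pointer at c[0x0][0x160] (LD.E.64 into R8/R6/R4/R2), each x-iteration gathers the centre point plus six neighbours per ring at distances 1, 2 and 3 along x, y and z, and the ring sums are folded in with one DMUL and three DFMAs before the 2*A - Aprev + vsq*div tail (DADD, DADD with a negated operand, DFMA, ST.E.64). A minimal CUDA sketch of that per-point update; the names Ain, Aout, vsq, coef, Nx and Nxy are illustrative assumptions and do not appear in the dump:

    // Hedged reconstruction of the update the SASS above computes.
    // Nx is the row stride and Nxy = Nx * Ny the plane stride, in doubles.
    __device__ void stencil_point(int idx, int Nx, int Nxy,
                                  const double *coef, const double *vsq,
                                  const double *Ain, double *Aout)
    {
        double div = coef[0] * Ain[idx];
        for (int r = 1; r <= 3; ++r)            // rings at distance 1, 2, 3
            div += coef[r] * (Ain[idx + r]       + Ain[idx - r] +
                              Ain[idx + r * Nx]  + Ain[idx - r * Nx] +
                              Ain[idx + r * Nxy] + Ain[idx - r * Nxy]);
        // matches the DADD/DADD/DFMA sequence feeding the ST.E.64
        Aout[idx] = 2.0 * Ain[idx] - Aout[idx] + vsq[idx] * div;
    }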
- - diff --git a/examples_cuda/stencil/2.s b/examples_cuda/stencil/2.s deleted file mode 100644 index 76476d03..00000000 --- a/examples_cuda/stencil/2.s +++ /dev/null @@ -1,239 +0,0 @@ - - code for sm_35 - Function : stencil_step_task - .headerflags @"EF_CUDA_SM35 EF_CUDA_PTX_SM(EF_CUDA_SM35)" - /* 0x0880acb0a0a0a000 */ - /*0008*/ MOV R1, c[0x0][0x44]; /* 0x64c03c00089c0006 */ - /*0010*/ S2R R10, SR_CTAID.X; /* 0x86400000129c002a */ - /*0018*/ IADD R44, R10, c[0x0][0x150]; /* 0x608000002a1c28b2 */ - /*0020*/ IADD R0, R44, 0x1; /* 0xc0800000009cb001 */ - /*0028*/ ISETP.GE.AND P0, PT, R44, R0, PT; /* 0xdb681c00001cb01e */ - /*0030*/ @P0 EXIT ; /* 0x180000000000003c */ - /*0038*/ MOV R11, c[0x0][0x154]; /* 0x64c03c002a9c002e */ - /* 0x0888108010a01080 */ - /*0048*/ IADD R41, R10, c[0x0][0x150]; /* 0x608000002a1c28a6 */ - /*0050*/ MOV R12, c[0x0][0x160]; /* 0x64c03c002c1c0032 */ - /*0058*/ MOV R13, c[0x0][0x164]; /* 0x64c03c002c9c0036 */ - /*0060*/ IMUL R35, R11, c[0x0][0x158]; /* 0x61c018002b1c2c8e */ - /*0068*/ LD.E.64 R8, [R12]; /* 0xc5800000001c3020 */ - /*0070*/ SHF.L R36, RZ, 0x1, R11; /* 0xb7c02c00009ffc91 */ - /*0078*/ MOV R42, c[0x0][0x148]; /* 0x64c03c00291c00aa */ - /* 0x088c80108c108c10 */ - /*0088*/ LD.E.64 R6, [R12+0x8]; /* 0xc5800000041c3018 */ - /*0090*/ IMUL R0, R11, 0x3; /* 0xc1c01800019c2c01 */ - /*0098*/ LD.E.64 R4, [R12+0x10]; /* 0xc5800000081c3010 */ - /*00a0*/ IMUL R18, R11, -0x3; /* 0xc9c01bfffe9c2c49 */ - /*00a8*/ SHF.L R37, RZ, 0x1, R35; /* 0xb7c08c00009ffc95 */ - /*00b0*/ LD.E.64 R2, [R12+0x18]; /* 0xc58000000c1c3008 */ - /*00b8*/ IMUL R19, R35, 0x3; /* 0xc1c01800019c8c4d */ - /* 0x0880acb0a0acb000 */ - /*00c8*/ IMUL R20, R35, -0x3; /* 0xc9c01bfffe9c8c51 */ - /*00d0*/ ISETP.GE.AND P0, PT, R42, c[0x0][0x14c], PT; /* 0x5b681c00299ca81e */ - /*00d8*/ @P0 BRA 0x6d8; /* 0x12000002fc00003c */ - /*00e0*/ MOV R10, c[0x0][0x140]; /* 0x64c03c00281c002a */ - /*00e8*/ ISETP.LT.AND P0, PT, R10, c[0x0][0x144], PT; /* 0x5b181c00289c281e */ - /*00f0*/ @!P0 BRA 0x6d8; /* 0x12000002f020003c */ - /*00f8*/ IMUL R40, R44, R35; /* 0xe1c01800119cb0a2 */ - /* 0x088880108c10a000 */ - /*0108*/ MOV R21, c[0x0][0x148]; /* 0x64c03c00291c0056 */ - /*0110*/ IMAD R39, R21, c[0x0][0x154], R40; /* 0x5108a0002a9c549e */ - /*0118*/ MOV R34, c[0x0][0x140]; /* 0x64c03c00281c008a */ - /*0120*/ IADD R29, R39, R37; /* 0xe0800000129c9c76 */ - /*0128*/ IADD R22, R39, c[0x0][0x154]; /* 0x608000002a9c9c5a */ - /*0130*/ ISUB R32, R39, R37; /* 0xe0880000129c9c82 */ - /*0138*/ IADD R23, R39, R36; /* 0xe0800000121c9c5e */ - /* 0x0880808080108c10 */ - /*0148*/ ISUB R24, R39, c[0x0][0x154]; /* 0x608800002a9c9c62 */ - /*0150*/ IADD R25, R39, R0; /* 0xe0800000001c9c66 */ - /*0158*/ ISUB R26, R39, R36; /* 0xe0880000121c9c6a */ - /*0160*/ IADD R27, R39, R35; /* 0xe0800000119c9c6e */ - /*0168*/ IADD R28, R39, R18; /* 0xe0800000091c9c72 */ - /*0170*/ ISUB R30, R39, R35; /* 0xe0880000119c9c7a */ - /*0178*/ IADD R33, R39, R20; /* 0xe08000000a1c9c86 */ - /* 0x08a0acb0a0a0a000 */ - /*0188*/ IADD R31, R39, R19; /* 0xe0800000099c9c7e */ - /*0190*/ S2R R10, SR_TID.X; /* 0x86400000109c002a */ - /*0198*/ LOP.AND R11, R10, 0x1f; /* 0xc20000000f9c282d */ - /*01a0*/ IADD R43, R11, R34; /* 0xe0800000111c2cae */ - /*01a8*/ ISETP.GE.AND P0, PT, R43, c[0x0][0x144], PT; /* 0x5b681c00289cac1e */ - /*01b0*/ @P0 BRA.U 0x6a0; /* 0x120000027400023c */ - /*01b8*/ @!P0 IADD R10, R39, R43; /* 0xe080000015a09c2a */ - /* 0x08a0108c109c80a0 */ - /*01c8*/ @!P0 SHF.L R38, RZ, 0x3, R10; /* 0xb7c0280001a3fc99 */ - /*01d0*/ @!P0 IADD R10, R38, -0x8; /* 
0xc88003fffc209829 */ - /*01d8*/ @!P0 IADD R11, R38, 0x8; /* 0xc08000000420982d */ - /*01e0*/ @!P0 BFE R12, R10, 0x11f; /* 0xc00800008fa02831 */ - /*01e8*/ @!P0 IADD R54.CC, R10, c[0x0][0x170]; /* 0x608400002e2028da */ - /*01f0*/ @!P0 IADD R10, R38, -0x10; /* 0xc88003fff8209829 */ - /*01f8*/ @!P0 BFE R13, R11, 0x11f; /* 0xc00800008fa02c35 */ - /* 0x08808080a0108c10 */ - /*0208*/ @!P0 IADD.X R55, R12, c[0x0][0x174]; /* 0x608040002ea030de */ - /*0210*/ @!P0 IADD R46.CC, R11, c[0x0][0x170]; /* 0x608400002e202cba */ - /*0218*/ @!P0 IADD R11, R38, 0x10; /* 0xc08000000820982d */ - /*0220*/ @!P0 BFE R14, R10, 0x11f; /* 0xc00800008fa02839 */ - /*0228*/ @!P0 IADD.X R47, R13, c[0x0][0x174]; /* 0x608040002ea034be */ - /*0230*/ @!P0 IADD R48.CC, R10, c[0x0][0x170]; /* 0x608400002e2028c2 */ - /*0238*/ @!P0 IADD R10, R22, R43; /* 0xe080000015a0582a */ - /* 0x08ac108080909410 */ - /*0248*/ @!P0 LD.E.64 R12, [R54]; /* 0xc58000000020d830 */ - /*0250*/ @!P0 BFE R15, R11, 0x11f; /* 0xc00800008fa02c3d */ - /*0258*/ @!P0 LD.E.64 R16, [R46]; /* 0xc58000000020b840 */ - /*0260*/ @!P0 IADD.X R49, R14, c[0x0][0x174]; /* 0x608040002ea038c6 */ - /*0268*/ @!P0 IADD R52.CC, R11, c[0x0][0x170]; /* 0x608400002e202cd2 */ - /*0270*/ @!P0 SHF.L R50, RZ, 0x3, R10; /* 0xb7c0280001a3fcc9 */ - /*0278*/ @!P0 IADD R14, R23, R43; /* 0xe080000015a05c3a */ - /* 0x08908c108c108010 */ - /*0288*/ @!P0 IADD.X R53, R15, c[0x0][0x174]; /* 0x608040002ea03cd6 */ - /*0290*/ @!P0 BFE R51, R50, 0x11f; /* 0xc00800008fa0c8cd */ - /*0298*/ @!P0 IADD R50.CC, R50, c[0x0][0x170]; /* 0x608400002e20c8ca */ - /*02a0*/ @!P0 SHF.L R45, RZ, 0x3, R14; /* 0xb7c0380001a3fcb5 */ - /*02a8*/ @!P0 LD.E.64 R10, [R48]; /* 0xc58000000020c028 */ - /*02b0*/ @!P0 DADD R12, R12, R16; /* 0xe380000008203032 */ - /*02b8*/ @!P0 LD.E.64 R14, [R52]; /* 0xc58000000020d038 */ - /* 0x089c8010b0108c10 */ - /*02c8*/ @!P0 IADD.X R51, R51, c[0x0][0x174]; /* 0x608040002ea0ccce */ - /*02d0*/ @!P0 BFE R17, R45, 0x11f; /* 0xc00800008fa0b445 */ - /*02d8*/ @!P0 IADD R16, R24, R43; /* 0xe080000015a06042 */ - /*02e0*/ @!P0 IADD R46.CC, R45, c[0x0][0x170]; /* 0x608400002e20b4ba */ - /*02e8*/ @!P0 SHF.L R45, RZ, 0x3, R16; /* 0xb7c0400001a3fcb5 */ - /*02f0*/ @!P0 IADD.X R47, R17, c[0x0][0x174]; /* 0x608040002ea044be */ - /*02f8*/ @!P0 LD.E.64 R16, [R50]; /* 0xc58000000020c840 */ - /* 0x08848010a8108080 */ - /*0308*/ @!P0 IADD R54.CC, R45, c[0x0][0x170]; /* 0x608400002e20b4da */ - /*0310*/ @!P0 DADD R48, R10, R14; /* 0xe3800000072028c2 */ - /*0318*/ @!P0 BFE R11, R45, 0x11f; /* 0xc00800008fa0b42d */ - /*0320*/ @!P0 IADD R10, R26, R43; /* 0xe080000015a0682a */ - /*0328*/ @!P0 IADD.X R55, R11, c[0x0][0x174]; /* 0x608040002ea02cde */ - /*0330*/ @!P0 SHF.L R45, RZ, 0x3, R10; /* 0xb7c0280001a3fcb5 */ - /*0338*/ @!P0 LD.E.64 R14, [R46]; /* 0xc58000000020b838 */ - /* 0x0890988010801094 */ - /*0348*/ @!P0 DADD R16, R12, R16; /* 0xe380000008203042 */ - /*0350*/ @!P0 IADD R13, R27, R43; /* 0xe080000015a06c36 */ - /*0358*/ @!P0 LD.E.64 R10, [R54]; /* 0xc58000000020d828 */ - /*0360*/ @!P0 BFE R53, R45, 0x11f; /* 0xc00800008fa0b4d5 */ - /*0368*/ @!P0 IADD R52.CC, R45, c[0x0][0x170]; /* 0x608400002e20b4d2 */ - /*0370*/ @!P0 IADD R12, R29, R43; /* 0xe080000015a07432 */ - /*0378*/ @!P0 SHF.L R13, RZ, 0x3, R13; /* 0xb7c0340001a3fc35 */ - /* 0x0894801094108c10 */ - /*0388*/ @!P0 IADD.X R53, R53, c[0x0][0x174]; /* 0x608040002ea0d4d6 */ - /*0390*/ @!P0 SHF.L R45, RZ, 0x3, R12; /* 0xb7c0300001a3fcb5 */ - /*0398*/ @!P0 BFE R46, R13, 0x11f; /* 0xc00800008fa034b9 */ - /*03a0*/ @!P0 IADD R50.CC, R13, c[0x0][0x170]; /* 
0x608400002e2034ca */ - /*03a8*/ @!P0 LD.E.64 R12, [R52]; /* 0xc58000000020d030 */ - /*03b0*/ @!P0 DADD R16, R16, R10; /* 0xe380000005204042 */ - /*03b8*/ @!P0 BFE R10, R45, 0x11f; /* 0xc00800008fa0b429 */ - /* 0x08a0108c109c8010 */ - /*03c8*/ @!P0 IADD.X R51, R46, c[0x0][0x174]; /* 0x608040002ea0b8ce */ - /*03d0*/ @!P0 IADD R54.CC, R45, c[0x0][0x170]; /* 0x608400002e20b4da */ - /*03d8*/ @!P0 IADD R45, R30, R43; /* 0xe080000015a078b6 */ - /*03e0*/ @!P0 LD.E.64 R46, [R50]; /* 0xc58000000020c8b8 */ - /*03e8*/ @!P0 DADD R14, R48, R14; /* 0xe38000000720c03a */ - /*03f0*/ @!P0 IADD.X R55, R10, c[0x0][0x174]; /* 0x608040002ea028de */ - /*03f8*/ @!P0 SHF.L R48, RZ, 0x3, R45; /* 0xb7c0b40001a3fcc1 */ - /* 0x088480a080108010 */ - /*0408*/ @!P0 IADD R45, R32, R43; /* 0xe080000015a080b6 */ - /*0410*/ @!P0 LD.E.64 R10, [R54]; /* 0xc58000000020d828 */ - /*0418*/ @!P0 BFE R49, R48, 0x11f; /* 0xc00800008fa0c0c5 */ - /*0420*/ @!P0 IADD R48.CC, R48, c[0x0][0x170]; /* 0x608400002e20c0c2 */ - /*0428*/ @!P0 DADD R14, R14, R12; /* 0xe38000000620383a */ - /*0430*/ @!P0 DADD R12, R16, R46; /* 0xe380000017204032 */ - /*0438*/ @!P0 SHF.L R46, RZ, 0x3, R45; /* 0xb7c0b40001a3fcb9 */ - /* 0x0880808010b08010 */ - /*0448*/ @!P0 IADD.X R49, R49, c[0x0][0x174]; /* 0x608040002ea0c4c6 */ - /*0450*/ @!P0 BFE R45, R38, 0x11f; /* 0xc00800008fa098b5 */ - /*0458*/ @!P0 IADD R16.CC, R38, c[0x0][0x170]; /* 0x608400002e209842 */ - /*0460*/ @!P0 IADD.X R17, R45, c[0x0][0x174]; /* 0x608040002ea0b446 */ - /*0468*/ @!P0 LD.E.64 R50, [R48]; /* 0xc58000000020c0c8 */ - /*0470*/ @!P0 DADD R14, R14, R10; /* 0xe38000000520383a */ - /*0478*/ @!P0 BFE R10, R46, 0x11f; /* 0xc00800008fa0b829 */ - /* 0x0880bc109c1080b0 */ - /*0488*/ @!P0 IADD R46.CC, R46, c[0x0][0x170]; /* 0x608400002e20b8ba */ - /*0490*/ @!P0 IADD.X R47, R10, c[0x0][0x174]; /* 0x608040002ea028be */ - /*0498*/ @!P0 LD.E.64 R10, [R16]; /* 0xc580000000204028 */ - /*04a0*/ @!P0 IADD R48, R38, -0x18; /* 0xc88003fff42098c1 */ - /*04a8*/ @!P0 LD.E.64 R52, [R46]; /* 0xc58000000020b8d0 */ - /*04b0*/ @!P0 DADD R12, R12, R50; /* 0xe380000019203032 */ - /*04b8*/ @!P0 DMUL R50, R8, R10; /* 0xe4000000052020ca */ - /* 0x08b08010b01080a0 */ - /*04c8*/ @!P0 IADD R46, R38, 0x18; /* 0xc08000000c2098b9 */ - /*04d0*/ @!P0 DFMA R16, R6, R12, R50; /* 0xdb80c80006201842 */ - /*04d8*/ @!P0 BFE R13, R48, 0x11f; /* 0xc00800008fa0c035 */ - /*04e0*/ @!P0 IADD R12.CC, R48, c[0x0][0x170]; /* 0x608400002e20c032 */ - /*04e8*/ @!P0 BFE R47, R46, 0x11f; /* 0xc00800008fa0b8bd */ - /*04f0*/ @!P0 IADD.X R13, R13, c[0x0][0x174]; /* 0x608040002ea03436 */ - /*04f8*/ @!P0 IADD R46.CC, R46, c[0x0][0x170]; /* 0x608400002e20b8ba */ - /* 0x08a0a080dc109c80 */ - /*0508*/ @!P0 IADD.X R47, R47, c[0x0][0x174]; /* 0x608040002ea0bcbe */ - /*0510*/ @!P0 LD.E.64 R48, [R12]; /* 0xc5800000002030c0 */ - /*0518*/ @!P0 LD.E.64 R50, [R46]; /* 0xc58000000020b8c8 */ - /*0520*/ @!P0 DADD R14, R14, R52; /* 0xe38000001a20383a */ - /*0528*/ @!P0 DADD R12, R48, R50; /* 0xe38000001920c032 */ - /*0530*/ @!P0 IADD R48, R25, R43; /* 0xe080000015a064c2 */ - /*0538*/ @!P0 SHF.L R46, RZ, 0x3, R48; /* 0xb7c0c00001a3fcb9 */ - /* 0x08a080dc10a0b010 */ - /*0548*/ @!P0 BFE R47, R46, 0x11f; /* 0xc00800008fa0b8bd */ - /*0550*/ @!P0 IADD R46.CC, R46, c[0x0][0x170]; /* 0x608400002e20b8ba */ - /*0558*/ @!P0 IADD.X R47, R47, c[0x0][0x174]; /* 0x608040002ea0bcbe */ - /*0560*/ @!P0 LD.E.64 R48, [R46]; /* 0xc58000000020b8c0 */ - /*0568*/ @!P0 DADD R10, R10, R10; /* 0xe38000000520282a */ - /*0570*/ @!P0 DADD R12, R12, R48; /* 0xe380000018203032 */ - /*0578*/ @!P0 
IADD R48, R28, R43; /* 0xe080000015a070c2 */ - /* 0x08a080dca0b010a0 */ - /*0588*/ @!P0 SHF.L R46, RZ, 0x3, R48; /* 0xb7c0c00001a3fcb9 */ - /*0590*/ @!P0 BFE R47, R46, 0x11f; /* 0xc00800008fa0b8bd */ - /*0598*/ @!P0 IADD R46.CC, R46, c[0x0][0x170]; /* 0x608400002e20b8ba */ - /*05a0*/ @!P0 IADD.X R47, R47, c[0x0][0x174]; /* 0x608040002ea0bcbe */ - /*05a8*/ @!P0 LD.E.64 R48, [R46]; /* 0xc58000000020b8c0 */ - /*05b0*/ @!P0 DADD R12, R12, R48; /* 0xe380000018203032 */ - /*05b8*/ @!P0 IADD R48, R31, R43; /* 0xe080000015a07cc2 */ - /* 0x0880a010b010a010 */ - /*05c8*/ @!P0 IADD R43, R33, R43; /* 0xe080000015a084ae */ - /*05d0*/ @!P0 SHF.L R46, RZ, 0x3, R48; /* 0xb7c0c00001a3fcb9 */ - /*05d8*/ @!P0 BFE R47, R46, 0x11f; /* 0xc00800008fa0b8bd */ - /*05e0*/ @!P0 IADD R48.CC, R46, c[0x0][0x170]; /* 0x608400002e20b8c2 */ - /*05e8*/ @!P0 IADD.X R49, R47, c[0x0][0x174]; /* 0x608040002ea0bcc6 */ - /*05f0*/ @!P0 SHF.L R43, RZ, 0x3, R43; /* 0xb7c0ac0001a3fcad */ - /*05f8*/ @!P0 LD.E.64 R46, [R48]; /* 0xc58000000020c0b8 */ - /* 0x0880909c80a080d8 */ - /*0608*/ @!P0 IADD R52.CC, R43, c[0x0][0x170]; /* 0x608400002e20acd2 */ - /*0610*/ @!P0 DADD R46, R12, R46; /* 0xe3800000172030ba */ - /*0618*/ @!P0 BFE R12, R43, 0x11f; /* 0xc00800008fa0ac31 */ - /*0620*/ @!P0 IADD.X R53, R12, c[0x0][0x174]; /* 0x608040002ea030d6 */ - /*0628*/ @!P0 IADD R12.CC, R38, c[0x0][0x178]; /* 0x608400002f209832 */ - /*0630*/ @!P0 LD.E.64 R48, [R52]; /* 0xc58000000020d0c0 */ - /*0638*/ @!P0 IADD.X R13, R45, c[0x0][0x17c]; /* 0x608040002fa0b436 */ - /* 0x08cc8c10a48090b0 */ - /*0648*/ @!P0 IADD R50.CC, R38, c[0x0][0x168]; /* 0x608400002d2098ca */ - /*0650*/ @!P0 IADD.X R51, R45, c[0x0][0x16c]; /* 0x608040002da0b4ce */ - /*0658*/ @!P0 DADD R46, R46, R48; /* 0xe38000001820b8ba */ - /*0660*/ @!P0 DFMA R48, R4, R14, R16; /* 0xdb804000072010c2 */ - /*0668*/ @!P0 LD.E.64 R16, [R12]; /* 0xc580000000203040 */ - /*0670*/ @!P0 DFMA R48, R2, R46, R48; /* 0xdb80c000172008c2 */ - /*0678*/ @!P0 LD.E.64 R14, [R50]; /* 0xc58000000020c838 */ - /* 0x08a0b8b0a000a4a4 */ - /*0688*/ @!P0 DADD R10, R10, -R16; /* 0xe38100000820282a */ - /*0690*/ @!P0 DFMA R10, R48, R14, R10; /* 0xdb8028000720c02a */ - /*0698*/ @!P0 ST.E.64 [R12], R10; /* 0xe580000000203028 */ - /*06a0*/ IADD R34, R34, 0x20; /* 0xc0800000101c8889 */ - /*06a8*/ ISETP.LT.AND P0, PT, R34, c[0x0][0x144], PT; /* 0x5b181c00289c881e */ - /*06b0*/ @P0 BRA 0x190; /* 0x12007ffd6c00003c */ - /*06b8*/ IADD R21, R21, 0x1; /* 0xc0800000009c5455 */ - /* 0x08b810b8b010b8b0 */ - /*06c8*/ ISETP.EQ.AND P0, PT, R21, c[0x0][0x14c], PT; /* 0x5b281c00299c541e */ - /*06d0*/ @!P0 BRA 0x110; /* 0x12007ffd1c20003c */ - /*06d8*/ ISETP.NE.AND P0, PT, R44, R41, PT; /* 0xdb581c00149cb01e */ - /*06e0*/ IADD R44, R44, 0x1; /* 0xc0800000009cb0b1 */ - /*06e8*/ @P0 BRA 0xd0; /* 0x12007ffcf000003c */ - /*06f0*/ MOV RZ, RZ; /* 0xe4c03c007f9c03fe */ - /*06f8*/ EXIT ; /* 0x18000000001c003c */ - /*0700*/ BRA 0x700; /* 0x12007ffffc1c003c */ - /*0708*/ NOP; /* 0x85800000001c3c02 */ - /*0710*/ NOP; /* 0x85800000001c3c02 */ - /*0718*/ NOP; /* 0x85800000001c3c02 */ - /*0720*/ NOP; /* 0x85800000001c3c02 */ - /*0728*/ NOP; /* 0x85800000001c3c02 */ - /*0730*/ NOP; /* 0x85800000001c3c02 */ - /*0738*/ NOP; /* 0x85800000001c3c02 */ - .................................. 
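2.s differs from 1.s mainly in scheduling and address generation: the neighbour offsets are hoisted into the IADD/ISUB block at 0x120-0x188 before the x loop is entered, and the lane index is recovered explicitly with LOP.AND R11, R10, 0x1f, i.e. tid.x & 31 — the nvptx convention that runs one 32-wide ISPC gang per warp, with the x loop striding by 0x20. (The next deleted dump, 3.s, is byte-identical to 2.s; both carry blob id 76476d03.) A small sketch of that mapping, assuming a gang size of 32; the helper names are illustrative:

    // Gang-to-warp mapping implied by LOP.AND R11, R10, 0x1f above.
    __device__ __forceinline__ int programIndex32() { return threadIdx.x & 31; }

    // An ISPC `foreach (i = x0 ... x1)` then lowers, per warp, to roughly:
    __device__ void foreach_x(int x0, int x1) {
        for (int i = x0 + programIndex32(); i < x1; i += 32) {  // IADD R34, R34, 0x20
            // loop body processes element i
        }
    }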
- - diff --git a/examples_cuda/stencil/3.s b/examples_cuda/stencil/3.s deleted file mode 100644 index 76476d03..00000000 --- a/examples_cuda/stencil/3.s +++ /dev/null @@ -1,239 +0,0 @@ - - code for sm_35 - Function : stencil_step_task - .headerflags @"EF_CUDA_SM35 EF_CUDA_PTX_SM(EF_CUDA_SM35)" - /* 0x0880acb0a0a0a000 */ - /*0008*/ MOV R1, c[0x0][0x44]; /* 0x64c03c00089c0006 */ - /*0010*/ S2R R10, SR_CTAID.X; /* 0x86400000129c002a */ - /*0018*/ IADD R44, R10, c[0x0][0x150]; /* 0x608000002a1c28b2 */ - /*0020*/ IADD R0, R44, 0x1; /* 0xc0800000009cb001 */ - /*0028*/ ISETP.GE.AND P0, PT, R44, R0, PT; /* 0xdb681c00001cb01e */ - /*0030*/ @P0 EXIT ; /* 0x180000000000003c */ - /*0038*/ MOV R11, c[0x0][0x154]; /* 0x64c03c002a9c002e */ - /* 0x0888108010a01080 */ - /*0048*/ IADD R41, R10, c[0x0][0x150]; /* 0x608000002a1c28a6 */ - /*0050*/ MOV R12, c[0x0][0x160]; /* 0x64c03c002c1c0032 */ - /*0058*/ MOV R13, c[0x0][0x164]; /* 0x64c03c002c9c0036 */ - /*0060*/ IMUL R35, R11, c[0x0][0x158]; /* 0x61c018002b1c2c8e */ - /*0068*/ LD.E.64 R8, [R12]; /* 0xc5800000001c3020 */ - /*0070*/ SHF.L R36, RZ, 0x1, R11; /* 0xb7c02c00009ffc91 */ - /*0078*/ MOV R42, c[0x0][0x148]; /* 0x64c03c00291c00aa */ - /* 0x088c80108c108c10 */ - /*0088*/ LD.E.64 R6, [R12+0x8]; /* 0xc5800000041c3018 */ - /*0090*/ IMUL R0, R11, 0x3; /* 0xc1c01800019c2c01 */ - /*0098*/ LD.E.64 R4, [R12+0x10]; /* 0xc5800000081c3010 */ - /*00a0*/ IMUL R18, R11, -0x3; /* 0xc9c01bfffe9c2c49 */ - /*00a8*/ SHF.L R37, RZ, 0x1, R35; /* 0xb7c08c00009ffc95 */ - /*00b0*/ LD.E.64 R2, [R12+0x18]; /* 0xc58000000c1c3008 */ - /*00b8*/ IMUL R19, R35, 0x3; /* 0xc1c01800019c8c4d */ - /* 0x0880acb0a0acb000 */ - /*00c8*/ IMUL R20, R35, -0x3; /* 0xc9c01bfffe9c8c51 */ - /*00d0*/ ISETP.GE.AND P0, PT, R42, c[0x0][0x14c], PT; /* 0x5b681c00299ca81e */ - /*00d8*/ @P0 BRA 0x6d8; /* 0x12000002fc00003c */ - /*00e0*/ MOV R10, c[0x0][0x140]; /* 0x64c03c00281c002a */ - /*00e8*/ ISETP.LT.AND P0, PT, R10, c[0x0][0x144], PT; /* 0x5b181c00289c281e */ - /*00f0*/ @!P0 BRA 0x6d8; /* 0x12000002f020003c */ - /*00f8*/ IMUL R40, R44, R35; /* 0xe1c01800119cb0a2 */ - /* 0x088880108c10a000 */ - /*0108*/ MOV R21, c[0x0][0x148]; /* 0x64c03c00291c0056 */ - /*0110*/ IMAD R39, R21, c[0x0][0x154], R40; /* 0x5108a0002a9c549e */ - /*0118*/ MOV R34, c[0x0][0x140]; /* 0x64c03c00281c008a */ - /*0120*/ IADD R29, R39, R37; /* 0xe0800000129c9c76 */ - /*0128*/ IADD R22, R39, c[0x0][0x154]; /* 0x608000002a9c9c5a */ - /*0130*/ ISUB R32, R39, R37; /* 0xe0880000129c9c82 */ - /*0138*/ IADD R23, R39, R36; /* 0xe0800000121c9c5e */ - /* 0x0880808080108c10 */ - /*0148*/ ISUB R24, R39, c[0x0][0x154]; /* 0x608800002a9c9c62 */ - /*0150*/ IADD R25, R39, R0; /* 0xe0800000001c9c66 */ - /*0158*/ ISUB R26, R39, R36; /* 0xe0880000121c9c6a */ - /*0160*/ IADD R27, R39, R35; /* 0xe0800000119c9c6e */ - /*0168*/ IADD R28, R39, R18; /* 0xe0800000091c9c72 */ - /*0170*/ ISUB R30, R39, R35; /* 0xe0880000119c9c7a */ - /*0178*/ IADD R33, R39, R20; /* 0xe08000000a1c9c86 */ - /* 0x08a0acb0a0a0a000 */ - /*0188*/ IADD R31, R39, R19; /* 0xe0800000099c9c7e */ - /*0190*/ S2R R10, SR_TID.X; /* 0x86400000109c002a */ - /*0198*/ LOP.AND R11, R10, 0x1f; /* 0xc20000000f9c282d */ - /*01a0*/ IADD R43, R11, R34; /* 0xe0800000111c2cae */ - /*01a8*/ ISETP.GE.AND P0, PT, R43, c[0x0][0x144], PT; /* 0x5b681c00289cac1e */ - /*01b0*/ @P0 BRA.U 0x6a0; /* 0x120000027400023c */ - /*01b8*/ @!P0 IADD R10, R39, R43; /* 0xe080000015a09c2a */ - /* 0x08a0108c109c80a0 */ - /*01c8*/ @!P0 SHF.L R38, RZ, 0x3, R10; /* 0xb7c0280001a3fc99 */ - /*01d0*/ @!P0 IADD R10, R38, -0x8; /* 
0xc88003fffc209829 */ - /*01d8*/ @!P0 IADD R11, R38, 0x8; /* 0xc08000000420982d */ - /*01e0*/ @!P0 BFE R12, R10, 0x11f; /* 0xc00800008fa02831 */ - /*01e8*/ @!P0 IADD R54.CC, R10, c[0x0][0x170]; /* 0x608400002e2028da */ - /*01f0*/ @!P0 IADD R10, R38, -0x10; /* 0xc88003fff8209829 */ - /*01f8*/ @!P0 BFE R13, R11, 0x11f; /* 0xc00800008fa02c35 */ - /* 0x08808080a0108c10 */ - /*0208*/ @!P0 IADD.X R55, R12, c[0x0][0x174]; /* 0x608040002ea030de */ - /*0210*/ @!P0 IADD R46.CC, R11, c[0x0][0x170]; /* 0x608400002e202cba */ - /*0218*/ @!P0 IADD R11, R38, 0x10; /* 0xc08000000820982d */ - /*0220*/ @!P0 BFE R14, R10, 0x11f; /* 0xc00800008fa02839 */ - /*0228*/ @!P0 IADD.X R47, R13, c[0x0][0x174]; /* 0x608040002ea034be */ - /*0230*/ @!P0 IADD R48.CC, R10, c[0x0][0x170]; /* 0x608400002e2028c2 */ - /*0238*/ @!P0 IADD R10, R22, R43; /* 0xe080000015a0582a */ - /* 0x08ac108080909410 */ - /*0248*/ @!P0 LD.E.64 R12, [R54]; /* 0xc58000000020d830 */ - /*0250*/ @!P0 BFE R15, R11, 0x11f; /* 0xc00800008fa02c3d */ - /*0258*/ @!P0 LD.E.64 R16, [R46]; /* 0xc58000000020b840 */ - /*0260*/ @!P0 IADD.X R49, R14, c[0x0][0x174]; /* 0x608040002ea038c6 */ - /*0268*/ @!P0 IADD R52.CC, R11, c[0x0][0x170]; /* 0x608400002e202cd2 */ - /*0270*/ @!P0 SHF.L R50, RZ, 0x3, R10; /* 0xb7c0280001a3fcc9 */ - /*0278*/ @!P0 IADD R14, R23, R43; /* 0xe080000015a05c3a */ - /* 0x08908c108c108010 */ - /*0288*/ @!P0 IADD.X R53, R15, c[0x0][0x174]; /* 0x608040002ea03cd6 */ - /*0290*/ @!P0 BFE R51, R50, 0x11f; /* 0xc00800008fa0c8cd */ - /*0298*/ @!P0 IADD R50.CC, R50, c[0x0][0x170]; /* 0x608400002e20c8ca */ - /*02a0*/ @!P0 SHF.L R45, RZ, 0x3, R14; /* 0xb7c0380001a3fcb5 */ - /*02a8*/ @!P0 LD.E.64 R10, [R48]; /* 0xc58000000020c028 */ - /*02b0*/ @!P0 DADD R12, R12, R16; /* 0xe380000008203032 */ - /*02b8*/ @!P0 LD.E.64 R14, [R52]; /* 0xc58000000020d038 */ - /* 0x089c8010b0108c10 */ - /*02c8*/ @!P0 IADD.X R51, R51, c[0x0][0x174]; /* 0x608040002ea0ccce */ - /*02d0*/ @!P0 BFE R17, R45, 0x11f; /* 0xc00800008fa0b445 */ - /*02d8*/ @!P0 IADD R16, R24, R43; /* 0xe080000015a06042 */ - /*02e0*/ @!P0 IADD R46.CC, R45, c[0x0][0x170]; /* 0x608400002e20b4ba */ - /*02e8*/ @!P0 SHF.L R45, RZ, 0x3, R16; /* 0xb7c0400001a3fcb5 */ - /*02f0*/ @!P0 IADD.X R47, R17, c[0x0][0x174]; /* 0x608040002ea044be */ - /*02f8*/ @!P0 LD.E.64 R16, [R50]; /* 0xc58000000020c840 */ - /* 0x08848010a8108080 */ - /*0308*/ @!P0 IADD R54.CC, R45, c[0x0][0x170]; /* 0x608400002e20b4da */ - /*0310*/ @!P0 DADD R48, R10, R14; /* 0xe3800000072028c2 */ - /*0318*/ @!P0 BFE R11, R45, 0x11f; /* 0xc00800008fa0b42d */ - /*0320*/ @!P0 IADD R10, R26, R43; /* 0xe080000015a0682a */ - /*0328*/ @!P0 IADD.X R55, R11, c[0x0][0x174]; /* 0x608040002ea02cde */ - /*0330*/ @!P0 SHF.L R45, RZ, 0x3, R10; /* 0xb7c0280001a3fcb5 */ - /*0338*/ @!P0 LD.E.64 R14, [R46]; /* 0xc58000000020b838 */ - /* 0x0890988010801094 */ - /*0348*/ @!P0 DADD R16, R12, R16; /* 0xe380000008203042 */ - /*0350*/ @!P0 IADD R13, R27, R43; /* 0xe080000015a06c36 */ - /*0358*/ @!P0 LD.E.64 R10, [R54]; /* 0xc58000000020d828 */ - /*0360*/ @!P0 BFE R53, R45, 0x11f; /* 0xc00800008fa0b4d5 */ - /*0368*/ @!P0 IADD R52.CC, R45, c[0x0][0x170]; /* 0x608400002e20b4d2 */ - /*0370*/ @!P0 IADD R12, R29, R43; /* 0xe080000015a07432 */ - /*0378*/ @!P0 SHF.L R13, RZ, 0x3, R13; /* 0xb7c0340001a3fc35 */ - /* 0x0894801094108c10 */ - /*0388*/ @!P0 IADD.X R53, R53, c[0x0][0x174]; /* 0x608040002ea0d4d6 */ - /*0390*/ @!P0 SHF.L R45, RZ, 0x3, R12; /* 0xb7c0300001a3fcb5 */ - /*0398*/ @!P0 BFE R46, R13, 0x11f; /* 0xc00800008fa034b9 */ - /*03a0*/ @!P0 IADD R50.CC, R13, c[0x0][0x170]; /* 
0x608400002e2034ca */ - /*03a8*/ @!P0 LD.E.64 R12, [R52]; /* 0xc58000000020d030 */ - /*03b0*/ @!P0 DADD R16, R16, R10; /* 0xe380000005204042 */ - /*03b8*/ @!P0 BFE R10, R45, 0x11f; /* 0xc00800008fa0b429 */ - /* 0x08a0108c109c8010 */ - /*03c8*/ @!P0 IADD.X R51, R46, c[0x0][0x174]; /* 0x608040002ea0b8ce */ - /*03d0*/ @!P0 IADD R54.CC, R45, c[0x0][0x170]; /* 0x608400002e20b4da */ - /*03d8*/ @!P0 IADD R45, R30, R43; /* 0xe080000015a078b6 */ - /*03e0*/ @!P0 LD.E.64 R46, [R50]; /* 0xc58000000020c8b8 */ - /*03e8*/ @!P0 DADD R14, R48, R14; /* 0xe38000000720c03a */ - /*03f0*/ @!P0 IADD.X R55, R10, c[0x0][0x174]; /* 0x608040002ea028de */ - /*03f8*/ @!P0 SHF.L R48, RZ, 0x3, R45; /* 0xb7c0b40001a3fcc1 */ - /* 0x088480a080108010 */ - /*0408*/ @!P0 IADD R45, R32, R43; /* 0xe080000015a080b6 */ - /*0410*/ @!P0 LD.E.64 R10, [R54]; /* 0xc58000000020d828 */ - /*0418*/ @!P0 BFE R49, R48, 0x11f; /* 0xc00800008fa0c0c5 */ - /*0420*/ @!P0 IADD R48.CC, R48, c[0x0][0x170]; /* 0x608400002e20c0c2 */ - /*0428*/ @!P0 DADD R14, R14, R12; /* 0xe38000000620383a */ - /*0430*/ @!P0 DADD R12, R16, R46; /* 0xe380000017204032 */ - /*0438*/ @!P0 SHF.L R46, RZ, 0x3, R45; /* 0xb7c0b40001a3fcb9 */ - /* 0x0880808010b08010 */ - /*0448*/ @!P0 IADD.X R49, R49, c[0x0][0x174]; /* 0x608040002ea0c4c6 */ - /*0450*/ @!P0 BFE R45, R38, 0x11f; /* 0xc00800008fa098b5 */ - /*0458*/ @!P0 IADD R16.CC, R38, c[0x0][0x170]; /* 0x608400002e209842 */ - /*0460*/ @!P0 IADD.X R17, R45, c[0x0][0x174]; /* 0x608040002ea0b446 */ - /*0468*/ @!P0 LD.E.64 R50, [R48]; /* 0xc58000000020c0c8 */ - /*0470*/ @!P0 DADD R14, R14, R10; /* 0xe38000000520383a */ - /*0478*/ @!P0 BFE R10, R46, 0x11f; /* 0xc00800008fa0b829 */ - /* 0x0880bc109c1080b0 */ - /*0488*/ @!P0 IADD R46.CC, R46, c[0x0][0x170]; /* 0x608400002e20b8ba */ - /*0490*/ @!P0 IADD.X R47, R10, c[0x0][0x174]; /* 0x608040002ea028be */ - /*0498*/ @!P0 LD.E.64 R10, [R16]; /* 0xc580000000204028 */ - /*04a0*/ @!P0 IADD R48, R38, -0x18; /* 0xc88003fff42098c1 */ - /*04a8*/ @!P0 LD.E.64 R52, [R46]; /* 0xc58000000020b8d0 */ - /*04b0*/ @!P0 DADD R12, R12, R50; /* 0xe380000019203032 */ - /*04b8*/ @!P0 DMUL R50, R8, R10; /* 0xe4000000052020ca */ - /* 0x08b08010b01080a0 */ - /*04c8*/ @!P0 IADD R46, R38, 0x18; /* 0xc08000000c2098b9 */ - /*04d0*/ @!P0 DFMA R16, R6, R12, R50; /* 0xdb80c80006201842 */ - /*04d8*/ @!P0 BFE R13, R48, 0x11f; /* 0xc00800008fa0c035 */ - /*04e0*/ @!P0 IADD R12.CC, R48, c[0x0][0x170]; /* 0x608400002e20c032 */ - /*04e8*/ @!P0 BFE R47, R46, 0x11f; /* 0xc00800008fa0b8bd */ - /*04f0*/ @!P0 IADD.X R13, R13, c[0x0][0x174]; /* 0x608040002ea03436 */ - /*04f8*/ @!P0 IADD R46.CC, R46, c[0x0][0x170]; /* 0x608400002e20b8ba */ - /* 0x08a0a080dc109c80 */ - /*0508*/ @!P0 IADD.X R47, R47, c[0x0][0x174]; /* 0x608040002ea0bcbe */ - /*0510*/ @!P0 LD.E.64 R48, [R12]; /* 0xc5800000002030c0 */ - /*0518*/ @!P0 LD.E.64 R50, [R46]; /* 0xc58000000020b8c8 */ - /*0520*/ @!P0 DADD R14, R14, R52; /* 0xe38000001a20383a */ - /*0528*/ @!P0 DADD R12, R48, R50; /* 0xe38000001920c032 */ - /*0530*/ @!P0 IADD R48, R25, R43; /* 0xe080000015a064c2 */ - /*0538*/ @!P0 SHF.L R46, RZ, 0x3, R48; /* 0xb7c0c00001a3fcb9 */ - /* 0x08a080dc10a0b010 */ - /*0548*/ @!P0 BFE R47, R46, 0x11f; /* 0xc00800008fa0b8bd */ - /*0550*/ @!P0 IADD R46.CC, R46, c[0x0][0x170]; /* 0x608400002e20b8ba */ - /*0558*/ @!P0 IADD.X R47, R47, c[0x0][0x174]; /* 0x608040002ea0bcbe */ - /*0560*/ @!P0 LD.E.64 R48, [R46]; /* 0xc58000000020b8c0 */ - /*0568*/ @!P0 DADD R10, R10, R10; /* 0xe38000000520282a */ - /*0570*/ @!P0 DADD R12, R12, R48; /* 0xe380000018203032 */ - /*0578*/ @!P0 
IADD R48, R28, R43; /* 0xe080000015a070c2 */ - /* 0x08a080dca0b010a0 */ - /*0588*/ @!P0 SHF.L R46, RZ, 0x3, R48; /* 0xb7c0c00001a3fcb9 */ - /*0590*/ @!P0 BFE R47, R46, 0x11f; /* 0xc00800008fa0b8bd */ - /*0598*/ @!P0 IADD R46.CC, R46, c[0x0][0x170]; /* 0x608400002e20b8ba */ - /*05a0*/ @!P0 IADD.X R47, R47, c[0x0][0x174]; /* 0x608040002ea0bcbe */ - /*05a8*/ @!P0 LD.E.64 R48, [R46]; /* 0xc58000000020b8c0 */ - /*05b0*/ @!P0 DADD R12, R12, R48; /* 0xe380000018203032 */ - /*05b8*/ @!P0 IADD R48, R31, R43; /* 0xe080000015a07cc2 */ - /* 0x0880a010b010a010 */ - /*05c8*/ @!P0 IADD R43, R33, R43; /* 0xe080000015a084ae */ - /*05d0*/ @!P0 SHF.L R46, RZ, 0x3, R48; /* 0xb7c0c00001a3fcb9 */ - /*05d8*/ @!P0 BFE R47, R46, 0x11f; /* 0xc00800008fa0b8bd */ - /*05e0*/ @!P0 IADD R48.CC, R46, c[0x0][0x170]; /* 0x608400002e20b8c2 */ - /*05e8*/ @!P0 IADD.X R49, R47, c[0x0][0x174]; /* 0x608040002ea0bcc6 */ - /*05f0*/ @!P0 SHF.L R43, RZ, 0x3, R43; /* 0xb7c0ac0001a3fcad */ - /*05f8*/ @!P0 LD.E.64 R46, [R48]; /* 0xc58000000020c0b8 */ - /* 0x0880909c80a080d8 */ - /*0608*/ @!P0 IADD R52.CC, R43, c[0x0][0x170]; /* 0x608400002e20acd2 */ - /*0610*/ @!P0 DADD R46, R12, R46; /* 0xe3800000172030ba */ - /*0618*/ @!P0 BFE R12, R43, 0x11f; /* 0xc00800008fa0ac31 */ - /*0620*/ @!P0 IADD.X R53, R12, c[0x0][0x174]; /* 0x608040002ea030d6 */ - /*0628*/ @!P0 IADD R12.CC, R38, c[0x0][0x178]; /* 0x608400002f209832 */ - /*0630*/ @!P0 LD.E.64 R48, [R52]; /* 0xc58000000020d0c0 */ - /*0638*/ @!P0 IADD.X R13, R45, c[0x0][0x17c]; /* 0x608040002fa0b436 */ - /* 0x08cc8c10a48090b0 */ - /*0648*/ @!P0 IADD R50.CC, R38, c[0x0][0x168]; /* 0x608400002d2098ca */ - /*0650*/ @!P0 IADD.X R51, R45, c[0x0][0x16c]; /* 0x608040002da0b4ce */ - /*0658*/ @!P0 DADD R46, R46, R48; /* 0xe38000001820b8ba */ - /*0660*/ @!P0 DFMA R48, R4, R14, R16; /* 0xdb804000072010c2 */ - /*0668*/ @!P0 LD.E.64 R16, [R12]; /* 0xc580000000203040 */ - /*0670*/ @!P0 DFMA R48, R2, R46, R48; /* 0xdb80c000172008c2 */ - /*0678*/ @!P0 LD.E.64 R14, [R50]; /* 0xc58000000020c838 */ - /* 0x08a0b8b0a000a4a4 */ - /*0688*/ @!P0 DADD R10, R10, -R16; /* 0xe38100000820282a */ - /*0690*/ @!P0 DFMA R10, R48, R14, R10; /* 0xdb8028000720c02a */ - /*0698*/ @!P0 ST.E.64 [R12], R10; /* 0xe580000000203028 */ - /*06a0*/ IADD R34, R34, 0x20; /* 0xc0800000101c8889 */ - /*06a8*/ ISETP.LT.AND P0, PT, R34, c[0x0][0x144], PT; /* 0x5b181c00289c881e */ - /*06b0*/ @P0 BRA 0x190; /* 0x12007ffd6c00003c */ - /*06b8*/ IADD R21, R21, 0x1; /* 0xc0800000009c5455 */ - /* 0x08b810b8b010b8b0 */ - /*06c8*/ ISETP.EQ.AND P0, PT, R21, c[0x0][0x14c], PT; /* 0x5b281c00299c541e */ - /*06d0*/ @!P0 BRA 0x110; /* 0x12007ffd1c20003c */ - /*06d8*/ ISETP.NE.AND P0, PT, R44, R41, PT; /* 0xdb581c00149cb01e */ - /*06e0*/ IADD R44, R44, 0x1; /* 0xc0800000009cb0b1 */ - /*06e8*/ @P0 BRA 0xd0; /* 0x12007ffcf000003c */ - /*06f0*/ MOV RZ, RZ; /* 0xe4c03c007f9c03fe */ - /*06f8*/ EXIT ; /* 0x18000000001c003c */ - /*0700*/ BRA 0x700; /* 0x12007ffffc1c003c */ - /*0708*/ NOP; /* 0x85800000001c3c02 */ - /*0710*/ NOP; /* 0x85800000001c3c02 */ - /*0718*/ NOP; /* 0x85800000001c3c02 */ - /*0720*/ NOP; /* 0x85800000001c3c02 */ - /*0728*/ NOP; /* 0x85800000001c3c02 */ - /*0730*/ NOP; /* 0x85800000001c3c02 */ - /*0738*/ NOP; /* 0x85800000001c3c02 */ - .................................. 
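The Makefile_gpu added below builds the nvptx path in stages: ispc emits LLVM bitcode for --target=nvptx64, llvm-dis from an LLVM 3.2 install turns it into .ll text, a separate ptxgen tool (run with -opt=3 -ftz=1 -prec-div=0 -prec-sqrt=0 -fma=1) compiles that to PTX, and the per-source .ptx files are concatenated into kernel.ptx, which the link step copies to __kernels.ptx next to the binary. The host links only against -lcuda, i.e. the CUDA driver API, so the PTX must be JIT-loaded at run time. A minimal sketch of such loading, with error handling abbreviated; the mangled entry name is elided here, and PTX that performs device-side launches would additionally need JIT-linking against cudadevrt, which this sketch omits:

    #include <cuda.h>
    #include <cstdio>

    int main() {
        cuInit(0);
        CUdevice dev;  cuDeviceGet(&dev, 0);
        CUcontext ctx; cuCtxCreate(&ctx, 0, dev);

        CUmodule mod;                    // JIT-compiles the PTX on load
        if (cuModuleLoad(&mod, "__kernels.ptx") != CUDA_SUCCESS) {
            fprintf(stderr, "cannot load __kernels.ptx\n");
            return 1;
        }
        CUfunction fn;                   // ISPC exports mangled entry names
        if (cuModuleGetFunction(&fn, mod, "stencil_step_task___UM_...") != CUDA_SUCCESS)
            return 1;

        void *args[] = { /* kernel parameters, in declaration order */ nullptr };
        cuLaunchKernel(fn, 1, 1, 1, 128, 1, 1, 0, 0, args, 0);
        return cuCtxSynchronize() != CUDA_SUCCESS;
    }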
-
-
diff --git a/examples_cuda/stencil/Makefile_gpu b/examples_cuda/stencil/Makefile_gpu
new file mode 100644
index 00000000..ac1f3b25
--- /dev/null
+++ b/examples_cuda/stencil/Makefile_gpu
@@ -0,0 +1,55 @@
+PROG=stencil_cu
+ISPC_SRC=stencil.ispc
+CXX_SRC=stencil_cu.cpp stencil_serial.cpp
+
+CXX=g++
+CXXFLAGS=-O3 -I$(CUDATK)/include
+LD=g++
+LDFLAGS=-lcuda
+
+ISPC=ispc
+ISPCFLAGS=-O3 --math-lib=default --target=nvptx64 --opt=fast-math
+
+LLVM32 = $(HOME)/usr/local/llvm/bin-3.2
+LLVM = $(HOME)/usr/local/llvm/bin-3.3
+PTXGEN = $(HOME)/ptxgen
+PTXGEN += -opt=3
+PTXGEN += -ftz=1 -prec-div=0 -prec-sqrt=0 -fma=1
+
+LLVM32DIS=$(LLVM32)/bin/llvm-dis
+
+.SUFFIXES: .bc .o .ptx .cu _ispc_nvptx64.bc
+
+
+ISPC_OBJ=$(ISPC_SRC:%.ispc=%_ispc.o)
+ISPC_BC=$(ISPC_SRC:%.ispc=%_ispc_nvptx64.bc)
+PTXSRC=$(ISPC_SRC:%.ispc=%_ispc_nvptx64.ptx)
+CXX_OBJ=$(CXX_SRC:%.cpp=%.o)
+
+all: $(PROG)
+
+
+$(CXX_OBJ) : kernel.ptx
+$(PROG): $(CXX_OBJ) kernel.ptx
+	/bin/cp kernel.ptx __kernels.ptx
+	$(LD) -o $@ $(CXX_OBJ) $(LDFLAGS)
+
+%.o: %.cpp
+	$(CXX) $(CXXFLAGS) -o $@ -c $<
+
+
+%_ispc_nvptx64.bc: %.ispc
+	$(ISPC) $(ISPCFLAGS) --emit-llvm -o `basename $< .ispc`_ispc_nvptx64.bc -h `basename $< .ispc`_ispc.h $<
+
+%.ptx: %.bc
+	$(LLVM32DIS) $<
+	$(PTXGEN) `basename $< .bc`.ll > $@
+
+kernel.ptx: $(PTXSRC)
+	cat $^ > kernel.ptx
+
+clean:
+	/bin/rm -rf *.ptx *.bc *.ll $(PROG)
+
+
+
diff --git a/examples_cuda/stencil/__kernels.ptx b/examples_cuda/stencil/__kernels.ptx
new file mode 100644
index 00000000..b0339cbf
--- /dev/null
+++ b/examples_cuda/stencil/__kernels.ptx
@@ -0,0 +1,1246 @@
+//
+// Generated by NVIDIA NVVM Compiler
+// Compiler built on Thu Jul 18 02:37:37 2013 (1374107857)
+// Cuda compilation tools, release 5.5, V5.5.0
+//
+
+.version 3.2
+.target sm_35
+.address_size 64
+
+
+.extern .func (.param .b32 func_retval0) cudaLaunchDevice
+(
+	.param .b64 cudaLaunchDevice_param_0,
+	.param .b64 cudaLaunchDevice_param_1,
+	.param .align 4 .b8 cudaLaunchDevice_param_2[12],
+	.param .align 4 .b8 cudaLaunchDevice_param_3[12],
+	.param .b32 cudaLaunchDevice_param_4,
+	.param .b64 cudaLaunchDevice_param_5
+);
+
+
+.extern .func (.param .b64 func_retval0) cudaGetParameterBuffer
+(
+	.param .b64 cudaGetParameterBuffer_param_0,
+	.param .b64 cudaGetParameterBuffer_param_1
+)
+;
+.extern .func (.param .b32 func_retval0) cudaDeviceSynchronize
+(
+
+)
+;
+.global .align 1 .b8 constDeltaForeach1[32];
+.global .align 1 .b8 constDeltaForeach4[32] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
+
+.visible .func (.param .b32 func_retval0) __shfl_i32(
+	.param .b32 __shfl_i32_param_0,
+	.param .b32 __shfl_i32_param_1
+)
+{
+	.reg .s32 %r<4>;
+
+
+	ld.param.u32 %r2, [__shfl_i32_param_0];
+	ld.param.u32 %r3, [__shfl_i32_param_1];
+	// inline asm
+	shfl.idx.b32 %r1, %r2, %r3, 0x1f;
+	// inline asm
+	st.param.b32 [func_retval0+0], %r1;
+	ret;
+}
+
+.visible .func (.param .b32 func_retval0) __shfl_xor_float(
+	.param .b32 __shfl_xor_float_param_0,
+	.param .b32 __shfl_xor_float_param_1
+)
+{
+	.reg .s32 %r<2>;
+	.reg .f32 %f<3>;
+
+
+	ld.param.f32 %f2, [__shfl_xor_float_param_0];
+	ld.param.u32 %r1, [__shfl_xor_float_param_1];
+	// inline asm
+	shfl.bfly.b32 %f1, %f2, %r1, 0x1f;
+	// inline asm
+	st.param.f32 [func_retval0+0], %f1;
+	ret;
+}
+
+.visible .func (.param .b32 func_retval0) __shfl_xor_i32(
+	.param .b32 __shfl_xor_i32_param_0,
+	.param .b32 __shfl_xor_i32_param_1
+)
+{
+	.reg .s32 %r<4>;
+
+
+	ld.param.u32 %r2, [__shfl_xor_i32_param_0];
+
ld.param.u32 %r3, [__shfl_xor_i32_param_1]; + // inline asm + shfl.bfly.b32 %r1, %r2, %r3, 0x1f; + // inline asm + st.param.b32 [func_retval0+0], %r1; + ret; +} + +.visible .func (.param .b32 func_retval0) __fminf( + .param .b32 __fminf_param_0, + .param .b32 __fminf_param_1 +) +{ + .reg .f32 %f<4>; + + + ld.param.f32 %f2, [__fminf_param_0]; + ld.param.f32 %f3, [__fminf_param_1]; + // inline asm + min.f32 %f1, %f2, %f3; + // inline asm + st.param.f32 [func_retval0+0], %f1; + ret; +} + +.visible .func (.param .b32 func_retval0) __fmaxf( + .param .b32 __fmaxf_param_0, + .param .b32 __fmaxf_param_1 +) +{ + .reg .f32 %f<4>; + + + ld.param.f32 %f2, [__fmaxf_param_0]; + ld.param.f32 %f3, [__fmaxf_param_1]; + // inline asm + max.f32 %f1, %f2, %f3; + // inline asm + st.param.f32 [func_retval0+0], %f1; + ret; +} + +.visible .func (.param .b32 func_retval0) __ballot( + .param .b32 __ballot_param_0 +) +{ + .reg .s32 %r<3>; + + + ld.param.u8 %r2, [__ballot_param_0]; + // inline asm + { .reg .pred %p1; + setp.ne.u32 %p1, %r2, 0; + vote.ballot.b32 %r1, %p1; + } + // inline asm + st.param.b32 [func_retval0+0], %r1; + ret; +} + +.visible .func (.param .b32 func_retval0) __lanemask_lt( + +) +{ + .reg .s32 %r<2>; + + + // inline asm + mov.u32 %r1, %lanemask_lt; + // inline asm + st.param.b32 [func_retval0+0], %r1; + ret; +} + +.visible .func (.param .b64 func_retval0) ISPCAlloc( + .param .b64 ISPCAlloc_param_0, + .param .b64 ISPCAlloc_param_1, + .param .b32 ISPCAlloc_param_2 +) +{ + .reg .s64 %rd<2>; + + + mov.u64 %rd1, 1; + st.param.b64 [func_retval0+0], %rd1; + ret; +} + +.visible .func (.param .b64 func_retval0) ISPCGetParamBuffer( + .param .b64 ISPCGetParamBuffer_param_0, + .param .b64 ISPCGetParamBuffer_param_1, + .param .b64 ISPCGetParamBuffer_param_2 +) +{ + .reg .pred %p<2>; + .reg .s32 %r<3>; + .reg .s64 %rd<7>; + + + ld.param.u64 %rd3, [ISPCGetParamBuffer_param_1]; + ld.param.u64 %rd4, [ISPCGetParamBuffer_param_2]; + mov.u32 %r1, %tid.x; + and.b32 %r2, %r1, 31; + setp.ne.s32 %p1, %r2, 0; + mov.u64 %rd6, 0; + @%p1 bra BB8_2; + + // Callseq Start 0 + { + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd3; + .param .b64 param1; + st.param.b64 [param1+0], %rd4; + .param .b64 retval0; + call.uni (retval0), + cudaGetParameterBuffer, + ( + param0, + param1 + ); + ld.param.b64 %rd6, [retval0+0]; + } + // Callseq End 0 + +BB8_2: + st.param.b64 [func_retval0+0], %rd6; + ret; +} + +.visible .func ISPCLaunch( + .param .b64 ISPCLaunch_param_0, + .param .b64 ISPCLaunch_param_1, + .param .b64 ISPCLaunch_param_2, + .param .b32 ISPCLaunch_param_3, + .param .b32 ISPCLaunch_param_4, + .param .b32 ISPCLaunch_param_5 +) +{ + .reg .pred %p<2>; + .reg .s32 %r<16>; + .reg .s64 %rd<6>; + + + ld.param.u64 %rd1, [ISPCLaunch_param_1]; + ld.param.u64 %rd2, [ISPCLaunch_param_2]; + ld.param.u32 %r1, [ISPCLaunch_param_3]; + ld.param.u32 %r2, [ISPCLaunch_param_4]; + ld.param.u32 %r3, [ISPCLaunch_param_5]; + mov.u32 %r4, %tid.x; + and.b32 %r5, %r4, 31; + setp.ne.s32 %p1, %r5, 0; + @%p1 bra BB9_2; + + add.s32 %r14, %r1, -1; + shr.s32 %r15, %r14, 2; + add.s32 %r7, %r15, 1; + mov.u32 %r12, 1; + mov.u32 %r10, 128; + mov.u32 %r13, 0; + mov.u64 %rd5, 0; + // inline asm + { + .param .b64 param0; + st.param.b64 [param0+0], %rd1; + .param .b64 param1; + st.param.b64 [param1+0], %rd2; + .param .align 4 .b8 param2[12]; + st.param.b32 [param2+0], %r7; + st.param.b32 [param2+4], %r2; + st.param.b32 [param2+8], %r3; + .param .align 4 .b8 param3[12]; + st.param.b32 [param3+0], %r10; + st.param.b32 [param3+4], %r12; + 
st.param.b32 [param3+8], %r12; + .param .b32 param4; + st.param.b32 [param4+0], %r13; + .param .b64 param5; + st.param.b64 [param5+0], %rd5; + + .param .b32 retval0; + call.uni (retval0), + cudaLaunchDevice, + ( + param0, + param1, + param2, + param3, + param4, + param5 + ); + ld.param.b32 %r6, [retval0+0]; + } + + // inline asm + +BB9_2: + ret; +} + +.visible .func ISPCSync( + .param .b64 ISPCSync_param_0 +) +{ + .reg .s32 %r<2>; + + + // Callseq Start 1 + { + .reg .b32 temp_param_reg; + .param .b32 retval0; + call.uni (retval0), + cudaDeviceSynchronize, + ( + ); + ld.param.b32 %r1, [retval0+0]; + } + // Callseq End 1 + ret; +} + +.visible .func (.param .b64 func_retval0) __warpBinExclusiveScan( + .param .b32 __warpBinExclusiveScan_param_0 +) +{ + .reg .s32 %r<8>; + .reg .s64 %rd<5>; + + + ld.param.u8 %r2, [__warpBinExclusiveScan_param_0]; + // inline asm + { .reg .pred %p1; + setp.ne.u32 %p1, %r2, 0; + vote.ballot.b32 %r1, %p1; + } + // inline asm + // inline asm + popc.b32 %r3, %r1; + // inline asm + // inline asm + mov.u32 %r5, %lanemask_lt; + // inline asm + and.b32 %r7, %r5, %r1; + // inline asm + popc.b32 %r6, %r7; + // inline asm + cvt.u64.u32 %rd1, %r6; + shl.b64 %rd2, %rd1, 32; + cvt.u64.u32 %rd3, %r3; + or.b64 %rd4, %rd2, %rd3; + st.param.b64 [func_retval0+0], %rd4; + ret; +} + +.entry stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_( + .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_0, + .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_1, + .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_2, + .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_3, + .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_4, + .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_5, + .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_6, + .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_7, + .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_8, + .param .u64 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_9, + .param .u64 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_10, + .param .u64 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_11, + .param .u64 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_12 +) +{ + .reg .pred %p<14>; + .reg .s32 %r<178>; + .reg .s64 %rd<96>; + .reg .f64 %fd<95>; + + + ld.param.u32 %r42, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_0]; + ld.param.u32 %r43, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_1]; + ld.param.u32 %r44, 
[stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_2]; + ld.param.u32 %r45, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_3]; + ld.param.u32 %r46, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_4]; + ld.param.u32 %r47, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_5]; + ld.param.u32 %r48, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_6]; + ld.param.u32 %r49, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_7]; + ld.param.u64 %rd2, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_9]; + ld.param.u64 %rd3, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_10]; + ld.param.u64 %rd4, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_11]; + ld.param.u64 %rd5, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_12]; + mov.u32 %r1, %ctaid.x; + shl.b32 %r50, %r1, 2; + mov.u32 %r2, %tid.x; + shr.s32 %r51, %r2, 5; + add.s32 %r52, %r51, %r50; + mov.u32 %r53, %nctaid.x; + shl.b32 %r54, %r53, 2; + setp.ge.s32 %p1, %r52, %r54; + mov.u32 %r55, %nctaid.y; + mov.u32 %r3, %ctaid.y; + setp.ge.s32 %p2, %r3, %r55; + or.pred %p3, %p1, %p2; + mov.u32 %r56, %nctaid.z; + mov.u32 %r4, %ctaid.z; + setp.ge.s32 %p4, %r4, %r56; + or.pred %p5, %p3, %p4; + @%p5 bra BB12_13; + + shl.b32 %r57, %r1, 7; + add.s32 %r58, %r2, %r57; + and.b32 %r59, %r58, -32; + add.s32 %r60, %r59, %r42; + add.s32 %r61, %r60, 32; + min.s32 %r5, %r43, %r61; + shl.b32 %r6, %r3, 3; + add.s32 %r62, %r6, %r44; + add.s32 %r7, %r62, 8; + shl.b32 %r8, %r4, 3; + add.s32 %r172, %r8, %r46; + add.s32 %r63, %r172, 8; + min.s32 %r64, %r47, %r63; + mul.lo.s32 %r10, %r49, %r48; + sub.s32 %r65, %r5, %r60; + shr.s32 %r66, %r65, 31; + shr.u32 %r67, %r66, 27; + add.s32 %r68, %r65, %r67; + and.b32 %r69, %r68, -32; + sub.s32 %r70, %r65, %r69; + sub.s32 %r11, %r5, %r70; + and.b32 %r71, %r2, 31; + cvt.u64.u32 %rd6, %r71; + mov.u64 %rd7, constDeltaForeach1; + add.s64 %rd1, %rd7, %rd6; + setp.ge.s32 %p6, %r172, %r64; + @%p6 bra BB12_13; + + min.s32 %r12, %r45, %r7; + shl.b32 %r15, %r10, 1; + neg.s32 %r16, %r15; + mul.lo.s32 %r17, %r10, 3; + mul.lo.s32 %r18, %r10, -3; + mov.u32 %r72, -9; + sub.s32 %r73, %r72, %r44; + sub.s32 %r74, %r73, %r6; + not.b32 %r75, %r45; + max.s32 %r76, %r74, %r75; + not.b32 %r19, %r76; + sub.s32 %r77, %r72, %r46; + sub.s32 %r78, %r77, %r8; + not.b32 %r79, %r47; + max.s32 %r80, %r78, %r79; + not.b32 %r20, %r80; + ld.global.u8 %r13, [%rd1]; + mov.u32 %r171, %r172; + +BB12_3: + mov.u32 %r21, %r171; + add.s32 %r23, %r21, %r13; + setp.ge.s32 %p7, %r62, %r12; + @%p7 bra BB12_12; + + mul.lo.s32 %r24, %r23, %r10; + mov.u32 %r174, %r62; + mov.u32 %r173, %r62; + +BB12_5: + mov.u32 %r27, %r173; + add.s32 %r30, %r27, %r13; + setp.ge.s32 %p8, %r60, %r11; + mov.u32 %r176, %r60; + @%p8 bra BB12_8; + + mov.u64 %rd9, constDeltaForeach4; + add.s64 %rd10, %rd9, %rd6; + ld.global.u8 %r31, [%rd10]; + mad.lo.s32 %r32, %r30, %r48, %r24; + add.s32 %r177, %r59, %r42; + +BB12_7: + cvta.to.global.u64 %rd11, %rd2; + add.s32 %r98, %r32, %r177; 
+ add.s32 %r99, %r98, %r31; + shl.b32 %r100, %r99, 3; + cvt.s64.s32 %rd12, %r100; + add.s64 %rd13, %rd12, %rd4; + add.s32 %r101, %r100, 8; + cvt.s64.s32 %rd14, %r101; + add.s64 %rd15, %rd14, %rd4; + add.s32 %r102, %r100, -8; + cvt.s64.s32 %rd16, %r102; + add.s64 %rd17, %rd16, %rd4; + add.s32 %r103, %r99, %r48; + shl.b32 %r104, %r103, 3; + cvt.s64.s32 %rd18, %r104; + add.s64 %rd19, %rd18, %rd4; + sub.s32 %r105, %r99, %r48; + shl.b32 %r106, %r105, 3; + cvt.s64.s32 %rd20, %r106; + add.s64 %rd21, %rd20, %rd4; + add.s32 %r108, %r99, %r10; + shl.b32 %r109, %r108, 3; + cvt.s64.s32 %rd22, %r109; + add.s64 %rd23, %rd22, %rd4; + sub.s32 %r110, %r99, %r10; + shl.b32 %r111, %r110, 3; + cvt.s64.s32 %rd24, %r111; + add.s64 %rd25, %rd24, %rd4; + add.s32 %r112, %r100, 16; + cvt.s64.s32 %rd26, %r112; + add.s64 %rd27, %rd26, %rd4; + add.s32 %r113, %r100, -16; + cvt.s64.s32 %rd28, %r113; + add.s64 %rd29, %rd28, %rd4; + shl.b32 %r114, %r48, 1; + add.s32 %r115, %r99, %r114; + shl.b32 %r116, %r115, 3; + cvt.s64.s32 %rd30, %r116; + add.s64 %rd31, %rd30, %rd4; + mad.lo.s32 %r117, %r48, -2, %r99; + shl.b32 %r118, %r117, 3; + cvt.s64.s32 %rd32, %r118; + add.s64 %rd33, %rd32, %rd4; + add.s32 %r119, %r99, %r15; + shl.b32 %r120, %r119, 3; + cvt.s64.s32 %rd34, %r120; + add.s64 %rd35, %rd34, %rd4; + add.s32 %r121, %r99, %r16; + shl.b32 %r122, %r121, 3; + cvt.s64.s32 %rd36, %r122; + add.s64 %rd37, %rd36, %rd4; + add.s32 %r123, %r100, 24; + cvt.s64.s32 %rd38, %r123; + add.s64 %rd39, %rd38, %rd4; + add.s32 %r124, %r100, -24; + cvt.s64.s32 %rd40, %r124; + add.s64 %rd41, %rd40, %rd4; + mad.lo.s32 %r125, %r48, 3, %r99; + shl.b32 %r126, %r125, 3; + cvt.s64.s32 %rd42, %r126; + add.s64 %rd43, %rd42, %rd4; + mad.lo.s32 %r127, %r48, -3, %r99; + shl.b32 %r128, %r127, 3; + cvt.s64.s32 %rd44, %r128; + add.s64 %rd45, %rd44, %rd4; + add.s32 %r129, %r99, %r17; + shl.b32 %r130, %r129, 3; + cvt.s64.s32 %rd46, %r130; + add.s64 %rd47, %rd46, %rd4; + add.s32 %r131, %r99, %r18; + shl.b32 %r132, %r131, 3; + cvt.s64.s32 %rd48, %r132; + add.s64 %rd49, %rd48, %rd4; + add.s64 %rd50, %rd12, %rd5; + add.s64 %rd51, %rd12, %rd3; + ld.f64 %fd1, [%rd13]; + add.f64 %fd2, %fd1, %fd1; + ld.f64 %fd3, [%rd50]; + sub.f64 %fd4, %fd2, %fd3; + ld.global.f64 %fd5, [%rd11]; + ld.f64 %fd6, [%rd17]; + ld.f64 %fd7, [%rd15]; + add.f64 %fd8, %fd7, %fd6; + ld.f64 %fd9, [%rd19]; + add.f64 %fd10, %fd8, %fd9; + ld.f64 %fd11, [%rd21]; + add.f64 %fd12, %fd10, %fd11; + ld.f64 %fd13, [%rd23]; + add.f64 %fd14, %fd12, %fd13; + ld.f64 %fd15, [%rd25]; + add.f64 %fd16, %fd14, %fd15; + ld.global.f64 %fd17, [%rd11+8]; + mul.f64 %fd18, %fd17, %fd16; + fma.rn.f64 %fd19, %fd5, %fd1, %fd18; + ld.f64 %fd20, [%rd29]; + ld.f64 %fd21, [%rd27]; + add.f64 %fd22, %fd21, %fd20; + ld.f64 %fd23, [%rd31]; + add.f64 %fd24, %fd22, %fd23; + ld.f64 %fd25, [%rd33]; + add.f64 %fd26, %fd24, %fd25; + ld.f64 %fd27, [%rd35]; + add.f64 %fd28, %fd26, %fd27; + ld.f64 %fd29, [%rd37]; + add.f64 %fd30, %fd28, %fd29; + ld.global.f64 %fd31, [%rd11+16]; + fma.rn.f64 %fd32, %fd31, %fd30, %fd19; + ld.f64 %fd33, [%rd41]; + ld.f64 %fd34, [%rd39]; + add.f64 %fd35, %fd34, %fd33; + ld.f64 %fd36, [%rd43]; + add.f64 %fd37, %fd35, %fd36; + ld.f64 %fd38, [%rd45]; + add.f64 %fd39, %fd37, %fd38; + ld.f64 %fd40, [%rd47]; + add.f64 %fd41, %fd39, %fd40; + ld.f64 %fd42, [%rd49]; + add.f64 %fd43, %fd41, %fd42; + ld.global.f64 %fd44, [%rd11+24]; + fma.rn.f64 %fd45, %fd44, %fd43, %fd32; + ld.f64 %fd46, [%rd51]; + fma.rn.f64 %fd47, %fd46, %fd45, %fd4; + st.f64 [%rd50], %fd47; + add.s32 %r177, %r177, 32; + setp.lt.s32 %p9, %r177, %r11; + 
mov.u32 %r175, %r177; + mov.u32 %r176, %r175; + @%p9 bra BB12_7; + +BB12_8: + mov.u32 %r36, %r176; + setp.ge.s32 %p10, %r36, %r5; + @%p10 bra BB12_11; + + mov.u64 %rd53, constDeltaForeach4; + add.s64 %rd54, %rd53, %rd6; + ld.global.u8 %r135, [%rd54]; + add.s32 %r37, %r36, %r135; + setp.ge.s32 %p11, %r37, %r5; + @%p11 bra BB12_11; + + cvta.to.global.u64 %rd55, %rd2; + mad.lo.s32 %r136, %r30, %r48, %r24; + add.s32 %r137, %r136, %r37; + shl.b32 %r138, %r137, 3; + cvt.s64.s32 %rd56, %r138; + add.s64 %rd57, %rd56, %rd4; + add.s32 %r139, %r138, 8; + cvt.s64.s32 %rd58, %r139; + add.s64 %rd59, %rd58, %rd4; + add.s32 %r140, %r138, -8; + cvt.s64.s32 %rd60, %r140; + add.s64 %rd61, %rd60, %rd4; + add.s32 %r141, %r137, %r48; + shl.b32 %r142, %r141, 3; + cvt.s64.s32 %rd62, %r142; + add.s64 %rd63, %rd62, %rd4; + sub.s32 %r143, %r137, %r48; + shl.b32 %r144, %r143, 3; + cvt.s64.s32 %rd64, %r144; + add.s64 %rd65, %rd64, %rd4; + add.s32 %r146, %r137, %r10; + shl.b32 %r147, %r146, 3; + cvt.s64.s32 %rd66, %r147; + add.s64 %rd67, %rd66, %rd4; + sub.s32 %r148, %r137, %r10; + shl.b32 %r149, %r148, 3; + cvt.s64.s32 %rd68, %r149; + add.s64 %rd69, %rd68, %rd4; + add.s32 %r150, %r138, 16; + cvt.s64.s32 %rd70, %r150; + add.s64 %rd71, %rd70, %rd4; + add.s32 %r151, %r138, -16; + cvt.s64.s32 %rd72, %r151; + add.s64 %rd73, %rd72, %rd4; + shl.b32 %r152, %r48, 1; + add.s32 %r153, %r137, %r152; + shl.b32 %r154, %r153, 3; + cvt.s64.s32 %rd74, %r154; + add.s64 %rd75, %rd74, %rd4; + mad.lo.s32 %r155, %r48, -2, %r137; + shl.b32 %r156, %r155, 3; + cvt.s64.s32 %rd76, %r156; + add.s64 %rd77, %rd76, %rd4; + add.s32 %r157, %r137, %r15; + shl.b32 %r158, %r157, 3; + cvt.s64.s32 %rd78, %r158; + add.s64 %rd79, %rd78, %rd4; + add.s32 %r159, %r137, %r16; + shl.b32 %r160, %r159, 3; + cvt.s64.s32 %rd80, %r160; + add.s64 %rd81, %rd80, %rd4; + add.s32 %r161, %r138, 24; + cvt.s64.s32 %rd82, %r161; + add.s64 %rd83, %rd82, %rd4; + add.s32 %r162, %r138, -24; + cvt.s64.s32 %rd84, %r162; + add.s64 %rd85, %rd84, %rd4; + mad.lo.s32 %r163, %r48, 3, %r137; + shl.b32 %r164, %r163, 3; + cvt.s64.s32 %rd86, %r164; + add.s64 %rd87, %rd86, %rd4; + mad.lo.s32 %r165, %r48, -3, %r137; + shl.b32 %r166, %r165, 3; + cvt.s64.s32 %rd88, %r166; + add.s64 %rd89, %rd88, %rd4; + add.s32 %r167, %r137, %r17; + shl.b32 %r168, %r167, 3; + cvt.s64.s32 %rd90, %r168; + add.s64 %rd91, %rd90, %rd4; + add.s32 %r169, %r137, %r18; + shl.b32 %r170, %r169, 3; + cvt.s64.s32 %rd92, %r170; + add.s64 %rd93, %rd92, %rd4; + add.s64 %rd94, %rd56, %rd5; + add.s64 %rd95, %rd56, %rd3; + ld.f64 %fd48, [%rd57]; + add.f64 %fd49, %fd48, %fd48; + ld.f64 %fd50, [%rd94]; + sub.f64 %fd51, %fd49, %fd50; + ld.global.f64 %fd52, [%rd55]; + ld.f64 %fd53, [%rd61]; + ld.f64 %fd54, [%rd59]; + add.f64 %fd55, %fd54, %fd53; + ld.f64 %fd56, [%rd63]; + add.f64 %fd57, %fd55, %fd56; + ld.f64 %fd58, [%rd65]; + add.f64 %fd59, %fd57, %fd58; + ld.f64 %fd60, [%rd67]; + add.f64 %fd61, %fd59, %fd60; + ld.f64 %fd62, [%rd69]; + add.f64 %fd63, %fd61, %fd62; + ld.global.f64 %fd64, [%rd55+8]; + mul.f64 %fd65, %fd64, %fd63; + fma.rn.f64 %fd66, %fd52, %fd48, %fd65; + ld.f64 %fd67, [%rd73]; + ld.f64 %fd68, [%rd71]; + add.f64 %fd69, %fd68, %fd67; + ld.f64 %fd70, [%rd75]; + add.f64 %fd71, %fd69, %fd70; + ld.f64 %fd72, [%rd77]; + add.f64 %fd73, %fd71, %fd72; + ld.f64 %fd74, [%rd79]; + add.f64 %fd75, %fd73, %fd74; + ld.f64 %fd76, [%rd81]; + add.f64 %fd77, %fd75, %fd76; + ld.global.f64 %fd78, [%rd55+16]; + fma.rn.f64 %fd79, %fd78, %fd77, %fd66; + ld.f64 %fd80, [%rd85]; + ld.f64 %fd81, [%rd83]; + add.f64 %fd82, %fd81, %fd80; + ld.f64 
%fd83, [%rd87]; + add.f64 %fd84, %fd82, %fd83; + ld.f64 %fd85, [%rd89]; + add.f64 %fd86, %fd84, %fd85; + ld.f64 %fd87, [%rd91]; + add.f64 %fd88, %fd86, %fd87; + ld.f64 %fd89, [%rd93]; + add.f64 %fd90, %fd88, %fd89; + ld.global.f64 %fd91, [%rd55+24]; + fma.rn.f64 %fd92, %fd91, %fd90, %fd79; + ld.f64 %fd93, [%rd95]; + fma.rn.f64 %fd94, %fd92, %fd93, %fd51; + st.f64 [%rd94], %fd94; + +BB12_11: + add.s32 %r39, %r174, 1; + setp.ne.s32 %p12, %r39, %r19; + mov.u32 %r174, %r39; + mov.u32 %r173, %r39; + @%p12 bra BB12_5; + +BB12_12: + add.s32 %r171, %r172, 1; + setp.ne.s32 %p13, %r171, %r20; + mov.u32 %r172, %r171; + @%p13 bra BB12_3; + +BB12_13: + ret; +} + +.visible .func loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E_( + .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_0, + .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_1, + .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_2, + .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_3, + .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_4, + .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_5, + .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_6, + .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_7, + .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_8, + .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_9, + .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_10, + .param .b64 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_11, + .param .b64 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_12, + .param .b64 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_13, + .param .b64 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_14, + .param .align 1 .b8 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_15[1] +) +{ + .reg .pred %p<9>; + .reg .s32 %r<63>; + .reg .s64 %rd<18>; + + + ld.param.u32 %r62, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_0]; + ld.param.u32 %r12, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_1]; + ld.param.u32 %r13, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_2]; + ld.param.u32 %r14, 
[loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_3]; + ld.param.u32 %r15, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_4]; + ld.param.u32 %r16, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_5]; + ld.param.u32 %r17, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_6]; + ld.param.u32 %r18, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_7]; + ld.param.u32 %r19, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_8]; + ld.param.u32 %r20, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_9]; + ld.param.u32 %r21, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_10]; + ld.param.u64 %rd4, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_11]; + ld.param.u64 %rd5, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_12]; + ld.param.u64 %rd6, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_13]; + ld.param.u64 %rd7, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_14]; + setp.ge.s32 %p1, %r62, %r12; + @%p1 bra BB13_14; + + mov.u32 %r22, 31; + sub.s32 %r23, %r22, %r13; + add.s32 %r24, %r23, %r14; + shr.s32 %r25, %r24, 31; + shr.u32 %r26, %r25, 27; + add.s32 %r27, %r24, %r26; + shr.s32 %r28, %r27, 5; + mov.u32 %r29, 7; + sub.s32 %r30, %r29, %r15; + add.s32 %r31, %r30, %r16; + shr.s32 %r32, %r31, 31; + shr.u32 %r33, %r32, 29; + add.s32 %r34, %r31, %r33; + shr.s32 %r1, %r34, 3; + sub.s32 %r35, %r29, %r17; + add.s32 %r36, %r35, %r18; + shr.s32 %r37, %r36, 31; + shr.u32 %r38, %r37, 29; + add.s32 %r39, %r36, %r38; + shr.s32 %r2, %r39, 3; + add.s32 %r40, %r28, -1; + shr.s32 %r41, %r40, 2; + add.s32 %r3, %r41, 1; + mov.u32 %r42, %tid.x; + and.b32 %r4, %r42, 31; + sub.s32 %r61, %r62, %r12; + +BB13_2: + and.b32 %r8, %r62, 1; + setp.ne.s32 %p2, %r4, 0; + mov.u64 %rd17, 0; + @%p2 bra BB13_4; + + mov.u64 %rd9, 8; + mov.u64 %rd10, 72; + // Callseq Start 2 + { + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd9; + .param .b64 param1; + st.param.b64 [param1+0], %rd10; + .param .b64 retval0; + call.uni (retval0), + cudaGetParameterBuffer, + ( + param0, + param1 + ); + ld.param.b64 %rd17, [retval0+0]; + } + // Callseq End 2 + +BB13_4: + setp.eq.s32 %p3, %r8, 0; + @%p3 bra BB13_9; + + setp.eq.s64 %p4, %rd17, 0; + @%p4 bra BB13_7; + + st.u32 [%rd17], %r13; + st.u32 [%rd17+4], %r14; + st.u32 [%rd17+8], %r15; + st.u32 [%rd17+12], %r16; + st.u32 [%rd17+16], %r17; + st.u32 [%rd17+20], %r18; + st.u32 [%rd17+24], %r19; + st.u32 [%rd17+28], %r20; + st.u32 [%rd17+32], %r21; + st.u64 [%rd17+40], %rd4; + st.u64 [%rd17+48], %rd5; + st.u64 [%rd17+56], %rd7; + st.u64 [%rd17+64], %rd6; + +BB13_7: + @%p2 bra BB13_13; + + mov.u32 %r47, 128; + mov.u32 %r49, 1; + mov.u32 %r50, 0; + mov.u64 %rd13, 0; + mov.u64 %rd11, 
stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_; + // inline asm + { + .param .b64 param0; + st.param.b64 [param0+0], %rd11; + .param .b64 param1; + st.param.b64 [param1+0], %rd17; + .param .align 4 .b8 param2[12]; + st.param.b32 [param2+0], %r3; + st.param.b32 [param2+4], %r1; + st.param.b32 [param2+8], %r2; + .param .align 4 .b8 param3[12]; + st.param.b32 [param3+0], %r47; + st.param.b32 [param3+4], %r49; + st.param.b32 [param3+8], %r49; + .param .b32 param4; + st.param.b32 [param4+0], %r50; + .param .b64 param5; + st.param.b64 [param5+0], %rd13; + + .param .b32 retval0; + call.uni (retval0), + cudaLaunchDevice, + ( + param0, + param1, + param2, + param3, + param4, + param5 + ); + ld.param.b32 %r43, [retval0+0]; + } + + // inline asm + bra.uni BB13_13; + +BB13_9: + setp.eq.s64 %p6, %rd17, 0; + @%p6 bra BB13_11; + + st.u32 [%rd17], %r13; + st.u32 [%rd17+4], %r14; + st.u32 [%rd17+8], %r15; + st.u32 [%rd17+12], %r16; + st.u32 [%rd17+16], %r17; + st.u32 [%rd17+20], %r18; + st.u32 [%rd17+24], %r19; + st.u32 [%rd17+28], %r20; + st.u32 [%rd17+32], %r21; + st.u64 [%rd17+40], %rd4; + st.u64 [%rd17+48], %rd5; + st.u64 [%rd17+56], %rd6; + st.u64 [%rd17+64], %rd7; + +BB13_11: + @%p2 bra BB13_13; + + mov.u32 %r55, 128; + mov.u32 %r57, 1; + mov.u32 %r58, 0; + mov.u64 %rd16, 0; + mov.u64 %rd14, stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_; + // inline asm + { + .param .b64 param0; + st.param.b64 [param0+0], %rd14; + .param .b64 param1; + st.param.b64 [param1+0], %rd17; + .param .align 4 .b8 param2[12]; + st.param.b32 [param2+0], %r3; + st.param.b32 [param2+4], %r1; + st.param.b32 [param2+8], %r2; + .param .align 4 .b8 param3[12]; + st.param.b32 [param3+0], %r55; + st.param.b32 [param3+4], %r57; + st.param.b32 [param3+8], %r57; + .param .b32 param4; + st.param.b32 [param4+0], %r58; + .param .b64 param5; + st.param.b64 [param5+0], %rd16; + + .param .b32 retval0; + call.uni (retval0), + cudaLaunchDevice, + ( + param0, + param1, + param2, + param3, + param4, + param5 + ); + ld.param.b32 %r51, [retval0+0]; + } + + // inline asm + +BB13_13: + // Callseq Start 3 + { + .reg .b32 temp_param_reg; + .param .b32 retval0; + call.uni (retval0), + cudaDeviceSynchronize, + ( + ); + ld.param.b32 %r59, [retval0+0]; + } + // Callseq End 3 + add.s32 %r62, %r62, 1; + add.s32 %r61, %r61, 1; + setp.ne.s32 %p8, %r61, 0; + @%p8 bra BB13_2; + +BB13_14: + // Callseq Start 4 + { + .reg .b32 temp_param_reg; + .param .b32 retval0; + call.uni (retval0), + cudaDeviceSynchronize, + ( + ); + ld.param.b32 %r60, [retval0+0]; + } + // Callseq End 4 + ret; +} + +.visible .entry loop_stencil_ispc_tasks( + .param .u32 loop_stencil_ispc_tasks_param_0, + .param .u32 loop_stencil_ispc_tasks_param_1, + .param .u32 loop_stencil_ispc_tasks_param_2, + .param .u32 loop_stencil_ispc_tasks_param_3, + .param .u32 loop_stencil_ispc_tasks_param_4, + .param .u32 loop_stencil_ispc_tasks_param_5, + .param .u32 loop_stencil_ispc_tasks_param_6, + .param .u32 loop_stencil_ispc_tasks_param_7, + .param .u32 loop_stencil_ispc_tasks_param_8, + .param .u32 loop_stencil_ispc_tasks_param_9, + .param .u32 loop_stencil_ispc_tasks_param_10, + .param .u64 loop_stencil_ispc_tasks_param_11, + .param .u64 loop_stencil_ispc_tasks_param_12, + .param .u64 loop_stencil_ispc_tasks_param_13, + .param .u64 loop_stencil_ispc_tasks_param_14 +) +{ + .reg .pred %p<9>; + .reg .s32 %r<63>; + .reg .s64 %rd<18>; + + + ld.param.u32 %r62, [loop_stencil_ispc_tasks_param_0]; 
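+ // param_0 is the first time step t0; %r62 carries the running step (its low bit selects the Aeven/Aodd ping-pong below), while the trip count is tracked separately in %r61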
+ ld.param.u32 %r12, [loop_stencil_ispc_tasks_param_1]; + ld.param.u32 %r13, [loop_stencil_ispc_tasks_param_2]; + ld.param.u32 %r14, [loop_stencil_ispc_tasks_param_3]; + ld.param.u32 %r15, [loop_stencil_ispc_tasks_param_4]; + ld.param.u32 %r16, [loop_stencil_ispc_tasks_param_5]; + ld.param.u32 %r17, [loop_stencil_ispc_tasks_param_6]; + ld.param.u32 %r18, [loop_stencil_ispc_tasks_param_7]; + ld.param.u32 %r19, [loop_stencil_ispc_tasks_param_8]; + ld.param.u32 %r20, [loop_stencil_ispc_tasks_param_9]; + ld.param.u32 %r21, [loop_stencil_ispc_tasks_param_10]; + ld.param.u64 %rd4, [loop_stencil_ispc_tasks_param_11]; + ld.param.u64 %rd5, [loop_stencil_ispc_tasks_param_12]; + ld.param.u64 %rd6, [loop_stencil_ispc_tasks_param_13]; + ld.param.u64 %rd7, [loop_stencil_ispc_tasks_param_14]; + setp.ge.s32 %p1, %r62, %r12; + @%p1 bra BB14_14; + + mov.u32 %r22, 31; + sub.s32 %r23, %r22, %r13; + add.s32 %r24, %r23, %r14; + shr.s32 %r25, %r24, 31; + shr.u32 %r26, %r25, 27; + add.s32 %r27, %r24, %r26; + shr.s32 %r28, %r27, 5; + mov.u32 %r29, 7; + sub.s32 %r30, %r29, %r15; + add.s32 %r31, %r30, %r16; + shr.s32 %r32, %r31, 31; + shr.u32 %r33, %r32, 29; + add.s32 %r34, %r31, %r33; + shr.s32 %r1, %r34, 3; + sub.s32 %r35, %r29, %r17; + add.s32 %r36, %r35, %r18; + shr.s32 %r37, %r36, 31; + shr.u32 %r38, %r37, 29; + add.s32 %r39, %r36, %r38; + shr.s32 %r2, %r39, 3; + add.s32 %r40, %r28, -1; + shr.s32 %r41, %r40, 2; + add.s32 %r3, %r41, 1; + mov.u32 %r42, %tid.x; + and.b32 %r4, %r42, 31; + sub.s32 %r61, %r62, %r12; + +BB14_2: + and.b32 %r8, %r62, 1; + setp.ne.s32 %p2, %r4, 0; + mov.u64 %rd17, 0; + @%p2 bra BB14_4; + + mov.u64 %rd9, 8; + mov.u64 %rd10, 72; + // Callseq Start 5 + { + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd9; + .param .b64 param1; + st.param.b64 [param1+0], %rd10; + .param .b64 retval0; + call.uni (retval0), + cudaGetParameterBuffer, + ( + param0, + param1 + ); + ld.param.b64 %rd17, [retval0+0]; + } + // Callseq End 5 + +BB14_4: + setp.eq.s32 %p3, %r8, 0; + @%p3 bra BB14_9; + + setp.eq.s64 %p4, %rd17, 0; + @%p4 bra BB14_7; + + st.u32 [%rd17], %r13; + st.u32 [%rd17+4], %r14; + st.u32 [%rd17+8], %r15; + st.u32 [%rd17+12], %r16; + st.u32 [%rd17+16], %r17; + st.u32 [%rd17+20], %r18; + st.u32 [%rd17+24], %r19; + st.u32 [%rd17+28], %r20; + st.u32 [%rd17+32], %r21; + st.u64 [%rd17+40], %rd4; + st.u64 [%rd17+48], %rd5; + st.u64 [%rd17+56], %rd7; + st.u64 [%rd17+64], %rd6; + +BB14_7: + @%p2 bra BB14_13; + + mov.u32 %r47, 128; + mov.u32 %r49, 1; + mov.u32 %r50, 0; + mov.u64 %rd13, 0; + mov.u64 %rd11, stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_; + // inline asm + { + .param .b64 param0; + st.param.b64 [param0+0], %rd11; + .param .b64 param1; + st.param.b64 [param1+0], %rd17; + .param .align 4 .b8 param2[12]; + st.param.b32 [param2+0], %r3; + st.param.b32 [param2+4], %r1; + st.param.b32 [param2+8], %r2; + .param .align 4 .b8 param3[12]; + st.param.b32 [param3+0], %r47; + st.param.b32 [param3+4], %r49; + st.param.b32 [param3+8], %r49; + .param .b32 param4; + st.param.b32 [param4+0], %r50; + .param .b64 param5; + st.param.b64 [param5+0], %rd13; + + .param .b32 retval0; + call.uni (retval0), + cudaLaunchDevice, + ( + param0, + param1, + param2, + param3, + param4, + param5 + ); + ld.param.b32 %r43, [retval0+0]; + } + + // inline asm + bra.uni BB14_13; + +BB14_9: + setp.eq.s64 %p6, %rd17, 0; + @%p6 bra BB14_11; + + st.u32 [%rd17], %r13; + st.u32 [%rd17+4], %r14; + st.u32 [%rd17+8], %r15; + st.u32 [%rd17+12], %r16; + 
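+ // (still filling the 72-byte block from cudaGetParameterBuffer: nine 32-bit bounds/size args at offsets 0..32, then the coef, vsq and grid pointers at +40..+64; this even-step path passes (Aeven, Aodd), the BB14_4 path above stored them in the opposite order)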
st.u32 [%rd17+16], %r17; + st.u32 [%rd17+20], %r18; + st.u32 [%rd17+24], %r19; + st.u32 [%rd17+28], %r20; + st.u32 [%rd17+32], %r21; + st.u64 [%rd17+40], %rd4; + st.u64 [%rd17+48], %rd5; + st.u64 [%rd17+56], %rd6; + st.u64 [%rd17+64], %rd7; + +BB14_11: + @%p2 bra BB14_13; + + mov.u32 %r55, 128; + mov.u32 %r57, 1; + mov.u32 %r58, 0; + mov.u64 %rd16, 0; + mov.u64 %rd14, stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_; + // inline asm + { + .param .b64 param0; + st.param.b64 [param0+0], %rd14; + .param .b64 param1; + st.param.b64 [param1+0], %rd17; + .param .align 4 .b8 param2[12]; + st.param.b32 [param2+0], %r3; + st.param.b32 [param2+4], %r1; + st.param.b32 [param2+8], %r2; + .param .align 4 .b8 param3[12]; + st.param.b32 [param3+0], %r55; + st.param.b32 [param3+4], %r57; + st.param.b32 [param3+8], %r57; + .param .b32 param4; + st.param.b32 [param4+0], %r58; + .param .b64 param5; + st.param.b64 [param5+0], %rd16; + + .param .b32 retval0; + call.uni (retval0), + cudaLaunchDevice, + ( + param0, + param1, + param2, + param3, + param4, + param5 + ); + ld.param.b32 %r51, [retval0+0]; + } + + // inline asm + +BB14_13: + // Callseq Start 6 + { + .reg .b32 temp_param_reg; + .param .b32 retval0; + call.uni (retval0), + cudaDeviceSynchronize, + ( + ); + ld.param.b32 %r59, [retval0+0]; + } + // Callseq End 6 + add.s32 %r62, %r62, 1; + add.s32 %r61, %r61, 1; + setp.ne.s32 %p8, %r61, 0; + @%p8 bra BB14_2; + +BB14_14: + // Callseq Start 7 + { + .reg .b32 temp_param_reg; + .param .b32 retval0; + call.uni (retval0), + cudaDeviceSynchronize, + ( + ); + ld.param.b32 %r60, [retval0+0]; + } + // Callseq End 7 + ret; +} + + + diff --git a/examples_cuda/stencil/err b/examples_cuda/stencil/err deleted file mode 100644 index e69de29b..00000000 diff --git a/examples_cuda/stencil/info b/examples_cuda/stencil/info deleted file mode 100644 index 4fc9105f..00000000 --- a/examples_cuda/stencil/info +++ /dev/null @@ -1,5 +0,0 @@ -I have been working with the sort example, attempting to use ISPC_USE_OMP for tasking and adding a sort_paralle.cpp example that uses __gnu_parallel::sort to compare apples with apples, but clang has no support for OpenMP. - -The reason to use ISPC_USE_OMP is to control thread affinity on multi-socket systems. For bandwidth-bound throughput, the pthread-based tasking system makes it messy to control thread affinity, and without it the performance of bandwidth-bound workloads may suffer.
- -I used sort example to begin with diff --git a/examples_cuda/stencil/kernel.ptx b/examples_cuda/stencil/kernel.ptx new file mode 100644 index 00000000..b0339cbf --- /dev/null +++ b/examples_cuda/stencil/kernel.ptx @@ -0,0 +1,1246 @@ +// +// Generated by NVIDIA NVVM Compiler +// Compiler built on Thu Jul 18 02:37:37 2013 (1374107857) +// Cuda compilation tools, release 5.5, V5.5.0 +// + +.version 3.2 +.target sm_35 +.address_size 64 + + +.extern .func (.param .b32 func_retval0) cudaLaunchDevice +( + .param .b64 cudaLaunchDevice_param_0, + .param .b64 cudaLaunchDevice_param_1, + .param .align 4 .b8 cudaLaunchDevice_param_2[12], + .param .align 4 .b8 cudaLaunchDevice_param_3[12], + .param .b32 cudaLaunchDevice_param_4, + .param .b64 cudaLaunchDevice_param_5 +); + + +.extern .func (.param .b64 func_retval0) cudaGetParameterBuffer +( + .param .b64 cudaGetParameterBuffer_param_0, + .param .b64 cudaGetParameterBuffer_param_1 +) +; +.extern .func (.param .b32 func_retval0) cudaDeviceSynchronize +( + +) +; +.global .align 1 .b8 constDeltaForeach1[32]; +.global .align 1 .b8 constDeltaForeach4[32] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}; + +.visible .func (.param .b32 func_retval0) __shfl_i32( + .param .b32 __shfl_i32_param_0, + .param .b32 __shfl_i32_param_1 +) +{ + .reg .s32 %r<4>; + + + ld.param.u32 %r2, [__shfl_i32_param_0]; + ld.param.u32 %r3, [__shfl_i32_param_1]; + // inline asm + shfl.idx.b32 %r1, %r2, %r3, 0x1f; + // inline asm + st.param.b32 [func_retval0+0], %r1; + ret; +} + +.visible .func (.param .b32 func_retval0) __shfl_xor_float( + .param .b32 __shfl_xor_float_param_0, + .param .b32 __shfl_xor_float_param_1 +) +{ + .reg .s32 %r<2>; + .reg .f32 %f<3>; + + + ld.param.f32 %f2, [__shfl_xor_float_param_0]; + ld.param.u32 %r1, [__shfl_xor_float_param_1]; + // inline asm + shfl.bfly.b32 %f1, %f2, %r1, 0x1f; + // inline asm + st.param.f32 [func_retval0+0], %f1; + ret; +} + +.visible .func (.param .b32 func_retval0) __shfl_xor_i32( + .param .b32 __shfl_xor_i32_param_0, + .param .b32 __shfl_xor_i32_param_1 +) +{ + .reg .s32 %r<4>; + + + ld.param.u32 %r2, [__shfl_xor_i32_param_0]; + ld.param.u32 %r3, [__shfl_xor_i32_param_1]; + // inline asm + shfl.bfly.b32 %r1, %r2, %r3, 0x1f; + // inline asm + st.param.b32 [func_retval0+0], %r1; + ret; +} + +.visible .func (.param .b32 func_retval0) __fminf( + .param .b32 __fminf_param_0, + .param .b32 __fminf_param_1 +) +{ + .reg .f32 %f<4>; + + + ld.param.f32 %f2, [__fminf_param_0]; + ld.param.f32 %f3, [__fminf_param_1]; + // inline asm + min.f32 %f1, %f2, %f3; + // inline asm + st.param.f32 [func_retval0+0], %f1; + ret; +} + +.visible .func (.param .b32 func_retval0) __fmaxf( + .param .b32 __fmaxf_param_0, + .param .b32 __fmaxf_param_1 +) +{ + .reg .f32 %f<4>; + + + ld.param.f32 %f2, [__fmaxf_param_0]; + ld.param.f32 %f3, [__fmaxf_param_1]; + // inline asm + max.f32 %f1, %f2, %f3; + // inline asm + st.param.f32 [func_retval0+0], %f1; + ret; +} + +.visible .func (.param .b32 func_retval0) __ballot( + .param .b32 __ballot_param_0 +) +{ + .reg .s32 %r<3>; + + + ld.param.u8 %r2, [__ballot_param_0]; + // inline asm + { .reg .pred %p1; + setp.ne.u32 %p1, %r2, 0; + vote.ballot.b32 %r1, %p1; + } + // inline asm + st.param.b32 [func_retval0+0], %r1; + ret; +} + +.visible .func (.param .b32 func_retval0) __lanemask_lt( + +) +{ + .reg .s32 %r<2>; + + + // inline asm + mov.u32 %r1, %lanemask_lt; + // inline asm + st.param.b32 [func_retval0+0], %r1; + ret; +} + +.visible .func (.param 
.b64 func_retval0) ISPCAlloc( + .param .b64 ISPCAlloc_param_0, + .param .b64 ISPCAlloc_param_1, + .param .b32 ISPCAlloc_param_2 +) +{ + .reg .s64 %rd<2>; + + + mov.u64 %rd1, 1; + st.param.b64 [func_retval0+0], %rd1; + ret; +} + +.visible .func (.param .b64 func_retval0) ISPCGetParamBuffer( + .param .b64 ISPCGetParamBuffer_param_0, + .param .b64 ISPCGetParamBuffer_param_1, + .param .b64 ISPCGetParamBuffer_param_2 +) +{ + .reg .pred %p<2>; + .reg .s32 %r<3>; + .reg .s64 %rd<7>; + + + ld.param.u64 %rd3, [ISPCGetParamBuffer_param_1]; + ld.param.u64 %rd4, [ISPCGetParamBuffer_param_2]; + mov.u32 %r1, %tid.x; + and.b32 %r2, %r1, 31; + setp.ne.s32 %p1, %r2, 0; + mov.u64 %rd6, 0; + @%p1 bra BB8_2; + + // Callseq Start 0 + { + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd3; + .param .b64 param1; + st.param.b64 [param1+0], %rd4; + .param .b64 retval0; + call.uni (retval0), + cudaGetParameterBuffer, + ( + param0, + param1 + ); + ld.param.b64 %rd6, [retval0+0]; + } + // Callseq End 0 + +BB8_2: + st.param.b64 [func_retval0+0], %rd6; + ret; +} + +.visible .func ISPCLaunch( + .param .b64 ISPCLaunch_param_0, + .param .b64 ISPCLaunch_param_1, + .param .b64 ISPCLaunch_param_2, + .param .b32 ISPCLaunch_param_3, + .param .b32 ISPCLaunch_param_4, + .param .b32 ISPCLaunch_param_5 +) +{ + .reg .pred %p<2>; + .reg .s32 %r<16>; + .reg .s64 %rd<6>; + + + ld.param.u64 %rd1, [ISPCLaunch_param_1]; + ld.param.u64 %rd2, [ISPCLaunch_param_2]; + ld.param.u32 %r1, [ISPCLaunch_param_3]; + ld.param.u32 %r2, [ISPCLaunch_param_4]; + ld.param.u32 %r3, [ISPCLaunch_param_5]; + mov.u32 %r4, %tid.x; + and.b32 %r5, %r4, 31; + setp.ne.s32 %p1, %r5, 0; + @%p1 bra BB9_2; + + add.s32 %r14, %r1, -1; + shr.s32 %r15, %r14, 2; + add.s32 %r7, %r15, 1; + mov.u32 %r12, 1; + mov.u32 %r10, 128; + mov.u32 %r13, 0; + mov.u64 %rd5, 0; + // inline asm + { + .param .b64 param0; + st.param.b64 [param0+0], %rd1; + .param .b64 param1; + st.param.b64 [param1+0], %rd2; + .param .align 4 .b8 param2[12]; + st.param.b32 [param2+0], %r7; + st.param.b32 [param2+4], %r2; + st.param.b32 [param2+8], %r3; + .param .align 4 .b8 param3[12]; + st.param.b32 [param3+0], %r10; + st.param.b32 [param3+4], %r12; + st.param.b32 [param3+8], %r12; + .param .b32 param4; + st.param.b32 [param4+0], %r13; + .param .b64 param5; + st.param.b64 [param5+0], %rd5; + + .param .b32 retval0; + call.uni (retval0), + cudaLaunchDevice, + ( + param0, + param1, + param2, + param3, + param4, + param5 + ); + ld.param.b32 %r6, [retval0+0]; + } + + // inline asm + +BB9_2: + ret; +} + +.visible .func ISPCSync( + .param .b64 ISPCSync_param_0 +) +{ + .reg .s32 %r<2>; + + + // Callseq Start 1 + { + .reg .b32 temp_param_reg; + .param .b32 retval0; + call.uni (retval0), + cudaDeviceSynchronize, + ( + ); + ld.param.b32 %r1, [retval0+0]; + } + // Callseq End 1 + ret; +} + +.visible .func (.param .b64 func_retval0) __warpBinExclusiveScan( + .param .b32 __warpBinExclusiveScan_param_0 +) +{ + .reg .s32 %r<8>; + .reg .s64 %rd<5>; + + + ld.param.u8 %r2, [__warpBinExclusiveScan_param_0]; + // inline asm + { .reg .pred %p1; + setp.ne.u32 %p1, %r2, 0; + vote.ballot.b32 %r1, %p1; + } + // inline asm + // inline asm + popc.b32 %r3, %r1; + // inline asm + // inline asm + mov.u32 %r5, %lanemask_lt; + // inline asm + and.b32 %r7, %r5, %r1; + // inline asm + popc.b32 %r6, %r7; + // inline asm + cvt.u64.u32 %rd1, %r6; + shl.b64 %rd2, %rd1, 32; + cvt.u64.u32 %rd3, %r3; + or.b64 %rd4, %rd2, %rd3; + st.param.b64 [func_retval0+0], %rd4; + ret; +} + +.entry 
stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_( + .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_0, + .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_1, + .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_2, + .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_3, + .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_4, + .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_5, + .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_6, + .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_7, + .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_8, + .param .u64 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_9, + .param .u64 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_10, + .param .u64 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_11, + .param .u64 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_12 +) +{ + .reg .pred %p<14>; + .reg .s32 %r<178>; + .reg .s64 %rd<96>; + .reg .f64 %fd<95>; + + + ld.param.u32 %r42, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_0]; + ld.param.u32 %r43, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_1]; + ld.param.u32 %r44, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_2]; + ld.param.u32 %r45, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_3]; + ld.param.u32 %r46, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_4]; + ld.param.u32 %r47, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_5]; + ld.param.u32 %r48, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_6]; + ld.param.u32 %r49, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_7]; + ld.param.u64 %rd2, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_9]; + ld.param.u64 %rd3, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_10]; + ld.param.u64 %rd4, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_11]; + ld.param.u64 %rd5, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_12]; + mov.u32 %r1, 
%ctaid.x; + shl.b32 %r50, %r1, 2; + mov.u32 %r2, %tid.x; + shr.s32 %r51, %r2, 5; + add.s32 %r52, %r51, %r50; + mov.u32 %r53, %nctaid.x; + shl.b32 %r54, %r53, 2; + setp.ge.s32 %p1, %r52, %r54; + mov.u32 %r55, %nctaid.y; + mov.u32 %r3, %ctaid.y; + setp.ge.s32 %p2, %r3, %r55; + or.pred %p3, %p1, %p2; + mov.u32 %r56, %nctaid.z; + mov.u32 %r4, %ctaid.z; + setp.ge.s32 %p4, %r4, %r56; + or.pred %p5, %p3, %p4; + @%p5 bra BB12_13; + + shl.b32 %r57, %r1, 7; + add.s32 %r58, %r2, %r57; + and.b32 %r59, %r58, -32; + add.s32 %r60, %r59, %r42; + add.s32 %r61, %r60, 32; + min.s32 %r5, %r43, %r61; + shl.b32 %r6, %r3, 3; + add.s32 %r62, %r6, %r44; + add.s32 %r7, %r62, 8; + shl.b32 %r8, %r4, 3; + add.s32 %r172, %r8, %r46; + add.s32 %r63, %r172, 8; + min.s32 %r64, %r47, %r63; + mul.lo.s32 %r10, %r49, %r48; + sub.s32 %r65, %r5, %r60; + shr.s32 %r66, %r65, 31; + shr.u32 %r67, %r66, 27; + add.s32 %r68, %r65, %r67; + and.b32 %r69, %r68, -32; + sub.s32 %r70, %r65, %r69; + sub.s32 %r11, %r5, %r70; + and.b32 %r71, %r2, 31; + cvt.u64.u32 %rd6, %r71; + mov.u64 %rd7, constDeltaForeach1; + add.s64 %rd1, %rd7, %rd6; + setp.ge.s32 %p6, %r172, %r64; + @%p6 bra BB12_13; + + min.s32 %r12, %r45, %r7; + shl.b32 %r15, %r10, 1; + neg.s32 %r16, %r15; + mul.lo.s32 %r17, %r10, 3; + mul.lo.s32 %r18, %r10, -3; + mov.u32 %r72, -9; + sub.s32 %r73, %r72, %r44; + sub.s32 %r74, %r73, %r6; + not.b32 %r75, %r45; + max.s32 %r76, %r74, %r75; + not.b32 %r19, %r76; + sub.s32 %r77, %r72, %r46; + sub.s32 %r78, %r77, %r8; + not.b32 %r79, %r47; + max.s32 %r80, %r78, %r79; + not.b32 %r20, %r80; + ld.global.u8 %r13, [%rd1]; + mov.u32 %r171, %r172; + +BB12_3: + mov.u32 %r21, %r171; + add.s32 %r23, %r21, %r13; + setp.ge.s32 %p7, %r62, %r12; + @%p7 bra BB12_12; + + mul.lo.s32 %r24, %r23, %r10; + mov.u32 %r174, %r62; + mov.u32 %r173, %r62; + +BB12_5: + mov.u32 %r27, %r173; + add.s32 %r30, %r27, %r13; + setp.ge.s32 %p8, %r60, %r11; + mov.u32 %r176, %r60; + @%p8 bra BB12_8; + + mov.u64 %rd9, constDeltaForeach4; + add.s64 %rd10, %rd9, %rd6; + ld.global.u8 %r31, [%rd10]; + mad.lo.s32 %r32, %r30, %r48, %r24; + add.s32 %r177, %r59, %r42; + +BB12_7: + cvta.to.global.u64 %rd11, %rd2; + add.s32 %r98, %r32, %r177; + add.s32 %r99, %r98, %r31; + shl.b32 %r100, %r99, 3; + cvt.s64.s32 %rd12, %r100; + add.s64 %rd13, %rd12, %rd4; + add.s32 %r101, %r100, 8; + cvt.s64.s32 %rd14, %r101; + add.s64 %rd15, %rd14, %rd4; + add.s32 %r102, %r100, -8; + cvt.s64.s32 %rd16, %r102; + add.s64 %rd17, %rd16, %rd4; + add.s32 %r103, %r99, %r48; + shl.b32 %r104, %r103, 3; + cvt.s64.s32 %rd18, %r104; + add.s64 %rd19, %rd18, %rd4; + sub.s32 %r105, %r99, %r48; + shl.b32 %r106, %r105, 3; + cvt.s64.s32 %rd20, %r106; + add.s64 %rd21, %rd20, %rd4; + add.s32 %r108, %r99, %r10; + shl.b32 %r109, %r108, 3; + cvt.s64.s32 %rd22, %r109; + add.s64 %rd23, %rd22, %rd4; + sub.s32 %r110, %r99, %r10; + shl.b32 %r111, %r110, 3; + cvt.s64.s32 %rd24, %r111; + add.s64 %rd25, %rd24, %rd4; + add.s32 %r112, %r100, 16; + cvt.s64.s32 %rd26, %r112; + add.s64 %rd27, %rd26, %rd4; + add.s32 %r113, %r100, -16; + cvt.s64.s32 %rd28, %r113; + add.s64 %rd29, %rd28, %rd4; + shl.b32 %r114, %r48, 1; + add.s32 %r115, %r99, %r114; + shl.b32 %r116, %r115, 3; + cvt.s64.s32 %rd30, %r116; + add.s64 %rd31, %rd30, %rd4; + mad.lo.s32 %r117, %r48, -2, %r99; + shl.b32 %r118, %r117, 3; + cvt.s64.s32 %rd32, %r118; + add.s64 %rd33, %rd32, %rd4; + add.s32 %r119, %r99, %r15; + shl.b32 %r120, %r119, 3; + cvt.s64.s32 %rd34, %r120; + add.s64 %rd35, %rd34, %rd4; + add.s32 %r121, %r99, %r16; + shl.b32 %r122, %r121, 3; + cvt.s64.s32 %rd36, %r122; + 
add.s64 %rd37, %rd36, %rd4; + add.s32 %r123, %r100, 24; + cvt.s64.s32 %rd38, %r123; + add.s64 %rd39, %rd38, %rd4; + add.s32 %r124, %r100, -24; + cvt.s64.s32 %rd40, %r124; + add.s64 %rd41, %rd40, %rd4; + mad.lo.s32 %r125, %r48, 3, %r99; + shl.b32 %r126, %r125, 3; + cvt.s64.s32 %rd42, %r126; + add.s64 %rd43, %rd42, %rd4; + mad.lo.s32 %r127, %r48, -3, %r99; + shl.b32 %r128, %r127, 3; + cvt.s64.s32 %rd44, %r128; + add.s64 %rd45, %rd44, %rd4; + add.s32 %r129, %r99, %r17; + shl.b32 %r130, %r129, 3; + cvt.s64.s32 %rd46, %r130; + add.s64 %rd47, %rd46, %rd4; + add.s32 %r131, %r99, %r18; + shl.b32 %r132, %r131, 3; + cvt.s64.s32 %rd48, %r132; + add.s64 %rd49, %rd48, %rd4; + add.s64 %rd50, %rd12, %rd5; + add.s64 %rd51, %rd12, %rd3; + ld.f64 %fd1, [%rd13]; + add.f64 %fd2, %fd1, %fd1; + ld.f64 %fd3, [%rd50]; + sub.f64 %fd4, %fd2, %fd3; + ld.global.f64 %fd5, [%rd11]; + ld.f64 %fd6, [%rd17]; + ld.f64 %fd7, [%rd15]; + add.f64 %fd8, %fd7, %fd6; + ld.f64 %fd9, [%rd19]; + add.f64 %fd10, %fd8, %fd9; + ld.f64 %fd11, [%rd21]; + add.f64 %fd12, %fd10, %fd11; + ld.f64 %fd13, [%rd23]; + add.f64 %fd14, %fd12, %fd13; + ld.f64 %fd15, [%rd25]; + add.f64 %fd16, %fd14, %fd15; + ld.global.f64 %fd17, [%rd11+8]; + mul.f64 %fd18, %fd17, %fd16; + fma.rn.f64 %fd19, %fd5, %fd1, %fd18; + ld.f64 %fd20, [%rd29]; + ld.f64 %fd21, [%rd27]; + add.f64 %fd22, %fd21, %fd20; + ld.f64 %fd23, [%rd31]; + add.f64 %fd24, %fd22, %fd23; + ld.f64 %fd25, [%rd33]; + add.f64 %fd26, %fd24, %fd25; + ld.f64 %fd27, [%rd35]; + add.f64 %fd28, %fd26, %fd27; + ld.f64 %fd29, [%rd37]; + add.f64 %fd30, %fd28, %fd29; + ld.global.f64 %fd31, [%rd11+16]; + fma.rn.f64 %fd32, %fd31, %fd30, %fd19; + ld.f64 %fd33, [%rd41]; + ld.f64 %fd34, [%rd39]; + add.f64 %fd35, %fd34, %fd33; + ld.f64 %fd36, [%rd43]; + add.f64 %fd37, %fd35, %fd36; + ld.f64 %fd38, [%rd45]; + add.f64 %fd39, %fd37, %fd38; + ld.f64 %fd40, [%rd47]; + add.f64 %fd41, %fd39, %fd40; + ld.f64 %fd42, [%rd49]; + add.f64 %fd43, %fd41, %fd42; + ld.global.f64 %fd44, [%rd11+24]; + fma.rn.f64 %fd45, %fd44, %fd43, %fd32; + ld.f64 %fd46, [%rd51]; + fma.rn.f64 %fd47, %fd46, %fd45, %fd4; + st.f64 [%rd50], %fd47; + add.s32 %r177, %r177, 32; + setp.lt.s32 %p9, %r177, %r11; + mov.u32 %r175, %r177; + mov.u32 %r176, %r175; + @%p9 bra BB12_7; + +BB12_8: + mov.u32 %r36, %r176; + setp.ge.s32 %p10, %r36, %r5; + @%p10 bra BB12_11; + + mov.u64 %rd53, constDeltaForeach4; + add.s64 %rd54, %rd53, %rd6; + ld.global.u8 %r135, [%rd54]; + add.s32 %r37, %r36, %r135; + setp.ge.s32 %p11, %r37, %r5; + @%p11 bra BB12_11; + + cvta.to.global.u64 %rd55, %rd2; + mad.lo.s32 %r136, %r30, %r48, %r24; + add.s32 %r137, %r136, %r37; + shl.b32 %r138, %r137, 3; + cvt.s64.s32 %rd56, %r138; + add.s64 %rd57, %rd56, %rd4; + add.s32 %r139, %r138, 8; + cvt.s64.s32 %rd58, %r139; + add.s64 %rd59, %rd58, %rd4; + add.s32 %r140, %r138, -8; + cvt.s64.s32 %rd60, %r140; + add.s64 %rd61, %rd60, %rd4; + add.s32 %r141, %r137, %r48; + shl.b32 %r142, %r141, 3; + cvt.s64.s32 %rd62, %r142; + add.s64 %rd63, %rd62, %rd4; + sub.s32 %r143, %r137, %r48; + shl.b32 %r144, %r143, 3; + cvt.s64.s32 %rd64, %r144; + add.s64 %rd65, %rd64, %rd4; + add.s32 %r146, %r137, %r10; + shl.b32 %r147, %r146, 3; + cvt.s64.s32 %rd66, %r147; + add.s64 %rd67, %rd66, %rd4; + sub.s32 %r148, %r137, %r10; + shl.b32 %r149, %r148, 3; + cvt.s64.s32 %rd68, %r149; + add.s64 %rd69, %rd68, %rd4; + add.s32 %r150, %r138, 16; + cvt.s64.s32 %rd70, %r150; + add.s64 %rd71, %rd70, %rd4; + add.s32 %r151, %r138, -16; + cvt.s64.s32 %rd72, %r151; + add.s64 %rd73, %rd72, %rd4; + shl.b32 %r152, %r48, 1; + add.s32 %r153, %r137, 
%r152; + shl.b32 %r154, %r153, 3; + cvt.s64.s32 %rd74, %r154; + add.s64 %rd75, %rd74, %rd4; + mad.lo.s32 %r155, %r48, -2, %r137; + shl.b32 %r156, %r155, 3; + cvt.s64.s32 %rd76, %r156; + add.s64 %rd77, %rd76, %rd4; + add.s32 %r157, %r137, %r15; + shl.b32 %r158, %r157, 3; + cvt.s64.s32 %rd78, %r158; + add.s64 %rd79, %rd78, %rd4; + add.s32 %r159, %r137, %r16; + shl.b32 %r160, %r159, 3; + cvt.s64.s32 %rd80, %r160; + add.s64 %rd81, %rd80, %rd4; + add.s32 %r161, %r138, 24; + cvt.s64.s32 %rd82, %r161; + add.s64 %rd83, %rd82, %rd4; + add.s32 %r162, %r138, -24; + cvt.s64.s32 %rd84, %r162; + add.s64 %rd85, %rd84, %rd4; + mad.lo.s32 %r163, %r48, 3, %r137; + shl.b32 %r164, %r163, 3; + cvt.s64.s32 %rd86, %r164; + add.s64 %rd87, %rd86, %rd4; + mad.lo.s32 %r165, %r48, -3, %r137; + shl.b32 %r166, %r165, 3; + cvt.s64.s32 %rd88, %r166; + add.s64 %rd89, %rd88, %rd4; + add.s32 %r167, %r137, %r17; + shl.b32 %r168, %r167, 3; + cvt.s64.s32 %rd90, %r168; + add.s64 %rd91, %rd90, %rd4; + add.s32 %r169, %r137, %r18; + shl.b32 %r170, %r169, 3; + cvt.s64.s32 %rd92, %r170; + add.s64 %rd93, %rd92, %rd4; + add.s64 %rd94, %rd56, %rd5; + add.s64 %rd95, %rd56, %rd3; + ld.f64 %fd48, [%rd57]; + add.f64 %fd49, %fd48, %fd48; + ld.f64 %fd50, [%rd94]; + sub.f64 %fd51, %fd49, %fd50; + ld.global.f64 %fd52, [%rd55]; + ld.f64 %fd53, [%rd61]; + ld.f64 %fd54, [%rd59]; + add.f64 %fd55, %fd54, %fd53; + ld.f64 %fd56, [%rd63]; + add.f64 %fd57, %fd55, %fd56; + ld.f64 %fd58, [%rd65]; + add.f64 %fd59, %fd57, %fd58; + ld.f64 %fd60, [%rd67]; + add.f64 %fd61, %fd59, %fd60; + ld.f64 %fd62, [%rd69]; + add.f64 %fd63, %fd61, %fd62; + ld.global.f64 %fd64, [%rd55+8]; + mul.f64 %fd65, %fd64, %fd63; + fma.rn.f64 %fd66, %fd52, %fd48, %fd65; + ld.f64 %fd67, [%rd73]; + ld.f64 %fd68, [%rd71]; + add.f64 %fd69, %fd68, %fd67; + ld.f64 %fd70, [%rd75]; + add.f64 %fd71, %fd69, %fd70; + ld.f64 %fd72, [%rd77]; + add.f64 %fd73, %fd71, %fd72; + ld.f64 %fd74, [%rd79]; + add.f64 %fd75, %fd73, %fd74; + ld.f64 %fd76, [%rd81]; + add.f64 %fd77, %fd75, %fd76; + ld.global.f64 %fd78, [%rd55+16]; + fma.rn.f64 %fd79, %fd78, %fd77, %fd66; + ld.f64 %fd80, [%rd85]; + ld.f64 %fd81, [%rd83]; + add.f64 %fd82, %fd81, %fd80; + ld.f64 %fd83, [%rd87]; + add.f64 %fd84, %fd82, %fd83; + ld.f64 %fd85, [%rd89]; + add.f64 %fd86, %fd84, %fd85; + ld.f64 %fd87, [%rd91]; + add.f64 %fd88, %fd86, %fd87; + ld.f64 %fd89, [%rd93]; + add.f64 %fd90, %fd88, %fd89; + ld.global.f64 %fd91, [%rd55+24]; + fma.rn.f64 %fd92, %fd91, %fd90, %fd79; + ld.f64 %fd93, [%rd95]; + fma.rn.f64 %fd94, %fd92, %fd93, %fd51; + st.f64 [%rd94], %fd94; + +BB12_11: + add.s32 %r39, %r174, 1; + setp.ne.s32 %p12, %r39, %r19; + mov.u32 %r174, %r39; + mov.u32 %r173, %r39; + @%p12 bra BB12_5; + +BB12_12: + add.s32 %r171, %r172, 1; + setp.ne.s32 %p13, %r171, %r20; + mov.u32 %r172, %r171; + @%p13 bra BB12_3; + +BB12_13: + ret; +} + +.visible .func loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E_( + .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_0, + .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_1, + .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_2, + .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_3, + .param .b32 
loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_4, + .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_5, + .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_6, + .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_7, + .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_8, + .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_9, + .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_10, + .param .b64 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_11, + .param .b64 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_12, + .param .b64 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_13, + .param .b64 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_14, + .param .align 1 .b8 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_15[1] +) +{ + .reg .pred %p<9>; + .reg .s32 %r<63>; + .reg .s64 %rd<18>; + + + ld.param.u32 %r62, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_0]; + ld.param.u32 %r12, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_1]; + ld.param.u32 %r13, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_2]; + ld.param.u32 %r14, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_3]; + ld.param.u32 %r15, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_4]; + ld.param.u32 %r16, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_5]; + ld.param.u32 %r17, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_6]; + ld.param.u32 %r18, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_7]; + ld.param.u32 %r19, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_8]; + ld.param.u32 %r20, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_9]; + ld.param.u32 %r21, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_10]; + ld.param.u64 %rd4, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_11]; + ld.param.u64 %rd5, 
[loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_12]; + ld.param.u64 %rd6, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_13]; + ld.param.u64 %rd7, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_14]; + setp.ge.s32 %p1, %r62, %r12; + @%p1 bra BB13_14; + + mov.u32 %r22, 31; + sub.s32 %r23, %r22, %r13; + add.s32 %r24, %r23, %r14; + shr.s32 %r25, %r24, 31; + shr.u32 %r26, %r25, 27; + add.s32 %r27, %r24, %r26; + shr.s32 %r28, %r27, 5; + mov.u32 %r29, 7; + sub.s32 %r30, %r29, %r15; + add.s32 %r31, %r30, %r16; + shr.s32 %r32, %r31, 31; + shr.u32 %r33, %r32, 29; + add.s32 %r34, %r31, %r33; + shr.s32 %r1, %r34, 3; + sub.s32 %r35, %r29, %r17; + add.s32 %r36, %r35, %r18; + shr.s32 %r37, %r36, 31; + shr.u32 %r38, %r37, 29; + add.s32 %r39, %r36, %r38; + shr.s32 %r2, %r39, 3; + add.s32 %r40, %r28, -1; + shr.s32 %r41, %r40, 2; + add.s32 %r3, %r41, 1; + mov.u32 %r42, %tid.x; + and.b32 %r4, %r42, 31; + sub.s32 %r61, %r62, %r12; + +BB13_2: + and.b32 %r8, %r62, 1; + setp.ne.s32 %p2, %r4, 0; + mov.u64 %rd17, 0; + @%p2 bra BB13_4; + + mov.u64 %rd9, 8; + mov.u64 %rd10, 72; + // Callseq Start 2 + { + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd9; + .param .b64 param1; + st.param.b64 [param1+0], %rd10; + .param .b64 retval0; + call.uni (retval0), + cudaGetParameterBuffer, + ( + param0, + param1 + ); + ld.param.b64 %rd17, [retval0+0]; + } + // Callseq End 2 + +BB13_4: + setp.eq.s32 %p3, %r8, 0; + @%p3 bra BB13_9; + + setp.eq.s64 %p4, %rd17, 0; + @%p4 bra BB13_7; + + st.u32 [%rd17], %r13; + st.u32 [%rd17+4], %r14; + st.u32 [%rd17+8], %r15; + st.u32 [%rd17+12], %r16; + st.u32 [%rd17+16], %r17; + st.u32 [%rd17+20], %r18; + st.u32 [%rd17+24], %r19; + st.u32 [%rd17+28], %r20; + st.u32 [%rd17+32], %r21; + st.u64 [%rd17+40], %rd4; + st.u64 [%rd17+48], %rd5; + st.u64 [%rd17+56], %rd7; + st.u64 [%rd17+64], %rd6; + +BB13_7: + @%p2 bra BB13_13; + + mov.u32 %r47, 128; + mov.u32 %r49, 1; + mov.u32 %r50, 0; + mov.u64 %rd13, 0; + mov.u64 %rd11, stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_; + // inline asm + { + .param .b64 param0; + st.param.b64 [param0+0], %rd11; + .param .b64 param1; + st.param.b64 [param1+0], %rd17; + .param .align 4 .b8 param2[12]; + st.param.b32 [param2+0], %r3; + st.param.b32 [param2+4], %r1; + st.param.b32 [param2+8], %r2; + .param .align 4 .b8 param3[12]; + st.param.b32 [param3+0], %r47; + st.param.b32 [param3+4], %r49; + st.param.b32 [param3+8], %r49; + .param .b32 param4; + st.param.b32 [param4+0], %r50; + .param .b64 param5; + st.param.b64 [param5+0], %rd13; + + .param .b32 retval0; + call.uni (retval0), + cudaLaunchDevice, + ( + param0, + param1, + param2, + param3, + param4, + param5 + ); + ld.param.b32 %r43, [retval0+0]; + } + + // inline asm + bra.uni BB13_13; + +BB13_9: + setp.eq.s64 %p6, %rd17, 0; + @%p6 bra BB13_11; + + st.u32 [%rd17], %r13; + st.u32 [%rd17+4], %r14; + st.u32 [%rd17+8], %r15; + st.u32 [%rd17+12], %r16; + st.u32 [%rd17+16], %r17; + st.u32 [%rd17+20], %r18; + st.u32 [%rd17+24], %r19; + st.u32 [%rd17+28], %r20; + st.u32 [%rd17+32], %r21; + st.u64 [%rd17+40], %rd4; + st.u64 [%rd17+48], %rd5; + st.u64 [%rd17+56], %rd6; + st.u64 [%rd17+64], %rd7; + +BB13_11: + @%p2 bra BB13_13; + + mov.u32 %r55, 128; + mov.u32 %r57, 1; + mov.u32 %r58, 0; + mov.u64 %rd16, 0; + mov.u64 %rd14, 
stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_; + // inline asm + { + .param .b64 param0; + st.param.b64 [param0+0], %rd14; + .param .b64 param1; + st.param.b64 [param1+0], %rd17; + .param .align 4 .b8 param2[12]; + st.param.b32 [param2+0], %r3; + st.param.b32 [param2+4], %r1; + st.param.b32 [param2+8], %r2; + .param .align 4 .b8 param3[12]; + st.param.b32 [param3+0], %r55; + st.param.b32 [param3+4], %r57; + st.param.b32 [param3+8], %r57; + .param .b32 param4; + st.param.b32 [param4+0], %r58; + .param .b64 param5; + st.param.b64 [param5+0], %rd16; + + .param .b32 retval0; + call.uni (retval0), + cudaLaunchDevice, + ( + param0, + param1, + param2, + param3, + param4, + param5 + ); + ld.param.b32 %r51, [retval0+0]; + } + + // inline asm + +BB13_13: + // Callseq Start 3 + { + .reg .b32 temp_param_reg; + .param .b32 retval0; + call.uni (retval0), + cudaDeviceSynchronize, + ( + ); + ld.param.b32 %r59, [retval0+0]; + } + // Callseq End 3 + add.s32 %r62, %r62, 1; + add.s32 %r61, %r61, 1; + setp.ne.s32 %p8, %r61, 0; + @%p8 bra BB13_2; + +BB13_14: + // Callseq Start 4 + { + .reg .b32 temp_param_reg; + .param .b32 retval0; + call.uni (retval0), + cudaDeviceSynchronize, + ( + ); + ld.param.b32 %r60, [retval0+0]; + } + // Callseq End 4 + ret; +} + +.visible .entry loop_stencil_ispc_tasks( + .param .u32 loop_stencil_ispc_tasks_param_0, + .param .u32 loop_stencil_ispc_tasks_param_1, + .param .u32 loop_stencil_ispc_tasks_param_2, + .param .u32 loop_stencil_ispc_tasks_param_3, + .param .u32 loop_stencil_ispc_tasks_param_4, + .param .u32 loop_stencil_ispc_tasks_param_5, + .param .u32 loop_stencil_ispc_tasks_param_6, + .param .u32 loop_stencil_ispc_tasks_param_7, + .param .u32 loop_stencil_ispc_tasks_param_8, + .param .u32 loop_stencil_ispc_tasks_param_9, + .param .u32 loop_stencil_ispc_tasks_param_10, + .param .u64 loop_stencil_ispc_tasks_param_11, + .param .u64 loop_stencil_ispc_tasks_param_12, + .param .u64 loop_stencil_ispc_tasks_param_13, + .param .u64 loop_stencil_ispc_tasks_param_14 +) +{ + .reg .pred %p<9>; + .reg .s32 %r<63>; + .reg .s64 %rd<18>; + + + ld.param.u32 %r62, [loop_stencil_ispc_tasks_param_0]; + ld.param.u32 %r12, [loop_stencil_ispc_tasks_param_1]; + ld.param.u32 %r13, [loop_stencil_ispc_tasks_param_2]; + ld.param.u32 %r14, [loop_stencil_ispc_tasks_param_3]; + ld.param.u32 %r15, [loop_stencil_ispc_tasks_param_4]; + ld.param.u32 %r16, [loop_stencil_ispc_tasks_param_5]; + ld.param.u32 %r17, [loop_stencil_ispc_tasks_param_6]; + ld.param.u32 %r18, [loop_stencil_ispc_tasks_param_7]; + ld.param.u32 %r19, [loop_stencil_ispc_tasks_param_8]; + ld.param.u32 %r20, [loop_stencil_ispc_tasks_param_9]; + ld.param.u32 %r21, [loop_stencil_ispc_tasks_param_10]; + ld.param.u64 %rd4, [loop_stencil_ispc_tasks_param_11]; + ld.param.u64 %rd5, [loop_stencil_ispc_tasks_param_12]; + ld.param.u64 %rd6, [loop_stencil_ispc_tasks_param_13]; + ld.param.u64 %rd7, [loop_stencil_ispc_tasks_param_14]; + setp.ge.s32 %p1, %r62, %r12; + @%p1 bra BB14_14; + + mov.u32 %r22, 31; + sub.s32 %r23, %r22, %r13; + add.s32 %r24, %r23, %r14; + shr.s32 %r25, %r24, 31; + shr.u32 %r26, %r25, 27; + add.s32 %r27, %r24, %r26; + shr.s32 %r28, %r27, 5; + mov.u32 %r29, 7; + sub.s32 %r30, %r29, %r15; + add.s32 %r31, %r30, %r16; + shr.s32 %r32, %r31, 31; + shr.u32 %r33, %r32, 29; + add.s32 %r34, %r31, %r33; + shr.s32 %r1, %r34, 3; + sub.s32 %r35, %r29, %r17; + add.s32 %r36, %r35, %r18; + shr.s32 %r37, %r36, 31; + shr.u32 %r38, %r37, 29; + add.s32 %r39, %r36, %r38; + shr.s32 %r2, %r39, 3; 
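+ // launch-grid setup: %r1 and %r2 are the y and z task counts (extents divided by 8, matching SPANY/SPANZ); %r28 holds the count of 32-wide x tasks and is divided by 4 below, since each 128-thread block runs four warp-sized tasks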
+ add.s32 %r40, %r28, -1; + shr.s32 %r41, %r40, 2; + add.s32 %r3, %r41, 1; + mov.u32 %r42, %tid.x; + and.b32 %r4, %r42, 31; + sub.s32 %r61, %r62, %r12; + +BB14_2: + and.b32 %r8, %r62, 1; + setp.ne.s32 %p2, %r4, 0; + mov.u64 %rd17, 0; + @%p2 bra BB14_4; + + mov.u64 %rd9, 8; + mov.u64 %rd10, 72; + // Callseq Start 5 + { + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd9; + .param .b64 param1; + st.param.b64 [param1+0], %rd10; + .param .b64 retval0; + call.uni (retval0), + cudaGetParameterBuffer, + ( + param0, + param1 + ); + ld.param.b64 %rd17, [retval0+0]; + } + // Callseq End 5 + +BB14_4: + setp.eq.s32 %p3, %r8, 0; + @%p3 bra BB14_9; + + setp.eq.s64 %p4, %rd17, 0; + @%p4 bra BB14_7; + + st.u32 [%rd17], %r13; + st.u32 [%rd17+4], %r14; + st.u32 [%rd17+8], %r15; + st.u32 [%rd17+12], %r16; + st.u32 [%rd17+16], %r17; + st.u32 [%rd17+20], %r18; + st.u32 [%rd17+24], %r19; + st.u32 [%rd17+28], %r20; + st.u32 [%rd17+32], %r21; + st.u64 [%rd17+40], %rd4; + st.u64 [%rd17+48], %rd5; + st.u64 [%rd17+56], %rd7; + st.u64 [%rd17+64], %rd6; + +BB14_7: + @%p2 bra BB14_13; + + mov.u32 %r47, 128; + mov.u32 %r49, 1; + mov.u32 %r50, 0; + mov.u64 %rd13, 0; + mov.u64 %rd11, stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_; + // inline asm + { + .param .b64 param0; + st.param.b64 [param0+0], %rd11; + .param .b64 param1; + st.param.b64 [param1+0], %rd17; + .param .align 4 .b8 param2[12]; + st.param.b32 [param2+0], %r3; + st.param.b32 [param2+4], %r1; + st.param.b32 [param2+8], %r2; + .param .align 4 .b8 param3[12]; + st.param.b32 [param3+0], %r47; + st.param.b32 [param3+4], %r49; + st.param.b32 [param3+8], %r49; + .param .b32 param4; + st.param.b32 [param4+0], %r50; + .param .b64 param5; + st.param.b64 [param5+0], %rd13; + + .param .b32 retval0; + call.uni (retval0), + cudaLaunchDevice, + ( + param0, + param1, + param2, + param3, + param4, + param5 + ); + ld.param.b32 %r43, [retval0+0]; + } + + // inline asm + bra.uni BB14_13; + +BB14_9: + setp.eq.s64 %p6, %rd17, 0; + @%p6 bra BB14_11; + + st.u32 [%rd17], %r13; + st.u32 [%rd17+4], %r14; + st.u32 [%rd17+8], %r15; + st.u32 [%rd17+12], %r16; + st.u32 [%rd17+16], %r17; + st.u32 [%rd17+20], %r18; + st.u32 [%rd17+24], %r19; + st.u32 [%rd17+28], %r20; + st.u32 [%rd17+32], %r21; + st.u64 [%rd17+40], %rd4; + st.u64 [%rd17+48], %rd5; + st.u64 [%rd17+56], %rd6; + st.u64 [%rd17+64], %rd7; + +BB14_11: + @%p2 bra BB14_13; + + mov.u32 %r55, 128; + mov.u32 %r57, 1; + mov.u32 %r58, 0; + mov.u64 %rd16, 0; + mov.u64 %rd14, stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_; + // inline asm + { + .param .b64 param0; + st.param.b64 [param0+0], %rd14; + .param .b64 param1; + st.param.b64 [param1+0], %rd17; + .param .align 4 .b8 param2[12]; + st.param.b32 [param2+0], %r3; + st.param.b32 [param2+4], %r1; + st.param.b32 [param2+8], %r2; + .param .align 4 .b8 param3[12]; + st.param.b32 [param3+0], %r55; + st.param.b32 [param3+4], %r57; + st.param.b32 [param3+8], %r57; + .param .b32 param4; + st.param.b32 [param4+0], %r58; + .param .b64 param5; + st.param.b64 [param5+0], %rd16; + + .param .b32 retval0; + call.uni (retval0), + cudaLaunchDevice, + ( + param0, + param1, + param2, + param3, + param4, + param5 + ); + ld.param.b32 %r51, [retval0+0]; + } + + // inline asm + +BB14_13: + // Callseq Start 6 + { + .reg .b32 temp_param_reg; + .param .b32 retval0; + call.uni (retval0), + cudaDeviceSynchronize, + ( + ); + ld.param.b32 %r59, [retval0+0]; + } + // 
Callseq End 6 + add.s32 %r62, %r62, 1; + add.s32 %r61, %r61, 1; + setp.ne.s32 %p8, %r61, 0; + @%p8 bra BB14_2; + +BB14_14: + // Callseq Start 7 + { + .reg .b32 temp_param_reg; + .param .b32 retval0; + call.uni (retval0), + cudaDeviceSynchronize, + ( + ); + ld.param.b32 %r60, [retval0+0]; + } + // Callseq End 7 + ret; +} + + + diff --git a/examples_cuda/stencil/libcudadevrt.a b/examples_cuda/stencil/libcudadevrt.a new file mode 100644 index 00000000..6cf40658 Binary files /dev/null and b/examples_cuda/stencil/libcudadevrt.a differ diff --git a/examples_cuda/stencil/stencil.cpp b/examples_cuda/stencil/stencil.cpp index 2dd09535..015f2b80 100644 --- a/examples_cuda/stencil/stencil.cpp +++ b/examples_cuda/stencil/stencil.cpp @@ -71,7 +71,6 @@ extern void loop_stencil_serial(int t0, int t1, int x0, int x1, void InitData(int Nx, int Ny, int Nz, double *A[2], double *vsq) { int offset = 0; -#pragma omp parallel for collapse(2) private(offset) for (int z = 0; z < Nz; ++z) for (int y = 0; y < Ny; ++y) for (int x = 0; x < Nx; ++x, ++offset) { @@ -129,7 +128,7 @@ int main() { loop_stencil_ispc_tasks(0, 6, width, Nx - width, width, Ny - width, width, Nz - width, Nx, Ny, Nz, coeff, vsq, Aispc[0], Aispc[1]); - double dt = rtc() - t0; //get_elapsed_mcycles(); + double dt = 1e3*(rtc() - t0); //get_elapsed_mcycles(); minTimeISPCTasks = std::min(minTimeISPCTasks, dt); } diff --git a/examples_cuda/stencil/stencil.ispc b/examples_cuda/stencil/stencil.ispc index d2e095b3..72c28ef6 100644 --- a/examples_cuda/stencil/stencil.ispc +++ b/examples_cuda/stencil/stencil.ispc @@ -31,16 +31,6 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#ifdef __NVPTX__ -#warning "emitting DEVICE code" -#define taskIndex blockIndex0() -#define taskCount blockCount0() -#define programIndex laneIndex() -#define programCount warpSize() -#else -#warning "emitting HOST code" -#endif - static inline void stencil_step(uniform int x0, uniform int x1, uniform int y0, uniform int y1, @@ -50,80 +40,59 @@ stencil_step(uniform int x0, uniform int x1, uniform const double Ain[], uniform double Aout[]) { const uniform int Nxy = Nx * Ny; -// foreach (z = z0 ... z1, y = y0 ... y1, x = x0 ... x1) -#if 0 -#define VER1 -#endif - -#ifdef VER1 - const uniform long x1o = 1; - const uniform long x2o = 2; - const uniform long x3o = 3; - const uniform long y1o = Nx; - const uniform long y2o = Nx*2; - const uniform long y3o = Nx*3; - const uniform long z1o = Nxy; - const uniform long z2o = Nxy*2; - const uniform long z3o = Nxy*3; -#endif - for (uniform int z = z0; z < z1; z++) - for (uniform int y = y0; y < y1; y++) - { - const int index_base = (z * Nxy) + (y * Nx); - for (uniform int xb = x0; xb < x1; xb += programCount) - { - const int x = xb + programIndex; - int index = index_base + x; -#ifndef VER1 + foreach (z = z0 ... z1, y = y0 ... y1, x = x0 ... 
x1) + { + int index = (z * Nxy) + (y * Nx) + x; #define A_cur(x, y, z) Ain[index + (x) + ((y) * Nx) + ((z) * Nxy)] #define A_next(x, y, z) Aout[index + (x) + ((y) * Nx) + ((z) * Nxy)] - double div = coef[0] * A_cur(0, 0, 0) + - coef[1] * (A_cur(+1, 0, 0) + A_cur(-1, 0, 0) + - A_cur(0, +1, 0) + A_cur(0, -1, 0) + - A_cur(0, 0, +1) + A_cur(0, 0, -1)) + - coef[2] * (A_cur(+2, 0, 0) + A_cur(-2, 0, 0) + - A_cur(0, +2, 0) + A_cur(0, -2, 0) + - A_cur(0, 0, +2) + A_cur(0, 0, -2)) + - coef[3] * (A_cur(+3, 0, 0) + A_cur(-3, 0, 0) + - A_cur(0, +3, 0) + A_cur(0, -3, 0) + - A_cur(0, 0, +3) + A_cur(0, 0, -3)); -#else -#define A_cur(x, y, z) Ain [index + (x) + (y) + (z)] -#define A_next(x, y, z) Aout[index + (x) + (y) + (z)] - double div = coef[0] * A_cur(0, 0, 0) + - coef[1] * (A_cur(+x1o, 0, 0) + A_cur(-x1o, 0, 0) + - A_cur(0, +y1o, 0) + A_cur(0, -y1o, 0) + - A_cur(0, 0, +z1o) + A_cur(0, 0, -z1o)) + - coef[2] * (A_cur(+x2o, 0, 0) + A_cur(-x2o, 0, 0) + - A_cur(0, +y2o, 0) + A_cur(0, -y2o, 0) + - A_cur(0, 0, +z2o) + A_cur(0, 0, -z2o)) + - coef[3] * (A_cur(+x3o, 0, 0) + A_cur(-x3o, 0, 0) + - A_cur(0, +y3o, 0) + A_cur(0, -y3o, 0) + - A_cur(0, 0, +z3o) + A_cur(0, 0, -z3o)); -#endif + double div = coef[0] * A_cur(0, 0, 0) + + coef[1] * (A_cur(+1, 0, 0) + A_cur(-1, 0, 0) + + A_cur(0, +1, 0) + A_cur(0, -1, 0) + + A_cur(0, 0, +1) + A_cur(0, 0, -1)) + + coef[2] * (A_cur(+2, 0, 0) + A_cur(-2, 0, 0) + + A_cur(0, +2, 0) + A_cur(0, -2, 0) + + A_cur(0, 0, +2) + A_cur(0, 0, -2)) + + coef[3] * (A_cur(+3, 0, 0) + A_cur(-3, 0, 0) + + A_cur(0, +3, 0) + A_cur(0, -3, 0) + + A_cur(0, 0, +3) + A_cur(0, 0, -3)); - if (x < x1) - A_next(0, 0, 0) = 2.0d0 * A_cur(0, 0, 0) - A_next(0, 0, 0) + - vsq[index] * div; - } - } + A_next(0, 0, 0) = 2.0 * A_cur(0, 0, 0) - A_next(0, 0, 0) + + vsq[index] * div; + + } } +#define SPANX 32 +#define SPANY 8 +#define SPANZ 8 static task void stencil_step_task(uniform int x0, uniform int x1, uniform int y0, uniform int y1, - uniform int z0, + uniform int z0, uniform int z1, uniform int Nx, uniform int Ny, uniform int Nz, uniform const double coef[4], uniform const double vsq[], uniform const double Ain[], uniform double Aout[]) { - if(taskIndex >= taskCount) return; + if (taskIndex0 >= taskCount0 || + taskIndex1 >= taskCount1 || + taskIndex2 >= taskCount2) + return; - stencil_step(x0, x1, y0, y1, z0+taskIndex, z0+taskIndex+1, - Nx, Ny, Nz, coef, vsq, Ain, Aout); + const uniform int xfirst = x0 + taskIndex0 * SPANX; + const uniform int xlast = min(x1, xfirst + SPANX); + + const uniform int yfirst = y0 + taskIndex1 * SPANY; + const uniform int ylast = min(y1, yfirst + SPANY); + + const uniform int zfirst = z0 + taskIndex2 * SPANZ; + const uniform int zlast = min(z1, zfirst + SPANZ); + + stencil_step(xfirst,xlast, yfirst,ylast, zfirst,zlast, + Nx, Ny, Nz, coef, vsq, Ain, Aout); } + export void loop_stencil_ispc_tasks(uniform int t0, uniform int t1, uniform int x0, uniform int x1, @@ -134,39 +103,24 @@ loop_stencil_ispc_tasks(uniform int t0, uniform int t1, uniform const double vsq[], uniform double Aeven[], uniform double Aodd[]) { - for (uniform int t = t0; t < t1; ++t) { - // Parallelize across cores as well: each task will work on a slice - // of 1 in the z extent of the volume. 
- if ((t & 1) == 0) - launch[z1-z0] stencil_step_task(x0, x1, y0, y1, z0, Nx, Ny, Nz, - coef, vsq, Aeven, Aodd); - else - launch[z1-z0] stencil_step_task(x0, x1, y0, y1, z0, Nx, Ny, Nz, - coef, vsq, Aodd, Aeven); +#define NB(x,n) (((x)+(n)-1)/(n)) - // We need to wait for all of the launched tasks to finish before - // starting the next iteration. - sync; + for (uniform int t = t0; t < t1; ++t) + { + // Parallelize across cores as well: each task will work on a slice + // of 1 in the z extent of the volume. + if ((t & 1) == 0) + launch[NB(z1-z0,SPANZ)][NB(y1-y0,SPANY)][NB(x1-x0,SPANX)] + stencil_step_task(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, + coef, vsq, Aeven, Aodd); + else + launch[NB(z1-z0,SPANZ)][NB(y1-y0,SPANY)][NB(x1-x0,SPANX)] + stencil_step_task(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, + coef, vsq, Aodd, Aeven); + + // We need to wait for all of the launched tasks to finish before + // starting the next iteration. + sync; } } - -export void -loop_stencil_ispc(uniform int t0, uniform int t1, - uniform int x0, uniform int x1, - uniform int y0, uniform int y1, - uniform int z0, uniform int z1, - uniform int Nx, uniform int Ny, uniform int Nz, - uniform const double coef[4], - uniform const double vsq[], - uniform double Aeven[], uniform double Aodd[]) -{ - for (uniform int t = t0; t < t1; ++t) { - if ((t & 1) == 0) - stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq, - Aeven, Aodd); - else - stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq, - Aodd, Aeven); - } -} diff --git a/examples_cuda/stencil/stencil.ptx b/examples_cuda/stencil/stencil.ptx deleted file mode 100644 index e3dcd1ca..00000000 --- a/examples_cuda/stencil/stencil.ptx +++ /dev/null @@ -1,267 +0,0 @@ -// -// Generated by NVIDIA NVVM Compiler -// Compiler built on Thu Jul 18 02:37:37 2013 (1374107857) -// Cuda compilation tools, release 5.5, V5.5.0 -// - -.version 3.2 -.target sm_35 -.address_size 64 - - .file 1 "/home/evghenii/soft/ispc-code/ispc/examples/stencil/stencil.cu", 1383254912, 2112 - .file 2 "/usr/local/cuda-5.5/bin/..//include/cuda_device_runtime_api.h", 1375338991, 7655 - -.weak .func (.param .b32 func_retval0) cudaMalloc( - .param .b64 cudaMalloc_param_0, - .param .b64 cudaMalloc_param_1 -) -{ - .reg .s32 %r<2>; - - - mov.u32 %r1, 30; - st.param.b32 [func_retval0+0], %r1; - .loc 2 66 3 - ret; -} - -.weak .func (.param .b32 func_retval0) cudaFuncGetAttributes( - .param .b64 cudaFuncGetAttributes_param_0, - .param .b64 cudaFuncGetAttributes_param_1 -) -{ - .reg .s32 %r<2>; - - - mov.u32 %r1, 30; - st.param.b32 [func_retval0+0], %r1; - .loc 2 71 3 - ret; -} - -.visible .entry stencil_step_task( - .param .u32 stencil_step_task_param_0, - .param .u32 stencil_step_task_param_1, - .param .u32 stencil_step_task_param_2, - .param .u32 stencil_step_task_param_3, - .param .u32 stencil_step_task_param_4, - .param .u32 stencil_step_task_param_5, - .param .u32 stencil_step_task_param_6, - .param .u32 stencil_step_task_param_7, - .param .u64 stencil_step_task_param_8, - .param .u64 stencil_step_task_param_9, - .param .u64 stencil_step_task_param_10, - .param .u64 stencil_step_task_param_11 -) -{ - .reg .pred %p<8>; - .reg .s32 %r<54>; - .reg .s64 %rd<36>; - .reg .f64 %fd<48>; - - - ld.param.u32 %r19, [stencil_step_task_param_0]; - ld.param.u32 %r20, [stencil_step_task_param_1]; - ld.param.u32 %r21, [stencil_step_task_param_2]; - ld.param.u32 %r22, [stencil_step_task_param_3]; - ld.param.u32 %r23, [stencil_step_task_param_4]; - ld.param.u32 %r24, [stencil_step_task_param_5]; - ld.param.u32 %r25, 
[stencil_step_task_param_6]; - ld.param.u64 %rd4, [stencil_step_task_param_8]; - ld.param.u64 %rd1, [stencil_step_task_param_9]; - ld.param.u64 %rd2, [stencil_step_task_param_10]; - ld.param.u64 %rd3, [stencil_step_task_param_11]; - cvta.to.global.u64 %rd5, %rd4; - .loc 1 59 1 - mov.u32 %r26, %ctaid.x; - add.s32 %r51, %r26, %r23; - add.s32 %r27, %r51, 1; - .loc 1 18 1 - ld.global.f64 %fd1, [%rd5]; - .loc 1 19 1 - ld.global.f64 %fd2, [%rd5+8]; - .loc 1 20 1 - ld.global.f64 %fd3, [%rd5+16]; - .loc 1 21 1 - ld.global.f64 %fd4, [%rd5+24]; - .loc 1 22 1 - setp.ge.s32 %p1, %r51, %r27; - @%p1 bra BB2_11; - - mul.lo.s32 %r28, %r25, %r24; - shl.b32 %r29, %r28, 1; - neg.s32 %r30, %r29; - shl.b32 %r2, %r30, 3; - cvta.to.global.u64 %rd6, %rd2; - cvta.to.global.u64 %rd31, %rd3; - cvta.to.global.u64 %rd32, %rd1; - -BB2_2: - .loc 1 23 1 - setp.ge.s32 %p2, %r21, %r22; - @%p2 bra BB2_10; - - mov.u32 %r52, %r21; - -BB2_4: - .loc 1 24 1 - mov.u32 %r4, %r52; - setp.ge.s32 %p3, %r19, %r20; - @%p3 bra BB2_9; - - .loc 1 29 1 - mul.lo.s32 %r32, %r51, %r28; - mad.lo.s32 %r5, %r4, %r24, %r32; - .loc 1 32 1 - add.s32 %r6, %r24, %r5; - add.s32 %r7, %r5, %r28; - shl.b32 %r33, %r24, 1; - add.s32 %r8, %r5, %r33; - mad.lo.s32 %r9, %r24, -2, %r5; - add.s32 %r10, %r5, %r29; - mad.lo.s32 %r11, %r28, -2, %r5; - add.s32 %r12, %r24, %r8; - mad.lo.s32 %r13, %r28, 3, %r5; - mov.u32 %r53, %r19; - -BB2_6: - .loc 1 26 1 - mov.u32 %r14, %r53; - mov.u32 %r35, %tid.x; - add.s32 %r36, %r35, %r14; - .loc 1 29 1 - add.s32 %r15, %r36, %r5; - mul.wide.s32 %rd7, %r15, 8; - add.s64 %rd8, %rd6, %rd7; - .loc 1 32 1 - ld.global.f64 %fd5, [%rd8]; - ld.global.f64 %fd7, [%rd8+-8]; - ld.global.f64 %fd8, [%rd8+8]; - add.f64 %fd9, %fd8, %fd7; - add.s32 %r37, %r6, %r36; - mul.wide.s32 %rd9, %r37, 8; - add.s64 %rd10, %rd6, %rd9; - .loc 1 32 1 - ld.global.f64 %fd10, [%rd10]; - add.f64 %fd11, %fd9, %fd10; - .loc 1 22 1 - neg.s32 %r39, %r33; - shl.b32 %r40, %r39, 3; - cvt.s64.s32 %rd11, %r40; - add.s64 %rd12, %rd10, %rd11; - .loc 1 32 1 - ld.global.f64 %fd12, [%rd12]; - add.f64 %fd13, %fd11, %fd12; - add.s32 %r41, %r7, %r36; - mul.wide.s32 %rd13, %r41, 8; - add.s64 %rd14, %rd6, %rd13; - .loc 1 32 1 - ld.global.f64 %fd14, [%rd14]; - add.f64 %fd15, %fd13, %fd14; - cvt.s64.s32 %rd15, %r2; - add.s64 %rd16, %rd14, %rd15; - .loc 1 32 1 - ld.global.f64 %fd16, [%rd16]; - add.f64 %fd17, %fd15, %fd16; - mul.f64 %fd18, %fd2, %fd17; - fma.rn.f64 %fd19, %fd1, %fd5, %fd18; - ld.global.f64 %fd20, [%rd8+-16]; - ld.global.f64 %fd21, [%rd8+16]; - add.f64 %fd22, %fd21, %fd20; - add.s32 %r42, %r8, %r36; - mul.wide.s32 %rd17, %r42, 8; - add.s64 %rd18, %rd6, %rd17; - .loc 1 32 1 - ld.global.f64 %fd23, [%rd18]; - add.f64 %fd24, %fd22, %fd23; - add.s32 %r43, %r9, %r36; - mul.wide.s32 %rd19, %r43, 8; - add.s64 %rd20, %rd6, %rd19; - .loc 1 32 1 - ld.global.f64 %fd25, [%rd20]; - add.f64 %fd26, %fd24, %fd25; - add.s32 %r44, %r10, %r36; - mul.wide.s32 %rd21, %r44, 8; - add.s64 %rd22, %rd6, %rd21; - .loc 1 32 1 - ld.global.f64 %fd27, [%rd22]; - add.f64 %fd28, %fd26, %fd27; - add.s32 %r45, %r11, %r36; - mul.wide.s32 %rd23, %r45, 8; - add.s64 %rd24, %rd6, %rd23; - .loc 1 32 1 - ld.global.f64 %fd29, [%rd24]; - add.f64 %fd30, %fd28, %fd29; - fma.rn.f64 %fd31, %fd3, %fd30, %fd19; - ld.global.f64 %fd32, [%rd8+-24]; - ld.global.f64 %fd33, [%rd8+24]; - add.f64 %fd34, %fd33, %fd32; - add.s32 %r46, %r12, %r36; - mul.wide.s32 %rd25, %r46, 8; - add.s64 %rd26, %rd6, %rd25; - .loc 1 32 1 - ld.global.f64 %fd35, [%rd26]; - add.f64 %fd36, %fd34, %fd35; - add.s64 %rd27, %rd12, %rd11; - .loc 1 32 1 - 
ld.global.f64 %fd37, [%rd27]; - add.f64 %fd38, %fd36, %fd37; - add.s32 %r47, %r13, %r36; - mul.wide.s32 %rd28, %r47, 8; - add.s64 %rd29, %rd6, %rd28; - .loc 1 32 1 - ld.global.f64 %fd39, [%rd29]; - add.f64 %fd40, %fd38, %fd39; - add.s64 %rd30, %rd16, %rd15; - .loc 1 32 1 - ld.global.f64 %fd41, [%rd30]; - add.f64 %fd42, %fd40, %fd41; - fma.rn.f64 %fd6, %fd4, %fd42, %fd31; - .loc 1 44 1 - setp.ge.s32 %p4, %r36, %r20; - @%p4 bra BB2_8; - - mul.wide.s32 %rd33, %r15, 8; - add.s64 %rd34, %rd31, %rd33; - .loc 1 45 1 - ld.global.f64 %fd43, [%rd34]; - add.f64 %fd44, %fd5, %fd5; - sub.f64 %fd45, %fd44, %fd43; - add.s64 %rd35, %rd32, %rd33; - .loc 1 45 1 - ld.global.f64 %fd46, [%rd35]; - fma.rn.f64 %fd47, %fd46, %fd6, %fd45; - st.global.f64 [%rd34], %fd47; - -BB2_8: - .loc 1 24 19 - add.s32 %r16, %r14, 32; - .loc 1 24 1 - setp.lt.s32 %p5, %r16, %r20; - mov.u32 %r53, %r16; - @%p5 bra BB2_6; - -BB2_9: - .loc 1 23 18 - add.s32 %r17, %r4, 1; - .loc 1 23 1 - setp.lt.s32 %p6, %r17, %r22; - mov.u32 %r52, %r17; - @%p6 bra BB2_4; - -BB2_10: - .loc 1 22 18 - add.s32 %r51, %r51, 1; - .loc 1 59 1 - add.s32 %r49, %r23, %r26; - add.s32 %r50, %r49, 1; - .loc 1 22 1 - setp.lt.s32 %p7, %r51, %r50; - @%p7 bra BB2_2; - -BB2_11: - .loc 1 61 2 - ret; -} - - diff --git a/examples_cuda/stencil/stencil0.ptx b/examples_cuda/stencil/stencil0.ptx deleted file mode 100644 index f06a11d9..00000000 --- a/examples_cuda/stencil/stencil0.ptx +++ /dev/null @@ -1,224 +0,0 @@ -// -// Generated by NVIDIA NVVM Compiler -// Compiler built on Thu Jul 18 02:37:37 2013 (1374107857) -// Cuda compilation tools, release 5.5, V5.5.0 -// - -.version 3.2 -.target sm_35 -.address_size 64 - - .file 1 "/home/evghenii/soft/ispc-code/ispc/examples/stencil/stencil.cu", 1383254912, 2112 - -) -{ - .reg .s32 %r<2>; - - - mov.u32 %r1, 30; - st.param.b32 [func_retval0+0], %r1; - ret; -} - -.weak .func (.param .b32 func_retval0) cudaFuncGetAttributes( - .param .b64 cudaFuncGetAttributes_param_0, - .param .b64 cudaFuncGetAttributes_param_1 -) -{ - .reg .s32 %r<2>; - - - mov.u32 %r1, 30; - st.param.b32 [func_retval0+0], %r1; - ret; -} - -.visible .entry stencil_step_task( - .param .u32 stencil_step_task_param_0, - .param .u32 stencil_step_task_param_1, - .param .u32 stencil_step_task_param_2, - .param .u32 stencil_step_task_param_3, - .param .u32 stencil_step_task_param_4, - .param .u32 stencil_step_task_param_5, - .param .u32 stencil_step_task_param_6, - .param .u32 stencil_step_task_param_7, - .param .u64 stencil_step_task_param_8, - .param .u64 stencil_step_task_param_9, - .param .u64 stencil_step_task_param_10, - .param .u64 stencil_step_task_param_11 -) -{ - .reg .pred %p<8>; - .reg .s32 %r<54>; - .reg .s64 %rd<36>; - .reg .f64 %fd<48>; - - - ld.param.u32 %r19, [stencil_step_task_param_0]; - ld.param.u32 %r20, [stencil_step_task_param_1]; - ld.param.u32 %r21, [stencil_step_task_param_2]; - ld.param.u32 %r22, [stencil_step_task_param_3]; - ld.param.u32 %r23, [stencil_step_task_param_4]; - ld.param.u32 %r24, [stencil_step_task_param_5]; - ld.param.u32 %r25, [stencil_step_task_param_6]; - ld.param.u64 %rd4, [stencil_step_task_param_8]; - ld.param.u64 %rd1, [stencil_step_task_param_9]; - ld.param.u64 %rd2, [stencil_step_task_param_10]; - ld.param.u64 %rd3, [stencil_step_task_param_11]; - cvta.to.global.u64 %rd5, %rd4; - mov.u32 %r26, %ctaid.x; - add.s32 %r51, %r26, %r23; - add.s32 %r27, %r51, 1; - ld.global.f64 %fd1, [%rd5]; - ld.global.f64 %fd2, [%rd5+8]; - ld.global.f64 %fd3, [%rd5+16]; - ld.global.f64 %fd4, [%rd5+24]; - setp.ge.s32 %p1, %r51, %r27; - @%p1 
bra BB2_11; - - mul.lo.s32 %r28, %r25, %r24; - shl.b32 %r29, %r28, 1; - neg.s32 %r30, %r29; - shl.b32 %r2, %r30, 3; - cvta.to.global.u64 %rd6, %rd2; - cvta.to.global.u64 %rd31, %rd3; - cvta.to.global.u64 %rd32, %rd1; - -BB2_2: - setp.ge.s32 %p2, %r21, %r22; - @%p2 bra BB2_10; - - mov.u32 %r52, %r21; - -BB2_4: - mov.u32 %r4, %r52; - setp.ge.s32 %p3, %r19, %r20; - @%p3 bra BB2_9; - - mul.lo.s32 %r32, %r51, %r28; - mad.lo.s32 %r5, %r4, %r24, %r32; - add.s32 %r6, %r24, %r5; - add.s32 %r7, %r5, %r28; - shl.b32 %r33, %r24, 1; - add.s32 %r8, %r5, %r33; - mad.lo.s32 %r9, %r24, -2, %r5; - add.s32 %r10, %r5, %r29; - mad.lo.s32 %r11, %r28, -2, %r5; - add.s32 %r12, %r24, %r8; - mad.lo.s32 %r13, %r28, 3, %r5; - mov.u32 %r53, %r19; - -BB2_6: - mov.u32 %r14, %r53; - mov.u32 %r35, %tid.x; - add.s32 %r36, %r35, %r14; - add.s32 %r15, %r36, %r5; - mul.wide.s32 %rd7, %r15, 8; - add.s64 %rd8, %rd6, %rd7; - ld.global.f64 %fd5, [%rd8]; - ld.global.f64 %fd7, [%rd8+-8]; - ld.global.f64 %fd8, [%rd8+8]; - add.f64 %fd9, %fd8, %fd7; - add.s32 %r37, %r6, %r36; - mul.wide.s32 %rd9, %r37, 8; - add.s64 %rd10, %rd6, %rd9; - ld.global.f64 %fd10, [%rd10]; - add.f64 %fd11, %fd9, %fd10; - neg.s32 %r39, %r33; - shl.b32 %r40, %r39, 3; - cvt.s64.s32 %rd11, %r40; - add.s64 %rd12, %rd10, %rd11; - ld.global.f64 %fd12, [%rd12]; - add.f64 %fd13, %fd11, %fd12; - add.s32 %r41, %r7, %r36; - mul.wide.s32 %rd13, %r41, 8; - add.s64 %rd14, %rd6, %rd13; - ld.global.f64 %fd14, [%rd14]; - add.f64 %fd15, %fd13, %fd14; - cvt.s64.s32 %rd15, %r2; - add.s64 %rd16, %rd14, %rd15; - ld.global.f64 %fd16, [%rd16]; - add.f64 %fd17, %fd15, %fd16; - mul.f64 %fd18, %fd2, %fd17; - fma.rn.f64 %fd19, %fd1, %fd5, %fd18; - ld.global.f64 %fd20, [%rd8+-16]; - ld.global.f64 %fd21, [%rd8+16]; - add.f64 %fd22, %fd21, %fd20; - add.s32 %r42, %r8, %r36; - mul.wide.s32 %rd17, %r42, 8; - add.s64 %rd18, %rd6, %rd17; - ld.global.f64 %fd23, [%rd18]; - add.f64 %fd24, %fd22, %fd23; - add.s32 %r43, %r9, %r36; - mul.wide.s32 %rd19, %r43, 8; - add.s64 %rd20, %rd6, %rd19; - ld.global.f64 %fd25, [%rd20]; - add.f64 %fd26, %fd24, %fd25; - add.s32 %r44, %r10, %r36; - mul.wide.s32 %rd21, %r44, 8; - add.s64 %rd22, %rd6, %rd21; - ld.global.f64 %fd27, [%rd22]; - add.f64 %fd28, %fd26, %fd27; - add.s32 %r45, %r11, %r36; - mul.wide.s32 %rd23, %r45, 8; - add.s64 %rd24, %rd6, %rd23; - ld.global.f64 %fd29, [%rd24]; - add.f64 %fd30, %fd28, %fd29; - fma.rn.f64 %fd31, %fd3, %fd30, %fd19; - ld.global.f64 %fd32, [%rd8+-24]; - ld.global.f64 %fd33, [%rd8+24]; - add.f64 %fd34, %fd33, %fd32; - add.s32 %r46, %r12, %r36; - mul.wide.s32 %rd25, %r46, 8; - add.s64 %rd26, %rd6, %rd25; - ld.global.f64 %fd35, [%rd26]; - add.f64 %fd36, %fd34, %fd35; - add.s64 %rd27, %rd12, %rd11; - ld.global.f64 %fd37, [%rd27]; - add.f64 %fd38, %fd36, %fd37; - add.s32 %r47, %r13, %r36; - mul.wide.s32 %rd28, %r47, 8; - add.s64 %rd29, %rd6, %rd28; - ld.global.f64 %fd39, [%rd29]; - add.f64 %fd40, %fd38, %fd39; - add.s64 %rd30, %rd16, %rd15; - ld.global.f64 %fd41, [%rd30]; - add.f64 %fd42, %fd40, %fd41; - fma.rn.f64 %fd6, %fd4, %fd42, %fd31; - setp.ge.s32 %p4, %r36, %r20; - @%p4 bra BB2_8; - - mul.wide.s32 %rd33, %r15, 8; - add.s64 %rd34, %rd31, %rd33; - ld.global.f64 %fd43, [%rd34]; - add.f64 %fd44, %fd5, %fd5; - sub.f64 %fd45, %fd44, %fd43; - add.s64 %rd35, %rd32, %rd33; - ld.global.f64 %fd46, [%rd35]; - fma.rn.f64 %fd47, %fd46, %fd6, %fd45; - st.global.f64 [%rd34], %fd47; - -BB2_8: - add.s32 %r16, %r14, 32; - setp.lt.s32 %p5, %r16, %r20; - mov.u32 %r53, %r16; - @%p5 bra BB2_6; - -BB2_9: - add.s32 %r17, %r4, 1; - setp.lt.s32 %p6, 
%r17, %r22; - mov.u32 %r52, %r17; - @%p6 bra BB2_4; - -BB2_10: - add.s32 %r51, %r51, 1; - add.s32 %r49, %r23, %r26; - add.s32 %r50, %r49, 1; - setp.lt.s32 %p7, %r51, %r50; - @%p7 bra BB2_2; - -BB2_11: - ret; -} - - diff --git a/examples_cuda/stencil/stencil1.cubin b/examples_cuda/stencil/stencil1.cubin deleted file mode 100644 index 8b7d18d9..00000000 Binary files a/examples_cuda/stencil/stencil1.cubin and /dev/null differ diff --git a/examples_cuda/stencil/stencil2.cubin b/examples_cuda/stencil/stencil2.cubin deleted file mode 100644 index 64a9d3ea..00000000 Binary files a/examples_cuda/stencil/stencil2.cubin and /dev/null differ diff --git a/examples_cuda/stencil/stencil2.ptx b/examples_cuda/stencil/stencil2.ptx deleted file mode 100644 index 3e5dfd92..00000000 --- a/examples_cuda/stencil/stencil2.ptx +++ /dev/null @@ -1,247 +0,0 @@ -// -// Generated by LLVM NVPTX Back-End -// - -.version 3.1 -.target sm_20, texmode_independent -.address_size 64 - - // .globl stencil_step_task - // @stencil_step_task -.entry stencil_step_task( - .param .u32 stencil_step_task_param_0, - .param .u32 stencil_step_task_param_1, - .param .u32 stencil_step_task_param_2, - .param .u32 stencil_step_task_param_3, - .param .u32 stencil_step_task_param_4, - .param .u32 stencil_step_task_param_5, - .param .u32 stencil_step_task_param_6, - .param .u32 stencil_step_task_param_7, - .param .u64 .ptr .align 8 stencil_step_task_param_8, - .param .u64 .ptr .align 8 stencil_step_task_param_9, - .param .u64 .ptr .align 8 stencil_step_task_param_10, - .param .u64 .ptr .align 8 stencil_step_task_param_11 -) -{ - .reg .pred %p<396>; - .reg .s16 %rc<396>; - .reg .s16 %rs<396>; - .reg .s32 %r<396>; - .reg .s64 %rl<396>; - .reg .f32 %f<396>; - .reg .f64 %fl<396>; - -// BB#0: // %allocas - mov.u32 %r12, %ctaid.x; - ld.param.u32 %r13, [stencil_step_task_param_4]; - add.s32 %r16, %r12, %r13; - add.s32 %r0, %r16, 1; - setp.ge.s32 %p0, %r16, %r0; - @%p0 bra BB0_11; -// BB#1: // %for_test28.i.preheader.lr.ph - ld.param.u32 %r0, [stencil_step_task_param_0]; - ld.param.u32 %r1, [stencil_step_task_param_1]; - ld.param.u32 %r2, [stencil_step_task_param_2]; - ld.param.u32 %r3, [stencil_step_task_param_3]; - ld.param.u32 %r4, [stencil_step_task_param_5]; - ld.param.u32 %r5, [stencil_step_task_param_6]; - mul.lo.s32 %r5, %r5, %r4; - ld.param.u64 %rl3, [stencil_step_task_param_8]; - ld.f64 %fl0, [%rl3]; - ld.f64 %fl1, [%rl3+8]; - ld.param.u64 %rl0, [stencil_step_task_param_9]; - ld.f64 %fl2, [%rl3+16]; - ld.param.u64 %rl1, [stencil_step_task_param_10]; - ld.param.u64 %rl2, [stencil_step_task_param_11]; - ld.f64 %fl3, [%rl3+24]; - shl.b32 %r6, %r4, 1; - mul.lo.s32 %r7, %r4, 3; - mul.lo.s32 %r8, %r4, -3; - shl.b32 %r9, %r5, 1; - mul.lo.s32 %r10, %r5, 3; - mul.lo.s32 %r11, %r5, -3; - add.s32 %r12, %r12, %r13; - neg.s32 %r13, %r9; - neg.s32 %r14, %r6; - mov.u32 %r32, WARP_SZ; -BB0_2: // %for_test28.i.preheader - // =>This Loop Header: Depth=1 - // Child Loop BB0_9 Depth 2 - // Child Loop BB0_5 Depth 3 - mov.u32 %r15, %r16; - setp.ge.s32 %p0, %r2, %r3; - @%p0 bra BB0_10; -// BB#3: // %for_test35.i.preheader.lr.ph - // in Loop: Header=BB0_2 Depth=1 - setp.lt.s32 %p0, %r0, %r1; - @%p0 bra BB0_4; - bra.uni BB0_10; -BB0_4: // in Loop: Header=BB0_2 Depth=1 - mul.lo.s32 %r16, %r15, %r5; - mov.u32 %r17, %r2; -BB0_9: // %for_loop37.i.lr.ph.us - // Parent Loop BB0_2 Depth=1 - // => This Loop Header: Depth=2 - // Child Loop BB0_5 Depth 3 - mad.lo.s32 %r18, %r17, %r4, %r16; - add.s32 %r19, %r18, %r4; - add.s32 %r20, %r18, %r6; - sub.s32 %r21, %r18, %r4; - 
add.s32 %r22, %r18, %r7; - add.s32 %r23, %r18, %r14; - add.s32 %r24, %r18, %r5; - add.s32 %r25, %r18, %r8; - add.s32 %r26, %r18, %r9; - sub.s32 %r27, %r18, %r5; - add.s32 %r28, %r18, %r10; - add.s32 %r29, %r18, %r13; - add.s32 %r30, %r18, %r11; - mov.u32 %r31, %r0; -BB0_5: // %for_loop37.i.us - // Parent Loop BB0_2 Depth=1 - // Parent Loop BB0_9 Depth=2 - // => This Inner Loop Header: Depth=3 - mov.u32 %r33, %tid.x; - add.s32 %r34, %r32, -1; - and.b32 %r33, %r34, %r33; - add.s32 %r33, %r33, %r31; - setp.ge.s32 %p0, %r33, %r1; - @%p0 bra BB0_7; -// BB#6: // %pl_dolane.i.us - // in Loop: Header=BB0_5 Depth=3 - add.s32 %r34, %r18, %r33; - shl.b32 %r34, %r34, 3; - add.s32 %r35, %r34, -8; - cvt.s64.s32 %rl3, %r35; - add.s64 %rl3, %rl3, %rl1; - ld.f64 %fl4, [%rl3]; - add.s32 %r35, %r34, 8; - cvt.s64.s32 %rl3, %r35; - add.s64 %rl3, %rl3, %rl1; - ld.f64 %fl5, [%rl3]; - add.s32 %r35, %r34, -16; - cvt.s64.s32 %rl3, %r35; - add.s64 %rl3, %rl3, %rl1; - ld.f64 %fl6, [%rl3]; - add.s32 %r35, %r34, 16; - cvt.s64.s32 %rl3, %r35; - add.s64 %rl3, %rl3, %rl1; - ld.f64 %fl9, [%rl3]; - add.s32 %r35, %r19, %r33; - shl.b32 %r35, %r35, 3; - cvt.s64.s32 %rl3, %r35; - add.s64 %rl3, %rl3, %rl1; - ld.f64 %fl8, [%rl3]; - add.s32 %r35, %r34, -24; - cvt.s64.s32 %rl3, %r35; - add.s64 %rl3, %rl3, %rl1; - ld.f64 %fl7, [%rl3]; - add.s32 %r35, %r34, 24; - cvt.s64.s32 %rl3, %r35; - add.s64 %rl3, %rl3, %rl1; - ld.f64 %fl10, [%rl3]; - add.s32 %r35, %r20, %r33; - shl.b32 %r35, %r35, 3; - cvt.s64.s32 %rl3, %r35; - add.s64 %rl3, %rl3, %rl1; - ld.f64 %fl13, [%rl3]; - add.s32 %r35, %r21, %r33; - shl.b32 %r35, %r35, 3; - cvt.s64.s32 %rl3, %r35; - add.s64 %rl3, %rl3, %rl1; - ld.f64 %fl12, [%rl3]; - add.s32 %r35, %r22, %r33; - shl.b32 %r35, %r35, 3; - cvt.s64.s32 %rl3, %r35; - add.s64 %rl3, %rl3, %rl1; - ld.f64 %fl11, [%rl3]; - add.s32 %r35, %r23, %r33; - shl.b32 %r35, %r35, 3; - cvt.s64.s32 %rl3, %r35; - add.s64 %rl3, %rl3, %rl1; - ld.f64 %fl16, [%rl3]; - add.s32 %r35, %r24, %r33; - shl.b32 %r35, %r35, 3; - cvt.s64.s32 %rl3, %r35; - add.s64 %rl3, %rl3, %rl1; - ld.f64 %fl15, [%rl3]; - add.s32 %r35, %r25, %r33; - shl.b32 %r35, %r35, 3; - cvt.s64.s32 %rl3, %r35; - add.s64 %rl3, %rl3, %rl1; - ld.f64 %fl14, [%rl3]; - add.s32 %r35, %r26, %r33; - shl.b32 %r35, %r35, 3; - cvt.s64.s32 %rl3, %r35; - add.s64 %rl3, %rl3, %rl1; - ld.f64 %fl19, [%rl3]; - add.s32 %r35, %r27, %r33; - shl.b32 %r35, %r35, 3; - cvt.s64.s32 %rl3, %r35; - add.s64 %rl3, %rl3, %rl1; - ld.f64 %fl18, [%rl3]; - add.s32 %r35, %r28, %r33; - shl.b32 %r35, %r35, 3; - cvt.s64.s32 %rl3, %r35; - add.s64 %rl3, %rl3, %rl1; - ld.f64 %fl17, [%rl3]; - add.s32 %r35, %r29, %r33; - shl.b32 %r35, %r35, 3; - cvt.s64.s32 %rl3, %r35; - add.s64 %rl3, %rl3, %rl1; - ld.f64 %fl24, [%rl3]; - cvt.s64.s32 %rl4, %r34; - add.s64 %rl3, %rl4, %rl1; - ld.f64 %fl21, [%rl3]; - add.s32 %r33, %r30, %r33; - shl.b32 %r33, %r33, 3; - cvt.s64.s32 %rl3, %r33; - add.s64 %rl3, %rl3, %rl1; - ld.f64 %fl20, [%rl3]; - add.s64 %rl3, %rl4, %rl2; - ld.f64 %fl23, [%rl3]; - add.s64 %rl4, %rl4, %rl0; - ld.f64 %fl22, [%rl4]; - add.f64 %fl25, %fl21, %fl21; - sub.f64 %fl23, %fl25, %fl23; - add.f64 %fl6, %fl6, %fl9; - add.f64 %fl6, %fl6, %fl13; - add.f64 %fl6, %fl6, %fl16; - add.f64 %fl6, %fl6, %fl19; - add.f64 %fl6, %fl6, %fl24; - add.f64 %fl4, %fl4, %fl5; - add.f64 %fl4, %fl4, %fl8; - add.f64 %fl4, %fl4, %fl12; - add.f64 %fl4, %fl4, %fl15; - add.f64 %fl4, %fl4, %fl18; - mul.f64 %fl5, %fl0, %fl21; - fma.rn.f64 %fl4, %fl1, %fl4, %fl5; - fma.rn.f64 %fl4, %fl2, %fl6, %fl4; - add.f64 %fl5, %fl7, %fl10; - add.f64 %fl5, %fl5, %fl11; - 
add.f64 %fl5, %fl5, %fl14; - add.f64 %fl5, %fl5, %fl17; - add.f64 %fl5, %fl5, %fl20; - fma.rn.f64 %fl4, %fl3, %fl5, %fl4; - fma.rn.f64 %fl4, %fl4, %fl22, %fl23; - st.f64 [%rl3], %fl4; -BB0_7: // %safe_if_after_true.i.us - // in Loop: Header=BB0_5 Depth=3 - add.s32 %r31, %r32, %r31; - setp.lt.s32 %p0, %r31, %r1; - @%p0 bra BB0_5; -// BB#8: // %for_exit38.i.us - // in Loop: Header=BB0_9 Depth=2 - add.s32 %r17, %r17, 1; - setp.eq.s32 %p0, %r17, %r3; - @%p0 bra BB0_10; - bra.uni BB0_9; -BB0_10: // %for_exit31.i - // in Loop: Header=BB0_2 Depth=1 - add.s32 %r16, %r15, 1; - setp.ne.s32 %p0, %r15, %r12; - @%p0 bra BB0_2; -BB0_11: // %stencil_step___uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_.exit - ret; -} - diff --git a/examples_cuda/stencil/stencilX.ispc b/examples_cuda/stencil/stencilX.ispc new file mode 100644 index 00000000..36d9d521 --- /dev/null +++ b/examples_cuda/stencil/stencilX.ispc @@ -0,0 +1,159 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +static inline void +stencil_step(uniform int x0, uniform int x1, + uniform int y0, uniform int y1, + uniform int z0, uniform int z1, + uniform int Nx, uniform int Ny, uniform int Nz, + uniform const double coef[4], uniform const double vsq[], + uniform const double Ain[], uniform double Aout[]) { + const uniform int Nxy = Nx * Ny; + +#if 0 +#define VER1 +#endif + +#ifdef VER1 + const uniform int x1o = 1; + const uniform int x2o = 2; + const uniform int x3o = 3; + const uniform int y1o = Nx; + const uniform int y2o = Nx*2; + const uniform int y3o = Nx*3; + const uniform int z1o = Nxy; + const uniform int z2o = Nxy*2; + const uniform int z3o = Nxy*3; +#endif + foreach (z = z0 ... z1, y = y0 ... y1, x = x0 ... 
x1)
+    {
+        const int index = (z * Nxy) + (y * Nx) + x;
+
+#ifndef VER1
+#define A_cur(x, y, z) Ain[index + (x) + ((y) * Nx) + ((z) * Nxy)]
+#define A_next(x, y, z) Aout[index + (x) + ((y) * Nx) + ((z) * Nxy)]
+        double div = coef[0] * A_cur(0, 0, 0) +
+            coef[1] * (A_cur(+1, 0, 0) + A_cur(-1, 0, 0) +
+                       A_cur(0, +1, 0) + A_cur(0, -1, 0) +
+                       A_cur(0, 0, +1) + A_cur(0, 0, -1)) +
+            coef[2] * (A_cur(+2, 0, 0) + A_cur(-2, 0, 0) +
+                       A_cur(0, +2, 0) + A_cur(0, -2, 0) +
+                       A_cur(0, 0, +2) + A_cur(0, 0, -2)) +
+            coef[3] * (A_cur(+3, 0, 0) + A_cur(-3, 0, 0) +
+                       A_cur(0, +3, 0) + A_cur(0, -3, 0) +
+                       A_cur(0, 0, +3) + A_cur(0, 0, -3));
+
+#else
+
+#define A_cur(x, y, z) Ain [index + (x) + (y) + (z)]
+#define A_next(x, y, z) Aout[index + (x) + (y) + (z)]
+        double div = coef[0] * A_cur(0, 0, 0) +
+            coef[1] * (A_cur(+x1o, 0, 0) + A_cur(-x1o, 0, 0) +
+                       A_cur(0, +y1o, 0) + A_cur(0, -y1o, 0) +
+                       A_cur(0, 0, +z1o) + A_cur(0, 0, -z1o)) +
+            coef[2] * (A_cur(+x2o, 0, 0) + A_cur(-x2o, 0, 0) +
+                       A_cur(0, +y2o, 0) + A_cur(0, -y2o, 0) +
+                       A_cur(0, 0, +z2o) + A_cur(0, 0, -z2o)) +
+            coef[3] * (A_cur(+x3o, 0, 0) + A_cur(-x3o, 0, 0) +
+                       A_cur(0, +y3o, 0) + A_cur(0, -y3o, 0) +
+                       A_cur(0, 0, +z3o) + A_cur(0, 0, -z3o));
+
+#endif
+
+        A_next(0, 0, 0) = 2.0 * A_cur(0, 0, 0) - A_next(0, 0, 0) +
+            vsq[index] * div;
+    }
+}
+
+#define SPANX 32
+#define SPANY  8
+#define SPANZ  8
+
+static task void
+stencil_step_task(uniform int x0, uniform int x1,
+                  uniform int y0, uniform int y1,
+                  uniform int z0, uniform int z1,
+                  uniform int Nx, uniform int Ny, uniform int Nz,
+                  uniform const double coef[4], uniform const double vsq[],
+                  uniform const double Ain[], uniform double Aout[]) {
+    if (taskIndex0 >= taskCount0 ||
+        taskIndex1 >= taskCount1 ||
+        taskIndex2 >= taskCount2)
+        return;
+
+    const uniform int xfirst = x0 + taskIndex0 * SPANX;
+    const uniform int xlast  = min(x1, xfirst + SPANX);
+
+    const uniform int yfirst = y0 + taskIndex1 * SPANY;
+    const uniform int ylast  = min(y1, yfirst + SPANY);
+
+    const uniform int zfirst = z0 + taskIndex2 * SPANZ;
+    const uniform int zlast  = min(z1, zfirst + SPANZ);
+
+    stencil_step(xfirst,xlast, yfirst,ylast, zfirst,zlast,
+                 Nx, Ny, Nz, coef, vsq, Ain, Aout);
+}
+
+
+
+export void
+loop_stencil_ispc_tasks(uniform int t0, uniform int t1,
+                        uniform int x0, uniform int x1,
+                        uniform int y0, uniform int y1,
+                        uniform int z0, uniform int z1,
+                        uniform int Nx, uniform int Ny, uniform int Nz,
+                        uniform const double coef[4],
+                        uniform const double vsq[],
+                        uniform double Aeven[], uniform double Aodd[])
+{
+#define NB(x,n) (((x)+(n)-1)/(n))
+
+    for (uniform int t = t0; t < t1; ++t)
+    {
+        // Parallelize across cores as well: each task now works on a
+        // SPANX x SPANY x SPANZ block of the volume.
+        if ((t & 1) == 0)
+            launch[NB(z1-z0,SPANZ)][NB(y1-y0,SPANY)][NB(x1-x0,SPANX)]
+                stencil_step_task(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz,
+                                  coef, vsq, Aeven, Aodd);
+        else
+            launch[NB(z1-z0,SPANZ)][NB(y1-y0,SPANY)][NB(x1-x0,SPANX)]
+                stencil_step_task(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz,
+                                  coef, vsq, Aodd, Aeven);
+
+        // We need to wait for all of the launched tasks to finish before
+        // starting the next iteration.
+        sync;
+    }
+}
+
diff --git a/examples_cuda/stencil/stencilY.ispc b/examples_cuda/stencil/stencilY.ispc
new file mode 100644
index 00000000..72c28ef6
--- /dev/null
+++ b/examples_cuda/stencil/stencilY.ispc
@@ -0,0 +1,126 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +static inline void +stencil_step(uniform int x0, uniform int x1, + uniform int y0, uniform int y1, + uniform int z0, uniform int z1, + uniform int Nx, uniform int Ny, uniform int Nz, + uniform const double coef[4], uniform const double vsq[], + uniform const double Ain[], uniform double Aout[]) { + const uniform int Nxy = Nx * Ny; + + foreach (z = z0 ... z1, y = y0 ... y1, x = x0 ... 
x1)
+    {
+        int index = (z * Nxy) + (y * Nx) + x;
+#define A_cur(x, y, z) Ain[index + (x) + ((y) * Nx) + ((z) * Nxy)]
+#define A_next(x, y, z) Aout[index + (x) + ((y) * Nx) + ((z) * Nxy)]
+        double div = coef[0] * A_cur(0, 0, 0) +
+            coef[1] * (A_cur(+1, 0, 0) + A_cur(-1, 0, 0) +
+                       A_cur(0, +1, 0) + A_cur(0, -1, 0) +
+                       A_cur(0, 0, +1) + A_cur(0, 0, -1)) +
+            coef[2] * (A_cur(+2, 0, 0) + A_cur(-2, 0, 0) +
+                       A_cur(0, +2, 0) + A_cur(0, -2, 0) +
+                       A_cur(0, 0, +2) + A_cur(0, 0, -2)) +
+            coef[3] * (A_cur(+3, 0, 0) + A_cur(-3, 0, 0) +
+                       A_cur(0, +3, 0) + A_cur(0, -3, 0) +
+                       A_cur(0, 0, +3) + A_cur(0, 0, -3));
+
+        A_next(0, 0, 0) = 2.0 * A_cur(0, 0, 0) - A_next(0, 0, 0) +
+            vsq[index] * div;
+
+    }
+}
+
+#define SPANX 32
+#define SPANY  8
+#define SPANZ  8
+
+static task void
+stencil_step_task(uniform int x0, uniform int x1,
+                  uniform int y0, uniform int y1,
+                  uniform int z0, uniform int z1,
+                  uniform int Nx, uniform int Ny, uniform int Nz,
+                  uniform const double coef[4], uniform const double vsq[],
+                  uniform const double Ain[], uniform double Aout[]) {
+    if (taskIndex0 >= taskCount0 ||
+        taskIndex1 >= taskCount1 ||
+        taskIndex2 >= taskCount2)
+        return;
+
+    const uniform int xfirst = x0 + taskIndex0 * SPANX;
+    const uniform int xlast  = min(x1, xfirst + SPANX);
+
+    const uniform int yfirst = y0 + taskIndex1 * SPANY;
+    const uniform int ylast  = min(y1, yfirst + SPANY);
+
+    const uniform int zfirst = z0 + taskIndex2 * SPANZ;
+    const uniform int zlast  = min(z1, zfirst + SPANZ);
+
+    stencil_step(xfirst,xlast, yfirst,ylast, zfirst,zlast,
+                 Nx, Ny, Nz, coef, vsq, Ain, Aout);
+}
+
+
+
+export void
+loop_stencil_ispc_tasks(uniform int t0, uniform int t1,
+                        uniform int x0, uniform int x1,
+                        uniform int y0, uniform int y1,
+                        uniform int z0, uniform int z1,
+                        uniform int Nx, uniform int Ny, uniform int Nz,
+                        uniform const double coef[4],
+                        uniform const double vsq[],
+                        uniform double Aeven[], uniform double Aodd[])
+{
+#define NB(x,n) (((x)+(n)-1)/(n))
+
+    for (uniform int t = t0; t < t1; ++t)
+    {
+        // Parallelize across cores as well: each task now works on a
+        // SPANX x SPANY x SPANZ block of the volume.
+        if ((t & 1) == 0)
+            launch[NB(z1-z0,SPANZ)][NB(y1-y0,SPANY)][NB(x1-x0,SPANX)]
+                stencil_step_task(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz,
+                                  coef, vsq, Aeven, Aodd);
+        else
+            launch[NB(z1-z0,SPANZ)][NB(y1-y0,SPANY)][NB(x1-x0,SPANX)]
+                stencil_step_task(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz,
+                                  coef, vsq, Aodd, Aeven);
+
+        // We need to wait for all of the launched tasks to finish before
+        // starting the next iteration.
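+        // (Aeven and Aodd are ping-ponged between time steps: the launch in
+        // the next iteration reads the buffer the current tasks are still
+        // writing, so this sync expresses a real data dependency.)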
+ sync; + } +} + diff --git a/examples_cuda/stencil/stencil_avx.bc b/examples_cuda/stencil/stencil_avx.bc deleted file mode 100644 index 7a63ccce..00000000 Binary files a/examples_cuda/stencil/stencil_avx.bc and /dev/null differ diff --git a/examples_cuda/stencil/stencil_cu b/examples_cuda/stencil/stencil_cu index 1667a10b..28fe453a 100755 Binary files a/examples_cuda/stencil/stencil_cu and b/examples_cuda/stencil/stencil_cu differ diff --git a/examples_cuda/stencil/stencil_cu.bc b/examples_cuda/stencil/stencil_cu.bc deleted file mode 100644 index 5d9aecbe..00000000 Binary files a/examples_cuda/stencil/stencil_cu.bc and /dev/null differ diff --git a/examples_cuda/stencil/stencil_cu.cpp b/examples_cuda/stencil/stencil_cu.cpp index f23809a1..a4674f59 100644 --- a/examples_cuda/stencil/stencil_cu.cpp +++ b/examples_cuda/stencil/stencil_cu.cpp @@ -51,189 +51,8 @@ using namespace ispc; #include "drvapi_error_string.h" #include +#include "../cuda_ispc.h" -double rtc(void) -{ - struct timeval Tvalue; - double etime; - struct timezone dummy; - - gettimeofday(&Tvalue,&dummy); - etime = (double) Tvalue.tv_sec + - 1.e-6*((double) Tvalue.tv_usec); - return etime; -} - -#define checkCudaErrors(err) __checkCudaErrors (err, __FILE__, __LINE__) -// These are the inline versions for all of the SDK helper functions -void __checkCudaErrors(CUresult err, const char *file, const int line) { - if(CUDA_SUCCESS != err) { - std::cerr << "checkCudeErrors() Driver API error = " << err << "\"" - << getCudaDrvErrorString(err) << "\" from file <" << file - << ", line " << line << "\n"; - exit(-1); - } -} - -/**********************/ -/* Basic CUDriver API */ -CUcontext context; - -void createContext(const int deviceId = 0) -{ - CUdevice device; - int devCount; - checkCudaErrors(cuInit(0)); - checkCudaErrors(cuDeviceGetCount(&devCount)); - assert(devCount > 0); - checkCudaErrors(cuDeviceGet(&device, deviceId < devCount ? deviceId : 0)); - - char name[128]; - checkCudaErrors(cuDeviceGetName(name, 128, device)); - std::cout << "Using CUDA Device [0]: " << name << "\n"; - - int devMajor, devMinor; - checkCudaErrors(cuDeviceComputeCapability(&devMajor, &devMinor, device)); - std::cout << "Device Compute Capability: " - << devMajor << "." 
<< devMinor << "\n"; - if (devMajor < 2) { - std::cerr << "ERROR: Device 0 is not SM 2.0 or greater\n"; - exit(1); - } - - // Create driver context - checkCudaErrors(cuCtxCreate(&context, 0, device)); -} -void destroyContext() -{ - checkCudaErrors(cuCtxDestroy(context)); -} - -CUmodule loadModule(const char * module) -{ - CUmodule cudaModule; - checkCudaErrors(cuModuleLoadData(&cudaModule, module)); - return cudaModule; -} -void unloadModule(CUmodule &cudaModule) -{ - checkCudaErrors(cuModuleUnload(cudaModule)); -} - -CUfunction getFunction(CUmodule &cudaModule, const char * function) -{ - CUfunction cudaFunction; - checkCudaErrors(cuModuleGetFunction(&cudaFunction, cudaModule, function)); - return cudaFunction; -} - -CUdeviceptr deviceMalloc(const size_t size) -{ - CUdeviceptr d_buf; - checkCudaErrors(cuMemAlloc(&d_buf, size)); - return d_buf; -} -void deviceFree(CUdeviceptr d_buf) -{ - checkCudaErrors(cuMemFree(d_buf)); -} -void memcpyD2H(void * h_buf, CUdeviceptr d_buf, const size_t size) -{ - checkCudaErrors(cuMemcpyDtoH(h_buf, d_buf, size)); -} -void memcpyH2D(CUdeviceptr d_buf, void * h_buf, const size_t size) -{ - checkCudaErrors(cuMemcpyHtoD(d_buf, h_buf, size)); -} -#define deviceLaunch(func,nbx,nby,nbz,params) \ - checkCudaErrors(cuFuncSetCacheConfig((func), CU_FUNC_CACHE_PREFER_L1)); \ -checkCudaErrors( \ - cuLaunchKernel( \ - (func), \ - ((nbx-1)/(128/32)+1), (nby), (nbz), \ - 128, 1, 1, \ - 0, NULL, (params), NULL \ - )); - -typedef CUdeviceptr devicePtr; - - -/**************/ -#include -std::vector readBinary(const char * filename) -{ - std::vector buffer; - FILE *fp = fopen(filename, "rb"); - if (!fp ) - { - fprintf(stderr, "file %s not found\n", filename); - assert(0); - } -#if 0 - char c; - while ((c = fgetc(fp)) != EOF) - buffer.push_back(c); -#else - fseek(fp, 0, SEEK_END); - const unsigned long long size = ftell(fp); /*calc the size needed*/ - fseek(fp, 0, SEEK_SET); - buffer.resize(size); - - if (fp == NULL){ /*ERROR detection if file == empty*/ - fprintf(stderr, "Error: There was an Error reading the file %s \n",filename); - exit(1); - } - else if (fread(&buffer[0], sizeof(char), size, fp) != size){ /* if count of read bytes != calculated size of .bin file -> ERROR*/ - fprintf(stderr, "Error: There was an Error reading the file %s \n", filename); - exit(1); - } -#endif - fprintf(stderr, " read buffer of size= %d bytes \n", (int)buffer.size()); - return buffer; -} - -extern "C" -{ - - void *CUDAAlloc(void **handlePtr, int64_t size, int32_t alignment) - { - return NULL; - } - void CUDALaunch( - void **handlePtr, - const char * module_name, - const char * module_1, - const char * func_name, - void **func_args, - int countx, int county, int countz) - { - assert(module_name != NULL); - assert(module_1 != NULL); - assert(func_name != NULL); - assert(func_args != NULL); -#if 1 - const char * module = module_1; -#else - const std::vector module_str = readBinary("kernel.cubin"); - const char * module = &module_str[0]; -#endif - CUmodule cudaModule = loadModule(module); - CUfunction cudaFunction = getFunction(cudaModule, func_name); - deviceLaunch(cudaFunction, countx, county, countz, func_args); - unloadModule(cudaModule); - } - void CUDASync(void *handle) - { - checkCudaErrors(cuStreamSynchronize(0)); - } - void ISPCSync(void *handle) - { - checkCudaErrors(cuStreamSynchronize(0)); - } - void CUDAFree(void *handle) - { - } -} extern void loop_stencil_serial(int t0, int t1, int x0, int x1, @@ -295,9 +114,9 @@ int main() { double dt = get_elapsed_mcycles(); minTimeISPC = 
std::min(minTimeISPC, dt); } -#endif printf("[stencil ispc 1 core]:\t\t[%.3f] million cycles\n", minTimeISPC); +#endif InitData(Nx, Ny, Nz, Aispc, vsq); @@ -310,19 +129,35 @@ int main() { // the minimum time of three runs. // double minTimeISPCTasks = 1e30; + const bool print_log = false; + const int nreg = 128; for (int i = 0; i < 3; ++i) { reset_and_start_timer(); - const double t0 = rtc(); - loop_stencil_ispc_tasks(0, 6, width, Nx - width, width, Ny - width, - width, Nz - width, Nx, Ny, Nz, (double*)d_coeff, (double*)d_vsq, - (double*)d_Aispc0, (double*)d_Aispc1); - double dt = rtc() - t0; //get_elapsed_mcycles(); + const char * func_name = "loop_stencil_ispc_tasks"; + + int t0 = 0; + int t1 = 6; + + int x0 = width; + int x1 = Nx - width; + + int y0 = width; + int y1 = Ny - width; + + int z0 = width; + int z1 = Nz - width; + + void *func_args[] = { + &t0, &t1, + &x0, &x1, &y0, &y1, &z0, &z1, &Nx, &Ny, &Nz, + &d_coeff, &d_vsq, &d_Aispc0, &d_Aispc1}; + double dt = 1e3*CUDALaunch(NULL, func_name, func_args, print_log, nreg); minTimeISPCTasks = std::min(minTimeISPCTasks, dt); } memcpyD2H(Aispc[1], d_Aispc1, bufsize); //memcpyD2H(Aispc[1], d_vsq, bufsize); - printf("[stencil ispc + tasks]:\t\t[%.3f] million cycles\n", minTimeISPCTasks); + fprintf(stderr, "[stencil ispc + tasks]:\t\t[%.3f] million cycles\n", minTimeISPCTasks); InitData(Nx, Ny, Nz, Aserial, vsq); diff --git a/examples_cuda/stencil/stencil_cu.ll b/examples_cuda/stencil/stencil_cu.ll deleted file mode 100644 index 6ea8748c..00000000 --- a/examples_cuda/stencil/stencil_cu.ll +++ /dev/null @@ -1,762 +0,0 @@ -; ModuleID = 'stencil_cu.bc' -target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" -target triple = "x86_64-unknown-linux-gnu" - -; Function Attrs: nounwind -declare i8* @ISPCAlloc(i8**, i64, i32) #0 - -; Function Attrs: nounwind -declare void @ISPCLaunch(i8**, i8*, i8*, i32, i32, i32) #0 - -; Function Attrs: nounwind -declare void @ISPCSync(i8*) #0 - -; Function Attrs: nounwind readnone -declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) #1 - -; Function Attrs: nounwind readonly -declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8*, <4 x double>) #2 - -; Function Attrs: nounwind -declare void @llvm.x86.avx.maskstore.pd.256(i8*, <4 x double>, <4 x double>) #0 - -; Function Attrs: nounwind -define internal fastcc void @stencil_step___uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_(i32 %x0, i32 %x1, i32 %y0, i32 %y1, i32 %z0, i32 %z1, i32 %Nx, i32 %Ny, double* noalias nocapture %coef, double* noalias %vsq, double* noalias %Ain, double* noalias %Aout, <8 x i32> %__mask) #3 { -allocas: - %floatmask.i = bitcast <8 x i32> %__mask to <8 x float> - %v.i = tail call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask.i) #1 - %cmp.i = icmp eq i32 %v.i, 255 - %mul_Nx_load_Ny_load = mul i32 %Ny, %Nx - %coef_load_offset_load = load double* %coef, align 8 - %coef_load18_offset = getelementptr double* %coef, i64 1 - %coef_load18_offset_load = load double* %coef_load18_offset, align 8 - %coef_load21_offset = getelementptr double* %coef, i64 2 - %coef_load21_offset_load = load double* %coef_load21_offset, align 8 - %coef_load24_offset = getelementptr double* %coef, i64 3 - %coef_load24_offset_load = load double* %coef_load24_offset, align 8 - %less_z_load_z1_load260 = icmp slt i32 %z0, %z1 - br i1 %cmp.i, label %for_test.preheader, label %for_test264.preheader - 
-for_test264.preheader: ; preds = %allocas - br i1 %less_z_load_z1_load260, label %for_test275.preheader.lr.ph, label %for_exit - -for_test275.preheader.lr.ph: ; preds = %for_test264.preheader - %less_y_load282_y1_load283264 = icmp slt i32 %y0, %y1 - %less_xb_load293_x1_load294262 = icmp slt i32 %x0, %x1 - %x1_load463_broadcast_init = insertelement <8 x i32> undef, i32 %x1, i32 0 - %x1_load463_broadcast = shufflevector <8 x i32> %x1_load463_broadcast_init, <8 x i32> undef, <8 x i32> zeroinitializer - %mul__Nx_load382 = shl i32 %Nx, 1 - %mul__Nx_load431 = mul i32 %Nx, 3 - %mul__Nx_load390 = mul i32 %Nx, -2 - %mul__Nx_load439 = mul i32 %Nx, -3 - %mul__Nxy_load399 = shl i32 %mul_Nx_load_Ny_load, 1 - %mul__Nxy_load448 = mul i32 %mul_Nx_load_Ny_load, 3 - %mul__Nxy_load407 = mul i32 %mul_Nx_load_Ny_load, -2 - %mul__Nxy_load456 = mul i32 %mul_Nx_load_Ny_load, -3 - %Ain_load327_ptr2int_2void = bitcast double* %Ain to i8* - %mask0.i.i201 = shufflevector <8 x i32> %__mask, <8 x i32> undef, <8 x i32> - %mask1.i.i202 = shufflevector <8 x i32> %__mask, <8 x i32> undef, <8 x i32> - %mask0d.i.i203 = bitcast <8 x i32> %mask0.i.i201 to <4 x double> - %mask1d.i.i204 = bitcast <8 x i32> %mask1.i.i202 to <4 x double> - %coef1_load315_broadcast_init = insertelement <8 x double> undef, double %coef_load18_offset_load, i32 0 - %coef0_load306_broadcast_init = insertelement <8 x double> undef, double %coef_load_offset_load, i32 0 - %coef2_load364_broadcast_init = insertelement <8 x double> undef, double %coef_load21_offset_load, i32 0 - %coef1_load315_broadcast = shufflevector <8 x double> %coef1_load315_broadcast_init, <8 x double> undef, <8 x i32> zeroinitializer - %coef0_load306_broadcast = shufflevector <8 x double> %coef0_load306_broadcast_init, <8 x double> undef, <8 x i32> zeroinitializer - %coef3_load413_broadcast_init = insertelement <8 x double> undef, double %coef_load24_offset_load, i32 0 - %coef2_load364_broadcast = shufflevector <8 x double> %coef2_load364_broadcast_init, <8 x double> undef, <8 x i32> zeroinitializer - %coef3_load413_broadcast = shufflevector <8 x double> %coef3_load413_broadcast_init, <8 x double> undef, <8 x i32> zeroinitializer - %Aout_load488_ptr2int_2void = bitcast double* %Aout to i8* - %vsq_load494_ptr2int_2void = bitcast double* %vsq to i8* - br label %for_test275.preheader - -for_test.preheader: ; preds = %allocas - br i1 %less_z_load_z1_load260, label %for_test30.preheader.lr.ph, label %for_exit - -for_test30.preheader.lr.ph: ; preds = %for_test.preheader - %less_y_load_y1_load258 = icmp slt i32 %y0, %y1 - %less_xb_load_x1_load256 = icmp slt i32 %x0, %x1 - %x1_load199_broadcast_init = insertelement <8 x i32> undef, i32 %x1, i32 0 - %x1_load199_broadcast = shufflevector <8 x i32> %x1_load199_broadcast_init, <8 x i32> undef, <8 x i32> zeroinitializer - %mul__Nx_load119 = shl i32 %Nx, 1 - %mul__Nx_load167 = mul i32 %Nx, 3 - %mul__Nx_load127 = mul i32 %Nx, -2 - %mul__Nx_load175 = mul i32 %Nx, -3 - %mul__Nxy_load136 = shl i32 %mul_Nx_load_Ny_load, 1 - %mul__Nxy_load184 = mul i32 %mul_Nx_load_Ny_load, 3 - %mul__Nxy_load144 = mul i32 %mul_Nx_load_Ny_load, -2 - %mul__Nxy_load192 = mul i32 %mul_Nx_load_Ny_load, -3 - %Ain_load65_ptr2int_2void = bitcast double* %Ain to i8* - %coef1_load_broadcast_init = insertelement <8 x double> undef, double %coef_load18_offset_load, i32 0 - %coef0_load_broadcast_init = insertelement <8 x double> undef, double %coef_load_offset_load, i32 0 - %coef2_load_broadcast_init = insertelement <8 x double> undef, double %coef_load21_offset_load, i32 0 - 
%coef1_load_broadcast = shufflevector <8 x double> %coef1_load_broadcast_init, <8 x double> undef, <8 x i32> zeroinitializer - %coef0_load_broadcast = shufflevector <8 x double> %coef0_load_broadcast_init, <8 x double> undef, <8 x i32> zeroinitializer - %coef3_load_broadcast_init = insertelement <8 x double> undef, double %coef_load24_offset_load, i32 0 - %coef2_load_broadcast = shufflevector <8 x double> %coef2_load_broadcast_init, <8 x double> undef, <8 x i32> zeroinitializer - %coef3_load_broadcast = shufflevector <8 x double> %coef3_load_broadcast_init, <8 x double> undef, <8 x i32> zeroinitializer - %Aout_load219_ptr2int_2void = bitcast double* %Aout to i8* - %vsq_load_ptr2int_2void = bitcast double* %vsq to i8* - br label %for_test30.preheader - -for_test30.preheader: ; preds = %for_exit33, %for_test30.preheader.lr.ph - %z.0261 = phi i32 [ %z0, %for_test30.preheader.lr.ph ], [ %z_load242_plus1, %for_exit33 ] - br i1 %less_y_load_y1_load258, label %for_test37.preheader.lr.ph, label %for_exit33 - -for_test37.preheader.lr.ph: ; preds = %for_test30.preheader - %mul_z_load45_Nxy_load = mul i32 %z.0261, %mul_Nx_load_Ny_load - br i1 %less_xb_load_x1_load256, label %for_loop39.lr.ph.us, label %for_exit33 - -for_exit40.us: ; preds = %safe_if_after_true.us - %y_load241_plus1.us = add i32 %y.0259.us, 1 - %exitcond = icmp eq i32 %y_load241_plus1.us, %y1 - br i1 %exitcond, label %for_exit33, label %for_loop39.lr.ph.us - -for_loop39.us: ; preds = %for_loop39.lr.ph.us, %safe_if_after_true.us - %xb.0257.us = phi i32 [ %x0, %for_loop39.lr.ph.us ], [ %add_xb_load240_.us, %safe_if_after_true.us ] - %xb_load44_broadcast_init.us = insertelement <8 x i32> undef, i32 %xb.0257.us, i32 0 - %xb_load44_broadcast.us = shufflevector <8 x i32> %xb_load44_broadcast_init.us, <8 x i32> undef, <8 x i32> zeroinitializer - %add_xb_load44_broadcast_.us = add <8 x i32> %xb_load44_broadcast.us, - %less_x_load198_x1_load199_broadcast.us = icmp slt <8 x i32> %add_xb_load44_broadcast_.us, %x1_load199_broadcast - %"oldMask&test.us" = select <8 x i1> %less_x_load198_x1_load199_broadcast.us, <8 x i32> , <8 x i32> zeroinitializer - %floatmask.i244.us = bitcast <8 x i32> %"oldMask&test.us" to <8 x float> - %v.i245.us = tail call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask.i244.us) #1 - %cmp.i246.us = icmp eq i32 %v.i245.us, 0 - br i1 %cmp.i246.us, label %safe_if_after_true.us, label %safe_if_run_true.us - -safe_if_run_true.us: ; preds = %for_loop39.us - %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast.elt0.us = add i32 %xb.0257.us, %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.us - %scaled_varying.elt0.us = shl i32 %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast.elt0.us, 3 - %"varying+const_offsets.elt0.us" = add i32 %scaled_varying.elt0.us, -8 - %0 = sext i32 %"varying+const_offsets.elt0.us" to i64 - %ptr.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %0, !filename !0, !first_line !1, !first_column !2, !last_line !1, !last_column !3 - %ptr_cast_for_load.us = bitcast i8* %ptr.us to <8 x double>* - %ptr_masked_load521.us = load <8 x double>* %ptr_cast_for_load.us, align 8, !filename !0, !first_line !1, !first_column !2, !last_line !1, !last_column !3 - %"varying+const_offsets529.elt0.us" = add i32 %scaled_varying.elt0.us, 8 - %1 = sext i32 %"varying+const_offsets529.elt0.us" to i64 - %ptr530.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %1, !filename !0, !first_line !1, !first_column !4, !last_line !1, !last_column !5 - 
%ptr_cast_for_load531.us = bitcast i8* %ptr530.us to <8 x double>* - %ptr530_masked_load532.us = load <8 x double>* %ptr_cast_for_load531.us, align 8, !filename !0, !first_line !1, !first_column !4, !last_line !1, !last_column !5 - %"varying+const_offsets540.elt0.us" = add i32 %scaled_varying.elt0.us, -16 - %2 = sext i32 %"varying+const_offsets540.elt0.us" to i64 - %ptr541.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %2, !filename !0, !first_line !6, !first_column !2, !last_line !6, !last_column !3 - %ptr_cast_for_load542.us = bitcast i8* %ptr541.us to <8 x double>* - %ptr541_masked_load543.us = load <8 x double>* %ptr_cast_for_load542.us, align 8, !filename !0, !first_line !6, !first_column !2, !last_line !6, !last_column !3 - %"varying+const_offsets551.elt0.us" = add i32 %scaled_varying.elt0.us, 16 - %3 = sext i32 %"varying+const_offsets551.elt0.us" to i64 - %ptr552.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %3, !filename !0, !first_line !6, !first_column !4, !last_line !6, !last_column !5 - %ptr_cast_for_load553.us = bitcast i8* %ptr552.us to <8 x double>* - %ptr552_masked_load554.us = load <8 x double>* %ptr_cast_for_load553.us, align 8, !filename !0, !first_line !6, !first_column !4, !last_line !6, !last_column !5 - %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast556_mul__Nx_load71_broadcast.elt0.us = add i32 %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast556.elt0.us, %xb.0257.us - %scaled_varying560.elt0.us = shl i32 %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast556_mul__Nx_load71_broadcast.elt0.us, 3 - %4 = sext i32 %scaled_varying560.elt0.us to i64 - %ptr562.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %4, !filename !0, !first_line !2, !first_column !7, !last_line !2, !last_column !8 - %ptr_cast_for_load563.us = bitcast i8* %ptr562.us to <8 x double>* - %ptr562_masked_load564.us = load <8 x double>* %ptr_cast_for_load563.us, align 8, !filename !0, !first_line !2, !first_column !7, !last_line !2, !last_column !8 - %add_Ain_load57_offset_load_Ain_load65_offset_load.us = fadd <8 x double> %ptr_masked_load521.us, %ptr530_masked_load532.us - %"varying+const_offsets572.elt0.us" = add i32 %scaled_varying.elt0.us, -24 - %5 = sext i32 %"varying+const_offsets572.elt0.us" to i64 - %ptr573.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %5, !filename !0, !first_line !9, !first_column !2, !last_line !9, !last_column !3 - %ptr_cast_for_load574.us = bitcast i8* %ptr573.us to <8 x double>* - %ptr573_masked_load575.us = load <8 x double>* %ptr_cast_for_load574.us, align 8, !filename !0, !first_line !9, !first_column !2, !last_line !9, !last_column !3 - %"varying+const_offsets583.elt0.us" = add i32 %scaled_varying.elt0.us, 24 - %6 = sext i32 %"varying+const_offsets583.elt0.us" to i64 - %ptr584.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %6, !filename !0, !first_line !9, !first_column !4, !last_line !9, !last_column !5 - %ptr_cast_for_load585.us = bitcast i8* %ptr584.us to <8 x double>* - %ptr584_masked_load586.us = load <8 x double>* %ptr_cast_for_load585.us, align 8, !filename !0, !first_line !9, !first_column !4, !last_line !9, !last_column !5 - %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast588_mul__Nx_load119_broadcast.elt0.us = add i32 %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast588.elt0.us, %xb.0257.us - %scaled_varying593.elt0.us = shl i32 
%add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast588_mul__Nx_load119_broadcast.elt0.us, 3 - %7 = sext i32 %scaled_varying593.elt0.us to i64 - %ptr595.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %7, !filename !0, !first_line !10, !first_column !11, !last_line !10, !last_column !1 - %ptr_cast_for_load596.us = bitcast i8* %ptr595.us to <8 x double>* - %ptr595_masked_load597.us = load <8 x double>* %ptr_cast_for_load596.us, align 8, !filename !0, !first_line !10, !first_column !11, !last_line !10, !last_column !1 - %add_Ain_load105_offset_load_Ain_load113_offset_load.us = fadd <8 x double> %ptr541_masked_load543.us, %ptr552_masked_load554.us - %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast599_mul__Nx_load79_broadcast.elt0.us = add i32 %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast599.elt0.us, %xb.0257.us - %scaled_varying604.elt0.us = shl i32 %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast599_mul__Nx_load79_broadcast.elt0.us, 3 - %8 = sext i32 %scaled_varying604.elt0.us to i64 - %ptr606.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %8, !filename !0, !first_line !2, !first_column !12, !last_line !2, !last_column !13 - %ptr_cast_for_load607.us = bitcast i8* %ptr606.us to <8 x double>* - %ptr606_masked_load608.us = load <8 x double>* %ptr_cast_for_load607.us, align 8, !filename !0, !first_line !2, !first_column !12, !last_line !2, !last_column !13 - %add_add_Ain_load57_offset_load_Ain_load65_offset_load_Ain_load73_offset_load.us = fadd <8 x double> %add_Ain_load57_offset_load_Ain_load65_offset_load.us, %ptr562_masked_load564.us - %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast610_mul__Nx_load167_broadcast.elt0.us = add i32 %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast610.elt0.us, %xb.0257.us - %scaled_varying615.elt0.us = shl i32 %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast610_mul__Nx_load167_broadcast.elt0.us, 3 - %9 = sext i32 %scaled_varying615.elt0.us to i64 - %ptr617.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %9, !filename !0, !first_line !14, !first_column !11, !last_line !14, !last_column !1 - %ptr_cast_for_load618.us = bitcast i8* %ptr617.us to <8 x double>* - %ptr617_masked_load619.us = load <8 x double>* %ptr_cast_for_load618.us, align 8, !filename !0, !first_line !14, !first_column !11, !last_line !14, !last_column !1 - %add_Ain_load153_offset_load_Ain_load161_offset_load.us = fadd <8 x double> %ptr573_masked_load575.us, %ptr584_masked_load586.us - %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast621_mul__Nx_load127_broadcast.elt0.us = add i32 %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast621.elt0.us, %xb.0257.us - %scaled_varying626.elt0.us = shl i32 %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast621_mul__Nx_load127_broadcast.elt0.us, 3 - %10 = sext i32 %scaled_varying626.elt0.us to i64 - %ptr628.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %10, !filename !0, !first_line !10, !first_column !6, !last_line !10, !last_column !15 - %ptr_cast_for_load629.us = bitcast i8* %ptr628.us to <8 x double>* - %ptr628_masked_load630.us = load <8 x double>* %ptr_cast_for_load629.us, align 8, !filename !0, !first_line !10, !first_column !6, !last_line !10, !last_column !15 - 
%add_add_Ain_load105_offset_load_Ain_load113_offset_load_Ain_load121_offset_load.us = fadd <8 x double> %add_Ain_load105_offset_load_Ain_load113_offset_load.us, %ptr595_masked_load597.us - %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast632_mul__Nxy_load88_broadcast.elt0.us = add i32 %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast632.elt0.us, %xb.0257.us - %scaled_varying637.elt0.us = shl i32 %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast632_mul__Nxy_load88_broadcast.elt0.us, 3 - %11 = sext i32 %scaled_varying637.elt0.us to i64 - %ptr639.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %11, !filename !0, !first_line !12, !first_column !11, !last_line !12, !last_column !1 - %ptr_cast_for_load640.us = bitcast i8* %ptr639.us to <8 x double>* - %ptr639_masked_load641.us = load <8 x double>* %ptr_cast_for_load640.us, align 8, !filename !0, !first_line !12, !first_column !11, !last_line !12, !last_column !1 - %add_add_add_Ain_load57_offset_load_Ain_load65_offset_load_Ain_load73_offset_load_Ain_load81_offset_load.us = fadd <8 x double> %add_add_Ain_load57_offset_load_Ain_load65_offset_load_Ain_load73_offset_load.us, %ptr606_masked_load608.us - %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast643_mul__Nx_load175_broadcast.elt0.us = add i32 %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast643.elt0.us, %xb.0257.us - %scaled_varying648.elt0.us = shl i32 %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast643_mul__Nx_load175_broadcast.elt0.us, 3 - %12 = sext i32 %scaled_varying648.elt0.us to i64 - %ptr650.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %12, !filename !0, !first_line !14, !first_column !6, !last_line !14, !last_column !15 - %ptr_cast_for_load651.us = bitcast i8* %ptr650.us to <8 x double>* - %ptr650_masked_load652.us = load <8 x double>* %ptr_cast_for_load651.us, align 8, !filename !0, !first_line !14, !first_column !6, !last_line !14, !last_column !15 - %add_add_Ain_load153_offset_load_Ain_load161_offset_load_Ain_load169_offset_load.us = fadd <8 x double> %add_Ain_load153_offset_load_Ain_load161_offset_load.us, %ptr617_masked_load619.us - %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast654_mul__Nxy_load136_broadcast.elt0.us = add i32 %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast654.elt0.us, %xb.0257.us - %scaled_varying659.elt0.us = shl i32 %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast654_mul__Nxy_load136_broadcast.elt0.us, 3 - %13 = sext i32 %scaled_varying659.elt0.us to i64 - %ptr661.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %13, !filename !0, !first_line !16, !first_column !11, !last_line !16, !last_column !1 - %ptr_cast_for_load662.us = bitcast i8* %ptr661.us to <8 x double>* - %ptr661_masked_load663.us = load <8 x double>* %ptr_cast_for_load662.us, align 8, !filename !0, !first_line !16, !first_column !11, !last_line !16, !last_column !1 - %add_add_add_Ain_load105_offset_load_Ain_load113_offset_load_Ain_load121_offset_load_Ain_load129_offset_load.us = fadd <8 x double> %add_add_Ain_load105_offset_load_Ain_load113_offset_load_Ain_load121_offset_load.us, %ptr628_masked_load630.us - %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast665_mul__Nxy_load96_broadcast.elt0.us = add i32 
%add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast665.elt0.us, %xb.0257.us - %scaled_varying670.elt0.us = shl i32 %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast665_mul__Nxy_load96_broadcast.elt0.us, 3 - %14 = sext i32 %scaled_varying670.elt0.us to i64 - %ptr672.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %14, !filename !0, !first_line !12, !first_column !6, !last_line !12, !last_column !15 - %ptr_cast_for_load673.us = bitcast i8* %ptr672.us to <8 x double>* - %ptr672_masked_load674.us = load <8 x double>* %ptr_cast_for_load673.us, align 8, !filename !0, !first_line !12, !first_column !6, !last_line !12, !last_column !15 - %add_add_add_add_Ain_load57_offset_load_Ain_load65_offset_load_Ain_load73_offset_load_Ain_load81_offset_load_Ain_load89_offset_load.us = fadd <8 x double> %add_add_add_Ain_load57_offset_load_Ain_load65_offset_load_Ain_load73_offset_load_Ain_load81_offset_load.us, %ptr639_masked_load641.us - %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast676_mul__Nxy_load184_broadcast.elt0.us = add i32 %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast676.elt0.us, %xb.0257.us - %scaled_varying681.elt0.us = shl i32 %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast676_mul__Nxy_load184_broadcast.elt0.us, 3 - %15 = sext i32 %scaled_varying681.elt0.us to i64 - %ptr683.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %15, !filename !0, !first_line !17, !first_column !11, !last_line !17, !last_column !1 - %ptr_cast_for_load684.us = bitcast i8* %ptr683.us to <8 x double>* - %ptr683_masked_load685.us = load <8 x double>* %ptr_cast_for_load684.us, align 8, !filename !0, !first_line !17, !first_column !11, !last_line !17, !last_column !1 - %add_add_add_Ain_load153_offset_load_Ain_load161_offset_load_Ain_load169_offset_load_Ain_load177_offset_load.us = fadd <8 x double> %add_add_Ain_load153_offset_load_Ain_load161_offset_load_Ain_load169_offset_load.us, %ptr650_masked_load652.us - %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast687_mul__Nxy_load144_broadcast.elt0.us = add i32 %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast687.elt0.us, %xb.0257.us - %scaled_varying692.elt0.us = shl i32 %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast687_mul__Nxy_load144_broadcast.elt0.us, 3 - %16 = sext i32 %scaled_varying692.elt0.us to i64 - %ptr694.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %16, !filename !0, !first_line !16, !first_column !6, !last_line !16, !last_column !15 - %ptr_cast_for_load695.us = bitcast i8* %ptr694.us to <8 x double>* - %ptr694_masked_load696.us = load <8 x double>* %ptr_cast_for_load695.us, align 8, !filename !0, !first_line !16, !first_column !6, !last_line !16, !last_column !15 - %add_add_add_add_Ain_load105_offset_load_Ain_load113_offset_load_Ain_load121_offset_load_Ain_load129_offset_load_Ain_load137_offset_load.us = fadd <8 x double> %add_add_add_Ain_load105_offset_load_Ain_load113_offset_load_Ain_load121_offset_load_Ain_load129_offset_load.us, %ptr661_masked_load663.us - %add_add_add_add_add_Ain_load57_offset_load_Ain_load65_offset_load_Ain_load73_offset_load_Ain_load81_offset_load_Ain_load89_offset_load_Ain_load97_offset_load.us = fadd <8 x double> 
%add_add_add_add_Ain_load57_offset_load_Ain_load65_offset_load_Ain_load73_offset_load_Ain_load81_offset_load_Ain_load89_offset_load.us, %ptr672_masked_load674.us - %17 = sext i32 %scaled_varying.elt0.us to i64 - %ptr705.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %17, !filename !0, !first_line !8, !first_column !18, !last_line !8, !last_column !19 - %ptr_cast_for_load706.us = bitcast i8* %ptr705.us to <8 x double>* - %ptr705_masked_load707.us = load <8 x double>* %ptr_cast_for_load706.us, align 8, !filename !0, !first_line !8, !first_column !18, !last_line !8, !last_column !19 - %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast709_mul__Nxy_load192_broadcast.elt0.us = add i32 %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast709.elt0.us, %xb.0257.us - %scaled_varying714.elt0.us = shl i32 %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast709_mul__Nxy_load192_broadcast.elt0.us, 3 - %18 = sext i32 %scaled_varying714.elt0.us to i64 - %ptr716.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %18, !filename !0, !first_line !17, !first_column !6, !last_line !17, !last_column !15 - %ptr_cast_for_load717.us = bitcast i8* %ptr716.us to <8 x double>* - %ptr716_masked_load718.us = load <8 x double>* %ptr_cast_for_load717.us, align 8, !filename !0, !first_line !17, !first_column !6, !last_line !17, !last_column !15 - %add_add_add_add_Ain_load153_offset_load_Ain_load161_offset_load_Ain_load169_offset_load_Ain_load177_offset_load_Ain_load185_offset_load.us = fadd <8 x double> %add_add_add_Ain_load153_offset_load_Ain_load161_offset_load_Ain_load169_offset_load_Ain_load177_offset_load.us, %ptr683_masked_load685.us - %add_add_add_add_add_Ain_load105_offset_load_Ain_load113_offset_load_Ain_load121_offset_load_Ain_load129_offset_load_Ain_load137_offset_load_Ain_load145_offset_load.us = fadd <8 x double> %add_add_add_add_Ain_load105_offset_load_Ain_load113_offset_load_Ain_load121_offset_load_Ain_load129_offset_load_Ain_load137_offset_load.us, %ptr694_masked_load696.us - %mul_coef1_load_broadcast_add_add_add_add_add_Ain_load57_offset_load_Ain_load65_offset_load_Ain_load73_offset_load_Ain_load81_offset_load_Ain_load89_offset_load_Ain_load97_offset_load.us = fmul <8 x double> %coef1_load_broadcast, %add_add_add_add_add_Ain_load57_offset_load_Ain_load65_offset_load_Ain_load73_offset_load_Ain_load81_offset_load_Ain_load89_offset_load_Ain_load97_offset_load.us - %mul_coef0_load_broadcast_Ain_load_offset_load.us = fmul <8 x double> %coef0_load_broadcast, %ptr705_masked_load707.us - %add_add_add_add_add_Ain_load153_offset_load_Ain_load161_offset_load_Ain_load169_offset_load_Ain_load177_offset_load_Ain_load185_offset_load_Ain_load193_offset_load.us = fadd <8 x double> %add_add_add_add_Ain_load153_offset_load_Ain_load161_offset_load_Ain_load169_offset_load_Ain_load177_offset_load_Ain_load185_offset_load.us, %ptr716_masked_load718.us - %mul_coef2_load_broadcast_add_add_add_add_add_Ain_load105_offset_load_Ain_load113_offset_load_Ain_load121_offset_load_Ain_load129_offset_load_Ain_load137_offset_load_Ain_load145_offset_load.us = fmul <8 x double> %coef2_load_broadcast, %add_add_add_add_add_Ain_load105_offset_load_Ain_load113_offset_load_Ain_load121_offset_load_Ain_load129_offset_load_Ain_load137_offset_load_Ain_load145_offset_load.us - 
%add_mul_coef0_load_broadcast_Ain_load_offset_load_mul_coef1_load_broadcast_add_add_add_add_add_Ain_load57_offset_load_Ain_load65_offset_load_Ain_load73_offset_load_Ain_load81_offset_load_Ain_load89_offset_load_Ain_load97_offset_load.us = fadd <8 x double> %mul_coef1_load_broadcast_add_add_add_add_add_Ain_load57_offset_load_Ain_load65_offset_load_Ain_load73_offset_load_Ain_load81_offset_load_Ain_load89_offset_load_Ain_load97_offset_load.us, %mul_coef0_load_broadcast_Ain_load_offset_load.us - %mul_coef3_load_broadcast_add_add_add_add_add_Ain_load153_offset_load_Ain_load161_offset_load_Ain_load169_offset_load_Ain_load177_offset_load_Ain_load185_offset_load_Ain_load193_offset_load.us = fmul <8 x double> %coef3_load_broadcast, %add_add_add_add_add_Ain_load153_offset_load_Ain_load161_offset_load_Ain_load169_offset_load_Ain_load177_offset_load_Ain_load185_offset_load_Ain_load193_offset_load.us - %add_add_mul_coef0_load_broadcast_Ain_load_offset_load_mul_coef1_load_broadcast_add_add_add_add_add_Ain_load57_offset_load_Ain_load65_offset_load_Ain_load73_offset_load_Ain_load81_offset_load_Ain_load89_offset_load_Ain_load97_offset_load_mul_coef2_load_broadcast_add_add_add_add_add_Ain_load105_offset_load_Ain_load113_offset_load_Ain_load121_offset_load_Ain_load129_offset_load_Ain_load137_offset_load_Ain_load145_offset_load.us = fadd <8 x double> %mul_coef2_load_broadcast_add_add_add_add_add_Ain_load105_offset_load_Ain_load113_offset_load_Ain_load121_offset_load_Ain_load129_offset_load_Ain_load137_offset_load_Ain_load145_offset_load.us, %add_mul_coef0_load_broadcast_Ain_load_offset_load_mul_coef1_load_broadcast_add_add_add_add_add_Ain_load57_offset_load_Ain_load65_offset_load_Ain_load73_offset_load_Ain_load81_offset_load_Ain_load89_offset_load_Ain_load97_offset_load.us - %add_add_add_mul_coef0_load_broadcast_Ain_load_offset_load_mul_coef1_load_broadcast_add_add_add_add_add_Ain_load57_offset_load_Ain_load65_offset_load_Ain_load73_offset_load_Ain_load81_offset_load_Ain_load89_offset_load_Ain_load97_offset_load_mul_coef2_load_broadcast_add_add_add_add_add_Ain_load105_offset_load_Ain_load113_offset_load_Ain_load121_offset_load_Ain_load129_offset_load_Ain_load137_offset_load_Ain_load145_offset_load_mul_coef3_load_broadcast_add_add_add_add_add_Ain_load153_offset_load_Ain_load161_offset_load_Ain_load169_offset_load_Ain_load177_offset_load_Ain_load185_offset_load_Ain_load193_offset_load.us = fadd <8 x double> %add_add_mul_coef0_load_broadcast_Ain_load_offset_load_mul_coef1_load_broadcast_add_add_add_add_add_Ain_load57_offset_load_Ain_load65_offset_load_Ain_load73_offset_load_Ain_load81_offset_load_Ain_load89_offset_load_Ain_load97_offset_load_mul_coef2_load_broadcast_add_add_add_add_add_Ain_load105_offset_load_Ain_load113_offset_load_Ain_load121_offset_load_Ain_load129_offset_load_Ain_load137_offset_load_Ain_load145_offset_load.us, %mul_coef3_load_broadcast_add_add_add_add_add_Ain_load153_offset_load_Ain_load161_offset_load_Ain_load169_offset_load_Ain_load177_offset_load_Ain_load185_offset_load_Ain_load193_offset_load.us - %mask0.i.i234.us = shufflevector <8 x i32> %"oldMask&test.us", <8 x i32> undef, <8 x i32> - %mask1.i.i235.us = shufflevector <8 x i32> %"oldMask&test.us", <8 x i32> undef, <8 x i32> - %mask0d.i.i236.us = bitcast <8 x i32> %mask0.i.i234.us to <4 x double> - %mask1d.i.i237.us = bitcast <8 x i32> %mask1.i.i235.us to <4 x double> - %val0d.i.i238.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr705.us, <4 x double> %mask0d.i.i236.us) #0 - %ptr727.sum.us = add i64 %17, 32 - 
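The mask0.i.i/mask1.i.i shufflevector pair that follows splits the 8-lane i32 execution mask in two before each gang-sized memory access; the literal lane-index constants were lost in this listing, but the <8 x i32>-to-<4 x double> bitcasts imply each 32-bit mask lane is widened to fill a 64-bit lane (presumably <0,0,1,1,2,2,3,3> and <4,4,5,5,6,6,7,7>). AVX has no 512-bit registers, so one gang of eight doubles becomes two llvm.x86.avx.maskload.pd.256 calls, the upper half at base + 32 bytes (%ptr727.sum.us). A sketch with AVX intrinsics, assuming the masks have already been widened to 64-bit lanes:

    #include <immintrin.h>

    /* Illustrative helper: load 8 masked doubles as two 256-bit halves,
     * mirroring the maskload.pd.256 pairs in the IR above. */
    static inline void gang_masked_load(const double *p,
                                        __m256i mask_lo, __m256i mask_hi,
                                        __m256d out[2])
    {
        out[0] = _mm256_maskload_pd(p,     mask_lo);  /* lanes 0..3 */
        out[1] = _mm256_maskload_pd(p + 4, mask_hi);  /* lanes 4..7: the +32-byte GEP */
    }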
%ptr1.i.i239.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %ptr727.sum.us - %val1d.i.i240.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i239.us, <4 x double> %mask1d.i.i237.us) #0 - %vald.i.i241.us = shufflevector <4 x double> %val0d.i.i238.us, <4 x double> %val1d.i.i240.us, <8 x i32> - %mul__Ain_load211_offset_load.us = fmul <8 x double> %vald.i.i241.us, - %ptr736.us = getelementptr i8* %Aout_load219_ptr2int_2void, i64 %17, !filename !0, !first_line !20, !first_column !21, !last_line !20, !last_column !22 - %val0d.i.i228.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr736.us, <4 x double> %mask0d.i.i236.us) #0 - %ptr1.i.i229.us = getelementptr i8* %Aout_load219_ptr2int_2void, i64 %ptr727.sum.us - %val1d.i.i230.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i229.us, <4 x double> %mask1d.i.i237.us) #0 - %vald.i.i231.us = shufflevector <4 x double> %val0d.i.i228.us, <4 x double> %val1d.i.i230.us, <8 x i32> - %sub_mul__Ain_load211_offset_load_Aout_load219_offset_load.us = fsub <8 x double> %mul__Ain_load211_offset_load.us, %vald.i.i231.us - %ptr745.us = getelementptr i8* %vsq_load_ptr2int_2void, i64 %17, !filename !0, !first_line !23, !first_column !24, !last_line !23, !last_column !7 - %val0d.i.i218.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr745.us, <4 x double> %mask0d.i.i236.us) #0 - %ptr1.i.i219.us = getelementptr i8* %vsq_load_ptr2int_2void, i64 %ptr727.sum.us - %val1d.i.i220.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i219.us, <4 x double> %mask1d.i.i237.us) #0 - %vald.i.i221.us = shufflevector <4 x double> %val0d.i.i218.us, <4 x double> %val1d.i.i220.us, <8 x i32> - %mul_vsq_load_offset_load_div_load.us = fmul <8 x double> %add_add_add_mul_coef0_load_broadcast_Ain_load_offset_load_mul_coef1_load_broadcast_add_add_add_add_add_Ain_load57_offset_load_Ain_load65_offset_load_Ain_load73_offset_load_Ain_load81_offset_load_Ain_load89_offset_load_Ain_load97_offset_load_mul_coef2_load_broadcast_add_add_add_add_add_Ain_load105_offset_load_Ain_load113_offset_load_Ain_load121_offset_load_Ain_load129_offset_load_Ain_load137_offset_load_Ain_load145_offset_load_mul_coef3_load_broadcast_add_add_add_add_add_Ain_load153_offset_load_Ain_load161_offset_load_Ain_load169_offset_load_Ain_load177_offset_load_Ain_load185_offset_load_Ain_load193_offset_load.us, %vald.i.i221.us - %add_sub_mul__Ain_load211_offset_load_Aout_load219_offset_load_mul_vsq_load_offset_load_div_load.us = fadd <8 x double> %sub_mul__Ain_load211_offset_load_Aout_load219_offset_load.us, %mul_vsq_load_offset_load_div_load.us - %val0.i.i.us = shufflevector <8 x double> %add_sub_mul__Ain_load211_offset_load_Aout_load219_offset_load_mul_vsq_load_offset_load_div_load.us, <8 x double> undef, <4 x i32> - %val1.i.i.us = shufflevector <8 x double> %add_sub_mul__Ain_load211_offset_load_Aout_load219_offset_load_mul_vsq_load_offset_load_div_load.us, <8 x double> undef, <4 x i32> - call void @llvm.x86.avx.maskstore.pd.256(i8* %ptr736.us, <4 x double> %mask0d.i.i236.us, <4 x double> %val0.i.i.us) #0 - call void @llvm.x86.avx.maskstore.pd.256(i8* %ptr1.i.i229.us, <4 x double> %mask1d.i.i237.us, <4 x double> %val1.i.i.us) #0 - br label %safe_if_after_true.us - -safe_if_after_true.us: ; preds = %safe_if_run_true.us, %for_loop39.us - %add_xb_load240_.us = add i32 %xb.0257.us, 8 - %less_xb_load_x1_load.us = icmp slt i32 %add_xb_load240_.us, %x1 - br i1 %less_xb_load_x1_load.us, label %for_loop39.us, label %for_exit40.us - 
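Read back from the fmul/fadd chains, the loop body that just closed computes one gang of a radius-3 stencil followed by a leapfrog time update: div = coef0*Ain[i] plus coef1..coef3 times the six neighbors at distance 1..3 along x, y (offsets of ±Nx) and z (±Nxy), then Aout[i] = 2*Ain[i] - Aout[i] + vsq[i]*div. The splat operand of the final fmul on Ain was dropped from this listing, but the x86 listing later in this diff retains the matching constant-pool entry (.LCPI0_2, double 2.0). A scalar C reference of that update, reconstructed from the IR rather than taken from stencil.ispc:

    /* Scalar sketch of the per-point update the 8-wide loop body performs. */
    static void stencil_point(int i, int Nx, int Nxy, const double coef[4],
                              const double vsq[], const double Ain[], double Aout[])
    {
        double div = coef[0] * Ain[i];
        for (int r = 1; r <= 3; ++r)
            div += coef[r] * (Ain[i + r]       + Ain[i - r] +        /* x +/- r */
                              Ain[i + r * Nx]  + Ain[i - r * Nx] +   /* y +/- r */
                              Ain[i + r * Nxy] + Ain[i - r * Nxy]);  /* z +/- r */
        /* Leapfrog step: the fmul-by-2.0 / fsub / fadd tail of the body. */
        Aout[i] = 2.0 * Ain[i] - Aout[i] + vsq[i] * div;
    }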
-for_loop39.lr.ph.us: ; preds = %for_exit40.us, %for_test37.preheader.lr.ph - %y.0259.us = phi i32 [ %y_load241_plus1.us, %for_exit40.us ], [ %y0, %for_test37.preheader.lr.ph ] - %mul_y_load46_Nx_load47.us = mul i32 %y.0259.us, %Nx - %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.us = add i32 %mul_y_load46_Nx_load47.us, %mul_z_load45_Nxy_load - %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast556.elt0.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.us, %Nx - %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast588.elt0.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.us, %mul__Nx_load119 - %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast599.elt0.us = sub i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.us, %Nx - %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast610.elt0.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.us, %mul__Nx_load167 - %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast621.elt0.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.us, %mul__Nx_load127 - %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast632.elt0.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.us, %mul_Nx_load_Ny_load - %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast643.elt0.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.us, %mul__Nx_load175 - %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast654.elt0.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.us, %mul__Nxy_load136 - %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast665.elt0.us = sub i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.us, %mul_Nx_load_Ny_load - %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast676.elt0.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.us, %mul__Nxy_load184 - %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast687.elt0.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.us, %mul__Nxy_load144 - %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast709.elt0.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.us, %mul__Nxy_load192 - br label %for_loop39.us - -for_exit: ; preds = %for_exit278, %for_exit33, %for_test.preheader, %for_test264.preheader - ret void - -for_exit33: ; preds = %for_exit40.us, %for_test37.preheader.lr.ph, %for_test30.preheader - %z_load242_plus1 = add i32 %z.0261, 1 - %exitcond269 = icmp eq i32 %z_load242_plus1, %z1 - br i1 %exitcond269, label %for_exit, label %for_test30.preheader - -for_test275.preheader: ; preds = %for_exit278, %for_test275.preheader.lr.ph - %z269.0268 = phi i32 [ %z0, %for_test275.preheader.lr.ph ], [ %z_load518_plus1, %for_exit278 ] - br i1 %less_y_load282_y1_load283264, label %for_test286.preheader.lr.ph, label %for_exit278 - -for_test286.preheader.lr.ph: ; preds = %for_test275.preheader - %mul_z_load300_Nxy_load301 = mul i32 %z269.0268, %mul_Nx_load_Ny_load - br i1 %less_xb_load293_x1_load294262, label %for_loop288.lr.ph.us, label %for_exit278 - -for_exit289.us: ; preds = %safe_if_after_true466.us - %y_load517_plus1.us = add i32 %y280.0265.us, 1 - %exitcond271 = icmp eq i32 %y_load517_plus1.us, %y1 - br i1 %exitcond271, label %for_exit278, label 
%for_loop288.lr.ph.us - -for_loop288.us: ; preds = %for_loop288.lr.ph.us, %safe_if_after_true466.us - %xb291.0263.us = phi i32 [ %x0, %for_loop288.lr.ph.us ], [ %add_xb291_load_.us, %safe_if_after_true466.us ] - %xb_load298_broadcast_init.us = insertelement <8 x i32> undef, i32 %xb291.0263.us, i32 0 - %xb_load298_broadcast.us = shufflevector <8 x i32> %xb_load298_broadcast_init.us, <8 x i32> undef, <8 x i32> zeroinitializer - %add_xb_load298_broadcast_.us = add <8 x i32> %xb_load298_broadcast.us, - %less_x_load462_x1_load463_broadcast.us = icmp slt <8 x i32> %add_xb_load298_broadcast_.us, %x1_load463_broadcast - %"oldMask&test468.us" = select <8 x i1> %less_x_load462_x1_load463_broadcast.us, <8 x i32> , <8 x i32> zeroinitializer - %"internal_mask&function_mask472.us" = and <8 x i32> %"oldMask&test468.us", %__mask - %floatmask.i211.us = bitcast <8 x i32> %"internal_mask&function_mask472.us" to <8 x float> - %v.i212.us = tail call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask.i211.us) #1 - %cmp.i213.us = icmp eq i32 %v.i212.us, 0 - br i1 %cmp.i213.us, label %safe_if_after_true466.us, label %safe_if_run_true467.us - -safe_if_run_true467.us: ; preds = %for_loop288.us - %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast.elt0.us = add i32 %xb291.0263.us, %add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303.us - %scaled_varying757.elt0.us = shl i32 %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast.elt0.us, 3 - %"varying+const_offsets.elt0758.us" = add i32 %scaled_varying757.elt0.us, -8 - %19 = sext i32 %"varying+const_offsets.elt0758.us" to i64 - %ptr759.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %19, !filename !0, !first_line !1, !first_column !2, !last_line !1, !last_column !3 - %val0d.i.i205.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr759.us, <4 x double> %mask0d.i.i203) #0 - %ptr759.sum.us = add i64 %19, 32 - %ptr1.i.i206.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr759.sum.us - %val1d.i.i207.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i206.us, <4 x double> %mask1d.i.i204) #0 - %vald.i.i208.us = shufflevector <4 x double> %val0d.i.i205.us, <4 x double> %val1d.i.i207.us, <8 x i32> - %"varying+const_offsets767.elt0.us" = add i32 %scaled_varying757.elt0.us, 8 - %20 = sext i32 %"varying+const_offsets767.elt0.us" to i64 - %ptr768.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %20, !filename !0, !first_line !1, !first_column !4, !last_line !1, !last_column !5 - %val0d.i.i195.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr768.us, <4 x double> %mask0d.i.i203) #0 - %ptr768.sum.us = add i64 %20, 32 - %ptr1.i.i196.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr768.sum.us - %val1d.i.i197.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i196.us, <4 x double> %mask1d.i.i204) #0 - %vald.i.i198.us = shufflevector <4 x double> %val0d.i.i195.us, <4 x double> %val1d.i.i197.us, <8 x i32> - %"varying+const_offsets776.elt0.us" = add i32 %scaled_varying757.elt0.us, -16 - %21 = sext i32 %"varying+const_offsets776.elt0.us" to i64 - %ptr777.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %21, !filename !0, !first_line !6, !first_column !2, !last_line !6, !last_column !3 - %val0d.i.i185.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr777.us, <4 x double> %mask0d.i.i203) #0 - %ptr777.sum.us = add i64 %21, 32 - %ptr1.i.i186.us = getelementptr i8* %Ain_load327_ptr2int_2void, 
i64 %ptr777.sum.us - %val1d.i.i187.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i186.us, <4 x double> %mask1d.i.i204) #0 - %vald.i.i188.us = shufflevector <4 x double> %val0d.i.i185.us, <4 x double> %val1d.i.i187.us, <8 x i32> - %"varying+const_offsets785.elt0.us" = add i32 %scaled_varying757.elt0.us, 16 - %22 = sext i32 %"varying+const_offsets785.elt0.us" to i64 - %ptr786.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %22, !filename !0, !first_line !6, !first_column !4, !last_line !6, !last_column !5 - %val0d.i.i175.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr786.us, <4 x double> %mask0d.i.i203) #0 - %ptr786.sum.us = add i64 %22, 32 - %ptr1.i.i176.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr786.sum.us - %val1d.i.i177.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i176.us, <4 x double> %mask1d.i.i204) #0 - %vald.i.i178.us = shufflevector <4 x double> %val0d.i.i175.us, <4 x double> %val1d.i.i177.us, <8 x i32> - %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast788_mul__Nx_load333_broadcast.elt0.us = add i32 %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast788.elt0.us, %xb291.0263.us - %scaled_varying793.elt0.us = shl i32 %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast788_mul__Nx_load333_broadcast.elt0.us, 3 - %23 = sext i32 %scaled_varying793.elt0.us to i64 - %ptr795.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %23, !filename !0, !first_line !2, !first_column !7, !last_line !2, !last_column !8 - %val0d.i.i165.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr795.us, <4 x double> %mask0d.i.i203) #0 - %ptr795.sum.us = add i64 %23, 32 - %ptr1.i.i166.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr795.sum.us - %val1d.i.i167.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i166.us, <4 x double> %mask1d.i.i204) #0 - %vald.i.i168.us = shufflevector <4 x double> %val0d.i.i165.us, <4 x double> %val1d.i.i167.us, <8 x i32> - %add_Ain_load319_offset_load_Ain_load327_offset_load.us = fadd <8 x double> %vald.i.i208.us, %vald.i.i198.us - %"varying+const_offsets803.elt0.us" = add i32 %scaled_varying757.elt0.us, -24 - %24 = sext i32 %"varying+const_offsets803.elt0.us" to i64 - %ptr804.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %24, !filename !0, !first_line !9, !first_column !2, !last_line !9, !last_column !3 - %val0d.i.i155.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr804.us, <4 x double> %mask0d.i.i203) #0 - %ptr804.sum.us = add i64 %24, 32 - %ptr1.i.i156.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr804.sum.us - %val1d.i.i157.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i156.us, <4 x double> %mask1d.i.i204) #0 - %vald.i.i158.us = shufflevector <4 x double> %val0d.i.i155.us, <4 x double> %val1d.i.i157.us, <8 x i32> - %"varying+const_offsets812.elt0.us" = add i32 %scaled_varying757.elt0.us, 24 - %25 = sext i32 %"varying+const_offsets812.elt0.us" to i64 - %ptr813.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %25, !filename !0, !first_line !9, !first_column !4, !last_line !9, !last_column !5 - %val0d.i.i145.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr813.us, <4 x double> %mask0d.i.i203) #0 - %ptr813.sum.us = add i64 %25, 32 - %ptr1.i.i146.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr813.sum.us - %val1d.i.i147.us = 
tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i146.us, <4 x double> %mask1d.i.i204) #0 - %vald.i.i148.us = shufflevector <4 x double> %val0d.i.i145.us, <4 x double> %val1d.i.i147.us, <8 x i32> - %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast815_mul__Nx_load382_broadcast.elt0.us = add i32 %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast815.elt0.us, %xb291.0263.us - %scaled_varying820.elt0.us = shl i32 %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast815_mul__Nx_load382_broadcast.elt0.us, 3 - %26 = sext i32 %scaled_varying820.elt0.us to i64 - %ptr822.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %26, !filename !0, !first_line !10, !first_column !11, !last_line !10, !last_column !1 - %val0d.i.i135.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr822.us, <4 x double> %mask0d.i.i203) #0 - %ptr822.sum.us = add i64 %26, 32 - %ptr1.i.i136.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr822.sum.us - %val1d.i.i137.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i136.us, <4 x double> %mask1d.i.i204) #0 - %vald.i.i138.us = shufflevector <4 x double> %val0d.i.i135.us, <4 x double> %val1d.i.i137.us, <8 x i32> - %add_Ain_load368_offset_load_Ain_load376_offset_load.us = fadd <8 x double> %vald.i.i188.us, %vald.i.i178.us - %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast824_mul__Nx_load341_broadcast.elt0.us = add i32 %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast824.elt0.us, %xb291.0263.us - %scaled_varying829.elt0.us = shl i32 %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast824_mul__Nx_load341_broadcast.elt0.us, 3 - %27 = sext i32 %scaled_varying829.elt0.us to i64 - %ptr831.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %27, !filename !0, !first_line !2, !first_column !12, !last_line !2, !last_column !13 - %val0d.i.i125.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr831.us, <4 x double> %mask0d.i.i203) #0 - %ptr831.sum.us = add i64 %27, 32 - %ptr1.i.i126.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr831.sum.us - %val1d.i.i127.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i126.us, <4 x double> %mask1d.i.i204) #0 - %vald.i.i128.us = shufflevector <4 x double> %val0d.i.i125.us, <4 x double> %val1d.i.i127.us, <8 x i32> - %add_add_Ain_load319_offset_load_Ain_load327_offset_load_Ain_load335_offset_load.us = fadd <8 x double> %add_Ain_load319_offset_load_Ain_load327_offset_load.us, %vald.i.i168.us - %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast833_mul__Nx_load431_broadcast.elt0.us = add i32 %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast833.elt0.us, %xb291.0263.us - %scaled_varying838.elt0.us = shl i32 %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast833_mul__Nx_load431_broadcast.elt0.us, 3 - %28 = sext i32 %scaled_varying838.elt0.us to i64 - %ptr840.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %28, !filename !0, !first_line !14, !first_column !11, !last_line !14, !last_column !1 - %val0d.i.i115.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr840.us, <4 x double> %mask0d.i.i203) #0 - %ptr840.sum.us = add i64 %28, 32 - %ptr1.i.i116.us = 
getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr840.sum.us - %val1d.i.i117.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i116.us, <4 x double> %mask1d.i.i204) #0 - %vald.i.i118.us = shufflevector <4 x double> %val0d.i.i115.us, <4 x double> %val1d.i.i117.us, <8 x i32> - %add_Ain_load417_offset_load_Ain_load425_offset_load.us = fadd <8 x double> %vald.i.i158.us, %vald.i.i148.us - %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast842_mul__Nx_load390_broadcast.elt0.us = add i32 %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast842.elt0.us, %xb291.0263.us - %scaled_varying847.elt0.us = shl i32 %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast842_mul__Nx_load390_broadcast.elt0.us, 3 - %29 = sext i32 %scaled_varying847.elt0.us to i64 - %ptr849.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %29, !filename !0, !first_line !10, !first_column !6, !last_line !10, !last_column !15 - %val0d.i.i105.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr849.us, <4 x double> %mask0d.i.i203) #0 - %ptr849.sum.us = add i64 %29, 32 - %ptr1.i.i106.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr849.sum.us - %val1d.i.i107.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i106.us, <4 x double> %mask1d.i.i204) #0 - %vald.i.i108.us = shufflevector <4 x double> %val0d.i.i105.us, <4 x double> %val1d.i.i107.us, <8 x i32> - %add_add_Ain_load368_offset_load_Ain_load376_offset_load_Ain_load384_offset_load.us = fadd <8 x double> %add_Ain_load368_offset_load_Ain_load376_offset_load.us, %vald.i.i138.us - %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast851_mul__Nxy_load350_broadcast.elt0.us = add i32 %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast851.elt0.us, %xb291.0263.us - %scaled_varying856.elt0.us = shl i32 %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast851_mul__Nxy_load350_broadcast.elt0.us, 3 - %30 = sext i32 %scaled_varying856.elt0.us to i64 - %ptr858.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %30, !filename !0, !first_line !12, !first_column !11, !last_line !12, !last_column !1 - %val0d.i.i95.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr858.us, <4 x double> %mask0d.i.i203) #0 - %ptr858.sum.us = add i64 %30, 32 - %ptr1.i.i96.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr858.sum.us - %val1d.i.i97.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i96.us, <4 x double> %mask1d.i.i204) #0 - %vald.i.i98.us = shufflevector <4 x double> %val0d.i.i95.us, <4 x double> %val1d.i.i97.us, <8 x i32> - %add_add_add_Ain_load319_offset_load_Ain_load327_offset_load_Ain_load335_offset_load_Ain_load343_offset_load.us = fadd <8 x double> %add_add_Ain_load319_offset_load_Ain_load327_offset_load_Ain_load335_offset_load.us, %vald.i.i128.us - %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast860_mul__Nx_load439_broadcast.elt0.us = add i32 %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast860.elt0.us, %xb291.0263.us - %scaled_varying865.elt0.us = shl i32 %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast860_mul__Nx_load439_broadcast.elt0.us, 3 - %31 = sext i32 %scaled_varying865.elt0.us to i64 - %ptr867.us = 
getelementptr i8* %Ain_load327_ptr2int_2void, i64 %31, !filename !0, !first_line !14, !first_column !6, !last_line !14, !last_column !15 - %val0d.i.i85.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr867.us, <4 x double> %mask0d.i.i203) #0 - %ptr867.sum.us = add i64 %31, 32 - %ptr1.i.i86.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr867.sum.us - %val1d.i.i87.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i86.us, <4 x double> %mask1d.i.i204) #0 - %vald.i.i88.us = shufflevector <4 x double> %val0d.i.i85.us, <4 x double> %val1d.i.i87.us, <8 x i32> - %add_add_Ain_load417_offset_load_Ain_load425_offset_load_Ain_load433_offset_load.us = fadd <8 x double> %add_Ain_load417_offset_load_Ain_load425_offset_load.us, %vald.i.i118.us - %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast869_mul__Nxy_load399_broadcast.elt0.us = add i32 %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast869.elt0.us, %xb291.0263.us - %scaled_varying874.elt0.us = shl i32 %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast869_mul__Nxy_load399_broadcast.elt0.us, 3 - %32 = sext i32 %scaled_varying874.elt0.us to i64 - %ptr876.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %32, !filename !0, !first_line !16, !first_column !11, !last_line !16, !last_column !1 - %val0d.i.i75.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr876.us, <4 x double> %mask0d.i.i203) #0 - %ptr876.sum.us = add i64 %32, 32 - %ptr1.i.i76.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr876.sum.us - %val1d.i.i77.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i76.us, <4 x double> %mask1d.i.i204) #0 - %vald.i.i78.us = shufflevector <4 x double> %val0d.i.i75.us, <4 x double> %val1d.i.i77.us, <8 x i32> - %add_add_add_Ain_load368_offset_load_Ain_load376_offset_load_Ain_load384_offset_load_Ain_load392_offset_load.us = fadd <8 x double> %add_add_Ain_load368_offset_load_Ain_load376_offset_load_Ain_load384_offset_load.us, %vald.i.i108.us - %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast878_mul__Nxy_load358_broadcast.elt0.us = add i32 %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast878.elt0.us, %xb291.0263.us - %scaled_varying883.elt0.us = shl i32 %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast878_mul__Nxy_load358_broadcast.elt0.us, 3 - %33 = sext i32 %scaled_varying883.elt0.us to i64 - %ptr885.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %33, !filename !0, !first_line !12, !first_column !6, !last_line !12, !last_column !15 - %val0d.i.i65.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr885.us, <4 x double> %mask0d.i.i203) #0 - %ptr885.sum.us = add i64 %33, 32 - %ptr1.i.i66.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr885.sum.us - %val1d.i.i67.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i66.us, <4 x double> %mask1d.i.i204) #0 - %vald.i.i68.us = shufflevector <4 x double> %val0d.i.i65.us, <4 x double> %val1d.i.i67.us, <8 x i32> - %add_add_add_add_Ain_load319_offset_load_Ain_load327_offset_load_Ain_load335_offset_load_Ain_load343_offset_load_Ain_load351_offset_load.us = fadd <8 x double> %add_add_add_Ain_load319_offset_load_Ain_load327_offset_load_Ain_load335_offset_load_Ain_load343_offset_load.us, %vald.i.i98.us - 
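The body above (everything from for_loop288.us on) is the mask-aware clone of the same loop: it runs when the gang may be only partially active, so its mask is the loop-trip test ANDed with the incoming %__mask, every access goes through maskload instead of a plain vector load, and llvm.x86.avx.movmsk.ps.256 skips the whole body when no lane survives. A sketch of that guard, assuming ISPC-style all-ones/all-zeros i32 lane masks:

    #include <immintrin.h>

    /* Illustrative "any lane live?" test matching the movmsk.ps.256 guard. */
    static inline int any_lane_live(__m256i trip_mask, __m256i function_mask)
    {
        __m256 m = _mm256_and_ps(_mm256_castsi256_ps(trip_mask),       /* AVX1-safe   */
                                 _mm256_castsi256_ps(function_mask));  /* bitwise AND */
        return _mm256_movemask_ps(m) != 0;  /* one sign bit per i32 lane */
    }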
%add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast887_mul__Nxy_load448_broadcast.elt0.us = add i32 %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast887.elt0.us, %xb291.0263.us - %scaled_varying892.elt0.us = shl i32 %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast887_mul__Nxy_load448_broadcast.elt0.us, 3 - %34 = sext i32 %scaled_varying892.elt0.us to i64 - %ptr894.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %34, !filename !0, !first_line !17, !first_column !11, !last_line !17, !last_column !1 - %val0d.i.i55.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr894.us, <4 x double> %mask0d.i.i203) #0 - %ptr894.sum.us = add i64 %34, 32 - %ptr1.i.i56.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr894.sum.us - %val1d.i.i57.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i56.us, <4 x double> %mask1d.i.i204) #0 - %vald.i.i58.us = shufflevector <4 x double> %val0d.i.i55.us, <4 x double> %val1d.i.i57.us, <8 x i32> - %add_add_add_Ain_load417_offset_load_Ain_load425_offset_load_Ain_load433_offset_load_Ain_load441_offset_load.us = fadd <8 x double> %add_add_Ain_load417_offset_load_Ain_load425_offset_load_Ain_load433_offset_load.us, %vald.i.i88.us - %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast896_mul__Nxy_load407_broadcast.elt0.us = add i32 %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast896.elt0.us, %xb291.0263.us - %scaled_varying901.elt0.us = shl i32 %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast896_mul__Nxy_load407_broadcast.elt0.us, 3 - %35 = sext i32 %scaled_varying901.elt0.us to i64 - %ptr903.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %35, !filename !0, !first_line !16, !first_column !6, !last_line !16, !last_column !15 - %val0d.i.i45.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr903.us, <4 x double> %mask0d.i.i203) #0 - %ptr903.sum.us = add i64 %35, 32 - %ptr1.i.i46.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr903.sum.us - %val1d.i.i47.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i46.us, <4 x double> %mask1d.i.i204) #0 - %vald.i.i48.us = shufflevector <4 x double> %val0d.i.i45.us, <4 x double> %val1d.i.i47.us, <8 x i32> - %add_add_add_add_Ain_load368_offset_load_Ain_load376_offset_load_Ain_load384_offset_load_Ain_load392_offset_load_Ain_load400_offset_load.us = fadd <8 x double> %add_add_add_Ain_load368_offset_load_Ain_load376_offset_load_Ain_load384_offset_load_Ain_load392_offset_load.us, %vald.i.i78.us - %add_add_add_add_add_Ain_load319_offset_load_Ain_load327_offset_load_Ain_load335_offset_load_Ain_load343_offset_load_Ain_load351_offset_load_Ain_load359_offset_load.us = fadd <8 x double> %add_add_add_add_Ain_load319_offset_load_Ain_load327_offset_load_Ain_load335_offset_load_Ain_load343_offset_load_Ain_load351_offset_load.us, %vald.i.i68.us - %36 = sext i32 %scaled_varying757.elt0.us to i64 - %ptr912.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %36, !filename !0, !first_line !8, !first_column !18, !last_line !8, !last_column !19 - %val0d.i.i35.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr912.us, <4 x double> %mask0d.i.i203) #0 - %ptr912.sum.us = add i64 %36, 32 - %ptr1.i.i36.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr912.sum.us - %val1d.i.i37.us = tail 
call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i36.us, <4 x double> %mask1d.i.i204) #0 - %vald.i.i38.us = shufflevector <4 x double> %val0d.i.i35.us, <4 x double> %val1d.i.i37.us, <8 x i32> - %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast914_mul__Nxy_load456_broadcast.elt0.us = add i32 %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast914.elt0.us, %xb291.0263.us - %scaled_varying919.elt0.us = shl i32 %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast914_mul__Nxy_load456_broadcast.elt0.us, 3 - %37 = sext i32 %scaled_varying919.elt0.us to i64 - %ptr921.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %37, !filename !0, !first_line !17, !first_column !6, !last_line !17, !last_column !15 - %val0d.i.i25.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr921.us, <4 x double> %mask0d.i.i203) #0 - %ptr921.sum.us = add i64 %37, 32 - %ptr1.i.i26.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr921.sum.us - %val1d.i.i27.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i26.us, <4 x double> %mask1d.i.i204) #0 - %vald.i.i28.us = shufflevector <4 x double> %val0d.i.i25.us, <4 x double> %val1d.i.i27.us, <8 x i32> - %add_add_add_add_Ain_load417_offset_load_Ain_load425_offset_load_Ain_load433_offset_load_Ain_load441_offset_load_Ain_load449_offset_load.us = fadd <8 x double> %add_add_add_Ain_load417_offset_load_Ain_load425_offset_load_Ain_load433_offset_load_Ain_load441_offset_load.us, %vald.i.i58.us - %add_add_add_add_add_Ain_load368_offset_load_Ain_load376_offset_load_Ain_load384_offset_load_Ain_load392_offset_load_Ain_load400_offset_load_Ain_load408_offset_load.us = fadd <8 x double> %add_add_add_add_Ain_load368_offset_load_Ain_load376_offset_load_Ain_load384_offset_load_Ain_load392_offset_load_Ain_load400_offset_load.us, %vald.i.i48.us - %mul_coef1_load315_broadcast_add_add_add_add_add_Ain_load319_offset_load_Ain_load327_offset_load_Ain_load335_offset_load_Ain_load343_offset_load_Ain_load351_offset_load_Ain_load359_offset_load.us = fmul <8 x double> %coef1_load315_broadcast, %add_add_add_add_add_Ain_load319_offset_load_Ain_load327_offset_load_Ain_load335_offset_load_Ain_load343_offset_load_Ain_load351_offset_load_Ain_load359_offset_load.us - %mul_coef0_load306_broadcast_Ain_load310_offset_load.us = fmul <8 x double> %coef0_load306_broadcast, %vald.i.i38.us - %add_add_add_add_add_Ain_load417_offset_load_Ain_load425_offset_load_Ain_load433_offset_load_Ain_load441_offset_load_Ain_load449_offset_load_Ain_load457_offset_load.us = fadd <8 x double> %add_add_add_add_Ain_load417_offset_load_Ain_load425_offset_load_Ain_load433_offset_load_Ain_load441_offset_load_Ain_load449_offset_load.us, %vald.i.i28.us - %mul_coef2_load364_broadcast_add_add_add_add_add_Ain_load368_offset_load_Ain_load376_offset_load_Ain_load384_offset_load_Ain_load392_offset_load_Ain_load400_offset_load_Ain_load408_offset_load.us = fmul <8 x double> %coef2_load364_broadcast, %add_add_add_add_add_Ain_load368_offset_load_Ain_load376_offset_load_Ain_load384_offset_load_Ain_load392_offset_load_Ain_load400_offset_load_Ain_load408_offset_load.us - %add_mul_coef0_load306_broadcast_Ain_load310_offset_load_mul_coef1_load315_broadcast_add_add_add_add_add_Ain_load319_offset_load_Ain_load327_offset_load_Ain_load335_offset_load_Ain_load343_offset_load_Ain_load351_offset_load_Ain_load359_offset_load.us = fadd <8 x double> 
%mul_coef1_load315_broadcast_add_add_add_add_add_Ain_load319_offset_load_Ain_load327_offset_load_Ain_load335_offset_load_Ain_load343_offset_load_Ain_load351_offset_load_Ain_load359_offset_load.us, %mul_coef0_load306_broadcast_Ain_load310_offset_load.us - %mul_coef3_load413_broadcast_add_add_add_add_add_Ain_load417_offset_load_Ain_load425_offset_load_Ain_load433_offset_load_Ain_load441_offset_load_Ain_load449_offset_load_Ain_load457_offset_load.us = fmul <8 x double> %coef3_load413_broadcast, %add_add_add_add_add_Ain_load417_offset_load_Ain_load425_offset_load_Ain_load433_offset_load_Ain_load441_offset_load_Ain_load449_offset_load_Ain_load457_offset_load.us - %add_add_mul_coef0_load306_broadcast_Ain_load310_offset_load_mul_coef1_load315_broadcast_add_add_add_add_add_Ain_load319_offset_load_Ain_load327_offset_load_Ain_load335_offset_load_Ain_load343_offset_load_Ain_load351_offset_load_Ain_load359_offset_load_mul_coef2_load364_broadcast_add_add_add_add_add_Ain_load368_offset_load_Ain_load376_offset_load_Ain_load384_offset_load_Ain_load392_offset_load_Ain_load400_offset_load_Ain_load408_offset_load.us = fadd <8 x double> %mul_coef2_load364_broadcast_add_add_add_add_add_Ain_load368_offset_load_Ain_load376_offset_load_Ain_load384_offset_load_Ain_load392_offset_load_Ain_load400_offset_load_Ain_load408_offset_load.us, %add_mul_coef0_load306_broadcast_Ain_load310_offset_load_mul_coef1_load315_broadcast_add_add_add_add_add_Ain_load319_offset_load_Ain_load327_offset_load_Ain_load335_offset_load_Ain_load343_offset_load_Ain_load351_offset_load_Ain_load359_offset_load.us - %add_add_add_mul_coef0_load306_broadcast_Ain_load310_offset_load_mul_coef1_load315_broadcast_add_add_add_add_add_Ain_load319_offset_load_Ain_load327_offset_load_Ain_load335_offset_load_Ain_load343_offset_load_Ain_load351_offset_load_Ain_load359_offset_load_mul_coef2_load364_broadcast_add_add_add_add_add_Ain_load368_offset_load_Ain_load376_offset_load_Ain_load384_offset_load_Ain_load392_offset_load_Ain_load400_offset_load_Ain_load408_offset_load_mul_coef3_load413_broadcast_add_add_add_add_add_Ain_load417_offset_load_Ain_load425_offset_load_Ain_load433_offset_load_Ain_load441_offset_load_Ain_load449_offset_load_Ain_load457_offset_load.us = fadd <8 x double> %add_add_mul_coef0_load306_broadcast_Ain_load310_offset_load_mul_coef1_load315_broadcast_add_add_add_add_add_Ain_load319_offset_load_Ain_load327_offset_load_Ain_load335_offset_load_Ain_load343_offset_load_Ain_load351_offset_load_Ain_load359_offset_load_mul_coef2_load364_broadcast_add_add_add_add_add_Ain_load368_offset_load_Ain_load376_offset_load_Ain_load384_offset_load_Ain_load392_offset_load_Ain_load400_offset_load_Ain_load408_offset_load.us, %mul_coef3_load413_broadcast_add_add_add_add_add_Ain_load417_offset_load_Ain_load425_offset_load_Ain_load433_offset_load_Ain_load441_offset_load_Ain_load449_offset_load_Ain_load457_offset_load.us - %mask0.i.i11.us = shufflevector <8 x i32> %"internal_mask&function_mask472.us", <8 x i32> undef, <8 x i32> - %mask1.i.i12.us = shufflevector <8 x i32> %"internal_mask&function_mask472.us", <8 x i32> undef, <8 x i32> - %mask0d.i.i13.us = bitcast <8 x i32> %mask0.i.i11.us to <4 x double> - %mask1d.i.i14.us = bitcast <8 x i32> %mask1.i.i12.us to <4 x double> - %val0d.i.i15.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr912.us, <4 x double> %mask0d.i.i13.us) #0 - %val1d.i.i17.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i36.us, <4 x double> %mask1d.i.i14.us) #0 - %vald.i.i18.us = shufflevector <4 x double> 
%val0d.i.i15.us, <4 x double> %val1d.i.i17.us, <8 x i32> - %mul__Ain_load480_offset_load.us = fmul <8 x double> %vald.i.i18.us, - %ptr939.us = getelementptr i8* %Aout_load488_ptr2int_2void, i64 %36, !filename !0, !first_line !20, !first_column !21, !last_line !20, !last_column !22 - %val0d.i.i5.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr939.us, <4 x double> %mask0d.i.i13.us) #0 - %ptr1.i.i6.us = getelementptr i8* %Aout_load488_ptr2int_2void, i64 %ptr912.sum.us - %val1d.i.i7.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i6.us, <4 x double> %mask1d.i.i14.us) #0 - %vald.i.i8.us = shufflevector <4 x double> %val0d.i.i5.us, <4 x double> %val1d.i.i7.us, <8 x i32> - %sub_mul__Ain_load480_offset_load_Aout_load488_offset_load.us = fsub <8 x double> %mul__Ain_load480_offset_load.us, %vald.i.i8.us - %ptr948.us = getelementptr i8* %vsq_load494_ptr2int_2void, i64 %36, !filename !0, !first_line !23, !first_column !24, !last_line !23, !last_column !7 - %val0d.i.i.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr948.us, <4 x double> %mask0d.i.i13.us) #0 - %ptr1.i.i.us = getelementptr i8* %vsq_load494_ptr2int_2void, i64 %ptr912.sum.us - %val1d.i.i.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i.us, <4 x double> %mask1d.i.i14.us) #0 - %vald.i.i.us = shufflevector <4 x double> %val0d.i.i.us, <4 x double> %val1d.i.i.us, <8 x i32> - %mul_vsq_load494_offset_load_div_load499.us = fmul <8 x double> %add_add_add_mul_coef0_load306_broadcast_Ain_load310_offset_load_mul_coef1_load315_broadcast_add_add_add_add_add_Ain_load319_offset_load_Ain_load327_offset_load_Ain_load335_offset_load_Ain_load343_offset_load_Ain_load351_offset_load_Ain_load359_offset_load_mul_coef2_load364_broadcast_add_add_add_add_add_Ain_load368_offset_load_Ain_load376_offset_load_Ain_load384_offset_load_Ain_load392_offset_load_Ain_load400_offset_load_Ain_load408_offset_load_mul_coef3_load413_broadcast_add_add_add_add_add_Ain_load417_offset_load_Ain_load425_offset_load_Ain_load433_offset_load_Ain_load441_offset_load_Ain_load449_offset_load_Ain_load457_offset_load.us, %vald.i.i.us - %add_sub_mul__Ain_load480_offset_load_Aout_load488_offset_load_mul_vsq_load494_offset_load_div_load499.us = fadd <8 x double> %sub_mul__Ain_load480_offset_load_Aout_load488_offset_load.us, %mul_vsq_load494_offset_load_div_load499.us - %val0.i.i253.us = shufflevector <8 x double> %add_sub_mul__Ain_load480_offset_load_Aout_load488_offset_load_mul_vsq_load494_offset_load_div_load499.us, <8 x double> undef, <4 x i32> - %val1.i.i254.us = shufflevector <8 x double> %add_sub_mul__Ain_load480_offset_load_Aout_load488_offset_load_mul_vsq_load494_offset_load_div_load499.us, <8 x double> undef, <4 x i32> - call void @llvm.x86.avx.maskstore.pd.256(i8* %ptr939.us, <4 x double> %mask0d.i.i13.us, <4 x double> %val0.i.i253.us) #0 - call void @llvm.x86.avx.maskstore.pd.256(i8* %ptr1.i.i6.us, <4 x double> %mask1d.i.i14.us, <4 x double> %val1.i.i254.us) #0 - br label %safe_if_after_true466.us - -safe_if_after_true466.us: ; preds = %safe_if_run_true467.us, %for_loop288.us - %add_xb291_load_.us = add i32 %xb291.0263.us, 8 - %less_xb_load293_x1_load294.us = icmp slt i32 %add_xb291_load_.us, %x1 - br i1 %less_xb_load293_x1_load294.us, label %for_loop288.us, label %for_exit289.us - -for_loop288.lr.ph.us: ; preds = %for_exit289.us, %for_test286.preheader.lr.ph - %y280.0265.us = phi i32 [ %y_load517_plus1.us, %for_exit289.us ], [ %y0, %for_test286.preheader.lr.ph ] - %mul_y_load302_Nx_load303.us = mul i32 
%y280.0265.us, %Nx - %add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303.us = add i32 %mul_y_load302_Nx_load303.us, %mul_z_load300_Nxy_load301 - %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast788.elt0.us = add i32 %add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303.us, %Nx - %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast815.elt0.us = add i32 %add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303.us, %mul__Nx_load382 - %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast824.elt0.us = sub i32 %add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303.us, %Nx - %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast833.elt0.us = add i32 %add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303.us, %mul__Nx_load431 - %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast842.elt0.us = add i32 %add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303.us, %mul__Nx_load390 - %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast851.elt0.us = add i32 %add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303.us, %mul_Nx_load_Ny_load - %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast860.elt0.us = add i32 %add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303.us, %mul__Nx_load439 - %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast869.elt0.us = add i32 %add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303.us, %mul__Nxy_load399 - %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast878.elt0.us = sub i32 %add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303.us, %mul_Nx_load_Ny_load - %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast887.elt0.us = add i32 %add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303.us, %mul__Nxy_load448 - %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast896.elt0.us = add i32 %add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303.us, %mul__Nxy_load407 - %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast914.elt0.us = add i32 %add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303.us, %mul__Nxy_load456 - br label %for_loop288.us - -for_exit278: ; preds = %for_exit289.us, %for_test286.preheader.lr.ph, %for_test275.preheader - %z_load518_plus1 = add i32 %z269.0268, 1 - %exitcond272 = icmp eq i32 %z_load518_plus1, %z1 - br i1 %exitcond272, label %for_exit, label %for_test275.preheader -} - -; Function Attrs: nounwind -define internal void @stencil_step_task___uniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_({ i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*, <8 x i32> }* noalias nocapture, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) #3 { -allocas: - %x01 = getelementptr { i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*, <8 x i32> }* %0, i64 0, i32 0 - %x02 = load i32* %x01, align 4 - %x13 = getelementptr { i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*, <8 x i32> }* %0, i64 0, i32 1 - %x14 = load i32* %x13, align 4 - %y05 = getelementptr { i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*, <8 x i32> }* %0, i64 0, i32 2 - %y06 = load i32* %y05, align 4 - %y17 = 
getelementptr { i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*, <8 x i32> }* %0, i64 0, i32 3 - %y18 = load i32* %y17, align 4 - %z09 = getelementptr { i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*, <8 x i32> }* %0, i64 0, i32 4 - %z010 = load i32* %z09, align 4 - %Nx11 = getelementptr { i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*, <8 x i32> }* %0, i64 0, i32 5 - %Nx12 = load i32* %Nx11, align 4 - %Ny13 = getelementptr { i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*, <8 x i32> }* %0, i64 0, i32 6 - %Ny14 = load i32* %Ny13, align 4 - %coef17 = getelementptr { i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*, <8 x i32> }* %0, i64 0, i32 8 - %coef18 = load double** %coef17, align 8 - %vsq19 = getelementptr { i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*, <8 x i32> }* %0, i64 0, i32 9 - %vsq20 = load double** %vsq19, align 8 - %Ain21 = getelementptr { i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*, <8 x i32> }* %0, i64 0, i32 10 - %Ain22 = load double** %Ain21, align 8 - %Aout23 = getelementptr { i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*, <8 x i32> }* %0, i64 0, i32 11 - %Aout24 = load double** %Aout23, align 8 - %task_struct_mask = getelementptr { i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*, <8 x i32> }* %0, i64 0, i32 12 - %mask = load <8 x i32>* %task_struct_mask, align 32 - %floatmask.i = bitcast <8 x i32> %mask to <8 x float> - %v.i = tail call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask.i) #1 - %cmp.i = icmp eq i32 %v.i, 255 - %add_z0_load_taskIndex_load = add i32 %z010, %3 - %add_z0_load27_taskIndex_load28 = add i32 %3, 1 - %add_add_z0_load27_taskIndex_load28_ = add i32 %add_z0_load27_taskIndex_load28, %z010 - br i1 %cmp.i, label %all_on, label %some_on - -all_on: ; preds = %allocas - tail call fastcc void @stencil_step___uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_(i32 %x02, i32 %x14, i32 %y06, i32 %y18, i32 %add_z0_load_taskIndex_load, i32 %add_add_z0_load27_taskIndex_load28_, i32 %Nx12, i32 %Ny14, double* %coef18, double* %vsq20, double* %Ain22, double* %Aout24, <8 x i32> ) - ret void - -some_on: ; preds = %allocas - tail call fastcc void @stencil_step___uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_(i32 %x02, i32 %x14, i32 %y06, i32 %y18, i32 %add_z0_load_taskIndex_load, i32 %add_add_z0_load27_taskIndex_load28_, i32 %Nx12, i32 %Ny14, double* %coef18, double* %vsq20, double* %Ain22, double* %Aout24, <8 x i32> %mask) - ret void -} - -; Function Attrs: nounwind -define void @loop_stencil_ispc_tasks(i32 %t0, i32 %t1, i32 %x0, i32 %x1, i32 %y0, i32 %y1, i32 %z0, i32 %z1, i32 %Nx, i32 %Ny, i32 %Nz, double* %coef, double* %vsq, double* %Aeven, double* %Aodd) #3 { -allocas: - %launch_group_handle = alloca i8*, align 8 - store i8* null, i8** %launch_group_handle, align 8 - %less_t_load_t1_load166 = icmp slt i32 %t0, %t1 - br i1 %less_t_load_t1_load166, label %for_loop.lr.ph, label %post_sync73 - -for_loop.lr.ph: ; preds = %allocas - %sub_z1_load_z0_load23 = sub i32 %z1, %z0 - br label %for_loop - -for_loop: ; preds = %post_sync, %for_loop.lr.ph - %t.0167 = phi i32 [ %t0, %for_loop.lr.ph ], [ %t_load69_plus1, %post_sync ] - %bitop = and i32 %t.0167, 1 - %equal_bitop_ = icmp eq i32 %bitop, 0 - %args_ptr = call i8* @ISPCAlloc(i8** %launch_group_handle, 
i64 96, i32 32) - %funarg = bitcast i8* %args_ptr to i32* - store i32 %x0, i32* %funarg, align 4 - %funarg24 = getelementptr i8* %args_ptr, i64 4 - %0 = bitcast i8* %funarg24 to i32* - store i32 %x1, i32* %0, align 4 - %funarg25 = getelementptr i8* %args_ptr, i64 8 - %1 = bitcast i8* %funarg25 to i32* - store i32 %y0, i32* %1, align 4 - %funarg26 = getelementptr i8* %args_ptr, i64 12 - %2 = bitcast i8* %funarg26 to i32* - store i32 %y1, i32* %2, align 4 - %funarg27 = getelementptr i8* %args_ptr, i64 16 - %3 = bitcast i8* %funarg27 to i32* - store i32 %z0, i32* %3, align 4 - %funarg28 = getelementptr i8* %args_ptr, i64 20 - %4 = bitcast i8* %funarg28 to i32* - store i32 %Nx, i32* %4, align 4 - %funarg29 = getelementptr i8* %args_ptr, i64 24 - %5 = bitcast i8* %funarg29 to i32* - store i32 %Ny, i32* %5, align 4 - %funarg30 = getelementptr i8* %args_ptr, i64 28 - %6 = bitcast i8* %funarg30 to i32* - store i32 %Nz, i32* %6, align 4 - %funarg31 = getelementptr i8* %args_ptr, i64 32 - %7 = bitcast i8* %funarg31 to double** - store double* %coef, double** %7, align 8 - %funarg32 = getelementptr i8* %args_ptr, i64 40 - %8 = bitcast i8* %funarg32 to double** - store double* %vsq, double** %8, align 8 - %funarg33 = getelementptr i8* %args_ptr, i64 48 - %9 = bitcast i8* %funarg33 to double** - br i1 %equal_bitop_, label %if_then, label %if_else - -for_exit: ; preds = %post_sync - %launch_group_handle_load70.pre = load i8** %launch_group_handle, align 8 - %cmp71 = icmp eq i8* %launch_group_handle_load70.pre, null - br i1 %cmp71, label %post_sync73, label %call_sync72 - -if_then: ; preds = %for_loop - store double* %Aeven, double** %9, align 8 - %funarg34 = getelementptr i8* %args_ptr, i64 56 - %10 = bitcast i8* %funarg34 to double** - store double* %Aodd, double** %10, align 8 - %funarg_mask = getelementptr i8* %args_ptr, i64 64 - %11 = bitcast i8* %funarg_mask to <8 x i32>* - store <8 x i32> , <8 x i32>* %11, align 32 - call void @ISPCLaunch(i8** %launch_group_handle, i8* bitcast (void ({ i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*, <8 x i32> }*, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)* @stencil_step_task___uniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_ to i8*), i8* %args_ptr, i32 %sub_z1_load_z0_load23, i32 1, i32 1) - br label %if_exit - -if_else: ; preds = %for_loop - store double* %Aodd, double** %9, align 8 - %funarg64 = getelementptr i8* %args_ptr, i64 56 - %12 = bitcast i8* %funarg64 to double** - store double* %Aeven, double** %12, align 8 - %funarg_mask67 = getelementptr i8* %args_ptr, i64 64 - %13 = bitcast i8* %funarg_mask67 to <8 x i32>* - store <8 x i32> , <8 x i32>* %13, align 32 - call void @ISPCLaunch(i8** %launch_group_handle, i8* bitcast (void ({ i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*, <8 x i32> }*, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)* @stencil_step_task___uniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_ to i8*), i8* %args_ptr, i32 %sub_z1_load_z0_load23, i32 1, i32 1) - br label %if_exit - -if_exit: ; preds = %if_else, %if_then - %launch_group_handle_load = load i8** %launch_group_handle, align 8 - %cmp = icmp eq i8* %launch_group_handle_load, null - br i1 %cmp, label %post_sync, label %call_sync - -call_sync: ; preds = %if_exit - call void @ISPCSync(i8* %launch_group_handle_load) - store i8* null, i8** %launch_group_handle, align 8 - br label %post_sync - -post_sync: ; preds = %call_sync, %if_exit - %t_load69_plus1 = add i32 
%t.0167, 1 - %exitcond = icmp eq i32 %t_load69_plus1, %t1 - br i1 %exitcond, label %for_exit, label %for_loop - -call_sync72: ; preds = %for_exit - call void @ISPCSync(i8* %launch_group_handle_load70.pre) - store i8* null, i8** %launch_group_handle, align 8 - br label %post_sync73 - -post_sync73: ; preds = %call_sync72, %for_exit, %allocas - ret void -} - -attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } -attributes #2 = { nounwind readonly } -attributes #3 = { nounwind "target-cpu"="corei7-avx" "target-features"="+avx,+popcnt,+cmov" } - -!0 = metadata !{metadata !"stencil.ispc"} -!1 = metadata !{i32 68} -!2 = metadata !{i32 69} -!3 = metadata !{i32 113} -!4 = metadata !{i32 22} -!5 = metadata !{i32 66} -!6 = metadata !{i32 71} -!7 = metadata !{i32 23} -!8 = metadata !{i32 67} -!9 = metadata !{i32 74} -!10 = metadata !{i32 72} -!11 = metadata !{i32 24} -!12 = metadata !{i32 70} -!13 = metadata !{i32 114} -!14 = metadata !{i32 75} -!15 = metadata !{i32 115} -!16 = metadata !{i32 73} -!17 = metadata !{i32 76} -!18 = metadata !{i32 21} -!19 = metadata !{i32 64} -!20 = metadata !{i32 79} -!21 = metadata !{i32 112} -!22 = metadata !{i32 156} -!23 = metadata !{i32 80} -!24 = metadata !{i32 13} diff --git a/examples_cuda/stencil/stencil_cu.o b/examples_cuda/stencil/stencil_cu.o index 90b014c6..dcd38c9f 100644 Binary files a/examples_cuda/stencil/stencil_cu.o and b/examples_cuda/stencil/stencil_cu.o differ diff --git a/examples_cuda/stencil/stencil_cu.s b/examples_cuda/stencil/stencil_cu.s deleted file mode 100644 index a10402a9..00000000 --- a/examples_cuda/stencil/stencil_cu.s +++ /dev/null @@ -1,1134 +0,0 @@ - .file "stencil_cu.ll" - .section .rodata.cst16,"aM",@progbits,16 - .align 16 -.LCPI0_0: - .long 4 # 0x4 - .long 5 # 0x5 - .long 6 # 0x6 - .long 7 # 0x7 -.LCPI0_1: - .long 0 # 0x0 - .long 1 # 0x1 - .long 2 # 0x2 - .long 3 # 0x3 - .section .rodata,"a",@progbits - .align 32 -.LCPI0_2: - .quad 4611686018427387904 # double 2.000000e+00 - .quad 4611686018427387904 # double 2.000000e+00 - .quad 4611686018427387904 # double 2.000000e+00 - .quad 4611686018427387904 # double 2.000000e+00 - .text - .align 16, 0x90 - .type stencil_step___uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_,@function -stencil_step___uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_: # @stencil_step___uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_ -# BB#0: # %allocas - pushq %rbp - pushq %r15 - pushq %r14 - pushq %r13 - pushq %r12 - pushq %rbx - subq $1384, %rsp # imm = 0x568 - movl %ecx, -72(%rsp) # 4-byte Spill - movl %esi, 1308(%rsp) # 4-byte Spill - movl %edi, -68(%rsp) # 4-byte Spill - movq 1456(%rsp), %rcx - vmovsd 24(%rcx), %xmm1 - vmovsd 16(%rcx), %xmm3 - movq 1472(%rsp), %rax - vmovsd (%rcx), %xmm2 - vmovsd 8(%rcx), %xmm4 - movl 1448(%rsp), %esi - vmovmskps %ymm0, %ecx - cmpl $255, %ecx - jne .LBB0_1 -# BB#7: # %for_test.preheader - cmpl %r9d, %r8d - jge .LBB0_6 -# BB#8: # %for_test30.preheader.lr.ph - leal -3(%r8), %ecx - leal 2(%r8), %r13d - leal -1(%r8), %edi - leal 3(%r8), %ebp - movl %esi, %r11d - imull %r11d, %ebp - movl %ebp, %ebx - imull %r11d, %edi - movl %edi, %ebp - imull %r11d, %r13d - imull %r8d, %esi - imull %r11d, %ecx - leal -2(%r8), %r10d - imull %r11d, %r10d - leal 1(%r8), %r14d - imull %r11d, %r14d - movl %edx, -96(%rsp) # 4-byte Spill - addl %edx, %r14d - addl %edx, %r10d - addl %edx, %ecx - movl %ecx, 1344(%rsp) # 4-byte Spill - movl %r9d, -92(%rsp) # 4-byte Spill - leal 
1(%rdx,%rsi), %r15d - leal 2(%rdx,%rsi), %edi - addl %edx, %r13d - addl %edx, %ebp - movl %ebp, 1216(%rsp) # 4-byte Spill - addl %edx, %ebx - movl %ebx, 1152(%rsp) # 4-byte Spill - leal -1(%rdx,%rsi), %ebp - leal 3(%rdx,%rsi), %ecx - leal (%rdx,%rsi), %r12d - leal -3(%rdx,%rsi), %ebx - movl %ebx, 1184(%rsp) # 4-byte Spill - movl %r8d, -88(%rsp) # 4-byte Spill - leal -2(%rdx,%rsi), %edx - vmovd 1308(%rsp), %xmm0 # 4-byte Folded Reload - movl 1440(%rsp), %r9d - imull %r9d, %r13d - imull %r9d, %ecx - movl %ecx, 1312(%rsp) # 4-byte Spill - imull %r9d, %ebp - movl %ebp, 1248(%rsp) # 4-byte Spill - imull %r9d, %edi - imull %r9d, %r15d - movl 1344(%rsp), %ecx # 4-byte Reload - imull %r9d, %ecx - movl %ecx, 1344(%rsp) # 4-byte Spill - imull %r9d, %r10d - movl 1152(%rsp), %ebx # 4-byte Reload - imull %r9d, %ebx - movl 1216(%rsp), %ebp # 4-byte Reload - imull %r9d, %ebp - imull %r9d, %r14d - movl 1184(%rsp), %r8d # 4-byte Reload - imull %r9d, %r8d - imull %r9d, %edx - movl %edx, 1216(%rsp) # 4-byte Spill - imull %r9d, %r12d - movl -68(%rsp), %edx # 4-byte Reload - leal (,%rdx,8), %edx - leal -16(%rdx,%r12,8), %esi - movl %esi, 76(%rsp) # 4-byte Spill - leal (%rdx,%r12,8), %ecx - movl %ecx, 72(%rsp) # 4-byte Spill - leal (%rdx,%r15,8), %ecx - movl %ecx, 68(%rsp) # 4-byte Spill - movl -92(%rsp), %ecx # 4-byte Reload - leal (%rdx,%rdi,8), %esi - movl %esi, 64(%rsp) # 4-byte Spill - movl 1248(%rsp), %esi # 4-byte Reload - leal (%rdx,%rsi,8), %esi - movl %esi, 60(%rsp) # 4-byte Spill - movl 1312(%rsp), %esi # 4-byte Reload - leal (%rdx,%rsi,8), %esi - movl %esi, 56(%rsp) # 4-byte Spill - movl 1216(%rsp), %esi # 4-byte Reload - leal (%rdx,%rsi,8), %esi - movl %esi, 52(%rsp) # 4-byte Spill - movl -88(%rsp), %esi # 4-byte Reload - leal (%rdx,%r8,8), %edi - movl %edi, 48(%rsp) # 4-byte Spill - leal (%rdx,%r14,8), %edi - movl %edi, 44(%rsp) # 4-byte Spill - leal (%rdx,%r13,8), %edi - movl %edi, 40(%rsp) # 4-byte Spill - leal (%rdx,%rbp,8), %edi - movl %edi, 36(%rsp) # 4-byte Spill - leal (%rdx,%rbx,8), %edi - movl %edi, 32(%rsp) # 4-byte Spill - leal (%rdx,%r10,8), %edi - movl %edi, 28(%rsp) # 4-byte Spill - movl 1344(%rsp), %edi # 4-byte Reload - leal (%rdx,%rdi,8), %edx - movl %edx, 24(%rsp) # 4-byte Spill - movl $0, -100(%rsp) # 4-byte Folded Spill - imull %r9d, %r11d - shll $3, %r9d - movl %r9d, -76(%rsp) # 4-byte Spill - shll $3, %r11d - movl %r11d, -104(%rsp) # 4-byte Spill - vpermilpd $0, %xmm3, %xmm3 # xmm3 = xmm3[0,0] - vpermilpd $0, %xmm2, %xmm2 # xmm2 = xmm2[0,0] - vpermilpd $0, %xmm1, %xmm1 # xmm1 = xmm1[0,0] - vpshufd $0, %xmm0, %xmm0 # xmm0 = xmm0[0,0,0,0] - vinsertf128 $1, %xmm1, %ymm1, %ymm1 - vmovupd %ymm1, 1312(%rsp) # 32-byte Folded Spill - vinsertf128 $1, %xmm3, %ymm3, %ymm1 - vmovupd %ymm1, 1344(%rsp) # 32-byte Folded Spill - vinsertf128 $1, %xmm2, %ymm2, %ymm15 - vmovupd %ymm15, -32(%rsp) # 32-byte Folded Spill - vpermilpd $0, %xmm4, %xmm1 # xmm1 = xmm4[0,0] - vinsertf128 $1, %xmm1, %ymm1, %ymm14 - vmovupd %ymm14, -64(%rsp) # 32-byte Folded Spill - vinsertf128 $1, %xmm0, %ymm0, %ymm0 - vmovups %ymm0, 1248(%rsp) # 32-byte Folded Spill - vmovapd .LCPI0_2(%rip), %ymm13 - .align 16, 0x90 -.LBB0_9: # %for_test30.preheader - # =>This Loop Header: Depth=1 - # Child Loop BB0_16 Depth 2 - # Child Loop BB0_12 Depth 3 - movl %esi, -88(%rsp) # 4-byte Spill - movl -96(%rsp), %edx # 4-byte Reload - cmpl -72(%rsp), %edx # 4-byte Folded Reload - jge .LBB0_11 -# BB#10: # %for_test37.preheader.lr.ph - # in Loop: Header=BB0_9 Depth=1 - movl -68(%rsp), %edx # 4-byte Reload - cmpl 1308(%rsp), %edx # 4-byte 
Folded Reload - movl -100(%rsp), %edx # 4-byte Reload - movl -96(%rsp), %edi # 4-byte Reload - jge .LBB0_11 - .align 16, 0x90 -.LBB0_16: # %for_loop39.lr.ph.us - # Parent Loop BB0_9 Depth=1 - # => This Loop Header: Depth=2 - # Child Loop BB0_12 Depth 3 - movl %edi, -84(%rsp) # 4-byte Spill - movl %edx, -80(%rsp) # 4-byte Spill - movl %edx, %r13d - movl -68(%rsp), %ecx # 4-byte Reload - .align 16, 0x90 -.LBB0_12: # %for_loop39.us - # Parent Loop BB0_9 Depth=1 - # Parent Loop BB0_16 Depth=2 - # => This Inner Loop Header: Depth=3 - movl %ecx, 1216(%rsp) # 4-byte Spill - vmovups 1248(%rsp), %ymm3 # 32-byte Folded Reload - vmovups %ymm3, 1248(%rsp) # 32-byte Folded Spill - vextractf128 $1, %ymm3, %xmm0 - vmovd %ecx, %xmm1 - vpshufd $0, %xmm1, %xmm1 # xmm1 = xmm1[0,0,0,0] - vpaddd .LCPI0_0(%rip), %xmm1, %xmm2 - vpcmpgtd %xmm2, %xmm0, %xmm0 - vpaddd .LCPI0_1(%rip), %xmm1, %xmm1 - vpcmpgtd %xmm1, %xmm3, %xmm1 - vinsertf128 $1, %xmm0, %ymm1, %ymm8 - vmovmskps %ymm8, %ecx - testl %ecx, %ecx - je .LBB0_14 -# BB#13: # %safe_if_run_true.us - # in Loop: Header=BB0_12 Depth=3 - movl 76(%rsp), %esi # 4-byte Reload - leal 8(%rsi,%r13), %edx - movl 68(%rsp), %ecx # 4-byte Reload - leal (%rcx,%r13), %ecx - movl 72(%rsp), %r12d # 4-byte Reload - leal 24(%r12,%r13), %r14d - leal -8(%rsi,%r13), %r8d - movl 52(%rsp), %edi # 4-byte Reload - leal (%rdi,%r13), %edi - leal 8(%r12,%r13), %ebp - leal (%rsi,%r13), %esi - leal 16(%r12,%r13), %r11d - movl 64(%rsp), %ebx # 4-byte Reload - leal (%rbx,%r13), %r9d - movl 44(%rsp), %ebx # 4-byte Reload - leal (%rbx,%r13), %r15d - movl 60(%rsp), %ebx # 4-byte Reload - leal (%rbx,%r13), %r10d - movl 40(%rsp), %ebx # 4-byte Reload - leal (%rbx,%r13), %ebx - movl %ebx, 832(%rsp) # 4-byte Spill - movl 56(%rsp), %ebx # 4-byte Reload - leal (%rbx,%r13), %ebx - movl %ebx, 800(%rsp) # 4-byte Spill - movl 36(%rsp), %ebx # 4-byte Reload - leal (%rbx,%r13), %ebx - movl %ebx, 768(%rsp) # 4-byte Spill - movl 28(%rsp), %ebx # 4-byte Reload - leal (%rbx,%r13), %ebx - movl %ebx, 736(%rsp) # 4-byte Spill - leal (%r12,%r13), %ebx - movl %ebx, 960(%rsp) # 4-byte Spill - movl 48(%rsp), %ebx # 4-byte Reload - leal (%rbx,%r13), %ebx - movl %ebx, 896(%rsp) # 4-byte Spill - movl 32(%rsp), %ebx # 4-byte Reload - leal (%rbx,%r13), %r12d - movl 24(%rsp), %ebx # 4-byte Reload - leal (%rbx,%r13), %ebx - movl %ebx, 992(%rsp) # 4-byte Spill - movslq %edx, %rdx - movq %rdx, 1184(%rsp) # 8-byte Spill - movslq %ecx, %rbx - movq %rbx, 1056(%rsp) # 8-byte Spill - movslq %esi, %rcx - movq %rcx, 1120(%rsp) # 8-byte Spill - vmovupd (%rax,%rbx), %xmm0 - movq %rbx, %rsi - vmovupd 16(%rax,%rdx), %xmm2 - vmovupd (%rax,%rdx), %xmm3 - movslq %ebp, %rdx - movq %rdx, 1152(%rsp) # 8-byte Spill - vmovupd 16(%rax,%rdx), %xmm1 - vmovupd (%rax,%rdx), %xmm4 - vinsertf128 $1, %xmm1, %ymm4, %ymm1 - vinsertf128 $1, %xmm2, %ymm3, %ymm2 - movslq %edi, %rdx - movq %rdx, 928(%rsp) # 8-byte Spill - movslq %r8d, %rbx - movslq %r14d, %r14 - vmovupd 16(%rax,%rsi), %xmm3 - vmovupd 16(%rax,%rcx), %xmm4 - vmovupd (%rax,%rcx), %xmm5 - movslq %r11d, %rcx - movq %rcx, 1088(%rsp) # 8-byte Spill - vmovupd 16(%rax,%rcx), %xmm6 - vmovupd (%rax,%rcx), %xmm7 - vinsertf128 $1, %xmm6, %ymm7, %ymm6 - vinsertf128 $1, %xmm4, %ymm5, %ymm7 - vaddpd %ymm1, %ymm2, %ymm1 - vinsertf128 $1, %xmm3, %ymm0, %ymm3 - movslq %r10d, %rsi - movq %rsi, 864(%rsp) # 8-byte Spill - vmovupd (%rax,%r14), %xmm5 - vmovupd (%rax,%rbx), %xmm4 - vmovupd (%rax,%rdx), %xmm2 - movslq %r15d, %rbp - movslq %r9d, %rcx - movq %rcx, 1048(%rsp) # 8-byte Spill - vmovupd 16(%rax,%rcx), %xmm0 
- vmovupd (%rax,%rcx), %xmm9 - vaddpd %ymm6, %ymm7, %ymm7 - vinsertf128 $1, %xmm0, %ymm9, %ymm9 - vmovupd (%rax,%rbp), %xmm12 - vmovupd (%rax,%rsi), %xmm6 - vmovupd 16(%rax,%rdx), %xmm0 - vaddpd %ymm3, %ymm1, %ymm3 - vinsertf128 $1, 16(%rax,%rsi), %ymm6, %ymm6 - vinsertf128 $1, 16(%rax,%r14), %ymm5, %ymm5 - vinsertf128 $1, 16(%rax,%rbx), %ymm4, %ymm4 - vaddpd %ymm9, %ymm7, %ymm1 - vinsertf128 $1, %xmm0, %ymm2, %ymm2 - movslq 736(%rsp), %r8 # 4-byte Folded Reload - movslq 768(%rsp), %rdx # 4-byte Folded Reload - movslq 800(%rsp), %rdi # 4-byte Folded Reload - vmovupd (%rax,%rdi), %xmm10 - movslq 832(%rsp), %r15 # 4-byte Folded Reload - vmovupd (%rax,%r15), %xmm9 - vmovupd (%rax,%rdx), %xmm7 - vaddpd %ymm5, %ymm4, %ymm4 - vmovupd (%rax,%r8), %xmm11 - vaddpd %ymm6, %ymm3, %ymm5 - vinsertf128 $1, 16(%rax,%rdi), %ymm10, %ymm3 - vinsertf128 $1, 16(%rax,%rbp), %ymm12, %ymm10 - vinsertf128 $1, 16(%rax,%r15), %ymm9, %ymm0 - movslq 896(%rsp), %r11 # 4-byte Folded Reload - vaddpd %ymm2, %ymm1, %ymm1 - movslq 960(%rsp), %rcx # 4-byte Folded Reload - vmovupd (%rax,%rcx), %xmm6 - vaddpd %ymm0, %ymm1, %ymm1 - vinsertf128 $1, 16(%rax,%r8), %ymm11, %ymm2 - vinsertf128 $1, 16(%rax,%rdx), %ymm7, %ymm0 - movslq %r12d, %r12 - vaddpd %ymm10, %ymm5, %ymm7 - vmovupd (%rax,%r11), %xmm5 - vaddpd %ymm3, %ymm4, %ymm3 - vinsertf128 $1, 16(%rax,%r11), %ymm5, %ymm4 - vinsertf128 $1, 16(%rax,%rcx), %ymm6, %ymm9 - vmovupd (%rax,%r12), %xmm5 - movslq 992(%rsp), %rsi # 4-byte Folded Reload - vaddpd %ymm0, %ymm7, %ymm10 - vextractf128 $1, %ymm8, %xmm6 - vaddpd %ymm2, %ymm1, %ymm2 - vpshufd $80, %xmm6, %xmm7 # xmm7 = xmm6[0,0,1,1] - vmulpd %ymm9, %ymm15, %ymm1 - vmovupd (%rax,%rsi), %xmm9 - vaddpd %ymm4, %ymm3, %ymm3 - vinsertf128 $1, 16(%rax,%r12), %ymm5, %ymm4 - vpshufd $80, %xmm8, %xmm5 # xmm5 = xmm8[0,0,1,1] - vpshufd $-6, %xmm6, %xmm0 # xmm0 = xmm6[2,2,3,3] - vpshufd $-6, %xmm8, %xmm6 # xmm6 = xmm8[2,2,3,3] - vinsertf128 $1, %xmm6, %ymm5, %ymm6 - vinsertf128 $1, 16(%rax,%rsi), %ymm9, %ymm5 - vinsertf128 $1, %xmm0, %ymm7, %ymm8 - vmovupd %ymm8, 96(%rsp) # 32-byte Folded Spill - vmovupd 1344(%rsp), %ymm0 # 32-byte Folded Reload - vmovupd %ymm0, 1344(%rsp) # 32-byte Folded Spill - vmovupd %ymm0, 1344(%rsp) # 32-byte Folded Spill - vmulpd %ymm2, %ymm0, %ymm0 - vmulpd %ymm10, %ymm14, %ymm2 - movq 1480(%rsp), %r9 - vmaskmovpd (%r9,%rcx), %ymm6, %ymm7 - vaddpd %ymm1, %ymm2, %ymm1 - vaddpd %ymm1, %ymm0, %ymm0 - vaddpd %ymm4, %ymm3, %ymm3 - vmaskmovpd (%rax,%rcx), %ymm6, %ymm1 - vmulpd %ymm13, %ymm1, %ymm1 - movq 1464(%rsp), %r10 - vmaskmovpd (%r10,%rcx), %ymm6, %ymm2 - vsubpd %ymm7, %ymm1, %ymm1 - vmaskmovpd 32(%r10,%rcx), %ymm8, %ymm4 - vmovupd %ymm4, 992(%rsp) # 32-byte Folded Spill - vaddpd %ymm5, %ymm3, %ymm3 - vmovups 48(%rax,%rsi), %xmm4 - vmovaps %xmm4, 960(%rsp) # 16-byte Spill - vmovupd 1312(%rsp), %ymm4 # 32-byte Folded Reload - vmovupd %ymm4, 1312(%rsp) # 32-byte Folded Spill - vmovupd %ymm4, 1312(%rsp) # 32-byte Folded Spill - vmulpd %ymm3, %ymm4, %ymm3 - vmovups 32(%rax,%rsi), %xmm4 - vmovups %ymm4, 896(%rsp) # 32-byte Folded Spill - vaddpd %ymm3, %ymm0, %ymm0 - vmovups 48(%rax,%r12), %xmm3 - vmovaps %xmm3, 832(%rsp) # 16-byte Spill - vmulpd %ymm2, %ymm0, %ymm0 - vmovups 32(%rax,%r12), %xmm2 - vmovups %ymm2, 800(%rsp) # 32-byte Folded Spill - vaddpd %ymm0, %ymm1, %ymm0 - vmovupd %ymm0, 128(%rsp) # 32-byte Folded Spill - vmovups 48(%rax,%r11), %xmm0 - vmovaps %xmm0, 768(%rsp) # 16-byte Spill - vmovups 32(%rax,%r11), %xmm0 - vmovups %ymm0, 736(%rsp) # 32-byte Folded Spill - vmovups 48(%rax,%rdi), %xmm0 - vmovaps 
%xmm0, 704(%rsp) # 16-byte Spill - vmovups 32(%rax,%rdi), %xmm0 - vmovups %ymm0, 640(%rsp) # 32-byte Folded Spill - vmovups 48(%rax,%rbx), %xmm0 - vmovaps %xmm0, 592(%rsp) # 16-byte Spill - vmovups 32(%rax,%rbx), %xmm0 - vmovups %ymm0, 544(%rsp) # 32-byte Folded Spill - vmovups 48(%rax,%r14), %xmm0 - vmovaps %xmm0, 464(%rsp) # 16-byte Spill - vmovups 32(%rax,%r14), %xmm0 - vmovups %ymm0, 416(%rsp) # 32-byte Folded Spill - vmovups 48(%rax,%rdx), %xmm0 - vmovaps %xmm0, 400(%rsp) # 16-byte Spill - vmovups 32(%rax,%rdx), %xmm0 - vmovups %ymm0, 352(%rsp) # 32-byte Folded Spill - vmovups 48(%rax,%r8), %xmm0 - vmovaps %xmm0, 336(%rsp) # 16-byte Spill - vmovups 32(%rax,%r8), %xmm0 - vmovups %ymm0, 288(%rsp) # 32-byte Folded Spill - vmovups 48(%rax,%rbp), %xmm0 - vmovaps %xmm0, 272(%rsp) # 16-byte Spill - vmovups 32(%rax,%rbp), %xmm0 - vmovups %ymm0, 224(%rsp) # 32-byte Folded Spill - vmaskmovpd 32(%r9,%rcx), %ymm8, %ymm0 - vmovupd %ymm0, 672(%rsp) # 32-byte Folded Spill - vmaskmovpd 32(%rax,%rcx), %ymm8, %ymm0 - vmovupd %ymm0, 608(%rsp) # 32-byte Folded Spill - vmovups 48(%rax,%rcx), %xmm0 - vmovaps %xmm0, 528(%rsp) # 16-byte Spill - vmovups 32(%rax,%rcx), %xmm0 - vmovups %ymm0, 480(%rsp) # 32-byte Folded Spill - vmovups 48(%rax,%r15), %xmm0 - vmovaps %xmm0, 208(%rsp) # 16-byte Spill - vmovups 32(%rax,%r15), %xmm0 - vmovups %ymm0, 160(%rsp) # 32-byte Folded Spill - movq 864(%rsp), %rdx # 8-byte Reload - vmovups 48(%rax,%rdx), %xmm0 - vmovaps %xmm0, 80(%rsp) # 16-byte Spill - vmovups 32(%rax,%rdx), %xmm0 - vmovups %ymm0, 864(%rsp) # 32-byte Folded Spill - movq 928(%rsp), %rdx # 8-byte Reload - vmovupd 48(%rax,%rdx), %xmm4 - vmovupd 32(%rax,%rdx), %xmm9 - movq 1056(%rsp), %rdx # 8-byte Reload - vmovupd 48(%rax,%rdx), %xmm5 - vmovupd 32(%rax,%rdx), %xmm11 - movq 1048(%rsp), %rdx # 8-byte Reload - vmovupd 48(%rax,%rdx), %xmm13 - vmovupd 32(%rax,%rdx), %xmm7 - movq 1184(%rsp), %rdx # 8-byte Reload - vmovupd 48(%rax,%rdx), %xmm15 - vmovupd 32(%rax,%rdx), %xmm10 - movq 1152(%rsp), %rdx # 8-byte Reload - vmovupd 48(%rax,%rdx), %xmm12 - vmovupd 32(%rax,%rdx), %xmm14 - movq 1120(%rsp), %rdx # 8-byte Reload - vmovupd 48(%rax,%rdx), %xmm0 - vmovupd 32(%rax,%rdx), %xmm1 - movq 1088(%rsp), %rdx # 8-byte Reload - vmovupd 48(%rax,%rdx), %xmm2 - vmovupd 32(%rax,%rdx), %xmm3 - vmovupd 128(%rsp), %ymm8 # 32-byte Folded Reload - vmaskmovpd %ymm8, %ymm6, (%r9,%rcx) - vinsertf128 $1, %xmm2, %ymm3, %ymm2 - vinsertf128 $1, %xmm0, %ymm1, %ymm0 - vaddpd %ymm2, %ymm0, %ymm1 - vinsertf128 $1, %xmm12, %ymm14, %ymm0 - vinsertf128 $1, %xmm15, %ymm10, %ymm2 - vaddpd %ymm0, %ymm2, %ymm0 - vinsertf128 $1, %xmm13, %ymm7, %ymm2 - vinsertf128 $1, %xmm5, %ymm11, %ymm3 - vaddpd %ymm3, %ymm0, %ymm5 - vaddpd %ymm2, %ymm1, %ymm0 - vinsertf128 $1, %xmm4, %ymm9, %ymm1 - vaddpd %ymm1, %ymm0, %ymm0 - vmovupd 864(%rsp), %ymm1 # 32-byte Folded Reload - vinsertf128 $1, 80(%rsp), %ymm1, %ymm1 # 16-byte Folded Reload - vmovupd 160(%rsp), %ymm2 # 32-byte Folded Reload - vinsertf128 $1, 208(%rsp), %ymm2, %ymm2 # 16-byte Folded Reload - vaddpd %ymm2, %ymm0, %ymm0 - vaddpd %ymm1, %ymm5, %ymm1 - vmovupd 224(%rsp), %ymm2 # 32-byte Folded Reload - vinsertf128 $1, 272(%rsp), %ymm2, %ymm2 # 16-byte Folded Reload - vaddpd %ymm2, %ymm1, %ymm1 - vmovupd 288(%rsp), %ymm2 # 32-byte Folded Reload - vinsertf128 $1, 336(%rsp), %ymm2, %ymm2 # 16-byte Folded Reload - vmovupd 352(%rsp), %ymm3 # 32-byte Folded Reload - vinsertf128 $1, 400(%rsp), %ymm3, %ymm3 # 16-byte Folded Reload - vaddpd %ymm3, %ymm1, %ymm1 - vaddpd %ymm2, %ymm0, %ymm2 - vmovupd 416(%rsp), %ymm0 # 
32-byte Folded Reload - vinsertf128 $1, 464(%rsp), %ymm0, %ymm0 # 16-byte Folded Reload - vmovupd 544(%rsp), %ymm3 # 32-byte Folded Reload - vinsertf128 $1, 592(%rsp), %ymm3, %ymm3 # 16-byte Folded Reload - vaddpd %ymm0, %ymm3, %ymm0 - vmovupd 640(%rsp), %ymm3 # 32-byte Folded Reload - vinsertf128 $1, 704(%rsp), %ymm3, %ymm3 # 16-byte Folded Reload - vaddpd %ymm3, %ymm0, %ymm0 - vmovupd 1344(%rsp), %ymm3 # 32-byte Folded Reload - vmulpd %ymm2, %ymm3, %ymm2 - vmovupd -64(%rsp), %ymm3 # 32-byte Folded Reload - vmulpd %ymm1, %ymm3, %ymm1 - vmovapd %ymm3, %ymm14 - vmovupd 736(%rsp), %ymm3 # 32-byte Folded Reload - vinsertf128 $1, 768(%rsp), %ymm3, %ymm3 # 16-byte Folded Reload - vmovupd 480(%rsp), %ymm4 # 32-byte Folded Reload - vinsertf128 $1, 528(%rsp), %ymm4, %ymm4 # 16-byte Folded Reload - vmovupd -32(%rsp), %ymm5 # 32-byte Folded Reload - vmulpd %ymm4, %ymm5, %ymm4 - vmovapd %ymm5, %ymm15 - vaddpd %ymm4, %ymm1, %ymm1 - vmovapd .LCPI0_2(%rip), %ymm5 - vmovupd 608(%rsp), %ymm4 # 32-byte Folded Reload - vmulpd %ymm5, %ymm4, %ymm4 - vmovapd %ymm5, %ymm13 - vaddpd %ymm1, %ymm2, %ymm2 - vsubpd 672(%rsp), %ymm4, %ymm1 # 32-byte Folded Reload - vaddpd %ymm3, %ymm0, %ymm0 - vmovupd 800(%rsp), %ymm3 # 32-byte Folded Reload - vinsertf128 $1, 832(%rsp), %ymm3, %ymm3 # 16-byte Folded Reload - vaddpd %ymm3, %ymm0, %ymm0 - vmovupd 896(%rsp), %ymm3 # 32-byte Folded Reload - vinsertf128 $1, 960(%rsp), %ymm3, %ymm3 # 16-byte Folded Reload - vaddpd %ymm3, %ymm0, %ymm0 - vmovupd 1312(%rsp), %ymm3 # 32-byte Folded Reload - vmulpd %ymm0, %ymm3, %ymm0 - vaddpd %ymm0, %ymm2, %ymm0 - vmulpd 992(%rsp), %ymm0, %ymm0 # 32-byte Folded Reload - vaddpd %ymm0, %ymm1, %ymm0 - vmovupd 96(%rsp), %ymm1 # 32-byte Folded Reload - vmaskmovpd %ymm0, %ymm1, 32(%r9,%rcx) -.LBB0_14: # %safe_if_after_true.us - # in Loop: Header=BB0_12 Depth=3 - addl $64, %r13d - movl 1216(%rsp), %ecx # 4-byte Reload - addl $8, %ecx - cmpl 1308(%rsp), %ecx # 4-byte Folded Reload - jl .LBB0_12 -# BB#15: # %for_exit40.us - # in Loop: Header=BB0_16 Depth=2 - movl -80(%rsp), %edx # 4-byte Reload - addl -76(%rsp), %edx # 4-byte Folded Reload - movl -84(%rsp), %edi # 4-byte Reload - incl %edi - cmpl -72(%rsp), %edi # 4-byte Folded Reload - movl -92(%rsp), %ecx # 4-byte Reload - movl -88(%rsp), %esi # 4-byte Reload - jne .LBB0_16 -.LBB0_11: # %for_exit33 - # in Loop: Header=BB0_9 Depth=1 - movl -100(%rsp), %edx # 4-byte Reload - addl -104(%rsp), %edx # 4-byte Folded Reload - movl %edx, -100(%rsp) # 4-byte Spill - incl %esi - cmpl %ecx, %esi - jne .LBB0_9 - jmp .LBB0_6 -.LBB0_1: # %for_test264.preheader - cmpl %r9d, %r8d - jge .LBB0_6 -# BB#2: # %for_test275.preheader.lr.ph - leal 2(%r8), %r13d - movl %esi, %r10d - imull %r10d, %r13d - movl %r10d, %ecx - imull %r8d, %ecx - movl %edx, %esi - movl %esi, -96(%rsp) # 4-byte Spill - leal (%rsi,%rcx), %r15d - movl %r9d, -92(%rsp) # 4-byte Spill - leal 2(%rsi,%rcx), %edx - movl %edx, 1248(%rsp) # 4-byte Spill - leal -1(%rsi,%rcx), %edx - movl %edx, 1344(%rsp) # 4-byte Spill - leal 3(%rsi,%rcx), %r12d - leal -2(%rsi,%rcx), %edx - movl %edx, 1312(%rsp) # 4-byte Spill - leal -3(%rsi,%rcx), %edi - addl %esi, %r13d - leal 1(%rsi,%rcx), %ecx - leal -3(%r8), %r14d - imull %r10d, %r14d - leal -2(%r8), %r9d - imull %r10d, %r9d - leal 3(%r8), %ebx - imull %r10d, %ebx - leal -1(%r8), %ebp - imull %r10d, %ebp - leal 1(%r8), %edx - imull %r10d, %edx - addl %esi, %edx - addl %esi, %ebp - addl %esi, %ebx - addl %esi, %r9d - addl %esi, %r14d - vmovd 1308(%rsp), %xmm5 # 4-byte Folded Reload - movl 1440(%rsp), %r11d - imull %r11d, 
%ecx - movl %ecx, 1184(%rsp) # 4-byte Spill - imull %r11d, %r13d - imull %r11d, %edi - movl %edi, 1216(%rsp) # 4-byte Spill - movl 1312(%rsp), %ecx # 4-byte Reload - imull %r11d, %ecx - movl %ecx, 1312(%rsp) # 4-byte Spill - imull %r11d, %r12d - movl 1344(%rsp), %esi # 4-byte Reload - imull %r11d, %esi - movl %esi, 1344(%rsp) # 4-byte Spill - movl 1248(%rsp), %ecx # 4-byte Reload - imull %r11d, %ecx - imull %r11d, %r15d - movl -68(%rsp), %esi # 4-byte Reload - leal (,%rsi,8), %esi - imull %r11d, %r14d - imull %r11d, %r9d - imull %r11d, %ebx - imull %r11d, %ebp - imull %r11d, %edx - leal -16(%rsi,%r15,8), %edi - movl %edi, 672(%rsp) # 4-byte Spill - leal (%rsi,%r15,8), %edi - movl %edi, 640(%rsp) # 4-byte Spill - movl 1184(%rsp), %edi # 4-byte Reload - leal (%rsi,%rdi,8), %edi - movl %edi, 608(%rsp) # 4-byte Spill - movl %r8d, %edi - leal (%rsi,%rcx,8), %ecx - movl %ecx, 592(%rsp) # 4-byte Spill - movl 1344(%rsp), %ecx # 4-byte Reload - leal (%rsi,%rcx,8), %ecx - movl %ecx, 544(%rsp) # 4-byte Spill - leal (%rsi,%r12,8), %ecx - movl %ecx, 528(%rsp) # 4-byte Spill - movl 1312(%rsp), %ecx # 4-byte Reload - leal (%rsi,%rcx,8), %ecx - movl %ecx, 480(%rsp) # 4-byte Spill - movl 1216(%rsp), %ecx # 4-byte Reload - leal (%rsi,%rcx,8), %ecx - movl %ecx, 464(%rsp) # 4-byte Spill - leal (%rsi,%rdx,8), %ecx - movl %ecx, 416(%rsp) # 4-byte Spill - leal (%rsi,%r13,8), %ecx - movl %ecx, 400(%rsp) # 4-byte Spill - leal (%rsi,%rbp,8), %ecx - movl %ecx, 352(%rsp) # 4-byte Spill - leal (%rsi,%rbx,8), %ecx - movl %ecx, 336(%rsp) # 4-byte Spill - leal (%rsi,%r9,8), %ecx - movl %ecx, 288(%rsp) # 4-byte Spill - leal (%rsi,%r14,8), %ecx - movl %ecx, 272(%rsp) # 4-byte Spill - movl $0, 160(%rsp) # 4-byte Folded Spill - imull %r11d, %r10d - shll $3, %r11d - movl %r11d, -76(%rsp) # 4-byte Spill - shll $3, %r10d - movl %r10d, -104(%rsp) # 4-byte Spill - vpermilpd $0, %xmm1, %xmm6 # xmm6 = xmm1[0,0] - vpermilpd $0, %xmm3, %xmm3 # xmm3 = xmm3[0,0] - vpermilpd $0, %xmm2, %xmm1 # xmm1 = xmm2[0,0] - vmovaps %ymm0, %ymm8 - vmovups %ymm8, 704(%rsp) # 32-byte Folded Spill - vextractf128 $1, %ymm8, %xmm7 - vpshufd $80, %xmm8, %xmm0 # xmm0 = xmm8[0,0,1,1] - vinsertf128 $1, %xmm6, %ymm6, %ymm13 - vpshufd $80, %xmm7, %xmm2 # xmm2 = xmm7[0,0,1,1] - vinsertf128 $1, %xmm3, %ymm3, %ymm15 - vpshufd $-6, %xmm7, %xmm3 # xmm3 = xmm7[2,2,3,3] - vinsertf128 $1, %xmm1, %ymm1, %ymm10 - vpshufd $-6, %xmm8, %xmm1 # xmm1 = xmm8[2,2,3,3] - vpshufd $0, %xmm5, %xmm7 # xmm7 = xmm5[0,0,0,0] - vpermilpd $0, %xmm4, %xmm4 # xmm4 = xmm4[0,0] - vinsertf128 $1, %xmm4, %ymm4, %ymm4 - vmovupd %ymm4, 1344(%rsp) # 32-byte Folded Spill - vinsertf128 $1, %xmm3, %ymm2, %ymm5 - vinsertf128 $1, %xmm1, %ymm0, %ymm6 - vinsertf128 $1, %xmm7, %ymm7, %ymm0 - vmovups %ymm0, 1312(%rsp) # 32-byte Folded Spill - vmovapd .LCPI0_2(%rip), %ymm14 - .align 16, 0x90 -.LBB0_3: # %for_test275.preheader - # =>This Loop Header: Depth=1 - # Child Loop BB0_21 Depth 2 - # Child Loop BB0_17 Depth 3 - movl %edi, -88(%rsp) # 4-byte Spill - movl -96(%rsp), %ecx # 4-byte Reload - cmpl -72(%rsp), %ecx # 4-byte Folded Reload - jge .LBB0_5 -# BB#4: # %for_test286.preheader.lr.ph - # in Loop: Header=BB0_3 Depth=1 - movl -68(%rsp), %ecx # 4-byte Reload - cmpl 1308(%rsp), %ecx # 4-byte Folded Reload - movl 160(%rsp), %ecx # 4-byte Reload - movl -96(%rsp), %edx # 4-byte Reload - jge .LBB0_5 - .align 16, 0x90 -.LBB0_21: # %for_loop288.lr.ph.us - # Parent Loop BB0_3 Depth=1 - # => This Loop Header: Depth=2 - # Child Loop BB0_17 Depth 3 - movl %edx, 208(%rsp) # 4-byte Spill - movl %ecx, 224(%rsp) # 
4-byte Spill - movl %ecx, %r9d - movl -68(%rsp), %r15d # 4-byte Reload - .align 16, 0x90 -.LBB0_17: # %for_loop288.us - # Parent Loop BB0_3 Depth=1 - # Parent Loop BB0_21 Depth=2 - # => This Inner Loop Header: Depth=3 - vmovups 1312(%rsp), %ymm3 # 32-byte Folded Reload - vmovups %ymm3, 1312(%rsp) # 32-byte Folded Spill - vextractf128 $1, %ymm3, %xmm0 - vmovd %r15d, %xmm1 - vpshufd $0, %xmm1, %xmm1 # xmm1 = xmm1[0,0,0,0] - vpaddd .LCPI0_0(%rip), %xmm1, %xmm2 - vpcmpgtd %xmm2, %xmm0, %xmm0 - vpaddd .LCPI0_1(%rip), %xmm1, %xmm1 - vpcmpgtd %xmm1, %xmm3, %xmm1 - vinsertf128 $1, %xmm0, %ymm1, %ymm0 - vandps 704(%rsp), %ymm0, %ymm11 # 32-byte Folded Reload - vmovmskps %ymm11, %ecx - testl %ecx, %ecx - je .LBB0_19 -# BB#18: # %safe_if_run_true467.us - # in Loop: Header=BB0_17 Depth=3 - movl 640(%rsp), %r11d # 4-byte Reload - leal 24(%r11,%r9), %ecx - movl 528(%rsp), %edx # 4-byte Reload - leal (%rdx,%r9), %ebx - leal 8(%r11,%r9), %edx - movl %edx, 1088(%rsp) # 4-byte Spill - movl 608(%rsp), %edx # 4-byte Reload - leal (%rdx,%r9), %edx - movl %edx, 1048(%rsp) # 4-byte Spill - movl 480(%rsp), %edx # 4-byte Reload - leal (%rdx,%r9), %edx - movl %edx, 768(%rsp) # 4-byte Spill - movl 464(%rsp), %edx # 4-byte Reload - leal (%rdx,%r9), %r14d - movl 592(%rsp), %esi # 4-byte Reload - leal (%rsi,%r9), %esi - movl 672(%rsp), %edi # 4-byte Reload - leal (%rdi,%r9), %ebp - leal 16(%r11,%r9), %r12d - leal -8(%rdi,%r9), %r13d - movl 336(%rsp), %edx # 4-byte Reload - leal (%rdx,%r9), %edx - movl %edx, 832(%rsp) # 4-byte Spill - movl 272(%rsp), %edx # 4-byte Reload - leal (%rdx,%r9), %edx - movl %edx, 800(%rsp) # 4-byte Spill - leal 8(%rdi,%r9), %r10d - movl 544(%rsp), %r8d # 4-byte Reload - leal (%r8,%r9), %edx - movl %edx, 960(%rsp) # 4-byte Spill - leal (%r11,%r9), %edx - movl %edx, 928(%rsp) # 4-byte Spill - movl 416(%rsp), %edi # 4-byte Reload - leal (%rdi,%r9), %edx - movl %edx, 896(%rsp) # 4-byte Spill - movl 400(%rsp), %edi # 4-byte Reload - leal (%rdi,%r9), %edx - movl %edx, 864(%rsp) # 4-byte Spill - movl 288(%rsp), %edx # 4-byte Reload - leal (%rdx,%r9), %edx - movl %edx, 992(%rsp) # 4-byte Spill - movl 352(%rsp), %edi # 4-byte Reload - leal (%rdi,%r9), %r8d - movslq %ecx, %rcx - movq %rcx, 1184(%rsp) # 8-byte Spill - vmaskmovpd (%rax,%rcx), %ymm6, %ymm0 - movslq %r13d, %rcx - movq %rcx, 1152(%rsp) # 8-byte Spill - vmaskmovpd (%rax,%rcx), %ymm6, %ymm1 - vaddpd %ymm0, %ymm1, %ymm0 - movslq %r12d, %rcx - movq %rcx, 1248(%rsp) # 8-byte Spill - movslq %ebx, %rdx - movq %rdx, 1120(%rsp) # 8-byte Spill - vmaskmovpd (%rax,%rdx), %ymm6, %ymm1 - vaddpd %ymm1, %ymm0, %ymm0 - vmaskmovpd (%rax,%rcx), %ymm6, %ymm1 - movslq %ebp, %rcx - movq %rcx, 1216(%rsp) # 8-byte Spill - vmaskmovpd (%rax,%rcx), %ymm6, %ymm2 - vaddpd %ymm1, %ymm2, %ymm1 - movslq %esi, %rsi - movq %rsi, 1056(%rsp) # 8-byte Spill - movslq %r14d, %rdx - vmaskmovpd (%rax,%rdx), %ymm6, %ymm2 - vaddpd %ymm2, %ymm0, %ymm0 - movslq 768(%rsp), %rcx # 4-byte Folded Reload - movslq 1048(%rsp), %rdi # 4-byte Folded Reload - movq %rdi, 1048(%rsp) # 8-byte Spill - vmaskmovpd (%rax,%rsi), %ymm6, %ymm2 - movslq 1088(%rsp), %rsi # 4-byte Folded Reload - movq %rsi, 1088(%rsp) # 8-byte Spill - vmaskmovpd (%rax,%rsi), %ymm6, %ymm3 - movslq %r10d, %r11 - vmaskmovpd (%rax,%r11), %ymm6, %ymm4 - vaddpd %ymm3, %ymm4, %ymm3 - vaddpd %ymm2, %ymm1, %ymm1 - movslq 800(%rsp), %rsi # 4-byte Folded Reload - vmaskmovpd (%rax,%rdi), %ymm6, %ymm7 - vmaskmovpd (%rax,%rcx), %ymm6, %ymm2 - movslq 832(%rsp), %rdi # 4-byte Folded Reload - vmaskmovpd (%rax,%rdi), %ymm6, %ymm8 - vpshufd 
$80, %xmm11, %xmm4 # xmm4 = xmm11[0,0,1,1] - vaddpd %ymm8, %ymm0, %ymm0 - vaddpd %ymm2, %ymm1, %ymm2 - vaddpd %ymm7, %ymm3, %ymm3 - vmaskmovpd (%rax,%rsi), %ymm6, %ymm1 - movslq 864(%rsp), %r12 # 4-byte Folded Reload - movslq 896(%rsp), %rbx # 4-byte Folded Reload - vpshufd $-6, %xmm11, %xmm7 # xmm7 = xmm11[2,2,3,3] - vinsertf128 $1, %xmm7, %ymm4, %ymm12 - movslq 928(%rsp), %r13 # 4-byte Folded Reload - movslq 960(%rsp), %r10 # 4-byte Folded Reload - vmaskmovpd (%rax,%r10), %ymm6, %ymm4 - vaddpd %ymm4, %ymm3, %ymm4 - vmaskmovpd (%rax,%r13), %ymm12, %ymm7 - vmaskmovpd (%rax,%rbx), %ymm6, %ymm8 - vextractf128 $1, %ymm11, %xmm3 - vmaskmovpd (%rax,%r12), %ymm6, %ymm9 - vaddpd %ymm9, %ymm2, %ymm2 - movslq 992(%rsp), %rbp # 4-byte Folded Reload - vmaskmovpd (%rax,%rbp), %ymm6, %ymm9 - vaddpd %ymm9, %ymm2, %ymm2 - vaddpd %ymm1, %ymm0, %ymm1 - vmulpd %ymm14, %ymm7, %ymm0 - vaddpd %ymm8, %ymm4, %ymm4 - vmaskmovpd (%rax,%r13), %ymm6, %ymm7 - movslq %r8d, %r8 - vmaskmovpd (%rax,%r8), %ymm6, %ymm8 - vaddpd %ymm8, %ymm4, %ymm8 - vmovapd %ymm10, %ymm14 - vmulpd %ymm7, %ymm14, %ymm7 - vpshufd $-6, %xmm3, %xmm4 # xmm4 = xmm3[2,2,3,3] - vpshufd $80, %xmm3, %xmm3 # xmm3 = xmm3[0,0,1,1] - movq 1480(%rsp), %r14 - vmaskmovpd (%r14,%r13), %ymm12, %ymm9 - vsubpd %ymm9, %ymm0, %ymm0 - vmulpd %ymm1, %ymm13, %ymm1 - vmulpd %ymm2, %ymm15, %ymm2 - vmovupd 1344(%rsp), %ymm9 # 32-byte Folded Reload - vmovupd %ymm9, 1344(%rsp) # 32-byte Folded Spill - vmulpd %ymm8, %ymm9, %ymm8 - vaddpd %ymm7, %ymm8, %ymm7 - vmaskmovpd 32(%rax,%rsi), %ymm5, %ymm8 - vmovupd %ymm8, 992(%rsp) # 32-byte Folded Spill - vinsertf128 $1, %xmm4, %ymm3, %ymm11 - vmaskmovpd 32(%rax,%rdi), %ymm5, %ymm3 - vmovupd %ymm3, 960(%rsp) # 32-byte Folded Spill - vaddpd %ymm7, %ymm2, %ymm2 - vmaskmovpd 32(%rax,%rdx), %ymm5, %ymm3 - vmovupd %ymm3, 928(%rsp) # 32-byte Folded Spill - vaddpd %ymm1, %ymm2, %ymm1 - movq 1464(%rsp), %rdx - vmaskmovpd (%rdx,%r13), %ymm12, %ymm2 - vmulpd %ymm2, %ymm1, %ymm1 - movq 1120(%rsp), %rsi # 8-byte Reload - vmaskmovpd 32(%rax,%rsi), %ymm5, %ymm2 - vmovupd %ymm2, 1120(%rsp) # 32-byte Folded Spill - vaddpd %ymm1, %ymm0, %ymm0 - vmovupd %ymm0, 736(%rsp) # 32-byte Folded Spill - vmaskmovpd 32(%rax,%r8), %ymm5, %ymm0 - vmovupd %ymm0, 896(%rsp) # 32-byte Folded Spill - movq 1184(%rsp), %rsi # 8-byte Reload - vmaskmovpd 32(%rax,%rsi), %ymm5, %ymm0 - vmovupd %ymm0, 1184(%rsp) # 32-byte Folded Spill - movq 1152(%rsp), %rsi # 8-byte Reload - vmaskmovpd 32(%rax,%rsi), %ymm5, %ymm0 - vmovupd %ymm0, 1152(%rsp) # 32-byte Folded Spill - vmaskmovpd 32(%rax,%rbp), %ymm5, %ymm0 - vmovupd %ymm0, 832(%rsp) # 32-byte Folded Spill - vmaskmovpd 32(%rax,%r12), %ymm5, %ymm0 - vmovupd %ymm0, 800(%rsp) # 32-byte Folded Spill - vmaskmovpd 32(%rax,%rcx), %ymm5, %ymm0 - vmovupd %ymm0, 768(%rsp) # 32-byte Folded Spill - vmaskmovpd 32(%rax,%r13), %ymm5, %ymm0 - vmovupd %ymm0, 864(%rsp) # 32-byte Folded Spill - movq 1056(%rsp), %rcx # 8-byte Reload - vmaskmovpd 32(%rax,%rcx), %ymm5, %ymm0 - vmovupd %ymm0, 1056(%rsp) # 32-byte Folded Spill - vmaskmovpd 32(%rax,%rbx), %ymm5, %ymm7 - vmaskmovpd 32(%rax,%r10), %ymm5, %ymm10 - movq 1048(%rsp), %rcx # 8-byte Reload - vmaskmovpd 32(%rax,%rcx), %ymm5, %ymm0 - movq 1088(%rsp), %rcx # 8-byte Reload - vmaskmovpd 32(%rax,%rcx), %ymm5, %ymm1 - vmaskmovpd 32(%rax,%r11), %ymm5, %ymm2 - movq 1248(%rsp), %rcx # 8-byte Reload - vmaskmovpd 32(%rax,%rcx), %ymm5, %ymm3 - movq 1216(%rsp), %rcx # 8-byte Reload - vmaskmovpd 32(%rax,%rcx), %ymm5, %ymm4 - vmaskmovpd 32(%rdx,%r13), %ymm11, %ymm8 - vmovupd %ymm8, 1248(%rsp) # 32-byte 
Folded Spill - vmaskmovpd 32(%r14,%r13), %ymm11, %ymm8 - vmovupd %ymm8, 1216(%rsp) # 32-byte Folded Spill - vmaskmovpd 32(%rax,%r13), %ymm11, %ymm8 - vmovupd %ymm8, 1088(%rsp) # 32-byte Folded Spill - vmovupd 736(%rsp), %ymm8 # 32-byte Folded Reload - vmaskmovpd %ymm8, %ymm12, (%r14,%r13) - vaddpd %ymm3, %ymm4, %ymm3 - vaddpd %ymm1, %ymm2, %ymm1 - vaddpd %ymm0, %ymm1, %ymm0 - vaddpd %ymm10, %ymm0, %ymm0 - vaddpd %ymm7, %ymm0, %ymm1 - vaddpd 1056(%rsp), %ymm3, %ymm0 # 32-byte Folded Reload - vaddpd 768(%rsp), %ymm0, %ymm0 # 32-byte Folded Reload - vaddpd 800(%rsp), %ymm0, %ymm0 # 32-byte Folded Reload - vaddpd 832(%rsp), %ymm0, %ymm2 # 32-byte Folded Reload - vmovupd 1152(%rsp), %ymm0 # 32-byte Folded Reload - vaddpd 1184(%rsp), %ymm0, %ymm0 # 32-byte Folded Reload - vmulpd %ymm2, %ymm15, %ymm2 - vaddpd 896(%rsp), %ymm1, %ymm1 # 32-byte Folded Reload - vmulpd %ymm1, %ymm9, %ymm1 - vmulpd 864(%rsp), %ymm14, %ymm3 # 32-byte Folded Reload - vmovapd %ymm14, %ymm10 - vaddpd %ymm3, %ymm1, %ymm3 - vmovapd .LCPI0_2(%rip), %ymm4 - vmovupd 1088(%rsp), %ymm1 # 32-byte Folded Reload - vmulpd %ymm4, %ymm1, %ymm1 - vmovapd %ymm4, %ymm14 - vsubpd 1216(%rsp), %ymm1, %ymm1 # 32-byte Folded Reload - vaddpd %ymm3, %ymm2, %ymm2 - vaddpd 1120(%rsp), %ymm0, %ymm0 # 32-byte Folded Reload - vaddpd 928(%rsp), %ymm0, %ymm0 # 32-byte Folded Reload - vaddpd 960(%rsp), %ymm0, %ymm0 # 32-byte Folded Reload - vaddpd 992(%rsp), %ymm0, %ymm0 # 32-byte Folded Reload - vmulpd %ymm0, %ymm13, %ymm0 - vaddpd %ymm0, %ymm2, %ymm0 - vmulpd 1248(%rsp), %ymm0, %ymm0 # 32-byte Folded Reload - vaddpd %ymm0, %ymm1, %ymm0 - vmaskmovpd %ymm0, %ymm11, 32(%r14,%r13) -.LBB0_19: # %safe_if_after_true466.us - # in Loop: Header=BB0_17 Depth=3 - addl $64, %r9d - addl $8, %r15d - cmpl 1308(%rsp), %r15d # 4-byte Folded Reload - jl .LBB0_17 -# BB#20: # %for_exit289.us - # in Loop: Header=BB0_21 Depth=2 - movl 224(%rsp), %ecx # 4-byte Reload - addl -76(%rsp), %ecx # 4-byte Folded Reload - movl 208(%rsp), %edx # 4-byte Reload - incl %edx - cmpl -72(%rsp), %edx # 4-byte Folded Reload - jne .LBB0_21 -.LBB0_5: # %for_exit278 - # in Loop: Header=BB0_3 Depth=1 - movl 160(%rsp), %ecx # 4-byte Reload - addl -104(%rsp), %ecx # 4-byte Folded Reload - movl %ecx, 160(%rsp) # 4-byte Spill - movl -88(%rsp), %edi # 4-byte Reload - incl %edi - movl -92(%rsp), %ecx # 4-byte Reload - cmpl %ecx, %edi - jne .LBB0_3 -.LBB0_6: # %for_exit - addq $1384, %rsp # imm = 0x568 - popq %rbx - popq %r12 - popq %r13 - popq %r14 - popq %r15 - popq %rbp - vzeroupper - ret -.Ltmp0: - .size stencil_step___uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_, .Ltmp0-stencil_step___uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_ - - .align 16, 0x90 - .type stencil_step_task___uniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_,@function -stencil_step_task___uniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_: # @stencil_step_task___uniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_ -# BB#0: # %allocas - pushq %rbp - pushq %r15 - pushq %r14 - pushq %rbx - subq $56, %rsp - movq %rdi, %rax - movl 16(%rax), %r8d - movq 56(%rax), %rbx - movq 48(%rax), %r15 - movq 40(%rax), %r14 - movq 32(%rax), %r11 - leal 1(%r8,%rcx), %r9d - movl 24(%rax), %r10d - vmovaps 64(%rax), %ymm0 - addl %ecx, %r8d - movl 20(%rax), %ebp - movl 12(%rax), %ecx - movl 8(%rax), %edx - movl (%rax), %edi - movl 4(%rax), %esi - vmovmskps %ymm0, %eax - cmpl $255, %eax - jne 
.LBB1_2 -# BB#1: # %all_on - vpcmpeqd %xmm0, %xmm0, %xmm0 - movq %rbx, 40(%rsp) - movq %r15, 32(%rsp) - movq %r14, 24(%rsp) - movq %r11, 16(%rsp) - movl %r10d, 8(%rsp) - movl %ebp, (%rsp) - vinsertf128 $1, %xmm0, %ymm0, %ymm0 - jmp .LBB1_3 -.LBB1_2: # %some_on - movq %rbx, 40(%rsp) - movq %r15, 32(%rsp) - movq %r14, 24(%rsp) - movq %r11, 16(%rsp) - movl %r10d, 8(%rsp) - movl %ebp, (%rsp) -.LBB1_3: # %some_on - callq stencil_step___uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_ - addq $56, %rsp - popq %rbx - popq %r14 - popq %r15 - popq %rbp - vzeroupper - ret -.Ltmp1: - .size stencil_step_task___uniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_, .Ltmp1-stencil_step_task___uniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_ - - .globl loop_stencil_ispc_tasks - .align 16, 0x90 - .type loop_stencil_ispc_tasks,@function -loop_stencil_ispc_tasks: # @loop_stencil_ispc_tasks -# BB#0: # %allocas - pushq %rbp - pushq %r15 - pushq %r14 - pushq %r13 - pushq %r12 - pushq %rbx - subq $104, %rsp - movl %r9d, 92(%rsp) # 4-byte Spill - movl %r8d, 88(%rsp) # 4-byte Spill - movl %ecx, 84(%rsp) # 4-byte Spill - movl %edx, 80(%rsp) # 4-byte Spill - movl %esi, %ebx - movl %edi, %ebp - movq $0, 96(%rsp) - cmpl %ebx, %ebp - jge .LBB2_10 -# BB#1: # %for_loop.lr.ph - movq 216(%rsp), %r13 - movl 168(%rsp), %r14d - movl 160(%rsp), %r12d - subl %r12d, %r14d - leaq 96(%rsp), %r15 - vpcmpeqd %xmm0, %xmm0, %xmm0 - vinsertf128 $1, %xmm0, %ymm0, %ymm1 - vmovups %ymm1, 32(%rsp) # 32-byte Folded Spill - vinsertf128 $1, %xmm0, %ymm0, %ymm0 - vmovups %ymm0, (%rsp) # 32-byte Folded Spill - .align 16, 0x90 -.LBB2_2: # %for_loop - # =>This Inner Loop Header: Depth=1 - movq %r15, %rdi - movl $96, %esi - movl $32, %edx - vzeroupper - callq ISPCAlloc - movq %rax, %rdx - movl 80(%rsp), %eax # 4-byte Reload - movl %eax, (%rdx) - movl 84(%rsp), %eax # 4-byte Reload - movl %eax, 4(%rdx) - movl 88(%rsp), %eax # 4-byte Reload - movl %eax, 8(%rdx) - movl 92(%rsp), %eax # 4-byte Reload - movl %eax, 12(%rdx) - movl %r12d, 16(%rdx) - movl 176(%rsp), %eax - movl %eax, 20(%rdx) - movl 184(%rsp), %eax - movl %eax, 24(%rdx) - testb $1, %bpl - movl 192(%rsp), %eax - movl %eax, 28(%rdx) - movq 200(%rsp), %rax - movq %rax, 32(%rdx) - movq 208(%rsp), %rax - movq %rax, 40(%rdx) - jne .LBB2_4 -# BB#3: # %if_then - # in Loop: Header=BB2_2 Depth=1 - movq %r13, 48(%rdx) - movq 224(%rsp), %rax - movq %rax, 56(%rdx) - vmovups 32(%rsp), %ymm0 # 32-byte Folded Reload - jmp .LBB2_5 - .align 16, 0x90 -.LBB2_4: # %if_else - # in Loop: Header=BB2_2 Depth=1 - movq 224(%rsp), %rax - movq %rax, 48(%rdx) - movq %r13, 56(%rdx) - vmovups (%rsp), %ymm0 # 32-byte Folded Reload -.LBB2_5: # %if_else - # in Loop: Header=BB2_2 Depth=1 - vmovaps %ymm0, 64(%rdx) - movq %r15, %rdi - movl $stencil_step_task___uniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_, %esi - movl %r14d, %ecx - movl $1, %r8d - movl $1, %r9d - vzeroupper - callq ISPCLaunch - movq 96(%rsp), %rdi - testq %rdi, %rdi - je .LBB2_7 -# BB#6: # %call_sync - # in Loop: Header=BB2_2 Depth=1 - callq ISPCSync - movq $0, 96(%rsp) -.LBB2_7: # %post_sync - # in Loop: Header=BB2_2 Depth=1 - incl %ebp - cmpl %ebp, %ebx - jne .LBB2_2 -# BB#8: # %for_exit - movq 96(%rsp), %rdi - testq %rdi, %rdi - je .LBB2_10 -# BB#9: # %call_sync72 - callq ISPCSync - movq $0, 96(%rsp) -.LBB2_10: # %post_sync73 - addq $104, %rsp - popq %rbx - popq %r12 - popq %r13 - popq %r14 - popq %r15 - popq %rbp - ret -.Ltmp2: - 
.size loop_stencil_ispc_tasks, .Ltmp2-loop_stencil_ispc_tasks - - - .section ".note.GNU-stack","",@progbits diff --git a/examples_cuda/stencil/stencil_cu_avx.bc b/examples_cuda/stencil/stencil_cu_avx.bc deleted file mode 100644 index d9338e7c..00000000 Binary files a/examples_cuda/stencil/stencil_cu_avx.bc and /dev/null differ diff --git a/examples_cuda/stencil/stencil_cu_avx.s b/examples_cuda/stencil/stencil_cu_avx.s deleted file mode 100644 index 774d0a55..00000000 --- a/examples_cuda/stencil/stencil_cu_avx.s +++ /dev/null @@ -1,214 +0,0 @@ - .file "stencil.ispc" - .text - .globl loop_stencil_ispc_tasks - .align 16, 0x90 - .type loop_stencil_ispc_tasks,@function -loop_stencil_ispc_tasks: # @loop_stencil_ispc_tasks -# BB#0: # %allocas - pushq %rbp - movq %rsp, %rbp - pushq %r15 - pushq %r14 - pushq %r13 - pushq %r12 - pushq %rbx - andq $-32, %rsp - subq $384, %rsp # imm = 0x180 - movl %r9d, 28(%rsp) # 4-byte Spill - movl %r8d, 24(%rsp) # 4-byte Spill - movl %ecx, 20(%rsp) # 4-byte Spill - movl %edx, %ebx - movl %esi, 16(%rsp) # 4-byte Spill - movl %edi, %r13d - movq $0, 352(%rsp) - cmpl %esi, %r13d - jge .LBB0_10 -# BB#1: # %for_loop.lr.ph - movl 24(%rbp), %r14d - movl 16(%rbp), %r15d - subl %r15d, %r14d - leaq 352(%rsp), %rax - .align 16, 0x90 -.LBB0_2: # %for_loop - # =>This Inner Loop Header: Depth=1 - movq %rax, %r12 - movq %r12, %rdi - movl $96, %esi - movl $32, %edx - callq CUDAAlloc - testb $1, %r13b - jne .LBB0_4 -# BB#3: # %if_then - # in Loop: Header=BB0_2 Depth=1 - movl %ebx, 252(%rsp) - leaq 252(%rsp), %rax - movq %rax, 256(%rsp) - movl 20(%rsp), %eax # 4-byte Reload - movl %eax, 248(%rsp) - leaq 248(%rsp), %rax - movq %rax, 264(%rsp) - movl 24(%rsp), %eax # 4-byte Reload - movl %eax, 244(%rsp) - leaq 244(%rsp), %rax - movq %rax, 272(%rsp) - movl 28(%rsp), %eax # 4-byte Reload - movl %eax, 240(%rsp) - leaq 240(%rsp), %rax - movq %rax, 280(%rsp) - movl %r15d, 236(%rsp) - leaq 236(%rsp), %rax - movq %rax, 288(%rsp) - movl 32(%rbp), %eax - movl %eax, 232(%rsp) - leaq 232(%rsp), %rax - movq %rax, 296(%rsp) - movl 40(%rbp), %eax - movl %eax, 228(%rsp) - leaq 228(%rsp), %rax - movq %rax, 304(%rsp) - movl 48(%rbp), %eax - movl %eax, 224(%rsp) - leaq 224(%rsp), %rax - movq %rax, 312(%rsp) - movq 56(%rbp), %rax - movq %rax, 216(%rsp) - leaq 216(%rsp), %rax - movq %rax, 320(%rsp) - movq 64(%rbp), %rax - movq %rax, 208(%rsp) - leaq 208(%rsp), %rax - movq %rax, 328(%rsp) - movq 72(%rbp), %rax - movq %rax, 200(%rsp) - leaq 200(%rsp), %rax - movq %rax, 336(%rsp) - movq 80(%rbp), %rax - movq %rax, 192(%rsp) - leaq 192(%rsp), %rax - movq %rax, 344(%rsp) - movl $1, 8(%rsp) - movl $1, (%rsp) - movq %r12, %rdi - movl $.L.module_str, %esi - movl $.L.ptx_str, %edx - movl $.L.func_str, %ecx - leaq 256(%rsp), %r8 - jmp .LBB0_5 - .align 16, 0x90 -.LBB0_4: # %if_else - # in Loop: Header=BB0_2 Depth=1 - movl %ebx, 92(%rsp) - leaq 92(%rsp), %rax - movq %rax, 96(%rsp) - movl 20(%rsp), %eax # 4-byte Reload - movl %eax, 88(%rsp) - leaq 88(%rsp), %rax - movq %rax, 104(%rsp) - movl 24(%rsp), %eax # 4-byte Reload - movl %eax, 84(%rsp) - leaq 84(%rsp), %rax - movq %rax, 112(%rsp) - movl 28(%rsp), %eax # 4-byte Reload - movl %eax, 80(%rsp) - leaq 80(%rsp), %rax - movq %rax, 120(%rsp) - movl %r15d, 76(%rsp) - leaq 76(%rsp), %rax - movq %rax, 128(%rsp) - movl 32(%rbp), %eax - movl %eax, 72(%rsp) - leaq 72(%rsp), %rax - movq %rax, 136(%rsp) - movl 40(%rbp), %eax - movl %eax, 68(%rsp) - leaq 68(%rsp), %rax - movq %rax, 144(%rsp) - movl 48(%rbp), %eax - movl %eax, 64(%rsp) - leaq 64(%rsp), %rax - movq %rax, 
152(%rsp) - movq 56(%rbp), %rax - movq %rax, 56(%rsp) - leaq 56(%rsp), %rax - movq %rax, 160(%rsp) - movq 64(%rbp), %rax - movq %rax, 48(%rsp) - leaq 48(%rsp), %rax - movq %rax, 168(%rsp) - movq 80(%rbp), %rax - movq %rax, 40(%rsp) - leaq 40(%rsp), %rax - movq %rax, 176(%rsp) - movq 72(%rbp), %rax - movq %rax, 32(%rsp) - leaq 32(%rsp), %rax - movq %rax, 184(%rsp) - movl $1, 8(%rsp) - movl $1, (%rsp) - movq %r12, %rdi - movl $.L.module_str, %esi - movl $.L.ptx_str, %edx - movl $.L.func_str1, %ecx - leaq 96(%rsp), %r8 -.LBB0_5: # %if_else - # in Loop: Header=BB0_2 Depth=1 - movl %r14d, %r9d - callq CUDALaunch - movq 352(%rsp), %rdi - testq %rdi, %rdi - je .LBB0_7 -# BB#6: # %call_sync - # in Loop: Header=BB0_2 Depth=1 - callq ISPCSync - movq $0, 352(%rsp) -.LBB0_7: # %post_sync - # in Loop: Header=BB0_2 Depth=1 - incl %r13d - cmpl %r13d, 16(%rsp) # 4-byte Folded Reload - movq %r12, %rax - jne .LBB0_2 -# BB#8: # %for_exit - movq 352(%rsp), %rdi - testq %rdi, %rdi - je .LBB0_10 -# BB#9: # %call_sync113 - callq ISPCSync - movq $0, 352(%rsp) -.LBB0_10: # %post_sync114 - leaq -40(%rbp), %rsp - popq %rbx - popq %r12 - popq %r13 - popq %r14 - popq %r15 - popq %rbp - ret -.Ltmp0: - .size loop_stencil_ispc_tasks, .Ltmp0-loop_stencil_ispc_tasks - - .type .L.module_str,@object # @.module_str - .section .rodata,"a",@progbits -.L.module_str: - .asciz "stencil.ispc" - .size .L.module_str, 13 - - .type .L.ptx_str,@object # @.ptx_str - .align 16 -.L.ptx_str: - .asciz "//\n// Generated by LLVM NVPTX Back-End\n//\n\n.version 3.1\n.target sm_35, texmode_independent\n.address_size 64\n\n\t// .globl\tstencil_step_task\n // @stencil_step_task\n.entry stencil_step_task(\n\t.param .u32 stencil_step_task_param_0,\n\t.param .u32 stencil_step_task_param_1,\n\t.param .u32 stencil_step_task_param_2,\n\t.param .u32 stencil_step_task_param_3,\n\t.param .u32 stencil_step_task_param_4,\n\t.param .u32 stencil_step_task_param_5,\n\t.param .u32 stencil_step_task_param_6,\n\t.param .u32 stencil_step_task_param_7,\n\t.param .u64 .ptr .align 8 stencil_step_task_param_8,\n\t.param .u64 .ptr .align 8 stencil_step_task_param_9,\n\t.param .u64 .ptr .align 8 stencil_step_task_param_10,\n\t.param .u64 .ptr .align 8 stencil_step_task_param_11\n)\n{\n\t.reg .pred %p<396>;\n\t.reg .s16 %rc<396>;\n\t.reg .s16 %rs<396>;\n\t.reg .s32 %r<396>;\n\t.reg .s64 %rl<396>;\n\t.reg .f32 %f<396>;\n\t.reg .f64 %fl<396>;\n\n// BB#0: // %allocas\n\tmov.u32 \t%r12, %ctaid.x;\n\tld.param.u32 \t%r13, [stencil_step_task_param_4];\n\tadd.s32 \t%r16, %r12, %r13;\n\tadd.s32 \t%r0, %r16, 1;\n\tsetp.ge.s32 \t%p0, %r16, %r0;\n\t@%p0 bra \tBB0_11;\n// BB#1: // %for_test28.i.preheader.lr.ph\n\tld.param.u32 \t%r0, [stencil_step_task_param_0];\n\tld.param.u32 \t%r1, [stencil_step_task_param_1];\n\tld.param.u32 \t%r2, [stencil_step_task_param_2];\n\tld.param.u32 \t%r3, [stencil_step_task_param_3];\n\tld.param.u32 \t%r4, [stencil_step_task_param_5];\n\tld.param.u32 \t%r5, [stencil_step_task_param_6];\n\tmul.lo.s32 \t%r5, %r5, %r4;\n\tld.param.u64 \t%rl3, [stencil_step_task_param_8];\n\tld.f64 \t%fl0, [%rl3];\n\tld.f64 \t%fl1, [%rl3+8];\n\tld.param.u64 \t%rl0, [stencil_step_task_param_9];\n\tld.f64 \t%fl2, [%rl3+16];\n\tld.param.u64 \t%rl1, [stencil_step_task_param_10];\n\tld.param.u64 \t%rl2, [stencil_step_task_param_11];\n\tld.f64 \t%fl3, [%rl3+24];\n\tshl.b32 \t%r6, %r4, 1;\n\tmul.lo.s32 \t%r7, %r4, 3;\n\tmul.lo.s32 \t%r8, %r4, -3;\n\tshl.b32 \t%r9, %r5, 1;\n\tmul.lo.s32 \t%r10, %r5, 3;\n\tmul.lo.s32 \t%r11, %r5, -3;\n\tadd.s32 \t%r12, %r12, %r13;\n\tneg.s32 \t%r13, 
%r9;\n\tneg.s32 \t%r14, %r6;\n\tmov.u32 \t%r32, WARP_SZ;\nBB0_2: // %for_test28.i.preheader\n // =>This Loop Header: Depth=1\n // Child Loop BB0_9 Depth 2\n // Child Loop BB0_5 Depth 3\n\tmov.u32 \t%r15, %r16;\n\tsetp.ge.s32 \t%p0, %r2, %r3;\n\t@%p0 bra \tBB0_10;\n// BB#3: // %for_test35.i.preheader.lr.ph\n // in Loop: Header=BB0_2 Depth=1\n\tsetp.lt.s32 \t%p0, %r0, %r1;\n\t@%p0 bra \tBB0_4;\n\tbra.uni \tBB0_10;\nBB0_4: // in Loop: Header=BB0_2 Depth=1\n\tmul.lo.s32 \t%r16, %r15, %r5;\n\tmov.u32 \t%r17, %r2;\nBB0_9: // %for_loop37.i.lr.ph.us\n // Parent Loop BB0_2 Depth=1\n // => This Loop Header: Depth=2\n // Child Loop BB0_5 Depth 3\n\tmad.lo.s32 \t%r18, %r17, %r4, %r16;\n\tadd.s32 \t%r19, %r18, %r4;\n\tadd.s32 \t%r20, %r18, %r6;\n\tsub.s32 \t%r21, %r18, %r4;\n\tadd.s32 \t%r22, %r18, %r7;\n\tadd.s32 \t%r23, %r18, %r14;\n\tadd.s32 \t%r24, %r18, %r5;\n\tadd.s32 \t%r25, %r18, %r8;\n\tadd.s32 \t%r26, %r18, %r9;\n\tsub.s32 \t%r27, %r18, %r5;\n\tadd.s32 \t%r28, %r18, %r10;\n\tadd.s32 \t%r29, %r18, %r13;\n\tadd.s32 \t%r30, %r18, %r11;\n\tmov.u32 \t%r31, %r0;\nBB0_5: // %for_loop37.i.us\n // Parent Loop BB0_2 Depth=1\n // Parent Loop BB0_9 Depth=2\n // => This Inner Loop Header: Depth=3\n\tmov.u32 \t%r33, %tid.x;\n\tadd.s32 \t%r34, %r32, -1;\n\tand.b32 \t%r33, %r34, %r33;\n\tadd.s32 \t%r33, %r33, %r31;\n\tsetp.ge.s32 \t%p0, %r33, %r1;\n\t@%p0 bra \tBB0_7;\n// BB#6: // %pl_dolane.i.us\n // in Loop: Header=BB0_5 Depth=3\n\tadd.s32 \t%r34, %r18, %r33;\n\tshl.b32 \t%r34, %r34, 3;\n\tadd.s32 \t%r35, %r34, -8;\n\tcvt.s64.s32 \t%rl3, %r35;\n\tadd.s64 \t%rl3, %rl3, %rl1;\n\tld.f64 \t%fl4, [%rl3];\n\tadd.s32 \t%r35, %r34, 8;\n\tcvt.s64.s32 \t%rl3, %r35;\n\tadd.s64 \t%rl3, %rl3, %rl1;\n\tld.f64 \t%fl5, [%rl3];\n\tadd.s32 \t%r35, %r34, -16;\n\tcvt.s64.s32 \t%rl3, %r35;\n\tadd.s64 \t%rl3, %rl3, %rl1;\n\tld.f64 \t%fl6, [%rl3];\n\tadd.s32 \t%r35, %r34, 16;\n\tcvt.s64.s32 \t%rl3, %r35;\n\tadd.s64 \t%rl3, %rl3, %rl1;\n\tld.f64 \t%fl9, [%rl3];\n\tadd.s32 \t%r35, %r19, %r33;\n\tshl.b32 \t%r35, %r35, 3;\n\tcvt.s64.s32 \t%rl3, %r35;\n\tadd.s64 \t%rl3, %rl3, %rl1;\n\tld.f64 \t%fl8, [%rl3];\n\tadd.s32 \t%r35, %r34, -24;\n\tcvt.s64.s32 \t%rl3, %r35;\n\tadd.s64 \t%rl3, %rl3, %rl1;\n\tld.f64 \t%fl7, [%rl3];\n\tadd.s32 \t%r35, %r34, 24;\n\tcvt.s64.s32 \t%rl3, %r35;\n\tadd.s64 \t%rl3, %rl3, %rl1;\n\tld.f64 \t%fl10, [%rl3];\n\tadd.s32 \t%r35, %r20, %r33;\n\tshl.b32 \t%r35, %r35, 3;\n\tcvt.s64.s32 \t%rl3, %r35;\n\tadd.s64 \t%rl3, %rl3, %rl1;\n\tld.f64 \t%fl13, [%rl3];\n\tadd.s32 \t%r35, %r21, %r33;\n\tshl.b32 \t%r35, %r35, 3;\n\tcvt.s64.s32 \t%rl3, %r35;\n\tadd.s64 \t%rl3, %rl3, %rl1;\n\tld.f64 \t%fl12, [%rl3];\n\tadd.s32 \t%r35, %r22, %r33;\n\tshl.b32 \t%r35, %r35, 3;\n\tcvt.s64.s32 \t%rl3, %r35;\n\tadd.s64 \t%rl3, %rl3, %rl1;\n\tld.f64 \t%fl11, [%rl3];\n\tadd.s32 \t%r35, %r23, %r33;\n\tshl.b32 \t%r35, %r35, 3;\n\tcvt.s64.s32 \t%rl3, %r35;\n\tadd.s64 \t%rl3, %rl3, %rl1;\n\tld.f64 \t%fl16, [%rl3];\n\tadd.s32 \t%r35, %r24, %r33;\n\tshl.b32 \t%r35, %r35, 3;\n\tcvt.s64.s32 \t%rl3, %r35;\n\tadd.s64 \t%rl3, %rl3, %rl1;\n\tld.f64 \t%fl15, [%rl3];\n\tadd.s32 \t%r35, %r25, %r33;\n\tshl.b32 \t%r35, %r35, 3;\n\tcvt.s64.s32 \t%rl3, %r35;\n\tadd.s64 \t%rl3, %rl3, %rl1;\n\tld.f64 \t%fl14, [%rl3];\n\tadd.s32 \t%r35, %r26, %r33;\n\tshl.b32 \t%r35, %r35, 3;\n\tcvt.s64.s32 \t%rl3, %r35;\n\tadd.s64 \t%rl3, %rl3, %rl1;\n\tld.f64 \t%fl19, [%rl3];\n\tadd.s32 \t%r35, %r27, %r33;\n\tshl.b32 \t%r35, %r35, 3;\n\tcvt.s64.s32 \t%rl3, %r35;\n\tadd.s64 \t%rl3, %rl3, %rl1;\n\tld.f64 \t%fl18, [%rl3];\n\tadd.s32 \t%r35, %r28, %r33;\n\tshl.b32 \t%r35, 
%r35, 3;\n\tcvt.s64.s32 \t%rl3, %r35;\n\tadd.s64 \t%rl3, %rl3, %rl1;\n\tld.f64 \t%fl17, [%rl3];\n\tadd.s32 \t%r35, %r29, %r33;\n\tshl.b32 \t%r35, %r35, 3;\n\tcvt.s64.s32 \t%rl3, %r35;\n\tadd.s64 \t%rl3, %rl3, %rl1;\n\tld.f64 \t%fl24, [%rl3];\n\tcvt.s64.s32 \t%rl4, %r34;\n\tadd.s64 \t%rl3, %rl4, %rl1;\n\tld.f64 \t%fl21, [%rl3];\n\tadd.s32 \t%r33, %r30, %r33;\n\tshl.b32 \t%r33, %r33, 3;\n\tcvt.s64.s32 \t%rl3, %r33;\n\tadd.s64 \t%rl3, %rl3, %rl1;\n\tld.f64 \t%fl20, [%rl3];\n\tadd.s64 \t%rl3, %rl4, %rl2;\n\tld.f64 \t%fl23, [%rl3];\n\tadd.s64 \t%rl4, %rl4, %rl0;\n\tld.f64 \t%fl22, [%rl4];\n\tadd.f64 \t%fl25, %fl21, %fl21;\n\tsub.f64 \t%fl23, %fl25, %fl23;\n\tadd.f64 \t%fl6, %fl6, %fl9;\n\tadd.f64 \t%fl6, %fl6, %fl13;\n\tadd.f64 \t%fl6, %fl6, %fl16;\n\tadd.f64 \t%fl6, %fl6, %fl19;\n\tadd.f64 \t%fl6, %fl6, %fl24;\n\tadd.f64 \t%fl4, %fl4, %fl5;\n\tadd.f64 \t%fl4, %fl4, %fl8;\n\tadd.f64 \t%fl4, %fl4, %fl12;\n\tadd.f64 \t%fl4, %fl4, %fl15;\n\tadd.f64 \t%fl4, %fl4, %fl18;\n\tmul.f64 \t%fl5, %fl0, %fl21;\n\tfma.rn.f64 \t%fl4, %fl1, %fl4, %fl5;\n\tfma.rn.f64 \t%fl4, %fl2, %fl6, %fl4;\n\tadd.f64 \t%fl5, %fl7, %fl10;\n\tadd.f64 \t%fl5, %fl5, %fl11;\n\tadd.f64 \t%fl5, %fl5, %fl14;\n\tadd.f64 \t%fl5, %fl5, %fl17;\n\tadd.f64 \t%fl5, %fl5, %fl20;\n\tfma.rn.f64 \t%fl4, %fl3, %fl5, %fl4;\n\tfma.rn.f64 \t%fl4, %fl4, %fl22, %fl23;\n\tst.f64 \t[%rl3], %fl4;\nBB0_7: // %safe_if_after_true.i.us\n // in Loop: Header=BB0_5 Depth=3\n\tadd.s32 \t%r31, %r32, %r31;\n\tsetp.lt.s32 \t%p0, %r31, %r1;\n\t@%p0 bra \tBB0_5;\n// BB#8: // %for_exit38.i.us\n // in Loop: Header=BB0_9 Depth=2\n\tadd.s32 \t%r17, %r17, 1;\n\tsetp.eq.s32 \t%p0, %r17, %r3;\n\t@%p0 bra \tBB0_10;\n\tbra.uni \tBB0_9;\nBB0_10: // %for_exit31.i\n // in Loop: Header=BB0_2 Depth=1\n\tadd.s32 \t%r16, %r15, 1;\n\tsetp.ne.s32 \t%p0, %r15, %r12;\n\t@%p0 bra \tBB0_2;\nBB0_11: // %stencil_step___uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_.exit\n\tret;\n}\n\n" - .size .L.ptx_str, 7954 - - .type .L.func_str,@object # @.func_str - .align 16 -.L.func_str: - .asciz "stencil_step_task" - .size .L.func_str, 18 - - .type .L.func_str1,@object # @.func_str1 - .align 16 -.L.func_str1: - .asciz "stencil_step_task" - .size .L.func_str1, 18 - - - .section ".note.GNU-stack","",@progbits diff --git a/examples_cuda/stencil/stencil_cu_nvptx64.bc b/examples_cuda/stencil/stencil_cu_nvptx64.bc deleted file mode 100644 index 2f3c05da..00000000 Binary files a/examples_cuda/stencil/stencil_cu_nvptx64.bc and /dev/null differ diff --git a/examples_cuda/stencil/stencil_cu_nvptx64.cubin b/examples_cuda/stencil/stencil_cu_nvptx64.cubin deleted file mode 100644 index a7e9a38a..00000000 Binary files a/examples_cuda/stencil/stencil_cu_nvptx64.cubin and /dev/null differ diff --git a/examples_cuda/stencil/stencil_cu_nvptx64.ll b/examples_cuda/stencil/stencil_cu_nvptx64.ll deleted file mode 100644 index d0c5e824..00000000 --- a/examples_cuda/stencil/stencil_cu_nvptx64.ll +++ /dev/null @@ -1,269 +0,0 @@ -; ModuleID = 'stencil_cu_nvptx64.bc' -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" -target triple = "nvptx64" - -; Function Attrs: nounwind readnone -declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0 - -; Function Attrs: nounwind readnone -declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #0 - -; Function Attrs: nounwind readnone -declare i32 @llvm.nvvm.read.ptx.sreg.warpsize() #0 - -; Function Attrs: nounwind -define void @stencil_step_task(i32 %x0, i32 %x1, 
i32 %y0, i32 %y1, i32 %z0, i32 %Nx, i32 %Ny, i32 %Nz, double* nocapture %coef, double* %vsq, double* %Ain, double* %Aout) #1 { -allocas: - %bid.i.i = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2 - %add_z0_load_calltmp = add i32 %bid.i.i, %z0 - %bid.i.i21 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2 - %add_z0_load15_calltmp18 = add i32 %z0, 1 - %add_add_z0_load15_calltmp18_ = add i32 %add_z0_load15_calltmp18, %bid.i.i21 - %mul_Nx_load_Ny_load.i = mul i32 %Ny, %Nx - %coef_load_offset_load.i = load double* %coef, align 8 - %coef_load16_offset.i = getelementptr double* %coef, i64 1 - %coef_load16_offset_load.i = load double* %coef_load16_offset.i, align 8 - %coef_load19_offset.i = getelementptr double* %coef, i64 2 - %coef_load19_offset_load.i = load double* %coef_load19_offset.i, align 8 - %coef_load22_offset.i = getelementptr double* %coef, i64 3 - %coef_load22_offset_load.i = load double* %coef_load22_offset.i, align 8 - %less_z_load_z1_load.i161 = icmp slt i32 %add_z0_load_calltmp, %add_add_z0_load15_calltmp18_ - br i1 %less_z_load_z1_load.i161, label %for_test28.i.preheader.lr.ph, label %stencil_step___uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_.exit - -for_test28.i.preheader.lr.ph: ; preds = %allocas - %less_y_load_y1_load.i159 = icmp slt i32 %y0, %y1 - %less_xb_load_x1_load.i157 = icmp slt i32 %x0, %x1 - %x1_load199_broadcast_init.i = insertelement <1 x i32> undef, i32 %x1, i32 0 - %mul__Nx_load119.i = shl i32 %Nx, 1 - %mul__Nx_load167.i = mul i32 %Nx, 3 - %mul__Nx_load127.i = mul i32 %Nx, -2 - %Ain_load65_ptr2int.i = ptrtoint double* %Ain to i64 - %mul__Nx_load175.i = mul i32 %Nx, -3 - %mul__Nxy_load136.i = shl i32 %mul_Nx_load_Ny_load.i, 1 - %mul__Nxy_load184.i = mul i32 %mul_Nx_load_Ny_load.i, 3 - %mul__Nxy_load144.i = mul i32 %mul_Nx_load_Ny_load.i, -2 - %mul__Nxy_load192.i = mul i32 %mul_Nx_load_Ny_load.i, -3 - %Aout_load_ptr2int.i = ptrtoint double* %Aout to i64 - %vsq_load_ptr2int.i = ptrtoint double* %vsq to i64 - %0 = add i32 %bid.i.i21, %z0 - br label %for_test28.i.preheader - -for_test28.i.preheader: ; preds = %for_exit31.i, %for_test28.i.preheader.lr.ph - %z.0.i162 = phi i32 [ %add_z0_load_calltmp, %for_test28.i.preheader.lr.ph ], [ %z_load245_plus1.i, %for_exit31.i ] - br i1 %less_y_load_y1_load.i159, label %for_test35.i.preheader.lr.ph, label %for_exit31.i - -for_test35.i.preheader.lr.ph: ; preds = %for_test28.i.preheader - %mul_z_load45_Nxy_load.i = mul i32 %z.0.i162, %mul_Nx_load_Ny_load.i - br i1 %less_xb_load_x1_load.i157, label %for_loop37.i.lr.ph.us, label %for_exit31.i - -for_exit38.i.us: ; preds = %safe_if_after_true.i.us - %y_load244_plus1.i.us = add i32 %y.0.i160.us, 1 - %exitcond = icmp eq i32 %y_load244_plus1.i.us, %y1 - br i1 %exitcond, label %for_exit31.i, label %for_loop37.i.lr.ph.us - -for_loop37.i.us: ; preds = %for_loop37.i.lr.ph.us, %safe_if_after_true.i.us - %xb.0.i158.us = phi i32 [ %x0, %for_loop37.i.lr.ph.us ], [ %add_xb_load243_calltmp241.i.us, %safe_if_after_true.i.us ] - %tid.i.i.i.us = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2 - %tid.i.i.i.i.us = tail call i32 @llvm.nvvm.read.ptx.sreg.warpsize() #2 - %sub_calltmp3_.i.i.us = add i32 %tid.i.i.i.i.us, -1 - %bitop.i.i.us = and i32 %sub_calltmp3_.i.i.us, %tid.i.i.i.us - %add_xb_load42_calltmp.i.us = add i32 %bitop.i.i.us, %xb.0.i158.us - %add_xb_load42_calltmp_broadcast_init.i.us = insertelement <1 x i32> undef, i32 %add_xb_load42_calltmp.i.us, i32 0 - %less_x_load198_x1_load199_broadcast.i.us = icmp slt <1 x i32> 
%add_xb_load42_calltmp_broadcast_init.i.us, %x1_load199_broadcast_init.i - %v.i.i.us = extractelement <1 x i1> %less_x_load198_x1_load199_broadcast.i.us, i32 0 - br i1 %v.i.i.us, label %pl_dolane.i.us, label %safe_if_after_true.i.us - -pl_dolane.i.us: ; preds = %for_loop37.i.us - %.lhs.lhs.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.i.us, %add_xb_load42_calltmp.i.us - %.lhs.us = shl i32 %.lhs.lhs.us, 3 - %1 = add i32 %.lhs.us, -8 - %iptr__id.i.rhs.us = sext i32 %1 to i64 - %iptr__id.i.us = add i64 %iptr__id.i.rhs.us, %Ain_load65_ptr2int.i - %ptr__id.i.us = inttoptr i64 %iptr__id.i.us to double* - %val__id.i.us = load double* %ptr__id.i.us, align 8 - %2 = add i32 %.lhs.us, 8 - %iptr__id.i130.rhs.us = sext i32 %2 to i64 - %iptr__id.i130.us = add i64 %iptr__id.i130.rhs.us, %Ain_load65_ptr2int.i - %ptr__id.i131.us = inttoptr i64 %iptr__id.i130.us to double* - %val__id.i132.us = load double* %ptr__id.i131.us, align 8 - %3 = add i32 %.lhs.us, -16 - %iptr__id.i125.rhs.us = sext i32 %3 to i64 - %iptr__id.i125.us = add i64 %iptr__id.i125.rhs.us, %Ain_load65_ptr2int.i - %ptr__id.i126.us = inttoptr i64 %iptr__id.i125.us to double* - %val__id.i127.us = load double* %ptr__id.i126.us, align 8 - %4 = add i32 %.lhs.us, 16 - %iptr__id.i120.rhs.us = sext i32 %4 to i64 - %iptr__id.i120.us = add i64 %iptr__id.i120.rhs.us, %Ain_load65_ptr2int.i - %ptr__id.i121.us = inttoptr i64 %iptr__id.i120.us to double* - %val__id.i122.us = load double* %ptr__id.i121.us, align 8 - %.lhs138.us = add i32 %.lhs138.lhs.us, %add_xb_load42_calltmp.i.us - %5 = shl i32 %.lhs138.us, 3 - %iptr__id.i115.rhs.us = sext i32 %5 to i64 - %iptr__id.i115.us = add i64 %iptr__id.i115.rhs.us, %Ain_load65_ptr2int.i - %ptr__id.i116.us = inttoptr i64 %iptr__id.i115.us to double* - %val__id.i117.us = load double* %ptr__id.i116.us, align 8 - %6 = add i32 %.lhs.us, -24 - %iptr__id.i110.rhs.us = sext i32 %6 to i64 - %iptr__id.i110.us = add i64 %iptr__id.i110.rhs.us, %Ain_load65_ptr2int.i - %ptr__id.i111.us = inttoptr i64 %iptr__id.i110.us to double* - %val__id.i112.us = load double* %ptr__id.i111.us, align 8 - %7 = add i32 %.lhs.us, 24 - %iptr__id.i105.rhs.us = sext i32 %7 to i64 - %iptr__id.i105.us = add i64 %iptr__id.i105.rhs.us, %Ain_load65_ptr2int.i - %ptr__id.i106.us = inttoptr i64 %iptr__id.i105.us to double* - %val__id.i107.us = load double* %ptr__id.i106.us, align 8 - %.lhs141.us = add i32 %.lhs141.lhs.us, %add_xb_load42_calltmp.i.us - %8 = shl i32 %.lhs141.us, 3 - %iptr__id.i100.rhs.us = sext i32 %8 to i64 - %iptr__id.i100.us = add i64 %iptr__id.i100.rhs.us, %Ain_load65_ptr2int.i - %ptr__id.i101.us = inttoptr i64 %iptr__id.i100.us to double* - %val__id.i102.us = load double* %ptr__id.i101.us, align 8 - %.lhs142.us = add i32 %.lhs142.lhs.us, %add_xb_load42_calltmp.i.us - %9 = shl i32 %.lhs142.us, 3 - %iptr__id.i95.rhs.us = sext i32 %9 to i64 - %iptr__id.i95.us = add i64 %iptr__id.i95.rhs.us, %Ain_load65_ptr2int.i - %ptr__id.i96.us = inttoptr i64 %iptr__id.i95.us to double* - %val__id.i97.us = load double* %ptr__id.i96.us, align 8 - %.lhs143.us = add i32 %.lhs143.lhs.us, %add_xb_load42_calltmp.i.us - %10 = shl i32 %.lhs143.us, 3 - %iptr__id.i90.rhs.us = sext i32 %10 to i64 - %iptr__id.i90.us = add i64 %iptr__id.i90.rhs.us, %Ain_load65_ptr2int.i - %ptr__id.i91.us = inttoptr i64 %iptr__id.i90.us to double* - %val__id.i92.us = load double* %ptr__id.i91.us, align 8 - %.lhs144.us = add i32 %.lhs144.lhs.us, %add_xb_load42_calltmp.i.us - %11 = shl i32 %.lhs144.us, 3 - %iptr__id.i85.rhs.us = sext i32 %11 to i64 - %iptr__id.i85.us = 
add i64 %iptr__id.i85.rhs.us, %Ain_load65_ptr2int.i - %ptr__id.i86.us = inttoptr i64 %iptr__id.i85.us to double* - %val__id.i87.us = load double* %ptr__id.i86.us, align 8 - %.lhs145.us = add i32 %.lhs145.lhs.us, %add_xb_load42_calltmp.i.us - %12 = shl i32 %.lhs145.us, 3 - %iptr__id.i80.rhs.us = sext i32 %12 to i64 - %iptr__id.i80.us = add i64 %iptr__id.i80.rhs.us, %Ain_load65_ptr2int.i - %ptr__id.i81.us = inttoptr i64 %iptr__id.i80.us to double* - %val__id.i82.us = load double* %ptr__id.i81.us, align 8 - %.lhs146.us = add i32 %.lhs146.lhs.us, %add_xb_load42_calltmp.i.us - %13 = shl i32 %.lhs146.us, 3 - %iptr__id.i75.rhs.us = sext i32 %13 to i64 - %iptr__id.i75.us = add i64 %iptr__id.i75.rhs.us, %Ain_load65_ptr2int.i - %ptr__id.i76.us = inttoptr i64 %iptr__id.i75.us to double* - %val__id.i77.us = load double* %ptr__id.i76.us, align 8 - %.lhs147.us = add i32 %.lhs147.lhs.us, %add_xb_load42_calltmp.i.us - %14 = shl i32 %.lhs147.us, 3 - %iptr__id.i70.rhs.us = sext i32 %14 to i64 - %iptr__id.i70.us = add i64 %iptr__id.i70.rhs.us, %Ain_load65_ptr2int.i - %ptr__id.i71.us = inttoptr i64 %iptr__id.i70.us to double* - %val__id.i72.us = load double* %ptr__id.i71.us, align 8 - %.lhs148.us = add i32 %.lhs148.lhs.us, %add_xb_load42_calltmp.i.us - %15 = shl i32 %.lhs148.us, 3 - %iptr__id.i65.rhs.us = sext i32 %15 to i64 - %iptr__id.i65.us = add i64 %iptr__id.i65.rhs.us, %Ain_load65_ptr2int.i - %ptr__id.i66.us = inttoptr i64 %iptr__id.i65.us to double* - %val__id.i67.us = load double* %ptr__id.i66.us, align 8 - %.lhs149.us = add i32 %.lhs149.lhs.us, %add_xb_load42_calltmp.i.us - %16 = shl i32 %.lhs149.us, 3 - %iptr__id.i60.rhs.us = sext i32 %16 to i64 - %iptr__id.i60.us = add i64 %iptr__id.i60.rhs.us, %Ain_load65_ptr2int.i - %ptr__id.i61.us = inttoptr i64 %iptr__id.i60.us to double* - %val__id.i62.us = load double* %ptr__id.i61.us, align 8 - %.lhs150.us = add i32 %.lhs150.lhs.us, %add_xb_load42_calltmp.i.us - %17 = shl i32 %.lhs150.us, 3 - %iptr__id.i55.rhs.us = sext i32 %17 to i64 - %iptr__id.i55.us = add i64 %iptr__id.i55.rhs.us, %Ain_load65_ptr2int.i - %ptr__id.i56.us = inttoptr i64 %iptr__id.i55.us to double* - %val__id.i57.us = load double* %ptr__id.i56.us, align 8 - %.lhs151.us = add i32 %add_xb_load42_calltmp.i.us, %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.i.us - %18 = shl i32 %.lhs151.us, 3 - %iptr__id.i50.rhs.us = sext i32 %18 to i64 - %iptr__id.i50.us = add i64 %iptr__id.i50.rhs.us, %Ain_load65_ptr2int.i - %ptr__id.i51.us = inttoptr i64 %iptr__id.i50.us to double* - %val__id.i52.us = load double* %ptr__id.i51.us, align 8 - %.lhs152.us = add i32 %.lhs152.lhs.us, %add_xb_load42_calltmp.i.us - %19 = shl i32 %.lhs152.us, 3 - %iptr__id.i45.rhs.us = sext i32 %19 to i64 - %iptr__id.i45.us = add i64 %iptr__id.i45.rhs.us, %Ain_load65_ptr2int.i - %ptr__id.i46.us = inttoptr i64 %iptr__id.i45.us to double* - %val__id.i47.us = load double* %ptr__id.i46.us, align 8 - %val__id.i41.us = load double* %ptr__id.i51.us, align 8 - %iptr__id.i32.us = add i64 %iptr__id.i50.rhs.us, %Aout_load_ptr2int.i - %ptr__id.i33.us = inttoptr i64 %iptr__id.i32.us to double* - %val__id.i34.us = load double* %ptr__id.i33.us, align 8 - %iptr__id.i27.rhs.us = sext i32 %.lhs.us to i64 - %iptr__id.i27.us = add i64 %iptr__id.i27.rhs.us, %vsq_load_ptr2int.i - %ptr__id.i28.us = inttoptr i64 %iptr__id.i27.us to double* - %val__id.i29.us = load double* %ptr__id.i28.us, align 8 - %iptr__id.i23.us = add i64 %iptr__id.i50.rhs.us, %Aout_load_ptr2int.i - %ptr__id.i24.us = inttoptr i64 %iptr__id.i23.us to double* - %val__id.i25.lhs.us.lhs 
= fmul double %val__id.i41.us, 2.000000e+00 - %val__id.i25.lhs.us = fsub double %val__id.i25.lhs.us.lhs, %val__id.i34.us - %val__id.i25.rhs.rhs.lhs.lhs.rhs.lhs.lhs.lhs.lhs.us = fadd double %val__id.i127.us, %val__id.i122.us - %val__id.i25.rhs.rhs.lhs.lhs.rhs.lhs.lhs.lhs.us = fadd double %val__id.i25.rhs.rhs.lhs.lhs.rhs.lhs.lhs.lhs.lhs.us, %val__id.i102.us - %val__id.i25.rhs.rhs.lhs.lhs.rhs.lhs.lhs.us = fadd double %val__id.i25.rhs.rhs.lhs.lhs.rhs.lhs.lhs.lhs.us, %val__id.i87.us - %val__id.i25.rhs.rhs.lhs.lhs.rhs.lhs.us = fadd double %val__id.i25.rhs.rhs.lhs.lhs.rhs.lhs.lhs.us, %val__id.i72.us - %val__id.i25.rhs.rhs.lhs.lhs.rhs.us = fadd double %val__id.i25.rhs.rhs.lhs.lhs.rhs.lhs.us, %val__id.i57.us - %val__id.i25.rhs.rhs.lhs.lhs.us = fmul double %coef_load19_offset_load.i, %val__id.i25.rhs.rhs.lhs.lhs.rhs.us - %val__id.i25.rhs.rhs.lhs.rhs.lhs.rhs.lhs.lhs.lhs.lhs.us = fadd double %val__id.i.us, %val__id.i132.us - %val__id.i25.rhs.rhs.lhs.rhs.lhs.rhs.lhs.lhs.lhs.us = fadd double %val__id.i25.rhs.rhs.lhs.rhs.lhs.rhs.lhs.lhs.lhs.lhs.us, %val__id.i117.us - %val__id.i25.rhs.rhs.lhs.rhs.lhs.rhs.lhs.lhs.us = fadd double %val__id.i25.rhs.rhs.lhs.rhs.lhs.rhs.lhs.lhs.lhs.us, %val__id.i97.us - %val__id.i25.rhs.rhs.lhs.rhs.lhs.rhs.lhs.us = fadd double %val__id.i25.rhs.rhs.lhs.rhs.lhs.rhs.lhs.lhs.us, %val__id.i82.us - %val__id.i25.rhs.rhs.lhs.rhs.lhs.rhs.us = fadd double %val__id.i25.rhs.rhs.lhs.rhs.lhs.rhs.lhs.us, %val__id.i67.us - %val__id.i25.rhs.rhs.lhs.rhs.lhs.us = fmul double %coef_load16_offset_load.i, %val__id.i25.rhs.rhs.lhs.rhs.lhs.rhs.us - %val__id.i25.rhs.rhs.lhs.rhs.rhs.us = fmul double %coef_load_offset_load.i, %val__id.i52.us - %val__id.i25.rhs.rhs.lhs.rhs.us = fadd double %val__id.i25.rhs.rhs.lhs.rhs.lhs.us, %val__id.i25.rhs.rhs.lhs.rhs.rhs.us - %val__id.i25.rhs.rhs.lhs.us = fadd double %val__id.i25.rhs.rhs.lhs.lhs.us, %val__id.i25.rhs.rhs.lhs.rhs.us - %val__id.i25.rhs.rhs.rhs.rhs.lhs.lhs.lhs.lhs.us = fadd double %val__id.i112.us, %val__id.i107.us - %val__id.i25.rhs.rhs.rhs.rhs.lhs.lhs.lhs.us = fadd double %val__id.i25.rhs.rhs.rhs.rhs.lhs.lhs.lhs.lhs.us, %val__id.i92.us - %val__id.i25.rhs.rhs.rhs.rhs.lhs.lhs.us = fadd double %val__id.i25.rhs.rhs.rhs.rhs.lhs.lhs.lhs.us, %val__id.i77.us - %val__id.i25.rhs.rhs.rhs.rhs.lhs.us = fadd double %val__id.i25.rhs.rhs.rhs.rhs.lhs.lhs.us, %val__id.i62.us - %val__id.i25.rhs.rhs.rhs.rhs.us = fadd double %val__id.i25.rhs.rhs.rhs.rhs.lhs.us, %val__id.i47.us - %val__id.i25.rhs.rhs.rhs.us = fmul double %coef_load22_offset_load.i, %val__id.i25.rhs.rhs.rhs.rhs.us - %val__id.i25.rhs.rhs.us = fadd double %val__id.i25.rhs.rhs.lhs.us, %val__id.i25.rhs.rhs.rhs.us - %val__id.i25.rhs.us = fmul double %val__id.i25.rhs.rhs.us, %val__id.i29.us - %val__id.i25.us = fadd double %val__id.i25.lhs.us, %val__id.i25.rhs.us - store double %val__id.i25.us, double* %ptr__id.i24.us, align 8 - br label %safe_if_after_true.i.us - -safe_if_after_true.i.us: ; preds = %pl_dolane.i.us, %for_loop37.i.us - %tid.i.i1.i.us = tail call i32 @llvm.nvvm.read.ptx.sreg.warpsize() #2 - %add_xb_load243_calltmp241.i.us = add i32 %tid.i.i1.i.us, %xb.0.i158.us - %less_xb_load_x1_load.i.us = icmp slt i32 %add_xb_load243_calltmp241.i.us, %x1 - br i1 %less_xb_load_x1_load.i.us, label %for_loop37.i.us, label %for_exit38.i.us - -for_loop37.i.lr.ph.us: ; preds = %for_exit38.i.us, %for_test35.i.preheader.lr.ph - %y.0.i160.us = phi i32 [ %y_load244_plus1.i.us, %for_exit38.i.us ], [ %y0, %for_test35.i.preheader.lr.ph ] - %mul_y_load46_Nx_load47.i.us = mul i32 %y.0.i160.us, %Nx - 
%add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.i.us = add i32 %mul_y_load46_Nx_load47.i.us, %mul_z_load45_Nxy_load.i - %.lhs138.lhs.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.i.us, %Nx - %.lhs141.lhs.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.i.us, %mul__Nx_load119.i - %.lhs142.lhs.us = sub i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.i.us, %Nx - %.lhs143.lhs.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.i.us, %mul__Nx_load167.i - %.lhs144.lhs.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.i.us, %mul__Nx_load127.i - %.lhs145.lhs.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.i.us, %mul_Nx_load_Ny_load.i - %.lhs146.lhs.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.i.us, %mul__Nx_load175.i - %.lhs147.lhs.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.i.us, %mul__Nxy_load136.i - %.lhs148.lhs.us = sub i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.i.us, %mul_Nx_load_Ny_load.i - %.lhs149.lhs.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.i.us, %mul__Nxy_load184.i - %.lhs150.lhs.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.i.us, %mul__Nxy_load144.i - %.lhs152.lhs.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.i.us, %mul__Nxy_load192.i - br label %for_loop37.i.us - -for_exit31.i: ; preds = %for_exit38.i.us, %for_test35.i.preheader.lr.ph, %for_test28.i.preheader - %z_load245_plus1.i = add i32 %z.0.i162, 1 - %exitcond163 = icmp eq i32 %z.0.i162, %0 - br i1 %exitcond163, label %stencil_step___uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_.exit, label %for_test28.i.preheader - -stencil_step___uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_.exit: ; preds = %for_exit31.i, %allocas - ret void -} - -attributes #0 = { nounwind readnone } -attributes #1 = { nounwind "target-features"="+sm_35" } -attributes #2 = { nounwind } - -!nvvm.annotations = !{!0} - -!0 = metadata !{void (i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*)* @stencil_step_task, metadata !"kernel", i32 1} -!1 = metadata !{ } -!2 = metadata !{ metadata !"output", metadata !0 } -!3 = metadata !{ metadata !"input1", metadata !0 } -!4 = metadata !{ metadata !"input2", metadata !0 } diff --git a/examples_cuda/stencil/stencil_ispc.h b/examples_cuda/stencil/stencil_ispc.h index ebf29582..10b0d713 100644 --- a/examples_cuda/stencil/stencil_ispc.h +++ b/examples_cuda/stencil/stencil_ispc.h @@ -21,7 +21,6 @@ namespace ispc { /* namespace */ #if defined(__cplusplus) && !defined(__ISPC_NO_EXTERN_C) extern "C" { #endif // __cplusplus - extern void loop_stencil_ispc(int32_t t0, int32_t t1, int32_t x0, int32_t x1, int32_t y0, int32_t y1, int32_t z0, int32_t z1, int32_t Nx, int32_t Ny, int32_t Nz, const double * coef, const double * vsq, double * Aeven, double * Aodd); extern void loop_stencil_ispc_tasks(int32_t t0, int32_t t1, int32_t x0, int32_t x1, int32_t y0, int32_t y1, int32_t z0, int32_t z1, int32_t Nx, int32_t Ny, int32_t Nz, const double * coef, const double * vsq, double * Aeven, double * Aodd); #if defined(__cplusplus) && !defined(__ISPC_NO_EXTERN_C) } /* end extern C */ diff --git a/examples_cuda/stencil/stencil_ispc_nvptx64.ll b/examples_cuda/stencil/stencil_ispc_nvptx64.ll new file mode 100644 index 00000000..51c0d95a --- /dev/null +++ b/examples_cuda/stencil/stencil_ispc_nvptx64.ll @@ -0,0 +1,974 @@ +; ModuleID = 'stencil_ispc_nvptx64.bc' +target datalayout 
= "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" +target triple = "nvptx64" + +module asm "" +module asm ".extern .func (.param .b32 func_retval0) cudaLaunchDevice" +module asm "(" +module asm " .param .b64 cudaLaunchDevice_param_0," +module asm " .param .b64 cudaLaunchDevice_param_1," +module asm " .param .align 4 .b8 cudaLaunchDevice_param_2[12]," +module asm " .param .align 4 .b8 cudaLaunchDevice_param_3[12]," +module asm " .param .b32 cudaLaunchDevice_param_4," +module asm " .param .b64 cudaLaunchDevice_param_5" +module asm ");" + +@constDeltaForeach1 = private unnamed_addr constant [32 x i8] zeroinitializer +@constDeltaForeach4 = private unnamed_addr constant [32 x i8] c"\00\01\02\03\04\05\06\07\08\09\0A\0B\0C\0D\0E\0F\10\11\12\13\14\15\16\17\18\19\1A\1B\1C\1D\1E\1F" + +declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() nounwind readnone + +declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() nounwind readnone + +declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() nounwind readnone + +declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.z() nounwind readnone + +declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.x() nounwind readnone + +declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.y() nounwind readnone + +declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.z() nounwind readnone + +define i32 @__shfl_i32(i32, i32) { + %shfl = tail call i32 asm sideeffect "shfl.idx.b32 $0, $1, $2, 0x1f;", "=r,r,r"(i32 %0, i32 %1) + ret i32 %shfl +} + +define float @__shfl_xor_float(float, i32) { + %shfl = tail call float asm sideeffect "shfl.bfly.b32 $0, $1, $2, 0x1f;", "=f,f,r"(float %0, i32 %1) + ret float %shfl +} + +define i32 @__shfl_xor_i32(i32, i32) { + %shfl = tail call i32 asm sideeffect "shfl.bfly.b32 $0, $1, $2, 0x1f;", "=r,r,r"(i32 %0, i32 %1) + ret i32 %shfl +} + +define float @__fminf(float, float) { + %min = tail call float asm sideeffect "min.f32 $0, $1, $2;", "=f,f,f"(float %0, float %1) + ret float %min +} + +define float @__fmaxf(float, float) { + %max = tail call float asm sideeffect "max.f32 $0, $1, $2;", "=f,f,f"(float %0, float %1) + ret float %max +} + +define i32 @__ballot(i1) { + %conv = zext i1 %0 to i32 + %res = tail call i32 asm sideeffect "{ .reg .pred %p1; \0A setp.ne.u32 %p1, $1, 0; \0A vote.ballot.b32 $0, %p1; \0A }", "=r,r"(i32 %conv) + ret i32 %res +} + +define i32 @__lanemask_lt() { + %mask = tail call i32 asm sideeffect "mov.u32 $0, %lanemask_lt;", "=r"() + ret i32 %mask +} + +define i8* @ISPCAlloc(i8**, i64, i32) { + ret i8* inttoptr (i64 1 to i8*) +} + +declare i64 @cudaGetParameterBuffer(i64, i64) + +define i8* @ISPCGetParamBuffer(i8**, i64 %align, i64 %size) { +entry: + %tid.i = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + %and = and i32 %tid.i, 31 + %cmp = icmp eq i32 %and, 0 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %ptri64tmp = tail call i64 @cudaGetParameterBuffer(i64 %align, i64 %size) + %phitmp = inttoptr i64 %ptri64tmp to i8* + br label %if.end + +if.end: ; preds = %if.then, %entry + %ptri64 = phi i8* [ %phitmp, %if.then ], [ null, %entry ] + ret i8* %ptri64 +} + +define void @ISPCLaunch(i8**, i8* %func_ptr, i8* %func_args, i32 %ntx, i32 %nty, i32 %ntz) { +entry: + %tid.i = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + %and = and i32 %tid.i, 31 + %cmp = icmp eq i32 %and, 0 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %ntxm1 = add nsw i32 %ntx, -1 + %ntxm1d4 = ashr i32 %ntxm1, 2 + %nbx = add nsw i32 %ntxm1d4, 1 + %args_i64 = ptrtoint i8* %func_args 
to i64 + %func_i64 = ptrtoint i8* %func_ptr to i64 + %res_tmp = tail call i32 asm sideeffect "{\0A .param .b64 param0;\0A st.param.b64\09[param0+0], $1;\0A .param .b64 param1;\0A st.param.b64\09[param1+0], $2;\0A .param .align 4 .b8 param2[12];\0A st.param.b32\09[param2+0], $3; \0A st.param.b32\09[param2+4], $4; \0A st.param.b32\09[param2+8], $5; \0A .param .align 4 .b8 param3[12];\0A st.param.b32\09[param3+0], $6; \0A st.param.b32\09[param3+4], $7; \0A st.param.b32\09[param3+8], $8; \0A .param .b32 param4;\0A st.param.b32\09[param4+0], $9; \0A .param .b64 param5;\0A st.param.b64\09[param5+0], $10; \0A\0A .param .b32 retval0;\0A call.uni (retval0), \0A cudaLaunchDevice,\0A (\0A param0, \0A param1, \0A param2, \0A param3, \0A param4, \0A param5\0A );\0A ld.param.b32\09$0, [retval0+0];\0A }\0A ", "=r, l,l, r,r,r, r,r,r, r,l"(i64 %func_i64, i64 %args_i64, i32 %nbx, i32 %nty, i32 %ntz, i32 128, i32 1, i32 1, i32 0, i64 0) + br label %if.end + +if.end: ; preds = %if.then, %entry + ret void +} + +declare i32 @cudaDeviceSynchronize() + +define void @ISPCSync(i8*) { + %2 = tail call i32 @cudaDeviceSynchronize() + ret void +} + +define i64 @__warpBinExclusiveScan(i1 %p) { +entry: + %conv.i = zext i1 %p to i32 + %res.i = tail call i32 asm sideeffect "{ .reg .pred %p1; \0A setp.ne.u32 %p1, $1, 0; \0A vote.ballot.b32 $0, %p1; \0A }", "=r,r"(i32 %conv.i) + %res.i1 = tail call i32 asm sideeffect "popc.b32 $0, $1;", "=r,r"(i32 %res.i) + %mask.i = tail call i32 asm sideeffect "mov.u32 $0, %lanemask_lt;", "=r"() + %and = and i32 %mask.i, %res.i + %res.i2 = tail call i32 asm sideeffect "popc.b32 $0, $1;", "=r,r"(i32 %and) + %retval.sroa.1.4.insert.ext.i = zext i32 %res.i2 to i64 + %retval.sroa.1.4.insert.shift.i = shl nuw i64 %retval.sroa.1.4.insert.ext.i, 32 + %retval.sroa.0.0.insert.ext.i = zext i32 %res.i1 to i64 + %retval.sroa.0.0.insert.insert.i = or i64 %retval.sroa.1.4.insert.shift.i, %retval.sroa.0.0.insert.ext.i + ret i64 %retval.sroa.0.0.insert.insert.i +} + +define internal void @stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_(i32 %x0, i32 %x1, i32 %y0, i32 %y1, i32 %z0, i32 %z1, i32 %Nx, i32 %Ny, i32 %Nz, double* %coef, double* %vsq, double* %Ain, double* %Aout) { +allocas: + %bid.i.i = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() + %mul_calltmp_.i = shl i32 %bid.i.i, 2 + %tid.i.i = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + %bitop.i = ashr i32 %tid.i.i, 5 + %add_mul_calltmp__bitop.i = add i32 %bitop.i, %mul_calltmp_.i + %nb.i.i = tail call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x() + %mul_calltmp_.i57 = shl i32 %nb.i.i, 2 + %greaterequal_calltmp_calltmp18 = icmp sge i32 %add_mul_calltmp__bitop.i, %mul_calltmp_.i57 + %bid.i.i58 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() + %nb.i.i59 = tail call i32 @llvm.nvvm.read.ptx.sreg.nctaid.y() + %greaterequal_calltmp21_calltmp24 = icmp sge i32 %bid.i.i58, %nb.i.i59 + %logical_or = or i1 %greaterequal_calltmp_calltmp18, %greaterequal_calltmp21_calltmp24 + %bid.i.i60 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z() + %nb.i.i61 = tail call i32 @llvm.nvvm.read.ptx.sreg.nctaid.z() + %greaterequal_calltmp27_calltmp30 = icmp sge i32 %bid.i.i60, %nb.i.i61 + %logical_or31 = or i1 %logical_or, %greaterequal_calltmp27_calltmp30 + br i1 %logical_or31, label %if_then, label %if_exit + +if_then: ; preds = %foreach_reset19.i, %if_exit, %allocas + ret void + +if_exit: ; preds = %allocas + %bid.i.i62 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() + %mul_calltmp_.i63 = shl i32 %bid.i.i62, 7 + 
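+ ; Note: the shift left by 7 scales %ctaid.x by 128, the blockDim.x used by
+ ; the cudaLaunchDevice calls below (4 warps of 32 lanes). Adding %tid.x and
+ ; masking with -32 yields 32 * (global warp index), so each warp owns a
+ ; 32-wide x-tile, while ctaid.y and ctaid.z (shifted left by 3, i.e. * 8)
+ ; select 8-wide y- and z-tiles.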
%tid.i.i64 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + %bitop.i657375 = add i32 %tid.i.i64, %mul_calltmp_.i63 + %mul_calltmp35_ = and i32 %bitop.i657375, -32 + %add_x0_load_mul_calltmp35_ = add i32 %mul_calltmp35_, %x0 + %add_xfirst_load_ = add i32 %add_x0_load_mul_calltmp35_, 32 + %c.i.i = icmp sgt i32 %add_xfirst_load_, %x1 + %r.i.i = select i1 %c.i.i, i32 %x1, i32 %add_xfirst_load_ + %bid.i.i67 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() + %mul_calltmp41_ = shl i32 %bid.i.i67, 3 + %add_y0_load_mul_calltmp41_ = add i32 %mul_calltmp41_, %y0 + %add_yfirst_load_ = add i32 %add_y0_load_mul_calltmp41_, 8 + %bid.i.i70 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z() + %mul_calltmp47_ = shl i32 %bid.i.i70, 3 + %add_z0_load_mul_calltmp47_ = add i32 %mul_calltmp47_, %z0 + %add_zfirst_load_ = add i32 %add_z0_load_mul_calltmp47_, 8 + %c.i.i71 = icmp sgt i32 %add_zfirst_load_, %z1 + %r.i.i72 = select i1 %c.i.i71, i32 %z1, i32 %add_zfirst_load_ + %mul_Nx_load_Ny_load.i = mul i32 %Ny, %Nx + %nitems29.i = sub i32 %r.i.i, %add_x0_load_mul_calltmp35_ + %nextras30.i = srem i32 %nitems29.i, 32 + %aligned_end31.i = sub i32 %r.i.i, %nextras30.i + %tid.i4.i = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + %__laneidx.i = and i32 %tid.i4.i, 31 + %0 = zext i32 %__laneidx.i to i64 + %arrayidx.i = getelementptr [32 x i8]* @constDeltaForeach1, i64 0, i64 %0 + %cmp38.i396 = icmp slt i32 %add_z0_load_mul_calltmp47_, %r.i.i72 + br i1 %cmp38.i396, label %foreach_test21.i.preheader.lr.ph, label %if_then + +foreach_test21.i.preheader.lr.ph: ; preds = %if_exit + %c.i.i68 = icmp sgt i32 %add_yfirst_load_, %y1 + %r.i.i69 = select i1 %c.i.i68, i32 %y1, i32 %add_yfirst_load_ + %1 = load i8* %arrayidx.i, align 1 + %_zext.i394 = zext i8 %1 to i32 + %2 = insertelement <1 x i32> undef, i32 %_zext.i394, i32 0 + %smear_counter_init.i393 = insertelement <1 x i32> undef, i32 %add_z0_load_mul_calltmp47_, i32 0 + %iter_val.i395 = add <1 x i32> %smear_counter_init.i393, %2 + %smear_counter_init44.i387 = insertelement <1 x i32> undef, i32 %add_y0_load_mul_calltmp41_, i32 0 + %cmp54.i390 = icmp slt i32 %add_y0_load_mul_calltmp41_, %r.i.i69 + %before_aligned_end73.i385 = icmp slt i32 %add_x0_load_mul_calltmp35_, %aligned_end31.i + %smear_end_init289.i = insertelement <1 x i32> undef, i32 %r.i.i, i32 0 + %Nxy_load298_broadcast_init.i = insertelement <1 x i32> undef, i32 %mul_Nx_load_Ny_load.i, i32 0 + %Nx_load300_broadcast_init.i = insertelement <1 x i32> undef, i32 %Nx, i32 0 + %Ain_load309_ptr2int.i = ptrtoint double* %Ain to i64 + %coef_load314_offset.i = getelementptr double* %coef, i64 1 + %coef_load365_offset.i = getelementptr double* %coef, i64 2 + %mul__Nx_load385.i = shl i32 %Nx, 1 + %mul__Nx_load393.i = mul i32 %Nx, -2 + %mul__Nxy_load402.i = shl i32 %mul_Nx_load_Ny_load.i, 1 + %mul__Nxy_load410.i = mul i32 %mul_Nx_load_Ny_load.i, -2 + %coef_load416_offset.i = getelementptr double* %coef, i64 3 + %mul__Nx_load436.i = mul i32 %Nx, 3 + %mul__Nx_load444.i = mul i32 %Nx, -3 + %mul__Nxy_load453.i = mul i32 %mul_Nx_load_Ny_load.i, 3 + %mul__Nxy_load461.i = mul i32 %mul_Nx_load_Ny_load.i, -3 + %Aout_load470_ptr2int.i = ptrtoint double* %Aout to i64 + %vsq_load488_ptr2int.i = ptrtoint double* %vsq to i64 + %3 = sub i32 -9, %y0 + %4 = shl i32 %bid.i.i67, 3 + %5 = sub i32 %3, %4 + %6 = xor i32 %y1, -1 + %7 = icmp sgt i32 %5, %6 + %smax = select i1 %7, i32 %5, i32 %6 + %8 = xor i32 %smax, -1 + %9 = sub i32 -9, %z0 + %10 = shl i32 %bid.i.i70, 3 + %11 = sub i32 %9, %10 + %12 = xor i32 %z1, -1 + %13 = icmp sgt i32 %11, %12 + 
%smax399 = select i1 %13, i32 %11, i32 %12 + %14 = xor i32 %smax399, -1 + br label %foreach_test21.i.preheader + +foreach_full_body.i: ; preds = %outer_not_in_extras.i.preheader, %foreach_full_body.i + %counter32.4.i386 = phi i32 [ %new_counter279.i, %foreach_full_body.i ], [ %add_x0_load_mul_calltmp35_, %outer_not_in_extras.i.preheader ] + %tid.i.i56 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + %__laneidx80.i = and i32 %tid.i.i56, 31 + %15 = zext i32 %__laneidx80.i to i64 + %arrayidx81.i = getelementptr [32 x i8]* @constDeltaForeach4, i64 0, i64 %15 + %16 = load i8* %arrayidx81.i, align 1 + %_zext82.i = zext i8 %16 to i32 + %coef_load_offset_load.i = load double* %coef, align 8 + %.lhs362.lhs.lhs = extractelement <1 x i32> %mul_z_load297_Nxy_load298_broadcast.i, i32 0 + %.lhs362.lhs.rhs.lhs = extractelement <1 x i32> %iter_val50.i392, i32 0 + %.lhs362.lhs.rhs = mul i32 %.lhs362.lhs.rhs.lhs, %Nx + %.lhs362.lhs = add i32 %.lhs362.lhs.lhs, %.lhs362.lhs.rhs + %.lhs362.rhs = add i32 %counter32.4.i386, %_zext82.i + %.lhs362 = add i32 %.lhs362.lhs, %.lhs362.rhs + %17 = shl i32 %.lhs362, 3 + %iptr__id.i.rhs = sext i32 %17 to i64 + %iptr__id.i = add i64 %iptr__id.i.rhs, %Ain_load309_ptr2int.i + %ptr__id.i = inttoptr i64 %iptr__id.i to double* + %val__id.i = load double* %ptr__id.i, align 8 + %coef_load94_offset_load.i = load double* %coef_load314_offset.i, align 8 + %18 = add i32 %17, 8 + %iptr__id.i335.rhs = sext i32 %18 to i64 + %iptr__id.i335 = add i64 %iptr__id.i335.rhs, %Ain_load309_ptr2int.i + %ptr__id.i336 = inttoptr i64 %iptr__id.i335 to double* + %val__id.i337 = load double* %ptr__id.i336, align 8 + %19 = add i32 %17, -8 + %iptr__id.i330.rhs = sext i32 %19 to i64 + %iptr__id.i330 = add i64 %iptr__id.i330.rhs, %Ain_load309_ptr2int.i + %ptr__id.i331 = inttoptr i64 %iptr__id.i330 to double* + %val__id.i332 = load double* %ptr__id.i331, align 8 + %.lhs365 = add i32 %.lhs362, %Nx + %20 = shl i32 %.lhs365, 3 + %iptr__id.i325.rhs = sext i32 %20 to i64 + %iptr__id.i325 = add i64 %iptr__id.i325.rhs, %Ain_load309_ptr2int.i + %ptr__id.i326 = inttoptr i64 %iptr__id.i325 to double* + %val__id.i327 = load double* %ptr__id.i326, align 8 + %.lhs366 = sub i32 %.lhs362, %Nx + %21 = shl i32 %.lhs366, 3 + %iptr__id.i320.rhs = sext i32 %21 to i64 + %iptr__id.i320 = add i64 %iptr__id.i320.rhs, %Ain_load309_ptr2int.i + %ptr__id.i321 = inttoptr i64 %iptr__id.i320 to double* + %val__id.i322 = load double* %ptr__id.i321, align 8 + %.lhs367 = add i32 %.lhs362, %mul_Nx_load_Ny_load.i + %22 = shl i32 %.lhs367, 3 + %iptr__id.i315.rhs = sext i32 %22 to i64 + %iptr__id.i315 = add i64 %iptr__id.i315.rhs, %Ain_load309_ptr2int.i + %ptr__id.i316 = inttoptr i64 %iptr__id.i315 to double* + %val__id.i317 = load double* %ptr__id.i316, align 8 + %.lhs368 = sub i32 %.lhs362, %mul_Nx_load_Ny_load.i + %23 = shl i32 %.lhs368, 3 + %iptr__id.i310.rhs = sext i32 %23 to i64 + %iptr__id.i310 = add i64 %iptr__id.i310.rhs, %Ain_load309_ptr2int.i + %ptr__id.i311 = inttoptr i64 %iptr__id.i310 to double* + %val__id.i312 = load double* %ptr__id.i311, align 8 + %coef_load145_offset_load.i = load double* %coef_load365_offset.i, align 8 + %24 = add i32 %17, 16 + %iptr__id.i305.rhs = sext i32 %24 to i64 + %iptr__id.i305 = add i64 %iptr__id.i305.rhs, %Ain_load309_ptr2int.i + %ptr__id.i306 = inttoptr i64 %iptr__id.i305 to double* + %val__id.i307 = load double* %ptr__id.i306, align 8 + %25 = add i32 %17, -16 + %iptr__id.i300.rhs = sext i32 %25 to i64 + %iptr__id.i300 = add i64 %iptr__id.i300.rhs, %Ain_load309_ptr2int.i + %ptr__id.i301 = 
inttoptr i64 %iptr__id.i300 to double* + %val__id.i302 = load double* %ptr__id.i301, align 8 + %.lhs371 = add i32 %.lhs362, %mul__Nx_load385.i + %26 = shl i32 %.lhs371, 3 + %iptr__id.i295.rhs = sext i32 %26 to i64 + %iptr__id.i295 = add i64 %iptr__id.i295.rhs, %Ain_load309_ptr2int.i + %ptr__id.i296 = inttoptr i64 %iptr__id.i295 to double* + %val__id.i297 = load double* %ptr__id.i296, align 8 + %.lhs372 = add i32 %.lhs362, %mul__Nx_load393.i + %27 = shl i32 %.lhs372, 3 + %iptr__id.i290.rhs = sext i32 %27 to i64 + %iptr__id.i290 = add i64 %iptr__id.i290.rhs, %Ain_load309_ptr2int.i + %ptr__id.i291 = inttoptr i64 %iptr__id.i290 to double* + %val__id.i292 = load double* %ptr__id.i291, align 8 + %.lhs373 = add i32 %.lhs362, %mul__Nxy_load402.i + %28 = shl i32 %.lhs373, 3 + %iptr__id.i285.rhs = sext i32 %28 to i64 + %iptr__id.i285 = add i64 %iptr__id.i285.rhs, %Ain_load309_ptr2int.i + %ptr__id.i286 = inttoptr i64 %iptr__id.i285 to double* + %val__id.i287 = load double* %ptr__id.i286, align 8 + %.lhs374 = add i32 %.lhs362, %mul__Nxy_load410.i + %29 = shl i32 %.lhs374, 3 + %iptr__id.i280.rhs = sext i32 %29 to i64 + %iptr__id.i280 = add i64 %iptr__id.i280.rhs, %Ain_load309_ptr2int.i + %ptr__id.i281 = inttoptr i64 %iptr__id.i280 to double* + %val__id.i282 = load double* %ptr__id.i281, align 8 + %coef_load196_offset_load.i = load double* %coef_load416_offset.i, align 8 + %30 = add i32 %17, 24 + %iptr__id.i275.rhs = sext i32 %30 to i64 + %iptr__id.i275 = add i64 %iptr__id.i275.rhs, %Ain_load309_ptr2int.i + %ptr__id.i276 = inttoptr i64 %iptr__id.i275 to double* + %val__id.i277 = load double* %ptr__id.i276, align 8 + %31 = add i32 %17, -24 + %iptr__id.i270.rhs = sext i32 %31 to i64 + %iptr__id.i270 = add i64 %iptr__id.i270.rhs, %Ain_load309_ptr2int.i + %ptr__id.i271 = inttoptr i64 %iptr__id.i270 to double* + %val__id.i272 = load double* %ptr__id.i271, align 8 + %.lhs377 = add i32 %.lhs362, %mul__Nx_load436.i + %32 = shl i32 %.lhs377, 3 + %iptr__id.i265.rhs = sext i32 %32 to i64 + %iptr__id.i265 = add i64 %iptr__id.i265.rhs, %Ain_load309_ptr2int.i + %ptr__id.i266 = inttoptr i64 %iptr__id.i265 to double* + %val__id.i267 = load double* %ptr__id.i266, align 8 + %.lhs378 = add i32 %.lhs362, %mul__Nx_load444.i + %33 = shl i32 %.lhs378, 3 + %iptr__id.i260.rhs = sext i32 %33 to i64 + %iptr__id.i260 = add i64 %iptr__id.i260.rhs, %Ain_load309_ptr2int.i + %ptr__id.i261 = inttoptr i64 %iptr__id.i260 to double* + %val__id.i262 = load double* %ptr__id.i261, align 8 + %.lhs379 = add i32 %.lhs362, %mul__Nxy_load453.i + %34 = shl i32 %.lhs379, 3 + %iptr__id.i255.rhs = sext i32 %34 to i64 + %iptr__id.i255 = add i64 %iptr__id.i255.rhs, %Ain_load309_ptr2int.i + %ptr__id.i256 = inttoptr i64 %iptr__id.i255 to double* + %val__id.i257 = load double* %ptr__id.i256, align 8 + %.lhs380 = add i32 %.lhs362, %mul__Nxy_load461.i + %35 = shl i32 %.lhs380, 3 + %iptr__id.i250.rhs = sext i32 %35 to i64 + %iptr__id.i250 = add i64 %iptr__id.i250.rhs, %Ain_load309_ptr2int.i + %ptr__id.i251 = inttoptr i64 %iptr__id.i250 to double* + %val__id.i252 = load double* %ptr__id.i251, align 8 + %val__id.i247 = load double* %ptr__id.i, align 8 + %iptr__id.i240 = add i64 %iptr__id.i.rhs, %Aout_load470_ptr2int.i + %ptr__id.i241 = inttoptr i64 %iptr__id.i240 to double* + %val__id.i242 = load double* %ptr__id.i241, align 8 + %iptr__id.i235 = add i64 %iptr__id.i.rhs, %vsq_load488_ptr2int.i + %ptr__id.i236 = inttoptr i64 %iptr__id.i235 to double* + %val__id.i237 = load double* %ptr__id.i236, align 8 + %val__id.i233.lhs.lhs = fmul double %val__id.i247, 
2.000000e+00 + %val__id.i233.lhs = fsub double %val__id.i233.lhs.lhs, %val__id.i242 + %val__id.i233.rhs.rhs.lhs.lhs.lhs = fmul double %coef_load_offset_load.i, %val__id.i + %val__id.i233.rhs.rhs.lhs.lhs.rhs.rhs.lhs.lhs.lhs.lhs = fadd double %val__id.i337, %val__id.i332 + %val__id.i233.rhs.rhs.lhs.lhs.rhs.rhs.lhs.lhs.lhs = fadd double %val__id.i233.rhs.rhs.lhs.lhs.rhs.rhs.lhs.lhs.lhs.lhs, %val__id.i327 + %val__id.i233.rhs.rhs.lhs.lhs.rhs.rhs.lhs.lhs = fadd double %val__id.i233.rhs.rhs.lhs.lhs.rhs.rhs.lhs.lhs.lhs, %val__id.i322 + %val__id.i233.rhs.rhs.lhs.lhs.rhs.rhs.lhs = fadd double %val__id.i233.rhs.rhs.lhs.lhs.rhs.rhs.lhs.lhs, %val__id.i317 + %val__id.i233.rhs.rhs.lhs.lhs.rhs.rhs = fadd double %val__id.i233.rhs.rhs.lhs.lhs.rhs.rhs.lhs, %val__id.i312 + %val__id.i233.rhs.rhs.lhs.lhs.rhs = fmul double %coef_load94_offset_load.i, %val__id.i233.rhs.rhs.lhs.lhs.rhs.rhs + %val__id.i233.rhs.rhs.lhs.lhs = fadd double %val__id.i233.rhs.rhs.lhs.lhs.lhs, %val__id.i233.rhs.rhs.lhs.lhs.rhs + %val__id.i233.rhs.rhs.lhs.rhs.rhs.lhs.lhs.lhs.lhs = fadd double %val__id.i307, %val__id.i302 + %val__id.i233.rhs.rhs.lhs.rhs.rhs.lhs.lhs.lhs = fadd double %val__id.i233.rhs.rhs.lhs.rhs.rhs.lhs.lhs.lhs.lhs, %val__id.i297 + %val__id.i233.rhs.rhs.lhs.rhs.rhs.lhs.lhs = fadd double %val__id.i233.rhs.rhs.lhs.rhs.rhs.lhs.lhs.lhs, %val__id.i292 + %val__id.i233.rhs.rhs.lhs.rhs.rhs.lhs = fadd double %val__id.i233.rhs.rhs.lhs.rhs.rhs.lhs.lhs, %val__id.i287 + %val__id.i233.rhs.rhs.lhs.rhs.rhs = fadd double %val__id.i233.rhs.rhs.lhs.rhs.rhs.lhs, %val__id.i282 + %val__id.i233.rhs.rhs.lhs.rhs = fmul double %coef_load145_offset_load.i, %val__id.i233.rhs.rhs.lhs.rhs.rhs + %val__id.i233.rhs.rhs.lhs = fadd double %val__id.i233.rhs.rhs.lhs.lhs, %val__id.i233.rhs.rhs.lhs.rhs + %val__id.i233.rhs.rhs.rhs.rhs.lhs.lhs.lhs.lhs = fadd double %val__id.i277, %val__id.i272 + %val__id.i233.rhs.rhs.rhs.rhs.lhs.lhs.lhs = fadd double %val__id.i233.rhs.rhs.rhs.rhs.lhs.lhs.lhs.lhs, %val__id.i267 + %val__id.i233.rhs.rhs.rhs.rhs.lhs.lhs = fadd double %val__id.i233.rhs.rhs.rhs.rhs.lhs.lhs.lhs, %val__id.i262 + %val__id.i233.rhs.rhs.rhs.rhs.lhs = fadd double %val__id.i233.rhs.rhs.rhs.rhs.lhs.lhs, %val__id.i257 + %val__id.i233.rhs.rhs.rhs.rhs = fadd double %val__id.i233.rhs.rhs.rhs.rhs.lhs, %val__id.i252 + %val__id.i233.rhs.rhs.rhs = fmul double %coef_load196_offset_load.i, %val__id.i233.rhs.rhs.rhs.rhs + %val__id.i233.rhs.rhs = fadd double %val__id.i233.rhs.rhs.lhs, %val__id.i233.rhs.rhs.rhs + %val__id.i233.rhs = fmul double %val__id.i237, %val__id.i233.rhs.rhs + %val__id.i233 = fadd double %val__id.i233.lhs, %val__id.i233.rhs + store double %val__id.i233, double* %ptr__id.i241, align 8 + %new_counter279.i = add i32 %counter32.4.i386, 32 + %before_aligned_end73.i = icmp slt i32 %new_counter279.i, %aligned_end31.i + br i1 %before_aligned_end73.i, label %foreach_full_body.i, label %partial_inner_all_outer.i + +foreach_test21.i.preheader: ; preds = %foreach_reset19.i, %foreach_test21.i.preheader.lr.ph + %iter_val.i398 = phi <1 x i32> [ %iter_val.i395, %foreach_test21.i.preheader.lr.ph ], [ %iter_val.i, %foreach_reset19.i ] + %counter.0.i397 = phi i32 [ %add_z0_load_mul_calltmp47_, %foreach_test21.i.preheader.lr.ph ], [ %new_counter.i, %foreach_reset19.i ] + %tid.i3.i = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + %__laneidx47.i = and i32 %tid.i3.i, 31 + %36 = zext i32 %__laneidx47.i to i64 + %arrayidx48.i = getelementptr [32 x i8]* @constDeltaForeach1, i64 0, i64 %36 + br i1 %cmp54.i390, label %outer_not_in_extras.i.preheader.lr.ph, label 
%foreach_reset19.i + +outer_not_in_extras.i.preheader.lr.ph: ; preds = %foreach_test21.i.preheader + %37 = load i8* %arrayidx48.i, align 1 + %_zext49.i388 = zext i8 %37 to i32 + %38 = insertelement <1 x i32> undef, i32 %_zext49.i388, i32 0 + %iter_val50.i389 = add <1 x i32> %smear_counter_init44.i387, %38 + %mul_z_load297_Nxy_load298_broadcast.i = mul <1 x i32> %iter_val.i398, %Nxy_load298_broadcast_init.i + br label %outer_not_in_extras.i.preheader + +foreach_reset19.i: ; preds = %foreach_reset27.i, %foreach_test21.i.preheader + %new_counter.i = add i32 %counter.0.i397, 1 + %smear_counter_init.i = insertelement <1 x i32> undef, i32 %new_counter.i, i32 0 + %39 = load i8* %arrayidx.i, align 1 + %_zext.i = zext i8 %39 to i32 + %40 = insertelement <1 x i32> undef, i32 %_zext.i, i32 0 + %iter_val.i = add <1 x i32> %smear_counter_init.i, %40 + %exitcond400 = icmp eq i32 %new_counter.i, %14 + br i1 %exitcond400, label %if_then, label %foreach_test21.i.preheader + +outer_not_in_extras.i.preheader: ; preds = %foreach_reset27.i, %outer_not_in_extras.i.preheader.lr.ph + %iter_val50.i392 = phi <1 x i32> [ %iter_val50.i389, %outer_not_in_extras.i.preheader.lr.ph ], [ %iter_val50.i, %foreach_reset27.i ] + %counter25.1.i391 = phi i32 [ %add_y0_load_mul_calltmp41_, %outer_not_in_extras.i.preheader.lr.ph ], [ %new_counter35.i, %foreach_reset27.i ] + br i1 %before_aligned_end73.i385, label %foreach_full_body.i, label %partial_inner_all_outer.i + +foreach_reset27.i: ; preds = %pl_dolane.i, %partial_inner_only.i, %partial_inner_all_outer.i + %new_counter35.i = add i32 %counter25.1.i391, 1 + %smear_counter_init44.i = insertelement <1 x i32> undef, i32 %new_counter35.i, i32 0 + %41 = load i8* %arrayidx48.i, align 1 + %_zext49.i = zext i8 %41 to i32 + %42 = insertelement <1 x i32> undef, i32 %_zext49.i, i32 0 + %iter_val50.i = add <1 x i32> %smear_counter_init44.i, %42 + %exitcond = icmp eq i32 %new_counter35.i, %8 + br i1 %exitcond, label %foreach_reset19.i, label %outer_not_in_extras.i.preheader + +partial_inner_all_outer.i: ; preds = %outer_not_in_extras.i.preheader, %foreach_full_body.i + %counter32.4.i.lcssa = phi i32 [ %add_x0_load_mul_calltmp35_, %outer_not_in_extras.i.preheader ], [ %new_counter279.i, %foreach_full_body.i ] + %before_full_end.i = icmp slt i32 %counter32.4.i.lcssa, %r.i.i + br i1 %before_full_end.i, label %partial_inner_only.i, label %foreach_reset27.i + +partial_inner_only.i: ; preds = %partial_inner_all_outer.i + %smear_counter_init282.i = insertelement <1 x i32> undef, i32 %counter32.4.i.lcssa, i32 0 + %tid.i2.i = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + %__laneidx285.i = and i32 %tid.i2.i, 31 + %43 = zext i32 %__laneidx285.i to i64 + %arrayidx286.i = getelementptr [32 x i8]* @constDeltaForeach4, i64 0, i64 %43 + %44 = load i8* %arrayidx286.i, align 1 + %_zext287.i = zext i8 %44 to i32 + %45 = insertelement <1 x i32> undef, i32 %_zext287.i, i32 0 + %iter_val288.i = add <1 x i32> %smear_counter_init282.i, %45 + %cmp291.i = icmp slt <1 x i32> %iter_val288.i, %smear_end_init289.i + %mul_y_load299_Nx_load300_broadcast.i = mul <1 x i32> %iter_val50.i392, %Nx_load300_broadcast_init.i + %add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast.i = add <1 x i32> %mul_z_load297_Nxy_load298_broadcast.i, %mul_y_load299_Nx_load300_broadcast.i + %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i = add <1 x i32> %add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast.i, %iter_val288.i + %v.i.i224 = extractelement 
<1 x i1> %cmp291.i, i32 0 + br i1 %v.i.i224, label %pl_dolane.i, label %foreach_reset27.i + +pl_dolane.i: ; preds = %partial_inner_only.i + %coef_load303_offset_load.i = load double* %coef, align 8 + %.lhs361 = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 + %46 = shl i32 %.lhs361, 3 + %iptr__id.i225.rhs = sext i32 %46 to i64 + %iptr__id.i225 = add i64 %iptr__id.i225.rhs, %Ain_load309_ptr2int.i + %ptr__id.i226 = inttoptr i64 %iptr__id.i225 to double* + %val__id.i227 = load double* %ptr__id.i226, align 8 + %coef_load314_offset_load.i401 = load double* %coef_load314_offset.i, align 8 + %.lhs360.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 + %.lhs360 = shl i32 %.lhs360.lhs, 3 + %47 = add i32 %.lhs360, 8 + %iptr__id.i218.rhs = sext i32 %47 to i64 + %iptr__id.i218 = add i64 %iptr__id.i218.rhs, %Ain_load309_ptr2int.i + %ptr__id.i219 = inttoptr i64 %iptr__id.i218 to double* + %val__id.i220 = load double* %ptr__id.i219, align 8 + %.lhs359.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 + %.lhs359 = shl i32 %.lhs359.lhs, 3 + %48 = add i32 %.lhs359, -8 + %iptr__id.i211.rhs = sext i32 %48 to i64 + %iptr__id.i211 = add i64 %iptr__id.i211.rhs, %Ain_load309_ptr2int.i + %ptr__id.i212 = inttoptr i64 %iptr__id.i211 to double* + %val__id.i213 = load double* %ptr__id.i212, align 8 + %.lhs358.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 + %.lhs358 = add i32 %.lhs358.lhs, %Nx + %49 = shl i32 %.lhs358, 3 + %iptr__id.i204.rhs = sext i32 %49 to i64 + %iptr__id.i204 = add i64 %iptr__id.i204.rhs, %Ain_load309_ptr2int.i + %ptr__id.i205 = inttoptr i64 %iptr__id.i204 to double* + %val__id.i206 = load double* %ptr__id.i205, align 8 + %.lhs357.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 + %.lhs357 = sub i32 %.lhs357.lhs, %Nx + %50 = shl i32 %.lhs357, 3 + %iptr__id.i197.rhs = sext i32 %50 to i64 + %iptr__id.i197 = add i64 %iptr__id.i197.rhs, %Ain_load309_ptr2int.i + %ptr__id.i198 = inttoptr i64 %iptr__id.i197 to double* + %val__id.i199 = load double* %ptr__id.i198, align 8 + %.lhs356.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 + %.lhs356 = add i32 %.lhs356.lhs, %mul_Nx_load_Ny_load.i + %51 = shl i32 %.lhs356, 3 + %iptr__id.i190.rhs = sext i32 %51 to i64 + %iptr__id.i190 = add i64 %iptr__id.i190.rhs, %Ain_load309_ptr2int.i + %ptr__id.i191 = inttoptr i64 %iptr__id.i190 to double* + %val__id.i192 = load double* %ptr__id.i191, align 8 + %.lhs355.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 + %.lhs355 = sub i32 %.lhs355.lhs, %mul_Nx_load_Ny_load.i + %52 = shl i32 %.lhs355, 3 + %iptr__id.i183.rhs = sext i32 %52 to i64 + %iptr__id.i183 = add i64 %iptr__id.i183.rhs, %Ain_load309_ptr2int.i + %ptr__id.i184 = inttoptr i64 %iptr__id.i183 to double* + %val__id.i185 = load double* %ptr__id.i184, align 8 + %coef_load365_offset_load.i457 = load double* %coef_load365_offset.i, align 8 + %.lhs354.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 + %.lhs354 = shl i32 %.lhs354.lhs, 3 + 
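+ ; Note: shl by 3 converts the flattened double index into a byte offset
+ ; (sizeof(double) == 8); the +16/-16 byte offsets that follow address the
+ ; x +/- 2 neighbours of this radius-3 stencil, which are weighted by
+ ; coef[2] in the sum at the end of the block.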
%53 = add i32 %.lhs354, 16 + %iptr__id.i176.rhs = sext i32 %53 to i64 + %iptr__id.i176 = add i64 %iptr__id.i176.rhs, %Ain_load309_ptr2int.i + %ptr__id.i177 = inttoptr i64 %iptr__id.i176 to double* + %val__id.i178 = load double* %ptr__id.i177, align 8 + %.lhs353.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 + %.lhs353 = shl i32 %.lhs353.lhs, 3 + %54 = add i32 %.lhs353, -16 + %iptr__id.i169.rhs = sext i32 %54 to i64 + %iptr__id.i169 = add i64 %iptr__id.i169.rhs, %Ain_load309_ptr2int.i + %ptr__id.i170 = inttoptr i64 %iptr__id.i169 to double* + %val__id.i171 = load double* %ptr__id.i170, align 8 + %.lhs352.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 + %.lhs352 = add i32 %.lhs352.lhs, %mul__Nx_load385.i + %55 = shl i32 %.lhs352, 3 + %iptr__id.i162.rhs = sext i32 %55 to i64 + %iptr__id.i162 = add i64 %iptr__id.i162.rhs, %Ain_load309_ptr2int.i + %ptr__id.i163 = inttoptr i64 %iptr__id.i162 to double* + %val__id.i164 = load double* %ptr__id.i163, align 8 + %.lhs351.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 + %.lhs351 = add i32 %.lhs351.lhs, %mul__Nx_load393.i + %56 = shl i32 %.lhs351, 3 + %iptr__id.i155.rhs = sext i32 %56 to i64 + %iptr__id.i155 = add i64 %iptr__id.i155.rhs, %Ain_load309_ptr2int.i + %ptr__id.i156 = inttoptr i64 %iptr__id.i155 to double* + %val__id.i157 = load double* %ptr__id.i156, align 8 + %.lhs350.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 + %.lhs350 = add i32 %.lhs350.lhs, %mul__Nxy_load402.i + %57 = shl i32 %.lhs350, 3 + %iptr__id.i148.rhs = sext i32 %57 to i64 + %iptr__id.i148 = add i64 %iptr__id.i148.rhs, %Ain_load309_ptr2int.i + %ptr__id.i149 = inttoptr i64 %iptr__id.i148 to double* + %val__id.i150 = load double* %ptr__id.i149, align 8 + %.lhs349.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 + %.lhs349 = add i32 %.lhs349.lhs, %mul__Nxy_load410.i + %58 = shl i32 %.lhs349, 3 + %iptr__id.i141.rhs = sext i32 %58 to i64 + %iptr__id.i141 = add i64 %iptr__id.i141.rhs, %Ain_load309_ptr2int.i + %ptr__id.i142 = inttoptr i64 %iptr__id.i141 to double* + %val__id.i143 = load double* %ptr__id.i142, align 8 + %coef_load416_offset_load.i544 = load double* %coef_load416_offset.i, align 8 + %.lhs348.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 + %.lhs348 = shl i32 %.lhs348.lhs, 3 + %59 = add i32 %.lhs348, 24 + %iptr__id.i134.rhs = sext i32 %59 to i64 + %iptr__id.i134 = add i64 %iptr__id.i134.rhs, %Ain_load309_ptr2int.i + %ptr__id.i135 = inttoptr i64 %iptr__id.i134 to double* + %val__id.i136 = load double* %ptr__id.i135, align 8 + %.lhs347.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 + %.lhs347 = shl i32 %.lhs347.lhs, 3 + %60 = add i32 %.lhs347, -24 + %iptr__id.i127.rhs = sext i32 %60 to i64 + %iptr__id.i127 = add i64 %iptr__id.i127.rhs, %Ain_load309_ptr2int.i + %ptr__id.i128 = inttoptr i64 %iptr__id.i127 to double* + %val__id.i129 = load double* %ptr__id.i128, align 8 + %.lhs346.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 
0 + %.lhs346 = add i32 %.lhs346.lhs, %mul__Nx_load436.i + %61 = shl i32 %.lhs346, 3 + %iptr__id.i120.rhs = sext i32 %61 to i64 + %iptr__id.i120 = add i64 %iptr__id.i120.rhs, %Ain_load309_ptr2int.i + %ptr__id.i121 = inttoptr i64 %iptr__id.i120 to double* + %val__id.i122 = load double* %ptr__id.i121, align 8 + %.lhs345.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 + %.lhs345 = add i32 %.lhs345.lhs, %mul__Nx_load444.i + %62 = shl i32 %.lhs345, 3 + %iptr__id.i113.rhs = sext i32 %62 to i64 + %iptr__id.i113 = add i64 %iptr__id.i113.rhs, %Ain_load309_ptr2int.i + %ptr__id.i114 = inttoptr i64 %iptr__id.i113 to double* + %val__id.i115 = load double* %ptr__id.i114, align 8 + %.lhs344.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 + %.lhs344 = add i32 %.lhs344.lhs, %mul__Nxy_load453.i + %63 = shl i32 %.lhs344, 3 + %iptr__id.i106.rhs = sext i32 %63 to i64 + %iptr__id.i106 = add i64 %iptr__id.i106.rhs, %Ain_load309_ptr2int.i + %ptr__id.i107 = inttoptr i64 %iptr__id.i106 to double* + %val__id.i108 = load double* %ptr__id.i107, align 8 + %.lhs343.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 + %.lhs343 = add i32 %.lhs343.lhs, %mul__Nxy_load461.i + %64 = shl i32 %.lhs343, 3 + %iptr__id.i99.rhs = sext i32 %64 to i64 + %iptr__id.i99 = add i64 %iptr__id.i99.rhs, %Ain_load309_ptr2int.i + %ptr__id.i100 = inttoptr i64 %iptr__id.i99 to double* + %val__id.i101 = load double* %ptr__id.i100, align 8 + %.lhs342 = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 + %65 = shl i32 %.lhs342, 3 + %iptr__id.i92.rhs = sext i32 %65 to i64 + %iptr__id.i92 = add i64 %iptr__id.i92.rhs, %Ain_load309_ptr2int.i + %ptr__id.i93 = inttoptr i64 %iptr__id.i92 to double* + %val__id.i94 = load double* %ptr__id.i93, align 8 + %.lhs341 = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 + %66 = shl i32 %.lhs341, 3 + %iptr__id.i85.rhs = sext i32 %66 to i64 + %iptr__id.i85 = add i64 %iptr__id.i85.rhs, %Aout_load470_ptr2int.i + %ptr__id.i86 = inttoptr i64 %iptr__id.i85 to double* + %val__id.i87 = load double* %ptr__id.i86, align 8 + %.lhs340 = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 + %67 = shl i32 %.lhs340, 3 + %iptr__id.i80.rhs = sext i32 %67 to i64 + %iptr__id.i80 = add i64 %iptr__id.i80.rhs, %vsq_load488_ptr2int.i + %ptr__id.i81 = inttoptr i64 %iptr__id.i80 to double* + %val__id.i82 = load double* %ptr__id.i81, align 8 + %.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 + %68 = shl i32 %.lhs, 3 + %iptr__id.i76.rhs = sext i32 %68 to i64 + %iptr__id.i76 = add i64 %iptr__id.i76.rhs, %Aout_load470_ptr2int.i + %ptr__id.i77 = inttoptr i64 %iptr__id.i76 to double* + %val__id.i78.lhs.lhs = fmul double %val__id.i94, 2.000000e+00 + %val__id.i78.lhs = fsub double %val__id.i78.lhs.lhs, %val__id.i87 + %val__id.i78.rhs.rhs.lhs.lhs.lhs = fmul double %coef_load303_offset_load.i, %val__id.i227 + %val__id.i78.rhs.rhs.lhs.lhs.rhs.rhs.lhs.lhs.lhs.lhs = fadd double %val__id.i220, %val__id.i213 + %val__id.i78.rhs.rhs.lhs.lhs.rhs.rhs.lhs.lhs.lhs = fadd double 
%val__id.i78.rhs.rhs.lhs.lhs.rhs.rhs.lhs.lhs.lhs.lhs, %val__id.i206 + %val__id.i78.rhs.rhs.lhs.lhs.rhs.rhs.lhs.lhs = fadd double %val__id.i78.rhs.rhs.lhs.lhs.rhs.rhs.lhs.lhs.lhs, %val__id.i199 + %val__id.i78.rhs.rhs.lhs.lhs.rhs.rhs.lhs = fadd double %val__id.i78.rhs.rhs.lhs.lhs.rhs.rhs.lhs.lhs, %val__id.i192 + %val__id.i78.rhs.rhs.lhs.lhs.rhs.rhs = fadd double %val__id.i78.rhs.rhs.lhs.lhs.rhs.rhs.lhs, %val__id.i185 + %val__id.i78.rhs.rhs.lhs.lhs.rhs = fmul double %coef_load314_offset_load.i401, %val__id.i78.rhs.rhs.lhs.lhs.rhs.rhs + %val__id.i78.rhs.rhs.lhs.lhs = fadd double %val__id.i78.rhs.rhs.lhs.lhs.lhs, %val__id.i78.rhs.rhs.lhs.lhs.rhs + %val__id.i78.rhs.rhs.lhs.rhs.rhs.lhs.lhs.lhs.lhs = fadd double %val__id.i178, %val__id.i171 + %val__id.i78.rhs.rhs.lhs.rhs.rhs.lhs.lhs.lhs = fadd double %val__id.i78.rhs.rhs.lhs.rhs.rhs.lhs.lhs.lhs.lhs, %val__id.i164 + %val__id.i78.rhs.rhs.lhs.rhs.rhs.lhs.lhs = fadd double %val__id.i78.rhs.rhs.lhs.rhs.rhs.lhs.lhs.lhs, %val__id.i157 + %val__id.i78.rhs.rhs.lhs.rhs.rhs.lhs = fadd double %val__id.i78.rhs.rhs.lhs.rhs.rhs.lhs.lhs, %val__id.i150 + %val__id.i78.rhs.rhs.lhs.rhs.rhs = fadd double %val__id.i78.rhs.rhs.lhs.rhs.rhs.lhs, %val__id.i143 + %val__id.i78.rhs.rhs.lhs.rhs = fmul double %coef_load365_offset_load.i457, %val__id.i78.rhs.rhs.lhs.rhs.rhs + %val__id.i78.rhs.rhs.lhs = fadd double %val__id.i78.rhs.rhs.lhs.lhs, %val__id.i78.rhs.rhs.lhs.rhs + %val__id.i78.rhs.rhs.rhs.rhs.lhs.lhs.lhs.lhs = fadd double %val__id.i136, %val__id.i129 + %val__id.i78.rhs.rhs.rhs.rhs.lhs.lhs.lhs = fadd double %val__id.i78.rhs.rhs.rhs.rhs.lhs.lhs.lhs.lhs, %val__id.i122 + %val__id.i78.rhs.rhs.rhs.rhs.lhs.lhs = fadd double %val__id.i78.rhs.rhs.rhs.rhs.lhs.lhs.lhs, %val__id.i115 + %val__id.i78.rhs.rhs.rhs.rhs.lhs = fadd double %val__id.i78.rhs.rhs.rhs.rhs.lhs.lhs, %val__id.i108 + %val__id.i78.rhs.rhs.rhs.rhs = fadd double %val__id.i78.rhs.rhs.rhs.rhs.lhs, %val__id.i101 + %val__id.i78.rhs.rhs.rhs = fmul double %coef_load416_offset_load.i544, %val__id.i78.rhs.rhs.rhs.rhs + %val__id.i78.rhs.rhs = fadd double %val__id.i78.rhs.rhs.lhs, %val__id.i78.rhs.rhs.rhs + %val__id.i78.rhs = fmul double %val__id.i78.rhs.rhs, %val__id.i82 + %val__id.i78 = fadd double %val__id.i78.lhs, %val__id.i78.rhs + store double %val__id.i78, double* %ptr__id.i77, align 8 + br label %foreach_reset27.i +} + +define void @loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E_(i32 %t0, i32 %t1, i32 %x0, i32 %x1, i32 %y0, i32 %y1, i32 %z0, i32 %z1, i32 %Nx, i32 %Ny, i32 %Nz, double* %coef, double* %vsq, double* %Aeven, double* %Aodd, <1 x i1> %__mask) { +allocas: + %less_t_load_t1_load94 = icmp slt i32 %t0, %t1 + br i1 %less_t_load_t1_load94, label %for_loop.lr.ph, label %for_exit + +for_loop.lr.ph: ; preds = %allocas + %add_sub_x1_load21_x0_load22_ = sub i32 31, %x0 + %sub_add_sub_x1_load21_x0_load22__ = add i32 %add_sub_x1_load21_x0_load22_, %x1 + %div_sub_add_sub_x1_load21_x0_load22___ = sdiv i32 %sub_add_sub_x1_load21_x0_load22__, 32 + %add_sub_y1_load23_y0_load24_ = sub i32 7, %y0 + %sub_add_sub_y1_load23_y0_load24__ = add i32 %add_sub_y1_load23_y0_load24_, %y1 + %div_sub_add_sub_y1_load23_y0_load24___ = sdiv i32 %sub_add_sub_y1_load23_y0_load24__, 8 + %add_sub_z1_load25_z0_load26_ = sub i32 7, %z0 + %sub_add_sub_z1_load25_z0_load26__ = add i32 %add_sub_z1_load25_z0_load26_, %z1 + %div_sub_add_sub_z1_load25_z0_load26___ = sdiv i32 %sub_add_sub_z1_load25_z0_load26__, 8 + %ntxm1.i = add nsw i32 %div_sub_add_sub_x1_load21_x0_load22___, -1 + 
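+ ; Note: the three sdivs in for_loop.lr.ph are ceiling divisions, giving a
+ ; task grid of ceil((x1-x0)/32) x ceil((y1-y0)/8) x ceil((z1-z0)/8).
+ ; The ashr/add pair below computes nbx = (ntx - 1) / 4 + 1, folding four
+ ; x-tasks into each CUDA block to match the launch's blockDim.x of 128.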
%ntxm1d4.i = ashr i32 %ntxm1.i, 2 + %nbx.i = add nsw i32 %ntxm1d4.i, 1 + br label %for_loop + +for_loop: ; preds = %if_exit, %for_loop.lr.ph + %t.095 = phi i32 [ %t0, %for_loop.lr.ph ], [ %t_load78_plus1, %if_exit ] + %bitop = and i32 %t.095, 1 + %equal_bitop_ = icmp eq i32 %bitop, 0 + %tid.i.i = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + %and.i = and i32 %tid.i.i, 31 + %cmp.i = icmp eq i32 %and.i, 0 + br i1 %cmp.i, label %if.then.i, label %ISPCGetParamBuffer.exit + +if.then.i: ; preds = %for_loop + %ptri64tmp.i = tail call i64 @cudaGetParameterBuffer(i64 8, i64 72) + %phitmp.i = inttoptr i64 %ptri64tmp.i to i8* + br label %ISPCGetParamBuffer.exit + +ISPCGetParamBuffer.exit: ; preds = %if.then.i, %for_loop + %ptri64.i = phi i8* [ %phitmp.i, %if.then.i ], [ null, %for_loop ] + %cmp1 = icmp eq i8* %ptri64.i, null + br i1 %equal_bitop_, label %if_then, label %if_else + +for_exit: ; preds = %if_exit, %allocas + %0 = tail call i32 @cudaDeviceSynchronize() + ret void + +if_then: ; preds = %ISPCGetParamBuffer.exit + br i1 %cmp1, label %if_false, label %if_true + +if_else: ; preds = %ISPCGetParamBuffer.exit + br i1 %cmp1, label %if_false62, label %if_true61 + +if_exit: ; preds = %if.then.i92, %if_false62, %if.then.i83, %if_false + %1 = tail call i32 @cudaDeviceSynchronize() + %t_load78_plus1 = add i32 %t.095, 1 + %exitcond = icmp eq i32 %t_load78_plus1, %t1 + br i1 %exitcond, label %for_exit, label %for_loop + +if_true: ; preds = %if_then + %funarg = bitcast i8* %ptri64.i to i32* + store i32 %x0, i32* %funarg, align 4 + %funarg27 = getelementptr i8* %ptri64.i, i64 4 + %2 = bitcast i8* %funarg27 to i32* + store i32 %x1, i32* %2, align 4 + %funarg28 = getelementptr i8* %ptri64.i, i64 8 + %3 = bitcast i8* %funarg28 to i32* + store i32 %y0, i32* %3, align 4 + %funarg29 = getelementptr i8* %ptri64.i, i64 12 + %4 = bitcast i8* %funarg29 to i32* + store i32 %y1, i32* %4, align 4 + %funarg30 = getelementptr i8* %ptri64.i, i64 16 + %5 = bitcast i8* %funarg30 to i32* + store i32 %z0, i32* %5, align 4 + %funarg31 = getelementptr i8* %ptri64.i, i64 20 + %6 = bitcast i8* %funarg31 to i32* + store i32 %z1, i32* %6, align 4 + %funarg32 = getelementptr i8* %ptri64.i, i64 24 + %7 = bitcast i8* %funarg32 to i32* + store i32 %Nx, i32* %7, align 4 + %funarg33 = getelementptr i8* %ptri64.i, i64 28 + %8 = bitcast i8* %funarg33 to i32* + store i32 %Ny, i32* %8, align 4 + %funarg34 = getelementptr i8* %ptri64.i, i64 32 + %9 = bitcast i8* %funarg34 to i32* + store i32 %Nz, i32* %9, align 4 + %funarg35 = getelementptr i8* %ptri64.i, i64 40 + %10 = bitcast i8* %funarg35 to double** + store double* %coef, double** %10, align 8 + %funarg36 = getelementptr i8* %ptri64.i, i64 48 + %11 = bitcast i8* %funarg36 to double** + store double* %vsq, double** %11, align 8 + %funarg37 = getelementptr i8* %ptri64.i, i64 56 + %12 = bitcast i8* %funarg37 to double** + store double* %Aeven, double** %12, align 8 + %funarg38 = getelementptr i8* %ptri64.i, i64 64 + %13 = bitcast i8* %funarg38 to double** + store double* %Aodd, double** %13, align 8 + br label %if_false + +if_false: ; preds = %if_true, %if_then + %tid.i.i80 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + %and.i81 = and i32 %tid.i.i80, 31 + %cmp.i82 = icmp eq i32 %and.i81, 0 + br i1 %cmp.i82, label %if.then.i83, label %if_exit + +if.then.i83: ; preds = %if_false + %args_i64.i = ptrtoint i8* %ptri64.i to i64 + %res_tmp.i = tail call i32 asm sideeffect "{\0A .param .b64 param0;\0A st.param.b64\09[param0+0], $1;\0A .param .b64 param1;\0A st.param.b64\09[param1+0], $2;\0A 
.param .align 4 .b8 param2[12];\0A st.param.b32\09[param2+0], $3; \0A st.param.b32\09[param2+4], $4; \0A st.param.b32\09[param2+8], $5; \0A .param .align 4 .b8 param3[12];\0A st.param.b32\09[param3+0], $6; \0A st.param.b32\09[param3+4], $7; \0A st.param.b32\09[param3+8], $8; \0A .param .b32 param4;\0A st.param.b32\09[param4+0], $9; \0A .param .b64 param5;\0A st.param.b64\09[param5+0], $10; \0A\0A .param .b32 retval0;\0A call.uni (retval0), \0A cudaLaunchDevice,\0A (\0A param0, \0A param1, \0A param2, \0A param3, \0A param4, \0A param5\0A );\0A ld.param.b32\09$0, [retval0+0];\0A }\0A ", "=r, l,l, r,r,r, r,r,r, r,l"(i64 ptrtoint (void (i32, i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*)* @stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_ to i64), i64 %args_i64.i, i32 %nbx.i, i32 %div_sub_add_sub_y1_load23_y0_load24___, i32 %div_sub_add_sub_z1_load25_z0_load26___, i32 128, i32 1, i32 1, i32 0, i64 0) + br label %if_exit + +if_true61: ; preds = %if_else + %funarg64 = bitcast i8* %ptri64.i to i32* + store i32 %x0, i32* %funarg64, align 4 + %funarg65 = getelementptr i8* %ptri64.i, i64 4 + %14 = bitcast i8* %funarg65 to i32* + store i32 %x1, i32* %14, align 4 + %funarg66 = getelementptr i8* %ptri64.i, i64 8 + %15 = bitcast i8* %funarg66 to i32* + store i32 %y0, i32* %15, align 4 + %funarg67 = getelementptr i8* %ptri64.i, i64 12 + %16 = bitcast i8* %funarg67 to i32* + store i32 %y1, i32* %16, align 4 + %funarg68 = getelementptr i8* %ptri64.i, i64 16 + %17 = bitcast i8* %funarg68 to i32* + store i32 %z0, i32* %17, align 4 + %funarg69 = getelementptr i8* %ptri64.i, i64 20 + %18 = bitcast i8* %funarg69 to i32* + store i32 %z1, i32* %18, align 4 + %funarg70 = getelementptr i8* %ptri64.i, i64 24 + %19 = bitcast i8* %funarg70 to i32* + store i32 %Nx, i32* %19, align 4 + %funarg71 = getelementptr i8* %ptri64.i, i64 28 + %20 = bitcast i8* %funarg71 to i32* + store i32 %Ny, i32* %20, align 4 + %funarg72 = getelementptr i8* %ptri64.i, i64 32 + %21 = bitcast i8* %funarg72 to i32* + store i32 %Nz, i32* %21, align 4 + %funarg73 = getelementptr i8* %ptri64.i, i64 40 + %22 = bitcast i8* %funarg73 to double** + store double* %coef, double** %22, align 8 + %funarg74 = getelementptr i8* %ptri64.i, i64 48 + %23 = bitcast i8* %funarg74 to double** + store double* %vsq, double** %23, align 8 + %funarg75 = getelementptr i8* %ptri64.i, i64 56 + %24 = bitcast i8* %funarg75 to double** + store double* %Aodd, double** %24, align 8 + %funarg76 = getelementptr i8* %ptri64.i, i64 64 + %25 = bitcast i8* %funarg76 to double** + store double* %Aeven, double** %25, align 8 + br label %if_false62 + +if_false62: ; preds = %if_true61, %if_else + %tid.i.i84 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + %and.i85 = and i32 %tid.i.i84, 31 + %cmp.i86 = icmp eq i32 %and.i85, 0 + br i1 %cmp.i86, label %if.then.i92, label %if_exit + +if.then.i92: ; preds = %if_false62 + %args_i64.i90 = ptrtoint i8* %ptri64.i to i64 + %res_tmp.i91 = tail call i32 asm sideeffect "{\0A .param .b64 param0;\0A st.param.b64\09[param0+0], $1;\0A .param .b64 param1;\0A st.param.b64\09[param1+0], $2;\0A .param .align 4 .b8 param2[12];\0A st.param.b32\09[param2+0], $3; \0A st.param.b32\09[param2+4], $4; \0A st.param.b32\09[param2+8], $5; \0A .param .align 4 .b8 param3[12];\0A st.param.b32\09[param3+0], $6; \0A st.param.b32\09[param3+4], $7; \0A st.param.b32\09[param3+8], $8; \0A .param .b32 param4;\0A st.param.b32\09[param4+0], $9; \0A .param .b64 param5;\0A 
st.param.b64\09[param5+0], $10; \0A\0A .param .b32 retval0;\0A call.uni (retval0), \0A cudaLaunchDevice,\0A (\0A param0, \0A param1, \0A param2, \0A param3, \0A param4, \0A param5\0A );\0A ld.param.b32\09$0, [retval0+0];\0A }\0A ", "=r, l,l, r,r,r, r,r,r, r,l"(i64 ptrtoint (void (i32, i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*)* @stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_ to i64), i64 %args_i64.i90, i32 %nbx.i, i32 %div_sub_add_sub_y1_load23_y0_load24___, i32 %div_sub_add_sub_z1_load25_z0_load26___, i32 128, i32 1, i32 1, i32 0, i64 0) + br label %if_exit +} + +define void @loop_stencil_ispc_tasks(i32 %t0, i32 %t1, i32 %x0, i32 %x1, i32 %y0, i32 %y1, i32 %z0, i32 %z1, i32 %Nx, i32 %Ny, i32 %Nz, double* %coef, double* %vsq, double* %Aeven, double* %Aodd) { +allocas: + %less_t_load_t1_load94 = icmp slt i32 %t0, %t1 + br i1 %less_t_load_t1_load94, label %for_loop.lr.ph, label %for_exit + +for_loop.lr.ph: ; preds = %allocas + %add_sub_x1_load21_x0_load22_ = sub i32 31, %x0 + %sub_add_sub_x1_load21_x0_load22__ = add i32 %add_sub_x1_load21_x0_load22_, %x1 + %div_sub_add_sub_x1_load21_x0_load22___ = sdiv i32 %sub_add_sub_x1_load21_x0_load22__, 32 + %add_sub_y1_load23_y0_load24_ = sub i32 7, %y0 + %sub_add_sub_y1_load23_y0_load24__ = add i32 %add_sub_y1_load23_y0_load24_, %y1 + %div_sub_add_sub_y1_load23_y0_load24___ = sdiv i32 %sub_add_sub_y1_load23_y0_load24__, 8 + %add_sub_z1_load25_z0_load26_ = sub i32 7, %z0 + %sub_add_sub_z1_load25_z0_load26__ = add i32 %add_sub_z1_load25_z0_load26_, %z1 + %div_sub_add_sub_z1_load25_z0_load26___ = sdiv i32 %sub_add_sub_z1_load25_z0_load26__, 8 + %ntxm1.i = add nsw i32 %div_sub_add_sub_x1_load21_x0_load22___, -1 + %ntxm1d4.i = ashr i32 %ntxm1.i, 2 + %nbx.i = add nsw i32 %ntxm1d4.i, 1 + br label %for_loop + +for_loop: ; preds = %if_exit, %for_loop.lr.ph + %t.095 = phi i32 [ %t0, %for_loop.lr.ph ], [ %t_load78_plus1, %if_exit ] + %bitop = and i32 %t.095, 1 + %equal_bitop_ = icmp eq i32 %bitop, 0 + %tid.i.i = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + %and.i = and i32 %tid.i.i, 31 + %cmp.i = icmp eq i32 %and.i, 0 + br i1 %cmp.i, label %if.then.i, label %ISPCGetParamBuffer.exit + +if.then.i: ; preds = %for_loop + %ptri64tmp.i = tail call i64 @cudaGetParameterBuffer(i64 8, i64 72) + %phitmp.i = inttoptr i64 %ptri64tmp.i to i8* + br label %ISPCGetParamBuffer.exit + +ISPCGetParamBuffer.exit: ; preds = %if.then.i, %for_loop + %ptri64.i = phi i8* [ %phitmp.i, %if.then.i ], [ null, %for_loop ] + %cmp1 = icmp eq i8* %ptri64.i, null + br i1 %equal_bitop_, label %if_then, label %if_else + +for_exit: ; preds = %if_exit, %allocas + %0 = tail call i32 @cudaDeviceSynchronize() + ret void + +if_then: ; preds = %ISPCGetParamBuffer.exit + br i1 %cmp1, label %if_false, label %if_true + +if_else: ; preds = %ISPCGetParamBuffer.exit + br i1 %cmp1, label %if_false62, label %if_true61 + +if_exit: ; preds = %if.then.i92, %if_false62, %if.then.i83, %if_false + %1 = tail call i32 @cudaDeviceSynchronize() + %t_load78_plus1 = add i32 %t.095, 1 + %exitcond = icmp eq i32 %t_load78_plus1, %t1 + br i1 %exitcond, label %for_exit, label %for_loop + +if_true: ; preds = %if_then + %funarg = bitcast i8* %ptri64.i to i32* + store i32 %x0, i32* %funarg, align 4 + %funarg27 = getelementptr i8* %ptri64.i, i64 4 + %2 = bitcast i8* %funarg27 to i32* + store i32 %x1, i32* %2, align 4 + %funarg28 = getelementptr i8* %ptri64.i, i64 8 + %3 = bitcast i8* %funarg28 to i32* + store i32 %y0, i32* %3, align 4 + 
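+ ; Note: this sequence of stores (and the mirrored one in if_true61, which
+ ; swaps Aeven/Aodd on odd timesteps) marshals the kernel arguments into the
+ ; 72-byte buffer from cudaGetParameterBuffer(8, 72): nine i32 fields at
+ ; offsets 0..32, 4 bytes of padding, then four double* at offsets 40/48/56/64.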
%funarg29 = getelementptr i8* %ptri64.i, i64 12 + %4 = bitcast i8* %funarg29 to i32* + store i32 %y1, i32* %4, align 4 + %funarg30 = getelementptr i8* %ptri64.i, i64 16 + %5 = bitcast i8* %funarg30 to i32* + store i32 %z0, i32* %5, align 4 + %funarg31 = getelementptr i8* %ptri64.i, i64 20 + %6 = bitcast i8* %funarg31 to i32* + store i32 %z1, i32* %6, align 4 + %funarg32 = getelementptr i8* %ptri64.i, i64 24 + %7 = bitcast i8* %funarg32 to i32* + store i32 %Nx, i32* %7, align 4 + %funarg33 = getelementptr i8* %ptri64.i, i64 28 + %8 = bitcast i8* %funarg33 to i32* + store i32 %Ny, i32* %8, align 4 + %funarg34 = getelementptr i8* %ptri64.i, i64 32 + %9 = bitcast i8* %funarg34 to i32* + store i32 %Nz, i32* %9, align 4 + %funarg35 = getelementptr i8* %ptri64.i, i64 40 + %10 = bitcast i8* %funarg35 to double** + store double* %coef, double** %10, align 8 + %funarg36 = getelementptr i8* %ptri64.i, i64 48 + %11 = bitcast i8* %funarg36 to double** + store double* %vsq, double** %11, align 8 + %funarg37 = getelementptr i8* %ptri64.i, i64 56 + %12 = bitcast i8* %funarg37 to double** + store double* %Aeven, double** %12, align 8 + %funarg38 = getelementptr i8* %ptri64.i, i64 64 + %13 = bitcast i8* %funarg38 to double** + store double* %Aodd, double** %13, align 8 + br label %if_false + +if_false: ; preds = %if_true, %if_then + %tid.i.i80 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + %and.i81 = and i32 %tid.i.i80, 31 + %cmp.i82 = icmp eq i32 %and.i81, 0 + br i1 %cmp.i82, label %if.then.i83, label %if_exit + +if.then.i83: ; preds = %if_false + %args_i64.i = ptrtoint i8* %ptri64.i to i64 + %res_tmp.i = tail call i32 asm sideeffect "{\0A .param .b64 param0;\0A st.param.b64\09[param0+0], $1;\0A .param .b64 param1;\0A st.param.b64\09[param1+0], $2;\0A .param .align 4 .b8 param2[12];\0A st.param.b32\09[param2+0], $3; \0A st.param.b32\09[param2+4], $4; \0A st.param.b32\09[param2+8], $5; \0A .param .align 4 .b8 param3[12];\0A st.param.b32\09[param3+0], $6; \0A st.param.b32\09[param3+4], $7; \0A st.param.b32\09[param3+8], $8; \0A .param .b32 param4;\0A st.param.b32\09[param4+0], $9; \0A .param .b64 param5;\0A st.param.b64\09[param5+0], $10; \0A\0A .param .b32 retval0;\0A call.uni (retval0), \0A cudaLaunchDevice,\0A (\0A param0, \0A param1, \0A param2, \0A param3, \0A param4, \0A param5\0A );\0A ld.param.b32\09$0, [retval0+0];\0A }\0A ", "=r, l,l, r,r,r, r,r,r, r,l"(i64 ptrtoint (void (i32, i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*)* @stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_ to i64), i64 %args_i64.i, i32 %nbx.i, i32 %div_sub_add_sub_y1_load23_y0_load24___, i32 %div_sub_add_sub_z1_load25_z0_load26___, i32 128, i32 1, i32 1, i32 0, i64 0) + br label %if_exit + +if_true61: ; preds = %if_else + %funarg64 = bitcast i8* %ptri64.i to i32* + store i32 %x0, i32* %funarg64, align 4 + %funarg65 = getelementptr i8* %ptri64.i, i64 4 + %14 = bitcast i8* %funarg65 to i32* + store i32 %x1, i32* %14, align 4 + %funarg66 = getelementptr i8* %ptri64.i, i64 8 + %15 = bitcast i8* %funarg66 to i32* + store i32 %y0, i32* %15, align 4 + %funarg67 = getelementptr i8* %ptri64.i, i64 12 + %16 = bitcast i8* %funarg67 to i32* + store i32 %y1, i32* %16, align 4 + %funarg68 = getelementptr i8* %ptri64.i, i64 16 + %17 = bitcast i8* %funarg68 to i32* + store i32 %z0, i32* %17, align 4 + %funarg69 = getelementptr i8* %ptri64.i, i64 20 + %18 = bitcast i8* %funarg69 to i32* + store i32 %z1, i32* %18, align 4 + %funarg70 = getelementptr i8* 
%ptri64.i, i64 24 + %19 = bitcast i8* %funarg70 to i32* + store i32 %Nx, i32* %19, align 4 + %funarg71 = getelementptr i8* %ptri64.i, i64 28 + %20 = bitcast i8* %funarg71 to i32* + store i32 %Ny, i32* %20, align 4 + %funarg72 = getelementptr i8* %ptri64.i, i64 32 + %21 = bitcast i8* %funarg72 to i32* + store i32 %Nz, i32* %21, align 4 + %funarg73 = getelementptr i8* %ptri64.i, i64 40 + %22 = bitcast i8* %funarg73 to double** + store double* %coef, double** %22, align 8 + %funarg74 = getelementptr i8* %ptri64.i, i64 48 + %23 = bitcast i8* %funarg74 to double** + store double* %vsq, double** %23, align 8 + %funarg75 = getelementptr i8* %ptri64.i, i64 56 + %24 = bitcast i8* %funarg75 to double** + store double* %Aodd, double** %24, align 8 + %funarg76 = getelementptr i8* %ptri64.i, i64 64 + %25 = bitcast i8* %funarg76 to double** + store double* %Aeven, double** %25, align 8 + br label %if_false62 + +if_false62: ; preds = %if_true61, %if_else + %tid.i.i84 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + %and.i85 = and i32 %tid.i.i84, 31 + %cmp.i86 = icmp eq i32 %and.i85, 0 + br i1 %cmp.i86, label %if.then.i92, label %if_exit + +if.then.i92: ; preds = %if_false62 + %args_i64.i90 = ptrtoint i8* %ptri64.i to i64 + %res_tmp.i91 = tail call i32 asm sideeffect "{\0A .param .b64 param0;\0A st.param.b64\09[param0+0], $1;\0A .param .b64 param1;\0A st.param.b64\09[param1+0], $2;\0A .param .align 4 .b8 param2[12];\0A st.param.b32\09[param2+0], $3; \0A st.param.b32\09[param2+4], $4; \0A st.param.b32\09[param2+8], $5; \0A .param .align 4 .b8 param3[12];\0A st.param.b32\09[param3+0], $6; \0A st.param.b32\09[param3+4], $7; \0A st.param.b32\09[param3+8], $8; \0A .param .b32 param4;\0A st.param.b32\09[param4+0], $9; \0A .param .b64 param5;\0A st.param.b64\09[param5+0], $10; \0A\0A .param .b32 retval0;\0A call.uni (retval0), \0A cudaLaunchDevice,\0A (\0A param0, \0A param1, \0A param2, \0A param3, \0A param4, \0A param5\0A );\0A ld.param.b32\09$0, [retval0+0];\0A }\0A ", "=r, l,l, r,r,r, r,r,r, r,l"(i64 ptrtoint (void (i32, i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*)* @stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_ to i64), i64 %args_i64.i90, i32 %nbx.i, i32 %div_sub_add_sub_y1_load23_y0_load24___, i32 %div_sub_add_sub_z1_load25_z0_load26___, i32 128, i32 1, i32 1, i32 0, i64 0) + br label %if_exit +} + +!llvm.ident = !{!0} +!nvvm.annotations = !{!1, !2} + +!0 = metadata !{metadata !"clang version 3.4 (trunk 194723)"} +!1 = metadata !{void (i32, i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*)* @stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_, metadata !"kernel", i32 1} +!2 = metadata !{void (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*)* @loop_stencil_ispc_tasks, metadata !"kernel", i32 1} diff --git a/examples_cuda/stencil/stencil_ispc_nvptx64.ptx b/examples_cuda/stencil/stencil_ispc_nvptx64.ptx new file mode 100644 index 00000000..b0339cbf --- /dev/null +++ b/examples_cuda/stencil/stencil_ispc_nvptx64.ptx @@ -0,0 +1,1246 @@ +// +// Generated by NVIDIA NVVM Compiler +// Compiler built on Thu Jul 18 02:37:37 2013 (1374107857) +// Cuda compilation tools, release 5.5, V5.5.0 +// + +.version 3.2 +.target sm_35 +.address_size 64 + + +.extern .func (.param .b32 func_retval0) cudaLaunchDevice +( + .param .b64 cudaLaunchDevice_param_0, + .param .b64 cudaLaunchDevice_param_1, + .param .align 4 .b8 
cudaLaunchDevice_param_2[12], + .param .align 4 .b8 cudaLaunchDevice_param_3[12], + .param .b32 cudaLaunchDevice_param_4, + .param .b64 cudaLaunchDevice_param_5 +); + + +.extern .func (.param .b64 func_retval0) cudaGetParameterBuffer +( + .param .b64 cudaGetParameterBuffer_param_0, + .param .b64 cudaGetParameterBuffer_param_1 +) +; +.extern .func (.param .b32 func_retval0) cudaDeviceSynchronize +( + +) +; +.global .align 1 .b8 constDeltaForeach1[32]; +.global .align 1 .b8 constDeltaForeach4[32] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}; + +.visible .func (.param .b32 func_retval0) __shfl_i32( + .param .b32 __shfl_i32_param_0, + .param .b32 __shfl_i32_param_1 +) +{ + .reg .s32 %r<4>; + + + ld.param.u32 %r2, [__shfl_i32_param_0]; + ld.param.u32 %r3, [__shfl_i32_param_1]; + // inline asm + shfl.idx.b32 %r1, %r2, %r3, 0x1f; + // inline asm + st.param.b32 [func_retval0+0], %r1; + ret; +} + +.visible .func (.param .b32 func_retval0) __shfl_xor_float( + .param .b32 __shfl_xor_float_param_0, + .param .b32 __shfl_xor_float_param_1 +) +{ + .reg .s32 %r<2>; + .reg .f32 %f<3>; + + + ld.param.f32 %f2, [__shfl_xor_float_param_0]; + ld.param.u32 %r1, [__shfl_xor_float_param_1]; + // inline asm + shfl.bfly.b32 %f1, %f2, %r1, 0x1f; + // inline asm + st.param.f32 [func_retval0+0], %f1; + ret; +} + +.visible .func (.param .b32 func_retval0) __shfl_xor_i32( + .param .b32 __shfl_xor_i32_param_0, + .param .b32 __shfl_xor_i32_param_1 +) +{ + .reg .s32 %r<4>; + + + ld.param.u32 %r2, [__shfl_xor_i32_param_0]; + ld.param.u32 %r3, [__shfl_xor_i32_param_1]; + // inline asm + shfl.bfly.b32 %r1, %r2, %r3, 0x1f; + // inline asm + st.param.b32 [func_retval0+0], %r1; + ret; +} + +.visible .func (.param .b32 func_retval0) __fminf( + .param .b32 __fminf_param_0, + .param .b32 __fminf_param_1 +) +{ + .reg .f32 %f<4>; + + + ld.param.f32 %f2, [__fminf_param_0]; + ld.param.f32 %f3, [__fminf_param_1]; + // inline asm + min.f32 %f1, %f2, %f3; + // inline asm + st.param.f32 [func_retval0+0], %f1; + ret; +} + +.visible .func (.param .b32 func_retval0) __fmaxf( + .param .b32 __fmaxf_param_0, + .param .b32 __fmaxf_param_1 +) +{ + .reg .f32 %f<4>; + + + ld.param.f32 %f2, [__fmaxf_param_0]; + ld.param.f32 %f3, [__fmaxf_param_1]; + // inline asm + max.f32 %f1, %f2, %f3; + // inline asm + st.param.f32 [func_retval0+0], %f1; + ret; +} + +.visible .func (.param .b32 func_retval0) __ballot( + .param .b32 __ballot_param_0 +) +{ + .reg .s32 %r<3>; + + + ld.param.u8 %r2, [__ballot_param_0]; + // inline asm + { .reg .pred %p1; + setp.ne.u32 %p1, %r2, 0; + vote.ballot.b32 %r1, %p1; + } + // inline asm + st.param.b32 [func_retval0+0], %r1; + ret; +} + +.visible .func (.param .b32 func_retval0) __lanemask_lt( + +) +{ + .reg .s32 %r<2>; + + + // inline asm + mov.u32 %r1, %lanemask_lt; + // inline asm + st.param.b32 [func_retval0+0], %r1; + ret; +} + +.visible .func (.param .b64 func_retval0) ISPCAlloc( + .param .b64 ISPCAlloc_param_0, + .param .b64 ISPCAlloc_param_1, + .param .b32 ISPCAlloc_param_2 +) +{ + .reg .s64 %rd<2>; + + + mov.u64 %rd1, 1; + st.param.b64 [func_retval0+0], %rd1; + ret; +} + +.visible .func (.param .b64 func_retval0) ISPCGetParamBuffer( + .param .b64 ISPCGetParamBuffer_param_0, + .param .b64 ISPCGetParamBuffer_param_1, + .param .b64 ISPCGetParamBuffer_param_2 +) +{ + .reg .pred %p<2>; + .reg .s32 %r<3>; + .reg .s64 %rd<7>; + + + ld.param.u64 %rd3, [ISPCGetParamBuffer_param_1]; + ld.param.u64 %rd4, [ISPCGetParamBuffer_param_2]; + mov.u32 %r1, %tid.x; 
+ and.b32 %r2, %r1, 31; + setp.ne.s32 %p1, %r2, 0; + mov.u64 %rd6, 0; + @%p1 bra BB8_2; + + // Callseq Start 0 + { + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd3; + .param .b64 param1; + st.param.b64 [param1+0], %rd4; + .param .b64 retval0; + call.uni (retval0), + cudaGetParameterBuffer, + ( + param0, + param1 + ); + ld.param.b64 %rd6, [retval0+0]; + } + // Callseq End 0 + +BB8_2: + st.param.b64 [func_retval0+0], %rd6; + ret; +} + +.visible .func ISPCLaunch( + .param .b64 ISPCLaunch_param_0, + .param .b64 ISPCLaunch_param_1, + .param .b64 ISPCLaunch_param_2, + .param .b32 ISPCLaunch_param_3, + .param .b32 ISPCLaunch_param_4, + .param .b32 ISPCLaunch_param_5 +) +{ + .reg .pred %p<2>; + .reg .s32 %r<16>; + .reg .s64 %rd<6>; + + + ld.param.u64 %rd1, [ISPCLaunch_param_1]; + ld.param.u64 %rd2, [ISPCLaunch_param_2]; + ld.param.u32 %r1, [ISPCLaunch_param_3]; + ld.param.u32 %r2, [ISPCLaunch_param_4]; + ld.param.u32 %r3, [ISPCLaunch_param_5]; + mov.u32 %r4, %tid.x; + and.b32 %r5, %r4, 31; + setp.ne.s32 %p1, %r5, 0; + @%p1 bra BB9_2; + + add.s32 %r14, %r1, -1; + shr.s32 %r15, %r14, 2; + add.s32 %r7, %r15, 1; + mov.u32 %r12, 1; + mov.u32 %r10, 128; + mov.u32 %r13, 0; + mov.u64 %rd5, 0; + // inline asm + { + .param .b64 param0; + st.param.b64 [param0+0], %rd1; + .param .b64 param1; + st.param.b64 [param1+0], %rd2; + .param .align 4 .b8 param2[12]; + st.param.b32 [param2+0], %r7; + st.param.b32 [param2+4], %r2; + st.param.b32 [param2+8], %r3; + .param .align 4 .b8 param3[12]; + st.param.b32 [param3+0], %r10; + st.param.b32 [param3+4], %r12; + st.param.b32 [param3+8], %r12; + .param .b32 param4; + st.param.b32 [param4+0], %r13; + .param .b64 param5; + st.param.b64 [param5+0], %rd5; + + .param .b32 retval0; + call.uni (retval0), + cudaLaunchDevice, + ( + param0, + param1, + param2, + param3, + param4, + param5 + ); + ld.param.b32 %r6, [retval0+0]; + } + + // inline asm + +BB9_2: + ret; +} + +.visible .func ISPCSync( + .param .b64 ISPCSync_param_0 +) +{ + .reg .s32 %r<2>; + + + // Callseq Start 1 + { + .reg .b32 temp_param_reg; + .param .b32 retval0; + call.uni (retval0), + cudaDeviceSynchronize, + ( + ); + ld.param.b32 %r1, [retval0+0]; + } + // Callseq End 1 + ret; +} + +.visible .func (.param .b64 func_retval0) __warpBinExclusiveScan( + .param .b32 __warpBinExclusiveScan_param_0 +) +{ + .reg .s32 %r<8>; + .reg .s64 %rd<5>; + + + ld.param.u8 %r2, [__warpBinExclusiveScan_param_0]; + // inline asm + { .reg .pred %p1; + setp.ne.u32 %p1, %r2, 0; + vote.ballot.b32 %r1, %p1; + } + // inline asm + // inline asm + popc.b32 %r3, %r1; + // inline asm + // inline asm + mov.u32 %r5, %lanemask_lt; + // inline asm + and.b32 %r7, %r5, %r1; + // inline asm + popc.b32 %r6, %r7; + // inline asm + cvt.u64.u32 %rd1, %r6; + shl.b64 %rd2, %rd1, 32; + cvt.u64.u32 %rd3, %r3; + or.b64 %rd4, %rd2, %rd3; + st.param.b64 [func_retval0+0], %rd4; + ret; +} + +.entry stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_( + .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_0, + .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_1, + .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_2, + .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_3, + .param .u32 
stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_4, + .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_5, + .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_6, + .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_7, + .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_8, + .param .u64 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_9, + .param .u64 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_10, + .param .u64 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_11, + .param .u64 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_12 +) +{ + .reg .pred %p<14>; + .reg .s32 %r<178>; + .reg .s64 %rd<96>; + .reg .f64 %fd<95>; + + + ld.param.u32 %r42, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_0]; + ld.param.u32 %r43, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_1]; + ld.param.u32 %r44, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_2]; + ld.param.u32 %r45, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_3]; + ld.param.u32 %r46, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_4]; + ld.param.u32 %r47, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_5]; + ld.param.u32 %r48, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_6]; + ld.param.u32 %r49, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_7]; + ld.param.u64 %rd2, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_9]; + ld.param.u64 %rd3, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_10]; + ld.param.u64 %rd4, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_11]; + ld.param.u64 %rd5, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_12]; + mov.u32 %r1, %ctaid.x; + shl.b32 %r50, %r1, 2; + mov.u32 %r2, %tid.x; + shr.s32 %r51, %r2, 5; + add.s32 %r52, %r51, %r50; + mov.u32 %r53, %nctaid.x; + shl.b32 %r54, %r53, 2; + setp.ge.s32 %p1, %r52, %r54; + mov.u32 %r55, %nctaid.y; + mov.u32 %r3, %ctaid.y; + setp.ge.s32 %p2, %r3, %r55; + or.pred %p3, %p1, %p2; + mov.u32 %r56, %nctaid.z; + mov.u32 %r4, %ctaid.z; + setp.ge.s32 %p4, %r4, %r56; + or.pred %p5, %p3, %p4; + @%p5 bra BB12_13; + + shl.b32 %r57, %r1, 7; + add.s32 %r58, %r2, %r57; + and.b32 %r59, %r58, -32; + add.s32 %r60, %r59, %r42; + add.s32 %r61, %r60, 32; + min.s32 %r5, %r43, %r61; + shl.b32 %r6, %r3, 3; + add.s32 %r62, %r6, %r44; 
+ add.s32 %r7, %r62, 8; + shl.b32 %r8, %r4, 3; + add.s32 %r172, %r8, %r46; + add.s32 %r63, %r172, 8; + min.s32 %r64, %r47, %r63; + mul.lo.s32 %r10, %r49, %r48; + sub.s32 %r65, %r5, %r60; + shr.s32 %r66, %r65, 31; + shr.u32 %r67, %r66, 27; + add.s32 %r68, %r65, %r67; + and.b32 %r69, %r68, -32; + sub.s32 %r70, %r65, %r69; + sub.s32 %r11, %r5, %r70; + and.b32 %r71, %r2, 31; + cvt.u64.u32 %rd6, %r71; + mov.u64 %rd7, constDeltaForeach1; + add.s64 %rd1, %rd7, %rd6; + setp.ge.s32 %p6, %r172, %r64; + @%p6 bra BB12_13; + + min.s32 %r12, %r45, %r7; + shl.b32 %r15, %r10, 1; + neg.s32 %r16, %r15; + mul.lo.s32 %r17, %r10, 3; + mul.lo.s32 %r18, %r10, -3; + mov.u32 %r72, -9; + sub.s32 %r73, %r72, %r44; + sub.s32 %r74, %r73, %r6; + not.b32 %r75, %r45; + max.s32 %r76, %r74, %r75; + not.b32 %r19, %r76; + sub.s32 %r77, %r72, %r46; + sub.s32 %r78, %r77, %r8; + not.b32 %r79, %r47; + max.s32 %r80, %r78, %r79; + not.b32 %r20, %r80; + ld.global.u8 %r13, [%rd1]; + mov.u32 %r171, %r172; + +BB12_3: + mov.u32 %r21, %r171; + add.s32 %r23, %r21, %r13; + setp.ge.s32 %p7, %r62, %r12; + @%p7 bra BB12_12; + + mul.lo.s32 %r24, %r23, %r10; + mov.u32 %r174, %r62; + mov.u32 %r173, %r62; + +BB12_5: + mov.u32 %r27, %r173; + add.s32 %r30, %r27, %r13; + setp.ge.s32 %p8, %r60, %r11; + mov.u32 %r176, %r60; + @%p8 bra BB12_8; + + mov.u64 %rd9, constDeltaForeach4; + add.s64 %rd10, %rd9, %rd6; + ld.global.u8 %r31, [%rd10]; + mad.lo.s32 %r32, %r30, %r48, %r24; + add.s32 %r177, %r59, %r42; + +BB12_7: + cvta.to.global.u64 %rd11, %rd2; + add.s32 %r98, %r32, %r177; + add.s32 %r99, %r98, %r31; + shl.b32 %r100, %r99, 3; + cvt.s64.s32 %rd12, %r100; + add.s64 %rd13, %rd12, %rd4; + add.s32 %r101, %r100, 8; + cvt.s64.s32 %rd14, %r101; + add.s64 %rd15, %rd14, %rd4; + add.s32 %r102, %r100, -8; + cvt.s64.s32 %rd16, %r102; + add.s64 %rd17, %rd16, %rd4; + add.s32 %r103, %r99, %r48; + shl.b32 %r104, %r103, 3; + cvt.s64.s32 %rd18, %r104; + add.s64 %rd19, %rd18, %rd4; + sub.s32 %r105, %r99, %r48; + shl.b32 %r106, %r105, 3; + cvt.s64.s32 %rd20, %r106; + add.s64 %rd21, %rd20, %rd4; + add.s32 %r108, %r99, %r10; + shl.b32 %r109, %r108, 3; + cvt.s64.s32 %rd22, %r109; + add.s64 %rd23, %rd22, %rd4; + sub.s32 %r110, %r99, %r10; + shl.b32 %r111, %r110, 3; + cvt.s64.s32 %rd24, %r111; + add.s64 %rd25, %rd24, %rd4; + add.s32 %r112, %r100, 16; + cvt.s64.s32 %rd26, %r112; + add.s64 %rd27, %rd26, %rd4; + add.s32 %r113, %r100, -16; + cvt.s64.s32 %rd28, %r113; + add.s64 %rd29, %rd28, %rd4; + shl.b32 %r114, %r48, 1; + add.s32 %r115, %r99, %r114; + shl.b32 %r116, %r115, 3; + cvt.s64.s32 %rd30, %r116; + add.s64 %rd31, %rd30, %rd4; + mad.lo.s32 %r117, %r48, -2, %r99; + shl.b32 %r118, %r117, 3; + cvt.s64.s32 %rd32, %r118; + add.s64 %rd33, %rd32, %rd4; + add.s32 %r119, %r99, %r15; + shl.b32 %r120, %r119, 3; + cvt.s64.s32 %rd34, %r120; + add.s64 %rd35, %rd34, %rd4; + add.s32 %r121, %r99, %r16; + shl.b32 %r122, %r121, 3; + cvt.s64.s32 %rd36, %r122; + add.s64 %rd37, %rd36, %rd4; + add.s32 %r123, %r100, 24; + cvt.s64.s32 %rd38, %r123; + add.s64 %rd39, %rd38, %rd4; + add.s32 %r124, %r100, -24; + cvt.s64.s32 %rd40, %r124; + add.s64 %rd41, %rd40, %rd4; + mad.lo.s32 %r125, %r48, 3, %r99; + shl.b32 %r126, %r125, 3; + cvt.s64.s32 %rd42, %r126; + add.s64 %rd43, %rd42, %rd4; + mad.lo.s32 %r127, %r48, -3, %r99; + shl.b32 %r128, %r127, 3; + cvt.s64.s32 %rd44, %r128; + add.s64 %rd45, %rd44, %rd4; + add.s32 %r129, %r99, %r17; + shl.b32 %r130, %r129, 3; + cvt.s64.s32 %rd46, %r130; + add.s64 %rd47, %rd46, %rd4; + add.s32 %r131, %r99, %r18; + shl.b32 %r132, %r131, 3; + cvt.s64.s32 %rd48, %r132; 
+ add.s64 %rd49, %rd48, %rd4; + add.s64 %rd50, %rd12, %rd5; + add.s64 %rd51, %rd12, %rd3; + ld.f64 %fd1, [%rd13]; + add.f64 %fd2, %fd1, %fd1; + ld.f64 %fd3, [%rd50]; + sub.f64 %fd4, %fd2, %fd3; + ld.global.f64 %fd5, [%rd11]; + ld.f64 %fd6, [%rd17]; + ld.f64 %fd7, [%rd15]; + add.f64 %fd8, %fd7, %fd6; + ld.f64 %fd9, [%rd19]; + add.f64 %fd10, %fd8, %fd9; + ld.f64 %fd11, [%rd21]; + add.f64 %fd12, %fd10, %fd11; + ld.f64 %fd13, [%rd23]; + add.f64 %fd14, %fd12, %fd13; + ld.f64 %fd15, [%rd25]; + add.f64 %fd16, %fd14, %fd15; + ld.global.f64 %fd17, [%rd11+8]; + mul.f64 %fd18, %fd17, %fd16; + fma.rn.f64 %fd19, %fd5, %fd1, %fd18; + ld.f64 %fd20, [%rd29]; + ld.f64 %fd21, [%rd27]; + add.f64 %fd22, %fd21, %fd20; + ld.f64 %fd23, [%rd31]; + add.f64 %fd24, %fd22, %fd23; + ld.f64 %fd25, [%rd33]; + add.f64 %fd26, %fd24, %fd25; + ld.f64 %fd27, [%rd35]; + add.f64 %fd28, %fd26, %fd27; + ld.f64 %fd29, [%rd37]; + add.f64 %fd30, %fd28, %fd29; + ld.global.f64 %fd31, [%rd11+16]; + fma.rn.f64 %fd32, %fd31, %fd30, %fd19; + ld.f64 %fd33, [%rd41]; + ld.f64 %fd34, [%rd39]; + add.f64 %fd35, %fd34, %fd33; + ld.f64 %fd36, [%rd43]; + add.f64 %fd37, %fd35, %fd36; + ld.f64 %fd38, [%rd45]; + add.f64 %fd39, %fd37, %fd38; + ld.f64 %fd40, [%rd47]; + add.f64 %fd41, %fd39, %fd40; + ld.f64 %fd42, [%rd49]; + add.f64 %fd43, %fd41, %fd42; + ld.global.f64 %fd44, [%rd11+24]; + fma.rn.f64 %fd45, %fd44, %fd43, %fd32; + ld.f64 %fd46, [%rd51]; + fma.rn.f64 %fd47, %fd46, %fd45, %fd4; + st.f64 [%rd50], %fd47; + add.s32 %r177, %r177, 32; + setp.lt.s32 %p9, %r177, %r11; + mov.u32 %r175, %r177; + mov.u32 %r176, %r175; + @%p9 bra BB12_7; + +BB12_8: + mov.u32 %r36, %r176; + setp.ge.s32 %p10, %r36, %r5; + @%p10 bra BB12_11; + + mov.u64 %rd53, constDeltaForeach4; + add.s64 %rd54, %rd53, %rd6; + ld.global.u8 %r135, [%rd54]; + add.s32 %r37, %r36, %r135; + setp.ge.s32 %p11, %r37, %r5; + @%p11 bra BB12_11; + + cvta.to.global.u64 %rd55, %rd2; + mad.lo.s32 %r136, %r30, %r48, %r24; + add.s32 %r137, %r136, %r37; + shl.b32 %r138, %r137, 3; + cvt.s64.s32 %rd56, %r138; + add.s64 %rd57, %rd56, %rd4; + add.s32 %r139, %r138, 8; + cvt.s64.s32 %rd58, %r139; + add.s64 %rd59, %rd58, %rd4; + add.s32 %r140, %r138, -8; + cvt.s64.s32 %rd60, %r140; + add.s64 %rd61, %rd60, %rd4; + add.s32 %r141, %r137, %r48; + shl.b32 %r142, %r141, 3; + cvt.s64.s32 %rd62, %r142; + add.s64 %rd63, %rd62, %rd4; + sub.s32 %r143, %r137, %r48; + shl.b32 %r144, %r143, 3; + cvt.s64.s32 %rd64, %r144; + add.s64 %rd65, %rd64, %rd4; + add.s32 %r146, %r137, %r10; + shl.b32 %r147, %r146, 3; + cvt.s64.s32 %rd66, %r147; + add.s64 %rd67, %rd66, %rd4; + sub.s32 %r148, %r137, %r10; + shl.b32 %r149, %r148, 3; + cvt.s64.s32 %rd68, %r149; + add.s64 %rd69, %rd68, %rd4; + add.s32 %r150, %r138, 16; + cvt.s64.s32 %rd70, %r150; + add.s64 %rd71, %rd70, %rd4; + add.s32 %r151, %r138, -16; + cvt.s64.s32 %rd72, %r151; + add.s64 %rd73, %rd72, %rd4; + shl.b32 %r152, %r48, 1; + add.s32 %r153, %r137, %r152; + shl.b32 %r154, %r153, 3; + cvt.s64.s32 %rd74, %r154; + add.s64 %rd75, %rd74, %rd4; + mad.lo.s32 %r155, %r48, -2, %r137; + shl.b32 %r156, %r155, 3; + cvt.s64.s32 %rd76, %r156; + add.s64 %rd77, %rd76, %rd4; + add.s32 %r157, %r137, %r15; + shl.b32 %r158, %r157, 3; + cvt.s64.s32 %rd78, %r158; + add.s64 %rd79, %rd78, %rd4; + add.s32 %r159, %r137, %r16; + shl.b32 %r160, %r159, 3; + cvt.s64.s32 %rd80, %r160; + add.s64 %rd81, %rd80, %rd4; + add.s32 %r161, %r138, 24; + cvt.s64.s32 %rd82, %r161; + add.s64 %rd83, %rd82, %rd4; + add.s32 %r162, %r138, -24; + cvt.s64.s32 %rd84, %r162; + add.s64 %rd85, %rd84, %rd4; + mad.lo.s32 %r163, 
%r48, 3, %r137; + shl.b32 %r164, %r163, 3; + cvt.s64.s32 %rd86, %r164; + add.s64 %rd87, %rd86, %rd4; + mad.lo.s32 %r165, %r48, -3, %r137; + shl.b32 %r166, %r165, 3; + cvt.s64.s32 %rd88, %r166; + add.s64 %rd89, %rd88, %rd4; + add.s32 %r167, %r137, %r17; + shl.b32 %r168, %r167, 3; + cvt.s64.s32 %rd90, %r168; + add.s64 %rd91, %rd90, %rd4; + add.s32 %r169, %r137, %r18; + shl.b32 %r170, %r169, 3; + cvt.s64.s32 %rd92, %r170; + add.s64 %rd93, %rd92, %rd4; + add.s64 %rd94, %rd56, %rd5; + add.s64 %rd95, %rd56, %rd3; + ld.f64 %fd48, [%rd57]; + add.f64 %fd49, %fd48, %fd48; + ld.f64 %fd50, [%rd94]; + sub.f64 %fd51, %fd49, %fd50; + ld.global.f64 %fd52, [%rd55]; + ld.f64 %fd53, [%rd61]; + ld.f64 %fd54, [%rd59]; + add.f64 %fd55, %fd54, %fd53; + ld.f64 %fd56, [%rd63]; + add.f64 %fd57, %fd55, %fd56; + ld.f64 %fd58, [%rd65]; + add.f64 %fd59, %fd57, %fd58; + ld.f64 %fd60, [%rd67]; + add.f64 %fd61, %fd59, %fd60; + ld.f64 %fd62, [%rd69]; + add.f64 %fd63, %fd61, %fd62; + ld.global.f64 %fd64, [%rd55+8]; + mul.f64 %fd65, %fd64, %fd63; + fma.rn.f64 %fd66, %fd52, %fd48, %fd65; + ld.f64 %fd67, [%rd73]; + ld.f64 %fd68, [%rd71]; + add.f64 %fd69, %fd68, %fd67; + ld.f64 %fd70, [%rd75]; + add.f64 %fd71, %fd69, %fd70; + ld.f64 %fd72, [%rd77]; + add.f64 %fd73, %fd71, %fd72; + ld.f64 %fd74, [%rd79]; + add.f64 %fd75, %fd73, %fd74; + ld.f64 %fd76, [%rd81]; + add.f64 %fd77, %fd75, %fd76; + ld.global.f64 %fd78, [%rd55+16]; + fma.rn.f64 %fd79, %fd78, %fd77, %fd66; + ld.f64 %fd80, [%rd85]; + ld.f64 %fd81, [%rd83]; + add.f64 %fd82, %fd81, %fd80; + ld.f64 %fd83, [%rd87]; + add.f64 %fd84, %fd82, %fd83; + ld.f64 %fd85, [%rd89]; + add.f64 %fd86, %fd84, %fd85; + ld.f64 %fd87, [%rd91]; + add.f64 %fd88, %fd86, %fd87; + ld.f64 %fd89, [%rd93]; + add.f64 %fd90, %fd88, %fd89; + ld.global.f64 %fd91, [%rd55+24]; + fma.rn.f64 %fd92, %fd91, %fd90, %fd79; + ld.f64 %fd93, [%rd95]; + fma.rn.f64 %fd94, %fd92, %fd93, %fd51; + st.f64 [%rd94], %fd94; + +BB12_11: + add.s32 %r39, %r174, 1; + setp.ne.s32 %p12, %r39, %r19; + mov.u32 %r174, %r39; + mov.u32 %r173, %r39; + @%p12 bra BB12_5; + +BB12_12: + add.s32 %r171, %r172, 1; + setp.ne.s32 %p13, %r171, %r20; + mov.u32 %r172, %r171; + @%p13 bra BB12_3; + +BB12_13: + ret; +} + +.visible .func loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E_( + .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_0, + .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_1, + .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_2, + .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_3, + .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_4, + .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_5, + .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_6, + .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_7, + .param .b32 
loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_8, + .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_9, + .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_10, + .param .b64 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_11, + .param .b64 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_12, + .param .b64 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_13, + .param .b64 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_14, + .param .align 1 .b8 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_15[1] +) +{ + .reg .pred %p<9>; + .reg .s32 %r<63>; + .reg .s64 %rd<18>; + + + ld.param.u32 %r62, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_0]; + ld.param.u32 %r12, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_1]; + ld.param.u32 %r13, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_2]; + ld.param.u32 %r14, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_3]; + ld.param.u32 %r15, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_4]; + ld.param.u32 %r16, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_5]; + ld.param.u32 %r17, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_6]; + ld.param.u32 %r18, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_7]; + ld.param.u32 %r19, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_8]; + ld.param.u32 %r20, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_9]; + ld.param.u32 %r21, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_10]; + ld.param.u64 %rd4, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_11]; + ld.param.u64 %rd5, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_12]; + ld.param.u64 %rd6, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_13]; + ld.param.u64 %rd7, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_14]; + setp.ge.s32 %p1, %r62, %r12; + @%p1 bra BB13_14; + + mov.u32 %r22, 31; + sub.s32 %r23, %r22, %r13; + add.s32 %r24, %r23, %r14; + shr.s32 %r25, %r24, 31; + shr.u32 %r26, %r25, 27; + add.s32 
%r27, %r24, %r26; + shr.s32 %r28, %r27, 5; + mov.u32 %r29, 7; + sub.s32 %r30, %r29, %r15; + add.s32 %r31, %r30, %r16; + shr.s32 %r32, %r31, 31; + shr.u32 %r33, %r32, 29; + add.s32 %r34, %r31, %r33; + shr.s32 %r1, %r34, 3; + sub.s32 %r35, %r29, %r17; + add.s32 %r36, %r35, %r18; + shr.s32 %r37, %r36, 31; + shr.u32 %r38, %r37, 29; + add.s32 %r39, %r36, %r38; + shr.s32 %r2, %r39, 3; + add.s32 %r40, %r28, -1; + shr.s32 %r41, %r40, 2; + add.s32 %r3, %r41, 1; + mov.u32 %r42, %tid.x; + and.b32 %r4, %r42, 31; + sub.s32 %r61, %r62, %r12; + +BB13_2: + and.b32 %r8, %r62, 1; + setp.ne.s32 %p2, %r4, 0; + mov.u64 %rd17, 0; + @%p2 bra BB13_4; + + mov.u64 %rd9, 8; + mov.u64 %rd10, 72; + // Callseq Start 2 + { + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd9; + .param .b64 param1; + st.param.b64 [param1+0], %rd10; + .param .b64 retval0; + call.uni (retval0), + cudaGetParameterBuffer, + ( + param0, + param1 + ); + ld.param.b64 %rd17, [retval0+0]; + } + // Callseq End 2 + +BB13_4: + setp.eq.s32 %p3, %r8, 0; + @%p3 bra BB13_9; + + setp.eq.s64 %p4, %rd17, 0; + @%p4 bra BB13_7; + + st.u32 [%rd17], %r13; + st.u32 [%rd17+4], %r14; + st.u32 [%rd17+8], %r15; + st.u32 [%rd17+12], %r16; + st.u32 [%rd17+16], %r17; + st.u32 [%rd17+20], %r18; + st.u32 [%rd17+24], %r19; + st.u32 [%rd17+28], %r20; + st.u32 [%rd17+32], %r21; + st.u64 [%rd17+40], %rd4; + st.u64 [%rd17+48], %rd5; + st.u64 [%rd17+56], %rd7; + st.u64 [%rd17+64], %rd6; + +BB13_7: + @%p2 bra BB13_13; + + mov.u32 %r47, 128; + mov.u32 %r49, 1; + mov.u32 %r50, 0; + mov.u64 %rd13, 0; + mov.u64 %rd11, stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_; + // inline asm + { + .param .b64 param0; + st.param.b64 [param0+0], %rd11; + .param .b64 param1; + st.param.b64 [param1+0], %rd17; + .param .align 4 .b8 param2[12]; + st.param.b32 [param2+0], %r3; + st.param.b32 [param2+4], %r1; + st.param.b32 [param2+8], %r2; + .param .align 4 .b8 param3[12]; + st.param.b32 [param3+0], %r47; + st.param.b32 [param3+4], %r49; + st.param.b32 [param3+8], %r49; + .param .b32 param4; + st.param.b32 [param4+0], %r50; + .param .b64 param5; + st.param.b64 [param5+0], %rd13; + + .param .b32 retval0; + call.uni (retval0), + cudaLaunchDevice, + ( + param0, + param1, + param2, + param3, + param4, + param5 + ); + ld.param.b32 %r43, [retval0+0]; + } + + // inline asm + bra.uni BB13_13; + +BB13_9: + setp.eq.s64 %p6, %rd17, 0; + @%p6 bra BB13_11; + + st.u32 [%rd17], %r13; + st.u32 [%rd17+4], %r14; + st.u32 [%rd17+8], %r15; + st.u32 [%rd17+12], %r16; + st.u32 [%rd17+16], %r17; + st.u32 [%rd17+20], %r18; + st.u32 [%rd17+24], %r19; + st.u32 [%rd17+28], %r20; + st.u32 [%rd17+32], %r21; + st.u64 [%rd17+40], %rd4; + st.u64 [%rd17+48], %rd5; + st.u64 [%rd17+56], %rd6; + st.u64 [%rd17+64], %rd7; + +BB13_11: + @%p2 bra BB13_13; + + mov.u32 %r55, 128; + mov.u32 %r57, 1; + mov.u32 %r58, 0; + mov.u64 %rd16, 0; + mov.u64 %rd14, stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_; + // inline asm + { + .param .b64 param0; + st.param.b64 [param0+0], %rd14; + .param .b64 param1; + st.param.b64 [param1+0], %rd17; + .param .align 4 .b8 param2[12]; + st.param.b32 [param2+0], %r3; + st.param.b32 [param2+4], %r1; + st.param.b32 [param2+8], %r2; + .param .align 4 .b8 param3[12]; + st.param.b32 [param3+0], %r55; + st.param.b32 [param3+4], %r57; + st.param.b32 [param3+8], %r57; + .param .b32 param4; + st.param.b32 [param4+0], %r58; + .param .b64 param5; + st.param.b64 [param5+0], 
%rd16; + + .param .b32 retval0; + call.uni (retval0), + cudaLaunchDevice, + ( + param0, + param1, + param2, + param3, + param4, + param5 + ); + ld.param.b32 %r51, [retval0+0]; + } + + // inline asm + +BB13_13: + // Callseq Start 3 + { + .reg .b32 temp_param_reg; + .param .b32 retval0; + call.uni (retval0), + cudaDeviceSynchronize, + ( + ); + ld.param.b32 %r59, [retval0+0]; + } + // Callseq End 3 + add.s32 %r62, %r62, 1; + add.s32 %r61, %r61, 1; + setp.ne.s32 %p8, %r61, 0; + @%p8 bra BB13_2; + +BB13_14: + // Callseq Start 4 + { + .reg .b32 temp_param_reg; + .param .b32 retval0; + call.uni (retval0), + cudaDeviceSynchronize, + ( + ); + ld.param.b32 %r60, [retval0+0]; + } + // Callseq End 4 + ret; +} + +.visible .entry loop_stencil_ispc_tasks( + .param .u32 loop_stencil_ispc_tasks_param_0, + .param .u32 loop_stencil_ispc_tasks_param_1, + .param .u32 loop_stencil_ispc_tasks_param_2, + .param .u32 loop_stencil_ispc_tasks_param_3, + .param .u32 loop_stencil_ispc_tasks_param_4, + .param .u32 loop_stencil_ispc_tasks_param_5, + .param .u32 loop_stencil_ispc_tasks_param_6, + .param .u32 loop_stencil_ispc_tasks_param_7, + .param .u32 loop_stencil_ispc_tasks_param_8, + .param .u32 loop_stencil_ispc_tasks_param_9, + .param .u32 loop_stencil_ispc_tasks_param_10, + .param .u64 loop_stencil_ispc_tasks_param_11, + .param .u64 loop_stencil_ispc_tasks_param_12, + .param .u64 loop_stencil_ispc_tasks_param_13, + .param .u64 loop_stencil_ispc_tasks_param_14 +) +{ + .reg .pred %p<9>; + .reg .s32 %r<63>; + .reg .s64 %rd<18>; + + + ld.param.u32 %r62, [loop_stencil_ispc_tasks_param_0]; + ld.param.u32 %r12, [loop_stencil_ispc_tasks_param_1]; + ld.param.u32 %r13, [loop_stencil_ispc_tasks_param_2]; + ld.param.u32 %r14, [loop_stencil_ispc_tasks_param_3]; + ld.param.u32 %r15, [loop_stencil_ispc_tasks_param_4]; + ld.param.u32 %r16, [loop_stencil_ispc_tasks_param_5]; + ld.param.u32 %r17, [loop_stencil_ispc_tasks_param_6]; + ld.param.u32 %r18, [loop_stencil_ispc_tasks_param_7]; + ld.param.u32 %r19, [loop_stencil_ispc_tasks_param_8]; + ld.param.u32 %r20, [loop_stencil_ispc_tasks_param_9]; + ld.param.u32 %r21, [loop_stencil_ispc_tasks_param_10]; + ld.param.u64 %rd4, [loop_stencil_ispc_tasks_param_11]; + ld.param.u64 %rd5, [loop_stencil_ispc_tasks_param_12]; + ld.param.u64 %rd6, [loop_stencil_ispc_tasks_param_13]; + ld.param.u64 %rd7, [loop_stencil_ispc_tasks_param_14]; + setp.ge.s32 %p1, %r62, %r12; + @%p1 bra BB14_14; + + mov.u32 %r22, 31; + sub.s32 %r23, %r22, %r13; + add.s32 %r24, %r23, %r14; + shr.s32 %r25, %r24, 31; + shr.u32 %r26, %r25, 27; + add.s32 %r27, %r24, %r26; + shr.s32 %r28, %r27, 5; + mov.u32 %r29, 7; + sub.s32 %r30, %r29, %r15; + add.s32 %r31, %r30, %r16; + shr.s32 %r32, %r31, 31; + shr.u32 %r33, %r32, 29; + add.s32 %r34, %r31, %r33; + shr.s32 %r1, %r34, 3; + sub.s32 %r35, %r29, %r17; + add.s32 %r36, %r35, %r18; + shr.s32 %r37, %r36, 31; + shr.u32 %r38, %r37, 29; + add.s32 %r39, %r36, %r38; + shr.s32 %r2, %r39, 3; + add.s32 %r40, %r28, -1; + shr.s32 %r41, %r40, 2; + add.s32 %r3, %r41, 1; + mov.u32 %r42, %tid.x; + and.b32 %r4, %r42, 31; + sub.s32 %r61, %r62, %r12; + +BB14_2: + and.b32 %r8, %r62, 1; + setp.ne.s32 %p2, %r4, 0; + mov.u64 %rd17, 0; + @%p2 bra BB14_4; + + mov.u64 %rd9, 8; + mov.u64 %rd10, 72; + // Callseq Start 5 + { + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd9; + .param .b64 param1; + st.param.b64 [param1+0], %rd10; + .param .b64 retval0; + call.uni (retval0), + cudaGetParameterBuffer, + ( + param0, + param1 + ); + ld.param.b64 %rd17, [retval0+0]; + } + // 
Callseq End 5 + +BB14_4: + setp.eq.s32 %p3, %r8, 0; + @%p3 bra BB14_9; + + setp.eq.s64 %p4, %rd17, 0; + @%p4 bra BB14_7; + + st.u32 [%rd17], %r13; + st.u32 [%rd17+4], %r14; + st.u32 [%rd17+8], %r15; + st.u32 [%rd17+12], %r16; + st.u32 [%rd17+16], %r17; + st.u32 [%rd17+20], %r18; + st.u32 [%rd17+24], %r19; + st.u32 [%rd17+28], %r20; + st.u32 [%rd17+32], %r21; + st.u64 [%rd17+40], %rd4; + st.u64 [%rd17+48], %rd5; + st.u64 [%rd17+56], %rd7; + st.u64 [%rd17+64], %rd6; + +BB14_7: + @%p2 bra BB14_13; + + mov.u32 %r47, 128; + mov.u32 %r49, 1; + mov.u32 %r50, 0; + mov.u64 %rd13, 0; + mov.u64 %rd11, stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_; + // inline asm + { + .param .b64 param0; + st.param.b64 [param0+0], %rd11; + .param .b64 param1; + st.param.b64 [param1+0], %rd17; + .param .align 4 .b8 param2[12]; + st.param.b32 [param2+0], %r3; + st.param.b32 [param2+4], %r1; + st.param.b32 [param2+8], %r2; + .param .align 4 .b8 param3[12]; + st.param.b32 [param3+0], %r47; + st.param.b32 [param3+4], %r49; + st.param.b32 [param3+8], %r49; + .param .b32 param4; + st.param.b32 [param4+0], %r50; + .param .b64 param5; + st.param.b64 [param5+0], %rd13; + + .param .b32 retval0; + call.uni (retval0), + cudaLaunchDevice, + ( + param0, + param1, + param2, + param3, + param4, + param5 + ); + ld.param.b32 %r43, [retval0+0]; + } + + // inline asm + bra.uni BB14_13; + +BB14_9: + setp.eq.s64 %p6, %rd17, 0; + @%p6 bra BB14_11; + + st.u32 [%rd17], %r13; + st.u32 [%rd17+4], %r14; + st.u32 [%rd17+8], %r15; + st.u32 [%rd17+12], %r16; + st.u32 [%rd17+16], %r17; + st.u32 [%rd17+20], %r18; + st.u32 [%rd17+24], %r19; + st.u32 [%rd17+28], %r20; + st.u32 [%rd17+32], %r21; + st.u64 [%rd17+40], %rd4; + st.u64 [%rd17+48], %rd5; + st.u64 [%rd17+56], %rd6; + st.u64 [%rd17+64], %rd7; + +BB14_11: + @%p2 bra BB14_13; + + mov.u32 %r55, 128; + mov.u32 %r57, 1; + mov.u32 %r58, 0; + mov.u64 %rd16, 0; + mov.u64 %rd14, stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_; + // inline asm + { + .param .b64 param0; + st.param.b64 [param0+0], %rd14; + .param .b64 param1; + st.param.b64 [param1+0], %rd17; + .param .align 4 .b8 param2[12]; + st.param.b32 [param2+0], %r3; + st.param.b32 [param2+4], %r1; + st.param.b32 [param2+8], %r2; + .param .align 4 .b8 param3[12]; + st.param.b32 [param3+0], %r55; + st.param.b32 [param3+4], %r57; + st.param.b32 [param3+8], %r57; + .param .b32 param4; + st.param.b32 [param4+0], %r58; + .param .b64 param5; + st.param.b64 [param5+0], %rd16; + + .param .b32 retval0; + call.uni (retval0), + cudaLaunchDevice, + ( + param0, + param1, + param2, + param3, + param4, + param5 + ); + ld.param.b32 %r51, [retval0+0]; + } + + // inline asm + +BB14_13: + // Callseq Start 6 + { + .reg .b32 temp_param_reg; + .param .b32 retval0; + call.uni (retval0), + cudaDeviceSynchronize, + ( + ); + ld.param.b32 %r59, [retval0+0]; + } + // Callseq End 6 + add.s32 %r62, %r62, 1; + add.s32 %r61, %r61, 1; + setp.ne.s32 %p8, %r61, 0; + @%p8 bra BB14_2; + +BB14_14: + // Callseq Start 7 + { + .reg .b32 temp_param_reg; + .param .b32 retval0; + call.uni (retval0), + cudaDeviceSynchronize, + ( + ); + ld.param.b32 %r60, [retval0+0]; + } + // Callseq End 7 + ret; +} + + + diff --git a/examples_cuda/stencil/stencil_nvptx64.bc b/examples_cuda/stencil/stencil_nvptx64.bc deleted file mode 100644 index b77be1e3..00000000 Binary files a/examples_cuda/stencil/stencil_nvptx64.bc and /dev/null differ diff --git a/examples_cuda/stencil/stencil_orig.cpp 
b/examples_cuda/stencil/stencil_orig.cpp new file mode 100644 index 00000000..015f2b80 --- /dev/null +++ b/examples_cuda/stencil/stencil_orig.cpp @@ -0,0 +1,172 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifdef _MSC_VER +#define _CRT_SECURE_NO_WARNINGS +#define NOMINMAX +#pragma warning (disable: 4244) +#pragma warning (disable: 4305) +#endif + +#include <stdio.h> +#include <math.h> +#include <algorithm> +#include "../timing.h" +#include "stencil_ispc.h" +using namespace ispc; + +#include <sys/time.h> + + +double rtc(void) +{ + struct timeval Tvalue; + double etime; + struct timezone dummy; + + gettimeofday(&Tvalue,&dummy); + etime = (double) Tvalue.tv_sec + + 1.e-6*((double) Tvalue.tv_usec); + return etime; +} + + +extern void loop_stencil_serial(int t0, int t1, int x0, int x1, + int y0, int y1, int z0, int z1, + int Nx, int Ny, int Nz, + const double coef[5], + const double vsq[], + double Aeven[], double Aodd[]); + + +void InitData(int Nx, int Ny, int Nz, double *A[2], double *vsq) { + int offset = 0; + for (int z = 0; z < Nz; ++z) + for (int y = 0; y < Ny; ++y) + for (int x = 0; x < Nx; ++x, ++offset) { + A[0][offset] = (x < Nx / 2) ? x / double(Nx) : y / double(Ny); + A[1][offset] = 0; + vsq[offset] = x*y*z / double(Nx * Ny * Nz); + } +} + + +int main() { + int Nx = 256, Ny = 256, Nz = 256; + int width = 4; + double *Aserial[2], *Aispc[2]; + Aserial[0] = new double [Nx * Ny * Nz]; + Aserial[1] = new double [Nx * Ny * Nz]; + Aispc[0] = new double [Nx * Ny * Nz]; + Aispc[1] = new double [Nx * Ny * Nz]; + double *vsq = new double [Nx * Ny * Nz]; + + double coeff[4] = { 0.5, -.25, .125, -.0625 }; + +// InitData(Nx, Ny, Nz, Aispc, vsq); + + // + // Run the stencil computation using the ispc implementation on one core; + // report the minimum time of three runs.
+ // + double minTimeISPC = 1e30; +#if 0 + for (int i = 0; i < 3; ++i) { + reset_and_start_timer(); + loop_stencil_ispc(0, 6, width, Nx - width, width, Ny - width, + width, Nz - width, Nx, Ny, Nz, coeff, vsq, + Aispc[0], Aispc[1]); + double dt = get_elapsed_mcycles(); + minTimeISPC = std::min(minTimeISPC, dt); + } + + printf("[stencil ispc 1 core]:\t\t[%.3f] million cycles\n", minTimeISPC); +#endif + + fprintf(stderr, " -- init -- \n"); + InitData(Nx, Ny, Nz, Aispc, vsq); + fprintf(stderr, " -- done init -- \n"); + + // + // Run the stencil computation using the ispc implementation with tasks; + // report the minimum time of three runs. + // + double minTimeISPCTasks = 1e30; + for (int i = 0; i < 3; ++i) { + reset_and_start_timer(); + const double t0 = rtc(); + loop_stencil_ispc_tasks(0, 6, width, Nx - width, width, Ny - width, + width, Nz - width, Nx, Ny, Nz, coeff, vsq, + Aispc[0], Aispc[1]); + double dt = 1e3*(rtc() - t0); //get_elapsed_mcycles(); + minTimeISPCTasks = std::min(minTimeISPCTasks, dt); + } + + fprintf(stderr, "[stencil ispc + tasks]:\t\t[%.3f] msec\n", minTimeISPCTasks); + + + InitData(Nx, Ny, Nz, Aserial, vsq); + + // + // And run the serial implementation 3 times, again reporting the + // minimum time. + // + double minTimeSerial = 1e30; + for (int i = 0; i < 3; ++i) { + reset_and_start_timer(); + loop_stencil_serial(0, 6, width, Nx-width, width, Ny - width, + width, Nz - width, Nx, Ny, Nz, coeff, vsq, + Aserial[0], Aserial[1]); + double dt = get_elapsed_mcycles(); + minTimeSerial = std::min(minTimeSerial, dt); + } + + printf("[stencil serial]:\t\t[%.3f] million cycles\n", minTimeSerial); + + printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n", + minTimeSerial / minTimeISPC, minTimeSerial / minTimeISPCTasks); + + // Check for agreement + int offset = 0; + for (int z = 0; z < Nz; ++z) + for (int y = 0; y < Ny; ++y) + for (int x = 0; x < Nx; ++x, ++offset) { + double error = fabs((Aserial[1][offset] - Aispc[1][offset]) / + Aserial[1][offset]); + if (error > 1e-4) + printf("Error @ (%d,%d,%d): ispc = %f, serial = %f\n", + x, y, z, Aispc[1][offset], Aserial[1][offset]); + } + + return 0; +} diff --git a/examples_cuda/stencil/stencil_orig.ispc b/examples_cuda/stencil/stencil_orig.ispc new file mode 100644 index 00000000..d2e095b3 --- /dev/null +++ b/examples_cuda/stencil/stencil_orig.ispc @@ -0,0 +1,172 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifdef __NVPTX__ +#warning "emitting DEVICE code" +#define taskIndex blockIndex0() +#define taskCount blockCount0() +#define programIndex laneIndex() +#define programCount warpSize() +#else +#warning "emitting HOST code" +#endif + +static inline void +stencil_step(uniform int x0, uniform int x1, + uniform int y0, uniform int y1, + uniform int z0, uniform int z1, + uniform int Nx, uniform int Ny, uniform int Nz, + uniform const double coef[4], uniform const double vsq[], + uniform const double Ain[], uniform double Aout[]) { + const uniform int Nxy = Nx * Ny; + +// foreach (z = z0 ... z1, y = y0 ... y1, x = x0 ... x1) +#if 0 +#define VER1 +#endif + +#ifdef VER1 + const uniform long x1o = 1; + const uniform long x2o = 2; + const uniform long x3o = 3; + const uniform long y1o = Nx; + const uniform long y2o = Nx*2; + const uniform long y3o = Nx*3; + const uniform long z1o = Nxy; + const uniform long z2o = Nxy*2; + const uniform long z3o = Nxy*3; +#endif + for (uniform int z = z0; z < z1; z++) + for (uniform int y = y0; y < y1; y++) + { + const int index_base = (z * Nxy) + (y * Nx); + for (uniform int xb = x0; xb < x1; xb += programCount) + { + const int x = xb + programIndex; + int index = index_base + x; +#ifndef VER1 +#define A_cur(x, y, z) Ain[index + (x) + ((y) * Nx) + ((z) * Nxy)] +#define A_next(x, y, z) Aout[index + (x) + ((y) * Nx) + ((z) * Nxy)] + double div = coef[0] * A_cur(0, 0, 0) + + coef[1] * (A_cur(+1, 0, 0) + A_cur(-1, 0, 0) + + A_cur(0, +1, 0) + A_cur(0, -1, 0) + + A_cur(0, 0, +1) + A_cur(0, 0, -1)) + + coef[2] * (A_cur(+2, 0, 0) + A_cur(-2, 0, 0) + + A_cur(0, +2, 0) + A_cur(0, -2, 0) + + A_cur(0, 0, +2) + A_cur(0, 0, -2)) + + coef[3] * (A_cur(+3, 0, 0) + A_cur(-3, 0, 0) + + A_cur(0, +3, 0) + A_cur(0, -3, 0) + + A_cur(0, 0, +3) + A_cur(0, 0, -3)); +#else +#define A_cur(x, y, z) Ain [index + (x) + (y) + (z)] +#define A_next(x, y, z) Aout[index + (x) + (y) + (z)] + double div = coef[0] * A_cur(0, 0, 0) + + coef[1] * (A_cur(+x1o, 0, 0) + A_cur(-x1o, 0, 0) + + A_cur(0, +y1o, 0) + A_cur(0, -y1o, 0) + + A_cur(0, 0, +z1o) + A_cur(0, 0, -z1o)) + + coef[2] * (A_cur(+x2o, 0, 0) + A_cur(-x2o, 0, 0) + + A_cur(0, +y2o, 0) + A_cur(0, -y2o, 0) + + A_cur(0, 0, +z2o) + A_cur(0, 0, -z2o)) + + coef[3] * (A_cur(+x3o, 0, 0) + A_cur(-x3o, 0, 0) + + A_cur(0, +y3o, 0) + A_cur(0, -y3o, 0) + + A_cur(0, 0, +z3o) + A_cur(0, 0, -z3o)); +#endif + + if (x < x1) + A_next(0, 0, 0) = 2.0d0 * A_cur(0, 0, 0) - A_next(0, 0, 0) + + vsq[index] * div; + } + } +} + + +static task void +stencil_step_task(uniform int x0, uniform int x1, + uniform int y0, uniform int y1, + uniform int z0, + uniform int Nx, uniform int Ny, uniform int Nz, + uniform const double coef[4], uniform const double vsq[], + uniform const double Ain[], uniform double Aout[]) { + if(taskIndex >= taskCount) return; + + stencil_step(x0, x1, y0, y1, z0+taskIndex, z0+taskIndex+1, + Nx, Ny, Nz, coef, vsq, Ain, Aout); +} + + +export void +loop_stencil_ispc_tasks(uniform int t0, uniform int t1, + uniform int x0, uniform int x1, 
+ uniform int y0, uniform int y1, + uniform int z0, uniform int z1, + uniform int Nx, uniform int Ny, uniform int Nz, + uniform const double coef[4], + uniform const double vsq[], + uniform double Aeven[], uniform double Aodd[]) +{ + for (uniform int t = t0; t < t1; ++t) { + // Parallelize across cores as well: each task will work on a slice + // of 1 in the z extent of the volume. + if ((t & 1) == 0) + launch[z1-z0] stencil_step_task(x0, x1, y0, y1, z0, Nx, Ny, Nz, + coef, vsq, Aeven, Aodd); + else + launch[z1-z0] stencil_step_task(x0, x1, y0, y1, z0, Nx, Ny, Nz, + coef, vsq, Aodd, Aeven); + + // We need to wait for all of the launched tasks to finish before + // starting the next iteration. + sync; + } +} + + +export void +loop_stencil_ispc(uniform int t0, uniform int t1, + uniform int x0, uniform int x1, + uniform int y0, uniform int y1, + uniform int z0, uniform int z1, + uniform int Nx, uniform int Ny, uniform int Nz, + uniform const double coef[4], + uniform const double vsq[], + uniform double Aeven[], uniform double Aodd[]) +{ + for (uniform int t = t0; t < t1; ++t) { + if ((t & 1) == 0) + stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq, + Aeven, Aodd); + else + stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq, + Aodd, Aeven); + } +} diff --git a/examples_cuda/stencil/stencil_parallel.cpp b/examples_cuda/stencil/stencil_parallel.cpp index 30ded2cd..d4e59dc8 100644 --- a/examples_cuda/stencil/stencil_parallel.cpp +++ b/examples_cuda/stencil/stencil_parallel.cpp @@ -37,8 +37,8 @@ stencil_step(int x0, int x1, int y0, int y1, int z0, int z1, int Nx, int Ny, int Nz, - const float coef[4], const float vsq[], - const float Ain[], float Aout[]) { + const double coef[4], const double vsq[], + const double Ain[], double Aout[]) { int Nxy = Nx * Ny; #pragma omp parallel for @@ -48,7 +48,7 @@ stencil_step(int x0, int x1, int index = (z * Nxy) + (y * Nx) + x; #define A_cur(x, y, z) Ain[index + (x) + ((y) * Nx) + ((z) * Nxy)] #define A_next(x, y, z) Aout[index + (x) + ((y) * Nx) + ((z) * Nxy)] - float div = coef[0] * A_cur(0, 0, 0) + + double div = coef[0] * A_cur(0, 0, 0) + coef[1] * (A_cur(+1, 0, 0) + A_cur(-1, 0, 0) + A_cur(0, +1, 0) + A_cur(0, -1, 0) + A_cur(0, 0, +1) + A_cur(0, 0, -1)) + @@ -72,9 +72,9 @@ void loop_stencil_parallel(int t0, int t1, int y0, int y1, int z0, int z1, int Nx, int Ny, int Nz, - const float coef[4], - const float vsq[], - float Aeven[], float Aodd[]) + const double coef[4], + const double vsq[], + double Aeven[], double Aodd[]) { for (int t = t0; t < t1; ++t) { if ((t & 1) == 0) diff --git a/examples_cuda/stencil/stencil_serial.o b/examples_cuda/stencil/stencil_serial.o new file mode 100644 index 00000000..1fd32c29 Binary files /dev/null and b/examples_cuda/stencil/stencil_serial.o differ
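The device-side tasking that ISPCGetParamBuffer, ISPCLaunch, and both loop_stencil_ispc_tasks variants above implement is CUDA dynamic parallelism: lane 0 of each warp requests a 72-byte parameter buffer, fills in nine 32-bit ints and four 64-bit pointers at fixed offsets (the stores at [%rd17+0] through [%rd17+64] in the PTX), and hands the buffer to cudaLaunchDevice with a (nbx, nby, nbz) grid of 128-thread blocks; each ISPC sync then becomes a cudaDeviceSynchronize() call. The following is a minimal CUDA C++ sketch of that launch sequence, assuming sm_35 or newer, compilation with -rdc=true, and linking against cudadevrt. TaskArgs and launch_stencil_task are illustrative names, not part of this patch, and the early return on a null parameter buffer is an added safety check (the generated code only skips the argument stores).

#include <cuda_runtime.h>  // declares cudaGetParameterBuffer/cudaLaunchDevice
                           // for device code when compiling with -rdc=true

// Argument block matching the stores at offsets 0..64 above: nine 32-bit
// ints, 4 bytes of padding, then four 8-byte pointers. sizeof(TaskArgs)
// is 72, matching the cudaGetParameterBuffer(8, 72) calls in the PTX.
struct TaskArgs {
    int x0, x1, y0, y1, z0, z1, Nx, Ny, Nz;
    double *coef, *vsq, *Ain, *Aout;
};

__device__ void launch_stencil_task(void *kernel, const TaskArgs &args,
                                    int nbx, int nby, int nbz)
{
    // Only lane 0 of the warp performs the launch -- the (tid.x & 31) == 0
    // guard that precedes every cudaGetParameterBuffer call above.
    if ((threadIdx.x & 31) != 0)
        return;
    TaskArgs *buf = (TaskArgs *)cudaGetParameterBuffer(8, sizeof(TaskArgs));
    if (buf == NULL)        // safety check added in this sketch
        return;
    *buf = args;
    // (nbx, nby, nbz) blocks of 128 threads -- 4 warps, i.e. 4 tasks,
    // per block -- with no dynamic shared memory, on the default stream.
    cudaLaunchDevice(kernel, buf, dim3(nbx, nby, nbz), dim3(128, 1, 1),
                     0, 0);
}

Any __global__ kernel whose parameter list matches TaskArgs field for field (nine ints followed by four pointers) can be launched this way; that is exactly the 13-parameter layout of the stencil_step_task entry point above, which explains why nbx is computed as ceil(ntx/4): each of the 4 warps in a 128-thread block processes one x-chunk.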