diff --git a/examples/stencil/1.s b/examples/stencil/1.s deleted file mode 100644 index d59cb1f9..00000000 --- a/examples/stencil/1.s +++ /dev/null @@ -1,175 +0,0 @@ - - code for sm_35 - Function : stencil_step_task - .headerflags @"EF_CUDA_SM35 EF_CUDA_PTX_SM(EF_CUDA_SM35)" - /* 0x0880a010a0a01000 */ - /*0008*/ MOV R1, c[0x0][0x44]; /* 0x64c03c00089c0006 */ - /*0010*/ S2R R10, SR_CTAID.X; /* 0x86400000129c002a */ - /*0018*/ MOV R12, c[0x0][0x160]; /* 0x64c03c002c1c0032 */ - /*0020*/ IADD R0, R10, c[0x0][0x150]; /* 0x608000002a1c2802 */ - /*0028*/ IADD R11, R0, 0x1; /* 0xc0800000009c002d */ - /*0030*/ MOV R13, c[0x0][0x164]; /* 0x64c03c002c9c0036 */ - /*0038*/ ISETP.GE.AND P0, PT, R0, R11, PT; /* 0xdb681c00059c001e */ - /* 0x08a0a1ac118d8d8c */ - /*0048*/ LD.E.64 R8, [R12]; /* 0xc5800000001c3020 */ - /*0050*/ LD.E.64 R6, [R12+0x8]; /* 0xc5800000041c3018 */ - /*0058*/ LD.E.64 R4, [R12+0x10]; /* 0xc5800000081c3010 */ - /*0060*/ LD.E.64 R2, [R12+0x18]; /* 0xc58000000c1c3008 */ - /*0068*/ @P0 EXIT ; /* 0x180000000000003c */ - /*0070*/ MOV R11, c[0x0][0x158]; /* 0x64c03c002b1c002e */ - /*0078*/ IMUL R41, R11, c[0x0][0x154]; /* 0x61c018002a9c2ca6 */ - /* 0x08b0a000a010a010 */ - /*0088*/ IADD R11, R10, c[0x0][0x150]; /* 0x608000002a1c282e */ - /*0090*/ SHF.L R40, RZ, 0x1, R41; /* 0xb7c0a400009ffca1 */ - /*0098*/ I2I.S32.S32 R10, -R40; /* 0xe6010000141ce82a */ - /*00a0*/ IADD R49, R11, 0x1; /* 0xc0800000009c2cc5 */ - /*00a8*/ SHF.L R28, RZ, 0x3, R10; /* 0xb7c02800019ffc71 */ - /*00b0*/ MOV R10, c[0x0][0x148]; /* 0x64c03c00291c002a */ - /*00b8*/ ISETP.GE.AND P0, PT, R10, c[0x0][0x14c], PT; /* 0x5b681c00299c281e */ - /* 0x0880acb0a00010ac */ - /*00c8*/ @P0 BRA 0x4f0; /* 0x120000021000003c */ - /*00d0*/ MOV R29, c[0x0][0x148]; /* 0x64c03c00291c0076 */ - /*00d8*/ IMUL R42, R0, R41; /* 0xe1c01800149c00aa */ - /*00e0*/ MOV R10, c[0x0][0x140]; /* 0x64c03c00281c002a */ - /*00e8*/ ISETP.GE.AND P0, PT, R10, c[0x0][0x144], PT; /* 0x5b681c00289c281e */ - /*00f0*/ @P0 BRA 0x4d8; /* 0x12000001f000003c */ - /*00f8*/ MOV R10, c[0x0][0x154]; /* 0x64c03c002a9c002a */ - /* 0x0880888010a0109c */ - /*0108*/ IMAD R44, R29, c[0x0][0x154], R42; /* 0x5108a8002a9c74b2 */ - /*0110*/ SHF.L R11, RZ, 0x1, R10; /* 0xb7c02800009ffc2d */ - /*0118*/ MOV R39, c[0x0][0x140]; /* 0x64c03c00281c009e */ - /*0120*/ IMAD R34, R10, -0x2, R44; /* 0xa908b3ffff1c2889 */ - /*0128*/ IADD R43, R44, R11; /* 0xe0800000059cb0ae */ - /*0130*/ I2I.S32.S32 R10, -R11; /* 0xe6010000059ce82a */ - /*0138*/ IMAD R36, R41, -0x2, R44; /* 0xa908b3ffff1ca491 */ - /* 0x08a0001084108480 */ - /*0148*/ IADD R32, R44, c[0x0][0x154]; /* 0x608000002a9cb082 */ - /*0150*/ IADD R33, R44, R41; /* 0xe0800000149cb086 */ - /*0158*/ IADD R35, R44, R40; /* 0xe0800000141cb08e */ - /*0160*/ IMAD R38, R41, 0x3, R44; /* 0xa108b000019ca499 */ - /*0168*/ SHF.L R47, RZ, 0x3, R10; /* 0xb7c02800019ffcbd */ - /*0170*/ IADD R37, R43, c[0x0][0x154]; /* 0x608000002a9cac96 */ - /*0178*/ S2R R10, SR_TID.X; /* 0x86400000109c002a */ - /* 0x08a0b0a010908c10 */ - /*0188*/ MOV32I R48, 0x8; /* 0x74000000041fc0c2 */ - /*0190*/ IADD R45, R10, R39; /* 0xe0800000139c28b6 */ - /*0198*/ BFE R30, R47, 0x11f; /* 0xc00800008f9cbc79 */ - /*01a0*/ IADD R46, R45, R44; /* 0xe0800000161cb4ba */ - /*01a8*/ IADD R14, R32, R45; /* 0xe0800000169c803a */ - /*01b0*/ IMAD R10.CC, R46, R48, c[0x0][0x170]; /* 0x910cc0002e1cb82a */ - /*01b8*/ IMAD.HI.X R11, R46, R48, c[0x0][0x174]; /* 0x9318c0002e9cb82e */ - /* 0x0881cc118c118c10 */ - /*01c8*/ IADD R27, R37, R45; /* 0xe0800000169c946e */ - /*01d0*/ LD.E.64 R12, [R10+-0x8]; /* 0xc5fffffffc1c2830 */ - /*01d8*/ BFE R50, R28, 0x11f; /* 0xc00800008f9c70c9 */ - /*01e0*/ LD.E.64 R24, [R10+0x8]; /* 0xc5800000041c2860 */ - /*01e8*/ ISETP.GE.AND P0, PT, R45, c[0x0][0x144], PT; /* 0x5b681c00289cb41e */ - /*01f0*/ LD.E.64 R18, [R10+-0x18]; /* 0xc5fffffff41c2848 */ - /*01f8*/ DADD R20, R24, R12; /* 0xe3800000061c6052 */ - /* 0x098c10a011ac8188 */ - /*0208*/ LD.E.64 R22, [R10+0x18]; /* 0xc58000000c1c2858 */ - /*0210*/ IMAD R16.CC, R14, R48, c[0x0][0x170]; /* 0x910cc0002e1c3842 */ - /*0218*/ LD.E.64 R12, [R10+-0x10]; /* 0xc5fffffff81c2830 */ - /*0220*/ IMAD.HI.X R17, R14, R48, c[0x0][0x174]; /* 0x9318c0002e9c3846 */ - /*0228*/ IADD R25, R43, R45; /* 0xe0800000169cac66 */ - /*0230*/ LD.E.64 R14, [R16]; /* 0xc5800000001c4038 */ - /*0238*/ DADD R22, R22, R18; /* 0xe3800000091c585a */ - /* 0x0994808c848cb180 */ - /*0248*/ LD.E.64 R18, [R10+0x10]; /* 0xc5800000081c2848 */ - /*0250*/ IMAD R26.CC, R27, R48, c[0x0][0x170]; /* 0x910cc0002e1c6c6a */ - /*0258*/ IMAD.HI.X R27, R27, R48, c[0x0][0x174]; /* 0x9318c0002e9c6c6e */ - /*0260*/ IMAD R24.CC, R25, R48, c[0x0][0x170]; /* 0x910cc0002e1c6462 */ - /*0268*/ DADD R14, R20, R14; /* 0xe3800000071c503a */ - /*0270*/ DADD R20, R18, R12; /* 0xe3800000061c4852 */ - /*0278*/ LD.E.64 R12, [R26]; /* 0xc5800000001c6830 */ - /* 0x08b080118010c080 */ - /*0288*/ IMAD.HI.X R25, R25, R48, c[0x0][0x174]; /* 0x9318c0002e9c6466 */ - /*0290*/ IADD R16.CC, R16, R47; /* 0xe0840000179c4042 */ - /*0298*/ LD.E.64 R18, [R24]; /* 0xc5800000001c6048 */ - /*02a0*/ DADD R12, R22, R12; /* 0xe3800000061c5832 */ - /*02a8*/ IADD.X R17, R17, R30; /* 0xe08040000f1c4446 */ - /*02b0*/ IADD R31, R34, R45; /* 0xe0800000169c887e */ - /*02b8*/ IADD R22.CC, R16, R47; /* 0xe0840000179c405a */ - /* 0x089980818880a010 */ - /*02c8*/ IADD.X R23, R17, R30; /* 0xe08040000f1c445e */ - /*02d0*/ IMAD R26.CC, R31, R48, c[0x0][0x170]; /* 0x910cc0002e1c7c6a */ - /*02d8*/ DADD R20, R20, R18; /* 0xe3800000091c5052 */ - /*02e0*/ LD.E.64 R18, [R16]; /* 0xc5800000001c4048 */ - /*02e8*/ IMAD.HI.X R27, R31, R48, c[0x0][0x174]; /* 0x9318c0002e9c7c6e */ - /*02f0*/ LD.E.64 R24, [R22]; /* 0xc5800000001c5860 */ - /*02f8*/ IADD R51, R33, R45; /* 0xe0800000169c84ce */ - /* 0x088880ac818c11b8 */ - /*0308*/ LD.E.64 R30, [R26]; /* 0xc5800000001c6878 */ - /*0310*/ LD.E.64 R26, [R10]; /* 0xc5800000001c2868 */ - /*0318*/ DADD R14, R14, R18; /* 0xe3800000091c383a */ - /*0320*/ IMAD R18.CC, R51, R48, c[0x0][0x170]; /* 0x910cc0002e1ccc4a */ - /*0328*/ IADD R17, R35, R45; /* 0xe0800000169c8c46 */ - /*0330*/ IMAD.HI.X R19, R51, R48, c[0x0][0x174]; /* 0x9318c0002e9ccc4e */ - /*0338*/ DADD R22, R20, R30; /* 0xe38000000f1c505a */ - /* 0x098c10a0999c1090 */ - /*0348*/ IMAD R16.CC, R17, R48, c[0x0][0x170]; /* 0x910cc0002e1c4442 */ - /*0350*/ LD.E.64 R20, [R18]; /* 0xc5800000001c4850 */ - /*0358*/ DADD R12, R12, R24; /* 0xe38000000c1c3032 */ - /*0360*/ IMAD.HI.X R17, R17, R48, c[0x0][0x174]; /* 0x9318c0002e9c4446 */ - /*0368*/ IADD R18.CC, R18, R28; /* 0xe08400000e1c484a */ - /*0370*/ LD.E.64 R24, [R16]; /* 0xc5800000001c4060 */ - /*0378*/ DADD R20, R14, R20; /* 0xe38000000a1c3852 */ - /* 0x088080b4a18010cc */ - /*0388*/ IADD.X R19, R19, R50; /* 0xe0804000191c4c4e */ - /*0390*/ LD.E.64 R14, [R18]; /* 0xc5800000001c4838 */ - /*0398*/ DADD R22, R22, R24; /* 0xe38000000c1c585a */ - /*03a0*/ IADD R25, R36, R45; /* 0xe0800000169c9066 */ - /*03a8*/ IMAD R16.CC, R25, R48, c[0x0][0x170]; /* 0x910cc0002e1c6442 */ - /*03b0*/ DADD R20, R20, R14; /* 0xe3800000071c5052 */ - /*03b8*/ IADD R15, R38, R45; /* 0xe0800000169c983e */ - /* 0x09a010b081ac809c */ - /*03c8*/ IMAD.HI.X R17, R25, R48, c[0x0][0x174]; /* 0x9318c0002e9c6446 */ - /*03d0*/ IMAD R14.CC, R15, R48, c[0x0][0x170]; /* 0x910cc0002e1c3c3a */ - /*03d8*/ LD.E.64 R24, [R16]; /* 0xc5800000001c4060 */ - /*03e0*/ IMAD.HI.X R15, R15, R48, c[0x0][0x174]; /* 0x9318c0002e9c3c3e */ - /*03e8*/ IADD R18.CC, R18, R28; /* 0xe08400000e1c484a */ - /*03f0*/ LD.E.64 R30, [R14]; /* 0xc5800000001c3878 */ - /*03f8*/ IADD.X R19, R19, R50; /* 0xe0804000191c4c4e */ - /* 0x08a480a480b58010 */ - /*0408*/ LD.E.64 R50, [R18]; /* 0xc5800000001c48c8 */ - /*0410*/ DMUL R20, R6, R20; /* 0xe40000000a1c1852 */ - /*0418*/ DADD R22, R22, R24; /* 0xe38000000c1c585a */ - /*0420*/ DADD R12, R12, R30; /* 0xe38000000f1c3032 */ - /*0428*/ DFMA R24, R8, R26, R20; /* 0xdb8050000d1c2062 */ - /*0430*/ DFMA R16, R4, R22, R24; /* 0xdb8060000b1c1042 */ - /*0438*/ DADD R12, R12, R50; /* 0xe3800000191c3032 */ - /* 0x08908cb0a010ac80 */ - /*0448*/ DFMA R10, R2, R12, R16; /* 0xdb804000061c082a */ - /*0450*/ @P0 BRA.U 0x4b8; /* 0x120000003000023c */ - /*0458*/ @!P0 MOV32I R17, 0x8; /* 0x740000000423c046 */ - /*0460*/ @!P0 DADD R18, R26, R26; /* 0xe38000000d20684a */ - /*0468*/ @!P0 IMAD R14.CC, R46, R17, c[0x0][0x178]; /* 0x910c44002f20b83a */ - /*0470*/ @!P0 IMAD.HI.X R15, R46, R17, c[0x0][0x17c]; /* 0x931844002fa0b83e */ - /*0478*/ @!P0 IMAD R16.CC, R46, R17, c[0x0][0x168]; /* 0x910c44002d20b842 */ - /* 0x08a180a5dc10bd9c */ - /*0488*/ @!P0 LD.E.64 R12, [R14]; /* 0xc580000000203830 */ - /*0490*/ @!P0 IMAD.HI.X R17, R46, R17, c[0x0][0x16c]; /* 0x931844002da0b846 */ - /*0498*/ @!P0 LD.E.64 R20, [R16]; /* 0xc580000000204050 */ - /*04a0*/ @!P0 DADD R22, R18, -R12; /* 0xe38100000620485a */ - /*04a8*/ @!P0 DFMA R10, R20, R10, R22; /* 0xdb8058000520502a */ - /*04b0*/ @!P0 ST.E.64 [R14], R10; /* 0xe580000000203828 */ - /*04b8*/ IADD R39, R39, 0x20; /* 0xc0800000101c9c9d */ - /* 0x08b0a0b8b0a0b8b0 */ - /*04c8*/ ISETP.LT.AND P0, PT, R39, c[0x0][0x144], PT; /* 0x5b181c00289c9c1e */ - /*04d0*/ @P0 BRA 0x178; /* 0x12007ffe5000003c */ - /*04d8*/ IADD R29, R29, 0x1; /* 0xc0800000009c7475 */ - /*04e0*/ ISETP.LT.AND P0, PT, R29, c[0x0][0x14c], PT; /* 0x5b181c00299c741e */ - /*04e8*/ @P0 BRA 0xe0; /* 0x12007ffdf800003c */ - /*04f0*/ IADD R0, R0, 0x1; /* 0xc0800000009c0001 */ - /*04f8*/ ISETP.LT.AND P0, PT, R0, R49, PT; /* 0xdb181c00189c001e */ - /* 0x0800000000b810b8 */ - /*0508*/ @P0 BRA 0xb0; /* 0x12007ffdd000003c */ - /*0510*/ MOV RZ, RZ; /* 0xe4c03c007f9c03fe */ - /*0518*/ EXIT ; /* 0x18000000001c003c */ - /*0520*/ BRA 0x520; /* 0x12007ffffc1c003c */ - /*0528*/ NOP; /* 0x85800000001c3c02 */ - /*0530*/ NOP; /* 0x85800000001c3c02 */ - /*0538*/ NOP; /* 0x85800000001c3c02 */ - .................................. - - diff --git a/examples/stencil/2.s b/examples/stencil/2.s deleted file mode 100644 index 76476d03..00000000 --- a/examples/stencil/2.s +++ /dev/null @@ -1,239 +0,0 @@ - - code for sm_35 - Function : stencil_step_task - .headerflags @"EF_CUDA_SM35 EF_CUDA_PTX_SM(EF_CUDA_SM35)" - /* 0x0880acb0a0a0a000 */ - /*0008*/ MOV R1, c[0x0][0x44]; /* 0x64c03c00089c0006 */ - /*0010*/ S2R R10, SR_CTAID.X; /* 0x86400000129c002a */ - /*0018*/ IADD R44, R10, c[0x0][0x150]; /* 0x608000002a1c28b2 */ - /*0020*/ IADD R0, R44, 0x1; /* 0xc0800000009cb001 */ - /*0028*/ ISETP.GE.AND P0, PT, R44, R0, PT; /* 0xdb681c00001cb01e */ - /*0030*/ @P0 EXIT ; /* 0x180000000000003c */ - /*0038*/ MOV R11, c[0x0][0x154]; /* 0x64c03c002a9c002e */ - /* 0x0888108010a01080 */ - /*0048*/ IADD R41, R10, c[0x0][0x150]; /* 0x608000002a1c28a6 */ - /*0050*/ MOV R12, c[0x0][0x160]; /* 0x64c03c002c1c0032 */ - /*0058*/ MOV R13, c[0x0][0x164]; /* 0x64c03c002c9c0036 */ - /*0060*/ IMUL R35, R11, c[0x0][0x158]; /* 0x61c018002b1c2c8e */ - /*0068*/ LD.E.64 R8, [R12]; /* 0xc5800000001c3020 */ - /*0070*/ SHF.L R36, RZ, 0x1, R11; /* 0xb7c02c00009ffc91 */ - /*0078*/ MOV R42, c[0x0][0x148]; /* 0x64c03c00291c00aa */ - /* 0x088c80108c108c10 */ - /*0088*/ LD.E.64 R6, [R12+0x8]; /* 0xc5800000041c3018 */ - /*0090*/ IMUL R0, R11, 0x3; /* 0xc1c01800019c2c01 */ - /*0098*/ LD.E.64 R4, [R12+0x10]; /* 0xc5800000081c3010 */ - /*00a0*/ IMUL R18, R11, -0x3; /* 0xc9c01bfffe9c2c49 */ - /*00a8*/ SHF.L R37, RZ, 0x1, R35; /* 0xb7c08c00009ffc95 */ - /*00b0*/ LD.E.64 R2, [R12+0x18]; /* 0xc58000000c1c3008 */ - /*00b8*/ IMUL R19, R35, 0x3; /* 0xc1c01800019c8c4d */ - /* 0x0880acb0a0acb000 */ - /*00c8*/ IMUL R20, R35, -0x3; /* 0xc9c01bfffe9c8c51 */ - /*00d0*/ ISETP.GE.AND P0, PT, R42, c[0x0][0x14c], PT; /* 0x5b681c00299ca81e */ - /*00d8*/ @P0 BRA 0x6d8; /* 0x12000002fc00003c */ - /*00e0*/ MOV R10, c[0x0][0x140]; /* 0x64c03c00281c002a */ - /*00e8*/ ISETP.LT.AND P0, PT, R10, c[0x0][0x144], PT; /* 0x5b181c00289c281e */ - /*00f0*/ @!P0 BRA 0x6d8; /* 0x12000002f020003c */ - /*00f8*/ IMUL R40, R44, R35; /* 0xe1c01800119cb0a2 */ - /* 0x088880108c10a000 */ - /*0108*/ MOV R21, c[0x0][0x148]; /* 0x64c03c00291c0056 */ - /*0110*/ IMAD R39, R21, c[0x0][0x154], R40; /* 0x5108a0002a9c549e */ - /*0118*/ MOV R34, c[0x0][0x140]; /* 0x64c03c00281c008a */ - /*0120*/ IADD R29, R39, R37; /* 0xe0800000129c9c76 */ - /*0128*/ IADD R22, R39, c[0x0][0x154]; /* 0x608000002a9c9c5a */ - /*0130*/ ISUB R32, R39, R37; /* 0xe0880000129c9c82 */ - /*0138*/ IADD R23, R39, R36; /* 0xe0800000121c9c5e */ - /* 0x0880808080108c10 */ - /*0148*/ ISUB R24, R39, c[0x0][0x154]; /* 0x608800002a9c9c62 */ - /*0150*/ IADD R25, R39, R0; /* 0xe0800000001c9c66 */ - /*0158*/ ISUB R26, R39, R36; /* 0xe0880000121c9c6a */ - /*0160*/ IADD R27, R39, R35; /* 0xe0800000119c9c6e */ - /*0168*/ IADD R28, R39, R18; /* 0xe0800000091c9c72 */ - /*0170*/ ISUB R30, R39, R35; /* 0xe0880000119c9c7a */ - /*0178*/ IADD R33, R39, R20; /* 0xe08000000a1c9c86 */ - /* 0x08a0acb0a0a0a000 */ - /*0188*/ IADD R31, R39, R19; /* 0xe0800000099c9c7e */ - /*0190*/ S2R R10, SR_TID.X; /* 0x86400000109c002a */ - /*0198*/ LOP.AND R11, R10, 0x1f; /* 0xc20000000f9c282d */ - /*01a0*/ IADD R43, R11, R34; /* 0xe0800000111c2cae */ - /*01a8*/ ISETP.GE.AND P0, PT, R43, c[0x0][0x144], PT; /* 0x5b681c00289cac1e */ - /*01b0*/ @P0 BRA.U 0x6a0; /* 0x120000027400023c */ - /*01b8*/ @!P0 IADD R10, R39, R43; /* 0xe080000015a09c2a */ - /* 0x08a0108c109c80a0 */ - /*01c8*/ @!P0 SHF.L R38, RZ, 0x3, R10; /* 0xb7c0280001a3fc99 */ - /*01d0*/ @!P0 IADD R10, R38, -0x8; /* 0xc88003fffc209829 */ - /*01d8*/ @!P0 IADD R11, R38, 0x8; /* 0xc08000000420982d */ - /*01e0*/ @!P0 BFE R12, R10, 0x11f; /* 0xc00800008fa02831 */ - /*01e8*/ @!P0 IADD R54.CC, R10, c[0x0][0x170]; /* 0x608400002e2028da */ - /*01f0*/ @!P0 IADD R10, R38, -0x10; /* 0xc88003fff8209829 */ - /*01f8*/ @!P0 BFE R13, R11, 0x11f; /* 0xc00800008fa02c35 */ - /* 0x08808080a0108c10 */ - /*0208*/ @!P0 IADD.X R55, R12, c[0x0][0x174]; /* 0x608040002ea030de */ - /*0210*/ @!P0 IADD R46.CC, R11, c[0x0][0x170]; /* 0x608400002e202cba */ - /*0218*/ @!P0 IADD R11, R38, 0x10; /* 0xc08000000820982d */ - /*0220*/ @!P0 BFE R14, R10, 0x11f; /* 0xc00800008fa02839 */ - /*0228*/ @!P0 IADD.X R47, R13, c[0x0][0x174]; /* 0x608040002ea034be */ - /*0230*/ @!P0 IADD R48.CC, R10, c[0x0][0x170]; /* 0x608400002e2028c2 */ - /*0238*/ @!P0 IADD R10, R22, R43; /* 0xe080000015a0582a */ - /* 0x08ac108080909410 */ - /*0248*/ @!P0 LD.E.64 R12, [R54]; /* 0xc58000000020d830 */ - /*0250*/ @!P0 BFE R15, R11, 0x11f; /* 0xc00800008fa02c3d */ - /*0258*/ @!P0 LD.E.64 R16, [R46]; /* 0xc58000000020b840 */ - /*0260*/ @!P0 IADD.X R49, R14, c[0x0][0x174]; /* 0x608040002ea038c6 */ - /*0268*/ @!P0 IADD R52.CC, R11, c[0x0][0x170]; /* 0x608400002e202cd2 */ - /*0270*/ @!P0 SHF.L R50, RZ, 0x3, R10; /* 0xb7c0280001a3fcc9 */ - /*0278*/ @!P0 IADD R14, R23, R43; /* 0xe080000015a05c3a */ - /* 0x08908c108c108010 */ - /*0288*/ @!P0 IADD.X R53, R15, c[0x0][0x174]; /* 0x608040002ea03cd6 */ - /*0290*/ @!P0 BFE R51, R50, 0x11f; /* 0xc00800008fa0c8cd */ - /*0298*/ @!P0 IADD R50.CC, R50, c[0x0][0x170]; /* 0x608400002e20c8ca */ - /*02a0*/ @!P0 SHF.L R45, RZ, 0x3, R14; /* 0xb7c0380001a3fcb5 */ - /*02a8*/ @!P0 LD.E.64 R10, [R48]; /* 0xc58000000020c028 */ - /*02b0*/ @!P0 DADD R12, R12, R16; /* 0xe380000008203032 */ - /*02b8*/ @!P0 LD.E.64 R14, [R52]; /* 0xc58000000020d038 */ - /* 0x089c8010b0108c10 */ - /*02c8*/ @!P0 IADD.X R51, R51, c[0x0][0x174]; /* 0x608040002ea0ccce */ - /*02d0*/ @!P0 BFE R17, R45, 0x11f; /* 0xc00800008fa0b445 */ - /*02d8*/ @!P0 IADD R16, R24, R43; /* 0xe080000015a06042 */ - /*02e0*/ @!P0 IADD R46.CC, R45, c[0x0][0x170]; /* 0x608400002e20b4ba */ - /*02e8*/ @!P0 SHF.L R45, RZ, 0x3, R16; /* 0xb7c0400001a3fcb5 */ - /*02f0*/ @!P0 IADD.X R47, R17, c[0x0][0x174]; /* 0x608040002ea044be */ - /*02f8*/ @!P0 LD.E.64 R16, [R50]; /* 0xc58000000020c840 */ - /* 0x08848010a8108080 */ - /*0308*/ @!P0 IADD R54.CC, R45, c[0x0][0x170]; /* 0x608400002e20b4da */ - /*0310*/ @!P0 DADD R48, R10, R14; /* 0xe3800000072028c2 */ - /*0318*/ @!P0 BFE R11, R45, 0x11f; /* 0xc00800008fa0b42d */ - /*0320*/ @!P0 IADD R10, R26, R43; /* 0xe080000015a0682a */ - /*0328*/ @!P0 IADD.X R55, R11, c[0x0][0x174]; /* 0x608040002ea02cde */ - /*0330*/ @!P0 SHF.L R45, RZ, 0x3, R10; /* 0xb7c0280001a3fcb5 */ - /*0338*/ @!P0 LD.E.64 R14, [R46]; /* 0xc58000000020b838 */ - /* 0x0890988010801094 */ - /*0348*/ @!P0 DADD R16, R12, R16; /* 0xe380000008203042 */ - /*0350*/ @!P0 IADD R13, R27, R43; /* 0xe080000015a06c36 */ - /*0358*/ @!P0 LD.E.64 R10, [R54]; /* 0xc58000000020d828 */ - /*0360*/ @!P0 BFE R53, R45, 0x11f; /* 0xc00800008fa0b4d5 */ - /*0368*/ @!P0 IADD R52.CC, R45, c[0x0][0x170]; /* 0x608400002e20b4d2 */ - /*0370*/ @!P0 IADD R12, R29, R43; /* 0xe080000015a07432 */ - /*0378*/ @!P0 SHF.L R13, RZ, 0x3, R13; /* 0xb7c0340001a3fc35 */ - /* 0x0894801094108c10 */ - /*0388*/ @!P0 IADD.X R53, R53, c[0x0][0x174]; /* 0x608040002ea0d4d6 */ - /*0390*/ @!P0 SHF.L R45, RZ, 0x3, R12; /* 0xb7c0300001a3fcb5 */ - /*0398*/ @!P0 BFE R46, R13, 0x11f; /* 0xc00800008fa034b9 */ - /*03a0*/ @!P0 IADD R50.CC, R13, c[0x0][0x170]; /* 0x608400002e2034ca */ - /*03a8*/ @!P0 LD.E.64 R12, [R52]; /* 0xc58000000020d030 */ - /*03b0*/ @!P0 DADD R16, R16, R10; /* 0xe380000005204042 */ - /*03b8*/ @!P0 BFE R10, R45, 0x11f; /* 0xc00800008fa0b429 */ - /* 0x08a0108c109c8010 */ - /*03c8*/ @!P0 IADD.X R51, R46, c[0x0][0x174]; /* 0x608040002ea0b8ce */ - /*03d0*/ @!P0 IADD R54.CC, R45, c[0x0][0x170]; /* 0x608400002e20b4da */ - /*03d8*/ @!P0 IADD R45, R30, R43; /* 0xe080000015a078b6 */ - /*03e0*/ @!P0 LD.E.64 R46, [R50]; /* 0xc58000000020c8b8 */ - /*03e8*/ @!P0 DADD R14, R48, R14; /* 0xe38000000720c03a */ - /*03f0*/ @!P0 IADD.X R55, R10, c[0x0][0x174]; /* 0x608040002ea028de */ - /*03f8*/ @!P0 SHF.L R48, RZ, 0x3, R45; /* 0xb7c0b40001a3fcc1 */ - /* 0x088480a080108010 */ - /*0408*/ @!P0 IADD R45, R32, R43; /* 0xe080000015a080b6 */ - /*0410*/ @!P0 LD.E.64 R10, [R54]; /* 0xc58000000020d828 */ - /*0418*/ @!P0 BFE R49, R48, 0x11f; /* 0xc00800008fa0c0c5 */ - /*0420*/ @!P0 IADD R48.CC, R48, c[0x0][0x170]; /* 0x608400002e20c0c2 */ - /*0428*/ @!P0 DADD R14, R14, R12; /* 0xe38000000620383a */ - /*0430*/ @!P0 DADD R12, R16, R46; /* 0xe380000017204032 */ - /*0438*/ @!P0 SHF.L R46, RZ, 0x3, R45; /* 0xb7c0b40001a3fcb9 */ - /* 0x0880808010b08010 */ - /*0448*/ @!P0 IADD.X R49, R49, c[0x0][0x174]; /* 0x608040002ea0c4c6 */ - /*0450*/ @!P0 BFE R45, R38, 0x11f; /* 0xc00800008fa098b5 */ - /*0458*/ @!P0 IADD R16.CC, R38, c[0x0][0x170]; /* 0x608400002e209842 */ - /*0460*/ @!P0 IADD.X R17, R45, c[0x0][0x174]; /* 0x608040002ea0b446 */ - /*0468*/ @!P0 LD.E.64 R50, [R48]; /* 0xc58000000020c0c8 */ - /*0470*/ @!P0 DADD R14, R14, R10; /* 0xe38000000520383a */ - /*0478*/ @!P0 BFE R10, R46, 0x11f; /* 0xc00800008fa0b829 */ - /* 0x0880bc109c1080b0 */ - /*0488*/ @!P0 IADD R46.CC, R46, c[0x0][0x170]; /* 0x608400002e20b8ba */ - /*0490*/ @!P0 IADD.X R47, R10, c[0x0][0x174]; /* 0x608040002ea028be */ - /*0498*/ @!P0 LD.E.64 R10, [R16]; /* 0xc580000000204028 */ - /*04a0*/ @!P0 IADD R48, R38, -0x18; /* 0xc88003fff42098c1 */ - /*04a8*/ @!P0 LD.E.64 R52, [R46]; /* 0xc58000000020b8d0 */ - /*04b0*/ @!P0 DADD R12, R12, R50; /* 0xe380000019203032 */ - /*04b8*/ @!P0 DMUL R50, R8, R10; /* 0xe4000000052020ca */ - /* 0x08b08010b01080a0 */ - /*04c8*/ @!P0 IADD R46, R38, 0x18; /* 0xc08000000c2098b9 */ - /*04d0*/ @!P0 DFMA R16, R6, R12, R50; /* 0xdb80c80006201842 */ - /*04d8*/ @!P0 BFE R13, R48, 0x11f; /* 0xc00800008fa0c035 */ - /*04e0*/ @!P0 IADD R12.CC, R48, c[0x0][0x170]; /* 0x608400002e20c032 */ - /*04e8*/ @!P0 BFE R47, R46, 0x11f; /* 0xc00800008fa0b8bd */ - /*04f0*/ @!P0 IADD.X R13, R13, c[0x0][0x174]; /* 0x608040002ea03436 */ - /*04f8*/ @!P0 IADD R46.CC, R46, c[0x0][0x170]; /* 0x608400002e20b8ba */ - /* 0x08a0a080dc109c80 */ - /*0508*/ @!P0 IADD.X R47, R47, c[0x0][0x174]; /* 0x608040002ea0bcbe */ - /*0510*/ @!P0 LD.E.64 R48, [R12]; /* 0xc5800000002030c0 */ - /*0518*/ @!P0 LD.E.64 R50, [R46]; /* 0xc58000000020b8c8 */ - /*0520*/ @!P0 DADD R14, R14, R52; /* 0xe38000001a20383a */ - /*0528*/ @!P0 DADD R12, R48, R50; /* 0xe38000001920c032 */ - /*0530*/ @!P0 IADD R48, R25, R43; /* 0xe080000015a064c2 */ - /*0538*/ @!P0 SHF.L R46, RZ, 0x3, R48; /* 0xb7c0c00001a3fcb9 */ - /* 0x08a080dc10a0b010 */ - /*0548*/ @!P0 BFE R47, R46, 0x11f; /* 0xc00800008fa0b8bd */ - /*0550*/ @!P0 IADD R46.CC, R46, c[0x0][0x170]; /* 0x608400002e20b8ba */ - /*0558*/ @!P0 IADD.X R47, R47, c[0x0][0x174]; /* 0x608040002ea0bcbe */ - /*0560*/ @!P0 LD.E.64 R48, [R46]; /* 0xc58000000020b8c0 */ - /*0568*/ @!P0 DADD R10, R10, R10; /* 0xe38000000520282a */ - /*0570*/ @!P0 DADD R12, R12, R48; /* 0xe380000018203032 */ - /*0578*/ @!P0 IADD R48, R28, R43; /* 0xe080000015a070c2 */ - /* 0x08a080dca0b010a0 */ - /*0588*/ @!P0 SHF.L R46, RZ, 0x3, R48; /* 0xb7c0c00001a3fcb9 */ - /*0590*/ @!P0 BFE R47, R46, 0x11f; /* 0xc00800008fa0b8bd */ - /*0598*/ @!P0 IADD R46.CC, R46, c[0x0][0x170]; /* 0x608400002e20b8ba */ - /*05a0*/ @!P0 IADD.X R47, R47, c[0x0][0x174]; /* 0x608040002ea0bcbe */ - /*05a8*/ @!P0 LD.E.64 R48, [R46]; /* 0xc58000000020b8c0 */ - /*05b0*/ @!P0 DADD R12, R12, R48; /* 0xe380000018203032 */ - /*05b8*/ @!P0 IADD R48, R31, R43; /* 0xe080000015a07cc2 */ - /* 0x0880a010b010a010 */ - /*05c8*/ @!P0 IADD R43, R33, R43; /* 0xe080000015a084ae */ - /*05d0*/ @!P0 SHF.L R46, RZ, 0x3, R48; /* 0xb7c0c00001a3fcb9 */ - /*05d8*/ @!P0 BFE R47, R46, 0x11f; /* 0xc00800008fa0b8bd */ - /*05e0*/ @!P0 IADD R48.CC, R46, c[0x0][0x170]; /* 0x608400002e20b8c2 */ - /*05e8*/ @!P0 IADD.X R49, R47, c[0x0][0x174]; /* 0x608040002ea0bcc6 */ - /*05f0*/ @!P0 SHF.L R43, RZ, 0x3, R43; /* 0xb7c0ac0001a3fcad */ - /*05f8*/ @!P0 LD.E.64 R46, [R48]; /* 0xc58000000020c0b8 */ - /* 0x0880909c80a080d8 */ - /*0608*/ @!P0 IADD R52.CC, R43, c[0x0][0x170]; /* 0x608400002e20acd2 */ - /*0610*/ @!P0 DADD R46, R12, R46; /* 0xe3800000172030ba */ - /*0618*/ @!P0 BFE R12, R43, 0x11f; /* 0xc00800008fa0ac31 */ - /*0620*/ @!P0 IADD.X R53, R12, c[0x0][0x174]; /* 0x608040002ea030d6 */ - /*0628*/ @!P0 IADD R12.CC, R38, c[0x0][0x178]; /* 0x608400002f209832 */ - /*0630*/ @!P0 LD.E.64 R48, [R52]; /* 0xc58000000020d0c0 */ - /*0638*/ @!P0 IADD.X R13, R45, c[0x0][0x17c]; /* 0x608040002fa0b436 */ - /* 0x08cc8c10a48090b0 */ - /*0648*/ @!P0 IADD R50.CC, R38, c[0x0][0x168]; /* 0x608400002d2098ca */ - /*0650*/ @!P0 IADD.X R51, R45, c[0x0][0x16c]; /* 0x608040002da0b4ce */ - /*0658*/ @!P0 DADD R46, R46, R48; /* 0xe38000001820b8ba */ - /*0660*/ @!P0 DFMA R48, R4, R14, R16; /* 0xdb804000072010c2 */ - /*0668*/ @!P0 LD.E.64 R16, [R12]; /* 0xc580000000203040 */ - /*0670*/ @!P0 DFMA R48, R2, R46, R48; /* 0xdb80c000172008c2 */ - /*0678*/ @!P0 LD.E.64 R14, [R50]; /* 0xc58000000020c838 */ - /* 0x08a0b8b0a000a4a4 */ - /*0688*/ @!P0 DADD R10, R10, -R16; /* 0xe38100000820282a */ - /*0690*/ @!P0 DFMA R10, R48, R14, R10; /* 0xdb8028000720c02a */ - /*0698*/ @!P0 ST.E.64 [R12], R10; /* 0xe580000000203028 */ - /*06a0*/ IADD R34, R34, 0x20; /* 0xc0800000101c8889 */ - /*06a8*/ ISETP.LT.AND P0, PT, R34, c[0x0][0x144], PT; /* 0x5b181c00289c881e */ - /*06b0*/ @P0 BRA 0x190; /* 0x12007ffd6c00003c */ - /*06b8*/ IADD R21, R21, 0x1; /* 0xc0800000009c5455 */ - /* 0x08b810b8b010b8b0 */ - /*06c8*/ ISETP.EQ.AND P0, PT, R21, c[0x0][0x14c], PT; /* 0x5b281c00299c541e */ - /*06d0*/ @!P0 BRA 0x110; /* 0x12007ffd1c20003c */ - /*06d8*/ ISETP.NE.AND P0, PT, R44, R41, PT; /* 0xdb581c00149cb01e */ - /*06e0*/ IADD R44, R44, 0x1; /* 0xc0800000009cb0b1 */ - /*06e8*/ @P0 BRA 0xd0; /* 0x12007ffcf000003c */ - /*06f0*/ MOV RZ, RZ; /* 0xe4c03c007f9c03fe */ - /*06f8*/ EXIT ; /* 0x18000000001c003c */ - /*0700*/ BRA 0x700; /* 0x12007ffffc1c003c */ - /*0708*/ NOP; /* 0x85800000001c3c02 */ - /*0710*/ NOP; /* 0x85800000001c3c02 */ - /*0718*/ NOP; /* 0x85800000001c3c02 */ - /*0720*/ NOP; /* 0x85800000001c3c02 */ - /*0728*/ NOP; /* 0x85800000001c3c02 */ - /*0730*/ NOP; /* 0x85800000001c3c02 */ - /*0738*/ NOP; /* 0x85800000001c3c02 */ - .................................. - - diff --git a/examples/stencil/3.s b/examples/stencil/3.s deleted file mode 100644 index 76476d03..00000000 --- a/examples/stencil/3.s +++ /dev/null @@ -1,239 +0,0 @@ - - code for sm_35 - Function : stencil_step_task - .headerflags @"EF_CUDA_SM35 EF_CUDA_PTX_SM(EF_CUDA_SM35)" - /* 0x0880acb0a0a0a000 */ - /*0008*/ MOV R1, c[0x0][0x44]; /* 0x64c03c00089c0006 */ - /*0010*/ S2R R10, SR_CTAID.X; /* 0x86400000129c002a */ - /*0018*/ IADD R44, R10, c[0x0][0x150]; /* 0x608000002a1c28b2 */ - /*0020*/ IADD R0, R44, 0x1; /* 0xc0800000009cb001 */ - /*0028*/ ISETP.GE.AND P0, PT, R44, R0, PT; /* 0xdb681c00001cb01e */ - /*0030*/ @P0 EXIT ; /* 0x180000000000003c */ - /*0038*/ MOV R11, c[0x0][0x154]; /* 0x64c03c002a9c002e */ - /* 0x0888108010a01080 */ - /*0048*/ IADD R41, R10, c[0x0][0x150]; /* 0x608000002a1c28a6 */ - /*0050*/ MOV R12, c[0x0][0x160]; /* 0x64c03c002c1c0032 */ - /*0058*/ MOV R13, c[0x0][0x164]; /* 0x64c03c002c9c0036 */ - /*0060*/ IMUL R35, R11, c[0x0][0x158]; /* 0x61c018002b1c2c8e */ - /*0068*/ LD.E.64 R8, [R12]; /* 0xc5800000001c3020 */ - /*0070*/ SHF.L R36, RZ, 0x1, R11; /* 0xb7c02c00009ffc91 */ - /*0078*/ MOV R42, c[0x0][0x148]; /* 0x64c03c00291c00aa */ - /* 0x088c80108c108c10 */ - /*0088*/ LD.E.64 R6, [R12+0x8]; /* 0xc5800000041c3018 */ - /*0090*/ IMUL R0, R11, 0x3; /* 0xc1c01800019c2c01 */ - /*0098*/ LD.E.64 R4, [R12+0x10]; /* 0xc5800000081c3010 */ - /*00a0*/ IMUL R18, R11, -0x3; /* 0xc9c01bfffe9c2c49 */ - /*00a8*/ SHF.L R37, RZ, 0x1, R35; /* 0xb7c08c00009ffc95 */ - /*00b0*/ LD.E.64 R2, [R12+0x18]; /* 0xc58000000c1c3008 */ - /*00b8*/ IMUL R19, R35, 0x3; /* 0xc1c01800019c8c4d */ - /* 0x0880acb0a0acb000 */ - /*00c8*/ IMUL R20, R35, -0x3; /* 0xc9c01bfffe9c8c51 */ - /*00d0*/ ISETP.GE.AND P0, PT, R42, c[0x0][0x14c], PT; /* 0x5b681c00299ca81e */ - /*00d8*/ @P0 BRA 0x6d8; /* 0x12000002fc00003c */ - /*00e0*/ MOV R10, c[0x0][0x140]; /* 0x64c03c00281c002a */ - /*00e8*/ ISETP.LT.AND P0, PT, R10, c[0x0][0x144], PT; /* 0x5b181c00289c281e */ - /*00f0*/ @!P0 BRA 0x6d8; /* 0x12000002f020003c */ - /*00f8*/ IMUL R40, R44, R35; /* 0xe1c01800119cb0a2 */ - /* 0x088880108c10a000 */ - /*0108*/ MOV R21, c[0x0][0x148]; /* 0x64c03c00291c0056 */ - /*0110*/ IMAD R39, R21, c[0x0][0x154], R40; /* 0x5108a0002a9c549e */ - /*0118*/ MOV R34, c[0x0][0x140]; /* 0x64c03c00281c008a */ - /*0120*/ IADD R29, R39, R37; /* 0xe0800000129c9c76 */ - /*0128*/ IADD R22, R39, c[0x0][0x154]; /* 0x608000002a9c9c5a */ - /*0130*/ ISUB R32, R39, R37; /* 0xe0880000129c9c82 */ - /*0138*/ IADD R23, R39, R36; /* 0xe0800000121c9c5e */ - /* 0x0880808080108c10 */ - /*0148*/ ISUB R24, R39, c[0x0][0x154]; /* 0x608800002a9c9c62 */ - /*0150*/ IADD R25, R39, R0; /* 0xe0800000001c9c66 */ - /*0158*/ ISUB R26, R39, R36; /* 0xe0880000121c9c6a */ - /*0160*/ IADD R27, R39, R35; /* 0xe0800000119c9c6e */ - /*0168*/ IADD R28, R39, R18; /* 0xe0800000091c9c72 */ - /*0170*/ ISUB R30, R39, R35; /* 0xe0880000119c9c7a */ - /*0178*/ IADD R33, R39, R20; /* 0xe08000000a1c9c86 */ - /* 0x08a0acb0a0a0a000 */ - /*0188*/ IADD R31, R39, R19; /* 0xe0800000099c9c7e */ - /*0190*/ S2R R10, SR_TID.X; /* 0x86400000109c002a */ - /*0198*/ LOP.AND R11, R10, 0x1f; /* 0xc20000000f9c282d */ - /*01a0*/ IADD R43, R11, R34; /* 0xe0800000111c2cae */ - /*01a8*/ ISETP.GE.AND P0, PT, R43, c[0x0][0x144], PT; /* 0x5b681c00289cac1e */ - /*01b0*/ @P0 BRA.U 0x6a0; /* 0x120000027400023c */ - /*01b8*/ @!P0 IADD R10, R39, R43; /* 0xe080000015a09c2a */ - /* 0x08a0108c109c80a0 */ - /*01c8*/ @!P0 SHF.L R38, RZ, 0x3, R10; /* 0xb7c0280001a3fc99 */ - /*01d0*/ @!P0 IADD R10, R38, -0x8; /* 0xc88003fffc209829 */ - /*01d8*/ @!P0 IADD R11, R38, 0x8; /* 0xc08000000420982d */ - /*01e0*/ @!P0 BFE R12, R10, 0x11f; /* 0xc00800008fa02831 */ - /*01e8*/ @!P0 IADD R54.CC, R10, c[0x0][0x170]; /* 0x608400002e2028da */ - /*01f0*/ @!P0 IADD R10, R38, -0x10; /* 0xc88003fff8209829 */ - /*01f8*/ @!P0 BFE R13, R11, 0x11f; /* 0xc00800008fa02c35 */ - /* 0x08808080a0108c10 */ - /*0208*/ @!P0 IADD.X R55, R12, c[0x0][0x174]; /* 0x608040002ea030de */ - /*0210*/ @!P0 IADD R46.CC, R11, c[0x0][0x170]; /* 0x608400002e202cba */ - /*0218*/ @!P0 IADD R11, R38, 0x10; /* 0xc08000000820982d */ - /*0220*/ @!P0 BFE R14, R10, 0x11f; /* 0xc00800008fa02839 */ - /*0228*/ @!P0 IADD.X R47, R13, c[0x0][0x174]; /* 0x608040002ea034be */ - /*0230*/ @!P0 IADD R48.CC, R10, c[0x0][0x170]; /* 0x608400002e2028c2 */ - /*0238*/ @!P0 IADD R10, R22, R43; /* 0xe080000015a0582a */ - /* 0x08ac108080909410 */ - /*0248*/ @!P0 LD.E.64 R12, [R54]; /* 0xc58000000020d830 */ - /*0250*/ @!P0 BFE R15, R11, 0x11f; /* 0xc00800008fa02c3d */ - /*0258*/ @!P0 LD.E.64 R16, [R46]; /* 0xc58000000020b840 */ - /*0260*/ @!P0 IADD.X R49, R14, c[0x0][0x174]; /* 0x608040002ea038c6 */ - /*0268*/ @!P0 IADD R52.CC, R11, c[0x0][0x170]; /* 0x608400002e202cd2 */ - /*0270*/ @!P0 SHF.L R50, RZ, 0x3, R10; /* 0xb7c0280001a3fcc9 */ - /*0278*/ @!P0 IADD R14, R23, R43; /* 0xe080000015a05c3a */ - /* 0x08908c108c108010 */ - /*0288*/ @!P0 IADD.X R53, R15, c[0x0][0x174]; /* 0x608040002ea03cd6 */ - /*0290*/ @!P0 BFE R51, R50, 0x11f; /* 0xc00800008fa0c8cd */ - /*0298*/ @!P0 IADD R50.CC, R50, c[0x0][0x170]; /* 0x608400002e20c8ca */ - /*02a0*/ @!P0 SHF.L R45, RZ, 0x3, R14; /* 0xb7c0380001a3fcb5 */ - /*02a8*/ @!P0 LD.E.64 R10, [R48]; /* 0xc58000000020c028 */ - /*02b0*/ @!P0 DADD R12, R12, R16; /* 0xe380000008203032 */ - /*02b8*/ @!P0 LD.E.64 R14, [R52]; /* 0xc58000000020d038 */ - /* 0x089c8010b0108c10 */ - /*02c8*/ @!P0 IADD.X R51, R51, c[0x0][0x174]; /* 0x608040002ea0ccce */ - /*02d0*/ @!P0 BFE R17, R45, 0x11f; /* 0xc00800008fa0b445 */ - /*02d8*/ @!P0 IADD R16, R24, R43; /* 0xe080000015a06042 */ - /*02e0*/ @!P0 IADD R46.CC, R45, c[0x0][0x170]; /* 0x608400002e20b4ba */ - /*02e8*/ @!P0 SHF.L R45, RZ, 0x3, R16; /* 0xb7c0400001a3fcb5 */ - /*02f0*/ @!P0 IADD.X R47, R17, c[0x0][0x174]; /* 0x608040002ea044be */ - /*02f8*/ @!P0 LD.E.64 R16, [R50]; /* 0xc58000000020c840 */ - /* 0x08848010a8108080 */ - /*0308*/ @!P0 IADD R54.CC, R45, c[0x0][0x170]; /* 0x608400002e20b4da */ - /*0310*/ @!P0 DADD R48, R10, R14; /* 0xe3800000072028c2 */ - /*0318*/ @!P0 BFE R11, R45, 0x11f; /* 0xc00800008fa0b42d */ - /*0320*/ @!P0 IADD R10, R26, R43; /* 0xe080000015a0682a */ - /*0328*/ @!P0 IADD.X R55, R11, c[0x0][0x174]; /* 0x608040002ea02cde */ - /*0330*/ @!P0 SHF.L R45, RZ, 0x3, R10; /* 0xb7c0280001a3fcb5 */ - /*0338*/ @!P0 LD.E.64 R14, [R46]; /* 0xc58000000020b838 */ - /* 0x0890988010801094 */ - /*0348*/ @!P0 DADD R16, R12, R16; /* 0xe380000008203042 */ - /*0350*/ @!P0 IADD R13, R27, R43; /* 0xe080000015a06c36 */ - /*0358*/ @!P0 LD.E.64 R10, [R54]; /* 0xc58000000020d828 */ - /*0360*/ @!P0 BFE R53, R45, 0x11f; /* 0xc00800008fa0b4d5 */ - /*0368*/ @!P0 IADD R52.CC, R45, c[0x0][0x170]; /* 0x608400002e20b4d2 */ - /*0370*/ @!P0 IADD R12, R29, R43; /* 0xe080000015a07432 */ - /*0378*/ @!P0 SHF.L R13, RZ, 0x3, R13; /* 0xb7c0340001a3fc35 */ - /* 0x0894801094108c10 */ - /*0388*/ @!P0 IADD.X R53, R53, c[0x0][0x174]; /* 0x608040002ea0d4d6 */ - /*0390*/ @!P0 SHF.L R45, RZ, 0x3, R12; /* 0xb7c0300001a3fcb5 */ - /*0398*/ @!P0 BFE R46, R13, 0x11f; /* 0xc00800008fa034b9 */ - /*03a0*/ @!P0 IADD R50.CC, R13, c[0x0][0x170]; /* 0x608400002e2034ca */ - /*03a8*/ @!P0 LD.E.64 R12, [R52]; /* 0xc58000000020d030 */ - /*03b0*/ @!P0 DADD R16, R16, R10; /* 0xe380000005204042 */ - /*03b8*/ @!P0 BFE R10, R45, 0x11f; /* 0xc00800008fa0b429 */ - /* 0x08a0108c109c8010 */ - /*03c8*/ @!P0 IADD.X R51, R46, c[0x0][0x174]; /* 0x608040002ea0b8ce */ - /*03d0*/ @!P0 IADD R54.CC, R45, c[0x0][0x170]; /* 0x608400002e20b4da */ - /*03d8*/ @!P0 IADD R45, R30, R43; /* 0xe080000015a078b6 */ - /*03e0*/ @!P0 LD.E.64 R46, [R50]; /* 0xc58000000020c8b8 */ - /*03e8*/ @!P0 DADD R14, R48, R14; /* 0xe38000000720c03a */ - /*03f0*/ @!P0 IADD.X R55, R10, c[0x0][0x174]; /* 0x608040002ea028de */ - /*03f8*/ @!P0 SHF.L R48, RZ, 0x3, R45; /* 0xb7c0b40001a3fcc1 */ - /* 0x088480a080108010 */ - /*0408*/ @!P0 IADD R45, R32, R43; /* 0xe080000015a080b6 */ - /*0410*/ @!P0 LD.E.64 R10, [R54]; /* 0xc58000000020d828 */ - /*0418*/ @!P0 BFE R49, R48, 0x11f; /* 0xc00800008fa0c0c5 */ - /*0420*/ @!P0 IADD R48.CC, R48, c[0x0][0x170]; /* 0x608400002e20c0c2 */ - /*0428*/ @!P0 DADD R14, R14, R12; /* 0xe38000000620383a */ - /*0430*/ @!P0 DADD R12, R16, R46; /* 0xe380000017204032 */ - /*0438*/ @!P0 SHF.L R46, RZ, 0x3, R45; /* 0xb7c0b40001a3fcb9 */ - /* 0x0880808010b08010 */ - /*0448*/ @!P0 IADD.X R49, R49, c[0x0][0x174]; /* 0x608040002ea0c4c6 */ - /*0450*/ @!P0 BFE R45, R38, 0x11f; /* 0xc00800008fa098b5 */ - /*0458*/ @!P0 IADD R16.CC, R38, c[0x0][0x170]; /* 0x608400002e209842 */ - /*0460*/ @!P0 IADD.X R17, R45, c[0x0][0x174]; /* 0x608040002ea0b446 */ - /*0468*/ @!P0 LD.E.64 R50, [R48]; /* 0xc58000000020c0c8 */ - /*0470*/ @!P0 DADD R14, R14, R10; /* 0xe38000000520383a */ - /*0478*/ @!P0 BFE R10, R46, 0x11f; /* 0xc00800008fa0b829 */ - /* 0x0880bc109c1080b0 */ - /*0488*/ @!P0 IADD R46.CC, R46, c[0x0][0x170]; /* 0x608400002e20b8ba */ - /*0490*/ @!P0 IADD.X R47, R10, c[0x0][0x174]; /* 0x608040002ea028be */ - /*0498*/ @!P0 LD.E.64 R10, [R16]; /* 0xc580000000204028 */ - /*04a0*/ @!P0 IADD R48, R38, -0x18; /* 0xc88003fff42098c1 */ - /*04a8*/ @!P0 LD.E.64 R52, [R46]; /* 0xc58000000020b8d0 */ - /*04b0*/ @!P0 DADD R12, R12, R50; /* 0xe380000019203032 */ - /*04b8*/ @!P0 DMUL R50, R8, R10; /* 0xe4000000052020ca */ - /* 0x08b08010b01080a0 */ - /*04c8*/ @!P0 IADD R46, R38, 0x18; /* 0xc08000000c2098b9 */ - /*04d0*/ @!P0 DFMA R16, R6, R12, R50; /* 0xdb80c80006201842 */ - /*04d8*/ @!P0 BFE R13, R48, 0x11f; /* 0xc00800008fa0c035 */ - /*04e0*/ @!P0 IADD R12.CC, R48, c[0x0][0x170]; /* 0x608400002e20c032 */ - /*04e8*/ @!P0 BFE R47, R46, 0x11f; /* 0xc00800008fa0b8bd */ - /*04f0*/ @!P0 IADD.X R13, R13, c[0x0][0x174]; /* 0x608040002ea03436 */ - /*04f8*/ @!P0 IADD R46.CC, R46, c[0x0][0x170]; /* 0x608400002e20b8ba */ - /* 0x08a0a080dc109c80 */ - /*0508*/ @!P0 IADD.X R47, R47, c[0x0][0x174]; /* 0x608040002ea0bcbe */ - /*0510*/ @!P0 LD.E.64 R48, [R12]; /* 0xc5800000002030c0 */ - /*0518*/ @!P0 LD.E.64 R50, [R46]; /* 0xc58000000020b8c8 */ - /*0520*/ @!P0 DADD R14, R14, R52; /* 0xe38000001a20383a */ - /*0528*/ @!P0 DADD R12, R48, R50; /* 0xe38000001920c032 */ - /*0530*/ @!P0 IADD R48, R25, R43; /* 0xe080000015a064c2 */ - /*0538*/ @!P0 SHF.L R46, RZ, 0x3, R48; /* 0xb7c0c00001a3fcb9 */ - /* 0x08a080dc10a0b010 */ - /*0548*/ @!P0 BFE R47, R46, 0x11f; /* 0xc00800008fa0b8bd */ - /*0550*/ @!P0 IADD R46.CC, R46, c[0x0][0x170]; /* 0x608400002e20b8ba */ - /*0558*/ @!P0 IADD.X R47, R47, c[0x0][0x174]; /* 0x608040002ea0bcbe */ - /*0560*/ @!P0 LD.E.64 R48, [R46]; /* 0xc58000000020b8c0 */ - /*0568*/ @!P0 DADD R10, R10, R10; /* 0xe38000000520282a */ - /*0570*/ @!P0 DADD R12, R12, R48; /* 0xe380000018203032 */ - /*0578*/ @!P0 IADD R48, R28, R43; /* 0xe080000015a070c2 */ - /* 0x08a080dca0b010a0 */ - /*0588*/ @!P0 SHF.L R46, RZ, 0x3, R48; /* 0xb7c0c00001a3fcb9 */ - /*0590*/ @!P0 BFE R47, R46, 0x11f; /* 0xc00800008fa0b8bd */ - /*0598*/ @!P0 IADD R46.CC, R46, c[0x0][0x170]; /* 0x608400002e20b8ba */ - /*05a0*/ @!P0 IADD.X R47, R47, c[0x0][0x174]; /* 0x608040002ea0bcbe */ - /*05a8*/ @!P0 LD.E.64 R48, [R46]; /* 0xc58000000020b8c0 */ - /*05b0*/ @!P0 DADD R12, R12, R48; /* 0xe380000018203032 */ - /*05b8*/ @!P0 IADD R48, R31, R43; /* 0xe080000015a07cc2 */ - /* 0x0880a010b010a010 */ - /*05c8*/ @!P0 IADD R43, R33, R43; /* 0xe080000015a084ae */ - /*05d0*/ @!P0 SHF.L R46, RZ, 0x3, R48; /* 0xb7c0c00001a3fcb9 */ - /*05d8*/ @!P0 BFE R47, R46, 0x11f; /* 0xc00800008fa0b8bd */ - /*05e0*/ @!P0 IADD R48.CC, R46, c[0x0][0x170]; /* 0x608400002e20b8c2 */ - /*05e8*/ @!P0 IADD.X R49, R47, c[0x0][0x174]; /* 0x608040002ea0bcc6 */ - /*05f0*/ @!P0 SHF.L R43, RZ, 0x3, R43; /* 0xb7c0ac0001a3fcad */ - /*05f8*/ @!P0 LD.E.64 R46, [R48]; /* 0xc58000000020c0b8 */ - /* 0x0880909c80a080d8 */ - /*0608*/ @!P0 IADD R52.CC, R43, c[0x0][0x170]; /* 0x608400002e20acd2 */ - /*0610*/ @!P0 DADD R46, R12, R46; /* 0xe3800000172030ba */ - /*0618*/ @!P0 BFE R12, R43, 0x11f; /* 0xc00800008fa0ac31 */ - /*0620*/ @!P0 IADD.X R53, R12, c[0x0][0x174]; /* 0x608040002ea030d6 */ - /*0628*/ @!P0 IADD R12.CC, R38, c[0x0][0x178]; /* 0x608400002f209832 */ - /*0630*/ @!P0 LD.E.64 R48, [R52]; /* 0xc58000000020d0c0 */ - /*0638*/ @!P0 IADD.X R13, R45, c[0x0][0x17c]; /* 0x608040002fa0b436 */ - /* 0x08cc8c10a48090b0 */ - /*0648*/ @!P0 IADD R50.CC, R38, c[0x0][0x168]; /* 0x608400002d2098ca */ - /*0650*/ @!P0 IADD.X R51, R45, c[0x0][0x16c]; /* 0x608040002da0b4ce */ - /*0658*/ @!P0 DADD R46, R46, R48; /* 0xe38000001820b8ba */ - /*0660*/ @!P0 DFMA R48, R4, R14, R16; /* 0xdb804000072010c2 */ - /*0668*/ @!P0 LD.E.64 R16, [R12]; /* 0xc580000000203040 */ - /*0670*/ @!P0 DFMA R48, R2, R46, R48; /* 0xdb80c000172008c2 */ - /*0678*/ @!P0 LD.E.64 R14, [R50]; /* 0xc58000000020c838 */ - /* 0x08a0b8b0a000a4a4 */ - /*0688*/ @!P0 DADD R10, R10, -R16; /* 0xe38100000820282a */ - /*0690*/ @!P0 DFMA R10, R48, R14, R10; /* 0xdb8028000720c02a */ - /*0698*/ @!P0 ST.E.64 [R12], R10; /* 0xe580000000203028 */ - /*06a0*/ IADD R34, R34, 0x20; /* 0xc0800000101c8889 */ - /*06a8*/ ISETP.LT.AND P0, PT, R34, c[0x0][0x144], PT; /* 0x5b181c00289c881e */ - /*06b0*/ @P0 BRA 0x190; /* 0x12007ffd6c00003c */ - /*06b8*/ IADD R21, R21, 0x1; /* 0xc0800000009c5455 */ - /* 0x08b810b8b010b8b0 */ - /*06c8*/ ISETP.EQ.AND P0, PT, R21, c[0x0][0x14c], PT; /* 0x5b281c00299c541e */ - /*06d0*/ @!P0 BRA 0x110; /* 0x12007ffd1c20003c */ - /*06d8*/ ISETP.NE.AND P0, PT, R44, R41, PT; /* 0xdb581c00149cb01e */ - /*06e0*/ IADD R44, R44, 0x1; /* 0xc0800000009cb0b1 */ - /*06e8*/ @P0 BRA 0xd0; /* 0x12007ffcf000003c */ - /*06f0*/ MOV RZ, RZ; /* 0xe4c03c007f9c03fe */ - /*06f8*/ EXIT ; /* 0x18000000001c003c */ - /*0700*/ BRA 0x700; /* 0x12007ffffc1c003c */ - /*0708*/ NOP; /* 0x85800000001c3c02 */ - /*0710*/ NOP; /* 0x85800000001c3c02 */ - /*0718*/ NOP; /* 0x85800000001c3c02 */ - /*0720*/ NOP; /* 0x85800000001c3c02 */ - /*0728*/ NOP; /* 0x85800000001c3c02 */ - /*0730*/ NOP; /* 0x85800000001c3c02 */ - /*0738*/ NOP; /* 0x85800000001c3c02 */ - .................................. - - diff --git a/examples/stencil/Makefile b/examples/stencil/Makefile index 097cd597..47cbf5d5 100644 --- a/examples/stencil/Makefile +++ b/examples/stencil/Makefile @@ -2,7 +2,7 @@ EXAMPLE=stencil CPP_SRC=stencil.cpp stencil_serial.cpp ISPC_SRC=stencil.ispc -ISPC_IA_TARGETS=sse2,sse4-x2,avx-x2 +ISPC_IA_TARGETS=avx ISPC_ARM_TARGETS=neon include ../common.mk diff --git a/examples/stencil/stencil.cpp b/examples/stencil/stencil.cpp index 593d901f..93d11b7e 100644 --- a/examples/stencil/stencil.cpp +++ b/examples/stencil/stencil.cpp @@ -85,6 +85,7 @@ int main() { // the minimum time of three runs. // double minTimeISPC = 1e30; +#if 0 for (int i = 0; i < 3; ++i) { reset_and_start_timer(); loop_stencil_ispc(0, 6, width, Nx - width, width, Ny - width, @@ -95,6 +96,7 @@ int main() { } printf("[stencil ispc 1 core]:\t\t[%.3f] million cycles\n", minTimeISPC); +#endif InitData(Nx, Ny, Nz, Aispc, vsq); diff --git a/examples/stencil/stencil.cubin b/examples/stencil/stencil.cubin deleted file mode 100644 index db1b1bca..00000000 Binary files a/examples/stencil/stencil.cubin and /dev/null differ diff --git a/examples/stencil/stencil1.cubin b/examples/stencil/stencil1.cubin deleted file mode 100644 index 8b7d18d9..00000000 Binary files a/examples/stencil/stencil1.cubin and /dev/null differ diff --git a/examples/stencil/stencil2.cubin b/examples/stencil/stencil2.cubin deleted file mode 100644 index 64a9d3ea..00000000 Binary files a/examples/stencil/stencil2.cubin and /dev/null differ diff --git a/examples/stencil/stencil_avx.bc b/examples/stencil/stencil_avx.bc deleted file mode 100644 index 7a63ccce..00000000 Binary files a/examples/stencil/stencil_avx.bc and /dev/null differ diff --git a/examples/stencil/stencil_cu b/examples/stencil/stencil_cu deleted file mode 100755 index 40e4a9ba..00000000 Binary files a/examples/stencil/stencil_cu and /dev/null differ diff --git a/examples/stencil/stencil_cu.bc b/examples/stencil/stencil_cu.bc deleted file mode 100644 index 5d9aecbe..00000000 Binary files a/examples/stencil/stencil_cu.bc and /dev/null differ diff --git a/examples/stencil/stencil_cu.ll b/examples/stencil/stencil_cu.ll deleted file mode 100644 index 6ea8748c..00000000 --- a/examples/stencil/stencil_cu.ll +++ /dev/null @@ -1,762 +0,0 @@ -; ModuleID = 'stencil_cu.bc' -target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" -target triple = "x86_64-unknown-linux-gnu" - -; Function Attrs: nounwind -declare i8* @ISPCAlloc(i8**, i64, i32) #0 - -; Function Attrs: nounwind -declare void @ISPCLaunch(i8**, i8*, i8*, i32, i32, i32) #0 - -; Function Attrs: nounwind -declare void @ISPCSync(i8*) #0 - -; Function Attrs: nounwind readnone -declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) #1 - -; Function Attrs: nounwind readonly -declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8*, <4 x double>) #2 - -; Function Attrs: nounwind -declare void @llvm.x86.avx.maskstore.pd.256(i8*, <4 x double>, <4 x double>) #0 - -; Function Attrs: nounwind -define internal fastcc void @stencil_step___uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_(i32 %x0, i32 %x1, i32 %y0, i32 %y1, i32 %z0, i32 %z1, i32 %Nx, i32 %Ny, double* noalias nocapture %coef, double* noalias %vsq, double* noalias %Ain, double* noalias %Aout, <8 x i32> %__mask) #3 { -allocas: - %floatmask.i = bitcast <8 x i32> %__mask to <8 x float> - %v.i = tail call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask.i) #1 - %cmp.i = icmp eq i32 %v.i, 255 - %mul_Nx_load_Ny_load = mul i32 %Ny, %Nx - %coef_load_offset_load = load double* %coef, align 8 - %coef_load18_offset = getelementptr double* %coef, i64 1 - %coef_load18_offset_load = load double* %coef_load18_offset, align 8 - %coef_load21_offset = getelementptr double* %coef, i64 2 - %coef_load21_offset_load = load double* %coef_load21_offset, align 8 - %coef_load24_offset = getelementptr double* %coef, i64 3 - %coef_load24_offset_load = load double* %coef_load24_offset, align 8 - %less_z_load_z1_load260 = icmp slt i32 %z0, %z1 - br i1 %cmp.i, label %for_test.preheader, label %for_test264.preheader - -for_test264.preheader: ; preds = %allocas - br i1 %less_z_load_z1_load260, label %for_test275.preheader.lr.ph, label %for_exit - -for_test275.preheader.lr.ph: ; preds = %for_test264.preheader - %less_y_load282_y1_load283264 = icmp slt i32 %y0, %y1 - %less_xb_load293_x1_load294262 = icmp slt i32 %x0, %x1 - %x1_load463_broadcast_init = insertelement <8 x i32> undef, i32 %x1, i32 0 - %x1_load463_broadcast = shufflevector <8 x i32> %x1_load463_broadcast_init, <8 x i32> undef, <8 x i32> zeroinitializer - %mul__Nx_load382 = shl i32 %Nx, 1 - %mul__Nx_load431 = mul i32 %Nx, 3 - %mul__Nx_load390 = mul i32 %Nx, -2 - %mul__Nx_load439 = mul i32 %Nx, -3 - %mul__Nxy_load399 = shl i32 %mul_Nx_load_Ny_load, 1 - %mul__Nxy_load448 = mul i32 %mul_Nx_load_Ny_load, 3 - %mul__Nxy_load407 = mul i32 %mul_Nx_load_Ny_load, -2 - %mul__Nxy_load456 = mul i32 %mul_Nx_load_Ny_load, -3 - %Ain_load327_ptr2int_2void = bitcast double* %Ain to i8* - %mask0.i.i201 = shufflevector <8 x i32> %__mask, <8 x i32> undef, <8 x i32> - %mask1.i.i202 = shufflevector <8 x i32> %__mask, <8 x i32> undef, <8 x i32> - %mask0d.i.i203 = bitcast <8 x i32> %mask0.i.i201 to <4 x double> - %mask1d.i.i204 = bitcast <8 x i32> %mask1.i.i202 to <4 x double> - %coef1_load315_broadcast_init = insertelement <8 x double> undef, double %coef_load18_offset_load, i32 0 - %coef0_load306_broadcast_init = insertelement <8 x double> undef, double %coef_load_offset_load, i32 0 - %coef2_load364_broadcast_init = insertelement <8 x double> undef, double %coef_load21_offset_load, i32 0 - %coef1_load315_broadcast = shufflevector <8 x double> %coef1_load315_broadcast_init, <8 x double> undef, <8 x i32> zeroinitializer - %coef0_load306_broadcast = shufflevector <8 x double> %coef0_load306_broadcast_init, <8 x double> undef, <8 x i32> zeroinitializer - %coef3_load413_broadcast_init = insertelement <8 x double> undef, double %coef_load24_offset_load, i32 0 - %coef2_load364_broadcast = shufflevector <8 x double> %coef2_load364_broadcast_init, <8 x double> undef, <8 x i32> zeroinitializer - %coef3_load413_broadcast = shufflevector <8 x double> %coef3_load413_broadcast_init, <8 x double> undef, <8 x i32> zeroinitializer - %Aout_load488_ptr2int_2void = bitcast double* %Aout to i8* - %vsq_load494_ptr2int_2void = bitcast double* %vsq to i8* - br label %for_test275.preheader - -for_test.preheader: ; preds = %allocas - br i1 %less_z_load_z1_load260, label %for_test30.preheader.lr.ph, label %for_exit - -for_test30.preheader.lr.ph: ; preds = %for_test.preheader - %less_y_load_y1_load258 = icmp slt i32 %y0, %y1 - %less_xb_load_x1_load256 = icmp slt i32 %x0, %x1 - %x1_load199_broadcast_init = insertelement <8 x i32> undef, i32 %x1, i32 0 - %x1_load199_broadcast = shufflevector <8 x i32> %x1_load199_broadcast_init, <8 x i32> undef, <8 x i32> zeroinitializer - %mul__Nx_load119 = shl i32 %Nx, 1 - %mul__Nx_load167 = mul i32 %Nx, 3 - %mul__Nx_load127 = mul i32 %Nx, -2 - %mul__Nx_load175 = mul i32 %Nx, -3 - %mul__Nxy_load136 = shl i32 %mul_Nx_load_Ny_load, 1 - %mul__Nxy_load184 = mul i32 %mul_Nx_load_Ny_load, 3 - %mul__Nxy_load144 = mul i32 %mul_Nx_load_Ny_load, -2 - %mul__Nxy_load192 = mul i32 %mul_Nx_load_Ny_load, -3 - %Ain_load65_ptr2int_2void = bitcast double* %Ain to i8* - %coef1_load_broadcast_init = insertelement <8 x double> undef, double %coef_load18_offset_load, i32 0 - %coef0_load_broadcast_init = insertelement <8 x double> undef, double %coef_load_offset_load, i32 0 - %coef2_load_broadcast_init = insertelement <8 x double> undef, double %coef_load21_offset_load, i32 0 - %coef1_load_broadcast = shufflevector <8 x double> %coef1_load_broadcast_init, <8 x double> undef, <8 x i32> zeroinitializer - %coef0_load_broadcast = shufflevector <8 x double> %coef0_load_broadcast_init, <8 x double> undef, <8 x i32> zeroinitializer - %coef3_load_broadcast_init = insertelement <8 x double> undef, double %coef_load24_offset_load, i32 0 - %coef2_load_broadcast = shufflevector <8 x double> %coef2_load_broadcast_init, <8 x double> undef, <8 x i32> zeroinitializer - %coef3_load_broadcast = shufflevector <8 x double> %coef3_load_broadcast_init, <8 x double> undef, <8 x i32> zeroinitializer - %Aout_load219_ptr2int_2void = bitcast double* %Aout to i8* - %vsq_load_ptr2int_2void = bitcast double* %vsq to i8* - br label %for_test30.preheader - -for_test30.preheader: ; preds = %for_exit33, %for_test30.preheader.lr.ph - %z.0261 = phi i32 [ %z0, %for_test30.preheader.lr.ph ], [ %z_load242_plus1, %for_exit33 ] - br i1 %less_y_load_y1_load258, label %for_test37.preheader.lr.ph, label %for_exit33 - -for_test37.preheader.lr.ph: ; preds = %for_test30.preheader - %mul_z_load45_Nxy_load = mul i32 %z.0261, %mul_Nx_load_Ny_load - br i1 %less_xb_load_x1_load256, label %for_loop39.lr.ph.us, label %for_exit33 - -for_exit40.us: ; preds = %safe_if_after_true.us - %y_load241_plus1.us = add i32 %y.0259.us, 1 - %exitcond = icmp eq i32 %y_load241_plus1.us, %y1 - br i1 %exitcond, label %for_exit33, label %for_loop39.lr.ph.us - -for_loop39.us: ; preds = %for_loop39.lr.ph.us, %safe_if_after_true.us - %xb.0257.us = phi i32 [ %x0, %for_loop39.lr.ph.us ], [ %add_xb_load240_.us, %safe_if_after_true.us ] - %xb_load44_broadcast_init.us = insertelement <8 x i32> undef, i32 %xb.0257.us, i32 0 - %xb_load44_broadcast.us = shufflevector <8 x i32> %xb_load44_broadcast_init.us, <8 x i32> undef, <8 x i32> zeroinitializer - %add_xb_load44_broadcast_.us = add <8 x i32> %xb_load44_broadcast.us, - %less_x_load198_x1_load199_broadcast.us = icmp slt <8 x i32> %add_xb_load44_broadcast_.us, %x1_load199_broadcast - %"oldMask&test.us" = select <8 x i1> %less_x_load198_x1_load199_broadcast.us, <8 x i32> , <8 x i32> zeroinitializer - %floatmask.i244.us = bitcast <8 x i32> %"oldMask&test.us" to <8 x float> - %v.i245.us = tail call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask.i244.us) #1 - %cmp.i246.us = icmp eq i32 %v.i245.us, 0 - br i1 %cmp.i246.us, label %safe_if_after_true.us, label %safe_if_run_true.us - -safe_if_run_true.us: ; preds = %for_loop39.us - %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast.elt0.us = add i32 %xb.0257.us, %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.us - %scaled_varying.elt0.us = shl i32 %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast.elt0.us, 3 - %"varying+const_offsets.elt0.us" = add i32 %scaled_varying.elt0.us, -8 - %0 = sext i32 %"varying+const_offsets.elt0.us" to i64 - %ptr.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %0, !filename !0, !first_line !1, !first_column !2, !last_line !1, !last_column !3 - %ptr_cast_for_load.us = bitcast i8* %ptr.us to <8 x double>* - %ptr_masked_load521.us = load <8 x double>* %ptr_cast_for_load.us, align 8, !filename !0, !first_line !1, !first_column !2, !last_line !1, !last_column !3 - %"varying+const_offsets529.elt0.us" = add i32 %scaled_varying.elt0.us, 8 - %1 = sext i32 %"varying+const_offsets529.elt0.us" to i64 - %ptr530.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %1, !filename !0, !first_line !1, !first_column !4, !last_line !1, !last_column !5 - %ptr_cast_for_load531.us = bitcast i8* %ptr530.us to <8 x double>* - %ptr530_masked_load532.us = load <8 x double>* %ptr_cast_for_load531.us, align 8, !filename !0, !first_line !1, !first_column !4, !last_line !1, !last_column !5 - %"varying+const_offsets540.elt0.us" = add i32 %scaled_varying.elt0.us, -16 - %2 = sext i32 %"varying+const_offsets540.elt0.us" to i64 - %ptr541.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %2, !filename !0, !first_line !6, !first_column !2, !last_line !6, !last_column !3 - %ptr_cast_for_load542.us = bitcast i8* %ptr541.us to <8 x double>* - %ptr541_masked_load543.us = load <8 x double>* %ptr_cast_for_load542.us, align 8, !filename !0, !first_line !6, !first_column !2, !last_line !6, !last_column !3 - %"varying+const_offsets551.elt0.us" = add i32 %scaled_varying.elt0.us, 16 - %3 = sext i32 %"varying+const_offsets551.elt0.us" to i64 - %ptr552.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %3, !filename !0, !first_line !6, !first_column !4, !last_line !6, !last_column !5 - %ptr_cast_for_load553.us = bitcast i8* %ptr552.us to <8 x double>* - %ptr552_masked_load554.us = load <8 x double>* %ptr_cast_for_load553.us, align 8, !filename !0, !first_line !6, !first_column !4, !last_line !6, !last_column !5 - %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast556_mul__Nx_load71_broadcast.elt0.us = add i32 %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast556.elt0.us, %xb.0257.us - %scaled_varying560.elt0.us = shl i32 %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast556_mul__Nx_load71_broadcast.elt0.us, 3 - %4 = sext i32 %scaled_varying560.elt0.us to i64 - %ptr562.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %4, !filename !0, !first_line !2, !first_column !7, !last_line !2, !last_column !8 - %ptr_cast_for_load563.us = bitcast i8* %ptr562.us to <8 x double>* - %ptr562_masked_load564.us = load <8 x double>* %ptr_cast_for_load563.us, align 8, !filename !0, !first_line !2, !first_column !7, !last_line !2, !last_column !8 - %add_Ain_load57_offset_load_Ain_load65_offset_load.us = fadd <8 x double> %ptr_masked_load521.us, %ptr530_masked_load532.us - %"varying+const_offsets572.elt0.us" = add i32 %scaled_varying.elt0.us, -24 - %5 = sext i32 %"varying+const_offsets572.elt0.us" to i64 - %ptr573.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %5, !filename !0, !first_line !9, !first_column !2, !last_line !9, !last_column !3 - %ptr_cast_for_load574.us = bitcast i8* %ptr573.us to <8 x double>* - %ptr573_masked_load575.us = load <8 x double>* %ptr_cast_for_load574.us, align 8, !filename !0, !first_line !9, !first_column !2, !last_line !9, !last_column !3 - %"varying+const_offsets583.elt0.us" = add i32 %scaled_varying.elt0.us, 24 - %6 = sext i32 %"varying+const_offsets583.elt0.us" to i64 - %ptr584.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %6, !filename !0, !first_line !9, !first_column !4, !last_line !9, !last_column !5 - %ptr_cast_for_load585.us = bitcast i8* %ptr584.us to <8 x double>* - %ptr584_masked_load586.us = load <8 x double>* %ptr_cast_for_load585.us, align 8, !filename !0, !first_line !9, !first_column !4, !last_line !9, !last_column !5 - %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast588_mul__Nx_load119_broadcast.elt0.us = add i32 %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast588.elt0.us, %xb.0257.us - %scaled_varying593.elt0.us = shl i32 %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast588_mul__Nx_load119_broadcast.elt0.us, 3 - %7 = sext i32 %scaled_varying593.elt0.us to i64 - %ptr595.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %7, !filename !0, !first_line !10, !first_column !11, !last_line !10, !last_column !1 - %ptr_cast_for_load596.us = bitcast i8* %ptr595.us to <8 x double>* - %ptr595_masked_load597.us = load <8 x double>* %ptr_cast_for_load596.us, align 8, !filename !0, !first_line !10, !first_column !11, !last_line !10, !last_column !1 - %add_Ain_load105_offset_load_Ain_load113_offset_load.us = fadd <8 x double> %ptr541_masked_load543.us, %ptr552_masked_load554.us - %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast599_mul__Nx_load79_broadcast.elt0.us = add i32 %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast599.elt0.us, %xb.0257.us - %scaled_varying604.elt0.us = shl i32 %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast599_mul__Nx_load79_broadcast.elt0.us, 3 - %8 = sext i32 %scaled_varying604.elt0.us to i64 - %ptr606.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %8, !filename !0, !first_line !2, !first_column !12, !last_line !2, !last_column !13 - %ptr_cast_for_load607.us = bitcast i8* %ptr606.us to <8 x double>* - %ptr606_masked_load608.us = load <8 x double>* %ptr_cast_for_load607.us, align 8, !filename !0, !first_line !2, !first_column !12, !last_line !2, !last_column !13 - %add_add_Ain_load57_offset_load_Ain_load65_offset_load_Ain_load73_offset_load.us = fadd <8 x double> %add_Ain_load57_offset_load_Ain_load65_offset_load.us, %ptr562_masked_load564.us - %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast610_mul__Nx_load167_broadcast.elt0.us = add i32 %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast610.elt0.us, %xb.0257.us - %scaled_varying615.elt0.us = shl i32 %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast610_mul__Nx_load167_broadcast.elt0.us, 3 - %9 = sext i32 %scaled_varying615.elt0.us to i64 - %ptr617.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %9, !filename !0, !first_line !14, !first_column !11, !last_line !14, !last_column !1 - %ptr_cast_for_load618.us = bitcast i8* %ptr617.us to <8 x double>* - %ptr617_masked_load619.us = load <8 x double>* %ptr_cast_for_load618.us, align 8, !filename !0, !first_line !14, !first_column !11, !last_line !14, !last_column !1 - %add_Ain_load153_offset_load_Ain_load161_offset_load.us = fadd <8 x double> %ptr573_masked_load575.us, %ptr584_masked_load586.us - %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast621_mul__Nx_load127_broadcast.elt0.us = add i32 %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast621.elt0.us, %xb.0257.us - %scaled_varying626.elt0.us = shl i32 %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast621_mul__Nx_load127_broadcast.elt0.us, 3 - %10 = sext i32 %scaled_varying626.elt0.us to i64 - %ptr628.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %10, !filename !0, !first_line !10, !first_column !6, !last_line !10, !last_column !15 - %ptr_cast_for_load629.us = bitcast i8* %ptr628.us to <8 x double>* - %ptr628_masked_load630.us = load <8 x double>* %ptr_cast_for_load629.us, align 8, !filename !0, !first_line !10, !first_column !6, !last_line !10, !last_column !15 - %add_add_Ain_load105_offset_load_Ain_load113_offset_load_Ain_load121_offset_load.us = fadd <8 x double> %add_Ain_load105_offset_load_Ain_load113_offset_load.us, %ptr595_masked_load597.us - %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast632_mul__Nxy_load88_broadcast.elt0.us = add i32 %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast632.elt0.us, %xb.0257.us - %scaled_varying637.elt0.us = shl i32 %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast632_mul__Nxy_load88_broadcast.elt0.us, 3 - %11 = sext i32 %scaled_varying637.elt0.us to i64 - %ptr639.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %11, !filename !0, !first_line !12, !first_column !11, !last_line !12, !last_column !1 - %ptr_cast_for_load640.us = bitcast i8* %ptr639.us to <8 x double>* - %ptr639_masked_load641.us = load <8 x double>* %ptr_cast_for_load640.us, align 8, !filename !0, !first_line !12, !first_column !11, !last_line !12, !last_column !1 - %add_add_add_Ain_load57_offset_load_Ain_load65_offset_load_Ain_load73_offset_load_Ain_load81_offset_load.us = fadd <8 x double> %add_add_Ain_load57_offset_load_Ain_load65_offset_load_Ain_load73_offset_load.us, %ptr606_masked_load608.us - %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast643_mul__Nx_load175_broadcast.elt0.us = add i32 %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast643.elt0.us, %xb.0257.us - %scaled_varying648.elt0.us = shl i32 %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast643_mul__Nx_load175_broadcast.elt0.us, 3 - %12 = sext i32 %scaled_varying648.elt0.us to i64 - %ptr650.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %12, !filename !0, !first_line !14, !first_column !6, !last_line !14, !last_column !15 - %ptr_cast_for_load651.us = bitcast i8* %ptr650.us to <8 x double>* - %ptr650_masked_load652.us = load <8 x double>* %ptr_cast_for_load651.us, align 8, !filename !0, !first_line !14, !first_column !6, !last_line !14, !last_column !15 - %add_add_Ain_load153_offset_load_Ain_load161_offset_load_Ain_load169_offset_load.us = fadd <8 x double> %add_Ain_load153_offset_load_Ain_load161_offset_load.us, %ptr617_masked_load619.us - %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast654_mul__Nxy_load136_broadcast.elt0.us = add i32 %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast654.elt0.us, %xb.0257.us - %scaled_varying659.elt0.us = shl i32 %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast654_mul__Nxy_load136_broadcast.elt0.us, 3 - %13 = sext i32 %scaled_varying659.elt0.us to i64 - %ptr661.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %13, !filename !0, !first_line !16, !first_column !11, !last_line !16, !last_column !1 - %ptr_cast_for_load662.us = bitcast i8* %ptr661.us to <8 x double>* - %ptr661_masked_load663.us = load <8 x double>* %ptr_cast_for_load662.us, align 8, !filename !0, !first_line !16, !first_column !11, !last_line !16, !last_column !1 - %add_add_add_Ain_load105_offset_load_Ain_load113_offset_load_Ain_load121_offset_load_Ain_load129_offset_load.us = fadd <8 x double> %add_add_Ain_load105_offset_load_Ain_load113_offset_load_Ain_load121_offset_load.us, %ptr628_masked_load630.us - %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast665_mul__Nxy_load96_broadcast.elt0.us = add i32 %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast665.elt0.us, %xb.0257.us - %scaled_varying670.elt0.us = shl i32 %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast665_mul__Nxy_load96_broadcast.elt0.us, 3 - %14 = sext i32 %scaled_varying670.elt0.us to i64 - %ptr672.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %14, !filename !0, !first_line !12, !first_column !6, !last_line !12, !last_column !15 - %ptr_cast_for_load673.us = bitcast i8* %ptr672.us to <8 x double>* - %ptr672_masked_load674.us = load <8 x double>* %ptr_cast_for_load673.us, align 8, !filename !0, !first_line !12, !first_column !6, !last_line !12, !last_column !15 - %add_add_add_add_Ain_load57_offset_load_Ain_load65_offset_load_Ain_load73_offset_load_Ain_load81_offset_load_Ain_load89_offset_load.us = fadd <8 x double> %add_add_add_Ain_load57_offset_load_Ain_load65_offset_load_Ain_load73_offset_load_Ain_load81_offset_load.us, %ptr639_masked_load641.us - %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast676_mul__Nxy_load184_broadcast.elt0.us = add i32 %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast676.elt0.us, %xb.0257.us - %scaled_varying681.elt0.us = shl i32 %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast676_mul__Nxy_load184_broadcast.elt0.us, 3 - %15 = sext i32 %scaled_varying681.elt0.us to i64 - %ptr683.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %15, !filename !0, !first_line !17, !first_column !11, !last_line !17, !last_column !1 - %ptr_cast_for_load684.us = bitcast i8* %ptr683.us to <8 x double>* - %ptr683_masked_load685.us = load <8 x double>* %ptr_cast_for_load684.us, align 8, !filename !0, !first_line !17, !first_column !11, !last_line !17, !last_column !1 - %add_add_add_Ain_load153_offset_load_Ain_load161_offset_load_Ain_load169_offset_load_Ain_load177_offset_load.us = fadd <8 x double> %add_add_Ain_load153_offset_load_Ain_load161_offset_load_Ain_load169_offset_load.us, %ptr650_masked_load652.us - %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast687_mul__Nxy_load144_broadcast.elt0.us = add i32 %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast687.elt0.us, %xb.0257.us - %scaled_varying692.elt0.us = shl i32 %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast687_mul__Nxy_load144_broadcast.elt0.us, 3 - %16 = sext i32 %scaled_varying692.elt0.us to i64 - %ptr694.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %16, !filename !0, !first_line !16, !first_column !6, !last_line !16, !last_column !15 - %ptr_cast_for_load695.us = bitcast i8* %ptr694.us to <8 x double>* - %ptr694_masked_load696.us = load <8 x double>* %ptr_cast_for_load695.us, align 8, !filename !0, !first_line !16, !first_column !6, !last_line !16, !last_column !15 - %add_add_add_add_Ain_load105_offset_load_Ain_load113_offset_load_Ain_load121_offset_load_Ain_load129_offset_load_Ain_load137_offset_load.us = fadd <8 x double> %add_add_add_Ain_load105_offset_load_Ain_load113_offset_load_Ain_load121_offset_load_Ain_load129_offset_load.us, %ptr661_masked_load663.us - %add_add_add_add_add_Ain_load57_offset_load_Ain_load65_offset_load_Ain_load73_offset_load_Ain_load81_offset_load_Ain_load89_offset_load_Ain_load97_offset_load.us = fadd <8 x double> %add_add_add_add_Ain_load57_offset_load_Ain_load65_offset_load_Ain_load73_offset_load_Ain_load81_offset_load_Ain_load89_offset_load.us, %ptr672_masked_load674.us - %17 = sext i32 %scaled_varying.elt0.us to i64 - %ptr705.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %17, !filename !0, !first_line !8, !first_column !18, !last_line !8, !last_column !19 - %ptr_cast_for_load706.us = bitcast i8* %ptr705.us to <8 x double>* - %ptr705_masked_load707.us = load <8 x double>* %ptr_cast_for_load706.us, align 8, !filename !0, !first_line !8, !first_column !18, !last_line !8, !last_column !19 - %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast709_mul__Nxy_load192_broadcast.elt0.us = add i32 %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast709.elt0.us, %xb.0257.us - %scaled_varying714.elt0.us = shl i32 %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast709_mul__Nxy_load192_broadcast.elt0.us, 3 - %18 = sext i32 %scaled_varying714.elt0.us to i64 - %ptr716.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %18, !filename !0, !first_line !17, !first_column !6, !last_line !17, !last_column !15 - %ptr_cast_for_load717.us = bitcast i8* %ptr716.us to <8 x double>* - %ptr716_masked_load718.us = load <8 x double>* %ptr_cast_for_load717.us, align 8, !filename !0, !first_line !17, !first_column !6, !last_line !17, !last_column !15 - %add_add_add_add_Ain_load153_offset_load_Ain_load161_offset_load_Ain_load169_offset_load_Ain_load177_offset_load_Ain_load185_offset_load.us = fadd <8 x double> %add_add_add_Ain_load153_offset_load_Ain_load161_offset_load_Ain_load169_offset_load_Ain_load177_offset_load.us, %ptr683_masked_load685.us - %add_add_add_add_add_Ain_load105_offset_load_Ain_load113_offset_load_Ain_load121_offset_load_Ain_load129_offset_load_Ain_load137_offset_load_Ain_load145_offset_load.us = fadd <8 x double> %add_add_add_add_Ain_load105_offset_load_Ain_load113_offset_load_Ain_load121_offset_load_Ain_load129_offset_load_Ain_load137_offset_load.us, %ptr694_masked_load696.us - %mul_coef1_load_broadcast_add_add_add_add_add_Ain_load57_offset_load_Ain_load65_offset_load_Ain_load73_offset_load_Ain_load81_offset_load_Ain_load89_offset_load_Ain_load97_offset_load.us = fmul <8 x double> %coef1_load_broadcast, %add_add_add_add_add_Ain_load57_offset_load_Ain_load65_offset_load_Ain_load73_offset_load_Ain_load81_offset_load_Ain_load89_offset_load_Ain_load97_offset_load.us - %mul_coef0_load_broadcast_Ain_load_offset_load.us = fmul <8 x double> %coef0_load_broadcast, %ptr705_masked_load707.us - %add_add_add_add_add_Ain_load153_offset_load_Ain_load161_offset_load_Ain_load169_offset_load_Ain_load177_offset_load_Ain_load185_offset_load_Ain_load193_offset_load.us = fadd <8 x double> %add_add_add_add_Ain_load153_offset_load_Ain_load161_offset_load_Ain_load169_offset_load_Ain_load177_offset_load_Ain_load185_offset_load.us, %ptr716_masked_load718.us - %mul_coef2_load_broadcast_add_add_add_add_add_Ain_load105_offset_load_Ain_load113_offset_load_Ain_load121_offset_load_Ain_load129_offset_load_Ain_load137_offset_load_Ain_load145_offset_load.us = fmul <8 x double> %coef2_load_broadcast, %add_add_add_add_add_Ain_load105_offset_load_Ain_load113_offset_load_Ain_load121_offset_load_Ain_load129_offset_load_Ain_load137_offset_load_Ain_load145_offset_load.us - %add_mul_coef0_load_broadcast_Ain_load_offset_load_mul_coef1_load_broadcast_add_add_add_add_add_Ain_load57_offset_load_Ain_load65_offset_load_Ain_load73_offset_load_Ain_load81_offset_load_Ain_load89_offset_load_Ain_load97_offset_load.us = fadd <8 x double> %mul_coef1_load_broadcast_add_add_add_add_add_Ain_load57_offset_load_Ain_load65_offset_load_Ain_load73_offset_load_Ain_load81_offset_load_Ain_load89_offset_load_Ain_load97_offset_load.us, %mul_coef0_load_broadcast_Ain_load_offset_load.us - %mul_coef3_load_broadcast_add_add_add_add_add_Ain_load153_offset_load_Ain_load161_offset_load_Ain_load169_offset_load_Ain_load177_offset_load_Ain_load185_offset_load_Ain_load193_offset_load.us = fmul <8 x double> %coef3_load_broadcast, %add_add_add_add_add_Ain_load153_offset_load_Ain_load161_offset_load_Ain_load169_offset_load_Ain_load177_offset_load_Ain_load185_offset_load_Ain_load193_offset_load.us - %add_add_mul_coef0_load_broadcast_Ain_load_offset_load_mul_coef1_load_broadcast_add_add_add_add_add_Ain_load57_offset_load_Ain_load65_offset_load_Ain_load73_offset_load_Ain_load81_offset_load_Ain_load89_offset_load_Ain_load97_offset_load_mul_coef2_load_broadcast_add_add_add_add_add_Ain_load105_offset_load_Ain_load113_offset_load_Ain_load121_offset_load_Ain_load129_offset_load_Ain_load137_offset_load_Ain_load145_offset_load.us = fadd <8 x double> %mul_coef2_load_broadcast_add_add_add_add_add_Ain_load105_offset_load_Ain_load113_offset_load_Ain_load121_offset_load_Ain_load129_offset_load_Ain_load137_offset_load_Ain_load145_offset_load.us, %add_mul_coef0_load_broadcast_Ain_load_offset_load_mul_coef1_load_broadcast_add_add_add_add_add_Ain_load57_offset_load_Ain_load65_offset_load_Ain_load73_offset_load_Ain_load81_offset_load_Ain_load89_offset_load_Ain_load97_offset_load.us - %add_add_add_mul_coef0_load_broadcast_Ain_load_offset_load_mul_coef1_load_broadcast_add_add_add_add_add_Ain_load57_offset_load_Ain_load65_offset_load_Ain_load73_offset_load_Ain_load81_offset_load_Ain_load89_offset_load_Ain_load97_offset_load_mul_coef2_load_broadcast_add_add_add_add_add_Ain_load105_offset_load_Ain_load113_offset_load_Ain_load121_offset_load_Ain_load129_offset_load_Ain_load137_offset_load_Ain_load145_offset_load_mul_coef3_load_broadcast_add_add_add_add_add_Ain_load153_offset_load_Ain_load161_offset_load_Ain_load169_offset_load_Ain_load177_offset_load_Ain_load185_offset_load_Ain_load193_offset_load.us = fadd <8 x double> %add_add_mul_coef0_load_broadcast_Ain_load_offset_load_mul_coef1_load_broadcast_add_add_add_add_add_Ain_load57_offset_load_Ain_load65_offset_load_Ain_load73_offset_load_Ain_load81_offset_load_Ain_load89_offset_load_Ain_load97_offset_load_mul_coef2_load_broadcast_add_add_add_add_add_Ain_load105_offset_load_Ain_load113_offset_load_Ain_load121_offset_load_Ain_load129_offset_load_Ain_load137_offset_load_Ain_load145_offset_load.us, %mul_coef3_load_broadcast_add_add_add_add_add_Ain_load153_offset_load_Ain_load161_offset_load_Ain_load169_offset_load_Ain_load177_offset_load_Ain_load185_offset_load_Ain_load193_offset_load.us - %mask0.i.i234.us = shufflevector <8 x i32> %"oldMask&test.us", <8 x i32> undef, <8 x i32> - %mask1.i.i235.us = shufflevector <8 x i32> %"oldMask&test.us", <8 x i32> undef, <8 x i32> - %mask0d.i.i236.us = bitcast <8 x i32> %mask0.i.i234.us to <4 x double> - %mask1d.i.i237.us = bitcast <8 x i32> %mask1.i.i235.us to <4 x double> - %val0d.i.i238.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr705.us, <4 x double> %mask0d.i.i236.us) #0 - %ptr727.sum.us = add i64 %17, 32 - %ptr1.i.i239.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %ptr727.sum.us - %val1d.i.i240.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i239.us, <4 x double> %mask1d.i.i237.us) #0 - %vald.i.i241.us = shufflevector <4 x double> %val0d.i.i238.us, <4 x double> %val1d.i.i240.us, <8 x i32> - %mul__Ain_load211_offset_load.us = fmul <8 x double> %vald.i.i241.us, - %ptr736.us = getelementptr i8* %Aout_load219_ptr2int_2void, i64 %17, !filename !0, !first_line !20, !first_column !21, !last_line !20, !last_column !22 - %val0d.i.i228.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr736.us, <4 x double> %mask0d.i.i236.us) #0 - %ptr1.i.i229.us = getelementptr i8* %Aout_load219_ptr2int_2void, i64 %ptr727.sum.us - %val1d.i.i230.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i229.us, <4 x double> %mask1d.i.i237.us) #0 - %vald.i.i231.us = shufflevector <4 x double> %val0d.i.i228.us, <4 x double> %val1d.i.i230.us, <8 x i32> - %sub_mul__Ain_load211_offset_load_Aout_load219_offset_load.us = fsub <8 x double> %mul__Ain_load211_offset_load.us, %vald.i.i231.us - %ptr745.us = getelementptr i8* %vsq_load_ptr2int_2void, i64 %17, !filename !0, !first_line !23, !first_column !24, !last_line !23, !last_column !7 - %val0d.i.i218.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr745.us, <4 x double> %mask0d.i.i236.us) #0 - %ptr1.i.i219.us = getelementptr i8* %vsq_load_ptr2int_2void, i64 %ptr727.sum.us - %val1d.i.i220.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i219.us, <4 x double> %mask1d.i.i237.us) #0 - %vald.i.i221.us = shufflevector <4 x double> %val0d.i.i218.us, <4 x double> %val1d.i.i220.us, <8 x i32> - %mul_vsq_load_offset_load_div_load.us = fmul <8 x double> %add_add_add_mul_coef0_load_broadcast_Ain_load_offset_load_mul_coef1_load_broadcast_add_add_add_add_add_Ain_load57_offset_load_Ain_load65_offset_load_Ain_load73_offset_load_Ain_load81_offset_load_Ain_load89_offset_load_Ain_load97_offset_load_mul_coef2_load_broadcast_add_add_add_add_add_Ain_load105_offset_load_Ain_load113_offset_load_Ain_load121_offset_load_Ain_load129_offset_load_Ain_load137_offset_load_Ain_load145_offset_load_mul_coef3_load_broadcast_add_add_add_add_add_Ain_load153_offset_load_Ain_load161_offset_load_Ain_load169_offset_load_Ain_load177_offset_load_Ain_load185_offset_load_Ain_load193_offset_load.us, %vald.i.i221.us - %add_sub_mul__Ain_load211_offset_load_Aout_load219_offset_load_mul_vsq_load_offset_load_div_load.us = fadd <8 x double> %sub_mul__Ain_load211_offset_load_Aout_load219_offset_load.us, %mul_vsq_load_offset_load_div_load.us - %val0.i.i.us = shufflevector <8 x double> %add_sub_mul__Ain_load211_offset_load_Aout_load219_offset_load_mul_vsq_load_offset_load_div_load.us, <8 x double> undef, <4 x i32> - %val1.i.i.us = shufflevector <8 x double> %add_sub_mul__Ain_load211_offset_load_Aout_load219_offset_load_mul_vsq_load_offset_load_div_load.us, <8 x double> undef, <4 x i32> - call void @llvm.x86.avx.maskstore.pd.256(i8* %ptr736.us, <4 x double> %mask0d.i.i236.us, <4 x double> %val0.i.i.us) #0 - call void @llvm.x86.avx.maskstore.pd.256(i8* %ptr1.i.i229.us, <4 x double> %mask1d.i.i237.us, <4 x double> %val1.i.i.us) #0 - br label %safe_if_after_true.us - -safe_if_after_true.us: ; preds = %safe_if_run_true.us, %for_loop39.us - %add_xb_load240_.us = add i32 %xb.0257.us, 8 - %less_xb_load_x1_load.us = icmp slt i32 %add_xb_load240_.us, %x1 - br i1 %less_xb_load_x1_load.us, label %for_loop39.us, label %for_exit40.us - -for_loop39.lr.ph.us: ; preds = %for_exit40.us, %for_test37.preheader.lr.ph - %y.0259.us = phi i32 [ %y_load241_plus1.us, %for_exit40.us ], [ %y0, %for_test37.preheader.lr.ph ] - %mul_y_load46_Nx_load47.us = mul i32 %y.0259.us, %Nx - %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.us = add i32 %mul_y_load46_Nx_load47.us, %mul_z_load45_Nxy_load - %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast556.elt0.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.us, %Nx - %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast588.elt0.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.us, %mul__Nx_load119 - %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast599.elt0.us = sub i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.us, %Nx - %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast610.elt0.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.us, %mul__Nx_load167 - %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast621.elt0.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.us, %mul__Nx_load127 - %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast632.elt0.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.us, %mul_Nx_load_Ny_load - %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast643.elt0.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.us, %mul__Nx_load175 - %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast654.elt0.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.us, %mul__Nxy_load136 - %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast665.elt0.us = sub i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.us, %mul_Nx_load_Ny_load - %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast676.elt0.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.us, %mul__Nxy_load184 - %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast687.elt0.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.us, %mul__Nxy_load144 - %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast709.elt0.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.us, %mul__Nxy_load192 - br label %for_loop39.us - -for_exit: ; preds = %for_exit278, %for_exit33, %for_test.preheader, %for_test264.preheader - ret void - -for_exit33: ; preds = %for_exit40.us, %for_test37.preheader.lr.ph, %for_test30.preheader - %z_load242_plus1 = add i32 %z.0261, 1 - %exitcond269 = icmp eq i32 %z_load242_plus1, %z1 - br i1 %exitcond269, label %for_exit, label %for_test30.preheader - -for_test275.preheader: ; preds = %for_exit278, %for_test275.preheader.lr.ph - %z269.0268 = phi i32 [ %z0, %for_test275.preheader.lr.ph ], [ %z_load518_plus1, %for_exit278 ] - br i1 %less_y_load282_y1_load283264, label %for_test286.preheader.lr.ph, label %for_exit278 - -for_test286.preheader.lr.ph: ; preds = %for_test275.preheader - %mul_z_load300_Nxy_load301 = mul i32 %z269.0268, %mul_Nx_load_Ny_load - br i1 %less_xb_load293_x1_load294262, label %for_loop288.lr.ph.us, label %for_exit278 - -for_exit289.us: ; preds = %safe_if_after_true466.us - %y_load517_plus1.us = add i32 %y280.0265.us, 1 - %exitcond271 = icmp eq i32 %y_load517_plus1.us, %y1 - br i1 %exitcond271, label %for_exit278, label %for_loop288.lr.ph.us - -for_loop288.us: ; preds = %for_loop288.lr.ph.us, %safe_if_after_true466.us - %xb291.0263.us = phi i32 [ %x0, %for_loop288.lr.ph.us ], [ %add_xb291_load_.us, %safe_if_after_true466.us ] - %xb_load298_broadcast_init.us = insertelement <8 x i32> undef, i32 %xb291.0263.us, i32 0 - %xb_load298_broadcast.us = shufflevector <8 x i32> %xb_load298_broadcast_init.us, <8 x i32> undef, <8 x i32> zeroinitializer - %add_xb_load298_broadcast_.us = add <8 x i32> %xb_load298_broadcast.us, - %less_x_load462_x1_load463_broadcast.us = icmp slt <8 x i32> %add_xb_load298_broadcast_.us, %x1_load463_broadcast - %"oldMask&test468.us" = select <8 x i1> %less_x_load462_x1_load463_broadcast.us, <8 x i32> , <8 x i32> zeroinitializer - %"internal_mask&function_mask472.us" = and <8 x i32> %"oldMask&test468.us", %__mask - %floatmask.i211.us = bitcast <8 x i32> %"internal_mask&function_mask472.us" to <8 x float> - %v.i212.us = tail call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask.i211.us) #1 - %cmp.i213.us = icmp eq i32 %v.i212.us, 0 - br i1 %cmp.i213.us, label %safe_if_after_true466.us, label %safe_if_run_true467.us - -safe_if_run_true467.us: ; preds = %for_loop288.us - %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast.elt0.us = add i32 %xb291.0263.us, %add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303.us - %scaled_varying757.elt0.us = shl i32 %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast.elt0.us, 3 - %"varying+const_offsets.elt0758.us" = add i32 %scaled_varying757.elt0.us, -8 - %19 = sext i32 %"varying+const_offsets.elt0758.us" to i64 - %ptr759.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %19, !filename !0, !first_line !1, !first_column !2, !last_line !1, !last_column !3 - %val0d.i.i205.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr759.us, <4 x double> %mask0d.i.i203) #0 - %ptr759.sum.us = add i64 %19, 32 - %ptr1.i.i206.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr759.sum.us - %val1d.i.i207.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i206.us, <4 x double> %mask1d.i.i204) #0 - %vald.i.i208.us = shufflevector <4 x double> %val0d.i.i205.us, <4 x double> %val1d.i.i207.us, <8 x i32> - %"varying+const_offsets767.elt0.us" = add i32 %scaled_varying757.elt0.us, 8 - %20 = sext i32 %"varying+const_offsets767.elt0.us" to i64 - %ptr768.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %20, !filename !0, !first_line !1, !first_column !4, !last_line !1, !last_column !5 - %val0d.i.i195.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr768.us, <4 x double> %mask0d.i.i203) #0 - %ptr768.sum.us = add i64 %20, 32 - %ptr1.i.i196.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr768.sum.us - %val1d.i.i197.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i196.us, <4 x double> %mask1d.i.i204) #0 - %vald.i.i198.us = shufflevector <4 x double> %val0d.i.i195.us, <4 x double> %val1d.i.i197.us, <8 x i32> - %"varying+const_offsets776.elt0.us" = add i32 %scaled_varying757.elt0.us, -16 - %21 = sext i32 %"varying+const_offsets776.elt0.us" to i64 - %ptr777.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %21, !filename !0, !first_line !6, !first_column !2, !last_line !6, !last_column !3 - %val0d.i.i185.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr777.us, <4 x double> %mask0d.i.i203) #0 - %ptr777.sum.us = add i64 %21, 32 - %ptr1.i.i186.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr777.sum.us - %val1d.i.i187.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i186.us, <4 x double> %mask1d.i.i204) #0 - %vald.i.i188.us = shufflevector <4 x double> %val0d.i.i185.us, <4 x double> %val1d.i.i187.us, <8 x i32> - %"varying+const_offsets785.elt0.us" = add i32 %scaled_varying757.elt0.us, 16 - %22 = sext i32 %"varying+const_offsets785.elt0.us" to i64 - %ptr786.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %22, !filename !0, !first_line !6, !first_column !4, !last_line !6, !last_column !5 - %val0d.i.i175.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr786.us, <4 x double> %mask0d.i.i203) #0 - %ptr786.sum.us = add i64 %22, 32 - %ptr1.i.i176.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr786.sum.us - %val1d.i.i177.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i176.us, <4 x double> %mask1d.i.i204) #0 - %vald.i.i178.us = shufflevector <4 x double> %val0d.i.i175.us, <4 x double> %val1d.i.i177.us, <8 x i32> - %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast788_mul__Nx_load333_broadcast.elt0.us = add i32 %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast788.elt0.us, %xb291.0263.us - %scaled_varying793.elt0.us = shl i32 %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast788_mul__Nx_load333_broadcast.elt0.us, 3 - %23 = sext i32 %scaled_varying793.elt0.us to i64 - %ptr795.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %23, !filename !0, !first_line !2, !first_column !7, !last_line !2, !last_column !8 - %val0d.i.i165.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr795.us, <4 x double> %mask0d.i.i203) #0 - %ptr795.sum.us = add i64 %23, 32 - %ptr1.i.i166.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr795.sum.us - %val1d.i.i167.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i166.us, <4 x double> %mask1d.i.i204) #0 - %vald.i.i168.us = shufflevector <4 x double> %val0d.i.i165.us, <4 x double> %val1d.i.i167.us, <8 x i32> - %add_Ain_load319_offset_load_Ain_load327_offset_load.us = fadd <8 x double> %vald.i.i208.us, %vald.i.i198.us - %"varying+const_offsets803.elt0.us" = add i32 %scaled_varying757.elt0.us, -24 - %24 = sext i32 %"varying+const_offsets803.elt0.us" to i64 - %ptr804.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %24, !filename !0, !first_line !9, !first_column !2, !last_line !9, !last_column !3 - %val0d.i.i155.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr804.us, <4 x double> %mask0d.i.i203) #0 - %ptr804.sum.us = add i64 %24, 32 - %ptr1.i.i156.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr804.sum.us - %val1d.i.i157.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i156.us, <4 x double> %mask1d.i.i204) #0 - %vald.i.i158.us = shufflevector <4 x double> %val0d.i.i155.us, <4 x double> %val1d.i.i157.us, <8 x i32> - %"varying+const_offsets812.elt0.us" = add i32 %scaled_varying757.elt0.us, 24 - %25 = sext i32 %"varying+const_offsets812.elt0.us" to i64 - %ptr813.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %25, !filename !0, !first_line !9, !first_column !4, !last_line !9, !last_column !5 - %val0d.i.i145.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr813.us, <4 x double> %mask0d.i.i203) #0 - %ptr813.sum.us = add i64 %25, 32 - %ptr1.i.i146.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr813.sum.us - %val1d.i.i147.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i146.us, <4 x double> %mask1d.i.i204) #0 - %vald.i.i148.us = shufflevector <4 x double> %val0d.i.i145.us, <4 x double> %val1d.i.i147.us, <8 x i32> - %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast815_mul__Nx_load382_broadcast.elt0.us = add i32 %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast815.elt0.us, %xb291.0263.us - %scaled_varying820.elt0.us = shl i32 %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast815_mul__Nx_load382_broadcast.elt0.us, 3 - %26 = sext i32 %scaled_varying820.elt0.us to i64 - %ptr822.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %26, !filename !0, !first_line !10, !first_column !11, !last_line !10, !last_column !1 - %val0d.i.i135.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr822.us, <4 x double> %mask0d.i.i203) #0 - %ptr822.sum.us = add i64 %26, 32 - %ptr1.i.i136.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr822.sum.us - %val1d.i.i137.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i136.us, <4 x double> %mask1d.i.i204) #0 - %vald.i.i138.us = shufflevector <4 x double> %val0d.i.i135.us, <4 x double> %val1d.i.i137.us, <8 x i32> - %add_Ain_load368_offset_load_Ain_load376_offset_load.us = fadd <8 x double> %vald.i.i188.us, %vald.i.i178.us - %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast824_mul__Nx_load341_broadcast.elt0.us = add i32 %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast824.elt0.us, %xb291.0263.us - %scaled_varying829.elt0.us = shl i32 %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast824_mul__Nx_load341_broadcast.elt0.us, 3 - %27 = sext i32 %scaled_varying829.elt0.us to i64 - %ptr831.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %27, !filename !0, !first_line !2, !first_column !12, !last_line !2, !last_column !13 - %val0d.i.i125.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr831.us, <4 x double> %mask0d.i.i203) #0 - %ptr831.sum.us = add i64 %27, 32 - %ptr1.i.i126.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr831.sum.us - %val1d.i.i127.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i126.us, <4 x double> %mask1d.i.i204) #0 - %vald.i.i128.us = shufflevector <4 x double> %val0d.i.i125.us, <4 x double> %val1d.i.i127.us, <8 x i32> - %add_add_Ain_load319_offset_load_Ain_load327_offset_load_Ain_load335_offset_load.us = fadd <8 x double> %add_Ain_load319_offset_load_Ain_load327_offset_load.us, %vald.i.i168.us - %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast833_mul__Nx_load431_broadcast.elt0.us = add i32 %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast833.elt0.us, %xb291.0263.us - %scaled_varying838.elt0.us = shl i32 %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast833_mul__Nx_load431_broadcast.elt0.us, 3 - %28 = sext i32 %scaled_varying838.elt0.us to i64 - %ptr840.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %28, !filename !0, !first_line !14, !first_column !11, !last_line !14, !last_column !1 - %val0d.i.i115.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr840.us, <4 x double> %mask0d.i.i203) #0 - %ptr840.sum.us = add i64 %28, 32 - %ptr1.i.i116.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr840.sum.us - %val1d.i.i117.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i116.us, <4 x double> %mask1d.i.i204) #0 - %vald.i.i118.us = shufflevector <4 x double> %val0d.i.i115.us, <4 x double> %val1d.i.i117.us, <8 x i32> - %add_Ain_load417_offset_load_Ain_load425_offset_load.us = fadd <8 x double> %vald.i.i158.us, %vald.i.i148.us - %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast842_mul__Nx_load390_broadcast.elt0.us = add i32 %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast842.elt0.us, %xb291.0263.us - %scaled_varying847.elt0.us = shl i32 %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast842_mul__Nx_load390_broadcast.elt0.us, 3 - %29 = sext i32 %scaled_varying847.elt0.us to i64 - %ptr849.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %29, !filename !0, !first_line !10, !first_column !6, !last_line !10, !last_column !15 - %val0d.i.i105.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr849.us, <4 x double> %mask0d.i.i203) #0 - %ptr849.sum.us = add i64 %29, 32 - %ptr1.i.i106.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr849.sum.us - %val1d.i.i107.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i106.us, <4 x double> %mask1d.i.i204) #0 - %vald.i.i108.us = shufflevector <4 x double> %val0d.i.i105.us, <4 x double> %val1d.i.i107.us, <8 x i32> - %add_add_Ain_load368_offset_load_Ain_load376_offset_load_Ain_load384_offset_load.us = fadd <8 x double> %add_Ain_load368_offset_load_Ain_load376_offset_load.us, %vald.i.i138.us - %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast851_mul__Nxy_load350_broadcast.elt0.us = add i32 %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast851.elt0.us, %xb291.0263.us - %scaled_varying856.elt0.us = shl i32 %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast851_mul__Nxy_load350_broadcast.elt0.us, 3 - %30 = sext i32 %scaled_varying856.elt0.us to i64 - %ptr858.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %30, !filename !0, !first_line !12, !first_column !11, !last_line !12, !last_column !1 - %val0d.i.i95.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr858.us, <4 x double> %mask0d.i.i203) #0 - %ptr858.sum.us = add i64 %30, 32 - %ptr1.i.i96.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr858.sum.us - %val1d.i.i97.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i96.us, <4 x double> %mask1d.i.i204) #0 - %vald.i.i98.us = shufflevector <4 x double> %val0d.i.i95.us, <4 x double> %val1d.i.i97.us, <8 x i32> - %add_add_add_Ain_load319_offset_load_Ain_load327_offset_load_Ain_load335_offset_load_Ain_load343_offset_load.us = fadd <8 x double> %add_add_Ain_load319_offset_load_Ain_load327_offset_load_Ain_load335_offset_load.us, %vald.i.i128.us - %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast860_mul__Nx_load439_broadcast.elt0.us = add i32 %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast860.elt0.us, %xb291.0263.us - %scaled_varying865.elt0.us = shl i32 %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast860_mul__Nx_load439_broadcast.elt0.us, 3 - %31 = sext i32 %scaled_varying865.elt0.us to i64 - %ptr867.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %31, !filename !0, !first_line !14, !first_column !6, !last_line !14, !last_column !15 - %val0d.i.i85.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr867.us, <4 x double> %mask0d.i.i203) #0 - %ptr867.sum.us = add i64 %31, 32 - %ptr1.i.i86.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr867.sum.us - %val1d.i.i87.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i86.us, <4 x double> %mask1d.i.i204) #0 - %vald.i.i88.us = shufflevector <4 x double> %val0d.i.i85.us, <4 x double> %val1d.i.i87.us, <8 x i32> - %add_add_Ain_load417_offset_load_Ain_load425_offset_load_Ain_load433_offset_load.us = fadd <8 x double> %add_Ain_load417_offset_load_Ain_load425_offset_load.us, %vald.i.i118.us - %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast869_mul__Nxy_load399_broadcast.elt0.us = add i32 %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast869.elt0.us, %xb291.0263.us - %scaled_varying874.elt0.us = shl i32 %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast869_mul__Nxy_load399_broadcast.elt0.us, 3 - %32 = sext i32 %scaled_varying874.elt0.us to i64 - %ptr876.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %32, !filename !0, !first_line !16, !first_column !11, !last_line !16, !last_column !1 - %val0d.i.i75.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr876.us, <4 x double> %mask0d.i.i203) #0 - %ptr876.sum.us = add i64 %32, 32 - %ptr1.i.i76.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr876.sum.us - %val1d.i.i77.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i76.us, <4 x double> %mask1d.i.i204) #0 - %vald.i.i78.us = shufflevector <4 x double> %val0d.i.i75.us, <4 x double> %val1d.i.i77.us, <8 x i32> - %add_add_add_Ain_load368_offset_load_Ain_load376_offset_load_Ain_load384_offset_load_Ain_load392_offset_load.us = fadd <8 x double> %add_add_Ain_load368_offset_load_Ain_load376_offset_load_Ain_load384_offset_load.us, %vald.i.i108.us - %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast878_mul__Nxy_load358_broadcast.elt0.us = add i32 %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast878.elt0.us, %xb291.0263.us - %scaled_varying883.elt0.us = shl i32 %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast878_mul__Nxy_load358_broadcast.elt0.us, 3 - %33 = sext i32 %scaled_varying883.elt0.us to i64 - %ptr885.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %33, !filename !0, !first_line !12, !first_column !6, !last_line !12, !last_column !15 - %val0d.i.i65.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr885.us, <4 x double> %mask0d.i.i203) #0 - %ptr885.sum.us = add i64 %33, 32 - %ptr1.i.i66.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr885.sum.us - %val1d.i.i67.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i66.us, <4 x double> %mask1d.i.i204) #0 - %vald.i.i68.us = shufflevector <4 x double> %val0d.i.i65.us, <4 x double> %val1d.i.i67.us, <8 x i32> - %add_add_add_add_Ain_load319_offset_load_Ain_load327_offset_load_Ain_load335_offset_load_Ain_load343_offset_load_Ain_load351_offset_load.us = fadd <8 x double> %add_add_add_Ain_load319_offset_load_Ain_load327_offset_load_Ain_load335_offset_load_Ain_load343_offset_load.us, %vald.i.i98.us - %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast887_mul__Nxy_load448_broadcast.elt0.us = add i32 %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast887.elt0.us, %xb291.0263.us - %scaled_varying892.elt0.us = shl i32 %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast887_mul__Nxy_load448_broadcast.elt0.us, 3 - %34 = sext i32 %scaled_varying892.elt0.us to i64 - %ptr894.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %34, !filename !0, !first_line !17, !first_column !11, !last_line !17, !last_column !1 - %val0d.i.i55.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr894.us, <4 x double> %mask0d.i.i203) #0 - %ptr894.sum.us = add i64 %34, 32 - %ptr1.i.i56.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr894.sum.us - %val1d.i.i57.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i56.us, <4 x double> %mask1d.i.i204) #0 - %vald.i.i58.us = shufflevector <4 x double> %val0d.i.i55.us, <4 x double> %val1d.i.i57.us, <8 x i32> - %add_add_add_Ain_load417_offset_load_Ain_load425_offset_load_Ain_load433_offset_load_Ain_load441_offset_load.us = fadd <8 x double> %add_add_Ain_load417_offset_load_Ain_load425_offset_load_Ain_load433_offset_load.us, %vald.i.i88.us - %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast896_mul__Nxy_load407_broadcast.elt0.us = add i32 %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast896.elt0.us, %xb291.0263.us - %scaled_varying901.elt0.us = shl i32 %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast896_mul__Nxy_load407_broadcast.elt0.us, 3 - %35 = sext i32 %scaled_varying901.elt0.us to i64 - %ptr903.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %35, !filename !0, !first_line !16, !first_column !6, !last_line !16, !last_column !15 - %val0d.i.i45.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr903.us, <4 x double> %mask0d.i.i203) #0 - %ptr903.sum.us = add i64 %35, 32 - %ptr1.i.i46.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr903.sum.us - %val1d.i.i47.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i46.us, <4 x double> %mask1d.i.i204) #0 - %vald.i.i48.us = shufflevector <4 x double> %val0d.i.i45.us, <4 x double> %val1d.i.i47.us, <8 x i32> - %add_add_add_add_Ain_load368_offset_load_Ain_load376_offset_load_Ain_load384_offset_load_Ain_load392_offset_load_Ain_load400_offset_load.us = fadd <8 x double> %add_add_add_Ain_load368_offset_load_Ain_load376_offset_load_Ain_load384_offset_load_Ain_load392_offset_load.us, %vald.i.i78.us - %add_add_add_add_add_Ain_load319_offset_load_Ain_load327_offset_load_Ain_load335_offset_load_Ain_load343_offset_load_Ain_load351_offset_load_Ain_load359_offset_load.us = fadd <8 x double> %add_add_add_add_Ain_load319_offset_load_Ain_load327_offset_load_Ain_load335_offset_load_Ain_load343_offset_load_Ain_load351_offset_load.us, %vald.i.i68.us - %36 = sext i32 %scaled_varying757.elt0.us to i64 - %ptr912.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %36, !filename !0, !first_line !8, !first_column !18, !last_line !8, !last_column !19 - %val0d.i.i35.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr912.us, <4 x double> %mask0d.i.i203) #0 - %ptr912.sum.us = add i64 %36, 32 - %ptr1.i.i36.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr912.sum.us - %val1d.i.i37.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i36.us, <4 x double> %mask1d.i.i204) #0 - %vald.i.i38.us = shufflevector <4 x double> %val0d.i.i35.us, <4 x double> %val1d.i.i37.us, <8 x i32> - %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast914_mul__Nxy_load456_broadcast.elt0.us = add i32 %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast914.elt0.us, %xb291.0263.us - %scaled_varying919.elt0.us = shl i32 %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast914_mul__Nxy_load456_broadcast.elt0.us, 3 - %37 = sext i32 %scaled_varying919.elt0.us to i64 - %ptr921.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %37, !filename !0, !first_line !17, !first_column !6, !last_line !17, !last_column !15 - %val0d.i.i25.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr921.us, <4 x double> %mask0d.i.i203) #0 - %ptr921.sum.us = add i64 %37, 32 - %ptr1.i.i26.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr921.sum.us - %val1d.i.i27.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i26.us, <4 x double> %mask1d.i.i204) #0 - %vald.i.i28.us = shufflevector <4 x double> %val0d.i.i25.us, <4 x double> %val1d.i.i27.us, <8 x i32> - %add_add_add_add_Ain_load417_offset_load_Ain_load425_offset_load_Ain_load433_offset_load_Ain_load441_offset_load_Ain_load449_offset_load.us = fadd <8 x double> %add_add_add_Ain_load417_offset_load_Ain_load425_offset_load_Ain_load433_offset_load_Ain_load441_offset_load.us, %vald.i.i58.us - %add_add_add_add_add_Ain_load368_offset_load_Ain_load376_offset_load_Ain_load384_offset_load_Ain_load392_offset_load_Ain_load400_offset_load_Ain_load408_offset_load.us = fadd <8 x double> %add_add_add_add_Ain_load368_offset_load_Ain_load376_offset_load_Ain_load384_offset_load_Ain_load392_offset_load_Ain_load400_offset_load.us, %vald.i.i48.us - %mul_coef1_load315_broadcast_add_add_add_add_add_Ain_load319_offset_load_Ain_load327_offset_load_Ain_load335_offset_load_Ain_load343_offset_load_Ain_load351_offset_load_Ain_load359_offset_load.us = fmul <8 x double> %coef1_load315_broadcast, %add_add_add_add_add_Ain_load319_offset_load_Ain_load327_offset_load_Ain_load335_offset_load_Ain_load343_offset_load_Ain_load351_offset_load_Ain_load359_offset_load.us - %mul_coef0_load306_broadcast_Ain_load310_offset_load.us = fmul <8 x double> %coef0_load306_broadcast, %vald.i.i38.us - %add_add_add_add_add_Ain_load417_offset_load_Ain_load425_offset_load_Ain_load433_offset_load_Ain_load441_offset_load_Ain_load449_offset_load_Ain_load457_offset_load.us = fadd <8 x double> %add_add_add_add_Ain_load417_offset_load_Ain_load425_offset_load_Ain_load433_offset_load_Ain_load441_offset_load_Ain_load449_offset_load.us, %vald.i.i28.us - %mul_coef2_load364_broadcast_add_add_add_add_add_Ain_load368_offset_load_Ain_load376_offset_load_Ain_load384_offset_load_Ain_load392_offset_load_Ain_load400_offset_load_Ain_load408_offset_load.us = fmul <8 x double> %coef2_load364_broadcast, %add_add_add_add_add_Ain_load368_offset_load_Ain_load376_offset_load_Ain_load384_offset_load_Ain_load392_offset_load_Ain_load400_offset_load_Ain_load408_offset_load.us - %add_mul_coef0_load306_broadcast_Ain_load310_offset_load_mul_coef1_load315_broadcast_add_add_add_add_add_Ain_load319_offset_load_Ain_load327_offset_load_Ain_load335_offset_load_Ain_load343_offset_load_Ain_load351_offset_load_Ain_load359_offset_load.us = fadd <8 x double> %mul_coef1_load315_broadcast_add_add_add_add_add_Ain_load319_offset_load_Ain_load327_offset_load_Ain_load335_offset_load_Ain_load343_offset_load_Ain_load351_offset_load_Ain_load359_offset_load.us, %mul_coef0_load306_broadcast_Ain_load310_offset_load.us - %mul_coef3_load413_broadcast_add_add_add_add_add_Ain_load417_offset_load_Ain_load425_offset_load_Ain_load433_offset_load_Ain_load441_offset_load_Ain_load449_offset_load_Ain_load457_offset_load.us = fmul <8 x double> %coef3_load413_broadcast, %add_add_add_add_add_Ain_load417_offset_load_Ain_load425_offset_load_Ain_load433_offset_load_Ain_load441_offset_load_Ain_load449_offset_load_Ain_load457_offset_load.us - %add_add_mul_coef0_load306_broadcast_Ain_load310_offset_load_mul_coef1_load315_broadcast_add_add_add_add_add_Ain_load319_offset_load_Ain_load327_offset_load_Ain_load335_offset_load_Ain_load343_offset_load_Ain_load351_offset_load_Ain_load359_offset_load_mul_coef2_load364_broadcast_add_add_add_add_add_Ain_load368_offset_load_Ain_load376_offset_load_Ain_load384_offset_load_Ain_load392_offset_load_Ain_load400_offset_load_Ain_load408_offset_load.us = fadd <8 x double> %mul_coef2_load364_broadcast_add_add_add_add_add_Ain_load368_offset_load_Ain_load376_offset_load_Ain_load384_offset_load_Ain_load392_offset_load_Ain_load400_offset_load_Ain_load408_offset_load.us, %add_mul_coef0_load306_broadcast_Ain_load310_offset_load_mul_coef1_load315_broadcast_add_add_add_add_add_Ain_load319_offset_load_Ain_load327_offset_load_Ain_load335_offset_load_Ain_load343_offset_load_Ain_load351_offset_load_Ain_load359_offset_load.us - %add_add_add_mul_coef0_load306_broadcast_Ain_load310_offset_load_mul_coef1_load315_broadcast_add_add_add_add_add_Ain_load319_offset_load_Ain_load327_offset_load_Ain_load335_offset_load_Ain_load343_offset_load_Ain_load351_offset_load_Ain_load359_offset_load_mul_coef2_load364_broadcast_add_add_add_add_add_Ain_load368_offset_load_Ain_load376_offset_load_Ain_load384_offset_load_Ain_load392_offset_load_Ain_load400_offset_load_Ain_load408_offset_load_mul_coef3_load413_broadcast_add_add_add_add_add_Ain_load417_offset_load_Ain_load425_offset_load_Ain_load433_offset_load_Ain_load441_offset_load_Ain_load449_offset_load_Ain_load457_offset_load.us = fadd <8 x double> %add_add_mul_coef0_load306_broadcast_Ain_load310_offset_load_mul_coef1_load315_broadcast_add_add_add_add_add_Ain_load319_offset_load_Ain_load327_offset_load_Ain_load335_offset_load_Ain_load343_offset_load_Ain_load351_offset_load_Ain_load359_offset_load_mul_coef2_load364_broadcast_add_add_add_add_add_Ain_load368_offset_load_Ain_load376_offset_load_Ain_load384_offset_load_Ain_load392_offset_load_Ain_load400_offset_load_Ain_load408_offset_load.us, %mul_coef3_load413_broadcast_add_add_add_add_add_Ain_load417_offset_load_Ain_load425_offset_load_Ain_load433_offset_load_Ain_load441_offset_load_Ain_load449_offset_load_Ain_load457_offset_load.us - %mask0.i.i11.us = shufflevector <8 x i32> %"internal_mask&function_mask472.us", <8 x i32> undef, <8 x i32> - %mask1.i.i12.us = shufflevector <8 x i32> %"internal_mask&function_mask472.us", <8 x i32> undef, <8 x i32> - %mask0d.i.i13.us = bitcast <8 x i32> %mask0.i.i11.us to <4 x double> - %mask1d.i.i14.us = bitcast <8 x i32> %mask1.i.i12.us to <4 x double> - %val0d.i.i15.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr912.us, <4 x double> %mask0d.i.i13.us) #0 - %val1d.i.i17.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i36.us, <4 x double> %mask1d.i.i14.us) #0 - %vald.i.i18.us = shufflevector <4 x double> %val0d.i.i15.us, <4 x double> %val1d.i.i17.us, <8 x i32> - %mul__Ain_load480_offset_load.us = fmul <8 x double> %vald.i.i18.us, - %ptr939.us = getelementptr i8* %Aout_load488_ptr2int_2void, i64 %36, !filename !0, !first_line !20, !first_column !21, !last_line !20, !last_column !22 - %val0d.i.i5.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr939.us, <4 x double> %mask0d.i.i13.us) #0 - %ptr1.i.i6.us = getelementptr i8* %Aout_load488_ptr2int_2void, i64 %ptr912.sum.us - %val1d.i.i7.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i6.us, <4 x double> %mask1d.i.i14.us) #0 - %vald.i.i8.us = shufflevector <4 x double> %val0d.i.i5.us, <4 x double> %val1d.i.i7.us, <8 x i32> - %sub_mul__Ain_load480_offset_load_Aout_load488_offset_load.us = fsub <8 x double> %mul__Ain_load480_offset_load.us, %vald.i.i8.us - %ptr948.us = getelementptr i8* %vsq_load494_ptr2int_2void, i64 %36, !filename !0, !first_line !23, !first_column !24, !last_line !23, !last_column !7 - %val0d.i.i.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr948.us, <4 x double> %mask0d.i.i13.us) #0 - %ptr1.i.i.us = getelementptr i8* %vsq_load494_ptr2int_2void, i64 %ptr912.sum.us - %val1d.i.i.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i.us, <4 x double> %mask1d.i.i14.us) #0 - %vald.i.i.us = shufflevector <4 x double> %val0d.i.i.us, <4 x double> %val1d.i.i.us, <8 x i32> - %mul_vsq_load494_offset_load_div_load499.us = fmul <8 x double> %add_add_add_mul_coef0_load306_broadcast_Ain_load310_offset_load_mul_coef1_load315_broadcast_add_add_add_add_add_Ain_load319_offset_load_Ain_load327_offset_load_Ain_load335_offset_load_Ain_load343_offset_load_Ain_load351_offset_load_Ain_load359_offset_load_mul_coef2_load364_broadcast_add_add_add_add_add_Ain_load368_offset_load_Ain_load376_offset_load_Ain_load384_offset_load_Ain_load392_offset_load_Ain_load400_offset_load_Ain_load408_offset_load_mul_coef3_load413_broadcast_add_add_add_add_add_Ain_load417_offset_load_Ain_load425_offset_load_Ain_load433_offset_load_Ain_load441_offset_load_Ain_load449_offset_load_Ain_load457_offset_load.us, %vald.i.i.us - %add_sub_mul__Ain_load480_offset_load_Aout_load488_offset_load_mul_vsq_load494_offset_load_div_load499.us = fadd <8 x double> %sub_mul__Ain_load480_offset_load_Aout_load488_offset_load.us, %mul_vsq_load494_offset_load_div_load499.us - %val0.i.i253.us = shufflevector <8 x double> %add_sub_mul__Ain_load480_offset_load_Aout_load488_offset_load_mul_vsq_load494_offset_load_div_load499.us, <8 x double> undef, <4 x i32> - %val1.i.i254.us = shufflevector <8 x double> %add_sub_mul__Ain_load480_offset_load_Aout_load488_offset_load_mul_vsq_load494_offset_load_div_load499.us, <8 x double> undef, <4 x i32> - call void @llvm.x86.avx.maskstore.pd.256(i8* %ptr939.us, <4 x double> %mask0d.i.i13.us, <4 x double> %val0.i.i253.us) #0 - call void @llvm.x86.avx.maskstore.pd.256(i8* %ptr1.i.i6.us, <4 x double> %mask1d.i.i14.us, <4 x double> %val1.i.i254.us) #0 - br label %safe_if_after_true466.us - -safe_if_after_true466.us: ; preds = %safe_if_run_true467.us, %for_loop288.us - %add_xb291_load_.us = add i32 %xb291.0263.us, 8 - %less_xb_load293_x1_load294.us = icmp slt i32 %add_xb291_load_.us, %x1 - br i1 %less_xb_load293_x1_load294.us, label %for_loop288.us, label %for_exit289.us - -for_loop288.lr.ph.us: ; preds = %for_exit289.us, %for_test286.preheader.lr.ph - %y280.0265.us = phi i32 [ %y_load517_plus1.us, %for_exit289.us ], [ %y0, %for_test286.preheader.lr.ph ] - %mul_y_load302_Nx_load303.us = mul i32 %y280.0265.us, %Nx - %add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303.us = add i32 %mul_y_load302_Nx_load303.us, %mul_z_load300_Nxy_load301 - %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast788.elt0.us = add i32 %add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303.us, %Nx - %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast815.elt0.us = add i32 %add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303.us, %mul__Nx_load382 - %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast824.elt0.us = sub i32 %add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303.us, %Nx - %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast833.elt0.us = add i32 %add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303.us, %mul__Nx_load431 - %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast842.elt0.us = add i32 %add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303.us, %mul__Nx_load390 - %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast851.elt0.us = add i32 %add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303.us, %mul_Nx_load_Ny_load - %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast860.elt0.us = add i32 %add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303.us, %mul__Nx_load439 - %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast869.elt0.us = add i32 %add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303.us, %mul__Nxy_load399 - %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast878.elt0.us = sub i32 %add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303.us, %mul_Nx_load_Ny_load - %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast887.elt0.us = add i32 %add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303.us, %mul__Nxy_load448 - %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast896.elt0.us = add i32 %add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303.us, %mul__Nxy_load407 - %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast914.elt0.us = add i32 %add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303.us, %mul__Nxy_load456 - br label %for_loop288.us - -for_exit278: ; preds = %for_exit289.us, %for_test286.preheader.lr.ph, %for_test275.preheader - %z_load518_plus1 = add i32 %z269.0268, 1 - %exitcond272 = icmp eq i32 %z_load518_plus1, %z1 - br i1 %exitcond272, label %for_exit, label %for_test275.preheader -} - -; Function Attrs: nounwind -define internal void @stencil_step_task___uniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_({ i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*, <8 x i32> }* noalias nocapture, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) #3 { -allocas: - %x01 = getelementptr { i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*, <8 x i32> }* %0, i64 0, i32 0 - %x02 = load i32* %x01, align 4 - %x13 = getelementptr { i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*, <8 x i32> }* %0, i64 0, i32 1 - %x14 = load i32* %x13, align 4 - %y05 = getelementptr { i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*, <8 x i32> }* %0, i64 0, i32 2 - %y06 = load i32* %y05, align 4 - %y17 = getelementptr { i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*, <8 x i32> }* %0, i64 0, i32 3 - %y18 = load i32* %y17, align 4 - %z09 = getelementptr { i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*, <8 x i32> }* %0, i64 0, i32 4 - %z010 = load i32* %z09, align 4 - %Nx11 = getelementptr { i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*, <8 x i32> }* %0, i64 0, i32 5 - %Nx12 = load i32* %Nx11, align 4 - %Ny13 = getelementptr { i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*, <8 x i32> }* %0, i64 0, i32 6 - %Ny14 = load i32* %Ny13, align 4 - %coef17 = getelementptr { i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*, <8 x i32> }* %0, i64 0, i32 8 - %coef18 = load double** %coef17, align 8 - %vsq19 = getelementptr { i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*, <8 x i32> }* %0, i64 0, i32 9 - %vsq20 = load double** %vsq19, align 8 - %Ain21 = getelementptr { i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*, <8 x i32> }* %0, i64 0, i32 10 - %Ain22 = load double** %Ain21, align 8 - %Aout23 = getelementptr { i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*, <8 x i32> }* %0, i64 0, i32 11 - %Aout24 = load double** %Aout23, align 8 - %task_struct_mask = getelementptr { i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*, <8 x i32> }* %0, i64 0, i32 12 - %mask = load <8 x i32>* %task_struct_mask, align 32 - %floatmask.i = bitcast <8 x i32> %mask to <8 x float> - %v.i = tail call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask.i) #1 - %cmp.i = icmp eq i32 %v.i, 255 - %add_z0_load_taskIndex_load = add i32 %z010, %3 - %add_z0_load27_taskIndex_load28 = add i32 %3, 1 - %add_add_z0_load27_taskIndex_load28_ = add i32 %add_z0_load27_taskIndex_load28, %z010 - br i1 %cmp.i, label %all_on, label %some_on - -all_on: ; preds = %allocas - tail call fastcc void @stencil_step___uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_(i32 %x02, i32 %x14, i32 %y06, i32 %y18, i32 %add_z0_load_taskIndex_load, i32 %add_add_z0_load27_taskIndex_load28_, i32 %Nx12, i32 %Ny14, double* %coef18, double* %vsq20, double* %Ain22, double* %Aout24, <8 x i32> ) - ret void - -some_on: ; preds = %allocas - tail call fastcc void @stencil_step___uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_(i32 %x02, i32 %x14, i32 %y06, i32 %y18, i32 %add_z0_load_taskIndex_load, i32 %add_add_z0_load27_taskIndex_load28_, i32 %Nx12, i32 %Ny14, double* %coef18, double* %vsq20, double* %Ain22, double* %Aout24, <8 x i32> %mask) - ret void -} - -; Function Attrs: nounwind -define void @loop_stencil_ispc_tasks(i32 %t0, i32 %t1, i32 %x0, i32 %x1, i32 %y0, i32 %y1, i32 %z0, i32 %z1, i32 %Nx, i32 %Ny, i32 %Nz, double* %coef, double* %vsq, double* %Aeven, double* %Aodd) #3 { -allocas: - %launch_group_handle = alloca i8*, align 8 - store i8* null, i8** %launch_group_handle, align 8 - %less_t_load_t1_load166 = icmp slt i32 %t0, %t1 - br i1 %less_t_load_t1_load166, label %for_loop.lr.ph, label %post_sync73 - -for_loop.lr.ph: ; preds = %allocas - %sub_z1_load_z0_load23 = sub i32 %z1, %z0 - br label %for_loop - -for_loop: ; preds = %post_sync, %for_loop.lr.ph - %t.0167 = phi i32 [ %t0, %for_loop.lr.ph ], [ %t_load69_plus1, %post_sync ] - %bitop = and i32 %t.0167, 1 - %equal_bitop_ = icmp eq i32 %bitop, 0 - %args_ptr = call i8* @ISPCAlloc(i8** %launch_group_handle, i64 96, i32 32) - %funarg = bitcast i8* %args_ptr to i32* - store i32 %x0, i32* %funarg, align 4 - %funarg24 = getelementptr i8* %args_ptr, i64 4 - %0 = bitcast i8* %funarg24 to i32* - store i32 %x1, i32* %0, align 4 - %funarg25 = getelementptr i8* %args_ptr, i64 8 - %1 = bitcast i8* %funarg25 to i32* - store i32 %y0, i32* %1, align 4 - %funarg26 = getelementptr i8* %args_ptr, i64 12 - %2 = bitcast i8* %funarg26 to i32* - store i32 %y1, i32* %2, align 4 - %funarg27 = getelementptr i8* %args_ptr, i64 16 - %3 = bitcast i8* %funarg27 to i32* - store i32 %z0, i32* %3, align 4 - %funarg28 = getelementptr i8* %args_ptr, i64 20 - %4 = bitcast i8* %funarg28 to i32* - store i32 %Nx, i32* %4, align 4 - %funarg29 = getelementptr i8* %args_ptr, i64 24 - %5 = bitcast i8* %funarg29 to i32* - store i32 %Ny, i32* %5, align 4 - %funarg30 = getelementptr i8* %args_ptr, i64 28 - %6 = bitcast i8* %funarg30 to i32* - store i32 %Nz, i32* %6, align 4 - %funarg31 = getelementptr i8* %args_ptr, i64 32 - %7 = bitcast i8* %funarg31 to double** - store double* %coef, double** %7, align 8 - %funarg32 = getelementptr i8* %args_ptr, i64 40 - %8 = bitcast i8* %funarg32 to double** - store double* %vsq, double** %8, align 8 - %funarg33 = getelementptr i8* %args_ptr, i64 48 - %9 = bitcast i8* %funarg33 to double** - br i1 %equal_bitop_, label %if_then, label %if_else - -for_exit: ; preds = %post_sync - %launch_group_handle_load70.pre = load i8** %launch_group_handle, align 8 - %cmp71 = icmp eq i8* %launch_group_handle_load70.pre, null - br i1 %cmp71, label %post_sync73, label %call_sync72 - -if_then: ; preds = %for_loop - store double* %Aeven, double** %9, align 8 - %funarg34 = getelementptr i8* %args_ptr, i64 56 - %10 = bitcast i8* %funarg34 to double** - store double* %Aodd, double** %10, align 8 - %funarg_mask = getelementptr i8* %args_ptr, i64 64 - %11 = bitcast i8* %funarg_mask to <8 x i32>* - store <8 x i32> , <8 x i32>* %11, align 32 - call void @ISPCLaunch(i8** %launch_group_handle, i8* bitcast (void ({ i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*, <8 x i32> }*, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)* @stencil_step_task___uniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_ to i8*), i8* %args_ptr, i32 %sub_z1_load_z0_load23, i32 1, i32 1) - br label %if_exit - -if_else: ; preds = %for_loop - store double* %Aodd, double** %9, align 8 - %funarg64 = getelementptr i8* %args_ptr, i64 56 - %12 = bitcast i8* %funarg64 to double** - store double* %Aeven, double** %12, align 8 - %funarg_mask67 = getelementptr i8* %args_ptr, i64 64 - %13 = bitcast i8* %funarg_mask67 to <8 x i32>* - store <8 x i32> , <8 x i32>* %13, align 32 - call void @ISPCLaunch(i8** %launch_group_handle, i8* bitcast (void ({ i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*, <8 x i32> }*, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)* @stencil_step_task___uniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_ to i8*), i8* %args_ptr, i32 %sub_z1_load_z0_load23, i32 1, i32 1) - br label %if_exit - -if_exit: ; preds = %if_else, %if_then - %launch_group_handle_load = load i8** %launch_group_handle, align 8 - %cmp = icmp eq i8* %launch_group_handle_load, null - br i1 %cmp, label %post_sync, label %call_sync - -call_sync: ; preds = %if_exit - call void @ISPCSync(i8* %launch_group_handle_load) - store i8* null, i8** %launch_group_handle, align 8 - br label %post_sync - -post_sync: ; preds = %call_sync, %if_exit - %t_load69_plus1 = add i32 %t.0167, 1 - %exitcond = icmp eq i32 %t_load69_plus1, %t1 - br i1 %exitcond, label %for_exit, label %for_loop - -call_sync72: ; preds = %for_exit - call void @ISPCSync(i8* %launch_group_handle_load70.pre) - store i8* null, i8** %launch_group_handle, align 8 - br label %post_sync73 - -post_sync73: ; preds = %call_sync72, %for_exit, %allocas - ret void -} - -attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } -attributes #2 = { nounwind readonly } -attributes #3 = { nounwind "target-cpu"="corei7-avx" "target-features"="+avx,+popcnt,+cmov" } - -!0 = metadata !{metadata !"stencil.ispc"} -!1 = metadata !{i32 68} -!2 = metadata !{i32 69} -!3 = metadata !{i32 113} -!4 = metadata !{i32 22} -!5 = metadata !{i32 66} -!6 = metadata !{i32 71} -!7 = metadata !{i32 23} -!8 = metadata !{i32 67} -!9 = metadata !{i32 74} -!10 = metadata !{i32 72} -!11 = metadata !{i32 24} -!12 = metadata !{i32 70} -!13 = metadata !{i32 114} -!14 = metadata !{i32 75} -!15 = metadata !{i32 115} -!16 = metadata !{i32 73} -!17 = metadata !{i32 76} -!18 = metadata !{i32 21} -!19 = metadata !{i32 64} -!20 = metadata !{i32 79} -!21 = metadata !{i32 112} -!22 = metadata !{i32 156} -!23 = metadata !{i32 80} -!24 = metadata !{i32 13} diff --git a/examples/stencil/stencil_cu.s b/examples/stencil/stencil_cu.s deleted file mode 100644 index a10402a9..00000000 --- a/examples/stencil/stencil_cu.s +++ /dev/null @@ -1,1134 +0,0 @@ - .file "stencil_cu.ll" - .section .rodata.cst16,"aM",@progbits,16 - .align 16 -.LCPI0_0: - .long 4 # 0x4 - .long 5 # 0x5 - .long 6 # 0x6 - .long 7 # 0x7 -.LCPI0_1: - .long 0 # 0x0 - .long 1 # 0x1 - .long 2 # 0x2 - .long 3 # 0x3 - .section .rodata,"a",@progbits - .align 32 -.LCPI0_2: - .quad 4611686018427387904 # double 2.000000e+00 - .quad 4611686018427387904 # double 2.000000e+00 - .quad 4611686018427387904 # double 2.000000e+00 - .quad 4611686018427387904 # double 2.000000e+00 - .text - .align 16, 0x90 - .type stencil_step___uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_,@function -stencil_step___uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_: # @stencil_step___uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_ -# BB#0: # %allocas - pushq %rbp - pushq %r15 - pushq %r14 - pushq %r13 - pushq %r12 - pushq %rbx - subq $1384, %rsp # imm = 0x568 - movl %ecx, -72(%rsp) # 4-byte Spill - movl %esi, 1308(%rsp) # 4-byte Spill - movl %edi, -68(%rsp) # 4-byte Spill - movq 1456(%rsp), %rcx - vmovsd 24(%rcx), %xmm1 - vmovsd 16(%rcx), %xmm3 - movq 1472(%rsp), %rax - vmovsd (%rcx), %xmm2 - vmovsd 8(%rcx), %xmm4 - movl 1448(%rsp), %esi - vmovmskps %ymm0, %ecx - cmpl $255, %ecx - jne .LBB0_1 -# BB#7: # %for_test.preheader - cmpl %r9d, %r8d - jge .LBB0_6 -# BB#8: # %for_test30.preheader.lr.ph - leal -3(%r8), %ecx - leal 2(%r8), %r13d - leal -1(%r8), %edi - leal 3(%r8), %ebp - movl %esi, %r11d - imull %r11d, %ebp - movl %ebp, %ebx - imull %r11d, %edi - movl %edi, %ebp - imull %r11d, %r13d - imull %r8d, %esi - imull %r11d, %ecx - leal -2(%r8), %r10d - imull %r11d, %r10d - leal 1(%r8), %r14d - imull %r11d, %r14d - movl %edx, -96(%rsp) # 4-byte Spill - addl %edx, %r14d - addl %edx, %r10d - addl %edx, %ecx - movl %ecx, 1344(%rsp) # 4-byte Spill - movl %r9d, -92(%rsp) # 4-byte Spill - leal 1(%rdx,%rsi), %r15d - leal 2(%rdx,%rsi), %edi - addl %edx, %r13d - addl %edx, %ebp - movl %ebp, 1216(%rsp) # 4-byte Spill - addl %edx, %ebx - movl %ebx, 1152(%rsp) # 4-byte Spill - leal -1(%rdx,%rsi), %ebp - leal 3(%rdx,%rsi), %ecx - leal (%rdx,%rsi), %r12d - leal -3(%rdx,%rsi), %ebx - movl %ebx, 1184(%rsp) # 4-byte Spill - movl %r8d, -88(%rsp) # 4-byte Spill - leal -2(%rdx,%rsi), %edx - vmovd 1308(%rsp), %xmm0 # 4-byte Folded Reload - movl 1440(%rsp), %r9d - imull %r9d, %r13d - imull %r9d, %ecx - movl %ecx, 1312(%rsp) # 4-byte Spill - imull %r9d, %ebp - movl %ebp, 1248(%rsp) # 4-byte Spill - imull %r9d, %edi - imull %r9d, %r15d - movl 1344(%rsp), %ecx # 4-byte Reload - imull %r9d, %ecx - movl %ecx, 1344(%rsp) # 4-byte Spill - imull %r9d, %r10d - movl 1152(%rsp), %ebx # 4-byte Reload - imull %r9d, %ebx - movl 1216(%rsp), %ebp # 4-byte Reload - imull %r9d, %ebp - imull %r9d, %r14d - movl 1184(%rsp), %r8d # 4-byte Reload - imull %r9d, %r8d - imull %r9d, %edx - movl %edx, 1216(%rsp) # 4-byte Spill - imull %r9d, %r12d - movl -68(%rsp), %edx # 4-byte Reload - leal (,%rdx,8), %edx - leal -16(%rdx,%r12,8), %esi - movl %esi, 76(%rsp) # 4-byte Spill - leal (%rdx,%r12,8), %ecx - movl %ecx, 72(%rsp) # 4-byte Spill - leal (%rdx,%r15,8), %ecx - movl %ecx, 68(%rsp) # 4-byte Spill - movl -92(%rsp), %ecx # 4-byte Reload - leal (%rdx,%rdi,8), %esi - movl %esi, 64(%rsp) # 4-byte Spill - movl 1248(%rsp), %esi # 4-byte Reload - leal (%rdx,%rsi,8), %esi - movl %esi, 60(%rsp) # 4-byte Spill - movl 1312(%rsp), %esi # 4-byte Reload - leal (%rdx,%rsi,8), %esi - movl %esi, 56(%rsp) # 4-byte Spill - movl 1216(%rsp), %esi # 4-byte Reload - leal (%rdx,%rsi,8), %esi - movl %esi, 52(%rsp) # 4-byte Spill - movl -88(%rsp), %esi # 4-byte Reload - leal (%rdx,%r8,8), %edi - movl %edi, 48(%rsp) # 4-byte Spill - leal (%rdx,%r14,8), %edi - movl %edi, 44(%rsp) # 4-byte Spill - leal (%rdx,%r13,8), %edi - movl %edi, 40(%rsp) # 4-byte Spill - leal (%rdx,%rbp,8), %edi - movl %edi, 36(%rsp) # 4-byte Spill - leal (%rdx,%rbx,8), %edi - movl %edi, 32(%rsp) # 4-byte Spill - leal (%rdx,%r10,8), %edi - movl %edi, 28(%rsp) # 4-byte Spill - movl 1344(%rsp), %edi # 4-byte Reload - leal (%rdx,%rdi,8), %edx - movl %edx, 24(%rsp) # 4-byte Spill - movl $0, -100(%rsp) # 4-byte Folded Spill - imull %r9d, %r11d - shll $3, %r9d - movl %r9d, -76(%rsp) # 4-byte Spill - shll $3, %r11d - movl %r11d, -104(%rsp) # 4-byte Spill - vpermilpd $0, %xmm3, %xmm3 # xmm3 = xmm3[0,0] - vpermilpd $0, %xmm2, %xmm2 # xmm2 = xmm2[0,0] - vpermilpd $0, %xmm1, %xmm1 # xmm1 = xmm1[0,0] - vpshufd $0, %xmm0, %xmm0 # xmm0 = xmm0[0,0,0,0] - vinsertf128 $1, %xmm1, %ymm1, %ymm1 - vmovupd %ymm1, 1312(%rsp) # 32-byte Folded Spill - vinsertf128 $1, %xmm3, %ymm3, %ymm1 - vmovupd %ymm1, 1344(%rsp) # 32-byte Folded Spill - vinsertf128 $1, %xmm2, %ymm2, %ymm15 - vmovupd %ymm15, -32(%rsp) # 32-byte Folded Spill - vpermilpd $0, %xmm4, %xmm1 # xmm1 = xmm4[0,0] - vinsertf128 $1, %xmm1, %ymm1, %ymm14 - vmovupd %ymm14, -64(%rsp) # 32-byte Folded Spill - vinsertf128 $1, %xmm0, %ymm0, %ymm0 - vmovups %ymm0, 1248(%rsp) # 32-byte Folded Spill - vmovapd .LCPI0_2(%rip), %ymm13 - .align 16, 0x90 -.LBB0_9: # %for_test30.preheader - # =>This Loop Header: Depth=1 - # Child Loop BB0_16 Depth 2 - # Child Loop BB0_12 Depth 3 - movl %esi, -88(%rsp) # 4-byte Spill - movl -96(%rsp), %edx # 4-byte Reload - cmpl -72(%rsp), %edx # 4-byte Folded Reload - jge .LBB0_11 -# BB#10: # %for_test37.preheader.lr.ph - # in Loop: Header=BB0_9 Depth=1 - movl -68(%rsp), %edx # 4-byte Reload - cmpl 1308(%rsp), %edx # 4-byte Folded Reload - movl -100(%rsp), %edx # 4-byte Reload - movl -96(%rsp), %edi # 4-byte Reload - jge .LBB0_11 - .align 16, 0x90 -.LBB0_16: # %for_loop39.lr.ph.us - # Parent Loop BB0_9 Depth=1 - # => This Loop Header: Depth=2 - # Child Loop BB0_12 Depth 3 - movl %edi, -84(%rsp) # 4-byte Spill - movl %edx, -80(%rsp) # 4-byte Spill - movl %edx, %r13d - movl -68(%rsp), %ecx # 4-byte Reload - .align 16, 0x90 -.LBB0_12: # %for_loop39.us - # Parent Loop BB0_9 Depth=1 - # Parent Loop BB0_16 Depth=2 - # => This Inner Loop Header: Depth=3 - movl %ecx, 1216(%rsp) # 4-byte Spill - vmovups 1248(%rsp), %ymm3 # 32-byte Folded Reload - vmovups %ymm3, 1248(%rsp) # 32-byte Folded Spill - vextractf128 $1, %ymm3, %xmm0 - vmovd %ecx, %xmm1 - vpshufd $0, %xmm1, %xmm1 # xmm1 = xmm1[0,0,0,0] - vpaddd .LCPI0_0(%rip), %xmm1, %xmm2 - vpcmpgtd %xmm2, %xmm0, %xmm0 - vpaddd .LCPI0_1(%rip), %xmm1, %xmm1 - vpcmpgtd %xmm1, %xmm3, %xmm1 - vinsertf128 $1, %xmm0, %ymm1, %ymm8 - vmovmskps %ymm8, %ecx - testl %ecx, %ecx - je .LBB0_14 -# BB#13: # %safe_if_run_true.us - # in Loop: Header=BB0_12 Depth=3 - movl 76(%rsp), %esi # 4-byte Reload - leal 8(%rsi,%r13), %edx - movl 68(%rsp), %ecx # 4-byte Reload - leal (%rcx,%r13), %ecx - movl 72(%rsp), %r12d # 4-byte Reload - leal 24(%r12,%r13), %r14d - leal -8(%rsi,%r13), %r8d - movl 52(%rsp), %edi # 4-byte Reload - leal (%rdi,%r13), %edi - leal 8(%r12,%r13), %ebp - leal (%rsi,%r13), %esi - leal 16(%r12,%r13), %r11d - movl 64(%rsp), %ebx # 4-byte Reload - leal (%rbx,%r13), %r9d - movl 44(%rsp), %ebx # 4-byte Reload - leal (%rbx,%r13), %r15d - movl 60(%rsp), %ebx # 4-byte Reload - leal (%rbx,%r13), %r10d - movl 40(%rsp), %ebx # 4-byte Reload - leal (%rbx,%r13), %ebx - movl %ebx, 832(%rsp) # 4-byte Spill - movl 56(%rsp), %ebx # 4-byte Reload - leal (%rbx,%r13), %ebx - movl %ebx, 800(%rsp) # 4-byte Spill - movl 36(%rsp), %ebx # 4-byte Reload - leal (%rbx,%r13), %ebx - movl %ebx, 768(%rsp) # 4-byte Spill - movl 28(%rsp), %ebx # 4-byte Reload - leal (%rbx,%r13), %ebx - movl %ebx, 736(%rsp) # 4-byte Spill - leal (%r12,%r13), %ebx - movl %ebx, 960(%rsp) # 4-byte Spill - movl 48(%rsp), %ebx # 4-byte Reload - leal (%rbx,%r13), %ebx - movl %ebx, 896(%rsp) # 4-byte Spill - movl 32(%rsp), %ebx # 4-byte Reload - leal (%rbx,%r13), %r12d - movl 24(%rsp), %ebx # 4-byte Reload - leal (%rbx,%r13), %ebx - movl %ebx, 992(%rsp) # 4-byte Spill - movslq %edx, %rdx - movq %rdx, 1184(%rsp) # 8-byte Spill - movslq %ecx, %rbx - movq %rbx, 1056(%rsp) # 8-byte Spill - movslq %esi, %rcx - movq %rcx, 1120(%rsp) # 8-byte Spill - vmovupd (%rax,%rbx), %xmm0 - movq %rbx, %rsi - vmovupd 16(%rax,%rdx), %xmm2 - vmovupd (%rax,%rdx), %xmm3 - movslq %ebp, %rdx - movq %rdx, 1152(%rsp) # 8-byte Spill - vmovupd 16(%rax,%rdx), %xmm1 - vmovupd (%rax,%rdx), %xmm4 - vinsertf128 $1, %xmm1, %ymm4, %ymm1 - vinsertf128 $1, %xmm2, %ymm3, %ymm2 - movslq %edi, %rdx - movq %rdx, 928(%rsp) # 8-byte Spill - movslq %r8d, %rbx - movslq %r14d, %r14 - vmovupd 16(%rax,%rsi), %xmm3 - vmovupd 16(%rax,%rcx), %xmm4 - vmovupd (%rax,%rcx), %xmm5 - movslq %r11d, %rcx - movq %rcx, 1088(%rsp) # 8-byte Spill - vmovupd 16(%rax,%rcx), %xmm6 - vmovupd (%rax,%rcx), %xmm7 - vinsertf128 $1, %xmm6, %ymm7, %ymm6 - vinsertf128 $1, %xmm4, %ymm5, %ymm7 - vaddpd %ymm1, %ymm2, %ymm1 - vinsertf128 $1, %xmm3, %ymm0, %ymm3 - movslq %r10d, %rsi - movq %rsi, 864(%rsp) # 8-byte Spill - vmovupd (%rax,%r14), %xmm5 - vmovupd (%rax,%rbx), %xmm4 - vmovupd (%rax,%rdx), %xmm2 - movslq %r15d, %rbp - movslq %r9d, %rcx - movq %rcx, 1048(%rsp) # 8-byte Spill - vmovupd 16(%rax,%rcx), %xmm0 - vmovupd (%rax,%rcx), %xmm9 - vaddpd %ymm6, %ymm7, %ymm7 - vinsertf128 $1, %xmm0, %ymm9, %ymm9 - vmovupd (%rax,%rbp), %xmm12 - vmovupd (%rax,%rsi), %xmm6 - vmovupd 16(%rax,%rdx), %xmm0 - vaddpd %ymm3, %ymm1, %ymm3 - vinsertf128 $1, 16(%rax,%rsi), %ymm6, %ymm6 - vinsertf128 $1, 16(%rax,%r14), %ymm5, %ymm5 - vinsertf128 $1, 16(%rax,%rbx), %ymm4, %ymm4 - vaddpd %ymm9, %ymm7, %ymm1 - vinsertf128 $1, %xmm0, %ymm2, %ymm2 - movslq 736(%rsp), %r8 # 4-byte Folded Reload - movslq 768(%rsp), %rdx # 4-byte Folded Reload - movslq 800(%rsp), %rdi # 4-byte Folded Reload - vmovupd (%rax,%rdi), %xmm10 - movslq 832(%rsp), %r15 # 4-byte Folded Reload - vmovupd (%rax,%r15), %xmm9 - vmovupd (%rax,%rdx), %xmm7 - vaddpd %ymm5, %ymm4, %ymm4 - vmovupd (%rax,%r8), %xmm11 - vaddpd %ymm6, %ymm3, %ymm5 - vinsertf128 $1, 16(%rax,%rdi), %ymm10, %ymm3 - vinsertf128 $1, 16(%rax,%rbp), %ymm12, %ymm10 - vinsertf128 $1, 16(%rax,%r15), %ymm9, %ymm0 - movslq 896(%rsp), %r11 # 4-byte Folded Reload - vaddpd %ymm2, %ymm1, %ymm1 - movslq 960(%rsp), %rcx # 4-byte Folded Reload - vmovupd (%rax,%rcx), %xmm6 - vaddpd %ymm0, %ymm1, %ymm1 - vinsertf128 $1, 16(%rax,%r8), %ymm11, %ymm2 - vinsertf128 $1, 16(%rax,%rdx), %ymm7, %ymm0 - movslq %r12d, %r12 - vaddpd %ymm10, %ymm5, %ymm7 - vmovupd (%rax,%r11), %xmm5 - vaddpd %ymm3, %ymm4, %ymm3 - vinsertf128 $1, 16(%rax,%r11), %ymm5, %ymm4 - vinsertf128 $1, 16(%rax,%rcx), %ymm6, %ymm9 - vmovupd (%rax,%r12), %xmm5 - movslq 992(%rsp), %rsi # 4-byte Folded Reload - vaddpd %ymm0, %ymm7, %ymm10 - vextractf128 $1, %ymm8, %xmm6 - vaddpd %ymm2, %ymm1, %ymm2 - vpshufd $80, %xmm6, %xmm7 # xmm7 = xmm6[0,0,1,1] - vmulpd %ymm9, %ymm15, %ymm1 - vmovupd (%rax,%rsi), %xmm9 - vaddpd %ymm4, %ymm3, %ymm3 - vinsertf128 $1, 16(%rax,%r12), %ymm5, %ymm4 - vpshufd $80, %xmm8, %xmm5 # xmm5 = xmm8[0,0,1,1] - vpshufd $-6, %xmm6, %xmm0 # xmm0 = xmm6[2,2,3,3] - vpshufd $-6, %xmm8, %xmm6 # xmm6 = xmm8[2,2,3,3] - vinsertf128 $1, %xmm6, %ymm5, %ymm6 - vinsertf128 $1, 16(%rax,%rsi), %ymm9, %ymm5 - vinsertf128 $1, %xmm0, %ymm7, %ymm8 - vmovupd %ymm8, 96(%rsp) # 32-byte Folded Spill - vmovupd 1344(%rsp), %ymm0 # 32-byte Folded Reload - vmovupd %ymm0, 1344(%rsp) # 32-byte Folded Spill - vmovupd %ymm0, 1344(%rsp) # 32-byte Folded Spill - vmulpd %ymm2, %ymm0, %ymm0 - vmulpd %ymm10, %ymm14, %ymm2 - movq 1480(%rsp), %r9 - vmaskmovpd (%r9,%rcx), %ymm6, %ymm7 - vaddpd %ymm1, %ymm2, %ymm1 - vaddpd %ymm1, %ymm0, %ymm0 - vaddpd %ymm4, %ymm3, %ymm3 - vmaskmovpd (%rax,%rcx), %ymm6, %ymm1 - vmulpd %ymm13, %ymm1, %ymm1 - movq 1464(%rsp), %r10 - vmaskmovpd (%r10,%rcx), %ymm6, %ymm2 - vsubpd %ymm7, %ymm1, %ymm1 - vmaskmovpd 32(%r10,%rcx), %ymm8, %ymm4 - vmovupd %ymm4, 992(%rsp) # 32-byte Folded Spill - vaddpd %ymm5, %ymm3, %ymm3 - vmovups 48(%rax,%rsi), %xmm4 - vmovaps %xmm4, 960(%rsp) # 16-byte Spill - vmovupd 1312(%rsp), %ymm4 # 32-byte Folded Reload - vmovupd %ymm4, 1312(%rsp) # 32-byte Folded Spill - vmovupd %ymm4, 1312(%rsp) # 32-byte Folded Spill - vmulpd %ymm3, %ymm4, %ymm3 - vmovups 32(%rax,%rsi), %xmm4 - vmovups %ymm4, 896(%rsp) # 32-byte Folded Spill - vaddpd %ymm3, %ymm0, %ymm0 - vmovups 48(%rax,%r12), %xmm3 - vmovaps %xmm3, 832(%rsp) # 16-byte Spill - vmulpd %ymm2, %ymm0, %ymm0 - vmovups 32(%rax,%r12), %xmm2 - vmovups %ymm2, 800(%rsp) # 32-byte Folded Spill - vaddpd %ymm0, %ymm1, %ymm0 - vmovupd %ymm0, 128(%rsp) # 32-byte Folded Spill - vmovups 48(%rax,%r11), %xmm0 - vmovaps %xmm0, 768(%rsp) # 16-byte Spill - vmovups 32(%rax,%r11), %xmm0 - vmovups %ymm0, 736(%rsp) # 32-byte Folded Spill - vmovups 48(%rax,%rdi), %xmm0 - vmovaps %xmm0, 704(%rsp) # 16-byte Spill - vmovups 32(%rax,%rdi), %xmm0 - vmovups %ymm0, 640(%rsp) # 32-byte Folded Spill - vmovups 48(%rax,%rbx), %xmm0 - vmovaps %xmm0, 592(%rsp) # 16-byte Spill - vmovups 32(%rax,%rbx), %xmm0 - vmovups %ymm0, 544(%rsp) # 32-byte Folded Spill - vmovups 48(%rax,%r14), %xmm0 - vmovaps %xmm0, 464(%rsp) # 16-byte Spill - vmovups 32(%rax,%r14), %xmm0 - vmovups %ymm0, 416(%rsp) # 32-byte Folded Spill - vmovups 48(%rax,%rdx), %xmm0 - vmovaps %xmm0, 400(%rsp) # 16-byte Spill - vmovups 32(%rax,%rdx), %xmm0 - vmovups %ymm0, 352(%rsp) # 32-byte Folded Spill - vmovups 48(%rax,%r8), %xmm0 - vmovaps %xmm0, 336(%rsp) # 16-byte Spill - vmovups 32(%rax,%r8), %xmm0 - vmovups %ymm0, 288(%rsp) # 32-byte Folded Spill - vmovups 48(%rax,%rbp), %xmm0 - vmovaps %xmm0, 272(%rsp) # 16-byte Spill - vmovups 32(%rax,%rbp), %xmm0 - vmovups %ymm0, 224(%rsp) # 32-byte Folded Spill - vmaskmovpd 32(%r9,%rcx), %ymm8, %ymm0 - vmovupd %ymm0, 672(%rsp) # 32-byte Folded Spill - vmaskmovpd 32(%rax,%rcx), %ymm8, %ymm0 - vmovupd %ymm0, 608(%rsp) # 32-byte Folded Spill - vmovups 48(%rax,%rcx), %xmm0 - vmovaps %xmm0, 528(%rsp) # 16-byte Spill - vmovups 32(%rax,%rcx), %xmm0 - vmovups %ymm0, 480(%rsp) # 32-byte Folded Spill - vmovups 48(%rax,%r15), %xmm0 - vmovaps %xmm0, 208(%rsp) # 16-byte Spill - vmovups 32(%rax,%r15), %xmm0 - vmovups %ymm0, 160(%rsp) # 32-byte Folded Spill - movq 864(%rsp), %rdx # 8-byte Reload - vmovups 48(%rax,%rdx), %xmm0 - vmovaps %xmm0, 80(%rsp) # 16-byte Spill - vmovups 32(%rax,%rdx), %xmm0 - vmovups %ymm0, 864(%rsp) # 32-byte Folded Spill - movq 928(%rsp), %rdx # 8-byte Reload - vmovupd 48(%rax,%rdx), %xmm4 - vmovupd 32(%rax,%rdx), %xmm9 - movq 1056(%rsp), %rdx # 8-byte Reload - vmovupd 48(%rax,%rdx), %xmm5 - vmovupd 32(%rax,%rdx), %xmm11 - movq 1048(%rsp), %rdx # 8-byte Reload - vmovupd 48(%rax,%rdx), %xmm13 - vmovupd 32(%rax,%rdx), %xmm7 - movq 1184(%rsp), %rdx # 8-byte Reload - vmovupd 48(%rax,%rdx), %xmm15 - vmovupd 32(%rax,%rdx), %xmm10 - movq 1152(%rsp), %rdx # 8-byte Reload - vmovupd 48(%rax,%rdx), %xmm12 - vmovupd 32(%rax,%rdx), %xmm14 - movq 1120(%rsp), %rdx # 8-byte Reload - vmovupd 48(%rax,%rdx), %xmm0 - vmovupd 32(%rax,%rdx), %xmm1 - movq 1088(%rsp), %rdx # 8-byte Reload - vmovupd 48(%rax,%rdx), %xmm2 - vmovupd 32(%rax,%rdx), %xmm3 - vmovupd 128(%rsp), %ymm8 # 32-byte Folded Reload - vmaskmovpd %ymm8, %ymm6, (%r9,%rcx) - vinsertf128 $1, %xmm2, %ymm3, %ymm2 - vinsertf128 $1, %xmm0, %ymm1, %ymm0 - vaddpd %ymm2, %ymm0, %ymm1 - vinsertf128 $1, %xmm12, %ymm14, %ymm0 - vinsertf128 $1, %xmm15, %ymm10, %ymm2 - vaddpd %ymm0, %ymm2, %ymm0 - vinsertf128 $1, %xmm13, %ymm7, %ymm2 - vinsertf128 $1, %xmm5, %ymm11, %ymm3 - vaddpd %ymm3, %ymm0, %ymm5 - vaddpd %ymm2, %ymm1, %ymm0 - vinsertf128 $1, %xmm4, %ymm9, %ymm1 - vaddpd %ymm1, %ymm0, %ymm0 - vmovupd 864(%rsp), %ymm1 # 32-byte Folded Reload - vinsertf128 $1, 80(%rsp), %ymm1, %ymm1 # 16-byte Folded Reload - vmovupd 160(%rsp), %ymm2 # 32-byte Folded Reload - vinsertf128 $1, 208(%rsp), %ymm2, %ymm2 # 16-byte Folded Reload - vaddpd %ymm2, %ymm0, %ymm0 - vaddpd %ymm1, %ymm5, %ymm1 - vmovupd 224(%rsp), %ymm2 # 32-byte Folded Reload - vinsertf128 $1, 272(%rsp), %ymm2, %ymm2 # 16-byte Folded Reload - vaddpd %ymm2, %ymm1, %ymm1 - vmovupd 288(%rsp), %ymm2 # 32-byte Folded Reload - vinsertf128 $1, 336(%rsp), %ymm2, %ymm2 # 16-byte Folded Reload - vmovupd 352(%rsp), %ymm3 # 32-byte Folded Reload - vinsertf128 $1, 400(%rsp), %ymm3, %ymm3 # 16-byte Folded Reload - vaddpd %ymm3, %ymm1, %ymm1 - vaddpd %ymm2, %ymm0, %ymm2 - vmovupd 416(%rsp), %ymm0 # 32-byte Folded Reload - vinsertf128 $1, 464(%rsp), %ymm0, %ymm0 # 16-byte Folded Reload - vmovupd 544(%rsp), %ymm3 # 32-byte Folded Reload - vinsertf128 $1, 592(%rsp), %ymm3, %ymm3 # 16-byte Folded Reload - vaddpd %ymm0, %ymm3, %ymm0 - vmovupd 640(%rsp), %ymm3 # 32-byte Folded Reload - vinsertf128 $1, 704(%rsp), %ymm3, %ymm3 # 16-byte Folded Reload - vaddpd %ymm3, %ymm0, %ymm0 - vmovupd 1344(%rsp), %ymm3 # 32-byte Folded Reload - vmulpd %ymm2, %ymm3, %ymm2 - vmovupd -64(%rsp), %ymm3 # 32-byte Folded Reload - vmulpd %ymm1, %ymm3, %ymm1 - vmovapd %ymm3, %ymm14 - vmovupd 736(%rsp), %ymm3 # 32-byte Folded Reload - vinsertf128 $1, 768(%rsp), %ymm3, %ymm3 # 16-byte Folded Reload - vmovupd 480(%rsp), %ymm4 # 32-byte Folded Reload - vinsertf128 $1, 528(%rsp), %ymm4, %ymm4 # 16-byte Folded Reload - vmovupd -32(%rsp), %ymm5 # 32-byte Folded Reload - vmulpd %ymm4, %ymm5, %ymm4 - vmovapd %ymm5, %ymm15 - vaddpd %ymm4, %ymm1, %ymm1 - vmovapd .LCPI0_2(%rip), %ymm5 - vmovupd 608(%rsp), %ymm4 # 32-byte Folded Reload - vmulpd %ymm5, %ymm4, %ymm4 - vmovapd %ymm5, %ymm13 - vaddpd %ymm1, %ymm2, %ymm2 - vsubpd 672(%rsp), %ymm4, %ymm1 # 32-byte Folded Reload - vaddpd %ymm3, %ymm0, %ymm0 - vmovupd 800(%rsp), %ymm3 # 32-byte Folded Reload - vinsertf128 $1, 832(%rsp), %ymm3, %ymm3 # 16-byte Folded Reload - vaddpd %ymm3, %ymm0, %ymm0 - vmovupd 896(%rsp), %ymm3 # 32-byte Folded Reload - vinsertf128 $1, 960(%rsp), %ymm3, %ymm3 # 16-byte Folded Reload - vaddpd %ymm3, %ymm0, %ymm0 - vmovupd 1312(%rsp), %ymm3 # 32-byte Folded Reload - vmulpd %ymm0, %ymm3, %ymm0 - vaddpd %ymm0, %ymm2, %ymm0 - vmulpd 992(%rsp), %ymm0, %ymm0 # 32-byte Folded Reload - vaddpd %ymm0, %ymm1, %ymm0 - vmovupd 96(%rsp), %ymm1 # 32-byte Folded Reload - vmaskmovpd %ymm0, %ymm1, 32(%r9,%rcx) -.LBB0_14: # %safe_if_after_true.us - # in Loop: Header=BB0_12 Depth=3 - addl $64, %r13d - movl 1216(%rsp), %ecx # 4-byte Reload - addl $8, %ecx - cmpl 1308(%rsp), %ecx # 4-byte Folded Reload - jl .LBB0_12 -# BB#15: # %for_exit40.us - # in Loop: Header=BB0_16 Depth=2 - movl -80(%rsp), %edx # 4-byte Reload - addl -76(%rsp), %edx # 4-byte Folded Reload - movl -84(%rsp), %edi # 4-byte Reload - incl %edi - cmpl -72(%rsp), %edi # 4-byte Folded Reload - movl -92(%rsp), %ecx # 4-byte Reload - movl -88(%rsp), %esi # 4-byte Reload - jne .LBB0_16 -.LBB0_11: # %for_exit33 - # in Loop: Header=BB0_9 Depth=1 - movl -100(%rsp), %edx # 4-byte Reload - addl -104(%rsp), %edx # 4-byte Folded Reload - movl %edx, -100(%rsp) # 4-byte Spill - incl %esi - cmpl %ecx, %esi - jne .LBB0_9 - jmp .LBB0_6 -.LBB0_1: # %for_test264.preheader - cmpl %r9d, %r8d - jge .LBB0_6 -# BB#2: # %for_test275.preheader.lr.ph - leal 2(%r8), %r13d - movl %esi, %r10d - imull %r10d, %r13d - movl %r10d, %ecx - imull %r8d, %ecx - movl %edx, %esi - movl %esi, -96(%rsp) # 4-byte Spill - leal (%rsi,%rcx), %r15d - movl %r9d, -92(%rsp) # 4-byte Spill - leal 2(%rsi,%rcx), %edx - movl %edx, 1248(%rsp) # 4-byte Spill - leal -1(%rsi,%rcx), %edx - movl %edx, 1344(%rsp) # 4-byte Spill - leal 3(%rsi,%rcx), %r12d - leal -2(%rsi,%rcx), %edx - movl %edx, 1312(%rsp) # 4-byte Spill - leal -3(%rsi,%rcx), %edi - addl %esi, %r13d - leal 1(%rsi,%rcx), %ecx - leal -3(%r8), %r14d - imull %r10d, %r14d - leal -2(%r8), %r9d - imull %r10d, %r9d - leal 3(%r8), %ebx - imull %r10d, %ebx - leal -1(%r8), %ebp - imull %r10d, %ebp - leal 1(%r8), %edx - imull %r10d, %edx - addl %esi, %edx - addl %esi, %ebp - addl %esi, %ebx - addl %esi, %r9d - addl %esi, %r14d - vmovd 1308(%rsp), %xmm5 # 4-byte Folded Reload - movl 1440(%rsp), %r11d - imull %r11d, %ecx - movl %ecx, 1184(%rsp) # 4-byte Spill - imull %r11d, %r13d - imull %r11d, %edi - movl %edi, 1216(%rsp) # 4-byte Spill - movl 1312(%rsp), %ecx # 4-byte Reload - imull %r11d, %ecx - movl %ecx, 1312(%rsp) # 4-byte Spill - imull %r11d, %r12d - movl 1344(%rsp), %esi # 4-byte Reload - imull %r11d, %esi - movl %esi, 1344(%rsp) # 4-byte Spill - movl 1248(%rsp), %ecx # 4-byte Reload - imull %r11d, %ecx - imull %r11d, %r15d - movl -68(%rsp), %esi # 4-byte Reload - leal (,%rsi,8), %esi - imull %r11d, %r14d - imull %r11d, %r9d - imull %r11d, %ebx - imull %r11d, %ebp - imull %r11d, %edx - leal -16(%rsi,%r15,8), %edi - movl %edi, 672(%rsp) # 4-byte Spill - leal (%rsi,%r15,8), %edi - movl %edi, 640(%rsp) # 4-byte Spill - movl 1184(%rsp), %edi # 4-byte Reload - leal (%rsi,%rdi,8), %edi - movl %edi, 608(%rsp) # 4-byte Spill - movl %r8d, %edi - leal (%rsi,%rcx,8), %ecx - movl %ecx, 592(%rsp) # 4-byte Spill - movl 1344(%rsp), %ecx # 4-byte Reload - leal (%rsi,%rcx,8), %ecx - movl %ecx, 544(%rsp) # 4-byte Spill - leal (%rsi,%r12,8), %ecx - movl %ecx, 528(%rsp) # 4-byte Spill - movl 1312(%rsp), %ecx # 4-byte Reload - leal (%rsi,%rcx,8), %ecx - movl %ecx, 480(%rsp) # 4-byte Spill - movl 1216(%rsp), %ecx # 4-byte Reload - leal (%rsi,%rcx,8), %ecx - movl %ecx, 464(%rsp) # 4-byte Spill - leal (%rsi,%rdx,8), %ecx - movl %ecx, 416(%rsp) # 4-byte Spill - leal (%rsi,%r13,8), %ecx - movl %ecx, 400(%rsp) # 4-byte Spill - leal (%rsi,%rbp,8), %ecx - movl %ecx, 352(%rsp) # 4-byte Spill - leal (%rsi,%rbx,8), %ecx - movl %ecx, 336(%rsp) # 4-byte Spill - leal (%rsi,%r9,8), %ecx - movl %ecx, 288(%rsp) # 4-byte Spill - leal (%rsi,%r14,8), %ecx - movl %ecx, 272(%rsp) # 4-byte Spill - movl $0, 160(%rsp) # 4-byte Folded Spill - imull %r11d, %r10d - shll $3, %r11d - movl %r11d, -76(%rsp) # 4-byte Spill - shll $3, %r10d - movl %r10d, -104(%rsp) # 4-byte Spill - vpermilpd $0, %xmm1, %xmm6 # xmm6 = xmm1[0,0] - vpermilpd $0, %xmm3, %xmm3 # xmm3 = xmm3[0,0] - vpermilpd $0, %xmm2, %xmm1 # xmm1 = xmm2[0,0] - vmovaps %ymm0, %ymm8 - vmovups %ymm8, 704(%rsp) # 32-byte Folded Spill - vextractf128 $1, %ymm8, %xmm7 - vpshufd $80, %xmm8, %xmm0 # xmm0 = xmm8[0,0,1,1] - vinsertf128 $1, %xmm6, %ymm6, %ymm13 - vpshufd $80, %xmm7, %xmm2 # xmm2 = xmm7[0,0,1,1] - vinsertf128 $1, %xmm3, %ymm3, %ymm15 - vpshufd $-6, %xmm7, %xmm3 # xmm3 = xmm7[2,2,3,3] - vinsertf128 $1, %xmm1, %ymm1, %ymm10 - vpshufd $-6, %xmm8, %xmm1 # xmm1 = xmm8[2,2,3,3] - vpshufd $0, %xmm5, %xmm7 # xmm7 = xmm5[0,0,0,0] - vpermilpd $0, %xmm4, %xmm4 # xmm4 = xmm4[0,0] - vinsertf128 $1, %xmm4, %ymm4, %ymm4 - vmovupd %ymm4, 1344(%rsp) # 32-byte Folded Spill - vinsertf128 $1, %xmm3, %ymm2, %ymm5 - vinsertf128 $1, %xmm1, %ymm0, %ymm6 - vinsertf128 $1, %xmm7, %ymm7, %ymm0 - vmovups %ymm0, 1312(%rsp) # 32-byte Folded Spill - vmovapd .LCPI0_2(%rip), %ymm14 - .align 16, 0x90 -.LBB0_3: # %for_test275.preheader - # =>This Loop Header: Depth=1 - # Child Loop BB0_21 Depth 2 - # Child Loop BB0_17 Depth 3 - movl %edi, -88(%rsp) # 4-byte Spill - movl -96(%rsp), %ecx # 4-byte Reload - cmpl -72(%rsp), %ecx # 4-byte Folded Reload - jge .LBB0_5 -# BB#4: # %for_test286.preheader.lr.ph - # in Loop: Header=BB0_3 Depth=1 - movl -68(%rsp), %ecx # 4-byte Reload - cmpl 1308(%rsp), %ecx # 4-byte Folded Reload - movl 160(%rsp), %ecx # 4-byte Reload - movl -96(%rsp), %edx # 4-byte Reload - jge .LBB0_5 - .align 16, 0x90 -.LBB0_21: # %for_loop288.lr.ph.us - # Parent Loop BB0_3 Depth=1 - # => This Loop Header: Depth=2 - # Child Loop BB0_17 Depth 3 - movl %edx, 208(%rsp) # 4-byte Spill - movl %ecx, 224(%rsp) # 4-byte Spill - movl %ecx, %r9d - movl -68(%rsp), %r15d # 4-byte Reload - .align 16, 0x90 -.LBB0_17: # %for_loop288.us - # Parent Loop BB0_3 Depth=1 - # Parent Loop BB0_21 Depth=2 - # => This Inner Loop Header: Depth=3 - vmovups 1312(%rsp), %ymm3 # 32-byte Folded Reload - vmovups %ymm3, 1312(%rsp) # 32-byte Folded Spill - vextractf128 $1, %ymm3, %xmm0 - vmovd %r15d, %xmm1 - vpshufd $0, %xmm1, %xmm1 # xmm1 = xmm1[0,0,0,0] - vpaddd .LCPI0_0(%rip), %xmm1, %xmm2 - vpcmpgtd %xmm2, %xmm0, %xmm0 - vpaddd .LCPI0_1(%rip), %xmm1, %xmm1 - vpcmpgtd %xmm1, %xmm3, %xmm1 - vinsertf128 $1, %xmm0, %ymm1, %ymm0 - vandps 704(%rsp), %ymm0, %ymm11 # 32-byte Folded Reload - vmovmskps %ymm11, %ecx - testl %ecx, %ecx - je .LBB0_19 -# BB#18: # %safe_if_run_true467.us - # in Loop: Header=BB0_17 Depth=3 - movl 640(%rsp), %r11d # 4-byte Reload - leal 24(%r11,%r9), %ecx - movl 528(%rsp), %edx # 4-byte Reload - leal (%rdx,%r9), %ebx - leal 8(%r11,%r9), %edx - movl %edx, 1088(%rsp) # 4-byte Spill - movl 608(%rsp), %edx # 4-byte Reload - leal (%rdx,%r9), %edx - movl %edx, 1048(%rsp) # 4-byte Spill - movl 480(%rsp), %edx # 4-byte Reload - leal (%rdx,%r9), %edx - movl %edx, 768(%rsp) # 4-byte Spill - movl 464(%rsp), %edx # 4-byte Reload - leal (%rdx,%r9), %r14d - movl 592(%rsp), %esi # 4-byte Reload - leal (%rsi,%r9), %esi - movl 672(%rsp), %edi # 4-byte Reload - leal (%rdi,%r9), %ebp - leal 16(%r11,%r9), %r12d - leal -8(%rdi,%r9), %r13d - movl 336(%rsp), %edx # 4-byte Reload - leal (%rdx,%r9), %edx - movl %edx, 832(%rsp) # 4-byte Spill - movl 272(%rsp), %edx # 4-byte Reload - leal (%rdx,%r9), %edx - movl %edx, 800(%rsp) # 4-byte Spill - leal 8(%rdi,%r9), %r10d - movl 544(%rsp), %r8d # 4-byte Reload - leal (%r8,%r9), %edx - movl %edx, 960(%rsp) # 4-byte Spill - leal (%r11,%r9), %edx - movl %edx, 928(%rsp) # 4-byte Spill - movl 416(%rsp), %edi # 4-byte Reload - leal (%rdi,%r9), %edx - movl %edx, 896(%rsp) # 4-byte Spill - movl 400(%rsp), %edi # 4-byte Reload - leal (%rdi,%r9), %edx - movl %edx, 864(%rsp) # 4-byte Spill - movl 288(%rsp), %edx # 4-byte Reload - leal (%rdx,%r9), %edx - movl %edx, 992(%rsp) # 4-byte Spill - movl 352(%rsp), %edi # 4-byte Reload - leal (%rdi,%r9), %r8d - movslq %ecx, %rcx - movq %rcx, 1184(%rsp) # 8-byte Spill - vmaskmovpd (%rax,%rcx), %ymm6, %ymm0 - movslq %r13d, %rcx - movq %rcx, 1152(%rsp) # 8-byte Spill - vmaskmovpd (%rax,%rcx), %ymm6, %ymm1 - vaddpd %ymm0, %ymm1, %ymm0 - movslq %r12d, %rcx - movq %rcx, 1248(%rsp) # 8-byte Spill - movslq %ebx, %rdx - movq %rdx, 1120(%rsp) # 8-byte Spill - vmaskmovpd (%rax,%rdx), %ymm6, %ymm1 - vaddpd %ymm1, %ymm0, %ymm0 - vmaskmovpd (%rax,%rcx), %ymm6, %ymm1 - movslq %ebp, %rcx - movq %rcx, 1216(%rsp) # 8-byte Spill - vmaskmovpd (%rax,%rcx), %ymm6, %ymm2 - vaddpd %ymm1, %ymm2, %ymm1 - movslq %esi, %rsi - movq %rsi, 1056(%rsp) # 8-byte Spill - movslq %r14d, %rdx - vmaskmovpd (%rax,%rdx), %ymm6, %ymm2 - vaddpd %ymm2, %ymm0, %ymm0 - movslq 768(%rsp), %rcx # 4-byte Folded Reload - movslq 1048(%rsp), %rdi # 4-byte Folded Reload - movq %rdi, 1048(%rsp) # 8-byte Spill - vmaskmovpd (%rax,%rsi), %ymm6, %ymm2 - movslq 1088(%rsp), %rsi # 4-byte Folded Reload - movq %rsi, 1088(%rsp) # 8-byte Spill - vmaskmovpd (%rax,%rsi), %ymm6, %ymm3 - movslq %r10d, %r11 - vmaskmovpd (%rax,%r11), %ymm6, %ymm4 - vaddpd %ymm3, %ymm4, %ymm3 - vaddpd %ymm2, %ymm1, %ymm1 - movslq 800(%rsp), %rsi # 4-byte Folded Reload - vmaskmovpd (%rax,%rdi), %ymm6, %ymm7 - vmaskmovpd (%rax,%rcx), %ymm6, %ymm2 - movslq 832(%rsp), %rdi # 4-byte Folded Reload - vmaskmovpd (%rax,%rdi), %ymm6, %ymm8 - vpshufd $80, %xmm11, %xmm4 # xmm4 = xmm11[0,0,1,1] - vaddpd %ymm8, %ymm0, %ymm0 - vaddpd %ymm2, %ymm1, %ymm2 - vaddpd %ymm7, %ymm3, %ymm3 - vmaskmovpd (%rax,%rsi), %ymm6, %ymm1 - movslq 864(%rsp), %r12 # 4-byte Folded Reload - movslq 896(%rsp), %rbx # 4-byte Folded Reload - vpshufd $-6, %xmm11, %xmm7 # xmm7 = xmm11[2,2,3,3] - vinsertf128 $1, %xmm7, %ymm4, %ymm12 - movslq 928(%rsp), %r13 # 4-byte Folded Reload - movslq 960(%rsp), %r10 # 4-byte Folded Reload - vmaskmovpd (%rax,%r10), %ymm6, %ymm4 - vaddpd %ymm4, %ymm3, %ymm4 - vmaskmovpd (%rax,%r13), %ymm12, %ymm7 - vmaskmovpd (%rax,%rbx), %ymm6, %ymm8 - vextractf128 $1, %ymm11, %xmm3 - vmaskmovpd (%rax,%r12), %ymm6, %ymm9 - vaddpd %ymm9, %ymm2, %ymm2 - movslq 992(%rsp), %rbp # 4-byte Folded Reload - vmaskmovpd (%rax,%rbp), %ymm6, %ymm9 - vaddpd %ymm9, %ymm2, %ymm2 - vaddpd %ymm1, %ymm0, %ymm1 - vmulpd %ymm14, %ymm7, %ymm0 - vaddpd %ymm8, %ymm4, %ymm4 - vmaskmovpd (%rax,%r13), %ymm6, %ymm7 - movslq %r8d, %r8 - vmaskmovpd (%rax,%r8), %ymm6, %ymm8 - vaddpd %ymm8, %ymm4, %ymm8 - vmovapd %ymm10, %ymm14 - vmulpd %ymm7, %ymm14, %ymm7 - vpshufd $-6, %xmm3, %xmm4 # xmm4 = xmm3[2,2,3,3] - vpshufd $80, %xmm3, %xmm3 # xmm3 = xmm3[0,0,1,1] - movq 1480(%rsp), %r14 - vmaskmovpd (%r14,%r13), %ymm12, %ymm9 - vsubpd %ymm9, %ymm0, %ymm0 - vmulpd %ymm1, %ymm13, %ymm1 - vmulpd %ymm2, %ymm15, %ymm2 - vmovupd 1344(%rsp), %ymm9 # 32-byte Folded Reload - vmovupd %ymm9, 1344(%rsp) # 32-byte Folded Spill - vmulpd %ymm8, %ymm9, %ymm8 - vaddpd %ymm7, %ymm8, %ymm7 - vmaskmovpd 32(%rax,%rsi), %ymm5, %ymm8 - vmovupd %ymm8, 992(%rsp) # 32-byte Folded Spill - vinsertf128 $1, %xmm4, %ymm3, %ymm11 - vmaskmovpd 32(%rax,%rdi), %ymm5, %ymm3 - vmovupd %ymm3, 960(%rsp) # 32-byte Folded Spill - vaddpd %ymm7, %ymm2, %ymm2 - vmaskmovpd 32(%rax,%rdx), %ymm5, %ymm3 - vmovupd %ymm3, 928(%rsp) # 32-byte Folded Spill - vaddpd %ymm1, %ymm2, %ymm1 - movq 1464(%rsp), %rdx - vmaskmovpd (%rdx,%r13), %ymm12, %ymm2 - vmulpd %ymm2, %ymm1, %ymm1 - movq 1120(%rsp), %rsi # 8-byte Reload - vmaskmovpd 32(%rax,%rsi), %ymm5, %ymm2 - vmovupd %ymm2, 1120(%rsp) # 32-byte Folded Spill - vaddpd %ymm1, %ymm0, %ymm0 - vmovupd %ymm0, 736(%rsp) # 32-byte Folded Spill - vmaskmovpd 32(%rax,%r8), %ymm5, %ymm0 - vmovupd %ymm0, 896(%rsp) # 32-byte Folded Spill - movq 1184(%rsp), %rsi # 8-byte Reload - vmaskmovpd 32(%rax,%rsi), %ymm5, %ymm0 - vmovupd %ymm0, 1184(%rsp) # 32-byte Folded Spill - movq 1152(%rsp), %rsi # 8-byte Reload - vmaskmovpd 32(%rax,%rsi), %ymm5, %ymm0 - vmovupd %ymm0, 1152(%rsp) # 32-byte Folded Spill - vmaskmovpd 32(%rax,%rbp), %ymm5, %ymm0 - vmovupd %ymm0, 832(%rsp) # 32-byte Folded Spill - vmaskmovpd 32(%rax,%r12), %ymm5, %ymm0 - vmovupd %ymm0, 800(%rsp) # 32-byte Folded Spill - vmaskmovpd 32(%rax,%rcx), %ymm5, %ymm0 - vmovupd %ymm0, 768(%rsp) # 32-byte Folded Spill - vmaskmovpd 32(%rax,%r13), %ymm5, %ymm0 - vmovupd %ymm0, 864(%rsp) # 32-byte Folded Spill - movq 1056(%rsp), %rcx # 8-byte Reload - vmaskmovpd 32(%rax,%rcx), %ymm5, %ymm0 - vmovupd %ymm0, 1056(%rsp) # 32-byte Folded Spill - vmaskmovpd 32(%rax,%rbx), %ymm5, %ymm7 - vmaskmovpd 32(%rax,%r10), %ymm5, %ymm10 - movq 1048(%rsp), %rcx # 8-byte Reload - vmaskmovpd 32(%rax,%rcx), %ymm5, %ymm0 - movq 1088(%rsp), %rcx # 8-byte Reload - vmaskmovpd 32(%rax,%rcx), %ymm5, %ymm1 - vmaskmovpd 32(%rax,%r11), %ymm5, %ymm2 - movq 1248(%rsp), %rcx # 8-byte Reload - vmaskmovpd 32(%rax,%rcx), %ymm5, %ymm3 - movq 1216(%rsp), %rcx # 8-byte Reload - vmaskmovpd 32(%rax,%rcx), %ymm5, %ymm4 - vmaskmovpd 32(%rdx,%r13), %ymm11, %ymm8 - vmovupd %ymm8, 1248(%rsp) # 32-byte Folded Spill - vmaskmovpd 32(%r14,%r13), %ymm11, %ymm8 - vmovupd %ymm8, 1216(%rsp) # 32-byte Folded Spill - vmaskmovpd 32(%rax,%r13), %ymm11, %ymm8 - vmovupd %ymm8, 1088(%rsp) # 32-byte Folded Spill - vmovupd 736(%rsp), %ymm8 # 32-byte Folded Reload - vmaskmovpd %ymm8, %ymm12, (%r14,%r13) - vaddpd %ymm3, %ymm4, %ymm3 - vaddpd %ymm1, %ymm2, %ymm1 - vaddpd %ymm0, %ymm1, %ymm0 - vaddpd %ymm10, %ymm0, %ymm0 - vaddpd %ymm7, %ymm0, %ymm1 - vaddpd 1056(%rsp), %ymm3, %ymm0 # 32-byte Folded Reload - vaddpd 768(%rsp), %ymm0, %ymm0 # 32-byte Folded Reload - vaddpd 800(%rsp), %ymm0, %ymm0 # 32-byte Folded Reload - vaddpd 832(%rsp), %ymm0, %ymm2 # 32-byte Folded Reload - vmovupd 1152(%rsp), %ymm0 # 32-byte Folded Reload - vaddpd 1184(%rsp), %ymm0, %ymm0 # 32-byte Folded Reload - vmulpd %ymm2, %ymm15, %ymm2 - vaddpd 896(%rsp), %ymm1, %ymm1 # 32-byte Folded Reload - vmulpd %ymm1, %ymm9, %ymm1 - vmulpd 864(%rsp), %ymm14, %ymm3 # 32-byte Folded Reload - vmovapd %ymm14, %ymm10 - vaddpd %ymm3, %ymm1, %ymm3 - vmovapd .LCPI0_2(%rip), %ymm4 - vmovupd 1088(%rsp), %ymm1 # 32-byte Folded Reload - vmulpd %ymm4, %ymm1, %ymm1 - vmovapd %ymm4, %ymm14 - vsubpd 1216(%rsp), %ymm1, %ymm1 # 32-byte Folded Reload - vaddpd %ymm3, %ymm2, %ymm2 - vaddpd 1120(%rsp), %ymm0, %ymm0 # 32-byte Folded Reload - vaddpd 928(%rsp), %ymm0, %ymm0 # 32-byte Folded Reload - vaddpd 960(%rsp), %ymm0, %ymm0 # 32-byte Folded Reload - vaddpd 992(%rsp), %ymm0, %ymm0 # 32-byte Folded Reload - vmulpd %ymm0, %ymm13, %ymm0 - vaddpd %ymm0, %ymm2, %ymm0 - vmulpd 1248(%rsp), %ymm0, %ymm0 # 32-byte Folded Reload - vaddpd %ymm0, %ymm1, %ymm0 - vmaskmovpd %ymm0, %ymm11, 32(%r14,%r13) -.LBB0_19: # %safe_if_after_true466.us - # in Loop: Header=BB0_17 Depth=3 - addl $64, %r9d - addl $8, %r15d - cmpl 1308(%rsp), %r15d # 4-byte Folded Reload - jl .LBB0_17 -# BB#20: # %for_exit289.us - # in Loop: Header=BB0_21 Depth=2 - movl 224(%rsp), %ecx # 4-byte Reload - addl -76(%rsp), %ecx # 4-byte Folded Reload - movl 208(%rsp), %edx # 4-byte Reload - incl %edx - cmpl -72(%rsp), %edx # 4-byte Folded Reload - jne .LBB0_21 -.LBB0_5: # %for_exit278 - # in Loop: Header=BB0_3 Depth=1 - movl 160(%rsp), %ecx # 4-byte Reload - addl -104(%rsp), %ecx # 4-byte Folded Reload - movl %ecx, 160(%rsp) # 4-byte Spill - movl -88(%rsp), %edi # 4-byte Reload - incl %edi - movl -92(%rsp), %ecx # 4-byte Reload - cmpl %ecx, %edi - jne .LBB0_3 -.LBB0_6: # %for_exit - addq $1384, %rsp # imm = 0x568 - popq %rbx - popq %r12 - popq %r13 - popq %r14 - popq %r15 - popq %rbp - vzeroupper - ret -.Ltmp0: - .size stencil_step___uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_, .Ltmp0-stencil_step___uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_ - - .align 16, 0x90 - .type stencil_step_task___uniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_,@function -stencil_step_task___uniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_: # @stencil_step_task___uniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_ -# BB#0: # %allocas - pushq %rbp - pushq %r15 - pushq %r14 - pushq %rbx - subq $56, %rsp - movq %rdi, %rax - movl 16(%rax), %r8d - movq 56(%rax), %rbx - movq 48(%rax), %r15 - movq 40(%rax), %r14 - movq 32(%rax), %r11 - leal 1(%r8,%rcx), %r9d - movl 24(%rax), %r10d - vmovaps 64(%rax), %ymm0 - addl %ecx, %r8d - movl 20(%rax), %ebp - movl 12(%rax), %ecx - movl 8(%rax), %edx - movl (%rax), %edi - movl 4(%rax), %esi - vmovmskps %ymm0, %eax - cmpl $255, %eax - jne .LBB1_2 -# BB#1: # %all_on - vpcmpeqd %xmm0, %xmm0, %xmm0 - movq %rbx, 40(%rsp) - movq %r15, 32(%rsp) - movq %r14, 24(%rsp) - movq %r11, 16(%rsp) - movl %r10d, 8(%rsp) - movl %ebp, (%rsp) - vinsertf128 $1, %xmm0, %ymm0, %ymm0 - jmp .LBB1_3 -.LBB1_2: # %some_on - movq %rbx, 40(%rsp) - movq %r15, 32(%rsp) - movq %r14, 24(%rsp) - movq %r11, 16(%rsp) - movl %r10d, 8(%rsp) - movl %ebp, (%rsp) -.LBB1_3: # %some_on - callq stencil_step___uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_ - addq $56, %rsp - popq %rbx - popq %r14 - popq %r15 - popq %rbp - vzeroupper - ret -.Ltmp1: - .size stencil_step_task___uniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_, .Ltmp1-stencil_step_task___uniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_ - - .globl loop_stencil_ispc_tasks - .align 16, 0x90 - .type loop_stencil_ispc_tasks,@function -loop_stencil_ispc_tasks: # @loop_stencil_ispc_tasks -# BB#0: # %allocas - pushq %rbp - pushq %r15 - pushq %r14 - pushq %r13 - pushq %r12 - pushq %rbx - subq $104, %rsp - movl %r9d, 92(%rsp) # 4-byte Spill - movl %r8d, 88(%rsp) # 4-byte Spill - movl %ecx, 84(%rsp) # 4-byte Spill - movl %edx, 80(%rsp) # 4-byte Spill - movl %esi, %ebx - movl %edi, %ebp - movq $0, 96(%rsp) - cmpl %ebx, %ebp - jge .LBB2_10 -# BB#1: # %for_loop.lr.ph - movq 216(%rsp), %r13 - movl 168(%rsp), %r14d - movl 160(%rsp), %r12d - subl %r12d, %r14d - leaq 96(%rsp), %r15 - vpcmpeqd %xmm0, %xmm0, %xmm0 - vinsertf128 $1, %xmm0, %ymm0, %ymm1 - vmovups %ymm1, 32(%rsp) # 32-byte Folded Spill - vinsertf128 $1, %xmm0, %ymm0, %ymm0 - vmovups %ymm0, (%rsp) # 32-byte Folded Spill - .align 16, 0x90 -.LBB2_2: # %for_loop - # =>This Inner Loop Header: Depth=1 - movq %r15, %rdi - movl $96, %esi - movl $32, %edx - vzeroupper - callq ISPCAlloc - movq %rax, %rdx - movl 80(%rsp), %eax # 4-byte Reload - movl %eax, (%rdx) - movl 84(%rsp), %eax # 4-byte Reload - movl %eax, 4(%rdx) - movl 88(%rsp), %eax # 4-byte Reload - movl %eax, 8(%rdx) - movl 92(%rsp), %eax # 4-byte Reload - movl %eax, 12(%rdx) - movl %r12d, 16(%rdx) - movl 176(%rsp), %eax - movl %eax, 20(%rdx) - movl 184(%rsp), %eax - movl %eax, 24(%rdx) - testb $1, %bpl - movl 192(%rsp), %eax - movl %eax, 28(%rdx) - movq 200(%rsp), %rax - movq %rax, 32(%rdx) - movq 208(%rsp), %rax - movq %rax, 40(%rdx) - jne .LBB2_4 -# BB#3: # %if_then - # in Loop: Header=BB2_2 Depth=1 - movq %r13, 48(%rdx) - movq 224(%rsp), %rax - movq %rax, 56(%rdx) - vmovups 32(%rsp), %ymm0 # 32-byte Folded Reload - jmp .LBB2_5 - .align 16, 0x90 -.LBB2_4: # %if_else - # in Loop: Header=BB2_2 Depth=1 - movq 224(%rsp), %rax - movq %rax, 48(%rdx) - movq %r13, 56(%rdx) - vmovups (%rsp), %ymm0 # 32-byte Folded Reload -.LBB2_5: # %if_else - # in Loop: Header=BB2_2 Depth=1 - vmovaps %ymm0, 64(%rdx) - movq %r15, %rdi - movl $stencil_step_task___uniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_, %esi - movl %r14d, %ecx - movl $1, %r8d - movl $1, %r9d - vzeroupper - callq ISPCLaunch - movq 96(%rsp), %rdi - testq %rdi, %rdi - je .LBB2_7 -# BB#6: # %call_sync - # in Loop: Header=BB2_2 Depth=1 - callq ISPCSync - movq $0, 96(%rsp) -.LBB2_7: # %post_sync - # in Loop: Header=BB2_2 Depth=1 - incl %ebp - cmpl %ebp, %ebx - jne .LBB2_2 -# BB#8: # %for_exit - movq 96(%rsp), %rdi - testq %rdi, %rdi - je .LBB2_10 -# BB#9: # %call_sync72 - callq ISPCSync - movq $0, 96(%rsp) -.LBB2_10: # %post_sync73 - addq $104, %rsp - popq %rbx - popq %r12 - popq %r13 - popq %r14 - popq %r15 - popq %rbp - ret -.Ltmp2: - .size loop_stencil_ispc_tasks, .Ltmp2-loop_stencil_ispc_tasks - - - .section ".note.GNU-stack","",@progbits diff --git a/examples/stencil/stencil_cu_avx.bc b/examples/stencil/stencil_cu_avx.bc deleted file mode 100644 index d9338e7c..00000000 Binary files a/examples/stencil/stencil_cu_avx.bc and /dev/null differ diff --git a/examples/stencil/stencil_cu_avx.s b/examples/stencil/stencil_cu_avx.s deleted file mode 100644 index 774d0a55..00000000 --- a/examples/stencil/stencil_cu_avx.s +++ /dev/null @@ -1,214 +0,0 @@ - .file "stencil.ispc" - .text - .globl loop_stencil_ispc_tasks - .align 16, 0x90 - .type loop_stencil_ispc_tasks,@function -loop_stencil_ispc_tasks: # @loop_stencil_ispc_tasks -# BB#0: # %allocas - pushq %rbp - movq %rsp, %rbp - pushq %r15 - pushq %r14 - pushq %r13 - pushq %r12 - pushq %rbx - andq $-32, %rsp - subq $384, %rsp # imm = 0x180 - movl %r9d, 28(%rsp) # 4-byte Spill - movl %r8d, 24(%rsp) # 4-byte Spill - movl %ecx, 20(%rsp) # 4-byte Spill - movl %edx, %ebx - movl %esi, 16(%rsp) # 4-byte Spill - movl %edi, %r13d - movq $0, 352(%rsp) - cmpl %esi, %r13d - jge .LBB0_10 -# BB#1: # %for_loop.lr.ph - movl 24(%rbp), %r14d - movl 16(%rbp), %r15d - subl %r15d, %r14d - leaq 352(%rsp), %rax - .align 16, 0x90 -.LBB0_2: # %for_loop - # =>This Inner Loop Header: Depth=1 - movq %rax, %r12 - movq %r12, %rdi - movl $96, %esi - movl $32, %edx - callq CUDAAlloc - testb $1, %r13b - jne .LBB0_4 -# BB#3: # %if_then - # in Loop: Header=BB0_2 Depth=1 - movl %ebx, 252(%rsp) - leaq 252(%rsp), %rax - movq %rax, 256(%rsp) - movl 20(%rsp), %eax # 4-byte Reload - movl %eax, 248(%rsp) - leaq 248(%rsp), %rax - movq %rax, 264(%rsp) - movl 24(%rsp), %eax # 4-byte Reload - movl %eax, 244(%rsp) - leaq 244(%rsp), %rax - movq %rax, 272(%rsp) - movl 28(%rsp), %eax # 4-byte Reload - movl %eax, 240(%rsp) - leaq 240(%rsp), %rax - movq %rax, 280(%rsp) - movl %r15d, 236(%rsp) - leaq 236(%rsp), %rax - movq %rax, 288(%rsp) - movl 32(%rbp), %eax - movl %eax, 232(%rsp) - leaq 232(%rsp), %rax - movq %rax, 296(%rsp) - movl 40(%rbp), %eax - movl %eax, 228(%rsp) - leaq 228(%rsp), %rax - movq %rax, 304(%rsp) - movl 48(%rbp), %eax - movl %eax, 224(%rsp) - leaq 224(%rsp), %rax - movq %rax, 312(%rsp) - movq 56(%rbp), %rax - movq %rax, 216(%rsp) - leaq 216(%rsp), %rax - movq %rax, 320(%rsp) - movq 64(%rbp), %rax - movq %rax, 208(%rsp) - leaq 208(%rsp), %rax - movq %rax, 328(%rsp) - movq 72(%rbp), %rax - movq %rax, 200(%rsp) - leaq 200(%rsp), %rax - movq %rax, 336(%rsp) - movq 80(%rbp), %rax - movq %rax, 192(%rsp) - leaq 192(%rsp), %rax - movq %rax, 344(%rsp) - movl $1, 8(%rsp) - movl $1, (%rsp) - movq %r12, %rdi - movl $.L.module_str, %esi - movl $.L.ptx_str, %edx - movl $.L.func_str, %ecx - leaq 256(%rsp), %r8 - jmp .LBB0_5 - .align 16, 0x90 -.LBB0_4: # %if_else - # in Loop: Header=BB0_2 Depth=1 - movl %ebx, 92(%rsp) - leaq 92(%rsp), %rax - movq %rax, 96(%rsp) - movl 20(%rsp), %eax # 4-byte Reload - movl %eax, 88(%rsp) - leaq 88(%rsp), %rax - movq %rax, 104(%rsp) - movl 24(%rsp), %eax # 4-byte Reload - movl %eax, 84(%rsp) - leaq 84(%rsp), %rax - movq %rax, 112(%rsp) - movl 28(%rsp), %eax # 4-byte Reload - movl %eax, 80(%rsp) - leaq 80(%rsp), %rax - movq %rax, 120(%rsp) - movl %r15d, 76(%rsp) - leaq 76(%rsp), %rax - movq %rax, 128(%rsp) - movl 32(%rbp), %eax - movl %eax, 72(%rsp) - leaq 72(%rsp), %rax - movq %rax, 136(%rsp) - movl 40(%rbp), %eax - movl %eax, 68(%rsp) - leaq 68(%rsp), %rax - movq %rax, 144(%rsp) - movl 48(%rbp), %eax - movl %eax, 64(%rsp) - leaq 64(%rsp), %rax - movq %rax, 152(%rsp) - movq 56(%rbp), %rax - movq %rax, 56(%rsp) - leaq 56(%rsp), %rax - movq %rax, 160(%rsp) - movq 64(%rbp), %rax - movq %rax, 48(%rsp) - leaq 48(%rsp), %rax - movq %rax, 168(%rsp) - movq 80(%rbp), %rax - movq %rax, 40(%rsp) - leaq 40(%rsp), %rax - movq %rax, 176(%rsp) - movq 72(%rbp), %rax - movq %rax, 32(%rsp) - leaq 32(%rsp), %rax - movq %rax, 184(%rsp) - movl $1, 8(%rsp) - movl $1, (%rsp) - movq %r12, %rdi - movl $.L.module_str, %esi - movl $.L.ptx_str, %edx - movl $.L.func_str1, %ecx - leaq 96(%rsp), %r8 -.LBB0_5: # %if_else - # in Loop: Header=BB0_2 Depth=1 - movl %r14d, %r9d - callq CUDALaunch - movq 352(%rsp), %rdi - testq %rdi, %rdi - je .LBB0_7 -# BB#6: # %call_sync - # in Loop: Header=BB0_2 Depth=1 - callq ISPCSync - movq $0, 352(%rsp) -.LBB0_7: # %post_sync - # in Loop: Header=BB0_2 Depth=1 - incl %r13d - cmpl %r13d, 16(%rsp) # 4-byte Folded Reload - movq %r12, %rax - jne .LBB0_2 -# BB#8: # %for_exit - movq 352(%rsp), %rdi - testq %rdi, %rdi - je .LBB0_10 -# BB#9: # %call_sync113 - callq ISPCSync - movq $0, 352(%rsp) -.LBB0_10: # %post_sync114 - leaq -40(%rbp), %rsp - popq %rbx - popq %r12 - popq %r13 - popq %r14 - popq %r15 - popq %rbp - ret -.Ltmp0: - .size loop_stencil_ispc_tasks, .Ltmp0-loop_stencil_ispc_tasks - - .type .L.module_str,@object # @.module_str - .section .rodata,"a",@progbits -.L.module_str: - .asciz "stencil.ispc" - .size .L.module_str, 13 - - .type .L.ptx_str,@object # @.ptx_str - .align 16 -.L.ptx_str: - .asciz "//\n// Generated by LLVM NVPTX Back-End\n//\n\n.version 3.1\n.target sm_35, texmode_independent\n.address_size 64\n\n\t// .globl\tstencil_step_task\n // @stencil_step_task\n.entry stencil_step_task(\n\t.param .u32 stencil_step_task_param_0,\n\t.param .u32 stencil_step_task_param_1,\n\t.param .u32 stencil_step_task_param_2,\n\t.param .u32 stencil_step_task_param_3,\n\t.param .u32 stencil_step_task_param_4,\n\t.param .u32 stencil_step_task_param_5,\n\t.param .u32 stencil_step_task_param_6,\n\t.param .u32 stencil_step_task_param_7,\n\t.param .u64 .ptr .align 8 stencil_step_task_param_8,\n\t.param .u64 .ptr .align 8 stencil_step_task_param_9,\n\t.param .u64 .ptr .align 8 stencil_step_task_param_10,\n\t.param .u64 .ptr .align 8 stencil_step_task_param_11\n)\n{\n\t.reg .pred %p<396>;\n\t.reg .s16 %rc<396>;\n\t.reg .s16 %rs<396>;\n\t.reg .s32 %r<396>;\n\t.reg .s64 %rl<396>;\n\t.reg .f32 %f<396>;\n\t.reg .f64 %fl<396>;\n\n// BB#0: // %allocas\n\tmov.u32 \t%r12, %ctaid.x;\n\tld.param.u32 \t%r13, [stencil_step_task_param_4];\n\tadd.s32 \t%r16, %r12, %r13;\n\tadd.s32 \t%r0, %r16, 1;\n\tsetp.ge.s32 \t%p0, %r16, %r0;\n\t@%p0 bra \tBB0_11;\n// BB#1: // %for_test28.i.preheader.lr.ph\n\tld.param.u32 \t%r0, [stencil_step_task_param_0];\n\tld.param.u32 \t%r1, [stencil_step_task_param_1];\n\tld.param.u32 \t%r2, [stencil_step_task_param_2];\n\tld.param.u32 \t%r3, [stencil_step_task_param_3];\n\tld.param.u32 \t%r4, [stencil_step_task_param_5];\n\tld.param.u32 \t%r5, [stencil_step_task_param_6];\n\tmul.lo.s32 \t%r5, %r5, %r4;\n\tld.param.u64 \t%rl3, [stencil_step_task_param_8];\n\tld.f64 \t%fl0, [%rl3];\n\tld.f64 \t%fl1, [%rl3+8];\n\tld.param.u64 \t%rl0, [stencil_step_task_param_9];\n\tld.f64 \t%fl2, [%rl3+16];\n\tld.param.u64 \t%rl1, [stencil_step_task_param_10];\n\tld.param.u64 \t%rl2, [stencil_step_task_param_11];\n\tld.f64 \t%fl3, [%rl3+24];\n\tshl.b32 \t%r6, %r4, 1;\n\tmul.lo.s32 \t%r7, %r4, 3;\n\tmul.lo.s32 \t%r8, %r4, -3;\n\tshl.b32 \t%r9, %r5, 1;\n\tmul.lo.s32 \t%r10, %r5, 3;\n\tmul.lo.s32 \t%r11, %r5, -3;\n\tadd.s32 \t%r12, %r12, %r13;\n\tneg.s32 \t%r13, %r9;\n\tneg.s32 \t%r14, %r6;\n\tmov.u32 \t%r32, WARP_SZ;\nBB0_2: // %for_test28.i.preheader\n // =>This Loop Header: Depth=1\n // Child Loop BB0_9 Depth 2\n // Child Loop BB0_5 Depth 3\n\tmov.u32 \t%r15, %r16;\n\tsetp.ge.s32 \t%p0, %r2, %r3;\n\t@%p0 bra \tBB0_10;\n// BB#3: // %for_test35.i.preheader.lr.ph\n // in Loop: Header=BB0_2 Depth=1\n\tsetp.lt.s32 \t%p0, %r0, %r1;\n\t@%p0 bra \tBB0_4;\n\tbra.uni \tBB0_10;\nBB0_4: // in Loop: Header=BB0_2 Depth=1\n\tmul.lo.s32 \t%r16, %r15, %r5;\n\tmov.u32 \t%r17, %r2;\nBB0_9: // %for_loop37.i.lr.ph.us\n // Parent Loop BB0_2 Depth=1\n // => This Loop Header: Depth=2\n // Child Loop BB0_5 Depth 3\n\tmad.lo.s32 \t%r18, %r17, %r4, %r16;\n\tadd.s32 \t%r19, %r18, %r4;\n\tadd.s32 \t%r20, %r18, %r6;\n\tsub.s32 \t%r21, %r18, %r4;\n\tadd.s32 \t%r22, %r18, %r7;\n\tadd.s32 \t%r23, %r18, %r14;\n\tadd.s32 \t%r24, %r18, %r5;\n\tadd.s32 \t%r25, %r18, %r8;\n\tadd.s32 \t%r26, %r18, %r9;\n\tsub.s32 \t%r27, %r18, %r5;\n\tadd.s32 \t%r28, %r18, %r10;\n\tadd.s32 \t%r29, %r18, %r13;\n\tadd.s32 \t%r30, %r18, %r11;\n\tmov.u32 \t%r31, %r0;\nBB0_5: // %for_loop37.i.us\n // Parent Loop BB0_2 Depth=1\n // Parent Loop BB0_9 Depth=2\n // => This Inner Loop Header: Depth=3\n\tmov.u32 \t%r33, %tid.x;\n\tadd.s32 \t%r34, %r32, -1;\n\tand.b32 \t%r33, %r34, %r33;\n\tadd.s32 \t%r33, %r33, %r31;\n\tsetp.ge.s32 \t%p0, %r33, %r1;\n\t@%p0 bra \tBB0_7;\n// BB#6: // %pl_dolane.i.us\n // in Loop: Header=BB0_5 Depth=3\n\tadd.s32 \t%r34, %r18, %r33;\n\tshl.b32 \t%r34, %r34, 3;\n\tadd.s32 \t%r35, %r34, -8;\n\tcvt.s64.s32 \t%rl3, %r35;\n\tadd.s64 \t%rl3, %rl3, %rl1;\n\tld.f64 \t%fl4, [%rl3];\n\tadd.s32 \t%r35, %r34, 8;\n\tcvt.s64.s32 \t%rl3, %r35;\n\tadd.s64 \t%rl3, %rl3, %rl1;\n\tld.f64 \t%fl5, [%rl3];\n\tadd.s32 \t%r35, %r34, -16;\n\tcvt.s64.s32 \t%rl3, %r35;\n\tadd.s64 \t%rl3, %rl3, %rl1;\n\tld.f64 \t%fl6, [%rl3];\n\tadd.s32 \t%r35, %r34, 16;\n\tcvt.s64.s32 \t%rl3, %r35;\n\tadd.s64 \t%rl3, %rl3, %rl1;\n\tld.f64 \t%fl9, [%rl3];\n\tadd.s32 \t%r35, %r19, %r33;\n\tshl.b32 \t%r35, %r35, 3;\n\tcvt.s64.s32 \t%rl3, %r35;\n\tadd.s64 \t%rl3, %rl3, %rl1;\n\tld.f64 \t%fl8, [%rl3];\n\tadd.s32 \t%r35, %r34, -24;\n\tcvt.s64.s32 \t%rl3, %r35;\n\tadd.s64 \t%rl3, %rl3, %rl1;\n\tld.f64 \t%fl7, [%rl3];\n\tadd.s32 \t%r35, %r34, 24;\n\tcvt.s64.s32 \t%rl3, %r35;\n\tadd.s64 \t%rl3, %rl3, %rl1;\n\tld.f64 \t%fl10, [%rl3];\n\tadd.s32 \t%r35, %r20, %r33;\n\tshl.b32 \t%r35, %r35, 3;\n\tcvt.s64.s32 \t%rl3, %r35;\n\tadd.s64 \t%rl3, %rl3, %rl1;\n\tld.f64 \t%fl13, [%rl3];\n\tadd.s32 \t%r35, %r21, %r33;\n\tshl.b32 \t%r35, %r35, 3;\n\tcvt.s64.s32 \t%rl3, %r35;\n\tadd.s64 \t%rl3, %rl3, %rl1;\n\tld.f64 \t%fl12, [%rl3];\n\tadd.s32 \t%r35, %r22, %r33;\n\tshl.b32 \t%r35, %r35, 3;\n\tcvt.s64.s32 \t%rl3, %r35;\n\tadd.s64 \t%rl3, %rl3, %rl1;\n\tld.f64 \t%fl11, [%rl3];\n\tadd.s32 \t%r35, %r23, %r33;\n\tshl.b32 \t%r35, %r35, 3;\n\tcvt.s64.s32 \t%rl3, %r35;\n\tadd.s64 \t%rl3, %rl3, %rl1;\n\tld.f64 \t%fl16, [%rl3];\n\tadd.s32 \t%r35, %r24, %r33;\n\tshl.b32 \t%r35, %r35, 3;\n\tcvt.s64.s32 \t%rl3, %r35;\n\tadd.s64 \t%rl3, %rl3, %rl1;\n\tld.f64 \t%fl15, [%rl3];\n\tadd.s32 \t%r35, %r25, %r33;\n\tshl.b32 \t%r35, %r35, 3;\n\tcvt.s64.s32 \t%rl3, %r35;\n\tadd.s64 \t%rl3, %rl3, %rl1;\n\tld.f64 \t%fl14, [%rl3];\n\tadd.s32 \t%r35, %r26, %r33;\n\tshl.b32 \t%r35, %r35, 3;\n\tcvt.s64.s32 \t%rl3, %r35;\n\tadd.s64 \t%rl3, %rl3, %rl1;\n\tld.f64 \t%fl19, [%rl3];\n\tadd.s32 \t%r35, %r27, %r33;\n\tshl.b32 \t%r35, %r35, 3;\n\tcvt.s64.s32 \t%rl3, %r35;\n\tadd.s64 \t%rl3, %rl3, %rl1;\n\tld.f64 \t%fl18, [%rl3];\n\tadd.s32 \t%r35, %r28, %r33;\n\tshl.b32 \t%r35, %r35, 3;\n\tcvt.s64.s32 \t%rl3, %r35;\n\tadd.s64 \t%rl3, %rl3, %rl1;\n\tld.f64 \t%fl17, [%rl3];\n\tadd.s32 \t%r35, %r29, %r33;\n\tshl.b32 \t%r35, %r35, 3;\n\tcvt.s64.s32 \t%rl3, %r35;\n\tadd.s64 \t%rl3, %rl3, %rl1;\n\tld.f64 \t%fl24, [%rl3];\n\tcvt.s64.s32 \t%rl4, %r34;\n\tadd.s64 \t%rl3, %rl4, %rl1;\n\tld.f64 \t%fl21, [%rl3];\n\tadd.s32 \t%r33, %r30, %r33;\n\tshl.b32 \t%r33, %r33, 3;\n\tcvt.s64.s32 \t%rl3, %r33;\n\tadd.s64 \t%rl3, %rl3, %rl1;\n\tld.f64 \t%fl20, [%rl3];\n\tadd.s64 \t%rl3, %rl4, %rl2;\n\tld.f64 \t%fl23, [%rl3];\n\tadd.s64 \t%rl4, %rl4, %rl0;\n\tld.f64 \t%fl22, [%rl4];\n\tadd.f64 \t%fl25, %fl21, %fl21;\n\tsub.f64 \t%fl23, %fl25, %fl23;\n\tadd.f64 \t%fl6, %fl6, %fl9;\n\tadd.f64 \t%fl6, %fl6, %fl13;\n\tadd.f64 \t%fl6, %fl6, %fl16;\n\tadd.f64 \t%fl6, %fl6, %fl19;\n\tadd.f64 \t%fl6, %fl6, %fl24;\n\tadd.f64 \t%fl4, %fl4, %fl5;\n\tadd.f64 \t%fl4, %fl4, %fl8;\n\tadd.f64 \t%fl4, %fl4, %fl12;\n\tadd.f64 \t%fl4, %fl4, %fl15;\n\tadd.f64 \t%fl4, %fl4, %fl18;\n\tmul.f64 \t%fl5, %fl0, %fl21;\n\tfma.rn.f64 \t%fl4, %fl1, %fl4, %fl5;\n\tfma.rn.f64 \t%fl4, %fl2, %fl6, %fl4;\n\tadd.f64 \t%fl5, %fl7, %fl10;\n\tadd.f64 \t%fl5, %fl5, %fl11;\n\tadd.f64 \t%fl5, %fl5, %fl14;\n\tadd.f64 \t%fl5, %fl5, %fl17;\n\tadd.f64 \t%fl5, %fl5, %fl20;\n\tfma.rn.f64 \t%fl4, %fl3, %fl5, %fl4;\n\tfma.rn.f64 \t%fl4, %fl4, %fl22, %fl23;\n\tst.f64 \t[%rl3], %fl4;\nBB0_7: // %safe_if_after_true.i.us\n // in Loop: Header=BB0_5 Depth=3\n\tadd.s32 \t%r31, %r32, %r31;\n\tsetp.lt.s32 \t%p0, %r31, %r1;\n\t@%p0 bra \tBB0_5;\n// BB#8: // %for_exit38.i.us\n // in Loop: Header=BB0_9 Depth=2\n\tadd.s32 \t%r17, %r17, 1;\n\tsetp.eq.s32 \t%p0, %r17, %r3;\n\t@%p0 bra \tBB0_10;\n\tbra.uni \tBB0_9;\nBB0_10: // %for_exit31.i\n // in Loop: Header=BB0_2 Depth=1\n\tadd.s32 \t%r16, %r15, 1;\n\tsetp.ne.s32 \t%p0, %r15, %r12;\n\t@%p0 bra \tBB0_2;\nBB0_11: // %stencil_step___uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_.exit\n\tret;\n}\n\n" - .size .L.ptx_str, 7954 - - .type .L.func_str,@object # @.func_str - .align 16 -.L.func_str: - .asciz "stencil_step_task" - .size .L.func_str, 18 - - .type .L.func_str1,@object # @.func_str1 - .align 16 -.L.func_str1: - .asciz "stencil_step_task" - .size .L.func_str1, 18 - - - .section ".note.GNU-stack","",@progbits diff --git a/examples/stencil/stencil_cu_nvptx64.bc b/examples/stencil/stencil_cu_nvptx64.bc deleted file mode 100644 index 2f3c05da..00000000 Binary files a/examples/stencil/stencil_cu_nvptx64.bc and /dev/null differ diff --git a/examples/stencil/stencil_cu_nvptx64.cubin b/examples/stencil/stencil_cu_nvptx64.cubin deleted file mode 100644 index a7e9a38a..00000000 Binary files a/examples/stencil/stencil_cu_nvptx64.cubin and /dev/null differ diff --git a/examples/stencil/stencil_cu_nvptx64.ll b/examples/stencil/stencil_cu_nvptx64.ll deleted file mode 100644 index d0c5e824..00000000 --- a/examples/stencil/stencil_cu_nvptx64.ll +++ /dev/null @@ -1,269 +0,0 @@ -; ModuleID = 'stencil_cu_nvptx64.bc' -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" -target triple = "nvptx64" - -; Function Attrs: nounwind readnone -declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0 - -; Function Attrs: nounwind readnone -declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #0 - -; Function Attrs: nounwind readnone -declare i32 @llvm.nvvm.read.ptx.sreg.warpsize() #0 - -; Function Attrs: nounwind -define void @stencil_step_task(i32 %x0, i32 %x1, i32 %y0, i32 %y1, i32 %z0, i32 %Nx, i32 %Ny, i32 %Nz, double* nocapture %coef, double* %vsq, double* %Ain, double* %Aout) #1 { -allocas: - %bid.i.i = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2 - %add_z0_load_calltmp = add i32 %bid.i.i, %z0 - %bid.i.i21 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2 - %add_z0_load15_calltmp18 = add i32 %z0, 1 - %add_add_z0_load15_calltmp18_ = add i32 %add_z0_load15_calltmp18, %bid.i.i21 - %mul_Nx_load_Ny_load.i = mul i32 %Ny, %Nx - %coef_load_offset_load.i = load double* %coef, align 8 - %coef_load16_offset.i = getelementptr double* %coef, i64 1 - %coef_load16_offset_load.i = load double* %coef_load16_offset.i, align 8 - %coef_load19_offset.i = getelementptr double* %coef, i64 2 - %coef_load19_offset_load.i = load double* %coef_load19_offset.i, align 8 - %coef_load22_offset.i = getelementptr double* %coef, i64 3 - %coef_load22_offset_load.i = load double* %coef_load22_offset.i, align 8 - %less_z_load_z1_load.i161 = icmp slt i32 %add_z0_load_calltmp, %add_add_z0_load15_calltmp18_ - br i1 %less_z_load_z1_load.i161, label %for_test28.i.preheader.lr.ph, label %stencil_step___uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_.exit - -for_test28.i.preheader.lr.ph: ; preds = %allocas - %less_y_load_y1_load.i159 = icmp slt i32 %y0, %y1 - %less_xb_load_x1_load.i157 = icmp slt i32 %x0, %x1 - %x1_load199_broadcast_init.i = insertelement <1 x i32> undef, i32 %x1, i32 0 - %mul__Nx_load119.i = shl i32 %Nx, 1 - %mul__Nx_load167.i = mul i32 %Nx, 3 - %mul__Nx_load127.i = mul i32 %Nx, -2 - %Ain_load65_ptr2int.i = ptrtoint double* %Ain to i64 - %mul__Nx_load175.i = mul i32 %Nx, -3 - %mul__Nxy_load136.i = shl i32 %mul_Nx_load_Ny_load.i, 1 - %mul__Nxy_load184.i = mul i32 %mul_Nx_load_Ny_load.i, 3 - %mul__Nxy_load144.i = mul i32 %mul_Nx_load_Ny_load.i, -2 - %mul__Nxy_load192.i = mul i32 %mul_Nx_load_Ny_load.i, -3 - %Aout_load_ptr2int.i = ptrtoint double* %Aout to i64 - %vsq_load_ptr2int.i = ptrtoint double* %vsq to i64 - %0 = add i32 %bid.i.i21, %z0 - br label %for_test28.i.preheader - -for_test28.i.preheader: ; preds = %for_exit31.i, %for_test28.i.preheader.lr.ph - %z.0.i162 = phi i32 [ %add_z0_load_calltmp, %for_test28.i.preheader.lr.ph ], [ %z_load245_plus1.i, %for_exit31.i ] - br i1 %less_y_load_y1_load.i159, label %for_test35.i.preheader.lr.ph, label %for_exit31.i - -for_test35.i.preheader.lr.ph: ; preds = %for_test28.i.preheader - %mul_z_load45_Nxy_load.i = mul i32 %z.0.i162, %mul_Nx_load_Ny_load.i - br i1 %less_xb_load_x1_load.i157, label %for_loop37.i.lr.ph.us, label %for_exit31.i - -for_exit38.i.us: ; preds = %safe_if_after_true.i.us - %y_load244_plus1.i.us = add i32 %y.0.i160.us, 1 - %exitcond = icmp eq i32 %y_load244_plus1.i.us, %y1 - br i1 %exitcond, label %for_exit31.i, label %for_loop37.i.lr.ph.us - -for_loop37.i.us: ; preds = %for_loop37.i.lr.ph.us, %safe_if_after_true.i.us - %xb.0.i158.us = phi i32 [ %x0, %for_loop37.i.lr.ph.us ], [ %add_xb_load243_calltmp241.i.us, %safe_if_after_true.i.us ] - %tid.i.i.i.us = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2 - %tid.i.i.i.i.us = tail call i32 @llvm.nvvm.read.ptx.sreg.warpsize() #2 - %sub_calltmp3_.i.i.us = add i32 %tid.i.i.i.i.us, -1 - %bitop.i.i.us = and i32 %sub_calltmp3_.i.i.us, %tid.i.i.i.us - %add_xb_load42_calltmp.i.us = add i32 %bitop.i.i.us, %xb.0.i158.us - %add_xb_load42_calltmp_broadcast_init.i.us = insertelement <1 x i32> undef, i32 %add_xb_load42_calltmp.i.us, i32 0 - %less_x_load198_x1_load199_broadcast.i.us = icmp slt <1 x i32> %add_xb_load42_calltmp_broadcast_init.i.us, %x1_load199_broadcast_init.i - %v.i.i.us = extractelement <1 x i1> %less_x_load198_x1_load199_broadcast.i.us, i32 0 - br i1 %v.i.i.us, label %pl_dolane.i.us, label %safe_if_after_true.i.us - -pl_dolane.i.us: ; preds = %for_loop37.i.us - %.lhs.lhs.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.i.us, %add_xb_load42_calltmp.i.us - %.lhs.us = shl i32 %.lhs.lhs.us, 3 - %1 = add i32 %.lhs.us, -8 - %iptr__id.i.rhs.us = sext i32 %1 to i64 - %iptr__id.i.us = add i64 %iptr__id.i.rhs.us, %Ain_load65_ptr2int.i - %ptr__id.i.us = inttoptr i64 %iptr__id.i.us to double* - %val__id.i.us = load double* %ptr__id.i.us, align 8 - %2 = add i32 %.lhs.us, 8 - %iptr__id.i130.rhs.us = sext i32 %2 to i64 - %iptr__id.i130.us = add i64 %iptr__id.i130.rhs.us, %Ain_load65_ptr2int.i - %ptr__id.i131.us = inttoptr i64 %iptr__id.i130.us to double* - %val__id.i132.us = load double* %ptr__id.i131.us, align 8 - %3 = add i32 %.lhs.us, -16 - %iptr__id.i125.rhs.us = sext i32 %3 to i64 - %iptr__id.i125.us = add i64 %iptr__id.i125.rhs.us, %Ain_load65_ptr2int.i - %ptr__id.i126.us = inttoptr i64 %iptr__id.i125.us to double* - %val__id.i127.us = load double* %ptr__id.i126.us, align 8 - %4 = add i32 %.lhs.us, 16 - %iptr__id.i120.rhs.us = sext i32 %4 to i64 - %iptr__id.i120.us = add i64 %iptr__id.i120.rhs.us, %Ain_load65_ptr2int.i - %ptr__id.i121.us = inttoptr i64 %iptr__id.i120.us to double* - %val__id.i122.us = load double* %ptr__id.i121.us, align 8 - %.lhs138.us = add i32 %.lhs138.lhs.us, %add_xb_load42_calltmp.i.us - %5 = shl i32 %.lhs138.us, 3 - %iptr__id.i115.rhs.us = sext i32 %5 to i64 - %iptr__id.i115.us = add i64 %iptr__id.i115.rhs.us, %Ain_load65_ptr2int.i - %ptr__id.i116.us = inttoptr i64 %iptr__id.i115.us to double* - %val__id.i117.us = load double* %ptr__id.i116.us, align 8 - %6 = add i32 %.lhs.us, -24 - %iptr__id.i110.rhs.us = sext i32 %6 to i64 - %iptr__id.i110.us = add i64 %iptr__id.i110.rhs.us, %Ain_load65_ptr2int.i - %ptr__id.i111.us = inttoptr i64 %iptr__id.i110.us to double* - %val__id.i112.us = load double* %ptr__id.i111.us, align 8 - %7 = add i32 %.lhs.us, 24 - %iptr__id.i105.rhs.us = sext i32 %7 to i64 - %iptr__id.i105.us = add i64 %iptr__id.i105.rhs.us, %Ain_load65_ptr2int.i - %ptr__id.i106.us = inttoptr i64 %iptr__id.i105.us to double* - %val__id.i107.us = load double* %ptr__id.i106.us, align 8 - %.lhs141.us = add i32 %.lhs141.lhs.us, %add_xb_load42_calltmp.i.us - %8 = shl i32 %.lhs141.us, 3 - %iptr__id.i100.rhs.us = sext i32 %8 to i64 - %iptr__id.i100.us = add i64 %iptr__id.i100.rhs.us, %Ain_load65_ptr2int.i - %ptr__id.i101.us = inttoptr i64 %iptr__id.i100.us to double* - %val__id.i102.us = load double* %ptr__id.i101.us, align 8 - %.lhs142.us = add i32 %.lhs142.lhs.us, %add_xb_load42_calltmp.i.us - %9 = shl i32 %.lhs142.us, 3 - %iptr__id.i95.rhs.us = sext i32 %9 to i64 - %iptr__id.i95.us = add i64 %iptr__id.i95.rhs.us, %Ain_load65_ptr2int.i - %ptr__id.i96.us = inttoptr i64 %iptr__id.i95.us to double* - %val__id.i97.us = load double* %ptr__id.i96.us, align 8 - %.lhs143.us = add i32 %.lhs143.lhs.us, %add_xb_load42_calltmp.i.us - %10 = shl i32 %.lhs143.us, 3 - %iptr__id.i90.rhs.us = sext i32 %10 to i64 - %iptr__id.i90.us = add i64 %iptr__id.i90.rhs.us, %Ain_load65_ptr2int.i - %ptr__id.i91.us = inttoptr i64 %iptr__id.i90.us to double* - %val__id.i92.us = load double* %ptr__id.i91.us, align 8 - %.lhs144.us = add i32 %.lhs144.lhs.us, %add_xb_load42_calltmp.i.us - %11 = shl i32 %.lhs144.us, 3 - %iptr__id.i85.rhs.us = sext i32 %11 to i64 - %iptr__id.i85.us = add i64 %iptr__id.i85.rhs.us, %Ain_load65_ptr2int.i - %ptr__id.i86.us = inttoptr i64 %iptr__id.i85.us to double* - %val__id.i87.us = load double* %ptr__id.i86.us, align 8 - %.lhs145.us = add i32 %.lhs145.lhs.us, %add_xb_load42_calltmp.i.us - %12 = shl i32 %.lhs145.us, 3 - %iptr__id.i80.rhs.us = sext i32 %12 to i64 - %iptr__id.i80.us = add i64 %iptr__id.i80.rhs.us, %Ain_load65_ptr2int.i - %ptr__id.i81.us = inttoptr i64 %iptr__id.i80.us to double* - %val__id.i82.us = load double* %ptr__id.i81.us, align 8 - %.lhs146.us = add i32 %.lhs146.lhs.us, %add_xb_load42_calltmp.i.us - %13 = shl i32 %.lhs146.us, 3 - %iptr__id.i75.rhs.us = sext i32 %13 to i64 - %iptr__id.i75.us = add i64 %iptr__id.i75.rhs.us, %Ain_load65_ptr2int.i - %ptr__id.i76.us = inttoptr i64 %iptr__id.i75.us to double* - %val__id.i77.us = load double* %ptr__id.i76.us, align 8 - %.lhs147.us = add i32 %.lhs147.lhs.us, %add_xb_load42_calltmp.i.us - %14 = shl i32 %.lhs147.us, 3 - %iptr__id.i70.rhs.us = sext i32 %14 to i64 - %iptr__id.i70.us = add i64 %iptr__id.i70.rhs.us, %Ain_load65_ptr2int.i - %ptr__id.i71.us = inttoptr i64 %iptr__id.i70.us to double* - %val__id.i72.us = load double* %ptr__id.i71.us, align 8 - %.lhs148.us = add i32 %.lhs148.lhs.us, %add_xb_load42_calltmp.i.us - %15 = shl i32 %.lhs148.us, 3 - %iptr__id.i65.rhs.us = sext i32 %15 to i64 - %iptr__id.i65.us = add i64 %iptr__id.i65.rhs.us, %Ain_load65_ptr2int.i - %ptr__id.i66.us = inttoptr i64 %iptr__id.i65.us to double* - %val__id.i67.us = load double* %ptr__id.i66.us, align 8 - %.lhs149.us = add i32 %.lhs149.lhs.us, %add_xb_load42_calltmp.i.us - %16 = shl i32 %.lhs149.us, 3 - %iptr__id.i60.rhs.us = sext i32 %16 to i64 - %iptr__id.i60.us = add i64 %iptr__id.i60.rhs.us, %Ain_load65_ptr2int.i - %ptr__id.i61.us = inttoptr i64 %iptr__id.i60.us to double* - %val__id.i62.us = load double* %ptr__id.i61.us, align 8 - %.lhs150.us = add i32 %.lhs150.lhs.us, %add_xb_load42_calltmp.i.us - %17 = shl i32 %.lhs150.us, 3 - %iptr__id.i55.rhs.us = sext i32 %17 to i64 - %iptr__id.i55.us = add i64 %iptr__id.i55.rhs.us, %Ain_load65_ptr2int.i - %ptr__id.i56.us = inttoptr i64 %iptr__id.i55.us to double* - %val__id.i57.us = load double* %ptr__id.i56.us, align 8 - %.lhs151.us = add i32 %add_xb_load42_calltmp.i.us, %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.i.us - %18 = shl i32 %.lhs151.us, 3 - %iptr__id.i50.rhs.us = sext i32 %18 to i64 - %iptr__id.i50.us = add i64 %iptr__id.i50.rhs.us, %Ain_load65_ptr2int.i - %ptr__id.i51.us = inttoptr i64 %iptr__id.i50.us to double* - %val__id.i52.us = load double* %ptr__id.i51.us, align 8 - %.lhs152.us = add i32 %.lhs152.lhs.us, %add_xb_load42_calltmp.i.us - %19 = shl i32 %.lhs152.us, 3 - %iptr__id.i45.rhs.us = sext i32 %19 to i64 - %iptr__id.i45.us = add i64 %iptr__id.i45.rhs.us, %Ain_load65_ptr2int.i - %ptr__id.i46.us = inttoptr i64 %iptr__id.i45.us to double* - %val__id.i47.us = load double* %ptr__id.i46.us, align 8 - %val__id.i41.us = load double* %ptr__id.i51.us, align 8 - %iptr__id.i32.us = add i64 %iptr__id.i50.rhs.us, %Aout_load_ptr2int.i - %ptr__id.i33.us = inttoptr i64 %iptr__id.i32.us to double* - %val__id.i34.us = load double* %ptr__id.i33.us, align 8 - %iptr__id.i27.rhs.us = sext i32 %.lhs.us to i64 - %iptr__id.i27.us = add i64 %iptr__id.i27.rhs.us, %vsq_load_ptr2int.i - %ptr__id.i28.us = inttoptr i64 %iptr__id.i27.us to double* - %val__id.i29.us = load double* %ptr__id.i28.us, align 8 - %iptr__id.i23.us = add i64 %iptr__id.i50.rhs.us, %Aout_load_ptr2int.i - %ptr__id.i24.us = inttoptr i64 %iptr__id.i23.us to double* - %val__id.i25.lhs.us.lhs = fmul double %val__id.i41.us, 2.000000e+00 - %val__id.i25.lhs.us = fsub double %val__id.i25.lhs.us.lhs, %val__id.i34.us - %val__id.i25.rhs.rhs.lhs.lhs.rhs.lhs.lhs.lhs.lhs.us = fadd double %val__id.i127.us, %val__id.i122.us - %val__id.i25.rhs.rhs.lhs.lhs.rhs.lhs.lhs.lhs.us = fadd double %val__id.i25.rhs.rhs.lhs.lhs.rhs.lhs.lhs.lhs.lhs.us, %val__id.i102.us - %val__id.i25.rhs.rhs.lhs.lhs.rhs.lhs.lhs.us = fadd double %val__id.i25.rhs.rhs.lhs.lhs.rhs.lhs.lhs.lhs.us, %val__id.i87.us - %val__id.i25.rhs.rhs.lhs.lhs.rhs.lhs.us = fadd double %val__id.i25.rhs.rhs.lhs.lhs.rhs.lhs.lhs.us, %val__id.i72.us - %val__id.i25.rhs.rhs.lhs.lhs.rhs.us = fadd double %val__id.i25.rhs.rhs.lhs.lhs.rhs.lhs.us, %val__id.i57.us - %val__id.i25.rhs.rhs.lhs.lhs.us = fmul double %coef_load19_offset_load.i, %val__id.i25.rhs.rhs.lhs.lhs.rhs.us - %val__id.i25.rhs.rhs.lhs.rhs.lhs.rhs.lhs.lhs.lhs.lhs.us = fadd double %val__id.i.us, %val__id.i132.us - %val__id.i25.rhs.rhs.lhs.rhs.lhs.rhs.lhs.lhs.lhs.us = fadd double %val__id.i25.rhs.rhs.lhs.rhs.lhs.rhs.lhs.lhs.lhs.lhs.us, %val__id.i117.us - %val__id.i25.rhs.rhs.lhs.rhs.lhs.rhs.lhs.lhs.us = fadd double %val__id.i25.rhs.rhs.lhs.rhs.lhs.rhs.lhs.lhs.lhs.us, %val__id.i97.us - %val__id.i25.rhs.rhs.lhs.rhs.lhs.rhs.lhs.us = fadd double %val__id.i25.rhs.rhs.lhs.rhs.lhs.rhs.lhs.lhs.us, %val__id.i82.us - %val__id.i25.rhs.rhs.lhs.rhs.lhs.rhs.us = fadd double %val__id.i25.rhs.rhs.lhs.rhs.lhs.rhs.lhs.us, %val__id.i67.us - %val__id.i25.rhs.rhs.lhs.rhs.lhs.us = fmul double %coef_load16_offset_load.i, %val__id.i25.rhs.rhs.lhs.rhs.lhs.rhs.us - %val__id.i25.rhs.rhs.lhs.rhs.rhs.us = fmul double %coef_load_offset_load.i, %val__id.i52.us - %val__id.i25.rhs.rhs.lhs.rhs.us = fadd double %val__id.i25.rhs.rhs.lhs.rhs.lhs.us, %val__id.i25.rhs.rhs.lhs.rhs.rhs.us - %val__id.i25.rhs.rhs.lhs.us = fadd double %val__id.i25.rhs.rhs.lhs.lhs.us, %val__id.i25.rhs.rhs.lhs.rhs.us - %val__id.i25.rhs.rhs.rhs.rhs.lhs.lhs.lhs.lhs.us = fadd double %val__id.i112.us, %val__id.i107.us - %val__id.i25.rhs.rhs.rhs.rhs.lhs.lhs.lhs.us = fadd double %val__id.i25.rhs.rhs.rhs.rhs.lhs.lhs.lhs.lhs.us, %val__id.i92.us - %val__id.i25.rhs.rhs.rhs.rhs.lhs.lhs.us = fadd double %val__id.i25.rhs.rhs.rhs.rhs.lhs.lhs.lhs.us, %val__id.i77.us - %val__id.i25.rhs.rhs.rhs.rhs.lhs.us = fadd double %val__id.i25.rhs.rhs.rhs.rhs.lhs.lhs.us, %val__id.i62.us - %val__id.i25.rhs.rhs.rhs.rhs.us = fadd double %val__id.i25.rhs.rhs.rhs.rhs.lhs.us, %val__id.i47.us - %val__id.i25.rhs.rhs.rhs.us = fmul double %coef_load22_offset_load.i, %val__id.i25.rhs.rhs.rhs.rhs.us - %val__id.i25.rhs.rhs.us = fadd double %val__id.i25.rhs.rhs.lhs.us, %val__id.i25.rhs.rhs.rhs.us - %val__id.i25.rhs.us = fmul double %val__id.i25.rhs.rhs.us, %val__id.i29.us - %val__id.i25.us = fadd double %val__id.i25.lhs.us, %val__id.i25.rhs.us - store double %val__id.i25.us, double* %ptr__id.i24.us, align 8 - br label %safe_if_after_true.i.us - -safe_if_after_true.i.us: ; preds = %pl_dolane.i.us, %for_loop37.i.us - %tid.i.i1.i.us = tail call i32 @llvm.nvvm.read.ptx.sreg.warpsize() #2 - %add_xb_load243_calltmp241.i.us = add i32 %tid.i.i1.i.us, %xb.0.i158.us - %less_xb_load_x1_load.i.us = icmp slt i32 %add_xb_load243_calltmp241.i.us, %x1 - br i1 %less_xb_load_x1_load.i.us, label %for_loop37.i.us, label %for_exit38.i.us - -for_loop37.i.lr.ph.us: ; preds = %for_exit38.i.us, %for_test35.i.preheader.lr.ph - %y.0.i160.us = phi i32 [ %y_load244_plus1.i.us, %for_exit38.i.us ], [ %y0, %for_test35.i.preheader.lr.ph ] - %mul_y_load46_Nx_load47.i.us = mul i32 %y.0.i160.us, %Nx - %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.i.us = add i32 %mul_y_load46_Nx_load47.i.us, %mul_z_load45_Nxy_load.i - %.lhs138.lhs.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.i.us, %Nx - %.lhs141.lhs.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.i.us, %mul__Nx_load119.i - %.lhs142.lhs.us = sub i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.i.us, %Nx - %.lhs143.lhs.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.i.us, %mul__Nx_load167.i - %.lhs144.lhs.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.i.us, %mul__Nx_load127.i - %.lhs145.lhs.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.i.us, %mul_Nx_load_Ny_load.i - %.lhs146.lhs.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.i.us, %mul__Nx_load175.i - %.lhs147.lhs.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.i.us, %mul__Nxy_load136.i - %.lhs148.lhs.us = sub i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.i.us, %mul_Nx_load_Ny_load.i - %.lhs149.lhs.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.i.us, %mul__Nxy_load184.i - %.lhs150.lhs.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.i.us, %mul__Nxy_load144.i - %.lhs152.lhs.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.i.us, %mul__Nxy_load192.i - br label %for_loop37.i.us - -for_exit31.i: ; preds = %for_exit38.i.us, %for_test35.i.preheader.lr.ph, %for_test28.i.preheader - %z_load245_plus1.i = add i32 %z.0.i162, 1 - %exitcond163 = icmp eq i32 %z.0.i162, %0 - br i1 %exitcond163, label %stencil_step___uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_.exit, label %for_test28.i.preheader - -stencil_step___uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_.exit: ; preds = %for_exit31.i, %allocas - ret void -} - -attributes #0 = { nounwind readnone } -attributes #1 = { nounwind "target-features"="+sm_35" } -attributes #2 = { nounwind } - -!nvvm.annotations = !{!0} - -!0 = metadata !{void (i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*)* @stencil_step_task, metadata !"kernel", i32 1} -!1 = metadata !{ } -!2 = metadata !{ metadata !"output", metadata !0 } -!3 = metadata !{ metadata !"input1", metadata !0 } -!4 = metadata !{ metadata !"input2", metadata !0 } diff --git a/examples/stencil/stencil_ispc.h b/examples/stencil/stencil_ispc.h deleted file mode 100644 index ebf29582..00000000 --- a/examples/stencil/stencil_ispc.h +++ /dev/null @@ -1,35 +0,0 @@ -// -// stencil_ispc.h -// (Header automatically generated by the ispc compiler.) -// DO NOT EDIT THIS FILE. -// - -#ifndef ISPC_STENCIL_ISPC_H -#define ISPC_STENCIL_ISPC_H - -#include - - - -#ifdef __cplusplus -namespace ispc { /* namespace */ -#endif // __cplusplus - -/////////////////////////////////////////////////////////////////////////// -// Functions exported from ispc code -/////////////////////////////////////////////////////////////////////////// -#if defined(__cplusplus) && !defined(__ISPC_NO_EXTERN_C) -extern "C" { -#endif // __cplusplus - extern void loop_stencil_ispc(int32_t t0, int32_t t1, int32_t x0, int32_t x1, int32_t y0, int32_t y1, int32_t z0, int32_t z1, int32_t Nx, int32_t Ny, int32_t Nz, const double * coef, const double * vsq, double * Aeven, double * Aodd); - extern void loop_stencil_ispc_tasks(int32_t t0, int32_t t1, int32_t x0, int32_t x1, int32_t y0, int32_t y1, int32_t z0, int32_t z1, int32_t Nx, int32_t Ny, int32_t Nz, const double * coef, const double * vsq, double * Aeven, double * Aodd); -#if defined(__cplusplus) && !defined(__ISPC_NO_EXTERN_C) -} /* end extern C */ -#endif // __cplusplus - - -#ifdef __cplusplus -} /* namespace */ -#endif // __cplusplus - -#endif // ISPC_STENCIL_ISPC_H diff --git a/examples/stencil/stencil_nvptx64.bc b/examples/stencil/stencil_nvptx64.bc deleted file mode 100644 index b77be1e3..00000000 Binary files a/examples/stencil/stencil_nvptx64.bc and /dev/null differ diff --git a/examples_cuda/common.mk b/examples_cuda/common.mk index c11f22dc..b5100169 100644 --- a/examples_cuda/common.mk +++ b/examples_cuda/common.mk @@ -10,7 +10,8 @@ CCFLAGS+=-Iobjs/ -O2 LIBS=-lm $(TASK_LIB) -lstdc++ ISPC=ispc -ISPC_FLAGS+=-O2 --opt=fast-math --math-lib=default +ISPC_FLAGS+=-O2 +ISPC_FLAGS+=--opt=fast-math --math-lib=default ISPC_HEADER=objs/$(ISPC_SRC:.ispc=_ispc.h) ARCH:=$(shell uname -m | sed -e s/x86_64/x86/ -e s/i686/x86/ -e s/arm.*/arm/ -e s/sa110/arm/) diff --git a/examples_cuda/stencil/.stencil.ispc.swn b/examples_cuda/stencil/.stencil.ispc.swn new file mode 100644 index 00000000..ad3f6c78 Binary files /dev/null and b/examples_cuda/stencil/.stencil.ispc.swn differ diff --git a/examples_cuda/stencil/__kernels.ptx b/examples_cuda/stencil/__kernels.ptx deleted file mode 100644 index b0339cbf..00000000 --- a/examples_cuda/stencil/__kernels.ptx +++ /dev/null @@ -1,1246 +0,0 @@ -// -// Generated by NVIDIA NVVM Compiler -// Compiler built on Thu Jul 18 02:37:37 2013 (1374107857) -// Cuda compilation tools, release 5.5, V5.5.0 -// - -.version 3.2 -.target sm_35 -.address_size 64 - - -.extern .func (.param .b32 func_retval0) cudaLaunchDevice -( - .param .b64 cudaLaunchDevice_param_0, - .param .b64 cudaLaunchDevice_param_1, - .param .align 4 .b8 cudaLaunchDevice_param_2[12], - .param .align 4 .b8 cudaLaunchDevice_param_3[12], - .param .b32 cudaLaunchDevice_param_4, - .param .b64 cudaLaunchDevice_param_5 -); - - -.extern .func (.param .b64 func_retval0) cudaGetParameterBuffer -( - .param .b64 cudaGetParameterBuffer_param_0, - .param .b64 cudaGetParameterBuffer_param_1 -) -; -.extern .func (.param .b32 func_retval0) cudaDeviceSynchronize -( - -) -; -.global .align 1 .b8 constDeltaForeach1[32]; -.global .align 1 .b8 constDeltaForeach4[32] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}; - -.visible .func (.param .b32 func_retval0) __shfl_i32( - .param .b32 __shfl_i32_param_0, - .param .b32 __shfl_i32_param_1 -) -{ - .reg .s32 %r<4>; - - - ld.param.u32 %r2, [__shfl_i32_param_0]; - ld.param.u32 %r3, [__shfl_i32_param_1]; - // inline asm - shfl.idx.b32 %r1, %r2, %r3, 0x1f; - // inline asm - st.param.b32 [func_retval0+0], %r1; - ret; -} - -.visible .func (.param .b32 func_retval0) __shfl_xor_float( - .param .b32 __shfl_xor_float_param_0, - .param .b32 __shfl_xor_float_param_1 -) -{ - .reg .s32 %r<2>; - .reg .f32 %f<3>; - - - ld.param.f32 %f2, [__shfl_xor_float_param_0]; - ld.param.u32 %r1, [__shfl_xor_float_param_1]; - // inline asm - shfl.bfly.b32 %f1, %f2, %r1, 0x1f; - // inline asm - st.param.f32 [func_retval0+0], %f1; - ret; -} - -.visible .func (.param .b32 func_retval0) __shfl_xor_i32( - .param .b32 __shfl_xor_i32_param_0, - .param .b32 __shfl_xor_i32_param_1 -) -{ - .reg .s32 %r<4>; - - - ld.param.u32 %r2, [__shfl_xor_i32_param_0]; - ld.param.u32 %r3, [__shfl_xor_i32_param_1]; - // inline asm - shfl.bfly.b32 %r1, %r2, %r3, 0x1f; - // inline asm - st.param.b32 [func_retval0+0], %r1; - ret; -} - -.visible .func (.param .b32 func_retval0) __fminf( - .param .b32 __fminf_param_0, - .param .b32 __fminf_param_1 -) -{ - .reg .f32 %f<4>; - - - ld.param.f32 %f2, [__fminf_param_0]; - ld.param.f32 %f3, [__fminf_param_1]; - // inline asm - min.f32 %f1, %f2, %f3; - // inline asm - st.param.f32 [func_retval0+0], %f1; - ret; -} - -.visible .func (.param .b32 func_retval0) __fmaxf( - .param .b32 __fmaxf_param_0, - .param .b32 __fmaxf_param_1 -) -{ - .reg .f32 %f<4>; - - - ld.param.f32 %f2, [__fmaxf_param_0]; - ld.param.f32 %f3, [__fmaxf_param_1]; - // inline asm - max.f32 %f1, %f2, %f3; - // inline asm - st.param.f32 [func_retval0+0], %f1; - ret; -} - -.visible .func (.param .b32 func_retval0) __ballot( - .param .b32 __ballot_param_0 -) -{ - .reg .s32 %r<3>; - - - ld.param.u8 %r2, [__ballot_param_0]; - // inline asm - { .reg .pred %p1; - setp.ne.u32 %p1, %r2, 0; - vote.ballot.b32 %r1, %p1; - } - // inline asm - st.param.b32 [func_retval0+0], %r1; - ret; -} - -.visible .func (.param .b32 func_retval0) __lanemask_lt( - -) -{ - .reg .s32 %r<2>; - - - // inline asm - mov.u32 %r1, %lanemask_lt; - // inline asm - st.param.b32 [func_retval0+0], %r1; - ret; -} - -.visible .func (.param .b64 func_retval0) ISPCAlloc( - .param .b64 ISPCAlloc_param_0, - .param .b64 ISPCAlloc_param_1, - .param .b32 ISPCAlloc_param_2 -) -{ - .reg .s64 %rd<2>; - - - mov.u64 %rd1, 1; - st.param.b64 [func_retval0+0], %rd1; - ret; -} - -.visible .func (.param .b64 func_retval0) ISPCGetParamBuffer( - .param .b64 ISPCGetParamBuffer_param_0, - .param .b64 ISPCGetParamBuffer_param_1, - .param .b64 ISPCGetParamBuffer_param_2 -) -{ - .reg .pred %p<2>; - .reg .s32 %r<3>; - .reg .s64 %rd<7>; - - - ld.param.u64 %rd3, [ISPCGetParamBuffer_param_1]; - ld.param.u64 %rd4, [ISPCGetParamBuffer_param_2]; - mov.u32 %r1, %tid.x; - and.b32 %r2, %r1, 31; - setp.ne.s32 %p1, %r2, 0; - mov.u64 %rd6, 0; - @%p1 bra BB8_2; - - // Callseq Start 0 - { - .reg .b32 temp_param_reg; - .param .b64 param0; - st.param.b64 [param0+0], %rd3; - .param .b64 param1; - st.param.b64 [param1+0], %rd4; - .param .b64 retval0; - call.uni (retval0), - cudaGetParameterBuffer, - ( - param0, - param1 - ); - ld.param.b64 %rd6, [retval0+0]; - } - // Callseq End 0 - -BB8_2: - st.param.b64 [func_retval0+0], %rd6; - ret; -} - -.visible .func ISPCLaunch( - .param .b64 ISPCLaunch_param_0, - .param .b64 ISPCLaunch_param_1, - .param .b64 ISPCLaunch_param_2, - .param .b32 ISPCLaunch_param_3, - .param .b32 ISPCLaunch_param_4, - .param .b32 ISPCLaunch_param_5 -) -{ - .reg .pred %p<2>; - .reg .s32 %r<16>; - .reg .s64 %rd<6>; - - - ld.param.u64 %rd1, [ISPCLaunch_param_1]; - ld.param.u64 %rd2, [ISPCLaunch_param_2]; - ld.param.u32 %r1, [ISPCLaunch_param_3]; - ld.param.u32 %r2, [ISPCLaunch_param_4]; - ld.param.u32 %r3, [ISPCLaunch_param_5]; - mov.u32 %r4, %tid.x; - and.b32 %r5, %r4, 31; - setp.ne.s32 %p1, %r5, 0; - @%p1 bra BB9_2; - - add.s32 %r14, %r1, -1; - shr.s32 %r15, %r14, 2; - add.s32 %r7, %r15, 1; - mov.u32 %r12, 1; - mov.u32 %r10, 128; - mov.u32 %r13, 0; - mov.u64 %rd5, 0; - // inline asm - { - .param .b64 param0; - st.param.b64 [param0+0], %rd1; - .param .b64 param1; - st.param.b64 [param1+0], %rd2; - .param .align 4 .b8 param2[12]; - st.param.b32 [param2+0], %r7; - st.param.b32 [param2+4], %r2; - st.param.b32 [param2+8], %r3; - .param .align 4 .b8 param3[12]; - st.param.b32 [param3+0], %r10; - st.param.b32 [param3+4], %r12; - st.param.b32 [param3+8], %r12; - .param .b32 param4; - st.param.b32 [param4+0], %r13; - .param .b64 param5; - st.param.b64 [param5+0], %rd5; - - .param .b32 retval0; - call.uni (retval0), - cudaLaunchDevice, - ( - param0, - param1, - param2, - param3, - param4, - param5 - ); - ld.param.b32 %r6, [retval0+0]; - } - - // inline asm - -BB9_2: - ret; -} - -.visible .func ISPCSync( - .param .b64 ISPCSync_param_0 -) -{ - .reg .s32 %r<2>; - - - // Callseq Start 1 - { - .reg .b32 temp_param_reg; - .param .b32 retval0; - call.uni (retval0), - cudaDeviceSynchronize, - ( - ); - ld.param.b32 %r1, [retval0+0]; - } - // Callseq End 1 - ret; -} - -.visible .func (.param .b64 func_retval0) __warpBinExclusiveScan( - .param .b32 __warpBinExclusiveScan_param_0 -) -{ - .reg .s32 %r<8>; - .reg .s64 %rd<5>; - - - ld.param.u8 %r2, [__warpBinExclusiveScan_param_0]; - // inline asm - { .reg .pred %p1; - setp.ne.u32 %p1, %r2, 0; - vote.ballot.b32 %r1, %p1; - } - // inline asm - // inline asm - popc.b32 %r3, %r1; - // inline asm - // inline asm - mov.u32 %r5, %lanemask_lt; - // inline asm - and.b32 %r7, %r5, %r1; - // inline asm - popc.b32 %r6, %r7; - // inline asm - cvt.u64.u32 %rd1, %r6; - shl.b64 %rd2, %rd1, 32; - cvt.u64.u32 %rd3, %r3; - or.b64 %rd4, %rd2, %rd3; - st.param.b64 [func_retval0+0], %rd4; - ret; -} - -.entry stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_( - .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_0, - .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_1, - .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_2, - .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_3, - .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_4, - .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_5, - .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_6, - .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_7, - .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_8, - .param .u64 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_9, - .param .u64 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_10, - .param .u64 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_11, - .param .u64 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_12 -) -{ - .reg .pred %p<14>; - .reg .s32 %r<178>; - .reg .s64 %rd<96>; - .reg .f64 %fd<95>; - - - ld.param.u32 %r42, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_0]; - ld.param.u32 %r43, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_1]; - ld.param.u32 %r44, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_2]; - ld.param.u32 %r45, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_3]; - ld.param.u32 %r46, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_4]; - ld.param.u32 %r47, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_5]; - ld.param.u32 %r48, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_6]; - ld.param.u32 %r49, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_7]; - ld.param.u64 %rd2, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_9]; - ld.param.u64 %rd3, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_10]; - ld.param.u64 %rd4, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_11]; - ld.param.u64 %rd5, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_12]; - mov.u32 %r1, %ctaid.x; - shl.b32 %r50, %r1, 2; - mov.u32 %r2, %tid.x; - shr.s32 %r51, %r2, 5; - add.s32 %r52, %r51, %r50; - mov.u32 %r53, %nctaid.x; - shl.b32 %r54, %r53, 2; - setp.ge.s32 %p1, %r52, %r54; - mov.u32 %r55, %nctaid.y; - mov.u32 %r3, %ctaid.y; - setp.ge.s32 %p2, %r3, %r55; - or.pred %p3, %p1, %p2; - mov.u32 %r56, %nctaid.z; - mov.u32 %r4, %ctaid.z; - setp.ge.s32 %p4, %r4, %r56; - or.pred %p5, %p3, %p4; - @%p5 bra BB12_13; - - shl.b32 %r57, %r1, 7; - add.s32 %r58, %r2, %r57; - and.b32 %r59, %r58, -32; - add.s32 %r60, %r59, %r42; - add.s32 %r61, %r60, 32; - min.s32 %r5, %r43, %r61; - shl.b32 %r6, %r3, 3; - add.s32 %r62, %r6, %r44; - add.s32 %r7, %r62, 8; - shl.b32 %r8, %r4, 3; - add.s32 %r172, %r8, %r46; - add.s32 %r63, %r172, 8; - min.s32 %r64, %r47, %r63; - mul.lo.s32 %r10, %r49, %r48; - sub.s32 %r65, %r5, %r60; - shr.s32 %r66, %r65, 31; - shr.u32 %r67, %r66, 27; - add.s32 %r68, %r65, %r67; - and.b32 %r69, %r68, -32; - sub.s32 %r70, %r65, %r69; - sub.s32 %r11, %r5, %r70; - and.b32 %r71, %r2, 31; - cvt.u64.u32 %rd6, %r71; - mov.u64 %rd7, constDeltaForeach1; - add.s64 %rd1, %rd7, %rd6; - setp.ge.s32 %p6, %r172, %r64; - @%p6 bra BB12_13; - - min.s32 %r12, %r45, %r7; - shl.b32 %r15, %r10, 1; - neg.s32 %r16, %r15; - mul.lo.s32 %r17, %r10, 3; - mul.lo.s32 %r18, %r10, -3; - mov.u32 %r72, -9; - sub.s32 %r73, %r72, %r44; - sub.s32 %r74, %r73, %r6; - not.b32 %r75, %r45; - max.s32 %r76, %r74, %r75; - not.b32 %r19, %r76; - sub.s32 %r77, %r72, %r46; - sub.s32 %r78, %r77, %r8; - not.b32 %r79, %r47; - max.s32 %r80, %r78, %r79; - not.b32 %r20, %r80; - ld.global.u8 %r13, [%rd1]; - mov.u32 %r171, %r172; - -BB12_3: - mov.u32 %r21, %r171; - add.s32 %r23, %r21, %r13; - setp.ge.s32 %p7, %r62, %r12; - @%p7 bra BB12_12; - - mul.lo.s32 %r24, %r23, %r10; - mov.u32 %r174, %r62; - mov.u32 %r173, %r62; - -BB12_5: - mov.u32 %r27, %r173; - add.s32 %r30, %r27, %r13; - setp.ge.s32 %p8, %r60, %r11; - mov.u32 %r176, %r60; - @%p8 bra BB12_8; - - mov.u64 %rd9, constDeltaForeach4; - add.s64 %rd10, %rd9, %rd6; - ld.global.u8 %r31, [%rd10]; - mad.lo.s32 %r32, %r30, %r48, %r24; - add.s32 %r177, %r59, %r42; - -BB12_7: - cvta.to.global.u64 %rd11, %rd2; - add.s32 %r98, %r32, %r177; - add.s32 %r99, %r98, %r31; - shl.b32 %r100, %r99, 3; - cvt.s64.s32 %rd12, %r100; - add.s64 %rd13, %rd12, %rd4; - add.s32 %r101, %r100, 8; - cvt.s64.s32 %rd14, %r101; - add.s64 %rd15, %rd14, %rd4; - add.s32 %r102, %r100, -8; - cvt.s64.s32 %rd16, %r102; - add.s64 %rd17, %rd16, %rd4; - add.s32 %r103, %r99, %r48; - shl.b32 %r104, %r103, 3; - cvt.s64.s32 %rd18, %r104; - add.s64 %rd19, %rd18, %rd4; - sub.s32 %r105, %r99, %r48; - shl.b32 %r106, %r105, 3; - cvt.s64.s32 %rd20, %r106; - add.s64 %rd21, %rd20, %rd4; - add.s32 %r108, %r99, %r10; - shl.b32 %r109, %r108, 3; - cvt.s64.s32 %rd22, %r109; - add.s64 %rd23, %rd22, %rd4; - sub.s32 %r110, %r99, %r10; - shl.b32 %r111, %r110, 3; - cvt.s64.s32 %rd24, %r111; - add.s64 %rd25, %rd24, %rd4; - add.s32 %r112, %r100, 16; - cvt.s64.s32 %rd26, %r112; - add.s64 %rd27, %rd26, %rd4; - add.s32 %r113, %r100, -16; - cvt.s64.s32 %rd28, %r113; - add.s64 %rd29, %rd28, %rd4; - shl.b32 %r114, %r48, 1; - add.s32 %r115, %r99, %r114; - shl.b32 %r116, %r115, 3; - cvt.s64.s32 %rd30, %r116; - add.s64 %rd31, %rd30, %rd4; - mad.lo.s32 %r117, %r48, -2, %r99; - shl.b32 %r118, %r117, 3; - cvt.s64.s32 %rd32, %r118; - add.s64 %rd33, %rd32, %rd4; - add.s32 %r119, %r99, %r15; - shl.b32 %r120, %r119, 3; - cvt.s64.s32 %rd34, %r120; - add.s64 %rd35, %rd34, %rd4; - add.s32 %r121, %r99, %r16; - shl.b32 %r122, %r121, 3; - cvt.s64.s32 %rd36, %r122; - add.s64 %rd37, %rd36, %rd4; - add.s32 %r123, %r100, 24; - cvt.s64.s32 %rd38, %r123; - add.s64 %rd39, %rd38, %rd4; - add.s32 %r124, %r100, -24; - cvt.s64.s32 %rd40, %r124; - add.s64 %rd41, %rd40, %rd4; - mad.lo.s32 %r125, %r48, 3, %r99; - shl.b32 %r126, %r125, 3; - cvt.s64.s32 %rd42, %r126; - add.s64 %rd43, %rd42, %rd4; - mad.lo.s32 %r127, %r48, -3, %r99; - shl.b32 %r128, %r127, 3; - cvt.s64.s32 %rd44, %r128; - add.s64 %rd45, %rd44, %rd4; - add.s32 %r129, %r99, %r17; - shl.b32 %r130, %r129, 3; - cvt.s64.s32 %rd46, %r130; - add.s64 %rd47, %rd46, %rd4; - add.s32 %r131, %r99, %r18; - shl.b32 %r132, %r131, 3; - cvt.s64.s32 %rd48, %r132; - add.s64 %rd49, %rd48, %rd4; - add.s64 %rd50, %rd12, %rd5; - add.s64 %rd51, %rd12, %rd3; - ld.f64 %fd1, [%rd13]; - add.f64 %fd2, %fd1, %fd1; - ld.f64 %fd3, [%rd50]; - sub.f64 %fd4, %fd2, %fd3; - ld.global.f64 %fd5, [%rd11]; - ld.f64 %fd6, [%rd17]; - ld.f64 %fd7, [%rd15]; - add.f64 %fd8, %fd7, %fd6; - ld.f64 %fd9, [%rd19]; - add.f64 %fd10, %fd8, %fd9; - ld.f64 %fd11, [%rd21]; - add.f64 %fd12, %fd10, %fd11; - ld.f64 %fd13, [%rd23]; - add.f64 %fd14, %fd12, %fd13; - ld.f64 %fd15, [%rd25]; - add.f64 %fd16, %fd14, %fd15; - ld.global.f64 %fd17, [%rd11+8]; - mul.f64 %fd18, %fd17, %fd16; - fma.rn.f64 %fd19, %fd5, %fd1, %fd18; - ld.f64 %fd20, [%rd29]; - ld.f64 %fd21, [%rd27]; - add.f64 %fd22, %fd21, %fd20; - ld.f64 %fd23, [%rd31]; - add.f64 %fd24, %fd22, %fd23; - ld.f64 %fd25, [%rd33]; - add.f64 %fd26, %fd24, %fd25; - ld.f64 %fd27, [%rd35]; - add.f64 %fd28, %fd26, %fd27; - ld.f64 %fd29, [%rd37]; - add.f64 %fd30, %fd28, %fd29; - ld.global.f64 %fd31, [%rd11+16]; - fma.rn.f64 %fd32, %fd31, %fd30, %fd19; - ld.f64 %fd33, [%rd41]; - ld.f64 %fd34, [%rd39]; - add.f64 %fd35, %fd34, %fd33; - ld.f64 %fd36, [%rd43]; - add.f64 %fd37, %fd35, %fd36; - ld.f64 %fd38, [%rd45]; - add.f64 %fd39, %fd37, %fd38; - ld.f64 %fd40, [%rd47]; - add.f64 %fd41, %fd39, %fd40; - ld.f64 %fd42, [%rd49]; - add.f64 %fd43, %fd41, %fd42; - ld.global.f64 %fd44, [%rd11+24]; - fma.rn.f64 %fd45, %fd44, %fd43, %fd32; - ld.f64 %fd46, [%rd51]; - fma.rn.f64 %fd47, %fd46, %fd45, %fd4; - st.f64 [%rd50], %fd47; - add.s32 %r177, %r177, 32; - setp.lt.s32 %p9, %r177, %r11; - mov.u32 %r175, %r177; - mov.u32 %r176, %r175; - @%p9 bra BB12_7; - -BB12_8: - mov.u32 %r36, %r176; - setp.ge.s32 %p10, %r36, %r5; - @%p10 bra BB12_11; - - mov.u64 %rd53, constDeltaForeach4; - add.s64 %rd54, %rd53, %rd6; - ld.global.u8 %r135, [%rd54]; - add.s32 %r37, %r36, %r135; - setp.ge.s32 %p11, %r37, %r5; - @%p11 bra BB12_11; - - cvta.to.global.u64 %rd55, %rd2; - mad.lo.s32 %r136, %r30, %r48, %r24; - add.s32 %r137, %r136, %r37; - shl.b32 %r138, %r137, 3; - cvt.s64.s32 %rd56, %r138; - add.s64 %rd57, %rd56, %rd4; - add.s32 %r139, %r138, 8; - cvt.s64.s32 %rd58, %r139; - add.s64 %rd59, %rd58, %rd4; - add.s32 %r140, %r138, -8; - cvt.s64.s32 %rd60, %r140; - add.s64 %rd61, %rd60, %rd4; - add.s32 %r141, %r137, %r48; - shl.b32 %r142, %r141, 3; - cvt.s64.s32 %rd62, %r142; - add.s64 %rd63, %rd62, %rd4; - sub.s32 %r143, %r137, %r48; - shl.b32 %r144, %r143, 3; - cvt.s64.s32 %rd64, %r144; - add.s64 %rd65, %rd64, %rd4; - add.s32 %r146, %r137, %r10; - shl.b32 %r147, %r146, 3; - cvt.s64.s32 %rd66, %r147; - add.s64 %rd67, %rd66, %rd4; - sub.s32 %r148, %r137, %r10; - shl.b32 %r149, %r148, 3; - cvt.s64.s32 %rd68, %r149; - add.s64 %rd69, %rd68, %rd4; - add.s32 %r150, %r138, 16; - cvt.s64.s32 %rd70, %r150; - add.s64 %rd71, %rd70, %rd4; - add.s32 %r151, %r138, -16; - cvt.s64.s32 %rd72, %r151; - add.s64 %rd73, %rd72, %rd4; - shl.b32 %r152, %r48, 1; - add.s32 %r153, %r137, %r152; - shl.b32 %r154, %r153, 3; - cvt.s64.s32 %rd74, %r154; - add.s64 %rd75, %rd74, %rd4; - mad.lo.s32 %r155, %r48, -2, %r137; - shl.b32 %r156, %r155, 3; - cvt.s64.s32 %rd76, %r156; - add.s64 %rd77, %rd76, %rd4; - add.s32 %r157, %r137, %r15; - shl.b32 %r158, %r157, 3; - cvt.s64.s32 %rd78, %r158; - add.s64 %rd79, %rd78, %rd4; - add.s32 %r159, %r137, %r16; - shl.b32 %r160, %r159, 3; - cvt.s64.s32 %rd80, %r160; - add.s64 %rd81, %rd80, %rd4; - add.s32 %r161, %r138, 24; - cvt.s64.s32 %rd82, %r161; - add.s64 %rd83, %rd82, %rd4; - add.s32 %r162, %r138, -24; - cvt.s64.s32 %rd84, %r162; - add.s64 %rd85, %rd84, %rd4; - mad.lo.s32 %r163, %r48, 3, %r137; - shl.b32 %r164, %r163, 3; - cvt.s64.s32 %rd86, %r164; - add.s64 %rd87, %rd86, %rd4; - mad.lo.s32 %r165, %r48, -3, %r137; - shl.b32 %r166, %r165, 3; - cvt.s64.s32 %rd88, %r166; - add.s64 %rd89, %rd88, %rd4; - add.s32 %r167, %r137, %r17; - shl.b32 %r168, %r167, 3; - cvt.s64.s32 %rd90, %r168; - add.s64 %rd91, %rd90, %rd4; - add.s32 %r169, %r137, %r18; - shl.b32 %r170, %r169, 3; - cvt.s64.s32 %rd92, %r170; - add.s64 %rd93, %rd92, %rd4; - add.s64 %rd94, %rd56, %rd5; - add.s64 %rd95, %rd56, %rd3; - ld.f64 %fd48, [%rd57]; - add.f64 %fd49, %fd48, %fd48; - ld.f64 %fd50, [%rd94]; - sub.f64 %fd51, %fd49, %fd50; - ld.global.f64 %fd52, [%rd55]; - ld.f64 %fd53, [%rd61]; - ld.f64 %fd54, [%rd59]; - add.f64 %fd55, %fd54, %fd53; - ld.f64 %fd56, [%rd63]; - add.f64 %fd57, %fd55, %fd56; - ld.f64 %fd58, [%rd65]; - add.f64 %fd59, %fd57, %fd58; - ld.f64 %fd60, [%rd67]; - add.f64 %fd61, %fd59, %fd60; - ld.f64 %fd62, [%rd69]; - add.f64 %fd63, %fd61, %fd62; - ld.global.f64 %fd64, [%rd55+8]; - mul.f64 %fd65, %fd64, %fd63; - fma.rn.f64 %fd66, %fd52, %fd48, %fd65; - ld.f64 %fd67, [%rd73]; - ld.f64 %fd68, [%rd71]; - add.f64 %fd69, %fd68, %fd67; - ld.f64 %fd70, [%rd75]; - add.f64 %fd71, %fd69, %fd70; - ld.f64 %fd72, [%rd77]; - add.f64 %fd73, %fd71, %fd72; - ld.f64 %fd74, [%rd79]; - add.f64 %fd75, %fd73, %fd74; - ld.f64 %fd76, [%rd81]; - add.f64 %fd77, %fd75, %fd76; - ld.global.f64 %fd78, [%rd55+16]; - fma.rn.f64 %fd79, %fd78, %fd77, %fd66; - ld.f64 %fd80, [%rd85]; - ld.f64 %fd81, [%rd83]; - add.f64 %fd82, %fd81, %fd80; - ld.f64 %fd83, [%rd87]; - add.f64 %fd84, %fd82, %fd83; - ld.f64 %fd85, [%rd89]; - add.f64 %fd86, %fd84, %fd85; - ld.f64 %fd87, [%rd91]; - add.f64 %fd88, %fd86, %fd87; - ld.f64 %fd89, [%rd93]; - add.f64 %fd90, %fd88, %fd89; - ld.global.f64 %fd91, [%rd55+24]; - fma.rn.f64 %fd92, %fd91, %fd90, %fd79; - ld.f64 %fd93, [%rd95]; - fma.rn.f64 %fd94, %fd92, %fd93, %fd51; - st.f64 [%rd94], %fd94; - -BB12_11: - add.s32 %r39, %r174, 1; - setp.ne.s32 %p12, %r39, %r19; - mov.u32 %r174, %r39; - mov.u32 %r173, %r39; - @%p12 bra BB12_5; - -BB12_12: - add.s32 %r171, %r172, 1; - setp.ne.s32 %p13, %r171, %r20; - mov.u32 %r172, %r171; - @%p13 bra BB12_3; - -BB12_13: - ret; -} - -.visible .func loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E_( - .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_0, - .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_1, - .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_2, - .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_3, - .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_4, - .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_5, - .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_6, - .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_7, - .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_8, - .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_9, - .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_10, - .param .b64 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_11, - .param .b64 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_12, - .param .b64 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_13, - .param .b64 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_14, - .param .align 1 .b8 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_15[1] -) -{ - .reg .pred %p<9>; - .reg .s32 %r<63>; - .reg .s64 %rd<18>; - - - ld.param.u32 %r62, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_0]; - ld.param.u32 %r12, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_1]; - ld.param.u32 %r13, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_2]; - ld.param.u32 %r14, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_3]; - ld.param.u32 %r15, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_4]; - ld.param.u32 %r16, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_5]; - ld.param.u32 %r17, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_6]; - ld.param.u32 %r18, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_7]; - ld.param.u32 %r19, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_8]; - ld.param.u32 %r20, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_9]; - ld.param.u32 %r21, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_10]; - ld.param.u64 %rd4, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_11]; - ld.param.u64 %rd5, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_12]; - ld.param.u64 %rd6, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_13]; - ld.param.u64 %rd7, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_14]; - setp.ge.s32 %p1, %r62, %r12; - @%p1 bra BB13_14; - - mov.u32 %r22, 31; - sub.s32 %r23, %r22, %r13; - add.s32 %r24, %r23, %r14; - shr.s32 %r25, %r24, 31; - shr.u32 %r26, %r25, 27; - add.s32 %r27, %r24, %r26; - shr.s32 %r28, %r27, 5; - mov.u32 %r29, 7; - sub.s32 %r30, %r29, %r15; - add.s32 %r31, %r30, %r16; - shr.s32 %r32, %r31, 31; - shr.u32 %r33, %r32, 29; - add.s32 %r34, %r31, %r33; - shr.s32 %r1, %r34, 3; - sub.s32 %r35, %r29, %r17; - add.s32 %r36, %r35, %r18; - shr.s32 %r37, %r36, 31; - shr.u32 %r38, %r37, 29; - add.s32 %r39, %r36, %r38; - shr.s32 %r2, %r39, 3; - add.s32 %r40, %r28, -1; - shr.s32 %r41, %r40, 2; - add.s32 %r3, %r41, 1; - mov.u32 %r42, %tid.x; - and.b32 %r4, %r42, 31; - sub.s32 %r61, %r62, %r12; - -BB13_2: - and.b32 %r8, %r62, 1; - setp.ne.s32 %p2, %r4, 0; - mov.u64 %rd17, 0; - @%p2 bra BB13_4; - - mov.u64 %rd9, 8; - mov.u64 %rd10, 72; - // Callseq Start 2 - { - .reg .b32 temp_param_reg; - .param .b64 param0; - st.param.b64 [param0+0], %rd9; - .param .b64 param1; - st.param.b64 [param1+0], %rd10; - .param .b64 retval0; - call.uni (retval0), - cudaGetParameterBuffer, - ( - param0, - param1 - ); - ld.param.b64 %rd17, [retval0+0]; - } - // Callseq End 2 - -BB13_4: - setp.eq.s32 %p3, %r8, 0; - @%p3 bra BB13_9; - - setp.eq.s64 %p4, %rd17, 0; - @%p4 bra BB13_7; - - st.u32 [%rd17], %r13; - st.u32 [%rd17+4], %r14; - st.u32 [%rd17+8], %r15; - st.u32 [%rd17+12], %r16; - st.u32 [%rd17+16], %r17; - st.u32 [%rd17+20], %r18; - st.u32 [%rd17+24], %r19; - st.u32 [%rd17+28], %r20; - st.u32 [%rd17+32], %r21; - st.u64 [%rd17+40], %rd4; - st.u64 [%rd17+48], %rd5; - st.u64 [%rd17+56], %rd7; - st.u64 [%rd17+64], %rd6; - -BB13_7: - @%p2 bra BB13_13; - - mov.u32 %r47, 128; - mov.u32 %r49, 1; - mov.u32 %r50, 0; - mov.u64 %rd13, 0; - mov.u64 %rd11, stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_; - // inline asm - { - .param .b64 param0; - st.param.b64 [param0+0], %rd11; - .param .b64 param1; - st.param.b64 [param1+0], %rd17; - .param .align 4 .b8 param2[12]; - st.param.b32 [param2+0], %r3; - st.param.b32 [param2+4], %r1; - st.param.b32 [param2+8], %r2; - .param .align 4 .b8 param3[12]; - st.param.b32 [param3+0], %r47; - st.param.b32 [param3+4], %r49; - st.param.b32 [param3+8], %r49; - .param .b32 param4; - st.param.b32 [param4+0], %r50; - .param .b64 param5; - st.param.b64 [param5+0], %rd13; - - .param .b32 retval0; - call.uni (retval0), - cudaLaunchDevice, - ( - param0, - param1, - param2, - param3, - param4, - param5 - ); - ld.param.b32 %r43, [retval0+0]; - } - - // inline asm - bra.uni BB13_13; - -BB13_9: - setp.eq.s64 %p6, %rd17, 0; - @%p6 bra BB13_11; - - st.u32 [%rd17], %r13; - st.u32 [%rd17+4], %r14; - st.u32 [%rd17+8], %r15; - st.u32 [%rd17+12], %r16; - st.u32 [%rd17+16], %r17; - st.u32 [%rd17+20], %r18; - st.u32 [%rd17+24], %r19; - st.u32 [%rd17+28], %r20; - st.u32 [%rd17+32], %r21; - st.u64 [%rd17+40], %rd4; - st.u64 [%rd17+48], %rd5; - st.u64 [%rd17+56], %rd6; - st.u64 [%rd17+64], %rd7; - -BB13_11: - @%p2 bra BB13_13; - - mov.u32 %r55, 128; - mov.u32 %r57, 1; - mov.u32 %r58, 0; - mov.u64 %rd16, 0; - mov.u64 %rd14, stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_; - // inline asm - { - .param .b64 param0; - st.param.b64 [param0+0], %rd14; - .param .b64 param1; - st.param.b64 [param1+0], %rd17; - .param .align 4 .b8 param2[12]; - st.param.b32 [param2+0], %r3; - st.param.b32 [param2+4], %r1; - st.param.b32 [param2+8], %r2; - .param .align 4 .b8 param3[12]; - st.param.b32 [param3+0], %r55; - st.param.b32 [param3+4], %r57; - st.param.b32 [param3+8], %r57; - .param .b32 param4; - st.param.b32 [param4+0], %r58; - .param .b64 param5; - st.param.b64 [param5+0], %rd16; - - .param .b32 retval0; - call.uni (retval0), - cudaLaunchDevice, - ( - param0, - param1, - param2, - param3, - param4, - param5 - ); - ld.param.b32 %r51, [retval0+0]; - } - - // inline asm - -BB13_13: - // Callseq Start 3 - { - .reg .b32 temp_param_reg; - .param .b32 retval0; - call.uni (retval0), - cudaDeviceSynchronize, - ( - ); - ld.param.b32 %r59, [retval0+0]; - } - // Callseq End 3 - add.s32 %r62, %r62, 1; - add.s32 %r61, %r61, 1; - setp.ne.s32 %p8, %r61, 0; - @%p8 bra BB13_2; - -BB13_14: - // Callseq Start 4 - { - .reg .b32 temp_param_reg; - .param .b32 retval0; - call.uni (retval0), - cudaDeviceSynchronize, - ( - ); - ld.param.b32 %r60, [retval0+0]; - } - // Callseq End 4 - ret; -} - -.visible .entry loop_stencil_ispc_tasks( - .param .u32 loop_stencil_ispc_tasks_param_0, - .param .u32 loop_stencil_ispc_tasks_param_1, - .param .u32 loop_stencil_ispc_tasks_param_2, - .param .u32 loop_stencil_ispc_tasks_param_3, - .param .u32 loop_stencil_ispc_tasks_param_4, - .param .u32 loop_stencil_ispc_tasks_param_5, - .param .u32 loop_stencil_ispc_tasks_param_6, - .param .u32 loop_stencil_ispc_tasks_param_7, - .param .u32 loop_stencil_ispc_tasks_param_8, - .param .u32 loop_stencil_ispc_tasks_param_9, - .param .u32 loop_stencil_ispc_tasks_param_10, - .param .u64 loop_stencil_ispc_tasks_param_11, - .param .u64 loop_stencil_ispc_tasks_param_12, - .param .u64 loop_stencil_ispc_tasks_param_13, - .param .u64 loop_stencil_ispc_tasks_param_14 -) -{ - .reg .pred %p<9>; - .reg .s32 %r<63>; - .reg .s64 %rd<18>; - - - ld.param.u32 %r62, [loop_stencil_ispc_tasks_param_0]; - ld.param.u32 %r12, [loop_stencil_ispc_tasks_param_1]; - ld.param.u32 %r13, [loop_stencil_ispc_tasks_param_2]; - ld.param.u32 %r14, [loop_stencil_ispc_tasks_param_3]; - ld.param.u32 %r15, [loop_stencil_ispc_tasks_param_4]; - ld.param.u32 %r16, [loop_stencil_ispc_tasks_param_5]; - ld.param.u32 %r17, [loop_stencil_ispc_tasks_param_6]; - ld.param.u32 %r18, [loop_stencil_ispc_tasks_param_7]; - ld.param.u32 %r19, [loop_stencil_ispc_tasks_param_8]; - ld.param.u32 %r20, [loop_stencil_ispc_tasks_param_9]; - ld.param.u32 %r21, [loop_stencil_ispc_tasks_param_10]; - ld.param.u64 %rd4, [loop_stencil_ispc_tasks_param_11]; - ld.param.u64 %rd5, [loop_stencil_ispc_tasks_param_12]; - ld.param.u64 %rd6, [loop_stencil_ispc_tasks_param_13]; - ld.param.u64 %rd7, [loop_stencil_ispc_tasks_param_14]; - setp.ge.s32 %p1, %r62, %r12; - @%p1 bra BB14_14; - - mov.u32 %r22, 31; - sub.s32 %r23, %r22, %r13; - add.s32 %r24, %r23, %r14; - shr.s32 %r25, %r24, 31; - shr.u32 %r26, %r25, 27; - add.s32 %r27, %r24, %r26; - shr.s32 %r28, %r27, 5; - mov.u32 %r29, 7; - sub.s32 %r30, %r29, %r15; - add.s32 %r31, %r30, %r16; - shr.s32 %r32, %r31, 31; - shr.u32 %r33, %r32, 29; - add.s32 %r34, %r31, %r33; - shr.s32 %r1, %r34, 3; - sub.s32 %r35, %r29, %r17; - add.s32 %r36, %r35, %r18; - shr.s32 %r37, %r36, 31; - shr.u32 %r38, %r37, 29; - add.s32 %r39, %r36, %r38; - shr.s32 %r2, %r39, 3; - add.s32 %r40, %r28, -1; - shr.s32 %r41, %r40, 2; - add.s32 %r3, %r41, 1; - mov.u32 %r42, %tid.x; - and.b32 %r4, %r42, 31; - sub.s32 %r61, %r62, %r12; - -BB14_2: - and.b32 %r8, %r62, 1; - setp.ne.s32 %p2, %r4, 0; - mov.u64 %rd17, 0; - @%p2 bra BB14_4; - - mov.u64 %rd9, 8; - mov.u64 %rd10, 72; - // Callseq Start 5 - { - .reg .b32 temp_param_reg; - .param .b64 param0; - st.param.b64 [param0+0], %rd9; - .param .b64 param1; - st.param.b64 [param1+0], %rd10; - .param .b64 retval0; - call.uni (retval0), - cudaGetParameterBuffer, - ( - param0, - param1 - ); - ld.param.b64 %rd17, [retval0+0]; - } - // Callseq End 5 - -BB14_4: - setp.eq.s32 %p3, %r8, 0; - @%p3 bra BB14_9; - - setp.eq.s64 %p4, %rd17, 0; - @%p4 bra BB14_7; - - st.u32 [%rd17], %r13; - st.u32 [%rd17+4], %r14; - st.u32 [%rd17+8], %r15; - st.u32 [%rd17+12], %r16; - st.u32 [%rd17+16], %r17; - st.u32 [%rd17+20], %r18; - st.u32 [%rd17+24], %r19; - st.u32 [%rd17+28], %r20; - st.u32 [%rd17+32], %r21; - st.u64 [%rd17+40], %rd4; - st.u64 [%rd17+48], %rd5; - st.u64 [%rd17+56], %rd7; - st.u64 [%rd17+64], %rd6; - -BB14_7: - @%p2 bra BB14_13; - - mov.u32 %r47, 128; - mov.u32 %r49, 1; - mov.u32 %r50, 0; - mov.u64 %rd13, 0; - mov.u64 %rd11, stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_; - // inline asm - { - .param .b64 param0; - st.param.b64 [param0+0], %rd11; - .param .b64 param1; - st.param.b64 [param1+0], %rd17; - .param .align 4 .b8 param2[12]; - st.param.b32 [param2+0], %r3; - st.param.b32 [param2+4], %r1; - st.param.b32 [param2+8], %r2; - .param .align 4 .b8 param3[12]; - st.param.b32 [param3+0], %r47; - st.param.b32 [param3+4], %r49; - st.param.b32 [param3+8], %r49; - .param .b32 param4; - st.param.b32 [param4+0], %r50; - .param .b64 param5; - st.param.b64 [param5+0], %rd13; - - .param .b32 retval0; - call.uni (retval0), - cudaLaunchDevice, - ( - param0, - param1, - param2, - param3, - param4, - param5 - ); - ld.param.b32 %r43, [retval0+0]; - } - - // inline asm - bra.uni BB14_13; - -BB14_9: - setp.eq.s64 %p6, %rd17, 0; - @%p6 bra BB14_11; - - st.u32 [%rd17], %r13; - st.u32 [%rd17+4], %r14; - st.u32 [%rd17+8], %r15; - st.u32 [%rd17+12], %r16; - st.u32 [%rd17+16], %r17; - st.u32 [%rd17+20], %r18; - st.u32 [%rd17+24], %r19; - st.u32 [%rd17+28], %r20; - st.u32 [%rd17+32], %r21; - st.u64 [%rd17+40], %rd4; - st.u64 [%rd17+48], %rd5; - st.u64 [%rd17+56], %rd6; - st.u64 [%rd17+64], %rd7; - -BB14_11: - @%p2 bra BB14_13; - - mov.u32 %r55, 128; - mov.u32 %r57, 1; - mov.u32 %r58, 0; - mov.u64 %rd16, 0; - mov.u64 %rd14, stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_; - // inline asm - { - .param .b64 param0; - st.param.b64 [param0+0], %rd14; - .param .b64 param1; - st.param.b64 [param1+0], %rd17; - .param .align 4 .b8 param2[12]; - st.param.b32 [param2+0], %r3; - st.param.b32 [param2+4], %r1; - st.param.b32 [param2+8], %r2; - .param .align 4 .b8 param3[12]; - st.param.b32 [param3+0], %r55; - st.param.b32 [param3+4], %r57; - st.param.b32 [param3+8], %r57; - .param .b32 param4; - st.param.b32 [param4+0], %r58; - .param .b64 param5; - st.param.b64 [param5+0], %rd16; - - .param .b32 retval0; - call.uni (retval0), - cudaLaunchDevice, - ( - param0, - param1, - param2, - param3, - param4, - param5 - ); - ld.param.b32 %r51, [retval0+0]; - } - - // inline asm - -BB14_13: - // Callseq Start 6 - { - .reg .b32 temp_param_reg; - .param .b32 retval0; - call.uni (retval0), - cudaDeviceSynchronize, - ( - ); - ld.param.b32 %r59, [retval0+0]; - } - // Callseq End 6 - add.s32 %r62, %r62, 1; - add.s32 %r61, %r61, 1; - setp.ne.s32 %p8, %r61, 0; - @%p8 bra BB14_2; - -BB14_14: - // Callseq Start 7 - { - .reg .b32 temp_param_reg; - .param .b32 retval0; - call.uni (retval0), - cudaDeviceSynchronize, - ( - ); - ld.param.b32 %r60, [retval0+0]; - } - // Callseq End 7 - ret; -} - - - diff --git a/examples_cuda/stencil/drvapi_error_string.h b/examples_cuda/stencil/drvapi_error_string.h deleted file mode 100644 index ce85f152..00000000 --- a/examples_cuda/stencil/drvapi_error_string.h +++ /dev/null @@ -1,370 +0,0 @@ -/* - * Copyright 1993-2012 NVIDIA Corporation. All rights reserved. - * - * Please refer to the NVIDIA end user license agreement (EULA) associated - * with this source code for terms and conditions that govern your use of - * this software. Any use, reproduction, disclosure, or distribution of - * this software and related documentation outside the terms of the EULA - * is strictly prohibited. - * - */ - -#ifndef _DRVAPI_ERROR_STRING_H_ -#define _DRVAPI_ERROR_STRING_H_ - -#include -#include -#include - -// Error Code string definitions here -typedef struct -{ - char const *error_string; - int error_id; -} s_CudaErrorStr; - -/** - * Error codes - */ -static s_CudaErrorStr sCudaDrvErrorString[] = -{ - /** - * The API call returned with no errors. In the case of query calls, this - * can also mean that the operation being queried is complete (see - * ::cuEventQuery() and ::cuStreamQuery()). - */ - { "CUDA_SUCCESS", 0 }, - - /** - * This indicates that one or more of the parameters passed to the API call - * is not within an acceptable range of values. - */ - { "CUDA_ERROR_INVALID_VALUE", 1 }, - - /** - * The API call failed because it was unable to allocate enough memory to - * perform the requested operation. - */ - { "CUDA_ERROR_OUT_OF_MEMORY", 2 }, - - /** - * This indicates that the CUDA driver has not been initialized with - * ::cuInit() or that initialization has failed. - */ - { "CUDA_ERROR_NOT_INITIALIZED", 3 }, - - /** - * This indicates that the CUDA driver is in the process of shutting down. - */ - { "CUDA_ERROR_DEINITIALIZED", 4 }, - - /** - * This indicates profiling APIs are called while application is running - * in visual profiler mode. - */ - { "CUDA_ERROR_PROFILER_DISABLED", 5 }, - /** - * This indicates profiling has not been initialized for this context. - * Call cuProfilerInitialize() to resolve this. - */ - { "CUDA_ERROR_PROFILER_NOT_INITIALIZED", 6 }, - /** - * This indicates profiler has already been started and probably - * cuProfilerStart() is incorrectly called. - */ - { "CUDA_ERROR_PROFILER_ALREADY_STARTED", 7 }, - /** - * This indicates profiler has already been stopped and probably - * cuProfilerStop() is incorrectly called. - */ - { "CUDA_ERROR_PROFILER_ALREADY_STOPPED", 8 }, - /** - * This indicates that no CUDA-capable devices were detected by the installed - * CUDA driver. - */ - { "CUDA_ERROR_NO_DEVICE (no CUDA-capable devices were detected)", 100 }, - - /** - * This indicates that the device ordinal supplied by the user does not - * correspond to a valid CUDA device. - */ - { "CUDA_ERROR_INVALID_DEVICE (device specified is not a valid CUDA device)", 101 }, - - - /** - * This indicates that the device kernel image is invalid. This can also - * indicate an invalid CUDA module. - */ - { "CUDA_ERROR_INVALID_IMAGE", 200 }, - - /** - * This most frequently indicates that there is no context bound to the - * current thread. This can also be returned if the context passed to an - * API call is not a valid handle (such as a context that has had - * ::cuCtxDestroy() invoked on it). This can also be returned if a user - * mixes different API versions (i.e. 3010 context with 3020 API calls). - * See ::cuCtxGetApiVersion() for more details. - */ - { "CUDA_ERROR_INVALID_CONTEXT", 201 }, - - /** - * This indicated that the context being supplied as a parameter to the - * API call was already the active context. - * \deprecated - * This error return is deprecated as of CUDA 3.2. It is no longer an - * error to attempt to push the active context via ::cuCtxPushCurrent(). - */ - { "CUDA_ERROR_CONTEXT_ALREADY_CURRENT", 202 }, - - /** - * This indicates that a map or register operation has failed. - */ - { "CUDA_ERROR_MAP_FAILED", 205 }, - - /** - * This indicates that an unmap or unregister operation has failed. - */ - { "CUDA_ERROR_UNMAP_FAILED", 206 }, - - /** - * This indicates that the specified array is currently mapped and thus - * cannot be destroyed. - */ - { "CUDA_ERROR_ARRAY_IS_MAPPED", 207 }, - - /** - * This indicates that the resource is already mapped. - */ - { "CUDA_ERROR_ALREADY_MAPPED", 208 }, - - /** - * This indicates that there is no kernel image available that is suitable - * for the device. This can occur when a user specifies code generation - * options for a particular CUDA source file that do not include the - * corresponding device configuration. - */ - { "CUDA_ERROR_NO_BINARY_FOR_GPU", 209 }, - - /** - * This indicates that a resource has already been acquired. - */ - { "CUDA_ERROR_ALREADY_ACQUIRED", 210 }, - - /** - * This indicates that a resource is not mapped. - */ - { "CUDA_ERROR_NOT_MAPPED", 211 }, - - /** - * This indicates that a mapped resource is not available for access as an - * array. - */ - { "CUDA_ERROR_NOT_MAPPED_AS_ARRAY", 212 }, - - /** - * This indicates that a mapped resource is not available for access as a - * pointer. - */ - { "CUDA_ERROR_NOT_MAPPED_AS_POINTER", 213 }, - - /** - * This indicates that an uncorrectable ECC error was detected during - * execution. - */ - { "CUDA_ERROR_ECC_UNCORRECTABLE", 214 }, - - /** - * This indicates that the ::CUlimit passed to the API call is not - * supported by the active device. - */ - { "CUDA_ERROR_UNSUPPORTED_LIMIT", 215 }, - - /** - * This indicates that the ::CUcontext passed to the API call can - * only be bound to a single CPU thread at a time but is already - * bound to a CPU thread. - */ - { "CUDA_ERROR_CONTEXT_ALREADY_IN_USE", 216 }, - - /** - * This indicates that peer access is not supported across the given - * devices. - */ - { "CUDA_ERROR_PEER_ACCESS_UNSUPPORTED", 217}, - - /** - * This indicates that the device kernel source is invalid. - */ - { "CUDA_ERROR_INVALID_SOURCE", 300 }, - - /** - * This indicates that the file specified was not found. - */ - { "CUDA_ERROR_FILE_NOT_FOUND", 301 }, - - /** - * This indicates that a link to a shared object failed to resolve. - */ - { "CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND", 302 }, - - /** - * This indicates that initialization of a shared object failed. - */ - { "CUDA_ERROR_SHARED_OBJECT_INIT_FAILED", 303 }, - - /** - * This indicates that an OS call failed. - */ - { "CUDA_ERROR_OPERATING_SYSTEM", 304 }, - - - /** - * This indicates that a resource handle passed to the API call was not - * valid. Resource handles are opaque types like ::CUstream and ::CUevent. - */ - { "CUDA_ERROR_INVALID_HANDLE", 400 }, - - - /** - * This indicates that a named symbol was not found. Examples of symbols - * are global/constant variable names, texture names }, and surface names. - */ - { "CUDA_ERROR_NOT_FOUND", 500 }, - - - /** - * This indicates that asynchronous operations issued previously have not - * completed yet. This result is not actually an error, but must be indicated - * differently than ::CUDA_SUCCESS (which indicates completion). Calls that - * may return this value include ::cuEventQuery() and ::cuStreamQuery(). - */ - { "CUDA_ERROR_NOT_READY", 600 }, - - - /** - * An exception occurred on the device while executing a kernel. Common - * causes include dereferencing an invalid device pointer and accessing - * out of bounds shared memory. The context cannot be used }, so it must - * be destroyed (and a new one should be created). All existing device - * memory allocations from this context are invalid and must be - * reconstructed if the program is to continue using CUDA. - */ - { "CUDA_ERROR_LAUNCH_FAILED", 700 }, - - /** - * This indicates that a launch did not occur because it did not have - * appropriate resources. This error usually indicates that the user has - * attempted to pass too many arguments to the device kernel, or the - * kernel launch specifies too many threads for the kernel's register - * count. Passing arguments of the wrong size (i.e. a 64-bit pointer - * when a 32-bit int is expected) is equivalent to passing too many - * arguments and can also result in this error. - */ - { "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES", 701 }, - - /** - * This indicates that the device kernel took too long to execute. This can - * only occur if timeouts are enabled - see the device attribute - * ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT for more information. The - * context cannot be used (and must be destroyed similar to - * ::CUDA_ERROR_LAUNCH_FAILED). All existing device memory allocations from - * this context are invalid and must be reconstructed if the program is to - * continue using CUDA. - */ - { "CUDA_ERROR_LAUNCH_TIMEOUT", 702 }, - - /** - * This error indicates a kernel launch that uses an incompatible texturing - * mode. - */ - { "CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING", 703 }, - - /** - * This error indicates that a call to ::cuCtxEnablePeerAccess() is - * trying to re-enable peer access to a context which has already - * had peer access to it enabled. - */ - { "CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED", 704 }, - - /** - * This error indicates that ::cuCtxDisablePeerAccess() is - * trying to disable peer access which has not been enabled yet - * via ::cuCtxEnablePeerAccess(). - */ - { "CUDA_ERROR_PEER_ACCESS_NOT_ENABLED", 705 }, - - /** - * This error indicates that the primary context for the specified device - * has already been initialized. - */ - { "CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE", 708 }, - - /** - * This error indicates that the context current to the calling thread - * has been destroyed using ::cuCtxDestroy }, or is a primary context which - * has not yet been initialized. - */ - { "CUDA_ERROR_CONTEXT_IS_DESTROYED", 709 }, - - /** - * A device-side assert triggered during kernel execution. The context - * cannot be used anymore, and must be destroyed. All existing device - * memory allocations from this context are invalid and must be - * reconstructed if the program is to continue using CUDA. - */ - { "CUDA_ERROR_ASSERT", 710 }, - - /** - * This error indicates that the hardware resources required to enable - * peer access have been exhausted for one or more of the devices - * passed to ::cuCtxEnablePeerAccess(). - */ - { "CUDA_ERROR_TOO_MANY_PEERS", 711 }, - - /** - * This error indicates that the memory range passed to ::cuMemHostRegister() - * has already been registered. - */ - { "CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED", 712 }, - - /** - * This error indicates that the pointer passed to ::cuMemHostUnregister() - * does not correspond to any currently registered memory region. - */ - { "CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED", 713 }, - - /** - * This error indicates that the attempted operation is not permitted. - */ - { "CUDA_ERROR_NOT_PERMITTED", 800 }, - - /** - * This error indicates that the attempted operation is not supported - * on the current system or device. - */ - { "CUDA_ERROR_NOT_SUPPORTED", 801 }, - - /** - * This indicates that an unknown internal error has occurred. - */ - { "CUDA_ERROR_UNKNOWN", 999 }, - { NULL, -1 } -}; - -// This is just a linear search through the array, since the error_id's are not -// always ocurring consecutively -const char * getCudaDrvErrorString(CUresult error_id) -{ - int index = 0; - while (sCudaDrvErrorString[index].error_id != error_id && - sCudaDrvErrorString[index].error_id != -1) - { - index++; - } - if (sCudaDrvErrorString[index].error_id == error_id) - return (const char *)sCudaDrvErrorString[index].error_string; - else - return (const char *)"CUDA_ERROR not found!"; -} - -#endif diff --git a/examples_cuda/stencil/kernel.ptx b/examples_cuda/stencil/kernel.ptx deleted file mode 100644 index b0339cbf..00000000 --- a/examples_cuda/stencil/kernel.ptx +++ /dev/null @@ -1,1246 +0,0 @@ -// -// Generated by NVIDIA NVVM Compiler -// Compiler built on Thu Jul 18 02:37:37 2013 (1374107857) -// Cuda compilation tools, release 5.5, V5.5.0 -// - -.version 3.2 -.target sm_35 -.address_size 64 - - -.extern .func (.param .b32 func_retval0) cudaLaunchDevice -( - .param .b64 cudaLaunchDevice_param_0, - .param .b64 cudaLaunchDevice_param_1, - .param .align 4 .b8 cudaLaunchDevice_param_2[12], - .param .align 4 .b8 cudaLaunchDevice_param_3[12], - .param .b32 cudaLaunchDevice_param_4, - .param .b64 cudaLaunchDevice_param_5 -); - - -.extern .func (.param .b64 func_retval0) cudaGetParameterBuffer -( - .param .b64 cudaGetParameterBuffer_param_0, - .param .b64 cudaGetParameterBuffer_param_1 -) -; -.extern .func (.param .b32 func_retval0) cudaDeviceSynchronize -( - -) -; -.global .align 1 .b8 constDeltaForeach1[32]; -.global .align 1 .b8 constDeltaForeach4[32] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}; - -.visible .func (.param .b32 func_retval0) __shfl_i32( - .param .b32 __shfl_i32_param_0, - .param .b32 __shfl_i32_param_1 -) -{ - .reg .s32 %r<4>; - - - ld.param.u32 %r2, [__shfl_i32_param_0]; - ld.param.u32 %r3, [__shfl_i32_param_1]; - // inline asm - shfl.idx.b32 %r1, %r2, %r3, 0x1f; - // inline asm - st.param.b32 [func_retval0+0], %r1; - ret; -} - -.visible .func (.param .b32 func_retval0) __shfl_xor_float( - .param .b32 __shfl_xor_float_param_0, - .param .b32 __shfl_xor_float_param_1 -) -{ - .reg .s32 %r<2>; - .reg .f32 %f<3>; - - - ld.param.f32 %f2, [__shfl_xor_float_param_0]; - ld.param.u32 %r1, [__shfl_xor_float_param_1]; - // inline asm - shfl.bfly.b32 %f1, %f2, %r1, 0x1f; - // inline asm - st.param.f32 [func_retval0+0], %f1; - ret; -} - -.visible .func (.param .b32 func_retval0) __shfl_xor_i32( - .param .b32 __shfl_xor_i32_param_0, - .param .b32 __shfl_xor_i32_param_1 -) -{ - .reg .s32 %r<4>; - - - ld.param.u32 %r2, [__shfl_xor_i32_param_0]; - ld.param.u32 %r3, [__shfl_xor_i32_param_1]; - // inline asm - shfl.bfly.b32 %r1, %r2, %r3, 0x1f; - // inline asm - st.param.b32 [func_retval0+0], %r1; - ret; -} - -.visible .func (.param .b32 func_retval0) __fminf( - .param .b32 __fminf_param_0, - .param .b32 __fminf_param_1 -) -{ - .reg .f32 %f<4>; - - - ld.param.f32 %f2, [__fminf_param_0]; - ld.param.f32 %f3, [__fminf_param_1]; - // inline asm - min.f32 %f1, %f2, %f3; - // inline asm - st.param.f32 [func_retval0+0], %f1; - ret; -} - -.visible .func (.param .b32 func_retval0) __fmaxf( - .param .b32 __fmaxf_param_0, - .param .b32 __fmaxf_param_1 -) -{ - .reg .f32 %f<4>; - - - ld.param.f32 %f2, [__fmaxf_param_0]; - ld.param.f32 %f3, [__fmaxf_param_1]; - // inline asm - max.f32 %f1, %f2, %f3; - // inline asm - st.param.f32 [func_retval0+0], %f1; - ret; -} - -.visible .func (.param .b32 func_retval0) __ballot( - .param .b32 __ballot_param_0 -) -{ - .reg .s32 %r<3>; - - - ld.param.u8 %r2, [__ballot_param_0]; - // inline asm - { .reg .pred %p1; - setp.ne.u32 %p1, %r2, 0; - vote.ballot.b32 %r1, %p1; - } - // inline asm - st.param.b32 [func_retval0+0], %r1; - ret; -} - -.visible .func (.param .b32 func_retval0) __lanemask_lt( - -) -{ - .reg .s32 %r<2>; - - - // inline asm - mov.u32 %r1, %lanemask_lt; - // inline asm - st.param.b32 [func_retval0+0], %r1; - ret; -} - -.visible .func (.param .b64 func_retval0) ISPCAlloc( - .param .b64 ISPCAlloc_param_0, - .param .b64 ISPCAlloc_param_1, - .param .b32 ISPCAlloc_param_2 -) -{ - .reg .s64 %rd<2>; - - - mov.u64 %rd1, 1; - st.param.b64 [func_retval0+0], %rd1; - ret; -} - -.visible .func (.param .b64 func_retval0) ISPCGetParamBuffer( - .param .b64 ISPCGetParamBuffer_param_0, - .param .b64 ISPCGetParamBuffer_param_1, - .param .b64 ISPCGetParamBuffer_param_2 -) -{ - .reg .pred %p<2>; - .reg .s32 %r<3>; - .reg .s64 %rd<7>; - - - ld.param.u64 %rd3, [ISPCGetParamBuffer_param_1]; - ld.param.u64 %rd4, [ISPCGetParamBuffer_param_2]; - mov.u32 %r1, %tid.x; - and.b32 %r2, %r1, 31; - setp.ne.s32 %p1, %r2, 0; - mov.u64 %rd6, 0; - @%p1 bra BB8_2; - - // Callseq Start 0 - { - .reg .b32 temp_param_reg; - .param .b64 param0; - st.param.b64 [param0+0], %rd3; - .param .b64 param1; - st.param.b64 [param1+0], %rd4; - .param .b64 retval0; - call.uni (retval0), - cudaGetParameterBuffer, - ( - param0, - param1 - ); - ld.param.b64 %rd6, [retval0+0]; - } - // Callseq End 0 - -BB8_2: - st.param.b64 [func_retval0+0], %rd6; - ret; -} - -.visible .func ISPCLaunch( - .param .b64 ISPCLaunch_param_0, - .param .b64 ISPCLaunch_param_1, - .param .b64 ISPCLaunch_param_2, - .param .b32 ISPCLaunch_param_3, - .param .b32 ISPCLaunch_param_4, - .param .b32 ISPCLaunch_param_5 -) -{ - .reg .pred %p<2>; - .reg .s32 %r<16>; - .reg .s64 %rd<6>; - - - ld.param.u64 %rd1, [ISPCLaunch_param_1]; - ld.param.u64 %rd2, [ISPCLaunch_param_2]; - ld.param.u32 %r1, [ISPCLaunch_param_3]; - ld.param.u32 %r2, [ISPCLaunch_param_4]; - ld.param.u32 %r3, [ISPCLaunch_param_5]; - mov.u32 %r4, %tid.x; - and.b32 %r5, %r4, 31; - setp.ne.s32 %p1, %r5, 0; - @%p1 bra BB9_2; - - add.s32 %r14, %r1, -1; - shr.s32 %r15, %r14, 2; - add.s32 %r7, %r15, 1; - mov.u32 %r12, 1; - mov.u32 %r10, 128; - mov.u32 %r13, 0; - mov.u64 %rd5, 0; - // inline asm - { - .param .b64 param0; - st.param.b64 [param0+0], %rd1; - .param .b64 param1; - st.param.b64 [param1+0], %rd2; - .param .align 4 .b8 param2[12]; - st.param.b32 [param2+0], %r7; - st.param.b32 [param2+4], %r2; - st.param.b32 [param2+8], %r3; - .param .align 4 .b8 param3[12]; - st.param.b32 [param3+0], %r10; - st.param.b32 [param3+4], %r12; - st.param.b32 [param3+8], %r12; - .param .b32 param4; - st.param.b32 [param4+0], %r13; - .param .b64 param5; - st.param.b64 [param5+0], %rd5; - - .param .b32 retval0; - call.uni (retval0), - cudaLaunchDevice, - ( - param0, - param1, - param2, - param3, - param4, - param5 - ); - ld.param.b32 %r6, [retval0+0]; - } - - // inline asm - -BB9_2: - ret; -} - -.visible .func ISPCSync( - .param .b64 ISPCSync_param_0 -) -{ - .reg .s32 %r<2>; - - - // Callseq Start 1 - { - .reg .b32 temp_param_reg; - .param .b32 retval0; - call.uni (retval0), - cudaDeviceSynchronize, - ( - ); - ld.param.b32 %r1, [retval0+0]; - } - // Callseq End 1 - ret; -} - -.visible .func (.param .b64 func_retval0) __warpBinExclusiveScan( - .param .b32 __warpBinExclusiveScan_param_0 -) -{ - .reg .s32 %r<8>; - .reg .s64 %rd<5>; - - - ld.param.u8 %r2, [__warpBinExclusiveScan_param_0]; - // inline asm - { .reg .pred %p1; - setp.ne.u32 %p1, %r2, 0; - vote.ballot.b32 %r1, %p1; - } - // inline asm - // inline asm - popc.b32 %r3, %r1; - // inline asm - // inline asm - mov.u32 %r5, %lanemask_lt; - // inline asm - and.b32 %r7, %r5, %r1; - // inline asm - popc.b32 %r6, %r7; - // inline asm - cvt.u64.u32 %rd1, %r6; - shl.b64 %rd2, %rd1, 32; - cvt.u64.u32 %rd3, %r3; - or.b64 %rd4, %rd2, %rd3; - st.param.b64 [func_retval0+0], %rd4; - ret; -} - -.entry stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_( - .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_0, - .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_1, - .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_2, - .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_3, - .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_4, - .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_5, - .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_6, - .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_7, - .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_8, - .param .u64 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_9, - .param .u64 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_10, - .param .u64 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_11, - .param .u64 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_12 -) -{ - .reg .pred %p<14>; - .reg .s32 %r<178>; - .reg .s64 %rd<96>; - .reg .f64 %fd<95>; - - - ld.param.u32 %r42, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_0]; - ld.param.u32 %r43, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_1]; - ld.param.u32 %r44, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_2]; - ld.param.u32 %r45, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_3]; - ld.param.u32 %r46, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_4]; - ld.param.u32 %r47, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_5]; - ld.param.u32 %r48, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_6]; - ld.param.u32 %r49, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_7]; - ld.param.u64 %rd2, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_9]; - ld.param.u64 %rd3, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_10]; - ld.param.u64 %rd4, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_11]; - ld.param.u64 %rd5, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_12]; - mov.u32 %r1, %ctaid.x; - shl.b32 %r50, %r1, 2; - mov.u32 %r2, %tid.x; - shr.s32 %r51, %r2, 5; - add.s32 %r52, %r51, %r50; - mov.u32 %r53, %nctaid.x; - shl.b32 %r54, %r53, 2; - setp.ge.s32 %p1, %r52, %r54; - mov.u32 %r55, %nctaid.y; - mov.u32 %r3, %ctaid.y; - setp.ge.s32 %p2, %r3, %r55; - or.pred %p3, %p1, %p2; - mov.u32 %r56, %nctaid.z; - mov.u32 %r4, %ctaid.z; - setp.ge.s32 %p4, %r4, %r56; - or.pred %p5, %p3, %p4; - @%p5 bra BB12_13; - - shl.b32 %r57, %r1, 7; - add.s32 %r58, %r2, %r57; - and.b32 %r59, %r58, -32; - add.s32 %r60, %r59, %r42; - add.s32 %r61, %r60, 32; - min.s32 %r5, %r43, %r61; - shl.b32 %r6, %r3, 3; - add.s32 %r62, %r6, %r44; - add.s32 %r7, %r62, 8; - shl.b32 %r8, %r4, 3; - add.s32 %r172, %r8, %r46; - add.s32 %r63, %r172, 8; - min.s32 %r64, %r47, %r63; - mul.lo.s32 %r10, %r49, %r48; - sub.s32 %r65, %r5, %r60; - shr.s32 %r66, %r65, 31; - shr.u32 %r67, %r66, 27; - add.s32 %r68, %r65, %r67; - and.b32 %r69, %r68, -32; - sub.s32 %r70, %r65, %r69; - sub.s32 %r11, %r5, %r70; - and.b32 %r71, %r2, 31; - cvt.u64.u32 %rd6, %r71; - mov.u64 %rd7, constDeltaForeach1; - add.s64 %rd1, %rd7, %rd6; - setp.ge.s32 %p6, %r172, %r64; - @%p6 bra BB12_13; - - min.s32 %r12, %r45, %r7; - shl.b32 %r15, %r10, 1; - neg.s32 %r16, %r15; - mul.lo.s32 %r17, %r10, 3; - mul.lo.s32 %r18, %r10, -3; - mov.u32 %r72, -9; - sub.s32 %r73, %r72, %r44; - sub.s32 %r74, %r73, %r6; - not.b32 %r75, %r45; - max.s32 %r76, %r74, %r75; - not.b32 %r19, %r76; - sub.s32 %r77, %r72, %r46; - sub.s32 %r78, %r77, %r8; - not.b32 %r79, %r47; - max.s32 %r80, %r78, %r79; - not.b32 %r20, %r80; - ld.global.u8 %r13, [%rd1]; - mov.u32 %r171, %r172; - -BB12_3: - mov.u32 %r21, %r171; - add.s32 %r23, %r21, %r13; - setp.ge.s32 %p7, %r62, %r12; - @%p7 bra BB12_12; - - mul.lo.s32 %r24, %r23, %r10; - mov.u32 %r174, %r62; - mov.u32 %r173, %r62; - -BB12_5: - mov.u32 %r27, %r173; - add.s32 %r30, %r27, %r13; - setp.ge.s32 %p8, %r60, %r11; - mov.u32 %r176, %r60; - @%p8 bra BB12_8; - - mov.u64 %rd9, constDeltaForeach4; - add.s64 %rd10, %rd9, %rd6; - ld.global.u8 %r31, [%rd10]; - mad.lo.s32 %r32, %r30, %r48, %r24; - add.s32 %r177, %r59, %r42; - -BB12_7: - cvta.to.global.u64 %rd11, %rd2; - add.s32 %r98, %r32, %r177; - add.s32 %r99, %r98, %r31; - shl.b32 %r100, %r99, 3; - cvt.s64.s32 %rd12, %r100; - add.s64 %rd13, %rd12, %rd4; - add.s32 %r101, %r100, 8; - cvt.s64.s32 %rd14, %r101; - add.s64 %rd15, %rd14, %rd4; - add.s32 %r102, %r100, -8; - cvt.s64.s32 %rd16, %r102; - add.s64 %rd17, %rd16, %rd4; - add.s32 %r103, %r99, %r48; - shl.b32 %r104, %r103, 3; - cvt.s64.s32 %rd18, %r104; - add.s64 %rd19, %rd18, %rd4; - sub.s32 %r105, %r99, %r48; - shl.b32 %r106, %r105, 3; - cvt.s64.s32 %rd20, %r106; - add.s64 %rd21, %rd20, %rd4; - add.s32 %r108, %r99, %r10; - shl.b32 %r109, %r108, 3; - cvt.s64.s32 %rd22, %r109; - add.s64 %rd23, %rd22, %rd4; - sub.s32 %r110, %r99, %r10; - shl.b32 %r111, %r110, 3; - cvt.s64.s32 %rd24, %r111; - add.s64 %rd25, %rd24, %rd4; - add.s32 %r112, %r100, 16; - cvt.s64.s32 %rd26, %r112; - add.s64 %rd27, %rd26, %rd4; - add.s32 %r113, %r100, -16; - cvt.s64.s32 %rd28, %r113; - add.s64 %rd29, %rd28, %rd4; - shl.b32 %r114, %r48, 1; - add.s32 %r115, %r99, %r114; - shl.b32 %r116, %r115, 3; - cvt.s64.s32 %rd30, %r116; - add.s64 %rd31, %rd30, %rd4; - mad.lo.s32 %r117, %r48, -2, %r99; - shl.b32 %r118, %r117, 3; - cvt.s64.s32 %rd32, %r118; - add.s64 %rd33, %rd32, %rd4; - add.s32 %r119, %r99, %r15; - shl.b32 %r120, %r119, 3; - cvt.s64.s32 %rd34, %r120; - add.s64 %rd35, %rd34, %rd4; - add.s32 %r121, %r99, %r16; - shl.b32 %r122, %r121, 3; - cvt.s64.s32 %rd36, %r122; - add.s64 %rd37, %rd36, %rd4; - add.s32 %r123, %r100, 24; - cvt.s64.s32 %rd38, %r123; - add.s64 %rd39, %rd38, %rd4; - add.s32 %r124, %r100, -24; - cvt.s64.s32 %rd40, %r124; - add.s64 %rd41, %rd40, %rd4; - mad.lo.s32 %r125, %r48, 3, %r99; - shl.b32 %r126, %r125, 3; - cvt.s64.s32 %rd42, %r126; - add.s64 %rd43, %rd42, %rd4; - mad.lo.s32 %r127, %r48, -3, %r99; - shl.b32 %r128, %r127, 3; - cvt.s64.s32 %rd44, %r128; - add.s64 %rd45, %rd44, %rd4; - add.s32 %r129, %r99, %r17; - shl.b32 %r130, %r129, 3; - cvt.s64.s32 %rd46, %r130; - add.s64 %rd47, %rd46, %rd4; - add.s32 %r131, %r99, %r18; - shl.b32 %r132, %r131, 3; - cvt.s64.s32 %rd48, %r132; - add.s64 %rd49, %rd48, %rd4; - add.s64 %rd50, %rd12, %rd5; - add.s64 %rd51, %rd12, %rd3; - ld.f64 %fd1, [%rd13]; - add.f64 %fd2, %fd1, %fd1; - ld.f64 %fd3, [%rd50]; - sub.f64 %fd4, %fd2, %fd3; - ld.global.f64 %fd5, [%rd11]; - ld.f64 %fd6, [%rd17]; - ld.f64 %fd7, [%rd15]; - add.f64 %fd8, %fd7, %fd6; - ld.f64 %fd9, [%rd19]; - add.f64 %fd10, %fd8, %fd9; - ld.f64 %fd11, [%rd21]; - add.f64 %fd12, %fd10, %fd11; - ld.f64 %fd13, [%rd23]; - add.f64 %fd14, %fd12, %fd13; - ld.f64 %fd15, [%rd25]; - add.f64 %fd16, %fd14, %fd15; - ld.global.f64 %fd17, [%rd11+8]; - mul.f64 %fd18, %fd17, %fd16; - fma.rn.f64 %fd19, %fd5, %fd1, %fd18; - ld.f64 %fd20, [%rd29]; - ld.f64 %fd21, [%rd27]; - add.f64 %fd22, %fd21, %fd20; - ld.f64 %fd23, [%rd31]; - add.f64 %fd24, %fd22, %fd23; - ld.f64 %fd25, [%rd33]; - add.f64 %fd26, %fd24, %fd25; - ld.f64 %fd27, [%rd35]; - add.f64 %fd28, %fd26, %fd27; - ld.f64 %fd29, [%rd37]; - add.f64 %fd30, %fd28, %fd29; - ld.global.f64 %fd31, [%rd11+16]; - fma.rn.f64 %fd32, %fd31, %fd30, %fd19; - ld.f64 %fd33, [%rd41]; - ld.f64 %fd34, [%rd39]; - add.f64 %fd35, %fd34, %fd33; - ld.f64 %fd36, [%rd43]; - add.f64 %fd37, %fd35, %fd36; - ld.f64 %fd38, [%rd45]; - add.f64 %fd39, %fd37, %fd38; - ld.f64 %fd40, [%rd47]; - add.f64 %fd41, %fd39, %fd40; - ld.f64 %fd42, [%rd49]; - add.f64 %fd43, %fd41, %fd42; - ld.global.f64 %fd44, [%rd11+24]; - fma.rn.f64 %fd45, %fd44, %fd43, %fd32; - ld.f64 %fd46, [%rd51]; - fma.rn.f64 %fd47, %fd46, %fd45, %fd4; - st.f64 [%rd50], %fd47; - add.s32 %r177, %r177, 32; - setp.lt.s32 %p9, %r177, %r11; - mov.u32 %r175, %r177; - mov.u32 %r176, %r175; - @%p9 bra BB12_7; - -BB12_8: - mov.u32 %r36, %r176; - setp.ge.s32 %p10, %r36, %r5; - @%p10 bra BB12_11; - - mov.u64 %rd53, constDeltaForeach4; - add.s64 %rd54, %rd53, %rd6; - ld.global.u8 %r135, [%rd54]; - add.s32 %r37, %r36, %r135; - setp.ge.s32 %p11, %r37, %r5; - @%p11 bra BB12_11; - - cvta.to.global.u64 %rd55, %rd2; - mad.lo.s32 %r136, %r30, %r48, %r24; - add.s32 %r137, %r136, %r37; - shl.b32 %r138, %r137, 3; - cvt.s64.s32 %rd56, %r138; - add.s64 %rd57, %rd56, %rd4; - add.s32 %r139, %r138, 8; - cvt.s64.s32 %rd58, %r139; - add.s64 %rd59, %rd58, %rd4; - add.s32 %r140, %r138, -8; - cvt.s64.s32 %rd60, %r140; - add.s64 %rd61, %rd60, %rd4; - add.s32 %r141, %r137, %r48; - shl.b32 %r142, %r141, 3; - cvt.s64.s32 %rd62, %r142; - add.s64 %rd63, %rd62, %rd4; - sub.s32 %r143, %r137, %r48; - shl.b32 %r144, %r143, 3; - cvt.s64.s32 %rd64, %r144; - add.s64 %rd65, %rd64, %rd4; - add.s32 %r146, %r137, %r10; - shl.b32 %r147, %r146, 3; - cvt.s64.s32 %rd66, %r147; - add.s64 %rd67, %rd66, %rd4; - sub.s32 %r148, %r137, %r10; - shl.b32 %r149, %r148, 3; - cvt.s64.s32 %rd68, %r149; - add.s64 %rd69, %rd68, %rd4; - add.s32 %r150, %r138, 16; - cvt.s64.s32 %rd70, %r150; - add.s64 %rd71, %rd70, %rd4; - add.s32 %r151, %r138, -16; - cvt.s64.s32 %rd72, %r151; - add.s64 %rd73, %rd72, %rd4; - shl.b32 %r152, %r48, 1; - add.s32 %r153, %r137, %r152; - shl.b32 %r154, %r153, 3; - cvt.s64.s32 %rd74, %r154; - add.s64 %rd75, %rd74, %rd4; - mad.lo.s32 %r155, %r48, -2, %r137; - shl.b32 %r156, %r155, 3; - cvt.s64.s32 %rd76, %r156; - add.s64 %rd77, %rd76, %rd4; - add.s32 %r157, %r137, %r15; - shl.b32 %r158, %r157, 3; - cvt.s64.s32 %rd78, %r158; - add.s64 %rd79, %rd78, %rd4; - add.s32 %r159, %r137, %r16; - shl.b32 %r160, %r159, 3; - cvt.s64.s32 %rd80, %r160; - add.s64 %rd81, %rd80, %rd4; - add.s32 %r161, %r138, 24; - cvt.s64.s32 %rd82, %r161; - add.s64 %rd83, %rd82, %rd4; - add.s32 %r162, %r138, -24; - cvt.s64.s32 %rd84, %r162; - add.s64 %rd85, %rd84, %rd4; - mad.lo.s32 %r163, %r48, 3, %r137; - shl.b32 %r164, %r163, 3; - cvt.s64.s32 %rd86, %r164; - add.s64 %rd87, %rd86, %rd4; - mad.lo.s32 %r165, %r48, -3, %r137; - shl.b32 %r166, %r165, 3; - cvt.s64.s32 %rd88, %r166; - add.s64 %rd89, %rd88, %rd4; - add.s32 %r167, %r137, %r17; - shl.b32 %r168, %r167, 3; - cvt.s64.s32 %rd90, %r168; - add.s64 %rd91, %rd90, %rd4; - add.s32 %r169, %r137, %r18; - shl.b32 %r170, %r169, 3; - cvt.s64.s32 %rd92, %r170; - add.s64 %rd93, %rd92, %rd4; - add.s64 %rd94, %rd56, %rd5; - add.s64 %rd95, %rd56, %rd3; - ld.f64 %fd48, [%rd57]; - add.f64 %fd49, %fd48, %fd48; - ld.f64 %fd50, [%rd94]; - sub.f64 %fd51, %fd49, %fd50; - ld.global.f64 %fd52, [%rd55]; - ld.f64 %fd53, [%rd61]; - ld.f64 %fd54, [%rd59]; - add.f64 %fd55, %fd54, %fd53; - ld.f64 %fd56, [%rd63]; - add.f64 %fd57, %fd55, %fd56; - ld.f64 %fd58, [%rd65]; - add.f64 %fd59, %fd57, %fd58; - ld.f64 %fd60, [%rd67]; - add.f64 %fd61, %fd59, %fd60; - ld.f64 %fd62, [%rd69]; - add.f64 %fd63, %fd61, %fd62; - ld.global.f64 %fd64, [%rd55+8]; - mul.f64 %fd65, %fd64, %fd63; - fma.rn.f64 %fd66, %fd52, %fd48, %fd65; - ld.f64 %fd67, [%rd73]; - ld.f64 %fd68, [%rd71]; - add.f64 %fd69, %fd68, %fd67; - ld.f64 %fd70, [%rd75]; - add.f64 %fd71, %fd69, %fd70; - ld.f64 %fd72, [%rd77]; - add.f64 %fd73, %fd71, %fd72; - ld.f64 %fd74, [%rd79]; - add.f64 %fd75, %fd73, %fd74; - ld.f64 %fd76, [%rd81]; - add.f64 %fd77, %fd75, %fd76; - ld.global.f64 %fd78, [%rd55+16]; - fma.rn.f64 %fd79, %fd78, %fd77, %fd66; - ld.f64 %fd80, [%rd85]; - ld.f64 %fd81, [%rd83]; - add.f64 %fd82, %fd81, %fd80; - ld.f64 %fd83, [%rd87]; - add.f64 %fd84, %fd82, %fd83; - ld.f64 %fd85, [%rd89]; - add.f64 %fd86, %fd84, %fd85; - ld.f64 %fd87, [%rd91]; - add.f64 %fd88, %fd86, %fd87; - ld.f64 %fd89, [%rd93]; - add.f64 %fd90, %fd88, %fd89; - ld.global.f64 %fd91, [%rd55+24]; - fma.rn.f64 %fd92, %fd91, %fd90, %fd79; - ld.f64 %fd93, [%rd95]; - fma.rn.f64 %fd94, %fd92, %fd93, %fd51; - st.f64 [%rd94], %fd94; - -BB12_11: - add.s32 %r39, %r174, 1; - setp.ne.s32 %p12, %r39, %r19; - mov.u32 %r174, %r39; - mov.u32 %r173, %r39; - @%p12 bra BB12_5; - -BB12_12: - add.s32 %r171, %r172, 1; - setp.ne.s32 %p13, %r171, %r20; - mov.u32 %r172, %r171; - @%p13 bra BB12_3; - -BB12_13: - ret; -} - -.visible .func loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E_( - .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_0, - .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_1, - .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_2, - .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_3, - .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_4, - .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_5, - .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_6, - .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_7, - .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_8, - .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_9, - .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_10, - .param .b64 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_11, - .param .b64 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_12, - .param .b64 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_13, - .param .b64 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_14, - .param .align 1 .b8 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_15[1] -) -{ - .reg .pred %p<9>; - .reg .s32 %r<63>; - .reg .s64 %rd<18>; - - - ld.param.u32 %r62, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_0]; - ld.param.u32 %r12, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_1]; - ld.param.u32 %r13, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_2]; - ld.param.u32 %r14, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_3]; - ld.param.u32 %r15, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_4]; - ld.param.u32 %r16, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_5]; - ld.param.u32 %r17, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_6]; - ld.param.u32 %r18, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_7]; - ld.param.u32 %r19, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_8]; - ld.param.u32 %r20, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_9]; - ld.param.u32 %r21, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_10]; - ld.param.u64 %rd4, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_11]; - ld.param.u64 %rd5, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_12]; - ld.param.u64 %rd6, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_13]; - ld.param.u64 %rd7, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_14]; - setp.ge.s32 %p1, %r62, %r12; - @%p1 bra BB13_14; - - mov.u32 %r22, 31; - sub.s32 %r23, %r22, %r13; - add.s32 %r24, %r23, %r14; - shr.s32 %r25, %r24, 31; - shr.u32 %r26, %r25, 27; - add.s32 %r27, %r24, %r26; - shr.s32 %r28, %r27, 5; - mov.u32 %r29, 7; - sub.s32 %r30, %r29, %r15; - add.s32 %r31, %r30, %r16; - shr.s32 %r32, %r31, 31; - shr.u32 %r33, %r32, 29; - add.s32 %r34, %r31, %r33; - shr.s32 %r1, %r34, 3; - sub.s32 %r35, %r29, %r17; - add.s32 %r36, %r35, %r18; - shr.s32 %r37, %r36, 31; - shr.u32 %r38, %r37, 29; - add.s32 %r39, %r36, %r38; - shr.s32 %r2, %r39, 3; - add.s32 %r40, %r28, -1; - shr.s32 %r41, %r40, 2; - add.s32 %r3, %r41, 1; - mov.u32 %r42, %tid.x; - and.b32 %r4, %r42, 31; - sub.s32 %r61, %r62, %r12; - -BB13_2: - and.b32 %r8, %r62, 1; - setp.ne.s32 %p2, %r4, 0; - mov.u64 %rd17, 0; - @%p2 bra BB13_4; - - mov.u64 %rd9, 8; - mov.u64 %rd10, 72; - // Callseq Start 2 - { - .reg .b32 temp_param_reg; - .param .b64 param0; - st.param.b64 [param0+0], %rd9; - .param .b64 param1; - st.param.b64 [param1+0], %rd10; - .param .b64 retval0; - call.uni (retval0), - cudaGetParameterBuffer, - ( - param0, - param1 - ); - ld.param.b64 %rd17, [retval0+0]; - } - // Callseq End 2 - -BB13_4: - setp.eq.s32 %p3, %r8, 0; - @%p3 bra BB13_9; - - setp.eq.s64 %p4, %rd17, 0; - @%p4 bra BB13_7; - - st.u32 [%rd17], %r13; - st.u32 [%rd17+4], %r14; - st.u32 [%rd17+8], %r15; - st.u32 [%rd17+12], %r16; - st.u32 [%rd17+16], %r17; - st.u32 [%rd17+20], %r18; - st.u32 [%rd17+24], %r19; - st.u32 [%rd17+28], %r20; - st.u32 [%rd17+32], %r21; - st.u64 [%rd17+40], %rd4; - st.u64 [%rd17+48], %rd5; - st.u64 [%rd17+56], %rd7; - st.u64 [%rd17+64], %rd6; - -BB13_7: - @%p2 bra BB13_13; - - mov.u32 %r47, 128; - mov.u32 %r49, 1; - mov.u32 %r50, 0; - mov.u64 %rd13, 0; - mov.u64 %rd11, stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_; - // inline asm - { - .param .b64 param0; - st.param.b64 [param0+0], %rd11; - .param .b64 param1; - st.param.b64 [param1+0], %rd17; - .param .align 4 .b8 param2[12]; - st.param.b32 [param2+0], %r3; - st.param.b32 [param2+4], %r1; - st.param.b32 [param2+8], %r2; - .param .align 4 .b8 param3[12]; - st.param.b32 [param3+0], %r47; - st.param.b32 [param3+4], %r49; - st.param.b32 [param3+8], %r49; - .param .b32 param4; - st.param.b32 [param4+0], %r50; - .param .b64 param5; - st.param.b64 [param5+0], %rd13; - - .param .b32 retval0; - call.uni (retval0), - cudaLaunchDevice, - ( - param0, - param1, - param2, - param3, - param4, - param5 - ); - ld.param.b32 %r43, [retval0+0]; - } - - // inline asm - bra.uni BB13_13; - -BB13_9: - setp.eq.s64 %p6, %rd17, 0; - @%p6 bra BB13_11; - - st.u32 [%rd17], %r13; - st.u32 [%rd17+4], %r14; - st.u32 [%rd17+8], %r15; - st.u32 [%rd17+12], %r16; - st.u32 [%rd17+16], %r17; - st.u32 [%rd17+20], %r18; - st.u32 [%rd17+24], %r19; - st.u32 [%rd17+28], %r20; - st.u32 [%rd17+32], %r21; - st.u64 [%rd17+40], %rd4; - st.u64 [%rd17+48], %rd5; - st.u64 [%rd17+56], %rd6; - st.u64 [%rd17+64], %rd7; - -BB13_11: - @%p2 bra BB13_13; - - mov.u32 %r55, 128; - mov.u32 %r57, 1; - mov.u32 %r58, 0; - mov.u64 %rd16, 0; - mov.u64 %rd14, stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_; - // inline asm - { - .param .b64 param0; - st.param.b64 [param0+0], %rd14; - .param .b64 param1; - st.param.b64 [param1+0], %rd17; - .param .align 4 .b8 param2[12]; - st.param.b32 [param2+0], %r3; - st.param.b32 [param2+4], %r1; - st.param.b32 [param2+8], %r2; - .param .align 4 .b8 param3[12]; - st.param.b32 [param3+0], %r55; - st.param.b32 [param3+4], %r57; - st.param.b32 [param3+8], %r57; - .param .b32 param4; - st.param.b32 [param4+0], %r58; - .param .b64 param5; - st.param.b64 [param5+0], %rd16; - - .param .b32 retval0; - call.uni (retval0), - cudaLaunchDevice, - ( - param0, - param1, - param2, - param3, - param4, - param5 - ); - ld.param.b32 %r51, [retval0+0]; - } - - // inline asm - -BB13_13: - // Callseq Start 3 - { - .reg .b32 temp_param_reg; - .param .b32 retval0; - call.uni (retval0), - cudaDeviceSynchronize, - ( - ); - ld.param.b32 %r59, [retval0+0]; - } - // Callseq End 3 - add.s32 %r62, %r62, 1; - add.s32 %r61, %r61, 1; - setp.ne.s32 %p8, %r61, 0; - @%p8 bra BB13_2; - -BB13_14: - // Callseq Start 4 - { - .reg .b32 temp_param_reg; - .param .b32 retval0; - call.uni (retval0), - cudaDeviceSynchronize, - ( - ); - ld.param.b32 %r60, [retval0+0]; - } - // Callseq End 4 - ret; -} - -.visible .entry loop_stencil_ispc_tasks( - .param .u32 loop_stencil_ispc_tasks_param_0, - .param .u32 loop_stencil_ispc_tasks_param_1, - .param .u32 loop_stencil_ispc_tasks_param_2, - .param .u32 loop_stencil_ispc_tasks_param_3, - .param .u32 loop_stencil_ispc_tasks_param_4, - .param .u32 loop_stencil_ispc_tasks_param_5, - .param .u32 loop_stencil_ispc_tasks_param_6, - .param .u32 loop_stencil_ispc_tasks_param_7, - .param .u32 loop_stencil_ispc_tasks_param_8, - .param .u32 loop_stencil_ispc_tasks_param_9, - .param .u32 loop_stencil_ispc_tasks_param_10, - .param .u64 loop_stencil_ispc_tasks_param_11, - .param .u64 loop_stencil_ispc_tasks_param_12, - .param .u64 loop_stencil_ispc_tasks_param_13, - .param .u64 loop_stencil_ispc_tasks_param_14 -) -{ - .reg .pred %p<9>; - .reg .s32 %r<63>; - .reg .s64 %rd<18>; - - - ld.param.u32 %r62, [loop_stencil_ispc_tasks_param_0]; - ld.param.u32 %r12, [loop_stencil_ispc_tasks_param_1]; - ld.param.u32 %r13, [loop_stencil_ispc_tasks_param_2]; - ld.param.u32 %r14, [loop_stencil_ispc_tasks_param_3]; - ld.param.u32 %r15, [loop_stencil_ispc_tasks_param_4]; - ld.param.u32 %r16, [loop_stencil_ispc_tasks_param_5]; - ld.param.u32 %r17, [loop_stencil_ispc_tasks_param_6]; - ld.param.u32 %r18, [loop_stencil_ispc_tasks_param_7]; - ld.param.u32 %r19, [loop_stencil_ispc_tasks_param_8]; - ld.param.u32 %r20, [loop_stencil_ispc_tasks_param_9]; - ld.param.u32 %r21, [loop_stencil_ispc_tasks_param_10]; - ld.param.u64 %rd4, [loop_stencil_ispc_tasks_param_11]; - ld.param.u64 %rd5, [loop_stencil_ispc_tasks_param_12]; - ld.param.u64 %rd6, [loop_stencil_ispc_tasks_param_13]; - ld.param.u64 %rd7, [loop_stencil_ispc_tasks_param_14]; - setp.ge.s32 %p1, %r62, %r12; - @%p1 bra BB14_14; - - mov.u32 %r22, 31; - sub.s32 %r23, %r22, %r13; - add.s32 %r24, %r23, %r14; - shr.s32 %r25, %r24, 31; - shr.u32 %r26, %r25, 27; - add.s32 %r27, %r24, %r26; - shr.s32 %r28, %r27, 5; - mov.u32 %r29, 7; - sub.s32 %r30, %r29, %r15; - add.s32 %r31, %r30, %r16; - shr.s32 %r32, %r31, 31; - shr.u32 %r33, %r32, 29; - add.s32 %r34, %r31, %r33; - shr.s32 %r1, %r34, 3; - sub.s32 %r35, %r29, %r17; - add.s32 %r36, %r35, %r18; - shr.s32 %r37, %r36, 31; - shr.u32 %r38, %r37, 29; - add.s32 %r39, %r36, %r38; - shr.s32 %r2, %r39, 3; - add.s32 %r40, %r28, -1; - shr.s32 %r41, %r40, 2; - add.s32 %r3, %r41, 1; - mov.u32 %r42, %tid.x; - and.b32 %r4, %r42, 31; - sub.s32 %r61, %r62, %r12; - -BB14_2: - and.b32 %r8, %r62, 1; - setp.ne.s32 %p2, %r4, 0; - mov.u64 %rd17, 0; - @%p2 bra BB14_4; - - mov.u64 %rd9, 8; - mov.u64 %rd10, 72; - // Callseq Start 5 - { - .reg .b32 temp_param_reg; - .param .b64 param0; - st.param.b64 [param0+0], %rd9; - .param .b64 param1; - st.param.b64 [param1+0], %rd10; - .param .b64 retval0; - call.uni (retval0), - cudaGetParameterBuffer, - ( - param0, - param1 - ); - ld.param.b64 %rd17, [retval0+0]; - } - // Callseq End 5 - -BB14_4: - setp.eq.s32 %p3, %r8, 0; - @%p3 bra BB14_9; - - setp.eq.s64 %p4, %rd17, 0; - @%p4 bra BB14_7; - - st.u32 [%rd17], %r13; - st.u32 [%rd17+4], %r14; - st.u32 [%rd17+8], %r15; - st.u32 [%rd17+12], %r16; - st.u32 [%rd17+16], %r17; - st.u32 [%rd17+20], %r18; - st.u32 [%rd17+24], %r19; - st.u32 [%rd17+28], %r20; - st.u32 [%rd17+32], %r21; - st.u64 [%rd17+40], %rd4; - st.u64 [%rd17+48], %rd5; - st.u64 [%rd17+56], %rd7; - st.u64 [%rd17+64], %rd6; - -BB14_7: - @%p2 bra BB14_13; - - mov.u32 %r47, 128; - mov.u32 %r49, 1; - mov.u32 %r50, 0; - mov.u64 %rd13, 0; - mov.u64 %rd11, stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_; - // inline asm - { - .param .b64 param0; - st.param.b64 [param0+0], %rd11; - .param .b64 param1; - st.param.b64 [param1+0], %rd17; - .param .align 4 .b8 param2[12]; - st.param.b32 [param2+0], %r3; - st.param.b32 [param2+4], %r1; - st.param.b32 [param2+8], %r2; - .param .align 4 .b8 param3[12]; - st.param.b32 [param3+0], %r47; - st.param.b32 [param3+4], %r49; - st.param.b32 [param3+8], %r49; - .param .b32 param4; - st.param.b32 [param4+0], %r50; - .param .b64 param5; - st.param.b64 [param5+0], %rd13; - - .param .b32 retval0; - call.uni (retval0), - cudaLaunchDevice, - ( - param0, - param1, - param2, - param3, - param4, - param5 - ); - ld.param.b32 %r43, [retval0+0]; - } - - // inline asm - bra.uni BB14_13; - -BB14_9: - setp.eq.s64 %p6, %rd17, 0; - @%p6 bra BB14_11; - - st.u32 [%rd17], %r13; - st.u32 [%rd17+4], %r14; - st.u32 [%rd17+8], %r15; - st.u32 [%rd17+12], %r16; - st.u32 [%rd17+16], %r17; - st.u32 [%rd17+20], %r18; - st.u32 [%rd17+24], %r19; - st.u32 [%rd17+28], %r20; - st.u32 [%rd17+32], %r21; - st.u64 [%rd17+40], %rd4; - st.u64 [%rd17+48], %rd5; - st.u64 [%rd17+56], %rd6; - st.u64 [%rd17+64], %rd7; - -BB14_11: - @%p2 bra BB14_13; - - mov.u32 %r55, 128; - mov.u32 %r57, 1; - mov.u32 %r58, 0; - mov.u64 %rd16, 0; - mov.u64 %rd14, stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_; - // inline asm - { - .param .b64 param0; - st.param.b64 [param0+0], %rd14; - .param .b64 param1; - st.param.b64 [param1+0], %rd17; - .param .align 4 .b8 param2[12]; - st.param.b32 [param2+0], %r3; - st.param.b32 [param2+4], %r1; - st.param.b32 [param2+8], %r2; - .param .align 4 .b8 param3[12]; - st.param.b32 [param3+0], %r55; - st.param.b32 [param3+4], %r57; - st.param.b32 [param3+8], %r57; - .param .b32 param4; - st.param.b32 [param4+0], %r58; - .param .b64 param5; - st.param.b64 [param5+0], %rd16; - - .param .b32 retval0; - call.uni (retval0), - cudaLaunchDevice, - ( - param0, - param1, - param2, - param3, - param4, - param5 - ); - ld.param.b32 %r51, [retval0+0]; - } - - // inline asm - -BB14_13: - // Callseq Start 6 - { - .reg .b32 temp_param_reg; - .param .b32 retval0; - call.uni (retval0), - cudaDeviceSynchronize, - ( - ); - ld.param.b32 %r59, [retval0+0]; - } - // Callseq End 6 - add.s32 %r62, %r62, 1; - add.s32 %r61, %r61, 1; - setp.ne.s32 %p8, %r61, 0; - @%p8 bra BB14_2; - -BB14_14: - // Callseq Start 7 - { - .reg .b32 temp_param_reg; - .param .b32 retval0; - call.uni (retval0), - cudaDeviceSynchronize, - ( - ); - ld.param.b32 %r60, [retval0+0]; - } - // Callseq End 7 - ret; -} - - - diff --git a/examples_cuda/stencil/libcudadevrt.a b/examples_cuda/stencil/libcudadevrt.a deleted file mode 100644 index 6cf40658..00000000 Binary files a/examples_cuda/stencil/libcudadevrt.a and /dev/null differ diff --git a/examples_cuda/stencil/stencil.cu b/examples_cuda/stencil/stencil.cu index 7895589f..1533505b 100644 --- a/examples_cuda/stencil/stencil.cu +++ b/examples_cuda/stencil/stencil.cu @@ -1,6 +1,11 @@ #define programCount 32 #define programIndex (threadIdx.x & 31) -#define taskIndex (blockIdx.x*4 + (threadIdx.x >> 5)) +#define taskIndex0 (blockIdx.x*4 + (threadIdx.x >> 5)) +#define taskIndex1 (blockIdx.y) +#define taskIndex2 (blockIdx.z) +#define taskCount0 (gridDim.x*4) +#define taskCount1 (gridDim.y) +#define taskCount2 (gridDim.z) __device__ static void stencil_step( int x0, int x1, @@ -48,15 +53,71 @@ stencil_step( int x0, int x1, } -extern "C" +#define SPANX 32 +#define SPANY 8 +#define SPANZ 8 + __global__ void stencil_step_task( int x0, int x1, int y0, int y1, - int z0, + int z0, int z1, int Nx, int Ny, int Nz, const double coef[4], const double vsq[], const double Ain[], double Aout[]) { - stencil_step(x0, x1, y0, y1, z0+taskIndex, z0+taskIndex+1, - Nx, Ny, Nz, coef, vsq, Ain, Aout); + if (taskIndex0 >= taskCount0 || + taskIndex1 >= taskCount1 || + taskIndex2 >= taskCount2) + return; + + const int xfirst = x0 + taskIndex0 * SPANX; + const int xlast = min(x1, xfirst + SPANX); + + const int yfirst = y0 + taskIndex1 * SPANY; + const int ylast = min(y1, yfirst + SPANY); + + const int zfirst = z0 + taskIndex2 * SPANZ; + const int zlast = min(z1, zfirst + SPANZ); + + stencil_step(xfirst,xlast, yfirst,ylast, zfirst,zlast, + Nx, Ny, Nz, coef, vsq, Ain, Aout); } + + +extern "C" +__global__ void +loop_stencil_ispc_tasks( int t0, int t1, + int x0, int x1, + int y0, int y1, + int z0, int z1, + int Nx, int Ny, int Nz, + const double coef[4], + const double vsq[], + double Aeven[], double Aodd[]) +{ +#define NB(x,n) (((x)+(n)-1)/(n)) + + dim3 grid((NB(x1-x0,SPANX)-1)/4+1, NB(y1-y0,SPANY), NB(z1-z0,SPANZ)); + + for ( int t = t0; t < t1; ++t) + { + // Parallelize across cores as well: each task will work on a slice + // of 1 in the z extent of the volume. + if ((t & 1) == 0) + { + if (programIndex == 0) + stencil_step_task<<>>(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, + coef, vsq, Aeven, Aodd); + } + else + { + if (programIndex == 0) + stencil_step_task<<>>(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, + coef, vsq, Aodd, Aeven); + } + + // We need to wait for all of the launched tasks to finish before + // starting the next iteration + cudaDeviceSynchronize(); + } +} diff --git a/examples_cuda/stencil/stencilX.ispc b/examples_cuda/stencil/stencilX.ispc deleted file mode 100644 index 36d9d521..00000000 --- a/examples_cuda/stencil/stencilX.ispc +++ /dev/null @@ -1,159 +0,0 @@ -/* - Copyright (c) 2010-2011, Intel Corporation - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of Intel Corporation nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS - IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER - OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*/ - -static inline void -stencil_step(uniform int x0, uniform int x1, - uniform int y0, uniform int y1, - uniform int z0, uniform int z1, - uniform int Nx, uniform int Ny, uniform int Nz, - uniform const double coef[4], uniform const double vsq[], - uniform const double Ain[], uniform double Aout[]) { - const uniform int Nxy = Nx * Ny; - -#if 0 -#define VER1 -#endif - -#ifdef VER1 - const uniform int x1o = 1; - const uniform int x2o = 2; - const uniform int x3o = 3; - const uniform int y1o = Nx; - const uniform int y2o = Nx*2; - const uniform int y3o = Nx*3; - const uniform int z1o = Nxy; - const uniform int z2o = Nxy*2; - const uniform int z3o = Nxy*3; -#endif - foreach (z = z0 ... z1, y = y0 ... y1, x = x0 ... x1) - { - const int index= (z * Nxy) + (y * Nx) + x; - -#ifndef VER1 -#define A_cur(x, y, z) Ain[index + (x) + ((y) * Nx) + ((z) * Nxy)] -#define A_next(x, y, z) Aout[index + (x) + ((y) * Nx) + ((z) * Nxy)] - double div = coef[0] * A_cur(0, 0, 0) + - coef[1] * (A_cur(+1, 0, 0) + A_cur(-1, 0, 0) + - A_cur(0, +1, 0) + A_cur(0, -1, 0) + - A_cur(0, 0, +1) + A_cur(0, 0, -1)) + - coef[2] * (A_cur(+2, 0, 0) + A_cur(-2, 0, 0) + - A_cur(0, +2, 0) + A_cur(0, -2, 0) + - A_cur(0, 0, +2) + A_cur(0, 0, -2)) + - coef[3] * (A_cur(+3, 0, 0) + A_cur(-3, 0, 0) + - A_cur(0, +3, 0) + A_cur(0, -3, 0) + - A_cur(0, 0, +3) + A_cur(0, 0, -3)); - -#else - -#define A_cur(x, y, z) Ain [index + (x) + (y) + (z)] -#define A_next(x, y, z) Aout[index + (x) + (y) + (z)] - double div = coef[0] * A_cur(0, 0, 0) + - coef[1] * (A_cur(+x1o, 0, 0) + A_cur(-x1o, 0, 0) + - A_cur(0, +y1o, 0) + A_cur(0, -y1o, 0) + - A_cur(0, 0, +z1o) + A_cur(0, 0, -z1o)) + - coef[2] * (A_cur(+x2o, 0, 0) + A_cur(-x2o, 0, 0) + - A_cur(0, +y2o, 0) + A_cur(0, -y2o, 0) + - A_cur(0, 0, +z2o) + A_cur(0, 0, -z2o)) + - coef[3] * (A_cur(+x3o, 0, 0) + A_cur(-x3o, 0, 0) + - A_cur(0, +y3o, 0) + A_cur(0, -y3o, 0) + - A_cur(0, 0, +z3o) + A_cur(0, 0, -z3o)); - -#endif - - A_next(0, 0, 0) = 2.0d0 * A_cur(0, 0, 0) - A_next(0, 0, 0) + - vsq[index] * div; - } -} - -#define SPANX 32 -#define SPANY 8 -#define SPANZ 8 - -static task void -stencil_step_task(uniform int x0, uniform int x1, - uniform int y0, uniform int y1, - uniform int z0, uniform int z1, - uniform int Nx, uniform int Ny, uniform int Nz, - uniform const double coef[4], uniform const double vsq[], - uniform const double Ain[], uniform double Aout[]) { - if (taskIndex0 >= taskCount0 || - taskIndex1 >= taskCount1 || - taskIndex2 >= taskCount2) - return; - - const uniform int xfirst = x0 + taskIndex0 * SPANX; - const uniform int xlast = min(x1, xfirst + SPANX); - - const uniform int yfirst = y0 + taskIndex1 * SPANY; - const uniform int ylast = min(y1, yfirst + SPANY); - - const uniform int zfirst = z0 + taskIndex2 * SPANZ; - const uniform int zlast = min(z1, zfirst + SPANZ); - - stencil_step(xfirst,xlast, yfirst,ylast, zfirst,zlast, - Nx, Ny, Nz, coef, vsq, Ain, Aout); -} - - - -export void -loop_stencil_ispc_tasks(uniform int t0, uniform int t1, - uniform int x0, uniform int x1, - uniform int y0, uniform int y1, - uniform int z0, uniform int z1, - uniform int Nx, uniform int Ny, uniform int Nz, - uniform const double coef[4], - uniform const double vsq[], - uniform double Aeven[], uniform double Aodd[]) -{ -#define NB(x,n) (((x)+(n)-1)/(n)) - - for (uniform int t = t0; t < t1; ++t) - { - // Parallelize across cores as well: each task will work on a slice - // of 1 in the z extent of the volume. - if ((t & 1) == 0) - launch[NB(z1-z0,SPANZ)][NB(y1-y0,SPANY)][NB(x1-x0,SPANX)] - stencil_step_task(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, - coef, vsq, Aeven, Aodd); - else - launch[NB(z1-z0,SPANZ)][NB(y1-y0,SPANY)][NB(x1-x0,SPANX)] - stencil_step_task(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, - coef, vsq, Aodd, Aeven); - - // We need to wait for all of the launched tasks to finish before - // starting the next iteration. - sync; - } -} - diff --git a/examples_cuda/stencil/stencilY.ispc b/examples_cuda/stencil/stencilY.ispc deleted file mode 100644 index 72c28ef6..00000000 --- a/examples_cuda/stencil/stencilY.ispc +++ /dev/null @@ -1,126 +0,0 @@ -/* - Copyright (c) 2010-2011, Intel Corporation - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of Intel Corporation nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS - IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER - OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*/ - -static inline void -stencil_step(uniform int x0, uniform int x1, - uniform int y0, uniform int y1, - uniform int z0, uniform int z1, - uniform int Nx, uniform int Ny, uniform int Nz, - uniform const double coef[4], uniform const double vsq[], - uniform const double Ain[], uniform double Aout[]) { - const uniform int Nxy = Nx * Ny; - - foreach (z = z0 ... z1, y = y0 ... y1, x = x0 ... x1) - { - int index = (z * Nxy) + (y * Nx) + x; -#define A_cur(x, y, z) Ain[index + (x) + ((y) * Nx) + ((z) * Nxy)] -#define A_next(x, y, z) Aout[index + (x) + ((y) * Nx) + ((z) * Nxy)] - double div = coef[0] * A_cur(0, 0, 0) + - coef[1] * (A_cur(+1, 0, 0) + A_cur(-1, 0, 0) + - A_cur(0, +1, 0) + A_cur(0, -1, 0) + - A_cur(0, 0, +1) + A_cur(0, 0, -1)) + - coef[2] * (A_cur(+2, 0, 0) + A_cur(-2, 0, 0) + - A_cur(0, +2, 0) + A_cur(0, -2, 0) + - A_cur(0, 0, +2) + A_cur(0, 0, -2)) + - coef[3] * (A_cur(+3, 0, 0) + A_cur(-3, 0, 0) + - A_cur(0, +3, 0) + A_cur(0, -3, 0) + - A_cur(0, 0, +3) + A_cur(0, 0, -3)); - - A_next(0, 0, 0) = 2.0 * A_cur(0, 0, 0) - A_next(0, 0, 0) + - vsq[index] * div; - - } -} - -#define SPANX 32 -#define SPANY 8 -#define SPANZ 8 - -static task void -stencil_step_task(uniform int x0, uniform int x1, - uniform int y0, uniform int y1, - uniform int z0, uniform int z1, - uniform int Nx, uniform int Ny, uniform int Nz, - uniform const double coef[4], uniform const double vsq[], - uniform const double Ain[], uniform double Aout[]) { - if (taskIndex0 >= taskCount0 || - taskIndex1 >= taskCount1 || - taskIndex2 >= taskCount2) - return; - - const uniform int xfirst = x0 + taskIndex0 * SPANX; - const uniform int xlast = min(x1, xfirst + SPANX); - - const uniform int yfirst = y0 + taskIndex1 * SPANY; - const uniform int ylast = min(y1, yfirst + SPANY); - - const uniform int zfirst = z0 + taskIndex2 * SPANZ; - const uniform int zlast = min(z1, zfirst + SPANZ); - - stencil_step(xfirst,xlast, yfirst,ylast, zfirst,zlast, - Nx, Ny, Nz, coef, vsq, Ain, Aout); -} - - - -export void -loop_stencil_ispc_tasks(uniform int t0, uniform int t1, - uniform int x0, uniform int x1, - uniform int y0, uniform int y1, - uniform int z0, uniform int z1, - uniform int Nx, uniform int Ny, uniform int Nz, - uniform const double coef[4], - uniform const double vsq[], - uniform double Aeven[], uniform double Aodd[]) -{ -#define NB(x,n) (((x)+(n)-1)/(n)) - - for (uniform int t = t0; t < t1; ++t) - { - // Parallelize across cores as well: each task will work on a slice - // of 1 in the z extent of the volume. - if ((t & 1) == 0) - launch[NB(z1-z0,SPANZ)][NB(y1-y0,SPANY)][NB(x1-x0,SPANX)] - stencil_step_task(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, - coef, vsq, Aeven, Aodd); - else - launch[NB(z1-z0,SPANZ)][NB(y1-y0,SPANY)][NB(x1-x0,SPANX)] - stencil_step_task(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, - coef, vsq, Aodd, Aeven); - - // We need to wait for all of the launched tasks to finish before - // starting the next iteration. - sync; - } -} - diff --git a/examples_cuda/stencil/stencil_cu b/examples_cuda/stencil/stencil_cu deleted file mode 100755 index 28fe453a..00000000 Binary files a/examples_cuda/stencil/stencil_cu and /dev/null differ diff --git a/examples_cuda/stencil/stencil_cu.o b/examples_cuda/stencil/stencil_cu.o deleted file mode 100644 index dcd38c9f..00000000 Binary files a/examples_cuda/stencil/stencil_cu.o and /dev/null differ diff --git a/examples_cuda/stencil/stencil_ispc.h b/examples_cuda/stencil/stencil_ispc.h deleted file mode 100644 index 10b0d713..00000000 --- a/examples_cuda/stencil/stencil_ispc.h +++ /dev/null @@ -1,34 +0,0 @@ -// -// stencil_ispc.h -// (Header automatically generated by the ispc compiler.) -// DO NOT EDIT THIS FILE. -// - -#ifndef ISPC_STENCIL_ISPC_H -#define ISPC_STENCIL_ISPC_H - -#include - - - -#ifdef __cplusplus -namespace ispc { /* namespace */ -#endif // __cplusplus - -/////////////////////////////////////////////////////////////////////////// -// Functions exported from ispc code -/////////////////////////////////////////////////////////////////////////// -#if defined(__cplusplus) && !defined(__ISPC_NO_EXTERN_C) -extern "C" { -#endif // __cplusplus - extern void loop_stencil_ispc_tasks(int32_t t0, int32_t t1, int32_t x0, int32_t x1, int32_t y0, int32_t y1, int32_t z0, int32_t z1, int32_t Nx, int32_t Ny, int32_t Nz, const double * coef, const double * vsq, double * Aeven, double * Aodd); -#if defined(__cplusplus) && !defined(__ISPC_NO_EXTERN_C) -} /* end extern C */ -#endif // __cplusplus - - -#ifdef __cplusplus -} /* namespace */ -#endif // __cplusplus - -#endif // ISPC_STENCIL_ISPC_H diff --git a/examples_cuda/stencil/stencil_ispc_nvptx64.ll b/examples_cuda/stencil/stencil_ispc_nvptx64.ll deleted file mode 100644 index 51c0d95a..00000000 --- a/examples_cuda/stencil/stencil_ispc_nvptx64.ll +++ /dev/null @@ -1,974 +0,0 @@ -; ModuleID = 'stencil_ispc_nvptx64.bc' -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" -target triple = "nvptx64" - -module asm "" -module asm ".extern .func (.param .b32 func_retval0) cudaLaunchDevice" -module asm "(" -module asm " .param .b64 cudaLaunchDevice_param_0," -module asm " .param .b64 cudaLaunchDevice_param_1," -module asm " .param .align 4 .b8 cudaLaunchDevice_param_2[12]," -module asm " .param .align 4 .b8 cudaLaunchDevice_param_3[12]," -module asm " .param .b32 cudaLaunchDevice_param_4," -module asm " .param .b64 cudaLaunchDevice_param_5" -module asm ");" - -@constDeltaForeach1 = private unnamed_addr constant [32 x i8] zeroinitializer -@constDeltaForeach4 = private unnamed_addr constant [32 x i8] c"\00\01\02\03\04\05\06\07\08\09\0A\0B\0C\0D\0E\0F\10\11\12\13\14\15\16\17\18\19\1A\1B\1C\1D\1E\1F" - -declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() nounwind readnone - -declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() nounwind readnone - -declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() nounwind readnone - -declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.z() nounwind readnone - -declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.x() nounwind readnone - -declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.y() nounwind readnone - -declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.z() nounwind readnone - -define i32 @__shfl_i32(i32, i32) { - %shfl = tail call i32 asm sideeffect "shfl.idx.b32 $0, $1, $2, 0x1f;", "=r,r,r"(i32 %0, i32 %1) - ret i32 %shfl -} - -define float @__shfl_xor_float(float, i32) { - %shfl = tail call float asm sideeffect "shfl.bfly.b32 $0, $1, $2, 0x1f;", "=f,f,r"(float %0, i32 %1) - ret float %shfl -} - -define i32 @__shfl_xor_i32(i32, i32) { - %shfl = tail call i32 asm sideeffect "shfl.bfly.b32 $0, $1, $2, 0x1f;", "=r,r,r"(i32 %0, i32 %1) - ret i32 %shfl -} - -define float @__fminf(float, float) { - %min = tail call float asm sideeffect "min.f32 $0, $1, $2;", "=f,f,f"(float %0, float %1) - ret float %min -} - -define float @__fmaxf(float, float) { - %max = tail call float asm sideeffect "max.f32 $0, $1, $2;", "=f,f,f"(float %0, float %1) - ret float %max -} - -define i32 @__ballot(i1) { - %conv = zext i1 %0 to i32 - %res = tail call i32 asm sideeffect "{ .reg .pred %p1; \0A setp.ne.u32 %p1, $1, 0; \0A vote.ballot.b32 $0, %p1; \0A }", "=r,r"(i32 %conv) - ret i32 %res -} - -define i32 @__lanemask_lt() { - %mask = tail call i32 asm sideeffect "mov.u32 $0, %lanemask_lt;", "=r"() - ret i32 %mask -} - -define i8* @ISPCAlloc(i8**, i64, i32) { - ret i8* inttoptr (i64 1 to i8*) -} - -declare i64 @cudaGetParameterBuffer(i64, i64) - -define i8* @ISPCGetParamBuffer(i8**, i64 %align, i64 %size) { -entry: - %tid.i = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() - %and = and i32 %tid.i, 31 - %cmp = icmp eq i32 %and, 0 - br i1 %cmp, label %if.then, label %if.end - -if.then: ; preds = %entry - %ptri64tmp = tail call i64 @cudaGetParameterBuffer(i64 %align, i64 %size) - %phitmp = inttoptr i64 %ptri64tmp to i8* - br label %if.end - -if.end: ; preds = %if.then, %entry - %ptri64 = phi i8* [ %phitmp, %if.then ], [ null, %entry ] - ret i8* %ptri64 -} - -define void @ISPCLaunch(i8**, i8* %func_ptr, i8* %func_args, i32 %ntx, i32 %nty, i32 %ntz) { -entry: - %tid.i = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() - %and = and i32 %tid.i, 31 - %cmp = icmp eq i32 %and, 0 - br i1 %cmp, label %if.then, label %if.end - -if.then: ; preds = %entry - %ntxm1 = add nsw i32 %ntx, -1 - %ntxm1d4 = ashr i32 %ntxm1, 2 - %nbx = add nsw i32 %ntxm1d4, 1 - %args_i64 = ptrtoint i8* %func_args to i64 - %func_i64 = ptrtoint i8* %func_ptr to i64 - %res_tmp = tail call i32 asm sideeffect "{\0A .param .b64 param0;\0A st.param.b64\09[param0+0], $1;\0A .param .b64 param1;\0A st.param.b64\09[param1+0], $2;\0A .param .align 4 .b8 param2[12];\0A st.param.b32\09[param2+0], $3; \0A st.param.b32\09[param2+4], $4; \0A st.param.b32\09[param2+8], $5; \0A .param .align 4 .b8 param3[12];\0A st.param.b32\09[param3+0], $6; \0A st.param.b32\09[param3+4], $7; \0A st.param.b32\09[param3+8], $8; \0A .param .b32 param4;\0A st.param.b32\09[param4+0], $9; \0A .param .b64 param5;\0A st.param.b64\09[param5+0], $10; \0A\0A .param .b32 retval0;\0A call.uni (retval0), \0A cudaLaunchDevice,\0A (\0A param0, \0A param1, \0A param2, \0A param3, \0A param4, \0A param5\0A );\0A ld.param.b32\09$0, [retval0+0];\0A }\0A ", "=r, l,l, r,r,r, r,r,r, r,l"(i64 %func_i64, i64 %args_i64, i32 %nbx, i32 %nty, i32 %ntz, i32 128, i32 1, i32 1, i32 0, i64 0) - br label %if.end - -if.end: ; preds = %if.then, %entry - ret void -} - -declare i32 @cudaDeviceSynchronize() - -define void @ISPCSync(i8*) { - %2 = tail call i32 @cudaDeviceSynchronize() - ret void -} - -define i64 @__warpBinExclusiveScan(i1 %p) { -entry: - %conv.i = zext i1 %p to i32 - %res.i = tail call i32 asm sideeffect "{ .reg .pred %p1; \0A setp.ne.u32 %p1, $1, 0; \0A vote.ballot.b32 $0, %p1; \0A }", "=r,r"(i32 %conv.i) - %res.i1 = tail call i32 asm sideeffect "popc.b32 $0, $1;", "=r,r"(i32 %res.i) - %mask.i = tail call i32 asm sideeffect "mov.u32 $0, %lanemask_lt;", "=r"() - %and = and i32 %mask.i, %res.i - %res.i2 = tail call i32 asm sideeffect "popc.b32 $0, $1;", "=r,r"(i32 %and) - %retval.sroa.1.4.insert.ext.i = zext i32 %res.i2 to i64 - %retval.sroa.1.4.insert.shift.i = shl nuw i64 %retval.sroa.1.4.insert.ext.i, 32 - %retval.sroa.0.0.insert.ext.i = zext i32 %res.i1 to i64 - %retval.sroa.0.0.insert.insert.i = or i64 %retval.sroa.1.4.insert.shift.i, %retval.sroa.0.0.insert.ext.i - ret i64 %retval.sroa.0.0.insert.insert.i -} - -define internal void @stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_(i32 %x0, i32 %x1, i32 %y0, i32 %y1, i32 %z0, i32 %z1, i32 %Nx, i32 %Ny, i32 %Nz, double* %coef, double* %vsq, double* %Ain, double* %Aout) { -allocas: - %bid.i.i = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() - %mul_calltmp_.i = shl i32 %bid.i.i, 2 - %tid.i.i = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() - %bitop.i = ashr i32 %tid.i.i, 5 - %add_mul_calltmp__bitop.i = add i32 %bitop.i, %mul_calltmp_.i - %nb.i.i = tail call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x() - %mul_calltmp_.i57 = shl i32 %nb.i.i, 2 - %greaterequal_calltmp_calltmp18 = icmp sge i32 %add_mul_calltmp__bitop.i, %mul_calltmp_.i57 - %bid.i.i58 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() - %nb.i.i59 = tail call i32 @llvm.nvvm.read.ptx.sreg.nctaid.y() - %greaterequal_calltmp21_calltmp24 = icmp sge i32 %bid.i.i58, %nb.i.i59 - %logical_or = or i1 %greaterequal_calltmp_calltmp18, %greaterequal_calltmp21_calltmp24 - %bid.i.i60 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z() - %nb.i.i61 = tail call i32 @llvm.nvvm.read.ptx.sreg.nctaid.z() - %greaterequal_calltmp27_calltmp30 = icmp sge i32 %bid.i.i60, %nb.i.i61 - %logical_or31 = or i1 %logical_or, %greaterequal_calltmp27_calltmp30 - br i1 %logical_or31, label %if_then, label %if_exit - -if_then: ; preds = %foreach_reset19.i, %if_exit, %allocas - ret void - -if_exit: ; preds = %allocas - %bid.i.i62 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() - %mul_calltmp_.i63 = shl i32 %bid.i.i62, 7 - %tid.i.i64 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() - %bitop.i657375 = add i32 %tid.i.i64, %mul_calltmp_.i63 - %mul_calltmp35_ = and i32 %bitop.i657375, -32 - %add_x0_load_mul_calltmp35_ = add i32 %mul_calltmp35_, %x0 - %add_xfirst_load_ = add i32 %add_x0_load_mul_calltmp35_, 32 - %c.i.i = icmp sgt i32 %add_xfirst_load_, %x1 - %r.i.i = select i1 %c.i.i, i32 %x1, i32 %add_xfirst_load_ - %bid.i.i67 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() - %mul_calltmp41_ = shl i32 %bid.i.i67, 3 - %add_y0_load_mul_calltmp41_ = add i32 %mul_calltmp41_, %y0 - %add_yfirst_load_ = add i32 %add_y0_load_mul_calltmp41_, 8 - %bid.i.i70 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z() - %mul_calltmp47_ = shl i32 %bid.i.i70, 3 - %add_z0_load_mul_calltmp47_ = add i32 %mul_calltmp47_, %z0 - %add_zfirst_load_ = add i32 %add_z0_load_mul_calltmp47_, 8 - %c.i.i71 = icmp sgt i32 %add_zfirst_load_, %z1 - %r.i.i72 = select i1 %c.i.i71, i32 %z1, i32 %add_zfirst_load_ - %mul_Nx_load_Ny_load.i = mul i32 %Ny, %Nx - %nitems29.i = sub i32 %r.i.i, %add_x0_load_mul_calltmp35_ - %nextras30.i = srem i32 %nitems29.i, 32 - %aligned_end31.i = sub i32 %r.i.i, %nextras30.i - %tid.i4.i = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() - %__laneidx.i = and i32 %tid.i4.i, 31 - %0 = zext i32 %__laneidx.i to i64 - %arrayidx.i = getelementptr [32 x i8]* @constDeltaForeach1, i64 0, i64 %0 - %cmp38.i396 = icmp slt i32 %add_z0_load_mul_calltmp47_, %r.i.i72 - br i1 %cmp38.i396, label %foreach_test21.i.preheader.lr.ph, label %if_then - -foreach_test21.i.preheader.lr.ph: ; preds = %if_exit - %c.i.i68 = icmp sgt i32 %add_yfirst_load_, %y1 - %r.i.i69 = select i1 %c.i.i68, i32 %y1, i32 %add_yfirst_load_ - %1 = load i8* %arrayidx.i, align 1 - %_zext.i394 = zext i8 %1 to i32 - %2 = insertelement <1 x i32> undef, i32 %_zext.i394, i32 0 - %smear_counter_init.i393 = insertelement <1 x i32> undef, i32 %add_z0_load_mul_calltmp47_, i32 0 - %iter_val.i395 = add <1 x i32> %smear_counter_init.i393, %2 - %smear_counter_init44.i387 = insertelement <1 x i32> undef, i32 %add_y0_load_mul_calltmp41_, i32 0 - %cmp54.i390 = icmp slt i32 %add_y0_load_mul_calltmp41_, %r.i.i69 - %before_aligned_end73.i385 = icmp slt i32 %add_x0_load_mul_calltmp35_, %aligned_end31.i - %smear_end_init289.i = insertelement <1 x i32> undef, i32 %r.i.i, i32 0 - %Nxy_load298_broadcast_init.i = insertelement <1 x i32> undef, i32 %mul_Nx_load_Ny_load.i, i32 0 - %Nx_load300_broadcast_init.i = insertelement <1 x i32> undef, i32 %Nx, i32 0 - %Ain_load309_ptr2int.i = ptrtoint double* %Ain to i64 - %coef_load314_offset.i = getelementptr double* %coef, i64 1 - %coef_load365_offset.i = getelementptr double* %coef, i64 2 - %mul__Nx_load385.i = shl i32 %Nx, 1 - %mul__Nx_load393.i = mul i32 %Nx, -2 - %mul__Nxy_load402.i = shl i32 %mul_Nx_load_Ny_load.i, 1 - %mul__Nxy_load410.i = mul i32 %mul_Nx_load_Ny_load.i, -2 - %coef_load416_offset.i = getelementptr double* %coef, i64 3 - %mul__Nx_load436.i = mul i32 %Nx, 3 - %mul__Nx_load444.i = mul i32 %Nx, -3 - %mul__Nxy_load453.i = mul i32 %mul_Nx_load_Ny_load.i, 3 - %mul__Nxy_load461.i = mul i32 %mul_Nx_load_Ny_load.i, -3 - %Aout_load470_ptr2int.i = ptrtoint double* %Aout to i64 - %vsq_load488_ptr2int.i = ptrtoint double* %vsq to i64 - %3 = sub i32 -9, %y0 - %4 = shl i32 %bid.i.i67, 3 - %5 = sub i32 %3, %4 - %6 = xor i32 %y1, -1 - %7 = icmp sgt i32 %5, %6 - %smax = select i1 %7, i32 %5, i32 %6 - %8 = xor i32 %smax, -1 - %9 = sub i32 -9, %z0 - %10 = shl i32 %bid.i.i70, 3 - %11 = sub i32 %9, %10 - %12 = xor i32 %z1, -1 - %13 = icmp sgt i32 %11, %12 - %smax399 = select i1 %13, i32 %11, i32 %12 - %14 = xor i32 %smax399, -1 - br label %foreach_test21.i.preheader - -foreach_full_body.i: ; preds = %outer_not_in_extras.i.preheader, %foreach_full_body.i - %counter32.4.i386 = phi i32 [ %new_counter279.i, %foreach_full_body.i ], [ %add_x0_load_mul_calltmp35_, %outer_not_in_extras.i.preheader ] - %tid.i.i56 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() - %__laneidx80.i = and i32 %tid.i.i56, 31 - %15 = zext i32 %__laneidx80.i to i64 - %arrayidx81.i = getelementptr [32 x i8]* @constDeltaForeach4, i64 0, i64 %15 - %16 = load i8* %arrayidx81.i, align 1 - %_zext82.i = zext i8 %16 to i32 - %coef_load_offset_load.i = load double* %coef, align 8 - %.lhs362.lhs.lhs = extractelement <1 x i32> %mul_z_load297_Nxy_load298_broadcast.i, i32 0 - %.lhs362.lhs.rhs.lhs = extractelement <1 x i32> %iter_val50.i392, i32 0 - %.lhs362.lhs.rhs = mul i32 %.lhs362.lhs.rhs.lhs, %Nx - %.lhs362.lhs = add i32 %.lhs362.lhs.lhs, %.lhs362.lhs.rhs - %.lhs362.rhs = add i32 %counter32.4.i386, %_zext82.i - %.lhs362 = add i32 %.lhs362.lhs, %.lhs362.rhs - %17 = shl i32 %.lhs362, 3 - %iptr__id.i.rhs = sext i32 %17 to i64 - %iptr__id.i = add i64 %iptr__id.i.rhs, %Ain_load309_ptr2int.i - %ptr__id.i = inttoptr i64 %iptr__id.i to double* - %val__id.i = load double* %ptr__id.i, align 8 - %coef_load94_offset_load.i = load double* %coef_load314_offset.i, align 8 - %18 = add i32 %17, 8 - %iptr__id.i335.rhs = sext i32 %18 to i64 - %iptr__id.i335 = add i64 %iptr__id.i335.rhs, %Ain_load309_ptr2int.i - %ptr__id.i336 = inttoptr i64 %iptr__id.i335 to double* - %val__id.i337 = load double* %ptr__id.i336, align 8 - %19 = add i32 %17, -8 - %iptr__id.i330.rhs = sext i32 %19 to i64 - %iptr__id.i330 = add i64 %iptr__id.i330.rhs, %Ain_load309_ptr2int.i - %ptr__id.i331 = inttoptr i64 %iptr__id.i330 to double* - %val__id.i332 = load double* %ptr__id.i331, align 8 - %.lhs365 = add i32 %.lhs362, %Nx - %20 = shl i32 %.lhs365, 3 - %iptr__id.i325.rhs = sext i32 %20 to i64 - %iptr__id.i325 = add i64 %iptr__id.i325.rhs, %Ain_load309_ptr2int.i - %ptr__id.i326 = inttoptr i64 %iptr__id.i325 to double* - %val__id.i327 = load double* %ptr__id.i326, align 8 - %.lhs366 = sub i32 %.lhs362, %Nx - %21 = shl i32 %.lhs366, 3 - %iptr__id.i320.rhs = sext i32 %21 to i64 - %iptr__id.i320 = add i64 %iptr__id.i320.rhs, %Ain_load309_ptr2int.i - %ptr__id.i321 = inttoptr i64 %iptr__id.i320 to double* - %val__id.i322 = load double* %ptr__id.i321, align 8 - %.lhs367 = add i32 %.lhs362, %mul_Nx_load_Ny_load.i - %22 = shl i32 %.lhs367, 3 - %iptr__id.i315.rhs = sext i32 %22 to i64 - %iptr__id.i315 = add i64 %iptr__id.i315.rhs, %Ain_load309_ptr2int.i - %ptr__id.i316 = inttoptr i64 %iptr__id.i315 to double* - %val__id.i317 = load double* %ptr__id.i316, align 8 - %.lhs368 = sub i32 %.lhs362, %mul_Nx_load_Ny_load.i - %23 = shl i32 %.lhs368, 3 - %iptr__id.i310.rhs = sext i32 %23 to i64 - %iptr__id.i310 = add i64 %iptr__id.i310.rhs, %Ain_load309_ptr2int.i - %ptr__id.i311 = inttoptr i64 %iptr__id.i310 to double* - %val__id.i312 = load double* %ptr__id.i311, align 8 - %coef_load145_offset_load.i = load double* %coef_load365_offset.i, align 8 - %24 = add i32 %17, 16 - %iptr__id.i305.rhs = sext i32 %24 to i64 - %iptr__id.i305 = add i64 %iptr__id.i305.rhs, %Ain_load309_ptr2int.i - %ptr__id.i306 = inttoptr i64 %iptr__id.i305 to double* - %val__id.i307 = load double* %ptr__id.i306, align 8 - %25 = add i32 %17, -16 - %iptr__id.i300.rhs = sext i32 %25 to i64 - %iptr__id.i300 = add i64 %iptr__id.i300.rhs, %Ain_load309_ptr2int.i - %ptr__id.i301 = inttoptr i64 %iptr__id.i300 to double* - %val__id.i302 = load double* %ptr__id.i301, align 8 - %.lhs371 = add i32 %.lhs362, %mul__Nx_load385.i - %26 = shl i32 %.lhs371, 3 - %iptr__id.i295.rhs = sext i32 %26 to i64 - %iptr__id.i295 = add i64 %iptr__id.i295.rhs, %Ain_load309_ptr2int.i - %ptr__id.i296 = inttoptr i64 %iptr__id.i295 to double* - %val__id.i297 = load double* %ptr__id.i296, align 8 - %.lhs372 = add i32 %.lhs362, %mul__Nx_load393.i - %27 = shl i32 %.lhs372, 3 - %iptr__id.i290.rhs = sext i32 %27 to i64 - %iptr__id.i290 = add i64 %iptr__id.i290.rhs, %Ain_load309_ptr2int.i - %ptr__id.i291 = inttoptr i64 %iptr__id.i290 to double* - %val__id.i292 = load double* %ptr__id.i291, align 8 - %.lhs373 = add i32 %.lhs362, %mul__Nxy_load402.i - %28 = shl i32 %.lhs373, 3 - %iptr__id.i285.rhs = sext i32 %28 to i64 - %iptr__id.i285 = add i64 %iptr__id.i285.rhs, %Ain_load309_ptr2int.i - %ptr__id.i286 = inttoptr i64 %iptr__id.i285 to double* - %val__id.i287 = load double* %ptr__id.i286, align 8 - %.lhs374 = add i32 %.lhs362, %mul__Nxy_load410.i - %29 = shl i32 %.lhs374, 3 - %iptr__id.i280.rhs = sext i32 %29 to i64 - %iptr__id.i280 = add i64 %iptr__id.i280.rhs, %Ain_load309_ptr2int.i - %ptr__id.i281 = inttoptr i64 %iptr__id.i280 to double* - %val__id.i282 = load double* %ptr__id.i281, align 8 - %coef_load196_offset_load.i = load double* %coef_load416_offset.i, align 8 - %30 = add i32 %17, 24 - %iptr__id.i275.rhs = sext i32 %30 to i64 - %iptr__id.i275 = add i64 %iptr__id.i275.rhs, %Ain_load309_ptr2int.i - %ptr__id.i276 = inttoptr i64 %iptr__id.i275 to double* - %val__id.i277 = load double* %ptr__id.i276, align 8 - %31 = add i32 %17, -24 - %iptr__id.i270.rhs = sext i32 %31 to i64 - %iptr__id.i270 = add i64 %iptr__id.i270.rhs, %Ain_load309_ptr2int.i - %ptr__id.i271 = inttoptr i64 %iptr__id.i270 to double* - %val__id.i272 = load double* %ptr__id.i271, align 8 - %.lhs377 = add i32 %.lhs362, %mul__Nx_load436.i - %32 = shl i32 %.lhs377, 3 - %iptr__id.i265.rhs = sext i32 %32 to i64 - %iptr__id.i265 = add i64 %iptr__id.i265.rhs, %Ain_load309_ptr2int.i - %ptr__id.i266 = inttoptr i64 %iptr__id.i265 to double* - %val__id.i267 = load double* %ptr__id.i266, align 8 - %.lhs378 = add i32 %.lhs362, %mul__Nx_load444.i - %33 = shl i32 %.lhs378, 3 - %iptr__id.i260.rhs = sext i32 %33 to i64 - %iptr__id.i260 = add i64 %iptr__id.i260.rhs, %Ain_load309_ptr2int.i - %ptr__id.i261 = inttoptr i64 %iptr__id.i260 to double* - %val__id.i262 = load double* %ptr__id.i261, align 8 - %.lhs379 = add i32 %.lhs362, %mul__Nxy_load453.i - %34 = shl i32 %.lhs379, 3 - %iptr__id.i255.rhs = sext i32 %34 to i64 - %iptr__id.i255 = add i64 %iptr__id.i255.rhs, %Ain_load309_ptr2int.i - %ptr__id.i256 = inttoptr i64 %iptr__id.i255 to double* - %val__id.i257 = load double* %ptr__id.i256, align 8 - %.lhs380 = add i32 %.lhs362, %mul__Nxy_load461.i - %35 = shl i32 %.lhs380, 3 - %iptr__id.i250.rhs = sext i32 %35 to i64 - %iptr__id.i250 = add i64 %iptr__id.i250.rhs, %Ain_load309_ptr2int.i - %ptr__id.i251 = inttoptr i64 %iptr__id.i250 to double* - %val__id.i252 = load double* %ptr__id.i251, align 8 - %val__id.i247 = load double* %ptr__id.i, align 8 - %iptr__id.i240 = add i64 %iptr__id.i.rhs, %Aout_load470_ptr2int.i - %ptr__id.i241 = inttoptr i64 %iptr__id.i240 to double* - %val__id.i242 = load double* %ptr__id.i241, align 8 - %iptr__id.i235 = add i64 %iptr__id.i.rhs, %vsq_load488_ptr2int.i - %ptr__id.i236 = inttoptr i64 %iptr__id.i235 to double* - %val__id.i237 = load double* %ptr__id.i236, align 8 - %val__id.i233.lhs.lhs = fmul double %val__id.i247, 2.000000e+00 - %val__id.i233.lhs = fsub double %val__id.i233.lhs.lhs, %val__id.i242 - %val__id.i233.rhs.rhs.lhs.lhs.lhs = fmul double %coef_load_offset_load.i, %val__id.i - %val__id.i233.rhs.rhs.lhs.lhs.rhs.rhs.lhs.lhs.lhs.lhs = fadd double %val__id.i337, %val__id.i332 - %val__id.i233.rhs.rhs.lhs.lhs.rhs.rhs.lhs.lhs.lhs = fadd double %val__id.i233.rhs.rhs.lhs.lhs.rhs.rhs.lhs.lhs.lhs.lhs, %val__id.i327 - %val__id.i233.rhs.rhs.lhs.lhs.rhs.rhs.lhs.lhs = fadd double %val__id.i233.rhs.rhs.lhs.lhs.rhs.rhs.lhs.lhs.lhs, %val__id.i322 - %val__id.i233.rhs.rhs.lhs.lhs.rhs.rhs.lhs = fadd double %val__id.i233.rhs.rhs.lhs.lhs.rhs.rhs.lhs.lhs, %val__id.i317 - %val__id.i233.rhs.rhs.lhs.lhs.rhs.rhs = fadd double %val__id.i233.rhs.rhs.lhs.lhs.rhs.rhs.lhs, %val__id.i312 - %val__id.i233.rhs.rhs.lhs.lhs.rhs = fmul double %coef_load94_offset_load.i, %val__id.i233.rhs.rhs.lhs.lhs.rhs.rhs - %val__id.i233.rhs.rhs.lhs.lhs = fadd double %val__id.i233.rhs.rhs.lhs.lhs.lhs, %val__id.i233.rhs.rhs.lhs.lhs.rhs - %val__id.i233.rhs.rhs.lhs.rhs.rhs.lhs.lhs.lhs.lhs = fadd double %val__id.i307, %val__id.i302 - %val__id.i233.rhs.rhs.lhs.rhs.rhs.lhs.lhs.lhs = fadd double %val__id.i233.rhs.rhs.lhs.rhs.rhs.lhs.lhs.lhs.lhs, %val__id.i297 - %val__id.i233.rhs.rhs.lhs.rhs.rhs.lhs.lhs = fadd double %val__id.i233.rhs.rhs.lhs.rhs.rhs.lhs.lhs.lhs, %val__id.i292 - %val__id.i233.rhs.rhs.lhs.rhs.rhs.lhs = fadd double %val__id.i233.rhs.rhs.lhs.rhs.rhs.lhs.lhs, %val__id.i287 - %val__id.i233.rhs.rhs.lhs.rhs.rhs = fadd double %val__id.i233.rhs.rhs.lhs.rhs.rhs.lhs, %val__id.i282 - %val__id.i233.rhs.rhs.lhs.rhs = fmul double %coef_load145_offset_load.i, %val__id.i233.rhs.rhs.lhs.rhs.rhs - %val__id.i233.rhs.rhs.lhs = fadd double %val__id.i233.rhs.rhs.lhs.lhs, %val__id.i233.rhs.rhs.lhs.rhs - %val__id.i233.rhs.rhs.rhs.rhs.lhs.lhs.lhs.lhs = fadd double %val__id.i277, %val__id.i272 - %val__id.i233.rhs.rhs.rhs.rhs.lhs.lhs.lhs = fadd double %val__id.i233.rhs.rhs.rhs.rhs.lhs.lhs.lhs.lhs, %val__id.i267 - %val__id.i233.rhs.rhs.rhs.rhs.lhs.lhs = fadd double %val__id.i233.rhs.rhs.rhs.rhs.lhs.lhs.lhs, %val__id.i262 - %val__id.i233.rhs.rhs.rhs.rhs.lhs = fadd double %val__id.i233.rhs.rhs.rhs.rhs.lhs.lhs, %val__id.i257 - %val__id.i233.rhs.rhs.rhs.rhs = fadd double %val__id.i233.rhs.rhs.rhs.rhs.lhs, %val__id.i252 - %val__id.i233.rhs.rhs.rhs = fmul double %coef_load196_offset_load.i, %val__id.i233.rhs.rhs.rhs.rhs - %val__id.i233.rhs.rhs = fadd double %val__id.i233.rhs.rhs.lhs, %val__id.i233.rhs.rhs.rhs - %val__id.i233.rhs = fmul double %val__id.i237, %val__id.i233.rhs.rhs - %val__id.i233 = fadd double %val__id.i233.lhs, %val__id.i233.rhs - store double %val__id.i233, double* %ptr__id.i241, align 8 - %new_counter279.i = add i32 %counter32.4.i386, 32 - %before_aligned_end73.i = icmp slt i32 %new_counter279.i, %aligned_end31.i - br i1 %before_aligned_end73.i, label %foreach_full_body.i, label %partial_inner_all_outer.i - -foreach_test21.i.preheader: ; preds = %foreach_reset19.i, %foreach_test21.i.preheader.lr.ph - %iter_val.i398 = phi <1 x i32> [ %iter_val.i395, %foreach_test21.i.preheader.lr.ph ], [ %iter_val.i, %foreach_reset19.i ] - %counter.0.i397 = phi i32 [ %add_z0_load_mul_calltmp47_, %foreach_test21.i.preheader.lr.ph ], [ %new_counter.i, %foreach_reset19.i ] - %tid.i3.i = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() - %__laneidx47.i = and i32 %tid.i3.i, 31 - %36 = zext i32 %__laneidx47.i to i64 - %arrayidx48.i = getelementptr [32 x i8]* @constDeltaForeach1, i64 0, i64 %36 - br i1 %cmp54.i390, label %outer_not_in_extras.i.preheader.lr.ph, label %foreach_reset19.i - -outer_not_in_extras.i.preheader.lr.ph: ; preds = %foreach_test21.i.preheader - %37 = load i8* %arrayidx48.i, align 1 - %_zext49.i388 = zext i8 %37 to i32 - %38 = insertelement <1 x i32> undef, i32 %_zext49.i388, i32 0 - %iter_val50.i389 = add <1 x i32> %smear_counter_init44.i387, %38 - %mul_z_load297_Nxy_load298_broadcast.i = mul <1 x i32> %iter_val.i398, %Nxy_load298_broadcast_init.i - br label %outer_not_in_extras.i.preheader - -foreach_reset19.i: ; preds = %foreach_reset27.i, %foreach_test21.i.preheader - %new_counter.i = add i32 %counter.0.i397, 1 - %smear_counter_init.i = insertelement <1 x i32> undef, i32 %new_counter.i, i32 0 - %39 = load i8* %arrayidx.i, align 1 - %_zext.i = zext i8 %39 to i32 - %40 = insertelement <1 x i32> undef, i32 %_zext.i, i32 0 - %iter_val.i = add <1 x i32> %smear_counter_init.i, %40 - %exitcond400 = icmp eq i32 %new_counter.i, %14 - br i1 %exitcond400, label %if_then, label %foreach_test21.i.preheader - -outer_not_in_extras.i.preheader: ; preds = %foreach_reset27.i, %outer_not_in_extras.i.preheader.lr.ph - %iter_val50.i392 = phi <1 x i32> [ %iter_val50.i389, %outer_not_in_extras.i.preheader.lr.ph ], [ %iter_val50.i, %foreach_reset27.i ] - %counter25.1.i391 = phi i32 [ %add_y0_load_mul_calltmp41_, %outer_not_in_extras.i.preheader.lr.ph ], [ %new_counter35.i, %foreach_reset27.i ] - br i1 %before_aligned_end73.i385, label %foreach_full_body.i, label %partial_inner_all_outer.i - -foreach_reset27.i: ; preds = %pl_dolane.i, %partial_inner_only.i, %partial_inner_all_outer.i - %new_counter35.i = add i32 %counter25.1.i391, 1 - %smear_counter_init44.i = insertelement <1 x i32> undef, i32 %new_counter35.i, i32 0 - %41 = load i8* %arrayidx48.i, align 1 - %_zext49.i = zext i8 %41 to i32 - %42 = insertelement <1 x i32> undef, i32 %_zext49.i, i32 0 - %iter_val50.i = add <1 x i32> %smear_counter_init44.i, %42 - %exitcond = icmp eq i32 %new_counter35.i, %8 - br i1 %exitcond, label %foreach_reset19.i, label %outer_not_in_extras.i.preheader - -partial_inner_all_outer.i: ; preds = %outer_not_in_extras.i.preheader, %foreach_full_body.i - %counter32.4.i.lcssa = phi i32 [ %add_x0_load_mul_calltmp35_, %outer_not_in_extras.i.preheader ], [ %new_counter279.i, %foreach_full_body.i ] - %before_full_end.i = icmp slt i32 %counter32.4.i.lcssa, %r.i.i - br i1 %before_full_end.i, label %partial_inner_only.i, label %foreach_reset27.i - -partial_inner_only.i: ; preds = %partial_inner_all_outer.i - %smear_counter_init282.i = insertelement <1 x i32> undef, i32 %counter32.4.i.lcssa, i32 0 - %tid.i2.i = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() - %__laneidx285.i = and i32 %tid.i2.i, 31 - %43 = zext i32 %__laneidx285.i to i64 - %arrayidx286.i = getelementptr [32 x i8]* @constDeltaForeach4, i64 0, i64 %43 - %44 = load i8* %arrayidx286.i, align 1 - %_zext287.i = zext i8 %44 to i32 - %45 = insertelement <1 x i32> undef, i32 %_zext287.i, i32 0 - %iter_val288.i = add <1 x i32> %smear_counter_init282.i, %45 - %cmp291.i = icmp slt <1 x i32> %iter_val288.i, %smear_end_init289.i - %mul_y_load299_Nx_load300_broadcast.i = mul <1 x i32> %iter_val50.i392, %Nx_load300_broadcast_init.i - %add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast.i = add <1 x i32> %mul_z_load297_Nxy_load298_broadcast.i, %mul_y_load299_Nx_load300_broadcast.i - %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i = add <1 x i32> %add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast.i, %iter_val288.i - %v.i.i224 = extractelement <1 x i1> %cmp291.i, i32 0 - br i1 %v.i.i224, label %pl_dolane.i, label %foreach_reset27.i - -pl_dolane.i: ; preds = %partial_inner_only.i - %coef_load303_offset_load.i = load double* %coef, align 8 - %.lhs361 = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 - %46 = shl i32 %.lhs361, 3 - %iptr__id.i225.rhs = sext i32 %46 to i64 - %iptr__id.i225 = add i64 %iptr__id.i225.rhs, %Ain_load309_ptr2int.i - %ptr__id.i226 = inttoptr i64 %iptr__id.i225 to double* - %val__id.i227 = load double* %ptr__id.i226, align 8 - %coef_load314_offset_load.i401 = load double* %coef_load314_offset.i, align 8 - %.lhs360.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 - %.lhs360 = shl i32 %.lhs360.lhs, 3 - %47 = add i32 %.lhs360, 8 - %iptr__id.i218.rhs = sext i32 %47 to i64 - %iptr__id.i218 = add i64 %iptr__id.i218.rhs, %Ain_load309_ptr2int.i - %ptr__id.i219 = inttoptr i64 %iptr__id.i218 to double* - %val__id.i220 = load double* %ptr__id.i219, align 8 - %.lhs359.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 - %.lhs359 = shl i32 %.lhs359.lhs, 3 - %48 = add i32 %.lhs359, -8 - %iptr__id.i211.rhs = sext i32 %48 to i64 - %iptr__id.i211 = add i64 %iptr__id.i211.rhs, %Ain_load309_ptr2int.i - %ptr__id.i212 = inttoptr i64 %iptr__id.i211 to double* - %val__id.i213 = load double* %ptr__id.i212, align 8 - %.lhs358.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 - %.lhs358 = add i32 %.lhs358.lhs, %Nx - %49 = shl i32 %.lhs358, 3 - %iptr__id.i204.rhs = sext i32 %49 to i64 - %iptr__id.i204 = add i64 %iptr__id.i204.rhs, %Ain_load309_ptr2int.i - %ptr__id.i205 = inttoptr i64 %iptr__id.i204 to double* - %val__id.i206 = load double* %ptr__id.i205, align 8 - %.lhs357.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 - %.lhs357 = sub i32 %.lhs357.lhs, %Nx - %50 = shl i32 %.lhs357, 3 - %iptr__id.i197.rhs = sext i32 %50 to i64 - %iptr__id.i197 = add i64 %iptr__id.i197.rhs, %Ain_load309_ptr2int.i - %ptr__id.i198 = inttoptr i64 %iptr__id.i197 to double* - %val__id.i199 = load double* %ptr__id.i198, align 8 - %.lhs356.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 - %.lhs356 = add i32 %.lhs356.lhs, %mul_Nx_load_Ny_load.i - %51 = shl i32 %.lhs356, 3 - %iptr__id.i190.rhs = sext i32 %51 to i64 - %iptr__id.i190 = add i64 %iptr__id.i190.rhs, %Ain_load309_ptr2int.i - %ptr__id.i191 = inttoptr i64 %iptr__id.i190 to double* - %val__id.i192 = load double* %ptr__id.i191, align 8 - %.lhs355.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 - %.lhs355 = sub i32 %.lhs355.lhs, %mul_Nx_load_Ny_load.i - %52 = shl i32 %.lhs355, 3 - %iptr__id.i183.rhs = sext i32 %52 to i64 - %iptr__id.i183 = add i64 %iptr__id.i183.rhs, %Ain_load309_ptr2int.i - %ptr__id.i184 = inttoptr i64 %iptr__id.i183 to double* - %val__id.i185 = load double* %ptr__id.i184, align 8 - %coef_load365_offset_load.i457 = load double* %coef_load365_offset.i, align 8 - %.lhs354.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 - %.lhs354 = shl i32 %.lhs354.lhs, 3 - %53 = add i32 %.lhs354, 16 - %iptr__id.i176.rhs = sext i32 %53 to i64 - %iptr__id.i176 = add i64 %iptr__id.i176.rhs, %Ain_load309_ptr2int.i - %ptr__id.i177 = inttoptr i64 %iptr__id.i176 to double* - %val__id.i178 = load double* %ptr__id.i177, align 8 - %.lhs353.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 - %.lhs353 = shl i32 %.lhs353.lhs, 3 - %54 = add i32 %.lhs353, -16 - %iptr__id.i169.rhs = sext i32 %54 to i64 - %iptr__id.i169 = add i64 %iptr__id.i169.rhs, %Ain_load309_ptr2int.i - %ptr__id.i170 = inttoptr i64 %iptr__id.i169 to double* - %val__id.i171 = load double* %ptr__id.i170, align 8 - %.lhs352.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 - %.lhs352 = add i32 %.lhs352.lhs, %mul__Nx_load385.i - %55 = shl i32 %.lhs352, 3 - %iptr__id.i162.rhs = sext i32 %55 to i64 - %iptr__id.i162 = add i64 %iptr__id.i162.rhs, %Ain_load309_ptr2int.i - %ptr__id.i163 = inttoptr i64 %iptr__id.i162 to double* - %val__id.i164 = load double* %ptr__id.i163, align 8 - %.lhs351.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 - %.lhs351 = add i32 %.lhs351.lhs, %mul__Nx_load393.i - %56 = shl i32 %.lhs351, 3 - %iptr__id.i155.rhs = sext i32 %56 to i64 - %iptr__id.i155 = add i64 %iptr__id.i155.rhs, %Ain_load309_ptr2int.i - %ptr__id.i156 = inttoptr i64 %iptr__id.i155 to double* - %val__id.i157 = load double* %ptr__id.i156, align 8 - %.lhs350.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 - %.lhs350 = add i32 %.lhs350.lhs, %mul__Nxy_load402.i - %57 = shl i32 %.lhs350, 3 - %iptr__id.i148.rhs = sext i32 %57 to i64 - %iptr__id.i148 = add i64 %iptr__id.i148.rhs, %Ain_load309_ptr2int.i - %ptr__id.i149 = inttoptr i64 %iptr__id.i148 to double* - %val__id.i150 = load double* %ptr__id.i149, align 8 - %.lhs349.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 - %.lhs349 = add i32 %.lhs349.lhs, %mul__Nxy_load410.i - %58 = shl i32 %.lhs349, 3 - %iptr__id.i141.rhs = sext i32 %58 to i64 - %iptr__id.i141 = add i64 %iptr__id.i141.rhs, %Ain_load309_ptr2int.i - %ptr__id.i142 = inttoptr i64 %iptr__id.i141 to double* - %val__id.i143 = load double* %ptr__id.i142, align 8 - %coef_load416_offset_load.i544 = load double* %coef_load416_offset.i, align 8 - %.lhs348.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 - %.lhs348 = shl i32 %.lhs348.lhs, 3 - %59 = add i32 %.lhs348, 24 - %iptr__id.i134.rhs = sext i32 %59 to i64 - %iptr__id.i134 = add i64 %iptr__id.i134.rhs, %Ain_load309_ptr2int.i - %ptr__id.i135 = inttoptr i64 %iptr__id.i134 to double* - %val__id.i136 = load double* %ptr__id.i135, align 8 - %.lhs347.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 - %.lhs347 = shl i32 %.lhs347.lhs, 3 - %60 = add i32 %.lhs347, -24 - %iptr__id.i127.rhs = sext i32 %60 to i64 - %iptr__id.i127 = add i64 %iptr__id.i127.rhs, %Ain_load309_ptr2int.i - %ptr__id.i128 = inttoptr i64 %iptr__id.i127 to double* - %val__id.i129 = load double* %ptr__id.i128, align 8 - %.lhs346.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 - %.lhs346 = add i32 %.lhs346.lhs, %mul__Nx_load436.i - %61 = shl i32 %.lhs346, 3 - %iptr__id.i120.rhs = sext i32 %61 to i64 - %iptr__id.i120 = add i64 %iptr__id.i120.rhs, %Ain_load309_ptr2int.i - %ptr__id.i121 = inttoptr i64 %iptr__id.i120 to double* - %val__id.i122 = load double* %ptr__id.i121, align 8 - %.lhs345.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 - %.lhs345 = add i32 %.lhs345.lhs, %mul__Nx_load444.i - %62 = shl i32 %.lhs345, 3 - %iptr__id.i113.rhs = sext i32 %62 to i64 - %iptr__id.i113 = add i64 %iptr__id.i113.rhs, %Ain_load309_ptr2int.i - %ptr__id.i114 = inttoptr i64 %iptr__id.i113 to double* - %val__id.i115 = load double* %ptr__id.i114, align 8 - %.lhs344.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 - %.lhs344 = add i32 %.lhs344.lhs, %mul__Nxy_load453.i - %63 = shl i32 %.lhs344, 3 - %iptr__id.i106.rhs = sext i32 %63 to i64 - %iptr__id.i106 = add i64 %iptr__id.i106.rhs, %Ain_load309_ptr2int.i - %ptr__id.i107 = inttoptr i64 %iptr__id.i106 to double* - %val__id.i108 = load double* %ptr__id.i107, align 8 - %.lhs343.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 - %.lhs343 = add i32 %.lhs343.lhs, %mul__Nxy_load461.i - %64 = shl i32 %.lhs343, 3 - %iptr__id.i99.rhs = sext i32 %64 to i64 - %iptr__id.i99 = add i64 %iptr__id.i99.rhs, %Ain_load309_ptr2int.i - %ptr__id.i100 = inttoptr i64 %iptr__id.i99 to double* - %val__id.i101 = load double* %ptr__id.i100, align 8 - %.lhs342 = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 - %65 = shl i32 %.lhs342, 3 - %iptr__id.i92.rhs = sext i32 %65 to i64 - %iptr__id.i92 = add i64 %iptr__id.i92.rhs, %Ain_load309_ptr2int.i - %ptr__id.i93 = inttoptr i64 %iptr__id.i92 to double* - %val__id.i94 = load double* %ptr__id.i93, align 8 - %.lhs341 = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 - %66 = shl i32 %.lhs341, 3 - %iptr__id.i85.rhs = sext i32 %66 to i64 - %iptr__id.i85 = add i64 %iptr__id.i85.rhs, %Aout_load470_ptr2int.i - %ptr__id.i86 = inttoptr i64 %iptr__id.i85 to double* - %val__id.i87 = load double* %ptr__id.i86, align 8 - %.lhs340 = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 - %67 = shl i32 %.lhs340, 3 - %iptr__id.i80.rhs = sext i32 %67 to i64 - %iptr__id.i80 = add i64 %iptr__id.i80.rhs, %vsq_load488_ptr2int.i - %ptr__id.i81 = inttoptr i64 %iptr__id.i80 to double* - %val__id.i82 = load double* %ptr__id.i81, align 8 - %.lhs = extractelement <1 x i32> %add_add_mul_z_load297_Nxy_load298_broadcast_mul_y_load299_Nx_load300_broadcast_x_load301.i, i32 0 - %68 = shl i32 %.lhs, 3 - %iptr__id.i76.rhs = sext i32 %68 to i64 - %iptr__id.i76 = add i64 %iptr__id.i76.rhs, %Aout_load470_ptr2int.i - %ptr__id.i77 = inttoptr i64 %iptr__id.i76 to double* - %val__id.i78.lhs.lhs = fmul double %val__id.i94, 2.000000e+00 - %val__id.i78.lhs = fsub double %val__id.i78.lhs.lhs, %val__id.i87 - %val__id.i78.rhs.rhs.lhs.lhs.lhs = fmul double %coef_load303_offset_load.i, %val__id.i227 - %val__id.i78.rhs.rhs.lhs.lhs.rhs.rhs.lhs.lhs.lhs.lhs = fadd double %val__id.i220, %val__id.i213 - %val__id.i78.rhs.rhs.lhs.lhs.rhs.rhs.lhs.lhs.lhs = fadd double %val__id.i78.rhs.rhs.lhs.lhs.rhs.rhs.lhs.lhs.lhs.lhs, %val__id.i206 - %val__id.i78.rhs.rhs.lhs.lhs.rhs.rhs.lhs.lhs = fadd double %val__id.i78.rhs.rhs.lhs.lhs.rhs.rhs.lhs.lhs.lhs, %val__id.i199 - %val__id.i78.rhs.rhs.lhs.lhs.rhs.rhs.lhs = fadd double %val__id.i78.rhs.rhs.lhs.lhs.rhs.rhs.lhs.lhs, %val__id.i192 - %val__id.i78.rhs.rhs.lhs.lhs.rhs.rhs = fadd double %val__id.i78.rhs.rhs.lhs.lhs.rhs.rhs.lhs, %val__id.i185 - %val__id.i78.rhs.rhs.lhs.lhs.rhs = fmul double %coef_load314_offset_load.i401, %val__id.i78.rhs.rhs.lhs.lhs.rhs.rhs - %val__id.i78.rhs.rhs.lhs.lhs = fadd double %val__id.i78.rhs.rhs.lhs.lhs.lhs, %val__id.i78.rhs.rhs.lhs.lhs.rhs - %val__id.i78.rhs.rhs.lhs.rhs.rhs.lhs.lhs.lhs.lhs = fadd double %val__id.i178, %val__id.i171 - %val__id.i78.rhs.rhs.lhs.rhs.rhs.lhs.lhs.lhs = fadd double %val__id.i78.rhs.rhs.lhs.rhs.rhs.lhs.lhs.lhs.lhs, %val__id.i164 - %val__id.i78.rhs.rhs.lhs.rhs.rhs.lhs.lhs = fadd double %val__id.i78.rhs.rhs.lhs.rhs.rhs.lhs.lhs.lhs, %val__id.i157 - %val__id.i78.rhs.rhs.lhs.rhs.rhs.lhs = fadd double %val__id.i78.rhs.rhs.lhs.rhs.rhs.lhs.lhs, %val__id.i150 - %val__id.i78.rhs.rhs.lhs.rhs.rhs = fadd double %val__id.i78.rhs.rhs.lhs.rhs.rhs.lhs, %val__id.i143 - %val__id.i78.rhs.rhs.lhs.rhs = fmul double %coef_load365_offset_load.i457, %val__id.i78.rhs.rhs.lhs.rhs.rhs - %val__id.i78.rhs.rhs.lhs = fadd double %val__id.i78.rhs.rhs.lhs.lhs, %val__id.i78.rhs.rhs.lhs.rhs - %val__id.i78.rhs.rhs.rhs.rhs.lhs.lhs.lhs.lhs = fadd double %val__id.i136, %val__id.i129 - %val__id.i78.rhs.rhs.rhs.rhs.lhs.lhs.lhs = fadd double %val__id.i78.rhs.rhs.rhs.rhs.lhs.lhs.lhs.lhs, %val__id.i122 - %val__id.i78.rhs.rhs.rhs.rhs.lhs.lhs = fadd double %val__id.i78.rhs.rhs.rhs.rhs.lhs.lhs.lhs, %val__id.i115 - %val__id.i78.rhs.rhs.rhs.rhs.lhs = fadd double %val__id.i78.rhs.rhs.rhs.rhs.lhs.lhs, %val__id.i108 - %val__id.i78.rhs.rhs.rhs.rhs = fadd double %val__id.i78.rhs.rhs.rhs.rhs.lhs, %val__id.i101 - %val__id.i78.rhs.rhs.rhs = fmul double %coef_load416_offset_load.i544, %val__id.i78.rhs.rhs.rhs.rhs - %val__id.i78.rhs.rhs = fadd double %val__id.i78.rhs.rhs.lhs, %val__id.i78.rhs.rhs.rhs - %val__id.i78.rhs = fmul double %val__id.i78.rhs.rhs, %val__id.i82 - %val__id.i78 = fadd double %val__id.i78.lhs, %val__id.i78.rhs - store double %val__id.i78, double* %ptr__id.i77, align 8 - br label %foreach_reset27.i -} - -define void @loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E_(i32 %t0, i32 %t1, i32 %x0, i32 %x1, i32 %y0, i32 %y1, i32 %z0, i32 %z1, i32 %Nx, i32 %Ny, i32 %Nz, double* %coef, double* %vsq, double* %Aeven, double* %Aodd, <1 x i1> %__mask) { -allocas: - %less_t_load_t1_load94 = icmp slt i32 %t0, %t1 - br i1 %less_t_load_t1_load94, label %for_loop.lr.ph, label %for_exit - -for_loop.lr.ph: ; preds = %allocas - %add_sub_x1_load21_x0_load22_ = sub i32 31, %x0 - %sub_add_sub_x1_load21_x0_load22__ = add i32 %add_sub_x1_load21_x0_load22_, %x1 - %div_sub_add_sub_x1_load21_x0_load22___ = sdiv i32 %sub_add_sub_x1_load21_x0_load22__, 32 - %add_sub_y1_load23_y0_load24_ = sub i32 7, %y0 - %sub_add_sub_y1_load23_y0_load24__ = add i32 %add_sub_y1_load23_y0_load24_, %y1 - %div_sub_add_sub_y1_load23_y0_load24___ = sdiv i32 %sub_add_sub_y1_load23_y0_load24__, 8 - %add_sub_z1_load25_z0_load26_ = sub i32 7, %z0 - %sub_add_sub_z1_load25_z0_load26__ = add i32 %add_sub_z1_load25_z0_load26_, %z1 - %div_sub_add_sub_z1_load25_z0_load26___ = sdiv i32 %sub_add_sub_z1_load25_z0_load26__, 8 - %ntxm1.i = add nsw i32 %div_sub_add_sub_x1_load21_x0_load22___, -1 - %ntxm1d4.i = ashr i32 %ntxm1.i, 2 - %nbx.i = add nsw i32 %ntxm1d4.i, 1 - br label %for_loop - -for_loop: ; preds = %if_exit, %for_loop.lr.ph - %t.095 = phi i32 [ %t0, %for_loop.lr.ph ], [ %t_load78_plus1, %if_exit ] - %bitop = and i32 %t.095, 1 - %equal_bitop_ = icmp eq i32 %bitop, 0 - %tid.i.i = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() - %and.i = and i32 %tid.i.i, 31 - %cmp.i = icmp eq i32 %and.i, 0 - br i1 %cmp.i, label %if.then.i, label %ISPCGetParamBuffer.exit - -if.then.i: ; preds = %for_loop - %ptri64tmp.i = tail call i64 @cudaGetParameterBuffer(i64 8, i64 72) - %phitmp.i = inttoptr i64 %ptri64tmp.i to i8* - br label %ISPCGetParamBuffer.exit - -ISPCGetParamBuffer.exit: ; preds = %if.then.i, %for_loop - %ptri64.i = phi i8* [ %phitmp.i, %if.then.i ], [ null, %for_loop ] - %cmp1 = icmp eq i8* %ptri64.i, null - br i1 %equal_bitop_, label %if_then, label %if_else - -for_exit: ; preds = %if_exit, %allocas - %0 = tail call i32 @cudaDeviceSynchronize() - ret void - -if_then: ; preds = %ISPCGetParamBuffer.exit - br i1 %cmp1, label %if_false, label %if_true - -if_else: ; preds = %ISPCGetParamBuffer.exit - br i1 %cmp1, label %if_false62, label %if_true61 - -if_exit: ; preds = %if.then.i92, %if_false62, %if.then.i83, %if_false - %1 = tail call i32 @cudaDeviceSynchronize() - %t_load78_plus1 = add i32 %t.095, 1 - %exitcond = icmp eq i32 %t_load78_plus1, %t1 - br i1 %exitcond, label %for_exit, label %for_loop - -if_true: ; preds = %if_then - %funarg = bitcast i8* %ptri64.i to i32* - store i32 %x0, i32* %funarg, align 4 - %funarg27 = getelementptr i8* %ptri64.i, i64 4 - %2 = bitcast i8* %funarg27 to i32* - store i32 %x1, i32* %2, align 4 - %funarg28 = getelementptr i8* %ptri64.i, i64 8 - %3 = bitcast i8* %funarg28 to i32* - store i32 %y0, i32* %3, align 4 - %funarg29 = getelementptr i8* %ptri64.i, i64 12 - %4 = bitcast i8* %funarg29 to i32* - store i32 %y1, i32* %4, align 4 - %funarg30 = getelementptr i8* %ptri64.i, i64 16 - %5 = bitcast i8* %funarg30 to i32* - store i32 %z0, i32* %5, align 4 - %funarg31 = getelementptr i8* %ptri64.i, i64 20 - %6 = bitcast i8* %funarg31 to i32* - store i32 %z1, i32* %6, align 4 - %funarg32 = getelementptr i8* %ptri64.i, i64 24 - %7 = bitcast i8* %funarg32 to i32* - store i32 %Nx, i32* %7, align 4 - %funarg33 = getelementptr i8* %ptri64.i, i64 28 - %8 = bitcast i8* %funarg33 to i32* - store i32 %Ny, i32* %8, align 4 - %funarg34 = getelementptr i8* %ptri64.i, i64 32 - %9 = bitcast i8* %funarg34 to i32* - store i32 %Nz, i32* %9, align 4 - %funarg35 = getelementptr i8* %ptri64.i, i64 40 - %10 = bitcast i8* %funarg35 to double** - store double* %coef, double** %10, align 8 - %funarg36 = getelementptr i8* %ptri64.i, i64 48 - %11 = bitcast i8* %funarg36 to double** - store double* %vsq, double** %11, align 8 - %funarg37 = getelementptr i8* %ptri64.i, i64 56 - %12 = bitcast i8* %funarg37 to double** - store double* %Aeven, double** %12, align 8 - %funarg38 = getelementptr i8* %ptri64.i, i64 64 - %13 = bitcast i8* %funarg38 to double** - store double* %Aodd, double** %13, align 8 - br label %if_false - -if_false: ; preds = %if_true, %if_then - %tid.i.i80 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() - %and.i81 = and i32 %tid.i.i80, 31 - %cmp.i82 = icmp eq i32 %and.i81, 0 - br i1 %cmp.i82, label %if.then.i83, label %if_exit - -if.then.i83: ; preds = %if_false - %args_i64.i = ptrtoint i8* %ptri64.i to i64 - %res_tmp.i = tail call i32 asm sideeffect "{\0A .param .b64 param0;\0A st.param.b64\09[param0+0], $1;\0A .param .b64 param1;\0A st.param.b64\09[param1+0], $2;\0A .param .align 4 .b8 param2[12];\0A st.param.b32\09[param2+0], $3; \0A st.param.b32\09[param2+4], $4; \0A st.param.b32\09[param2+8], $5; \0A .param .align 4 .b8 param3[12];\0A st.param.b32\09[param3+0], $6; \0A st.param.b32\09[param3+4], $7; \0A st.param.b32\09[param3+8], $8; \0A .param .b32 param4;\0A st.param.b32\09[param4+0], $9; \0A .param .b64 param5;\0A st.param.b64\09[param5+0], $10; \0A\0A .param .b32 retval0;\0A call.uni (retval0), \0A cudaLaunchDevice,\0A (\0A param0, \0A param1, \0A param2, \0A param3, \0A param4, \0A param5\0A );\0A ld.param.b32\09$0, [retval0+0];\0A }\0A ", "=r, l,l, r,r,r, r,r,r, r,l"(i64 ptrtoint (void (i32, i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*)* @stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_ to i64), i64 %args_i64.i, i32 %nbx.i, i32 %div_sub_add_sub_y1_load23_y0_load24___, i32 %div_sub_add_sub_z1_load25_z0_load26___, i32 128, i32 1, i32 1, i32 0, i64 0) - br label %if_exit - -if_true61: ; preds = %if_else - %funarg64 = bitcast i8* %ptri64.i to i32* - store i32 %x0, i32* %funarg64, align 4 - %funarg65 = getelementptr i8* %ptri64.i, i64 4 - %14 = bitcast i8* %funarg65 to i32* - store i32 %x1, i32* %14, align 4 - %funarg66 = getelementptr i8* %ptri64.i, i64 8 - %15 = bitcast i8* %funarg66 to i32* - store i32 %y0, i32* %15, align 4 - %funarg67 = getelementptr i8* %ptri64.i, i64 12 - %16 = bitcast i8* %funarg67 to i32* - store i32 %y1, i32* %16, align 4 - %funarg68 = getelementptr i8* %ptri64.i, i64 16 - %17 = bitcast i8* %funarg68 to i32* - store i32 %z0, i32* %17, align 4 - %funarg69 = getelementptr i8* %ptri64.i, i64 20 - %18 = bitcast i8* %funarg69 to i32* - store i32 %z1, i32* %18, align 4 - %funarg70 = getelementptr i8* %ptri64.i, i64 24 - %19 = bitcast i8* %funarg70 to i32* - store i32 %Nx, i32* %19, align 4 - %funarg71 = getelementptr i8* %ptri64.i, i64 28 - %20 = bitcast i8* %funarg71 to i32* - store i32 %Ny, i32* %20, align 4 - %funarg72 = getelementptr i8* %ptri64.i, i64 32 - %21 = bitcast i8* %funarg72 to i32* - store i32 %Nz, i32* %21, align 4 - %funarg73 = getelementptr i8* %ptri64.i, i64 40 - %22 = bitcast i8* %funarg73 to double** - store double* %coef, double** %22, align 8 - %funarg74 = getelementptr i8* %ptri64.i, i64 48 - %23 = bitcast i8* %funarg74 to double** - store double* %vsq, double** %23, align 8 - %funarg75 = getelementptr i8* %ptri64.i, i64 56 - %24 = bitcast i8* %funarg75 to double** - store double* %Aodd, double** %24, align 8 - %funarg76 = getelementptr i8* %ptri64.i, i64 64 - %25 = bitcast i8* %funarg76 to double** - store double* %Aeven, double** %25, align 8 - br label %if_false62 - -if_false62: ; preds = %if_true61, %if_else - %tid.i.i84 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() - %and.i85 = and i32 %tid.i.i84, 31 - %cmp.i86 = icmp eq i32 %and.i85, 0 - br i1 %cmp.i86, label %if.then.i92, label %if_exit - -if.then.i92: ; preds = %if_false62 - %args_i64.i90 = ptrtoint i8* %ptri64.i to i64 - %res_tmp.i91 = tail call i32 asm sideeffect "{\0A .param .b64 param0;\0A st.param.b64\09[param0+0], $1;\0A .param .b64 param1;\0A st.param.b64\09[param1+0], $2;\0A .param .align 4 .b8 param2[12];\0A st.param.b32\09[param2+0], $3; \0A st.param.b32\09[param2+4], $4; \0A st.param.b32\09[param2+8], $5; \0A .param .align 4 .b8 param3[12];\0A st.param.b32\09[param3+0], $6; \0A st.param.b32\09[param3+4], $7; \0A st.param.b32\09[param3+8], $8; \0A .param .b32 param4;\0A st.param.b32\09[param4+0], $9; \0A .param .b64 param5;\0A st.param.b64\09[param5+0], $10; \0A\0A .param .b32 retval0;\0A call.uni (retval0), \0A cudaLaunchDevice,\0A (\0A param0, \0A param1, \0A param2, \0A param3, \0A param4, \0A param5\0A );\0A ld.param.b32\09$0, [retval0+0];\0A }\0A ", "=r, l,l, r,r,r, r,r,r, r,l"(i64 ptrtoint (void (i32, i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*)* @stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_ to i64), i64 %args_i64.i90, i32 %nbx.i, i32 %div_sub_add_sub_y1_load23_y0_load24___, i32 %div_sub_add_sub_z1_load25_z0_load26___, i32 128, i32 1, i32 1, i32 0, i64 0) - br label %if_exit -} - -define void @loop_stencil_ispc_tasks(i32 %t0, i32 %t1, i32 %x0, i32 %x1, i32 %y0, i32 %y1, i32 %z0, i32 %z1, i32 %Nx, i32 %Ny, i32 %Nz, double* %coef, double* %vsq, double* %Aeven, double* %Aodd) { -allocas: - %less_t_load_t1_load94 = icmp slt i32 %t0, %t1 - br i1 %less_t_load_t1_load94, label %for_loop.lr.ph, label %for_exit - -for_loop.lr.ph: ; preds = %allocas - %add_sub_x1_load21_x0_load22_ = sub i32 31, %x0 - %sub_add_sub_x1_load21_x0_load22__ = add i32 %add_sub_x1_load21_x0_load22_, %x1 - %div_sub_add_sub_x1_load21_x0_load22___ = sdiv i32 %sub_add_sub_x1_load21_x0_load22__, 32 - %add_sub_y1_load23_y0_load24_ = sub i32 7, %y0 - %sub_add_sub_y1_load23_y0_load24__ = add i32 %add_sub_y1_load23_y0_load24_, %y1 - %div_sub_add_sub_y1_load23_y0_load24___ = sdiv i32 %sub_add_sub_y1_load23_y0_load24__, 8 - %add_sub_z1_load25_z0_load26_ = sub i32 7, %z0 - %sub_add_sub_z1_load25_z0_load26__ = add i32 %add_sub_z1_load25_z0_load26_, %z1 - %div_sub_add_sub_z1_load25_z0_load26___ = sdiv i32 %sub_add_sub_z1_load25_z0_load26__, 8 - %ntxm1.i = add nsw i32 %div_sub_add_sub_x1_load21_x0_load22___, -1 - %ntxm1d4.i = ashr i32 %ntxm1.i, 2 - %nbx.i = add nsw i32 %ntxm1d4.i, 1 - br label %for_loop - -for_loop: ; preds = %if_exit, %for_loop.lr.ph - %t.095 = phi i32 [ %t0, %for_loop.lr.ph ], [ %t_load78_plus1, %if_exit ] - %bitop = and i32 %t.095, 1 - %equal_bitop_ = icmp eq i32 %bitop, 0 - %tid.i.i = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() - %and.i = and i32 %tid.i.i, 31 - %cmp.i = icmp eq i32 %and.i, 0 - br i1 %cmp.i, label %if.then.i, label %ISPCGetParamBuffer.exit - -if.then.i: ; preds = %for_loop - %ptri64tmp.i = tail call i64 @cudaGetParameterBuffer(i64 8, i64 72) - %phitmp.i = inttoptr i64 %ptri64tmp.i to i8* - br label %ISPCGetParamBuffer.exit - -ISPCGetParamBuffer.exit: ; preds = %if.then.i, %for_loop - %ptri64.i = phi i8* [ %phitmp.i, %if.then.i ], [ null, %for_loop ] - %cmp1 = icmp eq i8* %ptri64.i, null - br i1 %equal_bitop_, label %if_then, label %if_else - -for_exit: ; preds = %if_exit, %allocas - %0 = tail call i32 @cudaDeviceSynchronize() - ret void - -if_then: ; preds = %ISPCGetParamBuffer.exit - br i1 %cmp1, label %if_false, label %if_true - -if_else: ; preds = %ISPCGetParamBuffer.exit - br i1 %cmp1, label %if_false62, label %if_true61 - -if_exit: ; preds = %if.then.i92, %if_false62, %if.then.i83, %if_false - %1 = tail call i32 @cudaDeviceSynchronize() - %t_load78_plus1 = add i32 %t.095, 1 - %exitcond = icmp eq i32 %t_load78_plus1, %t1 - br i1 %exitcond, label %for_exit, label %for_loop - -if_true: ; preds = %if_then - %funarg = bitcast i8* %ptri64.i to i32* - store i32 %x0, i32* %funarg, align 4 - %funarg27 = getelementptr i8* %ptri64.i, i64 4 - %2 = bitcast i8* %funarg27 to i32* - store i32 %x1, i32* %2, align 4 - %funarg28 = getelementptr i8* %ptri64.i, i64 8 - %3 = bitcast i8* %funarg28 to i32* - store i32 %y0, i32* %3, align 4 - %funarg29 = getelementptr i8* %ptri64.i, i64 12 - %4 = bitcast i8* %funarg29 to i32* - store i32 %y1, i32* %4, align 4 - %funarg30 = getelementptr i8* %ptri64.i, i64 16 - %5 = bitcast i8* %funarg30 to i32* - store i32 %z0, i32* %5, align 4 - %funarg31 = getelementptr i8* %ptri64.i, i64 20 - %6 = bitcast i8* %funarg31 to i32* - store i32 %z1, i32* %6, align 4 - %funarg32 = getelementptr i8* %ptri64.i, i64 24 - %7 = bitcast i8* %funarg32 to i32* - store i32 %Nx, i32* %7, align 4 - %funarg33 = getelementptr i8* %ptri64.i, i64 28 - %8 = bitcast i8* %funarg33 to i32* - store i32 %Ny, i32* %8, align 4 - %funarg34 = getelementptr i8* %ptri64.i, i64 32 - %9 = bitcast i8* %funarg34 to i32* - store i32 %Nz, i32* %9, align 4 - %funarg35 = getelementptr i8* %ptri64.i, i64 40 - %10 = bitcast i8* %funarg35 to double** - store double* %coef, double** %10, align 8 - %funarg36 = getelementptr i8* %ptri64.i, i64 48 - %11 = bitcast i8* %funarg36 to double** - store double* %vsq, double** %11, align 8 - %funarg37 = getelementptr i8* %ptri64.i, i64 56 - %12 = bitcast i8* %funarg37 to double** - store double* %Aeven, double** %12, align 8 - %funarg38 = getelementptr i8* %ptri64.i, i64 64 - %13 = bitcast i8* %funarg38 to double** - store double* %Aodd, double** %13, align 8 - br label %if_false - -if_false: ; preds = %if_true, %if_then - %tid.i.i80 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() - %and.i81 = and i32 %tid.i.i80, 31 - %cmp.i82 = icmp eq i32 %and.i81, 0 - br i1 %cmp.i82, label %if.then.i83, label %if_exit - -if.then.i83: ; preds = %if_false - %args_i64.i = ptrtoint i8* %ptri64.i to i64 - %res_tmp.i = tail call i32 asm sideeffect "{\0A .param .b64 param0;\0A st.param.b64\09[param0+0], $1;\0A .param .b64 param1;\0A st.param.b64\09[param1+0], $2;\0A .param .align 4 .b8 param2[12];\0A st.param.b32\09[param2+0], $3; \0A st.param.b32\09[param2+4], $4; \0A st.param.b32\09[param2+8], $5; \0A .param .align 4 .b8 param3[12];\0A st.param.b32\09[param3+0], $6; \0A st.param.b32\09[param3+4], $7; \0A st.param.b32\09[param3+8], $8; \0A .param .b32 param4;\0A st.param.b32\09[param4+0], $9; \0A .param .b64 param5;\0A st.param.b64\09[param5+0], $10; \0A\0A .param .b32 retval0;\0A call.uni (retval0), \0A cudaLaunchDevice,\0A (\0A param0, \0A param1, \0A param2, \0A param3, \0A param4, \0A param5\0A );\0A ld.param.b32\09$0, [retval0+0];\0A }\0A ", "=r, l,l, r,r,r, r,r,r, r,l"(i64 ptrtoint (void (i32, i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*)* @stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_ to i64), i64 %args_i64.i, i32 %nbx.i, i32 %div_sub_add_sub_y1_load23_y0_load24___, i32 %div_sub_add_sub_z1_load25_z0_load26___, i32 128, i32 1, i32 1, i32 0, i64 0) - br label %if_exit - -if_true61: ; preds = %if_else - %funarg64 = bitcast i8* %ptri64.i to i32* - store i32 %x0, i32* %funarg64, align 4 - %funarg65 = getelementptr i8* %ptri64.i, i64 4 - %14 = bitcast i8* %funarg65 to i32* - store i32 %x1, i32* %14, align 4 - %funarg66 = getelementptr i8* %ptri64.i, i64 8 - %15 = bitcast i8* %funarg66 to i32* - store i32 %y0, i32* %15, align 4 - %funarg67 = getelementptr i8* %ptri64.i, i64 12 - %16 = bitcast i8* %funarg67 to i32* - store i32 %y1, i32* %16, align 4 - %funarg68 = getelementptr i8* %ptri64.i, i64 16 - %17 = bitcast i8* %funarg68 to i32* - store i32 %z0, i32* %17, align 4 - %funarg69 = getelementptr i8* %ptri64.i, i64 20 - %18 = bitcast i8* %funarg69 to i32* - store i32 %z1, i32* %18, align 4 - %funarg70 = getelementptr i8* %ptri64.i, i64 24 - %19 = bitcast i8* %funarg70 to i32* - store i32 %Nx, i32* %19, align 4 - %funarg71 = getelementptr i8* %ptri64.i, i64 28 - %20 = bitcast i8* %funarg71 to i32* - store i32 %Ny, i32* %20, align 4 - %funarg72 = getelementptr i8* %ptri64.i, i64 32 - %21 = bitcast i8* %funarg72 to i32* - store i32 %Nz, i32* %21, align 4 - %funarg73 = getelementptr i8* %ptri64.i, i64 40 - %22 = bitcast i8* %funarg73 to double** - store double* %coef, double** %22, align 8 - %funarg74 = getelementptr i8* %ptri64.i, i64 48 - %23 = bitcast i8* %funarg74 to double** - store double* %vsq, double** %23, align 8 - %funarg75 = getelementptr i8* %ptri64.i, i64 56 - %24 = bitcast i8* %funarg75 to double** - store double* %Aodd, double** %24, align 8 - %funarg76 = getelementptr i8* %ptri64.i, i64 64 - %25 = bitcast i8* %funarg76 to double** - store double* %Aeven, double** %25, align 8 - br label %if_false62 - -if_false62: ; preds = %if_true61, %if_else - %tid.i.i84 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() - %and.i85 = and i32 %tid.i.i84, 31 - %cmp.i86 = icmp eq i32 %and.i85, 0 - br i1 %cmp.i86, label %if.then.i92, label %if_exit - -if.then.i92: ; preds = %if_false62 - %args_i64.i90 = ptrtoint i8* %ptri64.i to i64 - %res_tmp.i91 = tail call i32 asm sideeffect "{\0A .param .b64 param0;\0A st.param.b64\09[param0+0], $1;\0A .param .b64 param1;\0A st.param.b64\09[param1+0], $2;\0A .param .align 4 .b8 param2[12];\0A st.param.b32\09[param2+0], $3; \0A st.param.b32\09[param2+4], $4; \0A st.param.b32\09[param2+8], $5; \0A .param .align 4 .b8 param3[12];\0A st.param.b32\09[param3+0], $6; \0A st.param.b32\09[param3+4], $7; \0A st.param.b32\09[param3+8], $8; \0A .param .b32 param4;\0A st.param.b32\09[param4+0], $9; \0A .param .b64 param5;\0A st.param.b64\09[param5+0], $10; \0A\0A .param .b32 retval0;\0A call.uni (retval0), \0A cudaLaunchDevice,\0A (\0A param0, \0A param1, \0A param2, \0A param3, \0A param4, \0A param5\0A );\0A ld.param.b32\09$0, [retval0+0];\0A }\0A ", "=r, l,l, r,r,r, r,r,r, r,l"(i64 ptrtoint (void (i32, i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*)* @stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_ to i64), i64 %args_i64.i90, i32 %nbx.i, i32 %div_sub_add_sub_y1_load23_y0_load24___, i32 %div_sub_add_sub_z1_load25_z0_load26___, i32 128, i32 1, i32 1, i32 0, i64 0) - br label %if_exit -} - -!llvm.ident = !{!0} -!nvvm.annotations = !{!1, !2} - -!0 = metadata !{metadata !"clang version 3.4 (trunk 194723)"} -!1 = metadata !{void (i32, i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*)* @stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_, metadata !"kernel", i32 1} -!2 = metadata !{void (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*)* @loop_stencil_ispc_tasks, metadata !"kernel", i32 1} diff --git a/examples_cuda/stencil/stencil_ispc_nvptx64.ptx b/examples_cuda/stencil/stencil_ispc_nvptx64.ptx deleted file mode 100644 index b0339cbf..00000000 --- a/examples_cuda/stencil/stencil_ispc_nvptx64.ptx +++ /dev/null @@ -1,1246 +0,0 @@ -// -// Generated by NVIDIA NVVM Compiler -// Compiler built on Thu Jul 18 02:37:37 2013 (1374107857) -// Cuda compilation tools, release 5.5, V5.5.0 -// - -.version 3.2 -.target sm_35 -.address_size 64 - - -.extern .func (.param .b32 func_retval0) cudaLaunchDevice -( - .param .b64 cudaLaunchDevice_param_0, - .param .b64 cudaLaunchDevice_param_1, - .param .align 4 .b8 cudaLaunchDevice_param_2[12], - .param .align 4 .b8 cudaLaunchDevice_param_3[12], - .param .b32 cudaLaunchDevice_param_4, - .param .b64 cudaLaunchDevice_param_5 -); - - -.extern .func (.param .b64 func_retval0) cudaGetParameterBuffer -( - .param .b64 cudaGetParameterBuffer_param_0, - .param .b64 cudaGetParameterBuffer_param_1 -) -; -.extern .func (.param .b32 func_retval0) cudaDeviceSynchronize -( - -) -; -.global .align 1 .b8 constDeltaForeach1[32]; -.global .align 1 .b8 constDeltaForeach4[32] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}; - -.visible .func (.param .b32 func_retval0) __shfl_i32( - .param .b32 __shfl_i32_param_0, - .param .b32 __shfl_i32_param_1 -) -{ - .reg .s32 %r<4>; - - - ld.param.u32 %r2, [__shfl_i32_param_0]; - ld.param.u32 %r3, [__shfl_i32_param_1]; - // inline asm - shfl.idx.b32 %r1, %r2, %r3, 0x1f; - // inline asm - st.param.b32 [func_retval0+0], %r1; - ret; -} - -.visible .func (.param .b32 func_retval0) __shfl_xor_float( - .param .b32 __shfl_xor_float_param_0, - .param .b32 __shfl_xor_float_param_1 -) -{ - .reg .s32 %r<2>; - .reg .f32 %f<3>; - - - ld.param.f32 %f2, [__shfl_xor_float_param_0]; - ld.param.u32 %r1, [__shfl_xor_float_param_1]; - // inline asm - shfl.bfly.b32 %f1, %f2, %r1, 0x1f; - // inline asm - st.param.f32 [func_retval0+0], %f1; - ret; -} - -.visible .func (.param .b32 func_retval0) __shfl_xor_i32( - .param .b32 __shfl_xor_i32_param_0, - .param .b32 __shfl_xor_i32_param_1 -) -{ - .reg .s32 %r<4>; - - - ld.param.u32 %r2, [__shfl_xor_i32_param_0]; - ld.param.u32 %r3, [__shfl_xor_i32_param_1]; - // inline asm - shfl.bfly.b32 %r1, %r2, %r3, 0x1f; - // inline asm - st.param.b32 [func_retval0+0], %r1; - ret; -} - -.visible .func (.param .b32 func_retval0) __fminf( - .param .b32 __fminf_param_0, - .param .b32 __fminf_param_1 -) -{ - .reg .f32 %f<4>; - - - ld.param.f32 %f2, [__fminf_param_0]; - ld.param.f32 %f3, [__fminf_param_1]; - // inline asm - min.f32 %f1, %f2, %f3; - // inline asm - st.param.f32 [func_retval0+0], %f1; - ret; -} - -.visible .func (.param .b32 func_retval0) __fmaxf( - .param .b32 __fmaxf_param_0, - .param .b32 __fmaxf_param_1 -) -{ - .reg .f32 %f<4>; - - - ld.param.f32 %f2, [__fmaxf_param_0]; - ld.param.f32 %f3, [__fmaxf_param_1]; - // inline asm - max.f32 %f1, %f2, %f3; - // inline asm - st.param.f32 [func_retval0+0], %f1; - ret; -} - -.visible .func (.param .b32 func_retval0) __ballot( - .param .b32 __ballot_param_0 -) -{ - .reg .s32 %r<3>; - - - ld.param.u8 %r2, [__ballot_param_0]; - // inline asm - { .reg .pred %p1; - setp.ne.u32 %p1, %r2, 0; - vote.ballot.b32 %r1, %p1; - } - // inline asm - st.param.b32 [func_retval0+0], %r1; - ret; -} - -.visible .func (.param .b32 func_retval0) __lanemask_lt( - -) -{ - .reg .s32 %r<2>; - - - // inline asm - mov.u32 %r1, %lanemask_lt; - // inline asm - st.param.b32 [func_retval0+0], %r1; - ret; -} - -.visible .func (.param .b64 func_retval0) ISPCAlloc( - .param .b64 ISPCAlloc_param_0, - .param .b64 ISPCAlloc_param_1, - .param .b32 ISPCAlloc_param_2 -) -{ - .reg .s64 %rd<2>; - - - mov.u64 %rd1, 1; - st.param.b64 [func_retval0+0], %rd1; - ret; -} - -.visible .func (.param .b64 func_retval0) ISPCGetParamBuffer( - .param .b64 ISPCGetParamBuffer_param_0, - .param .b64 ISPCGetParamBuffer_param_1, - .param .b64 ISPCGetParamBuffer_param_2 -) -{ - .reg .pred %p<2>; - .reg .s32 %r<3>; - .reg .s64 %rd<7>; - - - ld.param.u64 %rd3, [ISPCGetParamBuffer_param_1]; - ld.param.u64 %rd4, [ISPCGetParamBuffer_param_2]; - mov.u32 %r1, %tid.x; - and.b32 %r2, %r1, 31; - setp.ne.s32 %p1, %r2, 0; - mov.u64 %rd6, 0; - @%p1 bra BB8_2; - - // Callseq Start 0 - { - .reg .b32 temp_param_reg; - .param .b64 param0; - st.param.b64 [param0+0], %rd3; - .param .b64 param1; - st.param.b64 [param1+0], %rd4; - .param .b64 retval0; - call.uni (retval0), - cudaGetParameterBuffer, - ( - param0, - param1 - ); - ld.param.b64 %rd6, [retval0+0]; - } - // Callseq End 0 - -BB8_2: - st.param.b64 [func_retval0+0], %rd6; - ret; -} - -.visible .func ISPCLaunch( - .param .b64 ISPCLaunch_param_0, - .param .b64 ISPCLaunch_param_1, - .param .b64 ISPCLaunch_param_2, - .param .b32 ISPCLaunch_param_3, - .param .b32 ISPCLaunch_param_4, - .param .b32 ISPCLaunch_param_5 -) -{ - .reg .pred %p<2>; - .reg .s32 %r<16>; - .reg .s64 %rd<6>; - - - ld.param.u64 %rd1, [ISPCLaunch_param_1]; - ld.param.u64 %rd2, [ISPCLaunch_param_2]; - ld.param.u32 %r1, [ISPCLaunch_param_3]; - ld.param.u32 %r2, [ISPCLaunch_param_4]; - ld.param.u32 %r3, [ISPCLaunch_param_5]; - mov.u32 %r4, %tid.x; - and.b32 %r5, %r4, 31; - setp.ne.s32 %p1, %r5, 0; - @%p1 bra BB9_2; - - add.s32 %r14, %r1, -1; - shr.s32 %r15, %r14, 2; - add.s32 %r7, %r15, 1; - mov.u32 %r12, 1; - mov.u32 %r10, 128; - mov.u32 %r13, 0; - mov.u64 %rd5, 0; - // inline asm - { - .param .b64 param0; - st.param.b64 [param0+0], %rd1; - .param .b64 param1; - st.param.b64 [param1+0], %rd2; - .param .align 4 .b8 param2[12]; - st.param.b32 [param2+0], %r7; - st.param.b32 [param2+4], %r2; - st.param.b32 [param2+8], %r3; - .param .align 4 .b8 param3[12]; - st.param.b32 [param3+0], %r10; - st.param.b32 [param3+4], %r12; - st.param.b32 [param3+8], %r12; - .param .b32 param4; - st.param.b32 [param4+0], %r13; - .param .b64 param5; - st.param.b64 [param5+0], %rd5; - - .param .b32 retval0; - call.uni (retval0), - cudaLaunchDevice, - ( - param0, - param1, - param2, - param3, - param4, - param5 - ); - ld.param.b32 %r6, [retval0+0]; - } - - // inline asm - -BB9_2: - ret; -} - -.visible .func ISPCSync( - .param .b64 ISPCSync_param_0 -) -{ - .reg .s32 %r<2>; - - - // Callseq Start 1 - { - .reg .b32 temp_param_reg; - .param .b32 retval0; - call.uni (retval0), - cudaDeviceSynchronize, - ( - ); - ld.param.b32 %r1, [retval0+0]; - } - // Callseq End 1 - ret; -} - -.visible .func (.param .b64 func_retval0) __warpBinExclusiveScan( - .param .b32 __warpBinExclusiveScan_param_0 -) -{ - .reg .s32 %r<8>; - .reg .s64 %rd<5>; - - - ld.param.u8 %r2, [__warpBinExclusiveScan_param_0]; - // inline asm - { .reg .pred %p1; - setp.ne.u32 %p1, %r2, 0; - vote.ballot.b32 %r1, %p1; - } - // inline asm - // inline asm - popc.b32 %r3, %r1; - // inline asm - // inline asm - mov.u32 %r5, %lanemask_lt; - // inline asm - and.b32 %r7, %r5, %r1; - // inline asm - popc.b32 %r6, %r7; - // inline asm - cvt.u64.u32 %rd1, %r6; - shl.b64 %rd2, %rd1, 32; - cvt.u64.u32 %rd3, %r3; - or.b64 %rd4, %rd2, %rd3; - st.param.b64 [func_retval0+0], %rd4; - ret; -} - -.entry stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_( - .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_0, - .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_1, - .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_2, - .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_3, - .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_4, - .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_5, - .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_6, - .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_7, - .param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_8, - .param .u64 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_9, - .param .u64 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_10, - .param .u64 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_11, - .param .u64 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_12 -) -{ - .reg .pred %p<14>; - .reg .s32 %r<178>; - .reg .s64 %rd<96>; - .reg .f64 %fd<95>; - - - ld.param.u32 %r42, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_0]; - ld.param.u32 %r43, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_1]; - ld.param.u32 %r44, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_2]; - ld.param.u32 %r45, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_3]; - ld.param.u32 %r46, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_4]; - ld.param.u32 %r47, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_5]; - ld.param.u32 %r48, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_6]; - ld.param.u32 %r49, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_7]; - ld.param.u64 %rd2, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_9]; - ld.param.u64 %rd3, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_10]; - ld.param.u64 %rd4, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_11]; - ld.param.u64 %rd5, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_12]; - mov.u32 %r1, %ctaid.x; - shl.b32 %r50, %r1, 2; - mov.u32 %r2, %tid.x; - shr.s32 %r51, %r2, 5; - add.s32 %r52, %r51, %r50; - mov.u32 %r53, %nctaid.x; - shl.b32 %r54, %r53, 2; - setp.ge.s32 %p1, %r52, %r54; - mov.u32 %r55, %nctaid.y; - mov.u32 %r3, %ctaid.y; - setp.ge.s32 %p2, %r3, %r55; - or.pred %p3, %p1, %p2; - mov.u32 %r56, %nctaid.z; - mov.u32 %r4, %ctaid.z; - setp.ge.s32 %p4, %r4, %r56; - or.pred %p5, %p3, %p4; - @%p5 bra BB12_13; - - shl.b32 %r57, %r1, 7; - add.s32 %r58, %r2, %r57; - and.b32 %r59, %r58, -32; - add.s32 %r60, %r59, %r42; - add.s32 %r61, %r60, 32; - min.s32 %r5, %r43, %r61; - shl.b32 %r6, %r3, 3; - add.s32 %r62, %r6, %r44; - add.s32 %r7, %r62, 8; - shl.b32 %r8, %r4, 3; - add.s32 %r172, %r8, %r46; - add.s32 %r63, %r172, 8; - min.s32 %r64, %r47, %r63; - mul.lo.s32 %r10, %r49, %r48; - sub.s32 %r65, %r5, %r60; - shr.s32 %r66, %r65, 31; - shr.u32 %r67, %r66, 27; - add.s32 %r68, %r65, %r67; - and.b32 %r69, %r68, -32; - sub.s32 %r70, %r65, %r69; - sub.s32 %r11, %r5, %r70; - and.b32 %r71, %r2, 31; - cvt.u64.u32 %rd6, %r71; - mov.u64 %rd7, constDeltaForeach1; - add.s64 %rd1, %rd7, %rd6; - setp.ge.s32 %p6, %r172, %r64; - @%p6 bra BB12_13; - - min.s32 %r12, %r45, %r7; - shl.b32 %r15, %r10, 1; - neg.s32 %r16, %r15; - mul.lo.s32 %r17, %r10, 3; - mul.lo.s32 %r18, %r10, -3; - mov.u32 %r72, -9; - sub.s32 %r73, %r72, %r44; - sub.s32 %r74, %r73, %r6; - not.b32 %r75, %r45; - max.s32 %r76, %r74, %r75; - not.b32 %r19, %r76; - sub.s32 %r77, %r72, %r46; - sub.s32 %r78, %r77, %r8; - not.b32 %r79, %r47; - max.s32 %r80, %r78, %r79; - not.b32 %r20, %r80; - ld.global.u8 %r13, [%rd1]; - mov.u32 %r171, %r172; - -BB12_3: - mov.u32 %r21, %r171; - add.s32 %r23, %r21, %r13; - setp.ge.s32 %p7, %r62, %r12; - @%p7 bra BB12_12; - - mul.lo.s32 %r24, %r23, %r10; - mov.u32 %r174, %r62; - mov.u32 %r173, %r62; - -BB12_5: - mov.u32 %r27, %r173; - add.s32 %r30, %r27, %r13; - setp.ge.s32 %p8, %r60, %r11; - mov.u32 %r176, %r60; - @%p8 bra BB12_8; - - mov.u64 %rd9, constDeltaForeach4; - add.s64 %rd10, %rd9, %rd6; - ld.global.u8 %r31, [%rd10]; - mad.lo.s32 %r32, %r30, %r48, %r24; - add.s32 %r177, %r59, %r42; - -BB12_7: - cvta.to.global.u64 %rd11, %rd2; - add.s32 %r98, %r32, %r177; - add.s32 %r99, %r98, %r31; - shl.b32 %r100, %r99, 3; - cvt.s64.s32 %rd12, %r100; - add.s64 %rd13, %rd12, %rd4; - add.s32 %r101, %r100, 8; - cvt.s64.s32 %rd14, %r101; - add.s64 %rd15, %rd14, %rd4; - add.s32 %r102, %r100, -8; - cvt.s64.s32 %rd16, %r102; - add.s64 %rd17, %rd16, %rd4; - add.s32 %r103, %r99, %r48; - shl.b32 %r104, %r103, 3; - cvt.s64.s32 %rd18, %r104; - add.s64 %rd19, %rd18, %rd4; - sub.s32 %r105, %r99, %r48; - shl.b32 %r106, %r105, 3; - cvt.s64.s32 %rd20, %r106; - add.s64 %rd21, %rd20, %rd4; - add.s32 %r108, %r99, %r10; - shl.b32 %r109, %r108, 3; - cvt.s64.s32 %rd22, %r109; - add.s64 %rd23, %rd22, %rd4; - sub.s32 %r110, %r99, %r10; - shl.b32 %r111, %r110, 3; - cvt.s64.s32 %rd24, %r111; - add.s64 %rd25, %rd24, %rd4; - add.s32 %r112, %r100, 16; - cvt.s64.s32 %rd26, %r112; - add.s64 %rd27, %rd26, %rd4; - add.s32 %r113, %r100, -16; - cvt.s64.s32 %rd28, %r113; - add.s64 %rd29, %rd28, %rd4; - shl.b32 %r114, %r48, 1; - add.s32 %r115, %r99, %r114; - shl.b32 %r116, %r115, 3; - cvt.s64.s32 %rd30, %r116; - add.s64 %rd31, %rd30, %rd4; - mad.lo.s32 %r117, %r48, -2, %r99; - shl.b32 %r118, %r117, 3; - cvt.s64.s32 %rd32, %r118; - add.s64 %rd33, %rd32, %rd4; - add.s32 %r119, %r99, %r15; - shl.b32 %r120, %r119, 3; - cvt.s64.s32 %rd34, %r120; - add.s64 %rd35, %rd34, %rd4; - add.s32 %r121, %r99, %r16; - shl.b32 %r122, %r121, 3; - cvt.s64.s32 %rd36, %r122; - add.s64 %rd37, %rd36, %rd4; - add.s32 %r123, %r100, 24; - cvt.s64.s32 %rd38, %r123; - add.s64 %rd39, %rd38, %rd4; - add.s32 %r124, %r100, -24; - cvt.s64.s32 %rd40, %r124; - add.s64 %rd41, %rd40, %rd4; - mad.lo.s32 %r125, %r48, 3, %r99; - shl.b32 %r126, %r125, 3; - cvt.s64.s32 %rd42, %r126; - add.s64 %rd43, %rd42, %rd4; - mad.lo.s32 %r127, %r48, -3, %r99; - shl.b32 %r128, %r127, 3; - cvt.s64.s32 %rd44, %r128; - add.s64 %rd45, %rd44, %rd4; - add.s32 %r129, %r99, %r17; - shl.b32 %r130, %r129, 3; - cvt.s64.s32 %rd46, %r130; - add.s64 %rd47, %rd46, %rd4; - add.s32 %r131, %r99, %r18; - shl.b32 %r132, %r131, 3; - cvt.s64.s32 %rd48, %r132; - add.s64 %rd49, %rd48, %rd4; - add.s64 %rd50, %rd12, %rd5; - add.s64 %rd51, %rd12, %rd3; - ld.f64 %fd1, [%rd13]; - add.f64 %fd2, %fd1, %fd1; - ld.f64 %fd3, [%rd50]; - sub.f64 %fd4, %fd2, %fd3; - ld.global.f64 %fd5, [%rd11]; - ld.f64 %fd6, [%rd17]; - ld.f64 %fd7, [%rd15]; - add.f64 %fd8, %fd7, %fd6; - ld.f64 %fd9, [%rd19]; - add.f64 %fd10, %fd8, %fd9; - ld.f64 %fd11, [%rd21]; - add.f64 %fd12, %fd10, %fd11; - ld.f64 %fd13, [%rd23]; - add.f64 %fd14, %fd12, %fd13; - ld.f64 %fd15, [%rd25]; - add.f64 %fd16, %fd14, %fd15; - ld.global.f64 %fd17, [%rd11+8]; - mul.f64 %fd18, %fd17, %fd16; - fma.rn.f64 %fd19, %fd5, %fd1, %fd18; - ld.f64 %fd20, [%rd29]; - ld.f64 %fd21, [%rd27]; - add.f64 %fd22, %fd21, %fd20; - ld.f64 %fd23, [%rd31]; - add.f64 %fd24, %fd22, %fd23; - ld.f64 %fd25, [%rd33]; - add.f64 %fd26, %fd24, %fd25; - ld.f64 %fd27, [%rd35]; - add.f64 %fd28, %fd26, %fd27; - ld.f64 %fd29, [%rd37]; - add.f64 %fd30, %fd28, %fd29; - ld.global.f64 %fd31, [%rd11+16]; - fma.rn.f64 %fd32, %fd31, %fd30, %fd19; - ld.f64 %fd33, [%rd41]; - ld.f64 %fd34, [%rd39]; - add.f64 %fd35, %fd34, %fd33; - ld.f64 %fd36, [%rd43]; - add.f64 %fd37, %fd35, %fd36; - ld.f64 %fd38, [%rd45]; - add.f64 %fd39, %fd37, %fd38; - ld.f64 %fd40, [%rd47]; - add.f64 %fd41, %fd39, %fd40; - ld.f64 %fd42, [%rd49]; - add.f64 %fd43, %fd41, %fd42; - ld.global.f64 %fd44, [%rd11+24]; - fma.rn.f64 %fd45, %fd44, %fd43, %fd32; - ld.f64 %fd46, [%rd51]; - fma.rn.f64 %fd47, %fd46, %fd45, %fd4; - st.f64 [%rd50], %fd47; - add.s32 %r177, %r177, 32; - setp.lt.s32 %p9, %r177, %r11; - mov.u32 %r175, %r177; - mov.u32 %r176, %r175; - @%p9 bra BB12_7; - -BB12_8: - mov.u32 %r36, %r176; - setp.ge.s32 %p10, %r36, %r5; - @%p10 bra BB12_11; - - mov.u64 %rd53, constDeltaForeach4; - add.s64 %rd54, %rd53, %rd6; - ld.global.u8 %r135, [%rd54]; - add.s32 %r37, %r36, %r135; - setp.ge.s32 %p11, %r37, %r5; - @%p11 bra BB12_11; - - cvta.to.global.u64 %rd55, %rd2; - mad.lo.s32 %r136, %r30, %r48, %r24; - add.s32 %r137, %r136, %r37; - shl.b32 %r138, %r137, 3; - cvt.s64.s32 %rd56, %r138; - add.s64 %rd57, %rd56, %rd4; - add.s32 %r139, %r138, 8; - cvt.s64.s32 %rd58, %r139; - add.s64 %rd59, %rd58, %rd4; - add.s32 %r140, %r138, -8; - cvt.s64.s32 %rd60, %r140; - add.s64 %rd61, %rd60, %rd4; - add.s32 %r141, %r137, %r48; - shl.b32 %r142, %r141, 3; - cvt.s64.s32 %rd62, %r142; - add.s64 %rd63, %rd62, %rd4; - sub.s32 %r143, %r137, %r48; - shl.b32 %r144, %r143, 3; - cvt.s64.s32 %rd64, %r144; - add.s64 %rd65, %rd64, %rd4; - add.s32 %r146, %r137, %r10; - shl.b32 %r147, %r146, 3; - cvt.s64.s32 %rd66, %r147; - add.s64 %rd67, %rd66, %rd4; - sub.s32 %r148, %r137, %r10; - shl.b32 %r149, %r148, 3; - cvt.s64.s32 %rd68, %r149; - add.s64 %rd69, %rd68, %rd4; - add.s32 %r150, %r138, 16; - cvt.s64.s32 %rd70, %r150; - add.s64 %rd71, %rd70, %rd4; - add.s32 %r151, %r138, -16; - cvt.s64.s32 %rd72, %r151; - add.s64 %rd73, %rd72, %rd4; - shl.b32 %r152, %r48, 1; - add.s32 %r153, %r137, %r152; - shl.b32 %r154, %r153, 3; - cvt.s64.s32 %rd74, %r154; - add.s64 %rd75, %rd74, %rd4; - mad.lo.s32 %r155, %r48, -2, %r137; - shl.b32 %r156, %r155, 3; - cvt.s64.s32 %rd76, %r156; - add.s64 %rd77, %rd76, %rd4; - add.s32 %r157, %r137, %r15; - shl.b32 %r158, %r157, 3; - cvt.s64.s32 %rd78, %r158; - add.s64 %rd79, %rd78, %rd4; - add.s32 %r159, %r137, %r16; - shl.b32 %r160, %r159, 3; - cvt.s64.s32 %rd80, %r160; - add.s64 %rd81, %rd80, %rd4; - add.s32 %r161, %r138, 24; - cvt.s64.s32 %rd82, %r161; - add.s64 %rd83, %rd82, %rd4; - add.s32 %r162, %r138, -24; - cvt.s64.s32 %rd84, %r162; - add.s64 %rd85, %rd84, %rd4; - mad.lo.s32 %r163, %r48, 3, %r137; - shl.b32 %r164, %r163, 3; - cvt.s64.s32 %rd86, %r164; - add.s64 %rd87, %rd86, %rd4; - mad.lo.s32 %r165, %r48, -3, %r137; - shl.b32 %r166, %r165, 3; - cvt.s64.s32 %rd88, %r166; - add.s64 %rd89, %rd88, %rd4; - add.s32 %r167, %r137, %r17; - shl.b32 %r168, %r167, 3; - cvt.s64.s32 %rd90, %r168; - add.s64 %rd91, %rd90, %rd4; - add.s32 %r169, %r137, %r18; - shl.b32 %r170, %r169, 3; - cvt.s64.s32 %rd92, %r170; - add.s64 %rd93, %rd92, %rd4; - add.s64 %rd94, %rd56, %rd5; - add.s64 %rd95, %rd56, %rd3; - ld.f64 %fd48, [%rd57]; - add.f64 %fd49, %fd48, %fd48; - ld.f64 %fd50, [%rd94]; - sub.f64 %fd51, %fd49, %fd50; - ld.global.f64 %fd52, [%rd55]; - ld.f64 %fd53, [%rd61]; - ld.f64 %fd54, [%rd59]; - add.f64 %fd55, %fd54, %fd53; - ld.f64 %fd56, [%rd63]; - add.f64 %fd57, %fd55, %fd56; - ld.f64 %fd58, [%rd65]; - add.f64 %fd59, %fd57, %fd58; - ld.f64 %fd60, [%rd67]; - add.f64 %fd61, %fd59, %fd60; - ld.f64 %fd62, [%rd69]; - add.f64 %fd63, %fd61, %fd62; - ld.global.f64 %fd64, [%rd55+8]; - mul.f64 %fd65, %fd64, %fd63; - fma.rn.f64 %fd66, %fd52, %fd48, %fd65; - ld.f64 %fd67, [%rd73]; - ld.f64 %fd68, [%rd71]; - add.f64 %fd69, %fd68, %fd67; - ld.f64 %fd70, [%rd75]; - add.f64 %fd71, %fd69, %fd70; - ld.f64 %fd72, [%rd77]; - add.f64 %fd73, %fd71, %fd72; - ld.f64 %fd74, [%rd79]; - add.f64 %fd75, %fd73, %fd74; - ld.f64 %fd76, [%rd81]; - add.f64 %fd77, %fd75, %fd76; - ld.global.f64 %fd78, [%rd55+16]; - fma.rn.f64 %fd79, %fd78, %fd77, %fd66; - ld.f64 %fd80, [%rd85]; - ld.f64 %fd81, [%rd83]; - add.f64 %fd82, %fd81, %fd80; - ld.f64 %fd83, [%rd87]; - add.f64 %fd84, %fd82, %fd83; - ld.f64 %fd85, [%rd89]; - add.f64 %fd86, %fd84, %fd85; - ld.f64 %fd87, [%rd91]; - add.f64 %fd88, %fd86, %fd87; - ld.f64 %fd89, [%rd93]; - add.f64 %fd90, %fd88, %fd89; - ld.global.f64 %fd91, [%rd55+24]; - fma.rn.f64 %fd92, %fd91, %fd90, %fd79; - ld.f64 %fd93, [%rd95]; - fma.rn.f64 %fd94, %fd92, %fd93, %fd51; - st.f64 [%rd94], %fd94; - -BB12_11: - add.s32 %r39, %r174, 1; - setp.ne.s32 %p12, %r39, %r19; - mov.u32 %r174, %r39; - mov.u32 %r173, %r39; - @%p12 bra BB12_5; - -BB12_12: - add.s32 %r171, %r172, 1; - setp.ne.s32 %p13, %r171, %r20; - mov.u32 %r172, %r171; - @%p13 bra BB12_3; - -BB12_13: - ret; -} - -.visible .func loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E_( - .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_0, - .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_1, - .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_2, - .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_3, - .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_4, - .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_5, - .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_6, - .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_7, - .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_8, - .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_9, - .param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_10, - .param .b64 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_11, - .param .b64 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_12, - .param .b64 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_13, - .param .b64 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_14, - .param .align 1 .b8 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_15[1] -) -{ - .reg .pred %p<9>; - .reg .s32 %r<63>; - .reg .s64 %rd<18>; - - - ld.param.u32 %r62, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_0]; - ld.param.u32 %r12, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_1]; - ld.param.u32 %r13, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_2]; - ld.param.u32 %r14, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_3]; - ld.param.u32 %r15, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_4]; - ld.param.u32 %r16, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_5]; - ld.param.u32 %r17, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_6]; - ld.param.u32 %r18, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_7]; - ld.param.u32 %r19, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_8]; - ld.param.u32 %r20, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_9]; - ld.param.u32 %r21, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_10]; - ld.param.u64 %rd4, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_11]; - ld.param.u64 %rd5, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_12]; - ld.param.u64 %rd6, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_13]; - ld.param.u64 %rd7, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_14]; - setp.ge.s32 %p1, %r62, %r12; - @%p1 bra BB13_14; - - mov.u32 %r22, 31; - sub.s32 %r23, %r22, %r13; - add.s32 %r24, %r23, %r14; - shr.s32 %r25, %r24, 31; - shr.u32 %r26, %r25, 27; - add.s32 %r27, %r24, %r26; - shr.s32 %r28, %r27, 5; - mov.u32 %r29, 7; - sub.s32 %r30, %r29, %r15; - add.s32 %r31, %r30, %r16; - shr.s32 %r32, %r31, 31; - shr.u32 %r33, %r32, 29; - add.s32 %r34, %r31, %r33; - shr.s32 %r1, %r34, 3; - sub.s32 %r35, %r29, %r17; - add.s32 %r36, %r35, %r18; - shr.s32 %r37, %r36, 31; - shr.u32 %r38, %r37, 29; - add.s32 %r39, %r36, %r38; - shr.s32 %r2, %r39, 3; - add.s32 %r40, %r28, -1; - shr.s32 %r41, %r40, 2; - add.s32 %r3, %r41, 1; - mov.u32 %r42, %tid.x; - and.b32 %r4, %r42, 31; - sub.s32 %r61, %r62, %r12; - -BB13_2: - and.b32 %r8, %r62, 1; - setp.ne.s32 %p2, %r4, 0; - mov.u64 %rd17, 0; - @%p2 bra BB13_4; - - mov.u64 %rd9, 8; - mov.u64 %rd10, 72; - // Callseq Start 2 - { - .reg .b32 temp_param_reg; - .param .b64 param0; - st.param.b64 [param0+0], %rd9; - .param .b64 param1; - st.param.b64 [param1+0], %rd10; - .param .b64 retval0; - call.uni (retval0), - cudaGetParameterBuffer, - ( - param0, - param1 - ); - ld.param.b64 %rd17, [retval0+0]; - } - // Callseq End 2 - -BB13_4: - setp.eq.s32 %p3, %r8, 0; - @%p3 bra BB13_9; - - setp.eq.s64 %p4, %rd17, 0; - @%p4 bra BB13_7; - - st.u32 [%rd17], %r13; - st.u32 [%rd17+4], %r14; - st.u32 [%rd17+8], %r15; - st.u32 [%rd17+12], %r16; - st.u32 [%rd17+16], %r17; - st.u32 [%rd17+20], %r18; - st.u32 [%rd17+24], %r19; - st.u32 [%rd17+28], %r20; - st.u32 [%rd17+32], %r21; - st.u64 [%rd17+40], %rd4; - st.u64 [%rd17+48], %rd5; - st.u64 [%rd17+56], %rd7; - st.u64 [%rd17+64], %rd6; - -BB13_7: - @%p2 bra BB13_13; - - mov.u32 %r47, 128; - mov.u32 %r49, 1; - mov.u32 %r50, 0; - mov.u64 %rd13, 0; - mov.u64 %rd11, stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_; - // inline asm - { - .param .b64 param0; - st.param.b64 [param0+0], %rd11; - .param .b64 param1; - st.param.b64 [param1+0], %rd17; - .param .align 4 .b8 param2[12]; - st.param.b32 [param2+0], %r3; - st.param.b32 [param2+4], %r1; - st.param.b32 [param2+8], %r2; - .param .align 4 .b8 param3[12]; - st.param.b32 [param3+0], %r47; - st.param.b32 [param3+4], %r49; - st.param.b32 [param3+8], %r49; - .param .b32 param4; - st.param.b32 [param4+0], %r50; - .param .b64 param5; - st.param.b64 [param5+0], %rd13; - - .param .b32 retval0; - call.uni (retval0), - cudaLaunchDevice, - ( - param0, - param1, - param2, - param3, - param4, - param5 - ); - ld.param.b32 %r43, [retval0+0]; - } - - // inline asm - bra.uni BB13_13; - -BB13_9: - setp.eq.s64 %p6, %rd17, 0; - @%p6 bra BB13_11; - - st.u32 [%rd17], %r13; - st.u32 [%rd17+4], %r14; - st.u32 [%rd17+8], %r15; - st.u32 [%rd17+12], %r16; - st.u32 [%rd17+16], %r17; - st.u32 [%rd17+20], %r18; - st.u32 [%rd17+24], %r19; - st.u32 [%rd17+28], %r20; - st.u32 [%rd17+32], %r21; - st.u64 [%rd17+40], %rd4; - st.u64 [%rd17+48], %rd5; - st.u64 [%rd17+56], %rd6; - st.u64 [%rd17+64], %rd7; - -BB13_11: - @%p2 bra BB13_13; - - mov.u32 %r55, 128; - mov.u32 %r57, 1; - mov.u32 %r58, 0; - mov.u64 %rd16, 0; - mov.u64 %rd14, stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_; - // inline asm - { - .param .b64 param0; - st.param.b64 [param0+0], %rd14; - .param .b64 param1; - st.param.b64 [param1+0], %rd17; - .param .align 4 .b8 param2[12]; - st.param.b32 [param2+0], %r3; - st.param.b32 [param2+4], %r1; - st.param.b32 [param2+8], %r2; - .param .align 4 .b8 param3[12]; - st.param.b32 [param3+0], %r55; - st.param.b32 [param3+4], %r57; - st.param.b32 [param3+8], %r57; - .param .b32 param4; - st.param.b32 [param4+0], %r58; - .param .b64 param5; - st.param.b64 [param5+0], %rd16; - - .param .b32 retval0; - call.uni (retval0), - cudaLaunchDevice, - ( - param0, - param1, - param2, - param3, - param4, - param5 - ); - ld.param.b32 %r51, [retval0+0]; - } - - // inline asm - -BB13_13: - // Callseq Start 3 - { - .reg .b32 temp_param_reg; - .param .b32 retval0; - call.uni (retval0), - cudaDeviceSynchronize, - ( - ); - ld.param.b32 %r59, [retval0+0]; - } - // Callseq End 3 - add.s32 %r62, %r62, 1; - add.s32 %r61, %r61, 1; - setp.ne.s32 %p8, %r61, 0; - @%p8 bra BB13_2; - -BB13_14: - // Callseq Start 4 - { - .reg .b32 temp_param_reg; - .param .b32 retval0; - call.uni (retval0), - cudaDeviceSynchronize, - ( - ); - ld.param.b32 %r60, [retval0+0]; - } - // Callseq End 4 - ret; -} - -.visible .entry loop_stencil_ispc_tasks( - .param .u32 loop_stencil_ispc_tasks_param_0, - .param .u32 loop_stencil_ispc_tasks_param_1, - .param .u32 loop_stencil_ispc_tasks_param_2, - .param .u32 loop_stencil_ispc_tasks_param_3, - .param .u32 loop_stencil_ispc_tasks_param_4, - .param .u32 loop_stencil_ispc_tasks_param_5, - .param .u32 loop_stencil_ispc_tasks_param_6, - .param .u32 loop_stencil_ispc_tasks_param_7, - .param .u32 loop_stencil_ispc_tasks_param_8, - .param .u32 loop_stencil_ispc_tasks_param_9, - .param .u32 loop_stencil_ispc_tasks_param_10, - .param .u64 loop_stencil_ispc_tasks_param_11, - .param .u64 loop_stencil_ispc_tasks_param_12, - .param .u64 loop_stencil_ispc_tasks_param_13, - .param .u64 loop_stencil_ispc_tasks_param_14 -) -{ - .reg .pred %p<9>; - .reg .s32 %r<63>; - .reg .s64 %rd<18>; - - - ld.param.u32 %r62, [loop_stencil_ispc_tasks_param_0]; - ld.param.u32 %r12, [loop_stencil_ispc_tasks_param_1]; - ld.param.u32 %r13, [loop_stencil_ispc_tasks_param_2]; - ld.param.u32 %r14, [loop_stencil_ispc_tasks_param_3]; - ld.param.u32 %r15, [loop_stencil_ispc_tasks_param_4]; - ld.param.u32 %r16, [loop_stencil_ispc_tasks_param_5]; - ld.param.u32 %r17, [loop_stencil_ispc_tasks_param_6]; - ld.param.u32 %r18, [loop_stencil_ispc_tasks_param_7]; - ld.param.u32 %r19, [loop_stencil_ispc_tasks_param_8]; - ld.param.u32 %r20, [loop_stencil_ispc_tasks_param_9]; - ld.param.u32 %r21, [loop_stencil_ispc_tasks_param_10]; - ld.param.u64 %rd4, [loop_stencil_ispc_tasks_param_11]; - ld.param.u64 %rd5, [loop_stencil_ispc_tasks_param_12]; - ld.param.u64 %rd6, [loop_stencil_ispc_tasks_param_13]; - ld.param.u64 %rd7, [loop_stencil_ispc_tasks_param_14]; - setp.ge.s32 %p1, %r62, %r12; - @%p1 bra BB14_14; - - mov.u32 %r22, 31; - sub.s32 %r23, %r22, %r13; - add.s32 %r24, %r23, %r14; - shr.s32 %r25, %r24, 31; - shr.u32 %r26, %r25, 27; - add.s32 %r27, %r24, %r26; - shr.s32 %r28, %r27, 5; - mov.u32 %r29, 7; - sub.s32 %r30, %r29, %r15; - add.s32 %r31, %r30, %r16; - shr.s32 %r32, %r31, 31; - shr.u32 %r33, %r32, 29; - add.s32 %r34, %r31, %r33; - shr.s32 %r1, %r34, 3; - sub.s32 %r35, %r29, %r17; - add.s32 %r36, %r35, %r18; - shr.s32 %r37, %r36, 31; - shr.u32 %r38, %r37, 29; - add.s32 %r39, %r36, %r38; - shr.s32 %r2, %r39, 3; - add.s32 %r40, %r28, -1; - shr.s32 %r41, %r40, 2; - add.s32 %r3, %r41, 1; - mov.u32 %r42, %tid.x; - and.b32 %r4, %r42, 31; - sub.s32 %r61, %r62, %r12; - -BB14_2: - and.b32 %r8, %r62, 1; - setp.ne.s32 %p2, %r4, 0; - mov.u64 %rd17, 0; - @%p2 bra BB14_4; - - mov.u64 %rd9, 8; - mov.u64 %rd10, 72; - // Callseq Start 5 - { - .reg .b32 temp_param_reg; - .param .b64 param0; - st.param.b64 [param0+0], %rd9; - .param .b64 param1; - st.param.b64 [param1+0], %rd10; - .param .b64 retval0; - call.uni (retval0), - cudaGetParameterBuffer, - ( - param0, - param1 - ); - ld.param.b64 %rd17, [retval0+0]; - } - // Callseq End 5 - -BB14_4: - setp.eq.s32 %p3, %r8, 0; - @%p3 bra BB14_9; - - setp.eq.s64 %p4, %rd17, 0; - @%p4 bra BB14_7; - - st.u32 [%rd17], %r13; - st.u32 [%rd17+4], %r14; - st.u32 [%rd17+8], %r15; - st.u32 [%rd17+12], %r16; - st.u32 [%rd17+16], %r17; - st.u32 [%rd17+20], %r18; - st.u32 [%rd17+24], %r19; - st.u32 [%rd17+28], %r20; - st.u32 [%rd17+32], %r21; - st.u64 [%rd17+40], %rd4; - st.u64 [%rd17+48], %rd5; - st.u64 [%rd17+56], %rd7; - st.u64 [%rd17+64], %rd6; - -BB14_7: - @%p2 bra BB14_13; - - mov.u32 %r47, 128; - mov.u32 %r49, 1; - mov.u32 %r50, 0; - mov.u64 %rd13, 0; - mov.u64 %rd11, stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_; - // inline asm - { - .param .b64 param0; - st.param.b64 [param0+0], %rd11; - .param .b64 param1; - st.param.b64 [param1+0], %rd17; - .param .align 4 .b8 param2[12]; - st.param.b32 [param2+0], %r3; - st.param.b32 [param2+4], %r1; - st.param.b32 [param2+8], %r2; - .param .align 4 .b8 param3[12]; - st.param.b32 [param3+0], %r47; - st.param.b32 [param3+4], %r49; - st.param.b32 [param3+8], %r49; - .param .b32 param4; - st.param.b32 [param4+0], %r50; - .param .b64 param5; - st.param.b64 [param5+0], %rd13; - - .param .b32 retval0; - call.uni (retval0), - cudaLaunchDevice, - ( - param0, - param1, - param2, - param3, - param4, - param5 - ); - ld.param.b32 %r43, [retval0+0]; - } - - // inline asm - bra.uni BB14_13; - -BB14_9: - setp.eq.s64 %p6, %rd17, 0; - @%p6 bra BB14_11; - - st.u32 [%rd17], %r13; - st.u32 [%rd17+4], %r14; - st.u32 [%rd17+8], %r15; - st.u32 [%rd17+12], %r16; - st.u32 [%rd17+16], %r17; - st.u32 [%rd17+20], %r18; - st.u32 [%rd17+24], %r19; - st.u32 [%rd17+28], %r20; - st.u32 [%rd17+32], %r21; - st.u64 [%rd17+40], %rd4; - st.u64 [%rd17+48], %rd5; - st.u64 [%rd17+56], %rd6; - st.u64 [%rd17+64], %rd7; - -BB14_11: - @%p2 bra BB14_13; - - mov.u32 %r55, 128; - mov.u32 %r57, 1; - mov.u32 %r58, 0; - mov.u64 %rd16, 0; - mov.u64 %rd14, stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_; - // inline asm - { - .param .b64 param0; - st.param.b64 [param0+0], %rd14; - .param .b64 param1; - st.param.b64 [param1+0], %rd17; - .param .align 4 .b8 param2[12]; - st.param.b32 [param2+0], %r3; - st.param.b32 [param2+4], %r1; - st.param.b32 [param2+8], %r2; - .param .align 4 .b8 param3[12]; - st.param.b32 [param3+0], %r55; - st.param.b32 [param3+4], %r57; - st.param.b32 [param3+8], %r57; - .param .b32 param4; - st.param.b32 [param4+0], %r58; - .param .b64 param5; - st.param.b64 [param5+0], %rd16; - - .param .b32 retval0; - call.uni (retval0), - cudaLaunchDevice, - ( - param0, - param1, - param2, - param3, - param4, - param5 - ); - ld.param.b32 %r51, [retval0+0]; - } - - // inline asm - -BB14_13: - // Callseq Start 6 - { - .reg .b32 temp_param_reg; - .param .b32 retval0; - call.uni (retval0), - cudaDeviceSynchronize, - ( - ); - ld.param.b32 %r59, [retval0+0]; - } - // Callseq End 6 - add.s32 %r62, %r62, 1; - add.s32 %r61, %r61, 1; - setp.ne.s32 %p8, %r61, 0; - @%p8 bra BB14_2; - -BB14_14: - // Callseq Start 7 - { - .reg .b32 temp_param_reg; - .param .b32 retval0; - call.uni (retval0), - cudaDeviceSynchronize, - ( - ); - ld.param.b32 %r60, [retval0+0]; - } - // Callseq End 7 - ret; -} - - - diff --git a/examples_cuda/stencil/stencil_orig.cpp b/examples_cuda/stencil/stencil_orig.cpp deleted file mode 100644 index 015f2b80..00000000 --- a/examples_cuda/stencil/stencil_orig.cpp +++ /dev/null @@ -1,172 +0,0 @@ -/* - Copyright (c) 2010-2011, Intel Corporation - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of Intel Corporation nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS - IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER - OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*/ - -#ifdef _MSC_VER -#define _CRT_SECURE_NO_WARNINGS -#define NOMINMAX -#pragma warning (disable: 4244) -#pragma warning (disable: 4305) -#endif - -#include -#include -#include -#include "../timing.h" -#include "stencil_ispc.h" -using namespace ispc; - -#include - - -double rtc(void) -{ - struct timeval Tvalue; - double etime; - struct timezone dummy; - - gettimeofday(&Tvalue,&dummy); - etime = (double) Tvalue.tv_sec + - 1.e-6*((double) Tvalue.tv_usec); - return etime; -} - - -extern void loop_stencil_serial(int t0, int t1, int x0, int x1, - int y0, int y1, int z0, int z1, - int Nx, int Ny, int Nz, - const double coef[5], - const double vsq[], - double Aeven[], double Aodd[]); - - -void InitData(int Nx, int Ny, int Nz, double *A[2], double *vsq) { - int offset = 0; - for (int z = 0; z < Nz; ++z) - for (int y = 0; y < Ny; ++y) - for (int x = 0; x < Nx; ++x, ++offset) { - A[0][offset] = (x < Nx / 2) ? x / double(Nx) : y / double(Ny); - A[1][offset] = 0; - vsq[offset] = x*y*z / double(Nx * Ny * Nz); - } -} - - -int main() { - int Nx = 256, Ny = 256, Nz = 256; - int width = 4; - double *Aserial[2], *Aispc[2]; - Aserial[0] = new double [Nx * Ny * Nz]; - Aserial[1] = new double [Nx * Ny * Nz]; - Aispc[0] = new double [Nx * Ny * Nz]; - Aispc[1] = new double [Nx * Ny * Nz]; - double *vsq = new double [Nx * Ny * Nz]; - - double coeff[4] = { 0.5, -.25, .125, -.0625 }; - -// InitData(Nx, Ny, Nz, Aispc, vsq); - - // - // Compute the image using the ispc implementation on one core; report - // the minimum time of three runs. - // - double minTimeISPC = 1e30; -#if 0 - for (int i = 0; i < 3; ++i) { - reset_and_start_timer(); - loop_stencil_ispc(0, 6, width, Nx - width, width, Ny - width, - width, Nz - width, Nx, Ny, Nz, coeff, vsq, - Aispc[0], Aispc[1]); - double dt = get_elapsed_mcycles(); - minTimeISPC = std::min(minTimeISPC, dt); - } - - printf("[stencil ispc 1 core]:\t\t[%.3f] million cycles\n", minTimeISPC); -#endif - - fprintf(stderr, " -- init -- \n"); - InitData(Nx, Ny, Nz, Aispc, vsq); - fprintf(stderr, " -- done init -- \n"); - - // - // Compute the image using the ispc implementation with tasks; report - // the minimum time of three runs. - // - double minTimeISPCTasks = 1e30; - for (int i = 0; i < 3; ++i) { - reset_and_start_timer(); - const double t0 = rtc(); - loop_stencil_ispc_tasks(0, 6, width, Nx - width, width, Ny - width, - width, Nz - width, Nx, Ny, Nz, coeff, vsq, - Aispc[0], Aispc[1]); - double dt = 1e3*(rtc() - t0); //get_elapsed_mcycles(); - minTimeISPCTasks = std::min(minTimeISPCTasks, dt); - } - - fprintf(stderr, "[stencil ispc + tasks]:\t\t[%.3f] million cycles\n", minTimeISPCTasks); - - - InitData(Nx, Ny, Nz, Aserial, vsq); - - // - // And run the serial implementation 3 times, again reporting the - // minimum time. - // - double minTimeSerial = 1e30; - for (int i = 0; i < 3; ++i) { - reset_and_start_timer(); - loop_stencil_serial(0, 6, width, Nx-width, width, Ny - width, - width, Nz - width, Nx, Ny, Nz, coeff, vsq, - Aserial[0], Aserial[1]); - double dt = get_elapsed_mcycles(); - minTimeSerial = std::min(minTimeSerial, dt); - } - - printf("[stencil serial]:\t\t[%.3f] million cycles\n", minTimeSerial); - - printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n", - minTimeSerial / minTimeISPC, minTimeSerial / minTimeISPCTasks); - - // Check for agreement - int offset = 0; - for (int z = 0; z < Nz; ++z) - for (int y = 0; y < Ny; ++y) - for (int x = 0; x < Nx; ++x, ++offset) { - double error = fabsf((Aserial[1][offset] - Aispc[1][offset]) / - Aserial[1][offset]); - if (error > 1e-4) - printf("Error @ (%d,%d,%d): ispc = %f, serial = %f\n", - x, y, z, Aispc[1][offset], Aserial[1][offset]); - } - - return 0; -} diff --git a/examples_cuda/stencil/stencil_orig.ispc b/examples_cuda/stencil/stencil_orig.ispc deleted file mode 100644 index d2e095b3..00000000 --- a/examples_cuda/stencil/stencil_orig.ispc +++ /dev/null @@ -1,172 +0,0 @@ -/* - Copyright (c) 2010-2011, Intel Corporation - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of Intel Corporation nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS - IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER - OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*/ - -#ifdef __NVPTX__ -#warning "emitting DEVICE code" -#define taskIndex blockIndex0() -#define taskCount blockCount0() -#define programIndex laneIndex() -#define programCount warpSize() -#else -#warning "emitting HOST code" -#endif - -static inline void -stencil_step(uniform int x0, uniform int x1, - uniform int y0, uniform int y1, - uniform int z0, uniform int z1, - uniform int Nx, uniform int Ny, uniform int Nz, - uniform const double coef[4], uniform const double vsq[], - uniform const double Ain[], uniform double Aout[]) { - const uniform int Nxy = Nx * Ny; - -// foreach (z = z0 ... z1, y = y0 ... y1, x = x0 ... x1) -#if 0 -#define VER1 -#endif - -#ifdef VER1 - const uniform long x1o = 1; - const uniform long x2o = 2; - const uniform long x3o = 3; - const uniform long y1o = Nx; - const uniform long y2o = Nx*2; - const uniform long y3o = Nx*3; - const uniform long z1o = Nxy; - const uniform long z2o = Nxy*2; - const uniform long z3o = Nxy*3; -#endif - for (uniform int z = z0; z < z1; z++) - for (uniform int y = y0; y < y1; y++) - { - const int index_base = (z * Nxy) + (y * Nx); - for (uniform int xb = x0; xb < x1; xb += programCount) - { - const int x = xb + programIndex; - int index = index_base + x; -#ifndef VER1 -#define A_cur(x, y, z) Ain[index + (x) + ((y) * Nx) + ((z) * Nxy)] -#define A_next(x, y, z) Aout[index + (x) + ((y) * Nx) + ((z) * Nxy)] - double div = coef[0] * A_cur(0, 0, 0) + - coef[1] * (A_cur(+1, 0, 0) + A_cur(-1, 0, 0) + - A_cur(0, +1, 0) + A_cur(0, -1, 0) + - A_cur(0, 0, +1) + A_cur(0, 0, -1)) + - coef[2] * (A_cur(+2, 0, 0) + A_cur(-2, 0, 0) + - A_cur(0, +2, 0) + A_cur(0, -2, 0) + - A_cur(0, 0, +2) + A_cur(0, 0, -2)) + - coef[3] * (A_cur(+3, 0, 0) + A_cur(-3, 0, 0) + - A_cur(0, +3, 0) + A_cur(0, -3, 0) + - A_cur(0, 0, +3) + A_cur(0, 0, -3)); -#else -#define A_cur(x, y, z) Ain [index + (x) + (y) + (z)] -#define A_next(x, y, z) Aout[index + (x) + (y) + (z)] - double div = coef[0] * A_cur(0, 0, 0) + - coef[1] * (A_cur(+x1o, 0, 0) + A_cur(-x1o, 0, 0) + - A_cur(0, +y1o, 0) + A_cur(0, -y1o, 0) + - A_cur(0, 0, +z1o) + A_cur(0, 0, -z1o)) + - coef[2] * (A_cur(+x2o, 0, 0) + A_cur(-x2o, 0, 0) + - A_cur(0, +y2o, 0) + A_cur(0, -y2o, 0) + - A_cur(0, 0, +z2o) + A_cur(0, 0, -z2o)) + - coef[3] * (A_cur(+x3o, 0, 0) + A_cur(-x3o, 0, 0) + - A_cur(0, +y3o, 0) + A_cur(0, -y3o, 0) + - A_cur(0, 0, +z3o) + A_cur(0, 0, -z3o)); -#endif - - if (x < x1) - A_next(0, 0, 0) = 2.0d0 * A_cur(0, 0, 0) - A_next(0, 0, 0) + - vsq[index] * div; - } - } -} - - -static task void -stencil_step_task(uniform int x0, uniform int x1, - uniform int y0, uniform int y1, - uniform int z0, - uniform int Nx, uniform int Ny, uniform int Nz, - uniform const double coef[4], uniform const double vsq[], - uniform const double Ain[], uniform double Aout[]) { - if(taskIndex >= taskCount) return; - - stencil_step(x0, x1, y0, y1, z0+taskIndex, z0+taskIndex+1, - Nx, Ny, Nz, coef, vsq, Ain, Aout); -} - - -export void -loop_stencil_ispc_tasks(uniform int t0, uniform int t1, - uniform int x0, uniform int x1, - uniform int y0, uniform int y1, - uniform int z0, uniform int z1, - uniform int Nx, uniform int Ny, uniform int Nz, - uniform const double coef[4], - uniform const double vsq[], - uniform double Aeven[], uniform double Aodd[]) -{ - for (uniform int t = t0; t < t1; ++t) { - // Parallelize across cores as well: each task will work on a slice - // of 1 in the z extent of the volume. - if ((t & 1) == 0) - launch[z1-z0] stencil_step_task(x0, x1, y0, y1, z0, Nx, Ny, Nz, - coef, vsq, Aeven, Aodd); - else - launch[z1-z0] stencil_step_task(x0, x1, y0, y1, z0, Nx, Ny, Nz, - coef, vsq, Aodd, Aeven); - - // We need to wait for all of the launched tasks to finish before - // starting the next iteration. - sync; - } -} - - -export void -loop_stencil_ispc(uniform int t0, uniform int t1, - uniform int x0, uniform int x1, - uniform int y0, uniform int y1, - uniform int z0, uniform int z1, - uniform int Nx, uniform int Ny, uniform int Nz, - uniform const double coef[4], - uniform const double vsq[], - uniform double Aeven[], uniform double Aodd[]) -{ - for (uniform int t = t0; t < t1; ++t) { - if ((t & 1) == 0) - stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq, - Aeven, Aodd); - else - stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq, - Aodd, Aeven); - } -} diff --git a/examples_cuda/stencil/stencil_serial.o b/examples_cuda/stencil/stencil_serial.o deleted file mode 100644 index 1fd32c29..00000000 Binary files a/examples_cuda/stencil/stencil_serial.o and /dev/null differ diff --git a/stdlib.ispc b/stdlib.ispc index 25728ed0..fcb61eb4 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -93,7 +93,7 @@ __declspec(safe,cost0) __declspec(safe,cost0) static inline uniform int blockIndex2() { - return __ctaid_y(); + return __ctaid_z(); } /***************/ __declspec(safe,cost0)