This commit is contained in:
Evghenii
2013-11-13 13:56:45 +01:00
parent d3ade0654e
commit dededd1929
50 changed files with 0 additions and 160909 deletions

View File

@@ -1,127 +0,0 @@
/*
 * Disassembly listing (sm_35) of kernel mandelbrot_scanline.
 * Each row is: byte offset, instruction, 64-bit encoding. The lone hex words
 * between instruction groups carry no instruction text (presumably scheduling
 * control words -- TODO confirm).
 *
 * Outline, as read from the instructions below:
 *   - R0 walks rows from CTAID.Y * c[0x0][0x15c] up to
 *     min((CTAID.Y + 1) * c[0x0][0x15c], c[0x0][0x154]) (bound kept in R4).
 *   - Columns start at CTAID.X * c[0x0][0x158], advance by 0x20 per pass and are
 *     limited by R3 = min((CTAID.X + 1) * c[0x0][0x158], c[0x0][0x150]);
 *     each thread adds (TID.X & 0x1f) to the column base.
 *   - Per point, counter R7 is incremented until the FSETP.GTU test against 4
 *     succeeds or R7 reaches c[0x0][0x160]; R7 is then stored as a 32-bit word at
 *     c[0x0][0x168]/c[0x0][0x16c] + 4 * (row * c[0x0][0x150] + column).
 *   - When c[0x0][0x160] <= 0 the separate loop at 0x2d8 stores zero instead.
 * NOTE(review): the c[0x0][0x140..0x16c] slots are presumably the kernel
 * parameters -- confirm against the matching PTX.
 */
code for sm_35
Function : mandelbrot_scanline
.headerflags @"EF_CUDA_SM35 EF_CUDA_PTX_SM(EF_CUDA_SM35)"
/* 0x08a0b010a0a01000 */
/* Entry: R0 = first row, R3 = clamped row limit; leave early when the row range is empty. */
/*0008*/ MOV R1, c[0x0][0x44]; /* 0x64c03c00089c0006 */
/*0010*/ S2R R2, SR_CTAID.Y; /* 0x86400000131c000a */
/*0018*/ MOV R3, c[0x0][0x15c]; /* 0x64c03c002b9c000e */
/*0020*/ IMAD R3, R2, c[0x0][0x15c], R3; /* 0x51080c002b9c080e */
/*0028*/ ISETP.LT.AND P0, PT, R3, c[0x0][0x154], PT; /* 0x5b181c002a9c0c1e */
/*0030*/ IMUL R0, R2, c[0x0][0x15c]; /* 0x61c018002b9c0802 */
/*0038*/ SEL R3, R3, c[0x0][0x154], P0; /* 0x650000002a9c0c0e */
/* 0x089c8010a01000b0 */
/*0048*/ ISETP.GE.AND P0, PT, R0, R3, PT; /* 0xdb681c00019c001e */
/*0050*/ @P0 EXIT ; /* 0x180000000000003c */
/* Loop-invariant setup: R4 = row loop end (built from complemented operands, then complemented back), */
/* R2 = first column, R3 = clamped column limit, P0 = (0 < c[0x0][0x160]). */
/*0058*/ IADD R2, R2, 0x1; /* 0xc0800000009c0809 */
/*0060*/ MOV R3, c[0x0][0x158]; /* 0x64c03c002b1c000e */
/*0068*/ IMUL R5, R2, c[0x0][0x15c]; /* 0x61c018002b9c0816 */
/*0070*/ LOP.PASS_B R4, RZ, ~c[0x0][0x154]; /* 0x620038002a9ffc12 */
/*0078*/ S2R R2, SR_CTAID.X; /* 0x86400000129c000a */
/* 0x08ac80109c108010 */
/*0088*/ LOP.PASS_B R7, RZ, ~R5; /* 0xe2003800029ffc1e */
/*0090*/ LOP.PASS_B R6, RZ, ~c[0x0][0x154]; /* 0x620038002a9ffc1a */
/*0098*/ LOP.PASS_B R5, RZ, ~R5; /* 0xe2003800029ffc16 */
/*00a0*/ IMAD R3, R2, c[0x0][0x158], R3; /* 0x51080c002b1c080e */
/*00a8*/ ISETP.GT.AND P0, PT, R4, R7, PT; /* 0xdb481c00039c101e */
/*00b0*/ IMUL R2, R2, c[0x0][0x158]; /* 0x61c018002b1c080a */
/*00b8*/ ISETP.LT.AND P1, PT, R3, c[0x0][0x150], PT; /* 0x5b181c002a1c0c3e */
/* 0x0800b010008010a0 */
/*00c8*/ SEL R4, R5, R6, !P0; /* 0xe5002000031c1412 */
/*00d0*/ ISETP.LT.AND P0, PT, RZ, c[0x0][0x160], PT; /* 0x5b181c002c1ffc1e */
/*00d8*/ LOP.PASS_B R4, RZ, ~R4; /* 0xe2003800021ffc12 */
/*00e0*/ SEL R3, R3, c[0x0][0x150], P1; /* 0x650004002a1c0c0e */
/* 0x00e8: row loop head; an empty column range jumps straight to the row increment at 0x360. */
/*00e8*/ ISETP.GE.AND P1, PT, R2, R3, PT; /* 0xdb681c00019c083e */
/*00f0*/ SSY 0x368; /* 0x1480000138000000 */
/*00f8*/ @P1 BRA 0x360; /* 0x120000013004003c */
/* 0x089c108010001080 */
/* R5 = row * c[0x0][0x150] (row offset into the output); !P0 selects the zero-store loop at 0x2d8. */
/*0108*/ IMUL R5, R0, c[0x0][0x150]; /* 0x61c018002a1c0016 */
/*0110*/ MOV R8, R2; /* 0xe4c03c00011c0022 */
/*0118*/ @!P0 BRA 0x2d8; /* 0x12000000dc20003c */
/* R6 = float(row) * c[0x0][0x14c] + c[0x0][0x148]; R14 = column base for this row. */
/*0120*/ I2F.F32.S32 R6, R0; /* 0xe5c00000001ca81a */
/*0128*/ MOV R7, c[0x0][0x148]; /* 0x64c03c00291c001e */
/*0130*/ MOV R14, R2; /* 0xe4c03c00011c003a */
/*0138*/ MOV R16, c[0x0][0x140]; /* 0x64c03c00281c0042 */
/* 0x089c80a010a01000 */
/*0148*/ FFMA R6, R6, c[0x0][0x14c], R7; /* 0x4c001c00299c181a */
/* 0x0150: column loop head. R12 = column base + (TID.X & 0x1f); */
/* R11 = float(R12) * c[0x0][0x144] + c[0x0][0x140]; R7 = iteration counter, reset to 0. */
/*0150*/ S2R R10, SR_TID.X; /* 0x86400000109c002a */
/*0158*/ MOV R9, R6; /* 0xe4c03c00031c0026 */
/*0160*/ LOP.AND R7, R10, 0x1f; /* 0xc20000000f9c281d */
/*0168*/ PSETP.AND.AND P2, PT, PT, PT, PT; /* 0x84801c07001dc05e */
/*0170*/ IADD R12, R7, R14; /* 0xe0800000071c1c32 */
/*0178*/ PSETP.AND.AND P3, PT, P0, PT, PT; /* 0x84801c07001c007e */
/* 0x08a00010a010a010 */
/*0188*/ I2F.F32.S32 R7, R12; /* 0xe5c00000061ca81e */
/*0190*/ PSETP.AND.AND P1, PT, !PT, PT, PT; /* 0x84801c07001fc03e */
/*0198*/ FFMA R11, R7, c[0x0][0x144], R16; /* 0x4c004000289c1c2e */
/*01a0*/ SSY 0x260; /* 0x148000005c000000 */
/*01a8*/ MOV R7, RZ; /* 0xe4c03c007f9c001e */
/*01b0*/ MOV R8, R11; /* 0xe4c03c00059c0022 */
/* 0x01b8: iteration loop. R13 = R8*R8 + R9*R9 is tested against 4; while P5 holds, */
/* R8 <- R11 + (R8*R8 - R9*R9) and R9 <- R9 * (R8 + R8) + R6, and R7 is incremented under P2. */
/*01b8*/ FMUL R15, R8, R8; /* 0xe3400000041c203e */
/* 0x08b0b0ac80b0a010 */
/*01c8*/ PSETP.AND.AND P3, PT, P2, P3, PT; /* 0x84801c03001c807e */
/*01d0*/ FFMA R13, R9, R9, R15; /* 0xcc003c00049c2436 */
/*01d8*/ FSETP.GTU.AND P2, PT, R13, 4, PT; /* 0xb5e01e04001c345d */
/*01e0*/ PSETP.AND.OR P1, PT, P3, P2, P1; /* 0x84810402001cc03e */
/*01e8*/ PSETP.AND.AND P2, PT, !PT, PT, PT; /* 0x84801c07001fc05e */
/*01f0*/ PSETP.XOR.AND P5, PT, P1, P3, PT; /* 0x84801c03101c40be */
/*01f8*/ @P5 PSETP.AND.AND P2, PT, P3, !P1, PT; /* 0x84801c090014c05e */
/* 0x08ac8010b09c1080 */
/*0208*/ @P2 IADD R7, R7, 0x1; /* 0xc080000000881c1d */
/*0210*/ @P5 FFMA R13, -R9, R9, R15; /* 0xcc083c0004942436 */
/*0218*/ @P5 FADD R15, R8, R8; /* 0xe2c000000414203e */
/*0220*/ ISETP.LT.AND P3, PT, R7, c[0x0][0x160], PT; /* 0x5b181c002c1c1c7e */
/*0228*/ @P5 FADD R13, R11, R13; /* 0xe2c0000006942c36 */
/*0230*/ PSETP.AND.AND P4, PT, P2, P3, PT; /* 0x84801c03001c809e */
/*0238*/ @P5 FFMA R9, R9, R15, R6; /* 0xcc00180007942426 */
/* 0x08a0a0100000b810 */
/*0248*/ @P5 MOV R8, R13; /* 0xe4c03c0006940022 */
/*0250*/ @P4 BRA 0x1b8; /* 0x12007fffb010003c */
/* Store R7 only when the column R12 is below the limit R3; the 64-bit address is */
/* c[0x0][0x168]/c[0x0][0x16c] + sign-extended ((R14 + R5 + (TID.X & 0x4000001f)) << 2). */
/*0258*/ ISETP.GE.AND.S P1, PT, R12, R3, PT; /* 0xdb681c0001dc303e */
/*0260*/ @P1 BRA.U 0x2b0; /* 0x120000002404023c */
/*0268*/ @!P1 LOP32I.AND R9, R10, 0x4000001f; /* 0x202000000fa42824 */
/*0270*/ @!P1 IADD R8, R14, R5; /* 0xe080000002a43822 */
/*0278*/ @!P1 IADD R8, R8, R9; /* 0xe080000004a42022 */
/* 0x08b0a000a0b010a0 */
/*0288*/ @!P1 SHF.L R8, RZ, 0x2, R8; /* 0xb7c020000127fc21 */
/*0290*/ @!P1 BFE R9, R8, 0x11f; /* 0xc00800008fa42025 */
/*0298*/ @!P1 IADD R8.CC, R8, c[0x0][0x168]; /* 0x608400002d242022 */
/*02a0*/ @!P1 IADD.X R9, R9, c[0x0][0x16c]; /* 0x608040002da42426 */
/*02a8*/ @!P1 ST.E [R8], R7; /* 0xe48000000024201c */
/* Advance the column base by 0x20 and repeat while it is below R3. */
/*02b0*/ IADD R14, R14, 0x20; /* 0xc0800000101c3839 */
/*02b8*/ ISETP.LT.AND P1, PT, R14, R3, PT; /* 0xdb181c00019c383e */
/* 0x0880b0a0a0a0b8b8 */
/*02c8*/ @P1 BRA 0x150; /* 0x12007fff4004003c */
/*02d0*/ BRA 0x360; /* 0x12000000441c003c */
/* 0x02d8: column loop taken when P0 is false; same addressing as above, but stores RZ (zero). */
/*02d8*/ S2R R7, SR_TID.X; /* 0x86400000109c001e */
/*02e0*/ LOP.AND R6, R7, 0x1f; /* 0xc20000000f9c1c19 */
/*02e8*/ IADD R6, R6, R8; /* 0xe0800000041c181a */
/*02f0*/ ISETP.LT.AND P1, PT, R6, R3, PT; /* 0xdb181c00019c183e */
/*02f8*/ @P1 LOP32I.AND R7, R7, 0x4000001f; /* 0x202000000f841c1c */
/* 0x08a0b010a0a0a010 */
/*0308*/ @P1 IADD R6, R8, R5; /* 0xe08000000284201a */
/*0310*/ IADD R8, R8, 0x20; /* 0xc0800000101c2021 */
/*0318*/ @P1 IADD R6, R6, R7; /* 0xe08000000384181a */
/*0320*/ @P1 SHF.L R6, RZ, 0x2, R6; /* 0xb7c018000107fc19 */
/*0328*/ @P1 BFE R7, R6, 0x11f; /* 0xc00800008f84181d */
/*0330*/ @P1 IADD R6.CC, R6, c[0x0][0x168]; /* 0x608400002d04181a */
/*0338*/ @P1 IADD.X R7, R7, c[0x0][0x16c]; /* 0x608040002d841c1e */
/* 0x0880b8b000b8b0c8 */
/*0348*/ @P1 ST.E [R6], RZ; /* 0xe480000000041bfc */
/*0350*/ ISETP.LT.AND P1, PT, R8, R3, PT; /* 0xdb181c00019c203e */
/*0358*/ @P1 BRA 0x2d8; /* 0x12007fffbc04003c */
/* 0x0360: next row; loop back to 0xe8 until R0 equals the row end in R4. */
/*0360*/ IADD.S R0, R0, 0x1; /* 0xc080000000dc0001 */
/*0368*/ ISETP.EQ.AND P1, PT, R0, R4, PT; /* 0xdb281c00021c003e */
/*0370*/ @!P1 BRA 0xe8; /* 0x12007ffeb824003c */
/*0378*/ MOV RZ, RZ; /* 0xe4c03c007f9c03fe */
/* 0x08000000000000b8 */
/*0388*/ EXIT ; /* 0x18000000001c003c */
/* Everything after the unconditional EXIT is a self-branch plus NOPs (presumably padding). */
/*0390*/ BRA 0x390; /* 0x12007ffffc1c003c */
/*0398*/ NOP; /* 0x85800000001c3c02 */
/*03a0*/ NOP; /* 0x85800000001c3c02 */
/*03a8*/ NOP; /* 0x85800000001c3c02 */
/*03b0*/ NOP; /* 0x85800000001c3c02 */
/*03b8*/ NOP; /* 0x85800000001c3c02 */
....................................

View File

@@ -1,127 +0,0 @@
/*
 * Disassembly listing (sm_35) of kernel mandelbrot_scanline.
 * Each row is: byte offset, instruction, 64-bit encoding. The lone hex words
 * between instruction groups carry no instruction text (presumably scheduling
 * control words -- TODO confirm).
 *
 * Outline, as read from the instructions below:
 *   - R0 walks rows from CTAID.Y * c[0x0][0x15c] up to
 *     min((CTAID.Y + 1) * c[0x0][0x15c], c[0x0][0x154]) (bound kept in R4).
 *   - Columns start at CTAID.X * c[0x0][0x158], advance by 0x20 per pass and are
 *     limited by R3 = min((CTAID.X + 1) * c[0x0][0x158], c[0x0][0x150]);
 *     each thread adds (TID.X & 0x1f) to the column base.
 *   - Per point, counter R7 is incremented until the FSETP.GTU test against 4
 *     succeeds or R7 reaches c[0x0][0x160]; R7 is then stored as a 32-bit word at
 *     c[0x0][0x168]/c[0x0][0x16c] + 4 * (row * c[0x0][0x150] + column).
 *   - When c[0x0][0x160] <= 0 the separate loop at 0x2d8 stores zero instead.
 * NOTE(review): the c[0x0][0x140..0x16c] slots are presumably the kernel
 * parameters -- confirm against the matching PTX.
 */
code for sm_35
Function : mandelbrot_scanline
.headerflags @"EF_CUDA_SM35 EF_CUDA_PTX_SM(EF_CUDA_SM35)"
/* 0x08a0b010a0a01000 */
/* Entry: R0 = first row, R3 = clamped row limit; leave early when the row range is empty. */
/*0008*/ MOV R1, c[0x0][0x44]; /* 0x64c03c00089c0006 */
/*0010*/ S2R R2, SR_CTAID.Y; /* 0x86400000131c000a */
/*0018*/ MOV R3, c[0x0][0x15c]; /* 0x64c03c002b9c000e */
/*0020*/ IMAD R3, R2, c[0x0][0x15c], R3; /* 0x51080c002b9c080e */
/*0028*/ ISETP.LT.AND P0, PT, R3, c[0x0][0x154], PT; /* 0x5b181c002a9c0c1e */
/*0030*/ IMUL R0, R2, c[0x0][0x15c]; /* 0x61c018002b9c0802 */
/*0038*/ SEL R3, R3, c[0x0][0x154], P0; /* 0x650000002a9c0c0e */
/* 0x089c8010a01000b0 */
/*0048*/ ISETP.GE.AND P0, PT, R0, R3, PT; /* 0xdb681c00019c001e */
/*0050*/ @P0 EXIT ; /* 0x180000000000003c */
/* Loop-invariant setup: R4 = row loop end (built from complemented operands, then complemented back), */
/* R2 = first column, R3 = clamped column limit, P0 = (0 < c[0x0][0x160]). */
/*0058*/ IADD R2, R2, 0x1; /* 0xc0800000009c0809 */
/*0060*/ MOV R3, c[0x0][0x158]; /* 0x64c03c002b1c000e */
/*0068*/ IMUL R5, R2, c[0x0][0x15c]; /* 0x61c018002b9c0816 */
/*0070*/ LOP.PASS_B R4, RZ, ~c[0x0][0x154]; /* 0x620038002a9ffc12 */
/*0078*/ S2R R2, SR_CTAID.X; /* 0x86400000129c000a */
/* 0x08ac80109c108010 */
/*0088*/ LOP.PASS_B R7, RZ, ~R5; /* 0xe2003800029ffc1e */
/*0090*/ LOP.PASS_B R6, RZ, ~c[0x0][0x154]; /* 0x620038002a9ffc1a */
/*0098*/ LOP.PASS_B R5, RZ, ~R5; /* 0xe2003800029ffc16 */
/*00a0*/ IMAD R3, R2, c[0x0][0x158], R3; /* 0x51080c002b1c080e */
/*00a8*/ ISETP.GT.AND P0, PT, R4, R7, PT; /* 0xdb481c00039c101e */
/*00b0*/ IMUL R2, R2, c[0x0][0x158]; /* 0x61c018002b1c080a */
/*00b8*/ ISETP.LT.AND P1, PT, R3, c[0x0][0x150], PT; /* 0x5b181c002a1c0c3e */
/* 0x0800b010008010a0 */
/*00c8*/ SEL R4, R5, R6, !P0; /* 0xe5002000031c1412 */
/*00d0*/ ISETP.LT.AND P0, PT, RZ, c[0x0][0x160], PT; /* 0x5b181c002c1ffc1e */
/*00d8*/ LOP.PASS_B R4, RZ, ~R4; /* 0xe2003800021ffc12 */
/*00e0*/ SEL R3, R3, c[0x0][0x150], P1; /* 0x650004002a1c0c0e */
/* 0x00e8: row loop head; an empty column range jumps straight to the row increment at 0x360. */
/*00e8*/ ISETP.GE.AND P1, PT, R2, R3, PT; /* 0xdb681c00019c083e */
/*00f0*/ SSY 0x368; /* 0x1480000138000000 */
/*00f8*/ @P1 BRA 0x360; /* 0x120000013004003c */
/* 0x089c108010001080 */
/* R5 = row * c[0x0][0x150] (row offset into the output); !P0 selects the zero-store loop at 0x2d8. */
/*0108*/ IMUL R5, R0, c[0x0][0x150]; /* 0x61c018002a1c0016 */
/*0110*/ MOV R8, R2; /* 0xe4c03c00011c0022 */
/*0118*/ @!P0 BRA 0x2d8; /* 0x12000000dc20003c */
/* R6 = float(row) * c[0x0][0x14c] + c[0x0][0x148]; R14 = column base for this row. */
/*0120*/ I2F.F32.S32 R6, R0; /* 0xe5c00000001ca81a */
/*0128*/ MOV R7, c[0x0][0x148]; /* 0x64c03c00291c001e */
/*0130*/ MOV R14, R2; /* 0xe4c03c00011c003a */
/*0138*/ MOV R16, c[0x0][0x140]; /* 0x64c03c00281c0042 */
/* 0x089c80a010a01000 */
/*0148*/ FFMA R6, R6, c[0x0][0x14c], R7; /* 0x4c001c00299c181a */
/* 0x0150: column loop head. R12 = column base + (TID.X & 0x1f); */
/* R11 = float(R12) * c[0x0][0x144] + c[0x0][0x140]; R7 = iteration counter, reset to 0. */
/*0150*/ S2R R10, SR_TID.X; /* 0x86400000109c002a */
/*0158*/ MOV R9, R6; /* 0xe4c03c00031c0026 */
/*0160*/ LOP.AND R7, R10, 0x1f; /* 0xc20000000f9c281d */
/*0168*/ PSETP.AND.AND P2, PT, PT, PT, PT; /* 0x84801c07001dc05e */
/*0170*/ IADD R12, R7, R14; /* 0xe0800000071c1c32 */
/*0178*/ PSETP.AND.AND P3, PT, P0, PT, PT; /* 0x84801c07001c007e */
/* 0x08a00010a010a010 */
/*0188*/ I2F.F32.S32 R7, R12; /* 0xe5c00000061ca81e */
/*0190*/ PSETP.AND.AND P1, PT, !PT, PT, PT; /* 0x84801c07001fc03e */
/*0198*/ FFMA R11, R7, c[0x0][0x144], R16; /* 0x4c004000289c1c2e */
/*01a0*/ SSY 0x260; /* 0x148000005c000000 */
/*01a8*/ MOV R7, RZ; /* 0xe4c03c007f9c001e */
/*01b0*/ MOV R8, R11; /* 0xe4c03c00059c0022 */
/* 0x01b8: iteration loop. R13 = R8*R8 + R9*R9 is tested against 4; while P5 holds, */
/* R8 <- R11 + (R8*R8 - R9*R9) and R9 <- R9 * (R8 + R8) + R6, and R7 is incremented under P2. */
/*01b8*/ FMUL R15, R8, R8; /* 0xe3400000041c203e */
/* 0x08b0b0ac80b0a010 */
/*01c8*/ PSETP.AND.AND P3, PT, P2, P3, PT; /* 0x84801c03001c807e */
/*01d0*/ FFMA R13, R9, R9, R15; /* 0xcc003c00049c2436 */
/*01d8*/ FSETP.GTU.AND P2, PT, R13, 4, PT; /* 0xb5e01e04001c345d */
/*01e0*/ PSETP.AND.OR P1, PT, P3, P2, P1; /* 0x84810402001cc03e */
/*01e8*/ PSETP.AND.AND P2, PT, !PT, PT, PT; /* 0x84801c07001fc05e */
/*01f0*/ PSETP.XOR.AND P5, PT, P1, P3, PT; /* 0x84801c03101c40be */
/*01f8*/ @P5 PSETP.AND.AND P2, PT, P3, !P1, PT; /* 0x84801c090014c05e */
/* 0x08ac8010b09c1080 */
/*0208*/ @P2 IADD R7, R7, 0x1; /* 0xc080000000881c1d */
/*0210*/ @P5 FFMA R13, -R9, R9, R15; /* 0xcc083c0004942436 */
/*0218*/ @P5 FADD R15, R8, R8; /* 0xe2c000000414203e */
/*0220*/ ISETP.LT.AND P3, PT, R7, c[0x0][0x160], PT; /* 0x5b181c002c1c1c7e */
/*0228*/ @P5 FADD R13, R11, R13; /* 0xe2c0000006942c36 */
/*0230*/ PSETP.AND.AND P4, PT, P2, P3, PT; /* 0x84801c03001c809e */
/*0238*/ @P5 FFMA R9, R9, R15, R6; /* 0xcc00180007942426 */
/* 0x08a0a0100000b810 */
/*0248*/ @P5 MOV R8, R13; /* 0xe4c03c0006940022 */
/*0250*/ @P4 BRA 0x1b8; /* 0x12007fffb010003c */
/* Store R7 only when the column R12 is below the limit R3; the 64-bit address is */
/* c[0x0][0x168]/c[0x0][0x16c] + sign-extended ((R14 + R5 + (TID.X & 0x4000001f)) << 2). */
/*0258*/ ISETP.GE.AND.S P1, PT, R12, R3, PT; /* 0xdb681c0001dc303e */
/*0260*/ @P1 BRA.U 0x2b0; /* 0x120000002404023c */
/*0268*/ @!P1 LOP32I.AND R9, R10, 0x4000001f; /* 0x202000000fa42824 */
/*0270*/ @!P1 IADD R8, R14, R5; /* 0xe080000002a43822 */
/*0278*/ @!P1 IADD R8, R8, R9; /* 0xe080000004a42022 */
/* 0x08b0a000a0b010a0 */
/*0288*/ @!P1 SHF.L R8, RZ, 0x2, R8; /* 0xb7c020000127fc21 */
/*0290*/ @!P1 BFE R9, R8, 0x11f; /* 0xc00800008fa42025 */
/*0298*/ @!P1 IADD R8.CC, R8, c[0x0][0x168]; /* 0x608400002d242022 */
/*02a0*/ @!P1 IADD.X R9, R9, c[0x0][0x16c]; /* 0x608040002da42426 */
/*02a8*/ @!P1 ST.E [R8], R7; /* 0xe48000000024201c */
/* Advance the column base by 0x20 and repeat while it is below R3. */
/*02b0*/ IADD R14, R14, 0x20; /* 0xc0800000101c3839 */
/*02b8*/ ISETP.LT.AND P1, PT, R14, R3, PT; /* 0xdb181c00019c383e */
/* 0x0880b0a0a0a0b8b8 */
/*02c8*/ @P1 BRA 0x150; /* 0x12007fff4004003c */
/*02d0*/ BRA 0x360; /* 0x12000000441c003c */
/* 0x02d8: column loop taken when P0 is false; same addressing as above, but stores RZ (zero). */
/*02d8*/ S2R R7, SR_TID.X; /* 0x86400000109c001e */
/*02e0*/ LOP.AND R6, R7, 0x1f; /* 0xc20000000f9c1c19 */
/*02e8*/ IADD R6, R6, R8; /* 0xe0800000041c181a */
/*02f0*/ ISETP.LT.AND P1, PT, R6, R3, PT; /* 0xdb181c00019c183e */
/*02f8*/ @P1 LOP32I.AND R7, R7, 0x4000001f; /* 0x202000000f841c1c */
/* 0x08a0b010a0a0a010 */
/*0308*/ @P1 IADD R6, R8, R5; /* 0xe08000000284201a */
/*0310*/ IADD R8, R8, 0x20; /* 0xc0800000101c2021 */
/*0318*/ @P1 IADD R6, R6, R7; /* 0xe08000000384181a */
/*0320*/ @P1 SHF.L R6, RZ, 0x2, R6; /* 0xb7c018000107fc19 */
/*0328*/ @P1 BFE R7, R6, 0x11f; /* 0xc00800008f84181d */
/*0330*/ @P1 IADD R6.CC, R6, c[0x0][0x168]; /* 0x608400002d04181a */
/*0338*/ @P1 IADD.X R7, R7, c[0x0][0x16c]; /* 0x608040002d841c1e */
/* 0x0880b8b000b8b0c8 */
/*0348*/ @P1 ST.E [R6], RZ; /* 0xe480000000041bfc */
/*0350*/ ISETP.LT.AND P1, PT, R8, R3, PT; /* 0xdb181c00019c203e */
/*0358*/ @P1 BRA 0x2d8; /* 0x12007fffbc04003c */
/* 0x0360: next row; loop back to 0xe8 until R0 equals the row end in R4. */
/*0360*/ IADD.S R0, R0, 0x1; /* 0xc080000000dc0001 */
/*0368*/ ISETP.EQ.AND P1, PT, R0, R4, PT; /* 0xdb281c00021c003e */
/*0370*/ @!P1 BRA 0xe8; /* 0x12007ffeb824003c */
/*0378*/ MOV RZ, RZ; /* 0xe4c03c007f9c03fe */
/* 0x08000000000000b8 */
/*0388*/ EXIT ; /* 0x18000000001c003c */
/* Everything after the unconditional EXIT is a self-branch plus NOPs (presumably padding). */
/*0390*/ BRA 0x390; /* 0x12007ffffc1c003c */
/*0398*/ NOP; /* 0x85800000001c3c02 */
/*03a0*/ NOP; /* 0x85800000001c3c02 */
/*03a8*/ NOP; /* 0x85800000001c3c02 */
/*03b0*/ NOP; /* 0x85800000001c3c02 */
/*03b8*/ NOP; /* 0x85800000001c3c02 */
....................................

View File

@@ -1,79 +0,0 @@
/*
 * Disassembly listing (sm_35) of _Z19mandelbrot_scanlineffffiiiiiPi, i.e. the
 * C++-mangled form of mandelbrot_scanline(float, float, float, float,
 * int, int, int, int, int, int*).
 * Each row is: byte offset, instruction, 64-bit encoding. The lone hex words
 * between instruction groups carry no instruction text (presumably scheduling
 * control words -- TODO confirm).
 *
 * Outline, as read from the instructions below:
 *   - R2 walks rows from CTAID.Y * c[0x0][0x15c] while R2 < R11, where R11 is the
 *     IMNMX of (first row + c[0x0][0x15c]) and c[0x0][0x154] (presumably a minimum).
 *   - R3 walks columns from CTAID.X * c[0x0][0x158] in steps of 0x20 while R3 < R0,
 *     where R0 is the IMNMX of ((CTAID.X + 1) * c[0x0][0x158]) and c[0x0][0x150].
 *   - Per point, counter R6 is incremented until the sum of squares exceeds 4 or
 *     R6 reaches c[0x0][0x160]; the result is stored at
 *     c[0x0][0x168]/c[0x0][0x16c] + 4 * (row * c[0x0][0x150] + column + lane).
 * NOTE(review): the c[0x0][0x140..0x16c] slots are presumably the ten kernel
 * arguments in declaration order -- confirm.
 */
code for sm_35
Function : _Z19mandelbrot_scanlineffffiiiiiPi
.headerflags @"EF_CUDA_SM35 EF_CUDA_PTX_SM(EF_CUDA_SM35)"
/* 0x0880a010a0a01000 */
/* Entry: R2 = first row, R11 = row limit, R0 = column limit; exit when the row range is empty. */
/*0008*/ MOV R1, c[0x0][0x44]; /* 0x64c03c00089c0006 */
/*0010*/ S2R R0, SR_CTAID.Y; /* 0x86400000131c0002 */
/*0018*/ MOV R4, c[0x0][0x158]; /* 0x64c03c002b1c0012 */
/*0020*/ IMUL R2, R0, c[0x0][0x15c]; /* 0x61c018002b9c000a */
/*0028*/ IADD R0, R2, c[0x0][0x15c]; /* 0x608000002b9c0802 */
/*0030*/ S2R R9, SR_CTAID.X; /* 0x86400000129c0026 */
/*0038*/ IMNMX R11, R0, c[0x0][0x154], PT; /* 0x61081c002a9c002e */
/* 0x08b0a0100010b09c */
/*0048*/ IMAD R0, R9, c[0x0][0x158], R4; /* 0x510810002b1c2402 */
/*0050*/ ISETP.GE.AND P0, PT, R2, R11, PT; /* 0xdb681c00059c081e */
/*0058*/ IMNMX R0, R0, c[0x0][0x150], PT; /* 0x61081c002a1c0002 */
/*0060*/ @P0 EXIT ; /* 0x180000000000003c */
/* 0x0068: row loop head. R3 = first column; an empty column range skips to 0x1f0. */
/*0068*/ IMUL R3, R9, c[0x0][0x158]; /* 0x61c018002b1c240e */
/*0070*/ SSY 0x1f8; /* 0x14800000c0000000 */
/*0078*/ ISETP.GE.AND P0, PT, R3, R0, PT; /* 0xdb681c00001c0c1e */
/* 0x08a0100010a01000 */
/*0088*/ @P0 BRA 0x1f0; /* 0x12000000b000003c */
/* R4 = float(row) * c[0x0][0x14c] + c[0x0][0x148]. */
/*0090*/ I2F.F32.S32 R4, R2; /* 0xe5c00000011ca812 */
/*0098*/ MOV R5, c[0x0][0x148]; /* 0x64c03c00291c0016 */
/*00a0*/ MOV R16, c[0x0][0x140]; /* 0x64c03c00281c0042 */
/*00a8*/ FFMA R4, R4, c[0x0][0x14c], R5; /* 0x4c001400299c1012 */
/* 0x00b0: column loop head. R10 = TID.X & 0x1f, R12 = R10 + R3 (converted as unsigned), */
/* R5 = float(R12) * c[0x0][0x144] + c[0x0][0x140], R6 = iteration counter reset to 0. */
/*00b0*/ S2R R5, SR_TID.X; /* 0x86400000109c0016 */
/*00b8*/ MOV R6, RZ; /* 0xe4c03c007f9c001a */
/* 0x08800010a0a0a010 */
/*00c8*/ LOP.AND R10, R5, 0x1f; /* 0xc20000000f9c1429 */
/*00d0*/ ISETP.LT.AND P0, PT, RZ, c[0x0][0x160], PT; /* 0x5b181c002c1ffc1e */
/*00d8*/ IADD R12, R10, R3; /* 0xe0800000019c2832 */
/*00e0*/ I2F.F32.U32 R5, R12; /* 0xe5c00000061c2816 */
/*00e8*/ FFMA R5, R5, c[0x0][0x144], R16; /* 0x4c004000289c1416 */
/* Skip the iteration loop entirely when c[0x0][0x160] <= 0 (R6 stays 0). */
/*00f0*/ @!P0 BRA 0x190; /* 0x120000004c20003c */
/*00f8*/ MOV R7, R4; /* 0xe4c03c00021c001e */
/* 0x0800b0a0a0100010 */
/*0108*/ MOV R8, R5; /* 0xe4c03c00029c0022 */
/*0110*/ PBK 0x190; /* 0x150000003c000000 */
/* 0x0118: iteration loop. Break when R8*R8 + R7*R7 > 4 or when R6 reaches c[0x0][0x160]; */
/* otherwise R7 <- (R8 + R8) * R7 + R4 and R8 <- R5 + (R8*R8 - R7*R7), using the old values. */
/*0118*/ FMUL R13, R7, R7; /* 0xe3400000039c1c36 */
/*0120*/ FMUL R14, R8, R8; /* 0xe3400000041c203a */
/*0128*/ FADD R15, R14, R13; /* 0xe2c00000069c383e */
/*0130*/ FSETP.GT.AND P0, PT, R15, 4, PT; /* 0xb5a01e04001c3c1d */
/*0138*/ @P0 BRK ; /* 0x1a0000000000003c */
/* 0x080010ac809c8010 */
/*0148*/ IADD R6, R6, 0x1; /* 0xc0800000009c1819 */
/*0150*/ FADD R8, R8, R8; /* 0xe2c00000041c2022 */
/*0158*/ FADD R14, R14, -R13; /* 0xe2c10000069c383a */
/*0160*/ ISETP.LT.AND P0, PT, R6, c[0x0][0x160], PT; /* 0x5b181c002c1c181e */
/*0168*/ FFMA R7, R8, R7, R4; /* 0xcc001000039c201e */
/*0170*/ FADD R8, R5, R14; /* 0xe2c00000071c1422 */
/*0178*/ @!P0 BRK ; /* 0x1a0000000020003c */
/* 0x08b0a00010ac80b8 */
/*0188*/ BRA 0x118; /* 0x12007fffc41c003c */
/* 0x0190: store R6 when R12 < R0 (unsigned compare); element index R5 = row * c[0x0][0x150] + R3 + R10, */
/* address = index * 4 + c[0x0][0x168]/c[0x0][0x16c] as a 64-bit IMAD pair. */
/*0190*/ ISETP.GE.U32.AND P0, PT, R12, R0, PT; /* 0xdb601c00001c301e */
/*0198*/ IMAD R5, R2, c[0x0][0x150], R3; /* 0x51080c002a1c0816 */
/*01a0*/ IADD R5, R5, R10; /* 0xe0800000051c1416 */
/*01a8*/ @P0 BRA.U 0x1d8; /* 0x120000001400023c */
/*01b0*/ @!P0 MOV32I R8, 0x4; /* 0x740000000223c022 */
/*01b8*/ @!P0 IMAD R12.CC, R5, R8, c[0x0][0x168]; /* 0x910c20002d201432 */
/* 0x08b000b8b0a000a0 */
/*01c8*/ @!P0 IMAD.HI.X R13, R5, R8, c[0x0][0x16c]; /* 0x931820002da01436 */
/*01d0*/ @!P0 ST.E [R12], R6; /* 0xe480000000203018 */
/* Advance the column base by 0x20 and repeat while it is below R0. */
/*01d8*/ IADD R3, R3, 0x20; /* 0xc0800000101c0c0d */
/*01e0*/ ISETP.LT.AND P0, PT, R3, R0, PT; /* 0xdb181c00001c0c1e */
/*01e8*/ @P0 BRA 0xb0; /* 0x12007fff6000003c */
/* 0x01f0: next row; loop back to 0x68 while R2 < R11. */
/*01f0*/ IADD.S R2, R2, 0x1; /* 0xc080000000dc0809 */
/*01f8*/ ISETP.LT.AND P0, PT, R2, R11, PT; /* 0xdb181c00059c081e */
/* 0x0800000000b810b8 */
/*0208*/ @P0 BRA 0x68; /* 0x12007fff2c00003c */
/*0210*/ MOV RZ, RZ; /* 0xe4c03c007f9c03fe */
/*0218*/ EXIT ; /* 0x18000000001c003c */
/* Everything after the unconditional EXIT is a self-branch plus NOPs (presumably padding). */
/*0220*/ BRA 0x220; /* 0x12007ffffc1c003c */
/*0228*/ NOP; /* 0x85800000001c3c02 */
/*0230*/ NOP; /* 0x85800000001c3c02 */
/*0238*/ NOP; /* 0x85800000001c3c02 */
...................................................

View File

@@ -1,111 +0,0 @@
/*
 * Disassembly listing (sm_35) of kernel mandelbrot_scanline (variant that uses
 * IMNMX for the range clamps).
 * Each row is: byte offset, instruction, 64-bit encoding. The lone hex words
 * between instruction groups carry no instruction text (presumably scheduling
 * control words -- TODO confirm).
 *
 * Outline, as read from the instructions below:
 *   - R0 walks rows starting at CTAID.Y * c[0x0][0x15c] until it equals R4, which
 *     is built from c[0x0][0x154] and (CTAID.Y + 1) * c[0x0][0x15c] via
 *     complement / IMNMX / complement.
 *   - R5 walks columns from CTAID.X * c[0x0][0x158] in steps of 0x20 while below
 *     R3 (the IMNMX of (CTAID.X + 1) * c[0x0][0x158] and c[0x0][0x150]).
 *   - With c[0x0][0x160] > 0, counter R7 is advanced by the loop at 0x1f8 and then
 *     stored; otherwise the loop at 0xf0 stores zero.
 *   - Stores go to c[0x0][0x168]/c[0x0][0x16c] + 4 * (row * c[0x0][0x150] + column).
 * NOTE(review): the c[0x0][0x140..0x16c] slots are presumably the kernel
 * parameters -- confirm against the matching PTX.
 */
code for sm_35
Function : mandelbrot_scanline
.headerflags @"EF_CUDA_SM35 EF_CUDA_PTX_SM(EF_CUDA_SM35)"
/* 0x0880a010a0a01000 */
/* Entry: R0 = first row, R5 = clamped row limit, R3 = clamped column limit; exit on an empty row range. */
/*0008*/ MOV R1, c[0x0][0x44]; /* 0x64c03c00089c0006 */
/*0010*/ S2R R4, SR_CTAID.Y; /* 0x86400000131c0012 */
/*0018*/ MOV R6, c[0x0][0x158]; /* 0x64c03c002b1c001a */
/*0020*/ IMUL R0, R4, c[0x0][0x15c]; /* 0x61c018002b9c1002 */
/*0028*/ IADD R3, R0, c[0x0][0x15c]; /* 0x608000002b9c000e */
/*0030*/ S2R R2, SR_CTAID.X; /* 0x86400000129c000a */
/*0038*/ IMNMX R5, R3, c[0x0][0x154], PT; /* 0x61081c002a9c0c16 */
/* 0x08a010a000b010a0 */
/*0048*/ IMAD R3, R2, c[0x0][0x158], R6; /* 0x510818002b1c080e */
/*0050*/ ISETP.GE.AND P0, PT, R0, R5, PT; /* 0xdb681c00029c001e */
/*0058*/ IMNMX R3, R3, c[0x0][0x150], PT; /* 0x61081c002a1c0c0e */
/*0060*/ @P0 EXIT ; /* 0x180000000000003c */
/* R4 = row loop end value, computed on complemented operands and complemented back. */
/*0068*/ IADD R4, R4, 0x1; /* 0xc0800000009c1011 */
/*0070*/ IMUL R5, R4, c[0x0][0x15c]; /* 0x61c018002b9c1016 */
/*0078*/ LOP.PASS_B R4, RZ, ~c[0x0][0x154]; /* 0x620038002a9ffc12 */
/* 0x0800b0a01000a0a0 */
/*0088*/ LOP.PASS_B R5, RZ, ~R5; /* 0xe2003800029ffc16 */
/*0090*/ IMNMX R4, R4, R5, !PT; /* 0xe1083c00029c1012 */
/*0098*/ LOP.PASS_B R4, RZ, ~R4; /* 0xe2003800021ffc12 */
/* 0x00a0: row loop head. R5 = first column; an empty column range skips to 0x310. */
/*00a0*/ IMUL R5, R2, c[0x0][0x158]; /* 0x61c018002b1c0816 */
/*00a8*/ SSY 0x318; /* 0x1480000134000000 */
/*00b0*/ ISETP.GE.AND P0, PT, R5, R3, PT; /* 0xdb681c00019c141e */
/*00b8*/ @P0 BRA 0x310; /* 0x120000012800003c */
/* 0x08a0a00010ac8010 */
/* P0 = (0 < c[0x0][0x160]); R6 = float(row) * c[0x0][0x14c] + c[0x0][0x148]. */
/* P0 true branches to the iterating path; the fall-through loop below stores zero. */
/*00c8*/ ISETP.LT.AND P0, PT, RZ, c[0x0][0x160], PT; /* 0x5b181c002c1ffc1e */
/*00d0*/ I2F.F32.S32 R6, R0; /* 0xe5c00000001ca81a */
/*00d8*/ MOV R7, c[0x0][0x148]; /* 0x64c03c00291c001e */
/*00e0*/ FFMA R6, R6, c[0x0][0x14c], R7; /* 0x4c001c00299c181a */
/*00e8*/ @P0 BRA 0x180; /* 0x120000004800003c */
/* 0x00f0: zero-store column loop; each thread handles column R5 + (TID.X & 0x1f) when it is below R3. */
/*00f0*/ S2R R7, SR_TID.X; /* 0x86400000109c001e */
/*00f8*/ LOP.AND R6, R7, 0x1f; /* 0xc20000000f9c1c19 */
/* 0x08a010a0a080b0a0 */
/*0108*/ IADD R6, R6, R5; /* 0xe0800000029c181a */
/*0110*/ ISETP.GE.AND P0, PT, R6, R3, PT; /* 0xdb681c00019c181e */
/*0118*/ @!P0 LOP32I.AND R7, R7, 0x4000001f; /* 0x202000000fa01c1c */
/*0120*/ @!P0 IMAD R6, R0, c[0x0][0x150], R5; /* 0x510814002a20001a */
/*0128*/ @!P0 IADD R6, R6, R7; /* 0xe080000003a0181a */
/*0130*/ @!P0 SHF.L R6, RZ, 0x2, R6; /* 0xb7c018000123fc19 */
/*0138*/ IADD R5, R5, 0x20; /* 0xc0800000101c1415 */
/* 0x08b8b8b0c8a0b010 */
/*0148*/ @!P0 BFE R7, R6, 0x11f; /* 0xc00800008fa0181d */
/*0150*/ @!P0 IADD R6.CC, R6, c[0x0][0x168]; /* 0x608400002d20181a */
/*0158*/ @!P0 IADD.X R7, R7, c[0x0][0x16c]; /* 0x608040002da01c1e */
/*0160*/ @!P0 ST.E [R6], RZ; /* 0xe480000000201bfc */
/*0168*/ ISETP.LT.AND P0, PT, R5, R3, PT; /* 0xdb181c00019c141e */
/*0170*/ @P0 BRA 0xf0; /* 0x12007fffbc00003c */
/*0178*/ BRA 0x310; /* 0x12000000c81c003c */
/* 0x08a0a0a010a01000 */
/* Iterating path (target of the branch to 0x180); the column loop itself re-enters at 0x190. */
/*0188*/ MOV R16, c[0x0][0x140]; /* 0x64c03c00281c0042 */
/* 0x0190: column loop head. R12 = (TID.X & 0x1f) + R5, R11 = float(R12) * c[0x0][0x144] + c[0x0][0x140], */
/* R7 = iteration counter reset to 0, R8/R9 start from R6/R11. */
/*0190*/ S2R R10, SR_TID.X; /* 0x86400000109c002a */
/*0198*/ SSY 0x2a0; /* 0x1480000080000000 */
/*01a0*/ LOP.AND R8, R10, 0x1f; /* 0xc20000000f9c2821 */
/*01a8*/ PSETP.AND.AND P2, PT, PT, PT, PT; /* 0x84801c07001dc05e */
/*01b0*/ IADD R12, R8, R5; /* 0xe0800000029c2032 */
/*01b8*/ I2F.F32.S32 R7, R12; /* 0xe5c00000061ca81e */
/* 0x0880009880108010 */
/*01c8*/ PSETP.AND.AND P3, PT, P0, PT, PT; /* 0x84801c07001c007e */
/*01d0*/ FFMA R11, R7, c[0x0][0x144], R16; /* 0x4c004000289c1c2e */
/*01d8*/ PSETP.AND.AND P1, PT, !PT, PT, PT; /* 0x84801c07001fc03e */
/*01e0*/ MOV R7, RZ; /* 0xe4c03c007f9c001e */
/*01e8*/ MOV R8, R6; /* 0xe4c03c00031c0022 */
/*01f0*/ MOV R9, R11; /* 0xe4c03c00059c0026 */
/* 0x01f8: iteration loop. R13 = R8*R8 + R9*R9 is tested against 4; while P5 holds, */
/* R8 <- R8 * (R9 + R9) + R6 and R9 <- R11 + (R9*R9 - R8*R8), and R7 is incremented under P2. */
/*01f8*/ FMUL R14, R9, R9; /* 0xe3400000049c243a */
/* 0x08b0ac80b0a0a010 */
/*0208*/ FMUL R15, R8, R8; /* 0xe3400000041c203e */
/*0210*/ PSETP.AND.AND P3, PT, P2, P3, PT; /* 0x84801c03001c807e */
/*0218*/ FADD R13, R15, R14; /* 0xe2c00000071c3c36 */
/*0220*/ FSETP.GTU.AND P2, PT, R13, 4, PT; /* 0xb5e01e04001c345d */
/*0228*/ PSETP.AND.OR P1, PT, P3, P2, P1; /* 0x84810402001cc03e */
/*0230*/ PSETP.AND.AND P2, PT, !PT, PT, PT; /* 0x84801c07001fc05e */
/*0238*/ PSETP.XOR.AND P5, PT, P1, P3, PT; /* 0x84801c03101c40be */
/* 0x08ac8010b0a010b0 */
/*0248*/ @P5 PSETP.AND.AND P2, PT, P3, !P1, PT; /* 0x84801c090014c05e */
/*0250*/ @P2 IADD R7, R7, 0x1; /* 0xc080000000881c1d */
/*0258*/ @P5 FADD R13, R9, R9; /* 0xe2c0000004942436 */
/*0260*/ ISETP.LT.AND P3, PT, R7, c[0x0][0x160], PT; /* 0x5b181c002c1c1c7e */
/*0268*/ @P5 FADD R14, R14, -R15; /* 0xe2c100000794383a */
/*0270*/ PSETP.AND.AND P4, PT, P2, P3, PT; /* 0x84801c03001c809e */
/*0278*/ @P5 FFMA R8, R8, R13, R6; /* 0xcc00180006942022 */
/* 0x08a0a0800000b810 */
/*0288*/ @P5 FADD R9, R11, R14; /* 0xe2c0000007142c26 */
/*0290*/ @P4 BRA 0x1f8; /* 0x12007fffb010003c */
/* Store R7 only when column R12 is below R3; the 64-bit address is */
/* c[0x0][0x168]/c[0x0][0x16c] + sign-extended ((R0 * c[0x0][0x150] + R5 + (TID.X & 0x4000001f)) << 2). */
/*0298*/ ISETP.GE.AND.S P1, PT, R12, R3, PT; /* 0xdb681c0001dc303e */
/*02a0*/ @P1 BRA.U 0x2f0; /* 0x120000002404023c */
/*02a8*/ @!P1 LOP32I.AND R9, R10, 0x4000001f; /* 0x202000000fa42824 */
/*02b0*/ @!P1 IMAD R8, R0, c[0x0][0x150], R5; /* 0x510814002a240022 */
/*02b8*/ @!P1 IADD R8, R8, R9; /* 0xe080000004a42022 */
/* 0x08b0a000a0b010a0 */
/*02c8*/ @!P1 SHF.L R8, RZ, 0x2, R8; /* 0xb7c020000127fc21 */
/*02d0*/ @!P1 BFE R9, R8, 0x11f; /* 0xc00800008fa42025 */
/*02d8*/ @!P1 IADD R8.CC, R8, c[0x0][0x168]; /* 0x608400002d242022 */
/*02e0*/ @!P1 IADD.X R9, R9, c[0x0][0x16c]; /* 0x608040002da42426 */
/*02e8*/ @!P1 ST.E [R8], R7; /* 0xe48000000024201c */
/* Advance the column base by 0x20 and repeat while it is below R3. */
/*02f0*/ IADD R5, R5, 0x20; /* 0xc0800000101c1415 */
/*02f8*/ ISETP.LT.AND P1, PT, R5, R3, PT; /* 0xdb181c00019c143e */
/* 0x0800b810b8b000b8 */
/*0308*/ @P1 BRA 0x190; /* 0x12007fff4004003c */
/* 0x0310: next row; loop back to 0xa0 until R0 equals R4. */
/*0310*/ IADD.S R0, R0, 0x1; /* 0xc080000000dc0001 */
/*0318*/ ISETP.NE.AND P0, PT, R0, R4, PT; /* 0xdb581c00021c001e */
/*0320*/ @P0 BRA 0xa0; /* 0x12007ffebc00003c */
/*0328*/ MOV RZ, RZ; /* 0xe4c03c007f9c03fe */
/*0330*/ EXIT ; /* 0x18000000001c003c */
/* Self-branch after the unconditional EXIT (presumably padding). */
/*0338*/ BRA 0x338; /* 0x12007ffffc1c003c */
....................................

View File

@@ -1,186 +0,0 @@
//
// Generated by LLVM NVPTX Back-End
//
.version 3.1
.target sm_35, texmode_independent
.address_size 64
// .globl mandelbrot_scanline
// @mandelbrot_scanline
// Kernel entry mandelbrot_scanline.
// Parameters: param_0..param_3 are f32, param_4..param_8 are u32, and param_9 is a
// 64-bit pointer (4-byte aligned) that receives one u32 result per point.
// Structure, as read from the body:
//   - rows    %r0 run from ctaid.y * param_7 to min((ctaid.y + 1) * param_7, param_5)
//   - columns %r7 run from ctaid.x * param_6 to min((ctaid.x + 1) * param_6, param_4)
//     in steps of WARP_SZ, each thread adding (tid.x & (WARP_SZ - 1))
//   - per point the counter %r10 is advanced by the BB0_11 loop (limit param_8) and
//     stored at param_9 + 4 * (row * param_4 + column)
//   - when param_8 <= 0 the BB0_15 loop stores 0 instead
// NOTE(review): block names such as %mandel___vyfvyfvyi suggest the iteration loop is an
// inlined helper and that predicates %p2..%p5 emulate a per-lane execution mask -- confirm
// against the front-end source.
.entry mandelbrot_scanline(
.param .f32 mandelbrot_scanline_param_0,
.param .f32 mandelbrot_scanline_param_1,
.param .f32 mandelbrot_scanline_param_2,
.param .f32 mandelbrot_scanline_param_3,
.param .u32 mandelbrot_scanline_param_4,
.param .u32 mandelbrot_scanline_param_5,
.param .u32 mandelbrot_scanline_param_6,
.param .u32 mandelbrot_scanline_param_7,
.param .u32 mandelbrot_scanline_param_8,
.param .u64 .ptr .align 4 mandelbrot_scanline_param_9
)
{
.reg .pred %p<396>;
.reg .s16 %rc<396>;
.reg .s16 %rs<396>;
.reg .s32 %r<396>;
.reg .s64 %rl<396>;
.reg .f32 %f<396>;
.reg .f64 %fl<396>;
// BB#0: // %allocas
// %r0 = first row, %r1 = min(first row + param_7, param_5); return when the row range is empty.
ld.param.u32 %r6, [mandelbrot_scanline_param_5];
mov.u32 %r5, %ctaid.y;
ld.param.u32 %r7, [mandelbrot_scanline_param_7];
mul.lo.s32 %r0, %r5, %r7;
mad.lo.s32 %r1, %r5, %r7, %r7;
setp.lt.s32 %p0, %r1, %r6;
selp.b32 %r1, %r1, %r6, %p0;
setp.ge.s32 %p0, %r0, %r1;
@%p0 bra BB0_13;
// BB#1: // %for_test28.preheader.lr.ph
// Loop-invariant setup: %r1 = first column, %r3 = clamped column limit, %r2 = param_4,
// %r4 = param_8, %rl0 = output pointer, %p0 = (param_8 > 0).
ld.param.f32 %f0, [mandelbrot_scanline_param_0];
mov.u32 %r2, %ctaid.x;
ld.param.u32 %r3, [mandelbrot_scanline_param_6];
mul.lo.s32 %r1, %r2, %r3;
ld.param.f32 %f1, [mandelbrot_scanline_param_1];
mad.lo.s32 %r3, %r2, %r3, %r3;
ld.param.f32 %f2, [mandelbrot_scanline_param_2];
ld.param.u32 %r2, [mandelbrot_scanline_param_4];
setp.lt.s32 %p0, %r3, %r2;
ld.param.f32 %f3, [mandelbrot_scanline_param_3];
selp.b32 %r3, %r3, %r2, %p0;
ld.param.u32 %r4, [mandelbrot_scanline_param_8];
ld.param.u64 %rl0, [mandelbrot_scanline_param_9];
setp.gt.s32 %p0, %r4, 0;
// %r5 = ~max(~param_5, ~((ctaid.y + 1) * param_7)), i.e. the smaller of the two: the row loop end.
not.b32 %r6, %r6;
add.s32 %r5, %r5, 1;
mul.lo.s32 %r5, %r5, %r7;
not.b32 %r5, %r5;
setp.gt.s32 %p1, %r6, %r5;
selp.b32 %r5, %r6, %r5, %p1;
not.b32 %r5, %r5;
BB0_2: // %for_test28.preheader
// =>This Loop Header: Depth=1
// Child Loop BB0_15 Depth 2
// Child Loop BB0_8 Depth 2
// Child Loop BB0_11 Depth 3
// Row loop: skip straight to the row increment when the column range is empty.
setp.ge.s32 %p1, %r1, %r3;
@%p1 bra BB0_12;
// BB#3: // %for_loop30.lr.ph
// in Loop: Header=BB0_2 Depth=1
// %r6 = row * param_4 (row offset in the output); %p0 picks the iterating or the zero-store column loop.
mul.lo.s32 %r6, %r0, %r2;
mov.u32 %r7, %r1;
@%p0 bra BB0_4;
bra.uni BB0_15;
BB0_4: // in Loop: Header=BB0_2 Depth=1
// %f4 = float(row) * param_3 + param_2
cvt.rn.f32.s32 %f4, %r0;
fma.rn.f32 %f4, %f4, %f3, %f2;
mov.u32 %r7, %r1;
BB0_8: // %for_loop.i.lr.ph.us
// Parent Loop BB0_2 Depth=1
// => This Loop Header: Depth=2
// Child Loop BB0_11 Depth 3
// %r11 = column base + (tid.x & (WARP_SZ - 1)); %f5 = float(%r11) * param_1 + param_0;
// %r10 = iteration counter, reset to 0; %f7/%f6 start from %f5/%f4.
mov.u32 %r9, %tid.x;
mov.u32 %r8, WARP_SZ;
add.s32 %r10, %r8, -1;
and.b32 %r10, %r10, %r9;
add.s32 %r11, %r10, %r7;
cvt.rn.f32.s32 %f5, %r11;
fma.rn.f32 %f5, %f5, %f1, %f0;
mov.u32 %r10, 0;
mov.pred %p1, 0;
mov.pred %p3, -1;
mov.pred %p4, %p0;
mov.pred %p2, %p1;
mov.f32 %f7, %f5;
mov.f32 %f6, %f4;
BB0_11: // %for_loop.i.us
// Parent Loop BB0_2 Depth=1
// Parent Loop BB0_8 Depth=2
// => This Inner Loop Header: Depth=3
// Test %f6*%f6 + %f7*%f7 against 0f40800000 (4.0); %p2 accumulates "has exceeded",
// %p5 is set while the point is still active and has not exceeded.
and.pred %p4, %p3, %p4;
mul.f32 %f8, %f7, %f7;
fma.rn.f32 %f9, %f6, %f6, %f8;
setp.gtu.f32 %p3, %f9, 0f40800000;
and.pred %p3, %p4, %p3;
or.pred %p2, %p3, %p2;
xor.pred %p5, %p2, %p4;
mov.pred %p3, %p1;
@!%p5 bra BB0_10;
bra.uni BB0_9;
BB0_9: // %not_all_continued_or_breaked.i.us
// in Loop: Header=BB0_11 Depth=3
// Update step: %f7 <- %f5 + (%f7*%f7 - %f6*%f6), %f6 <- %f6 * (2 * %f7) + %f4 (old %f7 on the right).
mul.f32 %f9, %f6, %f6;
not.pred %p3, %p2;
and.pred %p3, %p4, %p3;
sub.f32 %f8, %f8, %f9;
add.f32 %f8, %f5, %f8;
add.f32 %f7, %f7, %f7;
fma.rn.f32 %f6, %f6, %f7, %f4;
mov.f32 %f7, %f8;
BB0_10: // %for_step.i.us
// in Loop: Header=BB0_11 Depth=3
// Increment the counter only under %p3, then continue while %p3 holds and %r10 < param_8.
add.s32 %r12, %r10, 1;
selp.b32 %r10, %r12, %r10, %p3;
setp.lt.s32 %p4, %r10, %r4;
and.pred %p5, %p3, %p4;
@%p5 bra BB0_11;
// BB#5: // %mandel___vyfvyfvyi.exit.us
// in Loop: Header=BB0_8 Depth=2
// Threads whose column is at or past the limit skip the store.
setp.ge.s32 %p1, %r11, %r3;
@%p1 bra BB0_7;
// BB#6: // %if_then.us
// in Loop: Header=BB0_8 Depth=2
// index = column base + row offset + (tid.x & (WARP_SZ + 0x3fffffff)); byte offset = index << 2,
// sign-extended to 64 bits and added to the output pointer.
add.s32 %r11, %r8, 1073741823;
and.b32 %r9, %r11, %r9;
add.s32 %r11, %r7, %r6;
add.s32 %r9, %r11, %r9;
shl.b32 %r9, %r9, 2;
cvt.s64.s32 %rl1, %r9;
add.s64 %rl1, %rl1, %rl0;
st.u32 [%rl1], %r10;
BB0_7: // %if_exit.us
// in Loop: Header=BB0_8 Depth=2
// Advance the column base by WARP_SZ.
add.s32 %r7, %r8, %r7;
setp.lt.s32 %p1, %r7, %r3;
@%p1 bra BB0_8;
bra.uni BB0_12;
BB0_15: // %mandel___vyfvyfvyi.exit
// Parent Loop BB0_2 Depth=1
// => This Inner Loop Header: Depth=2
// Column loop used when param_8 <= 0: same indexing as above, but the stored value is 0.
mov.u32 %r9, %tid.x;
mov.u32 %r8, WARP_SZ;
add.s32 %r10, %r8, -1;
and.b32 %r10, %r10, %r9;
add.s32 %r10, %r10, %r7;
setp.lt.s32 %p1, %r10, %r3;
@%p1 bra BB0_16;
bra.uni BB0_14;
BB0_16: // %if_then
// in Loop: Header=BB0_15 Depth=2
add.s32 %r10, %r8, 1073741823;
and.b32 %r9, %r10, %r9;
add.s32 %r10, %r7, %r6;
add.s32 %r9, %r10, %r9;
shl.b32 %r9, %r9, 2;
cvt.s64.s32 %rl1, %r9;
add.s64 %rl1, %rl1, %rl0;
mov.u32 %r9, 0;
st.u32 [%rl1], %r9;
BB0_14: // %if_exit
// in Loop: Header=BB0_15 Depth=2
add.s32 %r7, %r8, %r7;
setp.lt.s32 %p1, %r7, %r3;
@%p1 bra BB0_15;
BB0_12: // %for_exit31
// in Loop: Header=BB0_2 Depth=1
// Next row; finish once %r0 reaches the row end %r5.
add.s32 %r0, %r0, 1;
setp.eq.s32 %p1, %r0, %r5;
@%p1 bra BB0_13;
bra.uni BB0_2;
BB0_13: // %for_exit
ret;
}

View File

@@ -1,186 +0,0 @@
//
// Generated by LLVM NVPTX Back-End
//
.version 3.1
.target sm_35, texmode_independent
.address_size 64
// .globl mandelbrot_scanline
// @mandelbrot_scanline
// Kernel entry mandelbrot_scanline.
// Parameters: param_0..param_3 are f32, param_4..param_8 are u32, and param_9 is a
// 64-bit pointer (4-byte aligned) that receives one u32 result per point.
// Structure, as read from the body:
//   - rows    %r0 run from ctaid.y * param_7 to min((ctaid.y + 1) * param_7, param_5)
//   - columns %r7 run from ctaid.x * param_6 to min((ctaid.x + 1) * param_6, param_4)
//     in steps of WARP_SZ, each thread adding (tid.x & (WARP_SZ - 1))
//   - per point the counter %r10 is advanced by the BB0_11 loop (limit param_8) and
//     stored at param_9 + 4 * (row * param_4 + column)
//   - when param_8 <= 0 the BB0_15 loop stores 0 instead
// NOTE(review): block names such as %mandel___vyfvyfvyi suggest the iteration loop is an
// inlined helper and that predicates %p2..%p5 emulate a per-lane execution mask -- confirm
// against the front-end source.
.entry mandelbrot_scanline(
.param .f32 mandelbrot_scanline_param_0,
.param .f32 mandelbrot_scanline_param_1,
.param .f32 mandelbrot_scanline_param_2,
.param .f32 mandelbrot_scanline_param_3,
.param .u32 mandelbrot_scanline_param_4,
.param .u32 mandelbrot_scanline_param_5,
.param .u32 mandelbrot_scanline_param_6,
.param .u32 mandelbrot_scanline_param_7,
.param .u32 mandelbrot_scanline_param_8,
.param .u64 .ptr .align 4 mandelbrot_scanline_param_9
)
{
.reg .pred %p<396>;
.reg .s16 %rc<396>;
.reg .s16 %rs<396>;
.reg .s32 %r<396>;
.reg .s64 %rl<396>;
.reg .f32 %f<396>;
.reg .f64 %fl<396>;
// BB#0: // %allocas
// %r0 = first row, %r1 = min(first row + param_7, param_5); return when the row range is empty.
ld.param.u32 %r6, [mandelbrot_scanline_param_5];
mov.u32 %r5, %ctaid.y;
ld.param.u32 %r7, [mandelbrot_scanline_param_7];
mul.lo.s32 %r0, %r5, %r7;
mad.lo.s32 %r1, %r5, %r7, %r7;
setp.lt.s32 %p0, %r1, %r6;
selp.b32 %r1, %r1, %r6, %p0;
setp.ge.s32 %p0, %r0, %r1;
@%p0 bra BB0_13;
// BB#1: // %for_test28.preheader.lr.ph
// Loop-invariant setup: %r1 = first column, %r3 = clamped column limit, %r2 = param_4,
// %r4 = param_8, %rl0 = output pointer, %p0 = (param_8 > 0).
ld.param.f32 %f0, [mandelbrot_scanline_param_0];
mov.u32 %r2, %ctaid.x;
ld.param.u32 %r3, [mandelbrot_scanline_param_6];
mul.lo.s32 %r1, %r2, %r3;
ld.param.f32 %f1, [mandelbrot_scanline_param_1];
mad.lo.s32 %r3, %r2, %r3, %r3;
ld.param.f32 %f2, [mandelbrot_scanline_param_2];
ld.param.u32 %r2, [mandelbrot_scanline_param_4];
setp.lt.s32 %p0, %r3, %r2;
ld.param.f32 %f3, [mandelbrot_scanline_param_3];
selp.b32 %r3, %r3, %r2, %p0;
ld.param.u32 %r4, [mandelbrot_scanline_param_8];
ld.param.u64 %rl0, [mandelbrot_scanline_param_9];
setp.gt.s32 %p0, %r4, 0;
// %r5 = ~max(~param_5, ~((ctaid.y + 1) * param_7)), i.e. the smaller of the two: the row loop end.
not.b32 %r6, %r6;
add.s32 %r5, %r5, 1;
mul.lo.s32 %r5, %r5, %r7;
not.b32 %r5, %r5;
setp.gt.s32 %p1, %r6, %r5;
selp.b32 %r5, %r6, %r5, %p1;
not.b32 %r5, %r5;
BB0_2: // %for_test28.preheader
// =>This Loop Header: Depth=1
// Child Loop BB0_15 Depth 2
// Child Loop BB0_8 Depth 2
// Child Loop BB0_11 Depth 3
// Row loop: skip straight to the row increment when the column range is empty.
setp.ge.s32 %p1, %r1, %r3;
@%p1 bra BB0_12;
// BB#3: // %for_loop30.lr.ph
// in Loop: Header=BB0_2 Depth=1
// %r6 = row * param_4 (row offset in the output); %p0 picks the iterating or the zero-store column loop.
mul.lo.s32 %r6, %r0, %r2;
mov.u32 %r7, %r1;
@%p0 bra BB0_4;
bra.uni BB0_15;
BB0_4: // in Loop: Header=BB0_2 Depth=1
// %f4 = float(row) * param_3 + param_2
cvt.rn.f32.s32 %f4, %r0;
fma.rn.f32 %f4, %f4, %f3, %f2;
mov.u32 %r7, %r1;
BB0_8: // %for_loop.i.lr.ph.us
// Parent Loop BB0_2 Depth=1
// => This Loop Header: Depth=2
// Child Loop BB0_11 Depth 3
// %r11 = column base + (tid.x & (WARP_SZ - 1)); %f5 = float(%r11) * param_1 + param_0;
// %r10 = iteration counter, reset to 0; %f7/%f6 start from %f5/%f4.
mov.u32 %r9, %tid.x;
mov.u32 %r8, WARP_SZ;
add.s32 %r10, %r8, -1;
and.b32 %r10, %r10, %r9;
add.s32 %r11, %r10, %r7;
cvt.rn.f32.s32 %f5, %r11;
fma.rn.f32 %f5, %f5, %f1, %f0;
mov.u32 %r10, 0;
mov.pred %p1, 0;
mov.pred %p3, -1;
mov.pred %p4, %p0;
mov.pred %p2, %p1;
mov.f32 %f7, %f5;
mov.f32 %f6, %f4;
BB0_11: // %for_loop.i.us
// Parent Loop BB0_2 Depth=1
// Parent Loop BB0_8 Depth=2
// => This Inner Loop Header: Depth=3
// Test %f6*%f6 + %f7*%f7 against 0f40800000 (4.0); %p2 accumulates "has exceeded",
// %p5 is set while the point is still active and has not exceeded.
and.pred %p4, %p3, %p4;
mul.f32 %f8, %f7, %f7;
fma.rn.f32 %f9, %f6, %f6, %f8;
setp.gtu.f32 %p3, %f9, 0f40800000;
and.pred %p3, %p4, %p3;
or.pred %p2, %p3, %p2;
xor.pred %p5, %p2, %p4;
mov.pred %p3, %p1;
@!%p5 bra BB0_10;
bra.uni BB0_9;
BB0_9: // %not_all_continued_or_breaked.i.us
// in Loop: Header=BB0_11 Depth=3
// Update step: %f7 <- %f5 + (%f7*%f7 - %f6*%f6), %f6 <- %f6 * (2 * %f7) + %f4 (old %f7 on the right).
mul.f32 %f9, %f6, %f6;
not.pred %p3, %p2;
and.pred %p3, %p4, %p3;
sub.f32 %f8, %f8, %f9;
add.f32 %f8, %f5, %f8;
add.f32 %f7, %f7, %f7;
fma.rn.f32 %f6, %f6, %f7, %f4;
mov.f32 %f7, %f8;
BB0_10: // %for_step.i.us
// in Loop: Header=BB0_11 Depth=3
// Increment the counter only under %p3, then continue while %p3 holds and %r10 < param_8.
add.s32 %r12, %r10, 1;
selp.b32 %r10, %r12, %r10, %p3;
setp.lt.s32 %p4, %r10, %r4;
and.pred %p5, %p3, %p4;
@%p5 bra BB0_11;
// BB#5: // %mandel___vyfvyfvyi.exit.us
// in Loop: Header=BB0_8 Depth=2
// Threads whose column is at or past the limit skip the store.
setp.ge.s32 %p1, %r11, %r3;
@%p1 bra BB0_7;
// BB#6: // %if_then.us
// in Loop: Header=BB0_8 Depth=2
// index = column base + row offset + (tid.x & (WARP_SZ + 0x3fffffff)); byte offset = index << 2,
// sign-extended to 64 bits and added to the output pointer.
add.s32 %r11, %r8, 1073741823;
and.b32 %r9, %r11, %r9;
add.s32 %r11, %r7, %r6;
add.s32 %r9, %r11, %r9;
shl.b32 %r9, %r9, 2;
cvt.s64.s32 %rl1, %r9;
add.s64 %rl1, %rl1, %rl0;
st.u32 [%rl1], %r10;
BB0_7: // %if_exit.us
// in Loop: Header=BB0_8 Depth=2
// Advance the column base by WARP_SZ.
add.s32 %r7, %r8, %r7;
setp.lt.s32 %p1, %r7, %r3;
@%p1 bra BB0_8;
bra.uni BB0_12;
BB0_15: // %mandel___vyfvyfvyi.exit
// Parent Loop BB0_2 Depth=1
// => This Inner Loop Header: Depth=2
// Column loop used when param_8 <= 0: same indexing as above, but the stored value is 0.
mov.u32 %r9, %tid.x;
mov.u32 %r8, WARP_SZ;
add.s32 %r10, %r8, -1;
and.b32 %r10, %r10, %r9;
add.s32 %r10, %r10, %r7;
setp.lt.s32 %p1, %r10, %r3;
@%p1 bra BB0_16;
bra.uni BB0_14;
BB0_16: // %if_then
// in Loop: Header=BB0_15 Depth=2
add.s32 %r10, %r8, 1073741823;
and.b32 %r9, %r10, %r9;
add.s32 %r10, %r7, %r6;
add.s32 %r9, %r10, %r9;
shl.b32 %r9, %r9, 2;
cvt.s64.s32 %rl1, %r9;
add.s64 %rl1, %rl1, %rl0;
mov.u32 %r9, 0;
st.u32 [%rl1], %r9;
BB0_14: // %if_exit
// in Loop: Header=BB0_15 Depth=2
add.s32 %r7, %r8, %r7;
setp.lt.s32 %p1, %r7, %r3;
@%p1 bra BB0_15;
BB0_12: // %for_exit31
// in Loop: Header=BB0_2 Depth=1
// Next row; finish once %r0 reaches the row end %r5.
add.s32 %r0, %r0, 1;
setp.eq.s32 %p1, %r0, %r5;
@%p1 bra BB0_13;
bra.uni BB0_2;
BB0_13: // %for_exit
ret;
}

File diff suppressed because it is too large Load Diff

View File

@@ -1,370 +0,0 @@
/*
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#ifndef _DRVAPI_ERROR_STRING_H_
#define _DRVAPI_ERROR_STRING_H_
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
// One row of the error-name lookup table: pairs a numeric CUDA driver API
// error code with the text returned for it.
typedef struct
{
char const *error_string; // printable name of the error; NULL in the table's terminating entry
int error_id; // numeric error code; -1 in the table's terminating entry
} s_CudaErrorStr;
/**
* Table of CUDA driver API error codes and their printable names.
*
* The entries are not numbered consecutively, so the table is searched
* linearly. The final entry, { NULL, -1 }, is a sentinel marking the end
* of the table.
*/
static s_CudaErrorStr sCudaDrvErrorString[] =
{
/**
* The API call returned with no errors. In the case of query calls, this
* can also mean that the operation being queried is complete (see
* ::cuEventQuery() and ::cuStreamQuery()).
*/
{ "CUDA_SUCCESS", 0 },
/**
* This indicates that one or more of the parameters passed to the API call
* is not within an acceptable range of values.
*/
{ "CUDA_ERROR_INVALID_VALUE", 1 },
/**
* The API call failed because it was unable to allocate enough memory to
* perform the requested operation.
*/
{ "CUDA_ERROR_OUT_OF_MEMORY", 2 },
/**
* This indicates that the CUDA driver has not been initialized with
* ::cuInit() or that initialization has failed.
*/
{ "CUDA_ERROR_NOT_INITIALIZED", 3 },
/**
* This indicates that the CUDA driver is in the process of shutting down.
*/
{ "CUDA_ERROR_DEINITIALIZED", 4 },
/**
* This indicates profiling APIs are called while application is running
* in visual profiler mode.
*/
{ "CUDA_ERROR_PROFILER_DISABLED", 5 },
/**
* This indicates profiling has not been initialized for this context.
* Call cuProfilerInitialize() to resolve this.
*/
{ "CUDA_ERROR_PROFILER_NOT_INITIALIZED", 6 },
/**
* This indicates profiler has already been started and probably
* cuProfilerStart() is incorrectly called.
*/
{ "CUDA_ERROR_PROFILER_ALREADY_STARTED", 7 },
/**
* This indicates profiler has already been stopped and probably
* cuProfilerStop() is incorrectly called.
*/
{ "CUDA_ERROR_PROFILER_ALREADY_STOPPED", 8 },
/**
* This indicates that no CUDA-capable devices were detected by the installed
* CUDA driver.
*/
{ "CUDA_ERROR_NO_DEVICE (no CUDA-capable devices were detected)", 100 },
/**
* This indicates that the device ordinal supplied by the user does not
* correspond to a valid CUDA device.
*/
{ "CUDA_ERROR_INVALID_DEVICE (device specified is not a valid CUDA device)", 101 },
/**
* This indicates that the device kernel image is invalid. This can also
* indicate an invalid CUDA module.
*/
{ "CUDA_ERROR_INVALID_IMAGE", 200 },
/**
* This most frequently indicates that there is no context bound to the
* current thread. This can also be returned if the context passed to an
* API call is not a valid handle (such as a context that has had
* ::cuCtxDestroy() invoked on it). This can also be returned if a user
* mixes different API versions (i.e. 3010 context with 3020 API calls).
* See ::cuCtxGetApiVersion() for more details.
*/
{ "CUDA_ERROR_INVALID_CONTEXT", 201 },
/**
* This indicates that the context being supplied as a parameter to the
* API call was already the active context.
* \deprecated
* This error return is deprecated as of CUDA 3.2. It is no longer an
* error to attempt to push the active context via ::cuCtxPushCurrent().
*/
{ "CUDA_ERROR_CONTEXT_ALREADY_CURRENT", 202 },
/**
* This indicates that a map or register operation has failed.
*/
{ "CUDA_ERROR_MAP_FAILED", 205 },
/**
* This indicates that an unmap or unregister operation has failed.
*/
{ "CUDA_ERROR_UNMAP_FAILED", 206 },
/**
* This indicates that the specified array is currently mapped and thus
* cannot be destroyed.
*/
{ "CUDA_ERROR_ARRAY_IS_MAPPED", 207 },
/**
* This indicates that the resource is already mapped.
*/
{ "CUDA_ERROR_ALREADY_MAPPED", 208 },
/**
* This indicates that there is no kernel image available that is suitable
* for the device. This can occur when a user specifies code generation
* options for a particular CUDA source file that do not include the
* corresponding device configuration.
*/
{ "CUDA_ERROR_NO_BINARY_FOR_GPU", 209 },
/**
* This indicates that a resource has already been acquired.
*/
{ "CUDA_ERROR_ALREADY_ACQUIRED", 210 },
/**
* This indicates that a resource is not mapped.
*/
{ "CUDA_ERROR_NOT_MAPPED", 211 },
/**
* This indicates that a mapped resource is not available for access as an
* array.
*/
{ "CUDA_ERROR_NOT_MAPPED_AS_ARRAY", 212 },
/**
* This indicates that a mapped resource is not available for access as a
* pointer.
*/
{ "CUDA_ERROR_NOT_MAPPED_AS_POINTER", 213 },
/**
* This indicates that an uncorrectable ECC error was detected during
* execution.
*/
{ "CUDA_ERROR_ECC_UNCORRECTABLE", 214 },
/**
* This indicates that the ::CUlimit passed to the API call is not
* supported by the active device.
*/
{ "CUDA_ERROR_UNSUPPORTED_LIMIT", 215 },
/**
* This indicates that the ::CUcontext passed to the API call can
* only be bound to a single CPU thread at a time but is already
* bound to a CPU thread.
*/
{ "CUDA_ERROR_CONTEXT_ALREADY_IN_USE", 216 },
/**
* This indicates that peer access is not supported across the given
* devices.
*/
{ "CUDA_ERROR_PEER_ACCESS_UNSUPPORTED", 217},
/**
* This indicates that the device kernel source is invalid.
*/
{ "CUDA_ERROR_INVALID_SOURCE", 300 },
/**
* This indicates that the file specified was not found.
*/
{ "CUDA_ERROR_FILE_NOT_FOUND", 301 },
/**
* This indicates that a link to a shared object failed to resolve.
*/
{ "CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND", 302 },
/**
* This indicates that initialization of a shared object failed.
*/
{ "CUDA_ERROR_SHARED_OBJECT_INIT_FAILED", 303 },
/**
* This indicates that an OS call failed.
*/
{ "CUDA_ERROR_OPERATING_SYSTEM", 304 },
/**
* This indicates that a resource handle passed to the API call was not
* valid. Resource handles are opaque types like ::CUstream and ::CUevent.
*/
{ "CUDA_ERROR_INVALID_HANDLE", 400 },
/**
* This indicates that a named symbol was not found. Examples of symbols
* are global/constant variable names, texture names, and surface names.
*/
{ "CUDA_ERROR_NOT_FOUND", 500 },
/**
* This indicates that asynchronous operations issued previously have not
* completed yet. This result is not actually an error, but must be indicated
* differently than ::CUDA_SUCCESS (which indicates completion). Calls that
* may return this value include ::cuEventQuery() and ::cuStreamQuery().
*/
{ "CUDA_ERROR_NOT_READY", 600 },
/**
* An exception occurred on the device while executing a kernel. Common
* causes include dereferencing an invalid device pointer and accessing
* out of bounds shared memory. The context cannot be used, so it must
* be destroyed (and a new one should be created). All existing device
* memory allocations from this context are invalid and must be
* reconstructed if the program is to continue using CUDA.
*/
{ "CUDA_ERROR_LAUNCH_FAILED", 700 },
/**
* This indicates that a launch did not occur because it did not have
* appropriate resources. This error usually indicates that the user has
* attempted to pass too many arguments to the device kernel, or the
* kernel launch specifies too many threads for the kernel's register
* count. Passing arguments of the wrong size (i.e. a 64-bit pointer
* when a 32-bit int is expected) is equivalent to passing too many
* arguments and can also result in this error.
*/
{ "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES", 701 },
/**
* This indicates that the device kernel took too long to execute. This can
* only occur if timeouts are enabled - see the device attribute
* ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT for more information. The
* context cannot be used (and must be destroyed similar to
* ::CUDA_ERROR_LAUNCH_FAILED). All existing device memory allocations from
* this context are invalid and must be reconstructed if the program is to
* continue using CUDA.
*/
{ "CUDA_ERROR_LAUNCH_TIMEOUT", 702 },
/**
* This error indicates a kernel launch that uses an incompatible texturing
* mode.
*/
{ "CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING", 703 },
/**
* This error indicates that a call to ::cuCtxEnablePeerAccess() is
* trying to re-enable peer access to a context which has already
* had peer access to it enabled.
*/
{ "CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED", 704 },
/**
* This error indicates that ::cuCtxDisablePeerAccess() is
* trying to disable peer access which has not been enabled yet
* via ::cuCtxEnablePeerAccess().
*/
{ "CUDA_ERROR_PEER_ACCESS_NOT_ENABLED", 705 },
/**
* This error indicates that the primary context for the specified device
* has already been initialized.
*/
{ "CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE", 708 },
/**
* This error indicates that the context current to the calling thread
* has been destroyed using ::cuCtxDestroy, or is a primary context which
* has not yet been initialized.
*/
{ "CUDA_ERROR_CONTEXT_IS_DESTROYED", 709 },
/**
* A device-side assert triggered during kernel execution. The context
* cannot be used anymore, and must be destroyed. All existing device
* memory allocations from this context are invalid and must be
* reconstructed if the program is to continue using CUDA.
*/
{ "CUDA_ERROR_ASSERT", 710 },
/**
* This error indicates that the hardware resources required to enable
* peer access have been exhausted for one or more of the devices
* passed to ::cuCtxEnablePeerAccess().
*/
{ "CUDA_ERROR_TOO_MANY_PEERS", 711 },
/**
* This error indicates that the memory range passed to ::cuMemHostRegister()
* has already been registered.
*/
{ "CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED", 712 },
/**
* This error indicates that the pointer passed to ::cuMemHostUnregister()
* does not correspond to any currently registered memory region.
*/
{ "CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED", 713 },
/**
* This error indicates that the attempted operation is not permitted.
*/
{ "CUDA_ERROR_NOT_PERMITTED", 800 },
/**
* This error indicates that the attempted operation is not supported
* on the current system or device.
*/
{ "CUDA_ERROR_NOT_SUPPORTED", 801 },
/**
* This indicates that an unknown internal error has occurred.
*/
{ "CUDA_ERROR_UNKNOWN", 999 },
/* Sentinel: terminates the table. */
{ NULL, -1 }
};
// Returns the printable name for a CUDA driver API error code.
//
// error_id: the CUresult value to look up.
// Returns:  the matching entry's string from sCudaDrvErrorString, or
//           "CUDA_ERROR not found!" when no entry matches. Never NULL.
//
// This is just a linear search through the array, since the error_id's are
// not always occurring consecutively. The walk stops at the table's
// terminating entry, recognized by its NULL error_string rather than by its
// id, so a caller passing -1 gets the "not found" text instead of the
// sentinel's NULL string.
//
// Declared static inline because this definition lives in a header: with
// external linkage, including the header from more than one translation unit
// would produce duplicate-symbol link errors.
static inline const char *getCudaDrvErrorString(CUresult error_id)
{
    const s_CudaErrorStr *entry;

    for (entry = sCudaDrvErrorString; entry->error_string != NULL; ++entry)
    {
        // Compare as int: the table stores plain ints, CUresult is an enum.
        if (entry->error_id == (int)error_id)
            return entry->error_string;
    }

    return "CUDA_ERROR not found!";
}
#endif

File diff suppressed because it is too large Load Diff

View File

@@ -1,410 +0,0 @@
; ModuleID = 'mandelbrot_task.bc'
target datalayout = "e-p:64:64:64-S0-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-v16:16:16-v32:32:32-n16:32:64"
target triple = "nvptx64"
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #0
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #0
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.warpsize() #0
; Function Attrs: alwaysinline nounwind readnone
; One-lane i8 select: yields the second operand (%1) when lane 0 of %mask is
; non-zero, and the first operand (%0) when it is zero.
define <1 x i8> @__vselect_i8(<1 x i8>, <1 x i8>, <1 x i32> %mask) #1 {
  %lane_mask = extractelement <1 x i32> %mask, i32 0
  %if_off = extractelement <1 x i8> %0, i32 0
  %if_on = extractelement <1 x i8> %1, i32 0
  %is_on = icmp ne i32 %lane_mask, 0
  %picked = select i1 %is_on, i8 %if_on, i8 %if_off
  %result = insertelement <1 x i8> undef, i8 %picked, i32 0
  ret <1 x i8> %result
}
; Function Attrs: alwaysinline nounwind readnone
; One-lane i16 select: yields the second operand (%1) when lane 0 of %mask is
; non-zero, and the first operand (%0) when it is zero.
define <1 x i16> @__vselect_i16(<1 x i16>, <1 x i16>, <1 x i32> %mask) #1 {
  %lane_mask = extractelement <1 x i32> %mask, i32 0
  %if_off = extractelement <1 x i16> %0, i32 0
  %if_on = extractelement <1 x i16> %1, i32 0
  %is_on = icmp ne i32 %lane_mask, 0
  %picked = select i1 %is_on, i16 %if_on, i16 %if_off
  %result = insertelement <1 x i16> undef, i16 %picked, i32 0
  ret <1 x i16> %result
}
; Function Attrs: alwaysinline nounwind readnone
; One-lane i64 select: yields the second operand (%1) when lane 0 of %mask is
; non-zero, and the first operand (%0) when it is zero.
define <1 x i64> @__vselect_i64(<1 x i64>, <1 x i64>, <1 x i32> %mask) #1 {
  %lane_mask = extractelement <1 x i32> %mask, i32 0
  %if_off = extractelement <1 x i64> %0, i32 0
  %if_on = extractelement <1 x i64> %1, i32 0
  %is_on = icmp ne i32 %lane_mask, 0
  %picked = select i1 %is_on, i64 %if_on, i64 %if_off
  %result = insertelement <1 x i64> undef, i64 %picked, i32 0
  ret <1 x i64> %result
}
; Function Attrs: nounwind readnone
declare double @llvm.nvvm.rsqrt.approx.d(double) #0
; Function Attrs: alwaysinline nounwind
; Four-stream layout conversion for one-lane float vectors. With a single lane
; no element shuffling is performed: each input %vN is stored unchanged through
; the matching output pointer %outN (4-byte aligned stores). The output
; pointers are declared noalias and nocapture.
define void @__aos_to_soa4_float1(<1 x float> %v0, <1 x float> %v1, <1 x float> %v2, <1 x float> %v3, <1 x float>* noalias nocapture %out0, <1 x float>* noalias nocapture %out1, <1 x float>* noalias nocapture %out2, <1 x float>* noalias nocapture %out3) #2 {
store <1 x float> %v0, <1 x float>* %out0, align 4
store <1 x float> %v1, <1 x float>* %out1, align 4
store <1 x float> %v2, <1 x float>* %out2, align 4
store <1 x float> %v3, <1 x float>* %out3, align 4
ret void
}
; Function Attrs: alwaysinline nounwind
; Four-stream layout conversion for one-lane float vectors. With a single lane
; no element shuffling is performed: each input %vN is stored unchanged through
; the matching output pointer %outN (4-byte aligned stores). The output
; pointers are declared noalias and nocapture.
define void @__soa_to_aos4_float1(<1 x float> %v0, <1 x float> %v1, <1 x float> %v2, <1 x float> %v3, <1 x float>* noalias nocapture %out0, <1 x float>* noalias nocapture %out1, <1 x float>* noalias nocapture %out2, <1 x float>* noalias nocapture %out3) #2 {
store <1 x float> %v0, <1 x float>* %out0, align 4
store <1 x float> %v1, <1 x float>* %out1, align 4
store <1 x float> %v2, <1 x float>* %out2, align 4
store <1 x float> %v3, <1 x float>* %out3, align 4
ret void
}
; Function Attrs: nounwind
; Three-stream layout conversion for one-lane float vectors. With a single lane
; no element shuffling is performed: each input %vN is stored unchanged through
; the matching output pointer %outN (4-byte aligned stores).
; The output pointers are nocapture but NOT noalias, so the stores are kept in
; %out0, %out1, %out2 order; if two outputs alias, the later store wins.
define void @__aos_to_soa3_float1(<1 x float> %v0, <1 x float> %v1, <1 x float> %v2, <1 x float>* nocapture %out0, <1 x float>* nocapture %out1, <1 x float>* nocapture %out2) #3 {
store <1 x float> %v0, <1 x float>* %out0, align 4
store <1 x float> %v1, <1 x float>* %out1, align 4
store <1 x float> %v2, <1 x float>* %out2, align 4
ret void
}
; Function Attrs: nounwind
; Three-stream layout conversion for one-lane float vectors. With a single lane
; no element shuffling is performed: each input %vN is stored unchanged through
; the matching output pointer %outN (4-byte aligned stores).
; The output pointers are nocapture but NOT noalias, so the stores are kept in
; %out0, %out1, %out2 order; if two outputs alias, the later store wins.
define void @__soa_to_aos3_float1(<1 x float> %v0, <1 x float> %v1, <1 x float> %v2, <1 x float>* nocapture %out0, <1 x float>* nocapture %out1, <1 x float>* nocapture %out2) #3 {
store <1 x float> %v0, <1 x float>* %out0, align 4
store <1 x float> %v1, <1 x float>* %out1, align 4
store <1 x float> %v2, <1 x float>* %out2, align 4
ret void
}
; Function Attrs: alwaysinline nounwind readonly
; Applies the NVVM intrinsic @llvm.nvvm.rsqrt.approx.d to lane 0 of the
; one-lane double vector %v and returns the result re-wrapped as <1 x double>.
define <1 x double> @__rsqrt_varying_double(<1 x double> %v) #4 {
  %lane0 = extractelement <1 x double> %v, i32 0
  %approx = tail call double @llvm.nvvm.rsqrt.approx.d(double %lane0)
  %wrapped = insertelement <1 x double> undef, double %approx, i32 0
  ret <1 x double> %wrapped
}
; Function Attrs: nounwind
define void @mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E_({ float, float, float, float, i32, i32, i32, i32, i32, i32*, <1 x i32> }* noalias nocapture, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) #5 {
allocas:
%x01 = getelementptr { float, float, float, float, i32, i32, i32, i32, i32, i32*, <1 x i32> }* %0, i64 0, i32 0
%x02 = load float* %x01, align 4
%dx3 = getelementptr { float, float, float, float, i32, i32, i32, i32, i32, i32*, <1 x i32> }* %0, i64 0, i32 1
%dx4 = load float* %dx3, align 4
%y05 = getelementptr { float, float, float, float, i32, i32, i32, i32, i32, i32*, <1 x i32> }* %0, i64 0, i32 2
%y06 = load float* %y05, align 4
%dy7 = getelementptr { float, float, float, float, i32, i32, i32, i32, i32, i32*, <1 x i32> }* %0, i64 0, i32 3
%dy8 = load float* %dy7, align 4
%width9 = getelementptr { float, float, float, float, i32, i32, i32, i32, i32, i32*, <1 x i32> }* %0, i64 0, i32 4
%width10 = load i32* %width9, align 4
%height11 = getelementptr { float, float, float, float, i32, i32, i32, i32, i32, i32*, <1 x i32> }* %0, i64 0, i32 5
%height12 = load i32* %height11, align 4
%xspan13 = getelementptr { float, float, float, float, i32, i32, i32, i32, i32, i32*, <1 x i32> }* %0, i64 0, i32 6
%xspan14 = load i32* %xspan13, align 4
%yspan15 = getelementptr { float, float, float, float, i32, i32, i32, i32, i32, i32*, <1 x i32> }* %0, i64 0, i32 7
%yspan16 = load i32* %yspan15, align 4
%maxIterations17 = getelementptr { float, float, float, float, i32, i32, i32, i32, i32, i32*, <1 x i32> }* %0, i64 0, i32 8
%maxIterations18 = load i32* %maxIterations17, align 4
%output19 = getelementptr { float, float, float, float, i32, i32, i32, i32, i32, i32*, <1 x i32> }* %0, i64 0, i32 9
%output20 = load i32** %output19, align 8
%task_struct_mask = getelementptr { float, float, float, float, i32, i32, i32, i32, i32, i32*, <1 x i32> }* %0, i64 0, i32 10
%mask = load <1 x i32>* %task_struct_mask, align 4
%item.i = extractelement <1 x i32> %mask, i32 0
%cmp.i = icmp slt i32 %item.i, 0
%bid.i.i = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #3
%mul_calltmp_xspan_load = mul i32 %bid.i.i, %xspan14
%add_xstart_load_xspan_load25 = add i32 %mul_calltmp_xspan_load, %xspan14
%c.i.i = icmp slt i32 %add_xstart_load_xspan_load25, %width10
%r.i.i = select i1 %c.i.i, i32 %add_xstart_load_xspan_load25, i32 %width10
%bid.i.i177 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #3
%mul_calltmp31_yspan_load = mul i32 %bid.i.i177, %yspan16
%add_ystart_load_yspan_load32 = add i32 %mul_calltmp31_yspan_load, %yspan16
%c.i.i178 = icmp slt i32 %add_ystart_load_yspan_load32, %height12
%r.i.i179 = select i1 %c.i.i178, i32 %add_ystart_load_yspan_load32, i32 %height12
%less_yi_load_yend_load319 = icmp slt i32 %mul_calltmp31_yspan_load, %r.i.i179
br i1 %cmp.i, label %for_test.preheader, label %for_test104.preheader
for_test104.preheader: ; preds = %allocas
br i1 %less_yi_load_yend_load319, label %for_test115.preheader.lr.ph, label %for_exit
for_test115.preheader.lr.ph: ; preds = %for_test104.preheader
%less_xi_load122_xend_load123331 = icmp slt i32 %mul_calltmp_xspan_load, %r.i.i
%maxIterations_load140_broadcast_init = insertelement <1 x i32> undef, i32 %maxIterations18, i32 0
%less_i_load_count_load.i321 = icmp sgt <1 x i32> %maxIterations_load140_broadcast_init, zeroinitializer
%"oldMask&test.i322" = select <1 x i1> %less_i_load_count_load.i321, <1 x i32> <i32 -1>, <1 x i32> zeroinitializer
%"internal_mask&function_mask10.i323" = and <1 x i32> %"oldMask&test.i322", %mask
%item.i.i324 = extractelement <1 x i32> %"internal_mask&function_mask10.i323", i32 0
%cmp.i.i325 = icmp slt i32 %item.i.i324, 0
%11 = xor i32 %height12, -1
%12 = add i32 %bid.i.i177, 1
%13 = mul i32 %yspan16, %12
%14 = xor i32 %13, -1
%15 = icmp sgt i32 %11, %14
%smax336 = select i1 %15, i32 %11, i32 %14
%16 = xor i32 %smax336, -1
br label %for_test115.preheader
for_test.preheader: ; preds = %allocas
br i1 %less_yi_load_yend_load319, label %for_test40.preheader.lr.ph, label %for_exit
for_test40.preheader.lr.ph: ; preds = %for_test.preheader
%less_xi_load_xend_load317 = icmp slt i32 %mul_calltmp_xspan_load, %r.i.i
%maxIterations_load_broadcast_init = insertelement <1 x i32> undef, i32 %maxIterations18, i32 0
%less_i_load_count_load.i204308 = icmp sgt <1 x i32> %maxIterations_load_broadcast_init, zeroinitializer
%"oldMask&test.i205309" = select <1 x i1> %less_i_load_count_load.i204308, <1 x i32> <i32 -1>, <1 x i32> zeroinitializer
%item.i.i206310 = extractelement <1 x i32> %"oldMask&test.i205309", i32 0
%cmp.i.i207311 = icmp slt i32 %item.i.i206310, 0
%output_load_ptr2int = ptrtoint i32* %output20 to i64
%17 = xor i32 %height12, -1
%18 = add i32 %bid.i.i177, 1
%19 = mul i32 %yspan16, %18
%20 = xor i32 %19, -1
%21 = icmp sgt i32 %17, %20
%smax = select i1 %21, i32 %17, i32 %20
%22 = xor i32 %smax, -1
br label %for_test40.preheader
for_test40.preheader: ; preds = %for_exit43, %for_test40.preheader.lr.ph
%yi.0320 = phi i32 [ %mul_calltmp31_yspan_load, %for_test40.preheader.lr.ph ], [ %yi_load77_plus1, %for_exit43 ]
br i1 %less_xi_load_xend_load317, label %for_loop42.lr.ph, label %for_exit43
for_loop42.lr.ph: ; preds = %for_test40.preheader
%yi_load52_to_float = sitofp i32 %yi.0320 to float
%mul_yi_load52_to_float_dy_load = fmul float %dy8, %yi_load52_to_float
%add_y0_load_mul_yi_load52_to_float_dy_load = fadd float %y06, %mul_yi_load52_to_float_dy_load
%add_y0_load_mul_yi_load52_to_float_dy_load_broadcast_init = insertelement <1 x float> undef, float %add_y0_load_mul_yi_load52_to_float_dy_load, i32 0
%mul_yi_load56_width_load57 = mul i32 %yi.0320, %width10
br i1 %cmp.i.i207311, label %for_loop.i229.lr.ph.us, label %mandel___vyfvyfvyi.exit244
mandel___vyfvyfvyi.exit244.us: ; preds = %for_step.i212.us
%tid.i.i189.us = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3
%tid.i.i.i190.us = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() #3
%sub_calltmp3_.i191.us = add i32 %tid.i.i.i190.us, -1
%bitop.i192.us = and i32 %sub_calltmp3_.i191.us, %tid.i.i189.us
%add_xi_load62_calltmp65.us = add i32 %bitop.i192.us, %xi.0318.us
%less_add_xi_load62_calltmp65_xend_load66.us = icmp slt i32 %add_xi_load62_calltmp65.us, %r.i.i
br i1 %less_add_xi_load62_calltmp65_xend_load66.us, label %if_then.us, label %if_exit.us
if_then.us: ; preds = %mandel___vyfvyfvyi.exit244.us
%tid.i.i.i194.us = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() #3
%sub_calltmp3_.i195.us = add i32 %tid.i.i.i194.us, 1073741823
%tid.i.i193.us = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3
%bitop.i196.us = and i32 %sub_calltmp3_.i195.us, %tid.i.i193.us
%add_xi_load58_calltmp61.us = add i32 %xi.0318.us, %mul_yi_load56_width_load57
%add_mul_yi_load56_width_load57_add_xi_load58_calltmp61.us = add i32 %add_xi_load58_calltmp61.us, %bitop.i196.us
%23 = shl i32 %add_mul_yi_load56_width_load57_add_xi_load58_calltmp61.us, 2
%iptr__id.i264.rhs.us = sext i32 %23 to i64
%iptr__id.i264.us = add i64 %iptr__id.i264.rhs.us, %output_load_ptr2int
%ptr__id.i265.us = inttoptr i64 %iptr__id.i264.us to i32*
store i32 %sel.i.i291.us, i32* %ptr__id.i265.us, align 4
br label %if_exit.us
if_exit.us: ; preds = %if_then.us, %mandel___vyfvyfvyi.exit244.us
%tid.i.i188.us = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() #3
%add_xi_load76_calltmp74.us = add i32 %tid.i.i188.us, %xi.0318.us
%less_xi_load_xend_load.us = icmp slt i32 %add_xi_load76_calltmp74.us, %r.i.i
br i1 %less_xi_load_xend_load.us, label %for_loop.i229.lr.ph.us, label %for_exit43
for_loop.i229.us: ; preds = %for_loop.i229.lr.ph.us, %for_step.i212.us
%"oldMask&test.i205316.us" = phi <1 x i32> [ %"oldMask&test.i205309", %for_loop.i229.lr.ph.us ], [ %"oldMask&test.i205.us", %for_step.i212.us ]
%break_lanes_memory.0.i201315.us = phi <1 x i32> [ zeroinitializer, %for_loop.i229.lr.ph.us ], [ %"mask|break_mask.i220.us", %for_step.i212.us ]
%r.i.i292295314.us = phi <1 x i32> [ zeroinitializer, %for_loop.i229.lr.ph.us ], [ %r.i.i292.us, %for_step.i212.us ]
%add_x0_load_mul_add_xi_load48_calltmp51_to_float_dx_load_broadcast_init301313.us = phi <1 x float> [ %add_x0_load_mul_add_xi_load48_calltmp51_to_float_dx_load_broadcast_init.us, %for_loop.i229.lr.ph.us ], [ %add_x0_load_mul_add_xi_load48_calltmp51_to_float_dx_load_broadcast_init300.us, %for_step.i212.us ]
%add_y0_load_mul_yi_load52_to_float_dy_load_broadcast_init303312.us = phi <1 x float> [ %add_y0_load_mul_yi_load52_to_float_dy_load_broadcast_init, %for_loop.i229.lr.ph.us ], [ %add_y0_load_mul_yi_load52_to_float_dy_load_broadcast_init302.us, %for_step.i212.us ]
%mul_z_re_load_z_re_load13.i214.us = fmul <1 x float> %add_x0_load_mul_add_xi_load48_calltmp51_to_float_dx_load_broadcast_init301313.us, %add_x0_load_mul_add_xi_load48_calltmp51_to_float_dx_load_broadcast_init301313.us
%mul_z_im_load_z_im_load14.i216.us = fmul <1 x float> %add_y0_load_mul_yi_load52_to_float_dy_load_broadcast_init303312.us, %add_y0_load_mul_yi_load52_to_float_dy_load_broadcast_init303312.us
%add_mul_z_re_load_z_re_load13_mul_z_im_load_z_im_load14.i217.us = fadd <1 x float> %mul_z_im_load_z_im_load14.i216.us, %mul_z_re_load_z_re_load13.i214.us
%greater_add_mul_z_re_load_z_re_load13_mul_z_im_load_z_im_load14_.i218.us = fcmp ugt <1 x float> %add_mul_z_re_load_z_re_load13_mul_z_im_load_z_im_load14.i217.us, <float 4.000000e+00>
%"oldMask&test16.i219.us" = select <1 x i1> %greater_add_mul_z_re_load_z_re_load13_mul_z_im_load_z_im_load14_.i218.us, <1 x i32> %"oldMask&test.i205316.us", <1 x i32> zeroinitializer
%"mask|break_mask.i220.us" = or <1 x i32> %"oldMask&test16.i219.us", %break_lanes_memory.0.i201315.us
%item.i63.i222.us = extractelement <1 x i32> %"mask|break_mask.i220.us", i32 0
%v.i64.i223.us = lshr i32 %item.i63.i222.us, 31
%item.i62.i225.us = extractelement <1 x i32> %"oldMask&test.i205316.us", i32 0
%v.i.i226.us = lshr i32 %item.i62.i225.us, 31
%"equal_finished&func_internal_mask&function_mask12.i228.us" = icmp eq i32 %v.i64.i223.us, %v.i.i226.us
br i1 %"equal_finished&func_internal_mask&function_mask12.i228.us", label %for_step.i212.us, label %not_all_continued_or_breaked.i243.us
not_all_continued_or_breaked.i243.us: ; preds = %for_loop.i229.us
%"!(break|continue)_lanes.i232.us" = xor <1 x i32> %"mask|break_mask.i220.us", <i32 -1>
%new_mask28.i233.us = and <1 x i32> %"oldMask&test.i205316.us", %"!(break|continue)_lanes.i232.us"
%sub_mul_z_re_load31_z_re_load32_mul_z_im_load33_z_im_load34.i238.us = fsub <1 x float> %mul_z_re_load_z_re_load13.i214.us, %mul_z_im_load_z_im_load14.i216.us
%mul__z_re_load35.i239.us = fmul <1 x float> %add_x0_load_mul_add_xi_load48_calltmp51_to_float_dx_load_broadcast_init301313.us, <float 2.000000e+00>
%mul_mul__z_re_load35_z_im_load36.i240.us = fmul <1 x float> %add_y0_load_mul_yi_load52_to_float_dy_load_broadcast_init303312.us, %mul__z_re_load35.i239.us
%add_c_re_load42_new_re_load.i241.us = fadd <1 x float> %add_x0_load_mul_add_xi_load48_calltmp51_to_float_dx_load_broadcast_init.us, %sub_mul_z_re_load31_z_re_load32_mul_z_im_load33_z_im_load34.i238.us
%add_c_im_load44_new_im_load.i242.us = fadd <1 x float> %add_y0_load_mul_yi_load52_to_float_dy_load_broadcast_init, %mul_mul__z_re_load35_z_im_load36.i240.us
br label %for_step.i212.us
for_step.i212.us: ; preds = %not_all_continued_or_breaked.i243.us, %for_loop.i229.us
%add_y0_load_mul_yi_load52_to_float_dy_load_broadcast_init302.us = phi <1 x float> [ %add_y0_load_mul_yi_load52_to_float_dy_load_broadcast_init303312.us, %for_loop.i229.us ], [ %add_c_im_load44_new_im_load.i242.us, %not_all_continued_or_breaked.i243.us ]
%add_x0_load_mul_add_xi_load48_calltmp51_to_float_dx_load_broadcast_init300.us = phi <1 x float> [ %add_x0_load_mul_add_xi_load48_calltmp51_to_float_dx_load_broadcast_init301313.us, %for_loop.i229.us ], [ %add_c_re_load42_new_re_load.i241.us, %not_all_continued_or_breaked.i243.us ]
%internal_mask_memory.1.i209.us = phi <1 x i32> [ zeroinitializer, %for_loop.i229.us ], [ %new_mask28.i233.us, %not_all_continued_or_breaked.i243.us ]
%m.i.i287.us = extractelement <1 x i32> %internal_mask_memory.1.i209.us, i32 0
%d0.i.i289.us = extractelement <1 x i32> %r.i.i292295314.us, i32 0
%not.cmp.i.i288.us = icmp ne i32 %m.i.i287.us, 0
%d1.i.i290.us = zext i1 %not.cmp.i.i288.us to i32
%sel.i.i291.us = add i32 %d0.i.i289.us, %d1.i.i290.us
%r.i.i292.us = insertelement <1 x i32> undef, i32 %sel.i.i291.us, i32 0
%less_i_load_count_load.i204.us = icmp slt <1 x i32> %r.i.i292.us, %maxIterations_load_broadcast_init
%"oldMask&test.i205.us" = select <1 x i1> %less_i_load_count_load.i204.us, <1 x i32> %internal_mask_memory.1.i209.us, <1 x i32> zeroinitializer
%item.i.i206.us = extractelement <1 x i32> %"oldMask&test.i205.us", i32 0
%cmp.i.i207.us = icmp slt i32 %item.i.i206.us, 0
br i1 %cmp.i.i207.us, label %for_loop.i229.us, label %mandel___vyfvyfvyi.exit244.us
for_loop.i229.lr.ph.us: ; preds = %if_exit.us, %for_loop42.lr.ph
%xi.0318.us = phi i32 [ %add_xi_load76_calltmp74.us, %if_exit.us ], [ %mul_calltmp_xspan_load, %for_loop42.lr.ph ]
%tid.i.i180.us = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3
%tid.i.i.i181.us = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() #3
%sub_calltmp3_.i182.us = add i32 %tid.i.i.i181.us, -1
%bitop.i183.us = and i32 %sub_calltmp3_.i182.us, %tid.i.i180.us
%add_xi_load48_calltmp51.us = add i32 %bitop.i183.us, %xi.0318.us
%add_xi_load48_calltmp51_to_float.us = sitofp i32 %add_xi_load48_calltmp51.us to float
%mul_add_xi_load48_calltmp51_to_float_dx_load.us = fmul float %dx4, %add_xi_load48_calltmp51_to_float.us
%add_x0_load_mul_add_xi_load48_calltmp51_to_float_dx_load.us = fadd float %x02, %mul_add_xi_load48_calltmp51_to_float_dx_load.us
%add_x0_load_mul_add_xi_load48_calltmp51_to_float_dx_load_broadcast_init.us = insertelement <1 x float> undef, float %add_x0_load_mul_add_xi_load48_calltmp51_to_float_dx_load.us, i32 0
br label %for_loop.i229.us
for_exit: ; preds = %for_exit118, %for_exit43, %for_test.preheader, %for_test104.preheader
ret void
mandel___vyfvyfvyi.exit244: ; preds = %if_exit, %for_loop42.lr.ph
%xi.0318 = phi i32 [ %add_xi_load76_calltmp74, %if_exit ], [ %mul_calltmp_xspan_load, %for_loop42.lr.ph ]
%tid.i.i189 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3
%tid.i.i.i190 = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() #3
%sub_calltmp3_.i191 = add i32 %tid.i.i.i190, -1
%bitop.i192 = and i32 %sub_calltmp3_.i191, %tid.i.i189
%add_xi_load62_calltmp65 = add i32 %bitop.i192, %xi.0318
%less_add_xi_load62_calltmp65_xend_load66 = icmp slt i32 %add_xi_load62_calltmp65, %r.i.i
br i1 %less_add_xi_load62_calltmp65_xend_load66, label %if_then, label %if_exit
for_exit43: ; preds = %if_exit, %if_exit.us, %for_test40.preheader
%yi_load77_plus1 = add i32 %yi.0320, 1
%exitcond = icmp eq i32 %yi_load77_plus1, %22
br i1 %exitcond, label %for_exit, label %for_test40.preheader
if_then: ; preds = %mandel___vyfvyfvyi.exit244
%tid.i.i.i194 = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() #3
%sub_calltmp3_.i195 = add i32 %tid.i.i.i194, 1073741823
%tid.i.i193 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3
%bitop.i196 = and i32 %sub_calltmp3_.i195, %tid.i.i193
%add_xi_load58_calltmp61 = add i32 %xi.0318, %mul_yi_load56_width_load57
%add_mul_yi_load56_width_load57_add_xi_load58_calltmp61 = add i32 %add_xi_load58_calltmp61, %bitop.i196
%24 = shl i32 %add_mul_yi_load56_width_load57_add_xi_load58_calltmp61, 2
%iptr__id.i264.rhs = sext i32 %24 to i64
%iptr__id.i264 = add i64 %iptr__id.i264.rhs, %output_load_ptr2int
%ptr__id.i265 = inttoptr i64 %iptr__id.i264 to i32*
store i32 0, i32* %ptr__id.i265, align 4
br label %if_exit
if_exit: ; preds = %if_then, %mandel___vyfvyfvyi.exit244
%tid.i.i188 = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() #3
%add_xi_load76_calltmp74 = add i32 %tid.i.i188, %xi.0318
%less_xi_load_xend_load = icmp slt i32 %add_xi_load76_calltmp74, %r.i.i
br i1 %less_xi_load_xend_load, label %mandel___vyfvyfvyi.exit244, label %for_exit43
for_test115.preheader: ; preds = %for_exit118, %for_test115.preheader.lr.ph
%yi109.0335 = phi i32 [ %mul_calltmp31_yspan_load, %for_test115.preheader.lr.ph ], [ %yi_load171_plus1, %for_exit118 ]
br i1 %less_xi_load122_xend_load123331, label %for_loop117.lr.ph, label %for_exit118
for_loop117.lr.ph: ; preds = %for_test115.preheader
%yi_load135_to_float = sitofp i32 %yi109.0335 to float
%mul_yi_load135_to_float_dy_load136 = fmul float %dy8, %yi_load135_to_float
%add_y0_load134_mul_yi_load135_to_float_dy_load136 = fadd float %y06, %mul_yi_load135_to_float_dy_load136
%add_y0_load134_mul_yi_load135_to_float_dy_load136_broadcast_init = insertelement <1 x float> undef, float %add_y0_load134_mul_yi_load135_to_float_dy_load136, i32 0
br i1 %cmp.i.i325, label %for_loop.i.lr.ph.us, label %if_exit159
if_exit159.us: ; preds = %for_step.i.us
%tid.i.i.us = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() #3
%add_xi120_load_calltmp169.us = add i32 %tid.i.i.us, %xi120.0332.us
%less_xi_load122_xend_load123.us = icmp slt i32 %add_xi120_load_calltmp169.us, %r.i.i
br i1 %less_xi_load122_xend_load123.us, label %for_loop.i.lr.ph.us, label %for_exit118
for_loop.i.us: ; preds = %for_loop.i.lr.ph.us, %for_step.i.us
%"oldMask&test.i329.us" = phi <1 x i32> [ %"oldMask&test.i322", %for_loop.i.lr.ph.us ], [ %"oldMask&test.i.us", %for_step.i.us ]
%break_lanes_memory.0.i328.us = phi <1 x i32> [ zeroinitializer, %for_loop.i.lr.ph.us ], [ %"mask|break_mask.i.us", %for_step.i.us ]
%25 = phi <1 x i32> [ zeroinitializer, %for_loop.i.lr.ph.us ], [ %r.i.i261.us, %for_step.i.us ]
%add_x0_load127_mul_add_xi_load128_calltmp131_to_float_dx_load132_broadcast_init305327.us = phi <1 x float> [ %add_x0_load127_mul_add_xi_load128_calltmp131_to_float_dx_load132_broadcast_init.us, %for_loop.i.lr.ph.us ], [ %add_x0_load127_mul_add_xi_load128_calltmp131_to_float_dx_load132_broadcast_init304.us, %for_step.i.us ]
%add_y0_load134_mul_yi_load135_to_float_dy_load136_broadcast_init307326.us = phi <1 x float> [ %add_y0_load134_mul_yi_load135_to_float_dy_load136_broadcast_init, %for_loop.i.lr.ph.us ], [ %add_y0_load134_mul_yi_load135_to_float_dy_load136_broadcast_init306.us, %for_step.i.us ]
%"internal_mask&function_mask12.i.us" = and <1 x i32> %"oldMask&test.i329.us", %mask
%mul_z_re_load_z_re_load13.i.us = fmul <1 x float> %add_x0_load127_mul_add_xi_load128_calltmp131_to_float_dx_load132_broadcast_init305327.us, %add_x0_load127_mul_add_xi_load128_calltmp131_to_float_dx_load132_broadcast_init305327.us
%mul_z_im_load_z_im_load14.i.us = fmul <1 x float> %add_y0_load134_mul_yi_load135_to_float_dy_load136_broadcast_init307326.us, %add_y0_load134_mul_yi_load135_to_float_dy_load136_broadcast_init307326.us
%add_mul_z_re_load_z_re_load13_mul_z_im_load_z_im_load14.i.us = fadd <1 x float> %mul_z_im_load_z_im_load14.i.us, %mul_z_re_load_z_re_load13.i.us
%greater_add_mul_z_re_load_z_re_load13_mul_z_im_load_z_im_load14_.i.us = fcmp ugt <1 x float> %add_mul_z_re_load_z_re_load13_mul_z_im_load_z_im_load14.i.us, <float 4.000000e+00>
%"oldMask&test16.i.us" = select <1 x i1> %greater_add_mul_z_re_load_z_re_load13_mul_z_im_load_z_im_load14_.i.us, <1 x i32> %"oldMask&test.i329.us", <1 x i32> zeroinitializer
%"mask|break_mask.i.us" = or <1 x i32> %"oldMask&test16.i.us", %break_lanes_memory.0.i328.us
%"finished&func.i.us" = and <1 x i32> %"mask|break_mask.i.us", %mask
%item.i63.i.us = extractelement <1 x i32> %"finished&func.i.us", i32 0
%v.i64.i.us = lshr i32 %item.i63.i.us, 31
%item.i62.i.us = extractelement <1 x i32> %"internal_mask&function_mask12.i.us", i32 0
%v.i.i.us = lshr i32 %item.i62.i.us, 31
%"equal_finished&func_internal_mask&function_mask12.i.us" = icmp eq i32 %v.i64.i.us, %v.i.i.us
br i1 %"equal_finished&func_internal_mask&function_mask12.i.us", label %for_step.i.us, label %not_all_continued_or_breaked.i.us
not_all_continued_or_breaked.i.us: ; preds = %for_loop.i.us
%"!(break|continue)_lanes.i.us" = xor <1 x i32> %"mask|break_mask.i.us", <i32 -1>
%new_mask28.i.us = and <1 x i32> %"oldMask&test.i329.us", %"!(break|continue)_lanes.i.us"
%sub_mul_z_re_load31_z_re_load32_mul_z_im_load33_z_im_load34.i.us = fsub <1 x float> %mul_z_re_load_z_re_load13.i.us, %mul_z_im_load_z_im_load14.i.us
%mul__z_re_load35.i.us = fmul <1 x float> %add_x0_load127_mul_add_xi_load128_calltmp131_to_float_dx_load132_broadcast_init305327.us, <float 2.000000e+00>
%mul_mul__z_re_load35_z_im_load36.i.us = fmul <1 x float> %add_y0_load134_mul_yi_load135_to_float_dy_load136_broadcast_init307326.us, %mul__z_re_load35.i.us
%add_c_re_load42_new_re_load.i.us = fadd <1 x float> %add_x0_load127_mul_add_xi_load128_calltmp131_to_float_dx_load132_broadcast_init.us, %sub_mul_z_re_load31_z_re_load32_mul_z_im_load33_z_im_load34.i.us
%add_c_im_load44_new_im_load.i.us = fadd <1 x float> %add_y0_load134_mul_yi_load135_to_float_dy_load136_broadcast_init, %mul_mul__z_re_load35_z_im_load36.i.us
br label %for_step.i.us
for_step.i.us: ; preds = %not_all_continued_or_breaked.i.us, %for_loop.i.us
%add_y0_load134_mul_yi_load135_to_float_dy_load136_broadcast_init306.us = phi <1 x float> [ %add_y0_load134_mul_yi_load135_to_float_dy_load136_broadcast_init307326.us, %for_loop.i.us ], [ %add_c_im_load44_new_im_load.i.us, %not_all_continued_or_breaked.i.us ]
%add_x0_load127_mul_add_xi_load128_calltmp131_to_float_dx_load132_broadcast_init304.us = phi <1 x float> [ %add_x0_load127_mul_add_xi_load128_calltmp131_to_float_dx_load132_broadcast_init305327.us, %for_loop.i.us ], [ %add_c_re_load42_new_re_load.i.us, %not_all_continued_or_breaked.i.us ]
%internal_mask_memory.1.i.us = phi <1 x i32> [ zeroinitializer, %for_loop.i.us ], [ %new_mask28.i.us, %not_all_continued_or_breaked.i.us ]
%m.i.i.us = extractelement <1 x i32> %internal_mask_memory.1.i.us, i32 0
%d0.i.i259.us = extractelement <1 x i32> %25, i32 0
%not.cmp.i.i258.us = icmp ne i32 %m.i.i.us, 0
%d1.i.i260.us = zext i1 %not.cmp.i.i258.us to i32
%sel.i.i.us = add i32 %d0.i.i259.us, %d1.i.i260.us
%r.i.i261.us = insertelement <1 x i32> undef, i32 %sel.i.i.us, i32 0
%less_i_load_count_load.i.us = icmp slt <1 x i32> %r.i.i261.us, %maxIterations_load140_broadcast_init
%"oldMask&test.i.us" = select <1 x i1> %less_i_load_count_load.i.us, <1 x i32> %internal_mask_memory.1.i.us, <1 x i32> zeroinitializer
%"internal_mask&function_mask10.i.us" = and <1 x i32> %"oldMask&test.i.us", %mask
%item.i.i.us = extractelement <1 x i32> %"internal_mask&function_mask10.i.us", i32 0
%cmp.i.i.us = icmp slt i32 %item.i.i.us, 0
br i1 %cmp.i.i.us, label %for_loop.i.us, label %if_exit159.us
for_loop.i.lr.ph.us: ; preds = %if_exit159.us, %for_loop117.lr.ph
%xi120.0332.us = phi i32 [ %add_xi120_load_calltmp169.us, %if_exit159.us ], [ %mul_calltmp_xspan_load, %for_loop117.lr.ph ]
%tid.i.i184.us = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3
%tid.i.i.i185.us = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() #3
%sub_calltmp3_.i186.us = add i32 %tid.i.i.i185.us, -1
%bitop.i187.us = and i32 %sub_calltmp3_.i186.us, %tid.i.i184.us
%add_xi_load128_calltmp131.us = add i32 %bitop.i187.us, %xi120.0332.us
%add_xi_load128_calltmp131_to_float.us = sitofp i32 %add_xi_load128_calltmp131.us to float
%mul_add_xi_load128_calltmp131_to_float_dx_load132.us = fmul float %dx4, %add_xi_load128_calltmp131_to_float.us
%add_x0_load127_mul_add_xi_load128_calltmp131_to_float_dx_load132.us = fadd float %x02, %mul_add_xi_load128_calltmp131_to_float_dx_load132.us
%add_x0_load127_mul_add_xi_load128_calltmp131_to_float_dx_load132_broadcast_init.us = insertelement <1 x float> undef, float %add_x0_load127_mul_add_xi_load128_calltmp131_to_float_dx_load132.us, i32 0
br label %for_loop.i.us
for_exit118: ; preds = %if_exit159, %if_exit159.us, %for_test115.preheader
%yi_load171_plus1 = add i32 %yi109.0335, 1
%exitcond337 = icmp eq i32 %yi_load171_plus1, %16
br i1 %exitcond337, label %for_exit, label %for_test115.preheader
if_exit159: ; preds = %if_exit159, %for_loop117.lr.ph
%xi120.0332 = phi i32 [ %add_xi120_load_calltmp169, %if_exit159 ], [ %mul_calltmp_xspan_load, %for_loop117.lr.ph ]
%tid.i.i = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() #3
%add_xi120_load_calltmp169 = add i32 %tid.i.i, %xi120.0332
%less_xi_load122_xend_load123 = icmp slt i32 %add_xi120_load_calltmp169, %r.i.i
br i1 %less_xi_load122_xend_load123, label %if_exit159, label %for_exit118
}
; Attribute groups. Calls and definitions in this module refer to them by
; their #N suffix (for example the special-register reads above carry #3).
attributes #0 = { nounwind readnone }
attributes #1 = { alwaysinline nounwind readnone }
attributes #2 = { alwaysinline nounwind }
attributes #3 = { nounwind }
attributes #4 = { alwaysinline nounwind readonly }
; #5 is the only group that carries a target feature string (sm_35).
attributes #5 = { nounwind "target-features"="+sm_35" }
; NVVM annotation list. Its single entry, !1, tags the mandelbrot_scanline
; function with the "kernel" property set to 1.
!nvvm.annotations = !{!1}
!1 = metadata !{void ({ float, float, float, float, i32, i32, i32, i32, i32, i32*, <1 x i32> }* , i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)* @mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E_, metadata !"kernel", i32 1}

View File

@@ -1,213 +0,0 @@
//
// Generated by NVIDIA NVVM Compiler
// Compiler built on Thu Jul 18 02:37:37 2013 (1374107857)
// Cuda compilation tools, release 5.5, V5.5.0
//
// Module-level settings: PTX ISA 3.2, sm_35 target, 64-bit addresses.
.version 3.2
.target sm_35
.address_size 64
// Source-file table; the index in the second column is what the ".loc"
// directives further down refer to.
.file 1 "/home/evghenii/soft/ispc-code/ispc/examples/mandelbrot_tasks3d/mandel_task_cu.cu", 1383122156, 1370
.file 2 "/usr/local/cuda-5.5/bin/..//include/cuda_device_runtime_api.h", 1375338991, 7655
.file 3 "/usr/local/cuda-5.5/bin/..//include/device_functions.h", 1375338991, 185228
// vprintf is only declared here: it takes two 64-bit parameters and returns
// a 32-bit value. The kernel below passes it a format string and an
// argument buffer.
.extern .func (.param .b32 func_retval0) vprintf
(
.param .b64 vprintf_param_0,
.param .b64 vprintf_param_1
)
;
// NUL-terminated format string, stored as decimal ASCII codes:
// "vectorIndex= %d  bid= %d\n"
.global .align 1 .b8 $str[26] = {118, 101, 99, 116, 111, 114, 73, 110, 100, 101, 120, 61, 32, 37, 100, 32, 32, 98, 105, 100, 61, 32, 37, 100, 10, 0};
// Weak stub for cudaMalloc. Neither parameter is read; the function
// unconditionally returns the constant 30.
// NOTE(review): 30 is presumably the cudaErrorUnknown code from the CUDA 5.5
// headers - confirm against driver_types.h.
.weak .func (.param .b32 func_retval0) cudaMalloc(
.param .b64 cudaMalloc_param_0,
.param .b64 cudaMalloc_param_1
)
{
.reg .s32 %r<2>;
// Return value = 30.
mov.u32 %r1, 30;
st.param.b32 [func_retval0+0], %r1;
.loc 2 66 3
ret;
}
// Weak stub for cudaFuncGetAttributes. Neither parameter is read; the
// function unconditionally returns the constant 30.
// NOTE(review): 30 is presumably the cudaErrorUnknown code from the CUDA 5.5
// headers - confirm against driver_types.h.
.weak .func (.param .b32 func_retval0) cudaFuncGetAttributes(
.param .b64 cudaFuncGetAttributes_param_0,
.param .b64 cudaFuncGetAttributes_param_1
)
{
.reg .s32 %r<2>;
// Return value = 30.
mov.u32 %r1, 30;
st.param.b32 [func_retval0+0], %r1;
.loc 2 71 3
ret;
}
// Kernel entry point. For the tile selected by (%ctaid.x, %ctaid.y) it walks
// rows and 32-wide column groups, runs an escape-time iteration per pixel,
// and stores the resulting iteration count for every pixel inside the tile's
// x bound.
//
// Parameter roles, as read off their uses in the body:
//   param_0, param_1 : x origin and x step   (%f2 = x_index * param_1 + param_0)
//   param_2, param_3 : y origin and y step   (%f1 = y_index * param_3 + param_2)
//   param_4          : row stride in elements, and the upper clamp for x
//   param_5          : upper clamp for y
//   param_6, param_7 : tile extent in x and in y
//   param_8          : iteration cap of the inner loop
//   param_9          : generic address of the 32-bit output array
.visible .entry mandelbrot_scanline(
.param .f32 mandelbrot_scanline_param_0,
.param .f32 mandelbrot_scanline_param_1,
.param .f32 mandelbrot_scanline_param_2,
.param .f32 mandelbrot_scanline_param_3,
.param .u32 mandelbrot_scanline_param_4,
.param .u32 mandelbrot_scanline_param_5,
.param .u32 mandelbrot_scanline_param_6,
.param .u32 mandelbrot_scanline_param_7,
.param .u32 mandelbrot_scanline_param_8,
.param .u64 mandelbrot_scanline_param_9
)
{
// 8-byte local buffer: holds the two 32-bit vprintf arguments.
.local .align 8 .b8 __local_depot2[8];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .pred %p<9>;
.reg .s32 %r<40>;
.reg .f32 %f<20>;
.reg .s64 %rd<8>;
mov.u64 %SPL, __local_depot2;
cvta.local.u64 %SP, %SPL;
ld.param.f32 %f9, [mandelbrot_scanline_param_0];
ld.param.f32 %f10, [mandelbrot_scanline_param_1];
ld.param.f32 %f11, [mandelbrot_scanline_param_2];
ld.param.f32 %f12, [mandelbrot_scanline_param_3];
ld.param.u32 %r14, [mandelbrot_scanline_param_4];
ld.param.u32 %r17, [mandelbrot_scanline_param_5];
ld.param.u32 %r15, [mandelbrot_scanline_param_6];
ld.param.u32 %r18, [mandelbrot_scanline_param_7];
ld.param.u32 %r16, [mandelbrot_scanline_param_8];
ld.param.u64 %rd1, [mandelbrot_scanline_param_9];
add.u64 %rd2, %SP, 0;
.loc 1 35 1
cvta.to.local.u64 %rd3, %rd2;
// %r20 = %tid.x & 31: the thread's position within a group of 32. It is
// reused below as the per-thread x offset.
mov.u32 %r19, %tid.x;
and.b32 %r20, %r19, 31;
mov.u32 %r21, %ntid.x;
cvta.global.u64 %rd4, $str;
// Pack the two vprintf arguments {%tid.x & 31, %ntid.x} into the local buffer.
st.local.v2.u32 [%rd3], {%r20, %r21};
// Callseq Start 0
{
.reg .b32 temp_param_reg;
.param .b64 param0;
st.param.b64 [param0+0], %rd4;
.param .b64 param1;
st.param.b64 [param1+0], %rd2;
.param .b32 retval0;
.loc 1 35 1
call.uni (retval0),
vprintf,
(
param0,
param1
);
// The vprintf result lands in %r22, which is not read again in this kernel.
ld.param.b32 %r22, [retval0+0];
}
// Callseq End 0
.loc 1 36 1
mov.u32 %r23, %ctaid.x;
.loc 1 37 1
// %r1 = x end of the tile = min(ctaid.x * param_6 + param_6, param_4).
mad.lo.s32 %r24, %r23, %r15, %r15;
.loc 3 2621 10
min.s32 %r1, %r24, %r14;
.loc 1 39 1
// %r37 = current row, starting at ctaid.y * param_7.
mov.u32 %r25, %ctaid.y;
mul.lo.s32 %r37, %r25, %r18;
.loc 1 40 1
// %r3 = y end of the tile = min(row start + param_7, param_5).
add.s32 %r26, %r37, %r18;
.loc 3 2621 10
min.s32 %r3, %r26, %r17;
.loc 1 42 1
// Nothing to do when the tile has no rows.
setp.ge.s32 %p1, %r37, %r3;
@%p1 bra BB2_12;
cvta.to.global.u64 %rd5, %rd1;
// ---- Row loop head: one pass per row %r37 ----
BB2_2:
.loc 1 36 1
// %r38 = current column-group base, reset to ctaid.x * param_6 for each row.
mul.lo.s32 %r38, %r23, %r15;
.loc 1 43 1
// Skip the column loop when the tile has no columns.
setp.ge.s32 %p2, %r38, %r1;
@%p2 bra BB2_11;
.loc 1 46 1
// %f1 = y coordinate of this row = (float)row * param_3 + param_2.
cvt.rn.f32.s32 %f13, %r37;
fma.rn.f32 %f1, %f13, %f12, %f11;
// ---- Column loop head: one pass per group of 32 columns ----
BB2_4:
.loc 1 45 1
// %r7 = this thread's column; %f2 = its x coordinate. The column is
// converted as an unsigned integer.
add.s32 %r7, %r20, %r38;
cvt.rn.f32.u32 %f14, %r7;
fma.rn.f32 %f2, %f14, %f10, %f9;
// %r39 = iteration counter. It is stored as the result even when the loop
// body never runs (param_8 <= 0).
mov.u32 %r39, 0;
setp.gt.s32 %p3, %r16, 0;
.loc 1 12 1
@%p3 bra BB2_5;
bra.uni BB2_8;
BB2_5:
// Initial z: %f18 starts at the row's y coordinate, %f19 at the column's x.
mov.f32 %f18, %f1;
mov.f32 %f19, %f2;
// ---- Escape-time iteration ----
BB2_6:
.loc 1 13 1
mov.f32 %f4, %f19;
mov.f32 %f3, %f18;
// %f5 = %f3^2, %f6 = %f4^2; leave the loop once their sum exceeds 4.0
// (0f40800000 is 4.0f).
mul.f32 %f5, %f3, %f3;
mul.f32 %f6, %f4, %f4;
add.f32 %f15, %f6, %f5;
setp.gt.f32 %p4, %f15, 0f40800000;
@%p4 bra BB2_8;
.loc 1 16 1
// %f16 = %f4^2 - %f3^2.
sub.f32 %f16, %f6, %f5;
.loc 1 17 1
// %f17 = 2 * %f4.
add.f32 %f17, %f4, %f4;
.loc 1 19 1
// Next %f19 = x coordinate + (%f4^2 - %f3^2).
add.f32 %f7, %f2, %f16;
.loc 1 20 1
// Next %f18 = 2 * %f4 * %f3 + y coordinate (fused multiply-add).
fma.rn.f32 %f8, %f17, %f3, %f1;
.loc 1 12 96
add.s32 %r39, %r39, 1;
.loc 1 12 1
// Keep iterating while the counter is below param_8.
setp.lt.s32 %p5, %r39, %r16;
mov.f32 %f18, %f8;
mov.f32 %f19, %f7;
@%p5 bra BB2_6;
// ---- Store the iteration count ----
BB2_8:
.loc 1 49 1
// %r11 = output element index = row * param_4 + column-group base + %r20.
mad.lo.s32 %r34, %r37, %r14, %r38;
add.s32 %r11, %r34, %r20;
.loc 1 50 1
// Threads whose column is at or past the x end skip the store.
// NOTE(review): this bound check is unsigned (setp.ge.u32), while the loop
// bounds above and below use signed compares.
setp.ge.u32 %p6, %r7, %r1;
@%p6 bra BB2_10;
.loc 1 51 1
// Byte address = output base + index * 4.
mul.wide.s32 %rd6, %r11, 4;
add.s64 %rd7, %rd5, %rd6;
st.global.u32 [%rd7], %r39;
BB2_10:
.loc 1 43 57
// Advance to the next group of 32 columns (constant 32, matching the
// "& 31" mask applied to %tid.x above).
add.s32 %r38, %r38, 32;
.loc 1 43 1
setp.lt.s32 %p7, %r38, %r1;
@%p7 bra BB2_4;
BB2_11:
.loc 1 42 57
// Advance to the next row.
add.s32 %r37, %r37, 1;
.loc 1 42 1
setp.lt.s32 %p8, %r37, %r3;
@%p8 bra BB2_2;
BB2_12:
.loc 1 53 2
ret;
}

View File

@@ -1,676 +0,0 @@
; ModuleID = 'mandelbrot_task.bc'
; Target description: little-endian layout with 64-bit pointers, x86-64 Linux triple.
target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-unknown-linux-gnu"
; External functions, declared without bodies at this point.
; NOTE(review): presumably the ispc task-runtime hooks - confirm against the runtime sources.
declare i8* @ISPCAlloc(i8**, i64, i32)
declare void @ISPCLaunch(i8**, i8*, i8*, i32, i32, i32)
declare void @ISPCSync(i8*)
; x86 intrinsics called by the function below:
;   sse41.pminsd      - clamps the tile's x/y end against width/height
;   avx.movmsk.ps.256 - collapses an 8-lane mask to an integer for branch tests
;   avx.blendv.ps.256 - per-lane select when updating the iteration counter
declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone
declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone
define void @mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E_({ float, float, float, float, i32, i32, i32, i32, i32, i32*, <8 x i32> }*, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) {
allocas:
%x01 = getelementptr { float, float, float, float, i32, i32, i32, i32, i32, i32*, <8 x i32> }* %0, i64 0, i32 0
%x02 = load float* %x01, align 4
%dx3 = getelementptr { float, float, float, float, i32, i32, i32, i32, i32, i32*, <8 x i32> }* %0, i64 0, i32 1
%dx4 = load float* %dx3, align 4
%y05 = getelementptr { float, float, float, float, i32, i32, i32, i32, i32, i32*, <8 x i32> }* %0, i64 0, i32 2
%y06 = load float* %y05, align 4
%dy7 = getelementptr { float, float, float, float, i32, i32, i32, i32, i32, i32*, <8 x i32> }* %0, i64 0, i32 3
%dy8 = load float* %dy7, align 4
%width9 = getelementptr { float, float, float, float, i32, i32, i32, i32, i32, i32*, <8 x i32> }* %0, i64 0, i32 4
%width10 = load i32* %width9, align 4
%height11 = getelementptr { float, float, float, float, i32, i32, i32, i32, i32, i32*, <8 x i32> }* %0, i64 0, i32 5
%height12 = load i32* %height11, align 4
%xspan13 = getelementptr { float, float, float, float, i32, i32, i32, i32, i32, i32*, <8 x i32> }* %0, i64 0, i32 6
%xspan14 = load i32* %xspan13, align 4
%yspan15 = getelementptr { float, float, float, float, i32, i32, i32, i32, i32, i32*, <8 x i32> }* %0, i64 0, i32 7
%yspan16 = load i32* %yspan15, align 4
%maxIterations17 = getelementptr { float, float, float, float, i32, i32, i32, i32, i32, i32*, <8 x i32> }* %0, i64 0, i32 8
%maxIterations18 = load i32* %maxIterations17, align 4
%output19 = getelementptr { float, float, float, float, i32, i32, i32, i32, i32, i32*, <8 x i32> }* %0, i64 0, i32 9
%output20 = load i32** %output19, align 8
%task_struct_mask = getelementptr { float, float, float, float, i32, i32, i32, i32, i32, i32*, <8 x i32> }* %0, i64 0, i32 10
%mask = load <8 x i32>* %task_struct_mask, align 32
%floatmask.i = bitcast <8 x i32> %mask to <8 x float>
%v.i = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask.i)
%cmp.i = icmp eq i32 %v.i, 255
%mul_taskIndex0_load_xspan_load = mul i32 %xspan14, %5
%add_xstart_load_xspan_load23 = add i32 %mul_taskIndex0_load_xspan_load, %xspan14
%ret_veca.i.i = insertelement <4 x i32> undef, i32 %add_xstart_load_xspan_load23, i32 0
%ret_vecb.i.i = insertelement <4 x i32> undef, i32 %width10, i32 0
%ret_val.i.i = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %ret_veca.i.i, <4 x i32> %ret_vecb.i.i)
%ret.i.i = extractelement <4 x i32> %ret_val.i.i, i32 0
%mul_taskIndex1_load_yspan_load = mul i32 %yspan16, %6
%add_ystart_load_yspan_load26 = add i32 %mul_taskIndex1_load_yspan_load, %yspan16
%ret_veca.i.i220 = insertelement <4 x i32> undef, i32 %add_ystart_load_yspan_load26, i32 0
%ret_vecb.i.i221 = insertelement <4 x i32> undef, i32 %height12, i32 0
%ret_val.i.i222 = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %ret_veca.i.i220, <4 x i32> %ret_vecb.i.i221)
%ret.i.i223 = extractelement <4 x i32> %ret_val.i.i222, i32 0
%less_yi_load_yend_load345 = icmp slt i32 %mul_taskIndex1_load_yspan_load, %ret.i.i223
br i1 %cmp.i, label %for_test.preheader, label %for_test92.preheader
for_test92.preheader: ; preds = %allocas
br i1 %less_yi_load_yend_load345, label %for_test103.preheader.lr.ph, label %for_exit
for_test103.preheader.lr.ph: ; preds = %for_test92.preheader
%less_xi_load110_xend_load111360 = icmp slt i32 %mul_taskIndex0_load_xspan_load, %ret.i.i
%x0_load115_broadcast_init = insertelement <8 x float> undef, float %x02, i32 0
%x0_load115_broadcast = shufflevector <8 x float> %x0_load115_broadcast_init, <8 x float> undef, <8 x i32> zeroinitializer
%dx_load117_broadcast_init = insertelement <8 x float> undef, float %dx4, i32 0
%dx_load117_broadcast = shufflevector <8 x float> %dx_load117_broadcast_init, <8 x float> undef, <8 x i32> zeroinitializer
%maxIterations_load125_broadcast_init = insertelement <8 x i32> undef, i32 %maxIterations18, i32 0
%maxIterations_load125_broadcast = shufflevector <8 x i32> %maxIterations_load125_broadcast_init, <8 x i32> undef, <8 x i32> zeroinitializer
%less_i_load_count_load.i347 = icmp sgt <8 x i32> %maxIterations_load125_broadcast, zeroinitializer
%"oldMask&test.i348" = select <8 x i1> %less_i_load_count_load.i347, <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <8 x i32> zeroinitializer
%"internal_mask&function_mask10.i349" = and <8 x i32> %"oldMask&test.i348", %mask
%floatmask.i.i350 = bitcast <8 x i32> %"internal_mask&function_mask10.i349" to <8 x float>
%xend_load134_broadcast_init = insertelement <8 x i32> undef, i32 %ret.i.i, i32 0
%xend_load134_broadcast = shufflevector <8 x i32> %xend_load134_broadcast_init, <8 x i32> undef, <8 x i32> zeroinitializer
%output_load145_ptr2int_2void = bitcast i32* %output20 to i8*
br label %for_test103.preheader
for_test.preheader: ; preds = %allocas
br i1 %less_yi_load_yend_load345, label %for_test34.preheader.lr.ph, label %for_exit
for_test34.preheader.lr.ph: ; preds = %for_test.preheader
%less_xi_load_xend_load343 = icmp slt i32 %mul_taskIndex0_load_xspan_load, %ret.i.i
%x0_load_broadcast_init = insertelement <8 x float> undef, float %x02, i32 0
%x0_load_broadcast = shufflevector <8 x float> %x0_load_broadcast_init, <8 x float> undef, <8 x i32> zeroinitializer
%dx_load_broadcast_init = insertelement <8 x float> undef, float %dx4, i32 0
%dx_load_broadcast = shufflevector <8 x float> %dx_load_broadcast_init, <8 x float> undef, <8 x i32> zeroinitializer
%maxIterations_load_broadcast_init = insertelement <8 x i32> undef, i32 %maxIterations18, i32 0
%maxIterations_load_broadcast = shufflevector <8 x i32> %maxIterations_load_broadcast_init, <8 x i32> undef, <8 x i32> zeroinitializer
%less_i_load_count_load.i181332 = icmp sgt <8 x i32> %maxIterations_load_broadcast, zeroinitializer
%"oldMask&test.i182333" = select <8 x i1> %less_i_load_count_load.i181332, <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <8 x i32> zeroinitializer
%floatmask.i.i183334 = bitcast <8 x i32> %"oldMask&test.i182333" to <8 x float>
%xend_load51_broadcast_init = insertelement <8 x i32> undef, i32 %ret.i.i, i32 0
%xend_load51_broadcast = shufflevector <8 x i32> %xend_load51_broadcast_init, <8 x i32> undef, <8 x i32> zeroinitializer
%output_load_ptr2int_2void = bitcast i32* %output20 to i8*
br label %for_test34.preheader
for_test34.preheader: ; preds = %for_exit37, %for_test34.preheader.lr.ph
%yi.0346 = phi i32 [ %mul_taskIndex1_load_yspan_load, %for_test34.preheader.lr.ph ], [ %yi_load69_plus1, %for_exit37 ]
br i1 %less_xi_load_xend_load343, label %for_loop36.lr.ph, label %for_exit37
for_loop36.lr.ph: ; preds = %for_test34.preheader
%yi_load43_to_float = sitofp i32 %yi.0346 to float
%mul_yi_load43_to_float_dy_load = fmul float %dy8, %yi_load43_to_float
%add_y0_load_mul_yi_load43_to_float_dy_load = fadd float %y06, %mul_yi_load43_to_float_dy_load
%add_y0_load_mul_yi_load43_to_float_dy_load_broadcast_init = insertelement <8 x float> undef, float %add_y0_load_mul_yi_load43_to_float_dy_load, i32 0
%add_y0_load_mul_yi_load43_to_float_dy_load_broadcast = shufflevector <8 x float> %add_y0_load_mul_yi_load43_to_float_dy_load_broadcast_init, <8 x float> undef, <8 x i32> zeroinitializer
%mul_yi_load47_width_load48 = mul i32 %yi.0346, %width10
%mul_yi_load47_width_load48_broadcast_init = insertelement <8 x i32> undef, i32 %mul_yi_load47_width_load48, i32 0
%mul_yi_load47_width_load48_broadcast = shufflevector <8 x i32> %mul_yi_load47_width_load48_broadcast_init, <8 x i32> undef, <8 x i32> zeroinitializer
br label %for_loop36
for_exit: ; preds = %for_exit106, %for_exit37, %for_test.preheader, %for_test92.preheader
ret void
for_loop36: ; preds = %safe_if_after_true, %for_loop36.lr.ph
%xi.0344 = phi i32 [ %mul_taskIndex0_load_xspan_load, %for_loop36.lr.ph ], [ %add_xi_load68_, %safe_if_after_true ]
%xi_load42_broadcast_init = insertelement <8 x i32> undef, i32 %xi.0344, i32 0
%xi_load42_broadcast = shufflevector <8 x i32> %xi_load42_broadcast_init, <8 x i32> undef, <8 x i32> zeroinitializer
%add_xi_load42_broadcast_ = add <8 x i32> %xi_load42_broadcast, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%add_xi_load42_broadcast__to_float = sitofp <8 x i32> %add_xi_load42_broadcast_ to <8 x float>
%mul_add_xi_load42_broadcast__to_float_dx_load_broadcast = fmul <8 x float> %dx_load_broadcast, %add_xi_load42_broadcast__to_float
%add_x0_load_broadcast_mul_add_xi_load42_broadcast__to_float_dx_load_broadcast = fadd <8 x float> %x0_load_broadcast, %mul_add_xi_load42_broadcast__to_float_dx_load_broadcast
%v.i.i184335 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask.i.i183334)
%cmp.i.i185336 = icmp eq i32 %v.i.i184335, 0
br i1 %cmp.i.i185336, label %mandel___vyfvyfvyi.exit219, label %for_loop.i207
for_step.i192: ; preds = %not_all_continued_or_breaked.i218, %for_loop.i207
%z_re.1.i187 = phi <8 x float> [ %z_re.0.i176338, %for_loop.i207 ], [ %add_c_re_load42_new_re_load.i216, %not_all_continued_or_breaked.i218 ]
%z_im.1.i188 = phi <8 x float> [ %z_im.0.i177339, %for_loop.i207 ], [ %add_c_im_load44_new_im_load.i217, %not_all_continued_or_breaked.i218 ]
%internal_mask_memory.1.i189 = phi <8 x i32> [ zeroinitializer, %for_loop.i207 ], [ %new_mask28.i210, %not_all_continued_or_breaked.i218 ]
%i_load53_plus1.i191 = add <8 x i32> %blendAsInt.i328337, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%mask_as_float.i = bitcast <8 x i32> %internal_mask_memory.1.i189 to <8 x float>
%oldAsFloat.i = bitcast <8 x i32> %blendAsInt.i328337 to <8 x float>
%newAsFloat.i = bitcast <8 x i32> %i_load53_plus1.i191 to <8 x float>
%blend.i = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %oldAsFloat.i, <8 x float> %newAsFloat.i, <8 x float> %mask_as_float.i)
%blendAsInt.i = bitcast <8 x float> %blend.i to <8 x i32>
%less_i_load_count_load.i181 = icmp slt <8 x i32> %blendAsInt.i, %maxIterations_load_broadcast
%"oldMask&test.i182" = select <8 x i1> %less_i_load_count_load.i181, <8 x i32> %internal_mask_memory.1.i189, <8 x i32> zeroinitializer
%floatmask.i.i183 = bitcast <8 x i32> %"oldMask&test.i182" to <8 x float>
%v.i.i184 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask.i.i183)
%cmp.i.i185 = icmp eq i32 %v.i.i184, 0
br i1 %cmp.i.i185, label %mandel___vyfvyfvyi.exit219, label %for_loop.i207
for_loop.i207: ; preds = %for_step.i192, %for_loop36
%v.i.i184342 = phi i32 [ %v.i.i184, %for_step.i192 ], [ %v.i.i184335, %for_loop36 ]
%"oldMask&test.i182341" = phi <8 x i32> [ %"oldMask&test.i182", %for_step.i192 ], [ %"oldMask&test.i182333", %for_loop36 ]
%break_lanes_memory.0.i178340 = phi <8 x i32> [ %"mask|break_mask.i198", %for_step.i192 ], [ zeroinitializer, %for_loop36 ]
%z_im.0.i177339 = phi <8 x float> [ %z_im.1.i188, %for_step.i192 ], [ %add_y0_load_mul_yi_load43_to_float_dy_load_broadcast, %for_loop36 ]
%z_re.0.i176338 = phi <8 x float> [ %z_re.1.i187, %for_step.i192 ], [ %add_x0_load_broadcast_mul_add_xi_load42_broadcast__to_float_dx_load_broadcast, %for_loop36 ]
%blendAsInt.i328337 = phi <8 x i32> [ %blendAsInt.i, %for_step.i192 ], [ zeroinitializer, %for_loop36 ]
%mul_z_re_load_z_re_load13.i193 = fmul <8 x float> %z_re.0.i176338, %z_re.0.i176338
%mul_z_im_load_z_im_load14.i194 = fmul <8 x float> %z_im.0.i177339, %z_im.0.i177339
%add_mul_z_re_load_z_re_load13_mul_z_im_load_z_im_load14.i195 = fadd <8 x float> %mul_z_re_load_z_re_load13.i193, %mul_z_im_load_z_im_load14.i194
%greater_add_mul_z_re_load_z_re_load13_mul_z_im_load_z_im_load14_.i196 = fcmp ugt <8 x float> %add_mul_z_re_load_z_re_load13_mul_z_im_load_z_im_load14.i195, <float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00>
%"oldMask&test16.i197" = select <8 x i1> %greater_add_mul_z_re_load_z_re_load13_mul_z_im_load_z_im_load14_.i196, <8 x i32> %"oldMask&test.i182341", <8 x i32> zeroinitializer
%"mask|break_mask.i198" = or <8 x i32> %"oldMask&test16.i197", %break_lanes_memory.0.i178340
%floatmask.i67.i200 = bitcast <8 x i32> %"mask|break_mask.i198" to <8 x float>
%v.i68.i201 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask.i67.i200)
%"equal_finished&func_internal_mask&function_mask12.i206" = icmp eq i32 %v.i68.i201, %v.i.i184342
br i1 %"equal_finished&func_internal_mask&function_mask12.i206", label %for_step.i192, label %not_all_continued_or_breaked.i218
not_all_continued_or_breaked.i218: ; preds = %for_loop.i207
%"!(break|continue)_lanes.i209" = xor <8 x i32> %"mask|break_mask.i198", <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
%new_mask28.i210 = and <8 x i32> %"oldMask&test.i182341", %"!(break|continue)_lanes.i209"
%sub_mul_z_re_load31_z_re_load32_mul_z_im_load33_z_im_load34.i213 = fsub <8 x float> %mul_z_re_load_z_re_load13.i193, %mul_z_im_load_z_im_load14.i194
%mul__z_re_load35.i214 = fmul <8 x float> %z_re.0.i176338, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
%mul_mul__z_re_load35_z_im_load36.i215 = fmul <8 x float> %mul__z_re_load35.i214, %z_im.0.i177339
%add_c_re_load42_new_re_load.i216 = fadd <8 x float> %add_x0_load_broadcast_mul_add_xi_load42_broadcast__to_float_dx_load_broadcast, %sub_mul_z_re_load31_z_re_load32_mul_z_im_load33_z_im_load34.i213
%add_c_im_load44_new_im_load.i217 = fadd <8 x float> %add_y0_load_mul_yi_load43_to_float_dy_load_broadcast, %mul_mul__z_re_load35_z_im_load36.i215
br label %for_step.i192
mandel___vyfvyfvyi.exit219: ; preds = %for_step.i192, %for_loop36
%blendAsInt.i328.lcssa = phi <8 x i32> [ zeroinitializer, %for_loop36 ], [ %blendAsInt.i, %for_step.i192 ]
%less_add_xi_load50_broadcast__xend_load51_broadcast = icmp slt <8 x i32> %add_xi_load42_broadcast_, %xend_load51_broadcast
%floatmask.i172 = select <8 x i1> %less_add_xi_load50_broadcast__xend_load51_broadcast, <8 x float> <float 0xFFFFFFFFE0000000, float 0xFFFFFFFFE0000000, float 0xFFFFFFFFE0000000, float 0xFFFFFFFFE0000000, float 0xFFFFFFFFE0000000, float 0xFFFFFFFFE0000000, float 0xFFFFFFFFE0000000, float 0xFFFFFFFFE0000000>, <8 x float> zeroinitializer
%v.i173 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask.i172)
%cmp.i174 = icmp eq i32 %v.i173, 0
br i1 %cmp.i174, label %safe_if_after_true, label %safe_if_run_true
for_exit37: ; preds = %safe_if_after_true, %for_test34.preheader
%yi_load69_plus1 = add i32 %yi.0346, 1
%exitcond = icmp eq i32 %yi_load69_plus1, %ret.i.i223
br i1 %exitcond, label %for_exit, label %for_test34.preheader
safe_if_after_true: ; preds = %pl_dolane.7.i326, %pl_loopend.6.i318, %mandel___vyfvyfvyi.exit219
%add_xi_load68_ = add i32 %xi.0344, 8
%less_xi_load_xend_load = icmp slt i32 %add_xi_load68_, %ret.i.i
br i1 %less_xi_load_xend_load, label %for_loop36, label %for_exit37
safe_if_run_true: ; preds = %mandel___vyfvyfvyi.exit219
%add_mul_yi_load47_width_load48_broadcast_xi_load49_broadcast = add <8 x i32> %mul_yi_load47_width_load48_broadcast, %xi_load42_broadcast
%v.i.i239 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask.i172)
%v64.i.i240 = zext i32 %v.i.i239 to i64
%pl_and.i241 = and i64 %v64.i.i240, 1
%pl_doit.i242 = icmp eq i64 %pl_and.i241, 0
br i1 %pl_doit.i242, label %pl_loopend.i252, label %pl_dolane.i249
pl_dolane.i249: ; preds = %safe_if_run_true
%offset32.i.i243 = extractelement <8 x i32> %add_mul_yi_load47_width_load48_broadcast_xi_load49_broadcast, i32 0
%offset64.i.i244 = sext i32 %offset32.i.i243 to i64
%finalptr.i.i246331 = getelementptr i32* %output20, i64 %offset64.i.i244
%storeval.i.i248 = extractelement <8 x i32> %blendAsInt.i328.lcssa, i32 0
store i32 %storeval.i.i248, i32* %finalptr.i.i246331, align 4
br label %pl_loopend.i252
pl_loopend.i252: ; preds = %pl_dolane.i249, %safe_if_run_true
%pl_and.1.i250 = and i64 %v64.i.i240, 2
%pl_doit.1.i251 = icmp eq i64 %pl_and.1.i250, 0
br i1 %pl_doit.1.i251, label %pl_loopend.1.i263, label %pl_dolane.1.i260
pl_dolane.1.i260: ; preds = %pl_loopend.i252
%offset32.i.1.i253 = extractelement <8 x i32> %add_mul_yi_load47_width_load48_broadcast_xi_load49_broadcast, i32 1
%offset64.i.1.i254 = sext i32 %offset32.i.1.i253 to i64
%offset.i.1.i255 = shl nsw i64 %offset64.i.1.i254, 2
%ptroffset.sum.i.1.i256 = add i64 %offset.i.1.i255, 8
%finalptr.i.1.i257 = getelementptr i8* %output_load_ptr2int_2void, i64 %ptroffset.sum.i.1.i256
%ptrcast.i.1.i258 = bitcast i8* %finalptr.i.1.i257 to i32*
%storeval.i.1.i259 = extractelement <8 x i32> %blendAsInt.i328.lcssa, i32 1
store i32 %storeval.i.1.i259, i32* %ptrcast.i.1.i258, align 4
br label %pl_loopend.1.i263
pl_loopend.1.i263: ; preds = %pl_dolane.1.i260, %pl_loopend.i252
%pl_and.2.i261 = and i64 %v64.i.i240, 4
%pl_doit.2.i262 = icmp eq i64 %pl_and.2.i261, 0
br i1 %pl_doit.2.i262, label %pl_loopend.2.i274, label %pl_dolane.2.i271
pl_dolane.2.i271: ; preds = %pl_loopend.1.i263
%offset32.i.2.i264 = extractelement <8 x i32> %add_mul_yi_load47_width_load48_broadcast_xi_load49_broadcast, i32 2
%offset64.i.2.i265 = sext i32 %offset32.i.2.i264 to i64
%offset.i.2.i266 = shl nsw i64 %offset64.i.2.i265, 2
%ptroffset.sum.i.2.i267 = add i64 %offset.i.2.i266, 16
%finalptr.i.2.i268 = getelementptr i8* %output_load_ptr2int_2void, i64 %ptroffset.sum.i.2.i267
%ptrcast.i.2.i269 = bitcast i8* %finalptr.i.2.i268 to i32*
%storeval.i.2.i270 = extractelement <8 x i32> %blendAsInt.i328.lcssa, i32 2
store i32 %storeval.i.2.i270, i32* %ptrcast.i.2.i269, align 4
br label %pl_loopend.2.i274
pl_loopend.2.i274: ; preds = %pl_dolane.2.i271, %pl_loopend.1.i263
%pl_and.3.i272 = and i64 %v64.i.i240, 8
%pl_doit.3.i273 = icmp eq i64 %pl_and.3.i272, 0
br i1 %pl_doit.3.i273, label %pl_loopend.3.i285, label %pl_dolane.3.i282
pl_dolane.3.i282: ; preds = %pl_loopend.2.i274
%offset32.i.3.i275 = extractelement <8 x i32> %add_mul_yi_load47_width_load48_broadcast_xi_load49_broadcast, i32 3
%offset64.i.3.i276 = sext i32 %offset32.i.3.i275 to i64
%offset.i.3.i277 = shl nsw i64 %offset64.i.3.i276, 2
%ptroffset.sum.i.3.i278 = add i64 %offset.i.3.i277, 24
%finalptr.i.3.i279 = getelementptr i8* %output_load_ptr2int_2void, i64 %ptroffset.sum.i.3.i278
%ptrcast.i.3.i280 = bitcast i8* %finalptr.i.3.i279 to i32*
%storeval.i.3.i281 = extractelement <8 x i32> %blendAsInt.i328.lcssa, i32 3
store i32 %storeval.i.3.i281, i32* %ptrcast.i.3.i280, align 4
br label %pl_loopend.3.i285
pl_loopend.3.i285: ; preds = %pl_dolane.3.i282, %pl_loopend.2.i274
%pl_and.4.i283 = and i64 %v64.i.i240, 16
%pl_doit.4.i284 = icmp eq i64 %pl_and.4.i283, 0
br i1 %pl_doit.4.i284, label %pl_loopend.4.i296, label %pl_dolane.4.i293
pl_dolane.4.i293: ; preds = %pl_loopend.3.i285
%offset32.i.4.i286 = extractelement <8 x i32> %add_mul_yi_load47_width_load48_broadcast_xi_load49_broadcast, i32 4
%offset64.i.4.i287 = sext i32 %offset32.i.4.i286 to i64
%offset.i.4.i288 = shl nsw i64 %offset64.i.4.i287, 2
%ptroffset.sum.i.4.i289 = add i64 %offset.i.4.i288, 32
%finalptr.i.4.i290 = getelementptr i8* %output_load_ptr2int_2void, i64 %ptroffset.sum.i.4.i289
%ptrcast.i.4.i291 = bitcast i8* %finalptr.i.4.i290 to i32*
%storeval.i.4.i292 = extractelement <8 x i32> %blendAsInt.i328.lcssa, i32 4
store i32 %storeval.i.4.i292, i32* %ptrcast.i.4.i291, align 4
br label %pl_loopend.4.i296
pl_loopend.4.i296: ; preds = %pl_dolane.4.i293, %pl_loopend.3.i285
%pl_and.5.i294 = and i64 %v64.i.i240, 32
%pl_doit.5.i295 = icmp eq i64 %pl_and.5.i294, 0
br i1 %pl_doit.5.i295, label %pl_loopend.5.i307, label %pl_dolane.5.i304
pl_dolane.5.i304: ; preds = %pl_loopend.4.i296
%offset32.i.5.i297 = extractelement <8 x i32> %add_mul_yi_load47_width_load48_broadcast_xi_load49_broadcast, i32 5
%offset64.i.5.i298 = sext i32 %offset32.i.5.i297 to i64
%offset.i.5.i299 = shl nsw i64 %offset64.i.5.i298, 2
%ptroffset.sum.i.5.i300 = add i64 %offset.i.5.i299, 40
%finalptr.i.5.i301 = getelementptr i8* %output_load_ptr2int_2void, i64 %ptroffset.sum.i.5.i300
%ptrcast.i.5.i302 = bitcast i8* %finalptr.i.5.i301 to i32*
%storeval.i.5.i303 = extractelement <8 x i32> %blendAsInt.i328.lcssa, i32 5
store i32 %storeval.i.5.i303, i32* %ptrcast.i.5.i302, align 4
br label %pl_loopend.5.i307
pl_loopend.5.i307: ; preds = %pl_dolane.5.i304, %pl_loopend.4.i296
%pl_and.6.i305 = and i64 %v64.i.i240, 64
%pl_doit.6.i306 = icmp eq i64 %pl_and.6.i305, 0
br i1 %pl_doit.6.i306, label %pl_loopend.6.i318, label %pl_dolane.6.i315
pl_dolane.6.i315: ; preds = %pl_loopend.5.i307
%offset32.i.6.i308 = extractelement <8 x i32> %add_mul_yi_load47_width_load48_broadcast_xi_load49_broadcast, i32 6
%offset64.i.6.i309 = sext i32 %offset32.i.6.i308 to i64
%offset.i.6.i310 = shl nsw i64 %offset64.i.6.i309, 2
%ptroffset.sum.i.6.i311 = add i64 %offset.i.6.i310, 48
%finalptr.i.6.i312 = getelementptr i8* %output_load_ptr2int_2void, i64 %ptroffset.sum.i.6.i311
%ptrcast.i.6.i313 = bitcast i8* %finalptr.i.6.i312 to i32*
%storeval.i.6.i314 = extractelement <8 x i32> %blendAsInt.i328.lcssa, i32 6
store i32 %storeval.i.6.i314, i32* %ptrcast.i.6.i313, align 4
br label %pl_loopend.6.i318
pl_loopend.6.i318: ; preds = %pl_dolane.6.i315, %pl_loopend.5.i307
%pl_and.7.i316 = and i64 %v64.i.i240, 128
%pl_doit.7.i317 = icmp eq i64 %pl_and.7.i316, 0
br i1 %pl_doit.7.i317, label %safe_if_after_true, label %pl_dolane.7.i326
pl_dolane.7.i326: ; preds = %pl_loopend.6.i318
%offset32.i.7.i319 = extractelement <8 x i32> %add_mul_yi_load47_width_load48_broadcast_xi_load49_broadcast, i32 7
%offset64.i.7.i320 = sext i32 %offset32.i.7.i319 to i64
%offset.i.7.i321 = shl nsw i64 %offset64.i.7.i320, 2
%ptroffset.sum.i.7.i322 = add i64 %offset.i.7.i321, 56
%finalptr.i.7.i323 = getelementptr i8* %output_load_ptr2int_2void, i64 %ptroffset.sum.i.7.i322
%ptrcast.i.7.i324 = bitcast i8* %finalptr.i.7.i323 to i32*
%storeval.i.7.i325 = extractelement <8 x i32> %blendAsInt.i328.lcssa, i32 7
store i32 %storeval.i.7.i325, i32* %ptrcast.i.7.i324, align 4
br label %safe_if_after_true
for_test103.preheader: ; preds = %for_exit106, %for_test103.preheader.lr.ph
%yi97.0364 = phi i32 [ %mul_taskIndex1_load_yspan_load, %for_test103.preheader.lr.ph ], [ %yi_load164_plus1, %for_exit106 ]
br i1 %less_xi_load110_xend_load111360, label %for_loop105.lr.ph, label %for_exit106
for_loop105.lr.ph: ; preds = %for_test103.preheader
%yi_load120_to_float = sitofp i32 %yi97.0364 to float
%mul_yi_load120_to_float_dy_load121 = fmul float %dy8, %yi_load120_to_float
%add_y0_load119_mul_yi_load120_to_float_dy_load121 = fadd float %y06, %mul_yi_load120_to_float_dy_load121
%add_y0_load119_mul_yi_load120_to_float_dy_load121_broadcast_init = insertelement <8 x float> undef, float %add_y0_load119_mul_yi_load120_to_float_dy_load121, i32 0
%add_y0_load119_mul_yi_load120_to_float_dy_load121_broadcast = shufflevector <8 x float> %add_y0_load119_mul_yi_load120_to_float_dy_load121_broadcast_init, <8 x float> undef, <8 x i32> zeroinitializer
%mul_yi_load130_width_load131 = mul i32 %yi97.0364, %width10
%mul_yi_load130_width_load131_broadcast_init = insertelement <8 x i32> undef, i32 %mul_yi_load130_width_load131, i32 0
%mul_yi_load130_width_load131_broadcast = shufflevector <8 x i32> %mul_yi_load130_width_load131_broadcast_init, <8 x i32> undef, <8 x i32> zeroinitializer
br label %for_loop105
for_loop105: ; preds = %safe_if_after_true137, %for_loop105.lr.ph
%xi108.0361 = phi i32 [ %mul_taskIndex0_load_xspan_load, %for_loop105.lr.ph ], [ %add_xi108_load_, %safe_if_after_true137 ]
%xi_load116_broadcast_init = insertelement <8 x i32> undef, i32 %xi108.0361, i32 0
%xi_load116_broadcast = shufflevector <8 x i32> %xi_load116_broadcast_init, <8 x i32> undef, <8 x i32> zeroinitializer
%add_xi_load116_broadcast_ = add <8 x i32> %xi_load116_broadcast, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%add_xi_load116_broadcast__to_float = sitofp <8 x i32> %add_xi_load116_broadcast_ to <8 x float>
%mul_add_xi_load116_broadcast__to_float_dx_load117_broadcast = fmul <8 x float> %dx_load117_broadcast, %add_xi_load116_broadcast__to_float
%add_x0_load115_broadcast_mul_add_xi_load116_broadcast__to_float_dx_load117_broadcast = fadd <8 x float> %x0_load115_broadcast, %mul_add_xi_load116_broadcast__to_float_dx_load117_broadcast
%v.i.i351 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask.i.i350)
%cmp.i.i352 = icmp eq i32 %v.i.i351, 0
br i1 %cmp.i.i352, label %mandel___vyfvyfvyi.exit, label %for_loop.i
for_step.i: ; preds = %not_all_continued_or_breaked.i, %for_loop.i
%z_re.1.i = phi <8 x float> [ %z_re.0.i354, %for_loop.i ], [ %add_c_re_load42_new_re_load.i, %not_all_continued_or_breaked.i ]
%z_im.1.i = phi <8 x float> [ %z_im.0.i355, %for_loop.i ], [ %add_c_im_load44_new_im_load.i, %not_all_continued_or_breaked.i ]
%internal_mask_memory.1.i = phi <8 x i32> [ zeroinitializer, %for_loop.i ], [ %new_mask28.i, %not_all_continued_or_breaked.i ]
%i_load53_plus1.i = add <8 x i32> %blendAsInt.i237329353, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%mask_as_float.i232 = bitcast <8 x i32> %internal_mask_memory.1.i to <8 x float>
%oldAsFloat.i234 = bitcast <8 x i32> %blendAsInt.i237329353 to <8 x float>
%newAsFloat.i235 = bitcast <8 x i32> %i_load53_plus1.i to <8 x float>
%blend.i236 = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %oldAsFloat.i234, <8 x float> %newAsFloat.i235, <8 x float> %mask_as_float.i232)
%blendAsInt.i237 = bitcast <8 x float> %blend.i236 to <8 x i32>
%less_i_load_count_load.i = icmp slt <8 x i32> %blendAsInt.i237, %maxIterations_load125_broadcast
%"oldMask&test.i" = select <8 x i1> %less_i_load_count_load.i, <8 x i32> %internal_mask_memory.1.i, <8 x i32> zeroinitializer
%"internal_mask&function_mask10.i" = and <8 x i32> %"oldMask&test.i", %mask
%floatmask.i.i = bitcast <8 x i32> %"internal_mask&function_mask10.i" to <8 x float>
%v.i.i = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask.i.i)
%cmp.i.i = icmp eq i32 %v.i.i, 0
br i1 %cmp.i.i, label %mandel___vyfvyfvyi.exit, label %for_loop.i
for_loop.i: ; preds = %for_step.i, %for_loop105
%v.i.i358 = phi i32 [ %v.i.i, %for_step.i ], [ %v.i.i351, %for_loop105 ]
%"oldMask&test.i357" = phi <8 x i32> [ %"oldMask&test.i", %for_step.i ], [ %"oldMask&test.i348", %for_loop105 ]
%break_lanes_memory.0.i356 = phi <8 x i32> [ %"mask|break_mask.i", %for_step.i ], [ zeroinitializer, %for_loop105 ]
%z_im.0.i355 = phi <8 x float> [ %z_im.1.i, %for_step.i ], [ %add_y0_load119_mul_yi_load120_to_float_dy_load121_broadcast, %for_loop105 ]
%z_re.0.i354 = phi <8 x float> [ %z_re.1.i, %for_step.i ], [ %add_x0_load115_broadcast_mul_add_xi_load116_broadcast__to_float_dx_load117_broadcast, %for_loop105 ]
%blendAsInt.i237329353 = phi <8 x i32> [ %blendAsInt.i237, %for_step.i ], [ zeroinitializer, %for_loop105 ]
%mul_z_re_load_z_re_load13.i = fmul <8 x float> %z_re.0.i354, %z_re.0.i354
%mul_z_im_load_z_im_load14.i = fmul <8 x float> %z_im.0.i355, %z_im.0.i355
%add_mul_z_re_load_z_re_load13_mul_z_im_load_z_im_load14.i = fadd <8 x float> %mul_z_re_load_z_re_load13.i, %mul_z_im_load_z_im_load14.i
%greater_add_mul_z_re_load_z_re_load13_mul_z_im_load_z_im_load14_.i = fcmp ugt <8 x float> %add_mul_z_re_load_z_re_load13_mul_z_im_load_z_im_load14.i, <float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00>
%"oldMask&test16.i" = select <8 x i1> %greater_add_mul_z_re_load_z_re_load13_mul_z_im_load_z_im_load14_.i, <8 x i32> %"oldMask&test.i357", <8 x i32> zeroinitializer
%"mask|break_mask.i" = or <8 x i32> %"oldMask&test16.i", %break_lanes_memory.0.i356
%"finished&func.i" = and <8 x i32> %"mask|break_mask.i", %mask
%floatmask.i67.i = bitcast <8 x i32> %"finished&func.i" to <8 x float>
%v.i68.i = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask.i67.i)
%"equal_finished&func_internal_mask&function_mask12.i" = icmp eq i32 %v.i68.i, %v.i.i358
br i1 %"equal_finished&func_internal_mask&function_mask12.i", label %for_step.i, label %not_all_continued_or_breaked.i
not_all_continued_or_breaked.i: ; preds = %for_loop.i
%"!(break|continue)_lanes.i" = xor <8 x i32> %"mask|break_mask.i", <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
%new_mask28.i = and <8 x i32> %"oldMask&test.i357", %"!(break|continue)_lanes.i"
%sub_mul_z_re_load31_z_re_load32_mul_z_im_load33_z_im_load34.i = fsub <8 x float> %mul_z_re_load_z_re_load13.i, %mul_z_im_load_z_im_load14.i
%mul__z_re_load35.i = fmul <8 x float> %z_re.0.i354, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
%mul_mul__z_re_load35_z_im_load36.i = fmul <8 x float> %mul__z_re_load35.i, %z_im.0.i355
%add_c_re_load42_new_re_load.i = fadd <8 x float> %add_x0_load115_broadcast_mul_add_xi_load116_broadcast__to_float_dx_load117_broadcast, %sub_mul_z_re_load31_z_re_load32_mul_z_im_load33_z_im_load34.i
%add_c_im_load44_new_im_load.i = fadd <8 x float> %add_y0_load119_mul_yi_load120_to_float_dy_load121_broadcast, %mul_mul__z_re_load35_z_im_load36.i
br label %for_step.i
mandel___vyfvyfvyi.exit: ; preds = %for_step.i, %for_loop105
%blendAsInt.i237329.lcssa = phi <8 x i32> [ zeroinitializer, %for_loop105 ], [ %blendAsInt.i237, %for_step.i ]
%less_add_xi_load133_broadcast__xend_load134_broadcast = icmp slt <8 x i32> %add_xi_load116_broadcast_, %xend_load134_broadcast
%"oldMask&test139" = select <8 x i1> %less_add_xi_load133_broadcast__xend_load134_broadcast, <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <8 x i32> zeroinitializer
%"internal_mask&function_mask143" = and <8 x i32> %"oldMask&test139", %mask
%floatmask.i169 = bitcast <8 x i32> %"internal_mask&function_mask143" to <8 x float>
%v.i170 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask.i169)
%cmp.i171 = icmp eq i32 %v.i170, 0
br i1 %cmp.i171, label %safe_if_after_true137, label %safe_if_run_true138
for_exit106: ; preds = %safe_if_after_true137, %for_test103.preheader
%yi_load164_plus1 = add i32 %yi97.0364, 1
%exitcond365 = icmp eq i32 %yi_load164_plus1, %ret.i.i223
br i1 %exitcond365, label %for_exit, label %for_test103.preheader
safe_if_after_true137: ; preds = %pl_dolane.7.i, %pl_loopend.6.i, %mandel___vyfvyfvyi.exit
%add_xi108_load_ = add i32 %xi108.0361, 8
%less_xi_load110_xend_load111 = icmp slt i32 %add_xi108_load_, %ret.i.i
br i1 %less_xi_load110_xend_load111, label %for_loop105, label %for_exit106
safe_if_run_true138: ; preds = %mandel___vyfvyfvyi.exit
%add_mul_yi_load130_width_load131_broadcast_xi_load132_broadcast = add <8 x i32> %mul_yi_load130_width_load131_broadcast, %xi_load116_broadcast
%v.i.i231 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask.i169)
%v64.i.i = zext i32 %v.i.i231 to i64
%pl_and.i = and i64 %v64.i.i, 1
%pl_doit.i = icmp eq i64 %pl_and.i, 0
br i1 %pl_doit.i, label %pl_loopend.i, label %pl_dolane.i
pl_dolane.i: ; preds = %safe_if_run_true138
%offset32.i.i = extractelement <8 x i32> %add_mul_yi_load130_width_load131_broadcast_xi_load132_broadcast, i32 0
%offset64.i.i = sext i32 %offset32.i.i to i64
%finalptr.i.i330 = getelementptr i32* %output20, i64 %offset64.i.i
%storeval.i.i = extractelement <8 x i32> %blendAsInt.i237329.lcssa, i32 0
store i32 %storeval.i.i, i32* %finalptr.i.i330, align 4
br label %pl_loopend.i
pl_loopend.i: ; preds = %pl_dolane.i, %safe_if_run_true138
%pl_and.1.i = and i64 %v64.i.i, 2
%pl_doit.1.i = icmp eq i64 %pl_and.1.i, 0
br i1 %pl_doit.1.i, label %pl_loopend.1.i, label %pl_dolane.1.i
pl_dolane.1.i: ; preds = %pl_loopend.i
%offset32.i.1.i = extractelement <8 x i32> %add_mul_yi_load130_width_load131_broadcast_xi_load132_broadcast, i32 1
%offset64.i.1.i = sext i32 %offset32.i.1.i to i64
%offset.i.1.i = shl nsw i64 %offset64.i.1.i, 2
%ptroffset.sum.i.1.i = add i64 %offset.i.1.i, 8
%finalptr.i.1.i = getelementptr i8* %output_load145_ptr2int_2void, i64 %ptroffset.sum.i.1.i
%ptrcast.i.1.i = bitcast i8* %finalptr.i.1.i to i32*
%storeval.i.1.i = extractelement <8 x i32> %blendAsInt.i237329.lcssa, i32 1
store i32 %storeval.i.1.i, i32* %ptrcast.i.1.i, align 4
br label %pl_loopend.1.i
pl_loopend.1.i: ; preds = %pl_dolane.1.i, %pl_loopend.i
%pl_and.2.i = and i64 %v64.i.i, 4
%pl_doit.2.i = icmp eq i64 %pl_and.2.i, 0
br i1 %pl_doit.2.i, label %pl_loopend.2.i, label %pl_dolane.2.i
pl_dolane.2.i: ; preds = %pl_loopend.1.i
%offset32.i.2.i = extractelement <8 x i32> %add_mul_yi_load130_width_load131_broadcast_xi_load132_broadcast, i32 2
%offset64.i.2.i = sext i32 %offset32.i.2.i to i64
%offset.i.2.i = shl nsw i64 %offset64.i.2.i, 2
%ptroffset.sum.i.2.i = add i64 %offset.i.2.i, 16
%finalptr.i.2.i = getelementptr i8* %output_load145_ptr2int_2void, i64 %ptroffset.sum.i.2.i
%ptrcast.i.2.i = bitcast i8* %finalptr.i.2.i to i32*
%storeval.i.2.i = extractelement <8 x i32> %blendAsInt.i237329.lcssa, i32 2
store i32 %storeval.i.2.i, i32* %ptrcast.i.2.i, align 4
br label %pl_loopend.2.i
pl_loopend.2.i: ; preds = %pl_dolane.2.i, %pl_loopend.1.i
%pl_and.3.i = and i64 %v64.i.i, 8
%pl_doit.3.i = icmp eq i64 %pl_and.3.i, 0
br i1 %pl_doit.3.i, label %pl_loopend.3.i, label %pl_dolane.3.i
pl_dolane.3.i: ; preds = %pl_loopend.2.i
%offset32.i.3.i = extractelement <8 x i32> %add_mul_yi_load130_width_load131_broadcast_xi_load132_broadcast, i32 3
%offset64.i.3.i = sext i32 %offset32.i.3.i to i64
%offset.i.3.i = shl nsw i64 %offset64.i.3.i, 2
%ptroffset.sum.i.3.i = add i64 %offset.i.3.i, 24
%finalptr.i.3.i = getelementptr i8* %output_load145_ptr2int_2void, i64 %ptroffset.sum.i.3.i
%ptrcast.i.3.i = bitcast i8* %finalptr.i.3.i to i32*
%storeval.i.3.i = extractelement <8 x i32> %blendAsInt.i237329.lcssa, i32 3
store i32 %storeval.i.3.i, i32* %ptrcast.i.3.i, align 4
br label %pl_loopend.3.i
pl_loopend.3.i: ; preds = %pl_dolane.3.i, %pl_loopend.2.i
%pl_and.4.i = and i64 %v64.i.i, 16
%pl_doit.4.i = icmp eq i64 %pl_and.4.i, 0
br i1 %pl_doit.4.i, label %pl_loopend.4.i, label %pl_dolane.4.i
pl_dolane.4.i: ; preds = %pl_loopend.3.i
%offset32.i.4.i = extractelement <8 x i32> %add_mul_yi_load130_width_load131_broadcast_xi_load132_broadcast, i32 4
%offset64.i.4.i = sext i32 %offset32.i.4.i to i64
%offset.i.4.i = shl nsw i64 %offset64.i.4.i, 2
%ptroffset.sum.i.4.i = add i64 %offset.i.4.i, 32
%finalptr.i.4.i = getelementptr i8* %output_load145_ptr2int_2void, i64 %ptroffset.sum.i.4.i
%ptrcast.i.4.i = bitcast i8* %finalptr.i.4.i to i32*
%storeval.i.4.i = extractelement <8 x i32> %blendAsInt.i237329.lcssa, i32 4
store i32 %storeval.i.4.i, i32* %ptrcast.i.4.i, align 4
br label %pl_loopend.4.i
pl_loopend.4.i: ; preds = %pl_dolane.4.i, %pl_loopend.3.i
%pl_and.5.i = and i64 %v64.i.i, 32
%pl_doit.5.i = icmp eq i64 %pl_and.5.i, 0
br i1 %pl_doit.5.i, label %pl_loopend.5.i, label %pl_dolane.5.i
pl_dolane.5.i: ; preds = %pl_loopend.4.i
%offset32.i.5.i = extractelement <8 x i32> %add_mul_yi_load130_width_load131_broadcast_xi_load132_broadcast, i32 5
%offset64.i.5.i = sext i32 %offset32.i.5.i to i64
%offset.i.5.i = shl nsw i64 %offset64.i.5.i, 2
%ptroffset.sum.i.5.i = add i64 %offset.i.5.i, 40
%finalptr.i.5.i = getelementptr i8* %output_load145_ptr2int_2void, i64 %ptroffset.sum.i.5.i
%ptrcast.i.5.i = bitcast i8* %finalptr.i.5.i to i32*
%storeval.i.5.i = extractelement <8 x i32> %blendAsInt.i237329.lcssa, i32 5
store i32 %storeval.i.5.i, i32* %ptrcast.i.5.i, align 4
br label %pl_loopend.5.i
pl_loopend.5.i: ; preds = %pl_dolane.5.i, %pl_loopend.4.i
%pl_and.6.i = and i64 %v64.i.i, 64
%pl_doit.6.i = icmp eq i64 %pl_and.6.i, 0
br i1 %pl_doit.6.i, label %pl_loopend.6.i, label %pl_dolane.6.i
pl_dolane.6.i: ; preds = %pl_loopend.5.i
%offset32.i.6.i = extractelement <8 x i32> %add_mul_yi_load130_width_load131_broadcast_xi_load132_broadcast, i32 6
%offset64.i.6.i = sext i32 %offset32.i.6.i to i64
%offset.i.6.i = shl nsw i64 %offset64.i.6.i, 2
%ptroffset.sum.i.6.i = add i64 %offset.i.6.i, 48
%finalptr.i.6.i = getelementptr i8* %output_load145_ptr2int_2void, i64 %ptroffset.sum.i.6.i
%ptrcast.i.6.i = bitcast i8* %finalptr.i.6.i to i32*
%storeval.i.6.i = extractelement <8 x i32> %blendAsInt.i237329.lcssa, i32 6
store i32 %storeval.i.6.i, i32* %ptrcast.i.6.i, align 4
br label %pl_loopend.6.i
pl_loopend.6.i: ; preds = %pl_dolane.6.i, %pl_loopend.5.i
%pl_and.7.i = and i64 %v64.i.i, 128
%pl_doit.7.i = icmp eq i64 %pl_and.7.i, 0
br i1 %pl_doit.7.i, label %safe_if_after_true137, label %pl_dolane.7.i
pl_dolane.7.i: ; preds = %pl_loopend.6.i
%offset32.i.7.i = extractelement <8 x i32> %add_mul_yi_load130_width_load131_broadcast_xi_load132_broadcast, i32 7
%offset64.i.7.i = sext i32 %offset32.i.7.i to i64
%offset.i.7.i = shl nsw i64 %offset64.i.7.i, 2
%ptroffset.sum.i.7.i = add i64 %offset.i.7.i, 56
%finalptr.i.7.i = getelementptr i8* %output_load145_ptr2int_2void, i64 %ptroffset.sum.i.7.i
%ptrcast.i.7.i = bitcast i8* %finalptr.i.7.i to i32*
%storeval.i.7.i = extractelement <8 x i32> %blendAsInt.i237329.lcssa, i32 7
store i32 %storeval.i.7.i, i32* %ptrcast.i.7.i, align 4
br label %safe_if_after_true137
}
; Masked launcher for the Mandelbrot scanline task.
;
; Packs the task arguments into a 96-byte block obtained from @ISPCAlloc,
; hands that block and a pointer to
; @mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E_ to @ISPCLaunch
; with counts (width/16, height/16, 1), and then calls @ISPCSync if the launch
; group handle is non-null afterwards.
;
; Parameters:
;   %x0, %y0, %x1, %y1  float bounds; only %x0/%y0 and the per-pixel steps
;                       (x1-x0)/width and (y1-y0)/height are forwarded.
;   %width, %height     i32 image dimensions (converted with sitofp, divided
;                       with sdiv, i.e. treated as signed).
;   %maxIterations      i32, forwarded unchanged.
;   %output             i32*, forwarded unchanged.
;   %__mask             <8 x i32> execution mask of the caller.
;
; Argument block layout written below (byte offsets from %args_ptr):
;    0 float  %x0
;    4 float  (x1 - x0) / width
;    8 float  %y0
;   12 float  (y1 - y0) / height
;   16 i32    %width
;   20 i32    %height
;   24 i32    16   ; presumably xspan (tile width) -- confirm against the ISPC source
;   28 i32    16   ; presumably yspan (tile height) -- confirm against the ISPC source
;   32 i32    %maxIterations
;   40 i32*   %output            (bytes 36..39 are never written)
;   64 <8 x i32> mask            (bytes 48..63 are never written)
define void @mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E_(float %x0, float %y0, float %x1, float %y1, i32 %width, i32 %height, i32 %maxIterations, i32* %output, <8 x i32> %__mask) {
allocas:
; The launch group handle starts out null; it is passed by address to
; @ISPCAlloc and @ISPCLaunch, which may update it.
%launch_group_handle = alloca i8*, align 8
store i8* null, i8** %launch_group_handle, align 8
; movmsk packs the sign bit of each of the 8 lanes into the low bits of an i32,
; so %cmp.i is true exactly when every lane of %__mask has its sign bit set.
%floatmask.i = bitcast <8 x i32> %__mask to <8 x float>
%v.i = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask.i)
%cmp.i = icmp eq i32 %v.i, 255
; Per-pixel step in x: (x1 - x0) / width.
%sub_x1_load_x0_load = fsub float %x1, %x0
%width_load_to_float = sitofp i32 %width to float
%div_sub_x1_load_x0_load_width_load_to_float = fdiv float %sub_x1_load_x0_load, %width_load_to_float
; Per-pixel step in y: (y1 - y0) / height.
%sub_y1_load_y0_load = fsub float %y1, %y0
%height_load_to_float = sitofp i32 %height to float
%div_sub_y1_load_y0_load_height_load_to_float = fdiv float %sub_y1_load_y0_load, %height_load_to_float
; Launch counts: signed division by 16 (truncates toward zero).
%div_width_load15_ = sdiv i32 %width, 16
%div_height_load16_yspan_load = sdiv i32 %height, 16
; Request the 96-byte argument block; the trailing 32 is presumably the
; required alignment (the mask store below uses align 32) -- confirm against
; the ISPCAlloc declaration.
%args_ptr = call i8* @ISPCAlloc(i8** %launch_group_handle, i64 96, i32 32)
; Offset 0: x0.
%funarg = bitcast i8* %args_ptr to float*
store float %x0, float* %funarg, align 4
; Offset 4: x step.
%funarg17 = getelementptr i8* %args_ptr, i64 4
%0 = bitcast i8* %funarg17 to float*
store float %div_sub_x1_load_x0_load_width_load_to_float, float* %0, align 4
; Offset 8: y0.
%funarg18 = getelementptr i8* %args_ptr, i64 8
%1 = bitcast i8* %funarg18 to float*
store float %y0, float* %1, align 4
; Offset 12: y step.
%funarg19 = getelementptr i8* %args_ptr, i64 12
%2 = bitcast i8* %funarg19 to float*
store float %div_sub_y1_load_y0_load_height_load_to_float, float* %2, align 4
; Offset 16: width.
%funarg20 = getelementptr i8* %args_ptr, i64 16
%3 = bitcast i8* %funarg20 to i32*
store i32 %width, i32* %3, align 4
; Offset 20: height.
%funarg21 = getelementptr i8* %args_ptr, i64 20
%4 = bitcast i8* %funarg21 to i32*
store i32 %height, i32* %4, align 4
; Offsets 24 and 28: the constant 16, matching the divisor used for the
; launch counts above.
%funarg22 = getelementptr i8* %args_ptr, i64 24
%5 = bitcast i8* %funarg22 to i32*
store i32 16, i32* %5, align 4
%funarg23 = getelementptr i8* %args_ptr, i64 28
%6 = bitcast i8* %funarg23 to i32*
store i32 16, i32* %6, align 4
; Offset 32: maxIterations.
%funarg24 = getelementptr i8* %args_ptr, i64 32
%7 = bitcast i8* %funarg24 to i32*
store i32 %maxIterations, i32* %7, align 4
; Offset 40: output pointer (8-byte aligned, hence the gap after offset 32).
%funarg25 = getelementptr i8* %args_ptr, i64 40
%8 = bitcast i8* %funarg25 to i32**
store i32* %output, i32** %8, align 8
; Offset 64: slot for the execution mask; the value stored depends on %cmp.i.
%funarg_mask = getelementptr i8* %args_ptr, i64 64
%9 = bitcast i8* %funarg_mask to <8 x i32>*
br i1 %cmp.i, label %all_on, label %some_on
; Every lane active: store a constant all-ones mask, launch, then sync if the
; handle is non-null.
all_on: ; preds = %allocas
store <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <8 x i32>* %9, align 32
call void @ISPCLaunch(i8** %launch_group_handle, i8* bitcast (void ({ float, float, float, float, i32, i32, i32, i32, i32, i32*, <8 x i32> }*, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)* @mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E_ to i8*), i8* %args_ptr, i32 %div_width_load15_, i32 %div_height_load16_yspan_load, i32 1)
%launch_group_handle_load = load i8** %launch_group_handle, align 8
%cmp = icmp eq i8* %launch_group_handle_load, null
br i1 %cmp, label %post_sync, label %call_sync
; Not every lane active: identical to %all_on except that the caller's
; %__mask is stored instead of the all-ones constant.
some_on: ; preds = %allocas
store <8 x i32> %__mask, <8 x i32>* %9, align 32
call void @ISPCLaunch(i8** %launch_group_handle, i8* bitcast (void ({ float, float, float, float, i32, i32, i32, i32, i32, i32*, <8 x i32> }*, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)* @mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E_ to i8*), i8* %args_ptr, i32 %div_width_load15_, i32 %div_height_load16_yspan_load, i32 1)
%launch_group_handle_load67 = load i8** %launch_group_handle, align 8
%cmp68 = icmp eq i8* %launch_group_handle_load67, null
br i1 %cmp68, label %post_sync, label %call_sync69
; Sync for the all-on path: call ISPCSync on the handle and reset it to null.
call_sync: ; preds = %all_on
call void @ISPCSync(i8* %launch_group_handle_load)
store i8* null, i8** %launch_group_handle, align 8
br label %post_sync
; Common exit for all four paths.
post_sync: ; preds = %call_sync69, %call_sync, %some_on, %all_on
ret void
; Sync for the some-on path: call ISPCSync on the handle and reset it to null.
call_sync69: ; preds = %some_on
call void @ISPCSync(i8* %launch_group_handle_load67)
store i8* null, i8** %launch_group_handle, align 8
br label %post_sync
}
; Unmasked launcher for the Mandelbrot scanline task (no %__mask parameter;
; presumably the entry point exported to the host application -- confirm
; against the ISPC source).
;
; Packs the task arguments into a 96-byte block obtained from @ISPCAlloc,
; always stores an all-ones execution mask, launches
; @mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E_ through
; @ISPCLaunch with counts (width/16, height/16, 1), and calls @ISPCSync if the
; launch group handle is non-null afterwards.
;
; Argument block layout written below (byte offsets from %args_ptr):
;    0 float %x0                 4 float (x1 - x0) / width
;    8 float %y0                12 float (y1 - y0) / height
;   16 i32   %width             20 i32   %height
;   24 i32   16                 28 i32   16
;   32 i32   %maxIterations     40 i32*  %output
;   64 <8 x i32> all-ones mask
; Bytes 36..39 and 48..63 are never written.
define void @mandelbrot_ispc(float %x0, float %y0, float %x1, float %y1, i32 %width, i32 %height, i32 %maxIterations, i32* %output) {
allocas:
; The launch group handle starts out null; it is passed by address to
; @ISPCAlloc and @ISPCLaunch, which may update it.
%launch_group_handle = alloca i8*, align 8
store i8* null, i8** %launch_group_handle, align 8
; Per-pixel step in x: (x1 - x0) / width.
%sub_x1_load_x0_load = fsub float %x1, %x0
%width_load_to_float = sitofp i32 %width to float
%div_sub_x1_load_x0_load_width_load_to_float = fdiv float %sub_x1_load_x0_load, %width_load_to_float
; Per-pixel step in y: (y1 - y0) / height.
%sub_y1_load_y0_load = fsub float %y1, %y0
%height_load_to_float = sitofp i32 %height to float
%div_sub_y1_load_y0_load_height_load_to_float = fdiv float %sub_y1_load_y0_load, %height_load_to_float
; Launch counts: signed division by 16 (truncates toward zero).
%div_width_load15_ = sdiv i32 %width, 16
%div_height_load16_yspan_load = sdiv i32 %height, 16
; Request the 96-byte argument block; the trailing 32 is presumably the
; required alignment (the mask store below uses align 32) -- confirm against
; the ISPCAlloc declaration.
%args_ptr = call i8* @ISPCAlloc(i8** %launch_group_handle, i64 96, i32 32)
; Offset 0: x0.
%funarg = bitcast i8* %args_ptr to float*
store float %x0, float* %funarg, align 4
; Offset 4: x step.
%funarg17 = getelementptr i8* %args_ptr, i64 4
%0 = bitcast i8* %funarg17 to float*
store float %div_sub_x1_load_x0_load_width_load_to_float, float* %0, align 4
; Offset 8: y0.
%funarg18 = getelementptr i8* %args_ptr, i64 8
%1 = bitcast i8* %funarg18 to float*
store float %y0, float* %1, align 4
; Offset 12: y step.
%funarg19 = getelementptr i8* %args_ptr, i64 12
%2 = bitcast i8* %funarg19 to float*
store float %div_sub_y1_load_y0_load_height_load_to_float, float* %2, align 4
; Offset 16: width.
%funarg20 = getelementptr i8* %args_ptr, i64 16
%3 = bitcast i8* %funarg20 to i32*
store i32 %width, i32* %3, align 4
; Offset 20: height.
%funarg21 = getelementptr i8* %args_ptr, i64 20
%4 = bitcast i8* %funarg21 to i32*
store i32 %height, i32* %4, align 4
; Offsets 24 and 28: the constant 16, matching the divisor used for the
; launch counts above (presumably the tile spans -- confirm against the ISPC
; source).
%funarg22 = getelementptr i8* %args_ptr, i64 24
%5 = bitcast i8* %funarg22 to i32*
store i32 16, i32* %5, align 4
%funarg23 = getelementptr i8* %args_ptr, i64 28
%6 = bitcast i8* %funarg23 to i32*
store i32 16, i32* %6, align 4
; Offset 32: maxIterations.
%funarg24 = getelementptr i8* %args_ptr, i64 32
%7 = bitcast i8* %funarg24 to i32*
store i32 %maxIterations, i32* %7, align 4
; Offset 40: output pointer (8-byte aligned, hence the gap after offset 32).
%funarg25 = getelementptr i8* %args_ptr, i64 40
%8 = bitcast i8* %funarg25 to i32**
store i32* %output, i32** %8, align 8
; Offset 64: execution mask, unconditionally all lanes on.
%funarg_mask = getelementptr i8* %args_ptr, i64 64
%9 = bitcast i8* %funarg_mask to <8 x i32>*
store <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <8 x i32>* %9, align 32
; Launch the task, then sync only if the handle is non-null afterwards.
call void @ISPCLaunch(i8** %launch_group_handle, i8* bitcast (void ({ float, float, float, float, i32, i32, i32, i32, i32, i32*, <8 x i32> }*, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)* @mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E_ to i8*), i8* %args_ptr, i32 %div_width_load15_, i32 %div_height_load16_yspan_load, i32 1)
%launch_group_handle_load = load i8** %launch_group_handle, align 8
%cmp = icmp eq i8* %launch_group_handle_load, null
br i1 %cmp, label %post_sync, label %call_sync
; Call ISPCSync on the handle and reset it to null.
call_sync: ; preds = %allocas
call void @ISPCSync(i8* %launch_group_handle_load)
store i8* null, i8** %launch_group_handle, align 8
br label %post_sync
post_sync: ; preds = %call_sync, %allocas
ret void
}

View File

@@ -1,320 +0,0 @@
//
// Generated by LLVM NVPTX Back-End
//
.version 3.1
.target sm_35, texmode_independent
.address_size 64
// .globl __aos_to_soa4_float1
// @__aos_to_soa4_float1
// __aos_to_soa4_float1
//   Writes four 32-bit float values to four destination addresses.
//   param_0..param_3 : the values (4-byte, 4-aligned parameters read as f32)
//   param_4..param_7 : 64-bit destination addresses; param_(4+i) receives
//                      param_i
//   The stores carry no state-space qualifier, so the addresses are treated
//   as generic addresses. Stores are issued in lane order 0,1,2,3.
.func __aos_to_soa4_float1(
	.param .align 4 .b8 __aos_to_soa4_float1_param_0[4],
	.param .align 4 .b8 __aos_to_soa4_float1_param_1[4],
	.param .align 4 .b8 __aos_to_soa4_float1_param_2[4],
	.param .align 4 .b8 __aos_to_soa4_float1_param_3[4],
	.param .b64 __aos_to_soa4_float1_param_4,
	.param .b64 __aos_to_soa4_float1_param_5,
	.param .b64 __aos_to_soa4_float1_param_6,
	.param .b64 __aos_to_soa4_float1_param_7
)
{
	// Only the registers this function actually uses are declared.
	.reg .f32 %val<4>;	// %val0..%val3: values to store
	.reg .s64 %dst<4>;	// %dst0..%dst3: destination addresses

	// Fetch each value together with its destination.
	ld.param.f32 %val0, [__aos_to_soa4_float1_param_0];
	ld.param.u64 %dst0, [__aos_to_soa4_float1_param_4];
	ld.param.f32 %val1, [__aos_to_soa4_float1_param_1];
	ld.param.u64 %dst1, [__aos_to_soa4_float1_param_5];
	ld.param.f32 %val2, [__aos_to_soa4_float1_param_2];
	ld.param.u64 %dst2, [__aos_to_soa4_float1_param_6];
	ld.param.f32 %val3, [__aos_to_soa4_float1_param_3];
	ld.param.u64 %dst3, [__aos_to_soa4_float1_param_7];

	// Store order is kept as 0,1,2,3 in case destinations overlap.
	st.f32 [%dst0], %val0;
	st.f32 [%dst1], %val1;
	st.f32 [%dst2], %val2;
	st.f32 [%dst3], %val3;
	ret;
}
// .globl __soa_to_aos4_float1
// __soa_to_aos4_float1
//   Writes four 32-bit float values to four destination addresses.
//   param_0..param_3 : the values (4-byte, 4-aligned parameters read as f32)
//   param_4..param_7 : 64-bit destination addresses; param_(4+i) receives
//                      param_i
//   The stores carry no state-space qualifier, so the addresses are treated
//   as generic addresses. Stores are issued in lane order 0,1,2,3.
.func __soa_to_aos4_float1(
	.param .align 4 .b8 __soa_to_aos4_float1_param_0[4],
	.param .align 4 .b8 __soa_to_aos4_float1_param_1[4],
	.param .align 4 .b8 __soa_to_aos4_float1_param_2[4],
	.param .align 4 .b8 __soa_to_aos4_float1_param_3[4],
	.param .b64 __soa_to_aos4_float1_param_4,
	.param .b64 __soa_to_aos4_float1_param_5,
	.param .b64 __soa_to_aos4_float1_param_6,
	.param .b64 __soa_to_aos4_float1_param_7
) // @__soa_to_aos4_float1
{
	// Only the registers this function actually uses are declared.
	.reg .f32 %val<4>;	// %val0..%val3: values to store
	.reg .s64 %dst<4>;	// %dst0..%dst3: destination addresses

	// Values first ...
	ld.param.f32 %val0, [__soa_to_aos4_float1_param_0];
	ld.param.f32 %val1, [__soa_to_aos4_float1_param_1];
	ld.param.f32 %val2, [__soa_to_aos4_float1_param_2];
	ld.param.f32 %val3, [__soa_to_aos4_float1_param_3];
	// ... then the destination addresses.
	ld.param.u64 %dst0, [__soa_to_aos4_float1_param_4];
	ld.param.u64 %dst1, [__soa_to_aos4_float1_param_5];
	ld.param.u64 %dst2, [__soa_to_aos4_float1_param_6];
	ld.param.u64 %dst3, [__soa_to_aos4_float1_param_7];

	// Store order is kept as 0,1,2,3 in case destinations overlap.
	st.f32 [%dst0], %val0;
	st.f32 [%dst1], %val1;
	st.f32 [%dst2], %val2;
	st.f32 [%dst3], %val3;
	ret;
}
// .globl __aos_to_soa3_float1
// __aos_to_soa3_float1
//   Writes three 32-bit float values to three destination addresses.
//   param_0..param_2 : the values (4-byte, 4-aligned parameters read as f32)
//   param_3..param_5 : 64-bit destination addresses; param_(3+i) receives
//                      param_i
//   The stores carry no state-space qualifier, so the addresses are treated
//   as generic addresses. Stores are issued in lane order 0,1,2.
.func __aos_to_soa3_float1(
	.param .align 4 .b8 __aos_to_soa3_float1_param_0[4],
	.param .align 4 .b8 __aos_to_soa3_float1_param_1[4],
	.param .align 4 .b8 __aos_to_soa3_float1_param_2[4],
	.param .b64 __aos_to_soa3_float1_param_3,
	.param .b64 __aos_to_soa3_float1_param_4,
	.param .b64 __aos_to_soa3_float1_param_5
) // @__aos_to_soa3_float1
{
	// Only the registers this function actually uses are declared.
	.reg .f32 %val<3>;	// %val0..%val2: values to store
	.reg .s64 %dst<3>;	// %dst0..%dst2: destination addresses

	// Fetch each value together with its destination.
	ld.param.f32 %val0, [__aos_to_soa3_float1_param_0];
	ld.param.u64 %dst0, [__aos_to_soa3_float1_param_3];
	ld.param.f32 %val1, [__aos_to_soa3_float1_param_1];
	ld.param.u64 %dst1, [__aos_to_soa3_float1_param_4];
	ld.param.f32 %val2, [__aos_to_soa3_float1_param_2];
	ld.param.u64 %dst2, [__aos_to_soa3_float1_param_5];

	// Store order is kept as 0,1,2 in case destinations overlap.
	st.f32 [%dst0], %val0;
	st.f32 [%dst1], %val1;
	st.f32 [%dst2], %val2;
	ret;
}
// .globl __soa_to_aos3_float1
// __soa_to_aos3_float1
//   Writes three 32-bit float values to three destination addresses.
//   param_0..param_2 : the values (4-byte, 4-aligned parameters read as f32)
//   param_3..param_5 : 64-bit destination addresses; param_(3+i) receives
//                      param_i
//   The stores carry no state-space qualifier, so the addresses are treated
//   as generic addresses. Stores are issued in lane order 0,1,2.
.func __soa_to_aos3_float1(
	.param .align 4 .b8 __soa_to_aos3_float1_param_0[4],
	.param .align 4 .b8 __soa_to_aos3_float1_param_1[4],
	.param .align 4 .b8 __soa_to_aos3_float1_param_2[4],
	.param .b64 __soa_to_aos3_float1_param_3,
	.param .b64 __soa_to_aos3_float1_param_4,
	.param .b64 __soa_to_aos3_float1_param_5
) // @__soa_to_aos3_float1
{
	// Only the registers this function actually uses are declared.
	.reg .f32 %val<3>;	// %val0..%val2: values to store
	.reg .s64 %dst<3>;	// %dst0..%dst2: destination addresses

	// Values first ...
	ld.param.f32 %val0, [__soa_to_aos3_float1_param_0];
	ld.param.f32 %val1, [__soa_to_aos3_float1_param_1];
	ld.param.f32 %val2, [__soa_to_aos3_float1_param_2];
	// ... then the destination addresses.
	ld.param.u64 %dst0, [__soa_to_aos3_float1_param_3];
	ld.param.u64 %dst1, [__soa_to_aos3_float1_param_4];
	ld.param.u64 %dst2, [__soa_to_aos3_float1_param_5];

	// Store order is kept as 0,1,2 in case destinations overlap.
	st.f32 [%dst0], %val0;
	st.f32 [%dst1], %val1;
	st.f32 [%dst2], %val2;
	ret;
}
// .globl mandelbrot_scanline
.entry mandelbrot_scanline(
.param .f32 mandelbrot_scanline_param_0,
.param .f32 mandelbrot_scanline_param_1,
.param .f32 mandelbrot_scanline_param_2,
.param .f32 mandelbrot_scanline_param_3,
.param .u32 mandelbrot_scanline_param_4,
.param .u32 mandelbrot_scanline_param_5,
.param .u32 mandelbrot_scanline_param_6,
.param .u32 mandelbrot_scanline_param_7,
.param .u32 mandelbrot_scanline_param_8,
.param .u64 .ptr .align 4 mandelbrot_scanline_param_9
) // @mandelbrot_scanline
{
.reg .pred %p<396>;
.reg .s16 %rc<396>;
.reg .s16 %rs<396>;
.reg .s32 %r<396>;
.reg .s64 %rl<396>;
.reg .f32 %f<396>;
.reg .f64 %fl<396>;
// BB#0: // %allocas
ld.param.u32 %r6, [mandelbrot_scanline_param_5];
ld.param.u32 %r5, [mandelbrot_scanline_param_7];
mov.u32 %r7, %ctaid.y;
mul.lo.s32 %r0, %r7, %r5;
mad.lo.s32 %r1, %r7, %r5, %r5;
setp.lt.s32 %p0, %r1, %r6;
selp.b32 %r1, %r1, %r6, %p0;
setp.ge.s32 %p0, %r0, %r1;
@%p0 bra BB4_13;
// BB#1: // %for_test28.preheader.lr.ph
ld.param.f32 %f0, [mandelbrot_scanline_param_0];
ld.param.f32 %f1, [mandelbrot_scanline_param_1];
ld.param.f32 %f2, [mandelbrot_scanline_param_2];
ld.param.f32 %f3, [mandelbrot_scanline_param_3];
ld.param.u32 %r1, [mandelbrot_scanline_param_4];
ld.param.u32 %r4, [mandelbrot_scanline_param_6];
ld.param.u32 %r2, [mandelbrot_scanline_param_8];
ld.param.u64 %rl0, [mandelbrot_scanline_param_9];
mov.u32 %r8, %ctaid.x;
mul.lo.s32 %r3, %r8, %r4;
mad.lo.s32 %r4, %r8, %r4, %r4;
setp.lt.s32 %p0, %r4, %r1;
selp.b32 %r4, %r4, %r1, %p0;
setp.gt.s32 %p0, %r2, 0;
not.b32 %r6, %r6;
add.s32 %r7, %r7, 1;
mul.lo.s32 %r5, %r7, %r5;
not.b32 %r5, %r5;
setp.gt.s32 %p1, %r6, %r5;
selp.b32 %r5, %r6, %r5, %p1;
not.b32 %r5, %r5;
BB4_2: // %for_test28.preheader
// =>This Loop Header: Depth=1
// Child Loop BB4_15 Depth 2
// Child Loop BB4_8 Depth 2
// Child Loop BB4_11 Depth 3
setp.ge.s32 %p1, %r3, %r4;
@%p1 bra BB4_12;
// BB#3: // %for_loop30.lr.ph
// in Loop: Header=BB4_2 Depth=1
mul.lo.s32 %r6, %r0, %r1;
mov.u32 %r7, %r3;
@%p0 bra BB4_4;
bra.uni BB4_15;
BB4_4: // in Loop: Header=BB4_2 Depth=1
cvt.rn.f32.s32 %f4, %r0;
fma.rn.f32 %f4, %f4, %f3, %f2;
mov.u32 %r7, %r3;
BB4_8: // %for_loop.i.lr.ph.us
// Parent Loop BB4_2 Depth=1
// => This Loop Header: Depth=2
// Child Loop BB4_11 Depth 3
mov.u32 %r9, %tid.x;
mov.u32 %r8, WARP_SZ;
add.s32 %r10, %r8, -1;
and.b32 %r10, %r10, %r9;
add.s32 %r11, %r10, %r7;
cvt.rn.f32.s32 %f5, %r11;
fma.rn.f32 %f5, %f5, %f1, %f0;
mov.u32 %r10, 0;
mov.pred %p1, 0;
mov.pred %p3, -1;
mov.pred %p4, %p0;
mov.pred %p2, %p1;
mov.f32 %f7, %f5;
mov.f32 %f6, %f4;
BB4_11: // %for_loop.i.us
// Parent Loop BB4_2 Depth=1
// Parent Loop BB4_8 Depth=2
// => This Inner Loop Header: Depth=3
and.pred %p4, %p3, %p4;
mul.f32 %f8, %f7, %f7;
fma.rn.f32 %f9, %f6, %f6, %f8;
setp.gtu.f32 %p3, %f9, 0f40800000;
and.pred %p3, %p4, %p3;
or.pred %p2, %p3, %p2;
xor.pred %p5, %p2, %p4;
mov.pred %p3, %p1;
@!%p5 bra BB4_10;
bra.uni BB4_9;
BB4_9: // %not_all_continued_or_breaked.i.us
// in Loop: Header=BB4_11 Depth=3
mul.f32 %f9, %f6, %f6;
not.pred %p3, %p2;
and.pred %p3, %p4, %p3;
sub.f32 %f8, %f8, %f9;
add.f32 %f8, %f5, %f8;
add.f32 %f7, %f7, %f7;
fma.rn.f32 %f6, %f6, %f7, %f4;
mov.f32 %f7, %f8;
BB4_10: // %for_step.i.us
// in Loop: Header=BB4_11 Depth=3
add.s32 %r12, %r10, 1;
selp.b32 %r10, %r12, %r10, %p3;
setp.lt.s32 %p4, %r10, %r2;
and.pred %p5, %p3, %p4;
@%p5 bra BB4_11;
// BB#5: // %mandel___vyfvyfvyi.exit.us
// in Loop: Header=BB4_8 Depth=2
setp.ge.s32 %p1, %r11, %r4;
@%p1 bra BB4_7;
// BB#6: // %if_then.us
// in Loop: Header=BB4_8 Depth=2
add.s32 %r11, %r8, 1073741823;
and.b32 %r9, %r11, %r9;
add.s32 %r11, %r7, %r6;
add.s32 %r9, %r11, %r9;
shl.b32 %r9, %r9, 2;
cvt.s64.s32 %rl1, %r9;
add.s64 %rl1, %rl1, %rl0;
st.u32 [%rl1], %r10;
BB4_7: // %if_exit.us
// in Loop: Header=BB4_8 Depth=2
add.s32 %r7, %r8, %r7;
setp.lt.s32 %p1, %r7, %r4;
@%p1 bra BB4_8;
bra.uni BB4_12;
BB4_15: // %mandel___vyfvyfvyi.exit
// Parent Loop BB4_2 Depth=1
// => This Inner Loop Header: Depth=2
mov.u32 %r9, %tid.x;
mov.u32 %r8, WARP_SZ;
add.s32 %r10, %r8, -1;
and.b32 %r10, %r10, %r9;
add.s32 %r10, %r10, %r7;
setp.lt.s32 %p1, %r10, %r4;
@%p1 bra BB4_16;
bra.uni BB4_14;
BB4_16: // %if_then
// in Loop: Header=BB4_15 Depth=2
add.s32 %r10, %r8, 1073741823;
and.b32 %r9, %r10, %r9;
add.s32 %r10, %r7, %r6;
add.s32 %r9, %r10, %r9;
shl.b32 %r9, %r9, 2;
cvt.s64.s32 %rl1, %r9;
add.s64 %rl1, %rl1, %rl0;
mov.u32 %r9, 0;
st.u32 [%rl1], %r9;
BB4_14: // %if_exit
// in Loop: Header=BB4_15 Depth=2
add.s32 %r7, %r8, %r7;
setp.lt.s32 %p1, %r7, %r4;
@%p1 bra BB4_15;
BB4_12: // %for_exit31
// in Loop: Header=BB4_2 Depth=1
add.s32 %r0, %r0, 1;
setp.eq.s32 %p1, %r0, %r5;
@%p1 bra BB4_13;
bra.uni BB4_2;
BB4_13: // %for_exit
ret;
}

View File

@@ -1,534 +0,0 @@
//
// Generated by LLVM NVPTX Back-End
//
.version 3.1
.target sm_20, texmode_independent
.address_size 64
// .globl __vselect_i8
// Forward declarations of three helpers whose bodies are not part of this
// module. Each takes no parameters and returns a single 32-bit value.
//
// Used by the mandelbrot_scanline___... function in this module as the block
// coordinate that is multiplied by the per-block x span.
.func (.param .b32 func_retval0) getBlockIndex0___UM_
(
)
;
// Used the same way for the y direction (multiplied by the per-block y span).
.func (.param .b32 func_retval0) getBlockIndex1___UM_
(
)
;
// Per-lane offset that callers add to the current x position.
// NOTE(review): the calling code steps x by a constant 32, so this presumably
// returns a value in 0..31 - confirm against the definition.
.func (.param .b32 func_retval0) getLaneIndex___UM_
(
)
;
// __vselect_i8 -- single-lane 8-bit select.
//   param_0, param_1 : the two 8-bit candidates
//   param_2          : 32-bit mask word
// Returns param_0 when the mask word is zero and param_1 otherwise.
.func (.param .align 1 .b8 func_retval0[1]) __vselect_i8(
    .param .align 1 .b8 __vselect_i8_param_0[1],
    .param .align 1 .b8 __vselect_i8_param_1[1],
    .param .align 4 .b8 __vselect_i8_param_2[4]
)
{
    .reg .pred %p<396>;
    .reg .s16 %rc<396>;
    .reg .s32 %r<396>;

    // Fetch both candidates, then the mask.
    ld.param.u8     %rc1, [__vselect_i8_param_0];
    ld.param.u8     %rc2, [__vselect_i8_param_1];
    ld.param.u32    %r1, [__vselect_i8_param_2];

    // %p1 is set for a non-zero mask, which picks param_1.
    setp.ne.s32     %p1, %r1, 0;
    selp.b16        %rc3, %rc2, %rc1, %p1;

    st.param.b8     [func_retval0+0], %rc3;
    ret;
}
// __vselect_i16 -- single-lane 16-bit select.
//   param_0, param_1 : the two 16-bit candidates
//   param_2          : 32-bit mask word
// Returns param_0 when the mask word is zero and param_1 otherwise.
.func (.param .align 2 .b8 func_retval0[2]) __vselect_i16(
    .param .align 2 .b8 __vselect_i16_param_0[2],
    .param .align 2 .b8 __vselect_i16_param_1[2],
    .param .align 4 .b8 __vselect_i16_param_2[4]
)
{
    .reg .pred %p<396>;
    .reg .s16 %rs<396>;
    .reg .s32 %r<396>;

    // Fetch both candidates, then the mask.
    ld.param.u16    %rs1, [__vselect_i16_param_0];
    ld.param.u16    %rs2, [__vselect_i16_param_1];
    ld.param.u32    %r1, [__vselect_i16_param_2];

    // %p1 is set for a non-zero mask, which picks param_1.
    setp.ne.s32     %p1, %r1, 0;
    selp.b16        %rs3, %rs2, %rs1, %p1;

    st.param.b16    [func_retval0+0], %rs3;
    ret;
}
// __vselect_i64 -- single-lane 64-bit select.
//   param_0, param_1 : the two 64-bit candidates
//   param_2          : 32-bit mask word
// Returns param_0 when the mask word is zero and param_1 otherwise.
.func (.param .align 8 .b8 func_retval0[8]) __vselect_i64(
    .param .align 8 .b8 __vselect_i64_param_0[8],
    .param .align 8 .b8 __vselect_i64_param_1[8],
    .param .align 4 .b8 __vselect_i64_param_2[4]
)
{
    .reg .pred %p<396>;
    .reg .s32 %r<396>;
    .reg .s64 %rl<396>;

    // Fetch both candidates, then the mask.
    ld.param.u64    %rl1, [__vselect_i64_param_0];
    ld.param.u64    %rl2, [__vselect_i64_param_1];
    ld.param.u32    %r1, [__vselect_i64_param_2];

    // %p1 is set for a non-zero mask, which picks param_1.
    setp.ne.s32     %p1, %r1, 0;
    selp.b64        %rl3, %rl2, %rl1, %p1;

    st.param.b64    [func_retval0+0], %rl3;
    ret;
}
// __aos_to_soa4_float1 -- write four floats through four pointers.
//   param_0..param_3 : the float values
//   param_4..param_7 : destination addresses; value k is stored to pointer k+4
// No return value. Stores are issued in order 0, 1, 2, 3.
.func __aos_to_soa4_float1(
    .param .align 4 .b8 __aos_to_soa4_float1_param_0[4],
    .param .align 4 .b8 __aos_to_soa4_float1_param_1[4],
    .param .align 4 .b8 __aos_to_soa4_float1_param_2[4],
    .param .align 4 .b8 __aos_to_soa4_float1_param_3[4],
    .param .b64 __aos_to_soa4_float1_param_4,
    .param .b64 __aos_to_soa4_float1_param_5,
    .param .b64 __aos_to_soa4_float1_param_6,
    .param .b64 __aos_to_soa4_float1_param_7
)
{
    .reg .s64 %rl<396>;
    .reg .f32 %f<396>;

    // Values first ...
    ld.param.f32    %f10, [__aos_to_soa4_float1_param_0];
    ld.param.f32    %f11, [__aos_to_soa4_float1_param_1];
    ld.param.f32    %f12, [__aos_to_soa4_float1_param_2];
    ld.param.f32    %f13, [__aos_to_soa4_float1_param_3];
    // ... then the destinations.
    ld.param.u64    %rl10, [__aos_to_soa4_float1_param_4];
    ld.param.u64    %rl11, [__aos_to_soa4_float1_param_5];
    ld.param.u64    %rl12, [__aos_to_soa4_float1_param_6];
    ld.param.u64    %rl13, [__aos_to_soa4_float1_param_7];

    st.f32          [%rl10], %f10;
    st.f32          [%rl11], %f11;
    st.f32          [%rl12], %f12;
    st.f32          [%rl13], %f13;
    ret;
}
// __soa_to_aos4_float1 -- write four floats through four pointers.
//   param_0..param_3 : the float values
//   param_4..param_7 : destination addresses; value k is stored to pointer k+4
// No return value. Stores are issued in order 0, 1, 2, 3.
.func __soa_to_aos4_float1(
    .param .align 4 .b8 __soa_to_aos4_float1_param_0[4],
    .param .align 4 .b8 __soa_to_aos4_float1_param_1[4],
    .param .align 4 .b8 __soa_to_aos4_float1_param_2[4],
    .param .align 4 .b8 __soa_to_aos4_float1_param_3[4],
    .param .b64 __soa_to_aos4_float1_param_4,
    .param .b64 __soa_to_aos4_float1_param_5,
    .param .b64 __soa_to_aos4_float1_param_6,
    .param .b64 __soa_to_aos4_float1_param_7
)
{
    .reg .s64 %rl<396>;
    .reg .f32 %f<396>;

    // Values first ...
    ld.param.f32    %f10, [__soa_to_aos4_float1_param_0];
    ld.param.f32    %f11, [__soa_to_aos4_float1_param_1];
    ld.param.f32    %f12, [__soa_to_aos4_float1_param_2];
    ld.param.f32    %f13, [__soa_to_aos4_float1_param_3];
    // ... then the destinations.
    ld.param.u64    %rl10, [__soa_to_aos4_float1_param_4];
    ld.param.u64    %rl11, [__soa_to_aos4_float1_param_5];
    ld.param.u64    %rl12, [__soa_to_aos4_float1_param_6];
    ld.param.u64    %rl13, [__soa_to_aos4_float1_param_7];

    st.f32          [%rl10], %f10;
    st.f32          [%rl11], %f11;
    st.f32          [%rl12], %f12;
    st.f32          [%rl13], %f13;
    ret;
}
// __aos_to_soa3_float1 -- write three floats through three pointers.
//   param_0..param_2 : the float values
//   param_3..param_5 : destination addresses; value k is stored to pointer k+3
// No return value. Stores are issued in order 0, 1, 2.
.func __aos_to_soa3_float1(
    .param .align 4 .b8 __aos_to_soa3_float1_param_0[4],
    .param .align 4 .b8 __aos_to_soa3_float1_param_1[4],
    .param .align 4 .b8 __aos_to_soa3_float1_param_2[4],
    .param .b64 __aos_to_soa3_float1_param_3,
    .param .b64 __aos_to_soa3_float1_param_4,
    .param .b64 __aos_to_soa3_float1_param_5
)
{
    .reg .s64 %rl<396>;
    .reg .f32 %f<396>;

    // Values first ...
    ld.param.f32    %f10, [__aos_to_soa3_float1_param_0];
    ld.param.f32    %f11, [__aos_to_soa3_float1_param_1];
    ld.param.f32    %f12, [__aos_to_soa3_float1_param_2];
    // ... then the destinations.
    ld.param.u64    %rl10, [__aos_to_soa3_float1_param_3];
    ld.param.u64    %rl11, [__aos_to_soa3_float1_param_4];
    ld.param.u64    %rl12, [__aos_to_soa3_float1_param_5];

    st.f32          [%rl10], %f10;
    st.f32          [%rl11], %f11;
    st.f32          [%rl12], %f12;
    ret;
}
// __soa_to_aos3_float1 -- write three floats through three pointers.
//   param_0..param_2 : the float values
//   param_3..param_5 : destination addresses; value k is stored to pointer k+3
// No return value. Stores are issued in order 0, 1, 2.
.func __soa_to_aos3_float1(
    .param .align 4 .b8 __soa_to_aos3_float1_param_0[4],
    .param .align 4 .b8 __soa_to_aos3_float1_param_1[4],
    .param .align 4 .b8 __soa_to_aos3_float1_param_2[4],
    .param .b64 __soa_to_aos3_float1_param_3,
    .param .b64 __soa_to_aos3_float1_param_4,
    .param .b64 __soa_to_aos3_float1_param_5
)
{
    .reg .s64 %rl<396>;
    .reg .f32 %f<396>;

    // Values first ...
    ld.param.f32    %f10, [__soa_to_aos3_float1_param_0];
    ld.param.f32    %f11, [__soa_to_aos3_float1_param_1];
    ld.param.f32    %f12, [__soa_to_aos3_float1_param_2];
    // ... then the destinations.
    ld.param.u64    %rl10, [__soa_to_aos3_float1_param_3];
    ld.param.u64    %rl11, [__soa_to_aos3_float1_param_4];
    ld.param.u64    %rl12, [__soa_to_aos3_float1_param_5];

    st.f32          [%rl10], %f10;
    st.f32          [%rl11], %f11;
    st.f32          [%rl12], %f12;
    ret;
}
// __rsqrt_varying_double -- approximate reciprocal square root of one double.
//   param_0 : f64 input
// Returns rsqrt.approx.f64(param_0).
.func (.param .align 8 .b8 func_retval0[8]) __rsqrt_varying_double(
    .param .align 8 .b8 __rsqrt_varying_double_param_0[8]
)
{
    .reg .f64 %fl<396>;

    ld.param.f64        %fl1, [__rsqrt_varying_double_param_0];
    rsqrt.approx.f64    %fl2, %fl1;
    st.param.f64        [func_retval0+0], %fl2;
    ret;
}
// mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E_
//
// Computes Mandelbrot iteration counts for one rectangular tile and stores
// them as 32-bit integers into an output buffer.
//
// Only param_0 is read: it is the address of an argument block laid out as
//     +0   f32  x origin              (%f0)
//     +4   f32  x step per column     (%f1)
//     +8   f32  y origin              (%f2)
//     +12  f32  y step per row        (%f3)
//     +16  s32  row pitch / x limit   (%r0)   -- "width"
//     +20  s32  y limit               (%r6)   -- "height"
//     +24  s32  columns per block     (%r7)
//     +28  s32  rows per block        (%r8)
//     +32  s32  iteration limit       (%r1)
//     +40  u64  output base address   (%rl0)
// param_1 .. param_10 are declared but never loaded by this body.
//
// The tile is [bx*xspan, min(bx*xspan+xspan, width)) by
// [by*yspan, min(by*yspan+yspan, height)), where bx/by come from
// getBlockIndex0/1. Columns are walked in steps of 32, and each call handles
// column x + getLaneIndex().
//
// Change from the generated code: the byte offset of every store is now
// formed in 64 bits (sign-extend the element index, then shift). The old
// sequence shifted in 32 bits first and wrapped once the index reached 2^29.
.func mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E_(
    .param .b64 mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_0,
    .param .b32 mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_1,
    .param .b32 mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_2,
    .param .b32 mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_3,
    .param .b32 mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_4,
    .param .b32 mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_5,
    .param .b32 mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_6,
    .param .b32 mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_7,
    .param .b32 mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_8,
    .param .b32 mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_9,
    .param .b32 mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_10
) // @mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E_
{
    .reg .pred %p<396>;
    .reg .s16 %rc<396>;
    .reg .s16 %rs<396>;
    .reg .s32 %r<396>;
    .reg .s64 %rl<396>;
    .reg .f32 %f<396>;
    .reg .f64 %fl<396>;
// BB#0: // %allocas
    // Unpack the argument block (layout in the header comment).
    ld.param.u64 %rl0, [mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_0];
    ld.f32 %f0, [%rl0];
    ld.f32 %f1, [%rl0+4];
    ld.f32 %f2, [%rl0+8];
    ld.f32 %f3, [%rl0+12];
    ld.u32 %r0, [%rl0+16];
    ld.u32 %r6, [%rl0+20];
    ld.u32 %r7, [%rl0+24];
    ld.u32 %r8, [%rl0+28];
    ld.u32 %r1, [%rl0+32];
    ld.u64 %rl0, [%rl0+40];            // %rl0 now holds the output base
    // %r9 = block index along x
    // Callseq Start 0
    {
    .reg .b32 temp_param_reg;
    // <end>}
    .param .b32 retval0;
    call.uni (retval0),
    getBlockIndex0___UM_,
    (
    );
    ld.param.b32 %r9, [retval0+0];
    //{
    }// Callseq End 0
    // %r10 = block index along y
    // Callseq Start 1
    {
    .reg .b32 temp_param_reg;
    // <end>}
    .param .b32 retval0;
    call.uni (retval0),
    getBlockIndex1___UM_,
    (
    );
    ld.param.b32 %r10, [retval0+0];
    //{
    }// Callseq End 1
    mul.lo.s32 %r2, %r10, %r8;         // %r2 = first row of the tile
    mad.lo.s32 %r3, %r10, %r8, %r8;
    setp.lt.s32 %p0, %r3, %r6;
    selp.b32 %r3, %r3, %r6, %p0;       // %r3 = min(first row + rows per block, height)
    setp.ge.s32 %p0, %r2, %r3;
    @%p0 bra BB8_14;                   // tile lies entirely below the image
// BB#1: // %for_test34.preheader.lr.ph
    mul.lo.s32 %r3, %r9, %r7;          // %r3 = first column of the tile
    mad.lo.s32 %r4, %r9, %r7, %r7;
    setp.lt.s32 %p0, %r4, %r0;
    selp.b32 %r4, %r4, %r0, %p0;       // %r4 = min(first column + columns per block, width)
    setp.gt.s32 %p0, %r1, 0;
    selp.b32 %r5, -1, 0, %p0;          // %r5 = initial lane mask: all ones iff iteration limit > 0
    // %r6 = ~max(~height, ~((by+1)*rows)) = min(height, (by+1)*rows): last row + 1
    not.b32 %r6, %r6;
    add.s32 %r11, %r10, 1;
    mul.lo.s32 %r11, %r8, %r11;
    not.b32 %r11, %r11;
    setp.gt.s32 %p0, %r6, %r11;
    selp.b32 %r6, %r6, %r11, %p0;
    not.b32 %r6, %r6;
    // %r7 = element index of the tile's first pixel (first row * width + first column)
    mul.lo.s32 %r8, %r10, %r8;
    mul.lo.s32 %r8, %r8, %r0;
    mad.lo.s32 %r7, %r9, %r7, %r8;
BB8_2: // %for_test34.preheader
// =>This Loop Header: Depth=1
// Child Loop BB8_16 Depth 2
// Child Loop BB8_8 Depth 2
// Child Loop BB8_9 Depth 3
    // Row loop: %r2 = current row, %r7 = element index of its first tile pixel.
    setp.ge.s32 %p0, %r3, %r4;
    @%p0 bra BB8_13;                   // empty column range
// BB#3: // %for_loop36.lr.ph
// in Loop: Header=BB8_2 Depth=1
    setp.lt.s32 %p0, %r5, 0;           // true iff the initial mask is on
    mov.u32 %r8, %r7;
    mov.u32 %r9, %r3;
    @%p0 bra BB8_4;
    bra.uni BB8_16;                    // iteration limit <= 0: just write zeros
BB8_4: // in Loop: Header=BB8_2 Depth=1
    cvt.rn.f32.s32 %f4, %r2;
    mul.lo.s32 %r8, %r2, %r0;          // %r8 = row * width
    fma.rn.f32 %f4, %f3, %f4, %f2;     // %f4 = y coordinate of this row
    mov.u32 %r9, %r3;                  // %r9 = current column base
BB8_8: // %for_loop.i178.lr.ph.us
// Parent Loop BB8_2 Depth=1
// => This Loop Header: Depth=2
// Child Loop BB8_9 Depth 3
    // Callseq Start 5
    {
    .reg .b32 temp_param_reg;
    // <end>}
    .param .b32 retval0;
    call.uni (retval0),
    getLaneIndex___UM_,
    (
    );
    ld.param.b32 %r10, [retval0+0];
    //{
    }// Callseq End 5
    add.s32 %r10, %r10, %r9;
    cvt.rn.f32.s32 %f5, %r10;
    fma.rn.f32 %f5, %f1, %f5, %f0;     // %f5 = x coordinate of this lane's column
    mov.u32 %r11, 0;
    mov.u32 %r13, %r5;                 // %r13 = "still iterating" mask
    mov.u32 %r12, %r11;                // %r12 = "escaped" mask
    mov.u32 %r10, %r11;                // %r10 = iteration count
    mov.f32 %f7, %f5;                  // %f7 = z real part
    mov.f32 %f6, %f4;                  // %f6 = z imaginary part
BB8_9: // %for_loop.i178.us
// Parent Loop BB8_2 Depth=1
// Parent Loop BB8_8 Depth=2
// => This Inner Loop Header: Depth=3
    mul.f32 %f8, %f7, %f7;
    fma.rn.f32 %f9, %f6, %f6, %f8;     // |z|^2
    setp.gtu.f32 %p0, %f9, 0f40800000; // > 4.0 (also true when unordered)
    selp.b32 %r14, %r13, 0, %p0;
    or.b32 %r12, %r14, %r12;           // record escape for an active lane
    shr.u32 %r14, %r12, 31;
    shr.u32 %r15, %r13, 31;
    setp.eq.s32 %p0, %r14, %r15;       // escaped mask == active mask?
    @%p0 bra BB8_10;
    bra.uni BB8_11;
BB8_10: // in Loop: Header=BB8_9 Depth=3
    mov.u32 %r13, %r11;                // nothing left running: clear the mask
    bra.uni BB8_12;
BB8_11: // %not_all_continued_or_breaked.i192.us
// in Loop: Header=BB8_9 Depth=3
    // z = z^2 + c:  re' = x + (re^2 - im^2),  im' = y + 2*re*im
    mul.f32 %f9, %f6, %f6;
    not.b32 %r14, %r12;
    and.b32 %r13, %r13, %r14;
    sub.f32 %f8, %f8, %f9;
    add.f32 %f8, %f5, %f8;
    add.f32 %f7, %f7, %f7;
    fma.rn.f32 %f6, %f6, %f7, %f4;
    mov.f32 %f7, %f8;
BB8_12: // %for_step.i161.us
// in Loop: Header=BB8_9 Depth=3
    setp.ne.s32 %p0, %r13, 0;
    selp.u32 %r14, 1, 0, %p0;
    add.s32 %r10, %r10, %r14;          // count one more iteration while active
    setp.lt.s32 %p0, %r10, %r1;
    selp.b32 %r13, %r13, 0, %p0;       // stop at the iteration limit
    setp.lt.s32 %p0, %r13, 0;
    @%p0 bra BB8_9;
// BB#5: // %mandel___vyfvyfvyi.exit193.us
// in Loop: Header=BB8_8 Depth=2
    // Callseq Start 6
    {
    .reg .b32 temp_param_reg;
    // <end>}
    .param .b32 retval0;
    call.uni (retval0),
    getLaneIndex___UM_,
    (
    );
    ld.param.b32 %r11, [retval0+0];
    //{
    }// Callseq End 6
    // Callseq Start 7
    {
    .reg .b32 temp_param_reg;
    // <end>}
    .param .b32 retval0;
    call.uni (retval0),
    getLaneIndex___UM_,
    (
    );
    ld.param.b32 %r12, [retval0+0];
    //{
    }// Callseq End 7
    add.s32 %r12, %r12, %r9;
    setp.ge.s32 %p0, %r12, %r4;
    @%p0 bra BB8_7;                    // this lane's column is past the tile edge
// BB#6: // %if_then.us
// in Loop: Header=BB8_8 Depth=2
    add.s32 %r12, %r9, %r8;
    add.s32 %r11, %r12, %r11;          // element index = row*width + column base + lane
    cvt.s64.s32 %rl1, %r11;            // widen before scaling so index*4 cannot wrap
    shl.b64 %rl1, %rl1, 2;
    add.s64 %rl1, %rl1, %rl0;
    st.u32 [%rl1], %r10;
BB8_7: // %if_exit.us
// in Loop: Header=BB8_8 Depth=2
    add.s32 %r9, %r9, 32;
    setp.lt.s32 %p0, %r9, %r4;
    @%p0 bra BB8_8;
    bra.uni BB8_13;
BB8_16: // %mandel___vyfvyfvyi.exit193
// Parent Loop BB8_2 Depth=1
// => This Inner Loop Header: Depth=2
    // Zero-fill path. The three calls are kept as emitted; the first result
    // is overwritten by the second.
    // Callseq Start 2
    {
    .reg .b32 temp_param_reg;
    // <end>}
    .param .b32 retval0;
    call.uni (retval0),
    getLaneIndex___UM_,
    (
    );
    ld.param.b32 %r10, [retval0+0];
    //{
    }// Callseq End 2
    // Callseq Start 3
    {
    .reg .b32 temp_param_reg;
    // <end>}
    .param .b32 retval0;
    call.uni (retval0),
    getLaneIndex___UM_,
    (
    );
    ld.param.b32 %r10, [retval0+0];
    //{
    }// Callseq End 3
    // Callseq Start 4
    {
    .reg .b32 temp_param_reg;
    // <end>}
    .param .b32 retval0;
    call.uni (retval0),
    getLaneIndex___UM_,
    (
    );
    ld.param.b32 %r11, [retval0+0];
    //{
    }// Callseq End 4
    add.s32 %r11, %r11, %r9;
    setp.lt.s32 %p0, %r11, %r4;
    @%p0 bra BB8_17;
    bra.uni BB8_15;
BB8_17: // %if_then
// in Loop: Header=BB8_16 Depth=2
    add.s32 %r10, %r10, %r8;           // element index = running row offset + lane
    cvt.s64.s32 %rl1, %r10;            // widen before scaling so index*4 cannot wrap
    shl.b64 %rl1, %rl1, 2;
    add.s64 %rl1, %rl1, %rl0;
    mov.u32 %r10, 0;
    st.u32 [%rl1], %r10;
BB8_15: // %if_exit
// in Loop: Header=BB8_16 Depth=2
    add.s32 %r9, %r9, 32;
    add.s32 %r8, %r8, 32;
    setp.lt.s32 %p0, %r9, %r4;
    @%p0 bra BB8_16;
BB8_13: // %for_exit37
// in Loop: Header=BB8_2 Depth=1
    add.s32 %r2, %r2, 1;               // next row
    add.s32 %r7, %r7, %r0;             // advance the row's element index by width
    setp.eq.s32 %p0, %r2, %r6;
    @%p0 bra BB8_14;
    bra.uni BB8_2;
BB8_14: // %for_exit
    ret;
}

View File

@@ -1,178 +0,0 @@
//
// Generated by NVIDIA NVVM Compiler
// Compiler built on Thu Jul 18 02:37:37 2013 (1374107857)
// Cuda compilation tools, release 5.5, V5.5.0
//
.version 3.2
.target sm_20
.address_size 64
// mandelbrot_scanline -- kernel entry point.
//
// Each thread block handles one tile of the image and writes one 32-bit
// iteration count per pixel.
//   param_0 (f32) x origin            param_1 (f32) x step per column
//   param_2 (f32) y origin            param_3 (f32) y step per row
//   param_4 (u32) row pitch / x limit ("width")
//   param_5 (u32) y limit ("height")
//   param_6 (u32) columns per block   param_7 (u32) rows per block
//   param_8 (u32) iteration limit     param_9 (u64) output base address
//
// Tile: columns [ctaid.x*p6, min(ctaid.x*p6+p6, p4)) and rows
// [ctaid.y*p7, min(ctaid.y*p7+p7, p5)). Columns advance by WARP_SZ, and a
// thread works on column base + (tid.x & (WARP_SZ-1)).
//
// Changes from the generated code:
//   * The store byte offset is formed in 64 bits (sign-extend the element
//     index, then shift). The old 32-bit shift wrapped once the index
//     reached 2^29.
//   * The store index uses the lane value already computed as
//     (WARP_SZ-1) & tid.x. The old code re-masked tid.x with
//     WARP_SZ+0x3FFFFFFF, which was only correct because the 32-bit shift
//     threw away bit 30.
.visible .entry mandelbrot_scanline(
    .param .f32 mandelbrot_scanline_param_0,
    .param .f32 mandelbrot_scanline_param_1,
    .param .f32 mandelbrot_scanline_param_2,
    .param .f32 mandelbrot_scanline_param_3,
    .param .u32 mandelbrot_scanline_param_4,
    .param .u32 mandelbrot_scanline_param_5,
    .param .u32 mandelbrot_scanline_param_6,
    .param .u32 mandelbrot_scanline_param_7,
    .param .u32 mandelbrot_scanline_param_8,
    .param .u64 mandelbrot_scanline_param_9
)
{
    .reg .pred %p<32>;
    .reg .s32 %r<62>;
    .reg .f32 %f<28>;
    .reg .s64 %rd<6>;
    ld.param.f32 %f11, [mandelbrot_scanline_param_0];
    ld.param.f32 %f12, [mandelbrot_scanline_param_1];
    ld.param.f32 %f13, [mandelbrot_scanline_param_2];
    ld.param.f32 %f14, [mandelbrot_scanline_param_3];
    ld.param.u32 %r17, [mandelbrot_scanline_param_4];
    ld.param.u32 %r18, [mandelbrot_scanline_param_5];
    ld.param.u32 %r19, [mandelbrot_scanline_param_6];
    ld.param.u32 %r20, [mandelbrot_scanline_param_7];
    ld.param.u32 %r21, [mandelbrot_scanline_param_8];
    ld.param.u64 %rd1, [mandelbrot_scanline_param_9];
    mov.u32 %r22, %ctaid.x;
    mad.lo.s32 %r23, %r22, %r19, %r19;
    min.s32 %r1, %r23, %r17;           // %r1 = column limit of the tile
    mov.u32 %r2, %ctaid.y;
    mul.lo.s32 %r59, %r2, %r20;        // %r59 = current row, starts at the tile's first row
    add.s32 %r24, %r59, %r20;
    min.s32 %r25, %r24, %r18;
    setp.ge.s32 %p10, %r59, %r25;
    @%p10 bra BB0_15;                  // tile lies entirely below the image
    // %r4 = ~max(~height, ~((ctaid.y+1)*rows)) = min(height, (ctaid.y+1)*rows)
    not.b32 %r26, %r18;
    add.s32 %r27, %r2, 1;
    mul.lo.s32 %r28, %r27, %r20;
    not.b32 %r29, %r28;
    max.s32 %r30, %r26, %r29;
    not.b32 %r4, %r30;
BB0_2:
    // Row loop.
    mul.lo.s32 %r60, %r22, %r19;       // %r60 = column base, restarts at the tile's first column
    setp.ge.s32 %p11, %r60, %r1;
    @%p11 bra BB0_14;
    cvt.rn.f32.s32 %f15, %r59;
    setp.gt.s32 %p12, %r21, 0;
    fma.rn.f32 %f1, %f15, %f14, %f13;  // %f1 = y coordinate of this row
    @%p12 bra BB0_7;
BB0_4:
    // Iteration limit <= 0: every in-range pixel gets 0.
    mov.u32 %r8, WARP_SZ;
    add.s32 %r34, %r8, -1;
    mov.u32 %r35, %tid.x;
    and.b32 %r36, %r34, %r35;          // %r36 = lane within the warp
    add.s32 %r37, %r36, %r60;
    setp.ge.s32 %p13, %r37, %r1;
    @%p13 bra BB0_6;
    mad.lo.s32 %r38, %r59, %r17, %r60; // row*width + column base
    add.s32 %r43, %r38, %r36;          // + lane
    cvt.s64.s32 %rd2, %r43;            // widen before scaling so index*4 cannot wrap
    shl.b64 %rd2, %rd2, 2;
    add.s64 %rd3, %rd2, %rd1;
    mov.u32 %r45, 0;
    st.u32 [%rd3], %r45;
BB0_6:
    add.s32 %r60, %r8, %r60;
    setp.lt.s32 %p14, %r60, %r1;
    @%p14 bra BB0_4;
    bra.uni BB0_14;
BB0_7:
    // Column loop for the normal case.
    mov.u32 %r47, WARP_SZ;
    add.s32 %r48, %r47, -1;
    mov.u32 %r49, %tid.x;
    and.b32 %r50, %r48, %r49;          // %r50 = lane within the warp
    add.s32 %r11, %r50, %r60;          // %r11 = this thread's column
    cvt.rn.f32.s32 %f16, %r11;
    fma.rn.f32 %f2, %f16, %f12, %f11;  // %f2 = x coordinate
    mov.u32 %r61, 0;                   // %r61 = iteration count
    mov.pred %p16, 0;
    mov.pred %p29, -1;                 // %p29 = still iterating
    mov.pred %p26, %p12;               // %p26 = count < limit
    mov.pred %p31, %p16;               // %p31 = escaped
    mov.f32 %f22, %f2;                 // z real part
    mov.f32 %f26, %f1;                 // z imaginary part
BB0_8:
    mov.f32 %f24, %f26;
    mov.f32 %f27, %f24;
    mov.f32 %f20, %f22;
    mov.f32 %f23, %f20;
    mov.pred %p3, %p29;
    mov.pred %p2, %p26;
    and.pred %p5, %p3, %p2;
    mul.f32 %f6, %f23, %f23;           // re^2
    mul.f32 %f5, %f27, %f27;           // im^2
    add.f32 %f17, %f5, %f6;
    setp.gtu.f32 %p18, %f17, 0f40800000; // |z|^2 > 4.0 (also true when unordered)
    and.pred %p19, %p5, %p18;
    or.pred %p31, %p19, %p31;
    xor.pred %p20, %p31, %p5;
    mov.pred %p30, %p16;
    @!%p20 bra BB0_10;
    bra.uni BB0_9;
BB0_9:
    // z = z^2 + c:  im' = y + 2*re*im,  re' = x + (re^2 - im^2)
    add.f32 %f18, %f23, %f23;
    fma.rn.f32 %f27, %f27, %f18, %f1;
    sub.f32 %f19, %f6, %f5;
    add.f32 %f23, %f2, %f19;
    not.pred %p21, %p31;
    and.pred %p7, %p5, %p21;
    mov.pred %p30, %p7;
BB0_10:
    mov.f32 %f9, %f27;
    mov.f32 %f10, %f23;
    mov.pred %p28, %p30;
    mov.pred %p29, %p28;
    add.s32 %r51, %r61, 1;
    selp.b32 %r61, %r51, %r61, %p29;   // count only while still iterating
    setp.lt.s32 %p9, %r61, %r21;
    and.pred %p22, %p29, %p9;
    mov.pred %p26, %p9;
    mov.f32 %f22, %f10;
    mov.f32 %f26, %f9;
    @%p22 bra BB0_8;
    setp.ge.s32 %p23, %r11, %r1;
    @%p23 bra BB0_13;                  // column is past the tile edge: no store
    mad.lo.s32 %r52, %r59, %r17, %r60; // row*width + column base
    add.s32 %r57, %r52, %r50;          // + lane
    cvt.s64.s32 %rd4, %r57;            // widen before scaling so index*4 cannot wrap
    shl.b64 %rd4, %rd4, 2;
    add.s64 %rd5, %rd4, %rd1;
    st.u32 [%rd5], %r61;
BB0_13:
    add.s32 %r60, %r47, %r60;
    setp.lt.s32 %p24, %r60, %r1;
    @%p24 bra BB0_7;
BB0_14:
    add.s32 %r59, %r59, 1;
    setp.ne.s32 %p25, %r59, %r4;
    @%p25 bra BB0_2;
BB0_15:
    ret;
}

File diff suppressed because one or more lines are too long

View File

@@ -1,171 +0,0 @@
.file "mandelbrot_task.ispc"
.text
#-----------------------------------------------------------------------
# mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E_
# Syntax: GAS AT&T.  Register usage follows System V AMD64.
# In:   xmm0, xmm1 = origin floats; xmm2, xmm3 = far-corner floats
#       edi, esi   = two ints (stored as the block's +16 / +20 fields)
#       edx        = int stored at +32;  rcx = pointer stored at +40
#       ymm4       = 8-lane execution mask
# Does: gets a 96-byte block from ISPCAlloc, fills it, passes it to
#       ISPCLaunch with counts edi/16 and esi/16, then calls ISPCSync if
#       the launch handle is non-null.
# Block layout written here:
#       +0 xmm0   +4 (xmm2-xmm0)/edi   +8 xmm1   +12 (xmm3-xmm1)/esi
#       +16 edi   +20 esi   +24 16   +28 16   +32 edx   +40 rcx   +64 mask
# Frame (after subq $88): 28 xmm0 | 32..63 mask | 68, 72 quotients |
#       76 xmm1 | 80 launch handle
# Change from the generated code: the task-function address is loaded
# RIP-relative (leaq) rather than as a 32-bit absolute immediate, so the
# object also links as PIE and the address is not truncated.
#-----------------------------------------------------------------------
.globl mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E_
.align 16, 0x90
.type mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E_,@function
mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E_: # @mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E_
# BB#0: # %allocas
        pushq   %rbp
        pushq   %r15
        pushq   %r14
        pushq   %rbx
        subq    $88, %rsp               # 32 (pushes) + 88 + 8 (ret addr) = 128: 16-aligned at calls
        vmovups %ymm4, 32(%rsp)         # 32-byte Folded Spill (mask)
        movq    %rcx, %r14              # keep integer args in callee-saved regs across calls
        movl    %edx, %r15d
        movl    %esi, %ebx
        movl    %edi, %ebp
        vmovss  %xmm1, 76(%rsp)         # 4-byte Spill
        vmovss  %xmm0, 28(%rsp)         # 4-byte Spill
        vcvtsi2ssl %ebp, %xmm0, %xmm5   # xmm5 = (float)edi
        vsubss  %xmm0, %xmm2, %xmm4     # xmm4 = xmm2 - xmm0
        vcvtsi2ssl %ebx, %xmm0, %xmm2   # xmm2 = (float)esi
        vsubss  %xmm1, %xmm3, %xmm3     # xmm3 = xmm3 - xmm1
        movq    $0, 80(%rsp)            # launch handle = NULL
        leaq    80(%rsp), %rdi
        vdivss  %xmm2, %xmm3, %xmm1     # per-row step
        vmovss  %xmm1, 72(%rsp)         # 4-byte Spill
        vdivss  %xmm5, %xmm4, %xmm0     # per-column step
        vmovss  %xmm0, 68(%rsp)         # 4-byte Spill
        movl    $96, %esi
        movl    $32, %edx
        vzeroupper
        callq   ISPCAlloc               # (rdi = &handle, esi = 96, edx = 32) -> rax
        vmovups 32(%rsp), %ymm0         # 32-byte Folded Reload (mask)
        movq    %rax, %rdx              # rdx = block; also the 3rd ISPCLaunch argument
        movl    %ebx, %r8d              # r8d = esi / 16, signed, rounding toward zero
        sarl    $31, %r8d
        shrl    $28, %r8d
        addl    %ebx, %r8d
        vmovss  28(%rsp), %xmm1         # 4-byte Reload
        vmovss  %xmm1, (%rdx)
        sarl    $4, %r8d
        movl    %ebp, %ecx              # ecx = edi / 16, same rounding
        sarl    $31, %ecx
        shrl    $28, %ecx
        addl    %ebp, %ecx
        sarl    $4, %ecx
        vmovmskps %ymm0, %eax
        cmpl    $255, %eax              # all 8 lanes on?  (flags survive the moves below)
        vmovss  68(%rsp), %xmm1         # 4-byte Reload
        vmovss  %xmm1, 4(%rdx)
        vmovss  76(%rsp), %xmm1         # 4-byte Reload
        vmovss  %xmm1, 8(%rdx)
        vmovss  72(%rsp), %xmm1         # 4-byte Reload
        vmovss  %xmm1, 12(%rdx)
        movl    %ebp, 16(%rdx)
        movl    %ebx, 20(%rdx)
        movl    $16, 24(%rdx)
        movl    $16, 28(%rdx)
        movl    %r15d, 32(%rdx)
        movq    %r14, 40(%rdx)
        jne     .LBB0_2
# BB#1: # %all_on
        vpcmpeqd %xmm0, %xmm0, %xmm0    # rebuild the mask as all-ones in every bit
        vinsertf128 $1, %xmm0, %ymm0, %ymm0
.LBB0_2: # %all_on
        vmovaps %ymm0, 64(%rdx)         # aligned store: needs block+64 to be 32-byte aligned
        leaq    80(%rsp), %rdi
        leaq    mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E_(%rip), %rsi
        movl    $1, %r9d
        vzeroupper
        callq   ISPCLaunch              # (rdi = &handle, rsi = task fn, rdx = block, ecx, r8d, r9d = 1)
        movq    80(%rsp), %rdi
        testq   %rdi, %rdi
        je      .LBB0_4
# BB#3: # %call_sync
        callq   ISPCSync
        movq    $0, 80(%rsp)
.LBB0_4: # %post_sync
        addq    $88, %rsp
        popq    %rbx
        popq    %r14
        popq    %r15
        popq    %rbp
        ret
.Ltmp0:
.size mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E_, .Ltmp0-mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E_
#-----------------------------------------------------------------------
# mandelbrot_ispc
# Syntax: GAS AT&T.  Register usage follows System V AMD64.
# In:   xmm0, xmm1 = origin floats; xmm2, xmm3 = far-corner floats
#       edi, esi   = two ints (stored as the block's +16 / +20 fields)
#       edx        = int stored at +32;  rcx = pointer stored at +40
# Does: same as the mask-taking variant, but always stores an all-ones
#       mask: ISPCAlloc(96, 32), fill the block, ISPCLaunch with counts
#       edi/16 and esi/16, then ISPCSync if the handle is non-null.
# Block layout written here:
#       +0 xmm0   +4 (xmm2-xmm0)/edi   +8 xmm1   +12 (xmm3-xmm1)/esi
#       +16 edi   +20 esi   +24 16   +28 16   +32 edx   +40 rcx   +64 mask
# Frame (after subq $32): 8 xmm0 | 12, 16 quotients | 20 xmm1 | 24 handle
# Change from the generated code: the task-function address is loaded
# RIP-relative (leaq) rather than as a 32-bit absolute immediate, so the
# object also links as PIE and the address is not truncated.
#-----------------------------------------------------------------------
.globl mandelbrot_ispc
.align 16, 0x90
.type mandelbrot_ispc,@function
mandelbrot_ispc: # @mandelbrot_ispc
# BB#0: # %allocas
        pushq   %rbp
        pushq   %r15
        pushq   %r14
        pushq   %r12
        pushq   %rbx
        subq    $32, %rsp               # 40 (pushes) + 32 + 8 (ret addr) = 80: 16-aligned at calls
        movq    %rcx, %r14              # keep integer args in callee-saved regs across calls
        movl    %edx, %r15d
        movl    %esi, %ebx
        movl    %edi, %ebp
        vmovss  %xmm1, 20(%rsp)         # 4-byte Spill
        vmovss  %xmm0, 8(%rsp)          # 4-byte Spill
        vcvtsi2ssl %ebp, %xmm0, %xmm5   # xmm5 = (float)edi
        vsubss  %xmm0, %xmm2, %xmm4     # xmm4 = xmm2 - xmm0
        vcvtsi2ssl %ebx, %xmm0, %xmm2   # xmm2 = (float)esi
        vsubss  %xmm1, %xmm3, %xmm3     # xmm3 = xmm3 - xmm1
        movq    $0, 24(%rsp)            # launch handle = NULL
        leaq    24(%rsp), %r12          # r12 = &handle, reused for ISPCLaunch
        vdivss  %xmm2, %xmm3, %xmm1     # per-row step
        vmovss  %xmm1, 16(%rsp)         # 4-byte Spill
        vdivss  %xmm5, %xmm4, %xmm0     # per-column step
        vmovss  %xmm0, 12(%rsp)         # 4-byte Spill
        movq    %r12, %rdi
        movl    $96, %esi
        movl    $32, %edx
        callq   ISPCAlloc               # (rdi = &handle, esi = 96, edx = 32) -> rax
        movl    %ebx, %r8d              # r8d = esi / 16, signed, rounding toward zero
        sarl    $31, %r8d
        vpcmpeqd %xmm0, %xmm0, %xmm0    # low half of the all-ones mask
        vmovss  8(%rsp), %xmm1          # 4-byte Reload
        vmovss  %xmm1, (%rax)
        shrl    $28, %r8d
        addl    %ebx, %r8d
        movl    %ebp, %ecx              # ecx = edi / 16, same rounding
        sarl    $31, %ecx
        shrl    $28, %ecx
        addl    %ebp, %ecx
        sarl    $4, %ecx
        sarl    $4, %r8d
        vinsertf128 $1, %xmm0, %ymm0, %ymm0 # ymm0 = all ones
        vmovss  12(%rsp), %xmm1         # 4-byte Reload
        vmovss  %xmm1, 4(%rax)
        vmovss  20(%rsp), %xmm1         # 4-byte Reload
        vmovss  %xmm1, 8(%rax)
        vmovss  16(%rsp), %xmm1         # 4-byte Reload
        vmovss  %xmm1, 12(%rax)
        movl    %ebp, 16(%rax)
        movl    %ebx, 20(%rax)
        movl    $16, 24(%rax)
        movl    $16, 28(%rax)
        movl    %r15d, 32(%rax)
        movq    %r14, 40(%rax)
        vmovaps %ymm0, 64(%rax)         # aligned store: needs block+64 to be 32-byte aligned
        movq    %r12, %rdi
        leaq    mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E_(%rip), %rsi
        movq    %rax, %rdx
        movl    $1, %r9d
        vzeroupper
        callq   ISPCLaunch              # (rdi = &handle, rsi = task fn, rdx = block, ecx, r8d, r9d = 1)
        movq    24(%rsp), %rdi
        testq   %rdi, %rdi
        je      .LBB1_2
# BB#1: # %call_sync
        callq   ISPCSync
        movq    $0, 24(%rsp)
.LBB1_2: # %post_sync
        addq    $32, %rsp
        popq    %rbx
        popq    %r12
        popq    %r14
        popq    %r15
        popq    %rbp
        ret
.Ltmp1:
.size mandelbrot_ispc, .Ltmp1-mandelbrot_ispc
.section ".note.GNU-stack","",@progbits

View File

@@ -1,171 +0,0 @@
.file "mandelbrot_task.ispc"
.text
#-----------------------------------------------------------------------
# mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E_
# Syntax: GAS AT&T.  Register usage follows System V AMD64.
# In:   xmm0, xmm1 = origin floats; xmm2, xmm3 = far-corner floats
#       edi, esi   = two ints (stored as the block's +16 / +20 fields)
#       edx        = int stored at +32;  rcx = pointer stored at +40
#       ymm4       = 8-lane execution mask
# Does: gets a 96-byte block from ISPCAlloc, fills it, passes it to
#       ISPCLaunch with counts edi/16 and esi/16, then calls ISPCSync if
#       the launch handle is non-null.
# Block layout written here:
#       +0 xmm0   +4 (xmm2-xmm0)/edi   +8 xmm1   +12 (xmm3-xmm1)/esi
#       +16 edi   +20 esi   +24 16   +28 16   +32 edx   +40 rcx   +64 mask
# Frame (after subq $88): 28 xmm0 | 32..63 mask | 68, 72 quotients |
#       76 xmm1 | 80 launch handle
# Change from the generated code: the task-function address is loaded
# RIP-relative (leaq) rather than as a 32-bit absolute immediate, so the
# object also links as PIE and the address is not truncated.
#-----------------------------------------------------------------------
.globl mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E_
.align 16, 0x90
.type mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E_,@function
mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E_: # @mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E_
# BB#0: # %allocas
        pushq   %rbp
        pushq   %r15
        pushq   %r14
        pushq   %rbx
        subq    $88, %rsp               # 32 (pushes) + 88 + 8 (ret addr) = 128: 16-aligned at calls
        vmovups %ymm4, 32(%rsp)         # 32-byte Folded Spill (mask)
        movq    %rcx, %r14              # keep integer args in callee-saved regs across calls
        movl    %edx, %r15d
        movl    %esi, %ebx
        movl    %edi, %ebp
        vmovss  %xmm1, 76(%rsp)         # 4-byte Spill
        vmovss  %xmm0, 28(%rsp)         # 4-byte Spill
        vcvtsi2ssl %ebp, %xmm0, %xmm5   # xmm5 = (float)edi
        vsubss  %xmm0, %xmm2, %xmm4     # xmm4 = xmm2 - xmm0
        vcvtsi2ssl %ebx, %xmm0, %xmm2   # xmm2 = (float)esi
        vsubss  %xmm1, %xmm3, %xmm3     # xmm3 = xmm3 - xmm1
        movq    $0, 80(%rsp)            # launch handle = NULL
        leaq    80(%rsp), %rdi
        vdivss  %xmm2, %xmm3, %xmm1     # per-row step
        vmovss  %xmm1, 72(%rsp)         # 4-byte Spill
        vdivss  %xmm5, %xmm4, %xmm0     # per-column step
        vmovss  %xmm0, 68(%rsp)         # 4-byte Spill
        movl    $96, %esi
        movl    $32, %edx
        vzeroupper
        callq   ISPCAlloc               # (rdi = &handle, esi = 96, edx = 32) -> rax
        vmovups 32(%rsp), %ymm0         # 32-byte Folded Reload (mask)
        movq    %rax, %rdx              # rdx = block; also the 3rd ISPCLaunch argument
        movl    %ebx, %r8d              # r8d = esi / 16, signed, rounding toward zero
        sarl    $31, %r8d
        shrl    $28, %r8d
        addl    %ebx, %r8d
        vmovss  28(%rsp), %xmm1         # 4-byte Reload
        vmovss  %xmm1, (%rdx)
        sarl    $4, %r8d
        movl    %ebp, %ecx              # ecx = edi / 16, same rounding
        sarl    $31, %ecx
        shrl    $28, %ecx
        addl    %ebp, %ecx
        sarl    $4, %ecx
        vmovmskps %ymm0, %eax
        cmpl    $255, %eax              # all 8 lanes on?  (flags survive the moves below)
        vmovss  68(%rsp), %xmm1         # 4-byte Reload
        vmovss  %xmm1, 4(%rdx)
        vmovss  76(%rsp), %xmm1         # 4-byte Reload
        vmovss  %xmm1, 8(%rdx)
        vmovss  72(%rsp), %xmm1         # 4-byte Reload
        vmovss  %xmm1, 12(%rdx)
        movl    %ebp, 16(%rdx)
        movl    %ebx, 20(%rdx)
        movl    $16, 24(%rdx)
        movl    $16, 28(%rdx)
        movl    %r15d, 32(%rdx)
        movq    %r14, 40(%rdx)
        jne     .LBB0_2
# BB#1: # %all_on
        vpcmpeqd %xmm0, %xmm0, %xmm0    # rebuild the mask as all-ones in every bit
        vinsertf128 $1, %xmm0, %ymm0, %ymm0
.LBB0_2: # %all_on
        vmovaps %ymm0, 64(%rdx)         # aligned store: needs block+64 to be 32-byte aligned
        leaq    80(%rsp), %rdi
        leaq    mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E_(%rip), %rsi
        movl    $1, %r9d
        vzeroupper
        callq   ISPCLaunch              # (rdi = &handle, rsi = task fn, rdx = block, ecx, r8d, r9d = 1)
        movq    80(%rsp), %rdi
        testq   %rdi, %rdi
        je      .LBB0_4
# BB#3: # %call_sync
        callq   ISPCSync
        movq    $0, 80(%rsp)
.LBB0_4: # %post_sync
        addq    $88, %rsp
        popq    %rbx
        popq    %r14
        popq    %r15
        popq    %rbp
        ret
.Ltmp0:
.size mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E_, .Ltmp0-mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E_
#-----------------------------------------------------------------------
# mandelbrot_ispc
# Syntax: GAS AT&T.  Register usage follows System V AMD64.
# In:   xmm0, xmm1 = origin floats; xmm2, xmm3 = far-corner floats
#       edi, esi   = two ints (stored as the block's +16 / +20 fields)
#       edx        = int stored at +32;  rcx = pointer stored at +40
# Does: same as the mask-taking variant, but always stores an all-ones
#       mask: ISPCAlloc(96, 32), fill the block, ISPCLaunch with counts
#       edi/16 and esi/16, then ISPCSync if the handle is non-null.
# Block layout written here:
#       +0 xmm0   +4 (xmm2-xmm0)/edi   +8 xmm1   +12 (xmm3-xmm1)/esi
#       +16 edi   +20 esi   +24 16   +28 16   +32 edx   +40 rcx   +64 mask
# Frame (after subq $32): 8 xmm0 | 12, 16 quotients | 20 xmm1 | 24 handle
# Change from the generated code: the task-function address is loaded
# RIP-relative (leaq) rather than as a 32-bit absolute immediate, so the
# object also links as PIE and the address is not truncated.
#-----------------------------------------------------------------------
.globl mandelbrot_ispc
.align 16, 0x90
.type mandelbrot_ispc,@function
mandelbrot_ispc: # @mandelbrot_ispc
# BB#0: # %allocas
        pushq   %rbp
        pushq   %r15
        pushq   %r14
        pushq   %r12
        pushq   %rbx
        subq    $32, %rsp               # 40 (pushes) + 32 + 8 (ret addr) = 80: 16-aligned at calls
        movq    %rcx, %r14              # keep integer args in callee-saved regs across calls
        movl    %edx, %r15d
        movl    %esi, %ebx
        movl    %edi, %ebp
        vmovss  %xmm1, 20(%rsp)         # 4-byte Spill
        vmovss  %xmm0, 8(%rsp)          # 4-byte Spill
        vcvtsi2ssl %ebp, %xmm0, %xmm5   # xmm5 = (float)edi
        vsubss  %xmm0, %xmm2, %xmm4     # xmm4 = xmm2 - xmm0
        vcvtsi2ssl %ebx, %xmm0, %xmm2   # xmm2 = (float)esi
        vsubss  %xmm1, %xmm3, %xmm3     # xmm3 = xmm3 - xmm1
        movq    $0, 24(%rsp)            # launch handle = NULL
        leaq    24(%rsp), %r12          # r12 = &handle, reused for ISPCLaunch
        vdivss  %xmm2, %xmm3, %xmm1     # per-row step
        vmovss  %xmm1, 16(%rsp)         # 4-byte Spill
        vdivss  %xmm5, %xmm4, %xmm0     # per-column step
        vmovss  %xmm0, 12(%rsp)         # 4-byte Spill
        movq    %r12, %rdi
        movl    $96, %esi
        movl    $32, %edx
        callq   ISPCAlloc               # (rdi = &handle, esi = 96, edx = 32) -> rax
        movl    %ebx, %r8d              # r8d = esi / 16, signed, rounding toward zero
        sarl    $31, %r8d
        vpcmpeqd %xmm0, %xmm0, %xmm0    # low half of the all-ones mask
        vmovss  8(%rsp), %xmm1          # 4-byte Reload
        vmovss  %xmm1, (%rax)
        shrl    $28, %r8d
        addl    %ebx, %r8d
        movl    %ebp, %ecx              # ecx = edi / 16, same rounding
        sarl    $31, %ecx
        shrl    $28, %ecx
        addl    %ebp, %ecx
        sarl    $4, %ecx
        sarl    $4, %r8d
        vinsertf128 $1, %xmm0, %ymm0, %ymm0 # ymm0 = all ones
        vmovss  12(%rsp), %xmm1         # 4-byte Reload
        vmovss  %xmm1, 4(%rax)
        vmovss  20(%rsp), %xmm1         # 4-byte Reload
        vmovss  %xmm1, 8(%rax)
        vmovss  16(%rsp), %xmm1         # 4-byte Reload
        vmovss  %xmm1, 12(%rax)
        movl    %ebp, 16(%rax)
        movl    %ebx, 20(%rax)
        movl    $16, 24(%rax)
        movl    $16, 28(%rax)
        movl    %r15d, 32(%rax)
        movq    %r14, 40(%rax)
        vmovaps %ymm0, 64(%rax)         # aligned store: needs block+64 to be 32-byte aligned
        movq    %r12, %rdi
        leaq    mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E_(%rip), %rsi
        movq    %rax, %rdx
        movl    $1, %r9d
        vzeroupper
        callq   ISPCLaunch              # (rdi = &handle, rsi = task fn, rdx = block, ecx, r8d, r9d = 1)
        movq    24(%rsp), %rdi
        testq   %rdi, %rdi
        je      .LBB1_2
# BB#1: # %call_sync
        callq   ISPCSync
        movq    $0, 24(%rsp)
.LBB1_2: # %post_sync
        addq    $32, %rsp
        popq    %rbx
        popq    %r12
        popq    %r14
        popq    %r15
        popq    %rbp
        ret
.Ltmp1:
.size mandelbrot_ispc, .Ltmp1-mandelbrot_ispc
.section ".note.GNU-stack","",@progbits

File diff suppressed because one or more lines are too long

View File

@@ -1,208 +0,0 @@
; ModuleID = 'mandelbrot_task_nvptx64.bc'
; Module preamble: 64-bit NVPTX target, little-endian, 64-bit pointers.
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
target triple = "nvptx64"
; NUL-terminated message printed by the kernel's assertion-failure path.
@__str = internal constant [66 x i8] c"mandelbrot_task.ispc:55:3: Assertion failed: xspan >= vectorWidth\00"
; NVVM special-register intrinsics used by the kernel: thread index x,
; block index x/y, and warp size.
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #0
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #0
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.warpsize() #0
; External routines called only from the assertion-failure block; no
; definition is present in this module.
; Function Attrs: nounwind
declare i32 @puts(i8* nocapture) #1
; Function Attrs: noreturn
declare void @abort() #2
; Function Attrs: nounwind
; mandelbrot_scanline -- kernel (see the nvvm.annotations entry for this
; function). Each thread block handles one tile of the image:
;   columns [ctaid.x * xspan, min(ctaid.x * xspan + xspan, width))
;   rows    [ctaid.y * yspan, min(ctaid.y * yspan + yspan, height))
; Within a tile, columns are walked in strides of the warp size and each
; thread works on column xi + (tid.x & (warpsize - 1)). For every pixel the
; iteration z = z*z + c runs with c = (x0 + x*dx, y0 + y*dy) until
; |z|^2 > 4 or maxIterations is reached, and the iteration count is stored
; to output[y * width + x].
; All <1 x T> vectors are one-element vectors; masks are <1 x i1>.
; The x loop exists twice: the ".us" blocks are the copy taken when
; maxIterations > 0; the un-suffixed blocks are the copy for
; maxIterations <= 0, which just stores 0.
define void @mandelbrot_scanline(float %x0, float %dx, float %y0, float %dy, i32 %width, i32 %height, i32 %xspan, i32 %yspan, i32 %maxIterations, i32* %output) #3 {
allocas:
; xstart = ctaid.x * xspan; xend (%r.i.i) = min(xstart + xspan, width)
%bid.i.i = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
%mul_calltmp_xspan_load = mul i32 %bid.i.i, %xspan
%add_xstart_load_xspan_load13 = add i32 %mul_calltmp_xspan_load, %xspan
%c.i.i = icmp slt i32 %add_xstart_load_xspan_load13, %width
%r.i.i = select i1 %c.i.i, i32 %add_xstart_load_xspan_load13, i32 %width
; ystart = ctaid.y * yspan
%bid.i.i77 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #1
%mul_calltmp19_yspan_load = mul i32 %bid.i.i77, %yspan
%add_ystart_load_yspan_load20 = add i32 %mul_calltmp19_yspan_load, %yspan
; Assertion "xspan >= vectorWidth": fail when warpsize > xspan.
%tid.i.i80 = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() #1
%greaterequal_xspan_load24_calltmp27 = icmp sgt i32 %tid.i.i80, %xspan
br i1 %greaterequal_xspan_load24_calltmp27, label %fail.i, label %for_test.preheader
for_test.preheader: ; preds = %allocas
; yend (%r.i.i79) = min(ystart + yspan, height); skip everything if the row
; range is empty.
%c.i.i78 = icmp slt i32 %add_ystart_load_yspan_load20, %height
%r.i.i79 = select i1 %c.i.i78, i32 %add_ystart_load_yspan_load20, i32 %height
%less_yi_load_yend_load113 = icmp slt i32 %mul_calltmp19_yspan_load, %r.i.i79
br i1 %less_yi_load_yend_load113, label %for_test34.preheader.lr.ph, label %for_exit
for_test34.preheader.lr.ph: ; preds = %for_test.preheader
; Loop-invariant values hoisted out of the row loop.
%less_xi_load_xend_load111 = icmp slt i32 %mul_calltmp_xspan_load, %r.i.i
%maxIterations_load_broadcast_init = insertelement <1 x i32> undef, i32 %maxIterations, i32 0
; %v.i.i104 = (maxIterations > 0); selects which copy of the x loop runs.
%less_i_load_count_load.i102 = icmp sgt <1 x i32> %maxIterations_load_broadcast_init, zeroinitializer
%v.i.i104 = extractelement <1 x i1> %less_i_load_count_load.i102, i32 0
%output_load_ptr2int = ptrtoint i32* %output to i64
; %5 = ~max(~height, ~((ctaid.y + 1) * yspan)), i.e.
; min(height, (ctaid.y + 1) * yspan): the row-loop exit bound.
%0 = xor i32 %height, -1
%1 = add i32 %bid.i.i77, 1
%2 = mul i32 %1, %yspan
%3 = xor i32 %2, -1
%4 = icmp sgt i32 %0, %3
%smax = select i1 %4, i32 %0, i32 %3
%5 = xor i32 %smax, -1
br label %for_test34.preheader
fail.i: ; preds = %allocas
; Assertion failure: print the message and abort (does not return).
%call.i = call i32 @puts(i8* getelementptr inbounds ([66 x i8]* @__str, i64 0, i64 0)) #1
call void @abort() #4
unreachable
for_test34.preheader: ; preds = %for_exit37, %for_test34.preheader.lr.ph
; Row loop header: %yi.0114 is the current row.
%yi.0114 = phi i32 [ %mul_calltmp19_yspan_load, %for_test34.preheader.lr.ph ], [ %yi_load71_plus1, %for_exit37 ]
br i1 %less_xi_load_xend_load111, label %for_loop36.lr.ph, label %for_exit37
for_loop36.lr.ph: ; preds = %for_test34.preheader
; Per-row values: c_im = y0 + float(yi) * dy, and the row offset yi * width.
%yi_load46_to_float = sitofp i32 %yi.0114 to float
%mul_yi_load46_to_float_dy_load = fmul float %yi_load46_to_float, %dy
%add_y0_load_mul_yi_load46_to_float_dy_load = fadd float %mul_yi_load46_to_float_dy_load, %y0
%add_y0_load_mul_yi_load46_to_float_dy_load_broadcast_init = insertelement <1 x float> undef, float %add_y0_load_mul_yi_load46_to_float_dy_load, i32 0
%mul_yi_load50_width_load51 = mul i32 %yi.0114, %width
br i1 %v.i.i104, label %for_loop.i.lr.ph.us, label %mandel___vyfvyfvyi.exit
mandel___vyfvyfvyi.exit.us: ; preds = %for_step.i.us
; Iteration finished for this pixel. Recompute the lane's column and store
; only when it is inside the tile (column < xend).
%tid.i.i72.us = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
%tid.i.i.i.us = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() #1
%sub_calltmp3_.i.us = add i32 %tid.i.i.i.us, -1
%bitop.i.us = and i32 %sub_calltmp3_.i.us, %tid.i.i72.us
%add_xi_load56_calltmp59.us = add i32 %bitop.i.us, %xi.0112.us
%less_add_xi_load56_calltmp59_xend_load60.us = icmp slt i32 %add_xi_load56_calltmp59.us, %r.i.i
br i1 %less_add_xi_load56_calltmp59_xend_load60.us, label %if_then.us, label %if_exit.us
if_then.us: ; preds = %mandel___vyfvyfvyi.exit.us
; Store the iteration count at output + 4 * (yi * width + xi + lane).
; 1073741823 is 0x3FFFFFFF: warpsize + 0x3FFFFFFF equals warpsize - 1 in the
; low 30 bits, which are the only bits that survive the shl by 2 below.
; The byte offset is formed in 32 bits and then sign-extended.
%tid.i.i.i74.us = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() #1
%sub_calltmp3_.i75.us = add i32 %tid.i.i.i74.us, 1073741823
%tid.i.i73.us = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
%bitop.i76.us = and i32 %sub_calltmp3_.i75.us, %tid.i.i73.us
%add_xi_load52_calltmp55.us = add i32 %xi.0112.us, %mul_yi_load50_width_load51
%add_mul_yi_load50_width_load51_add_xi_load52_calltmp55.us = add i32 %add_xi_load52_calltmp55.us, %bitop.i76.us
%6 = shl i32 %add_mul_yi_load50_width_load51_add_xi_load52_calltmp55.us, 2
%iptr__id.i.rhs.us = sext i32 %6 to i64
%iptr__id.i.us = add i64 %iptr__id.i.rhs.us, %output_load_ptr2int
%ptr__id.i.us = inttoptr i64 %iptr__id.i.us to i32*
%val__id.i.us = extractelement <1 x i32> %v1.i92.us, i32 0
store i32 %val__id.i.us, i32* %ptr__id.i.us, align 4
br label %if_exit.us
if_exit.us: ; preds = %if_then.us, %mandel___vyfvyfvyi.exit.us
; Column loop step: xi += warpsize; continue while xi < xend.
%tid.i.i.us = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() #1
%add_xi_load70_calltmp68.us = add i32 %tid.i.i.us, %xi.0112.us
%less_xi_load_xend_load.us = icmp slt i32 %add_xi_load70_calltmp68.us, %r.i.i
br i1 %less_xi_load_xend_load.us, label %for_loop.i.lr.ph.us, label %for_exit37
for_loop.i.us: ; preds = %for_loop.i.lr.ph.us, %for_step.i.us
; Escape-time iteration body. Loop-carried state:
;   %less_i_load_count_load.i110.us   i < maxIterations
;   %internal_mask_memory.0.i109.us   lane still active
;   %break_lanes_memory.0.i108.us     lane has taken the break
;   %v1.i9096107.us / %v1.i8898106.us z_re / z_im
;   %v1.i9299105.us                   iteration counter i
%less_i_load_count_load.i110.us = phi <1 x i1> [ %less_i_load_count_load.i102, %for_loop.i.lr.ph.us ], [ %less_i_load_count_load.i.us, %for_step.i.us ]
%internal_mask_memory.0.i109.us = phi <1 x i1> [ <i1 true>, %for_loop.i.lr.ph.us ], [ %internal_mask_memory.1.i.us, %for_step.i.us ]
%break_lanes_memory.0.i108.us = phi <1 x i1> [ zeroinitializer, %for_loop.i.lr.ph.us ], [ %"mask|break_mask.i.us", %for_step.i.us ]
%v1.i9096107.us = phi <1 x float> [ %add_x0_load_mul_add_xi_load42_calltmp45_to_float_dx_load_broadcast_init.us, %for_loop.i.lr.ph.us ], [ %v1.i9095.us, %for_step.i.us ]
%v1.i8898106.us = phi <1 x float> [ %add_y0_load_mul_yi_load46_to_float_dy_load_broadcast_init, %for_loop.i.lr.ph.us ], [ %v1.i8897.us, %for_step.i.us ]
%v1.i9299105.us = phi <1 x i32> [ zeroinitializer, %for_loop.i.lr.ph.us ], [ %v1.i92.us, %for_step.i.us ]
%"oldMask&test.i.us" = and <1 x i1> %internal_mask_memory.0.i109.us, %less_i_load_count_load.i110.us
; Break test: z_re^2 + z_im^2 > 4. "ugt" is unordered-or-greater, so a NaN
; sum also counts as a break.
%mul_z_re_load_z_re_load13.i.us = fmul <1 x float> %v1.i9096107.us, %v1.i9096107.us
%mul_z_im_load_z_im_load14.i.us = fmul <1 x float> %v1.i8898106.us, %v1.i8898106.us
%add_mul_z_re_load_z_re_load13_mul_z_im_load_z_im_load14.i.us = fadd <1 x float> %mul_z_im_load_z_im_load14.i.us, %mul_z_re_load_z_re_load13.i.us
%greater_add_mul_z_re_load_z_re_load13_mul_z_im_load_z_im_load14_.i.us = fcmp ugt <1 x float> %add_mul_z_re_load_z_re_load13_mul_z_im_load_z_im_load14.i.us, <float 4.000000e+00>
%"oldMask&test16.i.us" = and <1 x i1> %"oldMask&test.i.us", %greater_add_mul_z_re_load_z_re_load13_mul_z_im_load_z_im_load14_.i.us
%"mask|break_mask.i.us" = or <1 x i1> %"oldMask&test16.i.us", %break_lanes_memory.0.i108.us
; The xor is true when the break mask and the active mask differ; only then
; is the z update executed.
%v.i2.i.us = extractelement <1 x i1> %"mask|break_mask.i.us", i32 0
%v.i1.i.us = extractelement <1 x i1> %"oldMask&test.i.us", i32 0
%"equal_finished&func_internal_mask&function_mask12.itmp.us" = xor i1 %v.i2.i.us, %v.i1.i.us
br i1 %"equal_finished&func_internal_mask&function_mask12.itmp.us", label %not_all_continued_or_breaked.i.us, label %for_step.i.us
not_all_continued_or_breaked.i.us: ; preds = %for_loop.i.us
; z update: z_re' = c_re + (z_re^2 - z_im^2); z_im' = c_im + 2 * z_re * z_im.
; The new active mask is old-active AND NOT break.
%"!(break|continue)_lanes.i.us" = xor <1 x i1> %"mask|break_mask.i.us", <i1 true>
%new_mask28.i.us = and <1 x i1> %"oldMask&test.i.us", %"!(break|continue)_lanes.i.us"
%sub_mul_z_re_load31_z_re_load32_mul_z_im_load33_z_im_load34.i.us = fsub <1 x float> %mul_z_re_load_z_re_load13.i.us, %mul_z_im_load_z_im_load14.i.us
%mul__z_re_load35.i.us = fmul <1 x float> %v1.i9096107.us, <float 2.000000e+00>
%mul_mul__z_re_load35_z_im_load36.i.us = fmul <1 x float> %v1.i8898106.us, %mul__z_re_load35.i.us
%add_c_re_load42_new_re_load.i.us = fadd <1 x float> %add_x0_load_mul_add_xi_load42_calltmp45_to_float_dx_load_broadcast_init.us, %sub_mul_z_re_load31_z_re_load32_mul_z_im_load33_z_im_load34.i.us
%add_c_im_load44_new_im_load.i.us = fadd <1 x float> %add_y0_load_mul_yi_load46_to_float_dy_load_broadcast_init, %mul_mul__z_re_load35_z_im_load36.i.us
br label %for_step.i.us
for_step.i.us: ; preds = %not_all_continued_or_breaked.i.us, %for_loop.i.us
; Loop step: i is incremented only when the lane is still active; iterate
; again while the lane is active and i < maxIterations.
%v1.i8897.us = phi <1 x float> [ %v1.i8898106.us, %for_loop.i.us ], [ %add_c_im_load44_new_im_load.i.us, %not_all_continued_or_breaked.i.us ]
%v1.i9095.us = phi <1 x float> [ %v1.i9096107.us, %for_loop.i.us ], [ %add_c_re_load42_new_re_load.i.us, %not_all_continued_or_breaked.i.us ]
%internal_mask_memory.1.i.us = phi <1 x i1> [ zeroinitializer, %for_loop.i.us ], [ %new_mask28.i.us, %not_all_continued_or_breaked.i.us ]
%i_load53_plus1.i.us = add <1 x i32> %v1.i9299105.us, <i32 1>
%v1.i92.us = select <1 x i1> %internal_mask_memory.1.i.us, <1 x i32> %i_load53_plus1.i.us, <1 x i32> %v1.i9299105.us
%less_i_load_count_load.i.us = icmp slt <1 x i32> %v1.i92.us, %maxIterations_load_broadcast_init
%"internal_mask&function_mask10.i.us" = and <1 x i1> %internal_mask_memory.1.i.us, %less_i_load_count_load.i.us
%v.i.i.us = extractelement <1 x i1> %"internal_mask&function_mask10.i.us", i32 0
br i1 %v.i.i.us, label %for_loop.i.us, label %mandel___vyfvyfvyi.exit.us
for_loop.i.lr.ph.us: ; preds = %if_exit.us, %for_loop36.lr.ph
; Column loop header (maxIterations > 0 copy): %xi.0112.us is the stride
; base; c_re = x0 + float(xi + (tid.x & (warpsize - 1))) * dx.
%xi.0112.us = phi i32 [ %add_xi_load70_calltmp68.us, %if_exit.us ], [ %mul_calltmp_xspan_load, %for_loop36.lr.ph ]
%tid.i.i81.us = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
%tid.i.i.i82.us = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() #1
%sub_calltmp3_.i83.us = add i32 %tid.i.i.i82.us, -1
%bitop.i84.us = and i32 %sub_calltmp3_.i83.us, %tid.i.i81.us
%add_xi_load42_calltmp45.us = add i32 %bitop.i84.us, %xi.0112.us
%add_xi_load42_calltmp45_to_float.us = sitofp i32 %add_xi_load42_calltmp45.us to float
%mul_add_xi_load42_calltmp45_to_float_dx_load.us = fmul float %add_xi_load42_calltmp45_to_float.us, %dx
%add_x0_load_mul_add_xi_load42_calltmp45_to_float_dx_load.us = fadd float %mul_add_xi_load42_calltmp45_to_float_dx_load.us, %x0
%add_x0_load_mul_add_xi_load42_calltmp45_to_float_dx_load_broadcast_init.us = insertelement <1 x float> undef, float %add_x0_load_mul_add_xi_load42_calltmp45_to_float_dx_load.us, i32 0
br label %for_loop.i.us
for_exit: ; preds = %for_exit37, %for_test.preheader
ret void
mandel___vyfvyfvyi.exit: ; preds = %if_exit, %for_loop36.lr.ph
; Column loop header of the maxIterations <= 0 copy: no iteration is run,
; only the in-tile bounds check before the store of 0.
%xi.0112 = phi i32 [ %add_xi_load70_calltmp68, %if_exit ], [ %mul_calltmp_xspan_load, %for_loop36.lr.ph ]
%tid.i.i72 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
%tid.i.i.i = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() #1
%sub_calltmp3_.i = add i32 %tid.i.i.i, -1
%bitop.i = and i32 %sub_calltmp3_.i, %tid.i.i72
%add_xi_load56_calltmp59 = add i32 %bitop.i, %xi.0112
%less_add_xi_load56_calltmp59_xend_load60 = icmp slt i32 %add_xi_load56_calltmp59, %r.i.i
br i1 %less_add_xi_load56_calltmp59_xend_load60, label %if_then, label %if_exit
for_exit37: ; preds = %if_exit, %if_exit.us, %for_test34.preheader
; Row loop step: yi++; leave when yi reaches the precomputed bound %5.
%yi_load71_plus1 = add i32 %yi.0114, 1
%exitcond = icmp eq i32 %yi_load71_plus1, %5
br i1 %exitcond, label %for_exit, label %for_test34.preheader
if_then: ; preds = %mandel___vyfvyfvyi.exit
; Same address computation as if_then.us, but the stored value is the
; constant 0.
%tid.i.i.i74 = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() #1
%sub_calltmp3_.i75 = add i32 %tid.i.i.i74, 1073741823
%tid.i.i73 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
%bitop.i76 = and i32 %sub_calltmp3_.i75, %tid.i.i73
%add_xi_load52_calltmp55 = add i32 %xi.0112, %mul_yi_load50_width_load51
%add_mul_yi_load50_width_load51_add_xi_load52_calltmp55 = add i32 %add_xi_load52_calltmp55, %bitop.i76
%7 = shl i32 %add_mul_yi_load50_width_load51_add_xi_load52_calltmp55, 2
%iptr__id.i.rhs = sext i32 %7 to i64
%iptr__id.i = add i64 %iptr__id.i.rhs, %output_load_ptr2int
%ptr__id.i = inttoptr i64 %iptr__id.i to i32*
store i32 0, i32* %ptr__id.i, align 4
br label %if_exit
if_exit: ; preds = %if_then, %mandel___vyfvyfvyi.exit
; Column loop step of the maxIterations <= 0 copy.
%tid.i.i = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() #1
%add_xi_load70_calltmp68 = add i32 %tid.i.i, %xi.0112
%less_xi_load_xend_load = icmp slt i32 %add_xi_load70_calltmp68, %r.i.i
br i1 %less_xi_load_xend_load, label %mandel___vyfvyfvyi.exit, label %for_exit37
}
; Attribute groups referenced by the declarations, the kernel definition and
; its call sites (#3 is the kernel itself, built for sm_35).
attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }
attributes #2 = { noreturn }
attributes #3 = { nounwind "target-features"="+sm_35" }
attributes #4 = { noreturn nounwind }
; NVVM annotation tagging @mandelbrot_scanline as a "kernel"; the PTX listing
; for this module emits it as an .entry.
!nvvm.annotations = !{!0}
!0 = metadata !{void (float, float, float, float, i32, i32, i32, i32, i32, i32*)* @mandelbrot_scanline, metadata !"kernel", i32 1}

View File

@@ -1,229 +0,0 @@
//
// Generated by LLVM NVPTX Back-End
//
.version 3.1
.target sm_35, texmode_independent
.address_size 64
// .globl mandelbrot_scanline
// Forward declaration of puts: one 64-bit parameter, 32-bit return value.
// No body is provided in this module.
.func (.param .b32 func_retval0) puts
(
.param .b64 puts_param_0
)
;
// Forward declaration of abort: no parameters, no return value, no body in
// this module.
.func abort
(
)
;
// 66 bytes of ASCII followed by NUL:
// "mandelbrot_task.ispc:55:3: Assertion failed: xspan >= vectorWidth"
.global .align 1 .b8 __str[66] = {109, 97, 110, 100, 101, 108, 98, 114, 111, 116, 95, 116, 97, 115, 107, 46, 105, 115, 112, 99, 58, 53, 53, 58, 51, 58, 32, 65, 115, 115, 101, 114, 116, 105, 111, 110, 32, 102, 97, 105, 108, 101, 100, 58, 32, 120, 115, 112, 97, 110, 32, 62, 61, 32, 118, 101, 99, 116, 111, 114, 87, 105, 100, 116, 104, 0};
// @mandelbrot_scanline
// mandelbrot_scanline kernel entry (variant with the run-time assertion that
// param_6 >= WARP_SZ).
// Parameters in order; names are taken from the LLVM IR definition of the
// same kernel:
//   0..3  f32  x0, dx, y0, dy
//   4..8  u32  width, height, xspan, yspan, maxIterations
//   9     u64  output (pointer, 4-byte aligned)
// Each thread block covers columns [ctaid.x*xspan, min(.. + xspan, width))
// and rows [ctaid.y*yspan, min(.. + yspan, height)); columns are walked in
// strides of WARP_SZ with lane offset tid.x & (WARP_SZ - 1).
.entry mandelbrot_scanline(
.param .f32 mandelbrot_scanline_param_0,
.param .f32 mandelbrot_scanline_param_1,
.param .f32 mandelbrot_scanline_param_2,
.param .f32 mandelbrot_scanline_param_3,
.param .u32 mandelbrot_scanline_param_4,
.param .u32 mandelbrot_scanline_param_5,
.param .u32 mandelbrot_scanline_param_6,
.param .u32 mandelbrot_scanline_param_7,
.param .u32 mandelbrot_scanline_param_8,
.param .u64 .ptr .align 4 mandelbrot_scanline_param_9
)
{
// Virtual register banks; only a handful of each are referenced below.
.reg .pred %p<396>;
.reg .s16 %rc<396>;
.reg .s16 %rs<396>;
.reg .s32 %r<396>;
.reg .s64 %rl<396>;
.reg .f32 %f<396>;
.reg .f64 %fl<396>;
// BB#0: // %allocas
// %r3 = xspan, %r0 = WARP_SZ; assertion fails when WARP_SZ > xspan.
ld.param.u32 %r3, [mandelbrot_scanline_param_6];
mov.u32 %r0, WARP_SZ;
setp.gt.s32 %p0, %r0, %r3;
@%p0 bra BB0_18;
// BB#1: // %for_test.preheader
// %r1 = ystart = ctaid.y * yspan; %r2 = min(ystart + yspan, height).
// Return immediately when the row range is empty.
ld.param.u32 %r7, [mandelbrot_scanline_param_5];
ld.param.u32 %r6, [mandelbrot_scanline_param_7];
mov.u32 %r8, %ctaid.y;
mul.lo.s32 %r1, %r8, %r6;
mad.lo.s32 %r2, %r8, %r6, %r6;
setp.lt.s32 %p0, %r2, %r7;
selp.b32 %r2, %r2, %r7, %p0;
setp.ge.s32 %p0, %r1, %r2;
@%p0 bra BB0_14;
// BB#2: // %for_test34.preheader.lr.ph
// Loop-invariant setup:
//   %f0..%f3 = x0, dx, y0, dy
//   %r2 = xstart = ctaid.x * xspan
//   %r3 = width (register reused; xspan is no longer needed)
//   %r4 = xend = min(xstart + xspan, width)
//   %r5 = maxIterations, %rl0 = output, %p0 = (maxIterations > 0)
//   %r6 = ~max(~height, ~((ctaid.y + 1) * yspan)) = row-loop exit bound
ld.param.f32 %f0, [mandelbrot_scanline_param_0];
ld.param.f32 %f1, [mandelbrot_scanline_param_1];
ld.param.f32 %f2, [mandelbrot_scanline_param_2];
mov.u32 %r4, %ctaid.x;
mul.lo.s32 %r2, %r4, %r3;
ld.param.f32 %f3, [mandelbrot_scanline_param_3];
mad.lo.s32 %r4, %r4, %r3, %r3;
ld.param.u32 %r3, [mandelbrot_scanline_param_4];
setp.lt.s32 %p0, %r4, %r3;
selp.b32 %r4, %r4, %r3, %p0;
ld.param.u32 %r5, [mandelbrot_scanline_param_8];
ld.param.u64 %rl0, [mandelbrot_scanline_param_9];
setp.gt.s32 %p0, %r5, 0;
not.b32 %r7, %r7;
add.s32 %r8, %r8, 1;
mul.lo.s32 %r6, %r8, %r6;
not.b32 %r6, %r6;
setp.gt.s32 %p1, %r7, %r6;
selp.b32 %r6, %r7, %r6, %p1;
not.b32 %r6, %r6;
BB0_3: // %for_test34.preheader
// =>This Loop Header: Depth=1
// Child Loop BB0_16 Depth 2
// Child Loop BB0_9 Depth 2
// Child Loop BB0_12 Depth 3
// Row loop (%r1 = current row). Skip the row when the column range is empty.
setp.ge.s32 %p1, %r2, %r4;
@%p1 bra BB0_13;
// BB#4: // %for_loop36.lr.ph
// in Loop: Header=BB0_3 Depth=1
// %r7 = row * width; %r8 = column stride base, starting at xstart.
// maxIterations > 0 takes BB0_5/BB0_9; otherwise BB0_16 stores zeros.
mul.lo.s32 %r7, %r1, %r3;
mov.u32 %r8, %r2;
@%p0 bra BB0_5;
bra.uni BB0_16;
BB0_5: // in Loop: Header=BB0_3 Depth=1
// %f4 = c_im = float(row) * dy + y0
cvt.rn.f32.s32 %f4, %r1;
fma.rn.f32 %f4, %f4, %f3, %f2;
mov.u32 %r8, %r2;
BB0_9: // %for_loop.i.lr.ph.us
// Parent Loop BB0_3 Depth=1
// => This Loop Header: Depth=2
// Child Loop BB0_12 Depth 3
// Column loop: %r11 = column = stride base + (tid.x & (WARP_SZ - 1));
// %f5 = c_re = float(column) * dx + x0.
// Iteration state: %r10 = i (0), %p3 = lane active (true), %p4 = i < max,
// %p2 = break taken (false), %f7 = z_re, %f6 = z_im. %p1 holds constant false.
mov.u32 %r9, %tid.x;
add.s32 %r10, %r0, -1;
and.b32 %r10, %r10, %r9;
add.s32 %r11, %r10, %r8;
cvt.rn.f32.s32 %f5, %r11;
fma.rn.f32 %f5, %f5, %f1, %f0;
mov.u32 %r10, 0;
mov.pred %p1, 0;
mov.pred %p3, -1;
mov.pred %p4, %p0;
mov.pred %p2, %p1;
mov.f32 %f7, %f5;
mov.f32 %f6, %f4;
BB0_12: // %for_loop.i.us
// Parent Loop BB0_3 Depth=1
// Parent Loop BB0_9 Depth=2
// => This Inner Loop Header: Depth=3
// %p4 = active && (i < max); %f9 = z_im^2 + z_re^2; break when %f9 > 4
// (gtu is unordered-or-greater, so NaN also breaks). %p3 is reset to false
// and only set again by the z update in BB0_10.
and.pred %p4, %p3, %p4;
mul.f32 %f8, %f7, %f7;
fma.rn.f32 %f9, %f6, %f6, %f8;
setp.gtu.f32 %p3, %f9, 0f40800000;
and.pred %p3, %p4, %p3;
or.pred %p2, %p3, %p2;
xor.pred %p5, %p2, %p4;
mov.pred %p3, %p1;
@!%p5 bra BB0_11;
bra.uni BB0_10;
BB0_10: // %not_all_continued_or_breaked.i.us
// in Loop: Header=BB0_12 Depth=3
// z update: z_re' = c_re + (z_re^2 - z_im^2); z_im' = z_im * (2 * z_re) + c_im.
// %p3 = still active = %p4 && !break.
mul.f32 %f9, %f6, %f6;
not.pred %p3, %p2;
and.pred %p3, %p4, %p3;
sub.f32 %f8, %f8, %f9;
add.f32 %f8, %f5, %f8;
add.f32 %f7, %f7, %f7;
fma.rn.f32 %f6, %f6, %f7, %f4;
mov.f32 %f7, %f8;
BB0_11: // %for_step.i.us
// in Loop: Header=BB0_12 Depth=3
// i = active ? i + 1 : i; iterate again while active && i < maxIterations.
add.s32 %r12, %r10, 1;
selp.b32 %r10, %r12, %r10, %p3;
setp.lt.s32 %p4, %r10, %r5;
and.pred %p5, %p3, %p4;
@%p5 bra BB0_12;
// BB#6: // %mandel___vyfvyfvyi.exit.us
// in Loop: Header=BB0_9 Depth=2
// Skip the store for lanes whose column lies at or beyond xend.
setp.ge.s32 %p1, %r11, %r4;
@%p1 bra BB0_8;
// BB#7: // %if_then.us
// in Loop: Header=BB0_9 Depth=2
// Store i at output + sext32(((stride base + row*width) + lane) << 2).
// 1073741823 = 0x3FFFFFFF: WARP_SZ + 0x3FFFFFFF matches WARP_SZ - 1 in the
// low 30 bits, which are all that remain after the shift by 2.
add.s32 %r11, %r0, 1073741823;
and.b32 %r9, %r11, %r9;
add.s32 %r11, %r8, %r7;
add.s32 %r9, %r11, %r9;
shl.b32 %r9, %r9, 2;
cvt.s64.s32 %rl1, %r9;
add.s64 %rl1, %rl1, %rl0;
st.u32 [%rl1], %r10;
BB0_8: // %if_exit.us
// in Loop: Header=BB0_9 Depth=2
// Advance the stride base by WARP_SZ; continue while it is below xend.
add.s32 %r8, %r0, %r8;
setp.lt.s32 %p1, %r8, %r4;
@%p1 bra BB0_9;
bra.uni BB0_13;
BB0_16: // %mandel___vyfvyfvyi.exit
// Parent Loop BB0_3 Depth=1
// => This Inner Loop Header: Depth=2
// Column loop for maxIterations <= 0: same bounds check, stores 0.
mov.u32 %r9, %tid.x;
add.s32 %r10, %r0, -1;
and.b32 %r10, %r10, %r9;
add.s32 %r10, %r10, %r8;
setp.lt.s32 %p1, %r10, %r4;
@%p1 bra BB0_17;
bra.uni BB0_15;
BB0_17: // %if_then
// in Loop: Header=BB0_16 Depth=2
add.s32 %r10, %r0, 1073741823;
and.b32 %r9, %r10, %r9;
add.s32 %r10, %r8, %r7;
add.s32 %r9, %r10, %r9;
shl.b32 %r9, %r9, 2;
cvt.s64.s32 %rl1, %r9;
add.s64 %rl1, %rl1, %rl0;
mov.u32 %r9, 0;
st.u32 [%rl1], %r9;
BB0_15: // %if_exit
// in Loop: Header=BB0_16 Depth=2
add.s32 %r8, %r0, %r8;
setp.lt.s32 %p1, %r8, %r4;
@%p1 bra BB0_16;
BB0_13: // %for_exit37
// in Loop: Header=BB0_3 Depth=1
// Next row; finished when the row index equals the bound in %r6.
add.s32 %r1, %r1, 1;
setp.eq.s32 %p1, %r1, %r6;
@%p1 bra BB0_14;
bra.uni BB0_3;
BB0_14: // %for_exit
ret;
BB0_18: // %fail.i
// Assertion failure: call puts with the generic address of __str, then
// abort. There is no ret after the abort call; control is expected not to
// come back.
mov.u64 %rl0, __str;
cvta.global.u64 %rl0, %rl0;
// Callseq Start 0
{
.reg .b32 temp_param_reg;
// <end>}
.param .b64 param0;
st.param.b64 [param0+0], %rl0;
.param .b32 retval0;
call.uni (retval0),
puts,
(
param0
);
ld.param.b32 %r0, [retval0+0];
//{
}// Callseq End 0
// Callseq Start 1
{
.reg .b32 temp_param_reg;
// <end>}
call.uni
abort,
(
);
//{
}// Callseq End 1
}

View File

@@ -1,320 +0,0 @@
//
// Generated by LLVM NVPTX Back-End
//
.version 3.1
.target sm_35, texmode_independent
.address_size 64
// .globl __aos_to_soa4_float1
// @__aos_to_soa4_float1
.func __aos_to_soa4_float1(
	.param .align 4 .b8 __aos_to_soa4_float1_param_0[4],
	.param .align 4 .b8 __aos_to_soa4_float1_param_1[4],
	.param .align 4 .b8 __aos_to_soa4_float1_param_2[4],
	.param .align 4 .b8 __aos_to_soa4_float1_param_3[4],
	.param .b64 __aos_to_soa4_float1_param_4,
	.param .b64 __aos_to_soa4_float1_param_5,
	.param .b64 __aos_to_soa4_float1_param_6,
	.param .b64 __aos_to_soa4_float1_param_7
)	// @__aos_to_soa4_float1
{
	// Copies the four f32 inputs (param_0..3) to the four destination
	// addresses (param_4..7), one value per destination, in index order.
	// Only the registers actually used are declared.
	.reg .f32 %val<4>;
	.reg .s64 %dst<4>;

	// element 0
	ld.param.f32 %val0, [__aos_to_soa4_float1_param_0];
	ld.param.u64 %dst0, [__aos_to_soa4_float1_param_4];
	st.f32 [%dst0], %val0;

	// element 1
	ld.param.f32 %val1, [__aos_to_soa4_float1_param_1];
	ld.param.u64 %dst1, [__aos_to_soa4_float1_param_5];
	st.f32 [%dst1], %val1;

	// element 2
	ld.param.f32 %val2, [__aos_to_soa4_float1_param_2];
	ld.param.u64 %dst2, [__aos_to_soa4_float1_param_6];
	st.f32 [%dst2], %val2;

	// element 3
	ld.param.f32 %val3, [__aos_to_soa4_float1_param_3];
	ld.param.u64 %dst3, [__aos_to_soa4_float1_param_7];
	st.f32 [%dst3], %val3;

	ret;
}
// .globl __soa_to_aos4_float1
.func __soa_to_aos4_float1(
	.param .align 4 .b8 __soa_to_aos4_float1_param_0[4],
	.param .align 4 .b8 __soa_to_aos4_float1_param_1[4],
	.param .align 4 .b8 __soa_to_aos4_float1_param_2[4],
	.param .align 4 .b8 __soa_to_aos4_float1_param_3[4],
	.param .b64 __soa_to_aos4_float1_param_4,
	.param .b64 __soa_to_aos4_float1_param_5,
	.param .b64 __soa_to_aos4_float1_param_6,
	.param .b64 __soa_to_aos4_float1_param_7
)	// @__soa_to_aos4_float1
{
	// Copies the four f32 inputs (param_0..3) to the four destination
	// addresses (param_4..7), one value per destination, in index order.
	// Only the registers actually used are declared.
	.reg .f32 %val<4>;
	.reg .s64 %dst<4>;

	// element 0
	ld.param.f32 %val0, [__soa_to_aos4_float1_param_0];
	ld.param.u64 %dst0, [__soa_to_aos4_float1_param_4];
	st.f32 [%dst0], %val0;

	// element 1
	ld.param.f32 %val1, [__soa_to_aos4_float1_param_1];
	ld.param.u64 %dst1, [__soa_to_aos4_float1_param_5];
	st.f32 [%dst1], %val1;

	// element 2
	ld.param.f32 %val2, [__soa_to_aos4_float1_param_2];
	ld.param.u64 %dst2, [__soa_to_aos4_float1_param_6];
	st.f32 [%dst2], %val2;

	// element 3
	ld.param.f32 %val3, [__soa_to_aos4_float1_param_3];
	ld.param.u64 %dst3, [__soa_to_aos4_float1_param_7];
	st.f32 [%dst3], %val3;

	ret;
}
// .globl __aos_to_soa3_float1
.func __aos_to_soa3_float1(
	.param .align 4 .b8 __aos_to_soa3_float1_param_0[4],
	.param .align 4 .b8 __aos_to_soa3_float1_param_1[4],
	.param .align 4 .b8 __aos_to_soa3_float1_param_2[4],
	.param .b64 __aos_to_soa3_float1_param_3,
	.param .b64 __aos_to_soa3_float1_param_4,
	.param .b64 __aos_to_soa3_float1_param_5
)	// @__aos_to_soa3_float1
{
	// Copies the three f32 inputs (param_0..2) to the three destination
	// addresses (param_3..5), one value per destination, in index order.
	// Only the registers actually used are declared.
	.reg .f32 %val<3>;
	.reg .s64 %dst<3>;

	// element 0
	ld.param.f32 %val0, [__aos_to_soa3_float1_param_0];
	ld.param.u64 %dst0, [__aos_to_soa3_float1_param_3];
	st.f32 [%dst0], %val0;

	// element 1
	ld.param.f32 %val1, [__aos_to_soa3_float1_param_1];
	ld.param.u64 %dst1, [__aos_to_soa3_float1_param_4];
	st.f32 [%dst1], %val1;

	// element 2
	ld.param.f32 %val2, [__aos_to_soa3_float1_param_2];
	ld.param.u64 %dst2, [__aos_to_soa3_float1_param_5];
	st.f32 [%dst2], %val2;

	ret;
}
// .globl __soa_to_aos3_float1
.func __soa_to_aos3_float1(
	.param .align 4 .b8 __soa_to_aos3_float1_param_0[4],
	.param .align 4 .b8 __soa_to_aos3_float1_param_1[4],
	.param .align 4 .b8 __soa_to_aos3_float1_param_2[4],
	.param .b64 __soa_to_aos3_float1_param_3,
	.param .b64 __soa_to_aos3_float1_param_4,
	.param .b64 __soa_to_aos3_float1_param_5
)	// @__soa_to_aos3_float1
{
	// Copies the three f32 inputs (param_0..2) to the three destination
	// addresses (param_3..5), one value per destination, in index order.
	// Only the registers actually used are declared.
	.reg .f32 %val<3>;
	.reg .s64 %dst<3>;

	// element 0
	ld.param.f32 %val0, [__soa_to_aos3_float1_param_0];
	ld.param.u64 %dst0, [__soa_to_aos3_float1_param_3];
	st.f32 [%dst0], %val0;

	// element 1
	ld.param.f32 %val1, [__soa_to_aos3_float1_param_1];
	ld.param.u64 %dst1, [__soa_to_aos3_float1_param_4];
	st.f32 [%dst1], %val1;

	// element 2
	ld.param.f32 %val2, [__soa_to_aos3_float1_param_2];
	ld.param.u64 %dst2, [__soa_to_aos3_float1_param_5];
	st.f32 [%dst2], %val2;

	ret;
}
// .globl mandelbrot_scanline
// mandelbrot_scanline kernel entry (variant without the run-time assertion;
// WARP_SZ is re-read inside the column loops).
// Parameters in order -- NOTE(review): names are inferred from how each value
// is used below; confirm against the ISPC source:
//   0..3  f32  x0, dx, y0, dy
//   4..8  u32  width, height, xspan, yspan, maxIterations
//   9     u64  output (pointer, 4-byte aligned)
// Each thread block covers columns [ctaid.x*xspan, min(.. + xspan, width))
// and rows [ctaid.y*yspan, min(.. + yspan, height)); columns are walked in
// strides of WARP_SZ with lane offset tid.x & (WARP_SZ - 1).
.entry mandelbrot_scanline(
.param .f32 mandelbrot_scanline_param_0,
.param .f32 mandelbrot_scanline_param_1,
.param .f32 mandelbrot_scanline_param_2,
.param .f32 mandelbrot_scanline_param_3,
.param .u32 mandelbrot_scanline_param_4,
.param .u32 mandelbrot_scanline_param_5,
.param .u32 mandelbrot_scanline_param_6,
.param .u32 mandelbrot_scanline_param_7,
.param .u32 mandelbrot_scanline_param_8,
.param .u64 .ptr .align 4 mandelbrot_scanline_param_9
) // @mandelbrot_scanline
{
// Virtual register banks; only a handful of each are referenced below.
.reg .pred %p<396>;
.reg .s16 %rc<396>;
.reg .s16 %rs<396>;
.reg .s32 %r<396>;
.reg .s64 %rl<396>;
.reg .f32 %f<396>;
.reg .f64 %fl<396>;
// BB#0: // %allocas
// %r0 = ystart = ctaid.y * param_7; %r1 = min(ystart + param_7, param_5).
// Return immediately when the row range is empty.
ld.param.u32 %r6, [mandelbrot_scanline_param_5];
ld.param.u32 %r5, [mandelbrot_scanline_param_7];
mov.u32 %r7, %ctaid.y;
mul.lo.s32 %r0, %r7, %r5;
mad.lo.s32 %r1, %r7, %r5, %r5;
setp.lt.s32 %p0, %r1, %r6;
selp.b32 %r1, %r1, %r6, %p0;
setp.ge.s32 %p0, %r0, %r1;
@%p0 bra BB4_13;
// BB#1: // %for_test28.preheader.lr.ph
// Loop-invariant setup:
//   %f0..%f3 = param_0..3 (x0, dx, y0, dy)
//   %r1 = param_4 (width), %r2 = param_8 (maxIterations), %rl0 = param_9
//   %r3 = xstart = ctaid.x * param_6
//   %r4 = xend = min(xstart + param_6, width)
//   %p0 = (maxIterations > 0)
//   %r5 = ~max(~param_5, ~((ctaid.y + 1) * param_7)) = row-loop exit bound
ld.param.f32 %f0, [mandelbrot_scanline_param_0];
ld.param.f32 %f1, [mandelbrot_scanline_param_1];
ld.param.f32 %f2, [mandelbrot_scanline_param_2];
ld.param.f32 %f3, [mandelbrot_scanline_param_3];
ld.param.u32 %r1, [mandelbrot_scanline_param_4];
ld.param.u32 %r4, [mandelbrot_scanline_param_6];
ld.param.u32 %r2, [mandelbrot_scanline_param_8];
ld.param.u64 %rl0, [mandelbrot_scanline_param_9];
mov.u32 %r8, %ctaid.x;
mul.lo.s32 %r3, %r8, %r4;
mad.lo.s32 %r4, %r8, %r4, %r4;
setp.lt.s32 %p0, %r4, %r1;
selp.b32 %r4, %r4, %r1, %p0;
setp.gt.s32 %p0, %r2, 0;
not.b32 %r6, %r6;
add.s32 %r7, %r7, 1;
mul.lo.s32 %r5, %r7, %r5;
not.b32 %r5, %r5;
setp.gt.s32 %p1, %r6, %r5;
selp.b32 %r5, %r6, %r5, %p1;
not.b32 %r5, %r5;
BB4_2: // %for_test28.preheader
// =>This Loop Header: Depth=1
// Child Loop BB4_15 Depth 2
// Child Loop BB4_8 Depth 2
// Child Loop BB4_11 Depth 3
// Row loop (%r0 = current row). Skip the row when the column range is empty.
setp.ge.s32 %p1, %r3, %r4;
@%p1 bra BB4_12;
// BB#3: // %for_loop30.lr.ph
// in Loop: Header=BB4_2 Depth=1
// %r6 = row * width; %r7 = column stride base, starting at xstart.
// maxIterations > 0 takes BB4_4/BB4_8; otherwise BB4_15 stores zeros.
mul.lo.s32 %r6, %r0, %r1;
mov.u32 %r7, %r3;
@%p0 bra BB4_4;
bra.uni BB4_15;
BB4_4: // in Loop: Header=BB4_2 Depth=1
// %f4 = c_im = float(row) * dy + y0
cvt.rn.f32.s32 %f4, %r0;
fma.rn.f32 %f4, %f4, %f3, %f2;
mov.u32 %r7, %r3;
BB4_8: // %for_loop.i.lr.ph.us
// Parent Loop BB4_2 Depth=1
// => This Loop Header: Depth=2
// Child Loop BB4_11 Depth 3
// Column loop: %r8 = WARP_SZ; %r11 = column = stride base + (tid.x & (WARP_SZ - 1));
// %f5 = c_re = float(column) * dx + x0.
// Iteration state: %r10 = i (0), %p3 = lane active (true), %p4 = i < max,
// %p2 = break taken (false), %f7 = z_re, %f6 = z_im. %p1 holds constant false.
mov.u32 %r9, %tid.x;
mov.u32 %r8, WARP_SZ;
add.s32 %r10, %r8, -1;
and.b32 %r10, %r10, %r9;
add.s32 %r11, %r10, %r7;
cvt.rn.f32.s32 %f5, %r11;
fma.rn.f32 %f5, %f5, %f1, %f0;
mov.u32 %r10, 0;
mov.pred %p1, 0;
mov.pred %p3, -1;
mov.pred %p4, %p0;
mov.pred %p2, %p1;
mov.f32 %f7, %f5;
mov.f32 %f6, %f4;
BB4_11: // %for_loop.i.us
// Parent Loop BB4_2 Depth=1
// Parent Loop BB4_8 Depth=2
// => This Inner Loop Header: Depth=3
// %p4 = active && (i < max); %f9 = z_im^2 + z_re^2; break when %f9 > 4
// (gtu is unordered-or-greater, so NaN also breaks). %p3 is reset to false
// and only set again by the z update in BB4_9.
and.pred %p4, %p3, %p4;
mul.f32 %f8, %f7, %f7;
fma.rn.f32 %f9, %f6, %f6, %f8;
setp.gtu.f32 %p3, %f9, 0f40800000;
and.pred %p3, %p4, %p3;
or.pred %p2, %p3, %p2;
xor.pred %p5, %p2, %p4;
mov.pred %p3, %p1;
@!%p5 bra BB4_10;
bra.uni BB4_9;
BB4_9: // %not_all_continued_or_breaked.i.us
// in Loop: Header=BB4_11 Depth=3
// z update: z_re' = c_re + (z_re^2 - z_im^2); z_im' = z_im * (2 * z_re) + c_im.
// %p3 = still active = %p4 && !break.
mul.f32 %f9, %f6, %f6;
not.pred %p3, %p2;
and.pred %p3, %p4, %p3;
sub.f32 %f8, %f8, %f9;
add.f32 %f8, %f5, %f8;
add.f32 %f7, %f7, %f7;
fma.rn.f32 %f6, %f6, %f7, %f4;
mov.f32 %f7, %f8;
BB4_10: // %for_step.i.us
// in Loop: Header=BB4_11 Depth=3
// i = active ? i + 1 : i; iterate again while active && i < maxIterations.
add.s32 %r12, %r10, 1;
selp.b32 %r10, %r12, %r10, %p3;
setp.lt.s32 %p4, %r10, %r2;
and.pred %p5, %p3, %p4;
@%p5 bra BB4_11;
// BB#5: // %mandel___vyfvyfvyi.exit.us
// in Loop: Header=BB4_8 Depth=2
// Skip the store for lanes whose column lies at or beyond xend.
setp.ge.s32 %p1, %r11, %r4;
@%p1 bra BB4_7;
// BB#6: // %if_then.us
// in Loop: Header=BB4_8 Depth=2
// Store i at output + sext32(((stride base + row*width) + lane) << 2).
// 1073741823 = 0x3FFFFFFF: WARP_SZ + 0x3FFFFFFF matches WARP_SZ - 1 in the
// low 30 bits, which are all that remain after the shift by 2.
add.s32 %r11, %r8, 1073741823;
and.b32 %r9, %r11, %r9;
add.s32 %r11, %r7, %r6;
add.s32 %r9, %r11, %r9;
shl.b32 %r9, %r9, 2;
cvt.s64.s32 %rl1, %r9;
add.s64 %rl1, %rl1, %rl0;
st.u32 [%rl1], %r10;
BB4_7: // %if_exit.us
// in Loop: Header=BB4_8 Depth=2
// Advance the stride base by WARP_SZ; continue while it is below xend.
add.s32 %r7, %r8, %r7;
setp.lt.s32 %p1, %r7, %r4;
@%p1 bra BB4_8;
bra.uni BB4_12;
BB4_15: // %mandel___vyfvyfvyi.exit
// Parent Loop BB4_2 Depth=1
// => This Inner Loop Header: Depth=2
// Column loop for maxIterations <= 0: same bounds check, stores 0.
mov.u32 %r9, %tid.x;
mov.u32 %r8, WARP_SZ;
add.s32 %r10, %r8, -1;
and.b32 %r10, %r10, %r9;
add.s32 %r10, %r10, %r7;
setp.lt.s32 %p1, %r10, %r4;
@%p1 bra BB4_16;
bra.uni BB4_14;
BB4_16: // %if_then
// in Loop: Header=BB4_15 Depth=2
add.s32 %r10, %r8, 1073741823;
and.b32 %r9, %r10, %r9;
add.s32 %r10, %r7, %r6;
add.s32 %r9, %r10, %r9;
shl.b32 %r9, %r9, 2;
cvt.s64.s32 %rl1, %r9;
add.s64 %rl1, %rl1, %rl0;
mov.u32 %r9, 0;
st.u32 [%rl1], %r9;
BB4_14: // %if_exit
// in Loop: Header=BB4_15 Depth=2
add.s32 %r7, %r8, %r7;
setp.lt.s32 %p1, %r7, %r4;
@%p1 bra BB4_15;
BB4_12: // %for_exit31
// in Loop: Header=BB4_2 Depth=1
// Next row; finished when the row index equals the bound in %r5.
add.s32 %r0, %r0, 1;
setp.eq.s32 %p1, %r0, %r5;
@%p1 bra BB4_13;
bra.uni BB4_2;
BB4_13: // %for_exit
ret;
}

View File

@@ -1,320 +0,0 @@
//
// Generated by LLVM NVPTX Back-End
//
.version 3.1
.target sm_35, texmode_independent
.address_size 64
// .globl __aos_to_soa4_float1
// @__aos_to_soa4_float1
.func __aos_to_soa4_float1(
	.param .align 4 .b8 __aos_to_soa4_float1_param_0[4],
	.param .align 4 .b8 __aos_to_soa4_float1_param_1[4],
	.param .align 4 .b8 __aos_to_soa4_float1_param_2[4],
	.param .align 4 .b8 __aos_to_soa4_float1_param_3[4],
	.param .b64 __aos_to_soa4_float1_param_4,
	.param .b64 __aos_to_soa4_float1_param_5,
	.param .b64 __aos_to_soa4_float1_param_6,
	.param .b64 __aos_to_soa4_float1_param_7
)	// @__aos_to_soa4_float1
{
	// Copies the four f32 inputs (param_0..3) to the four destination
	// addresses (param_4..7), one value per destination, in index order.
	// Only the registers actually used are declared.
	.reg .f32 %val<4>;
	.reg .s64 %dst<4>;

	// element 0
	ld.param.f32 %val0, [__aos_to_soa4_float1_param_0];
	ld.param.u64 %dst0, [__aos_to_soa4_float1_param_4];
	st.f32 [%dst0], %val0;

	// element 1
	ld.param.f32 %val1, [__aos_to_soa4_float1_param_1];
	ld.param.u64 %dst1, [__aos_to_soa4_float1_param_5];
	st.f32 [%dst1], %val1;

	// element 2
	ld.param.f32 %val2, [__aos_to_soa4_float1_param_2];
	ld.param.u64 %dst2, [__aos_to_soa4_float1_param_6];
	st.f32 [%dst2], %val2;

	// element 3
	ld.param.f32 %val3, [__aos_to_soa4_float1_param_3];
	ld.param.u64 %dst3, [__aos_to_soa4_float1_param_7];
	st.f32 [%dst3], %val3;

	ret;
}
// .globl __soa_to_aos4_float1
.func __soa_to_aos4_float1(
	.param .align 4 .b8 __soa_to_aos4_float1_param_0[4],
	.param .align 4 .b8 __soa_to_aos4_float1_param_1[4],
	.param .align 4 .b8 __soa_to_aos4_float1_param_2[4],
	.param .align 4 .b8 __soa_to_aos4_float1_param_3[4],
	.param .b64 __soa_to_aos4_float1_param_4,
	.param .b64 __soa_to_aos4_float1_param_5,
	.param .b64 __soa_to_aos4_float1_param_6,
	.param .b64 __soa_to_aos4_float1_param_7
)	// @__soa_to_aos4_float1
{
	// Copies the four f32 inputs (param_0..3) to the four destination
	// addresses (param_4..7), one value per destination, in index order.
	// Only the registers actually used are declared.
	.reg .f32 %val<4>;
	.reg .s64 %dst<4>;

	// element 0
	ld.param.f32 %val0, [__soa_to_aos4_float1_param_0];
	ld.param.u64 %dst0, [__soa_to_aos4_float1_param_4];
	st.f32 [%dst0], %val0;

	// element 1
	ld.param.f32 %val1, [__soa_to_aos4_float1_param_1];
	ld.param.u64 %dst1, [__soa_to_aos4_float1_param_5];
	st.f32 [%dst1], %val1;

	// element 2
	ld.param.f32 %val2, [__soa_to_aos4_float1_param_2];
	ld.param.u64 %dst2, [__soa_to_aos4_float1_param_6];
	st.f32 [%dst2], %val2;

	// element 3
	ld.param.f32 %val3, [__soa_to_aos4_float1_param_3];
	ld.param.u64 %dst3, [__soa_to_aos4_float1_param_7];
	st.f32 [%dst3], %val3;

	ret;
}
// .globl __aos_to_soa3_float1
.func __aos_to_soa3_float1(
	.param .align 4 .b8 __aos_to_soa3_float1_param_0[4],
	.param .align 4 .b8 __aos_to_soa3_float1_param_1[4],
	.param .align 4 .b8 __aos_to_soa3_float1_param_2[4],
	.param .b64 __aos_to_soa3_float1_param_3,
	.param .b64 __aos_to_soa3_float1_param_4,
	.param .b64 __aos_to_soa3_float1_param_5
)	// @__aos_to_soa3_float1
{
	// Copies the three f32 inputs (param_0..2) to the three destination
	// addresses (param_3..5), one value per destination, in index order.
	// Only the registers actually used are declared.
	.reg .f32 %val<3>;
	.reg .s64 %dst<3>;

	// element 0
	ld.param.f32 %val0, [__aos_to_soa3_float1_param_0];
	ld.param.u64 %dst0, [__aos_to_soa3_float1_param_3];
	st.f32 [%dst0], %val0;

	// element 1
	ld.param.f32 %val1, [__aos_to_soa3_float1_param_1];
	ld.param.u64 %dst1, [__aos_to_soa3_float1_param_4];
	st.f32 [%dst1], %val1;

	// element 2
	ld.param.f32 %val2, [__aos_to_soa3_float1_param_2];
	ld.param.u64 %dst2, [__aos_to_soa3_float1_param_5];
	st.f32 [%dst2], %val2;

	ret;
}
// .globl __soa_to_aos3_float1
.func __soa_to_aos3_float1(
	.param .align 4 .b8 __soa_to_aos3_float1_param_0[4],
	.param .align 4 .b8 __soa_to_aos3_float1_param_1[4],
	.param .align 4 .b8 __soa_to_aos3_float1_param_2[4],
	.param .b64 __soa_to_aos3_float1_param_3,
	.param .b64 __soa_to_aos3_float1_param_4,
	.param .b64 __soa_to_aos3_float1_param_5
)	// @__soa_to_aos3_float1
{
	// Copies the three f32 inputs (param_0..2) to the three destination
	// addresses (param_3..5), one value per destination, in index order.
	// Only the registers actually used are declared.
	.reg .f32 %val<3>;
	.reg .s64 %dst<3>;

	// element 0
	ld.param.f32 %val0, [__soa_to_aos3_float1_param_0];
	ld.param.u64 %dst0, [__soa_to_aos3_float1_param_3];
	st.f32 [%dst0], %val0;

	// element 1
	ld.param.f32 %val1, [__soa_to_aos3_float1_param_1];
	ld.param.u64 %dst1, [__soa_to_aos3_float1_param_4];
	st.f32 [%dst1], %val1;

	// element 2
	ld.param.f32 %val2, [__soa_to_aos3_float1_param_2];
	ld.param.u64 %dst2, [__soa_to_aos3_float1_param_5];
	st.f32 [%dst2], %val2;

	ret;
}
// .globl mandelbrot_scanline
// Kernel entry point. As written below, each CTA covers
//   rows    [ctaid.y*param_7, min(ctaid.y*param_7 + param_7, param_5))
//   columns [ctaid.x*param_6, min(ctaid.x*param_6 + param_6, param_4))
// walking the columns WARP_SZ at a time, and stores one u32 iteration
// count per in-range point at param_9 + 4*(row*param_4 + column).
// NOTE(review): parameter roles are inferred from how they are used
// (param_0/1 = x origin and step, param_2/3 = y origin and step,
// param_4/5 = width/height, param_6/7 = tile sizes, param_8 = iteration
// limit, param_9 = output buffer) - confirm against the ISPC source.
.entry mandelbrot_scanline(
.param .f32 mandelbrot_scanline_param_0,
.param .f32 mandelbrot_scanline_param_1,
.param .f32 mandelbrot_scanline_param_2,
.param .f32 mandelbrot_scanline_param_3,
.param .u32 mandelbrot_scanline_param_4,
.param .u32 mandelbrot_scanline_param_5,
.param .u32 mandelbrot_scanline_param_6,
.param .u32 mandelbrot_scanline_param_7,
.param .u32 mandelbrot_scanline_param_8,
.param .u64 .ptr .align 4 mandelbrot_scanline_param_9
) // @mandelbrot_scanline
{
.reg .pred %p<396>;
.reg .s16 %rc<396>;
.reg .s16 %rs<396>;
.reg .s32 %r<396>;
.reg .s64 %rl<396>;
.reg .f32 %f<396>;
.reg .f64 %fl<396>;
// BB#0: // %allocas
// Row range of this CTA: %r0 = ctaid.y*param_7 (first row),
// %r1 = min(%r0 + param_7, param_5) (one past the last row).
ld.param.u32 %r6, [mandelbrot_scanline_param_5];
ld.param.u32 %r5, [mandelbrot_scanline_param_7];
mov.u32 %r7, %ctaid.y;
mul.lo.s32 %r0, %r7, %r5;
mad.lo.s32 %r1, %r7, %r5, %r5;
setp.lt.s32 %p0, %r1, %r6;
selp.b32 %r1, %r1, %r6, %p0;
setp.ge.s32 %p0, %r0, %r1;
// Empty row range: return immediately.
@%p0 bra BB4_13;
// BB#1: // %for_test28.preheader.lr.ph
ld.param.f32 %f0, [mandelbrot_scanline_param_0];
ld.param.f32 %f1, [mandelbrot_scanline_param_1];
ld.param.f32 %f2, [mandelbrot_scanline_param_2];
ld.param.f32 %f3, [mandelbrot_scanline_param_3];
// %r1 is reused from here on: it now holds param_4 (the row stride of the
// output index computed at BB#3).
ld.param.u32 %r1, [mandelbrot_scanline_param_4];
ld.param.u32 %r4, [mandelbrot_scanline_param_6];
ld.param.u32 %r2, [mandelbrot_scanline_param_8];
ld.param.u64 %rl0, [mandelbrot_scanline_param_9];
// Column range: %r3 = ctaid.x*param_6, %r4 = min(%r3 + param_6, param_4).
mov.u32 %r8, %ctaid.x;
mul.lo.s32 %r3, %r8, %r4;
mad.lo.s32 %r4, %r8, %r4, %r4;
setp.lt.s32 %p0, %r4, %r1;
selp.b32 %r4, %r4, %r1, %p0;
// %p0 = (param_8 > 0); it stays live for the whole row loop and selects
// between the iterating nest (BB4_4) and the store-zero nest (BB4_15).
setp.gt.s32 %p0, %r2, 0;
// %r5 = min(param_5, (ctaid.y+1)*param_7), formed as ~max(~a, ~b): the row
// value at which the outer loop stops.
not.b32 %r6, %r6;
add.s32 %r7, %r7, 1;
mul.lo.s32 %r5, %r7, %r5;
not.b32 %r5, %r5;
setp.gt.s32 %p1, %r6, %r5;
selp.b32 %r5, %r6, %r5, %p1;
not.b32 %r5, %r5;
BB4_2: // %for_test28.preheader
// =>This Loop Header: Depth=1
// Child Loop BB4_15 Depth 2
// Child Loop BB4_8 Depth 2
// Child Loop BB4_11 Depth 3
// Outer loop over rows (%r0). Skip the row if the column range is empty.
setp.ge.s32 %p1, %r3, %r4;
@%p1 bra BB4_12;
// BB#3: // %for_loop30.lr.ph
// in Loop: Header=BB4_2 Depth=1
// %r6 = row * param_4 (output index of the row start); %r7 = column cursor.
mul.lo.s32 %r6, %r0, %r1;
mov.u32 %r7, %r3;
@%p0 bra BB4_4;
bra.uni BB4_15;
BB4_4: // in Loop: Header=BB4_2 Depth=1
// %f4 = (float)row * param_3 + param_2, constant for the whole row.
cvt.rn.f32.s32 %f4, %r0;
fma.rn.f32 %f4, %f4, %f3, %f2;
mov.u32 %r7, %r3;
BB4_8: // %for_loop.i.lr.ph.us
// Parent Loop BB4_2 Depth=1
// => This Loop Header: Depth=2
// Child Loop BB4_11 Depth 3
// Column loop, one step of WARP_SZ columns per trip.
// %r11 = column of this thread = %r7 + (tid.x & (WARP_SZ-1));
// %f5 = (float)column * param_1 + param_0.
mov.u32 %r9, %tid.x;
mov.u32 %r8, WARP_SZ;
add.s32 %r10, %r8, -1;
and.b32 %r10, %r10, %r9;
add.s32 %r11, %r10, %r7;
cvt.rn.f32.s32 %f5, %r11;
fma.rn.f32 %f5, %f5, %f1, %f0;
// Inner-loop state: %r10 = iteration count (0), %p1 = constant false,
// %p3 = "continue" flag (true), %p4 = active flag (param_8 > 0),
// %p2 = accumulated "escaped" flag (false), (%f7, %f6) = (%f5, %f4).
mov.u32 %r10, 0;
mov.pred %p1, 0;
mov.pred %p3, -1;
mov.pred %p4, %p0;
mov.pred %p2, %p1;
mov.f32 %f7, %f5;
mov.f32 %f6, %f4;
BB4_11: // %for_loop.i.us
// Parent Loop BB4_2 Depth=1
// Parent Loop BB4_8 Depth=2
// => This Inner Loop Header: Depth=3
// active = continue & active
and.pred %p4, %p3, %p4;
// %f8 = %f7*%f7, %f9 = %f6*%f6 + %f8; 0f40800000 is 4.0f. setp.gtu is also
// true when the comparison is unordered (NaN).
mul.f32 %f8, %f7, %f7;
fma.rn.f32 %f9, %f6, %f6, %f8;
setp.gtu.f32 %p3, %f9, 0f40800000;
and.pred %p3, %p4, %p3;
or.pred %p2, %p3, %p2;
// %p5 = escaped XOR active; when they are equal the update is skipped and
// the step block runs with %p3 = false, which ends the loop.
xor.pred %p5, %p2, %p4;
mov.pred %p3, %p1;
@!%p5 bra BB4_10;
bra.uni BB4_9;
BB4_9: // %not_all_continued_or_breaked.i.us
// in Loop: Header=BB4_11 Depth=3
// continue = active & !escaped, then
// new %f7 = %f5 + (%f7*%f7 - %f6*%f6), new %f6 = %f6*(2*%f7) + %f4.
mul.f32 %f9, %f6, %f6;
not.pred %p3, %p2;
and.pred %p3, %p4, %p3;
sub.f32 %f8, %f8, %f9;
add.f32 %f8, %f5, %f8;
add.f32 %f7, %f7, %f7;
fma.rn.f32 %f6, %f6, %f7, %f4;
mov.f32 %f7, %f8;
BB4_10: // %for_step.i.us
// in Loop: Header=BB4_11 Depth=3
// Count the iteration only when %p3 (continue) is set; loop again while
// %p3 is set and the count is still below param_8.
add.s32 %r12, %r10, 1;
selp.b32 %r10, %r12, %r10, %p3;
setp.lt.s32 %p4, %r10, %r2;
and.pred %p5, %p3, %p4;
@%p5 bra BB4_11;
// BB#5: // %mandel___vyfvyfvyi.exit.us
// in Loop: Header=BB4_8 Depth=2
// Columns at or beyond the range end (%r4) store nothing.
setp.ge.s32 %p1, %r11, %r4;
@%p1 bra BB4_7;
// BB#6: // %if_then.us
// in Loop: Header=BB4_8 Depth=2
// Byte offset = 4 * (column cursor + row*param_4 + masked tid.x). The mask
// is WARP_SZ + 0x3FFFFFFF (1073741823); the shift left by 2 below discards
// bits 30 and 31 of the sum.
add.s32 %r11, %r8, 1073741823;
and.b32 %r9, %r11, %r9;
add.s32 %r11, %r7, %r6;
add.s32 %r9, %r11, %r9;
shl.b32 %r9, %r9, 2;
cvt.s64.s32 %rl1, %r9;
add.s64 %rl1, %rl1, %rl0;
// Store the iteration count.
st.u32 [%rl1], %r10;
BB4_7: // %if_exit.us
// in Loop: Header=BB4_8 Depth=2
// Advance the column cursor by WARP_SZ.
add.s32 %r7, %r8, %r7;
setp.lt.s32 %p1, %r7, %r4;
@%p1 bra BB4_8;
bra.uni BB4_12;
BB4_15: // %mandel___vyfvyfvyi.exit
// Parent Loop BB4_2 Depth=1
// => This Inner Loop Header: Depth=2
// Column loop taken when param_8 <= 0: no iteration is run and every
// in-range point gets a stored count of 0.
mov.u32 %r9, %tid.x;
mov.u32 %r8, WARP_SZ;
add.s32 %r10, %r8, -1;
and.b32 %r10, %r10, %r9;
add.s32 %r10, %r10, %r7;
setp.lt.s32 %p1, %r10, %r4;
@%p1 bra BB4_16;
bra.uni BB4_14;
BB4_16: // %if_then
// in Loop: Header=BB4_15 Depth=2
// Same address computation as in BB#6.
add.s32 %r10, %r8, 1073741823;
and.b32 %r9, %r10, %r9;
add.s32 %r10, %r7, %r6;
add.s32 %r9, %r10, %r9;
shl.b32 %r9, %r9, 2;
cvt.s64.s32 %rl1, %r9;
add.s64 %rl1, %rl1, %rl0;
mov.u32 %r9, 0;
st.u32 [%rl1], %r9;
BB4_14: // %if_exit
// in Loop: Header=BB4_15 Depth=2
add.s32 %r7, %r8, %r7;
setp.lt.s32 %p1, %r7, %r4;
@%p1 bra BB4_15;
BB4_12: // %for_exit31
// in Loop: Header=BB4_2 Depth=1
// Next row; stop when the row reaches the bound computed in BB#1 (%r5).
add.s32 %r0, %r0, 1;
setp.eq.s32 %p1, %r0, %r5;
@%p1 bra BB4_13;
bra.uni BB4_2;
BB4_13: // %for_exit
ret;
}

View File

@@ -1,171 +0,0 @@
.file "mandelbrot_task.ispc"
.text
# -----------------------------------------------------------------------
# mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E_
# Syntax/ABI: GAS AT&T, System V AMD64, AVX.
# In:   xmm0..xmm3 = four float arguments
#       edi, esi, edx = three 32-bit integer arguments
#       rcx = 64-bit argument, ymm4 = 8 x 32-bit lane mask
#       NOTE(review): the mangled name suggests (x0, y0, x1, y1, width,
#       height, maxIterations, output[]) plus the ISPC execution mask -
#       confirm against the ISPC source.
# Does: calls ISPCAlloc(&handle, 96, 32) (presumably size and alignment -
#       confirm), fills the returned block, calls
#       ISPCLaunch(&handle, mandelbrot_scanline___..., block,
#                  edi/16, esi/16, 1)
#       and, if the handle at 80(%rsp) is non-null afterwards, ISPCSync.
# Block layout written below:
#       +0  xmm0              +4  (xmm2 - xmm0) / (float)edi
#       +8  xmm1              +12 (xmm3 - xmm1) / (float)esi
#       +16 edi   +20 esi   +24 16   +28 16   +32 edx   +40 rcx
#       +64 32-byte lane mask (stored with vmovaps, so 32-byte aligned)
# Saves: rbp, r15, r14, rbx (callee-saved, used to hold the arguments
#       across the calls). Stack: 4 pushes + 88 bytes keep rsp 16-byte
#       aligned at every call.
# -----------------------------------------------------------------------
.globl mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E_
.align 16, 0x90
.type mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E_,@function
mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E_: # @mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E_
# BB#0: # %allocas
    pushq %rbp
    pushq %r15
    pushq %r14
    pushq %rbx
    subq $88, %rsp
    vmovups %ymm4, 32(%rsp) # 32-byte Folded Spill
    # Keep the integer arguments in callee-saved registers across the calls.
    movq %rcx, %r14
    movl %edx, %r15d
    movl %esi, %ebx
    movl %edi, %ebp
    vmovss %xmm1, 76(%rsp) # 4-byte Spill
    vmovss %xmm0, 28(%rsp) # 4-byte Spill
    # Steps: (xmm2 - xmm0)/(float)edi -> 68(%rsp), (xmm3 - xmm1)/(float)esi -> 72(%rsp).
    vcvtsi2ssl %ebp, %xmm0, %xmm5
    vsubss %xmm0, %xmm2, %xmm4
    vcvtsi2ssl %ebx, %xmm0, %xmm2
    vsubss %xmm1, %xmm3, %xmm3
    # Launch handle lives at 80(%rsp); start it as NULL.
    movq $0, 80(%rsp)
    leaq 80(%rsp), %rdi
    vdivss %xmm2, %xmm3, %xmm1
    vmovss %xmm1, 72(%rsp) # 4-byte Spill
    vdivss %xmm5, %xmm4, %xmm0
    vmovss %xmm0, 68(%rsp) # 4-byte Spill
    movl $96, %esi
    movl $32, %edx
    vzeroupper
    callq ISPCAlloc
    vmovups 32(%rsp), %ymm0 # 32-byte Folded Reload
    # rdx = block returned by ISPCAlloc; it is also ISPCLaunch's third argument.
    movq %rax, %rdx
    # r8d = esi / 16, signed, rounding toward zero (bias by 15 when negative).
    movl %ebx, %r8d
    sarl $31, %r8d
    shrl $28, %r8d
    addl %ebx, %r8d
    vmovss 28(%rsp), %xmm1 # 4-byte Reload
    vmovss %xmm1, (%rdx)
    sarl $4, %r8d
    # ecx = edi / 16, same rounding.
    movl %ebp, %ecx
    sarl $31, %ecx
    shrl $28, %ecx
    addl %ebp, %ecx
    sarl $4, %ecx
    # Are all 8 mask lanes set? The flags from this cmpl are consumed by the
    # jne below; only mov-class instructions sit in between, so they survive.
    vmovmskps %ymm0, %eax
    cmpl $255, %eax
    vmovss 68(%rsp), %xmm1 # 4-byte Reload
    vmovss %xmm1, 4(%rdx)
    vmovss 76(%rsp), %xmm1 # 4-byte Reload
    vmovss %xmm1, 8(%rdx)
    vmovss 72(%rsp), %xmm1 # 4-byte Reload
    vmovss %xmm1, 12(%rdx)
    movl %ebp, 16(%rdx)
    movl %ebx, 20(%rdx)
    movl $16, 24(%rdx)
    movl $16, 28(%rdx)
    movl %r15d, 32(%rdx)
    movq %r14, 40(%rdx)
    jne .LBB0_2
# BB#1: # %all_on
    # Every lane's sign bit is set: store a canonical all-ones mask.
    vpcmpeqd %xmm0, %xmm0, %xmm0
    vinsertf128 $1, %xmm0, %ymm0, %ymm0
.LBB0_2: # %all_on
    vmovaps %ymm0, 64(%rdx)
    leaq 80(%rsp), %rdi
    # RIP-relative address of the task function. The previous
    # `movl $sym, %esi` needed a 32-bit absolute relocation, which cannot be
    # linked into a PIE/shared object and truncates the address to 32 bits.
    leaq mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E_(%rip), %rsi
    movl $1, %r9d
    vzeroupper
    callq ISPCLaunch
    # Sync only if the launch left a non-null handle.
    movq 80(%rsp), %rdi
    testq %rdi, %rdi
    je .LBB0_4
# BB#3: # %call_sync
    callq ISPCSync
    movq $0, 80(%rsp)
.LBB0_4: # %post_sync
    addq $88, %rsp
    popq %rbx
    popq %r14
    popq %r15
    popq %rbp
    ret
.Ltmp0:
.size mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E_, .Ltmp0-mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E_
# -----------------------------------------------------------------------
# mandelbrot_ispc
# Syntax/ABI: GAS AT&T, System V AMD64, AVX.
# In:   xmm0..xmm3 = four float arguments
#       edi, esi, edx = three 32-bit integer arguments, rcx = 64-bit argument
#       NOTE(review): presumably (x0, y0, x1, y1, width, height,
#       maxIterations, output[]) - confirm against the ISPC source.
# Does: calls ISPCAlloc(&handle, 96, 32) (presumably size and alignment -
#       confirm), fills the returned block with an all-ones lane mask,
#       calls ISPCLaunch(&handle, mandelbrot_scanline___..., block,
#                        edi/16, esi/16, 1)
#       and, if the handle at 24(%rsp) is non-null afterwards, ISPCSync.
# Block layout written below:
#       +0  xmm0              +4  (xmm2 - xmm0) / (float)edi
#       +8  xmm1              +12 (xmm3 - xmm1) / (float)esi
#       +16 edi   +20 esi   +24 16   +28 16   +32 edx   +40 rcx
#       +64 32 bytes of all-ones (stored with vmovaps, so 32-byte aligned)
# Saves: rbp, r15, r14, r12, rbx. Stack: 5 pushes + 32 bytes keep rsp
#       16-byte aligned at every call.
# -----------------------------------------------------------------------
.globl mandelbrot_ispc
.align 16, 0x90
.type mandelbrot_ispc,@function
mandelbrot_ispc: # @mandelbrot_ispc
# BB#0: # %allocas
    pushq %rbp
    pushq %r15
    pushq %r14
    pushq %r12
    pushq %rbx
    subq $32, %rsp
    # Keep the integer arguments in callee-saved registers across the calls.
    movq %rcx, %r14
    movl %edx, %r15d
    movl %esi, %ebx
    movl %edi, %ebp
    vmovss %xmm1, 20(%rsp) # 4-byte Spill
    vmovss %xmm0, 8(%rsp) # 4-byte Spill
    # Steps: (xmm2 - xmm0)/(float)edi -> 12(%rsp), (xmm3 - xmm1)/(float)esi -> 16(%rsp).
    vcvtsi2ssl %ebp, %xmm0, %xmm5
    vsubss %xmm0, %xmm2, %xmm4
    vcvtsi2ssl %ebx, %xmm0, %xmm2
    vsubss %xmm1, %xmm3, %xmm3
    # Launch handle lives at 24(%rsp); start it as NULL. r12 = &handle.
    movq $0, 24(%rsp)
    leaq 24(%rsp), %r12
    vdivss %xmm2, %xmm3, %xmm1
    vmovss %xmm1, 16(%rsp) # 4-byte Spill
    vdivss %xmm5, %xmm4, %xmm0
    vmovss %xmm0, 12(%rsp) # 4-byte Spill
    movq %r12, %rdi
    movl $96, %esi
    movl $32, %edx
    callq ISPCAlloc
    # r8d = esi / 16 and ecx = edi / 16, signed, rounding toward zero
    # (bias by 15 when negative, then arithmetic shift).
    movl %ebx, %r8d
    sarl $31, %r8d
    # ymm0 becomes all-ones (low half here, high half at vinsertf128 below).
    vpcmpeqd %xmm0, %xmm0, %xmm0
    vmovss 8(%rsp), %xmm1 # 4-byte Reload
    vmovss %xmm1, (%rax)
    shrl $28, %r8d
    addl %ebx, %r8d
    movl %ebp, %ecx
    sarl $31, %ecx
    shrl $28, %ecx
    addl %ebp, %ecx
    sarl $4, %ecx
    sarl $4, %r8d
    vinsertf128 $1, %xmm0, %ymm0, %ymm0
    vmovss 12(%rsp), %xmm1 # 4-byte Reload
    vmovss %xmm1, 4(%rax)
    vmovss 20(%rsp), %xmm1 # 4-byte Reload
    vmovss %xmm1, 8(%rax)
    vmovss 16(%rsp), %xmm1 # 4-byte Reload
    vmovss %xmm1, 12(%rax)
    movl %ebp, 16(%rax)
    movl %ebx, 20(%rax)
    movl $16, 24(%rax)
    movl $16, 28(%rax)
    movl %r15d, 32(%rax)
    movq %r14, 40(%rax)
    vmovaps %ymm0, 64(%rax)
    movq %r12, %rdi
    # RIP-relative address of the task function. The previous
    # `movl $sym, %esi` needed a 32-bit absolute relocation, which cannot be
    # linked into a PIE/shared object and truncates the address to 32 bits.
    leaq mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E_(%rip), %rsi
    movq %rax, %rdx
    movl $1, %r9d
    vzeroupper
    callq ISPCLaunch
    # Sync only if the launch left a non-null handle.
    movq 24(%rsp), %rdi
    testq %rdi, %rdi
    je .LBB1_2
# BB#1: # %call_sync
    callq ISPCSync
    movq $0, 24(%rsp)
.LBB1_2: # %post_sync
    addq $32, %rsp
    popq %rbx
    popq %r12
    popq %r14
    popq %r15
    popq %rbp
    ret
.Ltmp1:
.size mandelbrot_ispc, .Ltmp1-mandelbrot_ispc
.section ".note.GNU-stack","",@progbits

View File

@@ -1,103 +0,0 @@
; ModuleID = 'task.bc'
target datalayout = "e-p:64:64:64-S0-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-v16:16:16-v32:32:32-n16:32:64"
target triple = "nvptx64"
@data = external global [1024 x i32]
; Function Attrs: alwaysinline nounwind readnone
; One-lane select on i8: yields the first operand when the mask lane is
; zero, the second operand otherwise.
define <1 x i8> @__vselect_i8(<1 x i8>, <1 x i8>, <1 x i32> %mask) #0 {
  %when_zero = extractelement <1 x i8> %0, i32 0
  %when_nonzero = extractelement <1 x i8> %1, i32 0
  %lane = extractelement <1 x i32> %mask, i32 0
  %lane_is_zero = icmp eq i32 %lane, 0
  %picked = select i1 %lane_is_zero, i8 %when_zero, i8 %when_nonzero
  %result = insertelement <1 x i8> undef, i8 %picked, i32 0
  ret <1 x i8> %result
}
; Function Attrs: alwaysinline nounwind readnone
; One-lane select on i16: yields the first operand when the mask lane is
; zero, the second operand otherwise.
define <1 x i16> @__vselect_i16(<1 x i16>, <1 x i16>, <1 x i32> %mask) #0 {
  %when_zero = extractelement <1 x i16> %0, i32 0
  %when_nonzero = extractelement <1 x i16> %1, i32 0
  %lane = extractelement <1 x i32> %mask, i32 0
  %lane_is_zero = icmp eq i32 %lane, 0
  %picked = select i1 %lane_is_zero, i16 %when_zero, i16 %when_nonzero
  %result = insertelement <1 x i16> undef, i16 %picked, i32 0
  ret <1 x i16> %result
}
; Function Attrs: alwaysinline nounwind readnone
; One-lane select on i64: yields the first operand when the mask lane is
; zero, the second operand otherwise.
define <1 x i64> @__vselect_i64(<1 x i64>, <1 x i64>, <1 x i32> %mask) #0 {
  %when_zero = extractelement <1 x i64> %0, i32 0
  %when_nonzero = extractelement <1 x i64> %1, i32 0
  %lane = extractelement <1 x i32> %mask, i32 0
  %lane_is_zero = icmp eq i32 %lane, 0
  %picked = select i1 %lane_is_zero, i64 %when_zero, i64 %when_nonzero
  %result = insertelement <1 x i64> undef, i64 %picked, i32 0
  ret <1 x i64> %result
}
; Function Attrs: nounwind readnone
declare double @llvm.nvvm.rsqrt.approx.d(double) #1
; Function Attrs: alwaysinline nounwind
; Stores each one-lane input %v0..%v3 through the matching pointer
; %out0..%out3 (4-byte aligned). The pointers are declared noalias.
define void @__aos_to_soa4_float1(<1 x float> %v0, <1 x float> %v1, <1 x float> %v2, <1 x float> %v3, <1 x float>* noalias nocapture %out0, <1 x float>* noalias nocapture %out1, <1 x float>* noalias nocapture %out2, <1 x float>* noalias nocapture %out3) #2 {
store <1 x float> %v0, <1 x float>* %out0, align 4
store <1 x float> %v1, <1 x float>* %out1, align 4
store <1 x float> %v2, <1 x float>* %out2, align 4
store <1 x float> %v3, <1 x float>* %out3, align 4
ret void
}
; Function Attrs: alwaysinline nounwind
; Stores each one-lane input %v0..%v3 through the matching pointer
; %out0..%out3 (4-byte aligned). The pointers are declared noalias.
; The body is the same sequence of stores as @__aos_to_soa4_float1.
define void @__soa_to_aos4_float1(<1 x float> %v0, <1 x float> %v1, <1 x float> %v2, <1 x float> %v3, <1 x float>* noalias nocapture %out0, <1 x float>* noalias nocapture %out1, <1 x float>* noalias nocapture %out2, <1 x float>* noalias nocapture %out3) #2 {
store <1 x float> %v0, <1 x float>* %out0, align 4
store <1 x float> %v1, <1 x float>* %out1, align 4
store <1 x float> %v2, <1 x float>* %out2, align 4
store <1 x float> %v3, <1 x float>* %out3, align 4
ret void
}
; Function Attrs: nounwind
; Stores each one-lane input %v0..%v2 through the matching pointer
; %out0..%out2 (4-byte aligned). Unlike the 4-wide variants these pointers
; are not marked noalias, so the store order is significant if they overlap.
define void @__aos_to_soa3_float1(<1 x float> %v0, <1 x float> %v1, <1 x float> %v2, <1 x float>* nocapture %out0, <1 x float>* nocapture %out1, <1 x float>* nocapture %out2) #3 {
store <1 x float> %v0, <1 x float>* %out0, align 4
store <1 x float> %v1, <1 x float>* %out1, align 4
store <1 x float> %v2, <1 x float>* %out2, align 4
ret void
}
; Function Attrs: nounwind
; Stores each one-lane input %v0..%v2 through the matching pointer
; %out0..%out2 (4-byte aligned). The pointers are not marked noalias, so
; the store order is significant if they overlap.
define void @__soa_to_aos3_float1(<1 x float> %v0, <1 x float> %v1, <1 x float> %v2, <1 x float>* nocapture %out0, <1 x float>* nocapture %out1, <1 x float>* nocapture %out2) #3 {
store <1 x float> %v0, <1 x float>* %out0, align 4
store <1 x float> %v1, <1 x float>* %out1, align 4
store <1 x float> %v2, <1 x float>* %out2, align 4
ret void
}
; Function Attrs: alwaysinline nounwind readonly
; Applies @llvm.nvvm.rsqrt.approx.d to the single lane of %v and returns the
; result as a one-lane vector.
define <1 x double> @__rsqrt_varying_double(<1 x double> %v) #4 {
  %lane = extractelement <1 x double> %v, i32 0
  %approx = tail call double @llvm.nvvm.rsqrt.approx.d(double %lane)
  %packed = insertelement <1 x double> undef, double %approx, i32 0
  ret <1 x double> %packed
}
; Function Attrs: nounwind
declare i32 @foo1___(<1 x i32>) #5
; Function Attrs: nounwind
; Calls @foo1___ with the incoming mask and stores i32 0 into @data at the
; index it returns (sign-extended to i64). No range check is made on the
; index before the store.
define void @foo___(<1 x i32> %__mask) #5 {
entry:
  %index32 = tail call i32 @foo1___(<1 x i32> %__mask)
  %index64 = sext i32 %index32 to i64
  %slot = getelementptr [1024 x i32]* @data, i64 0, i64 %index64
  store i32 0, i32* %slot, align 4
  ret void
}
attributes #0 = { alwaysinline nounwind readnone }
attributes #1 = { nounwind readnone }
attributes #2 = { alwaysinline nounwind }
attributes #3 = { nounwind }
attributes #4 = { alwaysinline nounwind readonly }
attributes #5 = { nounwind "target-features"="+sm_35" }

View File

@@ -1,543 +0,0 @@
//
// Generated by LLVM NVPTX Back-End
//
.version 3.1
.target sm_20, texmode_independent
.address_size 64
// .globl __land_id
// @__land_id
// Returns the value of the %laneid special register.
.func (.param .b32 func_retval0) __land_id(
)
{
    .reg .s32 %lane;

    mov.u32 %lane, %laneid;
    st.param.b32 [func_retval0+0], %lane;
    ret;
}
// .globl __vselect_i8
// Returns param_0 when the 32-bit mask in param_2 is zero, param_1 otherwise.
.func (.param .align 1 .b8 func_retval0[1]) __vselect_i8(
    .param .align 1 .b8 __vselect_i8_param_0[1],
    .param .align 1 .b8 __vselect_i8_param_1[1],
    .param .align 4 .b8 __vselect_i8_param_2[4]
) // @__vselect_i8
{
    .reg .pred %mask_is_zero;
    .reg .s32 %mask;
    .reg .s16 %opt<2>;    // the two candidate bytes, held in 16-bit registers
    .reg .s16 %pick;

    ld.param.u8 %opt0, [__vselect_i8_param_0];
    ld.param.u8 %opt1, [__vselect_i8_param_1];
    ld.param.u32 %mask, [__vselect_i8_param_2];
    setp.eq.s32 %mask_is_zero, %mask, 0;
    selp.b16 %pick, %opt0, %opt1, %mask_is_zero;
    st.param.b8 [func_retval0+0], %pick;
    ret;
}
// .globl __vselect_i16
// Returns param_0 when the 32-bit mask in param_2 is zero, param_1 otherwise.
.func (.param .align 2 .b8 func_retval0[2]) __vselect_i16(
    .param .align 2 .b8 __vselect_i16_param_0[2],
    .param .align 2 .b8 __vselect_i16_param_1[2],
    .param .align 4 .b8 __vselect_i16_param_2[4]
) // @__vselect_i16
{
    .reg .pred %mask_is_zero;
    .reg .s32 %mask;
    .reg .s16 %opt<2>;    // the two candidate values
    .reg .s16 %pick;

    ld.param.u16 %opt0, [__vselect_i16_param_0];
    ld.param.u16 %opt1, [__vselect_i16_param_1];
    ld.param.u32 %mask, [__vselect_i16_param_2];
    setp.eq.s32 %mask_is_zero, %mask, 0;
    selp.b16 %pick, %opt0, %opt1, %mask_is_zero;
    st.param.b16 [func_retval0+0], %pick;
    ret;
}
// .globl __vselect_i64
// Returns param_0 when the 32-bit mask in param_2 is zero, param_1 otherwise.
.func (.param .align 8 .b8 func_retval0[8]) __vselect_i64(
    .param .align 8 .b8 __vselect_i64_param_0[8],
    .param .align 8 .b8 __vselect_i64_param_1[8],
    .param .align 4 .b8 __vselect_i64_param_2[4]
) // @__vselect_i64
{
    .reg .pred %mask_is_zero;
    .reg .s32 %mask;
    .reg .s64 %opt<2>;    // the two candidate values
    .reg .s64 %pick;

    ld.param.u64 %opt0, [__vselect_i64_param_0];
    ld.param.u64 %opt1, [__vselect_i64_param_1];
    ld.param.u32 %mask, [__vselect_i64_param_2];
    setp.eq.s32 %mask_is_zero, %mask, 0;
    selp.b64 %pick, %opt0, %opt1, %mask_is_zero;
    st.param.b64 [func_retval0+0], %pick;
    ret;
}
// .globl __aos_to_soa4_float1
// Writes the four 4-byte inputs param_0..param_3 through the four pointers
// param_4..param_7; input i goes to pointer i.
.func __aos_to_soa4_float1(
    .param .align 4 .b8 __aos_to_soa4_float1_param_0[4],
    .param .align 4 .b8 __aos_to_soa4_float1_param_1[4],
    .param .align 4 .b8 __aos_to_soa4_float1_param_2[4],
    .param .align 4 .b8 __aos_to_soa4_float1_param_3[4],
    .param .b64 __aos_to_soa4_float1_param_4,
    .param .b64 __aos_to_soa4_float1_param_5,
    .param .b64 __aos_to_soa4_float1_param_6,
    .param .b64 __aos_to_soa4_float1_param_7
) // @__aos_to_soa4_float1
{
    .reg .f32 %val<4>;    // the four input values
    .reg .s64 %dst<4>;    // the four destination addresses

    // Inputs first, then where they go.
    ld.param.f32 %val0, [__aos_to_soa4_float1_param_0];
    ld.param.f32 %val1, [__aos_to_soa4_float1_param_1];
    ld.param.f32 %val2, [__aos_to_soa4_float1_param_2];
    ld.param.f32 %val3, [__aos_to_soa4_float1_param_3];
    ld.param.u64 %dst0, [__aos_to_soa4_float1_param_4];
    ld.param.u64 %dst1, [__aos_to_soa4_float1_param_5];
    ld.param.u64 %dst2, [__aos_to_soa4_float1_param_6];
    ld.param.u64 %dst3, [__aos_to_soa4_float1_param_7];

    // Store order 0..3 is kept as it was.
    st.f32 [%dst0], %val0;
    st.f32 [%dst1], %val1;
    st.f32 [%dst2], %val2;
    st.f32 [%dst3], %val3;
    ret;
}
// .globl __soa_to_aos4_float1
// Writes the four 4-byte inputs param_0..param_3 through the four pointers
// param_4..param_7; input i goes to pointer i.
.func __soa_to_aos4_float1(
    .param .align 4 .b8 __soa_to_aos4_float1_param_0[4],
    .param .align 4 .b8 __soa_to_aos4_float1_param_1[4],
    .param .align 4 .b8 __soa_to_aos4_float1_param_2[4],
    .param .align 4 .b8 __soa_to_aos4_float1_param_3[4],
    .param .b64 __soa_to_aos4_float1_param_4,
    .param .b64 __soa_to_aos4_float1_param_5,
    .param .b64 __soa_to_aos4_float1_param_6,
    .param .b64 __soa_to_aos4_float1_param_7
) // @__soa_to_aos4_float1
{
    .reg .f32 %val<4>;    // the four input values
    .reg .s64 %dst<4>;    // the four destination addresses

    // Inputs first, then where they go.
    ld.param.f32 %val0, [__soa_to_aos4_float1_param_0];
    ld.param.f32 %val1, [__soa_to_aos4_float1_param_1];
    ld.param.f32 %val2, [__soa_to_aos4_float1_param_2];
    ld.param.f32 %val3, [__soa_to_aos4_float1_param_3];
    ld.param.u64 %dst0, [__soa_to_aos4_float1_param_4];
    ld.param.u64 %dst1, [__soa_to_aos4_float1_param_5];
    ld.param.u64 %dst2, [__soa_to_aos4_float1_param_6];
    ld.param.u64 %dst3, [__soa_to_aos4_float1_param_7];

    // Store order 0..3 is kept as it was.
    st.f32 [%dst0], %val0;
    st.f32 [%dst1], %val1;
    st.f32 [%dst2], %val2;
    st.f32 [%dst3], %val3;
    ret;
}
// .globl __aos_to_soa3_float1
// Writes the three 4-byte inputs param_0..param_2 through the three
// pointers param_3..param_5; input i goes to pointer i.
.func __aos_to_soa3_float1(
    .param .align 4 .b8 __aos_to_soa3_float1_param_0[4],
    .param .align 4 .b8 __aos_to_soa3_float1_param_1[4],
    .param .align 4 .b8 __aos_to_soa3_float1_param_2[4],
    .param .b64 __aos_to_soa3_float1_param_3,
    .param .b64 __aos_to_soa3_float1_param_4,
    .param .b64 __aos_to_soa3_float1_param_5
) // @__aos_to_soa3_float1
{
    .reg .f32 %val<3>;    // the three input values
    .reg .s64 %dst<3>;    // the three destination addresses

    // Inputs first, then where they go.
    ld.param.f32 %val0, [__aos_to_soa3_float1_param_0];
    ld.param.f32 %val1, [__aos_to_soa3_float1_param_1];
    ld.param.f32 %val2, [__aos_to_soa3_float1_param_2];
    ld.param.u64 %dst0, [__aos_to_soa3_float1_param_3];
    ld.param.u64 %dst1, [__aos_to_soa3_float1_param_4];
    ld.param.u64 %dst2, [__aos_to_soa3_float1_param_5];

    // Store order 0, 1, 2 is kept: nothing here says the pointers differ.
    st.f32 [%dst0], %val0;
    st.f32 [%dst1], %val1;
    st.f32 [%dst2], %val2;
    ret;
}
// .globl __soa_to_aos3_float1
// Writes the three 4-byte inputs param_0..param_2 through the three
// pointers param_3..param_5; input i goes to pointer i.
.func __soa_to_aos3_float1(
    .param .align 4 .b8 __soa_to_aos3_float1_param_0[4],
    .param .align 4 .b8 __soa_to_aos3_float1_param_1[4],
    .param .align 4 .b8 __soa_to_aos3_float1_param_2[4],
    .param .b64 __soa_to_aos3_float1_param_3,
    .param .b64 __soa_to_aos3_float1_param_4,
    .param .b64 __soa_to_aos3_float1_param_5
) // @__soa_to_aos3_float1
{
    .reg .f32 %val<3>;    // the three input values
    .reg .s64 %dst<3>;    // the three destination addresses

    // Inputs first, then where they go.
    ld.param.f32 %val0, [__soa_to_aos3_float1_param_0];
    ld.param.f32 %val1, [__soa_to_aos3_float1_param_1];
    ld.param.f32 %val2, [__soa_to_aos3_float1_param_2];
    ld.param.u64 %dst0, [__soa_to_aos3_float1_param_3];
    ld.param.u64 %dst1, [__soa_to_aos3_float1_param_4];
    ld.param.u64 %dst2, [__soa_to_aos3_float1_param_5];

    // Store order 0, 1, 2 is kept: nothing here says the pointers differ.
    st.f32 [%dst0], %val0;
    st.f32 [%dst1], %val1;
    st.f32 [%dst2], %val2;
    ret;
}
// .globl __rsqrt_varying_double
// Returns rsqrt.approx.f64 of the 8-byte input in param_0.
.func (.param .align 8 .b8 func_retval0[8]) __rsqrt_varying_double(
    .param .align 8 .b8 __rsqrt_varying_double_param_0[8]
) // @__rsqrt_varying_double
{
    .reg .f64 %x;
    .reg .f64 %inv_root;

    ld.param.f64 %x, [__rsqrt_varying_double_param_0];
    rsqrt.approx.f64 %inv_root, %x;
    st.param.f64 [func_retval0+0], %inv_root;
    ret;
}
// .globl mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E_
// Device function. Only param_0 is read in this body; it is used as the
// address of an argument block from which the code loads:
//   +0, +4, +8, +12 : f32 -> %f0..%f3
//   +16, +20        : u32 -> %r4, %r8     +24, +28 : u32 -> %r9, %r7
//   +32             : u32 -> %r0          +48      : u32 -> %r5
//   +40             : u64 base address of the stores (loaded at BB#5)
// The sign of the +48 word picks one of two loop nests: negative goes to
// BB9_4, which iterates and stores one u32 count per in-range point;
// otherwise BB#1..BB9_30 runs, which ANDs that word into the lane mask and
// contains no stores.
// NOTE(review): the fields look like x0, dx, y0, dy, width, height, xspan,
// yspan, maxIterations, output and the execution mask - confirm against
// the ISPC source and the host-side launcher.
.func mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E_(
.param .b64 mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_0,
.param .b32 mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_1,
.param .b32 mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_2,
.param .b32 mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_3,
.param .b32 mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_4,
.param .b32 mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_5,
.param .b32 mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_6,
.param .b32 mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_7,
.param .b32 mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_8,
.param .b32 mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_9,
.param .b32 mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_10
) // @mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E_
{
.reg .pred %p<396>;
.reg .s16 %rc<396>;
.reg .s16 %rs<396>;
.reg .s32 %r<396>;
.reg .s64 %rl<396>;
.reg .f32 %f<396>;
.reg .f64 %fl<396>;
// BB#0: // %allocas
ld.param.u64 %rl0, [mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_0];
ld.f32 %f0, [%rl0];
ld.f32 %f1, [%rl0+4];
ld.f32 %f2, [%rl0+8];
ld.f32 %f3, [%rl0+12];
ld.u32 %r4, [%rl0+16];
ld.u32 %r8, [%rl0+20];
ld.u32 %r9, [%rl0+24];
ld.u32 %r7, [%rl0+28];
ld.u32 %r0, [%rl0+32];
ld.u32 %r5, [%rl0+48];
// %p0 = sign bit of the +48 word is set.
setp.lt.s32 %p0, %r5, 0;
// Column range: %r1 = ctaid.x * [+24], %r2 = min(%r1 + [+24], [+16]).
mov.u32 %r11, %ctaid.x;
mul.lo.s32 %r1, %r11, %r9;
mad.lo.s32 %r2, %r11, %r9, %r9;
setp.lt.s32 %p1, %r2, %r4;
selp.b32 %r2, %r2, %r4, %p1;
// Row range: %r3 = ctaid.y * [+28], %r6 = min(%r3 + [+28], [+20]).
mov.u32 %r10, %ctaid.y;
mul.lo.s32 %r3, %r10, %r7;
mad.lo.s32 %r6, %r10, %r7, %r7;
setp.lt.s32 %p1, %r6, %r8;
selp.b32 %r6, %r6, %r8, %p1;
@%p0 bra BB9_4;
// BB#1: // %for_test101.preheader
// Nest taken when the +48 word is non-negative. Empty row range: return.
setp.ge.s32 %p0, %r3, %r6;
@%p0 bra BB9_31;
// BB#2: // %for_test112.preheader.lr.ph
// %r4 = all-ones if [+32] > 0, else 0; %r6 = %r4 AND the +48 word.
setp.gt.s32 %p0, %r0, 0;
selp.b32 %r4, -1, 0, %p0;
and.b32 %r6, %r4, %r5;
// %r7 = min([+20], (ctaid.y+1)*[+28]), formed as ~max(~a, ~b): the row value
// at which the outer loop stops.
not.b32 %r8, %r8;
add.s32 %r9, %r10, 1;
mul.lo.s32 %r7, %r7, %r9;
not.b32 %r7, %r7;
setp.gt.s32 %p0, %r8, %r7;
selp.b32 %r7, %r8, %r7, %p0;
not.b32 %r7, %r7;
BB9_3: // %for_test112.preheader
// =>This Loop Header: Depth=1
// Child Loop BB9_29 Depth 2
// Child Loop BB9_28 Depth 2
// Child Loop BB9_23 Depth 3
// Row loop (%r3). Skip the row if the column range is empty.
setp.ge.s32 %p0, %r1, %r2;
@%p0 bra BB9_30;
// BB#21: // %for_loop114.lr.ph
// in Loop: Header=BB9_3 Depth=1
// Iterate (BB9_22) only if the combined mask %r6 has its sign bit set;
// otherwise just step through the columns (BB9_29).
setp.lt.s32 %p0, %r6, 0;
mov.u32 %r8, %r1;
@%p0 bra BB9_22;
bra.uni BB9_29;
BB9_22: // in Loop: Header=BB9_3 Depth=1
// %f4 = %f3 * (float)row + %f2, constant for the whole row.
cvt.rn.f32.s32 %f4, %r3;
fma.rn.f32 %f4, %f3, %f4, %f2;
mov.u32 %r8, %r1;
BB9_28: // %for_loop.i.lr.ph.us
// Parent Loop BB9_3 Depth=1
// => This Loop Header: Depth=2
// Child Loop BB9_23 Depth 3
// Column loop, 32 columns per trip. %f5 = %f1 * (float)(laneid + %r8) + %f0.
mov.u32 %r9, %laneid;
add.s32 %r9, %r9, %r8;
cvt.rn.f32.s32 %f5, %r9;
fma.rn.f32 %f5, %f1, %f5, %f0;
// Inner-loop state: %r9 = constant 0, %r12 = active mask (%r4),
// %r10 = accumulated "escaped" mask (0), %r11 = iteration count (0),
// (%f7, %f6) = (%f5, %f4).
mov.u32 %r9, 0;
mov.u32 %r12, %r4;
mov.u32 %r10, %r9;
mov.u32 %r11, %r9;
mov.f32 %f7, %f5;
mov.f32 %f6, %f4;
BB9_23: // %for_loop.i.us
// Parent Loop BB9_3 Depth=1
// Parent Loop BB9_28 Depth=2
// => This Inner Loop Header: Depth=3
and.b32 %r13, %r12, %r5;
// %f8 = %f7*%f7, %f9 = %f6*%f6 + %f8; 0f40800000 is 4.0f. setp.gtu is also
// true when the comparison is unordered (NaN).
mul.f32 %f8, %f7, %f7;
fma.rn.f32 %f9, %f6, %f6, %f8;
setp.gtu.f32 %p0, %f9, 0f40800000;
selp.b32 %r14, %r12, 0, %p0;
or.b32 %r10, %r14, %r10;
// Compare the sign bits of (escaped & [+48]) and (active & [+48]); equal
// means nothing is left to update, so the active mask is cleared (BB9_24).
and.b32 %r14, %r10, %r5;
shr.u32 %r14, %r14, 31;
shr.u32 %r13, %r13, 31;
setp.eq.s32 %p0, %r14, %r13;
@%p0 bra BB9_24;
bra.uni BB9_25;
BB9_24: // in Loop: Header=BB9_23 Depth=3
mov.u32 %r12, %r9;
bra.uni BB9_26;
BB9_25: // %not_all_continued_or_breaked.i.us
// in Loop: Header=BB9_23 Depth=3
// active &= ~escaped, then
// new %f7 = %f5 + (%f7*%f7 - %f6*%f6), new %f6 = %f6*(2*%f7) + %f4.
mul.f32 %f9, %f6, %f6;
not.b32 %r13, %r10;
and.b32 %r12, %r12, %r13;
sub.f32 %f8, %f8, %f9;
add.f32 %f8, %f5, %f8;
add.f32 %f7, %f7, %f7;
fma.rn.f32 %f6, %f6, %f7, %f4;
mov.f32 %f7, %f8;
BB9_26: // %for_step.i.us
// in Loop: Header=BB9_23 Depth=3
// Count the iteration if the active mask is non-zero, clear the mask once
// the count reaches [+32], and loop while (active & [+48]) is negative.
setp.ne.s32 %p0, %r12, 0;
selp.u32 %r13, 1, 0, %p0;
add.s32 %r11, %r11, %r13;
setp.lt.s32 %p0, %r11, %r0;
selp.b32 %r12, %r12, 0, %p0;
and.b32 %r13, %r12, %r5;
setp.lt.s32 %p0, %r13, 0;
@%p0 bra BB9_23;
// BB#27: // %if_exit156.us
// in Loop: Header=BB9_28 Depth=2
// No store in this nest: advance 32 columns.
add.s32 %r8, %r8, 32;
setp.lt.s32 %p0, %r8, %r2;
@%p0 bra BB9_28;
bra.uni BB9_30;
BB9_29: // %if_exit156
// Parent Loop BB9_3 Depth=1
// => This Inner Loop Header: Depth=2
// Column loop with no work other than advancing the cursor by 32.
add.s32 %r8, %r8, 32;
setp.lt.s32 %p0, %r8, %r2;
@%p0 bra BB9_29;
BB9_30: // %for_exit115
// in Loop: Header=BB9_3 Depth=1
// Next row; stop at the bound in %r7.
add.s32 %r3, %r3, 1;
setp.eq.s32 %p0, %r3, %r7;
@%p0 bra BB9_31;
bra.uni BB9_3;
BB9_4: // %for_test.preheader
// Nest taken when the +48 word is negative. Empty row range: return.
setp.ge.s32 %p0, %r3, %r6;
@%p0 bra BB9_31;
// BB#5: // %for_test40.preheader.lr.ph
// %rl0 is reused: from here on it is the store base loaded from [+40].
ld.u64 %rl0, [%rl0+40];
// %r5 = all-ones if [+32] > 0, else 0 (the initial active mask).
setp.gt.s32 %p0, %r0, 0;
selp.b32 %r5, -1, 0, %p0;
// %r6 = min([+20], (ctaid.y+1)*[+28]), formed as ~max(~a, ~b): row bound.
not.b32 %r6, %r8;
add.s32 %r8, %r10, 1;
mul.lo.s32 %r8, %r7, %r8;
not.b32 %r8, %r8;
setp.gt.s32 %p0, %r6, %r8;
selp.b32 %r6, %r6, %r8, %p0;
not.b32 %r6, %r6;
// %r7 = first row * [+16] + first column: element index of the CTA's first
// point. %r13 = constant 0.
mul.lo.s32 %r7, %r10, %r7;
mul.lo.s32 %r7, %r7, %r4;
mad.lo.s32 %r7, %r11, %r9, %r7;
mov.u32 %r13, 0;
BB9_6: // %for_test40.preheader
// =>This Loop Header: Depth=1
// Child Loop BB9_19 Depth 2
// Child Loop BB9_12 Depth 2
// Child Loop BB9_13 Depth 3
// Row loop (%r3). Skip the row if the column range is empty.
setp.ge.s32 %p0, %r1, %r2;
@%p0 bra BB9_17;
// BB#7: // %for_loop42.lr.ph
// in Loop: Header=BB9_6 Depth=1
// Iterate (BB9_8) when [+32] > 0, otherwise store zeros (BB9_19).
// %r8 = running element index for BB9_19, %r9 = column cursor.
setp.lt.s32 %p0, %r5, 0;
mov.u32 %r8, %r7;
mov.u32 %r9, %r1;
@%p0 bra BB9_8;
bra.uni BB9_19;
BB9_8: // in Loop: Header=BB9_6 Depth=1
// %f4 = %f3 * (float)row + %f2; %r8 = row * [+16] (element index of the row
// start).
cvt.rn.f32.s32 %f4, %r3;
mul.lo.s32 %r8, %r3, %r4;
fma.rn.f32 %f4, %f3, %f4, %f2;
mov.u32 %r9, %r1;
BB9_12: // %for_loop.i206.lr.ph.us
// Parent Loop BB9_6 Depth=1
// => This Loop Header: Depth=2
// Child Loop BB9_13 Depth 3
// Column loop, 32 columns per trip. %r12 = column of this lane
// (laneid + %r9); %f5 = %f1 * (float)column + %f0.
mov.u32 %r10, %laneid;
add.s32 %r12, %r10, %r9;
cvt.rn.f32.s32 %f5, %r12;
fma.rn.f32 %f5, %f1, %f5, %f0;
// Inner-loop state: %r15 = active mask, %r14 = "escaped" mask (0),
// %r11 = iteration count (0), (%f7, %f6) = (%f5, %f4).
mov.u32 %r15, %r5;
mov.u32 %r14, %r13;
mov.u32 %r11, %r13;
mov.f32 %f7, %f5;
mov.f32 %f6, %f4;
BB9_13: // %for_loop.i206.us
// Parent Loop BB9_6 Depth=1
// Parent Loop BB9_12 Depth=2
// => This Inner Loop Header: Depth=3
// %f8 = %f7*%f7, %f9 = %f6*%f6 + %f8; 0f40800000 is 4.0f. setp.gtu is also
// true when the comparison is unordered (NaN).
mul.f32 %f8, %f7, %f7;
fma.rn.f32 %f9, %f6, %f6, %f8;
setp.gtu.f32 %p0, %f9, 0f40800000;
selp.b32 %r16, %r15, 0, %p0;
or.b32 %r14, %r16, %r14;
// Sign bits of the escaped and active masks equal: clear the active mask
// (BB9_14) instead of updating.
shr.u32 %r16, %r14, 31;
shr.u32 %r17, %r15, 31;
setp.eq.s32 %p0, %r16, %r17;
@%p0 bra BB9_14;
bra.uni BB9_15;
BB9_14: // in Loop: Header=BB9_13 Depth=3
mov.u32 %r15, %r13;
bra.uni BB9_16;
BB9_15: // %not_all_continued_or_breaked.i220.us
// in Loop: Header=BB9_13 Depth=3
// active &= ~escaped, then
// new %f7 = %f5 + (%f7*%f7 - %f6*%f6), new %f6 = %f6*(2*%f7) + %f4.
mul.f32 %f9, %f6, %f6;
not.b32 %r16, %r14;
and.b32 %r15, %r15, %r16;
sub.f32 %f8, %f8, %f9;
add.f32 %f8, %f5, %f8;
add.f32 %f7, %f7, %f7;
fma.rn.f32 %f6, %f6, %f7, %f4;
mov.f32 %f7, %f8;
BB9_16: // %for_step.i189.us
// in Loop: Header=BB9_13 Depth=3
// Count the iteration if the active mask is non-zero, clear the mask once
// the count reaches [+32], and loop while the mask is negative.
setp.ne.s32 %p0, %r15, 0;
selp.u32 %r16, 1, 0, %p0;
add.s32 %r11, %r11, %r16;
setp.lt.s32 %p0, %r11, %r0;
selp.b32 %r15, %r15, 0, %p0;
setp.lt.s32 %p0, %r15, 0;
@%p0 bra BB9_13;
// BB#9: // %mandel___vyfvyfvyi.exit221.us
// in Loop: Header=BB9_12 Depth=2
// Columns at or beyond the range end (%r2) store nothing.
setp.ge.s32 %p0, %r12, %r2;
@%p0 bra BB9_11;
// BB#10: // %if_then.us
// in Loop: Header=BB9_12 Depth=2
// Store the count at base + 4 * (column cursor + row*[+16] + laneid).
add.s32 %r12, %r9, %r8;
add.s32 %r10, %r12, %r10;
shl.b32 %r10, %r10, 2;
cvt.s64.s32 %rl1, %r10;
add.s64 %rl1, %rl1, %rl0;
st.u32 [%rl1], %r11;
BB9_11: // %if_exit.us
// in Loop: Header=BB9_12 Depth=2
add.s32 %r9, %r9, 32;
setp.lt.s32 %p0, %r9, %r2;
@%p0 bra BB9_12;
bra.uni BB9_17;
BB9_19: // %mandel___vyfvyfvyi.exit221
// Parent Loop BB9_6 Depth=1
// => This Inner Loop Header: Depth=2
// Column loop taken when [+32] <= 0: every in-range point gets a stored 0.
mov.u32 %r10, %laneid;
add.s32 %r11, %r10, %r9;
setp.lt.s32 %p0, %r11, %r2;
@%p0 bra BB9_20;
bra.uni BB9_18;
BB9_20: // %if_then
// in Loop: Header=BB9_19 Depth=2
// Address = base + 4 * (running element index %r8 + laneid).
add.s32 %r10, %r10, %r8;
shl.b32 %r10, %r10, 2;
cvt.s64.s32 %rl1, %r10;
add.s64 %rl1, %rl1, %rl0;
mov.u32 %r10, 0;
st.u32 [%rl1], %r10;
BB9_18: // %if_exit
// in Loop: Header=BB9_19 Depth=2
// Advance both the column cursor and the running element index by 32.
add.s32 %r9, %r9, 32;
add.s32 %r8, %r8, 32;
setp.lt.s32 %p0, %r9, %r2;
@%p0 bra BB9_19;
BB9_17: // %for_exit43
// in Loop: Header=BB9_6 Depth=1
// Next row: bump the row and move the row-start element index by [+16].
add.s32 %r3, %r3, 1;
add.s32 %r7, %r7, %r4;
setp.eq.s32 %p0, %r3, %r6;
@%p0 bra BB9_31;
bra.uni BB9_6;
BB9_31: // %for_exit
ret;
}

View File

@@ -1,284 +0,0 @@
//
// Generated by LLVM NVPTX Back-End
//
.version 3.1
.target sm_35, texmode_independent
.address_size 64
// .globl __vselect_i8
.func (.param .b32 func_retval0) foo1___
(
.param .align 4 .b8 foo1____param_0[4]
)
;
.extern .global .align 4 .b8 data[4096];
// @__vselect_i8
// Returns param_0 when the 32-bit mask in param_2 is zero, param_1 otherwise.
.func (.param .align 1 .b8 func_retval0[1]) __vselect_i8(
    .param .align 1 .b8 __vselect_i8_param_0[1],
    .param .align 1 .b8 __vselect_i8_param_1[1],
    .param .align 4 .b8 __vselect_i8_param_2[4]
)
{
    .reg .pred %mask_is_zero;
    .reg .s32 %mask;
    .reg .s16 %opt<2>;    // the two candidate bytes, held in 16-bit registers
    .reg .s16 %pick;

    ld.param.u8 %opt0, [__vselect_i8_param_0];
    ld.param.u8 %opt1, [__vselect_i8_param_1];
    ld.param.u32 %mask, [__vselect_i8_param_2];
    setp.eq.s32 %mask_is_zero, %mask, 0;
    selp.b16 %pick, %opt0, %opt1, %mask_is_zero;
    st.param.b8 [func_retval0+0], %pick;
    ret;
}
// .globl __vselect_i16
// Returns param_0 when the 32-bit mask in param_2 is zero, param_1 otherwise.
.func (.param .align 2 .b8 func_retval0[2]) __vselect_i16(
    .param .align 2 .b8 __vselect_i16_param_0[2],
    .param .align 2 .b8 __vselect_i16_param_1[2],
    .param .align 4 .b8 __vselect_i16_param_2[4]
) // @__vselect_i16
{
    .reg .pred %mask_is_zero;
    .reg .s32 %mask;
    .reg .s16 %opt<2>;    // the two candidate values
    .reg .s16 %pick;

    ld.param.u16 %opt0, [__vselect_i16_param_0];
    ld.param.u16 %opt1, [__vselect_i16_param_1];
    ld.param.u32 %mask, [__vselect_i16_param_2];
    setp.eq.s32 %mask_is_zero, %mask, 0;
    selp.b16 %pick, %opt0, %opt1, %mask_is_zero;
    st.param.b16 [func_retval0+0], %pick;
    ret;
}
// .globl __vselect_i64
// Returns param_0 when the 32-bit mask in param_2 is zero, param_1 otherwise.
.func (.param .align 8 .b8 func_retval0[8]) __vselect_i64(
    .param .align 8 .b8 __vselect_i64_param_0[8],
    .param .align 8 .b8 __vselect_i64_param_1[8],
    .param .align 4 .b8 __vselect_i64_param_2[4]
) // @__vselect_i64
{
    .reg .pred %mask_is_zero;
    .reg .s32 %mask;
    .reg .s64 %opt<2>;    // the two candidate values
    .reg .s64 %pick;

    ld.param.u64 %opt0, [__vselect_i64_param_0];
    ld.param.u64 %opt1, [__vselect_i64_param_1];
    ld.param.u32 %mask, [__vselect_i64_param_2];
    setp.eq.s32 %mask_is_zero, %mask, 0;
    selp.b64 %pick, %opt0, %opt1, %mask_is_zero;
    st.param.b64 [func_retval0+0], %pick;
    ret;
}
// .globl __aos_to_soa4_float1
// Writes the four 4-byte inputs param_0..param_3 through the four pointers
// param_4..param_7; input i goes to pointer i.
.func __aos_to_soa4_float1(
    .param .align 4 .b8 __aos_to_soa4_float1_param_0[4],
    .param .align 4 .b8 __aos_to_soa4_float1_param_1[4],
    .param .align 4 .b8 __aos_to_soa4_float1_param_2[4],
    .param .align 4 .b8 __aos_to_soa4_float1_param_3[4],
    .param .b64 __aos_to_soa4_float1_param_4,
    .param .b64 __aos_to_soa4_float1_param_5,
    .param .b64 __aos_to_soa4_float1_param_6,
    .param .b64 __aos_to_soa4_float1_param_7
) // @__aos_to_soa4_float1
{
    .reg .f32 %val<4>;    // the four input values
    .reg .s64 %dst<4>;    // the four destination addresses

    // Inputs first, then where they go.
    ld.param.f32 %val0, [__aos_to_soa4_float1_param_0];
    ld.param.f32 %val1, [__aos_to_soa4_float1_param_1];
    ld.param.f32 %val2, [__aos_to_soa4_float1_param_2];
    ld.param.f32 %val3, [__aos_to_soa4_float1_param_3];
    ld.param.u64 %dst0, [__aos_to_soa4_float1_param_4];
    ld.param.u64 %dst1, [__aos_to_soa4_float1_param_5];
    ld.param.u64 %dst2, [__aos_to_soa4_float1_param_6];
    ld.param.u64 %dst3, [__aos_to_soa4_float1_param_7];

    // Store order 0..3 is kept as it was.
    st.f32 [%dst0], %val0;
    st.f32 [%dst1], %val1;
    st.f32 [%dst2], %val2;
    st.f32 [%dst3], %val3;
    ret;
}
// .globl __soa_to_aos4_float1
// Writes the four 4-byte inputs param_0..param_3 through the four pointers
// param_4..param_7; input i goes to pointer i.
.func __soa_to_aos4_float1(
    .param .align 4 .b8 __soa_to_aos4_float1_param_0[4],
    .param .align 4 .b8 __soa_to_aos4_float1_param_1[4],
    .param .align 4 .b8 __soa_to_aos4_float1_param_2[4],
    .param .align 4 .b8 __soa_to_aos4_float1_param_3[4],
    .param .b64 __soa_to_aos4_float1_param_4,
    .param .b64 __soa_to_aos4_float1_param_5,
    .param .b64 __soa_to_aos4_float1_param_6,
    .param .b64 __soa_to_aos4_float1_param_7
) // @__soa_to_aos4_float1
{
    .reg .f32 %val<4>;    // the four input values
    .reg .s64 %dst<4>;    // the four destination addresses

    // Inputs first, then where they go.
    ld.param.f32 %val0, [__soa_to_aos4_float1_param_0];
    ld.param.f32 %val1, [__soa_to_aos4_float1_param_1];
    ld.param.f32 %val2, [__soa_to_aos4_float1_param_2];
    ld.param.f32 %val3, [__soa_to_aos4_float1_param_3];
    ld.param.u64 %dst0, [__soa_to_aos4_float1_param_4];
    ld.param.u64 %dst1, [__soa_to_aos4_float1_param_5];
    ld.param.u64 %dst2, [__soa_to_aos4_float1_param_6];
    ld.param.u64 %dst3, [__soa_to_aos4_float1_param_7];

    // Store order 0..3 is kept as it was.
    st.f32 [%dst0], %val0;
    st.f32 [%dst1], %val1;
    st.f32 [%dst2], %val2;
    st.f32 [%dst3], %val3;
    ret;
}
// .globl __aos_to_soa3_float1
// Writes the three 4-byte inputs param_0..param_2 through the three
// pointers param_3..param_5; input i goes to pointer i.
.func __aos_to_soa3_float1(
    .param .align 4 .b8 __aos_to_soa3_float1_param_0[4],
    .param .align 4 .b8 __aos_to_soa3_float1_param_1[4],
    .param .align 4 .b8 __aos_to_soa3_float1_param_2[4],
    .param .b64 __aos_to_soa3_float1_param_3,
    .param .b64 __aos_to_soa3_float1_param_4,
    .param .b64 __aos_to_soa3_float1_param_5
) // @__aos_to_soa3_float1
{
    .reg .f32 %val<3>;    // the three input values
    .reg .s64 %dst<3>;    // the three destination addresses

    // Inputs first, then where they go.
    ld.param.f32 %val0, [__aos_to_soa3_float1_param_0];
    ld.param.f32 %val1, [__aos_to_soa3_float1_param_1];
    ld.param.f32 %val2, [__aos_to_soa3_float1_param_2];
    ld.param.u64 %dst0, [__aos_to_soa3_float1_param_3];
    ld.param.u64 %dst1, [__aos_to_soa3_float1_param_4];
    ld.param.u64 %dst2, [__aos_to_soa3_float1_param_5];

    // Store order 0, 1, 2 is kept: nothing here says the pointers differ.
    st.f32 [%dst0], %val0;
    st.f32 [%dst1], %val1;
    st.f32 [%dst2], %val2;
    ret;
}
// .globl __soa_to_aos3_float1
// Writes the three f32 arguments (params 0-2) through the three 64-bit
// destination addresses (params 3-5): value i is stored at address 3+i.
// No value is returned.
.func __soa_to_aos3_float1(
	.param .align 4 .b8 __soa_to_aos3_float1_param_0[4],
	.param .align 4 .b8 __soa_to_aos3_float1_param_1[4],
	.param .align 4 .b8 __soa_to_aos3_float1_param_2[4],
	.param .b64 __soa_to_aos3_float1_param_3,
	.param .b64 __soa_to_aos3_float1_param_4,
	.param .b64 __soa_to_aos3_float1_param_5
) // @__soa_to_aos3_float1
{
	// Only the registers that are actually used are declared.
	.reg .s64 %dst<3>;
	.reg .f32 %val<3>;

	// Fetch every value next to the address it will be written to.
	ld.param.f32 %val0, [__soa_to_aos3_float1_param_0];
	ld.param.u64 %dst0, [__soa_to_aos3_float1_param_3];
	ld.param.f32 %val1, [__soa_to_aos3_float1_param_1];
	ld.param.u64 %dst1, [__soa_to_aos3_float1_param_4];
	ld.param.f32 %val2, [__soa_to_aos3_float1_param_2];
	ld.param.u64 %dst2, [__soa_to_aos3_float1_param_5];

	// The stores are issued in parameter order 0..2.
	st.f32 [%dst0], %val0;
	st.f32 [%dst1], %val1;
	st.f32 [%dst2], %val2;
	ret;
}
// .globl __rsqrt_varying_double
// Returns rsqrt.approx.f64 of the single f64 argument through func_retval0.
.func (.param .align 8 .b8 func_retval0[8]) __rsqrt_varying_double(
	.param .align 8 .b8 __rsqrt_varying_double_param_0[8]
) // @__rsqrt_varying_double
{
	// One register for the operand, one for the result.
	.reg .f64 %arg;
	.reg .f64 %res;

	ld.param.f64 %arg, [__rsqrt_varying_double_param_0];
	rsqrt.approx.f64 %res, %arg;
	st.param.f64 [func_retval0+0], %res;
	ret;
}
// .globl foo___
// foo___: passes its 32-bit argument to foo1___, then stores a 32-bit zero
// into the global symbol `data` at the element index returned by foo1___
// (the index is treated as signed and scaled by 4 bytes).
// `data` and foo1___ are defined outside this function.
.func foo___(
.param .align 4 .b8 foo____param_0[4]
) // @foo___
{
.reg .pred %p<396>;
.reg .s16 %rc<396>;
.reg .s16 %rs<396>;
.reg .s32 %r<396>;
.reg .s64 %rl<396>;
.reg .f32 %f<396>;
.reg .f64 %fl<396>;
// BB#0: // %allocas
// %rl0 = generic-space address of `data` (global address converted by cvta).
mov.u64 %rl0, data;
cvta.global.u64 %rl0, %rl0;
ld.param.u32 %r0, [foo____param_0];
// Callseq Start 0
// The brace below opens a scope for the call's .param temporaries; it is
// closed by the brace on the "Callseq End 0" line. The braces inside the
// "// <end>}" and "//{" lines are comment text only.
{
.reg .b32 temp_param_reg;
// <end>}
.param .b32 param0;
st.param.b32 [param0+0], %r0;
.param .b32 retval0;
call.uni (retval0),
foo1___,
(
param0
);
// %r0 is overwritten with the value returned by foo1___.
ld.param.b32 %r0, [retval0+0];
//{
}// Callseq End 0
// data[%r0] = 0 : byte offset = sign-extended %r0 * 4.
mov.u32 %r1, 0;
mul.wide.s32 %rl1, %r0, 4;
add.s64 %rl0, %rl0, %rl1;
st.u32 [%rl0], %r1;
ret;
}

View File

@@ -1,400 +0,0 @@
; ModuleID = 'test.bc'
target datalayout = "e-p:64:64:64-S0-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-v16:16:16-v32:32:32-n16:32:64"
target triple = "nvptx64"
; Function Attrs: alwaysinline nounwind readnone
; Single-lane select on i8: yields lane 0 of the first operand when lane 0 of
; %mask is zero, and lane 0 of the second operand otherwise.
define <1 x i8> @__vselect_i8(<1 x i8>, <1 x i8>, <1 x i32> %mask) #0 {
  %if_clear = extractelement <1 x i8> %0, i32 0
  %if_set = extractelement <1 x i8> %1, i32 0
  %mask_lane = extractelement <1 x i32> %mask, i32 0
  ; Test "mask != 0" and swap the select arms accordingly.
  %lane_on = icmp ne i32 %mask_lane, 0
  %picked = select i1 %lane_on, i8 %if_set, i8 %if_clear
  %packed = insertelement <1 x i8> undef, i8 %picked, i32 0
  ret <1 x i8> %packed
}
; Function Attrs: alwaysinline nounwind readnone
; Single-lane select on i16: yields lane 0 of the first operand when lane 0 of
; %mask is zero, and lane 0 of the second operand otherwise.
define <1 x i16> @__vselect_i16(<1 x i16>, <1 x i16>, <1 x i32> %mask) #0 {
  %if_clear = extractelement <1 x i16> %0, i32 0
  %if_set = extractelement <1 x i16> %1, i32 0
  %mask_lane = extractelement <1 x i32> %mask, i32 0
  ; Test "mask != 0" and swap the select arms accordingly.
  %lane_on = icmp ne i32 %mask_lane, 0
  %picked = select i1 %lane_on, i16 %if_set, i16 %if_clear
  %packed = insertelement <1 x i16> undef, i16 %picked, i32 0
  ret <1 x i16> %packed
}
; Function Attrs: alwaysinline nounwind readnone
; Single-lane select on i64: yields lane 0 of the first operand when lane 0 of
; %mask is zero, and lane 0 of the second operand otherwise.
define <1 x i64> @__vselect_i64(<1 x i64>, <1 x i64>, <1 x i32> %mask) #0 {
  %if_clear = extractelement <1 x i64> %0, i32 0
  %if_set = extractelement <1 x i64> %1, i32 0
  %mask_lane = extractelement <1 x i32> %mask, i32 0
  ; Test "mask != 0" and swap the select arms accordingly.
  %lane_on = icmp ne i32 %mask_lane, 0
  %picked = select i1 %lane_on, i64 %if_set, i64 %if_clear
  %packed = insertelement <1 x i64> undef, i64 %picked, i32 0
  ret <1 x i64> %packed
}
; Function Attrs: nounwind readnone
declare double @llvm.nvvm.rsqrt.approx.d(double) #1
; Function Attrs: alwaysinline nounwind
; Stores the four <1 x float> arguments %v0..%v3 to %out0..%out3 respectively,
; each with 4-byte alignment. The values are written unchanged; the output
; pointers are declared noalias.
define void @__aos_to_soa4_float1(<1 x float> %v0, <1 x float> %v1, <1 x float> %v2, <1 x float> %v3, <1 x float>* noalias nocapture %out0, <1 x float>* noalias nocapture %out1, <1 x float>* noalias nocapture %out2, <1 x float>* noalias nocapture %out3) #2 {
store <1 x float> %v0, <1 x float>* %out0, align 4
store <1 x float> %v1, <1 x float>* %out1, align 4
store <1 x float> %v2, <1 x float>* %out2, align 4
store <1 x float> %v3, <1 x float>* %out3, align 4
ret void
}
; Function Attrs: alwaysinline nounwind
; Stores the four <1 x float> arguments %v0..%v3 to %out0..%out3 respectively,
; each with 4-byte alignment. The values are written unchanged; the output
; pointers are declared noalias.
define void @__soa_to_aos4_float1(<1 x float> %v0, <1 x float> %v1, <1 x float> %v2, <1 x float> %v3, <1 x float>* noalias nocapture %out0, <1 x float>* noalias nocapture %out1, <1 x float>* noalias nocapture %out2, <1 x float>* noalias nocapture %out3) #2 {
store <1 x float> %v0, <1 x float>* %out0, align 4
store <1 x float> %v1, <1 x float>* %out1, align 4
store <1 x float> %v2, <1 x float>* %out2, align 4
store <1 x float> %v3, <1 x float>* %out3, align 4
ret void
}
; Function Attrs: nounwind
; Stores the three <1 x float> arguments %v0..%v2 to %out0..%out2 respectively,
; each with 4-byte alignment. The output pointers are not marked noalias here,
; so the order of the stores is observable if they overlap.
define void @__aos_to_soa3_float1(<1 x float> %v0, <1 x float> %v1, <1 x float> %v2, <1 x float>* nocapture %out0, <1 x float>* nocapture %out1, <1 x float>* nocapture %out2) #3 {
store <1 x float> %v0, <1 x float>* %out0, align 4
store <1 x float> %v1, <1 x float>* %out1, align 4
store <1 x float> %v2, <1 x float>* %out2, align 4
ret void
}
; Function Attrs: nounwind
; Stores the three <1 x float> arguments %v0..%v2 to %out0..%out2 respectively,
; each with 4-byte alignment. The output pointers are not marked noalias here,
; so the order of the stores is observable if they overlap.
define void @__soa_to_aos3_float1(<1 x float> %v0, <1 x float> %v1, <1 x float> %v2, <1 x float>* nocapture %out0, <1 x float>* nocapture %out1, <1 x float>* nocapture %out2) #3 {
store <1 x float> %v0, <1 x float>* %out0, align 4
store <1 x float> %v1, <1 x float>* %out1, align 4
store <1 x float> %v2, <1 x float>* %out2, align 4
ret void
}
; Function Attrs: alwaysinline nounwind readonly
; Applies the llvm.nvvm.rsqrt.approx.d intrinsic to the single lane of %v and
; returns the result as a one-element vector.
define <1 x double> @__rsqrt_varying_double(<1 x double> %v) #4 {
  %lane = extractelement <1 x double> %v, i32 0
  %approx = tail call double @llvm.nvvm.rsqrt.approx.d(double %lane)
  %packed = insertelement <1 x double> undef, double %approx, i32 0
  ret <1 x double> %packed
}
; Function Attrs: nounwind
declare i32 @getBlockIndex0___(<1 x i32>) #5
; Function Attrs: nounwind
declare i32 @getBlockIndex1___(<1 x i32>) #5
; Function Attrs: nounwind
declare i32 @getLaneIndex___(<1 x i32>) #5
; Function Attrs: nounwind
; Mandelbrot tile kernel (single-lane vector width).
; Argument %0 points to a struct whose fields are, in order: x0, dx, y0, dy
; (float), width, height, xspan, yspan, maxIterations (i32), output (i32*) and
; the execution mask (<1 x i32>). The ten trailing i32 arguments are not
; referenced anywhere in this body.
; The tile is [blockIndex0*xspan, min(+xspan, width)) by
; [blockIndex1*yspan, min(+yspan, height)); x advances in steps of 32 with the
; lane index added to it.
; The body is specialised twice: on whether the mask lane's sign bit is set
; (all_on / some_on) and on whether maxIterations > 0 (".us" blocks / plain
; blocks).
define void @mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E_({ float, float, float, float, i32, i32, i32, i32, i32, i32*, <1 x i32> }* noalias nocapture, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) #5 {
allocas:
; Unpack every field of the parameter struct.
%x01 = getelementptr { float, float, float, float, i32, i32, i32, i32, i32, i32*, <1 x i32> }* %0, i64 0, i32 0
%x02 = load float* %x01, align 4
%dx3 = getelementptr { float, float, float, float, i32, i32, i32, i32, i32, i32*, <1 x i32> }* %0, i64 0, i32 1
%dx4 = load float* %dx3, align 4
%y05 = getelementptr { float, float, float, float, i32, i32, i32, i32, i32, i32*, <1 x i32> }* %0, i64 0, i32 2
%y06 = load float* %y05, align 4
%dy7 = getelementptr { float, float, float, float, i32, i32, i32, i32, i32, i32*, <1 x i32> }* %0, i64 0, i32 3
%dy8 = load float* %dy7, align 4
%width9 = getelementptr { float, float, float, float, i32, i32, i32, i32, i32, i32*, <1 x i32> }* %0, i64 0, i32 4
%width10 = load i32* %width9, align 4
%height11 = getelementptr { float, float, float, float, i32, i32, i32, i32, i32, i32*, <1 x i32> }* %0, i64 0, i32 5
%height12 = load i32* %height11, align 4
%xspan13 = getelementptr { float, float, float, float, i32, i32, i32, i32, i32, i32*, <1 x i32> }* %0, i64 0, i32 6
%xspan14 = load i32* %xspan13, align 4
%yspan15 = getelementptr { float, float, float, float, i32, i32, i32, i32, i32, i32*, <1 x i32> }* %0, i64 0, i32 7
%yspan16 = load i32* %yspan15, align 4
%maxIterations17 = getelementptr { float, float, float, float, i32, i32, i32, i32, i32, i32*, <1 x i32> }* %0, i64 0, i32 8
%maxIterations18 = load i32* %maxIterations17, align 4
%output19 = getelementptr { float, float, float, float, i32, i32, i32, i32, i32, i32*, <1 x i32> }* %0, i64 0, i32 9
%output20 = load i32** %output19, align 8
%task_struct_mask = getelementptr { float, float, float, float, i32, i32, i32, i32, i32, i32*, <1 x i32> }* %0, i64 0, i32 10
%mask = load <1 x i32>* %task_struct_mask, align 4
; A negative mask lane (sign bit set) selects the all_on specialisation.
%item.i = extractelement <1 x i32> %mask, i32 0
%cmp.i = icmp slt i32 %item.i, 0
br i1 %cmp.i, label %all_on, label %some_on
all_on: ; preds = %allocas
; xstart = blockIndex0 * xspan, xend = min(xstart + xspan, width);
; ystart = blockIndex1 * yspan, yend = min(ystart + yspan, height).
%calltmp = call i32 @getBlockIndex0___(<1 x i32> <i32 -1>)
%mul_calltmp_xspan_load = mul i32 %calltmp, %xspan14
%add_xstart_load_xspan_load25 = add i32 %mul_calltmp_xspan_load, %xspan14
%c.i.i = icmp slt i32 %add_xstart_load_xspan_load25, %width10
%r.i.i = select i1 %c.i.i, i32 %add_xstart_load_xspan_load25, i32 %width10
%calltmp31 = call i32 @getBlockIndex1___(<1 x i32> <i32 -1>)
%mul_calltmp31_yspan_load = mul i32 %calltmp31, %yspan16
%add_ystart_load_yspan_load32 = add i32 %mul_calltmp31_yspan_load, %yspan16
%c.i.i166 = icmp slt i32 %add_ystart_load_yspan_load32, %height12
%r.i.i167 = select i1 %c.i.i166, i32 %add_ystart_load_yspan_load32, i32 %height12
%less_yi_load_yend_load294 = icmp slt i32 %mul_calltmp31_yspan_load, %r.i.i167
br i1 %less_yi_load_yend_load294, label %for_test40.preheader.lr.ph, label %for_exit
for_test40.preheader.lr.ph: ; preds = %all_on
; Loop-invariant setup for the all_on row loop.
; %cmp.i.i182286 is true exactly when maxIterations > 0.
; %16 = ~smax(~height, ~(yspan * (blockIndex1 + 1))), i.e. the smaller of
; height and yspan * (blockIndex1 + 1); it is the row-loop exit bound.
%less_xi_load_xend_load292 = icmp slt i32 %mul_calltmp_xspan_load, %r.i.i
%maxIterations_load_broadcast_init = insertelement <1 x i32> undef, i32 %maxIterations18, i32 0
%less_i_load_count_load.i179283 = icmp sgt <1 x i32> %maxIterations_load_broadcast_init, zeroinitializer
%"oldMask&test.i180284" = select <1 x i1> %less_i_load_count_load.i179283, <1 x i32> <i32 -1>, <1 x i32> zeroinitializer
%item.i.i181285 = extractelement <1 x i32> %"oldMask&test.i180284", i32 0
%cmp.i.i182286 = icmp slt i32 %item.i.i181285, 0
%output_load_ptr2int = ptrtoint i32* %output20 to i64
%11 = xor i32 %height12, -1
%12 = add i32 %calltmp31, 1
%13 = mul i32 %yspan16, %12
%14 = xor i32 %13, -1
%15 = icmp sgt i32 %11, %14
%smax = select i1 %15, i32 %11, i32 %14
%16 = xor i32 %smax, -1
br label %for_test40.preheader
some_on: ; preds = %allocas
; Same tile-bound computation as all_on, but the helper calls receive the
; runtime %mask instead of an all-ones mask.
%calltmp80 = call i32 @getBlockIndex0___(<1 x i32> %mask)
%mul_calltmp80_xspan_load81 = mul i32 %calltmp80, %xspan14
%add_xstart_load83_xspan_load84 = add i32 %mul_calltmp80_xspan_load81, %xspan14
%c.i.i168 = icmp slt i32 %add_xstart_load83_xspan_load84, %width10
%r.i.i169 = select i1 %c.i.i168, i32 %add_xstart_load83_xspan_load84, i32 %width10
%calltmp92 = call i32 @getBlockIndex1___(<1 x i32> %mask)
%mul_calltmp92_yspan_load93 = mul i32 %calltmp92, %yspan16
%add_ystart_load95_yspan_load96 = add i32 %mul_calltmp92_yspan_load93, %yspan16
%c.i.i170 = icmp slt i32 %add_ystart_load95_yspan_load96, %height12
%r.i.i171 = select i1 %c.i.i170, i32 %add_ystart_load95_yspan_load96, i32 %height12
%less_yi_load108_yend_load109309 = icmp slt i32 %mul_calltmp92_yspan_load93, %r.i.i171
br i1 %less_yi_load108_yend_load109309, label %for_test112.preheader.lr.ph, label %for_exit
for_test112.preheader.lr.ph: ; preds = %some_on
; Loop-invariant setup for the some_on row loop.
; %cmp.i.i300 is true when maxIterations > 0 and the mask lane's sign bit is set.
; %22 is the row-loop exit bound, built the same way as %16 above.
%less_xi_load119_xend_load120306 = icmp slt i32 %mul_calltmp80_xspan_load81, %r.i.i169
%maxIterations_load137_broadcast_init = insertelement <1 x i32> undef, i32 %maxIterations18, i32 0
%less_i_load_count_load.i296 = icmp sgt <1 x i32> %maxIterations_load137_broadcast_init, zeroinitializer
%"oldMask&test.i297" = select <1 x i1> %less_i_load_count_load.i296, <1 x i32> <i32 -1>, <1 x i32> zeroinitializer
%"internal_mask&function_mask10.i298" = and <1 x i32> %"oldMask&test.i297", %mask
%item.i.i299 = extractelement <1 x i32> %"internal_mask&function_mask10.i298", i32 0
%cmp.i.i300 = icmp slt i32 %item.i.i299, 0
%17 = xor i32 %height12, -1
%18 = add i32 %calltmp92, 1
%19 = mul i32 %yspan16, %18
%20 = xor i32 %19, -1
%21 = icmp sgt i32 %17, %20
%smax311 = select i1 %21, i32 %17, i32 %20
%22 = xor i32 %smax311, -1
br label %for_test112.preheader
for_test40.preheader: ; preds = %for_exit43, %for_test40.preheader.lr.ph
; all_on row loop header: %yi.0295 is the current row.
%yi.0295 = phi i32 [ %mul_calltmp31_yspan_load, %for_test40.preheader.lr.ph ], [ %yi_load74_plus1, %for_exit43 ]
br i1 %less_xi_load_xend_load292, label %for_loop42.lr.ph, label %for_exit43
for_loop42.lr.ph: ; preds = %for_test40.preheader
; Per-row values: y0 + dy * float(yi), and the row offset yi * width.
; Then pick the column-loop variant according to maxIterations > 0.
%yi_load52_to_float = sitofp i32 %yi.0295 to float
%mul_yi_load52_to_float_dy_load = fmul float %dy8, %yi_load52_to_float
%add_y0_load_mul_yi_load52_to_float_dy_load = fadd float %y06, %mul_yi_load52_to_float_dy_load
%add_y0_load_mul_yi_load52_to_float_dy_load_broadcast_init = insertelement <1 x float> undef, float %add_y0_load_mul_yi_load52_to_float_dy_load, i32 0
%mul_yi_load56_width_load57 = mul i32 %yi.0295, %width10
br i1 %cmp.i.i182286, label %for_loop.i204.lr.ph.us, label %mandel___vyfvyfvyi.exit219
mandel___vyfvyfvyi.exit219.us: ; preds = %for_step.i187.us
; Iteration finished for this pixel: write the count if xi + lane < xend.
%calltmp61.us = call i32 @getLaneIndex___(<1 x i32> <i32 -1>)
%calltmp65.us = call i32 @getLaneIndex___(<1 x i32> <i32 -1>)
%add_xi_load62_calltmp65.us = add i32 %calltmp65.us, %xi.0293.us
%less_add_xi_load62_calltmp65_xend_load66.us = icmp slt i32 %add_xi_load62_calltmp65.us, %r.i.i
br i1 %less_add_xi_load62_calltmp65_xend_load66.us, label %if_then.us, label %if_exit.us
if_then.us: ; preds = %mandel___vyfvyfvyi.exit219.us
; Store the iteration count at output + 4 * (yi * width + xi + lane).
%add_xi_load58_calltmp61.us = add i32 %xi.0293.us, %mul_yi_load56_width_load57
%add_mul_yi_load56_width_load57_add_xi_load58_calltmp61.us = add i32 %add_xi_load58_calltmp61.us, %calltmp61.us
%23 = shl i32 %add_mul_yi_load56_width_load57_add_xi_load58_calltmp61.us, 2
%iptr__id.i239.rhs.us = sext i32 %23 to i64
%iptr__id.i239.us = add i64 %iptr__id.i239.rhs.us, %output_load_ptr2int
%ptr__id.i240.us = inttoptr i64 %iptr__id.i239.us to i32*
store i32 %sel.i.i266.us, i32* %ptr__id.i240.us, align 4
br label %if_exit.us
if_exit.us: ; preds = %if_then.us, %mandel___vyfvyfvyi.exit219.us
; Advance xi by 32 and continue while xi < xend.
%add_xi_load73_.us = add i32 %xi.0293.us, 32
%less_xi_load_xend_load.us = icmp slt i32 %add_xi_load73_.us, %r.i.i
br i1 %less_xi_load_xend_load.us, label %for_loop.i204.lr.ph.us, label %for_exit43
for_loop.i204.us: ; preds = %for_loop.i204.lr.ph.us, %for_step.i187.us
; Inner iteration. The phis carry the loop mask, the accumulated break lanes,
; the iteration counter and the two iterated float values. The lane is marked
; as broken when the sum of squares compares "ugt" 4.0.
%"oldMask&test.i180291.us" = phi <1 x i32> [ %"oldMask&test.i180284", %for_loop.i204.lr.ph.us ], [ %"oldMask&test.i180.us", %for_step.i187.us ]
%break_lanes_memory.0.i176290.us = phi <1 x i32> [ zeroinitializer, %for_loop.i204.lr.ph.us ], [ %"mask|break_mask.i195.us", %for_step.i187.us ]
%r.i.i267270289.us = phi <1 x i32> [ zeroinitializer, %for_loop.i204.lr.ph.us ], [ %r.i.i267.us, %for_step.i187.us ]
%add_x0_load_mul_add_xi_load48_calltmp51_to_float_dx_load_broadcast_init276288.us = phi <1 x float> [ %add_x0_load_mul_add_xi_load48_calltmp51_to_float_dx_load_broadcast_init.us, %for_loop.i204.lr.ph.us ], [ %add_x0_load_mul_add_xi_load48_calltmp51_to_float_dx_load_broadcast_init275.us, %for_step.i187.us ]
%add_y0_load_mul_yi_load52_to_float_dy_load_broadcast_init278287.us = phi <1 x float> [ %add_y0_load_mul_yi_load52_to_float_dy_load_broadcast_init, %for_loop.i204.lr.ph.us ], [ %add_y0_load_mul_yi_load52_to_float_dy_load_broadcast_init277.us, %for_step.i187.us ]
%mul_z_re_load_z_re_load13.i189.us = fmul <1 x float> %add_x0_load_mul_add_xi_load48_calltmp51_to_float_dx_load_broadcast_init276288.us, %add_x0_load_mul_add_xi_load48_calltmp51_to_float_dx_load_broadcast_init276288.us
%mul_z_im_load_z_im_load14.i191.us = fmul <1 x float> %add_y0_load_mul_yi_load52_to_float_dy_load_broadcast_init278287.us, %add_y0_load_mul_yi_load52_to_float_dy_load_broadcast_init278287.us
%add_mul_z_re_load_z_re_load13_mul_z_im_load_z_im_load14.i192.us = fadd <1 x float> %mul_z_im_load_z_im_load14.i191.us, %mul_z_re_load_z_re_load13.i189.us
%greater_add_mul_z_re_load_z_re_load13_mul_z_im_load_z_im_load14_.i193.us = fcmp ugt <1 x float> %add_mul_z_re_load_z_re_load13_mul_z_im_load_z_im_load14.i192.us, <float 4.000000e+00>
%"oldMask&test16.i194.us" = select <1 x i1> %greater_add_mul_z_re_load_z_re_load13_mul_z_im_load_z_im_load14_.i193.us, <1 x i32> %"oldMask&test.i180291.us", <1 x i32> zeroinitializer
%"mask|break_mask.i195.us" = or <1 x i32> %"oldMask&test16.i194.us", %break_lanes_memory.0.i176290.us
%item.i63.i197.us = extractelement <1 x i32> %"mask|break_mask.i195.us", i32 0
%v.i64.i198.us = lshr i32 %item.i63.i197.us, 31
%item.i62.i200.us = extractelement <1 x i32> %"oldMask&test.i180291.us", i32 0
%v.i.i201.us = lshr i32 %item.i62.i200.us, 31
%"equal_finished&func_internal_mask&function_mask12.i203.us" = icmp eq i32 %v.i64.i198.us, %v.i.i201.us
br i1 %"equal_finished&func_internal_mask&function_mask12.i203.us", label %for_step.i187.us, label %not_all_continued_or_breaked.i218.us
not_all_continued_or_breaked.i218.us: ; preds = %for_loop.i204.us
; Lane still running: drop broken lanes from the mask and compute the next
; pair of values (difference of squares plus the x term; doubled product plus
; the y term).
%"!(break|continue)_lanes.i207.us" = xor <1 x i32> %"mask|break_mask.i195.us", <i32 -1>
%new_mask28.i208.us = and <1 x i32> %"oldMask&test.i180291.us", %"!(break|continue)_lanes.i207.us"
%sub_mul_z_re_load31_z_re_load32_mul_z_im_load33_z_im_load34.i213.us = fsub <1 x float> %mul_z_re_load_z_re_load13.i189.us, %mul_z_im_load_z_im_load14.i191.us
%mul__z_re_load35.i214.us = fmul <1 x float> %add_x0_load_mul_add_xi_load48_calltmp51_to_float_dx_load_broadcast_init276288.us, <float 2.000000e+00>
%mul_mul__z_re_load35_z_im_load36.i215.us = fmul <1 x float> %add_y0_load_mul_yi_load52_to_float_dy_load_broadcast_init278287.us, %mul__z_re_load35.i214.us
%add_c_re_load42_new_re_load.i216.us = fadd <1 x float> %add_x0_load_mul_add_xi_load48_calltmp51_to_float_dx_load_broadcast_init.us, %sub_mul_z_re_load31_z_re_load32_mul_z_im_load33_z_im_load34.i213.us
%add_c_im_load44_new_im_load.i217.us = fadd <1 x float> %add_y0_load_mul_yi_load52_to_float_dy_load_broadcast_init, %mul_mul__z_re_load35_z_im_load36.i215.us
br label %for_step.i187.us
for_step.i187.us: ; preds = %not_all_continued_or_breaked.i218.us, %for_loop.i204.us
; Loop step: the counter grows by 1 only while the internal mask is non-zero;
; the loop continues while counter < maxIterations and the mask lane is set.
%add_y0_load_mul_yi_load52_to_float_dy_load_broadcast_init277.us = phi <1 x float> [ %add_y0_load_mul_yi_load52_to_float_dy_load_broadcast_init278287.us, %for_loop.i204.us ], [ %add_c_im_load44_new_im_load.i217.us, %not_all_continued_or_breaked.i218.us ]
%add_x0_load_mul_add_xi_load48_calltmp51_to_float_dx_load_broadcast_init275.us = phi <1 x float> [ %add_x0_load_mul_add_xi_load48_calltmp51_to_float_dx_load_broadcast_init276288.us, %for_loop.i204.us ], [ %add_c_re_load42_new_re_load.i216.us, %not_all_continued_or_breaked.i218.us ]
%internal_mask_memory.1.i184.us = phi <1 x i32> [ zeroinitializer, %for_loop.i204.us ], [ %new_mask28.i208.us, %not_all_continued_or_breaked.i218.us ]
%m.i.i262.us = extractelement <1 x i32> %internal_mask_memory.1.i184.us, i32 0
%d0.i.i264.us = extractelement <1 x i32> %r.i.i267270289.us, i32 0
%not.cmp.i.i263.us = icmp ne i32 %m.i.i262.us, 0
%d1.i.i265.us = zext i1 %not.cmp.i.i263.us to i32
%sel.i.i266.us = add i32 %d0.i.i264.us, %d1.i.i265.us
%r.i.i267.us = insertelement <1 x i32> undef, i32 %sel.i.i266.us, i32 0
%less_i_load_count_load.i179.us = icmp slt <1 x i32> %r.i.i267.us, %maxIterations_load_broadcast_init
%"oldMask&test.i180.us" = select <1 x i1> %less_i_load_count_load.i179.us, <1 x i32> %internal_mask_memory.1.i184.us, <1 x i32> zeroinitializer
%item.i.i181.us = extractelement <1 x i32> %"oldMask&test.i180.us", i32 0
%cmp.i.i182.us = icmp slt i32 %item.i.i181.us, 0
br i1 %cmp.i.i182.us, label %for_loop.i204.us, label %mandel___vyfvyfvyi.exit219.us
for_loop.i204.lr.ph.us: ; preds = %if_exit.us, %for_loop42.lr.ph
; Column loop header (maxIterations > 0): x0 + dx * float(xi + laneIndex).
%xi.0293.us = phi i32 [ %add_xi_load73_.us, %if_exit.us ], [ %mul_calltmp_xspan_load, %for_loop42.lr.ph ]
%calltmp51.us = call i32 @getLaneIndex___(<1 x i32> <i32 -1>)
%add_xi_load48_calltmp51.us = add i32 %calltmp51.us, %xi.0293.us
%add_xi_load48_calltmp51_to_float.us = sitofp i32 %add_xi_load48_calltmp51.us to float
%mul_add_xi_load48_calltmp51_to_float_dx_load.us = fmul float %dx4, %add_xi_load48_calltmp51_to_float.us
%add_x0_load_mul_add_xi_load48_calltmp51_to_float_dx_load.us = fadd float %x02, %mul_add_xi_load48_calltmp51_to_float_dx_load.us
%add_x0_load_mul_add_xi_load48_calltmp51_to_float_dx_load_broadcast_init.us = insertelement <1 x float> undef, float %add_x0_load_mul_add_xi_load48_calltmp51_to_float_dx_load.us, i32 0
br label %for_loop.i204.us
for_exit: ; preds = %for_exit115, %for_exit43, %some_on, %all_on
ret void
mandel___vyfvyfvyi.exit219: ; preds = %if_exit, %for_loop42.lr.ph
; Column loop for the maxIterations <= 0 case: no iteration is performed and
; the value written below is the constant 0.
%xi.0293 = phi i32 [ %add_xi_load73_, %if_exit ], [ %mul_calltmp_xspan_load, %for_loop42.lr.ph ]
%calltmp51 = call i32 @getLaneIndex___(<1 x i32> <i32 -1>)
%calltmp61 = call i32 @getLaneIndex___(<1 x i32> <i32 -1>)
%calltmp65 = call i32 @getLaneIndex___(<1 x i32> <i32 -1>)
%add_xi_load62_calltmp65 = add i32 %calltmp65, %xi.0293
%less_add_xi_load62_calltmp65_xend_load66 = icmp slt i32 %add_xi_load62_calltmp65, %r.i.i
br i1 %less_add_xi_load62_calltmp65_xend_load66, label %if_then, label %if_exit
for_exit43: ; preds = %if_exit, %if_exit.us, %for_test40.preheader
; End of a row (all_on): next row, stop when the row index reaches %16.
%yi_load74_plus1 = add i32 %yi.0295, 1
%exitcond = icmp eq i32 %yi_load74_plus1, %16
br i1 %exitcond, label %for_exit, label %for_test40.preheader
if_then: ; preds = %mandel___vyfvyfvyi.exit219
%add_xi_load58_calltmp61 = add i32 %xi.0293, %mul_yi_load56_width_load57
%add_mul_yi_load56_width_load57_add_xi_load58_calltmp61 = add i32 %add_xi_load58_calltmp61, %calltmp61
%24 = shl i32 %add_mul_yi_load56_width_load57_add_xi_load58_calltmp61, 2
%iptr__id.i239.rhs = sext i32 %24 to i64
%iptr__id.i239 = add i64 %iptr__id.i239.rhs, %output_load_ptr2int
%ptr__id.i240 = inttoptr i64 %iptr__id.i239 to i32*
store i32 0, i32* %ptr__id.i240, align 4
br label %if_exit
if_exit: ; preds = %if_then, %mandel___vyfvyfvyi.exit219
%add_xi_load73_ = add i32 %xi.0293, 32
%less_xi_load_xend_load = icmp slt i32 %add_xi_load73_, %r.i.i
br i1 %less_xi_load_xend_load, label %mandel___vyfvyfvyi.exit219, label %for_exit43
for_test112.preheader: ; preds = %for_exit115, %for_test112.preheader.lr.ph
; some_on row loop header: %yi106.0310 is the current row.
; NOTE(review): none of the some_on blocks below contains a store to the
; output buffer, unlike the all_on path — confirm this is intended.
%yi106.0310 = phi i32 [ %mul_calltmp92_yspan_load93, %for_test112.preheader.lr.ph ], [ %yi_load165_plus1, %for_exit115 ]
br i1 %less_xi_load119_xend_load120306, label %for_loop114.lr.ph, label %for_exit115
for_loop114.lr.ph: ; preds = %for_test112.preheader
%yi_load132_to_float = sitofp i32 %yi106.0310 to float
%mul_yi_load132_to_float_dy_load133 = fmul float %dy8, %yi_load132_to_float
%add_y0_load131_mul_yi_load132_to_float_dy_load133 = fadd float %y06, %mul_yi_load132_to_float_dy_load133
%add_y0_load131_mul_yi_load132_to_float_dy_load133_broadcast_init = insertelement <1 x float> undef, float %add_y0_load131_mul_yi_load132_to_float_dy_load133, i32 0
br i1 %cmp.i.i300, label %for_loop.i.lr.ph.us, label %if_exit156
if_exit156.us: ; preds = %for_step.i.us
%calltmp147.us = call i32 @getLaneIndex___(<1 x i32> %mask)
%calltmp151.us = call i32 @getLaneIndex___(<1 x i32> %mask)
%add_xi117_load_.us = add i32 %xi117.0307.us, 32
%less_xi_load119_xend_load120.us = icmp slt i32 %add_xi117_load_.us, %r.i.i169
br i1 %less_xi_load119_xend_load120.us, label %for_loop.i.lr.ph.us, label %for_exit115
for_loop.i.us: ; preds = %for_loop.i.lr.ph.us, %for_step.i.us
; Inner iteration of the some_on path: same computation as for_loop.i204.us,
; with the loop mask and the break mask additionally ANDed with %mask.
%"oldMask&test.i304.us" = phi <1 x i32> [ %"oldMask&test.i297", %for_loop.i.lr.ph.us ], [ %"oldMask&test.i.us", %for_step.i.us ]
%break_lanes_memory.0.i303.us = phi <1 x i32> [ zeroinitializer, %for_loop.i.lr.ph.us ], [ %"mask|break_mask.i.us", %for_step.i.us ]
%25 = phi <1 x i32> [ zeroinitializer, %for_loop.i.lr.ph.us ], [ %r.i.i236.us, %for_step.i.us ]
%add_x0_load124_mul_add_xi_load125_calltmp128_to_float_dx_load129_broadcast_init280302.us = phi <1 x float> [ %add_x0_load124_mul_add_xi_load125_calltmp128_to_float_dx_load129_broadcast_init.us, %for_loop.i.lr.ph.us ], [ %add_x0_load124_mul_add_xi_load125_calltmp128_to_float_dx_load129_broadcast_init279.us, %for_step.i.us ]
%add_y0_load131_mul_yi_load132_to_float_dy_load133_broadcast_init282301.us = phi <1 x float> [ %add_y0_load131_mul_yi_load132_to_float_dy_load133_broadcast_init, %for_loop.i.lr.ph.us ], [ %add_y0_load131_mul_yi_load132_to_float_dy_load133_broadcast_init281.us, %for_step.i.us ]
%"internal_mask&function_mask12.i.us" = and <1 x i32> %"oldMask&test.i304.us", %mask
%mul_z_re_load_z_re_load13.i.us = fmul <1 x float> %add_x0_load124_mul_add_xi_load125_calltmp128_to_float_dx_load129_broadcast_init280302.us, %add_x0_load124_mul_add_xi_load125_calltmp128_to_float_dx_load129_broadcast_init280302.us
%mul_z_im_load_z_im_load14.i.us = fmul <1 x float> %add_y0_load131_mul_yi_load132_to_float_dy_load133_broadcast_init282301.us, %add_y0_load131_mul_yi_load132_to_float_dy_load133_broadcast_init282301.us
%add_mul_z_re_load_z_re_load13_mul_z_im_load_z_im_load14.i.us = fadd <1 x float> %mul_z_im_load_z_im_load14.i.us, %mul_z_re_load_z_re_load13.i.us
%greater_add_mul_z_re_load_z_re_load13_mul_z_im_load_z_im_load14_.i.us = fcmp ugt <1 x float> %add_mul_z_re_load_z_re_load13_mul_z_im_load_z_im_load14.i.us, <float 4.000000e+00>
%"oldMask&test16.i.us" = select <1 x i1> %greater_add_mul_z_re_load_z_re_load13_mul_z_im_load_z_im_load14_.i.us, <1 x i32> %"oldMask&test.i304.us", <1 x i32> zeroinitializer
%"mask|break_mask.i.us" = or <1 x i32> %"oldMask&test16.i.us", %break_lanes_memory.0.i303.us
%"finished&func.i.us" = and <1 x i32> %"mask|break_mask.i.us", %mask
%item.i63.i.us = extractelement <1 x i32> %"finished&func.i.us", i32 0
%v.i64.i.us = lshr i32 %item.i63.i.us, 31
%item.i62.i.us = extractelement <1 x i32> %"internal_mask&function_mask12.i.us", i32 0
%v.i.i.us = lshr i32 %item.i62.i.us, 31
%"equal_finished&func_internal_mask&function_mask12.i.us" = icmp eq i32 %v.i64.i.us, %v.i.i.us
br i1 %"equal_finished&func_internal_mask&function_mask12.i.us", label %for_step.i.us, label %not_all_continued_or_breaked.i.us
not_all_continued_or_breaked.i.us: ; preds = %for_loop.i.us
%"!(break|continue)_lanes.i.us" = xor <1 x i32> %"mask|break_mask.i.us", <i32 -1>
%new_mask28.i.us = and <1 x i32> %"oldMask&test.i304.us", %"!(break|continue)_lanes.i.us"
%sub_mul_z_re_load31_z_re_load32_mul_z_im_load33_z_im_load34.i.us = fsub <1 x float> %mul_z_re_load_z_re_load13.i.us, %mul_z_im_load_z_im_load14.i.us
%mul__z_re_load35.i.us = fmul <1 x float> %add_x0_load124_mul_add_xi_load125_calltmp128_to_float_dx_load129_broadcast_init280302.us, <float 2.000000e+00>
%mul_mul__z_re_load35_z_im_load36.i.us = fmul <1 x float> %add_y0_load131_mul_yi_load132_to_float_dy_load133_broadcast_init282301.us, %mul__z_re_load35.i.us
%add_c_re_load42_new_re_load.i.us = fadd <1 x float> %add_x0_load124_mul_add_xi_load125_calltmp128_to_float_dx_load129_broadcast_init.us, %sub_mul_z_re_load31_z_re_load32_mul_z_im_load33_z_im_load34.i.us
%add_c_im_load44_new_im_load.i.us = fadd <1 x float> %add_y0_load131_mul_yi_load132_to_float_dy_load133_broadcast_init, %mul_mul__z_re_load35_z_im_load36.i.us
br label %for_step.i.us
for_step.i.us: ; preds = %not_all_continued_or_breaked.i.us, %for_loop.i.us
%add_y0_load131_mul_yi_load132_to_float_dy_load133_broadcast_init281.us = phi <1 x float> [ %add_y0_load131_mul_yi_load132_to_float_dy_load133_broadcast_init282301.us, %for_loop.i.us ], [ %add_c_im_load44_new_im_load.i.us, %not_all_continued_or_breaked.i.us ]
%add_x0_load124_mul_add_xi_load125_calltmp128_to_float_dx_load129_broadcast_init279.us = phi <1 x float> [ %add_x0_load124_mul_add_xi_load125_calltmp128_to_float_dx_load129_broadcast_init280302.us, %for_loop.i.us ], [ %add_c_re_load42_new_re_load.i.us, %not_all_continued_or_breaked.i.us ]
%internal_mask_memory.1.i.us = phi <1 x i32> [ zeroinitializer, %for_loop.i.us ], [ %new_mask28.i.us, %not_all_continued_or_breaked.i.us ]
%m.i.i.us = extractelement <1 x i32> %internal_mask_memory.1.i.us, i32 0
%d0.i.i234.us = extractelement <1 x i32> %25, i32 0
%not.cmp.i.i233.us = icmp ne i32 %m.i.i.us, 0
%d1.i.i235.us = zext i1 %not.cmp.i.i233.us to i32
%sel.i.i.us = add i32 %d0.i.i234.us, %d1.i.i235.us
%r.i.i236.us = insertelement <1 x i32> undef, i32 %sel.i.i.us, i32 0
%less_i_load_count_load.i.us = icmp slt <1 x i32> %r.i.i236.us, %maxIterations_load137_broadcast_init
%"oldMask&test.i.us" = select <1 x i1> %less_i_load_count_load.i.us, <1 x i32> %internal_mask_memory.1.i.us, <1 x i32> zeroinitializer
%"internal_mask&function_mask10.i.us" = and <1 x i32> %"oldMask&test.i.us", %mask
%item.i.i.us = extractelement <1 x i32> %"internal_mask&function_mask10.i.us", i32 0
%cmp.i.i.us = icmp slt i32 %item.i.i.us, 0
br i1 %cmp.i.i.us, label %for_loop.i.us, label %if_exit156.us
for_loop.i.lr.ph.us: ; preds = %if_exit156.us, %for_loop114.lr.ph
%xi117.0307.us = phi i32 [ %add_xi117_load_.us, %if_exit156.us ], [ %mul_calltmp80_xspan_load81, %for_loop114.lr.ph ]
%calltmp128.us = call i32 @getLaneIndex___(<1 x i32> %mask)
%add_xi_load125_calltmp128.us = add i32 %calltmp128.us, %xi117.0307.us
%add_xi_load125_calltmp128_to_float.us = sitofp i32 %add_xi_load125_calltmp128.us to float
%mul_add_xi_load125_calltmp128_to_float_dx_load129.us = fmul float %dx4, %add_xi_load125_calltmp128_to_float.us
%add_x0_load124_mul_add_xi_load125_calltmp128_to_float_dx_load129.us = fadd float %x02, %mul_add_xi_load125_calltmp128_to_float_dx_load129.us
%add_x0_load124_mul_add_xi_load125_calltmp128_to_float_dx_load129_broadcast_init.us = insertelement <1 x float> undef, float %add_x0_load124_mul_add_xi_load125_calltmp128_to_float_dx_load129.us, i32 0
br label %for_loop.i.us
for_exit115: ; preds = %if_exit156, %if_exit156.us, %for_test112.preheader
; End of a row (some_on): next row, stop when the row index reaches %22.
%yi_load165_plus1 = add i32 %yi106.0310, 1
%exitcond312 = icmp eq i32 %yi_load165_plus1, %22
br i1 %exitcond312, label %for_exit, label %for_test112.preheader
if_exit156: ; preds = %if_exit156, %for_loop114.lr.ph
; some_on column loop taken when %cmp.i.i300 is false: only the lane-index
; calls and the xi += 32 stepping remain.
%xi117.0307 = phi i32 [ %add_xi117_load_, %if_exit156 ], [ %mul_calltmp80_xspan_load81, %for_loop114.lr.ph ]
%calltmp128 = call i32 @getLaneIndex___(<1 x i32> %mask)
%calltmp147 = call i32 @getLaneIndex___(<1 x i32> %mask)
%calltmp151 = call i32 @getLaneIndex___(<1 x i32> %mask)
%add_xi117_load_ = add i32 %xi117.0307, 32
%less_xi_load119_xend_load120 = icmp slt i32 %add_xi117_load_, %r.i.i169
br i1 %less_xi_load119_xend_load120, label %if_exit156, label %for_exit115
}
attributes #0 = { alwaysinline nounwind readnone }
attributes #1 = { nounwind readnone }
attributes #2 = { alwaysinline nounwind }
attributes #3 = { nounwind }
attributes #4 = { alwaysinline nounwind readonly }
attributes #5 = { nounwind "target-features"="+sm_35" }

View File

@@ -1,177 +0,0 @@
//
// Generated by NVIDIA NVVM Compiler
// Compiler built on Thu Jul 18 02:37:37 2013 (1374107857)
// Cuda compilation tools, release 5.5, V5.5.0
//
.version 3.2
.target sm_35
.address_size 64
.file 1 "/home/evghenii/soft/ispc-code/ispc/examples/mandelbrot_tasks3d/test.cu", 1383046614, 1449
.file 2 "/usr/local/cuda-5.5/bin/..//include/cuda_device_runtime_api.h", 1375338991, 7655
.file 3 "/usr/local/cuda-5.5/bin/..//include/device_functions.h", 1375338991, 185228
// Weak definition of cudaMalloc: ignores both 64-bit parameters and always
// returns the constant 30 through func_retval0.
// NOTE(review): 30 is presumably a cudaError_t code from the CUDA 5.5 headers
// — confirm which one against driver_types.h.
.weak .func (.param .b32 func_retval0) cudaMalloc(
.param .b64 cudaMalloc_param_0,
.param .b64 cudaMalloc_param_1
)
{
.reg .s32 %r<2>;
mov.u32 %r1, 30;
st.param.b32 [func_retval0+0], %r1;
// Debug location: file 2 (cuda_device_runtime_api.h), line 66.
.loc 2 66 3
ret;
}
// Weak definition of cudaFuncGetAttributes: ignores both 64-bit parameters and
// always returns the constant 30 through func_retval0.
// NOTE(review): 30 is presumably a cudaError_t code from the CUDA 5.5 headers
// — confirm which one against driver_types.h.
.weak .func (.param .b32 func_retval0) cudaFuncGetAttributes(
.param .b64 cudaFuncGetAttributes_param_0,
.param .b64 cudaFuncGetAttributes_param_1
)
{
.reg .s32 %r<2>;
mov.u32 %r1, 30;
st.param.b32 [func_retval0+0], %r1;
// Debug location: file 2 (cuda_device_runtime_api.h), line 71.
.loc 2 71 3
ret;
}
// Kernel entry _Z19mandelbrot_scanlineffffiiiiiPi.
// Parameters: param_0..3 are f32 (loaded into %f9..%f12), param_4..8 are u32
// (loaded into %r14, %r17, %r15, %r18, %r16) and param_9 is a 64-bit pointer
// to the output buffer.
// Each CTA handles the tile
//   x in [ctaid.x * p6, min(ctaid.x * p6 + p6, p4))
//   y in [ctaid.y * p7, min(ctaid.y * p7 + p7, p5))
// walking x in steps of 32 with (tid.x & 31) added, and stores one s32
// iteration count per pixel at output[y * p4 + x].
.visible .entry _Z19mandelbrot_scanlineffffiiiiiPi(
.param .f32 _Z19mandelbrot_scanlineffffiiiiiPi_param_0,
.param .f32 _Z19mandelbrot_scanlineffffiiiiiPi_param_1,
.param .f32 _Z19mandelbrot_scanlineffffiiiiiPi_param_2,
.param .f32 _Z19mandelbrot_scanlineffffiiiiiPi_param_3,
.param .u32 _Z19mandelbrot_scanlineffffiiiiiPi_param_4,
.param .u32 _Z19mandelbrot_scanlineffffiiiiiPi_param_5,
.param .u32 _Z19mandelbrot_scanlineffffiiiiiPi_param_6,
.param .u32 _Z19mandelbrot_scanlineffffiiiiiPi_param_7,
.param .u32 _Z19mandelbrot_scanlineffffiiiiiPi_param_8,
.param .u64 _Z19mandelbrot_scanlineffffiiiiiPi_param_9
)
{
.reg .pred %p<9>;
.reg .s32 %r<36>;
.reg .f32 %f<20>;
.reg .s64 %rd<5>;
ld.param.f32 %f9, [_Z19mandelbrot_scanlineffffiiiiiPi_param_0];
ld.param.f32 %f10, [_Z19mandelbrot_scanlineffffiiiiiPi_param_1];
ld.param.f32 %f11, [_Z19mandelbrot_scanlineffffiiiiiPi_param_2];
ld.param.f32 %f12, [_Z19mandelbrot_scanlineffffiiiiiPi_param_3];
ld.param.u32 %r14, [_Z19mandelbrot_scanlineffffiiiiiPi_param_4];
ld.param.u32 %r17, [_Z19mandelbrot_scanlineffffiiiiiPi_param_5];
ld.param.u32 %r15, [_Z19mandelbrot_scanlineffffiiiiiPi_param_6];
ld.param.u32 %r18, [_Z19mandelbrot_scanlineffffiiiiiPi_param_7];
ld.param.u32 %r16, [_Z19mandelbrot_scanlineffffiiiiiPi_param_8];
ld.param.u64 %rd1, [_Z19mandelbrot_scanlineffffiiiiiPi_param_9];
.loc 1 43 1
mov.u32 %r19, %ctaid.x;
.loc 1 44 1
// %r1 = x end = min(ctaid.x * p6 + p6, p4)
mad.lo.s32 %r20, %r19, %r15, %r15;
.loc 3 2621 10
min.s32 %r1, %r20, %r14;
.loc 1 46 1
// %r33 = current row, starting at ctaid.y * p7
mov.u32 %r21, %ctaid.y;
mul.lo.s32 %r33, %r21, %r18;
.loc 1 47 1
// %r3 = y end = min(row start + p7, p5)
add.s32 %r22, %r33, %r18;
.loc 3 2621 10
min.s32 %r3, %r22, %r17;
.loc 1 49 1
// Empty row range: nothing to do.
setp.ge.s32 %p1, %r33, %r3;
@%p1 bra BB2_12;
// %rd2 = output pointer converted from generic to global address space.
cvta.to.global.u64 %rd2, %rd1;
// Row loop head: %r34 (current x) is reset to ctaid.x * p6 for every row.
BB2_2:
.loc 1 43 1
mul.lo.s32 %r34, %r19, %r15;
.loc 1 50 1
setp.ge.s32 %p2, %r34, %r1;
@%p2 bra BB2_11;
.loc 1 53 1
// %f1 = float(row) * p3 + p2
cvt.rn.f32.s32 %f13, %r33;
fma.rn.f32 %f1, %f13, %f12, %f11;
// Column loop head.
BB2_4:
.loc 1 52 1
// %r27 = tid.x & 31; %r7 = %r34 + %r27; %f2 = float(%r7) * p1 + p0
// (%r7 is converted as an unsigned value).
mov.u32 %r26, %tid.x;
and.b32 %r27, %r26, 31;
add.s32 %r7, %r27, %r34;
cvt.rn.f32.u32 %f14, %r7;
fma.rn.f32 %f2, %f14, %f10, %f9;
// %r35 = iteration counter; the iteration loop is skipped when p8 <= 0.
mov.u32 %r35, 0;
setp.gt.s32 %p3, %r16, 0;
.loc 1 13 1
@%p3 bra BB2_5;
bra.uni BB2_8;
// Seed the iterated pair with %f1 and %f2.
BB2_5:
mov.f32 %f18, %f1;
mov.f32 %f19, %f2;
// Iteration loop body.
BB2_6:
.loc 1 14 1
mov.f32 %f4, %f19;
mov.f32 %f3, %f18;
mul.f32 %f5, %f3, %f3;
mul.f32 %f6, %f4, %f4;
add.f32 %f15, %f6, %f5;
// 0f40800000 is 4.0f: leave the loop once %f4^2 + %f3^2 > 4.
setp.gt.f32 %p4, %f15, 0f40800000;
@%p4 bra BB2_8;
.loc 1 17 1
// %f16 = %f4^2 - %f3^2
sub.f32 %f16, %f6, %f5;
.loc 1 18 1
// %f17 = 2 * %f4
add.f32 %f17, %f4, %f4;
.loc 1 19 1
add.f32 %f7, %f2, %f16;
.loc 1 20 1
// %f8 = 2 * %f4 * %f3 + %f1
fma.rn.f32 %f8, %f17, %f3, %f1;
.loc 1 13 96
add.s32 %r35, %r35, 1;
.loc 1 13 1
// Continue while counter < p8.
setp.lt.s32 %p5, %r35, %r16;
mov.f32 %f18, %f8;
mov.f32 %f19, %f7;
@%p5 bra BB2_6;
// Iteration finished (or skipped): %r35 holds the count for this pixel.
BB2_8:
.loc 1 56 1
// %r11 = row * p4 + %r34 + lane  (element index into the output buffer)
mad.lo.s32 %r30, %r33, %r14, %r34;
add.s32 %r11, %r30, %r27;
.loc 1 57 1
// Skip the store when x is at or beyond the x end (unsigned comparison).
setp.ge.u32 %p6, %r7, %r1;
@%p6 bra BB2_10;
mul.wide.s32 %rd3, %r11, 4;
add.s64 %rd4, %rd2, %rd3;
.loc 1 58 1
st.global.u32 [%rd4], %r35;
// Column loop step: x += 32 while x < x end.
BB2_10:
.loc 1 50 57
add.s32 %r34, %r34, 32;
.loc 1 50 1
setp.lt.s32 %p7, %r34, %r1;
@%p7 bra BB2_4;
// Row loop step: row += 1 while row < y end.
BB2_11:
.loc 1 49 57
add.s32 %r33, %r33, 1;
.loc 1 49 1
setp.lt.s32 %p8, %r33, %r3;
@%p8 bra BB2_2;
BB2_12:
.loc 1 60 2
ret;
}

View File

@@ -1,801 +0,0 @@
//
// Generated by LLVM NVPTX Back-End
//
.version 3.1
.target sm_35, texmode_independent
.address_size 64
// .globl __vselect_i8
// Prototypes (no body here) for three helpers called from the
// mandelbrot_scanline routine later in this listing. Each takes a single
// 4-byte argument and returns a 32-bit value in func_retval0.
.func (.param .b32 func_retval0) getBlockIndex0___
(
.param .align 4 .b8 getBlockIndex0____param_0[4]
)
;
.func (.param .b32 func_retval0) getBlockIndex1___
(
.param .align 4 .b8 getBlockIndex1____param_0[4]
)
;
.func (.param .b32 func_retval0) getLaneIndex___
(
.param .align 4 .b8 getLaneIndex____param_0[4]
)
;
// @__vselect_i8
// 8-bit select: returns param_0 when the 32-bit value in param_2 is zero,
// and param_1 otherwise. The byte is carried in a 16-bit register because
// that is the narrowest register width the select is issued on here.
.func (.param .align 1 .b8 func_retval0[1]) __vselect_i8(
	.param .align 1 .b8 __vselect_i8_param_0[1],
	.param .align 1 .b8 __vselect_i8_param_1[1],
	.param .align 4 .b8 __vselect_i8_param_2[4]
)
{
	.reg .pred 	%mask_set;
	.reg .s16 	%if_clear, %if_set, %picked;
	.reg .s32 	%mask;

	ld.param.u8 	%if_clear, [__vselect_i8_param_0];
	ld.param.u8 	%if_set, [__vselect_i8_param_1];
	ld.param.u32 	%mask, [__vselect_i8_param_2];

	// picked = (mask != 0) ? param_1 : param_0
	setp.ne.s32 	%mask_set, %mask, 0;
	selp.b16 	%picked, %if_set, %if_clear, %mask_set;

	st.param.b8 	[func_retval0+0], %picked;
	ret;
}
// .globl __vselect_i16
// 16-bit select: returns param_0 when the 32-bit value in param_2 is zero,
// and param_1 otherwise.
.func (.param .align 2 .b8 func_retval0[2]) __vselect_i16(
	.param .align 2 .b8 __vselect_i16_param_0[2],
	.param .align 2 .b8 __vselect_i16_param_1[2],
	.param .align 4 .b8 __vselect_i16_param_2[4]
)
{
	.reg .pred 	%mask_set;
	.reg .s16 	%if_clear, %if_set, %picked;
	.reg .s32 	%mask;

	ld.param.u16 	%if_clear, [__vselect_i16_param_0];
	ld.param.u16 	%if_set, [__vselect_i16_param_1];
	ld.param.u32 	%mask, [__vselect_i16_param_2];

	// picked = (mask != 0) ? param_1 : param_0
	setp.ne.s32 	%mask_set, %mask, 0;
	selp.b16 	%picked, %if_set, %if_clear, %mask_set;

	st.param.b16 	[func_retval0+0], %picked;
	ret;
}
// .globl __vselect_i64
// 64-bit select: returns param_0 when the 32-bit value in param_2 is zero,
// and param_1 otherwise.
.func (.param .align 8 .b8 func_retval0[8]) __vselect_i64(
	.param .align 8 .b8 __vselect_i64_param_0[8],
	.param .align 8 .b8 __vselect_i64_param_1[8],
	.param .align 4 .b8 __vselect_i64_param_2[4]
)
{
	.reg .pred 	%mask_set;
	.reg .s32 	%mask;
	.reg .s64 	%if_clear, %if_set, %picked;

	ld.param.u64 	%if_clear, [__vselect_i64_param_0];
	ld.param.u64 	%if_set, [__vselect_i64_param_1];
	ld.param.u32 	%mask, [__vselect_i64_param_2];

	// picked = (mask != 0) ? param_1 : param_0
	setp.ne.s32 	%mask_set, %mask, 0;
	selp.b64 	%picked, %if_set, %if_clear, %mask_set;

	st.param.b64 	[func_retval0+0], %picked;
	ret;
}
// .globl __aos_to_soa4_float1
// Stores four 32-bit floats through four pointers: the value in param_i is
// written to the address in param_(i+4), for i = 0..3, in that order.
// The stores carry no state-space qualifier. No value is returned.
.func __aos_to_soa4_float1(
	.param .align 4 .b8 __aos_to_soa4_float1_param_0[4],
	.param .align 4 .b8 __aos_to_soa4_float1_param_1[4],
	.param .align 4 .b8 __aos_to_soa4_float1_param_2[4],
	.param .align 4 .b8 __aos_to_soa4_float1_param_3[4],
	.param .b64 __aos_to_soa4_float1_param_4,
	.param .b64 __aos_to_soa4_float1_param_5,
	.param .b64 __aos_to_soa4_float1_param_6,
	.param .b64 __aos_to_soa4_float1_param_7
)
{
	.reg .f32 	%val0, %val1, %val2, %val3;
	.reg .s64 	%dst0, %dst1, %dst2, %dst3;

	// Fetch the four values, then the four destinations.
	ld.param.f32 	%val0, [__aos_to_soa4_float1_param_0];
	ld.param.f32 	%val1, [__aos_to_soa4_float1_param_1];
	ld.param.f32 	%val2, [__aos_to_soa4_float1_param_2];
	ld.param.f32 	%val3, [__aos_to_soa4_float1_param_3];
	ld.param.u64 	%dst0, [__aos_to_soa4_float1_param_4];
	ld.param.u64 	%dst1, [__aos_to_soa4_float1_param_5];
	ld.param.u64 	%dst2, [__aos_to_soa4_float1_param_6];
	ld.param.u64 	%dst3, [__aos_to_soa4_float1_param_7];

	// Store order is kept at 0,1,2,3 in case destinations alias.
	st.f32 	[%dst0], %val0;
	st.f32 	[%dst1], %val1;
	st.f32 	[%dst2], %val2;
	st.f32 	[%dst3], %val3;
	ret;
}
// .globl __soa_to_aos4_float1
// Stores four 32-bit floats through four pointers: the value in param_i is
// written to the address in param_(i+4), for i = 0..3, in that order.
// The stores carry no state-space qualifier. No value is returned.
.func __soa_to_aos4_float1(
	.param .align 4 .b8 __soa_to_aos4_float1_param_0[4],
	.param .align 4 .b8 __soa_to_aos4_float1_param_1[4],
	.param .align 4 .b8 __soa_to_aos4_float1_param_2[4],
	.param .align 4 .b8 __soa_to_aos4_float1_param_3[4],
	.param .b64 __soa_to_aos4_float1_param_4,
	.param .b64 __soa_to_aos4_float1_param_5,
	.param .b64 __soa_to_aos4_float1_param_6,
	.param .b64 __soa_to_aos4_float1_param_7
)
{
	.reg .f32 	%val0, %val1, %val2, %val3;
	.reg .s64 	%dst0, %dst1, %dst2, %dst3;

	// Fetch the four values, then the four destinations.
	ld.param.f32 	%val0, [__soa_to_aos4_float1_param_0];
	ld.param.f32 	%val1, [__soa_to_aos4_float1_param_1];
	ld.param.f32 	%val2, [__soa_to_aos4_float1_param_2];
	ld.param.f32 	%val3, [__soa_to_aos4_float1_param_3];
	ld.param.u64 	%dst0, [__soa_to_aos4_float1_param_4];
	ld.param.u64 	%dst1, [__soa_to_aos4_float1_param_5];
	ld.param.u64 	%dst2, [__soa_to_aos4_float1_param_6];
	ld.param.u64 	%dst3, [__soa_to_aos4_float1_param_7];

	// Store order is kept at 0,1,2,3 in case destinations alias.
	st.f32 	[%dst0], %val0;
	st.f32 	[%dst1], %val1;
	st.f32 	[%dst2], %val2;
	st.f32 	[%dst3], %val3;
	ret;
}
// .globl __aos_to_soa3_float1
// Stores three 32-bit floats through three pointers: the value in param_i is
// written to the address in param_(i+3), for i = 0..2, in that order.
// The stores carry no state-space qualifier. No value is returned.
.func __aos_to_soa3_float1(
	.param .align 4 .b8 __aos_to_soa3_float1_param_0[4],
	.param .align 4 .b8 __aos_to_soa3_float1_param_1[4],
	.param .align 4 .b8 __aos_to_soa3_float1_param_2[4],
	.param .b64 __aos_to_soa3_float1_param_3,
	.param .b64 __aos_to_soa3_float1_param_4,
	.param .b64 __aos_to_soa3_float1_param_5
)
{
	.reg .f32 	%val0, %val1, %val2;
	.reg .s64 	%dst0, %dst1, %dst2;

	// Fetch the three values, then the three destinations.
	ld.param.f32 	%val0, [__aos_to_soa3_float1_param_0];
	ld.param.f32 	%val1, [__aos_to_soa3_float1_param_1];
	ld.param.f32 	%val2, [__aos_to_soa3_float1_param_2];
	ld.param.u64 	%dst0, [__aos_to_soa3_float1_param_3];
	ld.param.u64 	%dst1, [__aos_to_soa3_float1_param_4];
	ld.param.u64 	%dst2, [__aos_to_soa3_float1_param_5];

	// Store order is kept at 0,1,2 in case destinations alias.
	st.f32 	[%dst0], %val0;
	st.f32 	[%dst1], %val1;
	st.f32 	[%dst2], %val2;
	ret;
}
// .globl __soa_to_aos3_float1
// Stores three 32-bit floats through three pointers: the value in param_i is
// written to the address in param_(i+3), for i = 0..2, in that order.
// The stores carry no state-space qualifier. No value is returned.
.func __soa_to_aos3_float1(
	.param .align 4 .b8 __soa_to_aos3_float1_param_0[4],
	.param .align 4 .b8 __soa_to_aos3_float1_param_1[4],
	.param .align 4 .b8 __soa_to_aos3_float1_param_2[4],
	.param .b64 __soa_to_aos3_float1_param_3,
	.param .b64 __soa_to_aos3_float1_param_4,
	.param .b64 __soa_to_aos3_float1_param_5
)
{
	.reg .f32 	%val0, %val1, %val2;
	.reg .s64 	%dst0, %dst1, %dst2;

	// Fetch the three values, then the three destinations.
	ld.param.f32 	%val0, [__soa_to_aos3_float1_param_0];
	ld.param.f32 	%val1, [__soa_to_aos3_float1_param_1];
	ld.param.f32 	%val2, [__soa_to_aos3_float1_param_2];
	ld.param.u64 	%dst0, [__soa_to_aos3_float1_param_3];
	ld.param.u64 	%dst1, [__soa_to_aos3_float1_param_4];
	ld.param.u64 	%dst2, [__soa_to_aos3_float1_param_5];

	// Store order is kept at 0,1,2 in case destinations alias.
	st.f32 	[%dst0], %val0;
	st.f32 	[%dst1], %val1;
	st.f32 	[%dst2], %val2;
	ret;
}
// .globl __rsqrt_varying_double
// Returns the approximate reciprocal square root (rsqrt.approx.f64) of the
// 64-bit float passed in param_0.
.func (.param .align 8 .b8 func_retval0[8]) __rsqrt_varying_double(
	.param .align 8 .b8 __rsqrt_varying_double_param_0[8]
)
{
	.reg .f64 	%x, %inv_root;

	ld.param.f64 	%x, [__rsqrt_varying_double_param_0];
	rsqrt.approx.f64 	%inv_root, %x;
	st.param.f64 	[func_retval0+0], %inv_root;
	ret;
}
// .globl mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E_
//
// Mandelbrot tile routine as emitted by the LLVM NVPTX back-end.
//
// Only param_0 is read. It is a 64-bit pointer to an argument block from
// which the body loads (byte offset, type, register, role as used below):
//   +0   f32  %f0  x base, added to column * %f1
//   +4   f32  %f1  x step per column
//   +8   f32  %f2  y base, added to row * %f3
//   +12  f32  %f3  y step per row
//   +16  u32  %r1  column upper bound (also the row pitch of the output)
//   +20  u32  %r7  row upper bound
//   +24  u32  %r9  columns per block (scaled by getBlockIndex0___)
//   +28  u32  %r8  rows per block (scaled by getBlockIndex1___)
//   +32  u32  %r0  iteration limit
//   +40  u64       output pointer (loaded only on the all_on path)
//   +48  u32  %r2  mask word; only its sign bit is tested
// param_1 .. param_10 are declared but never loaded in this body.
//
// Two code paths follow, selected by the sign bit of the mask word:
//   BB8_1 (%all_on)  : sign bit set   -> compute and store iteration counts
//   BB8_3 (%some_on) : sign bit clear -> loops over the tile but stores nothing
// Registers are reused heavily; the comments name the value live at each point.
.func mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E_(
.param .b64 mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_0,
.param .b32 mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_1,
.param .b32 mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_2,
.param .b32 mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_3,
.param .b32 mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_4,
.param .b32 mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_5,
.param .b32 mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_6,
.param .b32 mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_7,
.param .b32 mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_8,
.param .b32 mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_9,
.param .b32 mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_10
) // @mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E_
{
.reg .pred %p<396>;
.reg .s16 %rc<396>;
.reg .s16 %rs<396>;
.reg .s32 %r<396>;
.reg .s64 %rl<396>;
.reg .f32 %f<396>;
.reg .f64 %fl<396>;
// BB#0: // %allocas
// Unpack the argument block (see the table in the header comment).
ld.param.u64 %rl0, [mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_0];
ld.f32 %f0, [%rl0];
ld.f32 %f1, [%rl0+4];
ld.f32 %f2, [%rl0+8];
ld.f32 %f3, [%rl0+12];
ld.u32 %r1, [%rl0+16];
ld.u32 %r7, [%rl0+20];
ld.u32 %r9, [%rl0+24];
ld.u32 %r8, [%rl0+28];
ld.u32 %r0, [%rl0+32];
ld.u32 %r2, [%rl0+48];
// Mask sign bit clear (%r2 > -1) -> BB8_3; sign bit set -> BB8_1.
setp.gt.s32 %p0, %r2, -1;
@%p0 bra BB8_3;
bra.uni BB8_1;
// ======================= some_on path (mask sign bit clear) =================
BB8_3: // %some_on
// %r5 = getBlockIndex0___(mask)
// Callseq Start 0
{
.reg .b32 temp_param_reg;
// <end>}
.param .b32 param0;
st.param.b32 [param0+0], %r2;
.param .b32 retval0;
call.uni (retval0),
getBlockIndex0___,
(
param0
);
ld.param.b32 %r5, [retval0+0];
//{
}// Callseq End 0
// %r10 = getBlockIndex1___(mask)
// Callseq Start 1
{
.reg .b32 temp_param_reg;
// <end>}
.param .b32 param0;
st.param.b32 [param0+0], %r2;
.param .b32 retval0;
call.uni (retval0),
getBlockIndex1___,
(
param0
);
ld.param.b32 %r10, [retval0+0];
//{
}// Callseq End 1
// %r3 = first row = block index 1 * rows per block;
// %r4 = min(first row + rows per block, row bound); exit if the range is empty.
mul.lo.s32 %r3, %r10, %r8;
mad.lo.s32 %r4, %r10, %r8, %r8;
setp.lt.s32 %p0, %r4, %r7;
selp.b32 %r4, %r4, %r7, %p0;
setp.ge.s32 %p0, %r3, %r4;
@%p0 bra BB8_31;
// BB#4: // %for_test112.preheader.lr.ph
// %r4 = first column; %r1 = column end = min(first column + columns per block, column bound).
mul.lo.s32 %r4, %r5, %r9;
mad.lo.s32 %r5, %r5, %r9, %r9;
setp.lt.s32 %p0, %r5, %r1;
selp.b32 %r1, %r5, %r1, %p0;
// %r5 = all-ones if the iteration limit is > 0, else 0; %r6 = %r5 & mask.
setp.gt.s32 %p0, %r0, 0;
selp.b32 %r5, -1, 0, %p0;
and.b32 %r6, %r5, %r2;
// %r7 = row end = min(row bound, rows per block * (block index 1 + 1)),
// formed as not(max(not a, not b)).
not.b32 %r7, %r7;
add.s32 %r9, %r10, 1;
mul.lo.s32 %r8, %r8, %r9;
not.b32 %r8, %r8;
setp.gt.s32 %p0, %r7, %r8;
selp.b32 %r7, %r7, %r8, %p0;
not.b32 %r7, %r7;
BB8_5: // %for_test112.preheader
// =>This Loop Header: Depth=1
// Child Loop BB8_29 Depth 2
// Child Loop BB8_28 Depth 2
// Child Loop BB8_23 Depth 3
// Row loop head: skip the row when the column range is empty.
setp.ge.s32 %p0, %r4, %r1;
@%p0 bra BB8_30;
// BB#21: // %for_loop114.lr.ph
// in Loop: Header=BB8_5 Depth=1
// Branch on the sign bit of %r6. %r2 is not modified on this path and its
// sign bit is clear here, so %r6 = %r5 & %r2 also has a clear sign bit; the
// branch to BB8_22 therefore looks unreachable from BB8_3.
// NOTE(review): confirm that reading before relying on BB8_22..BB#27 being dead.
setp.lt.s32 %p0, %r6, 0;
mov.u32 %r8, %r4;
@%p0 bra BB8_22;
bra.uni BB8_29;
BB8_22: // in Loop: Header=BB8_5 Depth=1
// %f4 = y coordinate of this row = %f3 * row + %f2; %r8 = column counter.
cvt.rn.f32.s32 %f4, %r3;
fma.rn.f32 %f4, %f3, %f4, %f2;
mov.u32 %r8, %r4;
BB8_28: // %for_loop.i.lr.ph.us
// Parent Loop BB8_5 Depth=1
// => This Loop Header: Depth=2
// Child Loop BB8_23 Depth 3
// %r9 = getLaneIndex___(mask)
// Callseq Start 5
{
.reg .b32 temp_param_reg;
// <end>}
.param .b32 param0;
st.param.b32 [param0+0], %r2;
.param .b32 retval0;
call.uni (retval0),
getLaneIndex___,
(
param0
);
ld.param.b32 %r9, [retval0+0];
//{
}// Callseq End 5
// %f5 = x coordinate = %f1 * (lane + column counter) + %f0.
add.s32 %r9, %r9, %r8;
cvt.rn.f32.s32 %f5, %r9;
fma.rn.f32 %f5, %f1, %f5, %f0;
// Loop state: %r9 = 0 (constant), %r12 = active mask, %r10 = accumulated
// escape mask, %r11 = iteration count, (%f7, %f6) = iterate starting at (x, y).
mov.u32 %r9, 0;
mov.u32 %r12, %r5;
mov.u32 %r10, %r9;
mov.u32 %r11, %r9;
mov.f32 %f7, %f5;
mov.f32 %f6, %f4;
BB8_23: // %for_loop.i.us
// Parent Loop BB8_5 Depth=1
// Parent Loop BB8_28 Depth=2
// => This Inner Loop Header: Depth=3
// Escape test: %f7^2 + %f6^2 greater than 4.0 or unordered (0f40800000 is 4.0f).
and.b32 %r13, %r12, %r2;
mul.f32 %f8, %f7, %f7;
fma.rn.f32 %f9, %f6, %f6, %f8;
setp.gtu.f32 %p0, %f9, 0f40800000;
selp.b32 %r14, %r12, 0, %p0;
or.b32 %r10, %r14, %r10;
// Compare the sign bit of (escape mask & mask) with that of (active & mask);
// equal -> BB8_24 clears the active mask, otherwise BB8_25 performs the update.
and.b32 %r14, %r10, %r2;
shr.u32 %r14, %r14, 31;
shr.u32 %r13, %r13, 31;
setp.eq.s32 %p0, %r14, %r13;
@%p0 bra BB8_24;
bra.uni BB8_25;
BB8_24: // in Loop: Header=BB8_23 Depth=3
// %r9 holds 0 here, so this clears the active mask.
mov.u32 %r12, %r9;
bra.uni BB8_26;
BB8_25: // %not_all_continued_or_breaked.i.us
// in Loop: Header=BB8_23 Depth=3
// New %f7 = x + (%f7^2 - %f6^2); new %f6 = 2 * %f6 * %f7 + y.
mul.f32 %f9, %f6, %f6;
not.b32 %r13, %r10;
and.b32 %r12, %r12, %r13;
sub.f32 %f8, %f8, %f9;
add.f32 %f8, %f5, %f8;
add.f32 %f7, %f7, %f7;
fma.rn.f32 %f6, %f6, %f7, %f4;
mov.f32 %f7, %f8;
BB8_26: // %for_step.i.us
// in Loop: Header=BB8_23 Depth=3
// Count += 1 while the active mask is non-zero; clear the active mask once
// the count reaches the limit; repeat while (active & mask) has its sign bit set.
setp.ne.s32 %p0, %r12, 0;
selp.u32 %r13, 1, 0, %p0;
add.s32 %r11, %r11, %r13;
setp.lt.s32 %p0, %r11, %r0;
selp.b32 %r12, %r12, 0, %p0;
and.b32 %r13, %r12, %r2;
setp.lt.s32 %p0, %r13, 0;
@%p0 bra BB8_23;
// BB#27: // %if_exit156.us
// in Loop: Header=BB8_28 Depth=2
// Two getLaneIndex___ calls whose results land in %r9; %r9 is overwritten
// before its next read, so the results appear unused. No store follows.
// Callseq Start 6
{
.reg .b32 temp_param_reg;
// <end>}
.param .b32 param0;
st.param.b32 [param0+0], %r2;
.param .b32 retval0;
call.uni (retval0),
getLaneIndex___,
(
param0
);
ld.param.b32 %r9, [retval0+0];
//{
}// Callseq End 6
// Callseq Start 7
{
.reg .b32 temp_param_reg;
// <end>}
.param .b32 param0;
st.param.b32 [param0+0], %r2;
.param .b32 retval0;
call.uni (retval0),
getLaneIndex___,
(
param0
);
ld.param.b32 %r9, [retval0+0];
//{
}// Callseq End 7
// Advance the column counter by 32.
add.s32 %r8, %r8, 32;
setp.lt.s32 %p0, %r8, %r1;
@%p0 bra BB8_28;
bra.uni BB8_30;
BB8_29: // %if_exit156
// Parent Loop BB8_5 Depth=1
// => This Inner Loop Header: Depth=2
// Column loop without computation: three getLaneIndex___ calls per step,
// results land in %r9 and appear unused; nothing is stored.
// Callseq Start 2
{
.reg .b32 temp_param_reg;
// <end>}
.param .b32 param0;
st.param.b32 [param0+0], %r2;
.param .b32 retval0;
call.uni (retval0),
getLaneIndex___,
(
param0
);
ld.param.b32 %r9, [retval0+0];
//{
}// Callseq End 2
// Callseq Start 3
{
.reg .b32 temp_param_reg;
// <end>}
.param .b32 param0;
st.param.b32 [param0+0], %r2;
.param .b32 retval0;
call.uni (retval0),
getLaneIndex___,
(
param0
);
ld.param.b32 %r9, [retval0+0];
//{
}// Callseq End 3
// Callseq Start 4
{
.reg .b32 temp_param_reg;
// <end>}
.param .b32 param0;
st.param.b32 [param0+0], %r2;
.param .b32 retval0;
call.uni (retval0),
getLaneIndex___,
(
param0
);
ld.param.b32 %r9, [retval0+0];
//{
}// Callseq End 4
add.s32 %r8, %r8, 32;
setp.lt.s32 %p0, %r8, %r1;
@%p0 bra BB8_29;
BB8_30: // %for_exit115
// in Loop: Header=BB8_5 Depth=1
// Next row; stop when the row counter equals the row end in %r7.
add.s32 %r3, %r3, 1;
setp.eq.s32 %p0, %r3, %r7;
@%p0 bra BB8_31;
bra.uni BB8_5;
// ======================= all_on path (mask sign bit set) ====================
BB8_1: // %all_on
// %rl0 is repurposed: it now holds the output pointer from offset +40.
// The mask register is forced to all-ones.
ld.u64 %rl0, [%rl0+40];
mov.u32 %r2, -1;
// %r10 = getBlockIndex0___(mask)
// Callseq Start 8
{
.reg .b32 temp_param_reg;
// <end>}
.param .b32 param0;
st.param.b32 [param0+0], %r2;
.param .b32 retval0;
call.uni (retval0),
getBlockIndex0___,
(
param0
);
ld.param.b32 %r10, [retval0+0];
//{
}// Callseq End 8
// %r11 = getBlockIndex1___(mask)
// Callseq Start 9
{
.reg .b32 temp_param_reg;
// <end>}
.param .b32 param0;
st.param.b32 [param0+0], %r2;
.param .b32 retval0;
call.uni (retval0),
getBlockIndex1___,
(
param0
);
ld.param.b32 %r11, [retval0+0];
//{
}// Callseq End 9
// %r3 = first row; %r4 = min(first row + rows per block, row bound);
// exit if the row range is empty.
mul.lo.s32 %r3, %r11, %r8;
mad.lo.s32 %r4, %r11, %r8, %r8;
setp.lt.s32 %p0, %r4, %r7;
selp.b32 %r4, %r4, %r7, %p0;
setp.ge.s32 %p0, %r3, %r4;
@%p0 bra BB8_31;
// BB#2: // %for_test40.preheader.lr.ph
// %r4 = first column; %r5 = column end = min(first column + columns per block, column bound).
mul.lo.s32 %r4, %r10, %r9;
mad.lo.s32 %r5, %r10, %r9, %r9;
setp.lt.s32 %p0, %r5, %r1;
selp.b32 %r5, %r5, %r1, %p0;
// %r6 = all-ones if the iteration limit is > 0, else 0.
setp.gt.s32 %p0, %r0, 0;
selp.b32 %r6, -1, 0, %p0;
// %r7 = row end = min(row bound, rows per block * (block index 1 + 1)),
// formed as not(max(not a, not b)).
not.b32 %r7, %r7;
add.s32 %r12, %r11, 1;
mul.lo.s32 %r12, %r8, %r12;
not.b32 %r12, %r12;
setp.gt.s32 %p0, %r7, %r12;
selp.b32 %r7, %r7, %r12, %p0;
not.b32 %r7, %r7;
// %r8 = linear index of the tile origin = first row * pitch (%r1) + first column.
mul.lo.s32 %r8, %r11, %r8;
mul.lo.s32 %r8, %r8, %r1;
mad.lo.s32 %r8, %r10, %r9, %r8;
BB8_7: // %for_test40.preheader
// =>This Loop Header: Depth=1
// Child Loop BB8_19 Depth 2
// Child Loop BB8_13 Depth 2
// Child Loop BB8_14 Depth 3
// Row loop head: skip the row when the column range is empty.
setp.ge.s32 %p0, %r4, %r5;
@%p0 bra BB8_6;
// BB#8: // %for_loop42.lr.ph
// in Loop: Header=BB8_7 Depth=1
// Iteration limit > 0 (%r6 sign bit set) -> BB8_9 computes counts;
// otherwise BB8_19 writes zeros.
setp.lt.s32 %p0, %r6, 0;
mov.u32 %r9, %r8;
mov.u32 %r10, %r4;
@%p0 bra BB8_9;
bra.uni BB8_19;
BB8_9: // in Loop: Header=BB8_7 Depth=1
// %f4 = y coordinate = %f3 * row + %f2; %r9 = row * pitch; %r10 = column counter.
cvt.rn.f32.s32 %f4, %r3;
mul.lo.s32 %r9, %r3, %r1;
fma.rn.f32 %f4, %f3, %f4, %f2;
mov.u32 %r10, %r4;
BB8_13: // %for_loop.i204.lr.ph.us
// Parent Loop BB8_7 Depth=1
// => This Loop Header: Depth=2
// Child Loop BB8_14 Depth 3
// %r11 = getLaneIndex___(mask)
// Callseq Start 13
{
.reg .b32 temp_param_reg;
// <end>}
.param .b32 param0;
st.param.b32 [param0+0], %r2;
.param .b32 retval0;
call.uni (retval0),
getLaneIndex___,
(
param0
);
ld.param.b32 %r11, [retval0+0];
//{
}// Callseq End 13
// %f5 = x coordinate = %f1 * (lane + column counter) + %f0.
add.s32 %r11, %r11, %r10;
cvt.rn.f32.s32 %f5, %r11;
fma.rn.f32 %f5, %f1, %f5, %f0;
// Loop state: %r12 = 0 (constant), %r14 = active mask, %r13 = accumulated
// escape mask, %r11 = iteration count, (%f7, %f6) = iterate starting at (x, y).
mov.u32 %r12, 0;
mov.u32 %r14, %r6;
mov.u32 %r13, %r12;
mov.u32 %r11, %r12;
mov.f32 %f7, %f5;
mov.f32 %f6, %f4;
BB8_14: // %for_loop.i204.us
// Parent Loop BB8_7 Depth=1
// Parent Loop BB8_13 Depth=2
// => This Inner Loop Header: Depth=3
// Escape test: %f7^2 + %f6^2 greater than 4.0 or unordered (0f40800000 is 4.0f).
mul.f32 %f8, %f7, %f7;
fma.rn.f32 %f9, %f6, %f6, %f8;
setp.gtu.f32 %p0, %f9, 0f40800000;
selp.b32 %r15, %r14, 0, %p0;
or.b32 %r13, %r15, %r13;
// Sign bit of the escape mask equal to sign bit of the active mask -> BB8_15
// clears the active mask; otherwise BB8_16 performs the update.
shr.u32 %r15, %r13, 31;
shr.u32 %r16, %r14, 31;
setp.eq.s32 %p0, %r15, %r16;
@%p0 bra BB8_15;
bra.uni BB8_16;
BB8_15: // in Loop: Header=BB8_14 Depth=3
// %r12 holds 0 here, so this clears the active mask.
mov.u32 %r14, %r12;
bra.uni BB8_17;
BB8_16: // %not_all_continued_or_breaked.i218.us
// in Loop: Header=BB8_14 Depth=3
// New %f7 = x + (%f7^2 - %f6^2); new %f6 = 2 * %f6 * %f7 + y.
mul.f32 %f9, %f6, %f6;
not.b32 %r15, %r13;
and.b32 %r14, %r14, %r15;
sub.f32 %f8, %f8, %f9;
add.f32 %f8, %f5, %f8;
add.f32 %f7, %f7, %f7;
fma.rn.f32 %f6, %f6, %f7, %f4;
mov.f32 %f7, %f8;
BB8_17: // %for_step.i187.us
// in Loop: Header=BB8_14 Depth=3
// Count += 1 while the active mask is non-zero; clear the active mask once
// the count reaches the limit; repeat while the active mask's sign bit is set.
setp.ne.s32 %p0, %r14, 0;
selp.u32 %r15, 1, 0, %p0;
add.s32 %r11, %r11, %r15;
setp.lt.s32 %p0, %r11, %r0;
selp.b32 %r14, %r14, 0, %p0;
setp.lt.s32 %p0, %r14, 0;
@%p0 bra BB8_14;
// BB#10: // %mandel___vyfvyfvyi.exit219.us
// in Loop: Header=BB8_13 Depth=2
// %r12 = lane index (used for the store index below).
// Callseq Start 14
{
.reg .b32 temp_param_reg;
// <end>}
.param .b32 param0;
st.param.b32 [param0+0], %r2;
.param .b32 retval0;
call.uni (retval0),
getLaneIndex___,
(
param0
);
ld.param.b32 %r12, [retval0+0];
//{
}// Callseq End 14
// %r13 = lane index (used for the bounds check below).
// Callseq Start 15
{
.reg .b32 temp_param_reg;
// <end>}
.param .b32 param0;
st.param.b32 [param0+0], %r2;
.param .b32 retval0;
call.uni (retval0),
getLaneIndex___,
(
param0
);
ld.param.b32 %r13, [retval0+0];
//{
}// Callseq End 15
// Skip the store when lane + column counter is at or past the column end.
add.s32 %r13, %r13, %r10;
setp.ge.s32 %p0, %r13, %r5;
@%p0 bra BB8_12;
// BB#11: // %if_then.us
// in Loop: Header=BB8_13 Depth=2
// Element index = column counter + row * pitch + lane; the byte offset
// (index << 2) is formed in 32 bits and only then sign-extended to 64 bits.
add.s32 %r13, %r10, %r9;
add.s32 %r12, %r13, %r12;
shl.b32 %r12, %r12, 2;
cvt.s64.s32 %rl1, %r12;
add.s64 %rl1, %rl1, %rl0;
// Write the iteration count for this pixel.
st.u32 [%rl1], %r11;
BB8_12: // %if_exit.us
// in Loop: Header=BB8_13 Depth=2
// Advance the column counter by 32.
add.s32 %r10, %r10, 32;
setp.lt.s32 %p0, %r10, %r5;
@%p0 bra BB8_13;
bra.uni BB8_6;
BB8_19: // %mandel___vyfvyfvyi.exit219
// Parent Loop BB8_7 Depth=1
// => This Inner Loop Header: Depth=2
// Iteration limit <= 0: no iteration is run and a count of 0 is stored.
// The first call's result in %r11 is overwritten by the second call.
// Callseq Start 10
{
.reg .b32 temp_param_reg;
// <end>}
.param .b32 param0;
st.param.b32 [param0+0], %r2;
.param .b32 retval0;
call.uni (retval0),
getLaneIndex___,
(
param0
);
ld.param.b32 %r11, [retval0+0];
//{
}// Callseq End 10
// Callseq Start 11
{
.reg .b32 temp_param_reg;
// <end>}
.param .b32 param0;
st.param.b32 [param0+0], %r2;
.param .b32 retval0;
call.uni (retval0),
getLaneIndex___,
(
param0
);
ld.param.b32 %r11, [retval0+0];
//{
}// Callseq End 11
// Callseq Start 12
{
.reg .b32 temp_param_reg;
// <end>}
.param .b32 param0;
st.param.b32 [param0+0], %r2;
.param .b32 retval0;
call.uni (retval0),
getLaneIndex___,
(
param0
);
ld.param.b32 %r12, [retval0+0];
//{
}// Callseq End 12
// Store only when lane + column counter is below the column end.
add.s32 %r12, %r12, %r10;
setp.lt.s32 %p0, %r12, %r5;
@%p0 bra BB8_20;
bra.uni BB8_18;
BB8_20: // %if_then
// in Loop: Header=BB8_19 Depth=2
// Element index = lane + running linear index in %r9; store the constant 0.
add.s32 %r11, %r11, %r9;
shl.b32 %r11, %r11, 2;
cvt.s64.s32 %rl1, %r11;
add.s64 %rl1, %rl1, %rl0;
mov.u32 %r11, 0;
st.u32 [%rl1], %r11;
BB8_18: // %if_exit
// in Loop: Header=BB8_19 Depth=2
// Advance both the column counter and the running linear index by 32.
add.s32 %r10, %r10, 32;
add.s32 %r9, %r9, 32;
setp.lt.s32 %p0, %r10, %r5;
@%p0 bra BB8_19;
BB8_6: // %for_exit43
// in Loop: Header=BB8_7 Depth=1
// Next row: bump the row counter and move the linear base down one pitch;
// stop when the row counter equals the row end in %r7.
add.s32 %r3, %r3, 1;
add.s32 %r8, %r8, %r1;
setp.eq.s32 %p0, %r3, %r7;
@%p0 bra BB8_31;
bra.uni BB8_7;
BB8_31: // %for_exit
ret;
}

View File

@@ -1,515 +0,0 @@
compiling nvptx64
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
argVals= 0
calleArgCount= 0
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
argVals= 0
calleArgCount= 0
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
argVals= 0
calleArgCount= 0
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
argVals= 0
calleArgCount= 0
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
argVals= 0
calleArgCount= 0
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
argVals= 0
calleArgCount= 0
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
argVals= 0
calleArgCount= 0
ret_t: /*safe*/ varying int32
ret_t: /*safe*/ varying int32
ret_t: /*safe*/ varying int32
argVals= 0
calleArgCount= 0
argVals= 0
calleArgCount= 1
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying float
ret_t: /*safe*/ varying float
ret_t: /*safe*/ varying float
argVals= 1
calleArgCount= 1
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ uniform float
ret_t: /*safe*/ uniform float
ret_t: /*safe*/ uniform float
argVals= 1
calleArgCount= 1
ret_t: /*safe*/ varying float
ret_t: /*safe*/ varying float
ret_t: /*safe*/ varying float
argVals= 1
calleArgCount= 1
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ uniform float
ret_t: /*safe*/ uniform float
ret_t: /*safe*/ uniform float
argVals= 1
calleArgCount= 1
ret_t: /*safe*/ varying double
ret_t: /*safe*/ varying double
ret_t: /*safe*/ varying double
argVals= 1
calleArgCount= 1
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ uniform double
ret_t: /*safe*/ uniform double
ret_t: /*safe*/ uniform double
argVals= 1
calleArgCount= 1
ret_t: /*safe*/ varying unsigned int32
ret_t: /*safe*/ varying unsigned int32
ret_t: /*safe*/ varying unsigned int32
argVals= 1
calleArgCount= 1
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ uniform unsigned int32
ret_t: /*safe*/ uniform unsigned int32
ret_t: /*safe*/ uniform unsigned int32
argVals= 1
calleArgCount= 1
ret_t: /*safe*/ varying unsigned int64
ret_t: /*safe*/ varying unsigned int64
ret_t: /*safe*/ varying unsigned int64
argVals= 1
calleArgCount= 1
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ uniform unsigned int64
ret_t: /*safe*/ uniform unsigned int64
ret_t: /*safe*/ uniform unsigned int64
argVals= 1
calleArgCount= 1
ret_t: /*safe*/ varying float
ret_t: /*safe*/ varying float
ret_t: /*safe*/ varying float
argVals= 2
calleArgCount= 2
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying int8
ret_t: /*safe*/ varying int8
ret_t: /*safe*/ varying int8
argVals= 2
calleArgCount= 2
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying int16
ret_t: /*safe*/ varying int16
ret_t: /*safe*/ varying int16
argVals= 2
calleArgCount= 2
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying int32
ret_t: /*safe*/ varying int32
ret_t: /*safe*/ varying int32
argVals= 2
calleArgCount= 2
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying double
ret_t: /*safe*/ varying double
ret_t: /*safe*/ varying double
argVals= 2
calleArgCount= 2
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying int64
ret_t: /*safe*/ varying int64
ret_t: /*safe*/ varying int64
argVals= 2
calleArgCount= 2
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying float
ret_t: /*safe*/ varying float
ret_t: /*safe*/ varying float
argVals= 2
calleArgCount= 2
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying int8
ret_t: /*safe*/ varying int8
ret_t: /*safe*/ varying int8
argVals= 2
calleArgCount= 2
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying int16
ret_t: /*safe*/ varying int16
ret_t: /*safe*/ varying int16
argVals= 2
calleArgCount= 2
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying int32
ret_t: /*safe*/ varying int32
ret_t: /*safe*/ varying int32
argVals= 2
calleArgCount= 2
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying double
ret_t: /*safe*/ varying double
ret_t: /*safe*/ varying double
argVals= 2
calleArgCount= 2
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying int64
ret_t: /*safe*/ varying int64
ret_t: /*safe*/ varying int64
argVals= 2
calleArgCount= 2
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying float
ret_t: /*safe*/ varying float
ret_t: /*safe*/ varying float
argVals= 2
calleArgCount= 2
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying int8
ret_t: /*safe*/ varying int8
ret_t: /*safe*/ varying int8
argVals= 2
calleArgCount= 2
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying int16
ret_t: /*safe*/ varying int16
ret_t: /*safe*/ varying int16
argVals= 2
calleArgCount= 2
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying int32
ret_t: /*safe*/ varying int32
ret_t: /*safe*/ varying int32
argVals= 2
calleArgCount= 2
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying double
ret_t: /*safe*/ varying double
ret_t: /*safe*/ varying double
argVals= 2
calleArgCount= 2
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying int64
ret_t: /*safe*/ varying int64
ret_t: /*safe*/ varying int64
argVals= 2
calleArgCount= 2
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying float
ret_t: /*safe*/ varying float
ret_t: /*safe*/ varying float
argVals= 3
calleArgCount= 3
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying int8
ret_t: /*safe*/ varying int8
ret_t: /*safe*/ varying int8
argVals= 3
calleArgCount= 3
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying int16
ret_t: /*safe*/ varying int16
ret_t: /*safe*/ varying int16
argVals= 3
calleArgCount= 3
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying int32
ret_t: /*safe*/ varying int32
ret_t: /*safe*/ varying int32
argVals= 3
calleArgCount= 3
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying double
ret_t: /*safe*/ varying double
ret_t: /*safe*/ varying double
argVals= 3
calleArgCount= 3
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying int64
ret_t: /*safe*/ varying int64
ret_t: /*safe*/ varying int64
argVals= 3
calleArgCount= 3
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ /*cost=1*/ uniform float
ret_t: /*safe*/ /*cost=1*/ uniform float
ret_t: /*safe*/ /*cost=1*/ uniform float
argVals= 1
calleArgCount= 2
argVals= 2
calleArgCount= 2
argVals= 1
calleArgCount= 2
ret_t: /*safe*/ /*cost=1*/ uniform int8
ret_t: /*safe*/ /*cost=1*/ uniform int8
ret_t: /*safe*/ /*cost=1*/ uniform int8
argVals= 2
calleArgCount= 2
ret_t: /*safe*/ /*cost=1*/ uniform unsigned int8
ret_t: /*safe*/ /*cost=1*/ uniform unsigned int8
ret_t: /*safe*/ /*cost=1*/ uniform unsigned int8
argVals= 2
calleArgCount= 2
ret_t: /*safe*/ /*cost=1*/ uniform int16
ret_t: /*safe*/ /*cost=1*/ uniform int16
ret_t: /*safe*/ /*cost=1*/ uniform int16
argVals= 2
calleArgCount= 2
ret_t: /*safe*/ /*cost=1*/ uniform unsigned int16
ret_t: /*safe*/ /*cost=1*/ uniform unsigned int16
ret_t: /*safe*/ /*cost=1*/ uniform unsigned int16
argVals= 2
calleArgCount= 2
ret_t: /*safe*/ /*cost=1*/ uniform int32
ret_t: /*safe*/ /*cost=1*/ uniform int32
ret_t: /*safe*/ /*cost=1*/ uniform int32
argVals= 2
calleArgCount= 2
ret_t: /*safe*/ /*cost=1*/ uniform unsigned int32
ret_t: /*safe*/ /*cost=1*/ uniform unsigned int32
ret_t: /*safe*/ /*cost=1*/ uniform unsigned int32
argVals= 2
calleArgCount= 2
ret_t: /*safe*/ /*cost=1*/ uniform double
ret_t: /*safe*/ /*cost=1*/ uniform double
ret_t: /*safe*/ /*cost=1*/ uniform double
argVals= 1
calleArgCount= 2
argVals= 2
calleArgCount= 2
argVals= 1
calleArgCount= 2
ret_t: /*safe*/ /*cost=1*/ uniform int64
ret_t: /*safe*/ /*cost=1*/ uniform int64
ret_t: /*safe*/ /*cost=1*/ uniform int64
argVals= 2
calleArgCount= 2
ret_t: /*safe*/ /*cost=1*/ uniform unsigned int64
ret_t: /*safe*/ /*cost=1*/ uniform unsigned int64
ret_t: /*safe*/ /*cost=1*/ uniform unsigned int64
argVals= 2
calleArgCount= 2
ret_t: /*safe*/ /*cost=1*/ varying float
ret_t: /*safe*/ /*cost=1*/ varying float
ret_t: /*safe*/ /*cost=1*/ varying float
argVals= 1
calleArgCount= 2
argVals= 1
calleArgCount= 2
argVals= 3
calleArgCount= 3
argVals= 1
calleArgCount= 2
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ /*cost=1*/ varying int8
ret_t: /*safe*/ /*cost=1*/ varying int8
ret_t: /*safe*/ /*cost=1*/ varying int8
argVals= 3
calleArgCount= 3
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ /*cost=1*/ varying unsigned int8
ret_t: /*safe*/ /*cost=1*/ varying unsigned int8
ret_t: /*safe*/ /*cost=1*/ varying unsigned int8
argVals= 3
calleArgCount= 3
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ /*cost=1*/ varying int16
ret_t: /*safe*/ /*cost=1*/ varying int16
ret_t: /*safe*/ /*cost=1*/ varying int16
argVals= 3
calleArgCount= 3
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ /*cost=1*/ varying unsigned int16
ret_t: /*safe*/ /*cost=1*/ varying unsigned int16
ret_t: /*safe*/ /*cost=1*/ varying unsigned int16
argVals= 3
calleArgCount= 3
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ /*cost=1*/ varying int32
ret_t: /*safe*/ /*cost=1*/ varying int32
ret_t: /*safe*/ /*cost=1*/ varying int32
argVals= 3
calleArgCount= 3
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ /*cost=1*/ varying unsigned int32
ret_t: /*safe*/ /*cost=1*/ varying unsigned int32
ret_t: /*safe*/ /*cost=1*/ varying unsigned int32
argVals= 3
calleArgCount= 3
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ /*cost=1*/ varying double
ret_t: /*safe*/ /*cost=1*/ varying double
ret_t: /*safe*/ /*cost=1*/ varying double
argVals= 1
calleArgCount= 2
argVals= 1
calleArgCount= 2
argVals= 3
calleArgCount= 3
argVals= 1
calleArgCount= 2
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ /*cost=1*/ varying int64
ret_t: /*safe*/ /*cost=1*/ varying int64
ret_t: /*safe*/ /*cost=1*/ varying int64
argVals= 3
calleArgCount= 3
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ /*cost=1*/ varying unsigned int64
ret_t: /*safe*/ /*cost=1*/ varying unsigned int64
ret_t: /*safe*/ /*cost=1*/ varying unsigned int64
argVals= 3
calleArgCount= 3
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ /*cost=1*/ uniform int32
ret_t: /*safe*/ /*cost=1*/ uniform int32
ret_t: /*safe*/ /*cost=1*/ uniform int32
argVals= 1
calleArgCount= 1
ret_t: /*safe*/ /*cost=1*/ varying int32
ret_t: /*safe*/ /*cost=1*/ varying int32
ret_t: /*safe*/ /*cost=1*/ varying int32
argVals= 1
calleArgCount= 1
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ uniform bool
ret_t: /*safe*/ uniform bool
ret_t: /*safe*/ uniform bool
argVals= 1
calleArgCount= 1
ret_t: /*safe*/ uniform bool
ret_t: /*safe*/ uniform bool
ret_t: /*safe*/ uniform bool
argVals= 1
calleArgCount= 1
ret_t: /*safe*/ uniform bool
ret_t: /*safe*/ uniform bool
ret_t: /*safe*/ uniform bool
argVals= 1
calleArgCount= 1
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
argVals= 1
calleArgCount= 1
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
argVals= 1
calleArgCount= 1
ret_t: /*safe*/ varying int32
argVals= 2
calleArgCount= 3
argVals= 1
calleArgCount= 2
argVals= 3
calleArgCount= 4
ret_t: /*safe*/ varying int32
ret_t: /*safe*/ varying int32
argVals= 3
calleArgCount= 3
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying int32
argVals= 2
calleArgCount= 3
argVals= 1
calleArgCount= 2
argVals= 3
calleArgCount= 4
ret_t: /*safe*/ varying int32
ret_t: /*safe*/ varying int32
argVals= 3
calleArgCount= 3
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
argVals= 1
calleArgCount= 1
argVals= 1
calleArgCount= 1
ret_t: /*safe*/ uniform unsigned int64
ret_t: /*safe*/ uniform unsigned int64
ret_t: /*safe*/ uniform unsigned int64
argVals= 1
calleArgCount= 1
ret_t: void
argVals= 3
calleArgCount= 3
ret_t: void
ret_t: void
argVals= 3
calleArgCount= 3
ret_t: void
ret_t: void
argVals= 3
calleArgCount= 3
argVals= 3
calleArgCount= 3
argVals= 1
calleArgCount= 1
argVals= 1
calleArgCount= 1
argVals= 0
calleArgCount= 0
stdlib.ispc:493:5: Error: Assertion failed (ctx.cpp:1755): "v0->getType() ==
v1->getType()".
***
*** Please file a bug report at https://github.com/ispc/ispc/issues
*** (Including as much information as you can about how to reproduce this error).
*** You have apparently encountered a bug in the compiler that we'd like to fix!
***
main.cpp(223): FATAL ERROR: Unhandled signal sent to process; terminating.

View File

@@ -1,513 +0,0 @@
compiling nvptx64
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
argVals= 0
calleArgCount= 0
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
argVals= 0
calleArgCount= 0
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
argVals= 0
calleArgCount= 0
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
argVals= 0
calleArgCount= 0
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
argVals= 0
calleArgCount= 0
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
argVals= 0
calleArgCount= 0
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
argVals= 0
calleArgCount= 0
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
argVals= 0
calleArgCount= 0
argVals= 0
calleArgCount= 1
ret_t: /*safe*/ varying float
ret_t: /*safe*/ varying float
ret_t: /*safe*/ varying float
argVals= 1
calleArgCount= 1
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ uniform float
ret_t: /*safe*/ uniform float
ret_t: /*safe*/ uniform float
argVals= 1
calleArgCount= 1
ret_t: /*safe*/ varying float
ret_t: /*safe*/ varying float
ret_t: /*safe*/ varying float
argVals= 1
calleArgCount= 1
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ uniform float
ret_t: /*safe*/ uniform float
ret_t: /*safe*/ uniform float
argVals= 1
calleArgCount= 1
ret_t: /*safe*/ varying double
ret_t: /*safe*/ varying double
ret_t: /*safe*/ varying double
argVals= 1
calleArgCount= 1
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ uniform double
ret_t: /*safe*/ uniform double
ret_t: /*safe*/ uniform double
argVals= 1
calleArgCount= 1
ret_t: /*safe*/ varying unsigned int32
ret_t: /*safe*/ varying unsigned int32
ret_t: /*safe*/ varying unsigned int32
argVals= 1
calleArgCount= 1
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ uniform unsigned int32
ret_t: /*safe*/ uniform unsigned int32
ret_t: /*safe*/ uniform unsigned int32
argVals= 1
calleArgCount= 1
ret_t: /*safe*/ varying unsigned int64
ret_t: /*safe*/ varying unsigned int64
ret_t: /*safe*/ varying unsigned int64
argVals= 1
calleArgCount= 1
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ uniform unsigned int64
ret_t: /*safe*/ uniform unsigned int64
ret_t: /*safe*/ uniform unsigned int64
argVals= 1
calleArgCount= 1
ret_t: /*safe*/ varying float
ret_t: /*safe*/ varying float
ret_t: /*safe*/ varying float
argVals= 2
calleArgCount= 2
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying int8
ret_t: /*safe*/ varying int8
ret_t: /*safe*/ varying int8
argVals= 2
calleArgCount= 2
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying int16
ret_t: /*safe*/ varying int16
ret_t: /*safe*/ varying int16
argVals= 2
calleArgCount= 2
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying int32
ret_t: /*safe*/ varying int32
ret_t: /*safe*/ varying int32
argVals= 2
calleArgCount= 2
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying double
ret_t: /*safe*/ varying double
ret_t: /*safe*/ varying double
argVals= 2
calleArgCount= 2
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying int64
ret_t: /*safe*/ varying int64
ret_t: /*safe*/ varying int64
argVals= 2
calleArgCount= 2
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying float
ret_t: /*safe*/ varying float
ret_t: /*safe*/ varying float
argVals= 2
calleArgCount= 2
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying int8
ret_t: /*safe*/ varying int8
ret_t: /*safe*/ varying int8
argVals= 2
calleArgCount= 2
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying int16
ret_t: /*safe*/ varying int16
ret_t: /*safe*/ varying int16
argVals= 2
calleArgCount= 2
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying int32
ret_t: /*safe*/ varying int32
ret_t: /*safe*/ varying int32
argVals= 2
calleArgCount= 2
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying double
ret_t: /*safe*/ varying double
ret_t: /*safe*/ varying double
argVals= 2
calleArgCount= 2
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying int64
ret_t: /*safe*/ varying int64
ret_t: /*safe*/ varying int64
argVals= 2
calleArgCount= 2
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying float
ret_t: /*safe*/ varying float
ret_t: /*safe*/ varying float
argVals= 2
calleArgCount= 2
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying int8
ret_t: /*safe*/ varying int8
ret_t: /*safe*/ varying int8
argVals= 2
calleArgCount= 2
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying int16
ret_t: /*safe*/ varying int16
ret_t: /*safe*/ varying int16
argVals= 2
calleArgCount= 2
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying int32
ret_t: /*safe*/ varying int32
ret_t: /*safe*/ varying int32
argVals= 2
calleArgCount= 2
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying double
ret_t: /*safe*/ varying double
ret_t: /*safe*/ varying double
argVals= 2
calleArgCount= 2
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying int64
ret_t: /*safe*/ varying int64
ret_t: /*safe*/ varying int64
argVals= 2
calleArgCount= 2
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying float
ret_t: /*safe*/ varying float
ret_t: /*safe*/ varying float
argVals= 3
calleArgCount= 3
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying int8
ret_t: /*safe*/ varying int8
ret_t: /*safe*/ varying int8
argVals= 3
calleArgCount= 3
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying int16
ret_t: /*safe*/ varying int16
ret_t: /*safe*/ varying int16
argVals= 3
calleArgCount= 3
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying int32
ret_t: /*safe*/ varying int32
ret_t: /*safe*/ varying int32
argVals= 3
calleArgCount= 3
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying double
ret_t: /*safe*/ varying double
ret_t: /*safe*/ varying double
argVals= 3
calleArgCount= 3
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying int64
ret_t: /*safe*/ varying int64
ret_t: /*safe*/ varying int64
argVals= 3
calleArgCount= 3
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ /*cost=1*/ uniform float
ret_t: /*safe*/ /*cost=1*/ uniform float
ret_t: /*safe*/ /*cost=1*/ uniform float
argVals= 1
calleArgCount= 2
argVals= 2
calleArgCount= 2
argVals= 1
calleArgCount= 2
ret_t: /*safe*/ /*cost=1*/ uniform int8
ret_t: /*safe*/ /*cost=1*/ uniform int8
ret_t: /*safe*/ /*cost=1*/ uniform int8
argVals= 2
calleArgCount= 2
ret_t: /*safe*/ /*cost=1*/ uniform unsigned int8
ret_t: /*safe*/ /*cost=1*/ uniform unsigned int8
ret_t: /*safe*/ /*cost=1*/ uniform unsigned int8
argVals= 2
calleArgCount= 2
ret_t: /*safe*/ /*cost=1*/ uniform int16
ret_t: /*safe*/ /*cost=1*/ uniform int16
ret_t: /*safe*/ /*cost=1*/ uniform int16
argVals= 2
calleArgCount= 2
ret_t: /*safe*/ /*cost=1*/ uniform unsigned int16
ret_t: /*safe*/ /*cost=1*/ uniform unsigned int16
ret_t: /*safe*/ /*cost=1*/ uniform unsigned int16
argVals= 2
calleArgCount= 2
ret_t: /*safe*/ /*cost=1*/ uniform int32
ret_t: /*safe*/ /*cost=1*/ uniform int32
ret_t: /*safe*/ /*cost=1*/ uniform int32
argVals= 2
calleArgCount= 2
ret_t: /*safe*/ /*cost=1*/ uniform unsigned int32
ret_t: /*safe*/ /*cost=1*/ uniform unsigned int32
ret_t: /*safe*/ /*cost=1*/ uniform unsigned int32
argVals= 2
calleArgCount= 2
ret_t: /*safe*/ /*cost=1*/ uniform double
ret_t: /*safe*/ /*cost=1*/ uniform double
ret_t: /*safe*/ /*cost=1*/ uniform double
argVals= 1
calleArgCount= 2
argVals= 2
calleArgCount= 2
argVals= 1
calleArgCount= 2
ret_t: /*safe*/ /*cost=1*/ uniform int64
ret_t: /*safe*/ /*cost=1*/ uniform int64
ret_t: /*safe*/ /*cost=1*/ uniform int64
argVals= 2
calleArgCount= 2
ret_t: /*safe*/ /*cost=1*/ uniform unsigned int64
ret_t: /*safe*/ /*cost=1*/ uniform unsigned int64
ret_t: /*safe*/ /*cost=1*/ uniform unsigned int64
argVals= 2
calleArgCount= 2
ret_t: /*safe*/ /*cost=1*/ varying float
ret_t: /*safe*/ /*cost=1*/ varying float
ret_t: /*safe*/ /*cost=1*/ varying float
argVals= 1
calleArgCount= 2
argVals= 1
calleArgCount= 2
argVals= 3
calleArgCount= 3
argVals= 1
calleArgCount= 2
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ /*cost=1*/ varying int8
ret_t: /*safe*/ /*cost=1*/ varying int8
ret_t: /*safe*/ /*cost=1*/ varying int8
argVals= 3
calleArgCount= 3
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ /*cost=1*/ varying unsigned int8
ret_t: /*safe*/ /*cost=1*/ varying unsigned int8
ret_t: /*safe*/ /*cost=1*/ varying unsigned int8
argVals= 3
calleArgCount= 3
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ /*cost=1*/ varying int16
ret_t: /*safe*/ /*cost=1*/ varying int16
ret_t: /*safe*/ /*cost=1*/ varying int16
argVals= 3
calleArgCount= 3
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ /*cost=1*/ varying unsigned int16
ret_t: /*safe*/ /*cost=1*/ varying unsigned int16
ret_t: /*safe*/ /*cost=1*/ varying unsigned int16
argVals= 3
calleArgCount= 3
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ /*cost=1*/ varying int32
ret_t: /*safe*/ /*cost=1*/ varying int32
ret_t: /*safe*/ /*cost=1*/ varying int32
argVals= 3
calleArgCount= 3
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ /*cost=1*/ varying unsigned int32
ret_t: /*safe*/ /*cost=1*/ varying unsigned int32
ret_t: /*safe*/ /*cost=1*/ varying unsigned int32
argVals= 3
calleArgCount= 3
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ /*cost=1*/ varying double
ret_t: /*safe*/ /*cost=1*/ varying double
ret_t: /*safe*/ /*cost=1*/ varying double
argVals= 1
calleArgCount= 2
argVals= 1
calleArgCount= 2
argVals= 3
calleArgCount= 3
argVals= 1
calleArgCount= 2
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ /*cost=1*/ varying int64
ret_t: /*safe*/ /*cost=1*/ varying int64
ret_t: /*safe*/ /*cost=1*/ varying int64
argVals= 3
calleArgCount= 3
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ /*cost=1*/ varying unsigned int64
ret_t: /*safe*/ /*cost=1*/ varying unsigned int64
ret_t: /*safe*/ /*cost=1*/ varying unsigned int64
argVals= 3
calleArgCount= 3
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ /*cost=1*/ uniform int32
ret_t: /*safe*/ /*cost=1*/ uniform int32
ret_t: /*safe*/ /*cost=1*/ uniform int32
argVals= 1
calleArgCount= 1
ret_t: /*safe*/ /*cost=1*/ varying int32
ret_t: /*safe*/ /*cost=1*/ varying int32
ret_t: /*safe*/ /*cost=1*/ varying int32
argVals= 1
calleArgCount= 1
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ uniform bool
ret_t: /*safe*/ uniform bool
ret_t: /*safe*/ uniform bool
argVals= 1
calleArgCount= 1
ret_t: /*safe*/ uniform bool
ret_t: /*safe*/ uniform bool
ret_t: /*safe*/ uniform bool
argVals= 1
calleArgCount= 1
ret_t: /*safe*/ uniform bool
ret_t: /*safe*/ uniform bool
ret_t: /*safe*/ uniform bool
argVals= 1
calleArgCount= 1
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
argVals= 1
calleArgCount= 1
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
argVals= 1
calleArgCount= 1
ret_t: /*safe*/ varying int32
argVals= 2
calleArgCount= 3
argVals= 1
calleArgCount= 2
argVals= 3
calleArgCount= 4
ret_t: /*safe*/ varying int32
ret_t: /*safe*/ varying int32
argVals= 3
calleArgCount= 3
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying int32
argVals= 2
calleArgCount= 3
argVals= 1
calleArgCount= 2
argVals= 3
calleArgCount= 4
ret_t: /*safe*/ varying int32
ret_t: /*safe*/ varying int32
argVals= 3
calleArgCount= 3
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
argVals= 1
calleArgCount= 1
argVals= 1
calleArgCount= 1
ret_t: /*safe*/ uniform unsigned int64
ret_t: /*safe*/ uniform unsigned int64
ret_t: /*safe*/ uniform unsigned int64
argVals= 1
calleArgCount= 1
ret_t: void
argVals= 3
calleArgCount= 3
ret_t: void
ret_t: void
argVals= 3
calleArgCount= 3
ret_t: void
ret_t: void
argVals= 3
calleArgCount= 3
argVals= 3
calleArgCount= 3
argVals= 1
calleArgCount= 1
argVals= 1
calleArgCount= 1
argVals= 0
calleArgCount= 0
stdlib.ispc:493:5: Error: Assertion failed (ctx.cpp:1755): "v0->getType() ==
v1->getType()".
***
*** Please file a bug report at https://github.com/ispc/ispc/issues
*** (Including as much information as you can about how to reproduce this error).
*** You have apparently encountered a bug in the compiler that we'd like to fix!
***
main.cpp(223): FATAL ERROR: Unhandled signal sent to process; terminating.

View File

@@ -1,23 +0,0 @@
//
// z.h
// (Header automatically generated by the ispc compiler.)
// DO NOT EDIT THIS FILE.
//
#ifndef ISPC_Z_H
#define ISPC_Z_H
#include <stdint.h>
#ifdef __cplusplus
namespace ispc { /* namespace */
#endif // __cplusplus
#ifdef __cplusplus
} /* namespace */
#endif // __cplusplus
#endif // ISPC_Z_H