added cuda examples

This commit is contained in:
Evghenii
2013-11-04 11:44:49 +01:00
parent cb6614da42
commit cb7cbec0d5
226 changed files with 284385 additions and 0 deletions

View File

@@ -0,0 +1,2 @@
mandelbrot
*.ppm

View File

@@ -0,0 +1,127 @@
code for sm_35
Function : mandelbrot_scanline
.headerflags @"EF_CUDA_SM35 EF_CUDA_PTX_SM(EF_CUDA_SM35)"
/* 0x08a0b010a0a01000 */
/*0008*/ MOV R1, c[0x0][0x44]; /* 0x64c03c00089c0006 */
/*0010*/ S2R R2, SR_CTAID.Y; /* 0x86400000131c000a */
/*0018*/ MOV R3, c[0x0][0x15c]; /* 0x64c03c002b9c000e */
/*0020*/ IMAD R3, R2, c[0x0][0x15c], R3; /* 0x51080c002b9c080e */
/*0028*/ ISETP.LT.AND P0, PT, R3, c[0x0][0x154], PT; /* 0x5b181c002a9c0c1e */
/*0030*/ IMUL R0, R2, c[0x0][0x15c]; /* 0x61c018002b9c0802 */
/*0038*/ SEL R3, R3, c[0x0][0x154], P0; /* 0x650000002a9c0c0e */
/* 0x089c8010a01000b0 */
/*0048*/ ISETP.GE.AND P0, PT, R0, R3, PT; /* 0xdb681c00019c001e */
/*0050*/ @P0 EXIT ; /* 0x180000000000003c */
/*0058*/ IADD R2, R2, 0x1; /* 0xc0800000009c0809 */
/*0060*/ MOV R3, c[0x0][0x158]; /* 0x64c03c002b1c000e */
/*0068*/ IMUL R5, R2, c[0x0][0x15c]; /* 0x61c018002b9c0816 */
/*0070*/ LOP.PASS_B R4, RZ, ~c[0x0][0x154]; /* 0x620038002a9ffc12 */
/*0078*/ S2R R2, SR_CTAID.X; /* 0x86400000129c000a */
/* 0x08ac80109c108010 */
/*0088*/ LOP.PASS_B R7, RZ, ~R5; /* 0xe2003800029ffc1e */
/*0090*/ LOP.PASS_B R6, RZ, ~c[0x0][0x154]; /* 0x620038002a9ffc1a */
/*0098*/ LOP.PASS_B R5, RZ, ~R5; /* 0xe2003800029ffc16 */
/*00a0*/ IMAD R3, R2, c[0x0][0x158], R3; /* 0x51080c002b1c080e */
/*00a8*/ ISETP.GT.AND P0, PT, R4, R7, PT; /* 0xdb481c00039c101e */
/*00b0*/ IMUL R2, R2, c[0x0][0x158]; /* 0x61c018002b1c080a */
/*00b8*/ ISETP.LT.AND P1, PT, R3, c[0x0][0x150], PT; /* 0x5b181c002a1c0c3e */
/* 0x0800b010008010a0 */
/*00c8*/ SEL R4, R5, R6, !P0; /* 0xe5002000031c1412 */
/*00d0*/ ISETP.LT.AND P0, PT, RZ, c[0x0][0x160], PT; /* 0x5b181c002c1ffc1e */
/*00d8*/ LOP.PASS_B R4, RZ, ~R4; /* 0xe2003800021ffc12 */
/*00e0*/ SEL R3, R3, c[0x0][0x150], P1; /* 0x650004002a1c0c0e */
/*00e8*/ ISETP.GE.AND P1, PT, R2, R3, PT; /* 0xdb681c00019c083e */
/*00f0*/ SSY 0x368; /* 0x1480000138000000 */
/*00f8*/ @P1 BRA 0x360; /* 0x120000013004003c */
/* 0x089c108010001080 */
/*0108*/ IMUL R5, R0, c[0x0][0x150]; /* 0x61c018002a1c0016 */
/*0110*/ MOV R8, R2; /* 0xe4c03c00011c0022 */
/*0118*/ @!P0 BRA 0x2d8; /* 0x12000000dc20003c */
/*0120*/ I2F.F32.S32 R6, R0; /* 0xe5c00000001ca81a */
/*0128*/ MOV R7, c[0x0][0x148]; /* 0x64c03c00291c001e */
/*0130*/ MOV R14, R2; /* 0xe4c03c00011c003a */
/*0138*/ MOV R16, c[0x0][0x140]; /* 0x64c03c00281c0042 */
/* 0x089c80a010a01000 */
/*0148*/ FFMA R6, R6, c[0x0][0x14c], R7; /* 0x4c001c00299c181a */
/*0150*/ S2R R10, SR_TID.X; /* 0x86400000109c002a */
/*0158*/ MOV R9, R6; /* 0xe4c03c00031c0026 */
/*0160*/ LOP.AND R7, R10, 0x1f; /* 0xc20000000f9c281d */
/*0168*/ PSETP.AND.AND P2, PT, PT, PT, PT; /* 0x84801c07001dc05e */
/*0170*/ IADD R12, R7, R14; /* 0xe0800000071c1c32 */
/*0178*/ PSETP.AND.AND P3, PT, P0, PT, PT; /* 0x84801c07001c007e */
/* 0x08a00010a010a010 */
/*0188*/ I2F.F32.S32 R7, R12; /* 0xe5c00000061ca81e */
/*0190*/ PSETP.AND.AND P1, PT, !PT, PT, PT; /* 0x84801c07001fc03e */
/*0198*/ FFMA R11, R7, c[0x0][0x144], R16; /* 0x4c004000289c1c2e */
/*01a0*/ SSY 0x260; /* 0x148000005c000000 */
/*01a8*/ MOV R7, RZ; /* 0xe4c03c007f9c001e */
/*01b0*/ MOV R8, R11; /* 0xe4c03c00059c0022 */
/*01b8*/ FMUL R15, R8, R8; /* 0xe3400000041c203e */
/* 0x08b0b0ac80b0a010 */
/*01c8*/ PSETP.AND.AND P3, PT, P2, P3, PT; /* 0x84801c03001c807e */
/*01d0*/ FFMA R13, R9, R9, R15; /* 0xcc003c00049c2436 */
/*01d8*/ FSETP.GTU.AND P2, PT, R13, 4, PT; /* 0xb5e01e04001c345d */
/*01e0*/ PSETP.AND.OR P1, PT, P3, P2, P1; /* 0x84810402001cc03e */
/*01e8*/ PSETP.AND.AND P2, PT, !PT, PT, PT; /* 0x84801c07001fc05e */
/*01f0*/ PSETP.XOR.AND P5, PT, P1, P3, PT; /* 0x84801c03101c40be */
/*01f8*/ @P5 PSETP.AND.AND P2, PT, P3, !P1, PT; /* 0x84801c090014c05e */
/* 0x08ac8010b09c1080 */
/*0208*/ @P2 IADD R7, R7, 0x1; /* 0xc080000000881c1d */
/*0210*/ @P5 FFMA R13, -R9, R9, R15; /* 0xcc083c0004942436 */
/*0218*/ @P5 FADD R15, R8, R8; /* 0xe2c000000414203e */
/*0220*/ ISETP.LT.AND P3, PT, R7, c[0x0][0x160], PT; /* 0x5b181c002c1c1c7e */
/*0228*/ @P5 FADD R13, R11, R13; /* 0xe2c0000006942c36 */
/*0230*/ PSETP.AND.AND P4, PT, P2, P3, PT; /* 0x84801c03001c809e */
/*0238*/ @P5 FFMA R9, R9, R15, R6; /* 0xcc00180007942426 */
/* 0x08a0a0100000b810 */
/*0248*/ @P5 MOV R8, R13; /* 0xe4c03c0006940022 */
/*0250*/ @P4 BRA 0x1b8; /* 0x12007fffb010003c */
/*0258*/ ISETP.GE.AND.S P1, PT, R12, R3, PT; /* 0xdb681c0001dc303e */
/*0260*/ @P1 BRA.U 0x2b0; /* 0x120000002404023c */
/*0268*/ @!P1 LOP32I.AND R9, R10, 0x4000001f; /* 0x202000000fa42824 */
/*0270*/ @!P1 IADD R8, R14, R5; /* 0xe080000002a43822 */
/*0278*/ @!P1 IADD R8, R8, R9; /* 0xe080000004a42022 */
/* 0x08b0a000a0b010a0 */
/*0288*/ @!P1 SHF.L R8, RZ, 0x2, R8; /* 0xb7c020000127fc21 */
/*0290*/ @!P1 BFE R9, R8, 0x11f; /* 0xc00800008fa42025 */
/*0298*/ @!P1 IADD R8.CC, R8, c[0x0][0x168]; /* 0x608400002d242022 */
/*02a0*/ @!P1 IADD.X R9, R9, c[0x0][0x16c]; /* 0x608040002da42426 */
/*02a8*/ @!P1 ST.E [R8], R7; /* 0xe48000000024201c */
/*02b0*/ IADD R14, R14, 0x20; /* 0xc0800000101c3839 */
/*02b8*/ ISETP.LT.AND P1, PT, R14, R3, PT; /* 0xdb181c00019c383e */
/* 0x0880b0a0a0a0b8b8 */
/*02c8*/ @P1 BRA 0x150; /* 0x12007fff4004003c */
/*02d0*/ BRA 0x360; /* 0x12000000441c003c */
/*02d8*/ S2R R7, SR_TID.X; /* 0x86400000109c001e */
/*02e0*/ LOP.AND R6, R7, 0x1f; /* 0xc20000000f9c1c19 */
/*02e8*/ IADD R6, R6, R8; /* 0xe0800000041c181a */
/*02f0*/ ISETP.LT.AND P1, PT, R6, R3, PT; /* 0xdb181c00019c183e */
/*02f8*/ @P1 LOP32I.AND R7, R7, 0x4000001f; /* 0x202000000f841c1c */
/* 0x08a0b010a0a0a010 */
/*0308*/ @P1 IADD R6, R8, R5; /* 0xe08000000284201a */
/*0310*/ IADD R8, R8, 0x20; /* 0xc0800000101c2021 */
/*0318*/ @P1 IADD R6, R6, R7; /* 0xe08000000384181a */
/*0320*/ @P1 SHF.L R6, RZ, 0x2, R6; /* 0xb7c018000107fc19 */
/*0328*/ @P1 BFE R7, R6, 0x11f; /* 0xc00800008f84181d */
/*0330*/ @P1 IADD R6.CC, R6, c[0x0][0x168]; /* 0x608400002d04181a */
/*0338*/ @P1 IADD.X R7, R7, c[0x0][0x16c]; /* 0x608040002d841c1e */
/* 0x0880b8b000b8b0c8 */
/*0348*/ @P1 ST.E [R6], RZ; /* 0xe480000000041bfc */
/*0350*/ ISETP.LT.AND P1, PT, R8, R3, PT; /* 0xdb181c00019c203e */
/*0358*/ @P1 BRA 0x2d8; /* 0x12007fffbc04003c */
/*0360*/ IADD.S R0, R0, 0x1; /* 0xc080000000dc0001 */
/*0368*/ ISETP.EQ.AND P1, PT, R0, R4, PT; /* 0xdb281c00021c003e */
/*0370*/ @!P1 BRA 0xe8; /* 0x12007ffeb824003c */
/*0378*/ MOV RZ, RZ; /* 0xe4c03c007f9c03fe */
/* 0x08000000000000b8 */
/*0388*/ EXIT ; /* 0x18000000001c003c */
/*0390*/ BRA 0x390; /* 0x12007ffffc1c003c */
/*0398*/ NOP; /* 0x85800000001c3c02 */
/*03a0*/ NOP; /* 0x85800000001c3c02 */
/*03a8*/ NOP; /* 0x85800000001c3c02 */
/*03b0*/ NOP; /* 0x85800000001c3c02 */
/*03b8*/ NOP; /* 0x85800000001c3c02 */
....................................

View File

@@ -0,0 +1,127 @@
code for sm_35
Function : mandelbrot_scanline
.headerflags @"EF_CUDA_SM35 EF_CUDA_PTX_SM(EF_CUDA_SM35)"
/* 0x08a0b010a0a01000 */
/*0008*/ MOV R1, c[0x0][0x44]; /* 0x64c03c00089c0006 */
/*0010*/ S2R R2, SR_CTAID.Y; /* 0x86400000131c000a */
/*0018*/ MOV R3, c[0x0][0x15c]; /* 0x64c03c002b9c000e */
/*0020*/ IMAD R3, R2, c[0x0][0x15c], R3; /* 0x51080c002b9c080e */
/*0028*/ ISETP.LT.AND P0, PT, R3, c[0x0][0x154], PT; /* 0x5b181c002a9c0c1e */
/*0030*/ IMUL R0, R2, c[0x0][0x15c]; /* 0x61c018002b9c0802 */
/*0038*/ SEL R3, R3, c[0x0][0x154], P0; /* 0x650000002a9c0c0e */
/* 0x089c8010a01000b0 */
/*0048*/ ISETP.GE.AND P0, PT, R0, R3, PT; /* 0xdb681c00019c001e */
/*0050*/ @P0 EXIT ; /* 0x180000000000003c */
/*0058*/ IADD R2, R2, 0x1; /* 0xc0800000009c0809 */
/*0060*/ MOV R3, c[0x0][0x158]; /* 0x64c03c002b1c000e */
/*0068*/ IMUL R5, R2, c[0x0][0x15c]; /* 0x61c018002b9c0816 */
/*0070*/ LOP.PASS_B R4, RZ, ~c[0x0][0x154]; /* 0x620038002a9ffc12 */
/*0078*/ S2R R2, SR_CTAID.X; /* 0x86400000129c000a */
/* 0x08ac80109c108010 */
/*0088*/ LOP.PASS_B R7, RZ, ~R5; /* 0xe2003800029ffc1e */
/*0090*/ LOP.PASS_B R6, RZ, ~c[0x0][0x154]; /* 0x620038002a9ffc1a */
/*0098*/ LOP.PASS_B R5, RZ, ~R5; /* 0xe2003800029ffc16 */
/*00a0*/ IMAD R3, R2, c[0x0][0x158], R3; /* 0x51080c002b1c080e */
/*00a8*/ ISETP.GT.AND P0, PT, R4, R7, PT; /* 0xdb481c00039c101e */
/*00b0*/ IMUL R2, R2, c[0x0][0x158]; /* 0x61c018002b1c080a */
/*00b8*/ ISETP.LT.AND P1, PT, R3, c[0x0][0x150], PT; /* 0x5b181c002a1c0c3e */
/* 0x0800b010008010a0 */
/*00c8*/ SEL R4, R5, R6, !P0; /* 0xe5002000031c1412 */
/*00d0*/ ISETP.LT.AND P0, PT, RZ, c[0x0][0x160], PT; /* 0x5b181c002c1ffc1e */
/*00d8*/ LOP.PASS_B R4, RZ, ~R4; /* 0xe2003800021ffc12 */
/*00e0*/ SEL R3, R3, c[0x0][0x150], P1; /* 0x650004002a1c0c0e */
/*00e8*/ ISETP.GE.AND P1, PT, R2, R3, PT; /* 0xdb681c00019c083e */
/*00f0*/ SSY 0x368; /* 0x1480000138000000 */
/*00f8*/ @P1 BRA 0x360; /* 0x120000013004003c */
/* 0x089c108010001080 */
/*0108*/ IMUL R5, R0, c[0x0][0x150]; /* 0x61c018002a1c0016 */
/*0110*/ MOV R8, R2; /* 0xe4c03c00011c0022 */
/*0118*/ @!P0 BRA 0x2d8; /* 0x12000000dc20003c */
/*0120*/ I2F.F32.S32 R6, R0; /* 0xe5c00000001ca81a */
/*0128*/ MOV R7, c[0x0][0x148]; /* 0x64c03c00291c001e */
/*0130*/ MOV R14, R2; /* 0xe4c03c00011c003a */
/*0138*/ MOV R16, c[0x0][0x140]; /* 0x64c03c00281c0042 */
/* 0x089c80a010a01000 */
/*0148*/ FFMA R6, R6, c[0x0][0x14c], R7; /* 0x4c001c00299c181a */
/*0150*/ S2R R10, SR_TID.X; /* 0x86400000109c002a */
/*0158*/ MOV R9, R6; /* 0xe4c03c00031c0026 */
/*0160*/ LOP.AND R7, R10, 0x1f; /* 0xc20000000f9c281d */
/*0168*/ PSETP.AND.AND P2, PT, PT, PT, PT; /* 0x84801c07001dc05e */
/*0170*/ IADD R12, R7, R14; /* 0xe0800000071c1c32 */
/*0178*/ PSETP.AND.AND P3, PT, P0, PT, PT; /* 0x84801c07001c007e */
/* 0x08a00010a010a010 */
/*0188*/ I2F.F32.S32 R7, R12; /* 0xe5c00000061ca81e */
/*0190*/ PSETP.AND.AND P1, PT, !PT, PT, PT; /* 0x84801c07001fc03e */
/*0198*/ FFMA R11, R7, c[0x0][0x144], R16; /* 0x4c004000289c1c2e */
/*01a0*/ SSY 0x260; /* 0x148000005c000000 */
/*01a8*/ MOV R7, RZ; /* 0xe4c03c007f9c001e */
/*01b0*/ MOV R8, R11; /* 0xe4c03c00059c0022 */
/*01b8*/ FMUL R15, R8, R8; /* 0xe3400000041c203e */
/* 0x08b0b0ac80b0a010 */
/*01c8*/ PSETP.AND.AND P3, PT, P2, P3, PT; /* 0x84801c03001c807e */
/*01d0*/ FFMA R13, R9, R9, R15; /* 0xcc003c00049c2436 */
/*01d8*/ FSETP.GTU.AND P2, PT, R13, 4, PT; /* 0xb5e01e04001c345d */
/*01e0*/ PSETP.AND.OR P1, PT, P3, P2, P1; /* 0x84810402001cc03e */
/*01e8*/ PSETP.AND.AND P2, PT, !PT, PT, PT; /* 0x84801c07001fc05e */
/*01f0*/ PSETP.XOR.AND P5, PT, P1, P3, PT; /* 0x84801c03101c40be */
/*01f8*/ @P5 PSETP.AND.AND P2, PT, P3, !P1, PT; /* 0x84801c090014c05e */
/* 0x08ac8010b09c1080 */
/*0208*/ @P2 IADD R7, R7, 0x1; /* 0xc080000000881c1d */
/*0210*/ @P5 FFMA R13, -R9, R9, R15; /* 0xcc083c0004942436 */
/*0218*/ @P5 FADD R15, R8, R8; /* 0xe2c000000414203e */
/*0220*/ ISETP.LT.AND P3, PT, R7, c[0x0][0x160], PT; /* 0x5b181c002c1c1c7e */
/*0228*/ @P5 FADD R13, R11, R13; /* 0xe2c0000006942c36 */
/*0230*/ PSETP.AND.AND P4, PT, P2, P3, PT; /* 0x84801c03001c809e */
/*0238*/ @P5 FFMA R9, R9, R15, R6; /* 0xcc00180007942426 */
/* 0x08a0a0100000b810 */
/*0248*/ @P5 MOV R8, R13; /* 0xe4c03c0006940022 */
/*0250*/ @P4 BRA 0x1b8; /* 0x12007fffb010003c */
/*0258*/ ISETP.GE.AND.S P1, PT, R12, R3, PT; /* 0xdb681c0001dc303e */
/*0260*/ @P1 BRA.U 0x2b0; /* 0x120000002404023c */
/*0268*/ @!P1 LOP32I.AND R9, R10, 0x4000001f; /* 0x202000000fa42824 */
/*0270*/ @!P1 IADD R8, R14, R5; /* 0xe080000002a43822 */
/*0278*/ @!P1 IADD R8, R8, R9; /* 0xe080000004a42022 */
/* 0x08b0a000a0b010a0 */
/*0288*/ @!P1 SHF.L R8, RZ, 0x2, R8; /* 0xb7c020000127fc21 */
/*0290*/ @!P1 BFE R9, R8, 0x11f; /* 0xc00800008fa42025 */
/*0298*/ @!P1 IADD R8.CC, R8, c[0x0][0x168]; /* 0x608400002d242022 */
/*02a0*/ @!P1 IADD.X R9, R9, c[0x0][0x16c]; /* 0x608040002da42426 */
/*02a8*/ @!P1 ST.E [R8], R7; /* 0xe48000000024201c */
/*02b0*/ IADD R14, R14, 0x20; /* 0xc0800000101c3839 */
/*02b8*/ ISETP.LT.AND P1, PT, R14, R3, PT; /* 0xdb181c00019c383e */
/* 0x0880b0a0a0a0b8b8 */
/*02c8*/ @P1 BRA 0x150; /* 0x12007fff4004003c */
/*02d0*/ BRA 0x360; /* 0x12000000441c003c */
/*02d8*/ S2R R7, SR_TID.X; /* 0x86400000109c001e */
/*02e0*/ LOP.AND R6, R7, 0x1f; /* 0xc20000000f9c1c19 */
/*02e8*/ IADD R6, R6, R8; /* 0xe0800000041c181a */
/*02f0*/ ISETP.LT.AND P1, PT, R6, R3, PT; /* 0xdb181c00019c183e */
/*02f8*/ @P1 LOP32I.AND R7, R7, 0x4000001f; /* 0x202000000f841c1c */
/* 0x08a0b010a0a0a010 */
/*0308*/ @P1 IADD R6, R8, R5; /* 0xe08000000284201a */
/*0310*/ IADD R8, R8, 0x20; /* 0xc0800000101c2021 */
/*0318*/ @P1 IADD R6, R6, R7; /* 0xe08000000384181a */
/*0320*/ @P1 SHF.L R6, RZ, 0x2, R6; /* 0xb7c018000107fc19 */
/*0328*/ @P1 BFE R7, R6, 0x11f; /* 0xc00800008f84181d */
/*0330*/ @P1 IADD R6.CC, R6, c[0x0][0x168]; /* 0x608400002d04181a */
/*0338*/ @P1 IADD.X R7, R7, c[0x0][0x16c]; /* 0x608040002d841c1e */
/* 0x0880b8b000b8b0c8 */
/*0348*/ @P1 ST.E [R6], RZ; /* 0xe480000000041bfc */
/*0350*/ ISETP.LT.AND P1, PT, R8, R3, PT; /* 0xdb181c00019c203e */
/*0358*/ @P1 BRA 0x2d8; /* 0x12007fffbc04003c */
/*0360*/ IADD.S R0, R0, 0x1; /* 0xc080000000dc0001 */
/*0368*/ ISETP.EQ.AND P1, PT, R0, R4, PT; /* 0xdb281c00021c003e */
/*0370*/ @!P1 BRA 0xe8; /* 0x12007ffeb824003c */
/*0378*/ MOV RZ, RZ; /* 0xe4c03c007f9c03fe */
/* 0x08000000000000b8 */
/*0388*/ EXIT ; /* 0x18000000001c003c */
/*0390*/ BRA 0x390; /* 0x12007ffffc1c003c */
/*0398*/ NOP; /* 0x85800000001c3c02 */
/*03a0*/ NOP; /* 0x85800000001c3c02 */
/*03a8*/ NOP; /* 0x85800000001c3c02 */
/*03b0*/ NOP; /* 0x85800000001c3c02 */
/*03b8*/ NOP; /* 0x85800000001c3c02 */
....................................

View File

@@ -0,0 +1,79 @@
code for sm_35
Function : _Z19mandelbrot_scanlineffffiiiiiPi
.headerflags @"EF_CUDA_SM35 EF_CUDA_PTX_SM(EF_CUDA_SM35)"
/* 0x0880a010a0a01000 */
/*0008*/ MOV R1, c[0x0][0x44]; /* 0x64c03c00089c0006 */
/*0010*/ S2R R0, SR_CTAID.Y; /* 0x86400000131c0002 */
/*0018*/ MOV R4, c[0x0][0x158]; /* 0x64c03c002b1c0012 */
/*0020*/ IMUL R2, R0, c[0x0][0x15c]; /* 0x61c018002b9c000a */
/*0028*/ IADD R0, R2, c[0x0][0x15c]; /* 0x608000002b9c0802 */
/*0030*/ S2R R9, SR_CTAID.X; /* 0x86400000129c0026 */
/*0038*/ IMNMX R11, R0, c[0x0][0x154], PT; /* 0x61081c002a9c002e */
/* 0x08b0a0100010b09c */
/*0048*/ IMAD R0, R9, c[0x0][0x158], R4; /* 0x510810002b1c2402 */
/*0050*/ ISETP.GE.AND P0, PT, R2, R11, PT; /* 0xdb681c00059c081e */
/*0058*/ IMNMX R0, R0, c[0x0][0x150], PT; /* 0x61081c002a1c0002 */
/*0060*/ @P0 EXIT ; /* 0x180000000000003c */
/*0068*/ IMUL R3, R9, c[0x0][0x158]; /* 0x61c018002b1c240e */
/*0070*/ SSY 0x1f8; /* 0x14800000c0000000 */
/*0078*/ ISETP.GE.AND P0, PT, R3, R0, PT; /* 0xdb681c00001c0c1e */
/* 0x08a0100010a01000 */
/*0088*/ @P0 BRA 0x1f0; /* 0x12000000b000003c */
/*0090*/ I2F.F32.S32 R4, R2; /* 0xe5c00000011ca812 */
/*0098*/ MOV R5, c[0x0][0x148]; /* 0x64c03c00291c0016 */
/*00a0*/ MOV R16, c[0x0][0x140]; /* 0x64c03c00281c0042 */
/*00a8*/ FFMA R4, R4, c[0x0][0x14c], R5; /* 0x4c001400299c1012 */
/*00b0*/ S2R R5, SR_TID.X; /* 0x86400000109c0016 */
/*00b8*/ MOV R6, RZ; /* 0xe4c03c007f9c001a */
/* 0x08800010a0a0a010 */
/*00c8*/ LOP.AND R10, R5, 0x1f; /* 0xc20000000f9c1429 */
/*00d0*/ ISETP.LT.AND P0, PT, RZ, c[0x0][0x160], PT; /* 0x5b181c002c1ffc1e */
/*00d8*/ IADD R12, R10, R3; /* 0xe0800000019c2832 */
/*00e0*/ I2F.F32.U32 R5, R12; /* 0xe5c00000061c2816 */
/*00e8*/ FFMA R5, R5, c[0x0][0x144], R16; /* 0x4c004000289c1416 */
/*00f0*/ @!P0 BRA 0x190; /* 0x120000004c20003c */
/*00f8*/ MOV R7, R4; /* 0xe4c03c00021c001e */
/* 0x0800b0a0a0100010 */
/*0108*/ MOV R8, R5; /* 0xe4c03c00029c0022 */
/*0110*/ PBK 0x190; /* 0x150000003c000000 */
/*0118*/ FMUL R13, R7, R7; /* 0xe3400000039c1c36 */
/*0120*/ FMUL R14, R8, R8; /* 0xe3400000041c203a */
/*0128*/ FADD R15, R14, R13; /* 0xe2c00000069c383e */
/*0130*/ FSETP.GT.AND P0, PT, R15, 4, PT; /* 0xb5a01e04001c3c1d */
/*0138*/ @P0 BRK ; /* 0x1a0000000000003c */
/* 0x080010ac809c8010 */
/*0148*/ IADD R6, R6, 0x1; /* 0xc0800000009c1819 */
/*0150*/ FADD R8, R8, R8; /* 0xe2c00000041c2022 */
/*0158*/ FADD R14, R14, -R13; /* 0xe2c10000069c383a */
/*0160*/ ISETP.LT.AND P0, PT, R6, c[0x0][0x160], PT; /* 0x5b181c002c1c181e */
/*0168*/ FFMA R7, R8, R7, R4; /* 0xcc001000039c201e */
/*0170*/ FADD R8, R5, R14; /* 0xe2c00000071c1422 */
/*0178*/ @!P0 BRK ; /* 0x1a0000000020003c */
/* 0x08b0a00010ac80b8 */
/*0188*/ BRA 0x118; /* 0x12007fffc41c003c */
/*0190*/ ISETP.GE.U32.AND P0, PT, R12, R0, PT; /* 0xdb601c00001c301e */
/*0198*/ IMAD R5, R2, c[0x0][0x150], R3; /* 0x51080c002a1c0816 */
/*01a0*/ IADD R5, R5, R10; /* 0xe0800000051c1416 */
/*01a8*/ @P0 BRA.U 0x1d8; /* 0x120000001400023c */
/*01b0*/ @!P0 MOV32I R8, 0x4; /* 0x740000000223c022 */
/*01b8*/ @!P0 IMAD R12.CC, R5, R8, c[0x0][0x168]; /* 0x910c20002d201432 */
/* 0x08b000b8b0a000a0 */
/*01c8*/ @!P0 IMAD.HI.X R13, R5, R8, c[0x0][0x16c]; /* 0x931820002da01436 */
/*01d0*/ @!P0 ST.E [R12], R6; /* 0xe480000000203018 */
/*01d8*/ IADD R3, R3, 0x20; /* 0xc0800000101c0c0d */
/*01e0*/ ISETP.LT.AND P0, PT, R3, R0, PT; /* 0xdb181c00001c0c1e */
/*01e8*/ @P0 BRA 0xb0; /* 0x12007fff6000003c */
/*01f0*/ IADD.S R2, R2, 0x1; /* 0xc080000000dc0809 */
/*01f8*/ ISETP.LT.AND P0, PT, R2, R11, PT; /* 0xdb181c00059c081e */
/* 0x0800000000b810b8 */
/*0208*/ @P0 BRA 0x68; /* 0x12007fff2c00003c */
/*0210*/ MOV RZ, RZ; /* 0xe4c03c007f9c03fe */
/*0218*/ EXIT ; /* 0x18000000001c003c */
/*0220*/ BRA 0x220; /* 0x12007ffffc1c003c */
/*0228*/ NOP; /* 0x85800000001c3c02 */
/*0230*/ NOP; /* 0x85800000001c3c02 */
/*0238*/ NOP; /* 0x85800000001c3c02 */
...................................................

View File

@@ -0,0 +1,111 @@
code for sm_35
Function : mandelbrot_scanline
.headerflags @"EF_CUDA_SM35 EF_CUDA_PTX_SM(EF_CUDA_SM35)"
/* 0x0880a010a0a01000 */
/*0008*/ MOV R1, c[0x0][0x44]; /* 0x64c03c00089c0006 */
/*0010*/ S2R R4, SR_CTAID.Y; /* 0x86400000131c0012 */
/*0018*/ MOV R6, c[0x0][0x158]; /* 0x64c03c002b1c001a */
/*0020*/ IMUL R0, R4, c[0x0][0x15c]; /* 0x61c018002b9c1002 */
/*0028*/ IADD R3, R0, c[0x0][0x15c]; /* 0x608000002b9c000e */
/*0030*/ S2R R2, SR_CTAID.X; /* 0x86400000129c000a */
/*0038*/ IMNMX R5, R3, c[0x0][0x154], PT; /* 0x61081c002a9c0c16 */
/* 0x08a010a000b010a0 */
/*0048*/ IMAD R3, R2, c[0x0][0x158], R6; /* 0x510818002b1c080e */
/*0050*/ ISETP.GE.AND P0, PT, R0, R5, PT; /* 0xdb681c00029c001e */
/*0058*/ IMNMX R3, R3, c[0x0][0x150], PT; /* 0x61081c002a1c0c0e */
/*0060*/ @P0 EXIT ; /* 0x180000000000003c */
/*0068*/ IADD R4, R4, 0x1; /* 0xc0800000009c1011 */
/*0070*/ IMUL R5, R4, c[0x0][0x15c]; /* 0x61c018002b9c1016 */
/*0078*/ LOP.PASS_B R4, RZ, ~c[0x0][0x154]; /* 0x620038002a9ffc12 */
/* 0x0800b0a01000a0a0 */
/*0088*/ LOP.PASS_B R5, RZ, ~R5; /* 0xe2003800029ffc16 */
/*0090*/ IMNMX R4, R4, R5, !PT; /* 0xe1083c00029c1012 */
/*0098*/ LOP.PASS_B R4, RZ, ~R4; /* 0xe2003800021ffc12 */
/*00a0*/ IMUL R5, R2, c[0x0][0x158]; /* 0x61c018002b1c0816 */
/*00a8*/ SSY 0x318; /* 0x1480000134000000 */
/*00b0*/ ISETP.GE.AND P0, PT, R5, R3, PT; /* 0xdb681c00019c141e */
/*00b8*/ @P0 BRA 0x310; /* 0x120000012800003c */
/* 0x08a0a00010ac8010 */
/*00c8*/ ISETP.LT.AND P0, PT, RZ, c[0x0][0x160], PT; /* 0x5b181c002c1ffc1e */
/*00d0*/ I2F.F32.S32 R6, R0; /* 0xe5c00000001ca81a */
/*00d8*/ MOV R7, c[0x0][0x148]; /* 0x64c03c00291c001e */
/*00e0*/ FFMA R6, R6, c[0x0][0x14c], R7; /* 0x4c001c00299c181a */
/*00e8*/ @P0 BRA 0x180; /* 0x120000004800003c */
/*00f0*/ S2R R7, SR_TID.X; /* 0x86400000109c001e */
/*00f8*/ LOP.AND R6, R7, 0x1f; /* 0xc20000000f9c1c19 */
/* 0x08a010a0a080b0a0 */
/*0108*/ IADD R6, R6, R5; /* 0xe0800000029c181a */
/*0110*/ ISETP.GE.AND P0, PT, R6, R3, PT; /* 0xdb681c00019c181e */
/*0118*/ @!P0 LOP32I.AND R7, R7, 0x4000001f; /* 0x202000000fa01c1c */
/*0120*/ @!P0 IMAD R6, R0, c[0x0][0x150], R5; /* 0x510814002a20001a */
/*0128*/ @!P0 IADD R6, R6, R7; /* 0xe080000003a0181a */
/*0130*/ @!P0 SHF.L R6, RZ, 0x2, R6; /* 0xb7c018000123fc19 */
/*0138*/ IADD R5, R5, 0x20; /* 0xc0800000101c1415 */
/* 0x08b8b8b0c8a0b010 */
/*0148*/ @!P0 BFE R7, R6, 0x11f; /* 0xc00800008fa0181d */
/*0150*/ @!P0 IADD R6.CC, R6, c[0x0][0x168]; /* 0x608400002d20181a */
/*0158*/ @!P0 IADD.X R7, R7, c[0x0][0x16c]; /* 0x608040002da01c1e */
/*0160*/ @!P0 ST.E [R6], RZ; /* 0xe480000000201bfc */
/*0168*/ ISETP.LT.AND P0, PT, R5, R3, PT; /* 0xdb181c00019c141e */
/*0170*/ @P0 BRA 0xf0; /* 0x12007fffbc00003c */
/*0178*/ BRA 0x310; /* 0x12000000c81c003c */
/* 0x08a0a0a010a01000 */
/*0188*/ MOV R16, c[0x0][0x140]; /* 0x64c03c00281c0042 */
/*0190*/ S2R R10, SR_TID.X; /* 0x86400000109c002a */
/*0198*/ SSY 0x2a0; /* 0x1480000080000000 */
/*01a0*/ LOP.AND R8, R10, 0x1f; /* 0xc20000000f9c2821 */
/*01a8*/ PSETP.AND.AND P2, PT, PT, PT, PT; /* 0x84801c07001dc05e */
/*01b0*/ IADD R12, R8, R5; /* 0xe0800000029c2032 */
/*01b8*/ I2F.F32.S32 R7, R12; /* 0xe5c00000061ca81e */
/* 0x0880009880108010 */
/*01c8*/ PSETP.AND.AND P3, PT, P0, PT, PT; /* 0x84801c07001c007e */
/*01d0*/ FFMA R11, R7, c[0x0][0x144], R16; /* 0x4c004000289c1c2e */
/*01d8*/ PSETP.AND.AND P1, PT, !PT, PT, PT; /* 0x84801c07001fc03e */
/*01e0*/ MOV R7, RZ; /* 0xe4c03c007f9c001e */
/*01e8*/ MOV R8, R6; /* 0xe4c03c00031c0022 */
/*01f0*/ MOV R9, R11; /* 0xe4c03c00059c0026 */
/*01f8*/ FMUL R14, R9, R9; /* 0xe3400000049c243a */
/* 0x08b0ac80b0a0a010 */
/*0208*/ FMUL R15, R8, R8; /* 0xe3400000041c203e */
/*0210*/ PSETP.AND.AND P3, PT, P2, P3, PT; /* 0x84801c03001c807e */
/*0218*/ FADD R13, R15, R14; /* 0xe2c00000071c3c36 */
/*0220*/ FSETP.GTU.AND P2, PT, R13, 4, PT; /* 0xb5e01e04001c345d */
/*0228*/ PSETP.AND.OR P1, PT, P3, P2, P1; /* 0x84810402001cc03e */
/*0230*/ PSETP.AND.AND P2, PT, !PT, PT, PT; /* 0x84801c07001fc05e */
/*0238*/ PSETP.XOR.AND P5, PT, P1, P3, PT; /* 0x84801c03101c40be */
/* 0x08ac8010b0a010b0 */
/*0248*/ @P5 PSETP.AND.AND P2, PT, P3, !P1, PT; /* 0x84801c090014c05e */
/*0250*/ @P2 IADD R7, R7, 0x1; /* 0xc080000000881c1d */
/*0258*/ @P5 FADD R13, R9, R9; /* 0xe2c0000004942436 */
/*0260*/ ISETP.LT.AND P3, PT, R7, c[0x0][0x160], PT; /* 0x5b181c002c1c1c7e */
/*0268*/ @P5 FADD R14, R14, -R15; /* 0xe2c100000794383a */
/*0270*/ PSETP.AND.AND P4, PT, P2, P3, PT; /* 0x84801c03001c809e */
/*0278*/ @P5 FFMA R8, R8, R13, R6; /* 0xcc00180006942022 */
/* 0x08a0a0800000b810 */
/*0288*/ @P5 FADD R9, R11, R14; /* 0xe2c0000007142c26 */
/*0290*/ @P4 BRA 0x1f8; /* 0x12007fffb010003c */
/*0298*/ ISETP.GE.AND.S P1, PT, R12, R3, PT; /* 0xdb681c0001dc303e */
/*02a0*/ @P1 BRA.U 0x2f0; /* 0x120000002404023c */
/*02a8*/ @!P1 LOP32I.AND R9, R10, 0x4000001f; /* 0x202000000fa42824 */
/*02b0*/ @!P1 IMAD R8, R0, c[0x0][0x150], R5; /* 0x510814002a240022 */
/*02b8*/ @!P1 IADD R8, R8, R9; /* 0xe080000004a42022 */
/* 0x08b0a000a0b010a0 */
/*02c8*/ @!P1 SHF.L R8, RZ, 0x2, R8; /* 0xb7c020000127fc21 */
/*02d0*/ @!P1 BFE R9, R8, 0x11f; /* 0xc00800008fa42025 */
/*02d8*/ @!P1 IADD R8.CC, R8, c[0x0][0x168]; /* 0x608400002d242022 */
/*02e0*/ @!P1 IADD.X R9, R9, c[0x0][0x16c]; /* 0x608040002da42426 */
/*02e8*/ @!P1 ST.E [R8], R7; /* 0xe48000000024201c */
/*02f0*/ IADD R5, R5, 0x20; /* 0xc0800000101c1415 */
/*02f8*/ ISETP.LT.AND P1, PT, R5, R3, PT; /* 0xdb181c00019c143e */
/* 0x0800b810b8b000b8 */
/*0308*/ @P1 BRA 0x190; /* 0x12007fff4004003c */
/*0310*/ IADD.S R0, R0, 0x1; /* 0xc080000000dc0001 */
/*0318*/ ISETP.NE.AND P0, PT, R0, R4, PT; /* 0xdb581c00021c001e */
/*0320*/ @P0 BRA 0xa0; /* 0x12007ffebc00003c */
/*0328*/ MOV RZ, RZ; /* 0xe4c03c007f9c03fe */
/*0330*/ EXIT ; /* 0x18000000001c003c */
/*0338*/ BRA 0x338; /* 0x12007ffffc1c003c */
....................................

View File

@@ -0,0 +1,8 @@
EXAMPLE=mandelbrot_tasks3d
CPP_SRC=mandelbrot_tasks3d.cpp mandelbrot_tasks_serial.cpp
ISPC_SRC=mandelbrot_tasks3d.ispc
ISPC_IA_TARGETS=avx,sse2,sse4
ISPC_ARM_TARGETS=neon
include ../common.mk

View File

@@ -0,0 +1,186 @@
//
// Generated by LLVM NVPTX Back-End
//
.version 3.1
.target sm_35, texmode_independent
.address_size 64
// .globl mandelbrot_scanline
// @mandelbrot_scanline
// Kernel mandelbrot_scanline: each CTA walks a tile of rows and columns and
// stores one u32 per in-range element to the buffer addressed by param_9.
// Roles of the parameters, as read off the arithmetic below:
//   param_0, param_1 : x = param_0 + param_1 * (float)column   (BB0_8)
//   param_2, param_3 : y = param_2 + param_3 * (float)row      (BB0_4)
//   param_4          : upper bound on columns, and multiplier of the row in the output index
//   param_5          : upper bound on rows
//   param_6          : number of columns per %ctaid.x step
//   param_7          : number of rows per %ctaid.y step
//   param_8          : upper bound on the inner-loop counter
//   param_9          : base address of the u32 output
// NOTE(review): presumably these are x0/dx/y0/dy/width/height/xspan/yspan/
// maxIterations/output of the original kernel source -- confirm against it.
.entry mandelbrot_scanline(
.param .f32 mandelbrot_scanline_param_0,
.param .f32 mandelbrot_scanline_param_1,
.param .f32 mandelbrot_scanline_param_2,
.param .f32 mandelbrot_scanline_param_3,
.param .u32 mandelbrot_scanline_param_4,
.param .u32 mandelbrot_scanline_param_5,
.param .u32 mandelbrot_scanline_param_6,
.param .u32 mandelbrot_scanline_param_7,
.param .u32 mandelbrot_scanline_param_8,
.param .u64 .ptr .align 4 mandelbrot_scanline_param_9
)
{
.reg .pred %p<396>;
.reg .s16 %rc<396>;
.reg .s16 %rs<396>;
.reg .s32 %r<396>;
.reg .s64 %rl<396>;
.reg .f32 %f<396>;
.reg .f64 %fl<396>;
// BB#0: // %allocas
// Row range of this CTA: %r0 = %ctaid.y * param_7 is the first row,
// %r1 = min(%ctaid.y * param_7 + param_7, param_5) is one past the last row.
ld.param.u32 %r6, [mandelbrot_scanline_param_5];
mov.u32 %r5, %ctaid.y;
ld.param.u32 %r7, [mandelbrot_scanline_param_7];
mul.lo.s32 %r0, %r5, %r7;
mad.lo.s32 %r1, %r5, %r7, %r7;
setp.lt.s32 %p0, %r1, %r6;
selp.b32 %r1, %r1, %r6, %p0;
setp.ge.s32 %p0, %r0, %r1;
// Empty row range: return immediately.
@%p0 bra BB0_13;
// BB#1: // %for_test28.preheader.lr.ph
// Column range: %r1 is reused for the first column (%ctaid.x * param_6) and
// %r3 becomes min(%ctaid.x * param_6 + param_6, param_4), one past the last column.
ld.param.f32 %f0, [mandelbrot_scanline_param_0];
mov.u32 %r2, %ctaid.x;
ld.param.u32 %r3, [mandelbrot_scanline_param_6];
mul.lo.s32 %r1, %r2, %r3;
ld.param.f32 %f1, [mandelbrot_scanline_param_1];
mad.lo.s32 %r3, %r2, %r3, %r3;
ld.param.f32 %f2, [mandelbrot_scanline_param_2];
ld.param.u32 %r2, [mandelbrot_scanline_param_4];
setp.lt.s32 %p0, %r3, %r2;
ld.param.f32 %f3, [mandelbrot_scanline_param_3];
selp.b32 %r3, %r3, %r2, %p0;
ld.param.u32 %r4, [mandelbrot_scanline_param_8];
ld.param.u64 %rl0, [mandelbrot_scanline_param_9];
// %p0 = (param_8 > 0): picks the iterating path (BB0_4) over the path that
// only stores zeros (BB0_15). %p0 stays live for the whole row loop.
setp.gt.s32 %p0, %r4, 0;
// %r5 = ~max(~param_5, ~((%ctaid.y + 1) * param_7)), i.e. the smaller of
// param_5 and (%ctaid.y + 1) * param_7. The row loop ends when %r0 equals it.
not.b32 %r6, %r6;
add.s32 %r5, %r5, 1;
mul.lo.s32 %r5, %r5, %r7;
not.b32 %r5, %r5;
setp.gt.s32 %p1, %r6, %r5;
selp.b32 %r5, %r6, %r5, %p1;
not.b32 %r5, %r5;
BB0_2: // %for_test28.preheader
// =>This Loop Header: Depth=1
// Child Loop BB0_15 Depth 2
// Child Loop BB0_8 Depth 2
// Child Loop BB0_11 Depth 3
// Row loop over %r0. Skip straight to the row increment when the column
// range is empty.
setp.ge.s32 %p1, %r1, %r3;
@%p1 bra BB0_12;
// BB#3: // %for_loop30.lr.ph
// in Loop: Header=BB0_2 Depth=1
// %r6 = row * param_4 (row part of the output index); %r7 = current column base.
mul.lo.s32 %r6, %r0, %r2;
mov.u32 %r7, %r1;
@%p0 bra BB0_4;
bra.uni BB0_15;
BB0_4: // in Loop: Header=BB0_2 Depth=1
// %f4 = param_2 + param_3 * (float)row, constant for the whole row.
cvt.rn.f32.s32 %f4, %r0;
fma.rn.f32 %f4, %f4, %f3, %f2;
mov.u32 %r7, %r1;
BB0_8: // %for_loop.i.lr.ph.us
// Parent Loop BB0_2 Depth=1
// => This Loop Header: Depth=2
// Child Loop BB0_11 Depth 3
// Column loop: this thread's column is %r11 = %r7 + (%tid.x & (WARP_SZ - 1)),
// and %f5 = param_0 + param_1 * (float)%r11.
mov.u32 %r9, %tid.x;
mov.u32 %r8, WARP_SZ;
add.s32 %r10, %r8, -1;
and.b32 %r10, %r10, %r9;
add.s32 %r11, %r10, %r7;
cvt.rn.f32.s32 %f5, %r11;
fma.rn.f32 %f5, %f5, %f1, %f0;
// Inner-loop state: %r10 = counter (starts at 0), %f7/%f6 = running values
// seeded from %f5/%f4, %p3 and %p4 = "still iterating" flags, %p2 = "has
// exceeded the threshold" flag, %p1 = constant false.
mov.u32 %r10, 0;
mov.pred %p1, 0;
mov.pred %p3, -1;
mov.pred %p4, %p0;
mov.pred %p2, %p1;
mov.f32 %f7, %f5;
mov.f32 %f6, %f4;
BB0_11: // %for_loop.i.us
// Parent Loop BB0_2 Depth=1
// Parent Loop BB0_8 Depth=2
// => This Inner Loop Header: Depth=3
// Test %f7*%f7 + %f6*%f6 against 0f40800000 (4.0f) with a gtu compare and
// latch the result into %p2; %p5 is set only when the thread must take
// another update step (BB0_9).
and.pred %p4, %p3, %p4;
mul.f32 %f8, %f7, %f7;
fma.rn.f32 %f9, %f6, %f6, %f8;
setp.gtu.f32 %p3, %f9, 0f40800000;
and.pred %p3, %p4, %p3;
or.pred %p2, %p3, %p2;
xor.pred %p5, %p2, %p4;
mov.pred %p3, %p1;
@!%p5 bra BB0_10;
bra.uni BB0_9;
BB0_9: // %not_all_continued_or_breaked.i.us
// in Loop: Header=BB0_11 Depth=3
// Update step: new %f7 = %f5 + (%f7*%f7 - %f6*%f6),
//              new %f6 = %f4 + %f6 * (2 * old %f7).
// %p3 = %p4 && !%p2 marks the thread as still iterating.
mul.f32 %f9, %f6, %f6;
not.pred %p3, %p2;
and.pred %p3, %p4, %p3;
sub.f32 %f8, %f8, %f9;
add.f32 %f8, %f5, %f8;
add.f32 %f7, %f7, %f7;
fma.rn.f32 %f6, %f6, %f7, %f4;
mov.f32 %f7, %f8;
BB0_10: // %for_step.i.us
// in Loop: Header=BB0_11 Depth=3
// Increment the counter only while %p3 holds, and loop again only while
// %p3 holds and the counter is below param_8.
add.s32 %r12, %r10, 1;
selp.b32 %r10, %r12, %r10, %p3;
setp.lt.s32 %p4, %r10, %r4;
and.pred %p5, %p3, %p4;
@%p5 bra BB0_11;
// BB#5: // %mandel___vyfvyfvyi.exit.us
// in Loop: Header=BB0_8 Depth=2
// Columns at or beyond the clamped end (%r3) are not stored.
setp.ge.s32 %p1, %r11, %r3;
@%p1 bra BB0_7;
// BB#6: // %if_then.us
// in Loop: Header=BB0_8 Depth=2
// Element index = %r7 + %r6 + (%tid.x & (WARP_SZ + 1073741823)); it is scaled
// by 4 bytes, sign-extended to 64 bits and added to param_9, then the
// counter %r10 is stored there.
add.s32 %r11, %r8, 1073741823;
and.b32 %r9, %r11, %r9;
add.s32 %r11, %r7, %r6;
add.s32 %r9, %r11, %r9;
shl.b32 %r9, %r9, 2;
cvt.s64.s32 %rl1, %r9;
add.s64 %rl1, %rl1, %rl0;
st.u32 [%rl1], %r10;
BB0_7: // %if_exit.us
// in Loop: Header=BB0_8 Depth=2
// Advance the column base by WARP_SZ and repeat while it is below %r3.
add.s32 %r7, %r8, %r7;
setp.lt.s32 %p1, %r7, %r3;
@%p1 bra BB0_8;
bra.uni BB0_12;
BB0_15: // %mandel___vyfvyfvyi.exit
// Parent Loop BB0_2 Depth=1
// => This Inner Loop Header: Depth=2
// Column loop taken when param_8 <= 0: no iteration is performed and every
// in-range element receives 0.
mov.u32 %r9, %tid.x;
mov.u32 %r8, WARP_SZ;
add.s32 %r10, %r8, -1;
and.b32 %r10, %r10, %r9;
add.s32 %r10, %r10, %r7;
setp.lt.s32 %p1, %r10, %r3;
@%p1 bra BB0_16;
bra.uni BB0_14;
BB0_16: // %if_then
// in Loop: Header=BB0_15 Depth=2
// Same address computation as in BB#6, storing the constant 0.
add.s32 %r10, %r8, 1073741823;
and.b32 %r9, %r10, %r9;
add.s32 %r10, %r7, %r6;
add.s32 %r9, %r10, %r9;
shl.b32 %r9, %r9, 2;
cvt.s64.s32 %rl1, %r9;
add.s64 %rl1, %rl1, %rl0;
mov.u32 %r9, 0;
st.u32 [%rl1], %r9;
BB0_14: // %if_exit
// in Loop: Header=BB0_15 Depth=2
add.s32 %r7, %r8, %r7;
setp.lt.s32 %p1, %r7, %r3;
@%p1 bra BB0_15;
BB0_12: // %for_exit31
// in Loop: Header=BB0_2 Depth=1
// Next row; the kernel returns once %r0 reaches the clamped row end in %r5.
add.s32 %r0, %r0, 1;
setp.eq.s32 %p1, %r0, %r5;
@%p1 bra BB0_13;
bra.uni BB0_2;
BB0_13: // %for_exit
ret;
}

Binary file not shown.

View File

@@ -0,0 +1,321 @@
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <algorithm>
#include <string.h>
#include <cuda.h>
#include <vector>
#include <cassert>
#include "drvapi_error_string.h"
#define checkCudaErrors(err) __checkCudaErrors (err, __FILE__, __LINE__)
// These are the inline versions for all of the SDK helper functions
// Abort the process with a diagnostic when a CUDA Driver API call fails.
//   err  - result code returned by the driver call
//   file - source file of the call site (supplied by the checkCudaErrors macro)
//   line - source line of the call site (supplied by the checkCudaErrors macro)
// Returns normally only when err is CUDA_SUCCESS; otherwise prints to
// std::cerr and calls exit(-1).
void __checkCudaErrors(CUresult err, const char *file, const int line) {
    if (err == CUDA_SUCCESS)
        return;
    // Message layout: numeric code, quoted description, then <file>, line N.
    std::cerr << "checkCudaErrors() Driver API error = " << err << " \""
              << getCudaDrvErrorString(err) << "\" from file <" << file
              << ">, line " << line << "\n";
    exit(-1);
}
/**********************/
/* Basic CUDriver API */
CUcontext context;
// Initialise the CUDA driver, pick a device and create the driver context
// stored in the file-level `context` handle.
//   deviceId - requested device ordinal; any value outside [0, devCount)
//              falls back to device 0.
// Exits the process when no device is present or when the chosen device has
// a compute capability major version below 2.
void createContext(const int deviceId = 0)
{
    CUdevice device;
    int devCount = 0;
    checkCudaErrors(cuInit(0));
    checkCudaErrors(cuDeviceGetCount(&devCount));
    // Runtime check rather than assert(), so it also holds in NDEBUG builds.
    if (devCount <= 0) {
        std::cerr << "ERROR: no CUDA device found\n";
        exit(1);
    }

    // Negative ordinals are rejected here as well instead of being passed on.
    const int ordinal = (deviceId >= 0 && deviceId < devCount) ? deviceId : 0;
    checkCudaErrors(cuDeviceGet(&device, ordinal));

    char name[128];
    checkCudaErrors(cuDeviceGetName(name, (int)sizeof(name), device));
    // Report the ordinal actually in use, not a hard-coded 0.
    std::cout << "Using CUDA Device [" << ordinal << "]: " << name << "\n";

    int devMajor, devMinor;
    checkCudaErrors(cuDeviceComputeCapability(&devMajor, &devMinor, device));
    std::cout << "Device Compute Capability: "
              << devMajor << "." << devMinor << "\n";
    if (devMajor < 2) {
        std::cerr << "ERROR: Device " << ordinal << " is not SM 2.0 or greater\n";
        exit(1);
    }

    // Create driver context
    checkCudaErrors(cuCtxCreate(&context, 0, device));
}
void destroyContext()
{
checkCudaErrors(cuCtxDestroy(context));
}
// Load a module image from memory via cuModuleLoadData and return its handle.
//   module - pointer to the in-memory module image
// Any driver error terminates the process through checkCudaErrors.
CUmodule loadModule(const char * module)
{
    CUmodule handle;
    const CUresult status = cuModuleLoadData(&handle, module);
    checkCudaErrors(status);
    return handle;
}
// Unload a module previously obtained from loadModule().
void unloadModule(CUmodule &cudaModule)
{
    const CUresult status = cuModuleUnload(cudaModule);
    checkCudaErrors(status);
}
// Look up the function named `function` inside `cudaModule` and return its handle.
// Any driver error terminates the process through checkCudaErrors.
CUfunction getFunction(CUmodule &cudaModule, const char * function)
{
    CUfunction entry;
    const CUresult status = cuModuleGetFunction(&entry, cudaModule, function);
    checkCudaErrors(status);
    return entry;
}
// Allocate `size` bytes with cuMemAlloc and return the resulting device pointer.
// Any driver error terminates the process through checkCudaErrors.
CUdeviceptr deviceMalloc(const size_t size)
{
    CUdeviceptr allocation;
    const CUresult status = cuMemAlloc(&allocation, size);
    checkCudaErrors(status);
    return allocation;
}
// Release a device allocation with cuMemFree.
void deviceFree(CUdeviceptr d_buf)
{
    const CUresult status = cuMemFree(d_buf);
    checkCudaErrors(status);
}
void memcpyD2H(void * h_buf, CUdeviceptr d_buf, const size_t size)
{
checkCudaErrors(cuMemcpyDtoH(h_buf, d_buf, size));
}
// Copy `size` bytes from host memory `h_buf` into device memory `d_buf`.
void memcpyH2D(CUdeviceptr d_buf, void * h_buf, const size_t size)
{
    const CUresult status = cuMemcpyHtoD(d_buf, h_buf, size);
    checkCudaErrors(status);
}
// Launch kernel `func` on a grid of (nbx, nby, nbz) blocks and route the
// result through checkCudaErrors.
// Per the cuLaunchKernel argument order, the fixed arguments are: a block of
// 32 x 1 x 1 threads, 0 bytes of dynamic shared memory, the NULL stream,
// `params` as the kernel-parameter array, and NULL for `extra`.
// NOTE(review): the expansion already ends in ';', so `deviceLaunch(...);`
// yields an extra empty statement and the macro cannot be used as the sole
// body of an `if` that has an `else`. Left unchanged because call sites
// outside this view may rely on it.
#define deviceLaunch(func,nbx,nby,nbz,params) \
checkCudaErrors( \
cuLaunchKernel( \
(func), \
(nbx), (nby), (nbz), \
32, 1, 1, \
0, NULL, (params), NULL \
));
typedef CUdeviceptr devicePtr;
/**************/
extern "C"
{
#if 0
// Disabled sketch of a name-keyed cache of loaded CUDA modules.
// Enabling it additionally requires <map> and <string> to be included.
struct ModuleManager
{
  private:
    typedef std::pair<std::string, CUmodule> ModulePair;
    typedef std::map <std::string, CUmodule> ModuleMap;
    ModuleMap module_list;

    // Find the cache entry for `module_name`, or module_list.end() if absent.
    ModuleMap::iterator findModule(const char * module_name)
    {
      return module_list.find(std::string(module_name));
    }

  public:
    // Return the cached module for `module_name`, loading it from
    // `module_data` and caching it on first use.
    CUmodule loadModule(const char * module_name, const char * module_data)
    {
      const ModuleMap::iterator it = findModule(module_name);
      if (it != module_list.end())
        return it->second;
      // Qualified call: the member of the same name hides the free function.
      CUmodule cudaModule = ::loadModule(module_data);
      module_list.insert(ModulePair(std::string(module_name), cudaModule));
      return cudaModule;
    }

    // Unload the module cached under `module_name`, if any, and forget it.
    void unloadModule(const char * module_name)
    {
      ModuleMap::iterator it = findModule(module_name);
      if (it != module_list.end())
      {
        ::unloadModule(it->second);
        module_list.erase(it);
      }
    }
};
#endif
// Stub: ignores all three arguments and always returns NULL.
void *CUDAAlloc(void **handlePtr, int64_t size, int32_t alignment)
{
    (void)handlePtr;
    (void)size;
    (void)alignment;
    return NULL;
}
void CUDALaunch(
void **handlePtr,
const char * module_name,
const char * module,
const char * func_name,
void **func_args,
int countx, int county, int countz)
{
CUmodule cudaModule = loadModule(module);
CUfunction cudaFunction = getFunction(cudaModule, func_name);
deviceLaunch(cudaFunction, countx, county, countz, func_args);
unloadModule(cudaModule);
}
// Waits for stream 0 with cuStreamSynchronize(0); `handle` is unused.
// The driver status is handed to checkCudaErrors().
void CUDASync(void *handle)
{
checkCudaErrors(cuStreamSynchronize(0));
}
// Stub: does nothing; `handle` is ignored.
void CUDAFree(void *handle)
{
}
}
/********************/
/* Write a PPM image file with the image of the Mandelbrot set.
 *
 * buf     width*height iteration counts, one per pixel
 * width   image width in pixels
 * height  image height in pixels
 * fn      path of the file to create
 *
 * Every pixel is written as three identical bytes (a grey level).  If the
 * file cannot be opened or written, a message is printed to stderr and the
 * function returns without touching a NULL stream. */
static void
writePPM(int *buf, int width, int height, const char *fn)
{
    FILE *fp = fopen(fn, "wb");
    if (fp == NULL) {
        fprintf(stderr, "Error: cannot open %s for writing\n", fn);
        return;
    }
    fprintf(fp, "P6\n");
    fprintf(fp, "%d %d\n", width, height);
    fprintf(fp, "255\n");
    const long long npixels = (long long)width * height;
    for (long long i = 0; i < npixels; ++i) {
        // Map the iteration count to colors by just alternating between
        // two greys.  unsigned char: 240 does not fit a signed char.
        const unsigned char c = (buf[i] & 0x1) ? 240 : 20;
        for (int j = 0; j < 3; ++j)
            fputc(c, fp);
    }
    const int writeFailed = ferror(fp);
    if (fclose(fp) != 0 || writeFailed) {
        fprintf(stderr, "Error: failed to write %s\n", fn);
        return;
    }
    printf("Wrote image file %s\n", fn);
}
// Reads the whole file `filename` in binary mode and returns its bytes.
// An empty file yields an empty vector.  On any failure (file missing,
// size query fails, short read) a message is printed to stderr and the
// process exits with status 1.  The stream is always closed.
std::vector<char> readBinary(const char * filename)
{
    FILE *fp = fopen(filename, "rb");
    if (fp == NULL)
    {
        // exit() rather than assert(0): the check must survive NDEBUG builds.
        fprintf(stderr, "file %s not found\n", filename);
        exit(1);
    }

    /* calc the size needed */
    long fileSize = -1;
    if (fseek(fp, 0, SEEK_END) == 0)
        fileSize = ftell(fp);
    if (fileSize < 0 || fseek(fp, 0, SEEK_SET) != 0)
    {
        fprintf(stderr, "Error: There was an Error reading the file %s \n", filename);
        fclose(fp);
        exit(1);
    }

    std::vector<char> buffer(static_cast<size_t>(fileSize));
    // &buffer[0] is only valid for a non-empty vector.
    const size_t nread =
        buffer.empty() ? 0 : fread(&buffer[0], sizeof(char), buffer.size(), fp);
    fclose(fp);
    if (nread != buffer.size())
    {
        /* count of read bytes != calculated size of the file -> ERROR */
        fprintf(stderr, "Error: There was an Error reading the file %s \n", filename);
        exit(1);
    }

    fprintf(stderr, " read buffer of size= %d bytes \n", (int)buffer.size());
    return buffer;
}
// Prints the command-line synopsis to stderr and terminates the process
// with exit status 1.  Never returns.
static void usage()
{
    fputs("usage: mandelbrot [--scale=<factor>]\n", stderr);
    exit(1);
}
extern "C"
// Host-side driver for the "mandelbrot_scanline" kernel.
//
// x0,y0 / x1,y1   corners of the region of the complex plane to render
// width, height   image size in pixels
// maxIterations   iteration limit passed to the kernel
// output          device address of the result buffer (passed by address as
//                 the last kernel parameter)
//
// The image is split into xspan x yspan tiles, one thread block per tile.
// The PTX text is read from "cuLaunch.ptx", launched through CUDALaunch()
// and waited for with CUDASync().
void mandelbrot_ispc(
        float x0, float y0,
        float x1, float y1,
        int width, int height,
        int maxIterations, int output[])
{
    float dx = (x1 - x0) / width;
    float dy = (y1 - y0) / height;
    int xspan = 16; /* make sure it is big enough to avoid false-sharing */
    int yspan = 4;
    // Round up so that a trailing partial tile still gets a block and a small
    // image never yields a zero-sized grid.
    // NOTE(review): this relies on the kernel clamping each tile to
    // width/height -- confirm for the module that is actually loaded.
    const int nbx = (width  + xspan - 1) / xspan;
    const int nby = (height + yspan - 1) / yspan;
    const int nbz = 1;
    fprintf(stderr ," nbx= %d nby= %d nbtot= %d \n", nbx, nby, nbx*nby);
#if 0
    launch [nbx,nby]
        mandelbrot_scanline(x0, dx, y0, dy, width, height, xspan, yspan,
                maxIterations, output);
#endif
    // const std::vector<char> cubin = readBinary("cuLaunch.cubin");
    std::vector<char> ptx = readBinary("cuLaunch.ptx");
    // cuModuleLoadData() takes PTX as a NUL-terminated string; the raw file
    // contents carry no terminator.
    ptx.push_back('\0');
    void *params[] = {&x0, &dx, &y0, &dy, &width, &height, &xspan, &yspan, &maxIterations, &output};
    CUDALaunch(
            NULL,                   // void **handlePtr
            "module_01",            // const char * module_name
            &ptx[0],                // const char * module
            "mandelbrot_scanline",  // const char * func_name
            params,                 // void **func_args
            nbx, nby, nbz);         // int countx, int county, int countz
    CUDASync(NULL);
}
// Entry point: renders the Mandelbrot set on the GPU and writes it to
// "mandelbrot-cuda.ppm".
//
// Accepts an optional `--scale=<factor>` argument that multiplies the
// default 1536 x 1024 image size (rounded up to multiples of 16).  Any other
// argument, or a factor that is not a positive number, prints the usage
// text and exits.
int main(int argc, char *argv[])
{
    unsigned int width = 1536;
    unsigned int height = 1024;
    float x0 = -2;
    float x1 = 1;
    float y0 = -1;
    float y1 = 1;

    if (argc == 2) {
        if (strncmp(argv[1], "--scale=", 8) != 0)
            usage();
        const float scale = atof(argv[1] + 8);
        // Rejects zero, negative values and NaN: converting a negative float
        // to unsigned is undefined.
        if (!(scale > 0.f))
            usage();
        width *= scale;
        height *= scale;
        // round up to multiples of 16
        width = (width + 0xf) & ~0xf;
        height = (height + 0xf) & ~0xf;
        // A tiny factor can collapse a dimension to zero.
        if (width == 0 || height == 0)
            usage();
    }
    else if (argc != 1)
        usage();

    /*******************/
    createContext();
    /*******************/

    int maxIterations = 512;
    const size_t npixels = (size_t)width * height;
    // Zero-initialised host image; released automatically on return.
    std::vector<int> h_buf(npixels, 0);
    const size_t bufsize = sizeof(int)*npixels;
    devicePtr d_buf = deviceMalloc(bufsize);
    memcpyH2D(d_buf, &h_buf[0], bufsize);
    mandelbrot_ispc(x0,y0,x1,y1,width, height, maxIterations, (int*)d_buf);
    memcpyD2H(&h_buf[0], d_buf, bufsize);
    deviceFree(d_buf);
    writePPM(&h_buf[0], width, height, "mandelbrot-cuda.ppm");

    /*******************/
    destroyContext();
    /*******************/
    return 0;
}

Binary file not shown.

View File

@@ -0,0 +1,186 @@
//
// Generated by LLVM NVPTX Back-End
//
.version 3.1
.target sm_35, texmode_independent
.address_size 64
// .globl mandelbrot_scanline
// @mandelbrot_scanline
// Kernel entry point.  Each thread block handles one rectangular tile of
// the image: columns [ctaid.x*param_6, min(.. + param_6, param_4)) and rows
// [ctaid.y*param_7, min(.. + param_7, param_5)).  For every pixel it stores
// one 32-bit iteration count at param_9 + 4*(row*param_4 + column).
// Presumably the parameters are, in order, x0, dx, y0, dy, width, height,
// xspan, yspan, maxIterations, output -- TODO confirm against the launcher.
.entry mandelbrot_scanline(
.param .f32 mandelbrot_scanline_param_0,
.param .f32 mandelbrot_scanline_param_1,
.param .f32 mandelbrot_scanline_param_2,
.param .f32 mandelbrot_scanline_param_3,
.param .u32 mandelbrot_scanline_param_4,
.param .u32 mandelbrot_scanline_param_5,
.param .u32 mandelbrot_scanline_param_6,
.param .u32 mandelbrot_scanline_param_7,
.param .u32 mandelbrot_scanline_param_8,
.param .u64 .ptr .align 4 mandelbrot_scanline_param_9
)
{
.reg .pred %p<396>;
.reg .s16 %rc<396>;
.reg .s16 %rs<396>;
.reg .s32 %r<396>;
.reg .s64 %rl<396>;
.reg .f32 %f<396>;
.reg .f64 %fl<396>;
// BB#0: // %allocas
// Row range of this block: %r0 = ctaid.y*param_7 (first row),
// %r1 = min(%r0 + param_7, param_5).  Exit when the range is empty.
ld.param.u32 %r6, [mandelbrot_scanline_param_5];
mov.u32 %r5, %ctaid.y;
ld.param.u32 %r7, [mandelbrot_scanline_param_7];
mul.lo.s32 %r0, %r5, %r7;
mad.lo.s32 %r1, %r5, %r7, %r7;
setp.lt.s32 %p0, %r1, %r6;
selp.b32 %r1, %r1, %r6, %p0;
setp.ge.s32 %p0, %r0, %r1;
@%p0 bra BB0_13;
// BB#1: // %for_test28.preheader.lr.ph
// Column range: %r1 = ctaid.x*param_6 (first column),
// %r3 = min(%r1 + param_6, param_4).  %p0 = (param_8 > 0).
// %r5 ends up as min(param_5, (ctaid.y+1)*param_7), the row-loop end bound,
// computed as ~max(~a, ~b).
ld.param.f32 %f0, [mandelbrot_scanline_param_0];
mov.u32 %r2, %ctaid.x;
ld.param.u32 %r3, [mandelbrot_scanline_param_6];
mul.lo.s32 %r1, %r2, %r3;
ld.param.f32 %f1, [mandelbrot_scanline_param_1];
mad.lo.s32 %r3, %r2, %r3, %r3;
ld.param.f32 %f2, [mandelbrot_scanline_param_2];
ld.param.u32 %r2, [mandelbrot_scanline_param_4];
setp.lt.s32 %p0, %r3, %r2;
ld.param.f32 %f3, [mandelbrot_scanline_param_3];
selp.b32 %r3, %r3, %r2, %p0;
ld.param.u32 %r4, [mandelbrot_scanline_param_8];
ld.param.u64 %rl0, [mandelbrot_scanline_param_9];
setp.gt.s32 %p0, %r4, 0;
not.b32 %r6, %r6;
add.s32 %r5, %r5, 1;
mul.lo.s32 %r5, %r5, %r7;
not.b32 %r5, %r5;
setp.gt.s32 %p1, %r6, %r5;
selp.b32 %r5, %r6, %r5, %p1;
not.b32 %r5, %r5;
BB0_2: // %for_test28.preheader
// =>This Loop Header: Depth=1
// Child Loop BB0_15 Depth 2
// Child Loop BB0_8 Depth 2
// Child Loop BB0_11 Depth 3
// Row loop (%r0 = current row).  Skip the row if the column range is empty.
setp.ge.s32 %p1, %r1, %r3;
@%p1 bra BB0_12;
// BB#3: // %for_loop30.lr.ph
// in Loop: Header=BB0_2 Depth=1
// %r6 = row * param_4 (element offset of the row start); %r7 = column cursor.
// Take the iterating path only when param_8 > 0.
mul.lo.s32 %r6, %r0, %r2;
mov.u32 %r7, %r1;
@%p0 bra BB0_4;
bra.uni BB0_15;
BB0_4: // in Loop: Header=BB0_2 Depth=1
// %f4 = param_2 + row * param_3
cvt.rn.f32.s32 %f4, %r0;
fma.rn.f32 %f4, %f4, %f3, %f2;
mov.u32 %r7, %r1;
BB0_8: // %for_loop.i.lr.ph.us
// Parent Loop BB0_2 Depth=1
// => This Loop Header: Depth=2
// Child Loop BB0_11 Depth 3
// Column loop, advancing WARP_SZ columns per trip.
// %r11 = %r7 + (tid.x & (WARP_SZ-1)) is this thread's column;
// %f5 = param_0 + %r11 * param_1.  %r10 (iteration count) starts at 0,
// %f7/%f6 start at %f5/%f4.
mov.u32 %r9, %tid.x;
mov.u32 %r8, WARP_SZ;
add.s32 %r10, %r8, -1;
and.b32 %r10, %r10, %r9;
add.s32 %r11, %r10, %r7;
cvt.rn.f32.s32 %f5, %r11;
fma.rn.f32 %f5, %f5, %f1, %f0;
mov.u32 %r10, 0;
mov.pred %p1, 0;
mov.pred %p3, -1;
mov.pred %p4, %p0;
mov.pred %p2, %p1;
mov.f32 %f7, %f5;
mov.f32 %f6, %f4;
BB0_11: // %for_loop.i.us
// Parent Loop BB0_2 Depth=1
// Parent Loop BB0_8 Depth=2
// => This Inner Loop Header: Depth=3
// Iteration loop: %f9 = %f7*%f7 + %f6*%f6 is tested against
// 0f40800000 (4.0); gtu is also true for an unordered (NaN) compare.
and.pred %p4, %p3, %p4;
mul.f32 %f8, %f7, %f7;
fma.rn.f32 %f9, %f6, %f6, %f8;
setp.gtu.f32 %p3, %f9, 0f40800000;
and.pred %p3, %p4, %p3;
or.pred %p2, %p3, %p2;
xor.pred %p5, %p2, %p4;
mov.pred %p3, %p1;
@!%p5 bra BB0_10;
bra.uni BB0_9;
BB0_9: // %not_all_continued_or_breaked.i.us
// in Loop: Header=BB0_11 Depth=3
// Update step: new %f7 = %f5 + (%f7^2 - %f6^2); new %f6 = %f4 + 2*%f7*%f6.
mul.f32 %f9, %f6, %f6;
not.pred %p3, %p2;
and.pred %p3, %p4, %p3;
sub.f32 %f8, %f8, %f9;
add.f32 %f8, %f5, %f8;
add.f32 %f7, %f7, %f7;
fma.rn.f32 %f6, %f6, %f7, %f4;
mov.f32 %f7, %f8;
BB0_10: // %for_step.i.us
// in Loop: Header=BB0_11 Depth=3
// Count the iteration only while %p3 holds; repeat while %p3 and %r10 < param_8.
add.s32 %r12, %r10, 1;
selp.b32 %r10, %r12, %r10, %p3;
setp.lt.s32 %p4, %r10, %r4;
and.pred %p5, %p3, %p4;
@%p5 bra BB0_11;
// BB#5: // %mandel___vyfvyfvyi.exit.us
// in Loop: Header=BB0_8 Depth=2
// Threads whose column lies at or beyond the column bound skip the store.
setp.ge.s32 %p1, %r11, %r3;
@%p1 bra BB0_7;
// BB#6: // %if_then.us
// in Loop: Header=BB0_8 Depth=2
// Store %r10 at param_9 + 4*(%r6 + %r7 + (tid.x & (WARP_SZ + 0x3fffffff))).
// The byte offset is formed in 32 bits (shl.b32) before the widening to 64.
add.s32 %r11, %r8, 1073741823;
and.b32 %r9, %r11, %r9;
add.s32 %r11, %r7, %r6;
add.s32 %r9, %r11, %r9;
shl.b32 %r9, %r9, 2;
cvt.s64.s32 %rl1, %r9;
add.s64 %rl1, %rl1, %rl0;
st.u32 [%rl1], %r10;
BB0_7: // %if_exit.us
// in Loop: Header=BB0_8 Depth=2
// Advance the column cursor by WARP_SZ.
add.s32 %r7, %r8, %r7;
setp.lt.s32 %p1, %r7, %r3;
@%p1 bra BB0_8;
bra.uni BB0_12;
BB0_15: // %mandel___vyfvyfvyi.exit
// Parent Loop BB0_2 Depth=1
// => This Inner Loop Header: Depth=2
// Column loop taken when param_8 <= 0: no iteration is run and every
// in-range pixel is written as 0.
mov.u32 %r9, %tid.x;
mov.u32 %r8, WARP_SZ;
add.s32 %r10, %r8, -1;
and.b32 %r10, %r10, %r9;
add.s32 %r10, %r10, %r7;
setp.lt.s32 %p1, %r10, %r3;
@%p1 bra BB0_16;
bra.uni BB0_14;
BB0_16: // %if_then
// in Loop: Header=BB0_15 Depth=2
add.s32 %r10, %r8, 1073741823;
and.b32 %r9, %r10, %r9;
add.s32 %r10, %r7, %r6;
add.s32 %r9, %r10, %r9;
shl.b32 %r9, %r9, 2;
cvt.s64.s32 %rl1, %r9;
add.s64 %rl1, %rl1, %rl0;
mov.u32 %r9, 0;
st.u32 [%rl1], %r9;
BB0_14: // %if_exit
// in Loop: Header=BB0_15 Depth=2
add.s32 %r7, %r8, %r7;
setp.lt.s32 %p1, %r7, %r3;
@%p1 bra BB0_15;
BB0_12: // %for_exit31
// in Loop: Header=BB0_2 Depth=1
// Next row; leave once the row-loop end bound %r5 is reached.
add.s32 %r0, %r0, 1;
setp.eq.s32 %p1, %r0, %r5;
@%p1 bra BB0_13;
bra.uni BB0_2;
BB0_13: // %for_exit
ret;
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,370 @@
/*
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#ifndef _DRVAPI_ERROR_STRING_H_
#define _DRVAPI_ERROR_STRING_H_
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
// Error Code string definitions here
// One row of an error lookup table: a printable error name together with
// the numeric code it stands for.
typedef struct
{
    const char *error_string;   // printable name of the error
    int         error_id;       // numeric error code
} s_CudaErrorStr;
/**
* Error codes
*/
// Lookup table pairing each driver error code with its printable name.
// The list is closed by a { NULL, -1 } sentinel row.
static s_CudaErrorStr sCudaDrvErrorString[] =
{
/**
* The API call returned with no errors. In the case of query calls, this
* can also mean that the operation being queried is complete (see
* ::cuEventQuery() and ::cuStreamQuery()).
*/
{ "CUDA_SUCCESS", 0 },
/**
* This indicates that one or more of the parameters passed to the API call
* is not within an acceptable range of values.
*/
{ "CUDA_ERROR_INVALID_VALUE", 1 },
/**
* The API call failed because it was unable to allocate enough memory to
* perform the requested operation.
*/
{ "CUDA_ERROR_OUT_OF_MEMORY", 2 },
/**
* This indicates that the CUDA driver has not been initialized with
* ::cuInit() or that initialization has failed.
*/
{ "CUDA_ERROR_NOT_INITIALIZED", 3 },
/**
* This indicates that the CUDA driver is in the process of shutting down.
*/
{ "CUDA_ERROR_DEINITIALIZED", 4 },
/**
* This indicates profiling APIs are called while application is running
* in visual profiler mode.
*/
{ "CUDA_ERROR_PROFILER_DISABLED", 5 },
/**
* This indicates profiling has not been initialized for this context.
* Call cuProfilerInitialize() to resolve this.
*/
{ "CUDA_ERROR_PROFILER_NOT_INITIALIZED", 6 },
/**
* This indicates profiler has already been started and probably
* cuProfilerStart() is incorrectly called.
*/
{ "CUDA_ERROR_PROFILER_ALREADY_STARTED", 7 },
/**
* This indicates profiler has already been stopped and probably
* cuProfilerStop() is incorrectly called.
*/
{ "CUDA_ERROR_PROFILER_ALREADY_STOPPED", 8 },
/**
* This indicates that no CUDA-capable devices were detected by the installed
* CUDA driver.
*/
{ "CUDA_ERROR_NO_DEVICE (no CUDA-capable devices were detected)", 100 },
/**
* This indicates that the device ordinal supplied by the user does not
* correspond to a valid CUDA device.
*/
{ "CUDA_ERROR_INVALID_DEVICE (device specified is not a valid CUDA device)", 101 },
/**
* This indicates that the device kernel image is invalid. This can also
* indicate an invalid CUDA module.
*/
{ "CUDA_ERROR_INVALID_IMAGE", 200 },
/**
* This most frequently indicates that there is no context bound to the
* current thread. This can also be returned if the context passed to an
* API call is not a valid handle (such as a context that has had
* ::cuCtxDestroy() invoked on it). This can also be returned if a user
* mixes different API versions (i.e. 3010 context with 3020 API calls).
* See ::cuCtxGetApiVersion() for more details.
*/
{ "CUDA_ERROR_INVALID_CONTEXT", 201 },
/**
* This indicates that the context being supplied as a parameter to the
* API call was already the active context.
* \deprecated
* This error return is deprecated as of CUDA 3.2. It is no longer an
* error to attempt to push the active context via ::cuCtxPushCurrent().
*/
{ "CUDA_ERROR_CONTEXT_ALREADY_CURRENT", 202 },
/**
* This indicates that a map or register operation has failed.
*/
{ "CUDA_ERROR_MAP_FAILED", 205 },
/**
* This indicates that an unmap or unregister operation has failed.
*/
{ "CUDA_ERROR_UNMAP_FAILED", 206 },
/**
* This indicates that the specified array is currently mapped and thus
* cannot be destroyed.
*/
{ "CUDA_ERROR_ARRAY_IS_MAPPED", 207 },
/**
* This indicates that the resource is already mapped.
*/
{ "CUDA_ERROR_ALREADY_MAPPED", 208 },
/**
* This indicates that there is no kernel image available that is suitable
* for the device. This can occur when a user specifies code generation
* options for a particular CUDA source file that do not include the
* corresponding device configuration.
*/
{ "CUDA_ERROR_NO_BINARY_FOR_GPU", 209 },
/**
* This indicates that a resource has already been acquired.
*/
{ "CUDA_ERROR_ALREADY_ACQUIRED", 210 },
/**
* This indicates that a resource is not mapped.
*/
{ "CUDA_ERROR_NOT_MAPPED", 211 },
/**
* This indicates that a mapped resource is not available for access as an
* array.
*/
{ "CUDA_ERROR_NOT_MAPPED_AS_ARRAY", 212 },
/**
* This indicates that a mapped resource is not available for access as a
* pointer.
*/
{ "CUDA_ERROR_NOT_MAPPED_AS_POINTER", 213 },
/**
* This indicates that an uncorrectable ECC error was detected during
* execution.
*/
{ "CUDA_ERROR_ECC_UNCORRECTABLE", 214 },
/**
* This indicates that the ::CUlimit passed to the API call is not
* supported by the active device.
*/
{ "CUDA_ERROR_UNSUPPORTED_LIMIT", 215 },
/**
* This indicates that the ::CUcontext passed to the API call can
* only be bound to a single CPU thread at a time but is already
* bound to a CPU thread.
*/
{ "CUDA_ERROR_CONTEXT_ALREADY_IN_USE", 216 },
/**
* This indicates that peer access is not supported across the given
* devices.
*/
{ "CUDA_ERROR_PEER_ACCESS_UNSUPPORTED", 217},
/**
* This indicates that the device kernel source is invalid.
*/
{ "CUDA_ERROR_INVALID_SOURCE", 300 },
/**
* This indicates that the file specified was not found.
*/
{ "CUDA_ERROR_FILE_NOT_FOUND", 301 },
/**
* This indicates that a link to a shared object failed to resolve.
*/
{ "CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND", 302 },
/**
* This indicates that initialization of a shared object failed.
*/
{ "CUDA_ERROR_SHARED_OBJECT_INIT_FAILED", 303 },
/**
* This indicates that an OS call failed.
*/
{ "CUDA_ERROR_OPERATING_SYSTEM", 304 },
/**
* This indicates that a resource handle passed to the API call was not
* valid. Resource handles are opaque types like ::CUstream and ::CUevent.
*/
{ "CUDA_ERROR_INVALID_HANDLE", 400 },
/**
* This indicates that a named symbol was not found. Examples of symbols
* are global/constant variable names, texture names, and surface names.
*/
{ "CUDA_ERROR_NOT_FOUND", 500 },
/**
* This indicates that asynchronous operations issued previously have not
* completed yet. This result is not actually an error, but must be indicated
* differently than ::CUDA_SUCCESS (which indicates completion). Calls that
* may return this value include ::cuEventQuery() and ::cuStreamQuery().
*/
{ "CUDA_ERROR_NOT_READY", 600 },
/**
* An exception occurred on the device while executing a kernel. Common
* causes include dereferencing an invalid device pointer and accessing
* out of bounds shared memory. The context cannot be used, so it must
* be destroyed (and a new one should be created). All existing device
* memory allocations from this context are invalid and must be
* reconstructed if the program is to continue using CUDA.
*/
{ "CUDA_ERROR_LAUNCH_FAILED", 700 },
/**
* This indicates that a launch did not occur because it did not have
* appropriate resources. This error usually indicates that the user has
* attempted to pass too many arguments to the device kernel, or the
* kernel launch specifies too many threads for the kernel's register
* count. Passing arguments of the wrong size (i.e. a 64-bit pointer
* when a 32-bit int is expected) is equivalent to passing too many
* arguments and can also result in this error.
*/
{ "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES", 701 },
/**
* This indicates that the device kernel took too long to execute. This can
* only occur if timeouts are enabled - see the device attribute
* ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT for more information. The
* context cannot be used (and must be destroyed similar to
* ::CUDA_ERROR_LAUNCH_FAILED). All existing device memory allocations from
* this context are invalid and must be reconstructed if the program is to
* continue using CUDA.
*/
{ "CUDA_ERROR_LAUNCH_TIMEOUT", 702 },
/**
* This error indicates a kernel launch that uses an incompatible texturing
* mode.
*/
{ "CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING", 703 },
/**
* This error indicates that a call to ::cuCtxEnablePeerAccess() is
* trying to re-enable peer access to a context which has already
* had peer access to it enabled.
*/
{ "CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED", 704 },
/**
* This error indicates that ::cuCtxDisablePeerAccess() is
* trying to disable peer access which has not been enabled yet
* via ::cuCtxEnablePeerAccess().
*/
{ "CUDA_ERROR_PEER_ACCESS_NOT_ENABLED", 705 },
/**
* This error indicates that the primary context for the specified device
* has already been initialized.
*/
{ "CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE", 708 },
/**
* This error indicates that the context current to the calling thread
* has been destroyed using ::cuCtxDestroy, or is a primary context which
* has not yet been initialized.
*/
{ "CUDA_ERROR_CONTEXT_IS_DESTROYED", 709 },
/**
* A device-side assert triggered during kernel execution. The context
* cannot be used anymore, and must be destroyed. All existing device
* memory allocations from this context are invalid and must be
* reconstructed if the program is to continue using CUDA.
*/
{ "CUDA_ERROR_ASSERT", 710 },
/**
* This error indicates that the hardware resources required to enable
* peer access have been exhausted for one or more of the devices
* passed to ::cuCtxEnablePeerAccess().
*/
{ "CUDA_ERROR_TOO_MANY_PEERS", 711 },
/**
* This error indicates that the memory range passed to ::cuMemHostRegister()
* has already been registered.
*/
{ "CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED", 712 },
/**
* This error indicates that the pointer passed to ::cuMemHostUnregister()
* does not correspond to any currently registered memory region.
*/
{ "CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED", 713 },
/**
* This error indicates that the attempted operation is not permitted.
*/
{ "CUDA_ERROR_NOT_PERMITTED", 800 },
/**
* This error indicates that the attempted operation is not supported
* on the current system or device.
*/
{ "CUDA_ERROR_NOT_SUPPORTED", 801 },
/**
* This indicates that an unknown internal error has occurred.
*/
{ "CUDA_ERROR_UNKNOWN", 999 },
/* Sentinel row: marks the end of the table. */
{ NULL, -1 }
};
// Maps a driver error code to its printable name.
//
// This is just a linear search through the array, since the error ids are
// not always occurring consecutively.  A code that is not in the table --
// including one equal to the sentinel value -1 -- yields
// "CUDA_ERROR not found!"; the function never returns NULL.
//
// Declared `static inline` because it is defined in a header: a plain
// external definition would produce duplicate symbols as soon as two
// translation units include this file.
static inline const char * getCudaDrvErrorString(CUresult error_id)
{
    for (int index = 0; sCudaDrvErrorString[index].error_id != -1; ++index)
    {
        if (sCudaDrvErrorString[index].error_id == error_id)
            return (const char *)sCudaDrvErrorString[index].error_string;
    }
    return (const char *)"CUDA_ERROR not found!";
}
#endif

File diff suppressed because it is too large Load Diff

Binary file not shown.

View File

@@ -0,0 +1,352 @@
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <algorithm>
#include <string.h>
#include <cuda.h>
#include <vector>
#include <cassert>
#include "drvapi_error_string.h"
// Wraps a driver-API call so that a failure is reported together with the
// source location of the call site.
#define checkCudaErrors(err) __checkCudaErrors (err, __FILE__, __LINE__)
// These are the inline versions for all of the SDK helper functions
//
// Does nothing when `err` is CUDA_SUCCESS.  Otherwise prints the numeric
// code, its name from getCudaDrvErrorString(), and the file/line that were
// passed in, then terminates the process with exit(-1).
void __checkCudaErrors(CUresult err, const char *file, const int line) {
    if (CUDA_SUCCESS == err)
        return;
    std::cerr << "checkCudaErrors() Driver API error = " << err << " \""
              << getCudaDrvErrorString(err) << "\" from file <" << file
              << ">, line " << line << "\n";
    exit(-1);
}
/**********************/
/* Basic CUDriver API */
// Driver context shared by createContext() and destroyContext().
CUcontext context;

// Initialises the driver API, selects a device and creates `context` on it.
//
// deviceId  ordinal of the device to use; a value outside
//           [0, device count) falls back to device 0.
//
// Prints the device name and compute capability to stdout.  Exits with
// status 1 when no device is present or the selected device has a compute
// capability major version below 2.
void createContext(const int deviceId = 0)
{
    checkCudaErrors(cuInit(0));

    int devCount = 0;
    checkCudaErrors(cuDeviceGetCount(&devCount));
    // Explicit check instead of assert(): must also hold in NDEBUG builds.
    if (devCount <= 0) {
        std::cerr << "ERROR: no CUDA-capable device found\n";
        exit(1);
    }

    const int ordinal = (deviceId >= 0 && deviceId < devCount) ? deviceId : 0;
    CUdevice device;
    checkCudaErrors(cuDeviceGet(&device, ordinal));

    char name[128];
    checkCudaErrors(cuDeviceGetName(name, sizeof(name), device));
    std::cout << "Using CUDA Device [" << ordinal << "]: " << name << "\n";

    int devMajor, devMinor;
    checkCudaErrors(cuDeviceComputeCapability(&devMajor, &devMinor, device));
    std::cout << "Device Compute Capability: "
              << devMajor << "." << devMinor << "\n";
    if (devMajor < 2) {
        std::cerr << "ERROR: Device " << ordinal << " is not SM 2.0 or greater\n";
        exit(1);
    }

    // Create driver context
    checkCudaErrors(cuCtxCreate(&context, 0, device));
}
// Destroys the global `context` with cuCtxDestroy(); a failure is reported
// by checkCudaErrors(), which exits the process.
void destroyContext()
{
checkCudaErrors(cuCtxDestroy(context));
}
// Loads the module image pointed to by `module` with cuModuleLoadData() and
// returns the resulting handle.  A failure is reported by checkCudaErrors(),
// which exits the process.
CUmodule loadModule(const char * module)
{
    CUmodule handle = 0;
    checkCudaErrors(cuModuleLoadData(&handle, module));
    return handle;
}
// Unloads the given module handle with cuModuleUnload(); a failure is
// reported by checkCudaErrors(), which exits the process.
void unloadModule(CUmodule &cudaModule)
{
checkCudaErrors(cuModuleUnload(cudaModule));
}
// Looks up the kernel named `function` inside `cudaModule` and returns its
// handle.  A failure is reported by checkCudaErrors(), which exits the
// process.
CUfunction getFunction(CUmodule &cudaModule, const char * function)
{
    CUfunction kernel = 0;
    checkCudaErrors(cuModuleGetFunction(&kernel, cudaModule, function));
    return kernel;
}
// Allocates `size` bytes with cuMemAlloc() and returns the resulting device
// address.  A failure is reported by checkCudaErrors(), which exits the
// process.
CUdeviceptr deviceMalloc(const size_t size)
{
    CUdeviceptr devAddr = 0;
    checkCudaErrors(cuMemAlloc(&devAddr, size));
    return devAddr;
}
// Releases a device allocation with cuMemFree(); a failure is reported by
// checkCudaErrors(), which exits the process.
void deviceFree(CUdeviceptr d_buf)
{
checkCudaErrors(cuMemFree(d_buf));
}
// Copies `size` bytes from device address `d_buf` into host buffer `h_buf`
// with cuMemcpyDtoH(); a failure is reported by checkCudaErrors(), which
// exits the process.
void memcpyD2H(void * h_buf, CUdeviceptr d_buf, const size_t size)
{
checkCudaErrors(cuMemcpyDtoH(h_buf, d_buf, size));
}
// Copies `size` bytes from host buffer `h_buf` to device address `d_buf`
// with cuMemcpyHtoD(); a failure is reported by checkCudaErrors(), which
// exits the process.  The host buffer is only read, so it is accepted as
// `const void *` (existing callers passing a non-const pointer are
// unaffected).
void memcpyH2D(CUdeviceptr d_buf, const void * h_buf, const size_t size)
{
    checkCudaErrors(cuMemcpyHtoD(d_buf, h_buf, size));
}
// Launches `func` on an (nbx, nby, nbz) grid of blocks.  Every block has
// 32 x 1 x 1 threads, no dynamic shared memory is requested and the NULL
// stream is used; `params` is passed as the kernelParams array of
// cuLaunchKernel().  A failure is reported by checkCudaErrors().
//
// The macro expands to a single expression WITHOUT a trailing semicolon, so
// `deviceLaunch(...);` is exactly one statement and stays correct inside an
// unbraced if/else.
#define deviceLaunch(func,nbx,nby,nbz,params) \
    checkCudaErrors( \
        cuLaunchKernel( \
            (func), \
            (nbx), (nby), (nbz), \
            32, 1, 1, \
            0, NULL, (params), NULL \
        ))
// Host-side alias for a device address.
typedef CUdeviceptr devicePtr;
/**************/
extern "C"
{
#if 0
struct ModuleManager
{
private:
typedef std::pair<std::string, CUModule> ModulePair;
typedef std::map <std::string, CUModule> ModuleMap;
ModuleMap module_list;
ModuleMap::iterator findModule(const char * module_name)
{
return module_list.find(std::string(module_name));
}
public:
CUmodule loadModule(const char * module_name, const char * module_data)
{
const ModuleMap::iterator it = findModule(module_name)
if (it != ModuleMap::end)
{
CUmodule cudaModule = loadModule(module);
module_list.insert(std::make_pair(std::string(module_name), cudaModule));
return cudaModule
}
return it->second;
}
void unloadModule(const char * module_name)
{
ModuleMap::iterator it = findModule(module_name)
if (it != ModuleMap::end)
module_list.erase(it);
}
};
#endif
// Stub: ignores handlePtr, size and alignment and always returns NULL.
// The disabled (#if 0) section only dumps the arguments to stderr.
void *CUDAAlloc(void **handlePtr, int64_t size, int32_t alignment)
{
#if 0
fprintf(stderr, " ptr= %p\n", *handlePtr);
fprintf(stderr, " size= %d\n", (int)size);
fprintf(stderr, " alignment= %d\n", (int)alignment);
fprintf(stderr, " ------- \n\n");
#endif
return NULL;
}
void CUDALaunch(
void **handlePtr,
const char * module_name,
const char * module,
const char * func_name,
void **func_args,
int countx, int county, int countz)
{
assert(module_name != NULL);
assert(module != NULL);
assert(func_name != NULL);
assert(func_args != NULL);
#if 1
CUmodule cudaModule = loadModule(module);
CUfunction cudaFunction = getFunction(cudaModule, func_name);
deviceLaunch(cudaFunction, countx, county, countz, func_args);
unloadModule(cudaModule);
#else
fprintf(stderr, " handle= %p\n", *handlePtr);
fprintf(stderr, " count= %d %d %d\n", countx, county, countz);
fprintf(stderr, " module_name= %s \n", module_name);
fprintf(stderr, " func_name= %s \n", func_name);
// fprintf(stderr, " ptx= %s \n", module);
fprintf(stderr, " x0= %g \n", *((float*)(func_args[0])));
fprintf(stderr, " dx= %g \n", *((float*)(func_args[1])));
fprintf(stderr, " y0= %g \n", *((float*)(func_args[2])));
fprintf(stderr, " dy= %g \n", *((float*)(func_args[3])));
fprintf(stderr, " w= %d \n", *((int*)(func_args[4])));
fprintf(stderr, " h= %d \n", *((int*)(func_args[5])));
fprintf(stderr, " xs= %d \n", *((int*)(func_args[6])));
fprintf(stderr, " ys= %d \n", *((int*)(func_args[7])));
fprintf(stderr, " maxit= %d \n", *((int*)(func_args[8])));
fprintf(stderr, " ptr= %p \n", *((int**)(func_args[9])));
fprintf(stderr, " ------- \n\n");
#endif
}
// Waits for stream 0 with cuStreamSynchronize(0); `handle` is unused.
// A failure is reported by checkCudaErrors(), which exits the process.
void CUDASync(void *handle)
{
checkCudaErrors(cuStreamSynchronize(0));
}
// Stub: does nothing; `handle` is ignored.
void ISPCSync(void *handle)
{
}
// Stub: does nothing; `handle` is ignored.
void CUDAFree(void *handle)
{
}
}
/********************/
/* Write a PPM image file with the image of the Mandelbrot set.
 *
 * buf     width*height iteration counts, one per pixel
 * width   image width in pixels
 * height  image height in pixels
 * fn      path of the file to create
 *
 * Every pixel is written as three identical bytes (a grey level).  If the
 * file cannot be opened or written, a message is printed to stderr and the
 * function returns without touching a NULL stream. */
static void
writePPM(int *buf, int width, int height, const char *fn)
{
    FILE *fp = fopen(fn, "wb");
    if (fp == NULL) {
        fprintf(stderr, "Error: cannot open %s for writing\n", fn);
        return;
    }
    fprintf(fp, "P6\n");
    fprintf(fp, "%d %d\n", width, height);
    fprintf(fp, "255\n");
    const long long npixels = (long long)width * height;
    for (long long i = 0; i < npixels; ++i) {
        // Map the iteration count to colors by just alternating between
        // two greys.  unsigned char: 240 does not fit a signed char.
        const unsigned char c = (buf[i] & 0x1) ? 240 : 20;
        for (int j = 0; j < 3; ++j)
            fputc(c, fp);
    }
    const int writeFailed = ferror(fp);
    if (fclose(fp) != 0 || writeFailed) {
        fprintf(stderr, "Error: failed to write %s\n", fn);
        return;
    }
    printf("Wrote image file %s\n", fn);
}
// Reads the whole file `filename` in binary mode and returns its bytes.
// An empty file yields an empty vector.  On any failure (file missing,
// size query fails, short read) a message is printed to stderr and the
// process exits with status 1.  The stream is always closed.
std::vector<char> readBinary(const char * filename)
{
    FILE *fp = fopen(filename, "rb");
    if (fp == NULL)
    {
        // exit() rather than assert(0): the check must survive NDEBUG builds.
        fprintf(stderr, "file %s not found\n", filename);
        exit(1);
    }

    /* calc the size needed */
    long fileSize = -1;
    if (fseek(fp, 0, SEEK_END) == 0)
        fileSize = ftell(fp);
    if (fileSize < 0 || fseek(fp, 0, SEEK_SET) != 0)
    {
        fprintf(stderr, "Error: There was an Error reading the file %s \n", filename);
        fclose(fp);
        exit(1);
    }

    std::vector<char> buffer(static_cast<size_t>(fileSize));
    // &buffer[0] is only valid for a non-empty vector.
    const size_t nread =
        buffer.empty() ? 0 : fread(&buffer[0], sizeof(char), buffer.size(), fp);
    fclose(fp);
    if (nread != buffer.size())
    {
        /* count of read bytes != calculated size of the file -> ERROR */
        fprintf(stderr, "Error: There was an Error reading the file %s \n", filename);
        exit(1);
    }

    fprintf(stderr, " read buffer of size= %d bytes \n", (int)buffer.size());
    return buffer;
}
// Prints the command-line synopsis to stderr and terminates the process
// with exit status 1.  Never returns.
static void usage()
{
    fputs("usage: mandelbrot [--scale=<factor>]\n", stderr);
    exit(1);
}
extern "C"
// Host-side driver for the "mandelbrot_scanline" kernel.
//
// x0,y0 / x1,y1   corners of the region of the complex plane to render
// width, height   image size in pixels
// maxIterations   iteration limit passed to the kernel
// output          device address of the result buffer
//
// With `#if 1` only the declaration is compiled and the definition must be
// supplied by another object file.  The #else branch is a hand-written
// fallback that loads "cuLaunch.ptx" and launches the kernel through
// CUDALaunch().
void mandelbrot_ispc(
        float x0, float y0,
        float x1, float y1,
        int width, int height,
        int maxIterations, int output[])
#if 1
;
#else
{
    float dx = (x1 - x0) / width;
    float dy = (y1 - y0) / height;
    int xspan = 32; /* make sure it is big enough to avoid false-sharing */
    int yspan = 4;
    // Round up so that a trailing partial tile still gets a block.
    // NOTE(review): this relies on the kernel clamping each tile to
    // width/height -- confirm for the module that is actually loaded.
    const int nbx = (width  + xspan - 1) / xspan;
    // Rows are counted from the image HEIGHT (was derived from width).
    const int nby = (height + yspan - 1) / yspan;
    const int nbz = 1;
    fprintf(stderr ," nbx= %d nby= %d nbtot= %d \n", nbx, nby, nbx*nby);
    // const std::vector<char> cubin = readBinary("cuLaunch.cubin");
    std::vector<char> ptx = readBinary("cuLaunch.ptx");
    // cuModuleLoadData() takes PTX as a NUL-terminated string; the raw file
    // contents carry no terminator.
    ptx.push_back('\0');
    void *params[] = {&x0, &dx, &y0, &dy, &width, &height, &xspan, &yspan, &maxIterations, &output};
    CUDALaunch(
            NULL,                   // void **handlePtr
            "module_01",            // const char * module_name
            &ptx[0],                // const char * module
            "mandelbrot_scanline",  // const char * func_name
            params,                 // void **func_args
            nbx, nby, nbz);         // int countx, int county, int countz
    CUDASync(NULL);
}
#endif
// Entry point: renders the Mandelbrot set on the GPU and writes it to
// "mandelbrot-cuda.ppm".
//
// Accepts an optional `--scale=<factor>` argument that multiplies the
// default 1536 x 1024 image size (rounded up to multiples of 16).  Any other
// argument, or a factor that is not a positive number, prints the usage
// text and exits.
int main(int argc, char *argv[])
{
    unsigned int width = 1536;
    unsigned int height = 1024;
    float x0 = -2;
    float x1 = 1;
    float y0 = -1;
    float y1 = 1;

    if (argc == 2) {
        if (strncmp(argv[1], "--scale=", 8) != 0)
            usage();
        const float scale = atof(argv[1] + 8);
        // Rejects zero, negative values and NaN: converting a negative float
        // to unsigned is undefined.
        if (!(scale > 0.f))
            usage();
        width *= scale;
        height *= scale;
        // round up to multiples of 16
        width = (width + 0xf) & ~0xf;
        height = (height + 0xf) & ~0xf;
        // A tiny factor can collapse a dimension to zero.
        if (width == 0 || height == 0)
            usage();
    }
    else if (argc != 1)
        usage();

    /*******************/
    createContext();
    /*******************/

    int maxIterations = 512;
    const size_t npixels = (size_t)width * height;
    // Zero-initialised host image; released automatically on return.
    std::vector<int> h_buf(npixels, 0);
    const size_t bufsize = sizeof(int)*npixels;
    devicePtr d_buf = deviceMalloc(bufsize);
    memcpyH2D(d_buf, &h_buf[0], bufsize);
    mandelbrot_ispc(x0,y0,x1,y1,width, height, maxIterations, (int*)d_buf);
    memcpyD2H(&h_buf[0], d_buf, bufsize);
    deviceFree(d_buf);
    writePPM(&h_buf[0], width, height, "mandelbrot-cuda.ppm");

    /*******************/
    destroyContext();
    /*******************/
    return 0;
}

View File

@@ -0,0 +1,410 @@
; ModuleID = 'mandelbrot_task.bc'
target datalayout = "e-p:64:64:64-S0-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-v16:16:16-v32:32:32-n16:32:64"
target triple = "nvptx64"
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #0
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #0
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.warpsize() #0
; Function Attrs: alwaysinline nounwind readnone
; Single-lane select on i8: returns the first operand (%0) when the mask lane
; is zero and the second operand (%1) when it is non-zero.
define <1 x i8> @__vselect_i8(<1 x i8>, <1 x i8>, <1 x i32> %mask) #1 {
%m = extractelement <1 x i32> %mask, i32 0
; %cmp is true when the mask lane is off (== 0)
%cmp = icmp eq i32 %m, 0
%d0 = extractelement <1 x i8> %0, i32 0
%d1 = extractelement <1 x i8> %1, i32 0
%sel = select i1 %cmp, i8 %d0, i8 %d1
%r = insertelement <1 x i8> undef, i8 %sel, i32 0
ret <1 x i8> %r
}
; Function Attrs: alwaysinline nounwind readnone
; Single-lane select on i16: returns the first operand (%0) when the mask lane
; is zero and the second operand (%1) when it is non-zero.
define <1 x i16> @__vselect_i16(<1 x i16>, <1 x i16>, <1 x i32> %mask) #1 {
%m = extractelement <1 x i32> %mask, i32 0
; %cmp is true when the mask lane is off (== 0)
%cmp = icmp eq i32 %m, 0
%d0 = extractelement <1 x i16> %0, i32 0
%d1 = extractelement <1 x i16> %1, i32 0
%sel = select i1 %cmp, i16 %d0, i16 %d1
%r = insertelement <1 x i16> undef, i16 %sel, i32 0
ret <1 x i16> %r
}
; Function Attrs: alwaysinline nounwind readnone
; Single-lane select on i64: returns the first operand (%0) when the mask lane
; is zero and the second operand (%1) when it is non-zero.
define <1 x i64> @__vselect_i64(<1 x i64>, <1 x i64>, <1 x i32> %mask) #1 {
%m = extractelement <1 x i32> %mask, i32 0
; %cmp is true when the mask lane is off (== 0)
%cmp = icmp eq i32 %m, 0
%d0 = extractelement <1 x i64> %0, i32 0
%d1 = extractelement <1 x i64> %1, i32 0
%sel = select i1 %cmp, i64 %d0, i64 %d1
%r = insertelement <1 x i64> undef, i64 %sel, i32 0
ret <1 x i64> %r
}
; Function Attrs: nounwind readnone
declare double @llvm.nvvm.rsqrt.approx.d(double) #0
; Function Attrs: alwaysinline nounwind
; 4-component AoS -> SoA conversion for a vector width of 1: with a single
; lane there is nothing to shuffle, so each input %vN is stored unchanged to
; the matching output pointer %outN.
define void @__aos_to_soa4_float1(<1 x float> %v0, <1 x float> %v1, <1 x float> %v2, <1 x float> %v3, <1 x float>* noalias nocapture %out0, <1 x float>* noalias nocapture %out1, <1 x float>* noalias nocapture %out2, <1 x float>* noalias nocapture %out3) #2 {
store <1 x float> %v0, <1 x float>* %out0, align 4
store <1 x float> %v1, <1 x float>* %out1, align 4
store <1 x float> %v2, <1 x float>* %out2, align 4
store <1 x float> %v3, <1 x float>* %out3, align 4
ret void
}
; Function Attrs: alwaysinline nounwind
; 4-component SoA -> AoS conversion for a vector width of 1: each input %vN is
; stored unchanged to the matching output pointer %outN (same body as the
; AoS -> SoA direction).
define void @__soa_to_aos4_float1(<1 x float> %v0, <1 x float> %v1, <1 x float> %v2, <1 x float> %v3, <1 x float>* noalias nocapture %out0, <1 x float>* noalias nocapture %out1, <1 x float>* noalias nocapture %out2, <1 x float>* noalias nocapture %out3) #2 {
store <1 x float> %v0, <1 x float>* %out0, align 4
store <1 x float> %v1, <1 x float>* %out1, align 4
store <1 x float> %v2, <1 x float>* %out2, align 4
store <1 x float> %v3, <1 x float>* %out3, align 4
ret void
}
; Function Attrs: nounwind
; 3-component AoS -> SoA conversion for a vector width of 1: each input %vN is
; stored unchanged to the matching output pointer %outN.
; Unlike the 4-component variants, the output pointers are not marked noalias.
define void @__aos_to_soa3_float1(<1 x float> %v0, <1 x float> %v1, <1 x float> %v2, <1 x float>* nocapture %out0, <1 x float>* nocapture %out1, <1 x float>* nocapture %out2) #3 {
store <1 x float> %v0, <1 x float>* %out0, align 4
store <1 x float> %v1, <1 x float>* %out1, align 4
store <1 x float> %v2, <1 x float>* %out2, align 4
ret void
}
; Function Attrs: nounwind
; 3-component SoA -> AoS conversion for a vector width of 1: each input %vN is
; stored unchanged to the matching output pointer %outN (same body as the
; AoS -> SoA direction).
define void @__soa_to_aos3_float1(<1 x float> %v0, <1 x float> %v1, <1 x float> %v2, <1 x float>* nocapture %out0, <1 x float>* nocapture %out1, <1 x float>* nocapture %out2) #3 {
store <1 x float> %v0, <1 x float>* %out0, align 4
store <1 x float> %v1, <1 x float>* %out1, align 4
store <1 x float> %v2, <1 x float>* %out2, align 4
ret void
}
; Function Attrs: alwaysinline nounwind readonly
; Reciprocal square root of a single-lane double vector: unpacks the lane,
; calls the NVVM approximate rsqrt intrinsic on it, and repacks the result.
define <1 x double> @__rsqrt_varying_double(<1 x double> %v) #4 {
%vs = extractelement <1 x double> %v, i32 0
%rs = tail call double @llvm.nvvm.rsqrt.approx.d(double %vs)
%rv = insertelement <1 x double> undef, double %rs, i32 0
ret <1 x double> %rv
}
; Function Attrs: nounwind
; Mandelbrot scanline task, registered as a kernel by the !nvvm.annotations
; metadata at the end of this module.
;
; %0 points to the packed task-argument struct; its fields are, in order:
;   x0, dx, y0, dy (float), width, height, xspan, yspan, maxIterations (i32),
;   output (i32*) and the <1 x i32> execution mask.
; The ten trailing unnamed i32 parameters do not appear to be referenced in
; the body.
;
; The block (ctaid.x, ctaid.y) owns the pixel tile
;   x in [ctaid.x*xspan, min(ctaid.x*xspan + xspan, width))
;   y in [ctaid.y*yspan, min(ctaid.y*yspan + yspan, height))
; and the body exists in two copies selected by the sign bit of the mask:
; the "for_test" copy (mask on) stores iteration counts to %output20, the
; "for_test104" copy (mask off) contains no store.
define void @mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E_({ float, float, float, float, i32, i32, i32, i32, i32, i32*, <1 x i32> }* noalias nocapture, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) #5 {
allocas:
; Unpack every field of the task-argument struct.
%x01 = getelementptr { float, float, float, float, i32, i32, i32, i32, i32, i32*, <1 x i32> }* %0, i64 0, i32 0
%x02 = load float* %x01, align 4
%dx3 = getelementptr { float, float, float, float, i32, i32, i32, i32, i32, i32*, <1 x i32> }* %0, i64 0, i32 1
%dx4 = load float* %dx3, align 4
%y05 = getelementptr { float, float, float, float, i32, i32, i32, i32, i32, i32*, <1 x i32> }* %0, i64 0, i32 2
%y06 = load float* %y05, align 4
%dy7 = getelementptr { float, float, float, float, i32, i32, i32, i32, i32, i32*, <1 x i32> }* %0, i64 0, i32 3
%dy8 = load float* %dy7, align 4
%width9 = getelementptr { float, float, float, float, i32, i32, i32, i32, i32, i32*, <1 x i32> }* %0, i64 0, i32 4
%width10 = load i32* %width9, align 4
%height11 = getelementptr { float, float, float, float, i32, i32, i32, i32, i32, i32*, <1 x i32> }* %0, i64 0, i32 5
%height12 = load i32* %height11, align 4
%xspan13 = getelementptr { float, float, float, float, i32, i32, i32, i32, i32, i32*, <1 x i32> }* %0, i64 0, i32 6
%xspan14 = load i32* %xspan13, align 4
%yspan15 = getelementptr { float, float, float, float, i32, i32, i32, i32, i32, i32*, <1 x i32> }* %0, i64 0, i32 7
%yspan16 = load i32* %yspan15, align 4
%maxIterations17 = getelementptr { float, float, float, float, i32, i32, i32, i32, i32, i32*, <1 x i32> }* %0, i64 0, i32 8
%maxIterations18 = load i32* %maxIterations17, align 4
%output19 = getelementptr { float, float, float, float, i32, i32, i32, i32, i32, i32*, <1 x i32> }* %0, i64 0, i32 9
%output20 = load i32** %output19, align 8
%task_struct_mask = getelementptr { float, float, float, float, i32, i32, i32, i32, i32, i32*, <1 x i32> }* %0, i64 0, i32 10
%mask = load <1 x i32>* %task_struct_mask, align 4
; %cmp.i is true when the sign bit of the mask lane is set (lane active).
%item.i = extractelement <1 x i32> %mask, i32 0
%cmp.i = icmp slt i32 %item.i, 0
; xstart = ctaid.x * xspan ; xend (%r.i.i) = min(xstart + xspan, width)
%bid.i.i = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #3
%mul_calltmp_xspan_load = mul i32 %bid.i.i, %xspan14
%add_xstart_load_xspan_load25 = add i32 %mul_calltmp_xspan_load, %xspan14
%c.i.i = icmp slt i32 %add_xstart_load_xspan_load25, %width10
%r.i.i = select i1 %c.i.i, i32 %add_xstart_load_xspan_load25, i32 %width10
; ystart = ctaid.y * yspan ; yend (%r.i.i179) = min(ystart + yspan, height)
%bid.i.i177 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #3
%mul_calltmp31_yspan_load = mul i32 %bid.i.i177, %yspan16
%add_ystart_load_yspan_load32 = add i32 %mul_calltmp31_yspan_load, %yspan16
%c.i.i178 = icmp slt i32 %add_ystart_load_yspan_load32, %height12
%r.i.i179 = select i1 %c.i.i178, i32 %add_ystart_load_yspan_load32, i32 %height12
%less_yi_load_yend_load319 = icmp slt i32 %mul_calltmp31_yspan_load, %r.i.i179
; Mask on -> storing copy (for_test.*); mask off -> non-storing copy.
br i1 %cmp.i, label %for_test.preheader, label %for_test104.preheader
for_test104.preheader: ; preds = %allocas
; ---- mask-off copy of the loop nest (no store to the output buffer) ----
br i1 %less_yi_load_yend_load319, label %for_test115.preheader.lr.ph, label %for_exit
for_test115.preheader.lr.ph: ; preds = %for_test104.preheader
%less_xi_load122_xend_load123331 = icmp slt i32 %mul_calltmp_xspan_load, %r.i.i
%maxIterations_load140_broadcast_init = insertelement <1 x i32> undef, i32 %maxIterations18, i32 0
%less_i_load_count_load.i321 = icmp sgt <1 x i32> %maxIterations_load140_broadcast_init, zeroinitializer
%"oldMask&test.i322" = select <1 x i1> %less_i_load_count_load.i321, <1 x i32> <i32 -1>, <1 x i32> zeroinitializer
%"internal_mask&function_mask10.i323" = and <1 x i32> %"oldMask&test.i322", %mask
%item.i.i324 = extractelement <1 x i32> %"internal_mask&function_mask10.i323", i32 0
%cmp.i.i325 = icmp slt i32 %item.i.i324, 0
; %16 = ~max(~height, ~(yspan * (ctaid.y + 1))), i.e. the row loop's end
; value min(height, yspan * (ctaid.y + 1)) in the form used by %exitcond337.
%11 = xor i32 %height12, -1
%12 = add i32 %bid.i.i177, 1
%13 = mul i32 %yspan16, %12
%14 = xor i32 %13, -1
%15 = icmp sgt i32 %11, %14
%smax336 = select i1 %15, i32 %11, i32 %14
%16 = xor i32 %smax336, -1
br label %for_test115.preheader
for_test.preheader: ; preds = %allocas
; ---- mask-on copy of the loop nest (stores iteration counts) ----
br i1 %less_yi_load_yend_load319, label %for_test40.preheader.lr.ph, label %for_exit
for_test40.preheader.lr.ph: ; preds = %for_test.preheader
%less_xi_load_xend_load317 = icmp slt i32 %mul_calltmp_xspan_load, %r.i.i
%maxIterations_load_broadcast_init = insertelement <1 x i32> undef, i32 %maxIterations18, i32 0
%less_i_load_count_load.i204308 = icmp sgt <1 x i32> %maxIterations_load_broadcast_init, zeroinitializer
%"oldMask&test.i205309" = select <1 x i1> %less_i_load_count_load.i204308, <1 x i32> <i32 -1>, <1 x i32> zeroinitializer
%item.i.i206310 = extractelement <1 x i32> %"oldMask&test.i205309", i32 0
; %cmp.i.i207311 is true when maxIterations > 0 (iteration loop runs at all).
%cmp.i.i207311 = icmp slt i32 %item.i.i206310, 0
%output_load_ptr2int = ptrtoint i32* %output20 to i64
; %22 = ~max(~height, ~(yspan * (ctaid.y + 1))), i.e. the row loop's end
; value min(height, yspan * (ctaid.y + 1)) in the form used by %exitcond.
%17 = xor i32 %height12, -1
%18 = add i32 %bid.i.i177, 1
%19 = mul i32 %yspan16, %18
%20 = xor i32 %19, -1
%21 = icmp sgt i32 %17, %20
%smax = select i1 %21, i32 %17, i32 %20
%22 = xor i32 %smax, -1
br label %for_test40.preheader
for_test40.preheader: ; preds = %for_exit43, %for_test40.preheader.lr.ph
; Row loop header: %yi.0320 is the current row.
%yi.0320 = phi i32 [ %mul_calltmp31_yspan_load, %for_test40.preheader.lr.ph ], [ %yi_load77_plus1, %for_exit43 ]
br i1 %less_xi_load_xend_load317, label %for_loop42.lr.ph, label %for_exit43
for_loop42.lr.ph: ; preds = %for_test40.preheader
; y = y0 + float(yi) * dy ; row offset = yi * width
%yi_load52_to_float = sitofp i32 %yi.0320 to float
%mul_yi_load52_to_float_dy_load = fmul float %dy8, %yi_load52_to_float
%add_y0_load_mul_yi_load52_to_float_dy_load = fadd float %y06, %mul_yi_load52_to_float_dy_load
%add_y0_load_mul_yi_load52_to_float_dy_load_broadcast_init = insertelement <1 x float> undef, float %add_y0_load_mul_yi_load52_to_float_dy_load, i32 0
%mul_yi_load56_width_load57 = mul i32 %yi.0320, %width10
br i1 %cmp.i.i207311, label %for_loop.i229.lr.ph.us, label %mandel___vyfvyfvyi.exit244
mandel___vyfvyfvyi.exit244.us: ; preds = %for_step.i212.us
; Iteration loop finished for this pixel: store only if
; xi + (tid.x & (warpsize - 1)) < xend.
%tid.i.i189.us = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3
%tid.i.i.i190.us = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() #3
%sub_calltmp3_.i191.us = add i32 %tid.i.i.i190.us, -1
%bitop.i192.us = and i32 %sub_calltmp3_.i191.us, %tid.i.i189.us
%add_xi_load62_calltmp65.us = add i32 %bitop.i192.us, %xi.0318.us
%less_add_xi_load62_calltmp65_xend_load66.us = icmp slt i32 %add_xi_load62_calltmp65.us, %r.i.i
br i1 %less_add_xi_load62_calltmp65_xend_load66.us, label %if_then.us, label %if_exit.us
if_then.us: ; preds = %mandel___vyfvyfvyi.exit244.us
; Address = output + 4 * (yi * width + xi + lane); stores the iteration count.
; NOTE(review): 1073741823 is 0x3FFFFFFF; it looks like the optimizer's form
; of "warpsize - 1", equivalent here because of the following shl by 2 - confirm.
%tid.i.i.i194.us = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() #3
%sub_calltmp3_.i195.us = add i32 %tid.i.i.i194.us, 1073741823
%tid.i.i193.us = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3
%bitop.i196.us = and i32 %sub_calltmp3_.i195.us, %tid.i.i193.us
%add_xi_load58_calltmp61.us = add i32 %xi.0318.us, %mul_yi_load56_width_load57
%add_mul_yi_load56_width_load57_add_xi_load58_calltmp61.us = add i32 %add_xi_load58_calltmp61.us, %bitop.i196.us
%23 = shl i32 %add_mul_yi_load56_width_load57_add_xi_load58_calltmp61.us, 2
%iptr__id.i264.rhs.us = sext i32 %23 to i64
%iptr__id.i264.us = add i64 %iptr__id.i264.rhs.us, %output_load_ptr2int
%ptr__id.i265.us = inttoptr i64 %iptr__id.i264.us to i32*
store i32 %sel.i.i291.us, i32* %ptr__id.i265.us, align 4
br label %if_exit.us
if_exit.us: ; preds = %if_then.us, %mandel___vyfvyfvyi.exit244.us
; Column loop step: xi += warpsize.
%tid.i.i188.us = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() #3
%add_xi_load76_calltmp74.us = add i32 %tid.i.i188.us, %xi.0318.us
%less_xi_load_xend_load.us = icmp slt i32 %add_xi_load76_calltmp74.us, %r.i.i
br i1 %less_xi_load_xend_load.us, label %for_loop.i229.lr.ph.us, label %for_exit43
for_loop.i229.us: ; preds = %for_loop.i229.lr.ph.us, %for_step.i212.us
; Inlined mandel() iteration loop: phis carry the loop mask, the break mask,
; the iteration counter and the current z_re / z_im values.
%"oldMask&test.i205316.us" = phi <1 x i32> [ %"oldMask&test.i205309", %for_loop.i229.lr.ph.us ], [ %"oldMask&test.i205.us", %for_step.i212.us ]
%break_lanes_memory.0.i201315.us = phi <1 x i32> [ zeroinitializer, %for_loop.i229.lr.ph.us ], [ %"mask|break_mask.i220.us", %for_step.i212.us ]
%r.i.i292295314.us = phi <1 x i32> [ zeroinitializer, %for_loop.i229.lr.ph.us ], [ %r.i.i292.us, %for_step.i212.us ]
%add_x0_load_mul_add_xi_load48_calltmp51_to_float_dx_load_broadcast_init301313.us = phi <1 x float> [ %add_x0_load_mul_add_xi_load48_calltmp51_to_float_dx_load_broadcast_init.us, %for_loop.i229.lr.ph.us ], [ %add_x0_load_mul_add_xi_load48_calltmp51_to_float_dx_load_broadcast_init300.us, %for_step.i212.us ]
%add_y0_load_mul_yi_load52_to_float_dy_load_broadcast_init303312.us = phi <1 x float> [ %add_y0_load_mul_yi_load52_to_float_dy_load_broadcast_init, %for_loop.i229.lr.ph.us ], [ %add_y0_load_mul_yi_load52_to_float_dy_load_broadcast_init302.us, %for_step.i212.us ]
; Escape test: z_re*z_re + z_im*z_im > 4.0 (unordered compare).
%mul_z_re_load_z_re_load13.i214.us = fmul <1 x float> %add_x0_load_mul_add_xi_load48_calltmp51_to_float_dx_load_broadcast_init301313.us, %add_x0_load_mul_add_xi_load48_calltmp51_to_float_dx_load_broadcast_init301313.us
%mul_z_im_load_z_im_load14.i216.us = fmul <1 x float> %add_y0_load_mul_yi_load52_to_float_dy_load_broadcast_init303312.us, %add_y0_load_mul_yi_load52_to_float_dy_load_broadcast_init303312.us
%add_mul_z_re_load_z_re_load13_mul_z_im_load_z_im_load14.i217.us = fadd <1 x float> %mul_z_im_load_z_im_load14.i216.us, %mul_z_re_load_z_re_load13.i214.us
%greater_add_mul_z_re_load_z_re_load13_mul_z_im_load_z_im_load14_.i218.us = fcmp ugt <1 x float> %add_mul_z_re_load_z_re_load13_mul_z_im_load_z_im_load14.i217.us, <float 4.000000e+00>
%"oldMask&test16.i219.us" = select <1 x i1> %greater_add_mul_z_re_load_z_re_load13_mul_z_im_load_z_im_load14_.i218.us, <1 x i32> %"oldMask&test.i205316.us", <1 x i32> zeroinitializer
%"mask|break_mask.i220.us" = or <1 x i32> %"oldMask&test16.i219.us", %break_lanes_memory.0.i201315.us
%item.i63.i222.us = extractelement <1 x i32> %"mask|break_mask.i220.us", i32 0
%v.i64.i223.us = lshr i32 %item.i63.i222.us, 31
%item.i62.i225.us = extractelement <1 x i32> %"oldMask&test.i205316.us", i32 0
%v.i.i226.us = lshr i32 %item.i62.i225.us, 31
%"equal_finished&func_internal_mask&function_mask12.i228.us" = icmp eq i32 %v.i64.i223.us, %v.i.i226.us
br i1 %"equal_finished&func_internal_mask&function_mask12.i228.us", label %for_step.i212.us, label %not_all_continued_or_breaked.i243.us
not_all_continued_or_breaked.i243.us: ; preds = %for_loop.i229.us
; z update: z_re' = c_re + (z_re^2 - z_im^2) ; z_im' = c_im + (2 * z_re) * z_im
%"!(break|continue)_lanes.i232.us" = xor <1 x i32> %"mask|break_mask.i220.us", <i32 -1>
%new_mask28.i233.us = and <1 x i32> %"oldMask&test.i205316.us", %"!(break|continue)_lanes.i232.us"
%sub_mul_z_re_load31_z_re_load32_mul_z_im_load33_z_im_load34.i238.us = fsub <1 x float> %mul_z_re_load_z_re_load13.i214.us, %mul_z_im_load_z_im_load14.i216.us
%mul__z_re_load35.i239.us = fmul <1 x float> %add_x0_load_mul_add_xi_load48_calltmp51_to_float_dx_load_broadcast_init301313.us, <float 2.000000e+00>
%mul_mul__z_re_load35_z_im_load36.i240.us = fmul <1 x float> %add_y0_load_mul_yi_load52_to_float_dy_load_broadcast_init303312.us, %mul__z_re_load35.i239.us
%add_c_re_load42_new_re_load.i241.us = fadd <1 x float> %add_x0_load_mul_add_xi_load48_calltmp51_to_float_dx_load_broadcast_init.us, %sub_mul_z_re_load31_z_re_load32_mul_z_im_load33_z_im_load34.i238.us
%add_c_im_load44_new_im_load.i242.us = fadd <1 x float> %add_y0_load_mul_yi_load52_to_float_dy_load_broadcast_init, %mul_mul__z_re_load35_z_im_load36.i240.us
br label %for_step.i212.us
for_step.i212.us: ; preds = %not_all_continued_or_breaked.i243.us, %for_loop.i229.us
; Loop step: the counter is incremented by 1 only while the lane's mask is
; non-zero, then the loop continues while counter < maxIterations.
%add_y0_load_mul_yi_load52_to_float_dy_load_broadcast_init302.us = phi <1 x float> [ %add_y0_load_mul_yi_load52_to_float_dy_load_broadcast_init303312.us, %for_loop.i229.us ], [ %add_c_im_load44_new_im_load.i242.us, %not_all_continued_or_breaked.i243.us ]
%add_x0_load_mul_add_xi_load48_calltmp51_to_float_dx_load_broadcast_init300.us = phi <1 x float> [ %add_x0_load_mul_add_xi_load48_calltmp51_to_float_dx_load_broadcast_init301313.us, %for_loop.i229.us ], [ %add_c_re_load42_new_re_load.i241.us, %not_all_continued_or_breaked.i243.us ]
%internal_mask_memory.1.i209.us = phi <1 x i32> [ zeroinitializer, %for_loop.i229.us ], [ %new_mask28.i233.us, %not_all_continued_or_breaked.i243.us ]
%m.i.i287.us = extractelement <1 x i32> %internal_mask_memory.1.i209.us, i32 0
%d0.i.i289.us = extractelement <1 x i32> %r.i.i292295314.us, i32 0
%not.cmp.i.i288.us = icmp ne i32 %m.i.i287.us, 0
%d1.i.i290.us = zext i1 %not.cmp.i.i288.us to i32
%sel.i.i291.us = add i32 %d0.i.i289.us, %d1.i.i290.us
%r.i.i292.us = insertelement <1 x i32> undef, i32 %sel.i.i291.us, i32 0
%less_i_load_count_load.i204.us = icmp slt <1 x i32> %r.i.i292.us, %maxIterations_load_broadcast_init
%"oldMask&test.i205.us" = select <1 x i1> %less_i_load_count_load.i204.us, <1 x i32> %internal_mask_memory.1.i209.us, <1 x i32> zeroinitializer
%item.i.i206.us = extractelement <1 x i32> %"oldMask&test.i205.us", i32 0
%cmp.i.i207.us = icmp slt i32 %item.i.i206.us, 0
br i1 %cmp.i.i207.us, label %for_loop.i229.us, label %mandel___vyfvyfvyi.exit244.us
for_loop.i229.lr.ph.us: ; preds = %if_exit.us, %for_loop42.lr.ph
; Column loop header: x = x0 + float(xi + (tid.x & (warpsize - 1))) * dx
%xi.0318.us = phi i32 [ %add_xi_load76_calltmp74.us, %if_exit.us ], [ %mul_calltmp_xspan_load, %for_loop42.lr.ph ]
%tid.i.i180.us = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3
%tid.i.i.i181.us = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() #3
%sub_calltmp3_.i182.us = add i32 %tid.i.i.i181.us, -1
%bitop.i183.us = and i32 %sub_calltmp3_.i182.us, %tid.i.i180.us
%add_xi_load48_calltmp51.us = add i32 %bitop.i183.us, %xi.0318.us
%add_xi_load48_calltmp51_to_float.us = sitofp i32 %add_xi_load48_calltmp51.us to float
%mul_add_xi_load48_calltmp51_to_float_dx_load.us = fmul float %dx4, %add_xi_load48_calltmp51_to_float.us
%add_x0_load_mul_add_xi_load48_calltmp51_to_float_dx_load.us = fadd float %x02, %mul_add_xi_load48_calltmp51_to_float_dx_load.us
%add_x0_load_mul_add_xi_load48_calltmp51_to_float_dx_load_broadcast_init.us = insertelement <1 x float> undef, float %add_x0_load_mul_add_xi_load48_calltmp51_to_float_dx_load.us, i32 0
br label %for_loop.i229.us
for_exit: ; preds = %for_exit118, %for_exit43, %for_test.preheader, %for_test104.preheader
ret void
mandel___vyfvyfvyi.exit244: ; preds = %if_exit, %for_loop42.lr.ph
; Variant of the column loop taken when maxIterations > 0 is false: the
; iteration loop is skipped and the constant 0 is stored for in-range pixels.
%xi.0318 = phi i32 [ %add_xi_load76_calltmp74, %if_exit ], [ %mul_calltmp_xspan_load, %for_loop42.lr.ph ]
%tid.i.i189 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3
%tid.i.i.i190 = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() #3
%sub_calltmp3_.i191 = add i32 %tid.i.i.i190, -1
%bitop.i192 = and i32 %sub_calltmp3_.i191, %tid.i.i189
%add_xi_load62_calltmp65 = add i32 %bitop.i192, %xi.0318
%less_add_xi_load62_calltmp65_xend_load66 = icmp slt i32 %add_xi_load62_calltmp65, %r.i.i
br i1 %less_add_xi_load62_calltmp65_xend_load66, label %if_then, label %if_exit
for_exit43: ; preds = %if_exit, %if_exit.us, %for_test40.preheader
; Row loop step: yi += 1, exit when yi reaches %22.
%yi_load77_plus1 = add i32 %yi.0320, 1
%exitcond = icmp eq i32 %yi_load77_plus1, %22
br i1 %exitcond, label %for_exit, label %for_test40.preheader
if_then: ; preds = %mandel___vyfvyfvyi.exit244
%tid.i.i.i194 = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() #3
%sub_calltmp3_.i195 = add i32 %tid.i.i.i194, 1073741823
%tid.i.i193 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3
%bitop.i196 = and i32 %sub_calltmp3_.i195, %tid.i.i193
%add_xi_load58_calltmp61 = add i32 %xi.0318, %mul_yi_load56_width_load57
%add_mul_yi_load56_width_load57_add_xi_load58_calltmp61 = add i32 %add_xi_load58_calltmp61, %bitop.i196
%24 = shl i32 %add_mul_yi_load56_width_load57_add_xi_load58_calltmp61, 2
%iptr__id.i264.rhs = sext i32 %24 to i64
%iptr__id.i264 = add i64 %iptr__id.i264.rhs, %output_load_ptr2int
%ptr__id.i265 = inttoptr i64 %iptr__id.i264 to i32*
store i32 0, i32* %ptr__id.i265, align 4
br label %if_exit
if_exit: ; preds = %if_then, %mandel___vyfvyfvyi.exit244
%tid.i.i188 = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() #3
%add_xi_load76_calltmp74 = add i32 %tid.i.i188, %xi.0318
%less_xi_load_xend_load = icmp slt i32 %add_xi_load76_calltmp74, %r.i.i
br i1 %less_xi_load_xend_load, label %mandelbrot___vyfvyfvyi.exit244, label %for_exit43
for_test115.preheader: ; preds = %for_exit118, %for_test115.preheader.lr.ph
; Row loop header of the mask-off copy; same structure as above, but every
; loop mask is additionally ANDed with %mask and nothing is stored.
%yi109.0335 = phi i32 [ %mul_calltmp31_yspan_load, %for_test115.preheader.lr.ph ], [ %yi_load171_plus1, %for_exit118 ]
br i1 %less_xi_load122_xend_load123331, label %for_loop117.lr.ph, label %for_exit118
for_loop117.lr.ph: ; preds = %for_test115.preheader
%yi_load135_to_float = sitofp i32 %yi109.0335 to float
%mul_yi_load135_to_float_dy_load136 = fmul float %dy8, %yi_load135_to_float
%add_y0_load134_mul_yi_load135_to_float_dy_load136 = fadd float %y06, %mul_yi_load135_to_float_dy_load136
%add_y0_load134_mul_yi_load135_to_float_dy_load136_broadcast_init = insertelement <1 x float> undef, float %add_y0_load134_mul_yi_load135_to_float_dy_load136, i32 0
br i1 %cmp.i.i325, label %for_loop.i.lr.ph.us, label %if_exit159
if_exit159.us: ; preds = %for_step.i.us
%tid.i.i.us = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() #3
%add_xi120_load_calltmp169.us = add i32 %tid.i.i.us, %xi120.0332.us
%less_xi_load122_xend_load123.us = icmp slt i32 %add_xi120_load_calltmp169.us, %r.i.i
br i1 %less_xi_load122_xend_load123.us, label %for_loop.i.lr.ph.us, label %for_exit118
for_loop.i.us: ; preds = %for_loop.i.lr.ph.us, %for_step.i.us
%"oldMask&test.i329.us" = phi <1 x i32> [ %"oldMask&test.i322", %for_loop.i.lr.ph.us ], [ %"oldMask&test.i.us", %for_step.i.us ]
%break_lanes_memory.0.i328.us = phi <1 x i32> [ zeroinitializer, %for_loop.i.lr.ph.us ], [ %"mask|break_mask.i.us", %for_step.i.us ]
%25 = phi <1 x i32> [ zeroinitializer, %for_loop.i.lr.ph.us ], [ %r.i.i261.us, %for_step.i.us ]
%add_x0_load127_mul_add_xi_load128_calltmp131_to_float_dx_load132_broadcast_init305327.us = phi <1 x float> [ %add_x0_load127_mul_add_xi_load128_calltmp131_to_float_dx_load132_broadcast_init.us, %for_loop.i.lr.ph.us ], [ %add_x0_load127_mul_add_xi_load128_calltmp131_to_float_dx_load132_broadcast_init304.us, %for_step.i.us ]
%add_y0_load134_mul_yi_load135_to_float_dy_load136_broadcast_init307326.us = phi <1 x float> [ %add_y0_load134_mul_yi_load135_to_float_dy_load136_broadcast_init, %for_loop.i.lr.ph.us ], [ %add_y0_load134_mul_yi_load135_to_float_dy_load136_broadcast_init306.us, %for_step.i.us ]
%"internal_mask&function_mask12.i.us" = and <1 x i32> %"oldMask&test.i329.us", %mask
%mul_z_re_load_z_re_load13.i.us = fmul <1 x float> %add_x0_load127_mul_add_xi_load128_calltmp131_to_float_dx_load132_broadcast_init305327.us, %add_x0_load127_mul_add_xi_load128_calltmp131_to_float_dx_load132_broadcast_init305327.us
%mul_z_im_load_z_im_load14.i.us = fmul <1 x float> %add_y0_load134_mul_yi_load135_to_float_dy_load136_broadcast_init307326.us, %add_y0_load134_mul_yi_load135_to_float_dy_load136_broadcast_init307326.us
%add_mul_z_re_load_z_re_load13_mul_z_im_load_z_im_load14.i.us = fadd <1 x float> %mul_z_im_load_z_im_load14.i.us, %mul_z_re_load_z_re_load13.i.us
%greater_add_mul_z_re_load_z_re_load13_mul_z_im_load_z_im_load14_.i.us = fcmp ugt <1 x float> %add_mul_z_re_load_z_re_load13_mul_z_im_load_z_im_load14.i.us, <float 4.000000e+00>
%"oldMask&test16.i.us" = select <1 x i1> %greater_add_mul_z_re_load_z_re_load13_mul_z_im_load_z_im_load14_.i.us, <1 x i32> %"oldMask&test.i329.us", <1 x i32> zeroinitializer
%"mask|break_mask.i.us" = or <1 x i32> %"oldMask&test16.i.us", %break_lanes_memory.0.i328.us
%"finished&func.i.us" = and <1 x i32> %"mask|break_mask.i.us", %mask
%item.i63.i.us = extractelement <1 x i32> %"finished&func.i.us", i32 0
%v.i64.i.us = lshr i32 %item.i63.i.us, 31
%item.i62.i.us = extractelement <1 x i32> %"internal_mask&function_mask12.i.us", i32 0
%v.i.i.us = lshr i32 %item.i62.i.us, 31
%"equal_finished&func_internal_mask&function_mask12.i.us" = icmp eq i32 %v.i64.i.us, %v.i.i.us
br i1 %"equal_finished&func_internal_mask&function_mask12.i.us", label %for_step.i.us, label %not_all_continued_or_breaked.i.us
not_all_continued_or_breaked.i.us: ; preds = %for_loop.i.us
%"!(break|continue)_lanes.i.us" = xor <1 x i32> %"mask|break_mask.i.us", <i32 -1>
%new_mask28.i.us = and <1 x i32> %"oldMask&test.i329.us", %"!(break|continue)_lanes.i.us"
%sub_mul_z_re_load31_z_re_load32_mul_z_im_load33_z_im_load34.i.us = fsub <1 x float> %mul_z_re_load_z_re_load13.i.us, %mul_z_im_load_z_im_load14.i.us
%mul__z_re_load35.i.us = fmul <1 x float> %add_x0_load127_mul_add_xi_load128_calltmp131_to_float_dx_load132_broadcast_init305327.us, <float 2.000000e+00>
%mul_mul__z_re_load35_z_im_load36.i.us = fmul <1 x float> %add_y0_load134_mul_yi_load135_to_float_dy_load136_broadcast_init307326.us, %mul__z_re_load35.i.us
%add_c_re_load42_new_re_load.i.us = fadd <1 x float> %add_x0_load127_mul_add_xi_load128_calltmp131_to_float_dx_load132_broadcast_init.us, %sub_mul_z_re_load31_z_re_load32_mul_z_im_load33_z_im_load34.i.us
%add_c_im_load44_new_im_load.i.us = fadd <1 x float> %add_y0_load134_mul_yi_load135_to_float_dy_load136_broadcast_init, %mul_mul__z_re_load35_z_im_load36.i.us
br label %for_step.i.us
for_step.i.us: ; preds = %not_all_continued_or_breaked.i.us, %for_loop.i.us
%add_y0_load134_mul_yi_load135_to_float_dy_load136_broadcast_init306.us = phi <1 x float> [ %add_y0_load134_mul_yi_load135_to_float_dy_load136_broadcast_init307326.us, %for_loop.i.us ], [ %add_c_im_load44_new_im_load.i.us, %not_all_continued_or_breaked.i.us ]
%add_x0_load127_mul_add_xi_load128_calltmp131_to_float_dx_load132_broadcast_init304.us = phi <1 x float> [ %add_x0_load127_mul_add_xi_load128_calltmp131_to_float_dx_load132_broadcast_init305327.us, %for_loop.i.us ], [ %add_c_re_load42_new_re_load.i.us, %not_all_continued_or_breaked.i.us ]
%internal_mask_memory.1.i.us = phi <1 x i32> [ zeroinitializer, %for_loop.i.us ], [ %new_mask28.i.us, %not_all_continued_or_breaked.i.us ]
%m.i.i.us = extractelement <1 x i32> %internal_mask_memory.1.i.us, i32 0
%d0.i.i259.us = extractelement <1 x i32> %25, i32 0
%not.cmp.i.i258.us = icmp ne i32 %m.i.i.us, 0
%d1.i.i260.us = zext i1 %not.cmp.i.i258.us to i32
%sel.i.i.us = add i32 %d0.i.i259.us, %d1.i.i260.us
%r.i.i261.us = insertelement <1 x i32> undef, i32 %sel.i.i.us, i32 0
%less_i_load_count_load.i.us = icmp slt <1 x i32> %r.i.i261.us, %maxIterations_load140_broadcast_init
%"oldMask&test.i.us" = select <1 x i1> %less_i_load_count_load.i.us, <1 x i32> %internal_mask_memory.1.i.us, <1 x i32> zeroinitializer
%"internal_mask&function_mask10.i.us" = and <1 x i32> %"oldMask&test.i.us", %mask
%item.i.i.us = extractelement <1 x i32> %"internal_mask&function_mask10.i.us", i32 0
%cmp.i.i.us = icmp slt i32 %item.i.i.us, 0
br i1 %cmp.i.i.us, label %for_loop.i.us, label %if_exit159.us
for_loop.i.lr.ph.us: ; preds = %if_exit159.us, %for_loop117.lr.ph
%xi120.0332.us = phi i32 [ %add_xi120_load_calltmp169.us, %if_exit159.us ], [ %mul_calltmp_xspan_load, %for_loop117.lr.ph ]
%tid.i.i184.us = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3
%tid.i.i.i185.us = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() #3
%sub_calltmp3_.i186.us = add i32 %tid.i.i.i185.us, -1
%bitop.i187.us = and i32 %sub_calltmp3_.i186.us, %tid.i.i184.us
%add_xi_load128_calltmp131.us = add i32 %bitop.i187.us, %xi120.0332.us
%add_xi_load128_calltmp131_to_float.us = sitofp i32 %add_xi_load128_calltmp131.us to float
%mul_add_xi_load128_calltmp131_to_float_dx_load132.us = fmul float %dx4, %add_xi_load128_calltmp131_to_float.us
%add_x0_load127_mul_add_xi_load128_calltmp131_to_float_dx_load132.us = fadd float %x02, %mul_add_xi_load128_calltmp131_to_float_dx_load132.us
%add_x0_load127_mul_add_xi_load128_calltmp131_to_float_dx_load132_broadcast_init.us = insertelement <1 x float> undef, float %add_x0_load127_mul_add_xi_load128_calltmp131_to_float_dx_load132.us, i32 0
br label %for_loop.i.us
for_exit118: ; preds = %if_exit159, %if_exit159.us, %for_test115.preheader
; Row loop step of the mask-off copy: yi += 1, exit when yi reaches %16.
%yi_load171_plus1 = add i32 %yi109.0335, 1
%exitcond337 = icmp eq i32 %yi_load171_plus1, %16
br i1 %exitcond337, label %for_exit, label %for_test115.preheader
if_exit159: ; preds = %if_exit159, %for_loop117.lr.ph
; Column loop with the iteration loop skipped: only advances xi by warpsize.
%xi120.0332 = phi i32 [ %add_xi120_load_calltmp169, %if_exit159 ], [ %mul_calltmp_xspan_load, %for_loop117.lr.ph ]
%tid.i.i = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() #3
%add_xi120_load_calltmp169 = add i32 %tid.i.i, %xi120.0332
%less_xi_load122_xend_load123 = icmp slt i32 %add_xi120_load_calltmp169, %r.i.i
br i1 %less_xi_load122_xend_load123, label %if_exit159, label %for_exit118
}
attributes #0 = { nounwind readnone }
attributes #1 = { alwaysinline nounwind readnone }
attributes #2 = { alwaysinline nounwind }
attributes #3 = { nounwind }
attributes #4 = { alwaysinline nounwind readonly }
attributes #5 = { nounwind "target-features"="+sm_35" }
!nvvm.annotations = !{!1}
!1 = metadata !{void ({ float, float, float, float, i32, i32, i32, i32, i32, i32*, <1 x i32> }* , i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)* @mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E_, metadata !"kernel", i32 1}

View File

@@ -0,0 +1,53 @@
#include <stdio.h>
#define blockIndex0 (blockIdx.x)
#define blockIndex1 (blockIdx.y)
#define vectorWidth (32)
#define vectorIndex (threadIdx.x & (vectorWidth-1))
// Escape-time count for the point c = (c_re, c_im).
//
// Starting from z = c, repeatedly applies z <- z*z + c and returns the index
// of the first iteration at which |z|^2 exceeds 4, or `count` if that never
// happens within `count` iterations (0 when count <= 0).
int __device__ __forceinline__
mandel(float c_re, float c_im, int count)
{
    float re = c_re;
    float im = c_im;
    int iter = 0;

    // Written as "not greater than" so a NaN magnitude keeps iterating,
    // exactly like the "> 4.0f then break" form.
    while (iter < count && !(re * re + im * im > 4.0f)) {
        const float next_re = re*re - im*im;
        const float next_im = 2.0f * re * im;
        re = c_re + next_re;
        im = c_im + next_im;
        ++iter;
    }
    return iter;
}
// Kernel: each block renders one xspan-by-yspan tile of the image.
//
//   x0, dx / y0, dy : origin and per-pixel step in the complex plane
//   width, height   : image size in pixels (the tile is clamped to it)
//   xspan, yspan    : tile size handled by one block
//   maxIterations   : iteration limit passed to mandel()
//   output          : row-major result buffer, indexed as yi * width + column
//
// Within a row the threads advance in steps of vectorWidth columns, each
// thread taking the column selected by vectorIndex.
extern "C"
__global__ void mandelbrot_scanline(
    float x0, float dx,
    float y0, float dy,
    int width, int height,
    int xspan, int yspan,
    int maxIterations, int output[])
{
    // Tile owned by this block, clamped to the image bounds.
    const int col_begin = blockIndex0 * xspan;
    const int col_end   = min(col_begin + xspan, width);
    const int row_begin = blockIndex1 * yspan;
    const int row_end   = min(row_begin + yspan, height);

    for (int yi = row_begin; yi < row_end; yi++) {
        // Imaginary coordinate is the same for the whole row.
        const float y = y0 + yi * dy;

        for (int xi = col_begin; xi < col_end; xi += vectorWidth) {
            // Kept unsigned: that is the type of (xi + vectorIndex), and it
            // decides the int->float conversion and the bounds comparison.
            const unsigned int col = xi + vectorIndex;
            const float x = x0 + col * dx;
            const int iters = mandel(x, y, maxIterations);
            const int index = yi * width + col;

            // Threads past the right edge of the tile compute but do not store.
            if (col < col_end)
                output[index] = iters;
        }
    }
}

Binary file not shown.

Binary file not shown.

View File

@@ -0,0 +1,213 @@
//
// Generated by NVIDIA NVVM Compiler
// Compiler built on Thu Jul 18 02:37:37 2013 (1374107857)
// Cuda compilation tools, release 5.5, V5.5.0
//
.version 3.2
.target sm_35
.address_size 64
.file 1 "/home/evghenii/soft/ispc-code/ispc/examples/mandelbrot_tasks3d/mandel_task_cu.cu", 1383122156, 1370
.file 2 "/usr/local/cuda-5.5/bin/..//include/cuda_device_runtime_api.h", 1375338991, 7655
.file 3 "/usr/local/cuda-5.5/bin/..//include/device_functions.h", 1375338991, 185228
.extern .func (.param .b32 func_retval0) vprintf
(
.param .b64 vprintf_param_0,
.param .b64 vprintf_param_1
)
;
.global .align 1 .b8 $str[26] = {118, 101, 99, 116, 111, 114, 73, 110, 100, 101, 120, 61, 32, 37, 100, 32, 32, 98, 105, 100, 61, 32, 37, 100, 10, 0};
// Weak device-side stub for cudaMalloc (.loc points into
// cuda_device_runtime_api.h): ignores both parameters and always returns 30.
// NOTE(review): 30 is presumably the cudaErrorUnknown status code - confirm
// against the CUDA 5.5 headers.
.weak .func (.param .b32 func_retval0) cudaMalloc(
.param .b64 cudaMalloc_param_0,
.param .b64 cudaMalloc_param_1
)
{
.reg .s32 %r<2>;
mov.u32 %r1, 30;
st.param.b32 [func_retval0+0], %r1;
.loc 2 66 3
ret;
}
// Weak device-side stub for cudaFuncGetAttributes (.loc points into
// cuda_device_runtime_api.h): ignores both parameters and always returns 30.
// NOTE(review): 30 is presumably the cudaErrorUnknown status code - confirm
// against the CUDA 5.5 headers.
.weak .func (.param .b32 func_retval0) cudaFuncGetAttributes(
.param .b64 cudaFuncGetAttributes_param_0,
.param .b64 cudaFuncGetAttributes_param_1
)
{
.reg .s32 %r<2>;
mov.u32 %r1, 30;
st.param.b32 [func_retval0+0], %r1;
.loc 2 71 3
ret;
}
// Kernel entry point mandelbrot_scanline.
//
// Parameters and the registers they are loaded into (names follow the CUDA
// kernel of the same name; the .loc directives refer to mandel_task_cu.cu):
//   param_0 x0 -> %f9      param_1 dx -> %f10
//   param_2 y0 -> %f11     param_3 dy -> %f12
//   param_4 width -> %r14  param_5 height -> %r17
//   param_6 xspan -> %r15  param_7 yspan  -> %r18
//   param_8 maxIterations -> %r16
//   param_9 output -> %rd1
//
// Long-lived registers:
//   %r20 = tid.x & 31 (lane)       %r1 = xend   %r3 = yend
//   %r37 = current row yi          %r38 = current column base xi
//   %r39 = iteration count of the current pixel
//
// NOTE(review): this PTX calls vprintf before doing any work, so it appears
// to have been generated from a version of the kernel that contained a debug
// printf - confirm it matches the checked-in .cu source.
.visible .entry mandelbrot_scanline(
.param .f32 mandelbrot_scanline_param_0,
.param .f32 mandelbrot_scanline_param_1,
.param .f32 mandelbrot_scanline_param_2,
.param .f32 mandelbrot_scanline_param_3,
.param .u32 mandelbrot_scanline_param_4,
.param .u32 mandelbrot_scanline_param_5,
.param .u32 mandelbrot_scanline_param_6,
.param .u32 mandelbrot_scanline_param_7,
.param .u32 mandelbrot_scanline_param_8,
.param .u64 mandelbrot_scanline_param_9
)
{
// 8-byte local buffer that holds the two vprintf arguments.
.local .align 8 .b8 __local_depot2[8];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .pred %p<9>;
.reg .s32 %r<40>;
.reg .f32 %f<20>;
.reg .s64 %rd<8>;
mov.u64 %SPL, __local_depot2;
cvta.local.u64 %SP, %SPL;
ld.param.f32 %f9, [mandelbrot_scanline_param_0];
ld.param.f32 %f10, [mandelbrot_scanline_param_1];
ld.param.f32 %f11, [mandelbrot_scanline_param_2];
ld.param.f32 %f12, [mandelbrot_scanline_param_3];
ld.param.u32 %r14, [mandelbrot_scanline_param_4];
ld.param.u32 %r17, [mandelbrot_scanline_param_5];
ld.param.u32 %r15, [mandelbrot_scanline_param_6];
ld.param.u32 %r18, [mandelbrot_scanline_param_7];
ld.param.u32 %r16, [mandelbrot_scanline_param_8];
ld.param.u64 %rd1, [mandelbrot_scanline_param_9];
add.u64 %rd2, %SP, 0;
.loc 1 35 1
cvta.to.local.u64 %rd3, %rd2;
// vprintf($str, {tid.x & 31, ntid.x}); $str decodes to
// "vectorIndex= %d  bid= %d\n".  Note the second value is %ntid.x.
mov.u32 %r19, %tid.x;
and.b32 %r20, %r19, 31;
mov.u32 %r21, %ntid.x;
cvta.global.u64 %rd4, $str;
st.local.v2.u32 [%rd3], {%r20, %r21};
// Callseq Start 0
{
.reg .b32 temp_param_reg;
.param .b64 param0;
st.param.b64 [param0+0], %rd4;
.param .b64 param1;
st.param.b64 [param1+0], %rd2;
.param .b32 retval0;
.loc 1 35 1
call.uni (retval0),
vprintf,
(
param0,
param1
);
ld.param.b32 %r22, [retval0+0];
}
// Callseq End 0
.loc 1 36 1
mov.u32 %r23, %ctaid.x;
.loc 1 37 1
// xend = min(ctaid.x * xspan + xspan, width)
mad.lo.s32 %r24, %r23, %r15, %r15;
.loc 3 2621 10
min.s32 %r1, %r24, %r14;
.loc 1 39 1
// yi = ystart = ctaid.y * yspan ; yend = min(ystart + yspan, height)
mov.u32 %r25, %ctaid.y;
mul.lo.s32 %r37, %r25, %r18;
.loc 1 40 1
add.s32 %r26, %r37, %r18;
.loc 3 2621 10
min.s32 %r3, %r26, %r17;
.loc 1 42 1
// Empty tile in y: nothing to do.
setp.ge.s32 %p1, %r37, %r3;
@%p1 bra BB2_12;
cvta.to.global.u64 %rd5, %rd1;
// Row loop head: restart xi at xstart = ctaid.x * xspan.
BB2_2:
.loc 1 36 1
mul.lo.s32 %r38, %r23, %r15;
.loc 1 43 1
setp.ge.s32 %p2, %r38, %r1;
@%p2 bra BB2_11;
.loc 1 46 1
// %f1 = y = (float)yi * dy + y0
cvt.rn.f32.s32 %f13, %r37;
fma.rn.f32 %f1, %f13, %f12, %f11;
// Column loop head.
BB2_4:
.loc 1 45 1
// %r7 = xi + lane ; %f2 = x = (float)(unsigned)%r7 * dx + x0
add.s32 %r7, %r20, %r38;
cvt.rn.f32.u32 %f14, %r7;
fma.rn.f32 %f2, %f14, %f10, %f9;
mov.u32 %r39, 0;
// Skip the iteration loop entirely when maxIterations <= 0.
setp.gt.s32 %p3, %r16, 0;
.loc 1 12 1
@%p3 bra BB2_5;
bra.uni BB2_8;
BB2_5:
// Initial z: %f19 = x (z_re), %f18 = y (z_im).
mov.f32 %f18, %f1;
mov.f32 %f19, %f2;
// Iteration loop: %f4 = z_re, %f3 = z_im.
BB2_6:
.loc 1 13 1
mov.f32 %f4, %f19;
mov.f32 %f3, %f18;
mul.f32 %f5, %f3, %f3;
mul.f32 %f6, %f4, %f4;
add.f32 %f15, %f6, %f5;
// Break when z_re^2 + z_im^2 > 4.0 (0f40800000).
setp.gt.f32 %p4, %f15, 0f40800000;
@%p4 bra BB2_8;
.loc 1 16 1
// new z_re = x + (z_re^2 - z_im^2) ; new z_im = (2 * z_re) * z_im + y
sub.f32 %f16, %f6, %f5;
.loc 1 17 1
add.f32 %f17, %f4, %f4;
.loc 1 19 1
add.f32 %f7, %f2, %f16;
.loc 1 20 1
fma.rn.f32 %f8, %f17, %f3, %f1;
.loc 1 12 96
add.s32 %r39, %r39, 1;
.loc 1 12 1
setp.lt.s32 %p5, %r39, %r16;
mov.f32 %f18, %f8;
mov.f32 %f19, %f7;
@%p5 bra BB2_6;
// Pixel finished: %r11 = yi * width + xi + lane.
BB2_8:
.loc 1 49 1
mad.lo.s32 %r34, %r37, %r14, %r38;
add.s32 %r11, %r34, %r20;
.loc 1 50 1
// Store only when xi + lane < xend (unsigned comparison).
setp.ge.u32 %p6, %r7, %r1;
@%p6 bra BB2_10;
.loc 1 51 1
mul.wide.s32 %rd6, %r11, 4;
add.s64 %rd7, %rd5, %rd6;
st.global.u32 [%rd7], %r39;
// Column loop step: xi += 32.
BB2_10:
.loc 1 43 57
add.s32 %r38, %r38, 32;
.loc 1 43 1
setp.lt.s32 %p7, %r38, %r1;
@%p7 bra BB2_4;
// Row loop step: yi += 1.
BB2_11:
.loc 1 42 57
add.s32 %r37, %r37, 1;
.loc 1 42 1
setp.lt.s32 %p8, %r37, %r3;
@%p8 bra BB2_2;
BB2_12:
.loc 1 53 2
ret;
}

View File

@@ -0,0 +1,22 @@
/*
 * Declaration of the tile task launched by mandelbrot_ispc() below; the
 * definition is not in this file.
 *
 * mandelbrot_ispc() passes: the origin (x0, y0) and per-pixel steps (dx, dy),
 * the image size (width, height), the tile size handled by one task
 * (xspan, yspan), the iteration limit, and the output array.
 */
extern task void
mandelbrot_scanline(
uniform float x0, uniform float dx,
uniform float y0, uniform float dy,
uniform int width, uniform int height,
uniform int xspan, uniform int yspan,
uniform int maxIterations, uniform int output[]);
/*
 * Exported entry point: splits a width x height image into xspan x yspan
 * tiles and launches one mandelbrot_scanline task per tile.
 *
 *   x0, y0        coordinates mapped to pixel (0, 0)
 *   x1, y1        coordinates mapped to pixel (width, height)
 *   width, height image size in pixels
 *   maxIterations iteration limit forwarded to every task
 *   output        array the tasks write into
 *
 * The task counts are rounded UP so that an image whose size is not a
 * multiple of the tile size still gets its right/bottom edge tiles launched
 * (the previous truncating division silently dropped them).
 * NOTE(review): this relies on the task clamping its tile to width/height;
 * the task definition is not in this file -- confirm.
 */
export void
mandelbrot_ispc(uniform float x0, uniform float y0,
                uniform float x1, uniform float y1,
                uniform int width, uniform int height,
                uniform int maxIterations, uniform int output[]) {
    uniform float dx = (x1 - x0) / width;
    uniform float dy = (y1 - y0) / height;
    const uniform int xspan = 16; /* make sure it is big enough to avoid false-sharing */
    const uniform int yspan = 16;
    /* Ceiling division: number of tiles needed to cover the whole image. */
    const uniform int xtiles = (width + xspan - 1) / xspan;
    const uniform int ytiles = (height + yspan - 1) / yspan;
    launch [xtiles, ytiles]
        mandelbrot_scanline(x0, dx, y0, dy, width, height, xspan, yspan,
                            maxIterations, output);
}

Binary file not shown.

View File

@@ -0,0 +1,91 @@
/*
 * Target-portability aliases used by the code below.
 * When __NVPTX__ is defined the names expand to calls (blockIndex0(),
 * blockIndex1(), warpSize(), laneIndex()); otherwise they expand to the
 * identifiers taskIndex0, taskIndex1, programCount and programIndex.
 */
#ifdef __NVPTX__
#define blockIndex0 blockIndex0()
#define blockIndex1 blockIndex1()
#define vectorWidth warpSize()
#define vectorIndex laneIndex()
#else
#define blockIndex0 taskIndex0
#define blockIndex1 taskIndex1
#define vectorWidth programCount
#define vectorIndex programIndex
#endif
/*
 * Disabled (#if 0) declarations: one varying and one uniform 100-element
 * float array. They are never compiled; the #else branch that follows holds
 * all of the code that is actually built, up to the final #endif of the file.
 */
#if 0
varying float mem_private[100];
uniform float mem_shared [100];
#else
/*
 * Escape-time iteration for the point c = (c_re, c_im).
 *
 * Starting from z = c, repeatedly replaces z with z*z + c and returns how
 * many updates were performed before |z|^2 exceeded 4, or `count` if that
 * never happened within `count` steps.
 */
static inline int
mandel(float c_re, float c_im, int count)
{
    float z_re = c_re;
    float z_im = c_im;
    int steps = 0;
    while (steps < count) {
        /* Escaped: stop before counting this step. */
        if (z_re * z_re + z_im * z_im > 4.f)
            break;
        float sq_re = z_re*z_re - z_im*z_im;
        float sq_im = 2.f * z_re * z_im;
        /* The z update is done for every lane, ignoring the execution mask. */
        unmasked {
            z_re = c_re + sq_re;
            z_im = c_im + sq_im;
        }
        ++steps;
    }
    return steps;
}
/*
 * Task body: computes one xspan x yspan tile of the image.
 *
 * The tile is selected by (blockIndex0, blockIndex1) and clamped to the
 * image bounds. For every pixel of the tile the iteration count returned by
 * mandel() is stored at output[row * width + column].
 */
task void
mandelbrot_scanline(
    uniform float x0, uniform float dx,
    uniform float y0, uniform float dy,
    uniform int width, uniform int height,
    uniform int xspan, uniform int yspan,
    uniform int maxIterations, uniform int output[])
{
    /* Tile bounds, clamped so edge tiles never run past the image. */
    const uniform int colBegin = blockIndex0 * xspan;
    const uniform int colEnd   = min(colBegin + xspan, width);
    const uniform int rowBegin = blockIndex1 * yspan;
    const uniform int rowEnd   = min(rowBegin + yspan, height);
    // assert(xspan >= vectorWidth);
    for (uniform int yi = rowBegin; yi < rowEnd; yi++) {
        /* Values that are the same for every pixel of this row. */
        const float y = y0 + yi * dy;
        const uniform int rowOffset = yi * width;
        for (uniform int xi = colBegin; xi < colEnd; xi += vectorWidth) {
            const float x = x0 + (xi + vectorIndex) * dx;
            const int res = mandel(x, y, maxIterations);
            const int index = rowOffset + (xi + vectorIndex);
            /* The last vector of a row may overhang the tile: mask the store. */
            if (xi + vectorIndex < colEnd) {
                output[index] = res;
            }
        }
    }
}
#if 1
/*
 * Exported entry point: splits a width x height image into xspan x yspan
 * tiles and launches one mandelbrot_scanline task per tile.
 *
 *   x0, y0        coordinates mapped to pixel (0, 0)
 *   x1, y1        coordinates mapped to pixel (width, height)
 *   width, height image size in pixels
 *   maxIterations iteration limit forwarded to every task
 *   output        array of at least width * height ints, written by the tasks
 *
 * The task counts are rounded UP so that an image whose size is not a
 * multiple of the tile size still gets its right/bottom edge tiles launched
 * (the previous truncating division silently dropped them). This is safe
 * because mandelbrot_scanline clamps its tile with min() against
 * width/height and masks its store against the clamped x end.
 */
export void
mandelbrot_ispc(uniform float x0, uniform float y0,
                uniform float x1, uniform float y1,
                uniform int width, uniform int height,
                uniform int maxIterations, uniform int output[]) {
    uniform float dx = (x1 - x0) / width;
    uniform float dy = (y1 - y0) / height;
    const uniform int xspan = 32; /* make sure it is big enough to avoid false-sharing */
    const uniform int yspan = 4;
    /* Ceiling division: number of tiles needed to cover the whole image. */
    const uniform int xtiles = (width + xspan - 1) / xspan;
    const uniform int ytiles = (height + yspan - 1) / yspan;
    launch [xtiles, ytiles]
        mandelbrot_scanline(x0, dx, y0, dy, width, height, xspan, yspan,
                            maxIterations, output);
#if 0
    /* Disabled: the same launch written with the bracket-per-dimension syntax. */
    launch [xtiles] [ytiles]
        mandelbrot_scanline(x0, dx, y0, dy, width, height, xspan, yspan,
                            maxIterations, output);
#endif
}
#endif
#endif

View File

@@ -0,0 +1,676 @@
; ModuleID = 'mandelbrot_task.bc'
target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-unknown-linux-gnu"
declare i8* @ISPCAlloc(i8**, i64, i32)
declare void @ISPCLaunch(i8**, i8*, i8*, i32, i32, i32)
declare void @ISPCSync(i8*)
declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone
declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone
define void @mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E_({ float, float, float, float, i32, i32, i32, i32, i32, i32*, <8 x i32> }*, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) {
allocas:
%x01 = getelementptr { float, float, float, float, i32, i32, i32, i32, i32, i32*, <8 x i32> }* %0, i64 0, i32 0
%x02 = load float* %x01, align 4
%dx3 = getelementptr { float, float, float, float, i32, i32, i32, i32, i32, i32*, <8 x i32> }* %0, i64 0, i32 1
%dx4 = load float* %dx3, align 4
%y05 = getelementptr { float, float, float, float, i32, i32, i32, i32, i32, i32*, <8 x i32> }* %0, i64 0, i32 2
%y06 = load float* %y05, align 4
%dy7 = getelementptr { float, float, float, float, i32, i32, i32, i32, i32, i32*, <8 x i32> }* %0, i64 0, i32 3
%dy8 = load float* %dy7, align 4
%width9 = getelementptr { float, float, float, float, i32, i32, i32, i32, i32, i32*, <8 x i32> }* %0, i64 0, i32 4
%width10 = load i32* %width9, align 4
%height11 = getelementptr { float, float, float, float, i32, i32, i32, i32, i32, i32*, <8 x i32> }* %0, i64 0, i32 5
%height12 = load i32* %height11, align 4
%xspan13 = getelementptr { float, float, float, float, i32, i32, i32, i32, i32, i32*, <8 x i32> }* %0, i64 0, i32 6
%xspan14 = load i32* %xspan13, align 4
%yspan15 = getelementptr { float, float, float, float, i32, i32, i32, i32, i32, i32*, <8 x i32> }* %0, i64 0, i32 7
%yspan16 = load i32* %yspan15, align 4
%maxIterations17 = getelementptr { float, float, float, float, i32, i32, i32, i32, i32, i32*, <8 x i32> }* %0, i64 0, i32 8
%maxIterations18 = load i32* %maxIterations17, align 4
%output19 = getelementptr { float, float, float, float, i32, i32, i32, i32, i32, i32*, <8 x i32> }* %0, i64 0, i32 9
%output20 = load i32** %output19, align 8
%task_struct_mask = getelementptr { float, float, float, float, i32, i32, i32, i32, i32, i32*, <8 x i32> }* %0, i64 0, i32 10
%mask = load <8 x i32>* %task_struct_mask, align 32
%floatmask.i = bitcast <8 x i32> %mask to <8 x float>
%v.i = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask.i)
%cmp.i = icmp eq i32 %v.i, 255
%mul_taskIndex0_load_xspan_load = mul i32 %xspan14, %5
%add_xstart_load_xspan_load23 = add i32 %mul_taskIndex0_load_xspan_load, %xspan14
%ret_veca.i.i = insertelement <4 x i32> undef, i32 %add_xstart_load_xspan_load23, i32 0
%ret_vecb.i.i = insertelement <4 x i32> undef, i32 %width10, i32 0
%ret_val.i.i = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %ret_veca.i.i, <4 x i32> %ret_vecb.i.i)
%ret.i.i = extractelement <4 x i32> %ret_val.i.i, i32 0
%mul_taskIndex1_load_yspan_load = mul i32 %yspan16, %6
%add_ystart_load_yspan_load26 = add i32 %mul_taskIndex1_load_yspan_load, %yspan16
%ret_veca.i.i220 = insertelement <4 x i32> undef, i32 %add_ystart_load_yspan_load26, i32 0
%ret_vecb.i.i221 = insertelement <4 x i32> undef, i32 %height12, i32 0
%ret_val.i.i222 = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %ret_veca.i.i220, <4 x i32> %ret_vecb.i.i221)
%ret.i.i223 = extractelement <4 x i32> %ret_val.i.i222, i32 0
%less_yi_load_yend_load345 = icmp slt i32 %mul_taskIndex1_load_yspan_load, %ret.i.i223
br i1 %cmp.i, label %for_test.preheader, label %for_test92.preheader
for_test92.preheader: ; preds = %allocas
br i1 %less_yi_load_yend_load345, label %for_test103.preheader.lr.ph, label %for_exit
for_test103.preheader.lr.ph: ; preds = %for_test92.preheader
%less_xi_load110_xend_load111360 = icmp slt i32 %mul_taskIndex0_load_xspan_load, %ret.i.i
%x0_load115_broadcast_init = insertelement <8 x float> undef, float %x02, i32 0
%x0_load115_broadcast = shufflevector <8 x float> %x0_load115_broadcast_init, <8 x float> undef, <8 x i32> zeroinitializer
%dx_load117_broadcast_init = insertelement <8 x float> undef, float %dx4, i32 0
%dx_load117_broadcast = shufflevector <8 x float> %dx_load117_broadcast_init, <8 x float> undef, <8 x i32> zeroinitializer
%maxIterations_load125_broadcast_init = insertelement <8 x i32> undef, i32 %maxIterations18, i32 0
%maxIterations_load125_broadcast = shufflevector <8 x i32> %maxIterations_load125_broadcast_init, <8 x i32> undef, <8 x i32> zeroinitializer
%less_i_load_count_load.i347 = icmp sgt <8 x i32> %maxIterations_load125_broadcast, zeroinitializer
%"oldMask&test.i348" = select <8 x i1> %less_i_load_count_load.i347, <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <8 x i32> zeroinitializer
%"internal_mask&function_mask10.i349" = and <8 x i32> %"oldMask&test.i348", %mask
%floatmask.i.i350 = bitcast <8 x i32> %"internal_mask&function_mask10.i349" to <8 x float>
%xend_load134_broadcast_init = insertelement <8 x i32> undef, i32 %ret.i.i, i32 0
%xend_load134_broadcast = shufflevector <8 x i32> %xend_load134_broadcast_init, <8 x i32> undef, <8 x i32> zeroinitializer
%output_load145_ptr2int_2void = bitcast i32* %output20 to i8*
br label %for_test103.preheader
for_test.preheader: ; preds = %allocas
br i1 %less_yi_load_yend_load345, label %for_test34.preheader.lr.ph, label %for_exit
for_test34.preheader.lr.ph: ; preds = %for_test.preheader
%less_xi_load_xend_load343 = icmp slt i32 %mul_taskIndex0_load_xspan_load, %ret.i.i
%x0_load_broadcast_init = insertelement <8 x float> undef, float %x02, i32 0
%x0_load_broadcast = shufflevector <8 x float> %x0_load_broadcast_init, <8 x float> undef, <8 x i32> zeroinitializer
%dx_load_broadcast_init = insertelement <8 x float> undef, float %dx4, i32 0
%dx_load_broadcast = shufflevector <8 x float> %dx_load_broadcast_init, <8 x float> undef, <8 x i32> zeroinitializer
%maxIterations_load_broadcast_init = insertelement <8 x i32> undef, i32 %maxIterations18, i32 0
%maxIterations_load_broadcast = shufflevector <8 x i32> %maxIterations_load_broadcast_init, <8 x i32> undef, <8 x i32> zeroinitializer
%less_i_load_count_load.i181332 = icmp sgt <8 x i32> %maxIterations_load_broadcast, zeroinitializer
%"oldMask&test.i182333" = select <8 x i1> %less_i_load_count_load.i181332, <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <8 x i32> zeroinitializer
%floatmask.i.i183334 = bitcast <8 x i32> %"oldMask&test.i182333" to <8 x float>
%xend_load51_broadcast_init = insertelement <8 x i32> undef, i32 %ret.i.i, i32 0
%xend_load51_broadcast = shufflevector <8 x i32> %xend_load51_broadcast_init, <8 x i32> undef, <8 x i32> zeroinitializer
%output_load_ptr2int_2void = bitcast i32* %output20 to i8*
br label %for_test34.preheader
for_test34.preheader: ; preds = %for_exit37, %for_test34.preheader.lr.ph
%yi.0346 = phi i32 [ %mul_taskIndex1_load_yspan_load, %for_test34.preheader.lr.ph ], [ %yi_load69_plus1, %for_exit37 ]
br i1 %less_xi_load_xend_load343, label %for_loop36.lr.ph, label %for_exit37
for_loop36.lr.ph: ; preds = %for_test34.preheader
%yi_load43_to_float = sitofp i32 %yi.0346 to float
%mul_yi_load43_to_float_dy_load = fmul float %dy8, %yi_load43_to_float
%add_y0_load_mul_yi_load43_to_float_dy_load = fadd float %y06, %mul_yi_load43_to_float_dy_load
%add_y0_load_mul_yi_load43_to_float_dy_load_broadcast_init = insertelement <8 x float> undef, float %add_y0_load_mul_yi_load43_to_float_dy_load, i32 0
%add_y0_load_mul_yi_load43_to_float_dy_load_broadcast = shufflevector <8 x float> %add_y0_load_mul_yi_load43_to_float_dy_load_broadcast_init, <8 x float> undef, <8 x i32> zeroinitializer
%mul_yi_load47_width_load48 = mul i32 %yi.0346, %width10
%mul_yi_load47_width_load48_broadcast_init = insertelement <8 x i32> undef, i32 %mul_yi_load47_width_load48, i32 0
%mul_yi_load47_width_load48_broadcast = shufflevector <8 x i32> %mul_yi_load47_width_load48_broadcast_init, <8 x i32> undef, <8 x i32> zeroinitializer
br label %for_loop36
for_exit: ; preds = %for_exit106, %for_exit37, %for_test.preheader, %for_test92.preheader
ret void
for_loop36: ; preds = %safe_if_after_true, %for_loop36.lr.ph
%xi.0344 = phi i32 [ %mul_taskIndex0_load_xspan_load, %for_loop36.lr.ph ], [ %add_xi_load68_, %safe_if_after_true ]
%xi_load42_broadcast_init = insertelement <8 x i32> undef, i32 %xi.0344, i32 0
%xi_load42_broadcast = shufflevector <8 x i32> %xi_load42_broadcast_init, <8 x i32> undef, <8 x i32> zeroinitializer
%add_xi_load42_broadcast_ = add <8 x i32> %xi_load42_broadcast, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%add_xi_load42_broadcast__to_float = sitofp <8 x i32> %add_xi_load42_broadcast_ to <8 x float>
%mul_add_xi_load42_broadcast__to_float_dx_load_broadcast = fmul <8 x float> %dx_load_broadcast, %add_xi_load42_broadcast__to_float
%add_x0_load_broadcast_mul_add_xi_load42_broadcast__to_float_dx_load_broadcast = fadd <8 x float> %x0_load_broadcast, %mul_add_xi_load42_broadcast__to_float_dx_load_broadcast
%v.i.i184335 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask.i.i183334)
%cmp.i.i185336 = icmp eq i32 %v.i.i184335, 0
br i1 %cmp.i.i185336, label %mandel___vyfvyfvyi.exit219, label %for_loop.i207
for_step.i192: ; preds = %not_all_continued_or_breaked.i218, %for_loop.i207
%z_re.1.i187 = phi <8 x float> [ %z_re.0.i176338, %for_loop.i207 ], [ %add_c_re_load42_new_re_load.i216, %not_all_continued_or_breaked.i218 ]
%z_im.1.i188 = phi <8 x float> [ %z_im.0.i177339, %for_loop.i207 ], [ %add_c_im_load44_new_im_load.i217, %not_all_continued_or_breaked.i218 ]
%internal_mask_memory.1.i189 = phi <8 x i32> [ zeroinitializer, %for_loop.i207 ], [ %new_mask28.i210, %not_all_continued_or_breaked.i218 ]
%i_load53_plus1.i191 = add <8 x i32> %blendAsInt.i328337, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%mask_as_float.i = bitcast <8 x i32> %internal_mask_memory.1.i189 to <8 x float>
%oldAsFloat.i = bitcast <8 x i32> %blendAsInt.i328337 to <8 x float>
%newAsFloat.i = bitcast <8 x i32> %i_load53_plus1.i191 to <8 x float>
%blend.i = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %oldAsFloat.i, <8 x float> %newAsFloat.i, <8 x float> %mask_as_float.i)
%blendAsInt.i = bitcast <8 x float> %blend.i to <8 x i32>
%less_i_load_count_load.i181 = icmp slt <8 x i32> %blendAsInt.i, %maxIterations_load_broadcast
%"oldMask&test.i182" = select <8 x i1> %less_i_load_count_load.i181, <8 x i32> %internal_mask_memory.1.i189, <8 x i32> zeroinitializer
%floatmask.i.i183 = bitcast <8 x i32> %"oldMask&test.i182" to <8 x float>
%v.i.i184 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask.i.i183)
%cmp.i.i185 = icmp eq i32 %v.i.i184, 0
br i1 %cmp.i.i185, label %mandel___vyfvyfvyi.exit219, label %for_loop.i207
for_loop.i207: ; preds = %for_step.i192, %for_loop36
%v.i.i184342 = phi i32 [ %v.i.i184, %for_step.i192 ], [ %v.i.i184335, %for_loop36 ]
%"oldMask&test.i182341" = phi <8 x i32> [ %"oldMask&test.i182", %for_step.i192 ], [ %"oldMask&test.i182333", %for_loop36 ]
%break_lanes_memory.0.i178340 = phi <8 x i32> [ %"mask|break_mask.i198", %for_step.i192 ], [ zeroinitializer, %for_loop36 ]
%z_im.0.i177339 = phi <8 x float> [ %z_im.1.i188, %for_step.i192 ], [ %add_y0_load_mul_yi_load43_to_float_dy_load_broadcast, %for_loop36 ]
%z_re.0.i176338 = phi <8 x float> [ %z_re.1.i187, %for_step.i192 ], [ %add_x0_load_broadcast_mul_add_xi_load42_broadcast__to_float_dx_load_broadcast, %for_loop36 ]
%blendAsInt.i328337 = phi <8 x i32> [ %blendAsInt.i, %for_step.i192 ], [ zeroinitializer, %for_loop36 ]
%mul_z_re_load_z_re_load13.i193 = fmul <8 x float> %z_re.0.i176338, %z_re.0.i176338
%mul_z_im_load_z_im_load14.i194 = fmul <8 x float> %z_im.0.i177339, %z_im.0.i177339
%add_mul_z_re_load_z_re_load13_mul_z_im_load_z_im_load14.i195 = fadd <8 x float> %mul_z_re_load_z_re_load13.i193, %mul_z_im_load_z_im_load14.i194
%greater_add_mul_z_re_load_z_re_load13_mul_z_im_load_z_im_load14_.i196 = fcmp ugt <8 x float> %add_mul_z_re_load_z_re_load13_mul_z_im_load_z_im_load14.i195, <float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00>
%"oldMask&test16.i197" = select <8 x i1> %greater_add_mul_z_re_load_z_re_load13_mul_z_im_load_z_im_load14_.i196, <8 x i32> %"oldMask&test.i182341", <8 x i32> zeroinitializer
%"mask|break_mask.i198" = or <8 x i32> %"oldMask&test16.i197", %break_lanes_memory.0.i178340
%floatmask.i67.i200 = bitcast <8 x i32> %"mask|break_mask.i198" to <8 x float>
%v.i68.i201 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask.i67.i200)
%"equal_finished&func_internal_mask&function_mask12.i206" = icmp eq i32 %v.i68.i201, %v.i.i184342
br i1 %"equal_finished&func_internal_mask&function_mask12.i206", label %for_step.i192, label %not_all_continued_or_breaked.i218
not_all_continued_or_breaked.i218: ; preds = %for_loop.i207
%"!(break|continue)_lanes.i209" = xor <8 x i32> %"mask|break_mask.i198", <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
%new_mask28.i210 = and <8 x i32> %"oldMask&test.i182341", %"!(break|continue)_lanes.i209"
%sub_mul_z_re_load31_z_re_load32_mul_z_im_load33_z_im_load34.i213 = fsub <8 x float> %mul_z_re_load_z_re_load13.i193, %mul_z_im_load_z_im_load14.i194
%mul__z_re_load35.i214 = fmul <8 x float> %z_re.0.i176338, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
%mul_mul__z_re_load35_z_im_load36.i215 = fmul <8 x float> %mul__z_re_load35.i214, %z_im.0.i177339
%add_c_re_load42_new_re_load.i216 = fadd <8 x float> %add_x0_load_broadcast_mul_add_xi_load42_broadcast__to_float_dx_load_broadcast, %sub_mul_z_re_load31_z_re_load32_mul_z_im_load33_z_im_load34.i213
%add_c_im_load44_new_im_load.i217 = fadd <8 x float> %add_y0_load_mul_yi_load43_to_float_dy_load_broadcast, %mul_mul__z_re_load35_z_im_load36.i215
br label %for_step.i192
mandel___vyfvyfvyi.exit219: ; preds = %for_step.i192, %for_loop36
%blendAsInt.i328.lcssa = phi <8 x i32> [ zeroinitializer, %for_loop36 ], [ %blendAsInt.i, %for_step.i192 ]
%less_add_xi_load50_broadcast__xend_load51_broadcast = icmp slt <8 x i32> %add_xi_load42_broadcast_, %xend_load51_broadcast
%floatmask.i172 = select <8 x i1> %less_add_xi_load50_broadcast__xend_load51_broadcast, <8 x float> <float 0xFFFFFFFFE0000000, float 0xFFFFFFFFE0000000, float 0xFFFFFFFFE0000000, float 0xFFFFFFFFE0000000, float 0xFFFFFFFFE0000000, float 0xFFFFFFFFE0000000, float 0xFFFFFFFFE0000000, float 0xFFFFFFFFE0000000>, <8 x float> zeroinitializer
%v.i173 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask.i172)
%cmp.i174 = icmp eq i32 %v.i173, 0
br i1 %cmp.i174, label %safe_if_after_true, label %safe_if_run_true
for_exit37: ; preds = %safe_if_after_true, %for_test34.preheader
%yi_load69_plus1 = add i32 %yi.0346, 1
%exitcond = icmp eq i32 %yi_load69_plus1, %ret.i.i223
br i1 %exitcond, label %for_exit, label %for_test34.preheader
safe_if_after_true: ; preds = %pl_dolane.7.i326, %pl_loopend.6.i318, %mandel___vyfvyfvyi.exit219
%add_xi_load68_ = add i32 %xi.0344, 8
%less_xi_load_xend_load = icmp slt i32 %add_xi_load68_, %ret.i.i
br i1 %less_xi_load_xend_load, label %for_loop36, label %for_exit37
safe_if_run_true: ; preds = %mandel___vyfvyfvyi.exit219
%add_mul_yi_load47_width_load48_broadcast_xi_load49_broadcast = add <8 x i32> %mul_yi_load47_width_load48_broadcast, %xi_load42_broadcast
%v.i.i239 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask.i172)
%v64.i.i240 = zext i32 %v.i.i239 to i64
%pl_and.i241 = and i64 %v64.i.i240, 1
%pl_doit.i242 = icmp eq i64 %pl_and.i241, 0
br i1 %pl_doit.i242, label %pl_loopend.i252, label %pl_dolane.i249
pl_dolane.i249: ; preds = %safe_if_run_true
%offset32.i.i243 = extractelement <8 x i32> %add_mul_yi_load47_width_load48_broadcast_xi_load49_broadcast, i32 0
%offset64.i.i244 = sext i32 %offset32.i.i243 to i64
%finalptr.i.i246331 = getelementptr i32* %output20, i64 %offset64.i.i244
%storeval.i.i248 = extractelement <8 x i32> %blendAsInt.i328.lcssa, i32 0
store i32 %storeval.i.i248, i32* %finalptr.i.i246331, align 4
br label %pl_loopend.i252
pl_loopend.i252: ; preds = %pl_dolane.i249, %safe_if_run_true
%pl_and.1.i250 = and i64 %v64.i.i240, 2
%pl_doit.1.i251 = icmp eq i64 %pl_and.1.i250, 0
br i1 %pl_doit.1.i251, label %pl_loopend.1.i263, label %pl_dolane.1.i260
pl_dolane.1.i260: ; preds = %pl_loopend.i252
%offset32.i.1.i253 = extractelement <8 x i32> %add_mul_yi_load47_width_load48_broadcast_xi_load49_broadcast, i32 1
%offset64.i.1.i254 = sext i32 %offset32.i.1.i253 to i64
%offset.i.1.i255 = shl nsw i64 %offset64.i.1.i254, 2
%ptroffset.sum.i.1.i256 = add i64 %offset.i.1.i255, 8
%finalptr.i.1.i257 = getelementptr i8* %output_load_ptr2int_2void, i64 %ptroffset.sum.i.1.i256
%ptrcast.i.1.i258 = bitcast i8* %finalptr.i.1.i257 to i32*
%storeval.i.1.i259 = extractelement <8 x i32> %blendAsInt.i328.lcssa, i32 1
store i32 %storeval.i.1.i259, i32* %ptrcast.i.1.i258, align 4
br label %pl_loopend.1.i263
pl_loopend.1.i263: ; preds = %pl_dolane.1.i260, %pl_loopend.i252
%pl_and.2.i261 = and i64 %v64.i.i240, 4
%pl_doit.2.i262 = icmp eq i64 %pl_and.2.i261, 0
br i1 %pl_doit.2.i262, label %pl_loopend.2.i274, label %pl_dolane.2.i271
pl_dolane.2.i271: ; preds = %pl_loopend.1.i263
%offset32.i.2.i264 = extractelement <8 x i32> %add_mul_yi_load47_width_load48_broadcast_xi_load49_broadcast, i32 2
%offset64.i.2.i265 = sext i32 %offset32.i.2.i264 to i64
%offset.i.2.i266 = shl nsw i64 %offset64.i.2.i265, 2
%ptroffset.sum.i.2.i267 = add i64 %offset.i.2.i266, 16
%finalptr.i.2.i268 = getelementptr i8* %output_load_ptr2int_2void, i64 %ptroffset.sum.i.2.i267
%ptrcast.i.2.i269 = bitcast i8* %finalptr.i.2.i268 to i32*
%storeval.i.2.i270 = extractelement <8 x i32> %blendAsInt.i328.lcssa, i32 2
store i32 %storeval.i.2.i270, i32* %ptrcast.i.2.i269, align 4
br label %pl_loopend.2.i274
pl_loopend.2.i274: ; preds = %pl_dolane.2.i271, %pl_loopend.1.i263
%pl_and.3.i272 = and i64 %v64.i.i240, 8
%pl_doit.3.i273 = icmp eq i64 %pl_and.3.i272, 0
br i1 %pl_doit.3.i273, label %pl_loopend.3.i285, label %pl_dolane.3.i282
pl_dolane.3.i282: ; preds = %pl_loopend.2.i274
%offset32.i.3.i275 = extractelement <8 x i32> %add_mul_yi_load47_width_load48_broadcast_xi_load49_broadcast, i32 3
%offset64.i.3.i276 = sext i32 %offset32.i.3.i275 to i64
%offset.i.3.i277 = shl nsw i64 %offset64.i.3.i276, 2
%ptroffset.sum.i.3.i278 = add i64 %offset.i.3.i277, 24
%finalptr.i.3.i279 = getelementptr i8* %output_load_ptr2int_2void, i64 %ptroffset.sum.i.3.i278
%ptrcast.i.3.i280 = bitcast i8* %finalptr.i.3.i279 to i32*
%storeval.i.3.i281 = extractelement <8 x i32> %blendAsInt.i328.lcssa, i32 3
store i32 %storeval.i.3.i281, i32* %ptrcast.i.3.i280, align 4
br label %pl_loopend.3.i285
pl_loopend.3.i285: ; preds = %pl_dolane.3.i282, %pl_loopend.2.i274
%pl_and.4.i283 = and i64 %v64.i.i240, 16
%pl_doit.4.i284 = icmp eq i64 %pl_and.4.i283, 0
br i1 %pl_doit.4.i284, label %pl_loopend.4.i296, label %pl_dolane.4.i293
pl_dolane.4.i293: ; preds = %pl_loopend.3.i285
%offset32.i.4.i286 = extractelement <8 x i32> %add_mul_yi_load47_width_load48_broadcast_xi_load49_broadcast, i32 4
%offset64.i.4.i287 = sext i32 %offset32.i.4.i286 to i64
%offset.i.4.i288 = shl nsw i64 %offset64.i.4.i287, 2
%ptroffset.sum.i.4.i289 = add i64 %offset.i.4.i288, 32
%finalptr.i.4.i290 = getelementptr i8* %output_load_ptr2int_2void, i64 %ptroffset.sum.i.4.i289
%ptrcast.i.4.i291 = bitcast i8* %finalptr.i.4.i290 to i32*
%storeval.i.4.i292 = extractelement <8 x i32> %blendAsInt.i328.lcssa, i32 4
store i32 %storeval.i.4.i292, i32* %ptrcast.i.4.i291, align 4
br label %pl_loopend.4.i296
pl_loopend.4.i296: ; preds = %pl_dolane.4.i293, %pl_loopend.3.i285
%pl_and.5.i294 = and i64 %v64.i.i240, 32
%pl_doit.5.i295 = icmp eq i64 %pl_and.5.i294, 0
br i1 %pl_doit.5.i295, label %pl_loopend.5.i307, label %pl_dolane.5.i304
pl_dolane.5.i304: ; preds = %pl_loopend.4.i296
%offset32.i.5.i297 = extractelement <8 x i32> %add_mul_yi_load47_width_load48_broadcast_xi_load49_broadcast, i32 5
%offset64.i.5.i298 = sext i32 %offset32.i.5.i297 to i64
%offset.i.5.i299 = shl nsw i64 %offset64.i.5.i298, 2
%ptroffset.sum.i.5.i300 = add i64 %offset.i.5.i299, 40
%finalptr.i.5.i301 = getelementptr i8* %output_load_ptr2int_2void, i64 %ptroffset.sum.i.5.i300
%ptrcast.i.5.i302 = bitcast i8* %finalptr.i.5.i301 to i32*
%storeval.i.5.i303 = extractelement <8 x i32> %blendAsInt.i328.lcssa, i32 5
store i32 %storeval.i.5.i303, i32* %ptrcast.i.5.i302, align 4
br label %pl_loopend.5.i307
pl_loopend.5.i307: ; preds = %pl_dolane.5.i304, %pl_loopend.4.i296
%pl_and.6.i305 = and i64 %v64.i.i240, 64
%pl_doit.6.i306 = icmp eq i64 %pl_and.6.i305, 0
br i1 %pl_doit.6.i306, label %pl_loopend.6.i318, label %pl_dolane.6.i315
pl_dolane.6.i315: ; preds = %pl_loopend.5.i307
%offset32.i.6.i308 = extractelement <8 x i32> %add_mul_yi_load47_width_load48_broadcast_xi_load49_broadcast, i32 6
%offset64.i.6.i309 = sext i32 %offset32.i.6.i308 to i64
%offset.i.6.i310 = shl nsw i64 %offset64.i.6.i309, 2
%ptroffset.sum.i.6.i311 = add i64 %offset.i.6.i310, 48
%finalptr.i.6.i312 = getelementptr i8* %output_load_ptr2int_2void, i64 %ptroffset.sum.i.6.i311
%ptrcast.i.6.i313 = bitcast i8* %finalptr.i.6.i312 to i32*
%storeval.i.6.i314 = extractelement <8 x i32> %blendAsInt.i328.lcssa, i32 6
store i32 %storeval.i.6.i314, i32* %ptrcast.i.6.i313, align 4
br label %pl_loopend.6.i318
pl_loopend.6.i318: ; preds = %pl_dolane.6.i315, %pl_loopend.5.i307
%pl_and.7.i316 = and i64 %v64.i.i240, 128
%pl_doit.7.i317 = icmp eq i64 %pl_and.7.i316, 0
br i1 %pl_doit.7.i317, label %safe_if_after_true, label %pl_dolane.7.i326
pl_dolane.7.i326: ; preds = %pl_loopend.6.i318
%offset32.i.7.i319 = extractelement <8 x i32> %add_mul_yi_load47_width_load48_broadcast_xi_load49_broadcast, i32 7
%offset64.i.7.i320 = sext i32 %offset32.i.7.i319 to i64
%offset.i.7.i321 = shl nsw i64 %offset64.i.7.i320, 2
%ptroffset.sum.i.7.i322 = add i64 %offset.i.7.i321, 56
%finalptr.i.7.i323 = getelementptr i8* %output_load_ptr2int_2void, i64 %ptroffset.sum.i.7.i322
%ptrcast.i.7.i324 = bitcast i8* %finalptr.i.7.i323 to i32*
%storeval.i.7.i325 = extractelement <8 x i32> %blendAsInt.i328.lcssa, i32 7
store i32 %storeval.i.7.i325, i32* %ptrcast.i.7.i324, align 4
br label %safe_if_after_true
for_test103.preheader: ; preds = %for_exit106, %for_test103.preheader.lr.ph
%yi97.0364 = phi i32 [ %mul_taskIndex1_load_yspan_load, %for_test103.preheader.lr.ph ], [ %yi_load164_plus1, %for_exit106 ]
br i1 %less_xi_load110_xend_load111360, label %for_loop105.lr.ph, label %for_exit106
for_loop105.lr.ph: ; preds = %for_test103.preheader
%yi_load120_to_float = sitofp i32 %yi97.0364 to float
%mul_yi_load120_to_float_dy_load121 = fmul float %dy8, %yi_load120_to_float
%add_y0_load119_mul_yi_load120_to_float_dy_load121 = fadd float %y06, %mul_yi_load120_to_float_dy_load121
%add_y0_load119_mul_yi_load120_to_float_dy_load121_broadcast_init = insertelement <8 x float> undef, float %add_y0_load119_mul_yi_load120_to_float_dy_load121, i32 0
%add_y0_load119_mul_yi_load120_to_float_dy_load121_broadcast = shufflevector <8 x float> %add_y0_load119_mul_yi_load120_to_float_dy_load121_broadcast_init, <8 x float> undef, <8 x i32> zeroinitializer
%mul_yi_load130_width_load131 = mul i32 %yi97.0364, %width10
%mul_yi_load130_width_load131_broadcast_init = insertelement <8 x i32> undef, i32 %mul_yi_load130_width_load131, i32 0
%mul_yi_load130_width_load131_broadcast = shufflevector <8 x i32> %mul_yi_load130_width_load131_broadcast_init, <8 x i32> undef, <8 x i32> zeroinitializer
br label %for_loop105
for_loop105: ; preds = %safe_if_after_true137, %for_loop105.lr.ph
%xi108.0361 = phi i32 [ %mul_taskIndex0_load_xspan_load, %for_loop105.lr.ph ], [ %add_xi108_load_, %safe_if_after_true137 ]
%xi_load116_broadcast_init = insertelement <8 x i32> undef, i32 %xi108.0361, i32 0
%xi_load116_broadcast = shufflevector <8 x i32> %xi_load116_broadcast_init, <8 x i32> undef, <8 x i32> zeroinitializer
%add_xi_load116_broadcast_ = add <8 x i32> %xi_load116_broadcast, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%add_xi_load116_broadcast__to_float = sitofp <8 x i32> %add_xi_load116_broadcast_ to <8 x float>
%mul_add_xi_load116_broadcast__to_float_dx_load117_broadcast = fmul <8 x float> %dx_load117_broadcast, %add_xi_load116_broadcast__to_float
%add_x0_load115_broadcast_mul_add_xi_load116_broadcast__to_float_dx_load117_broadcast = fadd <8 x float> %x0_load115_broadcast, %mul_add_xi_load116_broadcast__to_float_dx_load117_broadcast
%v.i.i351 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask.i.i350)
%cmp.i.i352 = icmp eq i32 %v.i.i351, 0
br i1 %cmp.i.i352, label %mandel___vyfvyfvyi.exit, label %for_loop.i
for_step.i: ; preds = %not_all_continued_or_breaked.i, %for_loop.i
%z_re.1.i = phi <8 x float> [ %z_re.0.i354, %for_loop.i ], [ %add_c_re_load42_new_re_load.i, %not_all_continued_or_breaked.i ]
%z_im.1.i = phi <8 x float> [ %z_im.0.i355, %for_loop.i ], [ %add_c_im_load44_new_im_load.i, %not_all_continued_or_breaked.i ]
%internal_mask_memory.1.i = phi <8 x i32> [ zeroinitializer, %for_loop.i ], [ %new_mask28.i, %not_all_continued_or_breaked.i ]
%i_load53_plus1.i = add <8 x i32> %blendAsInt.i237329353, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%mask_as_float.i232 = bitcast <8 x i32> %internal_mask_memory.1.i to <8 x float>
%oldAsFloat.i234 = bitcast <8 x i32> %blendAsInt.i237329353 to <8 x float>
%newAsFloat.i235 = bitcast <8 x i32> %i_load53_plus1.i to <8 x float>
%blend.i236 = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %oldAsFloat.i234, <8 x float> %newAsFloat.i235, <8 x float> %mask_as_float.i232)
%blendAsInt.i237 = bitcast <8 x float> %blend.i236 to <8 x i32>
%less_i_load_count_load.i = icmp slt <8 x i32> %blendAsInt.i237, %maxIterations_load125_broadcast
%"oldMask&test.i" = select <8 x i1> %less_i_load_count_load.i, <8 x i32> %internal_mask_memory.1.i, <8 x i32> zeroinitializer
%"internal_mask&function_mask10.i" = and <8 x i32> %"oldMask&test.i", %mask
%floatmask.i.i = bitcast <8 x i32> %"internal_mask&function_mask10.i" to <8 x float>
%v.i.i = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask.i.i)
%cmp.i.i = icmp eq i32 %v.i.i, 0
br i1 %cmp.i.i, label %mandel___vyfvyfvyi.exit, label %for_loop.i
for_loop.i: ; preds = %for_step.i, %for_loop105
%v.i.i358 = phi i32 [ %v.i.i, %for_step.i ], [ %v.i.i351, %for_loop105 ]
%"oldMask&test.i357" = phi <8 x i32> [ %"oldMask&test.i", %for_step.i ], [ %"oldMask&test.i348", %for_loop105 ]
%break_lanes_memory.0.i356 = phi <8 x i32> [ %"mask|break_mask.i", %for_step.i ], [ zeroinitializer, %for_loop105 ]
%z_im.0.i355 = phi <8 x float> [ %z_im.1.i, %for_step.i ], [ %add_y0_load119_mul_yi_load120_to_float_dy_load121_broadcast, %for_loop105 ]
%z_re.0.i354 = phi <8 x float> [ %z_re.1.i, %for_step.i ], [ %add_x0_load115_broadcast_mul_add_xi_load116_broadcast__to_float_dx_load117_broadcast, %for_loop105 ]
%blendAsInt.i237329353 = phi <8 x i32> [ %blendAsInt.i237, %for_step.i ], [ zeroinitializer, %for_loop105 ]
%mul_z_re_load_z_re_load13.i = fmul <8 x float> %z_re.0.i354, %z_re.0.i354
%mul_z_im_load_z_im_load14.i = fmul <8 x float> %z_im.0.i355, %z_im.0.i355
%add_mul_z_re_load_z_re_load13_mul_z_im_load_z_im_load14.i = fadd <8 x float> %mul_z_re_load_z_re_load13.i, %mul_z_im_load_z_im_load14.i
%greater_add_mul_z_re_load_z_re_load13_mul_z_im_load_z_im_load14_.i = fcmp ugt <8 x float> %add_mul_z_re_load_z_re_load13_mul_z_im_load_z_im_load14.i, <float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00>
%"oldMask&test16.i" = select <8 x i1> %greater_add_mul_z_re_load_z_re_load13_mul_z_im_load_z_im_load14_.i, <8 x i32> %"oldMask&test.i357", <8 x i32> zeroinitializer
%"mask|break_mask.i" = or <8 x i32> %"oldMask&test16.i", %break_lanes_memory.0.i356
%"finished&func.i" = and <8 x i32> %"mask|break_mask.i", %mask
%floatmask.i67.i = bitcast <8 x i32> %"finished&func.i" to <8 x float>
%v.i68.i = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask.i67.i)
%"equal_finished&func_internal_mask&function_mask12.i" = icmp eq i32 %v.i68.i, %v.i.i358
br i1 %"equal_finished&func_internal_mask&function_mask12.i", label %for_step.i, label %not_all_continued_or_breaked.i
not_all_continued_or_breaked.i: ; preds = %for_loop.i
%"!(break|continue)_lanes.i" = xor <8 x i32> %"mask|break_mask.i", <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
%new_mask28.i = and <8 x i32> %"oldMask&test.i357", %"!(break|continue)_lanes.i"
%sub_mul_z_re_load31_z_re_load32_mul_z_im_load33_z_im_load34.i = fsub <8 x float> %mul_z_re_load_z_re_load13.i, %mul_z_im_load_z_im_load14.i
%mul__z_re_load35.i = fmul <8 x float> %z_re.0.i354, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
%mul_mul__z_re_load35_z_im_load36.i = fmul <8 x float> %mul__z_re_load35.i, %z_im.0.i355
%add_c_re_load42_new_re_load.i = fadd <8 x float> %add_x0_load115_broadcast_mul_add_xi_load116_broadcast__to_float_dx_load117_broadcast, %sub_mul_z_re_load31_z_re_load32_mul_z_im_load33_z_im_load34.i
%add_c_im_load44_new_im_load.i = fadd <8 x float> %add_y0_load119_mul_yi_load120_to_float_dy_load121_broadcast, %mul_mul__z_re_load35_z_im_load36.i
br label %for_step.i
mandel___vyfvyfvyi.exit: ; preds = %for_step.i, %for_loop105
%blendAsInt.i237329.lcssa = phi <8 x i32> [ zeroinitializer, %for_loop105 ], [ %blendAsInt.i237, %for_step.i ]
%less_add_xi_load133_broadcast__xend_load134_broadcast = icmp slt <8 x i32> %add_xi_load116_broadcast_, %xend_load134_broadcast
%"oldMask&test139" = select <8 x i1> %less_add_xi_load133_broadcast__xend_load134_broadcast, <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <8 x i32> zeroinitializer
%"internal_mask&function_mask143" = and <8 x i32> %"oldMask&test139", %mask
%floatmask.i169 = bitcast <8 x i32> %"internal_mask&function_mask143" to <8 x float>
%v.i170 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask.i169)
%cmp.i171 = icmp eq i32 %v.i170, 0
br i1 %cmp.i171, label %safe_if_after_true137, label %safe_if_run_true138
for_exit106: ; preds = %safe_if_after_true137, %for_test103.preheader
%yi_load164_plus1 = add i32 %yi97.0364, 1
%exitcond365 = icmp eq i32 %yi_load164_plus1, %ret.i.i223
br i1 %exitcond365, label %for_exit, label %for_test103.preheader
safe_if_after_true137: ; preds = %pl_dolane.7.i, %pl_loopend.6.i, %mandel___vyfvyfvyi.exit
%add_xi108_load_ = add i32 %xi108.0361, 8
%less_xi_load110_xend_load111 = icmp slt i32 %add_xi108_load_, %ret.i.i
br i1 %less_xi_load110_xend_load111, label %for_loop105, label %for_exit106
safe_if_run_true138: ; preds = %mandel___vyfvyfvyi.exit
%add_mul_yi_load130_width_load131_broadcast_xi_load132_broadcast = add <8 x i32> %mul_yi_load130_width_load131_broadcast, %xi_load116_broadcast
%v.i.i231 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask.i169)
%v64.i.i = zext i32 %v.i.i231 to i64
%pl_and.i = and i64 %v64.i.i, 1
%pl_doit.i = icmp eq i64 %pl_and.i, 0
br i1 %pl_doit.i, label %pl_loopend.i, label %pl_dolane.i
pl_dolane.i: ; preds = %safe_if_run_true138
%offset32.i.i = extractelement <8 x i32> %add_mul_yi_load130_width_load131_broadcast_xi_load132_broadcast, i32 0
%offset64.i.i = sext i32 %offset32.i.i to i64
%finalptr.i.i330 = getelementptr i32* %output20, i64 %offset64.i.i
%storeval.i.i = extractelement <8 x i32> %blendAsInt.i237329.lcssa, i32 0
store i32 %storeval.i.i, i32* %finalptr.i.i330, align 4
br label %pl_loopend.i
pl_loopend.i: ; preds = %pl_dolane.i, %safe_if_run_true138
%pl_and.1.i = and i64 %v64.i.i, 2
%pl_doit.1.i = icmp eq i64 %pl_and.1.i, 0
br i1 %pl_doit.1.i, label %pl_loopend.1.i, label %pl_dolane.1.i
pl_dolane.1.i: ; preds = %pl_loopend.i
%offset32.i.1.i = extractelement <8 x i32> %add_mul_yi_load130_width_load131_broadcast_xi_load132_broadcast, i32 1
%offset64.i.1.i = sext i32 %offset32.i.1.i to i64
%offset.i.1.i = shl nsw i64 %offset64.i.1.i, 2
%ptroffset.sum.i.1.i = add i64 %offset.i.1.i, 8
%finalptr.i.1.i = getelementptr i8* %output_load145_ptr2int_2void, i64 %ptroffset.sum.i.1.i
%ptrcast.i.1.i = bitcast i8* %finalptr.i.1.i to i32*
%storeval.i.1.i = extractelement <8 x i32> %blendAsInt.i237329.lcssa, i32 1
store i32 %storeval.i.1.i, i32* %ptrcast.i.1.i, align 4
br label %pl_loopend.1.i
pl_loopend.1.i: ; preds = %pl_dolane.1.i, %pl_loopend.i
%pl_and.2.i = and i64 %v64.i.i, 4
%pl_doit.2.i = icmp eq i64 %pl_and.2.i, 0
br i1 %pl_doit.2.i, label %pl_loopend.2.i, label %pl_dolane.2.i
pl_dolane.2.i: ; preds = %pl_loopend.1.i
%offset32.i.2.i = extractelement <8 x i32> %add_mul_yi_load130_width_load131_broadcast_xi_load132_broadcast, i32 2
%offset64.i.2.i = sext i32 %offset32.i.2.i to i64
%offset.i.2.i = shl nsw i64 %offset64.i.2.i, 2
%ptroffset.sum.i.2.i = add i64 %offset.i.2.i, 16
%finalptr.i.2.i = getelementptr i8* %output_load145_ptr2int_2void, i64 %ptroffset.sum.i.2.i
%ptrcast.i.2.i = bitcast i8* %finalptr.i.2.i to i32*
%storeval.i.2.i = extractelement <8 x i32> %blendAsInt.i237329.lcssa, i32 2
store i32 %storeval.i.2.i, i32* %ptrcast.i.2.i, align 4
br label %pl_loopend.2.i
pl_loopend.2.i: ; preds = %pl_dolane.2.i, %pl_loopend.1.i
%pl_and.3.i = and i64 %v64.i.i, 8
%pl_doit.3.i = icmp eq i64 %pl_and.3.i, 0
br i1 %pl_doit.3.i, label %pl_loopend.3.i, label %pl_dolane.3.i
pl_dolane.3.i: ; preds = %pl_loopend.2.i
%offset32.i.3.i = extractelement <8 x i32> %add_mul_yi_load130_width_load131_broadcast_xi_load132_broadcast, i32 3
%offset64.i.3.i = sext i32 %offset32.i.3.i to i64
%offset.i.3.i = shl nsw i64 %offset64.i.3.i, 2
%ptroffset.sum.i.3.i = add i64 %offset.i.3.i, 24
%finalptr.i.3.i = getelementptr i8* %output_load145_ptr2int_2void, i64 %ptroffset.sum.i.3.i
%ptrcast.i.3.i = bitcast i8* %finalptr.i.3.i to i32*
%storeval.i.3.i = extractelement <8 x i32> %blendAsInt.i237329.lcssa, i32 3
store i32 %storeval.i.3.i, i32* %ptrcast.i.3.i, align 4
br label %pl_loopend.3.i
pl_loopend.3.i: ; preds = %pl_dolane.3.i, %pl_loopend.2.i
%pl_and.4.i = and i64 %v64.i.i, 16
%pl_doit.4.i = icmp eq i64 %pl_and.4.i, 0
br i1 %pl_doit.4.i, label %pl_loopend.4.i, label %pl_dolane.4.i
pl_dolane.4.i: ; preds = %pl_loopend.3.i
%offset32.i.4.i = extractelement <8 x i32> %add_mul_yi_load130_width_load131_broadcast_xi_load132_broadcast, i32 4
%offset64.i.4.i = sext i32 %offset32.i.4.i to i64
%offset.i.4.i = shl nsw i64 %offset64.i.4.i, 2
%ptroffset.sum.i.4.i = add i64 %offset.i.4.i, 32
%finalptr.i.4.i = getelementptr i8* %output_load145_ptr2int_2void, i64 %ptroffset.sum.i.4.i
%ptrcast.i.4.i = bitcast i8* %finalptr.i.4.i to i32*
%storeval.i.4.i = extractelement <8 x i32> %blendAsInt.i237329.lcssa, i32 4
store i32 %storeval.i.4.i, i32* %ptrcast.i.4.i, align 4
br label %pl_loopend.4.i
pl_loopend.4.i: ; preds = %pl_dolane.4.i, %pl_loopend.3.i
%pl_and.5.i = and i64 %v64.i.i, 32
%pl_doit.5.i = icmp eq i64 %pl_and.5.i, 0
br i1 %pl_doit.5.i, label %pl_loopend.5.i, label %pl_dolane.5.i
pl_dolane.5.i: ; preds = %pl_loopend.4.i
%offset32.i.5.i = extractelement <8 x i32> %add_mul_yi_load130_width_load131_broadcast_xi_load132_broadcast, i32 5
%offset64.i.5.i = sext i32 %offset32.i.5.i to i64
%offset.i.5.i = shl nsw i64 %offset64.i.5.i, 2
%ptroffset.sum.i.5.i = add i64 %offset.i.5.i, 40
%finalptr.i.5.i = getelementptr i8* %output_load145_ptr2int_2void, i64 %ptroffset.sum.i.5.i
%ptrcast.i.5.i = bitcast i8* %finalptr.i.5.i to i32*
%storeval.i.5.i = extractelement <8 x i32> %blendAsInt.i237329.lcssa, i32 5
store i32 %storeval.i.5.i, i32* %ptrcast.i.5.i, align 4
br label %pl_loopend.5.i
pl_loopend.5.i: ; preds = %pl_dolane.5.i, %pl_loopend.4.i
%pl_and.6.i = and i64 %v64.i.i, 64
%pl_doit.6.i = icmp eq i64 %pl_and.6.i, 0
br i1 %pl_doit.6.i, label %pl_loopend.6.i, label %pl_dolane.6.i
pl_dolane.6.i: ; preds = %pl_loopend.5.i
%offset32.i.6.i = extractelement <8 x i32> %add_mul_yi_load130_width_load131_broadcast_xi_load132_broadcast, i32 6
%offset64.i.6.i = sext i32 %offset32.i.6.i to i64
%offset.i.6.i = shl nsw i64 %offset64.i.6.i, 2
%ptroffset.sum.i.6.i = add i64 %offset.i.6.i, 48
%finalptr.i.6.i = getelementptr i8* %output_load145_ptr2int_2void, i64 %ptroffset.sum.i.6.i
%ptrcast.i.6.i = bitcast i8* %finalptr.i.6.i to i32*
%storeval.i.6.i = extractelement <8 x i32> %blendAsInt.i237329.lcssa, i32 6
store i32 %storeval.i.6.i, i32* %ptrcast.i.6.i, align 4
br label %pl_loopend.6.i
pl_loopend.6.i: ; preds = %pl_dolane.6.i, %pl_loopend.5.i
%pl_and.7.i = and i64 %v64.i.i, 128
%pl_doit.7.i = icmp eq i64 %pl_and.7.i, 0
br i1 %pl_doit.7.i, label %safe_if_after_true137, label %pl_dolane.7.i
pl_dolane.7.i: ; preds = %pl_loopend.6.i
%offset32.i.7.i = extractelement <8 x i32> %add_mul_yi_load130_width_load131_broadcast_xi_load132_broadcast, i32 7
%offset64.i.7.i = sext i32 %offset32.i.7.i to i64
%offset.i.7.i = shl nsw i64 %offset64.i.7.i, 2
%ptroffset.sum.i.7.i = add i64 %offset.i.7.i, 56
%finalptr.i.7.i = getelementptr i8* %output_load145_ptr2int_2void, i64 %ptroffset.sum.i.7.i
%ptrcast.i.7.i = bitcast i8* %finalptr.i.7.i to i32*
%storeval.i.7.i = extractelement <8 x i32> %blendAsInt.i237329.lcssa, i32 7
store i32 %storeval.i.7.i, i32* %ptrcast.i.7.i, align 4
br label %safe_if_after_true137
}
; Masked entry point of the Mandelbrot launcher (takes the caller's execution
; mask in %__mask).
;
; Work done here:
;   1. Computes the per-pixel steps (x1 - x0) / width and (y1 - y0) / height.
;   2. Packs a 96-byte, 32-byte-aligned argument block obtained from ISPCAlloc.
;      Byte offsets written below:
;         0  x0                      4  (x1 - x0) / width
;         8  y0                     12  (y1 - y0) / height
;        16  width                  20  height
;        24  constant 16            28  constant 16   (tile extents)
;        32  maxIterations          40  output pointer
;        64  <8 x i32> execution mask
;   3. Hands the block to ISPCLaunch together with the tile counts in x and y
;      and a third count of 1, then calls ISPCSync if a launch-group handle
;      was produced.
;
; The tile counts are rounded UP: a plain `sdiv %width, 16` drops the trailing
; partial tile whenever the extent is not a multiple of 16, leaving those
; pixels unwritten. Rounding up relies on the launched scanline task clamping
; its own x/y ranges to width/height. The quotient-plus-remainder form is used
; instead of (n + 15) / 16 so that no intermediate add can overflow, and
; non-positive extents produce exactly the same count as before.
define void @mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E_(float %x0, float %y0, float %x1, float %y1, i32 %width, i32 %height, i32 %maxIterations, i32* %output, <8 x i32> %__mask) {
allocas:
  %launch_group_handle = alloca i8*, align 8
  store i8* null, i8** %launch_group_handle, align 8
  ; %cmp.i is true when the sign bit of every one of the eight mask lanes is set.
  %floatmask.i = bitcast <8 x i32> %__mask to <8 x float>
  %v.i = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask.i)
  %cmp.i = icmp eq i32 %v.i, 255
  ; Per-pixel step in x and y.
  %sub_x1_load_x0_load = fsub float %x1, %x0
  %width_load_to_float = sitofp i32 %width to float
  %div_sub_x1_load_x0_load_width_load_to_float = fdiv float %sub_x1_load_x0_load, %width_load_to_float
  %sub_y1_load_y0_load = fsub float %y1, %y0
  %height_load_to_float = sitofp i32 %height to float
  %div_sub_y1_load_y0_load_height_load_to_float = fdiv float %sub_y1_load_y0_load, %height_load_to_float
  ; Number of 16-wide tiles in x, rounded up for a positive remainder.
  %width_tiles_floor = sdiv i32 %width, 16
  %width_tile_rem = srem i32 %width, 16
  %width_has_partial = icmp sgt i32 %width_tile_rem, 0
  %width_partial_tile = zext i1 %width_has_partial to i32
  %div_width_load15_ = add i32 %width_tiles_floor, %width_partial_tile
  ; Number of 16-high tiles in y, rounded up for a positive remainder.
  %height_tiles_floor = sdiv i32 %height, 16
  %height_tile_rem = srem i32 %height, 16
  %height_has_partial = icmp sgt i32 %height_tile_rem, 0
  %height_partial_tile = zext i1 %height_has_partial to i32
  %div_height_load16_yspan_load = add i32 %height_tiles_floor, %height_partial_tile
  ; Argument block: 96 bytes, 32-byte aligned (layout in the header comment).
  %args_ptr = call i8* @ISPCAlloc(i8** %launch_group_handle, i64 96, i32 32)
  %funarg = bitcast i8* %args_ptr to float*
  store float %x0, float* %funarg, align 4
  %funarg17 = getelementptr i8* %args_ptr, i64 4
  %0 = bitcast i8* %funarg17 to float*
  store float %div_sub_x1_load_x0_load_width_load_to_float, float* %0, align 4
  %funarg18 = getelementptr i8* %args_ptr, i64 8
  %1 = bitcast i8* %funarg18 to float*
  store float %y0, float* %1, align 4
  %funarg19 = getelementptr i8* %args_ptr, i64 12
  %2 = bitcast i8* %funarg19 to float*
  store float %div_sub_y1_load_y0_load_height_load_to_float, float* %2, align 4
  %funarg20 = getelementptr i8* %args_ptr, i64 16
  %3 = bitcast i8* %funarg20 to i32*
  store i32 %width, i32* %3, align 4
  %funarg21 = getelementptr i8* %args_ptr, i64 20
  %4 = bitcast i8* %funarg21 to i32*
  store i32 %height, i32* %4, align 4
  ; Tile extents; these must stay equal to the divisor 16 used for the counts.
  %funarg22 = getelementptr i8* %args_ptr, i64 24
  %5 = bitcast i8* %funarg22 to i32*
  store i32 16, i32* %5, align 4
  %funarg23 = getelementptr i8* %args_ptr, i64 28
  %6 = bitcast i8* %funarg23 to i32*
  store i32 16, i32* %6, align 4
  %funarg24 = getelementptr i8* %args_ptr, i64 32
  %7 = bitcast i8* %funarg24 to i32*
  store i32 %maxIterations, i32* %7, align 4
  %funarg25 = getelementptr i8* %args_ptr, i64 40
  %8 = bitcast i8* %funarg25 to i32**
  store i32* %output, i32** %8, align 8
  %funarg_mask = getelementptr i8* %args_ptr, i64 64
  %9 = bitcast i8* %funarg_mask to <8 x i32>*
  br i1 %cmp.i, label %all_on, label %some_on

all_on:                                           ; preds = %allocas
  ; Every lane active: store a canonical all-ones mask.
  store <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <8 x i32>* %9, align 32
  call void @ISPCLaunch(i8** %launch_group_handle, i8* bitcast (void ({ float, float, float, float, i32, i32, i32, i32, i32, i32*, <8 x i32> }*, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)* @mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E_ to i8*), i8* %args_ptr, i32 %div_width_load15_, i32 %div_height_load16_yspan_load, i32 1)
  %launch_group_handle_load = load i8** %launch_group_handle, align 8
  %cmp = icmp eq i8* %launch_group_handle_load, null
  br i1 %cmp, label %post_sync, label %call_sync

some_on:                                          ; preds = %allocas
  ; Partially active: forward the caller's mask unchanged.
  store <8 x i32> %__mask, <8 x i32>* %9, align 32
  call void @ISPCLaunch(i8** %launch_group_handle, i8* bitcast (void ({ float, float, float, float, i32, i32, i32, i32, i32, i32*, <8 x i32> }*, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)* @mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E_ to i8*), i8* %args_ptr, i32 %div_width_load15_, i32 %div_height_load16_yspan_load, i32 1)
  %launch_group_handle_load67 = load i8** %launch_group_handle, align 8
  %cmp68 = icmp eq i8* %launch_group_handle_load67, null
  br i1 %cmp68, label %post_sync, label %call_sync69

call_sync:                                        ; preds = %all_on
  ; A non-null handle is passed to ISPCSync, then the slot is cleared.
  call void @ISPCSync(i8* %launch_group_handle_load)
  store i8* null, i8** %launch_group_handle, align 8
  br label %post_sync

post_sync:                                        ; preds = %call_sync69, %call_sync, %some_on, %all_on
  ret void

call_sync69:                                      ; preds = %some_on
  call void @ISPCSync(i8* %launch_group_handle_load67)
  store i8* null, i8** %launch_group_handle, align 8
  br label %post_sync
}
; Exported entry point of the Mandelbrot launcher (no mask parameter; the
; task is always launched with an all-ones execution mask).
;
; Work done here:
;   1. Computes the per-pixel steps (x1 - x0) / width and (y1 - y0) / height.
;   2. Packs a 96-byte, 32-byte-aligned argument block obtained from ISPCAlloc.
;      Byte offsets written below:
;         0  x0                      4  (x1 - x0) / width
;         8  y0                     12  (y1 - y0) / height
;        16  width                  20  height
;        24  constant 16            28  constant 16   (tile extents)
;        32  maxIterations          40  output pointer
;        64  <8 x i32> execution mask (all lanes -1)
;   3. Hands the block to ISPCLaunch together with the tile counts in x and y
;      and a third count of 1, then calls ISPCSync if a launch-group handle
;      was produced.
;
; The tile counts are rounded UP: a plain `sdiv %width, 16` drops the trailing
; partial tile whenever the extent is not a multiple of 16, leaving those
; pixels unwritten. Rounding up relies on the launched scanline task clamping
; its own x/y ranges to width/height. The quotient-plus-remainder form is used
; instead of (n + 15) / 16 so that no intermediate add can overflow, and
; non-positive extents produce exactly the same count as before.
define void @mandelbrot_ispc(float %x0, float %y0, float %x1, float %y1, i32 %width, i32 %height, i32 %maxIterations, i32* %output) {
allocas:
  %launch_group_handle = alloca i8*, align 8
  store i8* null, i8** %launch_group_handle, align 8
  ; Per-pixel step in x and y.
  %sub_x1_load_x0_load = fsub float %x1, %x0
  %width_load_to_float = sitofp i32 %width to float
  %div_sub_x1_load_x0_load_width_load_to_float = fdiv float %sub_x1_load_x0_load, %width_load_to_float
  %sub_y1_load_y0_load = fsub float %y1, %y0
  %height_load_to_float = sitofp i32 %height to float
  %div_sub_y1_load_y0_load_height_load_to_float = fdiv float %sub_y1_load_y0_load, %height_load_to_float
  ; Number of 16-wide tiles in x, rounded up for a positive remainder.
  %width_tiles_floor = sdiv i32 %width, 16
  %width_tile_rem = srem i32 %width, 16
  %width_has_partial = icmp sgt i32 %width_tile_rem, 0
  %width_partial_tile = zext i1 %width_has_partial to i32
  %div_width_load15_ = add i32 %width_tiles_floor, %width_partial_tile
  ; Number of 16-high tiles in y, rounded up for a positive remainder.
  %height_tiles_floor = sdiv i32 %height, 16
  %height_tile_rem = srem i32 %height, 16
  %height_has_partial = icmp sgt i32 %height_tile_rem, 0
  %height_partial_tile = zext i1 %height_has_partial to i32
  %div_height_load16_yspan_load = add i32 %height_tiles_floor, %height_partial_tile
  ; Argument block: 96 bytes, 32-byte aligned (layout in the header comment).
  %args_ptr = call i8* @ISPCAlloc(i8** %launch_group_handle, i64 96, i32 32)
  %funarg = bitcast i8* %args_ptr to float*
  store float %x0, float* %funarg, align 4
  %funarg17 = getelementptr i8* %args_ptr, i64 4
  %0 = bitcast i8* %funarg17 to float*
  store float %div_sub_x1_load_x0_load_width_load_to_float, float* %0, align 4
  %funarg18 = getelementptr i8* %args_ptr, i64 8
  %1 = bitcast i8* %funarg18 to float*
  store float %y0, float* %1, align 4
  %funarg19 = getelementptr i8* %args_ptr, i64 12
  %2 = bitcast i8* %funarg19 to float*
  store float %div_sub_y1_load_y0_load_height_load_to_float, float* %2, align 4
  %funarg20 = getelementptr i8* %args_ptr, i64 16
  %3 = bitcast i8* %funarg20 to i32*
  store i32 %width, i32* %3, align 4
  %funarg21 = getelementptr i8* %args_ptr, i64 20
  %4 = bitcast i8* %funarg21 to i32*
  store i32 %height, i32* %4, align 4
  ; Tile extents; these must stay equal to the divisor 16 used for the counts.
  %funarg22 = getelementptr i8* %args_ptr, i64 24
  %5 = bitcast i8* %funarg22 to i32*
  store i32 16, i32* %5, align 4
  %funarg23 = getelementptr i8* %args_ptr, i64 28
  %6 = bitcast i8* %funarg23 to i32*
  store i32 16, i32* %6, align 4
  %funarg24 = getelementptr i8* %args_ptr, i64 32
  %7 = bitcast i8* %funarg24 to i32*
  store i32 %maxIterations, i32* %7, align 4
  %funarg25 = getelementptr i8* %args_ptr, i64 40
  %8 = bitcast i8* %funarg25 to i32**
  store i32* %output, i32** %8, align 8
  ; All lanes active.
  %funarg_mask = getelementptr i8* %args_ptr, i64 64
  %9 = bitcast i8* %funarg_mask to <8 x i32>*
  store <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <8 x i32>* %9, align 32
  call void @ISPCLaunch(i8** %launch_group_handle, i8* bitcast (void ({ float, float, float, float, i32, i32, i32, i32, i32, i32*, <8 x i32> }*, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)* @mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E_ to i8*), i8* %args_ptr, i32 %div_width_load15_, i32 %div_height_load16_yspan_load, i32 1)
  %launch_group_handle_load = load i8** %launch_group_handle, align 8
  %cmp = icmp eq i8* %launch_group_handle_load, null
  br i1 %cmp, label %post_sync, label %call_sync

call_sync:                                        ; preds = %allocas
  ; A non-null handle is passed to ISPCSync, then the slot is cleared.
  call void @ISPCSync(i8* %launch_group_handle_load)
  store i8* null, i8** %launch_group_handle, align 8
  br label %post_sync

post_sync:                                        ; preds = %call_sync, %allocas
  ret void
}

Binary file not shown.

View File

@@ -0,0 +1,320 @@
//
// Generated by LLVM NVPTX Back-End
//
.version 3.1
.target sm_35, texmode_independent
.address_size 64
// .globl __aos_to_soa4_float1
// @__aos_to_soa4_float1
// Writes four by-value f32 arguments to four caller-supplied addresses:
//   *param_4 = param_0, *param_5 = param_1, *param_6 = param_2, *param_7 = param_3
// Stores are issued in that order.
.func __aos_to_soa4_float1(
	.param .align 4 .b8 __aos_to_soa4_float1_param_0[4],
	.param .align 4 .b8 __aos_to_soa4_float1_param_1[4],
	.param .align 4 .b8 __aos_to_soa4_float1_param_2[4],
	.param .align 4 .b8 __aos_to_soa4_float1_param_3[4],
	.param .b64 __aos_to_soa4_float1_param_4,
	.param .b64 __aos_to_soa4_float1_param_5,
	.param .b64 __aos_to_soa4_float1_param_6,
	.param .b64 __aos_to_soa4_float1_param_7
)
{
	// Only address and f32 value registers are used.
	.reg .s64 %rl<4>;
	.reg .f32 %f<4>;

	// element 0
	ld.param.u64 	%rl0, [__aos_to_soa4_float1_param_4];
	ld.param.f32 	%f0, [__aos_to_soa4_float1_param_0];
	st.f32 	[%rl0], %f0;
	// element 1
	ld.param.u64 	%rl1, [__aos_to_soa4_float1_param_5];
	ld.param.f32 	%f1, [__aos_to_soa4_float1_param_1];
	st.f32 	[%rl1], %f1;
	// element 2
	ld.param.u64 	%rl2, [__aos_to_soa4_float1_param_6];
	ld.param.f32 	%f2, [__aos_to_soa4_float1_param_2];
	st.f32 	[%rl2], %f2;
	// element 3
	ld.param.u64 	%rl3, [__aos_to_soa4_float1_param_7];
	ld.param.f32 	%f3, [__aos_to_soa4_float1_param_3];
	st.f32 	[%rl3], %f3;
	ret;
}
// .globl __soa_to_aos4_float1
// Writes four by-value f32 arguments to four caller-supplied addresses:
//   *param_4 = param_0, *param_5 = param_1, *param_6 = param_2, *param_7 = param_3
// Stores are issued in that order.
.func __soa_to_aos4_float1(
	.param .align 4 .b8 __soa_to_aos4_float1_param_0[4],
	.param .align 4 .b8 __soa_to_aos4_float1_param_1[4],
	.param .align 4 .b8 __soa_to_aos4_float1_param_2[4],
	.param .align 4 .b8 __soa_to_aos4_float1_param_3[4],
	.param .b64 __soa_to_aos4_float1_param_4,
	.param .b64 __soa_to_aos4_float1_param_5,
	.param .b64 __soa_to_aos4_float1_param_6,
	.param .b64 __soa_to_aos4_float1_param_7
)
{
	// Only address and f32 value registers are used.
	.reg .s64 %rl<4>;
	.reg .f32 %f<4>;

	// Fetch all destination addresses, then all values.
	ld.param.u64 	%rl0, [__soa_to_aos4_float1_param_4];
	ld.param.u64 	%rl1, [__soa_to_aos4_float1_param_5];
	ld.param.u64 	%rl2, [__soa_to_aos4_float1_param_6];
	ld.param.u64 	%rl3, [__soa_to_aos4_float1_param_7];
	ld.param.f32 	%f0, [__soa_to_aos4_float1_param_0];
	ld.param.f32 	%f1, [__soa_to_aos4_float1_param_1];
	ld.param.f32 	%f2, [__soa_to_aos4_float1_param_2];
	ld.param.f32 	%f3, [__soa_to_aos4_float1_param_3];

	// Scatter, lowest-numbered destination first.
	st.f32 	[%rl0], %f0;
	st.f32 	[%rl1], %f1;
	st.f32 	[%rl2], %f2;
	st.f32 	[%rl3], %f3;
	ret;
}
// .globl __aos_to_soa3_float1
// Writes three by-value f32 arguments to three caller-supplied addresses:
//   *param_3 = param_0, *param_4 = param_1, *param_5 = param_2
// Stores are issued in that order.
.func __aos_to_soa3_float1(
	.param .align 4 .b8 __aos_to_soa3_float1_param_0[4],
	.param .align 4 .b8 __aos_to_soa3_float1_param_1[4],
	.param .align 4 .b8 __aos_to_soa3_float1_param_2[4],
	.param .b64 __aos_to_soa3_float1_param_3,
	.param .b64 __aos_to_soa3_float1_param_4,
	.param .b64 __aos_to_soa3_float1_param_5
)
{
	// Only address and f32 value registers are used.
	.reg .s64 %rl<3>;
	.reg .f32 %f<3>;

	// element 0
	ld.param.f32 	%f0, [__aos_to_soa3_float1_param_0];
	ld.param.u64 	%rl0, [__aos_to_soa3_float1_param_3];
	st.f32 	[%rl0], %f0;
	// element 1
	ld.param.f32 	%f1, [__aos_to_soa3_float1_param_1];
	ld.param.u64 	%rl1, [__aos_to_soa3_float1_param_4];
	st.f32 	[%rl1], %f1;
	// element 2
	ld.param.f32 	%f2, [__aos_to_soa3_float1_param_2];
	ld.param.u64 	%rl2, [__aos_to_soa3_float1_param_5];
	st.f32 	[%rl2], %f2;
	ret;
}
// .globl __soa_to_aos3_float1
// Writes three by-value f32 arguments to three caller-supplied addresses:
//   *param_3 = param_0, *param_4 = param_1, *param_5 = param_2
// Stores are issued in that order.
.func __soa_to_aos3_float1(
	.param .align 4 .b8 __soa_to_aos3_float1_param_0[4],
	.param .align 4 .b8 __soa_to_aos3_float1_param_1[4],
	.param .align 4 .b8 __soa_to_aos3_float1_param_2[4],
	.param .b64 __soa_to_aos3_float1_param_3,
	.param .b64 __soa_to_aos3_float1_param_4,
	.param .b64 __soa_to_aos3_float1_param_5
)
{
	// Only address and f32 value registers are used.
	.reg .s64 %rl<3>;
	.reg .f32 %f<3>;

	// Fetch the values, then the destination addresses.
	ld.param.f32 	%f0, [__soa_to_aos3_float1_param_0];
	ld.param.f32 	%f1, [__soa_to_aos3_float1_param_1];
	ld.param.f32 	%f2, [__soa_to_aos3_float1_param_2];
	ld.param.u64 	%rl0, [__soa_to_aos3_float1_param_3];
	ld.param.u64 	%rl1, [__soa_to_aos3_float1_param_4];
	ld.param.u64 	%rl2, [__soa_to_aos3_float1_param_5];

	// Scatter, lowest-numbered destination first.
	st.f32 	[%rl0], %f0;
	st.f32 	[%rl1], %f1;
	st.f32 	[%rl2], %f2;
	ret;
}
// .globl mandelbrot_scanline
// Kernel: escape-time iteration over one rectangular tile of a 2-D grid,
// writing one u32 iteration count per in-range pixel.
//
// What the code below does with each parameter:
//   param_0, param_1 : c_re = param_0 + float(x)   * param_1
//   param_2, param_3 : c_im = param_2 + float(row) * param_3
//   param_4          : upper clamp for x; also the row stride of the output
//   param_5          : upper clamp for the row index
//   param_6          : columns covered per block in x (tile start = ctaid.x * param_6)
//   param_7          : rows covered per block in y    (tile start = ctaid.y * param_7)
//   param_8          : iteration limit
//   param_9          : base address of the u32 output array
// NOTE(review): the source-level names are not visible in this file; presumably
// x0, dx, y0, dy, width, height, xspan, yspan, maxIterations, output. Confirm
// against the ISPC source.
//
// Each thread handles x = column_cursor + (tid.x & (WARP_SZ-1)); the cursor
// advances by WARP_SZ per pass.
.entry mandelbrot_scanline(
.param .f32 mandelbrot_scanline_param_0,
.param .f32 mandelbrot_scanline_param_1,
.param .f32 mandelbrot_scanline_param_2,
.param .f32 mandelbrot_scanline_param_3,
.param .u32 mandelbrot_scanline_param_4,
.param .u32 mandelbrot_scanline_param_5,
.param .u32 mandelbrot_scanline_param_6,
.param .u32 mandelbrot_scanline_param_7,
.param .u32 mandelbrot_scanline_param_8,
.param .u64 .ptr .align 4 mandelbrot_scanline_param_9
) // @mandelbrot_scanline
{
.reg .pred %p<396>;
.reg .s16 %rc<396>;
.reg .s16 %rs<396>;
.reg .s32 %r<396>;
.reg .s64 %rl<396>;
.reg .f32 %f<396>;
.reg .f64 %fl<396>;
// BB#0: // %allocas
// Row range of this block: %r0 = ctaid.y * param_7 (first row),
// %r1 = min(%r0 + param_7, param_5). Exit early if the range is empty.
ld.param.u32 %r6, [mandelbrot_scanline_param_5];
ld.param.u32 %r5, [mandelbrot_scanline_param_7];
mov.u32 %r7, %ctaid.y;
mul.lo.s32 %r0, %r7, %r5;
mad.lo.s32 %r1, %r7, %r5, %r5;
setp.lt.s32 %p0, %r1, %r6;
selp.b32 %r1, %r1, %r6, %p0;
setp.ge.s32 %p0, %r0, %r1;
@%p0 bra BB4_13;
// BB#1: // %for_test28.preheader.lr.ph
// Loop-invariant setup. From here on:
//   %f0..%f3 = param_0..param_3
//   %r1 = param_4, %r2 = param_8, %rl0 = param_9
//   %r3 = ctaid.x * param_6                     (first column of the tile)
//   %r4 = min(%r3 + param_6, param_4)           (one past the last column)
//   %p0 = (param_8 > 0)
//   %r5 = min(param_5, (ctaid.y+1) * param_7)   (one past the last row),
//         formed as ~max(~a, ~b)
ld.param.f32 %f0, [mandelbrot_scanline_param_0];
ld.param.f32 %f1, [mandelbrot_scanline_param_1];
ld.param.f32 %f2, [mandelbrot_scanline_param_2];
ld.param.f32 %f3, [mandelbrot_scanline_param_3];
ld.param.u32 %r1, [mandelbrot_scanline_param_4];
ld.param.u32 %r4, [mandelbrot_scanline_param_6];
ld.param.u32 %r2, [mandelbrot_scanline_param_8];
ld.param.u64 %rl0, [mandelbrot_scanline_param_9];
mov.u32 %r8, %ctaid.x;
mul.lo.s32 %r3, %r8, %r4;
mad.lo.s32 %r4, %r8, %r4, %r4;
setp.lt.s32 %p0, %r4, %r1;
selp.b32 %r4, %r4, %r1, %p0;
setp.gt.s32 %p0, %r2, 0;
not.b32 %r6, %r6;
add.s32 %r7, %r7, 1;
mul.lo.s32 %r5, %r7, %r5;
not.b32 %r5, %r5;
setp.gt.s32 %p1, %r6, %r5;
selp.b32 %r5, %r6, %r5, %p1;
not.b32 %r5, %r5;
BB4_2: // %for_test28.preheader
// =>This Loop Header: Depth=1
// Child Loop BB4_15 Depth 2
// Child Loop BB4_8 Depth 2
// Child Loop BB4_11 Depth 3
// Row loop; %r0 is the current row. Skip the row if the column range is empty.
setp.ge.s32 %p1, %r3, %r4;
@%p1 bra BB4_12;
// BB#3: // %for_loop30.lr.ph
// in Loop: Header=BB4_2 Depth=1
// %r6 = row * param_4 (element index of the row start), %r7 = column cursor.
// A non-positive iteration limit takes the BB4_15 path, which only stores zeros.
mul.lo.s32 %r6, %r0, %r1;
mov.u32 %r7, %r3;
@%p0 bra BB4_4;
bra.uni BB4_15;
BB4_4: // in Loop: Header=BB4_2 Depth=1
// %f4 = c_im for this row = param_2 + float(row) * param_3.
cvt.rn.f32.s32 %f4, %r0;
fma.rn.f32 %f4, %f4, %f3, %f2;
mov.u32 %r7, %r3;
BB4_8: // %for_loop.i.lr.ph.us
// Parent Loop BB4_2 Depth=1
// => This Loop Header: Depth=2
// Child Loop BB4_11 Depth 3
// Column loop. %r11 = this thread's x = cursor + (tid.x & (WARP_SZ-1));
// %f5 = c_re = param_0 + float(x) * param_1.
// Iteration state: %r10 = count (0), %f7 = z_re (c_re), %f6 = z_im (c_im),
// %p2 = accumulated "escaped" flag (false), %p3 = still-running flag (true),
// %p1 = constant false used to reset %p3 each iteration.
mov.u32 %r9, %tid.x;
mov.u32 %r8, WARP_SZ;
add.s32 %r10, %r8, -1;
and.b32 %r10, %r10, %r9;
add.s32 %r11, %r10, %r7;
cvt.rn.f32.s32 %f5, %r11;
fma.rn.f32 %f5, %f5, %f1, %f0;
mov.u32 %r10, 0;
mov.pred %p1, 0;
mov.pred %p3, -1;
mov.pred %p4, %p0;
mov.pred %p2, %p1;
mov.f32 %f7, %f5;
mov.f32 %f6, %f4;
BB4_11: // %for_loop.i.us
// Parent Loop BB4_2 Depth=1
// Parent Loop BB4_8 Depth=2
// => This Inner Loop Header: Depth=3
// %p4 = active this iteration. %f9 = z_re^2 + z_im^2; the escape test is
// %f9 > 4.0 (0f40800000), unordered-or-greater so a NaN also escapes.
// %p2 accumulates the escape; %p5 = (%p2 xor %p4) selects whether the z
// update in BB4_9 runs. %p3 is cleared here and only set again in BB4_9.
and.pred %p4, %p3, %p4;
mul.f32 %f8, %f7, %f7;
fma.rn.f32 %f9, %f6, %f6, %f8;
setp.gtu.f32 %p3, %f9, 0f40800000;
and.pred %p3, %p4, %p3;
or.pred %p2, %p3, %p2;
xor.pred %p5, %p2, %p4;
mov.pred %p3, %p1;
@!%p5 bra BB4_10;
bra.uni BB4_9;
BB4_9: // %not_all_continued_or_breaked.i.us
// in Loop: Header=BB4_11 Depth=3
// z <- z^2 + c:
//   new z_re = c_re + (z_re^2 - z_im^2)
//   new z_im = z_im * (2 * z_re) + c_im
// %p3 = active and not escaped (keeps iterating).
mul.f32 %f9, %f6, %f6;
not.pred %p3, %p2;
and.pred %p3, %p4, %p3;
sub.f32 %f8, %f8, %f9;
add.f32 %f8, %f5, %f8;
add.f32 %f7, %f7, %f7;
fma.rn.f32 %f6, %f6, %f7, %f4;
mov.f32 %f7, %f8;
BB4_10: // %for_step.i.us
// in Loop: Header=BB4_11 Depth=3
// Increment the count only while %p3 holds; loop again while the thread is
// still running and the count is below param_8.
add.s32 %r12, %r10, 1;
selp.b32 %r10, %r12, %r10, %p3;
setp.lt.s32 %p4, %r10, %r2;
and.pred %p5, %p3, %p4;
@%p5 bra BB4_11;
// BB#5: // %mandel___vyfvyfvyi.exit.us
// in Loop: Header=BB4_8 Depth=2
// Threads whose x lies at or past the clamped column end do not store.
setp.ge.s32 %p1, %r11, %r4;
@%p1 bra BB4_7;
// BB#6: // %if_then.us
// in Loop: Header=BB4_8 Depth=2
// Store the count at element index row*param_4 + cursor + lane.
// (WARP_SZ + 1073741823) & tid.x matches (WARP_SZ-1) & tid.x in its low 30
// bits; bit 30 is discarded by the shift left by 2 that follows.
// NOTE(review): the index is scaled to a byte offset in 32 bits before the
// sign extension, so it would wrap for element indices >= 2^29. Confirm
// that image sizes stay below that.
add.s32 %r11, %r8, 1073741823;
and.b32 %r9, %r11, %r9;
add.s32 %r11, %r7, %r6;
add.s32 %r9, %r11, %r9;
shl.b32 %r9, %r9, 2;
cvt.s64.s32 %rl1, %r9;
add.s64 %rl1, %rl1, %rl0;
st.u32 [%rl1], %r10;
BB4_7: // %if_exit.us
// in Loop: Header=BB4_8 Depth=2
// Advance the column cursor by WARP_SZ.
add.s32 %r7, %r8, %r7;
setp.lt.s32 %p1, %r7, %r4;
@%p1 bra BB4_8;
bra.uni BB4_12;
BB4_15: // %mandel___vyfvyfvyi.exit
// Parent Loop BB4_2 Depth=1
// => This Inner Loop Header: Depth=2
// Column loop for param_8 <= 0: no iteration is performed; in-range pixels
// receive the value 0.
mov.u32 %r9, %tid.x;
mov.u32 %r8, WARP_SZ;
add.s32 %r10, %r8, -1;
and.b32 %r10, %r10, %r9;
add.s32 %r10, %r10, %r7;
setp.lt.s32 %p1, %r10, %r4;
@%p1 bra BB4_16;
bra.uni BB4_14;
BB4_16: // %if_then
// in Loop: Header=BB4_15 Depth=2
// Same address computation as the store in the iterating path.
add.s32 %r10, %r8, 1073741823;
and.b32 %r9, %r10, %r9;
add.s32 %r10, %r7, %r6;
add.s32 %r9, %r10, %r9;
shl.b32 %r9, %r9, 2;
cvt.s64.s32 %rl1, %r9;
add.s64 %rl1, %rl1, %rl0;
mov.u32 %r9, 0;
st.u32 [%rl1], %r9;
BB4_14: // %if_exit
// in Loop: Header=BB4_15 Depth=2
add.s32 %r7, %r8, %r7;
setp.lt.s32 %p1, %r7, %r4;
@%p1 bra BB4_15;
BB4_12: // %for_exit31
// in Loop: Header=BB4_2 Depth=1
// Next row; stop once the clamped row end (%r5) is reached.
add.s32 %r0, %r0, 1;
setp.eq.s32 %p1, %r0, %r5;
@%p1 bra BB4_13;
bra.uni BB4_2;
BB4_13: // %for_exit
ret;
}

View File

@@ -0,0 +1,534 @@
//
// Generated by LLVM NVPTX Back-End
//
.version 3.1
.target sm_20, texmode_independent
.address_size 64
// .globl __vselect_i8
// Forward declarations: three parameterless device functions, each returning
// a 32-bit value in func_retval0. Their bodies are not in this part of the file.
.func (.param .b32 func_retval0) getBlockIndex0___UM_ ();
.func (.param .b32 func_retval0) getBlockIndex1___UM_ ();
.func (.param .b32 func_retval0) getLaneIndex___UM_ ();
// @__vselect_i8
// 8-bit select on a 32-bit mask:
//   returns param_1 when param_2 != 0, otherwise param_0.
.func (.param .align 1 .b8 func_retval0[1]) __vselect_i8(
	.param .align 1 .b8 __vselect_i8_param_0[1],
	.param .align 1 .b8 __vselect_i8_param_1[1],
	.param .align 4 .b8 __vselect_i8_param_2[4]
)
{
	.reg .pred %p<1>;
	.reg .s16 %rc<2>;
	.reg .s32 %r<1>;

	ld.param.u8 	%rc0, [__vselect_i8_param_0];
	ld.param.u8 	%rc1, [__vselect_i8_param_1];
	ld.param.u32 	%r0, [__vselect_i8_param_2];
	// %p0 = mask is on
	setp.ne.s32 	%p0, %r0, 0;
	selp.b16 	%rc0, %rc1, %rc0, %p0;
	st.param.b8 	[func_retval0+0], %rc0;
	ret;
}
// .globl __vselect_i16
// 16-bit select on a 32-bit mask:
//   returns param_1 when param_2 != 0, otherwise param_0.
.func (.param .align 2 .b8 func_retval0[2]) __vselect_i16(
	.param .align 2 .b8 __vselect_i16_param_0[2],
	.param .align 2 .b8 __vselect_i16_param_1[2],
	.param .align 4 .b8 __vselect_i16_param_2[4]
)
{
	.reg .pred %p<1>;
	.reg .s16 %rs<2>;
	.reg .s32 %r<1>;

	ld.param.u16 	%rs0, [__vselect_i16_param_0];
	ld.param.u16 	%rs1, [__vselect_i16_param_1];
	ld.param.u32 	%r0, [__vselect_i16_param_2];
	// %p0 = mask is on
	setp.ne.s32 	%p0, %r0, 0;
	selp.b16 	%rs0, %rs1, %rs0, %p0;
	st.param.b16 	[func_retval0+0], %rs0;
	ret;
}
// .globl __vselect_i64
// 64-bit select on a 32-bit mask:
//   returns param_1 when param_2 != 0, otherwise param_0.
.func (.param .align 8 .b8 func_retval0[8]) __vselect_i64(
	.param .align 8 .b8 __vselect_i64_param_0[8],
	.param .align 8 .b8 __vselect_i64_param_1[8],
	.param .align 4 .b8 __vselect_i64_param_2[4]
)
{
	.reg .pred %p<1>;
	.reg .s32 %r<1>;
	.reg .s64 %rl<2>;

	ld.param.u64 	%rl0, [__vselect_i64_param_0];
	ld.param.u64 	%rl1, [__vselect_i64_param_1];
	ld.param.u32 	%r0, [__vselect_i64_param_2];
	// %p0 = mask is on
	setp.ne.s32 	%p0, %r0, 0;
	selp.b64 	%rl0, %rl1, %rl0, %p0;
	st.param.b64 	[func_retval0+0], %rl0;
	ret;
}
// .globl __aos_to_soa4_float1
// Writes four by-value f32 arguments to four caller-supplied addresses:
//   *param_4 = param_0, *param_5 = param_1, *param_6 = param_2, *param_7 = param_3
// Stores are issued in that order.
.func __aos_to_soa4_float1(
	.param .align 4 .b8 __aos_to_soa4_float1_param_0[4],
	.param .align 4 .b8 __aos_to_soa4_float1_param_1[4],
	.param .align 4 .b8 __aos_to_soa4_float1_param_2[4],
	.param .align 4 .b8 __aos_to_soa4_float1_param_3[4],
	.param .b64 __aos_to_soa4_float1_param_4,
	.param .b64 __aos_to_soa4_float1_param_5,
	.param .b64 __aos_to_soa4_float1_param_6,
	.param .b64 __aos_to_soa4_float1_param_7
)
{
	// Only address and f32 value registers are used.
	.reg .s64 %rl<4>;
	.reg .f32 %f<4>;

	// element 0
	ld.param.u64 	%rl0, [__aos_to_soa4_float1_param_4];
	ld.param.f32 	%f0, [__aos_to_soa4_float1_param_0];
	st.f32 	[%rl0], %f0;
	// element 1
	ld.param.u64 	%rl1, [__aos_to_soa4_float1_param_5];
	ld.param.f32 	%f1, [__aos_to_soa4_float1_param_1];
	st.f32 	[%rl1], %f1;
	// element 2
	ld.param.u64 	%rl2, [__aos_to_soa4_float1_param_6];
	ld.param.f32 	%f2, [__aos_to_soa4_float1_param_2];
	st.f32 	[%rl2], %f2;
	// element 3
	ld.param.u64 	%rl3, [__aos_to_soa4_float1_param_7];
	ld.param.f32 	%f3, [__aos_to_soa4_float1_param_3];
	st.f32 	[%rl3], %f3;
	ret;
}
// .globl __soa_to_aos4_float1
// Writes four by-value f32 arguments to four caller-supplied addresses:
//   *param_4 = param_0, *param_5 = param_1, *param_6 = param_2, *param_7 = param_3
// Stores are issued in that order.
.func __soa_to_aos4_float1(
	.param .align 4 .b8 __soa_to_aos4_float1_param_0[4],
	.param .align 4 .b8 __soa_to_aos4_float1_param_1[4],
	.param .align 4 .b8 __soa_to_aos4_float1_param_2[4],
	.param .align 4 .b8 __soa_to_aos4_float1_param_3[4],
	.param .b64 __soa_to_aos4_float1_param_4,
	.param .b64 __soa_to_aos4_float1_param_5,
	.param .b64 __soa_to_aos4_float1_param_6,
	.param .b64 __soa_to_aos4_float1_param_7
)
{
	// Only address and f32 value registers are used.
	.reg .s64 %rl<4>;
	.reg .f32 %f<4>;

	// Fetch the values, then the destination addresses.
	ld.param.f32 	%f0, [__soa_to_aos4_float1_param_0];
	ld.param.f32 	%f1, [__soa_to_aos4_float1_param_1];
	ld.param.f32 	%f2, [__soa_to_aos4_float1_param_2];
	ld.param.f32 	%f3, [__soa_to_aos4_float1_param_3];
	ld.param.u64 	%rl0, [__soa_to_aos4_float1_param_4];
	ld.param.u64 	%rl1, [__soa_to_aos4_float1_param_5];
	ld.param.u64 	%rl2, [__soa_to_aos4_float1_param_6];
	ld.param.u64 	%rl3, [__soa_to_aos4_float1_param_7];

	// Scatter, lowest-numbered destination first.
	st.f32 	[%rl0], %f0;
	st.f32 	[%rl1], %f1;
	st.f32 	[%rl2], %f2;
	st.f32 	[%rl3], %f3;
	ret;
}
// .globl __aos_to_soa3_float1
// Writes three by-value f32 arguments to three caller-supplied addresses:
//   *param_3 = param_0, *param_4 = param_1, *param_5 = param_2
// Stores are issued in that order.
.func __aos_to_soa3_float1(
	.param .align 4 .b8 __aos_to_soa3_float1_param_0[4],
	.param .align 4 .b8 __aos_to_soa3_float1_param_1[4],
	.param .align 4 .b8 __aos_to_soa3_float1_param_2[4],
	.param .b64 __aos_to_soa3_float1_param_3,
	.param .b64 __aos_to_soa3_float1_param_4,
	.param .b64 __aos_to_soa3_float1_param_5
)
{
	// Only address and f32 value registers are used.
	.reg .s64 %rl<3>;
	.reg .f32 %f<3>;

	// element 0
	ld.param.f32 	%f0, [__aos_to_soa3_float1_param_0];
	ld.param.u64 	%rl0, [__aos_to_soa3_float1_param_3];
	st.f32 	[%rl0], %f0;
	// element 1
	ld.param.f32 	%f1, [__aos_to_soa3_float1_param_1];
	ld.param.u64 	%rl1, [__aos_to_soa3_float1_param_4];
	st.f32 	[%rl1], %f1;
	// element 2
	ld.param.f32 	%f2, [__aos_to_soa3_float1_param_2];
	ld.param.u64 	%rl2, [__aos_to_soa3_float1_param_5];
	st.f32 	[%rl2], %f2;
	ret;
}
// .globl __soa_to_aos3_float1
// Writes three by-value f32 arguments to three caller-supplied addresses:
//   *param_3 = param_0, *param_4 = param_1, *param_5 = param_2
// Stores are issued in that order.
.func __soa_to_aos3_float1(
	.param .align 4 .b8 __soa_to_aos3_float1_param_0[4],
	.param .align 4 .b8 __soa_to_aos3_float1_param_1[4],
	.param .align 4 .b8 __soa_to_aos3_float1_param_2[4],
	.param .b64 __soa_to_aos3_float1_param_3,
	.param .b64 __soa_to_aos3_float1_param_4,
	.param .b64 __soa_to_aos3_float1_param_5
)
{
	// Only address and f32 value registers are used.
	.reg .s64 %rl<3>;
	.reg .f32 %f<3>;

	// Fetch the values, then the destination addresses.
	ld.param.f32 	%f0, [__soa_to_aos3_float1_param_0];
	ld.param.f32 	%f1, [__soa_to_aos3_float1_param_1];
	ld.param.f32 	%f2, [__soa_to_aos3_float1_param_2];
	ld.param.u64 	%rl0, [__soa_to_aos3_float1_param_3];
	ld.param.u64 	%rl1, [__soa_to_aos3_float1_param_4];
	ld.param.u64 	%rl2, [__soa_to_aos3_float1_param_5];

	// Scatter, lowest-numbered destination first.
	st.f32 	[%rl0], %f0;
	st.f32 	[%rl1], %f1;
	st.f32 	[%rl2], %f2;
	ret;
}
// .globl __rsqrt_varying_double
// __rsqrt_varying_double: returns the approximate reciprocal square root
// (rsqrt.approx.f64) of the single 8-byte double argument.
.func (.param .align 8 .b8 func_retval0[8]) __rsqrt_varying_double(
    .param .align 8 .b8 __rsqrt_varying_double_param_0[8]
)
{
    .reg .f64 %arg;         // input operand
    .reg .f64 %res;         // approximate 1/sqrt(arg)

    ld.param.f64 %arg, [__rsqrt_varying_double_param_0];
    rsqrt.approx.f64 %res, %arg;
    st.param.f64 [func_retval0+0], %res;
    ret;
}
// .globl mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E_
// Task body for one tile of the Mandelbrot image.
// Only param_0 is read in this body: it is a 64-bit pointer to an argument
// block from which all inputs are loaded. param_1..param_10 are declared but
// never loaded here.
// The tile position comes from getBlockIndex0___UM_ / getBlockIndex1___UM_ and
// the per-lane column offset from getLaneIndex___UM_ (all defined elsewhere).
// For every in-range element the function stores one 32-bit iteration count
// through the pointer loaded from offset +40 of the argument block.
.func mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E_(
.param .b64 mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_0,
.param .b32 mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_1,
.param .b32 mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_2,
.param .b32 mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_3,
.param .b32 mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_4,
.param .b32 mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_5,
.param .b32 mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_6,
.param .b32 mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_7,
.param .b32 mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_8,
.param .b32 mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_9,
.param .b32 mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_10
) // @mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E_
{
.reg .pred %p<396>;
.reg .s16 %rc<396>;
.reg .s16 %rs<396>;
.reg .s32 %r<396>;
.reg .s64 %rl<396>;
.reg .f32 %f<396>;
.reg .f64 %fl<396>;
// BB#0: // %allocas
// Unpack the argument block: four floats at +0..+12, five 32-bit ints at
// +16..+32 and a 64-bit pointer at +40.
// NOTE(review): the field order presumably is x0, dx, y0, dy, width, height,
// xspan, yspan, maxIterations, output -- confirm against the code that fills
// the block.
ld.param.u64 %rl0, [mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_0];
ld.f32 %f0, [%rl0];
ld.f32 %f1, [%rl0+4];
ld.f32 %f2, [%rl0+8];
ld.f32 %f3, [%rl0+12];
ld.u32 %r0, [%rl0+16];
ld.u32 %r6, [%rl0+20];
ld.u32 %r7, [%rl0+24];
ld.u32 %r8, [%rl0+28];
ld.u32 %r1, [%rl0+32];
// From here on %rl0 no longer points at the argument block: it is overwritten
// with the store base pointer read from +40.
ld.u64 %rl0, [%rl0+40];
// %r9 = getBlockIndex0___UM_()
// Callseq Start 0
{
.reg .b32 temp_param_reg;
// <end>}
.param .b32 retval0;
call.uni (retval0),
getBlockIndex0___UM_,
(
);
ld.param.b32 %r9, [retval0+0];
//{
}// Callseq End 0
// %r10 = getBlockIndex1___UM_()
// Callseq Start 1
{
.reg .b32 temp_param_reg;
// <end>}
.param .b32 retval0;
call.uni (retval0),
getBlockIndex1___UM_,
(
);
ld.param.b32 %r10, [retval0+0];
//{
}// Callseq End 1
// Row range of this tile: %r2 = %r10 * [+28] (first row),
// %r3 = min(%r2 + [+28], [+20]) (one past the last row).
// Nothing to do when the range is empty.
mul.lo.s32 %r2, %r10, %r8;
mad.lo.s32 %r3, %r10, %r8, %r8;
setp.lt.s32 %p0, %r3, %r6;
selp.b32 %r3, %r3, %r6, %p0;
setp.ge.s32 %p0, %r2, %r3;
@%p0 bra BB8_14;
// BB#1: // %for_test34.preheader.lr.ph
// Column range: %r3 = %r9 * [+24] (first column),
// %r4 = min(%r3 + [+24], [+16]) (one past the last column).
mul.lo.s32 %r3, %r9, %r7;
mad.lo.s32 %r4, %r9, %r7, %r7;
setp.lt.s32 %p0, %r4, %r0;
selp.b32 %r4, %r4, %r0, %p0;
// %r5 = all-ones when [+32] > 0, else 0; used as the initial lane mask below.
setp.gt.s32 %p0, %r1, 0;
selp.b32 %r5, -1, 0, %p0;
// %r6 = ~max(~[+20], ~((%r10 + 1) * [+28])) = min([+20], (%r10 + 1) * [+28]):
// the row index at which the outer loop stops.
not.b32 %r6, %r6;
add.s32 %r11, %r10, 1;
mul.lo.s32 %r11, %r8, %r11;
not.b32 %r11, %r11;
setp.gt.s32 %p0, %r6, %r11;
selp.b32 %r6, %r6, %r11, %p0;
not.b32 %r6, %r6;
// %r7 = first_row * [+16] + first_column: linear element index of the tile's
// first element, advanced by [+16] per row at BB8_13.
mul.lo.s32 %r8, %r10, %r8;
mul.lo.s32 %r8, %r8, %r0;
mad.lo.s32 %r7, %r9, %r7, %r8;
BB8_2: // %for_test34.preheader
// =>This Loop Header: Depth=1
// Child Loop BB8_16 Depth 2
// Child Loop BB8_8 Depth 2
// Child Loop BB8_9 Depth 3
// Outer loop over rows (%r2). Skip the row when the column range is empty.
setp.ge.s32 %p0, %r3, %r4;
@%p0 bra BB8_13;
// BB#3: // %for_loop36.lr.ph
// in Loop: Header=BB8_2 Depth=1
// %r5 < 0 means [+32] > 0: take the iterating path (BB8_4). Otherwise take
// the path that only stores zeros (BB8_16), with %r8 = running linear index
// and %r9 = running column.
setp.lt.s32 %p0, %r5, 0;
mov.u32 %r8, %r7;
mov.u32 %r9, %r3;
@%p0 bra BB8_4;
bra.uni BB8_16;
BB8_4: // in Loop: Header=BB8_2 Depth=1
// Per-row setup: %f4 = %f3 * float(row) + %f2, %r8 = row * [+16],
// %r9 = first column.
cvt.rn.f32.s32 %f4, %r2;
mul.lo.s32 %r8, %r2, %r0;
fma.rn.f32 %f4, %f3, %f4, %f2;
mov.u32 %r9, %r3;
BB8_8: // %for_loop.i178.lr.ph.us
// Parent Loop BB8_2 Depth=1
// => This Loop Header: Depth=2
// Child Loop BB8_9 Depth 3
// Callseq Start 5
{
.reg .b32 temp_param_reg;
// <end>}
.param .b32 retval0;
call.uni (retval0),
getLaneIndex___UM_,
(
);
ld.param.b32 %r10, [retval0+0];
//{
}// Callseq End 5
// %f5 = %f1 * float(lane + column) + %f0.
// Initial loop state: %r10 = iteration count (0), %r12 = accumulated
// "exited" mask (0), %r13 = active mask (%r5), %f7 / %f6 = the two running
// values, seeded from %f5 / %f4.
add.s32 %r10, %r10, %r9;
cvt.rn.f32.s32 %f5, %r10;
fma.rn.f32 %f5, %f1, %f5, %f0;
mov.u32 %r11, 0;
mov.u32 %r13, %r5;
mov.u32 %r12, %r11;
mov.u32 %r10, %r11;
mov.f32 %f7, %f5;
mov.f32 %f6, %f4;
BB8_9: // %for_loop.i178.us
// Parent Loop BB8_2 Depth=1
// Parent Loop BB8_8 Depth=2
// => This Inner Loop Header: Depth=3
// %f9 = %f7*%f7 + %f6*%f6. 0f40800000 is 4.0f; setp.gtu is also true when
// the comparison is unordered (NaN).
mul.f32 %f8, %f7, %f7;
fma.rn.f32 %f9, %f6, %f6, %f8;
setp.gtu.f32 %p0, %f9, 0f40800000;
// Fold this lane's exit into %r12, then compare the sign bits of the exited
// mask and the active mask: equal means nothing is left to advance.
selp.b32 %r14, %r13, 0, %p0;
or.b32 %r12, %r14, %r12;
shr.u32 %r14, %r12, 31;
shr.u32 %r15, %r13, 31;
setp.eq.s32 %p0, %r14, %r15;
@%p0 bra BB8_10;
bra.uni BB8_11;
BB8_10: // in Loop: Header=BB8_9 Depth=3
// Clear the active mask (%r11 holds 0).
mov.u32 %r13, %r11;
bra.uni BB8_12;
BB8_11: // %not_all_continued_or_breaked.i192.us
// in Loop: Header=BB8_9 Depth=3
// Advance one step:
//   new %f7 = %f5 + (%f7*%f7 - %f6*%f6)
//   new %f6 = %f6 * (2*%f7) + %f4
// and drop exited lanes from the active mask.
mul.f32 %f9, %f6, %f6;
not.b32 %r14, %r12;
and.b32 %r13, %r13, %r14;
sub.f32 %f8, %f8, %f9;
add.f32 %f8, %f5, %f8;
add.f32 %f7, %f7, %f7;
fma.rn.f32 %f6, %f6, %f7, %f4;
mov.f32 %f7, %f8;
BB8_12: // %for_step.i161.us
// in Loop: Header=BB8_9 Depth=3
// Count the iteration only while still active, then clear the mask once the
// count reaches [+32]; keep looping while the mask's sign bit is set.
setp.ne.s32 %p0, %r13, 0;
selp.u32 %r14, 1, 0, %p0;
add.s32 %r10, %r10, %r14;
setp.lt.s32 %p0, %r10, %r1;
selp.b32 %r13, %r13, 0, %p0;
setp.lt.s32 %p0, %r13, 0;
@%p0 bra BB8_9;
// BB#5: // %mandel___vyfvyfvyi.exit193.us
// in Loop: Header=BB8_8 Depth=2
// %r11 = lane index (used in the store offset below).
// Callseq Start 6
{
.reg .b32 temp_param_reg;
// <end>}
.param .b32 retval0;
call.uni (retval0),
getLaneIndex___UM_,
(
);
ld.param.b32 %r11, [retval0+0];
//{
}// Callseq End 6
// %r12 = lane index again (used for the bounds check).
// Callseq Start 7
{
.reg .b32 temp_param_reg;
// <end>}
.param .b32 retval0;
call.uni (retval0),
getLaneIndex___UM_,
(
);
ld.param.b32 %r12, [retval0+0];
//{
}// Callseq End 7
// Skip the store when lane + column falls outside the column range.
add.s32 %r12, %r12, %r9;
setp.ge.s32 %p0, %r12, %r4;
@%p0 bra BB8_7;
// BB#6: // %if_then.us
// in Loop: Header=BB8_8 Depth=2
// Store the iteration count at base + 4 * (column + row*[+16] + lane); the
// byte offset is computed in 32 bits and then sign-extended.
add.s32 %r12, %r9, %r8;
add.s32 %r11, %r12, %r11;
shl.b32 %r11, %r11, 2;
cvt.s64.s32 %rl1, %r11;
add.s64 %rl1, %rl1, %rl0;
st.u32 [%rl1], %r10;
BB8_7: // %if_exit.us
// in Loop: Header=BB8_8 Depth=2
// Advance the column by a fixed step of 32.
add.s32 %r9, %r9, 32;
setp.lt.s32 %p0, %r9, %r4;
@%p0 bra BB8_8;
bra.uni BB8_13;
BB8_16: // %mandel___vyfvyfvyi.exit193
// Parent Loop BB8_2 Depth=1
// => This Inner Loop Header: Depth=2
// Path taken when [+32] <= 0: every in-range element is written as 0.
// getLaneIndex___UM_ is called three times; the first result in %r10 is
// overwritten by the second call.
// Callseq Start 2
{
.reg .b32 temp_param_reg;
// <end>}
.param .b32 retval0;
call.uni (retval0),
getLaneIndex___UM_,
(
);
ld.param.b32 %r10, [retval0+0];
//{
}// Callseq End 2
// Callseq Start 3
{
.reg .b32 temp_param_reg;
// <end>}
.param .b32 retval0;
call.uni (retval0),
getLaneIndex___UM_,
(
);
ld.param.b32 %r10, [retval0+0];
//{
}// Callseq End 3
// Callseq Start 4
{
.reg .b32 temp_param_reg;
// <end>}
.param .b32 retval0;
call.uni (retval0),
getLaneIndex___UM_,
(
);
ld.param.b32 %r11, [retval0+0];
//{
}// Callseq End 4
// Bounds check: store only when lane + column is inside the column range.
add.s32 %r11, %r11, %r9;
setp.lt.s32 %p0, %r11, %r4;
@%p0 bra BB8_17;
bra.uni BB8_15;
BB8_17: // %if_then
// in Loop: Header=BB8_16 Depth=2
// Store 0 at base + 4 * (lane + running linear index %r8).
add.s32 %r10, %r10, %r8;
shl.b32 %r10, %r10, 2;
cvt.s64.s32 %rl1, %r10;
add.s64 %rl1, %rl1, %rl0;
mov.u32 %r10, 0;
st.u32 [%rl1], %r10;
BB8_15: // %if_exit
// in Loop: Header=BB8_16 Depth=2
// Advance both the column and the linear index by the fixed step of 32.
add.s32 %r9, %r9, 32;
add.s32 %r8, %r8, 32;
setp.lt.s32 %p0, %r9, %r4;
@%p0 bra BB8_16;
BB8_13: // %for_exit37
// in Loop: Header=BB8_2 Depth=1
// Next row: bump the row counter and move the row base index by [+16];
// stop when the row counter reaches the end row in %r6.
add.s32 %r2, %r2, 1;
add.s32 %r7, %r7, %r0;
setp.eq.s32 %p0, %r2, %r6;
@%p0 bra BB8_14;
bra.uni BB8_2;
BB8_14: // %for_exit
ret;
}

View File

@@ -0,0 +1,178 @@
//
// Generated by NVIDIA NVVM Compiler
// Compiler built on Thu Jul 18 02:37:37 2013 (1374107857)
// Cuda compilation tools, release 5.5, V5.5.0
//
.version 3.2
.target sm_20
.address_size 64
// Kernel entry point: computes one tile of the Mandelbrot image per CTA.
// Parameters (names are not visible in this file; roles below are read off
// how each value is used):
//   param_0 / param_1 : float offset and per-column scale  (c_re = col*p1 + p0)
//   param_2 / param_3 : float offset and per-row scale     (c_im = row*p3 + p2)
//   param_4           : column limit and row stride of the output
//   param_5           : row limit
//   param_6 / param_7 : tile extent in columns / rows (multiplied by ctaid.x/.y)
//   param_8           : iteration limit
//   param_9           : output base address (one 32-bit store per element)
// NOTE(review): presumably x0, dx, y0, dy, width, height, xspan, yspan,
// maxIterations, output -- confirm against the kernel's source.
.visible .entry mandelbrot_scanline(
.param .f32 mandelbrot_scanline_param_0,
.param .f32 mandelbrot_scanline_param_1,
.param .f32 mandelbrot_scanline_param_2,
.param .f32 mandelbrot_scanline_param_3,
.param .u32 mandelbrot_scanline_param_4,
.param .u32 mandelbrot_scanline_param_5,
.param .u32 mandelbrot_scanline_param_6,
.param .u32 mandelbrot_scanline_param_7,
.param .u32 mandelbrot_scanline_param_8,
.param .u64 mandelbrot_scanline_param_9
)
{
.reg .pred %p<32>;
.reg .s32 %r<62>;
.reg .f32 %f<28>;
.reg .s64 %rd<6>;
ld.param.f32 %f11, [mandelbrot_scanline_param_0];
ld.param.f32 %f12, [mandelbrot_scanline_param_1];
ld.param.f32 %f13, [mandelbrot_scanline_param_2];
ld.param.f32 %f14, [mandelbrot_scanline_param_3];
ld.param.u32 %r17, [mandelbrot_scanline_param_4];
ld.param.u32 %r18, [mandelbrot_scanline_param_5];
ld.param.u32 %r19, [mandelbrot_scanline_param_6];
ld.param.u32 %r20, [mandelbrot_scanline_param_7];
ld.param.u32 %r21, [mandelbrot_scanline_param_8];
ld.param.u64 %rd1, [mandelbrot_scanline_param_9];
// Column end: %r1 = min(ctaid.x * p6 + p6, p4).
mov.u32 %r22, %ctaid.x;
mad.lo.s32 %r23, %r22, %r19, %r19;
min.s32 %r1, %r23, %r17;
// Row range: %r59 = ctaid.y * p7 (current row), %r25 = min(%r59 + p7, p5).
// Exit immediately when the row range is empty.
mov.u32 %r2, %ctaid.y;
mul.lo.s32 %r59, %r2, %r20;
add.s32 %r24, %r59, %r20;
min.s32 %r25, %r24, %r18;
setp.ge.s32 %p10, %r59, %r25;
@%p10 bra BB0_15;
// %r4 = ~max(~p5, ~((ctaid.y + 1) * p7)) = min(p5, (ctaid.y + 1) * p7):
// the row index at which the outer loop stops.
not.b32 %r26, %r18;
add.s32 %r27, %r2, 1;
mul.lo.s32 %r28, %r27, %r20;
not.b32 %r29, %r28;
max.s32 %r30, %r26, %r29;
not.b32 %r4, %r30;
BB0_2:
// Outer loop over rows. %r60 = first column of the tile; skip the row when
// the column range is empty.
mul.lo.s32 %r60, %r22, %r19;
setp.ge.s32 %p11, %r60, %r1;
@%p11 bra BB0_14;
// %f1 = float(row) * p3 + p2. %p12 = (p8 > 0) selects the iterating path.
cvt.rn.f32.s32 %f15, %r59;
setp.gt.s32 %p12, %r21, 0;
fma.rn.f32 %f1, %f15, %f14, %f13;
@%p12 bra BB0_7;
BB0_4:
// Path for p8 <= 0: every in-range element is written as 0.
// Lane = tid.x & (WARP_SZ - 1); skip the store when lane + column >= %r1.
mov.u32 %r8, WARP_SZ;
add.s32 %r34, %r8, -1;
mov.u32 %r35, %tid.x;
and.b32 %r36, %r34, %r35;
add.s32 %r37, %r36, %r60;
setp.ge.s32 %p13, %r37, %r1;
@%p13 bra BB0_6;
// Element index = row * p4 + column + lane. The lane is recomputed with the
// mask WARP_SZ + 1073741823 (0x3FFFFFFF); this differs from WARP_SZ - 1 only
// in bit 30, which the shift left by 2 below discards.
mad.lo.s32 %r38, %r59, %r17, %r60;
add.s32 %r40, %r8, 1073741823;
and.b32 %r42, %r40, %r35;
add.s32 %r43, %r38, %r42;
shl.b32 %r44, %r43, 2;
cvt.s64.s32 %rd2, %r44;
add.s64 %rd3, %rd2, %rd1;
mov.u32 %r45, 0;
st.u32 [%rd3], %r45;
BB0_6:
// Advance the column by WARP_SZ.
add.s32 %r60, %r8, %r60;
setp.lt.s32 %p14, %r60, %r1;
@%p14 bra BB0_4;
bra.uni BB0_14;
BB0_7:
// Iterating path. %r11 = lane + column; %f2 = float(%r11) * p1 + p0.
// Initial state: %r61 = iteration count (0), %p29 = active (true),
// %p26 = count-below-limit (%p12), %p31 = exited (false),
// %f22 / %f26 = the two running values, seeded from %f2 / %f1.
mov.u32 %r47, WARP_SZ;
add.s32 %r48, %r47, -1;
mov.u32 %r49, %tid.x;
and.b32 %r50, %r48, %r49;
add.s32 %r11, %r50, %r60;
cvt.rn.f32.s32 %f16, %r11;
fma.rn.f32 %f2, %f16, %f12, %f11;
mov.u32 %r61, 0;
mov.pred %p16, 0;
mov.pred %p29, -1;
mov.pred %p26, %p12;
mov.pred %p31, %p16;
mov.f32 %f22, %f2;
mov.f32 %f26, %f1;
BB0_8:
// Loop header: the mov chains below are register copies of the loop-carried
// values; %p5 = active && count-below-limit.
mov.f32 %f24, %f26;
mov.f32 %f27, %f24;
mov.f32 %f20, %f22;
mov.f32 %f23, %f20;
mov.pred %p3, %p29;
mov.pred %p2, %p26;
and.pred %p5, %p3, %p2;
// %f17 = %f27*%f27 + %f23*%f23. 0f40800000 is 4.0f; setp.gtu is also true
// when the comparison is unordered (NaN).
mul.f32 %f6, %f23, %f23;
mul.f32 %f5, %f27, %f27;
add.f32 %f17, %f5, %f6;
setp.gtu.f32 %p18, %f17, 0f40800000;
and.pred %p19, %p5, %p18;
or.pred %p31, %p19, %p31;
// %p20 = exited XOR running: when false, skip the update and leave the next
// active flag (%p30) false.
xor.pred %p20, %p31, %p5;
mov.pred %p30, %p16;
@!%p20 bra BB0_10;
bra.uni BB0_9;
BB0_9:
// Advance one step:
//   new %f27 = %f27 * (2*%f23) + %f1
//   new %f23 = %f2 + (%f23*%f23 - %f27*%f27)
// and stay active only if not exited.
add.f32 %f18, %f23, %f23;
fma.rn.f32 %f27, %f27, %f18, %f1;
sub.f32 %f19, %f6, %f5;
add.f32 %f23, %f2, %f19;
not.pred %p21, %p31;
and.pred %p7, %p5, %p21;
mov.pred %p30, %p7;
BB0_10:
// Count the iteration only while still active; loop again while active and
// the count is below p8.
mov.f32 %f9, %f27;
mov.f32 %f10, %f23;
mov.pred %p28, %p30;
mov.pred %p29, %p28;
add.s32 %r51, %r61, 1;
selp.b32 %r61, %r51, %r61, %p29;
setp.lt.s32 %p9, %r61, %r21;
and.pred %p22, %p29, %p9;
mov.pred %p26, %p9;
mov.f32 %f22, %f10;
mov.f32 %f26, %f9;
@%p22 bra BB0_8;
// Skip the store when lane + column is outside the column range.
setp.ge.s32 %p23, %r11, %r1;
@%p23 bra BB0_13;
// Store the iteration count at base + 4 * (row * p4 + column + lane); see the
// note in BB0_4 about the 1073741823 mask.
mad.lo.s32 %r52, %r59, %r17, %r60;
add.s32 %r54, %r47, 1073741823;
and.b32 %r56, %r54, %r49;
add.s32 %r57, %r52, %r56;
shl.b32 %r58, %r57, 2;
cvt.s64.s32 %rd4, %r58;
add.s64 %rd5, %rd4, %rd1;
st.u32 [%rd5], %r61;
BB0_13:
// Advance the column by WARP_SZ.
add.s32 %r60, %r47, %r60;
setp.lt.s32 %p24, %r60, %r1;
@%p24 bra BB0_7;
BB0_14:
// Next row; stop once the row counter reaches the end row in %r4.
add.s32 %r59, %r59, 1;
setp.ne.s32 %p25, %r59, %r4;
@%p25 bra BB0_2;
BB0_15:
ret;
}

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,171 @@
.file "mandelbrot_task.ispc"
.text
# ----------------------------------------------------------------------
# mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E_
# Syntax: AT&T (GAS), x86-64 with AVX.
# Inputs read by this body:
#   xmm0, xmm1, xmm2, xmm3 : four scalar floats
#   edi, esi, edx          : three 32-bit ints
#   rcx                    : 64-bit value stored as-is into the argument block
#   ymm4                   : 8-lane mask (tested with vmovmskps against 255)
# Behaviour: allocates a 96-byte, 32-byte-aligned argument block through
# ISPCAlloc, fills it, launches
# mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E_ through
# ISPCLaunch with counts (edi/16, esi/16, 1), then calls ISPCSync if the
# launch handle is non-null.
# Stack: 4 pushes + 88 bytes of locals, so rsp is 16-byte aligned at each call.
#   28(%rsp) xmm0 | 32(%rsp) ymm4 | 68(%rsp) (xmm2-xmm0)/float(edi)
#   72(%rsp) (xmm3-xmm1)/float(esi) | 76(%rsp) xmm1 | 80(%rsp) launch handle
# ----------------------------------------------------------------------
.globl mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E_
.align 16, 0x90
.type mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E_,@function
mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E_: # @mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E_
# BB#0: # %allocas
pushq %rbp
pushq %r15
pushq %r14
pushq %rbx
subq $88, %rsp
vmovups %ymm4, 32(%rsp) # 32-byte Folded Spill
# Move the integer/pointer arguments into callee-saved registers so they
# survive the call to ISPCAlloc.
movq %rcx, %r14
movl %edx, %r15d
movl %esi, %ebx
movl %edi, %ebp
vmovss %xmm1, 76(%rsp) # 4-byte Spill
vmovss %xmm0, 28(%rsp) # 4-byte Spill
# Per-element steps: (xmm2 - xmm0) / float(edi) and (xmm3 - xmm1) / float(esi).
vcvtsi2ssl %ebp, %xmm0, %xmm5
vsubss %xmm0, %xmm2, %xmm4
vcvtsi2ssl %ebx, %xmm0, %xmm2
vsubss %xmm1, %xmm3, %xmm3
# Launch handle starts out null; its address is the first ISPCAlloc argument.
movq $0, 80(%rsp)
leaq 80(%rsp), %rdi
vdivss %xmm2, %xmm3, %xmm1
vmovss %xmm1, 72(%rsp) # 4-byte Spill
vdivss %xmm5, %xmm4, %xmm0
vmovss %xmm0, 68(%rsp) # 4-byte Spill
# ISPCAlloc(&handle, 96, 32); the result in rax is kept in rdx as the block
# pointer. NOTE(review): 96/32 presumably are size and alignment -- confirm
# against ISPCAlloc's declaration.
movl $96, %esi
movl $32, %edx
vzeroupper
callq ISPCAlloc
vmovups 32(%rsp), %ymm0 # 32-byte Folded Reload
movq %rax, %rdx
# r8d = esi_arg / 16 and ecx = edi_arg / 16 as signed divisions rounding
# toward zero (the sar/shr/add sequence adds 15 to negative inputs first).
movl %ebx, %r8d
sarl $31, %r8d
shrl $28, %r8d
addl %ebx, %r8d
vmovss 28(%rsp), %xmm1 # 4-byte Reload
vmovss %xmm1, (%rdx)
sarl $4, %r8d
movl %ebp, %ecx
sarl $31, %ecx
shrl $28, %ecx
addl %ebp, %ecx
sarl $4, %ecx
# Test whether all 8 mask lanes are on. The flags set here are consumed by the
# jne further down; only move instructions sit in between.
vmovmskps %ymm0, %eax
cmpl $255, %eax
# Fill the argument block:
#   +0 xmm0 | +4 step from edi | +8 xmm1 | +12 step from esi
#   +16 edi | +20 esi | +24 16 | +28 16 | +32 edx | +40 rcx | +64 mask
vmovss 68(%rsp), %xmm1 # 4-byte Reload
vmovss %xmm1, 4(%rdx)
vmovss 76(%rsp), %xmm1 # 4-byte Reload
vmovss %xmm1, 8(%rdx)
vmovss 72(%rsp), %xmm1 # 4-byte Reload
vmovss %xmm1, 12(%rdx)
movl %ebp, 16(%rdx)
movl %ebx, 20(%rdx)
movl $16, 24(%rdx)
movl $16, 28(%rdx)
movl %r15d, 32(%rdx)
movq %r14, 40(%rdx)
jne .LBB0_2
# BB#1: # %all_on
# All lanes on: rebuild the mask as a 256-bit all-ones value.
vpcmpeqd %xmm0, %xmm0, %xmm0
vinsertf128 $1, %xmm0, %ymm0, %ymm0
.LBB0_2: # %all_on
# Aligned 32-byte store of the mask at +64 of the block.
vmovaps %ymm0, 64(%rdx)
# ISPCLaunch(&handle, task function, block in rdx, ecx, r8d, 1).
# NOTE(review): the task address is loaded as a 32-bit absolute immediate,
# which assumes a non-PIC build.
leaq 80(%rsp), %rdi
movl $mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E_, %esi
movl $1, %r9d
vzeroupper
callq ISPCLaunch
# Sync only if the launch left a non-null handle, then clear it.
movq 80(%rsp), %rdi
testq %rdi, %rdi
je .LBB0_4
# BB#3: # %call_sync
callq ISPCSync
movq $0, 80(%rsp)
.LBB0_4: # %post_sync
addq $88, %rsp
popq %rbx
popq %r14
popq %r15
popq %rbp
ret
.Ltmp0:
.size mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E_, .Ltmp0-mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E_
# ----------------------------------------------------------------------
# mandelbrot_ispc
# Syntax: AT&T (GAS), x86-64 with AVX.
# Inputs read by this body:
#   xmm0, xmm1, xmm2, xmm3 : four scalar floats
#   edi, esi, edx          : three 32-bit ints
#   rcx                    : 64-bit value stored as-is into the argument block
# Behaviour: same sequence as the mask-taking variant, but no mask is read;
# an all-ones 256-bit mask is always stored in the argument block. Allocates
# the block through ISPCAlloc, fills it, launches
# mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E_ through
# ISPCLaunch with counts (edi/16, esi/16, 1), then calls ISPCSync if the
# launch handle is non-null.
# Stack: 5 pushes + 32 bytes of locals, so rsp is 16-byte aligned at each call.
#   8(%rsp) xmm0 | 12(%rsp) (xmm2-xmm0)/float(edi) | 16(%rsp) (xmm3-xmm1)/float(esi)
#   20(%rsp) xmm1 | 24(%rsp) launch handle (address kept in r12)
# ----------------------------------------------------------------------
.globl mandelbrot_ispc
.align 16, 0x90
.type mandelbrot_ispc,@function
mandelbrot_ispc: # @mandelbrot_ispc
# BB#0: # %allocas
pushq %rbp
pushq %r15
pushq %r14
pushq %r12
pushq %rbx
subq $32, %rsp
# Move the integer/pointer arguments into callee-saved registers so they
# survive the call to ISPCAlloc.
movq %rcx, %r14
movl %edx, %r15d
movl %esi, %ebx
movl %edi, %ebp
vmovss %xmm1, 20(%rsp) # 4-byte Spill
vmovss %xmm0, 8(%rsp) # 4-byte Spill
# Per-element steps: (xmm2 - xmm0) / float(edi) and (xmm3 - xmm1) / float(esi).
vcvtsi2ssl %ebp, %xmm0, %xmm5
vsubss %xmm0, %xmm2, %xmm4
vcvtsi2ssl %ebx, %xmm0, %xmm2
vsubss %xmm1, %xmm3, %xmm3
# Launch handle starts out null; r12 keeps its address across both calls.
movq $0, 24(%rsp)
leaq 24(%rsp), %r12
vdivss %xmm2, %xmm3, %xmm1
vmovss %xmm1, 16(%rsp) # 4-byte Spill
vdivss %xmm5, %xmm4, %xmm0
vmovss %xmm0, 12(%rsp) # 4-byte Spill
# ISPCAlloc(&handle, 96, 32); the block pointer comes back in rax.
# NOTE(review): 96/32 presumably are size and alignment -- confirm against
# ISPCAlloc's declaration.
movq %r12, %rdi
movl $96, %esi
movl $32, %edx
callq ISPCAlloc
# r8d = esi_arg / 16 and ecx = edi_arg / 16 as signed divisions rounding
# toward zero (the sar/shr/add sequence adds 15 to negative inputs first).
# Interleaved with that: xmm0 is set to all ones as the low half of the mask.
movl %ebx, %r8d
sarl $31, %r8d
vpcmpeqd %xmm0, %xmm0, %xmm0
vmovss 8(%rsp), %xmm1 # 4-byte Reload
vmovss %xmm1, (%rax)
shrl $28, %r8d
addl %ebx, %r8d
movl %ebp, %ecx
sarl $31, %ecx
shrl $28, %ecx
addl %ebp, %ecx
sarl $4, %ecx
sarl $4, %r8d
# Widen the all-ones value to 256 bits.
vinsertf128 $1, %xmm0, %ymm0, %ymm0
# Fill the argument block:
#   +0 xmm0 | +4 step from edi | +8 xmm1 | +12 step from esi
#   +16 edi | +20 esi | +24 16 | +28 16 | +32 edx | +40 rcx | +64 mask
vmovss 12(%rsp), %xmm1 # 4-byte Reload
vmovss %xmm1, 4(%rax)
vmovss 20(%rsp), %xmm1 # 4-byte Reload
vmovss %xmm1, 8(%rax)
vmovss 16(%rsp), %xmm1 # 4-byte Reload
vmovss %xmm1, 12(%rax)
movl %ebp, 16(%rax)
movl %ebx, 20(%rax)
movl $16, 24(%rax)
movl $16, 28(%rax)
movl %r15d, 32(%rax)
movq %r14, 40(%rax)
vmovaps %ymm0, 64(%rax)
# ISPCLaunch(&handle, task function, block, ecx, r8d, 1).
# NOTE(review): the task address is loaded as a 32-bit absolute immediate,
# which assumes a non-PIC build.
movq %r12, %rdi
movl $mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E_, %esi
movq %rax, %rdx
movl $1, %r9d
vzeroupper
callq ISPCLaunch
# Sync only if the launch left a non-null handle, then clear it.
movq 24(%rsp), %rdi
testq %rdi, %rdi
je .LBB1_2
# BB#1: # %call_sync
callq ISPCSync
movq $0, 24(%rsp)
.LBB1_2: # %post_sync
addq $32, %rsp
popq %rbx
popq %r12
popq %r14
popq %r15
popq %rbp
ret
.Ltmp1:
.size mandelbrot_ispc, .Ltmp1-mandelbrot_ispc
.section ".note.GNU-stack","",@progbits

View File

@@ -0,0 +1,171 @@
.file "mandelbrot_task.ispc"
.text
# ----------------------------------------------------------------------
# mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E_
# Syntax: AT&T (GAS), x86-64 with AVX.
# Inputs read by this body:
#   xmm0, xmm1, xmm2, xmm3 : four scalar floats
#   edi, esi, edx          : three 32-bit ints
#   rcx                    : 64-bit value stored as-is into the argument block
#   ymm4                   : 8-lane mask (tested with vmovmskps against 255)
# Behaviour: allocates a 96-byte, 32-byte-aligned argument block through
# ISPCAlloc, fills it, launches
# mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E_ through
# ISPCLaunch with counts (edi/16, esi/16, 1), then calls ISPCSync if the
# launch handle is non-null.
# Stack: 4 pushes + 88 bytes of locals, so rsp is 16-byte aligned at each call.
#   28(%rsp) xmm0 | 32(%rsp) ymm4 | 68(%rsp) (xmm2-xmm0)/float(edi)
#   72(%rsp) (xmm3-xmm1)/float(esi) | 76(%rsp) xmm1 | 80(%rsp) launch handle
# ----------------------------------------------------------------------
.globl mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E_
.align 16, 0x90
.type mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E_,@function
mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E_: # @mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E_
# BB#0: # %allocas
pushq %rbp
pushq %r15
pushq %r14
pushq %rbx
subq $88, %rsp
vmovups %ymm4, 32(%rsp) # 32-byte Folded Spill
# Move the integer/pointer arguments into callee-saved registers so they
# survive the call to ISPCAlloc.
movq %rcx, %r14
movl %edx, %r15d
movl %esi, %ebx
movl %edi, %ebp
vmovss %xmm1, 76(%rsp) # 4-byte Spill
vmovss %xmm0, 28(%rsp) # 4-byte Spill
# Per-element steps: (xmm2 - xmm0) / float(edi) and (xmm3 - xmm1) / float(esi).
vcvtsi2ssl %ebp, %xmm0, %xmm5
vsubss %xmm0, %xmm2, %xmm4
vcvtsi2ssl %ebx, %xmm0, %xmm2
vsubss %xmm1, %xmm3, %xmm3
# Launch handle starts out null; its address is the first ISPCAlloc argument.
movq $0, 80(%rsp)
leaq 80(%rsp), %rdi
vdivss %xmm2, %xmm3, %xmm1
vmovss %xmm1, 72(%rsp) # 4-byte Spill
vdivss %xmm5, %xmm4, %xmm0
vmovss %xmm0, 68(%rsp) # 4-byte Spill
# ISPCAlloc(&handle, 96, 32); the result in rax is kept in rdx as the block
# pointer. NOTE(review): 96/32 presumably are size and alignment -- confirm
# against ISPCAlloc's declaration.
movl $96, %esi
movl $32, %edx
vzeroupper
callq ISPCAlloc
vmovups 32(%rsp), %ymm0 # 32-byte Folded Reload
movq %rax, %rdx
# r8d = esi_arg / 16 and ecx = edi_arg / 16 as signed divisions rounding
# toward zero (the sar/shr/add sequence adds 15 to negative inputs first).
movl %ebx, %r8d
sarl $31, %r8d
shrl $28, %r8d
addl %ebx, %r8d
vmovss 28(%rsp), %xmm1 # 4-byte Reload
vmovss %xmm1, (%rdx)
sarl $4, %r8d
movl %ebp, %ecx
sarl $31, %ecx
shrl $28, %ecx
addl %ebp, %ecx
sarl $4, %ecx
# Test whether all 8 mask lanes are on. The flags set here are consumed by the
# jne further down; only move instructions sit in between.
vmovmskps %ymm0, %eax
cmpl $255, %eax
# Fill the argument block:
#   +0 xmm0 | +4 step from edi | +8 xmm1 | +12 step from esi
#   +16 edi | +20 esi | +24 16 | +28 16 | +32 edx | +40 rcx | +64 mask
vmovss 68(%rsp), %xmm1 # 4-byte Reload
vmovss %xmm1, 4(%rdx)
vmovss 76(%rsp), %xmm1 # 4-byte Reload
vmovss %xmm1, 8(%rdx)
vmovss 72(%rsp), %xmm1 # 4-byte Reload
vmovss %xmm1, 12(%rdx)
movl %ebp, 16(%rdx)
movl %ebx, 20(%rdx)
movl $16, 24(%rdx)
movl $16, 28(%rdx)
movl %r15d, 32(%rdx)
movq %r14, 40(%rdx)
jne .LBB0_2
# BB#1: # %all_on
# All lanes on: rebuild the mask as a 256-bit all-ones value.
vpcmpeqd %xmm0, %xmm0, %xmm0
vinsertf128 $1, %xmm0, %ymm0, %ymm0
.LBB0_2: # %all_on
# Aligned 32-byte store of the mask at +64 of the block.
vmovaps %ymm0, 64(%rdx)
# ISPCLaunch(&handle, task function, block in rdx, ecx, r8d, 1).
# NOTE(review): the task address is loaded as a 32-bit absolute immediate,
# which assumes a non-PIC build.
leaq 80(%rsp), %rdi
movl $mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E_, %esi
movl $1, %r9d
vzeroupper
callq ISPCLaunch
# Sync only if the launch left a non-null handle, then clear it.
movq 80(%rsp), %rdi
testq %rdi, %rdi
je .LBB0_4
# BB#3: # %call_sync
callq ISPCSync
movq $0, 80(%rsp)
.LBB0_4: # %post_sync
addq $88, %rsp
popq %rbx
popq %r14
popq %r15
popq %rbp
ret
.Ltmp0:
.size mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E_, .Ltmp0-mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E_
# ----------------------------------------------------------------------
# mandelbrot_ispc
# Syntax: AT&T (GAS), x86-64 with AVX.
# Inputs read by this body:
#   xmm0, xmm1, xmm2, xmm3 : four scalar floats
#   edi, esi, edx          : three 32-bit ints
#   rcx                    : 64-bit value stored as-is into the argument block
# Behaviour: same sequence as the mask-taking variant, but no mask is read;
# an all-ones 256-bit mask is always stored in the argument block. Allocates
# the block through ISPCAlloc, fills it, launches
# mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E_ through
# ISPCLaunch with counts (edi/16, esi/16, 1), then calls ISPCSync if the
# launch handle is non-null.
# Stack: 5 pushes + 32 bytes of locals, so rsp is 16-byte aligned at each call.
#   8(%rsp) xmm0 | 12(%rsp) (xmm2-xmm0)/float(edi) | 16(%rsp) (xmm3-xmm1)/float(esi)
#   20(%rsp) xmm1 | 24(%rsp) launch handle (address kept in r12)
# ----------------------------------------------------------------------
.globl mandelbrot_ispc
.align 16, 0x90
.type mandelbrot_ispc,@function
mandelbrot_ispc: # @mandelbrot_ispc
# BB#0: # %allocas
pushq %rbp
pushq %r15
pushq %r14
pushq %r12
pushq %rbx
subq $32, %rsp
# Move the integer/pointer arguments into callee-saved registers so they
# survive the call to ISPCAlloc.
movq %rcx, %r14
movl %edx, %r15d
movl %esi, %ebx
movl %edi, %ebp
vmovss %xmm1, 20(%rsp) # 4-byte Spill
vmovss %xmm0, 8(%rsp) # 4-byte Spill
# Per-element steps: (xmm2 - xmm0) / float(edi) and (xmm3 - xmm1) / float(esi).
vcvtsi2ssl %ebp, %xmm0, %xmm5
vsubss %xmm0, %xmm2, %xmm4
vcvtsi2ssl %ebx, %xmm0, %xmm2
vsubss %xmm1, %xmm3, %xmm3
# Launch handle starts out null; r12 keeps its address across both calls.
movq $0, 24(%rsp)
leaq 24(%rsp), %r12
vdivss %xmm2, %xmm3, %xmm1
vmovss %xmm1, 16(%rsp) # 4-byte Spill
vdivss %xmm5, %xmm4, %xmm0
vmovss %xmm0, 12(%rsp) # 4-byte Spill
# ISPCAlloc(&handle, 96, 32); the block pointer comes back in rax.
# NOTE(review): 96/32 presumably are size and alignment -- confirm against
# ISPCAlloc's declaration.
movq %r12, %rdi
movl $96, %esi
movl $32, %edx
callq ISPCAlloc
# r8d = esi_arg / 16 and ecx = edi_arg / 16 as signed divisions rounding
# toward zero (the sar/shr/add sequence adds 15 to negative inputs first).
# Interleaved with that: xmm0 is set to all ones as the low half of the mask.
movl %ebx, %r8d
sarl $31, %r8d
vpcmpeqd %xmm0, %xmm0, %xmm0
vmovss 8(%rsp), %xmm1 # 4-byte Reload
vmovss %xmm1, (%rax)
shrl $28, %r8d
addl %ebx, %r8d
movl %ebp, %ecx
sarl $31, %ecx
shrl $28, %ecx
addl %ebp, %ecx
sarl $4, %ecx
sarl $4, %r8d
# Widen the all-ones value to 256 bits.
vinsertf128 $1, %xmm0, %ymm0, %ymm0
# Fill the argument block:
#   +0 xmm0 | +4 step from edi | +8 xmm1 | +12 step from esi
#   +16 edi | +20 esi | +24 16 | +28 16 | +32 edx | +40 rcx | +64 mask
vmovss 12(%rsp), %xmm1 # 4-byte Reload
vmovss %xmm1, 4(%rax)
vmovss 20(%rsp), %xmm1 # 4-byte Reload
vmovss %xmm1, 8(%rax)
vmovss 16(%rsp), %xmm1 # 4-byte Reload
vmovss %xmm1, 12(%rax)
movl %ebp, 16(%rax)
movl %ebx, 20(%rax)
movl $16, 24(%rax)
movl $16, 28(%rax)
movl %r15d, 32(%rax)
movq %r14, 40(%rax)
vmovaps %ymm0, 64(%rax)
# ISPCLaunch(&handle, task function, block, ecx, r8d, 1).
# NOTE(review): the task address is loaded as a 32-bit absolute immediate,
# which assumes a non-PIC build.
movq %r12, %rdi
movl $mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E_, %esi
movq %rax, %rdx
movl $1, %r9d
vzeroupper
callq ISPCLaunch
# Sync only if the launch left a non-null handle, then clear it.
movq 24(%rsp), %rdi
testq %rdi, %rdi
je .LBB1_2
# BB#1: # %call_sync
callq ISPCSync
movq $0, 24(%rsp)
.LBB1_2: # %post_sync
addq $32, %rsp
popq %rbx
popq %r12
popq %r14
popq %r15
popq %rbp
ret
.Ltmp1:
.size mandelbrot_ispc, .Ltmp1-mandelbrot_ispc
.section ".note.GNU-stack","",@progbits

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,208 @@
; ModuleID = 'mandelbrot_task_nvptx64.bc'
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
target triple = "nvptx64"
@__str = internal constant [66 x i8] c"mandelbrot_task.ispc:55:3: Assertion failed: xspan >= vectorWidth\00"
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #0
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #0
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.warpsize() #0
; Function Attrs: nounwind
declare i32 @puts(i8* nocapture) #1
; Function Attrs: noreturn
declare void @abort() #2
; Function Attrs: nounwind
; Kernel: computes one tile of the Mandelbrot image per CTA and writes one i32
; iteration count per in-range element to %output.
;   c_re = %x0 + (column + lane) * %dx,  c_im = %y0 + row * %dy
;   tile columns: [ctaid.x * %xspan, min(ctaid.x * %xspan + %xspan, %width))
;   tile rows:    [ctaid.y * %yspan, min(ctaid.y * %yspan + %yspan, %height))
;   lane = tid.x & (warpsize - 1); the column advances by warpsize per step.
; Aborts (puts + abort) when warpsize > %xspan.
; All "varying" values are modelled as <1 x T> vectors.
define void @mandelbrot_scanline(float %x0, float %dx, float %y0, float %dy, i32 %width, i32 %height, i32 %xspan, i32 %yspan, i32 %maxIterations, i32* %output) #3 {
allocas:
; Column end (%r.i.i) = min(xstart + xspan, width); row start/end candidates.
%bid.i.i = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
%mul_calltmp_xspan_load = mul i32 %bid.i.i, %xspan
%add_xstart_load_xspan_load13 = add i32 %mul_calltmp_xspan_load, %xspan
%c.i.i = icmp slt i32 %add_xstart_load_xspan_load13, %width
%r.i.i = select i1 %c.i.i, i32 %add_xstart_load_xspan_load13, i32 %width
%bid.i.i77 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #1
%mul_calltmp19_yspan_load = mul i32 %bid.i.i77, %yspan
%add_ystart_load_yspan_load20 = add i32 %mul_calltmp19_yspan_load, %yspan
; Assertion: fail when warpsize > xspan (see the message in @__str).
%tid.i.i80 = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() #1
%greaterequal_xspan_load24_calltmp27 = icmp sgt i32 %tid.i.i80, %xspan
br i1 %greaterequal_xspan_load24_calltmp27, label %fail.i, label %for_test.preheader
for_test.preheader: ; preds = %allocas
; Row end (%r.i.i79) = min(ystart + yspan, height); leave when the row range is empty.
%c.i.i78 = icmp slt i32 %add_ystart_load_yspan_load20, %height
%r.i.i79 = select i1 %c.i.i78, i32 %add_ystart_load_yspan_load20, i32 %height
%less_yi_load_yend_load113 = icmp slt i32 %mul_calltmp19_yspan_load, %r.i.i79
br i1 %less_yi_load_yend_load113, label %for_test34.preheader.lr.ph, label %for_exit
for_test34.preheader.lr.ph: ; preds = %for_test.preheader
; Loop-invariant setup: column-range-non-empty flag, (maxIterations > 0) flag,
; output pointer as an integer, and %5 = ~smax(~height, ~((ctaid.y+1)*yspan)),
; i.e. min(height, (ctaid.y+1)*yspan), the row index at which the row loop stops.
%less_xi_load_xend_load111 = icmp slt i32 %mul_calltmp_xspan_load, %r.i.i
%maxIterations_load_broadcast_init = insertelement <1 x i32> undef, i32 %maxIterations, i32 0
%less_i_load_count_load.i102 = icmp sgt <1 x i32> %maxIterations_load_broadcast_init, zeroinitializer
%v.i.i104 = extractelement <1 x i1> %less_i_load_count_load.i102, i32 0
%output_load_ptr2int = ptrtoint i32* %output to i64
%0 = xor i32 %height, -1
%1 = add i32 %bid.i.i77, 1
%2 = mul i32 %1, %yspan
%3 = xor i32 %2, -1
%4 = icmp sgt i32 %0, %3
%smax = select i1 %4, i32 %0, i32 %3
%5 = xor i32 %smax, -1
br label %for_test34.preheader
fail.i: ; preds = %allocas
; Assertion failure path: print the message and abort.
%call.i = call i32 @puts(i8* getelementptr inbounds ([66 x i8]* @__str, i64 0, i64 0)) #1
call void @abort() #4
unreachable
for_test34.preheader: ; preds = %for_exit37, %for_test34.preheader.lr.ph
; Row loop header: %yi.0114 is the current row.
%yi.0114 = phi i32 [ %mul_calltmp19_yspan_load, %for_test34.preheader.lr.ph ], [ %yi_load71_plus1, %for_exit37 ]
br i1 %less_xi_load_xend_load111, label %for_loop36.lr.ph, label %for_exit37
for_loop36.lr.ph: ; preds = %for_test34.preheader
; Per-row values: c_im = y0 + yi*dy and the row's element offset yi*width.
; Branch to the ".us" blocks (iterating path) when maxIterations > 0, otherwise
; to the blocks that only store zeros.
%yi_load46_to_float = sitofp i32 %yi.0114 to float
%mul_yi_load46_to_float_dy_load = fmul float %yi_load46_to_float, %dy
%add_y0_load_mul_yi_load46_to_float_dy_load = fadd float %mul_yi_load46_to_float_dy_load, %y0
%add_y0_load_mul_yi_load46_to_float_dy_load_broadcast_init = insertelement <1 x float> undef, float %add_y0_load_mul_yi_load46_to_float_dy_load, i32 0
%mul_yi_load50_width_load51 = mul i32 %yi.0114, %width
br i1 %v.i.i104, label %for_loop.i.lr.ph.us, label %mandel___vyfvyfvyi.exit
mandel___vyfvyfvyi.exit.us: ; preds = %for_step.i.us
; Iteration finished for this column step: bounds-check lane + column.
%tid.i.i72.us = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
%tid.i.i.i.us = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() #1
%sub_calltmp3_.i.us = add i32 %tid.i.i.i.us, -1
%bitop.i.us = and i32 %sub_calltmp3_.i.us, %tid.i.i72.us
%add_xi_load56_calltmp59.us = add i32 %bitop.i.us, %xi.0112.us
%less_add_xi_load56_calltmp59_xend_load60.us = icmp slt i32 %add_xi_load56_calltmp59.us, %r.i.i
br i1 %less_add_xi_load56_calltmp59_xend_load60.us, label %if_then.us, label %if_exit.us
if_then.us: ; preds = %mandel___vyfvyfvyi.exit.us
; Store the iteration count at output + 4 * (yi*width + xi + lane).
; The lane mask here is warpsize + 1073741823 (0x3FFFFFFF); it differs from
; warpsize - 1 only in bit 30, which the shl by 2 below discards.
%tid.i.i.i74.us = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() #1
%sub_calltmp3_.i75.us = add i32 %tid.i.i.i74.us, 1073741823
%tid.i.i73.us = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
%bitop.i76.us = and i32 %sub_calltmp3_.i75.us, %tid.i.i73.us
%add_xi_load52_calltmp55.us = add i32 %xi.0112.us, %mul_yi_load50_width_load51
%add_mul_yi_load50_width_load51_add_xi_load52_calltmp55.us = add i32 %add_xi_load52_calltmp55.us, %bitop.i76.us
%6 = shl i32 %add_mul_yi_load50_width_load51_add_xi_load52_calltmp55.us, 2
%iptr__id.i.rhs.us = sext i32 %6 to i64
%iptr__id.i.us = add i64 %iptr__id.i.rhs.us, %output_load_ptr2int
%ptr__id.i.us = inttoptr i64 %iptr__id.i.us to i32*
%val__id.i.us = extractelement <1 x i32> %v1.i92.us, i32 0
store i32 %val__id.i.us, i32* %ptr__id.i.us, align 4
br label %if_exit.us
if_exit.us: ; preds = %if_then.us, %mandel___vyfvyfvyi.exit.us
; Advance the column by warpsize and loop while it is below the column end.
%tid.i.i.us = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() #1
%add_xi_load70_calltmp68.us = add i32 %tid.i.i.us, %xi.0112.us
%less_xi_load_xend_load.us = icmp slt i32 %add_xi_load70_calltmp68.us, %r.i.i
br i1 %less_xi_load_xend_load.us, label %for_loop.i.lr.ph.us, label %for_exit37
for_loop.i.us: ; preds = %for_loop.i.lr.ph.us, %for_step.i.us
; Iteration loop header. Loop-carried state: count-below-limit flag, active
; mask, break mask, z_re, z_im and the iteration count.
%less_i_load_count_load.i110.us = phi <1 x i1> [ %less_i_load_count_load.i102, %for_loop.i.lr.ph.us ], [ %less_i_load_count_load.i.us, %for_step.i.us ]
%internal_mask_memory.0.i109.us = phi <1 x i1> [ <i1 true>, %for_loop.i.lr.ph.us ], [ %internal_mask_memory.1.i.us, %for_step.i.us ]
%break_lanes_memory.0.i108.us = phi <1 x i1> [ zeroinitializer, %for_loop.i.lr.ph.us ], [ %"mask|break_mask.i.us", %for_step.i.us ]
%v1.i9096107.us = phi <1 x float> [ %add_x0_load_mul_add_xi_load42_calltmp45_to_float_dx_load_broadcast_init.us, %for_loop.i.lr.ph.us ], [ %v1.i9095.us, %for_step.i.us ]
%v1.i8898106.us = phi <1 x float> [ %add_y0_load_mul_yi_load46_to_float_dy_load_broadcast_init, %for_loop.i.lr.ph.us ], [ %v1.i8897.us, %for_step.i.us ]
%v1.i9299105.us = phi <1 x i32> [ zeroinitializer, %for_loop.i.lr.ph.us ], [ %v1.i92.us, %for_step.i.us ]
; Break test: z_re^2 + z_im^2 > 4.0 (ugt is also true for unordered/NaN).
; Skip the update when the break mask equals the running mask.
%"oldMask&test.i.us" = and <1 x i1> %internal_mask_memory.0.i109.us, %less_i_load_count_load.i110.us
%mul_z_re_load_z_re_load13.i.us = fmul <1 x float> %v1.i9096107.us, %v1.i9096107.us
%mul_z_im_load_z_im_load14.i.us = fmul <1 x float> %v1.i8898106.us, %v1.i8898106.us
%add_mul_z_re_load_z_re_load13_mul_z_im_load_z_im_load14.i.us = fadd <1 x float> %mul_z_im_load_z_im_load14.i.us, %mul_z_re_load_z_re_load13.i.us
%greater_add_mul_z_re_load_z_re_load13_mul_z_im_load_z_im_load14_.i.us = fcmp ugt <1 x float> %add_mul_z_re_load_z_re_load13_mul_z_im_load_z_im_load14.i.us, <float 4.000000e+00>
%"oldMask&test16.i.us" = and <1 x i1> %"oldMask&test.i.us", %greater_add_mul_z_re_load_z_re_load13_mul_z_im_load_z_im_load14_.i.us
%"mask|break_mask.i.us" = or <1 x i1> %"oldMask&test16.i.us", %break_lanes_memory.0.i108.us
%v.i2.i.us = extractelement <1 x i1> %"mask|break_mask.i.us", i32 0
%v.i1.i.us = extractelement <1 x i1> %"oldMask&test.i.us", i32 0
%"equal_finished&func_internal_mask&function_mask12.itmp.us" = xor i1 %v.i2.i.us, %v.i1.i.us
br i1 %"equal_finished&func_internal_mask&function_mask12.itmp.us", label %not_all_continued_or_breaked.i.us, label %for_step.i.us
not_all_continued_or_breaked.i.us: ; preds = %for_loop.i.us
; Update step: new_re = c_re + (z_re^2 - z_im^2), new_im = c_im + 2*z_re*z_im;
; the new mask drops lanes that have broken out.
%"!(break|continue)_lanes.i.us" = xor <1 x i1> %"mask|break_mask.i.us", <i1 true>
%new_mask28.i.us = and <1 x i1> %"oldMask&test.i.us", %"!(break|continue)_lanes.i.us"
%sub_mul_z_re_load31_z_re_load32_mul_z_im_load33_z_im_load34.i.us = fsub <1 x float> %mul_z_re_load_z_re_load13.i.us, %mul_z_im_load_z_im_load14.i.us
%mul__z_re_load35.i.us = fmul <1 x float> %v1.i9096107.us, <float 2.000000e+00>
%mul_mul__z_re_load35_z_im_load36.i.us = fmul <1 x float> %v1.i8898106.us, %mul__z_re_load35.i.us
%add_c_re_load42_new_re_load.i.us = fadd <1 x float> %add_x0_load_mul_add_xi_load42_calltmp45_to_float_dx_load_broadcast_init.us, %sub_mul_z_re_load31_z_re_load32_mul_z_im_load33_z_im_load34.i.us
%add_c_im_load44_new_im_load.i.us = fadd <1 x float> %add_y0_load_mul_yi_load46_to_float_dy_load_broadcast_init, %mul_mul__z_re_load35_z_im_load36.i.us
br label %for_step.i.us
for_step.i.us: ; preds = %not_all_continued_or_breaked.i.us, %for_loop.i.us
; Merge the two paths, increment the count only where the mask is still set,
; and loop while the mask is set and the count is below maxIterations.
%v1.i8897.us = phi <1 x float> [ %v1.i8898106.us, %for_loop.i.us ], [ %add_c_im_load44_new_im_load.i.us, %not_all_continued_or_breaked.i.us ]
%v1.i9095.us = phi <1 x float> [ %v1.i9096107.us, %for_loop.i.us ], [ %add_c_re_load42_new_re_load.i.us, %not_all_continued_or_breaked.i.us ]
%internal_mask_memory.1.i.us = phi <1 x i1> [ zeroinitializer, %for_loop.i.us ], [ %new_mask28.i.us, %not_all_continued_or_breaked.i.us ]
%i_load53_plus1.i.us = add <1 x i32> %v1.i9299105.us, <i32 1>
%v1.i92.us = select <1 x i1> %internal_mask_memory.1.i.us, <1 x i32> %i_load53_plus1.i.us, <1 x i32> %v1.i9299105.us
%less_i_load_count_load.i.us = icmp slt <1 x i32> %v1.i92.us, %maxIterations_load_broadcast_init
%"internal_mask&function_mask10.i.us" = and <1 x i1> %internal_mask_memory.1.i.us, %less_i_load_count_load.i.us
%v.i.i.us = extractelement <1 x i1> %"internal_mask&function_mask10.i.us", i32 0
br i1 %v.i.i.us, label %for_loop.i.us, label %mandel___vyfvyfvyi.exit.us
for_loop.i.lr.ph.us: ; preds = %if_exit.us, %for_loop36.lr.ph
; Column loop header (iterating path): %xi.0112.us is the current column;
; c_re = x0 + (xi + lane) * dx.
%xi.0112.us = phi i32 [ %add_xi_load70_calltmp68.us, %if_exit.us ], [ %mul_calltmp_xspan_load, %for_loop36.lr.ph ]
%tid.i.i81.us = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
%tid.i.i.i82.us = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() #1
%sub_calltmp3_.i83.us = add i32 %tid.i.i.i82.us, -1
%bitop.i84.us = and i32 %sub_calltmp3_.i83.us, %tid.i.i81.us
%add_xi_load42_calltmp45.us = add i32 %bitop.i84.us, %xi.0112.us
%add_xi_load42_calltmp45_to_float.us = sitofp i32 %add_xi_load42_calltmp45.us to float
%mul_add_xi_load42_calltmp45_to_float_dx_load.us = fmul float %add_xi_load42_calltmp45_to_float.us, %dx
%add_x0_load_mul_add_xi_load42_calltmp45_to_float_dx_load.us = fadd float %mul_add_xi_load42_calltmp45_to_float_dx_load.us, %x0
%add_x0_load_mul_add_xi_load42_calltmp45_to_float_dx_load_broadcast_init.us = insertelement <1 x float> undef, float %add_x0_load_mul_add_xi_load42_calltmp45_to_float_dx_load.us, i32 0
br label %for_loop.i.us
for_exit: ; preds = %for_exit37, %for_test.preheader
ret void
mandel___vyfvyfvyi.exit: ; preds = %if_exit, %for_loop36.lr.ph
; Column loop header for maxIterations <= 0: only the bounds check remains.
%xi.0112 = phi i32 [ %add_xi_load70_calltmp68, %if_exit ], [ %mul_calltmp_xspan_load, %for_loop36.lr.ph ]
%tid.i.i72 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
%tid.i.i.i = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() #1
%sub_calltmp3_.i = add i32 %tid.i.i.i, -1
%bitop.i = and i32 %sub_calltmp3_.i, %tid.i.i72
%add_xi_load56_calltmp59 = add i32 %bitop.i, %xi.0112
%less_add_xi_load56_calltmp59_xend_load60 = icmp slt i32 %add_xi_load56_calltmp59, %r.i.i
br i1 %less_add_xi_load56_calltmp59_xend_load60, label %if_then, label %if_exit
for_exit37: ; preds = %if_exit, %if_exit.us, %for_test34.preheader
; Next row; stop when the row index reaches %5.
%yi_load71_plus1 = add i32 %yi.0114, 1
%exitcond = icmp eq i32 %yi_load71_plus1, %5
br i1 %exitcond, label %for_exit, label %for_test34.preheader
if_then: ; preds = %mandel___vyfvyfvyi.exit
; Store 0 at output + 4 * (yi*width + xi + lane); same 1073741823 lane-mask
; form as in the iterating path.
%tid.i.i.i74 = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() #1
%sub_calltmp3_.i75 = add i32 %tid.i.i.i74, 1073741823
%tid.i.i73 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
%bitop.i76 = and i32 %sub_calltmp3_.i75, %tid.i.i73
%add_xi_load52_calltmp55 = add i32 %xi.0112, %mul_yi_load50_width_load51
%add_mul_yi_load50_width_load51_add_xi_load52_calltmp55 = add i32 %add_xi_load52_calltmp55, %bitop.i76
%7 = shl i32 %add_mul_yi_load50_width_load51_add_xi_load52_calltmp55, 2
%iptr__id.i.rhs = sext i32 %7 to i64
%iptr__id.i = add i64 %iptr__id.i.rhs, %output_load_ptr2int
%ptr__id.i = inttoptr i64 %iptr__id.i to i32*
store i32 0, i32* %ptr__id.i, align 4
br label %if_exit
if_exit: ; preds = %if_then, %mandel___vyfvyfvyi.exit
; Advance the column by warpsize and loop while it is below the column end.
%tid.i.i = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() #1
%add_xi_load70_calltmp68 = add i32 %tid.i.i, %xi.0112
%less_xi_load_xend_load = icmp slt i32 %add_xi_load70_calltmp68, %r.i.i
br i1 %less_xi_load_xend_load, label %mandel___vyfvyfvyi.exit, label %for_exit37
}
attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }
attributes #2 = { noreturn }
attributes #3 = { nounwind "target-features"="+sm_35" }
attributes #4 = { noreturn nounwind }
!nvvm.annotations = !{!0}
!0 = metadata !{void (float, float, float, float, i32, i32, i32, i32, i32, i32*)* @mandelbrot_scanline, metadata !"kernel", i32 1}

View File

@@ -0,0 +1,229 @@
//
// Generated by LLVM NVPTX Back-End
//
.version 3.1
.target sm_35, texmode_independent
.address_size 64
// .globl mandelbrot_scanline
// Prototype of puts: one 64-bit argument, 32-bit return value. Only declared here;
// it is called from the assertion-failure path of mandelbrot_scanline.
.func (.param .b32 func_retval0) puts
(
.param .b64 puts_param_0
)
;
// Prototype of abort: no arguments, no return value. Called right after puts on the
// assertion-failure path.
.func abort
(
)
;
// NUL-terminated ASCII message (65 characters + terminator = 66 bytes):
//   "mandelbrot_task.ispc:55:3: Assertion failed: xspan >= vectorWidth"
.global .align 1 .b8 __str[66] = {109, 97, 110, 100, 101, 108, 98, 114, 111, 116, 95, 116, 97, 115, 107, 46, 105, 115, 112, 99, 58, 53, 53, 58, 51, 58, 32, 65, 115, 115, 101, 114, 116, 105, 111, 110, 32, 102, 97, 105, 108, 101, 100, 58, 32, 120, 115, 112, 97, 110, 32, 62, 61, 32, 118, 101, 99, 116, 111, 114, 87, 105, 100, 116, 104, 0};
// @mandelbrot_scanline
// mandelbrot_scanline -- kernel entry point.
//
// Each CTA processes one rectangular tile of the image:
//   rows    [ctaid.y*param_7, min(ctaid.y*param_7 + param_7, param_5))
//   columns [ctaid.x*param_6, min(ctaid.x*param_6 + param_6, param_4))
// Inside a tile the columns are walked WARP_SZ at a time; a thread handles column
// xi + (tid.x & (WARP_SZ-1)). For its pixel it iterates z = z*z + c starting from z = c,
// stops once |z|^2 > 4.0 (or is NaN) or after param_8 steps, and stores the step count
// as a 32-bit word at param_9 + 4*(row*param_4 + column).
//
// Parameters (roles as used by the code; names in parentheses other than xspan are
// assumptions -- confirm against mandelbrot_task.ispc):
//   param_0, param_1 : f32, c.re = param_0 + column*param_1
//   param_2, param_3 : f32, c.im = param_2 + row*param_3
//   param_4 : row pitch in pixels and exclusive column limit (presumably width)
//   param_5 : exclusive row limit (presumably height)
//   param_6 : columns per CTA tile (xspan); the kernel prints __str and aborts when
//             WARP_SZ > param_6
//   param_7 : rows per CTA tile (presumably yspan)
//   param_8 : maximum number of iterations; when <= 0 every in-range pixel is set to 0
//   param_9 : base address of the output words
.entry mandelbrot_scanline(
.param .f32 mandelbrot_scanline_param_0,
.param .f32 mandelbrot_scanline_param_1,
.param .f32 mandelbrot_scanline_param_2,
.param .f32 mandelbrot_scanline_param_3,
.param .u32 mandelbrot_scanline_param_4,
.param .u32 mandelbrot_scanline_param_5,
.param .u32 mandelbrot_scanline_param_6,
.param .u32 mandelbrot_scanline_param_7,
.param .u32 mandelbrot_scanline_param_8,
.param .u64 .ptr .align 4 mandelbrot_scanline_param_9
)
{
.reg .pred %p<396>;
.reg .s16 %rc<396>;
.reg .s16 %rs<396>;
.reg .s32 %r<396>;
.reg .s64 %rl<396>;
.reg .f32 %f<396>;
.reg .f64 %fl<396>;
// BB#0: // %allocas
// Assertion "xspan >= vectorWidth": bail out to the failure path when WARP_SZ > param_6.
// %r0 keeps WARP_SZ for the rest of the kernel.
ld.param.u32 %r3, [mandelbrot_scanline_param_6];
mov.u32 %r0, WARP_SZ;
setp.gt.s32 %p0, %r0, %r3;
@%p0 bra BB0_18;
// BB#1: // %for_test.preheader
// %r1 = first row of the tile, %r2 = min(first row + param_7, param_5) = row limit.
// Nothing to do when the tile has no rows.
ld.param.u32 %r7, [mandelbrot_scanline_param_5];
ld.param.u32 %r6, [mandelbrot_scanline_param_7];
mov.u32 %r8, %ctaid.y;
mul.lo.s32 %r1, %r8, %r6;
mad.lo.s32 %r2, %r8, %r6, %r6;
setp.lt.s32 %p0, %r2, %r7;
selp.b32 %r2, %r2, %r7, %p0;
setp.ge.s32 %p0, %r1, %r2;
@%p0 bra BB0_14;
// BB#2: // %for_test34.preheader.lr.ph
// Loop-invariant setup:
//   %f0..%f3 = param_0..param_3
//   %r2 = first column of the tile, %r4 = min(first column + param_6, param_4) = column limit
//   %r3 = param_4 (row pitch), %r5 = param_8 (iteration limit), %rl0 = param_9 (output base)
//   %p0 = (param_8 > 0), selects between the iterating and the store-zero column loops
ld.param.f32 %f0, [mandelbrot_scanline_param_0];
ld.param.f32 %f1, [mandelbrot_scanline_param_1];
ld.param.f32 %f2, [mandelbrot_scanline_param_2];
mov.u32 %r4, %ctaid.x;
mul.lo.s32 %r2, %r4, %r3;
ld.param.f32 %f3, [mandelbrot_scanline_param_3];
mad.lo.s32 %r4, %r4, %r3, %r3;
ld.param.u32 %r3, [mandelbrot_scanline_param_4];
setp.lt.s32 %p0, %r4, %r3;
selp.b32 %r4, %r4, %r3, %p0;
ld.param.u32 %r5, [mandelbrot_scanline_param_8];
ld.param.u64 %rl0, [mandelbrot_scanline_param_9];
setp.gt.s32 %p0, %r5, 0;
// Recompute the row limit into %r6 as ~max(~param_5, ~((ctaid.y+1)*param_7)),
// which equals min(param_5, (ctaid.y+1)*param_7); it is the exit value of the row loop.
not.b32 %r7, %r7;
add.s32 %r8, %r8, 1;
mul.lo.s32 %r6, %r8, %r6;
not.b32 %r6, %r6;
setp.gt.s32 %p1, %r7, %r6;
selp.b32 %r6, %r7, %r6, %p1;
not.b32 %r6, %r6;
BB0_3: // %for_test34.preheader
// =>This Loop Header: Depth=1
// Child Loop BB0_16 Depth 2
// Child Loop BB0_9 Depth 2
// Child Loop BB0_12 Depth 3
// Row loop, %r1 = current row. Skip straight to the next row if the tile has no columns.
setp.ge.s32 %p1, %r2, %r4;
@%p1 bra BB0_13;
// BB#4: // %for_loop36.lr.ph
// in Loop: Header=BB0_3 Depth=1
// %r7 = row * row pitch, %r8 = xi (column cursor, starts at the tile's first column).
mul.lo.s32 %r7, %r1, %r3;
mov.u32 %r8, %r2;
@%p0 bra BB0_5;
bra.uni BB0_16;
BB0_5: // in Loop: Header=BB0_3 Depth=1
// %f4 = c.im for this row = float(row)*param_3 + param_2.
cvt.rn.f32.s32 %f4, %r1;
fma.rn.f32 %f4, %f4, %f3, %f2;
mov.u32 %r8, %r2;
BB0_9: // %for_loop.i.lr.ph.us
// Parent Loop BB0_3 Depth=1
// => This Loop Header: Depth=2
// Child Loop BB0_12 Depth 3
// Column loop (iteration limit > 0). %r11 = this thread's column = xi + (tid.x & (WARP_SZ-1)),
// %f5 = c.re = float(column)*param_1 + param_0.
mov.u32 %r9, %tid.x;
add.s32 %r10, %r0, -1;
and.b32 %r10, %r10, %r9;
add.s32 %r11, %r10, %r8;
cvt.rn.f32.s32 %f5, %r11;
fma.rn.f32 %f5, %f5, %f1, %f0;
// Iteration state: %r10 = step count (0), %f7 = z.re (c.re), %f6 = z.im (c.im),
// %p2 = "has escaped" (false), %p3 = "continue" (true), %p4 = "count below limit" (%p0),
// %p1 = constant false.
mov.u32 %r10, 0;
mov.pred %p1, 0;
mov.pred %p3, -1;
mov.pred %p4, %p0;
mov.pred %p2, %p1;
mov.f32 %f7, %f5;
mov.f32 %f6, %f4;
BB0_12: // %for_loop.i.us
// Parent Loop BB0_3 Depth=1
// Parent Loop BB0_9 Depth=2
// => This Inner Loop Header: Depth=3
// %p4 = still active. %f8 = z.re^2, %f9 = z.im^2 + z.re^2.
// Escape test is > 4.0 (0f40800000); .gtu is also true when %f9 is NaN.
and.pred %p4, %p3, %p4;
mul.f32 %f8, %f7, %f7;
fma.rn.f32 %f9, %f6, %f6, %f8;
setp.gtu.f32 %p3, %f9, 0f40800000;
and.pred %p3, %p4, %p3;
or.pred %p2, %p3, %p2;
// %p5 = active and not escaped; only then is z advanced. %p3 defaults to false so the
// step count is not incremented when the update is skipped.
xor.pred %p5, %p2, %p4;
mov.pred %p3, %p1;
@!%p5 bra BB0_11;
bra.uni BB0_10;
BB0_10: // %not_all_continued_or_breaked.i.us
// in Loop: Header=BB0_12 Depth=3
// z = z*z + c:  new z.re = c.re + (z.re^2 - z.im^2),  new z.im = z.im*(2*z.re) + c.im.
// %p3 = active and not escaped (continue flag for the step below).
mul.f32 %f9, %f6, %f6;
not.pred %p3, %p2;
and.pred %p3, %p4, %p3;
sub.f32 %f8, %f8, %f9;
add.f32 %f8, %f5, %f8;
add.f32 %f7, %f7, %f7;
fma.rn.f32 %f6, %f6, %f7, %f4;
mov.f32 %f7, %f8;
BB0_11: // %for_step.i.us
// in Loop: Header=BB0_12 Depth=3
// Count the step only when continuing; loop while continuing and count < param_8.
add.s32 %r12, %r10, 1;
selp.b32 %r10, %r12, %r10, %p3;
setp.lt.s32 %p4, %r10, %r5;
and.pred %p5, %p3, %p4;
@%p5 bra BB0_12;
// BB#6: // %mandel___vyfvyfvyi.exit.us
// in Loop: Header=BB0_9 Depth=2
// Threads whose column is at or past the column limit skip the store.
setp.ge.s32 %p1, %r11, %r4;
@%p1 bra BB0_8;
// BB#7: // %if_then.us
// in Loop: Header=BB0_9 Depth=2
// Byte offset = 4*(xi + row*pitch + lane). The lane is formed with WARP_SZ + 0x3FFFFFFF
// instead of WARP_SZ - 1: the two agree in their low 30 bits, and bits 30..31 of the sum
// are discarded by the shift left by 2 that follows.
add.s32 %r11, %r0, 1073741823;
and.b32 %r9, %r11, %r9;
add.s32 %r11, %r8, %r7;
add.s32 %r9, %r11, %r9;
shl.b32 %r9, %r9, 2;
// The 32-bit byte offset is sign-extended before being added to the output base.
cvt.s64.s32 %rl1, %r9;
add.s64 %rl1, %rl1, %rl0;
st.u32 [%rl1], %r10;
BB0_8: // %if_exit.us
// in Loop: Header=BB0_9 Depth=2
// Advance the column cursor by WARP_SZ.
add.s32 %r8, %r0, %r8;
setp.lt.s32 %p1, %r8, %r4;
@%p1 bra BB0_9;
bra.uni BB0_13;
BB0_16: // %mandel___vyfvyfvyi.exit
// Parent Loop BB0_3 Depth=1
// => This Inner Loop Header: Depth=2
// Column loop used when param_8 <= 0: no iteration is performed and every in-range
// pixel receives 0. Column and address computation mirror the loop above.
mov.u32 %r9, %tid.x;
add.s32 %r10, %r0, -1;
and.b32 %r10, %r10, %r9;
add.s32 %r10, %r10, %r8;
setp.lt.s32 %p1, %r10, %r4;
@%p1 bra BB0_17;
bra.uni BB0_15;
BB0_17: // %if_then
// in Loop: Header=BB0_16 Depth=2
add.s32 %r10, %r0, 1073741823;
and.b32 %r9, %r10, %r9;
add.s32 %r10, %r8, %r7;
add.s32 %r9, %r10, %r9;
shl.b32 %r9, %r9, 2;
cvt.s64.s32 %rl1, %r9;
add.s64 %rl1, %rl1, %rl0;
mov.u32 %r9, 0;
st.u32 [%rl1], %r9;
BB0_15: // %if_exit
// in Loop: Header=BB0_16 Depth=2
add.s32 %r8, %r0, %r8;
setp.lt.s32 %p1, %r8, %r4;
@%p1 bra BB0_16;
BB0_13: // %for_exit37
// in Loop: Header=BB0_3 Depth=1
// Next row; leave once the row limit in %r6 is reached.
add.s32 %r1, %r1, 1;
setp.eq.s32 %p1, %r1, %r6;
@%p1 bra BB0_14;
bra.uni BB0_3;
BB0_14: // %for_exit
ret;
BB0_18: // %fail.i
// Assertion failure: puts(generic address of __str), then abort().
mov.u64 %rl0, __str;
cvta.global.u64 %rl0, %rl0;
// Callseq Start 0
{
.reg .b32 temp_param_reg;
// <end>}
.param .b64 param0;
st.param.b64 [param0+0], %rl0;
.param .b32 retval0;
call.uni (retval0),
puts,
(
param0
);
ld.param.b32 %r0, [retval0+0];
//{
}// Callseq End 0
// Callseq Start 1
{
.reg .b32 temp_param_reg;
// <end>}
call.uni
abort,
(
);
//{
}// Callseq End 1
}

View File

@@ -0,0 +1,320 @@
//
// Generated by LLVM NVPTX Back-End
//
.version 3.1
.target sm_35, texmode_independent
.address_size 64
// .globl __aos_to_soa4_float1
// @__aos_to_soa4_float1
// __aos_to_soa4_float1: stores the four f32 arguments param_0..param_3 through the four
// 64-bit address arguments param_4..param_7 (param_N is written to the address param_N+4).
.func __aos_to_soa4_float1(
.param .align 4 .b8 __aos_to_soa4_float1_param_0[4],
.param .align 4 .b8 __aos_to_soa4_float1_param_1[4],
.param .align 4 .b8 __aos_to_soa4_float1_param_2[4],
.param .align 4 .b8 __aos_to_soa4_float1_param_3[4],
.param .b64 __aos_to_soa4_float1_param_4,
.param .b64 __aos_to_soa4_float1_param_5,
.param .b64 __aos_to_soa4_float1_param_6,
.param .b64 __aos_to_soa4_float1_param_7
)
{
// Only four f32 values and four addresses are live in this function.
.reg .f32 %f<4>;
.reg .s64 %rl<4>;
// Fetch each destination address together with the value destined for it.
ld.param.u64 %rl0, [__aos_to_soa4_float1_param_4];
ld.param.f32 %f0, [__aos_to_soa4_float1_param_0];
ld.param.u64 %rl1, [__aos_to_soa4_float1_param_5];
ld.param.f32 %f1, [__aos_to_soa4_float1_param_1];
ld.param.u64 %rl2, [__aos_to_soa4_float1_param_6];
ld.param.f32 %f2, [__aos_to_soa4_float1_param_2];
ld.param.u64 %rl3, [__aos_to_soa4_float1_param_7];
ld.param.f32 %f3, [__aos_to_soa4_float1_param_3];
// Stores are issued in parameter order, after all arguments have been read.
st.f32 [%rl0], %f0;
st.f32 [%rl1], %f1;
st.f32 [%rl2], %f2;
st.f32 [%rl3], %f3;
ret;
}
// .globl __soa_to_aos4_float1
// __soa_to_aos4_float1: stores the four f32 arguments param_0..param_3 through the four
// 64-bit address arguments param_4..param_7 (param_N is written to the address param_N+4).
.func __soa_to_aos4_float1(
.param .align 4 .b8 __soa_to_aos4_float1_param_0[4],
.param .align 4 .b8 __soa_to_aos4_float1_param_1[4],
.param .align 4 .b8 __soa_to_aos4_float1_param_2[4],
.param .align 4 .b8 __soa_to_aos4_float1_param_3[4],
.param .b64 __soa_to_aos4_float1_param_4,
.param .b64 __soa_to_aos4_float1_param_5,
.param .b64 __soa_to_aos4_float1_param_6,
.param .b64 __soa_to_aos4_float1_param_7
) // @__soa_to_aos4_float1
{
// Only four f32 values and four addresses are live in this function.
.reg .f32 %f<4>;
.reg .s64 %rl<4>;
// Values to be written.
ld.param.f32 %f0, [__soa_to_aos4_float1_param_0];
ld.param.f32 %f1, [__soa_to_aos4_float1_param_1];
ld.param.f32 %f2, [__soa_to_aos4_float1_param_2];
ld.param.f32 %f3, [__soa_to_aos4_float1_param_3];
// Destination addresses.
ld.param.u64 %rl0, [__soa_to_aos4_float1_param_4];
ld.param.u64 %rl1, [__soa_to_aos4_float1_param_5];
ld.param.u64 %rl2, [__soa_to_aos4_float1_param_6];
ld.param.u64 %rl3, [__soa_to_aos4_float1_param_7];
// Stores are issued in parameter order, after all arguments have been read.
st.f32 [%rl0], %f0;
st.f32 [%rl1], %f1;
st.f32 [%rl2], %f2;
st.f32 [%rl3], %f3;
ret;
}
// .globl __aos_to_soa3_float1
// __aos_to_soa3_float1: stores the three f32 arguments param_0..param_2 through the three
// 64-bit address arguments param_3..param_5 (param_N is written to the address param_N+3).
.func __aos_to_soa3_float1(
.param .align 4 .b8 __aos_to_soa3_float1_param_0[4],
.param .align 4 .b8 __aos_to_soa3_float1_param_1[4],
.param .align 4 .b8 __aos_to_soa3_float1_param_2[4],
.param .b64 __aos_to_soa3_float1_param_3,
.param .b64 __aos_to_soa3_float1_param_4,
.param .b64 __aos_to_soa3_float1_param_5
) // @__aos_to_soa3_float1
{
// Only three f32 values and three addresses are live in this function.
.reg .f32 %f<3>;
.reg .s64 %rl<3>;
// Fetch each value together with the address it will be written to.
ld.param.f32 %f0, [__aos_to_soa3_float1_param_0];
ld.param.u64 %rl0, [__aos_to_soa3_float1_param_3];
ld.param.f32 %f1, [__aos_to_soa3_float1_param_1];
ld.param.u64 %rl1, [__aos_to_soa3_float1_param_4];
ld.param.f32 %f2, [__aos_to_soa3_float1_param_2];
ld.param.u64 %rl2, [__aos_to_soa3_float1_param_5];
// Stores are issued in parameter order, after all arguments have been read.
st.f32 [%rl0], %f0;
st.f32 [%rl1], %f1;
st.f32 [%rl2], %f2;
ret;
}
// .globl __soa_to_aos3_float1
// __soa_to_aos3_float1: stores the three f32 arguments param_0..param_2 through the three
// 64-bit address arguments param_3..param_5 (param_N is written to the address param_N+3).
.func __soa_to_aos3_float1(
.param .align 4 .b8 __soa_to_aos3_float1_param_0[4],
.param .align 4 .b8 __soa_to_aos3_float1_param_1[4],
.param .align 4 .b8 __soa_to_aos3_float1_param_2[4],
.param .b64 __soa_to_aos3_float1_param_3,
.param .b64 __soa_to_aos3_float1_param_4,
.param .b64 __soa_to_aos3_float1_param_5
) // @__soa_to_aos3_float1
{
// Only three f32 values and three addresses are live in this function.
.reg .f32 %f<3>;
.reg .s64 %rl<3>;
// Values to be written.
ld.param.f32 %f0, [__soa_to_aos3_float1_param_0];
ld.param.f32 %f1, [__soa_to_aos3_float1_param_1];
ld.param.f32 %f2, [__soa_to_aos3_float1_param_2];
// Destination addresses.
ld.param.u64 %rl0, [__soa_to_aos3_float1_param_3];
ld.param.u64 %rl1, [__soa_to_aos3_float1_param_4];
ld.param.u64 %rl2, [__soa_to_aos3_float1_param_5];
// Stores are issued in parameter order, after all arguments have been read.
st.f32 [%rl0], %f0;
st.f32 [%rl1], %f1;
st.f32 [%rl2], %f2;
ret;
}
// .globl mandelbrot_scanline
// mandelbrot_scanline -- kernel entry point.
//
// Each CTA processes one rectangular tile of the image:
//   rows    [ctaid.y*param_7, min(ctaid.y*param_7 + param_7, param_5))
//   columns [ctaid.x*param_6, min(ctaid.x*param_6 + param_6, param_4))
// Inside a tile the columns are walked WARP_SZ at a time; a thread handles column
// xi + (tid.x & (WARP_SZ-1)). For its pixel it iterates z = z*z + c starting from z = c,
// stops once |z|^2 > 4.0 (or is NaN) or after param_8 steps, and stores the step count
// as a 32-bit word at param_9 + 4*(row*param_4 + column).
//
// Parameters (roles as used by the code; the names in parentheses are assumptions --
// confirm against the ispc source):
//   param_0, param_1 : f32, c.re = param_0 + column*param_1
//   param_2, param_3 : f32, c.im = param_2 + row*param_3
//   param_4 : row pitch in pixels and exclusive column limit (presumably width)
//   param_5 : exclusive row limit (presumably height)
//   param_6 : columns per CTA tile (presumably xspan)
//   param_7 : rows per CTA tile (presumably yspan)
//   param_8 : maximum number of iterations; when <= 0 every in-range pixel is set to 0
//   param_9 : base address of the output words
.entry mandelbrot_scanline(
.param .f32 mandelbrot_scanline_param_0,
.param .f32 mandelbrot_scanline_param_1,
.param .f32 mandelbrot_scanline_param_2,
.param .f32 mandelbrot_scanline_param_3,
.param .u32 mandelbrot_scanline_param_4,
.param .u32 mandelbrot_scanline_param_5,
.param .u32 mandelbrot_scanline_param_6,
.param .u32 mandelbrot_scanline_param_7,
.param .u32 mandelbrot_scanline_param_8,
.param .u64 .ptr .align 4 mandelbrot_scanline_param_9
) // @mandelbrot_scanline
{
.reg .pred %p<396>;
.reg .s16 %rc<396>;
.reg .s16 %rs<396>;
.reg .s32 %r<396>;
.reg .s64 %rl<396>;
.reg .f32 %f<396>;
.reg .f64 %fl<396>;
// BB#0: // %allocas
// %r0 = first row of the tile, %r1 = min(first row + param_7, param_5) = row limit.
// Nothing to do when the tile has no rows.
ld.param.u32 %r6, [mandelbrot_scanline_param_5];
ld.param.u32 %r5, [mandelbrot_scanline_param_7];
mov.u32 %r7, %ctaid.y;
mul.lo.s32 %r0, %r7, %r5;
mad.lo.s32 %r1, %r7, %r5, %r5;
setp.lt.s32 %p0, %r1, %r6;
selp.b32 %r1, %r1, %r6, %p0;
setp.ge.s32 %p0, %r0, %r1;
@%p0 bra BB4_13;
// BB#1: // %for_test28.preheader.lr.ph
// Loop-invariant setup:
//   %f0..%f3 = param_0..param_3
//   %r1 = param_4 (row pitch), %r2 = param_8 (iteration limit), %rl0 = param_9 (output base)
//   %r3 = first column of the tile, %r4 = min(first column + param_6, param_4) = column limit
//   %p0 = (param_8 > 0), selects between the iterating and the store-zero column loops
ld.param.f32 %f0, [mandelbrot_scanline_param_0];
ld.param.f32 %f1, [mandelbrot_scanline_param_1];
ld.param.f32 %f2, [mandelbrot_scanline_param_2];
ld.param.f32 %f3, [mandelbrot_scanline_param_3];
ld.param.u32 %r1, [mandelbrot_scanline_param_4];
ld.param.u32 %r4, [mandelbrot_scanline_param_6];
ld.param.u32 %r2, [mandelbrot_scanline_param_8];
ld.param.u64 %rl0, [mandelbrot_scanline_param_9];
mov.u32 %r8, %ctaid.x;
mul.lo.s32 %r3, %r8, %r4;
mad.lo.s32 %r4, %r8, %r4, %r4;
setp.lt.s32 %p0, %r4, %r1;
selp.b32 %r4, %r4, %r1, %p0;
setp.gt.s32 %p0, %r2, 0;
// Recompute the row limit into %r5 as ~max(~param_5, ~((ctaid.y+1)*param_7)),
// which equals min(param_5, (ctaid.y+1)*param_7); it is the exit value of the row loop.
not.b32 %r6, %r6;
add.s32 %r7, %r7, 1;
mul.lo.s32 %r5, %r7, %r5;
not.b32 %r5, %r5;
setp.gt.s32 %p1, %r6, %r5;
selp.b32 %r5, %r6, %r5, %p1;
not.b32 %r5, %r5;
BB4_2: // %for_test28.preheader
// =>This Loop Header: Depth=1
// Child Loop BB4_15 Depth 2
// Child Loop BB4_8 Depth 2
// Child Loop BB4_11 Depth 3
// Row loop, %r0 = current row. Skip straight to the next row if the tile has no columns.
setp.ge.s32 %p1, %r3, %r4;
@%p1 bra BB4_12;
// BB#3: // %for_loop30.lr.ph
// in Loop: Header=BB4_2 Depth=1
// %r6 = row * row pitch, %r7 = xi (column cursor, starts at the tile's first column).
mul.lo.s32 %r6, %r0, %r1;
mov.u32 %r7, %r3;
@%p0 bra BB4_4;
bra.uni BB4_15;
BB4_4: // in Loop: Header=BB4_2 Depth=1
// %f4 = c.im for this row = float(row)*param_3 + param_2.
cvt.rn.f32.s32 %f4, %r0;
fma.rn.f32 %f4, %f4, %f3, %f2;
mov.u32 %r7, %r3;
BB4_8: // %for_loop.i.lr.ph.us
// Parent Loop BB4_2 Depth=1
// => This Loop Header: Depth=2
// Child Loop BB4_11 Depth 3
// Column loop (iteration limit > 0). %r8 = WARP_SZ,
// %r11 = this thread's column = xi + (tid.x & (WARP_SZ-1)),
// %f5 = c.re = float(column)*param_1 + param_0.
mov.u32 %r9, %tid.x;
mov.u32 %r8, WARP_SZ;
add.s32 %r10, %r8, -1;
and.b32 %r10, %r10, %r9;
add.s32 %r11, %r10, %r7;
cvt.rn.f32.s32 %f5, %r11;
fma.rn.f32 %f5, %f5, %f1, %f0;
// Iteration state: %r10 = step count (0), %f7 = z.re (c.re), %f6 = z.im (c.im),
// %p2 = "has escaped" (false), %p3 = "continue" (true), %p4 = "count below limit" (%p0),
// %p1 = constant false.
mov.u32 %r10, 0;
mov.pred %p1, 0;
mov.pred %p3, -1;
mov.pred %p4, %p0;
mov.pred %p2, %p1;
mov.f32 %f7, %f5;
mov.f32 %f6, %f4;
BB4_11: // %for_loop.i.us
// Parent Loop BB4_2 Depth=1
// Parent Loop BB4_8 Depth=2
// => This Inner Loop Header: Depth=3
// %p4 = still active. %f8 = z.re^2, %f9 = z.im^2 + z.re^2.
// Escape test is > 4.0 (0f40800000); .gtu is also true when %f9 is NaN.
and.pred %p4, %p3, %p4;
mul.f32 %f8, %f7, %f7;
fma.rn.f32 %f9, %f6, %f6, %f8;
setp.gtu.f32 %p3, %f9, 0f40800000;
and.pred %p3, %p4, %p3;
or.pred %p2, %p3, %p2;
// %p5 = active and not escaped; only then is z advanced. %p3 defaults to false so the
// step count is not incremented when the update is skipped.
xor.pred %p5, %p2, %p4;
mov.pred %p3, %p1;
@!%p5 bra BB4_10;
bra.uni BB4_9;
BB4_9: // %not_all_continued_or_breaked.i.us
// in Loop: Header=BB4_11 Depth=3
// z = z*z + c:  new z.re = c.re + (z.re^2 - z.im^2),  new z.im = z.im*(2*z.re) + c.im.
// %p3 = active and not escaped (continue flag for the step below).
mul.f32 %f9, %f6, %f6;
not.pred %p3, %p2;
and.pred %p3, %p4, %p3;
sub.f32 %f8, %f8, %f9;
add.f32 %f8, %f5, %f8;
add.f32 %f7, %f7, %f7;
fma.rn.f32 %f6, %f6, %f7, %f4;
mov.f32 %f7, %f8;
BB4_10: // %for_step.i.us
// in Loop: Header=BB4_11 Depth=3
// Count the step only when continuing; loop while continuing and count < param_8.
add.s32 %r12, %r10, 1;
selp.b32 %r10, %r12, %r10, %p3;
setp.lt.s32 %p4, %r10, %r2;
and.pred %p5, %p3, %p4;
@%p5 bra BB4_11;
// BB#5: // %mandel___vyfvyfvyi.exit.us
// in Loop: Header=BB4_8 Depth=2
// Threads whose column is at or past the column limit skip the store.
setp.ge.s32 %p1, %r11, %r4;
@%p1 bra BB4_7;
// BB#6: // %if_then.us
// in Loop: Header=BB4_8 Depth=2
// Byte offset = 4*(xi + row*pitch + lane). The lane is formed with WARP_SZ + 0x3FFFFFFF
// instead of WARP_SZ - 1: the two agree in their low 30 bits, and bits 30..31 of the sum
// are discarded by the shift left by 2 that follows.
add.s32 %r11, %r8, 1073741823;
and.b32 %r9, %r11, %r9;
add.s32 %r11, %r7, %r6;
add.s32 %r9, %r11, %r9;
shl.b32 %r9, %r9, 2;
// The 32-bit byte offset is sign-extended before being added to the output base.
cvt.s64.s32 %rl1, %r9;
add.s64 %rl1, %rl1, %rl0;
st.u32 [%rl1], %r10;
BB4_7: // %if_exit.us
// in Loop: Header=BB4_8 Depth=2
// Advance the column cursor by WARP_SZ.
add.s32 %r7, %r8, %r7;
setp.lt.s32 %p1, %r7, %r4;
@%p1 bra BB4_8;
bra.uni BB4_12;
BB4_15: // %mandel___vyfvyfvyi.exit
// Parent Loop BB4_2 Depth=1
// => This Inner Loop Header: Depth=2
// Column loop used when param_8 <= 0: no iteration is performed and every in-range
// pixel receives 0. Column and address computation mirror the loop above.
mov.u32 %r9, %tid.x;
mov.u32 %r8, WARP_SZ;
add.s32 %r10, %r8, -1;
and.b32 %r10, %r10, %r9;
add.s32 %r10, %r10, %r7;
setp.lt.s32 %p1, %r10, %r4;
@%p1 bra BB4_16;
bra.uni BB4_14;
BB4_16: // %if_then
// in Loop: Header=BB4_15 Depth=2
add.s32 %r10, %r8, 1073741823;
and.b32 %r9, %r10, %r9;
add.s32 %r10, %r7, %r6;
add.s32 %r9, %r10, %r9;
shl.b32 %r9, %r9, 2;
cvt.s64.s32 %rl1, %r9;
add.s64 %rl1, %rl1, %rl0;
mov.u32 %r9, 0;
st.u32 [%rl1], %r9;
BB4_14: // %if_exit
// in Loop: Header=BB4_15 Depth=2
add.s32 %r7, %r8, %r7;
setp.lt.s32 %p1, %r7, %r4;
@%p1 bra BB4_15;
BB4_12: // %for_exit31
// in Loop: Header=BB4_2 Depth=1
// Next row; leave once the row limit in %r5 is reached.
add.s32 %r0, %r0, 1;
setp.eq.s32 %p1, %r0, %r5;
@%p1 bra BB4_13;
bra.uni BB4_2;
BB4_13: // %for_exit
ret;
}

View File

@@ -0,0 +1,320 @@
//
// Generated by LLVM NVPTX Back-End
//
.version 3.1
.target sm_35, texmode_independent
.address_size 64
// .globl __aos_to_soa4_float1
// @__aos_to_soa4_float1
// __aos_to_soa4_float1: stores the four f32 arguments param_0..param_3 through the four
// 64-bit address arguments param_4..param_7 (param_N is written to the address param_N+4).
.func __aos_to_soa4_float1(
.param .align 4 .b8 __aos_to_soa4_float1_param_0[4],
.param .align 4 .b8 __aos_to_soa4_float1_param_1[4],
.param .align 4 .b8 __aos_to_soa4_float1_param_2[4],
.param .align 4 .b8 __aos_to_soa4_float1_param_3[4],
.param .b64 __aos_to_soa4_float1_param_4,
.param .b64 __aos_to_soa4_float1_param_5,
.param .b64 __aos_to_soa4_float1_param_6,
.param .b64 __aos_to_soa4_float1_param_7
)
{
// Only four f32 values and four addresses are live in this function.
.reg .f32 %f<4>;
.reg .s64 %rl<4>;
// Fetch each destination address together with the value destined for it.
ld.param.u64 %rl0, [__aos_to_soa4_float1_param_4];
ld.param.f32 %f0, [__aos_to_soa4_float1_param_0];
ld.param.u64 %rl1, [__aos_to_soa4_float1_param_5];
ld.param.f32 %f1, [__aos_to_soa4_float1_param_1];
ld.param.u64 %rl2, [__aos_to_soa4_float1_param_6];
ld.param.f32 %f2, [__aos_to_soa4_float1_param_2];
ld.param.u64 %rl3, [__aos_to_soa4_float1_param_7];
ld.param.f32 %f3, [__aos_to_soa4_float1_param_3];
// Stores are issued in parameter order, after all arguments have been read.
st.f32 [%rl0], %f0;
st.f32 [%rl1], %f1;
st.f32 [%rl2], %f2;
st.f32 [%rl3], %f3;
ret;
}
// .globl __soa_to_aos4_float1
// __soa_to_aos4_float1: stores the four f32 arguments param_0..param_3 through the four
// 64-bit address arguments param_4..param_7 (param_N is written to the address param_N+4).
.func __soa_to_aos4_float1(
.param .align 4 .b8 __soa_to_aos4_float1_param_0[4],
.param .align 4 .b8 __soa_to_aos4_float1_param_1[4],
.param .align 4 .b8 __soa_to_aos4_float1_param_2[4],
.param .align 4 .b8 __soa_to_aos4_float1_param_3[4],
.param .b64 __soa_to_aos4_float1_param_4,
.param .b64 __soa_to_aos4_float1_param_5,
.param .b64 __soa_to_aos4_float1_param_6,
.param .b64 __soa_to_aos4_float1_param_7
) // @__soa_to_aos4_float1
{
// Only four f32 values and four addresses are live in this function.
.reg .f32 %f<4>;
.reg .s64 %rl<4>;
// Values to be written.
ld.param.f32 %f0, [__soa_to_aos4_float1_param_0];
ld.param.f32 %f1, [__soa_to_aos4_float1_param_1];
ld.param.f32 %f2, [__soa_to_aos4_float1_param_2];
ld.param.f32 %f3, [__soa_to_aos4_float1_param_3];
// Destination addresses.
ld.param.u64 %rl0, [__soa_to_aos4_float1_param_4];
ld.param.u64 %rl1, [__soa_to_aos4_float1_param_5];
ld.param.u64 %rl2, [__soa_to_aos4_float1_param_6];
ld.param.u64 %rl3, [__soa_to_aos4_float1_param_7];
// Stores are issued in parameter order, after all arguments have been read.
st.f32 [%rl0], %f0;
st.f32 [%rl1], %f1;
st.f32 [%rl2], %f2;
st.f32 [%rl3], %f3;
ret;
}
// .globl __aos_to_soa3_float1
// __aos_to_soa3_float1: stores the three f32 arguments param_0..param_2 through the three
// 64-bit address arguments param_3..param_5 (param_N is written to the address param_N+3).
.func __aos_to_soa3_float1(
.param .align 4 .b8 __aos_to_soa3_float1_param_0[4],
.param .align 4 .b8 __aos_to_soa3_float1_param_1[4],
.param .align 4 .b8 __aos_to_soa3_float1_param_2[4],
.param .b64 __aos_to_soa3_float1_param_3,
.param .b64 __aos_to_soa3_float1_param_4,
.param .b64 __aos_to_soa3_float1_param_5
) // @__aos_to_soa3_float1
{
// Only three f32 values and three addresses are live in this function.
.reg .f32 %f<3>;
.reg .s64 %rl<3>;
// Fetch each value together with the address it will be written to.
ld.param.f32 %f0, [__aos_to_soa3_float1_param_0];
ld.param.u64 %rl0, [__aos_to_soa3_float1_param_3];
ld.param.f32 %f1, [__aos_to_soa3_float1_param_1];
ld.param.u64 %rl1, [__aos_to_soa3_float1_param_4];
ld.param.f32 %f2, [__aos_to_soa3_float1_param_2];
ld.param.u64 %rl2, [__aos_to_soa3_float1_param_5];
// Stores are issued in parameter order, after all arguments have been read.
st.f32 [%rl0], %f0;
st.f32 [%rl1], %f1;
st.f32 [%rl2], %f2;
ret;
}
// .globl __soa_to_aos3_float1
// __soa_to_aos3_float1: stores the three f32 arguments param_0..param_2 through the three
// 64-bit address arguments param_3..param_5 (param_N is written to the address param_N+3).
.func __soa_to_aos3_float1(
.param .align 4 .b8 __soa_to_aos3_float1_param_0[4],
.param .align 4 .b8 __soa_to_aos3_float1_param_1[4],
.param .align 4 .b8 __soa_to_aos3_float1_param_2[4],
.param .b64 __soa_to_aos3_float1_param_3,
.param .b64 __soa_to_aos3_float1_param_4,
.param .b64 __soa_to_aos3_float1_param_5
) // @__soa_to_aos3_float1
{
// Only three f32 values and three addresses are live in this function.
.reg .f32 %f<3>;
.reg .s64 %rl<3>;
// Values to be written.
ld.param.f32 %f0, [__soa_to_aos3_float1_param_0];
ld.param.f32 %f1, [__soa_to_aos3_float1_param_1];
ld.param.f32 %f2, [__soa_to_aos3_float1_param_2];
// Destination addresses.
ld.param.u64 %rl0, [__soa_to_aos3_float1_param_3];
ld.param.u64 %rl1, [__soa_to_aos3_float1_param_4];
ld.param.u64 %rl2, [__soa_to_aos3_float1_param_5];
// Stores are issued in parameter order, after all arguments have been read.
st.f32 [%rl0], %f0;
st.f32 [%rl1], %f1;
st.f32 [%rl2], %f2;
ret;
}
// .globl mandelbrot_scanline
// mandelbrot_scanline -- kernel entry point.
//
// Each CTA processes one rectangular tile of the image:
//   rows    [ctaid.y*param_7, min(ctaid.y*param_7 + param_7, param_5))
//   columns [ctaid.x*param_6, min(ctaid.x*param_6 + param_6, param_4))
// Inside a tile the columns are walked WARP_SZ at a time; a thread handles column
// xi + (tid.x & (WARP_SZ-1)). For its pixel it iterates z = z*z + c starting from z = c,
// stops once |z|^2 > 4.0 (or is NaN) or after param_8 steps, and stores the step count
// as a 32-bit word at param_9 + 4*(row*param_4 + column).
//
// Parameters (roles as used by the code; the names in parentheses are assumptions --
// confirm against the ispc source):
//   param_0, param_1 : f32, c.re = param_0 + column*param_1
//   param_2, param_3 : f32, c.im = param_2 + row*param_3
//   param_4 : row pitch in pixels and exclusive column limit (presumably width)
//   param_5 : exclusive row limit (presumably height)
//   param_6 : columns per CTA tile (presumably xspan)
//   param_7 : rows per CTA tile (presumably yspan)
//   param_8 : maximum number of iterations; when <= 0 every in-range pixel is set to 0
//   param_9 : base address of the output words
.entry mandelbrot_scanline(
.param .f32 mandelbrot_scanline_param_0,
.param .f32 mandelbrot_scanline_param_1,
.param .f32 mandelbrot_scanline_param_2,
.param .f32 mandelbrot_scanline_param_3,
.param .u32 mandelbrot_scanline_param_4,
.param .u32 mandelbrot_scanline_param_5,
.param .u32 mandelbrot_scanline_param_6,
.param .u32 mandelbrot_scanline_param_7,
.param .u32 mandelbrot_scanline_param_8,
.param .u64 .ptr .align 4 mandelbrot_scanline_param_9
) // @mandelbrot_scanline
{
.reg .pred %p<396>;
.reg .s16 %rc<396>;
.reg .s16 %rs<396>;
.reg .s32 %r<396>;
.reg .s64 %rl<396>;
.reg .f32 %f<396>;
.reg .f64 %fl<396>;
// BB#0: // %allocas
// %r0 = first row of the tile, %r1 = min(first row + param_7, param_5) = row limit.
// Nothing to do when the tile has no rows.
ld.param.u32 %r6, [mandelbrot_scanline_param_5];
ld.param.u32 %r5, [mandelbrot_scanline_param_7];
mov.u32 %r7, %ctaid.y;
mul.lo.s32 %r0, %r7, %r5;
mad.lo.s32 %r1, %r7, %r5, %r5;
setp.lt.s32 %p0, %r1, %r6;
selp.b32 %r1, %r1, %r6, %p0;
setp.ge.s32 %p0, %r0, %r1;
@%p0 bra BB4_13;
// BB#1: // %for_test28.preheader.lr.ph
// Loop-invariant setup:
//   %f0..%f3 = param_0..param_3
//   %r1 = param_4 (row pitch), %r2 = param_8 (iteration limit), %rl0 = param_9 (output base)
//   %r3 = first column of the tile, %r4 = min(first column + param_6, param_4) = column limit
//   %p0 = (param_8 > 0), selects between the iterating and the store-zero column loops
ld.param.f32 %f0, [mandelbrot_scanline_param_0];
ld.param.f32 %f1, [mandelbrot_scanline_param_1];
ld.param.f32 %f2, [mandelbrot_scanline_param_2];
ld.param.f32 %f3, [mandelbrot_scanline_param_3];
ld.param.u32 %r1, [mandelbrot_scanline_param_4];
ld.param.u32 %r4, [mandelbrot_scanline_param_6];
ld.param.u32 %r2, [mandelbrot_scanline_param_8];
ld.param.u64 %rl0, [mandelbrot_scanline_param_9];
mov.u32 %r8, %ctaid.x;
mul.lo.s32 %r3, %r8, %r4;
mad.lo.s32 %r4, %r8, %r4, %r4;
setp.lt.s32 %p0, %r4, %r1;
selp.b32 %r4, %r4, %r1, %p0;
setp.gt.s32 %p0, %r2, 0;
// Recompute the row limit into %r5 as ~max(~param_5, ~((ctaid.y+1)*param_7)),
// which equals min(param_5, (ctaid.y+1)*param_7); it is the exit value of the row loop.
not.b32 %r6, %r6;
add.s32 %r7, %r7, 1;
mul.lo.s32 %r5, %r7, %r5;
not.b32 %r5, %r5;
setp.gt.s32 %p1, %r6, %r5;
selp.b32 %r5, %r6, %r5, %p1;
not.b32 %r5, %r5;
BB4_2: // %for_test28.preheader
// =>This Loop Header: Depth=1
// Child Loop BB4_15 Depth 2
// Child Loop BB4_8 Depth 2
// Child Loop BB4_11 Depth 3
// Row loop, %r0 = current row. Skip straight to the next row if the tile has no columns.
setp.ge.s32 %p1, %r3, %r4;
@%p1 bra BB4_12;
// BB#3: // %for_loop30.lr.ph
// in Loop: Header=BB4_2 Depth=1
// %r6 = row * row pitch, %r7 = xi (column cursor, starts at the tile's first column).
mul.lo.s32 %r6, %r0, %r1;
mov.u32 %r7, %r3;
@%p0 bra BB4_4;
bra.uni BB4_15;
BB4_4: // in Loop: Header=BB4_2 Depth=1
// %f4 = c.im for this row = float(row)*param_3 + param_2.
cvt.rn.f32.s32 %f4, %r0;
fma.rn.f32 %f4, %f4, %f3, %f2;
mov.u32 %r7, %r3;
BB4_8: // %for_loop.i.lr.ph.us
// Parent Loop BB4_2 Depth=1
// => This Loop Header: Depth=2
// Child Loop BB4_11 Depth 3
// Column loop (iteration limit > 0). %r8 = WARP_SZ,
// %r11 = this thread's column = xi + (tid.x & (WARP_SZ-1)),
// %f5 = c.re = float(column)*param_1 + param_0.
mov.u32 %r9, %tid.x;
mov.u32 %r8, WARP_SZ;
add.s32 %r10, %r8, -1;
and.b32 %r10, %r10, %r9;
add.s32 %r11, %r10, %r7;
cvt.rn.f32.s32 %f5, %r11;
fma.rn.f32 %f5, %f5, %f1, %f0;
// Iteration state: %r10 = step count (0), %f7 = z.re (c.re), %f6 = z.im (c.im),
// %p2 = "has escaped" (false), %p3 = "continue" (true), %p4 = "count below limit" (%p0),
// %p1 = constant false.
mov.u32 %r10, 0;
mov.pred %p1, 0;
mov.pred %p3, -1;
mov.pred %p4, %p0;
mov.pred %p2, %p1;
mov.f32 %f7, %f5;
mov.f32 %f6, %f4;
BB4_11: // %for_loop.i.us
// Parent Loop BB4_2 Depth=1
// Parent Loop BB4_8 Depth=2
// => This Inner Loop Header: Depth=3
// %p4 = still active. %f8 = z.re^2, %f9 = z.im^2 + z.re^2.
// Escape test is > 4.0 (0f40800000); .gtu is also true when %f9 is NaN.
and.pred %p4, %p3, %p4;
mul.f32 %f8, %f7, %f7;
fma.rn.f32 %f9, %f6, %f6, %f8;
setp.gtu.f32 %p3, %f9, 0f40800000;
and.pred %p3, %p4, %p3;
or.pred %p2, %p3, %p2;
// %p5 = active and not escaped; only then is z advanced. %p3 defaults to false so the
// step count is not incremented when the update is skipped.
xor.pred %p5, %p2, %p4;
mov.pred %p3, %p1;
@!%p5 bra BB4_10;
bra.uni BB4_9;
BB4_9: // %not_all_continued_or_breaked.i.us
// in Loop: Header=BB4_11 Depth=3
// z = z*z + c:  new z.re = c.re + (z.re^2 - z.im^2),  new z.im = z.im*(2*z.re) + c.im.
// %p3 = active and not escaped (continue flag for the step below).
mul.f32 %f9, %f6, %f6;
not.pred %p3, %p2;
and.pred %p3, %p4, %p3;
sub.f32 %f8, %f8, %f9;
add.f32 %f8, %f5, %f8;
add.f32 %f7, %f7, %f7;
fma.rn.f32 %f6, %f6, %f7, %f4;
mov.f32 %f7, %f8;
BB4_10: // %for_step.i.us
// in Loop: Header=BB4_11 Depth=3
// Count the step only when continuing; loop while continuing and count < param_8.
add.s32 %r12, %r10, 1;
selp.b32 %r10, %r12, %r10, %p3;
setp.lt.s32 %p4, %r10, %r2;
and.pred %p5, %p3, %p4;
@%p5 bra BB4_11;
// BB#5: // %mandel___vyfvyfvyi.exit.us
// in Loop: Header=BB4_8 Depth=2
// Threads whose column is at or past the column limit skip the store.
setp.ge.s32 %p1, %r11, %r4;
@%p1 bra BB4_7;
// BB#6: // %if_then.us
// in Loop: Header=BB4_8 Depth=2
// Byte offset = 4*(xi + row*pitch + lane). The lane is formed with WARP_SZ + 0x3FFFFFFF
// instead of WARP_SZ - 1: the two agree in their low 30 bits, and bits 30..31 of the sum
// are discarded by the shift left by 2 that follows.
add.s32 %r11, %r8, 1073741823;
and.b32 %r9, %r11, %r9;
add.s32 %r11, %r7, %r6;
add.s32 %r9, %r11, %r9;
shl.b32 %r9, %r9, 2;
// The 32-bit byte offset is sign-extended before being added to the output base.
cvt.s64.s32 %rl1, %r9;
add.s64 %rl1, %rl1, %rl0;
st.u32 [%rl1], %r10;
BB4_7: // %if_exit.us
// in Loop: Header=BB4_8 Depth=2
// Advance the column cursor by WARP_SZ.
add.s32 %r7, %r8, %r7;
setp.lt.s32 %p1, %r7, %r4;
@%p1 bra BB4_8;
bra.uni BB4_12;
BB4_15: // %mandel___vyfvyfvyi.exit
// Parent Loop BB4_2 Depth=1
// => This Inner Loop Header: Depth=2
// Column loop used when param_8 <= 0: no iteration is performed and every in-range
// pixel receives 0. Column and address computation mirror the loop above.
mov.u32 %r9, %tid.x;
mov.u32 %r8, WARP_SZ;
add.s32 %r10, %r8, -1;
and.b32 %r10, %r10, %r9;
add.s32 %r10, %r10, %r7;
setp.lt.s32 %p1, %r10, %r4;
@%p1 bra BB4_16;
bra.uni BB4_14;
BB4_16: // %if_then
// in Loop: Header=BB4_15 Depth=2
add.s32 %r10, %r8, 1073741823;
and.b32 %r9, %r10, %r9;
add.s32 %r10, %r7, %r6;
add.s32 %r9, %r10, %r9;
shl.b32 %r9, %r9, 2;
cvt.s64.s32 %rl1, %r9;
add.s64 %rl1, %rl1, %rl0;
mov.u32 %r9, 0;
st.u32 [%rl1], %r9;
BB4_14: // %if_exit
// in Loop: Header=BB4_15 Depth=2
add.s32 %r7, %r8, %r7;
setp.lt.s32 %p1, %r7, %r4;
@%p1 bra BB4_15;
BB4_12: // %for_exit31
// in Loop: Header=BB4_2 Depth=1
// Next row; leave once the row limit in %r5 is reached.
add.s32 %r0, %r0, 1;
setp.eq.s32 %p1, %r0, %r5;
@%p1 bra BB4_13;
bra.uni BB4_2;
BB4_13: // %for_exit
ret;
}

View File

@@ -0,0 +1,171 @@
.file "mandelbrot_task.ispc"
.text
#-----------------------------------------------------------------------
# mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E_
#
# Masked variant of mandelbrot_ispc: builds the argument block for the
# mandelbrot_scanline task, launches it and waits for completion.
#
# Syntax: AT&T / GAS.  ABI: System V AMD64 plus a 256-bit execution mask in
# ymm4 (presumably the ispc-internal convention for non-exported entry points;
# confirm against the ispc documentation).
#
# In:    xmm0, xmm1, xmm2, xmm3 = four float arguments (a, b, c, d below)
#        edi, esi, edx          = three int arguments (w, h, n below)
#        rcx                    = pointer argument
#        ymm4                   = execution mask
# Out:   nothing
# Stack: four callee-saved pushes + 88 bytes of locals; with the return
#        address that is 128 bytes, so rsp is 16-byte aligned at every call.
#          28(%rsp) a    76(%rsp) b    68(%rsp) (c-a)/w    72(%rsp) (d-b)/h
#          32(%rsp) spilled mask (32 bytes)    80(%rsp) task-group handle
#
# Argument block (96 bytes, 32-byte aligned, obtained from ISPCAlloc):
#          0: a          4: (c-a)/(float)w    8: b     12: (d-b)/(float)h
#         16: w         20: h                24: 16    28: 16
#         32: n         40: pointer argument 64: execution mask (32 bytes)
# ISPCLaunch receives ecx = w/16, r8d = h/16 (signed, rounded toward zero)
# and r9d = 1 in addition to the handle, the task entry and the block.
#-----------------------------------------------------------------------
	.globl	mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E_
	.align	16, 0x90
	.type	mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E_,@function
mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E_: # @mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E_
# BB#0:                                 # %allocas
	pushq	%rbp
	pushq	%r15
	pushq	%r14
	pushq	%rbx
	subq	$88, %rsp
	vmovups	%ymm4, 32(%rsp)         # 32-byte Folded Spill
	# Keep the integer/pointer arguments in callee-saved registers across the calls.
	movq	%rcx, %r14
	movl	%edx, %r15d
	movl	%esi, %ebx
	movl	%edi, %ebp
	vmovss	%xmm1, 76(%rsp)         # 4-byte Spill
	vmovss	%xmm0, 28(%rsp)         # 4-byte Spill
	# Per-pixel steps: (c - a) / (float)w and (d - b) / (float)h.
	vcvtsi2ssl	%ebp, %xmm0, %xmm5
	vsubss	%xmm0, %xmm2, %xmm4
	vcvtsi2ssl	%ebx, %xmm0, %xmm2
	vsubss	%xmm1, %xmm3, %xmm3
	movq	$0, 80(%rsp)            # task-group handle starts out NULL
	leaq	80(%rsp), %rdi
	vdivss	%xmm2, %xmm3, %xmm1
	vmovss	%xmm1, 72(%rsp)         # 4-byte Spill
	vdivss	%xmm5, %xmm4, %xmm0
	vmovss	%xmm0, 68(%rsp)         # 4-byte Spill
	# ISPCAlloc(&handle, 96, 32)
	movl	$96, %esi
	movl	$32, %edx
	vzeroupper
	callq	ISPCAlloc
	vmovups	32(%rsp), %ymm0         # 32-byte Folded Reload
	movq	%rax, %rdx              # rdx = argument block (also 3rd arg of ISPCLaunch)
	# r8d = h / 16 and ecx = w / 16, signed division rounding toward zero.
	movl	%ebx, %r8d
	sarl	$31, %r8d
	shrl	$28, %r8d
	addl	%ebx, %r8d
	vmovss	28(%rsp), %xmm1         # 4-byte Reload
	vmovss	%xmm1, (%rdx)
	sarl	$4, %r8d
	movl	%ebp, %ecx
	sarl	$31, %ecx
	shrl	$28, %ecx
	addl	%ebp, %ecx
	sarl	$4, %ecx
	# Is every lane of the mask on?  The flags set here are consumed by the jne
	# below; only moves sit in between, and they leave the flags untouched.
	vmovmskps	%ymm0, %eax
	cmpl	$255, %eax
	vmovss	68(%rsp), %xmm1         # 4-byte Reload
	vmovss	%xmm1, 4(%rdx)
	vmovss	76(%rsp), %xmm1         # 4-byte Reload
	vmovss	%xmm1, 8(%rdx)
	vmovss	72(%rsp), %xmm1         # 4-byte Reload
	vmovss	%xmm1, 12(%rdx)
	movl	%ebp, 16(%rdx)
	movl	%ebx, 20(%rdx)
	movl	$16, 24(%rdx)
	movl	$16, 28(%rdx)
	movl	%r15d, 32(%rdx)
	movq	%r14, 40(%rdx)
	jne	.LBB0_2
# BB#1:                                 # %all_on
	# All lanes on: store a canonical all-ones mask.
	vpcmpeqd	%xmm0, %xmm0, %xmm0
	vinsertf128	$1, %xmm0, %ymm0, %ymm0
.LBB0_2:                                # %all_on
	vmovaps	%ymm0, 64(%rdx)         # aligned: block is 32-byte aligned, offset 64
	leaq	80(%rsp), %rdi
	# RIP-relative address of the task entry point. The previous
	# `movl $sym, %esi` needed a 32-bit absolute relocation, which cannot be
	# linked into a position-independent executable or shared object.
	leaq	mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E_(%rip), %rsi
	movl	$1, %r9d
	vzeroupper
	callq	ISPCLaunch
	# Wait for the launched tasks only if a task group was actually created.
	movq	80(%rsp), %rdi
	testq	%rdi, %rdi
	je	.LBB0_4
# BB#3:                                 # %call_sync
	callq	ISPCSync
	movq	$0, 80(%rsp)
.LBB0_4:                                # %post_sync
	addq	$88, %rsp
	popq	%rbx
	popq	%r14
	popq	%r15
	popq	%rbp
	ret
.Ltmp0:
	.size	mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E_, .Ltmp0-mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E_
#-----------------------------------------------------------------------
# mandelbrot_ispc
#
# Unmasked entry point: builds the argument block for the
# mandelbrot_scanline task with an all-ones execution mask, launches it and
# waits for completion.
#
# Syntax: AT&T / GAS.  ABI: System V AMD64.
#
# In:    xmm0, xmm1, xmm2, xmm3 = four float arguments (a, b, c, d below)
#        edi, esi, edx          = three int arguments (w, h, n below)
#        rcx                    = pointer argument
# Out:   nothing
# Stack: five callee-saved pushes + 32 bytes of locals; with the return
#        address that is 80 bytes, so rsp is 16-byte aligned at every call.
#           8(%rsp) a    20(%rsp) b    12(%rsp) (c-a)/w    16(%rsp) (d-b)/h
#          24(%rsp) task-group handle (its address is kept in r12)
#
# Argument block (96 bytes, 32-byte aligned, obtained from ISPCAlloc):
#          0: a          4: (c-a)/(float)w    8: b     12: (d-b)/(float)h
#         16: w         20: h                24: 16    28: 16
#         32: n         40: pointer argument 64: all-ones mask (32 bytes)
# ISPCLaunch receives ecx = w/16, r8d = h/16 (signed, rounded toward zero)
# and r9d = 1 in addition to the handle, the task entry and the block.
#-----------------------------------------------------------------------
	.globl	mandelbrot_ispc
	.align	16, 0x90
	.type	mandelbrot_ispc,@function
mandelbrot_ispc:                        # @mandelbrot_ispc
# BB#0:                                 # %allocas
	pushq	%rbp
	pushq	%r15
	pushq	%r14
	pushq	%r12
	pushq	%rbx
	subq	$32, %rsp
	# Keep the integer/pointer arguments in callee-saved registers across the calls.
	movq	%rcx, %r14
	movl	%edx, %r15d
	movl	%esi, %ebx
	movl	%edi, %ebp
	vmovss	%xmm1, 20(%rsp)         # 4-byte Spill
	vmovss	%xmm0, 8(%rsp)          # 4-byte Spill
	# Per-pixel steps: (c - a) / (float)w and (d - b) / (float)h.
	vcvtsi2ssl	%ebp, %xmm0, %xmm5
	vsubss	%xmm0, %xmm2, %xmm4
	vcvtsi2ssl	%ebx, %xmm0, %xmm2
	vsubss	%xmm1, %xmm3, %xmm3
	movq	$0, 24(%rsp)            # task-group handle starts out NULL
	leaq	24(%rsp), %r12
	vdivss	%xmm2, %xmm3, %xmm1
	vmovss	%xmm1, 16(%rsp)         # 4-byte Spill
	vdivss	%xmm5, %xmm4, %xmm0
	vmovss	%xmm0, 12(%rsp)         # 4-byte Spill
	# ISPCAlloc(&handle, 96, 32)
	movq	%r12, %rdi
	movl	$96, %esi
	movl	$32, %edx
	callq	ISPCAlloc
	# r8d = h / 16 and ecx = w / 16, signed division rounding toward zero;
	# ymm0 is set to all ones for the mask slot.
	movl	%ebx, %r8d
	sarl	$31, %r8d
	vpcmpeqd	%xmm0, %xmm0, %xmm0
	vmovss	8(%rsp), %xmm1          # 4-byte Reload
	vmovss	%xmm1, (%rax)
	shrl	$28, %r8d
	addl	%ebx, %r8d
	movl	%ebp, %ecx
	sarl	$31, %ecx
	shrl	$28, %ecx
	addl	%ebp, %ecx
	sarl	$4, %ecx
	sarl	$4, %r8d
	vinsertf128	$1, %xmm0, %ymm0, %ymm0
	vmovss	12(%rsp), %xmm1         # 4-byte Reload
	vmovss	%xmm1, 4(%rax)
	vmovss	20(%rsp), %xmm1         # 4-byte Reload
	vmovss	%xmm1, 8(%rax)
	vmovss	16(%rsp), %xmm1         # 4-byte Reload
	vmovss	%xmm1, 12(%rax)
	movl	%ebp, 16(%rax)
	movl	%ebx, 20(%rax)
	movl	$16, 24(%rax)
	movl	$16, 28(%rax)
	movl	%r15d, 32(%rax)
	movq	%r14, 40(%rax)
	vmovaps	%ymm0, 64(%rax)         # aligned: block is 32-byte aligned, offset 64
	movq	%r12, %rdi
	# RIP-relative address of the task entry point. The previous
	# `movl $sym, %esi` needed a 32-bit absolute relocation, which cannot be
	# linked into a position-independent executable or shared object.
	leaq	mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E_(%rip), %rsi
	movq	%rax, %rdx
	movl	$1, %r9d
	vzeroupper
	callq	ISPCLaunch
	# Wait for the launched tasks only if a task group was actually created.
	movq	24(%rsp), %rdi
	testq	%rdi, %rdi
	je	.LBB1_2
# BB#1:                                 # %call_sync
	callq	ISPCSync
	movq	$0, 24(%rsp)
.LBB1_2:                                # %post_sync
	addq	$32, %rsp
	popq	%rbx
	popq	%r12
	popq	%r14
	popq	%r15
	popq	%rbp
	ret
.Ltmp1:
	.size	mandelbrot_ispc, .Ltmp1-mandelbrot_ispc
.section ".note.GNU-stack","",@progbits

View File

@@ -0,0 +1,180 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|Win32">
<Configuration>Debug</Configuration>
<Platform>Win32</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|Win32">
<Configuration>Release</Configuration>
<Platform>Win32</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{E80DA7D4-AB22-4648-A068-327307156BE6}</ProjectGuid>
<Keyword>Win32Proj</Keyword>
<RootNamespace>mandelbrot_tasks</RootNamespace>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
</ImportGroup>
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<LinkIncremental>true</LinkIncremental>
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
<TargetName>mandelbrot_tasks</TargetName>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<LinkIncremental>true</LinkIncremental>
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
<TargetName>mandelbrot_tasks</TargetName>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
<LinkIncremental>false</LinkIncremental>
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
<TargetName>mandelbrot_tasks</TargetName>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<LinkIncremental>false</LinkIncremental>
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
<TargetName>mandelbrot_tasks</TargetName>
</PropertyGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<ClCompile>
<PrecompiledHeader>
</PrecompiledHeader>
<WarningLevel>Level3</WarningLevel>
<Optimization>Disabled</Optimization>
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
<IntrinsicFunctions>true</IntrinsicFunctions>
<FloatingPointModel>Fast</FloatingPointModel>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<ClCompile>
<PrecompiledHeader>
</PrecompiledHeader>
<WarningLevel>Level3</WarningLevel>
<Optimization>Disabled</Optimization>
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
<IntrinsicFunctions>true</IntrinsicFunctions>
<FloatingPointModel>Fast</FloatingPointModel>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PrecompiledHeader>
</PrecompiledHeader>
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
<FloatingPointModel>Fast</FloatingPointModel>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PrecompiledHeader>
</PrecompiledHeader>
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
<FloatingPointModel>Fast</FloatingPointModel>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
</Link>
</ItemDefinitionGroup>
<ItemGroup>
<ClCompile Include="mandelbrot_tasks.cpp" />
<ClCompile Include="mandelbrot_tasks_serial.cpp" />
<ClCompile Include="../tasksys.cpp" />
</ItemGroup>
<ItemGroup>
<CustomBuild Include="mandelbrot_tasks.ispc">
<FileType>Document</FileType>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2
</Command>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2
</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2
</Command>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2
</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
</CustomBuild>
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
</ImportGroup>
</Project>

Binary file not shown.

View File

@@ -0,0 +1,146 @@
/*
Copyright (c) 2010-2011, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef _MSC_VER
#define _CRT_SECURE_NO_WARNINGS
#define NOMINMAX
#pragma warning (disable: 4244)
#pragma warning (disable: 4305)
#endif
#include <stdio.h>
#include <stdlib.h>
#include <algorithm>
#include <string.h>
#include "../timing.h"
#include "mandelbrot_tasks3d_ispc.h"
using namespace ispc;
extern void mandelbrot_serial(float x0, float y0, float x1, float y1,
int width, int height, int maxIterations,
int output[]);
/* Write a PPM image file with the image of the Mandelbrot set.
 *
 * buf    - width*height iteration counts, indexed row-major
 * width  - image width in pixels
 * height - image height in pixels
 * fn     - path of the file to create (binary "P6" PPM)
 *
 * If the file cannot be opened or written, an error is reported on stderr
 * and the "Wrote image file" message is not printed.
 */
static void
writePPM(int *buf, int width, int height, const char *fn) {
    FILE *fp = fopen(fn, "wb");
    if (fp == NULL) {
        // Previously a NULL stream was handed straight to fprintf/fputc.
        perror(fn);
        return;
    }
    fprintf(fp, "P6\n");
    fprintf(fp, "%d %d\n", width, height);
    fprintf(fp, "255\n");
    for (int i = 0; i < width*height; ++i) {
        // Map the iteration count to colors by just alternating between
        // two greys.  Kept as int so that 240 is not narrowed through a
        // possibly-signed char before fputc converts it to unsigned char.
        const int c = (buf[i] & 0x1) ? 240 : 20;
        for (int j = 0; j < 3; ++j)
            fputc(c, fp);
    }
    const bool writeFailed = (ferror(fp) != 0);
    // fclose flushes buffered data, so its result matters as well.
    if (fclose(fp) != 0 || writeFailed) {
        fprintf(stderr, "Error writing image file %s\n", fn);
        return;
    }
    printf("Wrote image file %s\n", fn);
}
/* Print the command-line synopsis to stderr and terminate with exit code 1. */
static void usage() {
    fputs("usage: mandelbrot [--scale=<factor>]\n", stderr);
    exit(1);
}
/* Benchmark driver: renders the Mandelbrot set with the ispc+tasks kernel and
 * with the serial reference, three runs each, reports the minimum cycle count
 * of each and the resulting speedup, and writes both images as PPM files.
 *
 * Accepts an optional "--scale=<factor>" argument that scales the default
 * 1536x1024 image; any other argument form prints the usage text and exits.
 */
int main(int argc, char *argv[]) {
    unsigned int width = 1536;
    unsigned int height = 1024;
    float x0 = -2;
    float x1 = 1;
    float y0 = -1;
    float y1 = 1;

    if (argc == 1)
        ;
    else if (argc == 2) {
        if (strncmp(argv[1], "--scale=", 8) == 0) {
            float scale = atof(argv[1] + 8);
            // Written as !(scale > 0) so that zero, negative values and NaN
            // are all rejected; converting a negative float to unsigned
            // below would otherwise be undefined behavior.
            if (!(scale > 0.f))
                usage();
            width *= scale;
            height *= scale;
            // round up to multiples of 16
            width = (width + 0xf) & ~0xf;
            height = (height + 0xf) & ~0xf;
            // A very small scale truncates to an empty image, which would
            // make the kernels divide by zero when computing dx/dy.
            if (width == 0 || height == 0)
                usage();
        }
        else
            usage();
    }
    else
        usage();

    int maxIterations = 512;
    const size_t npixels = (size_t)width * height;
    int *buf = new int[npixels];

    //
    // Compute the image using the ispc implementation; report the minimum
    // time of three runs.
    //
    double minISPC = 1e30;
    for (int run = 0; run < 3; ++run) {
        // Clear out the buffer
        std::fill(buf, buf + npixels, 0);
        reset_and_start_timer();
        mandelbrot_ispc(x0, y0, x1, y1, width, height, maxIterations, buf);
        double dt = get_elapsed_mcycles();
        minISPC = std::min(minISPC, dt);
    }
    printf("[mandelbrot ispc+tasks]:\t[%.3f] million cycles\n", minISPC);
    writePPM(buf, width, height, "mandelbrot-ispc.ppm");

    //
    // And run the serial implementation 3 times, again reporting the
    // minimum time.
    //
    double minSerial = 1e30;
    for (int run = 0; run < 3; ++run) {
        // Clear out the buffer
        std::fill(buf, buf + npixels, 0);
        reset_and_start_timer();
        mandelbrot_serial(x0, y0, x1, y1, width, height, maxIterations, buf);
        double dt = get_elapsed_mcycles();
        minSerial = std::min(minSerial, dt);
    }
    printf("[mandelbrot serial]:\t\t[%.3f] million cycles\n", minSerial);
    writePPM(buf, width, height, "mandelbrot-serial.ppm");

    printf("\t\t\t\t(%.2fx speedup from ISPC + tasks)\n", minSerial/minISPC);

    delete[] buf;   // was leaked before
    return 0;
}

View File

@@ -0,0 +1,99 @@
/*
Copyright (c) 2010-2012, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Iterate z = z*z + c starting from z = c and return the number of iterations
   performed before |z|^2 exceeded 4, or `count` if that never happened. */
static inline int
mandel(float c_re, float c_im, int count) {
    float re = c_re, im = c_im;
    int iter;
    for (iter = 0; iter < count; ++iter) {
        // Escape test on the squared magnitude.
        if (re * re + im * im > 4.)
            break;

        float next_re = re*re - im*im;
        float next_im = 2.f * re * im;
        // The update runs for every program instance, ignoring the current
        // mask.  Instances that already left the loop only return `iter`,
        // so their stale re/im values are never observed.
        unmasked {
            re = c_re + next_re;
            im = c_im + next_im;
        }
    }
    return iter;
}
/* Task to compute the Mandelbrot iterations for one xspan-by-yspan tile of
   the image; the tile is selected by (taskIndex0, taskIndex1).
*/
// Fills output[] for the image tile owned by this task.  The tile starts at
// (taskIndex0 * xspan, taskIndex1 * yspan) and is clamped to the image
// bounds, so tiles on the right/bottom edge may be smaller than
// xspan x yspan.  Pixels are stored row-major at output[yi * width + xi].
task void
mandelbrot_scanline(uniform float x0, uniform float dx,
                    uniform float y0, uniform float dy,
                    uniform int width, uniform int height,
                    uniform int xspan, uniform int yspan,
                    uniform int maxIterations, uniform int output[]) {
    // Tile origin, then the exclusive end of the tile clamped to the image.
    const uniform int tileX0 = taskIndex0 * xspan;
    const uniform int tileY0 = taskIndex1 * yspan;
    const uniform int tileX1 = min(tileX0 + xspan, width);
    const uniform int tileY1 = min(tileY0 + yspan, height);

    foreach (row = tileY0 ... tileY1, col = tileX0 ... tileX1) {
        // Map the pixel to a point in the complex plane.
        float re = x0 + col * dx;
        float im = y0 + row * dy;
        output[row * width + col] = mandel(re, im, maxIterations);
    }
}
#if 1
// Host-callable entry point: renders the region [x0,x1] x [y0,y1] of the
// complex plane into output[] (width*height ints, row-major) by launching
// one mandelbrot_scanline task per xspan-by-yspan tile.
export void
mandelbrot_ispc(uniform float x0, uniform float y0,
                uniform float x1, uniform float y1,
                uniform int width, uniform int height,
                uniform int maxIterations, uniform int output[]) {
    uniform float dx = (x1 - x0) / width;
    uniform float dy = (y1 - y0) / height;
    const uniform int xspan = 16; /* make sure it is big enough to avoid false-sharing */
    const uniform int yspan = 16;

    // Round the tile counts up.  The truncating width/xspan used before
    // skipped the partial tiles on the right and bottom edges whenever the
    // image size was not a multiple of the span; the task already clamps
    // its tile to the image, so launching the extra tiles is safe.
    const uniform int xtiles = (width + xspan - 1) / xspan;
    const uniform int ytiles = (height + yspan - 1) / yspan;

    // Nothing to render for an empty image; avoid an empty launch.
    if (xtiles <= 0 || ytiles <= 0)
        return;

#if 1
    launch [xtiles, ytiles]
#else
    launch [ytiles][xtiles]
#endif
    mandelbrot_scanline(x0, dx, y0, dy, width, height, xspan, yspan,
                        maxIterations, output);
}
#endif

View File

@@ -0,0 +1,68 @@
/*
Copyright (c) 2010-2011, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Iterate z = z*z + c starting from z = c.  Returns how many iterations were
 * completed before |z|^2 exceeded 4, or `count` if the point never escaped
 * within `count` iterations. */
static int mandel(float c_re, float c_im, int count) {
    float re = c_re;
    float im = c_im;
    int iter = 0;
    // Written as !(... > 4) so a NaN magnitude keeps iterating, exactly as
    // a "break if > 4" test would.
    while (iter < count && !(re * re + im * im > 4.f)) {
        const float next_re = re*re - im*im;
        const float next_im = 2.f * re * im;
        re = c_re + next_re;
        im = c_im + next_im;
        ++iter;
    }
    return iter;
}
/* Serial reference renderer.  Samples the rectangle [x0,x1) x [y0,y1) on a
 * width x height grid and stores the iteration count of every sample in
 * output[], row-major (output[row * width + col]). */
void mandelbrot_serial(float x0, float y0, float x1, float y1,
                       int width, int height, int maxIterations,
                       int output[])
{
    const float dx = (x1 - x0) / width;
    const float dy = (y1 - y0) / height;

    for (int row = 0; row < height; ++row) {
        // The imaginary coordinate is constant along a row.
        const float y = y0 + row * dy;
        for (int col = 0; col < width; ++col) {
            const float x = x0 + col * dx;
            output[row * width + col] = mandel(x, y, maxIterations);
        }
    }
}

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@@ -0,0 +1,8 @@
extern uniform int foo1();
extern uniform int data[1024];
// Zeroes the element of the external `data` array selected by foo1().
// NOTE(review): the index returned by foo1() is not range-checked against
// the 1024-element array here.
void foo()
{
    const uniform int slot = foo1();
    data[slot] = 0; //taskIndex;
}

View File

@@ -0,0 +1,103 @@
; ModuleID = 'task.bc'
target datalayout = "e-p:64:64:64-S0-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-v16:16:16-v32:32:32-n16:32:64"
target triple = "nvptx64"
@data = external global [1024 x i32]
; Function Attrs: alwaysinline nounwind readnone
; 1-wide select on i8: returns operand 0 when the single mask element is 0,
; otherwise operand 1.
define <1 x i8> @__vselect_i8(<1 x i8>, <1 x i8>, <1 x i32> %mask) #0 {
%m = extractelement <1 x i32> %mask, i32 0
; %cmp is true when the mask element is zero
%cmp = icmp eq i32 %m, 0
%d0 = extractelement <1 x i8> %0, i32 0
%d1 = extractelement <1 x i8> %1, i32 0
%sel = select i1 %cmp, i8 %d0, i8 %d1
; repack the scalar result into a 1-element vector
%r = insertelement <1 x i8> undef, i8 %sel, i32 0
ret <1 x i8> %r
}
; Function Attrs: alwaysinline nounwind readnone
; 1-wide select on i16: returns operand 0 when the single mask element is 0,
; otherwise operand 1.
define <1 x i16> @__vselect_i16(<1 x i16>, <1 x i16>, <1 x i32> %mask) #0 {
%m = extractelement <1 x i32> %mask, i32 0
; %cmp is true when the mask element is zero
%cmp = icmp eq i32 %m, 0
%d0 = extractelement <1 x i16> %0, i32 0
%d1 = extractelement <1 x i16> %1, i32 0
%sel = select i1 %cmp, i16 %d0, i16 %d1
; repack the scalar result into a 1-element vector
%r = insertelement <1 x i16> undef, i16 %sel, i32 0
ret <1 x i16> %r
}
; Function Attrs: alwaysinline nounwind readnone
; 1-wide select on i64: returns operand 0 when the single mask element is 0,
; otherwise operand 1.
define <1 x i64> @__vselect_i64(<1 x i64>, <1 x i64>, <1 x i32> %mask) #0 {
%m = extractelement <1 x i32> %mask, i32 0
; %cmp is true when the mask element is zero
%cmp = icmp eq i32 %m, 0
%d0 = extractelement <1 x i64> %0, i32 0
%d1 = extractelement <1 x i64> %1, i32 0
%sel = select i1 %cmp, i64 %d0, i64 %d1
; repack the scalar result into a 1-element vector
%r = insertelement <1 x i64> undef, i64 %sel, i32 0
ret <1 x i64> %r
}
; Function Attrs: nounwind readnone
declare double @llvm.nvvm.rsqrt.approx.d(double) #1
; Function Attrs: alwaysinline nounwind
; 4-stream AoS->SoA conversion for 1-wide float vectors.  With a single
; element per vector there is nothing to shuffle: each input %vK is stored
; unchanged to the matching %outK.
define void @__aos_to_soa4_float1(<1 x float> %v0, <1 x float> %v1, <1 x float> %v2, <1 x float> %v3, <1 x float>* noalias nocapture %out0, <1 x float>* noalias nocapture %out1, <1 x float>* noalias nocapture %out2, <1 x float>* noalias nocapture %out3) #2 {
store <1 x float> %v0, <1 x float>* %out0, align 4
store <1 x float> %v1, <1 x float>* %out1, align 4
store <1 x float> %v2, <1 x float>* %out2, align 4
store <1 x float> %v3, <1 x float>* %out3, align 4
ret void
}
; Function Attrs: alwaysinline nounwind
; 4-stream SoA->AoS conversion for 1-wide float vectors.  The body is the
; same as the AoS->SoA variant: each input %vK is stored unchanged to %outK.
define void @__soa_to_aos4_float1(<1 x float> %v0, <1 x float> %v1, <1 x float> %v2, <1 x float> %v3, <1 x float>* noalias nocapture %out0, <1 x float>* noalias nocapture %out1, <1 x float>* noalias nocapture %out2, <1 x float>* noalias nocapture %out3) #2 {
store <1 x float> %v0, <1 x float>* %out0, align 4
store <1 x float> %v1, <1 x float>* %out1, align 4
store <1 x float> %v2, <1 x float>* %out2, align 4
store <1 x float> %v3, <1 x float>* %out3, align 4
ret void
}
; Function Attrs: nounwind
; 3-stream AoS->SoA conversion for 1-wide float vectors: each input %vK is
; stored unchanged to %outK.  Unlike the 4-stream variants, the output
; pointers here are not marked noalias.
define void @__aos_to_soa3_float1(<1 x float> %v0, <1 x float> %v1, <1 x float> %v2, <1 x float>* nocapture %out0, <1 x float>* nocapture %out1, <1 x float>* nocapture %out2) #3 {
store <1 x float> %v0, <1 x float>* %out0, align 4
store <1 x float> %v1, <1 x float>* %out1, align 4
store <1 x float> %v2, <1 x float>* %out2, align 4
ret void
}
; Function Attrs: nounwind
; 3-stream SoA->AoS conversion for 1-wide float vectors: each input %vK is
; stored unchanged to %outK (same body as the AoS->SoA variant).
define void @__soa_to_aos3_float1(<1 x float> %v0, <1 x float> %v1, <1 x float> %v2, <1 x float>* nocapture %out0, <1 x float>* nocapture %out1, <1 x float>* nocapture %out2) #3 {
store <1 x float> %v0, <1 x float>* %out0, align 4
store <1 x float> %v1, <1 x float>* %out1, align 4
store <1 x float> %v2, <1 x float>* %out2, align 4
ret void
}
; Function Attrs: alwaysinline nounwind readonly
; Reciprocal square root of a 1-wide double vector: unpacks the element,
; applies the NVVM approximate rsqrt intrinsic, and repacks the result.
define <1 x double> @__rsqrt_varying_double(<1 x double> %v) #4 {
%vs = extractelement <1 x double> %v, i32 0
%rs = tail call double @llvm.nvvm.rsqrt.approx.d(double %vs)
%rv = insertelement <1 x double> undef, double %rs, i32 0
ret <1 x double> %rv
}
; Function Attrs: nounwind
declare i32 @foo1___(<1 x i32>) #5
; Function Attrs: nounwind
; Calls foo1___ with the incoming mask, uses the returned i32 (sign-extended
; to i64) as an index into the external global @data, and stores 0 there.
; The index is not range-checked against the 1024-element array.
define void @foo___(<1 x i32> %__mask) #5 {
allocas:
%calltmp = tail call i32 @foo1___(<1 x i32> %__mask)
%calltmp_to_int64 = sext i32 %calltmp to i64
; address of data[calltmp]
%data_offset = getelementptr [1024 x i32]* @data, i64 0, i64 %calltmp_to_int64
store i32 0, i32* %data_offset, align 4
ret void
}
attributes #0 = { alwaysinline nounwind readnone }
attributes #1 = { nounwind readnone }
attributes #2 = { alwaysinline nounwind }
attributes #3 = { nounwind }
attributes #4 = { alwaysinline nounwind readonly }
attributes #5 = { nounwind "target-features"="+sm_35" }

View File

@@ -0,0 +1,543 @@
//
// Generated by LLVM NVPTX Back-End
//
.version 3.1
.target sm_20, texmode_independent
.address_size 64
// .globl __land_id
// @__land_id
// Returns the value of the %laneid special register as a 32-bit result.
.func (.param .b32 func_retval0) __land_id(
)
{
.reg .pred %p<396>;
.reg .s16 %rc<396>;
.reg .s16 %rs<396>;
.reg .s32 %r<396>;
.reg .s64 %rl<396>;
.reg .f32 %f<396>;
.reg .f64 %fl<396>;
// BB#0:
mov.u32 %r0, %laneid;
st.param.b32 [func_retval0+0], %r0;
ret;
}
// .globl __vselect_i8
// 8-bit select: returns param_0 when the 32-bit mask (param_2) equals 0,
// otherwise param_1.
.func (.param .align 1 .b8 func_retval0[1]) __vselect_i8(
.param .align 1 .b8 __vselect_i8_param_0[1],
.param .align 1 .b8 __vselect_i8_param_1[1],
.param .align 4 .b8 __vselect_i8_param_2[4]
) // @__vselect_i8
{
.reg .pred %p<396>;
.reg .s16 %rc<396>;
.reg .s16 %rs<396>;
.reg .s32 %r<396>;
.reg .s64 %rl<396>;
.reg .f32 %f<396>;
.reg .f64 %fl<396>;
// BB#0:
ld.param.u32 %r0, [__vselect_i8_param_2];
// %p0 = (mask == 0)
setp.eq.s32 %p0, %r0, 0;
ld.param.u8 %rc0, [__vselect_i8_param_0];
ld.param.u8 %rc1, [__vselect_i8_param_1];
// the byte values live in 16-bit registers, hence the .b16 select
selp.b16 %rc0, %rc0, %rc1, %p0;
st.param.b8 [func_retval0+0], %rc0;
ret;
}
// .globl __vselect_i16
// 16-bit select: returns param_0 when the 32-bit mask (param_2) equals 0,
// otherwise param_1.
.func (.param .align 2 .b8 func_retval0[2]) __vselect_i16(
.param .align 2 .b8 __vselect_i16_param_0[2],
.param .align 2 .b8 __vselect_i16_param_1[2],
.param .align 4 .b8 __vselect_i16_param_2[4]
) // @__vselect_i16
{
.reg .pred %p<396>;
.reg .s16 %rc<396>;
.reg .s16 %rs<396>;
.reg .s32 %r<396>;
.reg .s64 %rl<396>;
.reg .f32 %f<396>;
.reg .f64 %fl<396>;
// BB#0:
ld.param.u32 %r0, [__vselect_i16_param_2];
// %p0 = (mask == 0)
setp.eq.s32 %p0, %r0, 0;
ld.param.u16 %rs0, [__vselect_i16_param_0];
ld.param.u16 %rs1, [__vselect_i16_param_1];
selp.b16 %rs0, %rs0, %rs1, %p0;
st.param.b16 [func_retval0+0], %rs0;
ret;
}
// .globl __vselect_i64
// 64-bit select: returns param_0 when the 32-bit mask (param_2) equals 0,
// otherwise param_1.
.func (.param .align 8 .b8 func_retval0[8]) __vselect_i64(
.param .align 8 .b8 __vselect_i64_param_0[8],
.param .align 8 .b8 __vselect_i64_param_1[8],
.param .align 4 .b8 __vselect_i64_param_2[4]
) // @__vselect_i64
{
.reg .pred %p<396>;
.reg .s16 %rc<396>;
.reg .s16 %rs<396>;
.reg .s32 %r<396>;
.reg .s64 %rl<396>;
.reg .f32 %f<396>;
.reg .f64 %fl<396>;
// BB#0:
ld.param.u32 %r0, [__vselect_i64_param_2];
// %p0 = (mask == 0)
setp.eq.s32 %p0, %r0, 0;
ld.param.u64 %rl0, [__vselect_i64_param_0];
ld.param.u64 %rl1, [__vselect_i64_param_1];
selp.b64 %rl0, %rl0, %rl1, %p0;
st.param.b64 [func_retval0+0], %rl0;
ret;
}
// .globl __aos_to_soa4_float1
// 4-stream AoS->SoA conversion for 1-wide floats: stores the f32 values in
// param_0..param_3 to the 64-bit addresses in param_4..param_7, in order.
.func __aos_to_soa4_float1(
.param .align 4 .b8 __aos_to_soa4_float1_param_0[4],
.param .align 4 .b8 __aos_to_soa4_float1_param_1[4],
.param .align 4 .b8 __aos_to_soa4_float1_param_2[4],
.param .align 4 .b8 __aos_to_soa4_float1_param_3[4],
.param .b64 __aos_to_soa4_float1_param_4,
.param .b64 __aos_to_soa4_float1_param_5,
.param .b64 __aos_to_soa4_float1_param_6,
.param .b64 __aos_to_soa4_float1_param_7
) // @__aos_to_soa4_float1
{
.reg .pred %p<396>;
.reg .s16 %rc<396>;
.reg .s16 %rs<396>;
.reg .s32 %r<396>;
.reg .s64 %rl<396>;
.reg .f32 %f<396>;
.reg .f64 %fl<396>;
// BB#0:
// destination addresses
ld.param.u64 %rl0, [__aos_to_soa4_float1_param_4];
ld.param.u64 %rl1, [__aos_to_soa4_float1_param_5];
ld.param.u64 %rl2, [__aos_to_soa4_float1_param_6];
ld.param.u64 %rl3, [__aos_to_soa4_float1_param_7];
// source values
ld.param.f32 %f0, [__aos_to_soa4_float1_param_0];
ld.param.f32 %f1, [__aos_to_soa4_float1_param_1];
ld.param.f32 %f2, [__aos_to_soa4_float1_param_2];
ld.param.f32 %f3, [__aos_to_soa4_float1_param_3];
st.f32 [%rl0], %f0;
st.f32 [%rl1], %f1;
st.f32 [%rl2], %f2;
st.f32 [%rl3], %f3;
ret;
}
// .globl __soa_to_aos4_float1
// 4-stream SoA->AoS conversion for 1-wide floats: stores the f32 values in
// param_0..param_3 to the 64-bit addresses in param_4..param_7, in order
// (same instruction sequence as the AoS->SoA variant).
.func __soa_to_aos4_float1(
.param .align 4 .b8 __soa_to_aos4_float1_param_0[4],
.param .align 4 .b8 __soa_to_aos4_float1_param_1[4],
.param .align 4 .b8 __soa_to_aos4_float1_param_2[4],
.param .align 4 .b8 __soa_to_aos4_float1_param_3[4],
.param .b64 __soa_to_aos4_float1_param_4,
.param .b64 __soa_to_aos4_float1_param_5,
.param .b64 __soa_to_aos4_float1_param_6,
.param .b64 __soa_to_aos4_float1_param_7
) // @__soa_to_aos4_float1
{
.reg .pred %p<396>;
.reg .s16 %rc<396>;
.reg .s16 %rs<396>;
.reg .s32 %r<396>;
.reg .s64 %rl<396>;
.reg .f32 %f<396>;
.reg .f64 %fl<396>;
// BB#0:
// destination addresses
ld.param.u64 %rl0, [__soa_to_aos4_float1_param_4];
ld.param.u64 %rl1, [__soa_to_aos4_float1_param_5];
ld.param.u64 %rl2, [__soa_to_aos4_float1_param_6];
ld.param.u64 %rl3, [__soa_to_aos4_float1_param_7];
// source values
ld.param.f32 %f0, [__soa_to_aos4_float1_param_0];
ld.param.f32 %f1, [__soa_to_aos4_float1_param_1];
ld.param.f32 %f2, [__soa_to_aos4_float1_param_2];
ld.param.f32 %f3, [__soa_to_aos4_float1_param_3];
st.f32 [%rl0], %f0;
st.f32 [%rl1], %f1;
st.f32 [%rl2], %f2;
st.f32 [%rl3], %f3;
ret;
}
// .globl __aos_to_soa3_float1
// 3-stream AoS->SoA conversion for 1-wide floats: stores the f32 values in
// param_0..param_2 to the 64-bit addresses in param_3..param_5, in order.
.func __aos_to_soa3_float1(
.param .align 4 .b8 __aos_to_soa3_float1_param_0[4],
.param .align 4 .b8 __aos_to_soa3_float1_param_1[4],
.param .align 4 .b8 __aos_to_soa3_float1_param_2[4],
.param .b64 __aos_to_soa3_float1_param_3,
.param .b64 __aos_to_soa3_float1_param_4,
.param .b64 __aos_to_soa3_float1_param_5
) // @__aos_to_soa3_float1
{
.reg .pred %p<396>;
.reg .s16 %rc<396>;
.reg .s16 %rs<396>;
.reg .s32 %r<396>;
.reg .s64 %rl<396>;
.reg .f32 %f<396>;
.reg .f64 %fl<396>;
// BB#0:
// destination addresses
ld.param.u64 %rl0, [__aos_to_soa3_float1_param_3];
ld.param.u64 %rl1, [__aos_to_soa3_float1_param_4];
ld.param.u64 %rl2, [__aos_to_soa3_float1_param_5];
// source values
ld.param.f32 %f0, [__aos_to_soa3_float1_param_0];
ld.param.f32 %f1, [__aos_to_soa3_float1_param_1];
ld.param.f32 %f2, [__aos_to_soa3_float1_param_2];
st.f32 [%rl0], %f0;
st.f32 [%rl1], %f1;
st.f32 [%rl2], %f2;
ret;
}
// .globl __soa_to_aos3_float1
// 3-stream SoA->AoS conversion for 1-wide floats: stores the f32 values in
// param_0..param_2 to the 64-bit addresses in param_3..param_5, in order
// (same instruction sequence as the AoS->SoA variant).
.func __soa_to_aos3_float1(
.param .align 4 .b8 __soa_to_aos3_float1_param_0[4],
.param .align 4 .b8 __soa_to_aos3_float1_param_1[4],
.param .align 4 .b8 __soa_to_aos3_float1_param_2[4],
.param .b64 __soa_to_aos3_float1_param_3,
.param .b64 __soa_to_aos3_float1_param_4,
.param .b64 __soa_to_aos3_float1_param_5
) // @__soa_to_aos3_float1
{
.reg .pred %p<396>;
.reg .s16 %rc<396>;
.reg .s16 %rs<396>;
.reg .s32 %r<396>;
.reg .s64 %rl<396>;
.reg .f32 %f<396>;
.reg .f64 %fl<396>;
// BB#0:
// destination addresses
ld.param.u64 %rl0, [__soa_to_aos3_float1_param_3];
ld.param.u64 %rl1, [__soa_to_aos3_float1_param_4];
ld.param.u64 %rl2, [__soa_to_aos3_float1_param_5];
// source values
ld.param.f32 %f0, [__soa_to_aos3_float1_param_0];
ld.param.f32 %f1, [__soa_to_aos3_float1_param_1];
ld.param.f32 %f2, [__soa_to_aos3_float1_param_2];
st.f32 [%rl0], %f0;
st.f32 [%rl1], %f1;
st.f32 [%rl2], %f2;
ret;
}
// .globl __rsqrt_varying_double
// Approximate reciprocal square root of the f64 argument (rsqrt.approx.f64).
.func (.param .align 8 .b8 func_retval0[8]) __rsqrt_varying_double(
.param .align 8 .b8 __rsqrt_varying_double_param_0[8]
) // @__rsqrt_varying_double
{
.reg .pred %p<396>;
.reg .s16 %rc<396>;
.reg .s16 %rs<396>;
.reg .s32 %r<396>;
.reg .s64 %rl<396>;
.reg .f32 %f<396>;
.reg .f64 %fl<396>;
// BB#0:
ld.param.f64 %fl0, [__rsqrt_varying_double_param_0];
rsqrt.approx.f64 %fl0, %fl0;
st.param.f64 [func_retval0+0], %fl0;
ret;
}
// .globl mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E_
// Compiler-generated body of the mandelbrot_scanline task.
//
// param_0 is a 64-bit address; every input is read through it:
//   +0, +4, +8, +12   four f32 values           -> %f0, %f1, %f2, %f3
//   +16 .. +32        five u32 values           -> %r4, %r8, %r9, %r7, %r0
//   +40               u64 (loaded in BB#5)      -> %rl0, base address of the stores
//   +48               u32                       -> %r5, its sign bit picks the code path
// NOTE(review): this layout presumably mirrors the ISPC task arguments
// (x0, dx, y0, dy, width, height, xspan, yspan, maxIterations, output, mask)
// -- confirm against the ISPC source.
// param_1 .. param_10 are declared but never read in this body.
//
// Structure:
//   * BB9_4 .. BB9_17 (taken when %r5 < 0): loops over rows and columns, runs
//     the escape-time iteration and stores one u32 per column.
//   * BB#1 .. BB9_30 (taken otherwise): the same loop nest, but this path
//     contains no store instructions.
.func mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E_(
.param .b64 mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_0,
.param .b32 mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_1,
.param .b32 mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_2,
.param .b32 mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_3,
.param .b32 mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_4,
.param .b32 mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_5,
.param .b32 mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_6,
.param .b32 mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_7,
.param .b32 mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_8,
.param .b32 mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_9,
.param .b32 mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_10
) // @mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E_
{
.reg .pred %p<396>;
.reg .s16 %rc<396>;
.reg .s16 %rs<396>;
.reg .s32 %r<396>;
.reg .s64 %rl<396>;
.reg .f32 %f<396>;
.reg .f64 %fl<396>;
// BB#0: // %allocas
ld.param.u64 %rl0, [mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_0];
ld.f32 %f0, [%rl0];
ld.f32 %f1, [%rl0+4];
ld.f32 %f2, [%rl0+8];
ld.f32 %f3, [%rl0+12];
ld.u32 %r4, [%rl0+16];
ld.u32 %r8, [%rl0+20];
ld.u32 %r9, [%rl0+24];
ld.u32 %r7, [%rl0+28];
ld.u32 %r0, [%rl0+32];
ld.u32 %r5, [%rl0+48];
// %p0 = sign bit of %r5; consumed by the branch to BB9_4 below
setp.lt.s32 %p0, %r5, 0;
// Column range: %r1 = ctaid.x * %r9, %r2 = min(%r1 + %r9, %r4)
mov.u32 %r11, %ctaid.x;
mul.lo.s32 %r1, %r11, %r9;
mad.lo.s32 %r2, %r11, %r9, %r9;
setp.lt.s32 %p1, %r2, %r4;
selp.b32 %r2, %r2, %r4, %p1;
// Row range: %r3 = ctaid.y * %r7, %r6 = min(%r3 + %r7, %r8)
mov.u32 %r10, %ctaid.y;
mul.lo.s32 %r3, %r10, %r7;
mad.lo.s32 %r6, %r10, %r7, %r7;
setp.lt.s32 %p1, %r6, %r8;
selp.b32 %r6, %r6, %r8, %p1;
@%p0 bra BB9_4;
// BB#1: // %for_test101.preheader
// Path without stores (sign bit of %r5 clear).  Exit if the row range is empty.
setp.ge.s32 %p0, %r3, %r6;
@%p0 bra BB9_31;
// BB#2: // %for_test112.preheader.lr.ph
// %r4 = (%r0 > 0) ? -1 : 0, then combined with %r5 into %r6
setp.gt.s32 %p0, %r0, 0;
selp.b32 %r4, -1, 0, %p0;
and.b32 %r6, %r4, %r5;
// %r7 = ~max(~%r8, ~((ctaid.y + 1) * %r7)), i.e. the row end bound
// min(%r8, (ctaid.y + 1) * %r7) recomputed for the loop exit test
not.b32 %r8, %r8;
add.s32 %r9, %r10, 1;
mul.lo.s32 %r7, %r7, %r9;
not.b32 %r7, %r7;
setp.gt.s32 %p0, %r8, %r7;
selp.b32 %r7, %r8, %r7, %p0;
not.b32 %r7, %r7;
BB9_3: // %for_test112.preheader
// =>This Loop Header: Depth=1
// Child Loop BB9_29 Depth 2
// Child Loop BB9_28 Depth 2
// Child Loop BB9_23 Depth 3
setp.ge.s32 %p0, %r1, %r2;
@%p0 bra BB9_30;
// BB#21: // %for_loop114.lr.ph
// in Loop: Header=BB9_3 Depth=1
setp.lt.s32 %p0, %r6, 0;
mov.u32 %r8, %r1;
@%p0 bra BB9_22;
bra.uni BB9_29;
BB9_22: // in Loop: Header=BB9_3 Depth=1
// %f4 = %f3 * (float)row + %f2
cvt.rn.f32.s32 %f4, %r3;
fma.rn.f32 %f4, %f3, %f4, %f2;
mov.u32 %r8, %r1;
BB9_28: // %for_loop.i.lr.ph.us
// Parent Loop BB9_3 Depth=1
// => This Loop Header: Depth=2
// Child Loop BB9_23 Depth 3
// %f5 = %f1 * (float)(laneid + column base) + %f0
mov.u32 %r9, %laneid;
add.s32 %r9, %r9, %r8;
cvt.rn.f32.s32 %f5, %r9;
fma.rn.f32 %f5, %f1, %f5, %f0;
mov.u32 %r9, 0;
mov.u32 %r12, %r4;
mov.u32 %r10, %r9;
mov.u32 %r11, %r9;
mov.f32 %f7, %f5;
mov.f32 %f6, %f4;
BB9_23: // %for_loop.i.us
// Parent Loop BB9_3 Depth=1
// Parent Loop BB9_28 Depth=2
// => This Inner Loop Header: Depth=3
and.b32 %r13, %r12, %r5;
// escape test: %f7*%f7 + %f6*%f6 compared against 4.0f (0f40800000)
mul.f32 %f8, %f7, %f7;
fma.rn.f32 %f9, %f6, %f6, %f8;
setp.gtu.f32 %p0, %f9, 0f40800000;
selp.b32 %r14, %r12, 0, %p0;
or.b32 %r10, %r14, %r10;
and.b32 %r14, %r10, %r5;
shr.u32 %r14, %r14, 31;
shr.u32 %r13, %r13, 31;
setp.eq.s32 %p0, %r14, %r13;
@%p0 bra BB9_24;
bra.uni BB9_25;
BB9_24: // in Loop: Header=BB9_23 Depth=3
mov.u32 %r12, %r9;
bra.uni BB9_26;
BB9_25: // %not_all_continued_or_breaked.i.us
// in Loop: Header=BB9_23 Depth=3
// z update: new %f7 = %f5 + (%f7^2 - %f6^2), new %f6 = %f6 * (2*%f7) + %f4
mul.f32 %f9, %f6, %f6;
not.b32 %r13, %r10;
and.b32 %r12, %r12, %r13;
sub.f32 %f8, %f8, %f9;
add.f32 %f8, %f5, %f8;
add.f32 %f7, %f7, %f7;
fma.rn.f32 %f6, %f6, %f7, %f4;
mov.f32 %f7, %f8;
BB9_26: // %for_step.i.us
// in Loop: Header=BB9_23 Depth=3
// %r11 counts iterations; it advances only while %r12 is non-zero and the
// loop continues while %r11 < %r0
setp.ne.s32 %p0, %r12, 0;
selp.u32 %r13, 1, 0, %p0;
add.s32 %r11, %r11, %r13;
setp.lt.s32 %p0, %r11, %r0;
selp.b32 %r12, %r12, 0, %p0;
and.b32 %r13, %r12, %r5;
setp.lt.s32 %p0, %r13, 0;
@%p0 bra BB9_23;
// BB#27: // %if_exit156.us
// in Loop: Header=BB9_28 Depth=2
// columns advance 32 at a time
add.s32 %r8, %r8, 32;
setp.lt.s32 %p0, %r8, %r2;
@%p0 bra BB9_28;
bra.uni BB9_30;
BB9_29: // %if_exit156
// Parent Loop BB9_3 Depth=1
// => This Inner Loop Header: Depth=2
add.s32 %r8, %r8, 32;
setp.lt.s32 %p0, %r8, %r2;
@%p0 bra BB9_29;
BB9_30: // %for_exit115
// in Loop: Header=BB9_3 Depth=1
add.s32 %r3, %r3, 1;
setp.eq.s32 %p0, %r3, %r7;
@%p0 bra BB9_31;
bra.uni BB9_3;
BB9_4: // %for_test.preheader
// Storing path (sign bit of %r5 set).  Exit if the row range is empty.
setp.ge.s32 %p0, %r3, %r6;
@%p0 bra BB9_31;
// BB#5: // %for_test40.preheader.lr.ph
// %rl0 is reloaded with the u64 at +40 and serves as the store base below
ld.u64 %rl0, [%rl0+40];
// %r5 is overwritten here: %r5 = (%r0 > 0) ? -1 : 0
setp.gt.s32 %p0, %r0, 0;
selp.b32 %r5, -1, 0, %p0;
// %r6 = ~max(~%r8, ~((ctaid.y + 1) * %r7)), the row end bound
not.b32 %r6, %r8;
add.s32 %r8, %r10, 1;
mul.lo.s32 %r8, %r7, %r8;
not.b32 %r8, %r8;
setp.gt.s32 %p0, %r6, %r8;
selp.b32 %r6, %r6, %r8, %p0;
not.b32 %r6, %r6;
// %r7 = (ctaid.y * %r7) * %r4 + ctaid.x * %r9: element index of the first
// row/column handled here; advanced by %r4 per row in BB9_17
mul.lo.s32 %r7, %r10, %r7;
mul.lo.s32 %r7, %r7, %r4;
mad.lo.s32 %r7, %r11, %r9, %r7;
mov.u32 %r13, 0;
BB9_6: // %for_test40.preheader
// =>This Loop Header: Depth=1
// Child Loop BB9_19 Depth 2
// Child Loop BB9_12 Depth 2
// Child Loop BB9_13 Depth 3
setp.ge.s32 %p0, %r1, %r2;
@%p0 bra BB9_17;
// BB#7: // %for_loop42.lr.ph
// in Loop: Header=BB9_6 Depth=1
// %r5 < 0 (i.e. %r0 > 0) -> iterate in BB9_8; otherwise BB9_19 stores zeros
setp.lt.s32 %p0, %r5, 0;
mov.u32 %r8, %r7;
mov.u32 %r9, %r1;
@%p0 bra BB9_8;
bra.uni BB9_19;
BB9_8: // in Loop: Header=BB9_6 Depth=1
// %f4 = %f3 * (float)row + %f2; %r8 = row * %r4 (row offset in elements)
cvt.rn.f32.s32 %f4, %r3;
mul.lo.s32 %r8, %r3, %r4;
fma.rn.f32 %f4, %f3, %f4, %f2;
mov.u32 %r9, %r1;
BB9_12: // %for_loop.i206.lr.ph.us
// Parent Loop BB9_6 Depth=1
// => This Loop Header: Depth=2
// Child Loop BB9_13 Depth 3
// %r12 = laneid + column base; %f5 = %f1 * (float)%r12 + %f0
mov.u32 %r10, %laneid;
add.s32 %r12, %r10, %r9;
cvt.rn.f32.s32 %f5, %r12;
fma.rn.f32 %f5, %f1, %f5, %f0;
mov.u32 %r15, %r5;
mov.u32 %r14, %r13;
mov.u32 %r11, %r13;
mov.f32 %f7, %f5;
mov.f32 %f6, %f4;
BB9_13: // %for_loop.i206.us
// Parent Loop BB9_6 Depth=1
// Parent Loop BB9_12 Depth=2
// => This Inner Loop Header: Depth=3
// escape test: %f7*%f7 + %f6*%f6 compared against 4.0f (0f40800000)
mul.f32 %f8, %f7, %f7;
fma.rn.f32 %f9, %f6, %f6, %f8;
setp.gtu.f32 %p0, %f9, 0f40800000;
selp.b32 %r16, %r15, 0, %p0;
or.b32 %r14, %r16, %r14;
shr.u32 %r16, %r14, 31;
shr.u32 %r17, %r15, 31;
setp.eq.s32 %p0, %r16, %r17;
@%p0 bra BB9_14;
bra.uni BB9_15;
BB9_14: // in Loop: Header=BB9_13 Depth=3
mov.u32 %r15, %r13;
bra.uni BB9_16;
BB9_15: // %not_all_continued_or_breaked.i220.us
// in Loop: Header=BB9_13 Depth=3
// z update: new %f7 = %f5 + (%f7^2 - %f6^2), new %f6 = %f6 * (2*%f7) + %f4
mul.f32 %f9, %f6, %f6;
not.b32 %r16, %r14;
and.b32 %r15, %r15, %r16;
sub.f32 %f8, %f8, %f9;
add.f32 %f8, %f5, %f8;
add.f32 %f7, %f7, %f7;
fma.rn.f32 %f6, %f6, %f7, %f4;
mov.f32 %f7, %f8;
BB9_16: // %for_step.i189.us
// in Loop: Header=BB9_13 Depth=3
// %r11 counts iterations; it advances only while %r15 is non-zero and the
// loop continues while %r11 < %r0
setp.ne.s32 %p0, %r15, 0;
selp.u32 %r16, 1, 0, %p0;
add.s32 %r11, %r11, %r16;
setp.lt.s32 %p0, %r11, %r0;
selp.b32 %r15, %r15, 0, %p0;
setp.lt.s32 %p0, %r15, 0;
@%p0 bra BB9_13;
// BB#9: // %mandel___vyfvyfvyi.exit221.us
// in Loop: Header=BB9_12 Depth=2
// skip the store when this lane's column (%r12) is at or past the end bound %r2
setp.ge.s32 %p0, %r12, %r2;
@%p0 bra BB9_11;
// BB#10: // %if_then.us
// in Loop: Header=BB9_12 Depth=2
// store the iteration count %r11 at %rl0 + 4 * (column base + row offset + laneid)
add.s32 %r12, %r9, %r8;
add.s32 %r10, %r12, %r10;
shl.b32 %r10, %r10, 2;
cvt.s64.s32 %rl1, %r10;
add.s64 %rl1, %rl1, %rl0;
st.u32 [%rl1], %r11;
BB9_11: // %if_exit.us
// in Loop: Header=BB9_12 Depth=2
// columns advance 32 at a time
add.s32 %r9, %r9, 32;
setp.lt.s32 %p0, %r9, %r2;
@%p0 bra BB9_12;
bra.uni BB9_17;
BB9_19: // %mandel___vyfvyfvyi.exit221
// Parent Loop BB9_6 Depth=1
// => This Inner Loop Header: Depth=2
// Reached when %r0 <= 0: no iteration is run and 0 is stored for every
// in-range column.
mov.u32 %r10, %laneid;
add.s32 %r11, %r10, %r9;
setp.lt.s32 %p0, %r11, %r2;
@%p0 bra BB9_20;
bra.uni BB9_18;
BB9_20: // %if_then
// in Loop: Header=BB9_19 Depth=2
add.s32 %r10, %r10, %r8;
shl.b32 %r10, %r10, 2;
cvt.s64.s32 %rl1, %r10;
add.s64 %rl1, %rl1, %rl0;
mov.u32 %r10, 0;
st.u32 [%rl1], %r10;
BB9_18: // %if_exit
// in Loop: Header=BB9_19 Depth=2
add.s32 %r9, %r9, 32;
add.s32 %r8, %r8, 32;
setp.lt.s32 %p0, %r9, %r2;
@%p0 bra BB9_19;
BB9_17: // %for_exit43
// in Loop: Header=BB9_6 Depth=1
// next row: bump the row counter and advance the running index by %r4
add.s32 %r3, %r3, 1;
add.s32 %r7, %r7, %r4;
setp.eq.s32 %p0, %r3, %r6;
@%p0 bra BB9_31;
bra.uni BB9_6;
BB9_31: // %for_exit
ret;
}

View File

@@ -0,0 +1,284 @@
//
// Generated by LLVM NVPTX Back-End
//
// Module-wide settings: PTX ISA version 3.1, code generated for sm_35 with
// texmode_independent, and 64-bit addresses.
.version 3.1
.target sm_35, texmode_independent
.address_size 64
// .globl __vselect_i8
// Prototype only (terminated by the ';' below, no body in this module) for
// foo1___: takes one 4-byte parameter and returns a 32-bit value.
// It is called from foo___ further down.
.func (.param .b32 func_retval0) foo1___
(
.param .align 4 .b8 foo1____param_0[4]
)
;
// 4096-byte, 4-byte-aligned global array that is defined in another module.
.extern .global .align 4 .b8 data[4096];
// @__vselect_i8
// Byte select driven by a single 32-bit mask word.
//   param_0 : value returned when the mask word is zero
//   param_1 : value returned for any non-zero mask word
//   param_2 : 32-bit mask word
// The byte operands travel in 16-bit registers; only the low byte is
// written back to the return slot.
.func (.param .align 1 .b8 func_retval0[1]) __vselect_i8(
    .param .align 1 .b8 __vselect_i8_param_0[1],
    .param .align 1 .b8 __vselect_i8_param_1[1],
    .param .align 4 .b8 __vselect_i8_param_2[4]
)
{
    .reg .pred %mask_is_zero;
    .reg .s16 %when_zero, %when_set, %chosen;
    .reg .s32 %mask;

    // Pull in both candidates and the mask word.
    ld.param.u8 %when_zero, [__vselect_i8_param_0];
    ld.param.u8 %when_set, [__vselect_i8_param_1];
    ld.param.u32 %mask, [__vselect_i8_param_2];

    // selp picks its first source when the predicate is true.
    setp.eq.s32 %mask_is_zero, %mask, 0;
    selp.b16 %chosen, %when_zero, %when_set, %mask_is_zero;

    st.param.b8 [func_retval0+0], %chosen;
    ret;
}
// .globl __vselect_i16
// 16-bit select driven by a single 32-bit mask word.
//   param_0 : value returned when the mask word is zero
//   param_1 : value returned for any non-zero mask word
//   param_2 : 32-bit mask word
.func (.param .align 2 .b8 func_retval0[2]) __vselect_i16(
    .param .align 2 .b8 __vselect_i16_param_0[2],
    .param .align 2 .b8 __vselect_i16_param_1[2],
    .param .align 4 .b8 __vselect_i16_param_2[4]
) // @__vselect_i16
{
    .reg .pred %mask_is_zero;
    .reg .s16 %when_zero, %when_set, %chosen;
    .reg .s32 %mask;

    // Pull in both candidates and the mask word.
    ld.param.u16 %when_zero, [__vselect_i16_param_0];
    ld.param.u16 %when_set, [__vselect_i16_param_1];
    ld.param.u32 %mask, [__vselect_i16_param_2];

    // selp picks its first source when the predicate is true.
    setp.eq.s32 %mask_is_zero, %mask, 0;
    selp.b16 %chosen, %when_zero, %when_set, %mask_is_zero;

    st.param.b16 [func_retval0+0], %chosen;
    ret;
}
// .globl __vselect_i64
// 64-bit select driven by a single 32-bit mask word.
//   param_0 : value returned when the mask word is zero
//   param_1 : value returned for any non-zero mask word
//   param_2 : 32-bit mask word
.func (.param .align 8 .b8 func_retval0[8]) __vselect_i64(
    .param .align 8 .b8 __vselect_i64_param_0[8],
    .param .align 8 .b8 __vselect_i64_param_1[8],
    .param .align 4 .b8 __vselect_i64_param_2[4]
) // @__vselect_i64
{
    .reg .pred %mask_is_zero;
    .reg .s32 %mask;
    .reg .s64 %when_zero, %when_set, %chosen;

    // Pull in both candidates and the mask word.
    ld.param.u64 %when_zero, [__vselect_i64_param_0];
    ld.param.u64 %when_set, [__vselect_i64_param_1];
    ld.param.u32 %mask, [__vselect_i64_param_2];

    // selp picks its first source when the predicate is true.
    setp.eq.s32 %mask_is_zero, %mask, 0;
    selp.b64 %chosen, %when_zero, %when_set, %mask_is_zero;

    st.param.b64 [func_retval0+0], %chosen;
    ret;
}
// .globl __aos_to_soa4_float1
// Four-stream shuffle for 4-byte float operands: each of the four input
// values (param_0..param_3) is written unchanged through the matching
// destination pointer (param_4..param_7).  Nothing is returned.
.func __aos_to_soa4_float1(
    .param .align 4 .b8 __aos_to_soa4_float1_param_0[4],
    .param .align 4 .b8 __aos_to_soa4_float1_param_1[4],
    .param .align 4 .b8 __aos_to_soa4_float1_param_2[4],
    .param .align 4 .b8 __aos_to_soa4_float1_param_3[4],
    .param .b64 __aos_to_soa4_float1_param_4,
    .param .b64 __aos_to_soa4_float1_param_5,
    .param .b64 __aos_to_soa4_float1_param_6,
    .param .b64 __aos_to_soa4_float1_param_7
) // @__aos_to_soa4_float1
{
    .reg .f32 %val<4>;
    .reg .s64 %dst<4>;

    // Gather every value next to the pointer it will be written through.
    ld.param.f32 %val0, [__aos_to_soa4_float1_param_0];
    ld.param.u64 %dst0, [__aos_to_soa4_float1_param_4];
    ld.param.f32 %val1, [__aos_to_soa4_float1_param_1];
    ld.param.u64 %dst1, [__aos_to_soa4_float1_param_5];
    ld.param.f32 %val2, [__aos_to_soa4_float1_param_2];
    ld.param.u64 %dst2, [__aos_to_soa4_float1_param_6];
    ld.param.f32 %val3, [__aos_to_soa4_float1_param_3];
    ld.param.u64 %dst3, [__aos_to_soa4_float1_param_7];

    // Stores stay in stream order 0..3.
    st.f32 [%dst0], %val0;
    st.f32 [%dst1], %val1;
    st.f32 [%dst2], %val2;
    st.f32 [%dst3], %val3;
    ret;
}
// .globl __soa_to_aos4_float1
// Inverse four-stream shuffle for 4-byte float operands.  With one element
// per stream the body is the same as the forward direction: each input
// value (param_0..param_3) is written unchanged through the matching
// destination pointer (param_4..param_7).  Nothing is returned.
.func __soa_to_aos4_float1(
    .param .align 4 .b8 __soa_to_aos4_float1_param_0[4],
    .param .align 4 .b8 __soa_to_aos4_float1_param_1[4],
    .param .align 4 .b8 __soa_to_aos4_float1_param_2[4],
    .param .align 4 .b8 __soa_to_aos4_float1_param_3[4],
    .param .b64 __soa_to_aos4_float1_param_4,
    .param .b64 __soa_to_aos4_float1_param_5,
    .param .b64 __soa_to_aos4_float1_param_6,
    .param .b64 __soa_to_aos4_float1_param_7
) // @__soa_to_aos4_float1
{
    .reg .f32 %val<4>;
    .reg .s64 %dst<4>;

    // Gather every value next to the pointer it will be written through.
    ld.param.f32 %val0, [__soa_to_aos4_float1_param_0];
    ld.param.u64 %dst0, [__soa_to_aos4_float1_param_4];
    ld.param.f32 %val1, [__soa_to_aos4_float1_param_1];
    ld.param.u64 %dst1, [__soa_to_aos4_float1_param_5];
    ld.param.f32 %val2, [__soa_to_aos4_float1_param_2];
    ld.param.u64 %dst2, [__soa_to_aos4_float1_param_6];
    ld.param.f32 %val3, [__soa_to_aos4_float1_param_3];
    ld.param.u64 %dst3, [__soa_to_aos4_float1_param_7];

    // Stores stay in stream order 0..3.
    st.f32 [%dst0], %val0;
    st.f32 [%dst1], %val1;
    st.f32 [%dst2], %val2;
    st.f32 [%dst3], %val3;
    ret;
}
// .globl __aos_to_soa3_float1
// Three-stream shuffle for 4-byte float operands: each of the three input
// values (param_0..param_2) is written unchanged through the matching
// destination pointer (param_3..param_5).  Nothing is returned.
.func __aos_to_soa3_float1(
    .param .align 4 .b8 __aos_to_soa3_float1_param_0[4],
    .param .align 4 .b8 __aos_to_soa3_float1_param_1[4],
    .param .align 4 .b8 __aos_to_soa3_float1_param_2[4],
    .param .b64 __aos_to_soa3_float1_param_3,
    .param .b64 __aos_to_soa3_float1_param_4,
    .param .b64 __aos_to_soa3_float1_param_5
) // @__aos_to_soa3_float1
{
    .reg .f32 %val<3>;
    .reg .s64 %dst<3>;

    // Gather every value next to the pointer it will be written through.
    ld.param.f32 %val0, [__aos_to_soa3_float1_param_0];
    ld.param.u64 %dst0, [__aos_to_soa3_float1_param_3];
    ld.param.f32 %val1, [__aos_to_soa3_float1_param_1];
    ld.param.u64 %dst1, [__aos_to_soa3_float1_param_4];
    ld.param.f32 %val2, [__aos_to_soa3_float1_param_2];
    ld.param.u64 %dst2, [__aos_to_soa3_float1_param_5];

    // Keep the 0,1,2 store order: the destinations could overlap, in which
    // case the last store decides the final contents.
    st.f32 [%dst0], %val0;
    st.f32 [%dst1], %val1;
    st.f32 [%dst2], %val2;
    ret;
}
// .globl __soa_to_aos3_float1
// Inverse three-stream shuffle for 4-byte float operands.  With one element
// per stream the body is the same as the forward direction: each input
// value (param_0..param_2) is written unchanged through the matching
// destination pointer (param_3..param_5).  Nothing is returned.
.func __soa_to_aos3_float1(
    .param .align 4 .b8 __soa_to_aos3_float1_param_0[4],
    .param .align 4 .b8 __soa_to_aos3_float1_param_1[4],
    .param .align 4 .b8 __soa_to_aos3_float1_param_2[4],
    .param .b64 __soa_to_aos3_float1_param_3,
    .param .b64 __soa_to_aos3_float1_param_4,
    .param .b64 __soa_to_aos3_float1_param_5
) // @__soa_to_aos3_float1
{
    .reg .f32 %val<3>;
    .reg .s64 %dst<3>;

    // Gather every value next to the pointer it will be written through.
    ld.param.f32 %val0, [__soa_to_aos3_float1_param_0];
    ld.param.u64 %dst0, [__soa_to_aos3_float1_param_3];
    ld.param.f32 %val1, [__soa_to_aos3_float1_param_1];
    ld.param.u64 %dst1, [__soa_to_aos3_float1_param_4];
    ld.param.f32 %val2, [__soa_to_aos3_float1_param_2];
    ld.param.u64 %dst2, [__soa_to_aos3_float1_param_5];

    // Keep the 0,1,2 store order: the destinations could overlap, in which
    // case the last store decides the final contents.
    st.f32 [%dst0], %val0;
    st.f32 [%dst1], %val1;
    st.f32 [%dst2], %val2;
    ret;
}
// .globl __rsqrt_varying_double
// Approximate reciprocal square root of one double-precision value.
//   param_0 : input value (8 bytes)
//   returns : rsqrt.approx.f64 of the input
.func (.param .align 8 .b8 func_retval0[8]) __rsqrt_varying_double(
    .param .align 8 .b8 __rsqrt_varying_double_param_0[8]
) // @__rsqrt_varying_double
{
    .reg .f64 %x, %inv_root;

    ld.param.f64 %x, [__rsqrt_varying_double_param_0];
    rsqrt.approx.f64 %inv_root, %x;
    st.param.f64 [func_retval0+0], %inv_root;
    ret;
}
// .globl foo___
// foo___: passes its single 32-bit argument to foo1___, treats the returned
// value as a signed 32-bit word index, and stores 0 to the extern global
// array `data` at byte offset 4 * index.  Returns nothing.
// NOTE(review): no range check on the returned index is visible here, while
// `data` is declared as 4096 bytes -- confirm foo1___ guarantees the range.
.func foo___(
.param .align 4 .b8 foo____param_0[4]
) // @foo___
{
.reg .pred %p<396>;
.reg .s16 %rc<396>;
.reg .s16 %rs<396>;
.reg .s32 %r<396>;
.reg .s64 %rl<396>;
.reg .f32 %f<396>;
.reg .f64 %fl<396>;
// BB#0: // %allocas
// %rl0 = generic-space address of `data` (cvta converts from .global).
mov.u64 %rl0, data;
cvta.global.u64 %rl0, %rl0;
ld.param.u32 %r0, [foo____param_0];
// Callseq Start 0
// The braces open a scope so the param0/retval0 declarations stay local to
// this call sequence.
{
.reg .b32 temp_param_reg;
// <end>}
.param .b32 param0;
st.param.b32 [param0+0], %r0;
.param .b32 retval0;
call.uni (retval0),
foo1___,
(
param0
);
// %r0 is reused: from here on it holds foo1___'s return value.
ld.param.b32 %r0, [retval0+0];
//{
}// Callseq End 0
mov.u32 %r1, 0;
// Sign-extending 32x32->64 multiply: %rl1 = byte offset of word %r0.
mul.wide.s32 %rl1, %r0, 4;
add.s64 %rl0, %rl0, %rl1;
// Write the zero through the generic address computed above.
st.u32 [%rl0], %r1;
ret;
}

Binary file not shown.

View File

@@ -0,0 +1,61 @@
// Short names for the x and y coordinates of the current thread block.
#define blockIndex0 (blockIdx.x)
#define blockIndex1 (blockIdx.y)
// Number of threads that cooperate on one run of consecutive pixels.
// Must be a power of two: vectorIndex below masks with (vectorWidth-1).
#define vectorWidth (32)
// Position of this thread inside its vectorWidth-wide group (0..vectorWidth-1).
#define vectorIndex (threadIdx.x & (vectorWidth-1))
// Pair of floats; only referenced by the disabled (#if 0) variant of the
// mandelbrot_scanline parameter list below.
struct fooS {float x, y;};
// Escape-time iteration count for the point c = (c_re, c_im).
//
// Starting from z = c, repeats z <- z*z + c until |z|^2 exceeds 4 or
// `count` iterations have been performed.  Returns the number of completed
// iterations (0..count); a non-positive `count` yields 0.
int __device__ __forceinline__
mandel(float c_re, float c_im, int count)
{
    float re = c_re;
    float im = c_im;
    int iter = 0;

    // The escape test is written as !(... > 4) so that a NaN magnitude keeps
    // iterating, exactly like an `if (... > 4) break;` inside the loop would.
    while (iter < count && !(re * re + im * im > 4.0f)) {
        const float next_re = re * re - im * im;
        const float next_im = 2.0f * re * im;
        re = c_re + next_re;
        im = c_im + next_im;
        ++iter;
    }
    return iter;
}
// Computes Mandelbrot iteration counts for one tile of the image.
//
// The thread block at (blockIndex0, blockIndex1) covers columns
// [blockIndex0*xspan, min(.. + xspan, width)) and rows
// [blockIndex1*yspan, min(.. + yspan, height)).  Within a row the columns
// are visited vectorWidth at a time, with each thread taking the column at
// offset vectorIndex.
//
//   x0, dx        : real coordinate of column 0 and the step per column
//   y0, dy        : imaginary coordinate of row 0 and the step per row
//   width, height : image size in pixels
//   xspan, yspan  : tile size handled by one thread block
//   maxIterations : iteration cap handed to mandel()
//   output        : row-major buffer, written at [row * width + column]
__global__ void mandelbrot_scanline(
#if 0
        fooS xin, fooS dxin,
#else
        float x0, float dx,
        float y0, float dy,
#endif
        int width, int height,
        int xspan, int yspan,
        int maxIterations, int output[])
{
#if 0
    const float x0 = xin.x;
    const float y0 = xin.y;
    const float dx = dxin.x;
    const float dy = dxin.y;
#endif
    // threadIdx.x is unsigned, so the lane offset (and every column computed
    // from it) is kept unsigned as well.
    const unsigned int lane = vectorIndex;

    const int colBegin = blockIndex0 * xspan;
    const int colEnd   = min(colBegin + xspan, width);
    const int rowBegin = blockIndex1 * yspan;
    const int rowEnd   = min(rowBegin + yspan, height);

    for (int row = rowBegin; row < rowEnd; row++) {
        for (int colBase = colBegin; colBase < colEnd; colBase += vectorWidth) {
            const unsigned int col = colBase + lane;

            const float x = x0 + col * dx;
            const float y = y0 + row * dy;
            // Evaluated for every lane; only the store below is guarded.
            const int iterations = mandel(x, y, maxIterations);

            const int index = row * width + col;
            if (col < colEnd) {
                output[index] = iterations;
            }
        }
    }
}

Binary file not shown.

View File

@@ -0,0 +1,400 @@
; ModuleID = 'test.bc'
target datalayout = "e-p:64:64:64-S0-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-v16:16:16-v32:32:32-n16:32:64"
target triple = "nvptx64"
; Function Attrs: alwaysinline nounwind readnone
; Select between two <1 x i8> values using the single i32 mask element:
; the first operand is returned when the mask element is 0, the second
; operand otherwise.
define <1 x i8> @__vselect_i8(<1 x i8>, <1 x i8>, <1 x i32> %mask) #0 {
  %when_zero = extractelement <1 x i8> %0, i32 0
  %when_set = extractelement <1 x i8> %1, i32 0
  %mask_elt = extractelement <1 x i32> %mask, i32 0
  %mask_is_zero = icmp eq i32 %mask_elt, 0
  %chosen = select i1 %mask_is_zero, i8 %when_zero, i8 %when_set
  %packed = insertelement <1 x i8> undef, i8 %chosen, i32 0
  ret <1 x i8> %packed
}
; Function Attrs: alwaysinline nounwind readnone
; Select between two <1 x i16> values using the single i32 mask element:
; the first operand is returned when the mask element is 0, the second
; operand otherwise.
define <1 x i16> @__vselect_i16(<1 x i16>, <1 x i16>, <1 x i32> %mask) #0 {
  %when_zero = extractelement <1 x i16> %0, i32 0
  %when_set = extractelement <1 x i16> %1, i32 0
  %mask_elt = extractelement <1 x i32> %mask, i32 0
  %mask_is_zero = icmp eq i32 %mask_elt, 0
  %chosen = select i1 %mask_is_zero, i16 %when_zero, i16 %when_set
  %packed = insertelement <1 x i16> undef, i16 %chosen, i32 0
  ret <1 x i16> %packed
}
; Function Attrs: alwaysinline nounwind readnone
; Select between two <1 x i64> values using the single i32 mask element:
; the first operand is returned when the mask element is 0, the second
; operand otherwise.
define <1 x i64> @__vselect_i64(<1 x i64>, <1 x i64>, <1 x i32> %mask) #0 {
  %when_zero = extractelement <1 x i64> %0, i32 0
  %when_set = extractelement <1 x i64> %1, i32 0
  %mask_elt = extractelement <1 x i32> %mask, i32 0
  %mask_is_zero = icmp eq i32 %mask_elt, 0
  %chosen = select i1 %mask_is_zero, i64 %when_zero, i64 %when_set
  %packed = insertelement <1 x i64> undef, i64 %chosen, i32 0
  ret <1 x i64> %packed
}
; Function Attrs: nounwind readnone
declare double @llvm.nvvm.rsqrt.approx.d(double) #1
; Function Attrs: alwaysinline nounwind
; Four-stream conversion for <1 x float> vectors.  With one element per
; vector there is nothing to transpose: each input %vN is stored unchanged
; to the matching output pointer %outN (4-byte aligned stores).
; The output pointers are marked noalias.
define void @__aos_to_soa4_float1(<1 x float> %v0, <1 x float> %v1, <1 x float> %v2, <1 x float> %v3, <1 x float>* noalias nocapture %out0, <1 x float>* noalias nocapture %out1, <1 x float>* noalias nocapture %out2, <1 x float>* noalias nocapture %out3) #2 {
store <1 x float> %v0, <1 x float>* %out0, align 4
store <1 x float> %v1, <1 x float>* %out1, align 4
store <1 x float> %v2, <1 x float>* %out2, align 4
store <1 x float> %v3, <1 x float>* %out3, align 4
ret void
}
; Function Attrs: alwaysinline nounwind
; Inverse four-stream conversion for <1 x float> vectors.  The body is the
; same as @__aos_to_soa4_float1: each input %vN is stored unchanged to the
; matching output pointer %outN (4-byte aligned stores).
; The output pointers are marked noalias.
define void @__soa_to_aos4_float1(<1 x float> %v0, <1 x float> %v1, <1 x float> %v2, <1 x float> %v3, <1 x float>* noalias nocapture %out0, <1 x float>* noalias nocapture %out1, <1 x float>* noalias nocapture %out2, <1 x float>* noalias nocapture %out3) #2 {
store <1 x float> %v0, <1 x float>* %out0, align 4
store <1 x float> %v1, <1 x float>* %out1, align 4
store <1 x float> %v2, <1 x float>* %out2, align 4
store <1 x float> %v3, <1 x float>* %out3, align 4
ret void
}
; Function Attrs: nounwind
; Three-stream conversion for <1 x float> vectors.  With one element per
; vector there is nothing to transpose: each input %vN is stored unchanged
; to the matching output pointer %outN (4-byte aligned stores).
; Unlike the 4-stream variants, the output pointers are NOT marked noalias,
; so the order of the stores below is significant if they overlap.
define void @__aos_to_soa3_float1(<1 x float> %v0, <1 x float> %v1, <1 x float> %v2, <1 x float>* nocapture %out0, <1 x float>* nocapture %out1, <1 x float>* nocapture %out2) #3 {
store <1 x float> %v0, <1 x float>* %out0, align 4
store <1 x float> %v1, <1 x float>* %out1, align 4
store <1 x float> %v2, <1 x float>* %out2, align 4
ret void
}
; Function Attrs: nounwind
; Inverse three-stream conversion for <1 x float> vectors.  The body is the
; same as @__aos_to_soa3_float1: each input %vN is stored unchanged to the
; matching output pointer %outN (4-byte aligned stores).
; The output pointers are NOT marked noalias, so the order of the stores
; below is significant if they overlap.
define void @__soa_to_aos3_float1(<1 x float> %v0, <1 x float> %v1, <1 x float> %v2, <1 x float>* nocapture %out0, <1 x float>* nocapture %out1, <1 x float>* nocapture %out2) #3 {
store <1 x float> %v0, <1 x float>* %out0, align 4
store <1 x float> %v1, <1 x float>* %out1, align 4
store <1 x float> %v2, <1 x float>* %out2, align 4
ret void
}
; Function Attrs: alwaysinline nounwind readonly
; Approximate reciprocal square root of a <1 x double>: unwraps the single
; element, applies the llvm.nvvm.rsqrt.approx.d intrinsic, and wraps the
; result back into a one-element vector.
define <1 x double> @__rsqrt_varying_double(<1 x double> %v) #4 {
  %x = extractelement <1 x double> %v, i32 0
  %inv_root = tail call double @llvm.nvvm.rsqrt.approx.d(double %x)
  %packed = insertelement <1 x double> undef, double %inv_root, i32 0
  ret <1 x double> %packed
}
; Function Attrs: nounwind
declare i32 @getBlockIndex0___(<1 x i32>) #5
; Function Attrs: nounwind
declare i32 @getBlockIndex1___(<1 x i32>) #5
; Function Attrs: nounwind
declare i32 @getLaneIndex___(<1 x i32>) #5
; Function Attrs: nounwind
define void @mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E_({ float, float, float, float, i32, i32, i32, i32, i32, i32*, <1 x i32> }* noalias nocapture, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) #5 {
allocas:
%x01 = getelementptr { float, float, float, float, i32, i32, i32, i32, i32, i32*, <1 x i32> }* %0, i64 0, i32 0
%x02 = load float* %x01, align 4
%dx3 = getelementptr { float, float, float, float, i32, i32, i32, i32, i32, i32*, <1 x i32> }* %0, i64 0, i32 1
%dx4 = load float* %dx3, align 4
%y05 = getelementptr { float, float, float, float, i32, i32, i32, i32, i32, i32*, <1 x i32> }* %0, i64 0, i32 2
%y06 = load float* %y05, align 4
%dy7 = getelementptr { float, float, float, float, i32, i32, i32, i32, i32, i32*, <1 x i32> }* %0, i64 0, i32 3
%dy8 = load float* %dy7, align 4
%width9 = getelementptr { float, float, float, float, i32, i32, i32, i32, i32, i32*, <1 x i32> }* %0, i64 0, i32 4
%width10 = load i32* %width9, align 4
%height11 = getelementptr { float, float, float, float, i32, i32, i32, i32, i32, i32*, <1 x i32> }* %0, i64 0, i32 5
%height12 = load i32* %height11, align 4
%xspan13 = getelementptr { float, float, float, float, i32, i32, i32, i32, i32, i32*, <1 x i32> }* %0, i64 0, i32 6
%xspan14 = load i32* %xspan13, align 4
%yspan15 = getelementptr { float, float, float, float, i32, i32, i32, i32, i32, i32*, <1 x i32> }* %0, i64 0, i32 7
%yspan16 = load i32* %yspan15, align 4
%maxIterations17 = getelementptr { float, float, float, float, i32, i32, i32, i32, i32, i32*, <1 x i32> }* %0, i64 0, i32 8
%maxIterations18 = load i32* %maxIterations17, align 4
%output19 = getelementptr { float, float, float, float, i32, i32, i32, i32, i32, i32*, <1 x i32> }* %0, i64 0, i32 9
%output20 = load i32** %output19, align 8
%task_struct_mask = getelementptr { float, float, float, float, i32, i32, i32, i32, i32, i32*, <1 x i32> }* %0, i64 0, i32 10
%mask = load <1 x i32>* %task_struct_mask, align 4
%item.i = extractelement <1 x i32> %mask, i32 0
%cmp.i = icmp slt i32 %item.i, 0
br i1 %cmp.i, label %all_on, label %some_on
all_on: ; preds = %allocas
%calltmp = call i32 @getBlockIndex0___(<1 x i32> <i32 -1>)
%mul_calltmp_xspan_load = mul i32 %calltmp, %xspan14
%add_xstart_load_xspan_load25 = add i32 %mul_calltmp_xspan_load, %xspan14
%c.i.i = icmp slt i32 %add_xstart_load_xspan_load25, %width10
%r.i.i = select i1 %c.i.i, i32 %add_xstart_load_xspan_load25, i32 %width10
%calltmp31 = call i32 @getBlockIndex1___(<1 x i32> <i32 -1>)
%mul_calltmp31_yspan_load = mul i32 %calltmp31, %yspan16
%add_ystart_load_yspan_load32 = add i32 %mul_calltmp31_yspan_load, %yspan16
%c.i.i166 = icmp slt i32 %add_ystart_load_yspan_load32, %height12
%r.i.i167 = select i1 %c.i.i166, i32 %add_ystart_load_yspan_load32, i32 %height12
%less_yi_load_yend_load294 = icmp slt i32 %mul_calltmp31_yspan_load, %r.i.i167
br i1 %less_yi_load_yend_load294, label %for_test40.preheader.lr.ph, label %for_exit
for_test40.preheader.lr.ph: ; preds = %all_on
%less_xi_load_xend_load292 = icmp slt i32 %mul_calltmp_xspan_load, %r.i.i
%maxIterations_load_broadcast_init = insertelement <1 x i32> undef, i32 %maxIterations18, i32 0
%less_i_load_count_load.i179283 = icmp sgt <1 x i32> %maxIterations_load_broadcast_init, zeroinitializer
%"oldMask&test.i180284" = select <1 x i1> %less_i_load_count_load.i179283, <1 x i32> <i32 -1>, <1 x i32> zeroinitializer
%item.i.i181285 = extractelement <1 x i32> %"oldMask&test.i180284", i32 0
%cmp.i.i182286 = icmp slt i32 %item.i.i181285, 0
%output_load_ptr2int = ptrtoint i32* %output20 to i64
%11 = xor i32 %height12, -1
%12 = add i32 %calltmp31, 1
%13 = mul i32 %yspan16, %12
%14 = xor i32 %13, -1
%15 = icmp sgt i32 %11, %14
%smax = select i1 %15, i32 %11, i32 %14
%16 = xor i32 %smax, -1
br label %for_test40.preheader
some_on: ; preds = %allocas
%calltmp80 = call i32 @getBlockIndex0___(<1 x i32> %mask)
%mul_calltmp80_xspan_load81 = mul i32 %calltmp80, %xspan14
%add_xstart_load83_xspan_load84 = add i32 %mul_calltmp80_xspan_load81, %xspan14
%c.i.i168 = icmp slt i32 %add_xstart_load83_xspan_load84, %width10
%r.i.i169 = select i1 %c.i.i168, i32 %add_xstart_load83_xspan_load84, i32 %width10
%calltmp92 = call i32 @getBlockIndex1___(<1 x i32> %mask)
%mul_calltmp92_yspan_load93 = mul i32 %calltmp92, %yspan16
%add_ystart_load95_yspan_load96 = add i32 %mul_calltmp92_yspan_load93, %yspan16
%c.i.i170 = icmp slt i32 %add_ystart_load95_yspan_load96, %height12
%r.i.i171 = select i1 %c.i.i170, i32 %add_ystart_load95_yspan_load96, i32 %height12
%less_yi_load108_yend_load109309 = icmp slt i32 %mul_calltmp92_yspan_load93, %r.i.i171
br i1 %less_yi_load108_yend_load109309, label %for_test112.preheader.lr.ph, label %for_exit
for_test112.preheader.lr.ph: ; preds = %some_on
%less_xi_load119_xend_load120306 = icmp slt i32 %mul_calltmp80_xspan_load81, %r.i.i169
%maxIterations_load137_broadcast_init = insertelement <1 x i32> undef, i32 %maxIterations18, i32 0
%less_i_load_count_load.i296 = icmp sgt <1 x i32> %maxIterations_load137_broadcast_init, zeroinitializer
%"oldMask&test.i297" = select <1 x i1> %less_i_load_count_load.i296, <1 x i32> <i32 -1>, <1 x i32> zeroinitializer
%"internal_mask&function_mask10.i298" = and <1 x i32> %"oldMask&test.i297", %mask
%item.i.i299 = extractelement <1 x i32> %"internal_mask&function_mask10.i298", i32 0
%cmp.i.i300 = icmp slt i32 %item.i.i299, 0
%17 = xor i32 %height12, -1
%18 = add i32 %calltmp92, 1
%19 = mul i32 %yspan16, %18
%20 = xor i32 %19, -1
%21 = icmp sgt i32 %17, %20
%smax311 = select i1 %21, i32 %17, i32 %20
%22 = xor i32 %smax311, -1
br label %for_test112.preheader
for_test40.preheader: ; preds = %for_exit43, %for_test40.preheader.lr.ph
%yi.0295 = phi i32 [ %mul_calltmp31_yspan_load, %for_test40.preheader.lr.ph ], [ %yi_load74_plus1, %for_exit43 ]
br i1 %less_xi_load_xend_load292, label %for_loop42.lr.ph, label %for_exit43
for_loop42.lr.ph: ; preds = %for_test40.preheader
%yi_load52_to_float = sitofp i32 %yi.0295 to float
%mul_yi_load52_to_float_dy_load = fmul float %dy8, %yi_load52_to_float
%add_y0_load_mul_yi_load52_to_float_dy_load = fadd float %y06, %mul_yi_load52_to_float_dy_load
%add_y0_load_mul_yi_load52_to_float_dy_load_broadcast_init = insertelement <1 x float> undef, float %add_y0_load_mul_yi_load52_to_float_dy_load, i32 0
%mul_yi_load56_width_load57 = mul i32 %yi.0295, %width10
br i1 %cmp.i.i182286, label %for_loop.i204.lr.ph.us, label %mandel___vyfvyfvyi.exit219
mandel___vyfvyfvyi.exit219.us: ; preds = %for_step.i187.us
%calltmp61.us = call i32 @getLaneIndex___(<1 x i32> <i32 -1>)
%calltmp65.us = call i32 @getLaneIndex___(<1 x i32> <i32 -1>)
%add_xi_load62_calltmp65.us = add i32 %calltmp65.us, %xi.0293.us
%less_add_xi_load62_calltmp65_xend_load66.us = icmp slt i32 %add_xi_load62_calltmp65.us, %r.i.i
br i1 %less_add_xi_load62_calltmp65_xend_load66.us, label %if_then.us, label %if_exit.us
if_then.us: ; preds = %mandel___vyfvyfvyi.exit219.us
%add_xi_load58_calltmp61.us = add i32 %xi.0293.us, %mul_yi_load56_width_load57
%add_mul_yi_load56_width_load57_add_xi_load58_calltmp61.us = add i32 %add_xi_load58_calltmp61.us, %calltmp61.us
%23 = shl i32 %add_mul_yi_load56_width_load57_add_xi_load58_calltmp61.us, 2
%iptr__id.i239.rhs.us = sext i32 %23 to i64
%iptr__id.i239.us = add i64 %iptr__id.i239.rhs.us, %output_load_ptr2int
%ptr__id.i240.us = inttoptr i64 %iptr__id.i239.us to i32*
store i32 %sel.i.i266.us, i32* %ptr__id.i240.us, align 4
br label %if_exit.us
if_exit.us: ; preds = %if_then.us, %mandel___vyfvyfvyi.exit219.us
%add_xi_load73_.us = add i32 %xi.0293.us, 32
%less_xi_load_xend_load.us = icmp slt i32 %add_xi_load73_.us, %r.i.i
br i1 %less_xi_load_xend_load.us, label %for_loop.i204.lr.ph.us, label %for_exit43
for_loop.i204.us: ; preds = %for_loop.i204.lr.ph.us, %for_step.i187.us
%"oldMask&test.i180291.us" = phi <1 x i32> [ %"oldMask&test.i180284", %for_loop.i204.lr.ph.us ], [ %"oldMask&test.i180.us", %for_step.i187.us ]
%break_lanes_memory.0.i176290.us = phi <1 x i32> [ zeroinitializer, %for_loop.i204.lr.ph.us ], [ %"mask|break_mask.i195.us", %for_step.i187.us ]
%r.i.i267270289.us = phi <1 x i32> [ zeroinitializer, %for_loop.i204.lr.ph.us ], [ %r.i.i267.us, %for_step.i187.us ]
%add_x0_load_mul_add_xi_load48_calltmp51_to_float_dx_load_broadcast_init276288.us = phi <1 x float> [ %add_x0_load_mul_add_xi_load48_calltmp51_to_float_dx_load_broadcast_init.us, %for_loop.i204.lr.ph.us ], [ %add_x0_load_mul_add_xi_load48_calltmp51_to_float_dx_load_broadcast_init275.us, %for_step.i187.us ]
%add_y0_load_mul_yi_load52_to_float_dy_load_broadcast_init278287.us = phi <1 x float> [ %add_y0_load_mul_yi_load52_to_float_dy_load_broadcast_init, %for_loop.i204.lr.ph.us ], [ %add_y0_load_mul_yi_load52_to_float_dy_load_broadcast_init277.us, %for_step.i187.us ]
%mul_z_re_load_z_re_load13.i189.us = fmul <1 x float> %add_x0_load_mul_add_xi_load48_calltmp51_to_float_dx_load_broadcast_init276288.us, %add_x0_load_mul_add_xi_load48_calltmp51_to_float_dx_load_broadcast_init276288.us
%mul_z_im_load_z_im_load14.i191.us = fmul <1 x float> %add_y0_load_mul_yi_load52_to_float_dy_load_broadcast_init278287.us, %add_y0_load_mul_yi_load52_to_float_dy_load_broadcast_init278287.us
%add_mul_z_re_load_z_re_load13_mul_z_im_load_z_im_load14.i192.us = fadd <1 x float> %mul_z_im_load_z_im_load14.i191.us, %mul_z_re_load_z_re_load13.i189.us
%greater_add_mul_z_re_load_z_re_load13_mul_z_im_load_z_im_load14_.i193.us = fcmp ugt <1 x float> %add_mul_z_re_load_z_re_load13_mul_z_im_load_z_im_load14.i192.us, <float 4.000000e+00>
%"oldMask&test16.i194.us" = select <1 x i1> %greater_add_mul_z_re_load_z_re_load13_mul_z_im_load_z_im_load14_.i193.us, <1 x i32> %"oldMask&test.i180291.us", <1 x i32> zeroinitializer
%"mask|break_mask.i195.us" = or <1 x i32> %"oldMask&test16.i194.us", %break_lanes_memory.0.i176290.us
%item.i63.i197.us = extractelement <1 x i32> %"mask|break_mask.i195.us", i32 0
%v.i64.i198.us = lshr i32 %item.i63.i197.us, 31
%item.i62.i200.us = extractelement <1 x i32> %"oldMask&test.i180291.us", i32 0
%v.i.i201.us = lshr i32 %item.i62.i200.us, 31
%"equal_finished&func_internal_mask&function_mask12.i203.us" = icmp eq i32 %v.i64.i198.us, %v.i.i201.us
br i1 %"equal_finished&func_internal_mask&function_mask12.i203.us", label %for_step.i187.us, label %not_all_continued_or_breaked.i218.us
not_all_continued_or_breaked.i218.us: ; preds = %for_loop.i204.us
%"!(break|continue)_lanes.i207.us" = xor <1 x i32> %"mask|break_mask.i195.us", <i32 -1>
%new_mask28.i208.us = and <1 x i32> %"oldMask&test.i180291.us", %"!(break|continue)_lanes.i207.us"
%sub_mul_z_re_load31_z_re_load32_mul_z_im_load33_z_im_load34.i213.us = fsub <1 x float> %mul_z_re_load_z_re_load13.i189.us, %mul_z_im_load_z_im_load14.i191.us
%mul__z_re_load35.i214.us = fmul <1 x float> %add_x0_load_mul_add_xi_load48_calltmp51_to_float_dx_load_broadcast_init276288.us, <float 2.000000e+00>
%mul_mul__z_re_load35_z_im_load36.i215.us = fmul <1 x float> %add_y0_load_mul_yi_load52_to_float_dy_load_broadcast_init278287.us, %mul__z_re_load35.i214.us
%add_c_re_load42_new_re_load.i216.us = fadd <1 x float> %add_x0_load_mul_add_xi_load48_calltmp51_to_float_dx_load_broadcast_init.us, %sub_mul_z_re_load31_z_re_load32_mul_z_im_load33_z_im_load34.i213.us
%add_c_im_load44_new_im_load.i217.us = fadd <1 x float> %add_y0_load_mul_yi_load52_to_float_dy_load_broadcast_init, %mul_mul__z_re_load35_z_im_load36.i215.us
br label %for_step.i187.us
for_step.i187.us: ; preds = %not_all_continued_or_breaked.i218.us, %for_loop.i204.us
%add_y0_load_mul_yi_load52_to_float_dy_load_broadcast_init277.us = phi <1 x float> [ %add_y0_load_mul_yi_load52_to_float_dy_load_broadcast_init278287.us, %for_loop.i204.us ], [ %add_c_im_load44_new_im_load.i217.us, %not_all_continued_or_breaked.i218.us ]
%add_x0_load_mul_add_xi_load48_calltmp51_to_float_dx_load_broadcast_init275.us = phi <1 x float> [ %add_x0_load_mul_add_xi_load48_calltmp51_to_float_dx_load_broadcast_init276288.us, %for_loop.i204.us ], [ %add_c_re_load42_new_re_load.i216.us, %not_all_continued_or_breaked.i218.us ]
%internal_mask_memory.1.i184.us = phi <1 x i32> [ zeroinitializer, %for_loop.i204.us ], [ %new_mask28.i208.us, %not_all_continued_or_breaked.i218.us ]
%m.i.i262.us = extractelement <1 x i32> %internal_mask_memory.1.i184.us, i32 0
%d0.i.i264.us = extractelement <1 x i32> %r.i.i267270289.us, i32 0
%not.cmp.i.i263.us = icmp ne i32 %m.i.i262.us, 0
%d1.i.i265.us = zext i1 %not.cmp.i.i263.us to i32
%sel.i.i266.us = add i32 %d0.i.i264.us, %d1.i.i265.us
%r.i.i267.us = insertelement <1 x i32> undef, i32 %sel.i.i266.us, i32 0
%less_i_load_count_load.i179.us = icmp slt <1 x i32> %r.i.i267.us, %maxIterations_load_broadcast_init
%"oldMask&test.i180.us" = select <1 x i1> %less_i_load_count_load.i179.us, <1 x i32> %internal_mask_memory.1.i184.us, <1 x i32> zeroinitializer
%item.i.i181.us = extractelement <1 x i32> %"oldMask&test.i180.us", i32 0
%cmp.i.i182.us = icmp slt i32 %item.i.i181.us, 0
br i1 %cmp.i.i182.us, label %for_loop.i204.us, label %mandel___vyfvyfvyi.exit219.us
for_loop.i204.lr.ph.us: ; preds = %if_exit.us, %for_loop42.lr.ph
%xi.0293.us = phi i32 [ %add_xi_load73_.us, %if_exit.us ], [ %mul_calltmp_xspan_load, %for_loop42.lr.ph ]
%calltmp51.us = call i32 @getLaneIndex___(<1 x i32> <i32 -1>)
%add_xi_load48_calltmp51.us = add i32 %calltmp51.us, %xi.0293.us
%add_xi_load48_calltmp51_to_float.us = sitofp i32 %add_xi_load48_calltmp51.us to float
%mul_add_xi_load48_calltmp51_to_float_dx_load.us = fmul float %dx4, %add_xi_load48_calltmp51_to_float.us
%add_x0_load_mul_add_xi_load48_calltmp51_to_float_dx_load.us = fadd float %x02, %mul_add_xi_load48_calltmp51_to_float_dx_load.us
%add_x0_load_mul_add_xi_load48_calltmp51_to_float_dx_load_broadcast_init.us = insertelement <1 x float> undef, float %add_x0_load_mul_add_xi_load48_calltmp51_to_float_dx_load.us, i32 0
br label %for_loop.i204.us
for_exit: ; preds = %for_exit115, %for_exit43, %some_on, %all_on
ret void
mandel___vyfvyfvyi.exit219: ; preds = %if_exit, %for_loop42.lr.ph
%xi.0293 = phi i32 [ %add_xi_load73_, %if_exit ], [ %mul_calltmp_xspan_load, %for_loop42.lr.ph ]
%calltmp51 = call i32 @getLaneIndex___(<1 x i32> <i32 -1>)
%calltmp61 = call i32 @getLaneIndex___(<1 x i32> <i32 -1>)
%calltmp65 = call i32 @getLaneIndex___(<1 x i32> <i32 -1>)
%add_xi_load62_calltmp65 = add i32 %calltmp65, %xi.0293
%less_add_xi_load62_calltmp65_xend_load66 = icmp slt i32 %add_xi_load62_calltmp65, %r.i.i
br i1 %less_add_xi_load62_calltmp65_xend_load66, label %if_then, label %if_exit
for_exit43: ; preds = %if_exit, %if_exit.us, %for_test40.preheader
%yi_load74_plus1 = add i32 %yi.0295, 1
%exitcond = icmp eq i32 %yi_load74_plus1, %16
br i1 %exitcond, label %for_exit, label %for_test40.preheader
if_then: ; preds = %mandel___vyfvyfvyi.exit219
%add_xi_load58_calltmp61 = add i32 %xi.0293, %mul_yi_load56_width_load57
%add_mul_yi_load56_width_load57_add_xi_load58_calltmp61 = add i32 %add_xi_load58_calltmp61, %calltmp61
%24 = shl i32 %add_mul_yi_load56_width_load57_add_xi_load58_calltmp61, 2
%iptr__id.i239.rhs = sext i32 %24 to i64
%iptr__id.i239 = add i64 %iptr__id.i239.rhs, %output_load_ptr2int
%ptr__id.i240 = inttoptr i64 %iptr__id.i239 to i32*
store i32 0, i32* %ptr__id.i240, align 4
br label %if_exit
if_exit: ; preds = %if_then, %mandel___vyfvyfvyi.exit219
%add_xi_load73_ = add i32 %xi.0293, 32
%less_xi_load_xend_load = icmp slt i32 %add_xi_load73_, %r.i.i
br i1 %less_xi_load_xend_load, label %mandel___vyfvyfvyi.exit219, label %for_exit43
for_test112.preheader: ; preds = %for_exit115, %for_test112.preheader.lr.ph
%yi106.0310 = phi i32 [ %mul_calltmp92_yspan_load93, %for_test112.preheader.lr.ph ], [ %yi_load165_plus1, %for_exit115 ]
br i1 %less_xi_load119_xend_load120306, label %for_loop114.lr.ph, label %for_exit115
for_loop114.lr.ph: ; preds = %for_test112.preheader
%yi_load132_to_float = sitofp i32 %yi106.0310 to float
%mul_yi_load132_to_float_dy_load133 = fmul float %dy8, %yi_load132_to_float
%add_y0_load131_mul_yi_load132_to_float_dy_load133 = fadd float %y06, %mul_yi_load132_to_float_dy_load133
%add_y0_load131_mul_yi_load132_to_float_dy_load133_broadcast_init = insertelement <1 x float> undef, float %add_y0_load131_mul_yi_load132_to_float_dy_load133, i32 0
br i1 %cmp.i.i300, label %for_loop.i.lr.ph.us, label %if_exit156
if_exit156.us: ; preds = %for_step.i.us
%calltmp147.us = call i32 @getLaneIndex___(<1 x i32> %mask)
%calltmp151.us = call i32 @getLaneIndex___(<1 x i32> %mask)
%add_xi117_load_.us = add i32 %xi117.0307.us, 32
%less_xi_load119_xend_load120.us = icmp slt i32 %add_xi117_load_.us, %r.i.i169
br i1 %less_xi_load119_xend_load120.us, label %for_loop.i.lr.ph.us, label %for_exit115
for_loop.i.us: ; preds = %for_loop.i.lr.ph.us, %for_step.i.us
%"oldMask&test.i304.us" = phi <1 x i32> [ %"oldMask&test.i297", %for_loop.i.lr.ph.us ], [ %"oldMask&test.i.us", %for_step.i.us ]
%break_lanes_memory.0.i303.us = phi <1 x i32> [ zeroinitializer, %for_loop.i.lr.ph.us ], [ %"mask|break_mask.i.us", %for_step.i.us ]
%25 = phi <1 x i32> [ zeroinitializer, %for_loop.i.lr.ph.us ], [ %r.i.i236.us, %for_step.i.us ]
%add_x0_load124_mul_add_xi_load125_calltmp128_to_float_dx_load129_broadcast_init280302.us = phi <1 x float> [ %add_x0_load124_mul_add_xi_load125_calltmp128_to_float_dx_load129_broadcast_init.us, %for_loop.i.lr.ph.us ], [ %add_x0_load124_mul_add_xi_load125_calltmp128_to_float_dx_load129_broadcast_init279.us, %for_step.i.us ]
%add_y0_load131_mul_yi_load132_to_float_dy_load133_broadcast_init282301.us = phi <1 x float> [ %add_y0_load131_mul_yi_load132_to_float_dy_load133_broadcast_init, %for_loop.i.lr.ph.us ], [ %add_y0_load131_mul_yi_load132_to_float_dy_load133_broadcast_init281.us, %for_step.i.us ]
%"internal_mask&function_mask12.i.us" = and <1 x i32> %"oldMask&test.i304.us", %mask
%mul_z_re_load_z_re_load13.i.us = fmul <1 x float> %add_x0_load124_mul_add_xi_load125_calltmp128_to_float_dx_load129_broadcast_init280302.us, %add_x0_load124_mul_add_xi_load125_calltmp128_to_float_dx_load129_broadcast_init280302.us
%mul_z_im_load_z_im_load14.i.us = fmul <1 x float> %add_y0_load131_mul_yi_load132_to_float_dy_load133_broadcast_init282301.us, %add_y0_load131_mul_yi_load132_to_float_dy_load133_broadcast_init282301.us
%add_mul_z_re_load_z_re_load13_mul_z_im_load_z_im_load14.i.us = fadd <1 x float> %mul_z_im_load_z_im_load14.i.us, %mul_z_re_load_z_re_load13.i.us
%greater_add_mul_z_re_load_z_re_load13_mul_z_im_load_z_im_load14_.i.us = fcmp ugt <1 x float> %add_mul_z_re_load_z_re_load13_mul_z_im_load_z_im_load14.i.us, <float 4.000000e+00>
%"oldMask&test16.i.us" = select <1 x i1> %greater_add_mul_z_re_load_z_re_load13_mul_z_im_load_z_im_load14_.i.us, <1 x i32> %"oldMask&test.i304.us", <1 x i32> zeroinitializer
%"mask|break_mask.i.us" = or <1 x i32> %"oldMask&test16.i.us", %break_lanes_memory.0.i303.us
%"finished&func.i.us" = and <1 x i32> %"mask|break_mask.i.us", %mask
%item.i63.i.us = extractelement <1 x i32> %"finished&func.i.us", i32 0
%v.i64.i.us = lshr i32 %item.i63.i.us, 31
%item.i62.i.us = extractelement <1 x i32> %"internal_mask&function_mask12.i.us", i32 0
%v.i.i.us = lshr i32 %item.i62.i.us, 31
%"equal_finished&func_internal_mask&function_mask12.i.us" = icmp eq i32 %v.i64.i.us, %v.i.i.us
br i1 %"equal_finished&func_internal_mask&function_mask12.i.us", label %for_step.i.us, label %not_all_continued_or_breaked.i.us
not_all_continued_or_breaked.i.us: ; preds = %for_loop.i.us
%"!(break|continue)_lanes.i.us" = xor <1 x i32> %"mask|break_mask.i.us", <i32 -1>
%new_mask28.i.us = and <1 x i32> %"oldMask&test.i304.us", %"!(break|continue)_lanes.i.us"
%sub_mul_z_re_load31_z_re_load32_mul_z_im_load33_z_im_load34.i.us = fsub <1 x float> %mul_z_re_load_z_re_load13.i.us, %mul_z_im_load_z_im_load14.i.us
%mul__z_re_load35.i.us = fmul <1 x float> %add_x0_load124_mul_add_xi_load125_calltmp128_to_float_dx_load129_broadcast_init280302.us, <float 2.000000e+00>
%mul_mul__z_re_load35_z_im_load36.i.us = fmul <1 x float> %add_y0_load131_mul_yi_load132_to_float_dy_load133_broadcast_init282301.us, %mul__z_re_load35.i.us
%add_c_re_load42_new_re_load.i.us = fadd <1 x float> %add_x0_load124_mul_add_xi_load125_calltmp128_to_float_dx_load129_broadcast_init.us, %sub_mul_z_re_load31_z_re_load32_mul_z_im_load33_z_im_load34.i.us
%add_c_im_load44_new_im_load.i.us = fadd <1 x float> %add_y0_load131_mul_yi_load132_to_float_dy_load133_broadcast_init, %mul_mul__z_re_load35_z_im_load36.i.us
br label %for_step.i.us
for_step.i.us: ; preds = %not_all_continued_or_breaked.i.us, %for_loop.i.us
%add_y0_load131_mul_yi_load132_to_float_dy_load133_broadcast_init281.us = phi <1 x float> [ %add_y0_load131_mul_yi_load132_to_float_dy_load133_broadcast_init282301.us, %for_loop.i.us ], [ %add_c_im_load44_new_im_load.i.us, %not_all_continued_or_breaked.i.us ]
%add_x0_load124_mul_add_xi_load125_calltmp128_to_float_dx_load129_broadcast_init279.us = phi <1 x float> [ %add_x0_load124_mul_add_xi_load125_calltmp128_to_float_dx_load129_broadcast_init280302.us, %for_loop.i.us ], [ %add_c_re_load42_new_re_load.i.us, %not_all_continued_or_breaked.i.us ]
%internal_mask_memory.1.i.us = phi <1 x i32> [ zeroinitializer, %for_loop.i.us ], [ %new_mask28.i.us, %not_all_continued_or_breaked.i.us ]
%m.i.i.us = extractelement <1 x i32> %internal_mask_memory.1.i.us, i32 0
%d0.i.i234.us = extractelement <1 x i32> %25, i32 0
%not.cmp.i.i233.us = icmp ne i32 %m.i.i.us, 0
%d1.i.i235.us = zext i1 %not.cmp.i.i233.us to i32
%sel.i.i.us = add i32 %d0.i.i234.us, %d1.i.i235.us
%r.i.i236.us = insertelement <1 x i32> undef, i32 %sel.i.i.us, i32 0
%less_i_load_count_load.i.us = icmp slt <1 x i32> %r.i.i236.us, %maxIterations_load137_broadcast_init
%"oldMask&test.i.us" = select <1 x i1> %less_i_load_count_load.i.us, <1 x i32> %internal_mask_memory.1.i.us, <1 x i32> zeroinitializer
%"internal_mask&function_mask10.i.us" = and <1 x i32> %"oldMask&test.i.us", %mask
%item.i.i.us = extractelement <1 x i32> %"internal_mask&function_mask10.i.us", i32 0
%cmp.i.i.us = icmp slt i32 %item.i.i.us, 0
br i1 %cmp.i.i.us, label %for_loop.i.us, label %if_exit156.us
for_loop.i.lr.ph.us: ; preds = %if_exit156.us, %for_loop114.lr.ph
%xi117.0307.us = phi i32 [ %add_xi117_load_.us, %if_exit156.us ], [ %mul_calltmp80_xspan_load81, %for_loop114.lr.ph ]
%calltmp128.us = call i32 @getLaneIndex___(<1 x i32> %mask)
%add_xi_load125_calltmp128.us = add i32 %calltmp128.us, %xi117.0307.us
%add_xi_load125_calltmp128_to_float.us = sitofp i32 %add_xi_load125_calltmp128.us to float
%mul_add_xi_load125_calltmp128_to_float_dx_load129.us = fmul float %dx4, %add_xi_load125_calltmp128_to_float.us
%add_x0_load124_mul_add_xi_load125_calltmp128_to_float_dx_load129.us = fadd float %x02, %mul_add_xi_load125_calltmp128_to_float_dx_load129.us
%add_x0_load124_mul_add_xi_load125_calltmp128_to_float_dx_load129_broadcast_init.us = insertelement <1 x float> undef, float %add_x0_load124_mul_add_xi_load125_calltmp128_to_float_dx_load129.us, i32 0
br label %for_loop.i.us
for_exit115: ; preds = %if_exit156, %if_exit156.us, %for_test112.preheader
%yi_load165_plus1 = add i32 %yi106.0310, 1
%exitcond312 = icmp eq i32 %yi_load165_plus1, %22
br i1 %exitcond312, label %for_exit, label %for_test112.preheader
if_exit156: ; preds = %if_exit156, %for_loop114.lr.ph
%xi117.0307 = phi i32 [ %add_xi117_load_, %if_exit156 ], [ %mul_calltmp80_xspan_load81, %for_loop114.lr.ph ]
%calltmp128 = call i32 @getLaneIndex___(<1 x i32> %mask)
%calltmp147 = call i32 @getLaneIndex___(<1 x i32> %mask)
%calltmp151 = call i32 @getLaneIndex___(<1 x i32> %mask)
%add_xi117_load_ = add i32 %xi117.0307, 32
%less_xi_load119_xend_load120 = icmp slt i32 %add_xi117_load_, %r.i.i169
br i1 %less_xi_load119_xend_load120, label %if_exit156, label %for_exit115
}
attributes #0 = { alwaysinline nounwind readnone }
attributes #1 = { nounwind readnone }
attributes #2 = { alwaysinline nounwind }
attributes #3 = { nounwind }
attributes #4 = { alwaysinline nounwind readonly }
attributes #5 = { nounwind "target-features"="+sm_35" }

View File

@@ -0,0 +1,177 @@
//
// Generated by NVIDIA NVVM Compiler
// Compiler built on Thu Jul 18 02:37:37 2013 (1374107857)
// Cuda compilation tools, release 5.5, V5.5.0
//
.version 3.2
.target sm_35
.address_size 64
.file 1 "/home/evghenii/soft/ispc-code/ispc/examples/mandelbrot_tasks3d/test.cu", 1383046614, 1449
.file 2 "/usr/local/cuda-5.5/bin/..//include/cuda_device_runtime_api.h", 1375338991, 7655
.file 3 "/usr/local/cuda-5.5/bin/..//include/device_functions.h", 1375338991, 185228
.weak .func (.param .b32 func_retval0) cudaMalloc(
.param .b64 cudaMalloc_param_0,
.param .b64 cudaMalloc_param_1
)
{
// Weak stub: neither parameter is read; the function unconditionally
// returns the constant 30 as its 32-bit status value.
// NOTE(review): 30 is presumably cudaErrorUnknown in CUDA 5.5 - confirm.
.reg .s32 %status;
mov.u32 %status, 30;
st.param.b32 [func_retval0+0], %status;
.loc 2 66 3
ret;
}
.weak .func (.param .b32 func_retval0) cudaFuncGetAttributes(
.param .b64 cudaFuncGetAttributes_param_0,
.param .b64 cudaFuncGetAttributes_param_1
)
{
// Weak stub: both parameters are ignored and the constant 30 is written
// to the return slot.
// NOTE(review): 30 is presumably cudaErrorUnknown in CUDA 5.5 - confirm.
.reg .s32 %status;
mov.u32 %status, 30;
st.param.b32 [func_retval0+0], %status;
.loc 2 71 3
ret;
}
// Kernel entry: each block walks a rectangular tile and writes one iteration
// count per element to the buffer passed as param_9.
//   param_0 / param_1 : base and per-column step used to form the first coordinate (%f2)
//   param_2 / param_3 : base and per-row step used to form the second coordinate (%f1)
//   param_4           : upper clamp for the column range, also used as the row stride of the output
//   param_5           : upper clamp for the row range
//   param_6 / param_7 : tile extent along ctaid.x / ctaid.y
//   param_8           : maximum number of iterations
//   param_9           : output pointer (converted to a global address below)
.visible .entry _Z19mandelbrot_scanlineffffiiiiiPi(
.param .f32 _Z19mandelbrot_scanlineffffiiiiiPi_param_0,
.param .f32 _Z19mandelbrot_scanlineffffiiiiiPi_param_1,
.param .f32 _Z19mandelbrot_scanlineffffiiiiiPi_param_2,
.param .f32 _Z19mandelbrot_scanlineffffiiiiiPi_param_3,
.param .u32 _Z19mandelbrot_scanlineffffiiiiiPi_param_4,
.param .u32 _Z19mandelbrot_scanlineffffiiiiiPi_param_5,
.param .u32 _Z19mandelbrot_scanlineffffiiiiiPi_param_6,
.param .u32 _Z19mandelbrot_scanlineffffiiiiiPi_param_7,
.param .u32 _Z19mandelbrot_scanlineffffiiiiiPi_param_8,
.param .u64 _Z19mandelbrot_scanlineffffiiiiiPi_param_9
)
{
.reg .pred %p<9>;
.reg .s32 %r<36>;
.reg .f32 %f<20>;
.reg .s64 %rd<5>;
ld.param.f32 %f9, [_Z19mandelbrot_scanlineffffiiiiiPi_param_0];
ld.param.f32 %f10, [_Z19mandelbrot_scanlineffffiiiiiPi_param_1];
ld.param.f32 %f11, [_Z19mandelbrot_scanlineffffiiiiiPi_param_2];
ld.param.f32 %f12, [_Z19mandelbrot_scanlineffffiiiiiPi_param_3];
ld.param.u32 %r14, [_Z19mandelbrot_scanlineffffiiiiiPi_param_4];
ld.param.u32 %r17, [_Z19mandelbrot_scanlineffffiiiiiPi_param_5];
ld.param.u32 %r15, [_Z19mandelbrot_scanlineffffiiiiiPi_param_6];
ld.param.u32 %r18, [_Z19mandelbrot_scanlineffffiiiiiPi_param_7];
ld.param.u32 %r16, [_Z19mandelbrot_scanlineffffiiiiiPi_param_8];
ld.param.u64 %rd1, [_Z19mandelbrot_scanlineffffiiiiiPi_param_9];
.loc 1 43 1
mov.u32 %r19, %ctaid.x;
.loc 1 44 1
// %r1 = min((ctaid.x + 1) * param_6, param_4): exclusive column end of this tile.
mad.lo.s32 %r20, %r19, %r15, %r15;
.loc 3 2621 10
min.s32 %r1, %r20, %r14;
.loc 1 46 1
// %r33 = ctaid.y * param_7: first row; %r3 = min(%r33 + param_7, param_5): exclusive row end.
mov.u32 %r21, %ctaid.y;
mul.lo.s32 %r33, %r21, %r18;
.loc 1 47 1
add.s32 %r22, %r33, %r18;
.loc 3 2621 10
min.s32 %r3, %r22, %r17;
.loc 1 49 1
// Empty row range: nothing to do for this block.
setp.ge.s32 %p1, %r33, %r3;
@%p1 bra BB2_12;
cvta.to.global.u64 %rd2, %rd1;
// Row loop header: %r33 is the current row, %r34 restarts at the tile's first column.
BB2_2:
.loc 1 43 1
mul.lo.s32 %r34, %r19, %r15;
.loc 1 50 1
setp.ge.s32 %p2, %r34, %r1;
@%p2 bra BB2_11;
.loc 1 53 1
// %f1 = param_2 + row * param_3 (constant for the whole row).
cvt.rn.f32.s32 %f13, %r33;
fma.rn.f32 %f1, %f13, %f12, %f11;
// Column loop header: advances %r34 by 32 per trip; each thread handles column
// %r34 + (tid.x & 31).
BB2_4:
.loc 1 52 1
mov.u32 %r26, %tid.x;
and.b32 %r27, %r26, 31;
add.s32 %r7, %r27, %r34;
// %f2 = param_0 + column * param_1 (column converted as unsigned).
cvt.rn.f32.u32 %f14, %r7;
fma.rn.f32 %f2, %f14, %f10, %f9;
// %r35 is the iteration counter; skip the loop entirely when param_8 <= 0.
mov.u32 %r35, 0;
setp.gt.s32 %p3, %r16, 0;
.loc 1 13 1
@%p3 bra BB2_5;
bra.uni BB2_8;
// Seed the iterated pair with the starting coordinates.
BB2_5:
mov.f32 %f18, %f1;
mov.f32 %f19, %f2;
// Iteration loop: %f4/%f3 hold the current pair, %f19/%f18 carry the next one.
BB2_6:
.loc 1 14 1
mov.f32 %f4, %f19;
mov.f32 %f3, %f18;
mul.f32 %f5, %f3, %f3;
mul.f32 %f6, %f4, %f4;
add.f32 %f15, %f6, %f5;
// Leave the loop once %f4^2 + %f3^2 exceeds 4.0 (0f40800000).
setp.gt.f32 %p4, %f15, 0f40800000;
@%p4 bra BB2_8;
.loc 1 17 1
sub.f32 %f16, %f6, %f5;
.loc 1 18 1
add.f32 %f17, %f4, %f4;
.loc 1 19 1
// next %f4 = %f2 + (%f4^2 - %f3^2)
add.f32 %f7, %f2, %f16;
.loc 1 20 1
// next %f3 = %f1 + 2 * %f4 * %f3
fma.rn.f32 %f8, %f17, %f3, %f1;
.loc 1 13 96
add.s32 %r35, %r35, 1;
.loc 1 13 1
setp.lt.s32 %p5, %r35, %r16;
mov.f32 %f18, %f8;
mov.f32 %f19, %f7;
@%p5 bra BB2_6;
// Loop exit: %r35 holds the number of completed iterations.
BB2_8:
.loc 1 56 1
// %r11 = row * param_4 + column: element index into the output buffer.
mad.lo.s32 %r30, %r33, %r14, %r34;
add.s32 %r11, %r30, %r27;
.loc 1 57 1
// Threads whose column lies at or past the tile end skip the store
// (the comparison is unsigned).
setp.ge.u32 %p6, %r7, %r1;
@%p6 bra BB2_10;
// Byte offset = index * 4, widened to 64 bits by mul.wide before the add.
mul.wide.s32 %rd3, %r11, 4;
add.s64 %rd4, %rd2, %rd3;
.loc 1 58 1
st.global.u32 [%rd4], %r35;
BB2_10:
.loc 1 50 57
add.s32 %r34, %r34, 32;
.loc 1 50 1
setp.lt.s32 %p7, %r34, %r1;
@%p7 bra BB2_4;
// Row loop latch.
BB2_11:
.loc 1 49 57
add.s32 %r33, %r33, 1;
.loc 1 49 1
setp.lt.s32 %p8, %r33, %r3;
@%p8 bra BB2_2;
BB2_12:
.loc 1 60 2
ret;
}

View File

@@ -0,0 +1,801 @@
//
// Generated by LLVM NVPTX Back-End
//
.version 3.1
.target sm_35, texmode_independent
.address_size 64
// .globl __vselect_i8
// Prototypes for three helpers that are called from this module but whose
// bodies are not part of this section. Each takes a single 4-byte, 4-aligned
// argument and returns a 32-bit value.
// NOTE(review): judging by the names these return the two block indices and
// the lane index of the calling thread - confirm against their definitions.
.func (.param .b32 func_retval0) getBlockIndex0___
(
.param .align 4 .b8 getBlockIndex0____param_0[4]
)
;
.func (.param .b32 func_retval0) getBlockIndex1___
(
.param .align 4 .b8 getBlockIndex1____param_0[4]
)
;
.func (.param .b32 func_retval0) getLaneIndex___
(
.param .align 4 .b8 getLaneIndex____param_0[4]
)
;
// @__vselect_i8
.func (.param .align 1 .b8 func_retval0[1]) __vselect_i8(
.param .align 1 .b8 __vselect_i8_param_0[1],
.param .align 1 .b8 __vselect_i8_param_1[1],
.param .align 4 .b8 __vselect_i8_param_2[4]
)
{
// 8-bit select: returns param_0 when the 32-bit selector param_2 is zero,
// otherwise param_1. The bytes travel in 16-bit registers.
.reg .pred %sel_is_zero;
.reg .s16 %if_zero, %if_nonzero, %picked;
.reg .s32 %sel;
// BB#0:
ld.param.u8 %if_zero, [__vselect_i8_param_0];
ld.param.u8 %if_nonzero, [__vselect_i8_param_1];
ld.param.u32 %sel, [__vselect_i8_param_2];
setp.eq.s32 %sel_is_zero, %sel, 0;
selp.b16 %picked, %if_zero, %if_nonzero, %sel_is_zero;
st.param.b8 [func_retval0+0], %picked;
ret;
}
// .globl __vselect_i16
.func (.param .align 2 .b8 func_retval0[2]) __vselect_i16(
.param .align 2 .b8 __vselect_i16_param_0[2],
.param .align 2 .b8 __vselect_i16_param_1[2],
.param .align 4 .b8 __vselect_i16_param_2[4]
) // @__vselect_i16
{
// 16-bit select: returns param_0 when the 32-bit selector param_2 is zero,
// otherwise param_1.
.reg .pred %sel_is_zero;
.reg .s16 %if_zero, %if_nonzero, %picked;
.reg .s32 %sel;
// BB#0:
ld.param.u16 %if_zero, [__vselect_i16_param_0];
ld.param.u16 %if_nonzero, [__vselect_i16_param_1];
ld.param.u32 %sel, [__vselect_i16_param_2];
setp.eq.s32 %sel_is_zero, %sel, 0;
selp.b16 %picked, %if_zero, %if_nonzero, %sel_is_zero;
st.param.b16 [func_retval0+0], %picked;
ret;
}
// .globl __vselect_i64
.func (.param .align 8 .b8 func_retval0[8]) __vselect_i64(
.param .align 8 .b8 __vselect_i64_param_0[8],
.param .align 8 .b8 __vselect_i64_param_1[8],
.param .align 4 .b8 __vselect_i64_param_2[4]
) // @__vselect_i64
{
// 64-bit select: returns param_0 when the 32-bit selector param_2 is zero,
// otherwise param_1.
.reg .pred %sel_is_zero;
.reg .s32 %sel;
.reg .s64 %if_zero, %if_nonzero, %picked;
// BB#0:
ld.param.u64 %if_zero, [__vselect_i64_param_0];
ld.param.u64 %if_nonzero, [__vselect_i64_param_1];
ld.param.u32 %sel, [__vselect_i64_param_2];
setp.eq.s32 %sel_is_zero, %sel, 0;
selp.b64 %picked, %if_zero, %if_nonzero, %sel_is_zero;
st.param.b64 [func_retval0+0], %picked;
ret;
}
// .globl __aos_to_soa4_float1
.func __aos_to_soa4_float1(
.param .align 4 .b8 __aos_to_soa4_float1_param_0[4],
.param .align 4 .b8 __aos_to_soa4_float1_param_1[4],
.param .align 4 .b8 __aos_to_soa4_float1_param_2[4],
.param .align 4 .b8 __aos_to_soa4_float1_param_3[4],
.param .b64 __aos_to_soa4_float1_param_4,
.param .b64 __aos_to_soa4_float1_param_5,
.param .b64 __aos_to_soa4_float1_param_6,
.param .b64 __aos_to_soa4_float1_param_7
) // @__aos_to_soa4_float1
{
// Writes each of the four float inputs (param_0..3) unchanged through the
// matching output pointer (param_4..7), in that order.
.reg .s64 %dst<4>;
.reg .f32 %val<4>;
// BB#0:
ld.param.f32 %val0, [__aos_to_soa4_float1_param_0];
ld.param.u64 %dst0, [__aos_to_soa4_float1_param_4];
ld.param.f32 %val1, [__aos_to_soa4_float1_param_1];
ld.param.u64 %dst1, [__aos_to_soa4_float1_param_5];
ld.param.f32 %val2, [__aos_to_soa4_float1_param_2];
ld.param.u64 %dst2, [__aos_to_soa4_float1_param_6];
ld.param.f32 %val3, [__aos_to_soa4_float1_param_3];
ld.param.u64 %dst3, [__aos_to_soa4_float1_param_7];
// Stores keep the original order in case the destinations alias.
st.f32 [%dst0], %val0;
st.f32 [%dst1], %val1;
st.f32 [%dst2], %val2;
st.f32 [%dst3], %val3;
ret;
}
// .globl __soa_to_aos4_float1
.func __soa_to_aos4_float1(
.param .align 4 .b8 __soa_to_aos4_float1_param_0[4],
.param .align 4 .b8 __soa_to_aos4_float1_param_1[4],
.param .align 4 .b8 __soa_to_aos4_float1_param_2[4],
.param .align 4 .b8 __soa_to_aos4_float1_param_3[4],
.param .b64 __soa_to_aos4_float1_param_4,
.param .b64 __soa_to_aos4_float1_param_5,
.param .b64 __soa_to_aos4_float1_param_6,
.param .b64 __soa_to_aos4_float1_param_7
) // @__soa_to_aos4_float1
{
// Writes each of the four float inputs (param_0..3) unchanged through the
// matching output pointer (param_4..7), in that order.
.reg .s64 %dst<4>;
.reg .f32 %val<4>;
// BB#0:
ld.param.f32 %val0, [__soa_to_aos4_float1_param_0];
ld.param.u64 %dst0, [__soa_to_aos4_float1_param_4];
ld.param.f32 %val1, [__soa_to_aos4_float1_param_1];
ld.param.u64 %dst1, [__soa_to_aos4_float1_param_5];
ld.param.f32 %val2, [__soa_to_aos4_float1_param_2];
ld.param.u64 %dst2, [__soa_to_aos4_float1_param_6];
ld.param.f32 %val3, [__soa_to_aos4_float1_param_3];
ld.param.u64 %dst3, [__soa_to_aos4_float1_param_7];
// Stores keep the original order in case the destinations alias.
st.f32 [%dst0], %val0;
st.f32 [%dst1], %val1;
st.f32 [%dst2], %val2;
st.f32 [%dst3], %val3;
ret;
}
// .globl __aos_to_soa3_float1
.func __aos_to_soa3_float1(
.param .align 4 .b8 __aos_to_soa3_float1_param_0[4],
.param .align 4 .b8 __aos_to_soa3_float1_param_1[4],
.param .align 4 .b8 __aos_to_soa3_float1_param_2[4],
.param .b64 __aos_to_soa3_float1_param_3,
.param .b64 __aos_to_soa3_float1_param_4,
.param .b64 __aos_to_soa3_float1_param_5
) // @__aos_to_soa3_float1
{
// Writes each of the three float inputs (param_0..2) unchanged through the
// matching output pointer (param_3..5), in that order.
.reg .s64 %dst<3>;
.reg .f32 %val<3>;
// BB#0:
ld.param.f32 %val0, [__aos_to_soa3_float1_param_0];
ld.param.u64 %dst0, [__aos_to_soa3_float1_param_3];
ld.param.f32 %val1, [__aos_to_soa3_float1_param_1];
ld.param.u64 %dst1, [__aos_to_soa3_float1_param_4];
ld.param.f32 %val2, [__aos_to_soa3_float1_param_2];
ld.param.u64 %dst2, [__aos_to_soa3_float1_param_5];
// Stores keep the original order in case the destinations alias.
st.f32 [%dst0], %val0;
st.f32 [%dst1], %val1;
st.f32 [%dst2], %val2;
ret;
}
// .globl __soa_to_aos3_float1
.func __soa_to_aos3_float1(
.param .align 4 .b8 __soa_to_aos3_float1_param_0[4],
.param .align 4 .b8 __soa_to_aos3_float1_param_1[4],
.param .align 4 .b8 __soa_to_aos3_float1_param_2[4],
.param .b64 __soa_to_aos3_float1_param_3,
.param .b64 __soa_to_aos3_float1_param_4,
.param .b64 __soa_to_aos3_float1_param_5
) // @__soa_to_aos3_float1
{
// Writes each of the three float inputs (param_0..2) unchanged through the
// matching output pointer (param_3..5), in that order.
.reg .s64 %dst<3>;
.reg .f32 %val<3>;
// BB#0:
ld.param.f32 %val0, [__soa_to_aos3_float1_param_0];
ld.param.u64 %dst0, [__soa_to_aos3_float1_param_3];
ld.param.f32 %val1, [__soa_to_aos3_float1_param_1];
ld.param.u64 %dst1, [__soa_to_aos3_float1_param_4];
ld.param.f32 %val2, [__soa_to_aos3_float1_param_2];
ld.param.u64 %dst2, [__soa_to_aos3_float1_param_5];
// Stores keep the original order in case the destinations alias.
st.f32 [%dst0], %val0;
st.f32 [%dst1], %val1;
st.f32 [%dst2], %val2;
ret;
}
// .globl __rsqrt_varying_double
.func (.param .align 8 .b8 func_retval0[8]) __rsqrt_varying_double(
.param .align 8 .b8 __rsqrt_varying_double_param_0[8]
) // @__rsqrt_varying_double
{
// Returns the approximate reciprocal square root (rsqrt.approx.f64) of the
// double passed in param_0.
.reg .f64 %x, %inv_root;
// BB#0:
ld.param.f64 %x, [__rsqrt_varying_double_param_0];
rsqrt.approx.f64 %inv_root, %x;
st.param.f64 [func_retval0+0], %inv_root;
ret;
}
// .globl mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E_
// Tile worker. Only param_0 is read in this body; it is a pointer to an
// argument record whose fields are loaded at fixed byte offsets:
//   +0,+4   f32  base and per-column step of the first coordinate (%f0, %f1)
//   +8,+12  f32  base and per-row step of the second coordinate (%f2, %f3)
//   +16     s32  upper clamp of the column range, also the output row stride (%r1)
//   +20     s32  upper clamp of the row range (%r7)
//   +24,+28 s32  tile extent for block index 0 / block index 1 (%r9, %r8)
//   +32     s32  maximum iteration count (%r0)
//   +40     u64  output pointer (loaded only on the %all_on path)
//   +48     s32  mask word; its sign bit chooses between the %all_on and
//                %some_on code paths
// NOTE(review): the %some_on path below never stores a result - confirm that
// this is intended.
.func mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E_(
.param .b64 mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_0,
.param .b32 mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_1,
.param .b32 mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_2,
.param .b32 mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_3,
.param .b32 mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_4,
.param .b32 mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_5,
.param .b32 mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_6,
.param .b32 mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_7,
.param .b32 mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_8,
.param .b32 mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_9,
.param .b32 mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_10
) // @mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E_
{
.reg .pred %p<396>;
.reg .s16 %rc<396>;
.reg .s16 %rs<396>;
.reg .s32 %r<396>;
.reg .s64 %rl<396>;
.reg .f32 %f<396>;
.reg .f64 %fl<396>;
// BB#0: // %allocas
ld.param.u64 %rl0, [mandelbrot_scanline___unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_0];
ld.f32 %f0, [%rl0];
ld.f32 %f1, [%rl0+4];
ld.f32 %f2, [%rl0+8];
ld.f32 %f3, [%rl0+12];
ld.u32 %r1, [%rl0+16];
ld.u32 %r7, [%rl0+20];
ld.u32 %r9, [%rl0+24];
ld.u32 %r8, [%rl0+28];
ld.u32 %r0, [%rl0+32];
ld.u32 %r2, [%rl0+48];
// Mask word with the sign bit clear -> %some_on path, otherwise %all_on.
setp.gt.s32 %p0, %r2, -1;
@%p0 bra BB8_3;
bra.uni BB8_1;
BB8_3: // %some_on
// Callseq Start 0
{
.reg .b32 temp_param_reg;
// <end>}
.param .b32 param0;
st.param.b32 [param0+0], %r2;
.param .b32 retval0;
call.uni (retval0),
getBlockIndex0___,
(
param0
);
ld.param.b32 %r5, [retval0+0];
//{
}// Callseq End 0
// Callseq Start 1
{
.reg .b32 temp_param_reg;
// <end>}
.param .b32 param0;
st.param.b32 [param0+0], %r2;
.param .b32 retval0;
call.uni (retval0),
getBlockIndex1___,
(
param0
);
ld.param.b32 %r10, [retval0+0];
//{
}// Callseq End 1
// %r3 = first row of the tile; return early when the clamped row range is empty.
mul.lo.s32 %r3, %r10, %r8;
mad.lo.s32 %r4, %r10, %r8, %r8;
setp.lt.s32 %p0, %r4, %r7;
selp.b32 %r4, %r4, %r7, %p0;
setp.ge.s32 %p0, %r3, %r4;
@%p0 bra BB8_31;
// BB#4: // %for_test112.preheader.lr.ph
// %r4 = first column, %r1 = clamped exclusive column end.
mul.lo.s32 %r4, %r5, %r9;
mad.lo.s32 %r5, %r5, %r9, %r9;
setp.lt.s32 %p0, %r5, %r1;
selp.b32 %r1, %r5, %r1, %p0;
// %r5 = all-ones when at least one iteration is allowed, else zero;
// %r6 = that value combined with the mask word.
setp.gt.s32 %p0, %r0, 0;
selp.b32 %r5, -1, 0, %p0;
and.b32 %r6, %r5, %r2;
// %r7 = ~max(~rowClamp, ~((blockIndex1 + 1) * extent)), i.e. the smaller of
// the two bounds: the exclusive row end used by the loop latch.
not.b32 %r7, %r7;
add.s32 %r9, %r10, 1;
mul.lo.s32 %r8, %r8, %r9;
not.b32 %r8, %r8;
setp.gt.s32 %p0, %r7, %r8;
selp.b32 %r7, %r7, %r8, %p0;
not.b32 %r7, %r7;
BB8_5: // %for_test112.preheader
// =>This Loop Header: Depth=1
// Child Loop BB8_29 Depth 2
// Child Loop BB8_28 Depth 2
// Child Loop BB8_23 Depth 3
setp.ge.s32 %p0, %r4, %r1;
@%p0 bra BB8_30;
// BB#21: // %for_loop114.lr.ph
// in Loop: Header=BB8_5 Depth=1
// Sign bit of %r6 set -> iterate (BB8_22); otherwise take the loop at BB8_29,
// which performs no iteration.
setp.lt.s32 %p0, %r6, 0;
mov.u32 %r8, %r4;
@%p0 bra BB8_22;
bra.uni BB8_29;
BB8_22: // in Loop: Header=BB8_5 Depth=1
// %f4 = %f2 + row * %f3 (constant for the row); %r8 = column cursor.
cvt.rn.f32.s32 %f4, %r3;
fma.rn.f32 %f4, %f3, %f4, %f2;
mov.u32 %r8, %r4;
BB8_28: // %for_loop.i.lr.ph.us
// Parent Loop BB8_5 Depth=1
// => This Loop Header: Depth=2
// Child Loop BB8_23 Depth 3
// Callseq Start 5
{
.reg .b32 temp_param_reg;
// <end>}
.param .b32 param0;
st.param.b32 [param0+0], %r2;
.param .b32 retval0;
call.uni (retval0),
getLaneIndex___,
(
param0
);
ld.param.b32 %r9, [retval0+0];
//{
}// Callseq End 5
// %f5 = %f0 + (lane + column cursor) * %f1.
add.s32 %r9, %r9, %r8;
cvt.rn.f32.s32 %f5, %r9;
fma.rn.f32 %f5, %f1, %f5, %f0;
// Loop state: %r12 = live mask, %r10 = accumulated break mask,
// %r11 = iteration counter, (%f7, %f6) = iterated pair.
mov.u32 %r9, 0;
mov.u32 %r12, %r5;
mov.u32 %r10, %r9;
mov.u32 %r11, %r9;
mov.f32 %f7, %f5;
mov.f32 %f6, %f4;
BB8_23: // %for_loop.i.us
// Parent Loop BB8_5 Depth=1
// Parent Loop BB8_28 Depth=2
// => This Inner Loop Header: Depth=3
and.b32 %r13, %r12, %r2;
mul.f32 %f8, %f7, %f7;
fma.rn.f32 %f9, %f6, %f6, %f8;
// Unordered-or-greater compare against 4.0 (0f40800000); lanes that pass are
// merged into the break mask %r10.
setp.gtu.f32 %p0, %f9, 0f40800000;
selp.b32 %r14, %r12, 0, %p0;
or.b32 %r10, %r14, %r10;
and.b32 %r14, %r10, %r2;
// Compare the sign bits of (break mask & mask word) and (live mask & mask word):
// equal -> BB8_24 (live mask cleared), different -> BB8_25 (update step).
shr.u32 %r14, %r14, 31;
shr.u32 %r13, %r13, 31;
setp.eq.s32 %p0, %r14, %r13;
@%p0 bra BB8_24;
bra.uni BB8_25;
BB8_24: // in Loop: Header=BB8_23 Depth=3
mov.u32 %r12, %r9;
bra.uni BB8_26;
BB8_25: // %not_all_continued_or_breaked.i.us
// in Loop: Header=BB8_23 Depth=3
// Update: %f7 <- %f5 + (%f7^2 - %f6^2); %f6 <- %f4 + 2 * %f7 * %f6
// (right-hand sides use the values from before this step).
mul.f32 %f9, %f6, %f6;
not.b32 %r13, %r10;
and.b32 %r12, %r12, %r13;
sub.f32 %f8, %f8, %f9;
add.f32 %f8, %f5, %f8;
add.f32 %f7, %f7, %f7;
fma.rn.f32 %f6, %f6, %f7, %f4;
mov.f32 %f7, %f8;
BB8_26: // %for_step.i.us
// in Loop: Header=BB8_23 Depth=3
// Count the step only while the live mask is non-zero, then clear the live
// mask once the counter reaches the iteration limit %r0.
setp.ne.s32 %p0, %r12, 0;
selp.u32 %r13, 1, 0, %p0;
add.s32 %r11, %r11, %r13;
setp.lt.s32 %p0, %r11, %r0;
selp.b32 %r12, %r12, 0, %p0;
and.b32 %r13, %r12, %r2;
setp.lt.s32 %p0, %r13, 0;
@%p0 bra BB8_23;
// BB#27: // %if_exit156.us
// in Loop: Header=BB8_28 Depth=2
// The results of the next two calls are overwritten before being read.
// Callseq Start 6
{
.reg .b32 temp_param_reg;
// <end>}
.param .b32 param0;
st.param.b32 [param0+0], %r2;
.param .b32 retval0;
call.uni (retval0),
getLaneIndex___,
(
param0
);
ld.param.b32 %r9, [retval0+0];
//{
}// Callseq End 6
// Callseq Start 7
{
.reg .b32 temp_param_reg;
// <end>}
.param .b32 param0;
st.param.b32 [param0+0], %r2;
.param .b32 retval0;
call.uni (retval0),
getLaneIndex___,
(
param0
);
ld.param.b32 %r9, [retval0+0];
//{
}// Callseq End 7
// Advance the column cursor by 32 and repeat while it is below the column end.
add.s32 %r8, %r8, 32;
setp.lt.s32 %p0, %r8, %r1;
@%p0 bra BB8_28;
bra.uni BB8_30;
BB8_29: // %if_exit156
// Parent Loop BB8_5 Depth=1
// => This Inner Loop Header: Depth=2
// Callseq Start 2
{
.reg .b32 temp_param_reg;
// <end>}
.param .b32 param0;
st.param.b32 [param0+0], %r2;
.param .b32 retval0;
call.uni (retval0),
getLaneIndex___,
(
param0
);
ld.param.b32 %r9, [retval0+0];
//{
}// Callseq End 2
// Callseq Start 3
{
.reg .b32 temp_param_reg;
// <end>}
.param .b32 param0;
st.param.b32 [param0+0], %r2;
.param .b32 retval0;
call.uni (retval0),
getLaneIndex___,
(
param0
);
ld.param.b32 %r9, [retval0+0];
//{
}// Callseq End 3
// Callseq Start 4
{
.reg .b32 temp_param_reg;
// <end>}
.param .b32 param0;
st.param.b32 [param0+0], %r2;
.param .b32 retval0;
call.uni (retval0),
getLaneIndex___,
(
param0
);
ld.param.b32 %r9, [retval0+0];
//{
}// Callseq End 4
add.s32 %r8, %r8, 32;
setp.lt.s32 %p0, %r8, %r1;
@%p0 bra BB8_29;
BB8_30: // %for_exit115
// in Loop: Header=BB8_5 Depth=1
// Next row; finish once the row counter reaches the row end in %r7.
add.s32 %r3, %r3, 1;
setp.eq.s32 %p0, %r3, %r7;
@%p0 bra BB8_31;
bra.uni BB8_5;
BB8_1: // %all_on
// Load the output pointer from record offset 40 and use an all-ones mask.
ld.u64 %rl0, [%rl0+40];
mov.u32 %r2, -1;
// Callseq Start 8
{
.reg .b32 temp_param_reg;
// <end>}
.param .b32 param0;
st.param.b32 [param0+0], %r2;
.param .b32 retval0;
call.uni (retval0),
getBlockIndex0___,
(
param0
);
ld.param.b32 %r10, [retval0+0];
//{
}// Callseq End 8
// Callseq Start 9
{
.reg .b32 temp_param_reg;
// <end>}
.param .b32 param0;
st.param.b32 [param0+0], %r2;
.param .b32 retval0;
call.uni (retval0),
getBlockIndex1___,
(
param0
);
ld.param.b32 %r11, [retval0+0];
//{
}// Callseq End 9
// %r3 = first row of the tile; return early when the clamped row range is empty.
mul.lo.s32 %r3, %r11, %r8;
mad.lo.s32 %r4, %r11, %r8, %r8;
setp.lt.s32 %p0, %r4, %r7;
selp.b32 %r4, %r4, %r7, %p0;
setp.ge.s32 %p0, %r3, %r4;
@%p0 bra BB8_31;
// BB#2: // %for_test40.preheader.lr.ph
// %r4 = first column, %r5 = clamped exclusive column end,
// %r6 = all-ones when at least one iteration is allowed, else zero.
mul.lo.s32 %r4, %r10, %r9;
mad.lo.s32 %r5, %r10, %r9, %r9;
setp.lt.s32 %p0, %r5, %r1;
selp.b32 %r5, %r5, %r1, %p0;
setp.gt.s32 %p0, %r0, 0;
selp.b32 %r6, -1, 0, %p0;
// %r7 = smaller of the row clamp and (blockIndex1 + 1) * extent, computed
// through complements: the exclusive row end.
not.b32 %r7, %r7;
add.s32 %r12, %r11, 1;
mul.lo.s32 %r12, %r8, %r12;
not.b32 %r12, %r12;
setp.gt.s32 %p0, %r7, %r12;
selp.b32 %r7, %r7, %r12, %p0;
not.b32 %r7, %r7;
// %r8 = firstRow * rowStride + firstColumn: running element index of the
// row start (advanced by the row stride in BB8_6).
mul.lo.s32 %r8, %r11, %r8;
mul.lo.s32 %r8, %r8, %r1;
mad.lo.s32 %r8, %r10, %r9, %r8;
BB8_7: // %for_test40.preheader
// =>This Loop Header: Depth=1
// Child Loop BB8_19 Depth 2
// Child Loop BB8_13 Depth 2
// Child Loop BB8_14 Depth 3
setp.ge.s32 %p0, %r4, %r5;
@%p0 bra BB8_6;
// BB#8: // %for_loop42.lr.ph
// in Loop: Header=BB8_7 Depth=1
// Sign bit of %r6 set -> iterate (BB8_9); otherwise BB8_19 stores zeros.
setp.lt.s32 %p0, %r6, 0;
mov.u32 %r9, %r8;
mov.u32 %r10, %r4;
@%p0 bra BB8_9;
bra.uni BB8_19;
BB8_9: // in Loop: Header=BB8_7 Depth=1
// %f4 = %f2 + row * %f3; %r9 = row * rowStride; %r10 = column cursor.
cvt.rn.f32.s32 %f4, %r3;
mul.lo.s32 %r9, %r3, %r1;
fma.rn.f32 %f4, %f3, %f4, %f2;
mov.u32 %r10, %r4;
BB8_13: // %for_loop.i204.lr.ph.us
// Parent Loop BB8_7 Depth=1
// => This Loop Header: Depth=2
// Child Loop BB8_14 Depth 3
// Callseq Start 13
{
.reg .b32 temp_param_reg;
// <end>}
.param .b32 param0;
st.param.b32 [param0+0], %r2;
.param .b32 retval0;
call.uni (retval0),
getLaneIndex___,
(
param0
);
ld.param.b32 %r11, [retval0+0];
//{
}// Callseq End 13
// %f5 = %f0 + (lane + column cursor) * %f1.
add.s32 %r11, %r11, %r10;
cvt.rn.f32.s32 %f5, %r11;
fma.rn.f32 %f5, %f1, %f5, %f0;
// Loop state: %r14 = live mask, %r13 = accumulated break mask,
// %r11 = iteration counter, (%f7, %f6) = iterated pair.
mov.u32 %r12, 0;
mov.u32 %r14, %r6;
mov.u32 %r13, %r12;
mov.u32 %r11, %r12;
mov.f32 %f7, %f5;
mov.f32 %f6, %f4;
BB8_14: // %for_loop.i204.us
// Parent Loop BB8_7 Depth=1
// Parent Loop BB8_13 Depth=2
// => This Inner Loop Header: Depth=3
mul.f32 %f8, %f7, %f7;
fma.rn.f32 %f9, %f6, %f6, %f8;
// Unordered-or-greater compare against 4.0 (0f40800000).
setp.gtu.f32 %p0, %f9, 0f40800000;
selp.b32 %r15, %r14, 0, %p0;
or.b32 %r13, %r15, %r13;
shr.u32 %r15, %r13, 31;
shr.u32 %r16, %r14, 31;
setp.eq.s32 %p0, %r15, %r16;
@%p0 bra BB8_15;
bra.uni BB8_16;
BB8_15: // in Loop: Header=BB8_14 Depth=3
mov.u32 %r14, %r12;
bra.uni BB8_17;
BB8_16: // %not_all_continued_or_breaked.i218.us
// in Loop: Header=BB8_14 Depth=3
// Update: %f7 <- %f5 + (%f7^2 - %f6^2); %f6 <- %f4 + 2 * %f7 * %f6
// (right-hand sides use the values from before this step).
mul.f32 %f9, %f6, %f6;
not.b32 %r15, %r13;
and.b32 %r14, %r14, %r15;
sub.f32 %f8, %f8, %f9;
add.f32 %f8, %f5, %f8;
add.f32 %f7, %f7, %f7;
fma.rn.f32 %f6, %f6, %f7, %f4;
mov.f32 %f7, %f8;
BB8_17: // %for_step.i187.us
// in Loop: Header=BB8_14 Depth=3
// Count the step only while the live mask is non-zero, then clear the live
// mask once the counter reaches the iteration limit %r0.
setp.ne.s32 %p0, %r14, 0;
selp.u32 %r15, 1, 0, %p0;
add.s32 %r11, %r11, %r15;
setp.lt.s32 %p0, %r11, %r0;
selp.b32 %r14, %r14, 0, %p0;
setp.lt.s32 %p0, %r14, 0;
@%p0 bra BB8_14;
// BB#10: // %mandel___vyfvyfvyi.exit219.us
// in Loop: Header=BB8_13 Depth=2
// Callseq Start 14
{
.reg .b32 temp_param_reg;
// <end>}
.param .b32 param0;
st.param.b32 [param0+0], %r2;
.param .b32 retval0;
call.uni (retval0),
getLaneIndex___,
(
param0
);
ld.param.b32 %r12, [retval0+0];
//{
}// Callseq End 14
// Callseq Start 15
{
.reg .b32 temp_param_reg;
// <end>}
.param .b32 param0;
st.param.b32 [param0+0], %r2;
.param .b32 retval0;
call.uni (retval0),
getLaneIndex___,
(
param0
);
ld.param.b32 %r13, [retval0+0];
//{
}// Callseq End 15
// Skip the store when lane + column cursor is at or past the column end.
add.s32 %r13, %r13, %r10;
setp.ge.s32 %p0, %r13, %r5;
@%p0 bra BB8_12;
// BB#11: // %if_then.us
// in Loop: Header=BB8_13 Depth=2
// Element index = column cursor + row * rowStride + lane; store the
// iteration count %r11 there.
// NOTE(review): the byte offset is formed with a 32-bit shift before the
// sign-extension to 64 bits, so very large indices would wrap - confirm
// the intended addressing width.
add.s32 %r13, %r10, %r9;
add.s32 %r12, %r13, %r12;
shl.b32 %r12, %r12, 2;
cvt.s64.s32 %rl1, %r12;
add.s64 %rl1, %rl1, %rl0;
st.u32 [%rl1], %r11;
BB8_12: // %if_exit.us
// in Loop: Header=BB8_13 Depth=2
add.s32 %r10, %r10, 32;
setp.lt.s32 %p0, %r10, %r5;
@%p0 bra BB8_13;
bra.uni BB8_6;
BB8_19: // %mandel___vyfvyfvyi.exit219
// Parent Loop BB8_7 Depth=1
// => This Inner Loop Header: Depth=2
// Callseq Start 10
{
.reg .b32 temp_param_reg;
// <end>}
.param .b32 param0;
st.param.b32 [param0+0], %r2;
.param .b32 retval0;
call.uni (retval0),
getLaneIndex___,
(
param0
);
ld.param.b32 %r11, [retval0+0];
//{
}// Callseq End 10
// Callseq Start 11
{
.reg .b32 temp_param_reg;
// <end>}
.param .b32 param0;
st.param.b32 [param0+0], %r2;
.param .b32 retval0;
call.uni (retval0),
getLaneIndex___,
(
param0
);
ld.param.b32 %r11, [retval0+0];
//{
}// Callseq End 11
// Callseq Start 12
{
.reg .b32 temp_param_reg;
// <end>}
.param .b32 param0;
st.param.b32 [param0+0], %r2;
.param .b32 retval0;
call.uni (retval0),
getLaneIndex___,
(
param0
);
ld.param.b32 %r12, [retval0+0];
//{
}// Callseq End 12
// Store only when lane + column cursor is below the column end.
add.s32 %r12, %r12, %r10;
setp.lt.s32 %p0, %r12, %r5;
@%p0 bra BB8_20;
bra.uni BB8_18;
BB8_20: // %if_then
// in Loop: Header=BB8_19 Depth=2
// Writes 0 at element index lane + %r9 (%r9 tracks the running row-start
// index plus the column advance).
add.s32 %r11, %r11, %r9;
shl.b32 %r11, %r11, 2;
cvt.s64.s32 %rl1, %r11;
add.s64 %rl1, %rl1, %rl0;
mov.u32 %r11, 0;
st.u32 [%rl1], %r11;
BB8_18: // %if_exit
// in Loop: Header=BB8_19 Depth=2
add.s32 %r10, %r10, 32;
add.s32 %r9, %r9, 32;
setp.lt.s32 %p0, %r10, %r5;
@%p0 bra BB8_19;
BB8_6: // %for_exit43
// in Loop: Header=BB8_7 Depth=1
// Next row: bump the row counter and move the row-start index forward by
// one row stride; finish when the row counter reaches %r7.
add.s32 %r3, %r3, 1;
add.s32 %r8, %r8, %r1;
setp.eq.s32 %p0, %r3, %r7;
@%p0 bra BB8_31;
bra.uni BB8_7;
BB8_31: // %for_exit
ret;
}

View File

@@ -0,0 +1,515 @@
compiling nvptx64
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
argVals= 0
calleArgCount= 0
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
argVals= 0
calleArgCount= 0
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
argVals= 0
calleArgCount= 0
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
argVals= 0
calleArgCount= 0
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
argVals= 0
calleArgCount= 0
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
argVals= 0
calleArgCount= 0
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
argVals= 0
calleArgCount= 0
ret_t: /*safe*/ varying int32
ret_t: /*safe*/ varying int32
ret_t: /*safe*/ varying int32
argVals= 0
calleArgCount= 0
argVals= 0
calleArgCount= 1
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying float
ret_t: /*safe*/ varying float
ret_t: /*safe*/ varying float
argVals= 1
calleArgCount= 1
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ uniform float
ret_t: /*safe*/ uniform float
ret_t: /*safe*/ uniform float
argVals= 1
calleArgCount= 1
ret_t: /*safe*/ varying float
ret_t: /*safe*/ varying float
ret_t: /*safe*/ varying float
argVals= 1
calleArgCount= 1
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ uniform float
ret_t: /*safe*/ uniform float
ret_t: /*safe*/ uniform float
argVals= 1
calleArgCount= 1
ret_t: /*safe*/ varying double
ret_t: /*safe*/ varying double
ret_t: /*safe*/ varying double
argVals= 1
calleArgCount= 1
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ uniform double
ret_t: /*safe*/ uniform double
ret_t: /*safe*/ uniform double
argVals= 1
calleArgCount= 1
ret_t: /*safe*/ varying unsigned int32
ret_t: /*safe*/ varying unsigned int32
ret_t: /*safe*/ varying unsigned int32
argVals= 1
calleArgCount= 1
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ uniform unsigned int32
ret_t: /*safe*/ uniform unsigned int32
ret_t: /*safe*/ uniform unsigned int32
argVals= 1
calleArgCount= 1
ret_t: /*safe*/ varying unsigned int64
ret_t: /*safe*/ varying unsigned int64
ret_t: /*safe*/ varying unsigned int64
argVals= 1
calleArgCount= 1
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ uniform unsigned int64
ret_t: /*safe*/ uniform unsigned int64
ret_t: /*safe*/ uniform unsigned int64
argVals= 1
calleArgCount= 1
ret_t: /*safe*/ varying float
ret_t: /*safe*/ varying float
ret_t: /*safe*/ varying float
argVals= 2
calleArgCount= 2
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying int8
ret_t: /*safe*/ varying int8
ret_t: /*safe*/ varying int8
argVals= 2
calleArgCount= 2
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying int16
ret_t: /*safe*/ varying int16
ret_t: /*safe*/ varying int16
argVals= 2
calleArgCount= 2
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying int32
ret_t: /*safe*/ varying int32
ret_t: /*safe*/ varying int32
argVals= 2
calleArgCount= 2
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying double
ret_t: /*safe*/ varying double
ret_t: /*safe*/ varying double
argVals= 2
calleArgCount= 2
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying int64
ret_t: /*safe*/ varying int64
ret_t: /*safe*/ varying int64
argVals= 2
calleArgCount= 2
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying float
ret_t: /*safe*/ varying float
ret_t: /*safe*/ varying float
argVals= 2
calleArgCount= 2
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying int8
ret_t: /*safe*/ varying int8
ret_t: /*safe*/ varying int8
argVals= 2
calleArgCount= 2
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying int16
ret_t: /*safe*/ varying int16
ret_t: /*safe*/ varying int16
argVals= 2
calleArgCount= 2
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying int32
ret_t: /*safe*/ varying int32
ret_t: /*safe*/ varying int32
argVals= 2
calleArgCount= 2
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying double
ret_t: /*safe*/ varying double
ret_t: /*safe*/ varying double
argVals= 2
calleArgCount= 2
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying int64
ret_t: /*safe*/ varying int64
ret_t: /*safe*/ varying int64
argVals= 2
calleArgCount= 2
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying float
ret_t: /*safe*/ varying float
ret_t: /*safe*/ varying float
argVals= 2
calleArgCount= 2
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying int8
ret_t: /*safe*/ varying int8
ret_t: /*safe*/ varying int8
argVals= 2
calleArgCount= 2
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying int16
ret_t: /*safe*/ varying int16
ret_t: /*safe*/ varying int16
argVals= 2
calleArgCount= 2
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying int32
ret_t: /*safe*/ varying int32
ret_t: /*safe*/ varying int32
argVals= 2
calleArgCount= 2
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying double
ret_t: /*safe*/ varying double
ret_t: /*safe*/ varying double
argVals= 2
calleArgCount= 2
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying int64
ret_t: /*safe*/ varying int64
ret_t: /*safe*/ varying int64
argVals= 2
calleArgCount= 2
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying float
ret_t: /*safe*/ varying float
ret_t: /*safe*/ varying float
argVals= 3
calleArgCount= 3
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying int8
ret_t: /*safe*/ varying int8
ret_t: /*safe*/ varying int8
argVals= 3
calleArgCount= 3
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying int16
ret_t: /*safe*/ varying int16
ret_t: /*safe*/ varying int16
argVals= 3
calleArgCount= 3
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying int32
ret_t: /*safe*/ varying int32
ret_t: /*safe*/ varying int32
argVals= 3
calleArgCount= 3
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying double
ret_t: /*safe*/ varying double
ret_t: /*safe*/ varying double
argVals= 3
calleArgCount= 3
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying int64
ret_t: /*safe*/ varying int64
ret_t: /*safe*/ varying int64
argVals= 3
calleArgCount= 3
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ /*cost=1*/ uniform float
ret_t: /*safe*/ /*cost=1*/ uniform float
ret_t: /*safe*/ /*cost=1*/ uniform float
argVals= 1
calleArgCount= 2
argVals= 2
calleArgCount= 2
argVals= 1
calleArgCount= 2
ret_t: /*safe*/ /*cost=1*/ uniform int8
ret_t: /*safe*/ /*cost=1*/ uniform int8
ret_t: /*safe*/ /*cost=1*/ uniform int8
argVals= 2
calleArgCount= 2
ret_t: /*safe*/ /*cost=1*/ uniform unsigned int8
ret_t: /*safe*/ /*cost=1*/ uniform unsigned int8
ret_t: /*safe*/ /*cost=1*/ uniform unsigned int8
argVals= 2
calleArgCount= 2
ret_t: /*safe*/ /*cost=1*/ uniform int16
ret_t: /*safe*/ /*cost=1*/ uniform int16
ret_t: /*safe*/ /*cost=1*/ uniform int16
argVals= 2
calleArgCount= 2
ret_t: /*safe*/ /*cost=1*/ uniform unsigned int16
ret_t: /*safe*/ /*cost=1*/ uniform unsigned int16
ret_t: /*safe*/ /*cost=1*/ uniform unsigned int16
argVals= 2
calleArgCount= 2
ret_t: /*safe*/ /*cost=1*/ uniform int32
ret_t: /*safe*/ /*cost=1*/ uniform int32
ret_t: /*safe*/ /*cost=1*/ uniform int32
argVals= 2
calleArgCount= 2
ret_t: /*safe*/ /*cost=1*/ uniform unsigned int32
ret_t: /*safe*/ /*cost=1*/ uniform unsigned int32
ret_t: /*safe*/ /*cost=1*/ uniform unsigned int32
argVals= 2
calleArgCount= 2
ret_t: /*safe*/ /*cost=1*/ uniform double
ret_t: /*safe*/ /*cost=1*/ uniform double
ret_t: /*safe*/ /*cost=1*/ uniform double
argVals= 1
calleArgCount= 2
argVals= 2
calleArgCount= 2
argVals= 1
calleArgCount= 2
ret_t: /*safe*/ /*cost=1*/ uniform int64
ret_t: /*safe*/ /*cost=1*/ uniform int64
ret_t: /*safe*/ /*cost=1*/ uniform int64
argVals= 2
calleArgCount= 2
ret_t: /*safe*/ /*cost=1*/ uniform unsigned int64
ret_t: /*safe*/ /*cost=1*/ uniform unsigned int64
ret_t: /*safe*/ /*cost=1*/ uniform unsigned int64
argVals= 2
calleArgCount= 2
ret_t: /*safe*/ /*cost=1*/ varying float
ret_t: /*safe*/ /*cost=1*/ varying float
ret_t: /*safe*/ /*cost=1*/ varying float
argVals= 1
calleArgCount= 2
argVals= 1
calleArgCount= 2
argVals= 3
calleArgCount= 3
argVals= 1
calleArgCount= 2
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ /*cost=1*/ varying int8
ret_t: /*safe*/ /*cost=1*/ varying int8
ret_t: /*safe*/ /*cost=1*/ varying int8
argVals= 3
calleArgCount= 3
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ /*cost=1*/ varying unsigned int8
ret_t: /*safe*/ /*cost=1*/ varying unsigned int8
ret_t: /*safe*/ /*cost=1*/ varying unsigned int8
argVals= 3
calleArgCount= 3
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ /*cost=1*/ varying int16
ret_t: /*safe*/ /*cost=1*/ varying int16
ret_t: /*safe*/ /*cost=1*/ varying int16
argVals= 3
calleArgCount= 3
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ /*cost=1*/ varying unsigned int16
ret_t: /*safe*/ /*cost=1*/ varying unsigned int16
ret_t: /*safe*/ /*cost=1*/ varying unsigned int16
argVals= 3
calleArgCount= 3
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ /*cost=1*/ varying int32
ret_t: /*safe*/ /*cost=1*/ varying int32
ret_t: /*safe*/ /*cost=1*/ varying int32
argVals= 3
calleArgCount= 3
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ /*cost=1*/ varying unsigned int32
ret_t: /*safe*/ /*cost=1*/ varying unsigned int32
ret_t: /*safe*/ /*cost=1*/ varying unsigned int32
argVals= 3
calleArgCount= 3
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ /*cost=1*/ varying double
ret_t: /*safe*/ /*cost=1*/ varying double
ret_t: /*safe*/ /*cost=1*/ varying double
argVals= 1
calleArgCount= 2
argVals= 1
calleArgCount= 2
argVals= 3
calleArgCount= 3
argVals= 1
calleArgCount= 2
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ /*cost=1*/ varying int64
ret_t: /*safe*/ /*cost=1*/ varying int64
ret_t: /*safe*/ /*cost=1*/ varying int64
argVals= 3
calleArgCount= 3
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ /*cost=1*/ varying unsigned int64
ret_t: /*safe*/ /*cost=1*/ varying unsigned int64
ret_t: /*safe*/ /*cost=1*/ varying unsigned int64
argVals= 3
calleArgCount= 3
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ /*cost=1*/ uniform int32
ret_t: /*safe*/ /*cost=1*/ uniform int32
ret_t: /*safe*/ /*cost=1*/ uniform int32
argVals= 1
calleArgCount= 1
ret_t: /*safe*/ /*cost=1*/ varying int32
ret_t: /*safe*/ /*cost=1*/ varying int32
ret_t: /*safe*/ /*cost=1*/ varying int32
argVals= 1
calleArgCount= 1
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ uniform bool
ret_t: /*safe*/ uniform bool
ret_t: /*safe*/ uniform bool
argVals= 1
calleArgCount= 1
ret_t: /*safe*/ uniform bool
ret_t: /*safe*/ uniform bool
ret_t: /*safe*/ uniform bool
argVals= 1
calleArgCount= 1
ret_t: /*safe*/ uniform bool
ret_t: /*safe*/ uniform bool
ret_t: /*safe*/ uniform bool
argVals= 1
calleArgCount= 1
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
argVals= 1
calleArgCount= 1
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
argVals= 1
calleArgCount= 1
ret_t: /*safe*/ varying int32
argVals= 2
calleArgCount= 3
argVals= 1
calleArgCount= 2
argVals= 3
calleArgCount= 4
ret_t: /*safe*/ varying int32
ret_t: /*safe*/ varying int32
argVals= 3
calleArgCount= 3
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying int32
argVals= 2
calleArgCount= 3
argVals= 1
calleArgCount= 2
argVals= 3
calleArgCount= 4
ret_t: /*safe*/ varying int32
ret_t: /*safe*/ varying int32
argVals= 3
calleArgCount= 3
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
argVals= 1
calleArgCount= 1
argVals= 1
calleArgCount= 1
ret_t: /*safe*/ uniform unsigned int64
ret_t: /*safe*/ uniform unsigned int64
ret_t: /*safe*/ uniform unsigned int64
argVals= 1
calleArgCount= 1
ret_t: void
argVals= 3
calleArgCount= 3
ret_t: void
ret_t: void
argVals= 3
calleArgCount= 3
ret_t: void
ret_t: void
argVals= 3
calleArgCount= 3
argVals= 3
calleArgCount= 3
argVals= 1
calleArgCount= 1
argVals= 1
calleArgCount= 1
argVals= 0
calleArgCount= 0
stdlib.ispc:493:5: Error: Assertion failed (ctx.cpp:1755): "v0->getType() ==
v1->getType()".
***
*** Please file a bug report at https://github.com/ispc/ispc/issues
*** (Including as much information as you can about how to reproduce this error).
*** You have apparently encountered a bug in the compiler that we'd like to fix!
***
main.cpp(223): FATAL ERROR: Unhandled signal sent to process; terminating.

View File

@@ -0,0 +1,513 @@
compiling nvptx64
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
argVals= 0
calleArgCount= 0
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
argVals= 0
calleArgCount= 0
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
argVals= 0
calleArgCount= 0
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
argVals= 0
calleArgCount= 0
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
argVals= 0
calleArgCount= 0
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
argVals= 0
calleArgCount= 0
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
argVals= 0
calleArgCount= 0
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
argVals= 0
calleArgCount= 0
argVals= 0
calleArgCount= 1
ret_t: /*safe*/ varying float
ret_t: /*safe*/ varying float
ret_t: /*safe*/ varying float
argVals= 1
calleArgCount= 1
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ uniform float
ret_t: /*safe*/ uniform float
ret_t: /*safe*/ uniform float
argVals= 1
calleArgCount= 1
ret_t: /*safe*/ varying float
ret_t: /*safe*/ varying float
ret_t: /*safe*/ varying float
argVals= 1
calleArgCount= 1
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ uniform float
ret_t: /*safe*/ uniform float
ret_t: /*safe*/ uniform float
argVals= 1
calleArgCount= 1
ret_t: /*safe*/ varying double
ret_t: /*safe*/ varying double
ret_t: /*safe*/ varying double
argVals= 1
calleArgCount= 1
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ uniform double
ret_t: /*safe*/ uniform double
ret_t: /*safe*/ uniform double
argVals= 1
calleArgCount= 1
ret_t: /*safe*/ varying unsigned int32
ret_t: /*safe*/ varying unsigned int32
ret_t: /*safe*/ varying unsigned int32
argVals= 1
calleArgCount= 1
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ uniform unsigned int32
ret_t: /*safe*/ uniform unsigned int32
ret_t: /*safe*/ uniform unsigned int32
argVals= 1
calleArgCount= 1
ret_t: /*safe*/ varying unsigned int64
ret_t: /*safe*/ varying unsigned int64
ret_t: /*safe*/ varying unsigned int64
argVals= 1
calleArgCount= 1
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ uniform unsigned int64
ret_t: /*safe*/ uniform unsigned int64
ret_t: /*safe*/ uniform unsigned int64
argVals= 1
calleArgCount= 1
ret_t: /*safe*/ varying float
ret_t: /*safe*/ varying float
ret_t: /*safe*/ varying float
argVals= 2
calleArgCount= 2
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying int8
ret_t: /*safe*/ varying int8
ret_t: /*safe*/ varying int8
argVals= 2
calleArgCount= 2
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying int16
ret_t: /*safe*/ varying int16
ret_t: /*safe*/ varying int16
argVals= 2
calleArgCount= 2
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying int32
ret_t: /*safe*/ varying int32
ret_t: /*safe*/ varying int32
argVals= 2
calleArgCount= 2
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying double
ret_t: /*safe*/ varying double
ret_t: /*safe*/ varying double
argVals= 2
calleArgCount= 2
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying int64
ret_t: /*safe*/ varying int64
ret_t: /*safe*/ varying int64
argVals= 2
calleArgCount= 2
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying float
ret_t: /*safe*/ varying float
ret_t: /*safe*/ varying float
argVals= 2
calleArgCount= 2
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying int8
ret_t: /*safe*/ varying int8
ret_t: /*safe*/ varying int8
argVals= 2
calleArgCount= 2
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying int16
ret_t: /*safe*/ varying int16
ret_t: /*safe*/ varying int16
argVals= 2
calleArgCount= 2
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying int32
ret_t: /*safe*/ varying int32
ret_t: /*safe*/ varying int32
argVals= 2
calleArgCount= 2
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying double
ret_t: /*safe*/ varying double
ret_t: /*safe*/ varying double
argVals= 2
calleArgCount= 2
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying int64
ret_t: /*safe*/ varying int64
ret_t: /*safe*/ varying int64
argVals= 2
calleArgCount= 2
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying float
ret_t: /*safe*/ varying float
ret_t: /*safe*/ varying float
argVals= 2
calleArgCount= 2
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying int8
ret_t: /*safe*/ varying int8
ret_t: /*safe*/ varying int8
argVals= 2
calleArgCount= 2
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying int16
ret_t: /*safe*/ varying int16
ret_t: /*safe*/ varying int16
argVals= 2
calleArgCount= 2
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying int32
ret_t: /*safe*/ varying int32
ret_t: /*safe*/ varying int32
argVals= 2
calleArgCount= 2
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying double
ret_t: /*safe*/ varying double
ret_t: /*safe*/ varying double
argVals= 2
calleArgCount= 2
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying int64
ret_t: /*safe*/ varying int64
ret_t: /*safe*/ varying int64
argVals= 2
calleArgCount= 2
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying float
ret_t: /*safe*/ varying float
ret_t: /*safe*/ varying float
argVals= 3
calleArgCount= 3
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying int8
ret_t: /*safe*/ varying int8
ret_t: /*safe*/ varying int8
argVals= 3
calleArgCount= 3
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying int16
ret_t: /*safe*/ varying int16
ret_t: /*safe*/ varying int16
argVals= 3
calleArgCount= 3
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying int32
ret_t: /*safe*/ varying int32
ret_t: /*safe*/ varying int32
argVals= 3
calleArgCount= 3
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying double
ret_t: /*safe*/ varying double
ret_t: /*safe*/ varying double
argVals= 3
calleArgCount= 3
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying int64
ret_t: /*safe*/ varying int64
ret_t: /*safe*/ varying int64
argVals= 3
calleArgCount= 3
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ /*cost=1*/ uniform float
ret_t: /*safe*/ /*cost=1*/ uniform float
ret_t: /*safe*/ /*cost=1*/ uniform float
argVals= 1
calleArgCount= 2
argVals= 2
calleArgCount= 2
argVals= 1
calleArgCount= 2
ret_t: /*safe*/ /*cost=1*/ uniform int8
ret_t: /*safe*/ /*cost=1*/ uniform int8
ret_t: /*safe*/ /*cost=1*/ uniform int8
argVals= 2
calleArgCount= 2
ret_t: /*safe*/ /*cost=1*/ uniform unsigned int8
ret_t: /*safe*/ /*cost=1*/ uniform unsigned int8
ret_t: /*safe*/ /*cost=1*/ uniform unsigned int8
argVals= 2
calleArgCount= 2
ret_t: /*safe*/ /*cost=1*/ uniform int16
ret_t: /*safe*/ /*cost=1*/ uniform int16
ret_t: /*safe*/ /*cost=1*/ uniform int16
argVals= 2
calleArgCount= 2
ret_t: /*safe*/ /*cost=1*/ uniform unsigned int16
ret_t: /*safe*/ /*cost=1*/ uniform unsigned int16
ret_t: /*safe*/ /*cost=1*/ uniform unsigned int16
argVals= 2
calleArgCount= 2
ret_t: /*safe*/ /*cost=1*/ uniform int32
ret_t: /*safe*/ /*cost=1*/ uniform int32
ret_t: /*safe*/ /*cost=1*/ uniform int32
argVals= 2
calleArgCount= 2
ret_t: /*safe*/ /*cost=1*/ uniform unsigned int32
ret_t: /*safe*/ /*cost=1*/ uniform unsigned int32
ret_t: /*safe*/ /*cost=1*/ uniform unsigned int32
argVals= 2
calleArgCount= 2
ret_t: /*safe*/ /*cost=1*/ uniform double
ret_t: /*safe*/ /*cost=1*/ uniform double
ret_t: /*safe*/ /*cost=1*/ uniform double
argVals= 1
calleArgCount= 2
argVals= 2
calleArgCount= 2
argVals= 1
calleArgCount= 2
ret_t: /*safe*/ /*cost=1*/ uniform int64
ret_t: /*safe*/ /*cost=1*/ uniform int64
ret_t: /*safe*/ /*cost=1*/ uniform int64
argVals= 2
calleArgCount= 2
ret_t: /*safe*/ /*cost=1*/ uniform unsigned int64
ret_t: /*safe*/ /*cost=1*/ uniform unsigned int64
ret_t: /*safe*/ /*cost=1*/ uniform unsigned int64
argVals= 2
calleArgCount= 2
ret_t: /*safe*/ /*cost=1*/ varying float
ret_t: /*safe*/ /*cost=1*/ varying float
ret_t: /*safe*/ /*cost=1*/ varying float
argVals= 1
calleArgCount= 2
argVals= 1
calleArgCount= 2
argVals= 3
calleArgCount= 3
argVals= 1
calleArgCount= 2
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ /*cost=1*/ varying int8
ret_t: /*safe*/ /*cost=1*/ varying int8
ret_t: /*safe*/ /*cost=1*/ varying int8
argVals= 3
calleArgCount= 3
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ /*cost=1*/ varying unsigned int8
ret_t: /*safe*/ /*cost=1*/ varying unsigned int8
ret_t: /*safe*/ /*cost=1*/ varying unsigned int8
argVals= 3
calleArgCount= 3
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ /*cost=1*/ varying int16
ret_t: /*safe*/ /*cost=1*/ varying int16
ret_t: /*safe*/ /*cost=1*/ varying int16
argVals= 3
calleArgCount= 3
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ /*cost=1*/ varying unsigned int16
ret_t: /*safe*/ /*cost=1*/ varying unsigned int16
ret_t: /*safe*/ /*cost=1*/ varying unsigned int16
argVals= 3
calleArgCount= 3
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ /*cost=1*/ varying int32
ret_t: /*safe*/ /*cost=1*/ varying int32
ret_t: /*safe*/ /*cost=1*/ varying int32
argVals= 3
calleArgCount= 3
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ /*cost=1*/ varying unsigned int32
ret_t: /*safe*/ /*cost=1*/ varying unsigned int32
ret_t: /*safe*/ /*cost=1*/ varying unsigned int32
argVals= 3
calleArgCount= 3
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ /*cost=1*/ varying double
ret_t: /*safe*/ /*cost=1*/ varying double
ret_t: /*safe*/ /*cost=1*/ varying double
argVals= 1
calleArgCount= 2
argVals= 1
calleArgCount= 2
argVals= 3
calleArgCount= 3
argVals= 1
calleArgCount= 2
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ /*cost=1*/ varying int64
ret_t: /*safe*/ /*cost=1*/ varying int64
ret_t: /*safe*/ /*cost=1*/ varying int64
argVals= 3
calleArgCount= 3
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ /*cost=1*/ varying unsigned int64
ret_t: /*safe*/ /*cost=1*/ varying unsigned int64
ret_t: /*safe*/ /*cost=1*/ varying unsigned int64
argVals= 3
calleArgCount= 3
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ /*cost=1*/ uniform int32
ret_t: /*safe*/ /*cost=1*/ uniform int32
ret_t: /*safe*/ /*cost=1*/ uniform int32
argVals= 1
calleArgCount= 1
ret_t: /*safe*/ /*cost=1*/ varying int32
ret_t: /*safe*/ /*cost=1*/ varying int32
ret_t: /*safe*/ /*cost=1*/ varying int32
argVals= 1
calleArgCount= 1
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ uniform bool
ret_t: /*safe*/ uniform bool
ret_t: /*safe*/ uniform bool
argVals= 1
calleArgCount= 1
ret_t: /*safe*/ uniform bool
ret_t: /*safe*/ uniform bool
ret_t: /*safe*/ uniform bool
argVals= 1
calleArgCount= 1
ret_t: /*safe*/ uniform bool
ret_t: /*safe*/ uniform bool
ret_t: /*safe*/ uniform bool
argVals= 1
calleArgCount= 1
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
argVals= 1
calleArgCount= 1
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
argVals= 1
calleArgCount= 1
ret_t: /*safe*/ varying int32
argVals= 2
calleArgCount= 3
argVals= 1
calleArgCount= 2
argVals= 3
calleArgCount= 4
ret_t: /*safe*/ varying int32
ret_t: /*safe*/ varying int32
argVals= 3
calleArgCount= 3
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ varying int32
argVals= 2
calleArgCount= 3
argVals= 1
calleArgCount= 2
argVals= 3
calleArgCount= 4
ret_t: /*safe*/ varying int32
ret_t: /*safe*/ varying int32
argVals= 3
calleArgCount= 3
argVals= 3
calleArgCount= 3
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
ret_t: /*safe*/ uniform int32
argVals= 1
calleArgCount= 1
argVals= 1
calleArgCount= 1
ret_t: /*safe*/ uniform unsigned int64
ret_t: /*safe*/ uniform unsigned int64
ret_t: /*safe*/ uniform unsigned int64
argVals= 1
calleArgCount= 1
ret_t: void
argVals= 3
calleArgCount= 3
ret_t: void
ret_t: void
argVals= 3
calleArgCount= 3
ret_t: void
ret_t: void
argVals= 3
calleArgCount= 3
argVals= 3
calleArgCount= 3
argVals= 1
calleArgCount= 1
argVals= 1
calleArgCount= 1
argVals= 0
calleArgCount= 0
stdlib.ispc:493:5: Error: Assertion failed (ctx.cpp:1755): "v0->getType() ==
v1->getType()".
***
*** Please file a bug report at https://github.com/ispc/ispc/issues
*** (Including as much information as you can about how to reproduce this error).
*** You have apparently encountered a bug in the compiler that we'd like to fix!
***
main.cpp(223): FATAL ERROR: Unhandled signal sent to process; terminating.

View File

@@ -0,0 +1,23 @@
//
// z.h
// (Header automatically generated by the ispc compiler.)
// DO NOT EDIT THIS FILE.
//
#ifndef ISPC_Z_H
#define ISPC_Z_H
#include <stdint.h>
#ifdef __cplusplus
namespace ispc { /* namespace */
#endif // __cplusplus
#ifdef __cplusplus
} /* namespace */
#endif // __cplusplus
#endif // ISPC_Z_H