added cuda examples
This commit is contained in:
2
examples_cuda/stencil/.gitignore
vendored
Normal file
2
examples_cuda/stencil/.gitignore
vendored
Normal file
@@ -0,0 +1,2 @@
|
||||
stencil
|
||||
objs
|
||||
BIN
examples_cuda/stencil/.stencil.ispc.swo
Normal file
BIN
examples_cuda/stencil/.stencil.ispc.swo
Normal file
Binary file not shown.
175
examples_cuda/stencil/1.s
Normal file
175
examples_cuda/stencil/1.s
Normal file
@@ -0,0 +1,175 @@
|
||||
|
||||
code for sm_35
|
||||
Function : stencil_step_task
|
||||
.headerflags @"EF_CUDA_SM35 EF_CUDA_PTX_SM(EF_CUDA_SM35)"
|
||||
/* 0x0880a010a0a01000 */
|
||||
/*0008*/ MOV R1, c[0x0][0x44]; /* 0x64c03c00089c0006 */
|
||||
/*0010*/ S2R R10, SR_CTAID.X; /* 0x86400000129c002a */
|
||||
/*0018*/ MOV R12, c[0x0][0x160]; /* 0x64c03c002c1c0032 */
|
||||
/*0020*/ IADD R0, R10, c[0x0][0x150]; /* 0x608000002a1c2802 */
|
||||
/*0028*/ IADD R11, R0, 0x1; /* 0xc0800000009c002d */
|
||||
/*0030*/ MOV R13, c[0x0][0x164]; /* 0x64c03c002c9c0036 */
|
||||
/*0038*/ ISETP.GE.AND P0, PT, R0, R11, PT; /* 0xdb681c00059c001e */
|
||||
/* 0x08a0a1ac118d8d8c */
|
||||
/*0048*/ LD.E.64 R8, [R12]; /* 0xc5800000001c3020 */
|
||||
/*0050*/ LD.E.64 R6, [R12+0x8]; /* 0xc5800000041c3018 */
|
||||
/*0058*/ LD.E.64 R4, [R12+0x10]; /* 0xc5800000081c3010 */
|
||||
/*0060*/ LD.E.64 R2, [R12+0x18]; /* 0xc58000000c1c3008 */
|
||||
/*0068*/ @P0 EXIT ; /* 0x180000000000003c */
|
||||
/*0070*/ MOV R11, c[0x0][0x158]; /* 0x64c03c002b1c002e */
|
||||
/*0078*/ IMUL R41, R11, c[0x0][0x154]; /* 0x61c018002a9c2ca6 */
|
||||
/* 0x08b0a000a010a010 */
|
||||
/*0088*/ IADD R11, R10, c[0x0][0x150]; /* 0x608000002a1c282e */
|
||||
/*0090*/ SHF.L R40, RZ, 0x1, R41; /* 0xb7c0a400009ffca1 */
|
||||
/*0098*/ I2I.S32.S32 R10, -R40; /* 0xe6010000141ce82a */
|
||||
/*00a0*/ IADD R49, R11, 0x1; /* 0xc0800000009c2cc5 */
|
||||
/*00a8*/ SHF.L R28, RZ, 0x3, R10; /* 0xb7c02800019ffc71 */
|
||||
/*00b0*/ MOV R10, c[0x0][0x148]; /* 0x64c03c00291c002a */
|
||||
/*00b8*/ ISETP.GE.AND P0, PT, R10, c[0x0][0x14c], PT; /* 0x5b681c00299c281e */
|
||||
/* 0x0880acb0a00010ac */
|
||||
/*00c8*/ @P0 BRA 0x4f0; /* 0x120000021000003c */
|
||||
/*00d0*/ MOV R29, c[0x0][0x148]; /* 0x64c03c00291c0076 */
|
||||
/*00d8*/ IMUL R42, R0, R41; /* 0xe1c01800149c00aa */
|
||||
/*00e0*/ MOV R10, c[0x0][0x140]; /* 0x64c03c00281c002a */
|
||||
/*00e8*/ ISETP.GE.AND P0, PT, R10, c[0x0][0x144], PT; /* 0x5b681c00289c281e */
|
||||
/*00f0*/ @P0 BRA 0x4d8; /* 0x12000001f000003c */
|
||||
/*00f8*/ MOV R10, c[0x0][0x154]; /* 0x64c03c002a9c002a */
|
||||
/* 0x0880888010a0109c */
|
||||
/*0108*/ IMAD R44, R29, c[0x0][0x154], R42; /* 0x5108a8002a9c74b2 */
|
||||
/*0110*/ SHF.L R11, RZ, 0x1, R10; /* 0xb7c02800009ffc2d */
|
||||
/*0118*/ MOV R39, c[0x0][0x140]; /* 0x64c03c00281c009e */
|
||||
/*0120*/ IMAD R34, R10, -0x2, R44; /* 0xa908b3ffff1c2889 */
|
||||
/*0128*/ IADD R43, R44, R11; /* 0xe0800000059cb0ae */
|
||||
/*0130*/ I2I.S32.S32 R10, -R11; /* 0xe6010000059ce82a */
|
||||
/*0138*/ IMAD R36, R41, -0x2, R44; /* 0xa908b3ffff1ca491 */
|
||||
/* 0x08a0001084108480 */
|
||||
/*0148*/ IADD R32, R44, c[0x0][0x154]; /* 0x608000002a9cb082 */
|
||||
/*0150*/ IADD R33, R44, R41; /* 0xe0800000149cb086 */
|
||||
/*0158*/ IADD R35, R44, R40; /* 0xe0800000141cb08e */
|
||||
/*0160*/ IMAD R38, R41, 0x3, R44; /* 0xa108b000019ca499 */
|
||||
/*0168*/ SHF.L R47, RZ, 0x3, R10; /* 0xb7c02800019ffcbd */
|
||||
/*0170*/ IADD R37, R43, c[0x0][0x154]; /* 0x608000002a9cac96 */
|
||||
/*0178*/ S2R R10, SR_TID.X; /* 0x86400000109c002a */
|
||||
/* 0x08a0b0a010908c10 */
|
||||
/*0188*/ MOV32I R48, 0x8; /* 0x74000000041fc0c2 */
|
||||
/*0190*/ IADD R45, R10, R39; /* 0xe0800000139c28b6 */
|
||||
/*0198*/ BFE R30, R47, 0x11f; /* 0xc00800008f9cbc79 */
|
||||
/*01a0*/ IADD R46, R45, R44; /* 0xe0800000161cb4ba */
|
||||
/*01a8*/ IADD R14, R32, R45; /* 0xe0800000169c803a */
|
||||
/*01b0*/ IMAD R10.CC, R46, R48, c[0x0][0x170]; /* 0x910cc0002e1cb82a */
|
||||
/*01b8*/ IMAD.HI.X R11, R46, R48, c[0x0][0x174]; /* 0x9318c0002e9cb82e */
|
||||
/* 0x0881cc118c118c10 */
|
||||
/*01c8*/ IADD R27, R37, R45; /* 0xe0800000169c946e */
|
||||
/*01d0*/ LD.E.64 R12, [R10+-0x8]; /* 0xc5fffffffc1c2830 */
|
||||
/*01d8*/ BFE R50, R28, 0x11f; /* 0xc00800008f9c70c9 */
|
||||
/*01e0*/ LD.E.64 R24, [R10+0x8]; /* 0xc5800000041c2860 */
|
||||
/*01e8*/ ISETP.GE.AND P0, PT, R45, c[0x0][0x144], PT; /* 0x5b681c00289cb41e */
|
||||
/*01f0*/ LD.E.64 R18, [R10+-0x18]; /* 0xc5fffffff41c2848 */
|
||||
/*01f8*/ DADD R20, R24, R12; /* 0xe3800000061c6052 */
|
||||
/* 0x098c10a011ac8188 */
|
||||
/*0208*/ LD.E.64 R22, [R10+0x18]; /* 0xc58000000c1c2858 */
|
||||
/*0210*/ IMAD R16.CC, R14, R48, c[0x0][0x170]; /* 0x910cc0002e1c3842 */
|
||||
/*0218*/ LD.E.64 R12, [R10+-0x10]; /* 0xc5fffffff81c2830 */
|
||||
/*0220*/ IMAD.HI.X R17, R14, R48, c[0x0][0x174]; /* 0x9318c0002e9c3846 */
|
||||
/*0228*/ IADD R25, R43, R45; /* 0xe0800000169cac66 */
|
||||
/*0230*/ LD.E.64 R14, [R16]; /* 0xc5800000001c4038 */
|
||||
/*0238*/ DADD R22, R22, R18; /* 0xe3800000091c585a */
|
||||
/* 0x0994808c848cb180 */
|
||||
/*0248*/ LD.E.64 R18, [R10+0x10]; /* 0xc5800000081c2848 */
|
||||
/*0250*/ IMAD R26.CC, R27, R48, c[0x0][0x170]; /* 0x910cc0002e1c6c6a */
|
||||
/*0258*/ IMAD.HI.X R27, R27, R48, c[0x0][0x174]; /* 0x9318c0002e9c6c6e */
|
||||
/*0260*/ IMAD R24.CC, R25, R48, c[0x0][0x170]; /* 0x910cc0002e1c6462 */
|
||||
/*0268*/ DADD R14, R20, R14; /* 0xe3800000071c503a */
|
||||
/*0270*/ DADD R20, R18, R12; /* 0xe3800000061c4852 */
|
||||
/*0278*/ LD.E.64 R12, [R26]; /* 0xc5800000001c6830 */
|
||||
/* 0x08b080118010c080 */
|
||||
/*0288*/ IMAD.HI.X R25, R25, R48, c[0x0][0x174]; /* 0x9318c0002e9c6466 */
|
||||
/*0290*/ IADD R16.CC, R16, R47; /* 0xe0840000179c4042 */
|
||||
/*0298*/ LD.E.64 R18, [R24]; /* 0xc5800000001c6048 */
|
||||
/*02a0*/ DADD R12, R22, R12; /* 0xe3800000061c5832 */
|
||||
/*02a8*/ IADD.X R17, R17, R30; /* 0xe08040000f1c4446 */
|
||||
/*02b0*/ IADD R31, R34, R45; /* 0xe0800000169c887e */
|
||||
/*02b8*/ IADD R22.CC, R16, R47; /* 0xe0840000179c405a */
|
||||
/* 0x089980818880a010 */
|
||||
/*02c8*/ IADD.X R23, R17, R30; /* 0xe08040000f1c445e */
|
||||
/*02d0*/ IMAD R26.CC, R31, R48, c[0x0][0x170]; /* 0x910cc0002e1c7c6a */
|
||||
/*02d8*/ DADD R20, R20, R18; /* 0xe3800000091c5052 */
|
||||
/*02e0*/ LD.E.64 R18, [R16]; /* 0xc5800000001c4048 */
|
||||
/*02e8*/ IMAD.HI.X R27, R31, R48, c[0x0][0x174]; /* 0x9318c0002e9c7c6e */
|
||||
/*02f0*/ LD.E.64 R24, [R22]; /* 0xc5800000001c5860 */
|
||||
/*02f8*/ IADD R51, R33, R45; /* 0xe0800000169c84ce */
|
||||
/* 0x088880ac818c11b8 */
|
||||
/*0308*/ LD.E.64 R30, [R26]; /* 0xc5800000001c6878 */
|
||||
/*0310*/ LD.E.64 R26, [R10]; /* 0xc5800000001c2868 */
|
||||
/*0318*/ DADD R14, R14, R18; /* 0xe3800000091c383a */
|
||||
/*0320*/ IMAD R18.CC, R51, R48, c[0x0][0x170]; /* 0x910cc0002e1ccc4a */
|
||||
/*0328*/ IADD R17, R35, R45; /* 0xe0800000169c8c46 */
|
||||
/*0330*/ IMAD.HI.X R19, R51, R48, c[0x0][0x174]; /* 0x9318c0002e9ccc4e */
|
||||
/*0338*/ DADD R22, R20, R30; /* 0xe38000000f1c505a */
|
||||
/* 0x098c10a0999c1090 */
|
||||
/*0348*/ IMAD R16.CC, R17, R48, c[0x0][0x170]; /* 0x910cc0002e1c4442 */
|
||||
/*0350*/ LD.E.64 R20, [R18]; /* 0xc5800000001c4850 */
|
||||
/*0358*/ DADD R12, R12, R24; /* 0xe38000000c1c3032 */
|
||||
/*0360*/ IMAD.HI.X R17, R17, R48, c[0x0][0x174]; /* 0x9318c0002e9c4446 */
|
||||
/*0368*/ IADD R18.CC, R18, R28; /* 0xe08400000e1c484a */
|
||||
/*0370*/ LD.E.64 R24, [R16]; /* 0xc5800000001c4060 */
|
||||
/*0378*/ DADD R20, R14, R20; /* 0xe38000000a1c3852 */
|
||||
/* 0x088080b4a18010cc */
|
||||
/*0388*/ IADD.X R19, R19, R50; /* 0xe0804000191c4c4e */
|
||||
/*0390*/ LD.E.64 R14, [R18]; /* 0xc5800000001c4838 */
|
||||
/*0398*/ DADD R22, R22, R24; /* 0xe38000000c1c585a */
|
||||
/*03a0*/ IADD R25, R36, R45; /* 0xe0800000169c9066 */
|
||||
/*03a8*/ IMAD R16.CC, R25, R48, c[0x0][0x170]; /* 0x910cc0002e1c6442 */
|
||||
/*03b0*/ DADD R20, R20, R14; /* 0xe3800000071c5052 */
|
||||
/*03b8*/ IADD R15, R38, R45; /* 0xe0800000169c983e */
|
||||
/* 0x09a010b081ac809c */
|
||||
/*03c8*/ IMAD.HI.X R17, R25, R48, c[0x0][0x174]; /* 0x9318c0002e9c6446 */
|
||||
/*03d0*/ IMAD R14.CC, R15, R48, c[0x0][0x170]; /* 0x910cc0002e1c3c3a */
|
||||
/*03d8*/ LD.E.64 R24, [R16]; /* 0xc5800000001c4060 */
|
||||
/*03e0*/ IMAD.HI.X R15, R15, R48, c[0x0][0x174]; /* 0x9318c0002e9c3c3e */
|
||||
/*03e8*/ IADD R18.CC, R18, R28; /* 0xe08400000e1c484a */
|
||||
/*03f0*/ LD.E.64 R30, [R14]; /* 0xc5800000001c3878 */
|
||||
/*03f8*/ IADD.X R19, R19, R50; /* 0xe0804000191c4c4e */
|
||||
/* 0x08a480a480b58010 */
|
||||
/*0408*/ LD.E.64 R50, [R18]; /* 0xc5800000001c48c8 */
|
||||
/*0410*/ DMUL R20, R6, R20; /* 0xe40000000a1c1852 */
|
||||
/*0418*/ DADD R22, R22, R24; /* 0xe38000000c1c585a */
|
||||
/*0420*/ DADD R12, R12, R30; /* 0xe38000000f1c3032 */
|
||||
/*0428*/ DFMA R24, R8, R26, R20; /* 0xdb8050000d1c2062 */
|
||||
/*0430*/ DFMA R16, R4, R22, R24; /* 0xdb8060000b1c1042 */
|
||||
/*0438*/ DADD R12, R12, R50; /* 0xe3800000191c3032 */
|
||||
/* 0x08908cb0a010ac80 */
|
||||
/*0448*/ DFMA R10, R2, R12, R16; /* 0xdb804000061c082a */
|
||||
/*0450*/ @P0 BRA.U 0x4b8; /* 0x120000003000023c */
|
||||
/*0458*/ @!P0 MOV32I R17, 0x8; /* 0x740000000423c046 */
|
||||
/*0460*/ @!P0 DADD R18, R26, R26; /* 0xe38000000d20684a */
|
||||
/*0468*/ @!P0 IMAD R14.CC, R46, R17, c[0x0][0x178]; /* 0x910c44002f20b83a */
|
||||
/*0470*/ @!P0 IMAD.HI.X R15, R46, R17, c[0x0][0x17c]; /* 0x931844002fa0b83e */
|
||||
/*0478*/ @!P0 IMAD R16.CC, R46, R17, c[0x0][0x168]; /* 0x910c44002d20b842 */
|
||||
/* 0x08a180a5dc10bd9c */
|
||||
/*0488*/ @!P0 LD.E.64 R12, [R14]; /* 0xc580000000203830 */
|
||||
/*0490*/ @!P0 IMAD.HI.X R17, R46, R17, c[0x0][0x16c]; /* 0x931844002da0b846 */
|
||||
/*0498*/ @!P0 LD.E.64 R20, [R16]; /* 0xc580000000204050 */
|
||||
/*04a0*/ @!P0 DADD R22, R18, -R12; /* 0xe38100000620485a */
|
||||
/*04a8*/ @!P0 DFMA R10, R20, R10, R22; /* 0xdb8058000520502a */
|
||||
/*04b0*/ @!P0 ST.E.64 [R14], R10; /* 0xe580000000203828 */
|
||||
/*04b8*/ IADD R39, R39, 0x20; /* 0xc0800000101c9c9d */
|
||||
/* 0x08b0a0b8b0a0b8b0 */
|
||||
/*04c8*/ ISETP.LT.AND P0, PT, R39, c[0x0][0x144], PT; /* 0x5b181c00289c9c1e */
|
||||
/*04d0*/ @P0 BRA 0x178; /* 0x12007ffe5000003c */
|
||||
/*04d8*/ IADD R29, R29, 0x1; /* 0xc0800000009c7475 */
|
||||
/*04e0*/ ISETP.LT.AND P0, PT, R29, c[0x0][0x14c], PT; /* 0x5b181c00299c741e */
|
||||
/*04e8*/ @P0 BRA 0xe0; /* 0x12007ffdf800003c */
|
||||
/*04f0*/ IADD R0, R0, 0x1; /* 0xc0800000009c0001 */
|
||||
/*04f8*/ ISETP.LT.AND P0, PT, R0, R49, PT; /* 0xdb181c00189c001e */
|
||||
/* 0x0800000000b810b8 */
|
||||
/*0508*/ @P0 BRA 0xb0; /* 0x12007ffdd000003c */
|
||||
/*0510*/ MOV RZ, RZ; /* 0xe4c03c007f9c03fe */
|
||||
/*0518*/ EXIT ; /* 0x18000000001c003c */
|
||||
/*0520*/ BRA 0x520; /* 0x12007ffffc1c003c */
|
||||
/*0528*/ NOP; /* 0x85800000001c3c02 */
|
||||
/*0530*/ NOP; /* 0x85800000001c3c02 */
|
||||
/*0538*/ NOP; /* 0x85800000001c3c02 */
|
||||
..................................
|
||||
|
||||
|
||||
239
examples_cuda/stencil/2.s
Normal file
239
examples_cuda/stencil/2.s
Normal file
@@ -0,0 +1,239 @@
|
||||
|
||||
code for sm_35
|
||||
Function : stencil_step_task
|
||||
.headerflags @"EF_CUDA_SM35 EF_CUDA_PTX_SM(EF_CUDA_SM35)"
|
||||
/* 0x0880acb0a0a0a000 */
|
||||
/*0008*/ MOV R1, c[0x0][0x44]; /* 0x64c03c00089c0006 */
|
||||
/*0010*/ S2R R10, SR_CTAID.X; /* 0x86400000129c002a */
|
||||
/*0018*/ IADD R44, R10, c[0x0][0x150]; /* 0x608000002a1c28b2 */
|
||||
/*0020*/ IADD R0, R44, 0x1; /* 0xc0800000009cb001 */
|
||||
/*0028*/ ISETP.GE.AND P0, PT, R44, R0, PT; /* 0xdb681c00001cb01e */
|
||||
/*0030*/ @P0 EXIT ; /* 0x180000000000003c */
|
||||
/*0038*/ MOV R11, c[0x0][0x154]; /* 0x64c03c002a9c002e */
|
||||
/* 0x0888108010a01080 */
|
||||
/*0048*/ IADD R41, R10, c[0x0][0x150]; /* 0x608000002a1c28a6 */
|
||||
/*0050*/ MOV R12, c[0x0][0x160]; /* 0x64c03c002c1c0032 */
|
||||
/*0058*/ MOV R13, c[0x0][0x164]; /* 0x64c03c002c9c0036 */
|
||||
/*0060*/ IMUL R35, R11, c[0x0][0x158]; /* 0x61c018002b1c2c8e */
|
||||
/*0068*/ LD.E.64 R8, [R12]; /* 0xc5800000001c3020 */
|
||||
/*0070*/ SHF.L R36, RZ, 0x1, R11; /* 0xb7c02c00009ffc91 */
|
||||
/*0078*/ MOV R42, c[0x0][0x148]; /* 0x64c03c00291c00aa */
|
||||
/* 0x088c80108c108c10 */
|
||||
/*0088*/ LD.E.64 R6, [R12+0x8]; /* 0xc5800000041c3018 */
|
||||
/*0090*/ IMUL R0, R11, 0x3; /* 0xc1c01800019c2c01 */
|
||||
/*0098*/ LD.E.64 R4, [R12+0x10]; /* 0xc5800000081c3010 */
|
||||
/*00a0*/ IMUL R18, R11, -0x3; /* 0xc9c01bfffe9c2c49 */
|
||||
/*00a8*/ SHF.L R37, RZ, 0x1, R35; /* 0xb7c08c00009ffc95 */
|
||||
/*00b0*/ LD.E.64 R2, [R12+0x18]; /* 0xc58000000c1c3008 */
|
||||
/*00b8*/ IMUL R19, R35, 0x3; /* 0xc1c01800019c8c4d */
|
||||
/* 0x0880acb0a0acb000 */
|
||||
/*00c8*/ IMUL R20, R35, -0x3; /* 0xc9c01bfffe9c8c51 */
|
||||
/*00d0*/ ISETP.GE.AND P0, PT, R42, c[0x0][0x14c], PT; /* 0x5b681c00299ca81e */
|
||||
/*00d8*/ @P0 BRA 0x6d8; /* 0x12000002fc00003c */
|
||||
/*00e0*/ MOV R10, c[0x0][0x140]; /* 0x64c03c00281c002a */
|
||||
/*00e8*/ ISETP.LT.AND P0, PT, R10, c[0x0][0x144], PT; /* 0x5b181c00289c281e */
|
||||
/*00f0*/ @!P0 BRA 0x6d8; /* 0x12000002f020003c */
|
||||
/*00f8*/ IMUL R40, R44, R35; /* 0xe1c01800119cb0a2 */
|
||||
/* 0x088880108c10a000 */
|
||||
/*0108*/ MOV R21, c[0x0][0x148]; /* 0x64c03c00291c0056 */
|
||||
/*0110*/ IMAD R39, R21, c[0x0][0x154], R40; /* 0x5108a0002a9c549e */
|
||||
/*0118*/ MOV R34, c[0x0][0x140]; /* 0x64c03c00281c008a */
|
||||
/*0120*/ IADD R29, R39, R37; /* 0xe0800000129c9c76 */
|
||||
/*0128*/ IADD R22, R39, c[0x0][0x154]; /* 0x608000002a9c9c5a */
|
||||
/*0130*/ ISUB R32, R39, R37; /* 0xe0880000129c9c82 */
|
||||
/*0138*/ IADD R23, R39, R36; /* 0xe0800000121c9c5e */
|
||||
/* 0x0880808080108c10 */
|
||||
/*0148*/ ISUB R24, R39, c[0x0][0x154]; /* 0x608800002a9c9c62 */
|
||||
/*0150*/ IADD R25, R39, R0; /* 0xe0800000001c9c66 */
|
||||
/*0158*/ ISUB R26, R39, R36; /* 0xe0880000121c9c6a */
|
||||
/*0160*/ IADD R27, R39, R35; /* 0xe0800000119c9c6e */
|
||||
/*0168*/ IADD R28, R39, R18; /* 0xe0800000091c9c72 */
|
||||
/*0170*/ ISUB R30, R39, R35; /* 0xe0880000119c9c7a */
|
||||
/*0178*/ IADD R33, R39, R20; /* 0xe08000000a1c9c86 */
|
||||
/* 0x08a0acb0a0a0a000 */
|
||||
/*0188*/ IADD R31, R39, R19; /* 0xe0800000099c9c7e */
|
||||
/*0190*/ S2R R10, SR_TID.X; /* 0x86400000109c002a */
|
||||
/*0198*/ LOP.AND R11, R10, 0x1f; /* 0xc20000000f9c282d */
|
||||
/*01a0*/ IADD R43, R11, R34; /* 0xe0800000111c2cae */
|
||||
/*01a8*/ ISETP.GE.AND P0, PT, R43, c[0x0][0x144], PT; /* 0x5b681c00289cac1e */
|
||||
/*01b0*/ @P0 BRA.U 0x6a0; /* 0x120000027400023c */
|
||||
/*01b8*/ @!P0 IADD R10, R39, R43; /* 0xe080000015a09c2a */
|
||||
/* 0x08a0108c109c80a0 */
|
||||
/*01c8*/ @!P0 SHF.L R38, RZ, 0x3, R10; /* 0xb7c0280001a3fc99 */
|
||||
/*01d0*/ @!P0 IADD R10, R38, -0x8; /* 0xc88003fffc209829 */
|
||||
/*01d8*/ @!P0 IADD R11, R38, 0x8; /* 0xc08000000420982d */
|
||||
/*01e0*/ @!P0 BFE R12, R10, 0x11f; /* 0xc00800008fa02831 */
|
||||
/*01e8*/ @!P0 IADD R54.CC, R10, c[0x0][0x170]; /* 0x608400002e2028da */
|
||||
/*01f0*/ @!P0 IADD R10, R38, -0x10; /* 0xc88003fff8209829 */
|
||||
/*01f8*/ @!P0 BFE R13, R11, 0x11f; /* 0xc00800008fa02c35 */
|
||||
/* 0x08808080a0108c10 */
|
||||
/*0208*/ @!P0 IADD.X R55, R12, c[0x0][0x174]; /* 0x608040002ea030de */
|
||||
/*0210*/ @!P0 IADD R46.CC, R11, c[0x0][0x170]; /* 0x608400002e202cba */
|
||||
/*0218*/ @!P0 IADD R11, R38, 0x10; /* 0xc08000000820982d */
|
||||
/*0220*/ @!P0 BFE R14, R10, 0x11f; /* 0xc00800008fa02839 */
|
||||
/*0228*/ @!P0 IADD.X R47, R13, c[0x0][0x174]; /* 0x608040002ea034be */
|
||||
/*0230*/ @!P0 IADD R48.CC, R10, c[0x0][0x170]; /* 0x608400002e2028c2 */
|
||||
/*0238*/ @!P0 IADD R10, R22, R43; /* 0xe080000015a0582a */
|
||||
/* 0x08ac108080909410 */
|
||||
/*0248*/ @!P0 LD.E.64 R12, [R54]; /* 0xc58000000020d830 */
|
||||
/*0250*/ @!P0 BFE R15, R11, 0x11f; /* 0xc00800008fa02c3d */
|
||||
/*0258*/ @!P0 LD.E.64 R16, [R46]; /* 0xc58000000020b840 */
|
||||
/*0260*/ @!P0 IADD.X R49, R14, c[0x0][0x174]; /* 0x608040002ea038c6 */
|
||||
/*0268*/ @!P0 IADD R52.CC, R11, c[0x0][0x170]; /* 0x608400002e202cd2 */
|
||||
/*0270*/ @!P0 SHF.L R50, RZ, 0x3, R10; /* 0xb7c0280001a3fcc9 */
|
||||
/*0278*/ @!P0 IADD R14, R23, R43; /* 0xe080000015a05c3a */
|
||||
/* 0x08908c108c108010 */
|
||||
/*0288*/ @!P0 IADD.X R53, R15, c[0x0][0x174]; /* 0x608040002ea03cd6 */
|
||||
/*0290*/ @!P0 BFE R51, R50, 0x11f; /* 0xc00800008fa0c8cd */
|
||||
/*0298*/ @!P0 IADD R50.CC, R50, c[0x0][0x170]; /* 0x608400002e20c8ca */
|
||||
/*02a0*/ @!P0 SHF.L R45, RZ, 0x3, R14; /* 0xb7c0380001a3fcb5 */
|
||||
/*02a8*/ @!P0 LD.E.64 R10, [R48]; /* 0xc58000000020c028 */
|
||||
/*02b0*/ @!P0 DADD R12, R12, R16; /* 0xe380000008203032 */
|
||||
/*02b8*/ @!P0 LD.E.64 R14, [R52]; /* 0xc58000000020d038 */
|
||||
/* 0x089c8010b0108c10 */
|
||||
/*02c8*/ @!P0 IADD.X R51, R51, c[0x0][0x174]; /* 0x608040002ea0ccce */
|
||||
/*02d0*/ @!P0 BFE R17, R45, 0x11f; /* 0xc00800008fa0b445 */
|
||||
/*02d8*/ @!P0 IADD R16, R24, R43; /* 0xe080000015a06042 */
|
||||
/*02e0*/ @!P0 IADD R46.CC, R45, c[0x0][0x170]; /* 0x608400002e20b4ba */
|
||||
/*02e8*/ @!P0 SHF.L R45, RZ, 0x3, R16; /* 0xb7c0400001a3fcb5 */
|
||||
/*02f0*/ @!P0 IADD.X R47, R17, c[0x0][0x174]; /* 0x608040002ea044be */
|
||||
/*02f8*/ @!P0 LD.E.64 R16, [R50]; /* 0xc58000000020c840 */
|
||||
/* 0x08848010a8108080 */
|
||||
/*0308*/ @!P0 IADD R54.CC, R45, c[0x0][0x170]; /* 0x608400002e20b4da */
|
||||
/*0310*/ @!P0 DADD R48, R10, R14; /* 0xe3800000072028c2 */
|
||||
/*0318*/ @!P0 BFE R11, R45, 0x11f; /* 0xc00800008fa0b42d */
|
||||
/*0320*/ @!P0 IADD R10, R26, R43; /* 0xe080000015a0682a */
|
||||
/*0328*/ @!P0 IADD.X R55, R11, c[0x0][0x174]; /* 0x608040002ea02cde */
|
||||
/*0330*/ @!P0 SHF.L R45, RZ, 0x3, R10; /* 0xb7c0280001a3fcb5 */
|
||||
/*0338*/ @!P0 LD.E.64 R14, [R46]; /* 0xc58000000020b838 */
|
||||
/* 0x0890988010801094 */
|
||||
/*0348*/ @!P0 DADD R16, R12, R16; /* 0xe380000008203042 */
|
||||
/*0350*/ @!P0 IADD R13, R27, R43; /* 0xe080000015a06c36 */
|
||||
/*0358*/ @!P0 LD.E.64 R10, [R54]; /* 0xc58000000020d828 */
|
||||
/*0360*/ @!P0 BFE R53, R45, 0x11f; /* 0xc00800008fa0b4d5 */
|
||||
/*0368*/ @!P0 IADD R52.CC, R45, c[0x0][0x170]; /* 0x608400002e20b4d2 */
|
||||
/*0370*/ @!P0 IADD R12, R29, R43; /* 0xe080000015a07432 */
|
||||
/*0378*/ @!P0 SHF.L R13, RZ, 0x3, R13; /* 0xb7c0340001a3fc35 */
|
||||
/* 0x0894801094108c10 */
|
||||
/*0388*/ @!P0 IADD.X R53, R53, c[0x0][0x174]; /* 0x608040002ea0d4d6 */
|
||||
/*0390*/ @!P0 SHF.L R45, RZ, 0x3, R12; /* 0xb7c0300001a3fcb5 */
|
||||
/*0398*/ @!P0 BFE R46, R13, 0x11f; /* 0xc00800008fa034b9 */
|
||||
/*03a0*/ @!P0 IADD R50.CC, R13, c[0x0][0x170]; /* 0x608400002e2034ca */
|
||||
/*03a8*/ @!P0 LD.E.64 R12, [R52]; /* 0xc58000000020d030 */
|
||||
/*03b0*/ @!P0 DADD R16, R16, R10; /* 0xe380000005204042 */
|
||||
/*03b8*/ @!P0 BFE R10, R45, 0x11f; /* 0xc00800008fa0b429 */
|
||||
/* 0x08a0108c109c8010 */
|
||||
/*03c8*/ @!P0 IADD.X R51, R46, c[0x0][0x174]; /* 0x608040002ea0b8ce */
|
||||
/*03d0*/ @!P0 IADD R54.CC, R45, c[0x0][0x170]; /* 0x608400002e20b4da */
|
||||
/*03d8*/ @!P0 IADD R45, R30, R43; /* 0xe080000015a078b6 */
|
||||
/*03e0*/ @!P0 LD.E.64 R46, [R50]; /* 0xc58000000020c8b8 */
|
||||
/*03e8*/ @!P0 DADD R14, R48, R14; /* 0xe38000000720c03a */
|
||||
/*03f0*/ @!P0 IADD.X R55, R10, c[0x0][0x174]; /* 0x608040002ea028de */
|
||||
/*03f8*/ @!P0 SHF.L R48, RZ, 0x3, R45; /* 0xb7c0b40001a3fcc1 */
|
||||
/* 0x088480a080108010 */
|
||||
/*0408*/ @!P0 IADD R45, R32, R43; /* 0xe080000015a080b6 */
|
||||
/*0410*/ @!P0 LD.E.64 R10, [R54]; /* 0xc58000000020d828 */
|
||||
/*0418*/ @!P0 BFE R49, R48, 0x11f; /* 0xc00800008fa0c0c5 */
|
||||
/*0420*/ @!P0 IADD R48.CC, R48, c[0x0][0x170]; /* 0x608400002e20c0c2 */
|
||||
/*0428*/ @!P0 DADD R14, R14, R12; /* 0xe38000000620383a */
|
||||
/*0430*/ @!P0 DADD R12, R16, R46; /* 0xe380000017204032 */
|
||||
/*0438*/ @!P0 SHF.L R46, RZ, 0x3, R45; /* 0xb7c0b40001a3fcb9 */
|
||||
/* 0x0880808010b08010 */
|
||||
/*0448*/ @!P0 IADD.X R49, R49, c[0x0][0x174]; /* 0x608040002ea0c4c6 */
|
||||
/*0450*/ @!P0 BFE R45, R38, 0x11f; /* 0xc00800008fa098b5 */
|
||||
/*0458*/ @!P0 IADD R16.CC, R38, c[0x0][0x170]; /* 0x608400002e209842 */
|
||||
/*0460*/ @!P0 IADD.X R17, R45, c[0x0][0x174]; /* 0x608040002ea0b446 */
|
||||
/*0468*/ @!P0 LD.E.64 R50, [R48]; /* 0xc58000000020c0c8 */
|
||||
/*0470*/ @!P0 DADD R14, R14, R10; /* 0xe38000000520383a */
|
||||
/*0478*/ @!P0 BFE R10, R46, 0x11f; /* 0xc00800008fa0b829 */
|
||||
/* 0x0880bc109c1080b0 */
|
||||
/*0488*/ @!P0 IADD R46.CC, R46, c[0x0][0x170]; /* 0x608400002e20b8ba */
|
||||
/*0490*/ @!P0 IADD.X R47, R10, c[0x0][0x174]; /* 0x608040002ea028be */
|
||||
/*0498*/ @!P0 LD.E.64 R10, [R16]; /* 0xc580000000204028 */
|
||||
/*04a0*/ @!P0 IADD R48, R38, -0x18; /* 0xc88003fff42098c1 */
|
||||
/*04a8*/ @!P0 LD.E.64 R52, [R46]; /* 0xc58000000020b8d0 */
|
||||
/*04b0*/ @!P0 DADD R12, R12, R50; /* 0xe380000019203032 */
|
||||
/*04b8*/ @!P0 DMUL R50, R8, R10; /* 0xe4000000052020ca */
|
||||
/* 0x08b08010b01080a0 */
|
||||
/*04c8*/ @!P0 IADD R46, R38, 0x18; /* 0xc08000000c2098b9 */
|
||||
/*04d0*/ @!P0 DFMA R16, R6, R12, R50; /* 0xdb80c80006201842 */
|
||||
/*04d8*/ @!P0 BFE R13, R48, 0x11f; /* 0xc00800008fa0c035 */
|
||||
/*04e0*/ @!P0 IADD R12.CC, R48, c[0x0][0x170]; /* 0x608400002e20c032 */
|
||||
/*04e8*/ @!P0 BFE R47, R46, 0x11f; /* 0xc00800008fa0b8bd */
|
||||
/*04f0*/ @!P0 IADD.X R13, R13, c[0x0][0x174]; /* 0x608040002ea03436 */
|
||||
/*04f8*/ @!P0 IADD R46.CC, R46, c[0x0][0x170]; /* 0x608400002e20b8ba */
|
||||
/* 0x08a0a080dc109c80 */
|
||||
/*0508*/ @!P0 IADD.X R47, R47, c[0x0][0x174]; /* 0x608040002ea0bcbe */
|
||||
/*0510*/ @!P0 LD.E.64 R48, [R12]; /* 0xc5800000002030c0 */
|
||||
/*0518*/ @!P0 LD.E.64 R50, [R46]; /* 0xc58000000020b8c8 */
|
||||
/*0520*/ @!P0 DADD R14, R14, R52; /* 0xe38000001a20383a */
|
||||
/*0528*/ @!P0 DADD R12, R48, R50; /* 0xe38000001920c032 */
|
||||
/*0530*/ @!P0 IADD R48, R25, R43; /* 0xe080000015a064c2 */
|
||||
/*0538*/ @!P0 SHF.L R46, RZ, 0x3, R48; /* 0xb7c0c00001a3fcb9 */
|
||||
/* 0x08a080dc10a0b010 */
|
||||
/*0548*/ @!P0 BFE R47, R46, 0x11f; /* 0xc00800008fa0b8bd */
|
||||
/*0550*/ @!P0 IADD R46.CC, R46, c[0x0][0x170]; /* 0x608400002e20b8ba */
|
||||
/*0558*/ @!P0 IADD.X R47, R47, c[0x0][0x174]; /* 0x608040002ea0bcbe */
|
||||
/*0560*/ @!P0 LD.E.64 R48, [R46]; /* 0xc58000000020b8c0 */
|
||||
/*0568*/ @!P0 DADD R10, R10, R10; /* 0xe38000000520282a */
|
||||
/*0570*/ @!P0 DADD R12, R12, R48; /* 0xe380000018203032 */
|
||||
/*0578*/ @!P0 IADD R48, R28, R43; /* 0xe080000015a070c2 */
|
||||
/* 0x08a080dca0b010a0 */
|
||||
/*0588*/ @!P0 SHF.L R46, RZ, 0x3, R48; /* 0xb7c0c00001a3fcb9 */
|
||||
/*0590*/ @!P0 BFE R47, R46, 0x11f; /* 0xc00800008fa0b8bd */
|
||||
/*0598*/ @!P0 IADD R46.CC, R46, c[0x0][0x170]; /* 0x608400002e20b8ba */
|
||||
/*05a0*/ @!P0 IADD.X R47, R47, c[0x0][0x174]; /* 0x608040002ea0bcbe */
|
||||
/*05a8*/ @!P0 LD.E.64 R48, [R46]; /* 0xc58000000020b8c0 */
|
||||
/*05b0*/ @!P0 DADD R12, R12, R48; /* 0xe380000018203032 */
|
||||
/*05b8*/ @!P0 IADD R48, R31, R43; /* 0xe080000015a07cc2 */
|
||||
/* 0x0880a010b010a010 */
|
||||
/*05c8*/ @!P0 IADD R43, R33, R43; /* 0xe080000015a084ae */
|
||||
/*05d0*/ @!P0 SHF.L R46, RZ, 0x3, R48; /* 0xb7c0c00001a3fcb9 */
|
||||
/*05d8*/ @!P0 BFE R47, R46, 0x11f; /* 0xc00800008fa0b8bd */
|
||||
/*05e0*/ @!P0 IADD R48.CC, R46, c[0x0][0x170]; /* 0x608400002e20b8c2 */
|
||||
/*05e8*/ @!P0 IADD.X R49, R47, c[0x0][0x174]; /* 0x608040002ea0bcc6 */
|
||||
/*05f0*/ @!P0 SHF.L R43, RZ, 0x3, R43; /* 0xb7c0ac0001a3fcad */
|
||||
/*05f8*/ @!P0 LD.E.64 R46, [R48]; /* 0xc58000000020c0b8 */
|
||||
/* 0x0880909c80a080d8 */
|
||||
/*0608*/ @!P0 IADD R52.CC, R43, c[0x0][0x170]; /* 0x608400002e20acd2 */
|
||||
/*0610*/ @!P0 DADD R46, R12, R46; /* 0xe3800000172030ba */
|
||||
/*0618*/ @!P0 BFE R12, R43, 0x11f; /* 0xc00800008fa0ac31 */
|
||||
/*0620*/ @!P0 IADD.X R53, R12, c[0x0][0x174]; /* 0x608040002ea030d6 */
|
||||
/*0628*/ @!P0 IADD R12.CC, R38, c[0x0][0x178]; /* 0x608400002f209832 */
|
||||
/*0630*/ @!P0 LD.E.64 R48, [R52]; /* 0xc58000000020d0c0 */
|
||||
/*0638*/ @!P0 IADD.X R13, R45, c[0x0][0x17c]; /* 0x608040002fa0b436 */
|
||||
/* 0x08cc8c10a48090b0 */
|
||||
/*0648*/ @!P0 IADD R50.CC, R38, c[0x0][0x168]; /* 0x608400002d2098ca */
|
||||
/*0650*/ @!P0 IADD.X R51, R45, c[0x0][0x16c]; /* 0x608040002da0b4ce */
|
||||
/*0658*/ @!P0 DADD R46, R46, R48; /* 0xe38000001820b8ba */
|
||||
/*0660*/ @!P0 DFMA R48, R4, R14, R16; /* 0xdb804000072010c2 */
|
||||
/*0668*/ @!P0 LD.E.64 R16, [R12]; /* 0xc580000000203040 */
|
||||
/*0670*/ @!P0 DFMA R48, R2, R46, R48; /* 0xdb80c000172008c2 */
|
||||
/*0678*/ @!P0 LD.E.64 R14, [R50]; /* 0xc58000000020c838 */
|
||||
/* 0x08a0b8b0a000a4a4 */
|
||||
/*0688*/ @!P0 DADD R10, R10, -R16; /* 0xe38100000820282a */
|
||||
/*0690*/ @!P0 DFMA R10, R48, R14, R10; /* 0xdb8028000720c02a */
|
||||
/*0698*/ @!P0 ST.E.64 [R12], R10; /* 0xe580000000203028 */
|
||||
/*06a0*/ IADD R34, R34, 0x20; /* 0xc0800000101c8889 */
|
||||
/*06a8*/ ISETP.LT.AND P0, PT, R34, c[0x0][0x144], PT; /* 0x5b181c00289c881e */
|
||||
/*06b0*/ @P0 BRA 0x190; /* 0x12007ffd6c00003c */
|
||||
/*06b8*/ IADD R21, R21, 0x1; /* 0xc0800000009c5455 */
|
||||
/* 0x08b810b8b010b8b0 */
|
||||
/*06c8*/ ISETP.EQ.AND P0, PT, R21, c[0x0][0x14c], PT; /* 0x5b281c00299c541e */
|
||||
/*06d0*/ @!P0 BRA 0x110; /* 0x12007ffd1c20003c */
|
||||
/*06d8*/ ISETP.NE.AND P0, PT, R44, R41, PT; /* 0xdb581c00149cb01e */
|
||||
/*06e0*/ IADD R44, R44, 0x1; /* 0xc0800000009cb0b1 */
|
||||
/*06e8*/ @P0 BRA 0xd0; /* 0x12007ffcf000003c */
|
||||
/*06f0*/ MOV RZ, RZ; /* 0xe4c03c007f9c03fe */
|
||||
/*06f8*/ EXIT ; /* 0x18000000001c003c */
|
||||
/*0700*/ BRA 0x700; /* 0x12007ffffc1c003c */
|
||||
/*0708*/ NOP; /* 0x85800000001c3c02 */
|
||||
/*0710*/ NOP; /* 0x85800000001c3c02 */
|
||||
/*0718*/ NOP; /* 0x85800000001c3c02 */
|
||||
/*0720*/ NOP; /* 0x85800000001c3c02 */
|
||||
/*0728*/ NOP; /* 0x85800000001c3c02 */
|
||||
/*0730*/ NOP; /* 0x85800000001c3c02 */
|
||||
/*0738*/ NOP; /* 0x85800000001c3c02 */
|
||||
..................................
|
||||
|
||||
|
||||
239
examples_cuda/stencil/3.s
Normal file
239
examples_cuda/stencil/3.s
Normal file
@@ -0,0 +1,239 @@
|
||||
|
||||
code for sm_35
|
||||
Function : stencil_step_task
|
||||
.headerflags @"EF_CUDA_SM35 EF_CUDA_PTX_SM(EF_CUDA_SM35)"
|
||||
/* 0x0880acb0a0a0a000 */
|
||||
/*0008*/ MOV R1, c[0x0][0x44]; /* 0x64c03c00089c0006 */
|
||||
/*0010*/ S2R R10, SR_CTAID.X; /* 0x86400000129c002a */
|
||||
/*0018*/ IADD R44, R10, c[0x0][0x150]; /* 0x608000002a1c28b2 */
|
||||
/*0020*/ IADD R0, R44, 0x1; /* 0xc0800000009cb001 */
|
||||
/*0028*/ ISETP.GE.AND P0, PT, R44, R0, PT; /* 0xdb681c00001cb01e */
|
||||
/*0030*/ @P0 EXIT ; /* 0x180000000000003c */
|
||||
/*0038*/ MOV R11, c[0x0][0x154]; /* 0x64c03c002a9c002e */
|
||||
/* 0x0888108010a01080 */
|
||||
/*0048*/ IADD R41, R10, c[0x0][0x150]; /* 0x608000002a1c28a6 */
|
||||
/*0050*/ MOV R12, c[0x0][0x160]; /* 0x64c03c002c1c0032 */
|
||||
/*0058*/ MOV R13, c[0x0][0x164]; /* 0x64c03c002c9c0036 */
|
||||
/*0060*/ IMUL R35, R11, c[0x0][0x158]; /* 0x61c018002b1c2c8e */
|
||||
/*0068*/ LD.E.64 R8, [R12]; /* 0xc5800000001c3020 */
|
||||
/*0070*/ SHF.L R36, RZ, 0x1, R11; /* 0xb7c02c00009ffc91 */
|
||||
/*0078*/ MOV R42, c[0x0][0x148]; /* 0x64c03c00291c00aa */
|
||||
/* 0x088c80108c108c10 */
|
||||
/*0088*/ LD.E.64 R6, [R12+0x8]; /* 0xc5800000041c3018 */
|
||||
/*0090*/ IMUL R0, R11, 0x3; /* 0xc1c01800019c2c01 */
|
||||
/*0098*/ LD.E.64 R4, [R12+0x10]; /* 0xc5800000081c3010 */
|
||||
/*00a0*/ IMUL R18, R11, -0x3; /* 0xc9c01bfffe9c2c49 */
|
||||
/*00a8*/ SHF.L R37, RZ, 0x1, R35; /* 0xb7c08c00009ffc95 */
|
||||
/*00b0*/ LD.E.64 R2, [R12+0x18]; /* 0xc58000000c1c3008 */
|
||||
/*00b8*/ IMUL R19, R35, 0x3; /* 0xc1c01800019c8c4d */
|
||||
/* 0x0880acb0a0acb000 */
|
||||
/*00c8*/ IMUL R20, R35, -0x3; /* 0xc9c01bfffe9c8c51 */
|
||||
/*00d0*/ ISETP.GE.AND P0, PT, R42, c[0x0][0x14c], PT; /* 0x5b681c00299ca81e */
|
||||
/*00d8*/ @P0 BRA 0x6d8; /* 0x12000002fc00003c */
|
||||
/*00e0*/ MOV R10, c[0x0][0x140]; /* 0x64c03c00281c002a */
|
||||
/*00e8*/ ISETP.LT.AND P0, PT, R10, c[0x0][0x144], PT; /* 0x5b181c00289c281e */
|
||||
/*00f0*/ @!P0 BRA 0x6d8; /* 0x12000002f020003c */
|
||||
/*00f8*/ IMUL R40, R44, R35; /* 0xe1c01800119cb0a2 */
|
||||
/* 0x088880108c10a000 */
|
||||
/*0108*/ MOV R21, c[0x0][0x148]; /* 0x64c03c00291c0056 */
|
||||
/*0110*/ IMAD R39, R21, c[0x0][0x154], R40; /* 0x5108a0002a9c549e */
|
||||
/*0118*/ MOV R34, c[0x0][0x140]; /* 0x64c03c00281c008a */
|
||||
/*0120*/ IADD R29, R39, R37; /* 0xe0800000129c9c76 */
|
||||
/*0128*/ IADD R22, R39, c[0x0][0x154]; /* 0x608000002a9c9c5a */
|
||||
/*0130*/ ISUB R32, R39, R37; /* 0xe0880000129c9c82 */
|
||||
/*0138*/ IADD R23, R39, R36; /* 0xe0800000121c9c5e */
|
||||
/* 0x0880808080108c10 */
|
||||
/*0148*/ ISUB R24, R39, c[0x0][0x154]; /* 0x608800002a9c9c62 */
|
||||
/*0150*/ IADD R25, R39, R0; /* 0xe0800000001c9c66 */
|
||||
/*0158*/ ISUB R26, R39, R36; /* 0xe0880000121c9c6a */
|
||||
/*0160*/ IADD R27, R39, R35; /* 0xe0800000119c9c6e */
|
||||
/*0168*/ IADD R28, R39, R18; /* 0xe0800000091c9c72 */
|
||||
/*0170*/ ISUB R30, R39, R35; /* 0xe0880000119c9c7a */
|
||||
/*0178*/ IADD R33, R39, R20; /* 0xe08000000a1c9c86 */
|
||||
/* 0x08a0acb0a0a0a000 */
|
||||
/*0188*/ IADD R31, R39, R19; /* 0xe0800000099c9c7e */
|
||||
/*0190*/ S2R R10, SR_TID.X; /* 0x86400000109c002a */
|
||||
/*0198*/ LOP.AND R11, R10, 0x1f; /* 0xc20000000f9c282d */
|
||||
/*01a0*/ IADD R43, R11, R34; /* 0xe0800000111c2cae */
|
||||
/*01a8*/ ISETP.GE.AND P0, PT, R43, c[0x0][0x144], PT; /* 0x5b681c00289cac1e */
|
||||
/*01b0*/ @P0 BRA.U 0x6a0; /* 0x120000027400023c */
|
||||
/*01b8*/ @!P0 IADD R10, R39, R43; /* 0xe080000015a09c2a */
|
||||
/* 0x08a0108c109c80a0 */
|
||||
/*01c8*/ @!P0 SHF.L R38, RZ, 0x3, R10; /* 0xb7c0280001a3fc99 */
|
||||
/*01d0*/ @!P0 IADD R10, R38, -0x8; /* 0xc88003fffc209829 */
|
||||
/*01d8*/ @!P0 IADD R11, R38, 0x8; /* 0xc08000000420982d */
|
||||
/*01e0*/ @!P0 BFE R12, R10, 0x11f; /* 0xc00800008fa02831 */
|
||||
/*01e8*/ @!P0 IADD R54.CC, R10, c[0x0][0x170]; /* 0x608400002e2028da */
|
||||
/*01f0*/ @!P0 IADD R10, R38, -0x10; /* 0xc88003fff8209829 */
|
||||
/*01f8*/ @!P0 BFE R13, R11, 0x11f; /* 0xc00800008fa02c35 */
|
||||
/* 0x08808080a0108c10 */
|
||||
/*0208*/ @!P0 IADD.X R55, R12, c[0x0][0x174]; /* 0x608040002ea030de */
|
||||
/*0210*/ @!P0 IADD R46.CC, R11, c[0x0][0x170]; /* 0x608400002e202cba */
|
||||
/*0218*/ @!P0 IADD R11, R38, 0x10; /* 0xc08000000820982d */
|
||||
/*0220*/ @!P0 BFE R14, R10, 0x11f; /* 0xc00800008fa02839 */
|
||||
/*0228*/ @!P0 IADD.X R47, R13, c[0x0][0x174]; /* 0x608040002ea034be */
|
||||
/*0230*/ @!P0 IADD R48.CC, R10, c[0x0][0x170]; /* 0x608400002e2028c2 */
|
||||
/*0238*/ @!P0 IADD R10, R22, R43; /* 0xe080000015a0582a */
|
||||
/* 0x08ac108080909410 */
|
||||
/*0248*/ @!P0 LD.E.64 R12, [R54]; /* 0xc58000000020d830 */
|
||||
/*0250*/ @!P0 BFE R15, R11, 0x11f; /* 0xc00800008fa02c3d */
|
||||
/*0258*/ @!P0 LD.E.64 R16, [R46]; /* 0xc58000000020b840 */
|
||||
/*0260*/ @!P0 IADD.X R49, R14, c[0x0][0x174]; /* 0x608040002ea038c6 */
|
||||
/*0268*/ @!P0 IADD R52.CC, R11, c[0x0][0x170]; /* 0x608400002e202cd2 */
|
||||
/*0270*/ @!P0 SHF.L R50, RZ, 0x3, R10; /* 0xb7c0280001a3fcc9 */
|
||||
/*0278*/ @!P0 IADD R14, R23, R43; /* 0xe080000015a05c3a */
|
||||
/* 0x08908c108c108010 */
|
||||
/*0288*/ @!P0 IADD.X R53, R15, c[0x0][0x174]; /* 0x608040002ea03cd6 */
|
||||
/*0290*/ @!P0 BFE R51, R50, 0x11f; /* 0xc00800008fa0c8cd */
|
||||
/*0298*/ @!P0 IADD R50.CC, R50, c[0x0][0x170]; /* 0x608400002e20c8ca */
|
||||
/*02a0*/ @!P0 SHF.L R45, RZ, 0x3, R14; /* 0xb7c0380001a3fcb5 */
|
||||
/*02a8*/ @!P0 LD.E.64 R10, [R48]; /* 0xc58000000020c028 */
|
||||
/*02b0*/ @!P0 DADD R12, R12, R16; /* 0xe380000008203032 */
|
||||
/*02b8*/ @!P0 LD.E.64 R14, [R52]; /* 0xc58000000020d038 */
|
||||
/* 0x089c8010b0108c10 */
|
||||
/*02c8*/ @!P0 IADD.X R51, R51, c[0x0][0x174]; /* 0x608040002ea0ccce */
|
||||
/*02d0*/ @!P0 BFE R17, R45, 0x11f; /* 0xc00800008fa0b445 */
|
||||
/*02d8*/ @!P0 IADD R16, R24, R43; /* 0xe080000015a06042 */
|
||||
/*02e0*/ @!P0 IADD R46.CC, R45, c[0x0][0x170]; /* 0x608400002e20b4ba */
|
||||
/*02e8*/ @!P0 SHF.L R45, RZ, 0x3, R16; /* 0xb7c0400001a3fcb5 */
|
||||
/*02f0*/ @!P0 IADD.X R47, R17, c[0x0][0x174]; /* 0x608040002ea044be */
|
||||
/*02f8*/ @!P0 LD.E.64 R16, [R50]; /* 0xc58000000020c840 */
|
||||
/* 0x08848010a8108080 */
|
||||
/*0308*/ @!P0 IADD R54.CC, R45, c[0x0][0x170]; /* 0x608400002e20b4da */
|
||||
/*0310*/ @!P0 DADD R48, R10, R14; /* 0xe3800000072028c2 */
|
||||
/*0318*/ @!P0 BFE R11, R45, 0x11f; /* 0xc00800008fa0b42d */
|
||||
/*0320*/ @!P0 IADD R10, R26, R43; /* 0xe080000015a0682a */
|
||||
/*0328*/ @!P0 IADD.X R55, R11, c[0x0][0x174]; /* 0x608040002ea02cde */
|
||||
/*0330*/ @!P0 SHF.L R45, RZ, 0x3, R10; /* 0xb7c0280001a3fcb5 */
|
||||
/*0338*/ @!P0 LD.E.64 R14, [R46]; /* 0xc58000000020b838 */
|
||||
/* 0x0890988010801094 */
|
||||
/*0348*/ @!P0 DADD R16, R12, R16; /* 0xe380000008203042 */
|
||||
/*0350*/ @!P0 IADD R13, R27, R43; /* 0xe080000015a06c36 */
|
||||
/*0358*/ @!P0 LD.E.64 R10, [R54]; /* 0xc58000000020d828 */
|
||||
/*0360*/ @!P0 BFE R53, R45, 0x11f; /* 0xc00800008fa0b4d5 */
|
||||
/*0368*/ @!P0 IADD R52.CC, R45, c[0x0][0x170]; /* 0x608400002e20b4d2 */
|
||||
/*0370*/ @!P0 IADD R12, R29, R43; /* 0xe080000015a07432 */
|
||||
/*0378*/ @!P0 SHF.L R13, RZ, 0x3, R13; /* 0xb7c0340001a3fc35 */
|
||||
/* 0x0894801094108c10 */
|
||||
/*0388*/ @!P0 IADD.X R53, R53, c[0x0][0x174]; /* 0x608040002ea0d4d6 */
|
||||
/*0390*/ @!P0 SHF.L R45, RZ, 0x3, R12; /* 0xb7c0300001a3fcb5 */
|
||||
/*0398*/ @!P0 BFE R46, R13, 0x11f; /* 0xc00800008fa034b9 */
|
||||
/*03a0*/ @!P0 IADD R50.CC, R13, c[0x0][0x170]; /* 0x608400002e2034ca */
|
||||
/*03a8*/ @!P0 LD.E.64 R12, [R52]; /* 0xc58000000020d030 */
|
||||
/*03b0*/ @!P0 DADD R16, R16, R10; /* 0xe380000005204042 */
|
||||
/*03b8*/ @!P0 BFE R10, R45, 0x11f; /* 0xc00800008fa0b429 */
|
||||
/* 0x08a0108c109c8010 */
|
||||
/*03c8*/ @!P0 IADD.X R51, R46, c[0x0][0x174]; /* 0x608040002ea0b8ce */
|
||||
/*03d0*/ @!P0 IADD R54.CC, R45, c[0x0][0x170]; /* 0x608400002e20b4da */
|
||||
/*03d8*/ @!P0 IADD R45, R30, R43; /* 0xe080000015a078b6 */
|
||||
/*03e0*/ @!P0 LD.E.64 R46, [R50]; /* 0xc58000000020c8b8 */
|
||||
/*03e8*/ @!P0 DADD R14, R48, R14; /* 0xe38000000720c03a */
|
||||
/*03f0*/ @!P0 IADD.X R55, R10, c[0x0][0x174]; /* 0x608040002ea028de */
|
||||
/*03f8*/ @!P0 SHF.L R48, RZ, 0x3, R45; /* 0xb7c0b40001a3fcc1 */
|
||||
/* 0x088480a080108010 */
|
||||
/*0408*/ @!P0 IADD R45, R32, R43; /* 0xe080000015a080b6 */
|
||||
/*0410*/ @!P0 LD.E.64 R10, [R54]; /* 0xc58000000020d828 */
|
||||
/*0418*/ @!P0 BFE R49, R48, 0x11f; /* 0xc00800008fa0c0c5 */
|
||||
/*0420*/ @!P0 IADD R48.CC, R48, c[0x0][0x170]; /* 0x608400002e20c0c2 */
|
||||
/*0428*/ @!P0 DADD R14, R14, R12; /* 0xe38000000620383a */
|
||||
/*0430*/ @!P0 DADD R12, R16, R46; /* 0xe380000017204032 */
|
||||
/*0438*/ @!P0 SHF.L R46, RZ, 0x3, R45; /* 0xb7c0b40001a3fcb9 */
|
||||
/* 0x0880808010b08010 */
|
||||
/*0448*/ @!P0 IADD.X R49, R49, c[0x0][0x174]; /* 0x608040002ea0c4c6 */
|
||||
/*0450*/ @!P0 BFE R45, R38, 0x11f; /* 0xc00800008fa098b5 */
|
||||
/*0458*/ @!P0 IADD R16.CC, R38, c[0x0][0x170]; /* 0x608400002e209842 */
|
||||
/*0460*/ @!P0 IADD.X R17, R45, c[0x0][0x174]; /* 0x608040002ea0b446 */
|
||||
/*0468*/ @!P0 LD.E.64 R50, [R48]; /* 0xc58000000020c0c8 */
|
||||
/*0470*/ @!P0 DADD R14, R14, R10; /* 0xe38000000520383a */
|
||||
/*0478*/ @!P0 BFE R10, R46, 0x11f; /* 0xc00800008fa0b829 */
|
||||
/* 0x0880bc109c1080b0 */
|
||||
/*0488*/ @!P0 IADD R46.CC, R46, c[0x0][0x170]; /* 0x608400002e20b8ba */
|
||||
/*0490*/ @!P0 IADD.X R47, R10, c[0x0][0x174]; /* 0x608040002ea028be */
|
||||
/*0498*/ @!P0 LD.E.64 R10, [R16]; /* 0xc580000000204028 */
|
||||
/*04a0*/ @!P0 IADD R48, R38, -0x18; /* 0xc88003fff42098c1 */
|
||||
/*04a8*/ @!P0 LD.E.64 R52, [R46]; /* 0xc58000000020b8d0 */
|
||||
/*04b0*/ @!P0 DADD R12, R12, R50; /* 0xe380000019203032 */
|
||||
/*04b8*/ @!P0 DMUL R50, R8, R10; /* 0xe4000000052020ca */
|
||||
/* 0x08b08010b01080a0 */
|
||||
/*04c8*/ @!P0 IADD R46, R38, 0x18; /* 0xc08000000c2098b9 */
|
||||
/*04d0*/ @!P0 DFMA R16, R6, R12, R50; /* 0xdb80c80006201842 */
|
||||
/*04d8*/ @!P0 BFE R13, R48, 0x11f; /* 0xc00800008fa0c035 */
|
||||
/*04e0*/ @!P0 IADD R12.CC, R48, c[0x0][0x170]; /* 0x608400002e20c032 */
|
||||
/*04e8*/ @!P0 BFE R47, R46, 0x11f; /* 0xc00800008fa0b8bd */
|
||||
/*04f0*/ @!P0 IADD.X R13, R13, c[0x0][0x174]; /* 0x608040002ea03436 */
|
||||
/*04f8*/ @!P0 IADD R46.CC, R46, c[0x0][0x170]; /* 0x608400002e20b8ba */
|
||||
/* 0x08a0a080dc109c80 */
|
||||
/*0508*/ @!P0 IADD.X R47, R47, c[0x0][0x174]; /* 0x608040002ea0bcbe */
|
||||
/*0510*/ @!P0 LD.E.64 R48, [R12]; /* 0xc5800000002030c0 */
|
||||
/*0518*/ @!P0 LD.E.64 R50, [R46]; /* 0xc58000000020b8c8 */
|
||||
/*0520*/ @!P0 DADD R14, R14, R52; /* 0xe38000001a20383a */
|
||||
/*0528*/ @!P0 DADD R12, R48, R50; /* 0xe38000001920c032 */
|
||||
/*0530*/ @!P0 IADD R48, R25, R43; /* 0xe080000015a064c2 */
|
||||
/*0538*/ @!P0 SHF.L R46, RZ, 0x3, R48; /* 0xb7c0c00001a3fcb9 */
|
||||
/* 0x08a080dc10a0b010 */
|
||||
/*0548*/ @!P0 BFE R47, R46, 0x11f; /* 0xc00800008fa0b8bd */
|
||||
/*0550*/ @!P0 IADD R46.CC, R46, c[0x0][0x170]; /* 0x608400002e20b8ba */
|
||||
/*0558*/ @!P0 IADD.X R47, R47, c[0x0][0x174]; /* 0x608040002ea0bcbe */
|
||||
/*0560*/ @!P0 LD.E.64 R48, [R46]; /* 0xc58000000020b8c0 */
|
||||
/*0568*/ @!P0 DADD R10, R10, R10; /* 0xe38000000520282a */
|
||||
/*0570*/ @!P0 DADD R12, R12, R48; /* 0xe380000018203032 */
|
||||
/*0578*/ @!P0 IADD R48, R28, R43; /* 0xe080000015a070c2 */
|
||||
/* 0x08a080dca0b010a0 */
|
||||
/*0588*/ @!P0 SHF.L R46, RZ, 0x3, R48; /* 0xb7c0c00001a3fcb9 */
|
||||
/*0590*/ @!P0 BFE R47, R46, 0x11f; /* 0xc00800008fa0b8bd */
|
||||
/*0598*/ @!P0 IADD R46.CC, R46, c[0x0][0x170]; /* 0x608400002e20b8ba */
|
||||
/*05a0*/ @!P0 IADD.X R47, R47, c[0x0][0x174]; /* 0x608040002ea0bcbe */
|
||||
/*05a8*/ @!P0 LD.E.64 R48, [R46]; /* 0xc58000000020b8c0 */
|
||||
/*05b0*/ @!P0 DADD R12, R12, R48; /* 0xe380000018203032 */
|
||||
/*05b8*/ @!P0 IADD R48, R31, R43; /* 0xe080000015a07cc2 */
|
||||
/* 0x0880a010b010a010 */
|
||||
/*05c8*/ @!P0 IADD R43, R33, R43; /* 0xe080000015a084ae */
|
||||
/*05d0*/ @!P0 SHF.L R46, RZ, 0x3, R48; /* 0xb7c0c00001a3fcb9 */
|
||||
/*05d8*/ @!P0 BFE R47, R46, 0x11f; /* 0xc00800008fa0b8bd */
|
||||
/*05e0*/ @!P0 IADD R48.CC, R46, c[0x0][0x170]; /* 0x608400002e20b8c2 */
|
||||
/*05e8*/ @!P0 IADD.X R49, R47, c[0x0][0x174]; /* 0x608040002ea0bcc6 */
|
||||
/*05f0*/ @!P0 SHF.L R43, RZ, 0x3, R43; /* 0xb7c0ac0001a3fcad */
|
||||
/*05f8*/ @!P0 LD.E.64 R46, [R48]; /* 0xc58000000020c0b8 */
|
||||
/* 0x0880909c80a080d8 */
|
||||
/*0608*/ @!P0 IADD R52.CC, R43, c[0x0][0x170]; /* 0x608400002e20acd2 */
|
||||
/*0610*/ @!P0 DADD R46, R12, R46; /* 0xe3800000172030ba */
|
||||
/*0618*/ @!P0 BFE R12, R43, 0x11f; /* 0xc00800008fa0ac31 */
|
||||
/*0620*/ @!P0 IADD.X R53, R12, c[0x0][0x174]; /* 0x608040002ea030d6 */
|
||||
/*0628*/ @!P0 IADD R12.CC, R38, c[0x0][0x178]; /* 0x608400002f209832 */
|
||||
/*0630*/ @!P0 LD.E.64 R48, [R52]; /* 0xc58000000020d0c0 */
|
||||
/*0638*/ @!P0 IADD.X R13, R45, c[0x0][0x17c]; /* 0x608040002fa0b436 */
|
||||
/* 0x08cc8c10a48090b0 */
|
||||
/*0648*/ @!P0 IADD R50.CC, R38, c[0x0][0x168]; /* 0x608400002d2098ca */
|
||||
/*0650*/ @!P0 IADD.X R51, R45, c[0x0][0x16c]; /* 0x608040002da0b4ce */
|
||||
/*0658*/ @!P0 DADD R46, R46, R48; /* 0xe38000001820b8ba */
|
||||
/*0660*/ @!P0 DFMA R48, R4, R14, R16; /* 0xdb804000072010c2 */
|
||||
/*0668*/ @!P0 LD.E.64 R16, [R12]; /* 0xc580000000203040 */
|
||||
/*0670*/ @!P0 DFMA R48, R2, R46, R48; /* 0xdb80c000172008c2 */
|
||||
/*0678*/ @!P0 LD.E.64 R14, [R50]; /* 0xc58000000020c838 */
|
||||
/* 0x08a0b8b0a000a4a4 */
|
||||
/*0688*/ @!P0 DADD R10, R10, -R16; /* 0xe38100000820282a */
|
||||
/*0690*/ @!P0 DFMA R10, R48, R14, R10; /* 0xdb8028000720c02a */
|
||||
/*0698*/ @!P0 ST.E.64 [R12], R10; /* 0xe580000000203028 */
|
||||
/*06a0*/ IADD R34, R34, 0x20; /* 0xc0800000101c8889 */
|
||||
/*06a8*/ ISETP.LT.AND P0, PT, R34, c[0x0][0x144], PT; /* 0x5b181c00289c881e */
|
||||
/*06b0*/ @P0 BRA 0x190; /* 0x12007ffd6c00003c */
|
||||
/*06b8*/ IADD R21, R21, 0x1; /* 0xc0800000009c5455 */
|
||||
/* 0x08b810b8b010b8b0 */
|
||||
/*06c8*/ ISETP.EQ.AND P0, PT, R21, c[0x0][0x14c], PT; /* 0x5b281c00299c541e */
|
||||
/*06d0*/ @!P0 BRA 0x110; /* 0x12007ffd1c20003c */
|
||||
/*06d8*/ ISETP.NE.AND P0, PT, R44, R41, PT; /* 0xdb581c00149cb01e */
|
||||
/*06e0*/ IADD R44, R44, 0x1; /* 0xc0800000009cb0b1 */
|
||||
/*06e8*/ @P0 BRA 0xd0; /* 0x12007ffcf000003c */
|
||||
/*06f0*/ MOV RZ, RZ; /* 0xe4c03c007f9c03fe */
|
||||
/*06f8*/ EXIT ; /* 0x18000000001c003c */
|
||||
/*0700*/ BRA 0x700; /* 0x12007ffffc1c003c */
|
||||
/*0708*/ NOP; /* 0x85800000001c3c02 */
|
||||
/*0710*/ NOP; /* 0x85800000001c3c02 */
|
||||
/*0718*/ NOP; /* 0x85800000001c3c02 */
|
||||
/*0720*/ NOP; /* 0x85800000001c3c02 */
|
||||
/*0728*/ NOP; /* 0x85800000001c3c02 */
|
||||
/*0730*/ NOP; /* 0x85800000001c3c02 */
|
||||
/*0738*/ NOP; /* 0x85800000001c3c02 */
|
||||
..................................
|
||||
|
||||
|
||||
8
examples_cuda/stencil/Makefile
Normal file
8
examples_cuda/stencil/Makefile
Normal file
@@ -0,0 +1,8 @@
|
||||
|
||||
# Per-example settings for the stencil example. This file only sets variables
# and then includes ../common.mk; how each variable is consumed is presumably
# defined there -- confirm against ../common.mk.
# NOTE: keep comments on their own lines; a trailing comment after a value
# would leave trailing whitespace in that value.
# Name of this example.
EXAMPLE=stencil
|
||||
# C++ source files of the example.
CPP_SRC=stencil.cpp stencil_serial.cpp
|
||||
# ispc source file of the example.
ISPC_SRC=stencil.ispc
|
||||
# ispc target selection for IA builds.
ISPC_IA_TARGETS=avx
|
||||
# ispc target selection for ARM builds.
ISPC_ARM_TARGETS=neon
|
||||
|
||||
# Pull in the shared rules from the parent directory.
include ../common.mk
|
||||
BIN
examples_cuda/stencil/a.out
Executable file
BIN
examples_cuda/stencil/a.out
Executable file
Binary file not shown.
370
examples_cuda/stencil/drvapi_error_string.h
Normal file
370
examples_cuda/stencil/drvapi_error_string.h
Normal file
@@ -0,0 +1,370 @@
|
||||
/*
|
||||
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||
* with this source code for terms and conditions that govern your use of
|
||||
* this software. Any use, reproduction, disclosure, or distribution of
|
||||
* this software and related documentation outside the terms of the EULA
|
||||
* is strictly prohibited.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef _DRVAPI_ERROR_STRING_H_
|
||||
#define _DRVAPI_ERROR_STRING_H_
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
// Error Code string definitions here
|
||||
/* One row of the error-name lookup table: a printable name paired with its
 * numeric code. Field order matters: the table below uses positional
 * initialisers of the form { "NAME", code }. */
typedef struct
|
||||
{
|
||||
char const *error_string; /* printable name; NULL in the table's terminating entry */
|
||||
int error_id; /* numeric error code; -1 in the table's terminating entry */
|
||||
} s_CudaErrorStr;
|
||||
|
||||
/**
|
||||
* Error codes
|
||||
*/
|
||||
static s_CudaErrorStr sCudaDrvErrorString[] =
|
||||
{
|
||||
/**
|
||||
* The API call returned with no errors. In the case of query calls, this
|
||||
* can also mean that the operation being queried is complete (see
|
||||
* ::cuEventQuery() and ::cuStreamQuery()).
|
||||
*/
|
||||
{ "CUDA_SUCCESS", 0 },
|
||||
|
||||
/**
|
||||
* This indicates that one or more of the parameters passed to the API call
|
||||
* is not within an acceptable range of values.
|
||||
*/
|
||||
{ "CUDA_ERROR_INVALID_VALUE", 1 },
|
||||
|
||||
/**
|
||||
* The API call failed because it was unable to allocate enough memory to
|
||||
* perform the requested operation.
|
||||
*/
|
||||
{ "CUDA_ERROR_OUT_OF_MEMORY", 2 },
|
||||
|
||||
/**
|
||||
* This indicates that the CUDA driver has not been initialized with
|
||||
* ::cuInit() or that initialization has failed.
|
||||
*/
|
||||
{ "CUDA_ERROR_NOT_INITIALIZED", 3 },
|
||||
|
||||
/**
|
||||
* This indicates that the CUDA driver is in the process of shutting down.
|
||||
*/
|
||||
{ "CUDA_ERROR_DEINITIALIZED", 4 },
|
||||
|
||||
/**
|
||||
* This indicates profiling APIs are called while application is running
|
||||
* in visual profiler mode.
|
||||
*/
|
||||
{ "CUDA_ERROR_PROFILER_DISABLED", 5 },
|
||||
/**
|
||||
* This indicates profiling has not been initialized for this context.
|
||||
* Call cuProfilerInitialize() to resolve this.
|
||||
*/
|
||||
{ "CUDA_ERROR_PROFILER_NOT_INITIALIZED", 6 },
|
||||
/**
|
||||
* This indicates profiler has already been started and probably
|
||||
* cuProfilerStart() is incorrectly called.
|
||||
*/
|
||||
{ "CUDA_ERROR_PROFILER_ALREADY_STARTED", 7 },
|
||||
/**
|
||||
* This indicates profiler has already been stopped and probably
|
||||
* cuProfilerStop() is incorrectly called.
|
||||
*/
|
||||
{ "CUDA_ERROR_PROFILER_ALREADY_STOPPED", 8 },
|
||||
/**
|
||||
* This indicates that no CUDA-capable devices were detected by the installed
|
||||
* CUDA driver.
|
||||
*/
|
||||
{ "CUDA_ERROR_NO_DEVICE (no CUDA-capable devices were detected)", 100 },
|
||||
|
||||
/**
|
||||
* This indicates that the device ordinal supplied by the user does not
|
||||
* correspond to a valid CUDA device.
|
||||
*/
|
||||
{ "CUDA_ERROR_INVALID_DEVICE (device specified is not a valid CUDA device)", 101 },
|
||||
|
||||
|
||||
/**
|
||||
* This indicates that the device kernel image is invalid. This can also
|
||||
* indicate an invalid CUDA module.
|
||||
*/
|
||||
{ "CUDA_ERROR_INVALID_IMAGE", 200 },
|
||||
|
||||
/**
|
||||
* This most frequently indicates that there is no context bound to the
|
||||
* current thread. This can also be returned if the context passed to an
|
||||
* API call is not a valid handle (such as a context that has had
|
||||
* ::cuCtxDestroy() invoked on it). This can also be returned if a user
|
||||
* mixes different API versions (i.e. 3010 context with 3020 API calls).
|
||||
* See ::cuCtxGetApiVersion() for more details.
|
||||
*/
|
||||
{ "CUDA_ERROR_INVALID_CONTEXT", 201 },
|
||||
|
||||
/**
|
||||
* This indicates that the context being supplied as a parameter to the
|
||||
* API call was already the active context.
|
||||
* \deprecated
|
||||
* This error return is deprecated as of CUDA 3.2. It is no longer an
|
||||
* error to attempt to push the active context via ::cuCtxPushCurrent().
|
||||
*/
|
||||
{ "CUDA_ERROR_CONTEXT_ALREADY_CURRENT", 202 },
|
||||
|
||||
/**
|
||||
* This indicates that a map or register operation has failed.
|
||||
*/
|
||||
{ "CUDA_ERROR_MAP_FAILED", 205 },
|
||||
|
||||
/**
|
||||
* This indicates that an unmap or unregister operation has failed.
|
||||
*/
|
||||
{ "CUDA_ERROR_UNMAP_FAILED", 206 },
|
||||
|
||||
/**
|
||||
* This indicates that the specified array is currently mapped and thus
|
||||
* cannot be destroyed.
|
||||
*/
|
||||
{ "CUDA_ERROR_ARRAY_IS_MAPPED", 207 },
|
||||
|
||||
/**
|
||||
* This indicates that the resource is already mapped.
|
||||
*/
|
||||
{ "CUDA_ERROR_ALREADY_MAPPED", 208 },
|
||||
|
||||
/**
|
||||
* This indicates that there is no kernel image available that is suitable
|
||||
* for the device. This can occur when a user specifies code generation
|
||||
* options for a particular CUDA source file that do not include the
|
||||
* corresponding device configuration.
|
||||
*/
|
||||
{ "CUDA_ERROR_NO_BINARY_FOR_GPU", 209 },
|
||||
|
||||
/**
|
||||
* This indicates that a resource has already been acquired.
|
||||
*/
|
||||
{ "CUDA_ERROR_ALREADY_ACQUIRED", 210 },
|
||||
|
||||
/**
|
||||
* This indicates that a resource is not mapped.
|
||||
*/
|
||||
{ "CUDA_ERROR_NOT_MAPPED", 211 },
|
||||
|
||||
/**
|
||||
* This indicates that a mapped resource is not available for access as an
|
||||
* array.
|
||||
*/
|
||||
{ "CUDA_ERROR_NOT_MAPPED_AS_ARRAY", 212 },
|
||||
|
||||
/**
|
||||
* This indicates that a mapped resource is not available for access as a
|
||||
* pointer.
|
||||
*/
|
||||
{ "CUDA_ERROR_NOT_MAPPED_AS_POINTER", 213 },
|
||||
|
||||
/**
|
||||
* This indicates that an uncorrectable ECC error was detected during
|
||||
* execution.
|
||||
*/
|
||||
{ "CUDA_ERROR_ECC_UNCORRECTABLE", 214 },
|
||||
|
||||
/**
|
||||
* This indicates that the ::CUlimit passed to the API call is not
|
||||
* supported by the active device.
|
||||
*/
|
||||
{ "CUDA_ERROR_UNSUPPORTED_LIMIT", 215 },
|
||||
|
||||
/**
|
||||
* This indicates that the ::CUcontext passed to the API call can
|
||||
* only be bound to a single CPU thread at a time but is already
|
||||
* bound to a CPU thread.
|
||||
*/
|
||||
{ "CUDA_ERROR_CONTEXT_ALREADY_IN_USE", 216 },
|
||||
|
||||
/**
|
||||
* This indicates that peer access is not supported across the given
|
||||
* devices.
|
||||
*/
|
||||
{ "CUDA_ERROR_PEER_ACCESS_UNSUPPORTED", 217},
|
||||
|
||||
/**
|
||||
* This indicates that the device kernel source is invalid.
|
||||
*/
|
||||
{ "CUDA_ERROR_INVALID_SOURCE", 300 },
|
||||
|
||||
/**
|
||||
* This indicates that the file specified was not found.
|
||||
*/
|
||||
{ "CUDA_ERROR_FILE_NOT_FOUND", 301 },
|
||||
|
||||
/**
|
||||
* This indicates that a link to a shared object failed to resolve.
|
||||
*/
|
||||
{ "CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND", 302 },
|
||||
|
||||
/**
|
||||
* This indicates that initialization of a shared object failed.
|
||||
*/
|
||||
{ "CUDA_ERROR_SHARED_OBJECT_INIT_FAILED", 303 },
|
||||
|
||||
/**
|
||||
* This indicates that an OS call failed.
|
||||
*/
|
||||
{ "CUDA_ERROR_OPERATING_SYSTEM", 304 },
|
||||
|
||||
|
||||
/**
|
||||
* This indicates that a resource handle passed to the API call was not
|
||||
* valid. Resource handles are opaque types like ::CUstream and ::CUevent.
|
||||
*/
|
||||
{ "CUDA_ERROR_INVALID_HANDLE", 400 },
|
||||
|
||||
|
||||
/**
|
||||
* This indicates that a named symbol was not found. Examples of symbols
|
||||
* are global/constant variable names, texture names, and surface names.
|
||||
*/
|
||||
{ "CUDA_ERROR_NOT_FOUND", 500 },
|
||||
|
||||
|
||||
/**
|
||||
* This indicates that asynchronous operations issued previously have not
|
||||
* completed yet. This result is not actually an error, but must be indicated
|
||||
* differently than ::CUDA_SUCCESS (which indicates completion). Calls that
|
||||
* may return this value include ::cuEventQuery() and ::cuStreamQuery().
|
||||
*/
|
||||
{ "CUDA_ERROR_NOT_READY", 600 },
|
||||
|
||||
|
||||
/**
|
||||
* An exception occurred on the device while executing a kernel. Common
|
||||
* causes include dereferencing an invalid device pointer and accessing
|
||||
* out of bounds shared memory. The context cannot be used, so it must
|
||||
* be destroyed (and a new one should be created). All existing device
|
||||
* memory allocations from this context are invalid and must be
|
||||
* reconstructed if the program is to continue using CUDA.
|
||||
*/
|
||||
{ "CUDA_ERROR_LAUNCH_FAILED", 700 },
|
||||
|
||||
/**
|
||||
* This indicates that a launch did not occur because it did not have
|
||||
* appropriate resources. This error usually indicates that the user has
|
||||
* attempted to pass too many arguments to the device kernel, or the
|
||||
* kernel launch specifies too many threads for the kernel's register
|
||||
* count. Passing arguments of the wrong size (i.e. a 64-bit pointer
|
||||
* when a 32-bit int is expected) is equivalent to passing too many
|
||||
* arguments and can also result in this error.
|
||||
*/
|
||||
{ "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES", 701 },
|
||||
|
||||
/**
|
||||
* This indicates that the device kernel took too long to execute. This can
|
||||
* only occur if timeouts are enabled - see the device attribute
|
||||
* ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT for more information. The
|
||||
* context cannot be used (and must be destroyed similar to
|
||||
* ::CUDA_ERROR_LAUNCH_FAILED). All existing device memory allocations from
|
||||
* this context are invalid and must be reconstructed if the program is to
|
||||
* continue using CUDA.
|
||||
*/
|
||||
{ "CUDA_ERROR_LAUNCH_TIMEOUT", 702 },
|
||||
|
||||
/**
|
||||
* This error indicates a kernel launch that uses an incompatible texturing
|
||||
* mode.
|
||||
*/
|
||||
{ "CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING", 703 },
|
||||
|
||||
/**
|
||||
* This error indicates that a call to ::cuCtxEnablePeerAccess() is
|
||||
* trying to re-enable peer access to a context which has already
|
||||
* had peer access to it enabled.
|
||||
*/
|
||||
{ "CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED", 704 },
|
||||
|
||||
/**
|
||||
* This error indicates that ::cuCtxDisablePeerAccess() is
|
||||
* trying to disable peer access which has not been enabled yet
|
||||
* via ::cuCtxEnablePeerAccess().
|
||||
*/
|
||||
{ "CUDA_ERROR_PEER_ACCESS_NOT_ENABLED", 705 },
|
||||
|
||||
/**
|
||||
* This error indicates that the primary context for the specified device
|
||||
* has already been initialized.
|
||||
*/
|
||||
{ "CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE", 708 },
|
||||
|
||||
/**
|
||||
* This error indicates that the context current to the calling thread
|
||||
* has been destroyed using ::cuCtxDestroy, or is a primary context which
|
||||
* has not yet been initialized.
|
||||
*/
|
||||
{ "CUDA_ERROR_CONTEXT_IS_DESTROYED", 709 },
|
||||
|
||||
/**
|
||||
* A device-side assert triggered during kernel execution. The context
|
||||
* cannot be used anymore, and must be destroyed. All existing device
|
||||
* memory allocations from this context are invalid and must be
|
||||
* reconstructed if the program is to continue using CUDA.
|
||||
*/
|
||||
{ "CUDA_ERROR_ASSERT", 710 },
|
||||
|
||||
/**
|
||||
* This error indicates that the hardware resources required to enable
|
||||
* peer access have been exhausted for one or more of the devices
|
||||
* passed to ::cuCtxEnablePeerAccess().
|
||||
*/
|
||||
{ "CUDA_ERROR_TOO_MANY_PEERS", 711 },
|
||||
|
||||
/**
|
||||
* This error indicates that the memory range passed to ::cuMemHostRegister()
|
||||
* has already been registered.
|
||||
*/
|
||||
{ "CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED", 712 },
|
||||
|
||||
/**
|
||||
* This error indicates that the pointer passed to ::cuMemHostUnregister()
|
||||
* does not correspond to any currently registered memory region.
|
||||
*/
|
||||
{ "CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED", 713 },
|
||||
|
||||
/**
|
||||
* This error indicates that the attempted operation is not permitted.
|
||||
*/
|
||||
{ "CUDA_ERROR_NOT_PERMITTED", 800 },
|
||||
|
||||
/**
|
||||
* This error indicates that the attempted operation is not supported
|
||||
* on the current system or device.
|
||||
*/
|
||||
{ "CUDA_ERROR_NOT_SUPPORTED", 801 },
|
||||
|
||||
/**
|
||||
* This indicates that an unknown internal error has occurred.
|
||||
*/
|
||||
{ "CUDA_ERROR_UNKNOWN", 999 },
|
||||
{ NULL, -1 }
|
||||
};
|
||||
|
||||
// This is just a linear search through the array, since the error_id's are not
|
||||
// always occurring consecutively
|
||||
/**
 * Translate a CUDA driver API result code into its printable name.
 *
 * Performs a linear scan of sCudaDrvErrorString, stopping at the first entry
 * whose error_id matches or at the { NULL, -1 } terminator. The terminator is
 * tested before the match so that a code of -1 yields the fallback text
 * rather than the terminator's NULL string.
 *
 * Declared static inline because the definition lives in a header: with
 * external linkage, including this header from more than one translation
 * unit would produce duplicate-symbol link errors.
 *
 * @param error_id  Result code to look up.
 * @return The error_string of the matching table entry, or the literal
 *         "CUDA_ERROR not found!" when no entry matches. The result points
 *         to static storage and must not be freed or modified.
 */
static inline const char *getCudaDrvErrorString(CUresult error_id)
{
    /* Compare as int: the table stores plain int codes, not CUresult. */
    const int wanted = (int)error_id;
    int index;

    for (index = 0; sCudaDrvErrorString[index].error_id != -1; index++)
    {
        if (sCudaDrvErrorString[index].error_id == wanted)
        {
            return sCudaDrvErrorString[index].error_string;
        }
    }

    return "CUDA_ERROR not found!";
}
|
||||
|
||||
#endif
|
||||
0
examples_cuda/stencil/err
Normal file
0
examples_cuda/stencil/err
Normal file
5
examples_cuda/stencil/info
Normal file
5
examples_cuda/stencil/info
Normal file
@@ -0,0 +1,5 @@
|
||||
I have been working with the sort example, attempting to use ISPC_USE_OMP for tasking and adding an example, sort_paralle.cpp, which uses __gnu_parallel::sort so that the comparison is apples to apples. However, clang has no support for OpenMP.
|
||||
|
||||
The reason to use ISPC_USE_OMP is to control thread affinity on multi-socket systems. The pthread-based tasking system makes it messy to control thread affinity, and without that control the performance of bandwidth-bound workloads may suffer.
|
||||
|
||||
I used the sort example to begin with.
|
||||
151
examples_cuda/stencil/stencil.cpp
Normal file
151
examples_cuda/stencil/stencil.cpp
Normal file
@@ -0,0 +1,151 @@
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define _CRT_SECURE_NO_WARNINGS
|
||||
#define NOMINMAX
|
||||
#pragma warning (disable: 4244)
|
||||
#pragma warning (disable: 4305)
|
||||
#endif
|
||||
|
||||
#include <stdio.h>
|
||||
#include <algorithm>
|
||||
#include <math.h>
|
||||
#include "../timing.h"
|
||||
#include "stencil_ispc.h"
|
||||
using namespace ispc;
|
||||
|
||||
|
||||
// Declaration of the serial stencil loop; no definition appears in this file
// (presumably it lives in stencil_serial.cpp -- confirm).
// NOTE(review): coef is declared with bound 5 while main() builds a
// 4-element coeff array. The bound of an array parameter is not part of the
// function's type, so this is cosmetic, but confirm how many coefficients the
// definition actually reads.
extern void loop_stencil_serial(
    int t0, int t1,
    int x0, int x1,
    int y0, int y1,
    int z0, int z1,
    int Nx, int Ny, int Nz,
    const double coef[5],
    const double vsq[],
    double Aeven[],
    double Aodd[]);
|
||||
|
||||
|
||||
// Fill the input buffers for an Nx * Ny * Nz grid stored with x varying
// fastest, then y, then z.
//
//   A[0]  x / Nx for points in the lower half of the x range,
//         y / Ny for the remaining points
//   A[1]  zero everywhere
//   vsq   x*y*z / (Nx*Ny*Nz)
//
// A[0], A[1] and vsq must each have room for Nx * Ny * Nz doubles.
// The products are formed in double precision and the running offset is a
// size_t, so grids whose element count exceeds INT_MAX no longer overflow.
// For grids that fit in an int the stored values are unchanged.
void InitData(int Nx, int Ny, int Nz, double *A[2], double *vsq) {
    const double volume = (double)Nx * Ny * Nz;
    size_t offset = 0;

    for (int z = 0; z < Nz; ++z) {
        for (int y = 0; y < Ny; ++y) {
            for (int x = 0; x < Nx; ++x, ++offset) {
                A[0][offset] = (x < Nx / 2) ? x / (double)Nx : y / (double)Ny;
                A[1][offset] = 0;
                vsq[offset] = ((double)x * y * z) / volume;
            }
        }
    }
}
|
||||
|
||||
|
||||
int main() {
|
||||
int Nx = 256, Ny = 256, Nz = 256;
|
||||
int width = 4;
|
||||
double *Aserial[2], *Aispc[2];
|
||||
Aserial[0] = new double [Nx * Ny * Nz];
|
||||
Aserial[1] = new double [Nx * Ny * Nz];
|
||||
Aispc[0] = new double [Nx * Ny * Nz];
|
||||
Aispc[1] = new double [Nx * Ny * Nz];
|
||||
double *vsq = new double [Nx * Ny * Nz];
|
||||
|
||||
double coeff[4] = { 0.5, -.25, .125, -.0625 };
|
||||
|
||||
InitData(Nx, Ny, Nz, Aispc, vsq);
|
||||
|
||||
//
|
||||
// Compute the image using the ispc implementation on one core; report
|
||||
// the minimum time of three runs.
|
||||
//
|
||||
double minTimeISPC = 1e30;
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
reset_and_start_timer();
|
||||
loop_stencil_ispc(0, 6, width, Nx - width, width, Ny - width,
|
||||
width, Nz - width, Nx, Ny, Nz, coeff, vsq,
|
||||
Aispc[0], Aispc[1]);
|
||||
double dt = get_elapsed_mcycles();
|
||||
minTimeISPC = std::min(minTimeISPC, dt);
|
||||
}
|
||||
|
||||
printf("[stencil ispc 1 core]:\t\t[%.3f] million cycles\n", minTimeISPC);
|
||||
|
||||
InitData(Nx, Ny, Nz, Aispc, vsq);
|
||||
|
||||
//
|
||||
// Compute the image using the ispc implementation with tasks; report
|
||||
// the minimum time of three runs.
|
||||
//
|
||||
double minTimeISPCTasks = 1e30;
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
reset_and_start_timer();
|
||||
loop_stencil_ispc_tasks(0, 6, width, Nx - width, width, Ny - width,
|
||||
width, Nz - width, Nx, Ny, Nz, coeff, vsq,
|
||||
Aispc[0], Aispc[1]);
|
||||
double dt = get_elapsed_mcycles();
|
||||
minTimeISPCTasks = std::min(minTimeISPCTasks, dt);
|
||||
}
|
||||
|
||||
printf("[stencil ispc + tasks]:\t\t[%.3f] million cycles\n", minTimeISPCTasks);
|
||||
|
||||
InitData(Nx, Ny, Nz, Aserial, vsq);
|
||||
|
||||
//
|
||||
// And run the serial implementation 3 times, again reporting the
|
||||
// minimum time.
|
||||
//
|
||||
double minTimeSerial = 1e30;
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
reset_and_start_timer();
|
||||
loop_stencil_serial(0, 6, width, Nx-width, width, Ny - width,
|
||||
width, Nz - width, Nx, Ny, Nz, coeff, vsq,
|
||||
Aserial[0], Aserial[1]);
|
||||
double dt = get_elapsed_mcycles();
|
||||
minTimeSerial = std::min(minTimeSerial, dt);
|
||||
}
|
||||
|
||||
printf("[stencil serial]:\t\t[%.3f] million cycles\n", minTimeSerial);
|
||||
|
||||
printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n",
|
||||
minTimeSerial / minTimeISPC, minTimeSerial / minTimeISPCTasks);
|
||||
|
||||
// Check for agreement
|
||||
int offset = 0;
|
||||
for (int z = 0; z < Nz; ++z)
|
||||
for (int y = 0; y < Ny; ++y)
|
||||
for (int x = 0; x < Nx; ++x, ++offset) {
|
||||
double error = fabsf((Aserial[1][offset] - Aispc[1][offset]) /
|
||||
Aserial[1][offset]);
|
||||
if (error > 1e-4)
|
||||
printf("Error @ (%d,%d,%d): ispc = %f, serial = %f\n",
|
||||
x, y, z, Aispc[1][offset], Aserial[1][offset]);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
62
examples_cuda/stencil/stencil.cu
Normal file
62
examples_cuda/stencil/stencil.cu
Normal file
@@ -0,0 +1,62 @@
|
||||
#define programCount 32
|
||||
#define programIndex threadIdx.x
|
||||
#define taskIndex blockIdx.x
|
||||
|
||||
/*
 * Apply one time step of the stencil to the sub-volume
 * [x0,x1) x [y0,y1) x [z0,z1) of an Nx * Ny * Nz grid stored with x
 * varying fastest, then y, then z.
 *
 *   coef  four stencil weights: centre, then the neighbours at distance
 *         1, 2 and 3 along each axis
 *   vsq   per-cell factor that multiplies the stencil sum
 *   Ain   grid that is read
 *   Aout  grid that is updated in place:
 *         Aout = 2 * Ain - Aout + vsq * stencil(Ain)
 *
 * The x range is walked in chunks of programCount cells and each thread
 * handles the cell at offset programIndex inside the chunk.  Neighbours up
 * to three cells away are read along every axis, so the sub-volume must lie
 * at least three cells inside the grid.  Nz is not used by the computation.
 *
 * Threads whose x falls at or beyond x1 skip the whole body, loads included,
 * so no thread reads memory for a cell it is not going to update.
 */
__device__ static void
stencil_step( int x0, int x1,
              int y0, int y1,
              int z0, int z1,
              int Nx, int Ny, int Nz,
              const double coef[4], const double vsq[],
              const double Ain[], double Aout[]) {
    const int Nxy = Nx * Ny;

    /* Read the weights once instead of on every cell. */
    const double coef0 = coef[0];
    const double coef1 = coef[1];
    const double coef2 = coef[2];
    const double coef3 = coef[3];

/* Accessors relative to the current cell; they expand to uses of `index`. */
#define A_cur(x, y, z) Ain[index + (x) + ((y) * Nx) + ((z) * Nxy)]
#define A_next(x, y, z) Aout[index + (x) + ((y) * Nx) + ((z) * Nxy)]

    for (int z = z0; z < z1; z++)
        for (int y = y0; y < y1; y++)
            for (int xb = x0; xb < x1; xb += programCount)
            {
                const int x = xb + programIndex;

                /* The last chunk of a row may extend past x1. */
                if (x < x1)
                {
                    const int index = (z * Nxy) + (y * Nx) + x;

                    const double div =
                        coef0 * A_cur(0, 0, 0) +
                        coef1 * (A_cur(+1, 0, 0) + A_cur(-1, 0, 0) +
                                 A_cur(0, +1, 0) + A_cur(0, -1, 0) +
                                 A_cur(0, 0, +1) + A_cur(0, 0, -1)) +
                        coef2 * (A_cur(+2, 0, 0) + A_cur(-2, 0, 0) +
                                 A_cur(0, +2, 0) + A_cur(0, -2, 0) +
                                 A_cur(0, 0, +2) + A_cur(0, 0, -2)) +
                        coef3 * (A_cur(+3, 0, 0) + A_cur(-3, 0, 0) +
                                 A_cur(0, +3, 0) + A_cur(0, -3, 0) +
                                 A_cur(0, 0, +3) + A_cur(0, 0, -3));

                    A_next(0, 0, 0) = 2.0 * A_cur(0, 0, 0) - A_next(0, 0, 0) +
                        vsq[index] * div;
                }
            }

#undef A_cur
#undef A_next
}
|
||||
|
||||
|
||||
/*
 * Kernel entry point: each block processes the single z slab selected by
 * its task index, i.e. the range [z0 + taskIndex, z0 + taskIndex + 1),
 * and forwards all other arguments to stencil_step unchanged.
 */
extern "C"
__global__ void
stencil_step_task( int x0, int x1,
                   int y0, int y1,
                   int z0,
                   int Nx, int Ny, int Nz,
                   const double coef[4], const double vsq[],
                   const double Ain[], double Aout[]) {
    const int slabBegin = z0 + taskIndex;
    const int slabEnd = slabBegin + 1;

    stencil_step(x0, x1, y0, y1, slabBegin, slabEnd,
                 Nx, Ny, Nz, coef, vsq, Ain, Aout);
}
|
||||
|
||||
BIN
examples_cuda/stencil/stencil.cubin
Normal file
BIN
examples_cuda/stencil/stencil.cubin
Normal file
Binary file not shown.
136
examples_cuda/stencil/stencil.ispc
Normal file
136
examples_cuda/stencil/stencil.ispc
Normal file
@@ -0,0 +1,136 @@
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifdef __NVPTX__
|
||||
#warning "emitting DEVICE code"
|
||||
#define taskIndex blockIndex0()
|
||||
#define programIndex laneIndex()
|
||||
#define programCount warpSize()
|
||||
#else
|
||||
#warning "emitting HOST code"
|
||||
#endif
|
||||
|
||||
// One stencil time step over the sub-volume [x0,x1) x [y0,y1) x [z0,z1)
// of an Nx * Ny * Nz grid (x varies fastest, then y, then z).
//
// For every cell the weighted sum of the centre value and its neighbours
// at distance 1, 2 and 3 along each axis is formed from Ain, and Aout is
// updated in place as  2 * Ain - Aout + vsq * sum.
//
// The x range is covered in chunks of programCount cells; each program
// instance works on the cell at offset programIndex within the chunk.
// Nz is not used by the computation.
//
// NOTE(review): the neighbour loads are issued before the `xi < x1` test,
// so instances past the end of the row still read Ain; only the store is
// masked.
static void
stencil_step(uniform int x0, uniform int x1,
             uniform int y0, uniform int y1,
             uniform int z0, uniform int z1,
             uniform int Nx, uniform int Ny, uniform int Nz,
             uniform const double coef[4], uniform const double vsq[],
             uniform const double Ain[], uniform double Aout[]) {
    const uniform int planeStride = Nx * Ny;

// Accessors relative to the current cell; they expand to uses of `cell`.
#define A_cur(dx, dy, dz) Ain[cell + (dx) + ((dy) * Nx) + ((dz) * planeStride)]
#define A_next(dx, dy, dz) Aout[cell + (dx) + ((dy) * Nx) + ((dz) * planeStride)]

    for (uniform int zi = z0; zi < z1; zi++) {
        for (uniform int yi = y0; yi < y1; yi++) {
            for (uniform int xBase = x0; xBase < x1; xBase += programCount) {
                const int xi = xBase + programIndex;
                int cell = (zi * planeStride) + (yi * Nx) + xi;

                double div = coef[0] * A_cur(0, 0, 0) +
                             coef[1] * (A_cur(+1, 0, 0) + A_cur(-1, 0, 0) +
                                        A_cur(0, +1, 0) + A_cur(0, -1, 0) +
                                        A_cur(0, 0, +1) + A_cur(0, 0, -1)) +
                             coef[2] * (A_cur(+2, 0, 0) + A_cur(-2, 0, 0) +
                                        A_cur(0, +2, 0) + A_cur(0, -2, 0) +
                                        A_cur(0, 0, +2) + A_cur(0, 0, -2)) +
                             coef[3] * (A_cur(+3, 0, 0) + A_cur(-3, 0, 0) +
                                        A_cur(0, +3, 0) + A_cur(0, -3, 0) +
                                        A_cur(0, 0, +3) + A_cur(0, 0, -3));

                // The last chunk of a row may run past x1; only in-range
                // instances write their result.
                if (xi < x1) {
                    A_next(0, 0, 0) = 2.0d0 * A_cur(0, 0, 0) - A_next(0, 0, 0) +
                        vsq[cell] * div;
                }
            }
        }
    }
}
|
||||
|
||||
|
||||
// Task entry point: each task instance updates the single z slab chosen
// by its task index, [z0 + taskIndex, z0 + taskIndex + 1), and passes all
// other arguments through to stencil_step.
static task void
stencil_step_task(uniform int x0, uniform int x1,
                  uniform int y0, uniform int y1,
                  uniform int z0,
                  uniform int Nx, uniform int Ny, uniform int Nz,
                  uniform const double coef[4], uniform const double vsq[],
                  uniform const double Ain[], uniform double Aout[]) {
    const uniform int slabBegin = z0 + taskIndex;
    const uniform int slabEnd = slabBegin + 1;

    stencil_step(x0, x1, y0, y1, slabBegin, slabEnd,
                 Nx, Ny, Nz, coef, vsq, Ain, Aout);
}
|
||||
|
||||
|
||||
// Exported time loop, task-parallel variant.  Runs time steps [t0, t1);
// every step launches one task per z slab of the sub-volume and waits for
// all of them before moving on.  The two grids swap roles each step: on
// even t Aeven is read and Aodd is updated, on odd t the other way round.
export void
loop_stencil_ispc_tasks(uniform int t0, uniform int t1,
                        uniform int x0, uniform int x1,
                        uniform int y0, uniform int y1,
                        uniform int z0, uniform int z1,
                        uniform int Nx, uniform int Ny, uniform int Nz,
                        uniform const double coef[4],
                        uniform const double vsq[],
                        uniform double Aeven[], uniform double Aodd[])
{
    for (uniform int step = t0; step < t1; ++step) {
        // One task per unit-thick slice along z.
        if ((step & 1) != 0) {
            launch[z1-z0] stencil_step_task(x0, x1, y0, y1, z0, Nx, Ny, Nz,
                                            coef, vsq, Aodd, Aeven);
        }
        else {
            launch[z1-z0] stencil_step_task(x0, x1, y0, y1, z0, Nx, Ny, Nz,
                                            coef, vsq, Aeven, Aodd);
        }

        // The next step reads what this one wrote, so every task launched
        // above has to complete first.
        sync;
    }
}
|
||||
|
||||
|
||||
// Exported time loop, single-task variant.  Runs time steps [t0, t1) by
// calling stencil_step over the whole sub-volume.  The two grids swap
// roles each step: on even t Aeven is read and Aodd is updated, on odd t
// the other way round.
export void
loop_stencil_ispc(uniform int t0, uniform int t1,
                  uniform int x0, uniform int x1,
                  uniform int y0, uniform int y1,
                  uniform int z0, uniform int z1,
                  uniform int Nx, uniform int Ny, uniform int Nz,
                  uniform const double coef[4],
                  uniform const double vsq[],
                  uniform double Aeven[], uniform double Aodd[])
{
    for (uniform int step = t0; step < t1; ++step) {
        if ((step & 1) != 0) {
            stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq,
                         Aodd, Aeven);
        }
        else {
            stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq,
                         Aeven, Aodd);
        }
    }
}
|
||||
267
examples_cuda/stencil/stencil.ptx
Normal file
267
examples_cuda/stencil/stencil.ptx
Normal file
@@ -0,0 +1,267 @@
|
||||
//
|
||||
// Generated by NVIDIA NVVM Compiler
|
||||
// Compiler built on Thu Jul 18 02:37:37 2013 (1374107857)
|
||||
// Cuda compilation tools, release 5.5, V5.5.0
|
||||
//
|
||||
|
||||
.version 3.2
|
||||
.target sm_35
|
||||
.address_size 64
|
||||
|
||||
.file 1 "/home/evghenii/soft/ispc-code/ispc/examples/stencil/stencil.cu", 1383254912, 2112
|
||||
.file 2 "/usr/local/cuda-5.5/bin/..//include/cuda_device_runtime_api.h", 1375338991, 7655
|
||||
|
||||
.weak .func (.param .b32 func_retval0) cudaMalloc(
|
||||
.param .b64 cudaMalloc_param_0,
|
||||
.param .b64 cudaMalloc_param_1
|
||||
)
|
||||
{
|
||||
.reg .s32 %r<2>;
|
||||
|
||||
|
||||
mov.u32 %r1, 30;
|
||||
st.param.b32 [func_retval0+0], %r1;
|
||||
.loc 2 66 3
|
||||
ret;
|
||||
}
|
||||
|
||||
.weak .func (.param .b32 func_retval0) cudaFuncGetAttributes(
|
||||
.param .b64 cudaFuncGetAttributes_param_0,
|
||||
.param .b64 cudaFuncGetAttributes_param_1
|
||||
)
|
||||
{
|
||||
.reg .s32 %r<2>;
|
||||
|
||||
|
||||
mov.u32 %r1, 30;
|
||||
st.param.b32 [func_retval0+0], %r1;
|
||||
.loc 2 71 3
|
||||
ret;
|
||||
}
|
||||
|
||||
.visible .entry stencil_step_task(
|
||||
.param .u32 stencil_step_task_param_0,
|
||||
.param .u32 stencil_step_task_param_1,
|
||||
.param .u32 stencil_step_task_param_2,
|
||||
.param .u32 stencil_step_task_param_3,
|
||||
.param .u32 stencil_step_task_param_4,
|
||||
.param .u32 stencil_step_task_param_5,
|
||||
.param .u32 stencil_step_task_param_6,
|
||||
.param .u32 stencil_step_task_param_7,
|
||||
.param .u64 stencil_step_task_param_8,
|
||||
.param .u64 stencil_step_task_param_9,
|
||||
.param .u64 stencil_step_task_param_10,
|
||||
.param .u64 stencil_step_task_param_11
|
||||
)
|
||||
{
|
||||
.reg .pred %p<8>;
|
||||
.reg .s32 %r<54>;
|
||||
.reg .s64 %rd<36>;
|
||||
.reg .f64 %fd<48>;
|
||||
|
||||
|
||||
ld.param.u32 %r19, [stencil_step_task_param_0];
|
||||
ld.param.u32 %r20, [stencil_step_task_param_1];
|
||||
ld.param.u32 %r21, [stencil_step_task_param_2];
|
||||
ld.param.u32 %r22, [stencil_step_task_param_3];
|
||||
ld.param.u32 %r23, [stencil_step_task_param_4];
|
||||
ld.param.u32 %r24, [stencil_step_task_param_5];
|
||||
ld.param.u32 %r25, [stencil_step_task_param_6];
|
||||
ld.param.u64 %rd4, [stencil_step_task_param_8];
|
||||
ld.param.u64 %rd1, [stencil_step_task_param_9];
|
||||
ld.param.u64 %rd2, [stencil_step_task_param_10];
|
||||
ld.param.u64 %rd3, [stencil_step_task_param_11];
|
||||
cvta.to.global.u64 %rd5, %rd4;
|
||||
.loc 1 59 1
|
||||
mov.u32 %r26, %ctaid.x;
|
||||
add.s32 %r51, %r26, %r23;
|
||||
add.s32 %r27, %r51, 1;
|
||||
.loc 1 18 1
|
||||
ld.global.f64 %fd1, [%rd5];
|
||||
.loc 1 19 1
|
||||
ld.global.f64 %fd2, [%rd5+8];
|
||||
.loc 1 20 1
|
||||
ld.global.f64 %fd3, [%rd5+16];
|
||||
.loc 1 21 1
|
||||
ld.global.f64 %fd4, [%rd5+24];
|
||||
.loc 1 22 1
|
||||
setp.ge.s32 %p1, %r51, %r27;
|
||||
@%p1 bra BB2_11;
|
||||
|
||||
mul.lo.s32 %r28, %r25, %r24;
|
||||
shl.b32 %r29, %r28, 1;
|
||||
neg.s32 %r30, %r29;
|
||||
shl.b32 %r2, %r30, 3;
|
||||
cvta.to.global.u64 %rd6, %rd2;
|
||||
cvta.to.global.u64 %rd31, %rd3;
|
||||
cvta.to.global.u64 %rd32, %rd1;
|
||||
|
||||
BB2_2:
|
||||
.loc 1 23 1
|
||||
setp.ge.s32 %p2, %r21, %r22;
|
||||
@%p2 bra BB2_10;
|
||||
|
||||
mov.u32 %r52, %r21;
|
||||
|
||||
BB2_4:
|
||||
.loc 1 24 1
|
||||
mov.u32 %r4, %r52;
|
||||
setp.ge.s32 %p3, %r19, %r20;
|
||||
@%p3 bra BB2_9;
|
||||
|
||||
.loc 1 29 1
|
||||
mul.lo.s32 %r32, %r51, %r28;
|
||||
mad.lo.s32 %r5, %r4, %r24, %r32;
|
||||
.loc 1 32 1
|
||||
add.s32 %r6, %r24, %r5;
|
||||
add.s32 %r7, %r5, %r28;
|
||||
shl.b32 %r33, %r24, 1;
|
||||
add.s32 %r8, %r5, %r33;
|
||||
mad.lo.s32 %r9, %r24, -2, %r5;
|
||||
add.s32 %r10, %r5, %r29;
|
||||
mad.lo.s32 %r11, %r28, -2, %r5;
|
||||
add.s32 %r12, %r24, %r8;
|
||||
mad.lo.s32 %r13, %r28, 3, %r5;
|
||||
mov.u32 %r53, %r19;
|
||||
|
||||
BB2_6:
|
||||
.loc 1 26 1
|
||||
mov.u32 %r14, %r53;
|
||||
mov.u32 %r35, %tid.x;
|
||||
add.s32 %r36, %r35, %r14;
|
||||
.loc 1 29 1
|
||||
add.s32 %r15, %r36, %r5;
|
||||
mul.wide.s32 %rd7, %r15, 8;
|
||||
add.s64 %rd8, %rd6, %rd7;
|
||||
.loc 1 32 1
|
||||
ld.global.f64 %fd5, [%rd8];
|
||||
ld.global.f64 %fd7, [%rd8+-8];
|
||||
ld.global.f64 %fd8, [%rd8+8];
|
||||
add.f64 %fd9, %fd8, %fd7;
|
||||
add.s32 %r37, %r6, %r36;
|
||||
mul.wide.s32 %rd9, %r37, 8;
|
||||
add.s64 %rd10, %rd6, %rd9;
|
||||
.loc 1 32 1
|
||||
ld.global.f64 %fd10, [%rd10];
|
||||
add.f64 %fd11, %fd9, %fd10;
|
||||
.loc 1 22 1
|
||||
neg.s32 %r39, %r33;
|
||||
shl.b32 %r40, %r39, 3;
|
||||
cvt.s64.s32 %rd11, %r40;
|
||||
add.s64 %rd12, %rd10, %rd11;
|
||||
.loc 1 32 1
|
||||
ld.global.f64 %fd12, [%rd12];
|
||||
add.f64 %fd13, %fd11, %fd12;
|
||||
add.s32 %r41, %r7, %r36;
|
||||
mul.wide.s32 %rd13, %r41, 8;
|
||||
add.s64 %rd14, %rd6, %rd13;
|
||||
.loc 1 32 1
|
||||
ld.global.f64 %fd14, [%rd14];
|
||||
add.f64 %fd15, %fd13, %fd14;
|
||||
cvt.s64.s32 %rd15, %r2;
|
||||
add.s64 %rd16, %rd14, %rd15;
|
||||
.loc 1 32 1
|
||||
ld.global.f64 %fd16, [%rd16];
|
||||
add.f64 %fd17, %fd15, %fd16;
|
||||
mul.f64 %fd18, %fd2, %fd17;
|
||||
fma.rn.f64 %fd19, %fd1, %fd5, %fd18;
|
||||
ld.global.f64 %fd20, [%rd8+-16];
|
||||
ld.global.f64 %fd21, [%rd8+16];
|
||||
add.f64 %fd22, %fd21, %fd20;
|
||||
add.s32 %r42, %r8, %r36;
|
||||
mul.wide.s32 %rd17, %r42, 8;
|
||||
add.s64 %rd18, %rd6, %rd17;
|
||||
.loc 1 32 1
|
||||
ld.global.f64 %fd23, [%rd18];
|
||||
add.f64 %fd24, %fd22, %fd23;
|
||||
add.s32 %r43, %r9, %r36;
|
||||
mul.wide.s32 %rd19, %r43, 8;
|
||||
add.s64 %rd20, %rd6, %rd19;
|
||||
.loc 1 32 1
|
||||
ld.global.f64 %fd25, [%rd20];
|
||||
add.f64 %fd26, %fd24, %fd25;
|
||||
add.s32 %r44, %r10, %r36;
|
||||
mul.wide.s32 %rd21, %r44, 8;
|
||||
add.s64 %rd22, %rd6, %rd21;
|
||||
.loc 1 32 1
|
||||
ld.global.f64 %fd27, [%rd22];
|
||||
add.f64 %fd28, %fd26, %fd27;
|
||||
add.s32 %r45, %r11, %r36;
|
||||
mul.wide.s32 %rd23, %r45, 8;
|
||||
add.s64 %rd24, %rd6, %rd23;
|
||||
.loc 1 32 1
|
||||
ld.global.f64 %fd29, [%rd24];
|
||||
add.f64 %fd30, %fd28, %fd29;
|
||||
fma.rn.f64 %fd31, %fd3, %fd30, %fd19;
|
||||
ld.global.f64 %fd32, [%rd8+-24];
|
||||
ld.global.f64 %fd33, [%rd8+24];
|
||||
add.f64 %fd34, %fd33, %fd32;
|
||||
add.s32 %r46, %r12, %r36;
|
||||
mul.wide.s32 %rd25, %r46, 8;
|
||||
add.s64 %rd26, %rd6, %rd25;
|
||||
.loc 1 32 1
|
||||
ld.global.f64 %fd35, [%rd26];
|
||||
add.f64 %fd36, %fd34, %fd35;
|
||||
add.s64 %rd27, %rd12, %rd11;
|
||||
.loc 1 32 1
|
||||
ld.global.f64 %fd37, [%rd27];
|
||||
add.f64 %fd38, %fd36, %fd37;
|
||||
add.s32 %r47, %r13, %r36;
|
||||
mul.wide.s32 %rd28, %r47, 8;
|
||||
add.s64 %rd29, %rd6, %rd28;
|
||||
.loc 1 32 1
|
||||
ld.global.f64 %fd39, [%rd29];
|
||||
add.f64 %fd40, %fd38, %fd39;
|
||||
add.s64 %rd30, %rd16, %rd15;
|
||||
.loc 1 32 1
|
||||
ld.global.f64 %fd41, [%rd30];
|
||||
add.f64 %fd42, %fd40, %fd41;
|
||||
fma.rn.f64 %fd6, %fd4, %fd42, %fd31;
|
||||
.loc 1 44 1
|
||||
setp.ge.s32 %p4, %r36, %r20;
|
||||
@%p4 bra BB2_8;
|
||||
|
||||
mul.wide.s32 %rd33, %r15, 8;
|
||||
add.s64 %rd34, %rd31, %rd33;
|
||||
.loc 1 45 1
|
||||
ld.global.f64 %fd43, [%rd34];
|
||||
add.f64 %fd44, %fd5, %fd5;
|
||||
sub.f64 %fd45, %fd44, %fd43;
|
||||
add.s64 %rd35, %rd32, %rd33;
|
||||
.loc 1 45 1
|
||||
ld.global.f64 %fd46, [%rd35];
|
||||
fma.rn.f64 %fd47, %fd46, %fd6, %fd45;
|
||||
st.global.f64 [%rd34], %fd47;
|
||||
|
||||
BB2_8:
|
||||
.loc 1 24 19
|
||||
add.s32 %r16, %r14, 32;
|
||||
.loc 1 24 1
|
||||
setp.lt.s32 %p5, %r16, %r20;
|
||||
mov.u32 %r53, %r16;
|
||||
@%p5 bra BB2_6;
|
||||
|
||||
BB2_9:
|
||||
.loc 1 23 18
|
||||
add.s32 %r17, %r4, 1;
|
||||
.loc 1 23 1
|
||||
setp.lt.s32 %p6, %r17, %r22;
|
||||
mov.u32 %r52, %r17;
|
||||
@%p6 bra BB2_4;
|
||||
|
||||
BB2_10:
|
||||
.loc 1 22 18
|
||||
add.s32 %r51, %r51, 1;
|
||||
.loc 1 59 1
|
||||
add.s32 %r49, %r23, %r26;
|
||||
add.s32 %r50, %r49, 1;
|
||||
.loc 1 22 1
|
||||
setp.lt.s32 %p7, %r51, %r50;
|
||||
@%p7 bra BB2_2;
|
||||
|
||||
BB2_11:
|
||||
.loc 1 61 2
|
||||
ret;
|
||||
}
|
||||
|
||||
|
||||
180
examples_cuda/stencil/stencil.vcxproj
Normal file
180
examples_cuda/stencil/stencil.vcxproj
Normal file
@@ -0,0 +1,180 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<ItemGroup Label="ProjectConfigurations">
|
||||
<ProjectConfiguration Include="Debug|Win32">
|
||||
<Configuration>Debug</Configuration>
|
||||
<Platform>Win32</Platform>
|
||||
</ProjectConfiguration>
|
||||
<ProjectConfiguration Include="Debug|x64">
|
||||
<Configuration>Debug</Configuration>
|
||||
<Platform>x64</Platform>
|
||||
</ProjectConfiguration>
|
||||
<ProjectConfiguration Include="Release|Win32">
|
||||
<Configuration>Release</Configuration>
|
||||
<Platform>Win32</Platform>
|
||||
</ProjectConfiguration>
|
||||
<ProjectConfiguration Include="Release|x64">
|
||||
<Configuration>Release</Configuration>
|
||||
<Platform>x64</Platform>
|
||||
</ProjectConfiguration>
|
||||
</ItemGroup>
|
||||
<PropertyGroup Label="Globals">
|
||||
<ProjectGuid>{2ef070a1-f62f-4e6a-944b-88d140945c3c}</ProjectGuid>
|
||||
<Keyword>Win32Proj</Keyword>
|
||||
<RootNamespace>rt</RootNamespace>
|
||||
</PropertyGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>true</UseDebugLibraries>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>true</UseDebugLibraries>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>false</UseDebugLibraries>
|
||||
<WholeProgramOptimization>true</WholeProgramOptimization>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>false</UseDebugLibraries>
|
||||
<WholeProgramOptimization>true</WholeProgramOptimization>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||
<ImportGroup Label="ExtensionSettings">
|
||||
</ImportGroup>
|
||||
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<PropertyGroup Label="UserMacros" />
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<ClCompile>
|
||||
<PrecompiledHeader>
|
||||
</PrecompiledHeader>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
<ClCompile>
|
||||
<PrecompiledHeader>
|
||||
</PrecompiledHeader>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<ClCompile>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<PrecompiledHeader>
|
||||
</PrecompiledHeader>
|
||||
<Optimization>MaxSpeed</Optimization>
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
<EnableCOMDATFolding>true</EnableCOMDATFolding>
|
||||
<OptimizeReferences>true</OptimizeReferences>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
<ClCompile>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<PrecompiledHeader>
|
||||
</PrecompiledHeader>
|
||||
<Optimization>MaxSpeed</Optimization>
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
<EnableCOMDATFolding>true</EnableCOMDATFolding>
|
||||
<OptimizeReferences>true</OptimizeReferences>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemGroup>
|
||||
<CustomBuild Include="stencil.ispc">
|
||||
<FileType>Document</FileType>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
</CustomBuild>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ClCompile Include="stencil.cpp" />
|
||||
<ClCompile Include="stencil_serial.cpp" />
|
||||
<ClCompile Include="../tasksys.cpp" />
|
||||
</ItemGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||
<ImportGroup Label="ExtensionTargets">
|
||||
</ImportGroup>
|
||||
</Project>
|
||||
224
examples_cuda/stencil/stencil0.ptx
Normal file
224
examples_cuda/stencil/stencil0.ptx
Normal file
@@ -0,0 +1,224 @@
|
||||
//
|
||||
// Generated by NVIDIA NVVM Compiler
|
||||
// Compiler built on Thu Jul 18 02:37:37 2013 (1374107857)
|
||||
// Cuda compilation tools, release 5.5, V5.5.0
|
||||
//
|
||||
|
||||
.version 3.2
|
||||
.target sm_35
|
||||
.address_size 64
|
||||
|
||||
.file 1 "/home/evghenii/soft/ispc-code/ispc/examples/stencil/stencil.cu", 1383254912, 2112
|
||||
|
||||
)
|
||||
{
|
||||
.reg .s32 %r<2>;
|
||||
|
||||
|
||||
mov.u32 %r1, 30;
|
||||
st.param.b32 [func_retval0+0], %r1;
|
||||
ret;
|
||||
}
|
||||
|
||||
.weak .func (.param .b32 func_retval0) cudaFuncGetAttributes(
|
||||
.param .b64 cudaFuncGetAttributes_param_0,
|
||||
.param .b64 cudaFuncGetAttributes_param_1
|
||||
)
|
||||
{
|
||||
.reg .s32 %r<2>;
|
||||
|
||||
|
||||
mov.u32 %r1, 30;
|
||||
st.param.b32 [func_retval0+0], %r1;
|
||||
ret;
|
||||
}
|
||||
|
||||
.visible .entry stencil_step_task(
|
||||
.param .u32 stencil_step_task_param_0,
|
||||
.param .u32 stencil_step_task_param_1,
|
||||
.param .u32 stencil_step_task_param_2,
|
||||
.param .u32 stencil_step_task_param_3,
|
||||
.param .u32 stencil_step_task_param_4,
|
||||
.param .u32 stencil_step_task_param_5,
|
||||
.param .u32 stencil_step_task_param_6,
|
||||
.param .u32 stencil_step_task_param_7,
|
||||
.param .u64 stencil_step_task_param_8,
|
||||
.param .u64 stencil_step_task_param_9,
|
||||
.param .u64 stencil_step_task_param_10,
|
||||
.param .u64 stencil_step_task_param_11
|
||||
)
|
||||
{
|
||||
.reg .pred %p<8>;
|
||||
.reg .s32 %r<54>;
|
||||
.reg .s64 %rd<36>;
|
||||
.reg .f64 %fd<48>;
|
||||
|
||||
|
||||
ld.param.u32 %r19, [stencil_step_task_param_0];
|
||||
ld.param.u32 %r20, [stencil_step_task_param_1];
|
||||
ld.param.u32 %r21, [stencil_step_task_param_2];
|
||||
ld.param.u32 %r22, [stencil_step_task_param_3];
|
||||
ld.param.u32 %r23, [stencil_step_task_param_4];
|
||||
ld.param.u32 %r24, [stencil_step_task_param_5];
|
||||
ld.param.u32 %r25, [stencil_step_task_param_6];
|
||||
ld.param.u64 %rd4, [stencil_step_task_param_8];
|
||||
ld.param.u64 %rd1, [stencil_step_task_param_9];
|
||||
ld.param.u64 %rd2, [stencil_step_task_param_10];
|
||||
ld.param.u64 %rd3, [stencil_step_task_param_11];
|
||||
cvta.to.global.u64 %rd5, %rd4;
|
||||
mov.u32 %r26, %ctaid.x;
|
||||
add.s32 %r51, %r26, %r23;
|
||||
add.s32 %r27, %r51, 1;
|
||||
ld.global.f64 %fd1, [%rd5];
|
||||
ld.global.f64 %fd2, [%rd5+8];
|
||||
ld.global.f64 %fd3, [%rd5+16];
|
||||
ld.global.f64 %fd4, [%rd5+24];
|
||||
setp.ge.s32 %p1, %r51, %r27;
|
||||
@%p1 bra BB2_11;
|
||||
|
||||
mul.lo.s32 %r28, %r25, %r24;
|
||||
shl.b32 %r29, %r28, 1;
|
||||
neg.s32 %r30, %r29;
|
||||
shl.b32 %r2, %r30, 3;
|
||||
cvta.to.global.u64 %rd6, %rd2;
|
||||
cvta.to.global.u64 %rd31, %rd3;
|
||||
cvta.to.global.u64 %rd32, %rd1;
|
||||
|
||||
BB2_2:
|
||||
setp.ge.s32 %p2, %r21, %r22;
|
||||
@%p2 bra BB2_10;
|
||||
|
||||
mov.u32 %r52, %r21;
|
||||
|
||||
BB2_4:
|
||||
mov.u32 %r4, %r52;
|
||||
setp.ge.s32 %p3, %r19, %r20;
|
||||
@%p3 bra BB2_9;
|
||||
|
||||
mul.lo.s32 %r32, %r51, %r28;
|
||||
mad.lo.s32 %r5, %r4, %r24, %r32;
|
||||
add.s32 %r6, %r24, %r5;
|
||||
add.s32 %r7, %r5, %r28;
|
||||
shl.b32 %r33, %r24, 1;
|
||||
add.s32 %r8, %r5, %r33;
|
||||
mad.lo.s32 %r9, %r24, -2, %r5;
|
||||
add.s32 %r10, %r5, %r29;
|
||||
mad.lo.s32 %r11, %r28, -2, %r5;
|
||||
add.s32 %r12, %r24, %r8;
|
||||
mad.lo.s32 %r13, %r28, 3, %r5;
|
||||
mov.u32 %r53, %r19;
|
||||
|
||||
BB2_6:
|
||||
mov.u32 %r14, %r53;
|
||||
mov.u32 %r35, %tid.x;
|
||||
add.s32 %r36, %r35, %r14;
|
||||
add.s32 %r15, %r36, %r5;
|
||||
mul.wide.s32 %rd7, %r15, 8;
|
||||
add.s64 %rd8, %rd6, %rd7;
|
||||
ld.global.f64 %fd5, [%rd8];
|
||||
ld.global.f64 %fd7, [%rd8+-8];
|
||||
ld.global.f64 %fd8, [%rd8+8];
|
||||
add.f64 %fd9, %fd8, %fd7;
|
||||
add.s32 %r37, %r6, %r36;
|
||||
mul.wide.s32 %rd9, %r37, 8;
|
||||
add.s64 %rd10, %rd6, %rd9;
|
||||
ld.global.f64 %fd10, [%rd10];
|
||||
add.f64 %fd11, %fd9, %fd10;
|
||||
neg.s32 %r39, %r33;
|
||||
shl.b32 %r40, %r39, 3;
|
||||
cvt.s64.s32 %rd11, %r40;
|
||||
add.s64 %rd12, %rd10, %rd11;
|
||||
ld.global.f64 %fd12, [%rd12];
|
||||
add.f64 %fd13, %fd11, %fd12;
|
||||
add.s32 %r41, %r7, %r36;
|
||||
mul.wide.s32 %rd13, %r41, 8;
|
||||
add.s64 %rd14, %rd6, %rd13;
|
||||
ld.global.f64 %fd14, [%rd14];
|
||||
add.f64 %fd15, %fd13, %fd14;
|
||||
cvt.s64.s32 %rd15, %r2;
|
||||
add.s64 %rd16, %rd14, %rd15;
|
||||
ld.global.f64 %fd16, [%rd16];
|
||||
add.f64 %fd17, %fd15, %fd16;
|
||||
mul.f64 %fd18, %fd2, %fd17;
|
||||
fma.rn.f64 %fd19, %fd1, %fd5, %fd18;
|
||||
ld.global.f64 %fd20, [%rd8+-16];
|
||||
ld.global.f64 %fd21, [%rd8+16];
|
||||
add.f64 %fd22, %fd21, %fd20;
|
||||
add.s32 %r42, %r8, %r36;
|
||||
mul.wide.s32 %rd17, %r42, 8;
|
||||
add.s64 %rd18, %rd6, %rd17;
|
||||
ld.global.f64 %fd23, [%rd18];
|
||||
add.f64 %fd24, %fd22, %fd23;
|
||||
add.s32 %r43, %r9, %r36;
|
||||
mul.wide.s32 %rd19, %r43, 8;
|
||||
add.s64 %rd20, %rd6, %rd19;
|
||||
ld.global.f64 %fd25, [%rd20];
|
||||
add.f64 %fd26, %fd24, %fd25;
|
||||
add.s32 %r44, %r10, %r36;
|
||||
mul.wide.s32 %rd21, %r44, 8;
|
||||
add.s64 %rd22, %rd6, %rd21;
|
||||
ld.global.f64 %fd27, [%rd22];
|
||||
add.f64 %fd28, %fd26, %fd27;
|
||||
add.s32 %r45, %r11, %r36;
|
||||
mul.wide.s32 %rd23, %r45, 8;
|
||||
add.s64 %rd24, %rd6, %rd23;
|
||||
ld.global.f64 %fd29, [%rd24];
|
||||
add.f64 %fd30, %fd28, %fd29;
|
||||
fma.rn.f64 %fd31, %fd3, %fd30, %fd19;
|
||||
ld.global.f64 %fd32, [%rd8+-24];
|
||||
ld.global.f64 %fd33, [%rd8+24];
|
||||
add.f64 %fd34, %fd33, %fd32;
|
||||
add.s32 %r46, %r12, %r36;
|
||||
mul.wide.s32 %rd25, %r46, 8;
|
||||
add.s64 %rd26, %rd6, %rd25;
|
||||
ld.global.f64 %fd35, [%rd26];
|
||||
add.f64 %fd36, %fd34, %fd35;
|
||||
add.s64 %rd27, %rd12, %rd11;
|
||||
ld.global.f64 %fd37, [%rd27];
|
||||
add.f64 %fd38, %fd36, %fd37;
|
||||
add.s32 %r47, %r13, %r36;
|
||||
mul.wide.s32 %rd28, %r47, 8;
|
||||
add.s64 %rd29, %rd6, %rd28;
|
||||
ld.global.f64 %fd39, [%rd29];
|
||||
add.f64 %fd40, %fd38, %fd39;
|
||||
add.s64 %rd30, %rd16, %rd15;
|
||||
ld.global.f64 %fd41, [%rd30];
|
||||
add.f64 %fd42, %fd40, %fd41;
|
||||
fma.rn.f64 %fd6, %fd4, %fd42, %fd31;
|
||||
setp.ge.s32 %p4, %r36, %r20;
|
||||
@%p4 bra BB2_8;
|
||||
|
||||
mul.wide.s32 %rd33, %r15, 8;
|
||||
add.s64 %rd34, %rd31, %rd33;
|
||||
ld.global.f64 %fd43, [%rd34];
|
||||
add.f64 %fd44, %fd5, %fd5;
|
||||
sub.f64 %fd45, %fd44, %fd43;
|
||||
add.s64 %rd35, %rd32, %rd33;
|
||||
ld.global.f64 %fd46, [%rd35];
|
||||
fma.rn.f64 %fd47, %fd46, %fd6, %fd45;
|
||||
st.global.f64 [%rd34], %fd47;
|
||||
|
||||
BB2_8:
|
||||
add.s32 %r16, %r14, 32;
|
||||
setp.lt.s32 %p5, %r16, %r20;
|
||||
mov.u32 %r53, %r16;
|
||||
@%p5 bra BB2_6;
|
||||
|
||||
BB2_9:
|
||||
add.s32 %r17, %r4, 1;
|
||||
setp.lt.s32 %p6, %r17, %r22;
|
||||
mov.u32 %r52, %r17;
|
||||
@%p6 bra BB2_4;
|
||||
|
||||
BB2_10:
|
||||
add.s32 %r51, %r51, 1;
|
||||
add.s32 %r49, %r23, %r26;
|
||||
add.s32 %r50, %r49, 1;
|
||||
setp.lt.s32 %p7, %r51, %r50;
|
||||
@%p7 bra BB2_2;
|
||||
|
||||
BB2_11:
|
||||
ret;
|
||||
}
|
||||
|
||||
|
||||
BIN
examples_cuda/stencil/stencil1.cubin
Normal file
BIN
examples_cuda/stencil/stencil1.cubin
Normal file
Binary file not shown.
BIN
examples_cuda/stencil/stencil2.cubin
Normal file
BIN
examples_cuda/stencil/stencil2.cubin
Normal file
Binary file not shown.
247
examples_cuda/stencil/stencil2.ptx
Normal file
247
examples_cuda/stencil/stencil2.ptx
Normal file
@@ -0,0 +1,247 @@
|
||||
//
|
||||
// Generated by LLVM NVPTX Back-End
|
||||
//
|
||||
|
||||
.version 3.1
|
||||
.target sm_20, texmode_independent
|
||||
.address_size 64
|
||||
|
||||
// .globl stencil_step_task
|
||||
// @stencil_step_task
|
||||
.entry stencil_step_task(
|
||||
.param .u32 stencil_step_task_param_0,
|
||||
.param .u32 stencil_step_task_param_1,
|
||||
.param .u32 stencil_step_task_param_2,
|
||||
.param .u32 stencil_step_task_param_3,
|
||||
.param .u32 stencil_step_task_param_4,
|
||||
.param .u32 stencil_step_task_param_5,
|
||||
.param .u32 stencil_step_task_param_6,
|
||||
.param .u32 stencil_step_task_param_7,
|
||||
.param .u64 .ptr .align 8 stencil_step_task_param_8,
|
||||
.param .u64 .ptr .align 8 stencil_step_task_param_9,
|
||||
.param .u64 .ptr .align 8 stencil_step_task_param_10,
|
||||
.param .u64 .ptr .align 8 stencil_step_task_param_11
|
||||
)
|
||||
{
|
||||
.reg .pred %p<396>;
|
||||
.reg .s16 %rc<396>;
|
||||
.reg .s16 %rs<396>;
|
||||
.reg .s32 %r<396>;
|
||||
.reg .s64 %rl<396>;
|
||||
.reg .f32 %f<396>;
|
||||
.reg .f64 %fl<396>;
|
||||
|
||||
// BB#0: // %allocas
|
||||
mov.u32 %r12, %ctaid.x;
|
||||
ld.param.u32 %r13, [stencil_step_task_param_4];
|
||||
add.s32 %r16, %r12, %r13;
|
||||
add.s32 %r0, %r16, 1;
|
||||
setp.ge.s32 %p0, %r16, %r0;
|
||||
@%p0 bra BB0_11;
|
||||
// BB#1: // %for_test28.i.preheader.lr.ph
|
||||
ld.param.u32 %r0, [stencil_step_task_param_0];
|
||||
ld.param.u32 %r1, [stencil_step_task_param_1];
|
||||
ld.param.u32 %r2, [stencil_step_task_param_2];
|
||||
ld.param.u32 %r3, [stencil_step_task_param_3];
|
||||
ld.param.u32 %r4, [stencil_step_task_param_5];
|
||||
ld.param.u32 %r5, [stencil_step_task_param_6];
|
||||
mul.lo.s32 %r5, %r5, %r4;
|
||||
ld.param.u64 %rl3, [stencil_step_task_param_8];
|
||||
ld.f64 %fl0, [%rl3];
|
||||
ld.f64 %fl1, [%rl3+8];
|
||||
ld.param.u64 %rl0, [stencil_step_task_param_9];
|
||||
ld.f64 %fl2, [%rl3+16];
|
||||
ld.param.u64 %rl1, [stencil_step_task_param_10];
|
||||
ld.param.u64 %rl2, [stencil_step_task_param_11];
|
||||
ld.f64 %fl3, [%rl3+24];
|
||||
shl.b32 %r6, %r4, 1;
|
||||
mul.lo.s32 %r7, %r4, 3;
|
||||
mul.lo.s32 %r8, %r4, -3;
|
||||
shl.b32 %r9, %r5, 1;
|
||||
mul.lo.s32 %r10, %r5, 3;
|
||||
mul.lo.s32 %r11, %r5, -3;
|
||||
add.s32 %r12, %r12, %r13;
|
||||
neg.s32 %r13, %r9;
|
||||
neg.s32 %r14, %r6;
|
||||
mov.u32 %r32, WARP_SZ;
|
||||
BB0_2: // %for_test28.i.preheader
|
||||
// =>This Loop Header: Depth=1
|
||||
// Child Loop BB0_9 Depth 2
|
||||
// Child Loop BB0_5 Depth 3
|
||||
mov.u32 %r15, %r16;
|
||||
setp.ge.s32 %p0, %r2, %r3;
|
||||
@%p0 bra BB0_10;
|
||||
// BB#3: // %for_test35.i.preheader.lr.ph
|
||||
// in Loop: Header=BB0_2 Depth=1
|
||||
setp.lt.s32 %p0, %r0, %r1;
|
||||
@%p0 bra BB0_4;
|
||||
bra.uni BB0_10;
|
||||
BB0_4: // in Loop: Header=BB0_2 Depth=1
|
||||
mul.lo.s32 %r16, %r15, %r5;
|
||||
mov.u32 %r17, %r2;
|
||||
BB0_9: // %for_loop37.i.lr.ph.us
|
||||
// Parent Loop BB0_2 Depth=1
|
||||
// => This Loop Header: Depth=2
|
||||
// Child Loop BB0_5 Depth 3
|
||||
mad.lo.s32 %r18, %r17, %r4, %r16;
|
||||
add.s32 %r19, %r18, %r4;
|
||||
add.s32 %r20, %r18, %r6;
|
||||
sub.s32 %r21, %r18, %r4;
|
||||
add.s32 %r22, %r18, %r7;
|
||||
add.s32 %r23, %r18, %r14;
|
||||
add.s32 %r24, %r18, %r5;
|
||||
add.s32 %r25, %r18, %r8;
|
||||
add.s32 %r26, %r18, %r9;
|
||||
sub.s32 %r27, %r18, %r5;
|
||||
add.s32 %r28, %r18, %r10;
|
||||
add.s32 %r29, %r18, %r13;
|
||||
add.s32 %r30, %r18, %r11;
|
||||
mov.u32 %r31, %r0;
|
||||
BB0_5: // %for_loop37.i.us
|
||||
// Parent Loop BB0_2 Depth=1
|
||||
// Parent Loop BB0_9 Depth=2
|
||||
// => This Inner Loop Header: Depth=3
|
||||
mov.u32 %r33, %tid.x;
|
||||
add.s32 %r34, %r32, -1;
|
||||
and.b32 %r33, %r34, %r33;
|
||||
add.s32 %r33, %r33, %r31;
|
||||
setp.ge.s32 %p0, %r33, %r1;
|
||||
@%p0 bra BB0_7;
|
||||
// BB#6: // %pl_dolane.i.us
|
||||
// in Loop: Header=BB0_5 Depth=3
|
||||
add.s32 %r34, %r18, %r33;
|
||||
shl.b32 %r34, %r34, 3;
|
||||
add.s32 %r35, %r34, -8;
|
||||
cvt.s64.s32 %rl3, %r35;
|
||||
add.s64 %rl3, %rl3, %rl1;
|
||||
ld.f64 %fl4, [%rl3];
|
||||
add.s32 %r35, %r34, 8;
|
||||
cvt.s64.s32 %rl3, %r35;
|
||||
add.s64 %rl3, %rl3, %rl1;
|
||||
ld.f64 %fl5, [%rl3];
|
||||
add.s32 %r35, %r34, -16;
|
||||
cvt.s64.s32 %rl3, %r35;
|
||||
add.s64 %rl3, %rl3, %rl1;
|
||||
ld.f64 %fl6, [%rl3];
|
||||
add.s32 %r35, %r34, 16;
|
||||
cvt.s64.s32 %rl3, %r35;
|
||||
add.s64 %rl3, %rl3, %rl1;
|
||||
ld.f64 %fl9, [%rl3];
|
||||
add.s32 %r35, %r19, %r33;
|
||||
shl.b32 %r35, %r35, 3;
|
||||
cvt.s64.s32 %rl3, %r35;
|
||||
add.s64 %rl3, %rl3, %rl1;
|
||||
ld.f64 %fl8, [%rl3];
|
||||
add.s32 %r35, %r34, -24;
|
||||
cvt.s64.s32 %rl3, %r35;
|
||||
add.s64 %rl3, %rl3, %rl1;
|
||||
ld.f64 %fl7, [%rl3];
|
||||
add.s32 %r35, %r34, 24;
|
||||
cvt.s64.s32 %rl3, %r35;
|
||||
add.s64 %rl3, %rl3, %rl1;
|
||||
ld.f64 %fl10, [%rl3];
|
||||
add.s32 %r35, %r20, %r33;
|
||||
shl.b32 %r35, %r35, 3;
|
||||
cvt.s64.s32 %rl3, %r35;
|
||||
add.s64 %rl3, %rl3, %rl1;
|
||||
ld.f64 %fl13, [%rl3];
|
||||
add.s32 %r35, %r21, %r33;
|
||||
shl.b32 %r35, %r35, 3;
|
||||
cvt.s64.s32 %rl3, %r35;
|
||||
add.s64 %rl3, %rl3, %rl1;
|
||||
ld.f64 %fl12, [%rl3];
|
||||
add.s32 %r35, %r22, %r33;
|
||||
shl.b32 %r35, %r35, 3;
|
||||
cvt.s64.s32 %rl3, %r35;
|
||||
add.s64 %rl3, %rl3, %rl1;
|
||||
ld.f64 %fl11, [%rl3];
|
||||
add.s32 %r35, %r23, %r33;
|
||||
shl.b32 %r35, %r35, 3;
|
||||
cvt.s64.s32 %rl3, %r35;
|
||||
add.s64 %rl3, %rl3, %rl1;
|
||||
ld.f64 %fl16, [%rl3];
|
||||
add.s32 %r35, %r24, %r33;
|
||||
shl.b32 %r35, %r35, 3;
|
||||
cvt.s64.s32 %rl3, %r35;
|
||||
add.s64 %rl3, %rl3, %rl1;
|
||||
ld.f64 %fl15, [%rl3];
|
||||
add.s32 %r35, %r25, %r33;
|
||||
shl.b32 %r35, %r35, 3;
|
||||
cvt.s64.s32 %rl3, %r35;
|
||||
add.s64 %rl3, %rl3, %rl1;
|
||||
ld.f64 %fl14, [%rl3];
|
||||
add.s32 %r35, %r26, %r33;
|
||||
shl.b32 %r35, %r35, 3;
|
||||
cvt.s64.s32 %rl3, %r35;
|
||||
add.s64 %rl3, %rl3, %rl1;
|
||||
ld.f64 %fl19, [%rl3];
|
||||
add.s32 %r35, %r27, %r33;
|
||||
shl.b32 %r35, %r35, 3;
|
||||
cvt.s64.s32 %rl3, %r35;
|
||||
add.s64 %rl3, %rl3, %rl1;
|
||||
ld.f64 %fl18, [%rl3];
|
||||
add.s32 %r35, %r28, %r33;
|
||||
shl.b32 %r35, %r35, 3;
|
||||
cvt.s64.s32 %rl3, %r35;
|
||||
add.s64 %rl3, %rl3, %rl1;
|
||||
ld.f64 %fl17, [%rl3];
|
||||
add.s32 %r35, %r29, %r33;
|
||||
shl.b32 %r35, %r35, 3;
|
||||
cvt.s64.s32 %rl3, %r35;
|
||||
add.s64 %rl3, %rl3, %rl1;
|
||||
ld.f64 %fl24, [%rl3];
|
||||
cvt.s64.s32 %rl4, %r34;
|
||||
add.s64 %rl3, %rl4, %rl1;
|
||||
ld.f64 %fl21, [%rl3];
|
||||
add.s32 %r33, %r30, %r33;
|
||||
shl.b32 %r33, %r33, 3;
|
||||
cvt.s64.s32 %rl3, %r33;
|
||||
add.s64 %rl3, %rl3, %rl1;
|
||||
ld.f64 %fl20, [%rl3];
|
||||
add.s64 %rl3, %rl4, %rl2;
|
||||
ld.f64 %fl23, [%rl3];
|
||||
add.s64 %rl4, %rl4, %rl0;
|
||||
ld.f64 %fl22, [%rl4];
|
||||
add.f64 %fl25, %fl21, %fl21;
|
||||
sub.f64 %fl23, %fl25, %fl23;
|
||||
add.f64 %fl6, %fl6, %fl9;
|
||||
add.f64 %fl6, %fl6, %fl13;
|
||||
add.f64 %fl6, %fl6, %fl16;
|
||||
add.f64 %fl6, %fl6, %fl19;
|
||||
add.f64 %fl6, %fl6, %fl24;
|
||||
add.f64 %fl4, %fl4, %fl5;
|
||||
add.f64 %fl4, %fl4, %fl8;
|
||||
add.f64 %fl4, %fl4, %fl12;
|
||||
add.f64 %fl4, %fl4, %fl15;
|
||||
add.f64 %fl4, %fl4, %fl18;
|
||||
mul.f64 %fl5, %fl0, %fl21;
|
||||
fma.rn.f64 %fl4, %fl1, %fl4, %fl5;
|
||||
fma.rn.f64 %fl4, %fl2, %fl6, %fl4;
|
||||
add.f64 %fl5, %fl7, %fl10;
|
||||
add.f64 %fl5, %fl5, %fl11;
|
||||
add.f64 %fl5, %fl5, %fl14;
|
||||
add.f64 %fl5, %fl5, %fl17;
|
||||
add.f64 %fl5, %fl5, %fl20;
|
||||
fma.rn.f64 %fl4, %fl3, %fl5, %fl4;
|
||||
fma.rn.f64 %fl4, %fl4, %fl22, %fl23;
|
||||
st.f64 [%rl3], %fl4;
|
||||
BB0_7: // %safe_if_after_true.i.us
|
||||
// in Loop: Header=BB0_5 Depth=3
|
||||
add.s32 %r31, %r32, %r31;
|
||||
setp.lt.s32 %p0, %r31, %r1;
|
||||
@%p0 bra BB0_5;
|
||||
// BB#8: // %for_exit38.i.us
|
||||
// in Loop: Header=BB0_9 Depth=2
|
||||
add.s32 %r17, %r17, 1;
|
||||
setp.eq.s32 %p0, %r17, %r3;
|
||||
@%p0 bra BB0_10;
|
||||
bra.uni BB0_9;
|
||||
BB0_10: // %for_exit31.i
|
||||
// in Loop: Header=BB0_2 Depth=1
|
||||
add.s32 %r16, %r15, 1;
|
||||
setp.ne.s32 %p0, %r15, %r12;
|
||||
@%p0 bra BB0_2;
|
||||
BB0_11: // %stencil_step___uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_.exit
|
||||
ret;
|
||||
}
|
||||
|
||||
BIN
examples_cuda/stencil/stencil_avx.bc
Normal file
BIN
examples_cuda/stencil/stencil_avx.bc
Normal file
Binary file not shown.
BIN
examples_cuda/stencil/stencil_cu
Executable file
BIN
examples_cuda/stencil/stencil_cu
Executable file
Binary file not shown.
BIN
examples_cuda/stencil/stencil_cu.bc
Normal file
BIN
examples_cuda/stencil/stencil_cu.bc
Normal file
Binary file not shown.
317
examples_cuda/stencil/stencil_cu.cpp
Normal file
317
examples_cuda/stencil/stencil_cu.cpp
Normal file
@@ -0,0 +1,317 @@
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define _CRT_SECURE_NO_WARNINGS
|
||||
#define NOMINMAX
|
||||
#pragma warning (disable: 4244)
|
||||
#pragma warning (disable: 4305)
|
||||
#endif
|
||||
|
||||
#include <stdio.h>
|
||||
#include <algorithm>
|
||||
#include <math.h>
|
||||
#include "../timing.h"
|
||||
#include "stencil_ispc.h"
|
||||
using namespace ispc;
|
||||
|
||||
#include <cassert>
|
||||
#include <iostream>
|
||||
#include <cuda.h>
|
||||
#include "drvapi_error_string.h"
|
||||
|
||||
#define checkCudaErrors(err) __checkCudaErrors (err, __FILE__, __LINE__)
|
||||
// These are the inline versions for all of the SDK helper functions
|
||||
// Reports a failed CUDA driver API call and terminates the process.
//
//   err  - result code returned by the driver API call
//   file - source file of the call site (supplied by the checkCudaErrors macro)
//   line - source line of the call site (supplied by the checkCudaErrors macro)
//
// Does nothing when err is CUDA_SUCCESS. Otherwise prints the numeric code,
// the string from getCudaDrvErrorString() and the call site to stderr, then
// exits with status -1.
void __checkCudaErrors(CUresult err, const char *file, const int line) {
  if (CUDA_SUCCESS == err)
    return;

  // Message layout: ... error = <code> "<description>" from file <path>, line <n>
  std::cerr << "checkCudaErrors() Driver API error = " << err << " \""
            << getCudaDrvErrorString(err) << "\" from file <" << file
            << ">, line " << line << "\n";
  exit(-1);
}
|
||||
|
||||
/**********************/
|
||||
/* Basic CUDriver API */
|
||||
CUcontext context;
|
||||
|
||||
void createContext(const int deviceId = 0)
|
||||
{
|
||||
CUdevice device;
|
||||
int devCount;
|
||||
checkCudaErrors(cuInit(0));
|
||||
checkCudaErrors(cuDeviceGetCount(&devCount));
|
||||
assert(devCount > 0);
|
||||
checkCudaErrors(cuDeviceGet(&device, deviceId < devCount ? deviceId : 0));
|
||||
|
||||
char name[128];
|
||||
checkCudaErrors(cuDeviceGetName(name, 128, device));
|
||||
std::cout << "Using CUDA Device [0]: " << name << "\n";
|
||||
|
||||
int devMajor, devMinor;
|
||||
checkCudaErrors(cuDeviceComputeCapability(&devMajor, &devMinor, device));
|
||||
std::cout << "Device Compute Capability: "
|
||||
<< devMajor << "." << devMinor << "\n";
|
||||
if (devMajor < 2) {
|
||||
std::cerr << "ERROR: Device 0 is not SM 2.0 or greater\n";
|
||||
exit(1);
|
||||
}
|
||||
|
||||
// Create driver context
|
||||
checkCudaErrors(cuCtxCreate(&context, 0, device));
|
||||
}
|
||||
// Destroys the driver context held in the file-scope `context` variable.
// The result of cuCtxDestroy is passed to checkCudaErrors.
void destroyContext()
|
||||
{
|
||||
checkCudaErrors(cuCtxDestroy(context));
|
||||
}
|
||||
|
||||
// Loads a CUDA module from an in-memory image and returns its handle.
//   module - pointer handed to cuModuleLoadData as the module image
// The result of cuModuleLoadData is passed to checkCudaErrors.
CUmodule loadModule(const char * module)
|
||||
{
|
||||
CUmodule cudaModule;
|
||||
checkCudaErrors(cuModuleLoadData(&cudaModule, module));
|
||||
return cudaModule;
|
||||
}
|
||||
// Unloads a module previously obtained from loadModule().
// The result of cuModuleUnload is passed to checkCudaErrors.
void unloadModule(CUmodule &cudaModule)
|
||||
{
|
||||
checkCudaErrors(cuModuleUnload(cudaModule));
|
||||
}
|
||||
|
||||
// Looks up a function by name in a loaded module and returns its handle.
//   cudaModule - module to search
//   function   - name passed to cuModuleGetFunction
// The result of cuModuleGetFunction is passed to checkCudaErrors.
CUfunction getFunction(CUmodule &cudaModule, const char * function)
|
||||
{
|
||||
CUfunction cudaFunction;
|
||||
checkCudaErrors(cuModuleGetFunction(&cudaFunction, cudaModule, function));
|
||||
return cudaFunction;
|
||||
}
|
||||
|
||||
// Allocates `size` bytes with cuMemAlloc and returns the device pointer.
// The result of cuMemAlloc is passed to checkCudaErrors.
CUdeviceptr deviceMalloc(const size_t size)
|
||||
{
|
||||
CUdeviceptr d_buf;
|
||||
checkCudaErrors(cuMemAlloc(&d_buf, size));
|
||||
return d_buf;
|
||||
}
|
||||
// Releases a device allocation with cuMemFree.
// The result of cuMemFree is passed to checkCudaErrors.
void deviceFree(CUdeviceptr d_buf)
|
||||
{
|
||||
checkCudaErrors(cuMemFree(d_buf));
|
||||
}
|
||||
// Copies `size` bytes from device buffer d_buf into host buffer h_buf using
// cuMemcpyDtoH. The result of the copy is passed to checkCudaErrors.
void memcpyD2H(void * h_buf, CUdeviceptr d_buf, const size_t size)
|
||||
{
|
||||
checkCudaErrors(cuMemcpyDtoH(h_buf, d_buf, size));
|
||||
}
|
||||
// Copies `size` bytes from host buffer h_buf into device buffer d_buf using
// cuMemcpyHtoD. The host buffer is only read, so it is taken as a pointer to
// const; callers passing a non-const pointer keep working unchanged.
// The result of the copy is passed to checkCudaErrors.
void memcpyH2D(CUdeviceptr d_buf, const void * h_buf, const size_t size)
{
  checkCudaErrors(cuMemcpyHtoD(d_buf, h_buf, size));
}
|
||||
// Launches `func` on a grid of (nbx, nby, nbz) blocks and checks the result.
// Fixed launch configuration passed to cuLaunchKernel: block dimensions
// (32, 1, 1), 0 bytes of dynamic shared memory, stream NULL, kernel arguments
// taken from `params`, and no `extra` options.
// The expansion is a single expression without a trailing semicolon, so
// `deviceLaunch(...);` is exactly one statement and is safe under an
// unbraced if/else.
#define deviceLaunch(func,nbx,nby,nbz,params) \
  checkCudaErrors( \
      cuLaunchKernel( \
        (func), \
        (nbx), (nby), (nbz), \
        32, 1, 1, \
        0, NULL, (params), NULL \
        ))
|
||||
|
||||
typedef CUdeviceptr devicePtr;
|
||||
|
||||
|
||||
/**************/
|
||||
|
||||
// C-linkage entry points: allocation, launch, synchronization and release
// hooks backed by the CUDA driver API helpers defined earlier in this file.
extern "C"
|
||||
{
|
||||
|
||||
// Stub: ignores all arguments and always returns NULL.
void *CUDAAlloc(void **handlePtr, int64_t size, int32_t alignment)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
// Loads `module`, looks up `func_name` in it, launches that function on a
// countx x county x countz grid with `func_args` as kernel parameters, and
// then unloads the module again.
// `handlePtr` is unused; `module_name` is only checked for non-NULL.
// All pointer arguments except `handlePtr` are asserted to be non-NULL.
void CUDALaunch(
|
||||
void **handlePtr,
|
||||
const char * module_name,
|
||||
const char * module,
|
||||
const char * func_name,
|
||||
void **func_args,
|
||||
int countx, int county, int countz)
|
||||
{
|
||||
assert(module_name != NULL);
|
||||
assert(module != NULL);
|
||||
assert(func_name != NULL);
|
||||
assert(func_args != NULL);
|
||||
// The module is loaded and the function resolved anew on every launch.
CUmodule cudaModule = loadModule(module);
|
||||
CUfunction cudaFunction = getFunction(cudaModule, func_name);
|
||||
deviceLaunch(cudaFunction, countx, county, countz, func_args);
|
||||
// NOTE(review): the module is unloaded right after the launch call without an
// explicit synchronization in between -- confirm this is safe if the launch
// is asynchronous.
unloadModule(cudaModule);
|
||||
}
|
||||
// Waits on stream 0 via cuStreamSynchronize; `handle` is unused.
void CUDASync(void *handle)
|
||||
{
|
||||
checkCudaErrors(cuStreamSynchronize(0));
|
||||
}
|
||||
// Same body as CUDASync: waits on stream 0; `handle` is unused.
void ISPCSync(void *handle)
|
||||
{
|
||||
checkCudaErrors(cuStreamSynchronize(0));
|
||||
}
|
||||
// Stub: does nothing; `handle` is unused.
void CUDAFree(void *handle)
|
||||
{
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
extern void loop_stencil_serial(int t0, int t1, int x0, int x1,
|
||||
int y0, int y1, int z0, int z1,
|
||||
int Nx, int Ny, int Nz,
|
||||
const double coef[5],
|
||||
const double vsq[],
|
||||
double Aeven[], double Aodd[]);
|
||||
|
||||
|
||||
// Fills the three grids of an Nx x Ny x Nz problem, stored with x varying
// fastest, then y, then z:
//   A[0] - x/Nx in the lower half of the x range, y/Ny in the upper half
//   A[1] - all zeros
//   vsq  - x*y*z divided by the total number of cells
void InitData(int Nx, int Ny, int Nz, double *A[2], double *vsq) {
    int cell = 0;
    for (int k = 0; k < Nz; ++k) {
        for (int j = 0; j < Ny; ++j) {
            for (int i = 0; i < Nx; ++i) {
                double start;
                if (i < Nx / 2) {
                    start = i / double(Nx);
                } else {
                    start = j / double(Ny);
                }
                A[0][cell] = start;
                A[1][cell] = 0;
                vsq[cell] = i * j * k / double(Nx * Ny * Nz);
                ++cell;
            }
        }
    }
}
|
||||
|
||||
|
||||
int main() {
|
||||
int Nx = 256, Ny = 256, Nz = 256;
|
||||
int width = 4;
|
||||
double *Aserial[2], *Aispc[2];
|
||||
Aserial[0] = new double [Nx * Ny * Nz];
|
||||
Aserial[1] = new double [Nx * Ny * Nz];
|
||||
Aispc[0] = new double [Nx * Ny * Nz];
|
||||
Aispc[1] = new double [Nx * Ny * Nz];
|
||||
double *vsq = new double [Nx * Ny * Nz];
|
||||
|
||||
double coeff[4] = { 0.5, -.25, .125, -.0625 };
|
||||
|
||||
/*******************/
|
||||
createContext();
|
||||
/*******************/
|
||||
|
||||
const size_t bufsize = sizeof(double)*Nx*Ny*Nz;
|
||||
devicePtr d_Aispc0 = deviceMalloc(bufsize);
|
||||
devicePtr d_Aispc1 = deviceMalloc(bufsize);
|
||||
devicePtr d_vsq = deviceMalloc(bufsize);
|
||||
devicePtr d_coeff = deviceMalloc(4*sizeof(double));
|
||||
|
||||
|
||||
InitData(Nx, Ny, Nz, Aispc, vsq);
|
||||
|
||||
//
|
||||
// Compute the image using the ispc implementation on one core; report
|
||||
// the minimum time of three runs.
|
||||
//
|
||||
double minTimeISPC = 1e30;
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
reset_and_start_timer();
|
||||
loop_stencil_ispc(0, 6, width, Nx - width, width, Ny - width,
|
||||
width, Nz - width, Nx, Ny, Nz, coeff, vsq,
|
||||
Aispc[0], Aispc[1]);
|
||||
double dt = get_elapsed_mcycles();
|
||||
minTimeISPC = std::min(minTimeISPC, dt);
|
||||
}
|
||||
|
||||
printf("[stencil ispc 1 core]:\t\t[%.3f] million cycles\n", minTimeISPC);
|
||||
|
||||
InitData(Nx, Ny, Nz, Aispc, vsq);
|
||||
|
||||
memcpyH2D(d_Aispc0, Aispc[0], bufsize);
|
||||
memcpyH2D(d_Aispc1, Aispc[1], bufsize);
|
||||
memcpyH2D(d_vsq, vsq, bufsize);
|
||||
memcpyH2D(d_coeff, coeff, 4*sizeof(double));
|
||||
//
|
||||
// Compute the image using the ispc implementation with tasks; report
|
||||
// the minimum time of three runs.
|
||||
//
|
||||
double minTimeISPCTasks = 1e30;
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
reset_and_start_timer();
|
||||
loop_stencil_ispc_tasks(0, 6, width, Nx - width, width, Ny - width,
|
||||
width, Nz - width, Nx, Ny, Nz, (double*)d_coeff, (double*)d_vsq,
|
||||
(double*)d_Aispc0, (double*)d_Aispc1);
|
||||
double dt = get_elapsed_mcycles();
|
||||
minTimeISPCTasks = std::min(minTimeISPCTasks, dt);
|
||||
}
|
||||
memcpyD2H(Aispc[1], d_Aispc1, bufsize);
|
||||
//memcpyD2H(Aispc[1], d_vsq, bufsize);
|
||||
|
||||
printf("[stencil ispc + tasks]:\t\t[%.3f] million cycles\n", minTimeISPCTasks);
|
||||
|
||||
InitData(Nx, Ny, Nz, Aserial, vsq);
|
||||
|
||||
//
|
||||
// And run the serial implementation 3 times, again reporting the
|
||||
// minimum time.
|
||||
//
|
||||
double minTimeSerial = 1e30;
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
reset_and_start_timer();
|
||||
loop_stencil_serial(0, 6, width, Nx-width, width, Ny - width,
|
||||
width, Nz - width, Nx, Ny, Nz, coeff, vsq,
|
||||
Aserial[0], Aserial[1]);
|
||||
double dt = get_elapsed_mcycles();
|
||||
minTimeSerial = std::min(minTimeSerial, dt);
|
||||
}
|
||||
|
||||
printf("[stencil serial]:\t\t[%.3f] million cycles\n", minTimeSerial);
|
||||
|
||||
printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n",
|
||||
minTimeSerial / minTimeISPC, minTimeSerial / minTimeISPCTasks);
|
||||
|
||||
// Check for agreement
|
||||
int offset = 0;
|
||||
int nerr = 0;
|
||||
for (int z = 0; z < Nz; ++z)
|
||||
for (int y = 0; y < Ny; ++y)
|
||||
for (int x = 0; x < Nx; ++x, ++offset) {
|
||||
|
||||
double error = fabsf((Aserial[1][offset] - Aispc[1][offset]) /
|
||||
Aserial[1][offset]);
|
||||
if (error > 1e-3)
|
||||
{
|
||||
if (nerr < 100)
|
||||
printf("Error @ (%d,%d,%d): ispc = %g, serial = %g error= %g\n",
|
||||
x, y, z, Aispc[1][offset], Aserial[1][offset], error);
|
||||
nerr++;
|
||||
}
|
||||
}
|
||||
|
||||
fprintf(stderr, " nerr= %d frac= %g \n", nerr, 1.0*nerr/(1.0*Nx*Ny*Nz));
|
||||
|
||||
/*******************/
|
||||
destroyContext();
|
||||
/*******************/
|
||||
|
||||
return 0;
|
||||
}
|
||||
762
examples_cuda/stencil/stencil_cu.ll
Normal file
762
examples_cuda/stencil/stencil_cu.ll
Normal file
@@ -0,0 +1,762 @@
|
||||
; ModuleID = 'stencil_cu.bc'
|
||||
; Module-wide data layout string: 'e' selects little-endian and 'p:64:64:64'
; declares 64-bit pointers; the remaining fields give per-type alignments.
target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
|
||||
; NOTE(review): the triple below is x86-64 Linux even though this file is
; listed as examples_cuda/stencil/stencil_cu.ll -- confirm that host-side
; (not device) IR is what is meant to be checked in here.
target triple = "x86_64-unknown-linux-gnu"
|
||||
|
||||
; External functions: only declared here, no definition is visible in this
; part of the module.
; NOTE(review): presumably the ISPC task-system runtime entry points
; (allocate task arguments / launch tasks / wait for completion) -- confirm
; against the task-system implementation this module is linked with.
;
; Function Attrs: nounwind
|
||||
; Takes (i8**, i64, i32) and returns an i8*.
declare i8* @ISPCAlloc(i8**, i64, i32) #0
|
||||
|
||||
; Function Attrs: nounwind
|
||||
; Takes (i8**, i8*, i8*, i32, i32, i32) and returns nothing.
declare void @ISPCLaunch(i8**, i8*, i8*, i32, i32, i32) #0
|
||||
|
||||
; Function Attrs: nounwind
|
||||
; Takes a single i8* and returns nothing.
declare void @ISPCSync(i8*) #0
|
||||
|
||||
; x86 AVX intrinsics used by the generated 8-wide code in this module.
;
; Function Attrs: nounwind readnone
|
||||
; Collapses an <8 x float> into an i32 bit mask. stencil_step feeds it the
; execution mask (bitcast to floats) and compares the result with 255 and
; with 0, i.e. tests "all 8 lanes set" and "no lane set".
declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) #1
|
||||
|
||||
; Function Attrs: nounwind readonly
|
||||
; Masked load of 4 doubles: (address, mask) -> <4 x double>. stencil_step
; issues it in pairs, the second address 32 bytes after the first, and
; shuffles the two halves together into one <8 x double>.
declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8*, <4 x double>) #2
|
||||
|
||||
; Function Attrs: nounwind
|
||||
; Masked store counterpart of the load above.
; NOTE(review): operand order is presumably (address, mask, value) -- confirm
; against the LLVM x86 intrinsic definition.
declare void @llvm.x86.avx.maskstore.pd.256(i8*, <4 x double>, <4 x double>) #0
|
||||
|
||||
; Function Attrs: nounwind
|
||||
define internal fastcc void @stencil_step___uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_(i32 %x0, i32 %x1, i32 %y0, i32 %y1, i32 %z0, i32 %z1, i32 %Nx, i32 %Ny, double* noalias nocapture %coef, double* noalias %vsq, double* noalias %Ain, double* noalias %Aout, <8 x i32> %__mask) #3 {
|
||||
allocas:
|
||||
%floatmask.i = bitcast <8 x i32> %__mask to <8 x float>
|
||||
%v.i = tail call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask.i) #1
|
||||
%cmp.i = icmp eq i32 %v.i, 255
|
||||
%mul_Nx_load_Ny_load = mul i32 %Ny, %Nx
|
||||
%coef_load_offset_load = load double* %coef, align 8
|
||||
%coef_load18_offset = getelementptr double* %coef, i64 1
|
||||
%coef_load18_offset_load = load double* %coef_load18_offset, align 8
|
||||
%coef_load21_offset = getelementptr double* %coef, i64 2
|
||||
%coef_load21_offset_load = load double* %coef_load21_offset, align 8
|
||||
%coef_load24_offset = getelementptr double* %coef, i64 3
|
||||
%coef_load24_offset_load = load double* %coef_load24_offset, align 8
|
||||
%less_z_load_z1_load260 = icmp slt i32 %z0, %z1
|
||||
br i1 %cmp.i, label %for_test.preheader, label %for_test264.preheader
|
||||
|
||||
for_test264.preheader: ; preds = %allocas
|
||||
br i1 %less_z_load_z1_load260, label %for_test275.preheader.lr.ph, label %for_exit
|
||||
|
||||
for_test275.preheader.lr.ph: ; preds = %for_test264.preheader
|
||||
%less_y_load282_y1_load283264 = icmp slt i32 %y0, %y1
|
||||
%less_xb_load293_x1_load294262 = icmp slt i32 %x0, %x1
|
||||
%x1_load463_broadcast_init = insertelement <8 x i32> undef, i32 %x1, i32 0
|
||||
%x1_load463_broadcast = shufflevector <8 x i32> %x1_load463_broadcast_init, <8 x i32> undef, <8 x i32> zeroinitializer
|
||||
%mul__Nx_load382 = shl i32 %Nx, 1
|
||||
%mul__Nx_load431 = mul i32 %Nx, 3
|
||||
%mul__Nx_load390 = mul i32 %Nx, -2
|
||||
%mul__Nx_load439 = mul i32 %Nx, -3
|
||||
%mul__Nxy_load399 = shl i32 %mul_Nx_load_Ny_load, 1
|
||||
%mul__Nxy_load448 = mul i32 %mul_Nx_load_Ny_load, 3
|
||||
%mul__Nxy_load407 = mul i32 %mul_Nx_load_Ny_load, -2
|
||||
%mul__Nxy_load456 = mul i32 %mul_Nx_load_Ny_load, -3
|
||||
%Ain_load327_ptr2int_2void = bitcast double* %Ain to i8*
|
||||
%mask0.i.i201 = shufflevector <8 x i32> %__mask, <8 x i32> undef, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
|
||||
%mask1.i.i202 = shufflevector <8 x i32> %__mask, <8 x i32> undef, <8 x i32> <i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
|
||||
%mask0d.i.i203 = bitcast <8 x i32> %mask0.i.i201 to <4 x double>
|
||||
%mask1d.i.i204 = bitcast <8 x i32> %mask1.i.i202 to <4 x double>
|
||||
%coef1_load315_broadcast_init = insertelement <8 x double> undef, double %coef_load18_offset_load, i32 0
|
||||
%coef0_load306_broadcast_init = insertelement <8 x double> undef, double %coef_load_offset_load, i32 0
|
||||
%coef2_load364_broadcast_init = insertelement <8 x double> undef, double %coef_load21_offset_load, i32 0
|
||||
%coef1_load315_broadcast = shufflevector <8 x double> %coef1_load315_broadcast_init, <8 x double> undef, <8 x i32> zeroinitializer
|
||||
%coef0_load306_broadcast = shufflevector <8 x double> %coef0_load306_broadcast_init, <8 x double> undef, <8 x i32> zeroinitializer
|
||||
%coef3_load413_broadcast_init = insertelement <8 x double> undef, double %coef_load24_offset_load, i32 0
|
||||
%coef2_load364_broadcast = shufflevector <8 x double> %coef2_load364_broadcast_init, <8 x double> undef, <8 x i32> zeroinitializer
|
||||
%coef3_load413_broadcast = shufflevector <8 x double> %coef3_load413_broadcast_init, <8 x double> undef, <8 x i32> zeroinitializer
|
||||
%Aout_load488_ptr2int_2void = bitcast double* %Aout to i8*
|
||||
%vsq_load494_ptr2int_2void = bitcast double* %vsq to i8*
|
||||
br label %for_test275.preheader
|
||||
|
||||
for_test.preheader: ; preds = %allocas
|
||||
br i1 %less_z_load_z1_load260, label %for_test30.preheader.lr.ph, label %for_exit
|
||||
|
||||
for_test30.preheader.lr.ph: ; preds = %for_test.preheader
|
||||
%less_y_load_y1_load258 = icmp slt i32 %y0, %y1
|
||||
%less_xb_load_x1_load256 = icmp slt i32 %x0, %x1
|
||||
%x1_load199_broadcast_init = insertelement <8 x i32> undef, i32 %x1, i32 0
|
||||
%x1_load199_broadcast = shufflevector <8 x i32> %x1_load199_broadcast_init, <8 x i32> undef, <8 x i32> zeroinitializer
|
||||
%mul__Nx_load119 = shl i32 %Nx, 1
|
||||
%mul__Nx_load167 = mul i32 %Nx, 3
|
||||
%mul__Nx_load127 = mul i32 %Nx, -2
|
||||
%mul__Nx_load175 = mul i32 %Nx, -3
|
||||
%mul__Nxy_load136 = shl i32 %mul_Nx_load_Ny_load, 1
|
||||
%mul__Nxy_load184 = mul i32 %mul_Nx_load_Ny_load, 3
|
||||
%mul__Nxy_load144 = mul i32 %mul_Nx_load_Ny_load, -2
|
||||
%mul__Nxy_load192 = mul i32 %mul_Nx_load_Ny_load, -3
|
||||
%Ain_load65_ptr2int_2void = bitcast double* %Ain to i8*
|
||||
%coef1_load_broadcast_init = insertelement <8 x double> undef, double %coef_load18_offset_load, i32 0
|
||||
%coef0_load_broadcast_init = insertelement <8 x double> undef, double %coef_load_offset_load, i32 0
|
||||
%coef2_load_broadcast_init = insertelement <8 x double> undef, double %coef_load21_offset_load, i32 0
|
||||
%coef1_load_broadcast = shufflevector <8 x double> %coef1_load_broadcast_init, <8 x double> undef, <8 x i32> zeroinitializer
|
||||
%coef0_load_broadcast = shufflevector <8 x double> %coef0_load_broadcast_init, <8 x double> undef, <8 x i32> zeroinitializer
|
||||
%coef3_load_broadcast_init = insertelement <8 x double> undef, double %coef_load24_offset_load, i32 0
|
||||
%coef2_load_broadcast = shufflevector <8 x double> %coef2_load_broadcast_init, <8 x double> undef, <8 x i32> zeroinitializer
|
||||
%coef3_load_broadcast = shufflevector <8 x double> %coef3_load_broadcast_init, <8 x double> undef, <8 x i32> zeroinitializer
|
||||
%Aout_load219_ptr2int_2void = bitcast double* %Aout to i8*
|
||||
%vsq_load_ptr2int_2void = bitcast double* %vsq to i8*
|
||||
br label %for_test30.preheader
|
||||
|
||||
for_test30.preheader: ; preds = %for_exit33, %for_test30.preheader.lr.ph
|
||||
%z.0261 = phi i32 [ %z0, %for_test30.preheader.lr.ph ], [ %z_load242_plus1, %for_exit33 ]
|
||||
br i1 %less_y_load_y1_load258, label %for_test37.preheader.lr.ph, label %for_exit33
|
||||
|
||||
for_test37.preheader.lr.ph: ; preds = %for_test30.preheader
|
||||
%mul_z_load45_Nxy_load = mul i32 %z.0261, %mul_Nx_load_Ny_load
|
||||
br i1 %less_xb_load_x1_load256, label %for_loop39.lr.ph.us, label %for_exit33
|
||||
|
||||
for_exit40.us: ; preds = %safe_if_after_true.us
|
||||
%y_load241_plus1.us = add i32 %y.0259.us, 1
|
||||
%exitcond = icmp eq i32 %y_load241_plus1.us, %y1
|
||||
br i1 %exitcond, label %for_exit33, label %for_loop39.lr.ph.us
|
||||
|
||||
for_loop39.us: ; preds = %for_loop39.lr.ph.us, %safe_if_after_true.us
|
||||
%xb.0257.us = phi i32 [ %x0, %for_loop39.lr.ph.us ], [ %add_xb_load240_.us, %safe_if_after_true.us ]
|
||||
%xb_load44_broadcast_init.us = insertelement <8 x i32> undef, i32 %xb.0257.us, i32 0
|
||||
%xb_load44_broadcast.us = shufflevector <8 x i32> %xb_load44_broadcast_init.us, <8 x i32> undef, <8 x i32> zeroinitializer
|
||||
%add_xb_load44_broadcast_.us = add <8 x i32> %xb_load44_broadcast.us, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
%less_x_load198_x1_load199_broadcast.us = icmp slt <8 x i32> %add_xb_load44_broadcast_.us, %x1_load199_broadcast
|
||||
%"oldMask&test.us" = select <8 x i1> %less_x_load198_x1_load199_broadcast.us, <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <8 x i32> zeroinitializer
|
||||
%floatmask.i244.us = bitcast <8 x i32> %"oldMask&test.us" to <8 x float>
|
||||
%v.i245.us = tail call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask.i244.us) #1
|
||||
%cmp.i246.us = icmp eq i32 %v.i245.us, 0
|
||||
br i1 %cmp.i246.us, label %safe_if_after_true.us, label %safe_if_run_true.us
|
||||
|
||||
safe_if_run_true.us: ; preds = %for_loop39.us
|
||||
%add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast.elt0.us = add i32 %xb.0257.us, %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.us
|
||||
%scaled_varying.elt0.us = shl i32 %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast.elt0.us, 3
|
||||
%"varying+const_offsets.elt0.us" = add i32 %scaled_varying.elt0.us, -8
|
||||
%0 = sext i32 %"varying+const_offsets.elt0.us" to i64
|
||||
%ptr.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %0, !filename !0, !first_line !1, !first_column !2, !last_line !1, !last_column !3
|
||||
%ptr_cast_for_load.us = bitcast i8* %ptr.us to <8 x double>*
|
||||
%ptr_masked_load521.us = load <8 x double>* %ptr_cast_for_load.us, align 8, !filename !0, !first_line !1, !first_column !2, !last_line !1, !last_column !3
|
||||
%"varying+const_offsets529.elt0.us" = add i32 %scaled_varying.elt0.us, 8
|
||||
%1 = sext i32 %"varying+const_offsets529.elt0.us" to i64
|
||||
%ptr530.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %1, !filename !0, !first_line !1, !first_column !4, !last_line !1, !last_column !5
|
||||
%ptr_cast_for_load531.us = bitcast i8* %ptr530.us to <8 x double>*
|
||||
%ptr530_masked_load532.us = load <8 x double>* %ptr_cast_for_load531.us, align 8, !filename !0, !first_line !1, !first_column !4, !last_line !1, !last_column !5
|
||||
%"varying+const_offsets540.elt0.us" = add i32 %scaled_varying.elt0.us, -16
|
||||
%2 = sext i32 %"varying+const_offsets540.elt0.us" to i64
|
||||
%ptr541.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %2, !filename !0, !first_line !6, !first_column !2, !last_line !6, !last_column !3
|
||||
%ptr_cast_for_load542.us = bitcast i8* %ptr541.us to <8 x double>*
|
||||
%ptr541_masked_load543.us = load <8 x double>* %ptr_cast_for_load542.us, align 8, !filename !0, !first_line !6, !first_column !2, !last_line !6, !last_column !3
|
||||
%"varying+const_offsets551.elt0.us" = add i32 %scaled_varying.elt0.us, 16
|
||||
%3 = sext i32 %"varying+const_offsets551.elt0.us" to i64
|
||||
%ptr552.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %3, !filename !0, !first_line !6, !first_column !4, !last_line !6, !last_column !5
|
||||
%ptr_cast_for_load553.us = bitcast i8* %ptr552.us to <8 x double>*
|
||||
%ptr552_masked_load554.us = load <8 x double>* %ptr_cast_for_load553.us, align 8, !filename !0, !first_line !6, !first_column !4, !last_line !6, !last_column !5
|
||||
%add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast556_mul__Nx_load71_broadcast.elt0.us = add i32 %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast556.elt0.us, %xb.0257.us
|
||||
%scaled_varying560.elt0.us = shl i32 %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast556_mul__Nx_load71_broadcast.elt0.us, 3
|
||||
%4 = sext i32 %scaled_varying560.elt0.us to i64
|
||||
%ptr562.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %4, !filename !0, !first_line !2, !first_column !7, !last_line !2, !last_column !8
|
||||
%ptr_cast_for_load563.us = bitcast i8* %ptr562.us to <8 x double>*
|
||||
%ptr562_masked_load564.us = load <8 x double>* %ptr_cast_for_load563.us, align 8, !filename !0, !first_line !2, !first_column !7, !last_line !2, !last_column !8
|
||||
%add_Ain_load57_offset_load_Ain_load65_offset_load.us = fadd <8 x double> %ptr_masked_load521.us, %ptr530_masked_load532.us
|
||||
%"varying+const_offsets572.elt0.us" = add i32 %scaled_varying.elt0.us, -24
|
||||
%5 = sext i32 %"varying+const_offsets572.elt0.us" to i64
|
||||
%ptr573.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %5, !filename !0, !first_line !9, !first_column !2, !last_line !9, !last_column !3
|
||||
%ptr_cast_for_load574.us = bitcast i8* %ptr573.us to <8 x double>*
|
||||
%ptr573_masked_load575.us = load <8 x double>* %ptr_cast_for_load574.us, align 8, !filename !0, !first_line !9, !first_column !2, !last_line !9, !last_column !3
|
||||
%"varying+const_offsets583.elt0.us" = add i32 %scaled_varying.elt0.us, 24
|
||||
%6 = sext i32 %"varying+const_offsets583.elt0.us" to i64
|
||||
%ptr584.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %6, !filename !0, !first_line !9, !first_column !4, !last_line !9, !last_column !5
|
||||
%ptr_cast_for_load585.us = bitcast i8* %ptr584.us to <8 x double>*
|
||||
%ptr584_masked_load586.us = load <8 x double>* %ptr_cast_for_load585.us, align 8, !filename !0, !first_line !9, !first_column !4, !last_line !9, !last_column !5
|
||||
%add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast588_mul__Nx_load119_broadcast.elt0.us = add i32 %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast588.elt0.us, %xb.0257.us
|
||||
%scaled_varying593.elt0.us = shl i32 %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast588_mul__Nx_load119_broadcast.elt0.us, 3
|
||||
%7 = sext i32 %scaled_varying593.elt0.us to i64
|
||||
%ptr595.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %7, !filename !0, !first_line !10, !first_column !11, !last_line !10, !last_column !1
|
||||
%ptr_cast_for_load596.us = bitcast i8* %ptr595.us to <8 x double>*
|
||||
%ptr595_masked_load597.us = load <8 x double>* %ptr_cast_for_load596.us, align 8, !filename !0, !first_line !10, !first_column !11, !last_line !10, !last_column !1
|
||||
%add_Ain_load105_offset_load_Ain_load113_offset_load.us = fadd <8 x double> %ptr541_masked_load543.us, %ptr552_masked_load554.us
|
||||
%add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast599_mul__Nx_load79_broadcast.elt0.us = add i32 %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast599.elt0.us, %xb.0257.us
|
||||
%scaled_varying604.elt0.us = shl i32 %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast599_mul__Nx_load79_broadcast.elt0.us, 3
|
||||
%8 = sext i32 %scaled_varying604.elt0.us to i64
|
||||
%ptr606.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %8, !filename !0, !first_line !2, !first_column !12, !last_line !2, !last_column !13
|
||||
%ptr_cast_for_load607.us = bitcast i8* %ptr606.us to <8 x double>*
|
||||
%ptr606_masked_load608.us = load <8 x double>* %ptr_cast_for_load607.us, align 8, !filename !0, !first_line !2, !first_column !12, !last_line !2, !last_column !13
|
||||
%add_add_Ain_load57_offset_load_Ain_load65_offset_load_Ain_load73_offset_load.us = fadd <8 x double> %add_Ain_load57_offset_load_Ain_load65_offset_load.us, %ptr562_masked_load564.us
|
||||
%add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast610_mul__Nx_load167_broadcast.elt0.us = add i32 %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast610.elt0.us, %xb.0257.us
|
||||
%scaled_varying615.elt0.us = shl i32 %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast610_mul__Nx_load167_broadcast.elt0.us, 3
|
||||
%9 = sext i32 %scaled_varying615.elt0.us to i64
|
||||
%ptr617.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %9, !filename !0, !first_line !14, !first_column !11, !last_line !14, !last_column !1
|
||||
%ptr_cast_for_load618.us = bitcast i8* %ptr617.us to <8 x double>*
|
||||
%ptr617_masked_load619.us = load <8 x double>* %ptr_cast_for_load618.us, align 8, !filename !0, !first_line !14, !first_column !11, !last_line !14, !last_column !1
|
||||
%add_Ain_load153_offset_load_Ain_load161_offset_load.us = fadd <8 x double> %ptr573_masked_load575.us, %ptr584_masked_load586.us
|
||||
%add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast621_mul__Nx_load127_broadcast.elt0.us = add i32 %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast621.elt0.us, %xb.0257.us
|
||||
%scaled_varying626.elt0.us = shl i32 %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast621_mul__Nx_load127_broadcast.elt0.us, 3
|
||||
%10 = sext i32 %scaled_varying626.elt0.us to i64
|
||||
%ptr628.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %10, !filename !0, !first_line !10, !first_column !6, !last_line !10, !last_column !15
|
||||
%ptr_cast_for_load629.us = bitcast i8* %ptr628.us to <8 x double>*
|
||||
%ptr628_masked_load630.us = load <8 x double>* %ptr_cast_for_load629.us, align 8, !filename !0, !first_line !10, !first_column !6, !last_line !10, !last_column !15
|
||||
%add_add_Ain_load105_offset_load_Ain_load113_offset_load_Ain_load121_offset_load.us = fadd <8 x double> %add_Ain_load105_offset_load_Ain_load113_offset_load.us, %ptr595_masked_load597.us
|
||||
%add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast632_mul__Nxy_load88_broadcast.elt0.us = add i32 %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast632.elt0.us, %xb.0257.us
|
||||
%scaled_varying637.elt0.us = shl i32 %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast632_mul__Nxy_load88_broadcast.elt0.us, 3
|
||||
%11 = sext i32 %scaled_varying637.elt0.us to i64
|
||||
%ptr639.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %11, !filename !0, !first_line !12, !first_column !11, !last_line !12, !last_column !1
|
||||
%ptr_cast_for_load640.us = bitcast i8* %ptr639.us to <8 x double>*
|
||||
%ptr639_masked_load641.us = load <8 x double>* %ptr_cast_for_load640.us, align 8, !filename !0, !first_line !12, !first_column !11, !last_line !12, !last_column !1
|
||||
%add_add_add_Ain_load57_offset_load_Ain_load65_offset_load_Ain_load73_offset_load_Ain_load81_offset_load.us = fadd <8 x double> %add_add_Ain_load57_offset_load_Ain_load65_offset_load_Ain_load73_offset_load.us, %ptr606_masked_load608.us
|
||||
%add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast643_mul__Nx_load175_broadcast.elt0.us = add i32 %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast643.elt0.us, %xb.0257.us
|
||||
%scaled_varying648.elt0.us = shl i32 %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast643_mul__Nx_load175_broadcast.elt0.us, 3
|
||||
%12 = sext i32 %scaled_varying648.elt0.us to i64
|
||||
%ptr650.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %12, !filename !0, !first_line !14, !first_column !6, !last_line !14, !last_column !15
|
||||
%ptr_cast_for_load651.us = bitcast i8* %ptr650.us to <8 x double>*
|
||||
%ptr650_masked_load652.us = load <8 x double>* %ptr_cast_for_load651.us, align 8, !filename !0, !first_line !14, !first_column !6, !last_line !14, !last_column !15
|
||||
%add_add_Ain_load153_offset_load_Ain_load161_offset_load_Ain_load169_offset_load.us = fadd <8 x double> %add_Ain_load153_offset_load_Ain_load161_offset_load.us, %ptr617_masked_load619.us
|
||||
%add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast654_mul__Nxy_load136_broadcast.elt0.us = add i32 %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast654.elt0.us, %xb.0257.us
|
||||
%scaled_varying659.elt0.us = shl i32 %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast654_mul__Nxy_load136_broadcast.elt0.us, 3
|
||||
%13 = sext i32 %scaled_varying659.elt0.us to i64
|
||||
%ptr661.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %13, !filename !0, !first_line !16, !first_column !11, !last_line !16, !last_column !1
|
||||
%ptr_cast_for_load662.us = bitcast i8* %ptr661.us to <8 x double>*
|
||||
%ptr661_masked_load663.us = load <8 x double>* %ptr_cast_for_load662.us, align 8, !filename !0, !first_line !16, !first_column !11, !last_line !16, !last_column !1
|
||||
%add_add_add_Ain_load105_offset_load_Ain_load113_offset_load_Ain_load121_offset_load_Ain_load129_offset_load.us = fadd <8 x double> %add_add_Ain_load105_offset_load_Ain_load113_offset_load_Ain_load121_offset_load.us, %ptr628_masked_load630.us
|
||||
%add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast665_mul__Nxy_load96_broadcast.elt0.us = add i32 %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast665.elt0.us, %xb.0257.us
|
||||
%scaled_varying670.elt0.us = shl i32 %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast665_mul__Nxy_load96_broadcast.elt0.us, 3
|
||||
%14 = sext i32 %scaled_varying670.elt0.us to i64
|
||||
%ptr672.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %14, !filename !0, !first_line !12, !first_column !6, !last_line !12, !last_column !15
|
||||
%ptr_cast_for_load673.us = bitcast i8* %ptr672.us to <8 x double>*
|
||||
%ptr672_masked_load674.us = load <8 x double>* %ptr_cast_for_load673.us, align 8, !filename !0, !first_line !12, !first_column !6, !last_line !12, !last_column !15
|
||||
%add_add_add_add_Ain_load57_offset_load_Ain_load65_offset_load_Ain_load73_offset_load_Ain_load81_offset_load_Ain_load89_offset_load.us = fadd <8 x double> %add_add_add_Ain_load57_offset_load_Ain_load65_offset_load_Ain_load73_offset_load_Ain_load81_offset_load.us, %ptr639_masked_load641.us
|
||||
%add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast676_mul__Nxy_load184_broadcast.elt0.us = add i32 %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast676.elt0.us, %xb.0257.us
|
||||
%scaled_varying681.elt0.us = shl i32 %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast676_mul__Nxy_load184_broadcast.elt0.us, 3
|
||||
%15 = sext i32 %scaled_varying681.elt0.us to i64
|
||||
%ptr683.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %15, !filename !0, !first_line !17, !first_column !11, !last_line !17, !last_column !1
|
||||
%ptr_cast_for_load684.us = bitcast i8* %ptr683.us to <8 x double>*
|
||||
%ptr683_masked_load685.us = load <8 x double>* %ptr_cast_for_load684.us, align 8, !filename !0, !first_line !17, !first_column !11, !last_line !17, !last_column !1
|
||||
%add_add_add_Ain_load153_offset_load_Ain_load161_offset_load_Ain_load169_offset_load_Ain_load177_offset_load.us = fadd <8 x double> %add_add_Ain_load153_offset_load_Ain_load161_offset_load_Ain_load169_offset_load.us, %ptr650_masked_load652.us
|
||||
%add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast687_mul__Nxy_load144_broadcast.elt0.us = add i32 %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast687.elt0.us, %xb.0257.us
|
||||
%scaled_varying692.elt0.us = shl i32 %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast687_mul__Nxy_load144_broadcast.elt0.us, 3
|
||||
%16 = sext i32 %scaled_varying692.elt0.us to i64
|
||||
%ptr694.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %16, !filename !0, !first_line !16, !first_column !6, !last_line !16, !last_column !15
|
||||
%ptr_cast_for_load695.us = bitcast i8* %ptr694.us to <8 x double>*
|
||||
%ptr694_masked_load696.us = load <8 x double>* %ptr_cast_for_load695.us, align 8, !filename !0, !first_line !16, !first_column !6, !last_line !16, !last_column !15
|
||||
%add_add_add_add_Ain_load105_offset_load_Ain_load113_offset_load_Ain_load121_offset_load_Ain_load129_offset_load_Ain_load137_offset_load.us = fadd <8 x double> %add_add_add_Ain_load105_offset_load_Ain_load113_offset_load_Ain_load121_offset_load_Ain_load129_offset_load.us, %ptr661_masked_load663.us
|
||||
%add_add_add_add_add_Ain_load57_offset_load_Ain_load65_offset_load_Ain_load73_offset_load_Ain_load81_offset_load_Ain_load89_offset_load_Ain_load97_offset_load.us = fadd <8 x double> %add_add_add_add_Ain_load57_offset_load_Ain_load65_offset_load_Ain_load73_offset_load_Ain_load81_offset_load_Ain_load89_offset_load.us, %ptr672_masked_load674.us
|
||||
%17 = sext i32 %scaled_varying.elt0.us to i64
|
||||
%ptr705.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %17, !filename !0, !first_line !8, !first_column !18, !last_line !8, !last_column !19
|
||||
%ptr_cast_for_load706.us = bitcast i8* %ptr705.us to <8 x double>*
|
||||
%ptr705_masked_load707.us = load <8 x double>* %ptr_cast_for_load706.us, align 8, !filename !0, !first_line !8, !first_column !18, !last_line !8, !last_column !19
|
||||
%add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast709_mul__Nxy_load192_broadcast.elt0.us = add i32 %add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast709.elt0.us, %xb.0257.us
|
||||
%scaled_varying714.elt0.us = shl i32 %add_add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast709_mul__Nxy_load192_broadcast.elt0.us, 3
|
||||
%18 = sext i32 %scaled_varying714.elt0.us to i64
|
||||
%ptr716.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %18, !filename !0, !first_line !17, !first_column !6, !last_line !17, !last_column !15
|
||||
%ptr_cast_for_load717.us = bitcast i8* %ptr716.us to <8 x double>*
|
||||
%ptr716_masked_load718.us = load <8 x double>* %ptr_cast_for_load717.us, align 8, !filename !0, !first_line !17, !first_column !6, !last_line !17, !last_column !15
|
||||
%add_add_add_add_Ain_load153_offset_load_Ain_load161_offset_load_Ain_load169_offset_load_Ain_load177_offset_load_Ain_load185_offset_load.us = fadd <8 x double> %add_add_add_Ain_load153_offset_load_Ain_load161_offset_load_Ain_load169_offset_load_Ain_load177_offset_load.us, %ptr683_masked_load685.us
|
||||
%add_add_add_add_add_Ain_load105_offset_load_Ain_load113_offset_load_Ain_load121_offset_load_Ain_load129_offset_load_Ain_load137_offset_load_Ain_load145_offset_load.us = fadd <8 x double> %add_add_add_add_Ain_load105_offset_load_Ain_load113_offset_load_Ain_load121_offset_load_Ain_load129_offset_load_Ain_load137_offset_load.us, %ptr694_masked_load696.us
|
||||
%mul_coef1_load_broadcast_add_add_add_add_add_Ain_load57_offset_load_Ain_load65_offset_load_Ain_load73_offset_load_Ain_load81_offset_load_Ain_load89_offset_load_Ain_load97_offset_load.us = fmul <8 x double> %coef1_load_broadcast, %add_add_add_add_add_Ain_load57_offset_load_Ain_load65_offset_load_Ain_load73_offset_load_Ain_load81_offset_load_Ain_load89_offset_load_Ain_load97_offset_load.us
|
||||
%mul_coef0_load_broadcast_Ain_load_offset_load.us = fmul <8 x double> %coef0_load_broadcast, %ptr705_masked_load707.us
|
||||
%add_add_add_add_add_Ain_load153_offset_load_Ain_load161_offset_load_Ain_load169_offset_load_Ain_load177_offset_load_Ain_load185_offset_load_Ain_load193_offset_load.us = fadd <8 x double> %add_add_add_add_Ain_load153_offset_load_Ain_load161_offset_load_Ain_load169_offset_load_Ain_load177_offset_load_Ain_load185_offset_load.us, %ptr716_masked_load718.us
|
||||
%mul_coef2_load_broadcast_add_add_add_add_add_Ain_load105_offset_load_Ain_load113_offset_load_Ain_load121_offset_load_Ain_load129_offset_load_Ain_load137_offset_load_Ain_load145_offset_load.us = fmul <8 x double> %coef2_load_broadcast, %add_add_add_add_add_Ain_load105_offset_load_Ain_load113_offset_load_Ain_load121_offset_load_Ain_load129_offset_load_Ain_load137_offset_load_Ain_load145_offset_load.us
|
||||
%add_mul_coef0_load_broadcast_Ain_load_offset_load_mul_coef1_load_broadcast_add_add_add_add_add_Ain_load57_offset_load_Ain_load65_offset_load_Ain_load73_offset_load_Ain_load81_offset_load_Ain_load89_offset_load_Ain_load97_offset_load.us = fadd <8 x double> %mul_coef1_load_broadcast_add_add_add_add_add_Ain_load57_offset_load_Ain_load65_offset_load_Ain_load73_offset_load_Ain_load81_offset_load_Ain_load89_offset_load_Ain_load97_offset_load.us, %mul_coef0_load_broadcast_Ain_load_offset_load.us
|
||||
%mul_coef3_load_broadcast_add_add_add_add_add_Ain_load153_offset_load_Ain_load161_offset_load_Ain_load169_offset_load_Ain_load177_offset_load_Ain_load185_offset_load_Ain_load193_offset_load.us = fmul <8 x double> %coef3_load_broadcast, %add_add_add_add_add_Ain_load153_offset_load_Ain_load161_offset_load_Ain_load169_offset_load_Ain_load177_offset_load_Ain_load185_offset_load_Ain_load193_offset_load.us
|
||||
%add_add_mul_coef0_load_broadcast_Ain_load_offset_load_mul_coef1_load_broadcast_add_add_add_add_add_Ain_load57_offset_load_Ain_load65_offset_load_Ain_load73_offset_load_Ain_load81_offset_load_Ain_load89_offset_load_Ain_load97_offset_load_mul_coef2_load_broadcast_add_add_add_add_add_Ain_load105_offset_load_Ain_load113_offset_load_Ain_load121_offset_load_Ain_load129_offset_load_Ain_load137_offset_load_Ain_load145_offset_load.us = fadd <8 x double> %mul_coef2_load_broadcast_add_add_add_add_add_Ain_load105_offset_load_Ain_load113_offset_load_Ain_load121_offset_load_Ain_load129_offset_load_Ain_load137_offset_load_Ain_load145_offset_load.us, %add_mul_coef0_load_broadcast_Ain_load_offset_load_mul_coef1_load_broadcast_add_add_add_add_add_Ain_load57_offset_load_Ain_load65_offset_load_Ain_load73_offset_load_Ain_load81_offset_load_Ain_load89_offset_load_Ain_load97_offset_load.us
|
||||
%add_add_add_mul_coef0_load_broadcast_Ain_load_offset_load_mul_coef1_load_broadcast_add_add_add_add_add_Ain_load57_offset_load_Ain_load65_offset_load_Ain_load73_offset_load_Ain_load81_offset_load_Ain_load89_offset_load_Ain_load97_offset_load_mul_coef2_load_broadcast_add_add_add_add_add_Ain_load105_offset_load_Ain_load113_offset_load_Ain_load121_offset_load_Ain_load129_offset_load_Ain_load137_offset_load_Ain_load145_offset_load_mul_coef3_load_broadcast_add_add_add_add_add_Ain_load153_offset_load_Ain_load161_offset_load_Ain_load169_offset_load_Ain_load177_offset_load_Ain_load185_offset_load_Ain_load193_offset_load.us = fadd <8 x double> %add_add_mul_coef0_load_broadcast_Ain_load_offset_load_mul_coef1_load_broadcast_add_add_add_add_add_Ain_load57_offset_load_Ain_load65_offset_load_Ain_load73_offset_load_Ain_load81_offset_load_Ain_load89_offset_load_Ain_load97_offset_load_mul_coef2_load_broadcast_add_add_add_add_add_Ain_load105_offset_load_Ain_load113_offset_load_Ain_load121_offset_load_Ain_load129_offset_load_Ain_load137_offset_load_Ain_load145_offset_load.us, %mul_coef3_load_broadcast_add_add_add_add_add_Ain_load153_offset_load_Ain_load161_offset_load_Ain_load169_offset_load_Ain_load177_offset_load_Ain_load185_offset_load_Ain_load193_offset_load.us
|
||||
%mask0.i.i234.us = shufflevector <8 x i32> %"oldMask&test.us", <8 x i32> undef, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
|
||||
%mask1.i.i235.us = shufflevector <8 x i32> %"oldMask&test.us", <8 x i32> undef, <8 x i32> <i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
|
||||
%mask0d.i.i236.us = bitcast <8 x i32> %mask0.i.i234.us to <4 x double>
|
||||
%mask1d.i.i237.us = bitcast <8 x i32> %mask1.i.i235.us to <4 x double>
|
||||
%val0d.i.i238.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr705.us, <4 x double> %mask0d.i.i236.us) #0
|
||||
%ptr727.sum.us = add i64 %17, 32
|
||||
%ptr1.i.i239.us = getelementptr i8* %Ain_load65_ptr2int_2void, i64 %ptr727.sum.us
|
||||
%val1d.i.i240.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i239.us, <4 x double> %mask1d.i.i237.us) #0
|
||||
%vald.i.i241.us = shufflevector <4 x double> %val0d.i.i238.us, <4 x double> %val1d.i.i240.us, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
%mul__Ain_load211_offset_load.us = fmul <8 x double> %vald.i.i241.us, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
|
||||
%ptr736.us = getelementptr i8* %Aout_load219_ptr2int_2void, i64 %17, !filename !0, !first_line !20, !first_column !21, !last_line !20, !last_column !22
|
||||
%val0d.i.i228.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr736.us, <4 x double> %mask0d.i.i236.us) #0
|
||||
%ptr1.i.i229.us = getelementptr i8* %Aout_load219_ptr2int_2void, i64 %ptr727.sum.us
|
||||
%val1d.i.i230.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i229.us, <4 x double> %mask1d.i.i237.us) #0
|
||||
%vald.i.i231.us = shufflevector <4 x double> %val0d.i.i228.us, <4 x double> %val1d.i.i230.us, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
%sub_mul__Ain_load211_offset_load_Aout_load219_offset_load.us = fsub <8 x double> %mul__Ain_load211_offset_load.us, %vald.i.i231.us
|
||||
%ptr745.us = getelementptr i8* %vsq_load_ptr2int_2void, i64 %17, !filename !0, !first_line !23, !first_column !24, !last_line !23, !last_column !7
|
||||
%val0d.i.i218.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr745.us, <4 x double> %mask0d.i.i236.us) #0
|
||||
%ptr1.i.i219.us = getelementptr i8* %vsq_load_ptr2int_2void, i64 %ptr727.sum.us
|
||||
%val1d.i.i220.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i219.us, <4 x double> %mask1d.i.i237.us) #0
|
||||
%vald.i.i221.us = shufflevector <4 x double> %val0d.i.i218.us, <4 x double> %val1d.i.i220.us, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
%mul_vsq_load_offset_load_div_load.us = fmul <8 x double> %add_add_add_mul_coef0_load_broadcast_Ain_load_offset_load_mul_coef1_load_broadcast_add_add_add_add_add_Ain_load57_offset_load_Ain_load65_offset_load_Ain_load73_offset_load_Ain_load81_offset_load_Ain_load89_offset_load_Ain_load97_offset_load_mul_coef2_load_broadcast_add_add_add_add_add_Ain_load105_offset_load_Ain_load113_offset_load_Ain_load121_offset_load_Ain_load129_offset_load_Ain_load137_offset_load_Ain_load145_offset_load_mul_coef3_load_broadcast_add_add_add_add_add_Ain_load153_offset_load_Ain_load161_offset_load_Ain_load169_offset_load_Ain_load177_offset_load_Ain_load185_offset_load_Ain_load193_offset_load.us, %vald.i.i221.us
|
||||
%add_sub_mul__Ain_load211_offset_load_Aout_load219_offset_load_mul_vsq_load_offset_load_div_load.us = fadd <8 x double> %sub_mul__Ain_load211_offset_load_Aout_load219_offset_load.us, %mul_vsq_load_offset_load_div_load.us
|
||||
%val0.i.i.us = shufflevector <8 x double> %add_sub_mul__Ain_load211_offset_load_Aout_load219_offset_load_mul_vsq_load_offset_load_div_load.us, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%val1.i.i.us = shufflevector <8 x double> %add_sub_mul__Ain_load211_offset_load_Aout_load219_offset_load_mul_vsq_load_offset_load_div_load.us, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
|
||||
call void @llvm.x86.avx.maskstore.pd.256(i8* %ptr736.us, <4 x double> %mask0d.i.i236.us, <4 x double> %val0.i.i.us) #0
|
||||
call void @llvm.x86.avx.maskstore.pd.256(i8* %ptr1.i.i229.us, <4 x double> %mask1d.i.i237.us, <4 x double> %val1.i.i.us) #0
|
||||
br label %safe_if_after_true.us
|
||||
|
||||
safe_if_after_true.us: ; preds = %safe_if_run_true.us, %for_loop39.us
|
||||
%add_xb_load240_.us = add i32 %xb.0257.us, 8
|
||||
%less_xb_load_x1_load.us = icmp slt i32 %add_xb_load240_.us, %x1
|
||||
br i1 %less_xb_load_x1_load.us, label %for_loop39.us, label %for_exit40.us
|
||||
|
||||
for_loop39.lr.ph.us: ; preds = %for_exit40.us, %for_test37.preheader.lr.ph
|
||||
%y.0259.us = phi i32 [ %y_load241_plus1.us, %for_exit40.us ], [ %y0, %for_test37.preheader.lr.ph ]
|
||||
%mul_y_load46_Nx_load47.us = mul i32 %y.0259.us, %Nx
|
||||
%add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.us = add i32 %mul_y_load46_Nx_load47.us, %mul_z_load45_Nxy_load
|
||||
%add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast556.elt0.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.us, %Nx
|
||||
%add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast588.elt0.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.us, %mul__Nx_load119
|
||||
%add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast599.elt0.us = sub i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.us, %Nx
|
||||
%add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast610.elt0.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.us, %mul__Nx_load167
|
||||
%add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast621.elt0.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.us, %mul__Nx_load127
|
||||
%add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast632.elt0.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.us, %mul_Nx_load_Ny_load
|
||||
%add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast643.elt0.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.us, %mul__Nx_load175
|
||||
%add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast654.elt0.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.us, %mul__Nxy_load136
|
||||
%add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast665.elt0.us = sub i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.us, %mul_Nx_load_Ny_load
|
||||
%add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast676.elt0.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.us, %mul__Nxy_load184
|
||||
%add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast687.elt0.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.us, %mul__Nxy_load144
|
||||
%add_add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47_broadcast_xb_load44_broadcast709.elt0.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.us, %mul__Nxy_load192
|
||||
br label %for_loop39.us
|
||||
|
||||
for_exit: ; preds = %for_exit278, %for_exit33, %for_test.preheader, %for_test264.preheader
|
||||
ret void
|
||||
|
||||
for_exit33: ; preds = %for_exit40.us, %for_test37.preheader.lr.ph, %for_test30.preheader
|
||||
%z_load242_plus1 = add i32 %z.0261, 1
|
||||
%exitcond269 = icmp eq i32 %z_load242_plus1, %z1
|
||||
br i1 %exitcond269, label %for_exit, label %for_test30.preheader
|
||||
|
||||
for_test275.preheader: ; preds = %for_exit278, %for_test275.preheader.lr.ph
|
||||
%z269.0268 = phi i32 [ %z0, %for_test275.preheader.lr.ph ], [ %z_load518_plus1, %for_exit278 ]
|
||||
br i1 %less_y_load282_y1_load283264, label %for_test286.preheader.lr.ph, label %for_exit278
|
||||
|
||||
for_test286.preheader.lr.ph: ; preds = %for_test275.preheader
|
||||
%mul_z_load300_Nxy_load301 = mul i32 %z269.0268, %mul_Nx_load_Ny_load
|
||||
br i1 %less_xb_load293_x1_load294262, label %for_loop288.lr.ph.us, label %for_exit278
|
||||
|
||||
for_exit289.us: ; preds = %safe_if_after_true466.us
|
||||
%y_load517_plus1.us = add i32 %y280.0265.us, 1
|
||||
%exitcond271 = icmp eq i32 %y_load517_plus1.us, %y1
|
||||
br i1 %exitcond271, label %for_exit278, label %for_loop288.lr.ph.us
|
||||
|
||||
for_loop288.us: ; preds = %for_loop288.lr.ph.us, %safe_if_after_true466.us
|
||||
%xb291.0263.us = phi i32 [ %x0, %for_loop288.lr.ph.us ], [ %add_xb291_load_.us, %safe_if_after_true466.us ]
|
||||
%xb_load298_broadcast_init.us = insertelement <8 x i32> undef, i32 %xb291.0263.us, i32 0
|
||||
%xb_load298_broadcast.us = shufflevector <8 x i32> %xb_load298_broadcast_init.us, <8 x i32> undef, <8 x i32> zeroinitializer
|
||||
%add_xb_load298_broadcast_.us = add <8 x i32> %xb_load298_broadcast.us, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
%less_x_load462_x1_load463_broadcast.us = icmp slt <8 x i32> %add_xb_load298_broadcast_.us, %x1_load463_broadcast
|
||||
%"oldMask&test468.us" = select <8 x i1> %less_x_load462_x1_load463_broadcast.us, <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <8 x i32> zeroinitializer
|
||||
%"internal_mask&function_mask472.us" = and <8 x i32> %"oldMask&test468.us", %__mask
|
||||
%floatmask.i211.us = bitcast <8 x i32> %"internal_mask&function_mask472.us" to <8 x float>
|
||||
%v.i212.us = tail call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask.i211.us) #1
|
||||
%cmp.i213.us = icmp eq i32 %v.i212.us, 0
|
||||
br i1 %cmp.i213.us, label %safe_if_after_true466.us, label %safe_if_run_true467.us
|
||||
|
||||
safe_if_run_true467.us: ; preds = %for_loop288.us
|
||||
%add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast.elt0.us = add i32 %xb291.0263.us, %add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303.us
|
||||
%scaled_varying757.elt0.us = shl i32 %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast.elt0.us, 3
|
||||
%"varying+const_offsets.elt0758.us" = add i32 %scaled_varying757.elt0.us, -8
|
||||
%19 = sext i32 %"varying+const_offsets.elt0758.us" to i64
|
||||
%ptr759.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %19, !filename !0, !first_line !1, !first_column !2, !last_line !1, !last_column !3
|
||||
%val0d.i.i205.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr759.us, <4 x double> %mask0d.i.i203) #0
|
||||
%ptr759.sum.us = add i64 %19, 32
|
||||
%ptr1.i.i206.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr759.sum.us
|
||||
%val1d.i.i207.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i206.us, <4 x double> %mask1d.i.i204) #0
|
||||
%vald.i.i208.us = shufflevector <4 x double> %val0d.i.i205.us, <4 x double> %val1d.i.i207.us, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
%"varying+const_offsets767.elt0.us" = add i32 %scaled_varying757.elt0.us, 8
|
||||
%20 = sext i32 %"varying+const_offsets767.elt0.us" to i64
|
||||
%ptr768.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %20, !filename !0, !first_line !1, !first_column !4, !last_line !1, !last_column !5
|
||||
%val0d.i.i195.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr768.us, <4 x double> %mask0d.i.i203) #0
|
||||
%ptr768.sum.us = add i64 %20, 32
|
||||
%ptr1.i.i196.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr768.sum.us
|
||||
%val1d.i.i197.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i196.us, <4 x double> %mask1d.i.i204) #0
|
||||
%vald.i.i198.us = shufflevector <4 x double> %val0d.i.i195.us, <4 x double> %val1d.i.i197.us, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
%"varying+const_offsets776.elt0.us" = add i32 %scaled_varying757.elt0.us, -16
|
||||
%21 = sext i32 %"varying+const_offsets776.elt0.us" to i64
|
||||
%ptr777.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %21, !filename !0, !first_line !6, !first_column !2, !last_line !6, !last_column !3
|
||||
%val0d.i.i185.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr777.us, <4 x double> %mask0d.i.i203) #0
|
||||
%ptr777.sum.us = add i64 %21, 32
|
||||
%ptr1.i.i186.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr777.sum.us
|
||||
%val1d.i.i187.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i186.us, <4 x double> %mask1d.i.i204) #0
|
||||
%vald.i.i188.us = shufflevector <4 x double> %val0d.i.i185.us, <4 x double> %val1d.i.i187.us, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
%"varying+const_offsets785.elt0.us" = add i32 %scaled_varying757.elt0.us, 16
|
||||
%22 = sext i32 %"varying+const_offsets785.elt0.us" to i64
|
||||
%ptr786.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %22, !filename !0, !first_line !6, !first_column !4, !last_line !6, !last_column !5
|
||||
%val0d.i.i175.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr786.us, <4 x double> %mask0d.i.i203) #0
|
||||
%ptr786.sum.us = add i64 %22, 32
|
||||
%ptr1.i.i176.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr786.sum.us
|
||||
%val1d.i.i177.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i176.us, <4 x double> %mask1d.i.i204) #0
|
||||
%vald.i.i178.us = shufflevector <4 x double> %val0d.i.i175.us, <4 x double> %val1d.i.i177.us, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
%add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast788_mul__Nx_load333_broadcast.elt0.us = add i32 %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast788.elt0.us, %xb291.0263.us
|
||||
%scaled_varying793.elt0.us = shl i32 %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast788_mul__Nx_load333_broadcast.elt0.us, 3
|
||||
%23 = sext i32 %scaled_varying793.elt0.us to i64
|
||||
%ptr795.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %23, !filename !0, !first_line !2, !first_column !7, !last_line !2, !last_column !8
|
||||
%val0d.i.i165.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr795.us, <4 x double> %mask0d.i.i203) #0
|
||||
%ptr795.sum.us = add i64 %23, 32
|
||||
%ptr1.i.i166.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr795.sum.us
|
||||
%val1d.i.i167.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i166.us, <4 x double> %mask1d.i.i204) #0
|
||||
%vald.i.i168.us = shufflevector <4 x double> %val0d.i.i165.us, <4 x double> %val1d.i.i167.us, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
%add_Ain_load319_offset_load_Ain_load327_offset_load.us = fadd <8 x double> %vald.i.i208.us, %vald.i.i198.us
|
||||
%"varying+const_offsets803.elt0.us" = add i32 %scaled_varying757.elt0.us, -24
|
||||
%24 = sext i32 %"varying+const_offsets803.elt0.us" to i64
|
||||
%ptr804.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %24, !filename !0, !first_line !9, !first_column !2, !last_line !9, !last_column !3
|
||||
%val0d.i.i155.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr804.us, <4 x double> %mask0d.i.i203) #0
|
||||
%ptr804.sum.us = add i64 %24, 32
|
||||
%ptr1.i.i156.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr804.sum.us
|
||||
%val1d.i.i157.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i156.us, <4 x double> %mask1d.i.i204) #0
|
||||
%vald.i.i158.us = shufflevector <4 x double> %val0d.i.i155.us, <4 x double> %val1d.i.i157.us, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
%"varying+const_offsets812.elt0.us" = add i32 %scaled_varying757.elt0.us, 24
|
||||
%25 = sext i32 %"varying+const_offsets812.elt0.us" to i64
|
||||
%ptr813.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %25, !filename !0, !first_line !9, !first_column !4, !last_line !9, !last_column !5
|
||||
%val0d.i.i145.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr813.us, <4 x double> %mask0d.i.i203) #0
|
||||
%ptr813.sum.us = add i64 %25, 32
|
||||
%ptr1.i.i146.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr813.sum.us
|
||||
%val1d.i.i147.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i146.us, <4 x double> %mask1d.i.i204) #0
|
||||
%vald.i.i148.us = shufflevector <4 x double> %val0d.i.i145.us, <4 x double> %val1d.i.i147.us, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
%add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast815_mul__Nx_load382_broadcast.elt0.us = add i32 %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast815.elt0.us, %xb291.0263.us
|
||||
%scaled_varying820.elt0.us = shl i32 %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast815_mul__Nx_load382_broadcast.elt0.us, 3
|
||||
%26 = sext i32 %scaled_varying820.elt0.us to i64
|
||||
%ptr822.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %26, !filename !0, !first_line !10, !first_column !11, !last_line !10, !last_column !1
|
||||
%val0d.i.i135.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr822.us, <4 x double> %mask0d.i.i203) #0
|
||||
%ptr822.sum.us = add i64 %26, 32
|
||||
%ptr1.i.i136.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr822.sum.us
|
||||
%val1d.i.i137.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i136.us, <4 x double> %mask1d.i.i204) #0
|
||||
%vald.i.i138.us = shufflevector <4 x double> %val0d.i.i135.us, <4 x double> %val1d.i.i137.us, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
%add_Ain_load368_offset_load_Ain_load376_offset_load.us = fadd <8 x double> %vald.i.i188.us, %vald.i.i178.us
|
||||
%add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast824_mul__Nx_load341_broadcast.elt0.us = add i32 %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast824.elt0.us, %xb291.0263.us
|
||||
%scaled_varying829.elt0.us = shl i32 %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast824_mul__Nx_load341_broadcast.elt0.us, 3
|
||||
%27 = sext i32 %scaled_varying829.elt0.us to i64
|
||||
%ptr831.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %27, !filename !0, !first_line !2, !first_column !12, !last_line !2, !last_column !13
|
||||
%val0d.i.i125.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr831.us, <4 x double> %mask0d.i.i203) #0
|
||||
%ptr831.sum.us = add i64 %27, 32
|
||||
%ptr1.i.i126.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr831.sum.us
|
||||
%val1d.i.i127.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i126.us, <4 x double> %mask1d.i.i204) #0
|
||||
%vald.i.i128.us = shufflevector <4 x double> %val0d.i.i125.us, <4 x double> %val1d.i.i127.us, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
%add_add_Ain_load319_offset_load_Ain_load327_offset_load_Ain_load335_offset_load.us = fadd <8 x double> %add_Ain_load319_offset_load_Ain_load327_offset_load.us, %vald.i.i168.us
|
||||
%add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast833_mul__Nx_load431_broadcast.elt0.us = add i32 %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast833.elt0.us, %xb291.0263.us
|
||||
%scaled_varying838.elt0.us = shl i32 %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast833_mul__Nx_load431_broadcast.elt0.us, 3
|
||||
%28 = sext i32 %scaled_varying838.elt0.us to i64
|
||||
%ptr840.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %28, !filename !0, !first_line !14, !first_column !11, !last_line !14, !last_column !1
|
||||
%val0d.i.i115.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr840.us, <4 x double> %mask0d.i.i203) #0
|
||||
%ptr840.sum.us = add i64 %28, 32
|
||||
%ptr1.i.i116.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr840.sum.us
|
||||
%val1d.i.i117.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i116.us, <4 x double> %mask1d.i.i204) #0
|
||||
%vald.i.i118.us = shufflevector <4 x double> %val0d.i.i115.us, <4 x double> %val1d.i.i117.us, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
%add_Ain_load417_offset_load_Ain_load425_offset_load.us = fadd <8 x double> %vald.i.i158.us, %vald.i.i148.us
|
||||
%add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast842_mul__Nx_load390_broadcast.elt0.us = add i32 %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast842.elt0.us, %xb291.0263.us
|
||||
%scaled_varying847.elt0.us = shl i32 %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast842_mul__Nx_load390_broadcast.elt0.us, 3
|
||||
%29 = sext i32 %scaled_varying847.elt0.us to i64
|
||||
%ptr849.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %29, !filename !0, !first_line !10, !first_column !6, !last_line !10, !last_column !15
|
||||
%val0d.i.i105.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr849.us, <4 x double> %mask0d.i.i203) #0
|
||||
%ptr849.sum.us = add i64 %29, 32
|
||||
%ptr1.i.i106.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr849.sum.us
|
||||
%val1d.i.i107.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i106.us, <4 x double> %mask1d.i.i204) #0
|
||||
%vald.i.i108.us = shufflevector <4 x double> %val0d.i.i105.us, <4 x double> %val1d.i.i107.us, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
%add_add_Ain_load368_offset_load_Ain_load376_offset_load_Ain_load384_offset_load.us = fadd <8 x double> %add_Ain_load368_offset_load_Ain_load376_offset_load.us, %vald.i.i138.us
|
||||
%add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast851_mul__Nxy_load350_broadcast.elt0.us = add i32 %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast851.elt0.us, %xb291.0263.us
|
||||
%scaled_varying856.elt0.us = shl i32 %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast851_mul__Nxy_load350_broadcast.elt0.us, 3
|
||||
%30 = sext i32 %scaled_varying856.elt0.us to i64
|
||||
%ptr858.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %30, !filename !0, !first_line !12, !first_column !11, !last_line !12, !last_column !1
|
||||
%val0d.i.i95.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr858.us, <4 x double> %mask0d.i.i203) #0
|
||||
%ptr858.sum.us = add i64 %30, 32
|
||||
%ptr1.i.i96.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr858.sum.us
|
||||
%val1d.i.i97.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i96.us, <4 x double> %mask1d.i.i204) #0
|
||||
%vald.i.i98.us = shufflevector <4 x double> %val0d.i.i95.us, <4 x double> %val1d.i.i97.us, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
%add_add_add_Ain_load319_offset_load_Ain_load327_offset_load_Ain_load335_offset_load_Ain_load343_offset_load.us = fadd <8 x double> %add_add_Ain_load319_offset_load_Ain_load327_offset_load_Ain_load335_offset_load.us, %vald.i.i128.us
|
||||
%add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast860_mul__Nx_load439_broadcast.elt0.us = add i32 %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast860.elt0.us, %xb291.0263.us
|
||||
%scaled_varying865.elt0.us = shl i32 %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast860_mul__Nx_load439_broadcast.elt0.us, 3
|
||||
%31 = sext i32 %scaled_varying865.elt0.us to i64
|
||||
%ptr867.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %31, !filename !0, !first_line !14, !first_column !6, !last_line !14, !last_column !15
|
||||
%val0d.i.i85.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr867.us, <4 x double> %mask0d.i.i203) #0
|
||||
%ptr867.sum.us = add i64 %31, 32
|
||||
%ptr1.i.i86.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr867.sum.us
|
||||
%val1d.i.i87.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i86.us, <4 x double> %mask1d.i.i204) #0
|
||||
%vald.i.i88.us = shufflevector <4 x double> %val0d.i.i85.us, <4 x double> %val1d.i.i87.us, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
%add_add_Ain_load417_offset_load_Ain_load425_offset_load_Ain_load433_offset_load.us = fadd <8 x double> %add_Ain_load417_offset_load_Ain_load425_offset_load.us, %vald.i.i118.us
|
||||
%add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast869_mul__Nxy_load399_broadcast.elt0.us = add i32 %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast869.elt0.us, %xb291.0263.us
|
||||
%scaled_varying874.elt0.us = shl i32 %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast869_mul__Nxy_load399_broadcast.elt0.us, 3
|
||||
%32 = sext i32 %scaled_varying874.elt0.us to i64
|
||||
%ptr876.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %32, !filename !0, !first_line !16, !first_column !11, !last_line !16, !last_column !1
|
||||
%val0d.i.i75.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr876.us, <4 x double> %mask0d.i.i203) #0
|
||||
%ptr876.sum.us = add i64 %32, 32
|
||||
%ptr1.i.i76.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr876.sum.us
|
||||
%val1d.i.i77.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i76.us, <4 x double> %mask1d.i.i204) #0
|
||||
%vald.i.i78.us = shufflevector <4 x double> %val0d.i.i75.us, <4 x double> %val1d.i.i77.us, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
%add_add_add_Ain_load368_offset_load_Ain_load376_offset_load_Ain_load384_offset_load_Ain_load392_offset_load.us = fadd <8 x double> %add_add_Ain_load368_offset_load_Ain_load376_offset_load_Ain_load384_offset_load.us, %vald.i.i108.us
|
||||
%add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast878_mul__Nxy_load358_broadcast.elt0.us = add i32 %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast878.elt0.us, %xb291.0263.us
|
||||
%scaled_varying883.elt0.us = shl i32 %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast878_mul__Nxy_load358_broadcast.elt0.us, 3
|
||||
%33 = sext i32 %scaled_varying883.elt0.us to i64
|
||||
%ptr885.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %33, !filename !0, !first_line !12, !first_column !6, !last_line !12, !last_column !15
|
||||
%val0d.i.i65.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr885.us, <4 x double> %mask0d.i.i203) #0
|
||||
%ptr885.sum.us = add i64 %33, 32
|
||||
%ptr1.i.i66.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr885.sum.us
|
||||
%val1d.i.i67.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i66.us, <4 x double> %mask1d.i.i204) #0
|
||||
%vald.i.i68.us = shufflevector <4 x double> %val0d.i.i65.us, <4 x double> %val1d.i.i67.us, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
%add_add_add_add_Ain_load319_offset_load_Ain_load327_offset_load_Ain_load335_offset_load_Ain_load343_offset_load_Ain_load351_offset_load.us = fadd <8 x double> %add_add_add_Ain_load319_offset_load_Ain_load327_offset_load_Ain_load335_offset_load_Ain_load343_offset_load.us, %vald.i.i98.us
|
||||
%add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast887_mul__Nxy_load448_broadcast.elt0.us = add i32 %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast887.elt0.us, %xb291.0263.us
|
||||
%scaled_varying892.elt0.us = shl i32 %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast887_mul__Nxy_load448_broadcast.elt0.us, 3
|
||||
%34 = sext i32 %scaled_varying892.elt0.us to i64
|
||||
%ptr894.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %34, !filename !0, !first_line !17, !first_column !11, !last_line !17, !last_column !1
|
||||
%val0d.i.i55.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr894.us, <4 x double> %mask0d.i.i203) #0
|
||||
%ptr894.sum.us = add i64 %34, 32
|
||||
%ptr1.i.i56.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr894.sum.us
|
||||
%val1d.i.i57.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i56.us, <4 x double> %mask1d.i.i204) #0
|
||||
%vald.i.i58.us = shufflevector <4 x double> %val0d.i.i55.us, <4 x double> %val1d.i.i57.us, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
%add_add_add_Ain_load417_offset_load_Ain_load425_offset_load_Ain_load433_offset_load_Ain_load441_offset_load.us = fadd <8 x double> %add_add_Ain_load417_offset_load_Ain_load425_offset_load_Ain_load433_offset_load.us, %vald.i.i88.us
|
||||
%add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast896_mul__Nxy_load407_broadcast.elt0.us = add i32 %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast896.elt0.us, %xb291.0263.us
|
||||
%scaled_varying901.elt0.us = shl i32 %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast896_mul__Nxy_load407_broadcast.elt0.us, 3
|
||||
%35 = sext i32 %scaled_varying901.elt0.us to i64
|
||||
%ptr903.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %35, !filename !0, !first_line !16, !first_column !6, !last_line !16, !last_column !15
|
||||
%val0d.i.i45.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr903.us, <4 x double> %mask0d.i.i203) #0
|
||||
%ptr903.sum.us = add i64 %35, 32
|
||||
%ptr1.i.i46.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr903.sum.us
|
||||
%val1d.i.i47.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i46.us, <4 x double> %mask1d.i.i204) #0
|
||||
%vald.i.i48.us = shufflevector <4 x double> %val0d.i.i45.us, <4 x double> %val1d.i.i47.us, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
%add_add_add_add_Ain_load368_offset_load_Ain_load376_offset_load_Ain_load384_offset_load_Ain_load392_offset_load_Ain_load400_offset_load.us = fadd <8 x double> %add_add_add_Ain_load368_offset_load_Ain_load376_offset_load_Ain_load384_offset_load_Ain_load392_offset_load.us, %vald.i.i78.us
|
||||
%add_add_add_add_add_Ain_load319_offset_load_Ain_load327_offset_load_Ain_load335_offset_load_Ain_load343_offset_load_Ain_load351_offset_load_Ain_load359_offset_load.us = fadd <8 x double> %add_add_add_add_Ain_load319_offset_load_Ain_load327_offset_load_Ain_load335_offset_load_Ain_load343_offset_load_Ain_load351_offset_load.us, %vald.i.i68.us
|
||||
%36 = sext i32 %scaled_varying757.elt0.us to i64
|
||||
%ptr912.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %36, !filename !0, !first_line !8, !first_column !18, !last_line !8, !last_column !19
|
||||
%val0d.i.i35.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr912.us, <4 x double> %mask0d.i.i203) #0
|
||||
%ptr912.sum.us = add i64 %36, 32
|
||||
%ptr1.i.i36.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr912.sum.us
|
||||
%val1d.i.i37.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i36.us, <4 x double> %mask1d.i.i204) #0
|
||||
%vald.i.i38.us = shufflevector <4 x double> %val0d.i.i35.us, <4 x double> %val1d.i.i37.us, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
%add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast914_mul__Nxy_load456_broadcast.elt0.us = add i32 %add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast914.elt0.us, %xb291.0263.us
|
||||
%scaled_varying919.elt0.us = shl i32 %add_add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast914_mul__Nxy_load456_broadcast.elt0.us, 3
|
||||
%37 = sext i32 %scaled_varying919.elt0.us to i64
|
||||
%ptr921.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %37, !filename !0, !first_line !17, !first_column !6, !last_line !17, !last_column !15
|
||||
%val0d.i.i25.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr921.us, <4 x double> %mask0d.i.i203) #0
|
||||
%ptr921.sum.us = add i64 %37, 32
|
||||
%ptr1.i.i26.us = getelementptr i8* %Ain_load327_ptr2int_2void, i64 %ptr921.sum.us
|
||||
%val1d.i.i27.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i26.us, <4 x double> %mask1d.i.i204) #0
|
||||
%vald.i.i28.us = shufflevector <4 x double> %val0d.i.i25.us, <4 x double> %val1d.i.i27.us, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
%add_add_add_add_Ain_load417_offset_load_Ain_load425_offset_load_Ain_load433_offset_load_Ain_load441_offset_load_Ain_load449_offset_load.us = fadd <8 x double> %add_add_add_Ain_load417_offset_load_Ain_load425_offset_load_Ain_load433_offset_load_Ain_load441_offset_load.us, %vald.i.i58.us
|
||||
%add_add_add_add_add_Ain_load368_offset_load_Ain_load376_offset_load_Ain_load384_offset_load_Ain_load392_offset_load_Ain_load400_offset_load_Ain_load408_offset_load.us = fadd <8 x double> %add_add_add_add_Ain_load368_offset_load_Ain_load376_offset_load_Ain_load384_offset_load_Ain_load392_offset_load_Ain_load400_offset_load.us, %vald.i.i48.us
|
||||
%mul_coef1_load315_broadcast_add_add_add_add_add_Ain_load319_offset_load_Ain_load327_offset_load_Ain_load335_offset_load_Ain_load343_offset_load_Ain_load351_offset_load_Ain_load359_offset_load.us = fmul <8 x double> %coef1_load315_broadcast, %add_add_add_add_add_Ain_load319_offset_load_Ain_load327_offset_load_Ain_load335_offset_load_Ain_load343_offset_load_Ain_load351_offset_load_Ain_load359_offset_load.us
|
||||
%mul_coef0_load306_broadcast_Ain_load310_offset_load.us = fmul <8 x double> %coef0_load306_broadcast, %vald.i.i38.us
|
||||
%add_add_add_add_add_Ain_load417_offset_load_Ain_load425_offset_load_Ain_load433_offset_load_Ain_load441_offset_load_Ain_load449_offset_load_Ain_load457_offset_load.us = fadd <8 x double> %add_add_add_add_Ain_load417_offset_load_Ain_load425_offset_load_Ain_load433_offset_load_Ain_load441_offset_load_Ain_load449_offset_load.us, %vald.i.i28.us
|
||||
%mul_coef2_load364_broadcast_add_add_add_add_add_Ain_load368_offset_load_Ain_load376_offset_load_Ain_load384_offset_load_Ain_load392_offset_load_Ain_load400_offset_load_Ain_load408_offset_load.us = fmul <8 x double> %coef2_load364_broadcast, %add_add_add_add_add_Ain_load368_offset_load_Ain_load376_offset_load_Ain_load384_offset_load_Ain_load392_offset_load_Ain_load400_offset_load_Ain_load408_offset_load.us
|
||||
%add_mul_coef0_load306_broadcast_Ain_load310_offset_load_mul_coef1_load315_broadcast_add_add_add_add_add_Ain_load319_offset_load_Ain_load327_offset_load_Ain_load335_offset_load_Ain_load343_offset_load_Ain_load351_offset_load_Ain_load359_offset_load.us = fadd <8 x double> %mul_coef1_load315_broadcast_add_add_add_add_add_Ain_load319_offset_load_Ain_load327_offset_load_Ain_load335_offset_load_Ain_load343_offset_load_Ain_load351_offset_load_Ain_load359_offset_load.us, %mul_coef0_load306_broadcast_Ain_load310_offset_load.us
|
||||
%mul_coef3_load413_broadcast_add_add_add_add_add_Ain_load417_offset_load_Ain_load425_offset_load_Ain_load433_offset_load_Ain_load441_offset_load_Ain_load449_offset_load_Ain_load457_offset_load.us = fmul <8 x double> %coef3_load413_broadcast, %add_add_add_add_add_Ain_load417_offset_load_Ain_load425_offset_load_Ain_load433_offset_load_Ain_load441_offset_load_Ain_load449_offset_load_Ain_load457_offset_load.us
|
||||
%add_add_mul_coef0_load306_broadcast_Ain_load310_offset_load_mul_coef1_load315_broadcast_add_add_add_add_add_Ain_load319_offset_load_Ain_load327_offset_load_Ain_load335_offset_load_Ain_load343_offset_load_Ain_load351_offset_load_Ain_load359_offset_load_mul_coef2_load364_broadcast_add_add_add_add_add_Ain_load368_offset_load_Ain_load376_offset_load_Ain_load384_offset_load_Ain_load392_offset_load_Ain_load400_offset_load_Ain_load408_offset_load.us = fadd <8 x double> %mul_coef2_load364_broadcast_add_add_add_add_add_Ain_load368_offset_load_Ain_load376_offset_load_Ain_load384_offset_load_Ain_load392_offset_load_Ain_load400_offset_load_Ain_load408_offset_load.us, %add_mul_coef0_load306_broadcast_Ain_load310_offset_load_mul_coef1_load315_broadcast_add_add_add_add_add_Ain_load319_offset_load_Ain_load327_offset_load_Ain_load335_offset_load_Ain_load343_offset_load_Ain_load351_offset_load_Ain_load359_offset_load.us
|
||||
%add_add_add_mul_coef0_load306_broadcast_Ain_load310_offset_load_mul_coef1_load315_broadcast_add_add_add_add_add_Ain_load319_offset_load_Ain_load327_offset_load_Ain_load335_offset_load_Ain_load343_offset_load_Ain_load351_offset_load_Ain_load359_offset_load_mul_coef2_load364_broadcast_add_add_add_add_add_Ain_load368_offset_load_Ain_load376_offset_load_Ain_load384_offset_load_Ain_load392_offset_load_Ain_load400_offset_load_Ain_load408_offset_load_mul_coef3_load413_broadcast_add_add_add_add_add_Ain_load417_offset_load_Ain_load425_offset_load_Ain_load433_offset_load_Ain_load441_offset_load_Ain_load449_offset_load_Ain_load457_offset_load.us = fadd <8 x double> %add_add_mul_coef0_load306_broadcast_Ain_load310_offset_load_mul_coef1_load315_broadcast_add_add_add_add_add_Ain_load319_offset_load_Ain_load327_offset_load_Ain_load335_offset_load_Ain_load343_offset_load_Ain_load351_offset_load_Ain_load359_offset_load_mul_coef2_load364_broadcast_add_add_add_add_add_Ain_load368_offset_load_Ain_load376_offset_load_Ain_load384_offset_load_Ain_load392_offset_load_Ain_load400_offset_load_Ain_load408_offset_load.us, %mul_coef3_load413_broadcast_add_add_add_add_add_Ain_load417_offset_load_Ain_load425_offset_load_Ain_load433_offset_load_Ain_load441_offset_load_Ain_load449_offset_load_Ain_load457_offset_load.us
|
||||
%mask0.i.i11.us = shufflevector <8 x i32> %"internal_mask&function_mask472.us", <8 x i32> undef, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
|
||||
%mask1.i.i12.us = shufflevector <8 x i32> %"internal_mask&function_mask472.us", <8 x i32> undef, <8 x i32> <i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
|
||||
%mask0d.i.i13.us = bitcast <8 x i32> %mask0.i.i11.us to <4 x double>
|
||||
%mask1d.i.i14.us = bitcast <8 x i32> %mask1.i.i12.us to <4 x double>
|
||||
%val0d.i.i15.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr912.us, <4 x double> %mask0d.i.i13.us) #0
|
||||
%val1d.i.i17.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i36.us, <4 x double> %mask1d.i.i14.us) #0
|
||||
%vald.i.i18.us = shufflevector <4 x double> %val0d.i.i15.us, <4 x double> %val1d.i.i17.us, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
%mul__Ain_load480_offset_load.us = fmul <8 x double> %vald.i.i18.us, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
|
||||
%ptr939.us = getelementptr i8* %Aout_load488_ptr2int_2void, i64 %36, !filename !0, !first_line !20, !first_column !21, !last_line !20, !last_column !22
|
||||
%val0d.i.i5.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr939.us, <4 x double> %mask0d.i.i13.us) #0
|
||||
%ptr1.i.i6.us = getelementptr i8* %Aout_load488_ptr2int_2void, i64 %ptr912.sum.us
|
||||
%val1d.i.i7.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i6.us, <4 x double> %mask1d.i.i14.us) #0
|
||||
%vald.i.i8.us = shufflevector <4 x double> %val0d.i.i5.us, <4 x double> %val1d.i.i7.us, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
%sub_mul__Ain_load480_offset_load_Aout_load488_offset_load.us = fsub <8 x double> %mul__Ain_load480_offset_load.us, %vald.i.i8.us
|
||||
%ptr948.us = getelementptr i8* %vsq_load494_ptr2int_2void, i64 %36, !filename !0, !first_line !23, !first_column !24, !last_line !23, !last_column !7
|
||||
%val0d.i.i.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr948.us, <4 x double> %mask0d.i.i13.us) #0
|
||||
%ptr1.i.i.us = getelementptr i8* %vsq_load494_ptr2int_2void, i64 %ptr912.sum.us
|
||||
%val1d.i.i.us = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %ptr1.i.i.us, <4 x double> %mask1d.i.i14.us) #0
|
||||
%vald.i.i.us = shufflevector <4 x double> %val0d.i.i.us, <4 x double> %val1d.i.i.us, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
%mul_vsq_load494_offset_load_div_load499.us = fmul <8 x double> %add_add_add_mul_coef0_load306_broadcast_Ain_load310_offset_load_mul_coef1_load315_broadcast_add_add_add_add_add_Ain_load319_offset_load_Ain_load327_offset_load_Ain_load335_offset_load_Ain_load343_offset_load_Ain_load351_offset_load_Ain_load359_offset_load_mul_coef2_load364_broadcast_add_add_add_add_add_Ain_load368_offset_load_Ain_load376_offset_load_Ain_load384_offset_load_Ain_load392_offset_load_Ain_load400_offset_load_Ain_load408_offset_load_mul_coef3_load413_broadcast_add_add_add_add_add_Ain_load417_offset_load_Ain_load425_offset_load_Ain_load433_offset_load_Ain_load441_offset_load_Ain_load449_offset_load_Ain_load457_offset_load.us, %vald.i.i.us
|
||||
%add_sub_mul__Ain_load480_offset_load_Aout_load488_offset_load_mul_vsq_load494_offset_load_div_load499.us = fadd <8 x double> %sub_mul__Ain_load480_offset_load_Aout_load488_offset_load.us, %mul_vsq_load494_offset_load_div_load499.us
|
||||
%val0.i.i253.us = shufflevector <8 x double> %add_sub_mul__Ain_load480_offset_load_Aout_load488_offset_load_mul_vsq_load494_offset_load_div_load499.us, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%val1.i.i254.us = shufflevector <8 x double> %add_sub_mul__Ain_load480_offset_load_Aout_load488_offset_load_mul_vsq_load494_offset_load_div_load499.us, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
|
||||
call void @llvm.x86.avx.maskstore.pd.256(i8* %ptr939.us, <4 x double> %mask0d.i.i13.us, <4 x double> %val0.i.i253.us) #0
|
||||
call void @llvm.x86.avx.maskstore.pd.256(i8* %ptr1.i.i6.us, <4 x double> %mask1d.i.i14.us, <4 x double> %val1.i.i254.us) #0
|
||||
br label %safe_if_after_true466.us
|
||||
|
||||
safe_if_after_true466.us: ; preds = %safe_if_run_true467.us, %for_loop288.us
|
||||
%add_xb291_load_.us = add i32 %xb291.0263.us, 8
|
||||
%less_xb_load293_x1_load294.us = icmp slt i32 %add_xb291_load_.us, %x1
|
||||
br i1 %less_xb_load293_x1_load294.us, label %for_loop288.us, label %for_exit289.us
|
||||
|
||||
for_loop288.lr.ph.us: ; preds = %for_exit289.us, %for_test286.preheader.lr.ph
|
||||
%y280.0265.us = phi i32 [ %y_load517_plus1.us, %for_exit289.us ], [ %y0, %for_test286.preheader.lr.ph ]
|
||||
%mul_y_load302_Nx_load303.us = mul i32 %y280.0265.us, %Nx
|
||||
%add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303.us = add i32 %mul_y_load302_Nx_load303.us, %mul_z_load300_Nxy_load301
|
||||
%add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast788.elt0.us = add i32 %add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303.us, %Nx
|
||||
%add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast815.elt0.us = add i32 %add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303.us, %mul__Nx_load382
|
||||
%add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast824.elt0.us = sub i32 %add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303.us, %Nx
|
||||
%add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast833.elt0.us = add i32 %add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303.us, %mul__Nx_load431
|
||||
%add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast842.elt0.us = add i32 %add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303.us, %mul__Nx_load390
|
||||
%add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast851.elt0.us = add i32 %add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303.us, %mul_Nx_load_Ny_load
|
||||
%add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast860.elt0.us = add i32 %add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303.us, %mul__Nx_load439
|
||||
%add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast869.elt0.us = add i32 %add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303.us, %mul__Nxy_load399
|
||||
%add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast878.elt0.us = sub i32 %add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303.us, %mul_Nx_load_Ny_load
|
||||
%add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast887.elt0.us = add i32 %add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303.us, %mul__Nxy_load448
|
||||
%add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast896.elt0.us = add i32 %add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303.us, %mul__Nxy_load407
|
||||
%add_add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303_broadcast_xb_load298_broadcast914.elt0.us = add i32 %add_mul_z_load300_Nxy_load301_mul_y_load302_Nx_load303.us, %mul__Nxy_load456
|
||||
br label %for_loop288.us
|
||||
|
||||
for_exit278: ; preds = %for_exit289.us, %for_test286.preheader.lr.ph, %for_test275.preheader
|
||||
%z_load518_plus1 = add i32 %z269.0268, 1
|
||||
%exitcond272 = icmp eq i32 %z_load518_plus1, %z1
|
||||
br i1 %exitcond272, label %for_exit, label %for_test275.preheader
|
||||
}
|
||||
|
||||
; Function Attrs: nounwind
|
||||
; Task entry point: unpacks the argument struct passed as the first parameter and
; forwards to stencil_step for a one-slice z range [z0 + %3, z0 + %3 + 1).
; Operand 0 is a pointer to { 8 x i32, 4 x double*, <8 x i32> }; the ten trailing
; i32 parameters are unnamed and only %3 is read here (the derived value names
; suggest it is the task index -- TODO confirm against the ISPC task ABI).
define internal void @stencil_step_task___uniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_({ i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*, <8 x i32> }* noalias nocapture, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) #3 {
|
||||
allocas:
|
||||
; Struct fields 0..6: x0, x1, y0, y1, z0, Nx, Ny (one GEP + load per field).
%x01 = getelementptr { i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*, <8 x i32> }* %0, i64 0, i32 0
|
||||
%x02 = load i32* %x01, align 4
|
||||
%x13 = getelementptr { i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*, <8 x i32> }* %0, i64 0, i32 1
|
||||
%x14 = load i32* %x13, align 4
|
||||
%y05 = getelementptr { i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*, <8 x i32> }* %0, i64 0, i32 2
|
||||
%y06 = load i32* %y05, align 4
|
||||
%y17 = getelementptr { i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*, <8 x i32> }* %0, i64 0, i32 3
|
||||
%y18 = load i32* %y17, align 4
|
||||
%z09 = getelementptr { i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*, <8 x i32> }* %0, i64 0, i32 4
|
||||
%z010 = load i32* %z09, align 4
|
||||
%Nx11 = getelementptr { i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*, <8 x i32> }* %0, i64 0, i32 5
|
||||
%Nx12 = load i32* %Nx11, align 4
|
||||
%Ny13 = getelementptr { i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*, <8 x i32> }* %0, i64 0, i32 6
|
||||
%Ny14 = load i32* %Ny13, align 4
|
||||
; Struct field 7 is never read in this function; fields 8..11 are the four
; double* arguments (coef, vsq, Ain, Aout).
%coef17 = getelementptr { i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*, <8 x i32> }* %0, i64 0, i32 8
|
||||
%coef18 = load double** %coef17, align 8
|
||||
%vsq19 = getelementptr { i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*, <8 x i32> }* %0, i64 0, i32 9
|
||||
%vsq20 = load double** %vsq19, align 8
|
||||
%Ain21 = getelementptr { i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*, <8 x i32> }* %0, i64 0, i32 10
|
||||
%Ain22 = load double** %Ain21, align 8
|
||||
%Aout23 = getelementptr { i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*, <8 x i32> }* %0, i64 0, i32 11
|
||||
%Aout24 = load double** %Aout23, align 8
|
||||
; Struct field 12: the 8-lane execution mask stored by the launcher.
%task_struct_mask = getelementptr { i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*, <8 x i32> }* %0, i64 0, i32 12
|
||||
%mask = load <8 x i32>* %task_struct_mask, align 32
|
||||
; Collapse the mask to one bit per lane; 255 means all eight lane bits are set.
%floatmask.i = bitcast <8 x i32> %mask to <8 x float>
|
||||
%v.i = tail call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask.i) #1
|
||||
%cmp.i = icmp eq i32 %v.i, 255
|
||||
; z range handed to stencil_step: start = z0 + %3, end = (%3 + 1) + z0.
%add_z0_load_taskIndex_load = add i32 %z010, %3
|
||||
%add_z0_load27_taskIndex_load28 = add i32 %3, 1
|
||||
%add_add_z0_load27_taskIndex_load28_ = add i32 %add_z0_load27_taskIndex_load28, %z010
|
||||
br i1 %cmp.i, label %all_on, label %some_on
|
||||
|
||||
all_on: ; preds = %allocas
|
||||
; All lanes active: pass a constant all-ones mask instead of the loaded one.
tail call fastcc void @stencil_step___uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_(i32 %x02, i32 %x14, i32 %y06, i32 %y18, i32 %add_z0_load_taskIndex_load, i32 %add_add_z0_load27_taskIndex_load28_, i32 %Nx12, i32 %Ny14, double* %coef18, double* %vsq20, double* %Ain22, double* %Aout24, <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>)
|
||||
ret void
|
||||
|
||||
some_on: ; preds = %allocas
|
||||
; Not every lane bit is set: same call, but with the mask loaded from the struct.
tail call fastcc void @stencil_step___uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_(i32 %x02, i32 %x14, i32 %y06, i32 %y18, i32 %add_z0_load_taskIndex_load, i32 %add_add_z0_load27_taskIndex_load28_, i32 %Nx12, i32 %Ny14, double* %coef18, double* %vsq20, double* %Ain22, double* %Aout24, <8 x i32> %mask)
|
||||
ret void
|
||||
}
|
||||
|
||||
; Function Attrs: nounwind
|
||||
; Exported driver: for each t in [t0, t1) packs the stencil arguments into a
; freshly allocated buffer, launches the stencil task (z1 - z0) times through
; ISPCLaunch, and then waits on the launch group with ISPCSync.
; Aeven/Aodd swap roles on every step: when t is even Aeven is stored at the
; input slot and Aodd at the output slot, and the reverse when t is odd.
; Returns immediately without launching anything when t0 >= t1.
define void @loop_stencil_ispc_tasks(i32 %t0, i32 %t1, i32 %x0, i32 %x1, i32 %y0, i32 %y1, i32 %z0, i32 %z1, i32 %Nx, i32 %Ny, i32 %Nz, double* %coef, double* %vsq, double* %Aeven, double* %Aodd) #3 {
|
||||
allocas:
|
||||
; Launch-group handle slot; null means "nothing pending to sync".
%launch_group_handle = alloca i8*, align 8
|
||||
store i8* null, i8** %launch_group_handle, align 8
|
||||
%less_t_load_t1_load166 = icmp slt i32 %t0, %t1
|
||||
br i1 %less_t_load_t1_load166, label %for_loop.lr.ph, label %post_sync73
|
||||
|
||||
for_loop.lr.ph: ; preds = %allocas
|
||||
; Loop-invariant launch count: z1 - z0.
%sub_z1_load_z0_load23 = sub i32 %z1, %z0
|
||||
br label %for_loop
|
||||
|
||||
for_loop: ; preds = %post_sync, %for_loop.lr.ph
|
||||
|
||||
%t.0167 = phi i32 [ %t0, %for_loop.lr.ph ], [ %t_load69_plus1, %post_sync ]
|
||||
; Parity of t selects which buffer is input and which is output below.
%bitop = and i32 %t.0167, 1
|
||||
%equal_bitop_ = icmp eq i32 %bitop, 0
|
||||
; Argument buffer for this step. The stores below fill bytes 0..95 and the mask
; store needs 32-byte alignment, so 96/32 are presumably size/alignment
; (TODO confirm against the ISPCAlloc prototype).
%args_ptr = call i8* @ISPCAlloc(i8** %launch_group_handle, i64 96, i32 32)
|
||||
; Bytes 0..31: x0, x1, y0, y1, z0, Nx, Ny, Nz as consecutive i32 values.
%funarg = bitcast i8* %args_ptr to i32*
|
||||
store i32 %x0, i32* %funarg, align 4
|
||||
%funarg24 = getelementptr i8* %args_ptr, i64 4
|
||||
%0 = bitcast i8* %funarg24 to i32*
|
||||
store i32 %x1, i32* %0, align 4
|
||||
%funarg25 = getelementptr i8* %args_ptr, i64 8
|
||||
%1 = bitcast i8* %funarg25 to i32*
|
||||
store i32 %y0, i32* %1, align 4
|
||||
%funarg26 = getelementptr i8* %args_ptr, i64 12
|
||||
%2 = bitcast i8* %funarg26 to i32*
|
||||
store i32 %y1, i32* %2, align 4
|
||||
%funarg27 = getelementptr i8* %args_ptr, i64 16
|
||||
%3 = bitcast i8* %funarg27 to i32*
|
||||
store i32 %z0, i32* %3, align 4
|
||||
%funarg28 = getelementptr i8* %args_ptr, i64 20
|
||||
%4 = bitcast i8* %funarg28 to i32*
|
||||
store i32 %Nx, i32* %4, align 4
|
||||
%funarg29 = getelementptr i8* %args_ptr, i64 24
|
||||
%5 = bitcast i8* %funarg29 to i32*
|
||||
store i32 %Ny, i32* %5, align 4
|
||||
%funarg30 = getelementptr i8* %args_ptr, i64 28
|
||||
%6 = bitcast i8* %funarg30 to i32*
|
||||
store i32 %Nz, i32* %6, align 4
|
||||
; Bytes 32..47: coef and vsq pointers.
%funarg31 = getelementptr i8* %args_ptr, i64 32
|
||||
%7 = bitcast i8* %funarg31 to double**
|
||||
store double* %coef, double** %7, align 8
|
||||
%funarg32 = getelementptr i8* %args_ptr, i64 40
|
||||
%8 = bitcast i8* %funarg32 to double**
|
||||
store double* %vsq, double** %8, align 8
|
||||
; Byte 48 is the input-array slot; it is filled in if_then / if_else.
%funarg33 = getelementptr i8* %args_ptr, i64 48
|
||||
%9 = bitcast i8* %funarg33 to double**
|
||||
br i1 %equal_bitop_, label %if_then, label %if_else
|
||||
|
||||
for_exit: ; preds = %post_sync
|
||||
; Final sync after the loop, only if a launch group is still outstanding.
%launch_group_handle_load70.pre = load i8** %launch_group_handle, align 8
|
||||
%cmp71 = icmp eq i8* %launch_group_handle_load70.pre, null
|
||||
br i1 %cmp71, label %post_sync73, label %call_sync72
|
||||
|
||||
if_then: ; preds = %for_loop
|
||||
; Even t: Aeven at byte 48, Aodd at byte 56.
store double* %Aeven, double** %9, align 8
|
||||
%funarg34 = getelementptr i8* %args_ptr, i64 56
|
||||
%10 = bitcast i8* %funarg34 to double**
|
||||
store double* %Aodd, double** %10, align 8
|
||||
; Bytes 64..95: all-lanes-on execution mask.
%funarg_mask = getelementptr i8* %args_ptr, i64 64
|
||||
%11 = bitcast i8* %funarg_mask to <8 x i32>*
|
||||
store <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <8 x i32>* %11, align 32
|
||||
; Launch the task function; the trailing (z1 - z0, 1, 1) are presumably the
; launch counts per dimension -- TODO confirm against the ISPCLaunch prototype.
call void @ISPCLaunch(i8** %launch_group_handle, i8* bitcast (void ({ i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*, <8 x i32> }*, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)* @stencil_step_task___uniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_ to i8*), i8* %args_ptr, i32 %sub_z1_load_z0_load23, i32 1, i32 1)
|
||||
br label %if_exit
|
||||
|
||||
if_else: ; preds = %for_loop
|
||||
; Odd t: roles swapped -- Aodd at byte 48, Aeven at byte 56.
store double* %Aodd, double** %9, align 8
|
||||
%funarg64 = getelementptr i8* %args_ptr, i64 56
|
||||
%12 = bitcast i8* %funarg64 to double**
|
||||
store double* %Aeven, double** %12, align 8
|
||||
; Bytes 64..95: all-lanes-on execution mask.
%funarg_mask67 = getelementptr i8* %args_ptr, i64 64
|
||||
%13 = bitcast i8* %funarg_mask67 to <8 x i32>*
|
||||
store <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <8 x i32>* %13, align 32
|
||||
; Same launch as in if_then, with the swapped buffers.
call void @ISPCLaunch(i8** %launch_group_handle, i8* bitcast (void ({ i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*, <8 x i32> }*, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)* @stencil_step_task___uniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_ to i8*), i8* %args_ptr, i32 %sub_z1_load_z0_load23, i32 1, i32 1)
|
||||
br label %if_exit
|
||||
|
||||
if_exit: ; preds = %if_else, %if_then
|
||||
; Per-step barrier: sync only if the launch left a non-null group handle.
%launch_group_handle_load = load i8** %launch_group_handle, align 8
|
||||
%cmp = icmp eq i8* %launch_group_handle_load, null
|
||||
br i1 %cmp, label %post_sync, label %call_sync
|
||||
|
||||
call_sync: ; preds = %if_exit
|
||||
; Wait for this step's tasks, then clear the handle slot.
call void @ISPCSync(i8* %launch_group_handle_load)
|
||||
store i8* null, i8** %launch_group_handle, align 8
|
||||
br label %post_sync
|
||||
|
||||
post_sync: ; preds = %call_sync, %if_exit
|
||||
; Advance t; the loop ends when t + 1 == t1 (entry already guaranteed t0 < t1).
%t_load69_plus1 = add i32 %t.0167, 1
|
||||
%exitcond = icmp eq i32 %t_load69_plus1, %t1
|
||||
br i1 %exitcond, label %for_exit, label %for_loop
|
||||
|
||||
call_sync72: ; preds = %for_exit
|
||||
call void @ISPCSync(i8* %launch_group_handle_load70.pre)
|
||||
store i8* null, i8** %launch_group_handle, align 8
|
||||
br label %post_sync73
|
||||
|
||||
post_sync73: ; preds = %call_sync72, %for_exit, %allocas
|
||||
ret void
|
||||
}
|
||||
|
||||
; Attribute groups referenced by the `#N` suffixes in this module.
; #0 is attached to the AVX maskload/maskstore intrinsic calls.
attributes #0 = { nounwind }
|
||||
; #1 is attached to the movmsk intrinsic call.
attributes #1 = { nounwind readnone }
|
||||
attributes #2 = { nounwind readonly }
|
||||
; #3 is attached to the function definitions and pins the AVX target CPU/features.
attributes #3 = { nounwind "target-cpu"="corei7-avx" "target-features"="+avx,+popcnt,+cmov" }
|
||||
|
||||
; Source-position metadata. !0 is the !filename operand; the i32 nodes below are
; referenced from instructions as !first_line / !first_column / !last_line /
; !last_column operands (e.g. !8 and !18 on a getelementptr in stencil_step).
!0 = metadata !{metadata !"stencil.ispc"}
|
||||
!1 = metadata !{i32 68}
|
||||
!2 = metadata !{i32 69}
|
||||
!3 = metadata !{i32 113}
|
||||
!4 = metadata !{i32 22}
|
||||
!5 = metadata !{i32 66}
|
||||
!6 = metadata !{i32 71}
|
||||
!7 = metadata !{i32 23}
|
||||
!8 = metadata !{i32 67}
|
||||
!9 = metadata !{i32 74}
|
||||
!10 = metadata !{i32 72}
|
||||
!11 = metadata !{i32 24}
|
||||
!12 = metadata !{i32 70}
|
||||
!13 = metadata !{i32 114}
|
||||
!14 = metadata !{i32 75}
|
||||
!15 = metadata !{i32 115}
|
||||
!16 = metadata !{i32 73}
|
||||
!17 = metadata !{i32 76}
|
||||
!18 = metadata !{i32 21}
|
||||
!19 = metadata !{i32 64}
|
||||
!20 = metadata !{i32 79}
|
||||
!21 = metadata !{i32 112}
|
||||
!22 = metadata !{i32 156}
|
||||
!23 = metadata !{i32 80}
|
||||
!24 = metadata !{i32 13}
|
||||
BIN
examples_cuda/stencil/stencil_cu.o
Normal file
BIN
examples_cuda/stencil/stencil_cu.o
Normal file
Binary file not shown.
1134
examples_cuda/stencil/stencil_cu.s
Normal file
1134
examples_cuda/stencil/stencil_cu.s
Normal file
File diff suppressed because it is too large
Load Diff
BIN
examples_cuda/stencil/stencil_cu_avx.bc
Normal file
BIN
examples_cuda/stencil/stencil_cu_avx.bc
Normal file
Binary file not shown.
214
examples_cuda/stencil/stencil_cu_avx.s
Normal file
214
examples_cuda/stencil/stencil_cu_avx.s
Normal file
File diff suppressed because one or more lines are too long
BIN
examples_cuda/stencil/stencil_cu_nvptx64.bc
Normal file
BIN
examples_cuda/stencil/stencil_cu_nvptx64.bc
Normal file
Binary file not shown.
BIN
examples_cuda/stencil/stencil_cu_nvptx64.cubin
Normal file
BIN
examples_cuda/stencil/stencil_cu_nvptx64.cubin
Normal file
Binary file not shown.
269
examples_cuda/stencil/stencil_cu_nvptx64.ll
Normal file
269
examples_cuda/stencil/stencil_cu_nvptx64.ll
Normal file
@@ -0,0 +1,269 @@
|
||||
; ModuleID = 'stencil_cu_nvptx64.bc'
|
||||
; Module header for the nvptx64 build of the stencil kernel: 64-bit pointers,
; little-endian data layout, nvptx64 target triple.
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
|
||||
target triple = "nvptx64"
|
||||
|
||||
; NVVM special-register read intrinsics called by stencil_step_task below:
; tid.x feeds the per-thread x offset, ctaid.x is added to z0 to pick the z slice,
; and warpsize is used as (warpsize - 1) to mask tid.x.
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #0
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.warpsize() #0
|
||||
|
||||
; Function Attrs: nounwind
|
||||
define void @stencil_step_task(i32 %x0, i32 %x1, i32 %y0, i32 %y1, i32 %z0, i32 %Nx, i32 %Ny, i32 %Nz, double* nocapture %coef, double* %vsq, double* %Ain, double* %Aout) #1 {
|
||||
allocas:
|
||||
%bid.i.i = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2
|
||||
%add_z0_load_calltmp = add i32 %bid.i.i, %z0
|
||||
%bid.i.i21 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2
|
||||
%add_z0_load15_calltmp18 = add i32 %z0, 1
|
||||
%add_add_z0_load15_calltmp18_ = add i32 %add_z0_load15_calltmp18, %bid.i.i21
|
||||
%mul_Nx_load_Ny_load.i = mul i32 %Ny, %Nx
|
||||
%coef_load_offset_load.i = load double* %coef, align 8
|
||||
%coef_load16_offset.i = getelementptr double* %coef, i64 1
|
||||
%coef_load16_offset_load.i = load double* %coef_load16_offset.i, align 8
|
||||
%coef_load19_offset.i = getelementptr double* %coef, i64 2
|
||||
%coef_load19_offset_load.i = load double* %coef_load19_offset.i, align 8
|
||||
%coef_load22_offset.i = getelementptr double* %coef, i64 3
|
||||
%coef_load22_offset_load.i = load double* %coef_load22_offset.i, align 8
|
||||
%less_z_load_z1_load.i161 = icmp slt i32 %add_z0_load_calltmp, %add_add_z0_load15_calltmp18_
|
||||
br i1 %less_z_load_z1_load.i161, label %for_test28.i.preheader.lr.ph, label %stencil_step___uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_.exit
|
||||
|
||||
for_test28.i.preheader.lr.ph: ; preds = %allocas
|
||||
%less_y_load_y1_load.i159 = icmp slt i32 %y0, %y1
|
||||
%less_xb_load_x1_load.i157 = icmp slt i32 %x0, %x1
|
||||
%x1_load199_broadcast_init.i = insertelement <1 x i32> undef, i32 %x1, i32 0
|
||||
%mul__Nx_load119.i = shl i32 %Nx, 1
|
||||
%mul__Nx_load167.i = mul i32 %Nx, 3
|
||||
%mul__Nx_load127.i = mul i32 %Nx, -2
|
||||
%Ain_load65_ptr2int.i = ptrtoint double* %Ain to i64
|
||||
%mul__Nx_load175.i = mul i32 %Nx, -3
|
||||
%mul__Nxy_load136.i = shl i32 %mul_Nx_load_Ny_load.i, 1
|
||||
%mul__Nxy_load184.i = mul i32 %mul_Nx_load_Ny_load.i, 3
|
||||
%mul__Nxy_load144.i = mul i32 %mul_Nx_load_Ny_load.i, -2
|
||||
%mul__Nxy_load192.i = mul i32 %mul_Nx_load_Ny_load.i, -3
|
||||
%Aout_load_ptr2int.i = ptrtoint double* %Aout to i64
|
||||
%vsq_load_ptr2int.i = ptrtoint double* %vsq to i64
|
||||
%0 = add i32 %bid.i.i21, %z0
|
||||
br label %for_test28.i.preheader
|
||||
|
||||
for_test28.i.preheader: ; preds = %for_exit31.i, %for_test28.i.preheader.lr.ph
|
||||
%z.0.i162 = phi i32 [ %add_z0_load_calltmp, %for_test28.i.preheader.lr.ph ], [ %z_load245_plus1.i, %for_exit31.i ]
|
||||
br i1 %less_y_load_y1_load.i159, label %for_test35.i.preheader.lr.ph, label %for_exit31.i
|
||||
|
||||
for_test35.i.preheader.lr.ph: ; preds = %for_test28.i.preheader
|
||||
%mul_z_load45_Nxy_load.i = mul i32 %z.0.i162, %mul_Nx_load_Ny_load.i
|
||||
br i1 %less_xb_load_x1_load.i157, label %for_loop37.i.lr.ph.us, label %for_exit31.i
|
||||
|
||||
for_exit38.i.us: ; preds = %safe_if_after_true.i.us
|
||||
%y_load244_plus1.i.us = add i32 %y.0.i160.us, 1
|
||||
%exitcond = icmp eq i32 %y_load244_plus1.i.us, %y1
|
||||
br i1 %exitcond, label %for_exit31.i, label %for_loop37.i.lr.ph.us
|
||||
|
||||
for_loop37.i.us: ; preds = %for_loop37.i.lr.ph.us, %safe_if_after_true.i.us
|
||||
%xb.0.i158.us = phi i32 [ %x0, %for_loop37.i.lr.ph.us ], [ %add_xb_load243_calltmp241.i.us, %safe_if_after_true.i.us ]
|
||||
%tid.i.i.i.us = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2
|
||||
%tid.i.i.i.i.us = tail call i32 @llvm.nvvm.read.ptx.sreg.warpsize() #2
|
||||
%sub_calltmp3_.i.i.us = add i32 %tid.i.i.i.i.us, -1
|
||||
%bitop.i.i.us = and i32 %sub_calltmp3_.i.i.us, %tid.i.i.i.us
|
||||
%add_xb_load42_calltmp.i.us = add i32 %bitop.i.i.us, %xb.0.i158.us
|
||||
%add_xb_load42_calltmp_broadcast_init.i.us = insertelement <1 x i32> undef, i32 %add_xb_load42_calltmp.i.us, i32 0
|
||||
%less_x_load198_x1_load199_broadcast.i.us = icmp slt <1 x i32> %add_xb_load42_calltmp_broadcast_init.i.us, %x1_load199_broadcast_init.i
|
||||
%v.i.i.us = extractelement <1 x i1> %less_x_load198_x1_load199_broadcast.i.us, i32 0
|
||||
br i1 %v.i.i.us, label %pl_dolane.i.us, label %safe_if_after_true.i.us
|
||||
|
||||
pl_dolane.i.us: ; preds = %for_loop37.i.us
|
||||
%.lhs.lhs.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.i.us, %add_xb_load42_calltmp.i.us
|
||||
%.lhs.us = shl i32 %.lhs.lhs.us, 3
|
||||
%1 = add i32 %.lhs.us, -8
|
||||
%iptr__id.i.rhs.us = sext i32 %1 to i64
|
||||
%iptr__id.i.us = add i64 %iptr__id.i.rhs.us, %Ain_load65_ptr2int.i
|
||||
%ptr__id.i.us = inttoptr i64 %iptr__id.i.us to double*
|
||||
%val__id.i.us = load double* %ptr__id.i.us, align 8
|
||||
%2 = add i32 %.lhs.us, 8
|
||||
%iptr__id.i130.rhs.us = sext i32 %2 to i64
|
||||
%iptr__id.i130.us = add i64 %iptr__id.i130.rhs.us, %Ain_load65_ptr2int.i
|
||||
%ptr__id.i131.us = inttoptr i64 %iptr__id.i130.us to double*
|
||||
%val__id.i132.us = load double* %ptr__id.i131.us, align 8
|
||||
%3 = add i32 %.lhs.us, -16
|
||||
%iptr__id.i125.rhs.us = sext i32 %3 to i64
|
||||
%iptr__id.i125.us = add i64 %iptr__id.i125.rhs.us, %Ain_load65_ptr2int.i
|
||||
%ptr__id.i126.us = inttoptr i64 %iptr__id.i125.us to double*
|
||||
%val__id.i127.us = load double* %ptr__id.i126.us, align 8
|
||||
%4 = add i32 %.lhs.us, 16
|
||||
%iptr__id.i120.rhs.us = sext i32 %4 to i64
|
||||
%iptr__id.i120.us = add i64 %iptr__id.i120.rhs.us, %Ain_load65_ptr2int.i
|
||||
%ptr__id.i121.us = inttoptr i64 %iptr__id.i120.us to double*
|
||||
%val__id.i122.us = load double* %ptr__id.i121.us, align 8
|
||||
%.lhs138.us = add i32 %.lhs138.lhs.us, %add_xb_load42_calltmp.i.us
|
||||
%5 = shl i32 %.lhs138.us, 3
|
||||
%iptr__id.i115.rhs.us = sext i32 %5 to i64
|
||||
%iptr__id.i115.us = add i64 %iptr__id.i115.rhs.us, %Ain_load65_ptr2int.i
|
||||
%ptr__id.i116.us = inttoptr i64 %iptr__id.i115.us to double*
|
||||
%val__id.i117.us = load double* %ptr__id.i116.us, align 8
|
||||
%6 = add i32 %.lhs.us, -24
|
||||
%iptr__id.i110.rhs.us = sext i32 %6 to i64
|
||||
%iptr__id.i110.us = add i64 %iptr__id.i110.rhs.us, %Ain_load65_ptr2int.i
|
||||
%ptr__id.i111.us = inttoptr i64 %iptr__id.i110.us to double*
|
||||
%val__id.i112.us = load double* %ptr__id.i111.us, align 8
|
||||
%7 = add i32 %.lhs.us, 24
|
||||
%iptr__id.i105.rhs.us = sext i32 %7 to i64
|
||||
%iptr__id.i105.us = add i64 %iptr__id.i105.rhs.us, %Ain_load65_ptr2int.i
|
||||
%ptr__id.i106.us = inttoptr i64 %iptr__id.i105.us to double*
|
||||
%val__id.i107.us = load double* %ptr__id.i106.us, align 8
|
||||
%.lhs141.us = add i32 %.lhs141.lhs.us, %add_xb_load42_calltmp.i.us
|
||||
%8 = shl i32 %.lhs141.us, 3
|
||||
%iptr__id.i100.rhs.us = sext i32 %8 to i64
|
||||
%iptr__id.i100.us = add i64 %iptr__id.i100.rhs.us, %Ain_load65_ptr2int.i
|
||||
%ptr__id.i101.us = inttoptr i64 %iptr__id.i100.us to double*
|
||||
%val__id.i102.us = load double* %ptr__id.i101.us, align 8
|
||||
%.lhs142.us = add i32 %.lhs142.lhs.us, %add_xb_load42_calltmp.i.us
|
||||
%9 = shl i32 %.lhs142.us, 3
|
||||
%iptr__id.i95.rhs.us = sext i32 %9 to i64
|
||||
%iptr__id.i95.us = add i64 %iptr__id.i95.rhs.us, %Ain_load65_ptr2int.i
|
||||
%ptr__id.i96.us = inttoptr i64 %iptr__id.i95.us to double*
|
||||
%val__id.i97.us = load double* %ptr__id.i96.us, align 8
|
||||
%.lhs143.us = add i32 %.lhs143.lhs.us, %add_xb_load42_calltmp.i.us
|
||||
%10 = shl i32 %.lhs143.us, 3
|
||||
%iptr__id.i90.rhs.us = sext i32 %10 to i64
|
||||
%iptr__id.i90.us = add i64 %iptr__id.i90.rhs.us, %Ain_load65_ptr2int.i
|
||||
%ptr__id.i91.us = inttoptr i64 %iptr__id.i90.us to double*
|
||||
%val__id.i92.us = load double* %ptr__id.i91.us, align 8
|
||||
%.lhs144.us = add i32 %.lhs144.lhs.us, %add_xb_load42_calltmp.i.us
|
||||
%11 = shl i32 %.lhs144.us, 3
|
||||
%iptr__id.i85.rhs.us = sext i32 %11 to i64
|
||||
%iptr__id.i85.us = add i64 %iptr__id.i85.rhs.us, %Ain_load65_ptr2int.i
|
||||
%ptr__id.i86.us = inttoptr i64 %iptr__id.i85.us to double*
|
||||
%val__id.i87.us = load double* %ptr__id.i86.us, align 8
|
||||
%.lhs145.us = add i32 %.lhs145.lhs.us, %add_xb_load42_calltmp.i.us
|
||||
%12 = shl i32 %.lhs145.us, 3
|
||||
%iptr__id.i80.rhs.us = sext i32 %12 to i64
|
||||
%iptr__id.i80.us = add i64 %iptr__id.i80.rhs.us, %Ain_load65_ptr2int.i
|
||||
%ptr__id.i81.us = inttoptr i64 %iptr__id.i80.us to double*
|
||||
%val__id.i82.us = load double* %ptr__id.i81.us, align 8
|
||||
%.lhs146.us = add i32 %.lhs146.lhs.us, %add_xb_load42_calltmp.i.us
|
||||
%13 = shl i32 %.lhs146.us, 3
|
||||
%iptr__id.i75.rhs.us = sext i32 %13 to i64
|
||||
%iptr__id.i75.us = add i64 %iptr__id.i75.rhs.us, %Ain_load65_ptr2int.i
|
||||
%ptr__id.i76.us = inttoptr i64 %iptr__id.i75.us to double*
|
||||
%val__id.i77.us = load double* %ptr__id.i76.us, align 8
|
||||
%.lhs147.us = add i32 %.lhs147.lhs.us, %add_xb_load42_calltmp.i.us
|
||||
%14 = shl i32 %.lhs147.us, 3
|
||||
%iptr__id.i70.rhs.us = sext i32 %14 to i64
|
||||
%iptr__id.i70.us = add i64 %iptr__id.i70.rhs.us, %Ain_load65_ptr2int.i
|
||||
%ptr__id.i71.us = inttoptr i64 %iptr__id.i70.us to double*
|
||||
%val__id.i72.us = load double* %ptr__id.i71.us, align 8
|
||||
%.lhs148.us = add i32 %.lhs148.lhs.us, %add_xb_load42_calltmp.i.us
|
||||
%15 = shl i32 %.lhs148.us, 3
|
||||
%iptr__id.i65.rhs.us = sext i32 %15 to i64
|
||||
%iptr__id.i65.us = add i64 %iptr__id.i65.rhs.us, %Ain_load65_ptr2int.i
|
||||
%ptr__id.i66.us = inttoptr i64 %iptr__id.i65.us to double*
|
||||
%val__id.i67.us = load double* %ptr__id.i66.us, align 8
|
||||
%.lhs149.us = add i32 %.lhs149.lhs.us, %add_xb_load42_calltmp.i.us
|
||||
%16 = shl i32 %.lhs149.us, 3
|
||||
%iptr__id.i60.rhs.us = sext i32 %16 to i64
|
||||
%iptr__id.i60.us = add i64 %iptr__id.i60.rhs.us, %Ain_load65_ptr2int.i
|
||||
%ptr__id.i61.us = inttoptr i64 %iptr__id.i60.us to double*
|
||||
%val__id.i62.us = load double* %ptr__id.i61.us, align 8
|
||||
%.lhs150.us = add i32 %.lhs150.lhs.us, %add_xb_load42_calltmp.i.us
|
||||
%17 = shl i32 %.lhs150.us, 3
|
||||
%iptr__id.i55.rhs.us = sext i32 %17 to i64
|
||||
%iptr__id.i55.us = add i64 %iptr__id.i55.rhs.us, %Ain_load65_ptr2int.i
|
||||
%ptr__id.i56.us = inttoptr i64 %iptr__id.i55.us to double*
|
||||
%val__id.i57.us = load double* %ptr__id.i56.us, align 8
|
||||
%.lhs151.us = add i32 %add_xb_load42_calltmp.i.us, %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.i.us
|
||||
%18 = shl i32 %.lhs151.us, 3
|
||||
%iptr__id.i50.rhs.us = sext i32 %18 to i64
|
||||
%iptr__id.i50.us = add i64 %iptr__id.i50.rhs.us, %Ain_load65_ptr2int.i
|
||||
%ptr__id.i51.us = inttoptr i64 %iptr__id.i50.us to double*
|
||||
%val__id.i52.us = load double* %ptr__id.i51.us, align 8
|
||||
%.lhs152.us = add i32 %.lhs152.lhs.us, %add_xb_load42_calltmp.i.us
|
||||
%19 = shl i32 %.lhs152.us, 3
|
||||
%iptr__id.i45.rhs.us = sext i32 %19 to i64
|
||||
%iptr__id.i45.us = add i64 %iptr__id.i45.rhs.us, %Ain_load65_ptr2int.i
|
||||
%ptr__id.i46.us = inttoptr i64 %iptr__id.i45.us to double*
|
||||
%val__id.i47.us = load double* %ptr__id.i46.us, align 8
|
||||
%val__id.i41.us = load double* %ptr__id.i51.us, align 8
|
||||
%iptr__id.i32.us = add i64 %iptr__id.i50.rhs.us, %Aout_load_ptr2int.i
|
||||
%ptr__id.i33.us = inttoptr i64 %iptr__id.i32.us to double*
|
||||
%val__id.i34.us = load double* %ptr__id.i33.us, align 8
|
||||
%iptr__id.i27.rhs.us = sext i32 %.lhs.us to i64
|
||||
%iptr__id.i27.us = add i64 %iptr__id.i27.rhs.us, %vsq_load_ptr2int.i
|
||||
%ptr__id.i28.us = inttoptr i64 %iptr__id.i27.us to double*
|
||||
%val__id.i29.us = load double* %ptr__id.i28.us, align 8
|
||||
%iptr__id.i23.us = add i64 %iptr__id.i50.rhs.us, %Aout_load_ptr2int.i
|
||||
%ptr__id.i24.us = inttoptr i64 %iptr__id.i23.us to double*
|
||||
%val__id.i25.lhs.us.lhs = fmul double %val__id.i41.us, 2.000000e+00
|
||||
%val__id.i25.lhs.us = fsub double %val__id.i25.lhs.us.lhs, %val__id.i34.us
|
||||
%val__id.i25.rhs.rhs.lhs.lhs.rhs.lhs.lhs.lhs.lhs.us = fadd double %val__id.i127.us, %val__id.i122.us
|
||||
%val__id.i25.rhs.rhs.lhs.lhs.rhs.lhs.lhs.lhs.us = fadd double %val__id.i25.rhs.rhs.lhs.lhs.rhs.lhs.lhs.lhs.lhs.us, %val__id.i102.us
|
||||
%val__id.i25.rhs.rhs.lhs.lhs.rhs.lhs.lhs.us = fadd double %val__id.i25.rhs.rhs.lhs.lhs.rhs.lhs.lhs.lhs.us, %val__id.i87.us
|
||||
%val__id.i25.rhs.rhs.lhs.lhs.rhs.lhs.us = fadd double %val__id.i25.rhs.rhs.lhs.lhs.rhs.lhs.lhs.us, %val__id.i72.us
|
||||
%val__id.i25.rhs.rhs.lhs.lhs.rhs.us = fadd double %val__id.i25.rhs.rhs.lhs.lhs.rhs.lhs.us, %val__id.i57.us
|
||||
%val__id.i25.rhs.rhs.lhs.lhs.us = fmul double %coef_load19_offset_load.i, %val__id.i25.rhs.rhs.lhs.lhs.rhs.us
|
||||
%val__id.i25.rhs.rhs.lhs.rhs.lhs.rhs.lhs.lhs.lhs.lhs.us = fadd double %val__id.i.us, %val__id.i132.us
|
||||
%val__id.i25.rhs.rhs.lhs.rhs.lhs.rhs.lhs.lhs.lhs.us = fadd double %val__id.i25.rhs.rhs.lhs.rhs.lhs.rhs.lhs.lhs.lhs.lhs.us, %val__id.i117.us
|
||||
%val__id.i25.rhs.rhs.lhs.rhs.lhs.rhs.lhs.lhs.us = fadd double %val__id.i25.rhs.rhs.lhs.rhs.lhs.rhs.lhs.lhs.lhs.us, %val__id.i97.us
|
||||
%val__id.i25.rhs.rhs.lhs.rhs.lhs.rhs.lhs.us = fadd double %val__id.i25.rhs.rhs.lhs.rhs.lhs.rhs.lhs.lhs.us, %val__id.i82.us
|
||||
%val__id.i25.rhs.rhs.lhs.rhs.lhs.rhs.us = fadd double %val__id.i25.rhs.rhs.lhs.rhs.lhs.rhs.lhs.us, %val__id.i67.us
|
||||
%val__id.i25.rhs.rhs.lhs.rhs.lhs.us = fmul double %coef_load16_offset_load.i, %val__id.i25.rhs.rhs.lhs.rhs.lhs.rhs.us
|
||||
%val__id.i25.rhs.rhs.lhs.rhs.rhs.us = fmul double %coef_load_offset_load.i, %val__id.i52.us
|
||||
%val__id.i25.rhs.rhs.lhs.rhs.us = fadd double %val__id.i25.rhs.rhs.lhs.rhs.lhs.us, %val__id.i25.rhs.rhs.lhs.rhs.rhs.us
|
||||
%val__id.i25.rhs.rhs.lhs.us = fadd double %val__id.i25.rhs.rhs.lhs.lhs.us, %val__id.i25.rhs.rhs.lhs.rhs.us
|
||||
%val__id.i25.rhs.rhs.rhs.rhs.lhs.lhs.lhs.lhs.us = fadd double %val__id.i112.us, %val__id.i107.us
|
||||
%val__id.i25.rhs.rhs.rhs.rhs.lhs.lhs.lhs.us = fadd double %val__id.i25.rhs.rhs.rhs.rhs.lhs.lhs.lhs.lhs.us, %val__id.i92.us
|
||||
%val__id.i25.rhs.rhs.rhs.rhs.lhs.lhs.us = fadd double %val__id.i25.rhs.rhs.rhs.rhs.lhs.lhs.lhs.us, %val__id.i77.us
|
||||
%val__id.i25.rhs.rhs.rhs.rhs.lhs.us = fadd double %val__id.i25.rhs.rhs.rhs.rhs.lhs.lhs.us, %val__id.i62.us
|
||||
%val__id.i25.rhs.rhs.rhs.rhs.us = fadd double %val__id.i25.rhs.rhs.rhs.rhs.lhs.us, %val__id.i47.us
|
||||
%val__id.i25.rhs.rhs.rhs.us = fmul double %coef_load22_offset_load.i, %val__id.i25.rhs.rhs.rhs.rhs.us
|
||||
%val__id.i25.rhs.rhs.us = fadd double %val__id.i25.rhs.rhs.lhs.us, %val__id.i25.rhs.rhs.rhs.us
|
||||
%val__id.i25.rhs.us = fmul double %val__id.i25.rhs.rhs.us, %val__id.i29.us
|
||||
%val__id.i25.us = fadd double %val__id.i25.lhs.us, %val__id.i25.rhs.us
|
||||
store double %val__id.i25.us, double* %ptr__id.i24.us, align 8
|
||||
br label %safe_if_after_true.i.us
|
||||
|
||||
safe_if_after_true.i.us: ; preds = %pl_dolane.i.us, %for_loop37.i.us
|
||||
%tid.i.i1.i.us = tail call i32 @llvm.nvvm.read.ptx.sreg.warpsize() #2
|
||||
%add_xb_load243_calltmp241.i.us = add i32 %tid.i.i1.i.us, %xb.0.i158.us
|
||||
%less_xb_load_x1_load.i.us = icmp slt i32 %add_xb_load243_calltmp241.i.us, %x1
|
||||
br i1 %less_xb_load_x1_load.i.us, label %for_loop37.i.us, label %for_exit38.i.us
|
||||
|
||||
for_loop37.i.lr.ph.us: ; preds = %for_exit38.i.us, %for_test35.i.preheader.lr.ph
|
||||
%y.0.i160.us = phi i32 [ %y_load244_plus1.i.us, %for_exit38.i.us ], [ %y0, %for_test35.i.preheader.lr.ph ]
|
||||
%mul_y_load46_Nx_load47.i.us = mul i32 %y.0.i160.us, %Nx
|
||||
%add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.i.us = add i32 %mul_y_load46_Nx_load47.i.us, %mul_z_load45_Nxy_load.i
|
||||
%.lhs138.lhs.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.i.us, %Nx
|
||||
%.lhs141.lhs.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.i.us, %mul__Nx_load119.i
|
||||
%.lhs142.lhs.us = sub i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.i.us, %Nx
|
||||
%.lhs143.lhs.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.i.us, %mul__Nx_load167.i
|
||||
%.lhs144.lhs.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.i.us, %mul__Nx_load127.i
|
||||
%.lhs145.lhs.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.i.us, %mul_Nx_load_Ny_load.i
|
||||
%.lhs146.lhs.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.i.us, %mul__Nx_load175.i
|
||||
%.lhs147.lhs.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.i.us, %mul__Nxy_load136.i
|
||||
%.lhs148.lhs.us = sub i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.i.us, %mul_Nx_load_Ny_load.i
|
||||
%.lhs149.lhs.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.i.us, %mul__Nxy_load184.i
|
||||
%.lhs150.lhs.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.i.us, %mul__Nxy_load144.i
|
||||
%.lhs152.lhs.us = add i32 %add_mul_z_load45_Nxy_load_mul_y_load46_Nx_load47.i.us, %mul__Nxy_load192.i
|
||||
br label %for_loop37.i.us
|
||||
|
||||
for_exit31.i: ; preds = %for_exit38.i.us, %for_test35.i.preheader.lr.ph, %for_test28.i.preheader
|
||||
%z_load245_plus1.i = add i32 %z.0.i162, 1
|
||||
%exitcond163 = icmp eq i32 %z.0.i162, %0
|
||||
br i1 %exitcond163, label %stencil_step___uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_.exit, label %for_test28.i.preheader
|
||||
|
||||
stencil_step___uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_.exit: ; preds = %for_exit31.i, %allocas
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { nounwind readnone }
|
||||
attributes #1 = { nounwind "target-features"="+sm_35" }
|
||||
attributes #2 = { nounwind }
|
||||
|
||||
!nvvm.annotations = !{!0}
|
||||
|
||||
!0 = metadata !{void (i32, i32, i32, i32, i32, i32, i32, i32, double*, double*, double*, double*)* @stencil_step_task, metadata !"kernel", i32 1}
|
||||
!1 = metadata !{ }
|
||||
!2 = metadata !{ metadata !"output", metadata !0 }
|
||||
!3 = metadata !{ metadata !"input1", metadata !0 }
|
||||
!4 = metadata !{ metadata !"input2", metadata !0 }
|
||||
35
examples_cuda/stencil/stencil_ispc.h
Normal file
35
examples_cuda/stencil/stencil_ispc.h
Normal file
@@ -0,0 +1,35 @@
|
||||
//
|
||||
// stencil_ispc.h
|
||||
// (Header automatically generated by the ispc compiler.)
|
||||
// DO NOT EDIT THIS FILE.
|
||||
//
|
||||
|
||||
#ifndef ISPC_STENCIL_ISPC_H
|
||||
#define ISPC_STENCIL_ISPC_H
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
namespace ispc { /* namespace */
|
||||
#endif // __cplusplus
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// Functions exported from ispc code
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
#if defined(__cplusplus) && !defined(__ISPC_NO_EXTERN_C)
|
||||
extern "C" {
|
||||
#endif // __cplusplus
|
||||
// Entry point exported from the ispc-compiled stencil code.
// NOTE(review): the implementation is not visible in this header. The
// parameter list matches loop_stencil_serial() in stencil_serial.cpp
// (time range t0/t1, box x0..z1, grid size Nx/Ny/Nz, coef, vsq and the two
// double buffers Aeven/Aodd), so it presumably performs the same
// time-stepped stencil -- confirm against the .ispc source.
extern void loop_stencil_ispc(int32_t t0, int32_t t1, int32_t x0, int32_t x1, int32_t y0, int32_t y1, int32_t z0, int32_t z1, int32_t Nx, int32_t Ny, int32_t Nz, const double * coef, const double * vsq, double * Aeven, double * Aodd);
|
||||
// Same signature as loop_stencil_ispc().
// NOTE(review): judging by the name this is the task-parallel variant --
// confirm against the .ispc source.
extern void loop_stencil_ispc_tasks(int32_t t0, int32_t t1, int32_t x0, int32_t x1, int32_t y0, int32_t y1, int32_t z0, int32_t z1, int32_t Nx, int32_t Ny, int32_t Nz, const double * coef, const double * vsq, double * Aeven, double * Aodd);
|
||||
#if defined(__cplusplus) && !defined(__ISPC_NO_EXTERN_C)
|
||||
} /* end extern C */
|
||||
#endif // __cplusplus
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
} /* namespace */
|
||||
#endif // __cplusplus
|
||||
|
||||
#endif // ISPC_STENCIL_ISPC_H
|
||||
BIN
examples_cuda/stencil/stencil_nvptx64.bc
Normal file
BIN
examples_cuda/stencil/stencil_nvptx64.bc
Normal file
Binary file not shown.
87
examples_cuda/stencil/stencil_parallel.cpp
Normal file
87
examples_cuda/stencil/stencil_parallel.cpp
Normal file
@@ -0,0 +1,87 @@
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
|
||||
// Apply one step of a radius-3, axis-aligned stencil to every cell of the
// box [x0,x1) x [y0,y1) x [z0,z1).
//
// The grid is stored flattened with x varying fastest: cell (x, y, z) lives
// at index z*Nx*Ny + y*Nx + x.  For each cell c in the box:
//
//   div     = coef[0] * Ain[c]
//           + coef[k] * (sum of the six cells k steps from c along
//                        +/-x, +/-y and +/-z), for k = 1, 2, 3
//   Aout[c] = 2 * Ain[c] - Aout[c] + vsq[c] * div
//
// Parameters:
//   x0..z1      half-open bounds of the box to update.
//   Nx, Ny      grid extents used to build the flattened index.
//   Nz          unused; kept so the signature stays unchanged.
//   coef        four weights, one per distance 0..3.
//   vsq         per-cell multiplier applied to div.
//   Ain         input grid (read only).
//   Aout        grid updated in place; its previous value is read as well.
//
// Ain is read up to three cells outside the box along every axis and no
// bounds checking is done, so the caller must leave a halo of at least
// three cells around the box.
//
// The z loop is distributed with OpenMP; each iteration writes only cells
// of its own z plane in Aout.
static void
stencil_step(int x0, int x1,
             int y0, int y1,
             int z0, int z1,
             int Nx, int Ny, int Nz,
             const float coef[4], const float vsq[],
             const float Ain[], float Aout[]) {
    (void)Nz;

    // 64-bit strides: Nx*Ny and the flattened index no longer overflow
    // 'int' on grids with more than 2^31 cells.
    const long long strideY = Nx;
    const long long strideZ = (long long)Nx * Ny;

#pragma omp parallel for
    for (int z = z0; z < z1; ++z) {
        for (int y = y0; y < y1; ++y) {
            for (int x = x0; x < x1; ++x) {
                const long long index = z * strideZ + y * strideY + x;

                // Kept as one expression, in the original operand order, so
                // the floating-point result is unchanged.
                float div = coef[0] * Ain[index] +
                    coef[1] * (Ain[index + 1] + Ain[index - 1] +
                               Ain[index + strideY] + Ain[index - strideY] +
                               Ain[index + strideZ] + Ain[index - strideZ]) +
                    coef[2] * (Ain[index + 2] + Ain[index - 2] +
                               Ain[index + 2 * strideY] + Ain[index - 2 * strideY] +
                               Ain[index + 2 * strideZ] + Ain[index - 2 * strideZ]) +
                    coef[3] * (Ain[index + 3] + Ain[index - 3] +
                               Ain[index + 3 * strideY] + Ain[index - 3 * strideY] +
                               Ain[index + 3 * strideZ] + Ain[index - 3 * strideZ]);

                Aout[index] = 2 * Ain[index] - Aout[index] +
                    vsq[index] * div;
            }
        }
    }
}
|
||||
|
||||
|
||||
void loop_stencil_parallel(int t0, int t1,
|
||||
int x0, int x1,
|
||||
int y0, int y1,
|
||||
int z0, int z1,
|
||||
int Nx, int Ny, int Nz,
|
||||
const float coef[4],
|
||||
const float vsq[],
|
||||
float Aeven[], float Aodd[])
|
||||
{
|
||||
for (int t = t0; t < t1; ++t) {
|
||||
if ((t & 1) == 0)
|
||||
stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq,
|
||||
Aeven, Aodd);
|
||||
else
|
||||
stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq,
|
||||
Aodd, Aeven);
|
||||
}
|
||||
}
|
||||
86
examples_cuda/stencil/stencil_serial.cpp
Normal file
86
examples_cuda/stencil/stencil_serial.cpp
Normal file
@@ -0,0 +1,86 @@
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
|
||||
// Apply one step of a radius-3, axis-aligned stencil to every cell of the
// box [x0,x1) x [y0,y1) x [z0,z1).
//
// The grid is stored flattened with x varying fastest: cell (x, y, z) lives
// at index z*Nx*Ny + y*Nx + x.  For each cell c in the box:
//
//   div     = coef[0] * Ain[c]
//           + coef[k] * (sum of the six cells k steps from c along
//                        +/-x, +/-y and +/-z), for k = 1, 2, 3
//   Aout[c] = 2 * Ain[c] - Aout[c] + vsq[c] * div
//
// Parameters:
//   x0..z1      half-open bounds of the box to update.
//   Nx, Ny      grid extents used to build the flattened index.
//   Nz          unused; kept so the signature stays unchanged.
//   coef        four weights, one per distance 0..3.
//   vsq         per-cell multiplier applied to div.
//   Ain         input grid (read only).
//   Aout        grid updated in place; its previous value is read as well.
//
// Ain is read up to three cells outside the box along every axis and no
// bounds checking is done, so the caller must leave a halo of at least
// three cells around the box.
static void
stencil_step(int x0, int x1,
             int y0, int y1,
             int z0, int z1,
             int Nx, int Ny, int Nz,
             const double coef[4], const double vsq[],
             const double Ain[], double Aout[]) {
    (void)Nz;

    // 64-bit strides: Nx*Ny and the flattened index no longer overflow
    // 'int' on grids with more than 2^31 cells.
    const long long strideY = Nx;
    const long long strideZ = (long long)Nx * Ny;

    for (int z = z0; z < z1; ++z) {
        for (int y = y0; y < y1; ++y) {
            for (int x = x0; x < x1; ++x) {
                const long long index = z * strideZ + y * strideY + x;

                // Kept as one expression, in the original operand order, so
                // the floating-point result is unchanged.
                double div = coef[0] * Ain[index] +
                    coef[1] * (Ain[index + 1] + Ain[index - 1] +
                               Ain[index + strideY] + Ain[index - strideY] +
                               Ain[index + strideZ] + Ain[index - strideZ]) +
                    coef[2] * (Ain[index + 2] + Ain[index - 2] +
                               Ain[index + 2 * strideY] + Ain[index - 2 * strideY] +
                               Ain[index + 2 * strideZ] + Ain[index - 2 * strideZ]) +
                    coef[3] * (Ain[index + 3] + Ain[index - 3] +
                               Ain[index + 3 * strideY] + Ain[index - 3 * strideY] +
                               Ain[index + 3 * strideZ] + Ain[index - 3 * strideZ]);

                Aout[index] = 2 * Ain[index] - Aout[index] +
                    vsq[index] * div;
            }
        }
    }
}
|
||||
|
||||
|
||||
void loop_stencil_serial(int t0, int t1,
|
||||
int x0, int x1,
|
||||
int y0, int y1,
|
||||
int z0, int z1,
|
||||
int Nx, int Ny, int Nz,
|
||||
const double coef[4],
|
||||
const double vsq[],
|
||||
double Aeven[], double Aodd[])
|
||||
{
|
||||
for (int t = t0; t < t1; ++t) {
|
||||
if ((t & 1) == 0)
|
||||
stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq,
|
||||
Aeven, Aodd);
|
||||
else
|
||||
stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq,
|
||||
Aodd, Aeven);
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user