# Listing metadata: 128 lines, 11 KiB. The viewer labelled this file "ArmAsm",
# but the contents are x86-64 assembly in AT&T (GAS) syntax.
# Record the name of the originating source file for the assembler's
# symbol/debug output.
.file "mandelbrot_task.ispc"
|
|
#-----------------------------------------------------------------------
# mandelbrot_ispc
#
# Host-side stub: packs its arguments into a table of pointers and hands
# the embedded PTX text (.L.ptx_str) plus the kernel name (.L.func_str)
# to CUDALaunch, then calls ISPCSync if the task handle is non-null.
#
# Syntax: AT&T (GAS).   ABI: System V AMD64, ELF.
#
# In:    xmm0, xmm1, xmm2, xmm3 = four float arguments  (f0..f3 below)
#        edi, esi, edx          = three 32-bit integers (i0..i2 below)
#        rcx                    = one 64-bit value      (p0 below)
#        NOTE(review): presumably (x0, y0, x1, y1, width, height,
#        maxIterations, output) -- confirm against the .ispc source.
# Out:   no return value is set by this routine.
# Clobb: every caller-saved register (three external calls are made);
#        rbx, r12-r15 and rbp are saved and restored.
#
# Frame layout after the prologue (offsets from the 32-byte aligned rsp):
#   rsp+0    dword  7th argument of CUDALaunch  = i1 / 4
#   rsp+8    dword  8th argument of CUDALaunch  = 1
#   rsp+32   float  spill of f0        rsp+36  float  spill of f2
#   rsp+40   float  spill of f1        rsp+44  float  spill of f3
#   rsp+48   qword  p0
#   rsp+60   dword  i2
#   rsp+64   dword  constant 4
#   rsp+68   dword  constant 32
#   rsp+72   dword  i1
#   rsp+76   dword  i0
#   rsp+80   float  (f3 - f1) / (float)i1
#   rsp+84   float  f1
#   rsp+88   float  (f2 - f0) / (float)i0
#   rsp+92   float  f0
#   rsp+96   10 qwords: addresses of the slots at
#            92, 88, 84, 80, 76, 72, 68, 64, 60, 48 (in that order)
#   rsp+192  qword  task handle, zero-initialised
#
# NOTE(review): the 10-entry table presumably maps one-to-one onto
# mandelbrot_scanline_param_0..9 of the embedded kernel, and the values
# i0/32, i1/4, 1 are presumably launch-grid dimensions -- confirm against
# the CUDALaunch prototype.
#
# Static data is addressed RIP-relative and external calls go through the
# PLT so the object also links into PIE / shared objects.
#-----------------------------------------------------------------------
        .text
        .globl  mandelbrot_ispc
        .align  16, 0x90
        .type   mandelbrot_ispc,@function
mandelbrot_ispc:                        # @mandelbrot_ispc
# --- prologue (%allocas) ----------------------------------------------
        pushq   %rbp
        movq    %rsp, %rbp
        pushq   %r15
        pushq   %r14
        pushq   %r13
        pushq   %r12
        pushq   %rbx                    # 5 pushes below rbp: undone via -40(%rbp)
        andq    $-32, %rsp              # round rsp down to a 32-byte boundary
        subq    $224, %rsp              # multiple of 32: alignment is kept at calls

# --- keep the incoming arguments alive across CUDAAlloc ----------------
        movq    %rcx, %r14              # r14  = p0
        movl    %edx, %r12d             # r12d = i2
        movl    %esi, %r13d             # r13d = i1
        movl    %edi, %ebx              # ebx  = i0
        vmovss  %xmm3, 44(%rsp)         # xmm registers are caller-saved: spill f3
        vmovss  %xmm2, 36(%rsp)         # spill f2
        vmovss  %xmm1, 40(%rsp)         # spill f1
        vmovss  %xmm0, 32(%rsp)         # spill f0

# --- CUDAAlloc(&handle, 80, 32) -----------------------------------------
        movq    $0, 192(%rsp)           # handle = NULL
        leaq    192(%rsp), %r15         # r15 = &handle (reused for CUDALaunch)
        movq    %r15, %rdi
        movl    $80, %esi               # 80 = size of the 10-qword table below
        movl    $32, %edx
        callq   CUDAAlloc@PLT           # value returned in rax is not used

# --- per-unit float steps -----------------------------------------------
        vcvtsi2ssl %ebx, %xmm0, %xmm0   # xmm0 = (float)i0
        vmovss  36(%rsp), %xmm1         # xmm1 = f2
        vmovss  32(%rsp), %xmm2         # xmm2 = f0
        vsubss  %xmm2, %xmm1, %xmm1     # xmm1 = f2 - f0
        vdivss  %xmm0, %xmm1, %xmm0     # xmm0 = (f2 - f0) / (float)i0
        vmovss  %xmm2, 92(%rsp)
        vmovss  %xmm0, 88(%rsp)

        vcvtsi2ssl %r13d, %xmm0, %xmm1  # xmm1 = (float)i1
        vmovss  40(%rsp), %xmm3         # xmm3 = f1
        vmovss  44(%rsp), %xmm2         # xmm2 = f3
        vsubss  %xmm3, %xmm2, %xmm2     # xmm2 = f3 - f1
        vdivss  %xmm1, %xmm2, %xmm1     # xmm1 = (f3 - f1) / (float)i1
        vmovss  %xmm3, 84(%rsp)
        vmovss  %xmm1, 80(%rsp)

# --- integer / pointer argument slots -----------------------------------
        movl    %ebx, 76(%rsp)          # i0
        movl    %r13d, 72(%rsp)         # i1
        movl    $32, 68(%rsp)           # same 32 as the divisor used for r9d below
        movl    $4, 64(%rsp)            # same 4 as the divisor used for eax below
        movl    %r12d, 60(%rsp)         # i2
        movq    %r14, 48(%rsp)          # p0

# --- table of pointers to the ten slots (rsp+96 .. rsp+175) -------------
        leaq    92(%rsp), %rax
        movq    %rax, 96(%rsp)
        leaq    88(%rsp), %rax
        movq    %rax, 104(%rsp)
        leaq    84(%rsp), %rax
        movq    %rax, 112(%rsp)
        leaq    80(%rsp), %rax
        movq    %rax, 120(%rsp)
        leaq    76(%rsp), %rax
        movq    %rax, 128(%rsp)
        leaq    72(%rsp), %rax
        movq    %rax, 136(%rsp)
        leaq    68(%rsp), %rax
        movq    %rax, 144(%rsp)
        leaq    64(%rsp), %rax
        movq    %rax, 152(%rsp)
        leaq    60(%rsp), %rax
        movq    %rax, 160(%rsp)
        leaq    48(%rsp), %rax
        movq    %rax, 168(%rsp)

# --- signed divisions, rounding toward zero ------------------------------
        movl    %ebx, %r9d
        sarl    $31, %r9d               # all ones if i0 < 0, else 0
        shrl    $27, %r9d               # bias: 31 for negative i0, else 0
        addl    %ebx, %r9d
        sarl    $5, %r9d                # r9d = i0 / 32

        movl    %r13d, %eax
        sarl    $31, %eax
        shrl    $30, %eax               # bias: 3 for negative i1, else 0
        addl    %r13d, %eax
        sarl    $2, %eax                # eax = i1 / 4

# --- CUDALaunch(&handle, module, ptx, func, table, i0/32, i1/4, 1) -------
        movl    %eax, (%rsp)            # 7th argument (stack)
        movl    $1, 8(%rsp)             # 8th argument (stack)
        movq    %r15, %rdi
        leaq    .L.module_str(%rip), %rsi
        leaq    .L.ptx_str(%rip), %rdx
        leaq    .L.func_str(%rip), %rcx
        leaq    96(%rsp), %r8
        callq   CUDALaunch@PLT

# --- synchronise only if the handle slot is now non-null -----------------
        movq    192(%rsp), %rdi
        testq   %rdi, %rdi
        je      .LBB0_2
# %call_sync
        callq   ISPCSync@PLT
        movq    $0, 192(%rsp)           # clear the handle after syncing
.LBB0_2:                                # %post_sync
# --- epilogue -----------------------------------------------------------
        leaq    -40(%rbp), %rsp         # back to the saved-register area
        popq    %rbx
        popq    %r12
        popq    %r13
        popq    %r14
        popq    %r15
        popq    %rbp
        ret
.Ltmp0:
        .size   mandelbrot_ispc, .Ltmp0-mandelbrot_ispc
|
|
|
|
# Module-name string: its address is passed as the second argument of
# CUDALaunch by mandelbrot_ispc. NUL-terminated, 16-byte aligned, placed in
# read-only data (the data objects that follow stay in this section).
        .section        .rodata,"a",@progbits
        .type           .L.module_str,@object   # @.module_str
        .p2align        4
.L.module_str:
        .string         "mandelbrot_task.ispc"
        .size           .L.module_str, . - .L.module_str
|
|
|
|
# Embedded PTX program text (NUL-terminated). Its address is passed as the
# third argument of CUDALaunch by mandelbrot_ispc. The text declares PTX
# version 3.1 for target sm_35 and one entry point, mandelbrot_scanline,
# taking four .f32, five .u32 and one .u64 pointer parameter.
#
# The literal is kept on ONE physical line on purpose: a line break inside
# the quotes is not a valid continuation and would end up inside the PTX
# text, where it can cut a "//" comment short and expose the rest of that
# comment as code.
#
# The size is computed by the assembler so it always matches the literal.
        .type   .L.ptx_str,@object      # @.ptx_str
        .align  16
.L.ptx_str:
        .asciz  "//\n// Generated by LLVM NVPTX Back-End\n//\n\n.version 3.1\n.target sm_35, texmode_independent\n.address_size 64\n\n\t// .globl\tmandelbrot_scanline\n.func (.param .b32 func_retval0) puts\n(\n\t.param .b64 puts_param_0\n)\n;\n.func abort\n(\n\n)\n;\n.global .align 1 .b8 __str[66] = {109, 97, 110, 100, 101, 108, 98, 114, 111, 116, 95, 116, 97, 115, 107, 46, 105, 115, 112, 99, 58, 53, 53, 58, 51, 58, 32, 65, 115, 115, 101, 114, 116, 105, 111, 110, 32, 102, 97, 105, 108, 101, 100, 58, 32, 120, 115, 112, 97, 110, 32, 62, 61, 32, 118, 101, 99, 116, 111, 114, 87, 105, 100, 116, 104, 0};\n // @mandelbrot_scanline\n.entry mandelbrot_scanline(\n\t.param .f32 mandelbrot_scanline_param_0,\n\t.param .f32 mandelbrot_scanline_param_1,\n\t.param .f32 mandelbrot_scanline_param_2,\n\t.param .f32 mandelbrot_scanline_param_3,\n\t.param .u32 mandelbrot_scanline_param_4,\n\t.param .u32 mandelbrot_scanline_param_5,\n\t.param .u32 mandelbrot_scanline_param_6,\n\t.param .u32 mandelbrot_scanline_param_7,\n\t.param .u32 mandelbrot_scanline_param_8,\n\t.param .u64 .ptr .align 4 mandelbrot_scanline_param_9\n)\n{\n\t.reg .pred %p<396>;\n\t.reg .s16 %rc<396>;\n\t.reg .s16 %rs<396>;\n\t.reg .s32 %r<396>;\n\t.reg .s64 %rl<396>;\n\t.reg .f32 %f<396>;\n\t.reg .f64 %fl<396>;\n\n// BB#0: // %allocas\n\tld.param.u32 \t%r3, [mandelbrot_scanline_param_6];\n\tmov.u32 \t%r0, WARP_SZ;\n\tsetp.gt.s32 \t%p0, %r0, %r3;\n\t@%p0 bra \tBB0_18;\n// BB#1: // %for_test.preheader\n\tld.param.u32 \t%r7, [mandelbrot_scanline_param_5];\n\tld.param.u32 \t%r6, [mandelbrot_scanline_param_7];\n\tmov.u32 \t%r8, %ctaid.y;\n\tmul.lo.s32 \t%r1, %r8, %r6;\n\tmad.lo.s32 \t%r2, %r8, %r6, %r6;\n\tsetp.lt.s32 \t%p0, %r2, %r7;\n\tselp.b32 \t%r2, %r2, %r7, %p0;\n\tsetp.ge.s32 \t%p0, %r1, %r2;\n\t@%p0 bra \tBB0_14;\n// BB#2: // %for_test34.preheader.lr.ph\n\tld.param.f32 \t%f0, [mandelbrot_scanline_param_0];\n\tld.param.f32 \t%f1, [mandelbrot_scanline_param_1];\n\tld.param.f32 \t%f2, [mandelbrot_scanline_param_2];\n\tmov.u32 \t%r4, %ctaid.x;\n\tmul.lo.s32 \t%r2, %r4, %r3;\n\tld.param.f32 \t%f3, [mandelbrot_scanline_param_3];\n\tmad.lo.s32 \t%r4, %r4, %r3, %r3;\n\tld.param.u32 \t%r3, [mandelbrot_scanline_param_4];\n\tsetp.lt.s32 \t%p0, %r4, %r3;\n\tselp.b32 \t%r4, %r4, %r3, %p0;\n\tld.param.u32 \t%r5, [mandelbrot_scanline_param_8];\n\tld.param.u64 \t%rl0, [mandelbrot_scanline_param_9];\n\tsetp.gt.s32 \t%p0, %r5, 0;\n\tnot.b32 \t%r7, %r7;\n\tadd.s32 \t%r8, %r8, 1;\n\tmul.lo.s32 \t%r6, %r8, %r6;\n\tnot.b32 \t%r6, %r6;\n\tsetp.gt.s32 \t%p1, %r7, %r6;\n\tselp.b32 \t%r6, %r7, %r6, %p1;\n\tnot.b32 \t%r6, %r6;\nBB0_3: // %for_test34.preheader\n // =>This Loop Header: Depth=1\n // Child Loop BB0_16 Depth 2\n // Child Loop BB0_9 Depth 2\n // Child Loop BB0_12 Depth 3\n\tsetp.ge.s32 \t%p1, %r2, %r4;\n\t@%p1 bra \tBB0_13;\n// BB#4: // %for_loop36.lr.ph\n // in Loop: Header=BB0_3 Depth=1\n\tmul.lo.s32 \t%r7, %r1, %r3;\n\tmov.u32 \t%r8, %r2;\n\t@%p0 bra \tBB0_5;\n\tbra.uni \tBB0_16;\nBB0_5: // in Loop: Header=BB0_3 Depth=1\n\tcvt.rn.f32.s32 \t%f4, %r1;\n\tfma.rn.f32 \t%f4, %f4, %f3, %f2;\n\tmov.u32 \t%r8, %r2;\nBB0_9: // %for_loop.i.lr.ph.us\n // Parent Loop BB0_3 Depth=1\n // => This Loop Header: Depth=2\n // Child Loop BB0_12 Depth 3\n\tmov.u32 \t%r9, %tid.x;\n\tadd.s32 \t%r10, %r0, -1;\n\tand.b32 \t%r10, %r10, %r9;\n\tadd.s32 \t%r11, %r10, %r8;\n\tcvt.rn.f32.s32 \t%f5, %r11;\n\tfma.rn.f32 \t%f5, %f5, %f1, %f0;\n\tmov.u32 \t%r10, 0;\n\tmov.pred \t%p1, 0;\n\tmov.pred \t%p3, -1;\n\tmov.pred \t%p4, %p0;\n\tmov.pred \t%p2, %p1;\n\tmov.f32 \t%f7, %f5;\n\tmov.f32 \t%f6, %f4;\nBB0_12: // %for_loop.i.us\n // Parent Loop BB0_3 Depth=1\n // Parent Loop BB0_9 Depth=2\n // => This Inner Loop Header: Depth=3\n\tand.pred \t%p4, %p3, %p4;\n\tmul.f32 \t%f8, %f7, %f7;\n\tfma.rn.f32 \t%f9, %f6, %f6, %f8;\n\tsetp.gtu.f32 \t%p3, %f9, 0f40800000;\n\tand.pred \t%p3, %p4, %p3;\n\tor.pred \t%p2, %p3, %p2;\n\txor.pred \t%p5, %p2, %p4;\n\tmov.pred \t%p3, %p1;\n\t@!%p5 bra \tBB0_11;\n\tbra.uni \tBB0_10;\nBB0_10: // %not_all_continued_or_breaked.i.us\n // in Loop: Header=BB0_12 Depth=3\n\tmul.f32 \t%f9, %f6, %f6;\n\tnot.pred \t%p3, %p2;\n\tand.pred \t%p3, %p4, %p3;\n\tsub.f32 \t%f8, %f8, %f9;\n\tadd.f32 \t%f8, %f5, %f8;\n\tadd.f32 \t%f7, %f7, %f7;\n\tfma.rn.f32 \t%f6, %f6, %f7, %f4;\n\tmov.f32 \t%f7, %f8;\nBB0_11: // %for_step.i.us\n // in Loop: Header=BB0_12 Depth=3\n\tadd.s32 \t%r12, %r10, 1;\n\tselp.b32 \t%r10, %r12, %r10, %p3;\n\tsetp.lt.s32 \t%p4, %r10, %r5;\n\tand.pred \t%p5, %p3, %p4;\n\t@%p5 bra \tBB0_12;\n// BB#6: // %mandel___vyfvyfvyi.exit.us\n // in Loop: Header=BB0_9 Depth=2\n\tsetp.ge.s32 \t%p1, %r11, %r4;\n\t@%p1 bra \tBB0_8;\n// BB#7: // %if_then.us\n // in Loop: Header=BB0_9 Depth=2\n\tadd.s32 \t%r11, %r0, 1073741823;\n\tand.b32 \t%r9, %r11, %r9;\n\tadd.s32 \t%r11, %r8, %r7;\n\tadd.s32 \t%r9, %r11, %r9;\n\tshl.b32 \t%r9, %r9, 2;\n\tcvt.s64.s32 \t%rl1, %r9;\n\tadd.s64 \t%rl1, %rl1, %rl0;\n\tst.u32 \t[%rl1], %r10;\nBB0_8: // %if_exit.us\n // in Loop: Header=BB0_9 Depth=2\n\tadd.s32 \t%r8, %r0, %r8;\n\tsetp.lt.s32 \t%p1, %r8, %r4;\n\t@%p1 bra \tBB0_9;\n\tbra.uni \tBB0_13;\nBB0_16: // %mandel___vyfvyfvyi.exit\n // Parent Loop BB0_3 Depth=1\n // => This Inner Loop Header: Depth=2\n\tmov.u32 \t%r9, %tid.x;\n\tadd.s32 \t%r10, %r0, -1;\n\tand.b32 \t%r10, %r10, %r9;\n\tadd.s32 \t%r10, %r10, %r8;\n\tsetp.lt.s32 \t%p1, %r10, %r4;\n\t@%p1 bra \tBB0_17;\n\tbra.uni \tBB0_15;\nBB0_17: // %if_then\n // in Loop: Header=BB0_16 Depth=2\n\tadd.s32 \t%r10, %r0, 1073741823;\n\tand.b32 \t%r9, %r10, %r9;\n\tadd.s32 \t%r10, %r8, %r7;\n\tadd.s32 \t%r9, %r10, %r9;\n\tshl.b32 \t%r9, %r9, 2;\n\tcvt.s64.s32 \t%rl1, %r9;\n\tadd.s64 \t%rl1, %rl1, %rl0;\n\tmov.u32 \t%r9, 0;\n\tst.u32 \t[%rl1], %r9;\nBB0_15: // %if_exit\n // in Loop: Header=BB0_16 Depth=2\n\tadd.s32 \t%r8, %r0, %r8;\n\tsetp.lt.s32 \t%p1, %r8, %r4;\n\t@%p1 bra \tBB0_16;\nBB0_13: // %for_exit37\n // in Loop: Header=BB0_3 Depth=1\n\tadd.s32 \t%r1, %r1, 1;\n\tsetp.eq.s32 \t%p1, %r1, %r6;\n\t@%p1 bra \tBB0_14;\n\tbra.uni \tBB0_3;\nBB0_14: // %for_exit\n\tret;\nBB0_18: // %fail.i\n\tmov.u64 \t%rl0, __str;\n\tcvta.global.u64 \t%rl0, %rl0;\n\t// Callseq Start 2\n\t{\n\t.reg .b32 temp_param_reg;\n\t// <end>}\n\t.param .b64 param0;\n\tst.param.b64\t[param0+0], %rl0;\n\t.param .b32 retval0;\n\tcall.uni (retval0), \n\tputs, \n\t(\n\tparam0\n\t);\n\tld.param.b32\t%r0, [retval0+0];\n\t\n\t//{\n\t}// Callseq End 2\n\t// Callseq Start 3\n\t{\n\t.reg .b32 temp_param_reg;\n\t// <end>}\n\tcall.uni \n\tabort, \n\t(\n\t);\n\t\n\t//{\n\t}// Callseq End 3\n}\n\n"
        .size   .L.ptx_str, . - .L.ptx_str
|
|
|
|
# Kernel-name string: its address is passed as the fourth argument of
# CUDALaunch by mandelbrot_ispc, and it matches the ".entry" name declared
# in the embedded PTX text. NUL-terminated, 16-byte aligned.
        .type           .L.func_str,@object     # @.func_str
        .p2align        4
.L.func_str:
        .string         "mandelbrot_scanline"
        .size           .L.func_str, . - .L.func_str
|
|
|
|
|
|
# Emit an empty, non-allocated .note.GNU-stack section. By GNU toolchain
# convention this marks the object as not needing an executable stack.
.section ".note.GNU-stack","",@progbits
|