diff --git a/examples_cuda/mandelbrot_tasks3d/compile.sh b/examples_cuda/mandelbrot_tasks3d/compile.sh new file mode 100755 index 00000000..d45d31e3 --- /dev/null +++ b/examples_cuda/mandelbrot_tasks3d/compile.sh @@ -0,0 +1,5 @@ +#!/bin/sh +ptxas -arch=sm_35 -c -o kernel.gpu.o kernel_cu.ptx +fatbinary -arch=sm_35 -create kernel.fatbin -elf kernel.gpu.o +nvcc -arch=sm_35 -Xptxas="-v" -dlink -o mandel_cu.o kernel.fatbin kernel_driver.cu -rdc=true -lcudadevrt + diff --git a/examples_cuda/mandelbrot_tasks3d/kernel_cu.ptx b/examples_cuda/mandelbrot_tasks3d/kernel_cu.ptx new file mode 100644 index 00000000..356bb887 --- /dev/null +++ b/examples_cuda/mandelbrot_tasks3d/kernel_cu.ptx @@ -0,0 +1,715 @@ +// +// Generated by NVIDIA NVVM Compiler +// Compiler built on Tue Nov 5 22:34:47 2013 (1383687287) +// Cuda compilation tools, release 6.0, V6.0.1 +// + +.version 4.0 +.target sm_35 +.address_size 64 + + +.extern .func (.param .b32 func_retval0) cudaLaunchDevice +( + .param .b64 cudaLaunchDevice_param_0, + .param .b64 cudaLaunchDevice_param_1, + .param .align 4 .b8 cudaLaunchDevice_param_2[12], + .param .align 4 .b8 cudaLaunchDevice_param_3[12], + .param .b32 cudaLaunchDevice_param_4, + .param .b64 cudaLaunchDevice_param_5 +); + + +.extern .func (.param .b64 func_retval0) cudaGetParameterBuffer +( + .param .b64 cudaGetParameterBuffer_param_0, + .param .b64 cudaGetParameterBuffer_param_1 +) +; +.extern .func (.param .b32 func_retval0) cudaDeviceSynchronize +( + +) +; +.global .align 1 .b8 constDeltaForeach[32]; +.global .align 1 .b8 constDeltaForeach3[32] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}; + +.visible .func PTXmandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E_( + .param .b32 mandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_0, + .param .b32 mandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_1, + .param .b32 mandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_2, + .param .b32 mandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_3, + .param .b32 mandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_4, + .param .b32 mandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_5, + .param .b32 mandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_6, + .param .b32 mandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_7, + .param .b32 mandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_8, + .param .b64 mandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_9 +) +{ + .reg .pred %p<110>; + .reg .f32 %f<77>; + .reg .s32 %r<104>; + .reg .s64 %rd<13>; + + + ld.param.f32 %f34, [mandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_0]; + ld.param.f32 %f35, [mandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_1]; + ld.param.f32 %f36, [mandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_2]; + ld.param.f32 %f37, [mandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_3]; + ld.param.u32 %r37, [mandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_4]; + ld.param.u32 %r38, [mandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_5]; + ld.param.u32 %r41, [mandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_6]; + ld.param.u32 %r39, [mandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_7]; + ld.param.u32 %r40, [mandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_8]; + ld.param.u64 %rd3, [mandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_9]; + mov.u32 %r42, %ctaid.x; + shl.b32 %r43, %r42, 2; + mov.u32 %r44, %tid.x; + shr.s32 %r45, %r44, 5; + add.s32 %r46, %r45, %r43; + mul.lo.s32 %r1, %r46, %r41; + add.s32 %r47, %r1, %r41; + min.s32 %r2, %r47, %r37; + mov.u32 %r3, %ctaid.y; + mul.lo.s32 %r96, %r3, %r39; + add.s32 %r48, %r96, %r39; + min.s32 %r49, %r48, %r38; + sub.s32 %r50, %r2, %r1; + shr.s32 %r51, %r50, 31; + shr.u32 %r52, %r51, 27; + add.s32 %r53, %r50, %r52; + and.b32 %r54, %r53, -32; + sub.s32 %r55, %r50, %r54; + sub.s32 %r5, %r2, %r55; + and.b32 %r56, %r44, 31; + cvt.u64.u32 %rd1, %r56; + mov.u64 %rd4, constDeltaForeach; + add.s64 %rd5, %rd4, %rd1; + ld.global.u8 %r6, [%rd5]; + add.s32 %r94, %r96, %r6; + setp.ge.s32 %p29, %r96, %r49; + @%p29 bra BB0_26; + + setp.lt.s32 %p30, %r1, %r5; + setp.gt.s32 %p1, %r40, 0; + add.s32 %r57, %r3, 1; + mul.lo.s32 %r58, %r57, %r39; + not.b32 %r59, %r58; + not.b32 %r60, %r38; + max.s32 %r61, %r60, %r59; + not.b32 %r10, %r61; + mov.u64 %rd6, constDeltaForeach3; + add.s64 %rd2, %rd6, %rd1; + @%p30 bra BB0_11; + + mov.u32 %r95, %r94; + +BB0_3: + setp.ge.s32 %p31, %r1, %r2; + @%p31 bra BB0_10; + + setp.gt.s32 %p34, %r40, 0; + mov.u32 %r91, 0; + ld.global.u8 %r66, [%rd2]; + add.s32 %r14, %r1, %r66; + setp.lt.s32 %p35, %r14, %r2; + cvt.rn.f32.s32 %f38, %r14; + fma.rn.ftz.f32 %f5, %f35, %f38, %f34; + cvt.rn.f32.s32 %f39, %r95; + fma.rn.ftz.f32 %f6, %f37, %f39, %f36; + and.pred %p36, %p34, %p35; + selp.u32 %r63, 1, 0, %p36; + // inline asm + { .reg .pred %p1; + setp.ne.u32 %p1, %r63, 0; + vote.ballot.b32 %r62, %p1; + } + // inline asm + setp.eq.s32 %p37, %r62, 0; + mov.u32 %r92, %r91; + mov.pred %p33, 0; + mov.pred %p86, -1; + mov.pred %p88, %p33; + mov.f32 %f54, %f5; + mov.f32 %f58, %f6; + mov.pred %p103, %p1; + mov.pred %p104, %p1; + @%p37 bra BB0_8; + +BB0_5: + mov.pred %p2, %p104; + mov.f32 %f56, %f58; + mov.f32 %f59, %f56; + mov.f32 %f52, %f54; + mov.f32 %f55, %f52; + mov.pred %p5, %p88; + mul.ftz.f32 %f9, %f59, %f59; + mul.ftz.f32 %f10, %f55, %f55; + add.ftz.f32 %f40, %f9, %f10; + setp.gtu.ftz.f32 %p39, %f40, 0f40800000; + and.pred %p40, %p2, %p39; + or.pred %p6, %p40, %p5; + setp.ge.s32 %p41, %r14, %r2; + xor.pred %p42, %p6, %p2; + not.pred %p43, %p42; + or.pred %p44, %p43, %p41; + mov.pred %p87, %p33; + @%p44 bra BB0_7; + + and.pred %p45, %p86, %p103; + not.pred %p46, %p6; + add.ftz.f32 %f41, %f55, %f55; + sub.ftz.f32 %f42, %f10, %f9; + fma.rn.ftz.f32 %f59, %f59, %f41, %f6; + add.ftz.f32 %f55, %f5, %f42; + and.pred %p87, %p45, %p46; + +BB0_7: + mov.f32 %f13, %f59; + mov.f32 %f14, %f55; + mov.pred %p86, %p87; + add.s32 %r69, %r92, 1; + selp.b32 %r91, %r69, %r92, %p86; + setp.lt.s32 %p103, %r91, %r40; + and.pred %p10, %p86, %p103; + and.pred %p48, %p10, %p35; + selp.u32 %r68, 1, 0, %p48; + // inline asm + { .reg .pred %p1; + setp.ne.u32 %p1, %r68, 0; + vote.ballot.b32 %r67, %p1; + } + // inline asm + setp.ne.s32 %p49, %r67, 0; + mov.pred %p88, %p6; + mov.f32 %f54, %f14; + mov.f32 %f58, %f13; + mov.u32 %r92, %r91; + mov.pred %p104, %p10; + @%p49 bra BB0_5; + +BB0_8: + setp.ge.s32 %p50, %r14, %r2; + @%p50 bra BB0_10; + + mad.lo.s32 %r70, %r95, %r37, %r14; + shl.b32 %r71, %r70, 2; + cvt.s64.s32 %rd7, %r71; + add.s64 %rd8, %rd7, %rd3; + st.u32 [%rd8], %r91; + +BB0_10: + add.s32 %r96, %r96, 1; + add.s32 %r95, %r96, %r6; + setp.eq.s32 %p51, %r96, %r10; + @%p51 bra BB0_26; + bra.uni BB0_3; + +BB0_11: + selp.u32 %r73, 1, 0, %p1; + // inline asm + { .reg .pred %p1; + setp.ne.u32 %p1, %r73, 0; + vote.ballot.b32 %r72, %p1; + } + // inline asm + ld.global.u8 %r20, [%rd2]; + +BB0_12: + mov.u32 %r22, %r94; + cvt.rn.f32.s32 %f43, %r22; + mul.lo.s32 %r24, %r22, %r37; + fma.rn.ftz.f32 %f15, %f37, %f43, %f36; + mov.u32 %r97, %r1; + +BB0_13: + mov.u32 %r25, %r97; + add.s32 %r26, %r25, %r20; + cvt.rn.f32.s32 %f44, %r26; + fma.rn.ftz.f32 %f16, %f35, %f44, %f34; + setp.eq.s32 %p54, %r72, 0; + mov.u32 %r99, 0; + mov.u32 %r100, %r99; + mov.pred %p53, 0; + mov.pred %p91, -1; + mov.pred %p93, %p53; + mov.f32 %f62, %f16; + mov.pred %p101, %p1; + mov.pred %p102, %p1; + mov.f32 %f75, %f15; + @%p54 bra BB0_17; + +BB0_14: + mov.f32 %f71, %f75; + mov.f32 %f76, %f71; + mov.pred %p11, %p102; + mov.f32 %f60, %f62; + mov.f32 %f63, %f60; + mov.pred %p14, %p93; + mul.ftz.f32 %f19, %f76, %f76; + mul.ftz.f32 %f20, %f63, %f63; + add.ftz.f32 %f45, %f19, %f20; + setp.gtu.ftz.f32 %p56, %f45, 0f40800000; + and.pred %p57, %p11, %p56; + or.pred %p15, %p57, %p14; + xor.pred %p58, %p15, %p11; + mov.pred %p92, %p53; + @!%p58 bra BB0_16; + bra.uni BB0_15; + +BB0_15: + and.pred %p59, %p91, %p101; + not.pred %p60, %p15; + add.ftz.f32 %f46, %f63, %f63; + sub.ftz.f32 %f47, %f20, %f19; + fma.rn.ftz.f32 %f76, %f76, %f46, %f15; + add.ftz.f32 %f63, %f16, %f47; + and.pred %p92, %p59, %p60; + +BB0_16: + mov.f32 %f23, %f76; + mov.f32 %f24, %f63; + mov.pred %p91, %p92; + add.s32 %r78, %r100, 1; + selp.b32 %r99, %r78, %r100, %p91; + setp.lt.s32 %p101, %r99, %r40; + and.pred %p102, %p91, %p101; + selp.u32 %r77, 1, 0, %p102; + // inline asm + { .reg .pred %p1; + setp.ne.u32 %p1, %r77, 0; + vote.ballot.b32 %r76, %p1; + } + // inline asm + setp.ne.s32 %p61, %r76, 0; + mov.pred %p93, %p15; + mov.f32 %f62, %f24; + mov.u32 %r100, %r99; + mov.f32 %f75, %f23; + @%p61 bra BB0_14; + +BB0_17: + add.s32 %r79, %r26, %r24; + shl.b32 %r80, %r79, 2; + cvt.s64.s32 %rd9, %r80; + add.s64 %rd10, %rd9, %rd3; + st.u32 [%rd10], %r99; + add.s32 %r30, %r25, 32; + setp.lt.s32 %p62, %r30, %r5; + mov.u32 %r97, %r30; + @%p62 bra BB0_13; + + setp.ge.s32 %p63, %r30, %r2; + @%p63 bra BB0_25; + + setp.gt.s32 %p66, %r40, 0; + mov.u32 %r102, 0; + add.s32 %r31, %r30, %r20; + setp.lt.s32 %p67, %r31, %r2; + cvt.rn.f32.s32 %f48, %r31; + fma.rn.ftz.f32 %f25, %f35, %f48, %f34; + and.pred %p68, %p66, %p67; + selp.u32 %r82, 1, 0, %p68; + // inline asm + { .reg .pred %p1; + setp.ne.u32 %p1, %r82, 0; + vote.ballot.b32 %r81, %p1; + } + // inline asm + setp.eq.s32 %p69, %r81, 0; + mov.u32 %r103, %r102; + mov.pred %p65, 0; + mov.pred %p107, -1; + mov.pred %p99, %p1; + mov.pred %p100, %p1; + mov.pred %p109, %p65; + mov.f32 %f66, %f25; + mov.f32 %f73, %f15; + @%p69 bra BB0_23; + +BB0_20: + mov.f32 %f69, %f73; + mov.f32 %f74, %f69; + mov.f32 %f64, %f66; + mov.f32 %f67, %f64; + mov.pred %p23, %p109; + mov.pred %p20, %p100; + mul.ftz.f32 %f28, %f74, %f74; + mul.ftz.f32 %f29, %f67, %f67; + add.ftz.f32 %f49, %f28, %f29; + setp.gtu.ftz.f32 %p71, %f49, 0f40800000; + and.pred %p72, %p20, %p71; + or.pred %p24, %p72, %p23; + setp.ge.s32 %p73, %r31, %r2; + xor.pred %p74, %p24, %p20; + not.pred %p75, %p74; + or.pred %p76, %p75, %p73; + mov.pred %p108, %p65; + @%p76 bra BB0_22; + + and.pred %p77, %p107, %p99; + not.pred %p78, %p24; + add.ftz.f32 %f50, %f67, %f67; + sub.ftz.f32 %f51, %f29, %f28; + fma.rn.ftz.f32 %f74, %f74, %f50, %f15; + add.ftz.f32 %f67, %f25, %f51; + and.pred %p108, %p77, %p78; + +BB0_22: + mov.f32 %f73, %f74; + mov.f32 %f33, %f67; + mov.pred %p107, %p108; + add.s32 %r87, %r103, 1; + selp.b32 %r102, %r87, %r103, %p107; + setp.lt.s32 %p99, %r102, %r40; + and.pred %p100, %p107, %p99; + and.pred %p80, %p100, %p67; + selp.u32 %r86, 1, 0, %p80; + // inline asm + { .reg .pred %p1; + setp.ne.u32 %p1, %r86, 0; + vote.ballot.b32 %r85, %p1; + } + // inline asm + setp.ne.s32 %p81, %r85, 0; + mov.pred %p109, %p24; + mov.f32 %f66, %f33; + mov.u32 %r103, %r102; + @%p81 bra BB0_20; + +BB0_23: + setp.ge.s32 %p82, %r31, %r2; + @%p82 bra BB0_25; + + add.s32 %r88, %r31, %r24; + shl.b32 %r89, %r88, 2; + cvt.s64.s32 %rd11, %r89; + add.s64 %rd12, %rd11, %rd3; + st.u32 [%rd12], %r102; + +BB0_25: + add.s32 %r96, %r96, 1; + add.s32 %r94, %r96, %r6; + setp.ne.s32 %p83, %r96, %r10; + @%p83 bra BB0_12; + +BB0_26: + ret; +} + +.visible .func PTXmandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E_( + .param .b32 mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_0, + .param .b32 mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_1, + .param .b32 mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_2, + .param .b32 mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_3, + .param .b32 mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_4, + .param .b32 mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_5, + .param .b32 mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_6, + .param .b64 mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_7, + .param .b32 mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_8 +) +{ + .reg .pred %p<4>; + .reg .f32 %f<13>; + .reg .s32 %r<27>; + .reg .s64 %rd<13>; + + + ld.param.f32 %f1, [mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_0]; + ld.param.f32 %f2, [mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_1]; + ld.param.f32 %f3, [mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_2]; + ld.param.f32 %f4, [mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_3]; + ld.param.u32 %r2, [mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_4]; + ld.param.u32 %r3, [mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_5]; + ld.param.u32 %r4, [mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_6]; + ld.param.u64 %rd5, [mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_7]; + mov.u32 %r5, %tid.x; + and.b32 %r1, %r5, 31; + setp.ne.s32 %p1, %r1, 0; + mov.u64 %rd12, 0; + @%p1 bra BB1_3; + + mov.u64 %rd7, 8; + mov.u64 %rd8, 48; + // Callseq Start 0 + { + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd7; + .param .b64 param1; + st.param.b64 [param1+0], %rd8; + .param .b64 retval0; + call.uni (retval0), + cudaGetParameterBuffer, + ( + param0, + param1 + ); + ld.param.b64 %rd1, [retval0+0]; + } + // Callseq End 0 + setp.eq.s64 %p2, %rd1, 0; + mov.u64 %rd12, %rd1; + @%p2 bra BB1_3; + + cvt.rn.f32.s32 %f5, %r2; + rcp.approx.ftz.f32 %f6, %f5; + cvt.rn.f32.s32 %f7, %r3; + rcp.approx.ftz.f32 %f8, %f7; + sub.ftz.f32 %f9, %f4, %f2; + mul.ftz.f32 %f10, %f9, %f8; + sub.ftz.f32 %f11, %f3, %f1; + mul.ftz.f32 %f12, %f11, %f6; + st.f32 [%rd1], %f1; + st.f32 [%rd1+4], %f12; + st.f32 [%rd1+8], %f2; + st.f32 [%rd1+12], %f10; + st.u32 [%rd1+16], %r2; + st.u32 [%rd1+20], %r3; + mov.u32 %r6, 64; + st.u32 [%rd1+24], %r6; + mov.u32 %r7, 8; + st.u32 [%rd1+28], %r7; + st.u32 [%rd1+32], %r4; + st.u64 [%rd1+40], %rd5; + mov.u64 %rd12, %rd1; + +BB1_3: + @%p1 bra BB1_5; + + shr.s32 %r16, %r2, 31; + shr.u32 %r17, %r16, 26; + add.s32 %r18, %r2, %r17; + shr.s32 %r19, %r18, 6; + shr.s32 %r20, %r3, 31; + shr.u32 %r21, %r20, 29; + add.s32 %r22, %r3, %r21; + shr.s32 %r10, %r22, 3; + add.s32 %r23, %r19, -1; + shr.s32 %r24, %r23, 2; + add.s32 %r9, %r24, 1; + mov.u32 %r14, 1; + mov.u32 %r12, 128; + mov.u32 %r15, 0; + mov.u64 %rd11, 0; + mov.u64 %rd9, PTXmandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E_; + // inline asm + { + .param .b64 param0; + st.param.b64 [param0+0], %rd9; + .param .b64 param1; + st.param.b64 [param1+0], %rd12; + .param .align 4 .b8 param2[12]; + st.param.b32 [param2+0], %r9; + st.param.b32 [param2+4], %r10; + st.param.b32 [param2+8], %r14; + .param .align 4 .b8 param3[12]; + st.param.b32 [param3+0], %r12; + st.param.b32 [param3+4], %r14; + st.param.b32 [param3+8], %r14; + .param .b32 param4; + st.param.b32 [param4+0], %r15; + .param .b64 param5; + st.param.b64 [param5+0], %rd11; + + .param .b32 retval0; + call.uni (retval0), + cudaLaunchDevice, + ( + param0, + param1, + param2, + param3, + param4, + param5 + ); + ld.param.b32 %r8, [retval0+0]; + } + + // inline asm + +BB1_5: + // Callseq Start 1 + { + .reg .b32 temp_param_reg; + .param .b32 retval0; + call.uni (retval0), + cudaDeviceSynchronize, + ( + ); + ld.param.b32 %r25, [retval0+0]; + } + // Callseq End 1 + // Callseq Start 2 + { + .reg .b32 temp_param_reg; + .param .b32 retval0; + call.uni (retval0), + cudaDeviceSynchronize, + ( + ); + ld.param.b32 %r26, [retval0+0]; + } + // Callseq End 2 + ret; +} + +.visible .func PTXmandelbrot_ispc( + .param .b32 mandelbrot_ispc_param_0, + .param .b32 mandelbrot_ispc_param_1, + .param .b32 mandelbrot_ispc_param_2, + .param .b32 mandelbrot_ispc_param_3, + .param .b32 mandelbrot_ispc_param_4, + .param .b32 mandelbrot_ispc_param_5, + .param .b32 mandelbrot_ispc_param_6, + .param .b64 mandelbrot_ispc_param_7 +) +{ + .reg .pred %p<4>; + .reg .f32 %f<13>; + .reg .s32 %r<27>; + .reg .s64 %rd<13>; + + + ld.param.f32 %f1, [mandelbrot_ispc_param_0]; + ld.param.f32 %f2, [mandelbrot_ispc_param_1]; + ld.param.f32 %f3, [mandelbrot_ispc_param_2]; + ld.param.f32 %f4, [mandelbrot_ispc_param_3]; + ld.param.u32 %r2, [mandelbrot_ispc_param_4]; + ld.param.u32 %r3, [mandelbrot_ispc_param_5]; + ld.param.u32 %r4, [mandelbrot_ispc_param_6]; + ld.param.u64 %rd5, [mandelbrot_ispc_param_7]; + mov.u32 %r5, %tid.x; + and.b32 %r1, %r5, 31; + setp.ne.s32 %p1, %r1, 0; + mov.u64 %rd12, 0; + @%p1 bra BB2_3; + + mov.u64 %rd7, 8; + mov.u64 %rd8, 48; + // Callseq Start 3 + { + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd7; + .param .b64 param1; + st.param.b64 [param1+0], %rd8; + .param .b64 retval0; + call.uni (retval0), + cudaGetParameterBuffer, + ( + param0, + param1 + ); + ld.param.b64 %rd1, [retval0+0]; + } + // Callseq End 3 + setp.eq.s64 %p2, %rd1, 0; + mov.u64 %rd12, %rd1; + @%p2 bra BB2_3; + + cvt.rn.f32.s32 %f5, %r2; + rcp.approx.ftz.f32 %f6, %f5; + cvt.rn.f32.s32 %f7, %r3; + rcp.approx.ftz.f32 %f8, %f7; + sub.ftz.f32 %f9, %f4, %f2; + mul.ftz.f32 %f10, %f9, %f8; + sub.ftz.f32 %f11, %f3, %f1; + mul.ftz.f32 %f12, %f11, %f6; + st.f32 [%rd1], %f1; + st.f32 [%rd1+4], %f12; + st.f32 [%rd1+8], %f2; + st.f32 [%rd1+12], %f10; + st.u32 [%rd1+16], %r2; + st.u32 [%rd1+20], %r3; + mov.u32 %r6, 64; + st.u32 [%rd1+24], %r6; + mov.u32 %r7, 8; + st.u32 [%rd1+28], %r7; + st.u32 [%rd1+32], %r4; + st.u64 [%rd1+40], %rd5; + mov.u64 %rd12, %rd1; + +BB2_3: + @%p1 bra BB2_5; + + shr.s32 %r16, %r2, 31; + shr.u32 %r17, %r16, 26; + add.s32 %r18, %r2, %r17; + shr.s32 %r19, %r18, 6; + shr.s32 %r20, %r3, 31; + shr.u32 %r21, %r20, 29; + add.s32 %r22, %r3, %r21; + shr.s32 %r10, %r22, 3; + add.s32 %r23, %r19, -1; + shr.s32 %r24, %r23, 2; + add.s32 %r9, %r24, 1; + mov.u32 %r14, 1; + mov.u32 %r12, 128; + mov.u32 %r15, 0; + mov.u64 %rd11, 0; + mov.u64 %rd9, PTXmandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E_; + // inline asm + { + .param .b64 param0; + st.param.b64 [param0+0], %rd9; + .param .b64 param1; + st.param.b64 [param1+0], %rd12; + .param .align 4 .b8 param2[12]; + st.param.b32 [param2+0], %r9; + st.param.b32 [param2+4], %r10; + st.param.b32 [param2+8], %r14; + .param .align 4 .b8 param3[12]; + st.param.b32 [param3+0], %r12; + st.param.b32 [param3+4], %r14; + st.param.b32 [param3+8], %r14; + .param .b32 param4; + st.param.b32 [param4+0], %r15; + .param .b64 param5; + st.param.b64 [param5+0], %rd11; + + .param .b32 retval0; + call.uni (retval0), + cudaLaunchDevice, + ( + param0, + param1, + param2, + param3, + param4, + param5 + ); + ld.param.b32 %r8, [retval0+0]; + } + + // inline asm + +BB2_5: + // Callseq Start 4 + { + .reg .b32 temp_param_reg; + .param .b32 retval0; + call.uni (retval0), + cudaDeviceSynchronize, + ( + ); + ld.param.b32 %r25, [retval0+0]; + } + // Callseq End 4 + // Callseq Start 5 + { + .reg .b32 temp_param_reg; + .param .b32 retval0; + call.uni (retval0), + cudaDeviceSynchronize, + ( + ); + ld.param.b32 %r26, [retval0+0]; + } + // Callseq End 5 + ret; +} + + + diff --git a/examples_cuda/mandelbrot_tasks3d/kernel_driver.cu b/examples_cuda/mandelbrot_tasks3d/kernel_driver.cu new file mode 100644 index 00000000..4766a07f --- /dev/null +++ b/examples_cuda/mandelbrot_tasks3d/kernel_driver.cu @@ -0,0 +1,71 @@ + +extern "C" __device__ void PTXmandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E_( + float,float,float,float,uint32_t,uint32_t,uint32_t,uint32_t,uint32_t,uint64_t); + +extern "C" +__global__ void mandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E_( + float param0, + float param1, + float param2, + float param3, + uint32_t param4, + uint32_t param5, + uint32_t param6, + uint32_t param7, + uint32_t param8, + uint64_t param9) +{ + PTXmandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E_( + param0, param1, param2, param3, param4, param5, param6, param7, param8, param9); +} + +extern "C" __device__ void PTXmandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E_( + float param0, + float param1, + float param2, + float param3, + uint32_t param4, + uint32_t param5, + uint32_t param6, + uint64_t param7, + char param8); + +extern "C" +__global__ void mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E_( + float param0, + float param1, + float param2, + float param3, + uint32_t param4, + uint32_t param5, + uint32_t param6, + uint64_t param7, + char param8) +{ + PTXmandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E_( + param0,param1,param2,param3,param4,param5,param6,param7,param8); +} + +extern "C" __device__ void PTXmandelbrot_ispc( + float param0, + float param1, + float param2, + float param3, + uint32_t param4, + uint32_t param5, + uint32_t param6, + uint64_t param7); +extern "C" +__global__ void mandelbrot_ispc( + float param0, + float param1, + float param2, + float param3, + uint32_t param4, + uint32_t param5, + uint32_t param6, + uint64_t param7) +{ + PTXmandelbrot_ispc( + param0,param1,param2,param3,param4,param5,param6,param7); +}