716 lines
18 KiB
Plaintext
716 lines
18 KiB
Plaintext
//
|
|
// Generated by NVIDIA NVVM Compiler
|
|
// Compiler built on Tue Nov 5 22:34:47 2013 (1383687287)
|
|
// Cuda compilation tools, release 6.0, V6.0.1
|
|
//
|
|
|
|
.version 4.0
|
|
.target sm_35
|
|
.address_size 64
|
|
|
|
|
|
.extern .func (.param .b32 func_retval0) cudaLaunchDevice
|
|
(
|
|
.param .b64 cudaLaunchDevice_param_0,
|
|
.param .b64 cudaLaunchDevice_param_1,
|
|
.param .align 4 .b8 cudaLaunchDevice_param_2[12],
|
|
.param .align 4 .b8 cudaLaunchDevice_param_3[12],
|
|
.param .b32 cudaLaunchDevice_param_4,
|
|
.param .b64 cudaLaunchDevice_param_5
|
|
);
|
|
|
|
|
|
.extern .func (.param .b64 func_retval0) cudaGetParameterBuffer
|
|
(
|
|
.param .b64 cudaGetParameterBuffer_param_0,
|
|
.param .b64 cudaGetParameterBuffer_param_1
|
|
)
|
|
;
|
|
.extern .func (.param .b32 func_retval0) cudaDeviceSynchronize
|
|
(
|
|
|
|
)
|
|
;
|
|
.global .align 1 .b8 constDeltaForeach[32];
|
|
.global .align 1 .b8 constDeltaForeach3[32] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
|
|
|
|
.visible .func PTXmandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E_(
|
|
.param .b32 mandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_0,
|
|
.param .b32 mandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_1,
|
|
.param .b32 mandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_2,
|
|
.param .b32 mandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_3,
|
|
.param .b32 mandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_4,
|
|
.param .b32 mandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_5,
|
|
.param .b32 mandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_6,
|
|
.param .b32 mandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_7,
|
|
.param .b32 mandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_8,
|
|
.param .b64 mandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_9
|
|
)
|
|
{
|
|
.reg .pred %p<110>;
|
|
.reg .f32 %f<77>;
|
|
.reg .s32 %r<104>;
|
|
.reg .s64 %rd<13>;
|
|
|
|
|
|
ld.param.f32 %f34, [mandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_0];
|
|
ld.param.f32 %f35, [mandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_1];
|
|
ld.param.f32 %f36, [mandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_2];
|
|
ld.param.f32 %f37, [mandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_3];
|
|
ld.param.u32 %r37, [mandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_4];
|
|
ld.param.u32 %r38, [mandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_5];
|
|
ld.param.u32 %r41, [mandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_6];
|
|
ld.param.u32 %r39, [mandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_7];
|
|
ld.param.u32 %r40, [mandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_8];
|
|
ld.param.u64 %rd3, [mandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_9];
|
|
mov.u32 %r42, %ctaid.x;
|
|
shl.b32 %r43, %r42, 2;
|
|
mov.u32 %r44, %tid.x;
|
|
shr.s32 %r45, %r44, 5;
|
|
add.s32 %r46, %r45, %r43;
|
|
mul.lo.s32 %r1, %r46, %r41;
|
|
add.s32 %r47, %r1, %r41;
|
|
min.s32 %r2, %r47, %r37;
|
|
mov.u32 %r3, %ctaid.y;
|
|
mul.lo.s32 %r96, %r3, %r39;
|
|
add.s32 %r48, %r96, %r39;
|
|
min.s32 %r49, %r48, %r38;
|
|
sub.s32 %r50, %r2, %r1;
|
|
shr.s32 %r51, %r50, 31;
|
|
shr.u32 %r52, %r51, 27;
|
|
add.s32 %r53, %r50, %r52;
|
|
and.b32 %r54, %r53, -32;
|
|
sub.s32 %r55, %r50, %r54;
|
|
sub.s32 %r5, %r2, %r55;
|
|
and.b32 %r56, %r44, 31;
|
|
cvt.u64.u32 %rd1, %r56;
|
|
mov.u64 %rd4, constDeltaForeach;
|
|
add.s64 %rd5, %rd4, %rd1;
|
|
ld.global.u8 %r6, [%rd5];
|
|
add.s32 %r94, %r96, %r6;
|
|
setp.ge.s32 %p29, %r96, %r49;
|
|
@%p29 bra BB0_26;
|
|
|
|
setp.lt.s32 %p30, %r1, %r5;
|
|
setp.gt.s32 %p1, %r40, 0;
|
|
add.s32 %r57, %r3, 1;
|
|
mul.lo.s32 %r58, %r57, %r39;
|
|
not.b32 %r59, %r58;
|
|
not.b32 %r60, %r38;
|
|
max.s32 %r61, %r60, %r59;
|
|
not.b32 %r10, %r61;
|
|
mov.u64 %rd6, constDeltaForeach3;
|
|
add.s64 %rd2, %rd6, %rd1;
|
|
@%p30 bra BB0_11;
|
|
|
|
mov.u32 %r95, %r94;
|
|
|
|
BB0_3:
|
|
setp.ge.s32 %p31, %r1, %r2;
|
|
@%p31 bra BB0_10;
|
|
|
|
setp.gt.s32 %p34, %r40, 0;
|
|
mov.u32 %r91, 0;
|
|
ld.global.u8 %r66, [%rd2];
|
|
add.s32 %r14, %r1, %r66;
|
|
setp.lt.s32 %p35, %r14, %r2;
|
|
cvt.rn.f32.s32 %f38, %r14;
|
|
fma.rn.ftz.f32 %f5, %f35, %f38, %f34;
|
|
cvt.rn.f32.s32 %f39, %r95;
|
|
fma.rn.ftz.f32 %f6, %f37, %f39, %f36;
|
|
and.pred %p36, %p34, %p35;
|
|
selp.u32 %r63, 1, 0, %p36;
|
|
// inline asm
|
|
{ .reg .pred %p1;
|
|
setp.ne.u32 %p1, %r63, 0;
|
|
vote.ballot.b32 %r62, %p1;
|
|
}
|
|
// inline asm
|
|
setp.eq.s32 %p37, %r62, 0;
|
|
mov.u32 %r92, %r91;
|
|
mov.pred %p33, 0;
|
|
mov.pred %p86, -1;
|
|
mov.pred %p88, %p33;
|
|
mov.f32 %f54, %f5;
|
|
mov.f32 %f58, %f6;
|
|
mov.pred %p103, %p1;
|
|
mov.pred %p104, %p1;
|
|
@%p37 bra BB0_8;
|
|
|
|
BB0_5:
|
|
mov.pred %p2, %p104;
|
|
mov.f32 %f56, %f58;
|
|
mov.f32 %f59, %f56;
|
|
mov.f32 %f52, %f54;
|
|
mov.f32 %f55, %f52;
|
|
mov.pred %p5, %p88;
|
|
mul.ftz.f32 %f9, %f59, %f59;
|
|
mul.ftz.f32 %f10, %f55, %f55;
|
|
add.ftz.f32 %f40, %f9, %f10;
|
|
setp.gtu.ftz.f32 %p39, %f40, 0f40800000;
|
|
and.pred %p40, %p2, %p39;
|
|
or.pred %p6, %p40, %p5;
|
|
setp.ge.s32 %p41, %r14, %r2;
|
|
xor.pred %p42, %p6, %p2;
|
|
not.pred %p43, %p42;
|
|
or.pred %p44, %p43, %p41;
|
|
mov.pred %p87, %p33;
|
|
@%p44 bra BB0_7;
|
|
|
|
and.pred %p45, %p86, %p103;
|
|
not.pred %p46, %p6;
|
|
add.ftz.f32 %f41, %f55, %f55;
|
|
sub.ftz.f32 %f42, %f10, %f9;
|
|
fma.rn.ftz.f32 %f59, %f59, %f41, %f6;
|
|
add.ftz.f32 %f55, %f5, %f42;
|
|
and.pred %p87, %p45, %p46;
|
|
|
|
BB0_7:
|
|
mov.f32 %f13, %f59;
|
|
mov.f32 %f14, %f55;
|
|
mov.pred %p86, %p87;
|
|
add.s32 %r69, %r92, 1;
|
|
selp.b32 %r91, %r69, %r92, %p86;
|
|
setp.lt.s32 %p103, %r91, %r40;
|
|
and.pred %p10, %p86, %p103;
|
|
and.pred %p48, %p10, %p35;
|
|
selp.u32 %r68, 1, 0, %p48;
|
|
// inline asm
|
|
{ .reg .pred %p1;
|
|
setp.ne.u32 %p1, %r68, 0;
|
|
vote.ballot.b32 %r67, %p1;
|
|
}
|
|
// inline asm
|
|
setp.ne.s32 %p49, %r67, 0;
|
|
mov.pred %p88, %p6;
|
|
mov.f32 %f54, %f14;
|
|
mov.f32 %f58, %f13;
|
|
mov.u32 %r92, %r91;
|
|
mov.pred %p104, %p10;
|
|
@%p49 bra BB0_5;
|
|
|
|
BB0_8:
|
|
setp.ge.s32 %p50, %r14, %r2;
|
|
@%p50 bra BB0_10;
|
|
|
|
mad.lo.s32 %r70, %r95, %r37, %r14;
|
|
shl.b32 %r71, %r70, 2;
|
|
cvt.s64.s32 %rd7, %r71;
|
|
add.s64 %rd8, %rd7, %rd3;
|
|
st.u32 [%rd8], %r91;
|
|
|
|
BB0_10:
|
|
add.s32 %r96, %r96, 1;
|
|
add.s32 %r95, %r96, %r6;
|
|
setp.eq.s32 %p51, %r96, %r10;
|
|
@%p51 bra BB0_26;
|
|
bra.uni BB0_3;
|
|
|
|
BB0_11:
|
|
selp.u32 %r73, 1, 0, %p1;
|
|
// inline asm
|
|
{ .reg .pred %p1;
|
|
setp.ne.u32 %p1, %r73, 0;
|
|
vote.ballot.b32 %r72, %p1;
|
|
}
|
|
// inline asm
|
|
ld.global.u8 %r20, [%rd2];
|
|
|
|
BB0_12:
|
|
mov.u32 %r22, %r94;
|
|
cvt.rn.f32.s32 %f43, %r22;
|
|
mul.lo.s32 %r24, %r22, %r37;
|
|
fma.rn.ftz.f32 %f15, %f37, %f43, %f36;
|
|
mov.u32 %r97, %r1;
|
|
|
|
BB0_13:
|
|
mov.u32 %r25, %r97;
|
|
add.s32 %r26, %r25, %r20;
|
|
cvt.rn.f32.s32 %f44, %r26;
|
|
fma.rn.ftz.f32 %f16, %f35, %f44, %f34;
|
|
setp.eq.s32 %p54, %r72, 0;
|
|
mov.u32 %r99, 0;
|
|
mov.u32 %r100, %r99;
|
|
mov.pred %p53, 0;
|
|
mov.pred %p91, -1;
|
|
mov.pred %p93, %p53;
|
|
mov.f32 %f62, %f16;
|
|
mov.pred %p101, %p1;
|
|
mov.pred %p102, %p1;
|
|
mov.f32 %f75, %f15;
|
|
@%p54 bra BB0_17;
|
|
|
|
BB0_14:
|
|
mov.f32 %f71, %f75;
|
|
mov.f32 %f76, %f71;
|
|
mov.pred %p11, %p102;
|
|
mov.f32 %f60, %f62;
|
|
mov.f32 %f63, %f60;
|
|
mov.pred %p14, %p93;
|
|
mul.ftz.f32 %f19, %f76, %f76;
|
|
mul.ftz.f32 %f20, %f63, %f63;
|
|
add.ftz.f32 %f45, %f19, %f20;
|
|
setp.gtu.ftz.f32 %p56, %f45, 0f40800000;
|
|
and.pred %p57, %p11, %p56;
|
|
or.pred %p15, %p57, %p14;
|
|
xor.pred %p58, %p15, %p11;
|
|
mov.pred %p92, %p53;
|
|
@!%p58 bra BB0_16;
|
|
bra.uni BB0_15;
|
|
|
|
BB0_15:
|
|
and.pred %p59, %p91, %p101;
|
|
not.pred %p60, %p15;
|
|
add.ftz.f32 %f46, %f63, %f63;
|
|
sub.ftz.f32 %f47, %f20, %f19;
|
|
fma.rn.ftz.f32 %f76, %f76, %f46, %f15;
|
|
add.ftz.f32 %f63, %f16, %f47;
|
|
and.pred %p92, %p59, %p60;
|
|
|
|
BB0_16:
|
|
mov.f32 %f23, %f76;
|
|
mov.f32 %f24, %f63;
|
|
mov.pred %p91, %p92;
|
|
add.s32 %r78, %r100, 1;
|
|
selp.b32 %r99, %r78, %r100, %p91;
|
|
setp.lt.s32 %p101, %r99, %r40;
|
|
and.pred %p102, %p91, %p101;
|
|
selp.u32 %r77, 1, 0, %p102;
|
|
// inline asm
|
|
{ .reg .pred %p1;
|
|
setp.ne.u32 %p1, %r77, 0;
|
|
vote.ballot.b32 %r76, %p1;
|
|
}
|
|
// inline asm
|
|
setp.ne.s32 %p61, %r76, 0;
|
|
mov.pred %p93, %p15;
|
|
mov.f32 %f62, %f24;
|
|
mov.u32 %r100, %r99;
|
|
mov.f32 %f75, %f23;
|
|
@%p61 bra BB0_14;
|
|
|
|
BB0_17:
|
|
add.s32 %r79, %r26, %r24;
|
|
shl.b32 %r80, %r79, 2;
|
|
cvt.s64.s32 %rd9, %r80;
|
|
add.s64 %rd10, %rd9, %rd3;
|
|
st.u32 [%rd10], %r99;
|
|
add.s32 %r30, %r25, 32;
|
|
setp.lt.s32 %p62, %r30, %r5;
|
|
mov.u32 %r97, %r30;
|
|
@%p62 bra BB0_13;
|
|
|
|
setp.ge.s32 %p63, %r30, %r2;
|
|
@%p63 bra BB0_25;
|
|
|
|
setp.gt.s32 %p66, %r40, 0;
|
|
mov.u32 %r102, 0;
|
|
add.s32 %r31, %r30, %r20;
|
|
setp.lt.s32 %p67, %r31, %r2;
|
|
cvt.rn.f32.s32 %f48, %r31;
|
|
fma.rn.ftz.f32 %f25, %f35, %f48, %f34;
|
|
and.pred %p68, %p66, %p67;
|
|
selp.u32 %r82, 1, 0, %p68;
|
|
// inline asm
|
|
{ .reg .pred %p1;
|
|
setp.ne.u32 %p1, %r82, 0;
|
|
vote.ballot.b32 %r81, %p1;
|
|
}
|
|
// inline asm
|
|
setp.eq.s32 %p69, %r81, 0;
|
|
mov.u32 %r103, %r102;
|
|
mov.pred %p65, 0;
|
|
mov.pred %p107, -1;
|
|
mov.pred %p99, %p1;
|
|
mov.pred %p100, %p1;
|
|
mov.pred %p109, %p65;
|
|
mov.f32 %f66, %f25;
|
|
mov.f32 %f73, %f15;
|
|
@%p69 bra BB0_23;
|
|
|
|
BB0_20:
|
|
mov.f32 %f69, %f73;
|
|
mov.f32 %f74, %f69;
|
|
mov.f32 %f64, %f66;
|
|
mov.f32 %f67, %f64;
|
|
mov.pred %p23, %p109;
|
|
mov.pred %p20, %p100;
|
|
mul.ftz.f32 %f28, %f74, %f74;
|
|
mul.ftz.f32 %f29, %f67, %f67;
|
|
add.ftz.f32 %f49, %f28, %f29;
|
|
setp.gtu.ftz.f32 %p71, %f49, 0f40800000;
|
|
and.pred %p72, %p20, %p71;
|
|
or.pred %p24, %p72, %p23;
|
|
setp.ge.s32 %p73, %r31, %r2;
|
|
xor.pred %p74, %p24, %p20;
|
|
not.pred %p75, %p74;
|
|
or.pred %p76, %p75, %p73;
|
|
mov.pred %p108, %p65;
|
|
@%p76 bra BB0_22;
|
|
|
|
and.pred %p77, %p107, %p99;
|
|
not.pred %p78, %p24;
|
|
add.ftz.f32 %f50, %f67, %f67;
|
|
sub.ftz.f32 %f51, %f29, %f28;
|
|
fma.rn.ftz.f32 %f74, %f74, %f50, %f15;
|
|
add.ftz.f32 %f67, %f25, %f51;
|
|
and.pred %p108, %p77, %p78;
|
|
|
|
BB0_22:
|
|
mov.f32 %f73, %f74;
|
|
mov.f32 %f33, %f67;
|
|
mov.pred %p107, %p108;
|
|
add.s32 %r87, %r103, 1;
|
|
selp.b32 %r102, %r87, %r103, %p107;
|
|
setp.lt.s32 %p99, %r102, %r40;
|
|
and.pred %p100, %p107, %p99;
|
|
and.pred %p80, %p100, %p67;
|
|
selp.u32 %r86, 1, 0, %p80;
|
|
// inline asm
|
|
{ .reg .pred %p1;
|
|
setp.ne.u32 %p1, %r86, 0;
|
|
vote.ballot.b32 %r85, %p1;
|
|
}
|
|
// inline asm
|
|
setp.ne.s32 %p81, %r85, 0;
|
|
mov.pred %p109, %p24;
|
|
mov.f32 %f66, %f33;
|
|
mov.u32 %r103, %r102;
|
|
@%p81 bra BB0_20;
|
|
|
|
BB0_23:
|
|
setp.ge.s32 %p82, %r31, %r2;
|
|
@%p82 bra BB0_25;
|
|
|
|
add.s32 %r88, %r31, %r24;
|
|
shl.b32 %r89, %r88, 2;
|
|
cvt.s64.s32 %rd11, %r89;
|
|
add.s64 %rd12, %rd11, %rd3;
|
|
st.u32 [%rd12], %r102;
|
|
|
|
BB0_25:
|
|
add.s32 %r96, %r96, 1;
|
|
add.s32 %r94, %r96, %r6;
|
|
setp.ne.s32 %p83, %r96, %r10;
|
|
@%p83 bra BB0_12;
|
|
|
|
BB0_26:
|
|
ret;
|
|
}
|
|
|
|
.visible .func PTXmandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E_(
|
|
.param .b32 mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_0,
|
|
.param .b32 mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_1,
|
|
.param .b32 mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_2,
|
|
.param .b32 mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_3,
|
|
.param .b32 mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_4,
|
|
.param .b32 mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_5,
|
|
.param .b32 mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_6,
|
|
.param .b64 mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_7,
|
|
.param .b32 mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_8
|
|
)
|
|
{
|
|
.reg .pred %p<4>;
|
|
.reg .f32 %f<13>;
|
|
.reg .s32 %r<27>;
|
|
.reg .s64 %rd<13>;
|
|
|
|
|
|
ld.param.f32 %f1, [mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_0];
|
|
ld.param.f32 %f2, [mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_1];
|
|
ld.param.f32 %f3, [mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_2];
|
|
ld.param.f32 %f4, [mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_3];
|
|
ld.param.u32 %r2, [mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_4];
|
|
ld.param.u32 %r3, [mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_5];
|
|
ld.param.u32 %r4, [mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_6];
|
|
ld.param.u64 %rd5, [mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_7];
|
|
mov.u32 %r5, %tid.x;
|
|
and.b32 %r1, %r5, 31;
|
|
setp.ne.s32 %p1, %r1, 0;
|
|
mov.u64 %rd12, 0;
|
|
@%p1 bra BB1_3;
|
|
|
|
mov.u64 %rd7, 8;
|
|
mov.u64 %rd8, 48;
|
|
// Callseq Start 0
|
|
{
|
|
.reg .b32 temp_param_reg;
|
|
.param .b64 param0;
|
|
st.param.b64 [param0+0], %rd7;
|
|
.param .b64 param1;
|
|
st.param.b64 [param1+0], %rd8;
|
|
.param .b64 retval0;
|
|
call.uni (retval0),
|
|
cudaGetParameterBuffer,
|
|
(
|
|
param0,
|
|
param1
|
|
);
|
|
ld.param.b64 %rd1, [retval0+0];
|
|
}
|
|
// Callseq End 0
|
|
setp.eq.s64 %p2, %rd1, 0;
|
|
mov.u64 %rd12, %rd1;
|
|
@%p2 bra BB1_3;
|
|
|
|
cvt.rn.f32.s32 %f5, %r2;
|
|
rcp.approx.ftz.f32 %f6, %f5;
|
|
cvt.rn.f32.s32 %f7, %r3;
|
|
rcp.approx.ftz.f32 %f8, %f7;
|
|
sub.ftz.f32 %f9, %f4, %f2;
|
|
mul.ftz.f32 %f10, %f9, %f8;
|
|
sub.ftz.f32 %f11, %f3, %f1;
|
|
mul.ftz.f32 %f12, %f11, %f6;
|
|
st.f32 [%rd1], %f1;
|
|
st.f32 [%rd1+4], %f12;
|
|
st.f32 [%rd1+8], %f2;
|
|
st.f32 [%rd1+12], %f10;
|
|
st.u32 [%rd1+16], %r2;
|
|
st.u32 [%rd1+20], %r3;
|
|
mov.u32 %r6, 64;
|
|
st.u32 [%rd1+24], %r6;
|
|
mov.u32 %r7, 8;
|
|
st.u32 [%rd1+28], %r7;
|
|
st.u32 [%rd1+32], %r4;
|
|
st.u64 [%rd1+40], %rd5;
|
|
mov.u64 %rd12, %rd1;
|
|
|
|
BB1_3:
|
|
@%p1 bra BB1_5;
|
|
|
|
shr.s32 %r16, %r2, 31;
|
|
shr.u32 %r17, %r16, 26;
|
|
add.s32 %r18, %r2, %r17;
|
|
shr.s32 %r19, %r18, 6;
|
|
shr.s32 %r20, %r3, 31;
|
|
shr.u32 %r21, %r20, 29;
|
|
add.s32 %r22, %r3, %r21;
|
|
shr.s32 %r10, %r22, 3;
|
|
add.s32 %r23, %r19, -1;
|
|
shr.s32 %r24, %r23, 2;
|
|
add.s32 %r9, %r24, 1;
|
|
mov.u32 %r14, 1;
|
|
mov.u32 %r12, 128;
|
|
mov.u32 %r15, 0;
|
|
mov.u64 %rd11, 0;
|
|
mov.u64 %rd9, PTXmandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E_;
|
|
// inline asm
|
|
{
|
|
.param .b64 param0;
|
|
st.param.b64 [param0+0], %rd9;
|
|
.param .b64 param1;
|
|
st.param.b64 [param1+0], %rd12;
|
|
.param .align 4 .b8 param2[12];
|
|
st.param.b32 [param2+0], %r9;
|
|
st.param.b32 [param2+4], %r10;
|
|
st.param.b32 [param2+8], %r14;
|
|
.param .align 4 .b8 param3[12];
|
|
st.param.b32 [param3+0], %r12;
|
|
st.param.b32 [param3+4], %r14;
|
|
st.param.b32 [param3+8], %r14;
|
|
.param .b32 param4;
|
|
st.param.b32 [param4+0], %r15;
|
|
.param .b64 param5;
|
|
st.param.b64 [param5+0], %rd11;
|
|
|
|
.param .b32 retval0;
|
|
call.uni (retval0),
|
|
cudaLaunchDevice,
|
|
(
|
|
param0,
|
|
param1,
|
|
param2,
|
|
param3,
|
|
param4,
|
|
param5
|
|
);
|
|
ld.param.b32 %r8, [retval0+0];
|
|
}
|
|
|
|
// inline asm
|
|
|
|
BB1_5:
|
|
// Callseq Start 1
|
|
{
|
|
.reg .b32 temp_param_reg;
|
|
.param .b32 retval0;
|
|
call.uni (retval0),
|
|
cudaDeviceSynchronize,
|
|
(
|
|
);
|
|
ld.param.b32 %r25, [retval0+0];
|
|
}
|
|
// Callseq End 1
|
|
// Callseq Start 2
|
|
{
|
|
.reg .b32 temp_param_reg;
|
|
.param .b32 retval0;
|
|
call.uni (retval0),
|
|
cudaDeviceSynchronize,
|
|
(
|
|
);
|
|
ld.param.b32 %r26, [retval0+0];
|
|
}
|
|
// Callseq End 2
|
|
ret;
|
|
}
|
|
|
|
.visible .func PTXmandelbrot_ispc(
|
|
.param .b32 mandelbrot_ispc_param_0,
|
|
.param .b32 mandelbrot_ispc_param_1,
|
|
.param .b32 mandelbrot_ispc_param_2,
|
|
.param .b32 mandelbrot_ispc_param_3,
|
|
.param .b32 mandelbrot_ispc_param_4,
|
|
.param .b32 mandelbrot_ispc_param_5,
|
|
.param .b32 mandelbrot_ispc_param_6,
|
|
.param .b64 mandelbrot_ispc_param_7
|
|
)
|
|
{
|
|
.reg .pred %p<4>;
|
|
.reg .f32 %f<13>;
|
|
.reg .s32 %r<27>;
|
|
.reg .s64 %rd<13>;
|
|
|
|
|
|
ld.param.f32 %f1, [mandelbrot_ispc_param_0];
|
|
ld.param.f32 %f2, [mandelbrot_ispc_param_1];
|
|
ld.param.f32 %f3, [mandelbrot_ispc_param_2];
|
|
ld.param.f32 %f4, [mandelbrot_ispc_param_3];
|
|
ld.param.u32 %r2, [mandelbrot_ispc_param_4];
|
|
ld.param.u32 %r3, [mandelbrot_ispc_param_5];
|
|
ld.param.u32 %r4, [mandelbrot_ispc_param_6];
|
|
ld.param.u64 %rd5, [mandelbrot_ispc_param_7];
|
|
mov.u32 %r5, %tid.x;
|
|
and.b32 %r1, %r5, 31;
|
|
setp.ne.s32 %p1, %r1, 0;
|
|
mov.u64 %rd12, 0;
|
|
@%p1 bra BB2_3;
|
|
|
|
mov.u64 %rd7, 8;
|
|
mov.u64 %rd8, 48;
|
|
// Callseq Start 3
|
|
{
|
|
.reg .b32 temp_param_reg;
|
|
.param .b64 param0;
|
|
st.param.b64 [param0+0], %rd7;
|
|
.param .b64 param1;
|
|
st.param.b64 [param1+0], %rd8;
|
|
.param .b64 retval0;
|
|
call.uni (retval0),
|
|
cudaGetParameterBuffer,
|
|
(
|
|
param0,
|
|
param1
|
|
);
|
|
ld.param.b64 %rd1, [retval0+0];
|
|
}
|
|
// Callseq End 3
|
|
setp.eq.s64 %p2, %rd1, 0;
|
|
mov.u64 %rd12, %rd1;
|
|
@%p2 bra BB2_3;
|
|
|
|
cvt.rn.f32.s32 %f5, %r2;
|
|
rcp.approx.ftz.f32 %f6, %f5;
|
|
cvt.rn.f32.s32 %f7, %r3;
|
|
rcp.approx.ftz.f32 %f8, %f7;
|
|
sub.ftz.f32 %f9, %f4, %f2;
|
|
mul.ftz.f32 %f10, %f9, %f8;
|
|
sub.ftz.f32 %f11, %f3, %f1;
|
|
mul.ftz.f32 %f12, %f11, %f6;
|
|
st.f32 [%rd1], %f1;
|
|
st.f32 [%rd1+4], %f12;
|
|
st.f32 [%rd1+8], %f2;
|
|
st.f32 [%rd1+12], %f10;
|
|
st.u32 [%rd1+16], %r2;
|
|
st.u32 [%rd1+20], %r3;
|
|
mov.u32 %r6, 64;
|
|
st.u32 [%rd1+24], %r6;
|
|
mov.u32 %r7, 8;
|
|
st.u32 [%rd1+28], %r7;
|
|
st.u32 [%rd1+32], %r4;
|
|
st.u64 [%rd1+40], %rd5;
|
|
mov.u64 %rd12, %rd1;
|
|
|
|
BB2_3:
|
|
@%p1 bra BB2_5;
|
|
|
|
shr.s32 %r16, %r2, 31;
|
|
shr.u32 %r17, %r16, 26;
|
|
add.s32 %r18, %r2, %r17;
|
|
shr.s32 %r19, %r18, 6;
|
|
shr.s32 %r20, %r3, 31;
|
|
shr.u32 %r21, %r20, 29;
|
|
add.s32 %r22, %r3, %r21;
|
|
shr.s32 %r10, %r22, 3;
|
|
add.s32 %r23, %r19, -1;
|
|
shr.s32 %r24, %r23, 2;
|
|
add.s32 %r9, %r24, 1;
|
|
mov.u32 %r14, 1;
|
|
mov.u32 %r12, 128;
|
|
mov.u32 %r15, 0;
|
|
mov.u64 %rd11, 0;
|
|
mov.u64 %rd9, PTXmandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E_;
|
|
// inline asm
|
|
{
|
|
.param .b64 param0;
|
|
st.param.b64 [param0+0], %rd9;
|
|
.param .b64 param1;
|
|
st.param.b64 [param1+0], %rd12;
|
|
.param .align 4 .b8 param2[12];
|
|
st.param.b32 [param2+0], %r9;
|
|
st.param.b32 [param2+4], %r10;
|
|
st.param.b32 [param2+8], %r14;
|
|
.param .align 4 .b8 param3[12];
|
|
st.param.b32 [param3+0], %r12;
|
|
st.param.b32 [param3+4], %r14;
|
|
st.param.b32 [param3+8], %r14;
|
|
.param .b32 param4;
|
|
st.param.b32 [param4+0], %r15;
|
|
.param .b64 param5;
|
|
st.param.b64 [param5+0], %rd11;
|
|
|
|
.param .b32 retval0;
|
|
call.uni (retval0),
|
|
cudaLaunchDevice,
|
|
(
|
|
param0,
|
|
param1,
|
|
param2,
|
|
param3,
|
|
param4,
|
|
param5
|
|
);
|
|
ld.param.b32 %r8, [retval0+0];
|
|
}
|
|
|
|
// inline asm
|
|
|
|
BB2_5:
|
|
// Callseq Start 4
|
|
{
|
|
.reg .b32 temp_param_reg;
|
|
.param .b32 retval0;
|
|
call.uni (retval0),
|
|
cudaDeviceSynchronize,
|
|
(
|
|
);
|
|
ld.param.b32 %r25, [retval0+0];
|
|
}
|
|
// Callseq End 4
|
|
// Callseq Start 5
|
|
{
|
|
.reg .b32 temp_param_reg;
|
|
.param .b32 retval0;
|
|
call.uni (retval0),
|
|
cudaDeviceSynchronize,
|
|
(
|
|
);
|
|
ld.param.b32 %r26, [retval0+0];
|
|
}
|
|
// Callseq End 5
|
|
ret;
|
|
}
|
|
|
|
|
|
|