//
// Generated by LLVM NVPTX Back-End
//
// Module-level directives. They must precede any kernel definition:
//   .version      - PTX ISA version this file is written against.
//   .target       - minimum GPU architecture (sm_20) and texturing mode.
//   .address_size - pointers in this module are 64 bits wide.
//

.version 3.1
.target sm_20, texmode_independent
.address_size 64

// .globl stencil_step_task
|
|
// @stencil_step_task
|
|
.entry stencil_step_task(
|
|
.param .u32 stencil_step_task_param_0,
|
|
.param .u32 stencil_step_task_param_1,
|
|
.param .u32 stencil_step_task_param_2,
|
|
.param .u32 stencil_step_task_param_3,
|
|
.param .u32 stencil_step_task_param_4,
|
|
.param .u32 stencil_step_task_param_5,
|
|
.param .u32 stencil_step_task_param_6,
|
|
.param .u32 stencil_step_task_param_7,
|
|
.param .u64 .ptr .align 8 stencil_step_task_param_8,
|
|
.param .u64 .ptr .align 8 stencil_step_task_param_9,
|
|
.param .u64 .ptr .align 8 stencil_step_task_param_10,
|
|
.param .u64 .ptr .align 8 stencil_step_task_param_11
|
|
)
|
|
{
|
|
.reg .pred %p<396>;
|
|
.reg .s16 %rc<396>;
|
|
.reg .s16 %rs<396>;
|
|
.reg .s32 %r<396>;
|
|
.reg .s64 %rl<396>;
|
|
.reg .f32 %f<396>;
|
|
.reg .f64 %fl<396>;
|
|
|
|
// BB#0: // %allocas
|
|
mov.u32 %r12, %ctaid.x;
|
|
ld.param.u32 %r13, [stencil_step_task_param_4];
|
|
add.s32 %r16, %r12, %r13;
|
|
add.s32 %r0, %r16, 1;
|
|
setp.ge.s32 %p0, %r16, %r0;
|
|
@%p0 bra BB0_11;
|
|
// BB#1: // %for_test28.i.preheader.lr.ph
|
|
ld.param.u32 %r0, [stencil_step_task_param_0];
|
|
ld.param.u32 %r1, [stencil_step_task_param_1];
|
|
ld.param.u32 %r2, [stencil_step_task_param_2];
|
|
ld.param.u32 %r3, [stencil_step_task_param_3];
|
|
ld.param.u32 %r4, [stencil_step_task_param_5];
|
|
ld.param.u32 %r5, [stencil_step_task_param_6];
|
|
mul.lo.s32 %r5, %r5, %r4;
|
|
ld.param.u64 %rl3, [stencil_step_task_param_8];
|
|
ld.f64 %fl0, [%rl3];
|
|
ld.f64 %fl1, [%rl3+8];
|
|
ld.param.u64 %rl0, [stencil_step_task_param_9];
|
|
ld.f64 %fl2, [%rl3+16];
|
|
ld.param.u64 %rl1, [stencil_step_task_param_10];
|
|
ld.param.u64 %rl2, [stencil_step_task_param_11];
|
|
ld.f64 %fl3, [%rl3+24];
|
|
shl.b32 %r6, %r4, 1;
|
|
mul.lo.s32 %r7, %r4, 3;
|
|
mul.lo.s32 %r8, %r4, -3;
|
|
shl.b32 %r9, %r5, 1;
|
|
mul.lo.s32 %r10, %r5, 3;
|
|
mul.lo.s32 %r11, %r5, -3;
|
|
add.s32 %r12, %r12, %r13;
|
|
neg.s32 %r13, %r9;
|
|
neg.s32 %r14, %r6;
|
|
mov.u32 %r32, WARP_SZ;
|
|
BB0_2: // %for_test28.i.preheader
|
|
// =>This Loop Header: Depth=1
|
|
// Child Loop BB0_9 Depth 2
|
|
// Child Loop BB0_5 Depth 3
|
|
mov.u32 %r15, %r16;
|
|
setp.ge.s32 %p0, %r2, %r3;
|
|
@%p0 bra BB0_10;
|
|
// BB#3: // %for_test35.i.preheader.lr.ph
|
|
// in Loop: Header=BB0_2 Depth=1
|
|
setp.lt.s32 %p0, %r0, %r1;
|
|
@%p0 bra BB0_4;
|
|
bra.uni BB0_10;
|
|
BB0_4: // in Loop: Header=BB0_2 Depth=1
|
|
mul.lo.s32 %r16, %r15, %r5;
|
|
mov.u32 %r17, %r2;
|
|
BB0_9: // %for_loop37.i.lr.ph.us
|
|
// Parent Loop BB0_2 Depth=1
|
|
// => This Loop Header: Depth=2
|
|
// Child Loop BB0_5 Depth 3
|
|
mad.lo.s32 %r18, %r17, %r4, %r16;
|
|
add.s32 %r19, %r18, %r4;
|
|
add.s32 %r20, %r18, %r6;
|
|
sub.s32 %r21, %r18, %r4;
|
|
add.s32 %r22, %r18, %r7;
|
|
add.s32 %r23, %r18, %r14;
|
|
add.s32 %r24, %r18, %r5;
|
|
add.s32 %r25, %r18, %r8;
|
|
add.s32 %r26, %r18, %r9;
|
|
sub.s32 %r27, %r18, %r5;
|
|
add.s32 %r28, %r18, %r10;
|
|
add.s32 %r29, %r18, %r13;
|
|
add.s32 %r30, %r18, %r11;
|
|
mov.u32 %r31, %r0;
|
|
BB0_5: // %for_loop37.i.us
|
|
// Parent Loop BB0_2 Depth=1
|
|
// Parent Loop BB0_9 Depth=2
|
|
// => This Inner Loop Header: Depth=3
|
|
mov.u32 %r33, %tid.x;
|
|
add.s32 %r34, %r32, -1;
|
|
and.b32 %r33, %r34, %r33;
|
|
add.s32 %r33, %r33, %r31;
|
|
setp.ge.s32 %p0, %r33, %r1;
|
|
@%p0 bra BB0_7;
|
|
// BB#6: // %pl_dolane.i.us
|
|
// in Loop: Header=BB0_5 Depth=3
|
|
add.s32 %r34, %r18, %r33;
|
|
shl.b32 %r34, %r34, 3;
|
|
add.s32 %r35, %r34, -8;
|
|
cvt.s64.s32 %rl3, %r35;
|
|
add.s64 %rl3, %rl3, %rl1;
|
|
ld.f64 %fl4, [%rl3];
|
|
add.s32 %r35, %r34, 8;
|
|
cvt.s64.s32 %rl3, %r35;
|
|
add.s64 %rl3, %rl3, %rl1;
|
|
ld.f64 %fl5, [%rl3];
|
|
add.s32 %r35, %r34, -16;
|
|
cvt.s64.s32 %rl3, %r35;
|
|
add.s64 %rl3, %rl3, %rl1;
|
|
ld.f64 %fl6, [%rl3];
|
|
add.s32 %r35, %r34, 16;
|
|
cvt.s64.s32 %rl3, %r35;
|
|
add.s64 %rl3, %rl3, %rl1;
|
|
ld.f64 %fl9, [%rl3];
|
|
add.s32 %r35, %r19, %r33;
|
|
shl.b32 %r35, %r35, 3;
|
|
cvt.s64.s32 %rl3, %r35;
|
|
add.s64 %rl3, %rl3, %rl1;
|
|
ld.f64 %fl8, [%rl3];
|
|
add.s32 %r35, %r34, -24;
|
|
cvt.s64.s32 %rl3, %r35;
|
|
add.s64 %rl3, %rl3, %rl1;
|
|
ld.f64 %fl7, [%rl3];
|
|
add.s32 %r35, %r34, 24;
|
|
cvt.s64.s32 %rl3, %r35;
|
|
add.s64 %rl3, %rl3, %rl1;
|
|
ld.f64 %fl10, [%rl3];
|
|
add.s32 %r35, %r20, %r33;
|
|
shl.b32 %r35, %r35, 3;
|
|
cvt.s64.s32 %rl3, %r35;
|
|
add.s64 %rl3, %rl3, %rl1;
|
|
ld.f64 %fl13, [%rl3];
|
|
add.s32 %r35, %r21, %r33;
|
|
shl.b32 %r35, %r35, 3;
|
|
cvt.s64.s32 %rl3, %r35;
|
|
add.s64 %rl3, %rl3, %rl1;
|
|
ld.f64 %fl12, [%rl3];
|
|
add.s32 %r35, %r22, %r33;
|
|
shl.b32 %r35, %r35, 3;
|
|
cvt.s64.s32 %rl3, %r35;
|
|
add.s64 %rl3, %rl3, %rl1;
|
|
ld.f64 %fl11, [%rl3];
|
|
add.s32 %r35, %r23, %r33;
|
|
shl.b32 %r35, %r35, 3;
|
|
cvt.s64.s32 %rl3, %r35;
|
|
add.s64 %rl3, %rl3, %rl1;
|
|
ld.f64 %fl16, [%rl3];
|
|
add.s32 %r35, %r24, %r33;
|
|
shl.b32 %r35, %r35, 3;
|
|
cvt.s64.s32 %rl3, %r35;
|
|
add.s64 %rl3, %rl3, %rl1;
|
|
ld.f64 %fl15, [%rl3];
|
|
add.s32 %r35, %r25, %r33;
|
|
shl.b32 %r35, %r35, 3;
|
|
cvt.s64.s32 %rl3, %r35;
|
|
add.s64 %rl3, %rl3, %rl1;
|
|
ld.f64 %fl14, [%rl3];
|
|
add.s32 %r35, %r26, %r33;
|
|
shl.b32 %r35, %r35, 3;
|
|
cvt.s64.s32 %rl3, %r35;
|
|
add.s64 %rl3, %rl3, %rl1;
|
|
ld.f64 %fl19, [%rl3];
|
|
add.s32 %r35, %r27, %r33;
|
|
shl.b32 %r35, %r35, 3;
|
|
cvt.s64.s32 %rl3, %r35;
|
|
add.s64 %rl3, %rl3, %rl1;
|
|
ld.f64 %fl18, [%rl3];
|
|
add.s32 %r35, %r28, %r33;
|
|
shl.b32 %r35, %r35, 3;
|
|
cvt.s64.s32 %rl3, %r35;
|
|
add.s64 %rl3, %rl3, %rl1;
|
|
ld.f64 %fl17, [%rl3];
|
|
add.s32 %r35, %r29, %r33;
|
|
shl.b32 %r35, %r35, 3;
|
|
cvt.s64.s32 %rl3, %r35;
|
|
add.s64 %rl3, %rl3, %rl1;
|
|
ld.f64 %fl24, [%rl3];
|
|
cvt.s64.s32 %rl4, %r34;
|
|
add.s64 %rl3, %rl4, %rl1;
|
|
ld.f64 %fl21, [%rl3];
|
|
add.s32 %r33, %r30, %r33;
|
|
shl.b32 %r33, %r33, 3;
|
|
cvt.s64.s32 %rl3, %r33;
|
|
add.s64 %rl3, %rl3, %rl1;
|
|
ld.f64 %fl20, [%rl3];
|
|
add.s64 %rl3, %rl4, %rl2;
|
|
ld.f64 %fl23, [%rl3];
|
|
add.s64 %rl4, %rl4, %rl0;
|
|
ld.f64 %fl22, [%rl4];
|
|
add.f64 %fl25, %fl21, %fl21;
|
|
sub.f64 %fl23, %fl25, %fl23;
|
|
add.f64 %fl6, %fl6, %fl9;
|
|
add.f64 %fl6, %fl6, %fl13;
|
|
add.f64 %fl6, %fl6, %fl16;
|
|
add.f64 %fl6, %fl6, %fl19;
|
|
add.f64 %fl6, %fl6, %fl24;
|
|
add.f64 %fl4, %fl4, %fl5;
|
|
add.f64 %fl4, %fl4, %fl8;
|
|
add.f64 %fl4, %fl4, %fl12;
|
|
add.f64 %fl4, %fl4, %fl15;
|
|
add.f64 %fl4, %fl4, %fl18;
|
|
mul.f64 %fl5, %fl0, %fl21;
|
|
fma.rn.f64 %fl4, %fl1, %fl4, %fl5;
|
|
fma.rn.f64 %fl4, %fl2, %fl6, %fl4;
|
|
add.f64 %fl5, %fl7, %fl10;
|
|
add.f64 %fl5, %fl5, %fl11;
|
|
add.f64 %fl5, %fl5, %fl14;
|
|
add.f64 %fl5, %fl5, %fl17;
|
|
add.f64 %fl5, %fl5, %fl20;
|
|
fma.rn.f64 %fl4, %fl3, %fl5, %fl4;
|
|
fma.rn.f64 %fl4, %fl4, %fl22, %fl23;
|
|
st.f64 [%rl3], %fl4;
|
|
BB0_7: // %safe_if_after_true.i.us
|
|
// in Loop: Header=BB0_5 Depth=3
|
|
add.s32 %r31, %r32, %r31;
|
|
setp.lt.s32 %p0, %r31, %r1;
|
|
@%p0 bra BB0_5;
|
|
// BB#8: // %for_exit38.i.us
|
|
// in Loop: Header=BB0_9 Depth=2
|
|
add.s32 %r17, %r17, 1;
|
|
setp.eq.s32 %p0, %r17, %r3;
|
|
@%p0 bra BB0_10;
|
|
bra.uni BB0_9;
|
|
BB0_10: // %for_exit31.i
|
|
// in Loop: Header=BB0_2 Depth=1
|
|
add.s32 %r16, %r15, 1;
|
|
setp.ne.s32 %p0, %r15, %r12;
|
|
@%p0 bra BB0_2;
|
|
BB0_11: // %stencil_step___uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_.exit
|
|
ret;
|
|
}
|
|
|