// Files
// ispc/examples/stencil/stencil2.ptx
// 2013-11-08 14:17:26 +01:00
//
// 248 lines
// 7.8 KiB
// Plaintext

//
// Generated by LLVM NVPTX Back-End
//
.version 3.1
.target sm_20, texmode_independent
.address_size 64
// .globl stencil_step_task
// @stencil_step_task
// Kernel stencil_step_task.
//
// For every element index idx it visits, the kernel stores
//   out[idx] = (w0*in[idx] + w1*S1 + w2*S2 + w3*S3) * scale[idx]
//              + (2*in[idx] - out[idx])
// where Sk (k = 1, 2, 3) is the sum of the six values of in[] at
// idx +/- k, idx +/- k*row and idx +/- k*plane, and
//   row   = param_5
//   plane = param_6 * param_5
//   idx   = outer*plane + mid*row + inner
// w0*in[idx] is a plain mul.f64; the remaining weighted terms and the final
// scaling are accumulated with fma.rn.f64.
//
// Parameter roles, as used by the code below:
//   param_0 / param_1   start / exclusive end of the inner index
//   param_2 / param_3   start / exclusive end of the middle index
//   param_4             added to %ctaid.x to form the outer index
//   param_5             row stride in elements
//   param_6             multiplied by param_5 to form the plane stride
//   param_7             not read by this kernel
//   param_8             pointer to the four f64 weights w0..w3
//   param_9             pointer to scale[]
//   param_10            pointer to in[]
//   param_11            pointer to out[] (read, then overwritten)
// NOTE(review): these presumably map to x0, x1, y0, y1, z0, Nx, Ny, Nz, coef,
// vsq, Ain, Aout of the ISPC stencil example -- confirm against the ISPC source.
//
// Work split: each CTA handles the single outer index %ctaid.x + param_4.
// Inside a CTA the inner index advances by WARP_SZ per step and each thread
// takes the element at offset (%tid.x & (WARP_SZ-1)) within that step.
// NOTE(review): threads whose %tid.x differ by a multiple of WARP_SZ get the
// same offset and would repeat the same read-modify-write of out[]; this
// presumably expects WARP_SZ threads per CTA -- confirm against the launch code.
//
// NOTE(review): byte offsets are formed in 32 bits (shl.b32 by 3) and only then
// sign-extended to 64 bits, so they wrap once idx*8 leaves the s32 range.
.entry stencil_step_task(
.param .u32 stencil_step_task_param_0,
.param .u32 stencil_step_task_param_1,
.param .u32 stencil_step_task_param_2,
.param .u32 stencil_step_task_param_3,
.param .u32 stencil_step_task_param_4,
.param .u32 stencil_step_task_param_5,
.param .u32 stencil_step_task_param_6,
.param .u32 stencil_step_task_param_7,
.param .u64 .ptr .align 8 stencil_step_task_param_8,
.param .u64 .ptr .align 8 stencil_step_task_param_9,
.param .u64 .ptr .align 8 stencil_step_task_param_10,
.param .u64 .ptr .align 8 stencil_step_task_param_11
)
{
// Virtual register banks. Of these, only %p0, %r0-%r35, %rl0-%rl4 and
// %fl0-%fl25 are referenced below.
.reg .pred %p<396>;
.reg .s16 %rc<396>;
.reg .s16 %rs<396>;
.reg .s32 %r<396>;
.reg .s64 %rl<396>;
.reg .f32 %f<396>;
.reg .f64 %fl<396>;
// BB#0: // %allocas
// %r16 = outer index = %ctaid.x + param_4. %r0 is scratch here (outer + 1).
// The branch to the exit is taken only if outer + 1 wraps around.
mov.u32 %r12, %ctaid.x;
ld.param.u32 %r13, [stencil_step_task_param_4];
add.s32 %r16, %r12, %r13;
add.s32 %r0, %r16, 1;
setp.ge.s32 %p0, %r16, %r0;
@%p0 bra BB0_11;
// BB#1: // %for_test28.i.preheader.lr.ph
// Loop-invariant setup. Register roles from here on:
//   %r0, %r1   inner start / end        %r2, %r3   middle start / end
//   %r4        row                      %r5        plane (param_6 * param_5)
//   %fl0-%fl3  weights w0..w3
//   %rl0       scale[] base   %rl1  in[] base   %rl2  out[] base
ld.param.u32 %r0, [stencil_step_task_param_0];
ld.param.u32 %r1, [stencil_step_task_param_1];
ld.param.u32 %r2, [stencil_step_task_param_2];
ld.param.u32 %r3, [stencil_step_task_param_3];
ld.param.u32 %r4, [stencil_step_task_param_5];
ld.param.u32 %r5, [stencil_step_task_param_6];
mul.lo.s32 %r5, %r5, %r4;
ld.param.u64 %rl3, [stencil_step_task_param_8];
ld.f64 %fl0, [%rl3];
ld.f64 %fl1, [%rl3+8];
ld.param.u64 %rl0, [stencil_step_task_param_9];
ld.f64 %fl2, [%rl3+16];
ld.param.u64 %rl1, [stencil_step_task_param_10];
ld.param.u64 %rl2, [stencil_step_task_param_11];
ld.f64 %fl3, [%rl3+24];
// Precomputed stride multiples:
//   %r6 = 2*row    %r7  = 3*row    %r8  = -3*row    %r14 = -2*row
//   %r9 = 2*plane  %r10 = 3*plane  %r11 = -3*plane  %r13 = -2*plane
// %r12 is recomputed as %ctaid.x + param_4 (the starting outer index) before
// %r13 is reused for -2*plane.
shl.b32 %r6, %r4, 1;
mul.lo.s32 %r7, %r4, 3;
mul.lo.s32 %r8, %r4, -3;
shl.b32 %r9, %r5, 1;
mul.lo.s32 %r10, %r5, 3;
mul.lo.s32 %r11, %r5, -3;
add.s32 %r12, %r12, %r13;
neg.s32 %r13, %r9;
neg.s32 %r14, %r6;
// %r32 = WARP_SZ, used as the inner-loop step and to build the lane mask.
mov.u32 %r32, WARP_SZ;
BB0_2: // %for_test28.i.preheader
// =>This Loop Header: Depth=1
// Child Loop BB0_9 Depth 2
// Child Loop BB0_5 Depth 3
// %r15 = current outer index. Skip all work if the middle range is empty.
mov.u32 %r15, %r16;
setp.ge.s32 %p0, %r2, %r3;
@%p0 bra BB0_10;
// BB#3: // %for_test35.i.preheader.lr.ph
// in Loop: Header=BB0_2 Depth=1
// Likewise skip if the inner range is empty.
setp.lt.s32 %p0, %r0, %r1;
@%p0 bra BB0_4;
bra.uni BB0_10;
BB0_4: // in Loop: Header=BB0_2 Depth=1
// %r16 is reused as outer*plane; %r17 = middle index, starting at param_2.
mul.lo.s32 %r16, %r15, %r5;
mov.u32 %r17, %r2;
BB0_9: // %for_loop37.i.lr.ph.us
// Parent Loop BB0_2 Depth=1
// => This Loop Header: Depth=2
// Child Loop BB0_5 Depth 3
// %r18 = mid*row + outer*plane, the element index of inner position 0 in this
// row. %r19-%r30 are that base shifted by each row/plane neighbour offset:
//   %r19 +row    %r20 +2*row    %r22 +3*row
//   %r21 -row    %r23 -2*row    %r25 -3*row
//   %r24 +plane  %r26 +2*plane  %r28 +3*plane
//   %r27 -plane  %r29 -2*plane  %r30 -3*plane
// %r31 = inner index of the current step, starting at param_0.
mad.lo.s32 %r18, %r17, %r4, %r16;
add.s32 %r19, %r18, %r4;
add.s32 %r20, %r18, %r6;
sub.s32 %r21, %r18, %r4;
add.s32 %r22, %r18, %r7;
add.s32 %r23, %r18, %r14;
add.s32 %r24, %r18, %r5;
add.s32 %r25, %r18, %r8;
add.s32 %r26, %r18, %r9;
sub.s32 %r27, %r18, %r5;
add.s32 %r28, %r18, %r10;
add.s32 %r29, %r18, %r13;
add.s32 %r30, %r18, %r11;
mov.u32 %r31, %r0;
BB0_5: // %for_loop37.i.us
// Parent Loop BB0_2 Depth=1
// Parent Loop BB0_9 Depth=2
// => This Inner Loop Header: Depth=3
// %r33 = this thread's inner index = %r31 + (%tid.x & (WARP_SZ-1)).
// Threads whose index falls at or beyond param_1 skip the element body.
mov.u32 %r33, %tid.x;
add.s32 %r34, %r32, -1;
and.b32 %r33, %r34, %r33;
add.s32 %r33, %r33, %r31;
setp.ge.s32 %p0, %r33, %r1;
@%p0 bra BB0_7;
// BB#6: // %pl_dolane.i.us
// in Loop: Header=BB0_5 Depth=3
// %r34 = idx*8, the 32-bit byte offset of the centre element.
// %r35 and %rl3 are scratch for each neighbour address below.
add.s32 %r34, %r18, %r33;
shl.b32 %r34, %r34, 3;
// %fl4 = in[idx - 1]
add.s32 %r35, %r34, -8;
cvt.s64.s32 %rl3, %r35;
add.s64 %rl3, %rl3, %rl1;
ld.f64 %fl4, [%rl3];
// %fl5 = in[idx + 1]
add.s32 %r35, %r34, 8;
cvt.s64.s32 %rl3, %r35;
add.s64 %rl3, %rl3, %rl1;
ld.f64 %fl5, [%rl3];
// %fl6 = in[idx - 2]
add.s32 %r35, %r34, -16;
cvt.s64.s32 %rl3, %r35;
add.s64 %rl3, %rl3, %rl1;
ld.f64 %fl6, [%rl3];
// %fl9 = in[idx + 2]
add.s32 %r35, %r34, 16;
cvt.s64.s32 %rl3, %r35;
add.s64 %rl3, %rl3, %rl1;
ld.f64 %fl9, [%rl3];
// %fl8 = in[idx + row]
add.s32 %r35, %r19, %r33;
shl.b32 %r35, %r35, 3;
cvt.s64.s32 %rl3, %r35;
add.s64 %rl3, %rl3, %rl1;
ld.f64 %fl8, [%rl3];
// %fl7 = in[idx - 3]
add.s32 %r35, %r34, -24;
cvt.s64.s32 %rl3, %r35;
add.s64 %rl3, %rl3, %rl1;
ld.f64 %fl7, [%rl3];
// %fl10 = in[idx + 3]
add.s32 %r35, %r34, 24;
cvt.s64.s32 %rl3, %r35;
add.s64 %rl3, %rl3, %rl1;
ld.f64 %fl10, [%rl3];
// %fl13 = in[idx + 2*row]
add.s32 %r35, %r20, %r33;
shl.b32 %r35, %r35, 3;
cvt.s64.s32 %rl3, %r35;
add.s64 %rl3, %rl3, %rl1;
ld.f64 %fl13, [%rl3];
// %fl12 = in[idx - row]
add.s32 %r35, %r21, %r33;
shl.b32 %r35, %r35, 3;
cvt.s64.s32 %rl3, %r35;
add.s64 %rl3, %rl3, %rl1;
ld.f64 %fl12, [%rl3];
// %fl11 = in[idx + 3*row]
add.s32 %r35, %r22, %r33;
shl.b32 %r35, %r35, 3;
cvt.s64.s32 %rl3, %r35;
add.s64 %rl3, %rl3, %rl1;
ld.f64 %fl11, [%rl3];
// %fl16 = in[idx - 2*row]
add.s32 %r35, %r23, %r33;
shl.b32 %r35, %r35, 3;
cvt.s64.s32 %rl3, %r35;
add.s64 %rl3, %rl3, %rl1;
ld.f64 %fl16, [%rl3];
// %fl15 = in[idx + plane]
add.s32 %r35, %r24, %r33;
shl.b32 %r35, %r35, 3;
cvt.s64.s32 %rl3, %r35;
add.s64 %rl3, %rl3, %rl1;
ld.f64 %fl15, [%rl3];
// %fl14 = in[idx - 3*row]
add.s32 %r35, %r25, %r33;
shl.b32 %r35, %r35, 3;
cvt.s64.s32 %rl3, %r35;
add.s64 %rl3, %rl3, %rl1;
ld.f64 %fl14, [%rl3];
// %fl19 = in[idx + 2*plane]
add.s32 %r35, %r26, %r33;
shl.b32 %r35, %r35, 3;
cvt.s64.s32 %rl3, %r35;
add.s64 %rl3, %rl3, %rl1;
ld.f64 %fl19, [%rl3];
// %fl18 = in[idx - plane]
add.s32 %r35, %r27, %r33;
shl.b32 %r35, %r35, 3;
cvt.s64.s32 %rl3, %r35;
add.s64 %rl3, %rl3, %rl1;
ld.f64 %fl18, [%rl3];
// %fl17 = in[idx + 3*plane]
add.s32 %r35, %r28, %r33;
shl.b32 %r35, %r35, 3;
cvt.s64.s32 %rl3, %r35;
add.s64 %rl3, %rl3, %rl1;
ld.f64 %fl17, [%rl3];
// %fl24 = in[idx - 2*plane]
add.s32 %r35, %r29, %r33;
shl.b32 %r35, %r35, 3;
cvt.s64.s32 %rl3, %r35;
add.s64 %rl3, %rl3, %rl1;
ld.f64 %fl24, [%rl3];
// %rl4 = sign-extended centre byte offset, shared by the three centre
// accesses that follow. %fl21 = in[idx]
cvt.s64.s32 %rl4, %r34;
add.s64 %rl3, %rl4, %rl1;
ld.f64 %fl21, [%rl3];
// %fl20 = in[idx - 3*plane]. %r33 is overwritten here; the inner index is
// not needed again in this iteration.
add.s32 %r33, %r30, %r33;
shl.b32 %r33, %r33, 3;
cvt.s64.s32 %rl3, %r33;
add.s64 %rl3, %rl3, %rl1;
ld.f64 %fl20, [%rl3];
// %fl23 = out[idx]. %rl3 keeps the address of out[idx] until the final store.
add.s64 %rl3, %rl4, %rl2;
ld.f64 %fl23, [%rl3];
// %fl22 = scale[idx]
add.s64 %rl4, %rl4, %rl0;
ld.f64 %fl22, [%rl4];
// %fl23 = 2*in[idx] - out[idx]
add.f64 %fl25, %fl21, %fl21;
sub.f64 %fl23, %fl25, %fl23;
// %fl6 = S2: sum of the six neighbours at distance 2
add.f64 %fl6, %fl6, %fl9;
add.f64 %fl6, %fl6, %fl13;
add.f64 %fl6, %fl6, %fl16;
add.f64 %fl6, %fl6, %fl19;
add.f64 %fl6, %fl6, %fl24;
// %fl4 = S1: sum of the six neighbours at distance 1
add.f64 %fl4, %fl4, %fl5;
add.f64 %fl4, %fl4, %fl8;
add.f64 %fl4, %fl4, %fl12;
add.f64 %fl4, %fl4, %fl15;
add.f64 %fl4, %fl4, %fl18;
// %fl4 = w0*in[idx] + w1*S1 + w2*S2
mul.f64 %fl5, %fl0, %fl21;
fma.rn.f64 %fl4, %fl1, %fl4, %fl5;
fma.rn.f64 %fl4, %fl2, %fl6, %fl4;
// %fl5 = S3: sum of the six neighbours at distance 3
add.f64 %fl5, %fl7, %fl10;
add.f64 %fl5, %fl5, %fl11;
add.f64 %fl5, %fl5, %fl14;
add.f64 %fl5, %fl5, %fl17;
add.f64 %fl5, %fl5, %fl20;
// Add w3*S3, then scale by scale[idx] and add (2*in[idx] - out[idx]).
fma.rn.f64 %fl4, %fl3, %fl5, %fl4;
fma.rn.f64 %fl4, %fl4, %fl22, %fl23;
// out[idx] = result
st.f64 [%rl3], %fl4;
BB0_7: // %safe_if_after_true.i.us
// in Loop: Header=BB0_5 Depth=3
// Advance the inner index by WARP_SZ and repeat while it is below param_1.
add.s32 %r31, %r32, %r31;
setp.lt.s32 %p0, %r31, %r1;
@%p0 bra BB0_5;
// BB#8: // %for_exit38.i.us
// in Loop: Header=BB0_9 Depth=2
// Next middle index; leave the middle loop once it reaches param_3.
add.s32 %r17, %r17, 1;
setp.eq.s32 %p0, %r17, %r3;
@%p0 bra BB0_10;
bra.uni BB0_9;
BB0_10: // %for_exit31.i
// in Loop: Header=BB0_2 Depth=1
// Outer-loop latch. %r12 holds the starting outer index, which is also the
// value %r15 has on the first pass, so the back edge is not taken and the
// outer body runs once per CTA.
add.s32 %r16, %r15, 1;
setp.ne.s32 %p0, %r15, %r12;
@%p0 bra BB0_2;
BB0_11: // %stencil_step___uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_.exit
ret;
}