Files
ispc/examples/stencil/stencil.ptx
2013-11-08 14:17:26 +01:00

268 lines
6.4 KiB
Plaintext

//
// Generated by NVIDIA NVVM Compiler
// Compiler built on Thu Jul 18 02:37:37 2013 (1374107857)
// Cuda compilation tools, release 5.5, V5.5.0
//
.version 3.2
.target sm_35
.address_size 64
.file 1 "/home/evghenii/soft/ispc-code/ispc/examples/stencil/stencil.cu", 1383254912, 2112
.file 2 "/usr/local/cuda-5.5/bin/..//include/cuda_device_runtime_api.h", 1375338991, 7655
// cudaMalloc -- weak device-side stub.
// Neither parameter is read; the function unconditionally writes the
// 32-bit value 30 to its return slot and returns.
// NOTE(review): 30 is presumably cudaErrorUnknown in this toolkit's
// cudaError_t enum -- confirm against the CUDA 5.5 headers.
.weak .func (.param .b32 func_retval0) cudaMalloc(
    .param .b64 cudaMalloc_param_0,
    .param .b64 cudaMalloc_param_1
)
{
    .reg .s32 %errcode;

    mov.u32 %errcode, 30;
    st.param.b32 [func_retval0+0], %errcode;
    .loc 2 66 3
    ret;
}
// cudaFuncGetAttributes -- weak device-side stub.
// Neither parameter is read; the function unconditionally writes the
// 32-bit value 30 to its return slot and returns.
// NOTE(review): 30 is presumably cudaErrorUnknown in this toolkit's
// cudaError_t enum -- confirm against the CUDA 5.5 headers.
.weak .func (.param .b32 func_retval0) cudaFuncGetAttributes(
    .param .b64 cudaFuncGetAttributes_param_0,
    .param .b64 cudaFuncGetAttributes_param_1
)
{
    .reg .s32 %errcode;

    mov.u32 %errcode, 30;
    st.param.b32 [func_retval0+0], %errcode;
    .loc 2 71 3
    ret;
}
// -----------------------------------------------------------------------
// stencil_step_task -- entry kernel: one time-step of a radius-3 stencil
// over f64 data, for the single z-plane selected by %ctaid.x.
//
// Parameters, described by how the code below uses them:
//   param_0  (u32) start value of the innermost (x) loop counter
//   param_1  (u32) exclusive upper bound of x; also the store guard
//   param_2  (u32) start value of the middle (y) loop counter
//   param_3  (u32) exclusive upper bound of y
//   param_4  (u32) added to %ctaid.x to form the plane index z
//   param_5  (u32) element stride between consecutive y rows
//   param_6  (u32) multiplied by param_5 to form the stride between planes
//   param_7  (u32) declared but never loaded
//   param_8  (u64) pointer to four f64 coefficients (offsets 0, 8, 16, 24)
//   param_9  (u64) f64 array read at [idx]; scales the stencil sum
//   param_10 (u64) f64 input array; centre and all neighbours are read here
//   param_11 (u64) f64 array read and then overwritten at [idx]
// All four pointers are converted with cvta.to.global before use.
// NOTE(review): names such as x0/x1, y0/y1, z0, Nx, Ny, coef, vsq, Ain,
// Aout are presumed from usage -- confirm against stencil.cu.
//
// Per element, with idx = x + y*param_5 + z*(param_5*param_6):
//   div       = c0*in[idx]
//             + c1*(sum of the 6 neighbours at distance 1 along x, y, z)
//             + c2*(sum of the 6 neighbours at distance 2)
//             + c3*(sum of the 6 neighbours at distance 3)
//   out[idx]  = 2*in[idx] - out[idx] + p9[idx]*div      (only if x < param_1)
//
// NOTE(review): every in[] load is issued BEFORE the x < param_1 test, so
// lanes past the bound still read the input array -- confirm the caller
// pads the array or accepts this.
// NOTE(review): x advances by a hard-coded 32 per iteration; presumably
// the launch uses 32 threads along x -- TODO confirm launch configuration.
// -----------------------------------------------------------------------
.visible .entry stencil_step_task(
.param .u32 stencil_step_task_param_0,
.param .u32 stencil_step_task_param_1,
.param .u32 stencil_step_task_param_2,
.param .u32 stencil_step_task_param_3,
.param .u32 stencil_step_task_param_4,
.param .u32 stencil_step_task_param_5,
.param .u32 stencil_step_task_param_6,
.param .u32 stencil_step_task_param_7,
.param .u64 stencil_step_task_param_8,
.param .u64 stencil_step_task_param_9,
.param .u64 stencil_step_task_param_10,
.param .u64 stencil_step_task_param_11
)
{
.reg .pred %p<8>;
.reg .s32 %r<54>;
.reg .s64 %rd<36>;
.reg .f64 %fd<48>;
// ---- Load kernel arguments (param_7 is intentionally absent: unused) ----
// %r19/%r20 = x start/bound, %r21/%r22 = y start/bound, %r23 = z offset,
// %r24 = row stride, %r25 = rows per plane.
ld.param.u32 %r19, [stencil_step_task_param_0];
ld.param.u32 %r20, [stencil_step_task_param_1];
ld.param.u32 %r21, [stencil_step_task_param_2];
ld.param.u32 %r22, [stencil_step_task_param_3];
ld.param.u32 %r23, [stencil_step_task_param_4];
ld.param.u32 %r24, [stencil_step_task_param_5];
ld.param.u32 %r25, [stencil_step_task_param_6];
// %rd4 = coefficient pointer, %rd1 = param_9 array, %rd2 = input array,
// %rd3 = output array.
ld.param.u64 %rd4, [stencil_step_task_param_8];
ld.param.u64 %rd1, [stencil_step_task_param_9];
ld.param.u64 %rd2, [stencil_step_task_param_10];
ld.param.u64 %rd3, [stencil_step_task_param_11];
cvta.to.global.u64 %rd5, %rd4;
.loc 1 59 1
// z = %ctaid.x + param_4 (kept in %r51); %r27 = z + 1 is the loop bound,
// so the outer loop covers exactly one plane per block.
mov.u32 %r26, %ctaid.x;
add.s32 %r51, %r26, %r23;
add.s32 %r27, %r51, 1;
// ---- Load the four stencil coefficients c0..c3 into %fd1..%fd4 ----
.loc 1 18 1
ld.global.f64 %fd1, [%rd5];
.loc 1 19 1
ld.global.f64 %fd2, [%rd5+8];
.loc 1 20 1
ld.global.f64 %fd3, [%rd5+16];
.loc 1 21 1
ld.global.f64 %fd4, [%rd5+24];
.loc 1 22 1
// Skip everything if z >= z + 1 (only possible if the add above wrapped).
setp.ge.s32 %p1, %r51, %r27;
@%p1 bra BB2_11;
// ---- Loop-invariant setup ----
// %r28 = plane stride (param_6 * param_5), %r29 = 2 * plane stride,
// %r2  = -2 * plane stride expressed in bytes (<< 3 for 8-byte f64).
mul.lo.s32 %r28, %r25, %r24;
shl.b32 %r29, %r28, 1;
neg.s32 %r30, %r29;
shl.b32 %r2, %r30, 3;
// Global-space base pointers: %rd6 = input, %rd31 = output, %rd32 = param_9.
cvta.to.global.u64 %rd6, %rd2;
cvta.to.global.u64 %rd31, %rd3;
cvta.to.global.u64 %rd32, %rd1;
// ==== Outer loop over z (head) ====
BB2_2:
.loc 1 23 1
// Empty y range -> go straight to the z increment.
setp.ge.s32 %p2, %r21, %r22;
@%p2 bra BB2_10;
mov.u32 %r52, %r21;
// ==== Middle loop over y (head); %r4 holds the current y ====
BB2_4:
.loc 1 24 1
mov.u32 %r4, %r52;
// Empty x range -> go straight to the y increment.
setp.ge.s32 %p3, %r19, %r20;
@%p3 bra BB2_9;
.loc 1 29 1
// %r5 = y*row_stride + z*plane_stride : index of x = 0 in the current row.
mul.lo.s32 %r32, %r51, %r28;
mad.lo.s32 %r5, %r4, %r24, %r32;
.loc 1 32 1
// Row-base indices of the neighbouring rows/planes, computed once per y:
//   %r6  = base + 1 row      %r7  = base + 1 plane
//   %r8  = base + 2 rows     %r9  = base - 2 rows
//   %r10 = base + 2 planes   %r11 = base - 2 planes
//   %r12 = base + 3 rows     %r13 = base + 3 planes
// %r33 = 2 * row stride (reused below to step backwards along y).
add.s32 %r6, %r24, %r5;
add.s32 %r7, %r5, %r28;
shl.b32 %r33, %r24, 1;
add.s32 %r8, %r5, %r33;
mad.lo.s32 %r9, %r24, -2, %r5;
add.s32 %r10, %r5, %r29;
mad.lo.s32 %r11, %r28, -2, %r5;
add.s32 %r12, %r24, %r8;
mad.lo.s32 %r13, %r28, 3, %r5;
mov.u32 %r53, %r19;
// ==== Inner loop over x (head); %r14 = x base for this iteration ====
BB2_6:
.loc 1 26 1
mov.u32 %r14, %r53;
// x = %tid.x + x base (kept in %r36).
mov.u32 %r35, %tid.x;
add.s32 %r36, %r35, %r14;
.loc 1 29 1
// %r15 = idx = x + row base; %rd8 = &in[idx].
add.s32 %r15, %r36, %r5;
mul.wide.s32 %rd7, %r15, 8;
add.s64 %rd8, %rd6, %rd7;
.loc 1 32 1
// ---- Distance-1 neighbours ----
// %fd5 = in[idx] (centre), %fd7 = in[idx-1], %fd8 = in[idx+1].
ld.global.f64 %fd5, [%rd8];
ld.global.f64 %fd7, [%rd8+-8];
ld.global.f64 %fd8, [%rd8+8];
add.f64 %fd9, %fd8, %fd7;
// %rd10 = &in[idx + 1 row].
add.s32 %r37, %r6, %r36;
mul.wide.s32 %rd9, %r37, 8;
add.s64 %rd10, %rd6, %rd9;
.loc 1 32 1
ld.global.f64 %fd10, [%rd10];
add.f64 %fd11, %fd9, %fd10;
.loc 1 22 1
// %rd11 = -2 rows in bytes; %rd12 = %rd10 - 2 rows = &in[idx - 1 row].
neg.s32 %r39, %r33;
shl.b32 %r40, %r39, 3;
cvt.s64.s32 %rd11, %r40;
add.s64 %rd12, %rd10, %rd11;
.loc 1 32 1
ld.global.f64 %fd12, [%rd12];
add.f64 %fd13, %fd11, %fd12;
// %rd14 = &in[idx + 1 plane].
add.s32 %r41, %r7, %r36;
mul.wide.s32 %rd13, %r41, 8;
add.s64 %rd14, %rd6, %rd13;
.loc 1 32 1
ld.global.f64 %fd14, [%rd14];
add.f64 %fd15, %fd13, %fd14;
// %rd15 = -2 planes in bytes; %rd16 = %rd14 - 2 planes = &in[idx - 1 plane].
cvt.s64.s32 %rd15, %r2;
add.s64 %rd16, %rd14, %rd15;
.loc 1 32 1
ld.global.f64 %fd16, [%rd16];
// %fd17 = sum of the six distance-1 neighbours.
add.f64 %fd17, %fd15, %fd16;
// %fd19 = c0*centre + c1*(distance-1 sum).
mul.f64 %fd18, %fd2, %fd17;
fma.rn.f64 %fd19, %fd1, %fd5, %fd18;
// ---- Distance-2 neighbours ----
// in[idx-2], in[idx+2].
ld.global.f64 %fd20, [%rd8+-16];
ld.global.f64 %fd21, [%rd8+16];
add.f64 %fd22, %fd21, %fd20;
// in[idx + 2 rows].
add.s32 %r42, %r8, %r36;
mul.wide.s32 %rd17, %r42, 8;
add.s64 %rd18, %rd6, %rd17;
.loc 1 32 1
ld.global.f64 %fd23, [%rd18];
add.f64 %fd24, %fd22, %fd23;
// in[idx - 2 rows].
add.s32 %r43, %r9, %r36;
mul.wide.s32 %rd19, %r43, 8;
add.s64 %rd20, %rd6, %rd19;
.loc 1 32 1
ld.global.f64 %fd25, [%rd20];
add.f64 %fd26, %fd24, %fd25;
// in[idx + 2 planes].
add.s32 %r44, %r10, %r36;
mul.wide.s32 %rd21, %r44, 8;
add.s64 %rd22, %rd6, %rd21;
.loc 1 32 1
ld.global.f64 %fd27, [%rd22];
add.f64 %fd28, %fd26, %fd27;
// in[idx - 2 planes].
add.s32 %r45, %r11, %r36;
mul.wide.s32 %rd23, %r45, 8;
add.s64 %rd24, %rd6, %rd23;
.loc 1 32 1
ld.global.f64 %fd29, [%rd24];
// %fd30 = sum of the six distance-2 neighbours; fold in with c2.
add.f64 %fd30, %fd28, %fd29;
fma.rn.f64 %fd31, %fd3, %fd30, %fd19;
// ---- Distance-3 neighbours ----
// in[idx-3], in[idx+3].
ld.global.f64 %fd32, [%rd8+-24];
ld.global.f64 %fd33, [%rd8+24];
add.f64 %fd34, %fd33, %fd32;
// in[idx + 3 rows].
add.s32 %r46, %r12, %r36;
mul.wide.s32 %rd25, %r46, 8;
add.s64 %rd26, %rd6, %rd25;
.loc 1 32 1
ld.global.f64 %fd35, [%rd26];
add.f64 %fd36, %fd34, %fd35;
// %rd27 = &in[idx - 1 row] - 2 rows = &in[idx - 3 rows].
add.s64 %rd27, %rd12, %rd11;
.loc 1 32 1
ld.global.f64 %fd37, [%rd27];
add.f64 %fd38, %fd36, %fd37;
// in[idx + 3 planes].
add.s32 %r47, %r13, %r36;
mul.wide.s32 %rd28, %r47, 8;
add.s64 %rd29, %rd6, %rd28;
.loc 1 32 1
ld.global.f64 %fd39, [%rd29];
add.f64 %fd40, %fd38, %fd39;
// %rd30 = &in[idx - 1 plane] - 2 planes = &in[idx - 3 planes].
add.s64 %rd30, %rd16, %rd15;
.loc 1 32 1
ld.global.f64 %fd41, [%rd30];
// %fd42 = sum of the six distance-3 neighbours.
add.f64 %fd42, %fd40, %fd41;
// %fd6 = div = c0*centre + c1*sum1 + c2*sum2 + c3*sum3.
fma.rn.f64 %fd6, %fd4, %fd42, %fd31;
.loc 1 44 1
// ---- Guarded update: only lanes with x < param_1 write a result ----
setp.ge.s32 %p4, %r36, %r20;
@%p4 bra BB2_8;
// %rd33 = idx * 8 (byte offset); %rd34 = &out[idx].
mul.wide.s32 %rd33, %r15, 8;
add.s64 %rd34, %rd31, %rd33;
.loc 1 45 1
// %fd45 = 2*in[idx] - out[idx] (uses the previous contents of out[idx]).
ld.global.f64 %fd43, [%rd34];
add.f64 %fd44, %fd5, %fd5;
sub.f64 %fd45, %fd44, %fd43;
// %rd35 = &p9[idx]; result = p9[idx]*div + %fd45, stored back to out[idx].
add.s64 %rd35, %rd32, %rd33;
.loc 1 45 1
ld.global.f64 %fd46, [%rd35];
fma.rn.f64 %fd47, %fd46, %fd6, %fd45;
st.global.f64 [%rd34], %fd47;
// ==== Inner loop latch: x base += 32, repeat while below param_1 ====
BB2_8:
.loc 1 24 19
add.s32 %r16, %r14, 32;
.loc 1 24 1
setp.lt.s32 %p5, %r16, %r20;
mov.u32 %r53, %r16;
@%p5 bra BB2_6;
// ==== Middle loop latch: y += 1, repeat while below param_3 ====
BB2_9:
.loc 1 23 18
add.s32 %r17, %r4, 1;
.loc 1 23 1
setp.lt.s32 %p6, %r17, %r22;
mov.u32 %r52, %r17;
@%p6 bra BB2_4;
// ==== Outer loop latch: z += 1, compared against (param_4 + %ctaid.x + 1),
// which is recomputed here rather than reusing %r27 ====
BB2_10:
.loc 1 22 18
add.s32 %r51, %r51, 1;
.loc 1 59 1
add.s32 %r49, %r23, %r26;
add.s32 %r50, %r49, 1;
.loc 1 22 1
setp.lt.s32 %p7, %r51, %r50;
@%p7 bra BB2_2;
// ==== Kernel exit ====
BB2_11:
.loc 1 61 2
ret;
}