// Files
// ispc/examples_cuda/stencil/kernel.ptx
// Evghenii 8d4dd13750 changes
// 2013-11-18 11:58:19 +01:00
//
// 1247 lines
// 35 KiB
// Plaintext

//
// Generated by NVIDIA NVVM Compiler
// Compiler built on Thu Jul 18 02:37:37 2013 (1374107857)
// Cuda compilation tools, release 5.5, V5.5.0
//
.version 3.2
.target sm_35
.address_size 64
// External device-side runtime entry points called from this module.
//
// cudaLaunchDevice: takes two 64-bit values, two 12-byte (4-byte aligned)
// aggregates, a 32-bit value and a 64-bit value; returns a 32-bit status.
// The call sites in this file pass: the address of a kernel, a buffer obtained
// from cudaGetParameterBuffer, three 32-bit counts, the triple (128, 1, 1),
// 0 and 0.
// NOTE(review): presumably (kernel, parameterBuffer, gridDim, blockDim,
// sharedMemSize, stream) as in the CUDA device runtime API -- confirm.
.extern .func (.param .b32 func_retval0) cudaLaunchDevice
(
.param .b64 cudaLaunchDevice_param_0,
.param .b64 cudaLaunchDevice_param_1,
.param .align 4 .b8 cudaLaunchDevice_param_2[12],
.param .align 4 .b8 cudaLaunchDevice_param_3[12],
.param .b32 cudaLaunchDevice_param_4,
.param .b64 cudaLaunchDevice_param_5
);
// cudaGetParameterBuffer: takes two 64-bit values and returns a 64-bit value
// that callers in this file treat as a pointer (0 is treated as "no buffer").
// NOTE(review): presumably (alignment, size) per the CUDA device runtime API
// -- confirm.
.extern .func (.param .b64 func_retval0) cudaGetParameterBuffer
(
.param .b64 cudaGetParameterBuffer_param_0,
.param .b64 cudaGetParameterBuffer_param_1
)
;
// cudaDeviceSynchronize: no arguments, returns a 32-bit status. Every call
// site in this file discards the returned status.
.extern .func (.param .b32 func_retval0) cudaDeviceSynchronize
(
)
;
// Per-lane byte tables, indexed by (tid.x & 31) in the stencil kernel.
// constDeltaForeach1 has no initializer.
// NOTE(review): presumably all zeros (PTX default-initializes globals without
// an initializer) -- confirm against the PTX ISA.
.global .align 1 .b8 constDeltaForeach1[32];
// constDeltaForeach4[i] == i for i in 0..31, i.e. each lane reads its own lane
// index from this table.
.global .align 1 .b8 constDeltaForeach4[32] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
// __shfl_i32(value, lane): 32-bit indexed warp shuffle.
// Issues shfl.idx.b32 with param_0 as the value operand, param_1 as the
// lane-selector operand and 0x1f as the clamp/segment operand, and returns
// the shuffled 32-bit result.
.visible .func (.param .b32 func_retval0) __shfl_i32(
.param .b32 __shfl_i32_param_0,
.param .b32 __shfl_i32_param_1
)
{
    .reg .s32 %shuffled, %own_value, %src_lane;

    // Fetch both arguments (selector first; the loads are independent).
    ld.param.u32 %src_lane, [__shfl_i32_param_1];
    ld.param.u32 %own_value, [__shfl_i32_param_0];

    // Exchange within the warp.
    shfl.idx.b32 %shuffled, %own_value, %src_lane, 0x1f;

    st.param.b32 [func_retval0+0], %shuffled;
    ret;
}
// __shfl_xor_float(value, mask): butterfly warp shuffle of an f32 value.
// Issues shfl.bfly.b32 with param_0 (loaded as f32) as the value operand,
// param_1 as the lane-mask operand and 0x1f as the clamp/segment operand,
// and returns the shuffled value as f32.
.visible .func (.param .b32 func_retval0) __shfl_xor_float(
.param .b32 __shfl_xor_float_param_0,
.param .b32 __shfl_xor_float_param_1
)
{
    .reg .s32 %xor_mask;
    .reg .f32 %peer_value, %own_value;

    // Fetch both arguments (mask first; the loads are independent).
    ld.param.u32 %xor_mask, [__shfl_xor_float_param_1];
    ld.param.f32 %own_value, [__shfl_xor_float_param_0];

    // The shuffle moves raw 32-bit patterns, so f32 registers are used as-is.
    shfl.bfly.b32 %peer_value, %own_value, %xor_mask, 0x1f;

    st.param.f32 [func_retval0+0], %peer_value;
    ret;
}
// __shfl_xor_i32(value, mask): butterfly warp shuffle of a 32-bit integer.
// Issues shfl.bfly.b32 with param_0 as the value operand, param_1 as the
// lane-mask operand and 0x1f as the clamp/segment operand, and returns the
// shuffled 32-bit result.
.visible .func (.param .b32 func_retval0) __shfl_xor_i32(
.param .b32 __shfl_xor_i32_param_0,
.param .b32 __shfl_xor_i32_param_1
)
{
    .reg .s32 %peer_value, %own_value, %xor_mask;

    // Fetch both arguments (mask first; the loads are independent).
    ld.param.u32 %xor_mask, [__shfl_xor_i32_param_1];
    ld.param.u32 %own_value, [__shfl_xor_i32_param_0];

    // Exchange within the warp.
    shfl.bfly.b32 %peer_value, %own_value, %xor_mask, 0x1f;

    st.param.b32 [func_retval0+0], %peer_value;
    ret;
}
// __fminf(a, b): returns the result of min.f32 applied to the two f32
// arguments (param_0 as first operand, param_1 as second).
.visible .func (.param .b32 func_retval0) __fminf(
.param .b32 __fminf_param_0,
.param .b32 __fminf_param_1
)
{
    .reg .f32 %smaller, %lhs, %rhs;

    ld.param.f32 %rhs, [__fminf_param_1];
    ld.param.f32 %lhs, [__fminf_param_0];

    min.f32 %smaller, %lhs, %rhs;

    st.param.f32 [func_retval0+0], %smaller;
    ret;
}
// __fmaxf(a, b): returns the result of max.f32 applied to the two f32
// arguments (param_0 as first operand, param_1 as second).
.visible .func (.param .b32 func_retval0) __fmaxf(
.param .b32 __fmaxf_param_0,
.param .b32 __fmaxf_param_1
)
{
    .reg .f32 %larger, %lhs, %rhs;

    ld.param.f32 %rhs, [__fmaxf_param_1];
    ld.param.f32 %lhs, [__fmaxf_param_0];

    max.f32 %larger, %lhs, %rhs;

    st.param.f32 [func_retval0+0], %larger;
    ret;
}
// __ballot(flag): warp vote.
// Only the low byte of param_0 is read; it is compared against zero and the
// resulting predicate is fed to vote.ballot.b32. The 32-bit ballot mask is
// returned.
.visible .func (.param .b32 func_retval0) __ballot(
.param .b32 __ballot_param_0
)
{
    .reg .pred %flag_is_set;
    .reg .s32 %vote_mask, %flag;

    // Zero-extending byte load of the flag argument.
    ld.param.u8 %flag, [__ballot_param_0];

    // Predicate is true when the flag byte is non-zero.
    setp.ne.u32 %flag_is_set, %flag, 0;
    vote.ballot.b32 %vote_mask, %flag_is_set;

    st.param.b32 [func_retval0+0], %vote_mask;
    ret;
}
// __lanemask_lt(): returns the value of the %lanemask_lt special register
// for the calling thread.
.visible .func (.param .b32 func_retval0) __lanemask_lt(
)
{
    .reg .s32 %lower_lanes;

    mov.u32 %lower_lanes, %lanemask_lt;

    st.param.b32 [func_retval0+0], %lower_lanes;
    ret;
}
// ISPCAlloc: stub. None of the three parameters is read; the function
// unconditionally returns the 64-bit constant 1.
.visible .func (.param .b64 func_retval0) ISPCAlloc(
.param .b64 ISPCAlloc_param_0,
.param .b64 ISPCAlloc_param_1,
.param .b32 ISPCAlloc_param_2
)
{
    .reg .s64 %stub_handle;

    mov.u64 %stub_handle, 1;

    st.param.b64 [func_retval0+0], %stub_handle;
    ret;
}
// ISPCGetParamBuffer: obtain a launch parameter buffer, once per warp.
//   param_0 : not read by this function.
//   param_1 : forwarded as the first argument of cudaGetParameterBuffer.
//   param_2 : forwarded as the second argument of cudaGetParameterBuffer.
// Returns the value returned by cudaGetParameterBuffer in the thread whose
// (tid.x & 31) == 0; every other thread returns 0 without making the call.
.visible .func (.param .b64 func_retval0) ISPCGetParamBuffer(
.param .b64 ISPCGetParamBuffer_param_0,
.param .b64 ISPCGetParamBuffer_param_1,
.param .b64 ISPCGetParamBuffer_param_2
)
{
.reg .pred %p<2>;
.reg .s32 %r<3>;
.reg .s64 %rd<7>;
ld.param.u64 %rd3, [ISPCGetParamBuffer_param_1];
ld.param.u64 %rd4, [ISPCGetParamBuffer_param_2];
// %r2 = tid.x & 31; %p1 is true for every thread except the first of each
// group of 32 consecutive tid.x values.
mov.u32 %r1, %tid.x;
and.b32 %r2, %r1, 31;
setp.ne.s32 %p1, %r2, 0;
// Default result (0) for threads that skip the call.
mov.u64 %rd6, 0;
@%p1 bra BB8_2;
// Callseq Start 0
{
.reg .b32 temp_param_reg;
.param .b64 param0;
st.param.b64 [param0+0], %rd3;
.param .b64 param1;
st.param.b64 [param1+0], %rd4;
.param .b64 retval0;
call.uni (retval0),
cudaGetParameterBuffer,
(
param0,
param1
);
ld.param.b64 %rd6, [retval0+0];
}
// Callseq End 0
BB8_2:
st.param.b64 [func_retval0+0], %rd6;
ret;
}
// ISPCLaunch: issue a device-side kernel launch through cudaLaunchDevice,
// once per warp.
//   param_0 : not read by this function.
//   param_1 : forwarded as cudaLaunchDevice argument 0 (the kernel).
//   param_2 : forwarded as cudaLaunchDevice argument 1 (the parameter buffer).
//   param_3 : count that is converted to ((param_3 - 1) >> 2) + 1 and stored
//             as the first word of the first 12-byte aggregate.
//   param_4, param_5 : stored unchanged as the second and third words of the
//             first 12-byte aggregate.
// The second 12-byte aggregate is always (128, 1, 1); the remaining two
// arguments are always 0.
// Only the thread with (tid.x & 31) == 0 performs the launch. The status
// returned by cudaLaunchDevice is loaded into %r6 and never inspected.
.visible .func ISPCLaunch(
.param .b64 ISPCLaunch_param_0,
.param .b64 ISPCLaunch_param_1,
.param .b64 ISPCLaunch_param_2,
.param .b32 ISPCLaunch_param_3,
.param .b32 ISPCLaunch_param_4,
.param .b32 ISPCLaunch_param_5
)
{
.reg .pred %p<2>;
.reg .s32 %r<16>;
.reg .s64 %rd<6>;
ld.param.u64 %rd1, [ISPCLaunch_param_1];
ld.param.u64 %rd2, [ISPCLaunch_param_2];
ld.param.u32 %r1, [ISPCLaunch_param_3];
ld.param.u32 %r2, [ISPCLaunch_param_4];
ld.param.u32 %r3, [ISPCLaunch_param_5];
// Skip the launch unless tid.x & 31 == 0.
mov.u32 %r4, %tid.x;
and.b32 %r5, %r4, 31;
setp.ne.s32 %p1, %r5, 0;
@%p1 bra BB9_2;
// %r7 = ((param_3 - 1) >> 2) + 1, using an arithmetic shift.
// NOTE(review): the divisor 4 matches the 128-thread (4 x 32) block size used
// below -- presumably param_3 counts warp-sized tasks; confirm.
add.s32 %r14, %r1, -1;
shr.s32 %r15, %r14, 2;
add.s32 %r7, %r15, 1;
mov.u32 %r12, 1;
mov.u32 %r10, 128;
mov.u32 %r13, 0;
mov.u64 %rd5, 0;
// inline asm
{
.param .b64 param0;
st.param.b64 [param0+0], %rd1;
.param .b64 param1;
st.param.b64 [param1+0], %rd2;
.param .align 4 .b8 param2[12];
st.param.b32 [param2+0], %r7;
st.param.b32 [param2+4], %r2;
st.param.b32 [param2+8], %r3;
.param .align 4 .b8 param3[12];
st.param.b32 [param3+0], %r10;
st.param.b32 [param3+4], %r12;
st.param.b32 [param3+8], %r12;
.param .b32 param4;
st.param.b32 [param4+0], %r13;
.param .b64 param5;
st.param.b64 [param5+0], %rd5;
.param .b32 retval0;
call.uni (retval0),
cudaLaunchDevice,
(
param0,
param1,
param2,
param3,
param4,
param5
);
ld.param.b32 %r6, [retval0+0];
}
// inline asm
BB9_2:
ret;
}
// ISPCSync: calls cudaDeviceSynchronize from every calling thread.
//   param_0 : not read by this function.
// The status returned by cudaDeviceSynchronize is loaded into %r1 and then
// discarded; nothing is returned to the caller.
.visible .func ISPCSync(
.param .b64 ISPCSync_param_0
)
{
.reg .s32 %r<2>;
// Callseq Start 1
{
.reg .b32 temp_param_reg;
.param .b32 retval0;
call.uni (retval0),
cudaDeviceSynchronize,
(
);
ld.param.b32 %r1, [retval0+0];
}
// Callseq End 1
ret;
}
// __warpBinExclusiveScan(flag): warp-wide count and exclusive prefix count of
// a per-thread boolean.
// Only the low byte of param_0 is read and tested against zero. The 64-bit
// result packs two 32-bit counts:
//   bits 63..32 : popcount(ballot & %lanemask_lt)
//   bits 31..0  : popcount(ballot)
// where ballot is the vote.ballot.b32 mask of the tested flag.
.visible .func (.param .b64 func_retval0) __warpBinExclusiveScan(
.param .b32 __warpBinExclusiveScan_param_0
)
{
    .reg .pred %flag_is_set;
    .reg .s32 %flag, %ballot, %set_count, %lower_lanes, %lower_set, %prefix_count;
    .reg .s64 %prefix_wide, %prefix_high, %count_wide, %packed;

    // Zero-extending byte load of the flag argument.
    ld.param.u8 %flag, [__warpBinExclusiveScan_param_0];

    // Ballot of "flag != 0" across the warp.
    setp.ne.u32 %flag_is_set, %flag, 0;
    vote.ballot.b32 %ballot, %flag_is_set;

    // Number of set bits in the whole ballot.
    popc.b32 %set_count, %ballot;

    // Number of set bits restricted to the %lanemask_lt mask.
    mov.u32 %lower_lanes, %lanemask_lt;
    and.b32 %lower_set, %lower_lanes, %ballot;
    popc.b32 %prefix_count, %lower_set;

    // Pack: (prefix_count << 32) | set_count.
    cvt.u64.u32 %prefix_wide, %prefix_count;
    shl.b64 %prefix_high, %prefix_wide, 32;
    cvt.u64.u32 %count_wide, %set_count;
    or.b64 %packed, %prefix_high, %count_wide;

    st.param.b64 [func_retval0+0], %packed;
    ret;
}
// Kernel: one stencil update over a 3-D sub-box of an array of f64 values.
//
// Work decomposition (as computed below):
//   * x: each group of 32 consecutive tid.x values handles a 32-element span
//        starting at param_0 + 32 * (4 * ctaid.x + (tid.x >> 5)); the code
//        assumes 4 such groups (128 threads) per block.
//   * y: block row ctaid.y covers 8 values starting at param_2 + 8 * ctaid.y.
//   * z: block layer ctaid.z covers 8 values starting at param_4 + 8 * ctaid.z.
//
// Parameters, by how they are used:
//   param_0 / param_1 : lower bound / exclusive upper bound in x.
//   param_2 / param_3 : lower bound / exclusive upper bound in y.
//   param_4 / param_5 : lower bound / exclusive upper bound in z.
//   param_6           : element stride between consecutive y values.
//   param_7           : param_6 * param_7 is the element stride between
//                       consecutive z values.
//   param_8           : never loaded.
//   param_9           : pointer to four f64 coefficients c0..c3 (byte offsets
//                       0, 8, 16, 24), read through the global address space.
//   param_10          : pointer to a per-element f64 multiplier array.
//   param_11          : pointer to the f64 input array (read at the centre and
//                       at distances 1, 2, 3 along each axis).
//   param_12          : pointer to the f64 array that is read and then
//                       overwritten at the centre element.
//   NOTE(review): these look like (x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef,
//   vsq, Ain, Aout) of the ISPC stencil example -- confirm against the source.
//
// Per element i the kernel stores
//   out[i] = 2*in[i] - out[i]
//            + mult[i] * ( c0*in[i] + c1*S1 + c2*S2 + c3*S3 )
// where Sk is the sum of the six neighbours of in[] at distance k along
// x, y and z.
//
// NOTE(review): element indices are computed in 32-bit registers and shifted
// left by 3 in 32 bits before being sign-extended to 64 bits, so byte offsets
// wrap once an element index reaches 2^28.
.entry stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_(
.param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_0,
.param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_1,
.param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_2,
.param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_3,
.param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_4,
.param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_5,
.param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_6,
.param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_7,
.param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_8,
.param .u64 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_9,
.param .u64 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_10,
.param .u64 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_11,
.param .u64 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_12
)
{
.reg .pred %p<14>;
.reg .s32 %r<178>;
.reg .s64 %rd<96>;
.reg .f64 %fd<95>;
ld.param.u32 %r42, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_0];
ld.param.u32 %r43, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_1];
ld.param.u32 %r44, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_2];
ld.param.u32 %r45, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_3];
ld.param.u32 %r46, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_4];
ld.param.u32 %r47, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_5];
ld.param.u32 %r48, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_6];
ld.param.u32 %r49, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_7];
ld.param.u64 %rd2, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_9];
ld.param.u64 %rd3, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_10];
ld.param.u64 %rd4, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_11];
ld.param.u64 %rd5, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_12];
// Task index in x: %r52 = 4 * ctaid.x + (tid.x >> 5); task count in x:
// %r54 = 4 * nctaid.x. Exit when any of the x / y / z task indices is not
// below its count.
mov.u32 %r1, %ctaid.x;
shl.b32 %r50, %r1, 2;
mov.u32 %r2, %tid.x;
shr.s32 %r51, %r2, 5;
add.s32 %r52, %r51, %r50;
mov.u32 %r53, %nctaid.x;
shl.b32 %r54, %r53, 2;
setp.ge.s32 %p1, %r52, %r54;
mov.u32 %r55, %nctaid.y;
mov.u32 %r3, %ctaid.y;
setp.ge.s32 %p2, %r3, %r55;
or.pred %p3, %p1, %p2;
mov.u32 %r56, %nctaid.z;
mov.u32 %r4, %ctaid.z;
setp.ge.s32 %p4, %r4, %r56;
or.pred %p5, %p3, %p4;
@%p5 bra BB12_13;
// x range: %r59 = (128 * ctaid.x + tid.x) & -32 (start of this 32-wide span),
// %r60 = x start = %r59 + param_0, %r5 = x end = min(param_1, %r60 + 32).
shl.b32 %r57, %r1, 7;
add.s32 %r58, %r2, %r57;
and.b32 %r59, %r58, -32;
add.s32 %r60, %r59, %r42;
add.s32 %r61, %r60, 32;
min.s32 %r5, %r43, %r61;
// y range: %r62 = y start = 8 * ctaid.y + param_2, %r7 = %r62 + 8.
shl.b32 %r6, %r3, 3;
add.s32 %r62, %r6, %r44;
add.s32 %r7, %r62, 8;
// z range: %r172 = z start = 8 * ctaid.z + param_4,
// %r64 = z end = min(param_5, %r172 + 8).
shl.b32 %r8, %r4, 3;
add.s32 %r172, %r8, %r46;
add.s32 %r63, %r172, 8;
min.s32 %r64, %r47, %r63;
// %r10 = param_7 * param_6: element stride between consecutive z values.
mul.lo.s32 %r10, %r49, %r48;
// %r11 = x end rounded down so that (%r11 - x start) is a multiple of 32
// (signed remainder of (x end - x start) by 32 is subtracted from x end).
sub.s32 %r65, %r5, %r60;
shr.s32 %r66, %r65, 31;
shr.u32 %r67, %r66, 27;
add.s32 %r68, %r65, %r67;
and.b32 %r69, %r68, -32;
sub.s32 %r70, %r65, %r69;
sub.s32 %r11, %r5, %r70;
// %rd6 = tid.x & 31; %rd1 = &constDeltaForeach1[tid.x & 31].
and.b32 %r71, %r2, 31;
cvt.u64.u32 %rd6, %r71;
mov.u64 %rd7, constDeltaForeach1;
add.s64 %rd1, %rd7, %rd6;
// Nothing to do when the z range is empty.
setp.ge.s32 %p6, %r172, %r64;
@%p6 bra BB12_13;
// %r12 = y end = min(param_3, y start + 8).
min.s32 %r12, %r45, %r7;
// Pre-computed multiples of the z stride: +2, -2, +3, -3.
shl.b32 %r15, %r10, 1;
neg.s32 %r16, %r15;
mul.lo.s32 %r17, %r10, 3;
mul.lo.s32 %r18, %r10, -3;
// %r19 = ~max(-9 - param_2 - 8*ctaid.y, ~param_3), which equals
// min(param_2 + 8*ctaid.y + 8, param_3): the y loop end value.
mov.u32 %r72, -9;
sub.s32 %r73, %r72, %r44;
sub.s32 %r74, %r73, %r6;
not.b32 %r75, %r45;
max.s32 %r76, %r74, %r75;
not.b32 %r19, %r76;
// %r20 = min(param_4 + 8*ctaid.z + 8, param_5), computed the same way:
// the z loop end value.
sub.s32 %r77, %r72, %r46;
sub.s32 %r78, %r77, %r8;
not.b32 %r79, %r47;
max.s32 %r80, %r78, %r79;
not.b32 %r20, %r80;
// %r13 = constDeltaForeach1[tid.x & 31], added to both z and y below.
ld.global.u8 %r13, [%rd1];
mov.u32 %r171, %r172;
// ---- z loop: %r171 / %r172 run from z start until they equal %r20 ----
BB12_3:
mov.u32 %r21, %r171;
add.s32 %r23, %r21, %r13;
// Skip the y loop when the y range is empty.
setp.ge.s32 %p7, %r62, %r12;
@%p7 bra BB12_12;
// %r24 = z * (z stride).
mul.lo.s32 %r24, %r23, %r10;
mov.u32 %r174, %r62;
mov.u32 %r173, %r62;
// ---- y loop: %r173 / %r174 run from y start until they equal %r19 ----
BB12_5:
mov.u32 %r27, %r173;
add.s32 %r30, %r27, %r13;
// If there is no full 32-wide chunk, go straight to the remainder code with
// %r176 = x start.
setp.ge.s32 %p8, %r60, %r11;
mov.u32 %r176, %r60;
@%p8 bra BB12_8;
// %r31 = constDeltaForeach4[tid.x & 31] (the per-thread x offset);
// %r32 = y * param_6 + z * (z stride); %r177 = x start.
mov.u64 %rd9, constDeltaForeach4;
add.s64 %rd10, %rd9, %rd6;
ld.global.u8 %r31, [%rd10];
mad.lo.s32 %r32, %r30, %r48, %r24;
add.s32 %r177, %r59, %r42;
// ---- x loop over full 32-wide chunks: %r177 advances by 32 up to %r11 ----
BB12_7:
// %rd11 = coefficient pointer (param_9) converted to a global address.
cvta.to.global.u64 %rd11, %rd2;
// %r99 = element index of the centre; %r100 = its byte offset (index << 3).
add.s32 %r98, %r32, %r177;
add.s32 %r99, %r98, %r31;
shl.b32 %r100, %r99, 3;
cvt.s64.s32 %rd12, %r100;
add.s64 %rd13, %rd12, %rd4;
// Addresses into the input array (param_11) for the distance-1 neighbours:
// x+1, x-1 (+/- 8 bytes), y+1, y-1 (+/- param_6 elements),
// z+1, z-1 (+/- %r10 elements).
add.s32 %r101, %r100, 8;
cvt.s64.s32 %rd14, %r101;
add.s64 %rd15, %rd14, %rd4;
add.s32 %r102, %r100, -8;
cvt.s64.s32 %rd16, %r102;
add.s64 %rd17, %rd16, %rd4;
add.s32 %r103, %r99, %r48;
shl.b32 %r104, %r103, 3;
cvt.s64.s32 %rd18, %r104;
add.s64 %rd19, %rd18, %rd4;
sub.s32 %r105, %r99, %r48;
shl.b32 %r106, %r105, 3;
cvt.s64.s32 %rd20, %r106;
add.s64 %rd21, %rd20, %rd4;
add.s32 %r108, %r99, %r10;
shl.b32 %r109, %r108, 3;
cvt.s64.s32 %rd22, %r109;
add.s64 %rd23, %rd22, %rd4;
sub.s32 %r110, %r99, %r10;
shl.b32 %r111, %r110, 3;
cvt.s64.s32 %rd24, %r111;
add.s64 %rd25, %rd24, %rd4;
// Distance-2 neighbours: x+/-2, y+/-2, z+/-2.
add.s32 %r112, %r100, 16;
cvt.s64.s32 %rd26, %r112;
add.s64 %rd27, %rd26, %rd4;
add.s32 %r113, %r100, -16;
cvt.s64.s32 %rd28, %r113;
add.s64 %rd29, %rd28, %rd4;
shl.b32 %r114, %r48, 1;
add.s32 %r115, %r99, %r114;
shl.b32 %r116, %r115, 3;
cvt.s64.s32 %rd30, %r116;
add.s64 %rd31, %rd30, %rd4;
mad.lo.s32 %r117, %r48, -2, %r99;
shl.b32 %r118, %r117, 3;
cvt.s64.s32 %rd32, %r118;
add.s64 %rd33, %rd32, %rd4;
add.s32 %r119, %r99, %r15;
shl.b32 %r120, %r119, 3;
cvt.s64.s32 %rd34, %r120;
add.s64 %rd35, %rd34, %rd4;
add.s32 %r121, %r99, %r16;
shl.b32 %r122, %r121, 3;
cvt.s64.s32 %rd36, %r122;
add.s64 %rd37, %rd36, %rd4;
// Distance-3 neighbours: x+/-3, y+/-3, z+/-3.
add.s32 %r123, %r100, 24;
cvt.s64.s32 %rd38, %r123;
add.s64 %rd39, %rd38, %rd4;
add.s32 %r124, %r100, -24;
cvt.s64.s32 %rd40, %r124;
add.s64 %rd41, %rd40, %rd4;
mad.lo.s32 %r125, %r48, 3, %r99;
shl.b32 %r126, %r125, 3;
cvt.s64.s32 %rd42, %r126;
add.s64 %rd43, %rd42, %rd4;
mad.lo.s32 %r127, %r48, -3, %r99;
shl.b32 %r128, %r127, 3;
cvt.s64.s32 %rd44, %r128;
add.s64 %rd45, %rd44, %rd4;
add.s32 %r129, %r99, %r17;
shl.b32 %r130, %r129, 3;
cvt.s64.s32 %rd46, %r130;
add.s64 %rd47, %rd46, %rd4;
add.s32 %r131, %r99, %r18;
shl.b32 %r132, %r131, 3;
cvt.s64.s32 %rd48, %r132;
add.s64 %rd49, %rd48, %rd4;
// %rd50 = centre address in the output array (param_12);
// %rd51 = centre address in the multiplier array (param_10).
add.s64 %rd50, %rd12, %rd5;
add.s64 %rd51, %rd12, %rd3;
// %fd4 = 2 * in[i] - out[i].
ld.f64 %fd1, [%rd13];
add.f64 %fd2, %fd1, %fd1;
ld.f64 %fd3, [%rd50];
sub.f64 %fd4, %fd2, %fd3;
// %fd19 = c0 * in[i] + c1 * (sum of the six distance-1 neighbours).
ld.global.f64 %fd5, [%rd11];
ld.f64 %fd6, [%rd17];
ld.f64 %fd7, [%rd15];
add.f64 %fd8, %fd7, %fd6;
ld.f64 %fd9, [%rd19];
add.f64 %fd10, %fd8, %fd9;
ld.f64 %fd11, [%rd21];
add.f64 %fd12, %fd10, %fd11;
ld.f64 %fd13, [%rd23];
add.f64 %fd14, %fd12, %fd13;
ld.f64 %fd15, [%rd25];
add.f64 %fd16, %fd14, %fd15;
ld.global.f64 %fd17, [%rd11+8];
mul.f64 %fd18, %fd17, %fd16;
fma.rn.f64 %fd19, %fd5, %fd1, %fd18;
// %fd32 = %fd19 + c2 * (sum of the six distance-2 neighbours).
ld.f64 %fd20, [%rd29];
ld.f64 %fd21, [%rd27];
add.f64 %fd22, %fd21, %fd20;
ld.f64 %fd23, [%rd31];
add.f64 %fd24, %fd22, %fd23;
ld.f64 %fd25, [%rd33];
add.f64 %fd26, %fd24, %fd25;
ld.f64 %fd27, [%rd35];
add.f64 %fd28, %fd26, %fd27;
ld.f64 %fd29, [%rd37];
add.f64 %fd30, %fd28, %fd29;
ld.global.f64 %fd31, [%rd11+16];
fma.rn.f64 %fd32, %fd31, %fd30, %fd19;
// %fd45 = %fd32 + c3 * (sum of the six distance-3 neighbours).
ld.f64 %fd33, [%rd41];
ld.f64 %fd34, [%rd39];
add.f64 %fd35, %fd34, %fd33;
ld.f64 %fd36, [%rd43];
add.f64 %fd37, %fd35, %fd36;
ld.f64 %fd38, [%rd45];
add.f64 %fd39, %fd37, %fd38;
ld.f64 %fd40, [%rd47];
add.f64 %fd41, %fd39, %fd40;
ld.f64 %fd42, [%rd49];
add.f64 %fd43, %fd41, %fd42;
ld.global.f64 %fd44, [%rd11+24];
fma.rn.f64 %fd45, %fd44, %fd43, %fd32;
// out[i] = mult[i] * %fd45 + (2 * in[i] - out[i]).
ld.f64 %fd46, [%rd51];
fma.rn.f64 %fd47, %fd46, %fd45, %fd4;
st.f64 [%rd50], %fd47;
// Advance x by 32 and repeat while below the rounded-down end (%r11).
add.s32 %r177, %r177, 32;
setp.lt.s32 %p9, %r177, %r11;
mov.u32 %r175, %r177;
mov.u32 %r176, %r175;
@%p9 bra BB12_7;
// ---- x remainder: %r176 is the first x not covered by a full chunk ----
BB12_8:
mov.u32 %r36, %r176;
setp.ge.s32 %p10, %r36, %r5;
@%p10 bra BB12_11;
// %r37 = this thread's x = %r36 + constDeltaForeach4[tid.x & 31]; threads
// whose x is at or past the x end do nothing.
mov.u64 %rd53, constDeltaForeach4;
add.s64 %rd54, %rd53, %rd6;
ld.global.u8 %r135, [%rd54];
add.s32 %r37, %r36, %r135;
setp.ge.s32 %p11, %r37, %r5;
@%p11 bra BB12_11;
// Same address computation and update as in the full-chunk loop above, with
// %r137 as the centre element index and %r138 as its byte offset.
cvta.to.global.u64 %rd55, %rd2;
mad.lo.s32 %r136, %r30, %r48, %r24;
add.s32 %r137, %r136, %r37;
shl.b32 %r138, %r137, 3;
cvt.s64.s32 %rd56, %r138;
add.s64 %rd57, %rd56, %rd4;
add.s32 %r139, %r138, 8;
cvt.s64.s32 %rd58, %r139;
add.s64 %rd59, %rd58, %rd4;
add.s32 %r140, %r138, -8;
cvt.s64.s32 %rd60, %r140;
add.s64 %rd61, %rd60, %rd4;
add.s32 %r141, %r137, %r48;
shl.b32 %r142, %r141, 3;
cvt.s64.s32 %rd62, %r142;
add.s64 %rd63, %rd62, %rd4;
sub.s32 %r143, %r137, %r48;
shl.b32 %r144, %r143, 3;
cvt.s64.s32 %rd64, %r144;
add.s64 %rd65, %rd64, %rd4;
add.s32 %r146, %r137, %r10;
shl.b32 %r147, %r146, 3;
cvt.s64.s32 %rd66, %r147;
add.s64 %rd67, %rd66, %rd4;
sub.s32 %r148, %r137, %r10;
shl.b32 %r149, %r148, 3;
cvt.s64.s32 %rd68, %r149;
add.s64 %rd69, %rd68, %rd4;
add.s32 %r150, %r138, 16;
cvt.s64.s32 %rd70, %r150;
add.s64 %rd71, %rd70, %rd4;
add.s32 %r151, %r138, -16;
cvt.s64.s32 %rd72, %r151;
add.s64 %rd73, %rd72, %rd4;
shl.b32 %r152, %r48, 1;
add.s32 %r153, %r137, %r152;
shl.b32 %r154, %r153, 3;
cvt.s64.s32 %rd74, %r154;
add.s64 %rd75, %rd74, %rd4;
mad.lo.s32 %r155, %r48, -2, %r137;
shl.b32 %r156, %r155, 3;
cvt.s64.s32 %rd76, %r156;
add.s64 %rd77, %rd76, %rd4;
add.s32 %r157, %r137, %r15;
shl.b32 %r158, %r157, 3;
cvt.s64.s32 %rd78, %r158;
add.s64 %rd79, %rd78, %rd4;
add.s32 %r159, %r137, %r16;
shl.b32 %r160, %r159, 3;
cvt.s64.s32 %rd80, %r160;
add.s64 %rd81, %rd80, %rd4;
add.s32 %r161, %r138, 24;
cvt.s64.s32 %rd82, %r161;
add.s64 %rd83, %rd82, %rd4;
add.s32 %r162, %r138, -24;
cvt.s64.s32 %rd84, %r162;
add.s64 %rd85, %rd84, %rd4;
mad.lo.s32 %r163, %r48, 3, %r137;
shl.b32 %r164, %r163, 3;
cvt.s64.s32 %rd86, %r164;
add.s64 %rd87, %rd86, %rd4;
mad.lo.s32 %r165, %r48, -3, %r137;
shl.b32 %r166, %r165, 3;
cvt.s64.s32 %rd88, %r166;
add.s64 %rd89, %rd88, %rd4;
add.s32 %r167, %r137, %r17;
shl.b32 %r168, %r167, 3;
cvt.s64.s32 %rd90, %r168;
add.s64 %rd91, %rd90, %rd4;
add.s32 %r169, %r137, %r18;
shl.b32 %r170, %r169, 3;
cvt.s64.s32 %rd92, %r170;
add.s64 %rd93, %rd92, %rd4;
add.s64 %rd94, %rd56, %rd5;
add.s64 %rd95, %rd56, %rd3;
ld.f64 %fd48, [%rd57];
add.f64 %fd49, %fd48, %fd48;
ld.f64 %fd50, [%rd94];
sub.f64 %fd51, %fd49, %fd50;
ld.global.f64 %fd52, [%rd55];
ld.f64 %fd53, [%rd61];
ld.f64 %fd54, [%rd59];
add.f64 %fd55, %fd54, %fd53;
ld.f64 %fd56, [%rd63];
add.f64 %fd57, %fd55, %fd56;
ld.f64 %fd58, [%rd65];
add.f64 %fd59, %fd57, %fd58;
ld.f64 %fd60, [%rd67];
add.f64 %fd61, %fd59, %fd60;
ld.f64 %fd62, [%rd69];
add.f64 %fd63, %fd61, %fd62;
ld.global.f64 %fd64, [%rd55+8];
mul.f64 %fd65, %fd64, %fd63;
fma.rn.f64 %fd66, %fd52, %fd48, %fd65;
ld.f64 %fd67, [%rd73];
ld.f64 %fd68, [%rd71];
add.f64 %fd69, %fd68, %fd67;
ld.f64 %fd70, [%rd75];
add.f64 %fd71, %fd69, %fd70;
ld.f64 %fd72, [%rd77];
add.f64 %fd73, %fd71, %fd72;
ld.f64 %fd74, [%rd79];
add.f64 %fd75, %fd73, %fd74;
ld.f64 %fd76, [%rd81];
add.f64 %fd77, %fd75, %fd76;
ld.global.f64 %fd78, [%rd55+16];
fma.rn.f64 %fd79, %fd78, %fd77, %fd66;
ld.f64 %fd80, [%rd85];
ld.f64 %fd81, [%rd83];
add.f64 %fd82, %fd81, %fd80;
ld.f64 %fd83, [%rd87];
add.f64 %fd84, %fd82, %fd83;
ld.f64 %fd85, [%rd89];
add.f64 %fd86, %fd84, %fd85;
ld.f64 %fd87, [%rd91];
add.f64 %fd88, %fd86, %fd87;
ld.f64 %fd89, [%rd93];
add.f64 %fd90, %fd88, %fd89;
ld.global.f64 %fd91, [%rd55+24];
fma.rn.f64 %fd92, %fd91, %fd90, %fd79;
// Same final update as the full-chunk path; only the order of the two
// multiplicands in the fma differs.
ld.f64 %fd93, [%rd95];
fma.rn.f64 %fd94, %fd92, %fd93, %fd51;
st.f64 [%rd94], %fd94;
// ---- end of y iteration: advance y by 1 until it equals %r19 ----
BB12_11:
add.s32 %r39, %r174, 1;
setp.ne.s32 %p12, %r39, %r19;
mov.u32 %r174, %r39;
mov.u32 %r173, %r39;
@%p12 bra BB12_5;
// ---- end of z iteration: advance z by 1 until it equals %r20 ----
BB12_12:
add.s32 %r171, %r172, 1;
setp.ne.s32 %p13, %r171, %r20;
mov.u32 %r172, %r171;
@%p13 bra BB12_3;
BB12_13:
ret;
}
// Device function: time-step driver for the stencil kernel.
//
// For every step t in [param_0, param_1) it launches the stencil_step_task
// kernel through cudaLaunchDevice and then calls cudaDeviceSynchronize.
// The two pointers param_13 and param_14 are passed to the kernel in swapped
// order on odd t, so the arrays alternate roles from one step to the next.
//
// Parameters, by how they are used:
//   param_0, param_1   : first step and exclusive last step.
//   param_2 .. param_10: nine 32-bit values copied unchanged into the kernel's
//                        parameter buffer at byte offsets 0..32; param_2..7
//                        are also used to size the launch.
//   param_11, param_12 : 64-bit values copied to buffer offsets 40 and 48.
//   param_13, param_14 : 64-bit values copied to buffer offsets 56 and 64
//                        (even t) or 64 and 56 (odd t).
//   param_15           : 1-byte aggregate, never read.
//   NOTE(review): the buffer offsets presumably line up with the 9 x u32 +
//   4 x u64 parameter list of the stencil kernel -- confirm.
//
// Only the thread with (tid.x & 31) == 0 allocates the buffer and launches;
// every thread calls cudaDeviceSynchronize each step and once more at the end.
// NOTE(review): all status codes returned by cudaLaunchDevice and
// cudaDeviceSynchronize are discarded. If cudaGetParameterBuffer returns 0 in
// the launching thread, the buffer stores are skipped but the launch is still
// issued with a null parameter buffer.
.visible .func loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E_(
.param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_0,
.param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_1,
.param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_2,
.param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_3,
.param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_4,
.param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_5,
.param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_6,
.param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_7,
.param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_8,
.param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_9,
.param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_10,
.param .b64 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_11,
.param .b64 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_12,
.param .b64 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_13,
.param .b64 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_14,
.param .align 1 .b8 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_15[1]
)
{
.reg .pred %p<9>;
.reg .s32 %r<63>;
.reg .s64 %rd<18>;
ld.param.u32 %r62, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_0];
ld.param.u32 %r12, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_1];
ld.param.u32 %r13, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_2];
ld.param.u32 %r14, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_3];
ld.param.u32 %r15, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_4];
ld.param.u32 %r16, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_5];
ld.param.u32 %r17, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_6];
ld.param.u32 %r18, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_7];
ld.param.u32 %r19, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_8];
ld.param.u32 %r20, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_9];
ld.param.u32 %r21, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_10];
ld.param.u64 %rd4, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_11];
ld.param.u64 %rd5, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_12];
ld.param.u64 %rd6, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_13];
ld.param.u64 %rd7, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_14];
// Empty step range: go straight to the final synchronize.
setp.ge.s32 %p1, %r62, %r12;
@%p1 bra BB13_14;
// %r28 = (param_3 - param_2 + 31) / 32, signed division truncating toward 0.
mov.u32 %r22, 31;
sub.s32 %r23, %r22, %r13;
add.s32 %r24, %r23, %r14;
shr.s32 %r25, %r24, 31;
shr.u32 %r26, %r25, 27;
add.s32 %r27, %r24, %r26;
shr.s32 %r28, %r27, 5;
// %r1 = (param_5 - param_4 + 7) / 8, same truncating division.
mov.u32 %r29, 7;
sub.s32 %r30, %r29, %r15;
add.s32 %r31, %r30, %r16;
shr.s32 %r32, %r31, 31;
shr.u32 %r33, %r32, 29;
add.s32 %r34, %r31, %r33;
shr.s32 %r1, %r34, 3;
// %r2 = (param_7 - param_6 + 7) / 8, same truncating division.
sub.s32 %r35, %r29, %r17;
add.s32 %r36, %r35, %r18;
shr.s32 %r37, %r36, 31;
shr.u32 %r38, %r37, 29;
add.s32 %r39, %r36, %r38;
shr.s32 %r2, %r39, 3;
// %r3 = ((%r28 - 1) >> 2) + 1: first launch dimension (arithmetic shift).
add.s32 %r40, %r28, -1;
shr.s32 %r41, %r40, 2;
add.s32 %r3, %r41, 1;
// %r4 = tid.x & 31; %r61 = param_0 - param_1, a negative counter that is
// incremented once per step until it reaches 0.
mov.u32 %r42, %tid.x;
and.b32 %r4, %r42, 31;
sub.s32 %r61, %r62, %r12;
// ---- one iteration per step; %r62 holds the current step t ----
BB13_2:
// %r8 = t & 1 selects the pointer order below; %p2 is true for every thread
// except the one with tid.x & 31 == 0.
and.b32 %r8, %r62, 1;
setp.ne.s32 %p2, %r4, 0;
mov.u64 %rd17, 0;
@%p2 bra BB13_4;
// Launching thread only: request a parameter buffer with arguments (8, 72).
mov.u64 %rd9, 8;
mov.u64 %rd10, 72;
// Callseq Start 2
{
.reg .b32 temp_param_reg;
.param .b64 param0;
st.param.b64 [param0+0], %rd9;
.param .b64 param1;
st.param.b64 [param1+0], %rd10;
.param .b64 retval0;
call.uni (retval0),
cudaGetParameterBuffer,
(
param0,
param1
);
ld.param.b64 %rd17, [retval0+0];
}
// Callseq End 2
BB13_4:
setp.eq.s32 %p3, %r8, 0;
@%p3 bra BB13_9;
// ---- odd t: fill the buffer with param_14 before param_13 ----
// Threads whose %rd17 is 0 (all non-launching threads, or a failed
// allocation) skip the stores.
setp.eq.s64 %p4, %rd17, 0;
@%p4 bra BB13_7;
st.u32 [%rd17], %r13;
st.u32 [%rd17+4], %r14;
st.u32 [%rd17+8], %r15;
st.u32 [%rd17+12], %r16;
st.u32 [%rd17+16], %r17;
st.u32 [%rd17+20], %r18;
st.u32 [%rd17+24], %r19;
st.u32 [%rd17+28], %r20;
st.u32 [%rd17+32], %r21;
st.u64 [%rd17+40], %rd4;
st.u64 [%rd17+48], %rd5;
st.u64 [%rd17+56], %rd7;
st.u64 [%rd17+64], %rd6;
BB13_7:
// Non-launching threads go directly to the synchronize.
@%p2 bra BB13_13;
// Launch stencil_step_task with dimensions (%r3, %r1, %r2) and (128, 1, 1);
// the last two arguments are 0.
mov.u32 %r47, 128;
mov.u32 %r49, 1;
mov.u32 %r50, 0;
mov.u64 %rd13, 0;
mov.u64 %rd11, stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_;
// inline asm
{
.param .b64 param0;
st.param.b64 [param0+0], %rd11;
.param .b64 param1;
st.param.b64 [param1+0], %rd17;
.param .align 4 .b8 param2[12];
st.param.b32 [param2+0], %r3;
st.param.b32 [param2+4], %r1;
st.param.b32 [param2+8], %r2;
.param .align 4 .b8 param3[12];
st.param.b32 [param3+0], %r47;
st.param.b32 [param3+4], %r49;
st.param.b32 [param3+8], %r49;
.param .b32 param4;
st.param.b32 [param4+0], %r50;
.param .b64 param5;
st.param.b64 [param5+0], %rd13;
.param .b32 retval0;
call.uni (retval0),
cudaLaunchDevice,
(
param0,
param1,
param2,
param3,
param4,
param5
);
ld.param.b32 %r43, [retval0+0];
}
// inline asm
bra.uni BB13_13;
// ---- even t: fill the buffer with param_13 before param_14 ----
BB13_9:
setp.eq.s64 %p6, %rd17, 0;
@%p6 bra BB13_11;
st.u32 [%rd17], %r13;
st.u32 [%rd17+4], %r14;
st.u32 [%rd17+8], %r15;
st.u32 [%rd17+12], %r16;
st.u32 [%rd17+16], %r17;
st.u32 [%rd17+20], %r18;
st.u32 [%rd17+24], %r19;
st.u32 [%rd17+28], %r20;
st.u32 [%rd17+32], %r21;
st.u64 [%rd17+40], %rd4;
st.u64 [%rd17+48], %rd5;
st.u64 [%rd17+56], %rd6;
st.u64 [%rd17+64], %rd7;
BB13_11:
// Non-launching threads go directly to the synchronize.
@%p2 bra BB13_13;
// Same launch as in the odd-t path.
mov.u32 %r55, 128;
mov.u32 %r57, 1;
mov.u32 %r58, 0;
mov.u64 %rd16, 0;
mov.u64 %rd14, stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_;
// inline asm
{
.param .b64 param0;
st.param.b64 [param0+0], %rd14;
.param .b64 param1;
st.param.b64 [param1+0], %rd17;
.param .align 4 .b8 param2[12];
st.param.b32 [param2+0], %r3;
st.param.b32 [param2+4], %r1;
st.param.b32 [param2+8], %r2;
.param .align 4 .b8 param3[12];
st.param.b32 [param3+0], %r55;
st.param.b32 [param3+4], %r57;
st.param.b32 [param3+8], %r57;
.param .b32 param4;
st.param.b32 [param4+0], %r58;
.param .b64 param5;
st.param.b64 [param5+0], %rd16;
.param .b32 retval0;
call.uni (retval0),
cudaLaunchDevice,
(
param0,
param1,
param2,
param3,
param4,
param5
);
ld.param.b32 %r51, [retval0+0];
}
// inline asm
// ---- every thread: synchronize after the step's launch, then advance t ----
BB13_13:
// Callseq Start 3
{
.reg .b32 temp_param_reg;
.param .b32 retval0;
call.uni (retval0),
cudaDeviceSynchronize,
(
);
ld.param.b32 %r59, [retval0+0];
}
// Callseq End 3
add.s32 %r62, %r62, 1;
add.s32 %r61, %r61, 1;
setp.ne.s32 %p8, %r61, 0;
@%p8 bra BB13_2;
// ---- final synchronize, also reached directly for an empty step range ----
BB13_14:
// Callseq Start 4
{
.reg .b32 temp_param_reg;
.param .b32 retval0;
call.uni (retval0),
cudaDeviceSynchronize,
(
);
ld.param.b32 %r60, [retval0+0];
}
// Callseq End 4
ret;
}
// Kernel entry point: time-step driver for the stencil kernel.
//
// For every step t in [param_0, param_1) it launches the stencil_step_task
// kernel through cudaLaunchDevice and then calls cudaDeviceSynchronize.
// The two pointers param_13 and param_14 are passed to the kernel in swapped
// order on odd t, so the arrays alternate roles from one step to the next.
//
// Parameters, by how they are used:
//   param_0, param_1   : first step and exclusive last step.
//   param_2 .. param_10: nine 32-bit values copied unchanged into the kernel's
//                        parameter buffer at byte offsets 0..32; param_2..7
//                        are also used to size the launch.
//   param_11, param_12 : 64-bit values copied to buffer offsets 40 and 48.
//   param_13, param_14 : 64-bit values copied to buffer offsets 56 and 64
//                        (even t) or 64 and 56 (odd t).
//   NOTE(review): the buffer offsets presumably line up with the 9 x u32 +
//   4 x u64 parameter list of the stencil kernel -- confirm.
//
// Only the thread with (tid.x & 31) == 0 allocates the buffer and launches;
// every thread calls cudaDeviceSynchronize each step and once more at the end.
// NOTE(review): all status codes returned by cudaLaunchDevice and
// cudaDeviceSynchronize are discarded. If cudaGetParameterBuffer returns 0 in
// the launching thread, the buffer stores are skipped but the launch is still
// issued with a null parameter buffer.
.visible .entry loop_stencil_ispc_tasks(
.param .u32 loop_stencil_ispc_tasks_param_0,
.param .u32 loop_stencil_ispc_tasks_param_1,
.param .u32 loop_stencil_ispc_tasks_param_2,
.param .u32 loop_stencil_ispc_tasks_param_3,
.param .u32 loop_stencil_ispc_tasks_param_4,
.param .u32 loop_stencil_ispc_tasks_param_5,
.param .u32 loop_stencil_ispc_tasks_param_6,
.param .u32 loop_stencil_ispc_tasks_param_7,
.param .u32 loop_stencil_ispc_tasks_param_8,
.param .u32 loop_stencil_ispc_tasks_param_9,
.param .u32 loop_stencil_ispc_tasks_param_10,
.param .u64 loop_stencil_ispc_tasks_param_11,
.param .u64 loop_stencil_ispc_tasks_param_12,
.param .u64 loop_stencil_ispc_tasks_param_13,
.param .u64 loop_stencil_ispc_tasks_param_14
)
{
.reg .pred %p<9>;
.reg .s32 %r<63>;
.reg .s64 %rd<18>;
ld.param.u32 %r62, [loop_stencil_ispc_tasks_param_0];
ld.param.u32 %r12, [loop_stencil_ispc_tasks_param_1];
ld.param.u32 %r13, [loop_stencil_ispc_tasks_param_2];
ld.param.u32 %r14, [loop_stencil_ispc_tasks_param_3];
ld.param.u32 %r15, [loop_stencil_ispc_tasks_param_4];
ld.param.u32 %r16, [loop_stencil_ispc_tasks_param_5];
ld.param.u32 %r17, [loop_stencil_ispc_tasks_param_6];
ld.param.u32 %r18, [loop_stencil_ispc_tasks_param_7];
ld.param.u32 %r19, [loop_stencil_ispc_tasks_param_8];
ld.param.u32 %r20, [loop_stencil_ispc_tasks_param_9];
ld.param.u32 %r21, [loop_stencil_ispc_tasks_param_10];
ld.param.u64 %rd4, [loop_stencil_ispc_tasks_param_11];
ld.param.u64 %rd5, [loop_stencil_ispc_tasks_param_12];
ld.param.u64 %rd6, [loop_stencil_ispc_tasks_param_13];
ld.param.u64 %rd7, [loop_stencil_ispc_tasks_param_14];
// Empty step range: go straight to the final synchronize.
setp.ge.s32 %p1, %r62, %r12;
@%p1 bra BB14_14;
// %r28 = (param_3 - param_2 + 31) / 32, signed division truncating toward 0.
mov.u32 %r22, 31;
sub.s32 %r23, %r22, %r13;
add.s32 %r24, %r23, %r14;
shr.s32 %r25, %r24, 31;
shr.u32 %r26, %r25, 27;
add.s32 %r27, %r24, %r26;
shr.s32 %r28, %r27, 5;
// %r1 = (param_5 - param_4 + 7) / 8, same truncating division.
mov.u32 %r29, 7;
sub.s32 %r30, %r29, %r15;
add.s32 %r31, %r30, %r16;
shr.s32 %r32, %r31, 31;
shr.u32 %r33, %r32, 29;
add.s32 %r34, %r31, %r33;
shr.s32 %r1, %r34, 3;
// %r2 = (param_7 - param_6 + 7) / 8, same truncating division.
sub.s32 %r35, %r29, %r17;
add.s32 %r36, %r35, %r18;
shr.s32 %r37, %r36, 31;
shr.u32 %r38, %r37, 29;
add.s32 %r39, %r36, %r38;
shr.s32 %r2, %r39, 3;
// %r3 = ((%r28 - 1) >> 2) + 1: first launch dimension (arithmetic shift).
add.s32 %r40, %r28, -1;
shr.s32 %r41, %r40, 2;
add.s32 %r3, %r41, 1;
// %r4 = tid.x & 31; %r61 = param_0 - param_1, a negative counter that is
// incremented once per step until it reaches 0.
mov.u32 %r42, %tid.x;
and.b32 %r4, %r42, 31;
sub.s32 %r61, %r62, %r12;
// ---- one iteration per step; %r62 holds the current step t ----
BB14_2:
// %r8 = t & 1 selects the pointer order below; %p2 is true for every thread
// except the one with tid.x & 31 == 0.
and.b32 %r8, %r62, 1;
setp.ne.s32 %p2, %r4, 0;
mov.u64 %rd17, 0;
@%p2 bra BB14_4;
// Launching thread only: request a parameter buffer with arguments (8, 72).
mov.u64 %rd9, 8;
mov.u64 %rd10, 72;
// Callseq Start 5
{
.reg .b32 temp_param_reg;
.param .b64 param0;
st.param.b64 [param0+0], %rd9;
.param .b64 param1;
st.param.b64 [param1+0], %rd10;
.param .b64 retval0;
call.uni (retval0),
cudaGetParameterBuffer,
(
param0,
param1
);
ld.param.b64 %rd17, [retval0+0];
}
// Callseq End 5
BB14_4:
setp.eq.s32 %p3, %r8, 0;
@%p3 bra BB14_9;
// ---- odd t: fill the buffer with param_14 before param_13 ----
// Threads whose %rd17 is 0 (all non-launching threads, or a failed
// allocation) skip the stores.
setp.eq.s64 %p4, %rd17, 0;
@%p4 bra BB14_7;
st.u32 [%rd17], %r13;
st.u32 [%rd17+4], %r14;
st.u32 [%rd17+8], %r15;
st.u32 [%rd17+12], %r16;
st.u32 [%rd17+16], %r17;
st.u32 [%rd17+20], %r18;
st.u32 [%rd17+24], %r19;
st.u32 [%rd17+28], %r20;
st.u32 [%rd17+32], %r21;
st.u64 [%rd17+40], %rd4;
st.u64 [%rd17+48], %rd5;
st.u64 [%rd17+56], %rd7;
st.u64 [%rd17+64], %rd6;
BB14_7:
// Non-launching threads go directly to the synchronize.
@%p2 bra BB14_13;
// Launch stencil_step_task with dimensions (%r3, %r1, %r2) and (128, 1, 1);
// the last two arguments are 0.
mov.u32 %r47, 128;
mov.u32 %r49, 1;
mov.u32 %r50, 0;
mov.u64 %rd13, 0;
mov.u64 %rd11, stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_;
// inline asm
{
.param .b64 param0;
st.param.b64 [param0+0], %rd11;
.param .b64 param1;
st.param.b64 [param1+0], %rd17;
.param .align 4 .b8 param2[12];
st.param.b32 [param2+0], %r3;
st.param.b32 [param2+4], %r1;
st.param.b32 [param2+8], %r2;
.param .align 4 .b8 param3[12];
st.param.b32 [param3+0], %r47;
st.param.b32 [param3+4], %r49;
st.param.b32 [param3+8], %r49;
.param .b32 param4;
st.param.b32 [param4+0], %r50;
.param .b64 param5;
st.param.b64 [param5+0], %rd13;
.param .b32 retval0;
call.uni (retval0),
cudaLaunchDevice,
(
param0,
param1,
param2,
param3,
param4,
param5
);
ld.param.b32 %r43, [retval0+0];
}
// inline asm
bra.uni BB14_13;
// ---- even t: fill the buffer with param_13 before param_14 ----
BB14_9:
setp.eq.s64 %p6, %rd17, 0;
@%p6 bra BB14_11;
st.u32 [%rd17], %r13;
st.u32 [%rd17+4], %r14;
st.u32 [%rd17+8], %r15;
st.u32 [%rd17+12], %r16;
st.u32 [%rd17+16], %r17;
st.u32 [%rd17+20], %r18;
st.u32 [%rd17+24], %r19;
st.u32 [%rd17+28], %r20;
st.u32 [%rd17+32], %r21;
st.u64 [%rd17+40], %rd4;
st.u64 [%rd17+48], %rd5;
st.u64 [%rd17+56], %rd6;
st.u64 [%rd17+64], %rd7;
BB14_11:
// Non-launching threads go directly to the synchronize.
@%p2 bra BB14_13;
// Same launch as in the odd-t path.
mov.u32 %r55, 128;
mov.u32 %r57, 1;
mov.u32 %r58, 0;
mov.u64 %rd16, 0;
mov.u64 %rd14, stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_;
// inline asm
{
.param .b64 param0;
st.param.b64 [param0+0], %rd14;
.param .b64 param1;
st.param.b64 [param1+0], %rd17;
.param .align 4 .b8 param2[12];
st.param.b32 [param2+0], %r3;
st.param.b32 [param2+4], %r1;
st.param.b32 [param2+8], %r2;
.param .align 4 .b8 param3[12];
st.param.b32 [param3+0], %r55;
st.param.b32 [param3+4], %r57;
st.param.b32 [param3+8], %r57;
.param .b32 param4;
st.param.b32 [param4+0], %r58;
.param .b64 param5;
st.param.b64 [param5+0], %rd16;
.param .b32 retval0;
call.uni (retval0),
cudaLaunchDevice,
(
param0,
param1,
param2,
param3,
param4,
param5
);
ld.param.b32 %r51, [retval0+0];
}
// inline asm
// ---- every thread: synchronize after the step's launch, then advance t ----
BB14_13:
// Callseq Start 6
{
.reg .b32 temp_param_reg;
.param .b32 retval0;
call.uni (retval0),
cudaDeviceSynchronize,
(
);
ld.param.b32 %r59, [retval0+0];
}
// Callseq End 6
add.s32 %r62, %r62, 1;
add.s32 %r61, %r61, 1;
setp.ne.s32 %p8, %r61, 0;
@%p8 bra BB14_2;
// ---- final synchronize, also reached directly for an empty step range ----
BB14_14:
// Callseq Start 7
{
.reg .b32 temp_param_reg;
.param .b32 retval0;
call.uni (retval0),
cudaDeviceSynchronize,
(
);
ld.param.b32 %r60, [retval0+0];
}
// Callseq End 7
ret;
}