1247 lines
35 KiB
Plaintext
1247 lines
35 KiB
Plaintext
//
// Generated by NVIDIA NVVM Compiler
// Compiler built on Thu Jul 18 02:37:37 2013 (1374107857)
// Cuda compilation tools, release 5.5, V5.5.0
//

// Module-wide directives: PTX ISA version, target architecture, and the
// width of addresses used by every ld/st/cvta in this file.
.version 3.2
.target sm_35
.address_size 64

// Externally defined function (resolved at link time) used to launch a
// kernel from device code. As called in this module the arguments are:
//   param_0  64-bit address of the kernel to launch
//   param_1  64-bit address of the kernel's parameter buffer
//   param_2  12-byte aggregate: three 32-bit grid dimensions
//   param_3  12-byte aggregate: three 32-bit block dimensions
//   param_4  32-bit value (callers here always pass 0)
//   param_5  64-bit value (callers here always pass 0)
// Returns a 32-bit status, which every caller in this file discards.
// NOTE(review): the meanings of param_4/param_5 (shared-memory size, stream)
// come from the CUDA device-runtime documentation, not from this file.
.extern .func (.param .b32 func_retval0) cudaLaunchDevice
(
	.param .b64 cudaLaunchDevice_param_0,
	.param .b64 cudaLaunchDevice_param_1,
	.param .align 4 .b8 cudaLaunchDevice_param_2[12],
	.param .align 4 .b8 cudaLaunchDevice_param_3[12],
	.param .b32 cudaLaunchDevice_param_4,
	.param .b64 cudaLaunchDevice_param_5
);

// Externally defined function returning a 64-bit address that callers in
// this module use as the parameter buffer handed to cudaLaunchDevice.
// Takes two 64-bit arguments; loop_stencil_ispc_tasks passes (8, 72), i.e.
// NOTE(review): presumably (alignment, size in bytes) per the CUDA
// device-runtime documentation -- confirm against the toolkit in use.
// Callers treat a return value of 0 as "no buffer".
.extern .func (.param .b64 func_retval0) cudaGetParameterBuffer
(
	.param .b64 cudaGetParameterBuffer_param_0,
	.param .b64 cudaGetParameterBuffer_param_1
)
;
// Externally defined function taking no arguments and returning a 32-bit
// status. Called by ISPCSync and by the time-step loop after each launch;
// every caller in this file discards the returned status.
.extern .func (.param .b32 func_retval0) cudaDeviceSynchronize
(

)
;
// 32-byte table with no initializer (PTX zero-initializes .global variables
// that have none). stencil_step_task reads the byte at index (%tid.x & 31)
// and adds it to its outer two loop indices.
.global .align 1 .b8 constDeltaForeach1[32];
// 32-byte table holding 0..31. Indexed by (%tid.x & 31) it yields the lane's
// offset along the innermost (contiguous) dimension in stencil_step_task.
.global .align 1 .b8 constDeltaForeach4[32] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};

// __shfl_i32(param_0 = value, param_1 = source lane) -> 32-bit value
// Warp shuffle in "idx" mode: each thread receives the 32-bit value that
// the thread in lane param_1 supplied as param_0.
.visible .func (.param .b32 func_retval0) __shfl_i32(
	.param .b32 __shfl_i32_param_0,
	.param .b32 __shfl_i32_param_1
)
{
	.reg .s32 	%r<4>;

	ld.param.u32 	%r2, [__shfl_i32_param_0];	// value contributed by this lane
	ld.param.u32 	%r3, [__shfl_i32_param_1];	// lane to read from
	// inline asm
	// c = 0x1f: clamp value 31 in bits 4:0, segment mask 0 (whole warp).
	shfl.idx.b32 %r1, %r2, %r3, 0x1f;
	// inline asm
	st.param.b32	[func_retval0+0], %r1;
	ret;
}

// __shfl_xor_float(param_0 = f32 value, param_1 = lane mask) -> f32
// Warp shuffle in butterfly mode: each thread receives the value supplied
// by the lane whose index is (own lane XOR param_1). The value is moved as
// raw 32 bits; no floating-point conversion takes place.
.visible .func (.param .b32 func_retval0) __shfl_xor_float(
	.param .b32 __shfl_xor_float_param_0,
	.param .b32 __shfl_xor_float_param_1
)
{
	.reg .s32 	%r<2>;
	.reg .f32 	%f<3>;

	ld.param.f32 	%f2, [__shfl_xor_float_param_0];	// value contributed by this lane
	ld.param.u32 	%r1, [__shfl_xor_float_param_1];	// XOR mask applied to the lane index
	// inline asm
	// c = 0x1f: clamp value 31 in bits 4:0, segment mask 0 (whole warp).
	shfl.bfly.b32 %f1, %f2, %r1, 0x1f;
	// inline asm
	st.param.f32	[func_retval0+0], %f1;
	ret;
}

// __shfl_xor_i32(param_0 = 32-bit value, param_1 = lane mask) -> 32-bit value
// Integer counterpart of __shfl_xor_float: butterfly shuffle that returns
// the value supplied by lane (own lane XOR param_1).
.visible .func (.param .b32 func_retval0) __shfl_xor_i32(
	.param .b32 __shfl_xor_i32_param_0,
	.param .b32 __shfl_xor_i32_param_1
)
{
	.reg .s32 	%r<4>;

	ld.param.u32 	%r2, [__shfl_xor_i32_param_0];	// value contributed by this lane
	ld.param.u32 	%r3, [__shfl_xor_i32_param_1];	// XOR mask applied to the lane index
	// inline asm
	// c = 0x1f: clamp value 31 in bits 4:0, segment mask 0 (whole warp).
	shfl.bfly.b32 %r1, %r2, %r3, 0x1f;
	// inline asm
	st.param.b32	[func_retval0+0], %r1;
	ret;
}

// __fminf(param_0, param_1) -> f32
// Returns the result of the PTX min.f32 instruction applied to the two
// single-precision arguments.
.visible .func (.param .b32 func_retval0) __fminf(
	.param .b32 __fminf_param_0,
	.param .b32 __fminf_param_1
)
{
	.reg .f32 	%f<4>;

	ld.param.f32 	%f2, [__fminf_param_0];
	ld.param.f32 	%f3, [__fminf_param_1];
	// inline asm
	min.f32 %f1, %f2, %f3;
	// inline asm
	st.param.f32	[func_retval0+0], %f1;
	ret;
}

// __fmaxf(param_0, param_1) -> f32
// Returns the result of the PTX max.f32 instruction applied to the two
// single-precision arguments.
.visible .func (.param .b32 func_retval0) __fmaxf(
	.param .b32 __fmaxf_param_0,
	.param .b32 __fmaxf_param_1
)
{
	.reg .f32 	%f<4>;

	ld.param.f32 	%f2, [__fmaxf_param_0];
	ld.param.f32 	%f3, [__fmaxf_param_1];
	// inline asm
	max.f32 %f1, %f2, %f3;
	// inline asm
	st.param.f32	[func_retval0+0], %f1;
	ret;
}

// __ballot(param_0 = flag) -> 32-bit mask
// Warp vote: returns a bitmask in which bit n is the predicate
// "flag != 0" of the thread in lane n. Only the low byte of param_0 is
// read (ld.param.u8), so the flag is effectively an 8-bit boolean.
.visible .func (.param .b32 func_retval0) __ballot(
	.param .b32 __ballot_param_0
)
{
	.reg .s32 	%r<3>;

	ld.param.u8 	%r2, [__ballot_param_0];	// zero-extended low byte of the flag
	// inline asm
	{ .reg .pred %p1;
	setp.ne.u32 %p1, %r2, 0;		// predicate: flag is non-zero
	vote.ballot.b32 %r1, %p1;		// gather the predicate across the warp
	}
	// inline asm
	st.param.b32	[func_retval0+0], %r1;
	ret;
}

// __lanemask_lt() -> 32-bit mask
// Returns the %lanemask_lt special register: a mask with bits set for every
// lane position lower than the calling thread's lane.
.visible .func (.param .b32 func_retval0) __lanemask_lt(

)
{
	.reg .s32 	%r<2>;

	// inline asm
	mov.u32 %r1, %lanemask_lt;
	// inline asm
	st.param.b32	[func_retval0+0], %r1;
	ret;
}

// ISPCAlloc(param_0, param_1, param_2) -> 64-bit value
// Stub: none of the three parameters is read and the function always
// returns the constant 1. No memory is allocated here.
// NOTE(review): presumably a non-null placeholder for the ISPC task-system
// allocation hook -- confirm against the ISPC runtime contract.
.visible .func (.param .b64 func_retval0) ISPCAlloc(
	.param .b64 ISPCAlloc_param_0,
	.param .b64 ISPCAlloc_param_1,
	.param .b32 ISPCAlloc_param_2
)
{
	.reg .s64 	%rd<2>;

	mov.u64 	%rd1, 1;
	st.param.b64	[func_retval0+0], %rd1;
	ret;
}

// ISPCGetParamBuffer(param_0, param_1, param_2) -> 64-bit address
// Threads whose (%tid.x & 31) is 0 call
// cudaGetParameterBuffer(param_1, param_2) and return its result; every
// other thread returns 0 without calling anything. param_0 is never read.
.visible .func (.param .b64 func_retval0) ISPCGetParamBuffer(
	.param .b64 ISPCGetParamBuffer_param_0,
	.param .b64 ISPCGetParamBuffer_param_1,
	.param .b64 ISPCGetParamBuffer_param_2
)
{
	.reg .pred 	%p<2>;
	.reg .s32 	%r<3>;
	.reg .s64 	%rd<7>;

	ld.param.u64 	%rd3, [ISPCGetParamBuffer_param_1];	// forwarded as 1st argument
	ld.param.u64 	%rd4, [ISPCGetParamBuffer_param_2];	// forwarded as 2nd argument
	mov.u32 	%r1, %tid.x;
	and.b32  	%r2, %r1, 31;
	setp.ne.s32	%p1, %r2, 0;		// true for every lane except (tid.x & 31) == 0
	mov.u64 	%rd6, 0;		// default result for the skipping lanes
	@%p1 bra 	BB8_2;

	// Callseq Start 0
	{
	.reg .b32 temp_param_reg;
	.param .b64 param0;
	st.param.b64	[param0+0], %rd3;
	.param .b64 param1;
	st.param.b64	[param1+0], %rd4;
	.param .b64 retval0;
	call.uni (retval0), 
	cudaGetParameterBuffer, 
	(
	param0, 
	param1
	);
	ld.param.b64	%rd6, [retval0+0];
	}
	// Callseq End 0

BB8_2:
	st.param.b64	[func_retval0+0], %rd6;
	ret;
}

// ISPCLaunch(param_0, param_1 = kernel address, param_2 = parameter buffer,
//            param_3, param_4, param_5 = three 32-bit counts)
// Threads whose (%tid.x & 31) is 0 call cudaLaunchDevice with
//   grid  = { ((param_3 - 1) >> 2) + 1, param_4, param_5 }
//   block = { 128, 1, 1 }
// and 0 for the two trailing arguments; all other threads return at once.
// param_0 is never read and the launch status (%r6) is discarded.
.visible .func ISPCLaunch(
	.param .b64 ISPCLaunch_param_0,
	.param .b64 ISPCLaunch_param_1,
	.param .b64 ISPCLaunch_param_2,
	.param .b32 ISPCLaunch_param_3,
	.param .b32 ISPCLaunch_param_4,
	.param .b32 ISPCLaunch_param_5
)
{
	.reg .pred 	%p<2>;
	.reg .s32 	%r<16>;
	.reg .s64 	%rd<6>;

	ld.param.u64 	%rd1, [ISPCLaunch_param_1];	// kernel to launch
	ld.param.u64 	%rd2, [ISPCLaunch_param_2];	// its parameter buffer
	ld.param.u32 	%r1, [ISPCLaunch_param_3];
	ld.param.u32 	%r2, [ISPCLaunch_param_4];
	ld.param.u32 	%r3, [ISPCLaunch_param_5];
	mov.u32 	%r4, %tid.x;
	and.b32  	%r5, %r4, 31;
	setp.ne.s32	%p1, %r5, 0;		// only lanes with (tid.x & 31) == 0 launch
	@%p1 bra 	BB9_2;

	// grid.x = ((param_3 - 1) >> 2) + 1, using an arithmetic shift.
	add.s32 	%r14, %r1, -1;
	shr.s32 	%r15, %r14, 2;
	add.s32 	%r7, %r15, 1;
	mov.u32 	%r12, 1;		// block.y and block.z
	mov.u32 	%r10, 128;		// block.x
	mov.u32 	%r13, 0;		// 5th cudaLaunchDevice argument
	mov.u64 	%rd5, 0;		// 6th cudaLaunchDevice argument
	// inline asm
	{
	.param .b64 param0;
	st.param.b64	[param0+0], %rd1;
	.param .b64 param1;
	st.param.b64	[param1+0], %rd2;
	.param .align 4 .b8 param2[12];
	st.param.b32	[param2+0], %r7;
	st.param.b32	[param2+4], %r2;
	st.param.b32	[param2+8], %r3;
	.param .align 4 .b8 param3[12];
	st.param.b32	[param3+0], %r10;
	st.param.b32	[param3+4], %r12;
	st.param.b32	[param3+8], %r12;
	.param .b32 param4;
	st.param.b32	[param4+0], %r13;
	.param .b64 param5;
	st.param.b64	[param5+0], %rd5;

	.param .b32 retval0;
	call.uni (retval0), 
	cudaLaunchDevice,
	(
	param0, 
	param1, 
	param2, 
	param3, 
	param4, 
	param5
	);
	ld.param.b32	%r6, [retval0+0];
	}

	// inline asm

BB9_2:
	ret;
}

// ISPCSync(param_0)
// Calls cudaDeviceSynchronize() and returns. The parameter is never read,
// the returned status (%r1) is discarded, and there is no lane guard:
// every thread that enters the function makes the call.
.visible .func ISPCSync(
	.param .b64 ISPCSync_param_0
)
{
	.reg .s32 	%r<2>;

	// Callseq Start 1
	{
	.reg .b32 temp_param_reg;
	.param .b32 retval0;
	call.uni (retval0), 
	cudaDeviceSynchronize, 
	(
	);
	ld.param.b32	%r1, [retval0+0];
	}
	// Callseq End 1
	ret;
}

// __warpBinExclusiveScan(param_0 = flag) -> packed 64-bit result
// Builds the warp ballot of "flag != 0" (low byte of param_0 only) and
// returns two population counts packed into one 64-bit value:
//   bits 63..32  number of set ballot bits in lanes below the caller
//                (exclusive prefix count)
//   bits 31..0   number of set ballot bits in the whole warp
.visible .func (.param .b64 func_retval0) __warpBinExclusiveScan(
	.param .b32 __warpBinExclusiveScan_param_0
)
{
	.reg .s32 	%r<8>;
	.reg .s64 	%rd<5>;

	ld.param.u8 	%r2, [__warpBinExclusiveScan_param_0];
	// inline asm
	{ .reg .pred %p1;
	setp.ne.u32 %p1, %r2, 0;
	vote.ballot.b32 %r1, %p1;		// %r1 = ballot of the flag across the warp
	}
	// inline asm
	// inline asm
	popc.b32 %r3, %r1;			// total count
	// inline asm
	// inline asm
	mov.u32 %r5, %lanemask_lt;		// bits for lanes below this one
	// inline asm
	and.b32  	%r7, %r5, %r1;
	// inline asm
	popc.b32 %r6, %r7;			// count among lower lanes only
	// inline asm
	cvt.u64.u32	%rd1, %r6;
	shl.b64 	%rd2, %rd1, 32;		// prefix count -> upper half
	cvt.u64.u32	%rd3, %r3;		// total count  -> lower half
	or.b64  	%rd4, %rd2, %rd3;
	st.param.b64	[func_retval0+0], %rd4;
	ret;
}

// Kernel: one time step of a 3-D stencil on double-precision data.
//
// Parameters (as read by the code below):
//   param_0 / param_1   start / exclusive end of the innermost (x) range
//   param_2 / param_3   start / exclusive end of the middle (y) range
//   param_4 / param_5   start / exclusive end of the outer (z) range
//   param_6             element stride between consecutive y rows
//   param_7             multiplied by param_6 to form the stride between z planes
//   param_8             never read
//   param_9             address of 4 doubles (read at +0, +8, +16, +24),
//                       converted to a global address before use
//   param_10            per-element multiplier array (read at the centre index)
//   param_11            input array (centre plus 18 neighbours are read)
//   param_12            array that is read at the centre index and then
//                       overwritten with the result
//
// Work decomposition (derived from the index arithmetic):
//   * a block is treated as 4 warps of 32 lanes (%tid.x >> 5, %ctaid.x * 128),
//     which matches the 128-thread blocks the launchers in this file use;
//   * each warp covers 32 consecutive x values, each block 8 y values
//     (%ctaid.y * 8) and 8 z values (%ctaid.z * 8), clipped to the ranges above.
//
// Per element i, with c0..c3 the 4 doubles at param_9:
//   out[i] = 2*in[i] - out[i]
//          + scale[i] * ( c0*in[i]
//                       + c1*(sum of the 6 neighbours at distance 1)
//                       + c2*(sum of the 6 neighbours at distance 2)
//                       + c3*(sum of the 6 neighbours at distance 3) )
//
// NOTE: byte offsets are formed in 32-bit registers (shl.b32 by 3, then
// cvt.s64.s32), so element index * 8 must fit in a signed 32-bit integer.
.entry stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_(
	.param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_0,
	.param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_1,
	.param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_2,
	.param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_3,
	.param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_4,
	.param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_5,
	.param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_6,
	.param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_7,
	.param .u32 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_8,
	.param .u64 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_9,
	.param .u64 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_10,
	.param .u64 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_11,
	.param .u64 stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_12
)
{
	.reg .pred 	%p<14>;
	.reg .s32 	%r<178>;
	.reg .s64 	%rd<96>;
	.reg .f64 	%fd<95>;

	// ---- Load kernel arguments -------------------------------------------
	ld.param.u32 	%r42, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_0];	// x start
	ld.param.u32 	%r43, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_1];	// x end
	ld.param.u32 	%r44, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_2];	// y start
	ld.param.u32 	%r45, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_3];	// y end
	ld.param.u32 	%r46, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_4];	// z start
	ld.param.u32 	%r47, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_5];	// z end
	ld.param.u32 	%r48, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_6];	// row stride
	ld.param.u32 	%r49, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_7];	// rows per plane
	ld.param.u64 	%rd2, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_9];	// coefficients c0..c3
	ld.param.u64 	%rd3, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_10];	// per-element multiplier
	ld.param.u64 	%rd4, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_11];	// input array
	ld.param.u64 	%rd5, [stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E__param_12];	// in/out array

	// ---- Guard: skip warps/blocks outside the launched grid ---------------
	mov.u32 	%r1, %ctaid.x;
	shl.b32 	%r50, %r1, 2;
	mov.u32 	%r2, %tid.x;
	shr.s32 	%r51, %r2, 5;
	add.s32 	%r52, %r51, %r50;		// global warp index = ctaid.x*4 + tid.x/32
	mov.u32 	%r53, %nctaid.x;
	shl.b32 	%r54, %r53, 2;			// total warps = nctaid.x*4
	setp.ge.s32	%p1, %r52, %r54;
	mov.u32 	%r55, %nctaid.y;
	mov.u32 	%r3, %ctaid.y;
	setp.ge.s32	%p2, %r3, %r55;
	or.pred  	%p3, %p1, %p2;
	mov.u32 	%r56, %nctaid.z;
	mov.u32 	%r4, %ctaid.z;
	setp.ge.s32	%p4, %r4, %r56;
	or.pred  	%p5, %p3, %p4;
	@%p5 bra 	BB12_13;

	// ---- Tile bounds for this warp / block ---------------------------------
	shl.b32 	%r57, %r1, 7;
	add.s32 	%r58, %r2, %r57;
	and.b32  	%r59, %r58, -32;		// warp's x offset: (ctaid.x*128 + tid.x) & ~31
	add.s32 	%r60, %r59, %r42;		// x begin for this warp
	add.s32 	%r61, %r60, 32;
	min.s32 	%r5, %r43, %r61;		// x end = min(x range end, x begin + 32)
	shl.b32 	%r6, %r3, 3;
	add.s32 	%r62, %r6, %r44;		// y begin = y start + ctaid.y*8
	add.s32 	%r7, %r62, 8;
	shl.b32 	%r8, %r4, 3;
	add.s32 	%r172, %r8, %r46;		// z begin = z start + ctaid.z*8
	add.s32 	%r63, %r172, 8;
	min.s32 	%r64, %r47, %r63;		// z end = min(z range end, z begin + 8)
	mul.lo.s32 	%r10, %r49, %r48;		// plane stride = param_7 * param_6
	// %r11 = x end minus ((x end - x begin) % 32): end of the full-warp part.
	sub.s32 	%r65, %r5, %r60;
	shr.s32 	%r66, %r65, 31;
	shr.u32 	%r67, %r66, 27;
	add.s32 	%r68, %r65, %r67;
	and.b32  	%r69, %r68, -32;
	sub.s32 	%r70, %r65, %r69;
	sub.s32 	%r11, %r5, %r70;
	and.b32  	%r71, %r2, 31;			// lane index
	cvt.u64.u32	%rd6, %r71;
	mov.u64 	%rd7, constDeltaForeach1;
	add.s64 	%rd1, %rd7, %rd6;		// &constDeltaForeach1[lane]
	setp.ge.s32	%p6, %r172, %r64;
	@%p6 bra 	BB12_13;			// empty z range

	min.s32 	%r12, %r45, %r7;		// y end = min(y range end, y begin + 8)
	shl.b32 	%r15, %r10, 1;			// +2 planes
	neg.s32 	%r16, %r15;			// -2 planes
	mul.lo.s32 	%r17, %r10, 3;			// +3 planes
	mul.lo.s32 	%r18, %r10, -3;			// -3 planes
	// %r19 = ~max(-9 - ystart - ctaid.y*8, ~yend) = min(y begin + 8, y end)
	mov.u32 	%r72, -9;
	sub.s32 	%r73, %r72, %r44;
	sub.s32 	%r74, %r73, %r6;
	not.b32 	%r75, %r45;
	max.s32 	%r76, %r74, %r75;
	not.b32 	%r19, %r76;
	// %r20 = the same expression for z: min(z begin + 8, z end)
	sub.s32 	%r77, %r72, %r46;
	sub.s32 	%r78, %r77, %r8;
	not.b32 	%r79, %r47;
	max.s32 	%r80, %r78, %r79;
	not.b32 	%r20, %r80;
	ld.global.u8 	%r13, [%rd1];			// constDeltaForeach1[lane]
	mov.u32 	%r171, %r172;

	// ---- z loop: %r172 runs from z begin to %r20 ---------------------------
BB12_3:
	mov.u32 	%r21, %r171;
	add.s32 	%r23, %r21, %r13;		// z + per-lane delta
	setp.ge.s32	%p7, %r62, %r12;
	@%p7 bra 	BB12_12;			// empty y range

	mul.lo.s32 	%r24, %r23, %r10;		// z * plane stride
	mov.u32 	%r174, %r62;
	mov.u32 	%r173, %r62;

	// ---- y loop: %r174 runs from y begin to %r19 ---------------------------
BB12_5:
	mov.u32 	%r27, %r173;
	add.s32 	%r30, %r27, %r13;		// y + per-lane delta
	setp.ge.s32	%p8, %r60, %r11;
	mov.u32 	%r176, %r60;
	@%p8 bra 	BB12_8;				// no full 32-wide chunk: go to the tail

	mov.u64 	%rd9, constDeltaForeach4;
	add.s64 	%rd10, %rd9, %rd6;
	ld.global.u8 	%r31, [%rd10];			// lane's x offset (0..31)
	mad.lo.s32 	%r32, %r30, %r48, %r24;		// y*row stride + z*plane stride
	add.s32 	%r177, %r59, %r42;		// x cursor, starts at x begin

	// ---- x loop over full chunks: every lane is in range -------------------
BB12_7:
	cvta.to.global.u64 	%rd11, %rd2;		// coefficient table as a global address
	add.s32 	%r98, %r32, %r177;
	add.s32 	%r99, %r98, %r31;		// centre element index
	shl.b32 	%r100, %r99, 3;			// centre byte offset (8 bytes per double)
	cvt.s64.s32	%rd12, %r100;
	add.s64 	%rd13, %rd12, %rd4;		// &in[centre]
	add.s32 	%r101, %r100, 8;
	cvt.s64.s32	%rd14, %r101;
	add.s64 	%rd15, %rd14, %rd4;		// &in[x+1]
	add.s32 	%r102, %r100, -8;
	cvt.s64.s32	%rd16, %r102;
	add.s64 	%rd17, %rd16, %rd4;		// &in[x-1]
	add.s32 	%r103, %r99, %r48;
	shl.b32 	%r104, %r103, 3;
	cvt.s64.s32	%rd18, %r104;
	add.s64 	%rd19, %rd18, %rd4;		// &in[y+1]
	sub.s32 	%r105, %r99, %r48;
	shl.b32 	%r106, %r105, 3;
	cvt.s64.s32	%rd20, %r106;
	add.s64 	%rd21, %rd20, %rd4;		// &in[y-1]
	add.s32 	%r108, %r99, %r10;
	shl.b32 	%r109, %r108, 3;
	cvt.s64.s32	%rd22, %r109;
	add.s64 	%rd23, %rd22, %rd4;		// &in[z+1]
	sub.s32 	%r110, %r99, %r10;
	shl.b32 	%r111, %r110, 3;
	cvt.s64.s32	%rd24, %r111;
	add.s64 	%rd25, %rd24, %rd4;		// &in[z-1]
	add.s32 	%r112, %r100, 16;
	cvt.s64.s32	%rd26, %r112;
	add.s64 	%rd27, %rd26, %rd4;		// &in[x+2]
	add.s32 	%r113, %r100, -16;
	cvt.s64.s32	%rd28, %r113;
	add.s64 	%rd29, %rd28, %rd4;		// &in[x-2]
	shl.b32 	%r114, %r48, 1;
	add.s32 	%r115, %r99, %r114;
	shl.b32 	%r116, %r115, 3;
	cvt.s64.s32	%rd30, %r116;
	add.s64 	%rd31, %rd30, %rd4;		// &in[y+2]
	mad.lo.s32 	%r117, %r48, -2, %r99;
	shl.b32 	%r118, %r117, 3;
	cvt.s64.s32	%rd32, %r118;
	add.s64 	%rd33, %rd32, %rd4;		// &in[y-2]
	add.s32 	%r119, %r99, %r15;
	shl.b32 	%r120, %r119, 3;
	cvt.s64.s32	%rd34, %r120;
	add.s64 	%rd35, %rd34, %rd4;		// &in[z+2]
	add.s32 	%r121, %r99, %r16;
	shl.b32 	%r122, %r121, 3;
	cvt.s64.s32	%rd36, %r122;
	add.s64 	%rd37, %rd36, %rd4;		// &in[z-2]
	add.s32 	%r123, %r100, 24;
	cvt.s64.s32	%rd38, %r123;
	add.s64 	%rd39, %rd38, %rd4;		// &in[x+3]
	add.s32 	%r124, %r100, -24;
	cvt.s64.s32	%rd40, %r124;
	add.s64 	%rd41, %rd40, %rd4;		// &in[x-3]
	mad.lo.s32 	%r125, %r48, 3, %r99;
	shl.b32 	%r126, %r125, 3;
	cvt.s64.s32	%rd42, %r126;
	add.s64 	%rd43, %rd42, %rd4;		// &in[y+3]
	mad.lo.s32 	%r127, %r48, -3, %r99;
	shl.b32 	%r128, %r127, 3;
	cvt.s64.s32	%rd44, %r128;
	add.s64 	%rd45, %rd44, %rd4;		// &in[y-3]
	add.s32 	%r129, %r99, %r17;
	shl.b32 	%r130, %r129, 3;
	cvt.s64.s32	%rd46, %r130;
	add.s64 	%rd47, %rd46, %rd4;		// &in[z+3]
	add.s32 	%r131, %r99, %r18;
	shl.b32 	%r132, %r131, 3;
	cvt.s64.s32	%rd48, %r132;
	add.s64 	%rd49, %rd48, %rd4;		// &in[z-3]
	add.s64 	%rd50, %rd12, %rd5;		// &out[centre]
	add.s64 	%rd51, %rd12, %rd3;		// &scale[centre]
	ld.f64 	%fd1, [%rd13];
	add.f64 	%fd2, %fd1, %fd1;		// 2*in
	ld.f64 	%fd3, [%rd50];
	sub.f64 	%fd4, %fd2, %fd3;		// 2*in - out
	ld.global.f64 	%fd5, [%rd11];			// c0
	// distance-1 neighbours
	ld.f64 	%fd6, [%rd17];
	ld.f64 	%fd7, [%rd15];
	add.f64 	%fd8, %fd7, %fd6;
	ld.f64 	%fd9, [%rd19];
	add.f64 	%fd10, %fd8, %fd9;
	ld.f64 	%fd11, [%rd21];
	add.f64 	%fd12, %fd10, %fd11;
	ld.f64 	%fd13, [%rd23];
	add.f64 	%fd14, %fd12, %fd13;
	ld.f64 	%fd15, [%rd25];
	add.f64 	%fd16, %fd14, %fd15;
	ld.global.f64 	%fd17, [%rd11+8];		// c1
	mul.f64 	%fd18, %fd17, %fd16;
	fma.rn.f64 	%fd19, %fd5, %fd1, %fd18;	// c0*in + c1*sum1
	// distance-2 neighbours
	ld.f64 	%fd20, [%rd29];
	ld.f64 	%fd21, [%rd27];
	add.f64 	%fd22, %fd21, %fd20;
	ld.f64 	%fd23, [%rd31];
	add.f64 	%fd24, %fd22, %fd23;
	ld.f64 	%fd25, [%rd33];
	add.f64 	%fd26, %fd24, %fd25;
	ld.f64 	%fd27, [%rd35];
	add.f64 	%fd28, %fd26, %fd27;
	ld.f64 	%fd29, [%rd37];
	add.f64 	%fd30, %fd28, %fd29;
	ld.global.f64 	%fd31, [%rd11+16];		// c2
	fma.rn.f64 	%fd32, %fd31, %fd30, %fd19;
	// distance-3 neighbours
	ld.f64 	%fd33, [%rd41];
	ld.f64 	%fd34, [%rd39];
	add.f64 	%fd35, %fd34, %fd33;
	ld.f64 	%fd36, [%rd43];
	add.f64 	%fd37, %fd35, %fd36;
	ld.f64 	%fd38, [%rd45];
	add.f64 	%fd39, %fd37, %fd38;
	ld.f64 	%fd40, [%rd47];
	add.f64 	%fd41, %fd39, %fd40;
	ld.f64 	%fd42, [%rd49];
	add.f64 	%fd43, %fd41, %fd42;
	ld.global.f64 	%fd44, [%rd11+24];		// c3
	fma.rn.f64 	%fd45, %fd44, %fd43, %fd32;
	ld.f64 	%fd46, [%rd51];				// scale
	fma.rn.f64 	%fd47, %fd46, %fd45, %fd4;	// scale*stencil + (2*in - out)
	st.f64 	[%rd50], %fd47;
	add.s32 	%r177, %r177, 32;		// next 32-wide chunk
	setp.lt.s32	%p9, %r177, %r11;
	mov.u32 	%r175, %r177;
	mov.u32 	%r176, %r175;
	@%p9 bra 	BB12_7;

	// ---- x tail: partial chunk, each lane checks its own bound -------------
BB12_8:
	mov.u32 	%r36, %r176;
	setp.ge.s32	%p10, %r36, %r5;
	@%p10 bra 	BB12_11;			// nothing left in this row

	mov.u64 	%rd53, constDeltaForeach4;
	add.s64 	%rd54, %rd53, %rd6;
	ld.global.u8 	%r135, [%rd54];			// lane's x offset (0..31)
	add.s32 	%r37, %r36, %r135;		// this lane's x
	setp.ge.s32	%p11, %r37, %r5;
	@%p11 bra 	BB12_11;			// lane is past x end

	cvta.to.global.u64 	%rd55, %rd2;		// coefficient table as a global address
	mad.lo.s32 	%r136, %r30, %r48, %r24;
	add.s32 	%r137, %r136, %r37;		// centre element index
	shl.b32 	%r138, %r137, 3;		// centre byte offset
	cvt.s64.s32	%rd56, %r138;
	add.s64 	%rd57, %rd56, %rd4;		// &in[centre]
	add.s32 	%r139, %r138, 8;
	cvt.s64.s32	%rd58, %r139;
	add.s64 	%rd59, %rd58, %rd4;		// &in[x+1]
	add.s32 	%r140, %r138, -8;
	cvt.s64.s32	%rd60, %r140;
	add.s64 	%rd61, %rd60, %rd4;		// &in[x-1]
	add.s32 	%r141, %r137, %r48;
	shl.b32 	%r142, %r141, 3;
	cvt.s64.s32	%rd62, %r142;
	add.s64 	%rd63, %rd62, %rd4;		// &in[y+1]
	sub.s32 	%r143, %r137, %r48;
	shl.b32 	%r144, %r143, 3;
	cvt.s64.s32	%rd64, %r144;
	add.s64 	%rd65, %rd64, %rd4;		// &in[y-1]
	add.s32 	%r146, %r137, %r10;
	shl.b32 	%r147, %r146, 3;
	cvt.s64.s32	%rd66, %r147;
	add.s64 	%rd67, %rd66, %rd4;		// &in[z+1]
	sub.s32 	%r148, %r137, %r10;
	shl.b32 	%r149, %r148, 3;
	cvt.s64.s32	%rd68, %r149;
	add.s64 	%rd69, %rd68, %rd4;		// &in[z-1]
	add.s32 	%r150, %r138, 16;
	cvt.s64.s32	%rd70, %r150;
	add.s64 	%rd71, %rd70, %rd4;		// &in[x+2]
	add.s32 	%r151, %r138, -16;
	cvt.s64.s32	%rd72, %r151;
	add.s64 	%rd73, %rd72, %rd4;		// &in[x-2]
	shl.b32 	%r152, %r48, 1;
	add.s32 	%r153, %r137, %r152;
	shl.b32 	%r154, %r153, 3;
	cvt.s64.s32	%rd74, %r154;
	add.s64 	%rd75, %rd74, %rd4;		// &in[y+2]
	mad.lo.s32 	%r155, %r48, -2, %r137;
	shl.b32 	%r156, %r155, 3;
	cvt.s64.s32	%rd76, %r156;
	add.s64 	%rd77, %rd76, %rd4;		// &in[y-2]
	add.s32 	%r157, %r137, %r15;
	shl.b32 	%r158, %r157, 3;
	cvt.s64.s32	%rd78, %r158;
	add.s64 	%rd79, %rd78, %rd4;		// &in[z+2]
	add.s32 	%r159, %r137, %r16;
	shl.b32 	%r160, %r159, 3;
	cvt.s64.s32	%rd80, %r160;
	add.s64 	%rd81, %rd80, %rd4;		// &in[z-2]
	add.s32 	%r161, %r138, 24;
	cvt.s64.s32	%rd82, %r161;
	add.s64 	%rd83, %rd82, %rd4;		// &in[x+3]
	add.s32 	%r162, %r138, -24;
	cvt.s64.s32	%rd84, %r162;
	add.s64 	%rd85, %rd84, %rd4;		// &in[x-3]
	mad.lo.s32 	%r163, %r48, 3, %r137;
	shl.b32 	%r164, %r163, 3;
	cvt.s64.s32	%rd86, %r164;
	add.s64 	%rd87, %rd86, %rd4;		// &in[y+3]
	mad.lo.s32 	%r165, %r48, -3, %r137;
	shl.b32 	%r166, %r165, 3;
	cvt.s64.s32	%rd88, %r166;
	add.s64 	%rd89, %rd88, %rd4;		// &in[y-3]
	add.s32 	%r167, %r137, %r17;
	shl.b32 	%r168, %r167, 3;
	cvt.s64.s32	%rd90, %r168;
	add.s64 	%rd91, %rd90, %rd4;		// &in[z+3]
	add.s32 	%r169, %r137, %r18;
	shl.b32 	%r170, %r169, 3;
	cvt.s64.s32	%rd92, %r170;
	add.s64 	%rd93, %rd92, %rd4;		// &in[z-3]
	add.s64 	%rd94, %rd56, %rd5;		// &out[centre]
	add.s64 	%rd95, %rd56, %rd3;		// &scale[centre]
	ld.f64 	%fd48, [%rd57];
	add.f64 	%fd49, %fd48, %fd48;		// 2*in
	ld.f64 	%fd50, [%rd94];
	sub.f64 	%fd51, %fd49, %fd50;		// 2*in - out
	ld.global.f64 	%fd52, [%rd55];			// c0
	// distance-1 neighbours
	ld.f64 	%fd53, [%rd61];
	ld.f64 	%fd54, [%rd59];
	add.f64 	%fd55, %fd54, %fd53;
	ld.f64 	%fd56, [%rd63];
	add.f64 	%fd57, %fd55, %fd56;
	ld.f64 	%fd58, [%rd65];
	add.f64 	%fd59, %fd57, %fd58;
	ld.f64 	%fd60, [%rd67];
	add.f64 	%fd61, %fd59, %fd60;
	ld.f64 	%fd62, [%rd69];
	add.f64 	%fd63, %fd61, %fd62;
	ld.global.f64 	%fd64, [%rd55+8];		// c1
	mul.f64 	%fd65, %fd64, %fd63;
	fma.rn.f64 	%fd66, %fd52, %fd48, %fd65;	// c0*in + c1*sum1
	// distance-2 neighbours
	ld.f64 	%fd67, [%rd73];
	ld.f64 	%fd68, [%rd71];
	add.f64 	%fd69, %fd68, %fd67;
	ld.f64 	%fd70, [%rd75];
	add.f64 	%fd71, %fd69, %fd70;
	ld.f64 	%fd72, [%rd77];
	add.f64 	%fd73, %fd71, %fd72;
	ld.f64 	%fd74, [%rd79];
	add.f64 	%fd75, %fd73, %fd74;
	ld.f64 	%fd76, [%rd81];
	add.f64 	%fd77, %fd75, %fd76;
	ld.global.f64 	%fd78, [%rd55+16];		// c2
	fma.rn.f64 	%fd79, %fd78, %fd77, %fd66;
	// distance-3 neighbours
	ld.f64 	%fd80, [%rd85];
	ld.f64 	%fd81, [%rd83];
	add.f64 	%fd82, %fd81, %fd80;
	ld.f64 	%fd83, [%rd87];
	add.f64 	%fd84, %fd82, %fd83;
	ld.f64 	%fd85, [%rd89];
	add.f64 	%fd86, %fd84, %fd85;
	ld.f64 	%fd87, [%rd91];
	add.f64 	%fd88, %fd86, %fd87;
	ld.f64 	%fd89, [%rd93];
	add.f64 	%fd90, %fd88, %fd89;
	ld.global.f64 	%fd91, [%rd55+24];		// c3
	fma.rn.f64 	%fd92, %fd91, %fd90, %fd79;
	ld.f64 	%fd93, [%rd95];				// scale
	fma.rn.f64 	%fd94, %fd92, %fd93, %fd51;	// stencil*scale + (2*in - out)
	st.f64 	[%rd94], %fd94;

	// ---- y loop latch -------------------------------------------------------
BB12_11:
	add.s32 	%r39, %r174, 1;
	setp.ne.s32	%p12, %r39, %r19;
	mov.u32 	%r174, %r39;
	mov.u32 	%r173, %r39;
	@%p12 bra 	BB12_5;

	// ---- z loop latch -------------------------------------------------------
BB12_12:
	add.s32 	%r171, %r172, 1;
	setp.ne.s32	%p13, %r171, %r20;
	mov.u32 	%r172, %r171;
	@%p13 bra 	BB12_3;

BB12_13:
	ret;
}

// Time-step driver: for t in [param_0, param_1) launch one stencil_step_task
// grid from device code and wait for it with cudaDeviceSynchronize().
//
// Parameters (as read by the code below):
//   param_0 / param_1    first time step / exclusive end
//   param_2 .. param_10  nine 32-bit values copied verbatim into the child
//                        kernel's parameter buffer at byte offsets 0..32
//                        (param_2..param_7 are also used to size the grid)
//   param_11, param_12   64-bit values copied to buffer offsets 40 and 48
//   param_13, param_14   64-bit values copied to offsets 56/64; their order
//                        is swapped on odd time steps so the two arrays
//                        alternate between input and output roles
//   param_15             1-byte aggregate, never read
//
// Launch geometry (computed once, before the loop):
//   grid.x = (((param_3 - param_2 + 31) / 32) - 1 >> 2) + 1
//   grid.y = (param_5 - param_4 + 7) / 8
//   grid.z = (param_7 - param_6 + 7) / 8
//   block  = { 128, 1, 1 }
// Only threads with (%tid.x & 31) == 0 allocate a parameter buffer and
// launch; every thread calls cudaDeviceSynchronize() each iteration and once
// more before returning. All returned status codes are discarded.
//
// Fix relative to the generated code: when cudaGetParameterBuffer returns 0
// the launch is now skipped. Previously lane 0 skipped only the buffer fill
// and still passed the NULL buffer to cudaLaunchDevice for a kernel that
// takes 13 parameters.
.visible .func loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E_(
	.param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_0,
	.param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_1,
	.param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_2,
	.param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_3,
	.param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_4,
	.param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_5,
	.param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_6,
	.param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_7,
	.param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_8,
	.param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_9,
	.param .b32 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_10,
	.param .b64 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_11,
	.param .b64 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_12,
	.param .b64 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_13,
	.param .b64 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_14,
	.param .align 1 .b8 loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_15[1]
)
{
	.reg .pred 	%p<9>;
	.reg .s32 	%r<63>;
	.reg .s64 	%rd<18>;

	// ---- Load arguments ------------------------------------------------------
	ld.param.u32 	%r62, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_0];	// current time step t
	ld.param.u32 	%r12, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_1];	// end time step
	ld.param.u32 	%r13, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_2];
	ld.param.u32 	%r14, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_3];
	ld.param.u32 	%r15, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_4];
	ld.param.u32 	%r16, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_5];
	ld.param.u32 	%r17, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_6];
	ld.param.u32 	%r18, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_7];
	ld.param.u32 	%r19, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_8];
	ld.param.u32 	%r20, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_9];
	ld.param.u32 	%r21, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_10];
	ld.param.u64 	%rd4, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_11];
	ld.param.u64 	%rd5, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_12];
	ld.param.u64 	%rd6, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_13];
	ld.param.u64 	%rd7, [loop_stencil_ispc_tasks___uniuniuniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_un_3C_und_3E__param_14];
	setp.ge.s32	%p1, %r62, %r12;
	@%p1 bra 	BB13_14;			// empty time range: just synchronize

	// ---- Grid dimensions (loop-invariant) ------------------------------------
	// %r28 = (param_3 - param_2 + 31) / 32, signed division via biased shift.
	mov.u32 	%r22, 31;
	sub.s32 	%r23, %r22, %r13;
	add.s32 	%r24, %r23, %r14;
	shr.s32 	%r25, %r24, 31;
	shr.u32 	%r26, %r25, 27;
	add.s32 	%r27, %r24, %r26;
	shr.s32 	%r28, %r27, 5;
	// %r1 = (param_5 - param_4 + 7) / 8  -> grid.y
	mov.u32 	%r29, 7;
	sub.s32 	%r30, %r29, %r15;
	add.s32 	%r31, %r30, %r16;
	shr.s32 	%r32, %r31, 31;
	shr.u32 	%r33, %r32, 29;
	add.s32 	%r34, %r31, %r33;
	shr.s32 	%r1, %r34, 3;
	// %r2 = (param_7 - param_6 + 7) / 8  -> grid.z
	sub.s32 	%r35, %r29, %r17;
	add.s32 	%r36, %r35, %r18;
	shr.s32 	%r37, %r36, 31;
	shr.u32 	%r38, %r37, 29;
	add.s32 	%r39, %r36, %r38;
	shr.s32 	%r2, %r39, 3;
	// %r3 = ((%r28 - 1) >> 2) + 1        -> grid.x (4 groups of 32 per block)
	add.s32 	%r40, %r28, -1;
	shr.s32 	%r41, %r40, 2;
	add.s32 	%r3, %r41, 1;
	mov.u32 	%r42, %tid.x;
	and.b32  	%r4, %r42, 31;			// lane index
	sub.s32 	%r61, %r62, %r12;		// negative trip counter, counts up to 0

	// ---- Time-step loop --------------------------------------------------------
BB13_2:
	and.b32  	%r8, %r62, 1;			// parity of t
	setp.ne.s32	%p2, %r4, 0;			// true for every lane except lane 0
	mov.u64 	%rd17, 0;			// parameter buffer; stays 0 off lane 0
	@%p2 bra 	BB13_4;

	// Lane 0: request a parameter buffer with arguments (8, 72). 72 bytes is
	// the extent of the stores below (last one is 8 bytes at offset 64).
	mov.u64 	%rd9, 8;
	mov.u64 	%rd10, 72;
	// Callseq Start 2
	{
	.reg .b32 temp_param_reg;
	.param .b64 param0;
	st.param.b64	[param0+0], %rd9;
	.param .b64 param1;
	st.param.b64	[param1+0], %rd10;
	.param .b64 retval0;
	call.uni (retval0), 
	cudaGetParameterBuffer, 
	(
	param0, 
	param1
	);
	ld.param.b64	%rd17, [retval0+0];
	}
	// Callseq End 2

BB13_4:
	setp.eq.s32	%p3, %r8, 0;
	@%p3 bra 	BB13_9;				// even t handled below

	// ---- Odd t: param_14 goes to offset 56, param_13 to offset 64 -------------
	setp.eq.s64	%p4, %rd17, 0;
	// No buffer (non-zero lanes, or a failed allocation on lane 0): skip both
	// the fill and the launch and go straight to the synchronize step.
	@%p4 bra 	BB13_13;

	st.u32 	[%rd17], %r13;
	st.u32 	[%rd17+4], %r14;
	st.u32 	[%rd17+8], %r15;
	st.u32 	[%rd17+12], %r16;
	st.u32 	[%rd17+16], %r17;
	st.u32 	[%rd17+20], %r18;
	st.u32 	[%rd17+24], %r19;
	st.u32 	[%rd17+28], %r20;
	st.u32 	[%rd17+32], %r21;
	st.u64 	[%rd17+40], %rd4;
	st.u64 	[%rd17+48], %rd5;
	st.u64 	[%rd17+56], %rd7;
	st.u64 	[%rd17+64], %rd6;

	@%p2 bra 	BB13_13;			// defensive: only lane 0 launches

	mov.u32 	%r47, 128;			// block.x
	mov.u32 	%r49, 1;			// block.y, block.z
	mov.u32 	%r50, 0;			// 5th cudaLaunchDevice argument
	mov.u64 	%rd13, 0;			// 6th cudaLaunchDevice argument
	mov.u64 	%rd11, stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_;
	// inline asm
	{
	.param .b64 param0;
	st.param.b64	[param0+0], %rd11;
	.param .b64 param1;
	st.param.b64	[param1+0], %rd17;
	.param .align 4 .b8 param2[12];
	st.param.b32	[param2+0], %r3;
	st.param.b32	[param2+4], %r1;
	st.param.b32	[param2+8], %r2;
	.param .align 4 .b8 param3[12];
	st.param.b32	[param3+0], %r47;
	st.param.b32	[param3+4], %r49;
	st.param.b32	[param3+8], %r49;
	.param .b32 param4;
	st.param.b32	[param4+0], %r50;
	.param .b64 param5;
	st.param.b64	[param5+0], %rd13;

	.param .b32 retval0;
	call.uni (retval0), 
	cudaLaunchDevice,
	(
	param0, 
	param1, 
	param2, 
	param3, 
	param4, 
	param5
	);
	ld.param.b32	%r43, [retval0+0];
	}

	// inline asm
	bra.uni 	BB13_13;

	// ---- Even t: param_13 goes to offset 56, param_14 to offset 64 ------------
BB13_9:
	setp.eq.s64	%p6, %rd17, 0;
	// Same rule as the odd branch: no buffer means no launch.
	@%p6 bra 	BB13_13;

	st.u32 	[%rd17], %r13;
	st.u32 	[%rd17+4], %r14;
	st.u32 	[%rd17+8], %r15;
	st.u32 	[%rd17+12], %r16;
	st.u32 	[%rd17+16], %r17;
	st.u32 	[%rd17+20], %r18;
	st.u32 	[%rd17+24], %r19;
	st.u32 	[%rd17+28], %r20;
	st.u32 	[%rd17+32], %r21;
	st.u64 	[%rd17+40], %rd4;
	st.u64 	[%rd17+48], %rd5;
	st.u64 	[%rd17+56], %rd6;
	st.u64 	[%rd17+64], %rd7;

	@%p2 bra 	BB13_13;			// defensive: only lane 0 launches

	mov.u32 	%r55, 128;			// block.x
	mov.u32 	%r57, 1;			// block.y, block.z
	mov.u32 	%r58, 0;			// 5th cudaLaunchDevice argument
	mov.u64 	%rd16, 0;			// 6th cudaLaunchDevice argument
	mov.u64 	%rd14, stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_;
	// inline asm
	{
	.param .b64 param0;
	st.param.b64	[param0+0], %rd14;
	.param .b64 param1;
	st.param.b64	[param1+0], %rd17;
	.param .align 4 .b8 param2[12];
	st.param.b32	[param2+0], %r3;
	st.param.b32	[param2+4], %r1;
	st.param.b32	[param2+8], %r2;
	.param .align 4 .b8 param3[12];
	st.param.b32	[param3+0], %r55;
	st.param.b32	[param3+4], %r57;
	st.param.b32	[param3+8], %r57;
	.param .b32 param4;
	st.param.b32	[param4+0], %r58;
	.param .b64 param5;
	st.param.b64	[param5+0], %rd16;

	.param .b32 retval0;
	call.uni (retval0), 
	cudaLaunchDevice,
	(
	param0, 
	param1, 
	param2, 
	param3, 
	param4, 
	param5
	);
	ld.param.b32	%r51, [retval0+0];
	}

	// inline asm

	// ---- Wait for the step, then advance t -------------------------------------
BB13_13:
	// Callseq Start 3
	{
	.reg .b32 temp_param_reg;
	.param .b32 retval0;
	call.uni (retval0), 
	cudaDeviceSynchronize, 
	(
	);
	ld.param.b32	%r59, [retval0+0];
	}
	// Callseq End 3
	add.s32 	%r62, %r62, 1;			// t++
	add.s32 	%r61, %r61, 1;			// trip counter towards 0
	setp.ne.s32	%p8, %r61, 0;
	@%p8 bra 	BB13_2;

	// ---- Final synchronize before returning ------------------------------------
BB13_14:
	// Callseq Start 4
	{
	.reg .b32 temp_param_reg;
	.param .b32 retval0;
	call.uni (retval0), 
	cudaDeviceSynchronize, 
	(
	);
	ld.param.b32	%r60, [retval0+0];
	}
	// Callseq End 4
	ret;
}

//
// loop_stencil_ispc_tasks
//
// Entry kernel that walks a step counter from param_0 up to (but not
// including) param_1.  On every step, lane 0 of each warp:
//   1. requests a 72-byte, 8-byte-aligned parameter buffer,
//   2. fills it with param_2..param_10 (32-bit, offsets 0..32) and
//      param_11..param_14 (64-bit, offsets 40..64),
//   3. launches stencil_step_task... through cudaLaunchDevice with a
//      grid of (grid_x, grid_y, grid_z) and a block of (128, 1, 1).
// Every thread then calls cudaDeviceSynchronize before the next step, and
// once more before returning.
//
// On odd steps (counter & 1 != 0) the pointers written at offsets 56 and 64
// are exchanged: offset 56 receives param_14 and offset 64 receives param_13.
// On even steps they are stored in declaration order.
//
// The status codes returned by cudaLaunchDevice / cudaDeviceSynchronize are
// read back but not inspected.
//
.visible .entry loop_stencil_ispc_tasks(
	.param .u32 loop_stencil_ispc_tasks_param_0,
	.param .u32 loop_stencil_ispc_tasks_param_1,
	.param .u32 loop_stencil_ispc_tasks_param_2,
	.param .u32 loop_stencil_ispc_tasks_param_3,
	.param .u32 loop_stencil_ispc_tasks_param_4,
	.param .u32 loop_stencil_ispc_tasks_param_5,
	.param .u32 loop_stencil_ispc_tasks_param_6,
	.param .u32 loop_stencil_ispc_tasks_param_7,
	.param .u32 loop_stencil_ispc_tasks_param_8,
	.param .u32 loop_stencil_ispc_tasks_param_9,
	.param .u32 loop_stencil_ispc_tasks_param_10,
	.param .u64 loop_stencil_ispc_tasks_param_11,
	.param .u64 loop_stencil_ispc_tasks_param_12,
	.param .u64 loop_stencil_ispc_tasks_param_13,
	.param .u64 loop_stencil_ispc_tasks_param_14
)
{
	.reg .pred 	%p_empty, %p_not_lane0, %p_odd, %p_no_buf, %p_again;
	// %argN mirrors loop_stencil_ispc_tasks_param_N; %arg0 doubles as the
	// running step counter.
	.reg .s32 	%arg<11>;
	.reg .s32 	%t<2>, %grid_x, %grid_y, %grid_z, %lane;
	.reg .s32 	%blk_x, %one, %zero, %rc;
	.reg .s64 	%ptr11, %ptr12, %ptr13, %ptr14;
	.reg .s64 	%buf, %slot56, %slot64, %kernel, %align, %bytes, %null64;

	// ---- load kernel arguments -------------------------------------------
	ld.param.u32 	%arg0, [loop_stencil_ispc_tasks_param_0];
	ld.param.u32 	%arg1, [loop_stencil_ispc_tasks_param_1];
	ld.param.u32 	%arg2, [loop_stencil_ispc_tasks_param_2];
	ld.param.u32 	%arg3, [loop_stencil_ispc_tasks_param_3];
	ld.param.u32 	%arg4, [loop_stencil_ispc_tasks_param_4];
	ld.param.u32 	%arg5, [loop_stencil_ispc_tasks_param_5];
	ld.param.u32 	%arg6, [loop_stencil_ispc_tasks_param_6];
	ld.param.u32 	%arg7, [loop_stencil_ispc_tasks_param_7];
	ld.param.u32 	%arg8, [loop_stencil_ispc_tasks_param_8];
	ld.param.u32 	%arg9, [loop_stencil_ispc_tasks_param_9];
	ld.param.u32 	%arg10, [loop_stencil_ispc_tasks_param_10];
	ld.param.u64 	%ptr11, [loop_stencil_ispc_tasks_param_11];
	ld.param.u64 	%ptr12, [loop_stencil_ispc_tasks_param_12];
	ld.param.u64 	%ptr13, [loop_stencil_ispc_tasks_param_13];
	ld.param.u64 	%ptr14, [loop_stencil_ispc_tasks_param_14];

	// Empty range: skip straight to the trailing synchronize.
	setp.ge.s32 	%p_empty, %arg0, %arg1;
	@%p_empty bra 	LST_FINAL_SYNC;

	// ---- launch grid, computed once --------------------------------------
	// grid_x = (((31 - arg2 + arg3) / 32 - 1) >> 2) + 1
	// The shr.s32/shr.u32/add/shr.s32 sequence is a signed divide by 32 that
	// rounds toward zero (adds 31 to negative dividends before shifting).
	mov.u32 	%t0, 31;
	sub.s32 	%t0, %t0, %arg2;
	add.s32 	%t0, %t0, %arg3;
	shr.s32 	%t1, %t0, 31;
	shr.u32 	%t1, %t1, 27;
	add.s32 	%t0, %t0, %t1;
	shr.s32 	%t0, %t0, 5;
	add.s32 	%t0, %t0, -1;
	shr.s32 	%t0, %t0, 2;
	add.s32 	%grid_x, %t0, 1;

	// grid_y = (7 - arg4 + arg5) / 8, same round-toward-zero idiom.
	mov.u32 	%t0, 7;
	sub.s32 	%t0, %t0, %arg4;
	add.s32 	%t0, %t0, %arg5;
	shr.s32 	%t1, %t0, 31;
	shr.u32 	%t1, %t1, 29;
	add.s32 	%t0, %t0, %t1;
	shr.s32 	%grid_y, %t0, 3;

	// grid_z = (7 - arg6 + arg7) / 8
	mov.u32 	%t0, 7;
	sub.s32 	%t0, %t0, %arg6;
	add.s32 	%t0, %t0, %arg7;
	shr.s32 	%t1, %t0, 31;
	shr.u32 	%t1, %t1, 29;
	add.s32 	%t0, %t0, %t1;
	shr.s32 	%grid_z, %t0, 3;

	// Only threads whose (tid.x & 31) is zero build and issue a launch.
	mov.u32 	%lane, %tid.x;
	and.b32 	%lane, %lane, 31;
	setp.ne.s32 	%p_not_lane0, %lane, 0;

	// ---- loop-invariant call operands ------------------------------------
	mov.u64 	%align, 8;		// cudaGetParameterBuffer argument 0
	mov.u64 	%bytes, 72;		// cudaGetParameterBuffer argument 1
	mov.u32 	%blk_x, 128;		// block dimension x
	mov.u32 	%one, 1;		// block dimensions y and z
	mov.u32 	%zero, 0;		// cudaLaunchDevice argument 4
	mov.u64 	%null64, 0;		// cudaLaunchDevice argument 5
	mov.u64 	%kernel, stencil_step_task___UM_uniuniuniuniuniuniuniuniuniun_3C_Cund_3E_un_3C_Cund_3E_un_3C_Cund_3E_un_3C_und_3E_;

LST_STEP:
	// Threads other than lane 0 have nothing to prepare for this step.
	@%p_not_lane0 bra 	LST_SYNC;

	// Pick the pointer order for this step: exchanged when the counter is odd.
	and.b32 	%t0, %arg0, 1;
	setp.ne.s32 	%p_odd, %t0, 0;
	selp.b64 	%slot56, %ptr14, %ptr13, %p_odd;
	selp.b64 	%slot64, %ptr13, %ptr14, %p_odd;

	// Ask the device runtime for a parameter buffer.
	{
	.param .b64 param0;
	st.param.b64 	[param0+0], %align;
	.param .b64 param1;
	st.param.b64 	[param1+0], %bytes;
	.param .b64 retval0;
	call.uni (retval0), cudaGetParameterBuffer, (param0, param1);
	ld.param.b64 	%buf, [retval0+0];
	}

	// A null buffer is never written to; the launch below is still issued.
	setp.eq.s64 	%p_no_buf, %buf, 0;
	@%p_no_buf bra 	LST_LAUNCH;

	// Marshal the child kernel's arguments into the buffer.
	st.u32 	[%buf], %arg2;
	st.u32 	[%buf+4], %arg3;
	st.u32 	[%buf+8], %arg4;
	st.u32 	[%buf+12], %arg5;
	st.u32 	[%buf+16], %arg6;
	st.u32 	[%buf+20], %arg7;
	st.u32 	[%buf+24], %arg8;
	st.u32 	[%buf+28], %arg9;
	st.u32 	[%buf+32], %arg10;
	st.u64 	[%buf+40], %ptr11;
	st.u64 	[%buf+48], %ptr12;
	st.u64 	[%buf+56], %slot56;
	st.u64 	[%buf+64], %slot64;

LST_LAUNCH:
	// cudaLaunchDevice(kernel, buf, {grid_x, grid_y, grid_z}, {128, 1, 1}, 0, 0)
	{
	.param .b64 param0;
	st.param.b64 	[param0+0], %kernel;
	.param .b64 param1;
	st.param.b64 	[param1+0], %buf;
	.param .align 4 .b8 param2[12];
	st.param.b32 	[param2+0], %grid_x;
	st.param.b32 	[param2+4], %grid_y;
	st.param.b32 	[param2+8], %grid_z;
	.param .align 4 .b8 param3[12];
	st.param.b32 	[param3+0], %blk_x;
	st.param.b32 	[param3+4], %one;
	st.param.b32 	[param3+8], %one;
	.param .b32 param4;
	st.param.b32 	[param4+0], %zero;
	.param .b64 param5;
	st.param.b64 	[param5+0], %null64;
	.param .b32 retval0;
	call.uni (retval0), cudaLaunchDevice, (param0, param1, param2, param3, param4, param5);
	ld.param.b32 	%rc, [retval0+0];
	}

LST_SYNC:
	// Reached by every thread once per step.
	{
	.param .b32 retval0;
	call.uni (retval0), cudaDeviceSynchronize, ();
	ld.param.b32 	%rc, [retval0+0];
	}

	// Advance the step counter and repeat while it is still below param_1.
	add.s32 	%arg0, %arg0, 1;
	setp.lt.s32 	%p_again, %arg0, %arg1;
	@%p_again bra 	LST_STEP;

LST_FINAL_SYNC:
	// Trailing synchronize, executed whether or not the loop ran.
	{
	.param .b32 retval0;
	call.uni (retval0), cudaDeviceSynchronize, ();
	ld.param.b32 	%rc, [retval0+0];
	}
	ret;
}
|
|
|
|
|
|
|