diff --git a/builtins/target-nvptx64.ll b/builtins/target-nvptx64.ll index cdde3532..7479a174 100644 --- a/builtins/target-nvptx64.ll +++ b/builtins/target-nvptx64.ll @@ -115,7 +115,8 @@ define i8* @ISPCAlloc(i8**, i64, i32) nounwind alwaysinline { ret i8* null } -define void @ISPCLaunch(i8**, i8* %func_ptr, i8** %func_args, i32 %ntx, i32 %nty, i32 %ntz) nounwind alwaysinline +declare i64 @cudaGetParameterBuffer(i64, i64) nounwind +define void @ISPCLaunch(i8**, i8* %func_ptr, i8** %func_args, i32 %nargs, i32 %ntx, i32 %nty, i32 %ntz) nounwind alwaysinline { entry: %func_i64 = ptrtoint i8* %func_ptr to i64 @@ -136,6 +137,10 @@ entry: if.then: ; preds = %entry + %param = call i64 @cudaGetParameterBuffer(i64 8, i64 24); + %ptr = inttoptr i64 %param to i8*; + + %res_tmp = call i32 asm sideeffect "{ .reg .s32 %r<8>; .reg .s64 %rd<3>; diff --git a/examples_cuda/aobench/ao.cu b/examples_cuda/aobench/ao.cu index 77fff8ae..4b92b2f5 100644 --- a/examples_cuda/aobench/ao.cu +++ b/examples_cuda/aobench/ao.cu @@ -418,7 +418,7 @@ void ao_ispc_tasks( const int nbz = 1; const dim3 blocks (nbx, nby, nbz); if (threadIdx.x == 0) - ao_task<<>>(w,h,nsubsamples,image); + ao_task<<>>(w,w,nsubsamples,image); cudaDeviceSynchronize(); } #endif