diff --git a/builtins/target-nvptx64.ll b/builtins/target-nvptx64.ll index 65cf405a..7b469307 100644 --- a/builtins/target-nvptx64.ll +++ b/builtins/target-nvptx64.ll @@ -96,6 +96,77 @@ define i32 @__lanemask_lt() nounwind readnone alwaysinline ret i32 %mask } +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; tasking + +module asm " +.extern .func (.param .b32 func_retval0) cudaLaunchDevice +( + .param .b64 cudaLaunchDevice_param_0, + .param .b64 cudaLaunchDevice_param_1, + .param .align 4 .b8 cudaLaunchDevice_param_2[12], + .param .align 4 .b8 cudaLaunchDevice_param_3[12], + .param .b32 cudaLaunchDevice_param_4, + .param .b64 cudaLaunchDevice_param_5 +); +" +define i8* @ISPCAlloc(i8**, i64, i32) nounwind alwaysinline builtin +{ + ret i8* null +} +define void @ISPCLaunch(i8**, i8* %func_ptr, i8** %func_args, i32 %ntx, i32 %nty, i32 %ntz) builtin +{ + %func_i64 = ptrtoint i8* %func_ptr to i64 + %args_i64 = ptrtoint i8** %func_args to i64 +;; nbx = (%ntx-1)/(blocksize/warpsize) + 1 for blocksize=128 & warpsize=32 + %sub = add nsw i32 %ntx, -1 + %div = sdiv i32 %sub, 4 + %nbx = add nsw i32 %div, 1 + + %res = call i32 asm sideeffect "{ + .reg .s32 %r<8>; + .reg .s64 %rd<3>; + .param .b64 param0; + st.param.b64 [param0+0], $1; //%rd0; + .param .b64 param1; + st.param.b64 [param1+0], $2; //%rd1; + .param .align 4 .b8 param2[12]; + st.param.b32 [param2+0], $3; //%r0; + st.param.b32 [param2+4], $4; //%r1; + st.param.b32 [param2+8], $5; //%r2; + .param .align 4 .b8 param3[12]; + st.param.b32 [param3+0], $6; //%r3; + st.param.b32 [param3+4], $7; //%r4; + st.param.b32 [param3+8], $8; //%r5; + .param .b32 param4; + st.param.b32 [param4+0], $9; //%r6; + .param .b64 param5; + st.param.b64 [param5+0], $10; //%rd2; + + .param .b32 retval0; + call.uni (retval0), + cudaLaunchDevice, + ( + param0, + param1, + param2, + param3, + param4, + param5 + ); + ld.param.b32 $0, [retval0+0]; + ", +"=r, l,l, r,r,r, r,r,r, r,l"(i64 %func_i64,i64 %args_i64, i32 %nbx,i32 %nty,i32 %ntz, i32 128,i32 1,i32 1, i32 0,i64 0); + ret void +} +declare i32 @cudaDeviceSynchronize() nounwind +define void @ISPCSync(i8*) nounwind alwaysinline +{ + call i32 @cudaDeviceSynchronize() + ret void; +} + + ;;;;;;;;;;;;;; diff --git a/builtins/util_ptx.m4 b/builtins/util_ptx.m4 index 399d87d2..ca61f84e 100644 --- a/builtins/util_ptx.m4 +++ b/builtins/util_ptx.m4 @@ -1830,15 +1830,8 @@ define(`stdlib_core', ` declare i32 @__fast_masked_vload() -declare i8* @ISPCAlloc(i8**, i64, i32) nounwind -declare void @ISPCLaunch(i8**, i8*, i8*, i32,i32,i32) nounwind -declare void @ISPCSync(i8*) nounwind declare void @ISPCInstrument(i8*, i8*, i32, i64) nounwind -declare i8* @CUDAAlloc(i8**, i64, i32) nounwind -declare void @CUDALaunch(i8**, i8*,i8*,i8*, i8**, i32, i32, i32) nounwind -declare void @CUDASync(i8*) nounwind - declare i1 @__is_compile_time_constant_mask( %mask) declare i1 @__is_compile_time_constant_uniform_int32(i32) declare i1 @__is_compile_time_constant_varying_int32() diff --git a/ctx.cpp b/ctx.cpp index 8355bbc6..55adb95b 100644 --- a/ctx.cpp +++ b/ctx.cpp @@ -3617,7 +3617,7 @@ FunctionEmitContext::LaunchInst(llvm::Value *callee, static_cast(pt->getElementType()); #endif - llvm::Function *falloc = m->module->getFunction("CUDAAlloc"); + llvm::Function *falloc = m->module->getFunction("ISPCAlloc"); AssertPos(currentPos, falloc != NULL); #if 0 llvm::Value *structSize = g->target->SizeOf(argStructType, bblock); @@ -3691,12 +3691,13 @@ FunctionEmitContext::LaunchInst(llvm::Value *callee, // argument block we just filled in - llvm::Function *flaunch = m->module->getFunction("CUDALaunch"); + llvm::Function *flaunch = m->module->getFunction("ISPCLaunch"); AssertPos(currentPos, flaunch != NULL); std::vector args; args.push_back(launchGroupHandlePtr); /* void **handler */ +#if 0 /* module name string to distinguish between different modules , generated ones */ { const std::string moduleNameStr = m->module->getModuleIdentifier(); @@ -3743,7 +3744,9 @@ FunctionEmitContext::LaunchInst(llvm::Value *callee, static llvm::Constant* const_ptr_12 = llvm::ConstantExpr::getGetElementPtr(gvarModuleNameStr, const_ptr_12_indices); args.push_back(const_ptr_12); /* const char * module_name */ } +#endif +#if 0 /* fucntion name string */ { const std::string funcNameStr = callee->getName().str(); @@ -3766,6 +3769,12 @@ FunctionEmitContext::LaunchInst(llvm::Value *callee, llvm::Constant* const_ptr_12 = llvm::ConstantExpr::getGetElementPtr(gvarFuncNameStr, const_ptr_12_indices); args.push_back(const_ptr_12); /* const char * func_name */ } +#else + { + llvm::Value *fptr = BitCastInst(callee, LLVMTypes::VoidPointerType); + args.push_back(fptr); + } +#endif /* pass array of pointers to function arguments, this is how cuLaunchKernel accepts arguments */ {