From f433aa3ad573560c573cf2b0a5165f4265af6e14 Mon Sep 17 00:00:00 2001
From: Evghenii
Date: Wed, 13 Nov 2013 10:43:52 +0100
Subject: [PATCH] CDP (CUDA Dynamic Parallelism) works now

---
 builtins/target-nvptx64.ll      | 82 +++++++++++++++++++------------
 builtins/util.m4                |  4 --
 ctx.cpp                         | 86 +++++++++++++++++++++++++++++++--
 examples_cuda/aobench/ao_cu.cpp |  2 +-
 4 files changed, 136 insertions(+), 38 deletions(-)

diff --git a/builtins/target-nvptx64.ll b/builtins/target-nvptx64.ll
index 7479a174..2f2f004a 100644
--- a/builtins/target-nvptx64.ll
+++ b/builtins/target-nvptx64.ll
@@ -100,6 +100,34 @@ define i32 @__lanemask_lt() nounwind readnone alwaysinline
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; tasking
+define i8* @ISPCAlloc(i8**, i64, i32) nounwind alwaysinline
+{
+  ret i8* null
+}
+
+;; this call allocates the parameter buffer for a kernel launch
+declare i64 @cudaGetParameterBuffer(i64, i64) nounwind
+define i8* @ISPCGetParamBuffer(i8**, i64 %align, i64 %size) nounwind alwaysinline
+{
+entry:
+  %call = tail call i32 @__tid_x()
+  %call1 = tail call i32 @__warpsize()
+  %sub = add nsw i32 %call1, -1
+  %and = and i32 %sub, %call
+  %cmp = icmp eq i32 %and, 0
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  %ptri64tmp = call i64 @cudaGetParameterBuffer(i64 %align, i64 %size);
+  br label %if.end
+
+if.end:
+  %ptri64 = phi i64 [ %ptri64tmp, %if.then ], [ 0, %entry ]
+  %ptr = inttoptr i64 %ptri64 to i8*
+  ret i8* %ptr
+}
+
+;; this actually launches a kernel
 module asm "
   .extern .func (.param .b32 func_retval0) cudaLaunchDevice
   (
@@ -111,22 +139,17 @@ module asm "
     .param .b64 cudaLaunchDevice_param_5
   );
 "
-define i8* @ISPCAlloc(i8**, i64, i32) nounwind alwaysinline
-{
-  ret i8* null
-}
-declare i64 @cudaGetParameterBuffer(i64, i64) nounwind
-define void @ISPCLaunch(i8**, i8* %func_ptr, i8** %func_args, i32 %nargs, i32 %ntx, i32 %nty, i32 %ntz) nounwind alwaysinline
+define void @ISPCLaunch(i8**, i8* %func_ptr, i8* %func_args, i32 %ntx, i32 %nty, i32 %ntz) nounwind alwaysinline
 {
 entry:
-  %func_i64 = ptrtoint i8* %func_ptr to i64
-  %args_i64 = ptrtoint i8** %func_args to i64
+;; only 1 lane must launch the kernel !!!
+  %func_i64 = ptrtoint i8* %func_ptr to i64
+  %args_i64 = ptrtoint i8* %func_args to i64
+
+  ;; nbx = (%ntx-1)/(blocksize/warpsize) + 1 for blocksize=128 & warpsize=32
   %ntxm1 = add nsw i32 %ntx, -1
   %ntxm1d4 = sdiv i32 %ntxm1, 4
   %nbx = add nsw i32 %ntxm1d4, 1
-
-;; only 1 lane must launch the kernel !!!
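+;; For reference, in CUDA C terms the device-side launch below corresponds roughly to
+;;   if ((threadIdx.x & (warpSize - 1)) == 0)   // only lane 0 of the warp launches
+;;     cudaLaunchDevice(func, args, dim3(nbx, nty, ntz), dim3(128, 1, 1), 0, 0);
+;; with nbx = (ntx-1)/4 + 1 as computed above; the actual call is made through the
+;; inline PTX asm in the if.then block, so this is only a sketch of the intent.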
   %call = tail call i32 @__tid_x()
   %call1 = tail call i32 @__warpsize()
   %sub = add nsw i32 %call1, -1
@@ -135,31 +158,25 @@ entry:
   %cmp = icmp eq i32 %and, 0
   br i1 %cmp, label %if.then, label %if.end
 
-if.then:                                          ; preds = %entry
+if.then:
-  %param = call i64 @cudaGetParameterBuffer(i64 8, i64 24);
-  %ptr = inttoptr i64 %param to i8*;
-
-
-  %res_tmp = call i32 asm sideeffect "{
-      .reg .s32 %r<8>;
-      .reg .s64 %rd<3>;
+  %res_tmp = call i32 asm sideeffect "{
       .param .b64 param0;
-      st.param.b64 [param0+0], $1; //%rd0;
+      st.param.b64 [param0+0], $1;
       .param .b64 param1;
-      st.param.b64 [param1+0], $2; //%rd1;
+      st.param.b64 [param1+0], $2;
       .param .align 4 .b8 param2[12];
-      st.param.b32 [param2+0], $3; //%r0;
-      st.param.b32 [param2+4], $4; //%r1;
-      st.param.b32 [param2+8], $5; //%r2;
+      st.param.b32 [param2+0], $3;
+      st.param.b32 [param2+4], $4;
+      st.param.b32 [param2+8], $5;
      .param .align 4 .b8 param3[12];
-      st.param.b32 [param3+0], $6; //%r3;
-      st.param.b32 [param3+4], $7; //%r4;
-      st.param.b32 [param3+8], $8; //%r5;
+      st.param.b32 [param3+0], $6;
+      st.param.b32 [param3+4], $7;
+      st.param.b32 [param3+8], $8;
       .param .b32 param4;
-      st.param.b32 [param4+0], $9; //%r6;
+      st.param.b32 [param4+0], $9;
       .param .b64 param5;
-      st.param.b64 [param5+0], $10; //%rd2;
+      st.param.b64 [param5+0], $10;
       .param .b32 retval0;
       call.uni (retval0),
@@ -175,14 +192,19 @@ if.then:                                          ; preds = %entry
       ld.param.b32 $0, [retval0+0];
   }
   ",
-"=r, l,l, r,r,r, r,r,r, r,l"(i64 %func_i64,i64 %args_i64, i32 %nbx,i32 %nty,i32 %ntz, i32 128,i32 1,i32 1, i32 0,i64 0);
+"=r, l,l, r,r,r, r,r,r, r,l"(
+      i64 %func_i64,i64 %args_i64,
+      i32 %nbx,i32 %nty,i32 %ntz,
+      i32 128,i32 1,i32 1, i32 0,i64 0);
   br label %if.end
 
 if.end:                                           ; preds = %if.then, %entry
-  %res = phi i32 [ %res_tmp, %if.then ], [ undef, %entry ]
+;;  %res = phi i32 [ %res_tmp, %if.then ], [ undef, %entry ]
   ret void
 }
 
+
+;; this synchronizes with the launched kernels
 declare i32 @cudaDeviceSynchronize() nounwind
 define void @ISPCSync(i8*) nounwind alwaysinline
 {
diff --git a/builtins/util.m4 b/builtins/util.m4
index acf45a38..77d18719 100644
--- a/builtins/util.m4
+++ b/builtins/util.m4
@@ -1835,10 +1835,6 @@ declare void @ISPCLaunch(i8**, i8*, i8*, i32,i32,i32) nounwind
 declare void @ISPCSync(i8*) nounwind
 declare void @ISPCInstrument(i8*, i8*, i32, i64) nounwind
 
-declare i8* @CUDAAlloc(i8**, i64, i32) nounwind
-declare void @CUDALaunch(i8**, i8*,i8*,i8*, i8**, i32, i32, i32) nounwind
-declare void @CUDASync(i8*) nounwind
-
 declare i1 @__is_compile_time_constant_mask( %mask)
 declare i1 @__is_compile_time_constant_uniform_int32(i32)
 declare i1 @__is_compile_time_constant_varying_int32()
diff --git a/ctx.cpp b/ctx.cpp
index 55adb95b..15b4743a 100644
--- a/ctx.cpp
+++ b/ctx.cpp
@@ -3593,9 +3593,87 @@ FunctionEmitContext::LaunchInst(llvm::Value *callee,
     args.push_back(launchCount[2]);
     return CallInst(flaunch, NULL, args, "");
   }
-  else /* isPTX == true */
+  else if(1) /* isPTX == true */
   {
-    //assert(0);  /* must only be called in export */
+    if (callee == NULL) {
+      AssertPos(currentPos, m->errorCount > 0);
+      return NULL;
+    }
+    launchedTasks = true;
+    AssertPos(currentPos, llvm::isa<llvm::Function>(callee));
+
+    std::vector<llvm::Type *> argTypes;
+    for (unsigned int i = 0; i < argVals.size(); i++)
+      argTypes.push_back(argVals[i]->getType());
+    llvm::Type *st = llvm::StructType::get(*g->ctx, argTypes);
+    llvm::StructType *argStructType = static_cast<llvm::StructType *>(st);
+    llvm::Value *structSize = g->target->SizeOf(argStructType, bblock);
+    if (structSize->getType() != LLVMTypes::Int64Type)
+      structSize = ZExtInst(structSize, LLVMTypes::Int64Type,
+                            "struct_size_to_64");
+#if 0
+    {
+      std::string str; llvm::raw_string_ostream rso(str); llvm::formatted_raw_ostream fos(rso);
+      structSize->print(fos);
+      fos.flush(); fprintf(stderr, ">>> %s\n", str.c_str());
+    }
+#endif
+    int align = 8;
+    llvm::Function *falloc = m->module->getFunction("ISPCGetParamBuffer");
+    AssertPos(currentPos, falloc != NULL);
+    std::vector<llvm::Value *> allocArgs;
+    allocArgs.push_back(launchGroupHandlePtr);
+    allocArgs.push_back(LLVMInt64(align));
+    allocArgs.push_back(structSize);
+    llvm::Value *voidmem = CallInst(falloc, NULL, allocArgs, "args_ptr");
+    llvm::Value *voidi64 = PtrToIntInst(voidmem, "args_i64");
+    llvm::BasicBlock* if_true  = CreateBasicBlock("if_true");
+    llvm::BasicBlock* if_false = CreateBasicBlock("if_false");
+//  llvm::BasicBlock* bblock_bak = bblock;
+
+    /* Check that the pointer returned by ISPCGetParamBuffer is not NULL.
+     * --------------
+     * This stands in for an explicit laneIdx check: ISPCGetParamBuffer returns a
+     * NULL pointer for every lane except laneIdx = 0. Note that if the underlying
+     * cudaGetParameterBuffer call fails, the pointer is NULL for laneIdx = 0 as
+     * well, so the arguments are simply not stored in that case.
+     * TODO: rewrite this code to make it less opaque.
+     */
+    llvm::Value* cmp1 = CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_NE, voidi64, LLVMInt64(0), "cmp1");
+    BranchInst(if_true, if_false, cmp1);
+
+    bblock = if_true;
+
+    // label_if_then block:
+    llvm::Type *pt = llvm::PointerType::getUnqual(st);
+    llvm::Value *argmem = BitCastInst(voidmem, pt);
+    for (unsigned int i = 0; i < argVals.size(); ++i)
+    {
+      llvm::Value *ptr = AddElementOffset(argmem, i, NULL, "funarg");
+      // don't need to do masked store here, I think
+      StoreInst(argVals[i], ptr);
+    }
+    BranchInst(if_false);
+
+    bblock = if_false;
+
+    llvm::Value *fptr = BitCastInst(callee, LLVMTypes::VoidPointerType);
+    llvm::Function *flaunch = m->module->getFunction("ISPCLaunch");
+    AssertPos(currentPos, flaunch != NULL);
+    std::vector<llvm::Value *> args;
+    args.push_back(launchGroupHandlePtr);
+    args.push_back(fptr);
+    args.push_back(voidmem);
+    args.push_back(launchCount[0]);
+    args.push_back(launchCount[1]);
+    args.push_back(launchCount[2]);
+    return CallInst(flaunch, NULL, args, "");
+
+
+  }
+  else
+  {
+    assert(0); /* must only be called in export */
 //  assert(g->target->getISA() != Target::NVPTX64);
 
     if (callee == NULL) {
@@ -3643,7 +3721,7 @@ FunctionEmitContext::LaunchInst(llvm::Value *callee,
     // the argument block
 
     /* allocate structure of pointer */
-    llvm::ArrayType* ArrayTy_6 = llvm::ArrayType::get(LLVMTypes::VoidPointerType, argVals.size());
+    llvm::ArrayType* ArrayTy_6 = llvm::ArrayType::get(LLVMTypes::VoidPointerType, argVals.size()*2);
     llvm::Value* ptrParam = AllocaInst(ArrayTy_6, "arrayStructPtr");
 
     /* constructed array of pointers to arguments
@@ -3786,6 +3864,8 @@ FunctionEmitContext::LaunchInst(llvm::Value *callee,
     args.push_back(ptr_arraydecay);  /* const void ** params */
   }
 
+  llvm::ConstantInt* const_int64_10 = llvm::ConstantInt::get(*g->ctx, llvm::APInt(32, argVals.size()*2));
+  args.push_back(const_int64_10);
   args.push_back(launchCount[0]);
   args.push_back(launchCount[1]);
   args.push_back(launchCount[2]);
diff --git a/examples_cuda/aobench/ao_cu.cpp b/examples_cuda/aobench/ao_cu.cpp
index 85eb4cb5..75599078 100755
--- a/examples_cuda/aobench/ao_cu.cpp
+++ b/examples_cuda/aobench/ao_cu.cpp
@@ -163,7 +163,7 @@ CUmodule loadModule(const char * module)
   CUjit_option options[nOptions];
   void* optionVals[nOptions];
   float walltime;
-  const unsigned int logSize = 8192;
+  const unsigned int logSize = 32768;
   char error_log[logSize], info_log[logSize];
   void *cuOut;
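
For reference, the NVPTX launch path added above behaves roughly like the CUDA C
sketch below. It is only a sketch: TaskArgs and ispc_launch_sketch are illustrative
names (the real argument struct is built per call site from argVals, and the calls
go through ISPCGetParamBuffer/ISPCLaunch in the generated IR), and it assumes the
CUDA device runtime (cudaGetParameterBuffer/cudaLaunchDevice, sm_35 or newer,
compiled with -rdc=true and linked against cudadevrt).

    #include <cuda_runtime.h>

    struct TaskArgs { int x0, x1; float data; };  // placeholder: one field per task argument

    __device__ void ispc_launch_sketch(void *func, const TaskArgs &src,
                                       int ntx, int nty, int ntz)
    {
        // ISPCGetParamBuffer: only lane 0 of the warp requests a parameter buffer;
        // all other lanes receive NULL.
        void *buf = NULL;
        if ((threadIdx.x & (warpSize - 1)) == 0)
            buf = cudaGetParameterBuffer(8, sizeof(TaskArgs));

        // LaunchInst(): store the task arguments into the buffer only if one was
        // obtained (the NULL check added in ctx.cpp).
        if (buf != NULL)
            *static_cast<TaskArgs *>(buf) = src;

        // ISPCLaunch: again only lane 0 launches; grid.x = (ntx - 1)/4 + 1 blocks of
        // 128 threads (4 warps per block), matching the inline PTX call to
        // cudaLaunchDevice in target-nvptx64.ll.
        if ((threadIdx.x & (warpSize - 1)) == 0)
            cudaLaunchDevice(func, buf,
                             dim3((ntx - 1) / 4 + 1, nty, ntz), dim3(128, 1, 1),
                             /*sharedMemSize=*/0, /*stream=*/0);
    }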