CDP works now
This commit is contained in:
@@ -100,6 +100,34 @@ define i32 @__lanemask_lt() nounwind readnone alwaysinline
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; tasking
|
||||
|
||||
define i8* @ISPCAlloc(i8**, i64, i32) nounwind alwaysinline
|
||||
{
|
||||
ret i8* null
|
||||
}
|
||||
|
||||
;; this call allocate parameter buffer for kernel launch
|
||||
declare i64 @cudaGetParameterBuffer(i64, i64) nounwind
|
||||
define i8* @ISPCGetParamBuffer(i8**, i64 %align, i64 %size) nounwind alwaysinline
|
||||
{
|
||||
entry:
|
||||
%call = tail call i32 @__tid_x()
|
||||
%call1 = tail call i32 @__warpsize()
|
||||
%sub = add nsw i32 %call1, -1
|
||||
%and = and i32 %sub, %call
|
||||
%cmp = icmp eq i32 %and, 0
|
||||
br i1 %cmp, label %if.then, label %if.end
|
||||
|
||||
if.then:
|
||||
%ptri64tmp = call i64 @cudaGetParameterBuffer(i64 %align, i64 %size);
|
||||
br label %if.end
|
||||
|
||||
if.end:
|
||||
%ptri64 = phi i64 [ %ptri64tmp, %if.then ], [ 0, %entry ]
|
||||
%ptr = inttoptr i64 %ptri64 to i8*
|
||||
ret i8* %ptr
|
||||
}
|
||||
|
||||
;; this actually launches kernel a kernel
|
||||
module asm "
|
||||
.extern .func (.param .b32 func_retval0) cudaLaunchDevice
|
||||
(
|
||||
@@ -111,22 +139,17 @@ module asm "
|
||||
.param .b64 cudaLaunchDevice_param_5
|
||||
);
|
||||
"
|
||||
define i8* @ISPCAlloc(i8**, i64, i32) nounwind alwaysinline
|
||||
{
|
||||
ret i8* null
|
||||
}
|
||||
declare i64 @cudaGetParameterBuffer(i64, i64) nounwind
|
||||
define void @ISPCLaunch(i8**, i8* %func_ptr, i8** %func_args, i32 %nargs, i32 %ntx, i32 %nty, i32 %ntz) nounwind alwaysinline
|
||||
define void @ISPCLaunch(i8**, i8* %func_ptr, i8* %func_args, i32 %ntx, i32 %nty, i32 %ntz) nounwind alwaysinline
|
||||
{
|
||||
entry:
|
||||
%func_i64 = ptrtoint i8* %func_ptr to i64
|
||||
%args_i64 = ptrtoint i8** %func_args to i64
|
||||
;; only 1 lane must launch the kernel !!!
|
||||
%func_i64 = ptrtoint i8* %func_ptr to i64
|
||||
%args_i64 = ptrtoint i8* %func_args to i64
|
||||
|
||||
;; nbx = (%ntx-1)/(blocksize/warpsize) + 1 for blocksize=128 & warpsize=32
|
||||
%ntxm1 = add nsw i32 %ntx, -1
|
||||
%ntxm1d4 = sdiv i32 %ntxm1, 4
|
||||
%nbx = add nsw i32 %ntxm1d4, 1
|
||||
|
||||
;; only 1 lane must launch the kernel !!!
|
||||
%call = tail call i32 @__tid_x()
|
||||
%call1 = tail call i32 @__warpsize()
|
||||
%sub = add nsw i32 %call1, -1
|
||||
@@ -135,31 +158,25 @@ entry:
|
||||
%cmp = icmp eq i32 %and, 0
|
||||
br i1 %cmp, label %if.then, label %if.end
|
||||
|
||||
if.then: ; preds = %entry
|
||||
if.then:
|
||||
|
||||
%param = call i64 @cudaGetParameterBuffer(i64 8, i64 24);
|
||||
%ptr = inttoptr i64 %param to i8*;
|
||||
|
||||
|
||||
%res_tmp = call i32 asm sideeffect "{
|
||||
.reg .s32 %r<8>;
|
||||
.reg .s64 %rd<3>;
|
||||
%res_tmp = call i32 asm sideeffect "{
|
||||
.param .b64 param0;
|
||||
st.param.b64 [param0+0], $1; //%rd0;
|
||||
st.param.b64 [param0+0], $1;
|
||||
.param .b64 param1;
|
||||
st.param.b64 [param1+0], $2; //%rd1;
|
||||
st.param.b64 [param1+0], $2;
|
||||
.param .align 4 .b8 param2[12];
|
||||
st.param.b32 [param2+0], $3; //%r0;
|
||||
st.param.b32 [param2+4], $4; //%r1;
|
||||
st.param.b32 [param2+8], $5; //%r2;
|
||||
st.param.b32 [param2+0], $3;
|
||||
st.param.b32 [param2+4], $4;
|
||||
st.param.b32 [param2+8], $5;
|
||||
.param .align 4 .b8 param3[12];
|
||||
st.param.b32 [param3+0], $6; //%r3;
|
||||
st.param.b32 [param3+4], $7; //%r4;
|
||||
st.param.b32 [param3+8], $8; //%r5;
|
||||
st.param.b32 [param3+0], $6;
|
||||
st.param.b32 [param3+4], $7;
|
||||
st.param.b32 [param3+8], $8;
|
||||
.param .b32 param4;
|
||||
st.param.b32 [param4+0], $9; //%r6;
|
||||
st.param.b32 [param4+0], $9;
|
||||
.param .b64 param5;
|
||||
st.param.b64 [param5+0], $10; //%rd2;
|
||||
st.param.b64 [param5+0], $10;
|
||||
|
||||
.param .b32 retval0;
|
||||
call.uni (retval0),
|
||||
@@ -175,14 +192,19 @@ if.then: ; preds = %entry
|
||||
ld.param.b32 $0, [retval0+0];
|
||||
}
|
||||
",
|
||||
"=r, l,l, r,r,r, r,r,r, r,l"(i64 %func_i64,i64 %args_i64, i32 %nbx,i32 %nty,i32 %ntz, i32 128,i32 1,i32 1, i32 0,i64 0);
|
||||
"=r, l,l, r,r,r, r,r,r, r,l"(
|
||||
i64 %func_i64,i64 %args_i64,
|
||||
i32 %nbx,i32 %nty,i32 %ntz,
|
||||
i32 128,i32 1,i32 1, i32 0,i64 0);
|
||||
br label %if.end
|
||||
|
||||
if.end: ; preds = %if.then, %entry
|
||||
%res = phi i32 [ %res_tmp, %if.then ], [ undef, %entry ]
|
||||
;; %res = phi i32 [ %res_tmp, %if.then ], [ undef, %entry ]
|
||||
|
||||
ret void
|
||||
}
|
||||
|
||||
;; this synchronizes kernel
|
||||
declare i32 @cudaDeviceSynchronize() nounwind
|
||||
define void @ISPCSync(i8*) nounwind alwaysinline
|
||||
{
|
||||
|
||||
@@ -1835,10 +1835,6 @@ declare void @ISPCLaunch(i8**, i8*, i8*, i32,i32,i32) nounwind
|
||||
declare void @ISPCSync(i8*) nounwind
|
||||
declare void @ISPCInstrument(i8*, i8*, i32, i64) nounwind
|
||||
|
||||
declare i8* @CUDAAlloc(i8**, i64, i32) nounwind
|
||||
declare void @CUDALaunch(i8**, i8*,i8*,i8*, i8**, i32, i32, i32) nounwind
|
||||
declare void @CUDASync(i8*) nounwind
|
||||
|
||||
declare i1 @__is_compile_time_constant_mask(<WIDTH x MASK> %mask)
|
||||
declare i1 @__is_compile_time_constant_uniform_int32(i32)
|
||||
declare i1 @__is_compile_time_constant_varying_int32(<WIDTH x i32>)
|
||||
|
||||
86
ctx.cpp
86
ctx.cpp
@@ -3593,9 +3593,87 @@ FunctionEmitContext::LaunchInst(llvm::Value *callee,
|
||||
args.push_back(launchCount[2]);
|
||||
return CallInst(flaunch, NULL, args, "");
|
||||
}
|
||||
else /* isPTX == true */
|
||||
else if(1) /* isPTX == true */
|
||||
{
|
||||
//assert(0); /* must only be called in export */
|
||||
if (callee == NULL) {
|
||||
AssertPos(currentPos, m->errorCount > 0);
|
||||
return NULL;
|
||||
}
|
||||
launchedTasks = true;
|
||||
AssertPos(currentPos, llvm::isa<llvm::Function>(callee));
|
||||
|
||||
std::vector<llvm::Type*> argTypes;
|
||||
for (unsigned int i = 0; i < argVals.size(); i++)
|
||||
argTypes.push_back(argVals[i]->getType());
|
||||
llvm::Type *st = llvm::StructType::get(*g->ctx, argTypes);
|
||||
llvm::StructType *argStructType = static_cast<llvm::StructType *>(st);
|
||||
llvm::Value *structSize = g->target->SizeOf(argStructType, bblock);
|
||||
if (structSize->getType() != LLVMTypes::Int64Type)
|
||||
structSize = ZExtInst(structSize, LLVMTypes::Int64Type,
|
||||
"struct_size_to_64");
|
||||
#if 0
|
||||
{
|
||||
std::string str; llvm::raw_string_ostream rso(str); llvm::formatted_raw_ostream fos(rso);
|
||||
structSize->print(fos);
|
||||
fos.flush(); fprintf(stderr, ">>> %s\n", str.c_str());
|
||||
}
|
||||
#endif
|
||||
int align = 8;
|
||||
llvm::Function *falloc = m->module->getFunction("ISPCGetParamBuffer");
|
||||
AssertPos(currentPos, falloc != NULL);
|
||||
std::vector<llvm::Value *> allocArgs;
|
||||
allocArgs.push_back(launchGroupHandlePtr);
|
||||
allocArgs.push_back(LLVMInt64(align));
|
||||
allocArgs.push_back(structSize);
|
||||
llvm::Value *voidmem = CallInst(falloc, NULL, allocArgs, "args_ptr");
|
||||
llvm::Value *voidi64 = PtrToIntInst(voidmem, "args_i64");
|
||||
llvm::BasicBlock* if_true = CreateBasicBlock("if_true");
|
||||
llvm::BasicBlock* if_false = CreateBasicBlock("if_false");
|
||||
// llvm::BasicBlock* bblock_bak = bblock;
|
||||
|
||||
/* check if the pointer returned by ISPCGetParamBuffer is not NULL
|
||||
* --------------
|
||||
* this is a workaround for not checking which laneIdx we are in,
|
||||
* because ISPCGetParamBuffer will return NULL pointer for all laneIdx, except when laneIdx = 0
|
||||
* of course, if ISPCGetParamBuffer fails to get parameter buffer, the pointer for laneIdx = 0
|
||||
* will also be zero.
|
||||
* This check must be added, and also rewrite the code to make it less opaque
|
||||
*/
|
||||
llvm::Value* cmp1 = CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_NE, voidi64, LLVMInt64(0), "cmp1");
|
||||
BranchInst(if_true, if_false, cmp1);
|
||||
|
||||
bblock = if_true;
|
||||
|
||||
// label_if_then block:
|
||||
llvm::Type *pt = llvm::PointerType::getUnqual(st);
|
||||
llvm::Value *argmem = BitCastInst(voidmem, pt);
|
||||
for (unsigned int i = 0; i < argVals.size(); ++i)
|
||||
{
|
||||
llvm::Value *ptr = AddElementOffset(argmem, i, NULL, "funarg");
|
||||
// don't need to do masked store here, I think
|
||||
StoreInst(argVals[i], ptr);
|
||||
}
|
||||
BranchInst(if_false);
|
||||
|
||||
bblock = if_false;
|
||||
|
||||
llvm::Value *fptr = BitCastInst(callee, LLVMTypes::VoidPointerType);
|
||||
llvm::Function *flaunch = m->module->getFunction("ISPCLaunch");
|
||||
AssertPos(currentPos, flaunch != NULL);
|
||||
std::vector<llvm::Value *> args;
|
||||
args.push_back(launchGroupHandlePtr);
|
||||
args.push_back(fptr);
|
||||
args.push_back(voidmem);
|
||||
args.push_back(launchCount[0]);
|
||||
args.push_back(launchCount[1]);
|
||||
args.push_back(launchCount[2]);
|
||||
return CallInst(flaunch, NULL, args, "");
|
||||
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
assert(0); /* must only be called in export */
|
||||
// assert(g->target->getISA() != Target::NVPTX64);
|
||||
|
||||
if (callee == NULL) {
|
||||
@@ -3643,7 +3721,7 @@ FunctionEmitContext::LaunchInst(llvm::Value *callee,
|
||||
// the argument block
|
||||
|
||||
/* allocate structure of pointer */
|
||||
llvm::ArrayType* ArrayTy_6 = llvm::ArrayType::get(LLVMTypes::VoidPointerType, argVals.size());
|
||||
llvm::ArrayType* ArrayTy_6 = llvm::ArrayType::get(LLVMTypes::VoidPointerType, argVals.size()*2);
|
||||
llvm::Value* ptrParam = AllocaInst(ArrayTy_6, "arrayStructPtr");
|
||||
|
||||
/* constructed array of pointers to arguments
|
||||
@@ -3786,6 +3864,8 @@ FunctionEmitContext::LaunchInst(llvm::Value *callee,
|
||||
args.push_back(ptr_arraydecay); /* const void ** params */
|
||||
}
|
||||
|
||||
llvm::ConstantInt* const_int64_10 = llvm::ConstantInt::get(*g->ctx, llvm::APInt(32, argVals.size()*2));
|
||||
args.push_back(const_int64_10);
|
||||
args.push_back(launchCount[0]);
|
||||
args.push_back(launchCount[1]);
|
||||
args.push_back(launchCount[2]);
|
||||
|
||||
@@ -163,7 +163,7 @@ CUmodule loadModule(const char * module)
|
||||
CUjit_option options[nOptions];
|
||||
void* optionVals[nOptions];
|
||||
float walltime;
|
||||
const unsigned int logSize = 8192;
|
||||
const unsigned int logSize = 32768;
|
||||
char error_log[logSize],
|
||||
info_log[logSize];
|
||||
void *cuOut;
|
||||
|
||||
Reference in New Issue
Block a user