diff --git a/builtins.cpp b/builtins.cpp index cfef06fd..62ca0f15 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -630,7 +630,9 @@ lSetInternalFunctions(llvm::Module *module) { "__nctaid_y", "__nctaid_z", "__warpsize", - "__cvt_loc2gen" + "__cvt_loc2gen", + "__cvt_loc2gen_var", + "__cvt_const2gen" }; int count = sizeof(names) / sizeof(names[0]); diff --git a/builtins/target-nvptx.ll b/builtins/target-nvptx.ll index 24b2f0d1..ff73e09d 100644 --- a/builtins/target-nvptx.ll +++ b/builtins/target-nvptx.ll @@ -69,6 +69,11 @@ define i64* @__cvt_loc2gen(i64 addrspace(3)*) nounwind readnone alwaysinline %ptr = tail call i64* @llvm.nvvm.ptr.shared.to.gen.p0i64.p3i64(i64 addrspace(3)* %0) ret i64* %ptr } +define i64* @__cvt_loc2gen_var(i64 addrspace(3)*) nounwind readnone alwaysinline +{ + %ptr = tail call i64* @llvm.nvvm.ptr.shared.to.gen.p0i64.p3i64(i64 addrspace(3)* %0) + ret i64* %ptr +} define i64* @__cvt_const2gen(i64 addrspace(4)*) nounwind readnone alwaysinline { %ptr = tail call i64* @llvm.nvvm.ptr.shared.to.gen.p0i64.p4i64(i64 addrspace(4)* %0) diff --git a/builtins/util.m4 b/builtins/util.m4 index fb8c5c33..cd445fc4 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -4544,5 +4544,6 @@ declare i32 @__nctaid_y() nounwind readnone alwaysinline declare i32 @__nctaid_z() nounwind readnone alwaysinline declare i64* @__cvt_loc2gen(i64 addrspace(3)*) nounwind readnone alwaysinline declare i64* @__cvt_const2gen(i64 addrspace(4)*) nounwind readnone alwaysinline +declare i64* @__cvt_loc2gen_var(i64 addrspace(3)*) nounwind readnone alwaysinline ') diff --git a/opt.cpp b/opt.cpp index f7494873..11829923 100644 --- a/opt.cpp +++ b/opt.cpp @@ -128,6 +128,7 @@ static llvm::Pass *CreateDebugPass(char * output); static llvm::Pass *CreateReplaceStdlibShiftPass(); static llvm::Pass *CreateFixBooleanSelectPass(); +static llvm::Pass *CreatePromoteLocalToPrivatePass(); #define DEBUG_START_PASS(NAME) \ if (g->debugPrint && \ @@ -574,6 +575,9 @@ Optimize(llvm::Module *module, int optLevel) { optPM.add(llvm::createReassociatePass()); optPM.add(llvm::createIPConstantPropagationPass()); optPM.add(CreateReplaceStdlibShiftPass(),229); + if (g->target->getISA() == Target::NVPTX) + optPM.add(CreatePromoteLocalToPrivatePass()); +#if 1 optPM.add(llvm::createDeadArgEliminationPass(),230); optPM.add(llvm::createInstructionCombiningPass()); optPM.add(llvm::createCFGSimplificationPass()); @@ -685,6 +689,7 @@ Optimize(llvm::Module *module, int optLevel) { // Should be the last optPM.add(CreateFixBooleanSelectPass(), 400); +#endif } #endif @@ -5264,4 +5269,63 @@ CreateFixBooleanSelectPass() { return new FixBooleanSelectPass(); } +/////////////////////////////////////////////////////////////////////////////// +// Detect addrspace(3) +/////////////////////////////////////////////////////////////////////////////// + +class PromoteLocalToPrivatePass: public llvm::BasicBlockPass +{ + public: + static char ID; // Pass identification, replacement for typeid + PromoteLocalToPrivatePass() : BasicBlockPass(ID) {} + + bool runOnBasicBlock(llvm::BasicBlock &BB); +}; + +char PromoteLocalToPrivatePass::ID = 0; + +bool +PromoteLocalToPrivatePass::runOnBasicBlock(llvm::BasicBlock &BB) +{ + std::vector Allocas; + + bool modifiedAny = false; + + llvm::Function *cvtFunc = m->module->getFunction("__cvt_loc2gen_var"); + + // Find allocas that are safe to promote, by looking at all instructions in + // the entry node + for (llvm::BasicBlock::iterator I = BB.begin(), E = --BB.end(); I != E; ++I) + { + llvm::Instruction *inst = &*I; + if (llvm::CallInst *ci = llvm::dyn_cast(inst)) + { + llvm::Function *func = ci->getCalledFunction(); + if (cvtFunc && (cvtFunc == func)) + { +#if 0 + fprintf(stderr , "--found cvt-- name= %s \n", + I->getName().str().c_str()); +#endif + llvm::AllocaInst *alloca = new llvm::AllocaInst(LLVMTypes::Int64Type, "opt_loc2var", ci); + assert(alloca != NULL); +#if 0 + const int align = 8; // g->target->getNativeVectorAlignment(); + alloca->setAlignment(align); +#endif + ci->replaceAllUsesWith(alloca); + modifiedAny = true; + } + } + } + return modifiedAny; +} + +static llvm::Pass * +CreatePromoteLocalToPrivatePass() { + return new PromoteLocalToPrivatePass(); +} + + + diff --git a/stmt.cpp b/stmt.cpp index c9a36c07..3ebefc06 100644 --- a/stmt.cpp +++ b/stmt.cpp @@ -142,7 +142,7 @@ lHasUnsizedArrays(const Type *type) { return lHasUnsizedArrays(at->GetElementType()); } -static llvm::Value* lConvertToGenericPtr(FunctionEmitContext *ctx, llvm::Value *value, const SourcePos ¤tPos) +static llvm::Value* lConvertToGenericPtr(FunctionEmitContext *ctx, llvm::Value *value, const SourcePos ¤tPos, const bool variable = false) { if (!value->getType()->isPointerTy() || g->target->getISA() != Target::NVPTX) return value; @@ -159,10 +159,11 @@ static llvm::Value* lConvertToGenericPtr(FunctionEmitContext *ctx, llvm::Value * /* convert i64* addrspace(3) to i64* */ llvm::Function *__cvt2gen = m->module->getFunction( - addressSpace == 3 ? "__cvt_loc2gen" : "__cvt_const2gen"); + addressSpace == 3 ? (variable ? "__cvt_loc2gen_var" : "__cvt_loc2gen") : "__cvt_const2gen"); + std::vector __cvt2gen_args; __cvt2gen_args.push_back(value); - value = llvm::CallInst::Create(__cvt2gen, __cvt2gen_args, "gep2gen_cvt", ctx->GetCurrentBasicBlock()); + value = llvm::CallInst::Create(__cvt2gen, __cvt2gen_args, variable ? "gep2gen_cvt_var" : "gep2gen_cvt", ctx->GetCurrentBasicBlock()); /* compute offset */ if (addressSpace == 3) @@ -333,7 +334,7 @@ DeclStmt::EmitCode(FunctionEmitContext *ctx) const { * constant uniform are automatically promoted to varying */ !sym->type->IsConstType() && -#if 1 +#if 0 sym->type->IsArrayType() && #endif g->target->getISA() == Target::NVPTX) @@ -345,18 +346,19 @@ DeclStmt::EmitCode(FunctionEmitContext *ctx) const { /* with __shared__ memory everything must be an array */ int nel = 4; ArrayType *nat; + bool variable = true; if (sym->type->IsArrayType()) { const ArrayType *at = CastType(sym->type); - nel = at->GetElementCount(); /* we must scale # elements by 4, because a thread-block will run 4 warps * or 128 threads. * ***note-to-me***:please define these value (128threads/4warps) * in nvptx-target definition * instead of compile-time constants */ - nel *= 4; - nat = new ArrayType(at->GetElementType(), nel); + nel *= at->GetElementCount(); + nat = new ArrayType(at->GetElementType(), nel); + variable = false; } else nat = new ArrayType(sym->type, nel); @@ -375,7 +377,7 @@ DeclStmt::EmitCode(FunctionEmitContext *ctx) const { NULL, llvm::GlobalVariable::NotThreadLocal, /*AddressSpace=*/3); - sym->storagePtr = lConvertToGenericPtr(ctx, sym->storagePtr, sym->pos); + sym->storagePtr = lConvertToGenericPtr(ctx, sym->storagePtr, sym->pos, variable); llvm::PointerType *ptrTy = llvm::PointerType::get(sym->type->LLVMType(g->ctx),0); sym->storagePtr = ctx->BitCastInst(sym->storagePtr, ptrTy, "uniform_decl");