diff --git a/builtins.cpp b/builtins.cpp index fc21a7c3..397af3c3 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -654,14 +654,17 @@ lSetInternalFunctions(llvm::Module *module) { "__vec4_add_int32", "__vselect_float", "__vselect_i32", - "__tid_x", - "__ctaid_x", - "__ctaid_y", - "__ctaid_z", - "__nctaid_x", - "__nctaid_y", - "__nctaid_z", - "__warpsize", + "__program_index", + "__program_count", + "__warp_index", + "__task_index0", + "__task_index1", + "__task_index2", + "__task_index", + "__task_count0", + "__task_count1", + "__task_count2", + "__task_count", "__cvt_loc2gen", "__cvt_loc2gen_var", "__cvt_const2gen" diff --git a/builtins/target-nvptx.ll b/builtins/target-nvptx.ll index 3bc2852e..2a61c013 100644 --- a/builtins/target-nvptx.ll +++ b/builtins/target-nvptx.ll @@ -17,57 +17,91 @@ declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.y() nounwind readnone declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.z() nounwind readnone declare i32 @llvm.nvvm.read.ptx.sreg.warpsize() nounwind readnone -define i32 @__tid_x() nounwind readnone alwaysinline +;;;;;;;;;; + +define i32 @__program_index() nounwind readnone alwaysinline { %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() - ret i32 %tid + %program_index = and i32 %tid, 31 + ret i32 %program_index } -define i32 @__warpsize() nounwind readnone alwaysinline +define i32 @__program_count() nounwind readnone alwaysinline { ;; %tid = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() ;; ret i32 %tid ret i32 32 } -define i32 @__laneidx() nounwind readnone alwaysinline +define i32 @__warp_index() nounwind readnone alwaysinline { - %tid = tail call i32 @__tid_x() - %lane = and i32 %tid, 31 - ret i32 %lane; + %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + %warp_index = lshr i32 %tid, 5 + ret i32 %warp_index } +;;;;;;;;;;;; -define i32 @__ctaid_x() nounwind readnone alwaysinline +define i32 @__task_index0() nounwind readnone alwaysinline { - %bid = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() - ret i32 %bid + %bid = call i32 
@llvm.nvvm.read.ptx.sreg.ctaid.x() + %bid4 = shl i32 %bid, 2 + %warp_index = call i32 @__warp_index() + %task_index0 = add i32 %bid4, %warp_index + ret i32 %task_index0 } -define i32 @__ctaid_y() nounwind readnone alwaysinline +define i32 @__task_index1() nounwind readnone alwaysinline { - %bid = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() - ret i32 %bid + %task_index1 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() + ret i32 %task_index1 } -define i32 @__ctaid_z() nounwind readnone alwaysinline +define i32 @__task_index2() nounwind readnone alwaysinline { - %bid = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z() - ret i32 %bid + %task_index2 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z() + ret i32 %task_index2 +} +define i32 @__task_index() nounwind readnone alwaysinline +{ + %ti0 = call i32 @__task_index0() + %ti1 = call i32 @__task_index1() + %ti2 = call i32 @__task_index2() + %tc0 = call i32 @__task_count0() + %tc1 = call i32 @__task_count1() + %mul1 = mul i32 %tc1, %ti2 + %add1 = add i32 %mul1, %ti1 + %mul2 = mul i32 %add1, %tc0 + %task_index = add i32 %mul2, %ti0 + ret i32 %task_index } -define i32 @__nctaid_x() nounwind readnone alwaysinline +;;;;; + +define i32 @__task_count0() nounwind readnone alwaysinline { %nb = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x() - ret i32 %nb + %task_count0 = shl i32 %nb, 2 + ret i32 %task_count0 } -define i32 @__nctaid_y() nounwind readnone alwaysinline +define i32 @__task_count1() nounwind readnone alwaysinline { - %nb = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.y() - ret i32 %nb + %task_count1 = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.y() + ret i32 %task_count1 } -define i32 @__nctaid_z() nounwind readnone alwaysinline +define i32 @__task_count2() nounwind readnone alwaysinline { - %nb = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.z() - ret i32 %nb + %task_count2 = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.z() + ret i32 %task_count2 } +define i32 @__task_count() nounwind readnone alwaysinline +{ + %tc0 = call i32 @__task_count0() + 
%tc1 = call i32 @__task_count1() + %tc2 = call i32 @__task_count2() + %mul1 = mul i32 %tc1, %tc2 + %task_count = mul i32 %mul1, %tc0 + ret i32 %task_count +} + ;;;;;;;; + declare i64* @llvm.nvvm.ptr.shared.to.gen.p0i64.p3i64(i64 addrspace(3)*) declare i64* @llvm.nvvm.ptr.shared.to.gen.p0i64.p4i64(i64 addrspace(4)*) define i64* @__cvt_loc2gen(i64 addrspace(3)*) nounwind readnone alwaysinline @@ -228,10 +262,7 @@ declare i64 @cudaGetParameterBuffer(i64, i64) nounwind define i8* @ISPCAlloc(i8**, i64 %size, i32 %align32) nounwind alwaysinline { entry: - %call = tail call i32 @__tid_x() - %call1 = tail call i32 @__warpsize() - %sub = add nsw i32 %call1, -1 - %and = and i32 %sub, %call + %and = call i32 @__program_index() %cmp = icmp eq i32 %and, 0 %align = zext i32 %align32 to i64 br i1 %cmp, label %if.then, label %if.end @@ -270,10 +301,7 @@ entry: ;; %ntxm1d4 = sdiv i32 %ntxm1, 4 %ntxm1d4 = ashr i32 %ntxm1, 2 %nbx = add nsw i32 %ntxm1d4, 1 - %call = tail call i32 @__tid_x() - %call1 = tail call i32 @__warpsize() - %sub = add nsw i32 %call1, -1 - %and = and i32 %sub, %call + %and = call i32 @__program_index() ;; if (laneIdx == 0) %cmp = icmp eq i32 %and, 0 br i1 %cmp, label %if.then, label %if.end @@ -1191,8 +1219,7 @@ define(`shift',` define <1 x $1> @__shift_$1(<1 x $1>, i32) nounwind readnone alwaysinline { %val = extractelement <1 x $1> %0, i32 0 - %tid = tail call i32 @__tid_x() - %lane = and i32 %tid, 31 + %lane = call i32 @__program_index() %src = add i32 %lane, %1 %ret = tail call $1 @__shfl_$1_nvptx($1 %val, i32 %src) %c1 = icmp sge i32 %src, 0 @@ -1214,7 +1241,7 @@ define(`rotate', ` define <1 x $1> @__rotate_$1(<1 x $1>, i32) nounwind readnone alwaysinline { %val = extractelement <1 x $1> %0, i32 0 - %tid = tail call i32 @__tid_x() + %tid = call i32 @__program_index() %src = add i32 %tid, %1 %lane = and i32 %src, 31 %rets = tail call $1 @__shfl_$1_nvptx($1 %val, i32 %lane) @@ -1569,7 +1596,7 @@ define $1 @__extract_$2(<1 x $1>, i32) nounwind readnone 
alwaysinline { define <1 x $1> @__insert_$2(<1 x $1>, i32, $1) nounwind readnone alwaysinline { %orig = extractelement <1 x $1> %0, i32 0 - %lane = call i32 @__laneidx() + %lane = call i32 @__program_index() %c = icmp eq i32 %lane, %1 %val = select i1 %c, $1 %2, $1 %orig %insert = insertelement <1 x $1> %0, $1 %val, i32 0 @@ -1620,7 +1647,7 @@ define void @__do_assert_uniform(i8 *%str, i1 %test, %mask) { br i1 %test, label %ok, label %fail fail: - %lane = call i32 @__laneidx() + %lane = call i32 @__program_index() %cmp = icmp eq i32 %lane, 0 br i1 %cmp, label %fail_print, label %fail_void; @@ -1966,7 +1993,7 @@ define $3 @__atomic_$2_uniform_$4_global($3 * %ptr, $3 %val) nounwind alwaysinli entry: %addr = ptrtoint $3 * %ptr to i64 %active = call i32 @__get_first_active_lane(); - %lane = call i32 @__laneidx(); + %lane = call i32 @__program_index(); %c = icmp eq i32 %lane, %active br i1 %c, label %p1, label %p2 @@ -2144,7 +2171,7 @@ define $3 @__atomic_$2_uniform_$4_global($3 * %ptr, $3 %cmp, $3 %val) nounwind a entry: %addr = ptrtoint $3 * %ptr to i64 %active = call i32 @__get_first_active_lane(); - %lane = call i32 @__laneidx(); + %lane = call i32 @__program_index(); %c = icmp eq i32 %lane, %active br i1 %c, label %p1, label %p2 diff --git a/builtins/util-nvptx.m4 b/builtins/util-nvptx.m4 index 891e2760..19fcf68c 100644 --- a/builtins/util-nvptx.m4 +++ b/builtins/util-nvptx.m4 @@ -2032,10 +2032,7 @@ define noalias i8 * @__new_uniform_64rt(i64 %size) { entry: ;; compute laneIdx = __tid_x() & (__warpsize() - 1) - %call = tail call i32 @__tid_x() - %call1 = tail call i32 @__warpsize() - %sub = add nsw i32 %call1, -1 - %and = and i32 %sub, %call + %and = call i32 @__program_index() ;; if (laneIdx == 0) %cmp = icmp eq i32 %and, 0 br i1 %cmp, label %if.then, label %if.end @@ -2062,10 +2059,7 @@ if.end: ; preds = %if.then, %entry define void @__delete_uniform_64rt(i8 * %ptr) { entry: - %call = tail call i32 @__tid_x() - %call1 = tail call i32 @__warpsize() - %sub = add 
nsw i32 %call1, -1 - %and = and i32 %sub, %call + %and = call i32 @__program_index() %cmp = icmp eq i32 %and, 0 br i1 %cmp, label %if.then, label %if.end diff --git a/builtins/util.m4 b/builtins/util.m4 index f6bbe768..87bd2c2c 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -4534,14 +4534,17 @@ define_down_avgs() define(`declare_nvptx', ` -declare i32 @__tid_x() nounwind readnone alwaysinline -declare i32 @__warpsize() nounwind readnone alwaysinline -declare i32 @__ctaid_x() nounwind readnone alwaysinline -declare i32 @__ctaid_y() nounwind readnone alwaysinline -declare i32 @__ctaid_z() nounwind readnone alwaysinline -declare i32 @__nctaid_x() nounwind readnone alwaysinline -declare i32 @__nctaid_y() nounwind readnone alwaysinline -declare i32 @__nctaid_z() nounwind readnone alwaysinline +declare i32 @__program_index() nounwind readnone alwaysinline +declare i32 @__program_count() nounwind readnone alwaysinline +declare i32 @__warp_index() nounwind readnone alwaysinline +declare i32 @__task_index0() nounwind readnone alwaysinline +declare i32 @__task_index1() nounwind readnone alwaysinline +declare i32 @__task_index2() nounwind readnone alwaysinline +declare i32 @__task_index() nounwind readnone alwaysinline +declare i32 @__task_count0() nounwind readnone alwaysinline +declare i32 @__task_count1() nounwind readnone alwaysinline +declare i32 @__task_count2() nounwind readnone alwaysinline +declare i32 @__task_count() nounwind readnone alwaysinline declare i64* @__cvt_loc2gen(i64 addrspace(3)*) nounwind readnone alwaysinline declare i64* @__cvt_const2gen(i64 addrspace(4)*) nounwind readnone alwaysinline declare i64* @__cvt_loc2gen_var(i64 addrspace(3)*) nounwind readnone alwaysinline diff --git a/ctx.cpp b/ctx.cpp index 39f56885..38bea7c4 100644 --- a/ctx.cpp +++ b/ctx.cpp @@ -1373,21 +1373,26 @@ FunctionEmitContext::None(llvm::Value *mask) { llvm::Value * -FunctionEmitContext::LaneMask(llvm::Value *v) { - const char *__movmsk = g->target->getISA() == 
Target::NVPTX ? "__movmsk_ptx" : "__movmsk"; - // Call the target-dependent movmsk function to turn the vector mask - // into an i64 value - std::vector mm; - m->symbolTable->LookupFunction(__movmsk, &mm); - if (g->target->getMaskBitCount() == 1) - AssertPos(currentPos, mm.size() == 1); - else - // There should be one with signed int signature, one unsigned int. - AssertPos(currentPos, mm.size() == 2); - // We can actually call either one, since both are i32s as far as - // LLVM's type system is concerned... - llvm::Function *fmm = mm[0]->function; - return CallInst(fmm, NULL, v, LLVMGetName(v, "_movmsk")); +FunctionEmitContext::LaneMask(llvm::Value *v) +{ +#if 1 /* this makes mandelbrot example slower, why ?!? */ + const char *__movmsk = g->target->getISA() == Target::NVPTX ? "__movmsk_ptx" : "__movmsk"; +#else + const char *__movmsk = "__movmsk"; +#endif + // Call the target-dependent movmsk function to turn the vector mask + // into an i64 value + std::vector mm; + m->symbolTable->LookupFunction(__movmsk, &mm); + if (g->target->getMaskBitCount() == 1) + AssertPos(currentPos, mm.size() == 1); + else + // There should be one with signed int signature, one unsigned int. + AssertPos(currentPos, mm.size() == 2); + // We can actually call either one, since both are i32s as far as + // LLVM's type system is concerned... 
+ llvm::Function *fmm = mm[0]->function; + return CallInst(fmm, NULL, v, LLVMGetName(v, "_movmsk")); } bool lAppendInsertExtractName(llvm::Value *vector, std::string &funcName) @@ -1476,13 +1481,9 @@ FunctionEmitContext::ProgramIndexVector(bool is32bits) { } llvm::Value * FunctionEmitContext::ProgramIndexVectorPTX(bool is32bits) { - llvm::Function *func_tid_x = m->module->getFunction("__tid_x"); - llvm::Function *func_warpsz = m->module->getFunction("__warpsize"); - llvm::Value *__tid_x = CallInst(func_tid_x, NULL, std::vector<llvm::Value *>(), "laneIdxForEach"); - llvm::Value *__warpsz = CallInst(func_warpsz, NULL, std::vector<llvm::Value *>(), "warpSZForEach"); - llvm::Value *__warpszm1 = BinaryOperator(llvm::Instruction::Add, __warpsz, LLVMInt32(-1), "__warpszm1"); - llvm::Value *laneIdx = BinaryOperator(llvm::Instruction::And, __tid_x, __warpszm1, "__laneidx"); - llvm::Value *index = InsertInst(llvm::UndefValue::get(LLVMTypes::Int32VectorType), laneIdx, 0, "__laneIdxV"); + llvm::Function *func_program_index = m->module->getFunction("__program_index"); + llvm::Value *__program_index = CallInst(func_program_index, NULL, std::vector<llvm::Value *>(), "foreach__program_indexS"); + llvm::Value *index = InsertInst(llvm::UndefValue::get(LLVMTypes::Int32VectorType), __program_index, 0, "foreach__program_indexV"); #if 0 if (!is32bits) index = ZExtInst(index, LLVMTypes::Int64VectandType); @@ -1887,146 +1888,6 @@ FunctionEmitContext::BitCastInst(llvm::Value *value, llvm::Type *type, return inst; } -/* NVPTX: - * this is a helper function which adds a warp offset to a base pointer - * pointer must either be in local memory addrspace(3) - * or the one just converted from addrspace(3) to addrspace(0) in lConvertToGenericPtr - */ -static llvm::Value* lAddWarpOffset(FunctionEmitContext *ctx, llvm::Value *value) -{ - llvm::Function *func_tid_x = m->module->getFunction("__tid_x"); - llvm::Function *func_warpsz = m->module->getFunction("__warpsize"); - llvm::Value *__tid_x = ctx->CallInst(func_tid_x, NULL, std::vector<llvm::Value *>(), 
"tidCorrectLocalPtr"); - llvm::Value *__warpsz = ctx->CallInst(func_warpsz, NULL, std::vector<llvm::Value *>(), "warpSzCorrectLocaLPtr"); - llvm::Value *_mwarpsz = ctx->BinaryOperator(llvm::Instruction::Sub, LLVMInt32(0), __warpsz, "mwarpSzCorrectLocalPtr"); - llvm::Value *__offset = ctx->BinaryOperator(llvm::Instruction::And, __tid_x, _mwarpsz, "offsetCorrectLocalPtr"); - return llvm::GetElementPtrInst::Create(value, __offset, "warpOffset_gep", ctx->GetCurrentBasicBlock()); -} - -static llvm::Value* lConvertGepToGenericPtr(FunctionEmitContext *ctx, llvm::Value *value, const SourcePos &currentPos) -{ - if (!value->getType()->isPointerTy() || g->target->getISA() != Target::NVPTX) - return value; - llvm::PointerType *pt = llvm::dyn_cast<llvm::PointerType>(value->getType()); - const int addressSpace = pt->getAddressSpace(); - if (addressSpace != 3 && addressSpace != 4) - return value; - assert(0); - - llvm::Type *elTy = pt->getElementType(); - assert(elTy->isArrayTy()); - const int numElTot = elTy->getArrayNumElements(); - const int numEl = numElTot/4; -#if 0 - fprintf(stderr, " --- detected addrspace(3) sz= %d --- \n", numEl); -#endif - llvm::ArrayType *arrTy = llvm::dyn_cast<llvm::ArrayType>(pt->getArrayElementType()); - assert(arrTy != NULL); - llvm::Type *arrElTy = arrTy->getElementType(); -#if 0 - if (arrElTy->isArrayTy()) - Error(currentPos, "Currently \"nvptx\" target doesn't support array-of-array"); -#endif - - /* convert elTy addrspace(3)* to i64* addrspace(3)* */ - llvm::PointerType *Int64Ptr3 = llvm::PointerType::get(LLVMTypes::Int64Type, addressSpace); - value = ctx->BitCastInst(value, Int64Ptr3, "gep2gen_cast1"); - - /* convert i64* addrspace(3) to i64* */ - llvm::Function *__cvt2gen = m->module->getFunction( - addressSpace == 3 ? 
"__cvt_loc2gen" : "__cvt_const2gen"); - std::vector<llvm::Value *> __cvt2gen_args; - __cvt2gen_args.push_back(value); - value = llvm::CallInst::Create(__cvt2gen, __cvt2gen_args, "gep2gen_cvt", ctx->GetCurrentBasicBlock()); - - /* convert i64* to errElTy* */ - llvm::PointerType *arrElTyPt0 = llvm::PointerType::get(arrElTy, 0); - value = ctx->BitCastInst(value, arrElTyPt0, "gep2gen_cast2"); - - /* compute offset */ - if (addressSpace == 3) - { - llvm::Function *funcTid = m->module->getFunction("__tid_x"); - llvm::Function *funcWarpSz = m->module->getFunction("__warpsize"); - llvm::Value *tid = ctx->CallInst(funcTid, NULL, std::vector<llvm::Value *>(), "gep2gen_tid"); - llvm::Value *warpSz = ctx->CallInst(funcWarpSz, NULL, std::vector<llvm::Value *>(), "gep2gen_warpSz"); - llvm::Value *warpId = ctx->BinaryOperator(llvm::Instruction::SDiv, tid, warpSz, "gep2gen_warpId"); - llvm::Value *offset = ctx->BinaryOperator(llvm::Instruction::Mul, warpId, LLVMInt32(numEl), "gep2gen_offset"); - value = llvm::GetElementPtrInst::Create(value, offset, "gep2gen_offset", ctx->GetCurrentBasicBlock()); - } - - /* convert arrElTy* to elTy* */ - llvm::PointerType *elTyPt0 = llvm::PointerType::get(elTy, 0); - value = ctx->BitCastInst(value, elTyPt0, "gep2gen_cast3"); - - return value; -} - -/* NVPTX: - * this function compute correct address in local memory for load/store operations - */ -static llvm::Value* lCorrectLocalPtr(FunctionEmitContext *ctx, llvm::Value* value) -{ - // return value; - assert(value->getType()->isPointerTy()); - llvm::PointerType *pt = llvm::dyn_cast<llvm::PointerType>(value->getType()); - if (g->target->getISA() != Target::NVPTX || pt->getAddressSpace() != 3) return value; - - assert(0); /* we should never enter here */ - - return lAddWarpOffset(ctx, value); -} - -/* NVPTX: - * this function converts a pointer in addrspace(3 or 4) to addrspace(0) - */ -static llvm::Value* lConvertToGenericPtr(FunctionEmitContext *ctx, llvm::Value *value, const SourcePos &currentPos) -{ -// return value; - if (!value->getType()->isPointerTy() || 
g->target->getISA() != Target::NVPTX) return value; - llvm::PointerType *pt = llvm::dyn_cast<llvm::PointerType>(value->getType()); - - /* make sure addrspace corresponds to either local or constant memories */ - const int addressSpace = pt->getAddressSpace(); - if (addressSpace != 3 && addressSpace != 4) return value; - - assert(0); /* we should never enter here */ - - /* if array, extracts element type */ - llvm::Type *type = pt->getElementType(); - llvm::Type *typeEl = type; - if (type->isArrayTy()) - { - typeEl = type->getArrayElementType(); - if (typeEl->isArrayTy()) - Error(currentPos, "Currently \"nvptx\" target doesn't support array-of-array"); - } - - /* convert elTy addrspace(3)* to i64* addrspace(3)* */ - llvm::PointerType *Int64Ptr3 = llvm::PointerType::get(LLVMTypes::Int64Type, addressSpace); - value = ctx->BitCastInst(value, Int64Ptr3, "cvt2gen_i64ptr"); - - /* convert i64* addrspace(3) to i64* */ - llvm::Function *__cvt2gen = m->module->getFunction( - addressSpace == 3 ? "__cvt_loc2gen" : "__cvt_const2gen"); - std::vector<llvm::Value *> __cvt2gen_args; - __cvt2gen_args.push_back(value); -#if 0 - value = ctx->CallInst(__cvt2gen, NULL, __cvt2gen_args, "cvt2gen_call"); -#else - value = llvm::CallInst::Create(__cvt2gen, __cvt2gen_args, "cvt2gen_call", ctx->GetCurrentBasicBlock()); -#endif - - /* convert i64* to elTy* */ - llvm::PointerType *typeElPtr = llvm::PointerType::get(typeEl, 0); - value = ctx->BitCastInst(value, typeElPtr, "cvtLoc2Gen_i642ptr"); - - /* add warp offset to the pointer for local memory */ - if (addressSpace == 3) - value = lAddWarpOffset(ctx, value); - - return value; -} llvm::Value * FunctionEmitContext::PtrToIntInst(llvm::Value *value, const char *name) { @@ -2042,7 +1903,6 @@ FunctionEmitContext::PtrToIntInst(llvm::Value *value, const char *name) { if (name == NULL) name = LLVMGetName(value, "_ptr2int"); - value = lConvertToGenericPtr(this, value, currentPos); /* NVPTX : convert to addrspace(0) */ llvm::Type *type = LLVMTypes::PointerIntType; llvm::Instruction 
*inst = new llvm::PtrToIntInst(value, type, name, bblock); AddDebugPos(inst); @@ -2076,7 +1936,6 @@ FunctionEmitContext::PtrToIntInst(llvm::Value *value, llvm::Type *toType, } } - value = lConvertToGenericPtr(this, value, currentPos); /* NVPTX : convert to addrspace(0) */ llvm::Instruction *inst = new llvm::PtrToIntInst(value, toType, name, bblock); AddDebugPos(inst); return inst; @@ -2383,7 +2242,6 @@ FunctionEmitContext::MakeSlicePointer(llvm::Value *ptr, llvm::Value *offset) { llvm::Value * FunctionEmitContext::GetElementPtrInst(llvm::Value *basePtr, llvm::Value *index, const Type *ptrRefType, const char *name) { - basePtr = lConvertGepToGenericPtr(this, basePtr, currentPos); if (basePtr == NULL || index == NULL) { AssertPos(currentPos, m->errorCount > 0); return NULL; @@ -2454,7 +2312,6 @@ llvm::Value * FunctionEmitContext::GetElementPtrInst(llvm::Value *basePtr, llvm::Value *index0, llvm::Value *index1, const Type *ptrRefType, const char *name) { - basePtr = lConvertGepToGenericPtr(this, basePtr, currentPos); if (basePtr == NULL || index0 == NULL || index1 == NULL) { AssertPos(currentPos, m->errorCount > 0); return NULL; @@ -2657,7 +2514,6 @@ FunctionEmitContext::LoadInst(llvm::Value *ptr, const char *name) { if (name == NULL) name = LLVMGetName(ptr, "_load"); - ptr = lCorrectLocalPtr(this, ptr); /* NVPTX: correct addrspace(3) pointer before load/store */ llvm::LoadInst *inst = new llvm::LoadInst(ptr, name, bblock); if (g->opt.forceAlignedMemory && @@ -2790,7 +2646,6 @@ FunctionEmitContext::LoadInst(llvm::Value *ptr, llvm::Value *mask, // it's totally unaligned. (This shouldn't make any difference // vs the proper alignment in practice.) 
align = 1; - ptr = lCorrectLocalPtr(this, ptr); /* NVPTX: correct addrspace(3) pointer before load/store */ llvm::Instruction *inst = new llvm::LoadInst(ptr, name, false /* not volatile */, align, bblock); @@ -3218,7 +3073,6 @@ FunctionEmitContext::StoreInst(llvm::Value *value, llvm::Value *ptr) { llvm::dyn_cast(ptr->getType()); AssertPos(currentPos, pt != NULL); - ptr = lCorrectLocalPtr(this, ptr); /* NVPTX: correct addrspace(3) pointer before load/store */ llvm::StoreInst *inst = new llvm::StoreInst(value, ptr, bblock); if (g->opt.forceAlignedMemory && @@ -3531,14 +3385,6 @@ FunctionEmitContext::CallInst(llvm::Value *func, const FunctionType *funcType, return NULL; } -#if 0 - std::vector args = args_in; - /* NVPTX: - * Convert all pointers to addrspace(0) - */ - for (unsigned int i = 0; i < args.size(); i++) - args[i] = lConvertToGenericPtr(this, args[i], currentPos); -#endif std::vector argVals = args; // Most of the time, the mask is passed as the last argument. this // isn't the case for things like intrinsics, builtins, and extern "C" diff --git a/module.cpp b/module.cpp index fde31456..a8f521d8 100644 --- a/module.cpp +++ b/module.cpp @@ -2167,20 +2167,20 @@ Module::execPreprocessor(const char *infilename, llvm::raw_string_ostream *ostre if (g->target->getISA() == Target::NVPTX) { opts.addMacroDef("__NVPTX__"); - opts.addMacroDef("programIndex=laneIndex()"); + opts.addMacroDef("programIndex=__programIndex()"); opts.addMacroDef("cif=if"); opts.addMacroDef("cfor=for"); opts.addMacroDef("cwhile=while"); opts.addMacroDef("ccontinue=continue"); opts.addMacroDef("cdo=do"); - opts.addMacroDef("taskIndex0=blockIndex0()"); - opts.addMacroDef("taskCount0=blockCount0()"); - opts.addMacroDef("taskIndex1=blockIndex1()"); - opts.addMacroDef("taskCount1=blockCount1()"); - opts.addMacroDef("taskIndex2=blockIndex2()"); - opts.addMacroDef("taskCount2=blockCount2()"); - opts.addMacroDef("taskIndex=(taskIndex0 + taskCount0*(taskIndex1 + taskCount1*taskIndex2))"); - 
opts.addMacroDef("taskCount=(taskCount0*taskCount1*taskCount2)"); + opts.addMacroDef("taskIndex0=__taskIndex0()"); + opts.addMacroDef("taskIndex1=__taskIndex1()"); + opts.addMacroDef("taskIndex2=__taskIndex2()"); + opts.addMacroDef("taskIndex=__taskIndex()"); + opts.addMacroDef("taskCount0=__taskCount0()"); + opts.addMacroDef("taskCount1=__taskCount1()"); + opts.addMacroDef("taskCount2=__taskCount2()"); + opts.addMacroDef("taskCount=__taskCount()"); } #if defined(LLVM_3_1) diff --git a/stdlib.ispc b/stdlib.ispc index 1fccfc03..37cb4141 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -62,57 +62,25 @@ /////////////////////////////////////////////////////////////////////////// // CUDA Specific primitives // -#define CUDABLOCKSIZE 128 -#define WARPSIZE2 5 -#define WARPSIZE (1<> WARPSIZE2)) + (__tid_x() >> WARPSIZE2); -} -/***************/ -__declspec(safe,cost0) - static inline uniform int blockIndex1() -{ - return __ctaid_y(); -} -/***************/ -__declspec(safe,cost0) - static inline uniform int blockIndex2() -{ - return __ctaid_z(); -} -/***************/ -__declspec(safe,cost0) - static inline uniform int blockCount0() -{ - return __nctaid_x() * (CUDABLOCKSIZE >> WARPSIZE2); -} -/***************/ -__declspec(safe,cost0) - static inline uniform int blockCount1() -{ - return __nctaid_y(); -} -/***************/ -__declspec(safe,cost0) - static inline uniform int blockCount2() -{ - return __nctaid_z(); -} + +__declspec(safe,cost0) static inline uniform int __taskCount0() { return __task_count0(); } +__declspec(safe,cost0) static inline uniform int __taskCount1() { return __task_count1(); } +__declspec(safe,cost0) static inline uniform int __taskCount2() { return __task_count2(); } +__declspec(safe,cost0) static inline uniform int __taskCount () { return __task_count (); } /////////////////////////////////////////////////////////////////////////// // Low level primitives diff --git a/stmt.cpp b/stmt.cpp index 0f84215f..016bb0f4 100644 --- a/stmt.cpp +++ b/stmt.cpp @@ 
-186,11 +186,8 @@ static llvm::Value* lConvertToGenericPtr(FunctionEmitContext *ctx, llvm::Value * llvm::PointerType *arrElTyPt0 = llvm::PointerType::get(arrElTy, 0); value = ctx->BitCastInst(value, arrElTyPt0, "gep2gen_cast2"); - llvm::Function *funcTid = m->module->getFunction("__tid_x"); - llvm::Function *funcWarpSz = m->module->getFunction("__warpsize"); - llvm::Value *tid = ctx->CallInst(funcTid, NULL, std::vector<llvm::Value *>(), "gep2gen_tid"); - llvm::Value *warpSz = ctx->CallInst(funcWarpSz, NULL, std::vector<llvm::Value *>(), "gep2gen_warpSz"); - llvm::Value *warpId = ctx->BinaryOperator(llvm::Instruction::SDiv, tid, warpSz, "gep2gen_warpId"); + llvm::Function *func_warp_index = m->module->getFunction("__warp_index"); + llvm::Value *warpId = ctx->CallInst(func_warp_index, NULL, std::vector<llvm::Value *>(), "gep2gen_warp_index"); llvm::Value *offset = ctx->BinaryOperator(llvm::Instruction::Mul, warpId, LLVMInt32(numEl), "gep2gen_offset"); value = llvm::GetElementPtrInst::Create(value, offset, "gep2gen_offset", ctx->GetCurrentBasicBlock()); } @@ -1517,10 +1514,8 @@ lUpdateVaryingCounter(int dim, int nDims, FunctionEmitContext *ctx, llvm::Constant* constDelta = llvm::ConstantArray::get(ArrayDelta, constDeltaList); globalDelta->setInitializer(constDelta); - llvm::Function *func_tid_x = m->module->getFunction("__tid_x"); - std::vector<llvm::Value *> allocArgs; - llvm::Value *__tid_x = ctx->CallInst(func_tid_x, NULL, allocArgs, "laneIdxForEach"); - llvm::Value *laneIdx = ctx->BinaryOperator(llvm::Instruction::And, __tid_x, LLVMInt32(31), "__laneidx"); + llvm::Function *func_program_index = m->module->getFunction("__program_index"); + llvm::Value *laneIdx = ctx->CallInst(func_program_index, NULL, std::vector<llvm::Value *>(), "foreach__programIndex"); std::vector ptr_arrayidx_indices; ptr_arrayidx_indices.push_back(LLVMInt32(0));