improvements: rename the NVPTX builtins __tid_x / __warpsize / __ctaid_* / __nctaid_* to __program_index, __program_count, __warp_index, __task_index* and __task_count*, fold the lane-index computation into __program_index, and drop the unused addrspace(3/4) pointer-conversion helpers from ctx.cpp

Evghenii
2014-02-05 12:04:36 +01:00
parent 048da693c5
commit 686c1d676d
8 changed files with 142 additions and 306 deletions

View File

@@ -654,14 +654,17 @@ lSetInternalFunctions(llvm::Module *module) {
"__vec4_add_int32", "__vec4_add_int32",
"__vselect_float", "__vselect_float",
"__vselect_i32", "__vselect_i32",
"__tid_x", "__program_index",
"__ctaid_x", "__program_count",
"__ctaid_y", "__warp_index",
"__ctaid_z", "__task_index0",
"__nctaid_x", "__task_index1",
"__nctaid_y", "__task_index2",
"__nctaid_z", "__task_index",
"__warpsize", "__task_count0",
"__task_count1",
"__task_count2",
"__task_count",
"__cvt_loc2gen", "__cvt_loc2gen",
"__cvt_loc2gen_var", "__cvt_loc2gen_var",
"__cvt_const2gen" "__cvt_const2gen"

View File

@@ -17,57 +17,91 @@ declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.y() nounwind readnone
 declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.z() nounwind readnone
 declare i32 @llvm.nvvm.read.ptx.sreg.warpsize() nounwind readnone
-define i32 @__tid_x() nounwind readnone alwaysinline
+;;;;;;;;;;
+define i32 @__program_index() nounwind readnone alwaysinline
 {
   %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-  ret i32 %tid
+  %program_index = and i32 %tid, 31
+  ret i32 %program_index
 }
-define i32 @__warpsize() nounwind readnone alwaysinline
+define i32 @__program_count() nounwind readnone alwaysinline
 {
   ;; %tid = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
   ;; ret i32 %tid
   ret i32 32
 }
-define i32 @__laneidx() nounwind readnone alwaysinline
+define i32 @__warp_index() nounwind readnone alwaysinline
 {
-  %tid = tail call i32 @__tid_x()
-  %lane = and i32 %tid, 31
-  ret i32 %lane;
+  %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  %warp_index = lshr i32 %tid, 5
+  ret i32 %warp_index
 }
-define i32 @__ctaid_x() nounwind readnone alwaysinline
+;;;;;;;;;;;;
+define i32 @__task_index0() nounwind readnone alwaysinline
 {
   %bid = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
-  ret i32 %bid
+  %bid4 = shl i32 %bid, 2
+  %warp_index = call i32 @__warp_index()
+  %task_index0 = add i32 %bid4, %warp_index
+  ret i32 %task_index0
 }
-define i32 @__ctaid_y() nounwind readnone alwaysinline
+define i32 @__task_index1() nounwind readnone alwaysinline
 {
-  %bid = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
-  ret i32 %bid
+  %task_index1 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
+  ret i32 %task_index1
 }
-define i32 @__ctaid_z() nounwind readnone alwaysinline
+define i32 @__task_index2() nounwind readnone alwaysinline
 {
-  %bid = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z()
-  ret i32 %bid
+  %task_index2 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z()
+  ret i32 %task_index2
+}
+define i32 @__task_index() nounwind readnone alwaysinline
+{
+  %ti0 = call i32 @__task_index0()
+  %ti1 = call i32 @__task_index1()
+  %ti2 = call i32 @__task_index2()
+  %tc0 = call i32 @__task_count0()
+  %tc1 = call i32 @__task_count1()
+  %mul1 = mul i32 %tc1, %ti2
+  %add1 = add i32 %mul1, %ti1
+  %mul2 = mul i32 %add1, %tc0
+  %task_index = add i32 %mul2, %ti0
+  ret i32 %task_index
 }
-define i32 @__nctaid_x() nounwind readnone alwaysinline
+;;;;;
+define i32 @__task_count0() nounwind readnone alwaysinline
 {
   %nb = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
-  ret i32 %nb
+  %task_count0 = shl i32 %nb, 2
+  ret i32 %task_count0
 }
-define i32 @__nctaid_y() nounwind readnone alwaysinline
+define i32 @__task_count1() nounwind readnone alwaysinline
 {
-  %nb = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.y()
-  ret i32 %nb
+  %task_count1 = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.y()
+  ret i32 %task_count1
 }
-define i32 @__nctaid_z() nounwind readnone alwaysinline
+define i32 @__task_count2() nounwind readnone alwaysinline
 {
-  %nb = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.z()
-  ret i32 %nb
+  %task_count2 = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.z()
+  ret i32 %task_count2
+}
+define i32 @__task_count() nounwind readnone alwaysinline
+{
+  %tc0 = call i32 @__task_count0()
+  %tc1 = call i32 @__task_count1()
+  %tc2 = call i32 @__task_count2()
+  %mul1 = mul i32 %tc1, %tc2
+  %task_count = mul i32 %mul1, %tc0
+  ret i32 %task_count
 }
 ;;;;;;;;
 declare i64* @llvm.nvvm.ptr.shared.to.gen.p0i64.p3i64(i64 addrspace(3)*)
 declare i64* @llvm.nvvm.ptr.shared.to.gen.p0i64.p4i64(i64 addrspace(4)*)
 define i64* @__cvt_loc2gen(i64 addrspace(3)*) nounwind readnone alwaysinline
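For reference, the mapping the new builtins above encode, written as a CUDA-style sketch. It assumes the 4-warps-per-block, 32-lane-warp layout implied by the `shl 2` / `lshr 5` shifts; the helper names are illustrative and not part of this commit.

// Sketch only: C/CUDA equivalent of the new NVPTX builtins above.
__device__ int program_index() { return threadIdx.x & 31; }   // lane within the warp
__device__ int program_count() { return 32; }                  // warp width
__device__ int warp_index()    { return threadIdx.x >> 5; }    // warp within the block

__device__ int task_index0()   { return blockIdx.x * 4 + warp_index(); }  // one task per warp
__device__ int task_index1()   { return blockIdx.y; }
__device__ int task_index2()   { return blockIdx.z; }
__device__ int task_count0()   { return gridDim.x * 4; }
__device__ int task_count1()   { return gridDim.y; }
__device__ int task_count2()   { return gridDim.z; }

// Flattened forms, matching __task_index / __task_count above.
__device__ int task_index() {
    return (task_index2() * task_count1() + task_index1()) * task_count0() + task_index0();
}
__device__ int task_count() {
    return task_count0() * task_count1() * task_count2();
}

In other words, each warp acts as one ISPC gang/task, so a 128-thread CUDA block runs four tasks.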
@@ -228,10 +262,7 @@ declare i64 @cudaGetParameterBuffer(i64, i64) nounwind
 define i8* @ISPCAlloc(i8**, i64 %size, i32 %align32) nounwind alwaysinline
 {
 entry:
-  %call = tail call i32 @__tid_x()
-  %call1 = tail call i32 @__warpsize()
-  %sub = add nsw i32 %call1, -1
-  %and = and i32 %sub, %call
+  %and = call i32 @__program_index()
   %cmp = icmp eq i32 %and, 0
   %align = zext i32 %align32 to i64
   br i1 %cmp, label %if.then, label %if.end
@@ -270,10 +301,7 @@ entry:
   ;; %ntxm1d4 = sdiv i32 %ntxm1, 4
   %ntxm1d4 = ashr i32 %ntxm1, 2
   %nbx = add nsw i32 %ntxm1d4, 1
-  %call = tail call i32 @__tid_x()
-  %call1 = tail call i32 @__warpsize()
-  %sub = add nsw i32 %call1, -1
-  %and = and i32 %sub, %call
+  %and = call i32 @__program_index()
   ;; if (laneIdx == 0)
   %cmp = icmp eq i32 %and, 0
   br i1 %cmp, label %if.then, label %if.end
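The removed sequence computed laneIdx = tid & (__warpsize() - 1); since __warpsize() is hard-wired to 32 above, that is tid & 31, which is exactly what __program_index() returns. A small sketch of the equivalence (illustrative helpers, not part of the commit):

// laneIdx before and after this change; both reduce to (tid mod 32).
__device__ int lane_idx_old() { return threadIdx.x & (32 - 1); }  // tid & (__warpsize() - 1)
__device__ int lane_idx_new() { return threadIdx.x & 31; }        // what __program_index() computes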
@@ -1191,8 +1219,7 @@ define(`shift',`
 define <1 x $1> @__shift_$1(<1 x $1>, i32) nounwind readnone alwaysinline
 {
   %val = extractelement <1 x $1> %0, i32 0
-  %tid = tail call i32 @__tid_x()
-  %lane = and i32 %tid, 31
+  %lane = call i32 @__program_index()
   %src = add i32 %lane, %1
   %ret = tail call $1 @__shfl_$1_nvptx($1 %val, i32 %src)
   %c1 = icmp sge i32 %src, 0
@@ -1214,7 +1241,7 @@ define(`rotate', `
 define <1 x $1> @__rotate_$1(<1 x $1>, i32) nounwind readnone alwaysinline
 {
   %val = extractelement <1 x $1> %0, i32 0
-  %tid = tail call i32 @__tid_x()
+  %tid = call i32 @__program_index()
   %src = add i32 %tid, %1
   %lane = and i32 %src, 31
   %rets = tail call $1 @__shfl_$1_nvptx($1 %val, i32 %lane)
@@ -1569,7 +1596,7 @@ define $1 @__extract_$2(<1 x $1>, i32) nounwind readnone alwaysinline {
 define <1 x $1> @__insert_$2(<1 x $1>, i32,
                              $1) nounwind readnone alwaysinline {
   %orig = extractelement <1 x $1> %0, i32 0
-  %lane = call i32 @__laneidx()
+  %lane = call i32 @__program_index()
   %c = icmp eq i32 %lane, %1
   %val = select i1 %c, $1 %2, $1 %orig
   %insert = insertelement <1 x $1> %0, $1 %val, i32 0
@@ -1620,7 +1647,7 @@ define void @__do_assert_uniform(i8 *%str, i1 %test, <WIDTH x MASK> %mask) {
   br i1 %test, label %ok, label %fail
 fail:
-  %lane = call i32 @__laneidx()
+  %lane = call i32 @__program_index()
   %cmp = icmp eq i32 %lane, 0
   br i1 %cmp, label %fail_print, label %fail_void;
@@ -1966,7 +1993,7 @@ define $3 @__atomic_$2_uniform_$4_global($3 * %ptr, $3 %val) nounwind alwaysinli
 entry:
   %addr = ptrtoint $3 * %ptr to i64
   %active = call i32 @__get_first_active_lane();
-  %lane = call i32 @__laneidx();
+  %lane = call i32 @__program_index();
   %c = icmp eq i32 %lane, %active
   br i1 %c, label %p1, label %p2
@@ -2144,7 +2171,7 @@ define $3 @__atomic_$2_uniform_$4_global($3 * %ptr, $3 %cmp, $3 %val) nounwind a
 entry:
   %addr = ptrtoint $3 * %ptr to i64
   %active = call i32 @__get_first_active_lane();
-  %lane = call i32 @__laneidx();
+  %lane = call i32 @__program_index();
   %c = icmp eq i32 %lane, %active
   br i1 %c, label %p1, label %p2

View File

@@ -2032,10 +2032,7 @@ define noalias i8 * @__new_uniform_64rt(i64 %size)
 {
 entry:
   ;; compute laneIdx = __tid_x() & (__warpsize() - 1)
-  %call = tail call i32 @__tid_x()
-  %call1 = tail call i32 @__warpsize()
-  %sub = add nsw i32 %call1, -1
-  %and = and i32 %sub, %call
+  %and = call i32 @__program_index()
   ;; if (laneIdx == 0)
   %cmp = icmp eq i32 %and, 0
   br i1 %cmp, label %if.then, label %if.end
@@ -2062,10 +2059,7 @@ if.end: ; preds = %if.then, %entry
 define void @__delete_uniform_64rt(i8 * %ptr)
 {
 entry:
-  %call = tail call i32 @__tid_x()
-  %call1 = tail call i32 @__warpsize()
-  %sub = add nsw i32 %call1, -1
-  %and = and i32 %sub, %call
+  %and = call i32 @__program_index()
   %cmp = icmp eq i32 %and, 0
   br i1 %cmp, label %if.then, label %if.end

View File

@@ -4534,14 +4534,17 @@ define_down_avgs()
 define(`declare_nvptx',
 `
-declare i32 @__tid_x() nounwind readnone alwaysinline
-declare i32 @__warpsize() nounwind readnone alwaysinline
-declare i32 @__ctaid_x() nounwind readnone alwaysinline
-declare i32 @__ctaid_y() nounwind readnone alwaysinline
-declare i32 @__ctaid_z() nounwind readnone alwaysinline
-declare i32 @__nctaid_x() nounwind readnone alwaysinline
-declare i32 @__nctaid_y() nounwind readnone alwaysinline
-declare i32 @__nctaid_z() nounwind readnone alwaysinline
+declare i32 @__program_index() nounwind readnone alwaysinline
+declare i32 @__program_count() nounwind readnone alwaysinline
+declare i32 @__warp_index() nounwind readnone alwaysinline
+declare i32 @__task_index0() nounwind readnone alwaysinline
+declare i32 @__task_index1() nounwind readnone alwaysinline
+declare i32 @__task_index2() nounwind readnone alwaysinline
+declare i32 @__task_index() nounwind readnone alwaysinline
+declare i32 @__task_count0() nounwind readnone alwaysinline
+declare i32 @__task_count1() nounwind readnone alwaysinline
+declare i32 @__task_count2() nounwind readnone alwaysinline
+declare i32 @__task_count() nounwind readnone alwaysinline
 declare i64* @__cvt_loc2gen(i64 addrspace(3)*) nounwind readnone alwaysinline
 declare i64* @__cvt_const2gen(i64 addrspace(4)*) nounwind readnone alwaysinline
 declare i64* @__cvt_loc2gen_var(i64 addrspace(3)*) nounwind readnone alwaysinline

ctx.cpp
View File

@@ -1373,21 +1373,26 @@ FunctionEmitContext::None(llvm::Value *mask) {
 llvm::Value *
-FunctionEmitContext::LaneMask(llvm::Value *v) {
-    const char *__movmsk = g->target->getISA() == Target::NVPTX ? "__movmsk_ptx" : "__movmsk";
+FunctionEmitContext::LaneMask(llvm::Value *v)
+{
+#if 1 /* this makes mandelbrot example slower, why ?!? */
+    const char *__movmsk = g->target->getISA() == Target::NVPTX ? "__movmsk_ptx" : "__movmsk";
+#else
+    const char *__movmsk = "__movmsk";
+#endif
     // Call the target-dependent movmsk function to turn the vector mask
     // into an i64 value
     std::vector<Symbol *> mm;
     m->symbolTable->LookupFunction(__movmsk, &mm);
     if (g->target->getMaskBitCount() == 1)
         AssertPos(currentPos, mm.size() == 1);
     else
         // There should be one with signed int signature, one unsigned int.
         AssertPos(currentPos, mm.size() == 2);
     // We can actually call either one, since both are i32s as far as
     // LLVM's type system is concerned...
     llvm::Function *fmm = mm[0]->function;
     return CallInst(fmm, NULL, v, LLVMGetName(v, "_movmsk"));
 }

 bool lAppendInsertExtractName(llvm::Value *vector, std::string &funcName)
@@ -1476,13 +1481,9 @@ FunctionEmitContext::ProgramIndexVector(bool is32bits) {
 }
 llvm::Value *
 FunctionEmitContext::ProgramIndexVectorPTX(bool is32bits) {
-    llvm::Function *func_tid_x = m->module->getFunction("__tid_x");
-    llvm::Function *func_warpsz = m->module->getFunction("__warpsize");
-    llvm::Value *__tid_x = CallInst(func_tid_x, NULL, std::vector<llvm::Value*>(), "laneIdxForEach");
-    llvm::Value *__warpsz = CallInst(func_warpsz, NULL, std::vector<llvm::Value*>(), "warpSZForEach");
-    llvm::Value *__warpszm1 = BinaryOperator(llvm::Instruction::Add, __warpsz, LLVMInt32(-1), "__warpszm1");
-    llvm::Value *laneIdx = BinaryOperator(llvm::Instruction::And, __tid_x, __warpszm1, "__laneidx");
-    llvm::Value *index = InsertInst(llvm::UndefValue::get(LLVMTypes::Int32VectorType), laneIdx, 0, "__laneIdxV");
+    llvm::Function *func_program_index = m->module->getFunction("__program_index");
+    llvm::Value *__program_index = CallInst(func_program_index, NULL, std::vector<llvm::Value*>(), "foreach__program_indexS");
+    llvm::Value *index = InsertInst(llvm::UndefValue::get(LLVMTypes::Int32VectorType), __program_index, 0, "foreach__program_indexV");
 #if 0
     if (!is32bits)
         index = ZExtInst(index, LLVMTypes::Int64VectandType);
@@ -1887,146 +1888,6 @@ FunctionEmitContext::BitCastInst(llvm::Value *value, llvm::Type *type,
    return inst;
}
/* NVPTX:
* this is a helper function which adds a warp offset to a base pointer
* pointer must either be in local memory addrspace(3)
* or the one just converted from addrspace(3) to addrspace(0) in lConvertToGenericPtr
*/
static llvm::Value* lAddWarpOffset(FunctionEmitContext *ctx, llvm::Value *value)
{
llvm::Function *func_tid_x = m->module->getFunction("__tid_x");
llvm::Function *func_warpsz = m->module->getFunction("__warpsize");
llvm::Value *__tid_x = ctx->CallInst(func_tid_x, NULL, std::vector<llvm::Value*>(), "tidCorrectLocalPtr");
llvm::Value *__warpsz = ctx->CallInst(func_warpsz, NULL, std::vector<llvm::Value*>(), "warpSzCorrectLocaLPtr");
llvm::Value *_mwarpsz = ctx->BinaryOperator(llvm::Instruction::Sub, LLVMInt32(0), __warpsz, "mwarpSzCorrectLocalPtr");
llvm::Value *__offset = ctx->BinaryOperator(llvm::Instruction::And, __tid_x, _mwarpsz, "offsetCorrectLocalPtr");
return llvm::GetElementPtrInst::Create(value, __offset, "warpOffset_gep", ctx->GetCurrentBasicBlock());
}
static llvm::Value* lConvertGepToGenericPtr(FunctionEmitContext *ctx, llvm::Value *value, const SourcePos &currentPos)
{
if (!value->getType()->isPointerTy() || g->target->getISA() != Target::NVPTX)
return value;
llvm::PointerType *pt = llvm::dyn_cast<llvm::PointerType>(value->getType());
const int addressSpace = pt->getAddressSpace();
if (addressSpace != 3 && addressSpace != 4)
return value;
assert(0);
llvm::Type *elTy = pt->getElementType();
assert(elTy->isArrayTy());
const int numElTot = elTy->getArrayNumElements();
const int numEl = numElTot/4;
#if 0
fprintf(stderr, " --- detected addrspace(3) sz= %d --- \n", numEl);
#endif
llvm::ArrayType *arrTy = llvm::dyn_cast<llvm::ArrayType>(pt->getArrayElementType());
assert(arrTy != NULL);
llvm::Type *arrElTy = arrTy->getElementType();
#if 0
if (arrElTy->isArrayTy())
Error(currentPos, "Currently \"nvptx\" target doesn't support array-of-array");
#endif
/* convert elTy addrspace(3)* to i64* addrspace(3)* */
llvm::PointerType *Int64Ptr3 = llvm::PointerType::get(LLVMTypes::Int64Type, addressSpace);
value = ctx->BitCastInst(value, Int64Ptr3, "gep2gen_cast1");
/* convert i64* addrspace(3) to i64* */
llvm::Function *__cvt2gen = m->module->getFunction(
addressSpace == 3 ? "__cvt_loc2gen" : "__cvt_const2gen");
std::vector<llvm::Value *> __cvt2gen_args;
__cvt2gen_args.push_back(value);
value = llvm::CallInst::Create(__cvt2gen, __cvt2gen_args, "gep2gen_cvt", ctx->GetCurrentBasicBlock());
/* convert i64* to errElTy* */
llvm::PointerType *arrElTyPt0 = llvm::PointerType::get(arrElTy, 0);
value = ctx->BitCastInst(value, arrElTyPt0, "gep2gen_cast2");
/* compute offset */
if (addressSpace == 3)
{
llvm::Function *funcTid = m->module->getFunction("__tid_x");
llvm::Function *funcWarpSz = m->module->getFunction("__warpsize");
llvm::Value *tid = ctx->CallInst(funcTid, NULL, std::vector<llvm::Value*>(), "gep2gen_tid");
llvm::Value *warpSz = ctx->CallInst(funcWarpSz, NULL, std::vector<llvm::Value*>(), "gep2gen_warpSz");
llvm::Value *warpId = ctx->BinaryOperator(llvm::Instruction::SDiv, tid, warpSz, "gep2gen_warpId");
llvm::Value *offset = ctx->BinaryOperator(llvm::Instruction::Mul, warpId, LLVMInt32(numEl), "gep2gen_offset");
value = llvm::GetElementPtrInst::Create(value, offset, "gep2gen_offset", ctx->GetCurrentBasicBlock());
}
/* convert arrElTy* to elTy* */
llvm::PointerType *elTyPt0 = llvm::PointerType::get(elTy, 0);
value = ctx->BitCastInst(value, elTyPt0, "gep2gen_cast3");
return value;
}
/* NVPTX:
* this function compute correct address in local memory for load/store operations
*/
static llvm::Value* lCorrectLocalPtr(FunctionEmitContext *ctx, llvm::Value* value)
{
// return value;
assert(value->getType()->isPointerTy());
llvm::PointerType *pt = llvm::dyn_cast<llvm::PointerType>(value->getType());
if (g->target->getISA() != Target::NVPTX || pt->getAddressSpace() != 3) return value;
assert(0); /* we should never enter here */
return lAddWarpOffset(ctx, value);
}
/* NVPTX:
* this function converts a pointer in addrspace(3 or 4) to addrspace(0)
*/
static llvm::Value* lConvertToGenericPtr(FunctionEmitContext *ctx, llvm::Value *value, const SourcePos &currentPos)
{
// return value;
if (!value->getType()->isPointerTy() || g->target->getISA() != Target::NVPTX) return value;
llvm::PointerType *pt = llvm::dyn_cast<llvm::PointerType>(value->getType());
/* make sure addrspace corresponds to either local or constant memories */
const int addressSpace = pt->getAddressSpace();
if (addressSpace != 3 && addressSpace != 4) return value;
assert(0); /* we should never enter here */
/* if array, extracts element type */
llvm::Type *type = pt->getElementType();
llvm::Type *typeEl = type;
if (type->isArrayTy())
{
typeEl = type->getArrayElementType();
if (typeEl->isArrayTy())
Error(currentPos, "Currently \"nvptx\" target doesn't support array-of-array");
}
/* convert elTy addrspace(3)* to i64* addrspace(3)* */
llvm::PointerType *Int64Ptr3 = llvm::PointerType::get(LLVMTypes::Int64Type, addressSpace);
value = ctx->BitCastInst(value, Int64Ptr3, "cvt2gen_i64ptr");
/* convert i64* addrspace(3) to i64* */
llvm::Function *__cvt2gen = m->module->getFunction(
addressSpace == 3 ? "__cvt_loc2gen" : "__cvt_const2gen");
std::vector<llvm::Value *> __cvt2gen_args;
__cvt2gen_args.push_back(value);
#if 0
value = ctx->CallInst(__cvt2gen, NULL, __cvt2gen_args, "cvt2gen_call");
#else
value = llvm::CallInst::Create(__cvt2gen, __cvt2gen_args, "cvt2gen_call", ctx->GetCurrentBasicBlock());
#endif
/* convert i64* to elTy* */
llvm::PointerType *typeElPtr = llvm::PointerType::get(typeEl, 0);
value = ctx->BitCastInst(value, typeElPtr, "cvtLoc2Gen_i642ptr");
/* add warp offset to the pointer for local memory */
if (addressSpace == 3)
value = lAddWarpOffset(ctx, value);
return value;
}
llvm::Value *
FunctionEmitContext::PtrToIntInst(llvm::Value *value, const char *name) {
@@ -2042,7 +1903,6 @@ FunctionEmitContext::PtrToIntInst(llvm::Value *value, const char *name) {
     if (name == NULL)
         name = LLVMGetName(value, "_ptr2int");
-    value = lConvertToGenericPtr(this, value, currentPos); /* NVPTX : convert to addrspace(0) */
     llvm::Type *type = LLVMTypes::PointerIntType;
     llvm::Instruction *inst = new llvm::PtrToIntInst(value, type, name, bblock);
     AddDebugPos(inst);
@@ -2076,7 +1936,6 @@ FunctionEmitContext::PtrToIntInst(llvm::Value *value, llvm::Type *toType,
         }
     }
-    value = lConvertToGenericPtr(this, value, currentPos); /* NVPTX : convert to addrspace(0) */
     llvm::Instruction *inst = new llvm::PtrToIntInst(value, toType, name, bblock);
     AddDebugPos(inst);
     return inst;
@@ -2383,7 +2242,6 @@ FunctionEmitContext::MakeSlicePointer(llvm::Value *ptr, llvm::Value *offset) {
 llvm::Value *
 FunctionEmitContext::GetElementPtrInst(llvm::Value *basePtr, llvm::Value *index,
                                        const Type *ptrRefType, const char *name) {
-    basePtr = lConvertGepToGenericPtr(this, basePtr, currentPos);
     if (basePtr == NULL || index == NULL) {
         AssertPos(currentPos, m->errorCount > 0);
         return NULL;
@@ -2454,7 +2312,6 @@ llvm::Value *
 FunctionEmitContext::GetElementPtrInst(llvm::Value *basePtr, llvm::Value *index0,
                                        llvm::Value *index1, const Type *ptrRefType,
                                        const char *name) {
-    basePtr = lConvertGepToGenericPtr(this, basePtr, currentPos);
     if (basePtr == NULL || index0 == NULL || index1 == NULL) {
         AssertPos(currentPos, m->errorCount > 0);
         return NULL;
@@ -2657,7 +2514,6 @@ FunctionEmitContext::LoadInst(llvm::Value *ptr, const char *name) {
     if (name == NULL)
         name = LLVMGetName(ptr, "_load");
-    ptr = lCorrectLocalPtr(this, ptr); /* NVPTX: correct addrspace(3) pointer before load/store */
     llvm::LoadInst *inst = new llvm::LoadInst(ptr, name, bblock);
     if (g->opt.forceAlignedMemory &&
@@ -2790,7 +2646,6 @@ FunctionEmitContext::LoadInst(llvm::Value *ptr, llvm::Value *mask,
         // it's totally unaligned. (This shouldn't make any difference
         // vs the proper alignment in practice.)
         align = 1;
-    ptr = lCorrectLocalPtr(this, ptr); /* NVPTX: correct addrspace(3) pointer before load/store */
     llvm::Instruction *inst = new llvm::LoadInst(ptr, name,
                                                  false /* not volatile */,
                                                  align, bblock);
@@ -3218,7 +3073,6 @@ FunctionEmitContext::StoreInst(llvm::Value *value, llvm::Value *ptr) {
         llvm::dyn_cast<llvm::PointerType>(ptr->getType());
     AssertPos(currentPos, pt != NULL);
-    ptr = lCorrectLocalPtr(this, ptr); /* NVPTX: correct addrspace(3) pointer before load/store */
     llvm::StoreInst *inst = new llvm::StoreInst(value, ptr, bblock);
     if (g->opt.forceAlignedMemory &&
@@ -3531,14 +3385,6 @@ FunctionEmitContext::CallInst(llvm::Value *func, const FunctionType *funcType,
         return NULL;
     }
-#if 0
-    std::vector<llvm::Value *> args = args_in;
-    /* NVPTX:
-     * Convert all pointers to addrspace(0)
-     */
-    for (unsigned int i = 0; i < args.size(); i++)
-        args[i] = lConvertToGenericPtr(this, args[i], currentPos);
-#endif
     std::vector<llvm::Value *> argVals = args;
     // Most of the time, the mask is passed as the last argument. this
     // isn't the case for things like intrinsics, builtins, and extern "C"

View File

@@ -2167,20 +2167,20 @@ Module::execPreprocessor(const char *infilename, llvm::raw_string_ostream *ostre
     if (g->target->getISA() == Target::NVPTX)
     {
         opts.addMacroDef("__NVPTX__");
-        opts.addMacroDef("programIndex=laneIndex()");
+        opts.addMacroDef("programIndex=__programIndex()");
         opts.addMacroDef("cif=if");
         opts.addMacroDef("cfor=for");
         opts.addMacroDef("cwhile=while");
         opts.addMacroDef("ccontinue=continue");
         opts.addMacroDef("cdo=do");
-        opts.addMacroDef("taskIndex0=blockIndex0()");
-        opts.addMacroDef("taskCount0=blockCount0()");
-        opts.addMacroDef("taskIndex1=blockIndex1()");
-        opts.addMacroDef("taskCount1=blockCount1()");
-        opts.addMacroDef("taskIndex2=blockIndex2()");
-        opts.addMacroDef("taskCount2=blockCount2()");
-        opts.addMacroDef("taskIndex=(taskIndex0 + taskCount0*(taskIndex1 + taskCount1*taskIndex2))");
-        opts.addMacroDef("taskCount=(taskCount0*taskCount1*taskCount2)");
+        opts.addMacroDef("taskIndex0=__taskIndex0()");
+        opts.addMacroDef("taskIndex1=__taskIndex1()");
+        opts.addMacroDef("taskIndex2=__taskIndex2()");
+        opts.addMacroDef("taskIndex=__taskIndex()");
+        opts.addMacroDef("taskCount0=__taskCount0()");
+        opts.addMacroDef("taskCount1=__taskCount1()");
+        opts.addMacroDef("taskCount2=__taskCount2()");
+        opts.addMacroDef("taskCount=__taskCount()");
     }
 #if defined(LLVM_3_1)
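The old taskIndex macro expanded to an inline arithmetic expression; the new one calls __taskIndex(), whose IR (earlier hunk) flattens the same way. A quick check that the two forms agree (plain C, illustrative only):

// Both expansions flatten (ti0, ti1, ti2) over (tc0, tc1):
//   old macro:       ti0 + tc0 * (ti1 + tc1 * ti2)
//   __task_index():  (ti2 * tc1 + ti1) * tc0 + ti0
// Expanding either gives ti0 + tc0*ti1 + tc0*tc1*ti2.
int flatten_task_index(int ti0, int ti1, int ti2, int tc0, int tc1) {
    return ti0 + tc0 * (ti1 + tc1 * ti2);
}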

View File

@@ -62,57 +62,25 @@
 ///////////////////////////////////////////////////////////////////////////
 // CUDA Specific primitives
 //
-#define CUDABLOCKSIZE 128
-#define WARPSIZE2 5
-#define WARPSIZE (1<<WARPSIZE2)
 /***************/
-__declspec(safe,cost0)
-static inline uniform int warpSize()
-{
-  return WARPSIZE; //__warpsize();
-}
+__declspec(safe,cost0) static inline varying int __programIndex() { return __program_index(); }
+__declspec(safe,cost0) static inline uniform int __programCount() { return __program_count(); }
+__declspec(safe,cost0) static inline uniform int __warpIndex() { return __warp_index(); }
 /***************/
-__declspec(safe,cost0)
-static inline varying int laneIndex()
-{
-  return __tid_x() & (WARPSIZE-1) ; //& (warpSize()-1);
-}
+__declspec(safe,cost0) static inline uniform int __taskIndex0() { return __task_index0(); }
+__declspec(safe,cost0) static inline uniform int __taskIndex1() { return __task_index1(); }
+__declspec(safe,cost0) static inline uniform int __taskIndex2() { return __task_index2(); }
+__declspec(safe,cost0) static inline uniform int __taskIndex () { return __task_index (); }
 /***************/
-__declspec(safe,cost0)
-static inline uniform int blockIndex0()
-{
-  return (__ctaid_x() * (CUDABLOCKSIZE >> WARPSIZE2)) + (__tid_x() >> WARPSIZE2);
-}
-/***************/
-__declspec(safe,cost0)
-static inline uniform int blockIndex1()
-{
-  return __ctaid_y();
-}
-/***************/
-__declspec(safe,cost0)
-static inline uniform int blockIndex2()
-{
-  return __ctaid_z();
-}
-/***************/
-__declspec(safe,cost0)
-static inline uniform int blockCount0()
-{
-  return __nctaid_x() * (CUDABLOCKSIZE >> WARPSIZE2);
-}
-/***************/
-__declspec(safe,cost0)
-static inline uniform int blockCount1()
-{
-  return __nctaid_y();
-}
-/***************/
-__declspec(safe,cost0)
-static inline uniform int blockCount2()
-{
-  return __nctaid_z();
-}
+__declspec(safe,cost0) static inline uniform int __taskCount0() { return __task_count0(); }
+__declspec(safe,cost0) static inline uniform int __taskCount1() { return __task_count1(); }
+__declspec(safe,cost0) static inline uniform int __taskCount2() { return __task_count2(); }
+__declspec(safe,cost0) static inline uniform int __taskCount () { return __task_count (); }
 ///////////////////////////////////////////////////////////////////////////
 // Low level primitives

View File

@@ -186,11 +186,8 @@ static llvm::Value* lConvertToGenericPtr(FunctionEmitContext *ctx, llvm::Value *
       llvm::PointerType *arrElTyPt0 = llvm::PointerType::get(arrElTy, 0);
       value = ctx->BitCastInst(value, arrElTyPt0, "gep2gen_cast2");

-      llvm::Function *funcTid = m->module->getFunction("__tid_x");
-      llvm::Function *funcWarpSz = m->module->getFunction("__warpsize");
-      llvm::Value *tid = ctx->CallInst(funcTid, NULL, std::vector<llvm::Value*>(), "gep2gen_tid");
-      llvm::Value *warpSz = ctx->CallInst(funcWarpSz, NULL, std::vector<llvm::Value*>(), "gep2gen_warpSz");
-      llvm::Value *warpId = ctx->BinaryOperator(llvm::Instruction::SDiv, tid, warpSz, "gep2gen_warpId");
+      llvm::Function *func_warp_index = m->module->getFunction("__warp_index");
+      llvm::Value *warpId = ctx->CallInst(func_warp_index, NULL, std::vector<llvm::Value*>(), "gep2gen_warp_index");
       llvm::Value *offset = ctx->BinaryOperator(llvm::Instruction::Mul, warpId, LLVMInt32(numEl), "gep2gen_offset");
       value = llvm::GetElementPtrInst::Create(value, offset, "gep2gen_offset", ctx->GetCurrentBasicBlock());
     }
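The GEP above offsets a shared-memory (addrspace(3)) array by warpId * numEl so that each warp addresses its own slice. Roughly, in CUDA terms (illustrative sketch; numEl corresponds to elTy->getArrayNumElements()/4 in the surrounding code):

// Each warp gets a private numEl-element slice of the __shared__ array.
template <typename T>
__device__ T *warp_slice(T *shared_base, int numEl) {
    int warpId = threadIdx.x >> 5;        // same value as __warp_index()
    return shared_base + warpId * numEl;  // matches the gep2gen_offset GEP
}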
@@ -1517,10 +1514,8 @@ lUpdateVaryingCounter(int dim, int nDims, FunctionEmitContext *ctx,
     llvm::Constant* constDelta = llvm::ConstantArray::get(ArrayDelta, constDeltaList);
     globalDelta->setInitializer(constDelta);

-    llvm::Function *func_tid_x = m->module->getFunction("__tid_x");
-    std::vector<llvm::Value *> allocArgs;
-    llvm::Value *__tid_x = ctx->CallInst(func_tid_x, NULL, allocArgs, "laneIdxForEach");
-    llvm::Value *laneIdx = ctx->BinaryOperator(llvm::Instruction::And, __tid_x, LLVMInt32(31), "__laneidx");
+    llvm::Function *func_program_index = m->module->getFunction("__program_index");
+    llvm::Value *laneIdx = ctx->CallInst(func_program_index, NULL, std::vector<llvm::Value*>(), "foreach__programIndex");

     std::vector<llvm::Value*> ptr_arrayidx_indices;
     ptr_arrayidx_indices.push_back(LLVMInt32(0));