Add an NVPTX-specific "__movmsk_ptx" builtin and route NVPTX code generation to it.

Summary of the hunks below:
  * builtins.cpp, builtins/target-nvptx.ll: register and define __movmsk_ptx,
    which calls __ballot_nvptx; __movmsk keeps zero-extending the single mask bit.
  * ctx.cpp, ctx.h: LaneMask() looks up "__movmsk_ptx" when the ISA is
    Target::NVPTX; the branch of ProgramIndexVector() that was disabled by
    "if (1 || ...)" is split out as ProgramIndexVectorPTX(); Insert() and
    Extract() are added as stubs that return NULL.
  * stdlib.ispc: the varying-pointer atomic_##NAME##_local extracts each lane's
    pointer with extract((int64)p, i) instead of indexing a per-lane pointer array.
  * stmt.cpp: foreach_active uses ProgramIndexVectorPTX() on NVPTX; the
    ForeachUniqueStmt constructor reports an error on NVPTX.

Review notes (please confirm before applying):
  * Leading indentation and blank context lines were re-created from the hunk
    line counts; compare against the target tree if the patch does not apply cleanly.
  * Template argument lists look stripped in a few lines ("std::vector mm;",
    "llvm::SmallVector array;", "std::vector()"); they are kept exactly as received.
  * The visible lines of ProgramIndexVectorPTX() build an Int32 vector and do
    not read is32bits; confirm that this is intended.
  * The added ctx.h comments and the stmt.cpp error message were corrected by
    hand, so the post-image blob ids on those two "index" lines no longer
    describe the resulting files.

diff --git a/builtins.cpp b/builtins.cpp
index bec7a3e5..80740146 100644
--- a/builtins.cpp
+++ b/builtins.cpp
@@ -481,6 +481,7 @@ lSetInternalFunctions(llvm::Module *module) {
         "__min_varying_uint32",
         "__min_varying_uint64",
         "__movmsk",
+        "__movmsk_ptx",
         "__new_uniform_32rt",
         "__new_uniform_64rt",
         "__new_varying32_32rt",
diff --git a/builtins/target-nvptx.ll b/builtins/target-nvptx.ll
index 1e8d0ae5..fd314d3b 100644
--- a/builtins/target-nvptx.ll
+++ b/builtins/target-nvptx.ll
@@ -722,16 +722,14 @@ svml_stubs(double,d,WIDTH)
 
 define i64 @__movmsk(<1 x i1>) nounwind readnone alwaysinline {
   %v = extractelement <1 x i1> %0, i32 0
-;; if 0
-  ;; this one fails with ./tests/popcnt-4.ispc and others ...
-;;  %v0 = call i32 @__ballot_nvptx(i1 %v)
-;;  %v64 = zext i32 %v0 to i64
-
-;; else
-  ;; this one just copies mask
-  %v64 = zext i1 %v to i64
-;; endif
-  ret i64 %v64
+  %v64 = zext i1 %v to i64
+  ret i64 %v64
+}
+define i64 @__movmsk_ptx(<1 x i1>) nounwind readnone alwaysinline {
+  %v = extractelement <1 x i1> %0, i32 0
+  %v0 = call i32 @__ballot_nvptx(i1 %v)
+  %v64 = zext i32 %v0 to i64
+  ret i64 %v64
 }
 
 define i1 @__any(<1 x i1>) nounwind readnone alwaysinline {
diff --git a/ctx.cpp b/ctx.cpp
index 6fb7561d..1f6e5e53 100644
--- a/ctx.cpp
+++ b/ctx.cpp
@@ -1374,10 +1374,11 @@ FunctionEmitContext::None(llvm::Value *mask) {
 
 llvm::Value *
 FunctionEmitContext::LaneMask(llvm::Value *v) {
+    const char *__movmsk = g->target->getISA() == Target::NVPTX ? "__movmsk_ptx" : "__movmsk";
     // Call the target-dependent movmsk function to turn the vector mask
     // into an i64 value
     std::vector mm;
-    m->symbolTable->LookupFunction("__movmsk", &mm);
+    m->symbolTable->LookupFunction(__movmsk, &mm);
     if (g->target->getMaskBitCount() == 1)
         AssertPos(currentPos, mm.size() == 1);
     else
@@ -1389,6 +1390,18 @@ FunctionEmitContext::LaneMask(llvm::Value *v) {
     return CallInst(fmm, NULL, v, LLVMGetName(v, "_movmsk"));
 }
 
+llvm::Value*
+FunctionEmitContext::Insert(llvm::Value *vector, llvm::Value *lane, llvm::Value *scalar)
+{
+    return NULL;
+}
+
+llvm::Value*
+FunctionEmitContext::Extract(llvm::Value *vector, llvm::Value *lane)
+{
+    return NULL;
+}
+
 llvm::Value *
 FunctionEmitContext::MasksAllEqual(llvm::Value *v1, llvm::Value *v2) {
 
@@ -1410,8 +1423,6 @@ FunctionEmitContext::MasksAllEqual(llvm::Value *v1, llvm::Value *v2) {
 
 llvm::Value *
 FunctionEmitContext::ProgramIndexVector(bool is32bits) {
-  if (1 || g->target->getISA() != Target::NVPTX)
-  {
     llvm::SmallVector array;
     for (int i = 0; i < g->target->getVectorWidth() ; ++i) {
         llvm::Constant *C = is32bits ? LLVMInt32(i) : LLVMInt64(i);
@@ -1421,9 +1432,9 @@ FunctionEmitContext::ProgramIndexVector(bool is32bits) {
     llvm::Constant* index = llvm::ConstantVector::get(array);
 
     return index;
-  }
-  else
-  { /* this calls __tid_x() & __warpsize */
+}
+llvm::Value *
+FunctionEmitContext::ProgramIndexVectorPTX(bool is32bits) {
     llvm::Function *func_tid_x = m->module->getFunction("__tid_x");
     llvm::Function *func_warpsz = m->module->getFunction("__warpsize");
     llvm::Value *__tid_x = CallInst(func_tid_x, NULL, std::vector(), "laneIdxForEach");
@@ -1432,7 +1443,6 @@ FunctionEmitContext::ProgramIndexVector(bool is32bits) {
     llvm::Value *laneIdx = BinaryOperator(llvm::Instruction::And, __tid_x, __warpszm1, "__laneidx");
     llvm::Value *index = InsertInst(llvm::UndefValue::get(LLVMTypes::Int32VectorType), laneIdx, 0, "__laneIdxV");
     return index;
-  }
 }
 
 
diff --git a/ctx.h b/ctx.h
index 4dd30053..57160c17 100644
--- a/ctx.h
+++ b/ctx.h
@@ -291,6 +291,13 @@ public:
         of the mask is on. */
     llvm::Value *LaneMask(llvm::Value *mask);
 
+
+    /** Intended to issue a call to __insert_int8/int16/int32/int64/float/double; the definition added in ctx.cpp is still a stub that returns NULL. */
+    llvm::Value* Insert(llvm::Value *vector, llvm::Value *lane, llvm::Value *scalar);
+    /** Intended to issue a call to __extract_int8/int16/int32/int64/float/double; the definition added in ctx.cpp is still a stub that returns NULL. */
+    llvm::Value* Extract(llvm::Value *vector, llvm::Value *lane);
+
+
     /** Given two masks of type LLVMTypes::MaskType, return an i1 value
         that indicates whether the two masks are equal. */
     llvm::Value *MasksAllEqual(llvm::Value *mask1, llvm::Value *mask2);
@@ -298,6 +305,7 @@ public:
     /** Generate ConstantVector, which contains ProgramIndex, i.e.
         < i32 0, i32 1, i32 2, i32 3> */
     llvm::Value *ProgramIndexVector(bool is32bits = true);
+    llvm::Value *ProgramIndexVectorPTX(bool is32bits = true);
 
     /** Given a string, create an anonymous global variable to hold its
         value and return the pointer to the string. */
diff --git a/stdlib.ispc b/stdlib.ispc
index 871129e3..3e37ac5b 100644
--- a/stdlib.ispc
+++ b/stdlib.ispc
@@ -2057,11 +2057,10 @@ static inline TYPE atomic_##NAME##_local(uniform TYPE * uniform ptr, TYPE value)
 } \
 static inline TYPE atomic_##NAME##_local(uniform TYPE * p, TYPE value) { \
     TYPE ret; \
-    uniform TYPE * uniform ptrs[programCount]; \
-    ptrs[programIndex] = p; \
     foreach_active (i) { \
-        ret = insert(ret, i, *ptrs[i]); \
-        *ptrs[i] = OPFUNC(*ptrs[i], extract(value, i)); \
+        uniform TYPE * uniform ptr = (uniform TYPE * uniform)extract((int64)p, i); \
+        ret = insert(ret, i, *ptr); \
+        *ptr = OPFUNC(*ptr, extract(value, i)); \
     } \
     return ret; \
 }
diff --git a/stmt.cpp b/stmt.cpp
index 2160cbaf..b30a0000 100644
--- a/stmt.cpp
+++ b/stmt.cpp
@@ -2243,7 +2243,8 @@ ForeachActiveStmt::EmitCode(FunctionEmitContext *ctx) const {
     // math...)
 
     // Get the "program index" vector value
-    llvm::Value *programIndex = ctx->ProgramIndexVector();
+    llvm::Value *programIndex = g->target->getISA() == Target::NVPTX ?
+        ctx->ProgramIndexVectorPTX() : ctx->ProgramIndexVector();
 
     // And smear the current lane out to a vector
     llvm::Value *firstSet32 =
@@ -2354,6 +2355,8 @@ ForeachUniqueStmt::ForeachUniqueStmt(const char *iterName, Expr *e,
     sym = m->symbolTable->LookupVariable(iterName);
     expr = e;
     stmts = s;
+    if (g->target->getISA() == Target::NVPTX)
+        Error(pos, "\"foreach_unique\" is not yet supported with \"nvptx\" target.");
 }
 
 