diff --git a/Makefile b/Makefile index 69b8423c..abe7e1f7 100644 --- a/Makefile +++ b/Makefile @@ -144,7 +144,7 @@ CXX_SRC=ast.cpp builtins.cpp cbackend.cpp ctx.cpp decl.cpp expr.cpp func.cpp \ type.cpp util.cpp HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \ opt.h stmt.h sym.h type.h util.h -TARGETS=nvptx64 avx2-i64x4 avx11-i64x4 avx1-i64x4 avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 \ +TARGETS=nvptx avx2-i64x4 avx11-i64x4 avx1-i64x4 avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 \ sse2 sse2-x2 sse4-8 sse4-16 sse4 sse4-x2 \ generic-4 generic-8 generic-16 generic-32 generic-64 generic-1 ifneq ($(ARM_ENABLED), 0) @@ -254,15 +254,15 @@ objs/lex.o: objs/lex.cpp $(HEADERS) objs/parse.cc @echo Compiling $< $(CXX) $(CXXFLAGS) -o $@ -c $< -objs/builtins-dispatch.cpp: builtins/dispatch.ll builtins/util.m4 builtins/util_ptx.m4 builtins/svml.m4 $(wildcard builtins/*common.ll) +objs/builtins-dispatch.cpp: builtins/dispatch.ll builtins/util.m4 builtins/util-nvptx.m4 builtins/svml.m4 $(wildcard builtins/*common.ll) @echo Creating C++ source from builtins definition file $< m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) -DBUILD_OS=UNIX $< | python bitcode2cpp.py $< > $@ -objs/builtins-%-32bit.cpp: builtins/%.ll builtins/util.m4 builtins/util_ptx.m4 builtins/svml.m4 $(wildcard builtins/*common.ll) +objs/builtins-%-32bit.cpp: builtins/%.ll builtins/util.m4 builtins/util-nvptx.m4 builtins/svml.m4 $(wildcard builtins/*common.ll) @echo Creating C++ source from builtins definition file $< \(32 bit version\) m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) -DBUILD_OS=UNIX -DRUNTIME=32 $< | python bitcode2cpp.py $< 32bit > $@ -objs/builtins-%-64bit.cpp: builtins/%.ll builtins/util.m4 builtins/util_ptx.m4 builtins/svml.m4 $(wildcard builtins/*common.ll) +objs/builtins-%-64bit.cpp: builtins/%.ll builtins/util.m4 builtins/util-nvptx.m4 builtins/svml.m4 $(wildcard builtins/*common.ll) @echo Creating C++ source from builtins definition file $< \(64 bit version\) m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) -DBUILD_OS=UNIX -DRUNTIME=64 $< | python bitcode2cpp.py $< 64bit > $@ diff --git a/builtins.cpp b/builtins.cpp index 40f7006b..a7820c4c 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -693,9 +693,9 @@ AddBitcodeToModule(const unsigned char *bitcode, int length, if (g->target->getISA() != Target::NEON32 && g->target->getISA() != Target::NEON16 && g->target->getISA() != Target::NEON8 && - g->target->getISA() != Target::NVPTX64) + g->target->getISA() != Target::NVPTX) #else - if (g->target->getISA() != Target::NVPTX64) + if (g->target->getISA() != Target::NVPTX) #endif // !__arm__ { Assert(bcTriple.getArch() == llvm::Triple::UnknownArch || @@ -858,14 +858,14 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod // Next, add the target's custom implementations of the various needed // builtin functions (e.g. __masked_store_32(), etc). switch (g->target->getISA()) { - case Target::NVPTX64: + case Target::NVPTX: { if (runtime32) { - fprintf(stderr, "W're sorry, but only 64bit targets are supported at this moment .. \n"); + fprintf(stderr, "Unfortunately, 32bit targets are not supported at the moment ..
\n"); assert(0); } else { - EXPORT_MODULE(builtins_bitcode_nvptx64_64bit); + EXPORT_MODULE(builtins_bitcode_nvptx_64bit); } break; }; @@ -1138,7 +1138,7 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod } // define the 'programCount' builtin variable - if (!g->target->isPTX()) + if (g->target->getISA() != Target::NVPTX) { lDefineConstantInt("programCount", g->target->getVectorWidth(), module, symbolTable); } diff --git a/builtins/target-nvptx64.ll b/builtins/target-nvptx.ll similarity index 99% rename from builtins/target-nvptx64.ll rename to builtins/target-nvptx.ll index d43e5d4a..db217e9a 100644 --- a/builtins/target-nvptx64.ll +++ b/builtins/target-nvptx.ll @@ -105,15 +105,9 @@ define i32 @__lanemask_lt_nvptx() nounwind readnone alwaysinline ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; tasking -define i8* @ISPCAlloc(i8**, i64, i32) nounwind alwaysinline -{ - %ptr = inttoptr i64 1 to i8* - ret i8* %ptr -} - ;; this call allocate parameter buffer for kernel launch declare i64 @cudaGetParameterBuffer(i64, i64) nounwind -define i8* @ISPCGetParamBuffer(i8**, i64 %align, i64 %size) nounwind alwaysinline +define i8* @ISPCAlloc(i8**, i64 %size, i32 %align32) nounwind alwaysinline { entry: %call = tail call i32 @__tid_x() @@ -121,6 +115,7 @@ entry: %sub = add nsw i32 %call1, -1 %and = and i32 %sub, %call %cmp = icmp eq i32 %and, 0 + %align = zext i32 %align32 to i64 br i1 %cmp, label %if.then, label %if.end if.then: @@ -224,7 +219,7 @@ define void @ISPCSync(i8*) nounwind alwaysinline -include(`util_ptx.m4') +include(`util-nvptx.m4') stdlib_core() packed_load_and_store() diff --git a/builtins/util_ptx.m4 b/builtins/util-nvptx.m4 similarity index 100% rename from builtins/util_ptx.m4 rename to builtins/util-nvptx.m4 diff --git a/ctx.cpp b/ctx.cpp index 74a760ae..b5ca392c 100644 --- a/ctx.cpp +++ b/ctx.cpp @@ -1410,7 +1410,7 @@ FunctionEmitContext::MasksAllEqual(llvm::Value *v1, llvm::Value *v2) { llvm::Value * FunctionEmitContext::ProgramIndexVector(bool is32bits) { - if (!g->target->isPTX()) //g->target->getISA() != Target::NVPTX64) + if (g->target->getISA() != Target::NVPTX) { llvm::SmallVector array; for (int i = 0; i < g->target->getVectorWidth() ; ++i) { @@ -3540,7 +3540,7 @@ FunctionEmitContext::LaunchInst(llvm::Value *callee, std::vector &argVals, llvm::Value *launchCount[3]){ - if (!g->target->isPTX()) + if (g->target->getISA() != Target::NVPTX) { if (callee == NULL) { AssertPos(currentPos, m->errorCount > 0); @@ -3608,7 +3608,79 @@ FunctionEmitContext::LaunchInst(llvm::Value *callee, args.push_back(launchCount[2]); return CallInst(flaunch, NULL, args, ""); } - else /* isPTX == true */ + else /* NVPTX */ + { + if (callee == NULL) { + AssertPos(currentPos, m->errorCount > 0); + return NULL; + } + launchedTasks = true; + + AssertPos(currentPos, llvm::isa(callee)); + std::vector argTypes; + for (unsigned int i = 0; i < argVals.size(); i++) + argTypes.push_back(argVals[i]->getType()); + llvm::Type *st = llvm::StructType::get(*g->ctx, argTypes); + llvm::StructType *argStructType = static_cast(st); + llvm::Value *structSize = g->target->SizeOf(argStructType, bblock); + if (structSize->getType() != LLVMTypes::Int64Type) + structSize = ZExtInst(structSize, LLVMTypes::Int64Type, + "struct_size_to_64"); + + const int align = 8; + llvm::Function *falloc = m->module->getFunction("ISPCAlloc"); + AssertPos(currentPos, falloc != NULL); + std::vector allocArgs; + allocArgs.push_back(launchGroupHandlePtr); + allocArgs.push_back(structSize); + 
allocArgs.push_back(LLVMInt32(align)); + llvm::Value *voidmem = CallInst(falloc, NULL, allocArgs, "args_ptr"); + llvm::Value *voidi64 = PtrToIntInst(voidmem, "args_i64"); + llvm::BasicBlock* if_true = CreateBasicBlock("if_true"); + llvm::BasicBlock* if_false = CreateBasicBlock("if_false"); + + /* check that the pointer returned by ISPCAlloc is not NULL + * -------------- + * this is a workaround for not checking the value of programIndex + * because ISPCAlloc will return a NULL pointer for all programIndex > 0 + * of course, if ISPCAlloc fails to get a parameter buffer, the pointer for programIndex = 0 + * will also be NULL + * This check is required; the code should also be rewritten to make it less opaque + */ + llvm::Value* cmp1 = CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_NE, voidi64, LLVMInt64(0), "cmp1"); + BranchInst(if_true, if_false, cmp1); + + /**********************/ + bblock = if_true; + + // label_if_then block: + llvm::Type *pt = llvm::PointerType::getUnqual(st); + llvm::Value *argmem = BitCastInst(voidmem, pt); + for (unsigned int i = 0; i < argVals.size(); ++i) + { + llvm::Value *ptr = AddElementOffset(argmem, i, NULL, "funarg"); + // don't need to do masked store here, I think + StoreInst(argVals[i], ptr); + } + BranchInst(if_false); + + /**********************/ + bblock = if_false; + + llvm::Value *fptr = BitCastInst(callee, LLVMTypes::VoidPointerType); + llvm::Function *flaunch = m->module->getFunction("ISPCLaunch"); + AssertPos(currentPos, flaunch != NULL); + std::vector args; + args.push_back(launchGroupHandlePtr); + args.push_back(fptr); + args.push_back(voidmem); + args.push_back(launchCount[0]); + args.push_back(launchCount[1]); + args.push_back(launchCount[2]); + llvm::Value *ret = CallInst(flaunch, NULL, args, ""); + return ret; + } +#if 0 { if (callee == NULL) { AssertPos(currentPos, m->errorCount > 0); @@ -3684,13 +3756,16 @@ FunctionEmitContext::LaunchInst(llvm::Value *callee, args.push_back(launchCount[2]); return CallInst(flaunch, NULL, args, ""); } +#endif } void FunctionEmitContext::SyncInst() { - if (!g->target->isPTX()) +#if 0 + if (g->target->getISA() != Target::NVPTX) { +#endif llvm::Value *launchGroupHandle = LoadInst(launchGroupHandlePtr); llvm::Value *nullPtrValue = llvm::Constant::getNullValue(LLVMTypes::VoidPointerType); @@ -3714,6 +3789,7 @@ FunctionEmitContext::SyncInst() { BranchInst(bPostSync); SetCurrentBasicBlock(bPostSync); +#if 0 } else { @@ -3726,6 +3802,7 @@ FunctionEmitContext::SyncInst() { CallInst(fsync, NULL, launchGroupHandle, ""); StoreInst(nullPtrValue, launchGroupHandlePtr); } +#endif } diff --git a/decl.cpp b/decl.cpp index c0857474..7c248f82 100644 --- a/decl.cpp +++ b/decl.cpp @@ -531,7 +531,7 @@ Declarator::InitFromType(const Type *baseType, DeclSpecs *ds) { returnType = returnType->ResolveUnboundVariability(Variability::Varying); bool isTask = ds && ((ds->typeQualifiers & TYPEQUAL_TASK) != 0); - if (isTask && g->target->isPTX()) //getISA() == Target::NVPTX64) + if (isTask && g->target->getISA() == Target::NVPTX) { // ds->storageClass = SC_EXTERN_C; ds->typeQualifiers |= TYPEQUAL_UNMASKED; @@ -547,12 +547,11 @@ Declarator::InitFromType(const Type *baseType, DeclSpecs *ds) { "qualifiers"); return; } -// if (!g->target->isPTX()) - if (isExternC && isTask) { - Error(pos, "Function can't have both \"extern \"C\"\" and \"task\" " - "qualifiers"); - return; - } + if (isExternC && isTask) { + Error(pos, "Function can't have both \"extern \"C\"\" and \"task\" " + "qualifiers"); + return; + } if (isExternC && isExported) { Error(pos,
"Function can't have both \"extern \"C\"\" and \"export\" " "qualifiers"); diff --git a/examples_ptx/common_gpu.mk b/examples_ptx/common_gpu.mk index c4628559..e02e5b95 100644 --- a/examples_ptx/common_gpu.mk +++ b/examples_ptx/common_gpu.mk @@ -22,7 +22,7 @@ endif # ISPC=ispc -ISPC_FLAGS=-O3 --math-lib=default --target=nvptx64 --opt=fast-math +ISPC_FLAGS=-O3 --math-lib=default --target=nvptx --opt=fast-math # # # diff --git a/func.cpp b/func.cpp index 0782d724..165c17ba 100644 --- a/func.cpp +++ b/func.cpp @@ -125,7 +125,7 @@ Function::Function(Symbol *s, Stmt *c) { sym->parentFunction = this; } - if (type->isTask) { + if (type->isTask && g->target->getISA() != Target::NVPTX) { threadIndexSym = m->symbolTable->LookupVariable("threadIndex"); Assert(threadIndexSym); threadCountSym = m->symbolTable->LookupVariable("threadCount"); @@ -237,12 +237,122 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function, #endif const FunctionType *type = CastType(sym->type); Assert(type != NULL); + if (type->isTask == true && g->target->getISA() != Target::NVPTX) { + // For tasks, we there should always be three parmeters: the + // pointer to the structure that holds all of the arguments, the + // thread index, and the thread count variables. + llvm::Function::arg_iterator argIter = function->arg_begin(); + llvm::Value *structParamPtr = argIter++; + llvm::Value *threadIndex = argIter++; + llvm::Value *threadCount = argIter++; + llvm::Value *taskIndex = argIter++; + llvm::Value *taskCount = argIter++; + llvm::Value *taskIndex0 = argIter++; + llvm::Value *taskIndex1 = argIter++; + llvm::Value *taskIndex2 = argIter++; + llvm::Value *taskCount0 = argIter++; + llvm::Value *taskCount1 = argIter++; + llvm::Value *taskCount2 = argIter++; + + // Copy the function parameter values from the structure into local + // storage + for (unsigned int i = 0; i < args.size(); ++i) + lCopyInTaskParameter(i, structParamPtr, args, ctx); + + if (type->isUnmasked == false) { + // Copy in the mask as well. + int nArgs = (int)args.size(); + // The mask is the last parameter in the argument structure + llvm::Value *ptr = ctx->AddElementOffset(structParamPtr, nArgs, NULL, + "task_struct_mask"); + llvm::Value *ptrval = ctx->LoadInst(ptr, "mask"); + ctx->SetFunctionMask(ptrval); + } + + // Copy threadIndex and threadCount into stack-allocated storage so + // that their symbols point to something reasonable. + threadIndexSym->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "threadIndex"); + ctx->StoreInst(threadIndex, threadIndexSym->storagePtr); + + threadCountSym->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "threadCount"); + ctx->StoreInst(threadCount, threadCountSym->storagePtr); + + // Copy taskIndex and taskCount into stack-allocated storage so + // that their symbols point to something reasonable. 
+ taskIndexSym->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskIndex"); + ctx->StoreInst(taskIndex, taskIndexSym->storagePtr); + + taskCountSym->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskCount"); + ctx->StoreInst(taskCount, taskCountSym->storagePtr); + + taskIndexSym0->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskIndex0"); + ctx->StoreInst(taskIndex0, taskIndexSym0->storagePtr); + taskIndexSym1->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskIndex1"); + ctx->StoreInst(taskIndex1, taskIndexSym1->storagePtr); + taskIndexSym2->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskIndex2"); + ctx->StoreInst(taskIndex2, taskIndexSym2->storagePtr); + + taskCountSym0->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskCount0"); + ctx->StoreInst(taskCount0, taskCountSym0->storagePtr); + taskCountSym1->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskCount1"); + ctx->StoreInst(taskCount1, taskCountSym1->storagePtr); + taskCountSym2->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskCount2"); + ctx->StoreInst(taskCount2, taskCountSym2->storagePtr); + } + else { + // Regular, non-task function + llvm::Function::arg_iterator argIter = function->arg_begin(); + for (unsigned int i = 0; i < args.size(); ++i, ++argIter) { + Symbol *sym = args[i]; + if (sym == NULL) + // anonymous function parameter + continue; + + argIter->setName(sym->name.c_str()); + + // Allocate stack storage for the parameter and emit code + // to store its value there. + sym->storagePtr = ctx->AllocaInst(argIter->getType(), sym->name.c_str()); + ctx->StoreInst(argIter, sym->storagePtr); + ctx->EmitFunctionParameterDebugInfo(sym, i); + } + + // If the number of actual function arguments is equal to the + // number of declared arguments in decl->functionParams, then we + // don't have a mask parameter, so set it to be all on. This + // happens for example with 'export'ed functions that the app + // calls. + if (argIter == function->arg_end()) { + Assert(type->isUnmasked || type->isExported); + ctx->SetFunctionMask(LLVMMaskAllOn); + } + else { + Assert(type->isUnmasked == false); + + // Otherwise use the mask to set the entry mask value + argIter->setName("__mask"); + Assert(argIter->getType() == LLVMTypes::MaskType); + ctx->SetFunctionMask(argIter); + Assert(++argIter == function->arg_end()); + } + if (type->isTask == true && g->target->getISA() == Target::NVPTX) + { + llvm::NamedMDNode* annotations = + m->module->getOrInsertNamedMetadata("nvvm.annotations"); + llvm::SmallVector av; + av.push_back(function); + av.push_back(llvm::MDString::get(*g->ctx, "kernel")); + av.push_back(LLVMInt32(1)); + annotations->addOperand(llvm::MDNode::get(*g->ctx, av)); + } + } +#if 0 if (type->isTask == true) { // For tasks, we there should always be three parmeters: the // pointer to the structure that holds all of the arguments, the // thread index, and the thread count variables.
- if (!g->target->isPTX()) //if (g->target->getISA() != Target::NVPTX64) + if (g->target->getISA() != Target::NVPTX) { llvm::Function::arg_iterator argIter = function->arg_begin(); llvm::Value *structParamPtr = argIter++; @@ -341,7 +451,7 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function, Assert(type->isUnmasked || type->isExported); ctx->SetFunctionMask(LLVMMaskAllOn); } - else /* for NVPTX64 , function must be unmasked */ + else /* for NVPTX, function must be unmasked */ { assert(0); Assert(type->isUnmasked == false); @@ -353,7 +463,7 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function, Assert(++argIter == function->arg_end()); } - if (g->target->isPTX() && g->target->getISA() == Target::NVPTX64) + if (g->target->getISA() == Target::NVPTX) { llvm::NamedMDNode* annotations = m->module->getOrInsertNamedMetadata("nvvm.annotations"); @@ -402,6 +512,7 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function, Assert(++argIter == function->arg_end()); } } +#endif // Finally, we can generate code for the function if (code != NULL) { @@ -535,26 +646,12 @@ Function::GenerateIR() { } // And we can now go ahead and emit the code - /* export function with NVPTX64 target should be emitted host architecture */ -#if 0 - const FunctionType *func_type= CastType(sym->type); - if (g->target->getISA() == Target::NVPTX64 && func_type->isExported) - return; -#endif - -#if 0 - if (g->target->getISA() != Target::NVPTX64 && g->target->isPTX() && func_type->isTask) - return; -#endif - -// if (!(g->target->getISA()==Target::NVPTX64 && func_type->isExported)) { FunctionEmitContext ec(this, sym, function, firstStmtPos); emitCode(&ec, function, firstStmtPos); } if (m->errorCount == 0) { -// if (!(g->target->getISA() == Target::NVPTX64 && func_type->isExported)) if (llvm::verifyFunction(*function, llvm::ReturnStatusAction) == true) { if (g->debugPrint) function->dump(); @@ -566,18 +663,18 @@ Function::GenerateIR() { // the application can call it const FunctionType *type = CastType(sym->type); Assert(type != NULL); - if (type->isExported) { // && g->target->getISA() != Target::VPTX64) { + if (type->isExported) { if (!type->isTask) { - if (g->target->isPTX() && g->target->getISA() == Target::NVPTX64) - { - llvm::NamedMDNode* annotations = - m->module->getOrInsertNamedMetadata("nvvm.annotations"); - llvm::SmallVector av; - av.push_back(function); - av.push_back(llvm::MDString::get(*g->ctx, "kernel")); - av.push_back(llvm::ConstantInt::get(llvm::IntegerType::get(*g->ctx,32), 1)); - annotations->addOperand(llvm::MDNode::get(*g->ctx, av)); - } + if (g->target->getISA() == Target::NVPTX) + { + llvm::NamedMDNode* annotations = + m->module->getOrInsertNamedMetadata("nvvm.annotations"); + llvm::SmallVector av; + av.push_back(function); + av.push_back(llvm::MDString::get(*g->ctx, "kernel")); + av.push_back(llvm::ConstantInt::get(llvm::IntegerType::get(*g->ctx,32), 1)); + annotations->addOperand(llvm::MDNode::get(*g->ctx, av)); + } llvm::FunctionType *ftype = type->LLVMFunctionType(g->ctx, true); llvm::GlobalValue::LinkageTypes linkage = llvm::GlobalValue::ExternalLinkage; std::string functionName = sym->name; @@ -585,7 +682,7 @@ Function::GenerateIR() { if (g->mangleFunctionsWithTarget) functionName += std::string("_") + g->target->GetISAString(); - if (g->target->getISA() == Target::NVPTX64) + if (g->target->getISA() == Target::NVPTX) functionName += std::string("___export"); llvm::Function *appFunction = llvm::Function::Create(ftype, linkage, functionName.c_str(), 
m->module); @@ -615,7 +712,7 @@ Function::GenerateIR() { FATAL("Function verificication failed"); } } - if (g->target->isPTX() && g->target->getISA() == Target::NVPTX64) + if (g->target->getISA() == Target::NVPTX) { llvm::NamedMDNode* annotations = m->module->getOrInsertNamedMetadata("nvvm.annotations"); diff --git a/ispc.cpp b/ispc.cpp index 97735308..223e7317 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -174,7 +174,7 @@ static const char *supportedCPUs[] = { #endif // LLVM 3.4+ }; -Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, bool isPTX) : +Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : m_target(NULL), m_targetMachine(NULL), #if defined(LLVM_3_1) @@ -184,7 +184,6 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, boo #endif m_valid(false), m_isa(SSE2), - m_isPTX(isPTX), m_arch(""), m_is32Bit(true), m_cpu(""), @@ -212,7 +211,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, boo if (!strcmp(cpu, "core-avx2")) isa = "avx2-i32x8"; else if (!strcmp(cpu, "sm_35")) - isa = "nvptx64"; + isa = "nvptx"; #ifdef ISPC_ARM_ENABLED else if (!strcmp(cpu, "cortex-a9") || !strcmp(cpu, "cortex-a15")) @@ -249,7 +248,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, boo cpu = "cortex-a9"; #endif - if (cpu == NULL && !strcmp(isa, "nvptx64")) + if (cpu == NULL && !strcmp(isa, "nvptx")) cpu = "sm_35"; if (cpu == NULL) { @@ -280,8 +279,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, boo this->m_cpu = cpu; if (arch == NULL) { - if (!strcmp(isa, "nvptx64")) - arch = "nvptx64"; + if (!strcmp(isa, "nvptx")) + arch = "nvptx"; #ifdef ISPC_ARM_ENABLED else if (!strncmp(isa, "neon", 4)) arch = "arm"; @@ -709,10 +708,9 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, boo this->m_maskBitCount = 32; } #endif - else if (!strcasecmp(isa, "nvptx64")) + else if (!strcasecmp(isa, "nvptx")) { - this->m_isa = Target::NVPTX64; - this->m_isPTX = true; + this->m_isa = Target::NVPTX; this->m_nativeVectorWidth = 32; this->m_nativeVectorAlignment = 32; this->m_vectorWidth = 1; @@ -780,7 +778,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, boo dl_string = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-" "i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-" "f80:128:128-n8:16:32:64-S128-v16:16:16-v32:32:32-v4:128:128"; - } else if (m_isa == Target::NVPTX64) + } else if (m_isa == Target::NVPTX) { dl_string = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"; } @@ -803,7 +801,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, boo // Initialize target-specific "target-feature" attribute. 
if (!m_attributes.empty()) { llvm::AttrBuilder attrBuilder; - if (m_isa != Target::NVPTX64) + if (m_isa != Target::NVPTX) attrBuilder.addAttribute("target-cpu", this->m_cpu); attrBuilder.addAttribute("target-features", this->m_attributes); this->m_tf_attributes = new llvm::AttributeSet( @@ -838,7 +836,7 @@ Target::SupportedCPUs() { const char * Target::SupportedArchs() { - return "nvptx64, " + return "nvptx, " #ifdef ISPC_ARM_ENABLED "arm, " #endif @@ -848,7 +846,7 @@ Target::SupportedArchs() { const char * Target::SupportedTargets() { - return "nvptx64, " + return "nvptx, " #ifdef ISPC_ARM_ENABLED "neon-i8x16, neon-i16x8, neon-i32x4, " #endif @@ -866,9 +864,9 @@ Target::SupportedTargets() { std::string Target::GetTripleString() const { llvm::Triple triple; - if (m_arch == "nvptx64") + if (m_arch == "nvptx") { - triple.setTriple("nvptx64"); + triple.setTriple("nvptx"); } #ifdef ISPC_ARM_ENABLED else if (m_arch == "arm") { @@ -902,8 +900,8 @@ Target::GetTripleString() const { const char * Target::ISAToString(ISA isa) { switch (isa) { - case Target::NVPTX64: - return "nvptx64"; + case Target::NVPTX: + return "nvptx"; #ifdef ISPC_ARM_ENABLED case Target::NEON8: return "neon-8"; diff --git a/ispc.h b/ispc.h index d649b6cd..ebef4bb0 100644 --- a/ispc.h +++ b/ispc.h @@ -179,7 +179,7 @@ public: flexible/performant of them will apear last in the enumerant. Note also that __best_available_isa() needs to be updated if ISAs are added or the enumerant values are reordered. */ - enum ISA { NVPTX64, + enum ISA { NVPTX, #ifdef ISPC_ARM_ENABLED NEON32, NEON16, NEON8, #endif @@ -189,7 +189,7 @@ public: /** Initializes the given Target pointer for a target of the given name, if the name is a known target. Returns true if the target was initialized and false if the name is unknown. */ - Target(const char *arch, const char *cpu, const char *isa, bool pic, bool isPTX = false); + Target(const char *arch, const char *cpu, const char *isa, bool pic); /** Returns a comma-delimited string giving the names of the currently supported compilation targets. */ @@ -251,7 +251,6 @@ public: bool isValid() const {return m_valid;} ISA getISA() const {return m_isa;} - bool isPTX() const {return m_isPTX;} std::string getArch() const {return m_arch;} @@ -310,7 +309,6 @@ private: /** Instruction set being compiled to. */ ISA m_isa; - bool m_isPTX; /** Target system architecture. (e.g. "x86-64", "x86"). */ std::string m_arch; diff --git a/module.cpp b/module.cpp index 4ca1b351..a745db29 100644 --- a/module.cpp +++ b/module.cpp @@ -733,7 +733,7 @@ Module::AddFunctionDeclaration(const std::string &name, if (storageClass == SC_EXTERN_C) { // Make sure the user hasn't supplied both an 'extern "C"' and a // 'task' qualifier with the function - if (functionType->isTask) //&& !g->target->isPTX()) //tISA() != Target::NVPTX64) + if (functionType->isTask) { Error(pos, "\"task\" qualifier is illegal with C-linkage extern " "function \"%s\". Ignoring this function.", name.c_str()); @@ -796,8 +796,8 @@ Module::AddFunctionDeclaration(const std::string &name, #else // LLVM 3.1 and 3.3+ function->addFnAttr(llvm::Attribute::AlwaysInline); #endif - /* evghenii: on PTX target this must not be used, cause crash, dunno why */ - if (functionType->isTask && g->target->getISA() != Target::NVPTX64) + /* evghenii: on PTX target the following must not be set ... why ?!? */ + if (functionType->isTask && g->target->getISA() != Target::NVPTX) // This also applies transitively to members I think? 
#if defined(LLVM_3_1) function->setDoesNotAlias(1, true); @@ -953,7 +953,7 @@ Module::writeOutput(OutputType outputType, const char *outFileName, const char *fileType = NULL; switch (outputType) { case Asm: - if (g->target->getISA() != Target::NVPTX64) + if (g->target->getISA() != Target::NVPTX) { if (strcasecmp(suffix, "s")) fileType = "assembly"; @@ -1053,7 +1053,7 @@ Module::writeBitcode(llvm::Module *module, const char *outFileName) { } llvm::raw_fd_ostream fos(fd, (fd != 1), false); - if (g->target->getISA() == Target::NVPTX64) + if (g->target->getISA() == Target::NVPTX) { const std::string dl_string = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"; module->setDataLayout(dl_string); @@ -1925,7 +1925,7 @@ Module::execPreprocessor(const char *infilename, llvm::raw_string_ostream *ostre opts.addMacroDef(g->cppArgs[i].substr(2)); } } - if (g->target->getISA() == Target::NVPTX64) + if (g->target->getISA() == Target::NVPTX) { opts.addMacroDef("__NVPTX__"); opts.addMacroDef("programIndex=laneIndex()"); @@ -2331,135 +2331,9 @@ Module::CompileAndOutput(const char *srcFile, const char *hostStubFileName, const char *devStubFileName) { - char ptxname[] = "nvptx64"; - for (int k = 0; k < 7; k++) - ptxname[k] = target[k]; - if (0) //target != NULL && strcmp(ptxname,"nvptx64") == 0) // NVPTX64 - { - std::vector targets = lExtractTargets(target); - Assert(targets.size() > 1); - // We're only compiling to a single target - int errorCount = 0; - - const char *suffix_orig = strrchr(outFileName, '.'); - ++suffix_orig; - assert(suffix_orig!=NULL); - - g->PtxString = std::string(); - - for (int itarget = 0; itarget < 1; itarget++) - { - fprintf(stderr, "compiling nvptx64 : target= %s\n",targets[itarget].c_str()); - g->target = new Target(arch, cpu, targets[itarget].c_str(), generatePIC, /* isPTX= */ true); - if (!g->target->isValid()) - return 1; - - m = new Module(srcFile); - if (m->CompileFile() == 0) { - if (outputType == CXX) { - if (target == NULL || strncmp(target, "generic-", 8) != 0) { - Error(SourcePos(), "When generating C++ output, one of the \"generic-*\" " - "targets must be used."); - return 1; - } - } - else if (outputType == Asm || outputType == Object) { - if (target != NULL && strncmp(target, "generic-", 8) == 0) { - Error(SourcePos(), "When using a \"generic-*\" compilation target, " - "%s output can not be used.", - (outputType == Asm) ? 
"assembly" : "object file"); - return 1; - } - } - - assert(outFileName != NULL); - - std::string targetOutFileName = - lGetTargetFileName(outFileName, targets[itarget].c_str()); - if (outputType == Asm) - { - const char * targetOutFileName_c = targetOutFileName.c_str(); - const int suffix = strrchr(targetOutFileName_c, '.') - targetOutFileName_c + 1; - if (itarget == 1 && !strcasecmp(suffix_orig, "ptx")) - { - targetOutFileName[suffix ] = 's'; - targetOutFileName[suffix+1] = 0; - } - } - - if (outputType != Object) - { - if (!m->writeOutput(outputType, targetOutFileName.c_str(), includeFileName)) - return 1; - } - else if (itarget > 0) - { - if (!m->writeOutput(outputType, outFileName, includeFileName)) - return 1; - } - - if (itarget == 0) - { /* store ptx into memory */ - llvm::PassManager pm; -#if defined(LLVM_3_1) - pm.add(new llvm::TargetData(*g->target->getDataLayout())); -#else - pm.add(new llvm::DataLayout(*g->target->getDataLayout())); -#endif - - llvm::raw_string_ostream rso(g->PtxString); - llvm::formatted_raw_ostream fos(rso); - - llvm::TargetMachine::CodeGenFileType fileType = llvm::TargetMachine::CGFT_AssemblyFile; - llvm::TargetMachine *targetMachine = g->target->GetTargetMachine(); - if (targetMachine->addPassesToEmitFile(pm, fos, fileType)) { - fprintf(stderr, "Fatal error adding passes to emit object file!"); - exit(1); - } - - llvm::Module *module = m->module; - pm.run(*module); - fos.flush(); - assert(!g->PtxString.empty()); -#if 0 - std::cout << g->PtxString << std::endl; -#endif - } - - - if (itarget > 0) - { - if (headerFileName != NULL) - if (!m->writeOutput(Module::Header, headerFileName)) - return 1; - if (depsFileName != NULL) - if (!m->writeOutput(Module::Deps,depsFileName)) - return 1; - if (hostStubFileName != NULL) - if (!m->writeOutput(Module::HostStub,hostStubFileName)) - return 1; - if (devStubFileName != NULL) - if (!m->writeOutput(Module::DevStub,devStubFileName)) - return 1; - } - } - else - ++m->errorCount; - - errorCount += m->errorCount; - delete m; - m = NULL; - - delete g->target; - g->target = NULL; - - } - return errorCount > 0; - } - else if (target == NULL || strchr(target, ',') == NULL) { + if (target == NULL || strchr(target, ',') == NULL) { // We're only compiling to a single target - const bool isPTX = strcmp(target, "nvptx64") == 0; - g->target = new Target(arch, cpu, target, generatePIC, isPTX); + g->target = new Target(arch, cpu, target, generatePIC); if (!g->target->isValid()) return 1; @@ -2525,8 +2399,6 @@ Module::CompileAndOutput(const char *srcFile, // The user supplied multiple targets std::vector targets = lExtractTargets(target); Assert(targets.size() > 1); - for (unsigned int i = 0; i < targets.size(); ++i) - assert(strcmp(targets[i].c_str(), "nvptx64") < 0); if (outFileName != NULL && strcmp(outFileName, "-") == 0) { Error(SourcePos(), "Multi-target compilation can't generate output " diff --git a/stmt.cpp b/stmt.cpp index 67d0d96a..05d84a93 100644 --- a/stmt.cpp +++ b/stmt.cpp @@ -206,7 +206,7 @@ DeclStmt::EmitCode(FunctionEmitContext *ctx) const { } if (sym->storageClass == SC_STATIC) { - if (g->target->getISA() == Target::NVPTX64) + if (g->target->getISA() == Target::NVPTX) if (!sym->type->IsConstType()) Error(initExpr->pos, "Non-constant static variable ""\"%s\" is not supported with ""\"cuda\" target.", sym->name.c_str()); @@ -1280,7 +1280,7 @@ lUpdateVaryingCounter(int dim, int nDims, FunctionEmitContext *ctx, llvm::Value *varyingCounterPtr, const std::vector &spans) { - if (!g->target->isPTX()) + if (g->target->getISA() 
!= Target::NVPTX) { // Smear the uniform counter value out to be varying llvm::Value *counter = ctx->LoadInst(uniformCounterPtr); @@ -1315,7 +1315,7 @@ lUpdateVaryingCounter(int dim, int nDims, FunctionEmitContext *ctx, ctx->StoreInst(varyingCounter, varyingCounterPtr); return varyingCounter; } - else /* isPTX() == true */ + else /* NVPTX == true */ { // Smear the uniform counter value out to be varying llvm::Value *counter = ctx->LoadInst(uniformCounterPtr); @@ -1465,921 +1465,458 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const { if (ctx->GetCurrentBasicBlock() == NULL || stmts == NULL) return; - if (!g->target->isPTX()) - { - llvm::BasicBlock *bbFullBody = ctx->CreateBasicBlock("foreach_full_body"); - llvm::BasicBlock *bbMaskedBody = ctx->CreateBasicBlock("foreach_masked_body"); - llvm::BasicBlock *bbExit = ctx->CreateBasicBlock("foreach_exit"); + llvm::BasicBlock *bbFullBody = ctx->CreateBasicBlock("foreach_full_body"); + llvm::BasicBlock *bbMaskedBody = ctx->CreateBasicBlock("foreach_masked_body"); + llvm::BasicBlock *bbExit = ctx->CreateBasicBlock("foreach_exit"); - llvm::Value *oldMask = ctx->GetInternalMask(); - llvm::Value *oldFunctionMask = ctx->GetFunctionMask(); + llvm::Value *oldMask = ctx->GetInternalMask(); + llvm::Value *oldFunctionMask = ctx->GetFunctionMask(); - ctx->SetDebugPos(pos); - ctx->StartScope(); + ctx->SetDebugPos(pos); + ctx->StartScope(); - ctx->SetInternalMask(LLVMMaskAllOn); - ctx->SetFunctionMask(LLVMMaskAllOn); + ctx->SetInternalMask(LLVMMaskAllOn); + ctx->SetFunctionMask(LLVMMaskAllOn); - // This should be caught during typechecking - AssertPos(pos, startExprs.size() == dimVariables.size() && - endExprs.size() == dimVariables.size()); - int nDims = (int)dimVariables.size(); + // This should be caught during typechecking + AssertPos(pos, startExprs.size() == dimVariables.size() && + endExprs.size() == dimVariables.size()); + int nDims = (int)dimVariables.size(); + + /////////////////////////////////////////////////////////////////////// + // Setup: compute the number of items we have to work on in each + // dimension and a number of derived values. + std::vector bbReset, bbStep, bbTest; + std::vector startVals, endVals, uniformCounterPtrs; + std::vector nExtras, alignedEnd, extrasMaskPtrs; + + std::vector span(nDims, 0); + const int vectorWidth = + g->target->getISA() == Target::NVPTX ? 32 : g->target->getVectorWidth(); + lGetSpans(nDims-1, nDims, vectorWidth, isTiled, &span[0]); + + for (int i = 0; i < nDims; ++i) { + // Basic blocks that we'll fill in later with the looping logic for + // this dimension. + bbReset.push_back(ctx->CreateBasicBlock("foreach_reset")); + if (i < nDims-1) + // stepping for the innermost dimension is handled specially + bbStep.push_back(ctx->CreateBasicBlock("foreach_step")); + bbTest.push_back(ctx->CreateBasicBlock("foreach_test")); + + // Start and end value for this loop dimension + llvm::Value *sv = startExprs[i]->GetValue(ctx); + llvm::Value *ev = endExprs[i]->GetValue(ctx); + if (sv == NULL || ev == NULL) + return; + startVals.push_back(sv); + endVals.push_back(ev); + + // nItems = endVal - startVal + llvm::Value *nItems = + ctx->BinaryOperator(llvm::Instruction::Sub, ev, sv, "nitems"); + + // nExtras = nItems % (span for this dimension) + // This gives us the number of extra elements we need to deal with + // at the end of the loop for this dimension that don't fit cleanly + // into a vector width. 
+ nExtras.push_back(ctx->BinaryOperator(llvm::Instruction::SRem, nItems, + LLVMInt32(span[i]), "nextras")); + + // alignedEnd = endVal - nExtras + alignedEnd.push_back(ctx->BinaryOperator(llvm::Instruction::Sub, ev, + nExtras[i], "aligned_end")); /////////////////////////////////////////////////////////////////////// - // Setup: compute the number of items we have to work on in each - // dimension and a number of derived values. - std::vector bbReset, bbStep, bbTest; - std::vector startVals, endVals, uniformCounterPtrs; - std::vector nExtras, alignedEnd, extrasMaskPtrs; + // Each dimension has a loop counter that is a uniform value that + // goes from startVal to endVal, in steps of the span for this + // dimension. Its value is only used internally here for looping + // logic and isn't directly available in the user's program code. + uniformCounterPtrs.push_back(ctx->AllocaInst(LLVMTypes::Int32Type, + "counter")); + ctx->StoreInst(startVals[i], uniformCounterPtrs[i]); - std::vector span(nDims, 0); - lGetSpans(nDims-1, nDims, g->target->getVectorWidth(), isTiled, &span[0]); + // There is also a varying variable that holds the set of index + // values for each dimension in the current loop iteration; this is + // the value that is program-visible. + dimVariables[i]->storagePtr = + ctx->AllocaInst(LLVMTypes::Int32VectorType, + dimVariables[i]->name.c_str()); + dimVariables[i]->parentFunction = ctx->GetFunction(); + ctx->EmitVariableDebugInfo(dimVariables[i]); - for (int i = 0; i < nDims; ++i) { - // Basic blocks that we'll fill in later with the looping logic for - // this dimension. - bbReset.push_back(ctx->CreateBasicBlock("foreach_reset")); - if (i < nDims-1) - // stepping for the innermost dimension is handled specially - bbStep.push_back(ctx->CreateBasicBlock("foreach_step")); - bbTest.push_back(ctx->CreateBasicBlock("foreach_test")); - - // Start and end value for this loop dimension - llvm::Value *sv = startExprs[i]->GetValue(ctx); - llvm::Value *ev = endExprs[i]->GetValue(ctx); - if (sv == NULL || ev == NULL) - return; - startVals.push_back(sv); - endVals.push_back(ev); - - // nItems = endVal - startVal - llvm::Value *nItems = - ctx->BinaryOperator(llvm::Instruction::Sub, ev, sv, "nitems"); - - // nExtras = nItems % (span for this dimension) - // This gives us the number of extra elements we need to deal with - // at the end of the loop for this dimension that don't fit cleanly - // into a vector width. - nExtras.push_back(ctx->BinaryOperator(llvm::Instruction::SRem, nItems, - LLVMInt32(span[i]), "nextras")); - - // alignedEnd = endVal - nExtras - alignedEnd.push_back(ctx->BinaryOperator(llvm::Instruction::Sub, ev, - nExtras[i], "aligned_end")); - - /////////////////////////////////////////////////////////////////////// - // Each dimension has a loop counter that is a uniform value that - // goes from startVal to endVal, in steps of the span for this - // dimension. Its value is only used internally here for looping - // logic and isn't directly available in the user's program code. - uniformCounterPtrs.push_back(ctx->AllocaInst(LLVMTypes::Int32Type, - "counter")); - ctx->StoreInst(startVals[i], uniformCounterPtrs[i]); - - // There is also a varying variable that holds the set of index - // values for each dimension in the current loop iteration; this is - // the value that is program-visible. 
- dimVariables[i]->storagePtr = - ctx->AllocaInst(LLVMTypes::Int32VectorType, - dimVariables[i]->name.c_str()); - dimVariables[i]->parentFunction = ctx->GetFunction(); - ctx->EmitVariableDebugInfo(dimVariables[i]); - - // Each dimension also maintains a mask that represents which of - // the varying elements in the current iteration should be - // processed. (i.e. this is used to disable the lanes that have - // out-of-bounds offsets.) - extrasMaskPtrs.push_back(ctx->AllocaInst(LLVMTypes::MaskType, "extras mask")); - ctx->StoreInst(LLVMMaskAllOn, extrasMaskPtrs[i]); - } - - ctx->StartForeach(FunctionEmitContext::FOREACH_REGULAR); - - // On to the outermost loop's test - ctx->BranchInst(bbTest[0]); - - /////////////////////////////////////////////////////////////////////////// - // foreach_reset: this code runs when we need to reset the counter for - // a given dimension in preparation for running through its loop again, - // after the enclosing level advances its counter. - for (int i = 0; i < nDims; ++i) { - ctx->SetCurrentBasicBlock(bbReset[i]); - if (i == 0) - ctx->BranchInst(bbExit); - else { - ctx->StoreInst(LLVMMaskAllOn, extrasMaskPtrs[i]); - ctx->StoreInst(startVals[i], uniformCounterPtrs[i]); - ctx->BranchInst(bbStep[i-1]); - } - } - - /////////////////////////////////////////////////////////////////////////// - // foreach_step: increment the uniform counter by the vector width. - // Note that we don't increment the varying counter here as well but - // just generate its value when we need it in the loop body. Don't do - // this for the innermost dimension, which has a more complex stepping - // structure.. - for (int i = 0; i < nDims-1; ++i) { - ctx->SetCurrentBasicBlock(bbStep[i]); - llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[i]); - llvm::Value *newCounter = - ctx->BinaryOperator(llvm::Instruction::Add, counter, - LLVMInt32(span[i]), "new_counter"); - ctx->StoreInst(newCounter, uniformCounterPtrs[i]); - ctx->BranchInst(bbTest[i]); - } - - /////////////////////////////////////////////////////////////////////////// - // foreach_test (for all dimensions other than the innermost...) - std::vector inExtras; - for (int i = 0; i < nDims-1; ++i) { - ctx->SetCurrentBasicBlock(bbTest[i]); - - llvm::Value *haveExtras = - ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SGT, - endVals[i], alignedEnd[i], "have_extras"); - - llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[i], "counter"); - llvm::Value *atAlignedEnd = - ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, - counter, alignedEnd[i], "at_aligned_end"); - llvm::Value *inEx = - ctx->BinaryOperator(llvm::Instruction::And, haveExtras, - atAlignedEnd, "in_extras"); - - if (i == 0) - inExtras.push_back(inEx); - else - inExtras.push_back(ctx->BinaryOperator(llvm::Instruction::Or, inEx, - inExtras[i-1], "in_extras_all")); - - llvm::Value *varyingCounter = - lUpdateVaryingCounter(i, nDims, ctx, uniformCounterPtrs[i], - dimVariables[i]->storagePtr, span); - - llvm::Value *smearEnd = ctx->BroadcastValue( - endVals[i], LLVMTypes::Int32VectorType, "smear_end"); - - // Do a vector compare of its value to the end value to generate a - // mask for this last bit of work. 
- llvm::Value *emask = - ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, - varyingCounter, smearEnd); - emask = ctx->I1VecToBoolVec(emask); - - if (i == 0) - ctx->StoreInst(emask, extrasMaskPtrs[i]); - else { - llvm::Value *oldMask = ctx->LoadInst(extrasMaskPtrs[i-1]); - llvm::Value *newMask = - ctx->BinaryOperator(llvm::Instruction::And, oldMask, emask, - "extras_mask"); - ctx->StoreInst(newMask, extrasMaskPtrs[i]); - } - - llvm::Value *notAtEnd = - ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, - counter, endVals[i]); - ctx->BranchInst(bbTest[i+1], bbReset[i], notAtEnd); - } - - /////////////////////////////////////////////////////////////////////////// - // foreach_test (for innermost dimension) - // - // All of the outer dimensions are handled generically--basically as a - // for() loop from the start value to the end value, where at each loop - // test, we compute the mask of active elements for the current - // dimension and then update an overall mask that is the AND - // combination of all of the outer ones. - // - // The innermost loop is handled specially, for performance purposes. - // When starting the innermost dimension, we start by checking once - // whether any of the outer dimensions has set the mask to be - // partially-active or not. We follow different code paths for these - // two cases, taking advantage of the knowledge that the mask is all - // on, when this is the case. - // - // In each of these code paths, we start with a loop from the starting - // value to the aligned end value for the innermost dimension; we can - // guarantee that the innermost loop will have an "all on" mask (as far - // as its dimension is concerned) for the duration of this loop. Doing - // so allows us to emit code that assumes the mask is all on (for the - // case where none of the outer dimensions has set the mask to be - // partially on), or allows us to emit code that just uses the mask - // from the outer dimensions directly (for the case where they have). - // - // After this loop, we just need to deal with one vector's worth of - // "ragged extra bits", where the mask used includes the effect of the - // mask for the innermost dimension. - // - // We start out this process by emitting the check that determines - // whether any of the enclosing dimensions is partially active - // (i.e. processing extra elements that don't exactly fit into a - // vector). - llvm::BasicBlock *bbOuterInExtras = - ctx->CreateBasicBlock("outer_in_extras"); - llvm::BasicBlock *bbOuterNotInExtras = - ctx->CreateBasicBlock("outer_not_in_extras"); - - ctx->SetCurrentBasicBlock(bbTest[nDims-1]); - if (inExtras.size()) - ctx->BranchInst(bbOuterInExtras, bbOuterNotInExtras, - inExtras.back()); - else - // for a 1D iteration domain, we certainly don't have any enclosing - // dimensions that are processing extra elements. - ctx->BranchInst(bbOuterNotInExtras); - - /////////////////////////////////////////////////////////////////////////// - // One or more outer dimensions in extras, so we need to mask for the loop - // body regardless. 
We break this into two cases, roughly: - // for (counter = start; counter < alignedEnd; counter += step) { - // // mask is all on for inner, so set mask to outer mask - // // run loop body with mask - // } - // // counter == alignedEnd - // if (counter < end) { - // // set mask to outermask & (counter+programCounter < end) - // // run loop body with mask - // } - llvm::BasicBlock *bbAllInnerPartialOuter = - ctx->CreateBasicBlock("all_inner_partial_outer"); - llvm::BasicBlock *bbPartial = - ctx->CreateBasicBlock("both_partial"); - ctx->SetCurrentBasicBlock(bbOuterInExtras); { - // Update the varying counter value here, since all subsequent - // blocks along this path need it. - lUpdateVaryingCounter(nDims-1, nDims, ctx, uniformCounterPtrs[nDims-1], - dimVariables[nDims-1]->storagePtr, span); - - // here we just check to see if counter < alignedEnd - llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1], "counter"); - llvm::Value *beforeAlignedEnd = - ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, - counter, alignedEnd[nDims-1], "before_aligned_end"); - ctx->BranchInst(bbAllInnerPartialOuter, bbPartial, beforeAlignedEnd); - } - - // Below we have a basic block that runs the loop body code for the - // case where the mask is partially but not fully on. This same block - // runs in multiple cases: both for handling any ragged extra data for - // the innermost dimension but also when outer dimensions have set the - // mask to be partially on. - // - // The value stored in stepIndexAfterMaskedBodyPtr is used after each - // execution of the body code to determine whether the innermost index - // value should be incremented by the step (we're running the "for" - // loop of full vectors at the innermost dimension, with outer - // dimensions having set the mask to be partially on), or whether we're - // running once for the ragged extra bits at the end of the innermost - // dimension, in which case we're done with the innermost dimension and - // should step the loop counter for the next enclosing dimension - // instead. - llvm::Value *stepIndexAfterMaskedBodyPtr = - ctx->AllocaInst(LLVMTypes::BoolType, "step_index"); - - /////////////////////////////////////////////////////////////////////////// - // We're in the inner loop part where the only masking is due to outer - // dimensions but the innermost dimension fits fully into a vector's - // width. Set the mask and jump to the masked loop body. 
- ctx->SetCurrentBasicBlock(bbAllInnerPartialOuter); { - llvm::Value *mask; - if (nDims == 1) - // 1D loop; we shouldn't ever get here anyway - mask = LLVMMaskAllOff; - else - mask = ctx->LoadInst(extrasMaskPtrs[nDims-2]); - - ctx->SetInternalMask(mask); - - ctx->StoreInst(LLVMTrue, stepIndexAfterMaskedBodyPtr); - ctx->BranchInst(bbMaskedBody); - } - - /////////////////////////////////////////////////////////////////////////// - // We need to include the effect of the innermost dimension in the mask - // for the final bits here - ctx->SetCurrentBasicBlock(bbPartial); { - llvm::Value *varyingCounter = - ctx->LoadInst(dimVariables[nDims-1]->storagePtr); - llvm::Value *smearEnd = ctx->BroadcastValue( - endVals[nDims-1], LLVMTypes::Int32VectorType, "smear_end"); - - llvm::Value *emask = - ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, - varyingCounter, smearEnd); - emask = ctx->I1VecToBoolVec(emask); - - if (nDims == 1) { - ctx->SetInternalMask(emask); - } - else { - llvm::Value *oldMask = ctx->LoadInst(extrasMaskPtrs[nDims-2]); - llvm::Value *newMask = - ctx->BinaryOperator(llvm::Instruction::And, oldMask, emask, - "extras_mask"); - ctx->SetInternalMask(newMask); - } - - ctx->StoreInst(LLVMFalse, stepIndexAfterMaskedBodyPtr); - ctx->BranchInst(bbMaskedBody); - } - - /////////////////////////////////////////////////////////////////////////// - // None of the outer dimensions is processing extras; along the lines - // of above, we can express this as: - // for (counter = start; counter < alignedEnd; counter += step) { - // // mask is all on - // // run loop body with mask all on - // } - // // counter == alignedEnd - // if (counter < end) { - // // set mask to (counter+programCounter < end) - // // run loop body with mask - // } - llvm::BasicBlock *bbPartialInnerAllOuter = - ctx->CreateBasicBlock("partial_inner_all_outer"); - ctx->SetCurrentBasicBlock(bbOuterNotInExtras); { - llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1], "counter"); - llvm::Value *beforeAlignedEnd = - ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, - counter, alignedEnd[nDims-1], "before_aligned_end"); - ctx->BranchInst(bbFullBody, bbPartialInnerAllOuter, - beforeAlignedEnd); - } - - /////////////////////////////////////////////////////////////////////////// - // full_body: do a full vector's worth of work. We know that all - // lanes will be running here, so we explicitly set the mask to be 'all - // on'. This ends up being relatively straightforward: just update the - // value of the varying loop counter and have the statements in the - // loop body emit their code. 
- llvm::BasicBlock *bbFullBodyContinue = - ctx->CreateBasicBlock("foreach_full_continue"); - ctx->SetCurrentBasicBlock(bbFullBody); { - ctx->SetInternalMask(LLVMMaskAllOn); - ctx->SetBlockEntryMask(LLVMMaskAllOn); - lUpdateVaryingCounter(nDims-1, nDims, ctx, uniformCounterPtrs[nDims-1], - dimVariables[nDims-1]->storagePtr, span); - ctx->SetContinueTarget(bbFullBodyContinue); - ctx->AddInstrumentationPoint("foreach loop body (all on)"); - stmts->EmitCode(ctx); - AssertPos(pos, ctx->GetCurrentBasicBlock() != NULL); - ctx->BranchInst(bbFullBodyContinue); - } - ctx->SetCurrentBasicBlock(bbFullBodyContinue); { - ctx->RestoreContinuedLanes(); - llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1]); - llvm::Value *newCounter = - ctx->BinaryOperator(llvm::Instruction::Add, counter, - LLVMInt32(span[nDims-1]), "new_counter"); - ctx->StoreInst(newCounter, uniformCounterPtrs[nDims-1]); - ctx->BranchInst(bbOuterNotInExtras); - } - - /////////////////////////////////////////////////////////////////////////// - // We're done running blocks with the mask all on; see if the counter is - // less than the end value, in which case we need to run the body one - // more time to get the extra bits. - llvm::BasicBlock *bbSetInnerMask = - ctx->CreateBasicBlock("partial_inner_only"); - ctx->SetCurrentBasicBlock(bbPartialInnerAllOuter); { - llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1], "counter"); - llvm::Value *beforeFullEnd = - ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, - counter, endVals[nDims-1], "before_full_end"); - ctx->BranchInst(bbSetInnerMask, bbReset[nDims-1], beforeFullEnd); - } - - /////////////////////////////////////////////////////////////////////////// - // The outer dimensions are all on, so the mask is just given by the - // mask for the innermost dimension - ctx->SetCurrentBasicBlock(bbSetInnerMask); { - llvm::Value *varyingCounter = - lUpdateVaryingCounter(nDims-1, nDims, ctx, uniformCounterPtrs[nDims-1], - dimVariables[nDims-1]->storagePtr, span); - llvm::Value *smearEnd = ctx->BroadcastValue( - endVals[nDims-1], LLVMTypes::Int32VectorType, "smear_end"); - llvm::Value *emask = - ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, - varyingCounter, smearEnd); - emask = ctx->I1VecToBoolVec(emask); - ctx->SetInternalMask(emask); - ctx->SetBlockEntryMask(emask); - - ctx->StoreInst(LLVMFalse, stepIndexAfterMaskedBodyPtr); - ctx->BranchInst(bbMaskedBody); - } - - /////////////////////////////////////////////////////////////////////////// - // masked_body: set the mask and have the statements emit their - // code again. Note that it's generally worthwhile having two copies - // of the statements' code, since the code above is emitted with the - // mask known to be all-on, which in turn leads to more efficient code - // for that case. 
- llvm::BasicBlock *bbStepInnerIndex = - ctx->CreateBasicBlock("step_inner_index"); - llvm::BasicBlock *bbMaskedBodyContinue = - ctx->CreateBasicBlock("foreach_masked_continue"); - ctx->SetCurrentBasicBlock(bbMaskedBody); { - ctx->AddInstrumentationPoint("foreach loop body (masked)"); - ctx->SetContinueTarget(bbMaskedBodyContinue); - ctx->DisableGatherScatterWarnings(); - ctx->SetBlockEntryMask(ctx->GetFullMask()); - stmts->EmitCode(ctx); - ctx->EnableGatherScatterWarnings(); - ctx->BranchInst(bbMaskedBodyContinue); - } - ctx->SetCurrentBasicBlock(bbMaskedBodyContinue); { - ctx->RestoreContinuedLanes(); - llvm::Value *stepIndex = ctx->LoadInst(stepIndexAfterMaskedBodyPtr); - ctx->BranchInst(bbStepInnerIndex, bbReset[nDims-1], stepIndex); - } - - /////////////////////////////////////////////////////////////////////////// - // step the innermost index, for the case where we're doing the - // innermost for loop over full vectors. - ctx->SetCurrentBasicBlock(bbStepInnerIndex); { - llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1]); - llvm::Value *newCounter = - ctx->BinaryOperator(llvm::Instruction::Add, counter, - LLVMInt32(span[nDims-1]), "new_counter"); - ctx->StoreInst(newCounter, uniformCounterPtrs[nDims-1]); - ctx->BranchInst(bbOuterInExtras); - } - - /////////////////////////////////////////////////////////////////////////// - // foreach_exit: All done. Restore the old mask and clean up - ctx->SetCurrentBasicBlock(bbExit); - - ctx->SetInternalMask(oldMask); - ctx->SetFunctionMask(oldFunctionMask); - - ctx->EndForeach(); - ctx->EndScope(); + // Each dimension also maintains a mask that represents which of + // the varying elements in the current iteration should be + // processed. (i.e. this is used to disable the lanes that have + // out-of-bounds offsets.) + extrasMaskPtrs.push_back(ctx->AllocaInst(LLVMTypes::MaskType, "extras mask")); + ctx->StoreInst(LLVMMaskAllOn, extrasMaskPtrs[i]); } - else /* isPTX() == true */ - { - llvm::BasicBlock *bbFullBody = ctx->CreateBasicBlock("foreach_full_body"); - llvm::BasicBlock *bbMaskedBody = ctx->CreateBasicBlock("foreach_masked_body"); - llvm::BasicBlock *bbExit = ctx->CreateBasicBlock("foreach_exit"); - llvm::Value *oldMask = ctx->GetInternalMask(); - llvm::Value *oldFunctionMask = ctx->GetFunctionMask(); + ctx->StartForeach(FunctionEmitContext::FOREACH_REGULAR); - ctx->SetDebugPos(pos); - ctx->StartScope(); + // On to the outermost loop's test + ctx->BranchInst(bbTest[0]); + /////////////////////////////////////////////////////////////////////////// + // foreach_reset: this code runs when we need to reset the counter for + // a given dimension in preparation for running through its loop again, + // after the enclosing level advances its counter. + for (int i = 0; i < nDims; ++i) { + ctx->SetCurrentBasicBlock(bbReset[i]); + if (i == 0) + ctx->BranchInst(bbExit); + else { + ctx->StoreInst(LLVMMaskAllOn, extrasMaskPtrs[i]); + ctx->StoreInst(startVals[i], uniformCounterPtrs[i]); + ctx->BranchInst(bbStep[i-1]); + } + } + + /////////////////////////////////////////////////////////////////////////// + // foreach_step: increment the uniform counter by the vector width. + // Note that we don't increment the varying counter here as well but + // just generate its value when we need it in the loop body. Don't do + // this for the innermost dimension, which has a more complex stepping + // structure.. 
+ for (int i = 0; i < nDims-1; ++i) { + ctx->SetCurrentBasicBlock(bbStep[i]); + llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[i]); + llvm::Value *newCounter = + ctx->BinaryOperator(llvm::Instruction::Add, counter, + LLVMInt32(span[i]), "new_counter"); + ctx->StoreInst(newCounter, uniformCounterPtrs[i]); + ctx->BranchInst(bbTest[i]); + } + + /////////////////////////////////////////////////////////////////////////// + // foreach_test (for all dimensions other than the innermost...) + std::vector inExtras; + for (int i = 0; i < nDims-1; ++i) { + ctx->SetCurrentBasicBlock(bbTest[i]); + + llvm::Value *haveExtras = + ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SGT, + endVals[i], alignedEnd[i], "have_extras"); + + llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[i], "counter"); + llvm::Value *atAlignedEnd = + ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, + counter, alignedEnd[i], "at_aligned_end"); + llvm::Value *inEx = + ctx->BinaryOperator(llvm::Instruction::And, haveExtras, + atAlignedEnd, "in_extras"); + + if (i == 0) + inExtras.push_back(inEx); + else + inExtras.push_back(ctx->BinaryOperator(llvm::Instruction::Or, inEx, + inExtras[i-1], "in_extras_all")); + + llvm::Value *varyingCounter = + lUpdateVaryingCounter(i, nDims, ctx, uniformCounterPtrs[i], + dimVariables[i]->storagePtr, span); + + llvm::Value *smearEnd = ctx->BroadcastValue( + endVals[i], LLVMTypes::Int32VectorType, "smear_end"); + + // Do a vector compare of its value to the end value to generate a + // mask for this last bit of work. + llvm::Value *emask = + ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, + varyingCounter, smearEnd); + emask = ctx->I1VecToBoolVec(emask); + + if (i == 0) + ctx->StoreInst(emask, extrasMaskPtrs[i]); + else { + llvm::Value *oldMask = ctx->LoadInst(extrasMaskPtrs[i-1]); + llvm::Value *newMask = + ctx->BinaryOperator(llvm::Instruction::And, oldMask, emask, + "extras_mask"); + ctx->StoreInst(newMask, extrasMaskPtrs[i]); + } + + llvm::Value *notAtEnd = + ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, + counter, endVals[i]); + ctx->BranchInst(bbTest[i+1], bbReset[i], notAtEnd); + } + + /////////////////////////////////////////////////////////////////////////// + // foreach_test (for innermost dimension) + // + // All of the outer dimensions are handled generically--basically as a + // for() loop from the start value to the end value, where at each loop + // test, we compute the mask of active elements for the current + // dimension and then update an overall mask that is the AND + // combination of all of the outer ones. + // + // The innermost loop is handled specially, for performance purposes. + // When starting the innermost dimension, we start by checking once + // whether any of the outer dimensions has set the mask to be + // partially-active or not. We follow different code paths for these + // two cases, taking advantage of the knowledge that the mask is all + // on, when this is the case. + // + // In each of these code paths, we start with a loop from the starting + // value to the aligned end value for the innermost dimension; we can + // guarantee that the innermost loop will have an "all on" mask (as far + // as its dimension is concerned) for the duration of this loop. 
Doing + // so allows us to emit code that assumes the mask is all on (for the + // case where none of the outer dimensions has set the mask to be + // partially on), or allows us to emit code that just uses the mask + // from the outer dimensions directly (for the case where they have). + // + // After this loop, we just need to deal with one vector's worth of + // "ragged extra bits", where the mask used includes the effect of the + // mask for the innermost dimension. + // + // We start out this process by emitting the check that determines + // whether any of the enclosing dimensions is partially active + // (i.e. processing extra elements that don't exactly fit into a + // vector). + llvm::BasicBlock *bbOuterInExtras = + ctx->CreateBasicBlock("outer_in_extras"); + llvm::BasicBlock *bbOuterNotInExtras = + ctx->CreateBasicBlock("outer_not_in_extras"); + + ctx->SetCurrentBasicBlock(bbTest[nDims-1]); + if (inExtras.size()) + ctx->BranchInst(bbOuterInExtras, bbOuterNotInExtras, + inExtras.back()); + else + // for a 1D iteration domain, we certainly don't have any enclosing + // dimensions that are processing extra elements. + ctx->BranchInst(bbOuterNotInExtras); + + /////////////////////////////////////////////////////////////////////////// + // One or more outer dimensions in extras, so we need to mask for the loop + // body regardless. We break this into two cases, roughly: + // for (counter = start; counter < alignedEnd; counter += step) { + // // mask is all on for inner, so set mask to outer mask + // // run loop body with mask + // } + // // counter == alignedEnd + // if (counter < end) { + // // set mask to outermask & (counter+programCounter < end) + // // run loop body with mask + // } + llvm::BasicBlock *bbAllInnerPartialOuter = + ctx->CreateBasicBlock("all_inner_partial_outer"); + llvm::BasicBlock *bbPartial = + ctx->CreateBasicBlock("both_partial"); + ctx->SetCurrentBasicBlock(bbOuterInExtras); { + // Update the varying counter value here, since all subsequent + // blocks along this path need it. + lUpdateVaryingCounter(nDims-1, nDims, ctx, uniformCounterPtrs[nDims-1], + dimVariables[nDims-1]->storagePtr, span); + + // here we just check to see if counter < alignedEnd + llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1], "counter"); + llvm::Value *beforeAlignedEnd = + ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, + counter, alignedEnd[nDims-1], "before_aligned_end"); + ctx->BranchInst(bbAllInnerPartialOuter, bbPartial, beforeAlignedEnd); + } + + // Below we have a basic block that runs the loop body code for the + // case where the mask is partially but not fully on. This same block + // runs in multiple cases: both for handling any ragged extra data for + // the innermost dimension but also when outer dimensions have set the + // mask to be partially on. + // + // The value stored in stepIndexAfterMaskedBodyPtr is used after each + // execution of the body code to determine whether the innermost index + // value should be incremented by the step (we're running the "for" + // loop of full vectors at the innermost dimension, with outer + // dimensions having set the mask to be partially on), or whether we're + // running once for the ragged extra bits at the end of the innermost + // dimension, in which case we're done with the innermost dimension and + // should step the loop counter for the next enclosing dimension + // instead. 
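A rough scalar model of the flow just described may help; the values are invented and the boolean stands in for what is stored through stepIndexAfterMaskedBodyPtr, with the enclosing dimensions' mask handling elided. This is only a sketch of the control flow the blocks below emit, not the emitted IR itself.

    #include <cstdio>

    int main() {
        const int width = 8, start = 0, end = 19;
        const int alignedEnd = end - (end - start) % width;    // 16 with these numbers

        int counter = start;
        while (true) {
            bool stepIndexAfterMaskedBody;
            if (counter < alignedEnd) {
                // all_inner_partial_outer: the innermost dimension is full, so the
                // body runs under the enclosing dimensions' mask only.
                stepIndexAfterMaskedBody = true;
            } else {
                // both_partial: the ragged tail; (counter + lane < end) is folded
                // into the mask as well.
                stepIndexAfterMaskedBody = false;
            }
            std::printf("masked body at counter=%d\n", counter);  // bbMaskedBody
            if (stepIndexAfterMaskedBody)
                counter += width;   // step_inner_index, then re-test
            else
                break;              // done with this dimension; reset it and step the enclosing one
        }
        return 0;
    }

With these numbers the body runs at counters 0 and 8 under the outer mask alone, and once more at 16 for the three leftover elements.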
+ llvm::Value *stepIndexAfterMaskedBodyPtr = + ctx->AllocaInst(LLVMTypes::BoolType, "step_index"); + + /////////////////////////////////////////////////////////////////////////// + // We're in the inner loop part where the only masking is due to outer + // dimensions but the innermost dimension fits fully into a vector's + // width. Set the mask and jump to the masked loop body. + ctx->SetCurrentBasicBlock(bbAllInnerPartialOuter); { + llvm::Value *mask; + if (nDims == 1) + // 1D loop; we shouldn't ever get here anyway + mask = LLVMMaskAllOff; + else + mask = ctx->LoadInst(extrasMaskPtrs[nDims-2]); + + ctx->SetInternalMask(mask); + + ctx->StoreInst(LLVMTrue, stepIndexAfterMaskedBodyPtr); + ctx->BranchInst(bbMaskedBody); + } + + /////////////////////////////////////////////////////////////////////////// + // We need to include the effect of the innermost dimension in the mask + // for the final bits here + ctx->SetCurrentBasicBlock(bbPartial); { + llvm::Value *varyingCounter = + ctx->LoadInst(dimVariables[nDims-1]->storagePtr); + llvm::Value *smearEnd = ctx->BroadcastValue( + endVals[nDims-1], LLVMTypes::Int32VectorType, "smear_end"); + + llvm::Value *emask = + ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, + varyingCounter, smearEnd); + emask = ctx->I1VecToBoolVec(emask); + + if (nDims == 1) { + ctx->SetInternalMask(emask); + } + else { + llvm::Value *oldMask = ctx->LoadInst(extrasMaskPtrs[nDims-2]); + llvm::Value *newMask = + ctx->BinaryOperator(llvm::Instruction::And, oldMask, emask, + "extras_mask"); + ctx->SetInternalMask(newMask); + } + + ctx->StoreInst(LLVMFalse, stepIndexAfterMaskedBodyPtr); + ctx->BranchInst(bbMaskedBody); + } + + /////////////////////////////////////////////////////////////////////////// + // None of the outer dimensions is processing extras; along the lines + // of above, we can express this as: + // for (counter = start; counter < alignedEnd; counter += step) { + // // mask is all on + // // run loop body with mask all on + // } + // // counter == alignedEnd + // if (counter < end) { + // // set mask to (counter+programCounter < end) + // // run loop body with mask + // } + llvm::BasicBlock *bbPartialInnerAllOuter = + ctx->CreateBasicBlock("partial_inner_all_outer"); + ctx->SetCurrentBasicBlock(bbOuterNotInExtras); { + llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1], "counter"); + llvm::Value *beforeAlignedEnd = + ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, + counter, alignedEnd[nDims-1], "before_aligned_end"); + ctx->BranchInst(bbFullBody, bbPartialInnerAllOuter, + beforeAlignedEnd); + } + + /////////////////////////////////////////////////////////////////////////// + // full_body: do a full vector's worth of work. We know that all + // lanes will be running here, so we explicitly set the mask to be 'all + // on'. This ends up being relatively straightforward: just update the + // value of the varying loop counter and have the statements in the + // loop body emit their code. 
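On "update the value of the varying loop counter": lUpdateVaryingCounter itself is not in this hunk, but the surrounding code implies that, in the untiled case, the varying counter is the uniform counter plus the per-lane program index, and comparing it against the smeared end value yields the partial mask. A sketch with invented values:

    #include <cstdio>

    int main() {
        const int width = 8;
        const int counter = 16, end = 19;              // invented values for the sketch

        int varyingCounter[width];
        bool mask[width];
        for (int lane = 0; lane < width; ++lane) {
            varyingCounter[lane] = counter + lane;     // uniform counter + programIndex
            mask[lane] = varyingCounter[lane] < end;   // compare against the smeared end
        }
        for (int lane = 0; lane < width; ++lane)
            std::printf("lane %d: index %d, %s\n", lane, varyingCounter[lane],
                        mask[lane] ? "on" : "off");    // lanes 0..2 on, 3..7 off
        return 0;
    }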
+ llvm::BasicBlock *bbFullBodyContinue = + ctx->CreateBasicBlock("foreach_full_continue"); + ctx->SetCurrentBasicBlock(bbFullBody); { ctx->SetInternalMask(LLVMMaskAllOn); - ctx->SetFunctionMask(LLVMMaskAllOn); - - // This should be caught during typechecking - AssertPos(pos, startExprs.size() == dimVariables.size() && - endExprs.size() == dimVariables.size()); - int nDims = (int)dimVariables.size(); - - /////////////////////////////////////////////////////////////////////// - // Setup: compute the number of items we have to work on in each - // dimension and a number of derived values. - std::vector bbReset, bbStep, bbTest; - std::vector startVals, endVals, uniformCounterPtrs; - std::vector nExtras, alignedEnd, extrasMaskPtrs; - - std::vector span(nDims, 0); - const int vectorWidth = 32; - lGetSpans(nDims-1, nDims, vectorWidth, isTiled, &span[0]); -#if 0 - for (int i = 0; i < nDims; i++) - { - fprintf(stderr, " i= %d [ %d ] : %d \n", - i, nDims, span[i]); - } - fprintf(stderr, " --- \n"); -#endif - - for (int i = 0; i < nDims; ++i) { - // Basic blocks that we'll fill in later with the looping logic for - // this dimension. - bbReset.push_back(ctx->CreateBasicBlock("foreach_reset")); - if (i < nDims-1) - // stepping for the innermost dimension is handled specially - bbStep.push_back(ctx->CreateBasicBlock("foreach_step")); - bbTest.push_back(ctx->CreateBasicBlock("foreach_test")); - - // Start and end value for this loop dimension - llvm::Value *sv = startExprs[i]->GetValue(ctx); - llvm::Value *ev = endExprs[i]->GetValue(ctx); - if (sv == NULL || ev == NULL) - return; - startVals.push_back(sv); - endVals.push_back(ev); - - // nItems = endVal - startVal - llvm::Value *nItems = - ctx->BinaryOperator(llvm::Instruction::Sub, ev, sv, "nitems"); - - // nExtras = nItems % (span for this dimension) - // This gives us the number of extra elements we need to deal with - // at the end of the loop for this dimension that don't fit cleanly - // into a vector width. - nExtras.push_back(ctx->BinaryOperator(llvm::Instruction::SRem, nItems, - LLVMInt32(span[i]), "nextras")); - - // alignedEnd = endVal - nExtras - alignedEnd.push_back(ctx->BinaryOperator(llvm::Instruction::Sub, ev, - nExtras[i], "aligned_end")); - - /////////////////////////////////////////////////////////////////////// - // Each dimension has a loop counter that is a uniform value that - // goes from startVal to endVal, in steps of the span for this - // dimension. Its value is only used internally here for looping - // logic and isn't directly available in the user's program code. - uniformCounterPtrs.push_back(ctx->AllocaInst(LLVMTypes::Int32Type, - "counter")); - ctx->StoreInst(startVals[i], uniformCounterPtrs[i]); - - // There is also a varying variable that holds the set of index - // values for each dimension in the current loop iteration; this is - // the value that is program-visible. - dimVariables[i]->storagePtr = - ctx->AllocaInst(LLVMTypes::Int32VectorType, - dimVariables[i]->name.c_str()); - dimVariables[i]->parentFunction = ctx->GetFunction(); - ctx->EmitVariableDebugInfo(dimVariables[i]); - - // Each dimension also maintains a mask that represents which of - // the varying elements in the current iteration should be - // processed. (i.e. this is used to disable the lanes that have - // out-of-bounds offsets.) 
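Worked numbers for the nItems/nExtras/alignedEnd computation above (values invented for the example): they show how a dimension splits into full-width chunks plus one ragged tail.

    #include <cassert>

    int main() {
        // Iterate one dimension from 3 to 22 with a span of 8.
        const int startVal = 3, endVal = 22, span = 8;

        const int nItems = endVal - startVal;        // 19 elements to process
        const int nExtras = nItems % span;           // 3 don't fill a whole vector
        const int alignedEnd = endVal - nExtras;     // 19: full vectors cover [3, 19)

        assert(nItems == 19 && nExtras == 3 && alignedEnd == 19);
        // Full-width chunks start at counters 3 and 11; the ragged tail starts at 19
        // and runs with a partial mask (3 of 8 lanes on).
        return 0;
    }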
- extrasMaskPtrs.push_back(ctx->AllocaInst(LLVMTypes::MaskType, "extras mask")); - ctx->StoreInst(LLVMMaskAllOn, extrasMaskPtrs[i]); - } - - ctx->StartForeach(FunctionEmitContext::FOREACH_REGULAR); - - // On to the outermost loop's test - ctx->BranchInst(bbTest[0]); - - /////////////////////////////////////////////////////////////////////////// - // foreach_reset: this code runs when we need to reset the counter for - // a given dimension in preparation for running through its loop again, - // after the enclosing level advances its counter. - for (int i = 0; i < nDims; ++i) { - ctx->SetCurrentBasicBlock(bbReset[i]); - if (i == 0) - ctx->BranchInst(bbExit); - else { - ctx->StoreInst(LLVMMaskAllOn, extrasMaskPtrs[i]); - ctx->StoreInst(startVals[i], uniformCounterPtrs[i]); - ctx->BranchInst(bbStep[i-1]); - } - } - - /////////////////////////////////////////////////////////////////////////// - // foreach_step: increment the uniform counter by the vector width. - // Note that we don't increment the varying counter here as well but - // just generate its value when we need it in the loop body. Don't do - // this for the innermost dimension, which has a more complex stepping - // structure.. - for (int i = 0; i < nDims-1; ++i) { - ctx->SetCurrentBasicBlock(bbStep[i]); - llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[i]); - llvm::Value *newCounter = - ctx->BinaryOperator(llvm::Instruction::Add, counter, - LLVMInt32(span[i]), "new_counter"); - ctx->StoreInst(newCounter, uniformCounterPtrs[i]); - ctx->BranchInst(bbTest[i]); - } - - /////////////////////////////////////////////////////////////////////////// - // foreach_test (for all dimensions other than the innermost...) - std::vector inExtras; - for (int i = 0; i < nDims-1; ++i) { - ctx->SetCurrentBasicBlock(bbTest[i]); - - llvm::Value *haveExtras = - ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SGT, - endVals[i], alignedEnd[i], "have_extras"); - - llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[i], "counter"); - llvm::Value *atAlignedEnd = - ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, - counter, alignedEnd[i], "at_aligned_end"); - llvm::Value *inEx = - ctx->BinaryOperator(llvm::Instruction::And, haveExtras, - atAlignedEnd, "in_extras"); - - if (i == 0) - inExtras.push_back(inEx); - else - inExtras.push_back(ctx->BinaryOperator(llvm::Instruction::Or, inEx, - inExtras[i-1], "in_extras_all")); - - llvm::Value *varyingCounter = - lUpdateVaryingCounter(i, nDims, ctx, uniformCounterPtrs[i], - dimVariables[i]->storagePtr, span); - - llvm::Value *smearEnd = ctx->BroadcastValue( - endVals[i], LLVMTypes::Int32VectorType, "smear_end"); - - // Do a vector compare of its value to the end value to generate a - // mask for this last bit of work. 
- llvm::Value *emask = - ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, - varyingCounter, smearEnd); - emask = ctx->I1VecToBoolVec(emask); - - if (i == 0) - ctx->StoreInst(emask, extrasMaskPtrs[i]); - else { - llvm::Value *oldMask = ctx->LoadInst(extrasMaskPtrs[i-1]); - llvm::Value *newMask = - ctx->BinaryOperator(llvm::Instruction::And, oldMask, emask, - "extras_mask"); - ctx->StoreInst(newMask, extrasMaskPtrs[i]); - } - - llvm::Value *notAtEnd = - ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, - counter, endVals[i]); - ctx->BranchInst(bbTest[i+1], bbReset[i], notAtEnd); - } - - /////////////////////////////////////////////////////////////////////////// - // foreach_test (for innermost dimension) - // - // All of the outer dimensions are handled generically--basically as a - // for() loop from the start value to the end value, where at each loop - // test, we compute the mask of active elements for the current - // dimension and then update an overall mask that is the AND - // combination of all of the outer ones. - // - // The innermost loop is handled specially, for performance purposes. - // When starting the innermost dimension, we start by checking once - // whether any of the outer dimensions has set the mask to be - // partially-active or not. We follow different code paths for these - // two cases, taking advantage of the knowledge that the mask is all - // on, when this is the case. - // - // In each of these code paths, we start with a loop from the starting - // value to the aligned end value for the innermost dimension; we can - // guarantee that the innermost loop will have an "all on" mask (as far - // as its dimension is concerned) for the duration of this loop. Doing - // so allows us to emit code that assumes the mask is all on (for the - // case where none of the outer dimensions has set the mask to be - // partially on), or allows us to emit code that just uses the mask - // from the outer dimensions directly (for the case where they have). - // - // After this loop, we just need to deal with one vector's worth of - // "ragged extra bits", where the mask used includes the effect of the - // mask for the innermost dimension. - // - // We start out this process by emitting the check that determines - // whether any of the enclosing dimensions is partially active - // (i.e. processing extra elements that don't exactly fit into a - // vector). - llvm::BasicBlock *bbOuterInExtras = - ctx->CreateBasicBlock("outer_in_extras"); - llvm::BasicBlock *bbOuterNotInExtras = - ctx->CreateBasicBlock("outer_not_in_extras"); - - ctx->SetCurrentBasicBlock(bbTest[nDims-1]); - if (inExtras.size()) - ctx->BranchInst(bbOuterInExtras, bbOuterNotInExtras, - inExtras.back()); - else - // for a 1D iteration domain, we certainly don't have any enclosing - // dimensions that are processing extra elements. - ctx->BranchInst(bbOuterNotInExtras); - - /////////////////////////////////////////////////////////////////////////// - // One or more outer dimensions in extras, so we need to mask for the loop - // body regardless. 
We break this into two cases, roughly: - // for (counter = start; counter < alignedEnd; counter += step) { - // // mask is all on for inner, so set mask to outer mask - // // run loop body with mask - // } - // // counter == alignedEnd - // if (counter < end) { - // // set mask to outermask & (counter+programCounter < end) - // // run loop body with mask - // } - llvm::BasicBlock *bbAllInnerPartialOuter = - ctx->CreateBasicBlock("all_inner_partial_outer"); - llvm::BasicBlock *bbPartial = - ctx->CreateBasicBlock("both_partial"); - ctx->SetCurrentBasicBlock(bbOuterInExtras); { - // Update the varying counter value here, since all subsequent - // blocks along this path need it. - lUpdateVaryingCounter(nDims-1, nDims, ctx, uniformCounterPtrs[nDims-1], - dimVariables[nDims-1]->storagePtr, span); - - // here we just check to see if counter < alignedEnd - llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1], "counter"); - llvm::Value *beforeAlignedEnd = - ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, - counter, alignedEnd[nDims-1], "before_aligned_end"); - ctx->BranchInst(bbAllInnerPartialOuter, bbPartial, beforeAlignedEnd); - } - - // Below we have a basic block that runs the loop body code for the - // case where the mask is partially but not fully on. This same block - // runs in multiple cases: both for handling any ragged extra data for - // the innermost dimension but also when outer dimensions have set the - // mask to be partially on. - // - // The value stored in stepIndexAfterMaskedBodyPtr is used after each - // execution of the body code to determine whether the innermost index - // value should be incremented by the step (we're running the "for" - // loop of full vectors at the innermost dimension, with outer - // dimensions having set the mask to be partially on), or whether we're - // running once for the ragged extra bits at the end of the innermost - // dimension, in which case we're done with the innermost dimension and - // should step the loop counter for the next enclosing dimension - // instead. - llvm::Value *stepIndexAfterMaskedBodyPtr = - ctx->AllocaInst(LLVMTypes::BoolType, "step_index"); - - /////////////////////////////////////////////////////////////////////////// - // We're in the inner loop part where the only masking is due to outer - // dimensions but the innermost dimension fits fully into a vector's - // width. Set the mask and jump to the masked loop body. 
- ctx->SetCurrentBasicBlock(bbAllInnerPartialOuter); { - llvm::Value *mask; - if (nDims == 1) - // 1D loop; we shouldn't ever get here anyway - mask = LLVMMaskAllOff; - else - mask = ctx->LoadInst(extrasMaskPtrs[nDims-2]); - - ctx->SetInternalMask(mask); - - ctx->StoreInst(LLVMTrue, stepIndexAfterMaskedBodyPtr); - ctx->BranchInst(bbMaskedBody); - } - - /////////////////////////////////////////////////////////////////////////// - // We need to include the effect of the innermost dimension in the mask - // for the final bits here - ctx->SetCurrentBasicBlock(bbPartial); { - llvm::Value *varyingCounter = - ctx->LoadInst(dimVariables[nDims-1]->storagePtr); - llvm::Value *smearEnd = ctx->BroadcastValue( - endVals[nDims-1], LLVMTypes::Int32VectorType, "smear_end"); - - llvm::Value *emask = - ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, - varyingCounter, smearEnd); - emask = ctx->I1VecToBoolVec(emask); - - if (nDims == 1) { - ctx->SetInternalMask(emask); - } - else { - llvm::Value *oldMask = ctx->LoadInst(extrasMaskPtrs[nDims-2]); - llvm::Value *newMask = - ctx->BinaryOperator(llvm::Instruction::And, oldMask, emask, - "extras_mask"); - ctx->SetInternalMask(newMask); - } - - ctx->StoreInst(LLVMFalse, stepIndexAfterMaskedBodyPtr); - ctx->BranchInst(bbMaskedBody); - } - - /////////////////////////////////////////////////////////////////////////// - // None of the outer dimensions is processing extras; along the lines - // of above, we can express this as: - // for (counter = start; counter < alignedEnd; counter += step) { - // // mask is all on - // // run loop body with mask all on - // } - // // counter == alignedEnd - // if (counter < end) { - // // set mask to (counter+programCounter < end) - // // run loop body with mask - // } - llvm::BasicBlock *bbPartialInnerAllOuter = - ctx->CreateBasicBlock("partial_inner_all_outer"); - ctx->SetCurrentBasicBlock(bbOuterNotInExtras); { - llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1], "counter"); - llvm::Value *beforeAlignedEnd = - ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, - counter, alignedEnd[nDims-1], "before_aligned_end"); - ctx->BranchInst(bbFullBody, bbPartialInnerAllOuter, - beforeAlignedEnd); - } - - /////////////////////////////////////////////////////////////////////////// - // full_body: do a full vector's worth of work. We know that all - // lanes will be running here, so we explicitly set the mask to be 'all - // on'. This ends up being relatively straightforward: just update the - // value of the varying loop counter and have the statements in the - // loop body emit their code. 
- llvm::BasicBlock *bbFullBodyContinue = - ctx->CreateBasicBlock("foreach_full_continue"); - ctx->SetCurrentBasicBlock(bbFullBody); { - ctx->SetInternalMask(LLVMMaskAllOn); - ctx->SetBlockEntryMask(LLVMMaskAllOn); - lUpdateVaryingCounter(nDims-1, nDims, ctx, uniformCounterPtrs[nDims-1], - dimVariables[nDims-1]->storagePtr, span); - ctx->SetContinueTarget(bbFullBodyContinue); - ctx->AddInstrumentationPoint("foreach loop body (all on)"); - stmts->EmitCode(ctx); - AssertPos(pos, ctx->GetCurrentBasicBlock() != NULL); - ctx->BranchInst(bbFullBodyContinue); - } - ctx->SetCurrentBasicBlock(bbFullBodyContinue); { - ctx->RestoreContinuedLanes(); - llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1]); - llvm::Value *newCounter = - ctx->BinaryOperator(llvm::Instruction::Add, counter, - LLVMInt32(span[nDims-1]), "new_counter"); - ctx->StoreInst(newCounter, uniformCounterPtrs[nDims-1]); - ctx->BranchInst(bbOuterNotInExtras); - } - - /////////////////////////////////////////////////////////////////////////// - // We're done running blocks with the mask all on; see if the counter is - // less than the end value, in which case we need to run the body one - // more time to get the extra bits. - llvm::BasicBlock *bbSetInnerMask = - ctx->CreateBasicBlock("partial_inner_only"); - ctx->SetCurrentBasicBlock(bbPartialInnerAllOuter); { - llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1], "counter"); - llvm::Value *beforeFullEnd = - ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, - counter, endVals[nDims-1], "before_full_end"); - ctx->BranchInst(bbSetInnerMask, bbReset[nDims-1], beforeFullEnd); - } - - /////////////////////////////////////////////////////////////////////////// - // The outer dimensions are all on, so the mask is just given by the - // mask for the innermost dimension - ctx->SetCurrentBasicBlock(bbSetInnerMask); { - llvm::Value *varyingCounter = - lUpdateVaryingCounter(nDims-1, nDims, ctx, uniformCounterPtrs[nDims-1], - dimVariables[nDims-1]->storagePtr, span); - llvm::Value *smearEnd = ctx->BroadcastValue( - endVals[nDims-1], LLVMTypes::Int32VectorType, "smear_end"); - llvm::Value *emask = - ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, - varyingCounter, smearEnd); - emask = ctx->I1VecToBoolVec(emask); - ctx->SetInternalMask(emask); - ctx->SetBlockEntryMask(emask); - - ctx->StoreInst(LLVMFalse, stepIndexAfterMaskedBodyPtr); - ctx->BranchInst(bbMaskedBody); - } - - /////////////////////////////////////////////////////////////////////////// - // masked_body: set the mask and have the statements emit their - // code again. Note that it's generally worthwhile having two copies - // of the statements' code, since the code above is emitted with the - // mask known to be all-on, which in turn leads to more efficient code - // for that case. 
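A sketch of why the two-copy approach described above pays off: when the mask is known to be all on, a lane-wise assignment is just a plain vector store, whereas the masked copy of the body must blend every lane with whatever was there before. This illustrates the cost difference only; it is not how ispc or the LLVM backends actually lower masked stores.

    #include <array>

    constexpr int W = 8;
    using Vec  = std::array<int, W>;
    using Mask = std::array<bool, W>;

    // Body specialized for the all-on mask: nothing to predicate.
    void body_all_on(Vec &dst, const Vec &src) {
        dst = src;                                  // a single unmasked store
    }

    // Generic masked body: each lane is blended with the previous contents.
    void body_masked(Vec &dst, const Vec &src, const Mask &mask) {
        for (int i = 0; i < W; ++i)
            dst[i] = mask[i] ? src[i] : dst[i];     // per-lane select / masked store
    }

    int main() {
        Vec a{}, b{1, 2, 3, 4, 5, 6, 7, 8};
        Mask m{true, true, true, false, false, false, false, false};
        body_all_on(a, b);
        body_masked(a, b, m);
        return 0;
    }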
- llvm::BasicBlock *bbStepInnerIndex = - ctx->CreateBasicBlock("step_inner_index"); - llvm::BasicBlock *bbMaskedBodyContinue = - ctx->CreateBasicBlock("foreach_masked_continue"); - ctx->SetCurrentBasicBlock(bbMaskedBody); { - ctx->AddInstrumentationPoint("foreach loop body (masked)"); - ctx->SetContinueTarget(bbMaskedBodyContinue); - ctx->DisableGatherScatterWarnings(); - ctx->SetBlockEntryMask(ctx->GetFullMask()); - stmts->EmitCode(ctx); - ctx->EnableGatherScatterWarnings(); - ctx->BranchInst(bbMaskedBodyContinue); - } - ctx->SetCurrentBasicBlock(bbMaskedBodyContinue); { - ctx->RestoreContinuedLanes(); - llvm::Value *stepIndex = ctx->LoadInst(stepIndexAfterMaskedBodyPtr); - ctx->BranchInst(bbStepInnerIndex, bbReset[nDims-1], stepIndex); - } - - /////////////////////////////////////////////////////////////////////////// - // step the innermost index, for the case where we're doing the - // innermost for loop over full vectors. - ctx->SetCurrentBasicBlock(bbStepInnerIndex); { - llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1]); - llvm::Value *newCounter = - ctx->BinaryOperator(llvm::Instruction::Add, counter, - LLVMInt32(span[nDims-1]), "new_counter"); - ctx->StoreInst(newCounter, uniformCounterPtrs[nDims-1]); - ctx->BranchInst(bbOuterInExtras); - } - - /////////////////////////////////////////////////////////////////////////// - // foreach_exit: All done. Restore the old mask and clean up - ctx->SetCurrentBasicBlock(bbExit); - - ctx->SetInternalMask(oldMask); - ctx->SetFunctionMask(oldFunctionMask); - - ctx->EndForeach(); - ctx->EndScope(); + ctx->SetBlockEntryMask(LLVMMaskAllOn); + lUpdateVaryingCounter(nDims-1, nDims, ctx, uniformCounterPtrs[nDims-1], + dimVariables[nDims-1]->storagePtr, span); + ctx->SetContinueTarget(bbFullBodyContinue); + ctx->AddInstrumentationPoint("foreach loop body (all on)"); + stmts->EmitCode(ctx); + AssertPos(pos, ctx->GetCurrentBasicBlock() != NULL); + ctx->BranchInst(bbFullBodyContinue); } + ctx->SetCurrentBasicBlock(bbFullBodyContinue); { + ctx->RestoreContinuedLanes(); + llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1]); + llvm::Value *newCounter = + ctx->BinaryOperator(llvm::Instruction::Add, counter, + LLVMInt32(span[nDims-1]), "new_counter"); + ctx->StoreInst(newCounter, uniformCounterPtrs[nDims-1]); + ctx->BranchInst(bbOuterNotInExtras); + } + + /////////////////////////////////////////////////////////////////////////// + // We're done running blocks with the mask all on; see if the counter is + // less than the end value, in which case we need to run the body one + // more time to get the extra bits. 
+ llvm::BasicBlock *bbSetInnerMask = + ctx->CreateBasicBlock("partial_inner_only"); + ctx->SetCurrentBasicBlock(bbPartialInnerAllOuter); { + llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1], "counter"); + llvm::Value *beforeFullEnd = + ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, + counter, endVals[nDims-1], "before_full_end"); + ctx->BranchInst(bbSetInnerMask, bbReset[nDims-1], beforeFullEnd); + } + + /////////////////////////////////////////////////////////////////////////// + // The outer dimensions are all on, so the mask is just given by the + // mask for the innermost dimension + ctx->SetCurrentBasicBlock(bbSetInnerMask); { + llvm::Value *varyingCounter = + lUpdateVaryingCounter(nDims-1, nDims, ctx, uniformCounterPtrs[nDims-1], + dimVariables[nDims-1]->storagePtr, span); + llvm::Value *smearEnd = ctx->BroadcastValue( + endVals[nDims-1], LLVMTypes::Int32VectorType, "smear_end"); + llvm::Value *emask = + ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, + varyingCounter, smearEnd); + emask = ctx->I1VecToBoolVec(emask); + ctx->SetInternalMask(emask); + ctx->SetBlockEntryMask(emask); + + ctx->StoreInst(LLVMFalse, stepIndexAfterMaskedBodyPtr); + ctx->BranchInst(bbMaskedBody); + } + + /////////////////////////////////////////////////////////////////////////// + // masked_body: set the mask and have the statements emit their + // code again. Note that it's generally worthwhile having two copies + // of the statements' code, since the code above is emitted with the + // mask known to be all-on, which in turn leads to more efficient code + // for that case. + llvm::BasicBlock *bbStepInnerIndex = + ctx->CreateBasicBlock("step_inner_index"); + llvm::BasicBlock *bbMaskedBodyContinue = + ctx->CreateBasicBlock("foreach_masked_continue"); + ctx->SetCurrentBasicBlock(bbMaskedBody); { + ctx->AddInstrumentationPoint("foreach loop body (masked)"); + ctx->SetContinueTarget(bbMaskedBodyContinue); + ctx->DisableGatherScatterWarnings(); + ctx->SetBlockEntryMask(ctx->GetFullMask()); + stmts->EmitCode(ctx); + ctx->EnableGatherScatterWarnings(); + ctx->BranchInst(bbMaskedBodyContinue); + } + ctx->SetCurrentBasicBlock(bbMaskedBodyContinue); { + ctx->RestoreContinuedLanes(); + llvm::Value *stepIndex = ctx->LoadInst(stepIndexAfterMaskedBodyPtr); + ctx->BranchInst(bbStepInnerIndex, bbReset[nDims-1], stepIndex); + } + + /////////////////////////////////////////////////////////////////////////// + // step the innermost index, for the case where we're doing the + // innermost for loop over full vectors. + ctx->SetCurrentBasicBlock(bbStepInnerIndex); { + llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1]); + llvm::Value *newCounter = + ctx->BinaryOperator(llvm::Instruction::Add, counter, + LLVMInt32(span[nDims-1]), "new_counter"); + ctx->StoreInst(newCounter, uniformCounterPtrs[nDims-1]); + ctx->BranchInst(bbOuterInExtras); + } + + /////////////////////////////////////////////////////////////////////////// + // foreach_exit: All done. 
Restore the old mask and clean up + ctx->SetCurrentBasicBlock(bbExit); + + ctx->SetInternalMask(oldMask); + ctx->SetFunctionMask(oldFunctionMask); + + ctx->EndForeach(); + ctx->EndScope(); } diff --git a/type.cpp b/type.cpp index 107a70de..6be852ad 100644 --- a/type.cpp +++ b/type.cpp @@ -2925,7 +2925,7 @@ FunctionType::GetReturnTypeString() const { llvm::FunctionType * FunctionType::LLVMFunctionType(llvm::LLVMContext *ctx, bool removeMask) const { - if (isTask == true) // && !g->target->isPTX()) //getISA() != Target::NVPTX64) + if (isTask == true) Assert(removeMask == false); // Get the LLVM Type *s for the function arguments @@ -2950,44 +2950,30 @@ FunctionType::LLVMFunctionType(llvm::LLVMContext *ctx, bool removeMask) const { llvmArgTypes.push_back(LLVMTypes::MaskType); std::vector callTypes; - if (isTask) { + if (isTask && g->target->getISA() != Target::NVPTX) { // Tasks take three arguments: a pointer to a struct that holds the // actual task arguments, the thread index, and the total number of // threads the tasks system has running. (Task arguments are // marshalled in a struct so that it's easy to allocate space to // hold them until the task actually runs.) -// if (g->target->getISA() != Target::NVPTX64) - if (!g->target->isPTX()) - { - llvm::Type *st = llvm::StructType::get(*ctx, llvmArgTypes); - callTypes.push_back(llvm::PointerType::getUnqual(st)); - callTypes.push_back(LLVMTypes::Int32Type); // threadIndex - callTypes.push_back(LLVMTypes::Int32Type); // threadCount - callTypes.push_back(LLVMTypes::Int32Type); // taskIndex - callTypes.push_back(LLVMTypes::Int32Type); // taskCount - callTypes.push_back(LLVMTypes::Int32Type); // taskIndex0 - callTypes.push_back(LLVMTypes::Int32Type); // taskIndex1 - callTypes.push_back(LLVMTypes::Int32Type); // taskIndex2 - callTypes.push_back(LLVMTypes::Int32Type); // taskCount0 - callTypes.push_back(LLVMTypes::Int32Type); // taskCount1 - callTypes.push_back(LLVMTypes::Int32Type); // taskCount2 - } - else - { - if (g->target->getISA() == Target::NVPTX64) - callTypes = llvmArgTypes; - else - { - assert(0); /* evghenii: must be removed in final, just for test for nvptx64 target */ - llvm::Type *st = llvm::StructType::get(*ctx, llvmArgTypes); - callTypes.push_back(llvm::PointerType::getUnqual(st)); - } - } + llvm::Type *st = llvm::StructType::get(*ctx, llvmArgTypes); + callTypes.push_back(llvm::PointerType::getUnqual(st)); + callTypes.push_back(LLVMTypes::Int32Type); // threadIndex + callTypes.push_back(LLVMTypes::Int32Type); // threadCount + callTypes.push_back(LLVMTypes::Int32Type); // taskIndex + callTypes.push_back(LLVMTypes::Int32Type); // taskCount + callTypes.push_back(LLVMTypes::Int32Type); // taskIndex0 + callTypes.push_back(LLVMTypes::Int32Type); // taskIndex1 + callTypes.push_back(LLVMTypes::Int32Type); // taskIndex2 + callTypes.push_back(LLVMTypes::Int32Type); // taskCount0 + callTypes.push_back(LLVMTypes::Int32Type); // taskCount1 + callTypes.push_back(LLVMTypes::Int32Type); // taskCount2 } else // Otherwise we already have the types of the arguments callTypes = llvmArgTypes; + if (returnType == NULL) { Assert(m->errorCount > 0); return NULL;