From b3c5a9c4d671260c36287ceab98dbc193e16ac12 Mon Sep 17 00:00:00 2001 From: evghenii Date: Wed, 9 Jul 2014 12:32:18 +0200 Subject: [PATCH] added #ifdef ISPC_NVPTX_ENABLED ... #endif guards --- Makefile | 17 +- builtins.cpp | 95 ++++-- ctx.cpp | 257 +++++++------- ctx.h | 20 +- decl.cpp | 12 +- expr.cpp | 4 + func.cpp | 26 +- ispc.cpp | 19 +- ispc.h | 6 +- main.cpp | 2 + module.cpp | 82 +++-- opt.cpp | 17 +- ptxtools/runtest_ptxcc.sh | 6 +- run_tests.py | 2 +- stmt.cpp | 700 ++++++++++++++++++++------------------ type.cpp | 14 +- 16 files changed, 726 insertions(+), 553 deletions(-) diff --git a/Makefile b/Makefile index fe6b36f7..7d7f081b 100644 --- a/Makefile +++ b/Makefile @@ -73,6 +73,10 @@ endif # To enable: make ARM_ENABLED=1 ARM_ENABLED=0 +# Disable NVPTX by request +# To disable: make NVPTX_ENABLED=0 +NVPTX_ENABLED=1 + # Add llvm bin to the path so any scripts run will go to the right llvm-config LLVM_BIN= $(shell $(LLVM_CONFIG) --bindir) export PATH:=$(LLVM_BIN):$(PATH) @@ -89,7 +93,7 @@ LLVM_CXXFLAGS=$(shell $(LLVM_CONFIG) --cppflags) LLVM_VERSION=LLVM_$(shell $(LLVM_CONFIG) --version | sed -e 's/svn//' -e 's/\./_/' -e 's/\..*//') LLVM_VERSION_DEF=-D$(LLVM_VERSION) -LLVM_COMPONENTS = engine ipo bitreader bitwriter instrumentation linker nvptx +LLVM_COMPONENTS = engine ipo bitreader bitwriter instrumentation linker # Component "option" was introduced in 3.3 and starting with 3.4 it is required for the link step. # We check if it's available before adding it (to not break 3.2 and earlier). 
ifeq ($(shell $(LLVM_CONFIG) --components |grep -c option), 1) @@ -98,6 +102,9 @@ endif ifneq ($(ARM_ENABLED), 0) LLVM_COMPONENTS+=arm endif +ifneq ($(NVPTX_ENABLED), 0) + LLVM_COMPONENTS+=nvptx +endif LLVM_LIBS=$(shell $(LLVM_CONFIG) --libs $(LLVM_COMPONENTS)) CLANG=clang @@ -156,6 +163,9 @@ endif ifneq ($(ARM_ENABLED), 0) CXXFLAGS+=-DISPC_ARM_ENABLED endif +ifneq ($(NVPTX_ENABLED), 0) + CXXFLAGS+=-DISPC_NVPTX_ENABLED +endif LDFLAGS= ifeq ($(ARCH_OS),Linux) @@ -174,12 +184,15 @@ CXX_SRC=ast.cpp builtins.cpp cbackend.cpp ctx.cpp decl.cpp expr.cpp func.cpp \ type.cpp util.cpp HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \ opt.h stmt.h sym.h type.h util.h -TARGETS=nvptx avx2-i64x4 avx11-i64x4 avx1-i64x4 avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 \ +TARGETS=avx2-i64x4 avx11-i64x4 avx1-i64x4 avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 \ sse2 sse2-x2 sse4-8 sse4-16 sse4 sse4-x2 \ generic-4 generic-8 generic-16 generic-32 generic-64 generic-1 ifneq ($(ARM_ENABLED), 0) TARGETS+=neon-32 neon-16 neon-8 endif +ifneq ($(NVPTX_ENABLED), 0) + TARGETS+=nvptx +endif # These files need to be compiled in two versions - 32 and 64 bits. BUILTINS_SRC_TARGET=$(addprefix builtins/target-, $(addsuffix .ll, $(TARGETS))) # These are files to be compiled in single version. 
diff --git a/builtins.cpp b/builtins.cpp index 2d8d3dac..ab30b51f 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -342,13 +342,17 @@ lSetInternalFunctions(llvm::Module *module) { "__all", "__any", "__aos_to_soa3_float", +//#ifdef ISPC_NVPTX_ENABLED "__aos_to_soa3_float1", +//#endif /* ISPC_NVPTX_ENABLED */ "__aos_to_soa3_float16", "__aos_to_soa3_float4", "__aos_to_soa3_float8", "__aos_to_soa3_int32", "__aos_to_soa4_float", +//#ifdef ISPC_NVPTX_ENABLED "__aos_to_soa4_float1", +//#endif /* ISPC_NVPTX_ENABLED */ "__aos_to_soa4_float16", "__aos_to_soa4_float4", "__aos_to_soa4_float8", @@ -357,14 +361,10 @@ lSetInternalFunctions(llvm::Module *module) { "__atomic_add_int64_global", "__atomic_add_uniform_int32_global", "__atomic_add_uniform_int64_global", - "__atomic_add_varying_int32_global", - "__atomic_add_varying_int64_global", "__atomic_and_int32_global", "__atomic_and_int64_global", "__atomic_and_uniform_int32_global", "__atomic_and_uniform_int64_global", - "__atomic_and_varying_int32_global", - "__atomic_and_varying_int64_global", "__atomic_compare_exchange_double_global", "__atomic_compare_exchange_float_global", "__atomic_compare_exchange_int32_global", @@ -373,30 +373,18 @@ lSetInternalFunctions(llvm::Module *module) { "__atomic_compare_exchange_uniform_float_global", "__atomic_compare_exchange_uniform_int32_global", "__atomic_compare_exchange_uniform_int64_global", - "__atomic_compare_exchange_varying_double_global", - "__atomic_compare_exchange_varying_float_global", - "__atomic_compare_exchange_varying_int32_global", - "__atomic_compare_exchange_varying_int64_global", "__atomic_max_uniform_int32_global", "__atomic_max_uniform_int64_global", "__atomic_min_uniform_int32_global", "__atomic_min_uniform_int64_global", - "__atomic_max_varying_int32_global", - "__atomic_max_varying_int64_global", - "__atomic_min_varying_int32_global", - "__atomic_min_varying_int64_global", "__atomic_or_int32_global", "__atomic_or_int64_global", "__atomic_or_uniform_int32_global", 
"__atomic_or_uniform_int64_global", - "__atomic_or_varying_int32_global", - "__atomic_or_varying_int64_global", "__atomic_sub_int32_global", "__atomic_sub_int64_global", "__atomic_sub_uniform_int32_global", "__atomic_sub_uniform_int64_global", - "__atomic_sub_varying_int32_global", - "__atomic_sub_varying_int64_global", "__atomic_swap_double_global", "__atomic_swap_float_global", "__atomic_swap_int32_global", @@ -405,28 +393,46 @@ lSetInternalFunctions(llvm::Module *module) { "__atomic_swap_uniform_float_global", "__atomic_swap_uniform_int32_global", "__atomic_swap_uniform_int64_global", - "__atomic_swap_varying_double_global", - "__atomic_swap_varying_float_global", - "__atomic_swap_varying_int32_global", - "__atomic_swap_varying_int64_global", "__atomic_umax_uniform_uint32_global", "__atomic_umax_uniform_uint64_global", "__atomic_umin_uniform_uint32_global", "__atomic_umin_uniform_uint64_global", - "__atomic_umax_varying_uint32_global", - "__atomic_umax_varying_uint64_global", - "__atomic_umin_varying_uint32_global", - "__atomic_umin_varying_uint64_global", "__atomic_xor_int32_global", "__atomic_xor_int64_global", "__atomic_xor_uniform_int32_global", "__atomic_xor_uniform_int64_global", +//#ifdef ISPC_NVPTX_ENABLED + "__atomic_add_varying_int32_global", + "__atomic_add_varying_int64_global", + "__atomic_and_varying_int32_global", + "__atomic_and_varying_int64_global", + "__atomic_compare_exchange_varying_double_global", + "__atomic_compare_exchange_varying_float_global", + "__atomic_compare_exchange_varying_int32_global", + "__atomic_compare_exchange_varying_int64_global", + "__atomic_max_varying_int32_global", + "__atomic_max_varying_int64_global", + "__atomic_min_varying_int32_global", + "__atomic_min_varying_int64_global", + "__atomic_or_varying_int32_global", + "__atomic_or_varying_int64_global", + "__atomic_sub_varying_int32_global", + "__atomic_sub_varying_int64_global", + "__atomic_swap_varying_double_global", + "__atomic_swap_varying_float_global", + 
"__atomic_swap_varying_int32_global", + "__atomic_swap_varying_int64_global", + "__atomic_umax_varying_uint32_global", + "__atomic_umax_varying_uint64_global", + "__atomic_umin_varying_uint32_global", + "__atomic_umin_varying_uint64_global", "__atomic_xor_uniform_int32_global", "__atomic_xor_uniform_int64_global", "__atomic_xor_varying_int32_global", "__atomic_xor_varying_int64_global", "__atomic_xor_varying_int32_global", "__atomic_xor_varying_int64_global", +//#endif /* ISPC_NVPTX_ENABLED */ "__broadcast_double", "__broadcast_float", "__broadcast_i16", @@ -449,7 +455,9 @@ lSetInternalFunctions(llvm::Module *module) { "__do_assert_uniform", "__do_assert_varying", "__do_print", +//#ifdef ISPC_NVPTX_ENABLED "__do_print_nvptx", +//#endif /* ISPC_NVPTX_ENABLED */ "__doublebits_uniform_int64", "__doublebits_varying_int64", "__exclusive_scan_add_double", @@ -464,8 +472,10 @@ lSetInternalFunctions(llvm::Module *module) { "__extract_int32", "__extract_int64", "__extract_int8", +//#ifdef ISPC_NVPTX_ENABLED "__extract_float", "__extract_double", +//#endif /* ISPC_NVPTX_ENABLED */ "__fastmath", "__float_to_half_uniform", "__float_to_half_varying", @@ -482,8 +492,10 @@ lSetInternalFunctions(llvm::Module *module) { "__insert_int32", "__insert_int64", "__insert_int8", +//#ifdef ISPC_NVPTX_ENABLED "__insert_float", "__insert_double", +//#endif /* ISPC_NVPTX_ENABLED */ "__intbits_uniform_double", "__intbits_uniform_float", "__intbits_varying_double", @@ -520,7 +532,9 @@ lSetInternalFunctions(llvm::Module *module) { "__min_varying_uint32", "__min_varying_uint64", "__movmsk", +//#ifdef ISPC_NVPTX_ENABLED "__movmsk_ptx", +//#endif /* ISPC_NVPTX_ENABLED */ "__new_uniform_32rt", "__new_uniform_64rt", "__new_varying32_32rt", @@ -610,13 +624,15 @@ lSetInternalFunctions(llvm::Module *module) { "__shuffle_i64", "__shuffle_i8", "__soa_to_aos3_float", - "__soa_to_aos3_float1", "__soa_to_aos3_float16", "__soa_to_aos3_float4", "__soa_to_aos3_float8", "__soa_to_aos3_int32", 
"__soa_to_aos4_float", +//#ifdef ISPC_NVPTX_ENABLED + "__soa_to_aos3_float1", "__soa_to_aos4_float1", +//#endif /* ISPC_NVPTX_ENABLED */ "__soa_to_aos4_float16", "__soa_to_aos4_float4", "__soa_to_aos4_float8", @@ -717,7 +733,7 @@ lSetInternalFunctions(llvm::Module *module) { "__vec4_add_float", "__vec4_add_int32", "__vselect_float", - "__vselect_i32", +//#ifdef ISPC_NVPTX_ENABLED "__program_index", "__program_count", "__warp_index", @@ -736,6 +752,8 @@ lSetInternalFunctions(llvm::Module *module) { "ISPCAlloc", "ISPCLaunch", "ISPCSync", +//#endif /* ISPC_NVPTX_ENABLED */ + "__vselect_i32" }; int count = sizeof(names) / sizeof(names[0]); @@ -808,7 +826,9 @@ AddBitcodeToModule(const unsigned char *bitcode, int length, g->target->getISA() != Target::NEON16 && g->target->getISA() != Target::NEON8) #endif // !__arm__ +#ifdef ISPC_NVPTX_ENABLED if (g->target->getISA() != Target::NVPTX) +#endif /* ISPC_NVPTX_ENABLED */ { Assert(bcTriple.getArch() == llvm::Triple::UnknownArch || mTriple.getArch() == bcTriple.getArch()); @@ -982,6 +1002,7 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod // Next, add the target's custom implementations of the various needed // builtin functions (e.g. __masked_store_32(), etc). 
switch (g->target->getISA()) { +#ifdef ISPC_NVPTX_ENABLED case Target::NVPTX: { if (runtime32) { @@ -993,6 +1014,8 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod } break; }; +#endif /* ISPC_NVPTX_ENABLED */ + #ifdef ISPC_ARM_ENABLED case Target::NEON8: { if (runtime32) { @@ -1262,14 +1285,18 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod } // define the 'programCount' builtin variable - if (g->target->getISA() != Target::NVPTX) - { - lDefineConstantInt("programCount", g->target->getVectorWidth(), module, symbolTable); - } - else +#ifdef ISPC_NVPTX_ENABLED + if (g->target->getISA() == Target::NVPTX) { lDefineConstantInt("programCount", 32, module, symbolTable); } + else + { +#endif /* ISPC_NVPTX_ENABLED */ + lDefineConstantInt("programCount", g->target->getVectorWidth(), module, symbolTable); +#ifdef ISPC_NVPTX_ENABLED + } +#endif /* ISPC_NVPTX_ENABLED */ // define the 'programIndex' builtin lDefineProgramIndex(module, symbolTable); @@ -1301,9 +1328,13 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod lDefineConstantInt("__have_native_rcpd", g->target->hasRcpd(), module, symbolTable); +#ifdef ISPC_NVPTX_ENABLED lDefineConstantInt("__is_nvptx_target", (int)(g->target->getISA() == Target::NVPTX), module, symbolTable); - +#else + lDefineConstantInt("__is_nvptx_target", (int)0, module, symbolTable); +#endif /* ISPC_NVPTX_ENABLED */ + if (g->forceAlignment != -1) { llvm::GlobalVariable *alignment = module->getGlobalVariable("memory_alignment", true); alignment->setInitializer(LLVMInt32(g->forceAlignment)); diff --git a/ctx.cpp b/ctx.cpp index 7602601b..22d04b8f 100644 --- a/ctx.cpp +++ b/ctx.cpp @@ -57,8 +57,10 @@ #include #include #endif +#ifdef ISPC_NVPTX_ENABLED #include #include +#endif /* ISPC_NVPTX_ENABLED */ /** This is a small utility structure that records information related to one level of nested control flow. 
It's mostly used in correctly restoring @@ -1373,28 +1375,30 @@ FunctionEmitContext::None(llvm::Value *mask) { llvm::Value * -FunctionEmitContext::LaneMask(llvm::Value *v) -{ -#if 1 /* this makes mandelbrot example slower with "nvptx" target. Need further investigation. */ - const char *__movmsk = g->target->getISA() == Target::NVPTX ? "__movmsk_ptx" : "__movmsk"; +FunctionEmitContext::LaneMask(llvm::Value *v) { +#ifdef ISPC_NVPTX_ENABLED + /* this makes mandelbrot example slower with "nvptx" target. + * Needs further investigation. */ + const char *__movmsk = g->target->getISA() == Target::NVPTX ? "__movmsk_ptx" : "__movmsk"; #else - const char *__movmsk = "__movmsk"; + const char *__movmsk = "__movmsk"; #endif - // Call the target-dependent movmsk function to turn the vector mask - // into an i64 value - std::vector mm; - m->symbolTable->LookupFunction(__movmsk, &mm); - if (g->target->getMaskBitCount() == 1) - AssertPos(currentPos, mm.size() == 1); - else - // There should be one with signed int signature, one unsigned int. - AssertPos(currentPos, mm.size() == 2); - // We can actually call either one, since both are i32s as far as - // LLVM's type system is concerned... - llvm::Function *fmm = mm[0]->function; - return CallInst(fmm, NULL, v, LLVMGetName(v, "_movmsk")); + // Call the target-dependent movmsk function to turn the vector mask + // into an i64 value + std::vector mm; + m->symbolTable->LookupFunction(__movmsk, &mm); + if (g->target->getMaskBitCount() == 1) + AssertPos(currentPos, mm.size() == 1); + else + // There should be one with signed int signature, one unsigned int. + AssertPos(currentPos, mm.size() == 2); + // We can actually call either one, since both are i32s as far as + // LLVM's type system is concerned... 
+ llvm::Function *fmm = mm[0]->function; + return CallInst(fmm, NULL, v, LLVMGetName(v, "_movmsk")); } +#ifdef ISPC_NVPTX_ENABLED bool lAppendInsertExtractName(llvm::Value *vector, std::string &funcName) { llvm::Type *type = vector->getType(); @@ -1447,19 +1451,21 @@ FunctionEmitContext::Extract(llvm::Value *vector, llvm::Value *lane) llvm::Value *ret = llvm::CallInst::Create(func, args, LLVMGetName(vector, funcName.c_str()), GetCurrentBasicBlock()); return ret; } +#endif /* ISPC_NVPTX_ENABLED */ llvm::Value * FunctionEmitContext::MasksAllEqual(llvm::Value *v1, llvm::Value *v2) { - if (g->target->getISA() == Target::NVPTX) - { - // Compare the two masks to get a vector of i1s - llvm::Value *cmp = CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, - v1, v2, "v1==v2"); - return ExtractInst(cmp, 0); /* this works without calling All(..) in PTX. Why ?!? */ - } - else - { +#ifdef ISPC_NVPTX_ENABLED + if (g->target->getISA() == Target::NVPTX) + { + // Compare the two masks to get a vector of i1s + llvm::Value *cmp = CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, + v1, v2, "v1==v2"); + return ExtractInst(cmp, 0); /* this works without calling All(..) in PTX. Why ?!? 
*/ + } +#endif /* ISPC_NVPTX_ENABLED */ + #if 0 // Compare the two masks to get a vector of i1s llvm::Value *cmp = CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, @@ -1474,7 +1480,6 @@ FunctionEmitContext::MasksAllEqual(llvm::Value *v1, llvm::Value *v2) { return CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, mm1, mm2, LLVMGetName("equal", v1, v2)); #endif - } } llvm::Value * @@ -1489,6 +1494,8 @@ FunctionEmitContext::ProgramIndexVector(bool is32bits) { return index; } + +#ifdef ISPC_NVPTX_ENABLED llvm::Value * FunctionEmitContext::ProgramIndexVectorPTX(bool is32bits) { llvm::Function *func_program_index = m->module->getFunction("__program_index"); @@ -1500,6 +1507,7 @@ FunctionEmitContext::ProgramIndexVectorPTX(bool is32bits) { #endif return index; } +#endif /* ISPC_NVPTX_ENABLED */ llvm::Value * @@ -1919,7 +1927,6 @@ FunctionEmitContext::PtrToIntInst(llvm::Value *value, const char *name) { if (name == NULL) name = LLVMGetName(value, "_ptr2int"); - llvm::Type *type = LLVMTypes::PointerIntType; llvm::Instruction *inst = new llvm::PtrToIntInst(value, type, name, bblock); AddDebugPos(inst); @@ -3613,75 +3620,8 @@ llvm::Value * FunctionEmitContext::LaunchInst(llvm::Value *callee, std::vector &argVals, llvm::Value *launchCount[3]){ - - if (g->target->getISA() != Target::NVPTX) - { - if (callee == NULL) { - AssertPos(currentPos, m->errorCount > 0); - return NULL; - } - - launchedTasks = true; - - AssertPos(currentPos, llvm::isa(callee)); - llvm::Type *argType = - (llvm::dyn_cast(callee))->arg_begin()->getType(); - AssertPos(currentPos, llvm::PointerType::classof(argType)); - llvm::PointerType *pt = - llvm::dyn_cast(argType); - AssertPos(currentPos, llvm::StructType::classof(pt->getElementType())); - llvm::StructType *argStructType = - static_cast(pt->getElementType()); - - llvm::Function *falloc = m->module->getFunction("ISPCAlloc"); - AssertPos(currentPos, falloc != NULL); - llvm::Value *structSize = g->target->SizeOf(argStructType, bblock); - if 
(structSize->getType() != LLVMTypes::Int64Type) - // ISPCAlloc expects the size as an uint64_t, but on 32-bit - // targets, SizeOf returns a 32-bit value - structSize = ZExtInst(structSize, LLVMTypes::Int64Type, - "struct_size_to_64"); - int align = 4 * RoundUpPow2(g->target->getNativeVectorWidth()); - - std::vector allocArgs; - allocArgs.push_back(launchGroupHandlePtr); - allocArgs.push_back(structSize); - allocArgs.push_back(LLVMInt32(align)); - llvm::Value *voidmem = CallInst(falloc, NULL, allocArgs, "args_ptr"); - llvm::Value *argmem = BitCastInst(voidmem, pt); - - // Copy the values of the parameters into the appropriate place in - // the argument block - for (unsigned int i = 0; i < argVals.size(); ++i) { - llvm::Value *ptr = AddElementOffset(argmem, i, NULL, "funarg"); - // don't need to do masked store here, I think - StoreInst(argVals[i], ptr); - } - - if (argStructType->getNumElements() == argVals.size() + 1) { - // copy in the mask - llvm::Value *mask = GetFullMask(); - llvm::Value *ptr = AddElementOffset(argmem, argVals.size(), NULL, - "funarg_mask"); - StoreInst(mask, ptr); - } - - // And emit the call to the user-supplied task launch function, passing - // a pointer to the task function being called and a pointer to the - // argument block we just filled in - llvm::Value *fptr = BitCastInst(callee, LLVMTypes::VoidPointerType); - llvm::Function *flaunch = m->module->getFunction("ISPCLaunch"); - AssertPos(currentPos, flaunch != NULL); - std::vector args; - args.push_back(launchGroupHandlePtr); - args.push_back(fptr); - args.push_back(voidmem); - args.push_back(launchCount[0]); - args.push_back(launchCount[1]); - args.push_back(launchCount[2]); - return CallInst(flaunch, NULL, args, ""); - } - else /* NVPTX */ +#ifdef ISPC_NVPTX_ENABLED + if (g->target->getISA() == Target::NVPTX) { if (callee == NULL) { AssertPos(currentPos, m->errorCount > 0); @@ -3764,38 +3704,79 @@ FunctionEmitContext::LaunchInst(llvm::Value *callee, llvm::Value *ret = 
CallInst(flaunch, NULL, args, ""); return ret; } +#endif /* ISPC_NVPTX_ENABLED */ + + if (callee == NULL) { + AssertPos(currentPos, m->errorCount > 0); + return NULL; + } + + launchedTasks = true; + + AssertPos(currentPos, llvm::isa(callee)); + llvm::Type *argType = + (llvm::dyn_cast(callee))->arg_begin()->getType(); + AssertPos(currentPos, llvm::PointerType::classof(argType)); + llvm::PointerType *pt = + llvm::dyn_cast(argType); + AssertPos(currentPos, llvm::StructType::classof(pt->getElementType())); + llvm::StructType *argStructType = + static_cast(pt->getElementType()); + + llvm::Function *falloc = m->module->getFunction("ISPCAlloc"); + AssertPos(currentPos, falloc != NULL); + llvm::Value *structSize = g->target->SizeOf(argStructType, bblock); + if (structSize->getType() != LLVMTypes::Int64Type) + // ISPCAlloc expects the size as an uint64_t, but on 32-bit + // targets, SizeOf returns a 32-bit value + structSize = ZExtInst(structSize, LLVMTypes::Int64Type, + "struct_size_to_64"); + int align = 4 * RoundUpPow2(g->target->getNativeVectorWidth()); + + std::vector allocArgs; + allocArgs.push_back(launchGroupHandlePtr); + allocArgs.push_back(structSize); + allocArgs.push_back(LLVMInt32(align)); + llvm::Value *voidmem = CallInst(falloc, NULL, allocArgs, "args_ptr"); + llvm::Value *argmem = BitCastInst(voidmem, pt); + + // Copy the values of the parameters into the appropriate place in + // the argument block + for (unsigned int i = 0; i < argVals.size(); ++i) { + llvm::Value *ptr = AddElementOffset(argmem, i, NULL, "funarg"); + // don't need to do masked store here, I think + StoreInst(argVals[i], ptr); + } + + if (argStructType->getNumElements() == argVals.size() + 1) { + // copy in the mask + llvm::Value *mask = GetFullMask(); + llvm::Value *ptr = AddElementOffset(argmem, argVals.size(), NULL, + "funarg_mask"); + StoreInst(mask, ptr); + } + + // And emit the call to the user-supplied task launch function, passing + // a pointer to the task function being called and 
a pointer to the + // argument block we just filled in + llvm::Value *fptr = BitCastInst(callee, LLVMTypes::VoidPointerType); + llvm::Function *flaunch = m->module->getFunction("ISPCLaunch"); + AssertPos(currentPos, flaunch != NULL); + std::vector args; + args.push_back(launchGroupHandlePtr); + args.push_back(fptr); + args.push_back(voidmem); + args.push_back(launchCount[0]); + args.push_back(launchCount[1]); + args.push_back(launchCount[2]); + return CallInst(flaunch, NULL, args, ""); } void FunctionEmitContext::SyncInst() { - if (g->target->getISA() != Target::NVPTX) - { - llvm::Value *launchGroupHandle = LoadInst(launchGroupHandlePtr); - llvm::Value *nullPtrValue = - llvm::Constant::getNullValue(LLVMTypes::VoidPointerType); - llvm::Value *nonNull = CmpInst(llvm::Instruction::ICmp, - llvm::CmpInst::ICMP_NE, - launchGroupHandle, nullPtrValue); - llvm::BasicBlock *bSync = CreateBasicBlock("call_sync"); - llvm::BasicBlock *bPostSync = CreateBasicBlock("post_sync"); - BranchInst(bSync, bPostSync, nonNull); - - SetCurrentBasicBlock(bSync); - llvm::Function *fsync = m->module->getFunction("ISPCSync"); - if (fsync == NULL) - FATAL("Couldn't find ISPCSync declaration?!"); - CallInst(fsync, NULL, launchGroupHandle, ""); - - // zero out the handle so that if ISPCLaunch is called again in this - // function, it knows it's starting out from scratch - StoreInst(nullPtrValue, launchGroupHandlePtr); - - BranchInst(bPostSync); - - SetCurrentBasicBlock(bPostSync); - } - else /* NVPTX: don't do test, just call sync */ +#ifdef ISPC_NVPTX_ENABLED + if (g->target->getISA() == Target::NVPTX) { llvm::Value *launchGroupHandle = LoadInst(launchGroupHandlePtr); llvm::Value *nullPtrValue = @@ -3805,7 +3786,33 @@ FunctionEmitContext::SyncInst() { FATAL("Couldn't find ISPCSync declaration?!"); CallInst(fsync, NULL, launchGroupHandle, ""); StoreInst(nullPtrValue, launchGroupHandlePtr); + return; } +#endif /* ISPC_NVPTX_ENABLED */ + + llvm::Value *launchGroupHandle = 
LoadInst(launchGroupHandlePtr); + llvm::Value *nullPtrValue = + llvm::Constant::getNullValue(LLVMTypes::VoidPointerType); + llvm::Value *nonNull = CmpInst(llvm::Instruction::ICmp, + llvm::CmpInst::ICMP_NE, + launchGroupHandle, nullPtrValue); + llvm::BasicBlock *bSync = CreateBasicBlock("call_sync"); + llvm::BasicBlock *bPostSync = CreateBasicBlock("post_sync"); + BranchInst(bSync, bPostSync, nonNull); + + SetCurrentBasicBlock(bSync); + llvm::Function *fsync = m->module->getFunction("ISPCSync"); + if (fsync == NULL) + FATAL("Couldn't find ISPCSync declaration?!"); + CallInst(fsync, NULL, launchGroupHandle, ""); + + // zero out the handle so that if ISPCLaunch is called again in this + // function, it knows it's starting out from scratch + StoreInst(nullPtrValue, launchGroupHandlePtr); + + BranchInst(bPostSync); + + SetCurrentBasicBlock(bPostSync); } diff --git a/ctx.h b/ctx.h index 54729e4f..593a7aa0 100644 --- a/ctx.h +++ b/ctx.h @@ -291,21 +291,21 @@ public: of the mask is on. */ llvm::Value *LaneMask(llvm::Value *mask); + /** Given two masks of type LLVMTypes::MaskType, return an i1 value + that indicates whether the two masks are equal. */ + llvm::Value *MasksAllEqual(llvm::Value *mask1, llvm::Value *mask2); + + /** generate constantvector, which contains programindex, i.e. + < i32 0, i32 1, i32 2, i32 3> */ + llvm::Value *ProgramIndexVector(bool is32bits = true); +#ifdef ISPC_NVPTX_ENABLED + llvm::Value *ProgramIndexVectorPTX(bool is32bits = true); /** Issues a call to __insert_int8/int16/int32/int64/float/double */ llvm::Value* Insert(llvm::Value *vector, llvm::Value *lane, llvm::Value *scalar); /** Issues a call to __extract_int8/int16/int32/int64/float/double */ llvm::Value* Extract(llvm::Value *vector, llvm::Value *lane); - - - /** Given two masks of type LLVMTypes::MaskType, return an i1 value - that indicates whether the two masks are equal. 
*/ - llvm::Value *MasksAllEqual(llvm::Value *mask1, llvm::Value *mask2); - - /** Generate ConstantVector, which contains ProgramIndex, i.e. - < i32 0, i32 1, i32 2, i32 3> */ - llvm::Value *ProgramIndexVector(bool is32bits = true); - llvm::Value *ProgramIndexVectorPTX(bool is32bits = true); +#endif /** Given a string, create an anonymous global variable to hold its value and return the pointer to the string. */ diff --git a/decl.cpp b/decl.cpp index 279cfbfc..c915d6b8 100644 --- a/decl.cpp +++ b/decl.cpp @@ -168,6 +168,7 @@ DeclSpecs::GetBaseType(SourcePos pos) const { retType = lApplyTypeQualifiers(typeQualifiers, retType, pos); if (soaWidth > 0) { +#ifdef ISPC_NVPTX_ENABLED #if 0 /* see stmt.cpp in DeclStmt::EmitCode for work-around of SOAType Declaration */ if (g->target->getISA() == Target::NVPTX) { @@ -175,6 +176,7 @@ DeclSpecs::GetBaseType(SourcePos pos) const { return NULL; } #endif +#endif /* ISPC_NVPTX_ENABLED */ const StructType *st = CastType(retType); if (st == NULL) { @@ -409,6 +411,7 @@ Declarator::InitFromType(const Type *baseType, DeclSpecs *ds) { return; } +#ifdef ISPC_NVPTX_ENABLED #if 0 /* NVPTX */ if (baseType->IsUniformType()) { @@ -416,6 +419,7 @@ Declarator::InitFromType(const Type *baseType, DeclSpecs *ds) { baseType->IsArrayType() ? 
" true " : " false "); } #endif +#endif /* ISPC_NVPTX_ENABLED */ const Type *arrayType = new ArrayType(baseType, arraySize); if (child != NULL) { child->InitFromType(arrayType, ds); @@ -544,9 +548,9 @@ Declarator::InitFromType(const Type *baseType, DeclSpecs *ds) { returnType = returnType->ResolveUnboundVariability(Variability::Varying); - bool isTask = ds && ((ds->typeQualifiers & TYPEQUAL_TASK) != 0); bool isExternC = ds && (ds->storageClass == SC_EXTERN_C); bool isExported = ds && ((ds->typeQualifiers & TYPEQUAL_EXPORT) != 0); + bool isTask = ds && ((ds->typeQualifiers & TYPEQUAL_TASK) != 0); bool isUnmasked = ds && ((ds->typeQualifiers & TYPEQUAL_UNMASKED) != 0); if (isExported && isTask) { @@ -555,9 +559,9 @@ Declarator::InitFromType(const Type *baseType, DeclSpecs *ds) { return; } if (isExternC && isTask) { - Error(pos, "Function can't have both \"extern \"C\"\" and \"task\" " - "qualifiers"); - return; + Error(pos, "Function can't have both \"extern \"C\"\" and \"task\" " + "qualifiers"); + return; } if (isExternC && isExported) { Error(pos, "Function can't have both \"extern \"C\"\" and \"export\" " diff --git a/expr.cpp b/expr.cpp index 5fe17257..8bb438e7 100644 --- a/expr.cpp +++ b/expr.cpp @@ -7880,12 +7880,14 @@ SizeOfExpr::TypeCheck() { "struct type \"%s\".", type->GetString().c_str()); return NULL; } +#ifdef ISPC_NVPTX_ENABLED if (type != NULL) if (g->target->getISA() == Target::NVPTX && type->IsVaryingType()) { Error(pos, "\"sizeof\" with varying data types is not yet supported with \"nvptx\" target."); return NULL; } +#endif /* ISPC_NVPTX_ENABLED */ return this; } @@ -8718,11 +8720,13 @@ NewExpr::TypeCheck() { AssertPos(pos, m->errorCount > 0); return NULL; } +#ifdef ISPC_NVPTX_ENABLED if (g->target->getISA() == Target::NVPTX && allocType->IsVaryingType()) { Error(pos, "\"new\" with varying data types is not yet supported with \"nvptx\" target."); return NULL; } +#endif /* ISPC_NVPTX_ENABLED */ if (CastType(allocType) != NULL) { Error(pos, "Can't 
dynamically allocate storage for declared " "but not defined type \"%s\".", allocType->GetString().c_str()); diff --git a/func.cpp b/func.cpp index b821ec87..c7908e5b 100644 --- a/func.cpp +++ b/func.cpp @@ -47,7 +47,9 @@ #include #if defined(LLVM_3_1) || defined(LLVM_3_2) +#ifdef ISPC_NVPTX_ENABLED #include +#endif /* ISPC_NVPTX_ENABLED */ #include #include #include @@ -55,7 +57,9 @@ #include #include #else +#ifdef ISPC_NVPTX_ENABLED #include +#endif /* ISPC_NVPTX_ENABLED */ #include #include #include @@ -131,7 +135,11 @@ Function::Function(Symbol *s, Stmt *c) { sym->parentFunction = this; } - if (type->isTask && g->target->getISA() != Target::NVPTX) { + if (type->isTask +#ifdef ISPC_NVPTX_ENABLED + && (g->target->getISA() != Target::NVPTX) +#endif + ){ threadIndexSym = m->symbolTable->LookupVariable("threadIndex"); Assert(threadIndexSym); threadCountSym = m->symbolTable->LookupVariable("threadCount"); @@ -242,7 +250,11 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function, #endif const FunctionType *type = CastType(sym->type); Assert(type != NULL); - if (type->isTask == true && g->target->getISA() != Target::NVPTX) { + if (type->isTask == true +#ifdef ISPC_NVPTX_ENABLED + && (g->target->getISA() != Target::NVPTX) +#endif + ){ // For tasks, there should always be three parameters: the // pointer to the structure that holds all of the arguments, the // thread index, and the thread count variables. 
@@ -340,6 +352,7 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function, ctx->SetFunctionMask(argIter); Assert(++argIter == function->arg_end()); } +#ifdef ISPC_NVPTX_ENABLED if (type->isTask == true && g->target->getISA() == Target::NVPTX) { llvm::NamedMDNode* annotations = @@ -350,6 +363,7 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function, av.push_back(LLVMInt32(1)); annotations->addOperand(llvm::MDNode::get(*g->ctx, av)); } +#endif /* ISPC_NVPTX_ENABLED */ } // Finally, we can generate code for the function @@ -505,15 +519,14 @@ Function::GenerateIR() { // the application can call it const FunctionType *type = CastType(sym->type); Assert(type != NULL); - if (type->isExported) { + if (type->isExported) { if (!type->isTask) { llvm::FunctionType *ftype = type->LLVMFunctionType(g->ctx, true); llvm::GlobalValue::LinkageTypes linkage = llvm::GlobalValue::ExternalLinkage; std::string functionName = sym->name; - if (g->mangleFunctionsWithTarget) functionName += std::string("_") + g->target->GetISAString(); - +#ifdef ISPC_NVPTX_ENABLED if (g->target->getISA() == Target::NVPTX) { functionName += std::string("___export"); /* add ___export to the end, for ptxcc to recognize it is exported */ @@ -527,6 +540,7 @@ Function::GenerateIR() { annotations->addOperand(llvm::MDNode::get(*g->ctx, av)); #endif } +#endif /* ISPC_NVPTX_ENABLED */ llvm::Function *appFunction = llvm::Function::Create(ftype, linkage, functionName.c_str(), m->module); #if defined(LLVM_3_1) @@ -566,6 +580,7 @@ Function::GenerateIR() { FATAL("Function verificication failed"); } } +#ifdef ISPC_NVPTX_ENABLED if (g->target->getISA() == Target::NVPTX) { llvm::NamedMDNode* annotations = @@ -576,6 +591,7 @@ Function::GenerateIR() { av.push_back(llvm::ConstantInt::get(llvm::IntegerType::get(*g->ctx,32), 1)); annotations->addOperand(llvm::MDNode::get(*g->ctx, av)); } +#endif /* ISPC_NVPTX_ENABLED */ } } } diff --git a/ispc.cpp b/ispc.cpp index f8f9bd29..735ffeda 100644 --- 
a/ispc.cpp +++ b/ispc.cpp @@ -247,9 +247,11 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : arch = "arm"; else #endif +#ifdef ISPC_NVPTX_ENABLED if(!strncmp(isa, "nvptx", 5)) arch = "nvptx64"; else +#endif /* ISPC_NVPTX_ENABLED */ arch = "x86-64"; } @@ -587,6 +589,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_maskBitCount = 32; } #endif +#ifdef ISPC_NVPTX_ENABLED else if (!strcasecmp(isa, "nvptx")) { this->m_isa = Target::NVPTX; @@ -602,6 +605,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_hasGather = this->m_hasScatter = false; cpuFromIsa = "sm_35"; } +#endif /* ISPC_NVPTX_ENABLED */ else { Error(SourcePos(), "Target \"%s\" is unknown. Choices are: %s.", isa, SupportedTargets()); @@ -720,8 +724,10 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : // Initialize target-specific "target-feature" attribute. if (!m_attributes.empty()) { llvm::AttrBuilder attrBuilder; +#ifdef ISPC_NVPTX_ENABLED if (m_isa != Target::NVPTX) - attrBuilder.addAttribute("target-cpu", this->m_cpu); +#endif + attrBuilder.addAttribute("target-cpu", this->m_cpu); attrBuilder.addAttribute("target-features", this->m_attributes); this->m_tf_attributes = new llvm::AttributeSet( llvm::AttributeSet::get( @@ -768,6 +774,9 @@ Target::SupportedTargets() { return #ifdef ISPC_ARM_ENABLED "neon-i8x16, neon-i16x8, neon-i32x4, " +#endif +#ifdef ISPC_NVPTX_ENABLED + "nvptx, " #endif "sse2-i32x4, sse2-i32x8, " "sse4-i32x4, sse4-i32x8, sse4-i16x8, sse4-i8x16, " @@ -776,7 +785,7 @@ Target::SupportedTargets() { "avx1.1-i32x8, avx1.1-i32x16, avx1.1-i64x4 " "avx2-i32x8, avx2-i32x16, avx2-i64x4, " "generic-x1, generic-x4, generic-x8, generic-x16, " - "generic-x32, generic-x64, nvptx"; + "generic-x32, generic-x64"; } @@ -803,8 +812,10 @@ Target::GetTripleString() const { triple.setArchName("i386"); else if (m_arch == "x86-64") triple.setArchName("x86_64"); +#ifdef 
ISPC_NVPTX_ENABLED else if (m_arch == "nvptx64") triple = llvm::Triple("nvptx64", "nvidia", "cuda"); +#endif /* ISPC_NVPTX_ENABLED */ else triple.setArchName(m_arch); } @@ -837,8 +848,10 @@ Target::ISAToString(ISA isa) { return "avx2"; case Target::GENERIC: return "generic"; +#ifdef ISPC_NVPTX_ENABLED case Target::NVPTX: return "nvptx"; +#endif /* ISPC_NVPTX_ENABLED */ default: FATAL("Unhandled target in ISAToString()"); } @@ -877,8 +890,10 @@ Target::ISAToTargetString(ISA isa) { return "avx2-i32x8"; case Target::GENERIC: return "generic-4"; +#ifdef ISPC_NVPTX_ENABLED case Target::NVPTX: return "nvptx"; +#endif /* ISPC_NVPTX_ENABLED */ default: FATAL("Unhandled target in ISAToTargetString()"); } diff --git a/ispc.h b/ispc.h index f9e78ac9..16de6ec6 100644 --- a/ispc.h +++ b/ispc.h @@ -179,7 +179,10 @@ public: flexible/performant of them will apear last in the enumerant. Note also that __best_available_isa() needs to be updated if ISAs are added or the enumerant values are reordered. */ - enum ISA { NVPTX, + enum ISA { +#ifdef ISPC_NVPTX_ENABLED + NVPTX, +#endif #ifdef ISPC_ARM_ENABLED NEON32, NEON16, NEON8, #endif @@ -611,7 +614,6 @@ struct Globals { /** Indicates that alignment in memory allocation routines should be forced to have given value. -1 value means natural alignment for the platforms. 
*/ int forceAlignment; - std::string PtxString; }; enum { diff --git a/main.cpp b/main.cpp index 2815cde9..28721fa4 100644 --- a/main.cpp +++ b/main.cpp @@ -320,10 +320,12 @@ int main(int Argc, char *Argv[]) { LLVMInitializeARMTargetMC(); #endif +#ifdef ISPC_NVPTX_ENABLED LLVMInitializeNVPTXTargetInfo(); LLVMInitializeNVPTXTarget(); LLVMInitializeNVPTXAsmPrinter(); LLVMInitializeNVPTXTargetMC(); +#endif /* ISPC_NVPTX_ENABLED */ char *file = NULL; const char *headerFileName = NULL; diff --git a/module.cpp b/module.cpp index fc5c6437..995e3a78 100644 --- a/module.cpp +++ b/module.cpp @@ -58,7 +58,9 @@ #include #include #include +#ifdef ISPC_NVPTX_ENABLED #include +#endif /* ISPC_NVPTX_ENABLED */ #ifdef ISPC_IS_WINDOWS #include #include @@ -72,7 +74,9 @@ #include #include #include +#ifdef ISPC_NVPTX_ENABLED #include "llvm/Assembly/AssemblyAnnotationWriter.h" +#endif /* ISPC_NVPTX_ENABLED */ #else #include #include @@ -80,7 +84,9 @@ #include #include #include +#ifdef ISPC_NVPTX_ENABLED #include "llvm/Assembly/AssemblyAnnotationWriter.h" +#endif /* ISPC_NVPTX_ENABLED */ #endif #include #include @@ -446,6 +452,7 @@ Module::AddGlobalVariable(const std::string &name, const Type *type, Expr *initE return; } +#ifdef ISPC_NVPTX_ENABLED if (g->target->getISA() == Target::NVPTX && #if 0 !type->IsConstType() && @@ -476,7 +483,7 @@ Module::AddGlobalVariable(const std::string &name, const Type *type, Expr *initE type = new ArrayType(type->GetAsUniformType(), nel); #endif } - +#endif /* ISPC_NVPTX_ENABLED */ llvm::Type *llvmType = type->LLVMType(g->ctx); if (llvmType == NULL) @@ -677,6 +684,7 @@ lCheckExportedParameterTypes(const Type *type, const std::string &name, } } +#ifdef ISPC_NVPTX_ENABLED static void lCheckTaskParameterTypes(const Type *type, const std::string &name, SourcePos pos) { @@ -691,7 +699,7 @@ lCheckTaskParameterTypes(const Type *type, const std::string &name, name.c_str()); } } - +#endif /* ISPC_NVPTX_ENABLED */ /** Given a function type, loop through the 
function parameters and see if any are StructTypes. If so, issue an error; this is currently broken @@ -849,8 +857,12 @@ Module::AddFunctionDeclaration(const std::string &name, #else // LLVM 3.1 and 3.3+ function->addFnAttr(llvm::Attribute::AlwaysInline); #endif - /* evghenii: fails function verification when "if" executed in nvptx target */ - if (functionType->isTask && g->target->getISA() != Target::NVPTX) + + if (functionType->isTask) +#ifdef ISPC_NVPTX_ENABLED + /* evghenii: fails function verification when "if" executed in nvptx target */ + if (g->target->getISA() != Target::NVPTX) +#endif /* ISPC_NVPTX_ENABLED */ // This also applies transitively to members I think? #if defined(LLVM_3_1) function->setDoesNotAlias(1, true); @@ -871,12 +883,14 @@ Module::AddFunctionDeclaration(const std::string &name, functionType->GetReturnType()->IsVoidType() == false) Error(pos, "Task-qualified functions must have void return type."); +#ifdef ISPC_NVPTX_ENABLED if (g->target->getISA() == Target::NVPTX && Type::Equal(functionType->GetReturnType(), AtomicType::Void) == false && functionType->isExported) { Error(pos, "Export-qualified functions must have void return type with \"nvptx\" target."); } +#endif /* ISPC_NVPTX_ENABLED */ if (functionType->isExported || functionType->isExternC) lCheckForStructParameters(functionType, pos); @@ -897,9 +911,12 @@ Module::AddFunctionDeclaration(const std::string &name, if (functionType->isExported) { lCheckExportedParameterTypes(argType, argName, argPos); } + +#ifdef ISPC_NVPTX_ENABLED if (functionType->isTask) { lCheckTaskParameterTypes(argType, argName, argPos); } +#endif /* ISPC_NVPTX_ENABLED */ // ISPC assumes that no pointers alias. 
(It should be possible to // specify when this is not the case, but this should be the @@ -1027,24 +1044,28 @@ Module::writeOutput(OutputType outputType, const char *outFileName, const char *fileType = NULL; switch (outputType) { case Asm: - if (g->target->getISA() != Target::NVPTX) - { - if (strcasecmp(suffix, "s")) +#ifdef ISPC_NVPTX_ENABLED + if (g->target->getISA() == Target::NVPTX) + { + if (strcasecmp(suffix, "ptx")) fileType = "assembly"; - } - else - if (strcasecmp(suffix, "ptx")) + } + else +#endif /* ISPC_NVPTX_ENABLED */ + if (strcasecmp(suffix, "s")) fileType = "assembly"; break; case Bitcode: - if (g->target->getISA() != Target::NVPTX) - { - if (strcasecmp(suffix, "bc")) - fileType = "LLVM bitcode"; - } - else - if (strcasecmp(suffix, "ll")) - fileType = "LLVM assembly"; +#ifdef ISPC_NVPTX_ENABLED + if (g->target->getISA() == Target::NVPTX) + { + if (strcasecmp(suffix, "ll")) + fileType = "LLVM assembly"; + } + else +#endif /* ISPC_NVPTX_ENABLED */ + if (strcasecmp(suffix, "bc")) + fileType = "LLVM bitcode"; break; case Object: if (strcasecmp(suffix, "o") && strcasecmp(suffix, "obj")) @@ -1113,6 +1134,7 @@ Module::writeOutput(OutputType outputType, const char *outFileName, return writeObjectFileOrAssembly(outputType, outFileName); } +#ifdef ISPC_NVPTX_ENABLED typedef std::vector vecString_t; static vecString_t lSplitString(const std::string &s) @@ -1180,6 +1202,7 @@ lFixAttributes(const vecString_t &src, vecString_t &dst) dst.push_back(s); } } +#endif /* ISPC_NVPTX_ENABLED */ bool Module::writeBitcode(llvm::Module *module, const char *outFileName) { @@ -1204,11 +1227,8 @@ Module::writeBitcode(llvm::Module *module, const char *outFileName) { } llvm::raw_fd_ostream fos(fd, (fd != 1), false); - if (g->target->getISA() != Target::NVPTX) - { - llvm::WriteBitcodeToFile(module, fos); - } - else +#ifdef ISPC_NVPTX_ENABLED + if (g->target->getISA() == Target::NVPTX) { /* when using "nvptx" target, emit patched/hacked assembly * NVPTX only accepts 3.2-style LLVM 
assembly, where attributes @@ -1240,7 +1260,9 @@ Module::writeBitcode(llvm::Module *module, const char *outFileName) { fos << *it; } } - + else +#endif /* ISPC_NVPTX_ENABLED */ + llvm::WriteBitcodeToFile(module, fos); return true; } @@ -2275,6 +2297,7 @@ Module::execPreprocessor(const char *infilename, llvm::raw_string_ostream *ostre opts.addMacroDef(g->cppArgs[i].substr(2)); } } +#ifdef ISPC_NVPTX_ENABLED if (g->target->getISA() == Target::NVPTX) { opts.addMacroDef("__NVPTX__"); @@ -2295,6 +2318,7 @@ Module::execPreprocessor(const char *infilename, llvm::raw_string_ostream *ostre opts.addMacroDef("taskCount2=__taskCount2()"); opts.addMacroDef("taskCount=__taskCount()"); } +#endif /* ISPC_NVPTX_ENABLED */ inst.getLangOpts().LineComment = 1; #if defined(LLVM_3_5) @@ -2740,6 +2764,7 @@ lCreateDispatchModule(std::map &functions) return module; } +#ifdef ISPC_NVPTX_ENABLED static std::string lCBEMangle(const std::string &S) { std::string Result; @@ -2762,7 +2787,7 @@ static std::string lCBEMangle(const std::string &S) { } return Result; } - +#endif /* ISPC_NVPTX_ENABLED */ int Module::CompileAndOutput(const char *srcFile, @@ -2778,7 +2803,7 @@ Module::CompileAndOutput(const char *srcFile, const char *hostStubFileName, const char *devStubFileName) { - if (target == NULL || strchr(target, ',') == NULL) { + if (target == NULL || strchr(target, ',') == NULL) { // We're only compiling to a single target g->target = new Target(arch, cpu, target, generatePIC); if (!g->target->isValid()) @@ -2786,7 +2811,7 @@ Module::CompileAndOutput(const char *srcFile, m = new Module(srcFile); if (m->CompileFile() == 0) { - +#ifdef ISPC_NVPTX_ENABLED /* NVPTX: * for PTX target replace '.' 
with '_' in all global variables * a PTX identifier name must match [a-zA-Z$_][a-zA-Z$_0-9]* @@ -2811,7 +2836,7 @@ Module::CompileAndOutput(const char *srcFile, } } } - +#endif /* ISPC_NVPTX_ENABLED */ if (outputType == CXX) { if (target == NULL || strncmp(target, "generic-", 8) != 0) { Error(SourcePos(), "When generating C++ output, one of the \"generic-*\" " @@ -3014,5 +3039,4 @@ Module::CompileAndOutput(const char *srcFile, return errorCount > 0; } - return true; } diff --git a/opt.cpp b/opt.cpp index 879fde02..096778e7 100644 --- a/opt.cpp +++ b/opt.cpp @@ -55,7 +55,9 @@ #include #include #include +#ifdef ISPC_NVPTX_ENABLED #include +#endif /* ISPC_NVPTX_ENABLED */ #else #include #include @@ -63,7 +65,9 @@ #include #include #include +#ifdef ISPC_NVPTX_ENABLED #include +#endif /* ISPC_NVPTX_ENABLED */ #endif #if defined (LLVM_3_4) || defined(LLVM_3_5) #include @@ -131,7 +135,9 @@ static llvm::Pass *CreateDebugPass(char * output); static llvm::Pass *CreateReplaceStdlibShiftPass(); static llvm::Pass *CreateFixBooleanSelectPass(); +#ifdef ISPC_NVPTX_ENABLED static llvm::Pass *CreatePromoteLocalToPrivatePass(); +#endif /* ISPC_NVPTX_ENABLED */ #define DEBUG_START_PASS(NAME) \ if (g->debugPrint && \ @@ -495,9 +501,11 @@ Optimize(llvm::Module *module, int optLevel) { // run absolutely no optimizations, since the front-end needs us to // take the various __pseudo_* functions it has emitted and turn // them into something that can actually execute. 
- + optPM.add(CreateImproveMemoryOpsPass(), 100); +#ifdef ISPC_NVPTX_ENABLED if (g->opt.disableGatherScatterOptimizations == false && g->target->getVectorWidth() > 1) +#endif /* ISPC_NVPTX_ENABLED */ optPM.add(CreateImproveMemoryOpsPass(), 100); if (g->opt.disableHandlePseudoMemoryOps == false) @@ -579,7 +587,9 @@ Optimize(llvm::Module *module, int optLevel) { optPM.add(llvm::createReassociatePass()); optPM.add(llvm::createIPConstantPropagationPass()); +#ifdef ISPC_NVPTX_ENABLED if (g->target->getISA() != Target::NVPTX) +#endif /* ISPC_NVPTX_ENABLED */ optPM.add(CreateReplaceStdlibShiftPass(),229); optPM.add(llvm::createDeadArgEliminationPass(),230); @@ -693,7 +703,7 @@ Optimize(llvm::Module *module, int optLevel) { // Should be the last optPM.add(CreateFixBooleanSelectPass(), 400); - +#ifdef ISPC_NVPTX_ENABLED if (g->target->getISA() == Target::NVPTX) { optPM.add(CreatePromoteLocalToPrivatePass()); @@ -799,6 +809,7 @@ Optimize(llvm::Module *module, int optLevel) { optPM.add(llvm::createConstantMergePass()); #endif } +#endif /* ISPC_NVPTX_ENABLED */ } // Finish up by making sure we didn't mess anything up in the IR along @@ -5410,6 +5421,7 @@ CreateFixBooleanSelectPass() { return new FixBooleanSelectPass(); } +#ifdef ISPC_NVPTX_ENABLED /////////////////////////////////////////////////////////////////////////////// // Detect addrspace(3) /////////////////////////////////////////////////////////////////////////////// @@ -5498,4 +5510,5 @@ CreatePromoteLocalToPrivatePass() { +#endif /* ISPC_NVPTX_ENABLED */ diff --git a/ptxtools/runtest_ptxcc.sh b/ptxtools/runtest_ptxcc.sh index 708d0538..c2133a65 100755 --- a/ptxtools/runtest_ptxcc.sh +++ b/ptxtools/runtest_ptxcc.sh @@ -45,11 +45,13 @@ then # $($LLVMAS $1 -o $TMPDIR/$fbname.bc) && $($LLVMDIS $TMPDIR/$fbname.bc -o $TMPDIR/$fbname.ll) && $($PTXGEN $TMPDIR/$fbname.ll -o $TMPDIR/$fbname.ptx) && \ $($PTXGEN $1 -o $TMPDIR/$fbname.ptx) && \ $($PTXCC $TMPDIR/$fbname.ptx -o $TMPDIR/$fbname.o -Xnvcc="-G") && \ - $(nvcc 
test_static_nvptx.cpp examples/util/nvcc_helpers.cu examples/util/ispc_malloc.cpp $TMPDIR/$fbname.o -arch=sm_35 -Iexamples/util/ -D_CUDA_ -lcudadevrt $ARGS); + $(nvcc test_static_nvptx.cpp examples/util/nvcc_helpers.cu examples/util/ispc_malloc.cpp $TMPDIR/$fbname.o -arch=sm_35 -Iexamples/util/ -D_CUDA_ -lcudadevrt $ARGS) && \ + $(/bin/rm -rf $TMPDIR/*$fbname*); else $(sed 's/\.b0/\.b32/g' $1 > $TMPDIR/$fbname) && \ $($PTXCC $TMPDIR/$fbname -o $TMPDIR/$fbname.o -Xnvcc="-G") && \ - $(nvcc test_static_nvptx.cpp examples/util/nvcc_helpers.cu examples/util/ispc_malloc.cpp $TMPDIR/$fbname.o -arch=sm_35 -Iexamples/util/ -D_CUDA_ -lcudadevrt $ARGS); + $(nvcc test_static_nvptx.cpp examples/util/nvcc_helpers.cu examples/util/ispc_malloc.cpp $TMPDIR/$fbname.o -arch=sm_35 -Iexamples/util/ -D_CUDA_ -lcudadevrt $ARGS) && \ + $(/bin/rm -rf $TMPDIR/*$fbname*); fi diff --git a/run_tests.py b/run_tests.py index 89961faf..f5df8ef7 100755 --- a/run_tests.py +++ b/run_tests.py @@ -233,7 +233,7 @@ def run_test(testname): elif is_nvptx_target: if os.environ.get("NVVM") == "1": is_nvptx_nvvm = True - obj_name = "%s.bc" % testname + obj_name = "%s.ll" % testname else: obj_name = "%s.ptx" % testname is_nvptx_nvvm = False diff --git a/stmt.cpp b/stmt.cpp index c528909b..2e6e1cfc 100644 --- a/stmt.cpp +++ b/stmt.cpp @@ -142,6 +142,7 @@ lHasUnsizedArrays(const Type *type) { return lHasUnsizedArrays(at->GetElementType()); } +#ifdef ISPC_NVPTX_ENABLED static llvm::Value* lConvertToGenericPtr(FunctionEmitContext *ctx, llvm::Value *value, const SourcePos ¤tPos, const bool variable = false) { if (!value->getType()->isPointerTy() || g->target->getISA() != Target::NVPTX) @@ -198,6 +199,7 @@ static llvm::Value* lConvertToGenericPtr(FunctionEmitContext *ctx, llvm::Value * return value; } +#endif /* ISPC_NVPTX_ENABLED */ void DeclStmt::EmitCode(FunctionEmitContext *ctx) const { @@ -261,9 +263,8 @@ DeclStmt::EmitCode(FunctionEmitContext *ctx) const { return; } - if (sym->storageClass == SC_STATIC) { - 
+#ifdef ISPC_NVPTX_ENABLED if (g->target->getISA() == Target::NVPTX && !sym->type->IsConstType()) { Error(sym->pos, @@ -279,7 +280,7 @@ DeclStmt::EmitCode(FunctionEmitContext *ctx) const { PerformanceWarning(sym->pos, "\"const static uniform\" variable ""\"%s\" is stored in __constant address space with ""\"nvptx\" target.", sym->name.c_str()); - +#endif /* ISPC_NVPTX_ENABLED */ // For static variables, we need a compile-time constant value // for its initializer; if there's no initializer, we use a // zero value. @@ -307,28 +308,38 @@ DeclStmt::EmitCode(FunctionEmitContext *ctx) const { if (cinit == NULL) cinit = llvm::Constant::getNullValue(llvmType); + // Allocate space for the static variable in global scope, so + // that it persists across function calls +#ifdef ISPC_NVPTX_ENABLED int addressSpace = 0; if (g->target->getISA() == Target::NVPTX && sym->type->IsConstType() && sym->type->IsUniformType()) addressSpace = 4; - - // Allocate space for the static variable in global scope, so - // that it persists across function calls sym->storagePtr = new llvm::GlobalVariable(*m->module, llvmType, sym->type->IsConstType(), llvm::GlobalValue::InternalLinkage, cinit, - llvm::Twine("static_") + + llvm::Twine("static.") + llvm::Twine(sym->pos.first_line) + - llvm::Twine("_") + sym->name.c_str(), + llvm::Twine(".") + sym->name.c_str(), NULL, llvm::GlobalVariable::NotThreadLocal, addressSpace); sym->storagePtr = lConvertToGenericPtr(ctx, sym->storagePtr, sym->pos); +#else /* ISPC_NVPTX_ENABLED */ + sym->storagePtr = + new llvm::GlobalVariable(*m->module, llvmType, + sym->type->IsConstType(), + llvm::GlobalValue::InternalLinkage, cinit, + llvm::Twine("static.") + + llvm::Twine(sym->pos.first_line) + + llvm::Twine(".") + sym->name.c_str()); +#endif /* ISPC_NVPTX_ENABLED */ // Tell the FunctionEmitContext about the variable ctx->EmitVariableDebugInfo(sym); } +#ifdef ISPC_NVPTX_ENABLED else if ((sym->type->IsUniformType() || sym->type->IsSOAType()) && /* NVPTX: * only 
non-constant uniform data types are stored in shared memory @@ -396,6 +407,7 @@ DeclStmt::EmitCode(FunctionEmitContext *ctx) const { sym->parentFunction = ctx->GetFunction(); InitSymbol(sym->storagePtr, sym->type, initExpr, ctx, sym->pos); } +#endif /* ISPC_NVPTX_ENABLED */ else { // For non-static variables, allocate storage on the stack @@ -404,7 +416,6 @@ DeclStmt::EmitCode(FunctionEmitContext *ctx) const { // Tell the FunctionEmitContext about the variable; must do // this before the initializer stuff. ctx->EmitVariableDebugInfo(sym); - if (initExpr == 0 && sym->type->IsConstType()) Error(sym->pos, "Missing initializer for const variable " "\"%s\".", sym->name.c_str()); @@ -412,7 +423,7 @@ DeclStmt::EmitCode(FunctionEmitContext *ctx) const { // And then get it initialized... sym->parentFunction = ctx->GetFunction(); InitSymbol(sym->storagePtr, sym->type, initExpr, ctx, sym->pos); - } + } } } @@ -571,7 +582,7 @@ IfStmt::EmitCode(FunctionEmitContext *ctx) const { if (testValue == NULL) return; - +#ifdef ISPC_NVPTX_ENABLED #if 0 if (!isUniform && g->target->getISA() == Target::NVPTX) { @@ -582,7 +593,7 @@ IfStmt::EmitCode(FunctionEmitContext *ctx) const { isUniform = true; } #endif - +#endif /* ISPC_NVPTX_ENABLED */ if (isUniform) { ctx->StartUniformIf(); @@ -865,11 +876,17 @@ IfStmt::emitMaskMixed(FunctionEmitContext *ctx, llvm::Value *oldMask, // Do any of the program instances want to run the 'true' // block? If not, jump ahead to bNext. 
-#if 1 - llvm::Value *maskAnyTrueQ = ctx->Any(ctx->GetFullMask()); -#else + +#ifdef ISPC_NVPTX_ENABLED +#if 0 llvm::Value *maskAnyTrueQ = ctx->ExtractInst(ctx->GetFullMask(),0); +#else + llvm::Value *maskAnyTrueQ = ctx->Any(ctx->GetFullMask()); #endif +#else /* ISPC_NVPTX_ENABLED */ + llvm::Value *maskAnyTrueQ = ctx->Any(ctx->GetFullMask()); +#endif /* ISPC_NVPTX_ENABLED */ + ctx->BranchInst(bRunTrue, bNext, maskAnyTrueQ); // Emit statements for true @@ -886,11 +903,16 @@ IfStmt::emitMaskMixed(FunctionEmitContext *ctx, llvm::Value *oldMask, // Similarly, check to see if any of the instances want to // run the 'false' block... -#if 1 - llvm::Value *maskAnyFalseQ = ctx->Any(ctx->GetFullMask()); -#else + +#ifdef ISPC_NVPTX_ENABLED +#if 0 llvm::Value *maskAnyFalseQ = ctx->ExtractInst(ctx->GetFullMask(),0); +#else + llvm::Value *maskAnyFalseQ = ctx->Any(ctx->GetFullMask()); #endif +#else /* ISPC_NVPTX_ENABLED */ + llvm::Value *maskAnyFalseQ = ctx->Any(ctx->GetFullMask()); +#endif /* ISPC_NVPTX_ENABLED */ ctx->BranchInst(bRunFalse, bDone, maskAnyFalseQ); // Emit code for false @@ -1450,10 +1472,96 @@ static llvm::Value * lUpdateVaryingCounter(int dim, int nDims, FunctionEmitContext *ctx, llvm::Value *uniformCounterPtr, llvm::Value *varyingCounterPtr, - const std::vector &spans) -{ - if (g->target->getISA() != Target::NVPTX) - { + const std::vector &spans) { +#ifdef ISPC_NVPTX_ENABLED + if (g->target->getISA() == Target::NVPTX) + { + // Smear the uniform counter value out to be varying + llvm::Value *counter = ctx->LoadInst(uniformCounterPtr); + llvm::Value *smearCounter = ctx->BroadcastValue( + counter, LLVMTypes::Int32VectorType, "smear_counter"); + + // Figure out the offsets; this is a little bit tricky. As an example, + // consider a 2D tiled foreach loop, where we're running 8-wide and + // where the inner dimension has a stride of 4 and the outer dimension + // has a stride of 2. 
For the inner dimension, we want the offsets + // (0,1,2,3,0,1,2,3), and for the outer dimension we want + // (0,0,0,0,1,1,1,1). + int32_t delta[ISPC_MAX_NVEC]; + const int vecWidth = 32; + std::vector constDeltaList; + for (int i = 0; i < vecWidth; ++i) + { + int d = i; + // First, account for the effect of any dimensions at deeper + // nesting levels than the current one. + int prevDimSpanCount = 1; + for (int j = dim; j < nDims-1; ++j) + prevDimSpanCount *= spans[j+1]; + d /= prevDimSpanCount; + + // And now with what's left, figure out our own offset + delta[i] = d % spans[dim]; + constDeltaList.push_back(LLVMInt8(delta[i])); + } + + llvm::ArrayType* ArrayDelta = llvm::ArrayType::get(LLVMTypes::Int8Type, 32); + // llvm::PointerType::get(ArrayDelta, 4); /* constant memory */ + + + llvm::GlobalVariable* globalDelta = new llvm::GlobalVariable( + /*Module=*/*m->module, + /*Type=*/ArrayDelta, + /*isConstant=*/true, + /*Linkage=*/llvm::GlobalValue::PrivateLinkage, + /*Initializer=*/0, // has initializer, specified below + /*Name=*/"constDeltaForeach"); +#if 0 + /*ThreadLocalMode=*/llvm::GlobalVariable::NotThreadLocal, + /*unsigned AddressSpace=*/4 /*constant*/); +#endif + + + llvm::Constant* constDelta = llvm::ConstantArray::get(ArrayDelta, constDeltaList); + + globalDelta->setInitializer(constDelta); + llvm::Function *func_program_index = m->module->getFunction("__program_index"); + llvm::Value *laneIdx = ctx->CallInst(func_program_index, NULL, std::vector(), "foreach__programIndex"); + + std::vector ptr_arrayidx_indices; + ptr_arrayidx_indices.push_back(LLVMInt32(0)); + ptr_arrayidx_indices.push_back(laneIdx); +#if 1 + llvm::Instruction* ptr_arrayidx = llvm::GetElementPtrInst::Create(globalDelta, ptr_arrayidx_indices, "arrayidx", ctx->GetCurrentBasicBlock()); + llvm::LoadInst* int8_39 = new llvm::LoadInst(ptr_arrayidx, "", false, ctx->GetCurrentBasicBlock()); + llvm::Value * int32_39 = ctx->ZExtInst(int8_39, LLVMTypes::Int32Type); + + llvm::VectorType* VectorTy_2 = 
llvm::VectorType::get(llvm::IntegerType::get(*g->ctx, 32), 1); + llvm::UndefValue* const_packed_41 = llvm::UndefValue::get(VectorTy_2); + + llvm::InsertElementInst* packed_43 = llvm::InsertElementInst::Create( + // llvm::UndefValue(LLVMInt32Vector), + const_packed_41, + int32_39, LLVMInt32(0), "", ctx->GetCurrentBasicBlock()); +#endif + + + // Add the deltas to compute the varying counter values; store the + // result to memory and then return it directly as well. +#if 0 + llvm::Value *varyingCounter = + ctx->BinaryOperator(llvm::Instruction::Add, smearCounter, + LLVMInt32Vector(delta), "iter_val"); +#else + llvm::Value *varyingCounter = + ctx->BinaryOperator(llvm::Instruction::Add, smearCounter, + packed_43, "iter_val"); +#endif + ctx->StoreInst(varyingCounter, varyingCounterPtr); + return varyingCounter; + } +#endif /* ISPC_NVPTX_ENABLED */ + // Smear the uniform counter value out to be varying llvm::Value *counter = ctx->LoadInst(uniformCounterPtr); llvm::Value *smearCounter = ctx->BroadcastValue( @@ -1486,93 +1594,6 @@ lUpdateVaryingCounter(int dim, int nDims, FunctionEmitContext *ctx, LLVMInt32Vector(delta), "iter_val"); ctx->StoreInst(varyingCounter, varyingCounterPtr); return varyingCounter; - } - else /* NVPTX == true */ - { - // Smear the uniform counter value out to be varying - llvm::Value *counter = ctx->LoadInst(uniformCounterPtr); - llvm::Value *smearCounter = ctx->BroadcastValue( - counter, LLVMTypes::Int32VectorType, "smear_counter"); - - // Figure out the offsets; this is a little bit tricky. As an example, - // consider a 2D tiled foreach loop, where we're running 8-wide and - // where the inner dimension has a stride of 4 and the outer dimension - // has a stride of 2. For the inner dimension, we want the offsets - // (0,1,2,3,0,1,2,3), and for the outer dimension we want - // (0,0,0,0,1,1,1,1). 
- int32_t delta[ISPC_MAX_NVEC]; - const int vecWidth = 32; - std::vector constDeltaList; - for (int i = 0; i < vecWidth; ++i) - { - int d = i; - // First, account for the effect of any dimensions at deeper - // nesting levels than the current one. - int prevDimSpanCount = 1; - for (int j = dim; j < nDims-1; ++j) - prevDimSpanCount *= spans[j+1]; - d /= prevDimSpanCount; - - // And now with what's left, figure out our own offset - delta[i] = d % spans[dim]; - constDeltaList.push_back(LLVMInt8(delta[i])); - } - - llvm::ArrayType* ArrayDelta = llvm::ArrayType::get(LLVMTypes::Int8Type, 32); -// llvm::PointerType::get(ArrayDelta, 4); /* constant memory */ - - - llvm::GlobalVariable* globalDelta = new llvm::GlobalVariable( - /*Module=*/*m->module, - /*Type=*/ArrayDelta, - /*isConstant=*/true, - /*Linkage=*/llvm::GlobalValue::PrivateLinkage, - /*Initializer=*/0, // has initializer, specified below - /*Name=*/"constDeltaForeach"); -#if 0 - /*ThreadLocalMode=*/llvm::GlobalVariable::NotThreadLocal, - /*unsigned AddressSpace=*/4 /*constant*/); -#endif - - - llvm::Constant* constDelta = llvm::ConstantArray::get(ArrayDelta, constDeltaList); - - globalDelta->setInitializer(constDelta); - llvm::Function *func_program_index = m->module->getFunction("__program_index"); - llvm::Value *laneIdx = ctx->CallInst(func_program_index, NULL, std::vector(), "foreach__programIndex"); - - std::vector ptr_arrayidx_indices; - ptr_arrayidx_indices.push_back(LLVMInt32(0)); - ptr_arrayidx_indices.push_back(laneIdx); -#if 1 - llvm::Instruction* ptr_arrayidx = llvm::GetElementPtrInst::Create(globalDelta, ptr_arrayidx_indices, "arrayidx", ctx->GetCurrentBasicBlock()); - llvm::LoadInst* int8_39 = new llvm::LoadInst(ptr_arrayidx, "", false, ctx->GetCurrentBasicBlock()); - llvm::Value * int32_39 = ctx->ZExtInst(int8_39, LLVMTypes::Int32Type); - - llvm::VectorType* VectorTy_2 = llvm::VectorType::get(llvm::IntegerType::get(*g->ctx, 32), 1); - llvm::UndefValue* const_packed_41 = 
llvm::UndefValue::get(VectorTy_2); - - llvm::InsertElementInst* packed_43 = llvm::InsertElementInst::Create( -// llvm::UndefValue(LLVMInt32Vector), - const_packed_41, - int32_39, LLVMInt32(0), "", ctx->GetCurrentBasicBlock()); -#endif - - - // Add the deltas to compute the varying counter values; store the - // result to memory and then return it directly as well. -#if 0 - llvm::Value *varyingCounter = - ctx->BinaryOperator(llvm::Instruction::Add, smearCounter, - LLVMInt32Vector(delta), "iter_val"); -#else - llvm::Value *varyingCounter = - ctx->BinaryOperator(llvm::Instruction::Add, smearCounter, - packed_43, "iter_val"); -#endif - ctx->StoreInst(varyingCounter, varyingCounterPtr); - return varyingCounter; - } } @@ -1650,7 +1671,7 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const { // This should be caught during typechecking AssertPos(pos, startExprs.size() == dimVariables.size() && - endExprs.size() == dimVariables.size()); + endExprs.size() == dimVariables.size()); int nDims = (int)dimVariables.size(); /////////////////////////////////////////////////////////////////////// @@ -1661,66 +1682,70 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const { std::vector nExtras, alignedEnd, extrasMaskPtrs; std::vector span(nDims, 0); +#ifdef ISPC_NVPTX_ENABLED const int vectorWidth = g->target->getISA() == Target::NVPTX ? 32 : g->target->getVectorWidth(); lGetSpans(nDims-1, nDims, vectorWidth, isTiled, &span[0]); +#else /* ISPC_NVPTX_ENABLED */ + lGetSpans(nDims-1, nDims, g->target->getVectorWidth(), isTiled, &span[0]); +#endif /* ISPC_NVPTX_ENABLED */ for (int i = 0; i < nDims; ++i) { - // Basic blocks that we'll fill in later with the looping logic for - // this dimension. 
- bbReset.push_back(ctx->CreateBasicBlock("foreach_reset")); - if (i < nDims-1) - // stepping for the innermost dimension is handled specially - bbStep.push_back(ctx->CreateBasicBlock("foreach_step")); - bbTest.push_back(ctx->CreateBasicBlock("foreach_test")); + // Basic blocks that we'll fill in later with the looping logic for + // this dimension. + bbReset.push_back(ctx->CreateBasicBlock("foreach_reset")); + if (i < nDims-1) + // stepping for the innermost dimension is handled specially + bbStep.push_back(ctx->CreateBasicBlock("foreach_step")); + bbTest.push_back(ctx->CreateBasicBlock("foreach_test")); - // Start and end value for this loop dimension - llvm::Value *sv = startExprs[i]->GetValue(ctx); - llvm::Value *ev = endExprs[i]->GetValue(ctx); - if (sv == NULL || ev == NULL) - return; - startVals.push_back(sv); - endVals.push_back(ev); + // Start and end value for this loop dimension + llvm::Value *sv = startExprs[i]->GetValue(ctx); + llvm::Value *ev = endExprs[i]->GetValue(ctx); + if (sv == NULL || ev == NULL) + return; + startVals.push_back(sv); + endVals.push_back(ev); - // nItems = endVal - startVal - llvm::Value *nItems = - ctx->BinaryOperator(llvm::Instruction::Sub, ev, sv, "nitems"); + // nItems = endVal - startVal + llvm::Value *nItems = + ctx->BinaryOperator(llvm::Instruction::Sub, ev, sv, "nitems"); - // nExtras = nItems % (span for this dimension) - // This gives us the number of extra elements we need to deal with - // at the end of the loop for this dimension that don't fit cleanly - // into a vector width. - nExtras.push_back(ctx->BinaryOperator(llvm::Instruction::SRem, nItems, - LLVMInt32(span[i]), "nextras")); + // nExtras = nItems % (span for this dimension) + // This gives us the number of extra elements we need to deal with + // at the end of the loop for this dimension that don't fit cleanly + // into a vector width. 
+ nExtras.push_back(ctx->BinaryOperator(llvm::Instruction::SRem, nItems, + LLVMInt32(span[i]), "nextras")); - // alignedEnd = endVal - nExtras - alignedEnd.push_back(ctx->BinaryOperator(llvm::Instruction::Sub, ev, - nExtras[i], "aligned_end")); + // alignedEnd = endVal - nExtras + alignedEnd.push_back(ctx->BinaryOperator(llvm::Instruction::Sub, ev, + nExtras[i], "aligned_end")); - /////////////////////////////////////////////////////////////////////// - // Each dimension has a loop counter that is a uniform value that - // goes from startVal to endVal, in steps of the span for this - // dimension. Its value is only used internally here for looping - // logic and isn't directly available in the user's program code. - uniformCounterPtrs.push_back(ctx->AllocaInst(LLVMTypes::Int32Type, - "counter")); - ctx->StoreInst(startVals[i], uniformCounterPtrs[i]); + /////////////////////////////////////////////////////////////////////// + // Each dimension has a loop counter that is a uniform value that + // goes from startVal to endVal, in steps of the span for this + // dimension. Its value is only used internally here for looping + // logic and isn't directly available in the user's program code. + uniformCounterPtrs.push_back(ctx->AllocaInst(LLVMTypes::Int32Type, + "counter")); + ctx->StoreInst(startVals[i], uniformCounterPtrs[i]); - // There is also a varying variable that holds the set of index - // values for each dimension in the current loop iteration; this is - // the value that is program-visible. - dimVariables[i]->storagePtr = - ctx->AllocaInst(LLVMTypes::Int32VectorType, - dimVariables[i]->name.c_str()); - dimVariables[i]->parentFunction = ctx->GetFunction(); - ctx->EmitVariableDebugInfo(dimVariables[i]); + // There is also a varying variable that holds the set of index + // values for each dimension in the current loop iteration; this is + // the value that is program-visible. 
+ dimVariables[i]->storagePtr = + ctx->AllocaInst(LLVMTypes::Int32VectorType, + dimVariables[i]->name.c_str()); + dimVariables[i]->parentFunction = ctx->GetFunction(); + ctx->EmitVariableDebugInfo(dimVariables[i]); - // Each dimension also maintains a mask that represents which of - // the varying elements in the current iteration should be - // processed. (i.e. this is used to disable the lanes that have - // out-of-bounds offsets.) - extrasMaskPtrs.push_back(ctx->AllocaInst(LLVMTypes::MaskType, "extras mask")); - ctx->StoreInst(LLVMMaskAllOn, extrasMaskPtrs[i]); + // Each dimension also maintains a mask that represents which of + // the varying elements in the current iteration should be + // processed. (i.e. this is used to disable the lanes that have + // out-of-bounds offsets.) + extrasMaskPtrs.push_back(ctx->AllocaInst(LLVMTypes::MaskType, "extras mask")); + ctx->StoreInst(LLVMMaskAllOn, extrasMaskPtrs[i]); } ctx->StartForeach(FunctionEmitContext::FOREACH_REGULAR); @@ -1733,14 +1758,14 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const { // a given dimension in preparation for running through its loop again, // after the enclosing level advances its counter. for (int i = 0; i < nDims; ++i) { - ctx->SetCurrentBasicBlock(bbReset[i]); - if (i == 0) - ctx->BranchInst(bbExit); - else { - ctx->StoreInst(LLVMMaskAllOn, extrasMaskPtrs[i]); - ctx->StoreInst(startVals[i], uniformCounterPtrs[i]); - ctx->BranchInst(bbStep[i-1]); - } + ctx->SetCurrentBasicBlock(bbReset[i]); + if (i == 0) + ctx->BranchInst(bbExit); + else { + ctx->StoreInst(LLVMMaskAllOn, extrasMaskPtrs[i]); + ctx->StoreInst(startVals[i], uniformCounterPtrs[i]); + ctx->BranchInst(bbStep[i-1]); + } } /////////////////////////////////////////////////////////////////////////// @@ -1750,67 +1775,67 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const { // this for the innermost dimension, which has a more complex stepping // structure.. 
for (int i = 0; i < nDims-1; ++i) { - ctx->SetCurrentBasicBlock(bbStep[i]); - llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[i]); - llvm::Value *newCounter = - ctx->BinaryOperator(llvm::Instruction::Add, counter, - LLVMInt32(span[i]), "new_counter"); - ctx->StoreInst(newCounter, uniformCounterPtrs[i]); - ctx->BranchInst(bbTest[i]); + ctx->SetCurrentBasicBlock(bbStep[i]); + llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[i]); + llvm::Value *newCounter = + ctx->BinaryOperator(llvm::Instruction::Add, counter, + LLVMInt32(span[i]), "new_counter"); + ctx->StoreInst(newCounter, uniformCounterPtrs[i]); + ctx->BranchInst(bbTest[i]); } /////////////////////////////////////////////////////////////////////////// // foreach_test (for all dimensions other than the innermost...) std::vector inExtras; for (int i = 0; i < nDims-1; ++i) { - ctx->SetCurrentBasicBlock(bbTest[i]); + ctx->SetCurrentBasicBlock(bbTest[i]); - llvm::Value *haveExtras = - ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SGT, - endVals[i], alignedEnd[i], "have_extras"); + llvm::Value *haveExtras = + ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SGT, + endVals[i], alignedEnd[i], "have_extras"); - llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[i], "counter"); - llvm::Value *atAlignedEnd = - ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, - counter, alignedEnd[i], "at_aligned_end"); - llvm::Value *inEx = - ctx->BinaryOperator(llvm::Instruction::And, haveExtras, - atAlignedEnd, "in_extras"); + llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[i], "counter"); + llvm::Value *atAlignedEnd = + ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, + counter, alignedEnd[i], "at_aligned_end"); + llvm::Value *inEx = + ctx->BinaryOperator(llvm::Instruction::And, haveExtras, + atAlignedEnd, "in_extras"); - if (i == 0) - inExtras.push_back(inEx); - else - inExtras.push_back(ctx->BinaryOperator(llvm::Instruction::Or, inEx, - inExtras[i-1], 
"in_extras_all")); + if (i == 0) + inExtras.push_back(inEx); + else + inExtras.push_back(ctx->BinaryOperator(llvm::Instruction::Or, inEx, + inExtras[i-1], "in_extras_all")); - llvm::Value *varyingCounter = - lUpdateVaryingCounter(i, nDims, ctx, uniformCounterPtrs[i], - dimVariables[i]->storagePtr, span); + llvm::Value *varyingCounter = + lUpdateVaryingCounter(i, nDims, ctx, uniformCounterPtrs[i], + dimVariables[i]->storagePtr, span); - llvm::Value *smearEnd = ctx->BroadcastValue( - endVals[i], LLVMTypes::Int32VectorType, "smear_end"); + llvm::Value *smearEnd = ctx->BroadcastValue( + endVals[i], LLVMTypes::Int32VectorType, "smear_end"); - // Do a vector compare of its value to the end value to generate a - // mask for this last bit of work. - llvm::Value *emask = - ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, - varyingCounter, smearEnd); - emask = ctx->I1VecToBoolVec(emask); + // Do a vector compare of its value to the end value to generate a + // mask for this last bit of work. 
+ llvm::Value *emask = + ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, + varyingCounter, smearEnd); + emask = ctx->I1VecToBoolVec(emask); - if (i == 0) - ctx->StoreInst(emask, extrasMaskPtrs[i]); - else { - llvm::Value *oldMask = ctx->LoadInst(extrasMaskPtrs[i-1]); - llvm::Value *newMask = - ctx->BinaryOperator(llvm::Instruction::And, oldMask, emask, - "extras_mask"); - ctx->StoreInst(newMask, extrasMaskPtrs[i]); - } + if (i == 0) + ctx->StoreInst(emask, extrasMaskPtrs[i]); + else { + llvm::Value *oldMask = ctx->LoadInst(extrasMaskPtrs[i-1]); + llvm::Value *newMask = + ctx->BinaryOperator(llvm::Instruction::And, oldMask, emask, + "extras_mask"); + ctx->StoreInst(newMask, extrasMaskPtrs[i]); + } - llvm::Value *notAtEnd = - ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, - counter, endVals[i]); - ctx->BranchInst(bbTest[i+1], bbReset[i], notAtEnd); + llvm::Value *notAtEnd = + ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, + counter, endVals[i]); + ctx->BranchInst(bbTest[i+1], bbReset[i], notAtEnd); } /////////////////////////////////////////////////////////////////////////// @@ -1847,18 +1872,18 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const { // (i.e. processing extra elements that don't exactly fit into a // vector). llvm::BasicBlock *bbOuterInExtras = - ctx->CreateBasicBlock("outer_in_extras"); + ctx->CreateBasicBlock("outer_in_extras"); llvm::BasicBlock *bbOuterNotInExtras = - ctx->CreateBasicBlock("outer_not_in_extras"); + ctx->CreateBasicBlock("outer_not_in_extras"); ctx->SetCurrentBasicBlock(bbTest[nDims-1]); if (inExtras.size()) - ctx->BranchInst(bbOuterInExtras, bbOuterNotInExtras, - inExtras.back()); + ctx->BranchInst(bbOuterInExtras, bbOuterNotInExtras, + inExtras.back()); else - // for a 1D iteration domain, we certainly don't have any enclosing - // dimensions that are processing extra elements. 
- ctx->BranchInst(bbOuterNotInExtras); + // for a 1D iteration domain, we certainly don't have any enclosing + // dimensions that are processing extra elements. + ctx->BranchInst(bbOuterNotInExtras); /////////////////////////////////////////////////////////////////////////// // One or more outer dimensions in extras, so we need to mask for the loop @@ -1873,21 +1898,21 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const { // // run loop body with mask // } llvm::BasicBlock *bbAllInnerPartialOuter = - ctx->CreateBasicBlock("all_inner_partial_outer"); + ctx->CreateBasicBlock("all_inner_partial_outer"); llvm::BasicBlock *bbPartial = - ctx->CreateBasicBlock("both_partial"); + ctx->CreateBasicBlock("both_partial"); ctx->SetCurrentBasicBlock(bbOuterInExtras); { - // Update the varying counter value here, since all subsequent - // blocks along this path need it. - lUpdateVaryingCounter(nDims-1, nDims, ctx, uniformCounterPtrs[nDims-1], - dimVariables[nDims-1]->storagePtr, span); + // Update the varying counter value here, since all subsequent + // blocks along this path need it. 
+ lUpdateVaryingCounter(nDims-1, nDims, ctx, uniformCounterPtrs[nDims-1], + dimVariables[nDims-1]->storagePtr, span); - // here we just check to see if counter < alignedEnd - llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1], "counter"); - llvm::Value *beforeAlignedEnd = - ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, - counter, alignedEnd[nDims-1], "before_aligned_end"); - ctx->BranchInst(bbAllInnerPartialOuter, bbPartial, beforeAlignedEnd); + // here we just check to see if counter < alignedEnd + llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1], "counter"); + llvm::Value *beforeAlignedEnd = + ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, + counter, alignedEnd[nDims-1], "before_aligned_end"); + ctx->BranchInst(bbAllInnerPartialOuter, bbPartial, beforeAlignedEnd); } // Below we have a basic block that runs the loop body code for the @@ -1906,53 +1931,53 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const { // should step the loop counter for the next enclosing dimension // instead. llvm::Value *stepIndexAfterMaskedBodyPtr = - ctx->AllocaInst(LLVMTypes::BoolType, "step_index"); + ctx->AllocaInst(LLVMTypes::BoolType, "step_index"); /////////////////////////////////////////////////////////////////////////// // We're in the inner loop part where the only masking is due to outer // dimensions but the innermost dimension fits fully into a vector's // width. Set the mask and jump to the masked loop body. 
ctx->SetCurrentBasicBlock(bbAllInnerPartialOuter); { - llvm::Value *mask; - if (nDims == 1) - // 1D loop; we shouldn't ever get here anyway - mask = LLVMMaskAllOff; - else - mask = ctx->LoadInst(extrasMaskPtrs[nDims-2]); + llvm::Value *mask; + if (nDims == 1) + // 1D loop; we shouldn't ever get here anyway + mask = LLVMMaskAllOff; + else + mask = ctx->LoadInst(extrasMaskPtrs[nDims-2]); - ctx->SetInternalMask(mask); + ctx->SetInternalMask(mask); - ctx->StoreInst(LLVMTrue, stepIndexAfterMaskedBodyPtr); - ctx->BranchInst(bbMaskedBody); + ctx->StoreInst(LLVMTrue, stepIndexAfterMaskedBodyPtr); + ctx->BranchInst(bbMaskedBody); } /////////////////////////////////////////////////////////////////////////// // We need to include the effect of the innermost dimension in the mask // for the final bits here ctx->SetCurrentBasicBlock(bbPartial); { - llvm::Value *varyingCounter = - ctx->LoadInst(dimVariables[nDims-1]->storagePtr); - llvm::Value *smearEnd = ctx->BroadcastValue( - endVals[nDims-1], LLVMTypes::Int32VectorType, "smear_end"); + llvm::Value *varyingCounter = + ctx->LoadInst(dimVariables[nDims-1]->storagePtr); + llvm::Value *smearEnd = ctx->BroadcastValue( + endVals[nDims-1], LLVMTypes::Int32VectorType, "smear_end"); - llvm::Value *emask = - ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, - varyingCounter, smearEnd); - emask = ctx->I1VecToBoolVec(emask); + llvm::Value *emask = + ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, + varyingCounter, smearEnd); + emask = ctx->I1VecToBoolVec(emask); - if (nDims == 1) { - ctx->SetInternalMask(emask); - } - else { - llvm::Value *oldMask = ctx->LoadInst(extrasMaskPtrs[nDims-2]); - llvm::Value *newMask = - ctx->BinaryOperator(llvm::Instruction::And, oldMask, emask, - "extras_mask"); - ctx->SetInternalMask(newMask); - } + if (nDims == 1) { + ctx->SetInternalMask(emask); + } + else { + llvm::Value *oldMask = ctx->LoadInst(extrasMaskPtrs[nDims-2]); + llvm::Value *newMask = + 
ctx->BinaryOperator(llvm::Instruction::And, oldMask, emask, + "extras_mask"); + ctx->SetInternalMask(newMask); + } - ctx->StoreInst(LLVMFalse, stepIndexAfterMaskedBodyPtr); - ctx->BranchInst(bbMaskedBody); + ctx->StoreInst(LLVMFalse, stepIndexAfterMaskedBodyPtr); + ctx->BranchInst(bbMaskedBody); } /////////////////////////////////////////////////////////////////////////// @@ -1968,14 +1993,14 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const { // // run loop body with mask // } llvm::BasicBlock *bbPartialInnerAllOuter = - ctx->CreateBasicBlock("partial_inner_all_outer"); + ctx->CreateBasicBlock("partial_inner_all_outer"); ctx->SetCurrentBasicBlock(bbOuterNotInExtras); { - llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1], "counter"); - llvm::Value *beforeAlignedEnd = - ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, - counter, alignedEnd[nDims-1], "before_aligned_end"); - ctx->BranchInst(bbFullBody, bbPartialInnerAllOuter, - beforeAlignedEnd); + llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1], "counter"); + llvm::Value *beforeAlignedEnd = + ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, + counter, alignedEnd[nDims-1], "before_aligned_end"); + ctx->BranchInst(bbFullBody, bbPartialInnerAllOuter, + beforeAlignedEnd); } /////////////////////////////////////////////////////////////////////////// @@ -1985,26 +2010,26 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const { // value of the varying loop counter and have the statements in the // loop body emit their code. 
llvm::BasicBlock *bbFullBodyContinue = - ctx->CreateBasicBlock("foreach_full_continue"); + ctx->CreateBasicBlock("foreach_full_continue"); ctx->SetCurrentBasicBlock(bbFullBody); { - ctx->SetInternalMask(LLVMMaskAllOn); - ctx->SetBlockEntryMask(LLVMMaskAllOn); - lUpdateVaryingCounter(nDims-1, nDims, ctx, uniformCounterPtrs[nDims-1], - dimVariables[nDims-1]->storagePtr, span); - ctx->SetContinueTarget(bbFullBodyContinue); - ctx->AddInstrumentationPoint("foreach loop body (all on)"); - stmts->EmitCode(ctx); - AssertPos(pos, ctx->GetCurrentBasicBlock() != NULL); - ctx->BranchInst(bbFullBodyContinue); + ctx->SetInternalMask(LLVMMaskAllOn); + ctx->SetBlockEntryMask(LLVMMaskAllOn); + lUpdateVaryingCounter(nDims-1, nDims, ctx, uniformCounterPtrs[nDims-1], + dimVariables[nDims-1]->storagePtr, span); + ctx->SetContinueTarget(bbFullBodyContinue); + ctx->AddInstrumentationPoint("foreach loop body (all on)"); + stmts->EmitCode(ctx); + AssertPos(pos, ctx->GetCurrentBasicBlock() != NULL); + ctx->BranchInst(bbFullBodyContinue); } ctx->SetCurrentBasicBlock(bbFullBodyContinue); { - ctx->RestoreContinuedLanes(); - llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1]); - llvm::Value *newCounter = - ctx->BinaryOperator(llvm::Instruction::Add, counter, - LLVMInt32(span[nDims-1]), "new_counter"); - ctx->StoreInst(newCounter, uniformCounterPtrs[nDims-1]); - ctx->BranchInst(bbOuterNotInExtras); + ctx->RestoreContinuedLanes(); + llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1]); + llvm::Value *newCounter = + ctx->BinaryOperator(llvm::Instruction::Add, counter, + LLVMInt32(span[nDims-1]), "new_counter"); + ctx->StoreInst(newCounter, uniformCounterPtrs[nDims-1]); + ctx->BranchInst(bbOuterNotInExtras); } /////////////////////////////////////////////////////////////////////////// @@ -2012,33 +2037,33 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const { // less than the end value, in which case we need to run the body one // more time to get the extra bits. 
llvm::BasicBlock *bbSetInnerMask = - ctx->CreateBasicBlock("partial_inner_only"); + ctx->CreateBasicBlock("partial_inner_only"); ctx->SetCurrentBasicBlock(bbPartialInnerAllOuter); { - llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1], "counter"); - llvm::Value *beforeFullEnd = - ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, - counter, endVals[nDims-1], "before_full_end"); - ctx->BranchInst(bbSetInnerMask, bbReset[nDims-1], beforeFullEnd); + llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1], "counter"); + llvm::Value *beforeFullEnd = + ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, + counter, endVals[nDims-1], "before_full_end"); + ctx->BranchInst(bbSetInnerMask, bbReset[nDims-1], beforeFullEnd); } /////////////////////////////////////////////////////////////////////////// // The outer dimensions are all on, so the mask is just given by the // mask for the innermost dimension ctx->SetCurrentBasicBlock(bbSetInnerMask); { - llvm::Value *varyingCounter = - lUpdateVaryingCounter(nDims-1, nDims, ctx, uniformCounterPtrs[nDims-1], - dimVariables[nDims-1]->storagePtr, span); - llvm::Value *smearEnd = ctx->BroadcastValue( - endVals[nDims-1], LLVMTypes::Int32VectorType, "smear_end"); - llvm::Value *emask = - ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, - varyingCounter, smearEnd); - emask = ctx->I1VecToBoolVec(emask); - ctx->SetInternalMask(emask); - ctx->SetBlockEntryMask(emask); + llvm::Value *varyingCounter = + lUpdateVaryingCounter(nDims-1, nDims, ctx, uniformCounterPtrs[nDims-1], + dimVariables[nDims-1]->storagePtr, span); + llvm::Value *smearEnd = ctx->BroadcastValue( + endVals[nDims-1], LLVMTypes::Int32VectorType, "smear_end"); + llvm::Value *emask = + ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, + varyingCounter, smearEnd); + emask = ctx->I1VecToBoolVec(emask); + ctx->SetInternalMask(emask); + ctx->SetBlockEntryMask(emask); - ctx->StoreInst(LLVMFalse, 
stepIndexAfterMaskedBodyPtr); - ctx->BranchInst(bbMaskedBody); + ctx->StoreInst(LLVMFalse, stepIndexAfterMaskedBodyPtr); + ctx->BranchInst(bbMaskedBody); } /////////////////////////////////////////////////////////////////////////// @@ -2048,34 +2073,34 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const { // mask known to be all-on, which in turn leads to more efficient code // for that case. llvm::BasicBlock *bbStepInnerIndex = - ctx->CreateBasicBlock("step_inner_index"); + ctx->CreateBasicBlock("step_inner_index"); llvm::BasicBlock *bbMaskedBodyContinue = - ctx->CreateBasicBlock("foreach_masked_continue"); + ctx->CreateBasicBlock("foreach_masked_continue"); ctx->SetCurrentBasicBlock(bbMaskedBody); { - ctx->AddInstrumentationPoint("foreach loop body (masked)"); - ctx->SetContinueTarget(bbMaskedBodyContinue); - ctx->DisableGatherScatterWarnings(); - ctx->SetBlockEntryMask(ctx->GetFullMask()); - stmts->EmitCode(ctx); - ctx->EnableGatherScatterWarnings(); - ctx->BranchInst(bbMaskedBodyContinue); + ctx->AddInstrumentationPoint("foreach loop body (masked)"); + ctx->SetContinueTarget(bbMaskedBodyContinue); + ctx->DisableGatherScatterWarnings(); + ctx->SetBlockEntryMask(ctx->GetFullMask()); + stmts->EmitCode(ctx); + ctx->EnableGatherScatterWarnings(); + ctx->BranchInst(bbMaskedBodyContinue); } ctx->SetCurrentBasicBlock(bbMaskedBodyContinue); { - ctx->RestoreContinuedLanes(); - llvm::Value *stepIndex = ctx->LoadInst(stepIndexAfterMaskedBodyPtr); - ctx->BranchInst(bbStepInnerIndex, bbReset[nDims-1], stepIndex); + ctx->RestoreContinuedLanes(); + llvm::Value *stepIndex = ctx->LoadInst(stepIndexAfterMaskedBodyPtr); + ctx->BranchInst(bbStepInnerIndex, bbReset[nDims-1], stepIndex); } /////////////////////////////////////////////////////////////////////////// // step the innermost index, for the case where we're doing the // innermost for loop over full vectors. 
ctx->SetCurrentBasicBlock(bbStepInnerIndex); { - llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1]); - llvm::Value *newCounter = - ctx->BinaryOperator(llvm::Instruction::Add, counter, - LLVMInt32(span[nDims-1]), "new_counter"); - ctx->StoreInst(newCounter, uniformCounterPtrs[nDims-1]); - ctx->BranchInst(bbOuterInExtras); + llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1]); + llvm::Value *newCounter = + ctx->BinaryOperator(llvm::Instruction::Add, counter, + LLVMInt32(span[nDims-1]), "new_counter"); + ctx->StoreInst(newCounter, uniformCounterPtrs[nDims-1]); + ctx->BranchInst(bbOuterInExtras); } /////////////////////////////////////////////////////////////////////////// @@ -2262,8 +2287,12 @@ ForeachActiveStmt::EmitCode(FunctionEmitContext *ctx) const { // math...) // Get the "program index" vector value +#ifdef ISPC_NVPTX_ENABLED llvm::Value *programIndex = g->target->getISA() == Target::NVPTX ? ctx->ProgramIndexVectorPTX() : ctx->ProgramIndexVector(); +#else /* ISPC_NVPTX_ENABLED */ + llvm::Value *programIndex = ctx->ProgramIndexVector(); +#endif /* ISPC_NVPTX_ENABLED */ // And smear the current lane out to a vector llvm::Value *firstSet32 = @@ -2460,19 +2489,22 @@ ForeachUniqueStmt::EmitCode(FunctionEmitContext *ctx) const { // And load the corresponding element value from the temporary // memory storing the value of the varying expr. 
llvm::Value *uniqueValue; - if (g->target->getISA() != Target::NVPTX) - { - llvm::Value *uniqueValuePtr = - ctx->GetElementPtrInst(exprMem, LLVMInt64(0), firstSet, exprPtrType, - "unique_index_ptr"); - uniqueValue = ctx->LoadInst(uniqueValuePtr, "unique_value"); - } - else /* in case of PTX target, use __shfl PTX intrinsics via __insert/__extract function */ +#ifdef ISPC_NVPTX_ENABLED + if (g->target->getISA() == Target::NVPTX) { llvm::Value *firstSet32 = ctx->TruncInst(firstSet, LLVMTypes::Int32Type); uniqueValue = ctx->Extract(exprValue, firstSet32); } - + else + { +#endif /* ISPC_NVPTX_ENABLED */ + llvm::Value *uniqueValuePtr = + ctx->GetElementPtrInst(exprMem, LLVMInt64(0), firstSet, exprPtrType, + "unique_index_ptr"); + uniqueValue = ctx->LoadInst(uniqueValuePtr, "unique_value"); +#ifdef ISPC_NVPTX_ENABLED + } +#endif /* ISPC_NVPTX_ENABLED */ // If it's a varying pointer type, need to convert from the int // type we store in the vector to the actual pointer type if (llvm::dyn_cast(symType) != NULL) @@ -3379,8 +3411,12 @@ PrintStmt::EmitCode(FunctionEmitContext *ctx) const { } // Now we can emit code to call __do_print() +#ifdef ISPC_NVPTX_ENABLED llvm::Function *printFunc = g->target->getISA() != Target::NVPTX ? 
m->module->getFunction("__do_print") : m->module->getFunction("__do_print_nvptx"); +#else /* ISPC_NVPTX_ENABLED */ + llvm::Function *printFunc = m->module->getFunction("__do_print"); +#endif /* ISPC_NVPTX_ENABLED */ AssertPos(pos, printFunc); llvm::Value *mask = ctx->GetFullMask(); diff --git a/type.cpp b/type.cpp index 8e713cce..db2c7ea7 100644 --- a/type.cpp +++ b/type.cpp @@ -751,7 +751,7 @@ EnumType::Mangle() const { std::string ret; if (isConst) ret += "C"; ret += variability.MangleString(); - ret += std::string("enum_5B_") + name + std::string("_5C_"); + ret += std::string("enum[") + name + std::string("]"); return ret; } @@ -1433,7 +1433,7 @@ ArrayType::Mangle() const { sprintf(buf, "%d", numElements); else buf[0] = '\0'; - return s + "_5B_" + buf + "_5C_"; + return s + "[" + buf + "]"; } @@ -2106,12 +2106,12 @@ lMangleStruct(Variability variability, bool isConst, const std::string &name) { Assert(variability != Variability::Unbound); std::string ret; - ret += "s_5B_"; + ret += "s["; if (isConst) ret += "_c_"; ret += variability.MangleString(); - ret += name + std::string("_5C_"); + ret += name + std::string("]"); return ret; } @@ -3057,7 +3057,11 @@ FunctionType::LLVMFunctionType(llvm::LLVMContext *ctx, bool removeMask) const { llvmArgTypes.push_back(LLVMTypes::MaskType); std::vector callTypes; - if (isTask && g->target->getISA() != Target::NVPTX) { + if (isTask +#ifdef ISPC_NVPTX_ENABLED + && (g->target->getISA() != Target::NVPTX) +#endif + ){ // Tasks take three arguments: a pointer to a struct that holds the // actual task arguments, the thread index, and the total number of // threads the tasks system has running. (Task arguments are