diff --git a/Makefile b/Makefile index 98729bfc..5bac4a6e 100644 --- a/Makefile +++ b/Makefile @@ -39,6 +39,10 @@ LLVM_CONFIG=$(shell which llvm-config) CLANG_INCLUDE=$(shell $(LLVM_CONFIG) --includedir) +# Enable ARM by request +# To enable: make ARM_ENABLED=1 +ARM_ENABLED=0 + # Add llvm bin to the path so any scripts run will go to the right llvm-config LLVM_BIN= $(shell $(LLVM_CONFIG) --bindir) export PATH:=$(LLVM_BIN):$(PATH) @@ -55,12 +59,15 @@ LLVM_CXXFLAGS=$(shell $(LLVM_CONFIG) --cppflags) LLVM_VERSION=LLVM_$(shell $(LLVM_CONFIG) --version | sed -e s/\\./_/ -e s/svn//) LLVM_VERSION_DEF=-D$(LLVM_VERSION) -LLVM_COMPONENTS = engine ipo bitreader bitwriter instrumentation linker arm +LLVM_COMPONENTS = engine ipo bitreader bitwriter instrumentation linker # Component "option" was introduced in 3.3 and starting with 3.4 it is required for the link step. # We check if it's available before adding it (to not break 3.2 and earlier). ifeq ($(shell $(LLVM_CONFIG) --components |grep -c option), 1) LLVM_COMPONENTS+=option endif +ifneq ($(ARM_ENABLED), 0) + LLVM_COMPONENTS+=arm +endif LLVM_LIBS=$(shell $(LLVM_CONFIG) --libs $(LLVM_COMPONENTS)) CLANG=clang @@ -104,6 +111,9 @@ OPT=-O2 CXXFLAGS=$(OPT) $(LLVM_CXXFLAGS) -I. -Iobjs/ -I$(CLANG_INCLUDE) \ -Wall $(LLVM_VERSION_DEF) \ -DBUILD_DATE="\"$(BUILD_DATE)\"" -DBUILD_VERSION="\"$(BUILD_VERSION)\"" +ifneq ($(ARM_ENABLED), 0) + CXXFLAGS+=-DISPC_ARM_ENABLED +endif LDFLAGS= ifeq ($(ARCH_OS),Linux) @@ -122,10 +132,12 @@ CXX_SRC=ast.cpp builtins.cpp cbackend.cpp ctx.cpp decl.cpp expr.cpp func.cpp \ type.cpp util.cpp HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \ opt.h stmt.h sym.h type.h util.h -TARGETS=neon-32 neon-16 neon-8 \ - avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 \ - sse2 sse2-x2 sse4 sse4-x2 sse4-8 sse4-16 \ - generic-1 generic-4 generic-8 generic-16 generic-32 generic-64 +TARGETS=avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 \ + sse2 sse2-x2 sse4-8 sse4-16 sse4 sse4-x2 \ + generic-4 generic-8 generic-16 generic-32 generic-64 generic-1 +ifneq ($(ARM_ENABLED), 0) + TARGETS+=neon-32 neon-16 neon-8 +endif # These files need to be compiled in two versions - 32 and 64 bits. BUILTINS_SRC_TARGET=$(addprefix builtins/target-, $(addsuffix .ll, $(TARGETS))) # These are files to be compiled in single version. 
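Note on the ARM_ENABLED block above: ARM/NEON support is now opt-in. Building with make ARM_ENABLED=1 links the LLVM arm component, adds the neon-32/neon-16/neon-8 builtin targets back to TARGETS, and passes -DISPC_ARM_ENABLED to the C++ sources; the remaining hunks in this diff wrap the NEON-specific code in that define. A minimal sketch of the compile-time gating pattern (illustrative only, not code taken from the patch):

#include <cstdio>

// NEON-specific pieces are compiled only when the Makefile passes
// -DISPC_ARM_ENABLED (i.e. when built with ARM_ENABLED=1).
static const char *SupportedTargetsSketch() {
    return
#ifdef ISPC_ARM_ENABLED
        "neon-8, neon-16, neon-32, "   // present only in ARM-enabled builds
#endif
        "sse2, sse4, avx";             // always present
}

int main() {
    std::printf("targets: %s\n", SupportedTargetsSketch());
    return 0;
}

Adjacent string literals are concatenated at compile time, which is the same trick the SupportedTargetArchs()/SupportedTargetISAs() changes in ispc.cpp rely on further down in this diff.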
@@ -134,12 +146,12 @@ BUILTINS_OBJS_32=$(addprefix builtins-, $(notdir $(BUILTINS_SRC_TARGET:.ll=-32bi BUILTINS_OBJS_64=$(addprefix builtins-, $(notdir $(BUILTINS_SRC_TARGET:.ll=-64bit.o))) BUILTINS_OBJS=$(addprefix builtins-, $(notdir $(BUILTINS_SRC_COMMON:.ll=.o))) \ $(BUILTINS_OBJS_32) $(BUILTINS_OBJS_64) \ - builtins-c-32.cpp builtins-c-64.cpp + builtins-c-32.cpp builtins-c-64.cpp BISON_SRC=parse.yy FLEX_SRC=lex.ll OBJS=$(addprefix objs/, $(CXX_SRC:.cpp=.o) $(BUILTINS_OBJS) \ - stdlib_mask1_ispc.o stdlib_mask8_ispc.o stdlib_mask16_ispc.o stdlib_mask32_ispc.o \ + stdlib_mask1_ispc.o stdlib_mask8_ispc.o stdlib_mask16_ispc.o stdlib_mask32_ispc.o \ $(BISON_SRC:.yy=.o) $(FLEX_SRC:.ll=.o)) default: ispc @@ -264,4 +276,3 @@ objs/stdlib_mask32_ispc.cpp: stdlib.ispc @echo Creating C++ source from $< for mask32 @$(CLANG) -E -x c -DISPC_MASK_BITS=32 -DISPC=1 -DPI=3.1415926536 $< -o - | \ python stdlib2cpp.py mask32 > $@ - diff --git a/builtins.cpp b/builtins.cpp index 82c45b02..f3a0cf59 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -631,7 +631,7 @@ AddBitcodeToModule(const unsigned char *bitcode, int length, llvm::Triple bcTriple(bcModule->getTargetTriple()); Debug(SourcePos(), "module triple: %s\nbitcode triple: %s\n", mTriple.str().c_str(), bcTriple.str().c_str()); -#ifndef __arm__ +#if defined(ISPC_ARM_ENABLED) && !defined(__arm__) // FIXME: More ugly and dangerous stuff. We really haven't set up // proper build and runtime infrastructure for ispc to do // cross-compilation, yet it's at minimum useful to be able to emit @@ -812,6 +812,8 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod // Next, add the target's custom implementations of the various needed // builtin functions (e.g. __masked_store_32(), etc). switch (g->target->getISA()) { + +#ifdef ISPC_ARM_ENABLED case Target::NEON8: { if (runtime32) { EXPORT_MODULE(builtins_bitcode_neon_8_32bit); @@ -839,6 +841,7 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod } break; } +#endif case Target::SSE2: { switch (g->target->getVectorWidth()) { case 4: diff --git a/examples/mandelbrot_tasks/Makefile b/examples/mandelbrot_tasks/Makefile index 7e83e618..1a565ffd 100644 --- a/examples/mandelbrot_tasks/Makefile +++ b/examples/mandelbrot_tasks/Makefile @@ -1,7 +1,7 @@ -EXAMPLE=mandelbrot -CPP_SRC=mandelbrot.cpp mandelbrot_serial.cpp -ISPC_SRC=mandelbrot.ispc +EXAMPLE=mandelbrot_tasks +CPP_SRC=mandelbrot_tasks.cpp mandelbrot_tasks_serial.cpp +ISPC_SRC=mandelbrot_tasks.ispc ISPC_IA_TARGETS=sse2,sse4-x2,avx-x2 ISPC_ARM_TARGETS=neon diff --git a/examples/mandelbrot_tasks/mandelbrot.cpp b/examples/mandelbrot_tasks/mandelbrot_tasks.cpp similarity index 99% rename from examples/mandelbrot_tasks/mandelbrot.cpp rename to examples/mandelbrot_tasks/mandelbrot_tasks.cpp index a01cfe43..dae22736 100644 --- a/examples/mandelbrot_tasks/mandelbrot.cpp +++ b/examples/mandelbrot_tasks/mandelbrot_tasks.cpp @@ -42,7 +42,7 @@ #include #include #include "../timing.h" -#include "mandelbrot_ispc.h" +#include "mandelbrot_tasks_ispc.h" using namespace ispc; extern void mandelbrot_serial(float x0, float y0, float x1, float y1, diff --git a/examples/mandelbrot_tasks/mandelbrot.ispc b/examples/mandelbrot_tasks/mandelbrot_tasks.ispc similarity index 100% rename from examples/mandelbrot_tasks/mandelbrot.ispc rename to examples/mandelbrot_tasks/mandelbrot_tasks.ispc diff --git a/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj b/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj index b92de72f..3a8fca79 100644 
--- a/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj +++ b/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj @@ -21,7 +21,7 @@ {E80DA7D4-AB22-4648-A068-327307156BE6} Win32Proj - mandelbrot + mandelbrot_tasks @@ -65,22 +65,22 @@ true $(ProjectDir)..\..;$(ExecutablePath) - mandelbrot + mandelbrot_tasks true $(ProjectDir)..\..;$(ExecutablePath) - mandelbrot + mandelbrot_tasks false $(ProjectDir)..\..;$(ExecutablePath) - mandelbrot + mandelbrot_tasks false $(ProjectDir)..\..;$(ExecutablePath) - mandelbrot + mandelbrot_tasks @@ -153,12 +153,12 @@ - - + + - + Document ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2 diff --git a/examples/mandelbrot_tasks/mandelbrot_serial.cpp b/examples/mandelbrot_tasks/mandelbrot_tasks_serial.cpp similarity index 100% rename from examples/mandelbrot_tasks/mandelbrot_serial.cpp rename to examples/mandelbrot_tasks/mandelbrot_tasks_serial.cpp diff --git a/examples/perf.ini b/examples/perf.ini index 3814bf16..d2a5c73e 100755 --- a/examples/perf.ini +++ b/examples/perf.ini @@ -22,7 +22,7 @@ mandelbrot #*** Mandelbrot Set mandelbrot_tasks -mandelbrot +mandelbrot_tasks ^ #*** Perlin Noise Function diff --git a/examples/perf.py b/examples/perf.py index 8503bd8c..f96ef9ec 100755 --- a/examples/perf.py +++ b/examples/perf.py @@ -73,10 +73,19 @@ def cpu_get(): #returns cpu_usage def cpu_check(): if is_windows == False: - cpu1 = cpu_get() - time.sleep(1) - cpu2 = cpu_get() - cpu_percent = (float(cpu1[0] - cpu2[0])/float(cpu1[1] - cpu2[1]))*100 + if is_mac == False: + cpu1 = cpu_get() + time.sleep(1) + cpu2 = cpu_get() + cpu_percent = (float(cpu1[0] - cpu2[0])/float(cpu1[1] - cpu2[1]))*100 + else: + os.system("sysctl -n vm.loadavg > cpu_temp") + c = open("cpu_temp", 'r') + c_line = c.readline() + c.close + os.remove("cpu_temp") + R = c_line.split(' ') + cpu_percent = float(R[1]) * 3 else: os.system("wmic cpu get loadpercentage /value > cpu_temp") c = open("cpu_temp", 'r') @@ -143,6 +152,8 @@ parser.add_option('-p', '--path', dest='path', global is_windows is_windows = (platform.system() == 'Windows' or 'CYGWIN_NT' in platform.system()) +global is_mac +is_mac = (platform.system() == 'Darwin') # save corrent path pwd = os.getcwd() diff --git a/examples/tasksys.cpp b/examples/tasksys.cpp index b4ced5c7..c9c2fa7b 100644 --- a/examples/tasksys.cpp +++ b/examples/tasksys.cpp @@ -365,7 +365,7 @@ lAtomicCompareAndSwap32(volatile int32_t *v, int32_t newValue, int32_t oldValue) static inline int32_t lAtomicAdd(volatile int32_t *v, int32_t delta) { #ifdef ISPC_IS_WINDOWS - return InterlockedAdd((volatile LONG *)v, delta); + return InterlockedExchangeAdd((volatile LONG *)v, delta)+delta; #else return __sync_fetch_and_add(v, delta); #endif diff --git a/ispc.cpp b/ispc.cpp index 0f07895f..a012b08d 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -141,10 +141,12 @@ lGetSystemISA() { static const char *supportedCPUs[] = { +#ifdef ISPC_ARM_ENABLED // FIXME: LLVM supports a ton of different ARM CPU variants--not just // cortex-a9 and a15. We should be able to handle any of them that also // have NEON support. "cortex-a9", "cortex-a15", +#endif "atom", "penryn", "core2", "corei7", "corei7-avx" #if !defined(LLVM_3_1) , "core-avx-i", "core-avx2" @@ -185,9 +187,11 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : // possible ISA based on that. 
if (!strcmp(cpu, "core-avx2")) isa = "avx2"; +#ifdef ISPC_ARM_ENABLED else if (!strcmp(cpu, "cortex-a9") || !strcmp(cpu, "cortex-a15")) isa = "neon-32"; +#endif else if (!strcmp(cpu, "core-avx-i")) isa = "avx1.1"; else if (!strcmp(cpu, "sandybridge") || @@ -211,7 +215,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : } } -#if !defined(__arm__) +#if defined(ISPC_ARM_ENABLED) && !defined(__arm__) if (cpu == NULL && !strncmp(isa, "neon", 4)) // If we're compiling NEON on an x86 host and the CPU wasn't // supplied, don't go and set the CPU based on the host... @@ -246,9 +250,11 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_cpu = cpu; if (arch == NULL) { +#ifdef ISPC_ARM_ENABLED if (!strncmp(isa, "neon", 4)) arch = "arm"; else +#endif arch = "x86-64"; } @@ -461,6 +467,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_hasGather = true; #endif } +#ifdef ISPC_ARM_ENABLED else if (!strcasecmp(isa, "neon-8")) { this->m_isa = Target::NEON8; this->m_nativeVectorWidth = 16; @@ -488,6 +495,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_maskingIsFree = false; this->m_maskBitCount = 32; } +#endif else { fprintf(stderr, "Target ISA \"%s\" is unknown. Choices are: %s\n", isa, SupportedTargetISAs()); @@ -502,9 +510,11 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : llvm::Reloc::Default; std::string featuresString = m_attributes; llvm::TargetOptions options; +#ifdef ISPC_ARM_ENABLED if (m_isa == Target::NEON8 || m_isa == Target::NEON16 || m_isa == Target::NEON32) options.FloatABIType = llvm::FloatABI::Hard; +#endif #if !defined(LLVM_3_1) if (g->opt.disableFMA == false) options.AllowFPOpFusion = llvm::FPOpFusion::Fast; @@ -596,13 +606,21 @@ Target::SupportedTargetCPUs() { const char * Target::SupportedTargetArchs() { - return "arm, x86, x86-64"; + return +#ifdef ISPC_ARM_ENABLED + "arm, " +#endif + "x86, x86-64"; } const char * Target::SupportedTargetISAs() { - return "neon-8, neon-16, neon-32, sse2, sse2-x2, sse4, sse4-8, sse4-16, sse4-x2, " + return +#ifdef ISPC_ARM_ENABLED + "neon-8, neon-16, neon-32, " +#endif + "sse2, sse2-x2, sse4, sse4-8, sse4-16, sse4-x2, " "avx, avx-x2, avx1.1, avx1.1-x2, avx2, avx2-x2, " "generic-1, generic-4, generic-8, generic-16, generic-32"; } @@ -611,10 +629,13 @@ Target::SupportedTargetISAs() { std::string Target::GetTripleString() const { llvm::Triple triple; +#ifdef ISPC_ARM_ENABLED if (m_arch == "arm") { triple.setTriple("armv7-eabi"); } - else { + else +#endif + { // Start with the host triple as the default triple.setTriple(llvm::sys::getDefaultTargetTriple()); @@ -637,12 +658,14 @@ Target::GetTripleString() const { const char * Target::ISAToString(ISA isa) { switch (isa) { +#ifdef ISPC_ARM_ENABLED case Target::NEON8: return "neon-8"; case Target::NEON16: return "neon-16"; case Target::NEON32: return "neon-32"; +#endif case Target::SSE2: return "sse2"; case Target::SSE4: @@ -813,6 +836,7 @@ Globals::Globals() { includeStdlib = true; runCPP = true; debugPrint = false; + debugIR = -1; disableWarnings = false; warningsAsErrors = false; quiet = false; diff --git a/ispc.h b/ispc.h index 98fcd199..25a03e1d 100644 --- a/ispc.h +++ b/ispc.h @@ -59,6 +59,7 @@ #include #include #include +#include #include /** @def ISPC_MAX_NVEC maximum vector size of any of the compliation @@ -66,6 +67,9 @@ */ #define ISPC_MAX_NVEC 64 +// Number of final optimization phase +#define LAST_OPT_NUMBER 1000 + 
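A note on the examples/tasksys.cpp change earlier in this diff: InterlockedExchangeAdd returns the value the target held before the addition, while the InterlockedAdd call it replaces returned the result of the addition, so the patch adds delta to keep the old return-value behaviour. A portable sketch of that convention, using std::atomic in place of the Windows API (illustrative only, not code from the patch):

#include <atomic>
#include <cassert>
#include <cstdint>

// Fetch-and-add primitives (InterlockedExchangeAdd, __sync_fetch_and_add,
// std::atomic::fetch_add) return the *previous* value, so "+ delta" yields
// the updated value that InterlockedAdd used to return.
static int32_t lAtomicAddSketch(std::atomic<int32_t> &v, int32_t delta) {
    return v.fetch_add(delta) + delta;   // previous value + delta == new value
}

int main() {
    std::atomic<int32_t> counter{10};
    assert(lAtomicAddSketch(counter, 5) == 15);  // returns the updated value
    assert(counter.load() == 15);
    return 0;
}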
// Forward declarations of a number of widely-used LLVM types namespace llvm { class AttributeSet; @@ -175,7 +179,11 @@ public: flexible/performant of them will apear last in the enumerant. Note also that __best_available_isa() needs to be updated if ISAs are added or the enumerant values are reordered. */ - enum ISA { NEON32, NEON16, NEON8, SSE2, SSE4, AVX, AVX11, AVX2, GENERIC, + enum ISA { +#ifdef ISPC_ARM_ENABLED + NEON32, NEON16, NEON8, +#endif + SSE2, SSE4, AVX, AVX11, AVX2, GENERIC, NUM_ISAS }; /** Initializes the given Target pointer for a target of the given @@ -495,6 +503,16 @@ struct Globals { ispc's execution. */ bool debugPrint; + /** Indicates which stages of optimization we want to dump. */ + std::set debug_stages; + + /** Indicates after which optimization we want to generate + DebugIR information. */ + int debugIR; + + /** Indicates which phases of optimization we want to switch off. */ + std::set off_stages; + /** Indicates whether all warning messages should be surpressed. */ bool disableWarnings; diff --git a/ispc.vcxproj b/ispc.vcxproj index e9bf9d97..53386c4c 100755 --- a/ispc.vcxproj +++ b/ispc.vcxproj @@ -1,4 +1,4 @@ - + @@ -45,12 +45,6 @@ - - - - - - @@ -191,60 +185,6 @@ Building gen-bitcode-sse2-x2-64bit.cpp - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-neon-8.ll | python bitcode2cpp.py builtins\target-neon-8.ll 32bit > $(Configuration)/gen-bitcode-neon-8-32bit.cpp - $(Configuration)/gen-bitcode-neon-8-32bit.cpp - builtins\util.m4;builtins\target-neon-common.ll - Building gen-bitcode-neon-8-32bit.cpp - - - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-neon-8.ll | python bitcode2cpp.py builtins\target-neon-8.ll 64bit > $(Configuration)/gen-bitcode-neon-8-64bit.cpp - $(Configuration)/gen-bitcode-neon-8-64bit.cpp - builtins\util.m4;builtins\target-neon-common.ll - Building gen-bitcode-neon-8-64bit.cpp - - - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-neon-16.ll | python bitcode2cpp.py builtins\target-neon-16.ll 32bit > $(Configuration)/gen-bitcode-neon-16-32bit.cpp - $(Configuration)/gen-bitcode-neon-16-32bit.cpp - builtins\util.m4;builtins\target-neon-common.ll - Building gen-bitcode-neon-16-32bit.cpp - - - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-neon-16.ll | python bitcode2cpp.py builtins\target-neon-16.ll 64bit > $(Configuration)/gen-bitcode-neon-16-64bit.cpp - $(Configuration)/gen-bitcode-neon-16-64bit.cpp - builtins\util.m4;builtins\target-neon-common.ll - Building gen-bitcode-neon-16-64bit.cpp - - - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-neon-32.ll | python bitcode2cpp.py builtins\target-neon-32.ll 32bit > $(Configuration)/gen-bitcode-neon-32-32bit.cpp - $(Configuration)/gen-bitcode-neon-32-32bit.cpp - builtins\util.m4;builtins\target-neon-common.ll - Building gen-bitcode-neon-32-32bit.cpp - - - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-neon-32.ll | python bitcode2cpp.py builtins\target-neon-32.ll 64bit > $(Configuration)/gen-bitcode-neon-32-64bit.cpp - $(Configuration)/gen-bitcode-neon-32-64bit.cpp - builtins\util.m4;builtins\target-neon-common.ll - Building gen-bitcode-neon-32-64bit.cpp - - Document @@ -263,6 +203,26 @@ Building gen-bitcode-avx1-64bit.cpp
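One consequence of the ispc.h hunk above: wrapping NEON32/NEON16/NEON8 in #ifdef ISPC_ARM_ENABLED shifts the numeric values of every enumerator that follows them, so raw ISA numbers differ between ARM-enabled and x86-only builds and should not be stored or compared across builds. Within a single build the relative order, and the "most capable ISA appears last" property that __best_available_isa() depends on, is unchanged. An illustrative sketch (not code from the patch):

#include <cstdio>

// Conditionally compiled enumerators shift everything after them:
// SSE2 is 0 in an x86-only build but 3 when ISPC_ARM_ENABLED is defined.
enum ISA {
#ifdef ISPC_ARM_ENABLED
    NEON32, NEON16, NEON8,
#endif
    SSE2, SSE4, AVX,
    NUM_ISAS
};

int main() {
    std::printf("SSE2 = %d, NUM_ISAS = %d\n", (int)SSE2, (int)NUM_ISAS);
    return 0;
}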
Document diff --git a/main.cpp b/main.cpp index 8d7282f5..7290d3c8 100644 --- a/main.cpp +++ b/main.cpp @@ -155,6 +155,11 @@ devUsage(int ret) { printf(" disable-uniform-control-flow\t\tDisable uniform control flow optimizations\n"); printf(" disable-uniform-memory-optimizations\tDisable uniform-based coherent memory access\n"); printf(" [--yydebug]\t\t\t\tPrint debugging information during parsing\n"); + printf(" [--debug-phase=]\t\tSet optimization phases to dump. --debug-phase=first,210:220,300,305,310:last\n"); +#ifdef LLVM_3_4 + printf(" [--debug-ir=]\t\tSet optimization phase to generate debugIR after it\n"); +#endif + printf(" [--off-phase=]\t\tSwitch off optimization phases. --off-phase=first,210:220,300,305,310:last\n"); exit(ret); } @@ -211,6 +216,47 @@ lSignal(void *) { } +static int ParsingPhaseName(char * stage) { + if (strncmp(stage, "first", 5) == 0) { + return 0; + } + else if (strncmp(stage, "last", 4) == 0) { + return LAST_OPT_NUMBER; + } + else { + int t = atoi(stage); + if (t < 0 || t > LAST_OPT_NUMBER) { + fprintf(stderr, "Phases must be from 0 to %d. %s is incorrect.\n", LAST_OPT_NUMBER, stage); + exit(0); + } + else { + return t; + } + } +} + + +static std::set ParsingPhases(char * stages) { + std::set phases; + int begin = ParsingPhaseName(stages); + int end = begin; + + for (unsigned i = 0; i < strlen(stages); i++) { + if ((stages[i] == ',') || (i == strlen(stages) - 1)) { + for (int j = begin; j < end + 1; j++) { + phases.insert(j); + } + begin = ParsingPhaseName(stages + i + 1); + end = begin; + } + else if (stages[i] == ':') { + end = ParsingPhaseName(stages + i + 1); + } + } + return phases; +} + + static void lParseInclude(const char *path) { #ifdef ISPC_IS_WINDOWS @@ -253,6 +299,8 @@ int main(int Argc, char *Argv[]) { LLVMInitializeX86Disassembler(); LLVMInitializeX86TargetMC(); #endif // !__ARM__ + +#ifdef ISPC_ARM_ENABLED // Generating ARM from x86 is more likely to be useful, though.
LLVMInitializeARMTargetInfo(); LLVMInitializeARMTarget(); @@ -260,6 +308,7 @@ int main(int Argc, char *Argv[]) { LLVMInitializeARMAsmParser(); LLVMInitializeARMDisassembler(); LLVMInitializeARMTargetMC(); +#endif char *file = NULL; const char *headerFileName = NULL; @@ -486,6 +535,20 @@ int main(int Argc, char *Argv[]) { } hostStubFileName = argv[i]; } + else if (strncmp(argv[i], "--debug-phase=", 14) == 0) { + fprintf(stderr, "WARNING: Adding debug phases may change the way PassManager " + "handles the phases and it may possibly make some bugs go " + "away or introduce new ones.\n"); + g->debug_stages = ParsingPhases(argv[i] + strlen("--debug-phase=")); + } +#ifdef LLVM_3_4 + else if (strncmp(argv[i], "--debug-ir=", 11) == 0) { + g->debugIR = ParsingPhaseName(argv[i] + strlen("--debug-ir=")); + } +#endif + else if (strncmp(argv[i], "--off-phase=", 12) == 0) { + g->off_stages = ParsingPhases(argv[i] + strlen("--off-phase=")); + } else if (!strcmp(argv[i], "-v") || !strcmp(argv[i], "--version")) { lPrintVersion(); return 0; diff --git a/opt.cpp b/opt.cpp index 077320d5..3e2efcd8 100644 --- a/opt.cpp +++ b/opt.cpp @@ -63,6 +63,9 @@ #include #include #endif +#if defined (LLVM_3_4) + #include +#endif #include #include #include @@ -119,6 +122,8 @@ static llvm::Pass *CreateReplacePseudoMemoryOpsPass(); static llvm::Pass *CreateIsCompileTimeConstantPass(bool isLastTry); static llvm::Pass *CreateMakeInternalFuncsStaticPass(); +static llvm::Pass *CreateDebugPass(char * output); + #define DEBUG_START_PASS(NAME) \ if (g->debugPrint && \ (getenv("FUNC") == NULL || \ @@ -395,6 +400,54 @@ lGetMaskStatus(llvm::Value *mask, int vecWidth = -1) { } +/////////////////////////////////////////////////////////////////////////// +// This is a wrapper around llvm::PassManager. It duplicates the PassManager run() method +// and extends add() with extra checks and optional debug passes. +// The wrapper can control: +// - whether the optimization pass with a given number is switched off; +// - whether the LLVM IR is dumped after the pass with a given number; +// - whether DebugIR information for gdb is generated after the pass with a given number.
+class DebugPassManager { +public: + DebugPassManager():number(0){} + void add(llvm::Pass * P, int stage); + bool run(llvm::Module& M) {return PM.run(M);} + llvm::PassManager& getPM() {return PM;} + +private: + llvm::PassManager PM; + int number; +}; + +void +DebugPassManager::add(llvm::Pass * P, int stage = -1) { + // taking number of optimization + if (stage == -1) { + number++; + } + else { + number = stage; + } + if (g->off_stages.find(number) == g->off_stages.end()) { + // adding optimization (not switched off) + PM.add(P); + if (g->debug_stages.find(number) != g->debug_stages.end()) { + // adding dump of LLVM IR after optimization + char buf[100]; + sprintf(buf, "\n\n*****LLVM IR after phase %d: %s*****\n\n", + number, P->getPassName()); + PM.add(CreateDebugPass(buf)); + } +#ifdef LLVM_3_4 + if (g->debugIR == number) { + // adding generating of LLVM IR debug after optimization + char buf[100]; + sprintf(buf, "Debug_IR_after_%d_phase.bc", number); + PM.add(llvm::createDebugIRPass(true, true, ".", buf)); + } +#endif + } +} /////////////////////////////////////////////////////////////////////////// void @@ -403,14 +456,8 @@ Optimize(llvm::Module *module, int optLevel) { printf("*** Code going into optimization ***\n"); module->dump(); } - - llvm::PassManager optPM; - optPM.add(llvm::createVerifierPass()); - -#if 0 - std::string err; - optPM.add(llvm::createPrintModulePass(new llvm::raw_fd_ostream("-", err))); -#endif + DebugPassManager optPM; + optPM.add(llvm::createVerifierPass(),0); llvm::TargetLibraryInfo *targetLibraryInfo = new llvm::TargetLibraryInfo(llvm::Triple(module->getTargetTriple())); @@ -427,7 +474,7 @@ Optimize(llvm::Module *module, int optLevel) { optPM.add(new llvm::TargetTransformInfo(targetMachine->getScalarTargetTransformInfo(), targetMachine->getVectorTargetTransformInfo())); #else // LLVM 3.3+ - targetMachine->addAnalysisPasses(optPM); + targetMachine->addAnalysisPasses(optPM.getPM()); #endif #endif @@ -439,11 +486,11 @@ Optimize(llvm::Module *module, int optLevel) { // run absolutely no optimizations, since the front-end needs us to // take the various __pseudo_* functions it has emitted and turn // them into something that can actually execute. 
- optPM.add(CreateImproveMemoryOpsPass()); + optPM.add(CreateImproveMemoryOpsPass(), 100); if (g->opt.disableHandlePseudoMemoryOps == false) optPM.add(CreateReplacePseudoMemoryOpsPass()); - optPM.add(CreateIntrinsicsOptPass()); + optPM.add(CreateIntrinsicsOptPass(), 102); optPM.add(CreateIsCompileTimeConstantPass(true)); optPM.add(llvm::createFunctionInliningPass()); optPM.add(CreateMakeInternalFuncsStaticPass()); @@ -462,7 +509,7 @@ Optimize(llvm::Module *module, int optLevel) { llvm::initializeInstrumentation(*registry); llvm::initializeTarget(*registry); - optPM.add(llvm::createGlobalDCEPass()); + optPM.add(llvm::createGlobalDCEPass(), 200); // Early optimizations to try to reduce the total amount of code to // work with if we can @@ -476,14 +523,14 @@ Optimize(llvm::Module *module, int optLevel) { if (g->opt.disableGatherScatterOptimizations == false && g->target->getVectorWidth() > 1) { - optPM.add(llvm::createInstructionCombiningPass()); + optPM.add(llvm::createInstructionCombiningPass(), 210); optPM.add(CreateImproveMemoryOpsPass()); } if (!g->opt.disableMaskAllOnOptimizations) { - optPM.add(CreateIntrinsicsOptPass()); + optPM.add(CreateIntrinsicsOptPass(), 215); optPM.add(CreateInstructionSimplifyPass()); } - optPM.add(llvm::createDeadInstEliminationPass()); + optPM.add(llvm::createDeadInstEliminationPass(), 220); // Max struct size threshold for scalar replacement is // 1) 4 fields (r,g,b,w) @@ -513,10 +560,10 @@ Optimize(llvm::Module *module, int optLevel) { #if defined(LLVM_3_1) || defined(LLVM_3_2) || defined(LLVM_3_3) // Starting from 3.4 this functionality was moved to // InstructionCombiningPass. See r184459 for details. - optPM.add(llvm::createSimplifyLibCallsPass()); + optPM.add(llvm::createSimplifyLibCallsPass(), 240); #endif optPM.add(llvm::createAggressiveDCEPass()); - optPM.add(llvm::createInstructionCombiningPass()); + optPM.add(llvm::createInstructionCombiningPass(), 241); optPM.add(llvm::createJumpThreadingPass()); optPM.add(llvm::createCFGSimplificationPass()); optPM.add(llvm::createScalarReplAggregatesPass(sr_threshold)); @@ -524,44 +571,45 @@ Optimize(llvm::Module *module, int optLevel) { optPM.add(llvm::createTailCallEliminationPass()); if (!g->opt.disableMaskAllOnOptimizations) { - optPM.add(CreateIntrinsicsOptPass()); + optPM.add(CreateIntrinsicsOptPass(), 250); optPM.add(CreateInstructionSimplifyPass()); } if (g->opt.disableGatherScatterOptimizations == false && g->target->getVectorWidth() > 1) { - optPM.add(llvm::createInstructionCombiningPass()); + optPM.add(llvm::createInstructionCombiningPass(), 255); optPM.add(CreateImproveMemoryOpsPass()); if (g->opt.disableCoalescing == false && g->target->getISA() != Target::GENERIC) { // It is important to run this here to make it easier to // finding matching gathers we can coalesce.. 
- optPM.add(llvm::createEarlyCSEPass()); + optPM.add(llvm::createEarlyCSEPass(), 260); optPM.add(CreateGatherCoalescePass()); } } - optPM.add(llvm::createFunctionInliningPass()); + optPM.add(llvm::createFunctionInliningPass(), 265); optPM.add(llvm::createConstantPropagationPass()); optPM.add(CreateIntrinsicsOptPass()); optPM.add(CreateInstructionSimplifyPass()); if (g->opt.disableGatherScatterOptimizations == false && g->target->getVectorWidth() > 1) { - optPM.add(llvm::createInstructionCombiningPass()); + optPM.add(llvm::createInstructionCombiningPass(), 270); optPM.add(CreateImproveMemoryOpsPass()); } - optPM.add(llvm::createIPSCCPPass()); + optPM.add(llvm::createIPSCCPPass(), 275); optPM.add(llvm::createDeadArgEliminationPass()); optPM.add(llvm::createAggressiveDCEPass()); optPM.add(llvm::createInstructionCombiningPass()); optPM.add(llvm::createCFGSimplificationPass()); - if (g->opt.disableHandlePseudoMemoryOps == false) - optPM.add(CreateReplacePseudoMemoryOpsPass()); - optPM.add(CreateIntrinsicsOptPass()); + if (g->opt.disableHandlePseudoMemoryOps == false) { + optPM.add(CreateReplacePseudoMemoryOpsPass(),280); + } + optPM.add(CreateIntrinsicsOptPass(),281); optPM.add(CreateInstructionSimplifyPass()); optPM.add(llvm::createFunctionInliningPass()); @@ -579,9 +627,10 @@ Optimize(llvm::Module *module, int optLevel) { optPM.add(llvm::createIndVarSimplifyPass()); optPM.add(llvm::createLoopIdiomPass()); optPM.add(llvm::createLoopDeletionPass()); - if (g->opt.unrollLoops) - optPM.add(llvm::createLoopUnrollPass()); - optPM.add(llvm::createGVNPass()); + if (g->opt.unrollLoops) { + optPM.add(llvm::createLoopUnrollPass(), 300); + } + optPM.add(llvm::createGVNPass(), 301); optPM.add(CreateIsCompileTimeConstantPass(true)); optPM.add(CreateIntrinsicsOptPass()); @@ -609,7 +658,7 @@ Optimize(llvm::Module *module, int optLevel) { // Finish up by making sure we didn't mess anything up in the IR along // the way. - optPM.add(llvm::createVerifierPass()); + optPM.add(llvm::createVerifierPass(), LAST_OPT_NUMBER); optPM.run(*module); if (g->debugPrint) { @@ -4330,6 +4379,42 @@ CreateIsCompileTimeConstantPass(bool isLastTry) { return new IsCompileTimeConstantPass(isLastTry); } +////////////////////////////////////////////////////////////////////////// +// DebugPass + +/** This pass is added in list of passes after optimizations which + we want to debug and print dump of LLVM IR in stderr. Also it + prints name and number of previous optimization. + */ +class DebugPass : public llvm::ModulePass { +public: + static char ID; + DebugPass(char * output) : ModulePass(ID) { + sprintf(str_output, "%s", output); + } + + const char *getPassName() const { return "Dump LLVM IR"; } + bool runOnModule(llvm::Module &m); + +private: + char str_output[100]; +}; + +char DebugPass::ID = 0; + +bool +DebugPass::runOnModule(llvm::Module &module) { + fprintf(stderr, "%s", str_output); + fflush(stderr); + module.dump(); + return true; +} + +static llvm::Pass * +CreateDebugPass(char * output) { + return new DebugPass(output); +} + /////////////////////////////////////////////////////////////////////////// // MakeInternalFuncsStaticPass
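Usage note for the new options: --debug-phase= and --off-phase= take a comma-separated list of optimization phase numbers and begin:end ranges, where "first" stands for phase 0 and "last" for LAST_OPT_NUMBER (1000); --debug-phase dumps the LLVM IR after the selected phases, --off-phase skips them, and (with LLVM 3.4) --debug-ir= emits DebugIR after a single phase. A standalone sketch of how that range syntax expands, written as a simplified re-implementation for illustration rather than the parser added in main.cpp:

#include <cstdio>
#include <cstdlib>
#include <set>
#include <sstream>
#include <string>

static const int kLastOptNumber = 1000;   // mirrors LAST_OPT_NUMBER in ispc.h

static int PhaseNumber(const std::string &s) {
    if (s == "first") return 0;
    if (s == "last")  return kLastOptNumber;
    return std::atoi(s.c_str());
}

// Expand "first,210:220,300" into the set {0, 210..220, 300}.
static std::set<int> ParsePhases(const std::string &arg) {
    std::set<int> phases;
    std::stringstream ss(arg);
    std::string item;
    while (std::getline(ss, item, ',')) {
        size_t colon = item.find(':');
        int begin = PhaseNumber(item.substr(0, colon));
        int end = (colon == std::string::npos)
                      ? begin : PhaseNumber(item.substr(colon + 1));
        for (int i = begin; i <= end; ++i)
            phases.insert(i);
    }
    return phases;
}

int main() {
    std::set<int> p = ParsePhases("first,210:220,300");
    std::printf("selected %zu phases\n", p.size());   // 13 phases
    return 0;
}

The phase numbers themselves come from the explicit stage argument that DebugPassManager::add() takes in opt.cpp (0 for the initial verifier, then 100, 200, 210, and so on, up to LAST_OPT_NUMBER for the final verifier pass).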