Merge branch 'master' into arm

Conflicts: Makefile builtins.cpp ispc.cpp ispc.h ispc.vcxproj opt.cpp
2013-08-06 17:09:48 -07:00
parent 1276ea9844 0755e4f8ff
commit cd9afe946c
15 changed files with 298 additions and 123 deletions
--- a/27
+++ b/27
@@ -39,6 +39,10 @@
 LLVM_CONFIG=$(shell which llvm-config)
 CLANG_INCLUDE=$(shell $(LLVM_CONFIG) --includedir)
 # Enable ARM by request
 # To enable: make ARM_ENABLED=1
 ARM_ENABLED=0
 # Add llvm bin to the path so any scripts run will go to the right llvm-config
 LLVM_BIN= $(shell $(LLVM_CONFIG) --bindir)
 export PATH:=$(LLVM_BIN):$(PATH)
@@ -55,12 +59,15 @@ LLVM_CXXFLAGS=$(shell $(LLVM_CONFIG) --cppflags)
 LLVM_VERSION=LLVM_$(shell $(LLVM_CONFIG) --version | sed -e s/\\./_/ -e s/svn//)
 LLVM_VERSION_DEF=-D$(LLVM_VERSION)
-LLVM_COMPONENTS = engine ipo bitreader bitwriter instrumentation linker arm
+LLVM_COMPONENTS = engine ipo bitreader bitwriter instrumentation linker
 # Component "option" was introduced in 3.3 and starting with 3.4 it is required for the link step.
 # We check if it's available before adding it (to not break 3.2 and earlier).
 ifeq ($(shell $(LLVM_CONFIG) --components |grep -c option), 1)
    LLVM_COMPONENTS+=option
 endif
 ifneq ($(ARM_ENABLED), 0)
    LLVM_COMPONENTS+=arm
 endif
 LLVM_LIBS=$(shell $(LLVM_CONFIG) --libs $(LLVM_COMPONENTS))
 CLANG=clang
@@ -104,6 +111,9 @@ OPT=-O2
 CXXFLAGS=$(OPT) $(LLVM_CXXFLAGS) -I. -Iobjs/ -I$(CLANG_INCLUDE)  \
 	-Wall $(LLVM_VERSION_DEF) \
 	-DBUILD_DATE="\"$(BUILD_DATE)\"" -DBUILD_VERSION="\"$(BUILD_VERSION)\""
 ifneq ($(ARM_ENABLED), 0)
    CXXFLAGS+=-DISPC_ARM_ENABLED
 endif
 LDFLAGS=
 ifeq ($(ARCH_OS),Linux)
@@ -122,10 +132,12 @@ CXX_SRC=ast.cpp builtins.cpp cbackend.cpp ctx.cpp decl.cpp expr.cpp func.cpp \
 	type.cpp util.cpp
 HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \
 	opt.h stmt.h sym.h type.h util.h
-TARGETS=neon-32 neon-16 neon-8 \
+TARGETS=avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 \
-	avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 \
+	sse2 sse2-x2 sse4-8 sse4-16 sse4 sse4-x2 \
-	sse2 sse2-x2 sse4 sse4-x2 sse4-8 sse4-16 \
+	generic-4 generic-8 generic-16 generic-32 generic-64 generic-1
-	generic-1 generic-4 generic-8 generic-16 generic-32 generic-64
+ifneq ($(ARM_ENABLED), 0)
    TARGETS+=neon-32 neon-16 neon-8
 endif
 # These files need to be compiled in two versions - 32 and 64 bits.
 BUILTINS_SRC_TARGET=$(addprefix builtins/target-, $(addsuffix .ll, $(TARGETS)))
 # These are files to be compiled in single version.
@@ -134,12 +146,12 @@ BUILTINS_OBJS_32=$(addprefix builtins-, $(notdir $(BUILTINS_SRC_TARGET:.ll=-32bi
 BUILTINS_OBJS_64=$(addprefix builtins-, $(notdir $(BUILTINS_SRC_TARGET:.ll=-64bit.o)))
 BUILTINS_OBJS=$(addprefix builtins-, $(notdir $(BUILTINS_SRC_COMMON:.ll=.o))) \
 	$(BUILTINS_OBJS_32) $(BUILTINS_OBJS_64) \
-	builtins-c-32.cpp builtins-c-64.cpp 
+	builtins-c-32.cpp builtins-c-64.cpp
 BISON_SRC=parse.yy
 FLEX_SRC=lex.ll
 OBJS=$(addprefix objs/, $(CXX_SRC:.cpp=.o) $(BUILTINS_OBJS) \
-	stdlib_mask1_ispc.o stdlib_mask8_ispc.o stdlib_mask16_ispc.o stdlib_mask32_ispc.o \
+       stdlib_mask1_ispc.o stdlib_mask8_ispc.o stdlib_mask16_ispc.o stdlib_mask32_ispc.o \
 	$(BISON_SRC:.yy=.o) $(FLEX_SRC:.ll=.o))
 default: ispc
@@ -264,4 +276,3 @@ objs/stdlib_mask32_ispc.cpp: stdlib.ispc
 	@echo Creating C++ source from $< for mask32
 	@$(CLANG) -E -x c -DISPC_MASK_BITS=32 -DISPC=1 -DPI=3.1415926536 $< -o - | \
 		python stdlib2cpp.py mask32 > $@
--- a/builtins.cpp
+++ b/builtins.cpp
@@ -631,7 +631,7 @@ AddBitcodeToModule(const unsigned char *bitcode, int length,
        llvm::Triple bcTriple(bcModule->getTargetTriple());
        Debug(SourcePos(), "module triple: %s\nbitcode triple: %s\n",
              mTriple.str().c_str(), bcTriple.str().c_str());
-#ifndef __arm__
+#if defined(ISPC_ARM_ENABLED) && !defined(__arm__)
        // FIXME: More ugly and dangerous stuff.  We really haven't set up
        // proper build and runtime infrastructure for ispc to do
        // cross-compilation, yet it's at minimum useful to be able to emit
@@ -812,6 +812,8 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
    // Next, add the target's custom implementations of the various needed
    // builtin functions (e.g. __masked_store_32(), etc).
    switch (g->target->getISA()) {
 #ifdef ISPC_ARM_ENABLED
    case Target::NEON8: {
        if (runtime32) {
            EXPORT_MODULE(builtins_bitcode_neon_8_32bit);
@@ -839,6 +841,7 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
        }
        break;
    }
 #endif
    case Target::SSE2: {
        switch (g->target->getVectorWidth()) {
        case 4:
--- a/examples/mandelbrot_tasks/Makefile
+++ b/examples/mandelbrot_tasks/Makefile
@@ -1,7 +1,7 @@
-EXAMPLE=mandelbrot
+EXAMPLE=mandelbrot_tasks
-CPP_SRC=mandelbrot.cpp mandelbrot_serial.cpp
+CPP_SRC=mandelbrot_tasks.cpp mandelbrot_tasks_serial.cpp
-ISPC_SRC=mandelbrot.ispc
+ISPC_SRC=mandelbrot_tasks.ispc
 ISPC_IA_TARGETS=sse2,sse4-x2,avx-x2
 ISPC_ARM_TARGETS=neon
--- a/examples/mandelbrot_tasks/mandelbrot_tasks.cpp
+++ b/examples/mandelbrot_tasks/mandelbrot_tasks.cpp
@@ -42,7 +42,7 @@
 #include <algorithm>
 #include <string.h>
 #include "../timing.h"
-#include "mandelbrot_ispc.h"
+#include "mandelbrot_tasks_ispc.h"
 using namespace ispc;
 extern void mandelbrot_serial(float x0, float y0, float x1, float y1,
--- a/examples/mandelbrot_tasks/mandelbrot_tasks.ispc
+++ b/examples/mandelbrot_tasks/mandelbrot_tasks.ispc
--- a/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj
+++ b/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj
@@ -21,7 +21,7 @@
  <PropertyGroup Label="Globals">
    <ProjectGuid>{E80DA7D4-AB22-4648-A068-327307156BE6}</ProjectGuid>
    <Keyword>Win32Proj</Keyword>
-    <RootNamespace>mandelbrot</RootNamespace>
+    <RootNamespace>mandelbrot_tasks</RootNamespace>
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
@@ -65,22 +65,22 @@
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <LinkIncremental>true</LinkIncremental>
    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
-    <TargetName>mandelbrot</TargetName>
+    <TargetName>mandelbrot_tasks</TargetName>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
    <LinkIncremental>true</LinkIncremental>
    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
-    <TargetName>mandelbrot</TargetName>
+    <TargetName>mandelbrot_tasks</TargetName>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
    <LinkIncremental>false</LinkIncremental>
    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
-    <TargetName>mandelbrot</TargetName>
+    <TargetName>mandelbrot_tasks</TargetName>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
    <LinkIncremental>false</LinkIncremental>
    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
-    <TargetName>mandelbrot</TargetName>
+    <TargetName>mandelbrot_tasks</TargetName>
  </PropertyGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <ClCompile>
@@ -153,12 +153,12 @@
    </Link>
  </ItemDefinitionGroup>
  <ItemGroup>
-    <ClCompile Include="mandelbrot.cpp" />
+    <ClCompile Include="mandelbrot_tasks.cpp" />
-    <ClCompile Include="mandelbrot_serial.cpp" />
+    <ClCompile Include="mandelbrot_tasks_serial.cpp" />
    <ClCompile Include="../tasksys.cpp" />
  </ItemGroup>
  <ItemGroup>
-    <CustomBuild Include="mandelbrot.ispc">
+    <CustomBuild Include="mandelbrot_tasks.ispc">
      <FileType>Document</FileType>
      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2
 </Command>
--- a/examples/mandelbrot_tasks/mandelbrot_tasks_serial.cpp
+++ b/examples/mandelbrot_tasks/mandelbrot_tasks_serial.cpp
--- a/examples/perf.ini
+++ b/examples/perf.ini
@@ -22,7 +22,7 @@ mandelbrot
 #***
 Mandelbrot Set
 mandelbrot_tasks
-mandelbrot
+mandelbrot_tasks
 ^
 #***
 Perlin Noise Function
--- a/examples/perf.py
+++ b/examples/perf.py
@@ -73,10 +73,19 @@ def cpu_get():
 #returns cpu_usage
 def cpu_check():
    if is_windows == False:
-        cpu1 = cpu_get()
+        if is_mac == False:
-        time.sleep(1)
+            cpu1 = cpu_get()
-        cpu2 = cpu_get()
+            time.sleep(1)
-        cpu_percent = (float(cpu1[0] - cpu2[0])/float(cpu1[1] - cpu2[1]))*100
+            cpu2 = cpu_get()
            cpu_percent = (float(cpu1[0] - cpu2[0])/float(cpu1[1] - cpu2[1]))*100
        else:
            os.system("sysctl -n vm.loadavg > cpu_temp")
            c = open("cpu_temp", 'r')
            c_line = c.readline()
            c.close
            os.remove("cpu_temp")
            R = c_line.split(' ')
            cpu_percent = float(R[1]) * 3
    else:
 	os.system("wmic cpu get loadpercentage /value > cpu_temp")
 	c = open("cpu_temp", 'r')
@@ -143,6 +152,8 @@ parser.add_option('-p', '--path', dest='path',
 global is_windows
 is_windows = (platform.system() == 'Windows' or
              'CYGWIN_NT' in platform.system())
 global is_mac
 is_mac = (platform.system() == 'Darwin')
 # save current path
 pwd = os.getcwd()
--- a/examples/tasksys.cpp
+++ b/examples/tasksys.cpp
@@ -365,7 +365,7 @@ lAtomicCompareAndSwap32(volatile int32_t *v, int32_t newValue, int32_t oldValue)
 static inline int32_t 
 lAtomicAdd(volatile int32_t *v, int32_t delta) {
 #ifdef ISPC_IS_WINDOWS
-    return InterlockedAdd((volatile LONG *)v, delta);
+    return InterlockedExchangeAdd((volatile LONG *)v, delta)+delta;
 #else
    return __sync_fetch_and_add(v, delta);
 #endif
--- a/ispc.cpp
+++ b/ispc.cpp
@@ -141,10 +141,12 @@ lGetSystemISA() {
 static const char *supportedCPUs[] = {
 #ifdef ISPC_ARM_ENABLED
    // FIXME: LLVM supports a ton of different ARM CPU variants--not just
    // cortex-a9 and a15.  We should be able to handle any of them that also
    // have NEON support.
    "cortex-a9", "cortex-a15",
 #endif
    "atom", "penryn", "core2", "corei7", "corei7-avx"
 #if !defined(LLVM_3_1)
    , "core-avx-i", "core-avx2"
@@ -185,9 +187,11 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
            // possible ISA based on that.
            if (!strcmp(cpu, "core-avx2"))
                isa = "avx2";
 #ifdef ISPC_ARM_ENABLED
            else if (!strcmp(cpu, "cortex-a9") ||
                     !strcmp(cpu, "cortex-a15"))
                isa = "neon-32";
 #endif
            else if (!strcmp(cpu, "core-avx-i"))
                isa = "avx1.1";
            else if (!strcmp(cpu, "sandybridge") ||
@@ -211,7 +215,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
        }
    }
-#if !defined(__arm__)
+#if defined(ISPC_ARM_ENABLED) && !defined(__arm__)
    if (cpu == NULL && !strncmp(isa, "neon", 4))
        // If we're compiling NEON on an x86 host and the CPU wasn't
        // supplied, don't go and set the CPU based on the host...
@@ -246,9 +250,11 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
    this->m_cpu = cpu;
    if (arch == NULL) {
 #ifdef ISPC_ARM_ENABLED
        if (!strncmp(isa, "neon", 4))
            arch = "arm";
        else
 #endif
            arch = "x86-64";
    }
@@ -461,6 +467,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
        this->m_hasGather = true;
 #endif
    }
 #ifdef ISPC_ARM_ENABLED
    else if (!strcasecmp(isa, "neon-8")) {
        this->m_isa = Target::NEON8;
        this->m_nativeVectorWidth = 16;
@@ -488,6 +495,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
        this->m_maskingIsFree = false;
        this->m_maskBitCount = 32;
    }
 #endif
    else {
        fprintf(stderr, "Target ISA \"%s\" is unknown.  Choices are: %s\n",
                isa, SupportedTargetISAs());
@@ -502,9 +510,11 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
            llvm::Reloc::Default;
        std::string featuresString = m_attributes;
        llvm::TargetOptions options;
 #ifdef ISPC_ARM_ENABLED
        if (m_isa == Target::NEON8 || m_isa == Target::NEON16 ||
            m_isa == Target::NEON32)
            options.FloatABIType = llvm::FloatABI::Hard;
 #endif
 #if !defined(LLVM_3_1)
        if (g->opt.disableFMA == false)
            options.AllowFPOpFusion = llvm::FPOpFusion::Fast;
@@ -596,13 +606,21 @@ Target::SupportedTargetCPUs() {
 const char *
 Target::SupportedTargetArchs() {
-    return "arm, x86, x86-64";
+    return
 #ifdef ISPC_ARM_ENABLED
        "arm, "
 #endif
        "x86, x86-64";
 }
 const char *
 Target::SupportedTargetISAs() {
-    return "neon-8, neon-16, neon-32, sse2, sse2-x2, sse4, sse4-8, sse4-16, sse4-x2, "
+    return
 #ifdef ISPC_ARM_ENABLED
        "neon-8, neon-16, neon-32, "
 #endif
        "sse2, sse2-x2, sse4, sse4-8, sse4-16, sse4-x2, "
        "avx, avx-x2, avx1.1, avx1.1-x2, avx2, avx2-x2, "
        "generic-1, generic-4, generic-8, generic-16, generic-32";
 }
@@ -611,10 +629,13 @@ Target::SupportedTargetISAs() {
 std::string
 Target::GetTripleString() const {
    llvm::Triple triple;
 #ifdef ISPC_ARM_ENABLED
    if (m_arch == "arm") {
        triple.setTriple("armv7-eabi");
    }
-    else {
+    else
 #endif
    {
        // Start with the host triple as the default
        triple.setTriple(llvm::sys::getDefaultTargetTriple());
@@ -637,12 +658,14 @@ Target::GetTripleString() const {
 const char *
 Target::ISAToString(ISA isa) {
    switch (isa) {
 #ifdef ISPC_ARM_ENABLED
    case Target::NEON8:
        return "neon-8";
    case Target::NEON16:
        return "neon-16";
    case Target::NEON32:
        return "neon-32";
 #endif
    case Target::SSE2:
        return "sse2";
    case Target::SSE4:
@@ -813,6 +836,7 @@ Globals::Globals() {
    includeStdlib = true;
    runCPP = true;
    debugPrint = false;
    debugIR = -1;
    disableWarnings = false;
    warningsAsErrors = false;
    quiet = false;
--- a/ispc.h
+++ b/ispc.h
@@ -59,6 +59,7 @@
 #include <stdlib.h>
 #include <stdio.h>
 #include <vector>
 #include <set>
 #include <string>
 /** @def ISPC_MAX_NVEC maximum vector size of any of the compilation
@@ -66,6 +67,9 @@
 */
 #define ISPC_MAX_NVEC 64
 // Number of final optimization phase
 #define LAST_OPT_NUMBER 1000
 // Forward declarations of a number of widely-used LLVM types
 namespace llvm {
    class AttributeSet;
@@ -175,7 +179,11 @@ public:
        flexible/performant of them will appear last in the enumerant.  Note
        also that __best_available_isa() needs to be updated if ISAs are
        added or the enumerant values are reordered.  */
-    enum ISA { NEON32, NEON16, NEON8, SSE2, SSE4, AVX, AVX11, AVX2, GENERIC,
+    enum ISA {
 #ifdef ISPC_ARM_ENABLED
               NEON32, NEON16, NEON8,
 #endif
               SSE2, SSE4, AVX, AVX11, AVX2, GENERIC,
               NUM_ISAS };
    /** Initializes the given Target pointer for a target of the given
@@ -495,6 +503,16 @@ struct Globals {
        ispc's execution. */
    bool debugPrint;
    /** Indicates which stages of optimization we want to dump. */
    std::set<int> debug_stages;
    /** Indicates after which optimization we want to generate
        DebugIR information. */
    int debugIR;
    /** Indicates which phases of optimization we want to switch off. */
    std::set<int> off_stages;
    /** Indicates whether all warning messages should be suppressed. */
    bool disableWarnings;
--- a/ispc.vcxproj
+++ b/ispc.vcxproj
@@ -1,4 +1,4 @@
-<?xml version="1.0" encoding="utf-8"?>
+<?xml version="1.0" encoding="utf-8"?>
 <Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
  <ItemGroup Label="ProjectConfigurations">
    <ProjectConfiguration Include="Debug|Win32">
@@ -45,12 +45,6 @@
    <ClCompile Include="$(Configuration)\gen-bitcode-generic-32-64bit.cpp" />
    <ClCompile Include="$(Configuration)\gen-bitcode-generic-64-32bit.cpp" />
    <ClCompile Include="$(Configuration)\gen-bitcode-generic-64-64bit.cpp" />
    <ClCompile Include="$(Configuration)\gen-bitcode-neon-8-32bit.cpp" />
    <ClCompile Include="$(Configuration)\gen-bitcode-neon-8-64bit.cpp" />
    <ClCompile Include="$(Configuration)\gen-bitcode-neon-16-32bit.cpp" />
    <ClCompile Include="$(Configuration)\gen-bitcode-neon-16-64bit.cpp" />
    <ClCompile Include="$(Configuration)\gen-bitcode-neon-32-32bit.cpp" />
    <ClCompile Include="$(Configuration)\gen-bitcode-neon-32-64bit.cpp" />
    <ClCompile Include="$(Configuration)\gen-bitcode-sse2-32bit.cpp" />
    <ClCompile Include="$(Configuration)\gen-bitcode-sse2-64bit.cpp" />
    <ClCompile Include="$(Configuration)\gen-bitcode-sse2-x2-32bit.cpp" />
@@ -191,60 +185,6 @@
      <Message>Building gen-bitcode-sse2-x2-64bit.cpp</Message>
    </CustomBuild>
  </ItemGroup>
  <ItemGroup>
    <CustomBuild Include="builtins\target-neon-8.ll">
      <FileType>Document</FileType>
      <Command>m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-neon-8.ll | python bitcode2cpp.py builtins\target-neon-8.ll 32bit &gt; $(Configuration)/gen-bitcode-neon-8-32bit.cpp</Command>
      <Outputs>$(Configuration)/gen-bitcode-neon-8-32bit.cpp</Outputs>
      <AdditionalInputs>builtins\util.m4;builtins\target-neon-common.ll</AdditionalInputs>
      <Message>Building gen-bitcode-neon-8-32bit.cpp</Message>
    </CustomBuild>
  </ItemGroup>
  <ItemGroup>
    <CustomBuild Include="builtins\target-neon-8.ll">
      <FileType>Document</FileType>
      <Command>m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-neon-8.ll | python bitcode2cpp.py builtins\target-neon-8.ll 64bit &gt; $(Configuration)/gen-bitcode-neon-8-64bit.cpp</Command>
      <Outputs>$(Configuration)/gen-bitcode-neon-8-64bit.cpp</Outputs>
      <AdditionalInputs>builtins\util.m4;builtins\target-neon-common.ll</AdditionalInputs>
      <Message>Building gen-bitcode-neon-8-64bit.cpp</Message>
    </CustomBuild>
  </ItemGroup>
  <ItemGroup>
    <CustomBuild Include="builtins\target-neon-16.ll">
      <FileType>Document</FileType>
      <Command>m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-neon-16.ll | python bitcode2cpp.py builtins\target-neon-16.ll 32bit &gt; $(Configuration)/gen-bitcode-neon-16-32bit.cpp</Command>
      <Outputs>$(Configuration)/gen-bitcode-neon-16-32bit.cpp</Outputs>
      <AdditionalInputs>builtins\util.m4;builtins\target-neon-common.ll</AdditionalInputs>
      <Message>Building gen-bitcode-neon-16-32bit.cpp</Message>
    </CustomBuild>
  </ItemGroup>
  <ItemGroup>
    <CustomBuild Include="builtins\target-neon-16.ll">
      <FileType>Document</FileType>
      <Command>m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-neon-16.ll | python bitcode2cpp.py builtins\target-neon-16.ll 64bit &gt; $(Configuration)/gen-bitcode-neon-16-64bit.cpp</Command>
      <Outputs>$(Configuration)/gen-bitcode-neon-16-64bit.cpp</Outputs>
      <AdditionalInputs>builtins\util.m4;builtins\target-neon-common.ll</AdditionalInputs>
      <Message>Building gen-bitcode-neon-16-64bit.cpp</Message>
    </CustomBuild>
  </ItemGroup>
  <ItemGroup>
    <CustomBuild Include="builtins\target-neon-32.ll">
      <FileType>Document</FileType>
      <Command>m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-neon-32.ll | python bitcode2cpp.py builtins\target-neon-32.ll 32bit &gt; $(Configuration)/gen-bitcode-neon-32-32bit.cpp</Command>
      <Outputs>$(Configuration)/gen-bitcode-neon-32-32bit.cpp</Outputs>
      <AdditionalInputs>builtins\util.m4;builtins\target-neon-common.ll</AdditionalInputs>
      <Message>Building gen-bitcode-neon-32-32bit.cpp</Message>
    </CustomBuild>
  </ItemGroup>
  <ItemGroup>
    <CustomBuild Include="builtins\target-neon-32.ll">
      <FileType>Document</FileType>
      <Command>m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-neon-32.ll | python bitcode2cpp.py builtins\target-neon-32.ll 64bit &gt; $(Configuration)/gen-bitcode-neon-32-64bit.cpp</Command>
      <Outputs>$(Configuration)/gen-bitcode-neon-32-64bit.cpp</Outputs>
      <AdditionalInputs>builtins\util.m4;builtins\target-neon-common.ll</AdditionalInputs>
      <Message>Building gen-bitcode-neon-32-64bit.cpp</Message>
    </CustomBuild>
  </ItemGroup>
  <ItemGroup>
    <CustomBuild Include="builtins\target-avx1.ll">
      <FileType>Document</FileType>
@@ -263,6 +203,26 @@
      <Message>Building gen-bitcode-avx1-64bit.cpp</Message>
    </CustomBuild>
  </ItemGroup>
  <ItemGroup>
    <CustomBuild Include="builtins\target-avx1.ll">
      <FileType>Document</FileType>
      <Command>m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx1.ll | python bitcode2cpp.py builtins\target-avx1.ll 32bit &gt; $(Configuration)/gen-bitcode-avx1-32bit.cpp</Command>
      <Outputs>$(Configuration)/gen-bitcode-avx1-32bit.cpp</Outputs>
      <AdditionalInputs>builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll</AdditionalInputs>
      <Message>Building gen-bitcode-avx1-32bit.cpp</Message>
    </CustomBuild>
  </ItemGroup>
  <ItemGroup>
    <CustomBuild Include="builtins\target-avx1.ll">
      <FileType>Document</FileType>
      <Command>m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx1.ll | python bitcode2cpp.py builtins\target-avx1.ll 64bit &gt; $(Configuration)/gen-bitcode-avx1-64bit.cpp</Command>
      <Outputs>$(Configuration)/gen-bitcode-avx1-64bit.cpp</Outputs>
      <AdditionalInputs>builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll</AdditionalInputs>
      <Message>Building gen-bitcode-avx1-64bit.cpp</Message>
    </CustomBuild>
  </ItemGroup>
  <ItemGroup>
    <CustomBuild Include="builtins\target-avx1-x2.ll">
      <FileType>Document</FileType>
--- a/main.cpp
+++ b/main.cpp
@@ -155,6 +155,11 @@ devUsage(int ret) {
    printf("        disable-uniform-control-flow\t\tDisable uniform control flow optimizations\n");
    printf("        disable-uniform-memory-optimizations\tDisable uniform-based coherent memory access\n");
    printf("    [--yydebug]\t\t\t\tPrint debugging information during parsing\n");
    printf("    [--debug-phase=<value>]\t\tSet optimization phases to dump. --debug-phase=first,210:220,300,305,310:last\n");
 #ifdef LLVM_3_4
    printf("    [--debug-ir=<value>]\t\tSet optimization phase to generate debugIR after it\n");
 #endif
    printf("    [--off-phase=<value>]\t\tSwitch off optimization phases. --off-phase=first,210:220,300,305,310:last\n");
    exit(ret);
 }
@@ -211,6 +216,47 @@ lSignal(void *) {
 }
 /** Parses one optimization phase name located at the start of the given
     string.  Accepted forms are "first" (phase 0), "last" (LAST_OPT_NUMBER),
     or a decimal number in [0, LAST_OPT_NUMBER] that is immediately followed
     by the end of the string, ',' or ':'.  Any other input is reported on
     stderr and terminates the process with a failure exit status.
     @param stage  Text to parse; only its leading token is examined.
     @return       The phase number denoted by the token.
  */
 static int ParsingPhaseName(char * stage) {
    if (strncmp(stage, "first", 5) == 0)
        return 0;
    if (strncmp(stage, "last", 4) == 0)
        return LAST_OPT_NUMBER;
    // strtol, unlike atoi, tells us where the number stopped, so input such
    // as "abc" or "" is rejected instead of silently becoming phase 0.
    // Out-of-range input saturates to LONG_MIN/LONG_MAX and therefore fails
    // the range check below.
    char *numberEnd = NULL;
    long value = strtol(stage, &numberEnd, 10);
    bool wellFormed = (numberEnd != stage) &&
        (*numberEnd == '\0' || *numberEnd == ',' || *numberEnd == ':');
    if (!wellFormed || value < 0 || value > LAST_OPT_NUMBER) {
        fprintf(stderr, "Phases must be from 0 to %d. %s is incorrect.\n", LAST_OPT_NUMBER, stage);
        // This is an error path, so report failure to the caller's shell.
        exit(1);
    }
    return (int)value;
 }
 /** Parses a comma-separated list of phases and phase ranges, for example
     "first,210:220,300,305,310:last", into the set of all phase numbers it
     covers.  A range "a:b" contributes every phase from a to b inclusive
     (nothing if a > b); a token with nothing after its ':' contributes just
     the single phase before it.  Empty tokens (e.g. a trailing comma) are
     ignored.  Malformed phase names terminate the process via
     ParsingPhaseName().
     @param stages  Null-terminated option value; it is not modified.
     @return        The set of selected phase numbers.
  */
 static std::set<int> ParsingPhases(char * stages) {
    std::set<int> phases;
    char *token = stages;
    while (*token != '\0') {
        // Locate the end of the current token and remember its last ':'.
        char *tokenEnd = token;
        char *rangeSep = NULL;
        for (; *tokenEnd != '\0' && *tokenEnd != ','; ++tokenEnd) {
            if (*tokenEnd == ':')
                rangeSep = tokenEnd;
        }
        if (tokenEnd != token) {
            int begin = ParsingPhaseName(token);
            int end = begin;
            // Only parse an upper bound when something follows the ':'.
            if (rangeSep != NULL && rangeSep + 1 != tokenEnd)
                end = ParsingPhaseName(rangeSep + 1);
            for (int j = begin; j <= end; j++)
                phases.insert(j);
        }
        // Step over the separating comma, or stop on the terminator.
        token = (*tokenEnd == ',') ? tokenEnd + 1 : tokenEnd;
    }
    return phases;
 }
 static void
 lParseInclude(const char *path) {
 #ifdef ISPC_IS_WINDOWS
@@ -253,6 +299,8 @@ int main(int Argc, char *Argv[]) {
    LLVMInitializeX86Disassembler();
    LLVMInitializeX86TargetMC();
 #endif // !__ARM__
 #ifdef ISPC_ARM_ENABLED
    // Generating ARM from x86 is more likely to be useful, though.
    LLVMInitializeARMTargetInfo();
    LLVMInitializeARMTarget();
@@ -260,6 +308,7 @@ int main(int Argc, char *Argv[]) {
    LLVMInitializeARMAsmParser();
    LLVMInitializeARMDisassembler();
    LLVMInitializeARMTargetMC();
 #endif
    char *file = NULL;
    const char *headerFileName = NULL;
@@ -486,6 +535,20 @@ int main(int Argc, char *Argv[]) {
          }
          hostStubFileName = argv[i];
        }
        else if (strncmp(argv[i], "--debug-phase=", 14) == 0) {
            fprintf(stderr, "WARNING: Adding debug phases may change the way PassManager"
                            "handles the phases and it may possibly make some bugs go"
                            "away or introduce the new ones.\n");
            g->debug_stages = ParsingPhases(argv[i] + strlen("--debug-phase="));
        }
 #ifdef LLVM_3_4
        else if (strncmp(argv[i], "--debug-ir=", 11) == 0) {
            g->debugIR = ParsingPhaseName(argv[i] + strlen("--debug-ir="));
        }
 #endif
        else if (strncmp(argv[i], "--off-phase=", 12) == 0) {
            g->off_stages = ParsingPhases(argv[i] + strlen("--off-phase="));
        }
        else if (!strcmp(argv[i], "-v") || !strcmp(argv[i], "--version")) {
            lPrintVersion();
            return 0;
--- a/opt.cpp
+++ b/opt.cpp
@@ -63,6 +63,9 @@
  #include <llvm/IR/BasicBlock.h>
  #include <llvm/IR/Constants.h>
 #endif
 #if defined (LLVM_3_4)
  #include <llvm/Transforms/Instrumentation.h>
 #endif
 #include <llvm/PassManager.h>
 #include <llvm/PassRegistry.h>
 #include <llvm/Assembly/PrintModulePass.h>
@@ -119,6 +122,8 @@ static llvm::Pass *CreateReplacePseudoMemoryOpsPass();
 static llvm::Pass *CreateIsCompileTimeConstantPass(bool isLastTry);
 static llvm::Pass *CreateMakeInternalFuncsStaticPass();
 static llvm::Pass *CreateDebugPass(char * output);
 #define DEBUG_START_PASS(NAME)                                 \
    if (g->debugPrint &&                                       \
        (getenv("FUNC") == NULL ||                             \
@@ -395,6 +400,54 @@ lGetMaskStatus(llvm::Value *mask, int vecWidth = -1) {
 }
 ///////////////////////////////////////////////////////////////////////////
 // Wrapper around llvm::PassManager that numbers passes as they are added.
 //   run() is forwarded unchanged to the wrapped PassManager, while add()
 //   consults the global options in order to:
 //   - leave out a pass whose number is listed in g->off_stages,
 //   - dump the LLVM IR after a pass whose number is listed in g->debug_stages,
 //   - (LLVM 3.4 builds only) generate debug IR after pass number g->debugIR.
 class DebugPassManager {
 public:
    DebugPassManager():number(0){}
    // Adds pass P.  When stage is -1 the pass gets the previous pass's
    // number plus one; otherwise the counter is reset to the given stage.
    void add(llvm::Pass * P, int stage = -1);
    // Runs the accumulated passes on module M.
    bool run(llvm::Module& M) {return PM.run(M);}
    // Gives direct access to the wrapped manager for APIs that require a
    // real llvm::PassManager.
    llvm::PassManager& getPM() {return PM;}
 private:
    llvm::PassManager PM;
    // Number assigned to the most recently added pass.
    int number;
 };
 void
 DebugPassManager::add(llvm::Pass * P, int stage) {
    // taking number of optimization
    if (stage == -1) {
        number++;
    }
    else {
        number = stage;
    }
    if (g->off_stages.find(number) != g->off_stages.end()) {
        // This optimization is switched off.
        // NOTE(review): P is never handed to PM on this path and nothing in
        // this function frees it -- confirm that dropping it is acceptable.
        return;
    }
    // adding optimization (not switched off)
    PM.add(P);
    if (g->debug_stages.find(number) != g->debug_stages.end()) {
        // adding dump of LLVM IR after optimization
        char buf[100];
        // Write the fixed-size header first, then bound the pass name with a
        // precision so that a long name can never overrun buf.  The trailer
        // "*****\n\n" plus the terminating NUL needs 8 bytes.
        int used = sprintf(buf, "\n\n*****LLVM IR after phase %d: ", number);
        int room = (int)sizeof(buf) - used - 8;
        sprintf(buf + used, "%.*s*****\n\n", room, P->getPassName());
        // NOTE(review): buf lives on this stack frame; presumably
        // CreateDebugPass copies the text -- verify against its definition.
        PM.add(CreateDebugPass(buf));
    }
 #ifdef LLVM_3_4
    if (g->debugIR == number) {
        // adding generating of LLVM IR debug after optimization
        char buf[100];
        sprintf(buf, "Debug_IR_after_%d_phase.bc", number);
        PM.add(llvm::createDebugIRPass(true, true, ".", buf));
    }
 #endif
 }
 ///////////////////////////////////////////////////////////////////////////
 void
@@ -403,14 +456,8 @@ Optimize(llvm::Module *module, int optLevel) {
        printf("*** Code going into optimization ***\n");
        module->dump();
    }
-
+    DebugPassManager optPM;
-    llvm::PassManager optPM;
+    optPM.add(llvm::createVerifierPass(),0);
    optPM.add(llvm::createVerifierPass());
 #if 0
    std::string err;
    optPM.add(llvm::createPrintModulePass(new llvm::raw_fd_ostream("-", err)));
 #endif
    llvm::TargetLibraryInfo *targetLibraryInfo =
        new llvm::TargetLibraryInfo(llvm::Triple(module->getTargetTriple()));
@@ -427,7 +474,7 @@ Optimize(llvm::Module *module, int optLevel) {
    optPM.add(new llvm::TargetTransformInfo(targetMachine->getScalarTargetTransformInfo(),
                                            targetMachine->getVectorTargetTransformInfo()));
  #else // LLVM 3.3+
-    targetMachine->addAnalysisPasses(optPM);
+    targetMachine->addAnalysisPasses(optPM.getPM());
  #endif
 #endif
@@ -439,11 +486,11 @@ Optimize(llvm::Module *module, int optLevel) {
        // run absolutely no optimizations, since the front-end needs us to
        // take the various __pseudo_* functions it has emitted and turn
        // them into something that can actually execute.
-        optPM.add(CreateImproveMemoryOpsPass());
+        optPM.add(CreateImproveMemoryOpsPass(), 100);
        if (g->opt.disableHandlePseudoMemoryOps == false)
            optPM.add(CreateReplacePseudoMemoryOpsPass());
-        optPM.add(CreateIntrinsicsOptPass());
+        optPM.add(CreateIntrinsicsOptPass(), 102);
        optPM.add(CreateIsCompileTimeConstantPass(true));
        optPM.add(llvm::createFunctionInliningPass());
        optPM.add(CreateMakeInternalFuncsStaticPass());
@@ -462,7 +509,7 @@ Optimize(llvm::Module *module, int optLevel) {
        llvm::initializeInstrumentation(*registry);
        llvm::initializeTarget(*registry);
-        optPM.add(llvm::createGlobalDCEPass());
+        optPM.add(llvm::createGlobalDCEPass(), 200);
        // Early optimizations to try to reduce the total amount of code to
        // work with if we can
@@ -476,14 +523,14 @@ Optimize(llvm::Module *module, int optLevel) {
        if (g->opt.disableGatherScatterOptimizations == false &&
            g->target->getVectorWidth() > 1) {
-            optPM.add(llvm::createInstructionCombiningPass());
+            optPM.add(llvm::createInstructionCombiningPass(), 210);
            optPM.add(CreateImproveMemoryOpsPass());
        }
        if (!g->opt.disableMaskAllOnOptimizations) {
-            optPM.add(CreateIntrinsicsOptPass());
+            optPM.add(CreateIntrinsicsOptPass(), 215);
            optPM.add(CreateInstructionSimplifyPass());
        }
-        optPM.add(llvm::createDeadInstEliminationPass());
+        optPM.add(llvm::createDeadInstEliminationPass(), 220);
        // Max struct size threshold for scalar replacement is
        //    1) 4 fields (r,g,b,w)
@@ -513,10 +560,10 @@ Optimize(llvm::Module *module, int optLevel) {
 #if defined(LLVM_3_1) || defined(LLVM_3_2) || defined(LLVM_3_3)
        // Starting from 3.4 this functionality was moved to
        // InstructionCombiningPass. See r184459 for details.
-        optPM.add(llvm::createSimplifyLibCallsPass());
+        optPM.add(llvm::createSimplifyLibCallsPass(), 240);
 #endif
        optPM.add(llvm::createAggressiveDCEPass());
-        optPM.add(llvm::createInstructionCombiningPass());
+        optPM.add(llvm::createInstructionCombiningPass(), 241);
        optPM.add(llvm::createJumpThreadingPass());
        optPM.add(llvm::createCFGSimplificationPass());
        optPM.add(llvm::createScalarReplAggregatesPass(sr_threshold));
@@ -524,44 +571,45 @@ Optimize(llvm::Module *module, int optLevel) {
        optPM.add(llvm::createTailCallEliminationPass());
        if (!g->opt.disableMaskAllOnOptimizations) {
-            optPM.add(CreateIntrinsicsOptPass());
+            optPM.add(CreateIntrinsicsOptPass(), 250);
            optPM.add(CreateInstructionSimplifyPass());
        }
        if (g->opt.disableGatherScatterOptimizations == false &&
            g->target->getVectorWidth() > 1) {
-            optPM.add(llvm::createInstructionCombiningPass());
+            optPM.add(llvm::createInstructionCombiningPass(), 255);
            optPM.add(CreateImproveMemoryOpsPass());
            if (g->opt.disableCoalescing == false &&
                g->target->getISA() != Target::GENERIC) {
                // It is important to run this here to make it easier to
                // find matching gathers we can coalesce.
-                optPM.add(llvm::createEarlyCSEPass());
+                optPM.add(llvm::createEarlyCSEPass(), 260);
                optPM.add(CreateGatherCoalescePass());
            }
        }
-        optPM.add(llvm::createFunctionInliningPass());
+        optPM.add(llvm::createFunctionInliningPass(), 265);
        optPM.add(llvm::createConstantPropagationPass());
        optPM.add(CreateIntrinsicsOptPass());
        optPM.add(CreateInstructionSimplifyPass());
        if (g->opt.disableGatherScatterOptimizations == false &&
            g->target->getVectorWidth() > 1) {
-            optPM.add(llvm::createInstructionCombiningPass());
+            optPM.add(llvm::createInstructionCombiningPass(), 270);
            optPM.add(CreateImproveMemoryOpsPass());
        }
-        optPM.add(llvm::createIPSCCPPass());
+        optPM.add(llvm::createIPSCCPPass(), 275);
        optPM.add(llvm::createDeadArgEliminationPass());
        optPM.add(llvm::createAggressiveDCEPass());
        optPM.add(llvm::createInstructionCombiningPass());
        optPM.add(llvm::createCFGSimplificationPass());
-        if (g->opt.disableHandlePseudoMemoryOps == false)
+        if (g->opt.disableHandlePseudoMemoryOps == false) {
-            optPM.add(CreateReplacePseudoMemoryOpsPass());
+            optPM.add(CreateReplacePseudoMemoryOpsPass(),280);
-        optPM.add(CreateIntrinsicsOptPass());
+        }
        optPM.add(CreateIntrinsicsOptPass(),281);
        optPM.add(CreateInstructionSimplifyPass());
        optPM.add(llvm::createFunctionInliningPass());
@@ -579,9 +627,10 @@ Optimize(llvm::Module *module, int optLevel) {
        optPM.add(llvm::createIndVarSimplifyPass());
        optPM.add(llvm::createLoopIdiomPass());
        optPM.add(llvm::createLoopDeletionPass());
-        if (g->opt.unrollLoops)
+        if (g->opt.unrollLoops) {
-            optPM.add(llvm::createLoopUnrollPass());
+            optPM.add(llvm::createLoopUnrollPass(), 300);
-        optPM.add(llvm::createGVNPass());
+        }
        optPM.add(llvm::createGVNPass(), 301);
        optPM.add(CreateIsCompileTimeConstantPass(true));
        optPM.add(CreateIntrinsicsOptPass());
@@ -609,7 +658,7 @@ Optimize(llvm::Module *module, int optLevel) {
    // Finish up by making sure we didn't mess anything up in the IR along
    // the way.
-    optPM.add(llvm::createVerifierPass());
+    optPM.add(llvm::createVerifierPass(), LAST_OPT_NUMBER);
    optPM.run(*module);
    if (g->debugPrint) {
@@ -4330,6 +4379,42 @@ CreateIsCompileTimeConstantPass(bool isLastTry) {
    return new IsCompileTimeConstantPass(isLastTry);
 }
 //////////////////////////////////////////////////////////////////////////
 // DebugPass
 /** This pass is added to the list of passes after each optimization
    that we want to debug; it prints a dump of the LLVM IR to stderr.
    It also prints the name and number of the preceding optimization.
 */
 class DebugPass : public llvm::ModulePass {
 public:
    static char ID;

    /** Stores a copy of the banner text that runOnModule() prints.

        @param output  NUL-terminated text to print ahead of the IR dump.
                       At most sizeof(str_output) - 1 characters are kept;
                       anything longer is truncated.  NULL is treated as
                       an empty string.
     */
    DebugPass(char * output) : ModulePass(ID) {
        // A plain "%s" would write past the end of str_output for long
        // inputs.  The "%.*s" precision caps the number of characters
        // copied, so the result (plus the terminating NUL) always fits.
        // sprintf with a precision is used rather than snprintf so no
        // additional library support is required.
        sprintf(str_output, "%.*s",
                static_cast<int>(sizeof(str_output) - 1),
                output != NULL ? output : "");
    }

    /** Name reported for this pass. */
    const char *getPassName() const { return "Dump LLVM IR"; }

    bool runOnModule(llvm::Module &m);

 private:
    // Fixed-size copy of the banner text handed to the constructor.
    char str_output[100];
 };
 // Definition of the static ID member declared in DebugPass; the DebugPass
 // constructor passes it to the llvm::ModulePass base class.
 // NOTE(review): presumably LLVM identifies the pass by this member's
 // address rather than by its value -- confirm against the LLVM pass docs.
 char DebugPass::ID = 0;
 /** Writes the stored banner text to stderr, flushes stderr, and then
     dumps the module.

     @param module  Module whose IR is dumped; it is not changed here.
     @return false, because this pass only prints and never modifies the
             module (the return value tells the pass manager whether the
             IR was changed).
  */
 bool
 DebugPass::runOnModule(llvm::Module &module) {
    fprintf(stderr, "%s", str_output);
    // Flush before dumping -- presumably so the banner is emitted ahead of
    // whatever module.dump() writes through LLVM's own output stream.
    fflush(stderr);
    module.dump();
    return false;
 }
 /** Factory for DebugPass: allocates a new pass carrying the given banner
     text and returns it as a generic llvm::Pass pointer.
  */
 static llvm::Pass *
 CreateDebugPass(char * output) {
    llvm::Pass *debugPass = new DebugPass(output);
    return debugPass;
 }
 ///////////////////////////////////////////////////////////////////////////
 // MakeInternalFuncsStaticPass