diff --git a/Makefile b/Makefile index 98729bfc..5bac4a6e 100644 --- a/Makefile +++ b/Makefile @@ -39,6 +39,10 @@ LLVM_CONFIG=$(shell which llvm-config) CLANG_INCLUDE=$(shell $(LLVM_CONFIG) --includedir) +# Enable ARM by request +# To enable: make ARM_ENABLED=1 +ARM_ENABLED=0 + # Add llvm bin to the path so any scripts run will go to the right llvm-config LLVM_BIN= $(shell $(LLVM_CONFIG) --bindir) export PATH:=$(LLVM_BIN):$(PATH) @@ -55,12 +59,15 @@ LLVM_CXXFLAGS=$(shell $(LLVM_CONFIG) --cppflags) LLVM_VERSION=LLVM_$(shell $(LLVM_CONFIG) --version | sed -e s/\\./_/ -e s/svn//) LLVM_VERSION_DEF=-D$(LLVM_VERSION) -LLVM_COMPONENTS = engine ipo bitreader bitwriter instrumentation linker arm +LLVM_COMPONENTS = engine ipo bitreader bitwriter instrumentation linker # Component "option" was introduced in 3.3 and starting with 3.4 it is required for the link step. # We check if it's available before adding it (to not break 3.2 and earlier). ifeq ($(shell $(LLVM_CONFIG) --components |grep -c option), 1) LLVM_COMPONENTS+=option endif +ifneq ($(ARM_ENABLED), 0) + LLVM_COMPONENTS+=arm +endif LLVM_LIBS=$(shell $(LLVM_CONFIG) --libs $(LLVM_COMPONENTS)) CLANG=clang @@ -104,6 +111,9 @@ OPT=-O2 CXXFLAGS=$(OPT) $(LLVM_CXXFLAGS) -I. -Iobjs/ -I$(CLANG_INCLUDE) \ -Wall $(LLVM_VERSION_DEF) \ -DBUILD_DATE="\"$(BUILD_DATE)\"" -DBUILD_VERSION="\"$(BUILD_VERSION)\"" +ifneq ($(ARM_ENABLED), 0) + CXXFLAGS+=-DISPC_ARM_ENABLED +endif LDFLAGS= ifeq ($(ARCH_OS),Linux) @@ -122,10 +132,12 @@ CXX_SRC=ast.cpp builtins.cpp cbackend.cpp ctx.cpp decl.cpp expr.cpp func.cpp \ type.cpp util.cpp HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \ opt.h stmt.h sym.h type.h util.h -TARGETS=neon-32 neon-16 neon-8 \ - avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 \ - sse2 sse2-x2 sse4 sse4-x2 sse4-8 sse4-16 \ - generic-1 generic-4 generic-8 generic-16 generic-32 generic-64 +TARGETS=avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 \ + sse2 sse2-x2 sse4-8 sse4-16 sse4 sse4-x2 \ + generic-4 generic-8 generic-16 generic-32 generic-64 generic-1 +ifneq ($(ARM_ENABLED), 0) + TARGETS+=neon-32 neon-16 neon-8 +endif # These files need to be compiled in two versions - 32 and 64 bits. BUILTINS_SRC_TARGET=$(addprefix builtins/target-, $(addsuffix .ll, $(TARGETS))) # These are files to be compiled in single version. 
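Note on the ARM_ENABLED block above: ARM/NEON support is now opt-in. Building with make ARM_ENABLED=1 links the LLVM arm component, adds the neon-32/neon-16/neon-8 builtin targets back to TARGETS, and passes -DISPC_ARM_ENABLED to the C++ sources; the remaining hunks in this diff wrap the NEON-specific code in that define. A minimal sketch of the compile-time gating pattern (illustrative only, not code taken from the patch):

#include <cstdio>

// NEON-specific pieces are compiled only when the Makefile passes
// -DISPC_ARM_ENABLED (i.e. when built with ARM_ENABLED=1).
static const char *SupportedTargetsSketch() {
    return
#ifdef ISPC_ARM_ENABLED
        "neon-8, neon-16, neon-32, "   // present only in ARM-enabled builds
#endif
        "sse2, sse4, avx";             // always present
}

int main() {
    std::printf("targets: %s\n", SupportedTargetsSketch());
    return 0;
}

Adjacent string literals are concatenated at compile time, which is the same trick the SupportedTargetArchs()/SupportedTargetISAs() changes in ispc.cpp rely on further down in this diff.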
@@ -134,12 +146,12 @@ BUILTINS_OBJS_32=$(addprefix builtins-, $(notdir $(BUILTINS_SRC_TARGET:.ll=-32bi BUILTINS_OBJS_64=$(addprefix builtins-, $(notdir $(BUILTINS_SRC_TARGET:.ll=-64bit.o))) BUILTINS_OBJS=$(addprefix builtins-, $(notdir $(BUILTINS_SRC_COMMON:.ll=.o))) \ $(BUILTINS_OBJS_32) $(BUILTINS_OBJS_64) \ - builtins-c-32.cpp builtins-c-64.cpp + builtins-c-32.cpp builtins-c-64.cpp BISON_SRC=parse.yy FLEX_SRC=lex.ll OBJS=$(addprefix objs/, $(CXX_SRC:.cpp=.o) $(BUILTINS_OBJS) \ - stdlib_mask1_ispc.o stdlib_mask8_ispc.o stdlib_mask16_ispc.o stdlib_mask32_ispc.o \ + stdlib_mask1_ispc.o stdlib_mask8_ispc.o stdlib_mask16_ispc.o stdlib_mask32_ispc.o \ $(BISON_SRC:.yy=.o) $(FLEX_SRC:.ll=.o)) default: ispc @@ -264,4 +276,3 @@ objs/stdlib_mask32_ispc.cpp: stdlib.ispc @echo Creating C++ source from $< for mask32 @$(CLANG) -E -x c -DISPC_MASK_BITS=32 -DISPC=1 -DPI=3.1415926536 $< -o - | \ python stdlib2cpp.py mask32 > $@ - diff --git a/builtins.cpp b/builtins.cpp index 82c45b02..f3a0cf59 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -631,7 +631,7 @@ AddBitcodeToModule(const unsigned char *bitcode, int length, llvm::Triple bcTriple(bcModule->getTargetTriple()); Debug(SourcePos(), "module triple: %s\nbitcode triple: %s\n", mTriple.str().c_str(), bcTriple.str().c_str()); -#ifndef __arm__ +#if defined(ISPC_ARM_ENABLED) && !defined(__arm__) // FIXME: More ugly and dangerous stuff. We really haven't set up // proper build and runtime infrastructure for ispc to do // cross-compilation, yet it's at minimum useful to be able to emit @@ -812,6 +812,8 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod // Next, add the target's custom implementations of the various needed // builtin functions (e.g. __masked_store_32(), etc). switch (g->target->getISA()) { + +#ifdef ISPC_ARM_ENABLED case Target::NEON8: { if (runtime32) { EXPORT_MODULE(builtins_bitcode_neon_8_32bit); @@ -839,6 +841,7 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod } break; } +#endif case Target::SSE2: { switch (g->target->getVectorWidth()) { case 4: diff --git a/examples/mandelbrot_tasks/Makefile b/examples/mandelbrot_tasks/Makefile index 7e83e618..1a565ffd 100644 --- a/examples/mandelbrot_tasks/Makefile +++ b/examples/mandelbrot_tasks/Makefile @@ -1,7 +1,7 @@ -EXAMPLE=mandelbrot -CPP_SRC=mandelbrot.cpp mandelbrot_serial.cpp -ISPC_SRC=mandelbrot.ispc +EXAMPLE=mandelbrot_tasks +CPP_SRC=mandelbrot_tasks.cpp mandelbrot_tasks_serial.cpp +ISPC_SRC=mandelbrot_tasks.ispc ISPC_IA_TARGETS=sse2,sse4-x2,avx-x2 ISPC_ARM_TARGETS=neon diff --git a/examples/mandelbrot_tasks/mandelbrot.cpp b/examples/mandelbrot_tasks/mandelbrot_tasks.cpp similarity index 99% rename from examples/mandelbrot_tasks/mandelbrot.cpp rename to examples/mandelbrot_tasks/mandelbrot_tasks.cpp index a01cfe43..dae22736 100644 --- a/examples/mandelbrot_tasks/mandelbrot.cpp +++ b/examples/mandelbrot_tasks/mandelbrot_tasks.cpp @@ -42,7 +42,7 @@ #include #include #include "../timing.h" -#include "mandelbrot_ispc.h" +#include "mandelbrot_tasks_ispc.h" using namespace ispc; extern void mandelbrot_serial(float x0, float y0, float x1, float y1, diff --git a/examples/mandelbrot_tasks/mandelbrot.ispc b/examples/mandelbrot_tasks/mandelbrot_tasks.ispc similarity index 100% rename from examples/mandelbrot_tasks/mandelbrot.ispc rename to examples/mandelbrot_tasks/mandelbrot_tasks.ispc diff --git a/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj b/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj index b92de72f..3a8fca79 100644 
--- a/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj +++ b/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj @@ -21,7 +21,7 @@ {E80DA7D4-AB22-4648-A068-327307156BE6} Win32Proj - mandelbrot + mandelbrot_tasks @@ -65,22 +65,22 @@ true $(ProjectDir)..\..;$(ExecutablePath) - mandelbrot + mandelbrot_tasks true $(ProjectDir)..\..;$(ExecutablePath) - mandelbrot + mandelbrot_tasks false $(ProjectDir)..\..;$(ExecutablePath) - mandelbrot + mandelbrot_tasks false $(ProjectDir)..\..;$(ExecutablePath) - mandelbrot + mandelbrot_tasks @@ -153,12 +153,12 @@ - - + + - + Document ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2 diff --git a/examples/mandelbrot_tasks/mandelbrot_serial.cpp b/examples/mandelbrot_tasks/mandelbrot_tasks_serial.cpp similarity index 100% rename from examples/mandelbrot_tasks/mandelbrot_serial.cpp rename to examples/mandelbrot_tasks/mandelbrot_tasks_serial.cpp diff --git a/examples/perf.ini b/examples/perf.ini index 3814bf16..d2a5c73e 100755 --- a/examples/perf.ini +++ b/examples/perf.ini @@ -22,7 +22,7 @@ mandelbrot #*** Mandelbrot Set mandelbrot_tasks -mandelbrot +mandelbrot_tasks ^ #*** Perlin Noise Function diff --git a/examples/perf.py b/examples/perf.py index 8503bd8c..f96ef9ec 100755 --- a/examples/perf.py +++ b/examples/perf.py @@ -73,10 +73,19 @@ def cpu_get(): #returns cpu_usage def cpu_check(): if is_windows == False: - cpu1 = cpu_get() - time.sleep(1) - cpu2 = cpu_get() - cpu_percent = (float(cpu1[0] - cpu2[0])/float(cpu1[1] - cpu2[1]))*100 + if is_mac == False: + cpu1 = cpu_get() + time.sleep(1) + cpu2 = cpu_get() + cpu_percent = (float(cpu1[0] - cpu2[0])/float(cpu1[1] - cpu2[1]))*100 + else: + os.system("sysctl -n vm.loadavg > cpu_temp") + c = open("cpu_temp", 'r') + c_line = c.readline() + c.close + os.remove("cpu_temp") + R = c_line.split(' ') + cpu_percent = float(R[1]) * 3 else: os.system("wmic cpu get loadpercentage /value > cpu_temp") c = open("cpu_temp", 'r') @@ -143,6 +152,8 @@ parser.add_option('-p', '--path', dest='path', global is_windows is_windows = (platform.system() == 'Windows' or 'CYGWIN_NT' in platform.system()) +global is_mac +is_mac = (platform.system() == 'Darwin') # save corrent path pwd = os.getcwd() diff --git a/examples/tasksys.cpp b/examples/tasksys.cpp index b4ced5c7..c9c2fa7b 100644 --- a/examples/tasksys.cpp +++ b/examples/tasksys.cpp @@ -365,7 +365,7 @@ lAtomicCompareAndSwap32(volatile int32_t *v, int32_t newValue, int32_t oldValue) static inline int32_t lAtomicAdd(volatile int32_t *v, int32_t delta) { #ifdef ISPC_IS_WINDOWS - return InterlockedAdd((volatile LONG *)v, delta); + return InterlockedExchangeAdd((volatile LONG *)v, delta)+delta; #else return __sync_fetch_and_add(v, delta); #endif diff --git a/ispc.cpp b/ispc.cpp index 0f07895f..a012b08d 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -141,10 +141,12 @@ lGetSystemISA() { static const char *supportedCPUs[] = { +#ifdef ISPC_ARM_ENABLED // FIXME: LLVM supports a ton of different ARM CPU variants--not just // cortex-a9 and a15. We should be able to handle any of them that also // have NEON support. "cortex-a9", "cortex-a15", +#endif "atom", "penryn", "core2", "corei7", "corei7-avx" #if !defined(LLVM_3_1) , "core-avx-i", "core-avx2" @@ -185,9 +187,11 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : // possible ISA based on that. 
if (!strcmp(cpu, "core-avx2")) isa = "avx2"; +#ifdef ISPC_ARM_ENABLED else if (!strcmp(cpu, "cortex-a9") || !strcmp(cpu, "cortex-a15")) isa = "neon-32"; +#endif else if (!strcmp(cpu, "core-avx-i")) isa = "avx1.1"; else if (!strcmp(cpu, "sandybridge") || @@ -211,7 +215,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : } } -#if !defined(__arm__) +#if defined(ISPC_ARM_ENABLED) && !defined(__arm__) if (cpu == NULL && !strncmp(isa, "neon", 4)) // If we're compiling NEON on an x86 host and the CPU wasn't // supplied, don't go and set the CPU based on the host... @@ -246,9 +250,11 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_cpu = cpu; if (arch == NULL) { +#ifdef ISPC_ARM_ENABLED if (!strncmp(isa, "neon", 4)) arch = "arm"; else +#endif arch = "x86-64"; } @@ -461,6 +467,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_hasGather = true; #endif } +#ifdef ISPC_ARM_ENABLED else if (!strcasecmp(isa, "neon-8")) { this->m_isa = Target::NEON8; this->m_nativeVectorWidth = 16; @@ -488,6 +495,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_maskingIsFree = false; this->m_maskBitCount = 32; } +#endif else { fprintf(stderr, "Target ISA \"%s\" is unknown. Choices are: %s\n", isa, SupportedTargetISAs()); @@ -502,9 +510,11 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : llvm::Reloc::Default; std::string featuresString = m_attributes; llvm::TargetOptions options; +#ifdef ISPC_ARM_ENABLED if (m_isa == Target::NEON8 || m_isa == Target::NEON16 || m_isa == Target::NEON32) options.FloatABIType = llvm::FloatABI::Hard; +#endif #if !defined(LLVM_3_1) if (g->opt.disableFMA == false) options.AllowFPOpFusion = llvm::FPOpFusion::Fast; @@ -596,13 +606,21 @@ Target::SupportedTargetCPUs() { const char * Target::SupportedTargetArchs() { - return "arm, x86, x86-64"; + return +#ifdef ISPC_ARM_ENABLED + "arm, " +#endif + "x86, x86-64"; } const char * Target::SupportedTargetISAs() { - return "neon-8, neon-16, neon-32, sse2, sse2-x2, sse4, sse4-8, sse4-16, sse4-x2, " + return +#ifdef ISPC_ARM_ENABLED + "neon-8, neon-16, neon-32, " +#endif + "sse2, sse2-x2, sse4, sse4-8, sse4-16, sse4-x2, " "avx, avx-x2, avx1.1, avx1.1-x2, avx2, avx2-x2, " "generic-1, generic-4, generic-8, generic-16, generic-32"; } @@ -611,10 +629,13 @@ Target::SupportedTargetISAs() { std::string Target::GetTripleString() const { llvm::Triple triple; +#ifdef ISPC_ARM_ENABLED if (m_arch == "arm") { triple.setTriple("armv7-eabi"); } - else { + else +#endif + { // Start with the host triple as the default triple.setTriple(llvm::sys::getDefaultTargetTriple()); @@ -637,12 +658,14 @@ Target::GetTripleString() const { const char * Target::ISAToString(ISA isa) { switch (isa) { +#ifdef ISPC_ARM_ENABLED case Target::NEON8: return "neon-8"; case Target::NEON16: return "neon-16"; case Target::NEON32: return "neon-32"; +#endif case Target::SSE2: return "sse2"; case Target::SSE4: @@ -813,6 +836,7 @@ Globals::Globals() { includeStdlib = true; runCPP = true; debugPrint = false; + debugIR = -1; disableWarnings = false; warningsAsErrors = false; quiet = false; diff --git a/ispc.h b/ispc.h index 98fcd199..25a03e1d 100644 --- a/ispc.h +++ b/ispc.h @@ -59,6 +59,7 @@ #include #include #include +#include #include /** @def ISPC_MAX_NVEC maximum vector size of any of the compliation @@ -66,6 +67,9 @@ */ #define ISPC_MAX_NVEC 64 +// Number of final optimization phase +#define LAST_OPT_NUMBER 1000 + 
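A note on the examples/tasksys.cpp change earlier in this diff: InterlockedExchangeAdd returns the value the target held before the addition, while the InterlockedAdd call it replaces returned the result of the addition, so the patch adds delta to keep the old return-value behaviour. A portable sketch of that convention, using std::atomic in place of the Windows API (illustrative only, not code from the patch):

#include <atomic>
#include <cassert>
#include <cstdint>

// Fetch-and-add primitives (InterlockedExchangeAdd, __sync_fetch_and_add,
// std::atomic::fetch_add) return the *previous* value, so "+ delta" yields
// the updated value that InterlockedAdd used to return.
static int32_t lAtomicAddSketch(std::atomic<int32_t> &v, int32_t delta) {
    return v.fetch_add(delta) + delta;   // previous value + delta == new value
}

int main() {
    std::atomic<int32_t> counter{10};
    assert(lAtomicAddSketch(counter, 5) == 15);  // returns the updated value
    assert(counter.load() == 15);
    return 0;
}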
// Forward declarations of a number of widely-used LLVM types namespace llvm { class AttributeSet; @@ -175,7 +179,11 @@ public: flexible/performant of them will apear last in the enumerant. Note also that __best_available_isa() needs to be updated if ISAs are added or the enumerant values are reordered. */ - enum ISA { NEON32, NEON16, NEON8, SSE2, SSE4, AVX, AVX11, AVX2, GENERIC, + enum ISA { +#ifdef ISPC_ARM_ENABLED + NEON32, NEON16, NEON8, +#endif + SSE2, SSE4, AVX, AVX11, AVX2, GENERIC, NUM_ISAS }; /** Initializes the given Target pointer for a target of the given @@ -495,6 +503,16 @@ struct Globals { ispc's execution. */ bool debugPrint; + /** Indicates which stages of optimization we want to dump. */ + std::set debug_stages; + + /** Indicates after which optimization we want to generate + DebugIR information. */ + int debugIR; + + /** Indicates which phases of optimization we want to switch off. */ + std::set off_stages; + /** Indicates whether all warning messages should be surpressed. */ bool disableWarnings; diff --git a/ispc.vcxproj b/ispc.vcxproj index e9bf9d97..53386c4c 100755 --- a/ispc.vcxproj +++ b/ispc.vcxproj @@ -1,4 +1,4 @@ - + @@ -45,12 +45,6 @@ - - - - - - @@ -191,60 +185,6 @@ Building gen-bitcode-sse2-x2-64bit.cpp - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-neon-8.ll | python bitcode2cpp.py builtins\target-neon-8.ll 32bit > $(Configuration)/gen-bitcode-neon-8-32bit.cpp - $(Configuration)/gen-bitcode-neon-8-32bit.cpp - builtins\util.m4;builtins\target-neon-common.ll - Building gen-bitcode-neon-8-32bit.cpp - - - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-neon-8.ll | python bitcode2cpp.py builtins\target-neon-8.ll 64bit > $(Configuration)/gen-bitcode-neon-8-64bit.cpp - $(Configuration)/gen-bitcode-neon-8-64bit.cpp - builtins\util.m4;builtins\target-neon-common.ll - Building gen-bitcode-neon-8-64bit.cpp - - - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-neon-16.ll | python bitcode2cpp.py builtins\target-neon-16.ll 32bit > $(Configuration)/gen-bitcode-neon-16-32bit.cpp - $(Configuration)/gen-bitcode-neon-16-32bit.cpp - builtins\util.m4;builtins\target-neon-common.ll - Building gen-bitcode-neon-16-32bit.cpp - - - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-neon-16.ll | python bitcode2cpp.py builtins\target-neon-16.ll 64bit > $(Configuration)/gen-bitcode-neon-16-64bit.cpp - $(Configuration)/gen-bitcode-neon-16-64bit.cpp - builtins\util.m4;builtins\target-neon-common.ll - Building gen-bitcode-neon-16-64bit.cpp - - - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-neon-32.ll | python bitcode2cpp.py builtins\target-neon-32.ll 32bit > $(Configuration)/gen-bitcode-neon-32-32bit.cpp - $(Configuration)/gen-bitcode-neon-32-32bit.cpp - builtins\util.m4;builtins\target-neon-common.ll - Building gen-bitcode-neon-32-32bit.cpp - - - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-neon-32.ll | python bitcode2cpp.py builtins\target-neon-32.ll 64bit > $(Configuration)/gen-bitcode-neon-32-64bit.cpp - $(Configuration)/gen-bitcode-neon-32-64bit.cpp - builtins\util.m4;builtins\target-neon-common.ll - Building gen-bitcode-neon-32-64bit.cpp - - Document @@ -263,6 +203,26 @@ Building gen-bitcode-avx1-64bit.cpp
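One consequence of the ispc.h hunk above: wrapping NEON32/NEON16/NEON8 in #ifdef ISPC_ARM_ENABLED shifts the numeric values of every enumerator that follows them, so raw ISA numbers differ between ARM-enabled and x86-only builds and should not be stored or compared across builds. Within a single build the relative order, and the "most capable ISA appears last" property that __best_available_isa() depends on, is unchanged. An illustrative sketch (not code from the patch):

#include <cstdio>

// Conditionally compiled enumerators shift everything after them:
// SSE2 is 0 in an x86-only build but 3 when ISPC_ARM_ENABLED is defined.
enum ISA {
#ifdef ISPC_ARM_ENABLED
    NEON32, NEON16, NEON8,
#endif
    SSE2, SSE4, AVX,
    NUM_ISAS
};

int main() {
    std::printf("SSE2 = %d, NUM_ISAS = %d\n", (int)SSE2, (int)NUM_ISAS);
    return 0;
}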
Document diff --git a/main.cpp b/main.cpp index 8d7282f5..7290d3c8 100644 --- a/main.cpp +++ b/main.cpp @@ -155,6 +155,11 @@ devUsage(int ret) { printf(" disable-uniform-control-flow\t\tDisable uniform control flow optimizations\n"); printf(" disable-uniform-memory-optimizations\tDisable uniform-based coherent memory access\n"); printf(" [--yydebug]\t\t\t\tPrint debugging information during parsing\n"); + printf(" [--debug-phase=]\t\tSet optimization phases to dump. --debug-phase=first,210:220,300,305,310:last\n"); +#ifdef LLVM_3_4 + printf(" [--debug-ir=]\t\tSet optimization phase to generate debugIR after it\n"); +#endif + printf(" [--off-phase=]\t\tSwitch off optimization phases. --off-phase=first,210:220,300,305,310:last\n"); exit(ret); } @@ -211,6 +216,47 @@ lSignal(void *) { } +static int ParsingPhaseName(char * stage) { + if (strncmp(stage, "first", 5) == 0) { + return 0; + } + else if (strncmp(stage, "last", 4) == 0) { + return LAST_OPT_NUMBER; + } + else { + int t = atoi(stage); + if (t < 0 || t > LAST_OPT_NUMBER) { + fprintf(stderr, "Phases must be from 0 to %d. %s is incorrect.\n", LAST_OPT_NUMBER, stage); + exit(0); + } + else { + return t; + } + } +} + + +static std::set ParsingPhases(char * stages) { + std::set phases; + int begin = ParsingPhaseName(stages); + int end = begin; + + for (unsigned i = 0; i < strlen(stages); i++) { + if ((stages[i] == ',') || (i == strlen(stages) - 1)) { + for (int j = begin; j < end + 1; j++) { + phases.insert(j); + } + begin = ParsingPhaseName(stages + i + 1); + end = begin; + } + else if (stages[i] == ':') { + end = ParsingPhaseName(stages + i + 1); + } + } + return phases; +} + + static void lParseInclude(const char *path) { #ifdef ISPC_IS_WINDOWS @@ -253,6 +299,8 @@ int main(int Argc, char *Argv[]) { LLVMInitializeX86Disassembler(); LLVMInitializeX86TargetMC(); #endif // !__ARM__ + +#ifdef ISPC_ARM_ENABLED // Generating ARM from x86 is more likely to be useful, though.
LLVMInitializeARMTargetInfo(); LLVMInitializeARMTarget(); @@ -260,6 +308,7 @@ int main(int Argc, char *Argv[]) { LLVMInitializeARMAsmParser(); LLVMInitializeARMDisassembler(); LLVMInitializeARMTargetMC(); +#endif char *file = NULL; const char *headerFileName = NULL; @@ -486,6 +535,20 @@ int main(int Argc, char *Argv[]) { } hostStubFileName = argv[i]; } + else if (strncmp(argv[i], "--debug-phase=", 14) == 0) { + fprintf(stderr, "WARNING: Adding debug phases may change the way PassManager " + "handles the phases and it may possibly make some bugs go " + "away or introduce new ones.\n"); + g->debug_stages = ParsingPhases(argv[i] + strlen("--debug-phase=")); + } +#ifdef LLVM_3_4 + else if (strncmp(argv[i], "--debug-ir=", 11) == 0) { + g->debugIR = ParsingPhaseName(argv[i] + strlen("--debug-ir=")); + } +#endif + else if (strncmp(argv[i], "--off-phase=", 12) == 0) { + g->off_stages = ParsingPhases(argv[i] + strlen("--off-phase=")); + } else if (!strcmp(argv[i], "-v") || !strcmp(argv[i], "--version")) { lPrintVersion(); return 0; diff --git a/opt.cpp b/opt.cpp index 077320d5..3e2efcd8 100644 --- a/opt.cpp +++ b/opt.cpp @@ -63,6 +63,9 @@ #include #include #endif +#if defined (LLVM_3_4) + #include +#endif #include #include #include @@ -119,6 +122,8 @@ static llvm::Pass *CreateReplacePseudoMemoryOpsPass(); static llvm::Pass *CreateIsCompileTimeConstantPass(bool isLastTry); static llvm::Pass *CreateMakeInternalFuncsStaticPass(); +static llvm::Pass *CreateDebugPass(char * output); + #define DEBUG_START_PASS(NAME) \ if (g->debugPrint && \ (getenv("FUNC") == NULL || \ @@ -395,6 +400,54 @@ lGetMaskStatus(llvm::Value *mask, int vecWidth = -1) { } +/////////////////////////////////////////////////////////////////////////// +// This is a wrapper around llvm::PassManager. It duplicates the PassManager run() method +// and extends add() with extra checks and optional debug passes. +// The wrapper can control: +// - whether the optimization pass with a given number is switched off; +// - whether the LLVM IR is dumped after the pass with a given number; +// - whether DebugIR information for gdb is generated after the pass with a given number.
+class DebugPassManager { +public: + DebugPassManager():number(0){} + void add(llvm::Pass * P, int stage); + bool run(llvm::Module& M) {return PM.run(M);} + llvm::PassManager& getPM() {return PM;} + +private: + llvm::PassManager PM; + int number; +}; + +void +DebugPassManager::add(llvm::Pass * P, int stage = -1) { + // taking number of optimization + if (stage == -1) { + number++; + } + else { + number = stage; + } + if (g->off_stages.find(number) == g->off_stages.end()) { + // adding optimization (not switched off) + PM.add(P); + if (g->debug_stages.find(number) != g->debug_stages.end()) { + // adding dump of LLVM IR after optimization + char buf[100]; + sprintf(buf, "\n\n*****LLVM IR after phase %d: %s*****\n\n", + number, P->getPassName()); + PM.add(CreateDebugPass(buf)); + } +#ifdef LLVM_3_4 + if (g->debugIR == number) { + // adding generating of LLVM IR debug after optimization + char buf[100]; + sprintf(buf, "Debug_IR_after_%d_phase.bc", number); + PM.add(llvm::createDebugIRPass(true, true, ".", buf)); + } +#endif + } +} /////////////////////////////////////////////////////////////////////////// void @@ -403,14 +456,8 @@ Optimize(llvm::Module *module, int optLevel) { printf("*** Code going into optimization ***\n"); module->dump(); } - - llvm::PassManager optPM; - optPM.add(llvm::createVerifierPass()); - -#if 0 - std::string err; - optPM.add(llvm::createPrintModulePass(new llvm::raw_fd_ostream("-", err))); -#endif + DebugPassManager optPM; + optPM.add(llvm::createVerifierPass(),0); llvm::TargetLibraryInfo *targetLibraryInfo = new llvm::TargetLibraryInfo(llvm::Triple(module->getTargetTriple())); @@ -427,7 +474,7 @@ Optimize(llvm::Module *module, int optLevel) { optPM.add(new llvm::TargetTransformInfo(targetMachine->getScalarTargetTransformInfo(), targetMachine->getVectorTargetTransformInfo())); #else // LLVM 3.3+ - targetMachine->addAnalysisPasses(optPM); + targetMachine->addAnalysisPasses(optPM.getPM()); #endif #endif @@ -439,11 +486,11 @@ Optimize(llvm::Module *module, int optLevel) { // run absolutely no optimizations, since the front-end needs us to // take the various __pseudo_* functions it has emitted and turn // them into something that can actually execute. 
- optPM.add(CreateImproveMemoryOpsPass()); + optPM.add(CreateImproveMemoryOpsPass(), 100); if (g->opt.disableHandlePseudoMemoryOps == false) optPM.add(CreateReplacePseudoMemoryOpsPass()); - optPM.add(CreateIntrinsicsOptPass()); + optPM.add(CreateIntrinsicsOptPass(), 102); optPM.add(CreateIsCompileTimeConstantPass(true)); optPM.add(llvm::createFunctionInliningPass()); optPM.add(CreateMakeInternalFuncsStaticPass()); @@ -462,7 +509,7 @@ Optimize(llvm::Module *module, int optLevel) { llvm::initializeInstrumentation(*registry); llvm::initializeTarget(*registry); - optPM.add(llvm::createGlobalDCEPass()); + optPM.add(llvm::createGlobalDCEPass(), 200); // Early optimizations to try to reduce the total amount of code to // work with if we can @@ -476,14 +523,14 @@ Optimize(llvm::Module *module, int optLevel) { if (g->opt.disableGatherScatterOptimizations == false && g->target->getVectorWidth() > 1) { - optPM.add(llvm::createInstructionCombiningPass()); + optPM.add(llvm::createInstructionCombiningPass(), 210); optPM.add(CreateImproveMemoryOpsPass()); } if (!g->opt.disableMaskAllOnOptimizations) { - optPM.add(CreateIntrinsicsOptPass()); + optPM.add(CreateIntrinsicsOptPass(), 215); optPM.add(CreateInstructionSimplifyPass()); } - optPM.add(llvm::createDeadInstEliminationPass()); + optPM.add(llvm::createDeadInstEliminationPass(), 220); // Max struct size threshold for scalar replacement is // 1) 4 fields (r,g,b,w) @@ -513,10 +560,10 @@ Optimize(llvm::Module *module, int optLevel) { #if defined(LLVM_3_1) || defined(LLVM_3_2) || defined(LLVM_3_3) // Starting from 3.4 this functionality was moved to // InstructionCombiningPass. See r184459 for details. - optPM.add(llvm::createSimplifyLibCallsPass()); + optPM.add(llvm::createSimplifyLibCallsPass(), 240); #endif optPM.add(llvm::createAggressiveDCEPass()); - optPM.add(llvm::createInstructionCombiningPass()); + optPM.add(llvm::createInstructionCombiningPass(), 241); optPM.add(llvm::createJumpThreadingPass()); optPM.add(llvm::createCFGSimplificationPass()); optPM.add(llvm::createScalarReplAggregatesPass(sr_threshold)); @@ -524,44 +571,45 @@ Optimize(llvm::Module *module, int optLevel) { optPM.add(llvm::createTailCallEliminationPass()); if (!g->opt.disableMaskAllOnOptimizations) { - optPM.add(CreateIntrinsicsOptPass()); + optPM.add(CreateIntrinsicsOptPass(), 250); optPM.add(CreateInstructionSimplifyPass()); } if (g->opt.disableGatherScatterOptimizations == false && g->target->getVectorWidth() > 1) { - optPM.add(llvm::createInstructionCombiningPass()); + optPM.add(llvm::createInstructionCombiningPass(), 255); optPM.add(CreateImproveMemoryOpsPass()); if (g->opt.disableCoalescing == false && g->target->getISA() != Target::GENERIC) { // It is important to run this here to make it easier to // finding matching gathers we can coalesce.. 
- optPM.add(llvm::createEarlyCSEPass()); + optPM.add(llvm::createEarlyCSEPass(), 260); optPM.add(CreateGatherCoalescePass()); } } - optPM.add(llvm::createFunctionInliningPass()); + optPM.add(llvm::createFunctionInliningPass(), 265); optPM.add(llvm::createConstantPropagationPass()); optPM.add(CreateIntrinsicsOptPass()); optPM.add(CreateInstructionSimplifyPass()); if (g->opt.disableGatherScatterOptimizations == false && g->target->getVectorWidth() > 1) { - optPM.add(llvm::createInstructionCombiningPass()); + optPM.add(llvm::createInstructionCombiningPass(), 270); optPM.add(CreateImproveMemoryOpsPass()); } - optPM.add(llvm::createIPSCCPPass()); + optPM.add(llvm::createIPSCCPPass(), 275); optPM.add(llvm::createDeadArgEliminationPass()); optPM.add(llvm::createAggressiveDCEPass()); optPM.add(llvm::createInstructionCombiningPass()); optPM.add(llvm::createCFGSimplificationPass()); - if (g->opt.disableHandlePseudoMemoryOps == false) - optPM.add(CreateReplacePseudoMemoryOpsPass()); - optPM.add(CreateIntrinsicsOptPass()); + if (g->opt.disableHandlePseudoMemoryOps == false) { + optPM.add(CreateReplacePseudoMemoryOpsPass(),280); + } + optPM.add(CreateIntrinsicsOptPass(),281); optPM.add(CreateInstructionSimplifyPass()); optPM.add(llvm::createFunctionInliningPass()); @@ -579,9 +627,10 @@ Optimize(llvm::Module *module, int optLevel) { optPM.add(llvm::createIndVarSimplifyPass()); optPM.add(llvm::createLoopIdiomPass()); optPM.add(llvm::createLoopDeletionPass()); - if (g->opt.unrollLoops) - optPM.add(llvm::createLoopUnrollPass()); - optPM.add(llvm::createGVNPass()); + if (g->opt.unrollLoops) { + optPM.add(llvm::createLoopUnrollPass(), 300); + } + optPM.add(llvm::createGVNPass(), 301); optPM.add(CreateIsCompileTimeConstantPass(true)); optPM.add(CreateIntrinsicsOptPass()); @@ -609,7 +658,7 @@ Optimize(llvm::Module *module, int optLevel) { // Finish up by making sure we didn't mess anything up in the IR along // the way. - optPM.add(llvm::createVerifierPass()); + optPM.add(llvm::createVerifierPass(), LAST_OPT_NUMBER); optPM.run(*module); if (g->debugPrint) { @@ -4330,6 +4379,42 @@ CreateIsCompileTimeConstantPass(bool isLastTry) { return new IsCompileTimeConstantPass(isLastTry); } +////////////////////////////////////////////////////////////////////////// +// DebugPass + +/** This pass is added in list of passes after optimizations which + we want to debug and print dump of LLVM IR in stderr. Also it + prints name and number of previous optimization. + */ +class DebugPass : public llvm::ModulePass { +public: + static char ID; + DebugPass(char * output) : ModulePass(ID) { + sprintf(str_output, "%s", output); + } + + const char *getPassName() const { return "Dump LLVM IR"; } + bool runOnModule(llvm::Module &m); + +private: + char str_output[100]; +}; + +char DebugPass::ID = 0; + +bool +DebugPass::runOnModule(llvm::Module &module) { + fprintf(stderr, "%s", str_output); + fflush(stderr); + module.dump(); + return true; +} + +static llvm::Pass * +CreateDebugPass(char * output) { + return new DebugPass(output); +} + /////////////////////////////////////////////////////////////////////////// // MakeInternalFuncsStaticPass
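Usage note for the new options: --debug-phase= and --off-phase= take a comma-separated list of optimization phase numbers and begin:end ranges, where "first" stands for phase 0 and "last" for LAST_OPT_NUMBER (1000); --debug-phase dumps the LLVM IR after the selected phases, --off-phase skips them, and (with LLVM 3.4) --debug-ir= emits DebugIR after a single phase. A standalone sketch of how that range syntax expands, written as a simplified re-implementation for illustration rather than the parser added in main.cpp:

#include <cstdio>
#include <cstdlib>
#include <set>
#include <sstream>
#include <string>

static const int kLastOptNumber = 1000;   // mirrors LAST_OPT_NUMBER in ispc.h

static int PhaseNumber(const std::string &s) {
    if (s == "first") return 0;
    if (s == "last")  return kLastOptNumber;
    return std::atoi(s.c_str());
}

// Expand "first,210:220,300" into the set {0, 210..220, 300}.
static std::set<int> ParsePhases(const std::string &arg) {
    std::set<int> phases;
    std::stringstream ss(arg);
    std::string item;
    while (std::getline(ss, item, ',')) {
        size_t colon = item.find(':');
        int begin = PhaseNumber(item.substr(0, colon));
        int end = (colon == std::string::npos)
                      ? begin : PhaseNumber(item.substr(colon + 1));
        for (int i = begin; i <= end; ++i)
            phases.insert(i);
    }
    return phases;
}

int main() {
    std::set<int> p = ParsePhases("first,210:220,300");
    std::printf("selected %zu phases\n", p.size());   // 13 phases
    return 0;
}

The phase numbers themselves come from the explicit stage argument that DebugPassManager::add() takes in opt.cpp (0 for the initial verifier, then 100, 200, 210, and so on, up to LAST_OPT_NUMBER for the final verifier pass).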