diff --git a/.gitignore b/.gitignore index 0469cf7d..429199bb 100644 --- a/.gitignore +++ b/.gitignore @@ -3,13 +3,20 @@ depend ispc ispc_test +ispc_ref objs docs/doxygen docs/*.html tests*/*cpp tests*/*run +logs/ +notify_log.log +alloy_results_* examples/*/*.png examples/*/*.ppm examples/*/objs/* +examples/*/ref +examples/*/test +*.swp diff --git a/Makefile b/Makefile index fab66b58..aba1cdd4 100644 --- a/Makefile +++ b/Makefile @@ -39,6 +39,10 @@ LLVM_CONFIG=$(shell which /usr/local/llvm-3.3/bin/llvm-config) CLANG_INCLUDE=$(shell $(LLVM_CONFIG) --includedir) +# Enable ARM by request +# To enable: make ARM_ENABLED=1 +ARM_ENABLED=0 + # Add llvm bin to the path so any scripts run will go to the right llvm-config LLVM_BIN= $(shell $(LLVM_CONFIG) --bindir) export PATH:=$(LLVM_BIN):$(PATH) @@ -55,12 +59,15 @@ LLVM_CXXFLAGS=$(shell $(LLVM_CONFIG) --cppflags) LLVM_VERSION=LLVM_$(shell $(LLVM_CONFIG) --version | sed -e s/\\./_/ -e s/svn//) LLVM_VERSION_DEF=-D$(LLVM_VERSION) -LLVM_COMPONENTS = engine ipo bitreader bitwriter instrumentation linker arm nvptx +LLVM_COMPONENTS = engine ipo bitreader bitwriter instrumentation linker nvptx # Component "option" was introduced in 3.3 and starting with 3.4 it is required for the link step. # We check if it's available before adding it (to not break 3.2 and earlier). ifeq ($(shell $(LLVM_CONFIG) --components |grep -c option), 1) LLVM_COMPONENTS+=option endif +ifneq ($(ARM_ENABLED), 0) + LLVM_COMPONENTS+=arm +endif LLVM_LIBS=$(shell $(LLVM_CONFIG) --libs $(LLVM_COMPONENTS)) CLANG=clang @@ -72,6 +79,10 @@ CLANG_LIBS = -lclangFrontend -lclangDriver \ ISPC_LIBS=$(shell $(LLVM_CONFIG) --ldflags) $(CLANG_LIBS) $(LLVM_LIBS) \ -lpthread +ifeq ($(LLVM_VERSION),LLVM_3_4) + ISPC_LIBS += -lcurses +endif + ifeq ($(ARCH_OS),Linux) ISPC_LIBS += -ldl endif @@ -102,8 +113,16 @@ CXX=g++ CPP=cpp OPT=-O2 CXXFLAGS=$(OPT) $(LLVM_CXXFLAGS) -I. -Iobjs/ -I$(CLANG_INCLUDE) \ - -Wall $(LLVM_VERSION_DEF) \ - -DBUILD_DATE="\"$(BUILD_DATE)\"" -DBUILD_VERSION="\"$(BUILD_VERSION)\"" + $(LLVM_VERSION_DEF) \ + -Wall \ + -DBUILD_DATE="\"$(BUILD_DATE)\"" -DBUILD_VERSION="\"$(BUILD_VERSION)\"" \ + -Wno-sign-compare +ifneq ($(LLVM_VERSION),LLVM_3_1) + CXXFLAGS+=-Werror +endif +ifneq ($(ARM_ENABLED), 0) + CXXFLAGS+=-DISPC_ARM_ENABLED +endif LDFLAGS= ifeq ($(ARCH_OS),Linux) @@ -122,8 +141,12 @@ CXX_SRC=ast.cpp builtins.cpp cbackend.cpp ctx.cpp decl.cpp expr.cpp func.cpp \ type.cpp util.cpp HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \ opt.h stmt.h sym.h type.h util.h -TARGETS=nvptx64 neon avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 sse2 sse2-x2 sse4 sse4-x2 \ +TARGETS=nvptx64 avx1-i64x4 avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 \ + sse2 sse2-x2 sse4-8 sse4-16 sse4 sse4-x2 \ generic-4 generic-8 generic-16 generic-32 generic-64 generic-1 +ifneq ($(ARM_ENABLED), 0) + TARGETS+=neon-32 neon-16 neon-8 +endif # These files need to be compiled in two versions - 32 and 64 bits. BUILTINS_SRC_TARGET=$(addprefix builtins/target-, $(addsuffix .ll, $(TARGETS))) # These are files to be compiled in single version. 
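The new ARM_ENABLED switch is off by default: only an invocation such as "make ARM_ENABLED=1" pulls the LLVM "arm" component into LLVM_COMPONENTS, appends the neon-32/neon-16/neon-8 targets, and defines ISPC_ARM_ENABLED for the C++ sources. A minimal sketch of the guard pattern this enables, assuming that invocation (the Target::NEON* names referenced in the comments are taken from the builtins.cpp hunks later in this diff, not reproduced exactly):

    // Sketch only: this translation unit sees -DISPC_ARM_ENABLED exactly when
    // the Makefile was invoked with ARM_ENABLED=1.
    #ifdef ISPC_ARM_ENABLED
        // NEON-specific target handling (Target::NEON8/NEON16/NEON32) is built in.
    #else
        // ARM support is compiled out; the LLVM "arm" component and the
        // neon-8/neon-16/neon-32 builtins are not part of this build.
    #endif
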
@@ -132,12 +155,12 @@ BUILTINS_OBJS_32=$(addprefix builtins-, $(notdir $(BUILTINS_SRC_TARGET:.ll=-32bi BUILTINS_OBJS_64=$(addprefix builtins-, $(notdir $(BUILTINS_SRC_TARGET:.ll=-64bit.o))) BUILTINS_OBJS=$(addprefix builtins-, $(notdir $(BUILTINS_SRC_COMMON:.ll=.o))) \ $(BUILTINS_OBJS_32) $(BUILTINS_OBJS_64) \ - builtins-c-32.cpp builtins-c-64.cpp + builtins-c-32.cpp builtins-c-64.cpp BISON_SRC=parse.yy FLEX_SRC=lex.ll OBJS=$(addprefix objs/, $(CXX_SRC:.cpp=.o) $(BUILTINS_OBJS) \ - stdlib_generic_ispc.o stdlib_x86_ispc.o \ + stdlib_mask1_ispc.o stdlib_mask8_ispc.o stdlib_mask16_ispc.o stdlib_mask32_ispc.o stdlib_mask64_ispc.o \ $(BISON_SRC:.yy=.o) $(FLEX_SRC:.ll=.o)) default: ispc @@ -223,15 +246,15 @@ objs/lex.o: objs/lex.cpp $(HEADERS) objs/parse.cc @echo Compiling $< $(CXX) $(CXXFLAGS) -o $@ -c $< -objs/builtins-dispatch.cpp: builtins/dispatch.ll builtins/util.m4 $(wildcard builtins/*common.ll) +objs/builtins-dispatch.cpp: builtins/dispatch.ll builtins/util.m4 builtins/svml.m4 $(wildcard builtins/*common.ll) @echo Creating C++ source from builtins definition file $< m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) -DBUILD_OS=UNIX $< | python bitcode2cpp.py $< > $@ -objs/builtins-%-32bit.cpp: builtins/%.ll builtins/util.m4 $(wildcard builtins/*common.ll) +objs/builtins-%-32bit.cpp: builtins/%.ll builtins/util.m4 builtins/svml.m4 $(wildcard builtins/*common.ll) @echo Creating C++ source from builtins definition file $< \(32 bit version\) m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) -DBUILD_OS=UNIX -DRUNTIME=32 $< | python bitcode2cpp.py $< 32bit > $@ -objs/builtins-%-64bit.cpp: builtins/%.ll builtins/util.m4 $(wildcard builtins/*common.ll) +objs/builtins-%-64bit.cpp: builtins/%.ll builtins/util.m4 builtins/svml.m4 $(wildcard builtins/*common.ll) @echo Creating C++ source from builtins definition file $< \(64 bit version\) m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) -DBUILD_OS=UNIX -DRUNTIME=64 $< | python bitcode2cpp.py $< 64bit > $@ @@ -243,12 +266,27 @@ objs/builtins-c-64.cpp: builtins/builtins.c @echo Creating C++ source from builtins definition file $< $(CLANG) -m64 -emit-llvm -c $< -o - | llvm-dis - | python bitcode2cpp.py c 64 > $@ -objs/stdlib_generic_ispc.cpp: stdlib.ispc - @echo Creating C++ source from $< for generic - $(CLANG) -E -x c -DISPC_TARGET_GENERIC=1 -DISPC=1 -DPI=3.1415926536 $< -o - | \ - python stdlib2cpp.py generic > $@ +objs/stdlib_mask1_ispc.cpp: stdlib.ispc + @echo Creating C++ source from $< for mask1 + $(CLANG) -E -x c -DISPC_MASK_BITS=1 -DISPC=1 -DPI=3.14159265358979 $< -o - | \ + python stdlib2cpp.py mask1 > $@ -objs/stdlib_x86_ispc.cpp: stdlib.ispc - @echo Creating C++ source from $< for x86 - $(CLANG) -E -x c -DISPC=1 -DPI=3.1415926536 $< -o - | \ - python stdlib2cpp.py x86 > $@ +objs/stdlib_mask8_ispc.cpp: stdlib.ispc + @echo Creating C++ source from $< for mask8 + $(CLANG) -E -x c -DISPC_MASK_BITS=8 -DISPC=1 -DPI=3.14159265358979 $< -o - | \ + python stdlib2cpp.py mask8 > $@ + +objs/stdlib_mask16_ispc.cpp: stdlib.ispc + @echo Creating C++ source from $< for mask16 + $(CLANG) -E -x c -DISPC_MASK_BITS=16 -DISPC=1 -DPI=3.14159265358979 $< -o - | \ + python stdlib2cpp.py mask16 > $@ + +objs/stdlib_mask32_ispc.cpp: stdlib.ispc + @echo Creating C++ source from $< for mask32 + $(CLANG) -E -x c -DISPC_MASK_BITS=32 -DISPC=1 -DPI=3.14159265358979 $< -o - | \ + python stdlib2cpp.py mask32 > $@ + +objs/stdlib_mask64_ispc.cpp: stdlib.ispc + @echo Creating C++ source from $< for mask64 + $(CLANG) -E -x c -DISPC_MASK_BITS=64 -DISPC=1 -DPI=3.14159265358979 $< -o - | \ + 
python stdlib2cpp.py mask64 > $@ diff --git a/alloy.py b/alloy.py new file mode 100755 index 00000000..cda51d70 --- /dev/null +++ b/alloy.py @@ -0,0 +1,660 @@ +#!/usr/bin/python +# +# Copyright (c) 2013, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +# IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +# OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# // Author: Filippov Ilia + +def attach_mail_file(msg, filename, name): + if os.path.exists(filename): + fp = open(filename, "rb") + to_attach = MIMEBase("application", "octet-stream") + to_attach.set_payload(fp.read()) + encode_base64(to_attach) + to_attach.add_header("Content-Disposition", "attachment", filename=name) + fp.close() + msg.attach(to_attach) + +def setting_paths(llvm, ispc, sde): + if llvm != "": + os.environ["LLVM_HOME"]=llvm + if ispc != "": + os.environ["ISPC_HOME"]=ispc + if sde != "": + os.environ["SDE_HOME"]=sde + +def check_LLVM(which_LLVM): + answer = [] + if which_LLVM[0] == " ": + return answer + p = os.environ["LLVM_HOME"] + for i in range(0,len(which_LLVM)): + if not os.path.exists(p + os.sep + "bin-" + which_LLVM[i] + os.sep + "bin"): + answer.append(which_LLVM[i]) + return answer + +def try_do_LLVM(text, command, from_validation): + if from_validation == True: + text = text + "\n" + print_debug("Trying to " + text, from_validation, alloy_build) + if os.system(command + " >> " + alloy_build + " 2>> " + alloy_build) != 0: + print_debug("ERROR.\n", from_validation, alloy_build) + error("can't " + text, 1) + print_debug("DONE.\n", from_validation, alloy_build) + +def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, from_validation, force, make): + print_debug("Building LLVM. Version: " + version_LLVM + ". 
", from_validation, alloy_build) + if revision != "": + print_debug("Revision: " + revision + ".\n", from_validation, alloy_build) + else: + print_debug("\n", from_validation, alloy_build) + # Here we understand what and where do we want to build + current_path = os.getcwd() + llvm_home = os.environ["LLVM_HOME"] + os.chdir(llvm_home) + FOLDER_NAME=version_LLVM + if version_LLVM == "trunk": + SVN_PATH="trunk" + if version_LLVM == "3.3": + SVN_PATH="tags/RELEASE_33/final" + version_LLVM = "3_3" + if version_LLVM == "3.2": + SVN_PATH="tags/RELEASE_32/final" + version_LLVM = "3_2" + if version_LLVM == "3.1": + SVN_PATH="tags/RELEASE_31/final" + version_LLVM = "3_1" + if revision != "": + FOLDER_NAME = FOLDER_NAME + "_" + revision + revision = "-" + revision + if folder == "": + folder = FOLDER_NAME + LLVM_SRC="llvm-" + folder + LLVM_BUILD="build-" + folder + LLVM_BIN="bin-" + folder + if os.path.exists(LLVM_BIN + os.sep + "bin") and not force: + error("you have folder " + LLVM_BIN + ".\nIf you want to rebuild use --force", 1) + LLVM_BUILD_selfbuild = LLVM_BUILD + "_temp" + LLVM_BIN_selfbuild = LLVM_BIN + "_temp" + common.remove_if_exists(LLVM_SRC) + common.remove_if_exists(LLVM_BUILD) + common.remove_if_exists(LLVM_BIN) + if selfbuild: + common.remove_if_exists(LLVM_BUILD_selfbuild) + common.remove_if_exists(LLVM_BIN_selfbuild) + print_debug("Using folders: " + LLVM_SRC + " " + LLVM_BUILD + " " + LLVM_BIN + " in " + + llvm_home + "\n", from_validation, alloy_build) + # load llvm + if tarball == "": + try_do_LLVM("load LLVM from http://llvm.org/svn/llvm-project/llvm/" + SVN_PATH + " ", + "svn co " + revision + " http://llvm.org/svn/llvm-project/llvm/" + SVN_PATH + " " + LLVM_SRC, + from_validation) + os.chdir(LLVM_SRC + "/tools") + try_do_LLVM("load clang from http://llvm.org/svn/llvm-project/cfe/" + SVN_PATH + " ", + "svn co " + revision + " http://llvm.org/svn/llvm-project/cfe/" + SVN_PATH + " clang", + from_validation) + os.chdir("../") + else: + tar = tarball.split(" ") + os.makedirs(LLVM_SRC) + os.chdir(LLVM_SRC) + try_do_LLVM("untar LLVM from " + tar[0] + " ", + "tar -xvzf " + tar[0] + " --strip-components 1", from_validation) + os.chdir("./tools") + os.makedirs("clang") + os.chdir("./clang") + try_do_LLVM("untar clang from " + tar[1] + " ", + "tar -xvzf " + tar[1] + " --strip-components 1", from_validation) + os.chdir("../../") + # paching llvm + patches = glob.glob(os.environ["ISPC_HOME"] + "/llvm_patches/*.*") + for patch in patches: + if version_LLVM in os.path.basename(patch): + try_do_LLVM("patch LLVM with patch" + patch + " ", "patch -p0 < " + patch, from_validation) + os.chdir("../") + # configuring llvm, build first part of selfbuild + os.makedirs(LLVM_BUILD) + os.makedirs(LLVM_BIN) + selfbuild_compiler = "" + if selfbuild: + print_debug("Making selfbuild and use folders " + LLVM_BUILD_selfbuild + " and " + + LLVM_BIN_selfbuild + "\n", from_validation, alloy_build) + os.makedirs(LLVM_BUILD_selfbuild) + os.makedirs(LLVM_BIN_selfbuild) + os.chdir(LLVM_BUILD_selfbuild) + try_do_LLVM("configure release version for selfbuild ", + "../" + LLVM_SRC + "/configure --prefix=" + llvm_home + "/" + + LLVM_BIN_selfbuild + " --enable-optimized", + from_validation) + try_do_LLVM("build release version for selfbuild ", + make, from_validation) + try_do_LLVM("install release version for selfbuild ", + "make install", + from_validation) + os.chdir("../") + selfbuild_compiler = " CC="+llvm_home+ "/" + LLVM_BIN_selfbuild + "/bin/clang" + print_debug("Now we have compiler for selfbuild: " + 
selfbuild_compiler + "\n", from_validation, alloy_build) + os.chdir(LLVM_BUILD) + if debug == False: + try_do_LLVM("configure release version ", + "../" + LLVM_SRC + "/configure --prefix=" + llvm_home + "/" + + LLVM_BIN + " --enable-optimized" + selfbuild_compiler, + from_validation) + else: + try_do_LLVM("configure debug version ", + "../" + LLVM_SRC + "/configure --prefix=" + llvm_home + "/" + LLVM_BIN + + " --enable-debug-runtime --enable-debug-symbols --enable-keep-symbols" + selfbuild_compiler, + from_validation) + # building llvm + try_do_LLVM("build LLVM ", make, from_validation) + try_do_LLVM("install LLVM ", "make install", from_validation) + os.chdir(current_path) + +def check_targets(): + answer = [] + answer_sde = [] + SSE2 = False; + SSE4 = False; + AVX = False; + AVX11 = False; + AVX2 = False; + if current_OS == "Linux": + cpu = open("/proc/cpuinfo") + f_lines = cpu.readlines() + cpu.close() + # check what native targets do we have + for i in range(0,len(f_lines)): + if SSE2 == False and "sse2" in f_lines[i]: + SSE2 = True; + answer = answer + ["sse2-i32x4", "sse2-i32x8"] + if SSE4 == False and "sse4_1" in f_lines[i]: + SSE4 = True; + answer = answer + ["sse4-i32x4", "sse4-i32x8", "sse4-i16x8", "sse4-i8x16"] + if AVX == False and "avx" in f_lines[i]: + AVX = True; + answer = answer + ["avx1-i32x8", "avx1-i32x16", "avx1-i64x4"] + if AVX11 == False and "rdrand" in f_lines[i]: + AVX11 = True; + answer = answer + ["avx1.1-i32x8", "avx1.1-i32x16"] + if AVX2 == False and "avx2" in f_lines[i]: + AVX2 = True; + answer = answer + ["avx2-i32x8", "avx2-i32x16"] + if current_OS == "MacOS": + f_lines = take_lines("sysctl machdep.cpu.features", "first") + if "SSE2" in f_lines: + SSE2 = True; + answer = answer + ["sse2-i32x4", "sse2-i32x8"] + if "SSE4.1" in f_lines: + SSE4 = True; + answer = answer + ["sse4-i32x4", "sse4-i32x8", "sse4-i16x8", "sse4-i8x16"] + if "AVX1.0" in f_lines: + AVX = True; + answer = answer + ["avx1-i32x8", "avx1-i32x16", "avx1-i64x4"] + if "RDRAND" in f_lines: + AVX11 = True; + answer = answer + ["avx1.1-i32x8", "avx1.1-i32x16"] + if "AVX2.0" in f_lines: + AVX2 = True; + answer = answer + ["avx2-i32x8", "avx2-i32x16"] + + answer = answer + ["generic-4", "generic-16", "generic-8", "generic-1", "generic-32", "generic-64"] + # now check what targets we have with the help of SDE + sde_exists = "" + PATH_dir = string.split(os.getenv("PATH"), os.pathsep) + for counter in PATH_dir: + if os.path.exists(counter + os.sep + "sde") and sde_exists == "": + sde_exists = counter + os.sep + "sde" + if os.environ.get("SDE_HOME") != None: + if os.path.exists(os.environ.get("SDE_HOME") + os.sep + "sde"): + sde_exists = os.environ.get("SDE_HOME") + os.sep + "sde" + if sde_exists == "": + error("you haven't got sde neither in SDE_HOME nor in your PATH.\n" + + "To test all platforms please set SDE_HOME to path containing SDE.\n" + + "Please refer to http://www.intel.com/software/sde for SDE download information.", 2) + return [answer, answer_sde] + # here we have SDE + f_lines = take_lines(sde_exists + " -help", "all") + for i in range(0,len(f_lines)): + if SSE4 == False and "wsm" in f_lines[i]: + answer_sde = answer_sde + [["-wsm", "sse4-i32x4"], ["-wsm", "sse4-i32x8"], ["-wsm", "sse4-i16x8"], ["-wsm", "sse4-i8x16"]] + if AVX == False and "snb" in f_lines[i]: + answer_sde = answer_sde + [["-snb", "avx1-i32x8"], ["-snb", "avx1-i32x16"], ["-snb", "avx1-i64x4"]] + if AVX11 == False and "ivb" in f_lines[i]: + answer_sde = answer_sde + [["-ivb", "avx1.1-i32x8"], ["-ivb", "avx1.1-i32x16"]] + 
if AVX2 == False and "hsw" in f_lines[i]: + answer_sde = answer_sde + [["-hsw", "avx2-i32x8"], ["-hsw", "avx2-i32x16"]] + return [answer, answer_sde] + +def build_ispc(version_LLVM, make): + current_path = os.getcwd() + os.chdir(os.environ["ISPC_HOME"]) + p_temp = os.getenv("PATH") + os.environ["PATH"] = os.environ["LLVM_HOME"] + "/bin-" + version_LLVM + "/bin:" + os.environ["PATH"] + try_do_LLVM("clean ISPC for building", "make clean", True) + try_do_LLVM("build ISPC with LLVM version " + version_LLVM + " ", make, True) + os.environ["PATH"] = p_temp + os.chdir(current_path) + +def execute_stability(stability, R, print_version): + stability1 = copy.deepcopy(stability) + temp = run_tests.run_tests(stability1, [], print_version) + for j in range(0,4): + R[j][0] = R[j][0] + temp[j] + for i in range(0,len(temp[j])): + R[j][1].append(temp[4]) + number_of_fails = temp[5] + number_of_new_fails = len(temp[0]) + len(temp[1]) + if number_of_fails == 0: + str_fails = ". No fails" + else: + str_fails = ". Fails: " + str(number_of_fails) + if number_of_new_fails == 0: + str_new_fails = ", No new fails.\n" + else: + str_new_fails = ", New fails: " + str(number_of_new_fails) + ".\n" + print_debug(temp[4][1:-3] + str_fails + str_new_fails, False, stability_log) + +def run_special_tests(): + i = 5 + +def validation_run(only, only_targets, reference_branch, number, notify, update, make): + os.chdir(os.environ["ISPC_HOME"]) + os.environ["PATH"] = os.environ["ISPC_HOME"] + ":" + os.environ["PATH"] + if options.notify != "": + common.remove_if_exists(os.environ["ISPC_HOME"] + os.sep + "notify_log.log") + smtp_server = os.environ["SMTP_ISPC"] + msg = MIMEMultipart() + msg['Subject'] = 'ISPC test system results' + msg['From'] = 'ISPC_test_system' + msg['To'] = options.notify + print_debug("Command: " + ' '.join(sys.argv) + "\n", False, "") + print_debug("Folder: " + os.environ["ISPC_HOME"] + "\n", False, "") + date = datetime.datetime.now() + print_debug("Date: " + date.strftime('%H:%M %d/%m/%Y') + "\n", False, "") + class options_for_drivers: + pass +# *** *** *** +# Stability validation run +# *** *** *** + if ((("stability" in only) == True) or ("performance" in only) == False): + print_debug("\n\nStability validation run\n\n", False, "") + stability = options_for_drivers() +# stability constant options + stability.random = False + stability.ispc_flags = "" + stability.compiler_exe = None + stability.num_jobs = 1024 + stability.verbose = False + stability.time = False + stability.non_interactive = True + stability.update = update + stability.include_file = None + stability.silent = True + stability.in_file = "." 
+ os.sep + f_date + os.sep + "run_tests_log.log" + stability.verify = False +# stability varying options + stability.target = "" + stability.arch = "" + stability.no_opt = False + stability.wrapexe = "" +# prepare parameters of run + [targets_t, sde_targets_t] = check_targets() + rebuild = True + opts = [] + archs = [] + LLVM = [] + targets = [] + sde_targets = [] +# parsing option only, update parameters of run + if "-O2" in only: + opts.append(False) + if "-O0" in only: + opts.append(True) + if "x86" in only and not ("x86-64" in only): + archs.append("x86") + if "x86-64" in only: + archs.append("x86-64") + if "native" in only: + sde_targets_t = [] + for i in ["3.1", "3.2", "3.3", "trunk"]: + if i in only: + LLVM.append(i) + if "current" in only: + LLVM = [" "] + rebuild = False + else: + common.check_tools(1) + if only_targets != "": + only_targets += " " + only_targets = only_targets.replace("generic "," generic-4 generic-16 ") + only_targets_t = only_targets.split(" ") + for i in only_targets_t: + if i == "": + continue + err = True + for j in range(0,len(targets_t)): + if i in targets_t[j]: + targets.append(targets_t[j]) + err = False + for j in range(0,len(sde_targets_t)): + if i in sde_targets_t[j][1]: + sde_targets.append(sde_targets_t[j]) + err = False + if err == True: + error("You haven't sde for target " + i, 1) + else: + targets = targets_t[:-4] + sde_targets = sde_targets_t + if "build" in only: + targets = [] + sde_targets = [] + only = only + " stability " +# finish parameters of run, prepare LLVM + if len(opts) == 0: + opts = [False] + if len(archs) == 0: + archs = ["x86", "x86-64"] + if len(LLVM) == 0: + LLVM = ["3.3", "trunk"] + gen_archs = ["x86-64"] + need_LLVM = check_LLVM(LLVM) + for i in range(0,len(need_LLVM)): + build_LLVM(need_LLVM[i], "", "", "", False, False, True, False, make) +# begin validation run for stabitily + common.remove_if_exists(stability.in_file) + R = [[[],[]],[[],[]],[[],[]],[[],[]]] + print_debug("\n_________________________STABILITY REPORT_________________________\n", False, stability_log) + for i in range(0,len(LLVM)): + print_version = 2 + if rebuild: + build_ispc(LLVM[i], make) + for j in range(0,len(targets)): + stability.target = targets[j] + stability.wrapexe = "" + if "generic" in targets[j]: + arch = gen_archs + else: + arch = archs + for i1 in range(0,len(arch)): + for i2 in range(0,len(opts)): + stability.arch = arch[i1] + stability.no_opt = opts[i2] + execute_stability(stability, R, print_version) + print_version = 0 + for j in range(0,len(sde_targets)): + stability.target = sde_targets[j][1] + stability.wrapexe = os.environ["SDE_HOME"] + "/sde " + sde_targets[j][0] + " -- " + for i1 in range(0,len(archs)): + for i2 in range(0,len(opts)): + stability.arch = archs[i1] + stability.no_opt = opts[i2] + execute_stability(stability, R, print_version) + print_version = 0 +# run special tests like embree +# + run_special_tests() + ttt = ["NEW RUNFAILS: ", "NEW COMPFAILS: ", "NEW PASSES RUNFAILS: ", "NEW PASSES COMPFAILS: "] + for j in range(0,4): + if len(R[j][0]) == 0: + print_debug("NO " + ttt[j][:-2] + "\n", False, stability_log) + else: + print_debug(ttt[j] + str(len(R[j][0])) + "\n", False, stability_log) + temp5 = [[],[]] + for i in range(0,len(R[j][0])): + er = True + for k in range(0,len(temp5[0])): + if R[j][0][i] == temp5[0][k]: + temp5[1][k].append(R[j][1][i]) + er = False + if er == True: + temp5[0].append(R[j][0][i]) + temp5[1].append([R[j][1][i]]) + for i in range(0,len(temp5[0])): + print_debug("\t" + temp5[0][i] + "\n", True, 
stability_log) + for k in range(0,len(temp5[1][i])): + print_debug("\t\t\t" + temp5[1][i][k], True, stability_log) + print_debug("__________________Watch stability.log for details_________________\n", False, stability_log) + if options.notify != "": + attach_mail_file(msg, stability.in_file, "run_tests_log.log") + attach_mail_file(msg, stability_log, "stability.log") + +# *** *** *** +# Performance validation run +# *** *** *** + if ((("performance" in only) == True) or ("stability" in only) == False): + print_debug("\n\nPerformance validation run\n\n", False, "") + common.check_tools(1) + performance = options_for_drivers() +# performance constant options + performance.number = number + performance.config = "./perf.ini" + performance.path = "./" + performance.silent = True + performance.output = "" + performance.compiler = "" + performance.ref = "ispc_ref" + performance.in_file = "." + os.sep + f_date + os.sep + "performance.log" +# prepare LLVM 3.3 as newest LLVM + need_LLVM = check_LLVM(["3.3"]) + if len(need_LLVM) != 0: + build_LLVM(need_LLVM[i], "", "", "", False, False, True, False, make) +# prepare reference point. build both test and reference compilers + try_do_LLVM("apply git", "git branch", True) + temp4 = take_lines("git branch", "all") + for line in temp4: + if "*" in line: + current_branch = line[2:-1] + stashing = True + sys.stdout.write("Please, don't interrupt script here! You can have not sync git status after interruption!\n") + if "No local changes" in take_lines("git stash", "first"): + stashing = False + #try_do_LLVM("stash current branch ", "git stash", True) + try_do_LLVM("checkout reference branch " + reference_branch + " ", "git checkout " + reference_branch, True) + sys.stdout.write(".\n") + build_ispc("3.3", make) + sys.stdout.write(".\n") + os.rename("ispc", "ispc_ref") + try_do_LLVM("checkout test branch " + current_branch + " ", "git checkout " + current_branch, True) + if stashing: + try_do_LLVM("return current branch ", "git stash pop", True) + sys.stdout.write("You can interrupt script now.\n") + build_ispc("3.3", make) +# begin validation run for performance. output is inserted into perf() + perf.perf(performance, []) + if options.notify != "": + attach_mail_file(msg, performance.in_file, "performance.log") + attach_mail_file(msg, "." 
+ os.sep + "logs" + os.sep + "perf_build.log", "perf_build.log") + +# sending e-mail with results + if options.notify != "": + fp = open(os.environ["ISPC_HOME"] + os.sep + "notify_log.log", 'rb') + f_lines = fp.readlines() + fp.close() + line = "" + for i in range(0,len(f_lines)): + line = line + f_lines[i][:-1] + line = line + ' \n' + text = MIMEText(line, "", "KOI-8") + msg.attach(text) + attach_mail_file(msg, alloy_build, "alloy_build.log") + s = smtplib.SMTP(smtp_server) + s.sendmail('ISPC_test_system', options.notify, msg.as_string()) + s.quit() + +def Main(): + global current_OS + if (platform.system() == 'Windows' or 'CYGWIN_NT' in platform.system()) == True: + current_OS = "Windows" + error("Windows isn't supported now", 1) + else: + if (platform.system() == 'Darwin'): + current_OS = "MacOS" + else: + current_OS = "Linux" + + if (options.build_llvm == False and options.validation_run == False): + parser.print_help() + exit(0) + + setting_paths(options.llvm_home, options.ispc_home, options.sde_home) + if os.environ.get("LLVM_HOME") == None: + error("you have no LLVM_HOME", 1) + if os.environ.get("ISPC_HOME") == None: + error("you have no ISPC_HOME", 1) + if options.notify != "": + if os.environ.get("SMTP_ISPC") == None: + error("you have no SMTP_ISPC in your environment for option notify", 1) + if options.only != "": + test_only_r = " 3.1 3.2 3.3 trunk current build stability performance x86 x86-64 -O0 -O2 native " + test_only = options.only.split(" ") + for iterator in test_only: + if not (" " + iterator + " " in test_only_r): + error("unknow option for only: " + iterator, 1) + + global f_date + f_date = "logs" + common.remove_if_exists(f_date) + os.makedirs(f_date) + global alloy_build + alloy_build = os.getcwd() + os.sep + f_date + os.sep + "alloy_build.log" + global stability_log + stability_log = os.getcwd() + os.sep + f_date + os.sep + "stability.log" + current_path = os.getcwd() + make = "make -j" + options.speed + try: + if options.build_llvm: + build_LLVM(options.version, options.revision, options.folder, options.tarball, + options.debug, options.selfbuild, False, options.force, make) + if options.validation_run: + validation_run(options.only, options.only_targets, options.branch, + options.number_for_performance, options.notify, options.update, make) + finally: + os.chdir(current_path) + date_name = "alloy_results_" + datetime.datetime.now().strftime('%d_%m_%Y_%H_%M_%S') + if os.path.exists(date_name): + error("It's forbidden to run alloy two times in a second, logs are in ./logs", 1) + os.rename(f_date, date_name) + print_debug("Logs are in " + date_name + "\n", False, "") + +###Main### +from optparse import OptionParser +from optparse import OptionGroup +import sys +import os +import operator +import time +import glob +import string +import platform +import smtplib +import datetime +import copy +from email.MIMEMultipart import MIMEMultipart +from email.MIMEBase import MIMEBase +from email.mime.text import MIMEText +from email.Encoders import encode_base64 +# our drivers +import run_tests +import perf +import common +error = common.error +take_lines = common.take_lines +print_debug = common.print_debug +# parsing options +class MyParser(OptionParser): + def format_epilog(self, formatter): + return self.epilog +examples = ("Examples:\n" + +"Load and build LLVM from trunk\n\talloy.py -b\n" + +"Load and build LLVM 3.3. 
Rewrite LLVM folders\n\talloy.py -b --version=3.3 --force\n" + +"Untar files llvm.tgz clang.tgz, build LLVM from them in folder bin-from_tar\n\talloy.py -b --tarball='llvm.tgz clang.tgz' --folder=from_tar\n" + +"Load LLVM from trunk, revision r172870. Build it. Do selfbuild\n\talloy.py -b --revision=r172870 --selfbuild\n" + +"Validation run with LLVM 3.3, trunk; x86, x86-64; -O2;\nall supported targets; performance\n\talloy.py -r\n" + +"Validation run with all avx targets and sse4-i8x16 without performance\n\talloy.py -r --only=stability --only-targets='avx sse4-i8x16'\n" + +"Validation run with avx2-i32x8, all sse4 and sse2 targets\nand all targets with i32x16\n\talloy.py -r --only-targets='avx2-i32x8 sse4 i32x16 sse2'\n" + +"Stability validation run with LLVM 3.2, 3.3; -O0; x86,\nupdate fail_db.txt with passes and fails\n\talloy.py -r --only='3.2 -O0 stability 3.3 x86' --update-errors=FP\n" + +"Try to build compiler with all LLVM\n\talloy.py -r --only=build\n" + +"Performance validation run with 10 runs of each test and comparing to branch 'old'\n\talloy.py -r --only=performance --compare-with=old --number=10\n" + +"Validation run. Update fail_db.txt with new fails, send results to my@my.com\n\talloy.py -r --update-errors=F --notify='my@my.com'\n") +parser = MyParser(usage="Usage: alloy.py -r/-b [options]", epilog=examples) +parser.add_option('-b', '--build-llvm', dest='build_llvm', + help='ask to build LLVM', default=False, action="store_true") +parser.add_option('-r', '--run', dest='validation_run', + help='ask for validation run', default=False, action="store_true") +parser.add_option('-j', dest='speed', + help='set -j for make', default="8") +# options for activity "build LLVM" +llvm_group = OptionGroup(parser, "Options for building LLVM", + "These options must be used with -b option.") +llvm_group.add_option('--version', dest='version', + help='version of llvm to build: 3.1 3.2 3.3 trunk. Default: trunk', default="trunk") +llvm_group.add_option('--revision', dest='revision', + help='revision of llvm to build in format r172870', default="") +llvm_group.add_option('--debug', dest='debug', + help='debug build of LLVM?', default=False, action="store_true") +llvm_group.add_option('--folder', dest='folder', + help='folder to build LLVM in', default="") +llvm_group.add_option('--tarball', dest='tarball', + help='"llvm_tarball clang_tarball"', default="") +llvm_group.add_option('--selfbuild', dest='selfbuild', + help='make selfbuild of LLVM and clang', default=False, action="store_true") +llvm_group.add_option('--force', dest='force', + help='rebuild LLVM', default=False, action='store_true') +parser.add_option_group(llvm_group) +# options for activity "validation run" +run_group = OptionGroup(parser, "Options for validation run", + "These options must be used with -r option.") +run_group.add_option('--compare-with', dest='branch', + help='set performance reference point. Dafault: master', default="master") +run_group.add_option('--number', dest='number_for_performance', + help='number of performance runs for each test. Default: 5', default=5) +run_group.add_option('--notify', dest='notify', + help='email to sent results to', default="") +run_group.add_option('--update-errors', dest='update', + help='rewrite fail_db.txt file according to received results (F or FP)', default="") +run_group.add_option('--only-targets', dest='only_targets', + help='set list of targets to test. 
Possible values - all subnames of targets.', + default="") +run_group.add_option('--only', dest='only', + help='set types of tests. Possible values:\n' + + '-O0, -O2, x86, x86-64, stability (test only stability), performance (test only performance)\n' + + 'build (only build with different LLVM), 3.1, 3.2, 3.3, trunk, native (do not use SDE), current (do not rebuild ISPC).', + default="") +parser.add_option_group(run_group) +# options for activity "setup PATHS" +setup_group = OptionGroup(parser, "Options for setup", + "These options must be use with -r or -b to setup environment variables") +setup_group.add_option('--llvm_home', dest='llvm_home',help='path to LLVM',default="") +setup_group.add_option('--ispc_home', dest='ispc_home',help='path to ISPC',default="") +setup_group.add_option('--sde_home', dest='sde_home',help='path to SDE',default="") +parser.add_option_group(setup_group) +(options, args) = parser.parse_args() +Main() diff --git a/builtins.cpp b/builtins.cpp index 4b91ba30..48ce9afb 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -112,10 +112,7 @@ lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) { return intAsUnsigned ? AtomicType::UniformUInt64 : AtomicType::UniformInt64; // varying - if (LLVMTypes::MaskType != LLVMTypes::Int32VectorType && - t == LLVMTypes::MaskType) - return AtomicType::VaryingBool; - else if (t == LLVMTypes::Int8VectorType) + if (t == LLVMTypes::Int8VectorType) return intAsUnsigned ? AtomicType::VaryingUInt8 : AtomicType::VaryingInt8; else if (t == LLVMTypes::Int16VectorType) return intAsUnsigned ? AtomicType::VaryingUInt16 : AtomicType::VaryingInt16; @@ -127,6 +124,8 @@ lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) { return AtomicType::VaryingDouble; else if (t == LLVMTypes::Int64VectorType) return intAsUnsigned ? AtomicType::VaryingUInt64 : AtomicType::VaryingInt64; + else if (t == LLVMTypes::MaskType) + return AtomicType::VaryingBool; // pointers to uniform else if (t == LLVMTypes::Int8PointerType) @@ -303,6 +302,7 @@ lCheckModuleIntrinsics(llvm::Module *module) { // check the llvm.x86.* intrinsics for now... 
if (!strncmp(funcName.c_str(), "llvm.x86.", 9)) { llvm::Intrinsic::ID id = (llvm::Intrinsic::ID)func->getIntrinsicID(); + if (id == 0) fprintf(stderr, "FATAL: intrinsic is not found: %s \n", funcName.c_str()); Assert(id != 0); llvm::Type *intrinsicType = llvm::Intrinsic::getType(*g->ctx, id); @@ -488,7 +488,6 @@ lSetInternalFunctions(llvm::Module *module) { "__num_cores", "__packed_load_active", "__packed_store_active", - "__pause", "__popcnt_int32", "__popcnt_int64", "__prefetch_read_uniform_1", @@ -502,6 +501,8 @@ lSetInternalFunctions(llvm::Module *module) { "__rdrand_i64", "__reduce_add_double", "__reduce_add_float", + "__reduce_add_int8", + "__reduce_add_int16", "__reduce_add_int32", "__reduce_add_int64", "__reduce_equal_double", @@ -576,20 +577,34 @@ lSetInternalFunctions(llvm::Module *module) { "__stdlib_pow", "__stdlib_powf", "__stdlib_sin", + "__stdlib_asin", "__stdlib_sincos", "__stdlib_sincosf", "__stdlib_sinf", "__stdlib_tan", "__stdlib_tanf", - "__svml_sin", - "__svml_cos", - "__svml_sincos", - "__svml_tan", - "__svml_atan", - "__svml_atan2", - "__svml_exp", - "__svml_log", - "__svml_pow", + "__svml_sind", + "__svml_asind", + "__svml_cosd", + "__svml_acosd", + "__svml_sincosd", + "__svml_tand", + "__svml_atand", + "__svml_atan2d", + "__svml_expd", + "__svml_logd", + "__svml_powd", + "__svml_sinf", + "__svml_asinf", + "__svml_cosf", + "__svml_acosf", + "__svml_sincosf", + "__svml_tanf", + "__svml_atanf", + "__svml_atan2f", + "__svml_expf", + "__svml_logf", + "__svml_powf", "__undef_uniform", "__undef_varying", "__vec4_add_float", @@ -640,7 +655,7 @@ AddBitcodeToModule(const unsigned char *bitcode, int length, llvm::Triple bcTriple(bcModule->getTargetTriple()); Debug(SourcePos(), "module triple: %s\nbitcode triple: %s\n", mTriple.str().c_str(), bcTriple.str().c_str()); -#ifndef __arm__ +#if defined(ISPC_ARM_ENABLED) && !defined(__arm__) // FIXME: More ugly and dangerous stuff. We really haven't set up // proper build and runtime infrastructure for ispc to do // cross-compilation, yet it's at minimum useful to be able to emit @@ -656,8 +671,12 @@ AddBitcodeToModule(const unsigned char *bitcode, int length, // the values for an ARM target. This maybe won't cause problems // in the generated code, since bulitins.c doesn't do anything too // complex w.r.t. struct layouts, etc. 
- if (g->target->getISA() != Target::NEON && + if (g->target->getISA() != Target::NEON32 && + g->target->getISA() != Target::NEON16 && + g->target->getISA() != Target::NEON8 && g->target->getISA() != Target::NVPTX64) +#else + if (g->target->getISA() != Target::NVPTX64) #endif // !__arm__ { Assert(bcTriple.getArch() == llvm::Triple::UnknownArch || @@ -831,15 +850,35 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod } break; }; - case Target::NEON: { +#ifdef ISPC_ARM_ENABLED + case Target::NEON8: { if (runtime32) { - EXPORT_MODULE(builtins_bitcode_neon_32bit); + EXPORT_MODULE(builtins_bitcode_neon_8_32bit); } else { - EXPORT_MODULE(builtins_bitcode_neon_64bit); + EXPORT_MODULE(builtins_bitcode_neon_8_64bit); } break; } + case Target::NEON16: { + if (runtime32) { + EXPORT_MODULE(builtins_bitcode_neon_16_32bit); + } + else { + EXPORT_MODULE(builtins_bitcode_neon_16_64bit); + } + break; + } + case Target::NEON32: { + if (runtime32) { + EXPORT_MODULE(builtins_bitcode_neon_32_32bit); + } + else { + EXPORT_MODULE(builtins_bitcode_neon_32_64bit); + } + break; + } +#endif case Target::SSE2: { switch (g->target->getVectorWidth()) { case 4: @@ -875,10 +914,31 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod break; case 8: if (runtime32) { - EXPORT_MODULE(builtins_bitcode_sse4_x2_32bit); + if (g->target->getMaskBitCount() == 16) { + EXPORT_MODULE(builtins_bitcode_sse4_16_32bit); + } + else { + Assert(g->target->getMaskBitCount() == 32); + EXPORT_MODULE(builtins_bitcode_sse4_x2_32bit); + } } else { - EXPORT_MODULE(builtins_bitcode_sse4_x2_64bit); + if (g->target->getMaskBitCount() == 16) { + EXPORT_MODULE(builtins_bitcode_sse4_16_64bit); + } + else { + Assert(g->target->getMaskBitCount() == 32); + EXPORT_MODULE(builtins_bitcode_sse4_x2_64bit); + } + } + break; + case 16: + Assert(g->target->getMaskBitCount() == 8); + if (runtime32) { + EXPORT_MODULE(builtins_bitcode_sse4_8_32bit); + } + else { + EXPORT_MODULE(builtins_bitcode_sse4_8_64bit); } break; default: @@ -888,6 +948,14 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod } case Target::AVX: { switch (g->target->getVectorWidth()) { + case 4: + if (runtime32) { + EXPORT_MODULE(builtins_bitcode_avx1_i64x4_32bit); + } + else { + EXPORT_MODULE(builtins_bitcode_avx1_i64x4_64bit); + } + break; case 8: if (runtime32) { EXPORT_MODULE(builtins_bitcode_avx1_32bit); @@ -1050,16 +1118,33 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod // If the user wants the standard library to be included, parse the // serialized version of the stdlib.ispc file to get its // definitions added. 
+ extern char stdlib_mask1_code[], stdlib_mask8_code[]; + extern char stdlib_mask16_code[], stdlib_mask32_code[], stdlib_mask64_code[]; if (g->target->getISA() == Target::GENERIC && - g->target->getVectorWidth() != 1) { // 1 wide uses x86 stdlib - extern char stdlib_generic_code[]; - yy_scan_string(stdlib_generic_code); - yyparse(); + g->target->getVectorWidth() == 1) { // 1 wide uses 32 stdlib + yy_scan_string(stdlib_mask32_code); } else { - extern char stdlib_x86_code[]; - yy_scan_string(stdlib_x86_code); - yyparse(); + switch (g->target->getMaskBitCount()) { + case 1: + yy_scan_string(stdlib_mask1_code); + break; + case 8: + yy_scan_string(stdlib_mask8_code); + break; + case 16: + yy_scan_string(stdlib_mask16_code); + break; + case 32: + yy_scan_string(stdlib_mask32_code); + break; + case 64: + yy_scan_string(stdlib_mask64_code); + break; + default: + FATAL("Unhandled mask bit size for stdlib.ispc"); + } } + yyparse(); } } diff --git a/builtins/dispatch.ll b/builtins/dispatch.ll index f1d5a969..ba216df7 100644 --- a/builtins/dispatch.ll +++ b/builtins/dispatch.ll @@ -1,4 +1,4 @@ -;; Copyright (c) 2011, Intel Corporation +;; Copyright (c) 2011-2013, Intel Corporation ;; All rights reserved. ;; ;; Redistribution and use in source and binary forms, with or without @@ -41,15 +41,13 @@ @__system_best_isa = internal global i32 -1 -declare void @abort() noreturn - ;; The below is the result of running "clang -O2 -emit-llvm -c -o -" on the ;; following code... Specifically, __get_system_isa should return a value ;; corresponding to one of the Target::ISA enumerant values that gives the ;; most capable ISA that the curremt system can run. ;; -;; Note: clang from LLVM 3.0 should be used if this is updated, for maximum -;; backwards compatibility for anyone building ispc with LLVM 3.0 +;; Note: clang from LLVM 3.1 should be used if this is updated, for maximum +;; backwards compatibility for anyone building ispc with LLVM 3.1 ;; ;; #include ;; #include @@ -60,7 +58,7 @@ declare void @abort() noreturn ;; : "0" (infoType)); ;; } ;; -;; /* Save %ebx in case it's the PIC register */ +;; // Save %ebx in case it's the PIC register. ;; static void __cpuid_count(int info[4], int level, int count) { ;; __asm__ __volatile__ ("xchg{l}\t{%%}ebx, %1\n\t" ;; "cpuid\n\t" @@ -69,13 +67,23 @@ declare void @abort() noreturn ;; : "0" (level), "2" (count)); ;; } ;; +;; static int __os_has_avx_support() { +;; // Check xgetbv; this uses a .byte sequence instead of the instruction +;; // directly because older assemblers do not include support for xgetbv and +;; // there is no easy way to conditionally compile based on the assembler used. +;; int rEAX, rEDX; +;; __asm__ __volatile__ (".byte 0x0f, 0x01, 0xd0" : "=a" (rEAX), "=d" (rEDX) : "c" (0)); +;; return (rEAX & 6) == 6; +;; } +;; ;; int32_t __get_system_isa() { ;; int info[4]; ;; __cpuid(info, 1); ;; -;; /* NOTE: the values returned below must be the same as the -;; corresponding enumerant values in Target::ISA. */ -;; if ((info[2] & (1 << 28)) != 0) { +;; // NOTE: the values returned below must be the same as the +;; // corresponding enumerant values in Target::ISA. +;; if ((info[2] & (1 << 28)) != 0 && +;; __os_has_avx_support()) { ;; if ((info[2] & (1 << 29)) != 0 && // F16C ;; (info[2] & (1 << 30)) != 0) { // RDRAND ;; // So far, so good. AVX2? 
@@ -98,47 +106,56 @@ declare void @abort() noreturn ;; abort(); ;; } -define i32 @__get_system_isa() nounwind uwtable ssp { +define i32 @__get_system_isa() nounwind uwtable { entry: %0 = tail call { i32, i32, i32, i32 } asm sideeffect "cpuid", "={ax},={bx},={cx},={dx},0,~{dirflag},~{fpsr},~{flags}"(i32 1) nounwind %asmresult5.i = extractvalue { i32, i32, i32, i32 } %0, 2 %asmresult6.i = extractvalue { i32, i32, i32, i32 } %0, 3 %and = and i32 %asmresult5.i, 268435456 %cmp = icmp eq i32 %and, 0 - br i1 %cmp, label %if.else13, label %if.then + br i1 %cmp, label %if.else14, label %land.lhs.true -if.then: ; preds = %entry - %1 = and i32 %asmresult5.i, 1610612736 - %2 = icmp eq i32 %1, 1610612736 - br i1 %2, label %if.then7, label %return +land.lhs.true: ; preds = %entry + %1 = tail call { i32, i32 } asm sideeffect ".byte 0x0f, 0x01, 0xd0", "={ax},={dx},{cx},~{dirflag},~{fpsr},~{flags}"(i32 0) nounwind + %asmresult.i25 = extractvalue { i32, i32 } %1, 0 + %and.i = and i32 %asmresult.i25, 6 + %cmp.i = icmp eq i32 %and.i, 6 + br i1 %cmp.i, label %if.then, label %if.else14 -if.then7: ; preds = %if.then - %3 = tail call { i32, i32, i32, i32 } asm sideeffect "xchg$(l$)\09$(%$)ebx, $1\0A\09cpuid\0A\09xchg$(l$)\09$(%$)ebx, $1\0A\09", "={ax},=r,={cx},={dx},0,2,~{dirflag},~{fpsr},~{flags}"(i32 7, i32 0) nounwind - %asmresult4.i28 = extractvalue { i32, i32, i32, i32 } %3, 1 - %and10 = lshr i32 %asmresult4.i28, 5 - %4 = and i32 %and10, 1 - %5 = add i32 %4, 3 +if.then: ; preds = %land.lhs.true + %2 = and i32 %asmresult5.i, 1610612736 + %3 = icmp eq i32 %2, 1610612736 + br i1 %3, label %if.then8, label %return + +if.then8: ; preds = %if.then + %4 = tail call { i32, i32, i32, i32 } asm sideeffect "xchg$(l$)\09$(%$)ebx, $1\0A\09cpuid\0A\09xchg$(l$)\09$(%$)ebx, $1\0A\09", "={ax},=r,={cx},={dx},0,2,~{dirflag},~{fpsr},~{flags}"(i32 7, i32 0) nounwind + %asmresult4.i30 = extractvalue { i32, i32, i32, i32 } %4, 1 + %and11 = lshr i32 %asmresult4.i30, 5 + %5 = and i32 %and11, 1 + %6 = add i32 %5, 3 br label %return -if.else13: ; preds = %entry - %and15 = and i32 %asmresult5.i, 524288 - %cmp16 = icmp eq i32 %and15, 0 - br i1 %cmp16, label %if.else18, label %return +if.else14: ; preds = %land.lhs.true, %entry + %and16 = and i32 %asmresult5.i, 524288 + %cmp17 = icmp eq i32 %and16, 0 + br i1 %cmp17, label %if.else19, label %return -if.else18: ; preds = %if.else13 - %and20 = and i32 %asmresult6.i, 67108864 - %cmp21 = icmp eq i32 %and20, 0 - br i1 %cmp21, label %if.else23, label %return +if.else19: ; preds = %if.else14 + %and21 = and i32 %asmresult6.i, 67108864 + %cmp22 = icmp eq i32 %and21, 0 + br i1 %cmp22, label %if.else24, label %return -if.else23: ; preds = %if.else18 +if.else24: ; preds = %if.else19 tail call void @abort() noreturn nounwind unreachable -return: ; preds = %if.else18, %if.else13, %if.then7, %if.then - %retval.0 = phi i32 [ %5, %if.then7 ], [ 2, %if.then ], [ 1, %if.else13 ], [ 0, %if.else18 ] +return: ; preds = %if.else19, %if.else14, %if.then8, %if.then + %retval.0 = phi i32 [ %6, %if.then8 ], [ 2, %if.then ], [ 1, %if.else14 ], [ 0, %if.else19 ] ret i32 %retval.0 } +declare void @abort() noreturn nounwind + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; This function is called by each of the dispatch functions we generate; ;; it sets @__system_best_isa if it is unset. 
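The regenerated __get_system_isa() above adds an OS-level check before reporting any AVX-class ISA: besides CPUID.1:ECX bit 28, it reads XCR0 with xgetbv (emitted as ".byte 0x0f, 0x01, 0xd0" so older assemblers accept it) and requires bits 1 and 2, i.e. both XMM and YMM state saving enabled. The constants it returns must still line up with the Target::ISA enumerants; the sketch below spells out that contract, with the numbering inferred from the IR's return values rather than copied from ispc.h, and the names chosen here purely for illustration:

    /* Illustration only; the ordering is an assumption reconstructed from the
       phi node in __get_system_isa() (returns 0, 1, 2, and 3 or 3 + 1). */
    enum DispatchISA { ISA_SSE2 = 0, ISA_SSE4 = 1, ISA_AVX = 2,
                       ISA_AVX11 = 3, ISA_AVX2 = 4 };

    /* The new gate: AVX-class results are possible only when the CPU advertises
       AVX and the OS keeps YMM state (XCR0 bits 1 and 2 set, read via xgetbv). */
    static bool avxUsable(unsigned cpuid1_ecx, unsigned xcr0) {
        return (cpuid1_ecx & (1u << 28)) != 0 && (xcr0 & 6) == 6;
    }
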
diff --git a/builtins/svml.m4 b/builtins/svml.m4 new file mode 100644 index 00000000..0a587577 --- /dev/null +++ b/builtins/svml.m4 @@ -0,0 +1,217 @@ +;; copyright stub :) +;; Copyright (c) 2013, Intel Corporation +;; All rights reserved. +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are +;; met: +;; +;; * Redistributions of source code must retain the above copyright +;; notice, this list of conditions and the following disclaimer. +;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; +;; * Neither the name of Intel Corporation nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +;; svml macro + +;; svml_stubs : stubs for svml calls +;; $1 - type ("float" or "double") +;; $2 - svml internal function suffix ("f" for float, "d" for double) +;; $3 - vector width +define(`svml_stubs',` + declare <$3 x $1> @__svml_sin$2(<$3 x $1>) nounwind readnone alwaysinline + declare <$3 x $1> @__svml_asin$2(<$3 x $1>) nounwind readnone alwaysinline + declare <$3 x $1> @__svml_cos$2(<$3 x $1>) nounwind readnone alwaysinline + declare void @__svml_sincos$2(<$3 x $1>, <$3 x $1> *, <$3 x $1> *) nounwind readnone alwaysinline + declare <$3 x $1> @__svml_tan$2(<$3 x $1>) nounwind readnone alwaysinline + declare <$3 x $1> @__svml_atan$2(<$3 x $1>) nounwind readnone alwaysinline + declare <$3 x $1> @__svml_atan2$2(<$3 x $1>, <$3 x $1>) nounwind readnone alwaysinline + declare <$3 x $1> @__svml_exp$2(<$3 x $1>) nounwind readnone alwaysinline + declare <$3 x $1> @__svml_log$2(<$3 x $1>) nounwind readnone alwaysinline + declare <$3 x $1> @__svml_pow$2(<$3 x $1>, <$3 x $1>) nounwind readnone alwaysinline +') + +;; svml_declare : declaration of __svml_* intrinsics +;; $1 - type ("float" or "double") +;; $2 - __svml_* intrinsic function suffix +;; float: "f4"(sse) "f8"(avx) "f16"(avx512) +;; double: "2"(sse) "4"(avx) "8"(avx512) +;; $3 - vector width +define(`svml_declare',` + declare <$3 x $1> @__svml_sin$2(<$3 x $1>) nounwind readnone + declare <$3 x $1> @__svml_asin$2(<$3 x $1>) nounwind readnone + declare <$3 x $1> @__svml_cos$2(<$3 x $1>) nounwind readnone + declare <$3 x $1> @__svml_sincos$2(<$3 x $1> *, <$3 x $1>) nounwind readnone + declare <$3 x $1> @__svml_tan$2(<$3 x $1>) nounwind readnone + declare <$3 x $1> @__svml_atan$2(<$3 x $1>) nounwind readnone + declare <$3 x $1> @__svml_atan2$2(<$3 x $1>, <$3 x $1>) nounwind readnone + declare <$3 x $1> 
@__svml_exp$2(<$3 x $1>) nounwind readnone + declare <$3 x $1> @__svml_log$2(<$3 x $1>) nounwind readnone + declare <$3 x $1> @__svml_pow$2(<$3 x $1>, <$3 x $1>) nounwind readnone +'); + +;; defintition of __svml_* internal functions +;; $1 - type ("float" or "double") +;; $2 - __svml_* intrinsic function suffix +;; float: "f4"(sse) "f8"(avx) "f16"(avx512) +;; double: "2"(sse) "4"(avx) "8"(avx512) +;; $3 - vector width +;; $4 - svml internal function suffix ("f" for float, "d" for double) +define(`svml_define',` + define <$3 x $1> @__svml_sin$4(<$3 x $1>) nounwind readnone alwaysinline { + %ret = call <$3 x $1> @__svml_sin$2(<$3 x $1> %0) + ret <$3 x $1> %ret + } + define <$3 x $1> @__svml_asin$4(<$3 x $1>) nounwind readnone alwaysinline { + %ret = call <$3 x $1> @__svml_asin$2(<$3 x $1> %0) + ret <$3 x $1> %ret + } + + define <$3 x $1> @__svml_cos$4(<$3 x $1>) nounwind readnone alwaysinline { + %ret = call <$3 x $1> @__svml_cos$2(<$3 x $1> %0) + ret <$3 x $1> %ret + } + + define void @__svml_sincos$4(<$3 x $1>, <$3 x $1> *, <$3 x $1> *) nounwind readnone alwaysinline { + %s = call <$3 x $1> @__svml_sincos$2(<$3 x $1> * %2, <$3 x $1> %0) + store <$3 x $1> %s, <$3 x $1> * %1 + ret void + } + + define <$3 x $1> @__svml_tan$4(<$3 x $1>) nounwind readnone alwaysinline { + %ret = call <$3 x $1> @__svml_tan$2(<$3 x $1> %0) + ret <$3 x $1> %ret + } + + define <$3 x $1> @__svml_atan$4(<$3 x $1>) nounwind readnone alwaysinline { + %ret = call <$3 x $1> @__svml_atan$2(<$3 x $1> %0) + ret <$3 x $1> %ret + } + + define <$3 x $1> @__svml_atan2$4(<$3 x $1>, <$3 x $1>) nounwind readnone alwaysinline { + %ret = call <$3 x $1> @__svml_atan2$2(<$3 x $1> %0, <$3 x $1> %1) + ret <$3 x $1> %ret + } + + define <$3 x $1> @__svml_exp$4(<$3 x $1>) nounwind readnone alwaysinline { + %ret = call <$3 x $1> @__svml_exp$2(<$3 x $1> %0) + ret <$3 x $1> %ret + } + + define <$3 x $1> @__svml_log$4(<$3 x $1>) nounwind readnone alwaysinline { + %ret = call <$3 x $1> @__svml_log$2(<$3 x $1> %0) + ret <$3 x $1> %ret + } + + define <$3 x $1> @__svml_pow$4(<$3 x $1>, <$3 x $1>) nounwind readnone alwaysinline { + %ret = call <$3 x $1> @__svml_pow$2(<$3 x $1> %0, <$3 x $1> %1) + ret <$3 x $1> %ret + } +') + + +;; svml_define_x : defintition of __svml_* internal functions operation on extended width +;; $1 - type ("float" or "double") +;; $2 - __svml_* intrinsic function suffix +;; float: "f4"(sse) "f8"(avx) "f16"(avx512) +;; double: "2"(sse) "4"(avx) "8"(avx512) +;; $3 - vector width +;; $4 - svml internal function suffix ("f" for float, "d" for double) +;; $5 - extended width, must be at least twice the native vector width +;; contigent on existing of unary$3to$5 and binary$3to$5 macros + +;; *todo*: in sincos call use __svml_sincos[f][2,4,8,16] call, e.g. 
+;;define void @__svml_sincosf(<8 x float>, <8 x float> *, +;; <8 x float> *) nounwind readnone alwaysinline { +;; ; call svml_sincosf4 two times with the two 4-wide sub-vectors +;; %a = shufflevector <8 x float> %0, <8 x float> undef, +;; <4 x i32> +;; %b = shufflevector <8 x float> %0, <8 x float> undef, +;; <4 x i32> +;; +;; %cospa = alloca <4 x float> +;; %sa = call <4 x float> @__svml_sincosf4(<4 x float> * %cospa, <4 x float> %a) +;; +;; %cospb = alloca <4 x float> +;; %sb = call <4 x float> @__svml_sincosf4(<4 x float> * %cospb, <4 x float> %b) +;; +;; %sin = shufflevector <4 x float> %sa, <4 x float> %sb, +;; <8 x i32> +;; store <8 x float> %sin, <8 x float> * %1 +;; +;; %cosa = load <4 x float> * %cospa +;; %cosb = load <4 x float> * %cospb +;; %cos = shufflevector <4 x float> %cosa, <4 x float> %cosb, +;; <8 x i32> +;; store <8 x float> %cos, <8 x float> * %2 +;; +;; ret void +;;} +define(`svml_define_x',` + define <$5 x $1> @__svml_sin$4(<$5 x $1>) nounwind readnone alwaysinline { + unary$3to$5(ret, $1, @__svml_sin$2, %0) + ret <$5 x $1> %ret + } + define <$5 x $1> @__svml_asin$4(<$5 x $1>) nounwind readnone alwaysinline { + unary$3to$5(ret, $1, @__svml_asin$2, %0) + ret <$5 x $1> %ret + } + define <$5 x $1> @__svml_cos$4(<$5 x $1>) nounwind readnone alwaysinline { + unary$3to$5(ret, $1, @__svml_cos$2, %0) + ret <$5 x $1> %ret + } + define void @__svml_sincos$4(<$5 x $1>,<$5 x $1>*,<$5 x $1>*) nounwind readnone alwaysinline + { + %s = call <$5 x $1> @__svml_sin$4(<$5 x $1> %0) + %c = call <$5 x $1> @__svml_cos$4(<$5 x $1> %0) + store <$5 x $1> %s, <$5 x $1> * %1 + store <$5 x $1> %c, <$5 x $1> * %2 + ret void + } + define <$5 x $1> @__svml_tan$4(<$5 x $1>) nounwind readnone alwaysinline { + unary$3to$5(ret, $1, @__svml_tan$2, %0) + ret <$5 x $1> %ret + } + define <$5 x $1> @__svml_atan$4(<$5 x $1>) nounwind readnone alwaysinline { + unary$3to$5(ret, $1, @__svml_atan$2, %0) + ret <$5 x $1> %ret + } + define <$5 x $1> @__svml_atan2$4(<$5 x $1>,<$5 x $1>) nounwind readnone alwaysinline { + binary$3to$5(ret, $1, @__svml_atan2$2, %0, %1) + ret <$5 x $1> %ret + } + define <$5 x $1> @__svml_exp$4(<$5 x $1>) nounwind readnone alwaysinline { + unary$3to$5(ret, $1, @__svml_exp$2, %0) + ret <$5 x $1> %ret + } + define <$5 x $1> @__svml_log$4(<$5 x $1>) nounwind readnone alwaysinline { + unary$3to$5(ret, $1, @__svml_log$2, %0) + ret <$5 x $1> %ret + } + define <$5 x $1> @__svml_pow$4(<$5 x $1>,<$5 x $1>) nounwind readnone alwaysinline { + binary$3to$5(ret, $1, @__svml_pow$2, %0, %1) + ret <$5 x $1> %ret + } +') + diff --git a/builtins/target-avx-common.ll b/builtins/target-avx-common.ll index dcbe0a66..1d317713 100644 --- a/builtins/target-avx-common.ll +++ b/builtins/target-avx-common.ll @@ -277,3 +277,9 @@ define double @__max_uniform_double(double, double) nounwind readnone alwaysinli sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1) ret double %ret } + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +define_avgs() + diff --git a/builtins/target-avx-x2.ll b/builtins/target-avx-x2.ll index 8c6b7753..f8fd5cd5 100644 --- a/builtins/target-avx-x2.ll +++ b/builtins/target-avx-x2.ll @@ -137,19 +137,14 @@ define <16 x float> @__sqrt_varying_float(<16 x float>) nounwind readonly always ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; svml -; FIXME: need either to wire these up to the 8-wide SVML entrypoints, -; or, use the macro to call the 4-wide ones 4x with our 16-wide -; vectors... 
+include(`svml.m4') +;; single precision +svml_declare(float,f8,8) +svml_define_x(float,f8,8,f,16) -declare <16 x float> @__svml_sin(<16 x float>) -declare <16 x float> @__svml_cos(<16 x float>) -declare void @__svml_sincos(<16 x float>, <16 x float> *, <16 x float> *) -declare <16 x float> @__svml_tan(<16 x float>) -declare <16 x float> @__svml_atan(<16 x float>) -declare <16 x float> @__svml_atan2(<16 x float>, <16 x float>) -declare <16 x float> @__svml_exp(<16 x float>) -declare <16 x float> @__svml_log(<16 x float>) -declare <16 x float> @__svml_pow(<16 x float>, <16 x float>) +;; double precision +svml_declare(double,4,4) +svml_define_x(double,4,4,d,16) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max @@ -271,6 +266,33 @@ reduce_equal(16) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; horizontal int32 ops +declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone + +define i16 @__reduce_add_int8(<16 x i8>) nounwind readnone alwaysinline { + %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %0, + <16 x i8> zeroinitializer) + %r0 = extractelement <2 x i64> %rv, i32 0 + %r1 = extractelement <2 x i64> %rv, i32 1 + %r = add i64 %r0, %r1 + %r16 = trunc i64 %r to i16 + ret i16 %r16 +} + +define internal <16 x i16> @__add_varying_i16(<16 x i16>, + <16 x i16>) nounwind readnone alwaysinline { + %r = add <16 x i16> %0, %1 + ret <16 x i16> %r +} + +define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline { + %r = add i16 %0, %1 + ret i16 %r +} + +define i16 @__reduce_add_int16(<16 x i16>) nounwind readnone alwaysinline { + reduce16(i16, @__add_varying_i16, @__add_uniform_i16) +} + define <16 x i32> @__add_varying_int32(<16 x i32>, <16 x i32>) nounwind readnone alwaysinline { %s = add <16 x i32> %0, %1 diff --git a/builtins/target-avx.ll b/builtins/target-avx.ll index e6ab3a4b..196e5ea4 100644 --- a/builtins/target-avx.ll +++ b/builtins/target-avx.ll @@ -137,19 +137,14 @@ define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysin ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; svml -; FIXME: need either to wire these up to the 8-wide SVML entrypoints, -; or, use the macro to call the 4-wide ones twice with our 8-wide -; vectors... 
+include(`svml.m4') +;; single precision +svml_declare(float,f8,8) +svml_define(float,f8,8,f) -declare <8 x float> @__svml_sin(<8 x float>) -declare <8 x float> @__svml_cos(<8 x float>) -declare void @__svml_sincos(<8 x float>, <8 x float> *, <8 x float> *) -declare <8 x float> @__svml_tan(<8 x float>) -declare <8 x float> @__svml_atan(<8 x float>) -declare <8 x float> @__svml_atan2(<8 x float>, <8 x float>) -declare <8 x float> @__svml_exp(<8 x float>) -declare <8 x float> @__svml_log(<8 x float>) -declare <8 x float> @__svml_pow(<8 x float>, <8 x float>) +;; double precision +svml_declare(double,4,4) +svml_define_x(double,4,4,d,8) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max @@ -217,7 +212,6 @@ define float @__reduce_add_float(<8 x float>) nounwind readonly alwaysinline { ret float %sum } - define float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline { reduce8(float, @__min_varying_float, @__min_uniform_float) } @@ -229,6 +223,42 @@ define float @__reduce_max_float(<8 x float>) nounwind readnone alwaysinline { reduce_equal(8) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; horizontal int8 ops + +declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone + +define i16 @__reduce_add_int8(<8 x i8>) nounwind readnone alwaysinline { + %wide8 = shufflevector <8 x i8> %0, <8 x i8> zeroinitializer, + <16 x i32> + %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8, + <16 x i8> zeroinitializer) + %r0 = extractelement <2 x i64> %rv, i32 0 + %r1 = extractelement <2 x i64> %rv, i32 1 + %r = add i64 %r0, %r1 + %r16 = trunc i64 %r to i16 + ret i16 %r16 +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; horizontal int16 ops + +define internal <8 x i16> @__add_varying_i16(<8 x i16>, + <8 x i16>) nounwind readnone alwaysinline { + %r = add <8 x i16> %0, %1 + ret <8 x i16> %r +} + +define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline { + %r = add i16 %0, %1 + ret i16 %r +} + +define i16 @__reduce_add_int16(<8 x i16>) nounwind readnone alwaysinline { + reduce8(i16, @__add_varying_i16, @__add_uniform_i16) +} + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; horizontal int32 ops @@ -257,20 +287,14 @@ define i32 @__reduce_max_int32(<8 x i32>) nounwind readnone alwaysinline { reduce8(i32, @__max_varying_int32, @__max_uniform_int32) } - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;; horizontal uint32 ops - define i32 @__reduce_min_uint32(<8 x i32>) nounwind readnone alwaysinline { reduce8(i32, @__min_varying_uint32, @__min_uniform_uint32) } - define i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone alwaysinline { reduce8(i32, @__max_varying_uint32, @__max_uniform_uint32) } - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; horizontal double ops @@ -329,9 +353,6 @@ define i64 @__reduce_max_int64(<8 x i64>) nounwind readnone alwaysinline { } -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;; horizontal uint64 ops - define i64 @__reduce_min_uint64(<8 x i64>) nounwind readnone alwaysinline { reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64) } diff --git a/builtins/target-avx1-i64x4.ll b/builtins/target-avx1-i64x4.ll new file mode 100644 index 00000000..d183f1ce --- /dev/null +++ b/builtins/target-avx1-i64x4.ll @@ -0,0 +1,81 @@ +;; Copyright (c) 2013, Intel Corporation +;; All rights 
reserved. +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are +;; met: +;; +;; * Redistributions of source code must retain the above copyright +;; notice, this list of conditions and the following disclaimer. +;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; +;; * Neither the name of Intel Corporation nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +include(`target-avx1-i64x4base.ll') + +rdrand_decls() + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int min/max + +define <4 x i32> @__min_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline { + %call = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %0, <4 x i32> %1) + ret <4 x i32> %call +} +define <4 x i32> @__max_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline { + %call = call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %0, <4 x i32> %1) + + ret <4 x i32> %call +} + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; unsigned int min/max + +define <4 x i32> @__min_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline { + %call = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %0, <4 x i32> %1) + ret <4 x i32> %call +} + +define <4 x i32> @__max_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline { + %call = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %0, <4 x i32> %1) + ret <4 x i32> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; half conversion routines + +ifelse(NO_HALF_DECLARES, `1', `', ` +declare float @__half_to_float_uniform(i16 %v) nounwind readnone +declare @__half_to_float_varying( %v) nounwind readnone +declare i16 @__float_to_half_uniform(float %v) nounwind readnone +declare @__float_to_half_varying( %v) nounwind readnone +') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; gather + +gen_gather_factored(i8) +gen_gather_factored(i16) +gen_gather_factored(i32) +gen_gather_factored(float) +gen_gather_factored(i64) +gen_gather_factored(double) diff --git a/builtins/target-avx1-i64x4base.ll b/builtins/target-avx1-i64x4base.ll new file mode 100644 index 00000000..e1832030 --- /dev/null +++ b/builtins/target-avx1-i64x4base.ll @@ -0,0 +1,513 @@ +;; Copyright (c) 2013, Intel Corporation +;; All rights reserved. 
+;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are +;; met: +;; +;; * Redistributions of source code must retain the above copyright +;; notice, this list of conditions and the following disclaimer. +;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; +;; * Neither the name of Intel Corporation nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Basic 4-wide definitions + +define(`WIDTH',`4') +define(`MASK',`i64') +include(`util.m4') + +stdlib_core() +packed_load_and_store() +scans() +int64minmax() + +include(`target-avx-common.ll') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rcp + +;; sse intrinsic +declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone + +define <4 x float> @__rcp_varying_float(<4 x float>) nounwind readonly alwaysinline { + ; float iv = __rcp_v(v); + ; return iv * (2. 
- v * iv); + + %call = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %0) + ; do one N-R iteration + %v_iv = fmul <4 x float> %0, %call + %two_minus = fsub <4 x float> , %v_iv + %iv_mul = fmul <4 x float> %call, %two_minus + ret <4 x float> %iv_mul +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rounding floats + +;; sse intrinsic +declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone + +define <4 x float> @__round_varying_float(<4 x float>) nounwind readonly alwaysinline { + ; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8 + %call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 8) + ret <4 x float> %call +} + +define <4 x float> @__floor_varying_float(<4 x float>) nounwind readonly alwaysinline { + ; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9 + %call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 9) + ret <4 x float> %call +} + +define <4 x float> @__ceil_varying_float(<4 x float>) nounwind readonly alwaysinline { + ; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10 + %call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 10) + ret <4 x float> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rounding doubles + +;; avx intrinsic +declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind readnone + +define <4 x double> @__round_varying_double(<4 x double>) nounwind readonly alwaysinline { + %call = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %0, i32 8) + ret <4 x double> %call +} + +define <4 x double> @__floor_varying_double(<4 x double>) nounwind readonly alwaysinline { + ; roundpd, round down 0b01 | don't signal precision exceptions 0b1000 = 9 + %call = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %0, i32 9) + ret <4 x double> %call +} + + +define <4 x double> @__ceil_varying_double(<4 x double>) nounwind readonly alwaysinline { + ; roundpd, round up 0b10 | don't signal precision exceptions 0b1000 = 10 + %call = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %0, i32 10) + ret <4 x double> %call +} + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rsqrt + +;; sse intrinsic +declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone + +define <4 x float> @__rsqrt_varying_float(<4 x float> %v) nounwind readonly alwaysinline { + ; float is = __rsqrt_v(v); + %is = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %v) + ; Newton-Raphson iteration to improve precision + ; return 0.5 * is * (3. 
- (v * is) * is); + %v_is = fmul <4 x float> %v, %is + %v_is_is = fmul <4 x float> %v_is, %is + %three_sub = fsub <4 x float> , %v_is_is + %is_mul = fmul <4 x float> %is, %three_sub + %half_scale = fmul <4 x float> , %is_mul + ret <4 x float> %half_scale +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; sqrt + +;; sse intrinsic +declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone + +define <4 x float> @__sqrt_varying_float(<4 x float>) nounwind readonly alwaysinline { + %call = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %0) + ret <4 x float> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; double precision sqrt + +;; avx§ intrinsic +declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone + +define <4 x double> @__sqrt_varying_double(<4 x double>) nounwind alwaysinline { + %call = call <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double> %0) + ret <4 x double> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; svml + +include(`svml.m4') +;; single precision +svml_declare(float,f4,4) +svml_define(float,f4,4,f) + +;; double precision +svml_declare(double,4,4) +svml_define(double,4,4,d) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; float min/max + +;; sse intrinsics +declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone +declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone + +define <4 x float> @__max_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline { + %call = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %0, <4 x float> %1) + ret <4 x float> %call +} + +define <4 x float> @__min_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline { + %call = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %0, <4 x float> %1) + ret <4 x float> %call +} + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; horizontal ops + +;; sse intrinsic +declare i32 @llvm.x86.avx.movmsk.pd.256(<4 x double>) nounwind readnone + +define i64 @__movmsk(<4 x i64>) nounwind readnone alwaysinline { + %floatmask = bitcast <4 x i64> %0 to <4 x double> + %v = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %floatmask) nounwind readnone + %v64 = zext i32 %v to i64 + ret i64 %v64 +} + +define i1 @__any(<4 x i64>) nounwind readnone alwaysinline { + %floatmask = bitcast <4 x i64> %0 to <4 x double> + %v = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %floatmask) nounwind readnone + %cmp = icmp ne i32 %v, 0 + ret i1 %cmp +} + +define i1 @__all(<4 x i64>) nounwind readnone alwaysinline { + %floatmask = bitcast <4 x i64> %0 to <4 x double> + %v = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %floatmask) nounwind readnone + %cmp = icmp eq i32 %v, 15 + ret i1 %cmp +} + +define i1 @__none(<4 x i64>) nounwind readnone alwaysinline { + %floatmask = bitcast <4 x i64> %0 to <4 x double> + %v = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %floatmask) nounwind readnone + %cmp = icmp eq i32 %v, 0 + ret i1 %cmp +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; horizontal float ops + +;; sse intrinsic +declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone + +define float @__reduce_add_float(<4 x float>) nounwind readonly alwaysinline { + %v1 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %0, <4 x float> %0) + %v2 = call 
<4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %v1, <4 x float> %v1) + %scalar = extractelement <4 x float> %v2, i32 0 + ret float %scalar +} + +define float @__reduce_min_float(<4 x float>) nounwind readnone { + reduce4(float, @__min_varying_float, @__min_uniform_float) +} + +define float @__reduce_max_float(<4 x float>) nounwind readnone { + reduce4(float, @__max_varying_float, @__max_uniform_float) +} + +reduce_equal(4) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; horizontal int8 ops + +declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone + +define i16 @__reduce_add_int8(<4 x i8>) nounwind readnone alwaysinline +{ + %wide8 = shufflevector <4 x i8> %0, <4 x i8> zeroinitializer, + <16 x i32> + %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8, + <16 x i8> zeroinitializer) + %r0 = extractelement <2 x i64> %rv, i32 0 + %r1 = extractelement <2 x i64> %rv, i32 1 + %r = add i64 %r0, %r1 + %r16 = trunc i64 %r to i16 + ret i16 %r16 +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; horizontal int16 ops + +define internal <4 x i16> @__add_varying_i16(<4 x i16>, + <4 x i16>) nounwind readnone alwaysinline { + %r = add <4 x i16> %0, %1 + ret <4 x i16> %r +} + +define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline { + %r = add i16 %0, %1 + ret i16 %r +} + +define i16 @__reduce_add_int16(<4 x i16>) nounwind readnone alwaysinline { + reduce4(i16, @__add_varying_i16, @__add_uniform_i16) +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; horizontal int32 ops + +define <4 x i32> @__add_varying_int32(<4 x i32>, + <4 x i32>) nounwind readnone alwaysinline { + %s = add <4 x i32> %0, %1 + ret <4 x i32> %s +} + +define i32 @__add_uniform_int32(i32, i32) nounwind readnone alwaysinline { + %s = add i32 %0, %1 + ret i32 %s +} + +define i32 @__reduce_add_int32(<4 x i32>) nounwind readnone alwaysinline { + reduce4(i32, @__add_varying_int32, @__add_uniform_int32) +} + + +define i32 @__reduce_min_int32(<4 x i32>) nounwind readnone alwaysinline { + reduce4(i32, @__min_varying_int32, @__min_uniform_int32) +} + + +define i32 @__reduce_max_int32(<4 x i32>) nounwind readnone alwaysinline { + reduce4(i32, @__max_varying_int32, @__max_uniform_int32) +} + +define i32 @__reduce_min_uint32(<4 x i32>) nounwind readnone alwaysinline { + reduce4(i32, @__min_varying_uint32, @__min_uniform_uint32) +} + +define i32 @__reduce_max_uint32(<4 x i32>) nounwind readnone alwaysinline { + reduce4(i32, @__max_varying_uint32, @__max_uniform_uint32) +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; horizontal double ops + +declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounwind readnone + +define double @__reduce_add_double(<4 x double>) nounwind readonly alwaysinline { + %v0 = shufflevector <4 x double> %0, <4 x double> undef, + <4 x i32> + %v1 = shufflevector <4 x double> , <4 x double> undef, + <4 x i32> +;; %v1 = <4 x double> + %sum0 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %v0, <4 x double> %v1) + %sum1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %sum0, <4 x double> %sum0) + %final0 = extractelement <4 x double> %sum1, i32 0 + %final1 = extractelement <4 x double> %sum1, i32 2 + %sum = fadd double %final0, %final1 + + ret double %sum +} + +define double @__reduce_min_double(<4 x double>) nounwind readnone alwaysinline { + reduce4(double, @__min_varying_double, 
@__min_uniform_double) +} + + +define double @__reduce_max_double(<4 x double>) nounwind readnone alwaysinline { + reduce4(double, @__max_varying_double, @__max_uniform_double) +} + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; horizontal int64 ops + +define <4 x i64> @__add_varying_int64(<4 x i64>, + <4 x i64>) nounwind readnone alwaysinline { + %s = add <4 x i64> %0, %1 + ret <4 x i64> %s +} + +define i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline { + %s = add i64 %0, %1 + ret i64 %s +} + +define i64 @__reduce_add_int64(<4 x i64>) nounwind readnone alwaysinline { + reduce4(i64, @__add_varying_int64, @__add_uniform_int64) +} + + +define i64 @__reduce_min_int64(<4 x i64>) nounwind readnone alwaysinline { + reduce4(i64, @__min_varying_int64, @__min_uniform_int64) +} + + +define i64 @__reduce_max_int64(<4 x i64>) nounwind readnone alwaysinline { + reduce4(i64, @__max_varying_int64, @__max_uniform_int64) +} + + +define i64 @__reduce_min_uint64(<4 x i64>) nounwind readnone alwaysinline { + reduce4(i64, @__min_varying_uint64, @__min_uniform_uint64) +} + + +define i64 @__reduce_max_uint64(<4 x i64>) nounwind readnone alwaysinline { + reduce4(i64, @__max_varying_uint64, @__max_uniform_uint64) +} + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; unaligned loads/loads+broadcasts + + +; no masked load instruction for i8 and i16 types?? +masked_load(i8, 1) +masked_load(i16, 2) + +;; avx intrinsics +declare <4 x float> @llvm.x86.avx.maskload.ps(i8 *, <4 x float> %mask) +declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8 *, <4 x double> %mask) + +define <4 x i32> @__masked_load_i32(i8 *, <4 x i64> %mask64) nounwind alwaysinline { + %mask = trunc <4 x i64> %mask64 to <4 x i32> + %floatmask = bitcast <4 x i32> %mask to <4 x float> + %floatval = call <4 x float> @llvm.x86.avx.maskload.ps(i8 * %0, <4 x float> %floatmask) + %retval = bitcast <4 x float> %floatval to <4 x i32> + ret <4 x i32> %retval +} + + +define <4 x i64> @__masked_load_i64(i8 *, <4 x i64> %mask) nounwind alwaysinline { + %doublemask = bitcast <4 x i64> %mask to <4 x double> + %doubleval = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %0, <4 x double> %doublemask) + %retval = bitcast <4 x double> %doubleval to <4 x i64> + ret <4 x i64> %retval +} + +masked_load_float_double() + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; masked store + +gen_masked_store(i8) +gen_masked_store(i16) + +; note that mask is the 2nd parameter, not the 3rd one!! 
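The masked load/store wiring above relies on the AVX vmaskmov semantics: only the sign bit of each mask element matters, which is why the 64-bit ispc mask lanes can simply be truncated (for i32) or bitcast (for i64) and reinterpreted as float/double masks — and, as the comment warns, in the maskstore intrinsics the mask is the second operand and the value the third. A scalar model of the store behaviour (a sketch, assuming all-ones/all-zeros mask lanes as ispc produces):

    #include <stdint.h>

    /* Model of __masked_store_i32 on top of vmaskmovps: a lane is written
       only when the sign bit of its 32-bit mask is set.  ispc mask lanes
       are all-ones or all-zeros, so the sign-bit test is equivalent to
       testing the whole lane. */
    static void masked_store_i32_model(int32_t *dst, const int32_t val[4],
                                       const uint64_t mask[4]) {
        for (int i = 0; i < 4; ++i) {
            uint32_t m = (uint32_t)mask[i];   /* trunc <4 x i64> to <4 x i32> */
            if (m & 0x80000000u)              /* sign bit gates the store     */
                dst[i] = val[i];
        }
    }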
+;; avx intrinsics +declare void @llvm.x86.avx.maskstore.ps (i8 *, <4 x float>, <4 x float>) +declare void @llvm.x86.avx.maskstore.pd.256(i8 *, <4 x double>, <4 x double>) + +define void @__masked_store_i32(<4 x i32>* nocapture, <4 x i32>, + <4 x i64>) nounwind alwaysinline { + %mask32 = trunc <4 x i64> %2 to <4 x i32> + + %ptr = bitcast <4 x i32> * %0 to i8 * + %val = bitcast <4 x i32> %1 to <4 x float> + %mask = bitcast <4 x i32> %mask32 to <4 x float> + call void @llvm.x86.avx.maskstore.ps(i8 * %ptr, <4 x float> %mask, <4 x float> %val) + ret void +} + +define void @__masked_store_i64(<4 x i64>* nocapture, <4 x i64>, + <4 x i64>) nounwind alwaysinline { + %ptr = bitcast <4 x i64> * %0 to i8 * + %val = bitcast <4 x i64> %1 to <4 x double> + %mask = bitcast <4 x i64> %2 to <4 x double> + call void @llvm.x86.avx.maskstore.pd.256(i8 * %ptr, <4 x double> %mask, <4 x double> %val) + ret void +} + + +masked_store_blend_8_16_by_4_mask64() + +;; sse intrinsic +declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, + <4 x float>) nounwind readnone + +define void @__masked_store_blend_i32(<4 x i32>* nocapture, <4 x i32>, + <4 x i64>) nounwind alwaysinline { + %mask = trunc <4 x i64> %2 to <4 x i32> + %mask_as_float = bitcast <4 x i32> %mask to <4 x float> + %oldValue = load <4 x i32>* %0, align 4 + %oldAsFloat = bitcast <4 x i32> %oldValue to <4 x float> + %newAsFloat = bitcast <4 x i32> %1 to <4 x float> + %blend = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %oldAsFloat, + <4 x float> %newAsFloat, + <4 x float> %mask_as_float) + %blendAsInt = bitcast <4 x float> %blend to <4 x i32> + store <4 x i32> %blendAsInt, <4 x i32>* %0, align 4 + ret void +} + +;; avx intrinsic +declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>, + <4 x double>) nounwind readnone + +define void @__masked_store_blend_i64(<4 x i64>* nocapture , <4 x i64>, + <4 x i64>) nounwind alwaysinline { + %mask_as_double = bitcast <4 x i64> %2 to <4 x double> + %oldValue = load <4 x i64>* %0, align 4 + %oldAsDouble = bitcast <4 x i64> %oldValue to <4 x double> + %newAsDouble = bitcast <4 x i64> %1 to <4 x double> + %blend = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %oldAsDouble, + <4 x double> %newAsDouble, + <4 x double> %mask_as_double) + %blendAsInt = bitcast <4 x double> %blend to <4 x i64> + store <4 x i64> %blendAsInt, <4 x i64>* %0, align 4 + ret void +} + +masked_store_float_double() + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; scatter + +gen_scatter(i8) +gen_scatter(i16) +gen_scatter(i32) +gen_scatter(float) +gen_scatter(i64) +gen_scatter(double) + + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; double precision min/max + +declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone +declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone + +define <4 x double> @__min_varying_double(<4 x double>, <4 x double>) nounwind readnone alwaysinline { + %call = call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %0, <4 x double> %1) + ret <4 x double> %call +} + +define <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind readnone alwaysinline { + %call = call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %0, <4 x double> %1) + ret <4 x double> %call +} + diff --git a/builtins/target-generic-1.ll b/builtins/target-generic-1.ll index 9b747e2e..910565dd 100644 --- a/builtins/target-generic-1.ll +++ 
b/builtins/target-generic-1.ll @@ -310,6 +310,7 @@ declare double @round (double) nounwind readnone ;declare float @llvm.sqrt.f32(float %Val) declare double @llvm.sqrt.f64(double %Val) declare float @llvm.sin.f32(float %Val) +declare float @llvm.asin.f32(float %Val) declare float @llvm.cos.f32(float %Val) declare float @llvm.sqrt.f32(float %Val) declare float @llvm.exp.f32(float %Val) @@ -471,6 +472,15 @@ define i64 @__popcnt_int64(i64) nounwind readonly alwaysinline { ret i64 %call } +define i8 @__reduce_add_int8(<1 x i8> %v) nounwind readonly alwaysinline { + %r = extractelement <1 x i8> %v, i32 0 + ret i8 %r +} + +define i16 @__reduce_add_int16(<1 x i16> %v) nounwind readonly alwaysinline { + %r = extractelement <1 x i16> %v, i32 0 + ret i16 %r +} define float @__reduce_add_float(<1 x float> %v) nounwind readonly alwaysinline { %r = extractelement <1 x float> %v, i32 0 @@ -642,7 +652,18 @@ define <1 x float> @__rsqrt_varying_float(<1 x float> %v) nounwind readonly alw ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; svml stuff -define <1 x float> @__svml_sin(<1 x float>) nounwind readnone alwaysinline { +declare <1 x float> @__svml_sind(<1 x float>) nounwind readnone alwaysinline +declare <1 x float> @__svml_asind(<1 x float>) nounwind readnone alwaysinline +declare <1 x float> @__svml_cosd(<1 x float>) nounwind readnone alwaysinline +declare void @__svml_sincosd(<1 x float>, <1 x float> *, <1 x float> *) nounwind readnone alwaysinline +declare <1 x float> @__svml_tand(<1 x float>) nounwind readnone alwaysinline +declare <1 x float> @__svml_atand(<1 x float>) nounwind readnone alwaysinline +declare <1 x float> @__svml_atan2d(<1 x float>, <1 x float>) nounwind readnone alwaysinline +declare <1 x float> @__svml_expd(<1 x float>) nounwind readnone alwaysinline +declare <1 x float> @__svml_logd(<1 x float>) nounwind readnone alwaysinline +declare <1 x float> @__svml_powd(<1 x float>, <1 x float>) nounwind readnone alwaysinline + +define <1 x float> @__svml_sinf(<1 x float>) nounwind readnone alwaysinline { ;%ret = call <1 x float> @__svml_sinf4(<1 x float> %0) ;ret <1 x float> %ret ;%r = extractelement <1 x float> %0, i32 0 @@ -653,7 +674,18 @@ define <1 x float> @__svml_sin(<1 x float>) nounwind readnone alwaysinline { } -define <1 x float> @__svml_cos(<1 x float>) nounwind readnone alwaysinline { +define <1 x float> @__svml_asinf(<1 x float>) nounwind readnone alwaysinline { + ;%ret = call <1 x float> @__svml_asinf4(<1 x float> %0) + ;ret <1 x float> %ret + ;%r = extractelement <1 x float> %0, i32 0 + ;%s = call float @llvm.asin.f32(float %r) + ;%rv = insertelement <1 x float> undef, float %r, i32 0 + ;ret <1 x float> %rv + unary1to1(float,@llvm.asin.f32) + +} + +define <1 x float> @__svml_cosf(<1 x float>) nounwind readnone alwaysinline { ;%ret = call <1 x float> @__svml_cosf4(<1 x float> %0) ;ret <1 x float> %ret ;%r = extractelement <1 x float> %0, i32 0 @@ -664,18 +696,18 @@ define <1 x float> @__svml_cos(<1 x float>) nounwind readnone alwaysinline { } -define void @__svml_sincos(<1 x float>, <1 x float> *, <1 x float> *) nounwind readnone alwaysinline { +define void @__svml_sincosf(<1 x float>, <1 x float> *, <1 x float> *) nounwind readnone alwaysinline { ; %s = call <1 x float> @__svml_sincosf4(<1 x float> * %2, <1 x float> %0) ; store <1 x float> %s, <1 x float> * %1 ; ret void - %sin = call <1 x float> @__svml_sin (<1 x float> %0) - %cos = call <1 x float> @__svml_cos (<1 x float> %0) + %sin = call <1 x float> @__svml_sinf(<1 x float> %0) + %cos = 
call <1 x float> @__svml_cosf(<1 x float> %0) store <1 x float> %sin, <1 x float> * %1 store <1 x float> %cos, <1 x float> * %2 ret void } -define <1 x float> @__svml_tan(<1 x float>) nounwind readnone alwaysinline { +define <1 x float> @__svml_tanf(<1 x float>) nounwind readnone alwaysinline { ;%ret = call <1 x float> @__svml_tanf4(<1 x float> %0) ;ret <1 x float> %ret ;%r = extractelement <1 x float> %0, i32 0 @@ -687,7 +719,7 @@ define <1 x float> @__svml_tan(<1 x float>) nounwind readnone alwaysinline { ret <1 x float > %0 } -define <1 x float> @__svml_atan(<1 x float>) nounwind readnone alwaysinline { +define <1 x float> @__svml_atanf(<1 x float>) nounwind readnone alwaysinline { ; %ret = call <1 x float> @__svml_atanf4(<1 x float> %0) ; ret <1 x float> %ret ;%r = extractelement <1 x float> %0, i32 0 @@ -700,7 +732,7 @@ define <1 x float> @__svml_atan(<1 x float>) nounwind readnone alwaysinline { } -define <1 x float> @__svml_atan2(<1 x float>, <1 x float>) nounwind readnone alwaysinline { +define <1 x float> @__svml_atan2f(<1 x float>, <1 x float>) nounwind readnone alwaysinline { ;%ret = call <1 x float> @__svml_atan2f4(<1 x float> %0, <1 x float> %1) ;ret <1 x float> %ret ;%y = extractelement <1 x float> %0, i32 0 @@ -713,19 +745,19 @@ define <1 x float> @__svml_atan2(<1 x float>, <1 x float>) nounwind readnone al ret <1 x float > %0 } -define <1 x float> @__svml_exp(<1 x float>) nounwind readnone alwaysinline { +define <1 x float> @__svml_expf(<1 x float>) nounwind readnone alwaysinline { ;%ret = call <1 x float> @__svml_expf4(<1 x float> %0) ;ret <1 x float> %ret unary1to1(float, @llvm.exp.f32) } -define <1 x float> @__svml_log(<1 x float>) nounwind readnone alwaysinline { +define <1 x float> @__svml_logf(<1 x float>) nounwind readnone alwaysinline { ;%ret = call <1 x float> @__svml_logf4(<1 x float> %0) ;ret <1 x float> %ret unary1to1(float, @llvm.log.f32) } -define <1 x float> @__svml_pow(<1 x float>, <1 x float>) nounwind readnone alwaysinline { +define <1 x float> @__svml_powf(<1 x float>, <1 x float>) nounwind readnone alwaysinline { ;%ret = call <1 x float> @__svml_powf4(<1 x float> %0, <1 x float> %1) ;ret <1 x float> %ret %r = extractelement <1 x float> %0, i32 0 @@ -953,3 +985,9 @@ declare float @__half_to_float_uniform(i16 %v) nounwind readnone declare @__half_to_float_varying( %v) nounwind readnone declare i16 @__float_to_half_uniform(float %v) nounwind readnone declare @__float_to_half_varying( %v) nounwind readnone + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +define_avgs() + diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll index bbf1b842..2a5d1b32 100644 --- a/builtins/target-generic-common.ll +++ b/builtins/target-generic-common.ll @@ -202,21 +202,15 @@ declare i64 @__count_trailing_zeros_i64(i64) nounwind readnone declare i32 @__count_leading_zeros_i32(i32) nounwind readnone declare i64 @__count_leading_zeros_i64(i64) nounwind readnone -;; svml - ; FIXME: need either to wire these up to the 8-wide SVML entrypoints, ; or, use the macro to call the 4-wide ones twice with our 8-wide ; vectors... 
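One note on the generic-1 target above: its __svml_sincosf has no fused sincos entry point to call, so it simply invokes the renamed single-lane sin and cos wrappers and writes the two results through the output pointers. The equivalent plain-C fallback (a sketch only, not part of the patch):

    #include <math.h>

    /* Fallback sincos built from separate sin and cos calls, mirroring
       the structure of __svml_sincosf in target-generic-1.ll. */
    static void svml_sincosf_model(float x, float *s, float *c) {
        *s = sinf(x);
        *c = cosf(x);
    }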
-declare @__svml_sin() -declare @__svml_cos() -declare void @__svml_sincos(, *, *) -declare @__svml_tan() -declare @__svml_atan() -declare @__svml_atan2(, ) -declare @__svml_exp() -declare @__svml_log() -declare @__svml_pow(, ) +;; svml + +include(`svml.m4') +svml_stubs(float,f,WIDTH) +svml_stubs(double,d,WIDTH) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; reductions @@ -226,14 +220,16 @@ declare i1 @__any() nounwind readnone declare i1 @__all() nounwind readnone declare i1 @__none() nounwind readnone +declare i16 @__reduce_add_int8() nounwind readnone +declare i32 @__reduce_add_int16() nounwind readnone + declare float @__reduce_add_float() nounwind readnone declare float @__reduce_min_float() nounwind readnone declare float @__reduce_max_float() nounwind readnone -declare i32 @__reduce_add_int32() nounwind readnone +declare i64 @__reduce_add_int32() nounwind readnone declare i32 @__reduce_min_int32() nounwind readnone declare i32 @__reduce_max_int32() nounwind readnone - declare i32 @__reduce_min_uint32() nounwind readnone declare i32 @__reduce_max_uint32() nounwind readnone @@ -244,7 +240,6 @@ declare double @__reduce_max_double() nounwind readnone declare i64 @__reduce_add_int64() nounwind readnone declare i64 @__reduce_min_int64() nounwind readnone declare i64 @__reduce_max_int64() nounwind readnone - declare i64 @__reduce_min_uint64() nounwind readnone declare i64 @__reduce_max_uint64() nounwind readnone @@ -379,3 +374,8 @@ declare void @__prefetch_read_uniform_2(i8 * nocapture) nounwind declare void @__prefetch_read_uniform_3(i8 * nocapture) nounwind declare void @__prefetch_read_uniform_nt(i8 * nocapture) nounwind +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +define_avgs() + diff --git a/builtins/target-neon-16.ll b/builtins/target-neon-16.ll new file mode 100644 index 00000000..a0575927 --- /dev/null +++ b/builtins/target-neon-16.ll @@ -0,0 +1,517 @@ +;; +;; target-neon-16.ll +;; +;; Copyright(c) 2013 Google, Inc. +;; +;; All rights reserved. +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are +;; met: +;; +;; * Redistributions of source code must retain the above copyright +;; notice, this list of conditions and the following disclaimer. +;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; +;; * Neither the name of Matt Pharr nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +define(`WIDTH',`8') +define(`MASK',`i16') + +include(`util.m4') +include(`target-neon-common.ll') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; half conversion routines + +define <8 x float> @__half_to_float_varying(<8 x i16> %v) nounwind readnone { + unary4to8conv(r, i16, float, @llvm.arm.neon.vcvthf2fp, %v) + ret <8 x float> %r +} + +define <8 x i16> @__float_to_half_varying(<8 x float> %v) nounwind readnone { + unary4to8conv(r, float, i16, @llvm.arm.neon.vcvtfp2hf, %v) + ret <8 x i16> %r +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; math + +;; round/floor/ceil + +;; FIXME: grabbed these from the sse2 target, which does not have native +;; instructions for these. Is there a better approach for NEON? + +define <8 x float> @__round_varying_float(<8 x float>) nounwind readonly alwaysinline { + %float_to_int_bitcast.i.i.i.i = bitcast <8 x float> %0 to <8 x i32> + %bitop.i.i = and <8 x i32> %float_to_int_bitcast.i.i.i.i, + + %bitop.i = xor <8 x i32> %float_to_int_bitcast.i.i.i.i, %bitop.i.i + %int_to_float_bitcast.i.i40.i = bitcast <8 x i32> %bitop.i to <8 x float> + %binop.i = fadd <8 x float> %int_to_float_bitcast.i.i40.i, + + %binop21.i = fadd <8 x float> %binop.i, + + %float_to_int_bitcast.i.i.i = bitcast <8 x float> %binop21.i to <8 x i32> + %bitop31.i = xor <8 x i32> %float_to_int_bitcast.i.i.i, %bitop.i.i + %int_to_float_bitcast.i.i.i = bitcast <8 x i32> %bitop31.i to <8 x float> + ret <8 x float> %int_to_float_bitcast.i.i.i +} + +define <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline { + %calltmp.i = tail call <8 x float> @__round_varying_float(<8 x float> %0) nounwind + %bincmp.i = fcmp ogt <8 x float> %calltmp.i, %0 + %val_to_boolvec32.i = sext <8 x i1> %bincmp.i to <8 x i32> + %bitop.i = and <8 x i32> %val_to_boolvec32.i, + + %int_to_float_bitcast.i.i.i = bitcast <8 x i32> %bitop.i to <8 x float> + %binop.i = fadd <8 x float> %calltmp.i, %int_to_float_bitcast.i.i.i + ret <8 x float> %binop.i +} + +define <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline { + %calltmp.i = tail call <8 x float> @__round_varying_float(<8 x float> %0) nounwind + %bincmp.i = fcmp olt <8 x float> %calltmp.i, %0 + %val_to_boolvec32.i = sext <8 x i1> %bincmp.i to <8 x i32> + %bitop.i = and <8 x i32> %val_to_boolvec32.i, + + %int_to_float_bitcast.i.i.i = bitcast <8 x i32> %bitop.i to <8 x float> + %binop.i = fadd <8 x float> %calltmp.i, %int_to_float_bitcast.i.i.i + ret <8 x float> %binop.i +} + +;; FIXME: rounding doubles and double vectors needs to be implemented +declare @__round_varying_double() nounwind readnone +declare @__floor_varying_double() nounwind readnone +declare @__ceil_varying_double() nounwind readnone + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; min/max + +declare <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float>, <4 x float>) nounwind readnone +declare <4 x float> 
@llvm.arm.neon.vmaxs.v4f32(<4 x float>, <4 x float>) nounwind readnone + +define @__max_varying_float(, + ) nounwind readnone { + binary4to8(r, float, @llvm.arm.neon.vmaxs.v4f32, %0, %1) + ret %r +} + +define @__min_varying_float(, + ) nounwind readnone { + binary4to8(r, float, @llvm.arm.neon.vmins.v4f32, %0, %1) + ret %r +} + +declare <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone + +define @__min_varying_int32(, ) nounwind readnone { + binary4to8(r, i32, @llvm.arm.neon.vmins.v4i32, %0, %1) + ret %r +} + +define @__max_varying_int32(, ) nounwind readnone { + binary4to8(r, i32, @llvm.arm.neon.vmaxs.v4i32, %0, %1) + ret %r +} + +define @__min_varying_uint32(, ) nounwind readnone { + binary4to8(r, i32, @llvm.arm.neon.vminu.v4i32, %0, %1) + ret %r +} + +define @__max_varying_uint32(, ) nounwind readnone { + binary4to8(r, i32, @llvm.arm.neon.vmaxu.v4i32, %0, %1) + ret %r +} + +;; sqrt/rsqrt/rcp + +declare <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float>) nounwind readnone +declare <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float>, <4 x float>) nounwind readnone + +define @__rcp_varying_float( %d) nounwind readnone { + unary4to8(x0, float, @llvm.arm.neon.vrecpe.v4f32, %d) + binary4to8(x0_nr, float, @llvm.arm.neon.vrecps.v4f32, %d, %x0) + %x1 = fmul %x0, %x0_nr + binary4to8(x1_nr, float, @llvm.arm.neon.vrecps.v4f32, %d, %x1) + %x2 = fmul %x1, %x1_nr + ret %x2 +} + +declare <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float>) nounwind readnone +declare <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float>, <4 x float>) nounwind readnone + +define @__rsqrt_varying_float( %d) nounwind readnone { + unary4to8(x0, float, @llvm.arm.neon.vrsqrte.v4f32, %d) + %x0_2 = fmul %x0, %x0 + binary4to8(x0_nr, float, @llvm.arm.neon.vrsqrts.v4f32, %d, %x0_2) + %x1 = fmul %x0, %x0_nr + %x1_2 = fmul %x1, %x1 + binary4to8(x1_nr, float, @llvm.arm.neon.vrsqrts.v4f32, %d, %x1_2) + %x2 = fmul %x1, %x1_nr + ret %x2 +} + +define float @__rsqrt_uniform_float(float) nounwind readnone { + %v1 = bitcast float %0 to <1 x float> + %vs = shufflevector <1 x float> %v1, <1 x float> undef, + <8 x i32> + %vr = call <8 x float> @__rsqrt_varying_float(<8 x float> %vs) + %r = extractelement <8 x float> %vr, i32 0 + ret float %r +} + +define float @__rcp_uniform_float(float) nounwind readnone { + %v1 = bitcast float %0 to <1 x float> + %vs = shufflevector <1 x float> %v1, <1 x float> undef, + <8 x i32> + %vr = call <8 x float> @__rcp_varying_float(<8 x float> %vs) + %r = extractelement <8 x float> %vr, i32 0 + ret float %r +} + +declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) + +define @__sqrt_varying_float() nounwind readnone { + unary4to8(result, float, @llvm.sqrt.v4f32, %0) +;; this returns nan for v=0, which is undesirable.. 
+;; %rsqrt = call @__rsqrt_varying_float( %0) +;; %result = fmul <4 x float> %rsqrt, %0 + ret <8 x float> %result +} + +declare <4 x double> @llvm.sqrt.v4f64(<4 x double>) + +define @__sqrt_varying_double() nounwind readnone { + unary4to8(r, double, @llvm.sqrt.v4f64, %0) + ret %r +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; reductions + +define i64 @__movmsk() nounwind readnone { + %and_mask = and %0, + + %v4 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %and_mask) + %v2 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %v4) + %va = extractelement <2 x i64> %v2, i32 0 + %vb = extractelement <2 x i64> %v2, i32 1 + %v = or i64 %va, %vb + ret i64 %v +} + +define i1 @__any() nounwind readnone alwaysinline { + v8tov4(MASK, %0, %v0123, %v4567) + %vor = or <4 x MASK> %v0123, %v4567 + %v0 = extractelement <4 x MASK> %vor, i32 0 + %v1 = extractelement <4 x MASK> %vor, i32 1 + %v2 = extractelement <4 x MASK> %vor, i32 2 + %v3 = extractelement <4 x MASK> %vor, i32 3 + %v01 = or MASK %v0, %v1 + %v23 = or MASK %v2, %v3 + %v = or MASK %v01, %v23 + %cmp = icmp ne MASK %v, 0 + ret i1 %cmp +} + +define i1 @__all() nounwind readnone alwaysinline { + v8tov4(MASK, %0, %v0123, %v4567) + %vand = and <4 x MASK> %v0123, %v4567 + %v0 = extractelement <4 x MASK> %vand, i32 0 + %v1 = extractelement <4 x MASK> %vand, i32 1 + %v2 = extractelement <4 x MASK> %vand, i32 2 + %v3 = extractelement <4 x MASK> %vand, i32 3 + %v01 = and MASK %v0, %v1 + %v23 = and MASK %v2, %v3 + %v = and MASK %v01, %v23 + %cmp = icmp ne MASK %v, 0 + ret i1 %cmp +} + +define i1 @__none() nounwind readnone alwaysinline { + %any = call i1 @__any( %0) + %none = icmp eq i1 %any, 0 + ret i1 %none +} + +;; $1: scalar type +;; $2: vector/vector reduce function (2 x -> ) +;; $3: pairwise vector reduce function (2 x <2 x vec> -> <2 x vec>) +;; $4: scalar reduce function + +define(`neon_reduce', ` + v8tov4($1, %0, %v0123, %v4567) + %v0123_8 = shufflevector <4 x $1> %v0123, <4 x $1> undef, + <8 x i32> + %v4567_8 = shufflevector <4 x $1> %v4567, <4 x $1> undef, + <8 x i32> + %vfirst = call <8 x $1> $2(<8 x $1> %v0123_8, <8 x $1> %v4567_8) + %vfirst_4 = shufflevector <8 x $1> %vfirst, <8 x $1> undef, + <4 x i32> + v4tov2($1, %vfirst_4, %v0, %v1) + %vh = call <2 x $1> $3(<2 x $1> %v0, <2 x $1> %v1) + %vh0 = extractelement <2 x $1> %vh, i32 0 + %vh1 = extractelement <2 x $1> %vh, i32 1 + %r = call $1 $4($1 %vh0, $1 %vh1) + ret $1 %r +') + +declare <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float>, <2 x float>) nounwind readnone + +define internal float @add_f32(float, float) { + %r = fadd float %0, %1 + ret float %r +} + +define internal @__add_varying_float(, ) { + %r = fadd %0, %1 + ret %r +} + +define float @__reduce_add_float() nounwind readnone { + neon_reduce(float, @__add_varying_float, @llvm.arm.neon.vpadd.v2f32, @add_f32) +} + +declare <2 x float> @llvm.arm.neon.vpmins.v2f32(<2 x float>, <2 x float>) nounwind readnone + +define internal float @min_f32(float, float) { + %cmp = fcmp olt float %0, %1 + %r = select i1 %cmp, float %0, float %1 + ret float %r +} + +define float @__reduce_min_float() nounwind readnone { + neon_reduce(float, @__min_varying_float, @llvm.arm.neon.vpmins.v2f32, @min_f32) +} + +declare <2 x float> @llvm.arm.neon.vpmaxs.v2f32(<2 x float>, <2 x float>) nounwind readnone + +define internal float @max_f32(float, float) { + %cmp = fcmp ugt float %0, %1 + %r = select i1 %cmp, float %0, float %1 + ret float %r +} + +define float @__reduce_max_float() nounwind 
readnone { + neon_reduce(float, @__max_varying_float, @llvm.arm.neon.vpmaxs.v2f32, @max_f32) +} + +declare <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16>) nounwind readnone + +define i16 @__reduce_add_int8() nounwind readnone { + %a16 = call <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8> %0) + %a32 = call <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> %a16) + %a0 = extractelement <2 x i32> %a32, i32 0 + %a1 = extractelement <2 x i32> %a32, i32 1 + %r = add i32 %a0, %a1 + %r16 = trunc i32 %r to i16 + ret i16 %r16 +} + +declare <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16() + +define i64 @__reduce_add_int16() nounwind readnone { + %a1 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16( %0) + %a2 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %a1) + %aa = extractelement <2 x i64> %a2, i32 0 + %ab = extractelement <2 x i64> %a2, i32 1 + %r = add i64 %aa, %ab + ret i64 %r +} + +declare <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32>) nounwind readnone + +define i64 @__reduce_add_int32() nounwind readnone { + v8tov4(i32, %0, %va, %vb) + %pa = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %va) + %pb = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %vb) + %psum = add <2 x i64> %pa, %pb + %a0 = extractelement <2 x i64> %psum, i32 0 + %a1 = extractelement <2 x i64> %psum, i32 1 + %r = add i64 %a0, %a1 + ret i64 %r +} + +declare <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +define internal i32 @min_si32(i32, i32) { + %cmp = icmp slt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__reduce_min_int32() nounwind readnone { + neon_reduce(i32, @__min_varying_int32, @llvm.arm.neon.vpmins.v2i32, @min_si32) +} + +declare <2 x i32> @llvm.arm.neon.vpmaxs.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +define internal i32 @max_si32(i32, i32) { + %cmp = icmp sgt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__reduce_max_int32() nounwind readnone { + neon_reduce(i32, @__max_varying_int32, @llvm.arm.neon.vpmaxs.v2i32, @max_si32) +} + +declare <2 x i32> @llvm.arm.neon.vpminu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +define internal i32 @min_ui32(i32, i32) { + %cmp = icmp ult i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__reduce_min_uint32() nounwind readnone { + neon_reduce(i32, @__min_varying_uint32, @llvm.arm.neon.vpmins.v2i32, @min_ui32) +} + +declare <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +define internal i32 @max_ui32(i32, i32) { + %cmp = icmp ugt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__reduce_max_uint32() nounwind readnone { + neon_reduce(i32, @__max_varying_uint32, @llvm.arm.neon.vpmaxs.v2i32, @max_ui32) +} + +define double @__reduce_add_double() nounwind readnone { + v8tov2(double, %0, %v0, %v1, %v2, %v3) + %v01 = fadd <2 x double> %v0, %v1 + %v23 = fadd <2 x double> %v2, %v3 + %sum = fadd <2 x double> %v01, %v23 + %e0 = extractelement <2 x double> %sum, i32 0 + %e1 = extractelement <2 x double> %sum, i32 1 + %m = fadd double %e0, %e1 + ret double %m +} + +define double @__reduce_min_double() nounwind readnone { + reduce8(double, @__min_varying_double, @__min_uniform_double) +} + +define double @__reduce_max_double() nounwind readnone { + reduce8(double, @__max_varying_double, @__max_uniform_double) +} + +define i64 
@__reduce_add_int64() nounwind readnone { + v8tov2(i64, %0, %v0, %v1, %v2, %v3) + %v01 = add <2 x i64> %v0, %v1 + %v23 = add <2 x i64> %v2, %v3 + %sum = add <2 x i64> %v01, %v23 + %e0 = extractelement <2 x i64> %sum, i32 0 + %e1 = extractelement <2 x i64> %sum, i32 1 + %m = add i64 %e0, %e1 + ret i64 %m +} + +define i64 @__reduce_min_int64() nounwind readnone { + reduce8(i64, @__min_varying_int64, @__min_uniform_int64) +} + +define i64 @__reduce_max_int64() nounwind readnone { + reduce8(i64, @__max_varying_int64, @__max_uniform_int64) +} + +define i64 @__reduce_min_uint64() nounwind readnone { + reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64) +} + +define i64 @__reduce_max_uint64() nounwind readnone { + reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64) +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 + +declare <8 x i8> @llvm.arm.neon.vrhaddu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone + +define <8 x i8> @__avg_up_uint8(<8 x i8>, <8 x i8>) nounwind readnone { + %r = call <8 x i8> @llvm.arm.neon.vrhaddu.v8i8(<8 x i8> %0, <8 x i8> %1) + ret <8 x i8> %r +} + +declare <8 x i8> @llvm.arm.neon.vrhadds.v8i8(<8 x i8>, <8 x i8>) nounwind readnone + +define <8 x i8> @__avg_up_int8(<8 x i8>, <8 x i8>) nounwind readnone { + %r = call <8 x i8> @llvm.arm.neon.vrhadds.v8i8(<8 x i8> %0, <8 x i8> %1) + ret <8 x i8> %r +} + +declare <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone + +define <8 x i8> @__avg_down_uint8(<8 x i8>, <8 x i8>) nounwind readnone { + %r = call <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8> %0, <8 x i8> %1) + ret <8 x i8> %r +} + +declare <8 x i8> @llvm.arm.neon.vhadds.v8i8(<8 x i8>, <8 x i8>) nounwind readnone + +define <8 x i8> @__avg_down_int8(<8 x i8>, <8 x i8>) nounwind readnone { + %r = call <8 x i8> @llvm.arm.neon.vhadds.v8i8(<8 x i8> %0, <8 x i8> %1) + ret <8 x i8> %r +} + +declare <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone + +define <8 x i16> @__avg_up_uint16(<8 x i16>, <8 x i16>) nounwind readnone { + %r = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> %0, <8 x i16> %1) + ret <8 x i16> %r +} + +declare <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone + +define <8 x i16> @__avg_up_int16(<8 x i16>, <8 x i16>) nounwind readnone { + %r = call <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16> %0, <8 x i16> %1) + ret <8 x i16> %r +} + +declare <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone + +define <8 x i16> @__avg_down_uint16(<8 x i16>, <8 x i16>) nounwind readnone { + %r = call <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16> %0, <8 x i16> %1) + ret <8 x i16> %r +} + +declare <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone + +define <8 x i16> @__avg_down_int16(<8 x i16>, <8 x i16>) nounwind readnone { + %r = call <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16> %0, <8 x i16> %1) + ret <8 x i16> %r +} diff --git a/builtins/target-neon.ll b/builtins/target-neon-32.ll similarity index 60% rename from builtins/target-neon.ll rename to builtins/target-neon-32.ll index e70b774b..30b062c9 100644 --- a/builtins/target-neon.ll +++ b/builtins/target-neon-32.ll @@ -1,5 +1,5 @@ ;; -;; target-neon.ll +;; target-neon-32.ll ;; ;; Copyright(c) 2012-2013 Matt Pharr ;; Copyright(c) 2013 Google, Inc. @@ -34,52 +34,20 @@ ;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
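The __avg_up_*/__avg_down_* builtins introduced for int8/int16 map directly onto NEON's rounding and truncating halving adds: both forms do the addition in a wider intermediate so it cannot overflow, and they differ only in whether 1 is added before the shift. A scalar model (a sketch; presumably the same semantics define_avgs() provides on the x86 targets, and assuming the usual arithmetic >> on signed values):

    #include <stdint.h>

    /* __avg_up_*   -> vrhadd : (a + b + 1) >> 1   (rounds halves up)
       __avg_down_* -> vhadd  : (a + b)     >> 1   (truncates)        */
    static uint8_t avg_up_uint8(uint8_t a, uint8_t b) {
        return (uint8_t)(((uint16_t)a + b + 1) >> 1);
    }
    static uint8_t avg_down_uint8(uint8_t a, uint8_t b) {
        return (uint8_t)(((uint16_t)a + b) >> 1);
    }
    static int8_t avg_up_int8(int8_t a, int8_t b) {
        return (int8_t)(((int16_t)a + b + 1) >> 1);   /* signed form: vrhadds */
    }
    static int8_t avg_down_int8(int8_t a, int8_t b) {
        return (int8_t)(((int16_t)a + b) >> 1);       /* signed form: vhadds  */
    }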
-target datalayout = "e-p:32:32:32-S32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f16:16:16-f32:32:32-f64:32:64-f128:128:128-v64:32:64-v128:32:128-a0:0:64-n32" - define(`WIDTH',`4') - define(`MASK',`i32') include(`util.m4') - -stdlib_core() -scans() -reduce_equal(WIDTH) -rdrand_decls() -define_shuffles() -aossoa() -ctlztz() +include(`target-neon-common.ll') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; half conversion routines -declare <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float>) nounwind readnone -declare <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16>) nounwind readnone - -define float @__half_to_float_uniform(i16 %v) nounwind readnone { - %v1 = bitcast i16 %v to <1 x i16> - %vec = shufflevector <1 x i16> %v1, <1 x i16> undef, - <4 x i32> - %h = call <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16> %vec) - %r = extractelement <4 x float> %h, i32 0 - ret float %r -} - define <4 x float> @__half_to_float_varying(<4 x i16> %v) nounwind readnone { %r = call <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16> %v) ret <4 x float> %r } -define i16 @__float_to_half_uniform(float %v) nounwind readnone { - %v1 = bitcast float %v to <1 x float> - %vec = shufflevector <1 x float> %v1, <1 x float> undef, - <4 x i32> - %h = call <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float> %vec) - %r = extractelement <4 x i16> %h, i32 0 - ret i16 %r -} - - define <4 x i16> @__float_to_half_varying(<4 x float> %v) nounwind readnone { %r = call <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float> %v) ret <4 x i16> %r @@ -88,48 +56,11 @@ define <4 x i16> @__float_to_half_varying(<4 x float> %v) nounwind readnone { ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; math -define void @__fastmath() nounwind { - ret void -} - ;; round/floor/ceil ;; FIXME: grabbed these from the sse2 target, which does not have native ;; instructions for these. Is there a better approach for NEON? 
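The round/floor/ceil code carried over here relies on the add-2^23 trick visible in the (now removed) uniform versions that follow: peel off the sign bit, add and then subtract 8388608.0f (2^23) so the significand can no longer hold fraction bits and the value snaps to the nearest integer under the current rounding mode, then restore the sign; floor and ceil are derived by correcting the rounded result by -1.0 or +1.0 where needed. A scalar model (a sketch, valid for |x| < 2^23):

    #include <stdint.h>
    #include <string.h>

    /* Model of __round_uniform_float / __round_varying_float: adding and
       subtracting 2^23 rounds to the nearest integer (ties to even under
       the default FP rounding mode); the sign bit is stripped first and
       re-applied afterwards so negative inputs work too. */
    static float round_model(float x) {
        uint32_t bits, sign;
        memcpy(&bits, &x, sizeof bits);
        sign = bits & 0x80000000u;              /* and  ..., -2147483648 */
        bits ^= sign;                           /* xor: gives |x|        */
        float ax;
        memcpy(&ax, &bits, sizeof ax);
        ax = (ax + 8388608.0f) - 8388608.0f;    /* fadd 2^23, fadd -2^23 */
        memcpy(&bits, &ax, sizeof bits);
        bits |= sign;                           /* put the sign back     */
        memcpy(&x, &bits, sizeof x);
        return x;
    }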
-define float @__round_uniform_float(float) nounwind readonly alwaysinline { - %float_to_int_bitcast.i.i.i.i = bitcast float %0 to i32 - %bitop.i.i = and i32 %float_to_int_bitcast.i.i.i.i, -2147483648 - %bitop.i = xor i32 %bitop.i.i, %float_to_int_bitcast.i.i.i.i - %int_to_float_bitcast.i.i40.i = bitcast i32 %bitop.i to float - %binop.i = fadd float %int_to_float_bitcast.i.i40.i, 8.388608e+06 - %binop21.i = fadd float %binop.i, -8.388608e+06 - %float_to_int_bitcast.i.i.i = bitcast float %binop21.i to i32 - %bitop31.i = xor i32 %float_to_int_bitcast.i.i.i, %bitop.i.i - %int_to_float_bitcast.i.i.i = bitcast i32 %bitop31.i to float - ret float %int_to_float_bitcast.i.i.i -} - -define float @__floor_uniform_float(float) nounwind readonly alwaysinline { - %calltmp.i = tail call float @__round_uniform_float(float %0) nounwind - %bincmp.i = fcmp ogt float %calltmp.i, %0 - %selectexpr.i = sext i1 %bincmp.i to i32 - %bitop.i = and i32 %selectexpr.i, -1082130432 - %int_to_float_bitcast.i.i.i = bitcast i32 %bitop.i to float - %binop.i = fadd float %calltmp.i, %int_to_float_bitcast.i.i.i - ret float %binop.i -} - -define float @__ceil_uniform_float(float) nounwind readonly alwaysinline { - %calltmp.i = tail call float @__round_uniform_float(float %0) nounwind - %bincmp.i = fcmp olt float %calltmp.i, %0 - %selectexpr.i = sext i1 %bincmp.i to i32 - %bitop.i = and i32 %selectexpr.i, 1065353216 - %int_to_float_bitcast.i.i.i = bitcast i32 %bitop.i to float - %binop.i = fadd float %calltmp.i, %int_to_float_bitcast.i.i.i - ret float %binop.i -} - define <4 x float> @__round_varying_float(<4 x float>) nounwind readonly alwaysinline { %float_to_int_bitcast.i.i.i.i = bitcast <4 x float> %0 to <4 x i32> %bitop.i.i = and <4 x i32> %float_to_int_bitcast.i.i.i.i, @@ -164,10 +95,6 @@ define <4 x float> @__ceil_varying_float(<4 x float>) nounwind readonly alwaysin } ;; FIXME: rounding doubles and double vectors needs to be implemented -declare double @__round_uniform_double(double) nounwind readnone -declare double @__floor_uniform_double(double) nounwind readnone -declare double @__ceil_uniform_double(double) nounwind readnone - declare @__round_varying_double() nounwind readnone declare @__floor_varying_double() nounwind readnone declare @__ceil_varying_double() nounwind readnone @@ -175,78 +102,6 @@ declare @__ceil_varying_double() nounwind readn ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; min/max -define float @__max_uniform_float(float, float) nounwind readnone { - %cmp = fcmp ugt float %0, %1 - %r = select i1 %cmp, float %0, float %1 - ret float %r -} - -define float @__min_uniform_float(float, float) nounwind readnone { - %cmp = fcmp ult float %0, %1 - %r = select i1 %cmp, float %0, float %1 - ret float %r -} - -define i32 @__min_uniform_int32(i32, i32) nounwind readnone { - %cmp = icmp slt i32 %0, %1 - %r = select i1 %cmp, i32 %0, i32 %1 - ret i32 %r -} - -define i32 @__max_uniform_int32(i32, i32) nounwind readnone { - %cmp = icmp sgt i32 %0, %1 - %r = select i1 %cmp, i32 %0, i32 %1 - ret i32 %r -} - -define i32 @__min_uniform_uint32(i32, i32) nounwind readnone { - %cmp = icmp ult i32 %0, %1 - %r = select i1 %cmp, i32 %0, i32 %1 - ret i32 %r -} - -define i32 @__max_uniform_uint32(i32, i32) nounwind readnone { - %cmp = icmp ugt i32 %0, %1 - %r = select i1 %cmp, i32 %0, i32 %1 - ret i32 %r -} - -define i64 @__min_uniform_int64(i64, i64) nounwind readnone { - %cmp = icmp slt i64 %0, %1 - %r = select i1 %cmp, i64 %0, i64 %1 - ret i64 %r -} - -define i64 @__max_uniform_int64(i64, 
i64) nounwind readnone { - %cmp = icmp sgt i64 %0, %1 - %r = select i1 %cmp, i64 %0, i64 %1 - ret i64 %r -} - -define i64 @__min_uniform_uint64(i64, i64) nounwind readnone { - %cmp = icmp ult i64 %0, %1 - %r = select i1 %cmp, i64 %0, i64 %1 - ret i64 %r -} - -define i64 @__max_uniform_uint64(i64, i64) nounwind readnone { - %cmp = icmp ugt i64 %0, %1 - %r = select i1 %cmp, i64 %0, i64 %1 - ret i64 %r -} - -define double @__min_uniform_double(double, double) nounwind readnone { - %cmp = fcmp olt double %0, %1 - %r = select i1 %cmp, double %0, double %1 - ret double %r -} - -define double @__max_uniform_double(double, double) nounwind readnone { - %cmp = fcmp ogt double %0, %1 - %r = select i1 %cmp, double %0, double %1 - ret double %r -} - declare <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float>, <4 x float>) nounwind readnone declare <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float>, <4 x float>) nounwind readnone @@ -287,44 +142,6 @@ define @__max_varying_uint32(, ) nounwin ret <4 x i32> %r } -define @__min_varying_int64(, ) nounwind readnone { - %m = icmp slt %0, %1 - %r = select %m, %0, %1 - ret %r -} - -define @__max_varying_int64(, ) nounwind readnone { - %m = icmp sgt %0, %1 - %r = select %m, %0, %1 - ret %r -} - -define @__min_varying_uint64(, ) nounwind readnone { - %m = icmp ult %0, %1 - %r = select %m, %0, %1 - ret %r -} - -define @__max_varying_uint64(, ) nounwind readnone { - %m = icmp ugt %0, %1 - %r = select %m, %0, %1 - ret %r -} - -define @__min_varying_double(, - ) nounwind readnone { - %m = fcmp olt %0, %1 - %r = select %m, %0, %1 - ret %r -} - -define @__max_varying_double(, - ) nounwind readnone { - %m = fcmp ogt %0, %1 - %r = select %m, %0, %1 - ret %r -} - ;; sqrt/rsqrt/rcp declare <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float>) nounwind readnone @@ -371,13 +188,6 @@ define float @__rcp_uniform_float(float) nounwind readnone { ret float %r } -declare float @llvm.sqrt.f32(float) - -define float @__sqrt_uniform_float(float) nounwind readnone { - %r = call float @llvm.sqrt.f32(float %0) - ret float %r -} - declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) define @__sqrt_varying_float() nounwind readnone { @@ -388,13 +198,6 @@ define @__sqrt_varying_float() nounwind readnone ret <4 x float> %result } -declare double @llvm.sqrt.f64(double) - -define double @__sqrt_uniform_double(double) nounwind readnone { - %r = call double @llvm.sqrt.f64(double %0) - ret double %r -} - declare <4 x double> @llvm.sqrt.v4f64(<4 x double>) define @__sqrt_varying_double() nounwind readnone { @@ -402,21 +205,6 @@ define @__sqrt_varying_double() nounwind readno ret <4 x double> %r } -;; bit ops - -declare i32 @llvm.ctpop.i32(i32) nounwind readnone -declare i64 @llvm.ctpop.i64(i64) nounwind readnone - -define i32 @__popcnt_int32(i32) nounwind readnone { - %v = call i32 @llvm.ctpop.i32(i32 %0) - ret i32 %v -} - -define i64 @__popcnt_int64(i64) nounwind readnone { - %v = call i64 @llvm.ctpop.i64(i64 %0) - ret i64 %v -} - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; reductions @@ -509,15 +297,38 @@ define float @__reduce_max_float(<4 x float>) nounwind readnone { neon_reduce(float, @llvm.arm.neon.vpmaxs.v2f32, @max_f32) } -define internal i32 @add_i32(i32, i32) { - %r = add i32 %0, %1 +declare <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8>) nounwind readnone + +define i16 @__reduce_add_int8() nounwind readnone { + %v8 = shufflevector <4 x i8> %0, <4 x i8> zeroinitializer, + <8 x i32> + %a16 = call <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8> 
%v8) + %a32 = call <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> %a16) + %a0 = extractelement <2 x i32> %a32, i32 0 + %a1 = extractelement <2 x i32> %a32, i32 1 + %r = add i32 %a0, %a1 + %r16 = trunc i32 %r to i16 + ret i16 %r16 +} + +declare <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16>) nounwind readnone + +define i32 @__reduce_add_int16() nounwind readnone { + %a32 = call <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> %0) + %a0 = extractelement <2 x i32> %a32, i32 0 + %a1 = extractelement <2 x i32> %a32, i32 1 + %r = add i32 %a0, %a1 ret i32 %r } -declare <2 x i32> @llvm.arm.neon.vpadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32>) nounwind readnone -define i32 @__reduce_add_int32() nounwind readnone { - neon_reduce(i32, @llvm.arm.neon.vpadd.v2i32, @add_i32) +define i64 @__reduce_add_int32() nounwind readnone { + %a64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %0) + %a0 = extractelement <2 x i64> %a64, i32 0 + %a1 = extractelement <2 x i64> %a64, i32 1 + %r = add i64 %a0, %a1 + ret i64 %r } declare <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32>, <2 x i32>) nounwind readnone @@ -617,90 +428,60 @@ define i64 @__reduce_max_uint64(<4 x i64>) nounwind readnone { } ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; unaligned loads/loads+broadcasts +;; int8/int16 -masked_load(i8, 1) -masked_load(i16, 2) -masked_load(i32, 4) -masked_load(float, 4) -masked_load(i64, 8) -masked_load(double, 8) +declare <4 x i8> @llvm.arm.neon.vrhaddu.v4i8(<4 x i8>, <4 x i8>) nounwind readnone -gen_masked_store(i8) -gen_masked_store(i16) -gen_masked_store(i32) -gen_masked_store(i64) -masked_store_float_double() - -define void @__masked_store_blend_i8(* nocapture %ptr, %new, - %mask) nounwind alwaysinline { - %old = load * %ptr - %mask1 = trunc <4 x MASK> %mask to <4 x i1> - %result = select <4 x i1> %mask1, <4 x i8> %new, <4 x i8> %old - store %result, * %ptr - ret void +define <4 x i8> @__avg_up_uint8(<4 x i8>, <4 x i8>) nounwind readnone { + %r = call <4 x i8> @llvm.arm.neon.vrhaddu.v4i8(<4 x i8> %0, <4 x i8> %1) + ret <4 x i8> %r } -define void @__masked_store_blend_i16(* nocapture %ptr, %new, - %mask) nounwind alwaysinline { - %old = load * %ptr - %mask1 = trunc <4 x MASK> %mask to <4 x i1> - %result = select <4 x i1> %mask1, <4 x i16> %new, <4 x i16> %old - store %result, * %ptr - ret void +declare <4 x i8> @llvm.arm.neon.vrhadds.v4i8(<4 x i8>, <4 x i8>) nounwind readnone + +define <4 x i8> @__avg_up_int8(<4 x i8>, <4 x i8>) nounwind readnone { + %r = call <4 x i8> @llvm.arm.neon.vrhadds.v4i8(<4 x i8> %0, <4 x i8> %1) + ret <4 x i8> %r } -define void @__masked_store_blend_i32(* nocapture %ptr, %new, - %mask) nounwind alwaysinline { - %old = load * %ptr - %mask1 = trunc <4 x MASK> %mask to <4 x i1> - %result = select <4 x i1> %mask1, <4 x i32> %new, <4 x i32> %old - store %result, * %ptr - ret void +declare <4 x i8> @llvm.arm.neon.vhaddu.v4i8(<4 x i8>, <4 x i8>) nounwind readnone + +define <4 x i8> @__avg_down_uint8(<4 x i8>, <4 x i8>) nounwind readnone { + %r = call <4 x i8> @llvm.arm.neon.vhaddu.v4i8(<4 x i8> %0, <4 x i8> %1) + ret <4 x i8> %r } -define void @__masked_store_blend_i64(* nocapture %ptr, - %new, %mask) nounwind alwaysinline { - %old = load * %ptr - %mask1 = trunc <4 x MASK> %mask to <4 x i1> - %result = select <4 x i1> %mask1, <4 x i64> %new, <4 x i64> %old - store %result, * %ptr - ret void +declare <4 x i8> @llvm.arm.neon.vhadds.v4i8(<4 x i8>, <4 x i8>) 
nounwind readnone + +define <4 x i8> @__avg_down_int8(<4 x i8>, <4 x i8>) nounwind readnone { + %r = call <4 x i8> @llvm.arm.neon.vhadds.v4i8(<4 x i8> %0, <4 x i8> %1) + ret <4 x i8> %r } -;; yuck. We need declarations of these, even though we shouldnt ever -;; actually generate calls to them for the NEON target... +declare <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone -declare @__svml_sin() -declare @__svml_cos() -declare void @__svml_sincos(, *, *) -declare @__svml_tan() -declare @__svml_atan() -declare @__svml_atan2(, ) -declare @__svml_exp() -declare @__svml_log() -declare @__svml_pow(, ) +define <4 x i16> @__avg_up_uint16(<4 x i16>, <4 x i16>) nounwind readnone { + %r = call <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16> %0, <4 x i16> %1) + ret <4 x i16> %r +} -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; gather +declare <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16>, <4 x i16>) nounwind readnone -gen_gather_factored(i8) -gen_gather_factored(i16) -gen_gather_factored(i32) -gen_gather_factored(float) -gen_gather_factored(i64) -gen_gather_factored(double) +define <4 x i16> @__avg_up_int16(<4 x i16>, <4 x i16>) nounwind readnone { + %r = call <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16> %0, <4 x i16> %1) + ret <4 x i16> %r +} -gen_scatter(i8) -gen_scatter(i16) -gen_scatter(i32) -gen_scatter(float) -gen_scatter(i64) -gen_scatter(double) +declare <4 x i16> @llvm.arm.neon.vhaddu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone -packed_load_and_store(4) +define <4 x i16> @__avg_down_uint16(<4 x i16>, <4 x i16>) nounwind readnone { + %r = call <4 x i16> @llvm.arm.neon.vhaddu.v4i16(<4 x i16> %0, <4 x i16> %1) + ret <4 x i16> %r +} -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; prefetch +declare <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16>, <4 x i16>) nounwind readnone -define_prefetches() +define <4 x i16> @__avg_down_int16(<4 x i16>, <4 x i16>) nounwind readnone { + %r = call <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16> %0, <4 x i16> %1) + ret <4 x i16> %r +} diff --git a/builtins/target-neon-8.ll b/builtins/target-neon-8.ll new file mode 100644 index 00000000..2accfe53 --- /dev/null +++ b/builtins/target-neon-8.ll @@ -0,0 +1,583 @@ +;; +;; target-neon-8.ll +;; +;; Copyright(c) 2013 Google, Inc. +;; +;; All rights reserved. +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are +;; met: +;; +;; * Redistributions of source code must retain the above copyright +;; notice, this list of conditions and the following disclaimer. +;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; +;; * Neither the name of Matt Pharr nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +define(`WIDTH',`16') +define(`MASK',`i8') + +include(`util.m4') +include(`target-neon-common.ll') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; half conversion routines + +define <16 x float> @__half_to_float_varying(<16 x i16> %v) nounwind readnone { + unary4to16conv(r, i16, float, @llvm.arm.neon.vcvthf2fp, %v) + ret <16 x float> %r +} + +define <16 x i16> @__float_to_half_varying(<16 x float> %v) nounwind readnone { + unary4to16conv(r, float, i16, @llvm.arm.neon.vcvtfp2hf, %v) + ret <16 x i16> %r +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; math + +;; round/floor/ceil + +;; FIXME: grabbed these from the sse2 target, which does not have native +;; instructions for these. Is there a better approach for NEON? + +define <16 x float> @__round_varying_float(<16 x float>) nounwind readonly alwaysinline { + %float_to_int_bitcast.i.i.i.i = bitcast <16 x float> %0 to <16 x i32> + %bitop.i.i = and <16 x i32> %float_to_int_bitcast.i.i.i.i, + + %bitop.i = xor <16 x i32> %float_to_int_bitcast.i.i.i.i, %bitop.i.i + %int_to_float_bitcast.i.i40.i = bitcast <16 x i32> %bitop.i to <16 x float> + %binop.i = fadd <16 x float> %int_to_float_bitcast.i.i40.i, + + %binop21.i = fadd <16 x float> %binop.i, + + %float_to_int_bitcast.i.i.i = bitcast <16 x float> %binop21.i to <16 x i32> + %bitop31.i = xor <16 x i32> %float_to_int_bitcast.i.i.i, %bitop.i.i + %int_to_float_bitcast.i.i.i = bitcast <16 x i32> %bitop31.i to <16 x float> + ret <16 x float> %int_to_float_bitcast.i.i.i +} + +define <16 x float> @__floor_varying_float(<16 x float>) nounwind readonly alwaysinline { + %calltmp.i = tail call <16 x float> @__round_varying_float(<16 x float> %0) nounwind + %bincmp.i = fcmp ogt <16 x float> %calltmp.i, %0 + %val_to_boolvec32.i = sext <16 x i1> %bincmp.i to <16 x i32> + %bitop.i = and <16 x i32> %val_to_boolvec32.i, + + %int_to_float_bitcast.i.i.i = bitcast <16 x i32> %bitop.i to <16 x float> + %binop.i = fadd <16 x float> %calltmp.i, %int_to_float_bitcast.i.i.i + ret <16 x float> %binop.i +} + +define <16 x float> @__ceil_varying_float(<16 x float>) nounwind readonly alwaysinline { + %calltmp.i = tail call <16 x float> @__round_varying_float(<16 x float> %0) nounwind + %bincmp.i = fcmp olt <16 x float> %calltmp.i, %0 + %val_to_boolvec32.i = sext <16 x i1> %bincmp.i to <16 x i32> + %bitop.i = and <16 x i32> %val_to_boolvec32.i, + + %int_to_float_bitcast.i.i.i = bitcast <16 x i32> %bitop.i to <16 x float> + %binop.i = fadd <16 x float> %calltmp.i, %int_to_float_bitcast.i.i.i + ret <16 x float> %binop.i +} + +;; FIXME: rounding doubles and double vectors needs to be implemented +declare @__round_varying_double() nounwind readnone +declare @__floor_varying_double() nounwind readnone +declare @__ceil_varying_double() nounwind readnone + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; min/max + +declare <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float>, <4 x float>) nounwind 
readnone +declare <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float>, <4 x float>) nounwind readnone + +define @__max_varying_float(, + ) nounwind readnone { + binary4to16(r, float, @llvm.arm.neon.vmaxs.v4f32, %0, %1) + ret %r +} + +define @__min_varying_float(, + ) nounwind readnone { + binary4to16(r, float, @llvm.arm.neon.vmins.v4f32, %0, %1) + ret %r +} + +declare <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone + +define @__min_varying_int32(, ) nounwind readnone { + binary4to16(r, i32, @llvm.arm.neon.vmins.v4i32, %0, %1) + ret %r +} + +define @__max_varying_int32(, ) nounwind readnone { + binary4to16(r, i32, @llvm.arm.neon.vmaxs.v4i32, %0, %1) + ret %r +} + +define @__min_varying_uint32(, ) nounwind readnone { + binary4to16(r, i32, @llvm.arm.neon.vminu.v4i32, %0, %1) + ret %r +} + +define @__max_varying_uint32(, ) nounwind readnone { + binary4to16(r, i32, @llvm.arm.neon.vmaxu.v4i32, %0, %1) + ret %r +} + +;; sqrt/rsqrt/rcp + +declare <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float>) nounwind readnone +declare <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float>, <4 x float>) nounwind readnone + +define @__rcp_varying_float( %d) nounwind readnone { + unary4to16(x0, float, @llvm.arm.neon.vrecpe.v4f32, %d) + binary4to16(x0_nr, float, @llvm.arm.neon.vrecps.v4f32, %d, %x0) + %x1 = fmul %x0, %x0_nr + binary4to16(x1_nr, float, @llvm.arm.neon.vrecps.v4f32, %d, %x1) + %x2 = fmul %x1, %x1_nr + ret %x2 +} + +declare <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float>) nounwind readnone +declare <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float>, <4 x float>) nounwind readnone + +define @__rsqrt_varying_float( %d) nounwind readnone { + unary4to16(x0, float, @llvm.arm.neon.vrsqrte.v4f32, %d) + %x0_2 = fmul %x0, %x0 + binary4to16(x0_nr, float, @llvm.arm.neon.vrsqrts.v4f32, %d, %x0_2) + %x1 = fmul %x0, %x0_nr + %x1_2 = fmul %x1, %x1 + binary4to16(x1_nr, float, @llvm.arm.neon.vrsqrts.v4f32, %d, %x1_2) + %x2 = fmul %x1, %x1_nr + ret %x2 +} + +define float @__rsqrt_uniform_float(float) nounwind readnone { + %v1 = bitcast float %0 to <1 x float> + %vs = shufflevector <1 x float> %v1, <1 x float> undef, + <16 x i32> + %vr = call <16 x float> @__rsqrt_varying_float(<16 x float> %vs) + %r = extractelement <16 x float> %vr, i32 0 + ret float %r +} + +define float @__rcp_uniform_float(float) nounwind readnone { + %v1 = bitcast float %0 to <1 x float> + %vs = shufflevector <1 x float> %v1, <1 x float> undef, + <16 x i32> + %vr = call <16 x float> @__rcp_varying_float(<16 x float> %vs) + %r = extractelement <16 x float> %vr, i32 0 + ret float %r +} + +declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) + +define @__sqrt_varying_float() nounwind readnone { + unary4to16(result, float, @llvm.sqrt.v4f32, %0) +;; this returns nan for v=0, which is undesirable.. 
+;; %rsqrt = call @__rsqrt_varying_float( %0) +;; %result = fmul <4 x float> %rsqrt, %0 + ret <16 x float> %result +} + +declare <4 x double> @llvm.sqrt.v4f64(<4 x double>) + +define @__sqrt_varying_double() nounwind readnone { + unary4to16(r, double, @llvm.sqrt.v4f64, %0) + ret %r +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; reductions + +define i64 @__movmsk() nounwind readnone { + %and_mask = and %0, + + %v8 = call <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8> %and_mask) + %v4 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %v8) + %v2 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %v4) + %va = extractelement <2 x i64> %v2, i32 0 + %vb = extractelement <2 x i64> %v2, i32 1 + %vbshift = shl i64 %vb, 8 + %v = or i64 %va, %vbshift + ret i64 %v +} + +define i1 @__any() nounwind readnone alwaysinline { + v16tov8(MASK, %0, %v8a, %v8b) + %vor8 = or <8 x MASK> %v8a, %v8b + %v16 = sext <8 x i8> %vor8 to <8 x i16> + v8tov4(i16, %v16, %v16a, %v16b) + %vor16 = or <4 x i16> %v16a, %v16b + %v32 = sext <4 x i16> %vor16 to <4 x i32> + v4tov2(i32, %v32, %v32a, %v32b) + %vor32 = or <2 x i32> %v32a, %v32b + %v0 = extractelement <2 x i32> %vor32, i32 0 + %v1 = extractelement <2 x i32> %vor32, i32 1 + %v = or i32 %v0, %v1 + %cmp = icmp ne i32 %v, 0 + ret i1 %cmp +} + +define i1 @__all() nounwind readnone alwaysinline { + v16tov8(MASK, %0, %v8a, %v8b) + %vand8 = and <8 x MASK> %v8a, %v8b + %v16 = sext <8 x i8> %vand8 to <8 x i16> + v8tov4(i16, %v16, %v16a, %v16b) + %vand16 = and <4 x i16> %v16a, %v16b + %v32 = sext <4 x i16> %vand16 to <4 x i32> + v4tov2(i32, %v32, %v32a, %v32b) + %vand32 = and <2 x i32> %v32a, %v32b + %v0 = extractelement <2 x i32> %vand32, i32 0 + %v1 = extractelement <2 x i32> %vand32, i32 1 + %v = and i32 %v0, %v1 + %cmp = icmp ne i32 %v, 0 + ret i1 %cmp +} + +define i1 @__none() nounwind readnone alwaysinline { + %any = call i1 @__any( %0) + %none = icmp eq i1 %any, 0 + ret i1 %none +} + +;; $1: scalar type +;; $2: vector/vector reduce function (2 x -> ) +;; $3: pairwise vector reduce function (2 x <2 x vec> -> <2 x vec>) +;; $4: scalar reduce function + +define(`neon_reduce', ` + v16tov8($1, %0, %va, %vb) + %va_16 = shufflevector <8 x $1> %va, <8 x $1> undef, + <16 x i32> + %vb_16 = shufflevector <8 x $1> %vb, <8 x $1> undef, + <16 x i32> + %v8 = call <16 x $1> $2(<16 x $1> %va_16, <16 x $1> %vb_16) + + %v8a = shufflevector <16 x $1> %v8, <16 x $1> undef, + <16 x i32> + %v8b = shufflevector <16 x $1> %v8, <16 x $1> undef, + <16 x i32> + + %v4 = call <16 x $1> $2(<16 x $1> %v8a, <16 x $1> %v8b) + + %vfirst_4 = shufflevector <16 x $1> %v4, <16 x $1> undef, + <4 x i32> + v4tov2($1, %vfirst_4, %v0, %v1) + %vh = call <2 x $1> $3(<2 x $1> %v0, <2 x $1> %v1) + %vh0 = extractelement <2 x $1> %vh, i32 0 + %vh1 = extractelement <2 x $1> %vh, i32 1 + %r = call $1 $4($1 %vh0, $1 %vh1) + ret $1 %r +') + +declare <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float>, <2 x float>) nounwind readnone + +define internal float @add_f32(float, float) { + %r = fadd float %0, %1 + ret float %r +} + +define internal @__add_varying_float(, ) { + %r = fadd %0, %1 + ret %r +} + +define float @__reduce_add_float() nounwind readnone { + neon_reduce(float, @__add_varying_float, @llvm.arm.neon.vpadd.v2f32, @add_f32) +} + +declare <2 x float> @llvm.arm.neon.vpmins.v2f32(<2 x float>, <2 x float>) nounwind readnone + +define internal float @min_f32(float, float) { + %cmp = fcmp olt float %0, %1 + %r = select i1 %cmp, float %0, float %1 + ret 
float %r
+}
+
+define float @__reduce_min_float(<16 x float>) nounwind readnone {
+  neon_reduce(float, @__min_varying_float, @llvm.arm.neon.vpmins.v2f32, @min_f32)
+}
+
+declare <2 x float> @llvm.arm.neon.vpmaxs.v2f32(<2 x float>, <2 x float>) nounwind readnone
+
+define internal float @max_f32(float, float) {
+  %cmp = fcmp ugt float %0, %1
+  %r = select i1 %cmp, float %0, float %1
+  ret float %r
+}
+
+define float @__reduce_max_float(<16 x float>) nounwind readnone {
+  neon_reduce(float, @__max_varying_float, @llvm.arm.neon.vpmaxs.v2f32, @max_f32)
+}
+
+declare <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32>) nounwind readnone
+
+define i64 @__reduce_add_int8(<16 x i8>) nounwind readnone {
+  %a16 = call <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8> %0)
+  %a32 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %a16)
+  %a64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %a32)
+  %a0 = extractelement <2 x i64> %a64, i32 0
+  %a1 = extractelement <2 x i64> %a64, i32 1
+  %r = add i64 %a0, %a1
+  ret i64 %r
+}
+
+define i64 @__reduce_add_int16(<16 x i16>) nounwind readnone {
+  v16tov8(i16, %0, %va, %vb)
+  %a32 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %va)
+  %b32 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %vb)
+  %a64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %a32)
+  %b64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %b32)
+  %sum = add <2 x i64> %a64, %b64
+  %a0 = extractelement <2 x i64> %sum, i32 0
+  %a1 = extractelement <2 x i64> %sum, i32 1
+  %r = add i64 %a0, %a1
+  ret i64 %r
+}
+
+define i64 @__reduce_add_int32(<16 x i32>) nounwind readnone {
+  v16tov4(i32, %0, %va, %vb, %vc, %vd)
+  %a64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %va)
+  %b64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %vb)
+  %c64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %vc)
+  %d64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %vd)
+  %ab = add <2 x i64> %a64, %b64
+  %cd = add <2 x i64> %c64, %d64
+  %sum = add <2 x i64> %ab, %cd
+  %a0 = extractelement <2 x i64> %sum, i32 0
+  %a1 = extractelement <2 x i64> %sum, i32 1
+  %r = add i64 %a0, %a1
+  ret i64 %r
+}
+
+declare <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+
+define internal i32 @min_si32(i32, i32) {
+  %cmp = icmp slt i32 %0, %1
+  %r = select i1 %cmp, i32 %0, i32 %1
+  ret i32 %r
+}
+
+define i32 @__reduce_min_int32(<16 x i32>) nounwind readnone {
+  neon_reduce(i32, @__min_varying_int32, @llvm.arm.neon.vpmins.v2i32, @min_si32)
+}
+
+declare <2 x i32> @llvm.arm.neon.vpmaxs.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+
+define internal i32 @max_si32(i32, i32) {
+  %cmp = icmp sgt i32 %0, %1
+  %r = select i1 %cmp, i32 %0, i32 %1
+  ret i32 %r
+}
+
+define i32 @__reduce_max_int32(<16 x i32>) nounwind readnone {
+  neon_reduce(i32, @__max_varying_int32, @llvm.arm.neon.vpmaxs.v2i32, @max_si32)
+}
+
+declare <2 x i32> @llvm.arm.neon.vpminu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+
+define internal i32 @min_ui32(i32, i32) {
+  %cmp = icmp ult i32 %0, %1
+  %r = select i1 %cmp, i32 %0, i32 %1
+  ret i32 %r
+}
+
+define i32 @__reduce_min_uint32(<16 x i32>) nounwind readnone {
+  neon_reduce(i32, @__min_varying_uint32, @llvm.arm.neon.vpmins.v2i32, @min_ui32)
+}
+
+declare <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+
+define internal i32
@max_ui32(i32, i32) { + %cmp = icmp ugt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__reduce_max_uint32() nounwind readnone { + neon_reduce(i32, @__max_varying_uint32, @llvm.arm.neon.vpmaxs.v2i32, @max_ui32) +} + +define internal double @__add_uniform_double(double, double) { + %r = fadd double %0, %1 + ret double %r +} + +define internal @__add_varying_double(, ) { + %r = fadd %0, %1 + ret %r +} + +define double @__reduce_add_double() nounwind readnone { + reduce16(double, @__add_varying_double, @__add_uniform_double) +} + +define double @__reduce_min_double() nounwind readnone { + reduce16(double, @__min_varying_double, @__min_uniform_double) +} + +define double @__reduce_max_double() nounwind readnone { + reduce16(double, @__max_varying_double, @__max_uniform_double) +} + +define internal i64 @__add_uniform_int64(i64, i64) { + %r = add i64 %0, %1 + ret i64 %r +} + +define internal @__add_varying_int64(, ) { + %r = add %0, %1 + ret %r +} + +define i64 @__reduce_add_int64() nounwind readnone { + reduce16(i64, @__add_varying_int64, @__add_uniform_int64) +} + +define i64 @__reduce_min_int64() nounwind readnone { + reduce16(i64, @__min_varying_int64, @__min_uniform_int64) +} + +define i64 @__reduce_max_int64() nounwind readnone { + reduce16(i64, @__max_varying_int64, @__max_uniform_int64) +} + +define i64 @__reduce_min_uint64() nounwind readnone { + reduce16(i64, @__min_varying_uint64, @__min_uniform_uint64) +} + +define i64 @__reduce_max_uint64() nounwind readnone { + reduce16(i64, @__max_varying_uint64, @__max_uniform_uint64) +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +declare <16 x i8> @llvm.arm.neon.vrhaddu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone + +define <16 x i8> @__avg_up_uint8(<16 x i8>, <16 x i8>) nounwind readnone { + %r = call <16 x i8> @llvm.arm.neon.vrhaddu.v16i8(<16 x i8> %0, <16 x i8> %1) + ret <16 x i8> %r +} + +declare <16 x i8> @llvm.arm.neon.vrhadds.v16i8(<16 x i8>, <16 x i8>) nounwind readnone + +define <16 x i8> @__avg_up_int8(<16 x i8>, <16 x i8>) nounwind readnone { + %r = call <16 x i8> @llvm.arm.neon.vrhadds.v16i8(<16 x i8> %0, <16 x i8> %1) + ret <16 x i8> %r +} + +declare <16 x i8> @llvm.arm.neon.vhaddu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone + +define <16 x i8> @__avg_down_uint8(<16 x i8>, <16 x i8>) nounwind readnone { + %r = call <16 x i8> @llvm.arm.neon.vhaddu.v16i8(<16 x i8> %0, <16 x i8> %1) + ret <16 x i8> %r +} + +declare <16 x i8> @llvm.arm.neon.vhadds.v16i8(<16 x i8>, <16 x i8>) nounwind readnone + +define <16 x i8> @__avg_down_int8(<16 x i8>, <16 x i8>) nounwind readnone { + %r = call <16 x i8> @llvm.arm.neon.vhadds.v16i8(<16 x i8> %0, <16 x i8> %1) + ret <16 x i8> %r +} + +declare <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone + +define <16 x i16> @__avg_up_uint16(<16 x i16>, <16 x i16>) nounwind readnone { + v16tov8(i16, %0, %a0, %b0) + v16tov8(i16, %1, %a1, %b1) + %r0 = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> %a0, <8 x i16> %a1) + %r1 = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> %b0, <8 x i16> %b1) + v8tov16(i16, %r0, %r1, %r) + ret <16 x i16> %r +} + +declare <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone + +define <16 x i16> @__avg_up_int16(<16 x i16>, <16 x i16>) nounwind readnone { + v16tov8(i16, %0, %a0, %b0) + v16tov8(i16, %1, %a1, %b1) + %r0 = call <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16> %a0, <8 x i16> %a1) + %r1 = call <8 x i16> 
@llvm.arm.neon.vrhadds.v8i16(<8 x i16> %b0, <8 x i16> %b1) + v8tov16(i16, %r0, %r1, %r) + ret <16 x i16> %r +} + +declare <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone + +define <16 x i16> @__avg_down_uint16(<16 x i16>, <16 x i16>) nounwind readnone { + v16tov8(i16, %0, %a0, %b0) + v16tov8(i16, %1, %a1, %b1) + %r0 = call <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16> %a0, <8 x i16> %a1) + %r1 = call <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16> %b0, <8 x i16> %b1) + v8tov16(i16, %r0, %r1, %r) + ret <16 x i16> %r +} + +declare <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone + +define <16 x i16> @__avg_down_int16(<16 x i16>, <16 x i16>) nounwind readnone { + v16tov8(i16, %0, %a0, %b0) + v16tov8(i16, %1, %a1, %b1) + %r0 = call <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16> %a0, <8 x i16> %a1) + %r1 = call <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16> %b0, <8 x i16> %b1) + v8tov16(i16, %r0, %r1, %r) + ret <16 x i16> %r +} diff --git a/builtins/target-neon-common.ll b/builtins/target-neon-common.ll new file mode 100644 index 00000000..1c0b421f --- /dev/null +++ b/builtins/target-neon-common.ll @@ -0,0 +1,346 @@ +;; +;; target-neon-common.ll +;; +;; Copyright(c) 2013 Google, Inc. +;; +;; All rights reserved. +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are +;; met: +;; +;; * Redistributions of source code must retain the above copyright +;; notice, this list of conditions and the following disclaimer. +;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; +;; * Neither the name of Matt Pharr nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
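;; The round/floor/ceil routines later in this file round by adding and then
;; subtracting 2^23 (8.388608e+06) instead of using a native instruction.
;; A minimal illustrative sketch of the idea, assuming a non-negative input
;; (the hypothetical @example_* name below is not defined by this patch):
;; once x is pushed up near 2^23, the float's ulp is 1.0, so the hardware's
;; round-to-nearest-even discards the fraction; subtracting 2^23 recovers the
;; rounded value. The actual __round_uniform_float additionally strips and
;; restores the sign bit so the same trick covers negative inputs.
define float @example_round_nonneg_float(float %x) nounwind readnone {
  ;; valid for 0 <= x < 2^23
  %up = fadd float %x, 8.388608e+06
  %r = fadd float %up, -8.388608e+06
  ret float %r
}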
+ +target datalayout = "e-p:32:32:32-S32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f16:16:16-f32:32:32-f64:32:64-f128:128:128-v64:32:64-v128:32:128-a0:0:64-n32" + +stdlib_core() +scans() +reduce_equal(WIDTH) +rdrand_decls() +define_shuffles() +aossoa() +ctlztz() + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; half conversion routines + +declare <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float>) nounwind readnone +declare <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16>) nounwind readnone + +define float @__half_to_float_uniform(i16 %v) nounwind readnone { + %v1 = bitcast i16 %v to <1 x i16> + %vec = shufflevector <1 x i16> %v1, <1 x i16> undef, + <4 x i32> + %h = call <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16> %vec) + %r = extractelement <4 x float> %h, i32 0 + ret float %r +} + +define i16 @__float_to_half_uniform(float %v) nounwind readnone { + %v1 = bitcast float %v to <1 x float> + %vec = shufflevector <1 x float> %v1, <1 x float> undef, + <4 x i32> + %h = call <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float> %vec) + %r = extractelement <4 x i16> %h, i32 0 + ret i16 %r +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; math + +define void @__fastmath() nounwind { + ret void +} + +;; round/floor/ceil + +;; FIXME: grabbed these from the sse2 target, which does not have native +;; instructions for these. Is there a better approach for NEON? + +define float @__round_uniform_float(float) nounwind readonly alwaysinline { + %float_to_int_bitcast.i.i.i.i = bitcast float %0 to i32 + %bitop.i.i = and i32 %float_to_int_bitcast.i.i.i.i, -2147483648 + %bitop.i = xor i32 %bitop.i.i, %float_to_int_bitcast.i.i.i.i + %int_to_float_bitcast.i.i40.i = bitcast i32 %bitop.i to float + %binop.i = fadd float %int_to_float_bitcast.i.i40.i, 8.388608e+06 + %binop21.i = fadd float %binop.i, -8.388608e+06 + %float_to_int_bitcast.i.i.i = bitcast float %binop21.i to i32 + %bitop31.i = xor i32 %float_to_int_bitcast.i.i.i, %bitop.i.i + %int_to_float_bitcast.i.i.i = bitcast i32 %bitop31.i to float + ret float %int_to_float_bitcast.i.i.i +} + +define float @__floor_uniform_float(float) nounwind readonly alwaysinline { + %calltmp.i = tail call float @__round_uniform_float(float %0) nounwind + %bincmp.i = fcmp ogt float %calltmp.i, %0 + %selectexpr.i = sext i1 %bincmp.i to i32 + %bitop.i = and i32 %selectexpr.i, -1082130432 + %int_to_float_bitcast.i.i.i = bitcast i32 %bitop.i to float + %binop.i = fadd float %calltmp.i, %int_to_float_bitcast.i.i.i + ret float %binop.i +} + +define float @__ceil_uniform_float(float) nounwind readonly alwaysinline { + %calltmp.i = tail call float @__round_uniform_float(float %0) nounwind + %bincmp.i = fcmp olt float %calltmp.i, %0 + %selectexpr.i = sext i1 %bincmp.i to i32 + %bitop.i = and i32 %selectexpr.i, 1065353216 + %int_to_float_bitcast.i.i.i = bitcast i32 %bitop.i to float + %binop.i = fadd float %calltmp.i, %int_to_float_bitcast.i.i.i + ret float %binop.i +} + +;; FIXME: rounding doubles and double vectors needs to be implemented +declare double @__round_uniform_double(double) nounwind readnone +declare double @__floor_uniform_double(double) nounwind readnone +declare double @__ceil_uniform_double(double) nounwind readnone + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; min/max + +define float @__max_uniform_float(float, float) nounwind readnone { + %cmp = fcmp ugt float %0, %1 + %r = select i1 %cmp, float %0, float %1 + ret float %r +} + +define float @__min_uniform_float(float, 
float) nounwind readnone { + %cmp = fcmp ult float %0, %1 + %r = select i1 %cmp, float %0, float %1 + ret float %r +} + +define i32 @__min_uniform_int32(i32, i32) nounwind readnone { + %cmp = icmp slt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__max_uniform_int32(i32, i32) nounwind readnone { + %cmp = icmp sgt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__min_uniform_uint32(i32, i32) nounwind readnone { + %cmp = icmp ult i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__max_uniform_uint32(i32, i32) nounwind readnone { + %cmp = icmp ugt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i64 @__min_uniform_int64(i64, i64) nounwind readnone { + %cmp = icmp slt i64 %0, %1 + %r = select i1 %cmp, i64 %0, i64 %1 + ret i64 %r +} + +define i64 @__max_uniform_int64(i64, i64) nounwind readnone { + %cmp = icmp sgt i64 %0, %1 + %r = select i1 %cmp, i64 %0, i64 %1 + ret i64 %r +} + +define i64 @__min_uniform_uint64(i64, i64) nounwind readnone { + %cmp = icmp ult i64 %0, %1 + %r = select i1 %cmp, i64 %0, i64 %1 + ret i64 %r +} + +define i64 @__max_uniform_uint64(i64, i64) nounwind readnone { + %cmp = icmp ugt i64 %0, %1 + %r = select i1 %cmp, i64 %0, i64 %1 + ret i64 %r +} + +define double @__min_uniform_double(double, double) nounwind readnone { + %cmp = fcmp olt double %0, %1 + %r = select i1 %cmp, double %0, double %1 + ret double %r +} + +define double @__max_uniform_double(double, double) nounwind readnone { + %cmp = fcmp ogt double %0, %1 + %r = select i1 %cmp, double %0, double %1 + ret double %r +} + +define @__min_varying_int64(, ) nounwind readnone { + %m = icmp slt %0, %1 + %r = select %m, %0, %1 + ret %r +} + +define @__max_varying_int64(, ) nounwind readnone { + %m = icmp sgt %0, %1 + %r = select %m, %0, %1 + ret %r +} + +define @__min_varying_uint64(, ) nounwind readnone { + %m = icmp ult %0, %1 + %r = select %m, %0, %1 + ret %r +} + +define @__max_varying_uint64(, ) nounwind readnone { + %m = icmp ugt %0, %1 + %r = select %m, %0, %1 + ret %r +} + +define @__min_varying_double(, + ) nounwind readnone { + %m = fcmp olt %0, %1 + %r = select %m, %0, %1 + ret %r +} + +define @__max_varying_double(, + ) nounwind readnone { + %m = fcmp ogt %0, %1 + %r = select %m, %0, %1 + ret %r +} + +;; sqrt/rsqrt/rcp + +declare float @llvm.sqrt.f32(float) + +define float @__sqrt_uniform_float(float) nounwind readnone { + %r = call float @llvm.sqrt.f32(float %0) + ret float %r +} + +declare double @llvm.sqrt.f64(double) + +define double @__sqrt_uniform_double(double) nounwind readnone { + %r = call double @llvm.sqrt.f64(double %0) + ret double %r +} + +;; bit ops + +declare i32 @llvm.ctpop.i32(i32) nounwind readnone +declare i64 @llvm.ctpop.i64(i64) nounwind readnone + +define i32 @__popcnt_int32(i32) nounwind readnone { + %v = call i32 @llvm.ctpop.i32(i32 %0) + ret i32 %v +} + +define i64 @__popcnt_int64(i64) nounwind readnone { + %v = call i64 @llvm.ctpop.i64(i64 %0) + ret i64 %v +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; unaligned loads/loads+broadcasts + +masked_load(i8, 1) +masked_load(i16, 2) +masked_load(i32, 4) +masked_load(float, 4) +masked_load(i64, 8) +masked_load(double, 8) + +gen_masked_store(i8) +gen_masked_store(i16) +gen_masked_store(i32) +gen_masked_store(i64) +masked_store_float_double() + +define void @__masked_store_blend_i8(* nocapture %ptr, %new, + %mask) nounwind alwaysinline { + %old = load * %ptr + %mask1 = trunc 
%mask to + %result = select %mask1, %new, %old + store %result, * %ptr + ret void +} + +define void @__masked_store_blend_i16(* nocapture %ptr, %new, + %mask) nounwind alwaysinline { + %old = load * %ptr + %mask1 = trunc %mask to + %result = select %mask1, %new, %old + store %result, * %ptr + ret void +} + +define void @__masked_store_blend_i32(* nocapture %ptr, %new, + %mask) nounwind alwaysinline { + %old = load * %ptr + %mask1 = trunc %mask to + %result = select %mask1, %new, %old + store %result, * %ptr + ret void +} + +define void @__masked_store_blend_i64(* nocapture %ptr, + %new, %mask) nounwind alwaysinline { + %old = load * %ptr + %mask1 = trunc %mask to + %result = select %mask1, %new, %old + store %result, * %ptr + ret void +} + +;; yuck. We need declarations of these, even though we shouldnt ever +;; actually generate calls to them for the NEON target... + + +include(`svml.m4') +svml_stubs(float,f,WIDTH) +svml_stubs(double,d,WIDTH) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; gather + +gen_gather_factored(i8) +gen_gather_factored(i16) +gen_gather_factored(i32) +gen_gather_factored(float) +gen_gather_factored(i64) +gen_gather_factored(double) + +gen_scatter(i8) +gen_scatter(i16) +gen_scatter(i32) +gen_scatter(float) +gen_scatter(i64) +gen_scatter(double) + +packed_load_and_store(4) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; prefetch + +define_prefetches() diff --git a/builtins/target-nvptx64.ll b/builtins/target-nvptx64.ll index f3f9bfd9..a728803f 100644 --- a/builtins/target-nvptx64.ll +++ b/builtins/target-nvptx64.ll @@ -5,6 +5,10 @@ define(`WIDTH',`1') include(`util.m4') +include(`svml.m4') +svml_stubs(float,f,WIDTH) +svml_stubs(double,d,WIDTH) + ; Define some basics for a 1-wide target stdlib_core() packed_load_and_store() @@ -467,6 +471,9 @@ define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline { declare i32 @llvm.ctpop.i32(i32) nounwind readnone +declare i16 @__reduce_add_int8() nounwind readnone +declare i32 @__reduce_add_int16() nounwind readnone + define i32 @__popcnt_int32(i32) nounwind readonly alwaysinline { %call = call i32 @llvm.ctpop.i32(i32 %0) ret i32 %call @@ -643,103 +650,6 @@ define <1 x double> @__rsqrt_varying_double(<1 x double> %v) nounwind readonly } -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; svml stuff - -define <1 x float> @__svml_sin(<1 x float>) nounwind readnone alwaysinline { - ;%ret = call <1 x float> @__svml_sinf4(<1 x float> %0) - ;ret <1 x float> %ret - ;%r = extractelement <1 x float> %0, i32 0 - ;%s = call float @llvm.sin.f32(float %r) - ;%rv = insertelement <1 x float> undef, float %r, i32 0 - ;ret <1 x float> %rv - unary1to1(float,@llvm.sin.f32) - -} - -define <1 x float> @__svml_cos(<1 x float>) nounwind readnone alwaysinline { - ;%ret = call <1 x float> @__svml_cosf4(<1 x float> %0) - ;ret <1 x float> %ret - ;%r = extractelement <1 x float> %0, i32 0 - ;%s = call float @llvm.cos.f32(float %r) - ;%rv = insertelement <1 x float> undef, float %r, i32 0 - ;ret <1 x float> %rv - unary1to1(float, @llvm.cos.f32) - -} - -define void @__svml_sincos(<1 x float>, <1 x float> *, <1 x float> *) nounwind readnone alwaysinline { -; %s = call <1 x float> @__svml_sincosf4(<1 x float> * %2, <1 x float> %0) -; store <1 x float> %s, <1 x float> * %1 -; ret void - %sin = call <1 x float> @__svml_sin (<1 x float> %0) - %cos = call <1 x float> @__svml_cos (<1 x float> %0) - store <1 x float> %sin, <1 x float> * %1 - store <1 x 
float> %cos, <1 x float> * %2 - ret void -} - -define <1 x float> @__svml_tan(<1 x float>) nounwind readnone alwaysinline { - ;%ret = call <1 x float> @__svml_tanf4(<1 x float> %0) - ;ret <1 x float> %ret - ;%r = extractelement <1 x float> %0, i32 0 - ;%s = call float @llvm_tan_f32(float %r) - ;%rv = insertelement <1 x float> undef, float %r, i32 0 - ;ret <1 x float> %rv - ;unasry1to1(float, @llvm.tan.f32) - ; UNSUPPORTED! - ret <1 x float > %0 -} - -define <1 x float> @__svml_atan(<1 x float>) nounwind readnone alwaysinline { -; %ret = call <1 x float> @__svml_atanf4(<1 x float> %0) -; ret <1 x float> %ret - ;%r = extractelement <1 x float> %0, i32 0 - ;%s = call float @llvm_atan_f32(float %r) - ;%rv = insertelement <1 x float> undef, float %r, i32 0 - ;ret <1 x float> %rv - ;unsary1to1(float,@llvm.atan.f32) - ;UNSUPPORTED! - ret <1 x float > %0 - -} - -define <1 x float> @__svml_atan2(<1 x float>, <1 x float>) nounwind readnone alwaysinline { - ;%ret = call <1 x float> @__svml_atan2f4(<1 x float> %0, <1 x float> %1) - ;ret <1 x float> %ret - ;%y = extractelement <1 x float> %0, i32 0 - ;%x = extractelement <1 x float> %1, i32 0 - ;%q = fdiv float %y, %x - ;%a = call float @llvm.atan.f32 (float %q) - ;%rv = insertelement <1 x float> undef, float %a, i32 0 - ;ret <1 x float> %rv - ; UNSUPPORTED! - ret <1 x float > %0 -} - -define <1 x float> @__svml_exp(<1 x float>) nounwind readnone alwaysinline { - ;%ret = call <1 x float> @__svml_expf4(<1 x float> %0) - ;ret <1 x float> %ret - unary1to1(float, @llvm.exp.f32) -} - -define <1 x float> @__svml_log(<1 x float>) nounwind readnone alwaysinline { - ;%ret = call <1 x float> @__svml_logf4(<1 x float> %0) - ;ret <1 x float> %ret - unary1to1(float, @llvm.log.f32) -} - -define <1 x float> @__svml_pow(<1 x float>, <1 x float>) nounwind readnone alwaysinline { - ;%ret = call <1 x float> @__svml_powf4(<1 x float> %0, <1 x float> %1) - ;ret <1 x float> %ret - %r = extractelement <1 x float> %0, i32 0 - %e = extractelement <1 x float> %1, i32 0 - %s = call float @llvm.pow.f32(float %r,float %e) - %rv = insertelement <1 x float> undef, float %s, i32 0 - ret <1 x float> %rv - -} - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max @@ -957,3 +867,8 @@ declare float @__half_to_float_uniform(i16 %v) nounwind readnone declare @__half_to_float_varying( %v) nounwind readnone declare i16 @__float_to_half_uniform(float %v) nounwind readnone declare @__float_to_half_varying( %v) nounwind readnone + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +define_avgs() diff --git a/builtins/target-sse2-common.ll b/builtins/target-sse2-common.ll index c6a3afe2..ad1d88bc 100644 --- a/builtins/target-sse2-common.ll +++ b/builtins/target-sse2-common.ll @@ -269,4 +269,8 @@ define i64 @__popcnt_int64(i64) nounwind readnone alwaysinline { ret i64 %val } +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +define_avgs() diff --git a/builtins/target-sse2-x2.ll b/builtins/target-sse2-x2.ll index 73361720..77bf1a9d 100644 --- a/builtins/target-sse2-x2.ll +++ b/builtins/target-sse2-x2.ll @@ -105,87 +105,14 @@ define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysin ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; svml stuff -declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone -declare <4 x float> 
@__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone -declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone -declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone +include(`svml.m4') +;; single precision +svml_declare(float,f4,4) +svml_define_x(float,f4,4,f,8) - -define <8 x float> @__svml_sin(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_sinf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_cos(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_cosf4, %0) - ret <8 x float> %ret -} - -define void @__svml_sincos(<8 x float>, <8 x float> *, - <8 x float> *) nounwind readnone alwaysinline { - ; call svml_sincosf4 two times with the two 4-wide sub-vectors - %a = shufflevector <8 x float> %0, <8 x float> undef, - <4 x i32> - %b = shufflevector <8 x float> %0, <8 x float> undef, - <4 x i32> - - %cospa = alloca <4 x float> - %sa = call <4 x float> @__svml_sincosf4(<4 x float> * %cospa, <4 x float> %a) - - %cospb = alloca <4 x float> - %sb = call <4 x float> @__svml_sincosf4(<4 x float> * %cospb, <4 x float> %b) - - %sin = shufflevector <4 x float> %sa, <4 x float> %sb, - <8 x i32> - store <8 x float> %sin, <8 x float> * %1 - - %cosa = load <4 x float> * %cospa - %cosb = load <4 x float> * %cospb - %cos = shufflevector <4 x float> %cosa, <4 x float> %cosb, - <8 x i32> - store <8 x float> %cos, <8 x float> * %2 - - ret void -} - -define <8 x float> @__svml_tan(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_tanf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_atan(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_atanf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_atan2(<8 x float>, - <8 x float>) nounwind readnone alwaysinline { - binary4to8(ret, float, @__svml_atan2f4, %0, %1) - ret <8 x float> %ret -} - -define <8 x float> @__svml_exp(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_expf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_log(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_logf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_pow(<8 x float>, - <8 x float>) nounwind readnone alwaysinline { - binary4to8(ret, float, @__svml_powf4, %0, %1) - ret <8 x float> %ret -} +;; double precision +svml_declare(double,2,2) +svml_define_x(double,2,2,d,8) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -367,6 +294,36 @@ define i1 @__none(<8 x i32>) nounwind readnone alwaysinline { ret i1 %cmp } +declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone + +define i16 @__reduce_add_int8(<8 x i8>) nounwind readnone alwaysinline { + %wide8 = shufflevector <8 x i8> %0, <8 x i8> zeroinitializer, + <16 x i32> + %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8, + <16 x i8> zeroinitializer) + %r0 = extractelement <2 x i64> %rv, i32 0 + %r1 = extractelement <2 x i64> %rv, i32 1 + %r = add i64 %r0, %r1 + %r16 = trunc i64 %r to i16 + ret i16 %r16 +} + +define internal <8 x i16> @__add_varying_i16(<8 x i16>, + <8 x i16>) nounwind readnone alwaysinline { + %r = add <8 x i16> %0, %1 + ret 
<8 x i16> %r +} + +define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline { + %r = add i16 %0, %1 + ret i16 %r +} + +define i16 @__reduce_add_int16(<8 x i16>) nounwind readnone alwaysinline { + reduce8(i16, @__add_varying_i16, @__add_uniform_i16) +} + define <4 x float> @__vec4_add_float(<4 x float> %v0, <4 x float> %v1) nounwind readnone alwaysinline { %v = fadd <4 x float> %v0, %v1 diff --git a/builtins/target-sse2.ll b/builtins/target-sse2.ll index 2bb06391..e42d4990 100644 --- a/builtins/target-sse2.ll +++ b/builtins/target-sse2.ll @@ -267,6 +267,36 @@ define i1 @__none(<4 x i32>) nounwind readnone alwaysinline { ret i1 %cmp } +declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone + +define i16 @__reduce_add_int8(<4 x i8>) nounwind readnone alwaysinline { + %wide8 = shufflevector <4 x i8> %0, <4 x i8> zeroinitializer, + <16 x i32> + %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8, + <16 x i8> zeroinitializer) + %r0 = extractelement <2 x i64> %rv, i32 0 + %r1 = extractelement <2 x i64> %rv, i32 1 + %r = add i64 %r0, %r1 + %r16 = trunc i64 %r to i16 + ret i16 %r16 +} + +define internal <4 x i16> @__add_varying_i16(<4 x i16>, + <4 x i16>) nounwind readnone alwaysinline { + %r = add <4 x i16> %0, %1 + ret <4 x i16> %r +} + +define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline { + %r = add i16 %0, %1 + ret i16 %r +} + +define i16 @__reduce_add_int16(<4 x i16>) nounwind readnone alwaysinline { + reduce4(i16, @__add_varying_i16, @__add_uniform_i16) +} + define float @__reduce_add_float(<4 x float> %v) nounwind readonly alwaysinline { %v1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> @@ -466,62 +496,15 @@ define <4 x float> @__sqrt_varying_float(<4 x float>) nounwind readonly alwaysin ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; svml stuff -declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone -declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone -declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone +include(`svml.m4') +;; single precision +svml_declare(float,f4,4) +svml_define(float,f4,4,f) +;; double precision +svml_declare(double,2,2) +svml_define_x(double,2,2,d,4) -define <4 x float> @__svml_sin(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_sinf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_cos(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_cosf4(<4 x float> %0) - ret <4 x float> %ret -} - -define void @__svml_sincos(<4 x float>, <4 x float> *, <4 x float> *) nounwind readnone alwaysinline { - %s = call <4 x float> @__svml_sincosf4(<4 x float> * %2, <4 x float> %0) - store <4 x float> %s, <4 x float> * %1 - ret void -} - -define <4 x float> @__svml_tan(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_tanf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_atan(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> 
@__svml_atanf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_atan2(<4 x float>, <4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_atan2f4(<4 x float> %0, <4 x float> %1) - ret <4 x float> %ret -} - -define <4 x float> @__svml_exp(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_expf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_log(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_logf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_pow(<4 x float>, <4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_powf4(<4 x float> %0, <4 x float> %1) - ret <4 x float> %ret -} ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max diff --git a/builtins/target-sse4-16.ll b/builtins/target-sse4-16.ll new file mode 100644 index 00000000..72b81ff0 --- /dev/null +++ b/builtins/target-sse4-16.ll @@ -0,0 +1,490 @@ +;; Copyright (c) 2013, Google, Inc. +;; All rights reserved. +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are +;; met: +;; +;; * Redistributions of source code must retain the above copyright +;; notice, this list of conditions and the following disclaimer. +;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; +;; * Neither the name of Google, Inc. nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
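;; The rcp/rsqrt routines below start from the approximate rcpps/rsqrtps
;; hardware results and sharpen them with a Newton-Raphson step. A minimal
;; illustrative sketch of the reciprocal update iv' = iv * (2 - v * iv)
;; (the hypothetical @example_* name is not defined by this patch); each
;; such step roughly doubles the number of correct mantissa bits:
define float @example_rcp_newton_step(float %v, float %iv) nounwind readnone {
  ;; %iv is an initial estimate of 1/%v
  %v_iv = fmul float %v, %iv
  %two_minus = fsub float 2.000000e+00, %v_iv
  %iv_refined = fmul float %iv, %two_minus
  ret float %iv_refined
}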
+ + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; Define common 4-wide stuff +define(`WIDTH',`8') +define(`MASK',`i16') +include(`util.m4') + +stdlib_core() +packed_load_and_store() +scans() +int64minmax() + +include(`target-sse4-common.ll') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; half conversion routines + +declare float @__half_to_float_uniform(i16 %v) nounwind readnone +declare @__half_to_float_varying( %v) nounwind readnone +declare i16 @__float_to_half_uniform(float %v) nounwind readnone +declare @__float_to_half_varying( %v) nounwind readnone + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rcp + +declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone + +define @__rcp_varying_float() nounwind readonly alwaysinline { + unary4to8(call, float, @llvm.x86.sse.rcp.ps, %0) + ; do one N-R iteration to improve precision + ; float iv = __rcp_v(v); + ; return iv * (2. - v * iv); + %v_iv = fmul <8 x float> %0, %call + %two_minus = fsub <8 x float> , %v_iv + %iv_mul = fmul <8 x float> %call, %two_minus + ret <8 x float> %iv_mul +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; rsqrt + +declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone + +define @__rsqrt_varying_float( %v) nounwind readonly alwaysinline { + ; float is = __rsqrt_v(v); + unary4to8(is, float, @llvm.x86.sse.rsqrt.ps, %v) + ; Newton-Raphson iteration to improve precision + ; return 0.5 * is * (3. - (v * is) * is); + %v_is = fmul <8 x float> %v, %is + %v_is_is = fmul <8 x float> %v_is, %is + %three_sub = fsub <8 x float> , %v_is_is + %is_mul = fmul <8 x float> %is, %three_sub + %half_scale = fmul <8 x float> , %is_mul + ret <8 x float> %half_scale +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; sqrt + +declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone + +define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysinline { + unary4to8(call, float, @llvm.x86.sse.sqrt.ps, %0) + ret <8 x float> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; double precision sqrt + +declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone + +define <8 x double> @__sqrt_varying_double(<8 x double>) nounwind +alwaysinline { + unary2to8(ret, double, @llvm.x86.sse2.sqrt.pd, %0) + ret <8 x double> %ret +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rounding floats + +declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone + +define <8 x float> @__round_varying_float(<8 x float>) nounwind readonly alwaysinline { + ; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8 + round4to8(%0, 8) +} + +define <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline { + ; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9 + round4to8(%0, 9) +} + +define <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline { + ; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10 + round4to8(%0, 10) +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rounding doubles + +declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone + +define <8 x double> @__round_varying_double(<8 x double>) nounwind readonly alwaysinline { + 
round2to8double(%0, 8) +} + +define <8 x double> @__floor_varying_double(<8 x double>) nounwind readonly alwaysinline { + round2to8double(%0, 9) +} + +define <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alwaysinline { + round2to8double(%0, 10) +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; float min/max + +declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone +declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone + +define <8 x float> @__max_varying_float(<8 x float>, <8 x float>) nounwind readonly alwaysinline { + binary4to8(call, float, @llvm.x86.sse.max.ps, %0, %1) + ret <8 x float> %call +} + +define <8 x float> @__min_varying_float(<8 x float>, <8 x float>) nounwind readonly alwaysinline { + binary4to8(call, float, @llvm.x86.sse.min.ps, %0, %1) + ret <8 x float> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int32 min/max + +define <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline { + binary4to8(call, i32, @llvm.x86.sse41.pminsd, %0, %1) + ret <8 x i32> %call +} + +define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline { + binary4to8(call, i32, @llvm.x86.sse41.pmaxsd, %0, %1) + ret <8 x i32> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; unsigned int min/max + +define <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline { + binary4to8(call, i32, @llvm.x86.sse41.pminud, %0, %1) + ret <8 x i32> %call +} + +define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline { + binary4to8(call, i32, @llvm.x86.sse41.pmaxud, %0, %1) + ret <8 x i32> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; double precision min/max + +declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone +declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone + +define <8 x double> @__min_varying_double(<8 x double>, <8 x double>) nounwind readnone { + binary2to8(ret, double, @llvm.x86.sse2.min.pd, %0, %1) + ret <8 x double> %ret +} + +define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind readnone { + binary2to8(ret, double, @llvm.x86.sse2.max.pd, %0, %1) + ret <8 x double> %ret +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; svml + +; FIXME +include(`svml.m4') +svml_stubs(float,f,WIDTH) +svml_stubs(double,d,WIDTH) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; horizontal ops / reductions + +declare i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8>) nounwind readnone + +define i64 @__movmsk(<8 x MASK>) nounwind readnone alwaysinline { + %m8 = trunc <8 x MASK> %0 to <8 x i8> + %mask8 = shufflevector <8 x i8> %m8, <8 x i8> zeroinitializer, + <16 x i32> + %m = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %mask8) + %m64 = zext i32 %m to i64 + ret i64 %m64 +} + +define i1 @__any(<8 x MASK>) nounwind readnone alwaysinline { + %m = call i64 @__movmsk(<8 x MASK> %0) + %mne = icmp ne i64 %m, 0 + ret i1 %mne +} + +define i1 @__all(<8 x MASK>) nounwind readnone alwaysinline { + %m = call i64 @__movmsk(<8 x MASK> %0) + %meq = icmp eq i64 %m, ALL_ON_MASK + ret i1 %meq +} + +define i1 @__none(<8 x MASK>) nounwind readnone alwaysinline { + %m = call i64 @__movmsk(<8 x MASK> %0) + %meq = icmp eq i64 
%m, 0 + ret i1 %meq +} + +declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone + +define i16 @__reduce_add_int8(<8 x i8>) nounwind readnone alwaysinline { + %wide8 = shufflevector <8 x i8> %0, <8 x i8> zeroinitializer, + <16 x i32> + %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8, + <16 x i8> zeroinitializer) + %r0 = extractelement <2 x i64> %rv, i32 0 + %r1 = extractelement <2 x i64> %rv, i32 1 + %r = add i64 %r0, %r1 + %r16 = trunc i64 %r to i16 + ret i16 %r16 +} + +define internal <8 x i16> @__add_varying_i16(<8 x i16>, + <8 x i16>) nounwind readnone alwaysinline { + %r = add <8 x i16> %0, %1 + ret <8 x i16> %r +} + +define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline { + %r = add i16 %0, %1 + ret i16 %r +} + +define i16 @__reduce_add_int16(<8 x i16>) nounwind readnone alwaysinline { + reduce8(i16, @__add_varying_i16, @__add_uniform_i16) +} + +define internal <8 x float> @__add_varying_float(<8 x float>, <8 x float>) { + %r = fadd <8 x float> %0, %1 + ret <8 x float> %r +} + +define internal float @__add_uniform_float(float, float) { + %r = fadd float %0, %1 + ret float %r +} + +define float @__reduce_add_float(<8 x float>) nounwind readonly alwaysinline { + reduce8(float, @__add_varying_float, @__add_uniform_float) +} + +define float @__reduce_min_float(<8 x float>) nounwind readnone { + reduce8(float, @__min_varying_float, @__min_uniform_float) +} + +define float @__reduce_max_float(<8 x float>) nounwind readnone { + reduce8(float, @__max_varying_float, @__max_uniform_float) +} + +define internal <8 x i32> @__add_varying_int32(<8 x i32>, <8 x i32>) { + %r = add <8 x i32> %0, %1 + ret <8 x i32> %r +} + +define internal i32 @__add_uniform_int32(i32, i32) { + %r = add i32 %0, %1 + ret i32 %r +} + +define i32 @__reduce_add_int32(<8 x i32>) nounwind readnone { + reduce8(i32, @__add_varying_int32, @__add_uniform_int32) +} + +define i32 @__reduce_min_int32(<8 x i32>) nounwind readnone { + reduce8(i32, @__min_varying_int32, @__min_uniform_int32) +} + +define i32 @__reduce_max_int32(<8 x i32>) nounwind readnone { + reduce8(i32, @__max_varying_int32, @__max_uniform_int32) +} + +define i32 @__reduce_min_uint32(<8 x i32>) nounwind readnone { + reduce8(i32, @__min_varying_uint32, @__min_uniform_uint32) +} + +define i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone { + reduce8(i32, @__max_varying_uint32, @__max_uniform_uint32) +} + +define internal <8 x double> @__add_varying_double(<8 x double>, <8 x double>) { + %r = fadd <8 x double> %0, %1 + ret <8 x double> %r +} + +define internal double @__add_uniform_double(double, double) { + %r = fadd double %0, %1 + ret double %r +} + +define double @__reduce_add_double(<8 x double>) nounwind readnone { + reduce8(double, @__add_varying_double, @__add_uniform_double) +} + +define double @__reduce_min_double(<8 x double>) nounwind readnone { + reduce8(double, @__min_varying_double, @__min_uniform_double) +} + +define double @__reduce_max_double(<8 x double>) nounwind readnone { + reduce8(double, @__max_varying_double, @__max_uniform_double) +} + +define internal <8 x i64> @__add_varying_int64(<8 x i64>, <8 x i64>) { + %r = add <8 x i64> %0, %1 + ret <8 x i64> %r +} + +define internal i64 @__add_uniform_int64(i64, i64) { + %r = add i64 %0, %1 + ret i64 %r +} + +define i64 @__reduce_add_int64(<8 x i64>) nounwind readnone { + reduce8(i64, @__add_varying_int64, @__add_uniform_int64) +} + +define i64 @__reduce_min_int64(<8 x i64>) nounwind readnone { + reduce8(i64, @__min_varying_int64, 
@__min_uniform_int64) +} + +define i64 @__reduce_max_int64(<8 x i64>) nounwind readnone { + reduce8(i64, @__max_varying_int64, @__max_uniform_int64) +} + +define i64 @__reduce_min_uint64(<8 x i64>) nounwind readnone { + reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64) +} + +define i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone { + reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64) +} + +reduce_equal(8) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; masked store + +define void @__masked_store_blend_i64(<8 x i64>* nocapture, <8 x i64>, + <8 x MASK> %mask) nounwind + alwaysinline { + %mask_as_i1 = trunc <8 x MASK> %mask to <8 x i1> + %old = load <8 x i64>* %0, align 4 + %blend = select <8 x i1> %mask_as_i1, <8 x i64> %1, <8 x i64> %old + store <8 x i64> %blend, <8 x i64>* %0, align 4 + ret void +} + +define void @__masked_store_blend_i32(<8 x i32>* nocapture, <8 x i32>, + <8 x MASK> %mask) nounwind alwaysinline { + %mask_as_i1 = trunc <8 x MASK> %mask to <8 x i1> + %old = load <8 x i32>* %0, align 4 + %blend = select <8 x i1> %mask_as_i1, <8 x i32> %1, <8 x i32> %old + store <8 x i32> %blend, <8 x i32>* %0, align 4 + ret void +} + +define void @__masked_store_blend_i16(<8 x i16>* nocapture, <8 x i16>, + <8 x MASK> %mask) nounwind alwaysinline { + %mask_as_i1 = trunc <8 x MASK> %mask to <8 x i1> + %old = load <8 x i16>* %0, align 4 + %blend = select <8 x i1> %mask_as_i1, <8 x i16> %1, <8 x i16> %old + store <8 x i16> %blend, <8 x i16>* %0, align 4 + ret void +} + +define void @__masked_store_blend_i8(<8 x i8>* nocapture, <8 x i8>, + <8 x MASK> %mask) nounwind alwaysinline { + %mask_as_i1 = trunc <8 x MASK> %mask to <8 x i1> + %old = load <8 x i8>* %0, align 4 + %blend = select <8 x i1> %mask_as_i1, <8 x i8> %1, <8 x i8> %old + store <8 x i8> %blend, <8 x i8>* %0, align 4 + ret void +} + +gen_masked_store(i8) +gen_masked_store(i16) +gen_masked_store(i32) +gen_masked_store(i64) + +masked_store_float_double() + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; unaligned loads/loads+broadcasts + +masked_load(i8, 1) +masked_load(i16, 2) +masked_load(i32, 4) +masked_load(float, 4) +masked_load(i64, 8) +masked_load(double, 8) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; gather/scatter + +; define these with the macros from stdlib.m4 + +gen_gather_factored(i8) +gen_gather_factored(i16) +gen_gather_factored(i32) +gen_gather_factored(float) +gen_gather_factored(i64) +gen_gather_factored(double) + +gen_scatter(i8) +gen_scatter(i16) +gen_scatter(i32) +gen_scatter(float) +gen_scatter(i64) +gen_scatter(double) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8>, <16 x i8>) nounwind readnone + +define <8 x i8> @__avg_up_uint8(<8 x i8>, <8 x i8>) { + %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, + <16 x i32> + %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector <16 x i8> %r16, <16 x i8> undef, + <8 x i32> + ret <8 x i8> %r +} + +declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone + +define <8 x i16> @__avg_up_uint16(<8 x i16>, <8 x i16>) { + %r = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %0, <8 x i16> %1) + ret <8 x i16> %r +} + +define_avg_up_int8() +define_avg_up_int16() +define_down_avgs() diff --git a/builtins/target-sse4-8.ll 
b/builtins/target-sse4-8.ll new file mode 100644 index 00000000..69b355e3 --- /dev/null +++ b/builtins/target-sse4-8.ll @@ -0,0 +1,492 @@ +;; Copyright (c) 2013, Google, Inc. +;; All rights reserved. +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are +;; met: +;; +;; * Redistributions of source code must retain the above copyright +;; notice, this list of conditions and the following disclaimer. +;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; +;; * Neither the name of Google, Inc. nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; Define common 4-wide stuff +define(`WIDTH',`16') +define(`MASK',`i8') +include(`util.m4') + +stdlib_core() +packed_load_and_store() +scans() +int64minmax() + +include(`target-sse4-common.ll') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; half conversion routines + +declare float @__half_to_float_uniform(i16 %v) nounwind readnone +declare @__half_to_float_varying( %v) nounwind readnone +declare i16 @__float_to_half_uniform(float %v) nounwind readnone +declare @__float_to_half_varying( %v) nounwind readnone + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rcp + +declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone + +define @__rcp_varying_float() nounwind readonly alwaysinline { + unary4to16(call, float, @llvm.x86.sse.rcp.ps, %0) + ; do one N-R iteration to improve precision + ; float iv = __rcp_v(v); + ; return iv * (2. - v * iv); + %v_iv = fmul <16 x float> %0, %call + %two_minus = fsub <16 x float> , %v_iv + %iv_mul = fmul <16 x float> %call, %two_minus + ret <16 x float> %iv_mul +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; rsqrt + +declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone + +define <16 x float> @__rsqrt_varying_float(<16 x float> %v) nounwind readonly alwaysinline { + ; float is = __rsqrt_v(v); + unary4to16(is, float, @llvm.x86.sse.rsqrt.ps, %v) + ; Newton-Raphson iteration to improve precision + ; return 0.5 * is * (3. 
- (v * is) * is); + %v_is = fmul <16 x float> %v, %is + %v_is_is = fmul <16 x float> %v_is, %is + %three_sub = fsub <16 x float> , %v_is_is + %is_mul = fmul <16 x float> %is, %three_sub + %half_scale = fmul <16 x float> , %is_mul + ret <16 x float> %half_scale +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; sqrt + +declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone + +define <16 x float> @__sqrt_varying_float(<16 x float>) nounwind readonly alwaysinline { + unary4to16(call, float, @llvm.x86.sse.sqrt.ps, %0) + ret <16 x float> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; double precision sqrt + +declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone + +define <16 x double> @__sqrt_varying_double(<16 x double>) nounwind +alwaysinline { + unary2to16(ret, double, @llvm.x86.sse2.sqrt.pd, %0) + ret <16 x double> %ret +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rounding floats + +declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone + +define <16 x float> @__round_varying_float(<16 x float>) nounwind readonly alwaysinline { + ; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8 + round4to16(%0, 8) +} + +define <16 x float> @__floor_varying_float(<16 x float>) nounwind readonly alwaysinline { + ; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9 + round4to16(%0, 9) +} + +define <16 x float> @__ceil_varying_float(<16 x float>) nounwind readonly alwaysinline { + ; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10 + round4to16(%0, 10) +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rounding doubles + +declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone + +define <16 x double> @__round_varying_double(<16 x double>) nounwind readonly alwaysinline { +; XXXround2to4double(%0, 8) + ; FIXME: need round2to16double in util.m4... 
+ ret <16 x double> undef +} + +define <16 x double> @__floor_varying_double(<16 x double>) nounwind readonly alwaysinline { + ; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9 +; XXXround2to4double(%0, 9) + ret <16 x double> undef +} + +define <16 x double> @__ceil_varying_double(<16 x double>) nounwind readonly alwaysinline { + ; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10 +; XXXround2to4double(%0, 10) + ret <16 x double> undef +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; float min/max + +declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone +declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone + +define <16 x float> @__max_varying_float(<16 x float>, <16 x float>) nounwind readonly alwaysinline { + binary4to16(call, float, @llvm.x86.sse.max.ps, %0, %1) + ret <16 x float> %call +} + +define <16 x float> @__min_varying_float(<16 x float>, <16 x float>) nounwind readonly alwaysinline { + binary4to16(call, float, @llvm.x86.sse.min.ps, %0, %1) + ret <16 x float> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int32 min/max + +define <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline { + binary4to16(call, i32, @llvm.x86.sse41.pminsd, %0, %1) + ret <16 x i32> %call +} + +define <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline { + binary4to16(call, i32, @llvm.x86.sse41.pmaxsd, %0, %1) + ret <16 x i32> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; unsigned int min/max + +define <16 x i32> @__min_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline { + binary4to16(call, i32, @llvm.x86.sse41.pminud, %0, %1) + ret <16 x i32> %call +} + +define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline { + binary4to16(call, i32, @llvm.x86.sse41.pmaxud, %0, %1) + ret <16 x i32> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; double precision min/max + +declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone +declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone + +define <16 x double> @__min_varying_double(<16 x double>, <16 x double>) nounwind readnone { + binary2to16(ret, double, @llvm.x86.sse2.min.pd, %0, %1) + ret <16 x double> %ret +} + +define <16 x double> @__max_varying_double(<16 x double>, <16 x double>) nounwind readnone { + binary2to16(ret, double, @llvm.x86.sse2.max.pd, %0, %1) + ret <16 x double> %ret +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; svml + +; FIXME + +include(`svml.m4') +svml_stubs(float,f,WIDTH) +svml_stubs(double,d,WIDTH) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; horizontal ops / reductions + +declare i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8>) nounwind readnone + +define i64 @__movmsk(<16 x i8>) nounwind readnone alwaysinline { + %m = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %0) + %m64 = zext i32 %m to i64 + ret i64 %m64 +} + +define i1 @__any(<16 x i8>) nounwind readnone alwaysinline { + %m = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %0) + %mne = icmp ne i32 %m, 0 + ret i1 %mne +} + +define i1 @__all(<16 x i8>) nounwind readnone alwaysinline { + %m = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %0) + %meq = icmp 
eq i32 %m, ALL_ON_MASK + ret i1 %meq +} + +define i1 @__none(<16 x i8>) nounwind readnone alwaysinline { + %m = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %0) + %meq = icmp eq i32 %m, 0 + ret i1 %meq +} + +declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone + +define i16 @__reduce_add_int8(<16 x i8>) nounwind readnone alwaysinline { + %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %0, + <16 x i8> zeroinitializer) + %r0 = extractelement <2 x i64> %rv, i32 0 + %r1 = extractelement <2 x i64> %rv, i32 1 + %r = add i64 %r0, %r1 + %r16 = trunc i64 %r to i16 + ret i16 %r16 +} + +define internal <16 x i16> @__add_varying_i16(<16 x i16>, + <16 x i16>) nounwind readnone alwaysinline { + %r = add <16 x i16> %0, %1 + ret <16 x i16> %r +} + +define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline { + %r = add i16 %0, %1 + ret i16 %r +} + +define i16 @__reduce_add_int16(<16 x i16>) nounwind readnone alwaysinline { + reduce16(i16, @__add_varying_i16, @__add_uniform_i16) +} + +define internal <16 x float> @__add_varying_float(<16 x float>, <16 x float>) { + %r = fadd <16 x float> %0, %1 + ret <16 x float> %r +} + +define internal float @__add_uniform_float(float, float) { + %r = fadd float %0, %1 + ret float %r +} + +define float @__reduce_add_float(<16 x float>) nounwind readonly alwaysinline { + reduce16(float, @__add_varying_float, @__add_uniform_float) +} + +define float @__reduce_min_float(<16 x float>) nounwind readnone { + reduce16(float, @__min_varying_float, @__min_uniform_float) +} + +define float @__reduce_max_float(<16 x float>) nounwind readnone { + reduce16(float, @__max_varying_float, @__max_uniform_float) +} + +define internal <16 x i32> @__add_varying_int32(<16 x i32>, <16 x i32>) { + %r = add <16 x i32> %0, %1 + ret <16 x i32> %r +} + +define internal i32 @__add_uniform_int32(i32, i32) { + %r = add i32 %0, %1 + ret i32 %r +} + +define i32 @__reduce_add_int32(<16 x i32>) nounwind readnone { + reduce16(i32, @__add_varying_int32, @__add_uniform_int32) +} + +define i32 @__reduce_min_int32(<16 x i32>) nounwind readnone { + reduce16(i32, @__min_varying_int32, @__min_uniform_int32) +} + +define i32 @__reduce_max_int32(<16 x i32>) nounwind readnone { + reduce16(i32, @__max_varying_int32, @__max_uniform_int32) +} + +define i32 @__reduce_min_uint32(<16 x i32>) nounwind readnone { + reduce16(i32, @__min_varying_uint32, @__min_uniform_uint32) +} + +define i32 @__reduce_max_uint32(<16 x i32>) nounwind readnone { + reduce16(i32, @__max_varying_uint32, @__max_uniform_uint32) +} + +define internal <16 x double> @__add_varying_double(<16 x double>, <16 x double>) { + %r = fadd <16 x double> %0, %1 + ret <16 x double> %r +} + +define internal double @__add_uniform_double(double, double) { + %r = fadd double %0, %1 + ret double %r +} + +define double @__reduce_add_double(<16 x double>) nounwind readnone { + reduce16(double, @__add_varying_double, @__add_uniform_double) +} + +define double @__reduce_min_double(<16 x double>) nounwind readnone { + reduce16(double, @__min_varying_double, @__min_uniform_double) +} + +define double @__reduce_max_double(<16 x double>) nounwind readnone { + reduce16(double, @__max_varying_double, @__max_uniform_double) +} + +define internal <16 x i64> @__add_varying_int64(<16 x i64>, <16 x i64>) { + %r = add <16 x i64> %0, %1 + ret <16 x i64> %r +} + +define internal i64 @__add_uniform_int64(i64, i64) { + %r = add i64 %0, %1 + ret i64 %r +} + +define i64 @__reduce_add_int64(<16 x i64>) nounwind readnone { + 
reduce16(i64, @__add_varying_int64, @__add_uniform_int64) +} + +define i64 @__reduce_min_int64(<16 x i64>) nounwind readnone { + reduce16(i64, @__min_varying_int64, @__min_uniform_int64) +} + +define i64 @__reduce_max_int64(<16 x i64>) nounwind readnone { + reduce16(i64, @__max_varying_int64, @__max_uniform_int64) +} + +define i64 @__reduce_min_uint64(<16 x i64>) nounwind readnone { + reduce16(i64, @__min_varying_uint64, @__min_uniform_uint64) +} + +define i64 @__reduce_max_uint64(<16 x i64>) nounwind readnone { + reduce16(i64, @__max_varying_uint64, @__max_uniform_uint64) +} + +reduce_equal(16) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; masked store + +define void @__masked_store_blend_i64(<16 x i64>* nocapture, <16 x i64>, + <16 x i8> %mask) nounwind + alwaysinline { + %mask_as_i1 = trunc <16 x MASK> %mask to <16 x i1> + %old = load <16 x i64>* %0, align 4 + %blend = select <16 x i1> %mask_as_i1, <16 x i64> %1, <16 x i64> %old + store <16 x i64> %blend, <16 x i64>* %0, align 4 + ret void +} + +define void @__masked_store_blend_i32(<16 x i32>* nocapture, <16 x i32>, + <16 x MASK> %mask) nounwind alwaysinline { + %mask_as_i1 = trunc <16 x MASK> %mask to <16 x i1> + %old = load <16 x i32>* %0, align 4 + %blend = select <16 x i1> %mask_as_i1, <16 x i32> %1, <16 x i32> %old + store <16 x i32> %blend, <16 x i32>* %0, align 4 + ret void +} + +define void @__masked_store_blend_i16(<16 x i16>* nocapture, <16 x i16>, + <16 x MASK> %mask) nounwind alwaysinline { + %mask_as_i1 = trunc <16 x MASK> %mask to <16 x i1> + %old = load <16 x i16>* %0, align 4 + %blend = select <16 x i1> %mask_as_i1, <16 x i16> %1, <16 x i16> %old + store <16 x i16> %blend, <16 x i16>* %0, align 4 + ret void +} + +declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone + +define void @__masked_store_blend_i8(<16 x i8>* nocapture, <16 x i8>, + <16 x MASK> %mask) nounwind alwaysinline { + %old = load <16 x i8>* %0, align 4 + %blend = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %old, <16 x i8> %1, + <16 x i8> %mask) + store <16 x i8> %blend, <16 x i8>* %0, align 4 + ret void +} + +gen_masked_store(i8) +gen_masked_store(i16) +gen_masked_store(i32) +gen_masked_store(i64) + +masked_store_float_double() + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; unaligned loads/loads+broadcasts + +masked_load(i8, 1) +masked_load(i16, 2) +masked_load(i32, 4) +masked_load(float, 4) +masked_load(i64, 8) +masked_load(double, 8) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; gather/scatter + +; define these with the macros from stdlib.m4 + +gen_gather_factored(i8) +gen_gather_factored(i16) +gen_gather_factored(i32) +gen_gather_factored(float) +gen_gather_factored(i64) +gen_gather_factored(double) + +gen_scatter(i8) +gen_scatter(i16) +gen_scatter(i32) +gen_scatter(float) +gen_scatter(i64) +gen_scatter(double) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8>, <16 x i8>) nounwind readnone + +define <16 x i8> @__avg_up_uint8(<16 x i8>, <16 x i8>) nounwind readnone { + %r = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %0, <16 x i8> %1) + ret <16 x i8> %r +} + +declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone + +define <16 x i16> @__avg_up_uint16(<16 x i16>, <16 x i16>) nounwind readnone { + v16tov8(i16, %0, %a0, %b0) + v16tov8(i16, %1, %a1, %b1) + %r0 = call <8 x 
i16> @llvm.x86.sse2.pavg.w(<8 x i16> %a0, <8 x i16> %a1) + %r1 = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %b0, <8 x i16> %b1) + v8tov16(i16, %r0, %r1, %r) + ret <16 x i16> %r +} + +define_avg_up_int8() +define_avg_up_int16() +define_down_avgs() diff --git a/builtins/target-sse4-x2.ll b/builtins/target-sse4-x2.ll index ccae4d51..842db53f 100644 --- a/builtins/target-sse4-x2.ll +++ b/builtins/target-sse4-x2.ll @@ -105,87 +105,14 @@ define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysin ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; svml stuff -declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone -declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone -declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone +include(`svml.m4') +;; single precision +svml_declare(float,f4,4) +svml_define_x(float,f4,4,f,8) - -define <8 x float> @__svml_sin(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_sinf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_cos(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_cosf4, %0) - ret <8 x float> %ret -} - -define void @__svml_sincos(<8 x float>, <8 x float> *, - <8 x float> *) nounwind readnone alwaysinline { - ; call svml_sincosf4 two times with the two 4-wide sub-vectors - %a = shufflevector <8 x float> %0, <8 x float> undef, - <4 x i32> - %b = shufflevector <8 x float> %0, <8 x float> undef, - <4 x i32> - - %cospa = alloca <4 x float> - %sa = call <4 x float> @__svml_sincosf4(<4 x float> * %cospa, <4 x float> %a) - - %cospb = alloca <4 x float> - %sb = call <4 x float> @__svml_sincosf4(<4 x float> * %cospb, <4 x float> %b) - - %sin = shufflevector <4 x float> %sa, <4 x float> %sb, - <8 x i32> - store <8 x float> %sin, <8 x float> * %1 - - %cosa = load <4 x float> * %cospa - %cosb = load <4 x float> * %cospb - %cos = shufflevector <4 x float> %cosa, <4 x float> %cosb, - <8 x i32> - store <8 x float> %cos, <8 x float> * %2 - - ret void -} - -define <8 x float> @__svml_tan(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_tanf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_atan(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_atanf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_atan2(<8 x float>, - <8 x float>) nounwind readnone alwaysinline { - binary4to8(ret, float, @__svml_atan2f4, %0, %1) - ret <8 x float> %ret -} - -define <8 x float> @__svml_exp(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_expf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_log(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_logf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_pow(<8 x float>, - <8 x float>) nounwind readnone alwaysinline { - binary4to8(ret, float, @__svml_powf4, %0, %1) - ret <8 x float> %ret -} +;; double precision +svml_declare(double,2,2) +svml_define_x(double,2,2,d,8) 
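The hand-written SVML wrappers deleted above are replaced by the svml_declare/svml_define_x macros, which generate the same split-and-recombine glue: a wide __svml_* entry point calls the 4-wide (or 2-wide, for double) library routine on each sub-vector and stitches the results back together. A minimal C sketch of that wrapping; vec4_sin is a stand-in for a 4-wide library entry point such as __svml_sinf4 and is not the real SVML interface:

#include <math.h>

typedef struct { float v[4]; } vec4f;

/* Stand-in for the 4-wide vector primitive supplied by the math library. */
static vec4f vec4_sin(vec4f x) {
    vec4f r;
    for (int i = 0; i < 4; ++i) r.v[i] = sinf(x.v[i]);
    return r;
}

/* 8-wide wrapper: run the 4-wide primitive on each half of the input,
 * the shape the svml_define_x macro expands to in IR. */
static void sin8(const float in[8], float out[8]) {
    vec4f lo, hi;
    for (int i = 0; i < 4; ++i) { lo.v[i] = in[i]; hi.v[i] = in[i + 4]; }
    lo = vec4_sin(lo);
    hi = vec4_sin(hi);
    for (int i = 0; i < 4; ++i) { out[i] = lo.v[i]; out[i + 4] = hi.v[i]; }
}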
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -309,6 +236,36 @@ define i1 @__none(<8 x i32>) nounwind readnone alwaysinline { ret i1 %cmp } +declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone + +define i16 @__reduce_add_int8(<8 x i8>) nounwind readnone alwaysinline { + %wide8 = shufflevector <8 x i8> %0, <8 x i8> zeroinitializer, + <16 x i32> + %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8, + <16 x i8> zeroinitializer) + %r0 = extractelement <2 x i64> %rv, i32 0 + %r1 = extractelement <2 x i64> %rv, i32 1 + %r = add i64 %r0, %r1 + %r16 = trunc i64 %r to i16 + ret i16 %r16 +} + +define internal <8 x i16> @__add_varying_i16(<8 x i16>, + <8 x i16>) nounwind readnone alwaysinline { + %r = add <8 x i16> %0, %1 + ret <8 x i16> %r +} + +define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline { + %r = add i16 %0, %1 + ret i16 %r +} + +define i16 @__reduce_add_int16(<8 x i16>) nounwind readnone alwaysinline { + reduce8(i16, @__add_varying_i16, @__add_uniform_i16) +} + define float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline { reduce8by4(float, @llvm.x86.sse.min.ps, @__min_uniform_float) } @@ -629,3 +586,9 @@ define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind r binary2to8(ret, double, @llvm.x86.sse2.max.pd, %0, %1) ret <8 x double> %ret } + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +define_avgs() + diff --git a/builtins/target-sse4.ll b/builtins/target-sse4.ll index f622b839..88be6c59 100644 --- a/builtins/target-sse4.ll +++ b/builtins/target-sse4.ll @@ -209,62 +209,14 @@ define <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind r ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; svml stuff -declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone -declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone -declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone +include(`svml.m4') +;; single precision +svml_declare(float,f4,4) +svml_define(float,f4,4,f) - -define <4 x float> @__svml_sin(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_sinf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_cos(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_cosf4(<4 x float> %0) - ret <4 x float> %ret -} - -define void @__svml_sincos(<4 x float>, <4 x float> *, <4 x float> *) nounwind readnone alwaysinline { - %s = call <4 x float> @__svml_sincosf4(<4 x float> * %2, <4 x float> %0) - store <4 x float> %s, <4 x float> * %1 - ret void -} - -define <4 x float> @__svml_tan(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_tanf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_atan(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_atanf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_atan2(<4 x float>, <4 x float>) 
nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_atan2f4(<4 x float> %0, <4 x float> %1) - ret <4 x float> %ret -} - -define <4 x float> @__svml_exp(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_expf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_log(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_logf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_pow(<4 x float>, <4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_powf4(<4 x float> %0, <4 x float> %1) - ret <4 x float> %ret -} +;; double precision +svml_declare(double,2,2) +svml_define_x(double,2,2,d,4) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; horizontal ops / reductions @@ -299,6 +251,36 @@ define i1 @__none(<4 x i32>) nounwind readnone alwaysinline { ret i1 %cmp } +declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone + +define i16 @__reduce_add_int8(<4 x i8>) nounwind readnone alwaysinline { + %wide8 = shufflevector <4 x i8> %0, <4 x i8> zeroinitializer, + <16 x i32> + %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8, + <16 x i8> zeroinitializer) + %r0 = extractelement <2 x i64> %rv, i32 0 + %r1 = extractelement <2 x i64> %rv, i32 1 + %r = add i64 %r0, %r1 + %r16 = trunc i64 %r to i16 + ret i16 %r16 +} + +define internal <4 x i16> @__add_varying_i16(<4 x i16>, + <4 x i16>) nounwind readnone alwaysinline { + %r = add <4 x i16> %0, %1 + ret <4 x i16> %r +} + +define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline { + %r = add i16 %0, %1 + ret i16 %r +} + +define i16 @__reduce_add_int16(<4 x i16>) nounwind readnone alwaysinline { + reduce4(i16, @__add_varying_i16, @__add_uniform_i16) +} + declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone define float @__reduce_add_float(<4 x float>) nounwind readonly alwaysinline { @@ -503,3 +485,9 @@ gen_scatter(i32) gen_scatter(float) gen_scatter(i64) gen_scatter(double) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +define_avgs() + diff --git a/builtins/util.m4 b/builtins/util.m4 index c19d4930..68fa818b 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -49,6 +49,63 @@ define(`MASK_HIGH_BIT_ON', ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; vector deconstruction utilities +;; split 8-wide vector into 2 4-wide vectors +;; +;; $1: vector element type +;; $2: 8-wide vector +;; $3: first 4-wide vector +;; $4: second 4-wide vector + +define(`v8tov4', ` + $3 = shufflevector <8 x $1> $2, <8 x $1> undef, + <4 x i32> + $4 = shufflevector <8 x $1> $2, <8 x $1> undef, + <4 x i32> +') + +define(`v16tov8', ` + $3 = shufflevector <16 x $1> $2, <16 x $1> undef, + <8 x i32> + $4 = shufflevector <16 x $1> $2, <16 x $1> undef, + <8 x i32> +') + +define(`v4tov2', ` + $3 = shufflevector <4 x $1> $2, <4 x $1> undef, <2 x i32> + $4 = shufflevector <4 x $1> $2, <4 x $1> undef, <2 x i32> +') + +define(`v8tov2', ` + $3 = shufflevector <8 x $1> $2, <8 x $1> undef, <2 x i32> + $4 = shufflevector <8 x $1> $2, <8 x $1> undef, <2 x i32> + $5 = shufflevector <8 x $1> $2, <8 x $1> undef, <2 x i32> + $6 = shufflevector <8 x $1> $2, <8 x $1> undef, <2 x i32> +') + +define(`v16tov4', ` + $3 = shufflevector <16 x $1> $2, <16 x $1> undef, <4 x i32> + $4 = shufflevector <16 x $1> $2, <16 x $1> undef, <4 x i32> + $5 = shufflevector <16 x $1> $2, 
<16 x $1> undef, <4 x i32> + $6 = shufflevector <16 x $1> $2, <16 x $1> undef, <4 x i32> +') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; vector assembly: wider vector from two narrower vectors +;; +;; $1: vector element type +;; $2: first n-wide vector +;; $3: second n-wide vector +;; $4: result 2*n-wide vector +define(`v8tov16', ` + $4 = shufflevector <8 x $1> $2, <8 x $1> $3, + <16 x i32> +') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; Helper macro for calling various SSE instructions for scalar values ;; but where the instruction takes a vector parameter. ;; $1 : name of variable to put the final value in @@ -156,10 +213,7 @@ define(`reduce16', ` ;; the final reduction define(`reduce8by4', ` - %v1 = shufflevector <8 x $1> %0, <8 x $1> undef, - <4 x i32> - %v2 = shufflevector <8 x $1> %0, <8 x $1> undef, - <4 x i32> + v8tov4($1, %0, %v1, %v2) %m1 = call <4 x $1> $2(<4 x $1> %v1, <4 x $1> %v2) %v3 = shufflevector <4 x $1> %m1, <4 x $1> undef, <4 x i32> @@ -266,30 +320,66 @@ define(`binary2to4', ` ;; $4: 8-wide operand value define(`unary4to8', ` - %$1_0 = shufflevector <8 x $2> $4, <8 x $2> undef, <4 x i32> - %v$1_0 = call <4 x $2> $3(<4 x $2> %$1_0) - %$1_1 = shufflevector <8 x $2> $4, <8 x $2> undef, <4 x i32> - %v$1_1 = call <4 x $2> $3(<4 x $2> %$1_1) - %$1 = shufflevector <4 x $2> %v$1_0, <4 x $2> %v$1_1, + %__$1_0 = shufflevector <8 x $2> $4, <8 x $2> undef, <4 x i32> + %__v$1_0 = call <4 x $2> $3(<4 x $2> %__$1_0) + %__$1_1 = shufflevector <8 x $2> $4, <8 x $2> undef, <4 x i32> + %__v$1_1 = call <4 x $2> $3(<4 x $2> %__$1_1) + %$1 = shufflevector <4 x $2> %__v$1_0, <4 x $2> %__v$1_1, + <8 x i32> +' +) + +;; $1: name of variable into which the final result should go +;; $2: scalar type of the input vector elements +;; $3: scalar type of the result vector elements +;; $4: 4-wide unary vector function to apply +;; $5: 8-wide operand value + +define(`unary4to8conv', ` + %$1_0 = shufflevector <8 x $2> $5, <8 x $2> undef, <4 x i32> + %v$1_0 = call <4 x $3> $4(<4 x $2> %$1_0) + %$1_1 = shufflevector <8 x $2> $5, <8 x $2> undef, <4 x i32> + %v$1_1 = call <4 x $3> $4(<4 x $2> %$1_1) + %$1 = shufflevector <4 x $3> %v$1_0, <4 x $3> %v$1_1, <8 x i32> ' ) define(`unary4to16', ` - %$1_0 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> - %v$1_0 = call <4 x $2> $3(<4 x $2> %$1_0) - %$1_1 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> - %v$1_1 = call <4 x $2> $3(<4 x $2> %$1_1) - %$1_2 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> - %v$1_2 = call <4 x $2> $3(<4 x $2> %$1_2) - %$1_3 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> - %v$1_3 = call <4 x $2> $3(<4 x $2> %$1_3) + %__$1_0 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> + %__v$1_0 = call <4 x $2> $3(<4 x $2> %__$1_0) + %__$1_1 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> + %__v$1_1 = call <4 x $2> $3(<4 x $2> %__$1_1) + %__$1_2 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> + %__v$1_2 = call <4 x $2> $3(<4 x $2> %__$1_2) + %__$1_3 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> + %__v$1_3 = call <4 x $2> $3(<4 x $2> %__$1_3) - %$1a = shufflevector <4 x $2> %v$1_0, <4 x $2> %v$1_1, + %__$1a = shufflevector <4 x $2> %__v$1_0, <4 x $2> %__v$1_1, <8 x i32> - %$1b = shufflevector <4 x $2> %v$1_2, <4 x $2> %v$1_3, + %__$1b = shufflevector <4 x $2> %__v$1_2, <4 x $2> %__v$1_3, <8 x i32> - %$1 = shufflevector <8 x $2> %$1a, <8 x $2> %$1b, + %$1 = shufflevector <8 x $2> %__$1a, <8 x $2> 
%__$1b, + <16 x i32> +' +) + +define(`unary4to16conv', ` + %$1_0 = shufflevector <16 x $2> $5, <16 x $2> undef, <4 x i32> + %v$1_0 = call <4 x $3> $4(<4 x $2> %$1_0) + %$1_1 = shufflevector <16 x $2> $5, <16 x $2> undef, <4 x i32> + %v$1_1 = call <4 x $3> $4(<4 x $2> %$1_1) + %$1_2 = shufflevector <16 x $2> $5, <16 x $2> undef, <4 x i32> + %v$1_2 = call <4 x $3> $4(<4 x $2> %$1_2) + %$1_3 = shufflevector <16 x $2> $5, <16 x $2> undef, <4 x i32> + %v$1_3 = call <4 x $3> $4(<4 x $2> %$1_3) + + %$1a = shufflevector <4 x $3> %v$1_0, <4 x $3> %v$1_1, + <8 x i32> + %$1b = shufflevector <4 x $3> %v$1_2, <4 x $3> %v$1_3, + <8 x i32> + %$1 = shufflevector <8 x $3> %$1a, <8 x $3> %$1b, <16 x i32> ' @@ -411,6 +501,42 @@ define(`unary2to8', ` ' ) +define(`unary2to16', ` + %$1_0 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_0 = call <2 x $2> $3(<2 x $2> %$1_0) + %$1_1 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_1 = call <2 x $2> $3(<2 x $2> %$1_1) + %$1_2 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_2 = call <2 x $2> $3(<2 x $2> %$1_2) + %$1_3 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_3 = call <2 x $2> $3(<2 x $2> %$1_3) + %$1_4 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_4 = call <2 x $2> $3(<2 x $2> %$1_4) + %$1_5 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_5 = call <2 x $2> $3(<2 x $2> %$1_5) + %$1_6 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_6 = call <2 x $2> $3(<2 x $2> %$1_6) + %$1_7 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_7 = call <2 x $2> $3(<2 x $2> %$1_7) + %$1a = shufflevector <2 x $2> %v$1_0, <2 x $2> %v$1_1, + <4 x i32> + %$1b = shufflevector <2 x $2> %v$1_2, <2 x $2> %v$1_3, + <4 x i32> + %$1ab = shufflevector <4 x $2> %$1a, <4 x $2> %$1b, + <8 x i32> + %$1c = shufflevector <2 x $2> %v$1_4, <2 x $2> %v$1_5, + <4 x i32> + %$1d = shufflevector <2 x $2> %v$1_6, <2 x $2> %v$1_7, + <4 x i32> + %$1cd = shufflevector <4 x $2> %$1c, <4 x $2> %$1d, + <8 x i32> + + %$1 = shufflevector <8 x $2> %$1ab, <8 x $2> %$1cd, + <16 x i32> +' +) + ;; Maps an 2-wide binary function to two 8-wide vector operands ;; $1: name of variable into which the final result should go ;; $2: scalar type of the vector elements @@ -432,12 +558,58 @@ define(`binary2to8', ` %$1_3b = shufflevector <8 x $2> $5, <8 x $2> undef, <2 x i32> %v$1_3 = call <2 x $2> $3(<2 x $2> %$1_3a, <2 x $2> %$1_3b) + %$1a = shufflevector <2 x $2> %v$1_0, <2 x $2> %v$1_1, + <4 x i32> + %$1b = shufflevector <2 x $2> %v$1_2, <2 x $2> %v$1_3, + <4 x i32> + %$1 = shufflevector <4 x $2> %$1a, <4 x $2> %$1b, + <8 x i32> +' +) + +define(`binary2to16', ` + %$1_0a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_0b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_0 = call <2 x $2> $3(<2 x $2> %$1_0a, <2 x $2> %$1_0b) + %$1_1a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_1b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_1 = call <2 x $2> $3(<2 x $2> %$1_1a, <2 x $2> %$1_1b) + %$1_2a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_2b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_2 = call <2 x $2> $3(<2 x $2> %$1_2a, <2 x $2> %$1_2b) + %$1_3a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_3b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_3 = call <2 x $2> $3(<2 x $2> %$1_3a, <2 x $2> %$1_3b) + %$1_4a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_4b = 
shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_4 = call <2 x $2> $3(<2 x $2> %$1_4a, <2 x $2> %$1_4b) + %$1_5a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_5b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_5 = call <2 x $2> $3(<2 x $2> %$1_5a, <2 x $2> %$1_5b) + %$1_6a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_6b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_6 = call <2 x $2> $3(<2 x $2> %$1_6a, <2 x $2> %$1_6b) + %$1_7a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_7b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_7 = call <2 x $2> $3(<2 x $2> %$1_7a, <2 x $2> %$1_7b) + %$1a = shufflevector <2 x $2> %v$1_0, <2 x $2> %v$1_1, <4 x i32> %$1b = shufflevector <2 x $2> %v$1_2, <2 x $2> %v$1_3, <4 x i32> - %$1 = shufflevector <4 x $2> %$1a, <4 x $2> %$1b, + %$1ab = shufflevector <4 x $2> %$1a, <4 x $2> %$1b, <8 x i32> + + %$1c = shufflevector <2 x $2> %v$1_4, <2 x $2> %v$1_5, + <4 x i32> + %$1d = shufflevector <2 x $2> %v$1_6, <2 x $2> %v$1_7, + <4 x i32> + %$1cd = shufflevector <4 x $2> %$1c, <4 x $2> %$1d, + <8 x i32> + + %$1 = shufflevector <8 x $2> %$1ab, <8 x $2> %$1cd, + <16 x i32> ' ) @@ -460,6 +632,26 @@ ret <8 x float> %ret ' ) +define(`round4to16', ` +%v0 = shufflevector <16 x float> $1, <16 x float> undef, <4 x i32> +%v1 = shufflevector <16 x float> $1, <16 x float> undef, <4 x i32> +%v2 = shufflevector <16 x float> $1, <16 x float> undef, <4 x i32> +%v3 = shufflevector <16 x float> $1, <16 x float> undef, <4 x i32> +%r0 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v0, i32 $2) +%r1 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v1, i32 $2) +%r2 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v2, i32 $2) +%r3 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v3, i32 $2) +%ret01 = shufflevector <4 x float> %r0, <4 x float> %r1, + <8 x i32> +%ret23 = shufflevector <4 x float> %r2, <4 x float> %r3, + <8 x i32> +%ret = shufflevector <8 x float> %ret01, <8 x float> %ret23, + <16 x i32> +ret <16 x float> %ret +' +) + define(`round8to16', ` %v0 = shufflevector <16 x float> $1, <16 x float> undef, <8 x i32> @@ -690,6 +882,91 @@ shuffles(i64, 8) ;; $4: return type of the LLVM atomic type, in ispc naming paralance (e.g. int32) ;; $5: identity value for the operator (e.g. 0 for add, -1 for AND, ...) 
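The mask_converts helpers that follow exist so the atomic-reduction code can turn the execution mask into an all-ones/all-zeros value of the element's own width (sign-extending narrow masks, truncating wide ones); lanes can then be switched between their value and the operator's identity with plain bitwise ops, which is exactly what global_atomic_associative does before reducing. A minimal scalar C sketch of that idea, with illustrative names (mask1_to_mask32 and select_active are not in the patch):

#include <stdint.h>

/* i1 -> i32 style conversion: an "on" lane becomes 0xFFFFFFFF, "off" becomes 0. */
static inline uint32_t mask1_to_mask32(int lane_on) {
    return (uint32_t)-(int32_t)(lane_on != 0);
}

/* With an all-ones/all-zeros mask of element width, blending a lane between
 * its value and the operator's identity is just AND/OR, no branches needed. */
static inline uint32_t select_active(uint32_t value, uint32_t identity,
                                     uint32_t mask) {
    return (value & mask) | (identity & ~mask);
}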
+define(`mask_converts', ` +define internal <$1 x i8> @convertmask_i1_i8_$1(<$1 x i1>) { + %r = sext <$1 x i1> %0 to <$1 x i8> + ret <$1 x i8> %r +} +define internal <$1 x i16> @convertmask_i1_i16_$1(<$1 x i1>) { + %r = sext <$1 x i1> %0 to <$1 x i16> + ret <$1 x i16> %r +} +define internal <$1 x i32> @convertmask_i1_i32_$1(<$1 x i1>) { + %r = sext <$1 x i1> %0 to <$1 x i32> + ret <$1 x i32> %r +} +define internal <$1 x i64> @convertmask_i1_i64_$1(<$1 x i1>) { + %r = sext <$1 x i1> %0 to <$1 x i64> + ret <$1 x i64> %r +} + +define internal <$1 x i8> @convertmask_i8_i8_$1(<$1 x i8>) { + ret <$1 x i8> %0 +} +define internal <$1 x i16> @convertmask_i8_i86_$1(<$1 x i8>) { + %r = sext <$1 x i8> %0 to <$1 x i16> + ret <$1 x i16> %r +} +define internal <$1 x i32> @convertmask_i8_i32_$1(<$1 x i8>) { + %r = sext <$1 x i8> %0 to <$1 x i32> + ret <$1 x i32> %r +} +define internal <$1 x i64> @convertmask_i8_i64_$1(<$1 x i8>) { + %r = sext <$1 x i8> %0 to <$1 x i64> + ret <$1 x i64> %r +} + +define internal <$1 x i8> @convertmask_i16_i8_$1(<$1 x i16>) { + %r = trunc <$1 x i16> %0 to <$1 x i8> + ret <$1 x i8> %r +} +define internal <$1 x i16> @convertmask_i16_i16_$1(<$1 x i16>) { + ret <$1 x i16> %0 +} +define internal <$1 x i32> @convertmask_i16_i32_$1(<$1 x i16>) { + %r = sext <$1 x i16> %0 to <$1 x i32> + ret <$1 x i32> %r +} +define internal <$1 x i64> @convertmask_i16_i64_$1(<$1 x i16>) { + %r = sext <$1 x i16> %0 to <$1 x i64> + ret <$1 x i64> %r +} + +define internal <$1 x i8> @convertmask_i32_i8_$1(<$1 x i32>) { + %r = trunc <$1 x i32> %0 to <$1 x i8> + ret <$1 x i8> %r +} +define internal <$1 x i16> @convertmask_i32_i16_$1(<$1 x i32>) { + %r = trunc <$1 x i32> %0 to <$1 x i16> + ret <$1 x i16> %r +} +define internal <$1 x i32> @convertmask_i32_i32_$1(<$1 x i32>) { + ret <$1 x i32> %0 +} +define internal <$1 x i64> @convertmask_i32_i64_$1(<$1 x i32>) { + %r = sext <$1 x i32> %0 to <$1 x i64> + ret <$1 x i64> %r +} + +define internal <$1 x i8> @convertmask_i64_i8_$1(<$1 x i64>) { + %r = trunc <$1 x i64> %0 to <$1 x i8> + ret <$1 x i8> %r +} +define internal <$1 x i16> @convertmask_i64_i16_$1(<$1 x i64>) { + %r = trunc <$1 x i64> %0 to <$1 x i16> + ret <$1 x i16> %r +} +define internal <$1 x i32> @convertmask_i64_i32_$1(<$1 x i64>) { + %r = trunc <$1 x i64> %0 to <$1 x i32> + ret <$1 x i32> %r +} +define internal <$1 x i64> @convertmask_i64_i64_$1(<$1 x i64>) { + ret <$1 x i64> %0 +} +') + +mask_converts(WIDTH) + define(`global_atomic_associative', ` define <$1 x $3> @__atomic_$2_$4_global($3 * %ptr, <$1 x $3> %val, @@ -697,17 +974,10 @@ define <$1 x $3> @__atomic_$2_$4_global($3 * %ptr, <$1 x $3> %val, ; first, for any lanes where the mask is off, compute a vector where those lanes ; hold the identity value.. - ; for the bit tricks below, we need the mask to be sign extended to be - ; the size of the element type. - ifelse( - MASK,i1,`%mask = sext <$1 x MASK> %m to <$1 x $3>', - $3,i64, `%mask = sext <$1 x MASK> %m to <$1 x i64>', - $3,i32, ` - ; silly workaround to do %mask = %m, which is not possible directly.. - %maskmem = alloca <$1 x i32> - store <$1 x i32> %m, <$1 x i32> * %maskmem - %mask = load <$1 x i32> * %maskmem' - ) + ; for the bit tricks below, we need the mask to have the + ; the same element size as the element type. 
+ %mask = call <$1 x $3> @convertmask_`'MASK`'_$3_$1(<$1 x MASK> %m) + ; zero out any lanes that are off %valoff = and <$1 x $3> %val, %mask @@ -1551,11 +1821,6 @@ declare i1 @__is_compile_time_constant_mask( %mask) declare i1 @__is_compile_time_constant_uniform_int32(i32) declare i1 @__is_compile_time_constant_varying_int32() -define void @__pause() nounwind readnone { - call void asm sideeffect "pause", "~{dirflag},~{fpsr},~{flags}"() nounwind - ret void -} - ; This function declares placeholder masked store functions for the ; front-end to use. ; @@ -2440,13 +2705,16 @@ define i32 @__sext_uniform_bool(i1) nounwind readnone alwaysinline { } define @__sext_varying_bool() nounwind readnone alwaysinline { - ifelse(MASK,i1, ` - %se = sext %0 to +;; ifelse(MASK,i32, `ret %0', +;; `%se = sext %0 to +;; ret %se') + ifelse(MASK,i32, `%se = bitcast %0 to ', + MASK,i64, `%se = trunc %0 to ', + `%se = sext %0 to ') ret %se - ', ` - ret %0') } + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; memcpy/memmove/memset @@ -2830,17 +3098,11 @@ m4exit(`1') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; read hw clock +declare i64 @llvm.readcyclecounter() + define i64 @__clock() nounwind { -entry: - tail call void asm sideeffect "xorl %eax,%eax \0A cpuid", "~{rax},~{rbx},~{rcx},~{rdx},~{dirflag},~{fpsr},~{flags}"() nounwind - %0 = tail call { i32, i32 } asm sideeffect "rdtsc", "={ax},={dx},~{dirflag},~{fpsr},~{flags}"() nounwind - %asmresult = extractvalue { i32, i32 } %0, 0 - %asmresult1 = extractvalue { i32, i32 } %0, 1 - %conv = zext i32 %asmresult1 to i64 - %shl = shl nuw i64 %conv, 32 - %conv2 = zext i32 %asmresult to i64 - %or = or i64 %shl, %conv2 - ret i64 %or + %r = call i64 @llvm.readcyclecounter() + ret i64 %r } ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -2918,6 +3180,7 @@ define float @__stdlib_powf(float, float) nounwind readnone alwaysinline { } declare double @sin(double) nounwind readnone +declare double @asin(double) nounwind readnone declare double @cos(double) nounwind readnone declare void @sincos(double, double *, double *) nounwind readnone declare double @tan(double) nounwind readnone @@ -2932,6 +3195,11 @@ define double @__stdlib_sin(double) nounwind readnone alwaysinline { ret double %r } +define double @__stdlib_asin(double) nounwind readnone alwaysinline { + %r = call double @asin(double %0) + ret double %r +} + define double @__stdlib_cos(double) nounwind readnone alwaysinline { %r = call double @cos(double %0) ret double %r @@ -3201,8 +3469,8 @@ return: ;; $1: llvm type of elements (and suffix for function name) define(`gen_masked_store', ` -define void @__masked_store_$1(* nocapture, , ) nounwind alwaysinline { - per_lane(WIDTH, %2, ` +define void @__masked_store_$1(* nocapture, , ) nounwind alwaysinline { + per_lane(WIDTH, %2, ` %ptr_LANE_ID = getelementptr * %0, i32 0, i32 LANE %storeval_LANE_ID = extractelement %1, i32 LANE store $1 %storeval_LANE_ID, $1 * %ptr_LANE_ID') @@ -3260,6 +3528,56 @@ define void @__masked_store_blend_i16(<4 x i16>* nocapture, <4 x i16>, } ') +define(`masked_store_blend_8_16_by_4_mask64', ` +define void @__masked_store_blend_i8(<4 x i8>* nocapture, <4 x i8>, + <4 x i64>) nounwind alwaysinline { + %old = load <4 x i8> * %0, align 1 + ifelse(LLVM_VERSION,LLVM_3_0,` + %old32 = bitcast <4 x i8> %old to i32 + %new32 = bitcast <4 x i8> %1 to i32 + + %mask8 = trunc <4 x i64> %2 to <4 x i8> + %mask32 = bitcast <4 x i8> %mask8 to i32 + %notmask32 = xor i32 
%mask32, -1 + + %newmasked = and i32 %new32, %mask32 + %oldmasked = and i32 %old32, %notmask32 + %result = or i32 %newmasked, %oldmasked + + %resultvec = bitcast i32 %result to <4 x i8> + ',` + %m = trunc <4 x i64> %2 to <4 x i1> + %resultvec = select <4 x i1> %m, <4 x i8> %1, <4 x i8> %old + ') + store <4 x i8> %resultvec, <4 x i8> * %0, align 1 + ret void +} + +define void @__masked_store_blend_i16(<4 x i16>* nocapture, <4 x i16>, + <4 x i64>) nounwind alwaysinline { + %old = load <4 x i16> * %0, align 2 + ifelse(LLVM_VERSION,LLVM_3_0,` + %old64 = bitcast <4 x i16> %old to i64 + %new64 = bitcast <4 x i16> %1 to i64 + + %mask16 = trunc <4 x i64> %2 to <4 x i16> + %mask64 = bitcast <4 x i16> %mask16 to i64 + %notmask64 = xor i64 %mask64, -1 + + %newmasked = and i64 %new64, %mask64 + %oldmasked = and i64 %old64, %notmask64 + %result = or i64 %newmasked, %oldmasked + + %resultvec = bitcast i64 %result to <4 x i16> + ',` + %m = trunc <4 x i64> %2 to <4 x i1> + %resultvec = select <4 x i1> %m, <4 x i16> %1, <4 x i16> %old + ') + store <4 x i16> %resultvec, <4 x i16> * %0, align 2 + ret void +} +') + define(`masked_store_blend_8_16_by_8', ` define void @__masked_store_blend_i8(<8 x i8>* nocapture, <8 x i8>, <8 x i32>) nounwind alwaysinline { @@ -3378,10 +3696,10 @@ define void @__masked_store_blend_i16(<16 x i16>* nocapture, <16 x i16>, define(`packed_load_and_store', ` define i32 @__packed_load_active(i32 * %startptr, * %val_ptr, - %full_mask) nounwind alwaysinline { + %full_mask) nounwind alwaysinline { entry: - %mask = call i64 @__movmsk( %full_mask) - %mask_known = call i1 @__is_compile_time_constant_mask( %full_mask) + %mask = call i64 @__movmsk( %full_mask) + %mask_known = call i1 @__is_compile_time_constant_mask( %full_mask) br i1 %mask_known, label %known_mask, label %unknown_mask known_mask: @@ -3432,10 +3750,10 @@ done: } define i32 @__packed_store_active(i32 * %startptr, %vals, - %full_mask) nounwind alwaysinline { + %full_mask) nounwind alwaysinline { entry: - %mask = call i64 @__movmsk( %full_mask) - %mask_known = call i1 @__is_compile_time_constant_mask( %full_mask) + %mask = call i64 @__movmsk( %full_mask) + %mask_known = call i1 @__is_compile_time_constant_mask( %full_mask) br i1 %mask_known, label %known_mask, label %unknown_mask known_mask: @@ -3544,10 +3862,10 @@ check_neighbors: %castvr = call <$1 x $4> @__rotate_i$6(<$1 x $4> %castvec, i32 1) %vr = bitcast <$1 x $4> %castvr to <$1 x $2> %eq = $5 $7 <$1 x $2> %vec, %vr - ifelse(MASK,i32, ` - %eq32 = sext <$1 x i1> %eq to <$1 x i32> - %eqmm = call i64 @__movmsk(<$1 x i32> %eq32)', ` - %eqmm = call i64 @__movmsk(<$1 x MASK> %eq)') + ifelse(MASK,i1, ` + %eqmm = call i64 @__movmsk(<$1 x MASK> %eq)', + `%eqm = sext <$1 x i1> %eq to <$1 x MASK> + %eqmm = call i64 @__movmsk(<$1 x MASK> %eqm)') %alleq = icmp eq i64 %eqmm, ALL_ON_MASK br i1 %alleq, label %all_equal, label %not_all_equal ', ` @@ -3722,9 +4040,9 @@ pl_done: define(`gen_gather_general', ` ; fully general 32-bit gather, takes array of pointers encoded as vector of i32s define @__gather32_$1( %ptrs, - %vecmask) nounwind readonly alwaysinline { + %vecmask) nounwind readonly alwaysinline { %ret_ptr = alloca - per_lane(WIDTH, %vecmask, ` + per_lane(WIDTH, %vecmask, ` %iptr_LANE_ID = extractelement %ptrs, i32 LANE %ptr_LANE_ID = inttoptr i32 %iptr_LANE_ID to $1 * %val_LANE_ID = load $1 * %ptr_LANE_ID @@ -3738,9 +4056,9 @@ define @__gather32_$1( %ptrs, ; fully general 64-bit gather, takes array of pointers encoded as vector of i32s define @__gather64_$1( %ptrs, - %vecmask) 
nounwind readonly alwaysinline { + %vecmask) nounwind readonly alwaysinline { %ret_ptr = alloca - per_lane(WIDTH, %vecmask, ` + per_lane(WIDTH, %vecmask, ` %iptr_LANE_ID = extractelement %ptrs, i32 LANE %ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to $1 * %val_LANE_ID = load $1 * %ptr_LANE_ID @@ -3804,7 +4122,7 @@ define @__gather_elt64_$1(i8 * %ptr, %offsets, i32 %o define @__gather_factored_base_offsets32_$1(i8 * %ptr, %offsets, i32 %offset_scale, %offset_delta, - %vecmask) nounwind readonly alwaysinline { + %vecmask) nounwind readonly alwaysinline { ; We can be clever and avoid the per-lane stuff for gathers if we are willing ; to require that the 0th element of the array being gathered from is always ; legal to read from (and we do indeed require that, given the benefits!) @@ -3813,13 +4131,13 @@ define @__gather_factored_base_offsets32_$1(i8 * %ptr, store zeroinitializer, * %offsetsPtr call void @__masked_store_blend_i32( * %offsetsPtr, %offsets, - %vecmask) + %vecmask) %newOffsets = load * %offsetsPtr %deltaPtr = alloca store zeroinitializer, * %deltaPtr call void @__masked_store_blend_i32( * %deltaPtr, %offset_delta, - %vecmask) + %vecmask) %newDelta = load * %deltaPtr %ret0 = call @__gather_elt32_$1(i8 * %ptr, %newOffsets, @@ -3835,7 +4153,7 @@ define @__gather_factored_base_offsets32_$1(i8 * %ptr, @__gather_factored_base_offsets64_$1(i8 * %ptr, %offsets, i32 %offset_scale, %offset_delta, - %vecmask) nounwind readonly alwaysinline { + %vecmask) nounwind readonly alwaysinline { ; We can be clever and avoid the per-lane stuff for gathers if we are willing ; to require that the 0th element of the array being gathered from is always ; legal to read from (and we do indeed require that, given the benefits!) @@ -3844,13 +4162,13 @@ define @__gather_factored_base_offsets64_$1(i8 * %ptr, store zeroinitializer, * %offsetsPtr call void @__masked_store_blend_i64( * %offsetsPtr, %offsets, - %vecmask) + %vecmask) %newOffsets = load * %offsetsPtr %deltaPtr = alloca store zeroinitializer, * %deltaPtr call void @__masked_store_blend_i64( * %deltaPtr, %offset_delta, - %vecmask) + %vecmask) %newDelta = load * %deltaPtr %ret0 = call @__gather_elt64_$1(i8 * %ptr, %newOffsets, @@ -3876,27 +4194,27 @@ gen_gather_factored($1) define @__gather_base_offsets32_$1(i8 * %ptr, i32 %offset_scale, %offsets, - %vecmask) nounwind readonly alwaysinline { + %vecmask) nounwind readonly alwaysinline { %scale_vec = bitcast i32 %offset_scale to <1 x i32> %smear_scale = shufflevector <1 x i32> %scale_vec, <1 x i32> undef, < forloop(i, 1, eval(WIDTH-1), `i32 0, ') i32 0 > %scaled_offsets = mul %smear_scale, %offsets %v = call @__gather_factored_base_offsets32_$1(i8 * %ptr, %scaled_offsets, i32 1, - zeroinitializer, %vecmask) + zeroinitializer, %vecmask) ret %v } define @__gather_base_offsets64_$1(i8 * %ptr, i32 %offset_scale, %offsets, - %vecmask) nounwind readonly alwaysinline { + %vecmask) nounwind readonly alwaysinline { %scale64 = zext i32 %offset_scale to i64 %scale_vec = bitcast i64 %scale64 to <1 x i64> %smear_scale = shufflevector <1 x i64> %scale_vec, <1 x i64> undef, < forloop(i, 1, eval(WIDTH-1), `i32 0, ') i32 0 > %scaled_offsets = mul %smear_scale, %offsets %v = call @__gather_factored_base_offsets64_$1(i8 * %ptr, %scaled_offsets, - i32 1, zeroinitializer, %vecmask) + i32 1, zeroinitializer, %vecmask) ret %v } @@ -3955,9 +4273,9 @@ define void @__scatter_elt64_$1(i8 * %ptr, %offsets, i32 %offset_s define void @__scatter_factored_base_offsets32_$1(i8* %base, %offsets, i32 %offset_scale, %offset_delta, %values, - 
%mask) nounwind alwaysinline { + %mask) nounwind alwaysinline { ;; And use the `per_lane' macro to do all of the per-lane work for scatter... - per_lane(WIDTH, %mask, ` + per_lane(WIDTH, %mask, ` call void @__scatter_elt32_$1(i8 * %base, %offsets, i32 %offset_scale, %offset_delta, %values, i32 LANE)') ret void @@ -3965,9 +4283,9 @@ define void @__scatter_factored_base_offsets32_$1(i8* %base, %offs define void @__scatter_factored_base_offsets64_$1(i8* %base, %offsets, i32 %offset_scale, %offset_delta, %values, - %mask) nounwind alwaysinline { + %mask) nounwind alwaysinline { ;; And use the `per_lane' macro to do all of the per-lane work for scatter... - per_lane(WIDTH, %mask, ` + per_lane(WIDTH, %mask, ` call void @__scatter_elt64_$1(i8 * %base, %offsets, i32 %offset_scale, %offset_delta, %values, i32 LANE)') ret void @@ -3975,8 +4293,8 @@ define void @__scatter_factored_base_offsets64_$1(i8* %base, %offs ; fully general 32-bit scatter, takes array of pointers encoded as vector of i32s define void @__scatter32_$1( %ptrs, %values, - %mask) nounwind alwaysinline { - per_lane(WIDTH, %mask, ` + %mask) nounwind alwaysinline { + per_lane(WIDTH, %mask, ` %iptr_LANE_ID = extractelement %ptrs, i32 LANE %ptr_LANE_ID = inttoptr i32 %iptr_LANE_ID to $1 * %val_LANE_ID = extractelement %values, i32 LANE @@ -3987,8 +4305,8 @@ define void @__scatter32_$1( %ptrs, %values, ; fully general 64-bit scatter, takes array of pointers encoded as vector of i64s define void @__scatter64_$1( %ptrs, %values, - %mask) nounwind alwaysinline { - per_lane(WIDTH, %mask, ` + %mask) nounwind alwaysinline { + per_lane(WIDTH, %mask, ` %iptr_LANE_ID = extractelement %ptrs, i32 LANE %ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to $1 * %val_LANE_ID = extractelement %values, i32 LANE @@ -4044,3 +4362,109 @@ define i1 @__rdrand_i64(i64 * %ptr) { ret i1 %good } ') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +define(`define_avg_up_uint8', ` +define @__avg_up_uint8(, ) { + %a16 = zext %0 to + %b16 = zext %1 to + %sum1 = add %a16, %b16 + %sum = add %sum1, < forloop(i, 1, eval(WIDTH-1), `i16 1, ') i16 1 > + %avg = lshr %sum, < forloop(i, 1, eval(WIDTH-1), `i16 1, ') i16 1 > + %r = trunc %avg to + ret %r +}') + +define(`define_avg_up_int8', ` +define @__avg_up_int8(, ) { + %a16 = sext %0 to + %b16 = sext %1 to + %sum1 = add %a16, %b16 + %sum = add %sum1, < forloop(i, 1, eval(WIDTH-1), `i16 1, ') i16 1 > + %avg = sdiv %sum, < forloop(i, 1, eval(WIDTH-1), `i16 2, ') i16 2 > + %r = trunc %avg to + ret %r +}') + +define(`define_avg_up_uint16', ` +define @__avg_up_uint16(, ) { + %a32 = zext %0 to + %b32 = zext %1 to + %sum1 = add %a32, %b32 + %sum = add %sum1, < forloop(i, 1, eval(WIDTH-1), `i32 1, ') i32 1 > + %avg = lshr %sum, < forloop(i, 1, eval(WIDTH-1), `i32 1, ') i32 1 > + %r = trunc %avg to + ret %r +}') + +define(`define_avg_up_int16', ` +define @__avg_up_int16(, ) { + %a32 = sext %0 to + %b32 = sext %1 to + %sum1 = add %a32, %b32 + %sum = add %sum1, < forloop(i, 1, eval(WIDTH-1), `i32 1, ') i32 1 > + %avg = sdiv %sum, < forloop(i, 1, eval(WIDTH-1), `i32 2, ') i32 2 > + %r = trunc %avg to + ret %r +}') + +define(`define_avg_down_uint8', ` +define @__avg_down_uint8(, ) { + %a16 = zext %0 to + %b16 = zext %1 to + %sum = add %a16, %b16 + %avg = lshr %sum, < forloop(i, 1, eval(WIDTH-1), `i16 1, ') i16 1 > + %r = trunc %avg to + ret %r +}') + +define(`define_avg_down_int8', ` +define @__avg_down_int8(, ) { + %a16 = sext %0 to + %b16 = sext %1 to + %sum = add %a16, %b16 + %avg = sdiv 
%sum, < forloop(i, 1, eval(WIDTH-1), `i16 2, ') i16 2 > + %r = trunc %avg to + ret %r +}') + +define(`define_avg_down_uint16', ` +define @__avg_down_uint16(, ) { + %a32 = zext %0 to + %b32 = zext %1 to + %sum = add %a32, %b32 + %avg = lshr %sum, < forloop(i, 1, eval(WIDTH-1), `i32 1, ') i32 1 > + %r = trunc %avg to + ret %r +}') + +define(`define_avg_down_int16', ` +define @__avg_down_int16(, ) { + %a32 = sext %0 to + %b32 = sext %1 to + %sum = add %a32, %b32 + %avg = sdiv %sum, < forloop(i, 1, eval(WIDTH-1), `i32 2, ') i32 2 > + %r = trunc %avg to + ret %r +}') + +define(`define_up_avgs', ` +define_avg_up_uint8() +define_avg_up_int8() +define_avg_up_uint16() +define_avg_up_int16() +') + +define(`define_down_avgs', ` +define_avg_down_uint8() +define_avg_down_int8() +define_avg_down_uint16() +define_avg_down_int16() +') + +define(`define_avgs', ` +define_up_avgs() +define_down_avgs() +') diff --git a/cbackend.cpp b/cbackend.cpp index d23bcc20..7d4b4cfc 100644 --- a/cbackend.cpp +++ b/cbackend.cpp @@ -3704,6 +3704,7 @@ void CWriter::lowerIntrinsics(llvm::Function &F) { case llvm::Intrinsic::sadd_with_overflow: case llvm::Intrinsic::trap: case llvm::Intrinsic::objectsize: + case llvm::Intrinsic::readcyclecounter: // We directly implement these intrinsics break; default: @@ -4056,6 +4057,9 @@ bool CWriter::visitBuiltinCall(llvm::CallInst &I, llvm::Intrinsic::ID ID, return true; case llvm::Intrinsic::objectsize: return true; + case llvm::Intrinsic::readcyclecounter: + Out << "__clock()"; + return true; } } diff --git a/check_env.py b/check_env.py new file mode 100755 index 00000000..8c90d895 --- /dev/null +++ b/check_env.py @@ -0,0 +1,102 @@ +#!/usr/bin/python +# +# Copyright (c) 2013, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +# IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +# OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +# // Author: Filippov Ilia + +import common +import sys +import os +import string +print_debug = common.print_debug +error = common.error +take_lines = common.take_lines + +exists = [False, False, False, False, False, False, False, False] +names = ["m4", "bison", "flex", "sde", "ispc", "clang", "gcc", "icc"] + +PATH_dir = string.split(os.getenv("PATH"), os.pathsep) +for counter in PATH_dir: + for i in range(0,8): + if os.path.exists(counter + os.sep + names[i]): + exists[i] = True + +print_debug("=== in PATH: ===\n", False, "") +print_debug("Tools:\n", False, "") +for i in range(0,3): + if exists[i]: + print_debug(take_lines(names[i] + " --version", "first"), False, "") + else: + error("you don't have " + names[i], 0) +if exists[0] and exists[1] and exists[2]: + if common.check_tools(2): + print_debug("Tools' versions are ok\n", False, "") +print_debug("\nSDE:\n", False, "") +if exists[3]: + print_debug(take_lines(names[3] + " --version", "first"), False, "") +else: + error("you don't have " + names[3], 2) +print_debug("\nISPC:\n", False, "") +if exists[4]: + print_debug(take_lines(names[4] + " --version", "first"), False, "") +else: + error("you don't have " + names[4], 2) +print_debug("\nC/C++ compilers:\n", False, "") +for i in range(5,8): + if exists[i]: + print_debug(take_lines(names[i] + " --version", "first"), False, "") + else: + error("you don't have " + names[i], 2) + +print_debug("\n=== in ISPC specific environment variables: ===\n", False, "") +if os.environ.get("LLVM_HOME") == None: + error("you have no LLVM_HOME", 2) +else: + print_debug("Your LLVM_HOME:" + os.environ.get("LLVM_HOME") + "\n", False, "") +if os.environ.get("ISPC_HOME") == None: + error("you have no ISPC_HOME", 2) +else: + print_debug("Your ISPC_HOME:" + os.environ.get("ISPC_HOME") + "\n", False, "") + if os.path.exists(os.environ.get("ISPC_HOME") + os.sep + "ispc"): + print_debug("You have ISPC in your ISPC_HOME: " + + take_lines(os.environ.get("ISPC_HOME") + os.sep + "ispc" + " --version", "first"), False, "") + else: + error("you don't have ISPC in your ISPC_HOME", 2) +if os.environ.get("SDE_HOME") == None: + error("You have no SDE_HOME", 2) +else: + print_debug("Your SDE_HOME:" + os.environ.get("SDE_HOME") + "\n", False, "") + if os.path.exists(os.environ.get("SDE_HOME") + os.sep + "sde"): + print_debug("You have sde in your SDE_HOME: " + + take_lines(os.environ.get("SDE_HOME") + os.sep + "sde" + " --version", "first"), False, "") + else: + error("you don't have any SDE in your ISPC_HOME", 2) diff --git a/common.py b/common.py new file mode 100644 index 00000000..be3e9526 --- /dev/null +++ b/common.py @@ -0,0 +1,124 @@ +#!/usr/bin/python +# +# Copyright (c) 2013, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. 
+# +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +# IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +# OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# // Author: Filippov Ilia +import sys +import os +import shutil + +def write_to_file(filename, line): + f = open(filename, 'a') + f.writelines(line) + f.close() + +#remove file if it exists +def remove_if_exists(filename): + if os.path.exists(filename): + if os.path.isdir(filename): + shutil.rmtree(filename) + else: + os.remove(filename) + +# detect version which is printed after command +def take_lines(command, which): + os.system(command + " > " + "temp_detect_version") + version = open("temp_detect_version") + if which == "first": + answer = version.readline() + if which == "all": + answer = version.readlines() + version.close() + remove_if_exists("temp_detect_version") + return answer + +# print versions of compilers +def print_version(ispc_test, ispc_ref, ref_compiler, s, perf_log, is_windows): + print_debug("\nUsing test compiler: " + take_lines(ispc_test + " --version", "first"), s, perf_log) + if ispc_ref != "": + print_debug("Using ref compiler: " + take_lines(ispc_ref + " --version", "first"), s, perf_log) + if is_windows == False: + temp1 = take_lines(ref_compiler + " --version", "first") + else: + os.system(ref_compiler + " 2>&1" + " 2> temp_detect_version > temp_detect_version1" ) + version = open("temp_detect_version") + temp1 = version.readline() + version.close() + remove_if_exists("temp_detect_version") + remove_if_exists("temp_detect_version1") + print_debug("Using C/C++ compiler: " + temp1 + "\n", s, perf_log) + +# print everything from scripts instead errors +def print_debug(line, silent, filename): + if silent == False: + sys.stdout.write(line) + sys.stdout.flush() + if os.environ.get("ISPC_HOME") != None: + if os.path.exists(os.environ.get("ISPC_HOME")): + write_to_file(os.environ["ISPC_HOME"] + os.sep + "notify_log.log", line) + if filename != "": + write_to_file(filename, line) + +# print errors from scripts +# type 1 for error in environment +# type 2 for warning +# type 3 for error of compiler or test which isn't the goal of script +def error(line, error_type): + line = line + "\n" + if error_type == 1: + sys.stderr.write("Fatal error: " + line) + sys.exit(1) + if error_type == 2: + sys.stderr.write("Warning: " + line) + if error_type == 0: + print_debug("FIND ERROR: " + line, False, "") + +def check_tools(m): + input_tools=[[[1,4],"m4 --version", "bad m4 version"], + [[2,4],"bison --version", "bad bison version"], + [[2,5], "flex --version", "bad flex version"]] + ret = 1 + for t in range(0,len(input_tools)): + t1 = ((take_lines(input_tools[t][1], "first"))[:-1].split(" ")) + for i in range(0,len(t1)): + t11 = t1[i].split(".") + f = True + for j in range(0,len(t11)): + if not t11[j].isdigit(): + f = False + if f == True: + for j in range(0,len(t11)): + if j < 
len(input_tools[t][0]): + if int(t11[j])" +"double precision floating point number, starting with dot, optional exponent +syn match cFloat display contained ".\d*d[-+]\=\d*\>" +"double precision floating point number, without dot, with exponent +syn match cFloat display contained "\d\+d[-+]\=\d\+\>" + " Default highlighting command -nargs=+ HiLink hi def link HiLink ispcStatement Statement diff --git a/ctx.cpp b/ctx.cpp index 1e79c97b..c50d22f9 100644 --- a/ctx.cpp +++ b/ctx.cpp @@ -1456,13 +1456,13 @@ FunctionEmitContext::I1VecToBoolVec(llvm::Value *b) { for (unsigned int i = 0; i < at->getNumElements(); ++i) { llvm::Value *elt = ExtractInst(b, i); llvm::Value *sext = SExtInst(elt, LLVMTypes::BoolVectorType, - LLVMGetName(elt, "_to_boolvec32")); + LLVMGetName(elt, "_to_boolvec")); ret = InsertInst(ret, sext, i); } return ret; } else - return SExtInst(b, LLVMTypes::BoolVectorType, LLVMGetName(b, "_to_i32")); + return SExtInst(b, LLVMTypes::BoolVectorType, LLVMGetName(b, "_to_boolvec")); } @@ -2781,6 +2781,7 @@ FunctionEmitContext::maskedStore(llvm::Value *value, llvm::Value *ptr, // Figure out if we need a 8, 16, 32 or 64-bit masked store. llvm::Function *maskedStoreFunc = NULL; + llvm::Type *llvmValueType = value->getType(); const PointerType *pt = CastType(valueType); if (pt != NULL) { @@ -2809,8 +2810,7 @@ FunctionEmitContext::maskedStore(llvm::Value *value, llvm::Value *ptr, else maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_i64"); } - else if (Type::Equal(valueType, AtomicType::VaryingBool) && - g->target->getMaskBitCount() == 1) { + else if (llvmValueType == LLVMTypes::Int1VectorType) { llvm::Value *notMask = BinaryOperator(llvm::Instruction::Xor, mask, LLVMMaskAllOn, "~mask"); llvm::Value *old = LoadInst(ptr); @@ -2823,28 +2823,22 @@ FunctionEmitContext::maskedStore(llvm::Value *value, llvm::Value *ptr, StoreInst(final, ptr); return; } - else if (Type::Equal(valueType, AtomicType::VaryingDouble)) { + else if (llvmValueType == LLVMTypes::DoubleVectorType) { maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_double"); } - else if (Type::Equal(valueType, AtomicType::VaryingInt64) || - Type::Equal(valueType, AtomicType::VaryingUInt64)) { + else if (llvmValueType == LLVMTypes::Int64VectorType) { maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_i64"); } - else if (Type::Equal(valueType, AtomicType::VaryingFloat)) { + else if (llvmValueType == LLVMTypes::FloatVectorType) { maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_float"); } - else if (Type::Equal(valueType, AtomicType::VaryingBool) || - Type::Equal(valueType, AtomicType::VaryingInt32) || - Type::Equal(valueType, AtomicType::VaryingUInt32) || - CastType(valueType) != NULL) { + else if (llvmValueType == LLVMTypes::Int32VectorType) { maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_i32"); } - else if (Type::Equal(valueType, AtomicType::VaryingInt16) || - Type::Equal(valueType, AtomicType::VaryingUInt16)) { + else if (llvmValueType == LLVMTypes::Int16VectorType) { maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_i16"); } - else if (Type::Equal(valueType, AtomicType::VaryingInt8) || - Type::Equal(valueType, AtomicType::VaryingUInt8)) { + else if (llvmValueType == LLVMTypes::Int8VectorType) { maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_i8"); } AssertPos(currentPos, maskedStoreFunc != NULL); diff --git a/decl.cpp b/decl.cpp index e7b3cdef..8a10543b 100644 --- a/decl.cpp +++ b/decl.cpp @@ -69,8 +69,15 @@ lApplyTypeQualifiers(int 
typeQualifiers, const Type *type, SourcePos pos) {
     if (type == NULL)
         return NULL;
 
-    if ((typeQualifiers & TYPEQUAL_CONST) != 0)
+    if ((typeQualifiers & TYPEQUAL_CONST) != 0) {
         type = type->GetAsConstType();
+    }
+
+    if ( ((typeQualifiers & TYPEQUAL_UNIFORM) != 0)
+         && ((typeQualifiers & TYPEQUAL_VARYING) != 0) ) {
+        Error(pos, "Type \"%s\" cannot be qualified with both uniform and varying.",
+              type->GetString().c_str());
+    }
 
     if ((typeQualifiers & TYPEQUAL_UNIFORM) != 0) {
         if (Type::Equal(type, AtomicType::Void))
@@ -84,9 +91,10 @@ lApplyTypeQualifiers(int typeQualifiers, const Type *type, SourcePos pos) {
         else
             type = type->GetAsVaryingType();
     }
-    else
+    else {
         if (Type::Equal(type, AtomicType::Void) == false)
             type = type->GetAsUnboundVariabilityType();
+    }
 
     if ((typeQualifiers & TYPEQUAL_UNSIGNED) != 0) {
         if ((typeQualifiers & TYPEQUAL_SIGNED) != 0)
@@ -124,6 +132,17 @@ DeclSpecs::DeclSpecs(const Type *t, StorageClass sc, int tq) {
     typeQualifiers = tq;
     soaWidth = 0;
     vectorSize = 0;
+    if (t != NULL) {
+        if (m->symbolTable->ContainsType(t)) {
+            // Typedefs might have uniform/varying qualifiers inside.
+            if (t->IsVaryingType()) {
+                typeQualifiers |= TYPEQUAL_VARYING;
+            }
+            else if (t->IsUniformType()) {
+                typeQualifiers |= TYPEQUAL_UNIFORM;
+            }
+        }
+    }
 }
@@ -229,6 +248,7 @@ Declarator::Declarator(DeclaratorKind dk, SourcePos p)
 void
 Declarator::InitFromDeclSpecs(DeclSpecs *ds) {
     const Type *baseType = ds->GetBaseType(pos);
+
     InitFromType(baseType, ds);
 
     if (type == NULL) {
@@ -591,6 +611,7 @@ Declaration::Declaration(DeclSpecs *ds, Declarator *d) {
 }
 
+
 std::vector
 Declaration::GetVariableDeclarations() const {
     Assert(declSpecs->storageClass != SC_TYPEDEF);
diff --git a/docs/ReleaseNotes.txt b/docs/ReleaseNotes.txt
index 007f283e..a8575ea0 100644
--- a/docs/ReleaseNotes.txt
+++ b/docs/ReleaseNotes.txt
@@ -1,3 +1,63 @@
+=== v1.5.0 === (27 September 2013)
+
+A major new version of ISPC with several new targets and important bug fixes.
+Here's a list of the most important changes, if you are using pre-built
+binaries (which are based on a patched version of LLVM 3.3):
+
+* The naming of targets was changed to explicitly include the data type width
+  and the number of threads in the gang. For example, avx2-i32x8 is an AVX2
+  target, which uses 32 bit types as a base and has 8 threads in a gang. The
+  old naming scheme is still supported, but deprecated.
+
+* New SSE4 targets for calculations based on 8 bit and 16 bit data types:
+  sse4-i8x16 and sse4-i16x8.
+
+* New AVX1 target for calculations based on 64 bit data types: avx1-i64x4.
+
+* SVML support was extended and improved.
+
+* The behavior of the -g switch was changed to not affect the optimization
+  level.
+
+* The ISPC debug infrastructure was redesigned. See --help-dev for more info
+  and enjoy the capabilities of the new --debug-phase= and --off-phase=
+  switches.
+
+* Fixed an auto-dispatch bug, which caused AVX code execution when the OS
+  doesn't support AVX (but the hardware does).
+
+* Fixed a bug that discarded the uniform/varying keyword in typedefs.
+
+* Several performance regressions were fixed.
+
+If you are building ISPC yourself, then the following changes are also
+available to you:
+
+* --cpu=slm for targeting Intel Atom codename Silvermont (if LLVM 3.4 is used).
+
+* ARM NEON targets are available (if enabled in the build system).
+
+* --debug-ir= is available to generate debug information based on LLVM
+  IR (if LLVM 3.4 is used). In the debugger you'll see LLVM IR instead of
+  source code.
+ +* A redesigned and improved test and configuration management system is + available to facilitate the process of building LLVM and testing ISPC + compiler. + +Standard library changes/fixes: + +* __pause() function was removed from standard library. + +* Fixed reduce_[min|max]_[float|double] intrinsics, which were producing + incorrect code under some conditions. + +Language changes: + +* By default a floating point constant without a suffix is a single precision + constant (32 bit). A new suffix "d" was introduced to allow double precision + constant (64 bit). Please refer to tests/double-consts.ispc for syntax + examples. + === v1.4.4 === (19 July 2013) A minor version update with several stability fixes requested by the customers. diff --git a/docs/build.sh b/docs/build.sh index a13f3231..4f4fbfe4 100755 --- a/docs/build.sh +++ b/docs/build.sh @@ -1,14 +1,16 @@ #!/bin/bash +rst2html=rst2html.py + for i in ispc perfguide faq; do - rst2html --template=template.txt --link-stylesheet \ + $rst2html --template=template.txt --link-stylesheet \ --stylesheet-path=css/style.css $i.rst > $i.html done -rst2html --template=template-news.txt --link-stylesheet \ +$rst2html --template=template-news.txt --link-stylesheet \ --stylesheet-path=css/style.css news.rst > news.html -rst2html --template=template-perf.txt --link-stylesheet \ +$rst2html --template=template-perf.txt --link-stylesheet \ --stylesheet-path=css/style.css perf.rst > perf.html #rst2latex --section-numbering --documentclass=article --documentoptions=DIV=9,10pt,letterpaper ispc.txt > ispc.tex diff --git a/docs/ispc.rst b/docs/ispc.rst old mode 100755 new mode 100644 index c6c63172..eac9b24e --- a/docs/ispc.rst +++ b/docs/ispc.rst @@ -270,6 +270,14 @@ new reserved words: ``unmasked``, ``foreach_unique``, ``foreach_active``, and ``in``. Any program that happens to have a variable or function with one of these names must be modified to rename that symbol. +Updating ISPC Programs For Changes In ISPC 1.5.0 +------------------------------------------------ + +This release adds support for double precision floating point constants. +Double precision floating point constants are floating point number with +``d`` suffix and optional exponent part. Here are some examples: 3.14d, +31.4d-1, 1.d, 1.0d, 1d-2. Note that floating point number without suffix is +treated as single precision constant. Getting Started with ISPC ========================= @@ -467,45 +475,100 @@ There are three options that affect the compilation target: ``--arch``, which sets the target architecture, ``--cpu``, which sets the target CPU, and ``--target``, which sets the target instruction set. -By default, the ``ispc`` compiler generates code for the 64-bit x86-64 -architecture (i.e. ``--arch=x86-64``.) To compile to a 32-bit x86 target, -supply ``--arch=x86`` on the command line: +If none of these options is specified, ``ispc`` generates code for the +architecture of the system the compiler is running on (i.e. 64-bit x86-64 +(``--arch=x86-64``) on x86 systems and ARM NEON on ARM systems. + +To compile to a 32-bit x86 target, for example, supply ``--arch=x86`` on +the command line: :: ispc foo.ispc -o foo.obj --arch=x86 -No other architectures are currently supported. +Currently-supported architectures are ``x86-64``, ``x86``, and ``arm``. The target CPU determines both the default instruction set used as well as which CPU architecture the code is tuned for. ``ispc --help`` provides a -list of a number of the supported CPUs. 
By default, the CPU type of the -system on which you're running ``ispc`` is used to determine the target -CPU. +list of all of the supported CPUs. By default, the CPU type of the system +on which you're running ``ispc`` is used to determine the target CPU. :: ispc foo.ispc -o foo.obj --cpu=corei7-avx -Finally, ``--target`` selects between the SSE2, SSE4, and AVX, and AVX2 -instruction sets. (As general context, SSE2 was first introduced in -processors that shipped in 2001, SSE4 was introduced in 2007, and -processors with AVX were introduced in 2010. AVX2 will be supported on -future CPUs based on Intel's "Haswell" architecture. Consult your CPU's -manual for specifics on which vector instruction set it supports.) +Finally, ``--target`` selects the target instruction set. The target +string is of the form ``[ISA]-i[mask size]x[gang size]``. For example, +``--target=avx2-i32x16`` specifies a target with the AVX2 instruction set, +a mask size of 32 bits, and a gang size of 16. + +The following target ISAs are supported: + +============ ========================================== +Target Description +------------ ------------------------------------------ +avx, avx1 AVX (2010-2011 era Intel CPUs) +avx1.1 AVX 1.1 (2012 era "Ivybridge" Intel CPUs) +avx2 AVX 2 target (2013- Intel "Haswell" CPUs) +neon ARM NEON +sse2 SSE2 (early 2000s era x86 CPUs) +sse4 SSE4 (generally 2008-2010 Intel CPUs) +============ ========================================== + +Consult your CPU's manual for specifics on which vector instruction set it +supports. + +The mask size may be 8, 16, or 32 bits, though not all combinations of ISAs +and mask sizes are supported. For best performance, the best general +approach is to choose a mask size equal to the size of the most common +datatype in your programs. For example, if most of your computation is on +32-bit floating-point values, an ``i32`` target is appropriate. However, +if you're mostly doing computation on 8-bit images, ``i8`` is a better choice. + +See `Basic Concepts: Program Instances and Gangs of Program Instances`_ for +more discussion of the "gang size" and its implications for program +execution. + +Running ``ispc --help`` and looking at the output for the ``--target`` +option gives the most up-to-date documentation about which targets your +compiler binary supports. + +The naming scheme for compilation targets changed in August 2013; the +following table shows the relationship between names in the old scheme and +in the new scheme: + +============= =========== +Target Former Name +------------- ----------- +avx1-i32x8 avx, avx1 +avx1-i32x16 avx-x2 +avx1.1-i32x8 avx1.1 +avx1.1-i32x16 avx1.1-x2 +avx2-i32x8 avx2 +avx2-i32x16 avx2-x2 +neon-8 n/a +neon-16 n/a +neon-32 n/a +sse2-i32x4 sse2 +sse2-i32x8 sse2-x2 +sse4-i32x4 sse4 +sse4-i32x8 sse4-x2 +sse4-i8x16 n/a +sse4-i16x8 n/a +============= =========== By default, the target instruction set is chosen based on the most capable one supported by the system on which you're running ``ispc``. You can override this choice with the ``--target`` flag; for example, to select -Intel® SSE2, use ``--target=sse2``. (As with the other options in this -section, see the output of ``ispc --help`` for a full list of supported -targets.) +Intel® SSE2 with a 32-bit mask and 4 program instances in a gang, use +``--target=sse2-i32x4``. (As with the other options in this section, see +the output of ``ispc --help`` for a full list of supported targets.) 
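As a quick illustration of the new target naming scheme (an editorial sketch that uses only targets from the table above and the same example file names already used in this document), selecting an AVX2 target with a 32-bit mask and a 16-wide gang, or an SSE4 target specialized for 8-bit data, looks like:

::

    ispc foo.ispc -o foo.obj --target=avx2-i32x16
    ispc foo.ispc -o foo.obj --target=sse4-i8x16

The old spellings (``avx2-x2``, ``sse4``, and so on) continue to be accepted but are deprecated; the table above shows how they map onto the new names.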
Generating Generic C++ Output ----------------------------- In addition to generating object files or assembly output for specific -targets like SSE2, SSE4, and AVX, ``ispc`` provides an option to generate +targets like NEON, SSE2, SSE4, and AVX, ``ispc`` provides an option to generate "generic" C++ output. This As an example, consider the following simple ``ispc`` program: @@ -659,7 +722,7 @@ preprocessor runs: * - ISPC - 1 - Detecting that the ``ispc`` compiler is processing the file - * - ISPC_TARGET_{SSE2,SSE4,AVX,AVX2} + * - ISPC_TARGET_{NEON_8,NEON_16,NEON_32,SSE2,SSE4,AVX,AVX11,AVX2,GENERIC} - 1 - One of these will be set, depending on the compilation target. * - ISPC_POINTER_SIZE @@ -1294,7 +1357,8 @@ but are likely to be supported in future releases: * Bitfield members of ``struct`` types * Variable numbers of arguments to functions * Literal floating-point constants (even without a ``f`` suffix) are - currently treated as being ``float`` type, not ``double`` + currently treated as being ``float`` type, not ``double``. To have a double + precision floating point constant use ``d`` suffix. * The ``volatile`` qualifier * The ``register`` storage class for variables. (Will be ignored). @@ -3365,6 +3429,31 @@ The ``isnan()`` functions test whether the given value is a floating-point uniform bool isnan(uniform double v) +A number of functions are also available for performing operations on 8- and +16-bit quantities; these map to specialized instructions that perform these +operations on targets that support them. ``avg_up()`` computes the average +of the two values, rounding up if their average is halfway between two +integers (i.e., it computes ``(a+b+1)/2``). + +:: + + int8 avg_up(int8 a, int8 b) + unsigned int8 avg_up(unsigned int8 a, unsigned int8 b) + int16 avg_up(int16 a, int16 b) + unsigned int16 avg_up(unsigned int16 a, unsigned int16 b) + + +``avg_down()`` computes the average of the two values, rounding down (i.e., +it computes ``(a+b)/2``). + +:: + + int8 avg_down(int8 a, int8 b) + unsigned int8 avg_down(unsigned int8 a, unsigned int8 b) + int16 avg_down(int16 a, int16 b) + unsigned int16 avg_down(unsigned int16 a, unsigned int16 b) + + Transcendental Functions ------------------------ @@ -3582,7 +3671,7 @@ command-line argument. Cross-Program Instance Operations --------------------------------- -``ispc`` programs are often used to expresses independently-executing +``ispc`` programs are often used to express independently-executing programs performing computation on separate data elements. (i.e. pure data-parallelism). However, it's often the case where it's useful for the program instances to be able to cooperate in computing results. The @@ -3613,7 +3702,7 @@ the running program instances. The ``rotate()`` function allows each program instance to find the value of the given value that their neighbor ``offset`` steps away has. For -example, on an 8-wide target, if ``offset`` has the value (1, 2, 3, 4, 5, +example, on an 8-wide target, if ``value`` has the value (1, 2, 3, 4, 5, 6, 7, 8) across the gang of running program instances, then ``rotate(value, -1)`` causes the first program instance to get the value 8, the second program instance to get the value 1, the third 2, and so forth. The @@ -3692,7 +3781,7 @@ where the ``i`` th element of ``x`` has been replaced with the value ``v`` Reductions ---------- -A number routines are available to evaluate conditions across the +A number of routines are available to evaluate conditions across the running program instances. 
For example, ``any()`` returns ``true`` if the
 given value ``v`` is ``true`` for any of the SPMD program instances
 currently running, ``all()`` returns ``true`` if it true
@@ -3711,29 +3800,44 @@ instances are added together by the ``reduce_add()`` function.
 
 ::
 
-    uniform float reduce_add(float x)
-    uniform int reduce_add(int x)
-    uniform unsigned int reduce_add(unsigned int x)
+    uniform int16 reduce_add(int8 x)
+    uniform unsigned int16 reduce_add(unsigned int8 x)
+    uniform int32 reduce_add(int16 x)
+    uniform unsigned int32 reduce_add(unsigned int16 x)
+    uniform int64 reduce_add(int32 x)
+    uniform unsigned int64 reduce_add(unsigned int32 x)
+    uniform int64 reduce_add(int64 x)
+    uniform unsigned int64 reduce_add(unsigned int64 x)
 
-You can also use functions to compute the minimum and maximum value of the
-given value across all of the currently-executing program instances.
+    uniform float reduce_add(float x)
+    uniform double reduce_add(double x)
+
+You can also use functions to compute the minimum value of the given value
+across all of the currently-executing program instances.
 
 ::
 
-    uniform float reduce_min(float a)
     uniform int32 reduce_min(int32 a)
     uniform unsigned int32 reduce_min(unsigned int32 a)
-    uniform double reduce_min(double a)
     uniform int64 reduce_min(int64 a)
     uniform unsigned int64 reduce_min(unsigned int64 a)
 
-    uniform float reduce_max(float a)
+    uniform float reduce_min(float a)
+    uniform double reduce_min(double a)
+
+Equivalent functions are available to compute the maximum of the given
+varying variable over the active program instances.
+
+::
+
     uniform int32 reduce_max(int32 a)
     uniform unsigned int32 reduce_max(unsigned int32 a)
-    uniform double reduce_max(double a)
     uniform int64 reduce_max(int64 a)
     uniform unsigned int64 reduce_max(unsigned int64 a)
 
+    uniform float reduce_max(float a)
+    uniform double reduce_max(double a)
+
 Finally, you can check to see if a particular value has the same value in
 all of the currently-running program instances:
@@ -3741,9 +3845,10 @@ all of the currently-running program instances:
 
     uniform bool reduce_equal(int32 v)
     uniform bool reduce_equal(unsigned int32 v)
-    uniform bool reduce_equal(float v)
     uniform bool reduce_equal(int64 v)
     uniform bool reduce_equal(unsigned int64 v)
+
+    uniform bool reduce_equal(float v)
     uniform bool reduce_equal(double)
 
 There are also variants of these functions that return the value as a
@@ -3758,10 +3863,11 @@ performance in the `Performance Guide`_.
 
     uniform bool reduce_equal(int32 v, uniform int32 * uniform sameval)
     uniform bool reduce_equal(unsigned int32 v, uniform unsigned int32 * uniform sameval)
-    uniform bool reduce_equal(float v, uniform float * uniform sameval)
     uniform bool reduce_equal(int64 v, uniform int64 * uniform sameval)
     uniform bool reduce_equal(unsigned int64 v, uniform unsigned int64 * uniform sameval)
+
+    uniform bool reduce_equal(float v, uniform float * uniform sameval)
    uniform bool reduce_equal(double, uniform double * uniform sameval)
 
 If called when none of the program instances are running,
diff --git a/docs/news.rst b/docs/news.rst
index c1c35de3..7d78a662 100644
--- a/docs/news.rst
+++ b/docs/news.rst
@@ -2,6 +2,14 @@
 ispc News
 =========
 
+ispc 1.5.0 is Released
+----------------------
+
+A major update of ``ispc`` has been released with several new targets available
+and a bunch of performance and stability fixes. The released binaries are built
+with a patched version of LLVM 3.3. Please refer to the Release Notes for the
+complete set of changes.
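To make the new 8-bit standard-library routines documented in the ``ispc.rst`` hunk above more concrete, here is a minimal ``ispc`` sketch. It is illustrative only: the function and buffer names are invented for the example, and it simply combines ``avg_up()``, which computes ``(a+b+1)/2`` without overflowing the 8-bit type, with the widened ``reduce_add()`` overload, which per the documentation above returns a ``uniform unsigned int16`` for ``unsigned int8`` input.

::

    // Illustrative sketch only -- names are hypothetical, not part of the patch.
    export void blend_and_sum(uniform unsigned int8 a[], uniform unsigned int8 b[],
                              uniform unsigned int8 out[], uniform int count,
                              uniform unsigned int32 * uniform sum) {
        uniform unsigned int32 total = 0;
        foreach (i = 0 ... count) {
            // avg_up() rounds the halfway case up, i.e. it computes (a + b + 1) / 2.
            unsigned int8 blended = avg_up(a[i], b[i]);
            out[i] = blended;
            // reduce_add() over the running program instances yields a uniform
            // unsigned int16, accumulated here into a wider uniform counter.
            total += reduce_add(blended);
        }
        *sum = total;
    }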
+ ispc 1.4.4 is Released ---------------------- diff --git a/docs/template-news.txt b/docs/template-news.txt index 9a41fbdb..d5eebdd1 100644 --- a/docs/template-news.txt +++ b/docs/template-news.txt @@ -57,7 +57,7 @@ %(body)s
- diff --git a/docs/template-perf.txt b/docs/template-perf.txt index 4932e332..9537a836 100644 --- a/docs/template-perf.txt +++ b/docs/template-perf.txt @@ -57,7 +57,7 @@ %(body)s
- diff --git a/docs/template.txt b/docs/template.txt index 8cb4f5ab..b9041f19 100644 --- a/docs/template.txt +++ b/docs/template.txt @@ -57,7 +57,7 @@ %(body)s
- diff --git a/doxygen.cfg b/doxygen.cfg index 480d9331..a0ad3176 100644 --- a/doxygen.cfg +++ b/doxygen.cfg @@ -31,7 +31,7 @@ PROJECT_NAME = "Intel SPMD Program Compiler" # This could be handy for archiving the generated documentation or # if some version control system is used. -PROJECT_NUMBER = 1.4.5dev +PROJECT_NUMBER = 1.5.1dev # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) # base path where the generated documentation will be put. diff --git a/examples/README.txt b/examples/README.txt index 5b47df44..b67529c1 100644 --- a/examples/README.txt +++ b/examples/README.txt @@ -146,6 +146,11 @@ This is a simple "hello world" type program that shows a ~10 line application program calling out to a ~5 line ispc program to do a simple computation. +Sort +==== +This is a bucket sort of 32 bit unsigned integers. +By default 1000000 random elements get sorted. +Call ./sort N in order to sort N elements instead. Volume ====== diff --git a/examples/aobench/ao.cpp b/examples/aobench/ao.cpp index cbe75a0b..2286316d 100644 --- a/examples/aobench/ao.cpp +++ b/examples/aobench/ao.cpp @@ -138,7 +138,7 @@ int main(int argc, char **argv) } // Report results and save image - printf("[aobench ispc]:\t\t\t[%.3f] M cycles (%d x %d image)\n", + printf("[aobench ispc]:\t\t\t[%.3f] million cycles (%d x %d image)\n", minTimeISPC, width, height); savePPM("ao-ispc.ppm", width, height); @@ -158,7 +158,7 @@ int main(int argc, char **argv) } // Report results and save image - printf("[aobench ispc + tasks]:\t\t[%.3f] M cycles (%d x %d image)\n", + printf("[aobench ispc + tasks]:\t\t[%.3f] million cycles (%d x %d image)\n", minTimeISPCTasks, width, height); savePPM("ao-ispc-tasks.ppm", width, height); @@ -176,7 +176,7 @@ int main(int argc, char **argv) } // Report more results, save another image... 
- printf("[aobench serial]:\t\t[%.3f] M cycles (%d x %d image)\n", minTimeSerial, + printf("[aobench serial]:\t\t[%.3f] million cycles (%d x %d image)\n", minTimeSerial, width, height); printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n", minTimeSerial / minTimeISPC, minTimeSerial / minTimeISPCTasks); diff --git a/examples/common.mk b/examples/common.mk index cdfc4c6a..367d3eb3 100644 --- a/examples/common.mk +++ b/examples/common.mk @@ -9,18 +9,26 @@ CC=gcc CCFLAGS=-Iobjs/ -O2 LIBS=-lm $(TASK_LIB) -lstdc++ -ISPC=ispc -O2 $(ISPC_FLAGS) +ISPC=ispc +ISPC_FLAGS=-O2 ISPC_HEADER=objs/$(ISPC_SRC:.ispc=_ispc.h) -ARCH:=$(shell uname -m | sed -e s/x86_64/x86/ -e s/arm.*/arm/ -e s/sa110/arm/) +ARCH:=$(shell uname -m | sed -e s/x86_64/x86/ -e s/i686/x86/ -e s/arm.*/arm/ -e s/sa110/arm/) ifeq ($(ARCH),x86) ISPC_OBJS=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc.o $(ISPC_SRC:.ispc=)_ispc_sse2.o \ $(ISPC_SRC:.ispc=)_ispc_sse4.o $(ISPC_SRC:.ispc=)_ispc_avx.o) ISPC_TARGETS=$(ISPC_IA_TARGETS) - ISPC_FLAGS += --arch=x86-64 - CXXFLAGS += -m64 - CCFLAGS += -m64 + ARCH_BIT:=$(shell getconf LONG_BIT) + ifeq ($(ARCH_BIT),32) + ISPC_FLAGS += --arch=x86 + CXXFLAGS += -m32 + CCFLAGS += -m32 + else + ISPC_FLAGS += --arch=x86-64 + CXXFLAGS += -m64 + CCFLAGS += -m64 + endif else ifeq ($(ARCH),arm) ISPC_OBJS=$(addprefix objs/, $(ISPC_SRC:.ispc=_ispc.o)) ISPC_TARGETS=$(ISPC_ARM_TARGETS) @@ -44,7 +52,7 @@ dirs: objs/%.cpp objs/%.o objs/%.h: dirs clean: - /bin/rm -rf objs *~ $(EXAMPLE) $(EXAMPLE)-sse4 $(EXAMPLE)-generic16 + /bin/rm -rf objs *~ $(EXAMPLE) $(EXAMPLE)-sse4 $(EXAMPLE)-generic16 ref test $(EXAMPLE): $(OBJS) $(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS) @@ -61,10 +69,10 @@ objs/%.o: ../%.cpp dirs objs/$(EXAMPLE).o: objs/$(EXAMPLE)_ispc.h objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc - $(ISPC) --target=$(ISPC_TARGETS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h + $(ISPC) $(ISPC_FLAGS) --target=$(ISPC_TARGETS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h objs/$(ISPC_SRC:.ispc=)_sse4.cpp: $(ISPC_SRC) - $(ISPC) $< -o $@ --target=generic-4 --emit-c++ --c++-include-file=sse4.h + $(ISPC) $(ISPC_FLAGS) $< -o $@ --target=generic-4 --emit-c++ --c++-include-file=sse4.h objs/$(ISPC_SRC:.ispc=)_sse4.o: objs/$(ISPC_SRC:.ispc=)_sse4.cpp $(CXX) -I../intrinsics -msse4.2 $< $(CXXFLAGS) -c -o $@ @@ -73,7 +81,7 @@ $(EXAMPLE)-sse4: $(CPP_OBJS) objs/$(ISPC_SRC:.ispc=)_sse4.o $(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS) objs/$(ISPC_SRC:.ispc=)_generic16.cpp: $(ISPC_SRC) - $(ISPC) $< -o $@ --target=generic-16 --emit-c++ --c++-include-file=generic-16.h + $(ISPC) $(ISPC_FLAGS) $< -o $@ --target=generic-16 --emit-c++ --c++-include-file=generic-16.h objs/$(ISPC_SRC:.ispc=)_generic16.o: objs/$(ISPC_SRC:.ispc=)_generic16.cpp $(CXX) -I../intrinsics $< $(CXXFLAGS) -c -o $@ @@ -82,7 +90,7 @@ $(EXAMPLE)-generic16: $(CPP_OBJS) objs/$(ISPC_SRC:.ispc=)_generic16.o $(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS) objs/$(ISPC_SRC:.ispc=)_scalar.o: $(ISPC_SRC) - $(ISPC) $< -o $@ --target=generic-1 + $(ISPC) $(ISPC_FLAGS) $< -o $@ --target=generic-1 $(EXAMPLE)-scalar: $(CPP_OBJS) objs/$(ISPC_SRC:.ispc=)_scalar.o $(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS) diff --git a/examples/deferred/main.cpp b/examples/deferred/main.cpp index 17bd3f42..4f2be879 100644 --- a/examples/deferred/main.cpp +++ b/examples/deferred/main.cpp @@ -130,7 +130,7 @@ int main(int argc, char** argv) { printf("\t\t\t\t(%.2fx speedup from static ISPC, %.2fx from Cilk+ISPC)\n", serialCycles/ispcCycles, serialCycles/dynamicCilkCycles); #else - printf("\t\t\t\t(%.2fx 
speedup from ISPC)\n", serialCycles/ispcCycles); + printf("\t\t\t\t(%.2fx speedup from ISPC + tasks)\n", serialCycles/ispcCycles); #endif // __cilk DeleteInputData(input); diff --git a/examples/intrinsics/generic-16.h b/examples/intrinsics/generic-16.h index 828c1ab4..d81101f7 100644 --- a/examples/intrinsics/generic-16.h +++ b/examples/intrinsics/generic-16.h @@ -1162,19 +1162,20 @@ REDUCE_ADD(double, __vec16_d, __reduce_add_double) REDUCE_MINMAX(double, __vec16_d, __reduce_min_double, <) REDUCE_MINMAX(double, __vec16_d, __reduce_max_double, >) -REDUCE_ADD(uint32_t, __vec16_i32, __reduce_add_int32) +REDUCE_ADD(int16_t, __vec16_i8, __reduce_add_int8) +REDUCE_ADD(int32_t, __vec16_i16, __reduce_add_int16) + +REDUCE_ADD(int64_t, __vec16_i32, __reduce_add_int32) REDUCE_MINMAX(int32_t, __vec16_i32, __reduce_min_int32, <) REDUCE_MINMAX(int32_t, __vec16_i32, __reduce_max_int32, >) -REDUCE_ADD(uint32_t, __vec16_i32, __reduce_add_uint32) REDUCE_MINMAX(uint32_t, __vec16_i32, __reduce_min_uint32, <) REDUCE_MINMAX(uint32_t, __vec16_i32, __reduce_max_uint32, >) -REDUCE_ADD(uint64_t, __vec16_i64, __reduce_add_int64) +REDUCE_ADD(int64_t, __vec16_i64, __reduce_add_int64) REDUCE_MINMAX(int64_t, __vec16_i64, __reduce_min_int64, <) REDUCE_MINMAX(int64_t, __vec16_i64, __reduce_max_int64, >) -REDUCE_ADD(uint64_t, __vec16_i64, __reduce_add_uint64) REDUCE_MINMAX(uint64_t, __vec16_i64, __reduce_min_uint64, <) REDUCE_MINMAX(uint64_t, __vec16_i64, __reduce_max_uint64, >) @@ -1758,3 +1759,23 @@ static FORCEINLINE uint64_t __atomic_cmpxchg(uint64_t *p, uint64_t cmpval, return __sync_val_compare_and_swap(p, cmpval, newval); #endif } + +#ifdef WIN32 +#include +#define __clock __rdtsc +#else // WIN32 +static FORCEINLINE uint64_t __clock() { + uint32_t low, high; +#ifdef __x86_64 + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%rax", "%rbx", "%rcx", "%rdx" ); +#else + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%eax", "%ebx", "%ecx", "%edx" ); +#endif + __asm__ __volatile__ ("rdtsc" : "=a" (low), "=d" (high)); + return (uint64_t)high << 32 | low; +} + +#endif // !WIN32 + diff --git a/examples/intrinsics/generic-32.h b/examples/intrinsics/generic-32.h index 64b82cb1..531ed215 100644 --- a/examples/intrinsics/generic-32.h +++ b/examples/intrinsics/generic-32.h @@ -408,15 +408,15 @@ static FORCEINLINE uint64_t __movmsk(__vec32_i1 mask) { return (uint64_t)mask.v; } -static FORCEINLINE __vec32_i1 __any(__vec32_i1 mask) { +static FORCEINLINE bool __any(__vec32_i1 mask) { return (mask.v!=0); } -static FORCEINLINE __vec32_i1 __all(__vec32_i1 mask) { - return (mask.v==0xFFFFFFFF); +static FORCEINLINE bool __all(__vec32_i1 mask) { + return (mask.v==0xFFFFFFFFul); } -static FORCEINLINE __vec32_i1 __none(__vec32_i1 mask) { +static FORCEINLINE bool __none(__vec32_i1 mask) { return (mask.v==0); } @@ -1231,19 +1231,20 @@ REDUCE_ADD(double, __vec32_d, __reduce_add_double) REDUCE_MINMAX(double, __vec32_d, __reduce_min_double, <) REDUCE_MINMAX(double, __vec32_d, __reduce_max_double, >) -REDUCE_ADD(uint32_t, __vec32_i32, __reduce_add_int32) +//REDUCE_ADD(int16_t, __vec16_i8, __reduce_add_int8) +//REDUCE_ADD(int32_t, __vec16_i16, __reduce_add_int16) + +REDUCE_ADD(int64_t, __vec32_i32, __reduce_add_int32) REDUCE_MINMAX(int32_t, __vec32_i32, __reduce_min_int32, <) REDUCE_MINMAX(int32_t, __vec32_i32, __reduce_max_int32, >) -REDUCE_ADD(uint32_t, __vec32_i32, __reduce_add_uint32) REDUCE_MINMAX(uint32_t, __vec32_i32, __reduce_min_uint32, <) REDUCE_MINMAX(uint32_t, __vec32_i32, __reduce_max_uint32, >) -REDUCE_ADD(uint64_t, 
__vec32_i64, __reduce_add_int64) +REDUCE_ADD(int64_t, __vec32_i64, __reduce_add_int64) REDUCE_MINMAX(int64_t, __vec32_i64, __reduce_min_int64, <) REDUCE_MINMAX(int64_t, __vec32_i64, __reduce_max_int64, >) -REDUCE_ADD(uint64_t, __vec32_i64, __reduce_add_uint64) REDUCE_MINMAX(uint64_t, __vec32_i64, __reduce_min_uint64, <) REDUCE_MINMAX(uint64_t, __vec32_i64, __reduce_max_uint64, >) @@ -1826,3 +1827,23 @@ static FORCEINLINE uint64_t __atomic_cmpxchg(uint64_t *p, uint64_t cmpval, return __sync_val_compare_and_swap(p, cmpval, newval); #endif } + +#ifdef WIN32 +#include +#define __clock __rdtsc +#else // WIN32 +static FORCEINLINE uint64_t __clock() { + uint32_t low, high; +#ifdef __x86_64 + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%rax", "%rbx", "%rcx", "%rdx" ); +#else + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%eax", "%ebx", "%ecx", "%edx" ); +#endif + __asm__ __volatile__ ("rdtsc" : "=a" (low), "=d" (high)); + return (uint64_t)high << 32 | low; +} +#endif // WIN32 + +#undef FORCEINLINE diff --git a/examples/intrinsics/generic-64.h b/examples/intrinsics/generic-64.h index 7869faa5..bbeb007a 100644 --- a/examples/intrinsics/generic-64.h +++ b/examples/intrinsics/generic-64.h @@ -533,15 +533,15 @@ static FORCEINLINE uint64_t __movmsk(__vec64_i1 mask) { return (uint64_t)mask.v; } -static FORCEINLINE __vec64_i1 __any(__vec64_i1 mask) { +static FORCEINLINE bool __any(__vec64_i1 mask) { return (mask.v!=0); } -static FORCEINLINE __vec64_i1 __all(__vec64_i1 mask) { - return (mask.v==0xFFFFFFFFFFFFFFFF); +static FORCEINLINE bool __all(__vec64_i1 mask) { + return (mask.v==0xFFFFFFFFFFFFFFFFull); } -static FORCEINLINE __vec64_i1 __none(__vec64_i1 mask) { +static FORCEINLINE bool __none(__vec64_i1 mask) { return (mask.v==0); } @@ -1364,19 +1364,20 @@ REDUCE_ADD(double, __vec64_d, __reduce_add_double) REDUCE_MINMAX(double, __vec64_d, __reduce_min_double, <) REDUCE_MINMAX(double, __vec64_d, __reduce_max_double, >) -REDUCE_ADD(uint32_t, __vec64_i32, __reduce_add_int32) +//REDUCE_ADD(int16_t, __vec16_i8, __reduce_add_int8) +//REDUCE_ADD(int32_t, __vec16_i16, __reduce_add_int16) + +REDUCE_ADD(int64_t, __vec64_i32, __reduce_add_int32) REDUCE_MINMAX(int32_t, __vec64_i32, __reduce_min_int32, <) REDUCE_MINMAX(int32_t, __vec64_i32, __reduce_max_int32, >) -REDUCE_ADD(uint32_t, __vec64_i32, __reduce_add_uint32) REDUCE_MINMAX(uint32_t, __vec64_i32, __reduce_min_uint32, <) REDUCE_MINMAX(uint32_t, __vec64_i32, __reduce_max_uint32, >) -REDUCE_ADD(uint64_t, __vec64_i64, __reduce_add_int64) +REDUCE_ADD(int64_t, __vec64_i64, __reduce_add_int64) REDUCE_MINMAX(int64_t, __vec64_i64, __reduce_min_int64, <) REDUCE_MINMAX(int64_t, __vec64_i64, __reduce_max_int64, >) -REDUCE_ADD(uint64_t, __vec64_i64, __reduce_add_uint64) REDUCE_MINMAX(uint64_t, __vec64_i64, __reduce_min_uint64, <) REDUCE_MINMAX(uint64_t, __vec64_i64, __reduce_max_uint64, >) @@ -1959,3 +1960,23 @@ static FORCEINLINE uint64_t __atomic_cmpxchg(uint64_t *p, uint64_t cmpval, return __sync_val_compare_and_swap(p, cmpval, newval); #endif } + +#ifdef WIN32 +#include +#define __clock __rdtsc +#else // WIN32 +static FORCEINLINE uint64_t __clock() { + uint32_t low, high; +#ifdef __x86_64 + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%rax", "%rbx", "%rcx", "%rdx" ); +#else + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%eax", "%ebx", "%ecx", "%edx" ); +#endif + __asm__ __volatile__ ("rdtsc" : "=a" (low), "=d" (high)); + return (uint64_t)high << 32 | low; +} +#endif + +#undef FORCEINLINE diff --git 
a/examples/intrinsics/knc-i1x16.h b/examples/intrinsics/knc-i1x16.h new file mode 100644 index 00000000..78d35ddc --- /dev/null +++ b/examples/intrinsics/knc-i1x16.h @@ -0,0 +1,2760 @@ +/** + Copyright (c) 2010-2013, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include +#include +#include +#include +#include +#include + +#ifdef _MSC_VER +#define FORCEINLINE __forceinline +#define PRE_ALIGN(x) /*__declspec(align(x))*/ +#define POST_ALIGN(x) +#define roundf(x) (floorf(x + .5f)) +#define round(x) (floor(x + .5)) +#else +#define FORCEINLINE __forceinline +#define PRE_ALIGN(x) +#define POST_ALIGN(x) __attribute__ ((aligned(x))) +#endif + +#define KNC 1 +#if 0 +extern "C" +{ + int printf(const unsigned char *, ...); + int puts(unsigned char *); + unsigned int putchar(unsigned int); + int fflush(void *); + uint8_t *memcpy(uint8_t *, uint8_t *, uint64_t); + uint8_t *memset(uint8_t *, uint8_t, uint64_t); + void memset_pattern16(void *, const void *, uint64_t); +} +#endif + +typedef float __vec1_f; +typedef double __vec1_d; +typedef int8_t __vec1_i8; +typedef int16_t __vec1_i16; +typedef int32_t __vec1_i32; +typedef int64_t __vec1_i64; + +/************ mask **************/ + +struct __vec16_i1 +{ + __mmask16 v; + + FORCEINLINE __vec16_i1() { } + FORCEINLINE __vec16_i1(const __mmask16 &vv) : v(vv) { } + FORCEINLINE __vec16_i1(bool v0, bool v1, bool v2, bool v3, + bool v4, bool v5, bool v6, bool v7, + bool v8, bool v9, bool v10, bool v11, + bool v12, bool v13, bool v14, bool v15) { + v = ((v0 & 1) | + ((v1 & 1) << 1) | + ((v2 & 1) << 2) | + ((v3 & 1) << 3) | + ((v4 & 1) << 4) | + ((v5 & 1) << 5) | + ((v6 & 1) << 6) | + ((v7 & 1) << 7) | + ((v8 & 1) << 8) | + ((v9 & 1) << 9) | + ((v10 & 1) << 10) | + ((v11 & 1) << 11) | + ((v12 & 1) << 12) | + ((v13 & 1) << 13) | + ((v14 & 1) << 14) | + ((v15 & 1) << 15)); + } + + FORCEINLINE operator __mmask16() const { return v; } +}; + +/************ vector **************/ + +struct PRE_ALIGN(64) __vec16_i32 +{ + __m512i v; + FORCEINLINE operator __m512i() const { return v; } + FORCEINLINE __vec16_i32() : 
v(_mm512_undefined_epi32()) {} + FORCEINLINE __vec16_i32(const int32_t &in) : v(_mm512_set1_epi32(in)) {} + FORCEINLINE __vec16_i32(const __m512i &in) : v(in) {} + FORCEINLINE __vec16_i32(const __vec16_i32 &o) : v(o.v) {} + FORCEINLINE __vec16_i32& operator =(const __vec16_i32 &o) { v=o.v; return *this; } + FORCEINLINE __vec16_i32(int32_t v00, int32_t v01, int32_t v02, int32_t v03, + int32_t v04, int32_t v05, int32_t v06, int32_t v07, + int32_t v08, int32_t v09, int32_t v10, int32_t v11, + int32_t v12, int32_t v13, int32_t v14, int32_t v15) : + v ( _mm512_set_16to16_pi(v15, v14, v13, v12, v11, v10, v09, v08, v07, v06, v05, v04, v03, v02, v01, v00) ) {} + FORCEINLINE const int32_t& operator[](const int i) const { return ((int32_t*)this)[i]; } + FORCEINLINE int32_t& operator[](const int i) { return ((int32_t*)this)[i]; } +} POST_ALIGN(64); + +PRE_ALIGN(64) struct __vec16_f +{ + __m512 v; + FORCEINLINE operator __m512() const { return v; } + FORCEINLINE __vec16_f() : v(_mm512_undefined_ps()) { } + FORCEINLINE __vec16_f(const __m512 &in) : v(in) {} + FORCEINLINE __vec16_f(const __vec16_f &o) : v(o.v) {} + FORCEINLINE __vec16_f& operator =(const __vec16_f &o) { v=o.v; return *this; } + FORCEINLINE __vec16_f(float v00, float v01, float v02, float v03, + float v04, float v05, float v06, float v07, + float v08, float v09, float v10, float v11, + float v12, float v13, float v14, float v15) : + v ( _mm512_set_16to16_ps(v15, v14, v13, v12, v11, v10, v09, v08, v07, v06, v05, v04, v03, v02, v01, v00) ) {} + FORCEINLINE const float& operator[](const int i) const { return ((float*)this)[i]; } + FORCEINLINE float& operator[](const int i) { return ((float*)this)[i]; } +} POST_ALIGN(64); + +static void zmm2hilo(const __m512i v1, const __m512i v2, __m512i &_hi, __m512i &_lo) +{ + _hi = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xFF00, + _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), + v2); + _hi = _mm512_mask_permutevar_epi32(_hi, 0x00FF, + _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), + v1); + _lo = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xFF00, + _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), + v2); + _lo = _mm512_mask_permutevar_epi32(_lo, 0x00FF, + _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), + v1); +} +static void hilo2zmm(const __m512i v_hi, const __m512i v_lo, __m512i &_v1, __m512i &_v2) +{ + _v2 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA, + _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8), + v_hi); + _v2 = _mm512_mask_permutevar_epi32(_v2, 0x5555, + _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8), + v_lo); + _v1 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA, + _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0), + v_hi); + _v1 = _mm512_mask_permutevar_epi32(_v1, 0x5555, + _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0), + v_lo); +} + +struct PRE_ALIGN(128) __vec16_d +{ + union { + __m512d v1; + __m512d v_hi; + }; + union { + __m512d v2; + __m512d v_lo; + }; + FORCEINLINE __vec16_d() : v1(_mm512_undefined_pd()), v2(_mm512_undefined_pd()) {} + FORCEINLINE __vec16_d(const __m512d _v1, const __m512d _v2) : v1(_v1), v2(_v2) {} + FORCEINLINE __vec16_d(const __vec16_d &o) : v1(o.v1), v2(o.v2) {} + FORCEINLINE __vec16_d& operator =(const __vec16_d &o) { v1=o.v1; v2=o.v2; return *this; } + FORCEINLINE __vec16_d(double v00, double v01, double v02, double v03, + double v04, double v05, double v06, double v07, + double v08, double v09, double 
v10, double v11, + double v12, double v13, double v14, double v15) { + v1 = _mm512_set_8to8_pd(v15, v14, v13, v12, v11, v10, v09, v08); + v2 = _mm512_set_8to8_pd(v07, v06, v05, v04, v03, v02, v01, v00); + } + FORCEINLINE const double& operator[](const int i) const { return ((double*)this)[i]; } + FORCEINLINE double& operator[](const int i) { return ((double*)this)[i]; } + FORCEINLINE __vec16_d cvt2hilo() const + { + const __m512i _v1 = _mm512_castpd_si512(v1); + const __m512i _v2 = _mm512_castpd_si512(v2); + __m512i _hi, _lo; + zmm2hilo(_v1, _v2, _hi, _lo); + return __vec16_d(_mm512_castsi512_pd(_hi), _mm512_castsi512_pd(_lo)); + } + FORCEINLINE __vec16_d cvt2zmm() const + { + const __m512i _hi = _mm512_castpd_si512(v_hi); + const __m512i _lo = _mm512_castpd_si512(v_lo); + __m512i _v1, _v2; + hilo2zmm(_hi,_lo, _v1,_v2); + return __vec16_d(_mm512_castsi512_pd(_v1), _mm512_castsi512_pd(_v2)); + } +} POST_ALIGN(128); + +struct PRE_ALIGN(128) __vec16_i64 +{ + union { + __m512i v1; + __m512i v_hi; + }; + union + { + __m512i v2; + __m512i v_lo; + }; + FORCEINLINE __vec16_i64() : v1(_mm512_undefined_epi32()), v2(_mm512_undefined_epi32()) {} + FORCEINLINE __vec16_i64(const __m512i _v1, const __m512i _v2) : v1(_v1), v2(_v2) {} + FORCEINLINE __vec16_i64(const __vec16_i64 &o) : v1(o.v1), v2(o.v2) {} + FORCEINLINE __vec16_i64& operator =(const __vec16_i64 &o) { v1=o.v1; v2=o.v2; return *this; } + FORCEINLINE __vec16_i64(int64_t v00, int64_t v01, int64_t v02, int64_t v03, + int64_t v04, int64_t v05, int64_t v06, int64_t v07, + int64_t v08, int64_t v09, int64_t v10, int64_t v11, + int64_t v12, int64_t v13, int64_t v14, int64_t v15) { + v2 = _mm512_set_8to8_epi64(v15, v14, v13, v12, v11, v10, v09, v08); + v1 = _mm512_set_8to8_epi64(v07, v06, v05, v04, v03, v02, v01, v00); + } + FORCEINLINE const int64_t& operator[](const int i) const { return ((int64_t*)this)[i]; } + FORCEINLINE int64_t& operator[](const int i) { return ((int64_t*)this)[i]; } + FORCEINLINE __vec16_i64 cvt2hilo() const + { + __vec16_i64 ret; + zmm2hilo(v1,v2,ret.v_hi,ret.v_lo); + return ret; + } + FORCEINLINE __vec16_i64 cvt2zmm() const + { + __vec16_i64 ret; + hilo2zmm(v_hi,v_lo, ret.v1, ret.v2); + return ret; + } +} POST_ALIGN(128); + +/************ scalar **************/ + +template +struct vec16 +{ + FORCEINLINE vec16() { } + FORCEINLINE vec16(T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, + T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) { + data[0] = v0; data[1] = v1; data[2] = v2; data[3] = v3; + data[4] = v4; data[5] = v5; data[6] = v6; data[7] = v7; + data[8] = v8; data[9] = v9; data[10] = v10; data[11] = v11; + data[12] = v12; data[13] = v13; data[14] = v14; data[15] = v15; + } + T data[16]; + FORCEINLINE const T& operator[](const int i) const { return data[i]; } + FORCEINLINE T& operator[](const int i) { return data[i]; } +}; + +PRE_ALIGN(16) struct __vec16_i8 : public vec16 { + __vec16_i8() { } + __vec16_i8(int8_t v0, int8_t v1, int8_t v2, int8_t v3, + int8_t v4, int8_t v5, int8_t v6, int8_t v7, + int8_t v8, int8_t v9, int8_t v10, int8_t v11, + int8_t v12, int8_t v13, int8_t v14, int8_t v15) + : vec16(v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15) { } +} POST_ALIGN(16); + +PRE_ALIGN(32) struct __vec16_i16 : public vec16 { + __vec16_i16() { } + __vec16_i16(int16_t v0, int16_t v1, int16_t v2, int16_t v3, + int16_t v4, int16_t v5, int16_t v6, int16_t v7, + int16_t v8, int16_t v9, int16_t v10, int16_t v11, + int16_t v12, int16_t v13, int16_t v14, int16_t v15) + : vec16(v0, v1, v2, v3, v4, v5, v6, 
v7, + v8, v9, v10, v11, v12, v13, v14, v15) { } +} POST_ALIGN(32); + +static inline int32_t __extract_element(__vec16_i32, int); + + +/////////////////////////////////////////////////////////////////////////// +// macros... + +/* knc::macro::used */ +#define BINARY_OP(TYPE, NAME, OP) \ +static FORCEINLINE TYPE NAME(TYPE a, TYPE b) { \ + TYPE ret; \ + for (int i = 0; i < 16; ++i) \ + ret[i] = a[i] OP b[i]; \ + return ret; \ +} + +/* knc::macro::used */ +#define BINARY_OP_CAST(TYPE, CAST, NAME, OP) \ +static FORCEINLINE TYPE NAME(TYPE a, TYPE b) { \ + TYPE ret; \ + for (int i = 0; i < 16; ++i) \ + ret[i] = (CAST)(a[i]) OP (CAST)(b[i]); \ + return ret; \ +} + +/* knc::macro::used */ +#define BINARY_OP_FUNC(TYPE, NAME, FUNC) \ +static FORCEINLINE TYPE NAME(TYPE a, TYPE b) { \ + TYPE ret; \ + for (int i = 0; i < 16; ++i) \ + ret[i] = FUNC(a[i], b[i]); \ + return ret; \ +} + +/* knc::macro::used */ +#define CMP_OP(TYPE, SUFFIX, CAST, NAME, OP) \ +static FORCEINLINE __vec16_i1 NAME##_##SUFFIX(TYPE a, TYPE b) { \ + __vec16_i1 ret; \ + ret.v = 0; \ + for (int i = 0; i < 16; ++i) \ + ret.v |= ((CAST)(a[i]) OP (CAST)(b[i])) << i; \ + return ret; \ +} \ +static FORCEINLINE __vec16_i1 NAME##_##SUFFIX##_and_mask(TYPE a, TYPE b, \ + __vec16_i1 mask) { \ + __vec16_i1 ret; \ + ret.v = 0; \ + for (int i = 0; i < 16; ++i) \ + ret.v |= ((CAST)(a[i]) OP (CAST)(b[i])) << i; \ + ret.v &= mask.v; \ + return ret; \ +} + +/* knc::macro::used */ +#define INSERT_EXTRACT(VTYPE, STYPE) \ +static FORCEINLINE STYPE __extract_element(VTYPE v, int index) { \ + return ((STYPE *)&v)[index]; \ +} \ +static FORCEINLINE void __insert_element(VTYPE *v, int index, STYPE val) { \ + ((STYPE *)v)[index] = val; \ +} + +/* knc::macro::used */ +#define LOAD_STORE(VTYPE, STYPE) \ +template \ +static FORCEINLINE VTYPE __load(const VTYPE *p) { \ + STYPE *ptr = (STYPE *)p; \ + VTYPE ret; \ + for (int i = 0; i < 16; ++i) \ + ret[i] = ptr[i]; \ + return ret; \ +} \ +template \ +static FORCEINLINE void __store(VTYPE *p, VTYPE v) { \ + STYPE *ptr = (STYPE *)p; \ + for (int i = 0; i < 16; ++i) \ + ptr[i] = v[i]; \ +} + +/* knc::macro::used */ +#define REDUCE_ADD(TYPE, VTYPE, NAME) \ +static FORCEINLINE TYPE NAME(VTYPE v) { \ + TYPE ret = v[0]; \ + for (int i = 1; i < 16; ++i) \ + ret = ret + v[i]; \ + return ret; \ +} + +/* knc::macro::used */ +#define REDUCE_MINMAX(TYPE, VTYPE, NAME, OP) \ +static FORCEINLINE TYPE NAME(VTYPE v) { \ + TYPE ret = v[0]; \ + for (int i = 1; i < 16; ++i) \ + ret = (ret OP (TYPE)v[i]) ? 
ret : (TYPE)v[i]; \
+    return ret; \
+}
+
+/* knc::macro::used */
+#define SELECT(TYPE) \
+static FORCEINLINE TYPE __select(__vec16_i1 mask, TYPE a, TYPE b) { \
+    TYPE ret; \
+    for (int i = 0; i < 16; ++i) \
+        ret[i] = (mask.v & (1 << i)) ? a[i] : b[i]; \
+    return ret; \
+} \
+static FORCEINLINE TYPE __select(bool cond, TYPE a, TYPE b) { \
+    return cond ? a : b; \
+}
+
+/* knc::macro::used */
+#define SHIFT_UNIFORM(TYPE, CAST, NAME, OP) \
+static FORCEINLINE TYPE NAME(TYPE a, int32_t b) { \
+    TYPE ret; \
+    for (int i = 0; i < 16; ++i) \
+        ret[i] = (CAST)(a[i]) OP b; \
+    return ret; \
+}
+
+/* knc::macro::used */
+#define SMEAR(VTYPE, NAME, STYPE) \
+template <class RetVecType> VTYPE __smear_##NAME(STYPE); \
+template <> FORCEINLINE VTYPE __smear_##NAME<VTYPE>(STYPE v) { \
+    VTYPE ret; \
+    for (int i = 0; i < 16; ++i) \
+        ret[i] = v; \
+    return ret; \
+}
+
+/* knc::macro::used */
+#define SETZERO(VTYPE, NAME) \
+template <class RetVecType> VTYPE __setzero_##NAME(); \
+template <> FORCEINLINE VTYPE __setzero_##NAME<VTYPE>() { \
+    VTYPE ret; \
+    for (int i = 0; i < 16; ++i) \
+        ret[i] = 0; \
+    return ret; \
+}
+
+/* knc::macro::used */
+#define UNDEF(VTYPE, NAME) \
+template <class RetVecType> VTYPE __undef_##NAME(); \
+template <> FORCEINLINE VTYPE __undef_##NAME<VTYPE>() { \
+    return VTYPE(); \
+}
+
+/* knc::macro::used */
+#define BROADCAST(VTYPE, NAME, STYPE) \
+static FORCEINLINE VTYPE __broadcast_##NAME(VTYPE v, int index) { \
+    VTYPE ret; \
+    for (int i = 0; i < 16; ++i) \
+        ret[i] = v[index & 0xf]; \
+    return ret; \
+} \
+
+/* knc::macro::used */
+#define ROTATE(VTYPE, NAME, STYPE) \
+static FORCEINLINE VTYPE __rotate_##NAME(VTYPE v, int index) { \
+    VTYPE ret; \
+    for (int i = 0; i < 16; ++i) \
+        ret[i] = v[(i+index) & 0xf]; \
+    return ret; \
+} \
+
+/* knc::macro::used */
+#define SHUFFLES(VTYPE, NAME, STYPE) \
+static FORCEINLINE VTYPE __shuffle_##NAME(VTYPE v, __vec16_i32 index) { \
+    VTYPE ret; \
+    for (int i = 0; i < 16; ++i) \
+        ret[i] = v[__extract_element(index, i) & 0xf]; \
+    return ret; \
+} \
+static FORCEINLINE VTYPE __shuffle2_##NAME(VTYPE v0, VTYPE v1, __vec16_i32 index) { \
+    VTYPE ret; \
+    for (int i = 0; i < 16; ++i) { \
+        int ii = __extract_element(index, i) & 0x1f; \
+        ret[i] = (ii < 16) ? v0[ii] : v1[ii-16]; \
+    } \
+    return ret; \
+}
+
+///////////////////////////////////////////////////////////////////////////
+
+INSERT_EXTRACT(__vec1_i8, int8_t)
+INSERT_EXTRACT(__vec1_i16, int16_t)
+INSERT_EXTRACT(__vec1_i32, int32_t)
+INSERT_EXTRACT(__vec1_i64, int64_t)
+INSERT_EXTRACT(__vec1_f, float)
+INSERT_EXTRACT(__vec1_d, double)
+
+///////////////////////////////////////////////////////////////////////////
+// mask
+///////////////////////////////////////////////////////////////////////////
+
+static FORCEINLINE __vec16_i1 __movmsk(__vec16_i1 mask) { return _mm512_kmov (mask); }
+static FORCEINLINE bool __any (__vec16_i1 mask) { return !_mm512_kortestz(mask, mask); }
+static FORCEINLINE bool __all (__vec16_i1 mask) { return _mm512_kortestc(mask, mask); }
+static FORCEINLINE bool __none (__vec16_i1 mask) { return _mm512_kortestz(mask, mask); }
+static FORCEINLINE __vec16_i1 __not (__vec16_i1 mask) { return _mm512_knot (mask); }
+
+static FORCEINLINE __vec16_i1 __equal_i1(__vec16_i1 a, __vec16_i1 b) { return _mm512_kxnor (a,b); }
+static FORCEINLINE __vec16_i1 __and (__vec16_i1 a, __vec16_i1 b) { return _mm512_kand (a,b); }
+static FORCEINLINE __vec16_i1 __xor (__vec16_i1 a, __vec16_i1 b) { return _mm512_kxor (a,b); }
+static FORCEINLINE __vec16_i1 __or (__vec16_i1 a, __vec16_i1 b) { return _mm512_kor (a,b); }
+static FORCEINLINE __vec16_i1 __and_not1(__vec16_i1 a, __vec16_i1 b) { return _mm512_kandn (a,b); }
+static FORCEINLINE __vec16_i1 __and_not2(__vec16_i1 a, __vec16_i1 b) { return _mm512_kandnr(a,b); }
+
+static FORCEINLINE __vec16_i1 __select(__vec16_i1 mask, __vec16_i1 a, __vec16_i1 b) { return __or(__and(a, mask), __and_not2(b, mask)); }
+static FORCEINLINE __vec16_i1 __select( bool cond, __vec16_i1 a, __vec16_i1 b) { return cond ?
a : b; } + +static FORCEINLINE bool __extract_element(__vec16_i1 vec, int index) { return (vec.v & (1 << index)) ? true : false; } +static FORCEINLINE void __insert_element(__vec16_i1 *vec, int index, bool val) +{ + if (val == false) vec->v &= ~(1 << index); + else vec->v |= (1 << index); +} + +template static FORCEINLINE __vec16_i1 __load(const __vec16_i1 *p) +{ + return *p; +} + +template static FORCEINLINE void __store(__vec16_i1 *p, __vec16_i1 v) +{ + *p = v; +} + +template RetVecType __smear_i1(int i); +template <> static FORCEINLINE __vec16_i1 __smear_i1<__vec16_i1>(int i) { return i?0xFFFF:0x0; } + +template RetVecType __setzero_i1(); +template <> static FORCEINLINE __vec16_i1 __setzero_i1<__vec16_i1>() { return 0; } + +template __vec16_i1 __undef_i1(); +template <> FORCEINLINE __vec16_i1 __undef_i1<__vec16_i1>() { return __vec16_i1(); } + +/////////////////////////////////////////////////////////////////////////// +// int8 +/////////////////////////////////////////////////////////////////////////// + +BINARY_OP(__vec16_i8, __add, +) +BINARY_OP(__vec16_i8, __sub, -) +BINARY_OP(__vec16_i8, __mul, *) + +BINARY_OP(__vec16_i8, __or, |) +BINARY_OP(__vec16_i8, __and, &) +BINARY_OP(__vec16_i8, __xor, ^) +BINARY_OP(__vec16_i8, __shl, <<) + +BINARY_OP_CAST(__vec16_i8, uint8_t, __udiv, /) +BINARY_OP_CAST(__vec16_i8, int8_t, __sdiv, /) + +BINARY_OP_CAST(__vec16_i8, uint8_t, __urem, %) +BINARY_OP_CAST(__vec16_i8, int8_t, __srem, %) +BINARY_OP_CAST(__vec16_i8, uint8_t, __lshr, >>) +BINARY_OP_CAST(__vec16_i8, int8_t, __ashr, >>) + +SHIFT_UNIFORM(__vec16_i8, uint8_t, __lshr, >>) +SHIFT_UNIFORM(__vec16_i8, int8_t, __ashr, >>) +SHIFT_UNIFORM(__vec16_i8, int8_t, __shl, <<) + +CMP_OP(__vec16_i8, i8, int8_t, __equal, ==) +CMP_OP(__vec16_i8, i8, int8_t, __not_equal, !=) +CMP_OP(__vec16_i8, i8, uint8_t, __unsigned_less_equal, <=) +CMP_OP(__vec16_i8, i8, int8_t, __signed_less_equal, <=) +CMP_OP(__vec16_i8, i8, uint8_t, __unsigned_greater_equal, >=) +CMP_OP(__vec16_i8, i8, int8_t, __signed_greater_equal, >=) +CMP_OP(__vec16_i8, i8, uint8_t, __unsigned_less_than, <) +CMP_OP(__vec16_i8, i8, int8_t, __signed_less_than, <) +CMP_OP(__vec16_i8, i8, uint8_t, __unsigned_greater_than, >) +CMP_OP(__vec16_i8, i8, int8_t, __signed_greater_than, >) + +SELECT(__vec16_i8) +INSERT_EXTRACT(__vec16_i8, int8_t) +SMEAR(__vec16_i8, i8, int8_t) +SETZERO(__vec16_i8, i8) +UNDEF(__vec16_i8, i8) +BROADCAST(__vec16_i8, i8, int8_t) +ROTATE(__vec16_i8, i8, int8_t) +SHUFFLES(__vec16_i8, i8, int8_t) +LOAD_STORE(__vec16_i8, int8_t) + +/////////////////////////////////////////////////////////////////////////// +// int16 +/////////////////////////////////////////////////////////////////////////// + +BINARY_OP(__vec16_i16, __add, +) +BINARY_OP(__vec16_i16, __sub, -) +BINARY_OP(__vec16_i16, __mul, *) + +BINARY_OP(__vec16_i16, __or, |) +BINARY_OP(__vec16_i16, __and, &) +BINARY_OP(__vec16_i16, __xor, ^) +BINARY_OP(__vec16_i16, __shl, <<) + +BINARY_OP_CAST(__vec16_i16, uint16_t, __udiv, /) +BINARY_OP_CAST(__vec16_i16, int16_t, __sdiv, /) + +BINARY_OP_CAST(__vec16_i16, uint16_t, __urem, %) +BINARY_OP_CAST(__vec16_i16, int16_t, __srem, %) +BINARY_OP_CAST(__vec16_i16, uint16_t, __lshr, >>) +BINARY_OP_CAST(__vec16_i16, int16_t, __ashr, >>) + +SHIFT_UNIFORM(__vec16_i16, uint16_t, __lshr, >>) +SHIFT_UNIFORM(__vec16_i16, int16_t, __ashr, >>) +SHIFT_UNIFORM(__vec16_i16, int16_t, __shl, <<) + +CMP_OP(__vec16_i16, i16, int16_t, __equal, ==) +CMP_OP(__vec16_i16, i16, int16_t, __not_equal, !=) +CMP_OP(__vec16_i16, i16, uint16_t, __unsigned_less_equal, <=) 
+CMP_OP(__vec16_i16, i16, int16_t, __signed_less_equal, <=) +CMP_OP(__vec16_i16, i16, uint16_t, __unsigned_greater_equal, >=) +CMP_OP(__vec16_i16, i16, int16_t, __signed_greater_equal, >=) +CMP_OP(__vec16_i16, i16, uint16_t, __unsigned_less_than, <) +CMP_OP(__vec16_i16, i16, int16_t, __signed_less_than, <) +CMP_OP(__vec16_i16, i16, uint16_t, __unsigned_greater_than, >) +CMP_OP(__vec16_i16, i16, int16_t, __signed_greater_than, >) + +SELECT(__vec16_i16) +INSERT_EXTRACT(__vec16_i16, int16_t) +SMEAR(__vec16_i16, i16, int16_t) +SETZERO(__vec16_i16, i16) +UNDEF(__vec16_i16, i16) +BROADCAST(__vec16_i16, i16, int16_t) +ROTATE(__vec16_i16, i16, int16_t) +SHUFFLES(__vec16_i16, i16, int16_t) +LOAD_STORE(__vec16_i16, int16_t) + +/////////////////////////////////////////////////////////////////////////// +// int32 +/////////////////////////////////////////////////////////////////////////// + +static FORCEINLINE __vec16_i32 __add (__vec16_i32 a, __vec16_i32 b) { return _mm512_add_epi32 (a,b); } +static FORCEINLINE __vec16_i32 __sub (__vec16_i32 a, __vec16_i32 b) { return _mm512_sub_epi32 (a,b); } +static FORCEINLINE __vec16_i32 __mul (__vec16_i32 a, __vec16_i32 b) { return _mm512_mullo_epi32(a,b); } +static FORCEINLINE __vec16_i32 __udiv(__vec16_i32 a, __vec16_i32 b) { return _mm512_div_epu32 (a,b); } +static FORCEINLINE __vec16_i32 __sdiv(__vec16_i32 a, __vec16_i32 b) { return _mm512_div_epi32 (a,b); } +static FORCEINLINE __vec16_i32 __urem(__vec16_i32 a, __vec16_i32 b) { return _mm512_rem_epu32 (a,b); } +static FORCEINLINE __vec16_i32 __srem(__vec16_i32 a, __vec16_i32 b) { return _mm512_rem_epi32 (a,b); } +static FORCEINLINE __vec16_i32 __or (__vec16_i32 a, __vec16_i32 b) { return _mm512_or_epi32 (a,b); } +static FORCEINLINE __vec16_i32 __and (__vec16_i32 a, __vec16_i32 b) { return _mm512_and_epi32 (a,b); } +static FORCEINLINE __vec16_i32 __xor (__vec16_i32 a, __vec16_i32 b) { return _mm512_xor_epi32 (a,b); } +static FORCEINLINE __vec16_i32 __shl (__vec16_i32 a, __vec16_i32 b) { return _mm512_sllv_epi32 (a,b); } +static FORCEINLINE __vec16_i32 __lshr(__vec16_i32 a, __vec16_i32 b) { return _mm512_srlv_epi32 (a,b); } +static FORCEINLINE __vec16_i32 __ashr(__vec16_i32 a, __vec16_i32 b) { return _mm512_srav_epi32 (a,b); } +static FORCEINLINE __vec16_i32 __shl (__vec16_i32 a, int32_t n) { return _mm512_slli_epi32 (a,n); } +static FORCEINLINE __vec16_i32 __lshr(__vec16_i32 a, int32_t n) { return _mm512_srli_epi32 (a,n); } +static FORCEINLINE __vec16_i32 __ashr(__vec16_i32 a, int32_t n) { return _mm512_srai_epi32 (a,n); } + +static FORCEINLINE __vec16_i1 __equal_i32 (__vec16_i32 a, __vec16_i32 b) { return _mm512_cmpeq_epi32_mask (a,b); } +static FORCEINLINE __vec16_i1 __not_equal_i32 (__vec16_i32 a, __vec16_i32 b) { return _mm512_cmpneq_epi32_mask(a,b); } +static FORCEINLINE __vec16_i1 __unsigned_less_equal_i32 (__vec16_i32 a, __vec16_i32 b) { return _mm512_cmple_epu32_mask (a,b); } +static FORCEINLINE __vec16_i1 __signed_less_equal_i32 (__vec16_i32 a, __vec16_i32 b) { return _mm512_cmple_epi32_mask (a,b); } +static FORCEINLINE __vec16_i1 __unsigned_greater_equal_i32(__vec16_i32 a, __vec16_i32 b) { return _mm512_cmpge_epu32_mask (a,b); } +static FORCEINLINE __vec16_i1 __signed_greater_equal_i32 (__vec16_i32 a, __vec16_i32 b) { return _mm512_cmpge_epi32_mask (a,b); } +static FORCEINLINE __vec16_i1 __unsigned_less_than_i32 (__vec16_i32 a, __vec16_i32 b) { return _mm512_cmplt_epu32_mask (a,b); } +static FORCEINLINE __vec16_i1 __signed_less_than_i32 (__vec16_i32 a, __vec16_i32 b) { return _mm512_cmplt_epi32_mask 
(a,b); } +static FORCEINLINE __vec16_i1 __unsigned_greater_than_i32 (__vec16_i32 a, __vec16_i32 b) { return _mm512_cmpgt_epu32_mask (a,b); } +static FORCEINLINE __vec16_i1 __signed_greater_than_i32 (__vec16_i32 a, __vec16_i32 b) { return _mm512_cmpgt_epi32_mask (a,b); } + +static FORCEINLINE __vec16_i1 __equal_i32_and_mask (__vec16_i32 a, __vec16_i32 b, __vec16_i1 m) { return _mm512_mask_cmpeq_epi32_mask (m,a,b); } +static FORCEINLINE __vec16_i1 __not_equal_i32_and_mask (__vec16_i32 a, __vec16_i32 b, __vec16_i1 m) { return _mm512_mask_cmpneq_epi32_mask(m,a,b); } +static FORCEINLINE __vec16_i1 __unsigned_less_equal_i32_and_mask (__vec16_i32 a, __vec16_i32 b, __vec16_i1 m) { return _mm512_mask_cmple_epu32_mask (m,a,b); } +static FORCEINLINE __vec16_i1 __signed_less_equal_i32_and_mask (__vec16_i32 a, __vec16_i32 b, __vec16_i1 m) { return _mm512_mask_cmple_epi32_mask (m,a,b); } +static FORCEINLINE __vec16_i1 __unsigned_greater_equal_i32_and_mask(__vec16_i32 a, __vec16_i32 b, __vec16_i1 m) { return _mm512_mask_cmpge_epu32_mask (m,a,b); } +static FORCEINLINE __vec16_i1 __signed_greater_equal_i32_and_mask (__vec16_i32 a, __vec16_i32 b, __vec16_i1 m) { return _mm512_mask_cmpge_epi32_mask (m,a,b); } +static FORCEINLINE __vec16_i1 __unsigned_less_than_i32_and_mask (__vec16_i32 a, __vec16_i32 b, __vec16_i1 m) { return _mm512_mask_cmplt_epu32_mask (m,a,b); } +static FORCEINLINE __vec16_i1 __signed_less_than_i32_and_mask (__vec16_i32 a, __vec16_i32 b, __vec16_i1 m) { return _mm512_mask_cmplt_epi32_mask (m,a,b); } +static FORCEINLINE __vec16_i1 __unsigned_greater_than_i32_and_mask (__vec16_i32 a, __vec16_i32 b, __vec16_i1 m) { return _mm512_mask_cmpgt_epu32_mask (m,a,b); } +static FORCEINLINE __vec16_i1 __signed_greater_than_i32_and_mask (__vec16_i32 a, __vec16_i32 b, __vec16_i1 m) { return _mm512_mask_cmpgt_epi32_mask (m,a,b); } + +static FORCEINLINE __vec16_i32 __select(__vec16_i1 mask, __vec16_i32 a, __vec16_i32 b) { return _mm512_mask_mov_epi32(b, mask, a); } +static FORCEINLINE __vec16_i32 __select( bool cond, __vec16_i32 a, __vec16_i32 b) { return cond ? 
a : b; } + +static FORCEINLINE int32_t __extract_element(__vec16_i32 v, int32_t index) { return v[index]; } +static FORCEINLINE void __insert_element (__vec16_i32 *v, uint32_t index, int32_t val) { (*v)[index] = val; } + +template RetVecType __smear_i32(int32_t i); +template <> static FORCEINLINE __vec16_i32 __smear_i32<__vec16_i32>(int32_t i) { return _mm512_set1_epi32(i); } + +static const __vec16_i32 __ispc_one = __smear_i32<__vec16_i32>(1); +static const __vec16_i32 __ispc_zero = __smear_i32<__vec16_i32>(0); +static const __vec16_i32 __ispc_thirty_two = __smear_i32<__vec16_i32>(32); +static const __vec16_i32 __ispc_ffffffff = __smear_i32<__vec16_i32>(-1); +static const __vec16_i32 __ispc_stride1(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + +template RetVecType __setzero_i32(); +template <> static FORCEINLINE __vec16_i32 __setzero_i32<__vec16_i32>() { return _mm512_setzero_epi32(); } + +template RetVecType __undef_i32(); +template <> static FORCEINLINE __vec16_i32 __undef_i32<__vec16_i32>() { return __vec16_i32(); } + +static FORCEINLINE __vec16_i32 __broadcast_i32(__vec16_i32 v, int index) { return _mm512_mask_permutevar_epi32(v, 0xFFFF, _mm512_set1_epi32(index), v); } + +static FORCEINLINE __vec16_i32 __rotate_i32(__vec16_i32 v, int index) +{ + __vec16_i32 idx = __smear_i32<__vec16_i32>(index); + __vec16_i32 shuffle = _mm512_and_epi32(_mm512_add_epi32(__ispc_stride1, idx), __smear_i32<__vec16_i32>(0xF)); + return _mm512_mask_permutevar_epi32(v, 0xFFFF, shuffle, v); +} + +static FORCEINLINE __vec16_i32 __shuffle_i32 (__vec16_i32 v, __vec16_i32 index) +{ + return _mm512_mask_permutevar_epi32(v, 0xFFFF, __and(index, __smear_i32<__vec16_i32>(0xF)), v); +} +static FORCEINLINE __vec16_i32 __shuffle2_i32(__vec16_i32 v0, __vec16_i32 v1, __vec16_i32 index) +{ + const __vec16_i1 mask = __signed_less_than_i32(index, __smear_i32<__vec16_i32>(0x10)); + index = __and(index, __smear_i32<__vec16_i32>(0xF)); + __vec16_i32 ret = __undef_i32<__vec16_i32>(); + ret = _mm512_mask_permutevar_epi32(ret, mask, index, v0); + ret = _mm512_mask_permutevar_epi32(ret, __not(mask), index, v1); + return ret; +} + +template static FORCEINLINE __vec16_i32 __load(const __vec16_i32 *p) +{ +#ifdef ISPC_FORCE_ALIGNED_MEMORY__REMOVETHIS_WHEN_FIXED + return __load<64>(p); +#else + __vec16_i32 v; + v = _mm512_extloadunpacklo_epi32(v, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + v = _mm512_extloadunpackhi_epi32(v, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + return v; +#endif +} + +template static FORCEINLINE void __store(__vec16_i32 *p, __vec16_i32 v) +{ +#ifdef ISPC_FORCE_ALIGNED_MEMORY__REMOVETHIS_WHEN_FIXED + __store<64>(p,v); +#else + _mm512_extpackstorelo_epi32( p, v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_epi32((uint8_t*)p+64, v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); +#endif +} + +#if 0 /* knc::fails ./tests/foreach-25.ispc ./tests/forach-26.ispc ./tests/foreach-27.ispc */ +template <> static FORCEINLINE __vec16_i32 __load<64>(const __vec16_i32 *p) +{ + return _mm512_load_epi32(p); +} +template <> static FORCEINLINE void __store<64>(__vec16_i32 *p, __vec16_i32 v) +{ + _mm512_store_epi32(p, v); +} +#endif + +/////////////////////////////////////////////////////////////////////////// +// int64 +/////////////////////////////////////////////////////////////////////////// + +static FORCEINLINE __vec16_i64 __add(__vec16_i64 a, __vec16_i64 b) +{ + return __vec16_i64(_mm512_add_epi64(a.v1, b.v1), _mm512_add_epi64(a.v2,b.v2)); +} + +static FORCEINLINE __vec16_i64 
__sub(__vec16_i64 _a, __vec16_i64 _b) +{ +#if __ICC >= 99999 /* compiler gate, icc >= 99999 will hopefully support _mm512_sub_epi64 */ + return __vec16_i64(_mm512_sub_epi64(_a.v1, _b.v1), _mm512_sub_epi64(_a.v2,_b.v2)); +#else + const __vec16_i64 a = _a.cvt2hilo(); + const __vec16_i64 b = _b.cvt2hilo(); + __vec16_i64 ret; + __mmask16 borrow = 0; + ret.v_lo = _mm512_subsetb_epi32(a.v_lo, b.v_lo, &borrow); + ret.v_hi = _mm512_sbb_epi32 (a.v_hi, borrow, b.v_hi, &borrow); + return ret.cvt2zmm(); +#endif +} + +static FORCEINLINE __vec16_i64 __mul(const __vec16_i32 &a, const __vec16_i64 &_b) +{ + const __vec16_i64 b = _b.cvt2hilo(); + return __vec16_i64(_mm512_mullo_epi32(a.v,b.v_lo), + _mm512_add_epi32(_mm512_mullo_epi32(a.v, b.v_hi), + _mm512_mulhi_epi32(a.v, b.v_lo))).cvt2zmm(); +} + +static FORCEINLINE __vec16_i64 __select(__vec16_i1 mask, __vec16_i64 a, __vec16_i64 b) +{ + __vec16_i64 ret; + ret.v1 = _mm512_mask_mov_epi64(b.v1, mask, a.v1); + ret.v2 = _mm512_mask_mov_epi64(b.v2, mask >> 8, a.v2); + return ret; +} + +#if __ICC >= 1400 /* compiler gate, icc >= 14.0.0 support _mm512_mullox_epi64 */ +static FORCEINLINE __vec16_i64 __mul(__vec16_i64 a, __vec16_i64 b) +{ + return __vec16_i64(_mm512_mullox_epi64(a.v1,b.v1), _mm512_mullox_epi64(a.v2,b.v2)); +} +#else /* __ICC >= 1400 */ +static FORCEINLINE void __abs_i32i64(__m512i &_hi, __m512i &_lo) +{ + /* abs(x) : + * mask = x >> 32; + * abs(x) = (x^mask) - mask + */ + const __vec16_i32 mask = __ashr(_hi, __ispc_thirty_two); + __vec16_i32 hi = __xor(_hi, mask); + __vec16_i32 lo = __xor(_lo, mask); + __mmask16 borrow = 0; + _lo = _mm512_subsetb_epi32(lo, mask, &borrow); + _hi = _mm512_sbb_epi32 (hi, borrow, mask, &borrow); +} +static FORCEINLINE __vec16_i64 __mul(__vec16_i64 _a, __vec16_i64 _b) +{ + __vec16_i64 a = _a.cvt2hilo(); + __vec16_i64 b = _b.cvt2hilo(); + /* sign = (a^b) >> 32, if sign == 0 then a*b >= 0, otherwise a*b < 0 */ + const __vec16_i1 sign = __not_equal_i32(__ashr(__xor(a.v_hi, b.v_hi), __ispc_thirty_two), __ispc_zero); + __abs_i32i64(a.v_hi, a.v_lo); /* abs(a) */ + __abs_i32i64(b.v_hi, b.v_lo); /* abs(b) */ + const __vec16_i32 lo_m1 = _mm512_mullo_epi32(a.v_lo, b.v_lo); + const __vec16_i32 hi_m1 = _mm512_mulhi_epu32(a.v_lo, b.v_lo); + const __vec16_i32 hi_m2 = _mm512_mullo_epi32(a.v_hi, b.v_lo); + const __vec16_i32 hi_m3 = _mm512_mullo_epi32(a.v_lo, b.v_hi); + __mmask16 carry; + const __vec16_i32 hi_p23 = _mm512_addsetc_epi32(hi_m2, hi_m3, &carry); + const __vec16_i32 hi = _mm512_adc_epi32(hi_p23, carry, hi_m1, &carry); + const __vec16_i32 lo = lo_m1; + const __vec16_i64 ret_abs = __vec16_i64(hi,lo).cvt2zmm(); + /* if sign != 0, means either a or b is negative, then negate the result */ + return __select(sign, __sub(__vec16_i64(__ispc_zero, __ispc_zero), ret_abs), ret_abs); +} +#endif /* __ICC >= 1400 */ + + +static FORCEINLINE __vec16_i64 __or (__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_or_epi64 (a.v1, b.v1), _mm512_or_epi64 (a.v2, b.v2)); } +static FORCEINLINE __vec16_i64 __and(__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_and_epi64(a.v1, b.v1), _mm512_and_epi64(a.v2, b.v2)); } +static FORCEINLINE __vec16_i64 __xor(__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_xor_epi64(a.v1, b.v1), _mm512_xor_epi64(a.v2, b.v2)); } + +static FORCEINLINE __vec16_i64 __udiv(__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_div_epu64(a.v1,b.v1), _mm512_div_epu64(a.v2,b.v2)); } +static FORCEINLINE __vec16_i64 __sdiv(__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_div_epi64(a.v1,b.v1), 
_mm512_div_epi64(a.v2,b.v2)); } + +static FORCEINLINE __vec16_i64 __urem(__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_rem_epu64(a.v1,b.v1), _mm512_rem_epu64(a.v2,b.v2)); } +static FORCEINLINE __vec16_i64 __srem(__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_rem_epi64(a.v1,b.v1), _mm512_rem_epi64(a.v2,b.v2)); } + + +static FORCEINLINE __vec16_i64 __shl(__vec16_i64 _a, __vec16_i64 _b) +{ + const __vec16_i64 a = _a.cvt2hilo(); + const __vec16_i64 b = _b.cvt2hilo(); + /* this is a safety gate in case b-shift >= 32 */ + const __vec16_i32 xfer = __select( + __signed_less_than_i32(b.v_lo, __ispc_thirty_two), + __lshr(a.v_lo, __sub(__ispc_thirty_two, b.v_lo)), + __shl (a.v_lo, __sub(b.v_lo, __ispc_thirty_two)) + ); + const __vec16_i32 hi = __or(__shl(a.v_hi, b.v_lo), xfer); + const __vec16_i32 lo = __shl(a.v_lo, b.v_lo); + return __vec16_i64(hi,lo).cvt2zmm(); +} +static FORCEINLINE __vec16_i64 __lshr(__vec16_i64 _a, __vec16_i64 _b) +{ + const __vec16_i64 a = _a.cvt2hilo(); + const __vec16_i64 b = _b.cvt2hilo(); + /* this is a safety gate in case b-shift >= 32 */ + const __vec16_i32 xfer = __select( + __signed_less_than_i32(b.v_lo, __ispc_thirty_two), + __shl (a.v_hi, __sub(__ispc_thirty_two, b.v_lo)), + __lshr(a.v_hi, __sub(b.v_lo, __ispc_thirty_two)) + ); + const __vec16_i32 lo = __or(__lshr(a.v_lo, b.v_lo), xfer); + const __vec16_i32 hi = __lshr(a.v_hi, b.v_lo); + return __vec16_i64(hi,lo).cvt2zmm(); +} +static FORCEINLINE __vec16_i64 __ashr(__vec16_i64 _a, __vec16_i64 _b) +{ + const __vec16_i64 a = _a.cvt2hilo(); + const __vec16_i64 b = _b.cvt2hilo(); + /* this is a safety gate in case b-shift >= 32 */ + const __vec16_i32 xfer = __select( + __signed_less_than_i32(b.v_lo, __ispc_thirty_two), + __shl (a.v_hi, __sub(__ispc_thirty_two, b.v_lo)), + __ashr(a.v_hi, __sub(b.v_lo, __ispc_thirty_two)) + ); + const __vec16_i32 lo = __or(__lshr(a.v_lo, b.v_lo), xfer); + const __vec16_i32 hi = __ashr(a.v_hi, b.v_lo); + return __vec16_i64(hi,lo).cvt2zmm(); +} + +template RetVecType __smear_i64(const int64_t &l); +template <> FORCEINLINE __vec16_i64 __smear_i64<__vec16_i64>(const int64_t &l) { return __vec16_i64(_mm512_set1_epi64(l), _mm512_set1_epi64(l)); } + +template RetVecType __setzero_i64(); +template <> FORCEINLINE __vec16_i64 __setzero_i64<__vec16_i64>() { return __vec16_i64(_mm512_setzero_epi32(), _mm512_setzero_epi32()); } + +template RetVecType __undef_i64(); +template <> FORCEINLINE __vec16_i64 __undef_i64<__vec16_i64>() { return __vec16_i64(_mm512_undefined_epi32(), _mm512_undefined_epi32()); } + +static FORCEINLINE __vec16_i64 __lshr(__vec16_i64 a, uint64_t shift) { return __lshr(a, __smear_i64<__vec16_i64>(shift)); } +static FORCEINLINE __vec16_i64 __ashr(__vec16_i64 a, int64_t shift) { return __ashr(a, __smear_i64<__vec16_i64>(shift)); } +static FORCEINLINE __vec16_i64 __shl (__vec16_i64 a, int64_t shift) { return __shl (a, __smear_i64<__vec16_i64>(shift)); } + +static FORCEINLINE __vec16_i1 __equal_i64(__vec16_i64 _a, __vec16_i64 _b) +{ + const __vec16_i64 a = _a.cvt2hilo(); + const __vec16_i64 b = _b.cvt2hilo(); + const __mmask16 lo_match = _mm512_cmpeq_epi32_mask(a.v_lo,b.v_lo); + return _mm512_mask_cmpeq_epi32_mask(lo_match,a.v_hi,b.v_hi); +} +static FORCEINLINE __vec16_i1 __equal_i64_and_mask(__vec16_i64 _a, __vec16_i64 _b, __vec16_i1 mask) +{ + const __vec16_i64 a = _a.cvt2hilo(); + const __vec16_i64 b = _b.cvt2hilo(); + __mmask16 lo_match = _mm512_cmpeq_epi32_mask(a.v_lo,b.v_lo); + __mmask16 full_match = _mm512_mask_cmpeq_epi32_mask(lo_match,a.v_hi,b.v_hi); + 
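/* A 64-bit lane compares equal only when both of its 32-bit halves match: the low-half equality mask gates the masked high-half compare above, and the combined result is then ANDed with the caller's execution mask below. */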
return _mm512_kand(full_match, (__mmask16)mask); +} +static FORCEINLINE __vec16_i1 __not_equal_i64(__vec16_i64 a, __vec16_i64 b) +{ + return __not(__equal_i64(a,b)); +} +static FORCEINLINE __vec16_i1 __not_equal_i64_and_mask(__vec16_i64 a, __vec16_i64 b, __vec16_i1 mask) +{ + return __and(__not(__equal_i64(a,b)), mask); +} +CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_less_equal, <=) +CMP_OP(__vec16_i64, i64, int64_t, __signed_less_equal, <=) +CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_greater_equal, >=) +CMP_OP(__vec16_i64, i64, int64_t, __signed_greater_equal, >=) +CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_less_than, <) +CMP_OP(__vec16_i64, i64, int64_t, __signed_less_than, <) +CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_greater_than, >) +CMP_OP(__vec16_i64, i64, int64_t, __signed_greater_than, >) + + +INSERT_EXTRACT(__vec16_i64, int64_t) + + +#define CASTL2I(_v_, _v_hi_, _v_lo_) \ + __vec16_i32 _v_hi_, _v_lo_; \ + { \ + const __vec16_i64 v = _v_.cvt2hilo(); \ + _v_hi_ = v.v_hi; \ + _v_lo_ = v.v_lo; } +#define CASTI2L(_ret_hi_, _ret_lo_) \ + __vec16_i64(_ret_hi_, _ret_lo_).cvt2zmm() +static FORCEINLINE __vec16_i64 __broadcast_i64(__vec16_i64 _v, int index) +{ + CASTL2I(_v, v_hi, v_lo); + const __vec16_i32 ret_hi = __broadcast_i32(v_hi, index); + const __vec16_i32 ret_lo = __broadcast_i32(v_lo, index); + return CASTI2L(ret_hi, ret_lo); +} +static FORCEINLINE __vec16_i64 __rotate_i64(const __vec16_i64 _v, const int index) +{ + CASTL2I(_v, v_hi, v_lo); + const __vec16_i32 ret_hi = __rotate_i32(v_hi, index); + const __vec16_i32 ret_lo = __rotate_i32(v_lo, index); + return CASTI2L(ret_hi, ret_lo); +} +static FORCEINLINE __vec16_i64 __shuffle_double(__vec16_i64 _v, const __vec16_i32 index) +{ + CASTL2I(_v, v_hi, v_lo); + const __vec16_i32 ret_hi = __shuffle_i32(v_hi, index); + const __vec16_i32 ret_lo = __shuffle_i32(v_lo, index); + return CASTI2L(ret_hi, ret_lo); +} +static FORCEINLINE __vec16_i64 __shuffle2_double(__vec16_i64 _v0, __vec16_i64 _v1, const __vec16_i32 index) +{ + CASTL2I(_v0, v0_hi, v0_lo); + CASTL2I(_v1, v1_hi, v1_lo); + const __vec16_i32 ret_hi = __shuffle2_i32(v0_hi, v1_hi, index); + const __vec16_i32 ret_lo = __shuffle2_i32(v0_lo, v1_lo, index); + return CASTI2L(ret_hi, ret_lo); +} +#undef CASTI2L +#undef CASTL2I + +template static FORCEINLINE __vec16_i64 __load(const __vec16_i64 *p) +{ +#ifdef ISPC_FORCE_ALIGNED_MEMORY__REMOVETHIS_WHEN_FIXED + return __load<128>(p); +#else + __vec16_i32 v1; + __vec16_i32 v2; + v2 = _mm512_extloadunpacklo_epi32(v2, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + v2 = _mm512_extloadunpackhi_epi32(v2, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + v1 = _mm512_extloadunpacklo_epi32(v1, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + v1 = _mm512_extloadunpackhi_epi32(v1, (uint8_t*)p+128, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + return __vec16_i64(v2,v1); +#endif +} + + +template static FORCEINLINE void __store(__vec16_i64 *p, __vec16_i64 v) +{ +#ifdef ISPC_FORCE_ALIGNED_MEMORY__REMOVETHIS_WHEN_FIXED + return __store<128>(p,v); +#else + __m512i v1 = v.v2; + __m512i v2 = v.v1; + _mm512_extpackstorelo_epi32(p, v2, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_epi32((uint8_t*)p+64, v2, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_extpackstorelo_epi32((uint8_t*)p+64, v1, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_epi32((uint8_t*)p+128, v1, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); +#endif +} + +#if 0 /* knc::fails as with _i32 this may generate fails ... 
so commetining it out */ +template <> static FORCEINLINE __vec16_i64 __load<64>(const __vec16_i64 *p) +{ + __m512i v2 = _mm512_load_epi32(p); + __m512i v1 = _mm512_load_epi32(((uint8_t*)p)+64); + return __vec16_i64(v2,v1); +} +template <> static FORCEINLINE __vec16_i64 __load<128>(const __vec16_i64 *p) { return __load<64>(p); } +template <> static FORCEINLINE void __store<64>(__vec16_i64 *p, __vec16_i64 v) +{ + __m512i v1 = v.v2; + __m512i v2 = v.v1; + _mm512_store_epi64(p, v2); + _mm512_store_epi64(((uint8_t*)p)+64, v1); +} +template <> static FORCEINLINE void __store<128>(__vec16_i64 *p, __vec16_i64 v) { __store<64>(p, v); } +#endif + + +/////////////////////////////////////////////////////////////////////////// +// float +/////////////////////////////////////////////////////////////////////////// + +static FORCEINLINE __vec16_f __add(__vec16_f a, __vec16_f b) { return _mm512_add_ps(a,b); } +static FORCEINLINE __vec16_f __sub(__vec16_f a, __vec16_f b) { return _mm512_sub_ps(a,b); } +static FORCEINLINE __vec16_f __mul(__vec16_f a, __vec16_f b) { return _mm512_mul_ps(a,b); } +static FORCEINLINE __vec16_f __div(__vec16_f a, __vec16_f b) { return _mm512_div_ps(a,b); } + +static FORCEINLINE __vec16_i1 __equal_float (__vec16_f a, __vec16_f b) { return _mm512_cmpeq_ps_mask (a,b); } +static FORCEINLINE __vec16_i1 __not_equal_float (__vec16_f a, __vec16_f b) { return _mm512_cmpneq_ps_mask(a,b); } +static FORCEINLINE __vec16_i1 __less_than_float (__vec16_f a, __vec16_f b) { return _mm512_cmplt_ps_mask (a,b); } +static FORCEINLINE __vec16_i1 __less_equal_float (__vec16_f a, __vec16_f b) { return _mm512_cmple_ps_mask (a,b); } +static FORCEINLINE __vec16_i1 __greater_than_float (__vec16_f a, __vec16_f b) { return _mm512_cmp_ps_mask (a,b,_CMP_GT_OS); } +static FORCEINLINE __vec16_i1 __greater_equal_float(__vec16_f a, __vec16_f b) { return _mm512_cmp_ps_mask (a,b,_CMP_GE_OS); } + +static FORCEINLINE __vec16_i1 __equal_float_and_mask (__vec16_f a, __vec16_f b, __vec16_i1 m) { return _mm512_mask_cmpeq_ps_mask (m,a,b); } +static FORCEINLINE __vec16_i1 __not_equal_float_and_mask (__vec16_f a, __vec16_f b, __vec16_i1 m) { return _mm512_mask_cmpneq_ps_mask(m,a,b); } +static FORCEINLINE __vec16_i1 __less_than_float_and_mask (__vec16_f a, __vec16_f b, __vec16_i1 m) { return _mm512_mask_cmplt_ps_mask (m,a,b); } +static FORCEINLINE __vec16_i1 __less_equal_float_and_mask (__vec16_f a, __vec16_f b, __vec16_i1 m) { return _mm512_mask_cmple_ps_mask (m,a,b); } +static FORCEINLINE __vec16_i1 __greater_than_float_and_mask (__vec16_f a, __vec16_f b, __vec16_i1 m) { return _mm512_mask_cmp_ps_mask (m,a,b,_CMP_GT_OS); } +static FORCEINLINE __vec16_i1 __greater_equal_float_and_mask(__vec16_f a, __vec16_f b, __vec16_i1 m) { return _mm512_mask_cmp_ps_mask (m,a,b,_CMP_GE_OS); } + +static FORCEINLINE __vec16_i1 __ordered_float(__vec16_f a, __vec16_f b) { return _mm512_cmpord_ps_mask (a,b); } +static FORCEINLINE __vec16_i1 __unordered_float(__vec16_f a, __vec16_f b) { return _mm512_cmpunord_ps_mask(a,b); } + +static FORCEINLINE __vec16_f __select(__vec16_i1 mask, __vec16_f a, __vec16_f b) { return _mm512_mask_mov_ps(b, mask, a); } +static FORCEINLINE __vec16_f __select( bool cond, __vec16_f a, __vec16_f b) { return cond ? 
a : b; } + +static FORCEINLINE float __extract_element(__vec16_f v, uint32_t index) { return v[index]; } +static FORCEINLINE void __insert_element(__vec16_f *v, uint32_t index, float val) { (*v)[index] = val; } + +template RetVecType __smear_float(float f); +template <> static FORCEINLINE __vec16_f __smear_float<__vec16_f>(float f) { return _mm512_set_1to16_ps(f); } + +template RetVecType __setzero_float(); +template <> static FORCEINLINE __vec16_f __setzero_float<__vec16_f>() { return _mm512_setzero_ps(); } + +template RetVecType __undef_float(); +template <> static FORCEINLINE __vec16_f __undef_float<__vec16_f>() { return __vec16_f(); } + +static FORCEINLINE __vec16_f __broadcast_float(__vec16_f _v, int index) +{ + const __vec16_i32 v = _mm512_castps_si512(_v); + return _mm512_castsi512_ps(_mm512_mask_permutevar_epi32(v, 0xFFFF, _mm512_set1_epi32(index), v)); +} + +static FORCEINLINE __vec16_f __rotate_float(__vec16_f _v, int index) +{ + const __vec16_i32 v = _mm512_castps_si512(_v); + const __vec16_i32 idx = __smear_i32<__vec16_i32>(index); + const __vec16_i32 shuffle = _mm512_and_epi32(_mm512_add_epi32(__ispc_stride1, idx), __smear_i32<__vec16_i32>(0xF)); + return _mm512_castsi512_ps(_mm512_mask_permutevar_epi32(v, 0xFFFF, shuffle, v)); +} +static FORCEINLINE __vec16_f __shuffle_float(__vec16_f v, __vec16_i32 index) +{ + return _mm512_castsi512_ps(_mm512_mask_permutevar_epi32(_mm512_castps_si512(v), 0xffff, index, _mm512_castps_si512(v))); +} +static FORCEINLINE __vec16_f __shuffle2_float(__vec16_f _v0, __vec16_f _v1, __vec16_i32 index) +{ + const __vec16_i32 v0 = _mm512_castps_si512(_v0); + const __vec16_i32 v1 = _mm512_castps_si512(_v1); + const __vec16_i1 mask = __signed_less_than_i32(index, __smear_i32<__vec16_i32>(0x10)); + index = __and(index, __smear_i32<__vec16_i32>(0xF)); + __vec16_i32 ret = __undef_i32<__vec16_i32>(); + ret = _mm512_mask_permutevar_epi32(ret, mask, index, v0); + ret = _mm512_mask_permutevar_epi32(ret, __not(mask), index, v1); + return _mm512_castsi512_ps(ret); +} + +template static FORCEINLINE __vec16_f __load(const __vec16_f *p) +{ +#ifdef ISPC_FORCE_ALIGNED_MEMORY__REMOVETHIS_WHEN_FIXED + return __load<64>(p); +#else + __vec16_f v; + v = _mm512_extloadunpacklo_ps(v, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + v = _mm512_extloadunpackhi_ps(v, (uint8_t*)p+64, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + return v; +#endif +} + +template static FORCEINLINE void __store(__vec16_f *p, __vec16_f v) +{ +#ifdef ISPC_FORCE_ALIGNED_MEMORY__REMOVETHIS_WHEN_FIXED + __store<64>(p,v); +#else + _mm512_extpackstorelo_ps( p, v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_ps((uint8_t*)p+64, v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); +#endif +} + +#if 0 /* knc::fails ./tests/gs-improve-progindex.ispc with segfault */ +template <> static FORCEINLINE __vec16_f __load<64>(const __vec16_f *p) +{ + return _mm512_load_ps(p); +} +/* this one doesn't fail but it is commented out for completeness, no aligned load/stores */ +template <> static FORCEINLINE void __store<64>(__vec16_f *p, __vec16_f v) +{ + _mm512_store_ps(p, v); +} +#endif + +/******** math ******/ + +/*** float ***/ +static FORCEINLINE float __exp_uniform_float(float v) { return expf(v);} +static FORCEINLINE __vec16_f __exp_varying_float(__vec16_f v) { return _mm512_exp_ps(v); } + +static FORCEINLINE float __log_uniform_float(float v) { return logf(v);} +static FORCEINLINE __vec16_f __log_varying_float(__vec16_f v) { return _mm512_log_ps(v); } + +static FORCEINLINE float __pow_uniform_float(float a, float b) { 
return powf(a, b);} +static FORCEINLINE __vec16_f __pow_varying_float(__vec16_f a, __vec16_f b) { return _mm512_pow_ps(a,b); } + +/*** double ***/ +static FORCEINLINE double __exp_uniform_double(double v) { return exp(v);} +static FORCEINLINE __vec16_d __exp_varying_double(__vec16_d v) { return __vec16_d(_mm512_exp_pd(v.v1),_mm512_exp_pd(v.v2)); } + +static FORCEINLINE double __log_uniform_double(double v) { return log(v);} +static FORCEINLINE __vec16_d __log_varying_double(__vec16_d v) { return __vec16_d(_mm512_log_pd(v.v1),_mm512_log_pd(v.v2)); } + +static FORCEINLINE double __pow_uniform_double(double a, double b) { return pow(a,b);} +static FORCEINLINE __vec16_d __pow_varying_double(__vec16_d a, __vec16_d b) { return __vec16_d(_mm512_pow_pd(a.v1,b.v1),_mm512_pow_pd(a.v2,b.v2)); } + +/******** bitcast ******/ + +static FORCEINLINE int __intbits(float v) { + union { + float f; + int i; + } u; + u.f = v; + return u.i; +} + +static FORCEINLINE float __floatbits(int v) { + union { + float f; + int i; + } u; + u.i = v; + return u.f; +} + +/////////////////////////////////////////////////////////////////////////// +// half<->float : this one passes the tests +// source : +// http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion +/////////////////////////////////////////////////////////////////////////// +class Float16Compressor +{ + union Bits + { + float f; + int32_t si; + uint32_t ui; + }; + + static int const shift = 13; + static int const shiftSign = 16; + + static int32_t const infN = 0x7F800000; // flt32 infinity + static int32_t const maxN = 0x477FE000; // max flt16 normal as a flt32 + static int32_t const minN = 0x38800000; // min flt16 normal as a flt32 + static int32_t const signN = 0x80000000; // flt32 sign bit + + static int32_t const infC = infN >> shift; + static int32_t const nanN = (infC + 1) << shift; // minimum flt16 nan as a flt32 + static int32_t const maxC = maxN >> shift; + static int32_t const minC = minN >> shift; + static int32_t const signC = signN >> shiftSign; // flt16 sign bit + + static int32_t const mulN = 0x52000000; // (1 << 23) / minN + static int32_t const mulC = 0x33800000; // minN / (1 << (23 - shift)) + + static int32_t const subC = 0x003FF; // max flt32 subnormal down shifted + static int32_t const norC = 0x00400; // min flt32 normal down shifted + + static int32_t const maxD = infC - maxC - 1; + static int32_t const minD = minC - subC - 1; + + public: + + static uint16_t compress(float value) + { + Bits v, s; + v.f = value; + uint32_t sign = v.si & signN; + v.si ^= sign; + sign >>= shiftSign; // logical shift + s.si = mulN; + s.si = s.f * v.f; // correct subnormals + v.si ^= (s.si ^ v.si) & -(minN > v.si); + v.si ^= (infN ^ v.si) & -((infN > v.si) & (v.si > maxN)); + v.si ^= (nanN ^ v.si) & -((nanN > v.si) & (v.si > infN)); + v.ui >>= shift; // logical shift + v.si ^= ((v.si - maxD) ^ v.si) & -(v.si > maxC); + v.si ^= ((v.si - minD) ^ v.si) & -(v.si > subC); + return v.ui | sign; + } + + static float decompress(uint16_t value) + { + Bits v; + v.ui = value; + int32_t sign = v.si & signC; + v.si ^= sign; + sign <<= shiftSign; + v.si ^= ((v.si + minD) ^ v.si) & -(v.si > subC); + v.si ^= ((v.si + maxD) ^ v.si) & -(v.si > maxC); + Bits s; + s.si = mulC; + s.f *= v.si; + int32_t mask = -(norC > v.si); + v.si <<= shift; + v.si ^= (s.si ^ v.si) & mask; + v.si |= sign; + return v.f; + } +}; + +static FORCEINLINE float __half_to_float_uniform(int16_t h) +{ + return Float16Compressor::decompress(h); +} +static FORCEINLINE __vec16_f 
__half_to_float_varying(__vec16_i16 v) +{ + __vec16_f ret; + for (int i = 0; i < 16; ++i) + ret[i] = __half_to_float_uniform(v[i]); + return ret; +} + + +static FORCEINLINE int16_t __float_to_half_uniform(float f) +{ + return Float16Compressor::compress(f); +} +static FORCEINLINE __vec16_i16 __float_to_half_varying(__vec16_f v) +{ + __vec16_i16 ret; + for (int i = 0; i < 16; ++i) + ret[i] = __float_to_half_uniform(v[i]); + return ret; +} + + +/////////////////////////////////////////////////////////////////////////// +// double +/////////////////////////////////////////////////////////////////////////// + +#define VECOP(OP) __vec16_d(_mm512_##OP(a.v1,b.v1),_mm512_##OP(a.v2,b.v2)) +static FORCEINLINE __vec16_d __add(__vec16_d a, __vec16_d b) { return VECOP(add_pd); } +static FORCEINLINE __vec16_d __sub(__vec16_d a, __vec16_d b) { return VECOP(sub_pd); } +static FORCEINLINE __vec16_d __mul(__vec16_d a, __vec16_d b) { return VECOP(mul_pd); } +static FORCEINLINE __vec16_d __div(__vec16_d a, __vec16_d b) { return VECOP(div_pd); } +#undef VECOP + +#define CMPOP(OP) _mm512_kmovlhb(_mm512_##OP(a.v1,b.v1),_mm512_##OP(a.v2,b.v2)) +static FORCEINLINE __vec16_i1 __equal_double (__vec16_d a, __vec16_d b) { return CMPOP(cmpeq_pd_mask); } +static FORCEINLINE __vec16_i1 __not_equal_double (__vec16_d a, __vec16_d b) { return CMPOP(cmpneq_pd_mask); } +static FORCEINLINE __vec16_i1 __less_than_double (__vec16_d a, __vec16_d b) { return CMPOP(cmplt_pd_mask); } +static FORCEINLINE __vec16_i1 __less_equal_double (__vec16_d a, __vec16_d b) { return CMPOP(cmple_pd_mask); } +static FORCEINLINE __vec16_i1 __greater_than_double (__vec16_d a, __vec16_d b) { return CMPOP(cmpnle_pd_mask); } +static FORCEINLINE __vec16_i1 __greater_equal_double(__vec16_d a, __vec16_d b) { return CMPOP(cmpnlt_pd_mask); } +static FORCEINLINE __vec16_i1 __ordered_double (__vec16_d a, __vec16_d b) { return CMPOP(cmpord_pd_mask); } +static FORCEINLINE __vec16_i1 __unordered_double (__vec16_d a, __vec16_d b) { return CMPOP(cmpunord_pd_mask); } +#undef CMPOP + +#define CMPOPMASK(OP) _mm512_kmovlhb(_mm512_mask_##OP(m,a.v1,b.v1),_mm512_mask_##OP(_mm512_kswapb(m,m),a.v2,b.v2)) +static FORCEINLINE __vec16_i1 __equal_double_and_mask (__vec16_d a, __vec16_d b, __vec16_i1 m) { return CMPOPMASK(cmpeq_pd_mask); } +static FORCEINLINE __vec16_i1 __not_equal_double_and_mask (__vec16_d a, __vec16_d b, __vec16_i1 m) { return CMPOPMASK(cmpneq_pd_mask); } +static FORCEINLINE __vec16_i1 __less_than_double_and_mask (__vec16_d a, __vec16_d b, __vec16_i1 m) { return CMPOPMASK(cmplt_pd_mask); } +static FORCEINLINE __vec16_i1 __less_equal_double_and_mask (__vec16_d a, __vec16_d b, __vec16_i1 m) { return CMPOPMASK(cmple_pd_mask); } +static FORCEINLINE __vec16_i1 __greater_than_double_and_mask (__vec16_d a, __vec16_d b, __vec16_i1 m) { return CMPOPMASK(cmpnle_pd_mask); } +static FORCEINLINE __vec16_i1 __greater_equal_double_and_mask(__vec16_d a, __vec16_d b, __vec16_i1 m) { return CMPOPMASK(cmpnlt_pd_mask); } +#undef CMOPMASK + + +static FORCEINLINE __vec16_d __select(__vec16_i1 m, __vec16_d a, __vec16_d b) +{ + return __vec16_d(_mm512_mask_mov_pd(b.v1, m, a.v1), _mm512_mask_mov_pd(b.v2, _mm512_kswapb(m, m), a.v2)); +} +static FORCEINLINE __vec16_d __select(bool cond, __vec16_d a, __vec16_d b) +{ + return cond ? 
a : b; +} + +static FORCEINLINE double __extract_element(__vec16_d v, uint32_t index) { return v[index]; } +static FORCEINLINE void __insert_element(__vec16_d *v, uint32_t index, double val) { (*v)[index] = val; } + +template RetVecType __smear_double(double d); +template <> static FORCEINLINE __vec16_d __smear_double<__vec16_d>(double d) { return __vec16_d(_mm512_set1_pd(d), _mm512_set1_pd(d)); } + +template RetVecType __setzero_double(); +template <> static FORCEINLINE __vec16_d __setzero_double<__vec16_d>() { return __vec16_d(_mm512_setzero_pd(), _mm512_setzero_pd()); } + +template RetVecType __undef_double(); +template <> static FORCEINLINE __vec16_d __undef_double<__vec16_d>() { return __vec16_d(); } + +#define CASTD2F(_v_, _v_hi_, _v_lo_) \ + __vec16_f _v_hi_, _v_lo_; \ + { \ + const __vec16_d v = _v_.cvt2hilo(); \ + _v_hi_ = _mm512_castpd_ps(v.v_hi); \ + _v_lo_ = _mm512_castpd_ps(v.v_lo); } +#define CASTF2D(_ret_hi_, _ret_lo_) \ + __vec16_d(_mm512_castps_pd(_ret_hi_), _mm512_castps_pd(_ret_lo_)).cvt2zmm() +static FORCEINLINE __vec16_d __broadcast_double(__vec16_d _v, int index) +{ + CASTD2F(_v, v_hi, v_lo); + const __vec16_f ret_hi = __broadcast_float(v_hi, index); + const __vec16_f ret_lo = __broadcast_float(v_lo, index); + return CASTF2D(ret_hi, ret_lo); +} +static FORCEINLINE __vec16_d __rotate_double(const __vec16_d _v, const int index) +{ + CASTD2F(_v, v_hi, v_lo); + const __vec16_f ret_hi = __rotate_float(v_hi, index); + const __vec16_f ret_lo = __rotate_float(v_lo, index); + return CASTF2D(ret_hi, ret_lo); +} +static FORCEINLINE __vec16_d __shuffle_double(__vec16_d _v, const __vec16_i32 index) +{ + CASTD2F(_v, v_hi, v_lo); + const __vec16_f ret_hi = __shuffle_float(v_hi, index); + const __vec16_f ret_lo = __shuffle_float(v_lo, index); + return CASTF2D(ret_hi, ret_lo); +} +static FORCEINLINE __vec16_d __shuffle2_double(__vec16_d _v0, __vec16_d _v1, const __vec16_i32 index) +{ + CASTD2F(_v0, v0_hi, v0_lo); + CASTD2F(_v1, v1_hi, v1_lo); + const __vec16_f ret_hi = __shuffle2_float(v0_hi, v1_hi, index); + const __vec16_f ret_lo = __shuffle2_float(v0_lo, v1_lo, index); + return CASTF2D(ret_hi, ret_lo); +} +#undef CASTF2D +#undef CASTD2F + +template static FORCEINLINE __vec16_d __load(const __vec16_d *p) \ +{ +#ifdef ISPC_FORCE_ALIGNED_MEMORY__REMOVETHIS_WHEN_FIXED + return __load<128>(p); +#else + __vec16_d ret; + ret.v1 = _mm512_extloadunpacklo_pd(ret.v1, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + ret.v1 = _mm512_extloadunpackhi_pd(ret.v1, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + ret.v2 = _mm512_extloadunpacklo_pd(ret.v2, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + ret.v2 = _mm512_extloadunpackhi_pd(ret.v2, (uint8_t*)p+128, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + return ret; +#endif +} + +template static FORCEINLINE void __store(__vec16_d *p, __vec16_d v) +{ +#ifdef ISPC_FORCE_ALIGNED_MEMORY__REMOVETHIS_WHEN_FIXED + return __store<128>(p,v); +#else + _mm512_extpackstorelo_pd(p, v.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_pd((uint8_t*)p+64, v.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); + _mm512_extpackstorelo_pd((uint8_t*)p+64, v.v2, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_pd((uint8_t*)p+128, v.v2, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); +#endif +} + + +#if 0 /* knc::fails as with _f this may generate fails ... 
so commetining it out */ +template <> static FORCEINLINE __vec16_d __load<64>(const __vec16_d *p) +{ + return __vec16_d(_mm512_load_pd(p), _mm512_load_pd(((uint8_t*)p)+64)); +} +template <> static FORCEINLINE void __store<64>(__vec16_d *p, __vec16_d v) +{ + _mm512_store_pd(p, v.v1); + _mm512_store_pd(((uint8_t*)p)+64, v.v2); +} +template <> static FORCEINLINE __vec16_d __load <128>(const __vec16_d *p) { return __load<64>(p); } +template <> static FORCEINLINE void __store<128>(__vec16_d *p, __vec16_d v) { __store<64>(p, v); } +#endif + +/////////////////////////////////////////////////////////////////////////// +// casts +/////////////////////////////////////////////////////////////////////////// + + +/* knc::macro::used */ +#define CAST(TO, STO, FROM, SFROM, FUNC) \ +static FORCEINLINE TO FUNC(TO, FROM val) { \ + TO ret; \ + for (int i = 0; i < 16; ++i) \ + ret[i] = (STO)((SFROM)(val[i])); \ + return ret; \ +} + +// sign extension conversions + +// CAST(__vec16_i64, int64_t, __vec16_i32, int32_t, __cast_sext) +static FORCEINLINE __vec16_i64 __cast_sext(const __vec16_i64 &, const __vec16_i32 &val) +{ + return __vec16_i64(_mm512_srai_epi32(val.v,31), val.v).cvt2zmm(); +} +CAST(__vec16_i64, int64_t, __vec16_i16, int16_t, __cast_sext) +CAST(__vec16_i64, int64_t, __vec16_i8, int8_t, __cast_sext) +CAST(__vec16_i32, int32_t, __vec16_i16, int16_t, __cast_sext) +CAST(__vec16_i32, int32_t, __vec16_i8, int8_t, __cast_sext) +CAST(__vec16_i16, int16_t, __vec16_i8, int8_t, __cast_sext) + +/* knc::macro::used */ +#define CAST_SEXT_I1(TYPE) \ +static FORCEINLINE TYPE __cast_sext(TYPE, __vec16_i1 v) { \ + TYPE ret; \ + for (int i = 0; i < 16; ++i) { \ + ret[i] = 0; \ + if (v.v & (1 << i)) \ + ret[i] = ~ret[i]; \ + } \ + return ret; \ +} + +CAST_SEXT_I1(__vec16_i8) +CAST_SEXT_I1(__vec16_i16) + +//CAST_SEXT_I1(__vec16_i32) +static FORCEINLINE __vec16_i32 __cast_sext(const __vec16_i32 &, const __vec16_i1 &val) +{ + __vec16_i32 ret = _mm512_setzero_epi32(); + __vec16_i32 one = _mm512_set1_epi32(-1); + return _mm512_mask_mov_epi32(ret, val, one); +} + +CAST_SEXT_I1(__vec16_i64) + +// zero extension +// CAST(__vec16_i64, uint64_t, __vec16_i32, uint32_t, __cast_zext) +static FORCEINLINE __vec16_i64 __cast_zext(const __vec16_i64 &, const __vec16_i32 &val) +{ + return __vec16_i64(_mm512_setzero_epi32(), val.v).cvt2zmm(); +} + +CAST(__vec16_i64, uint64_t, __vec16_i16, uint16_t, __cast_zext) +CAST(__vec16_i64, uint64_t, __vec16_i8, uint8_t, __cast_zext) +CAST(__vec16_i32, uint32_t, __vec16_i16, uint16_t, __cast_zext) +CAST(__vec16_i32, uint32_t, __vec16_i8, uint8_t, __cast_zext) +CAST(__vec16_i16, uint16_t, __vec16_i8, uint8_t, __cast_zext) + +/* knc::macro::used */ +#define CAST_ZEXT_I1(TYPE) \ +static FORCEINLINE TYPE __cast_zext(TYPE, __vec16_i1 v) { \ + TYPE ret; \ + for (int i = 0; i < 16; ++i) \ + ret[i] = (v.v & (1 << i)) ? 
1 : 0; \ + return ret; \ +} + +CAST_ZEXT_I1(__vec16_i8) +CAST_ZEXT_I1(__vec16_i16) + +//CAST_ZEXT_I1(__vec16_i32) +static FORCEINLINE __vec16_i32 __cast_zext(const __vec16_i32 &, const __vec16_i1 &val) +{ + __vec16_i32 ret = _mm512_setzero_epi32(); + __vec16_i32 one = _mm512_set1_epi32(1); + return _mm512_mask_mov_epi32(ret, val, one); +} + +CAST_ZEXT_I1(__vec16_i64) + +// truncations +CAST(__vec16_i32, int32_t, __vec16_i64, int64_t, __cast_trunc) +CAST(__vec16_i16, int16_t, __vec16_i64, int64_t, __cast_trunc) +CAST(__vec16_i8, int8_t, __vec16_i64, int64_t, __cast_trunc) +CAST(__vec16_i16, int16_t, __vec16_i32, int32_t, __cast_trunc) +CAST(__vec16_i8, int8_t, __vec16_i32, int32_t, __cast_trunc) +CAST(__vec16_i8, int8_t, __vec16_i16, int16_t, __cast_trunc) + +// signed int to float/double + +//CAST(__vec16_f, float, __vec16_i8, int8_t, __cast_sitofp) +static FORCEINLINE __vec16_f __cast_sitofp(__vec16_f, __vec16_i8 val) {return _mm512_extload_ps(&val, _MM_UPCONV_PS_SINT8, _MM_BROADCAST_16X16, _MM_HINT_NONE);} +//CAST(__vec16_f, float, __vec16_i16, int16_t, __cast_sitofp) +static FORCEINLINE __vec16_f __cast_sitofp(__vec16_f, __vec16_i16 val) {return _mm512_extload_ps(&val, _MM_UPCONV_PS_SINT16, _MM_BROADCAST_16X16, _MM_HINT_NONE);} +//CAST(__vec16_f, float, __vec16_i32, int32_t, __cast_sitofp) +static FORCEINLINE __vec16_f __cast_sitofp(__vec16_f, __vec16_i32 val) {return _mm512_cvtfxpnt_round_adjustepi32_ps(val, _MM_ROUND_MODE_NEAREST, _MM_EXPADJ_NONE);} + +CAST(__vec16_f, float, __vec16_i64, int64_t, __cast_sitofp) + +//CAST(__vec16_d, double, __vec16_i8, int8_t, __cast_sitofp) +static FORCEINLINE __vec16_d __cast_sitofp(__vec16_d, __vec16_i8 val) +{ + __vec16_i32 vi = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_SINT8, _MM_BROADCAST_16X16, _MM_HINT_NONE); + __vec16_d ret; + ret.v1 = _mm512_cvtepi32lo_pd(vi); + __vec16_i32 other8 = _mm512_permute4f128_epi32(vi, _MM_PERM_DCDC); + ret.v2 = _mm512_cvtepi32lo_pd(other8); + return ret; +} + +// CAST(__vec16_d, double, __vec16_i16, int16_t, __cast_sitofp) +static FORCEINLINE __vec16_d __cast_sitofp(__vec16_d, __vec16_i16 val) +{ + __vec16_i32 vi = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_SINT16, _MM_BROADCAST_16X16, _MM_HINT_NONE); + __vec16_d ret; + ret.v1 = _mm512_cvtepi32lo_pd(vi); + __vec16_i32 other8 = _mm512_permute4f128_epi32(vi, _MM_PERM_DCDC); + ret.v2 = _mm512_cvtepi32lo_pd(other8); + return ret; +} + +// CAST(__vec16_d, double, __vec16_i32, int32_t, __cast_sitofp) +static FORCEINLINE __vec16_d __cast_sitofp(__vec16_d, __vec16_i32 val) +{ + __vec16_d ret; + ret.v1 = _mm512_cvtepi32lo_pd(val); + __vec16_i32 other8 = _mm512_permute4f128_epi32(val, _MM_PERM_DCDC); + ret.v2 = _mm512_cvtepi32lo_pd(other8); + return ret; +} + +CAST(__vec16_d, double, __vec16_i64, int64_t, __cast_sitofp) + +// unsigned int to float/double + +// CAST(__vec16_f, float, __vec16_i8, uint8_t, __cast_uitofp) +static FORCEINLINE __vec16_f __cast_uitofp(__vec16_f, __vec16_i8 val) {return _mm512_extload_ps(&val, _MM_UPCONV_PS_UINT8, _MM_BROADCAST_16X16, _MM_HINT_NONE);} +//CAST(__vec16_f, float, __vec16_i16, uint16_t, __cast_uitofp) +static FORCEINLINE __vec16_f __cast_uitofp(__vec16_f, __vec16_i16 val) {return _mm512_extload_ps(&val, _MM_UPCONV_PS_UINT16, _MM_BROADCAST_16X16, _MM_HINT_NONE);} +//CAST(__vec16_f, float, __vec16_i32, uint32_t, __cast_uitofp) +static FORCEINLINE __vec16_f __cast_uitofp(__vec16_f, __vec16_i32 val) {return _mm512_cvtfxpnt_round_adjustepu32_ps(val, _MM_ROUND_MODE_NEAREST, _MM_EXPADJ_NONE);} + +CAST(__vec16_f, float, __vec16_i64, 
uint64_t, __cast_uitofp) + +// CAST(__vec16_d, double, __vec16_i8, uint8_t, __cast_uitofp) +static FORCEINLINE __vec16_d __cast_uitofp(__vec16_d, __vec16_i8 val) +{ + __vec16_i32 vi = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_UINT8, _MM_BROADCAST_16X16, _MM_HINT_NONE); + __vec16_d ret; + ret.v1 = _mm512_cvtepu32lo_pd(vi); + __vec16_i32 other8 = _mm512_permute4f128_epi32(vi, _MM_PERM_DCDC); + ret.v2 = _mm512_cvtepu32lo_pd(other8); + return ret; +} + +// CAST(__vec16_d, double, __vec16_i16, uint16_t, __cast_uitofp) +static FORCEINLINE __vec16_d __cast_uitofp(__vec16_d, __vec16_i16 val) +{ + __vec16_i32 vi = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_UINT16, _MM_BROADCAST_16X16, _MM_HINT_NONE); + __vec16_d ret; + ret.v1 = _mm512_cvtepu32lo_pd(vi); + __vec16_i32 other8 = _mm512_permute4f128_epi32(vi, _MM_PERM_DCDC); + ret.v2 = _mm512_cvtepu32lo_pd(other8); + return ret; +} + +// CAST(__vec16_d, double, __vec16_i32, uint32_t, __cast_uitofp) +static FORCEINLINE __vec16_d __cast_uitofp(__vec16_d, __vec16_i32 val) +{ + __vec16_d ret; + ret.v1 = _mm512_cvtepu32lo_pd(val); + __vec16_i32 other8 = _mm512_permute4f128_epi32(val, _MM_PERM_DCDC); + ret.v2 = _mm512_cvtepu32lo_pd(other8); + return ret; +} + +CAST(__vec16_d, double, __vec16_i64, uint64_t, __cast_uitofp) + +static FORCEINLINE __vec16_f __cast_uitofp(__vec16_f, __vec16_i1 v) +{ + const __m512 ret = _mm512_setzero_ps(); + const __m512 one = _mm512_set1_ps(1.0); + return _mm512_mask_mov_ps(ret, v, one); +} + +// float/double to signed int +CAST(__vec16_i8, int8_t, __vec16_f, float, __cast_fptosi) +CAST(__vec16_i16, int16_t, __vec16_f, float, __cast_fptosi) + +// CAST(__vec16_i32, int32_t, __vec16_f, float, __cast_fptosi) +static FORCEINLINE __vec16_i32 __cast_fptosi(__vec16_i32, __vec16_f val) +{ + return _mm512_cvtfxpnt_round_adjustps_epi32(val, _MM_ROUND_MODE_TOWARD_ZERO, _MM_EXPADJ_NONE); +} + +CAST(__vec16_i64, int64_t, __vec16_f, float, __cast_fptosi) +CAST(__vec16_i8, int8_t, __vec16_d, double, __cast_fptosi) +CAST(__vec16_i16, int16_t, __vec16_d, double, __cast_fptosi) +#if 0 /* knc::2implement */ +#else +CAST(__vec16_i32, int32_t, __vec16_d, double, __cast_fptosi) +#endif +CAST(__vec16_i64, int64_t, __vec16_d, double, __cast_fptosi) + +// float/double to unsigned int +CAST(__vec16_i8, uint8_t, __vec16_f, float, __cast_fptoui) +CAST(__vec16_i16, uint16_t, __vec16_f, float, __cast_fptoui) + +// CAST(__vec16_i32, uint32_t, __vec16_f, float, __cast_fptoui) +static FORCEINLINE __vec16_i32 __cast_fptoui(__vec16_i32, __vec16_f val) +{ + return _mm512_cvtfxpnt_round_adjustps_epu32(val, _MM_ROUND_MODE_TOWARD_ZERO, _MM_EXPADJ_NONE); +} + +CAST(__vec16_i64, uint64_t, __vec16_f, float, __cast_fptoui) +CAST(__vec16_i8, uint8_t, __vec16_d, double, __cast_fptoui) +CAST(__vec16_i16, uint16_t, __vec16_d, double, __cast_fptoui) +#if 0 /* knc::2implement */ +#else +CAST(__vec16_i32, uint32_t, __vec16_d, double, __cast_fptoui) +#endif +CAST(__vec16_i64, uint64_t, __vec16_d, double, __cast_fptoui) + +// float/double conversions + +// CAST(__vec16_f, float, __vec16_d, double, __cast_fptrunc) +static FORCEINLINE __vec16_f __cast_fptrunc(__vec16_f, __vec16_d val) +{ + __m512i r0i = _mm512_castps_si512(_mm512_cvtpd_pslo(val.v1)); + __m512i r1i = _mm512_castps_si512(_mm512_cvtpd_pslo(val.v2)); + + return _mm512_castsi512_ps(_mm512_mask_permute4f128_epi32(r0i, 0xFF00, r1i, _MM_PERM_BABA)); +} + +// CAST(__vec16_d, double, __vec16_f, float, __cast_fpext) +static FORCEINLINE __vec16_d __cast_fpext(__vec16_d, __vec16_f val) +{ + __vec16_d ret; + ret.v1 = 
_mm512_cvtpslo_pd(val.v);
+    __vec16_f other8 = _mm512_castsi512_ps(_mm512_permute4f128_epi32(_mm512_castps_si512(val.v), _MM_PERM_DCDC));
+    ret.v2 = _mm512_cvtpslo_pd(other8);
+    return ret;
+}
+
+typedef union {
+    int32_t i32;
+    float f;
+    int64_t i64;
+    double d;
+} BitcastUnion;
+
+/* knc::macro::not used */
+#define CAST_BITS(TO, TO_ELT, FROM, FROM_ELT) \
+static FORCEINLINE TO __cast_bits(TO, FROM val) { \
+    TO r; \
+    for (int i = 0; i < 16; ++i) { \
+        BitcastUnion u; \
+        u.FROM_ELT = val[i]; \
+        r[i] = u.TO_ELT; \
+    } \
+    return r; \
+}
+
+// CAST_BITS(__vec16_f, f, __vec16_i32, i32)
+static FORCEINLINE __vec16_f __cast_bits(__vec16_f, __vec16_i32 val) { return _mm512_castsi512_ps(val); }
+// CAST_BITS(__vec16_i32, i32, __vec16_f, f)
+static FORCEINLINE __vec16_i32 __cast_bits(__vec16_i32, __vec16_f val) { return _mm512_castps_si512(val); }
+
+// CAST_BITS(__vec16_d, d, __vec16_i64, i64)
+static FORCEINLINE __vec16_i64 __cast_bits(__vec16_i64, __vec16_d val) { return *(__vec16_i64*)&val; }
+// CAST_BITS(__vec16_i64, i64, __vec16_d, d)
+static FORCEINLINE __vec16_d __cast_bits(__vec16_d, __vec16_i64 val) { return *(__vec16_d*)&val; }
+
+/* knc::macro::used */
+#define CAST_BITS_SCALAR(TO, FROM) \
+static FORCEINLINE TO __cast_bits(TO, FROM v) { \
+    union { \
+    TO to; \
+    FROM from; \
+    } u; \
+    u.from = v; \
+    return u.to; \
+}
+
+CAST_BITS_SCALAR(uint32_t, float)
+CAST_BITS_SCALAR(int32_t, float)
+CAST_BITS_SCALAR(float, uint32_t)
+CAST_BITS_SCALAR(float, int32_t)
+CAST_BITS_SCALAR(uint64_t, double)
+CAST_BITS_SCALAR(int64_t, double)
+CAST_BITS_SCALAR(double, uint64_t)
+CAST_BITS_SCALAR(double, int64_t)
+
+///////////////////////////////////////////////////////////////////////////
+// various math functions
+///////////////////////////////////////////////////////////////////////////
+
+static FORCEINLINE void __fastmath() {
+}
+
+static FORCEINLINE float __round_uniform_float(float v) {
+    return roundf(v);
+}
+
+static FORCEINLINE float __floor_uniform_float(float v) {
+    return floorf(v);
+}
+
+static FORCEINLINE float __ceil_uniform_float(float v) {
+    return ceilf(v);
+}
+
+static FORCEINLINE double __round_uniform_double(double v) {
+    return round(v);
+}
+
+static FORCEINLINE double __floor_uniform_double(double v) {
+    return floor(v);
+}
+
+static FORCEINLINE double __ceil_uniform_double(double v) {
+    return ceil(v);
+}
+
+static FORCEINLINE __vec16_f __round_varying_float(__vec16_f v) { return _mm512_round_ps(v, _MM_ROUND_MODE_NEAREST, _MM_EXPADJ_NONE); }
+static FORCEINLINE __vec16_f __floor_varying_float(__vec16_f v) { return _mm512_floor_ps(v); }
+static FORCEINLINE __vec16_f __ceil_varying_float(__vec16_f v) { return _mm512_ceil_ps(v); }
+
+static FORCEINLINE __vec16_d __round_varying_float(__vec16_d v) { return __vec16_d(_mm512_svml_round_pd(v.v1), _mm512_svml_round_pd(v.v2)); }
+static FORCEINLINE __vec16_d __floor_varying_float(__vec16_d v) { return __vec16_d(_mm512_floor_pd(v.v1), _mm512_floor_pd(v.v2)); }
+static FORCEINLINE __vec16_d __ceil_varying_float(__vec16_d v) { return __vec16_d(_mm512_ceil_pd(v.v1), _mm512_ceil_pd(v.v2)); }
+
+// min/max
+
+static FORCEINLINE float  __min_uniform_float (float a, float b) { return (a<b) ? a : b; }
+static FORCEINLINE float  __max_uniform_float (float a, float b) { return (a>b) ? a : b; }
+static FORCEINLINE double __min_uniform_double(double a, double b) { return (a<b) ? a : b; }
+static FORCEINLINE double __max_uniform_double(double a, double b) { return (a>b) ? a : b; }
+
+static FORCEINLINE int32_t __min_uniform_int32 ( int32_t a,  int32_t b) { return (a<b) ? a : b; }
+static FORCEINLINE int32_t __max_uniform_int32 ( int32_t a,  int32_t b) { return (a>b) ? a : b; }
+static FORCEINLINE int32_t __min_uniform_uint32(uint32_t a, uint32_t b) { return (a<b) ? a : b; }
+static FORCEINLINE int32_t __max_uniform_uint32(uint32_t a, uint32_t b) { return (a>b) ? a : b; }
+
+static FORCEINLINE int64_t __min_uniform_int64 ( int64_t a,  int64_t b) { return (a<b) ? a : b; }
+static FORCEINLINE int64_t __max_uniform_int64 ( int64_t a,  int64_t b) { return (a>b) ? a : b; }
+static FORCEINLINE int64_t __min_uniform_uint64(uint64_t a, uint64_t b) { return (a<b) ? a : b; }
+static FORCEINLINE int64_t __max_uniform_uint64(uint64_t a, uint64_t b) { return (a>b) ? a : b; }
+
+static FORCEINLINE __vec16_f __max_varying_float (__vec16_f v1, __vec16_f v2) { return _mm512_gmax_ps(v1, v2);}
+static FORCEINLINE __vec16_f __min_varying_float (__vec16_f v1, __vec16_f v2) { return _mm512_gmin_ps(v1, v2);}
+static FORCEINLINE __vec16_d __max_varying_double(__vec16_d v1, __vec16_d v2) { return __vec16_d(_mm512_gmax_pd(v1.v1, v2.v1),_mm512_gmax_pd(v1.v2,v2.v2));}
+static FORCEINLINE __vec16_d __min_varying_double(__vec16_d v1, __vec16_d v2) { return __vec16_d(_mm512_gmin_pd(v1.v1, v2.v1),_mm512_gmin_pd(v1.v2,v2.v2));}
+
+static FORCEINLINE __vec16_i32 __max_varying_int32 (__vec16_i32 v1, __vec16_i32 v2) { return _mm512_max_epi32(v1, v2);}
+static FORCEINLINE __vec16_i32 __min_varying_int32 (__vec16_i32 v1, __vec16_i32 v2) { return _mm512_min_epi32(v1, v2);}
+static FORCEINLINE __vec16_i32 __max_varying_uint32(__vec16_i32 v1, __vec16_i32 v2) { return _mm512_max_epu32(v1, v2);}
+static FORCEINLINE __vec16_i32 __min_varying_uint32(__vec16_i32 v1, __vec16_i32 v2) { return _mm512_min_epu32(v1, v2);}
+
+BINARY_OP_FUNC(__vec16_i64, __max_varying_int64, __max_uniform_int64)
+BINARY_OP_FUNC(__vec16_i64, __min_varying_int64, __min_uniform_int64)
+BINARY_OP_FUNC(__vec16_i64, __max_varying_uint64, __max_uniform_uint64)
+BINARY_OP_FUNC(__vec16_i64, __min_varying_uint64, __min_uniform_uint64)
+
+// sqrt/rsqrt/rcp
+
+static FORCEINLINE float  __rsqrt_uniform_float(float v) { return 1.f / sqrtf(v); }
+static FORCEINLINE float  __rcp_uniform_float  (float v) { return 1.f / v; }
+static FORCEINLINE float  __sqrt_uniform_float (float v) { return sqrtf(v); }
+static FORCEINLINE double __sqrt_uniform_double(double v) { return sqrt (v); }
+
+static FORCEINLINE __vec16_f __rcp_varying_float(__vec16_f v)
+{
+#ifdef ISPC_FAST_MATH
+    return _mm512_rcp23_ps(v); // Approximation with 23 bits of accuracy.
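+    // (Here and in __rsqrt_varying_float below, the *23 forms are the fast,
+    // reduced-accuracy hardware approximations; without ISPC_FAST_MATH the
+    // code falls back to the slower _mm512_recip_ps / _mm512_invsqrt_ps calls.)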
+#else + return _mm512_recip_ps(v); +#endif +} + +static FORCEINLINE __vec16_f __rsqrt_varying_float(__vec16_f v) +{ +#ifdef ISPC_FAST_MATH + return _mm512_rsqrt23_ps(v); // Approximation with 0.775ULP accuracy +#else + return _mm512_invsqrt_ps(v); +#endif +} +static FORCEINLINE __vec16_f __sqrt_varying_float (__vec16_f v) { return _mm512_sqrt_ps(v);} +static FORCEINLINE __vec16_d __sqrt_varying_double(__vec16_d v) { return __vec16_d(_mm512_sqrt_pd(v.v1),_mm512_sqrt_pd(v.v2));} + +/////////////////////////////////////////////////////////////////////////// +// svml +/////////////////////////////////////////////////////////////////////////// + +static FORCEINLINE __vec16_f __svml_sinf (__vec16_f v) { return _mm512_sin_ps(v); } +static FORCEINLINE __vec16_f __svml_asinf (__vec16_f v) { return _mm512_asin_ps(v); } +static FORCEINLINE __vec16_f __svml_cosf (__vec16_f v) { return _mm512_cos_ps(v); } +static FORCEINLINE __vec16_f __svml_tanf (__vec16_f v) { return _mm512_tan_ps(v); } +static FORCEINLINE __vec16_f __svml_atanf (__vec16_f v) { return _mm512_atan_ps(v); } +static FORCEINLINE __vec16_f __svml_atan2f(__vec16_f a, __vec16_f b) { return _mm512_atan2_ps(a,b); } +static FORCEINLINE __vec16_f __svml_expf (__vec16_f v) { return _mm512_exp_ps(v); } +static FORCEINLINE __vec16_f __svml_logf (__vec16_f v) { return _mm512_log_ps(v); } +static FORCEINLINE __vec16_f __svml_powf (__vec16_f a, __vec16_f b) { return _mm512_pow_ps(a,b); } + +static FORCEINLINE __vec16_d __svml_sind (__vec16_d v) { return __vec16_d(_mm512_sin_pd(v.v1), _mm512_sin_pd(v.v2)); } +static FORCEINLINE __vec16_d __svml_asind (__vec16_d v) { return __vec16_d(_mm512_asin_pd(v.v1), _mm512_asin_pd(v.v2)); } +static FORCEINLINE __vec16_d __svml_cosd (__vec16_d v) { return __vec16_d(_mm512_cos_pd(v.v1), _mm512_cos_pd(v.v2)); } +static FORCEINLINE __vec16_d __svml_tand (__vec16_d v) { return __vec16_d(_mm512_tan_pd(v.v1), _mm512_tan_pd(v.v2)); } +static FORCEINLINE __vec16_d __svml_atand (__vec16_d v) { return __vec16_d(_mm512_atan_pd(v.v1), _mm512_atan_pd(v.v2)); } +static FORCEINLINE __vec16_d __svml_atan2d(__vec16_d a, __vec16_d b) { return __vec16_d(_mm512_atan2_pd(a.v1,b.v1), _mm512_atan2_pd(a.v2,b.v2)); } +static FORCEINLINE __vec16_d __svml_expd (__vec16_d v) { return __vec16_d(_mm512_exp_pd(v.v1), _mm512_exp_pd(v.v2)); } +static FORCEINLINE __vec16_d __svml_logd (__vec16_d v) { return __vec16_d(_mm512_log_pd(v.v1), _mm512_log_pd(v.v2)); } +static FORCEINLINE __vec16_d __svml_powd (__vec16_d a, __vec16_d b) { return __vec16_d(_mm512_pow_pd(a.v1,b.v1), _mm512_pow_pd(a.v2,b.v2)); } + +/////////////////////////////////////////////////////////////////////////// +// bit ops +/////////////////////////////////////////////////////////////////////////// + +static FORCEINLINE int32_t __popcnt_int32(uint32_t v) { + int count = 0; + for (; v != 0; v >>= 1) + count += (v & 1); + return count; +} + +static FORCEINLINE int32_t __popcnt_int64(uint64_t v) { + int count = 0; + for (; v != 0; v >>= 1) + count += (v & 1); + return count; +} + +static FORCEINLINE int32_t __count_trailing_zeros_i32(uint32_t v) { + if (v == 0) + return 32; + + int count = 0; + while ((v & 1) == 0) { + ++count; + v >>= 1; + } + return count; +} + +static FORCEINLINE int64_t __count_trailing_zeros_i64(uint64_t v) { + if (v == 0) + return 64; + + int count = 0; + while ((v & 1) == 0) { + ++count; + v >>= 1; + } + return count; +} + +static FORCEINLINE int32_t __count_leading_zeros_i32(uint32_t v) { + if (v == 0) + return 32; + + int count = 0; + while ((v & (1<<31)) 
== 0) { + ++count; + v <<= 1; + } + return count; +} + +static FORCEINLINE int64_t __count_leading_zeros_i64(uint64_t v) { + if (v == 0) + return 64; + + int count = 0; + while ((v & (1ull<<63)) == 0) { + ++count; + v <<= 1; + } + return count; +} + +/////////////////////////////////////////////////////////////////////////// +// reductions +/////////////////////////////////////////////////////////////////////////// + +static FORCEINLINE float __reduce_add_float(__vec16_f v) { return _mm512_reduce_add_ps(v); } +static FORCEINLINE float __reduce_min_float(__vec16_f v) { return _mm512_reduce_min_ps(v); } +static FORCEINLINE float __reduce_max_float(__vec16_f v) { return _mm512_reduce_max_ps(v); } + +static FORCEINLINE float __reduce_add_double(__vec16_d v) { return _mm512_reduce_add_pd(v.v1) + _mm512_reduce_add_pd(v.v2); } +static FORCEINLINE float __reduce_min_double(__vec16_d v) { return std::min(_mm512_reduce_min_pd(v.v1), _mm512_reduce_min_pd(v.v2)); } +static FORCEINLINE float __reduce_max_double(__vec16_d v) { return std::max(_mm512_reduce_max_pd(v.v1), _mm512_reduce_max_pd(v.v2)); } + + + +static FORCEINLINE int64_t __reduce_add_int32 (__vec16_i32 v) { return _mm512_reduce_add_epi32(v);} +static FORCEINLINE int32_t __reduce_min_int32 (__vec16_i32 v) { return _mm512_reduce_min_epi32(v);} +static FORCEINLINE int32_t __reduce_max_int32 (__vec16_i32 v) { return _mm512_reduce_max_epi32(v);} +static FORCEINLINE uint32_t __reduce_min_uint32 (__vec16_i32 v) { return _mm512_reduce_min_epu32(v);} +static FORCEINLINE uint32_t __reduce_max_uint32 (__vec16_i32 v) { return _mm512_reduce_max_epu32(v);} + +REDUCE_ADD ( int16_t, __vec16_i8, __reduce_add_int8) +REDUCE_ADD ( int32_t, __vec16_i16, __reduce_add_int16) +REDUCE_ADD ( int64_t, __vec16_i64, __reduce_add_int64) +REDUCE_MINMAX( int64_t, __vec16_i64, __reduce_min_int64, <) +REDUCE_MINMAX( int64_t, __vec16_i64, __reduce_max_int64, >) +REDUCE_MINMAX(uint64_t, __vec16_i64, __reduce_min_uint64, <) +REDUCE_MINMAX(uint64_t, __vec16_i64, __reduce_max_uint64, >) + +/////////////////////////////////////////////////////////////////////////// +// masked load/store +/////////////////////////////////////////////////////////////////////////// + +static FORCEINLINE __vec16_i8 __masked_load_i8(void *p, + __vec16_i1 mask) { + __vec16_i8 ret; + int8_t *ptr = (int8_t *)p; + for (int i = 0; i < 16; ++i) + if ((mask.v & (1 << i)) != 0) + ret[i] = ptr[i]; + return ret; +} + +static FORCEINLINE __vec16_i16 __masked_load_i16(void *p, + __vec16_i1 mask) { + __vec16_i16 ret; + int16_t *ptr = (int16_t *)p; + for (int i = 0; i < 16; ++i) + if ((mask.v & (1 << i)) != 0) + ret[i] = ptr[i]; + return ret; +} + +static FORCEINLINE __vec16_i32 __masked_load_i32(void *p, __vec16_i1 mask) +{ +#ifdef ISPC_FORCE_ALIGNED_MEMORY + return _mm512_mask_load_epi32(__vec16_i32(), mask, p); +#else + __vec16_i32 tmp; + tmp.v = _mm512_mask_extloadunpacklo_epi32(tmp.v, 0xFFFF, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + tmp.v = _mm512_mask_extloadunpackhi_epi32(tmp.v, 0xFFFF, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + __vec16_i32 ret; + return _mm512_mask_mov_epi32(ret.v, mask, tmp.v); +#endif +} + +static FORCEINLINE __vec16_f __masked_load_float(void *p, __vec16_i1 mask) +{ +#ifdef ISPC_FORCE_ALIGNED_MEMORY + return _mm512_mask_load_ps(_mm512_undefined_ps(), mask,p); +#else + __vec16_f tmp; + tmp.v = _mm512_mask_extloadunpacklo_ps(tmp.v, 0xFFFF, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + tmp.v = _mm512_mask_extloadunpackhi_ps(tmp.v, 0xFFFF, (uint8_t*)p+64, _MM_UPCONV_PS_NONE, 
_MM_HINT_NONE); + __vec16_f ret; + return _mm512_mask_mov_ps(ret.v, mask, tmp.v); +#endif +} + +static FORCEINLINE __vec16_i64 __masked_load_i64(void *p, + __vec16_i1 mask) { + __vec16_i64 ret; + int64_t *ptr = (int64_t *)p; + for (int i = 0; i < 16; ++i) + if ((mask.v & (1 << i)) != 0) + ret[i] = ptr[i]; + return ret; +} + +static FORCEINLINE __vec16_d __masked_load_double(void *p, __vec16_i1 mask) +{ +#ifdef ISPC_FORCE_ALIGNED_MEMORY + __vec16_d ret; + __vec16_i1 tmp_m = mask; + tmp_m = _mm512_kswapb(tmp_m, tmp_m); + ret.v1 = _mm512_mask_load_pd(ret.v1, mask, p); + ret.v2 = _mm512_mask_load_pd(ret.v2, tmp_m, (uint8_t*)p+64); + return ret; +#else + __vec16_d tmp; + tmp.v1 = _mm512_mask_extloadunpacklo_pd(tmp.v1, 0xFF, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v1 = _mm512_mask_extloadunpackhi_pd(tmp.v1, 0xFF, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v2 = _mm512_mask_extloadunpacklo_pd(tmp.v2, 0xFF, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v2 = _mm512_mask_extloadunpackhi_pd(tmp.v2, 0xFF, (uint8_t*)p+128, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + __vec16_d ret; + __vec16_i1 tmp_m = mask; + tmp_m = _mm512_kswapb(tmp_m, tmp_m); + ret.v1 = _mm512_mask_mov_pd(ret.v1, mask, tmp.v1); + ret.v2 = _mm512_mask_mov_pd(ret.v2, tmp_m, tmp.v2); + return ret; +#endif +} + + +static FORCEINLINE void __masked_store_i8(void *p, __vec16_i8 val, + __vec16_i1 mask) { + int8_t *ptr = (int8_t *)p; + for (int i = 0; i < 16; ++i) + if ((mask.v & (1 << i)) != 0) + ptr[i] = val[i]; +} + +static FORCEINLINE void __masked_store_i16(void *p, __vec16_i16 val, + __vec16_i1 mask) { + int16_t *ptr = (int16_t *)p; + for (int i = 0; i < 16; ++i) + if ((mask.v & (1 << i)) != 0) + ptr[i] = val[i]; +} + +static FORCEINLINE void __masked_store_i32(void *p, __vec16_i32 val, __vec16_i1 mask) +{ +#ifdef ISPC_FORCE_ALIGNED_MEMORY + _mm512_mask_store_epi32(p, mask, val.v); +#else + __vec16_i32 tmp; + tmp.v = _mm512_extloadunpacklo_epi32(tmp.v, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + tmp.v = _mm512_extloadunpackhi_epi32(tmp.v, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + tmp.v = _mm512_mask_mov_epi32(tmp.v, mask, val.v); + _mm512_extpackstorelo_epi32(p, tmp.v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_epi32((uint8_t*)p+64, tmp.v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); +#endif +} + +static FORCEINLINE void __masked_store_float(void *p, __vec16_f val, __vec16_i1 mask) +{ +#ifdef ISPC_FORCE_ALIGNED_MEMORY + _mm512_mask_store_ps(p, mask, val.v); +#else + __vec16_f tmp; + tmp.v = _mm512_extloadunpacklo_ps(tmp.v, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + tmp.v = _mm512_extloadunpackhi_ps(tmp.v, (uint8_t*)p+64, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + tmp.v = _mm512_mask_mov_ps(tmp.v, mask, val.v); + _mm512_extpackstorelo_ps(p, tmp.v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_ps((uint8_t*)p+64, tmp.v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); +#endif +} + +static FORCEINLINE void __masked_store_i64(void *p, __vec16_i64 val, + __vec16_i1 mask) { + int64_t *ptr = (int64_t *)p; + for (int i = 0; i < 16; ++i) + if ((mask.v & (1 << i)) != 0) + ptr[i] = val[i]; +} + +static FORCEINLINE void __masked_store_double(void *p, __vec16_d val, __vec16_i1 mask) +{ +#ifdef ISPC_FORCE_ALIGNED_MEMORY + __vec16_i1 tmp_m = mask; + tmp_m = _mm512_kswapb(tmp_m, tmp_m); + _mm512_mask_store_pd(p, mask, val.v1); + _mm512_mask_store_pd((uint8_t*)p+64, tmp_m, val.v2); +#else + __vec16_d tmp; + __vec16_i1 tmp_m = mask; + tmp_m = _mm512_kswapb(tmp_m, tmp_m); + tmp.v1 = 
_mm512_extloadunpacklo_pd(tmp.v1, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v1 = _mm512_extloadunpackhi_pd(tmp.v1, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v2 = _mm512_extloadunpacklo_pd(tmp.v2, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v2 = _mm512_extloadunpackhi_pd(tmp.v2, (uint8_t*)p+128, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v1 = _mm512_mask_mov_pd(tmp.v1, mask, val.v1); + tmp.v2 = _mm512_mask_mov_pd(tmp.v2, tmp_m, val.v2); + _mm512_extpackstorelo_pd(p, tmp.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_pd((uint8_t*)p+64, tmp.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); + _mm512_extpackstorelo_pd((uint8_t*)p+64, tmp.v2, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_pd((uint8_t*)p+128, tmp.v2, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); +#endif +} + +static FORCEINLINE void __masked_store_blend_i8(void *p, __vec16_i8 val, + __vec16_i1 mask) { + __masked_store_i8(p, val, mask); +} + +static FORCEINLINE void __masked_store_blend_i16(void *p, __vec16_i16 val, + __vec16_i1 mask) { + __masked_store_i16(p, val, mask); +} + +static FORCEINLINE void __masked_store_blend_i32(void *p, __vec16_i32 val, + __vec16_i1 mask) { + __masked_store_i32(p, val, mask); +} + +static FORCEINLINE void __masked_store_blend_float(void *p, __vec16_f val, + __vec16_i1 mask) { + __masked_store_float(p, val, mask); +} + +static FORCEINLINE void __masked_store_blend_i64(void *p, __vec16_i64 val, + __vec16_i1 mask) { + __masked_store_i64(p, val, mask); +} + +static FORCEINLINE void __masked_store_blend_double(void *p, __vec16_d val, + __vec16_i1 mask) { + __masked_store_double(p, val, mask); +} + +/////////////////////////////////////////////////////////////////////////// +// gather/scatter +/////////////////////////////////////////////////////////////////////////// + +// offsets * offsetScale is in bytes (for all of these) + +/* knc::macro::used */ +#define GATHER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ +static FORCEINLINE VTYPE FUNC(unsigned char *b, uint32_t scale, \ + OTYPE offset, __vec16_i1 mask) { \ + VTYPE ret; \ + int8_t *base = (int8_t *)b; \ + for (int i = 0; i < 16; ++i) \ + if ((mask.v & (1 << i)) != 0) { \ + STYPE *ptr = (STYPE *)(base + scale * offset[i]); \ + ret[i] = *ptr; \ + } \ + return ret; \ +} + + +/****************/ +// GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __gather_base_offsets32_i8) +static FORCEINLINE __vec16_i8 __gather_base_offsets32_i8(uint8_t *base, uint32_t scale, __vec16_i32 offsets, __vec16_i1 mask) +{ + // (iw): need to temporarily store as int because gathers can only return ints. 
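+    // The i32extgather with _MM_UPCONV_EPI32_SINT8 reads one byte per active lane
+    // and sign-extends it into a 32-bit lane; the extstore below then down-converts
+    // those 16 ints back to 8-bit values in ret.data.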
+ __vec16_i32 tmp = _mm512_mask_i32extgather_epi32(_mm512_undefined_epi32(), mask, offsets, base, + _MM_UPCONV_EPI32_SINT8, scale, + _MM_HINT_NONE); + // now, downconverting to chars into temporary char vector + __vec16_i8 ret; + _mm512_extstore_epi32(ret.data,tmp,_MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE); + return ret; +} +// GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __gather_base_offsets64_i8) +static FORCEINLINE __vec16_i8 __gather_base_offsets64_i8(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_i1 mask) +{ + const __vec16_i64 offsets = _offsets.cvt2hilo(); + __vec16_i1 still_to_do = mask; + __vec16_i32 tmp; + while (still_to_do) { + int first_active_lane = _mm_tzcnt_32((int)still_to_do); + const uint &hi32 = ((uint*)&offsets.v_hi)[first_active_lane]; + __vec16_i1 match = _mm512_mask_cmp_epi32_mask(mask,offsets.v_hi, + __smear_i32<__vec16_i32>((int32_t)hi32), + _MM_CMPINT_EQ); + + void * base = (void*)((unsigned long)_base + + ((scale*(unsigned long)hi32) << 32)); + tmp = _mm512_mask_i32extgather_epi32(tmp, match, offsets.v_lo, base, + _MM_UPCONV_EPI32_SINT8, scale, + _MM_HINT_NONE); + still_to_do = _mm512_kxor(match,still_to_do); + } + __vec16_i8 ret; + _mm512_extstore_epi32(ret.data,tmp,_MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE); + return ret; +} +/****************/ +GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __gather_base_offsets32_i16) +GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __gather_base_offsets64_i16) +/****************/ +// GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __gather_base_offsets32_i32) +static FORCEINLINE __vec16_i32 __gather_base_offsets32_i32(uint8_t *base, uint32_t scale, __vec16_i32 offsets, __vec16_i1 mask) +{ + return _mm512_mask_i32extgather_epi32(_mm512_undefined_epi32(), mask, offsets, + base, _MM_UPCONV_EPI32_NONE, scale, + _MM_HINT_NONE); +} +// GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __gather_base_offsets64_i32) +static FORCEINLINE __vec16_i32 __gather_base_offsets64_i32(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_i1 mask) +{ + const __vec16_i64 offsets = _offsets.cvt2hilo(); + // There is no gather instruction with 64-bit offsets in KNC. 
+ // We have to manually iterate over the upper 32 bits ;-) + __vec16_i1 still_to_do = mask; + __vec16_i32 ret; + while (still_to_do) { + int first_active_lane = _mm_tzcnt_32((int)still_to_do); + const uint &hi32 = ((uint*)&offsets.v_hi)[first_active_lane]; + __vec16_i1 match = _mm512_mask_cmp_epi32_mask(mask,offsets.v_hi, + __smear_i32<__vec16_i32>((int32_t)hi32), + _MM_CMPINT_EQ); + + void * base = (void*)((unsigned long)_base + + ((scale*(unsigned long)hi32) << 32)); + ret = _mm512_mask_i32extgather_epi32(ret, match, offsets.v_lo, base, + _MM_UPCONV_EPI32_NONE, scale, + _MM_HINT_NONE); + still_to_do = _mm512_kxor(match, still_to_do); + } + + return ret; +} +/****************/ +// GATHER_BASE_OFFSETS(__vec16_f, float, __vec16_i32, __gather_base_offsets32_float) +static FORCEINLINE __vec16_f __gather_base_offsets32_float(uint8_t *base, uint32_t scale, __vec16_i32 offsets, __vec16_i1 mask) +{ + return _mm512_mask_i32extgather_ps(_mm512_undefined_ps(), mask, offsets, + base, _MM_UPCONV_PS_NONE, scale, + _MM_HINT_NONE); +} +// GATHER_BASE_OFFSETS(__vec16_f, float, __vec16_i64, __gather_base_offsets64_float) +static FORCEINLINE __vec16_f __gather_base_offsets64_float(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_i1 mask) +{ + const __vec16_i64 offsets = _offsets.cvt2hilo(); + // There is no gather instruction with 64-bit offsets in KNC. + // We have to manually iterate over the upper 32 bits ;-) + __vec16_i1 still_to_do = mask; + __vec16_f ret; + while (still_to_do) { + int first_active_lane = _mm_tzcnt_32((int)still_to_do); + const uint &hi32 = ((uint*)&offsets.v_hi)[first_active_lane]; + __vec16_i1 match = _mm512_mask_cmp_epi32_mask(mask,offsets.v_hi, + __smear_i32<__vec16_i32>((int32_t)hi32), + _MM_CMPINT_EQ); + + void * base = (void*)((unsigned long)_base + + ((scale*(unsigned long)hi32) << 32)); + ret = _mm512_mask_i32extgather_ps(ret, match, offsets.v_lo, base, + _MM_UPCONV_PS_NONE, scale, + _MM_HINT_NONE); + still_to_do = _mm512_kxor(match, still_to_do); + } + + return ret; +} +/****************/ +GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __gather_base_offsets32_i64) +GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __gather_base_offsets64_i64) +/****************/ +// GATHER_BASE_OFFSETS(__vec16_d, double, __vec16_i32, __gather_base_offsets32_double) +static FORCEINLINE __vec16_d __gather_base_offsets32_double(uint8_t *base, uint32_t scale, __vec16_i32 offsets, __vec16_i1 mask) +{ + __vec16_d ret; + ret.v1 = _mm512_mask_i32loextgather_pd(_mm512_undefined_pd(), mask, offsets, + base, _MM_UPCONV_PD_NONE, scale, + _MM_HINT_NONE); + __m512i shuffled_offsets = _mm512_permute4f128_epi32(offsets.v, _MM_PERM_DCDC); + const __mmask8 mask8 = 0x00FF & (mask >> 8); /* knc::testme */ + ret.v2 = _mm512_mask_i32loextgather_pd(_mm512_undefined_pd(), mask8, shuffled_offsets, + base, _MM_UPCONV_PD_NONE, scale, + _MM_HINT_NONE); + return ret; +} +GATHER_BASE_OFFSETS(__vec16_d, double, __vec16_i64, __gather_base_offsets64_double) + +/* knc::macro::used */ +#define GATHER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \ +static FORCEINLINE VTYPE FUNC(PTRTYPE ptrs, __vec16_i1 mask) { \ + VTYPE ret; \ + for (int i = 0; i < 16; ++i) \ + if ((mask.v & (1 << i)) != 0) { \ + STYPE *ptr = (STYPE *)ptrs[i]; \ + ret[i] = *ptr; \ + } \ + return ret; \ +} +/* knc::macro::used */ +#define GATHER_GENERALF(VTYPE, STYPE, PTRTYPE, FUNC,FUNC1) \ +static FORCEINLINE VTYPE FUNC(PTRTYPE ptrs, __vec16_i1 mask) { \ + return FUNC1(0, 1, ptrs, mask); \ +} + + +/***********/ +GATHER_GENERALF(__vec16_i8, 
int8_t, __vec16_i32, __gather32_i8, __gather_base_offsets32_i8) +GATHER_GENERALF(__vec16_i16, int16_t, __vec16_i32, __gather32_i16, __gather_base_offsets32_i16) +GATHER_GENERALF(__vec16_i32, int32_t, __vec16_i32, __gather32_i32, __gather_base_offsets32_i32) +GATHER_GENERALF(__vec16_i64, int64_t, __vec16_i32, __gather32_i64, __gather_base_offsets32_i64) +GATHER_GENERALF(__vec16_f, float, __vec16_i32, __gather32_float, __gather_base_offsets32_float) +GATHER_GENERALF(__vec16_d, double, __vec16_i32, __gather32_double, __gather_base_offsets32_double) +/***********/ +GATHER_GENERAL(__vec16_i8, int8_t, __vec16_i64, __gather64_i8); +GATHER_GENERAL(__vec16_i16, int16_t, __vec16_i64, __gather64_i16); +GATHER_GENERAL(__vec16_i32, int32_t, __vec16_i64, __gather64_i32); +GATHER_GENERAL(__vec16_i64, int64_t, __vec16_i64, __gather64_i64); +GATHER_GENERAL(__vec16_f, float, __vec16_i64, __gather64_float); +GATHER_GENERAL(__vec16_d, double, __vec16_i64, __gather64_double); +/***********/ + +// scatter + +/* knc::macro::used */ +#define SCATTER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ +static FORCEINLINE void FUNC(unsigned char *b, uint32_t scale, \ + OTYPE offset, VTYPE val, \ + __vec16_i1 mask) { \ + int8_t *base = (int8_t *)b; \ + for (int i = 0; i < 16; ++i) \ + if ((mask.v & (1 << i)) != 0) { \ + STYPE *ptr = (STYPE *)(base + scale * offset[i]); \ + *ptr = val[i]; \ + } \ +} + + +/*****************/ +SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __scatter_base_offsets32_i8) +SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __scatter_base_offsets64_i8) +/*****************/ +SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __scatter_base_offsets32_i16) +SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __scatter_base_offsets64_i16) +/*****************/ +// SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __scatter_base_offsets32_i32) +static FORCEINLINE void __scatter_base_offsets32_i32(uint8_t *b, uint32_t scale, __vec16_i32 offsets, __vec16_i32 val, __vec16_i1 mask) +{ + _mm512_mask_i32extscatter_epi32(b, mask, offsets, val, + _MM_DOWNCONV_EPI32_NONE, scale, + _MM_HINT_NONE); +} +// SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __scatter_base_offsets64_i32) +static FORCEINLINE void __scatter_base_offsets64_i32(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_i32 value, __vec16_i1 mask) +{ + const __vec16_i64 offsets = _offsets.cvt2hilo(); + + __vec16_i1 still_to_do = mask; + while (still_to_do) { + int first_active_lane = _mm_tzcnt_32((int)still_to_do); + const uint &hi32 = ((uint*)&offsets.v_hi)[first_active_lane]; + __vec16_i1 match = _mm512_mask_cmp_epi32_mask(mask,offsets.v_hi, + __smear_i32<__vec16_i32>((int32_t)hi32), + _MM_CMPINT_EQ); + + void * base = (void*)((unsigned long)_base + + ((scale*(unsigned long)hi32) << 32)); + _mm512_mask_i32extscatter_epi32(base, match, offsets.v_lo, + value, + _MM_DOWNCONV_EPI32_NONE, scale, + _MM_HINT_NONE); + still_to_do = _mm512_kxor(match,still_to_do); + } +} +/*****************/ +// SCATTER_BASE_OFFSETS(__vec16_f, float, __vec16_i32, __scatter_base_offsets32_float) +static FORCEINLINE void __scatter_base_offsets32_float(void *base, uint32_t scale, __vec16_i32 offsets, + __vec16_f val, __vec16_i1 mask) +{ + _mm512_mask_i32extscatter_ps(base, mask, offsets, val, + _MM_DOWNCONV_PS_NONE, scale, + _MM_HINT_NONE); +} +//SCATTER_BASE_OFFSETS(__vec16_f, float, __vec16_i64, __scatter_base_offsets64_float) +static FORCEINLINE void __scatter_base_offsets64_float(uint8_t *_base, uint32_t scale, __vec16_i64 
_offsets, __vec16_f value, __vec16_i1 mask) +{ + const __vec16_i64 offsets = _offsets.cvt2hilo(); + + __vec16_i1 still_to_do = mask; + while (still_to_do) { + int first_active_lane = _mm_tzcnt_32((int)still_to_do); + const uint &hi32 = ((uint*)&offsets.v_hi)[first_active_lane]; + __vec16_i1 match = _mm512_mask_cmp_epi32_mask(mask,offsets.v_hi, + __smear_i32<__vec16_i32>((int32_t)hi32), + _MM_CMPINT_EQ); + + void * base = (void*)((unsigned long)_base + + ((scale*(unsigned long)hi32) << 32)); + _mm512_mask_i32extscatter_ps(base, match, offsets.v_lo, + value, + _MM_DOWNCONV_PS_NONE, scale, + _MM_HINT_NONE); + still_to_do = _mm512_kxor(match,still_to_do); + } +} +/*****************/ +SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __scatter_base_offsets32_i64) +SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __scatter_base_offsets64_i64) +/*****************/ +// SCATTER_BASE_OFFSETS(__vec16_d, double, __vec16_i32, __scatter_base_offsets32_double) +static FORCEINLINE void __scatter_base_offsets32_double(void *base, uint32_t scale, __vec16_i32 offsets, + __vec16_d val, __vec16_i1 mask) +{ + _mm512_mask_i32loextscatter_pd(base, mask, offsets, val.v1, + _MM_DOWNCONV_PD_NONE, scale, + _MM_HINT_NONE); + __m512i shuffled_offsets = _mm512_permute4f128_epi32(offsets.v, _MM_PERM_DCDC); + const __mmask8 mask8 = 0x00FF & (mask >> 8); /* knc::testme */ + _mm512_mask_i32loextscatter_pd(base, mask8, shuffled_offsets, val.v2, + _MM_DOWNCONV_PD_NONE, scale, + _MM_HINT_NONE); +} +SCATTER_BASE_OFFSETS(__vec16_d, double, __vec16_i64, __scatter_base_offsets64_double) + +/* knc::macro::used */ +#define SCATTER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \ +static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec16_i1 mask) { \ + VTYPE ret; \ + for (int i = 0; i < 16; ++i) \ + if ((mask.v & (1 << i)) != 0) { \ + STYPE *ptr = (STYPE *)ptrs[i]; \ + *ptr = val[i]; \ + } \ +} +/* knc::macro::used */ +#define SCATTER_GENERALF(VTYPE, STYPE, PTRTYPE, FUNC,FUNC1) \ +static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec16_i1 mask) { \ + return FUNC1(0, 1, ptrs, val, mask); \ +} + +/***********/ +SCATTER_GENERALF(__vec16_i8, int8_t, __vec16_i32, __scatter32_i8, __scatter_base_offsets32_i8) +SCATTER_GENERALF(__vec16_i16, int16_t, __vec16_i32, __scatter32_i16, __scatter_base_offsets32_i16) +SCATTER_GENERALF(__vec16_i32, int32_t, __vec16_i32, __scatter32_i32, __scatter_base_offsets32_i32) +SCATTER_GENERALF(__vec16_i64, int64_t, __vec16_i32, __scatter32_i64, __scatter_base_offsets32_i64) +SCATTER_GENERALF(__vec16_f, float, __vec16_i32, __scatter32_float, __scatter_base_offsets32_float) +SCATTER_GENERALF(__vec16_d, double, __vec16_i32, __scatter32_double, __scatter_base_offsets32_double) +/***********/ +SCATTER_GENERAL(__vec16_i8, int8_t, __vec16_i64, __scatter64_i8) +SCATTER_GENERAL(__vec16_i16, int16_t, __vec16_i64, __scatter64_i16) +SCATTER_GENERAL(__vec16_i32, int32_t, __vec16_i64, __scatter64_i32) +SCATTER_GENERAL(__vec16_f, float, __vec16_i64, __scatter64_float) +SCATTER_GENERAL(__vec16_i64, int64_t, __vec16_i64, __scatter64_i64) +SCATTER_GENERAL(__vec16_d, double, __vec16_i64, __scatter64_double) +/***********/ + +/////////////////////////////////////////////////////////////////////////// +// packed load/store +/////////////////////////////////////////////////////////////////////////// + + +static FORCEINLINE int32_t __packed_load_active(uint32_t *p, __vec16_i32 *val, __vec16_i1 mask) +{ + __vec16_i32 v = __load<64>(val); + v = _mm512_mask_extloadunpacklo_epi32(v, mask, p, _MM_UPCONV_EPI32_NONE, 
_MM_HINT_NONE); + v = _mm512_mask_extloadunpackhi_epi32(v, mask, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + __store<64>(val, v); + return _mm_countbits_32(uint32_t(mask)); +} + +static FORCEINLINE int32_t __packed_store_active(uint32_t *p, __vec16_i32 val, __vec16_i1 mask) +{ + _mm512_mask_extpackstorelo_epi32(p, mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + return _mm_countbits_32(uint32_t(mask)); +} + +static FORCEINLINE int32_t __packed_load_active(int32_t *p, __vec16_i32 *val, __vec16_i1 mask) +{ + __vec16_i32 v = __load<64>(val); + v = _mm512_mask_extloadunpacklo_epi32(v, mask, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + v = _mm512_mask_extloadunpackhi_epi32(v, mask, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + __store<64>(val, v); + return _mm_countbits_32(uint32_t(mask)); +} + +static FORCEINLINE int32_t __packed_store_active(int32_t *p, __vec16_i32 val, __vec16_i1 mask) +{ + _mm512_mask_extpackstorelo_epi32(p, mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + return _mm_countbits_32(uint32_t(mask)); +} + +/////////////////////////////////////////////////////////////////////////// +// aos/soa +/////////////////////////////////////////////////////////////////////////// + +static FORCEINLINE void __soa_to_aos3_float(__vec16_f v0, __vec16_f v1, __vec16_f v2, + float *ptr) { + for (int i = 0; i < 16; ++i) { + *ptr++ = __extract_element(v0, i); + *ptr++ = __extract_element(v1, i); + *ptr++ = __extract_element(v2, i); + } +} + +static FORCEINLINE void __aos_to_soa3_float(float *ptr, __vec16_f *out0, __vec16_f *out1, + __vec16_f *out2) { + for (int i = 0; i < 16; ++i) { + __insert_element(out0, i, *ptr++); + __insert_element(out1, i, *ptr++); + __insert_element(out2, i, *ptr++); + } +} + +static FORCEINLINE void __soa_to_aos4_float(__vec16_f v0, __vec16_f v1, __vec16_f v2, + __vec16_f v3, float *ptr) { + for (int i = 0; i < 16; ++i) { + *ptr++ = __extract_element(v0, i); + *ptr++ = __extract_element(v1, i); + *ptr++ = __extract_element(v2, i); + *ptr++ = __extract_element(v3, i); + } +} + +static FORCEINLINE void __aos_to_soa4_float(float *ptr, __vec16_f *out0, __vec16_f *out1, + __vec16_f *out2, __vec16_f *out3) { + for (int i = 0; i < 16; ++i) { + __insert_element(out0, i, *ptr++); + __insert_element(out1, i, *ptr++); + __insert_element(out2, i, *ptr++); + __insert_element(out3, i, *ptr++); + } +} + +/////////////////////////////////////////////////////////////////////////// +// prefetch +/////////////////////////////////////////////////////////////////////////// + +static FORCEINLINE void __prefetch_read_uniform_1(unsigned char *p) { + _mm_prefetch((char *)p, _MM_HINT_T0); // prefetch into L1$ +} + +static FORCEINLINE void __prefetch_read_uniform_2(unsigned char *p) { + _mm_prefetch((char *)p, _MM_HINT_T1); // prefetch into L2$ +} + +static FORCEINLINE void __prefetch_read_uniform_3(unsigned char *p) { + // There is no L3$ on KNC, don't want to pollute L2$ unecessarily +} + +static FORCEINLINE void __prefetch_read_uniform_nt(unsigned char *p) { + _mm_prefetch((char *)p, _MM_HINT_T2); // prefetch into L2$ with non-temporal hint + // _mm_prefetch(p, _MM_HINT_NTA); // prefetch into L1$ with non-temporal hint +} + +/////////////////////////////////////////////////////////////////////////// +// atomics 
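+// These are scalar (uniform) atomics: on Windows they map onto the Interlocked*
+// intrinsics and elsewhere onto the GCC __sync builtins; the min/max variants
+// are built from compare-and-swap retry loops.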
+/////////////////////////////////////////////////////////////////////////// + +static FORCEINLINE uint32_t __atomic_add(uint32_t *p, uint32_t v) { +#ifdef _MSC_VER + return InterlockedAdd((LONG volatile *)p, v) - v; +#else + return __sync_fetch_and_add(p, v); +#endif +} + +static FORCEINLINE uint32_t __atomic_sub(uint32_t *p, uint32_t v) { +#ifdef _MSC_VER + return InterlockedAdd((LONG volatile *)p, -v) + v; +#else + return __sync_fetch_and_sub(p, v); +#endif +} + +static FORCEINLINE uint32_t __atomic_and(uint32_t *p, uint32_t v) { +#ifdef _MSC_VER + return InterlockedAnd((LONG volatile *)p, v); +#else + return __sync_fetch_and_and(p, v); +#endif +} + +static FORCEINLINE uint32_t __atomic_or(uint32_t *p, uint32_t v) { +#ifdef _MSC_VER + return InterlockedOr((LONG volatile *)p, v); +#else + return __sync_fetch_and_or(p, v); +#endif +} + +static FORCEINLINE uint32_t __atomic_xor(uint32_t *p, uint32_t v) { +#ifdef _MSC_VER + return InterlockedXor((LONG volatile *)p, v); +#else + return __sync_fetch_and_xor(p, v); +#endif +} + +static FORCEINLINE uint32_t __atomic_min(uint32_t *p, uint32_t v) { + int32_t old, min; + do { + old = *((volatile int32_t *)p); + min = (old < (int32_t)v) ? old : (int32_t)v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange((LONG volatile *)p, min, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, min) == false); +#endif + return old; +} + +static FORCEINLINE uint32_t __atomic_max(uint32_t *p, uint32_t v) { + int32_t old, max; + do { + old = *((volatile int32_t *)p); + max = (old > (int32_t)v) ? old : (int32_t)v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange((LONG volatile *)p, max, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, max) == false); +#endif + return old; +} + +static FORCEINLINE uint32_t __atomic_umin(uint32_t *p, uint32_t v) { + uint32_t old, min; + do { + old = *((volatile uint32_t *)p); + min = (old < v) ? old : v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange((LONG volatile *)p, min, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, min) == false); +#endif + return old; +} + +static FORCEINLINE uint32_t __atomic_umax(uint32_t *p, uint32_t v) { + uint32_t old, max; + do { + old = *((volatile uint32_t *)p); + max = (old > v) ? 
old : v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange((LONG volatile *)p, max, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, max) == false); +#endif + return old; +} + +static FORCEINLINE uint32_t __atomic_xchg(uint32_t *p, uint32_t v) { +#ifdef _MSC_VER + return InterlockedExchange((LONG volatile *)p, v); +#else + return __sync_lock_test_and_set(p, v); +#endif +} + +static FORCEINLINE uint32_t __atomic_cmpxchg(uint32_t *p, uint32_t cmpval, + uint32_t newval) { +#ifdef _MSC_VER + return InterlockedCompareExchange((LONG volatile *)p, newval, cmpval); +#else + return __sync_val_compare_and_swap(p, cmpval, newval); +#endif +} + +static FORCEINLINE uint64_t __atomic_add(uint64_t *p, uint64_t v) { +#ifdef _MSC_VER + return InterlockedAdd64((LONGLONG volatile *)p, v) - v; +#else + return __sync_fetch_and_add(p, v); +#endif +} + +static FORCEINLINE uint64_t __atomic_sub(uint64_t *p, uint64_t v) { +#ifdef _MSC_VER + return InterlockedAdd64((LONGLONG volatile *)p, -v) + v; +#else + return __sync_fetch_and_sub(p, v); +#endif +} + +static FORCEINLINE uint64_t __atomic_and(uint64_t *p, uint64_t v) { +#ifdef _MSC_VER + return InterlockedAnd64((LONGLONG volatile *)p, v) - v; +#else + return __sync_fetch_and_and(p, v); +#endif +} + +static FORCEINLINE uint64_t __atomic_or(uint64_t *p, uint64_t v) { +#ifdef _MSC_VER + return InterlockedOr64((LONGLONG volatile *)p, v) - v; +#else + return __sync_fetch_and_or(p, v); +#endif +} + +static FORCEINLINE uint64_t __atomic_xor(uint64_t *p, uint64_t v) { +#ifdef _MSC_VER + return InterlockedXor64((LONGLONG volatile *)p, v) - v; +#else + return __sync_fetch_and_xor(p, v); +#endif +} + +static FORCEINLINE uint64_t __atomic_min(uint64_t *p, uint64_t v) { + int64_t old, min; + do { + old = *((volatile int64_t *)p); + min = (old < (int64_t)v) ? old : (int64_t)v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange64((LONGLONG volatile *)p, min, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, min) == false); +#endif + return old; +} + +static FORCEINLINE uint64_t __atomic_max(uint64_t *p, uint64_t v) { + int64_t old, max; + do { + old = *((volatile int64_t *)p); + max = (old > (int64_t)v) ? old : (int64_t)v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange64((LONGLONG volatile *)p, max, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, max) == false); +#endif + return old; +} + +static FORCEINLINE uint64_t __atomic_umin(uint64_t *p, uint64_t v) { + uint64_t old, min; + do { + old = *((volatile uint64_t *)p); + min = (old < v) ? old : v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange64((LONGLONG volatile *)p, min, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, min) == false); +#endif + return old; +} + +static FORCEINLINE uint64_t __atomic_umax(uint64_t *p, uint64_t v) { + uint64_t old, max; + do { + old = *((volatile uint64_t *)p); + max = (old > v) ? 
old : v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange64((LONGLONG volatile *)p, max, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, max) == false); +#endif + return old; +} + +static FORCEINLINE uint64_t __atomic_xchg(uint64_t *p, uint64_t v) { +#ifdef _MSC_VER + return InterlockedExchange64((LONGLONG volatile *)p, v); +#else + return __sync_lock_test_and_set(p, v); +#endif +} + +static FORCEINLINE uint64_t __atomic_cmpxchg(uint64_t *p, uint64_t cmpval, + uint64_t newval) { +#ifdef _MSC_VER + return InterlockedCompareExchange64((LONGLONG volatile *)p, newval, cmpval); +#else + return __sync_val_compare_and_swap(p, cmpval, newval); +#endif +} + +#ifdef WIN32 +#include +#define __clock __rdtsc +#else // WIN32 +static FORCEINLINE uint64_t __clock() { + uint32_t low, high; +#ifdef __x86_64 + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%rax", "%rbx", "%rcx", "%rdx" ); +#else + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%eax", "%ebx", "%ecx", "%edx" ); +#endif + __asm__ __volatile__ ("rdtsc" : "=a" (low), "=d" (high)); + return (uint64_t)high << 32 | low; +} + +#endif // !WIN32 + +#undef FORCEINLINE +#undef PRE_ALIGN +#undef POST_ALIGN diff --git a/examples/intrinsics/knc-i1x8.h b/examples/intrinsics/knc-i1x8.h new file mode 100644 index 00000000..d7696117 --- /dev/null +++ b/examples/intrinsics/knc-i1x8.h @@ -0,0 +1,2818 @@ +/** + Copyright (c) 2010-2012, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#include +#include +#include +#include +#include +#include + +#ifdef _MSC_VER +#define FORCEINLINE __forceinline +#define PRE_ALIGN(x) /*__declspec(align(x))*/ +#define POST_ALIGN(x) +#define roundf(x) (floorf(x + .5f)) +#define round(x) (floor(x + .5)) +#else +#define FORCEINLINE __forceinline +#define PRE_ALIGN(x) +#define POST_ALIGN(x) __attribute__ ((aligned(x))) +#endif + +#define KNC 1 +#if 0 +extern "C" +{ + int printf(const unsigned char *, ...); + int puts(unsigned char *); + unsigned int putchar(unsigned int); + int fflush(void *); + uint8_t *memcpy(uint8_t *, uint8_t *, uint64_t); + uint8_t *memset(uint8_t *, uint8_t, uint64_t); + void memset_pattern16(void *, const void *, uint64_t); +} +#endif + +typedef float __vec1_f; +typedef double __vec1_d; +typedef int8_t __vec1_i8; +typedef int16_t __vec1_i16; +typedef int32_t __vec1_i32; +typedef int64_t __vec1_i64; + +struct __vec8_i1 { + __vec8_i1() { } + __vec8_i1(const __mmask8 &vv) : v(vv) { } + __vec8_i1(bool v0, bool v1, bool v2, bool v3, + bool v4, bool v5, bool v6, bool v7) { + v = ((v0 & 1) | + ((v1 & 1) << 1) | + ((v2 & 1) << 2) | + ((v3 & 1) << 3) | + ((v4 & 1) << 4) | + ((v5 & 1) << 5) | + ((v6 & 1) << 6) | + ((v7 & 1) << 7) ); + } + + __mmask8 v; + FORCEINLINE operator __mmask8() const { return v; }//0xFF & v; } +}; + + +template +struct vec8 { + vec8() { } + vec8(T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7) { + data[0] = v0; data[1] = v1; data[2] = v2; data[3] = v3; + data[4] = v4; data[5] = v5; data[6] = v6; data[7] = v7; + } + T data[8]; + FORCEINLINE const T& operator[](const int i) const { return data[i]; } + FORCEINLINE T& operator[](const int i) { return data[i]; } +}; + +/****************/ + +struct PRE_ALIGN(32) __vec8_i32 +{ +#ifdef __ZMM64BIT__ + __m512i _data; + FORCEINLINE __vec8_i32(const __m512i &in) : _data(in) {} + FORCEINLINE operator __m512i() const { return _data; } +#else /* __ZMM64BIT__ */ + typedef int32_t _v8si __attribute__((vector_size(32))); + _v8si _data; + FORCEINLINE __vec8_i32(const __m512i &in) + { + _mm512_mask_extpackstorelo_epi32((__m512i*)&_data, 0xFF, in, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + } + FORCEINLINE operator __m512i() const + { + return _mm512_extloadunpacklo_epi32(_mm512_setzero_epi32(), (uint8_t*)&_data, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + } +#endif /* __ZMM64BIT__ */ + + __vec8_i32() { } + FORCEINLINE __vec8_i32(int32_t v0, int32_t v1, int32_t v2, int32_t v3, + int32_t v4, int32_t v5, int32_t v6, int32_t v7) + { + const __m512i v = _mm512_set_16to16_pi(0,0,0,0,0,0,0,0, v7, v6, v5, v4, v3, v2, v1, v0); + *this = __vec8_i32(v); + } + + FORCEINLINE const int32_t& operator[](const int i) const { return ((int32_t*)this)[i]; } + FORCEINLINE int32_t& operator[](const int i) { return ((int32_t*)this)[i]; } +} POST_ALIGN(32); + +PRE_ALIGN(32) struct __vec8_f +{ +#ifdef __ZMM64BIT__ + __m512 _data; + FORCEINLINE __vec8_f(const __m512 &in) : _data(in) {} + FORCEINLINE operator __m512() const { return _data; } +#else /* __ZMM64BIT__ */ + typedef float _v8sf __attribute__((vector_size(32))); + _v8sf _data; + FORCEINLINE __vec8_f(const __m512 &in) + { + _mm512_mask_extpackstorelo_ps((__m512*)&_data, 0xFF, in, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); + } + FORCEINLINE operator __m512() const + { + return _mm512_extloadunpacklo_ps(_mm512_setzero_ps(), (uint8_t*)&_data, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + } +#endif /* __ZMM64BIT__ */ + FORCEINLINE __vec8_f() { } + FORCEINLINE __vec8_f(float v0, float v1, float v2, float v3, + float v4, float v5, float v6, float v7) + { + 
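+        // Only the low eight lanes of a __vec8_f are meaningful: build a full
+        // 16-lane value with the upper half zeroed and hand it to the __m512
+        // constructor, which (in the non-__ZMM64BIT__ build) packs the low
+        // eight floats back into the 256-bit storage.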
const __m512 v = _mm512_set_16to16_ps(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0, v7, v6, v5, v4, v3, v2, v1, v0); + *this = __vec8_f(v); + } + + FORCEINLINE const float& operator[](const int i) const { return ((float*)this)[i]; } + FORCEINLINE float& operator[](const int i) { return ((float*)this)[i]; } +} POST_ALIGN(32); + +struct PRE_ALIGN(64) __vec8_d +{ + __m512d v; + FORCEINLINE __vec8_d() : v(_mm512_undefined_pd()) {} + FORCEINLINE __vec8_d(const __m512d _v) : v(_v) {} + FORCEINLINE __vec8_d(const __vec8_d &o) : v(o.v) {} + FORCEINLINE __vec8_d& operator =(const __vec8_d &o) { v=o.v; return *this; } + FORCEINLINE operator __m512d() const { return v; } + FORCEINLINE __vec8_d(double v00, double v01, double v02, double v03, + double v04, double v05, double v06, double v07) : + v ( _mm512_set_8to8_pd(v07, v06, v05, v04, v03, v02, v01, v00) ) {} + FORCEINLINE const double& operator[](const int i) const { return ((double*)this)[i]; } + FORCEINLINE double& operator[](const int i) { return ((double*)this)[i]; } +} POST_ALIGN(64); + +/****************/ + +PRE_ALIGN(64) struct __vec8_i64 : public vec8 { + __vec8_i64() { } + __vec8_i64(int64_t v0, int64_t v1, int64_t v2, int64_t v3, + int64_t v4, int64_t v5, int64_t v6, int64_t v7) + : vec8(v0, v1, v2, v3, v4, v5, v6, v7) { } +} POST_ALIGN(64); + +PRE_ALIGN(16) struct __vec8_i8 : public vec8 { + __vec8_i8() { } + __vec8_i8(int8_t v0, int8_t v1, int8_t v2, int8_t v3, + int8_t v4, int8_t v5, int8_t v6, int8_t v7) + : vec8(v0, v1, v2, v3, v4, v5, v6, v7) { } +} POST_ALIGN(16); + +PRE_ALIGN(32) struct __vec8_i16 : public vec8 { + __vec8_i16() { } + __vec8_i16(int16_t v0, int16_t v1, int16_t v2, int16_t v3, + int16_t v4, int16_t v5, int16_t v6, int16_t v7) + : vec8(v0, v1, v2, v3, v4, v5, v6, v7) { } +} POST_ALIGN(32); + +static inline int32_t __extract_element(__vec8_i32, int); + + +/////////////////////////////////////////////////////////////////////////// +// macros... 
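+// These macros stamp out simple per-lane loop implementations for the vector
+// types that have no dedicated 512-bit path in this 8-wide layout (i8, i16 and,
+// in the disabled branch further down, i32). As a rough illustration,
+// BINARY_OP(__vec8_i8, __add, +) expands to:
+//
+//     static FORCEINLINE __vec8_i8 __add(__vec8_i8 a, __vec8_i8 b) {
+//         __vec8_i8 ret;
+//         for (int i = 0; i < 8; ++i)
+//             ret[i] = a[i] + b[i];
+//         return ret;
+//     }
+//
+// The int32 operations below use masked _mm512_* intrinsics instead, with
+// mask 0xFF selecting the eight live lanes.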
+ +#define UNARY_OP(TYPE, NAME, OP) \ +static FORCEINLINE TYPE NAME(TYPE v) { \ + TYPE ret; \ + for (int i = 0; i < 8; ++i) \ + ret[i] = OP(v[i]); \ + return ret; \ +} + +#define BINARY_OP(TYPE, NAME, OP) \ +static FORCEINLINE TYPE NAME(TYPE a, TYPE b) { \ + TYPE ret; \ + for (int i = 0; i < 8; ++i) \ + ret[i] = a[i] OP b[i]; \ + return ret; \ +} + +#define BINARY_OP_CAST(TYPE, CAST, NAME, OP) \ +static FORCEINLINE TYPE NAME(TYPE a, TYPE b) { \ + TYPE ret; \ + for (int i = 0; i < 8; ++i) \ + ret[i] = (CAST)(a[i]) OP (CAST)(b[i]); \ + return ret; \ +} + +#define BINARY_OP_FUNC(TYPE, NAME, FUNC) \ +static FORCEINLINE TYPE NAME(TYPE a, TYPE b) { \ + TYPE ret; \ + for (int i = 0; i < 8; ++i) \ + ret[i] = FUNC(a[i], b[i]); \ + return ret; \ +} + +#define CMP_OP(TYPE, SUFFIX, CAST, NAME, OP) \ +static FORCEINLINE __vec8_i1 NAME##_##SUFFIX(TYPE a, TYPE b) { \ + __vec8_i1 ret; \ + ret.v = 0; \ + for (int i = 0; i < 8; ++i) \ + ret.v |= ((CAST)(a[i]) OP (CAST)(b[i])) << i; \ + return ret; \ +} \ +static FORCEINLINE __vec8_i1 NAME##_##SUFFIX##_and_mask(TYPE a, TYPE b, \ + __vec8_i1 mask) { \ + __vec8_i1 ret; \ + ret.v = 0; \ + for (int i = 0; i < 8; ++i) \ + ret.v |= ((CAST)(a[i]) OP (CAST)(b[i])) << i; \ + ret.v &= mask.v; \ + return ret; \ +} + +#define INSERT_EXTRACT(VTYPE, STYPE) \ +static FORCEINLINE STYPE __extract_element(VTYPE v, int index) { \ + return ((STYPE *)&v)[index]; \ +} \ +static FORCEINLINE void __insert_element(VTYPE *v, int index, STYPE val) { \ + ((STYPE *)v)[index] = val; \ +} + +#define LOAD_STORE(VTYPE, STYPE) \ +template \ +static FORCEINLINE VTYPE __load(const VTYPE *p) { \ + STYPE *ptr = (STYPE *)p; \ + VTYPE ret; \ + for (int i = 0; i < 8; ++i) \ + ret[i] = ptr[i]; \ + return ret; \ +} \ +template \ +static FORCEINLINE void __store(VTYPE *p, VTYPE v) { \ + STYPE *ptr = (STYPE *)p; \ + for (int i = 0; i < 8; ++i) \ + ptr[i] = v[i]; \ +} + +#define LOADS(VTYPE, STYPE) \ +template \ +static FORCEINLINE VTYPE __load(const VTYPE *p) { \ + STYPE *ptr = (STYPE *)p; \ + VTYPE ret; \ + for (int i = 0; i < 8; ++i) \ + ret[i] = ptr[i]; \ + return ret; \ +} \ + +#define STORES(VTYPE, STYPE) \ +template \ +static FORCEINLINE void __store(VTYPE *p, VTYPE v) { \ + STYPE *ptr = (STYPE *)p; \ + for (int i = 0; i < 8; ++i) \ + ptr[i] = v[i]; \ +} + +#define REDUCE_ADD(TYPE, VTYPE, NAME) \ +static FORCEINLINE TYPE NAME(VTYPE v) { \ + TYPE ret = v[0]; \ + for (int i = 1; i < 8; ++i) \ + ret = ret + v[i]; \ + return ret; \ +} + +#define REDUCE_MINMAX(TYPE, VTYPE, NAME, OP) \ +static FORCEINLINE TYPE NAME(VTYPE v) { \ + TYPE ret = v[0]; \ + for (int i = 1; i < 8; ++i) \ + ret = (ret OP (TYPE)v[i]) ? 
ret : (TYPE)v[i]; \
+    return ret; \
+}
+
+#define SELECT(TYPE) \
+static FORCEINLINE TYPE __select(__vec8_i1 mask, TYPE a, TYPE b) { \
+    TYPE ret; \
+    for (int i = 0; i < 8; ++i) \
+        ret[i] = (mask.v & (1<<i)) ? a[i] : b[i]; \
+    return ret; \
+} \
+static FORCEINLINE TYPE __select(bool cond, TYPE a, TYPE b) { \
+    return cond ? a : b; \
+}
+
+#define SHIFT_UNIFORM(TYPE, CAST, NAME, OP) \
+static FORCEINLINE TYPE NAME(TYPE a, int32_t b) { \
+    TYPE ret; \
+    for (int i = 0; i < 8; ++i) \
+        ret[i] = (CAST)(a[i]) OP b; \
+    return ret; \
+}
+
+#define SMEAR(VTYPE, NAME, STYPE) \
+template <class RetVecType> VTYPE __smear_##NAME(STYPE); \
+template <> FORCEINLINE VTYPE __smear_##NAME(STYPE v) { \
+    VTYPE ret; \
+    for (int i = 0; i < 8; ++i) \
+        ret[i] = v; \
+    return ret; \
+}
+
+#define SETZERO(VTYPE, NAME) \
+template <class RetVecType> VTYPE __setzero_##NAME(); \
+template <> FORCEINLINE VTYPE __setzero_##NAME() { \
+    VTYPE ret; \
+    for (int i = 0; i < 8; ++i) \
+        ret[i] = 0; \
+    return ret; \
+}
+
+#define UNDEF(VTYPE, NAME) \
+template <class RetVecType> VTYPE __undef_##NAME(); \
+template <> FORCEINLINE VTYPE __undef_##NAME() { \
+    return VTYPE(); \
+}
+
+#define BROADCAST(VTYPE, NAME, STYPE) \
+static FORCEINLINE VTYPE __broadcast_##NAME(VTYPE v, int index) { \
+    VTYPE ret; \
+    for (int i = 0; i < 8; ++i) \
+        ret[i] = v[index & 0x7]; \
+    return ret; \
+} \
+
+#define ROTATE(VTYPE, NAME, STYPE) \
+static FORCEINLINE VTYPE __rotate_##NAME(VTYPE v, int index) { \
+    VTYPE ret; \
+    for (int i = 0; i < 8; ++i) \
+        ret[i] = v[(i+index) & 0x7]; \
+    return ret; \
+} \
+
+#define SHUFFLES(VTYPE, NAME, STYPE) \
+static FORCEINLINE VTYPE __shuffle_##NAME(VTYPE v, __vec8_i32 index) { \
+    VTYPE ret; \
+    for (int i = 0; i < 8; ++i) \
+        ret[i] = v[__extract_element(index, i) & 0x7]; \
+    return ret; \
+} \
+static FORCEINLINE VTYPE __shuffle2_##NAME(VTYPE v0, VTYPE v1, __vec8_i32 index) { \
+    VTYPE ret; \
+    for (int i = 0; i < 8; ++i) { \
+        int ii = __extract_element(index, i) & 0xf; \
+        ret[i] = (ii < 8) ? v0[ii] : v1[ii-8]; \
+    } \
+    return ret; \
+}
+
+#define SHUFFLE2(VTYPE, NAME, STYPE) \
+static FORCEINLINE VTYPE __shuffle2_##NAME(VTYPE v0, VTYPE v1, __vec8_i32 index) { \
+    VTYPE ret; \
+    for (int i = 0; i < 8; ++i) { \
+        int ii = __extract_element(index, i) & 0xf; \
+        ret[i] = (ii < 8) ? v0[ii] : v1[ii-8]; \
+    } \
+    return ret; \
+}
+
+///////////////////////////////////////////////////////////////////////////
+
+INSERT_EXTRACT(__vec1_i8, int8_t)
+INSERT_EXTRACT(__vec1_i16, int16_t)
+INSERT_EXTRACT(__vec1_i32, int32_t)
+INSERT_EXTRACT(__vec1_i64, int64_t)
+INSERT_EXTRACT(__vec1_f, float)
+INSERT_EXTRACT(__vec1_d, double)
+
+///////////////////////////////////////////////////////////////////////////
+// mask ops
+
+static FORCEINLINE __vec8_i1 __movmsk(__vec8_i1 mask) {
+    return mask.v;
+}
+
+static FORCEINLINE bool __any(__vec8_i1 mask) {
+    return (mask.v!=0);
+}
+
+static FORCEINLINE bool __all(__vec8_i1 mask) {
+    return (mask.v==0xFF);
+}
+
+static FORCEINLINE bool __none(__vec8_i1 mask) {
+    return (mask.v==0);
+}
+
+static FORCEINLINE __vec8_i1 __equal_i1(__vec8_i1 a, __vec8_i1 b) {
+    return (a.v & b.v) | (~a.v & ~b.v);
+}
+
+static FORCEINLINE __vec8_i1 __and(__vec8_i1 a, __vec8_i1 b) {
+    return a.v & b.v;
+}
+
+static FORCEINLINE __vec8_i1 __xor(__vec8_i1 a, __vec8_i1 b) {
+    return a.v ^ b.v;
+}
+
+static FORCEINLINE __vec8_i1 __or(__vec8_i1 a, __vec8_i1 b) {
+    return a.v | b.v;
+}
+
+static FORCEINLINE __vec8_i1 __not(__vec8_i1 v) {
+    return ~v;
+}
+
+static FORCEINLINE __vec8_i1 __and_not1(__vec8_i1 a, __vec8_i1 b) {
+    return ~a.v & b.v;
+}
+
+static FORCEINLINE __vec8_i1 __and_not2(__vec8_i1 a, __vec8_i1 b) {
+    return a.v & ~b.v;
+}
+
+static FORCEINLINE __vec8_i1 __select(__vec8_i1 mask, __vec8_i1 a,
+                                      __vec8_i1 b) {
+    return (a.v & mask.v) | (b.v & ~mask.v);
+}
+
+static FORCEINLINE __vec8_i1 __select(bool cond, __vec8_i1 a, __vec8_i1 b) {
+    return cond ?
a : b; +} + +static FORCEINLINE bool __extract_element(__vec8_i1 vec, int index) { + return (vec.v & (1 << index)) ? true : false; +} + +static FORCEINLINE void __insert_element(__vec8_i1 *vec, int index, + bool val) { + if (val == false) + vec->v &= ~(1 << index); + else + vec->v |= (1 << index); +} + +template static FORCEINLINE __vec8_i1 __load(const __vec8_i1 *p) { + uint8_t *ptr = (uint8_t *)p; + __vec8_i1 r; + r.v = *ptr; + return r; +} + +template static FORCEINLINE void __store(__vec8_i1 *p, __vec8_i1 v) { + uint8_t *ptr = (uint8_t *)p; + *ptr = v.v; +} + +template RetVecType __smear_i1(int i); +template <> static FORCEINLINE __vec8_i1 __smear_i1<__vec8_i1>(int i) { + return i?0xFF:0x0; +} + +template RetVecType __setzero_i1(); +template <> static FORCEINLINE __vec8_i1 __setzero_i1<__vec8_i1>() { + return 0; +} + +template __vec8_i1 __undef_i1(); +template <> FORCEINLINE __vec8_i1 __undef_i1<__vec8_i1>() { + return __vec8_i1(); +} + + +/////////////////////////////////////////////////////////////////////////// +// int8 + +BINARY_OP(__vec8_i8, __add, +) +BINARY_OP(__vec8_i8, __sub, -) +BINARY_OP(__vec8_i8, __mul, *) + +BINARY_OP(__vec8_i8, __or, |) +BINARY_OP(__vec8_i8, __and, &) +BINARY_OP(__vec8_i8, __xor, ^) +BINARY_OP(__vec8_i8, __shl, <<) + +BINARY_OP_CAST(__vec8_i8, uint8_t, __udiv, /) +BINARY_OP_CAST(__vec8_i8, int8_t, __sdiv, /) + +BINARY_OP_CAST(__vec8_i8, uint8_t, __urem, %) +BINARY_OP_CAST(__vec8_i8, int8_t, __srem, %) +BINARY_OP_CAST(__vec8_i8, uint8_t, __lshr, >>) +BINARY_OP_CAST(__vec8_i8, int8_t, __ashr, >>) + +SHIFT_UNIFORM(__vec8_i8, uint8_t, __lshr, >>) +SHIFT_UNIFORM(__vec8_i8, int8_t, __ashr, >>) +SHIFT_UNIFORM(__vec8_i8, int8_t, __shl, <<) + +CMP_OP(__vec8_i8, i8, int8_t, __equal, ==) +CMP_OP(__vec8_i8, i8, int8_t, __not_equal, !=) +CMP_OP(__vec8_i8, i8, uint8_t, __unsigned_less_equal, <=) +CMP_OP(__vec8_i8, i8, int8_t, __signed_less_equal, <=) +CMP_OP(__vec8_i8, i8, uint8_t, __unsigned_greater_equal, >=) +CMP_OP(__vec8_i8, i8, int8_t, __signed_greater_equal, >=) +CMP_OP(__vec8_i8, i8, uint8_t, __unsigned_less_than, <) +CMP_OP(__vec8_i8, i8, int8_t, __signed_less_than, <) +CMP_OP(__vec8_i8, i8, uint8_t, __unsigned_greater_than, >) +CMP_OP(__vec8_i8, i8, int8_t, __signed_greater_than, >) + +SELECT(__vec8_i8) +INSERT_EXTRACT(__vec8_i8, int8_t) +SMEAR(__vec8_i8, i8, int8_t) +SETZERO(__vec8_i8, i8) +UNDEF(__vec8_i8, i8) +BROADCAST(__vec8_i8, i8, int8_t) +ROTATE(__vec8_i8, i8, int8_t) +SHUFFLES(__vec8_i8, i8, int8_t) +LOAD_STORE(__vec8_i8, int8_t) + +/////////////////////////////////////////////////////////////////////////// +// int16 + +BINARY_OP(__vec8_i16, __add, +) +BINARY_OP(__vec8_i16, __sub, -) +BINARY_OP(__vec8_i16, __mul, *) + +BINARY_OP(__vec8_i16, __or, |) +BINARY_OP(__vec8_i16, __and, &) +BINARY_OP(__vec8_i16, __xor, ^) +BINARY_OP(__vec8_i16, __shl, <<) + +BINARY_OP_CAST(__vec8_i16, uint16_t, __udiv, /) +BINARY_OP_CAST(__vec8_i16, int16_t, __sdiv, /) + +BINARY_OP_CAST(__vec8_i16, uint16_t, __urem, %) +BINARY_OP_CAST(__vec8_i16, int16_t, __srem, %) +BINARY_OP_CAST(__vec8_i16, uint16_t, __lshr, >>) +BINARY_OP_CAST(__vec8_i16, int16_t, __ashr, >>) + +SHIFT_UNIFORM(__vec8_i16, uint16_t, __lshr, >>) +SHIFT_UNIFORM(__vec8_i16, int16_t, __ashr, >>) +SHIFT_UNIFORM(__vec8_i16, int16_t, __shl, <<) + +CMP_OP(__vec8_i16, i16, int16_t, __equal, ==) +CMP_OP(__vec8_i16, i16, int16_t, __not_equal, !=) +CMP_OP(__vec8_i16, i16, uint16_t, __unsigned_less_equal, <=) +CMP_OP(__vec8_i16, i16, int16_t, __signed_less_equal, <=) +CMP_OP(__vec8_i16, i16, uint16_t, 
__unsigned_greater_equal, >=) +CMP_OP(__vec8_i16, i16, int16_t, __signed_greater_equal, >=) +CMP_OP(__vec8_i16, i16, uint16_t, __unsigned_less_than, <) +CMP_OP(__vec8_i16, i16, int16_t, __signed_less_than, <) +CMP_OP(__vec8_i16, i16, uint16_t, __unsigned_greater_than, >) +CMP_OP(__vec8_i16, i16, int16_t, __signed_greater_than, >) + +SELECT(__vec8_i16) +INSERT_EXTRACT(__vec8_i16, int16_t) +SMEAR(__vec8_i16, i16, int16_t) +SETZERO(__vec8_i16, i16) +UNDEF(__vec8_i16, i16) +BROADCAST(__vec8_i16, i16, int16_t) +ROTATE(__vec8_i16, i16, int16_t) +SHUFFLES(__vec8_i16, i16, int16_t) +LOAD_STORE(__vec8_i16, int16_t) + +#if 0 /* evghenii::int32 */ +/////////////////////////////////////////////////////////////////////////// +// int32 + +BINARY_OP(__vec8_i32, __add, +) +BINARY_OP(__vec8_i32, __sub, -) +BINARY_OP(__vec8_i32, __mul, *) + +BINARY_OP(__vec8_i32, __or, |) +BINARY_OP(__vec8_i32, __and, &) +BINARY_OP(__vec8_i32, __xor, ^) +BINARY_OP(__vec8_i32, __shl, <<) + +BINARY_OP_CAST(__vec8_i32, uint32_t, __udiv, /) +BINARY_OP_CAST(__vec8_i32, int32_t, __sdiv, /) + +BINARY_OP_CAST(__vec8_i32, uint32_t, __urem, %) +BINARY_OP_CAST(__vec8_i32, int32_t, __srem, %) +BINARY_OP_CAST(__vec8_i32, uint32_t, __lshr, >>) +BINARY_OP_CAST(__vec8_i32, int32_t, __ashr, >>) + +SHIFT_UNIFORM(__vec8_i32, uint32_t, __lshr, >>) +SHIFT_UNIFORM(__vec8_i32, int32_t, __ashr, >>) +SHIFT_UNIFORM(__vec8_i32, int32_t, __shl, <<) + +CMP_OP(__vec8_i32, i32, int32_t, __equal, ==) +CMP_OP(__vec8_i32, i32, int32_t, __not_equal, !=) +CMP_OP(__vec8_i32, i32, uint32_t, __unsigned_less_equal, <=) +CMP_OP(__vec8_i32, i32, int32_t, __signed_less_equal, <=) +CMP_OP(__vec8_i32, i32, uint32_t, __unsigned_greater_equal, >=) +CMP_OP(__vec8_i32, i32, int32_t, __signed_greater_equal, >=) +CMP_OP(__vec8_i32, i32, uint32_t, __unsigned_less_than, <) +CMP_OP(__vec8_i32, i32, int32_t, __signed_less_than, <) +CMP_OP(__vec8_i32, i32, uint32_t, __unsigned_greater_than, >) +CMP_OP(__vec8_i32, i32, int32_t, __signed_greater_than, >) + +SELECT(__vec8_i32) +INSERT_EXTRACT(__vec8_i32, int32_t) +SMEAR(__vec8_i32, i32, int32_t) +SETZERO(__vec8_i32, i32) +UNDEF(__vec8_i32, i32) +BROADCAST(__vec8_i32, i32, int32_t) +ROTATE(__vec8_i32, i32, int32_t) +SHUFFLES(__vec8_i32, i32, int32_t) +LOAD_STORE(__vec8_i32, int32_t) + +#else /* evghenii::int32 */ +/////////////////////////////////////////////////////////////////////////// +// int32 +/////////////////////////////////////////////////////////////////////////// + +#define IZERO _mm512_setzero_epi32() +static FORCEINLINE __vec8_i32 __add(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_add_epi32(IZERO,0xFF, a, b); +} + +static FORCEINLINE __vec8_i32 __sub(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_sub_epi32(IZERO,0xFF, a, b); +} + +static FORCEINLINE __vec8_i32 __mul(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_mullo_epi32(IZERO,0xFF, a, b); +} + +static FORCEINLINE __vec8_i32 __udiv(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_div_epu32(IZERO,0xFF, a, b); +} + +static FORCEINLINE __vec8_i32 __sdiv(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_div_epi32(IZERO,0xFF, a, b); +} + +static FORCEINLINE __vec8_i32 __urem(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_rem_epu32(IZERO,0xFF, a, b); +} + +static FORCEINLINE __vec8_i32 __srem(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_rem_epi32(IZERO,0xFF, a, b); +} + +static FORCEINLINE __vec8_i32 __or(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_or_epi32(IZERO,0xFF, a, b); +} + +static FORCEINLINE __vec8_i32 __and(__vec8_i32 a, 
__vec8_i32 b) { + return _mm512_mask_and_epi32(IZERO,0xFF, a, b); +} + +static FORCEINLINE __vec8_i32 __xor(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_xor_epi32(IZERO,0xFF, a, b); +} + +static FORCEINLINE __vec8_i32 __shl(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_sllv_epi32(IZERO,0xFF, a, b); +} + +static FORCEINLINE __vec8_i32 __lshr(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_srlv_epi32(IZERO,0xFF, a, b); +} + +static FORCEINLINE __vec8_i32 __ashr(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_srav_epi32(IZERO,0xFF, a, b); +} + +static FORCEINLINE __vec8_i32 __shl(__vec8_i32 a, int32_t n) { + return _mm512_mask_slli_epi32(IZERO,0xFF, a, n); +} + +static FORCEINLINE __vec8_i32 __lshr(__vec8_i32 a, int32_t n) { + return _mm512_mask_srli_epi32(IZERO,0xFF, a, n); +} + +static FORCEINLINE __vec8_i32 __ashr(__vec8_i32 a, int32_t n) { + return _mm512_mask_srai_epi32(IZERO,0xFF, a, n); +} + +static FORCEINLINE __vec8_i1 __equal_i32(const __vec8_i32 &a, const __vec8_i32 &b) { + return _mm512_mask_cmpeq_epi32_mask(0xFF,a, b); +} + +static FORCEINLINE __vec8_i1 __equal_i32_and_mask(const __vec8_i32 &a, const __vec8_i32 &b, + __vec8_i1 m) { + return _mm512_mask_cmpeq_epi32_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __not_equal_i32(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_cmpneq_epi32_mask(0xFF,a, b); +} + +static FORCEINLINE __vec8_i1 __not_equal_i32_and_mask(__vec8_i32 a, __vec8_i32 b, + __vec8_i1 m) { + return _mm512_mask_cmpneq_epi32_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __unsigned_less_equal_i32(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_cmple_epu32_mask(0xFF,a, b); +} + +static FORCEINLINE __vec8_i1 __unsigned_less_equal_i32_and_mask(__vec8_i32 a, __vec8_i32 b, + __vec8_i1 m) { + return _mm512_mask_cmple_epu32_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __signed_less_equal_i32(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_cmple_epi32_mask(0xFF,a, b); +} + +static FORCEINLINE __vec8_i1 __signed_less_equal_i32_and_mask(__vec8_i32 a, __vec8_i32 b, + __vec8_i1 m) { + return _mm512_mask_cmple_epi32_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __unsigned_greater_equal_i32(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_cmpge_epu32_mask(0xFF,a, b); +} + +static FORCEINLINE __vec8_i1 __unsigned_greater_equal_i32_and_mask(__vec8_i32 a, __vec8_i32 b, + __vec8_i1 m) { + return _mm512_mask_cmpge_epu32_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __signed_greater_equal_i32(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_cmpge_epi32_mask(0xFF,a, b); +} + +static FORCEINLINE __vec8_i1 __signed_greater_equal_i32_and_mask(__vec8_i32 a, __vec8_i32 b, + __vec8_i1 m) { + return _mm512_mask_cmpge_epi32_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __unsigned_less_than_i32(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_cmplt_epu32_mask(0xFF,a, b); +} + +static FORCEINLINE __vec8_i1 __unsigned_less_than_i32_and_mask(__vec8_i32 a, __vec8_i32 b, + __vec8_i1 m) { + return _mm512_mask_cmplt_epu32_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __signed_less_than_i32(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_cmplt_epi32_mask(0xFF,a, b); +} + +static FORCEINLINE __vec8_i1 __signed_less_than_i32_and_mask(__vec8_i32 a, __vec8_i32 b, + __vec8_i1 m) { + return _mm512_mask_cmplt_epi32_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __unsigned_greater_than_i32(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_cmpgt_epu32_mask(0xFF,a, b); +} + +static FORCEINLINE __vec8_i1 
__unsigned_greater_than_i32_and_mask(__vec8_i32 a, __vec8_i32 b,
+ __vec8_i1 m) {
+ return _mm512_mask_cmpgt_epu32_mask(m, a, b);
+}
+
+static FORCEINLINE __vec8_i1 __signed_greater_than_i32(__vec8_i32 a, __vec8_i32 b) {
+ return _mm512_mask_cmpgt_epi32_mask(0xFF,a, b);
+}
+
+static FORCEINLINE __vec8_i1 __signed_greater_than_i32_and_mask(__vec8_i32 a, __vec8_i32 b,
+ __vec8_i1 m) {
+ return _mm512_mask_cmpgt_epi32_mask(m, a, b);
+}
+
+static FORCEINLINE __vec8_i32 __select(__vec8_i1 mask,
+ __vec8_i32 a, __vec8_i32 b) {
+ return _mm512_mask_mov_epi32(b, mask, a);
+}
+
+static FORCEINLINE __vec8_i32 __select(bool cond, __vec8_i32 a, __vec8_i32 b) {
+ return cond ? a : b;
+}
+
+static FORCEINLINE int32_t __extract_element(__vec8_i32 v, int index) { //uint32_t index) {
+ return ((int32_t *)&v)[index];
+}
+
+static FORCEINLINE void __insert_element(__vec8_i32 *v, uint32_t index, int32_t val) {
+ ((int32_t *)v)[index] = val;
+}
+
+template <class RetVecType> RetVecType __smear_i32(int32_t i);
+template <> static FORCEINLINE __vec8_i32 __smear_i32<__vec8_i32>(int32_t i) {
+ return _mm512_set_16to16_epi32(0,0,0,0,0,0,0,0, i,i,i,i,i,i,i,i);
+}
+
+static const __vec8_i32 __ispc_one = __smear_i32<__vec8_i32>(1);
+static const __vec8_i32 __ispc_thirty_two = __smear_i32<__vec8_i32>(32);
+static const __vec8_i32 __ispc_ffffffff = __smear_i32<__vec8_i32>(-1);
+static const __vec8_i32 __ispc_stride1(0, 1, 2, 3, 4, 5, 6, 7);
+
+template <class RetVecType> RetVecType __setzero_i32();
+template <> static FORCEINLINE __vec8_i32 __setzero_i32<__vec8_i32>() {
+ return _mm512_setzero_epi32();
+}
+
+template <class RetVecType> RetVecType __undef_i32();
+template <> static FORCEINLINE __vec8_i32 __undef_i32<__vec8_i32>() {
+ return __vec8_i32();
+}
+
+static FORCEINLINE __vec8_i32 __broadcast_i32(__vec8_i32 v, int index) {
+ int32_t val = __extract_element(v, index & 0xf);
+ return _mm512_set1_epi32(val);
+}
+
+#if 0 /* evghenii::doesn't work */
+static FORCEINLINE __vec8_i32 __rotate_i32(__vec8_i32 v, int index) {
+ __vec8_i32 idx = __smear_i32<__vec8_i32>(index);
+ __vec8_i32 shuffle = _mm512_and_epi32(_mm512_add_epi32(__ispc_stride1, idx), __smear_i32<__vec8_i32>(0x7));
+ return _mm512_mask_permutevar_epi32(v, 0xffff, shuffle, v);
+}
+#else
+ROTATE(__vec8_i32, i32, int32_t)
+#endif
+
+static FORCEINLINE __vec8_i32 __shuffle_i32(__vec8_i32 v, __vec8_i32 index) {
+ return _mm512_mask_permutevar_epi32(v, 0xffff, index, v);
+}
+SHUFFLE2(__vec8_i32, i32, int32_t) /* evghenii::to implement */
+
+template <int ALIGN> static FORCEINLINE __vec8_i32 __load(const __vec8_i32 *p) {
+ __vec8_i32 v;
+ v = _mm512_extloadunpacklo_epi32(v, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
+ v = _mm512_extloadunpackhi_epi32(v, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
+ return __select(0xFF,v,IZERO);
+}
+
+
+template <int ALIGN> static FORCEINLINE void __store(__vec8_i32 *p, __vec8_i32 v) {
+ _mm512_mask_extpackstorelo_epi32( p, 0xFF, v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
+ _mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, 0xFF, v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
+}
+
+#if 0
+template <> static FORCEINLINE __vec8_i32 __load<64>(const __vec8_i32 *p) {
+ return _mm512_load_epi32(p);
+}
+template <> static FORCEINLINE void __store<64>(__vec8_i32 *p, __vec8_i32 v) {
+ _mm512_store_epi32(p, v);
+}
+#endif
+#endif /* evghenii::int32 */
+
+///////////////////////////////////////////////////////////////////////////
+// int64
+
+BINARY_OP(__vec8_i64, __add, +)
+BINARY_OP(__vec8_i64, __sub, -)
+BINARY_OP(__vec8_i64, __mul, *)
+
+BINARY_OP(__vec8_i64, __or, |)
+BINARY_OP(__vec8_i64, __and, &)
+BINARY_OP(__vec8_i64, __xor, ^) +BINARY_OP(__vec8_i64, __shl, <<) + +BINARY_OP_CAST(__vec8_i64, uint64_t, __udiv, /) +BINARY_OP_CAST(__vec8_i64, int64_t, __sdiv, /) + +BINARY_OP_CAST(__vec8_i64, uint64_t, __urem, %) +BINARY_OP_CAST(__vec8_i64, int64_t, __srem, %) +BINARY_OP_CAST(__vec8_i64, uint64_t, __lshr, >>) +BINARY_OP_CAST(__vec8_i64, int64_t, __ashr, >>) + +SHIFT_UNIFORM(__vec8_i64, uint64_t, __lshr, >>) +SHIFT_UNIFORM(__vec8_i64, int64_t, __ashr, >>) +SHIFT_UNIFORM(__vec8_i64, int64_t, __shl, <<) + +CMP_OP(__vec8_i64, i64, int64_t, __equal, ==) +CMP_OP(__vec8_i64, i64, int64_t, __not_equal, !=) +CMP_OP(__vec8_i64, i64, uint64_t, __unsigned_less_equal, <=) +CMP_OP(__vec8_i64, i64, int64_t, __signed_less_equal, <=) +CMP_OP(__vec8_i64, i64, uint64_t, __unsigned_greater_equal, >=) +CMP_OP(__vec8_i64, i64, int64_t, __signed_greater_equal, >=) +CMP_OP(__vec8_i64, i64, uint64_t, __unsigned_less_than, <) +CMP_OP(__vec8_i64, i64, int64_t, __signed_less_than, <) +CMP_OP(__vec8_i64, i64, uint64_t, __unsigned_greater_than, >) +CMP_OP(__vec8_i64, i64, int64_t, __signed_greater_than, >) + +SELECT(__vec8_i64) +INSERT_EXTRACT(__vec8_i64, int64_t) +SMEAR(__vec8_i64, i64, int64_t) +SETZERO(__vec8_i64, i64) +UNDEF(__vec8_i64, i64) +BROADCAST(__vec8_i64, i64, int64_t) +ROTATE(__vec8_i64, i64, int64_t) +SHUFFLES(__vec8_i64, i64, int64_t) +LOAD_STORE(__vec8_i64, int64_t) + + +#if 0 /* evghenii::float */ +/////////////////////////////////////////////////////////////////////////// +// float + +BINARY_OP(__vec8_f, __add, +) +BINARY_OP(__vec8_f, __sub, -) +BINARY_OP(__vec8_f, __mul, *) +BINARY_OP(__vec8_f, __div, /) + +CMP_OP(__vec8_f, float, float, __equal, ==) +CMP_OP(__vec8_f, float, float, __not_equal, !=) +CMP_OP(__vec8_f, float, float, __less_than, <) +CMP_OP(__vec8_f, float, float, __less_equal, <=) +CMP_OP(__vec8_f, float, float, __greater_than, >) +CMP_OP(__vec8_f, float, float, __greater_equal, >=) + +static FORCEINLINE __vec8_i1 __ordered_float(__vec8_f a, __vec8_f b) { + __vec8_i1 ret; + ret.v = 0; + for (int i = 0; i < 8; ++i) + ret.v |= ((a[i] == a[i]) && (b[i] == b[i])) ? (1 << i) : 0; + return ret; +} + +static FORCEINLINE __vec8_i1 __unordered_float(__vec8_f a, __vec8_f b) { + __vec8_i1 ret; + ret.v = 0; + for (int i = 0; i < 8; ++i) + ret.v |= ((a[i] != a[i]) || (b[i] != b[i])) ? 
(1 << i) : 0; + return ret; +} + +#if 0 + case Instruction::FRem: intrinsic = "__frem"; break; +#endif + +SELECT(__vec8_f) +INSERT_EXTRACT(__vec8_f, float) +SMEAR(__vec8_f, float, float) +SETZERO(__vec8_f, float) +UNDEF(__vec8_f, float) +BROADCAST(__vec8_f, float, float) +ROTATE(__vec8_f, float, float) +SHUFFLES(__vec8_f, float, float) +LOAD_STORE(__vec8_f, float) +#else /* evghenii::float */ + +/////////////////////////////////////////////////////////////////////////// +// float +/////////////////////////////////////////////////////////////////////////// + +#define FZERO _mm512_setzero_ps() +static FORCEINLINE __vec8_f __add(__vec8_f a, __vec8_f b) { + return _mm512_mask_add_ps(FZERO, 0xFF, a, b); +} + +static FORCEINLINE __vec8_f __sub(__vec8_f a, __vec8_f b) { + return _mm512_mask_sub_ps(FZERO, 0xFF, a, b); +} + +static FORCEINLINE __vec8_f __mul(__vec8_f a, __vec8_f b) { + return _mm512_mask_mul_ps(FZERO, 0xFF, a, b); +} + +static FORCEINLINE __vec8_f __div(__vec8_f a, __vec8_f b) { + return _mm512_mask_div_ps(FZERO, 0xFF, a, b); +} + +static FORCEINLINE __vec8_i1 __equal_float(__vec8_f a, __vec8_f b) { + return _mm512_mask_cmpeq_ps_mask(0xFF, a, b); +} + +static FORCEINLINE __vec8_i1 __equal_float_and_mask(__vec8_f a, __vec8_f b, + __vec8_i1 m) { + return _mm512_mask_cmpeq_ps_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __not_equal_float(__vec8_f a, __vec8_f b) { + return _mm512_mask_cmpneq_ps_mask(0xFF, a, b); +} + +static FORCEINLINE __vec8_i1 __not_equal_float_and_mask(__vec8_f a, __vec8_f b, + __vec8_i1 m) { + return _mm512_mask_cmpneq_ps_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __less_than_float(__vec8_f a, __vec8_f b) { + return _mm512_mask_cmplt_ps_mask(0xFF, a, b); +} + +static FORCEINLINE __vec8_i1 __less_than_float_and_mask(__vec8_f a, __vec8_f b, + __vec8_i1 m) { + return _mm512_mask_cmplt_ps_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __less_equal_float(__vec8_f a, __vec8_f b) { + return _mm512_mask_cmple_ps_mask(0xFF, a, b); +} + +static FORCEINLINE __vec8_i1 __less_equal_float_and_mask(__vec8_f a, __vec8_f b, + __vec8_i1 m) { + return _mm512_mask_cmple_ps_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __greater_than_float(__vec8_f a, __vec8_f b) { + return _mm512_mask_cmp_ps_mask(0xFF, a, b,_CMP_GT_OS); +} + +static FORCEINLINE __vec8_i1 __greater_than_float_and_mask(__vec8_f a, __vec8_f b, + __vec8_i1 m) { + return _mm512_mask_cmp_ps_mask(m,a, b,_CMP_GT_OS); +} + +static FORCEINLINE __vec8_i1 __greater_equal_float(__vec8_f a, __vec8_f b) { + return _mm512_mask_cmp_ps_mask(0xFF, a, b,_CMP_GE_OS); +} + +static FORCEINLINE __vec8_i1 __greater_equal_float_and_mask(__vec8_f a, __vec8_f b, + __vec8_i1 m) { + return _mm512_mask_cmp_ps_mask(m,a, b,_CMP_GE_OS); +} + +static FORCEINLINE __vec8_i1 __ordered_float(__vec8_f a, __vec8_f b) { + return _mm512_mask_cmpord_ps_mask(0xFF, a, b); +} + +static FORCEINLINE __vec8_i1 __unordered_float(__vec8_f a, __vec8_f b) { + return _mm512_mask_cmpunord_ps_mask(0xFF,a, b); +} + +static FORCEINLINE __vec8_f __select(__vec8_i1 mask, __vec8_f a, __vec8_f b) { + return _mm512_mask_mov_ps(b, mask & 0xFF, a); +} + +static FORCEINLINE __vec8_f __select(bool cond, __vec8_f a, __vec8_f b) { + return cond ? 
a : b;
+}
+
+static FORCEINLINE float __extract_element(__vec8_f v, uint32_t index) {
+ return v[index];
+ // return ((float *)&v)[index];
+}
+
+static FORCEINLINE void __insert_element(__vec8_f *v, uint32_t index, float val) {
+ (*v)[index] = val;
+// ((float *)v)[index] = val;
+}
+
+template <class RetVecType> RetVecType __smear_float(float f);
+template <> static FORCEINLINE __vec8_f __smear_float<__vec8_f>(float f) {
+ return _mm512_set_16to16_ps(0,0,0,0,0,0,0,0, f,f,f,f,f,f,f,f);
+}
+
+template <class RetVecType> RetVecType __setzero_float();
+template <> static FORCEINLINE __vec8_f __setzero_float<__vec8_f>() {
+ return _mm512_setzero_ps();
+}
+
+template <class RetVecType> RetVecType __undef_float();
+template <> static FORCEINLINE __vec8_f __undef_float<__vec8_f>() {
+ return __vec8_f();
+}
+
+static FORCEINLINE __vec8_f __broadcast_float(__vec8_f v, int index) {
+ float val = __extract_element(v, index & 0x7);
+ return _mm512_set_16to16_ps(0,0,0,0,0,0,0,0, val,val,val,val,val,val,val,val);
+}
+
+#if 1
+static FORCEINLINE __vec8_f __shuffle_float(__vec8_f v, __vec8_i32 index) {
+ return _mm512_castsi512_ps(_mm512_mask_permutevar_epi32(_mm512_castps_si512(v), 0xffff, index, _mm512_castps_si512(v)));
+}
+#endif
+ROTATE(__vec8_f, float, float)
+SHUFFLE2(__vec8_f, float, float)
+
+#if 0
+LOADS(__vec8_f, float)
+#else
+template <int ALIGN> static FORCEINLINE __vec8_f __load(const __vec8_f *p) {
+ __vec8_f v;
+ v = _mm512_extloadunpacklo_ps(v, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE);
+ v = _mm512_extloadunpackhi_ps(v, (uint8_t*)p+64, _MM_UPCONV_PS_NONE, _MM_HINT_NONE);
+ return __select(0xFF,v,FZERO);
+}
+#endif
+
+#if 0
+STORES(__vec8_f, float)
+#else
+template <int ALIGN> static FORCEINLINE void __store(__vec8_f *p, __vec8_f v)
+{
+ _mm512_mask_extpackstorelo_ps( p, 0xFF, v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE);
+ _mm512_mask_extpackstorehi_ps((uint8_t*)p+64, 0xFF, v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE);
+}
+#endif
+
+#endif /* evghenii::float */
+
+static FORCEINLINE float __exp_uniform_float(float v) { return expf(v);}
+static FORCEINLINE __vec8_f __exp_varying_float(__vec8_f v) { return _mm512_mask_exp_ps(FZERO, 0xFF, v); }
+
+
+static FORCEINLINE float __log_uniform_float(float v) { return logf(v);}
+static FORCEINLINE __vec8_f __log_varying_float(__vec8_f v) { return _mm512_mask_log_ps(FZERO, 0xFF, v); }
+
+static FORCEINLINE float __pow_uniform_float(float a, float b) { return powf(a, b);}
+static FORCEINLINE __vec8_f __pow_varying_float(__vec8_f a, __vec8_f b) { return _mm512_mask_pow_ps(FZERO, 0xFF, a,b); }
+
+
+static FORCEINLINE int __intbits(float v) {
+ union {
+ float f;
+ int i;
+ } u;
+ u.f = v;
+ return u.i;
+}
+
+static FORCEINLINE float __floatbits(int v) {
+ union {
+ float f;
+ int i;
+ } u;
+ u.i = v;
+ return u.f;
+}
+
+static FORCEINLINE float __half_to_float_uniform(int16_t h) {
+ static const uint32_t shifted_exp = 0x7c00 << 13; // exponent mask after shift
+
+ int32_t o = ((int32_t)(h & 0x7fff)) << 13; // exponent/mantissa bits
+ uint32_t exp = shifted_exp & o; // just the exponent
+ o += (127 - 15) << 23; // exponent adjust
+
+ // handle exponent special cases
+ if (exp == shifted_exp) // Inf/NaN?
+ o += (128 - 16) << 23; // extra exp adjust
+ else if (exp == 0) { // Zero/Denormal?
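+ // (explanatory note, not in the original source) Denormal halves have no
+ // implicit leading 1, so the shifted bits cannot be used directly: the two
+ // lines below renormalize by bumping the exponent field by one and then
+ // subtracting 2^-14 (the float whose bit pattern is 113 << 23).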
+ o += 1 << 23; // extra exp adjust + o = __intbits(__floatbits(o) - __floatbits(113 << 23)); // renormalize + } + + o |= ((int32_t)(h & 0x8000)) << 16; // sign bit + return __floatbits(o); +} + + +static FORCEINLINE __vec8_f __half_to_float_varying(__vec8_i16 v) { + __vec8_f ret; + for (int i = 0; i < 8; ++i) + ret[i] = __half_to_float_uniform(v[i]); + return ret; +} + + +static FORCEINLINE int16_t __float_to_half_uniform(float f) { + uint32_t sign_mask = 0x80000000u; + int32_t o; + + int32_t fint = __intbits(f); + int32_t sign = fint & sign_mask; + fint ^= sign; + + int32_t f32infty = 255 << 23; + o = (fint > f32infty) ? 0x7e00 : 0x7c00; + + // (De)normalized number or zero + // update fint unconditionally to save the blending; we don't need it + // anymore for the Inf/NaN case anyway. + const uint32_t round_mask = ~0xfffu; + const int32_t magic = 15 << 23; + const int32_t f16infty = 31 << 23; + + int32_t fint2 = __intbits(__floatbits(fint & round_mask) * __floatbits(magic)) - round_mask; + fint2 = (fint2 > f16infty) ? f16infty : fint2; // Clamp to signed infinity if overflowed + + if (fint < f32infty) + o = fint2 >> 13; // Take the bits! + + return (o | (sign >> 16)); +} + + +static FORCEINLINE __vec8_i16 __float_to_half_varying(__vec8_f v) { + __vec8_i16 ret; + for (int i = 0; i < 8; ++i) + ret[i] = __float_to_half_uniform(v[i]); + return ret; +} + + +#if 0 /* evghenii::double */ +/////////////////////////////////////////////////////////////////////////// +// double + +BINARY_OP(__vec8_d, __add, +) +BINARY_OP(__vec8_d, __sub, -) +BINARY_OP(__vec8_d, __mul, *) +BINARY_OP(__vec8_d, __div, /) + +CMP_OP(__vec8_d, double, double, __equal, ==) +CMP_OP(__vec8_d, double, double, __not_equal, !=) +CMP_OP(__vec8_d, double, double, __less_than, <) +CMP_OP(__vec8_d, double, double, __less_equal, <=) +CMP_OP(__vec8_d, double, double, __greater_than, >) +CMP_OP(__vec8_d, double, double, __greater_equal, >=) + +static FORCEINLINE __vec8_i1 __ordered_double(__vec8_d a, __vec8_d b) { + __vec8_i1 ret; + ret.v = 0; + for (int i = 0; i < 8; ++i) + ret.v |= ((a[i] == a[i]) && (b[i] == b[i])) ? (1 << i) : 0; + return ret; +} + +static FORCEINLINE __vec8_i1 __unordered_double(__vec8_d a, __vec8_d b) { + __vec8_i1 ret; + ret.v = 0; + for (int i = 0; i < 8; ++i) + ret.v |= ((a[i] != a[i]) || (b[i] != b[i])) ? 
(1 << i) : 0; + return ret; +} + +#if 0 + case Instruction::FRem: intrinsic = "__frem"; break; +#endif + +SELECT(__vec8_d) +INSERT_EXTRACT(__vec8_d, double) +SMEAR(__vec8_d, double, double) +SETZERO(__vec8_d, double) +UNDEF(__vec8_d, double) +BROADCAST(__vec8_d, double, double) +ROTATE(__vec8_d, double, double) +SHUFFLES(__vec8_d, double, double) +LOAD_STORE(__vec8_d, double) +#else /* evghenii::double */ +/////////////////////////////////////////////////////////////////////////// +// double +/////////////////////////////////////////////////////////////////////////// + +static FORCEINLINE __vec8_d __add(__vec8_d a, __vec8_d b) { + return _mm512_add_pd(a, b); +} +static FORCEINLINE __vec8_d __sub(__vec8_d a, __vec8_d b) { + return _mm512_sub_pd(a, b); +} +static FORCEINLINE __vec8_d __mul(__vec8_d a, __vec8_d b) { + return _mm512_mul_pd(a, b); +} + +static FORCEINLINE __vec8_d __div(__vec8_d a, __vec8_d b) { + return _mm512_div_pd(a, b); +} + +static FORCEINLINE __vec8_i1 __equal_double(__vec8_d a, __vec8_d b) { + return _mm512_cmpeq_pd_mask(a, b); +} + +static FORCEINLINE __vec8_i1 __equal_double_and_mask(__vec8_d a, __vec8_d b, + __vec8_i1 m) { + return _mm512_mask_cmpeq_pd_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __not_equal_double(__vec8_d a, __vec8_d b) { + return _mm512_cmpneq_pd_mask(a, b); +} + +static FORCEINLINE __vec8_i1 __not_equal_double_and_mask(__vec8_d a, __vec8_d b, + __vec8_i1 m) { + return _mm512_mask_cmpneq_pd_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __less_than_double(__vec8_d a, __vec8_d b) { + return _mm512_cmplt_pd_mask(a, b); +} + +static FORCEINLINE __vec8_i1 __less_than_double_and_mask(__vec8_d a, __vec8_d b, + __vec8_i1 m) { + return _mm512_mask_cmplt_pd_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __less_equal_double(__vec8_d a, __vec8_d b) { + return _mm512_cmple_pd_mask(a, b); +} + +static FORCEINLINE __vec8_i1 __less_equal_double_and_mask(__vec8_d a, __vec8_d b, + __vec8_i1 m) { + return _mm512_mask_cmple_pd_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __greater_than_double(__vec8_d a, __vec8_d b) { + return _mm512_cmpnle_pd_mask(a, b); +} + +static FORCEINLINE __vec8_i1 __greater_than_double_and_mask(__vec8_d a, __vec8_d b, + __vec8_i1 m) { + return _mm512_mask_cmpnle_pd_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __greater_equal_double(__vec8_d a, __vec8_d b) { + return _mm512_cmpnlt_pd_mask(a, b); +} + +static FORCEINLINE __vec8_i1 __greater_equal_double_and_mask(__vec8_d a, __vec8_d b, + __vec8_i1 m) { + return _mm512_mask_cmpnlt_pd_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __ordered_double(__vec8_d a, __vec8_d b) { + return _mm512_cmpord_pd_mask(a, b); +} + +static FORCEINLINE __vec8_i1 __unordered_double(__vec8_d a, __vec8_d b) { + return _mm512_cmpunord_pd_mask(a, b); +} + +static FORCEINLINE __vec8_d __select(__vec8_i1 mask, __vec8_d a, __vec8_d b) { + return _mm512_mask_mov_pd(b, mask, a); +} + + +static FORCEINLINE __vec8_d __select(bool cond, __vec8_d a, __vec8_d b) { + return cond ? 
a : b;
+}
+
+static FORCEINLINE double __extract_element(__vec8_d v, uint32_t index) {
+ return ((double *)&v)[index];
+}
+
+static FORCEINLINE void __insert_element(__vec8_d *v, uint32_t index, double val) {
+ ((double *)v)[index] = val;
+}
+
+template <class RetVecType> RetVecType __smear_double(double d);
+template <> static FORCEINLINE __vec8_d __smear_double<__vec8_d>(double d) { return _mm512_set1_pd(d); }
+
+template <class RetVecType> RetVecType __setzero_double();
+template <> static FORCEINLINE __vec8_d __setzero_double<__vec8_d>() { return _mm512_setzero_pd(); }
+
+template <class RetVecType> RetVecType __undef_double();
+template <> static FORCEINLINE __vec8_d __undef_double<__vec8_d>() { return __vec8_d();}
+
+static FORCEINLINE __vec8_d __broadcast_double(__vec8_d v, int index) {
+ double val = __extract_element(v, index & 0xf);
+ return _mm512_set1_pd(val);
+}
+
+ROTATE(__vec8_d, double, double)
+SHUFFLES(__vec8_d, double, double)
+
+template <int ALIGN> static FORCEINLINE __vec8_d __load(const __vec8_d *p) {
+ __vec8_d ret;
+ ret.v = _mm512_extloadunpacklo_pd(ret.v, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE);
+ ret.v = _mm512_extloadunpackhi_pd(ret.v, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE);
+ return ret;
+}
+
+template <int ALIGN> static FORCEINLINE void __store(__vec8_d *p, __vec8_d v) {
+ _mm512_extpackstorelo_pd(p, v.v, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE);
+ _mm512_extpackstorehi_pd((uint8_t*)p+64, v.v, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE);
+}
+
+
+#if 0
+template <> static FORCEINLINE __vec8_d __load<64>(const __vec8_d *p) {
+ return _mm512_load_pd(p);
+}
+template <> static FORCEINLINE __vec8_d __load<128>(const __vec8_d *p) {
+ return __load<64>(p);
+}
+template <> static FORCEINLINE void __store<64>(__vec8_d *p, __vec8_d v) {
+ _mm512_store_pd(p, v.v);
+}
+template <> static FORCEINLINE void __store<128>(__vec8_d *p, __vec8_d v) {
+ __store<64>(p, v);
+}
+#endif
+#endif /* evghenii::double */
+
+///////////////////////////////////////////////////////////////////////////
+// casts
+
+
+#define CAST(TO, STO, FROM, SFROM, FUNC) \
+static FORCEINLINE TO FUNC(TO, FROM val) { \
+ TO ret; \
+ for (int i = 0; i < 8; ++i) \
+ ret[i] = (STO)((SFROM)(val[i])); \
+ return ret; \
+}
+
+// sign extension conversions
+CAST(__vec8_i64, int64_t, __vec8_i32, int32_t, __cast_sext)
+CAST(__vec8_i64, int64_t, __vec8_i16, int16_t, __cast_sext)
+CAST(__vec8_i64, int64_t, __vec8_i8, int8_t, __cast_sext)
+CAST(__vec8_i32, int32_t, __vec8_i16, int16_t, __cast_sext)
+CAST(__vec8_i32, int32_t, __vec8_i8, int8_t, __cast_sext)
+CAST(__vec8_i16, int16_t, __vec8_i8, int8_t, __cast_sext)
+
+#define CAST_SEXT_I1(TYPE) \
+static FORCEINLINE TYPE __cast_sext(TYPE, __vec8_i1 v) { \
+ TYPE ret; \
+ for (int i = 0; i < 8; ++i) { \
+ ret[i] = 0; \
+ if (v.v & (1 << i)) \
+ ret[i] = ~ret[i]; \
+ } \
+ return ret; \
+}
+
+CAST_SEXT_I1(__vec8_i8)
+CAST_SEXT_I1(__vec8_i16)
+#if 0
+CAST_SEXT_I1(__vec8_i32)
+#else
+static FORCEINLINE __vec8_i32 __cast_sext(const __vec8_i32 &, const __vec8_i1 &val)
+{
+ __vec8_i32 ret = _mm512_setzero_epi32();
+ __vec8_i32 one = _mm512_set1_epi32(-1);
+ return _mm512_mask_mov_epi32(ret, 0xFF & val, one);
+}
+#endif
+CAST_SEXT_I1(__vec8_i64)
+
+// zero extension
+CAST(__vec8_i64, uint64_t, __vec8_i32, uint32_t, __cast_zext)
+CAST(__vec8_i64, uint64_t, __vec8_i16, uint16_t, __cast_zext)
+CAST(__vec8_i64, uint64_t, __vec8_i8, uint8_t, __cast_zext)
+CAST(__vec8_i32, uint32_t, __vec8_i16, uint16_t, __cast_zext)
+CAST(__vec8_i32, uint32_t, __vec8_i8, uint8_t, __cast_zext)
+CAST(__vec8_i16, uint16_t, __vec8_i8, uint8_t, __cast_zext)
+
+#define
CAST_ZEXT_I1(TYPE) \ +static FORCEINLINE TYPE __cast_zext(TYPE, __vec8_i1 v) { \ + TYPE ret; \ + for (int i = 0; i < 8; ++i) \ + ret[i] = (v.v & (1 << i)) ? 1 : 0; \ + return ret; \ +} + +CAST_ZEXT_I1(__vec8_i8) +CAST_ZEXT_I1(__vec8_i16) +#if 0 +CAST_ZEXT_I1(__vec8_i32) +#else +static FORCEINLINE __vec8_i32 __cast_zext(const __vec8_i32 &, const __vec8_i1 &val) +{ + __vec8_i32 ret = _mm512_setzero_epi32(); + __vec8_i32 one = _mm512_set1_epi32(1); + return _mm512_mask_mov_epi32(ret, 0xFF & val, one); +} +#endif +CAST_ZEXT_I1(__vec8_i64) + +// truncations +CAST(__vec8_i32, int32_t, __vec8_i64, int64_t, __cast_trunc) +CAST(__vec8_i16, int16_t, __vec8_i64, int64_t, __cast_trunc) +CAST(__vec8_i8, int8_t, __vec8_i64, int64_t, __cast_trunc) +CAST(__vec8_i16, int16_t, __vec8_i32, int32_t, __cast_trunc) +CAST(__vec8_i8, int8_t, __vec8_i32, int32_t, __cast_trunc) +CAST(__vec8_i8, int8_t, __vec8_i16, int16_t, __cast_trunc) + +// signed int to float/double +#if 0 +CAST(__vec8_f, float, __vec8_i8, int8_t, __cast_sitofp) +CAST(__vec8_f, float, __vec8_i16, int16_t, __cast_sitofp) +CAST(__vec8_f, float, __vec8_i32, int32_t, __cast_sitofp) +#else +static FORCEINLINE __vec8_f __cast_sitofp(__vec8_f, __vec8_i8 val) {return _mm512_mask_extload_ps(FZERO, 0xFF, &val, _MM_UPCONV_PS_SINT8, _MM_BROADCAST_16X16, _MM_HINT_NONE);} +static FORCEINLINE __vec8_f __cast_sitofp(__vec8_f, __vec8_i16 val) {return _mm512_mask_extload_ps(FZERO, 0xFF, &val, _MM_UPCONV_PS_SINT16, _MM_BROADCAST_16X16, _MM_HINT_NONE);} +static FORCEINLINE __vec8_f __cast_sitofp(__vec8_f, __vec8_i32 val) {return _mm512_mask_cvtfxpnt_round_adjustepi32_ps(FZERO, 0xFF, val, _MM_ROUND_MODE_NEAREST, _MM_EXPADJ_NONE);} +#endif +CAST(__vec8_f, float, __vec8_i64, int64_t, __cast_sitofp) +#if 0 +CAST(__vec8_d, double, __vec8_i8, int8_t, __cast_sitofp) +CAST(__vec8_d, double, __vec8_i16, int16_t, __cast_sitofp) +CAST(__vec8_d, double, __vec8_i32, int32_t, __cast_sitofp) +#else +static FORCEINLINE __vec8_d __cast_sitofp(__vec8_d, __vec8_i8 val) { + __vec8_i32 vi = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_SINT8, _MM_BROADCAST_16X16, _MM_HINT_NONE); + return _mm512_cvtepi32lo_pd(vi); +} + +static FORCEINLINE __vec8_d __cast_sitofp(__vec8_d, __vec8_i16 val) { + __vec8_i32 vi = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_SINT16, _MM_BROADCAST_16X16, _MM_HINT_NONE); + return _mm512_cvtepi32lo_pd(vi); +} + +static FORCEINLINE __vec8_d __cast_sitofp(__vec8_d, __vec8_i32 val) { + __vec8_d ret; + return _mm512_cvtepi32lo_pd(val); +} +#endif +CAST(__vec8_d, double, __vec8_i64, int64_t, __cast_sitofp) + +// unsigned int to float/double +#if 0 +CAST(__vec8_f, float, __vec8_i8, uint8_t, __cast_uitofp) +CAST(__vec8_f, float, __vec8_i16, uint16_t, __cast_uitofp) +CAST(__vec8_f, float, __vec8_i32, uint32_t, __cast_uitofp) +#else +static FORCEINLINE __vec8_f __cast_uitofp(__vec8_f, __vec8_i8 val) {return _mm512_mask_extload_ps(FZERO, 0xFF, &val, _MM_UPCONV_PS_UINT8, _MM_BROADCAST_16X16, _MM_HINT_NONE);} +static FORCEINLINE __vec8_f __cast_uitofp(__vec8_f, __vec8_i16 val) {return _mm512_mask_extload_ps(FZERO, 0xFF, &val, _MM_UPCONV_PS_UINT16, _MM_BROADCAST_16X16, _MM_HINT_NONE);} +static FORCEINLINE __vec8_f __cast_uitofp(__vec8_f, __vec8_i32 val) {return _mm512_mask_cvtfxpnt_round_adjustepu32_ps(FZERO, 0xFF, val, _MM_ROUND_MODE_NEAREST, _MM_EXPADJ_NONE);} +#endif +CAST(__vec8_f, float, __vec8_i64, uint64_t, __cast_uitofp) +#if 0 +CAST(__vec8_d, double, __vec8_i8, uint8_t, __cast_uitofp) +CAST(__vec8_d, double, __vec8_i16, uint16_t, __cast_uitofp) +CAST(__vec8_d, double, 
__vec8_i32, uint32_t, __cast_uitofp) +#else +static FORCEINLINE __vec8_d __cast_uitofp(__vec8_d, __vec8_i8 val) { + __vec8_i32 vi = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_UINT8, _MM_BROADCAST_16X16, _MM_HINT_NONE); + return _mm512_cvtepu32lo_pd(vi); +} + +static FORCEINLINE __vec8_d __cast_uitofp(__vec8_d, __vec8_i16 val) { + __vec8_i32 vi = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_UINT16, _MM_BROADCAST_16X16, _MM_HINT_NONE); + return _mm512_cvtepu32lo_pd(vi); +} + +static FORCEINLINE __vec8_d __cast_uitofp(__vec8_d, __vec8_i32 val) { + __vec8_d ret; + return _mm512_cvtepu32lo_pd(val); +} +#endif +CAST(__vec8_d, double, __vec8_i64, uint64_t, __cast_uitofp) + +#if 0 +static FORCEINLINE __vec8_f __cast_uitofp(__vec8_f, __vec8_i1 v) { + __vec8_f ret; + for (int i = 0; i < 8; ++i) + ret[i] = (v.v & (1 << i)) ? 1. : 0.; + return ret; +} +#else +static FORCEINLINE __vec8_f __cast_uitofp(__vec8_f, __vec8_i1 v) +{ + const __m512 ret = _mm512_setzero_ps(); + const __m512 one = _mm512_set1_ps(1.0); + return _mm512_mask_mov_ps(ret, v & 0xFF, one); +} +#endif + +// float/double to signed int +CAST(__vec8_i8, int8_t, __vec8_f, float, __cast_fptosi) +CAST(__vec8_i16, int16_t, __vec8_f, float, __cast_fptosi) +#if 0 +CAST(__vec8_i32, int32_t, __vec8_f, float, __cast_fptosi) +#else +static FORCEINLINE __vec8_i32 __cast_fptosi(__vec8_i32, __vec8_f val) { + return _mm512_mask_cvtfxpnt_round_adjustps_epi32(IZERO, 0xFF, val, _MM_ROUND_MODE_TOWARD_ZERO, _MM_EXPADJ_NONE); +} +#endif +CAST(__vec8_i64, int64_t, __vec8_f, float, __cast_fptosi) +CAST(__vec8_i8, int8_t, __vec8_d, double, __cast_fptosi) +CAST(__vec8_i16, int16_t, __vec8_d, double, __cast_fptosi) +#if 1 +CAST(__vec8_i32, int32_t, __vec8_d, double, __cast_fptosi) +#else +#endif +CAST(__vec8_i64, int64_t, __vec8_d, double, __cast_fptosi) + +// float/double to unsigned int +CAST(__vec8_i8, uint8_t, __vec8_f, float, __cast_fptoui) +CAST(__vec8_i16, uint16_t, __vec8_f, float, __cast_fptoui) +#if 0 +CAST(__vec8_i32, uint32_t, __vec8_f, float, __cast_fptoui) +#else +static FORCEINLINE __vec8_i32 __cast_fptoui(__vec8_i32, __vec8_f val) { + return _mm512_mask_cvtfxpnt_round_adjustps_epu32(IZERO, 0xFF, val, _MM_ROUND_MODE_TOWARD_ZERO, _MM_EXPADJ_NONE); +} +#endif +CAST(__vec8_i64, uint64_t, __vec8_f, float, __cast_fptoui) +CAST(__vec8_i8, uint8_t, __vec8_d, double, __cast_fptoui) +CAST(__vec8_i16, uint16_t, __vec8_d, double, __cast_fptoui) +#if 1 +CAST(__vec8_i32, uint32_t, __vec8_d, double, __cast_fptoui) +#else +#endif +CAST(__vec8_i64, uint64_t, __vec8_d, double, __cast_fptoui) + +// float/double conversions +#if 0 +CAST(__vec8_f, float, __vec8_d, double, __cast_fptrunc) +CAST(__vec8_d, double, __vec8_f, float, __cast_fpext) +#else +static FORCEINLINE __vec8_f __cast_fptrunc(__vec8_f, __vec8_d val) { + return _mm512_mask_cvtpd_pslo(FZERO, 0xFF, val); +} +static FORCEINLINE __vec8_d __cast_fpext(__vec8_d, __vec8_f val) { + return _mm512_cvtpslo_pd(val); +} +#endif + +typedef union { + int32_t i32; + float f; + int64_t i64; + double d; +} BitcastUnion; + +#define CAST_BITS(TO, TO_ELT, FROM, FROM_ELT) \ +static FORCEINLINE TO __cast_bits(TO, FROM val) { \ + TO r; \ + for (int i = 0; i < 8; ++i) { \ + BitcastUnion u; \ + u.FROM_ELT = val[i]; \ + r[i] = u.TO_ELT; \ + } \ + return r; \ +} + +#if 0 +CAST_BITS(__vec8_f, f, __vec8_i32, i32) +CAST_BITS(__vec8_i32, i32, __vec8_f, f) +#else +static FORCEINLINE __vec8_f __cast_bits(__vec8_f, __vec8_i32 val) { + return _mm512_castsi512_ps(val); +} +static FORCEINLINE __vec8_i32 __cast_bits(__vec8_i32, __vec8_f val) 
{
+ return _mm512_castps_si512(val);
+}
+#endif
+
+#if 0
+CAST_BITS(__vec8_d, d, __vec8_i64, i64)
+CAST_BITS(__vec8_i64, i64, __vec8_d, d)
+#else
+static FORCEINLINE __vec8_i64 __cast_bits(__vec8_i64, __vec8_d val) {
+ return *(__vec8_i64*)&val;
+}
+static FORCEINLINE __vec8_d __cast_bits(__vec8_d, __vec8_i64 val) {
+ return *(__vec8_d*)&val;
+}
+#endif
+
+#define CAST_BITS_SCALAR(TO, FROM) \
+static FORCEINLINE TO __cast_bits(TO, FROM v) { \
+ union { \
+ TO to; \
+ FROM from; \
+ } u; \
+ u.from = v; \
+ return u.to; \
+}
+
+CAST_BITS_SCALAR(uint32_t, float)
+CAST_BITS_SCALAR(int32_t, float)
+CAST_BITS_SCALAR(float, uint32_t)
+CAST_BITS_SCALAR(float, int32_t)
+CAST_BITS_SCALAR(uint64_t, double)
+CAST_BITS_SCALAR(int64_t, double)
+CAST_BITS_SCALAR(double, uint64_t)
+CAST_BITS_SCALAR(double, int64_t)
+
+///////////////////////////////////////////////////////////////////////////
+// various math functions
+
+static FORCEINLINE void __fastmath() {
+}
+
+static FORCEINLINE float __round_uniform_float(float v) {
+ return roundf(v);
+}
+
+static FORCEINLINE float __floor_uniform_float(float v) {
+ return floorf(v);
+}
+
+static FORCEINLINE float __ceil_uniform_float(float v) {
+ return ceilf(v);
+}
+
+static FORCEINLINE double __round_uniform_double(double v) {
+ return round(v);
+}
+
+static FORCEINLINE double __floor_uniform_double(double v) {
+ return floor(v);
+}
+
+static FORCEINLINE double __ceil_uniform_double(double v) {
+ return ceil(v);
+}
+
+#if 0
+UNARY_OP(__vec8_f, __round_varying_float, roundf)
+UNARY_OP(__vec8_f, __floor_varying_float, floorf)
+UNARY_OP(__vec8_f, __ceil_varying_float, ceilf)
+#else
+static FORCEINLINE __vec8_f __round_varying_float(__vec8_f v) {
+ return _mm512_mask_round_ps(FZERO, 0xFF, v, _MM_ROUND_MODE_NEAREST, _MM_EXPADJ_NONE);
+}
+
+static FORCEINLINE __vec8_f __floor_varying_float(__vec8_f v) {
+ return _mm512_mask_floor_ps(FZERO, 0xFF, v);
+}
+
+static FORCEINLINE __vec8_f __ceil_varying_float(__vec8_f v) {
+ return _mm512_mask_ceil_ps(FZERO, 0xFF, v);
+}
+#endif
+
+#if 0
+UNARY_OP(__vec8_d, __round_varying_double, round)
+UNARY_OP(__vec8_d, __floor_varying_double, floor)
+UNARY_OP(__vec8_d, __ceil_varying_double, ceil)
+#else
+static FORCEINLINE __vec8_d __round_varying_float(__vec8_d v) {
+ return _mm512_svml_round_pd(v);
+}
+
+static FORCEINLINE __vec8_d __floor_varying_float(__vec8_d v) {
+ return _mm512_floor_pd(v);
+}
+
+static FORCEINLINE __vec8_d __ceil_varying_float(__vec8_d v) {
+ return _mm512_ceil_pd(v);
+}
+#endif
+
+
+// min/max
+
+static FORCEINLINE float __min_uniform_float(float a, float b) { return (a<b) ? a : b; }
+static FORCEINLINE float __max_uniform_float(float a, float b) { return (a>b) ? a : b; }
+static FORCEINLINE double __min_uniform_double(double a, double b) { return (a<b) ? a : b; }
+static FORCEINLINE double __max_uniform_double(double a, double b) { return (a>b) ? a : b; }
+
+static FORCEINLINE int32_t __min_uniform_int32(int32_t a, int32_t b) { return (a<b) ? a : b; }
+static FORCEINLINE int32_t __max_uniform_int32(int32_t a, int32_t b) { return (a>b) ? a : b; }
+static FORCEINLINE int32_t __min_uniform_uint32(uint32_t a, uint32_t b) { return (a<b) ? a : b; }
+static FORCEINLINE int32_t __max_uniform_uint32(uint32_t a, uint32_t b) { return (a>b) ? a : b; }
+
+static FORCEINLINE int64_t __min_uniform_int64(int64_t a, int64_t b) { return (a<b) ? a : b; }
+static FORCEINLINE int64_t __max_uniform_int64(int64_t a, int64_t b) { return (a>b) ? a : b; }
+static FORCEINLINE int64_t __min_uniform_uint64(uint64_t a, uint64_t b) { return (a<b) ? a : b; }
+static FORCEINLINE int64_t __max_uniform_uint64(uint64_t a, uint64_t b) { return (a>b) ?
a : b; } + + +#if 0 +BINARY_OP_FUNC(__vec8_f, __max_varying_float, __max_uniform_float) +BINARY_OP_FUNC(__vec8_f, __min_varying_float, __min_uniform_float) +#else +static FORCEINLINE __vec8_f __max_varying_float (__vec8_f v1, __vec8_f v2) { return _mm512_mask_gmax_ps(FZERO, 0xFF, v1, v2);} +static FORCEINLINE __vec8_f __min_varying_float (__vec8_f v1, __vec8_f v2) { return _mm512_mask_gmin_ps(FZERO, 0xFF, v1, v2);} +#endif + +#if 0 +BINARY_OP_FUNC(__vec8_d, __max_varying_double, __max_uniform_double) +BINARY_OP_FUNC(__vec8_d, __min_varying_double, __min_uniform_double) +#else +static FORCEINLINE __vec8_d __max_varying_double(__vec8_d v1, __vec8_d v2) { return _mm512_gmax_pd(v1,v2); } +static FORCEINLINE __vec8_d __min_varying_double(__vec8_d v1, __vec8_d v2) { return _mm512_gmin_pd(v1,v2); } +#endif + +#if 0 +BINARY_OP_FUNC(__vec8_i32, __max_varying_int32, __max_uniform_int32) +BINARY_OP_FUNC(__vec8_i32, __min_varying_int32, __min_uniform_int32) +BINARY_OP_FUNC(__vec8_i32, __max_varying_uint32, __max_uniform_uint32) +BINARY_OP_FUNC(__vec8_i32, __min_varying_uint32, __min_uniform_uint32) +#else +static FORCEINLINE __vec8_i32 __max_varying_int32 (__vec8_i32 v1, __vec8_i32 v2) { return _mm512_mask_max_epi32(IZERO,0xFF, v1, v2);} +static FORCEINLINE __vec8_i32 __min_varying_int32 (__vec8_i32 v1, __vec8_i32 v2) { return _mm512_mask_min_epi32(IZERO,0xFF, v1, v2);} +static FORCEINLINE __vec8_i32 __max_varying_uint32(__vec8_i32 v1, __vec8_i32 v2) { return _mm512_mask_max_epu32(IZERO,0xFF, v1, v2);} +static FORCEINLINE __vec8_i32 __min_varying_uint32(__vec8_i32 v1, __vec8_i32 v2) { return _mm512_mask_min_epu32(IZERO,0xFF, v1, v2);} +#endif + +BINARY_OP_FUNC(__vec8_i64, __max_varying_int64, __max_uniform_int64) +BINARY_OP_FUNC(__vec8_i64, __min_varying_int64, __min_uniform_int64) +BINARY_OP_FUNC(__vec8_i64, __max_varying_uint64, __max_uniform_uint64) +BINARY_OP_FUNC(__vec8_i64, __min_varying_uint64, __min_uniform_uint64) + +// sqrt/rsqrt/rcp + +static FORCEINLINE float __rsqrt_uniform_float(float v) { + return 1.f / sqrtf(v); +} + +static FORCEINLINE float __rcp_uniform_float(float v) { + return 1.f / v; +} + +static FORCEINLINE float __sqrt_uniform_float(float v) { + return sqrtf(v); +} + +static FORCEINLINE double __sqrt_uniform_double(double v) { + return sqrt(v); +} + +#if 0 +UNARY_OP(__vec8_f, __rcp_varying_float, __rcp_uniform_float) +UNARY_OP(__vec8_f, __rsqrt_varying_float, __rsqrt_uniform_float) +UNARY_OP(__vec8_f, __sqrt_varying_float, __sqrt_uniform_float) +#else +static FORCEINLINE __vec8_f __rcp_varying_float(__vec8_f v) { +#ifdef ISPC_FAST_MATH + return _mm512_mask_rcp23_ps(FZERO, 0xFF, v); // Approximation with 23 bits of accuracy. 
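+ // (explanatory note, not in the original source) rcp23/rsqrt23 trade accuracy
+ // for speed; when ISPC_FAST_MATH is not defined, the #else branches below use
+ // the more accurate recip/invsqrt intrinsics instead.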
+#else + return _mm512_mask_recip_ps(FZERO, 0xFF, v); +#endif +} + +static FORCEINLINE __vec8_f __rsqrt_varying_float(__vec8_f v) { +#ifdef ISPC_FAST_MATH + return _mm512_mask_rsqrt23_ps(FZERO,0xFF,v); // Approximation with 0.775ULP accuracy +#else + return _mm512_mask_invsqrt_ps(FZERO,0xFF,v); +#endif +} +static FORCEINLINE __vec8_f __sqrt_varying_float (__vec8_f v) { return _mm512_mask_sqrt_ps(FZERO,0xFF,v);} +#endif + +#if 0 +UNARY_OP(__vec8_d, __sqrt_varying_double, __sqrt_uniform_double) +#else +static FORCEINLINE __vec8_d __sqrt_varying_double(__vec8_d v) { return _mm512_sqrt_pd(v); } +#endif + +/////////////////////////////////////////////////////////////////////////// +// svml +/////////////////////////////////////////////////////////////////////////// + +static FORCEINLINE __vec8_f __svml_logf(__vec8_f v) { return _mm512_mask_log_ps(FZERO,0xFF,v); } +static FORCEINLINE __vec8_f __svml_expf(__vec8_f v) { return _mm512_mask_exp_ps(FZERO,0xFF,v); } +static FORCEINLINE __vec8_f __svml_cosf(__vec8_f v) { return _mm512_mask_cos_ps(FZERO,0xFF,v); } +static FORCEINLINE __vec8_f __svml_powf(__vec8_f a, __vec8_f b) { return _mm512_mask_pow_ps(FZERO,0xFF,a,b); } + +static FORCEINLINE __vec8_d __svml_logd(__vec8_d v) { return _mm512_log_pd(v); } +static FORCEINLINE __vec8_d __svml_expd(__vec8_d v) { return _mm512_exp_pd(v); } +static FORCEINLINE __vec8_d __svml_cosd(__vec8_d v) { return _mm512_cos_pd(v); } +static FORCEINLINE __vec8_d __svml_powd(__vec8_d a, __vec8_d b) { return _mm512_pow_pd(a,b); } + +/////////////////////////////////////////////////////////////////////////// +// bit ops + +static FORCEINLINE int32_t __popcnt_int32(uint32_t v) { + int count = 0; + for (; v != 0; v >>= 1) + count += (v & 1); + return count; +} + +static FORCEINLINE int32_t __popcnt_int64(uint64_t v) { + int count = 0; + for (; v != 0; v >>= 1) + count += (v & 1); + return count; +} + +static FORCEINLINE int32_t __count_trailing_zeros_i32(uint32_t v) { + if (v == 0) + return 32; + + int count = 0; + while ((v & 1) == 0) { + ++count; + v >>= 1; + } + return count; +} + +static FORCEINLINE int64_t __count_trailing_zeros_i64(uint64_t v) { + if (v == 0) + return 64; + + int count = 0; + while ((v & 1) == 0) { + ++count; + v >>= 1; + } + return count; +} + +static FORCEINLINE int32_t __count_leading_zeros_i32(uint32_t v) { + if (v == 0) + return 32; + + int count = 0; + while ((v & (1<<31)) == 0) { + ++count; + v <<= 1; + } + return count; +} + +static FORCEINLINE int64_t __count_leading_zeros_i64(uint64_t v) { + if (v == 0) + return 64; + + int count = 0; + while ((v & (1ull<<63)) == 0) { + ++count; + v <<= 1; + } + return count; +} + +/////////////////////////////////////////////////////////////////////////// +// reductions + +#if 0 +REDUCE_ADD(float, __vec8_f, __reduce_add_float) +REDUCE_MINMAX(float, __vec8_f, __reduce_min_float, <) +REDUCE_MINMAX(float, __vec8_f, __reduce_max_float, >) +#else +static FORCEINLINE float __reduce_add_float(__vec8_f v) { return _mm512_mask_reduce_add_ps(0xFF,v); } +static FORCEINLINE float __reduce_min_float(__vec8_f v) { return _mm512_mask_reduce_min_ps(0xFF,v); } +static FORCEINLINE float __reduce_max_float(__vec8_f v) { return _mm512_mask_reduce_max_ps(0xFF,v); } +#endif + +#if 0 +REDUCE_ADD(double, __vec8_d, __reduce_add_double) +REDUCE_MINMAX(double, __vec8_d, __reduce_min_double, <) +REDUCE_MINMAX(double, __vec8_d, __reduce_max_double, >) +#else +static FORCEINLINE float __reduce_add_double(__vec8_d v) { return _mm512_reduce_add_pd(v); } +static FORCEINLINE float 
__reduce_min_double(__vec8_d v) { return _mm512_reduce_min_pd(v); } +static FORCEINLINE float __reduce_max_double(__vec8_d v) { return _mm512_reduce_max_pd(v); } +#endif + + + +#if 0 +REDUCE_ADD (int64_t, __vec8_i32, __reduce_add_int32) +REDUCE_MINMAX(int32_t, __vec8_i32, __reduce_min_int32, <) +REDUCE_MINMAX(int32_t, __vec8_i32, __reduce_max_int32, >) +REDUCE_MINMAX(uint32_t, __vec8_i32, __reduce_min_uint32, <) +REDUCE_MINMAX(uint32_t, __vec8_i32, __reduce_max_uint32, >) +#else +static FORCEINLINE int64_t __reduce_add_int32 (__vec8_i32 v) { return _mm512_mask_reduce_add_epi32(0xFF, v);} +static FORCEINLINE int32_t __reduce_min_int32 (__vec8_i32 v) { return _mm512_mask_reduce_min_epi32(0xFF, v);} +static FORCEINLINE int32_t __reduce_max_int32 (__vec8_i32 v) { return _mm512_mask_reduce_max_epi32(0xFF, v);} +static FORCEINLINE uint32_t __reduce_min_uint32 (__vec8_i32 v) { return _mm512_mask_reduce_min_epu32(0xFF, v);} +static FORCEINLINE uint32_t __reduce_max_uint32 (__vec8_i32 v) { return _mm512_mask_reduce_max_epu32(0xFF, v);} +#endif + +REDUCE_ADD ( int16_t, __vec8_i8, __reduce_add_int8) +REDUCE_ADD ( int32_t, __vec8_i16, __reduce_add_int16) +REDUCE_ADD ( int64_t, __vec8_i64, __reduce_add_int64) +REDUCE_MINMAX( int64_t, __vec8_i64, __reduce_min_int64, <) +REDUCE_MINMAX( int64_t, __vec8_i64, __reduce_max_int64, >) +REDUCE_MINMAX(uint64_t, __vec8_i64, __reduce_min_uint64, <) +REDUCE_MINMAX(uint64_t, __vec8_i64, __reduce_max_uint64, >) + +/////////////////////////////////////////////////////////////////////////// +// masked load/store + +static FORCEINLINE __vec8_i8 __masked_load_i8(void *p, + __vec8_i1 mask) { + __vec8_i8 ret; + int8_t *ptr = (int8_t *)p; + for (int i = 0; i < 8; ++i) + if ((mask.v & (1 << i)) != 0) + ret[i] = ptr[i]; + return ret; +} + +static FORCEINLINE __vec8_i16 __masked_load_i16(void *p, + __vec8_i1 mask) { + __vec8_i16 ret; + int16_t *ptr = (int16_t *)p; + for (int i = 0; i < 8; ++i) + if ((mask.v & (1 << i)) != 0) + ret[i] = ptr[i]; + return ret; +} + +#if 0 +static FORCEINLINE __vec8_i32 __masked_load_i32(void *p, + __vec8_i1 mask) { + __vec8_i32 ret; + int32_t *ptr = (int32_t *)p; + for (int i = 0; i < 8; ++i) + if ((mask.v & (1 << i)) != 0) + ret[i] = ptr[i]; + return ret; +} +#else +static FORCEINLINE __vec8_i32 __masked_load_i32(void *p, __vec8_i1 mask) { +#ifdef ISPC_FORCE_ALIGNED_MEMORY + return _mm512_mask_load_epi32(__vec8_i32(), mask, p); +#else + __vec8_i32 tmp; + tmp = _mm512_mask_extloadunpacklo_epi32(tmp, 0xFF, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + tmp = _mm512_mask_extloadunpackhi_epi32(tmp, 0xFF, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + __vec8_i32 ret; + return _mm512_mask_mov_epi32(ret, 0xFF & mask, tmp); +#endif +} +#endif + +#if 0 +static FORCEINLINE __vec8_f __masked_load_float(void *p, + __vec8_i1 mask) { + __vec8_f ret; + float *ptr = (float *)p; + for (int i = 0; i < 8; ++i) + if ((mask.v & (1 << i)) != 0) + ret[i] = ptr[i]; + return ret; +} +#else +static FORCEINLINE __vec8_f __masked_load_float(void *p, __vec8_i1 mask) { +#ifdef ISPC_FORCE_ALIGNED_MEMORY + return _mm512_mask_load_ps(_mm512_undefined_ps(), mask,p); +#else + __vec8_f tmp; + tmp = _mm512_mask_extloadunpacklo_ps(tmp, 0xFF, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + tmp = _mm512_mask_extloadunpackhi_ps(tmp, 0xFF, (uint8_t*)p+64, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + __vec8_f ret; + return _mm512_mask_mov_ps(ret, 0xFF & mask, tmp); +#endif +} +#endif + +static FORCEINLINE __vec8_i64 __masked_load_i64(void *p, + __vec8_i1 mask) { + __vec8_i64 ret; + int64_t *ptr 
= (int64_t *)p; + for (int i = 0; i < 8; ++i) + if ((mask.v & (1 << i)) != 0) + ret[i] = ptr[i]; + return ret; +} + +#if 0 +static FORCEINLINE __vec8_d __masked_load_double(void *p, + __vec8_i1 mask) { + __vec8_d ret; + double *ptr = (double *)p; + for (int i = 0; i < 8; ++i) + if ((mask.v & (1 << i)) != 0) + ret[i] = ptr[i]; + return ret; +} +#else +static FORCEINLINE __vec8_d __masked_load_double(void *p, __vec8_i1 mask) { +#ifdef ISPC_FORCE_ALIGNED_MEMORY + __vec8_d ret = FZERO; + ret = _mm512_mask_load_pd(ret, 0xFF & mask, p); + return ret; +#else + __vec8_d tmp = FZERO; + tmp.v = _mm512_mask_extloadunpacklo_pd(tmp.v, 0xFF, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v = _mm512_mask_extloadunpackhi_pd(tmp.v, 0xFF, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + __vec8_d ret = FZERO; + ret.v = _mm512_mask_mov_pd(ret.v, mask, tmp.v); + return ret; +#endif +} +#endif + + +static FORCEINLINE void __masked_store_i8(void *p, __vec8_i8 val, + __vec8_i1 mask) { + int8_t *ptr = (int8_t *)p; + for (int i = 0; i < 8; ++i) + if ((mask.v & (1 << i)) != 0) + ptr[i] = val[i]; +} + +static FORCEINLINE void __masked_store_i16(void *p, __vec8_i16 val, + __vec8_i1 mask) { + int16_t *ptr = (int16_t *)p; + for (int i = 0; i < 8; ++i) + if ((mask.v & (1 << i)) != 0) + ptr[i] = val[i]; +} + +#if 0 +static FORCEINLINE void __masked_store_i32(void *p, __vec8_i32 val, + __vec8_i1 mask) { + int32_t *ptr = (int32_t *)p; + for (int i = 0; i < 8; ++i) + if ((mask.v & (1 << i)) != 0) + ptr[i] = val[i]; +} +#else +static FORCEINLINE void __masked_store_i32(void *p, __vec8_i32 val, __vec8_i1 mask) { +#ifdef ISPC_FORCE_ALIGNED_MEMORY + _mm512_mask_store_epi32(p, mask, val.v); +#else + __vec8_i32 tmp; + tmp = _mm512_extloadunpacklo_epi32(tmp, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + tmp = _mm512_extloadunpackhi_epi32(tmp, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + tmp = _mm512_mask_mov_epi32(tmp, 0xFF & mask, val); + _mm512_mask_extpackstorelo_epi32( p, 0xFF, tmp, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, 0xFF, tmp, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); +#endif +} +#endif + +#if 0 +static FORCEINLINE void __masked_store_float(void *p, __vec8_f val, + __vec8_i1 mask) { + float *ptr = (float *)p; + for (int i = 0; i < 8; ++i) + if ((mask.v & (1 << i)) != 0) + ptr[i] = val[i]; +} +#else +static FORCEINLINE void __masked_store_float(void *p, __vec8_f val, + __vec8_i1 mask) { +#ifdef ISPC_FORCE_ALIGNED_MEMORY + _mm512_mask_store_ps(p, 0xFF & mask, val.v); +#else + __vec8_f tmp = FZERO; + tmp = _mm512_extloadunpacklo_ps(tmp, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + tmp = _mm512_extloadunpackhi_ps(tmp, (uint8_t*)p+64, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + tmp = _mm512_mask_mov_ps(tmp, 0xFF & mask, val); + _mm512_mask_extpackstorelo_ps( p, 0xFF, tmp, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); + _mm512_mask_extpackstorehi_ps((uint8_t*)p+64, 0xFF, tmp, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); +#endif +} +#endif + +static FORCEINLINE void __masked_store_i64(void *p, __vec8_i64 val, + __vec8_i1 mask) { + int64_t *ptr = (int64_t *)p; + for (int i = 0; i < 8; ++i) + if ((mask.v & (1 << i)) != 0) + ptr[i] = val[i]; +} + +#if 0 +static FORCEINLINE void __masked_store_double(void *p, __vec8_d val, + __vec8_i1 mask) { + double *ptr = (double *)p; + for (int i = 0; i < 8; ++i) + if ((mask.v & (1 << i)) != 0) + ptr[i] = val[i]; +} +#else +static FORCEINLINE void __masked_store_double(void *p, __vec8_d val, + __vec8_i1 mask) { +#ifdef ISPC_FORCE_ALIGNED_MEMORY + 
_mm512_mask_store_pd(p, mask, val.v); +#else + __vec8_d tmp; + tmp.v = _mm512_extloadunpacklo_pd(tmp.v, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v = _mm512_extloadunpackhi_pd(tmp.v, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v = _mm512_mask_mov_pd(tmp.v, mask, val.v); + _mm512_extpackstorelo_pd(p, tmp.v, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_pd((uint8_t*)p+64, tmp.v, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); +#endif +} +#endif + +static FORCEINLINE void __masked_store_blend_i8(void *p, __vec8_i8 val, + __vec8_i1 mask) { + __masked_store_i8(p, val, mask); +} + +static FORCEINLINE void __masked_store_blend_i16(void *p, __vec8_i16 val, + __vec8_i1 mask) { + __masked_store_i16(p, val, mask); +} + +static FORCEINLINE void __masked_store_blend_i32(void *p, __vec8_i32 val, + __vec8_i1 mask) { + __masked_store_i32(p, val, mask); +} + +static FORCEINLINE void __masked_store_blend_float(void *p, __vec8_f val, + __vec8_i1 mask) { + __masked_store_float(p, val, mask); +} + +static FORCEINLINE void __masked_store_blend_i64(void *p, __vec8_i64 val, + __vec8_i1 mask) { + __masked_store_i64(p, val, mask); +} + +static FORCEINLINE void __masked_store_blend_double(void *p, __vec8_d val, + __vec8_i1 mask) { + __masked_store_double(p, val, mask); +} + +/////////////////////////////////////////////////////////////////////////// +// gather/scatter + +// offsets * offsetScale is in bytes (for all of these) + +#define GATHER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ +static FORCEINLINE VTYPE FUNC(unsigned char *b, uint32_t scale, \ + OTYPE offset, __vec8_i1 mask) { \ + VTYPE ret; \ + int8_t *base = (int8_t *)b; \ + for (int i = 0; i < 8; ++i) \ + if ((mask.v & (1 << i)) != 0) { \ + STYPE *ptr = (STYPE *)(base + scale * offset[i]); \ + ret[i] = *ptr; \ + } \ + return ret; \ +} + + +/****************/ +#if 0 +GATHER_BASE_OFFSETS(__vec8_i8, int8_t, __vec8_i32, __gather_base_offsets32_i8) +#else +static FORCEINLINE __vec8_i8 __gather_base_offsets32_i8(uint8_t *base, uint32_t scale, __vec8_i32 offsets, __vec8_i1 mask) +{ + // (iw): need to temporarily store as int because gathers can only return ints. 
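+ // (explanatory note, not in the original source) As elsewhere in this header,
+ // only the low 8 of the 16 KNC lanes hold __vec8_* data, so the 16-wide gather
+ // below is driven with the mask restricted to 0xFF.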
+ __vec8_i32 tmp = _mm512_mask_i32extgather_epi32(_mm512_undefined_epi32(), 0xFF & mask, offsets, base, + _MM_UPCONV_EPI32_SINT8, scale, + _MM_HINT_NONE); + // now, downconverting to chars into temporary char vector + __vec8_i8 ret; + _mm512_mask_extstore_epi32(ret.data,0xFF,tmp,_MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE); + return ret; +} +#endif +GATHER_BASE_OFFSETS(__vec8_i8, int8_t, __vec8_i64, __gather_base_offsets64_i8) +/****************/ +GATHER_BASE_OFFSETS(__vec8_i16, int16_t, __vec8_i32, __gather_base_offsets32_i16) +GATHER_BASE_OFFSETS(__vec8_i16, int16_t, __vec8_i64, __gather_base_offsets64_i16) +/****************/ +#if 0 +GATHER_BASE_OFFSETS(__vec8_i32, int32_t, __vec8_i32, __gather_base_offsets32_i32) +#else +static FORCEINLINE __vec8_i32 __gather_base_offsets32_i32(uint8_t *base, uint32_t scale, __vec8_i32 offsets, __vec8_i1 mask) +{ + return _mm512_mask_i32extgather_epi32(_mm512_undefined_epi32(), 0xFF & mask, offsets, + base, _MM_UPCONV_EPI32_NONE, scale, + _MM_HINT_NONE); +} +#endif +GATHER_BASE_OFFSETS(__vec8_i32, int32_t, __vec8_i64, __gather_base_offsets64_i32) +/****************/ +#if 0 +GATHER_BASE_OFFSETS(__vec8_f, float, __vec8_i32, __gather_base_offsets32_float) +#else +static FORCEINLINE __vec8_f __gather_base_offsets32_float(uint8_t *base, uint32_t scale, __vec8_i32 offsets, __vec8_i1 mask) +{ + return _mm512_mask_i32extgather_ps(_mm512_undefined_ps(), 0xFF & mask, offsets, + base, _MM_UPCONV_PS_NONE, scale, + _MM_HINT_NONE); +} +#endif +GATHER_BASE_OFFSETS(__vec8_f, float, __vec8_i64, __gather_base_offsets64_float) +/****************/ +GATHER_BASE_OFFSETS(__vec8_i64, int64_t, __vec8_i32, __gather_base_offsets32_i64) +GATHER_BASE_OFFSETS(__vec8_i64, int64_t, __vec8_i64, __gather_base_offsets64_i64) +/****************/ +#if 0 +GATHER_BASE_OFFSETS(__vec8_d, double, __vec8_i32, __gather_base_offsets32_double) +#else +static FORCEINLINE __vec8_d __gather_base_offsets32_double(uint8_t *base, uint32_t scale, __vec8_i32 offsets, __vec8_i1 mask) +{ + __vec8_d ret; + ret.v = _mm512_mask_i32loextgather_pd(_mm512_undefined_pd(), mask, offsets, + base, _MM_UPCONV_PD_NONE, scale, + _MM_HINT_NONE); +#if 0 + __m512i shuffled_offsets = _mm512_permute4f128_epi32(offsets.v, _MM_PERM_DCDC); + const __mmask8 mask8 = 0x00FF & (mask >> 8); /* evghenii::testme */ + ret.v2 = _mm512_mask_i32loextgather_pd(_mm512_undefined_pd(), mask8, shuffled_offsets, + base, _MM_UPCONV_PD_NONE, scale, + _MM_HINT_NONE); +#endif + return ret; +} +#endif +GATHER_BASE_OFFSETS(__vec8_d, double, __vec8_i64, __gather_base_offsets64_double) + +#define GATHER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \ +static FORCEINLINE VTYPE FUNC(PTRTYPE ptrs, __vec8_i1 mask) { \ + VTYPE ret; \ + for (int i = 0; i < 8; ++i) \ + if ((mask.v & (1 << i)) != 0) { \ + STYPE *ptr = (STYPE *)ptrs[i]; \ + ret[i] = *ptr; \ + } \ + return ret; \ +} +#define GATHER_GENERALF(VTYPE, STYPE, PTRTYPE, FUNC,FUNC1) \ +static FORCEINLINE VTYPE FUNC(PTRTYPE ptrs, __vec8_i1 mask) { \ + return FUNC1(0, 1, ptrs, mask); \ +} + + +#if 1 +/***********/ +GATHER_GENERALF(__vec8_i8, int8_t, __vec8_i32, __gather32_i8, __gather_base_offsets32_i8) +GATHER_GENERALF(__vec8_i16, int16_t, __vec8_i32, __gather32_i16, __gather_base_offsets32_i16) +GATHER_GENERALF(__vec8_i32, int32_t, __vec8_i32, __gather32_i32, __gather_base_offsets32_i32) +GATHER_GENERALF(__vec8_i64, int64_t, __vec8_i32, __gather32_i64, __gather_base_offsets32_i64) +GATHER_GENERALF(__vec8_f, float, __vec8_i32, __gather32_float, __gather_base_offsets32_float) +GATHER_GENERALF(__vec8_d, double, 
__vec8_i32, __gather32_double, __gather_base_offsets32_double) +/***********/ +GATHER_GENERAL(__vec8_i8, int8_t, __vec8_i64, __gather64_i8); +GATHER_GENERAL(__vec8_i16, int16_t, __vec8_i64, __gather64_i16); +GATHER_GENERAL(__vec8_i32, int32_t, __vec8_i64, __gather64_i32); +GATHER_GENERAL(__vec8_i64, int64_t, __vec8_i64, __gather64_i64); +GATHER_GENERAL(__vec8_f, float, __vec8_i64, __gather64_float); +GATHER_GENERAL(__vec8_d, double, __vec8_i64, __gather64_double); +/***********/ +#endif + +// scatter + +#define SCATTER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ +static FORCEINLINE void FUNC(unsigned char *b, uint32_t scale, \ + OTYPE offset, VTYPE val, \ + __vec8_i1 mask) { \ + int8_t *base = (int8_t *)b; \ + for (int i = 0; i < 8; ++i) \ + if ((mask.v & (1 << i)) != 0) { \ + STYPE *ptr = (STYPE *)(base + scale * offset[i]); \ + *ptr = val[i]; \ + } \ +} + + +/*****************/ +SCATTER_BASE_OFFSETS(__vec8_i8, int8_t, __vec8_i32, __scatter_base_offsets32_i8) +SCATTER_BASE_OFFSETS(__vec8_i8, int8_t, __vec8_i64, __scatter_base_offsets64_i8) +/*****************/ +SCATTER_BASE_OFFSETS(__vec8_i16, int16_t, __vec8_i32, __scatter_base_offsets32_i16) +SCATTER_BASE_OFFSETS(__vec8_i16, int16_t, __vec8_i64, __scatter_base_offsets64_i16) +/*****************/ +#if 0 +SCATTER_BASE_OFFSETS(__vec8_i32, int32_t, __vec8_i32, __scatter_base_offsets32_i32) +#else +static FORCEINLINE void __scatter_base_offsets32_i32(uint8_t *b, uint32_t scale, __vec8_i32 offsets, __vec8_i32 val, __vec8_i1 mask) +{ + _mm512_mask_i32extscatter_epi32(b, 0xFF & mask, offsets, val, + _MM_DOWNCONV_EPI32_NONE, scale, + _MM_HINT_NONE); +} +#endif +SCATTER_BASE_OFFSETS(__vec8_i32, int32_t, __vec8_i64, __scatter_base_offsets64_i32) +/*****************/ +#if 0 +SCATTER_BASE_OFFSETS(__vec8_f, float, __vec8_i32, __scatter_base_offsets32_float) +#else +static FORCEINLINE void __scatter_base_offsets32_float(void *base, uint32_t scale, __vec8_i32 offsets, + __vec8_f val, __vec8_i1 mask) +{ + _mm512_mask_i32extscatter_ps(base, 0xFF & mask, offsets, val, + _MM_DOWNCONV_PS_NONE, scale, + _MM_HINT_NONE); +} +#endif +SCATTER_BASE_OFFSETS(__vec8_f, float, __vec8_i64, __scatter_base_offsets64_float) +/*****************/ +SCATTER_BASE_OFFSETS(__vec8_i64, int64_t, __vec8_i32, __scatter_base_offsets32_i64) +SCATTER_BASE_OFFSETS(__vec8_i64, int64_t, __vec8_i64, __scatter_base_offsets64_i64) +/*****************/ +#if 0 /* evghenii::to implement */ +SCATTER_BASE_OFFSETS(__vec8_d, double, __vec8_i32, __scatter_base_offsets32_double) +#else /* evghenii:testme */ +static FORCEINLINE void __scatter_base_offsets32_double(void *base, uint32_t scale, __vec8_i32 offsets, + __vec8_d val, __vec8_i1 mask) +{ + _mm512_mask_i32loextscatter_pd(base, mask, offsets, val.v, + _MM_DOWNCONV_PD_NONE, scale, + _MM_HINT_NONE); +} +#endif +SCATTER_BASE_OFFSETS(__vec8_d, double, __vec8_i64, __scatter_base_offsets64_double) + +#define SCATTER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \ +static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec8_i1 mask) { \ + VTYPE ret; \ + for (int i = 0; i < 8; ++i) \ + if ((mask.v & (1 << i)) != 0) { \ + STYPE *ptr = (STYPE *)ptrs[i]; \ + *ptr = val[i]; \ + } \ +} +#define SCATTER_GENERALF(VTYPE, STYPE, PTRTYPE, FUNC,FUNC1) \ +static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec8_i1 mask) { \ + return FUNC1(0, 1, ptrs, val, mask); \ +} + +#if 1 +/***********/ +SCATTER_GENERALF(__vec8_i8, int8_t, __vec8_i32, __scatter32_i8, __scatter_base_offsets32_i8) +SCATTER_GENERALF(__vec8_i16, int16_t, __vec8_i32, __scatter32_i16, 
__scatter_base_offsets32_i16) +SCATTER_GENERALF(__vec8_i32, int32_t, __vec8_i32, __scatter32_i32, __scatter_base_offsets32_i32) +SCATTER_GENERALF(__vec8_i64, int64_t, __vec8_i32, __scatter32_i64, __scatter_base_offsets32_i64) +SCATTER_GENERALF(__vec8_f, float, __vec8_i32, __scatter32_float, __scatter_base_offsets32_float) +SCATTER_GENERALF(__vec8_d, double, __vec8_i32, __scatter32_double, __scatter_base_offsets32_double) +/***********/ +SCATTER_GENERAL(__vec8_i8, int8_t, __vec8_i64, __scatter64_i8) +SCATTER_GENERAL(__vec8_i16, int16_t, __vec8_i64, __scatter64_i16) +SCATTER_GENERAL(__vec8_i32, int32_t, __vec8_i64, __scatter64_i32) +SCATTER_GENERAL(__vec8_f, float, __vec8_i64, __scatter64_float) +SCATTER_GENERAL(__vec8_i64, int64_t, __vec8_i64, __scatter64_i64) +SCATTER_GENERAL(__vec8_d, double, __vec8_i64, __scatter64_double) +/***********/ +#endif + +/////////////////////////////////////////////////////////////////////////// +// packed load/store + +#if 0 +static FORCEINLINE int32_t __packed_load_active(int32_t *ptr, __vec8_i32 *val, + __vec8_i1 mask) { + int count = 0; + for (int i = 0; i < 8; ++i) { + if ((mask.v & (1 << i)) != 0) { + val->operator[](i) = *ptr++; + ++count; + } + } + return count; +} +static FORCEINLINE int32_t __packed_store_active(int32_t *ptr, + __vec8_i32 val, + __vec8_i1 mask) { + int count = 0; + for (int i = 0; i < 8; ++i) { + if ((mask.v & (1 << i)) != 0) { + *ptr++ = val[i]; + ++count; + } + } + return count; +} +static FORCEINLINE int32_t __packed_load_active(uint32_t *ptr, + __vec8_i32 *val, + __vec8_i1 mask) { + int count = 0; + for (int i = 0; i < 8; ++i) { + if ((mask.v & (1 << i)) != 0) { + val->operator[](i) = *ptr++; + ++count; + } + } + return count; +} +static FORCEINLINE int32_t __packed_store_active(uint32_t *ptr, + __vec8_i32 val, + __vec8_i1 mask) { + int count = 0; + for (int i = 0; i < 8; ++i) { + if ((mask.v & (1 << i)) != 0) { + *ptr++ = val[i]; + ++count; + } + } + return count; +} +#else +static FORCEINLINE int32_t __packed_load_active(uint32_t *p, __vec8_i32 *val, + __vec8_i1 mask) { + __vec8_i32 v = __load<64>(val); + v = _mm512_mask_extloadunpacklo_epi32(v, 0xFF & mask, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + v = _mm512_mask_extloadunpackhi_epi32(v, 0xFF & mask, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + __store<64>(val, v); + return _mm_countbits_32(uint32_t(0xFF & mask)); +} +static FORCEINLINE int32_t __packed_store_active(uint32_t *p, __vec8_i32 val, + __vec8_i1 mask) { + _mm512_mask_extpackstorelo_epi32(p, 0xFF & mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, 0xFF & mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + return _mm_countbits_32(uint32_t(0xFF & mask)); +} +static FORCEINLINE int32_t __packed_load_active(int32_t *p, __vec8_i32 *val, + __vec8_i1 mask) { + __vec8_i32 v = __load<64>(val); + v = _mm512_mask_extloadunpacklo_epi32(v, 0xFF & mask, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + v = _mm512_mask_extloadunpackhi_epi32(v, 0xFF & mask, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + __store<64>(val, v); + return _mm_countbits_32(uint32_t(0xFF & mask)); +} +static FORCEINLINE int32_t __packed_store_active(int32_t *p, __vec8_i32 val, + __vec8_i1 mask) { + _mm512_mask_extpackstorelo_epi32(p, 0xFF & mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, 0xFF & mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + return _mm_countbits_32(uint32_t(0xFF & mask)); +} +#endif + 
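// Illustrative aside (not part of the patch above): the packed load/store primitives implement stream compaction/expansion per 8-wide chunk. Below is a self-contained scalar model of the same semantics, using a plain 8-bit mask and int arrays instead of the __vec8_* types; all names here are illustrative only.
#include <stdint.h>

// Write only the active lanes of val[], densely packed, to dst; return the
// number of values written (what __packed_store_active reports via popcount).
static inline int packed_store_active_ref(int32_t *dst, const int32_t val[8], uint8_t mask) {
    int count = 0;
    for (int i = 0; i < 8; ++i)
        if (mask & (1u << i))
            dst[count++] = val[i];
    return count;
}

// Fill only the active lanes of val[] from consecutive values at src; return
// the number of values consumed (mirrors the scalar reference loops above).
static inline int packed_load_active_ref(const int32_t *src, int32_t val[8], uint8_t mask) {
    int count = 0;
    for (int i = 0; i < 8; ++i)
        if (mask & (1u << i))
            val[i] = src[count++];
    return count;
}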
+/////////////////////////////////////////////////////////////////////////// +// aos/soa + +static FORCEINLINE void __soa_to_aos3_float(__vec8_f v0, __vec8_f v1, __vec8_f v2, + float *ptr) { + for (int i = 0; i < 8; ++i) { + *ptr++ = __extract_element(v0, i); + *ptr++ = __extract_element(v1, i); + *ptr++ = __extract_element(v2, i); + } +} + +static FORCEINLINE void __aos_to_soa3_float(float *ptr, __vec8_f *out0, __vec8_f *out1, + __vec8_f *out2) { + for (int i = 0; i < 8; ++i) { + __insert_element(out0, i, *ptr++); + __insert_element(out1, i, *ptr++); + __insert_element(out2, i, *ptr++); + } +} + +static FORCEINLINE void __soa_to_aos4_float(__vec8_f v0, __vec8_f v1, __vec8_f v2, + __vec8_f v3, float *ptr) { + for (int i = 0; i < 8; ++i) { + *ptr++ = __extract_element(v0, i); + *ptr++ = __extract_element(v1, i); + *ptr++ = __extract_element(v2, i); + *ptr++ = __extract_element(v3, i); + } +} + +static FORCEINLINE void __aos_to_soa4_float(float *ptr, __vec8_f *out0, __vec8_f *out1, + __vec8_f *out2, __vec8_f *out3) { + for (int i = 0; i < 8; ++i) { + __insert_element(out0, i, *ptr++); + __insert_element(out1, i, *ptr++); + __insert_element(out2, i, *ptr++); + __insert_element(out3, i, *ptr++); + } +} + +/////////////////////////////////////////////////////////////////////////// +// prefetch + +static FORCEINLINE void __prefetch_read_uniform_1(unsigned char *p) { + _mm_prefetch((char *)p, _MM_HINT_T0); // prefetch into L1$ +} + +static FORCEINLINE void __prefetch_read_uniform_2(unsigned char *p) { + _mm_prefetch((char *)p, _MM_HINT_T1); // prefetch into L2$ +} + +static FORCEINLINE void __prefetch_read_uniform_3(unsigned char *p) { + // There is no L3$ on KNC, don't want to pollute L2$ unecessarily +} + +static FORCEINLINE void __prefetch_read_uniform_nt(unsigned char *p) { + _mm_prefetch((char *)p, _MM_HINT_T2); // prefetch into L2$ with non-temporal hint + // _mm_prefetch(p, _MM_HINT_NTA); // prefetch into L1$ with non-temporal hint +} + +/////////////////////////////////////////////////////////////////////////// +// atomics + +static FORCEINLINE uint32_t __atomic_add(uint32_t *p, uint32_t v) { +#ifdef _MSC_VER + return InterlockedAdd((LONG volatile *)p, v) - v; +#else + return __sync_fetch_and_add(p, v); +#endif +} + +static FORCEINLINE uint32_t __atomic_sub(uint32_t *p, uint32_t v) { +#ifdef _MSC_VER + return InterlockedAdd((LONG volatile *)p, -v) + v; +#else + return __sync_fetch_and_sub(p, v); +#endif +} + +static FORCEINLINE uint32_t __atomic_and(uint32_t *p, uint32_t v) { +#ifdef _MSC_VER + return InterlockedAnd((LONG volatile *)p, v); +#else + return __sync_fetch_and_and(p, v); +#endif +} + +static FORCEINLINE uint32_t __atomic_or(uint32_t *p, uint32_t v) { +#ifdef _MSC_VER + return InterlockedOr((LONG volatile *)p, v); +#else + return __sync_fetch_and_or(p, v); +#endif +} + +static FORCEINLINE uint32_t __atomic_xor(uint32_t *p, uint32_t v) { +#ifdef _MSC_VER + return InterlockedXor((LONG volatile *)p, v); +#else + return __sync_fetch_and_xor(p, v); +#endif +} + +static FORCEINLINE uint32_t __atomic_min(uint32_t *p, uint32_t v) { + int32_t old, min; + do { + old = *((volatile int32_t *)p); + min = (old < (int32_t)v) ? old : (int32_t)v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange((LONG volatile *)p, min, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, min) == false); +#endif + return old; +} + +static FORCEINLINE uint32_t __atomic_max(uint32_t *p, uint32_t v) { + int32_t old, max; + do { + old = *((volatile int32_t *)p); + max = (old > (int32_t)v) ? 
old : (int32_t)v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange((LONG volatile *)p, max, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, max) == false); +#endif + return old; +} + +static FORCEINLINE uint32_t __atomic_umin(uint32_t *p, uint32_t v) { + uint32_t old, min; + do { + old = *((volatile uint32_t *)p); + min = (old < v) ? old : v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange((LONG volatile *)p, min, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, min) == false); +#endif + return old; +} + +static FORCEINLINE uint32_t __atomic_umax(uint32_t *p, uint32_t v) { + uint32_t old, max; + do { + old = *((volatile uint32_t *)p); + max = (old > v) ? old : v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange((LONG volatile *)p, max, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, max) == false); +#endif + return old; +} + +static FORCEINLINE uint32_t __atomic_xchg(uint32_t *p, uint32_t v) { +#ifdef _MSC_VER + return InterlockedExchange((LONG volatile *)p, v); +#else + return __sync_lock_test_and_set(p, v); +#endif +} + +static FORCEINLINE uint32_t __atomic_cmpxchg(uint32_t *p, uint32_t cmpval, + uint32_t newval) { +#ifdef _MSC_VER + return InterlockedCompareExchange((LONG volatile *)p, newval, cmpval); +#else + return __sync_val_compare_and_swap(p, cmpval, newval); +#endif +} + +static FORCEINLINE uint64_t __atomic_add(uint64_t *p, uint64_t v) { +#ifdef _MSC_VER + return InterlockedAdd64((LONGLONG volatile *)p, v) - v; +#else + return __sync_fetch_and_add(p, v); +#endif +} + +static FORCEINLINE uint64_t __atomic_sub(uint64_t *p, uint64_t v) { +#ifdef _MSC_VER + return InterlockedAdd64((LONGLONG volatile *)p, -v) + v; +#else + return __sync_fetch_and_sub(p, v); +#endif +} + +static FORCEINLINE uint64_t __atomic_and(uint64_t *p, uint64_t v) { +#ifdef _MSC_VER + return InterlockedAnd64((LONGLONG volatile *)p, v) - v; +#else + return __sync_fetch_and_and(p, v); +#endif +} + +static FORCEINLINE uint64_t __atomic_or(uint64_t *p, uint64_t v) { +#ifdef _MSC_VER + return InterlockedOr64((LONGLONG volatile *)p, v) - v; +#else + return __sync_fetch_and_or(p, v); +#endif +} + +static FORCEINLINE uint64_t __atomic_xor(uint64_t *p, uint64_t v) { +#ifdef _MSC_VER + return InterlockedXor64((LONGLONG volatile *)p, v) - v; +#else + return __sync_fetch_and_xor(p, v); +#endif +} + +static FORCEINLINE uint64_t __atomic_min(uint64_t *p, uint64_t v) { + int64_t old, min; + do { + old = *((volatile int64_t *)p); + min = (old < (int64_t)v) ? old : (int64_t)v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange64((LONGLONG volatile *)p, min, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, min) == false); +#endif + return old; +} + +static FORCEINLINE uint64_t __atomic_max(uint64_t *p, uint64_t v) { + int64_t old, max; + do { + old = *((volatile int64_t *)p); + max = (old > (int64_t)v) ? old : (int64_t)v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange64((LONGLONG volatile *)p, max, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, max) == false); +#endif + return old; +} + +static FORCEINLINE uint64_t __atomic_umin(uint64_t *p, uint64_t v) { + uint64_t old, min; + do { + old = *((volatile uint64_t *)p); + min = (old < v) ? 
old : v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange64((LONGLONG volatile *)p, min, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, min) == false); +#endif + return old; +} + +static FORCEINLINE uint64_t __atomic_umax(uint64_t *p, uint64_t v) { + uint64_t old, max; + do { + old = *((volatile uint64_t *)p); + max = (old > v) ? old : v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange64((LONGLONG volatile *)p, max, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, max) == false); +#endif + return old; +} + +static FORCEINLINE uint64_t __atomic_xchg(uint64_t *p, uint64_t v) { +#ifdef _MSC_VER + return InterlockedExchange64((LONGLONG volatile *)p, v); +#else + return __sync_lock_test_and_set(p, v); +#endif +} + +static FORCEINLINE uint64_t __atomic_cmpxchg(uint64_t *p, uint64_t cmpval, + uint64_t newval) { +#ifdef _MSC_VER + return InterlockedCompareExchange64((LONGLONG volatile *)p, newval, cmpval); +#else + return __sync_val_compare_and_swap(p, cmpval, newval); +#endif +} + +#ifdef WIN32 +#include +#define __clock __rdtsc +#else // WIN32 +static FORCEINLINE uint64_t __clock() { + uint32_t low, high; +#ifdef __x86_64 + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%rax", "%rbx", "%rcx", "%rdx" ); +#else + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%eax", "%ebx", "%ecx", "%edx" ); +#endif + __asm__ __volatile__ ("rdtsc" : "=a" (low), "=d" (high)); + return (uint64_t)high << 32 | low; +} + +#endif // !WIN32 + +#undef FORCEINLINE +#undef PRE_ALIGN +#undef POST_ALIGN diff --git a/examples/intrinsics/knc-i1x8unsafe_fast.h b/examples/intrinsics/knc-i1x8unsafe_fast.h new file mode 100644 index 00000000..05be27bd --- /dev/null +++ b/examples/intrinsics/knc-i1x8unsafe_fast.h @@ -0,0 +1,86 @@ +#define __ZMM64BIT__ +#include "knc-i1x8.h" + +/* the following tests fail because on KNC native vec8_i32 and vec8_float are 512 and not 256 bits in size.
+ * + * Using test compiler: Intel(r) SPMD Program Compiler (ispc), 1.4.5dev (build commit d68dbbc7bce74803 @ 20130919, LLVM 3.3) + * Using C/C++ compiler: icpc (ICC) 14.0.0 20130728 + * + */ + +/* knc-i1x8unsafe_fast.h fails: + * ---------------------------- +1 / 1206 tests FAILED compilation: + ./tests/ptr-assign-lhs-math-1.ispc +33 / 1206 tests FAILED execution: + ./tests/array-gather-simple.ispc + ./tests/array-gather-vary.ispc + ./tests/array-multidim-gather-scatter.ispc + ./tests/array-scatter-vary.ispc + ./tests/atomics-5.ispc + ./tests/atomics-swap.ispc + ./tests/cfor-array-gather-vary.ispc + ./tests/cfor-gs-improve-varying-1.ispc + ./tests/cfor-struct-gather-2.ispc + ./tests/cfor-struct-gather-3.ispc + ./tests/cfor-struct-gather.ispc + ./tests/gather-struct-vector.ispc + ./tests/global-array-4.ispc + ./tests/gs-improve-varying-1.ispc + ./tests/half-1.ispc + ./tests/half-3.ispc + ./tests/half.ispc + ./tests/launch-3.ispc + ./tests/launch-4.ispc + ./tests/masked-scatter-vector.ispc + ./tests/masked-struct-scatter-varying.ispc + ./tests/new-delete-6.ispc + ./tests/ptr-24.ispc + ./tests/ptr-25.ispc + ./tests/short-vec-15.ispc + ./tests/struct-gather-2.ispc + ./tests/struct-gather-3.ispc + ./tests/struct-gather.ispc + ./tests/struct-ref-lvalue.ispc + ./tests/struct-test-118.ispc + ./tests/struct-vary-index-expr.ispc + ./tests/typedef-2.ispc + ./tests/vector-varying-scatter.ispc +*/ + +/* knc-i1x8.h fails: + * ---------------------------- +1 / 1206 tests FAILED compilation: + ./tests/ptr-assign-lhs-math-1.ispc +3 / 1206 tests FAILED execution: + ./tests/half-1.ispc + ./tests/half-3.ispc + ./tests/half.ispc +*/ + +/* knc-i1x8.h fails: + * ---------------------------- +1 / 1206 tests FAILED compilation: + ./tests/ptr-assign-lhs-math-1.ispc +4 / 1206 tests FAILED execution: + ./tests/half-1.ispc + ./tests/half-3.ispc + ./tests/half.ispc + ./tests/test-141.ispc +*/ + +/* generic-16.h fails: (from which knc-i1x8.h & knc-i1x16.h are derived) + * ---------------------------- +1 / 1206 tests FAILED compilation: + ./tests/ptr-assign-lhs-math-1.ispc +6 / 1206 tests FAILED execution: + ./tests/func-overload-max.ispc + ./tests/half-1.ispc + ./tests/half-3.ispc + ./tests/half.ispc + ./tests/test-141.ispc + ./tests/test-143.ispc +*/ + + + diff --git a/examples/intrinsics/knc.h b/examples/intrinsics/knc.h index bf383c88..8baef8cb 100644 --- a/examples/intrinsics/knc.h +++ b/examples/intrinsics/knc.h @@ -1511,6 +1511,22 @@ static FORCEINLINE int64_t __count_trailing_zeros_i64(const __vec1_i64 mask) { // reductions /////////////////////////////////////////////////////////////////////////// +static FORCEINLINE int16_t __reduce_add_i8(__vec16_i8 v) { + // TODO: improve this! + int16_t ret = 0; + for (int i = 0; i < 16; ++i) + ret += v.v[i]; + return ret; +} + +static FORCEINLINE int32_t __reduce_add_i16(__vec16_i16 v) { + // TODO: improve this!
+ int32_t ret = 0; + for (int i = 0; i < 16; ++i) + ret += v.v[i]; + return ret; +} + static FORCEINLINE uint32_t __reduce_add_i32(__vec16_i32 v) { return _mm512_reduce_add_epi32(v); } @@ -2105,9 +2121,24 @@ static FORCEINLINE uint64_t __atomic_cmpxchg(uint64_t *p, uint64_t cmpval, #endif } +#ifdef WIN32 +#include +#define __clock __rdtsc +#else // WIN32 +static FORCEINLINE uint64_t __clock() { + uint32_t low, high; +#ifdef __x86_64 + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%rax", "%rbx", "%rcx", "%rdx" ); +#else + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%eax", "%ebx", "%ecx", "%edx" ); +#endif + __asm__ __volatile__ ("rdtsc" : "=a" (low), "=d" (high)); + return (uint64_t)high << 32 | low; +} +#endif // !WIN32 + #undef FORCEINLINE #undef PRE_ALIGN #undef POST_ALIGN - - - diff --git a/examples/intrinsics/knc2x.h b/examples/intrinsics/knc2x.h index 0041a6c9..a1b1fc9d 100644 --- a/examples/intrinsics/knc2x.h +++ b/examples/intrinsics/knc2x.h @@ -1607,6 +1607,9 @@ static FORCEINLINE int64_t __count_leading_zeros_i64(uint64_t v) { /////////////////////////////////////////////////////////////////////////// // reductions +REDUCE_ADD(int16_t, __vec32_i8, __reduce_add_int8) +REDUCE_ADD(int32_t, __vec32_i16, __reduce_add_int16) + static FORCEINLINE float __reduce_add_float(__vec32_f v) { return _mm512_reduce_add_ps(v.v1) + _mm512_reduce_add_ps(v.v2); } @@ -2052,7 +2055,24 @@ static FORCEINLINE void __aos_to_soa4_float(float *ptr, __vec32_f *out0, __vec32 } */ +#ifdef WIN32 +#include +#define __clock __rdtsc +#else // WIN32 +static FORCEINLINE uint64_t __clock() { + uint32_t low, high; +#ifdef __x86_64 + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%rax", "%rbx", "%rcx", "%rdx" ); +#else + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%eax", "%ebx", "%ecx", "%edx" ); +#endif + __asm__ __volatile__ ("rdtsc" : "=a" (low), "=d" (high)); + return (uint64_t)high << 32 | low; +} +#endif // !WIN32 + #undef FORCEINLINE #undef PRE_ALIGN #undef POST_ALIGN - diff --git a/examples/intrinsics/sse4.h b/examples/intrinsics/sse4.h index d4739d61..ff00d920 100644 --- a/examples/intrinsics/sse4.h +++ b/examples/intrinsics/sse4.h @@ -2528,6 +2528,22 @@ static FORCEINLINE int64_t __count_leading_zeros_i64(uint64_t v) { /////////////////////////////////////////////////////////////////////////// // reductions +static FORCEINLINE int16_t __reduce_add_int8(__vec4_i8 v) { + // TODO: improve + int16_t ret = 0; + for (int i = 0; i < 4; ++i) + ret += __extract_element(v, i); + return ret; +} + +static FORCEINLINE int32_t __reduce_add_int16(__vec4_i16 v) { + // TODO: improve + int32_t ret = 0; + for (int i = 0; i < 4; ++i) + ret += __extract_element(v, i); + return ret; +} + static FORCEINLINE float __reduce_add_float(__vec4_f v) { float r = bits_as_float(_mm_extract_ps(v.v, 0)); r += bits_as_float(_mm_extract_ps(v.v, 1)); @@ -3984,6 +4000,22 @@ static FORCEINLINE uint64_t __atomic_cmpxchg(uint64_t *p, uint64_t cmpval, #endif } +#ifdef WIN32 +#include +#define __clock __rdtsc +#else // WIN32 +static FORCEINLINE uint64_t __clock() { + uint32_t low, high; +#ifdef __x86_64 + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%rax", "%rbx", "%rcx", "%rdx" ); +#else + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%eax", "%ebx", "%ecx", "%edx" ); +#endif + __asm__ __volatile__ ("rdtsc" : "=a" (low), "=d" (high)); + return (uint64_t)high << 32 | low; +} +#endif // !WIN32 + #undef FORCEINLINE - - diff --git a/examples/mandelbrot/mandelbrot.cpp 
b/examples/mandelbrot/mandelbrot.cpp index 7e73768f..d2bebb96 100644 --- a/examples/mandelbrot/mandelbrot.cpp +++ b/examples/mandelbrot/mandelbrot.cpp @@ -109,7 +109,7 @@ int main() { minSerial = std::min(minSerial, dt); } - printf("[mandelbrot serial]:\t\t[%.3f] millon cycles\n", minSerial); + printf("[mandelbrot serial]:\t\t[%.3f] million cycles\n", minSerial); writePPM(buf, width, height, "mandelbrot-serial.ppm"); printf("\t\t\t\t(%.2fx speedup from ISPC)\n", minSerial/minISPC); diff --git a/examples/mandelbrot_tasks/Makefile b/examples/mandelbrot_tasks/Makefile index 7e83e618..1a565ffd 100644 --- a/examples/mandelbrot_tasks/Makefile +++ b/examples/mandelbrot_tasks/Makefile @@ -1,7 +1,7 @@ -EXAMPLE=mandelbrot -CPP_SRC=mandelbrot.cpp mandelbrot_serial.cpp -ISPC_SRC=mandelbrot.ispc +EXAMPLE=mandelbrot_tasks +CPP_SRC=mandelbrot_tasks.cpp mandelbrot_tasks_serial.cpp +ISPC_SRC=mandelbrot_tasks.ispc ISPC_IA_TARGETS=sse2,sse4-x2,avx-x2 ISPC_ARM_TARGETS=neon diff --git a/examples/mandelbrot_tasks/mandelbrot.cpp b/examples/mandelbrot_tasks/mandelbrot_tasks.cpp similarity index 97% rename from examples/mandelbrot_tasks/mandelbrot.cpp rename to examples/mandelbrot_tasks/mandelbrot_tasks.cpp index a01cfe43..698daf0f 100644 --- a/examples/mandelbrot_tasks/mandelbrot.cpp +++ b/examples/mandelbrot_tasks/mandelbrot_tasks.cpp @@ -42,7 +42,7 @@ #include #include #include "../timing.h" -#include "mandelbrot_ispc.h" +#include "mandelbrot_tasks_ispc.h" using namespace ispc; extern void mandelbrot_serial(float x0, float y0, float x1, float y1, @@ -137,7 +137,7 @@ int main(int argc, char *argv[]) { minSerial = std::min(minSerial, dt); } - printf("[mandelbrot serial]:\t\t[%.3f] millon cycles\n", minSerial); + printf("[mandelbrot serial]:\t\t[%.3f] million cycles\n", minSerial); writePPM(buf, width, height, "mandelbrot-serial.ppm"); printf("\t\t\t\t(%.2fx speedup from ISPC + tasks)\n", minSerial/minISPC); diff --git a/examples/mandelbrot_tasks/mandelbrot.ispc b/examples/mandelbrot_tasks/mandelbrot_tasks.ispc similarity index 100% rename from examples/mandelbrot_tasks/mandelbrot.ispc rename to examples/mandelbrot_tasks/mandelbrot_tasks.ispc diff --git a/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj b/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj index b92de72f..3a8fca79 100644 --- a/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj +++ b/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj @@ -21,7 +21,7 @@ {E80DA7D4-AB22-4648-A068-327307156BE6} Win32Proj - mandelbrot + mandelbrot_tasks @@ -65,22 +65,22 @@ true $(ProjectDir)..\..;$(ExecutablePath) - mandelbrot + mandelbrot_tasks true $(ProjectDir)..\..;$(ExecutablePath) - mandelbrot + mandelbrot_tasks false $(ProjectDir)..\..;$(ExecutablePath) - mandelbrot + mandelbrot_tasks false $(ProjectDir)..\..;$(ExecutablePath) - mandelbrot + mandelbrot_tasks @@ -153,12 +153,12 @@ - - + + - + Document ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2 diff --git a/examples/mandelbrot_tasks/mandelbrot_serial.cpp b/examples/mandelbrot_tasks/mandelbrot_tasks_serial.cpp similarity index 100% rename from examples/mandelbrot_tasks/mandelbrot_serial.cpp rename to examples/mandelbrot_tasks/mandelbrot_tasks_serial.cpp diff --git a/examples/noise/Makefile b/examples/noise/Makefile index 8cc72689..58d1cf3b 100644 --- a/examples/noise/Makefile +++ b/examples/noise/Makefile @@ -1,6 +1,6 @@ EXAMPLE=noise -CPP_SRC=$(EXAMPLE).cpp $(EXAMPLE)_serial.cpp +CPP_SRC=noise.cpp noise_serial.cpp 
ISPC_SRC=noise.ispc ISPC_IA_TARGETS=sse2,sse4,avx-x2 ISPC_ARM_TARGETS=neon diff --git a/examples/noise/noise.cpp b/examples/noise/noise.cpp index 58552ce3..123f98c7 100644 --- a/examples/noise/noise.cpp +++ b/examples/noise/noise.cpp @@ -106,7 +106,7 @@ int main() { minSerial = std::min(minSerial, dt); } - printf("[noise serial]:\t\t\t[%.3f] millon cycles\n", minSerial); + printf("[noise serial]:\t\t\t[%.3f] million cycles\n", minSerial); writePPM(buf, width, height, "noise-serial.ppm"); printf("\t\t\t\t(%.2fx speedup from ISPC)\n", minSerial/minISPC); diff --git a/examples/perf.py b/examples/perf.py deleted file mode 100755 index 8503bd8c..00000000 --- a/examples/perf.py +++ /dev/null @@ -1,262 +0,0 @@ -#!/usr/bin/python -# // Author: Filippov Ilia - -from optparse import OptionParser -import sys -import os -import operator -import time -import glob -import string -import platform - -def build_test(): - global build_log - global is_windows - if is_windows == False: - os.system("make clean >> "+build_log) - return os.system("make >> "+build_log+" 2>> "+build_log) - else: - os.system("msbuild /t:clean >> " + build_log) - return os.system("msbuild /V:m /p:Platform=x64 /p:Configuration=Release /p:TargetDir=.\ /t:rebuild >> " + build_log) - -def execute_test(command): - global perf_temp - r = 0 - if os.path.exists(perf_temp): - os.remove(perf_temp) - for k in range(int(options.number)): - r = r + os.system(command) - return r - -#gathers all tests results and made an item test from answer structure -def run_test(command, c1, c2, test): - global perf_temp - if build_test() != 0: - sys.stdout.write("ERROR: Compilation fails\n") - return - if execute_test(command) != 0: - sys.stdout.write("ERROR: Execution fails\n") - return - tasks = [] #list of results with tasks, it will be test[2] - ispc = [] #list of results without tasks, it will be test[1] - j = 1 - for line in open(perf_temp): # we take test output - if "speedup" in line: # we are interested only in lines with speedup - if j == c1: # we are interested only in lines with c1 numbers - sys.stdout.write(line) - line = line.expandtabs(0) - line = line.replace("("," ") - line = line.split(",") - for i in range(len(line)): - subline = line[i].split(" ") - number = float(subline[1][:-1]) - if "speedup from ISPC + tasks" in line[i]: - tasks.append(number) - else: - ispc.append(number) - c1 = c1 + c2 - j+=1 - test[1] = test[1] + ispc - test[2] = test[2] + tasks - - -def cpu_get(): - p = open("/proc/stat", 'r') - cpu = p.readline() - p.close() - cpu = cpu.split(" ") - cpu_usage = (int(cpu[2]) + int(cpu[3]) + int(cpu[4])) - cpu_all = cpu_usage + int(cpu[5]) - return [cpu_usage, cpu_all] - -#returns cpu_usage -def cpu_check(): - if is_windows == False: - cpu1 = cpu_get() - time.sleep(1) - cpu2 = cpu_get() - cpu_percent = (float(cpu1[0] - cpu2[0])/float(cpu1[1] - cpu2[1]))*100 - else: - os.system("wmic cpu get loadpercentage /value > cpu_temp") - c = open("cpu_temp", 'r') - c_lines = c.readlines() - c.close() - os.remove("cpu_temp") - t = "0" - for i in c_lines[2]: - if i.isdigit(): - t = t + i - cpu_percent = int(t) - return cpu_percent - -#returns geomean of list -def geomean(par): - temp = 1 - l = len(par) - for i in range(l): - temp = temp * par[i] - temp = temp ** (1.0/l) - return round(temp, 2) - -#takes an answer struct and print it. 
-#answer struct: list answer contains lists test -#test[0] - name of test -#test[1] - list of results without tasks -#test[2] - list of results with tasks -#test[1] or test[2] may be empty -def print_answer(answer): - sys.stdout.write("Name of test:\t\tISPC:\tISPC + tasks:\n") - max_t = [0,0] - diff_t = [0,0] - geomean_t = [0,0] - list_of_max = [[],[]] - for i in range(len(answer)): - for t in range(1,3): - if len(answer[i][t]) == 0: - max_t[t-1] = "n/a" - diff_t[t-1] = "n/a" - else: - list_of_max[t-1].append(max(answer[i][t])) - max_t[t-1] = str(max(answer[i][t])) - diff_t[t-1] = str(max(answer[i][t]) - min(answer[i][t])) - sys.stdout.write("%s:\n" % answer[i][0]) - sys.stdout.write("\t\tmax:\t%s\t%s\n" % (max_t[0], max_t[1])) - sys.stdout.write("\t\tdiff:\t%s\t%s\n" % (diff_t[0], diff_t[1])) - - geomean_t[0] = geomean(list_of_max[0]) - geomean_t[1] = geomean(list_of_max[1]) - sys.stdout.write("---------------------------------------------\n") - sys.stdout.write("Geomean:\t\t%s\t%s\n" % (geomean_t[0], geomean_t[1])) - -###Main### -# parsing options -parser = OptionParser() -parser.add_option('-n', '--number', dest='number', - help='number of repeats', default="3") -parser.add_option('-c', '--config', dest='config', - help='config file of tests', default="./perf.ini") -parser.add_option('-p', '--path', dest='path', - help='path to examples directory', default="./") -(options, args) = parser.parse_args() - -global is_windows -is_windows = (platform.system() == 'Windows' or - 'CYGWIN_NT' in platform.system()) - -# save corrent path -pwd = os.getcwd() -pwd = pwd + os.sep -if is_windows: - pwd = "..\\" - -# check if cpu usage is low now -cpu_percent = cpu_check() -if cpu_percent > 20: - sys.stdout.write("Warning: CPU Usage is very high.\n") - sys.stdout.write("Close other applications.\n") - -# check that required compilers exist -PATH_dir = string.split(os.getenv("PATH"), os.pathsep) -compiler_exists = False -ref_compiler_exists = False -if is_windows == False: - compiler = "ispc" - ref_compiler = "g++" -else: - compiler = "ispc.exe" - ref_compiler = "cl.exe" -for counter in PATH_dir: - if os.path.exists(counter + os.sep + compiler): - compiler_exists = True - if os.path.exists(counter + os.sep + ref_compiler): - ref_compiler_exists = True -if not compiler_exists: - sys.stderr.write("Fatal error: ISPC compiler not found.\n") - sys.stderr.write("Added path to ispc compiler to your PATH variable.\n") - sys.exit() -if not ref_compiler_exists: - sys.stderr.write("Fatal error: reference compiler %s not found.\n" % ref_compiler) - sys.stderr.write("Added path to %s compiler to your PATH variable.\n" % ref_compiler) - sys.exit() - -# checks that config file exists -path_config = os.path.normpath(options.config) -if os.path.exists(path_config) == False: - sys.stderr.write("Fatal error: config file not found: %s.\n" % options.config) - sys.stderr.write("Set path to your config file in --config.\n") - sys.exit() - -# read lines from config file except comments -f = open(path_config, 'r') -f_lines = f.readlines() -f.close() -lines =[] -for i in range(len(f_lines)): - if f_lines[i][0] != "%": - lines.append(f_lines[i]) -length = len(lines) - -# prepare build.log and perf_temp files -global build_log -build_log = pwd + "build.log" -if is_windows == False: - if os.path.exists(build_log): - os.remove(build_log) -else: - if os.path.exists("build.log"): - os.remove("build.log") -global perf_temp -perf_temp = pwd + "perf_temp" - -i = 0 -answer = [] -sys.stdout.write("Okey go go go!\n\n") -# loop for all tests 
-while i < length-2: - # we read name of test - sys.stdout.write("%s" % lines[i]) - test = [lines[i][:-1],[],[]] - # read location of test - folder = lines[i+1] - folder = folder[:-1] - folder = os.path.normpath(options.path + os.sep + folder) - # check that test exists - if os.path.exists(folder) == False: - sys.stdout.write("Fatal error: Can't find test %s. Your path is: \"%s\".\n" % (lines[i][:-1], options.path)) - sys.stdout.write("Change current location to /examples or set path to /examples in --path.\n") - exit(0) - os.chdir(folder) - # read parameters of test - command = lines[i+2] - command = command[:-1] - if is_windows == False: - command = "./"+command + " >> " + perf_temp - else: - command = "x64\\Release\\"+command + " >> " + perf_temp - # parsing config parameters - next_line = lines[i+3] - if next_line[0] == "!": # we should take only one part of test output - R = next_line.split(' ') - c1 = int(R[1]) #c1 is a number of string which we want to use in test output - c2 = int(R[2]) #c2 is total number of strings in test output - i = i+1 - else: - c1 = 1 - c2 = 1 - next_line = lines[i+3] - if next_line[0] == "^": #we should concatenate result of this test with previous one - run_test(command, c1, c2, answer[len(answer)-1]) - i = i+1 - else: #we run this test and append it's result to answer structure - run_test(command, c1, c2, test) - answer.append(test) - # preparing next loop iteration - os.chdir(pwd) - i+=4 - -# delete temp file -if os.path.exists(perf_temp): - os.remove(perf_temp) -#print collected answer -print_answer(answer) diff --git a/examples/sort/sort.cpp b/examples/sort/sort.cpp index 1d05b247..f5e4264a 100644 --- a/examples/sort/sort.cpp +++ b/examples/sort/sort.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2013, Intel Corporation + Copyright (c) 2013, Durham University All rights reserved. Redistribution and use in source and binary forms, with or without @@ -13,7 +13,7 @@ notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - * Neither the name of Intel Corporation nor the names of its + * Neither the name of Durham University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
@@ -86,7 +86,8 @@ int main (int argc, char *argv[]) tISPC1 += get_elapsed_mcycles(); - progressbar (i, m); + if (argc != 3) + progressbar (i, m); } printf("[sort ispc]:\t[%.3f] million cycles\n", tISPC1); @@ -103,10 +104,11 @@ int main (int argc, char *argv[]) tISPC2 += get_elapsed_mcycles(); - progressbar (i, m); + if (argc != 3) + progressbar (i, m); } - - printf("[sort ispc+tasks]:\t[%.3f] million cycles\n", tISPC2); + + printf("[sort ispc + tasks]:\t[%.3f] million cycles\n", tISPC2); srand (0); @@ -120,13 +122,13 @@ int main (int argc, char *argv[]) tSerial += get_elapsed_mcycles(); - progressbar (i, m); + if (argc != 3) + progressbar (i, m); } printf("[sort serial]:\t\t[%.3f] million cycles\n", tSerial); - printf("\t\t\t\t(%.2fx speedup from ISPC serial)\n", tSerial/tISPC1); - printf("\t\t\t\t(%.2fx speedup from ISPC with tasks)\n", tSerial/tISPC2); + printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n", tSerial/tISPC1, tSerial/tISPC2); delete code; delete order; diff --git a/examples/sort/sort.ispc b/examples/sort/sort.ispc index 65df4736..25ea90f4 100644 --- a/examples/sort/sort.ispc +++ b/examples/sort/sort.ispc @@ -1,5 +1,5 @@ /* - Copyright (c) 2013, Intel Corporation + Copyright (c) 2013, Durham University All rights reserved. Redistribution and use in source and binary forms, with or without @@ -13,7 +13,7 @@ notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - * Neither the name of Intel Corporation nor the names of its + * Neither the name of Durham University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. @@ -172,7 +172,7 @@ task void bumpup (uniform int h[], uniform int g[]) static void prefix_sum (uniform int num, uniform int h[]) { - uniform int * uniform g = uniform new int [num+1]; + uniform int * uniform g = uniform new uniform int [num+1]; uniform int i; launch[num] addup (h, g+1); @@ -191,9 +191,9 @@ export void sort_ispc (uniform int n, uniform unsigned int code[], uniform int o uniform int num = ntasks < 1 ? num_cores () : ntasks; uniform int span = n / num; uniform int hsize = 256*programCount*num; - uniform int * uniform hist = uniform new int [hsize]; - uniform int64 * uniform pair = uniform new int64 [n]; - uniform int64 * uniform temp = uniform new int64 [n]; + uniform int * uniform hist = uniform new uniform int [hsize]; + uniform int64 * uniform pair = uniform new uniform int64 [n]; + uniform int64 * uniform temp = uniform new uniform int64 [n]; uniform int pass, i; #if DEBUG diff --git a/examples/sort/sort_serial.cpp b/examples/sort/sort_serial.cpp index ba955c77..38bbdda6 100644 --- a/examples/sort/sort_serial.cpp +++ b/examples/sort/sort_serial.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2013, Intel Corporation + Copyright (c) 2013, Durham University All rights reserved. Redistribution and use in source and binary forms, with or without @@ -13,7 +13,7 @@ notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - * Neither the name of Intel Corporation nor the names of its + * Neither the name of Durham University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
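// Illustrative aside (not part of the patch): the "million cycles" figures printed by the examples above come from an rdtsc-style cycle counter like the __clock() helpers this patch adds to the intrinsics headers. A hedged, self-contained sketch of the same idea using the compiler intrinsic instead of inline asm; the function name below is illustrative only.
#include <stdint.h>
#if defined(_MSC_VER)
#include <intrin.h>
#else
#include <x86intrin.h>
#endif

static inline uint64_t read_cycle_counter(void) {
    // Read the time-stamp counter; the __clock() helpers above additionally
    // execute cpuid first to serialize instruction ordering before rdtsc.
    return __rdtsc();
}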
diff --git a/examples/stencil/stencil.cpp b/examples/stencil/stencil.cpp index 9d5b3ee6..593d901f 100644 --- a/examples/stencil/stencil.cpp +++ b/examples/stencil/stencil.cpp @@ -130,7 +130,7 @@ int main() { minTimeSerial = std::min(minTimeSerial, dt); } - printf("[stencil serial]:\t\t[%.3f] millon cycles\n", minTimeSerial); + printf("[stencil serial]:\t\t[%.3f] million cycles\n", minTimeSerial); printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n", minTimeSerial / minTimeISPC, minTimeSerial / minTimeISPCTasks); diff --git a/examples/tasksys.cpp b/examples/tasksys.cpp index b4ced5c7..c9c2fa7b 100644 --- a/examples/tasksys.cpp +++ b/examples/tasksys.cpp @@ -365,7 +365,7 @@ lAtomicCompareAndSwap32(volatile int32_t *v, int32_t newValue, int32_t oldValue) static inline int32_t lAtomicAdd(volatile int32_t *v, int32_t delta) { #ifdef ISPC_IS_WINDOWS - return InterlockedAdd((volatile LONG *)v, delta); + return InterlockedExchangeAdd((volatile LONG *)v, delta)+delta; #else return __sync_fetch_and_add(v, delta); #endif diff --git a/examples/volume_rendering/volume.cpp b/examples/volume_rendering/volume.cpp index 7d8b8e99..458cd407 100644 --- a/examples/volume_rendering/volume.cpp +++ b/examples/volume_rendering/volume.cpp @@ -204,7 +204,7 @@ int main(int argc, char *argv[]) { minSerial = std::min(minSerial, dt); } - printf("[volume serial]:\t\t[%.3f] millon cycles\n", minSerial); + printf("[volume serial]:\t\t[%.3f] million cycles\n", minSerial); writePPM(image, width, height, "volume-serial.ppm"); printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n", diff --git a/expr.cpp b/expr.cpp index fc3d295a..614cb5e5 100644 --- a/expr.cpp +++ b/expr.cpp @@ -1911,6 +1911,40 @@ lEmitLogicalOp(BinaryExpr::Op op, Expr *arg0, Expr *arg1, } +/* Returns true if shifting right by the given amount will lead to + inefficient code. (Assumes x86 target. May also warn inaccurately if + later optimization simplify the shift amount more than we are able to + see at this point.) */ +static bool +lIsDifficultShiftAmount(Expr *expr) { + // Uniform shifts (of uniform values) are no problem. + if (expr->GetType()->IsVaryingType() == false) + return false; + + ConstExpr *ce = dynamic_cast(expr); + if (ce) { + // If the shift is by a constant amount, *and* it's the same amount + // in all vector lanes, we're in good shape. + uint32_t amount[ISPC_MAX_NVEC]; + int count = ce->GetValues(amount); + for (int i = 1; i < count; ++i) + if (amount[i] != amount[0]) + return true; + return false; + } + + TypeCastExpr *tce = dynamic_cast(expr); + if (tce && tce->expr) { + // Finally, if the shift amount is given by a uniform value that's + // been smeared out into a varying, we have the same shift for all + // lanes and are also in good shape. 
+ return (tce->expr->GetType()->IsUniformType() == false); + } + + return true; +} + + llvm::Value * BinaryExpr::GetValue(FunctionEmitContext *ctx) const { if (!arg0 || !arg1) { @@ -1951,9 +1985,8 @@ BinaryExpr::GetValue(FunctionEmitContext *ctx) const { case BitAnd: case BitXor: case BitOr: { - if (op == Shr && arg1->GetType()->IsVaryingType() && - dynamic_cast(arg1) == NULL) - PerformanceWarning(pos, "Shift right is extremely inefficient for " + if (op == Shr && lIsDifficultShiftAmount(arg1)) + PerformanceWarning(pos, "Shift right is inefficient for " "varying shift amounts."); return lEmitBinaryBitOp(op, value0, value1, arg0->GetType()->IsUnsignedType(), ctx); @@ -2207,6 +2240,49 @@ lConstFoldBinaryIntOp(ConstExpr *constArg0, ConstExpr *constArg1, } +/* Returns true if the given arguments (which are assumed to be the + operands of a divide) represent a divide that can be performed by one of + the __fast_idiv functions. + */ +static bool +lCanImproveVectorDivide(Expr *arg0, Expr *arg1, int *divisor) { + const Type *type = arg0->GetType(); + if (!type) + return false; + + // The value being divided must be an int8/16/32. + if (!(Type::EqualIgnoringConst(type, AtomicType::VaryingInt8) || + Type::EqualIgnoringConst(type, AtomicType::VaryingUInt8) || + Type::EqualIgnoringConst(type, AtomicType::VaryingInt16) || + Type::EqualIgnoringConst(type, AtomicType::VaryingUInt16) || + Type::EqualIgnoringConst(type, AtomicType::VaryingInt32) || + Type::EqualIgnoringConst(type, AtomicType::VaryingUInt32))) + return false; + + // The divisor must be the same compile-time constant value for all of + // the vector lanes. + ConstExpr *ce = dynamic_cast(arg1); + if (!ce) + return false; + int64_t div[ISPC_MAX_NVEC]; + int count = ce->GetValues(div); + for (int i = 1; i < count; ++i) + if (div[i] != div[0]) + return false; + *divisor = div[0]; + + // And finally, the divisor must be >= 2 and <128 (for 8-bit divides), + // and <256 otherwise. + if (*divisor < 2) + return false; + if (Type::EqualIgnoringConst(type, AtomicType::VaryingInt8) || + Type::EqualIgnoringConst(type, AtomicType::VaryingUInt8)) + return *divisor < 128; + else + return *divisor < 256; +} + + Expr * BinaryExpr::Optimize() { if (arg0 == NULL || arg1 == NULL) @@ -2269,6 +2345,32 @@ BinaryExpr::Optimize() { } } + int divisor; + if (op == Div && lCanImproveVectorDivide(arg0, arg1, &divisor)) { + Debug(pos, "Improving vector divide by constant %d", divisor); + + std::vector idivFuns; + m->symbolTable->LookupFunction("__fast_idiv", &idivFuns); + if (idivFuns.size() == 0) { + Warning(pos, "Couldn't find __fast_idiv to optimize integer divide. " + "Are you compiling with --nostdlib?"); + return this; + } + + Expr *idivSymExpr = new FunctionSymbolExpr("__fast_idiv", idivFuns, pos); + ExprList *args = new ExprList(arg0, pos); + args->exprs.push_back(new ConstExpr(AtomicType::UniformInt32, divisor, arg1->pos)); + Expr *idivCall = new FunctionCallExpr(idivSymExpr, args, pos); + + idivCall = ::TypeCheck(idivCall); + if (idivCall == NULL) + return NULL; + + Assert(Type::EqualIgnoringConst(GetType(), idivCall->GetType())); + idivCall = new TypeCastExpr(GetType(), idivCall, pos); + return ::Optimize(idivCall); + } + // From here on out, we're just doing constant folding, so if both args // aren't constants then we're done... 
if (constArg0 == NULL || constArg1 == NULL) @@ -3021,6 +3123,14 @@ static llvm::Value * lEmitVaryingSelect(FunctionEmitContext *ctx, llvm::Value *test, llvm::Value *expr1, llvm::Value *expr2, const Type *type) { +#if 0 // !defined(LLVM_3_1) + // Though it should be equivalent, this seems to cause non-trivial + // performance regressions versus the below. This may be related to + // http://llvm.org/bugs/show_bug.cgi?id=16941. + if (test->getType() != LLVMTypes::Int1VectorType) + test = ctx->TruncInst(test, LLVMTypes::Int1VectorType); + return ctx->SelectInst(test, expr1, expr2, "select"); +#else llvm::Value *resultPtr = ctx->AllocaInst(expr1->getType(), "selectexpr_tmp"); // Don't need to worry about masking here ctx->StoreInst(expr2, resultPtr); @@ -3029,6 +3139,7 @@ lEmitVaryingSelect(FunctionEmitContext *ctx, llvm::Value *test, PointerType::GetUniform(type)->LLVMType(g->ctx)); ctx->StoreInst(expr1, resultPtr, test, type, PointerType::GetUniform(type)); return ctx->LoadInst(resultPtr, "selectexpr_final"); +#endif // !LLVM_3_1 } @@ -6059,9 +6170,9 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) - // If we have a bool vector of i32 elements, first truncate - // down to a single bit + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) + // If we have a bool vector of non-i1 elements, first + // truncate down to a single bit. exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); // And then do an unisgned int->float cast cast = ctx->CastInst(llvm::Instruction::UIToFP, // unsigned int @@ -6103,8 +6214,8 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) - // truncate i32 bool vector values to i1s + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) + // truncate bool vector values to i1s exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); cast = ctx->CastInst(llvm::Instruction::UIToFP, // unsigned int to double exprVal, targetType, cOpName); @@ -6141,7 +6252,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); cast = ctx->ZExtInst(exprVal, targetType, cOpName); break; @@ -6177,7 +6288,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); cast = ctx->ZExtInst(exprVal, targetType, cOpName); break; @@ -6219,7 +6330,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); cast = ctx->ZExtInst(exprVal, targetType, cOpName); break; @@ -6259,7 +6370,7 @@ 
lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); cast = ctx->ZExtInst(exprVal, targetType, cOpName); break; @@ -6305,7 +6416,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); cast = ctx->ZExtInst(exprVal, targetType, cOpName); break; @@ -6345,7 +6456,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); cast = ctx->ZExtInst(exprVal, targetType, cOpName); break; @@ -6391,7 +6502,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); cast = ctx->ZExtInst(exprVal, targetType, cOpName); break; @@ -6429,7 +6540,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); cast = ctx->ZExtInst(exprVal, targetType, cOpName); break; @@ -6523,12 +6634,12 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, if (fromType->IsUniformType()) { if (toType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) { - // extend out to i32 bool values from i1 here. then we'll - // turn into a vector below, the way it does for everyone - // else... + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) { + // extend out to a bool as an i8/i16/i32 from the i1 here. + // Then we'll turn that into a vector below, the way it + // does for everyone else... cast = ctx->SExtInst(cast, LLVMTypes::BoolVectorType->getElementType(), - LLVMGetName(cast, "to_i32bool")); + LLVMGetName(cast, "to_i_bool")); } } else diff --git a/fail_db.txt b/fail_db.txt new file mode 100644 index 00000000..31db9961 --- /dev/null +++ b/fail_db.txt @@ -0,0 +1,951 @@ +% List of known fails. +% The list is unordered and contains information about commonly used platforms / configurations. +% Our goal is to maintain this list for Linux, MacOS and Windows with reasonably new compilers. +% Note that it's important which C++ compiler was used. For example, gcc 4.4 is known to produce +% considerably more fails with generic targets than gcc 4.7 or later. +% Using old compilers (gcc 4.4 is considered to be relatively old) may cause LLVM bugs. +% To avoid them, you can use LLVM selfbuild.
+% +./tests/masked-scatter-vector.ispc runfail x86-64 sse2-i32x4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/atomics-13.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-10.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-11.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-13.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-5.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-6.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 g++4.4 -O2 * +./tests/atomics-13.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-10.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-11.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-13.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-5.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-6.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-null-4.ispc runfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-null-5.ispc runfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-null-6.ispc runfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/atomics-13.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-10.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-11.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-13.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-5.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-6.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-null-4.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-null-5.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-null-6.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/atomics-13.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-10.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-11.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-13.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-5.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-6.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/avg-down-int8.ispc compfail x86 avx1-i32x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/avg-up-int8.ispc compfail x86 avx1-i32x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/avg-down-int8.ispc compfail x86-64 avx1-i32x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/avg-up-int8.ispc compfail x86-64 avx1-i32x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/atomics-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/atomics-6.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/atomics-swap.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/atomics-varyingptr-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/atomics-varyingptr-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/avg-down-uint16.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/avg-down-uint8.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/avg-up-uint16.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * 
+./tests/avg-up-uint8.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/broadcast-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/broadcast-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/count-leading-trailing-zeros-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/count-leading-trailing-zeros-5.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/exclusive-scan-add-10.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/exclusive-scan-add-8.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/exclusive-scan-add-9.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/exclusive-scan-and-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/exclusive-scan-or-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-null-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-null-3.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-null-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-null-5.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-null-6.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-uniform-7.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-uniform-8.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-uniform-9.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-varying-5.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-varying-7.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-varying-8.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-varying-9.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/half-3.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/idiv.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/int64-max-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/int64-max.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/int64-min-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/int64-min.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/local-atomics-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/local-atomics-11.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/local-atomics-12.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/local-atomics-13.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/local-atomics-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/local-atomics-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/local-atomics-5.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/local-atomics-9.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/local-atomics-swap.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/local-atomics-varyingptr-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/local-atomics-varyingptr-3.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/local-atomics-varyingptr-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/new-delete-6.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/phi-opts-3.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/phi-opts-4.ispc 
runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/popcnt-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/popcnt-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/popcnt-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/ptr-15.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-add-int16-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-add-int16.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-10.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-12.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-13.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-5.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-7.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-8.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/rotate-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/rotate-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/rotate-3.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/rotate-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/rotate-6.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/rotate.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/short-vec-14.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/shuffle-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/shuffle-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/shuffle-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/shuffle-flatten.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/shuffle.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/shuffle2-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/shuffle2-11.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/shuffle2-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/shuffle2-3.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/shuffle2-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/shuffle2-5.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/shuffle2-6.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/shuffle2-7.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/shuffle2-8.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/shuffle2-9.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/shuffle2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/soa-27.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/soa-28.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/test-128.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/test-129.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/test-130.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/test-57.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/uint64-max-1.ispc runfail x86-64 generic-4 Linux 
LLVM 3.3 g++4.4 -O2 * +./tests/uint64-max.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/uint64-min-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/uint64-min.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/short-vec-8.ispc compfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/ptr-15.ispc runfail x86-64 generic-16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/test-141.ispc runfail x86-64 generic-16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/test-143.ispc runfail x86-64 generic-16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/avg-down-int8.ispc compfail x86 avx1.1-i32x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/avg-up-int8.ispc compfail x86 avx1.1-i32x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/avg-down-int8.ispc compfail x86-64 avx1.1-i32x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/avg-up-int8.ispc compfail x86-64 avx1.1-i32x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/test-141.ispc runfail x86 avx2-i32x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/test-141.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-null-4.ispc runfail x86 sse4-i8x16 Linux LLVM 3.4 g++4.4 -O2 * +./tests/funcptr-null-5.ispc runfail x86 sse4-i8x16 Linux LLVM 3.4 g++4.4 -O2 * +./tests/funcptr-null-6.ispc runfail x86 sse4-i8x16 Linux LLVM 3.4 g++4.4 -O2 * +./tests/funcptr-null-4.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.4 g++4.4 -O2 * +./tests/funcptr-null-5.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.4 g++4.4 -O2 * +./tests/funcptr-null-6.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.4 g++4.4 -O2 * +./tests/avg-down-int8.ispc compfail x86 avx1-i32x16 Linux LLVM 3.4 g++4.4 -O2 * +./tests/avg-up-int8.ispc compfail x86 avx1-i32x16 Linux LLVM 3.4 g++4.4 -O2 * +./tests/avg-down-int8.ispc compfail x86-64 avx1-i32x16 Linux LLVM 3.4 g++4.4 -O2 * +./tests/avg-up-int8.ispc compfail x86-64 avx1-i32x16 Linux LLVM 3.4 g++4.4 -O2 * +./tests/array-gather-ifs.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/array-gather-multi-unif.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/array-gather-unif.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/array-mixed-unif-vary-indexing-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/array-mixed-unif-vary-indexing-3.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/array-mixed-unif-vary-indexing.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/array-multidim-gather.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/array-scatter-unif-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/array-scatter-vary.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/array-struct-gather.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/atomics-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/atomics-6.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/atomics-varyingptr-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/atomics-varyingptr-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/avg-down-uint16.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/avg-down-uint8.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/avg-up-uint16.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * 
+./tests/avg-up-uint8.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/broadcast-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/broadcast-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/cfor-array-gather-ifs.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/cfor-array-gather-unif.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/cfor-array-multidim-gather.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/cfor-array-struct-gather.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/cfor-struct-test-114.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/cfor-unif-struct-test-114.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/count-leading-trailing-zeros-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/count-leading-trailing-zeros-5.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/exclusive-scan-add-10.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/exclusive-scan-add-8.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/exclusive-scan-add-9.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/exclusive-scan-and-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/exclusive-scan-or-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/funcptr-null-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/funcptr-null-3.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/funcptr-null-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/funcptr-null-5.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/funcptr-null-6.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/funcptr-uniform-7.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/funcptr-uniform-8.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/funcptr-uniform-9.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/funcptr-varying-5.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/funcptr-varying-7.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/funcptr-varying-8.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/funcptr-varying-9.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/gather-int16.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/gather-to-vload-neg-offset.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/global-array-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/half-3.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/idiv.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/int64-max-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/int64-max.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/int64-min-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/int64-min.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-11.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-12.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-13.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 
* +./tests/local-atomics-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-5.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-9.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-swap.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-varyingptr-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-varyingptr-3.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-varyingptr-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/masked-scatter-struct.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/masked-scatter-vector.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/nested-structs-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/new-delete-6.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/pass-varying-lvalue-to-ref.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/phi-opts-3.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/phi-opts-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/popcnt-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/popcnt-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/popcnt-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/ptr-15.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/reduce-add-int16-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/reduce-add-int16.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/reduce-equal-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/reduce-equal-10.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/reduce-equal-12.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/reduce-equal-13.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/reduce-equal-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/reduce-equal-5.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/reduce-equal-7.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/reduce-equal-8.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/rotate-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/rotate-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/rotate-3.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/rotate-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/rotate-6.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/rotate.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/scatter-int16-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/scatter-int16.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/scatter-mask-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/scatter-mask-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/short-vec-12.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/short-vec-14.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/shuffle-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/shuffle-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/shuffle-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/shuffle-flatten.ispc runfail 
x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/shuffle.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/shuffle2-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/shuffle2-11.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/shuffle2-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/shuffle2-3.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/shuffle2-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/shuffle2-5.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/shuffle2-6.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/shuffle2-7.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/shuffle2-8.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/shuffle2-9.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/shuffle2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/soa-28.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/struct-test-114.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/test-128.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/test-129.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/test-130.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/test-57.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/uint64-max-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/uint64-max.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/uint64-min-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/uint64-min.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/unif-struct-test-114.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/varying-struct-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/varying-struct-3.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/varying-struct-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/write-same-loc.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/short-vec-8.ispc compfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/ptr-15.ispc runfail x86-64 generic-16 Linux LLVM 3.4 g++4.4 -O2 * +./tests/test-141.ispc runfail x86-64 generic-16 Linux LLVM 3.4 g++4.4 -O2 * +./tests/test-143.ispc runfail x86-64 generic-16 Linux LLVM 3.4 g++4.4 -O2 * +./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-16 Linux LLVM 3.4 g++4.4 -O2 * +./tests/avg-down-int8.ispc compfail x86 avx1.1-i32x16 Linux LLVM 3.4 g++4.4 -O2 * +./tests/avg-up-int8.ispc compfail x86 avx1.1-i32x16 Linux LLVM 3.4 g++4.4 -O2 * +./tests/avg-down-int8.ispc compfail x86-64 avx1.1-i32x16 Linux LLVM 3.4 g++4.4 -O2 * +./tests/avg-up-int8.ispc compfail x86-64 avx1.1-i32x16 Linux LLVM 3.4 g++4.4 -O2 * +./tests/atomics-varyingptr-2.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/atomics-varyingptr-3.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/atomics-varyingptr-4.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-11.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-12.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-13.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * 
+./tests/local-atomics-4.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-5.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-6.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-7.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-8.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-swap.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-varyingptr-2.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-varyingptr-3.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-varyingptr-4.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/memset-varying.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/reduce-equal-1.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/reduce-equal-12.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/reduce-equal-13.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/reduce-equal-2.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/reduce-equal-3.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/reduce-equal-4.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/reduce-equal-5.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/reduce-equal-6.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/reduce-equal-7.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/reduce-equal.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/test-141.ispc runfail x86 avx2-i32x16 Linux LLVM 3.4 g++4.4 -O2 * +./tests/test-141.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.4 g++4.4 -O2 * +./tests/atomics-swap.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/masked-scatter-struct.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/atomics-13.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-10.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-11.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-13.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-5.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-6.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 g++4.7 -O2 * +./tests/atomics-13.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-10.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-11.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-13.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-5.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-6.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 g++4.7 -O2 * +./tests/funcptr-null-4.ispc runfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/funcptr-null-5.ispc runfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/funcptr-null-6.ispc runfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/atomics-13.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-10.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-11.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-13.ispc compfail 
x86 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-5.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-6.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/funcptr-null-4.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/funcptr-null-5.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/funcptr-null-6.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/atomics-13.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-10.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-11.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-13.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-5.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-6.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/avg-down-int8.ispc compfail x86 avx1-i32x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/avg-up-int8.ispc compfail x86 avx1-i32x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/avg-down-int8.ispc compfail x86-64 avx1-i32x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/avg-up-int8.ispc compfail x86-64 avx1-i32x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/avg-down-int8.ispc compfail x86 avx1.1-i32x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/avg-up-int8.ispc compfail x86 avx1.1-i32x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/avg-down-int8.ispc compfail x86-64 avx1.1-i32x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/avg-up-int8.ispc compfail x86-64 avx1.1-i32x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/atomics-varyingptr-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/atomics-varyingptr-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/broadcast-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/half-3.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/local-atomics-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/local-atomics-11.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/local-atomics-12.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/local-atomics-13.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/local-atomics-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/local-atomics-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/local-atomics-5.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/local-atomics-9.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/local-atomics-swap.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/local-atomics-varyingptr-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/local-atomics-varyingptr-3.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/local-atomics-varyingptr-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/memset-varying.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/ptr-15.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/rotate-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/rotate-3.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/shuffle-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/shuffle-flatten.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/shuffle.ispc 
runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-10.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-11.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-3.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-5.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-6.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-7.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-8.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-9.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/test-129.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/test-130.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/short-vec-8.ispc compfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/ptr-15.ispc runfail x86-64 generic-16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/test-141.ispc runfail x86-64 generic-16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/test-143.ispc runfail x86-64 generic-16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/rotate.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.3 g++4.7 -O2 * +./tests/shift1.ispc runfail x86 avx2-i32x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/test-141.ispc runfail x86 avx2-i32x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/shift1.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/test-141.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/funcptr-null-4.ispc runfail x86 sse4-i8x16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/funcptr-null-5.ispc runfail x86 sse4-i8x16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/funcptr-null-6.ispc runfail x86 sse4-i8x16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/funcptr-null-4.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/funcptr-null-5.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/funcptr-null-6.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/avg-down-int8.ispc compfail x86 avx1-i32x16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/avg-up-int8.ispc compfail x86 avx1-i32x16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/avg-down-int8.ispc compfail x86-64 avx1-i32x16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/avg-up-int8.ispc compfail x86-64 avx1-i32x16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/avg-down-int8.ispc compfail x86 avx1.1-i32x16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/avg-up-int8.ispc compfail x86 avx1.1-i32x16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/avg-down-int8.ispc compfail x86-64 avx1.1-i32x16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/avg-up-int8.ispc compfail x86-64 avx1.1-i32x16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/atomics-varyingptr-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/atomics-varyingptr-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/broadcast-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/half-3.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-1.ispc runfail x86-64 generic-4 Linux LLVM 
3.4 g++4.7 -O2 * +./tests/local-atomics-11.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-12.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-13.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-5.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-9.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-swap.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-varyingptr-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-varyingptr-3.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-varyingptr-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/memset-varying.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/ptr-15.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/rotate-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/rotate-3.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/shuffle-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/shuffle-flatten.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/shuffle.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-10.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-11.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-3.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-5.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-6.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-7.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-8.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-9.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/test-129.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/test-130.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/short-vec-8.ispc compfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/ptr-15.ispc runfail x86-64 generic-16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/test-141.ispc runfail x86-64 generic-16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/test-143.ispc runfail x86-64 generic-16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/atomics-varyingptr-2.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/atomics-varyingptr-3.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/atomics-varyingptr-4.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-11.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * 
+./tests/local-atomics-12.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-13.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-4.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-5.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-6.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-7.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-8.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-swap.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-varyingptr-2.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-varyingptr-3.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-varyingptr-4.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/memset-varying.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal-1.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal-12.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal-13.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal-2.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal-3.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal-4.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal-5.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal-6.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal-7.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/shift1.ispc runfail x86 avx2-i32x16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/test-141.ispc runfail x86 avx2-i32x16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/shift1.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/test-141.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/atomics-13.ispc compfail x86 sse4-i16x8 Mac LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-10.ispc compfail x86 sse4-i16x8 Mac LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-11.ispc compfail x86 sse4-i16x8 Mac LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-13.ispc compfail x86 sse4-i16x8 Mac LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-5.ispc compfail x86 sse4-i16x8 Mac LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-6.ispc compfail x86 sse4-i16x8 Mac LLVM 3.3 g++4.7 -O2 * +./tests/atomics-13.ispc compfail x86-64 sse4-i16x8 Mac LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-10.ispc compfail x86-64 sse4-i16x8 Mac LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-11.ispc compfail x86-64 sse4-i16x8 Mac LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-13.ispc compfail x86-64 sse4-i16x8 Mac LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-5.ispc compfail x86-64 sse4-i16x8 Mac LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-6.ispc compfail x86-64 sse4-i16x8 Mac LLVM 3.3 g++4.7 -O2 * +./tests/funcptr-null-4.ispc runfail x86 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/funcptr-null-5.ispc runfail x86 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/funcptr-null-6.ispc runfail x86 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/atomics-13.ispc compfail x86 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-10.ispc compfail x86 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * 
+./tests/reduce-equal-11.ispc compfail x86 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-13.ispc compfail x86 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-5.ispc compfail x86 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-6.ispc compfail x86 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/funcptr-null-4.ispc runfail x86-64 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/funcptr-null-5.ispc runfail x86-64 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/funcptr-null-6.ispc runfail x86-64 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/atomics-13.ispc compfail x86-64 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-10.ispc compfail x86-64 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-11.ispc compfail x86-64 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-13.ispc compfail x86-64 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-5.ispc compfail x86-64 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-6.ispc compfail x86-64 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/avg-down-int8.ispc compfail x86 avx1-i32x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/avg-up-int8.ispc compfail x86 avx1-i32x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/avg-down-int8.ispc compfail x86-64 avx1-i32x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/avg-up-int8.ispc compfail x86-64 avx1-i32x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/avg-down-int8.ispc compfail x86 avx1.1-i32x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/avg-up-int8.ispc compfail x86 avx1.1-i32x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/avg-down-int8.ispc compfail x86-64 avx1.1-i32x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/avg-up-int8.ispc compfail x86-64 avx1.1-i32x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/test-141.ispc runfail x86 avx2-i32x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/test-141.ispc runfail x86-64 avx2-i32x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/broadcast-1.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/half-3.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/local-atomics-1.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/local-atomics-13.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/local-atomics-5.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/local-atomics-9.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/local-atomics-swap.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/local-atomics-varyingptr-3.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/memset-varying.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/ptr-15.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/rotate-2.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/shuffle-4.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-1.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-10.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-11.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-2.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-3.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-4.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-5.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-6.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-7.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-8.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 
-O2 * +./tests/shuffle2-9.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/test-129.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/test-130.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/short-vec-8.ispc compfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/ptr-15.ispc runfail x86-64 generic-16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/test-141.ispc runfail x86-64 generic-16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/test-143.ispc runfail x86-64 generic-16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/funcptr-null-4.ispc runfail x86 sse4-i8x16 Mac LLVM 3.4 g++4.7 -O2 * +./tests/funcptr-null-5.ispc runfail x86 sse4-i8x16 Mac LLVM 3.4 g++4.7 -O2 * +./tests/funcptr-null-6.ispc runfail x86 sse4-i8x16 Mac LLVM 3.4 g++4.7 -O2 * +./tests/funcptr-null-4.ispc runfail x86-64 sse4-i8x16 Mac LLVM 3.4 g++4.7 -O2 * +./tests/funcptr-null-5.ispc runfail x86-64 sse4-i8x16 Mac LLVM 3.4 g++4.7 -O2 * +./tests/funcptr-null-6.ispc runfail x86-64 sse4-i8x16 Mac LLVM 3.4 g++4.7 -O2 * +./tests/avg-down-int8.ispc compfail x86 avx1-i32x16 Mac LLVM 3.4 g++4.7 -O2 * +./tests/avg-up-int8.ispc compfail x86 avx1-i32x16 Mac LLVM 3.4 g++4.7 -O2 * +./tests/avg-down-int8.ispc compfail x86-64 avx1-i32x16 Mac LLVM 3.4 g++4.7 -O2 * +./tests/avg-up-int8.ispc compfail x86-64 avx1-i32x16 Mac LLVM 3.4 g++4.7 -O2 * +./tests/avg-down-int8.ispc compfail x86 avx1.1-i32x16 Mac LLVM 3.4 g++4.7 -O2 * +./tests/avg-up-int8.ispc compfail x86 avx1.1-i32x16 Mac LLVM 3.4 g++4.7 -O2 * +./tests/avg-down-int8.ispc compfail x86-64 avx1.1-i32x16 Mac LLVM 3.4 g++4.7 -O2 * +./tests/avg-up-int8.ispc compfail x86-64 avx1.1-i32x16 Mac LLVM 3.4 g++4.7 -O2 * +./tests/atomics-varyingptr-2.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/atomics-varyingptr-3.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/atomics-varyingptr-4.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-11.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-12.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-13.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-4.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-5.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-6.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-7.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-8.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-swap.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-varyingptr-2.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-varyingptr-3.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-varyingptr-4.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/memset-varying.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal-1.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal-12.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal-13.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * 
+./tests/reduce-equal-2.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal-3.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal-4.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal-5.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal-6.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal-7.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/test-141.ispc runfail x86 avx2-i32x16 Mac LLVM 3.4 g++4.7 -O2 * +./tests/test-141.ispc runfail x86-64 avx2-i32x16 Mac LLVM 3.4 g++4.7 -O2 * +./tests/broadcast-1.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/half-3.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-1.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-13.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-5.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-9.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-swap.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-varyingptr-3.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/memset-varying.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/ptr-15.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/rotate-2.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/shuffle-4.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-1.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-10.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-11.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-2.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-3.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-4.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-5.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-6.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-7.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-8.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-9.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/test-129.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/test-130.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/short-vec-8.ispc compfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/ptr-15.ispc runfail x86-64 generic-16 Mac LLVM 3.4 g++4.7 -O2 * +./tests/test-141.ispc runfail x86-64 generic-16 Mac LLVM 3.4 g++4.7 -O2 * +./tests/test-143.ispc runfail x86-64 generic-16 Mac LLVM 3.4 g++4.7 -O2 * +./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-16 Mac LLVM 3.4 g++4.7 -O2 * +.\tests\exclusive-scan-add-9.ispc runfail x86 sse2-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-10.ispc runfail x86 sse2-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint64.ispc runfail x86 sse2-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max-1.ispc runfail x86 sse2-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max.ispc 
runfail x86 sse2-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min-1.ispc runfail x86 sse2-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min.ispc runfail x86 sse2-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-10.ispc runfail x86 sse2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-9.ispc runfail x86 sse2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-10.ispc runfail x86 sse2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint64.ispc runfail x86 sse2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max-1.ispc runfail x86 sse2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max.ispc runfail x86 sse2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min-1.ispc runfail x86 sse2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min.ispc runfail x86 sse2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-10.ispc runfail x86 sse4-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-9.ispc runfail x86 sse4-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint-1.ispc runfail x86 sse4-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint.ispc runfail x86 sse4-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\min-uint-2.ispc runfail x86 sse4-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\packed-load-1.ispc runfail x86 sse4-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\packed-store.ispc runfail x86 sse4-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint-1.ispc runfail x86 sse4-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint.ispc runfail x86 sse4-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64-1.ispc runfail x86 sse4-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64.ispc runfail x86 sse4-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint.ispc runfail x86 sse4-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint-1.ispc runfail x86 sse4-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint.ispc runfail x86 sse4-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\min-uint-2.ispc runfail x86 sse4-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\packed-load-1.ispc runfail x86 sse4-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\packed-store.ispc runfail x86 sse4-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint-1.ispc runfail x86 sse4-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint.ispc runfail x86 sse4-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64-1.ispc runfail x86 sse4-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64.ispc runfail x86 sse4-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint.ispc runfail x86 sse4-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-10.ispc runfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint-1.ispc runfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint.ispc runfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\min-uint-2.ispc runfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\packed-load-1.ispc runfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\packed-store.ispc runfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint-1.ispc runfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint.ispc runfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64-1.ispc runfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64.ispc runfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint.ispc runfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\atomics-13.ispc compfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-10.ispc compfail x86 sse4-i16x8 Windows LLVM 
3.3 cl -O2 * +.\tests\reduce-equal-11.ispc compfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-13.ispc compfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-5.ispc compfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-6.ispc compfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-10.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-9.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\funcptr-null-4.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\funcptr-null-5.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\funcptr-null-6.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint-1.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\min-uint-2.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\packed-load-1.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\packed-store.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint-1.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64-1.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint64.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-min-uint64.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\atomics-13.ispc compfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-10.ispc compfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-11.ispc compfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-13.ispc compfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-5.ispc compfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-6.ispc compfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-10.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint-1.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\min-uint-2.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\packed-load-1.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\packed-store.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint-1.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64-1.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-10.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint64.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-min-uint64.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max-1.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min-1.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * 
+.\tests\switch-10.ispc compfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-11.ispc compfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-12.ispc compfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-8.ispc compfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-9.ispc compfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-10.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint-1.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\min-uint-1.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\min-uint-2.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\packed-load-1.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\packed-store.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint-1.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64-1.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-10.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint64.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-min-uint.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-min-uint64.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max-1.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min-1.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\avg-down-int8.ispc compfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\avg-up-int8.ispc compfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-10.ispc compfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-11.ispc compfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-12.ispc compfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-8.ispc compfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-9.ispc compfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-10.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint-1.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\min-uint-2.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\packed-load-1.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\packed-store.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint-1.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64-1.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-10.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint64.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-min-uint64.ispc 
runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max-1.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min-1.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-10.ispc compfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-11.ispc compfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-12.ispc compfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-8.ispc compfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-9.ispc compfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-10.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint-1.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\min-uint-1.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\min-uint-2.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\packed-load-1.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\packed-store.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint-1.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64-1.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-10.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint64.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-min-uint.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-min-uint64.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max-1.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min-1.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\avg-down-int8.ispc compfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\avg-up-int8.ispc compfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-10.ispc compfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-11.ispc compfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-12.ispc compfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-8.ispc compfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-9.ispc compfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\atomics-13.ispc compfail x86-64 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-10.ispc compfail x86-64 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-11.ispc compfail x86-64 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-13.ispc compfail x86-64 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-5.ispc compfail x86-64 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-6.ispc compfail x86-64 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\funcptr-null-4.ispc runfail x86-64 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\funcptr-null-5.ispc runfail x86-64 sse4-i8x16 Windows LLVM 3.3 cl -O2 * 
+.\tests\funcptr-null-6.ispc runfail x86-64 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\atomics-13.ispc compfail x86-64 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-10.ispc compfail x86-64 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-11.ispc compfail x86-64 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-13.ispc compfail x86-64 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-5.ispc compfail x86-64 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-6.ispc compfail x86-64 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-10.ispc compfail x86-64 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-11.ispc compfail x86-64 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-12.ispc compfail x86-64 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-8.ispc compfail x86-64 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-9.ispc compfail x86-64 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\avg-down-int8.ispc compfail x86-64 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\avg-up-int8.ispc compfail x86-64 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-10.ispc compfail x86-64 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-11.ispc compfail x86-64 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-12.ispc compfail x86-64 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-8.ispc compfail x86-64 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-9.ispc compfail x86-64 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-10.ispc compfail x86-64 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-11.ispc compfail x86-64 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-12.ispc compfail x86-64 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-8.ispc compfail x86-64 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-9.ispc compfail x86-64 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\avg-down-int8.ispc compfail x86-64 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\avg-up-int8.ispc compfail x86-64 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-10.ispc compfail x86-64 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-11.ispc compfail x86-64 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-12.ispc compfail x86-64 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-8.ispc compfail x86-64 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-9.ispc compfail x86-64 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-10.ispc compfail x86-64 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-11.ispc compfail x86-64 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-12.ispc compfail x86-64 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-8.ispc compfail x86-64 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-9.ispc compfail x86-64 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\test-141.ispc runfail x86-64 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-10.ispc compfail x86-64 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-11.ispc compfail x86-64 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-12.ispc compfail x86-64 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-8.ispc compfail x86-64 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-9.ispc compfail x86-64 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-10.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-9.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint-1.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * 
+.\tests\max-uint.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\min-uint-2.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\packed-load-1.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\packed-store.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint-1.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64-1.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-10.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint64.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-min-uint64.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max-1.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min-1.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-10.ispc compfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-11.ispc compfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-12.ispc compfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-8.ispc compfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-9.ispc compfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-10.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-9.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint-1.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\min-uint-1.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\min-uint-2.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\packed-load-1.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\packed-store.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint-1.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64-1.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-10.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint64.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-min-uint.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-min-uint64.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\test-141.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max-1.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min-1.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-10.ispc compfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-11.ispc compfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-12.ispc compfail x86 avx2-i32x16 Windows 
LLVM 3.3 cl -O2 * +.\tests\switch-8.ispc compfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-9.ispc compfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-10.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-9.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint-1.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\min-uint-2.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\packed-load-1.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\packed-store.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint-1.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64-1.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-min-uint64.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max-1.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min-1.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-min-uint64.ispc runfail x86 avx1-i64x4 Windows LLVM 3.4 cl -O2 * diff --git a/ispc.cpp b/ispc.cpp index 480ff99a..0d9a4190 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -106,7 +106,7 @@ static void __cpuidex(int info[4], int level, int count) { static const char * lGetSystemISA() { #ifdef __arm__ - return "neon"; + return "neon-i32x4"; #else int info[4]; __cpuid(info, 1); @@ -121,19 +121,19 @@ lGetSystemISA() { int info2[4]; __cpuidex(info2, 7, 0); if ((info2[1] & (1 << 5)) != 0) - return "avx2"; + return "avx2-i32x8"; else - return "avx1.1"; + return "avx1.1-i32x8"; } // Regular AVX - return "avx"; + return "avx1-i32x8"; } else if ((info[2] & (1 << 19)) != 0) - return "sse4"; + return "sse4-i32x4"; else if ((info[3] & (1 << 26)) != 0) - return "sse2"; + return "sse2-i32x4"; else { - fprintf(stderr, "Unable to detect supported SSE/AVX ISA. Exiting.\n"); + Error(SourcePos(), "Unable to detect supported SSE/AVX ISA. Exiting."); exit(1); } #endif @@ -141,14 +141,20 @@ lGetSystemISA() { static const char *supportedCPUs[] = { + "sm_35", +#ifdef ISPC_ARM_ENABLED // FIXME: LLVM supports a ton of different ARM CPU variants--not just // cortex-a9 and a15. We should be able to handle any of them that also // have NEON support. - "sm_35", "cortex-a9", "cortex-a15", + "cortex-a9", "cortex-a15", +#endif "atom", "penryn", "core2", "corei7", "corei7-avx" #if !defined(LLVM_3_1) , "core-avx-i", "core-avx2" #endif // LLVM 3.2+ +#if !defined(LLVM_3_1) && !defined(LLVM_3_2) && !defined(LLVM_3_3) + , "slm" +#endif // LLVM 3.4+ }; Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : @@ -184,22 +190,25 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : // If a CPU was specified explicitly, try to pick the best // possible ISA based on that. 
if (!strcmp(cpu, "core-avx2")) - isa = "avx2"; - else if (!strcmp(cpu, "cortex-a9") || - !strcmp(cpu, "cortex-a15")) - isa = "neon"; + isa = "avx2-i32x8"; else if (!strcmp(cpu, "sm_35")) isa = "nvptx64"; +#ifdef ISPC_ARM_ENABLED + else if (!strcmp(cpu, "cortex-a9") || + !strcmp(cpu, "cortex-a15")) + isa = "neon-i32x4"; +#endif else if (!strcmp(cpu, "core-avx-i")) - isa = "avx1.1"; + isa = "avx1.1-i32x8"; else if (!strcmp(cpu, "sandybridge") || !strcmp(cpu, "corei7-avx")) - isa = "avx"; + isa = "avx1-i32x8"; else if (!strcmp(cpu, "corei7") || - !strcmp(cpu, "penryn")) - isa = "sse4"; + !strcmp(cpu, "penryn") || + !strcmp(cpu, "slm")) + isa = "sse4-i32x4"; else - isa = "sse2"; + isa = "sse2-i32x4"; Warning(SourcePos(), "No --target specified on command-line. " "Using ISA \"%s\" based on specified CPU \"%s\".", isa, cpu); @@ -209,12 +218,12 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : // supports. isa = lGetSystemISA(); Warning(SourcePos(), "No --target specified on command-line. " - "Using system ISA \"%s\".", isa); + "Using default system target \"%s\".", isa); } } -#if !defined(__arm__) - if (cpu == NULL && !strcmp(isa, "neon")) +#if defined(ISPC_ARM_ENABLED) && !defined(__arm__) + if (cpu == NULL && !strncmp(isa, "neon", 4)) // If we're compiling NEON on an x86 host and the CPU wasn't // supplied, don't go and set the CPU based on the host... cpu = "cortex-a9"; @@ -242,8 +251,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : } } if (foundCPU == false) { - fprintf(stderr, "Error: CPU type \"%s\" unknown. Supported CPUs: " - "%s.\n", cpu, SupportedTargetCPUs().c_str()); + Error(SourcePos(), "Error: CPU type \"%s\" unknown. Supported CPUs: " + "%s.", cpu, SupportedCPUs().c_str()); return; } } @@ -251,10 +260,12 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_cpu = cpu; if (arch == NULL) { - if (!strcmp(isa, "neon")) - arch = "arm"; - else if (!strcmp(isa, "nvptx64")) + if (!strcmp(isa, "nvptx64")) arch = "nvptx64"; +#ifdef ISPC_ARM_ENABLED + else if (!strncmp(isa, "neon", 4)) + arch = "arm"; +#endif else arch = "x86-64"; } @@ -284,40 +295,98 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : } // Check default LLVM generated targets - if (!strcasecmp(isa, "sse2")) { + if (!strcasecmp(isa, "sse2") || + !strcasecmp(isa, "sse2-i32x4")) { this->m_isa = Target::SSE2; this->m_nativeVectorWidth = 4; this->m_vectorWidth = 4; - this->m_attributes = "+sse,+sse2,-sse3,-sse41,-sse42,-sse4a,-ssse3,-popcnt"; + this->m_attributes = "+sse,+sse2,-sse3,-sse4a,-ssse3,-popcnt" +#if defined(LLVM_3_4) + ",-sse4.1,-sse4.2" +#else + ",-sse41,-sse42" +#endif + ; this->m_maskingIsFree = false; this->m_maskBitCount = 32; } - else if (!strcasecmp(isa, "sse2-x2")) { + else if (!strcasecmp(isa, "sse2-x2") || + !strcasecmp(isa, "sse2-i32x8")) { this->m_isa = Target::SSE2; this->m_nativeVectorWidth = 4; this->m_vectorWidth = 8; - this->m_attributes = "+sse,+sse2,-sse3,-sse41,-sse42,-sse4a,-ssse3,-popcnt"; + this->m_attributes = "+sse,+sse2,-sse3,-sse4a,-ssse3,-popcnt" +#if defined(LLVM_3_4) + ",-sse4.1,-sse4.2" +#else + ",-sse41,-sse42" +#endif + ; this->m_maskingIsFree = false; this->m_maskBitCount = 32; } - else if (!strcasecmp(isa, "sse4")) { + else if (!strcasecmp(isa, "sse4") || + !strcasecmp(isa, "sse4-i32x4")) { this->m_isa = Target::SSE4; this->m_nativeVectorWidth = 4; this->m_vectorWidth = 4; // TODO: why not sse42 and popcnt? 
- this->m_attributes = "+sse,+sse2,+sse3,+sse41,-sse42,-sse4a,+ssse3,-popcnt,+cmov"; + this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov" +#if defined(LLVM_3_4) + ",+sse4.1,-sse4.2" +#else + ",+sse41,-sse42" +#endif + ; this->m_maskingIsFree = false; this->m_maskBitCount = 32; } - else if (!strcasecmp(isa, "sse4x2") || !strcasecmp(isa, "sse4-x2")) { + else if (!strcasecmp(isa, "sse4x2") || + !strcasecmp(isa, "sse4-x2") || + !strcasecmp(isa, "sse4-i32x8")) { this->m_isa = Target::SSE4; this->m_nativeVectorWidth = 4; this->m_vectorWidth = 8; - this->m_attributes = "+sse,+sse2,+sse3,+sse41,-sse42,-sse4a,+ssse3,-popcnt,+cmov"; + this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov" +#if defined(LLVM_3_4) + ",+sse4.1,-sse4.2" +#else + ",+sse41,-sse42" +#endif + ; this->m_maskingIsFree = false; this->m_maskBitCount = 32; } - else if (!strcasecmp(isa, "generic-4")) { + else if (!strcasecmp(isa, "sse4-i8x16")) { + this->m_isa = Target::SSE4; + this->m_nativeVectorWidth = 16; + this->m_vectorWidth = 16; + this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov" +#if defined(LLVM_3_4) + ",+sse4.1,-sse4.2" +#else + ",+sse41,-sse42" +#endif + ; + this->m_maskingIsFree = false; + this->m_maskBitCount = 8; + } + else if (!strcasecmp(isa, "sse4-i16x8")) { + this->m_isa = Target::SSE4; + this->m_nativeVectorWidth = 8; + this->m_vectorWidth = 8; + this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov" +#if defined(LLVM_3_4) + ",+sse4.1,-sse4.2" +#else + ",+sse41,-sse42" +#endif + ; + this->m_maskingIsFree = false; + this->m_maskBitCount = 16; + } + else if (!strcasecmp(isa, "generic-4") || + !strcasecmp(isa, "generic-x4")) { this->m_isa = Target::GENERIC; this->m_nativeVectorWidth = 4; this->m_vectorWidth = 4; @@ -327,7 +396,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_hasTranscendentals = true; this->m_hasGather = this->m_hasScatter = true; } - else if (!strcasecmp(isa, "generic-8")) { + else if (!strcasecmp(isa, "generic-8") || + !strcasecmp(isa, "generic-x8")) { this->m_isa = Target::GENERIC; this->m_nativeVectorWidth = 8; this->m_vectorWidth = 8; @@ -337,7 +407,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_hasTranscendentals = true; this->m_hasGather = this->m_hasScatter = true; } - else if (!strcasecmp(isa, "generic-16")) { + else if (!strcasecmp(isa, "generic-16") || + !strcasecmp(isa, "generic-x16")) { this->m_isa = Target::GENERIC; this->m_nativeVectorWidth = 16; this->m_vectorWidth = 16; @@ -347,7 +418,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_hasTranscendentals = true; this->m_hasGather = this->m_hasScatter = true; } - else if (!strcasecmp(isa, "generic-32")) { + else if (!strcasecmp(isa, "generic-32") || + !strcasecmp(isa, "generic-x32")) { this->m_isa = Target::GENERIC; this->m_nativeVectorWidth = 32; this->m_vectorWidth = 32; @@ -357,7 +429,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_hasTranscendentals = true; this->m_hasGather = this->m_hasScatter = true; } - else if (!strcasecmp(isa, "generic-64")) { + else if (!strcasecmp(isa, "generic-64") || + !strcasecmp(isa, "generic-x64")) { this->m_isa = Target::GENERIC; this->m_nativeVectorWidth = 64; this->m_vectorWidth = 64; @@ -367,14 +440,17 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_hasTranscendentals = true; this->m_hasGather = this->m_hasScatter = true; } 
- else if (!strcasecmp(isa, "generic-1")) { + else if (!strcasecmp(isa, "generic-1") || + !strcasecmp(isa, "generic-x1")) { this->m_isa = Target::GENERIC; this->m_nativeVectorWidth = 1; this->m_vectorWidth = 1; this->m_maskingIsFree = false; this->m_maskBitCount = 32; } - else if (!strcasecmp(isa, "avx") || !strcasecmp(isa, "avx1")) { + else if (!strcasecmp(isa, "avx") || + !strcasecmp(isa, "avx1") || + !strcasecmp(isa, "avx1-i32x8")) { this->m_isa = Target::AVX; this->m_nativeVectorWidth = 8; this->m_vectorWidth = 8; @@ -382,7 +458,18 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_maskingIsFree = false; this->m_maskBitCount = 32; } - else if (!strcasecmp(isa, "avx-x2") || !strcasecmp(isa, "avx1-x2")) { + else if (!strcasecmp(isa, "avx-i64x4") || + !strcasecmp(isa, "avx1-i64x4")) { + this->m_isa = Target::AVX; + this->m_nativeVectorWidth = 8; /* native vector width in terms of floats */ + this->m_vectorWidth = 4; + this->m_attributes = "+avx,+popcnt,+cmov"; + this->m_maskingIsFree = false; + this->m_maskBitCount = 64; + } + else if (!strcasecmp(isa, "avx-x2") || + !strcasecmp(isa, "avx1-x2") || + !strcasecmp(isa, "avx1-i32x16")) { this->m_isa = Target::AVX; this->m_nativeVectorWidth = 8; this->m_vectorWidth = 16; @@ -390,11 +477,18 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_maskingIsFree = false; this->m_maskBitCount = 32; } - else if (!strcasecmp(isa, "avx1.1")) { + else if (!strcasecmp(isa, "avx1.1") || + !strcasecmp(isa, "avx1.1-i32x8")) { this->m_isa = Target::AVX11; this->m_nativeVectorWidth = 8; this->m_vectorWidth = 8; - this->m_attributes = "+avx,+popcnt,+cmov,+f16c,+rdrand"; + this->m_attributes = "+avx,+popcnt,+cmov,+f16c" +#if defined(LLVM_3_4) + ",+rdrnd" +#else + ",+rdrand" +#endif + ; this->m_maskingIsFree = false; this->m_maskBitCount = 32; this->m_hasHalf = true; @@ -403,11 +497,18 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_hasRand = true; #endif } - else if (!strcasecmp(isa, "avx1.1-x2")) { + else if (!strcasecmp(isa, "avx1.1-x2") || + !strcasecmp(isa, "avx1.1-i32x16")) { this->m_isa = Target::AVX11; this->m_nativeVectorWidth = 8; this->m_vectorWidth = 16; - this->m_attributes = "+avx,+popcnt,+cmov,+f16c,+rdrand"; + this->m_attributes = "+avx,+popcnt,+cmov,+f16c" +#if defined(LLVM_3_4) + ",+rdrnd" +#else + ",+rdrand" +#endif + ; this->m_maskingIsFree = false; this->m_maskBitCount = 32; this->m_hasHalf = true; @@ -416,11 +517,17 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_hasRand = true; #endif } - else if (!strcasecmp(isa, "avx2")) { + else if (!strcasecmp(isa, "avx2") || + !strcasecmp(isa, "avx2-i32x8")) { this->m_isa = Target::AVX2; this->m_nativeVectorWidth = 8; this->m_vectorWidth = 8; - this->m_attributes = "+avx2,+popcnt,+cmov,+f16c,+rdrand" + this->m_attributes = "+avx2,+popcnt,+cmov,+f16c" +#if defined(LLVM_3_4) + ",+rdrnd" +#else + ",+rdrand" +#endif #ifndef LLVM_3_1 ",+fma" #endif // !LLVM_3_1 @@ -434,11 +541,17 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_hasGather = true; #endif } - else if (!strcasecmp(isa, "avx2-x2")) { + else if (!strcasecmp(isa, "avx2-x2") || + !strcasecmp(isa, "avx2-i32x16")) { this->m_isa = Target::AVX2; this->m_nativeVectorWidth = 16; this->m_vectorWidth = 16; - this->m_attributes = "+avx2,+popcnt,+cmov,+f16c,+rdrand" + this->m_attributes = "+avx2,+popcnt,+cmov,+f16c" +#if defined(LLVM_3_4) + ",+rdrnd" +#else + 
",+rdrand" +#endif #ifndef LLVM_3_1 ",+fma" #endif // !LLVM_3_1 @@ -452,8 +565,28 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_hasGather = true; #endif } - else if (!strcasecmp(isa, "neon")) { - this->m_isa = Target::NEON; +#ifdef ISPC_ARM_ENABLED + else if (!strcasecmp(isa, "neon-i8x16")) { + this->m_isa = Target::NEON8; + this->m_nativeVectorWidth = 16; + this->m_vectorWidth = 16; + this->m_attributes = "+neon,+fp16"; + this->m_hasHalf = true; // ?? + this->m_maskingIsFree = false; + this->m_maskBitCount = 8; + } + else if (!strcasecmp(isa, "neon-i16x8")) { + this->m_isa = Target::NEON16; + this->m_nativeVectorWidth = 8; + this->m_vectorWidth = 8; + this->m_attributes = "+neon,+fp16"; + this->m_hasHalf = true; // ?? + this->m_maskingIsFree = false; + this->m_maskBitCount = 16; + } + else if (!strcasecmp(isa, "neon") || + !strcasecmp(isa, "neon-i32x4")) { + this->m_isa = Target::NEON32; this->m_nativeVectorWidth = 4; this->m_vectorWidth = 4; this->m_attributes = "+neon,+fp16"; @@ -461,6 +594,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_maskingIsFree = false; this->m_maskBitCount = 32; } +#endif else if (!strcasecmp(isa, "nvptx64")) { this->m_isa = Target::NVPTX64; this->m_nativeVectorWidth = 1; @@ -478,8 +612,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : #endif } else { - fprintf(stderr, "Target ISA \"%s\" is unknown. Choices are: %s\n", - isa, SupportedTargetISAs()); + Error(SourcePos(), "Target \"%s\" is unknown. Choices are: %s.", + isa, SupportedTargets()); error = true; } @@ -491,8 +625,11 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : llvm::Reloc::Default; std::string featuresString = m_attributes; llvm::TargetOptions options; - if (m_isa == Target::NEON) +#ifdef ISPC_ARM_ENABLED + if (m_isa == Target::NEON8 || m_isa == Target::NEON16 || + m_isa == Target::NEON32) options.FloatABIType = llvm::FloatABI::Hard; +#endif #if !defined(LLVM_3_1) if (g->opt.disableFMA == false) options.AllowFPOpFusion = llvm::FPOpFusion::Fast; @@ -551,6 +688,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : // Initialize target-specific "target-feature" attribute. 
if (!m_attributes.empty()) { llvm::AttrBuilder attrBuilder; + if (m_isa != Target::NVPTX64) + attrBuilder.addAttribute("target-cpu", this->m_cpu); attrBuilder.addAttribute("target-features", this->m_attributes); this->m_tf_attributes = new llvm::AttributeSet( llvm::AttributeSet::get( @@ -570,7 +709,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : std::string -Target::SupportedTargetCPUs() { +Target::SupportedCPUs() { std::string ret; int count = sizeof(supportedCPUs) / sizeof(supportedCPUs[0]); for (int i = 0; i < count; ++i) { @@ -583,30 +722,45 @@ Target::SupportedTargetCPUs() { const char * -Target::SupportedTargetArchs() { - return "nvptx64, arm, x86, x86-64"; +Target::SupportedArchs() { + return "nvptx64, " +#ifdef ISPC_ARM_ENABLED + "arm, " +#endif + "x86, x86-64"; } const char * -Target::SupportedTargetISAs() { - return "nvptx64, neon, sse2, sse2-x2, sse4, sse4-x2, avx, avx-x2" - ", avx1.1, avx1.1-x2, avx2, avx2-x2" - ", generic-1, generic-4, generic-8, generic-16, generic-32"; +Target::SupportedTargets() { + return "nvptx64, " +#ifdef ISPC_ARM_ENABLED + "neon-i8x16, neon-16x8, neon-32x4, " +#endif + "sse2-i32x4, sse2-i32x8, " + "sse4-i32x4, sse4-i32x8, sse4-i16x8, sse4-i8x16, " + "avx1-i32x8, avx1-i32x16, avx1-i64x4, " + "avx1.1-i32x8, avx1.1-i32x16, " + "avx2-i32x8, avx2-i32x16, " + "generic-x1, generic-x4, generic-x8, generic-x16, " + "generic-x32, generic-x64"; } std::string Target::GetTripleString() const { llvm::Triple triple; - if (m_arch == "arm") { - triple.setTriple("armv7-eabi"); - } - else if (m_arch == "nvptx64") + if (m_arch == "nvptx64") { triple.setTriple("nvptx64"); } - else { +#ifdef ISPC_ARM_ENABLED + else if (m_arch == "arm") { + triple.setTriple("armv7-eabi"); + } +#endif + else + { // Start with the host triple as the default triple.setTriple(llvm::sys::getDefaultTargetTriple()); @@ -629,10 +783,16 @@ Target::GetTripleString() const { const char * Target::ISAToString(ISA isa) { switch (isa) { - case Target::NEON: - return "neon"; case Target::NVPTX64: return "nvptx64"; +#ifdef ISPC_ARM_ENABLED + case Target::NEON8: + return "neon-8"; + case Target::NEON16: + return "neon-16"; + case Target::NEON32: + return "neon-32"; +#endif case Target::SSE2: return "sse2"; case Target::SSE4: @@ -803,6 +963,7 @@ Globals::Globals() { includeStdlib = true; runCPP = true; debugPrint = false; + debugIR = -1; disableWarnings = false; warningsAsErrors = false; quiet = false; diff --git a/ispc.h b/ispc.h index de41a3e8..e2a58ba9 100644 --- a/ispc.h +++ b/ispc.h @@ -38,7 +38,7 @@ #ifndef ISPC_H #define ISPC_H -#define ISPC_VERSION "1.4.5dev" +#define ISPC_VERSION "1.5.1dev" #if !defined(LLVM_3_1) && !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) #error "Only LLVM 3.1, 3.2, 3.3 and the 3.4 development branch are supported" @@ -59,6 +59,7 @@ #include #include #include +#include #include /** @def ISPC_MAX_NVEC maximum vector size of any of the compliation @@ -66,6 +67,9 @@ */ #define ISPC_MAX_NVEC 64 +// Number of final optimization phase +#define LAST_OPT_NUMBER 1000 + // Forward declarations of a number of widely-used LLVM types namespace llvm { class AttributeSet; @@ -175,7 +179,12 @@ public: flexible/performant of them will apear last in the enumerant. Note also that __best_available_isa() needs to be updated if ISAs are added or the enumerant values are reordered. 
*/ - enum ISA { NVPTX64, NEON, SSE2, SSE4, AVX, AVX11, AVX2, GENERIC, NUM_ISAS }; + enum ISA { NVPTX64, +#ifdef ISPC_ARM_ENABLED + NEON32, NEON16, NEON8, +#endif + SSE2, SSE4, AVX, AVX11, AVX2, GENERIC, + NUM_ISAS }; /** Initializes the given Target pointer for a target of the given name, if the name is a known target. Returns true if the @@ -183,16 +192,16 @@ public: Target(const char *arch, const char *cpu, const char *isa, bool pic); /** Returns a comma-delimited string giving the names of the currently - supported target ISAs. */ - static const char *SupportedTargetISAs(); + supported compilation targets. */ + static const char *SupportedTargets(); /** Returns a comma-delimited string giving the names of the currently - supported target CPUs. */ - static std::string SupportedTargetCPUs(); + supported CPUs. */ + static std::string SupportedCPUs(); /** Returns a comma-delimited string giving the names of the currently - supported target architectures. */ - static const char *SupportedTargetArchs(); + supported architectures. */ + static const char *SupportedArchs(); /** Returns a triple string specifying the target architecture, vendor, and environment. */ @@ -494,6 +503,16 @@ struct Globals { ispc's execution. */ bool debugPrint; + /** Indicates which stages of optimization we want to dump. */ + std::set debug_stages; + + /** Indicates after which optimization we want to generate + DebugIR information. */ + int debugIR; + + /** Indicates which phases of optimization we want to switch off. */ + std::set off_stages; + /** Indicates whether all warning messages should be surpressed. */ bool disableWarnings; diff --git a/ispc.vcxproj b/ispc.vcxproj index 96682fe3..58fa5b08 100755 --- a/ispc.vcxproj +++ b/ispc.vcxproj @@ -22,6 +22,8 @@ + + @@ -45,18 +47,23 @@ - - + + + + - - + + + + + 4146;4800;4996;4355;4624;4005;4003;4018 @@ -99,11 +106,14 @@ Document - %LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py x86 > $(Configuration)/gen-stdlib-x86.cpp; -%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC=1 -DISPC_TARGET_GENERIC=1 -DPI=3.1415926535 | python stdlib2cpp.py generic > $(Configuration)/gen-stdlib-generic.cpp; + %LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC_MASK_BITS=1 -DISPC=1 -DPI=3.14159265358979 | python stdlib2cpp.py mask1 > $(Configuration)/gen-stdlib-mask1.cpp; +%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC_MASK_BITS=8 -DISPC=1 -DPI=3.14159265358979 | python stdlib2cpp.py mask8 > $(Configuration)/gen-stdlib-mask8.cpp; +%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC_MASK_BITS=16 -DISPC=1 -DPI=3.14159265358979 | python stdlib2cpp.py mask16 > $(Configuration)/gen-stdlib-mask16.cpp; +%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC_MASK_BITS=32 -DISPC=1 -DPI=3.14159265358979 | python stdlib2cpp.py mask32 > $(Configuration)/gen-stdlib-mask32.cpp; +%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC_MASK_BITS=64 -DISPC=1 -DPI=3.14159265358979 | python stdlib2cpp.py mask64 > $(Configuration)/gen-stdlib-mask64.cpp; - $(Configuration)/gen-stdlib-generic.cpp;$(Configuration)/gen-stdlib-x86.cpp - Building gen-stdlib-{generic,x86}.cpp + $(Configuration)/gen-stdlib-mask1.cpp;$(Configuration)/gen-stdlib-mask8.cpp;$(Configuration)/gen-stdlib-mask16.cpp;$(Configuration)/gen-stdlib-mask32.cpp;$(Configuration)/gen-stdlib-mask64.cpp + Building gen-stdlib-{mask1,8,16,32,64}.cpp @@ -111,7 +121,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins/dispatch.ll | 
python bitcode2cpp.py dispatch.ll > $(Configuration)/gen-bitcode-dispatch.cpp $(Configuration)/gen-bitcode-dispatch.cpp - builtins\util.m4 + builtins\util.m4;builtins\svml.m4 Building gen-bitcode-dispatch.cpp @@ -120,7 +130,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse4.ll | python bitcode2cpp.py builtins\target-sse4.ll 32bit > $(Configuration)/gen-bitcode-sse4-32bit.cpp $(Configuration)/gen-bitcode-sse4-32bit.cpp - builtins\util.m4;builtins\target-sse4-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-sse4-common.ll Building gen-bitcode-sse4-32bit.cpp @@ -129,16 +139,52 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse4.ll | python bitcode2cpp.py builtins\target-sse4.ll 64bit > $(Configuration)/gen-bitcode-sse4-64bit.cpp $(Configuration)/gen-bitcode-sse4-64bit.cpp - builtins\util.m4;builtins\target-sse4-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-sse4-common.ll Building gen-bitcode-sse4-64bit.cpp + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse4-8.ll | python bitcode2cpp.py builtins\target-sse4-8.ll 32bit > $(Configuration)/gen-bitcode-sse4-8-32bit.cpp + $(Configuration)/gen-bitcode-sse4-8-32bit.cpp + builtins\util.m4;builtins\svml.m4;builtins\target-sse4-common.ll + Building gen-bitcode-sse4-8-32bit.cpp + + + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse4-8.ll | python bitcode2cpp.py builtins\target-sse4-8.ll 64bit > $(Configuration)/gen-bitcode-sse4-8-64bit.cpp + $(Configuration)/gen-bitcode-sse4-8-64bit.cpp + builtins\util.m4;builtins\svml.m4;builtins\target-sse4-common.ll + Building gen-bitcode-sse4-8-64bit.cpp + + + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse4-16.ll | python bitcode2cpp.py builtins\target-sse4-16.ll 32bit > $(Configuration)/gen-bitcode-sse4-16-32bit.cpp + $(Configuration)/gen-bitcode-sse4-16-32bit.cpp + builtins\util.m4;builtins\svml.m4;builtins\target-sse4-common.ll + Building gen-bitcode-sse4-16-32bit.cpp + + + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse4-16.ll | python bitcode2cpp.py builtins\target-sse4-16.ll 64bit > $(Configuration)/gen-bitcode-sse4-16-64bit.cpp + $(Configuration)/gen-bitcode-sse4-16-64bit.cpp + builtins\util.m4;builtins\svml.m4;builtins\target-sse4-common.ll + Building gen-bitcode-sse4-16-64bit.cpp + + Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse4-x2.ll | python bitcode2cpp.py builtins\target-sse4-x2.ll 32bit > $(Configuration)/gen-bitcode-sse4-x2-32bit.cpp $(Configuration)/gen-bitcode-sse4-x2-32bit.cpp - builtins\util.m4;builtins\target-sse4-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-sse4-common.ll Building gen-bitcode-sse4-x2-32bit.cpp @@ -147,7 +193,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse4-x2.ll | python bitcode2cpp.py builtins\target-sse4-x2.ll 64bit > $(Configuration)/gen-bitcode-sse4-x2-64bit.cpp $(Configuration)/gen-bitcode-sse4-x2-64bit.cpp - builtins\util.m4;builtins\target-sse4-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-sse4-common.ll Building gen-bitcode-sse4-x2-64bit.cpp @@ -156,7 +202,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% 
-DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse2.ll | python bitcode2cpp.py builtins\target-sse2.ll 32bit > $(Configuration)/gen-bitcode-sse2-32bit.cpp $(Configuration)/gen-bitcode-sse2-32bit.cpp - builtins\util.m4;builtins\target-sse2-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-sse2-common.ll Building gen-bitcode-sse2-32bit.cpp @@ -165,7 +211,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse2.ll | python bitcode2cpp.py builtins\target-sse2.ll 64bit > $(Configuration)/gen-bitcode-sse2-64bit.cpp $(Configuration)/gen-bitcode-sse2-64bit.cpp - builtins\util.m4;builtins\target-sse2-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-sse2-common.ll Building gen-bitcode-sse2-64bit.cpp @@ -174,7 +220,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse2-x2.ll | python bitcode2cpp.py builtins\target-sse2-x2.ll 32bit > $(Configuration)/gen-bitcode-sse2-x2-32bit.cpp $(Configuration)/gen-bitcode-sse2-x2-32bit.cpp - builtins\util.m4;builtins\target-sse2-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-sse2-common.ll Building gen-bitcode-sse2-x2-32bit.cpp @@ -183,29 +229,16 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse2-x2.ll | python bitcode2cpp.py builtins\target-sse2-x2.ll 64bit > $(Configuration)/gen-bitcode-sse2-x2-64bit.cpp $(Configuration)/gen-bitcode-sse2-x2-64bit.cpp - builtins\util.m4;builtins\target-sse2-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-sse2-common.ll Building gen-bitcode-sse2-x2-64bit.cpp - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-neon.ll | python bitcode2cpp.py builtins\target-neon.ll > gen-bitcode-neon.cpp - gen-bitcode-neon.cpp - builtins\util.m4 - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-neon.ll | python bitcode2cpp.py builtins\target-neon.ll > gen-bitcode-neon.cpp - gen-bitcode-neon.cpp - builtins\util.m4 - Building gen-bitcode-neon.cpp - Building gen-bitcode-neon.cpp - - Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx1.ll | python bitcode2cpp.py builtins\target-avx1.ll 32bit > $(Configuration)/gen-bitcode-avx1-32bit.cpp $(Configuration)/gen-bitcode-avx1-32bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll Building gen-bitcode-avx1-32bit.cpp @@ -214,7 +247,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx1.ll | python bitcode2cpp.py builtins\target-avx1.ll 64bit > $(Configuration)/gen-bitcode-avx1-64bit.cpp $(Configuration)/gen-bitcode-avx1-64bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll Building gen-bitcode-avx1-64bit.cpp @@ -223,7 +256,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx1-x2.ll | python bitcode2cpp.py builtins\target-avx1-x2.ll 32bit > $(Configuration)/gen-bitcode-avx1-x2-32bit.cpp $(Configuration)/gen-bitcode-avx1-x2-32bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll Building gen-bitcode-avx1-x2-32bit.cpp @@ -232,16 +265,34 @@ 
Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx1-x2.ll | python bitcode2cpp.py builtins\target-avx1-x2.ll 64bit > $(Configuration)/gen-bitcode-avx1-x2-64bit.cpp $(Configuration)/gen-bitcode-avx1-x2-64bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll Building gen-bitcode-avx1-x2-64bit.cpp + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx1-i64x4.ll | python bitcode2cpp.py builtins\target-avx1-i64x4.ll 32bit > $(Configuration)/gen-bitcode-avx1-i64x4-32bit.cpp + $(Configuration)/gen-bitcode-avx1-i64x4-32bit.cpp + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll;builtins\target-avx1-i64x4base.ll + Building gen-bitcode-avx1-i64x4-32bit.cpp + + + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx1-i64x4.ll | python bitcode2cpp.py builtins\target-avx1-i64x4.ll 64bit > $(Configuration)/gen-bitcode-avx1-i64x4-64bit.cpp + $(Configuration)/gen-bitcode-avx1-i64x4-64bit.cpp + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll;builtins\target-avx1-i64x4base.ll + Building gen-bitcode-avx1-i64x4-64bit.cpp + + Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx11.ll | python bitcode2cpp.py builtins\target-avx11.ll 32bit > $(Configuration)/gen-bitcode-avx11-32bit.cpp $(Configuration)/gen-bitcode-avx11-32bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll Building gen-bitcode-avx11-32bit.cpp @@ -250,7 +301,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx11.ll | python bitcode2cpp.py builtins\target-avx11.ll 64bit > $(Configuration)/gen-bitcode-avx11-64bit.cpp $(Configuration)/gen-bitcode-avx11-64bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll Building gen-bitcode-avx11-64bit.cpp @@ -259,7 +310,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx11-x2.ll | python bitcode2cpp.py builtins\target-avx11-x2.ll 32bit > $(Configuration)/gen-bitcode-avx11-x2-32bit.cpp $(Configuration)/gen-bitcode-avx11-x2-32bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll Building gen-bitcode-avx11-x2-32bit.cpp @@ -268,7 +319,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx11-x2.ll | python bitcode2cpp.py builtins\target-avx11-x2.ll 64bit > $(Configuration)/gen-bitcode-avx11-x2-64bit.cpp $(Configuration)/gen-bitcode-avx11-x2-64bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll Building gen-bitcode-avx11-x2-64bit.cpp @@ -277,7 +328,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx2.ll | python bitcode2cpp.py builtins\target-avx2.ll 32bit > $(Configuration)/gen-bitcode-avx2-32bit.cpp 
$(Configuration)/gen-bitcode-avx2-32bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll Building gen-bitcode-avx2-32bit.cpp @@ -286,7 +337,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx2.ll | python bitcode2cpp.py builtins\target-avx2.ll 64bit > $(Configuration)/gen-bitcode-avx2-64bit.cpp $(Configuration)/gen-bitcode-avx2-64bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll Building gen-bitcode-avx2-64bit.cpp @@ -295,7 +346,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx2-x2.ll | python bitcode2cpp.py builtins\target-avx2-x2.ll 32bit > $(Configuration)/gen-bitcode-avx2-x2-32bit.cpp $(Configuration)/gen-bitcode-avx2-x2-32bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll Building gen-bitcode-avx2-x2-32bit.cpp @@ -304,7 +355,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx2-x2.ll | python bitcode2cpp.py builtins\target-avx2-x2.ll 64bit > $(Configuration)/gen-bitcode-avx2-x2-64bit.cpp $(Configuration)/gen-bitcode-avx2-x2-64bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll Building gen-bitcode-avx2-x2-64bit.cpp @@ -313,7 +364,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-1.ll | python bitcode2cpp.py builtins\target-generic-1.ll 32bit > $(Configuration)/gen-bitcode-generic-1-32bit.cpp $(Configuration)/gen-bitcode-generic-1-32bit.cpp - builtins\util.m4;builtins\target-generic-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll Building gen-bitcode-generic-1-32bit.cpp @@ -322,7 +373,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-1.ll | python bitcode2cpp.py builtins\target-generic-1.ll 64bit > $(Configuration)/gen-bitcode-generic-1-64bit.cpp $(Configuration)/gen-bitcode-generic-1-64bit.cpp - builtins\util.m4;builtins\target-generic-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll Building gen-bitcode-generic-1-64bit.cpp @@ -331,7 +382,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-4.ll | python bitcode2cpp.py builtins\target-generic-4.ll 32bit > $(Configuration)/gen-bitcode-generic-4-32bit.cpp $(Configuration)/gen-bitcode-generic-4-32bit.cpp - builtins\util.m4;builtins\target-generic-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll Building gen-bitcode-generic-4-32bit.cpp @@ -340,7 +391,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-4.ll | python bitcode2cpp.py builtins\target-generic-4.ll 64bit > $(Configuration)/gen-bitcode-generic-4-64bit.cpp $(Configuration)/gen-bitcode-generic-4-64bit.cpp - builtins\util.m4;builtins\target-generic-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll Building gen-bitcode-generic-4-64bit.cpp @@ -349,7 +400,7 @@ 
Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-8.ll | python bitcode2cpp.py builtins\target-generic-8.ll 32bit > $(Configuration)/gen-bitcode-generic-8-32bit.cpp $(Configuration)/gen-bitcode-generic-8-32bit.cpp - builtins\util.m4;builtins\target-generic-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll Building gen-bitcode-generic-8-32bit.cpp @@ -358,7 +409,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-8.ll | python bitcode2cpp.py builtins\target-generic-8.ll 64bit > $(Configuration)/gen-bitcode-generic-8-64bit.cpp $(Configuration)/gen-bitcode-generic-8-64bit.cpp - builtins\util.m4;builtins\target-generic-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll Building gen-bitcode-generic-8-64bit.cpp @@ -367,7 +418,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-16.ll | python bitcode2cpp.py builtins\target-generic-16.ll 32bit > $(Configuration)/gen-bitcode-generic-16-32bit.cpp $(Configuration)/gen-bitcode-generic-16-32bit.cpp - builtins\util.m4;builtins\target-generic-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll Building gen-bitcode-generic-16-32bit.cpp @@ -376,7 +427,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-16.ll | python bitcode2cpp.py builtins\target-generic-16.ll 64bit > $(Configuration)/gen-bitcode-generic-16-64bit.cpp $(Configuration)/gen-bitcode-generic-16-64bit.cpp - builtins\util.m4;builtins\target-generic-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll Building gen-bitcode-generic-16-64bit.cpp @@ -385,7 +436,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-32.ll | python bitcode2cpp.py builtins\target-generic-32.ll 32bit > $(Configuration)/gen-bitcode-generic-32-32bit.cpp $(Configuration)/gen-bitcode-generic-32-32bit.cpp - builtins\util.m4;builtins\target-generic-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll Building gen-bitcode-generic-32-32bit.cpp @@ -394,7 +445,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-32.ll | python bitcode2cpp.py builtins\target-generic-32.ll 64bit > $(Configuration)/gen-bitcode-generic-32-64bit.cpp $(Configuration)/gen-bitcode-generic-32-64bit.cpp - builtins\util.m4;builtins\target-generic-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll Building gen-bitcode-generic-32-64bit.cpp @@ -403,7 +454,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-64.ll | python bitcode2cpp.py builtins\target-generic-64.ll 32bit > $(Configuration)/gen-bitcode-generic-64-32bit.cpp $(Configuration)/gen-bitcode-generic-64-32bit.cpp - builtins\util.m4;builtins\target-generic-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll Building gen-bitcode-generic-64-32bit.cpp @@ -412,7 +463,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-64.ll | python bitcode2cpp.py builtins\target-generic-64.ll 64bit > $(Configuration)/gen-bitcode-generic-64-64bit.cpp $(Configuration)/gen-bitcode-generic-64-64bit.cpp - 
builtins\util.m4;builtins\target-generic-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll Building gen-bitcode-generic-64-64bit.cpp diff --git a/lex.ll b/lex.ll index f6633fce..3655220f 100644 --- a/lex.ll +++ b/lex.ll @@ -76,7 +76,9 @@ static int allTokens[] = { TOKEN_TASK, TOKEN_TRUE, TOKEN_TYPEDEF, TOKEN_UNIFORM, TOKEN_UNMASKED, TOKEN_UNSIGNED, TOKEN_VARYING, TOKEN_VOID, TOKEN_WHILE, TOKEN_STRING_C_LITERAL, TOKEN_DOTDOTDOT, - TOKEN_FLOAT_CONSTANT, + TOKEN_FLOAT_CONSTANT, TOKEN_DOUBLE_CONSTANT, + TOKEN_INT8_CONSTANT, TOKEN_UINT8_CONSTANT, + TOKEN_INT16_CONSTANT, TOKEN_UINT16_CONSTANT, TOKEN_INT32_CONSTANT, TOKEN_UINT32_CONSTANT, TOKEN_INT64_CONSTANT, TOKEN_UINT64_CONSTANT, TOKEN_INC_OP, TOKEN_DEC_OP, TOKEN_LEFT_OP, TOKEN_RIGHT_OP, TOKEN_LE_OP, @@ -150,6 +152,11 @@ void ParserInit() { tokenToName[TOKEN_STRING_C_LITERAL] = "\"C\""; tokenToName[TOKEN_DOTDOTDOT] = "..."; tokenToName[TOKEN_FLOAT_CONSTANT] = "TOKEN_FLOAT_CONSTANT"; + tokenToName[TOKEN_DOUBLE_CONSTANT] = "TOKEN_DOUBLE_CONSTANT"; + tokenToName[TOKEN_INT8_CONSTANT] = "TOKEN_INT8_CONSTANT"; + tokenToName[TOKEN_UINT8_CONSTANT] = "TOKEN_UINT8_CONSTANT"; + tokenToName[TOKEN_INT16_CONSTANT] = "TOKEN_INT16_CONSTANT"; + tokenToName[TOKEN_UINT16_CONSTANT] = "TOKEN_UINT16_CONSTANT"; tokenToName[TOKEN_INT32_CONSTANT] = "TOKEN_INT32_CONSTANT"; tokenToName[TOKEN_UINT32_CONSTANT] = "TOKEN_UINT32_CONSTANT"; tokenToName[TOKEN_INT64_CONSTANT] = "TOKEN_INT64_CONSTANT"; @@ -260,6 +267,11 @@ void ParserInit() { tokenNameRemap["TOKEN_STRING_C_LITERAL"] = "\"C\""; tokenNameRemap["TOKEN_DOTDOTDOT"] = "\'...\'"; tokenNameRemap["TOKEN_FLOAT_CONSTANT"] = "float constant"; + tokenNameRemap["TOKEN_DOUBLE_CONSTANT"] = "double constant"; + tokenNameRemap["TOKEN_INT8_CONSTANT"] = "int8 constant"; + tokenNameRemap["TOKEN_UINT8_CONSTANT"] = "unsigned int8 constant"; + tokenNameRemap["TOKEN_INT16_CONSTANT"] = "int16 constant"; + tokenNameRemap["TOKEN_UINT16_CONSTANT"] = "unsigned int16 constant"; tokenNameRemap["TOKEN_INT32_CONSTANT"] = "int32 constant"; tokenNameRemap["TOKEN_UINT32_CONSTANT"] = "unsigned int32 constant"; tokenNameRemap["TOKEN_INT64_CONSTANT"] = "int64 constant"; @@ -333,6 +345,9 @@ INT_NUMBER (([0-9]+)|(0x[0-9a-fA-F]+)|(0b[01]+))[uUlL]*[kMG]?[uUlL]* INT_NUMBER_DOTDOTDOT (([0-9]+)|(0x[0-9a-fA-F]+)|(0b[01]+))[uUlL]*[kMG]?[uUlL]*\.\.\. FLOAT_NUMBER (([0-9]+|(([0-9]+\.[0-9]*[fF]?)|(\.[0-9]+)))([eE][-+]?[0-9]+)?[fF]?) HEX_FLOAT_NUMBER (0x[01](\.[0-9a-fA-F]*)?p[-+]?[0-9]+[fF]?) +FORTRAN_DOUBLE_NUMBER (([0-9]+\.[0-9]*[dD])|([0-9]+\.[0-9]*[dD][-+]?[0-9]+)|([0-9]+[dD][-+]?[0-9]+)|(\.[0-9]*[dD][-+]?[0-9]+)) + + IDENT [a-zA-Z_][a-zA-Z_0-9]* ZO_SWIZZLE ([01]+[w-z]+)+|([01]+[rgba]+)+|([01]+[uv]+)+ @@ -427,6 +442,17 @@ L?\"(\\.|[^\\"])*\" { lStringConst(&yylval, &yylloc); return TOKEN_STRING_LITERA return lParseInteger(true); } +{FORTRAN_DOUBLE_NUMBER} { + RT; + { + int i = 0; + while (yytext[i] != 'd' && yytext[i] != 'D') i++; + yytext[i] = 'E'; + } + yylval.doubleVal = atof(yytext); + return TOKEN_DOUBLE_CONSTANT; +} + {FLOAT_NUMBER} { RT; @@ -440,6 +466,8 @@ L?\"(\\.|[^\\"])*\" { lStringConst(&yylval, &yylloc); return TOKEN_STRING_LITERA return TOKEN_FLOAT_CONSTANT; } + + "++" { RT; return TOKEN_INC_OP; } "--" { RT; return TOKEN_DEC_OP; } "<<" { RT; return TOKEN_LEFT_OP; } @@ -599,7 +627,22 @@ lParseInteger(bool dotdotdot) { } else { // No u or l suffix - // First, see if we can fit this into a 32-bit integer... + // If we're compiling to an 8-bit mask target and the constant + // fits into 8 bits, return an 8-bit int. 
+ if (g->target->getMaskBitCount() == 8) {
+ if (yylval.intVal <= 0x7fULL)
+ return TOKEN_INT8_CONSTANT;
+ else if (yylval.intVal <= 0xffULL)
+ return TOKEN_UINT8_CONSTANT;
+ }
+ // And similarly for 16-bit masks and constants
+ if (g->target->getMaskBitCount() == 16) {
+ if (yylval.intVal <= 0x7fffULL)
+ return TOKEN_INT16_CONSTANT;
+ else if (yylval.intVal <= 0xffffULL)
+ return TOKEN_UINT16_CONSTANT;
+ }
+ // Otherwise, see if we can fit this into a 32-bit integer...
 if (yylval.intVal <= 0x7fffffffULL)
 return TOKEN_INT32_CONSTANT;
 else if (yylval.intVal <= 0xffffffffULL)
diff --git a/llvm_patches/3_3_r172868-vmovups-vinsertf128.patch b/llvm_patches/3_3_r172868-vmovups-vinsertf128.patch
new file mode 100644
index 00000000..36bb5572
--- /dev/null
+++ b/llvm_patches/3_3_r172868-vmovups-vinsertf128.patch
@@ -0,0 +1,102 @@
+This patch needs to be applied to LLVM 3.3 to fix a performance regression introduced by revision r172868.
+The regression is due to increased register pressure after that revision, which causes spills when there are multiple loads.
+The regression is fixed in 3.4, but the 3.4 changes are not back-portable,
+so we roll back r172868 to avoid the regression with 3.3.
+
+Index: test/CodeGen/X86/sandybridge-loads.ll
+===================================================================
+--- test/CodeGen/X86/sandybridge-loads.ll (revision 191082)
++++ test/CodeGen/X86/sandybridge-loads.ll (working copy)
+@@ -1,24 +1,5 @@
+ ; RUN: llc -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -o - < %s | FileCheck %s
+
+-;CHECK: wideloads
+-;CHECK: vmovaps
+-;CHECK: vinsertf128
+-;CHECK: vmovaps
+-;CHECK-NOT: vinsertf128
+-;CHECK: ret
+-
+-define void @wideloads(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwind uwtable noinline ssp {
+- %v0 = load <8 x float>* %a, align 16 ; <---- unaligned!
+- %v1 = load <8 x float>* %b, align 32 ; <---- aligned!
+- %m0 = fcmp olt <8 x float> %v1, %v0
+- %v2 = load <8 x float>* %c, align 32 ; <---- aligned!
+- %m1 = fcmp olt <8 x float> %v2, %v0
+- %mand = and <8 x i1> %m1, %m0
+- %r = zext <8 x i1> %mand to <8 x i32>
+- store <8 x i32> %r, <8 x i32>* undef, align 32
+- ret void
+-}
+-
+ ; CHECK: widestores
+ ; loads:
+ ; CHECK: vmovaps
+Index: test/CodeGen/X86/v8i1-masks.ll
+===================================================================
+--- test/CodeGen/X86/v8i1-masks.ll (revision 172868)
++++ test/CodeGen/X86/v8i1-masks.ll (revision 172866)
+@@ -1,7 +1,7 @@
+ ; RUN: llc -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -o - < %s | FileCheck %s
+
+ ;CHECK: and_masks
+-;CHECK: vmovaps
++;CHECK: vmovups
+ ;CHECK: vcmpltp
+ ;CHECK: vcmpltp
+ ;CHECK: vandps
+Index: lib/Target/X86/X86ISelLowering.cpp
+===================================================================
+--- lib/Target/X86/X86ISelLowering.cpp (revision 191077)
++++ lib/Target/X86/X86ISelLowering.cpp (working copy)
+@@ -16756,42 +16756,9 @@
+ EVT MemVT = Ld->getMemoryVT();
+ DebugLoc dl = Ld->getDebugLoc();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+- unsigned RegSz = RegVT.getSizeInBits();
+
+- // On Sandybridge unaligned 256bit loads are inefficient.
+ ISD::LoadExtType Ext = Ld->getExtensionType(); +- unsigned Alignment = Ld->getAlignment(); +- bool IsAligned = Alignment == 0 || Alignment >= MemVT.getSizeInBits()/8; +- if (RegVT.is256BitVector() && !Subtarget->hasInt256() && +- !DCI.isBeforeLegalizeOps() && !IsAligned && Ext == ISD::NON_EXTLOAD) { +- unsigned NumElems = RegVT.getVectorNumElements(); +- if (NumElems < 2) +- return SDValue(); + +- SDValue Ptr = Ld->getBasePtr(); +- SDValue Increment = DAG.getConstant(16, TLI.getPointerTy()); +- +- EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), +- NumElems/2); +- SDValue Load1 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, +- Ld->getPointerInfo(), Ld->isVolatile(), +- Ld->isNonTemporal(), Ld->isInvariant(), +- Alignment); +- Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); +- SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, +- Ld->getPointerInfo(), Ld->isVolatile(), +- Ld->isNonTemporal(), Ld->isInvariant(), +- std::min(16U, Alignment)); +- SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, +- Load1.getValue(1), +- Load2.getValue(1)); +- +- SDValue NewVec = DAG.getUNDEF(RegVT); +- NewVec = Insert128BitVector(NewVec, Load1, 0, DAG, dl); +- NewVec = Insert128BitVector(NewVec, Load2, NumElems/2, DAG, dl); +- return DCI.CombineTo(N, NewVec, TF, true); +- } +- + // If this is a vector EXT Load then attempt to optimize it using a + // shuffle. If SSSE3 is not available we may emit an illegal shuffle but the + // expansion is still better than scalar code. +@@ -16805,6 +16772,7 @@ + assert(MemVT.isVector() && "Must load a vector from memory"); + + unsigned NumElems = RegVT.getVectorNumElements(); ++ unsigned RegSz = RegVT.getSizeInBits(); + unsigned MemSz = MemVT.getSizeInBits(); + assert(RegSz > MemSz && "Register size must be greater than the mem size"); + diff --git a/llvm_patches/r183327-AVX2-GATHER.patch b/llvm_patches/3_3_r183327-AVX2-GATHER.patch old mode 100755 new mode 100644 similarity index 100% rename from llvm_patches/r183327-AVX2-GATHER.patch rename to llvm_patches/3_3_r183327-AVX2-GATHER.patch diff --git a/llvm_patches/r184575-x86-shift.patch b/llvm_patches/3_3_r184575-x86-shift.patch similarity index 100% rename from llvm_patches/r184575-x86-shift.patch rename to llvm_patches/3_3_r184575-x86-shift.patch diff --git a/llvmutil.cpp b/llvmutil.cpp index 26c18bf5..275cf794 100644 --- a/llvmutil.cpp +++ b/llvmutil.cpp @@ -115,13 +115,29 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target& target) { LLVMTypes::FloatPointerType = llvm::PointerType::get(LLVMTypes::FloatType, 0); LLVMTypes::DoublePointerType = llvm::PointerType::get(LLVMTypes::DoubleType, 0); - if (target.getMaskBitCount() == 1) + switch (target.getMaskBitCount()) { + case 1: LLVMTypes::MaskType = LLVMTypes::BoolVectorType = llvm::VectorType::get(llvm::Type::getInt1Ty(*ctx), target.getVectorWidth()); - else { - Assert(target.getMaskBitCount() == 32); + break; + case 8: + LLVMTypes::MaskType = LLVMTypes::BoolVectorType = + llvm::VectorType::get(llvm::Type::getInt8Ty(*ctx), target.getVectorWidth()); + break; + case 16: + LLVMTypes::MaskType = LLVMTypes::BoolVectorType = + llvm::VectorType::get(llvm::Type::getInt16Ty(*ctx), target.getVectorWidth()); + break; + case 32: LLVMTypes::MaskType = LLVMTypes::BoolVectorType = llvm::VectorType::get(llvm::Type::getInt32Ty(*ctx), target.getVectorWidth()); + break; + case 64: + LLVMTypes::MaskType = LLVMTypes::BoolVectorType = + llvm::VectorType::get(llvm::Type::getInt64Ty(*ctx), target.getVectorWidth()); + break; + default: + 
FATAL("Unhandled mask width for initializing MaskType"); } LLVMTypes::Int1VectorType = @@ -154,12 +170,30 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target& target) { std::vector maskOnes; llvm::Constant *onMask = NULL; - if (target.getMaskBitCount() == 1) + switch (target.getMaskBitCount()) { + case 1: onMask = llvm::ConstantInt::get(llvm::Type::getInt1Ty(*ctx), 1, false /*unsigned*/); // 0x1 - else + break; + case 8: + onMask = llvm::ConstantInt::get(llvm::Type::getInt8Ty(*ctx), -1, + true /*signed*/); // 0xff + break; + case 16: + onMask = llvm::ConstantInt::get(llvm::Type::getInt16Ty(*ctx), -1, + true /*signed*/); // 0xffff + break; + case 32: onMask = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*ctx), -1, true /*signed*/); // 0xffffffff + break; + case 64: + onMask = llvm::ConstantInt::get(llvm::Type::getInt64Ty(*ctx), -1, + true /*signed*/); // 0xffffffffffffffffull + break; + default: + FATAL("Unhandled mask width for onMask"); + } for (int i = 0; i < target.getVectorWidth(); ++i) maskOnes.push_back(onMask); @@ -167,13 +201,30 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target& target) { std::vector maskZeros; llvm::Constant *offMask = NULL; - if (target.getMaskBitCount() == 1) + switch (target.getMaskBitCount()) { + case 1: offMask = llvm::ConstantInt::get(llvm::Type::getInt1Ty(*ctx), 0, true /*signed*/); - else + break; + case 8: + offMask = llvm::ConstantInt::get(llvm::Type::getInt8Ty(*ctx), 0, + true /*signed*/); + break; + case 16: + offMask = llvm::ConstantInt::get(llvm::Type::getInt16Ty(*ctx), 0, + true /*signed*/); + break; + case 32: offMask = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*ctx), 0, true /*signed*/); - + break; + case 64: + offMask = llvm::ConstantInt::get(llvm::Type::getInt64Ty(*ctx), 0, + true /*signed*/); + break; + default: + FATAL("Unhandled mask width for offMask"); + } for (int i = 0; i < target.getVectorWidth(); ++i) maskZeros.push_back(offMask); LLVMMaskAllOff = llvm::ConstantVector::get(maskZeros); @@ -441,12 +492,20 @@ LLVMUInt64Vector(const uint64_t *ivec) { llvm::Constant * LLVMBoolVector(bool b) { llvm::Constant *v; - if (LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + if (LLVMTypes::BoolVectorType == LLVMTypes::Int64VectorType) + v = llvm::ConstantInt::get(LLVMTypes::Int64Type, b ? 0xffffffffffffffffull : 0, + false /*unsigned*/); + else if (LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) v = llvm::ConstantInt::get(LLVMTypes::Int32Type, b ? 0xffffffff : 0, false /*unsigned*/); + else if (LLVMTypes::BoolVectorType == LLVMTypes::Int16VectorType) + v = llvm::ConstantInt::get(LLVMTypes::Int16Type, b ? 0xffff : 0, + false /*unsigned*/); + else if (LLVMTypes::BoolVectorType == LLVMTypes::Int8VectorType) + v = llvm::ConstantInt::get(LLVMTypes::Int8Type, b ? 0xff : 0, + false /*unsigned*/); else { - Assert(LLVMTypes::BoolVectorType->getElementType() == - llvm::Type::getInt1Ty(*g->ctx)); + Assert(LLVMTypes::BoolVectorType == LLVMTypes::Int1VectorType); v = b ? LLVMTrue : LLVMFalse; } @@ -462,12 +521,20 @@ LLVMBoolVector(const bool *bvec) { std::vector vals; for (int i = 0; i < g->target->getVectorWidth(); ++i) { llvm::Constant *v; - if (LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + if (LLVMTypes::BoolVectorType == LLVMTypes::Int64VectorType) + v = llvm::ConstantInt::get(LLVMTypes::Int64Type, bvec[i] ? 0xffffffffffffffffull : 0, + false /*unsigned*/); + else if (LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) v = llvm::ConstantInt::get(LLVMTypes::Int32Type, bvec[i] ? 
0xffffffff : 0, false /*unsigned*/); + else if (LLVMTypes::BoolVectorType == LLVMTypes::Int16VectorType) + v = llvm::ConstantInt::get(LLVMTypes::Int16Type, bvec[i] ? 0xffff : 0, + false /*unsigned*/); + else if (LLVMTypes::BoolVectorType == LLVMTypes::Int8VectorType) + v = llvm::ConstantInt::get(LLVMTypes::Int8Type, bvec[i] ? 0xff : 0, + false /*unsigned*/); else { - Assert(LLVMTypes::BoolVectorType->getElementType() == - llvm::Type::getInt1Ty(*g->ctx)); + Assert(LLVMTypes::BoolVectorType == LLVMTypes::Int1VectorType); v = bvec[i] ? LLVMTrue : LLVMFalse; } diff --git a/main.cpp b/main.cpp index 9ab0b793..ce6b5d4c 100644 --- a/main.cpp +++ b/main.cpp @@ -85,13 +85,16 @@ usage(int ret) { printf(" \t\taddressing calculations are done by default, even\n"); printf(" \t\ton 64-bit target architectures.)\n"); printf(" [--arch={%s}]\t\tSelect target architecture\n", - Target::SupportedTargetArchs()); + Target::SupportedArchs()); printf(" [--c++-include-file=]\t\tSpecify name of file to emit in #include statement in generated C++ code.\n"); #ifndef ISPC_IS_WINDOWS printf(" [--colored-output]\t\tAlways use terminal colors in error/warning messages.\n"); #endif - printf(" [--cpu=]\t\t\tSelect target CPU type\n"); - printf(" ={%s}\n", Target::SupportedTargetCPUs().c_str()); + printf(" "); + char cpuHelp[2048]; + sprintf(cpuHelp, "[--cpu=]\t\t\tSelect target CPU type\n={%s}\n", + Target::SupportedCPUs().c_str()); + PrintWithWordBreaks(cpuHelp, 16, TerminalWidth(), stdout); printf(" [-D]\t\t\t\t#define given value when running preprocessor\n"); printf(" [--dev-stub ]\t\tEmit device-side offload stub functions to file\n"); printf(" [--emit-asm]\t\t\tGenerate assembly language file as output\n"); @@ -127,7 +130,11 @@ usage(int ret) { printf(" [--pic]\t\t\t\tGenerate position-independent code\n"); #endif // !ISPC_IS_WINDOWS printf(" [--quiet]\t\t\t\tSuppress all output\n"); - printf(" [--target=]\t\t\tSelect target ISA. ={%s}\n", Target::SupportedTargetISAs()); + printf(" "); + char targetHelp[2048]; + sprintf(targetHelp, "[--target=]\t\t\tSelect target ISA and width.\n" + "={%s}", Target::SupportedTargets()); + PrintWithWordBreaks(targetHelp, 24, TerminalWidth(), stdout); printf(" [--version]\t\t\t\tPrint ispc version\n"); printf(" [--werror]\t\t\t\tTreat warnings as errors\n"); printf(" [--woff]\t\t\t\tDisable warnings\n"); @@ -156,6 +163,11 @@ devUsage(int ret) { printf(" disable-uniform-control-flow\t\tDisable uniform control flow optimizations\n"); printf(" disable-uniform-memory-optimizations\tDisable uniform-based coherent memory access\n"); printf(" [--yydebug]\t\t\t\tPrint debugging information during parsing\n"); + printf(" [--debug-phase=]\t\tSet optimization phases to dump. --debug-phase=first,210:220,300,305,310:last\n"); +#ifdef LLVM_3_4 + printf(" [--debug-ir=]\t\tSet optimization phase to generate debugIR after it\n"); +#endif + printf(" [--off-phase=]\t\tSwitch off optimization phases. --off-phase=first,210:220,300,305,310:last\n"); exit(ret); } @@ -212,6 +224,47 @@ lSignal(void *) { } +static int ParsingPhaseName(char * stage) { + if (strncmp(stage, "first", 5) == 0) { + return 0; + } + else if (strncmp(stage, "last", 4) == 0) { + return LAST_OPT_NUMBER; + } + else { + int t = atoi(stage); + if (t < 0 || t > LAST_OPT_NUMBER) { + fprintf(stderr, "Phases must be from 0 to %d. 
%s is incorrect.\n", LAST_OPT_NUMBER, stage); + exit(0); + } + else { + return t; + } + } +} + + +static std::set ParsingPhases(char * stages) { + std::set phases; + int begin = ParsingPhaseName(stages); + int end = begin; + + for (unsigned i = 0; i < strlen(stages); i++) { + if ((stages[i] == ',') || (i == strlen(stages) - 1)) { + for (int j = begin; j < end + 1; j++) { + phases.insert(j); + } + begin = ParsingPhaseName(stages + i + 1); + end = begin; + } + else if (stages[i] == ':') { + end = ParsingPhaseName(stages + i + 1); + } + } + return phases; +} + + static void lParseInclude(const char *path) { #ifdef ISPC_IS_WINDOWS @@ -254,6 +307,8 @@ int main(int Argc, char *Argv[]) { LLVMInitializeX86Disassembler(); LLVMInitializeX86TargetMC(); #endif // !__ARM__ + +#ifdef ISPC_ARM_ENABLED // Generating ARM from x86 is more likely to be useful, though. LLVMInitializeARMTargetInfo(); LLVMInitializeARMTarget(); @@ -261,6 +316,7 @@ int main(int Argc, char *Argv[]) { LLVMInitializeARMAsmParser(); LLVMInitializeARMDisassembler(); LLVMInitializeARMTargetMC(); +#endif LLVMInitializeNVPTXTargetInfo(); LLVMInitializeNVPTXTarget(); @@ -282,7 +338,6 @@ int main(int Argc, char *Argv[]) { // as we're parsing below g = new Globals; - bool debugSet = false, optSet = false; Module::OutputType ot = Module::Object; bool generatePIC = false; const char *arch = NULL, *cpu = NULL, *target = NULL; @@ -325,7 +380,6 @@ int main(int Argc, char *Argv[]) { g->emitInstrumentation = true; else if (!strcmp(argv[i], "-g")) { g->generateDebuggingSymbols = true; - debugSet = true; } else if (!strcmp(argv[i], "--emit-asm")) ot = Module::Asm; @@ -452,12 +506,10 @@ int main(int Argc, char *Argv[]) { } else if (!strcmp(argv[i], "-O0")) { g->opt.level = 0; - optSet = true; } else if (!strcmp(argv[i], "-O") || !strcmp(argv[i], "-O1") || !strcmp(argv[i], "-O2") || !strcmp(argv[i], "-O3")) { g->opt.level = 1; - optSet = true; } else if (!strcmp(argv[i], "-")) ; @@ -498,6 +550,20 @@ int main(int Argc, char *Argv[]) { } hostStubFileName = argv[i]; } + else if (strncmp(argv[i], "--debug-phase=", 14) == 0) { + fprintf(stderr, "WARNING: Adding debug phases may change the way PassManager" + "handles the phases and it may possibly make some bugs go" + "away or introduce the new ones.\n"); + g->debug_stages = ParsingPhases(argv[i] + strlen("--debug-phase=")); + } +#ifdef LLVM_3_4 + else if (strncmp(argv[i], "--debug-ir=", 11) == 0) { + g->debugIR = ParsingPhaseName(argv[i] + strlen("--debug-ir=")); + } +#endif + else if (strncmp(argv[i], "--off-phase=", 12) == 0) { + g->off_stages = ParsingPhases(argv[i] + strlen("--off-phase=")); + } else if (!strcmp(argv[i], "-v") || !strcmp(argv[i], "--version")) { lPrintVersion(); return 0; @@ -517,12 +583,6 @@ int main(int Argc, char *Argv[]) { } } - // If the user specified -g, then the default optimization level is 0. - // If -g wasn't specified, the default optimization level is 1 (full - // optimization). 
- if (debugSet && !optSet) - g->opt.level = 0; - if (g->enableFuzzTest) { if (g->fuzzTestSeed == -1) { #ifdef ISPC_IS_WINDOWS diff --git a/module.cpp b/module.cpp index 85bf242c..755a5dc4 100644 --- a/module.cpp +++ b/module.cpp @@ -1877,6 +1877,7 @@ Module::execPreprocessor(const char *infilename, llvm::raw_string_ostream *ostre char *p = targetMacro; while (*p) { *p = toupper(*p); + if (*p == '-') *p = '_'; ++p; } opts.addMacroDef(targetMacro); diff --git a/opt.cpp b/opt.cpp index ba32c639..75eae20c 100644 --- a/opt.cpp +++ b/opt.cpp @@ -63,6 +63,9 @@ #include #include #endif +#if defined (LLVM_3_4) + #include +#endif #include #include #include @@ -85,6 +88,7 @@ #include #include #include +#include #if defined(LLVM_3_1) #include #else @@ -108,7 +112,8 @@ #endif static llvm::Pass *CreateIntrinsicsOptPass(); -static llvm::Pass *CreateVSelMovmskOptPass(); +static llvm::Pass *CreateInstructionSimplifyPass(); +static llvm::Pass *CreatePeepholePass(); static llvm::Pass *CreateImproveMemoryOpsPass(); static llvm::Pass *CreateGatherCoalescePass(); @@ -117,6 +122,8 @@ static llvm::Pass *CreateReplacePseudoMemoryOpsPass(); static llvm::Pass *CreateIsCompileTimeConstantPass(bool isLastTry); static llvm::Pass *CreateMakeInternalFuncsStaticPass(); +static llvm::Pass *CreateDebugPass(char * output); + #define DEBUG_START_PASS(NAME) \ if (g->debugPrint && \ (getenv("FUNC") == NULL || \ @@ -393,6 +400,54 @@ lGetMaskStatus(llvm::Value *mask, int vecWidth = -1) { } +/////////////////////////////////////////////////////////////////////////// +// This is a wrapper over llvm::PassManager. It duplicates the PassManager function run() +// and extends the PassManager function add() with extra checks and debug passes. +// This wrapper can control: +// - whether the optimization pass with a given number is switched off, +// - whether to dump the LLVM IR after the optimization pass with a given number, +// - whether to generate LLVM debug IR (for gdb) after the optimization pass with a given number.
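The DebugPassManager class added just below keys this behavior off a running pass number plus two integer sets (g->debug_stages and g->off_stages, filled in from the new --debug-phase= and --off-phase= options). As a rough, LLVM-free sketch of only that numbering and lookup rule (all names below are illustrative and are not part of this patch):

    #include <cstdio>
    #include <set>

    // Illustrative stand-ins for g->off_stages / g->debug_stages.
    struct StageConfig {
        std::set<int> offStages;    // pass numbers to skip entirely
        std::set<int> debugStages;  // pass numbers to dump IR after
    };

    // Mirrors the numbering rule described above: an explicit stage number
    // overrides the counter, otherwise the counter advances by one per pass.
    class StageCounter {
    public:
        StageCounter() : number(0) { }
        int next(int stage = -1) {
            if (stage == -1)
                ++number;
            else
                number = stage;
            return number;
        }
    private:
        int number;
    };

    int main() {
        StageConfig cfg;
        cfg.offStages.insert(215);    // e.g. --off-phase=215
        cfg.debugStages.insert(241);  // e.g. --debug-phase=241

        StageCounter counter;
        const int stages[] = { 100, -1, -1, 215, -1, 241 };
        for (int s : stages) {
            int n = counter.next(s);
            if (cfg.offStages.count(n)) {
                printf("pass %d: skipped (--off-phase)\n", n);
                continue;
            }
            printf("pass %d: added\n", n);
            if (cfg.debugStages.count(n))
                printf("pass %d: dump LLVM IR here (--debug-phase)\n", n);
        }
        return 0;
    }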
+class DebugPassManager { +public: + DebugPassManager():number(0){} + void add(llvm::Pass * P, int stage); + bool run(llvm::Module& M) {return PM.run(M);} + llvm::PassManager& getPM() {return PM;} + +private: + llvm::PassManager PM; + int number; +}; + +void +DebugPassManager::add(llvm::Pass * P, int stage = -1) { + // taking number of optimization + if (stage == -1) { + number++; + } + else { + number = stage; + } + if (g->off_stages.find(number) == g->off_stages.end()) { + // adding optimization (not switched off) + PM.add(P); + if (g->debug_stages.find(number) != g->debug_stages.end()) { + // adding dump of LLVM IR after optimization + char buf[100]; + sprintf(buf, "\n\n*****LLVM IR after phase %d: %s*****\n\n", + number, P->getPassName()); + PM.add(CreateDebugPass(buf)); + } +#ifdef LLVM_3_4 + if (g->debugIR == number) { + // adding generating of LLVM IR debug after optimization + char buf[100]; + sprintf(buf, "Debug_IR_after_%d_phase.bc", number); + PM.add(llvm::createDebugIRPass(true, true, ".", buf)); + } +#endif + } +} /////////////////////////////////////////////////////////////////////////// void @@ -401,14 +456,8 @@ Optimize(llvm::Module *module, int optLevel) { printf("*** Code going into optimization ***\n"); module->dump(); } - - llvm::PassManager optPM; - optPM.add(llvm::createVerifierPass()); - -#if 0 - std::string err; - optPM.add(llvm::createPrintModulePass(new llvm::raw_fd_ostream("-", err))); -#endif + DebugPassManager optPM; + optPM.add(llvm::createVerifierPass(),0); llvm::TargetLibraryInfo *targetLibraryInfo = new llvm::TargetLibraryInfo(llvm::Triple(module->getTargetTriple())); @@ -425,7 +474,7 @@ Optimize(llvm::Module *module, int optLevel) { optPM.add(new llvm::TargetTransformInfo(targetMachine->getScalarTargetTransformInfo(), targetMachine->getVectorTargetTransformInfo())); #else // LLVM 3.3+ - targetMachine->addAnalysisPasses(optPM); + targetMachine->addAnalysisPasses(optPM.getPM()); #endif #endif @@ -437,11 +486,11 @@ Optimize(llvm::Module *module, int optLevel) { // run absolutely no optimizations, since the front-end needs us to // take the various __pseudo_* functions it has emitted and turn // them into something that can actually execute. 
- optPM.add(CreateImproveMemoryOpsPass()); + optPM.add(CreateImproveMemoryOpsPass(), 100); if (g->opt.disableHandlePseudoMemoryOps == false) optPM.add(CreateReplacePseudoMemoryOpsPass()); - optPM.add(CreateIntrinsicsOptPass()); + optPM.add(CreateIntrinsicsOptPass(), 102); optPM.add(CreateIsCompileTimeConstantPass(true)); optPM.add(llvm::createFunctionInliningPass()); optPM.add(CreateMakeInternalFuncsStaticPass()); @@ -460,7 +509,7 @@ Optimize(llvm::Module *module, int optLevel) { llvm::initializeInstrumentation(*registry); llvm::initializeTarget(*registry); - optPM.add(llvm::createGlobalDCEPass()); + optPM.add(llvm::createGlobalDCEPass(), 200); // Early optimizations to try to reduce the total amount of code to // work with if we can @@ -469,16 +518,19 @@ Optimize(llvm::Module *module, int optLevel) { optPM.add(llvm::createDeadInstEliminationPass()); optPM.add(llvm::createCFGSimplificationPass()); + optPM.add(llvm::createPromoteMemoryToRegisterPass()); + optPM.add(llvm::createAggressiveDCEPass()); + if (g->opt.disableGatherScatterOptimizations == false && g->target->getVectorWidth() > 1) { - optPM.add(llvm::createInstructionCombiningPass()); + optPM.add(llvm::createInstructionCombiningPass(), 210); optPM.add(CreateImproveMemoryOpsPass()); } if (!g->opt.disableMaskAllOnOptimizations) { - optPM.add(CreateIntrinsicsOptPass()); - optPM.add(CreateVSelMovmskOptPass()); + optPM.add(CreateIntrinsicsOptPass(), 215); + optPM.add(CreateInstructionSimplifyPass()); } - optPM.add(llvm::createDeadInstEliminationPass()); + optPM.add(llvm::createDeadInstEliminationPass(), 220); // Max struct size threshold for scalar replacement is // 1) 4 fields (r,g,b,w) @@ -508,9 +560,10 @@ Optimize(llvm::Module *module, int optLevel) { #if defined(LLVM_3_1) || defined(LLVM_3_2) || defined(LLVM_3_3) // Starting from 3.4 this functionality was moved to // InstructionCombiningPass. See r184459 for details. - optPM.add(llvm::createSimplifyLibCallsPass()); + optPM.add(llvm::createSimplifyLibCallsPass(), 240); #endif - optPM.add(llvm::createInstructionCombiningPass()); + optPM.add(llvm::createAggressiveDCEPass()); + optPM.add(llvm::createInstructionCombiningPass(), 241); optPM.add(llvm::createJumpThreadingPass()); optPM.add(llvm::createCFGSimplificationPass()); optPM.add(llvm::createScalarReplAggregatesPass(sr_threshold)); @@ -518,75 +571,85 @@ Optimize(llvm::Module *module, int optLevel) { optPM.add(llvm::createTailCallEliminationPass()); if (!g->opt.disableMaskAllOnOptimizations) { - optPM.add(CreateIntrinsicsOptPass()); - optPM.add(CreateVSelMovmskOptPass()); + optPM.add(CreateIntrinsicsOptPass(), 250); + optPM.add(CreateInstructionSimplifyPass()); } if (g->opt.disableGatherScatterOptimizations == false && g->target->getVectorWidth() > 1) { - optPM.add(llvm::createInstructionCombiningPass()); + optPM.add(llvm::createInstructionCombiningPass(), 255); optPM.add(CreateImproveMemoryOpsPass()); if (g->opt.disableCoalescing == false && g->target->getISA() != Target::GENERIC) { // It is important to run this here to make it easier to // finding matching gathers we can coalesce.. 
- optPM.add(llvm::createEarlyCSEPass()); + optPM.add(llvm::createEarlyCSEPass(), 260); optPM.add(CreateGatherCoalescePass()); } } - optPM.add(llvm::createFunctionInliningPass()); + optPM.add(llvm::createFunctionInliningPass(), 265); optPM.add(llvm::createConstantPropagationPass()); optPM.add(CreateIntrinsicsOptPass()); - optPM.add(CreateVSelMovmskOptPass()); + optPM.add(CreateInstructionSimplifyPass()); if (g->opt.disableGatherScatterOptimizations == false && g->target->getVectorWidth() > 1) { - optPM.add(llvm::createInstructionCombiningPass()); + optPM.add(llvm::createInstructionCombiningPass(), 270); optPM.add(CreateImproveMemoryOpsPass()); } - optPM.add(llvm::createIPSCCPPass()); + optPM.add(llvm::createIPSCCPPass(), 275); optPM.add(llvm::createDeadArgEliminationPass()); + optPM.add(llvm::createAggressiveDCEPass()); optPM.add(llvm::createInstructionCombiningPass()); optPM.add(llvm::createCFGSimplificationPass()); - if (g->opt.disableHandlePseudoMemoryOps == false) - optPM.add(CreateReplacePseudoMemoryOpsPass()); - optPM.add(CreateIntrinsicsOptPass()); - optPM.add(CreateVSelMovmskOptPass()); + if (g->opt.disableHandlePseudoMemoryOps == false) { + optPM.add(CreateReplacePseudoMemoryOpsPass(),280); + } + optPM.add(CreateIntrinsicsOptPass(),281); + optPM.add(CreateInstructionSimplifyPass()); optPM.add(llvm::createFunctionInliningPass()); optPM.add(llvm::createArgumentPromotionPass()); optPM.add(llvm::createScalarReplAggregatesPass(sr_threshold, false)); optPM.add(llvm::createInstructionCombiningPass()); + optPM.add(CreateInstructionSimplifyPass()); optPM.add(llvm::createCFGSimplificationPass()); optPM.add(llvm::createReassociatePass()); optPM.add(llvm::createLoopRotatePass()); optPM.add(llvm::createLICMPass()); optPM.add(llvm::createLoopUnswitchPass(false)); optPM.add(llvm::createInstructionCombiningPass()); + optPM.add(CreateInstructionSimplifyPass()); optPM.add(llvm::createIndVarSimplifyPass()); optPM.add(llvm::createLoopIdiomPass()); optPM.add(llvm::createLoopDeletionPass()); - if (g->opt.unrollLoops) - optPM.add(llvm::createLoopUnrollPass()); - optPM.add(llvm::createGVNPass()); + if (g->opt.unrollLoops) { + optPM.add(llvm::createLoopUnrollPass(), 300); + } + optPM.add(llvm::createGVNPass(), 301); optPM.add(CreateIsCompileTimeConstantPass(true)); optPM.add(CreateIntrinsicsOptPass()); - optPM.add(CreateVSelMovmskOptPass()); + optPM.add(CreateInstructionSimplifyPass()); optPM.add(llvm::createMemCpyOptPass()); optPM.add(llvm::createSCCPPass()); optPM.add(llvm::createInstructionCombiningPass()); + optPM.add(CreateInstructionSimplifyPass()); optPM.add(llvm::createJumpThreadingPass()); optPM.add(llvm::createCorrelatedValuePropagationPass()); optPM.add(llvm::createDeadStoreEliminationPass()); optPM.add(llvm::createAggressiveDCEPass()); optPM.add(llvm::createCFGSimplificationPass()); optPM.add(llvm::createInstructionCombiningPass()); + optPM.add(CreateInstructionSimplifyPass()); + optPM.add(CreatePeepholePass()); + optPM.add(llvm::createFunctionInliningPass()); + optPM.add(llvm::createAggressiveDCEPass()); optPM.add(llvm::createStripDeadPrototypesPass()); optPM.add(CreateMakeInternalFuncsStaticPass()); optPM.add(llvm::createGlobalDCEPass()); @@ -595,7 +658,7 @@ Optimize(llvm::Module *module, int optLevel) { // Finish up by making sure we didn't mess anything up in the IR along // the way. 
- optPM.add(llvm::createVerifierPass()); + optPM.add(llvm::createVerifierPass(), LAST_OPT_NUMBER); optPM.run(*module); if (g->debugPrint) { @@ -670,14 +733,17 @@ IntrinsicsOpt::IntrinsicsOpt() // All of the mask instructions we may encounter. Note that even if // compiling for AVX, we may still encounter the regular 4-wide SSE // MOVMSK instruction. - llvm::Function *sseMovmsk = + llvm::Function *ssei8Movmsk = + llvm::Intrinsic::getDeclaration(m->module, llvm::Intrinsic::x86_sse2_pmovmskb_128); + maskInstructions.push_back(ssei8Movmsk); + llvm::Function *sseFloatMovmsk = llvm::Intrinsic::getDeclaration(m->module, llvm::Intrinsic::x86_sse_movmsk_ps); - maskInstructions.push_back(sseMovmsk); + maskInstructions.push_back(sseFloatMovmsk); maskInstructions.push_back(m->module->getFunction("__movmsk")); - llvm::Function *avxMovmsk = + llvm::Function *avxFloatMovmsk = llvm::Intrinsic::getDeclaration(m->module, llvm::Intrinsic::x86_avx_movmsk_ps_256); - Assert(avxMovmsk != NULL); - maskInstructions.push_back(avxMovmsk); + Assert(avxFloatMovmsk != NULL); + maskInstructions.push_back(avxFloatMovmsk); // And all of the blend instructions blendInstructions.push_back(BlendInstruction( @@ -924,80 +990,153 @@ CreateIntrinsicsOptPass() { @todo The better thing to do would be to submit a patch to LLVM to get these; they're presumably pretty simple patterns to match. */ -class VSelMovmskOpt : public llvm::BasicBlockPass { +class InstructionSimplifyPass : public llvm::BasicBlockPass { public: - VSelMovmskOpt() + InstructionSimplifyPass() : BasicBlockPass(ID) { } const char *getPassName() const { return "Vector Select Optimization"; } bool runOnBasicBlock(llvm::BasicBlock &BB); static char ID; + +private: + static bool simplifySelect(llvm::SelectInst *selectInst, + llvm::BasicBlock::iterator iter); + static llvm::Value *simplifyBoolVec(llvm::Value *value); + static bool simplifyCall(llvm::CallInst *callInst, + llvm::BasicBlock::iterator iter); }; -char VSelMovmskOpt::ID = 0; +char InstructionSimplifyPass::ID = 0; + + +llvm::Value * +InstructionSimplifyPass::simplifyBoolVec(llvm::Value *value) { + llvm::TruncInst *trunc = llvm::dyn_cast(value); + if (trunc != NULL) { + // Convert trunc({sext,zext}(i1 vector)) -> (i1 vector) + llvm::SExtInst *sext = llvm::dyn_cast(value); + if (sext && + sext->getOperand(0)->getType() == LLVMTypes::Int1VectorType) + return sext->getOperand(0); + + llvm::ZExtInst *zext = llvm::dyn_cast(value); + if (zext && + zext->getOperand(0)->getType() == LLVMTypes::Int1VectorType) + return zext->getOperand(0); + } + + llvm::ICmpInst *icmp = llvm::dyn_cast(value); + if (icmp != NULL) { + // icmp(ne, {sext,zext}(foo), zeroinitializer) -> foo + if (icmp->getSignedPredicate() == llvm::CmpInst::ICMP_NE) { + llvm::Value *op1 = icmp->getOperand(1); + if (llvm::isa(op1)) { + llvm::Value *op0 = icmp->getOperand(0); + llvm::SExtInst *sext = llvm::dyn_cast(op0); + if (sext) + return sext->getOperand(0); + llvm::ZExtInst *zext = llvm::dyn_cast(op0); + if (zext) + return zext->getOperand(0); + } + } + } + return NULL; +} bool -VSelMovmskOpt::runOnBasicBlock(llvm::BasicBlock &bb) { - DEBUG_START_PASS("VSelMovmaskOpt"); +InstructionSimplifyPass::simplifySelect(llvm::SelectInst *selectInst, + llvm::BasicBlock::iterator iter) { + if (selectInst->getType()->isVectorTy() == false) + return false; + + llvm::Value *factor = selectInst->getOperand(0); + + // Simplify all-on or all-off mask values + MaskStatus maskStatus = lGetMaskStatus(factor); + llvm::Value *value = NULL; + if (maskStatus == ALL_ON) + // Mask 
all on -> replace with the first select value + value = selectInst->getOperand(1); + else if (maskStatus == ALL_OFF) + // Mask all off -> replace with the second select value + value = selectInst->getOperand(2); + if (value != NULL) { + llvm::ReplaceInstWithValue(iter->getParent()->getInstList(), + iter, value); + return true; + } + + // Sometimes earlier LLVM optimization passes generate unnecessarily + // complex expressions for the selection vector, which in turn confuses + // the code generators and leads to sub-optimal code (particularly for + // 8 and 16-bit masks). We'll try to simplify them out here so that + // the code generator patterns match.. + if ((factor = simplifyBoolVec(factor)) != NULL) { + llvm::Instruction *newSelect = + llvm::SelectInst::Create(factor, selectInst->getOperand(1), + selectInst->getOperand(2), + selectInst->getName()); + llvm::ReplaceInstWithInst(selectInst, newSelect); + return true; + } + + return false; +} + + +bool +InstructionSimplifyPass::simplifyCall(llvm::CallInst *callInst, + llvm::BasicBlock::iterator iter) { + llvm::Function *calledFunc = callInst->getCalledFunction(); + + // Turn a __movmsk call with a compile-time constant vector into the + // equivalent scalar value. + if (calledFunc == NULL || calledFunc != m->module->getFunction("__movmsk")) + return false; + + uint64_t mask; + if (lGetMask(callInst->getArgOperand(0), &mask) == true) { + llvm::ReplaceInstWithValue(iter->getParent()->getInstList(), + iter, LLVMInt64(mask)); + return true; + } + return false; +} + + +bool +InstructionSimplifyPass::runOnBasicBlock(llvm::BasicBlock &bb) { + DEBUG_START_PASS("InstructionSimplify"); bool modifiedAny = false; restart: for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) { llvm::SelectInst *selectInst = llvm::dyn_cast(&*iter); - if (selectInst != NULL && selectInst->getType()->isVectorTy()) { - llvm::Value *factor = selectInst->getOperand(0); - - MaskStatus maskStatus = lGetMaskStatus(factor); - llvm::Value *value = NULL; - if (maskStatus == ALL_ON) - // Mask all on -> replace with the first select value - value = selectInst->getOperand(1); - else if (maskStatus == ALL_OFF) - // Mask all off -> replace with the second select value - value = selectInst->getOperand(2); - - if (value != NULL) { - llvm::ReplaceInstWithValue(iter->getParent()->getInstList(), - iter, value); - modifiedAny = true; - goto restart; - } + if (selectInst && simplifySelect(selectInst, iter)) { + modifiedAny = true; + goto restart; } - llvm::CallInst *callInst = llvm::dyn_cast(&*iter); - if (callInst == NULL) - continue; - - llvm::Function *calledFunc = callInst->getCalledFunction(); - if (calledFunc == NULL || calledFunc != m->module->getFunction("__movmsk")) - continue; - - uint64_t mask; - if (lGetMask(callInst->getArgOperand(0), &mask) == true) { -#if 0 - fprintf(stderr, "mask %d\n", mask); - callInst->getArgOperand(0)->dump(); - fprintf(stderr, "-----------\n"); -#endif - llvm::ReplaceInstWithValue(iter->getParent()->getInstList(), - iter, LLVMInt64(mask)); + if (callInst && simplifyCall(callInst, iter)) { modifiedAny = true; goto restart; } } - DEBUG_END_PASS("VSelMovMskOpt"); + DEBUG_END_PASS("InstructionSimplify"); return modifiedAny; } static llvm::Pass * -CreateVSelMovmskOptPass() { - return new VSelMovmskOpt; +CreateInstructionSimplifyPass() { + return new InstructionSimplifyPass; } @@ -4240,6 +4379,42 @@ CreateIsCompileTimeConstantPass(bool isLastTry) { return new IsCompileTimeConstantPass(isLastTry); } 
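One of the rewrites in the InstructionSimplifyPass above (simplifyCall) folds a __movmsk call whose argument is a compile-time-constant mask into a plain 64-bit integer. A standalone model of that folding, for illustration only (the real pass inspects LLVM IR through lGetMask, and the exact bit layout is determined by the target):

    #include <cstdint>
    #include <cstdio>

    // Model of folding __movmsk on a known mask: lane i contributes bit i
    // when that lane of the mask is on.
    static uint64_t foldMovmsk(const bool *lanes, int width) {
        uint64_t bits = 0;
        for (int i = 0; i < width; ++i)
            if (lanes[i])
                bits |= (uint64_t)1 << i;
        return bits;
    }

    int main() {
        const bool lanes[8] = { true, true, false, false, true, false, false, true };
        // Lanes 0, 1, 4 and 7 are on, so this prints 0x93.
        printf("__movmsk folds to 0x%llx\n", (unsigned long long)foldMovmsk(lanes, 8));
        return 0;
    }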
+////////////////////////////////////////////////////////////////////////// +// DebugPass + +/** This pass is added in list of passes after optimizations which + we want to debug and print dump of LLVM IR in stderr. Also it + prints name and number of previous optimization. + */ +class DebugPass : public llvm::ModulePass { +public: + static char ID; + DebugPass(char * output) : ModulePass(ID) { + sprintf(str_output, "%s", output); + } + + const char *getPassName() const { return "Dump LLVM IR"; } + bool runOnModule(llvm::Module &m); + +private: + char str_output[100]; +}; + +char DebugPass::ID = 0; + +bool +DebugPass::runOnModule(llvm::Module &module) { + fprintf(stderr, "%s", str_output); + fflush(stderr); + module.dump(); + return true; +} + +static llvm::Pass * +CreateDebugPass(char * output) { + return new DebugPass(output); +} + /////////////////////////////////////////////////////////////////////////// // MakeInternalFuncsStaticPass @@ -4273,6 +4448,14 @@ char MakeInternalFuncsStaticPass::ID = 0; bool MakeInternalFuncsStaticPass::runOnModule(llvm::Module &module) { const char *names[] = { + "__avg_up_uint8", + "__avg_up_int8", + "__avg_up_uint16", + "__avg_up_int16", + "__avg_down_uint8", + "__avg_down_int8", + "__avg_down_uint16", + "__avg_down_int16", "__fast_masked_vload", "__gather_factored_base_offsets32_i8", "__gather_factored_base_offsets32_i16", "__gather_factored_base_offsets32_i32", "__gather_factored_base_offsets32_i64", @@ -4352,3 +4535,391 @@ static llvm::Pass * CreateMakeInternalFuncsStaticPass() { return new MakeInternalFuncsStaticPass; } + + +/////////////////////////////////////////////////////////////////////////// +// PeepholePass + +class PeepholePass : public llvm::BasicBlockPass { +public: + PeepholePass(); + + const char *getPassName() const { return "Peephole Optimizations"; } + bool runOnBasicBlock(llvm::BasicBlock &BB); + + static char ID; +}; + +char PeepholePass::ID = 0; + +PeepholePass::PeepholePass() + : BasicBlockPass(ID) { +} + +#if !defined(LLVM_3_1) && !defined(LLVM_3_2) + +using namespace llvm::PatternMatch; + +template +struct CastClassTypes_match { + Op_t Op; + const llvm::Type *fromType, *toType; + + CastClassTypes_match(const Op_t &OpMatch, const llvm::Type *f, + const llvm::Type *t) + : Op(OpMatch), fromType(f), toType(t) {} + + template + bool match(OpTy *V) { + if (llvm::Operator *O = llvm::dyn_cast(V)) + return (O->getOpcode() == Opcode && Op.match(O->getOperand(0)) && + O->getType() == toType && + O->getOperand(0)->getType() == fromType); + return false; + } +}; + +template +inline CastClassTypes_match +m_SExt8To16(const OpTy &Op) { + return CastClassTypes_match( + Op, + LLVMTypes::Int8VectorType, + LLVMTypes::Int16VectorType); +} + +template +inline CastClassTypes_match +m_ZExt8To16(const OpTy &Op) { + return CastClassTypes_match( + Op, + LLVMTypes::Int8VectorType, + LLVMTypes::Int16VectorType); +} + + +template +inline CastClassTypes_match +m_Trunc16To8(const OpTy &Op) { + return CastClassTypes_match( + Op, + LLVMTypes::Int16VectorType, + LLVMTypes::Int8VectorType); +} + +template +inline CastClassTypes_match +m_SExt16To32(const OpTy &Op) { + return CastClassTypes_match( + Op, + LLVMTypes::Int16VectorType, + LLVMTypes::Int32VectorType); +} + +template +inline CastClassTypes_match +m_ZExt16To32(const OpTy &Op) { + return CastClassTypes_match( + Op, + LLVMTypes::Int16VectorType, + LLVMTypes::Int32VectorType); +} + + +template +inline CastClassTypes_match +m_Trunc32To16(const OpTy &Op) { + return CastClassTypes_match( + Op, + 
LLVMTypes::Int32VectorType, + LLVMTypes::Int16VectorType); +} + +template +struct UDiv2_match { + Op_t Op; + + UDiv2_match(const Op_t &OpMatch) + : Op(OpMatch) {} + + template + bool match(OpTy *V) { + llvm::BinaryOperator *bop; + llvm::ConstantDataVector *cdv; + if ((bop = llvm::dyn_cast(V)) && + (cdv = llvm::dyn_cast(bop->getOperand(1))) && + cdv->getSplatValue() != NULL) { + const llvm::APInt &apInt = cdv->getUniqueInteger(); + + switch (bop->getOpcode()) { + case llvm::Instruction::UDiv: + // divide by 2 + return (apInt.isIntN(2) && Op.match(bop->getOperand(0))); + case llvm::Instruction::LShr: + // shift left by 1 + return (apInt.isIntN(1) && Op.match(bop->getOperand(0))); + default: + return false; + } + } + return false; + } +}; + +template +inline UDiv2_match +m_UDiv2(const V &v) { + return UDiv2_match(v); +} + +template +struct SDiv2_match { + Op_t Op; + + SDiv2_match(const Op_t &OpMatch) + : Op(OpMatch) {} + + template + bool match(OpTy *V) { + llvm::BinaryOperator *bop; + llvm::ConstantDataVector *cdv; + if ((bop = llvm::dyn_cast(V)) && + (cdv = llvm::dyn_cast(bop->getOperand(1))) && + cdv->getSplatValue() != NULL) { + const llvm::APInt &apInt = cdv->getUniqueInteger(); + + switch (bop->getOpcode()) { + case llvm::Instruction::SDiv: + // divide by 2 + return (apInt.isIntN(2) && Op.match(bop->getOperand(0))); + case llvm::Instruction::AShr: + // shift left by 1 + return (apInt.isIntN(1) && Op.match(bop->getOperand(0))); + default: + return false; + } + } + return false; + } +}; + +template +inline SDiv2_match +m_SDiv2(const V &v) { + return SDiv2_match(v); +} + +// Returns true if the given function has a call to an intrinsic function +// in its definition. +static bool +lHasIntrinsicInDefinition(llvm::Function *func) { + llvm::Function::iterator bbiter = func->begin(); + for (; bbiter != func->end(); ++bbiter) { + for (llvm::BasicBlock::iterator institer = bbiter->begin(); + institer != bbiter->end(); ++institer) { + if (llvm::isa(institer)) + return true; + } + } + return false; +} + +static llvm::Instruction * +lGetBinaryIntrinsic(const char *name, llvm::Value *opa, llvm::Value *opb) { + llvm::Function *func = m->module->getFunction(name); + Assert(func != NULL); + + // Make sure that the definition of the llvm::Function has a call to an + // intrinsic function in its instructions; otherwise we will generate + // infinite loops where we "helpfully" turn the default implementations + // of target builtins like __avg_up_uint8 that are implemented with plain + // arithmetic ops into recursive calls to themselves. 
+ if (lHasIntrinsicInDefinition(func)) + return lCallInst(func, opa, opb, name); + else + return NULL; +} + +////////////////////////////////////////////////// + +static llvm::Instruction * +lMatchAvgUpUInt8(llvm::Value *inst) { + // (unsigned int8)(((unsigned int16)a + (unsigned int16)b + 1)/2) + llvm::Value *opa, *opb; + const llvm::APInt *delta; + if (match(inst, m_Trunc16To8(m_UDiv2(m_CombineOr( + m_CombineOr( + m_Add(m_ZExt8To16(m_Value(opa)), + m_Add(m_ZExt8To16(m_Value(opb)), m_APInt(delta))), + m_Add(m_Add(m_ZExt8To16(m_Value(opa)), m_APInt(delta)), + m_ZExt8To16(m_Value(opb)))), + m_Add(m_Add(m_ZExt8To16(m_Value(opa)), m_ZExt8To16(m_Value(opb))), + m_APInt(delta))))))) { + if (delta->isIntN(1) == false) + return NULL; + + return lGetBinaryIntrinsic("__avg_up_uint8", opa, opb); + } + return NULL; +} + + +static llvm::Instruction * +lMatchAvgDownUInt8(llvm::Value *inst) { + // (unsigned int8)(((unsigned int16)a + (unsigned int16)b)/2) + llvm::Value *opa, *opb; + if (match(inst, m_Trunc16To8(m_UDiv2( + m_Add(m_ZExt8To16(m_Value(opa)), + m_ZExt8To16(m_Value(opb))))))) { + return lGetBinaryIntrinsic("__avg_down_uint8", opa, opb); + } + return NULL; +} + +static llvm::Instruction * +lMatchAvgUpUInt16(llvm::Value *inst) { + // (unsigned int16)(((unsigned int32)a + (unsigned int32)b + 1)/2) + llvm::Value *opa, *opb; + const llvm::APInt *delta; + if (match(inst, m_Trunc32To16(m_UDiv2(m_CombineOr( + m_CombineOr( + m_Add(m_ZExt16To32(m_Value(opa)), + m_Add(m_ZExt16To32(m_Value(opb)), m_APInt(delta))), + m_Add(m_Add(m_ZExt16To32(m_Value(opa)), m_APInt(delta)), + m_ZExt16To32(m_Value(opb)))), + m_Add(m_Add(m_ZExt16To32(m_Value(opa)), m_ZExt16To32(m_Value(opb))), + m_APInt(delta))))))) { + if (delta->isIntN(1) == false) + return NULL; + + return lGetBinaryIntrinsic("__avg_up_uint16", opa, opb); + } + return NULL; +} + + +static llvm::Instruction * +lMatchAvgDownUInt16(llvm::Value *inst) { + // (unsigned int16)(((unsigned int32)a + (unsigned int32)b)/2) + llvm::Value *opa, *opb; + if (match(inst, m_Trunc32To16(m_UDiv2( + m_Add(m_ZExt16To32(m_Value(opa)), + m_ZExt16To32(m_Value(opb))))))) { + return lGetBinaryIntrinsic("__avg_down_uint16", opa, opb); + } + return NULL; +} + + +static llvm::Instruction * +lMatchAvgUpInt8(llvm::Value *inst) { + // (int8)(((int16)a + (int16)b + 1)/2) + llvm::Value *opa, *opb; + const llvm::APInt *delta; + if (match(inst, m_Trunc16To8(m_SDiv2(m_CombineOr( + m_CombineOr( + m_Add(m_SExt8To16(m_Value(opa)), + m_Add(m_SExt8To16(m_Value(opb)), m_APInt(delta))), + m_Add(m_Add(m_SExt8To16(m_Value(opa)), m_APInt(delta)), + m_SExt8To16(m_Value(opb)))), + m_Add(m_Add(m_SExt8To16(m_Value(opa)), m_SExt8To16(m_Value(opb))), + m_APInt(delta))))))) { + if (delta->isIntN(1) == false) + return NULL; + + return lGetBinaryIntrinsic("__avg_up_int8", opa, opb); + } + return NULL; +} + + +static llvm::Instruction * +lMatchAvgDownInt8(llvm::Value *inst) { + // (int8)(((int16)a + (int16)b)/2) + llvm::Value *opa, *opb; + if (match(inst, m_Trunc16To8(m_SDiv2( + m_Add(m_SExt8To16(m_Value(opa)), + m_SExt8To16(m_Value(opb))))))) { + return lGetBinaryIntrinsic("__avg_down_int8", opa, opb); + } + return NULL; +} + +static llvm::Instruction * +lMatchAvgUpInt16(llvm::Value *inst) { + // (int16)(((int32)a + (int32)b + 1)/2) + llvm::Value *opa, *opb; + const llvm::APInt *delta; + if (match(inst, m_Trunc32To16(m_SDiv2(m_CombineOr( + m_CombineOr( + m_Add(m_SExt16To32(m_Value(opa)), + m_Add(m_SExt16To32(m_Value(opb)), m_APInt(delta))), + m_Add(m_Add(m_SExt16To32(m_Value(opa)), m_APInt(delta)), + 
m_SExt16To32(m_Value(opb)))), + m_Add(m_Add(m_SExt16To32(m_Value(opa)), m_SExt16To32(m_Value(opb))), + m_APInt(delta))))))) { + if (delta->isIntN(1) == false) + return NULL; + + return lGetBinaryIntrinsic("__avg_up_int16", opa, opb); + } + return NULL; +} + +static llvm::Instruction * +lMatchAvgDownInt16(llvm::Value *inst) { + // (int16)(((int32)a + (int32)b)/2) + llvm::Value *opa, *opb; + if (match(inst, m_Trunc32To16(m_SDiv2( + m_Add(m_SExt16To32(m_Value(opa)), + m_SExt16To32(m_Value(opb))))))) { + return lGetBinaryIntrinsic("__avg_down_int16", opa, opb); + } + return NULL; +} +#endif // !LLVM_3_1 && !LLVM_3_2 + +bool +PeepholePass::runOnBasicBlock(llvm::BasicBlock &bb) { + DEBUG_START_PASS("PeepholePass"); + + bool modifiedAny = false; + restart: + for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) { + llvm::Instruction *inst = &*iter; + + llvm::Instruction *builtinCall = NULL; +#if !defined(LLVM_3_1) && !defined(LLVM_3_2) + if (!builtinCall) + builtinCall = lMatchAvgUpUInt8(inst); + if (!builtinCall) + builtinCall = lMatchAvgUpUInt16(inst); + if (!builtinCall) + builtinCall = lMatchAvgDownUInt8(inst); + if (!builtinCall) + builtinCall = lMatchAvgDownUInt16(inst); + if (!builtinCall) + builtinCall = lMatchAvgUpInt8(inst); + if (!builtinCall) + builtinCall = lMatchAvgUpInt16(inst); + if (!builtinCall) + builtinCall = lMatchAvgDownInt8(inst); + if (!builtinCall) + builtinCall = lMatchAvgDownInt16(inst); +#endif // !LLVM_3_1 && !LLVM_3_2 + if (builtinCall != NULL) { + llvm::ReplaceInstWithInst(inst, builtinCall); + modifiedAny = true; + goto restart; + } + } + + DEBUG_END_PASS("PeepholePass"); + + return modifiedAny; +} + +static llvm::Pass * +CreatePeepholePass() { + return new PeepholePass; +} diff --git a/parse.yy b/parse.yy index 3ad815cf..933a3455 100644 --- a/parse.yy +++ b/parse.yy @@ -149,7 +149,8 @@ struct ForeachDimension { %union { uint64_t intVal; - float floatVal; + float floatVal; + double doubleVal; std::string *stringVal; const char *constCharPtr; @@ -179,11 +180,13 @@ struct ForeachDimension { } +%token TOKEN_INT8_CONSTANT TOKEN_UINT8_CONSTANT +%token TOKEN_INT16_CONSTANT TOKEN_UINT16_CONSTANT %token TOKEN_INT32_CONSTANT TOKEN_UINT32_CONSTANT %token TOKEN_INT64_CONSTANT TOKEN_UINT64_CONSTANT %token TOKEN_INT32DOTDOTDOT_CONSTANT TOKEN_UINT32DOTDOTDOT_CONSTANT %token TOKEN_INT64DOTDOTDOT_CONSTANT TOKEN_UINT64DOTDOTDOT_CONSTANT -%token TOKEN_FLOAT_CONSTANT TOKEN_STRING_C_LITERAL +%token TOKEN_FLOAT_CONSTANT TOKEN_DOUBLE_CONSTANT TOKEN_STRING_C_LITERAL %token TOKEN_IDENTIFIER TOKEN_STRING_LITERAL TOKEN_TYPE_NAME TOKEN_NULL %token TOKEN_PTR_OP TOKEN_INC_OP TOKEN_DEC_OP TOKEN_LEFT_OP TOKEN_RIGHT_OP %token TOKEN_LE_OP TOKEN_GE_OP TOKEN_EQ_OP TOKEN_NE_OP @@ -291,6 +294,22 @@ primary_expression Error(@1, "Undeclared symbol \"%s\".%s", name, alts.c_str()); } } + | TOKEN_INT8_CONSTANT { + $$ = new ConstExpr(AtomicType::UniformInt8->GetAsConstType(), + (int8_t)yylval.intVal, @1); + } + | TOKEN_UINT8_CONSTANT { + $$ = new ConstExpr(AtomicType::UniformUInt8->GetAsConstType(), + (uint8_t)yylval.intVal, @1); + } + | TOKEN_INT16_CONSTANT { + $$ = new ConstExpr(AtomicType::UniformInt16->GetAsConstType(), + (int16_t)yylval.intVal, @1); + } + | TOKEN_UINT16_CONSTANT { + $$ = new ConstExpr(AtomicType::UniformUInt16->GetAsConstType(), + (uint16_t)yylval.intVal, @1); + } | TOKEN_INT32_CONSTANT { $$ = new ConstExpr(AtomicType::UniformInt32->GetAsConstType(), (int32_t)yylval.intVal, @1); @@ -309,7 +328,11 @@ primary_expression } | TOKEN_FLOAT_CONSTANT { $$ = new 
ConstExpr(AtomicType::UniformFloat->GetAsConstType(), - (float)yylval.floatVal, @1); + yylval.floatVal, @1); + } + | TOKEN_DOUBLE_CONSTANT { + $$ = new ConstExpr(AtomicType::UniformDouble->GetAsConstType(), + yylval.doubleVal, @1); } | TOKEN_TRUE { $$ = new ConstExpr(AtomicType::UniformBool->GetAsConstType(), true, @1); @@ -1233,7 +1256,10 @@ declarator ; int_constant - : TOKEN_INT32_CONSTANT { $$ = yylval.intVal; } + : TOKEN_INT8_CONSTANT { $$ = yylval.intVal; } + | TOKEN_INT16_CONSTANT { $$ = yylval.intVal; } + | TOKEN_INT32_CONSTANT { $$ = yylval.intVal; } + | TOKEN_INT64_CONSTANT { $$ = yylval.intVal; } ; direct_declarator @@ -2148,8 +2174,27 @@ lAddFunctionParams(Declarator *decl) { /** Add a symbol for the built-in mask variable to the symbol table */ static void lAddMaskToSymbolTable(SourcePos pos) { - const Type *t = g->target->getMaskBitCount() == 1 ? - AtomicType::VaryingBool : AtomicType::VaryingUInt32; + const Type *t = NULL; + switch (g->target->getMaskBitCount()) { + case 1: + t = AtomicType::VaryingBool; + break; + case 8: + t = AtomicType::VaryingUInt8; + break; + case 16: + t = AtomicType::VaryingUInt16; + break; + case 32: + t = AtomicType::VaryingUInt32; + break; + case 64: + t = AtomicType::VaryingUInt64; + break; + default: + FATAL("Unhandled mask bitsize in lAddMaskToSymbolTable"); + } + t = t->GetAsConstType(); Symbol *maskSymbol = new Symbol("__mask", pos, t); m->symbolTable->AddVariable(maskSymbol); @@ -2241,7 +2286,11 @@ lGetConstantInt(Expr *expr, int *value, SourcePos pos, const char *usage) { Error(pos, "%s must be representable with a 32-bit integer.", usage); return false; } - *value = (int)ci->getZExtValue(); + const Type *type = expr->GetType(); + if (type->IsUnsignedType()) + *value = (int)ci->getZExtValue(); + else + *value = (int)ci->getSExtValue(); return true; } } diff --git a/examples/perf.ini b/perf.ini similarity index 84% rename from examples/perf.ini rename to perf.ini index 3814bf16..249c25f4 100755 --- a/examples/perf.ini +++ b/perf.ini @@ -10,44 +10,48 @@ %**************************************************************************************************** AOBench aobench -ao 10 512 512 +10 512 512 #*** Deferred Shading deferred -deferred_shading data/pp1280x720.bin +data/pp1280x720.bin #*** Mandelbrot Set mandelbrot -mandelbrot + #*** Mandelbrot Set mandelbrot_tasks -mandelbrot + ^ #*** Perlin Noise Function noise -noise + #*** Binomial Options options -options + ! 1 2 #*** Black-Scholes Options options -options + ! 2 2 #*** Ray Tracer rt -rt sponza +sponza #*** 3D Stencil stencil -stencil + #*** Volume Rendering volume_rendering -volume camera.dat density_highres.vol +camera.dat density_highres.vol +#*** +Sort +sort +1000000 1 #*** diff --git a/perf.py b/perf.py new file mode 100755 index 00000000..576a5c7d --- /dev/null +++ b/perf.py @@ -0,0 +1,489 @@ +#!/usr/bin/python +# +# Copyright (c) 2013, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +# IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +# OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# // Author: Filippov Ilia + +def print_file(line): + if options.output != "": + output = open(options.output, 'w') + output.writelines(line) + output.close() + +def build_test(commands): + os.system(commands[4]) + test = os.system(commands[1]) + if options.ref: + ref = os.system(commands[3]) + return (options.ref and ref) or test + +def execute_test(commands): + r = 0 + common.remove_if_exists(perf_temp+"_test") + common.remove_if_exists(perf_temp+"_ref") + for k in range(int(options.number)): + r = r + os.system(commands[0]) + if options.ref: + r = r + os.system(commands[2]) + return r + +#gathers all tests results and made an item test from answer structure +def run_test(commands, c1, c2, test, test_ref, b_serial): + if build_test(commands) != 0: + error("Compilation fails of test %s\n" % test[0], 0) + return + if execute_test(commands) != 0: + error("Execution fails of test %s\n" % test[0], 0) + return + print_debug("TEST COMPILER:\n", s, perf_log) + analyse_test(c1, c2, test, b_serial, perf_temp+"_test") + if options.ref: + print_debug("REFERENCE COMPILER:\n", s, perf_log) + analyse_test(c1, c2, test_ref, b_serial, perf_temp+"_ref") + + +def analyse_test(c1, c2, test, b_serial, perf_temp_n): + tasks = [] #list of results with tasks, it will be test[2] + ispc = [] #list of results without tasks, it will be test[1] + absolute_tasks = [] #list of absolute results with tasks, it will be test[4] + absolute_ispc = [] #list of absolute results without tasks, ut will be test[3] + serial = [] #list serial times, it will be test[5] + j = 1 + for line in open(perf_temp_n): # we take test output + if "speedup" in line: # we are interested only in lines with speedup + if j == c1: # we are interested only in lines with c1 numbers + line = line.expandtabs(0) + line = line.replace("("," ") + line = line.split(",") + for i in range(len(line)): + subline = line[i].split(" ") + number = float(subline[1][:-1]) + if "speedup from ISPC + tasks" in line[i]: + tasks.append(number) + else: + ispc.append(number) + c1 = c1 + c2 + j+=1 + if "million cycles" in line: + if j == c1: + line = line.replace("]","[") + line = line.split("[") + number = float(line[3]) + if "tasks" in line[1]: + absolute_tasks.append(number) + else: + if "ispc" in line[1]: + absolute_ispc.append(number) + if "serial" in line[1]: + serial.append(number) + + if len(ispc) != 0: + if len(tasks) != 0: + print_debug("ISPC speedup / ISPC + tasks speedup / ISPC time / ISPC + tasks time / serial time\n", s, perf_log) + for i in range(0,len(serial)): + print_debug("%10s 
/\t%10s\t /%9s / %10s\t /%10s\n" % + (ispc[i], tasks[i], absolute_ispc[i], absolute_tasks[i], serial[i]), s, perf_log) + else: + print_debug("ISPC speedup / ISPC time / serial time\n", s, perf_log) + for i in range(0,len(serial)): + print_debug("%10s /%9s /%10s\n" % (ispc[i], absolute_ispc[i], serial[i]), s, perf_log) + else: + if len(tasks) != 0: + print_debug("ISPC + tasks speedup / ISPC + tasks time / serial time\n", s, perf_log) + for i in range(0,len(serial)): + print_debug("%10s\t / %10s\t /%10s\n" % (tasks[i], absolute_tasks[i], serial[i]), s, perf_log) + + test[1] = test[1] + ispc + test[2] = test[2] + tasks + test[3] = test[3] + absolute_ispc + test[4] = test[4] + absolute_tasks + if b_serial == True: + #if we concatenate outputs we should use only the first serial answer. + test[5] = test[5] + serial + +def cpu_get(): + p = open("/proc/stat", 'r') + cpu = p.readline() + p.close() + cpu = cpu.split(" ") + cpu_usage = (int(cpu[2]) + int(cpu[3]) + int(cpu[4])) + cpu_all = cpu_usage + int(cpu[5]) + return [cpu_usage, cpu_all] + +#returns cpu_usage +def cpu_check(): + if is_windows == False: + if is_mac == False: + cpu1 = cpu_get() + time.sleep(1) + cpu2 = cpu_get() + cpu_percent = (float(cpu1[0] - cpu2[0])/float(cpu1[1] - cpu2[1]))*100 + else: + os.system("sysctl -n vm.loadavg > cpu_temp") + c = open("cpu_temp", 'r') + c_line = c.readline() + c.close + os.remove("cpu_temp") + R = c_line.split(' ') + cpu_percent = float(R[1]) * 3 + else: + os.system("wmic cpu get loadpercentage /value > cpu_temp") + c = open("cpu_temp", 'r') + c_lines = c.readlines() + c.close() + os.remove("cpu_temp") + t = "0" + for i in c_lines[2]: + if i.isdigit(): + t = t + i + cpu_percent = int(t) + return cpu_percent + +#returns geomean of list +def geomean(par): + temp = 1 + l = len(par) + for i in range(l): + temp = temp * par[i] + temp = temp ** (1.0/l) + return round(temp, 2) + +#takes an answer struct and print it. 
+#answer struct: list answer contains lists test +#test[0] - name of test +#test[1] - list of results without tasks +#test[2] - list of results with tasks +#test[3] - list of absolute results without tasks +#test[4] - list of absolute results with tasks +#test[5] - list of absolute time without ISPC (serial) +#test[1..4] may be empty +def print_answer(answer): + filelist = [] + print_debug("--------------------------------------------------------------------------\n", s, perf_log) + print_debug("test name:\t ISPC speedup: ISPC + tasks speedup: | " + + " ISPC time: ISPC + tasks time: serial:\n", s, perf_log) + filelist.append("test name,ISPC speedup,diff," + + "ISPC + tasks speedup,diff,ISPC time,diff,ISPC + tasks time,diff,serial,diff\n") + max_t = [0,0,0,0,0] + diff_t = [0,0,0,0,0] + geomean_t = [0,0,0,0,0] + list_of_max = [[],[],[],[],[]] + list_of_compare = [[],[],[],[],[],[]] + for i in range(len(answer)): + list_of_compare[0].append(answer[i][0]) + for t in range(1,6): + if len(answer[i][t]) == 0: + max_t[t-1] = "n/a" + diff_t[t-1] = "n/a" + list_of_compare[t].append(0); + else: + if t < 3: + mm = max(answer[i][t]) + else: + mm = min(answer[i][t]) + list_of_compare[t].append(mm) + max_t[t-1] = '%.2f' % mm + list_of_max[t-1].append(mm) + diff_t[t-1] = '%.2f' % (max(answer[i][t]) - min(answer[i][t])) + print_debug("%s:\n" % answer[i][0], s, perf_log) + print_debug("\t\tmax:\t%5s\t\t%10s\t|min:%10s\t%10s\t%10s\n" % + (max_t[0], max_t[1], max_t[2], max_t[3], max_t[4]), s, perf_log) + print_debug("\t\tdiff:\t%5s\t\t%10s\t|%14s\t%10s\t%10s\n" % + (diff_t[0], diff_t[1], diff_t[2], diff_t[3], diff_t[4]), s, perf_log) + for t in range(0,5): + if max_t[t] == "n/a": + max_t[t] = "" + if diff_t[t] == "n/a": + diff_t[t] = "" + filelist.append(answer[i][0] + "," + + max_t[0] + "," + diff_t[0] + "," + max_t[1] + "," + diff_t[1] + "," + + max_t[2] + "," + diff_t[2] + "," + max_t[3] + "," + diff_t[3] + "," + + max_t[4] + "," + diff_t[4] + "\n") + for i in range(0,5): + geomean_t[i] = geomean(list_of_max[i]) + print_debug("---------------------------------------------------------------------------------\n", s, perf_log) + print_debug("Geomean:\t\t%5s\t\t%10s\t|%14s\t%10s\t%10s\n" % + (geomean_t[0], geomean_t[1], geomean_t[2], geomean_t[3], geomean_t[4]), s, perf_log) + filelist.append("Geomean," + str(geomean_t[0]) + ",," + str(geomean_t[1]) + + ",," + str(geomean_t[2]) + ",," + str(geomean_t[3]) + ",," + str(geomean_t[4]) + "\n") + print_file(filelist) + return list_of_compare + + +def compare(A, B): + print_debug("\n\n_____________________PERFORMANCE REPORT____________________________\n", False, "") + print_debug("test name: ISPC time: ISPC time ref: %:\n", False, "") + for i in range(0,len(A[0])): + if B[3][i] == 0: + p1 = 0 + else: + p1 = 100 - 100 * A[3][i]/B[3][i] + print_debug("%21s: %10.2f %10.2f %10.2f" % (A[0][i], A[3][i], B[3][i], abs(p1)), False, "") + if p1 < -1: + print_debug(" <+", False, "") + if p1 > 1: + print_debug(" <-", False, "") + print_debug("\n", False, "") + print_debug("\n", False, "") + + print_debug("test name: TASKS time: TASKS time ref: %:\n", False, "") + for i in range(0,len(A[0])): + if B[4][i] == 0: + p2 = 0 + else: + p2 = 100 - 100 * A[4][i]/B[4][i] + print_debug("%21s: %10.2f %10.2f %10.2f" % (A[0][i], A[4][i], B[4][i], abs(p2)), False, "") + if p2 < -1: + print_debug(" <+", False, "") + if p2 > 1: + print_debug(" <-", False, "") + print_debug("\n", False, "") + if "performance.log" in options.in_file: + print_debug("\n\n_________________Watch performance.log for 
details________________\n", False, "") + else: + print_debug("\n\n__________________________________________________________________\n", False, "") + + + +def perf(options1, args): + global options + options = options1 + global s + s = options.silent + + # save current OS + global is_windows + is_windows = (platform.system() == 'Windows' or + 'CYGWIN_NT' in platform.system()) + global is_mac + is_mac = (platform.system() == 'Darwin') + + # save current path + pwd = os.getcwd() + pwd = pwd + os.sep + pwd1 = pwd + if is_windows: + pwd1 = "..\\..\\" + + # check if cpu usage is low now + cpu_percent = cpu_check() + if cpu_percent > 20: + error("CPU Usage is very high.\nClose other applications.\n", 2) + + global ispc_test + global ispc_ref + global ref_compiler + global refc_compiler + # check that required compilers exist + PATH_dir = string.split(os.getenv("PATH"), os.pathsep) + ispc_test_exists = False + ispc_ref_exists = False + ref_compiler_exists = False + if is_windows == False: + ispc_test = "ispc" + ref_compiler = "g++" + refc_compiler = "gcc" + if options.compiler != "": + if options.compiler == "clang" or options.compiler == "clang++": + ref_compiler = "clang++" + refc_compiler = "clang" + if options.compiler == "icc" or options.compiler == "icpc": + ref_compiler = "icpc" + refc_compiler = "icc" + else: + ispc_test = "ispc.exe" + ref_compiler = "cl.exe" + ispc_ref = options.ref + if options.ref != "": + options.ref = True + for counter in PATH_dir: + if os.path.exists(counter + os.sep + ispc_test): + ispc_test_exists = True + if os.path.exists(counter + os.sep + ref_compiler): + ref_compiler_exists = True + if os.path.exists(counter + os.sep + ispc_ref): + ispc_ref_exists = True + if not ispc_test_exists: + error("ISPC compiler not found.\nAdded path to ispc compiler to your PATH variable.\n", 1) + if not ref_compiler_exists: + error("C/C++ compiler %s not found.\nAdded path to %s compiler to your PATH variable.\n" % (ref_compiler, ref_compiler), 1) + if options.ref: + if not ispc_ref_exists: + error("ISPC reference compiler not found.\nAdded path to ispc reference compiler to your PATH variable.\n", 1) + + # checks that config file exists + path_config = os.path.normpath(options.config) + if os.path.exists(path_config) == False: + error("config file not found: %s.\nSet path to your config file in --config.\n" % options.config, 1) + sys.exit() + + # read lines from config file except comments + f = open(path_config, 'r') + f_lines = f.readlines() + f.close() + lines =[] + for i in range(len(f_lines)): + if f_lines[i][0] != "%": + lines.append(f_lines[i]) + length = len(lines) + + # prepare build.log, perf_temp and perf.log files + global perf_log + if options.in_file: + perf_log = pwd + options.in_file + common.remove_if_exists(perf_log) + else: + perf_log = "" + global build_log + build_log = pwd + os.sep + "logs" + os.sep + "perf_build.log" + common.remove_if_exists(build_log) + if os.path.exists(pwd + os.sep + "logs") == False: + os.makedirs(pwd + os.sep + "logs") + + global perf_temp + perf_temp = pwd + "perf_temp" + # end of preparations + + print_debug("Okey go go go!\n\n", s, perf_log) + + #print compilers versions + common.print_version(ispc_test, ispc_ref, ref_compiler, False, perf_log, is_windows) + + # begin + i = 0 + answer = [] + answer_ref = [] + + # loop for all tests + while i < length-2: + # we read name of test + print_debug("%s" % lines[i], s, perf_log) + test = [lines[i][:-1],[],[],[],[],[]] + test_ref = [lines[i][:-1],[],[],[],[],[]] + # read location of test + 
folder = lines[i+1] + folder = folder[:-1] + folder = os.path.normpath(options.path + os.sep + "examples" + os.sep + folder) + # check that test exists + if os.path.exists(folder) == False: + error("Can't find test %s. Your path is: \"%s\".\nChange current location to ISPC_HOME or set path to ISPC_HOME in --path.\n" % + (lines[i][:-1], options.path), 1) + os.chdir(folder) + # read parameters of test + command = lines[i+2] + command = command[:-1] + if is_windows == False: + ex_command_ref = "./ref " + command + " >> " + perf_temp + "_ref" + ex_command = "./test " + command + " >> " + perf_temp + "_test" + bu_command_ref = "make CXX="+ref_compiler+" CC="+refc_compiler+ " EXAMPLE=ref ISPC="+ispc_ref+" >> "+build_log+" 2>> "+build_log + bu_command = "make CXX="+ref_compiler+" CC="+refc_compiler+ " EXAMPLE=test ISPC="+ispc_test+" >> "+build_log+" 2>> "+build_log + re_command = "make clean >> "+build_log + else: + ex_command_ref = "x64\\Release\\ref.exe " + command + " >> " + perf_temp + "_ref" + ex_command = "x64\\Release\\test.exe " + command + " >> " + perf_temp + "_test" + bu_command_ref = "msbuild /V:m /p:Platform=x64 /p:Configuration=Release /p:TargetDir=.\ /p:TargetName=ref /t:rebuild >> " + build_log + bu_command = "msbuild /V:m /p:Platform=x64 /p:Configuration=Release /p:TargetDir=.\ /p:TargetName=test /t:rebuild >> " + build_log + re_command = "msbuild /t:clean >> " + build_log + commands = [ex_command, bu_command, ex_command_ref, bu_command_ref, re_command] + # parsing config parameters + next_line = lines[i+3] + if next_line[0] == "!": # we should take only one part of test output + R = next_line.split(' ') + c1 = int(R[1]) #c1 is a number of string which we want to use in test output + c2 = int(R[2]) #c2 is total number of strings in test output + i = i+1 + else: + c1 = 1 + c2 = 1 + next_line = lines[i+3] + if next_line[0] == "^": #we should concatenate result of this test with previous one + run_test(commands, c1, c2, answer[len(answer)-1], answer_ref[len(answer)-1], False) + i = i+1 + else: #we run this test and append it's result to answer structure + run_test(commands, c1, c2, test, test_ref, True) + answer.append(test) + answer_ref.append(test_ref) + + # preparing next loop iteration + os.chdir(pwd1) + i+=4 + + # delete temp file + common.remove_if_exists(perf_temp+"_test") + common.remove_if_exists(perf_temp+"_ref") + + #print collected answer + print_debug("\n\nTEST COMPILER:\n", s, perf_log) + A = print_answer(answer) + if options.ref != "": + print_debug("\n\nREFERENCE COMPILER:\n", s, perf_log) + B = print_answer(answer_ref) + # print perf report + compare(A,B) + + + +###Main### +from optparse import OptionParser +import sys +import os +import operator +import time +import glob +import string +import platform +# our functions +import common +print_debug = common.print_debug +error = common.error + +if __name__ == "__main__": + # parsing options + parser = OptionParser() + parser.add_option('-n', '--number', dest='number', + help='number of repeats', default="3") + parser.add_option('-c', '--config', dest='config', + help='config file of tests', default="./perf.ini") + parser.add_option('-p', '--path', dest='path', + help='path to ispc root', default=".") + parser.add_option('-s', '--silent', dest='silent', + help='silent mode, only table output', default=False, action="store_true") + parser.add_option('-o', '--output', dest='output', + help='output file for script reading', default="") + parser.add_option('--compiler', dest='compiler', + help='C/C++ compiler', default="") 
+ parser.add_option('-r', '--ref', dest='ref', + help='set reference compiler for compare', default="") + parser.add_option('-f', '--file', dest='in_file', + help='file to save perf output', default="") + (options, args) = parser.parse_args() + perf(options, args) diff --git a/run_tests.py b/run_tests.py index 7c6b1eb8..4ee80fe3 100755 --- a/run_tests.py +++ b/run_tests.py @@ -1,179 +1,54 @@ #!/usr/bin/python +# +# Copyright (c) 2013, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +# IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +# OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # test-running driver for ispc - -from optparse import OptionParser -import multiprocessing -from ctypes import c_int -import os -import sys -import glob -import re -import signal -import random -import string -import subprocess -import shlex -import platform -import tempfile -import os.path -import time - -# disable fancy error/warning printing with ANSI colors, so grepping for error -# messages doesn't get confused -os.environ["TERM"] = "dumb" - -# This script is affected by http://bugs.python.org/issue5261 on OSX 10.5 Leopard -# git history has a workaround for that issue. 
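Note on the perf.ini layout consumed by perf() above: each entry is read as a group of lines holding the test name, a directory under examples/, the arguments passed to ./test and ./ref, and an options line that may start with "!" (keep line c1 of c2 output lines) or "^" (accumulate the result into the previous test). A minimal Python sketch of a reader for that layout; the entry contents are purely illustrative and the details are inferred from the parsing loop above rather than taken from a real config:

    def read_perf_entries(lines):
        # Sketch of the entry layout parsed by perf(): name, folder under
        # examples/, command-line arguments, then an options line that may
        # begin with "!" (pick line c1 of c2) or "^" (fold into previous).
        # Details are inferred from the loop above, not from a spec.
        i = 0
        while i + 3 < len(lines):
            name, folder, command, opts = (lines[i + k].rstrip("\n") for k in range(4))
            if opts.startswith("!"):
                c1, c2 = (int(x) for x in opts.split()[1:3])
            else:
                c1, c2 = 1, 1
            yield name, folder, command, c1, c2, opts.startswith("^")
            i += 4

    # Hypothetical entry, four lines per test:
    sample = ["aobench\n", "aobench\n", "--scale 1\n", "\n"]
    for entry in read_perf_entries(sample):
        print(entry)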
- -is_windows = (platform.system() == 'Windows' or - 'CYGWIN_NT' in platform.system()) - -parser = OptionParser() -parser.add_option("-r", "--random-shuffle", dest="random", help="Randomly order tests", - default=False, action="store_true") -parser.add_option("-g", "--generics-include", dest="include_file", help="Filename for header implementing functions for generics", - default=None) -parser.add_option("-f", "--ispc-flags", dest="ispc_flags", help="Additional flags for ispc (-g, -O1, ...)", - default="") -parser.add_option('-t', '--target', dest='target', - help='Set compilation target (neon, sse2, sse2-x2, sse4, sse4-x2, avx, avx-x2, generic-4, generic-8, generic-16, generic-32)', - default="sse4") -parser.add_option('-a', '--arch', dest='arch', - help='Set architecture (arm, x86, x86-64)', - default="x86-64") -parser.add_option("-c", "--compiler", dest="compiler_exe", help="Compiler binary to use to run tests", - default=None) -parser.add_option('-o', '--no-opt', dest='no_opt', help='Disable optimization', - default=False, action="store_true") -parser.add_option('-j', '--jobs', dest='num_jobs', help='Maximum number of jobs to run in parallel', - default="1024", type="int") -parser.add_option('-v', '--verbose', dest='verbose', help='Enable verbose output', - default=False, action="store_true") -parser.add_option('--wrap-exe', dest='wrapexe', - help='Executable to wrap test runs with (e.g. "valgrind")', - default="") -parser.add_option('--time', dest='time', help='Enable time output', - default=False, action="store_true") - -(options, args) = parser.parse_args() - -if options.target == 'neon': - options.arch = 'arm' - -# use relative path to not depend on host directory, which may possibly -# have white spaces and unicode characters. -if not is_windows: - ispc_exe = "./ispc" -else: - ispc_exe = ".\\Release\\ispc.exe" - -# checks the required ispc compiler otherwise prints an error message -if not os.path.exists(ispc_exe): - sys.stderr.write("Fatal error: missing ispc compiler: %s\n" % ispc_exe) - sys.exit() - -ispc_exe += " " + options.ispc_flags - -if __name__ == '__main__': - sys.stdout.write("ispc compiler: %s\n" % ispc_exe) - -is_generic_target = (options.target.find("generic-") != -1 and - options.target != "generic-1") -if is_generic_target and options.include_file == None: - if options.target == "generic-4": - sys.stderr.write("No generics #include specified; using examples/intrinsics/sse4.h\n") - options.include_file = "examples/intrinsics/sse4.h" - elif options.target == "generic-8": - sys.stderr.write("No generics #include specified and no default available for \"generic-8\" target.\n") - sys.exit(1) - elif options.target == "generic-16": - sys.stderr.write("No generics #include specified; using examples/intrinsics/generic-16.h\n") - options.include_file = "examples/intrinsics/generic-16.h" - elif options.target == "generic-32": - sys.stderr.write("No generics #include specified; using examples/intrinsics/generic-32.h\n") - options.include_file = "examples/intrinsics/generic-32.h" - elif options.target == "generic-64": - sys.stderr.write("No generics #include specified; using examples/intrinsics/generic-64.h\n") - options.include_file = "examples/intrinsics/generic-64.h" - -if options.compiler_exe == None: - if is_windows: - options.compiler_exe = "cl.exe" - else: - options.compiler_exe = "g++" - -# checks the required compiler otherwise prints an error message -PATH_dir = string.split(os.getenv("PATH"), os.pathsep) -compiler_exists = False - -for counter in PATH_dir: - if 
os.path.exists(counter + os.sep + options.compiler_exe): - compiler_exists = True - break - -if not compiler_exists: - sys.stderr.write("Fatal error: missing the required compiler: %s \n" % - options.compiler_exe) - sys.exit() - -ispc_root = "." - -# if no specific test files are specified, run all of the tests in tests/, -# failing_tests/, and tests_errors/ -if len(args) == 0: - files = glob.glob(ispc_root + os.sep + "tests" + os.sep + "*ispc") + \ - glob.glob(ispc_root + os.sep + "failing_tests" + os.sep + "*ispc") + \ - glob.glob(ispc_root + os.sep + "tests_errors" + os.sep + "*ispc") -else: - if is_windows: - argfiles = [ ] - for f in args: - # we have to glob ourselves if this is being run under a DOS - # shell, as it passes wildcard as is. - argfiles += glob.glob(f) - else: - argfiles = args - - files = [ ] - for f in argfiles: - if os.path.splitext(string.lower(f))[1] != ".ispc": - sys.stdout.write("Ignoring file %s, which doesn't have an .ispc extension.\n" % f) - else: - files += [ f ] - -# max_test_length is used to issue exact number of whitespace characters when -# updating status. Otherwise update causes new lines standard 80 char terminal -# on both Linux and Windows. -max_test_length = 0 -for f in files: - max_test_length = max(max_test_length, len(f)) - -# randomly shuffle the tests if asked to do so -if (options.random): - random.seed() - random.shuffle(files) - -# counter -total_tests = 0 - - # utility routine to print an update on the number of tests that have been # finished. Should be called with the lock held.. def update_progress(fn, total_tests_arg, counter, max_test_length_arg): counter.value += 1 - progress_str = " Done %d / %d [%s]" % (counter.value, total_tests_arg, fn) - # spaces to clear out detrius from previous printing... - spaces_needed = max_test_length_arg - len(fn) - for x in range(spaces_needed): - progress_str += ' ' - progress_str += '\r' - sys.stdout.write(progress_str) - sys.stdout.flush() + if options.non_interactive == False: + progress_str = " Done %d / %d [%s]" % (counter.value, total_tests_arg, fn) + # spaces to clear out detrius from previous printing... + spaces_needed = max_test_length_arg - len(fn) + for x in range(spaces_needed): + progress_str += ' ' + progress_str += '\r' + sys.stdout.write(progress_str) + sys.stdout.flush() def run_command(cmd): if options.verbose: - sys.stdout.write("Running: %s\n" % cmd) + print_debug("Running: %s\n" % cmd, s, run_tests_log) # Here's a bit tricky part. To pass a command for execution we should # break down the line in to arguments. 
shlex class is designed exactly @@ -201,9 +76,9 @@ def run_cmds(compile_cmds, run_cmd, filename, expect_failure): (return_code, output) = run_command(cmd) compile_failed = (return_code != 0) if compile_failed: - sys.stdout.write("Compilation of test %s failed \n" % filename) + print_debug("Compilation of test %s failed \n" % filename, s, run_tests_log) if output != "": - sys.stdout.write("%s" % output.encode("utf-8")) + print_debug("%s" % output.encode("utf-8"), s, run_tests_log) return (1, 0) (return_code, output) = run_command(run_cmd) @@ -212,11 +87,11 @@ def run_cmds(compile_cmds, run_cmd, filename, expect_failure): surprise = ((expect_failure and not run_failed) or (not expect_failure and run_failed)) if surprise == True: - sys.stderr.write("Test %s %s (return code %d) \n" % \ + print_debug("Test %s %s (return code %d) \n" % \ (filename, "unexpectedly passed" if expect_failure else "failed", - return_code)) + return_code), s, run_tests_log) if output != "": - sys.stdout.write("%s\n" % output.encode("utf-8")) + print_debug("%s\n" % output.encode("utf-8"), s, run_tests_log) if surprise == True: return (0, 1) else: @@ -231,7 +106,7 @@ def add_prefix(path): else: input_prefix = "" path = input_prefix + path - path = os.path.normpath(path) + path = os.path.abspath(path) return path @@ -294,12 +169,12 @@ def run_test(testname): firstline = firstline.rstrip() file.close() - if (output.find(firstline) == -1): - sys.stderr.write("Didn't see expected error message %s from test %s.\nActual output:\n%s\n" % \ - (firstline, testname, output)) + if re.search(firstline, output) == None: + print_debug("Didn't see expected error message %s from test %s.\nActual output:\n%s\n" % \ + (firstline, testname, output), s, run_tests_log) return (1, 0) elif got_error == False: - sys.stderr.write("Unexpectedly no errors issued from test %s\n" % testname) + print_debug("Unexpectedly no errors issued from test %s\n" % testname, s, run_tests_log) return (1, 0) else: return (0, 0) @@ -325,8 +200,7 @@ def run_test(testname): break file.close() if match == -1: - sys.stderr.write("Fatal error: unable to find function signature " + \ - "in test %s\n" % testname) + error("unable to find function signature in test %s\n" % testname, 0) return (1, 0) else: global is_generic_target @@ -359,10 +233,13 @@ def run_test(testname): gcc_isa="" if options.target == 'generic-4': gcc_isa = '-msse4.2' - if options.target == 'generic-8': + if (options.target == 'generic-8'): + if (options.include_file.find("knc-i1x8.h")!=-1 or options.include_file.find("knc-i1x8unsafe_fast.h")!=-1): + gcc_isa = '-mmic' + else: gcc_isa = '-mavx' if (options.target == 'generic-16' or options.target == 'generic-32' or options.target == 'generic-64') \ - and (options.include_file.find("knc.h")!=-1 or options.include_file.find("knc2x.h")!=-1): + and (options.include_file.find("knc-i1x16.h")!=-1 or options.include_file.find("knc.h")!=-1 or options.include_file.find("knc2x.h")!=-1): gcc_isa = '-mmic' cc_cmd = "%s -O2 -I. %s %s test_static.cpp -DTEST_SIG=%d %s -o %s" % \ @@ -401,7 +278,21 @@ def run_test(testname): # pull tests to run from the given queue and run them. Multiple copies of # this function will be running in parallel across all of the CPU cores of # the system. 
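The glob_var argument added to run_tasks_from_queue() in the next hunk exists because multiprocessing on Windows spawns fresh interpreters instead of forking, so module-level globals set after import in the parent are not visible to worker processes and must be passed explicitly. A self-contained Python sketch of the same pattern; the names here are illustrative, not the actual run_tests.py globals:

    import multiprocessing

    def worker(queue, shared_state):
        # On Windows there is no fork(): each worker re-imports the module,
        # so any parent-process state it needs must arrive through args,
        # just as run_tasks_from_queue() unpacks glob_var.
        is_windows, verbose = shared_state
        while True:
            item = queue.get()
            if item is None:              # sentinel: no more work
                break
            if verbose:
                print("running", item, "on", "Windows" if is_windows else "POSIX")

    if __name__ == "__main__":
        q = multiprocessing.Queue()
        for name in ["a.ispc", "b.ispc"]:
            q.put(name)
        q.put(None)
        glob_var = (False, True)          # stands in for the real shared tuple
        p = multiprocessing.Process(target=worker, args=(q, glob_var))
        p.start()
        p.join()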
-def run_tasks_from_queue(queue, queue_ret, queue_skip, total_tests_arg, max_test_length_arg, counter, mutex): +def run_tasks_from_queue(queue, queue_ret, queue_skip, total_tests_arg, max_test_length_arg, counter, mutex, glob_var): + # This is needed on windows because windows doen't copy globals from parent process whili multiprocessing + global is_windows + is_windows = glob_var[0] + global options + options = glob_var[1] + global s + s = glob_var[2] + global ispc_exe + ispc_exe = glob_var[3] + global is_generic_target + is_generic_target = glob_var[4] + global run_tests_log + run_tests_log = glob_var[5] + if is_windows: tmpdir = "tmp%d" % os.getpid() os.mkdir(tmpdir) @@ -444,14 +335,266 @@ def run_tasks_from_queue(queue, queue_ret, queue_skip, total_tests_arg, max_test skip_files += [ filename ] -task_threads = [] - def sigint(signum, frame): for t in task_threads: t.terminate() sys.exit(1) -if __name__ == '__main__': + +def file_check(compfails, runfails): + errors = len(compfails) + len(runfails) + new_compfails = [] + new_runfails = [] + new_passes_compfails = [] + new_passes_runfails = [] +# Open file fail_db.txt + f = open(test_states, 'r') + f_lines = f.readlines() + f.close() +# Detect OS + if platform.system() == 'Windows' or 'CYGWIN_NT' in platform.system(): + OS = "Windows" + else: + if platform.system() == 'Darwin': + OS = "Mac" + else: + OS = "Linux" +# Detect opt_set + if options.no_opt == True: + opt = "-O0" + else: + opt = "-O2" +# Detect LLVM version + temp1 = common.take_lines(ispc_exe + " --version", "first") + llvm_version = temp1[-10:-2] +# Detect compiler version + if is_windows == False: + temp1 = common.take_lines(options.compiler_exe + " --version", "first") + temp2 = re.search("[0-9]*\.[0-9]*\.[0-9]", temp1) + if temp2 == None: + temp3 = re.search("[0-9]*\.[0-9]*", temp1) + else: + temp3 = re.search("[0-9]*\.[0-9]*", temp2.group()) + compiler_version = options.compiler_exe + temp3.group() + else: + compiler_version = "cl" + new_line = " "+options.arch.rjust(6)+" "+options.target.rjust(14)+" "+OS.rjust(7)+" "+llvm_version+" "+compiler_version.rjust(10)+" "+opt+" *\n" + + new_compfails = compfails[:] + new_runfails = runfails[:] + new_f_lines = f_lines[:] + for j in range(0, len(f_lines)): + if (((" "+options.arch+" ") in f_lines[j]) and + ((" "+options.target+" ") in f_lines[j]) and + ((" "+OS+" ") in f_lines[j]) and + ((" "+llvm_version+" ") in f_lines[j]) and + ((" "+compiler_version+" ") in f_lines[j]) and + ((" "+opt+" ") in f_lines[j])): + if (" compfail " in f_lines[j]): + f = 0 + for i in range(0, len(compfails)): + if compfails[i] in f_lines[j]: + new_compfails.remove(compfails[i]) + else: + f = f + 1 + if f == len(compfails): + temp3 = f_lines[j].split(" ") + new_passes_compfails.append(temp3[0]) + if options.update == "FP": + new_f_lines.remove(f_lines[j]) + if (" runfail " in f_lines[j]): + f = 0 + for i in range(0, len(runfails)): + if runfails[i] in f_lines[j]: + new_runfails.remove(runfails[i]) + else: + f = f + 1 + if f == len(runfails): + temp3 = f_lines[j].split(" ") + new_passes_runfails.append(temp3[0]) + if options.update == "FP": + new_f_lines.remove(f_lines[j]) + if len(new_runfails) != 0: + print_debug("NEW RUNFAILS:\n", s, run_tests_log) + for i in range (0,len(new_runfails)): + new_f_lines.append(new_runfails[i] + " runfail " + new_line) + print_debug("\t" + new_runfails[i] + "\n", s, run_tests_log) + if len(new_compfails) != 0: + print_debug("NEW COMPFAILS:\n", s, run_tests_log) + for i in range (0,len(new_compfails)): + 
new_f_lines.append(new_compfails[i] + " compfail " + new_line) + print_debug("\t" + new_compfails[i] + "\n", s, run_tests_log) + if len(new_runfails) == 0 and len(new_compfails) == 0: + print_debug("No new fails\n", s, run_tests_log) + if len(new_passes_runfails) != 0: + print_debug("NEW PASSES after RUNFAILS:\n", s, run_tests_log) + for i in range (0,len(new_passes_runfails)): + print_debug("\t" + new_passes_runfails[i] + "\n", s, run_tests_log) + if len(new_passes_compfails) != 0: + print_debug("NEW PASSES after COMPFAILS:\n", s, run_tests_log) + for i in range (0,len(new_passes_compfails)): + print_debug("\t" + new_passes_compfails[i] + "\n", s, run_tests_log) + + if options.update != "": + output = open(test_states, 'w') + output.writelines(new_f_lines) + output.close() + return [new_runfails, new_compfails, new_passes_runfails, new_passes_compfails, new_line, errors] + +def verify(): + # Open file fail_db.txt + f = open(test_states, 'r') + f_lines = f.readlines() + f.close() + check = [["g++", "clang", "cl"],["-O0", "-O2"],["x86","x86-64"], + ["Linux","Windows","Mac"],["LLVM 3.1","LLVM 3.2","LLVM 3.3","LLVM head"], + ["sse2-i32x4", "sse2-i32x8", "sse4-i32x4", "sse4-i32x8", "sse4-i16x8", + "sse4-i8x16", "avx1-i32x8", "avx1-i32x16", "avx1-i64x4", "avx1.1-i32x8", "avx1.1-i32x16", + "avx2-i32x8", "avx2-i32x16", "generic-1", "generic-4", "generic-8", + "generic-16", "generic-32", "generic-64"]] + for i in range (0,len(f_lines)): + if f_lines[i][0] == "%": + continue + for j in range(0,len(check)): + temp = 0 + for t in range(0,len(check[j])): + if " " + check[j][t] + " " in f_lines[i]: + temp = temp + 1 + if temp != 1: + print_debug("error in line " + str(i) + "\n", False, run_tests_log) + break + + +def run_tests(options1, args, print_version): + global options + options = options1 + global s + s = options.silent + + # prepare run_tests_log and fail_db files + global run_tests_log + if options.in_file: + run_tests_log = os.getcwd() + os.sep + options.in_file + if print_version == 1: + common.remove_if_exists(run_tests_log) + else: + run_tests_log = "" + global test_states + test_states = "fail_db.txt" + if options.verify: + verify() + return 0 + + # disable fancy error/warning printing with ANSI colors, so grepping for error + # messages doesn't get confused + os.environ["TERM"] = "dumb" + + # This script is affected by http://bugs.python.org/issue5261 on OSX 10.5 Leopard + # git history has a workaround for that issue. + global is_windows + is_windows = (platform.system() == 'Windows' or + 'CYGWIN_NT' in platform.system()) + + if options.target == 'neon': + options.arch = 'arm' + + # use relative path to not depend on host directory, which may possibly + # have white spaces and unicode characters. 
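For reference, file_check() above stores one record per known failure in fail_db.txt: the test name, the word "runfail" or "compfail", then the right-justified arch / target / OS / LLVM version / compiler / optimization columns built in new_line, terminated by " *". A hedged Python sketch of that record format; the sample values are made up:

    def fail_db_line(test, kind, arch, target, OS, llvm_version, compiler, opt):
        # Mirrors the new_line construction in file_check(); the trailing " *"
        # marks the end of the record.
        return (test + " " + kind + "  " + arch.rjust(6) + " " + target.rjust(14) +
                " " + OS.rjust(7) + " " + llvm_version + " " + compiler.rjust(10) +
                " " + opt + " *\n")

    # Hypothetical entry -- the values are illustrative only.
    print(fail_db_line("tests/foo.ispc", "runfail", "x86-64", "avx2-i32x8",
                       "Linux", "LLVM 3.3", "g++4.7", "-O2"), end="")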
+ global ispc_exe + if not is_windows: + ispc_exe = "./ispc" + else: + ispc_exe = ".\\Release\\ispc.exe" + + # checks the required ispc compiler otherwise prints an error message + if not os.path.exists(ispc_exe): + error("missing ispc compiler: %s\n" % ispc_exe, 1) + ispc_exe += " " + options.ispc_flags + print_debug("ispc compiler: %s\n" % ispc_exe, s, run_tests_log) + + global is_generic_target + is_generic_target = (options.target.find("generic-") != -1 and + options.target != "generic-1" and options.target != "generic-x1") + if is_generic_target and options.include_file == None: + if options.target == "generic-4" or options.target == "generic-x4": + error("No generics #include specified; using examples/intrinsics/sse4.h\n", 2) + options.include_file = "examples/intrinsics/sse4.h" + options.target = "generic-4" + elif options.target == "generic-8" or options.target == "generic-x8": + error("No generics #include specified and no default available for \"generic-8\" target.\n", 1) + options.target = "generic-8" + elif options.target == "generic-16" or options.target == "generic-x16": + error("No generics #include specified; using examples/intrinsics/generic-16.h\n", 2) + options.include_file = "examples/intrinsics/generic-16.h" + options.target = "generic-16" + elif options.target == "generic-32" or options.target == "generic-x32": + error("No generics #include specified; using examples/intrinsics/generic-32.h\n", 2) + options.include_file = "examples/intrinsics/generic-32.h" + options.target = "generic-32" + elif options.target == "generic-64" or options.target == "generic-x64": + error("No generics #include specified; using examples/intrinsics/generic-64.h\n", 2) + options.include_file = "examples/intrinsics/generic-64.h" + options.target = "generic-64" + + if options.compiler_exe == None: + if is_windows: + options.compiler_exe = "cl.exe" + else: + options.compiler_exe = "g++" + + # checks the required compiler otherwise prints an error message + PATH_dir = string.split(os.getenv("PATH"), os.pathsep) + compiler_exists = False + + for counter in PATH_dir: + if os.path.exists(counter + os.sep + options.compiler_exe): + compiler_exists = True + break + + if not compiler_exists: + error("missing the required compiler: %s \n" % options.compiler_exe, 1) + + # print compilers versions + if print_version > 0: + common.print_version(ispc_exe, "", options.compiler_exe, False, run_tests_log, is_windows) + + ispc_root = "." + + # if no specific test files are specified, run all of the tests in tests/, + # failing_tests/, and tests_errors/ + if len(args) == 0: + files = glob.glob(ispc_root + os.sep + "tests" + os.sep + "*ispc") + \ + glob.glob(ispc_root + os.sep + "tests_errors" + os.sep + "*ispc") + else: + if is_windows: + argfiles = [ ] + for f in args: + # we have to glob ourselves if this is being run under a DOS + # shell, as it passes wildcard as is. + argfiles += glob.glob(f) + else: + argfiles = args + + files = [ ] + for f in argfiles: + if os.path.splitext(string.lower(f))[1] != ".ispc": + error("Ignoring file %s, which doesn't have an .ispc extension.\n" % f, 2) + else: + files += [ f ] + + # max_test_length is used to issue exact number of whitespace characters when + # updating status. Otherwise update causes new lines standard 80 char terminal + # on both Linux and Windows. 
+ max_test_length = 0 + for f in files: + max_test_length = max(max_test_length, len(f)) + + # randomly shuffle the tests if asked to do so + if (options.random): + random.seed() + random.shuffle(files) + + # counter total_tests = len(files) compile_error_files = [ ] @@ -460,7 +603,7 @@ if __name__ == '__main__': nthreads = min(multiprocessing.cpu_count(), options.num_jobs) nthreads = min(nthreads, len(files)) - sys.stdout.write("Running %d jobs in parallel. Running %d tests.\n" % (nthreads, total_tests)) + print_debug("Running %d jobs in parallel. Running %d tests.\n" % (nthreads, total_tests), s, run_tests_log) # put each of the test filenames into a queue q = multiprocessing.Queue() @@ -480,41 +623,111 @@ if __name__ == '__main__': start_time = time.time() # launch jobs to run tests + glob_var = [is_windows, options, s, ispc_exe, is_generic_target, run_tests_log] + global task_threads + task_threads = [0] * nthreads for x in range(nthreads): - t = multiprocessing.Process(target=run_tasks_from_queue, args=(q, qret, qskip, total_tests, max_test_length, finished_tests_counter, finished_tests_counter_lock)) - task_threads.append(t) - t.start() - + task_threads[x] = multiprocessing.Process(target=run_tasks_from_queue, args=(q, qret, qskip, total_tests, + max_test_length, finished_tests_counter, finished_tests_counter_lock, glob_var)) + task_threads[x].start() # wait for them to all finish and then return the number that failed # (i.e. return 0 if all is ok) for t in task_threads: t.join() - sys.stdout.write("\n") + if options.non_interactive == False: + print_debug("\n", s, run_tests_log) elapsed_time = time.time() - start_time - if options.time: - sys.stdout.write("Elapsed time: %d s\n" % elapsed_time) while not qret.empty(): - (c, r, s) = qret.get() + (c, r, skip) = qret.get() compile_error_files += c run_error_files += r - skip_files += s + skip_files += skip + if options.non_interactive: + print_debug(" Done %d / %d\n" % (finished_tests_counter.value, total_tests), s, run_tests_log) if len(skip_files) > 0: skip_files.sort() - sys.stdout.write("%d / %d tests SKIPPED:\n" % (len(skip_files), total_tests)) + print_debug("%d / %d tests SKIPPED:\n" % (len(skip_files), total_tests), s, run_tests_log) for f in skip_files: - sys.stdout.write("\t%s\n" % f) + print_debug("\t%s\n" % f, s, run_tests_log) if len(compile_error_files) > 0: compile_error_files.sort() - sys.stdout.write("%d / %d tests FAILED compilation:\n" % (len(compile_error_files), total_tests)) + print_debug("%d / %d tests FAILED compilation:\n" % (len(compile_error_files), total_tests), s, run_tests_log) for f in compile_error_files: - sys.stdout.write("\t%s\n" % f) + print_debug("\t%s\n" % f, s, run_tests_log) if len(run_error_files) > 0: run_error_files.sort() - sys.stdout.write("%d / %d tests FAILED execution:\n" % (len(run_error_files), total_tests)) + print_debug("%d / %d tests FAILED execution:\n" % (len(run_error_files), total_tests), s, run_tests_log) for f in run_error_files: - sys.stdout.write("\t%s\n" % f) + print_debug("\t%s\n" % f, s, run_tests_log) + if len(compile_error_files) == 0 and len(run_error_files) == 0: + print_debug("No fails\n", s, run_tests_log) - sys.exit(len(compile_error_files) + len(run_error_files)) + R = file_check(compile_error_files, run_error_files) + + if options.time: + print_debug("Elapsed time: %d s\n" % elapsed_time, s, run_tests_log) + + return R + + +from optparse import OptionParser +import multiprocessing +from ctypes import c_int +import os +import sys +import glob +import re +import 
signal +import random +import string +import subprocess +import shlex +import platform +import tempfile +import os.path +import time +# our functions +import common +print_debug = common.print_debug +error = common.error + +if __name__ == "__main__": + parser = OptionParser() + parser.add_option("-r", "--random-shuffle", dest="random", help="Randomly order tests", + default=False, action="store_true") + parser.add_option("-g", "--generics-include", dest="include_file", help="Filename for header implementing functions for generics", + default=None) + parser.add_option("-f", "--ispc-flags", dest="ispc_flags", help="Additional flags for ispc (-g, -O1, ...)", + default="") + parser.add_option('-t', '--target', dest='target', + help='Set compilation target (sse2-i32x4, sse2-i32x8, sse4-i32x4, sse4-i32x8, sse4-i16x8, sse4-i8x16, avx1-i32x8, avx1-i32x16, avx1.1-i32x8, avx1.1-i32x16, avx2-i32x8, avx2-i32x16, generic-x1, generic-x4, generic-x8, generic-x16, generic-x32, generic-x64)', + default="sse4") + parser.add_option('-a', '--arch', dest='arch', + help='Set architecture (arm, x86, x86-64)', + default="x86-64") + parser.add_option("-c", "--compiler", dest="compiler_exe", help="C/C++ compiler binary to use to run tests", + default=None) + parser.add_option('-o', '--no-opt', dest='no_opt', help='Disable optimization', + default=False, action="store_true") + parser.add_option('-j', '--jobs', dest='num_jobs', help='Maximum number of jobs to run in parallel', + default="1024", type="int") + parser.add_option('-v', '--verbose', dest='verbose', help='Enable verbose output', + default=False, action="store_true") + parser.add_option('--wrap-exe', dest='wrapexe', + help='Executable to wrap test runs with (e.g. "valgrind")', + default="") + parser.add_option('--time', dest='time', help='Enable time output', + default=False, action="store_true") + parser.add_option('--non-interactive', dest='non_interactive', help='Disable interactive status updates', + default=False, action="store_true") + parser.add_option('-u', "--update-errors", dest='update', help='Update file with fails (F of FP)', default="") + parser.add_option('-s', "--silent", dest='silent', help='enable silent mode without any output', default=False, + action = "store_true") + parser.add_option("--file", dest='in_file', help='file to save run_tests output', default="") + parser.add_option("--verify", dest='verify', help='verify the file fail_db.txt', default=False, action="store_true") + (options, args) = parser.parse_args() + L = run_tests(options, args, 1) + exit(0) diff --git a/stdlib.ispc b/stdlib.ispc index 4e06f5da..9b02d0ba 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -38,12 +38,23 @@ ispc code */ -#ifdef ISPC_TARGET_GENERIC -#define IntMaskType bool -#define UIntMaskType bool +#if (ISPC_MASK_BITS == 1) + #define IntMaskType bool + #define UIntMaskType bool +#elif (ISPC_MASK_BITS == 8) + #define IntMaskType int8 + #define UIntMaskType unsigned int8 +#elif (ISPC_MASK_BITS == 16) + #define IntMaskType int16 + #define UIntMaskType unsigned int16 +#elif (ISPC_MASK_BITS == 32) + #define IntMaskType int32 + #define UIntMaskType unsigned int32 +#elif (ISPC_MASK_BITS == 64) + #define IntMaskType int64 + #define UIntMaskType unsigned int64 #else -#define IntMaskType int32 -#define UIntMaskType unsigned int32 + #error Unknown value of ISPC_MASK_BITS #endif /////////////////////////////////////////////////////////////////////////// @@ -335,14 +346,15 @@ static inline int32 sign_extend(bool v) { return __sext_varying_bool(v); } + __declspec(safe) 
static inline uniform bool any(bool v) { // We only care about whether "any" is true for the active program instances, // so we have to make v with the current program mask. -#ifdef ISPC_TARGET_GENERIC +#if (ISPC_MASK_BITS == 1) return __any(v & __mask); #else - return __any(__sext_varying_bool(v) & __mask); + return __any((UIntMaskType)__sext_varying_bool(v) & __mask); #endif } @@ -350,11 +362,10 @@ __declspec(safe) static inline uniform bool all(bool v) { // As with any(), we need to explicitly mask v with the current program mask // so we're only looking at the current lanes - -#ifdef ISPC_TARGET_GENERIC +#if (ISPC_MASK_BITS == 1) return __all(v | !__mask); #else - return __all(__sext_varying_bool(v) | !__mask); + return __all((UIntMaskType)__sext_varying_bool(v) | !__mask); #endif } @@ -362,11 +373,10 @@ __declspec(safe) static inline uniform bool none(bool v) { // As with any(), we need to explicitly mask v with the current program mask // so we're only looking at the current lanes - -#ifdef ISPC_TARGET_GENERIC +#if (ISPC_MASK_BITS == 1) return __none(v & __mask); #else - return __none(__sext_varying_bool(v) & __mask); + return __none((UIntMaskType)__sext_varying_bool(v) & __mask); #endif } @@ -399,10 +409,10 @@ static inline int popcnt(int64 v) { __declspec(safe) static inline uniform int popcnt(bool v) { // As with any() and all(), only count across the active lanes -#ifdef ISPC_TARGET_GENERIC +#if (ISPC_MASK_BITS == 1) return __popcnt_int64(__movmsk(v & __mask)); #else - return __popcnt_int64(__movmsk(__sext_varying_bool(v) & __mask)); + return __popcnt_int64(__movmsk((UIntMaskType)__sext_varying_bool(v) & __mask)); #endif } @@ -880,21 +890,45 @@ static inline uniform double select(uniform bool c, uniform double a, /////////////////////////////////////////////////////////////////////////// // Horizontal ops / reductions +__declspec(safe) +static inline uniform int16 reduce_add(int8 x) { + return __reduce_add_int8(__mask ? x : (int8)0); +} + +__declspec(safe) +static inline uniform unsigned int16 reduce_add(unsigned int8 x) { + return __reduce_add_int8(__mask ? x : (int8)0); +} + +__declspec(safe) +static inline uniform int32 reduce_add(int16 x) { + return __reduce_add_int16(__mask ? x : (int16)0); +} + +__declspec(safe) +static inline uniform unsigned int32 reduce_add(unsigned int16 x) { + return __reduce_add_int16(__mask ? x : (int16)0); +} + __declspec(safe) static inline uniform float reduce_add(float x) { // zero the lanes where the mask is off return __reduce_add_float(__mask ? x : 0.); } - __declspec(safe) static inline uniform float reduce_min(float v) { // For the lanes where the mask is off, replace the given value with // infinity, so that it doesn't affect the result. int iflt_max = 0x7f800000; // infinity - // Must use __floatbits_varying_int32, not floatbits(), since with the - // latter the current mask enters into the returned result... - return __reduce_min_float(__mask ? v : __floatbits_varying_int32(iflt_max)); + // unmasked block is needed to make sure that argument for unmasked + // function __reduce_min_float() are calculated without a mask. + bool test = __mask; + uniform float result; + unmasked { + result = __reduce_min_float(test ? v : floatbits(iflt_max)); + } + return result; } __declspec(safe) @@ -902,13 +936,18 @@ static inline uniform float reduce_max(float v) { // For the lanes where the mask is off, replace the given value with // negative infinity, so that it doesn't affect the result. 
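The rewritten reductions above all follow the same masked-reduction pattern: lanes whose mask bit is off are replaced by the identity element of the operation (0 for add, +infinity for min, -infinity for max), and that substitution now happens inside an unmasked block so the argument reaching the unmasked __reduce_*() helper is computed without the current mask. A small Python model of the lane-masking idea (plain lists stand in for varying values; this is not ispc code):

    import math

    def masked_reduce_min(values, mask):
        # Off lanes contribute +infinity, the identity for min,
        # so only active lanes can influence the result.
        return min(v if m else math.inf for v, m in zip(values, mask))

    def masked_reduce_add(values, mask):
        # Off lanes contribute 0, the identity for addition.
        return sum(v if m else 0 for v, m in zip(values, mask))

    assert masked_reduce_min([3.0, -5.0, 7.0, 2.0], [True, False, True, True]) == 2.0
    assert masked_reduce_add([1, 2, 3, 4], [True, True, False, False]) == 3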
const int iflt_neg_max = 0xff800000; // -infinity - // Must use __floatbits_varying_int32, not floatbits(), since with the - // latter the current mask enters into the returned result... - return __reduce_max_float(__mask ? v : __floatbits_varying_int32(iflt_neg_max)); + // unmasked block is needed to make sure that argument for unmasked + // function __reduce_max_float() are calculated without a mask. + bool test = __mask; + uniform float result; + unmasked { + result = __reduce_max_float(test ? v : floatbits(iflt_neg_max)); + } + return result; } __declspec(safe) -static inline uniform int reduce_add(int x) { +static inline uniform int64 reduce_add(int32 x) { // Zero out the values for lanes that aren't running return __reduce_add_int32(__mask ? x : 0); } @@ -930,7 +969,7 @@ static inline uniform int reduce_max(int v) { } __declspec(safe) -static inline uniform unsigned int reduce_add(unsigned int x) { +static inline uniform unsigned int64 reduce_add(unsigned int32 x) { // Set values for non-running lanes to zero so they don't affect the // result. return __reduce_add_int32(__mask ? x : 0); @@ -960,17 +999,27 @@ static inline uniform double reduce_add(double x) { __declspec(safe) static inline uniform double reduce_min(double v) { int64 iflt_max = 0x7ff0000000000000; // infinity - // Must use __doublebits_varying_int64, not doublebits(), since with the - // latter the current mask enters into the returned result... - return __reduce_min_double(__mask ? v : __doublebits_varying_int64(iflt_max)); + // unmasked block is needed to make sure that argument for unmasked + // function __reduce_min_double() are calculated without a mask. + bool test = __mask; + uniform double result; + unmasked { + result = __reduce_min_double(test ? v : doublebits(iflt_max)); + } + return result; } __declspec(safe) static inline uniform double reduce_max(double v) { const int64 iflt_neg_max = 0xfff0000000000000; // -infinity - // Must use __doublebits_varying_int64, not doublebits(), since with the - // latter the current mask enters into the returned result... - return __reduce_max_double(__mask ? v : __doublebits_varying_int64(iflt_neg_max)); + // unmasked block is needed to make sure that argument for unmasked + // function __reduce_max_double() are calculated without a mask. + bool test = __mask; + uniform double result; + unmasked { + result = __reduce_max_double(test ? v : doublebits(iflt_neg_max)); + } + return result; } __declspec(safe) @@ -1325,88 +1374,88 @@ static inline uniform double max(uniform double a, uniform double b) { // int8 -__declspec(safe,cost2) +__declspec(safe,cost1) static inline uniform unsigned int8 min(uniform unsigned int8 a, uniform unsigned int8 b) { return (a < b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline uniform unsigned int8 max(uniform unsigned int8 a, uniform unsigned int8 b) { return (a > b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline uniform int8 min(uniform int8 a, uniform int8 b) { return (a < b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline uniform int8 max(uniform int8 a, uniform int8 b) { return (a > b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline unsigned int8 min(unsigned int8 a, unsigned int8 b) { return (a < b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline unsigned int8 max(unsigned int8 a, unsigned int8 b) { return (a > b) ? 
a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline int8 min(int8 a, int8 b) { return (a < b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline int8 max(int8 a, int8 b) { return (a > b) ? a : b; } // int16 -__declspec(safe,cost2) +__declspec(safe,cost1) static inline uniform unsigned int16 min(uniform unsigned int16 a, uniform unsigned int16 b) { return (a < b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline uniform unsigned int16 max(uniform unsigned int16 a, uniform unsigned int16 b) { return (a > b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline uniform int16 min(uniform int16 a, uniform int16 b) { return (a < b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline uniform int16 max(uniform int16 a, uniform int16 b) { return (a > b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline unsigned int16 min(unsigned int16 a, unsigned int16 b) { return (a < b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline unsigned int16 max(unsigned int16 a, unsigned int16 b) { return (a > b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline int16 min(int16 a, int16 b) { return (a < b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline int16 max(int16 a, int16 b) { return (a > b) ? a : b; } @@ -1510,6 +1559,18 @@ static inline uniform float clamp(uniform float v, uniform float low, uniform fl return min(max(v, low), high); } +// double + +__declspec(safe,cost2) +static inline double clamp(double v, double low, double high) { + return min(max(v, low), high); +} + +__declspec(safe,cost2) +static inline uniform double clamp(uniform double v, uniform double low, uniform double high) { + return min(max(v, low), high); +} + // int8 __declspec(safe,cost2) @@ -2134,7 +2195,7 @@ static inline uniform float frexp(uniform float x, uniform int * uniform pw2) { __declspec(safe) static inline float sin(float x_full) { if (__math_lib == __math_lib_svml) { - return __svml_sin(x_full); + return __svml_sinf(x_full); } else if (__math_lib == __math_lib_system) { float ret; @@ -2267,8 +2328,10 @@ static inline float asin(float x) { bool isnan = (x > 1); float v; - if (__math_lib == __math_lib_svml || - __math_lib == __math_lib_system) { + if (__math_lib == __math_lib_svml) { + return __svml_asinf(x); + } + else if (__math_lib == __math_lib_system) { float ret; foreach_active (i) { uniform float r = __stdlib_asinf(extract(x, i)); @@ -2371,7 +2434,7 @@ static inline uniform float asin(uniform float x) { __declspec(safe) static inline float cos(float x_full) { if (__math_lib == __math_lib_svml) { - return __svml_cos(x_full); + return __svml_cosf(x_full); } else if (__math_lib == __math_lib_system) { float ret; @@ -2499,18 +2562,28 @@ static inline float acos(float v) { return 1.57079637050628662109375 - asin(v); } +__declspec(safe) +static inline double acos(const double v) { + return 1.57079637050628662109375d0 - asin(v); +} + __declspec(safe) static inline uniform float acos(uniform float v) { return 1.57079637050628662109375 - asin(v); } +__declspec(safe) +static inline uniform double acos(const uniform double v) { + return 1.57079637050628662109375d0 - asin(v); +} + __declspec(safe) static inline void sincos(float x_full, varying float * uniform sin_result, varying float * uniform cos_result) { if (__math_lib == __math_lib_svml) { - __svml_sincos(x_full, sin_result, cos_result); + __svml_sincosf(x_full, sin_result, 
cos_result); } else if (__math_lib == __math_lib_system) { foreach_active (i) { @@ -2642,7 +2715,7 @@ static inline void sincos(uniform float x_full, uniform float * uniform sin_resu __declspec(safe) static inline float tan(float x_full) { if (__math_lib == __math_lib_svml) { - return __svml_tan(x_full); + return __svml_tanf(x_full); } else if (__math_lib == __math_lib_system) { float ret; @@ -2793,7 +2866,7 @@ static inline uniform float tan(uniform float x_full) { __declspec(safe) static inline float atan(float x_full) { if (__math_lib == __math_lib_svml) { - return __svml_atan(x_full); + return __svml_atanf(x_full); } else if (__math_lib == __math_lib_system) { float ret; @@ -2888,7 +2961,7 @@ static inline uniform float atan(uniform float x_full) { __declspec(safe) static inline float atan2(float y, float x) { if (__math_lib == __math_lib_svml) { - return __svml_atan2(y, x); + return __svml_atan2f(y, x); } else if (__math_lib == __math_lib_system) { float ret; @@ -2951,7 +3024,7 @@ static inline float exp(float x_full) { return __exp_varying_float(x_full); } else if (__math_lib == __math_lib_svml) { - return __svml_exp(x_full); + return __svml_expf(x_full); } else if (__math_lib == __math_lib_system) { float ret; @@ -3119,7 +3192,7 @@ static inline void __range_reduce_log(float input, varying float * uniform reduc static const int nonexponent_mask = 0x807FFFFF; // We want the reduced version to have an exponent of -1 which is -1 + 127 after biasing or 126 - static const int exponent_neg1 = (126 << 23); + static const int exponent_neg1 = (126l << 23); // NOTE(boulos): We don't need to mask anything out since we know // the sign bit has to be 0. If it's 1, we need to return infinity/nan // anyway (log(x), x = +-0 -> infinity, x < 0 -> NaN). @@ -3142,7 +3215,7 @@ static inline void __range_reduce_log(uniform float input, uniform float * unifo uniform int int_version = intbits(input); static const uniform int nonexponent_mask = 0x807FFFFF; - static const uniform int exponent_neg1 = (126 << 23); + static const uniform int exponent_neg1 = (126ul << 23); uniform int biased_exponent = int_version >> 23; uniform int offset_exponent = biased_exponent + 1; *exponent = offset_exponent - 127; // get the real value @@ -3158,7 +3231,7 @@ static inline float log(float x_full) { return __log_varying_float(x_full); } else if (__math_lib == __math_lib_svml) { - return __svml_log(x_full); + return __svml_logf(x_full); } else if (__math_lib == __math_lib_system) { float ret; @@ -3333,7 +3406,7 @@ static inline float pow(float a, float b) { return __pow_varying_float(a, b); } else if (__math_lib == __math_lib_svml) { - return __svml_pow(a, b); + return __svml_powf(a, b); } else if (__math_lib == __math_lib_system) { float ret; @@ -3423,7 +3496,11 @@ static inline uniform double frexp(uniform double x, uniform int * uniform pw2) __declspec(safe) static inline double sin(double x) { - if (__math_lib == __math_lib_ispc_fast) + if (__math_lib == __math_lib_svml) + { + return __svml_sind(x); + } + else if (__math_lib == __math_lib_ispc_fast) return sin((float)x); else { double ret; @@ -3444,8 +3521,30 @@ static inline uniform double sin(uniform double x) { } __declspec(safe) -static inline double cos(double x) { - if (__math_lib == __math_lib_ispc_fast) +static inline double asin(const double x) { + if (__math_lib == __math_lib_svml) + { + return __svml_asind(x); + } + else if (__math_lib == __math_lib_ispc_fast) + return asin((float)x); + else { + double ret; + foreach_active (i) { + uniform double r = 
__stdlib_asin(extract(x, i)); + ret = insert(ret, i, r); + } + return ret; + } +} + +__declspec(safe) +static inline double cos(const double x) { + if (__math_lib == __math_lib_svml) + { + return __svml_cosd(x); + } + else if (__math_lib == __math_lib_ispc_fast) return cos((float)x); else { double ret; @@ -3468,7 +3567,11 @@ static inline uniform double cos(uniform double x) { __declspec(safe) static inline void sincos(double x, varying double * uniform sin_result, varying double * uniform cos_result) { - if (__math_lib == __math_lib_ispc_fast) { + if (__math_lib == __math_lib_svml) + { + __svml_sincosd(x, sin_result, cos_result); + } + else if (__math_lib == __math_lib_ispc_fast) { float sr, cr; sincos((float)x, &sr, &cr); *sin_result = sr; @@ -3499,7 +3602,11 @@ static inline void sincos(uniform double x, uniform double * uniform sin_result, __declspec(safe) static inline double tan(double x) { - if (__math_lib == __math_lib_ispc_fast) + if (__math_lib == __math_lib_svml) + { + return __svml_tand(x); + } + else if (__math_lib == __math_lib_ispc_fast) return tan((float)x); else { double ret; @@ -3543,7 +3650,11 @@ static inline uniform double atan(uniform double x) { __declspec(safe) static inline double atan2(double y, double x) { - if (__math_lib == __math_lib_ispc_fast) + if (__math_lib == __math_lib_svml) + { + return __svml_atan2d(y,x); + } + else if (__math_lib == __math_lib_ispc_fast) return atan2((float)y, (float)x); else { double ret; @@ -3565,7 +3676,11 @@ static inline uniform double atan2(uniform double y, uniform double x) { __declspec(safe) static inline double exp(double x) { - if (__math_lib == __math_lib_ispc_fast) + if (__math_lib == __math_lib_svml) + { + return __svml_expd(x); + } + else if (__math_lib == __math_lib_ispc_fast) return exp((float)x); else { double ret; @@ -3587,7 +3702,11 @@ static inline uniform double exp(uniform double x) { __declspec(safe) static inline double log(double x) { - if (__math_lib == __math_lib_ispc_fast) + if (__math_lib == __math_lib_svml) + { + return __svml_logd(x); + } + else if (__math_lib == __math_lib_ispc_fast) return log((float)x); else { double ret; @@ -3609,7 +3728,11 @@ static inline uniform double log(uniform double x) { __declspec(safe) static inline double pow(double a, double b) { - if (__math_lib == __math_lib_ispc_fast) + if (__math_lib == __math_lib_svml) + { + return __svml_powd(a,b); + } + else if (__math_lib == __math_lib_ispc_fast) return pow((float)a, (float)b); else { double ret; @@ -3640,18 +3763,18 @@ static inline uniform float half_to_float(uniform unsigned int16 h) { else { // https://gist.github.com/2144712 // Fabian "ryg" Giesen. - static const uniform unsigned int32 shifted_exp = 0x7c00 << 13; // exponent mask after shift + static const uniform unsigned int32 shifted_exp = 0x7c00ul << 13; // exponent mask after shift uniform int32 o = ((int32)(h & 0x7fff)) << 13; // exponent/mantissa bits uniform unsigned int32 exp = shifted_exp & o; // just the exponent - o += (127 - 15) << 23; // exponent adjust + o += (uniform int32)(127 - 15) << 23; // exponent adjust // handle exponent special cases if (exp == shifted_exp) // Inf/NaN? - o += (128 - 16) << 23; // extra exp adjust + o += (uniform unsigned int32)(128 - 16) << 23; // extra exp adjust else if (exp == 0) { // Zero/Denormal? 
- o += 1 << 23; // extra exp adjust - o = intbits(floatbits(o) - floatbits(113 << 23)); // renormalize + o += 1ul << 23; // extra exp adjust + o = intbits(floatbits(o) - floatbits(113ul << 23)); // renormalize } o |= ((int32)(h & 0x8000)) << 16; // sign bit @@ -3668,17 +3791,17 @@ static inline float half_to_float(unsigned int16 h) { // https://gist.github.com/2144712 // Fabian "ryg" Giesen. - const unsigned int32 shifted_exp = 0x7c00 << 13; // exponent mask after shift + const unsigned int32 shifted_exp = 0x7c00ul << 13; // exponent mask after shift - int32 o = ((int32)(h & 0x7fff)) << 13; // exponent/mantissa bits + int32 o = ((int32)(h & 0x7ffful)) << 13; // exponent/mantissa bits unsigned int32 exp = shifted_exp & o; // just the exponent - o += (127 - 15) << 23; // exponent adjust + o += (int32)(127 - 15) << 23; // exponent adjust - int32 infnan_val = o + ((128 - 16) << 23); - int32 zerodenorm_val = intbits(floatbits(o + (1<<23)) - floatbits(113 << 23)); + int32 infnan_val = o + ((int32)(128 - 16) << 23); + int32 zerodenorm_val = intbits(floatbits(o + (1ul<<23)) - floatbits(113ul << 23)); int32 reg_val = (exp == 0) ? zerodenorm_val : o; - int32 sign_bit = ((int32)(h & 0x8000)) << 16; + int32 sign_bit = ((int32)(h & 0x8000ul)) << 16; return floatbits(((exp == shifted_exp) ? infnan_val : reg_val) | sign_bit); } } @@ -3708,16 +3831,16 @@ static inline uniform int16 float_to_half(uniform float f) { // NaN->qNaN and Inf->Inf // unconditional assignment here, will override with right value for // the regular case below. - uniform int32 f32infty = 255 << 23; - o = (fint > f32infty) ? 0x7e00 : 0x7c00; + uniform int32 f32infty = 255ul << 23; + o = (fint > f32infty) ? 0x7e00u : 0x7c00u; // (De)normalized number or zero // update fint unconditionally to save the blending; we don't need it // anymore for the Inf/NaN case anyway. - const uniform unsigned int32 round_mask = ~0xfffu; - const uniform int32 magic = 15 << 23; - const uniform int32 f16infty = 31 << 23; + const uniform unsigned int32 round_mask = ~0xffful; + const uniform int32 magic = 15ul << 23; + const uniform int32 f16infty = 31ul << 23; uniform int32 fint2 = intbits(floatbits(fint & round_mask) * floatbits(magic)) - round_mask; fint2 = (fint2 > f16infty) ? f16infty : fint2; // Clamp to signed infinity if overflowed @@ -3754,16 +3877,16 @@ static inline int16 float_to_half(float f) { // NaN->qNaN and Inf->Inf // unconditional assignment here, will override with right value for // the regular case below. - int32 f32infty = 255 << 23; - o = (fint > f32infty) ? 0x7e00 : 0x7c00; + int32 f32infty = 255ul << 23; + o = (fint > f32infty) ? 0x7e00u : 0x7c00u; // (De)normalized number or zero // update fint unconditionally to save the blending; we don't need it // anymore for the Inf/NaN case anyway. - const unsigned int32 round_mask = ~0xfffu; - const int32 magic = 15 << 23; - const int32 f16infty = 31 << 23; + const unsigned int32 round_mask = ~0xffful; + const int32 magic = 15ul << 23; + const int32 f16infty = 31ul << 23; // Shift exponent down, denormalize if necessary. // NOTE This represents half-float denormals using single precision denormals. @@ -3782,7 +3905,7 @@ static inline int16 float_to_half(float f) { // FP16 denormals are rare in practice, I don't know. Whatever slow path your HW // may or may not have for denormals, this may well hit it. 
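The half_to_float() conversion above (Fabian Giesen's trick) shifts the half's exponent and mantissa into float position, rebiases the exponent by 127 - 15, and then patches the Inf/NaN and zero/denormal cases. A Python model of that bit manipulation, using struct to reinterpret bits; it mirrors the scalar logic only, not the ispc masking or the float_to_half direction:

    import math
    import struct

    def bits_to_float(u):
        return struct.unpack("<f", struct.pack("<I", u & 0xffffffff))[0]

    def half_to_float(h):
        shifted_exp = 0x7c00 << 13              # exponent mask after shift
        o = (h & 0x7fff) << 13                  # exponent/mantissa bits
        exp = shifted_exp & o
        o += (127 - 15) << 23                   # rebias the exponent
        if exp == shifted_exp:                  # Inf/NaN
            o += (128 - 16) << 23
        elif exp == 0:                          # zero/denormal: renormalize
            o += 1 << 23
            o = struct.unpack("<I", struct.pack("<f",
                    bits_to_float(o) - bits_to_float(113 << 23)))[0]
        return bits_to_float(o | ((h & 0x8000) << 16))

    assert half_to_float(0x3C00) == 1.0
    assert half_to_float(0xC000) == -2.0
    assert math.isinf(half_to_float(0x7C00))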
float fscale = floatbits(fint & round_mask) * floatbits(magic); - fscale = min(fscale, floatbits((31 << 23) - 0x1000)); + fscale = min(fscale, floatbits((31ul << 23) - 0x1000ul)); int32 fint2 = intbits(fscale) - round_mask; if (fint < f32infty) @@ -3949,7 +4072,7 @@ float_to_srgb8(float inval) // Do the table lookup and unpack bias, scale unsigned int tab = table[(intbits(inval) - 0x39000000u) >> 20]; unsigned int bias = (tab >> 16) << 9; - unsigned int scale = tab & 0xffff; + unsigned int scale = tab & 0xfffful; // Grab next-highest mantissa bits and perform linear interpolation unsigned int t = (intbits(inval) >> 12) & 0xff; @@ -3999,7 +4122,7 @@ float_to_srgb8(uniform float inval) // Do the table lookup and unpack bias, scale uniform unsigned int tab = table[(intbits(inval) - 0x39000000u) >> 20]; uniform unsigned int bias = (tab >> 16) << 9; - uniform unsigned int scale = tab & 0xffff; + uniform unsigned int scale = tab & 0xfffful; // Grab next-highest mantissa bits and perform linear interpolation uniform unsigned int t = (intbits(inval) >> 12) & 0xff; @@ -4046,14 +4169,14 @@ static inline uniform unsigned int random(uniform RNGState * uniform state) static inline float frandom(varying RNGState * uniform state) { unsigned int irand = random(state); - irand &= (1<<23)-1; + irand &= (1ul<<23)-1; return floatbits(0x3F800000 | irand)-1.0f; } static inline uniform float frandom(uniform RNGState * uniform state) { uniform unsigned int irand = random(state); - irand &= (1<<23)-1; + irand &= (1ul<<23)-1; return floatbits(0x3F800000 | irand)-1.0f; } @@ -4061,18 +4184,18 @@ static inline void seed_rng(varying RNGState * uniform state, unsigned int seed) { state->z1 = seed; state->z2 = seed ^ 0xbeeff00d; - state->z3 = ((seed & 0xffff) << 16) | (seed >> 16); - state->z4 = (((seed & 0xff) << 24) | ((seed & 0xff00) << 8) | - ((seed & 0xff0000) >> 8) | (seed & 0xff000000) >> 24); + state->z3 = ((seed & 0xfffful) << 16) | (seed >> 16); + state->z4 = (((seed & 0xfful) << 24) | ((seed & 0xff00ul) << 8) | + ((seed & 0xff0000ul) >> 8) | (seed & 0xff000000ul) >> 24); } static inline void seed_rng(uniform RNGState * uniform state, uniform unsigned int seed) { state->z1 = seed; state->z2 = seed ^ 0xbeeff00d; - state->z3 = ((seed & 0xffff) << 16) | (seed >> 16); - state->z4 = (((seed & 0xff) << 24) | ((seed & 0xff00) << 8) | - ((seed & 0xff0000) >> 8) | (seed & 0xff000000) >> 24); + state->z3 = ((seed & 0xfffful) << 16) | (seed >> 16); + state->z4 = (((seed & 0xfful) << 24) | ((seed & 0xff00ul) << 8) | + ((seed & 0xff0000ul) >> 8) | (seed & 0xff000000ul) >> 24); } @@ -4090,7 +4213,7 @@ static inline uniform bool rdrand(float * uniform ptr) { uniform int32 irand; uniform bool success = __rdrand_i32(&irand); if (success) { - irand &= (1<<23)-1; + irand &= (1ul<<23)-1; *ptr = floatbits(0x3F800000 | irand)-1.0f; } return success; @@ -4110,7 +4233,7 @@ static inline bool rdrand(varying float * uniform ptr) { // in vector form. However, we need to be careful to not // clobber any existing already-set values in *ptr with // inactive lanes here... 
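The frandom() and rdrand() float paths above rely on a standard mantissa trick: keep 23 random bits, OR them into the bit pattern of 1.0f (0x3F800000) to get a float uniformly distributed in [1, 2), and subtract 1.0 to map it to [0, 1). A short Python illustration using struct; it shows the arithmetic only, not the LFSR generator or the RDRAND instruction:

    import random
    import struct

    def bits_to_float(u):
        return struct.unpack("<f", struct.pack("<I", u))[0]

    def frandom(irand):
        irand &= (1 << 23) - 1                           # keep 23 mantissa bits
        return bits_to_float(0x3F800000 | irand) - 1.0   # [1, 2) -> [0, 1)

    x = frandom(random.getrandbits(32))
    assert 0.0 <= x < 1.0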
- irand &= (1<<23)-1; + irand &= (1ul<<23)-1; *ptr = floatbits(0x3F800000 | irand)-1.0f; success = true; } @@ -4130,7 +4253,7 @@ static inline bool rdrand(float * ptr) { foreach_active (index) { uniform int32 irand; if (__rdrand_i32(&irand)) { - irand &= (1<<23)-1; + irand &= (1ul<<23)-1; *ptrs[index] = floatbits(0x3F800000 | irand)-1.0f; success = true; } @@ -4264,3 +4387,720 @@ static inline bool rdrand(int64 * ptr) { return success; } } + +/////////////////////////////////////////////////////////////////////////// +// Fast vector integer division + +/* These tables and the algorithms in the __fast_idiv() functions below are + from Halide; the idea is based on the paper "Division by Invariant + Integers using Multiplication" by Granlund and Montgomery. + + Copyright (c) 2012 MIT CSAIL + + Developed by: + + The Halide team + MIT CSAIL + http://halide-lang.org + + Permission is hereby granted, free of charge, to any person obtaining a + copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
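To make the tables that follow concrete: each row holds a (method, multiplier, shift) triple, apparently indexed by divisor - 2, and the __fast_idiv() helpers select one of three strategies from it. A hedged Python model of the unsigned 8-bit case, checked against the first few rows of __idiv_table_u8 below; the real code uses vector multiply-high instructions, this is only the scalar arithmetic:

    # (method, multiplier, shift) rows for divisors 2..9, copied from
    # __idiv_table_u8 below; the full table continues through divisor 256.
    IDIV_U8 = [(0, 0, 1), (1, 171, 1), (0, 0, 2), (1, 205, 2),
               (1, 171, 2), (2, 37, 2), (0, 0, 3), (1, 57, 1)]

    def fast_udiv8(x, d):
        method, mult, shift = IDIV_U8[d - 2]
        if method == 0:                       # divisor is a power of two
            return x >> shift
        t = (x * mult) >> 8                   # multiply-high for 8-bit operands
        if method == 1:
            return t >> shift
        return (t + ((x - t) >> 1)) >> shift  # method 2: extra rounding step

    for d in range(2, 10):
        for x in range(256):
            assert fast_udiv8(x, d) == x // d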
+ */ + +static const uniform int64 __idiv_table_u8[][3] = { + {0, 0LL, 1}, {1, 171LL, 1}, {0, 0LL, 2}, + {1, 205LL, 2}, {1, 171LL, 2}, {2, 37LL, 2}, + {0, 0LL, 3}, {1, 57LL, 1}, {1, 205LL, 3}, + {2, 117LL, 3}, {1, 171LL, 3}, {1, 79LL, 2}, + {2, 37LL, 3}, {1, 137LL, 3}, {0, 0LL, 4}, + {1, 241LL, 4}, {1, 57LL, 2}, {1, 27LL, 1}, + {1, 205LL, 4}, {2, 135LL, 4}, {2, 117LL, 4}, + {2, 101LL, 4}, {1, 171LL, 4}, {1, 41LL, 2}, + {1, 79LL, 3}, {1, 19LL, 1}, {2, 37LL, 4}, + {2, 27LL, 4}, {1, 137LL, 4}, {2, 9LL, 4}, + {0, 0LL, 5}, {1, 249LL, 5}, {1, 241LL, 5}, + {1, 235LL, 5}, {1, 57LL, 3}, {1, 111LL, 4}, + {1, 27LL, 2}, {2, 165LL, 5}, {1, 205LL, 5}, + {1, 25LL, 2}, {2, 135LL, 5}, {1, 191LL, 5}, + {1, 187LL, 5}, {2, 109LL, 5}, {2, 101LL, 5}, + {1, 175LL, 5}, {1, 171LL, 5}, {2, 79LL, 5}, + {1, 41LL, 3}, {1, 161LL, 5}, {1, 79LL, 4}, + {1, 155LL, 5}, {1, 19LL, 2}, {1, 149LL, 5}, + {2, 37LL, 5}, {1, 9LL, 1}, {2, 27LL, 5}, + {1, 139LL, 5}, {1, 137LL, 5}, {2, 13LL, 5}, + {2, 9LL, 5}, {2, 5LL, 5}, {0, 0LL, 6}, + {1, 253LL, 6}, {1, 249LL, 6}, {1, 245LL, 6}, + {1, 121LL, 5}, {1, 119LL, 5}, {1, 235LL, 6}, + {1, 231LL, 6}, {1, 57LL, 4}, {1, 225LL, 6}, + {1, 111LL, 5}, {1, 219LL, 6}, {1, 27LL, 3}, + {1, 213LL, 6}, {2, 165LL, 6}, {1, 13LL, 2}, + {1, 205LL, 6}, {1, 203LL, 6}, {1, 25LL, 3}, + {1, 99LL, 5}, {2, 135LL, 6}, {1, 193LL, 6}, + {1, 191LL, 6}, {1, 189LL, 6}, {1, 187LL, 6}, + {1, 185LL, 6}, {1, 183LL, 6}, {1, 181LL, 6}, + {1, 179LL, 6}, {1, 177LL, 6}, {1, 175LL, 6}, + {1, 173LL, 6}, {1, 171LL, 6}, {1, 169LL, 6}, + {1, 21LL, 3}, {1, 83LL, 5}, {1, 41LL, 4}, + {1, 163LL, 6}, {1, 161LL, 6}, {2, 63LL, 6}, + {1, 79LL, 5}, {2, 57LL, 6}, {1, 155LL, 6}, + {2, 51LL, 6}, {1, 19LL, 3}, {1, 151LL, 6}, + {1, 149LL, 6}, {1, 37LL, 4}, {2, 37LL, 6}, + {1, 145LL, 6}, {1, 9LL, 2}, {1, 143LL, 6}, + {2, 27LL, 6}, {2, 25LL, 6}, {1, 139LL, 6}, + {1, 69LL, 5}, {1, 137LL, 6}, {2, 15LL, 6}, + {2, 13LL, 6}, {2, 11LL, 6}, {2, 9LL, 6}, + {2, 7LL, 6}, {2, 5LL, 6}, {2, 3LL, 6}, + {0, 0LL, 7}, {1, 255LL, 7}, {1, 127LL, 6}, + {1, 63LL, 5}, {1, 125LL, 6}, {1, 31LL, 4}, + {1, 123LL, 6}, {1, 61LL, 5}, {1, 121LL, 6}, + {1, 15LL, 3}, {1, 119LL, 6}, {1, 59LL, 5}, + {1, 235LL, 7}, {1, 117LL, 6}, {1, 29LL, 4}, + {1, 115LL, 6}, {1, 57LL, 5}, {1, 113LL, 6}, + {1, 225LL, 7}, {1, 7LL, 2}, {1, 111LL, 6}, + {1, 55LL, 5}, {1, 219LL, 7}, {1, 109LL, 6}, + {1, 27LL, 4}, {1, 215LL, 7}, {1, 107LL, 6}, + {1, 53LL, 5}, {1, 211LL, 7}, {1, 105LL, 6}, + {1, 13LL, 3}, {1, 207LL, 7}, {1, 103LL, 6}, + {1, 51LL, 5}, {1, 203LL, 7}, {1, 101LL, 6}, + {1, 25LL, 4}, {1, 199LL, 7}, {1, 99LL, 6}, + {1, 197LL, 7}, {1, 49LL, 5}, {1, 97LL, 6}, + {1, 193LL, 7}, {1, 3LL, 1}, {1, 191LL, 7}, + {1, 95LL, 6}, {1, 189LL, 7}, {1, 47LL, 5}, + {1, 187LL, 7}, {1, 93LL, 6}, {1, 185LL, 7}, + {1, 23LL, 4}, {1, 183LL, 7}, {1, 91LL, 6}, + {1, 181LL, 7}, {1, 45LL, 5}, {1, 179LL, 7}, + {1, 89LL, 6}, {1, 177LL, 7}, {1, 11LL, 3}, + {1, 175LL, 7}, {1, 87LL, 6}, {1, 173LL, 7}, + {1, 43LL, 5}, {1, 171LL, 7}, {1, 85LL, 6}, + {1, 169LL, 7}, {2, 81LL, 7}, {1, 21LL, 4}, + {1, 167LL, 7}, {1, 83LL, 6}, {1, 165LL, 7}, + {1, 41LL, 5}, {2, 71LL, 7}, {1, 163LL, 7}, + {1, 81LL, 6}, {1, 161LL, 7}, {1, 5LL, 2}, + {2, 63LL, 7}, {1, 159LL, 7}, {1, 79LL, 6}, + {1, 157LL, 7}, {2, 57LL, 7}, {1, 39LL, 5}, + {1, 155LL, 7}, {1, 77LL, 6}, {2, 51LL, 7}, + {1, 153LL, 7}, {1, 19LL, 4}, {2, 47LL, 7}, + {1, 151LL, 7}, {1, 75LL, 6}, {1, 149LL, 7}, + {2, 41LL, 7}, {1, 37LL, 5}, {1, 147LL, 7}, + {2, 37LL, 7}, {1, 73LL, 6}, {1, 145LL, 7}, + {2, 33LL, 7}, {1, 9LL, 3}, {2, 31LL, 7}, + {1, 143LL, 7}, {1, 71LL, 6}, {2, 27LL, 7}, + {1, 
141LL, 7}, {2, 25LL, 7}, {1, 35LL, 5}, + {1, 139LL, 7}, {2, 21LL, 7}, {1, 69LL, 6}, + {2, 19LL, 7}, {1, 137LL, 7}, {1, 17LL, 4}, + {2, 15LL, 7}, {1, 135LL, 7}, {2, 13LL, 7}, + {1, 67LL, 6}, {2, 11LL, 7}, {1, 133LL, 7}, + {2, 9LL, 7}, {1, 33LL, 5}, {2, 7LL, 7}, + {1, 131LL, 7}, {2, 5LL, 7}, {1, 65LL, 6}, + {2, 3LL, 7}, {1, 129LL, 7}, {0, 0LL, 8}, +}; +static const uniform int64 __idiv_table_s8[][3] = { + {0, 0LL, 1}, {1, 86LL, 0}, {0, 0LL, 2}, + {1, 103LL, 1}, {1, 43LL, 0}, {1, 147LL, 2}, + {0, 0LL, 3}, {1, 57LL, 1}, {1, 103LL, 2}, + {1, 187LL, 3}, {1, 43LL, 1}, {1, 79LL, 2}, + {1, 147LL, 3}, {1, 137LL, 3}, {0, 0LL, 4}, + {1, 121LL, 3}, {1, 57LL, 2}, {1, 27LL, 1}, + {1, 103LL, 3}, {1, 49LL, 2}, {1, 187LL, 4}, + {1, 179LL, 4}, {1, 43LL, 2}, {1, 41LL, 2}, + {1, 79LL, 3}, {1, 19LL, 1}, {1, 147LL, 4}, + {1, 71LL, 3}, {1, 137LL, 4}, {1, 133LL, 4}, + {0, 0LL, 5}, {1, 125LL, 4}, {1, 121LL, 4}, + {1, 59LL, 3}, {1, 57LL, 3}, {1, 111LL, 4}, + {1, 27LL, 2}, {1, 211LL, 5}, {1, 103LL, 4}, + {1, 25LL, 2}, {1, 49LL, 3}, {1, 6LL, 0}, + {1, 47LL, 3}, {1, 23LL, 2}, {1, 45LL, 3}, + {1, 11LL, 1}, {1, 43LL, 3}, {1, 21LL, 2}, + {1, 41LL, 3}, {1, 81LL, 4}, {1, 79LL, 4}, + {1, 39LL, 3}, {1, 19LL, 2}, {1, 75LL, 4}, + {1, 147LL, 5}, {1, 9LL, 1}, {1, 71LL, 4}, + {1, 35LL, 3}, {1, 137LL, 5}, {1, 135LL, 5}, + {1, 133LL, 5}, {1, 131LL, 5}, {0, 0LL, 6}, + {1, 127LL, 5}, {1, 63LL, 4}, {1, 31LL, 3}, + {1, 61LL, 4}, {1, 15LL, 2}, {1, 59LL, 4}, + {1, 29LL, 3}, {1, 57LL, 4}, {1, 113LL, 5}, + {1, 7LL, 1}, {1, 55LL, 4}, {1, 27LL, 3}, + {1, 107LL, 5}, {1, 53LL, 4}, {1, 13LL, 2}, + {1, 103LL, 5}, {1, 51LL, 4}, {1, 25LL, 3}, + {1, 99LL, 5}, {1, 49LL, 4}, {1, 97LL, 5}, + {1, 3LL, 0}, {1, 95LL, 5}, {1, 47LL, 4}, + {1, 93LL, 5}, {1, 23LL, 3}, {1, 91LL, 5}, + {1, 45LL, 4}, {1, 89LL, 5}, {1, 11LL, 2}, + {1, 87LL, 5}, {1, 43LL, 4}, {1, 85LL, 5}, + {1, 21LL, 3}, {1, 83LL, 5}, {1, 41LL, 4}, + {1, 163LL, 6}, {1, 81LL, 5}, {1, 5LL, 1}, + {1, 79LL, 5}, {1, 157LL, 6}, {1, 39LL, 4}, + {1, 77LL, 5}, {1, 19LL, 3}, {1, 151LL, 6}, + {1, 75LL, 5}, {1, 37LL, 4}, {1, 147LL, 6}, + {1, 73LL, 5}, {1, 9LL, 2}, {1, 143LL, 6}, + {1, 71LL, 5}, {1, 141LL, 6}, {1, 35LL, 4}, + {1, 69LL, 5}, {1, 137LL, 6}, {1, 17LL, 3}, + {1, 135LL, 6}, {1, 67LL, 5}, {1, 133LL, 6}, + {1, 33LL, 4}, {1, 131LL, 6}, {1, 65LL, 5}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 
0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, +}; +static const uniform int64 __idiv_table_u16[][3] = { + {0, 0LL, 1}, {1, 43691LL, 1}, {0, 0LL, 2}, + {1, 52429LL, 2}, {1, 43691LL, 2}, {2, 9363LL, 2}, + {0, 0LL, 3}, {1, 58255LL, 3}, {1, 52429LL, 3}, + {1, 47663LL, 3}, {1, 43691LL, 3}, {1, 20165LL, 2}, + {2, 9363LL, 3}, {1, 34953LL, 3}, {0, 0LL, 4}, + {1, 61681LL, 4}, {1, 58255LL, 4}, {1, 55189LL, 4}, + {1, 52429LL, 4}, {2, 34329LL, 4}, {1, 47663LL, 4}, + {2, 25645LL, 4}, {1, 43691LL, 4}, {2, 18351LL, 4}, + {1, 20165LL, 3}, {2, 12137LL, 4}, {2, 9363LL, 4}, + {1, 18079LL, 3}, {1, 34953LL, 4}, {2, 2115LL, 4}, + {0, 0LL, 5}, {1, 63551LL, 5}, {1, 61681LL, 5}, + {1, 59919LL, 5}, {1, 58255LL, 5}, {1, 7085LL, 2}, + {1, 55189LL, 5}, {2, 42011LL, 5}, {1, 52429LL, 5}, + {2, 36765LL, 5}, {2, 34329LL, 5}, {1, 48771LL, 5}, + {1, 47663LL, 5}, {1, 11651LL, 3}, {2, 25645LL, 5}, + {2, 23705LL, 5}, {1, 43691LL, 5}, {2, 20063LL, 5}, + {2, 18351LL, 5}, {1, 41121LL, 5}, {1, 20165LL, 4}, + {1, 39569LL, 5}, {2, 12137LL, 5}, {2, 10725LL, 5}, + {2, 9363LL, 5}, {2, 8049LL, 5}, {1, 18079LL, 4}, + {1, 35545LL, 5}, {1, 34953LL, 5}, {1, 8595LL, 3}, + {2, 2115LL, 5}, {2, 1041LL, 5}, {0, 0LL, 6}, + {1, 4033LL, 2}, {1, 63551LL, 6}, {1, 31301LL, 5}, + {1, 61681LL, 6}, {2, 56039LL, 6}, {1, 59919LL, 6}, + {1, 59075LL, 6}, {1, 58255LL, 6}, {1, 57457LL, 6}, + {1, 7085LL, 3}, {2, 46313LL, 6}, {1, 55189LL, 6}, + {1, 6809LL, 3}, {2, 42011LL, 6}, {1, 53093LL, 6}, + {1, 52429LL, 6}, {1, 25891LL, 5}, {2, 36765LL, 6}, + {1, 25267LL, 5}, {2, 34329LL, 6}, {1, 49345LL, 6}, + {1, 48771LL, 6}, {1, 48211LL, 6}, {1, 47663LL, 6}, + {2, 28719LL, 6}, {1, 11651LL, 4}, {2, 26647LL, 6}, + {2, 25645LL, 6}, {2, 24665LL, 6}, {2, 23705LL, 6}, + {1, 44151LL, 6}, {1, 43691LL, 6}, {2, 20945LL, 6}, + {2, 20063LL, 6}, {1, 42367LL, 6}, {2, 18351LL, 6}, + {1, 5191LL, 3}, {1, 41121LL, 6}, {1, 20361LL, 5}, + {1, 20165LL, 5}, {1, 19973LL, 5}, {1, 39569LL, 6}, + {2, 12863LL, 6}, {2, 12137LL, 6}, {1, 2405LL, 2}, + {2, 10725LL, 6}, {1, 37787LL, 6}, {2, 9363LL, 6}, + {1, 18559LL, 5}, {2, 8049LL, 6}, {2, 7409LL, 6}, + {1, 18079LL, 5}, {1, 35849LL, 6}, {1, 35545LL, 6}, + {2, 4957LL, 6}, {1, 34953LL, 6}, {1, 4333LL, 3}, + {1, 8595LL, 4}, {2, 2665LL, 6}, {2, 2115LL, 6}, + {2, 1573LL, 6}, {2, 1041LL, 6}, {2, 517LL, 6}, + {0, 0LL, 7}, {1, 16257LL, 5}, {1, 4033LL, 3}, + {1, 16009LL, 5}, {1, 63551LL, 7}, {1, 63073LL, 7}, + {1, 31301LL, 6}, {1, 31069LL, 6}, {1, 61681LL, 7}, + {1, 61231LL, 7}, {2, 56039LL, 7}, {1, 30175LL, 6}, + {1, 59919LL, 7}, {1, 29747LL, 6}, {1, 59075LL, 7}, + {1, 29331LL, 6}, {1, 58255LL, 7}, {1, 57853LL, 7}, + {1, 57457LL, 7}, {1, 28533LL, 6}, {1, 7085LL, 4}, + {1, 14075LL, 5}, {2, 46313LL, 7}, {1, 27777LL, 6}, + {1, 55189LL, 7}, {1, 13707LL, 5}, {1, 6809LL, 4}, + {2, 42705LL, 7}, {2, 42011LL, 7}, {1, 53431LL, 7}, + {1, 53093LL, 7}, {1, 52759LL, 7}, {1, 52429LL, 7}, + {2, 38671LL, 7}, {1, 25891LL, 6}, {1, 6433LL, 4}, + {2, 36765LL, 7}, {2, 36145LL, 7}, {1, 25267LL, 6}, + {2, 34927LL, 7}, {2, 34329LL, 7}, {1, 49637LL, 7}, + {1, 49345LL, 7}, {2, 32577LL, 7}, {1, 48771LL, 7}, + {2, 31443LL, 7}, {1, 48211LL, 7}, {1, 47935LL, 
7}, + {1, 47663LL, 7}, {2, 29251LL, 7}, {2, 28719LL, 7}, + {1, 2929LL, 3}, {1, 11651LL, 5}, {1, 23173LL, 6}, + {2, 26647LL, 7}, {1, 2865LL, 3}, {2, 25645LL, 7}, + {1, 1417LL, 2}, {2, 24665LL, 7}, {1, 44859LL, 7}, + {2, 23705LL, 7}, {2, 23233LL, 7}, {1, 44151LL, 7}, + {1, 2745LL, 3}, {1, 43691LL, 7}, {2, 21393LL, 7}, + {2, 20945LL, 7}, {1, 43019LL, 7}, {2, 20063LL, 7}, + {1, 21291LL, 6}, {1, 42367LL, 7}, {1, 21077LL, 6}, + {2, 18351LL, 7}, {1, 41735LL, 7}, {1, 5191LL, 4}, + {2, 17111LL, 7}, {1, 41121LL, 7}, {2, 16305LL, 7}, + {1, 20361LL, 6}, {1, 40525LL, 7}, {1, 20165LL, 6}, + {1, 40137LL, 7}, {1, 19973LL, 6}, {1, 39757LL, 7}, + {1, 39569LL, 7}, {2, 13231LL, 7}, {2, 12863LL, 7}, + {1, 39017LL, 7}, {2, 12137LL, 7}, {2, 11779LL, 7}, + {1, 2405LL, 3}, {2, 11073LL, 7}, {2, 10725LL, 7}, + {1, 18979LL, 6}, {1, 37787LL, 7}, {2, 9699LL, 7}, + {2, 9363LL, 7}, {1, 37283LL, 7}, {1, 18559LL, 6}, + {2, 8373LL, 7}, {2, 8049LL, 7}, {1, 4579LL, 4}, + {2, 7409LL, 7}, {2, 7093LL, 7}, {1, 18079LL, 6}, + {1, 36003LL, 7}, {1, 35849LL, 7}, {2, 5857LL, 7}, + {1, 35545LL, 7}, {1, 35395LL, 7}, {2, 4957LL, 7}, + {1, 35099LL, 7}, {1, 34953LL, 7}, {1, 4351LL, 4}, + {1, 4333LL, 4}, {2, 3507LL, 7}, {1, 8595LL, 5}, + {2, 2943LL, 7}, {2, 2665LL, 7}, {1, 16981LL, 6}, + {2, 2115LL, 7}, {2, 1843LL, 7}, {2, 1573LL, 7}, + {1, 33421LL, 7}, {2, 1041LL, 7}, {1, 33157LL, 7}, + {2, 517LL, 7}, {1, 32897LL, 7}, {0, 0LL, 8}, +}; +static const uniform int64 __idiv_table_s16[][3] = { + {0, 0LL, 1}, {1, 21846LL, 0}, {0, 0LL, 2}, + {1, 26215LL, 1}, {1, 10923LL, 0}, {1, 18725LL, 1}, + {0, 0LL, 3}, {1, 7282LL, 0}, {1, 26215LL, 2}, + {1, 5958LL, 0}, {1, 10923LL, 1}, {1, 20165LL, 2}, + {1, 18725LL, 2}, {1, 34953LL, 3}, {0, 0LL, 4}, + {1, 30841LL, 3}, {1, 3641LL, 0}, {1, 55189LL, 4}, + {1, 26215LL, 3}, {1, 49933LL, 4}, {1, 2979LL, 0}, + {1, 45591LL, 4}, {1, 10923LL, 2}, {1, 5243LL, 1}, + {1, 20165LL, 3}, {1, 38837LL, 4}, {1, 18725LL, 3}, + {1, 18079LL, 3}, {1, 34953LL, 4}, {1, 16913LL, 3}, + {0, 0LL, 5}, {1, 1986LL, 0}, {1, 30841LL, 4}, + {1, 3745LL, 1}, {1, 3641LL, 1}, {1, 7085LL, 2}, + {1, 55189LL, 5}, {1, 26887LL, 4}, {1, 26215LL, 4}, + {1, 51151LL, 5}, {1, 49933LL, 5}, {1, 12193LL, 3}, + {1, 2979LL, 1}, {1, 11651LL, 3}, {1, 45591LL, 5}, + {1, 44621LL, 5}, {1, 10923LL, 3}, {1, 2675LL, 1}, + {1, 5243LL, 2}, {1, 41121LL, 5}, {1, 20165LL, 4}, + {1, 19785LL, 4}, {1, 38837LL, 5}, {1, 38131LL, 5}, + {1, 18725LL, 4}, {1, 36793LL, 5}, {1, 18079LL, 4}, + {1, 17773LL, 4}, {1, 34953LL, 5}, {1, 8595LL, 3}, + {1, 16913LL, 4}, {1, 33289LL, 5}, {0, 0LL, 6}, + {1, 4033LL, 2}, {1, 993LL, 0}, {1, 31301LL, 5}, + {1, 30841LL, 5}, {1, 15197LL, 4}, {1, 3745LL, 2}, + {1, 14769LL, 4}, {1, 3641LL, 2}, {1, 57457LL, 6}, + {1, 7085LL, 3}, {1, 55925LL, 6}, {1, 55189LL, 6}, + {1, 6809LL, 3}, {1, 26887LL, 5}, {1, 26547LL, 5}, + {1, 26215LL, 5}, {1, 25891LL, 5}, {1, 51151LL, 6}, + {1, 25267LL, 5}, {1, 49933LL, 6}, {1, 24673LL, 5}, + {1, 12193LL, 4}, {1, 48211LL, 6}, {1, 2979LL, 2}, + {1, 5891LL, 3}, {1, 11651LL, 4}, {1, 11523LL, 4}, + {1, 45591LL, 6}, {1, 45101LL, 6}, {1, 44621LL, 6}, + {1, 44151LL, 6}, {1, 10923LL, 4}, {1, 43241LL, 6}, + {1, 2675LL, 2}, {1, 662LL, 0}, {1, 5243LL, 3}, + {1, 5191LL, 3}, {1, 41121LL, 6}, {1, 20361LL, 5}, + {1, 20165LL, 5}, {1, 19973LL, 5}, {1, 19785LL, 5}, + {1, 1225LL, 1}, {1, 38837LL, 6}, {1, 2405LL, 2}, + {1, 38131LL, 6}, {1, 37787LL, 6}, {1, 18725LL, 5}, + {1, 18559LL, 5}, {1, 36793LL, 6}, {1, 36473LL, 6}, + {1, 18079LL, 5}, {1, 35849LL, 6}, {1, 17773LL, 5}, + {1, 35247LL, 6}, {1, 34953LL, 6}, {1, 4333LL, 3}, + {1, 8595LL, 4}, {1, 
34101LL, 6}, {1, 16913LL, 5}, + {1, 33555LL, 6}, {1, 33289LL, 6}, {1, 33027LL, 6}, + {0, 0LL, 7}, {1, 16257LL, 5}, {1, 4033LL, 3}, + {1, 16009LL, 5}, {1, 993LL, 1}, {1, 31537LL, 6}, + {1, 31301LL, 6}, {1, 31069LL, 6}, {1, 30841LL, 6}, + {1, 3827LL, 3}, {1, 15197LL, 5}, {1, 30175LL, 6}, + {1, 3745LL, 3}, {1, 29747LL, 6}, {1, 14769LL, 5}, + {1, 29331LL, 6}, {1, 3641LL, 3}, {1, 28927LL, 6}, + {1, 57457LL, 7}, {1, 28533LL, 6}, {1, 7085LL, 4}, + {1, 14075LL, 5}, {1, 55925LL, 7}, {1, 27777LL, 6}, + {1, 55189LL, 7}, {1, 13707LL, 5}, {1, 6809LL, 4}, + {1, 54121LL, 7}, {1, 26887LL, 6}, {1, 6679LL, 4}, + {1, 26547LL, 6}, {1, 6595LL, 4}, {1, 26215LL, 6}, + {1, 6513LL, 4}, {1, 25891LL, 6}, {1, 6433LL, 4}, + {1, 51151LL, 7}, {1, 50841LL, 7}, {1, 25267LL, 6}, + {1, 6279LL, 4}, {1, 49933LL, 7}, {1, 24819LL, 6}, + {1, 24673LL, 6}, {1, 49057LL, 7}, {1, 12193LL, 5}, + {1, 24245LL, 6}, {1, 48211LL, 7}, {1, 749LL, 1}, + {1, 2979LL, 3}, {1, 23697LL, 6}, {1, 5891LL, 4}, + {1, 2929LL, 3}, {1, 11651LL, 5}, {1, 23173LL, 6}, + {1, 11523LL, 5}, {1, 2865LL, 3}, {1, 45591LL, 7}, + {1, 1417LL, 2}, {1, 45101LL, 7}, {1, 11215LL, 5}, + {1, 44621LL, 7}, {1, 44385LL, 7}, {1, 44151LL, 7}, + {1, 2745LL, 3}, {1, 10923LL, 5}, {1, 43465LL, 7}, + {1, 43241LL, 7}, {1, 43019LL, 7}, {1, 2675LL, 3}, + {1, 21291LL, 6}, {1, 331LL, 0}, {1, 21077LL, 6}, + {1, 5243LL, 4}, {1, 41735LL, 7}, {1, 5191LL, 4}, + {1, 10331LL, 5}, {1, 41121LL, 7}, {1, 40921LL, 7}, + {1, 20361LL, 6}, {1, 40525LL, 7}, {1, 20165LL, 6}, + {1, 20069LL, 6}, {1, 19973LL, 6}, {1, 39757LL, 7}, + {1, 19785LL, 6}, {1, 4923LL, 4}, {1, 1225LL, 2}, + {1, 39017LL, 7}, {1, 38837LL, 7}, {1, 19329LL, 6}, + {1, 2405LL, 3}, {1, 38305LL, 7}, {1, 38131LL, 7}, + {1, 18979LL, 6}, {1, 37787LL, 7}, {1, 18809LL, 6}, + {1, 18725LL, 6}, {1, 37283LL, 7}, {1, 18559LL, 6}, + {1, 36955LL, 7}, {1, 36793LL, 7}, {1, 4579LL, 4}, + {1, 36473LL, 7}, {1, 36315LL, 7}, {1, 18079LL, 6}, + {1, 36003LL, 7}, {1, 35849LL, 7}, {1, 35697LL, 7}, + {1, 17773LL, 6}, {1, 8849LL, 5}, {1, 35247LL, 7}, + {1, 35099LL, 7}, {1, 34953LL, 7}, {1, 4351LL, 4}, + {1, 4333LL, 4}, {1, 17261LL, 6}, {1, 8595LL, 5}, + {1, 535LL, 1}, {1, 34101LL, 7}, {1, 16981LL, 6}, + {1, 16913LL, 6}, {1, 16845LL, 6}, {1, 33555LL, 7}, + {1, 33421LL, 7}, {1, 33289LL, 7}, {1, 33157LL, 7}, + {1, 33027LL, 7}, {1, 32897LL, 7}, {1, 32769LL, 7}, +}; +static const uniform int64 __idiv_table_u32[][3] = { + {0, 0LL, 1}, {1, 2863311531LL, 1}, {0, 0LL, 2}, + {1, 3435973837LL, 2}, {1, 2863311531LL, 2}, {2, 613566757LL, 2}, + {0, 0LL, 3}, {1, 954437177LL, 1}, {1, 3435973837LL, 3}, + {1, 3123612579LL, 3}, {1, 2863311531LL, 3}, {1, 1321528399LL, 2}, + {2, 613566757LL, 3}, {1, 2290649225LL, 3}, {0, 0LL, 4}, + {1, 4042322161LL, 4}, {1, 954437177LL, 2}, {2, 2938661835LL, 4}, + {1, 3435973837LL, 4}, {2, 2249744775LL, 4}, {1, 3123612579LL, 4}, + {1, 2987803337LL, 4}, {1, 2863311531LL, 4}, {1, 1374389535LL, 3}, + {1, 1321528399LL, 3}, {2, 795364315LL, 4}, {2, 613566757LL, 4}, + {1, 2369637129LL, 4}, {1, 2290649225LL, 4}, {2, 138547333LL, 4}, + {0, 0LL, 5}, {1, 1041204193LL, 3}, {1, 4042322161LL, 5}, + {2, 3558687189LL, 5}, {1, 954437177LL, 3}, {2, 3134165325LL, 5}, + {2, 2938661835LL, 5}, {2, 2753184165LL, 5}, {1, 3435973837LL, 5}, + {1, 3352169597LL, 5}, {2, 2249744775LL, 5}, {1, 799063683LL, 3}, + {1, 3123612579LL, 5}, {2, 1813430637LL, 5}, {1, 2987803337LL, 5}, + {1, 2924233053LL, 5}, {1, 2863311531LL, 5}, {1, 1402438301LL, 4}, + {1, 1374389535LL, 4}, {1, 2694881441LL, 5}, {1, 1321528399LL, 4}, + {2, 891408307LL, 5}, {2, 795364315LL, 5}, {2, 702812831LL, 5}, + {2, 
613566757LL, 5}, {2, 527452125LL, 5}, {1, 2369637129LL, 5}, + {1, 582368447LL, 3}, {1, 2290649225LL, 5}, {1, 1126548799LL, 4}, + {2, 138547333LL, 5}, {2, 68174085LL, 5}, {0, 0LL, 6}, + {1, 4228890877LL, 6}, {1, 1041204193LL, 4}, {1, 128207979LL, 1}, + {1, 4042322161LL, 6}, {1, 1991868891LL, 5}, {2, 3558687189LL, 6}, + {1, 3871519817LL, 6}, {1, 954437177LL, 4}, {2, 3235934265LL, 6}, + {2, 3134165325LL, 6}, {1, 458129845LL, 3}, {2, 2938661835LL, 6}, + {1, 892460737LL, 4}, {2, 2753184165LL, 6}, {1, 3479467177LL, 6}, + {1, 3435973837LL, 6}, {1, 3393554407LL, 6}, {1, 3352169597LL, 6}, + {1, 827945503LL, 4}, {2, 2249744775LL, 6}, {1, 3233857729LL, 6}, + {1, 799063683LL, 4}, {1, 789879043LL, 4}, {1, 3123612579LL, 6}, + {1, 3088515809LL, 6}, {2, 1813430637LL, 6}, {2, 1746305385LL, 6}, + {1, 2987803337LL, 6}, {1, 2955676419LL, 6}, {1, 2924233053LL, 6}, + {2, 1491936009LL, 6}, {1, 2863311531LL, 6}, {2, 1372618415LL, 6}, + {1, 1402438301LL, 5}, {1, 2776544515LL, 6}, {1, 1374389535LL, 5}, + {2, 1148159575LL, 6}, {1, 2694881441LL, 6}, {2, 1042467791LL, 6}, + {1, 1321528399LL, 5}, {2, 940802361LL, 6}, {2, 891408307LL, 6}, + {2, 842937507LL, 6}, {2, 795364315LL, 6}, {2, 748664025LL, 6}, + {2, 702812831LL, 6}, {2, 657787785LL, 6}, {2, 613566757LL, 6}, + {2, 570128403LL, 6}, {2, 527452125LL, 6}, {2, 485518043LL, 6}, + {1, 2369637129LL, 6}, {2, 403800345LL, 6}, {1, 582368447LL, 4}, + {1, 1154949189LL, 5}, {1, 2290649225LL, 6}, {2, 248469183LL, 6}, + {1, 1126548799LL, 5}, {2, 174592167LL, 6}, {2, 138547333LL, 6}, + {1, 274877907LL, 3}, {2, 68174085LL, 6}, {2, 33818641LL, 6}, + {0, 0LL, 7}, {1, 266354561LL, 3}, {1, 4228890877LL, 7}, + {1, 4196609267LL, 7}, {1, 1041204193LL, 5}, {1, 4133502361LL, 7}, + {1, 128207979LL, 2}, {1, 4072265289LL, 7}, {1, 4042322161LL, 7}, + {1, 125400505LL, 2}, {1, 1991868891LL, 6}, {1, 1977538899LL, 6}, + {2, 3558687189LL, 7}, {1, 974744351LL, 5}, {1, 3871519817LL, 7}, + {1, 3844446251LL, 7}, {1, 954437177LL, 5}, {1, 3791419407LL, 7}, + {2, 3235934265LL, 7}, {1, 3739835469LL, 7}, {2, 3134165325LL, 7}, + {1, 3689636335LL, 7}, {1, 458129845LL, 4}, {1, 910191745LL, 5}, + {2, 2938661835LL, 7}, {1, 3593175255LL, 7}, {1, 892460737LL, 5}, + {1, 3546811703LL, 7}, {2, 2753184165LL, 7}, {1, 875407347LL, 5}, + {1, 3479467177LL, 7}, {2, 2620200175LL, 7}, {1, 3435973837LL, 7}, + {1, 3414632385LL, 7}, {1, 3393554407LL, 7}, {1, 3372735055LL, 7}, + {1, 3352169597LL, 7}, {1, 1665926709LL, 6}, {1, 827945503LL, 5}, + {1, 1645975491LL, 6}, {2, 2249744775LL, 7}, {1, 1626496491LL, 6}, + {1, 3233857729LL, 7}, {2, 2134925265LL, 7}, {1, 799063683LL, 5}, + {2, 2060591247LL, 7}, {1, 789879043LL, 5}, {1, 1570730897LL, 6}, + {1, 3123612579LL, 7}, {2, 1916962805LL, 7}, {1, 3088515809LL, 7}, + {2, 1847555765LL, 7}, {2, 1813430637LL, 7}, {1, 3037324939LL, 7}, + {2, 1746305385LL, 7}, {1, 3004130131LL, 7}, {1, 2987803337LL, 7}, + {2, 1648338801LL, 7}, {1, 2955676419LL, 7}, {1, 2939870663LL, 7}, + {1, 2924233053LL, 7}, {2, 1522554545LL, 7}, {2, 1491936009LL, 7}, + {1, 2878302691LL, 7}, {1, 2863311531LL, 7}, {1, 356059465LL, 4}, + {2, 1372618415LL, 7}, {2, 1343553873LL, 7}, {1, 1402438301LL, 6}, + {2, 1286310003LL, 7}, {1, 2776544515LL, 7}, {1, 1381296015LL, 6}, + {1, 1374389535LL, 6}, {1, 42735993LL, 1}, {2, 1148159575LL, 7}, + {1, 2708156719LL, 7}, {1, 2694881441LL, 7}, {1, 1340867839LL, 6}, + {2, 1042467791LL, 7}, {1, 663956297LL, 5}, {1, 1321528399LL, 6}, + {1, 2630410593LL, 7}, {2, 940802361LL, 7}, {1, 2605477791LL, 7}, + {2, 891408307LL, 7}, {1, 2581013211LL, 7}, {2, 842937507LL, 7}, + {1, 1278501893LL, 6}, {2, 
795364315LL, 7}, {2, 771906565LL, 7}, + {2, 748664025LL, 7}, {2, 725633745LL, 7}, {2, 702812831LL, 7}, + {2, 680198441LL, 7}, {2, 657787785LL, 7}, {2, 635578121LL, 7}, + {2, 613566757LL, 7}, {1, 2443359173LL, 7}, {2, 570128403LL, 7}, + {2, 548696263LL, 7}, {2, 527452125LL, 7}, {1, 1200340205LL, 6}, + {2, 485518043LL, 7}, {2, 464823301LL, 7}, {1, 2369637129LL, 7}, + {2, 423966729LL, 7}, {2, 403800345LL, 7}, {2, 383805589LL, 7}, + {1, 582368447LL, 5}, {2, 344322273LL, 7}, {1, 1154949189LL, 6}, + {1, 2300233531LL, 7}, {1, 2290649225LL, 7}, {1, 285143057LL, 4}, + {2, 248469183LL, 7}, {1, 2262369605LL, 7}, {1, 1126548799LL, 6}, + {2, 192835267LL, 7}, {2, 174592167LL, 7}, {2, 156496785LL, 7}, + {2, 138547333LL, 7}, {2, 120742053LL, 7}, {1, 274877907LL, 4}, + {1, 2190262207LL, 7}, {2, 68174085LL, 7}, {1, 2172947881LL, 7}, + {2, 33818641LL, 7}, {1, 2155905153LL, 7}, {0, 0LL, 8}, +}; +static const uniform int64 __idiv_table_s32[][3] = { + {0, 0LL, 1}, {1, 1431655766LL, 0}, {0, 0LL, 2}, + {1, 1717986919LL, 1}, {1, 715827883LL, 0}, {1, 2454267027LL, 2}, + {0, 0LL, 3}, {1, 954437177LL, 1}, {1, 1717986919LL, 2}, + {1, 780903145LL, 1}, {1, 715827883LL, 1}, {1, 1321528399LL, 2}, + {1, 2454267027LL, 3}, {1, 2290649225LL, 3}, {0, 0LL, 4}, + {1, 2021161081LL, 3}, {1, 954437177LL, 2}, {1, 1808407283LL, 3}, + {1, 1717986919LL, 3}, {1, 818089009LL, 2}, {1, 780903145LL, 2}, + {1, 2987803337LL, 4}, {1, 715827883LL, 2}, {1, 1374389535LL, 3}, + {1, 1321528399LL, 3}, {1, 1272582903LL, 3}, {1, 2454267027LL, 4}, + {1, 2369637129LL, 4}, {1, 2290649225LL, 4}, {1, 2216757315LL, 4}, + {0, 0LL, 5}, {1, 1041204193LL, 3}, {1, 2021161081LL, 4}, + {1, 3926827243LL, 5}, {1, 954437177LL, 3}, {1, 3714566311LL, 5}, + {1, 1808407283LL, 4}, {1, 3524075731LL, 5}, {1, 1717986919LL, 4}, + {1, 1676084799LL, 4}, {1, 818089009LL, 3}, {1, 799063683LL, 3}, + {1, 780903145LL, 3}, {1, 3054198967LL, 5}, {1, 2987803337LL, 5}, + {1, 2924233053LL, 5}, {1, 715827883LL, 3}, {1, 1402438301LL, 4}, + {1, 1374389535LL, 4}, {1, 2694881441LL, 5}, {1, 1321528399LL, 4}, + {1, 1296593901LL, 4}, {1, 1272582903LL, 4}, {1, 156180629LL, 1}, + {1, 2454267027LL, 5}, {1, 2411209711LL, 5}, {1, 2369637129LL, 5}, + {1, 582368447LL, 3}, {1, 2290649225LL, 5}, {1, 1126548799LL, 4}, + {1, 2216757315LL, 5}, {1, 2181570691LL, 5}, {0, 0LL, 6}, + {1, 2114445439LL, 5}, {1, 1041204193LL, 4}, {1, 128207979LL, 1}, + {1, 2021161081LL, 5}, {1, 1991868891LL, 5}, {1, 3926827243LL, 6}, + {1, 3871519817LL, 6}, {1, 954437177LL, 4}, {1, 3765450781LL, 6}, + {1, 3714566311LL, 6}, {1, 458129845LL, 3}, {1, 1808407283LL, 5}, + {1, 892460737LL, 4}, {1, 3524075731LL, 6}, {1, 1739733589LL, 5}, + {1, 1717986919LL, 5}, {1, 424194301LL, 3}, {1, 1676084799LL, 5}, + {1, 827945503LL, 4}, {1, 818089009LL, 4}, {1, 1616928865LL, 5}, + {1, 799063683LL, 4}, {1, 789879043LL, 4}, {1, 780903145LL, 4}, + {1, 3088515809LL, 6}, {1, 3054198967LL, 6}, {1, 3020636341LL, 6}, + {1, 2987803337LL, 6}, {1, 738919105LL, 4}, {1, 2924233053LL, 6}, + {1, 2893451653LL, 6}, {1, 715827883LL, 4}, {1, 354224107LL, 3}, + {1, 1402438301LL, 5}, {1, 2776544515LL, 6}, {1, 1374389535LL, 5}, + {1, 680390859LL, 4}, {1, 2694881441LL, 6}, {1, 333589693LL, 3}, + {1, 1321528399LL, 5}, {1, 2617884829LL, 6}, {1, 1296593901LL, 5}, + {1, 1284476201LL, 5}, {1, 1272582903LL, 5}, {1, 2521815661LL, 6}, + {1, 156180629LL, 2}, {1, 2476377541LL, 6}, {1, 2454267027LL, 6}, + {1, 1216273925LL, 5}, {1, 2411209711LL, 6}, {1, 1195121335LL, 5}, + {1, 2369637129LL, 6}, {1, 2349383821LL, 6}, {1, 582368447LL, 4}, + {1, 1154949189LL, 5}, {1, 2290649225LL, 6}, 
{1, 70991195LL, 1}, + {1, 1126548799LL, 5}, {1, 558694933LL, 4}, {1, 2216757315LL, 6}, + {1, 274877907LL, 3}, {1, 2181570691LL, 6}, {1, 2164392969LL, 6}, + {0, 0LL, 7}, {1, 266354561LL, 3}, {1, 2114445439LL, 6}, + {1, 1049152317LL, 5}, {1, 1041204193LL, 5}, {1, 4133502361LL, 7}, + {1, 128207979LL, 2}, {1, 4072265289LL, 7}, {1, 2021161081LL, 6}, + {1, 125400505LL, 2}, {1, 1991868891LL, 6}, {1, 1977538899LL, 6}, + {1, 3926827243LL, 7}, {1, 974744351LL, 5}, {1, 3871519817LL, 7}, + {1, 961111563LL, 5}, {1, 954437177LL, 5}, {1, 3791419407LL, 7}, + {1, 3765450781LL, 7}, {1, 1869917735LL, 6}, {1, 3714566311LL, 7}, + {1, 230602271LL, 3}, {1, 458129845LL, 4}, {1, 910191745LL, 5}, + {1, 1808407283LL, 6}, {1, 3593175255LL, 7}, {1, 892460737LL, 5}, + {1, 443351463LL, 4}, {1, 3524075731LL, 7}, {1, 875407347LL, 5}, + {1, 1739733589LL, 6}, {1, 432197967LL, 4}, {1, 1717986919LL, 6}, + {1, 3414632385LL, 7}, {1, 424194301LL, 4}, {1, 210795941LL, 3}, + {1, 1676084799LL, 6}, {1, 1665926709LL, 6}, {1, 827945503LL, 5}, + {1, 1645975491LL, 6}, {1, 818089009LL, 5}, {1, 1626496491LL, 6}, + {1, 1616928865LL, 6}, {1, 3214946281LL, 7}, {1, 799063683LL, 5}, + {1, 397222409LL, 4}, {1, 789879043LL, 5}, {1, 1570730897LL, 6}, + {1, 780903145LL, 5}, {1, 3105965051LL, 7}, {1, 3088515809LL, 7}, + {1, 3071261531LL, 7}, {1, 3054198967LL, 7}, {1, 759331235LL, 5}, + {1, 3020636341LL, 7}, {1, 3004130131LL, 7}, {1, 2987803337LL, 7}, + {1, 2971653049LL, 7}, {1, 738919105LL, 5}, {1, 2939870663LL, 7}, + {1, 2924233053LL, 7}, {1, 2908760921LL, 7}, {1, 2893451653LL, 7}, + {1, 2878302691LL, 7}, {1, 715827883LL, 5}, {1, 356059465LL, 4}, + {1, 354224107LL, 4}, {1, 2819260585LL, 7}, {1, 1402438301LL, 6}, + {1, 1395319325LL, 6}, {1, 2776544515LL, 7}, {1, 1381296015LL, 6}, + {1, 1374389535LL, 6}, {1, 42735993LL, 1}, {1, 680390859LL, 5}, + {1, 2708156719LL, 7}, {1, 2694881441LL, 7}, {1, 1340867839LL, 6}, + {1, 333589693LL, 4}, {1, 663956297LL, 5}, {1, 1321528399LL, 6}, + {1, 2630410593LL, 7}, {1, 2617884829LL, 7}, {1, 81421181LL, 2}, + {1, 1296593901LL, 6}, {1, 2581013211LL, 7}, {1, 1284476201LL, 6}, + {1, 1278501893LL, 6}, {1, 1272582903LL, 6}, {1, 2533436931LL, 7}, + {1, 2521815661LL, 7}, {1, 2510300521LL, 7}, {1, 156180629LL, 3}, + {1, 2487582869LL, 7}, {1, 2476377541LL, 7}, {1, 2465272709LL, 7}, + {1, 2454267027LL, 7}, {1, 2443359173LL, 7}, {1, 1216273925LL, 6}, + {1, 605457945LL, 5}, {1, 2411209711LL, 7}, {1, 1200340205LL, 6}, + {1, 1195121335LL, 6}, {1, 2379895299LL, 7}, {1, 2369637129LL, 7}, + {1, 2359467013LL, 7}, {1, 2349383821LL, 7}, {1, 2339386443LL, 7}, + {1, 582368447LL, 5}, {1, 2319644785LL, 7}, {1, 1154949189LL, 6}, + {1, 2300233531LL, 7}, {1, 2290649225LL, 7}, {1, 285143057LL, 4}, + {1, 70991195LL, 2}, {1, 2262369605LL, 7}, {1, 1126548799LL, 6}, + {1, 1121950641LL, 6}, {1, 558694933LL, 5}, {1, 2225732041LL, 7}, + {1, 2216757315LL, 7}, {1, 2207854675LL, 7}, {1, 274877907LL, 4}, + {1, 2190262207LL, 7}, {1, 2181570691LL, 7}, {1, 2172947881LL, 7}, + {1, 2164392969LL, 7}, {1, 2155905153LL, 7}, {1, 2147483649LL, 7}, +}; + +__declspec(safe) +static unmasked inline unsigned int8 +__fast_idiv(unsigned int8 numerator, uniform unsigned int8 divisor) { + uniform int64 method = __idiv_table_u8[divisor-2][0]; + uniform int64 multiplier = __idiv_table_u8[divisor-2][1]; + uniform int64 shift = __idiv_table_u8[divisor-2][2]; + + unsigned int16 mult = multiplier; + unsigned int16 val = numerator; + if (method == 0) + return numerator >> shift; + else if (method == 1) + return (val * mult) >> (8 + shift); + else { + val *= mult; + val >>= 8; + 
val += (numerator-val)>>1; + return (val >> shift); + } +} + +__declspec(safe) +static unmasked inline int8 __fast_idiv(int8 numerator, uniform int8 divisor) { + uniform int8 method = __idiv_table_s8[divisor-2][0]; + uniform int16 multiplier = __idiv_table_s8[divisor-2][1]; + uniform int8 shift = __idiv_table_s8[divisor-2][2]; + + if (method == 0) + return numerator >> shift; + else { + unsigned int8 sign = numerator >> 7; + numerator ^= sign; + int16 mul = (int16)numerator * (int16)multiplier; + mul >>= 8 + shift; + return (int8)mul ^ sign; + } +} + +__declspec(safe) +static unmasked inline unsigned int16 __fast_idiv(unsigned int16 numerator, + uniform unsigned int16 divisor) { + uniform int64 method = __idiv_table_u16[divisor-2][0]; + uniform int64 multiplier = __idiv_table_u16[divisor-2][1]; + uniform int64 shift = __idiv_table_u16[divisor-2][2]; + + unsigned int32 mult = multiplier; + unsigned int32 val = numerator; + if (method == 0) + return numerator >> shift; + else if (method == 1) + return (val * mult) >> (16 + shift); + else { + val *= mult; + val >>= 16; + val += (numerator-val)>>1; + return val >> shift; + } +} + +__declspec(safe) +static unmasked inline int16 __fast_idiv(int16 numerator, uniform int16 divisor) { + uniform int64 method = __idiv_table_s16[divisor-2][0]; + uniform int64 multiplier = __idiv_table_s16[divisor-2][1]; + uniform int64 shift = __idiv_table_s16[divisor-2][2]; + + if (method == 0) + return numerator >> shift; + else { + unsigned int16 sign = numerator >> 15; + numerator ^= sign; + int32 mul = (int32)numerator * (int32)multiplier; + mul >>= 16 + shift; + int16 result = mul; + return result ^ sign; + } +} + +__declspec(safe) +static unmasked inline unsigned int32 __fast_idiv(unsigned int32 numerator, + uniform unsigned int32 divisor) { + uniform int64 method = __idiv_table_u32[divisor-2][0]; + uniform int64 multiplier = __idiv_table_u32[divisor-2][1]; + uniform int64 shift = __idiv_table_u32[divisor-2][2]; + + unsigned int64 mult = multiplier; + unsigned int64 val = numerator; + if (method == 0) + return numerator >> shift; + else if (method == 1) + return (val * mult) >> (32 + shift); + else { + val *= mult; + val >>= 32; + val += (numerator-val)>>1; + return val >> shift; + } +} + +__declspec(safe) +static unmasked inline int32 __fast_idiv(int32 numerator, uniform int32 divisor) { + uniform int64 method = __idiv_table_s32[divisor-2][0]; + uniform int64 multiplier = __idiv_table_s32[divisor-2][1]; + uniform int64 shift = __idiv_table_s32[divisor-2][2]; + + if (method == 0) + return numerator >> shift; + else { + unsigned int32 sign = numerator >> 31; + numerator ^= sign; + int64 mul = (int64)numerator * (int64)multiplier; + mul >>= 32 + shift; + int32 result = mul; + return result ^ sign; + } +} + +/////////////////////////////////////////////////////////////////////////// +// Saturating int8/int16 ops + +__declspec(safe) +static unmasked inline unsigned int8 avg_up(unsigned int8 a, unsigned int8 b) { + return __avg_up_uint8(a, b); +} + +__declspec(safe) +static unmasked inline int8 avg_up(int8 a, int8 b) { + return __avg_up_int8(a, b); +} + +__declspec(safe) +static unmasked inline unsigned int16 avg_up(unsigned int16 a, unsigned int16 b) { + return __avg_up_uint16(a, b); +} + +__declspec(safe) +static unmasked inline int16 avg_up(int16 a, int16 b) { + return __avg_up_int16(a, b); +} + +__declspec(safe) +static unmasked inline unsigned int8 avg_down(unsigned int8 a, unsigned int8 b) { + return __avg_down_uint8(a, b); +} + +__declspec(safe)
+static unmasked inline int8 avg_down(int8 a, int8 b) { + return __avg_down_int8(a, b); +} + +__declspec(safe) +static unmasked inline unsigned int16 avg_down(unsigned int16 a, unsigned int16 b) { + return __avg_down_uint16(a, b); +} + +__declspec(safe) +static unmasked inline int16 avg_down(int16 a, int16 b) { + return __avg_down_int16(a, b); +} diff --git a/sym.cpp b/sym.cpp index f16f5e11..05f9996a 100644 --- a/sym.cpp +++ b/sym.cpp @@ -214,6 +214,17 @@ SymbolTable::LookupType(const char *name) const { return NULL; } +bool +SymbolTable::ContainsType(const Type *type) const { + TypeMapType::const_iterator iter = types.begin(); + while (iter != types.end()) { + if (iter->second == type) { + return true; + } + iter++; + } + return false; +} std::vector SymbolTable::ClosestVariableOrFunctionMatch(const char *str) const { diff --git a/sym.h b/sym.h index efb532a3..761c3612 100644 --- a/sym.h +++ b/sym.h @@ -219,6 +219,12 @@ public: @return Pointer to the Type, if found; otherwise NULL is returned. */ const Type *LookupType(const char *name) const; + + /** Look for a type given a pointer. + + @return True if found, False otherwise. + */ + bool ContainsType(const Type * type) const; /** This method returns zero or more strings with the names of symbols in the symbol table that nearly (but not exactly) match the given diff --git a/tests/aossoa-1.ispc b/tests/aossoa-1.ispc index 59964d6d..32d3bcba 100644 --- a/tests/aossoa-1.ispc +++ b/tests/aossoa-1.ispc @@ -2,8 +2,8 @@ export uniform int width() { return programCount; } export void f_v(uniform float RET[]) { -#define width 3 -#define maxProgramCount 64 +#define width 3ul +#define maxProgramCount 64ul assert(programCount <= maxProgramCount); //CO const uniform int width = 3; diff --git a/tests/aossoa-2.ispc b/tests/aossoa-2.ispc index 9ff82226..df8eae5c 100644 --- a/tests/aossoa-2.ispc +++ b/tests/aossoa-2.ispc @@ -2,8 +2,8 @@ export uniform int width() { return programCount; } export void f_v(uniform float RET[]) { -#define width 4 -#define maxProgramCount 64 +#define width 4ul +#define maxProgramCount 64ul assert(programCount <= maxProgramCount); //CO const uniform int width = 4; diff --git a/tests/aossoa-5.ispc b/tests/aossoa-5.ispc index eb4fed3a..d6346455 100644 --- a/tests/aossoa-5.ispc +++ b/tests/aossoa-5.ispc @@ -2,8 +2,8 @@ export uniform int width() { return programCount; } export void f_v(uniform float RET[]) { -#define width 3 -#define maxProgramCount 64 +#define width 3ul +#define maxProgramCount 64ul assert(programCount <= maxProgramCount); //CO const uniform int width = 3; diff --git a/tests/aossoa-6.ispc b/tests/aossoa-6.ispc index b64cd10b..7c177fde 100644 --- a/tests/aossoa-6.ispc +++ b/tests/aossoa-6.ispc @@ -2,8 +2,8 @@ export uniform int width() { return programCount; } export void f_v(uniform float RET[]) { -#define width 4 -#define maxProgramCount 64 +#define width 4ul +#define maxProgramCount 64ul assert(programCount <= maxProgramCount); //CO const uniform int width = 4; diff --git a/tests/atomics-12.ispc b/tests/atomics-12.ispc index c27ad99c..d6359555 100644 --- a/tests/atomics-12.ispc +++ b/tests/atomics-12.ispc @@ -7,7 +7,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) { float a = aFOO[programIndex]; float b = 0; if (programIndex < 30 && programIndex & 1) - b = atomic_or_global(&s, (1 << programIndex)); + b = atomic_or_global(&s, (1ul << programIndex)); RET[programIndex] = s; } @@ -15,6 +15,6 @@ export void result(uniform float RET[]) { uniform int sum = 0; for (uniform int i = 0; i < min(30, 
programCount); ++i) if (i & 1) - sum += (1 << i); + sum += (1ul << i); RET[programIndex] = sum; } diff --git a/tests/atomics-13.ispc b/tests/atomics-13.ispc index 86faaddb..dea3bfc3 100644 --- a/tests/atomics-13.ispc +++ b/tests/atomics-13.ispc @@ -7,7 +7,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) { float a = aFOO[programIndex]; int32 b = 0; if (programIndex < 32 && programIndex & 1) - b = atomic_or_global(&s, (1 << programIndex)); + b = atomic_or_global(&s, (1ul << programIndex)); RET[programIndex] = popcnt(reduce_max((int32)b)); } diff --git a/tests/atomics-4.ispc b/tests/atomics-4.ispc index 30b343d1..ac746ad2 100644 --- a/tests/atomics-4.ispc +++ b/tests/atomics-4.ispc @@ -5,10 +5,10 @@ uniform int32 s = 0; export void f_f(uniform float RET[], uniform float aFOO[]) { float a = aFOO[programIndex]; - float b = atomic_or_global(&s, (1< 0 ? 1 : 0; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1; +} diff --git a/tests/coalesce-1.ispc b/tests/coalesce-1.ispc index acfe8cdf..39a79a91 100644 --- a/tests/coalesce-1.ispc +++ b/tests/coalesce-1.ispc @@ -2,8 +2,8 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - uniform float * uniform buf = uniform new uniform float[32*32]; - for (uniform int i = 0; i < 32*32; ++i) + uniform float * uniform buf = uniform new uniform float[32l*32l]; + for (uniform int i = 0; i < 32l*32l; ++i) buf[i] = i; assert(programIndex <= 64); diff --git a/tests/coalesce-2.ispc b/tests/coalesce-2.ispc index 88b952a4..a047e456 100644 --- a/tests/coalesce-2.ispc +++ b/tests/coalesce-2.ispc @@ -2,8 +2,8 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - uniform float * uniform buf = uniform new uniform float[32*32]; - for (uniform int i = 0; i < 32*32; ++i) + uniform float * uniform buf = uniform new uniform float[32l*32l]; + for (uniform int i = 0; i < 32l*32l; ++i) buf[i] = i; RET[programIndex] = buf[programIndex & 1]; diff --git a/tests/coalesce-3.ispc b/tests/coalesce-3.ispc index 7a05963f..c1718b4f 100644 --- a/tests/coalesce-3.ispc +++ b/tests/coalesce-3.ispc @@ -2,8 +2,8 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - uniform float * uniform buf = uniform new uniform float[32*32]; - for (uniform int i = 0; i < 32*32; ++i) + uniform float * uniform buf = uniform new uniform float[32l*32l]; + for (uniform int i = 0; i < 32l*32l; ++i) buf[i] = i; RET[programIndex] = buf[(programIndex >> 2) * 16 + (programIndex & 3)]; diff --git a/tests/coalesce-4.ispc b/tests/coalesce-4.ispc index 1ddd4b89..182a4d4f 100644 --- a/tests/coalesce-4.ispc +++ b/tests/coalesce-4.ispc @@ -2,8 +2,8 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - uniform float * uniform buf = uniform new uniform float[32*32]; - for (uniform int i = 0; i < 32*32; ++i) + uniform float * uniform buf = uniform new uniform float[32l*32l]; + for (uniform int i = 0; i < 32l*32l; ++i) buf[i] = i; float a = buf[2*programIndex]; diff --git a/tests/coalesce-5.ispc b/tests/coalesce-5.ispc index 2dd8d44e..385e8526 100644 --- a/tests/coalesce-5.ispc +++ b/tests/coalesce-5.ispc @@ -2,8 +2,8 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - uniform float * uniform buf = uniform new uniform float[32*32]; - for (uniform int i = 0; i < 32*32; ++i) + 
uniform float * uniform buf = uniform new uniform float[32l*32l]; + for (uniform int i = 0; i < 32l*32l; ++i) buf[i] = i; float a = buf[4*programIndex]; diff --git a/tests/coalesce-6.ispc b/tests/coalesce-6.ispc index 2a54a2db..8c630a45 100644 --- a/tests/coalesce-6.ispc +++ b/tests/coalesce-6.ispc @@ -2,8 +2,8 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - uniform float * uniform buf = uniform new uniform float[32*32]; - for (uniform int i = 0; i < 32*32; ++i) + uniform float * uniform buf = uniform new uniform float[32l*32l]; + for (uniform int i = 0; i < 32l*32l; ++i) buf[i] = i; float a = buf[4*programIndex]; diff --git a/tests/coalesce-7.ispc b/tests/coalesce-7.ispc index 8ed628bd..29b56b8d 100644 --- a/tests/coalesce-7.ispc +++ b/tests/coalesce-7.ispc @@ -2,8 +2,8 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - uniform float * uniform buf = uniform new uniform float[32*32]; - for (uniform int i = 0; i < 32*32; ++i) + uniform float * uniform buf = uniform new uniform float[32l*32l]; + for (uniform int i = 0; i < 32l*32l; ++i) buf[i] = i; float a = buf[4*programIndex]; diff --git a/tests/coalesce-8.ispc b/tests/coalesce-8.ispc index dfefaa19..f01ca9c3 100644 --- a/tests/coalesce-8.ispc +++ b/tests/coalesce-8.ispc @@ -2,8 +2,8 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - uniform float * uniform buf = uniform new uniform float[32*32]; - for (uniform int i = 0; i < 32*32; ++i) + uniform float * uniform buf = uniform new uniform float[32l*32l]; + for (uniform int i = 0; i < 32l*32l; ++i) buf[i] = i; int index = (programIndex < 4) ? (programIndex & 1) : diff --git a/tests/count-leading-trailing-zeros-1.ispc b/tests/count-leading-trailing-zeros-1.ispc index 221d066d..3f12c07d 100644 --- a/tests/count-leading-trailing-zeros-1.ispc +++ b/tests/count-leading-trailing-zeros-1.ispc @@ -3,7 +3,7 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - RET[programIndex] = count_trailing_zeros(0xf0); + RET[programIndex] = count_trailing_zeros(0xf0ul); } export void result(uniform float RET[]) { diff --git a/tests/count-leading-trailing-zeros-4.ispc b/tests/count-leading-trailing-zeros-4.ispc index 475c18ca..4b849018 100644 --- a/tests/count-leading-trailing-zeros-4.ispc +++ b/tests/count-leading-trailing-zeros-4.ispc @@ -3,7 +3,7 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - int32 i = (1 << (programIndex % 28)); + int32 i = (1ul << (programIndex % 28)); RET[programIndex] = count_leading_zeros(i); } diff --git a/tests/double-consts.ispc b/tests/double-consts.ispc new file mode 100644 index 00000000..5f9a66d5 --- /dev/null +++ b/tests/double-consts.ispc @@ -0,0 +1,24 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { + double a = aFOO[programIndex]; + // Test parsing of double constants. + double d1 = 1.0d40; + double d2 = 1.d40; + double d3 = 1d40; + double d4 = .1d41; + double d5 = 10000000000000000000000000000000000000000.d; + double d6 = 10000000000000000000000000000000000000000.0d; + + // All the constants should be equal and if it's evaluated as "float", + // then sqrt will evaluate to +inf. 
+ if (d1 == d2 && d1 == d3 && d1 == d4 && d1 == d5 && d1 == d6 && + ((float)sqrt(d1)) < 2e20) { + RET[programIndex] = a; + } +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1 + programIndex; +} diff --git a/tests/exclusive-scan-and-2.ispc b/tests/exclusive-scan-and-2.ispc index 5d2bcd1f..b742a91e 100644 --- a/tests/exclusive-scan-and-2.ispc +++ b/tests/exclusive-scan-and-2.ispc @@ -3,7 +3,7 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { RET[programIndex] = -1; - int32 a = ~(1 << programIndex); + int32 a = ~(1ul << programIndex); if ((programIndex < 32) && (programIndex & 1) == 0) { RET[programIndex] = exclusive_scan_and(a); } @@ -15,7 +15,7 @@ export void result(uniform float RET[]) { if ((programIndex & 1) == 0 && programIndex > 0 && programIndex < 32) { int val = 0xffffffff; for (int i = 0; i < programIndex-1; i += 2) - val &= ~(1< 32) break; + } + } + } + + for (int8 num = 0; num < 127; ++num) { + for (uniform int8 div = 2; div < 127; ++div) { + if (__fast_idiv(num, div) != num/div) { + ++errorCount; + print("error %/% = %, got %\n", num, div, num/div, __fast_idiv(num,div)); + if (errorCount > 64) break; + } + } + } + + for (int16 num = 0; num < 32767; ++num) { + for (uniform int16 div = 2; div < 256; ++div) { + if (__fast_idiv(num, div) != num/div) { + ++errorCount; + print("error %/% = %, got %\n", num, div, num/div, __fast_idiv(num,div)); + if (errorCount > 96) break; + } + } + } + + for (unsigned int16 num = 0; num < 0xffff; ++num) { + for (uniform unsigned int16 div = 2; div < 256; ++div) { + if (__fast_idiv(num, div) != num/div) { + ++errorCount; + print("error %/% = %, got %\n", num, div, num/div, __fast_idiv(num,div)); + if (errorCount > 128) break; + } + } + } + + // randomly sample int32s... 
+ uniform RNGState state; + seed_rng(&state, 1234); + for (uniform int i = 0; i < 64k; ++i) { + unsigned int32 num = random(&state); + for (uniform unsigned int32 div = 2; div < 256; ++div) { + if (__fast_idiv(num, div) != num/div) { + ++errorCount; + print("ui32 error %/% = %, got %\n", num, div, num/div, __fast_idiv(num,div)); + if (errorCount > 160) break; + } + } + } + + for (uniform int64 i = 0; i < 64k; ++i) { + int32 num = random(&state); + if (num < 0) + continue; + for (uniform int32 div = 2; div < 256; ++div) { + if (__fast_idiv(num, div) != num/div) { + ++errorCount; + print("si32 error %/% = %, got %\n", num, div, num/div, __fast_idiv(num,div)); + if (errorCount > 192) break; + } + } + } + + RET[programIndex] = errorCount; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; +} + diff --git a/tests/kilo-mega-giga-2.ispc b/tests/kilo-mega-giga-2.ispc index 77e201ef..42545b8d 100644 --- a/tests/kilo-mega-giga-2.ispc +++ b/tests/kilo-mega-giga-2.ispc @@ -8,5 +8,5 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { } export void result(uniform float RET[]) { - RET[programIndex] = 2*1024*1024 + 5; + RET[programIndex] = 2ul*1024ul*1024ul + 5; } diff --git a/tests/ldexp-double.ispc b/tests/ldexp-double.ispc index 6b3ed734..e1b7a59f 100644 --- a/tests/ldexp-double.ispc +++ b/tests/ldexp-double.ispc @@ -3,7 +3,7 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - double a = 1 << (programIndex % 28); + double a = 1ul << (programIndex % 28); if (programIndex & 1) a = -a; RET[programIndex] = ldexp(a, 2); @@ -11,7 +11,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) { export void result(uniform float RET[]) { int pi = programIndex % 28; - RET[programIndex] = (1 << (pi + 2)); + RET[programIndex] = (1ul << (pi + 2)); if (programIndex & 1) RET[programIndex] = -RET[programIndex]; } diff --git a/tests/ldexp-float.ispc b/tests/ldexp-float.ispc index a2ec9a27..305ae106 100644 --- a/tests/ldexp-float.ispc +++ b/tests/ldexp-float.ispc @@ -3,7 +3,7 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - float a = 1 << (programIndex % 28); + float a = 1ul << (programIndex % 28); if (programIndex & 1) a = -a; RET[programIndex] = ldexp(a, 2); @@ -11,7 +11,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) { export void result(uniform float RET[]) { int pi = programIndex % 28; - RET[programIndex] = (1 << (pi + 2)); + RET[programIndex] = (1ul << (pi + 2)); if (programIndex & 1) RET[programIndex] = -RET[programIndex]; } diff --git a/tests/local-atomics-12.ispc b/tests/local-atomics-12.ispc index 23a30af5..358ffd34 100644 --- a/tests/local-atomics-12.ispc +++ b/tests/local-atomics-12.ispc @@ -7,7 +7,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) { float a = aFOO[programIndex]; float b = 0; if (programIndex < 29 && (programIndex & 1)) - b = atomic_or_local(&s, (1 << programIndex)); + b = atomic_or_local(&s, (1ul << programIndex)); RET[programIndex] = s; } @@ -15,6 +15,6 @@ export void result(uniform float RET[]) { uniform int sum = 0; for (uniform int i = 0; i < min(programCount, 29); ++i) if (i & 1) - sum += (1 << i); + sum += (1ul << i); RET[programIndex] = sum; } diff --git a/tests/local-atomics-13.ispc b/tests/local-atomics-13.ispc index 36fd1f1c..b9d35d09 100644 --- a/tests/local-atomics-13.ispc +++ b/tests/local-atomics-13.ispc @@ -7,7 +7,7 @@ export void f_f(uniform float RET[], 
uniform float aFOO[]) { float a = aFOO[programIndex]; int32 b = 0; if (programIndex < 28 && (programIndex & 1)) - b = atomic_or_local(&s, (1 << programIndex)); + b = atomic_or_local(&s, (1ul << programIndex)); RET[programIndex] = popcnt(reduce_max(b)); } diff --git a/tests/local-atomics-14.ispc b/tests/local-atomics-14.ispc index 4cf81809..25c52e60 100644 --- a/tests/local-atomics-14.ispc +++ b/tests/local-atomics-14.ispc @@ -7,7 +7,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) { float a = aFOO[programIndex]; float b = 0; if (programIndex < 32 && (programIndex & 1)) - b = atomic_or_local(&s, (1 << programIndex)); + b = atomic_or_local(&s, (1ul << programIndex)); RET[programIndex] = (s>>20); } @@ -15,6 +15,6 @@ export void result(uniform float RET[]) { uniform int sum = 0; for (uniform int i = 0; i < min(32, programCount); ++i) if (i & 1) - sum += (1 << i); + sum += (1ul << i); RET[programIndex] = ((unsigned int64)(0xffffffffff000000 | sum)) >> 20; } diff --git a/tests/local-atomics-4.ispc b/tests/local-atomics-4.ispc index f7f6a04a..b3648ab5 100644 --- a/tests/local-atomics-4.ispc +++ b/tests/local-atomics-4.ispc @@ -7,10 +7,10 @@ export void f_f(uniform float RET[], uniform float aFOO[]) { float a = aFOO[programIndex]; float b = 0; if (programIndex < 29) - atomic_or_local(&s, (1< struct Foo" for assignment operator is not possible +// Type conversion from "const uniform int[0-9]*" to "soa<4> struct Foo" for assignment operator is not possible struct Pt { float x, y, z; }; diff --git a/tests_errors/soa-12.ispc b/tests_errors/soa-12.ispc index e2cd3242..c0420614 100644 --- a/tests_errors/soa-12.ispc +++ b/tests_errors/soa-12.ispc @@ -1,4 +1,4 @@ -// Can't convert between types "const uniform int32" and "soa<4> float" with different SOA widths +// Can't convert between types "const uniform int[0-9]*" and "soa<4> float" with different SOA widths struct Pt { float x, y, z; }; diff --git a/tests_errors/soa-3.ispc b/tests_errors/soa-3.ispc index b2be1b59..04dc84bc 100644 --- a/tests_errors/soa-3.ispc +++ b/tests_errors/soa-3.ispc @@ -1,4 +1,4 @@ -// syntax error, unexpected '-', expecting int32 constant +// syntax error, unexpected '-', expecting int struct F { float a, b, c; }; diff --git a/tests_errors/soa-4.ispc b/tests_errors/soa-4.ispc index b2be1b59..04dc84bc 100644 --- a/tests_errors/soa-4.ispc +++ b/tests_errors/soa-4.ispc @@ -1,4 +1,4 @@ -// syntax error, unexpected '-', expecting int32 constant +// syntax error, unexpected '-', expecting int struct F { float a, b, c; }; diff --git a/tests_errors/soa-9.ispc b/tests_errors/soa-9.ispc index 7c6a1df9..e9e7509a 100644 --- a/tests_errors/soa-9.ispc +++ b/tests_errors/soa-9.ispc @@ -1,4 +1,4 @@ -// Can't convert from pointer to SOA type "soa<8> struct A * uniform" to pointer to non-SOA type "void * varying" +// Can't convert from pointer to SOA type "soa<8> struct A \* uniform" to pointer to non-SOA type "void \* varying" struct A { float a, b; }; diff --git a/tests_errors/struct_arith.ispc b/tests_errors/struct_arith.ispc index 9d942880..df729d02 100644 --- a/tests_errors/struct_arith.ispc +++ b/tests_errors/struct_arith.ispc @@ -1,4 +1,4 @@ -// Assignment operator "+=" is illegal with struct type +// Assignment operator "\+=" is illegal with struct type struct Point { float x, y, z; }; diff --git a/tests_errors/vec-size-compile-constant.ispc b/tests_errors/vec-size-compile-constant.ispc index b9e61721..0eb6f90e 100644 --- a/tests_errors/vec-size-compile-constant.ispc +++ b/tests_errors/vec-size-compile-constant.ispc @@ 
-1,4 +1,4 @@ -// syntax error, unexpected identifier, expecting int32 constant +// syntax error, unexpected identifier, expecting int void foo(uniform int i) { float a; diff --git a/util.cpp b/util.cpp index dbea9517..6b121988 100644 --- a/util.cpp +++ b/util.cpp @@ -79,8 +79,8 @@ compiler under a debuffer; in this case, just return a reasonable default. */ -static int -lTerminalWidth() { +int +TerminalWidth() { if (g->disableLineWrap) return 1<<30; @@ -228,8 +228,8 @@ lFindIndent(int numColons, const char *buf) { /** Print the given string to the given FILE, assuming the given output column width. Break words as needed to avoid words spilling past the last column. */ -static void -lPrintWithWordBreaks(const char *buf, int indent, int columnWidth, FILE *out) { +void +PrintWithWordBreaks(const char *buf, int indent, int columnWidth, FILE *out) { #ifdef ISPC_IS_WINDOWS fputs(buf, out); fputs("\n", out); @@ -375,7 +375,7 @@ lPrint(const char *type, bool isError, SourcePos p, const char *fmt, return; printed.insert(formattedBuf); - lPrintWithWordBreaks(formattedBuf, indent, lTerminalWidth(), stderr); + PrintWithWordBreaks(formattedBuf, indent, TerminalWidth(), stderr); lPrintFileLineContext(p); free(errorBuf); diff --git a/util.h b/util.h index b247b8bd..7edf71f7 100644 --- a/util.h +++ b/util.h @@ -156,4 +156,18 @@ void GetDirectoryAndFileName(const std::string &currentDir, bool VerifyDataLayoutCompatibility(const std::string &module_dl, const std::string &lib_dl); +/** Print the given string to the given FILE, assuming the given output + column width. Break words as needed to avoid words spilling past the + last column. */ +void PrintWithWordBreaks(const char *buf, int indent, int columnWidth, + FILE *out); + +/** Returns the width of the terminal where the compiler is running. + Finding this out may fail in a variety of reasonable situations (piping + compiler output to 'less', redirecting output to a file, running the + compiler under a debugger); in this case, just return a reasonable + default. + */ +int TerminalWidth(); + #endif // ISPC_UTIL_H
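
Editor's note on the fast-division addition: the __fast_idiv() overloads added to stdlib.ispc above all reduce to the same Granlund-Montgomery pattern, looking up a (method, multiplier, shift) triple for the divisor and replacing the divide with a widening multiply and a shift. Method 0 covers power-of-two divisors, method 1 a multiplier that fits the word size, and method 2 the add-and-halve fixup for oversized multipliers. A minimal standalone C++ sketch of the unsigned 32-bit case follows; the fast_udiv32 helper, the hardcoded rows (copied from __idiv_table_u32 above), and the brute-force check are illustrative assumptions for this note, not part of the patch.

// Editorial sketch: models the unsigned 32-bit __fast_idiv() scheme above.
// fast_udiv32 and the sampled table rows are illustrative, not from the patch.
#include <cassert>
#include <cstdint>
#include <cstdio>

// Divide n by an invariant divisor described by (method, multiplier, shift),
// mirroring the three cases in __fast_idiv(unsigned int32, ...).
static uint32_t fast_udiv32(uint32_t n, int method, uint64_t multiplier, int shift) {
    if (method == 0)        // divisor is a power of two
        return n >> shift;
    if (method == 1)        // multiplier fits in 32 bits
        return (uint32_t)(((uint64_t)n * multiplier) >> (32 + shift));
    // method 2: multiplier needs an extra bit; recover it with the add-and-halve step
    uint32_t t = (uint32_t)(((uint64_t)n * multiplier) >> 32);
    return (t + ((n - t) >> 1)) >> shift;
}

int main() {
    // Rows copied from __idiv_table_u32: divisor 3 -> {1, 2863311531, 1},
    // divisor 7 -> {2, 613566757, 2}, divisor 8 -> {0, 0, 3}.
    struct Row { uint32_t d; int method; uint64_t mult; int shift; };
    const Row rows[] = {
        { 3, 1, 2863311531ULL, 1 },
        { 7, 2, 613566757ULL,  2 },
        { 8, 0, 0ULL,          3 },
    };
    for (const Row &r : rows)
        for (uint32_t n = 0; n < 1000000u; ++n)
            assert(fast_udiv32(n, r.method, r.mult, r.shift) == n / r.d);
    printf("multiply-and-shift results match n/d for the sampled divisors\n");
    return 0;
}

The signed overloads above reuse the same multiply: they fold the sign out first (numerator ^= sign, where sign is the arithmetic-shifted sign mask), divide in the non-negative domain, and xor the sign back in, which is why one table row per divisor serves both positive and negative numerators.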