diff --git a/.gitignore b/.gitignore index 0469cf7d..7cdc4a4e 100644 --- a/.gitignore +++ b/.gitignore @@ -3,13 +3,21 @@ depend ispc ispc_test +ispc_ref objs docs/doxygen docs/*.html tests*/*cpp tests*/*run +logs/ +notify_log.log +alloy_results_* examples/*/*.png examples/*/*.ppm examples/*/objs/* +examples/*/ref +examples/*/test +*.swp +check_isa.exe diff --git a/Makefile b/Makefile index 09ec302d..f6d7af38 100644 --- a/Makefile +++ b/Makefile @@ -83,6 +83,10 @@ ifeq ($(LLVM_VERSION),LLVM_3_4) ISPC_LIBS += -lcurses endif +ifeq ($(LLVM_VERSION),LLVM_3_5) + ISPC_LIBS += -lcurses +endif + ifeq ($(ARCH_OS),Linux) ISPC_LIBS += -ldl endif @@ -109,14 +113,13 @@ else BUILD_VERSION:=$(GIT_REVISION) endif -CXX=g++ -CPP=cpp +CXX=clang++ OPT=-O2 CXXFLAGS=$(OPT) $(LLVM_CXXFLAGS) -I. -Iobjs/ -I$(CLANG_INCLUDE) \ $(LLVM_VERSION_DEF) \ -Wall \ -DBUILD_DATE="\"$(BUILD_DATE)\"" -DBUILD_VERSION="\"$(BUILD_VERSION)\"" \ - -Wno-sign-compare + -Wno-sign-compare -Wno-unused-function ifneq ($(LLVM_VERSION),LLVM_3_1) CXXFLAGS+=-Werror endif @@ -141,7 +144,7 @@ CXX_SRC=ast.cpp builtins.cpp cbackend.cpp ctx.cpp decl.cpp expr.cpp func.cpp \ type.cpp util.cpp HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \ opt.h stmt.h sym.h type.h util.h -TARGETS=avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 \ +TARGETS=avx2-i64x4 avx11-i64x4 avx1-i64x4 avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 \ sse2 sse2-x2 sse4-8 sse4-16 sse4 sse4-x2 \ generic-4 generic-8 generic-16 generic-32 generic-64 generic-1 ifneq ($(ARM_ENABLED), 0) @@ -160,7 +163,7 @@ BISON_SRC=parse.yy FLEX_SRC=lex.ll OBJS=$(addprefix objs/, $(CXX_SRC:.cpp=.o) $(BUILTINS_OBJS) \ - stdlib_mask1_ispc.o stdlib_mask8_ispc.o stdlib_mask16_ispc.o stdlib_mask32_ispc.o \ + stdlib_mask1_ispc.o stdlib_mask8_ispc.o stdlib_mask16_ispc.o stdlib_mask32_ispc.o stdlib_mask64_ispc.o \ $(BISON_SRC:.yy=.o) $(FLEX_SRC:.ll=.o)) default: ispc @@ -202,9 +205,14 @@ ispc: print_llvm_src dirs $(OBJS) @$(CXX) $(OPT) $(LDFLAGS) -o $@ $(OBJS) $(ISPC_LIBS) # Use clang as a default compiler, instead of gcc +# This is default now. 
clang: ispc clang: CXX=clang++ +# Use gcc as a default compiler, instead of clang +gcc: ispc +gcc: CXX=g++ + # Build ispc with address sanitizer instrumentation using clang compiler # Note that this is not portable build asan: clang @@ -246,15 +254,15 @@ objs/lex.o: objs/lex.cpp $(HEADERS) objs/parse.cc @echo Compiling $< @$(CXX) $(CXXFLAGS) -o $@ -c $< -objs/builtins-dispatch.cpp: builtins/dispatch.ll builtins/util.m4 $(wildcard builtins/*common.ll) +objs/builtins-dispatch.cpp: builtins/dispatch.ll builtins/util.m4 builtins/svml.m4 $(wildcard builtins/*common.ll) @echo Creating C++ source from builtins definition file $< @m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) -DBUILD_OS=UNIX $< | python bitcode2cpp.py $< > $@ -objs/builtins-%-32bit.cpp: builtins/%.ll builtins/util.m4 $(wildcard builtins/*common.ll) +objs/builtins-%-32bit.cpp: builtins/%.ll builtins/util.m4 builtins/svml.m4 $(wildcard builtins/*common.ll) @echo Creating C++ source from builtins definition file $< \(32 bit version\) @m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) -DBUILD_OS=UNIX -DRUNTIME=32 $< | python bitcode2cpp.py $< 32bit > $@ -objs/builtins-%-64bit.cpp: builtins/%.ll builtins/util.m4 $(wildcard builtins/*common.ll) +objs/builtins-%-64bit.cpp: builtins/%.ll builtins/util.m4 builtins/svml.m4 $(wildcard builtins/*common.ll) @echo Creating C++ source from builtins definition file $< \(64 bit version\) @m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) -DBUILD_OS=UNIX -DRUNTIME=64 $< | python bitcode2cpp.py $< 64bit > $@ @@ -268,20 +276,25 @@ objs/builtins-c-64.cpp: builtins/builtins.c objs/stdlib_mask1_ispc.cpp: stdlib.ispc @echo Creating C++ source from $< for mask1 - @$(CLANG) -E -x c -DISPC_MASK_BITS=1 -DISPC=1 -DPI=3.1415926536 $< -o - | \ + @$(CLANG) -E -x c -DISPC_MASK_BITS=1 -DISPC=1 -DPI=3.14159265358979 $< -o - | \ python stdlib2cpp.py mask1 > $@ objs/stdlib_mask8_ispc.cpp: stdlib.ispc @echo Creating C++ source from $< for mask8 - @$(CLANG) -E -x c -DISPC_MASK_BITS=8 -DISPC=1 -DPI=3.1415926536 $< -o - | \ + @$(CLANG) -E -x c -DISPC_MASK_BITS=8 -DISPC=1 -DPI=3.14159265358979 $< -o - | \ python stdlib2cpp.py mask8 > $@ objs/stdlib_mask16_ispc.cpp: stdlib.ispc @echo Creating C++ source from $< for mask16 - @$(CLANG) -E -x c -DISPC_MASK_BITS=16 -DISPC=1 -DPI=3.1415926536 $< -o - | \ + @$(CLANG) -E -x c -DISPC_MASK_BITS=16 -DISPC=1 -DPI=3.14159265358979 $< -o - | \ python stdlib2cpp.py mask16 > $@ objs/stdlib_mask32_ispc.cpp: stdlib.ispc @echo Creating C++ source from $< for mask32 - @$(CLANG) -E -x c -DISPC_MASK_BITS=32 -DISPC=1 -DPI=3.1415926536 $< -o - | \ + @$(CLANG) -E -x c -DISPC_MASK_BITS=32 -DISPC=1 -DPI=3.14159265358979 $< -o - | \ python stdlib2cpp.py mask32 > $@ + +objs/stdlib_mask64_ispc.cpp: stdlib.ispc + @echo Creating C++ source from $< for mask64 + @$(CLANG) -E -x c -DISPC_MASK_BITS=64 -DISPC=1 -DPI=3.14159265358979 $< -o - | \ + python stdlib2cpp.py mask64 > $@ diff --git a/alloy.py b/alloy.py new file mode 100755 index 00000000..657e67bf --- /dev/null +++ b/alloy.py @@ -0,0 +1,732 @@ +#!/usr/bin/python +# +# Copyright (c) 2013, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer.
+# +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +# IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +# OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# // Author: Filippov Ilia + +def attach_mail_file(msg, filename, name): + if os.path.exists(filename): + fp = open(filename, "rb") + to_attach = MIMEBase("application", "octet-stream") + to_attach.set_payload(fp.read()) + encode_base64(to_attach) + to_attach.add_header("Content-Disposition", "attachment", filename=name) + fp.close() + msg.attach(to_attach) + +def setting_paths(llvm, ispc, sde): + if llvm != "": + os.environ["LLVM_HOME"]=llvm + if ispc != "": + os.environ["ISPC_HOME"]=ispc + if sde != "": + os.environ["SDE_HOME"]=sde + +def check_LLVM(which_LLVM): + answer = [] + if which_LLVM[0] == " ": + return answer + p = os.environ["LLVM_HOME"] + for i in range(0,len(which_LLVM)): + if not os.path.exists(p + os.sep + "bin-" + which_LLVM[i] + os.sep + "bin"): + answer.append(which_LLVM[i]) + return answer + +def try_do_LLVM(text, command, from_validation): + if from_validation == True: + text = text + "\n" + print_debug("Trying to " + text, from_validation, alloy_build) + postfix = "" + if current_OS == "Windows": + postfix = " 1>> " + alloy_build + " 2>&1" + else: + postfix = " >> " + alloy_build + " 2>> " + alloy_build + if os.system(command + postfix) != 0: + print_debug("ERROR.\n", from_validation, alloy_build) + error("can't " + text, 1) + print_debug("DONE.\n", from_validation, alloy_build) + +def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, extra, from_validation, force, make): + print_debug("Building LLVM. Version: " + version_LLVM + ". 
", from_validation, alloy_build) + if revision != "": + print_debug("Revision: " + revision + ".\n", from_validation, alloy_build) + else: + print_debug("\n", from_validation, alloy_build) + # Here we understand what and where do we want to build + current_path = os.getcwd() + llvm_home = os.environ["LLVM_HOME"] + os.chdir(llvm_home) + FOLDER_NAME=version_LLVM + if version_LLVM == "trunk": + SVN_PATH="trunk" + if version_LLVM == "3.4": + SVN_PATH="tags/RELEASE_34/rc1" + version_LLVM = "3_4" + if version_LLVM == "3.3": + SVN_PATH="tags/RELEASE_33/final" + version_LLVM = "3_3" + if version_LLVM == "3.2": + SVN_PATH="tags/RELEASE_32/final" + version_LLVM = "3_2" + if version_LLVM == "3.1": + SVN_PATH="tags/RELEASE_31/final" + version_LLVM = "3_1" + if revision != "": + FOLDER_NAME = FOLDER_NAME + "_" + revision + revision = "-" + revision + if folder == "": + folder = FOLDER_NAME + LLVM_SRC="llvm-" + folder + LLVM_BUILD="build-" + folder + LLVM_BIN="bin-" + folder + if os.path.exists(LLVM_BIN + os.sep + "bin") and not force: + error("you have folder " + LLVM_BIN + ".\nIf you want to rebuild use --force", 1) + LLVM_BUILD_selfbuild = LLVM_BUILD + "_temp" + LLVM_BIN_selfbuild = LLVM_BIN + "_temp" + common.remove_if_exists(LLVM_SRC) + common.remove_if_exists(LLVM_BUILD) + common.remove_if_exists(LLVM_BIN) + if selfbuild: + common.remove_if_exists(LLVM_BUILD_selfbuild) + common.remove_if_exists(LLVM_BIN_selfbuild) + print_debug("Using folders: " + LLVM_SRC + " " + LLVM_BUILD + " " + LLVM_BIN + " in " + + llvm_home + "\n", from_validation, alloy_build) + # load llvm + if tarball == "": + try_do_LLVM("load LLVM from http://llvm.org/svn/llvm-project/llvm/" + SVN_PATH + " ", + "svn co " + revision + " http://llvm.org/svn/llvm-project/llvm/" + SVN_PATH + " " + LLVM_SRC, + from_validation) + os.chdir(LLVM_SRC + "/tools") + try_do_LLVM("load clang from http://llvm.org/svn/llvm-project/cfe/" + SVN_PATH + " ", + "svn co " + revision + " http://llvm.org/svn/llvm-project/cfe/" + SVN_PATH + " clang", + from_validation) + if extra == True: + os.chdir("./clang/tools") + try_do_LLVM("load extra clang extra tools ", + "svn co " + revision + " http://llvm.org/svn/llvm-project/clang-tools-extra/" + SVN_PATH + " extra", + from_validation) + os.chdir("../../../projects") + try_do_LLVM("load extra clang compiler-rt ", + "svn co " + revision + " http://llvm.org/svn/llvm-project/compiler-rt/" + SVN_PATH + " compiler-rt", + from_validation) + os.chdir("../") + else: + tar = tarball.split(" ") + os.makedirs(LLVM_SRC) + os.chdir(LLVM_SRC) + try_do_LLVM("untar LLVM from " + tar[0] + " ", + "tar -xvzf " + tar[0] + " --strip-components 1", from_validation) + os.chdir("./tools") + os.makedirs("clang") + os.chdir("./clang") + try_do_LLVM("untar clang from " + tar[1] + " ", + "tar -xvzf " + tar[1] + " --strip-components 1", from_validation) + os.chdir("../../") + # paching llvm + patches = glob.glob(os.environ["ISPC_HOME"] + os.sep + "llvm_patches" + os.sep + "*.*") + for patch in patches: + if version_LLVM in os.path.basename(patch): + if current_OS != "Windows": + try_do_LLVM("patch LLVM with patch " + patch + " ", "patch -p0 < " + patch, from_validation) + else: + try_do_LLVM("patch LLVM with patch " + patch + " ", "C:\\gnuwin32\\bin\\patch.exe -p0 < " + patch, from_validation) + os.chdir("../") + # configuring llvm, build first part of selfbuild + os.makedirs(LLVM_BUILD) + os.makedirs(LLVM_BIN) + selfbuild_compiler = "" + if selfbuild: + print_debug("Making selfbuild and use folders " + LLVM_BUILD_selfbuild + " and " + + 
LLVM_BIN_selfbuild + "\n", from_validation, alloy_build) + os.makedirs(LLVM_BUILD_selfbuild) + os.makedirs(LLVM_BIN_selfbuild) + os.chdir(LLVM_BUILD_selfbuild) + try_do_LLVM("configure release version for selfbuild ", + "../" + LLVM_SRC + "/configure --prefix=" + llvm_home + "/" + + LLVM_BIN_selfbuild + " --enable-optimized", + from_validation) + try_do_LLVM("build release version for selfbuild ", + make, from_validation) + try_do_LLVM("install release version for selfbuild ", + "make install", + from_validation) + os.chdir("../") + selfbuild_compiler = " CC="+llvm_home+ "/" + LLVM_BIN_selfbuild + "/bin/clang" + print_debug("Now we have compiler for selfbuild: " + selfbuild_compiler + "\n", from_validation, alloy_build) + os.chdir(LLVM_BUILD) + if debug == False: + if current_OS != "Windows": + try_do_LLVM("configure release version ", + "../" + LLVM_SRC + "/configure --prefix=" + llvm_home + "/" + + LLVM_BIN + " --enable-optimized" + selfbuild_compiler, + from_validation) + else: + try_do_LLVM("configure release version ", + 'cmake -G "Visual Studio 10" -DCMAKE_INSTALL_PREFIX="..\\'+ LLVM_BIN + + '" -DLLVM_LIT_TOOLS_DIR="C:\\gnuwin32\\bin" ..\\' + LLVM_SRC, + from_validation) + else: + try_do_LLVM("configure debug version ", + "../" + LLVM_SRC + "/configure --prefix=" + llvm_home + "/" + LLVM_BIN + + " --enable-debug-runtime --enable-debug-symbols --enable-keep-symbols" + selfbuild_compiler, + from_validation) + # building llvm + if current_OS != "Windows": + try_do_LLVM("build LLVM ", make, from_validation) + try_do_LLVM("install LLVM ", "make install", from_validation) + else: + try_do_LLVM("build LLVM and than install LLVM ", "msbuild INSTALL.vcxproj /V:m /p:Platform=Win32 /p:Configuration=Release /t:rebuild", from_validation) + os.chdir(current_path) + +def check_targets(): + answer = [] + answer_sde = [] + # check what native targets do we have + if current_OS != "Windows": + try_do_LLVM("build check_ISA", "clang check_isa.cpp -o check_isa.exe", True) + else: + try_do_LLVM("build check_ISA", "cl check_isa.cpp", True) + SSE2 = ["sse2-i32x4", "sse2-i32x8"] + SSE4 = ["sse4-i32x4", "sse4-i32x8", "sse4-i16x8", "sse4-i8x16"] + AVX = ["avx1-i32x4", "avx1-i32x8", "avx1-i32x16", "avx1-i64x4"] + AVX11 = ["avx1.1-i32x8","avx1.1-i32x16","avx1.1-i64x4"] + AVX2 = ["avx2-i32x8", "avx2-i32x16", "avx2-i64x4"] + targets = [["AVX2", AVX2, False], ["AVX1.1", AVX11, False], ["AVX", AVX, False], ["SSE4", SSE4, False], ["SSE2", SSE2, False]] + f_lines = take_lines("check_isa.exe", "first") + for i in range(0,5): + if targets[i][0] in f_lines: + for j in range(i,5): + answer = targets[j][1] + answer + targets[j][2] = True + break + if current_OS != "Windows": + answer = answer + ["generic-4", "generic-16", "generic-8", "generic-1", "generic-32", "generic-64"] + # now check what targets we have with the help of SDE + sde_exists = "" + PATH_dir = string.split(os.getenv("PATH"), os.pathsep) + if current_OS == "Windows": + sde_n = "sde.exe" + else: + sde_n = "sde" + for counter in PATH_dir: + if os.path.exists(counter + os.sep + sde_n) and sde_exists == "": + sde_exists = counter + os.sep + sde_n + if os.environ.get("SDE_HOME") != None: + if os.path.exists(os.environ.get("SDE_HOME") + os.sep + sde_n): + sde_exists = os.environ.get("SDE_HOME") + os.sep + sde_n + if sde_exists == "": + error("you haven't got sde neither in SDE_HOME nor in your PATH.\n" + + "To test all platforms please set SDE_HOME to path containing SDE.\n" + + "Please refer to http://www.intel.com/software/sde for SDE download information.", 2) 
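+        # without SDE, answer_sde stays empty and only the native targets collected above can be tested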
+ return [answer, answer_sde] + # here we have SDE + f_lines = take_lines(sde_exists + " -help", "all") + for i in range(0,len(f_lines)): + if targets[3][2] == False and "wsm" in f_lines[i]: + answer_sde = answer_sde + [["-wsm", "sse4-i32x4"], ["-wsm", "sse4-i32x8"], ["-wsm", "sse4-i16x8"], ["-wsm", "sse4-i8x16"]] + if targets[2][2] == False and "snb" in f_lines[i]: + answer_sde = answer_sde + [["-snb", "avx1-i32x4"], ["-snb", "avx1-i32x8"], ["-snb", "avx1-i32x16"], ["-snb", "avx1-i64x4"]] + if targets[1][2] == False and "ivb" in f_lines[i]: + answer_sde = answer_sde + [["-ivb", "avx1.1-i32x8"], ["-ivb", "avx1.1-i32x16"], ["-ivb", "avx1.1-i64x4"]] + if targets[0][2] == False and "hsw" in f_lines[i]: + answer_sde = answer_sde + [["-hsw", "avx2-i32x8"], ["-hsw", "avx2-i32x16"], ["-hsw", "avx2-i64x4"]] + return [answer, answer_sde] + +def build_ispc(version_LLVM, make): + current_path = os.getcwd() + os.chdir(os.environ["ISPC_HOME"]) + if current_OS != "Windows": + p_temp = os.getenv("PATH") + os.environ["PATH"] = os.environ["LLVM_HOME"] + "/bin-" + version_LLVM + "/bin:" + os.environ["PATH"] + try_do_LLVM("clean ISPC for building", "make clean", True) + try_do_LLVM("build ISPC with LLVM version " + version_LLVM + " ", make, True) + os.environ["PATH"] = p_temp + else: + p_temp = os.getenv("LLVM_INSTALL_DIR") + v_temp = os.getenv("LLVM_VERSION") + os.environ["LLVM_INSTALL_DIR"] = os.environ["LLVM_HOME"] + "\\bin-" + version_LLVM + if version_LLVM == "3.3": + temp = "3_3" + if version_LLVM == "3.4": + temp = "3_4" + if version_LLVM == "trunk": + temp = "3_5" + os.environ["LLVM_VERSION"] = "LLVM_" + temp + try_do_LLVM("clean ISPC for building", "msbuild ispc.vcxproj /t:clean", True) + try_do_LLVM("build ISPC with LLVM version " + version_LLVM + " ", "msbuild ispc.vcxproj /V:m /p:Platform=Win32 /p:Configuration=Release /t:rebuild", True) + os.environ["LLVM_INSTALL_DIR"] = p_temp + os.environ["LLVM_VERSION"] = v_temp + os.chdir(current_path) + +def execute_stability(stability, R, print_version): + stability1 = copy.deepcopy(stability) + b_temp = run_tests.run_tests(stability1, [], print_version) + temp = b_temp[0] + time = b_temp[1] + for j in range(0,4): + R[j][0] = R[j][0] + temp[j] + for i in range(0,len(temp[j])): + R[j][1].append(temp[4]) + number_of_fails = temp[5] + number_of_new_fails = len(temp[0]) + len(temp[1]) + number_of_passes = len(temp[2]) + len(temp[3]) + if number_of_fails == 0: + str_fails = ". No fails" + else: + str_fails = ". Fails: " + str(number_of_fails) + if number_of_new_fails == 0: + str_new_fails = ", No new fails" + else: + str_new_fails = ", New fails: " + str(number_of_new_fails) + if number_of_passes == 0: + str_new_passes = "." + else: + str_new_passes = ", " + str(number_of_passes) + " new passes." 
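+            # the fail / new-fail / new-pass strings above form the one-line summary printed below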
+ if stability.time: + str_time = " " + time + "\n" + else: + str_time = "\n" + print_debug(temp[4][1:-3] + str_fails + str_new_fails + str_new_passes + str_time, False, stability_log) + +def run_special_tests(): + i = 5 + +class options_for_drivers: + pass + +def validation_run(only, only_targets, reference_branch, number, notify, update, speed_number, make, perf_llvm, time): + os.chdir(os.environ["ISPC_HOME"]) + if current_OS != "Windows": + os.environ["PATH"] = os.environ["ISPC_HOME"] + ":" + os.environ["PATH"] + if options.notify != "": + common.remove_if_exists(os.environ["ISPC_HOME"] + os.sep + "notify_log.log") + smtp_server = os.environ["SMTP_ISPC"] + msg = MIMEMultipart() + msg['Subject'] = 'ISPC test system results' + msg['From'] = 'ISPC_test_system' + msg['To'] = options.notify + print_debug("Command: " + ' '.join(sys.argv) + "\n", False, "") + print_debug("Folder: " + os.environ["ISPC_HOME"] + "\n", False, "") + date = datetime.datetime.now() + print_debug("Date: " + date.strftime('%H:%M %d/%m/%Y') + "\n", False, "") +# *** *** *** +# Stability validation run +# *** *** *** + if ((("stability" in only) == True) or ("performance" in only) == False): + print_debug("\n\nStability validation run\n\n", False, "") + stability = options_for_drivers() +# stability constant options + stability.random = False + stability.ispc_flags = "" + stability.compiler_exe = None + stability.num_jobs = speed_number + stability.verbose = False + stability.time = time + stability.non_interactive = True + stability.update = update + stability.include_file = None + stability.silent = True + stability.in_file = "." + os.sep + f_date + os.sep + "run_tests_log.log" + stability.verify = False +# stability varying options + stability.target = "" + stability.arch = "" + stability.no_opt = False + stability.wrapexe = "" +# prepare parameters of run + [targets_t, sde_targets_t] = check_targets() + rebuild = True + opts = [] + archs = [] + LLVM = [] + targets = [] + sde_targets = [] +# parsing option only, update parameters of run + if "-O2" in only: + opts.append(False) + if "-O0" in only: + opts.append(True) + if "x86" in only and not ("x86-64" in only): + archs.append("x86") + if "x86-64" in only: + archs.append("x86-64") + if "native" in only: + sde_targets_t = [] + for i in ["3.1", "3.2", "3.3", "3.4", "trunk"]: + if i in only: + LLVM.append(i) + if "current" in only: + LLVM = [" "] + rebuild = False + else: + common.check_tools(1) + if only_targets != "": + only_targets += " " + only_targets = only_targets.replace("generic "," generic-4 generic-16 ") + only_targets_t = only_targets.split(" ") + for i in only_targets_t: + if i == "": + continue + err = True + for j in range(0,len(targets_t)): + if i in targets_t[j]: + targets.append(targets_t[j]) + err = False + for j in range(0,len(sde_targets_t)): + if i in sde_targets_t[j][1]: + sde_targets.append(sde_targets_t[j]) + err = False + if err == True: + error("You haven't sde for target " + i, 1) + else: + targets = targets_t[:-4] + sde_targets = sde_targets_t + if "build" in only: + targets = [] + sde_targets = [] + only = only + " stability " +# finish parameters of run, prepare LLVM + if len(opts) == 0: + opts = [False] + if len(archs) == 0: + archs = ["x86", "x86-64"] + if len(LLVM) == 0: + LLVM = ["3.3", "trunk"] + gen_archs = ["x86-64"] + need_LLVM = check_LLVM(LLVM) + for i in range(0,len(need_LLVM)): + build_LLVM(need_LLVM[i], "", "", "", False, False, False, True, False, make) +# begin validation run for stabitily + 
common.remove_if_exists(stability.in_file) + R = [[[],[]],[[],[]],[[],[]],[[],[]]] + print_debug("\n_________________________STABILITY REPORT_________________________\n", False, stability_log) + for i in range(0,len(LLVM)): + print_version = 2 + if rebuild: + build_ispc(LLVM[i], make) + for j in range(0,len(targets)): + stability.target = targets[j] + stability.wrapexe = "" + if "generic" in targets[j]: + arch = gen_archs + else: + arch = archs + for i1 in range(0,len(arch)): + for i2 in range(0,len(opts)): + stability.arch = arch[i1] + stability.no_opt = opts[i2] + execute_stability(stability, R, print_version) + print_version = 0 + for j in range(0,len(sde_targets)): + stability.target = sde_targets[j][1] + stability.wrapexe = os.environ["SDE_HOME"] + "/sde " + sde_targets[j][0] + " -- " + for i1 in range(0,len(archs)): + for i2 in range(0,len(opts)): + stability.arch = archs[i1] + stability.no_opt = opts[i2] + execute_stability(stability, R, print_version) + print_version = 0 +# run special tests like embree +# + run_special_tests() + ttt = ["NEW RUNFAILS: ", "NEW COMPFAILS: ", "NEW PASSES RUNFAILS: ", "NEW PASSES COMPFAILS: "] + for j in range(0,4): + if len(R[j][0]) == 0: + print_debug("NO " + ttt[j][:-2] + "\n", False, stability_log) + else: + print_debug(ttt[j] + str(len(R[j][0])) + "\n", False, stability_log) + temp5 = [[],[]] + for i in range(0,len(R[j][0])): + er = True + for k in range(0,len(temp5[0])): + if R[j][0][i] == temp5[0][k]: + temp5[1][k].append(R[j][1][i]) + er = False + if er == True: + temp5[0].append(R[j][0][i]) + temp5[1].append([R[j][1][i]]) + for i in range(0,len(temp5[0])): + print_debug("\t" + temp5[0][i] + "\n", True, stability_log) + for k in range(0,len(temp5[1][i])): + print_debug("\t\t\t" + temp5[1][i][k], True, stability_log) + print_debug("__________________Watch stability.log for details_________________\n", False, stability_log) + if options.notify != "": + attach_mail_file(msg, stability.in_file, "run_tests_log.log") + attach_mail_file(msg, stability_log, "stability.log") + +# *** *** *** +# Performance validation run +# *** *** *** + if ((("performance" in only) == True) or ("stability" in only) == False): + print_debug("\n\nPerformance validation run\n\n", False, "") + common.check_tools(1) + performance = options_for_drivers() +# performance constant options + performance.number = number + performance.config = "." + os.sep + "perf.ini" + performance.path = "." + os.sep + performance.silent = True + performance.output = "" + performance.compiler = "" + performance.ref = "ispc_ref" + if current_OS == "Windows": + performance.ref = "ispc_ref.exe" + performance.perf_target = "" + performance.in_file = "." + os.sep + f_date + os.sep + "performance.log" +# prepare LLVM 3.3 as newest LLVM + need_LLVM = check_LLVM(["3.3"]) + if len(need_LLVM) != 0: + build_LLVM(need_LLVM[0], "", "", "", False, False, False, True, False, make) + if perf_llvm == False: + # prepare reference point. build both test and reference compilers + try_do_LLVM("apply git", "git branch", True) + temp4 = take_lines("git branch", "all") + for line in temp4: + if "*" in line: + current_branch = line[2:-1] + stashing = True + sys.stdout.write("Please, don't interrupt script here! 
You can have not sync git status after interruption!\n") + if "No local changes" in take_lines("git stash", "first"): + stashing = False + #try_do_LLVM("stash current branch ", "git stash", True) + try_do_LLVM("checkout reference branch " + reference_branch + " ", "git checkout " + reference_branch, True) + sys.stdout.write(".\n") + build_ispc("3.3", make) + sys.stdout.write(".\n") + if current_OS != "Windows": + os.rename("ispc", "ispc_ref") + else: + common.remove_if_exists("Release\\ispc_ref.exe") + os.rename("Release\\ispc.exe", "Release\\ispc_ref.exe") + try_do_LLVM("checkout test branch " + current_branch + " ", "git checkout " + current_branch, True) + if stashing: + try_do_LLVM("return current branch ", "git stash pop", True) + sys.stdout.write("You can interrupt script now.\n") + build_ispc("3.3", make) + else: + # build compiler with two different LLVM versions + if len(check_LLVM([reference_branch])) != 0: + error("you haven't got llvm called " + reference_branch, 1) + build_ispc("3.3", make) + os.rename("ispc", "ispc_ref") + build_ispc(reference_branch, make) +# begin validation run for performance. output is inserted into perf() + perf.perf(performance, []) + if options.notify != "": + attach_mail_file(msg, performance.in_file, "performance.log") + attach_mail_file(msg, "." + os.sep + "logs" + os.sep + "perf_build.log", "perf_build.log") + +# sending e-mail with results + if options.notify != "": + fp = open(os.environ["ISPC_HOME"] + os.sep + "notify_log.log", 'rb') + f_lines = fp.readlines() + fp.close() + line = "" + for i in range(0,len(f_lines)): + line = line + f_lines[i][:-1] + line = line + ' \n' + text = MIMEText(line, "", "KOI-8") + msg.attach(text) + attach_mail_file(msg, alloy_build, "alloy_build.log") + s = smtplib.SMTP(smtp_server) + s.sendmail('ISPC_test_system', options.notify, msg.as_string()) + s.quit() + +def Main(): + global current_OS + if (platform.system() == 'Windows' or 'CYGWIN_NT' in platform.system()) == True: + current_OS = "Windows" + else: + if (platform.system() == 'Darwin'): + current_OS = "MacOS" + else: + current_OS = "Linux" + + if (options.build_llvm == False and options.validation_run == False): + parser.print_help() + exit(0) + + setting_paths(options.llvm_home, options.ispc_home, options.sde_home) + if os.environ.get("LLVM_HOME") == None: + error("you have no LLVM_HOME", 1) + if os.environ.get("ISPC_HOME") == None: + error("you have no ISPC_HOME", 1) + if options.notify != "": + if os.environ.get("SMTP_ISPC") == None: + error("you have no SMTP_ISPC in your environment for option notify", 1) + if options.only != "": + test_only_r = " 3.1 3.2 3.3 trunk current build stability performance x86 x86-64 -O0 -O2 native " + test_only = options.only.split(" ") + for iterator in test_only: + if not (" " + iterator + " " in test_only_r): + error("unknow option for only: " + iterator, 1) + if current_OS == "Windows": + if options.debug == True or options.selfbuild == True or options.tarball != "": + error("Debug, selfbuild and tarball options are unsupported on windows", 1) + global f_date + f_date = "logs" + common.remove_if_exists(f_date) + os.makedirs(f_date) + global alloy_build + alloy_build = os.getcwd() + os.sep + f_date + os.sep + "alloy_build.log" + global stability_log + stability_log = os.getcwd() + os.sep + f_date + os.sep + "stability.log" + current_path = os.getcwd() + make = "make -j" + options.speed + if os.environ["ISPC_HOME"] != os.getcwd(): + error("you ISPC_HOME and your current path are different!\n", 2) + if options.perf_llvm == 
True: + if options.branch == "master": + options.branch = "trunk" + try: + start_time = time.time() + if options.build_llvm: + build_LLVM(options.version, options.revision, options.folder, options.tarball, + options.debug, options.selfbuild, options.extra, False, options.force, make) + if options.validation_run: + validation_run(options.only, options.only_targets, options.branch, + options.number_for_performance, options.notify, options.update, int(options.speed), + make, options.perf_llvm, options.time) + elapsed_time = time.time() - start_time + if options.time: + print_debug("Elapsed time: " + time.strftime('%Hh%Mm%Ssec.', time.gmtime(elapsed_time)) + "\n", False, "") + finally: + os.chdir(current_path) + date_name = "alloy_results_" + datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S') + if os.path.exists(date_name): + error("It's forbidden to run alloy two times in a second, logs are in ./logs", 1) + os.rename(f_date, date_name) + print_debug("Logs are in " + date_name + "\n", False, "") + +###Main### +from optparse import OptionParser +from optparse import OptionGroup +import sys +import os +import operator +import time +import glob +import string +import platform +import smtplib +import datetime +import copy +import multiprocessing +from email.MIMEMultipart import MIMEMultipart +from email.MIMEBase import MIMEBase +from email.mime.text import MIMEText +from email.Encoders import encode_base64 +# our drivers +import run_tests +import perf +import common +error = common.error +take_lines = common.take_lines +print_debug = common.print_debug +if __name__ == '__main__': + # parsing options + class MyParser(OptionParser): + def format_epilog(self, formatter): + return self.epilog + examples = ("Examples:\n" + + "Load and build LLVM from trunk\n\talloy.py -b\n" + + "Load and build LLVM 3.3. Rewrite LLVM folders\n\talloy.py -b --version=3.3 --force\n" + + "Untar files llvm.tgz clang.tgz, build LLVM from them in folder bin-from_tar\n\talloy.py -b --tarball='llvm.tgz clang.tgz' --folder=from_tar\n" + + "Load LLVM from trunk, revision r172870. Build it. Do selfbuild\n\talloy.py -b --revision=r172870 --selfbuild\n" + + "Validation run with LLVM 3.3, trunk; x86, x86-64; -O2;\nall supported targets; performance\n\talloy.py -r\n" + + "Validation run with all avx targets and sse4-i8x16 without performance\n\talloy.py -r --only=stability --only-targets='avx sse4-i8x16'\n" + + "Validation run with avx2-i32x8, all sse4 and sse2 targets\nand all targets with i32x16\n\talloy.py -r --only-targets='avx2-i32x8 sse4 i32x16 sse2'\n" + + "Stability validation run with LLVM 3.2, 3.3; -O0; x86,\nupdate fail_db.txt with passes and fails\n\talloy.py -r --only='3.2 -O0 stability 3.3 x86' --update-errors=FP\n" + + "Try to build compiler with all LLVM\n\talloy.py -r --only=build\n" + + "Performance validation run with 10 runs of each test and comparing to branch 'old'\n\talloy.py -r --only=performance --compare-with=old --number=10\n" + + "Validation run. 
Update fail_db.txt with new fails, send results to my@my.com\n\talloy.py -r --update-errors=F --notify='my@my.com'\n") + num_threads="%s" % multiprocessing.cpu_count() + parser = MyParser(usage="Usage: alloy.py -r/-b [options]", epilog=examples) + parser.add_option('-b', '--build-llvm', dest='build_llvm', + help='ask to build LLVM', default=False, action="store_true") + parser.add_option('-r', '--run', dest='validation_run', + help='ask for validation run', default=False, action="store_true") + parser.add_option('-j', dest='speed', + help='set -j for make', default=num_threads) + # options for activity "build LLVM" + llvm_group = OptionGroup(parser, "Options for building LLVM", + "These options must be used with -b option.") + llvm_group.add_option('--version', dest='version', + help='version of llvm to build: 3.1 3.2 3.3 3.4 trunk. Default: trunk', default="trunk") + llvm_group.add_option('--revision', dest='revision', + help='revision of llvm to build in format r172870', default="") + llvm_group.add_option('--debug', dest='debug', + help='debug build of LLVM?', default=False, action="store_true") + llvm_group.add_option('--folder', dest='folder', + help='folder to build LLVM in', default="") + llvm_group.add_option('--tarball', dest='tarball', + help='"llvm_tarball clang_tarball"', default="") + llvm_group.add_option('--selfbuild', dest='selfbuild', + help='make selfbuild of LLVM and clang', default=False, action="store_true") + llvm_group.add_option('--force', dest='force', + help='rebuild LLVM', default=False, action='store_true') + llvm_group.add_option('--extra', dest='extra', + help='load extra clang tools', default=False, action='store_true') + parser.add_option_group(llvm_group) + # options for activity "validation run" + run_group = OptionGroup(parser, "Options for validation run", + "These options must be used with -r option.") + run_group.add_option('--compare-with', dest='branch', + help='set performance reference point. Dafault: master', default="master") + run_group.add_option('--number', dest='number_for_performance', + help='number of performance runs for each test. Default: 5', default=5) + run_group.add_option('--notify', dest='notify', + help='email to sent results to', default="") + run_group.add_option('--update-errors', dest='update', + help='rewrite fail_db.txt file according to received results (F or FP)', default="") + run_group.add_option('--only-targets', dest='only_targets', + help='set list of targets to test. Possible values - all subnames of targets.', + default="") + run_group.add_option('--time', dest='time', + help='display time of testing', default=False, action='store_true') + run_group.add_option('--only', dest='only', + help='set types of tests. 
Possible values:\n' + '-O0, -O2, x86, x86-64, stability (test only stability), performance (test only performance)\n' + 'build (only build with different LLVM), 3.1, 3.2, 3.3, 3.4, trunk, native (do not use SDE), current (do not rebuild ISPC).', default="") + run_group.add_option('--perf_LLVM', dest='perf_llvm', + help='compare LLVM 3.3 with the LLVM given by "--compare-with" (default: trunk)', default=False, action='store_true') + parser.add_option_group(run_group) + # options for activity "setup PATHS" + setup_group = OptionGroup(parser, "Options for setup", + "These options must be used with -r or -b to set up environment variables") + setup_group.add_option('--llvm_home', dest='llvm_home',help='path to LLVM',default="") + setup_group.add_option('--ispc_home', dest='ispc_home',help='path to ISPC',default="") + setup_group.add_option('--sde_home', dest='sde_home',help='path to SDE',default="") + parser.add_option_group(setup_group) + (options, args) = parser.parse_args() + Main() diff --git a/builtins.cpp b/builtins.cpp index 886eec15..2afd92d9 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -302,6 +302,7 @@ lCheckModuleIntrinsics(llvm::Module *module) { // check the llvm.x86.* intrinsics for now... if (!strncmp(funcName.c_str(), "llvm.x86.", 9)) { llvm::Intrinsic::ID id = (llvm::Intrinsic::ID)func->getIntrinsicID(); + if (id == 0) fprintf(stderr, "FATAL: intrinsic not found: %s\n", funcName.c_str()); Assert(id != 0); llvm::Type *intrinsicType = llvm::Intrinsic::getType(*g->ctx, id); @@ -535,6 +536,12 @@ lSetInternalFunctions(llvm::Module *module) { "__set_system_isa", "__sext_uniform_bool", "__sext_varying_bool", + "__shift_double", + "__shift_float", + "__shift_i16", + "__shift_i32", + "__shift_i64", + "__shift_i8", "__shuffle2_double", "__shuffle2_float", "__shuffle2_i16", @@ -576,20 +583,34 @@ "__stdlib_pow", "__stdlib_powf", "__stdlib_sin", + "__stdlib_asin", "__stdlib_sincos", "__stdlib_sincosf", "__stdlib_sinf", "__stdlib_tan", "__stdlib_tanf", - "__svml_sin", - "__svml_cos", - "__svml_sincos", - "__svml_tan", - "__svml_atan", - "__svml_atan2", - "__svml_exp", - "__svml_log", - "__svml_pow", + "__svml_sind", + "__svml_asind", + "__svml_cosd", + "__svml_acosd", + "__svml_sincosd", + "__svml_tand", + "__svml_atand", + "__svml_atan2d", + "__svml_expd", + "__svml_logd", + "__svml_powd", + "__svml_sinf", + "__svml_asinf", + "__svml_cosf", + "__svml_acosf", + "__svml_sincosf", + "__svml_tanf", + "__svml_atanf", + "__svml_atan2f", + "__svml_expf", + "__svml_logf", + "__svml_powf", "__undef_uniform", "__undef_varying", "__vec4_add_float", @@ -920,6 +941,34 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod } case Target::AVX: { switch (g->target->getVectorWidth()) { + case 4: + if (g->target->getDataTypeWidth() == 32) { + // Note that for avx1-i32x4 we are using the bitcode file for + // sse4-i32x4. This is intentional and good enough. + // The AVX target implies the appropriate target-feature attribute, + // which forces LLVM to generate AVX code, even for SSE4 + // intrinsics. The only "missing" feature in the sse4 + // target is the implementation of __masked_[store|load]_[i32|i64] + // using the maskmov instruction. But these intrinsics are not + // very widely used, so we assume the implementation to be good + // enough at the moment.
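+                    // runtime32 selects the 32-bit or 64-bit build of the prebuilt sse4 bitcode below.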
+ if (runtime32) { + EXPORT_MODULE(builtins_bitcode_sse4_32bit); + } + else { + EXPORT_MODULE(builtins_bitcode_sse4_64bit); + } + } else if (g->target->getDataTypeWidth() == 64) { + if (runtime32) { + EXPORT_MODULE(builtins_bitcode_avx1_i64x4_32bit); + } + else { + EXPORT_MODULE(builtins_bitcode_avx1_i64x4_64bit); + } + } else { + FATAL("logic error in DefineStdlib"); + } + break; case 8: if (runtime32) { EXPORT_MODULE(builtins_bitcode_avx1_32bit); @@ -943,6 +992,14 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod } case Target::AVX11: { switch (g->target->getVectorWidth()) { + case 4: + if (runtime32) { + EXPORT_MODULE(builtins_bitcode_avx11_i64x4_32bit); + } + else { + EXPORT_MODULE(builtins_bitcode_avx11_i64x4_64bit); + } + break; case 8: if (runtime32) { EXPORT_MODULE(builtins_bitcode_avx11_32bit); @@ -966,6 +1023,14 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod } case Target::AVX2: { switch (g->target->getVectorWidth()) { + case 4: + if (runtime32) { + EXPORT_MODULE(builtins_bitcode_avx2_i64x4_32bit); + } + else { + EXPORT_MODULE(builtins_bitcode_avx2_i64x4_64bit); + } + break; case 8: if (runtime32) { EXPORT_MODULE(builtins_bitcode_avx2_32bit); @@ -1083,7 +1148,7 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod // serialized version of the stdlib.ispc file to get its // definitions added. extern char stdlib_mask1_code[], stdlib_mask8_code[]; - extern char stdlib_mask16_code[], stdlib_mask32_code[]; + extern char stdlib_mask16_code[], stdlib_mask32_code[], stdlib_mask64_code[]; if (g->target->getISA() == Target::GENERIC && g->target->getVectorWidth() == 1) { // 1 wide uses 32 stdlib yy_scan_string(stdlib_mask32_code); @@ -1102,6 +1167,9 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod case 32: yy_scan_string(stdlib_mask32_code); break; + case 64: + yy_scan_string(stdlib_mask64_code); + break; default: FATAL("Unhandled mask bit size for stdlib.ispc"); } diff --git a/builtins/dispatch.ll b/builtins/dispatch.ll index f1d5a969..ba216df7 100644 --- a/builtins/dispatch.ll +++ b/builtins/dispatch.ll @@ -1,4 +1,4 @@ -;; Copyright (c) 2011, Intel Corporation +;; Copyright (c) 2011-2013, Intel Corporation ;; All rights reserved. ;; ;; Redistribution and use in source and binary forms, with or without @@ -41,15 +41,13 @@ @__system_best_isa = internal global i32 -1 -declare void @abort() noreturn - ;; The below is the result of running "clang -O2 -emit-llvm -c -o -" on the ;; following code... Specifically, __get_system_isa should return a value ;; corresponding to one of the Target::ISA enumerant values that gives the ;; most capable ISA that the curremt system can run. ;; -;; Note: clang from LLVM 3.0 should be used if this is updated, for maximum -;; backwards compatibility for anyone building ispc with LLVM 3.0 +;; Note: clang from LLVM 3.1 should be used if this is updated, for maximum +;; backwards compatibility for anyone building ispc with LLVM 3.1 ;; ;; #include ;; #include @@ -60,7 +58,7 @@ declare void @abort() noreturn ;; : "0" (infoType)); ;; } ;; -;; /* Save %ebx in case it's the PIC register */ +;; // Save %ebx in case it's the PIC register. 
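+;; // (In 32-bit PIC code %ebx holds the GOT pointer, which is why the xchg/cpuid/xchg sequence below preserves it.)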
;; static void __cpuid_count(int info[4], int level, int count) { ;; __asm__ __volatile__ ("xchg{l}\t{%%}ebx, %1\n\t" ;; "cpuid\n\t" @@ -69,13 +67,23 @@ declare void @abort() noreturn ;; : "0" (level), "2" (count)); ;; } ;; +;; static int __os_has_avx_support() { +;; // Check xgetbv; this uses a .byte sequence instead of the instruction +;; // directly because older assemblers do not include support for xgetbv and +;; // there is no easy way to conditionally compile based on the assembler used. +;; int rEAX, rEDX; +;; __asm__ __volatile__ (".byte 0x0f, 0x01, 0xd0" : "=a" (rEAX), "=d" (rEDX) : "c" (0)); +;; return (rEAX & 6) == 6; +;; } +;; ;; int32_t __get_system_isa() { ;; int info[4]; ;; __cpuid(info, 1); ;; -;; /* NOTE: the values returned below must be the same as the -;; corresponding enumerant values in Target::ISA. */ -;; if ((info[2] & (1 << 28)) != 0) { +;; // NOTE: the values returned below must be the same as the +;; // corresponding enumerant values in Target::ISA. +;; if ((info[2] & (1 << 28)) != 0 && +;; __os_has_avx_support()) { ;; if ((info[2] & (1 << 29)) != 0 && // F16C ;; (info[2] & (1 << 30)) != 0) { // RDRAND ;; // So far, so good. AVX2? @@ -98,47 +106,56 @@ declare void @abort() noreturn ;; abort(); ;; } -define i32 @__get_system_isa() nounwind uwtable ssp { +define i32 @__get_system_isa() nounwind uwtable { entry: %0 = tail call { i32, i32, i32, i32 } asm sideeffect "cpuid", "={ax},={bx},={cx},={dx},0,~{dirflag},~{fpsr},~{flags}"(i32 1) nounwind %asmresult5.i = extractvalue { i32, i32, i32, i32 } %0, 2 %asmresult6.i = extractvalue { i32, i32, i32, i32 } %0, 3 %and = and i32 %asmresult5.i, 268435456 %cmp = icmp eq i32 %and, 0 - br i1 %cmp, label %if.else13, label %if.then + br i1 %cmp, label %if.else14, label %land.lhs.true -if.then: ; preds = %entry - %1 = and i32 %asmresult5.i, 1610612736 - %2 = icmp eq i32 %1, 1610612736 - br i1 %2, label %if.then7, label %return +land.lhs.true: ; preds = %entry + %1 = tail call { i32, i32 } asm sideeffect ".byte 0x0f, 0x01, 0xd0", "={ax},={dx},{cx},~{dirflag},~{fpsr},~{flags}"(i32 0) nounwind + %asmresult.i25 = extractvalue { i32, i32 } %1, 0 + %and.i = and i32 %asmresult.i25, 6 + %cmp.i = icmp eq i32 %and.i, 6 + br i1 %cmp.i, label %if.then, label %if.else14 -if.then7: ; preds = %if.then - %3 = tail call { i32, i32, i32, i32 } asm sideeffect "xchg$(l$)\09$(%$)ebx, $1\0A\09cpuid\0A\09xchg$(l$)\09$(%$)ebx, $1\0A\09", "={ax},=r,={cx},={dx},0,2,~{dirflag},~{fpsr},~{flags}"(i32 7, i32 0) nounwind - %asmresult4.i28 = extractvalue { i32, i32, i32, i32 } %3, 1 - %and10 = lshr i32 %asmresult4.i28, 5 - %4 = and i32 %and10, 1 - %5 = add i32 %4, 3 +if.then: ; preds = %land.lhs.true + %2 = and i32 %asmresult5.i, 1610612736 + %3 = icmp eq i32 %2, 1610612736 + br i1 %3, label %if.then8, label %return + +if.then8: ; preds = %if.then + %4 = tail call { i32, i32, i32, i32 } asm sideeffect "xchg$(l$)\09$(%$)ebx, $1\0A\09cpuid\0A\09xchg$(l$)\09$(%$)ebx, $1\0A\09", "={ax},=r,={cx},={dx},0,2,~{dirflag},~{fpsr},~{flags}"(i32 7, i32 0) nounwind + %asmresult4.i30 = extractvalue { i32, i32, i32, i32 } %4, 1 + %and11 = lshr i32 %asmresult4.i30, 5 + %5 = and i32 %and11, 1 + %6 = add i32 %5, 3 br label %return -if.else13: ; preds = %entry - %and15 = and i32 %asmresult5.i, 524288 - %cmp16 = icmp eq i32 %and15, 0 - br i1 %cmp16, label %if.else18, label %return +if.else14: ; preds = %land.lhs.true, %entry + %and16 = and i32 %asmresult5.i, 524288 + %cmp17 = icmp eq i32 %and16, 0 + br i1 %cmp17, label %if.else19, label %return -if.else18: ; preds = %if.else13 - 
%and20 = and i32 %asmresult6.i, 67108864 - %cmp21 = icmp eq i32 %and20, 0 - br i1 %cmp21, label %if.else23, label %return +if.else19: ; preds = %if.else14 + %and21 = and i32 %asmresult6.i, 67108864 + %cmp22 = icmp eq i32 %and21, 0 + br i1 %cmp22, label %if.else24, label %return -if.else23: ; preds = %if.else18 +if.else24: ; preds = %if.else19 tail call void @abort() noreturn nounwind unreachable -return: ; preds = %if.else18, %if.else13, %if.then7, %if.then - %retval.0 = phi i32 [ %5, %if.then7 ], [ 2, %if.then ], [ 1, %if.else13 ], [ 0, %if.else18 ] +return: ; preds = %if.else19, %if.else14, %if.then8, %if.then + %retval.0 = phi i32 [ %6, %if.then8 ], [ 2, %if.then ], [ 1, %if.else14 ], [ 0, %if.else19 ] ret i32 %retval.0 } +declare void @abort() noreturn nounwind + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; This function is called by each of the dispatch functions we generate; ;; it sets @__system_best_isa if it is unset. diff --git a/builtins/svml.m4 b/builtins/svml.m4 new file mode 100644 index 00000000..0a587577 --- /dev/null +++ b/builtins/svml.m4 @@ -0,0 +1,217 @@ +;; copyright stub :) +;; Copyright (c) 2013, Intel Corporation +;; All rights reserved. +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are +;; met: +;; +;; * Redistributions of source code must retain the above copyright +;; notice, this list of conditions and the following disclaimer. +;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; +;; * Neither the name of Intel Corporation nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + +;; svml macro + +;; svml_stubs : stubs for svml calls +;; $1 - type ("float" or "double") +;; $2 - svml internal function suffix ("f" for float, "d" for double) +;; $3 - vector width +define(`svml_stubs',` + declare <$3 x $1> @__svml_sin$2(<$3 x $1>) nounwind readnone alwaysinline + declare <$3 x $1> @__svml_asin$2(<$3 x $1>) nounwind readnone alwaysinline + declare <$3 x $1> @__svml_cos$2(<$3 x $1>) nounwind readnone alwaysinline + declare void @__svml_sincos$2(<$3 x $1>, <$3 x $1> *, <$3 x $1> *) nounwind readnone alwaysinline + declare <$3 x $1> @__svml_tan$2(<$3 x $1>) nounwind readnone alwaysinline + declare <$3 x $1> @__svml_atan$2(<$3 x $1>) nounwind readnone alwaysinline + declare <$3 x $1> @__svml_atan2$2(<$3 x $1>, <$3 x $1>) nounwind readnone alwaysinline + declare <$3 x $1> @__svml_exp$2(<$3 x $1>) nounwind readnone alwaysinline + declare <$3 x $1> @__svml_log$2(<$3 x $1>) nounwind readnone alwaysinline + declare <$3 x $1> @__svml_pow$2(<$3 x $1>, <$3 x $1>) nounwind readnone alwaysinline +') + +;; svml_declare : declaration of __svml_* intrinsics +;; $1 - type ("float" or "double") +;; $2 - __svml_* intrinsic function suffix +;; float: "f4"(sse) "f8"(avx) "f16"(avx512) +;; double: "2"(sse) "4"(avx) "8"(avx512) +;; $3 - vector width +define(`svml_declare',` + declare <$3 x $1> @__svml_sin$2(<$3 x $1>) nounwind readnone + declare <$3 x $1> @__svml_asin$2(<$3 x $1>) nounwind readnone + declare <$3 x $1> @__svml_cos$2(<$3 x $1>) nounwind readnone + declare <$3 x $1> @__svml_sincos$2(<$3 x $1> *, <$3 x $1>) nounwind readnone + declare <$3 x $1> @__svml_tan$2(<$3 x $1>) nounwind readnone + declare <$3 x $1> @__svml_atan$2(<$3 x $1>) nounwind readnone + declare <$3 x $1> @__svml_atan2$2(<$3 x $1>, <$3 x $1>) nounwind readnone + declare <$3 x $1> @__svml_exp$2(<$3 x $1>) nounwind readnone + declare <$3 x $1> @__svml_log$2(<$3 x $1>) nounwind readnone + declare <$3 x $1> @__svml_pow$2(<$3 x $1>, <$3 x $1>) nounwind readnone +'); + +;; defintition of __svml_* internal functions +;; $1 - type ("float" or "double") +;; $2 - __svml_* intrinsic function suffix +;; float: "f4"(sse) "f8"(avx) "f16"(avx512) +;; double: "2"(sse) "4"(avx) "8"(avx512) +;; $3 - vector width +;; $4 - svml internal function suffix ("f" for float, "d" for double) +define(`svml_define',` + define <$3 x $1> @__svml_sin$4(<$3 x $1>) nounwind readnone alwaysinline { + %ret = call <$3 x $1> @__svml_sin$2(<$3 x $1> %0) + ret <$3 x $1> %ret + } + define <$3 x $1> @__svml_asin$4(<$3 x $1>) nounwind readnone alwaysinline { + %ret = call <$3 x $1> @__svml_asin$2(<$3 x $1> %0) + ret <$3 x $1> %ret + } + + define <$3 x $1> @__svml_cos$4(<$3 x $1>) nounwind readnone alwaysinline { + %ret = call <$3 x $1> @__svml_cos$2(<$3 x $1> %0) + ret <$3 x $1> %ret + } + + define void @__svml_sincos$4(<$3 x $1>, <$3 x $1> *, <$3 x $1> *) nounwind readnone alwaysinline { + %s = call <$3 x $1> @__svml_sincos$2(<$3 x $1> * %2, <$3 x $1> %0) + store <$3 x $1> %s, <$3 x $1> * %1 + ret void + } + + define <$3 x $1> @__svml_tan$4(<$3 x $1>) nounwind readnone alwaysinline { + %ret = call <$3 x $1> @__svml_tan$2(<$3 x $1> %0) + ret <$3 x $1> %ret + } + + define <$3 x $1> @__svml_atan$4(<$3 x $1>) nounwind readnone alwaysinline { + %ret = call <$3 x $1> @__svml_atan$2(<$3 x $1> %0) + ret <$3 x $1> %ret + } + + define <$3 x $1> @__svml_atan2$4(<$3 x $1>, <$3 x $1>) nounwind readnone alwaysinline { + %ret = call <$3 x $1> @__svml_atan2$2(<$3 x $1> %0, <$3 x $1> %1) + ret <$3 x $1> %ret + } + + define <$3 x $1> @__svml_exp$4(<$3 x $1>) nounwind 
readnone alwaysinline { + %ret = call <$3 x $1> @__svml_exp$2(<$3 x $1> %0) + ret <$3 x $1> %ret + } + + define <$3 x $1> @__svml_log$4(<$3 x $1>) nounwind readnone alwaysinline { + %ret = call <$3 x $1> @__svml_log$2(<$3 x $1> %0) + ret <$3 x $1> %ret + } + + define <$3 x $1> @__svml_pow$4(<$3 x $1>, <$3 x $1>) nounwind readnone alwaysinline { + %ret = call <$3 x $1> @__svml_pow$2(<$3 x $1> %0, <$3 x $1> %1) + ret <$3 x $1> %ret + } +') + + +;; svml_define_x : defintition of __svml_* internal functions operation on extended width +;; $1 - type ("float" or "double") +;; $2 - __svml_* intrinsic function suffix +;; float: "f4"(sse) "f8"(avx) "f16"(avx512) +;; double: "2"(sse) "4"(avx) "8"(avx512) +;; $3 - vector width +;; $4 - svml internal function suffix ("f" for float, "d" for double) +;; $5 - extended width, must be at least twice the native vector width +;; contigent on existing of unary$3to$5 and binary$3to$5 macros + +;; *todo*: in sincos call use __svml_sincos[f][2,4,8,16] call, e.g. +;;define void @__svml_sincosf(<8 x float>, <8 x float> *, +;; <8 x float> *) nounwind readnone alwaysinline { +;; ; call svml_sincosf4 two times with the two 4-wide sub-vectors +;; %a = shufflevector <8 x float> %0, <8 x float> undef, +;; <4 x i32> +;; %b = shufflevector <8 x float> %0, <8 x float> undef, +;; <4 x i32> +;; +;; %cospa = alloca <4 x float> +;; %sa = call <4 x float> @__svml_sincosf4(<4 x float> * %cospa, <4 x float> %a) +;; +;; %cospb = alloca <4 x float> +;; %sb = call <4 x float> @__svml_sincosf4(<4 x float> * %cospb, <4 x float> %b) +;; +;; %sin = shufflevector <4 x float> %sa, <4 x float> %sb, +;; <8 x i32> +;; store <8 x float> %sin, <8 x float> * %1 +;; +;; %cosa = load <4 x float> * %cospa +;; %cosb = load <4 x float> * %cospb +;; %cos = shufflevector <4 x float> %cosa, <4 x float> %cosb, +;; <8 x i32> +;; store <8 x float> %cos, <8 x float> * %2 +;; +;; ret void +;;} +define(`svml_define_x',` + define <$5 x $1> @__svml_sin$4(<$5 x $1>) nounwind readnone alwaysinline { + unary$3to$5(ret, $1, @__svml_sin$2, %0) + ret <$5 x $1> %ret + } + define <$5 x $1> @__svml_asin$4(<$5 x $1>) nounwind readnone alwaysinline { + unary$3to$5(ret, $1, @__svml_asin$2, %0) + ret <$5 x $1> %ret + } + define <$5 x $1> @__svml_cos$4(<$5 x $1>) nounwind readnone alwaysinline { + unary$3to$5(ret, $1, @__svml_cos$2, %0) + ret <$5 x $1> %ret + } + define void @__svml_sincos$4(<$5 x $1>,<$5 x $1>*,<$5 x $1>*) nounwind readnone alwaysinline + { + %s = call <$5 x $1> @__svml_sin$4(<$5 x $1> %0) + %c = call <$5 x $1> @__svml_cos$4(<$5 x $1> %0) + store <$5 x $1> %s, <$5 x $1> * %1 + store <$5 x $1> %c, <$5 x $1> * %2 + ret void + } + define <$5 x $1> @__svml_tan$4(<$5 x $1>) nounwind readnone alwaysinline { + unary$3to$5(ret, $1, @__svml_tan$2, %0) + ret <$5 x $1> %ret + } + define <$5 x $1> @__svml_atan$4(<$5 x $1>) nounwind readnone alwaysinline { + unary$3to$5(ret, $1, @__svml_atan$2, %0) + ret <$5 x $1> %ret + } + define <$5 x $1> @__svml_atan2$4(<$5 x $1>,<$5 x $1>) nounwind readnone alwaysinline { + binary$3to$5(ret, $1, @__svml_atan2$2, %0, %1) + ret <$5 x $1> %ret + } + define <$5 x $1> @__svml_exp$4(<$5 x $1>) nounwind readnone alwaysinline { + unary$3to$5(ret, $1, @__svml_exp$2, %0) + ret <$5 x $1> %ret + } + define <$5 x $1> @__svml_log$4(<$5 x $1>) nounwind readnone alwaysinline { + unary$3to$5(ret, $1, @__svml_log$2, %0) + ret <$5 x $1> %ret + } + define <$5 x $1> @__svml_pow$4(<$5 x $1>,<$5 x $1>) nounwind readnone alwaysinline { + binary$3to$5(ret, $1, @__svml_pow$2, %0, %1) + ret <$5 x $1> %ret + 
} +') + diff --git a/builtins/target-avx-common.ll b/builtins/target-avx-common.ll index 1d317713..1c467476 100644 --- a/builtins/target-avx-common.ll +++ b/builtins/target-avx-common.ll @@ -1,4 +1,4 @@ -;; Copyright (c) 2010-2011, Intel Corporation +;; Copyright (c) 2010-2013, Intel Corporation ;; All rights reserved. ;; ;; Redistribution and use in source and binary forms, with or without @@ -31,30 +31,16 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; AVX target implementation. +;; +;; Please note that this file uses SSE intrinsics, but LLVM generates AVX +;; instructions, so it doesn't makes sense to change this implemenation. + ctlztz() define_prefetches() define_shuffles() aossoa() -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; rcp - -declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone - -define float @__rcp_uniform_float(float) nounwind readonly alwaysinline { -; uniform float iv = extract(__rcp_u(v), 0); -; return iv * (2. - v * iv); - %vecval = insertelement <4 x float> undef, float %0, i32 0 - %call = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %vecval) - %scall = extractelement <4 x float> %call, i32 0 - - ; do one N-R iteration - %v_iv = fmul float %0, %scall - %two_minus = fsub float 2., %v_iv - %iv_mul = fmul float %scall, %two_minus - ret float %iv_mul -} ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rounding floats @@ -77,7 +63,8 @@ define float @__round_uniform_float(float) nounwind readonly alwaysinline { ; r3 = a3 ; ; It doesn't matter what we pass as a, since we only need the r0 value - ; here. So we pass the same register for both. + ; here. So we pass the same register for both. Further, only the 0th + ; element of the b parameter matters %xi = insertelement <4 x float> undef, float %0, i32 0 %xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 8) %rs = extractelement <4 x float> %xr, i32 0 @@ -117,7 +104,7 @@ define double @__round_uniform_double(double) nounwind readonly alwaysinline { define double @__floor_uniform_double(double) nounwind readonly alwaysinline { ; see above for round_ss instrinsic discussion... %xi = insertelement <2 x double> undef, double %0, i32 0 - ; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9 + ; roundsd, round down 0b01 | don't signal precision exceptions 0b1001 = 9 %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 9) %rs = extractelement <2 x double> %xr, i32 0 ret double %rs @@ -126,12 +113,31 @@ define double @__floor_uniform_double(double) nounwind readonly alwaysinline { define double @__ceil_uniform_double(double) nounwind readonly alwaysinline { ; see above for round_ss instrinsic discussion... %xi = insertelement <2 x double> undef, double %0, i32 0 - ; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10 + ; roundsd, round up 0b10 | don't signal precision exceptions 0b1010 = 10 %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 10) %rs = extractelement <2 x double> %xr, i32 0 ret double %rs } +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rcp + +declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone + +define float @__rcp_uniform_float(float) nounwind readonly alwaysinline { + ; do the rcpss call + ; uniform float iv = extract(__rcp_u(v), 0); + ; return iv * (2. 
- v * iv); + %vecval = insertelement <4 x float> undef, float %0, i32 0 + %call = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %vecval) + %scall = extractelement <4 x float> %call, i32 0 + + ; do one N-R iteration to improve precision, as above + %v_iv = fmul float %0, %scall + %two_minus = fsub float 2., %v_iv + %iv_mul = fmul float %scall, %two_minus + ret float %iv_mul +} ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rsqrt @@ -144,6 +150,7 @@ define float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline { %vis = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %v) %is = extractelement <4 x float> %vis, i32 0 + ; Newton-Raphson iteration to improve precision ; return 0.5 * is * (3. - (v * is) * is); %v_is = fmul float %0, %is %v_is_is = fmul float %v_is, %is @@ -164,9 +171,18 @@ define float @__sqrt_uniform_float(float) nounwind readonly alwaysinline { ret float %ret } +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; double precision sqrt + +declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone + +define double @__sqrt_uniform_double(double) nounwind alwaysinline { + sse_unary_scalar(ret, 2, double, @llvm.x86.sse2.sqrt.sd, %0) + ret double %ret +} ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; fastmath +;; fast math mode declare void @llvm.x86.sse.stmxcsr(i8 *) nounwind declare void @llvm.x86.sse.ldmxcsr(i8 *) nounwind @@ -200,6 +216,22 @@ define float @__min_uniform_float(float, float) nounwind readonly alwaysinline { ret float %ret } +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; double precision min/max + +declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone +declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone + +define double @__min_uniform_double(double, double) nounwind readnone alwaysinline { + sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.min.sd, %0, %1) + ret double %ret +} + +define double @__max_uniform_double(double, double) nounwind readnone alwaysinline { + sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1) + ret double %ret +} + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; int min/max @@ -235,7 +267,7 @@ define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline { } ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; horizontal ops +;; horizontal ops / reductions declare i32 @llvm.ctpop.i32(i32) nounwind readnone @@ -251,32 +283,6 @@ define i64 @__popcnt_int64(i64) nounwind readonly alwaysinline { ret i64 %call } -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; double precision sqrt - -declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone - -define double @__sqrt_uniform_double(double) nounwind alwaysinline { - sse_unary_scalar(ret, 2, double, @llvm.x86.sse2.sqrt.sd, %0) - ret double %ret -} - - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; double precision min/max - -declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone -declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone - -define double @__min_uniform_double(double, double) nounwind readnone alwaysinline { - sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.min.sd, %0, %1) - ret double %ret -} - -define double 
@__max_uniform_double(double, double) nounwind readnone alwaysinline { - sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1) - ret double %ret -} ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; int8/int16 builtins diff --git a/builtins/target-avx-x2.ll b/builtins/target-avx-x2.ll index d9e0322b..f8fd5cd5 100644 --- a/builtins/target-avx-x2.ll +++ b/builtins/target-avx-x2.ll @@ -137,19 +137,14 @@ define <16 x float> @__sqrt_varying_float(<16 x float>) nounwind readonly always ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; svml -; FIXME: need either to wire these up to the 8-wide SVML entrypoints, -; or, use the macro to call the 4-wide ones 4x with our 16-wide -; vectors... +include(`svml.m4') +;; single precision +svml_declare(float,f8,8) +svml_define_x(float,f8,8,f,16) -declare <16 x float> @__svml_sin(<16 x float>) -declare <16 x float> @__svml_cos(<16 x float>) -declare void @__svml_sincos(<16 x float>, <16 x float> *, <16 x float> *) -declare <16 x float> @__svml_tan(<16 x float>) -declare <16 x float> @__svml_atan(<16 x float>) -declare <16 x float> @__svml_atan2(<16 x float>, <16 x float>) -declare <16 x float> @__svml_exp(<16 x float>) -declare <16 x float> @__svml_log(<16 x float>) -declare <16 x float> @__svml_pow(<16 x float>, <16 x float>) +;; double precision +svml_declare(double,4,4) +svml_define_x(double,4,4,d,16) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max diff --git a/builtins/target-avx.ll b/builtins/target-avx.ll index 90e2f3ac..e98a3843 100644 --- a/builtins/target-avx.ll +++ b/builtins/target-avx.ll @@ -49,11 +49,10 @@ include(`target-avx-common.ll') declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>) nounwind readnone define <8 x float> @__rcp_varying_float(<8 x float>) nounwind readonly alwaysinline { + ; do one N-R iteration to improve precision ; float iv = __rcp_v(v); ; return iv * (2. - v * iv); - %call = call <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float> %0) - ; do one N-R iteration %v_iv = fmul <8 x float> %0, %call %two_minus = fsub <8 x float> , %v_iv @@ -61,6 +60,46 @@ define <8 x float> @__rcp_varying_float(<8 x float>) nounwind readonly alwaysinl ret <8 x float> %iv_mul } +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rsqrt + +declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone + +define <8 x float> @__rsqrt_varying_float(<8 x float> %v) nounwind readonly alwaysinline { + ; float is = __rsqrt_v(v); + %is = call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %v) + ; Newton-Raphson iteration to improve precision + ; return 0.5 * is * (3. 
- (v * is) * is); + %v_is = fmul <8 x float> %v, %is + %v_is_is = fmul <8 x float> %v_is, %is + %three_sub = fsub <8 x float> , %v_is_is + %is_mul = fmul <8 x float> %is, %three_sub + %half_scale = fmul <8 x float> , %is_mul + ret <8 x float> %half_scale +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; sqrt + +declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone + +define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysinline { + %call = call <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float> %0) + ret <8 x float> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; double precision sqrt + +declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone + +define <8 x double> @__sqrt_varying_double(<8 x double>) nounwind alwaysinline { + unary4to8(ret, double, @llvm.x86.avx.sqrt.pd.256, %0) + ret <8 x double> %ret +} + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rounding floats @@ -94,63 +133,15 @@ define <8 x double> @__round_varying_double(<8 x double>) nounwind readonly alwa } define <8 x double> @__floor_varying_double(<8 x double>) nounwind readonly alwaysinline { - ; roundpd, round down 0b01 | don't signal precision exceptions 0b1000 = 9 + ; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9 round4to8double(%0, 9) } - define <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alwaysinline { - ; roundpd, round up 0b10 | don't signal precision exceptions 0b1000 = 10 + ; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10 round4to8double(%0, 10) } - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; rsqrt - -declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone - -define <8 x float> @__rsqrt_varying_float(<8 x float> %v) nounwind readonly alwaysinline { - ; float is = __rsqrt_v(v); - %is = call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %v) - ; return 0.5 * is * (3. - (v * is) * is); - %v_is = fmul <8 x float> %v, %is - %v_is_is = fmul <8 x float> %v_is, %is - %three_sub = fsub <8 x float> , %v_is_is - %is_mul = fmul <8 x float> %is, %three_sub - %half_scale = fmul <8 x float> , %is_mul - ret <8 x float> %half_scale -} - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; sqrt - -declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone - -define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysinline { - %call = call <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float> %0) - ret <8 x float> %call -} - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; svml - -; FIXME: need either to wire these up to the 8-wide SVML entrypoints, -; or, use the macro to call the 4-wide ones twice with our 8-wide -; vectors... 
- -declare <8 x float> @__svml_sin(<8 x float>) -declare <8 x float> @__svml_cos(<8 x float>) -declare void @__svml_sincos(<8 x float>, <8 x float> *, <8 x float> *) -declare <8 x float> @__svml_tan(<8 x float>) -declare <8 x float> @__svml_atan(<8 x float>) -declare <8 x float> @__svml_atan2(<8 x float>, <8 x float>) -declare <8 x float> @__svml_exp(<8 x float>) -declare <8 x float> @__svml_log(<8 x float>) -declare <8 x float> @__svml_pow(<8 x float>, <8 x float>) - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max @@ -171,7 +162,37 @@ define <8 x float> @__min_varying_float(<8 x float>, ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; horizontal ops +;; double precision min/max + +declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone +declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone + +define <8 x double> @__min_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline { + binary4to8(ret, double, @llvm.x86.avx.min.pd.256, %0, %1) + ret <8 x double> %ret +} + +define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline { + binary4to8(ret, double, @llvm.x86.avx.max.pd.256, %0, %1) + ret <8 x double> %ret +} + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; svml + +include(`svml.m4') +;; single precision +svml_declare(float,f8,8) +svml_define(float,f8,8,f) + +;; double precision +svml_declare(double,4,4) +svml_define_x(double,4,4,d,8) + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; mask handling declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone @@ -203,6 +224,9 @@ define i1 @__none(<8 x i32>) nounwind readnone alwaysinline { ret i1 %cmp } +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; horizontal ops / reductions + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; horizontal float ops @@ -221,12 +245,36 @@ define float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline { reduce8(float, @__min_varying_float, @__min_uniform_float) } - define float @__reduce_max_float(<8 x float>) nounwind readnone alwaysinline { reduce8(float, @__max_varying_float, @__max_uniform_float) } -reduce_equal(8) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; horizontal double ops + +declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounwind readnone + +define double @__reduce_add_double(<8 x double>) nounwind readonly alwaysinline { + %v0 = shufflevector <8 x double> %0, <8 x double> undef, + <4 x i32> + %v1 = shufflevector <8 x double> %0, <8 x double> undef, + <4 x i32> + %sum0 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %v0, <4 x double> %v1) + %sum1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %sum0, <4 x double> %sum0) + %final0 = extractelement <4 x double> %sum1, i32 0 + %final1 = extractelement <4 x double> %sum1, i32 2 + %sum = fadd double %final0, %final1 + + ret double %sum +} + +define double @__reduce_min_double(<8 x double>) nounwind readnone alwaysinline { + reduce8(double, @__min_varying_double, @__min_uniform_double) +} + +define double @__reduce_max_double(<8 x double>) nounwind readnone alwaysinline { + reduce8(double, @__max_varying_double, @__max_uniform_double) +} 
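The horizontal double reduction above depends on vhaddpd adding adjacent pairs within each 128-bit half of its operands, which is why __reduce_add_double extracts elements 0 and 2 after the second hadd rather than elements 0 and 1. A short lane trace, given only as an illustration of that documented behaviour and assuming the two shufflevectors split the 8-wide input into its low and high halves:

;; %v0   = [a0 a1 a2 a3]                  (low half of the 8-wide input)
;; %v1   = [a4 a5 a6 a7]                  (high half of the 8-wide input)
;; %sum0 = hadd(%v0, %v1)     = [a0+a1, a4+a5, a2+a3, a6+a7]
;; %sum1 = hadd(%sum0, %sum0) = [a0+a1+a4+a5, a0+a1+a4+a5,
;;                               a2+a3+a6+a7, a2+a3+a6+a7]
;; %final0 (element 0) + %final1 (element 2) therefore covers all eight lanes.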
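The svml_declare/svml_define/svml_define_x calls earlier in this hunk replace the per-target hand-written __svml_* declarations that are deleted above. As a sketch of what the m4 parameters produce (a hypothetical hand expansion; the declaration is assumed to be what svml_declare emits), svml_define(float,f8,8,f) turns the exp wrapper from svml.m4 into roughly:

;; assumed shape of the external declaration produced by svml_declare(float,f8,8)
declare <8 x float> @__svml_expf8(<8 x float>) nounwind readnone

define <8 x float> @__svml_expf(<8 x float>) nounwind readnone alwaysinline {
  ; forward ispc's internal entry point to the 8-wide SVML routine
  %ret = call <8 x float> @__svml_expf8(<8 x float> %0)
  ret <8 x float> %ret
}

svml_define_x(double,4,4,d,8) takes the other route shown at the top of svml.m4, building the 8-wide @__svml_expd out of two 4-wide @__svml_exp4 calls via the unary4to8 helper.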
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; horizontal int8 ops @@ -267,6 +315,7 @@ define i16 @__reduce_add_int16(<8 x i16>) nounwind readnone alwaysinline { ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; horizontal int32 ops +;; helper functions define <8 x i32> @__add_varying_int32(<8 x i32>, <8 x i32>) nounwind readnone alwaysinline { %s = add <8 x i32> %0, %1 @@ -278,16 +327,15 @@ define i32 @__add_uniform_int32(i32, i32) nounwind readnone alwaysinline { ret i32 %s } +;; reduction functions define i32 @__reduce_add_int32(<8 x i32>) nounwind readnone alwaysinline { reduce8(i32, @__add_varying_int32, @__add_uniform_int32) } - define i32 @__reduce_min_int32(<8 x i32>) nounwind readnone alwaysinline { reduce8(i32, @__min_varying_int32, @__min_uniform_int32) } - define i32 @__reduce_max_int32(<8 x i32>) nounwind readnone alwaysinline { reduce8(i32, @__max_varying_int32, @__max_uniform_int32) } @@ -300,38 +348,11 @@ define i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone alwaysinline { reduce8(i32, @__max_varying_uint32, @__max_uniform_uint32) } -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; horizontal double ops - -declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounwind readnone - -define double @__reduce_add_double(<8 x double>) nounwind readonly alwaysinline { - %v0 = shufflevector <8 x double> %0, <8 x double> undef, - <4 x i32> - %v1 = shufflevector <8 x double> %0, <8 x double> undef, - <4 x i32> - %sum0 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %v0, <4 x double> %v1) - %sum1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %sum0, <4 x double> %sum0) - %final0 = extractelement <4 x double> %sum1, i32 0 - %final1 = extractelement <4 x double> %sum1, i32 2 - %sum = fadd double %final0, %final1 - - ret double %sum -} - -define double @__reduce_min_double(<8 x double>) nounwind readnone alwaysinline { - reduce8(double, @__min_varying_double, @__min_uniform_double) -} - - -define double @__reduce_max_double(<8 x double>) nounwind readnone alwaysinline { - reduce8(double, @__max_varying_double, @__max_uniform_double) -} - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; horizontal int64 ops +;; helper functions define <8 x i64> @__add_varying_int64(<8 x i64>, <8 x i64>) nounwind readnone alwaysinline { %s = add <8 x i64> %0, %1 @@ -343,6 +364,7 @@ define i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline { ret i64 %s } +;; reduction functions define i64 @__reduce_add_int64(<8 x i64>) nounwind readnone alwaysinline { reduce8(i64, @__add_varying_int64, @__add_uniform_int64) } @@ -367,6 +389,7 @@ define i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone alwaysinline { reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64) } +reduce_equal(8) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; unaligned loads/loads+broadcasts @@ -451,6 +474,10 @@ define void @__masked_store_i64(<8 x i64>* nocapture, <8 x i64>, ret void } +masked_store_float_double() + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; masked store blend masked_store_blend_8_16_by_8() @@ -522,8 +549,6 @@ define void @__masked_store_blend_i64(<8 x i64>* nocapture %ptr, <8 x i64> %new, ret void } -masked_store_float_double() - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; scatter @@ -534,30 +559,3 @@ gen_scatter(float) gen_scatter(i64) 
gen_scatter(double) -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; double precision sqrt - -declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone - -define <8 x double> @__sqrt_varying_double(<8 x double>) nounwind alwaysinline { - unary4to8(ret, double, @llvm.x86.avx.sqrt.pd.256, %0) - ret <8 x double> %ret -} - - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; double precision min/max - -declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone -declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone - -define <8 x double> @__min_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline { - binary4to8(ret, double, @llvm.x86.avx.min.pd.256, %0, %1) - ret <8 x double> %ret -} - -define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline { - binary4to8(ret, double, @llvm.x86.avx.max.pd.256, %0, %1) - ret <8 x double> %ret -} - diff --git a/builtins/target-avx1-i64x4.ll b/builtins/target-avx1-i64x4.ll new file mode 100644 index 00000000..d183f1ce --- /dev/null +++ b/builtins/target-avx1-i64x4.ll @@ -0,0 +1,81 @@ +;; Copyright (c) 2013, Intel Corporation +;; All rights reserved. +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are +;; met: +;; +;; * Redistributions of source code must retain the above copyright +;; notice, this list of conditions and the following disclaimer. +;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; +;; * Neither the name of Intel Corporation nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +include(`target-avx1-i64x4base.ll') + +rdrand_decls() + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int min/max + +define <4 x i32> @__min_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline { + %call = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %0, <4 x i32> %1) + ret <4 x i32> %call +} +define <4 x i32> @__max_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline { + %call = call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %0, <4 x i32> %1) + + ret <4 x i32> %call +} + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; unsigned int min/max + +define <4 x i32> @__min_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline { + %call = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %0, <4 x i32> %1) + ret <4 x i32> %call +} + +define <4 x i32> @__max_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline { + %call = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %0, <4 x i32> %1) + ret <4 x i32> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; half conversion routines + +ifelse(NO_HALF_DECLARES, `1', `', ` +declare float @__half_to_float_uniform(i16 %v) nounwind readnone +declare @__half_to_float_varying( %v) nounwind readnone +declare i16 @__float_to_half_uniform(float %v) nounwind readnone +declare @__float_to_half_varying( %v) nounwind readnone +') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; gather + +gen_gather_factored(i8) +gen_gather_factored(i16) +gen_gather_factored(i32) +gen_gather_factored(float) +gen_gather_factored(i64) +gen_gather_factored(double) diff --git a/builtins/target-avx1-i64x4base.ll b/builtins/target-avx1-i64x4base.ll new file mode 100644 index 00000000..e1832030 --- /dev/null +++ b/builtins/target-avx1-i64x4base.ll @@ -0,0 +1,513 @@ +;; Copyright (c) 2013, Intel Corporation +;; All rights reserved. +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are +;; met: +;; +;; * Redistributions of source code must retain the above copyright +;; notice, this list of conditions and the following disclaimer. +;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; +;; * Neither the name of Intel Corporation nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Basic 4-wide definitions + +define(`WIDTH',`4') +define(`MASK',`i64') +include(`util.m4') + +stdlib_core() +packed_load_and_store() +scans() +int64minmax() + +include(`target-avx-common.ll') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rcp + +;; sse intrinsic +declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone + +define <4 x float> @__rcp_varying_float(<4 x float>) nounwind readonly alwaysinline { + ; float iv = __rcp_v(v); + ; return iv * (2. - v * iv); + + %call = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %0) + ; do one N-R iteration + %v_iv = fmul <4 x float> %0, %call + %two_minus = fsub <4 x float> , %v_iv + %iv_mul = fmul <4 x float> %call, %two_minus + ret <4 x float> %iv_mul +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rounding floats + +;; sse intrinsic +declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone + +define <4 x float> @__round_varying_float(<4 x float>) nounwind readonly alwaysinline { + ; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8 + %call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 8) + ret <4 x float> %call +} + +define <4 x float> @__floor_varying_float(<4 x float>) nounwind readonly alwaysinline { + ; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9 + %call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 9) + ret <4 x float> %call +} + +define <4 x float> @__ceil_varying_float(<4 x float>) nounwind readonly alwaysinline { + ; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10 + %call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 10) + ret <4 x float> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rounding doubles + +;; avx intrinsic +declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind readnone + +define <4 x double> @__round_varying_double(<4 x double>) nounwind readonly alwaysinline { + %call = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %0, i32 8) + ret <4 x double> %call +} + +define <4 x double> @__floor_varying_double(<4 x double>) nounwind readonly alwaysinline { + ; roundpd, round down 0b01 | don't signal precision exceptions 0b1000 = 9 + %call = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %0, i32 9) + ret <4 x double> %call +} + + +define <4 x double> @__ceil_varying_double(<4 x double>) nounwind readonly alwaysinline { + ; roundpd, round up 0b10 | don't signal precision exceptions 0b1000 = 10 + %call = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %0, i32 10) + ret <4 x double> %call +} + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rsqrt + +;; sse intrinsic +declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone + +define <4 x float> @__rsqrt_varying_float(<4 x float> %v) nounwind readonly alwaysinline { + ; float is = __rsqrt_v(v); + %is = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %v) + ; Newton-Raphson iteration to improve precision + ; return 0.5 * is * (3. 
- (v * is) * is); + %v_is = fmul <4 x float> %v, %is + %v_is_is = fmul <4 x float> %v_is, %is + %three_sub = fsub <4 x float> , %v_is_is + %is_mul = fmul <4 x float> %is, %three_sub + %half_scale = fmul <4 x float> , %is_mul + ret <4 x float> %half_scale +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; sqrt + +;; sse intrinsic +declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone + +define <4 x float> @__sqrt_varying_float(<4 x float>) nounwind readonly alwaysinline { + %call = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %0) + ret <4 x float> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; double precision sqrt + +;; avx§ intrinsic +declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone + +define <4 x double> @__sqrt_varying_double(<4 x double>) nounwind alwaysinline { + %call = call <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double> %0) + ret <4 x double> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; svml + +include(`svml.m4') +;; single precision +svml_declare(float,f4,4) +svml_define(float,f4,4,f) + +;; double precision +svml_declare(double,4,4) +svml_define(double,4,4,d) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; float min/max + +;; sse intrinsics +declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone +declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone + +define <4 x float> @__max_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline { + %call = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %0, <4 x float> %1) + ret <4 x float> %call +} + +define <4 x float> @__min_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline { + %call = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %0, <4 x float> %1) + ret <4 x float> %call +} + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; horizontal ops + +;; sse intrinsic +declare i32 @llvm.x86.avx.movmsk.pd.256(<4 x double>) nounwind readnone + +define i64 @__movmsk(<4 x i64>) nounwind readnone alwaysinline { + %floatmask = bitcast <4 x i64> %0 to <4 x double> + %v = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %floatmask) nounwind readnone + %v64 = zext i32 %v to i64 + ret i64 %v64 +} + +define i1 @__any(<4 x i64>) nounwind readnone alwaysinline { + %floatmask = bitcast <4 x i64> %0 to <4 x double> + %v = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %floatmask) nounwind readnone + %cmp = icmp ne i32 %v, 0 + ret i1 %cmp +} + +define i1 @__all(<4 x i64>) nounwind readnone alwaysinline { + %floatmask = bitcast <4 x i64> %0 to <4 x double> + %v = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %floatmask) nounwind readnone + %cmp = icmp eq i32 %v, 15 + ret i1 %cmp +} + +define i1 @__none(<4 x i64>) nounwind readnone alwaysinline { + %floatmask = bitcast <4 x i64> %0 to <4 x double> + %v = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %floatmask) nounwind readnone + %cmp = icmp eq i32 %v, 0 + ret i1 %cmp +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; horizontal float ops + +;; sse intrinsic +declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone + +define float @__reduce_add_float(<4 x float>) nounwind readonly alwaysinline { + %v1 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %0, <4 x float> %0) + %v2 = call 
<4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %v1, <4 x float> %v1) + %scalar = extractelement <4 x float> %v2, i32 0 + ret float %scalar +} + +define float @__reduce_min_float(<4 x float>) nounwind readnone { + reduce4(float, @__min_varying_float, @__min_uniform_float) +} + +define float @__reduce_max_float(<4 x float>) nounwind readnone { + reduce4(float, @__max_varying_float, @__max_uniform_float) +} + +reduce_equal(4) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; horizontal int8 ops + +declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone + +define i16 @__reduce_add_int8(<4 x i8>) nounwind readnone alwaysinline +{ + %wide8 = shufflevector <4 x i8> %0, <4 x i8> zeroinitializer, + <16 x i32> + %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8, + <16 x i8> zeroinitializer) + %r0 = extractelement <2 x i64> %rv, i32 0 + %r1 = extractelement <2 x i64> %rv, i32 1 + %r = add i64 %r0, %r1 + %r16 = trunc i64 %r to i16 + ret i16 %r16 +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; horizontal int16 ops + +define internal <4 x i16> @__add_varying_i16(<4 x i16>, + <4 x i16>) nounwind readnone alwaysinline { + %r = add <4 x i16> %0, %1 + ret <4 x i16> %r +} + +define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline { + %r = add i16 %0, %1 + ret i16 %r +} + +define i16 @__reduce_add_int16(<4 x i16>) nounwind readnone alwaysinline { + reduce4(i16, @__add_varying_i16, @__add_uniform_i16) +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; horizontal int32 ops + +define <4 x i32> @__add_varying_int32(<4 x i32>, + <4 x i32>) nounwind readnone alwaysinline { + %s = add <4 x i32> %0, %1 + ret <4 x i32> %s +} + +define i32 @__add_uniform_int32(i32, i32) nounwind readnone alwaysinline { + %s = add i32 %0, %1 + ret i32 %s +} + +define i32 @__reduce_add_int32(<4 x i32>) nounwind readnone alwaysinline { + reduce4(i32, @__add_varying_int32, @__add_uniform_int32) +} + + +define i32 @__reduce_min_int32(<4 x i32>) nounwind readnone alwaysinline { + reduce4(i32, @__min_varying_int32, @__min_uniform_int32) +} + + +define i32 @__reduce_max_int32(<4 x i32>) nounwind readnone alwaysinline { + reduce4(i32, @__max_varying_int32, @__max_uniform_int32) +} + +define i32 @__reduce_min_uint32(<4 x i32>) nounwind readnone alwaysinline { + reduce4(i32, @__min_varying_uint32, @__min_uniform_uint32) +} + +define i32 @__reduce_max_uint32(<4 x i32>) nounwind readnone alwaysinline { + reduce4(i32, @__max_varying_uint32, @__max_uniform_uint32) +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; horizontal double ops + +declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounwind readnone + +define double @__reduce_add_double(<4 x double>) nounwind readonly alwaysinline { + %v0 = shufflevector <4 x double> %0, <4 x double> undef, + <4 x i32> + %v1 = shufflevector <4 x double> , <4 x double> undef, + <4 x i32> +;; %v1 = <4 x double> + %sum0 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %v0, <4 x double> %v1) + %sum1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %sum0, <4 x double> %sum0) + %final0 = extractelement <4 x double> %sum1, i32 0 + %final1 = extractelement <4 x double> %sum1, i32 2 + %sum = fadd double %final0, %final1 + + ret double %sum +} + +define double @__reduce_min_double(<4 x double>) nounwind readnone alwaysinline { + reduce4(double, @__min_varying_double, 
@__min_uniform_double) +} + + +define double @__reduce_max_double(<4 x double>) nounwind readnone alwaysinline { + reduce4(double, @__max_varying_double, @__max_uniform_double) +} + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; horizontal int64 ops + +define <4 x i64> @__add_varying_int64(<4 x i64>, + <4 x i64>) nounwind readnone alwaysinline { + %s = add <4 x i64> %0, %1 + ret <4 x i64> %s +} + +define i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline { + %s = add i64 %0, %1 + ret i64 %s +} + +define i64 @__reduce_add_int64(<4 x i64>) nounwind readnone alwaysinline { + reduce4(i64, @__add_varying_int64, @__add_uniform_int64) +} + + +define i64 @__reduce_min_int64(<4 x i64>) nounwind readnone alwaysinline { + reduce4(i64, @__min_varying_int64, @__min_uniform_int64) +} + + +define i64 @__reduce_max_int64(<4 x i64>) nounwind readnone alwaysinline { + reduce4(i64, @__max_varying_int64, @__max_uniform_int64) +} + + +define i64 @__reduce_min_uint64(<4 x i64>) nounwind readnone alwaysinline { + reduce4(i64, @__min_varying_uint64, @__min_uniform_uint64) +} + + +define i64 @__reduce_max_uint64(<4 x i64>) nounwind readnone alwaysinline { + reduce4(i64, @__max_varying_uint64, @__max_uniform_uint64) +} + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; unaligned loads/loads+broadcasts + + +; no masked load instruction for i8 and i16 types?? +masked_load(i8, 1) +masked_load(i16, 2) + +;; avx intrinsics +declare <4 x float> @llvm.x86.avx.maskload.ps(i8 *, <4 x float> %mask) +declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8 *, <4 x double> %mask) + +define <4 x i32> @__masked_load_i32(i8 *, <4 x i64> %mask64) nounwind alwaysinline { + %mask = trunc <4 x i64> %mask64 to <4 x i32> + %floatmask = bitcast <4 x i32> %mask to <4 x float> + %floatval = call <4 x float> @llvm.x86.avx.maskload.ps(i8 * %0, <4 x float> %floatmask) + %retval = bitcast <4 x float> %floatval to <4 x i32> + ret <4 x i32> %retval +} + + +define <4 x i64> @__masked_load_i64(i8 *, <4 x i64> %mask) nounwind alwaysinline { + %doublemask = bitcast <4 x i64> %mask to <4 x double> + %doubleval = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %0, <4 x double> %doublemask) + %retval = bitcast <4 x double> %doubleval to <4 x i64> + ret <4 x i64> %retval +} + +masked_load_float_double() + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; masked store + +gen_masked_store(i8) +gen_masked_store(i16) + +; note that mask is the 2nd parameter, not the 3rd one!! 
+;; avx intrinsics +declare void @llvm.x86.avx.maskstore.ps (i8 *, <4 x float>, <4 x float>) +declare void @llvm.x86.avx.maskstore.pd.256(i8 *, <4 x double>, <4 x double>) + +define void @__masked_store_i32(<4 x i32>* nocapture, <4 x i32>, + <4 x i64>) nounwind alwaysinline { + %mask32 = trunc <4 x i64> %2 to <4 x i32> + + %ptr = bitcast <4 x i32> * %0 to i8 * + %val = bitcast <4 x i32> %1 to <4 x float> + %mask = bitcast <4 x i32> %mask32 to <4 x float> + call void @llvm.x86.avx.maskstore.ps(i8 * %ptr, <4 x float> %mask, <4 x float> %val) + ret void +} + +define void @__masked_store_i64(<4 x i64>* nocapture, <4 x i64>, + <4 x i64>) nounwind alwaysinline { + %ptr = bitcast <4 x i64> * %0 to i8 * + %val = bitcast <4 x i64> %1 to <4 x double> + %mask = bitcast <4 x i64> %2 to <4 x double> + call void @llvm.x86.avx.maskstore.pd.256(i8 * %ptr, <4 x double> %mask, <4 x double> %val) + ret void +} + + +masked_store_blend_8_16_by_4_mask64() + +;; sse intrinsic +declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, + <4 x float>) nounwind readnone + +define void @__masked_store_blend_i32(<4 x i32>* nocapture, <4 x i32>, + <4 x i64>) nounwind alwaysinline { + %mask = trunc <4 x i64> %2 to <4 x i32> + %mask_as_float = bitcast <4 x i32> %mask to <4 x float> + %oldValue = load <4 x i32>* %0, align 4 + %oldAsFloat = bitcast <4 x i32> %oldValue to <4 x float> + %newAsFloat = bitcast <4 x i32> %1 to <4 x float> + %blend = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %oldAsFloat, + <4 x float> %newAsFloat, + <4 x float> %mask_as_float) + %blendAsInt = bitcast <4 x float> %blend to <4 x i32> + store <4 x i32> %blendAsInt, <4 x i32>* %0, align 4 + ret void +} + +;; avx intrinsic +declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>, + <4 x double>) nounwind readnone + +define void @__masked_store_blend_i64(<4 x i64>* nocapture , <4 x i64>, + <4 x i64>) nounwind alwaysinline { + %mask_as_double = bitcast <4 x i64> %2 to <4 x double> + %oldValue = load <4 x i64>* %0, align 4 + %oldAsDouble = bitcast <4 x i64> %oldValue to <4 x double> + %newAsDouble = bitcast <4 x i64> %1 to <4 x double> + %blend = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %oldAsDouble, + <4 x double> %newAsDouble, + <4 x double> %mask_as_double) + %blendAsInt = bitcast <4 x double> %blend to <4 x i64> + store <4 x i64> %blendAsInt, <4 x i64>* %0, align 4 + ret void +} + +masked_store_float_double() + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; scatter + +gen_scatter(i8) +gen_scatter(i16) +gen_scatter(i32) +gen_scatter(float) +gen_scatter(i64) +gen_scatter(double) + + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; double precision min/max + +declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone +declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone + +define <4 x double> @__min_varying_double(<4 x double>, <4 x double>) nounwind readnone alwaysinline { + %call = call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %0, <4 x double> %1) + ret <4 x double> %call +} + +define <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind readnone alwaysinline { + %call = call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %0, <4 x double> %1) + ret <4 x double> %call +} + diff --git a/builtins/target-avx11-i64x4.ll b/builtins/target-avx11-i64x4.ll new file mode 100644 index 00000000..8fe75266 --- /dev/null +++ 
b/builtins/target-avx11-i64x4.ll @@ -0,0 +1,120 @@ +;; Copyright (c) 2013, Intel Corporation +;; All rights reserved. +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are +;; met: +;; +;; * Redistributions of source code must retain the above copyright +;; notice, this list of conditions and the following disclaimer. +;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; +;; * Neither the name of Intel Corporation nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +include(`target-avx1-i64x4base.ll') + +ifelse(LLVM_VERSION, `LLVM_3_1', `rdrand_decls()', + `rdrand_definition()') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int min/max + +define <4 x i32> @__min_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline { + %m = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %0, <4 x i32> %1) + ret <4 x i32> %m +} + +define <4 x i32> @__max_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline { + %m = call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %0, <4 x i32> %1) + ret <4 x i32> %m +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; unsigned int min/max + +define <4 x i32> @__min_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline { + %m = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %0, <4 x i32> %1) + ret <4 x i32> %m +} + +define <4 x i32> @__max_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline { + %m = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %0, <4 x i32> %1) + ret <4 x i32> %m +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; gather + +gen_gather(i8) +gen_gather(i16) +gen_gather(i32) +gen_gather(float) +gen_gather(i64) +gen_gather(double) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; float/half conversions + +define(`expand_4to8', ` + %$3 = shufflevector <4 x $1> %$2, <4 x $1> undef, <8 x i32> +') +define(`extract_4from8', ` + %$3 = shufflevector <8 x $1> %$2, <8 x $1> undef, <4 x i32> +') + +declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone +; 0 is round nearest even +declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone + +define <4 x float> @__half_to_float_varying(<4 x i16> %v4) nounwind readnone { + expand_4to8(i16, v4, v) + %r = call <8 x float> 
@llvm.x86.vcvtph2ps.256(<8 x i16> %v) + extract_4from8(float, r, ret) + ret <4 x float> %ret +} + +define <4 x i16> @__float_to_half_varying(<4 x float> %v4) nounwind readnone { + expand_4to8(float, v4, v) + %r = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %v, i32 0) + extract_4from8(i16, r, ret) + ret <4 x i16> %ret +} + +define float @__half_to_float_uniform(i16 %v) nounwind readnone { + %v1 = bitcast i16 %v to <1 x i16> + %vv = shufflevector <1 x i16> %v1, <1 x i16> undef, + <8 x i32> + %rv = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %vv) + %r = extractelement <8 x float> %rv, i32 0 + ret float %r +} + +define i16 @__float_to_half_uniform(float %v) nounwind readnone { + %v1 = bitcast float %v to <1 x float> + %vv = shufflevector <1 x float> %v1, <1 x float> undef, + <8 x i32> + ; round to nearest even + %rv = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %vv, i32 0) + %r = extractelement <8 x i16> %rv, i32 0 + ret i16 %r +} diff --git a/builtins/target-avx2-i64x4.ll b/builtins/target-avx2-i64x4.ll new file mode 100644 index 00000000..d74f32dc --- /dev/null +++ b/builtins/target-avx2-i64x4.ll @@ -0,0 +1,355 @@ +;; Copyright (c) 2013, Intel Corporation +;; All rights reserved. +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are +;; met: +;; +;; * Redistributions of source code must retain the above copyright +;; notice, this list of conditions and the following disclaimer. +;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; +;; * Neither the name of Intel Corporation nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
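Both new i64x4 target files carry the same F16C half-conversion helpers, built on the expand_4to8/extract_4from8 shuffle macros they define. A hand-expanded sketch of the varying half-to-float path (illustration only; the function name here is made up, and the widening mask <0,1,2,3,undef,undef,undef,undef> is an assumption, since only the low four lanes of the 8-wide conversion are consumed):

declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone

define <4 x float> @half_to_float_varying_sketch(<4 x i16> %v4) nounwind readnone {
  ; widen the 4 x i16 input to the 8 x i16 the intrinsic expects
  %v = shufflevector <4 x i16> %v4, <4 x i16> undef,
         <8 x i32> <i32 0, i32 1, i32 2, i32 3,
                    i32 undef, i32 undef, i32 undef, i32 undef>
  ; convert all eight lanes; only the low four carry real data
  %r = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %v)
  ; narrow back to the 4-wide program width
  %ret = shufflevector <8 x float> %r, <8 x float> undef,
           <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x float> %ret
}

The float-to-half direction is symmetric, with the extra i32 0 immediate on vcvtps2ph selecting round-to-nearest-even, as the comment in the patch notes.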
+ +ifelse(LLVM_VERSION, `LLVM_3_1', `', + `define(`HAVE_GATHER', `1')') + +include(`target-avx1-i64x4base.ll') + +ifelse(LLVM_VERSION, `LLVM_3_1', `rdrand_decls()', + `rdrand_definition()') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int min/max + +;; declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone +;; declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readonly + +define <4 x i32> @__min_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline { + %m = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %0, <4 x i32> %1) + ret <4 x i32> %m +} + +define <4 x i32> @__max_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline { + %m = call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %0, <4 x i32> %1) + ret <4 x i32> %m +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; unsigned int min/max + +;; declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) nounwind readonly +;; declare <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32>, <4 x i32>) nounwind readonly + +define <4 x i32> @__min_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline { + %m = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %0, <4 x i32> %1) + ret <4 x i32> %m +} + +define <4 x i32> @__max_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline { + %m = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %0, <4 x i32> %1) + ret <4 x i32> %m +} + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; float/half conversions + + + +define(`expand_4to8', ` + %$3 = shufflevector <4 x $1> %$2, <4 x $1> undef, <8 x i32> +') +define(`extract_4from8', ` + %$3 = shufflevector <8 x $1> %$2, <8 x $1> undef, <4 x i32> +') + +declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone +; 0 is round nearest even +declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone + +define <4 x float> @__half_to_float_varying(<4 x i16> %v4) nounwind readnone { + expand_4to8(i16, v4, v) + %r = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %v) + extract_4from8(float, r, ret) + ret <4 x float> %ret +} + +define <4 x i16> @__float_to_half_varying(<4 x float> %v4) nounwind readnone { + expand_4to8(float, v4, v) + %r = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %v, i32 0) + extract_4from8(i16, r, ret) + ret <4 x i16> %ret +} + +define float @__half_to_float_uniform(i16 %v) nounwind readnone { + %v1 = bitcast i16 %v to <1 x i16> + %vv = shufflevector <1 x i16> %v1, <1 x i16> undef, + <8 x i32> + %rv = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %vv) + %r = extractelement <8 x float> %rv, i32 0 + ret float %r +} + +define i16 @__float_to_half_uniform(float %v) nounwind readnone { + %v1 = bitcast float %v to <1 x float> + %vv = shufflevector <1 x float> %v1, <1 x float> undef, + <8 x i32> + ; round to nearest even + %rv = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %vv, i32 0) + %r = extractelement <8 x i16> %rv, i32 0 + ret i16 %r +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; gather + +declare void @llvm.trap() noreturn nounwind + + +ifelse(LLVM_VERSION, `LLVM_3_1', ` +gen_gather_factored(i8) +gen_gather_factored(i16) +gen_gather_factored(i32) +gen_gather_factored(float) +gen_gather_factored(i64) +gen_gather_factored(double)', ` + +gen_gather(i8) +gen_gather(i16) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int32 gathers + 
+declare <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> %target, i8 * %ptr, + <4 x i32> %indices, <4 x i32> %mask, i8 %scale) readonly nounwind +declare <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> %target, i8 * %ptr, + <4 x i64> %indices, <4 x i32> %mask, i8 %scale) readonly nounwind + +define <4 x i32> @__gather_base_offsets32_i32(i8 * %ptr, + i32 %scale, <4 x i32> %offsets, + <4 x i64> %vecmask64) nounwind readonly alwaysinline { + %scale8 = trunc i32 %scale to i8 + %vecmask = trunc <4 x i64> %vecmask64 to <4 x i32> + + %v = call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> undef, i8 * %ptr, + <4 x i32> %offsets, <4 x i32> %vecmask, i8 %scale8) + ret <4 x i32> %v +} + + +define <4 x i32> @__gather_base_offsets64_i32(i8 * %ptr, + i32 %scale, <4 x i64> %offsets, + <4 x i64> %vecmask64) nounwind readonly alwaysinline { + %scale8 = trunc i32 %scale to i8 + %vecmask = trunc <4 x i64> %vecmask64 to <4 x i32> + + %v = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr, + <4 x i64> %offsets, <4 x i32> %vecmask, i8 %scale8) + + ret <4 x i32> %v +} + + +define <4 x i32> @__gather32_i32(<4 x i32> %ptrs, + <4 x i64> %vecmask64) nounwind readonly alwaysinline { + + %vecmask = trunc <4 x i64> %vecmask64 to <4 x i32> + + %v = call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> undef, i8 * null, + <4 x i32> %ptrs, <4 x i32> %vecmask, i8 1) + + ret <4 x i32> %v +} + + +define <4 x i32> @__gather64_i32(<4 x i64> %ptrs, + <4 x i64> %vecmask64) nounwind readonly alwaysinline { + %vecmask = trunc <4 x i64> %vecmask64 to <4 x i32> + + %v = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null, + <4 x i64> %ptrs, <4 x i32> %vecmask, i8 1) + + ret <4 x i32> %v +} + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; float gathers + +declare <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> %target, i8 * %ptr, + <4 x i32> %indices, <4 x float> %mask, i8 %scale8) readonly nounwind +declare <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> %target, i8 * %ptr, + <4 x i64> %indices, <4 x float> %mask, i8 %scale8) readonly nounwind + +define <4 x float> @__gather_base_offsets32_float(i8 * %ptr, + i32 %scale, <4 x i32> %offsets, + <4 x i64> %vecmask64) nounwind readonly alwaysinline { + %scale8 = trunc i32 %scale to i8 + %vecmask = trunc <4 x i64> %vecmask64 to <4 x i32> + %mask = bitcast <4 x i32> %vecmask to <4 x float> + + %v = call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> undef, i8 * %ptr, + <4 x i32> %offsets, <4 x float> %mask, i8 %scale8) + + ret <4 x float> %v +} + + +define <4 x float> @__gather_base_offsets64_float(i8 * %ptr, + i32 %scale, <4 x i64> %offsets, + <4 x i64> %vecmask64) nounwind readonly alwaysinline { + %scale8 = trunc i32 %scale to i8 + %vecmask = trunc <4 x i64> %vecmask64 to <4 x i32> + %mask = bitcast <4 x i32> %vecmask to <4 x float> + + %v = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * %ptr, + <4 x i64> %offsets, <4 x float> %mask, i8 %scale8) + + ret <4 x float> %v +} + + +define <4 x float> @__gather32_float(<4 x i32> %ptrs, + <4 x i64> %vecmask64) nounwind readonly alwaysinline { + %vecmask = trunc <4 x i64> %vecmask64 to <4 x i32> + %mask = bitcast <4 x i32> %vecmask to <4 x float> + + %v = call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> undef, i8 * null, + <4 x i32> %ptrs, <4 x float> %mask, i8 1) + + ret <4 x float> %v +} + + +define <4 x float> @__gather64_float(<4 x i64> %ptrs, + <4 x i64> %vecmask64) nounwind readonly alwaysinline { + %vecmask = trunc <4 x 
i64> %vecmask64 to <4 x i32> + %mask = bitcast <4 x i32> %vecmask to <4 x float> + + %v = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * null, + <4 x i64> %ptrs, <4 x float> %mask, i8 1) + + ret <4 x float> %v +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int64 gathers + +declare <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> %target, i8 * %ptr, + <4 x i32> %indices, <4 x i64> %mask, i8 %scale) readonly nounwind +declare <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> %target, i8 * %ptr, + <4 x i64> %indices, <4 x i64> %mask, i8 %scale) readonly nounwind + +define <4 x i64> @__gather_base_offsets32_i64(i8 * %ptr, + i32 %scale, <4 x i32> %offsets, + <4 x i64> %vecmask) nounwind readonly alwaysinline { + %scale8 = trunc i32 %scale to i8 + + %v = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * %ptr, + <4 x i32> %offsets, <4 x i64> %vecmask, i8 %scale8) + + ret <4 x i64> %v +} + + +define <4 x i64> @__gather_base_offsets64_i64(i8 * %ptr, + i32 %scale, <4 x i64> %offsets, + <4 x i64> %vecmask) nounwind readonly alwaysinline { + %scale8 = trunc i32 %scale to i8 + + %v = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * %ptr, + <4 x i64> %offsets, <4 x i64> %vecmask, i8 %scale8) + + ret <4 x i64> %v +} + + +define <4 x i64> @__gather32_i64(<4 x i32> %ptrs, + <4 x i64> %vecmask) nounwind readonly alwaysinline { + + %v = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * null, + <4 x i32> %ptrs, <4 x i64> %vecmask, i8 1) + ret <4 x i64> %v +} + + +define <4 x i64> @__gather64_i64(<4 x i64> %ptrs, + <4 x i64> %vecmask) nounwind readonly alwaysinline { + %v = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * null, + <4 x i64> %ptrs, <4 x i64> %vecmask, i8 1) + ret <4 x i64> %v +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; double gathers + +declare <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> %target, i8 * %ptr, + <4 x i64> %indices, <4 x double> %mask, i8 %scale) readonly nounwind +declare <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> %target, i8 * %ptr, + <4 x i32> %indices, <4 x double> %mask, i8 %scale) readonly nounwind + +define <4 x double> @__gather_base_offsets32_double(i8 * %ptr, + i32 %scale, <4 x i32> %offsets, + <4 x i64> %vecmask64) nounwind readonly alwaysinline { + %scale8 = trunc i32 %scale to i8 + %vecmask = bitcast <4 x i64> %vecmask64 to <4 x double> + + %v = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * %ptr, + <4 x i32> %offsets, <4 x double> %vecmask, i8 %scale8) + ret <4 x double> %v +} + +define <4 x double> @__gather_base_offsets64_double(i8 * %ptr, + i32 %scale, <4 x i64> %offsets, + <4 x i64> %vecmask64) nounwind readonly alwaysinline { + %scale8 = trunc i32 %scale to i8 + %vecmask = bitcast <4 x i64> %vecmask64 to <4 x double> + + %v = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * %ptr, + <4 x i64> %offsets, <4 x double> %vecmask, i8 %scale8) + + ret <4 x double> %v +} + +define <4 x double> @__gather32_double(<4 x i32> %ptrs, + <4 x i64> %vecmask64) nounwind readonly alwaysinline { + %vecmask = bitcast <4 x i64> %vecmask64 to <4 x double> + + %v = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * null, + <4 x i32> %ptrs, <4 x double> %vecmask, i8 1) + + ret <4 x double> %v +} + +define <4 x double> @__gather64_double(<4 x i64> %ptrs, + <4 x i64> %vecmask64) nounwind readonly alwaysinline { + 
%vecmask = bitcast <4 x i64> %vecmask64 to <4 x double> + + %v = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * null, + <4 x i64> %ptrs, <4 x double> %vecmask, i8 1) + + ret <4 x double> %v +} + +') diff --git a/builtins/target-generic-1.ll b/builtins/target-generic-1.ll index 31ebcdd5..910565dd 100644 --- a/builtins/target-generic-1.ll +++ b/builtins/target-generic-1.ll @@ -310,6 +310,7 @@ declare double @round (double) nounwind readnone ;declare float @llvm.sqrt.f32(float %Val) declare double @llvm.sqrt.f64(double %Val) declare float @llvm.sin.f32(float %Val) +declare float @llvm.asin.f32(float %Val) declare float @llvm.cos.f32(float %Val) declare float @llvm.sqrt.f32(float %Val) declare float @llvm.exp.f32(float %Val) @@ -651,7 +652,18 @@ define <1 x float> @__rsqrt_varying_float(<1 x float> %v) nounwind readonly alw ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; svml stuff -define <1 x float> @__svml_sin(<1 x float>) nounwind readnone alwaysinline { +declare <1 x float> @__svml_sind(<1 x float>) nounwind readnone alwaysinline +declare <1 x float> @__svml_asind(<1 x float>) nounwind readnone alwaysinline +declare <1 x float> @__svml_cosd(<1 x float>) nounwind readnone alwaysinline +declare void @__svml_sincosd(<1 x float>, <1 x float> *, <1 x float> *) nounwind readnone alwaysinline +declare <1 x float> @__svml_tand(<1 x float>) nounwind readnone alwaysinline +declare <1 x float> @__svml_atand(<1 x float>) nounwind readnone alwaysinline +declare <1 x float> @__svml_atan2d(<1 x float>, <1 x float>) nounwind readnone alwaysinline +declare <1 x float> @__svml_expd(<1 x float>) nounwind readnone alwaysinline +declare <1 x float> @__svml_logd(<1 x float>) nounwind readnone alwaysinline +declare <1 x float> @__svml_powd(<1 x float>, <1 x float>) nounwind readnone alwaysinline + +define <1 x float> @__svml_sinf(<1 x float>) nounwind readnone alwaysinline { ;%ret = call <1 x float> @__svml_sinf4(<1 x float> %0) ;ret <1 x float> %ret ;%r = extractelement <1 x float> %0, i32 0 @@ -662,7 +674,18 @@ define <1 x float> @__svml_sin(<1 x float>) nounwind readnone alwaysinline { } -define <1 x float> @__svml_cos(<1 x float>) nounwind readnone alwaysinline { +define <1 x float> @__svml_asinf(<1 x float>) nounwind readnone alwaysinline { + ;%ret = call <1 x float> @__svml_asinf4(<1 x float> %0) + ;ret <1 x float> %ret + ;%r = extractelement <1 x float> %0, i32 0 + ;%s = call float @llvm.asin.f32(float %r) + ;%rv = insertelement <1 x float> undef, float %r, i32 0 + ;ret <1 x float> %rv + unary1to1(float,@llvm.asin.f32) + +} + +define <1 x float> @__svml_cosf(<1 x float>) nounwind readnone alwaysinline { ;%ret = call <1 x float> @__svml_cosf4(<1 x float> %0) ;ret <1 x float> %ret ;%r = extractelement <1 x float> %0, i32 0 @@ -673,18 +696,18 @@ define <1 x float> @__svml_cos(<1 x float>) nounwind readnone alwaysinline { } -define void @__svml_sincos(<1 x float>, <1 x float> *, <1 x float> *) nounwind readnone alwaysinline { +define void @__svml_sincosf(<1 x float>, <1 x float> *, <1 x float> *) nounwind readnone alwaysinline { ; %s = call <1 x float> @__svml_sincosf4(<1 x float> * %2, <1 x float> %0) ; store <1 x float> %s, <1 x float> * %1 ; ret void - %sin = call <1 x float> @__svml_sin (<1 x float> %0) - %cos = call <1 x float> @__svml_cos (<1 x float> %0) + %sin = call <1 x float> @__svml_sinf(<1 x float> %0) + %cos = call <1 x float> @__svml_cosf(<1 x float> %0) store <1 x float> %sin, <1 x float> * %1 store <1 x float> %cos, <1 x float> * %2 
ret void } -define <1 x float> @__svml_tan(<1 x float>) nounwind readnone alwaysinline { +define <1 x float> @__svml_tanf(<1 x float>) nounwind readnone alwaysinline { ;%ret = call <1 x float> @__svml_tanf4(<1 x float> %0) ;ret <1 x float> %ret ;%r = extractelement <1 x float> %0, i32 0 @@ -696,7 +719,7 @@ define <1 x float> @__svml_tan(<1 x float>) nounwind readnone alwaysinline { ret <1 x float > %0 } -define <1 x float> @__svml_atan(<1 x float>) nounwind readnone alwaysinline { +define <1 x float> @__svml_atanf(<1 x float>) nounwind readnone alwaysinline { ; %ret = call <1 x float> @__svml_atanf4(<1 x float> %0) ; ret <1 x float> %ret ;%r = extractelement <1 x float> %0, i32 0 @@ -709,7 +732,7 @@ define <1 x float> @__svml_atan(<1 x float>) nounwind readnone alwaysinline { } -define <1 x float> @__svml_atan2(<1 x float>, <1 x float>) nounwind readnone alwaysinline { +define <1 x float> @__svml_atan2f(<1 x float>, <1 x float>) nounwind readnone alwaysinline { ;%ret = call <1 x float> @__svml_atan2f4(<1 x float> %0, <1 x float> %1) ;ret <1 x float> %ret ;%y = extractelement <1 x float> %0, i32 0 @@ -722,19 +745,19 @@ define <1 x float> @__svml_atan2(<1 x float>, <1 x float>) nounwind readnone al ret <1 x float > %0 } -define <1 x float> @__svml_exp(<1 x float>) nounwind readnone alwaysinline { +define <1 x float> @__svml_expf(<1 x float>) nounwind readnone alwaysinline { ;%ret = call <1 x float> @__svml_expf4(<1 x float> %0) ;ret <1 x float> %ret unary1to1(float, @llvm.exp.f32) } -define <1 x float> @__svml_log(<1 x float>) nounwind readnone alwaysinline { +define <1 x float> @__svml_logf(<1 x float>) nounwind readnone alwaysinline { ;%ret = call <1 x float> @__svml_logf4(<1 x float> %0) ;ret <1 x float> %ret unary1to1(float, @llvm.log.f32) } -define <1 x float> @__svml_pow(<1 x float>, <1 x float>) nounwind readnone alwaysinline { +define <1 x float> @__svml_powf(<1 x float>, <1 x float>) nounwind readnone alwaysinline { ;%ret = call <1 x float> @__svml_powf4(<1 x float> %0, <1 x float> %1) ;ret <1 x float> %ret %r = extractelement <1 x float> %0, i32 0 diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll index 2896c6b1..92b7a18e 100644 --- a/builtins/target-generic-common.ll +++ b/builtins/target-generic-common.ll @@ -80,6 +80,13 @@ declare @__rotate_i32(, i32) nounwind readnone declare @__rotate_double(, i32) nounwind readnone declare @__rotate_i64(, i32) nounwind readnone +declare @__shift_i8(, i32) nounwind readnone +declare @__shift_i16(, i32) nounwind readnone +declare @__shift_float(, i32) nounwind readnone +declare @__shift_i32(, i32) nounwind readnone +declare @__shift_double(, i32) nounwind readnone +declare @__shift_i64(, i32) nounwind readnone + declare @__shuffle_i8(, ) nounwind readnone declare @__shuffle2_i8(, , ) nounwind readnone @@ -202,21 +209,15 @@ declare i64 @__count_trailing_zeros_i64(i64) nounwind readnone declare i32 @__count_leading_zeros_i32(i32) nounwind readnone declare i64 @__count_leading_zeros_i64(i64) nounwind readnone -;; svml - ; FIXME: need either to wire these up to the 8-wide SVML entrypoints, ; or, use the macro to call the 4-wide ones twice with our 8-wide ; vectors... 
-declare @__svml_sin() -declare @__svml_cos() -declare void @__svml_sincos(, *, *) -declare @__svml_tan() -declare @__svml_atan() -declare @__svml_atan2(, ) -declare @__svml_exp() -declare @__svml_log() -declare @__svml_pow(, ) +;; svml + +include(`svml.m4') +svml_stubs(float,f,WIDTH) +svml_stubs(double,d,WIDTH) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; reductions diff --git a/builtins/target-neon-common.ll b/builtins/target-neon-common.ll index 696b0748..1c0b421f 100644 --- a/builtins/target-neon-common.ll +++ b/builtins/target-neon-common.ll @@ -316,15 +316,10 @@ define void @__masked_store_blend_i64(* nocapture %ptr, ;; yuck. We need declarations of these, even though we shouldnt ever ;; actually generate calls to them for the NEON target... -declare @__svml_sin() -declare @__svml_cos() -declare void @__svml_sincos(, *, *) -declare @__svml_tan() -declare @__svml_atan() -declare @__svml_atan2(, ) -declare @__svml_exp() -declare @__svml_log() -declare @__svml_pow(, ) + +include(`svml.m4') +svml_stubs(float,f,WIDTH) +svml_stubs(double,d,WIDTH) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; gather diff --git a/builtins/target-sse2-x2.ll b/builtins/target-sse2-x2.ll index da22a66c..77bf1a9d 100644 --- a/builtins/target-sse2-x2.ll +++ b/builtins/target-sse2-x2.ll @@ -105,87 +105,14 @@ define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysin ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; svml stuff -declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone -declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone -declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone +include(`svml.m4') +;; single precision +svml_declare(float,f4,4) +svml_define_x(float,f4,4,f,8) - -define <8 x float> @__svml_sin(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_sinf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_cos(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_cosf4, %0) - ret <8 x float> %ret -} - -define void @__svml_sincos(<8 x float>, <8 x float> *, - <8 x float> *) nounwind readnone alwaysinline { - ; call svml_sincosf4 two times with the two 4-wide sub-vectors - %a = shufflevector <8 x float> %0, <8 x float> undef, - <4 x i32> - %b = shufflevector <8 x float> %0, <8 x float> undef, - <4 x i32> - - %cospa = alloca <4 x float> - %sa = call <4 x float> @__svml_sincosf4(<4 x float> * %cospa, <4 x float> %a) - - %cospb = alloca <4 x float> - %sb = call <4 x float> @__svml_sincosf4(<4 x float> * %cospb, <4 x float> %b) - - %sin = shufflevector <4 x float> %sa, <4 x float> %sb, - <8 x i32> - store <8 x float> %sin, <8 x float> * %1 - - %cosa = load <4 x float> * %cospa - %cosb = load <4 x float> * %cospb - %cos = shufflevector <4 x float> %cosa, <4 x float> %cosb, - <8 x i32> - store <8 x float> %cos, <8 x float> * %2 - - ret void -} - -define <8 x float> @__svml_tan(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_tanf4, %0) 
- ret <8 x float> %ret -} - -define <8 x float> @__svml_atan(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_atanf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_atan2(<8 x float>, - <8 x float>) nounwind readnone alwaysinline { - binary4to8(ret, float, @__svml_atan2f4, %0, %1) - ret <8 x float> %ret -} - -define <8 x float> @__svml_exp(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_expf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_log(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_logf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_pow(<8 x float>, - <8 x float>) nounwind readnone alwaysinline { - binary4to8(ret, float, @__svml_powf4, %0, %1) - ret <8 x float> %ret -} +;; double precision +svml_declare(double,2,2) +svml_define_x(double,2,2,d,8) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git a/builtins/target-sse2.ll b/builtins/target-sse2.ll index a6b206b6..e42d4990 100644 --- a/builtins/target-sse2.ll +++ b/builtins/target-sse2.ll @@ -496,62 +496,15 @@ define <4 x float> @__sqrt_varying_float(<4 x float>) nounwind readonly alwaysin ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; svml stuff -declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone -declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone -declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone +include(`svml.m4') +;; single precision +svml_declare(float,f4,4) +svml_define(float,f4,4,f) +;; double precision +svml_declare(double,2,2) +svml_define_x(double,2,2,d,4) -define <4 x float> @__svml_sin(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_sinf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_cos(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_cosf4(<4 x float> %0) - ret <4 x float> %ret -} - -define void @__svml_sincos(<4 x float>, <4 x float> *, <4 x float> *) nounwind readnone alwaysinline { - %s = call <4 x float> @__svml_sincosf4(<4 x float> * %2, <4 x float> %0) - store <4 x float> %s, <4 x float> * %1 - ret void -} - -define <4 x float> @__svml_tan(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_tanf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_atan(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_atanf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_atan2(<4 x float>, <4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_atan2f4(<4 x float> %0, <4 x float> %1) - ret <4 x float> %ret -} - -define <4 x float> @__svml_exp(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_expf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_log(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_logf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x 
float> @__svml_pow(<4 x float>, <4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_powf4(<4 x float> %0, <4 x float> %1) - ret <4 x float> %ret -} ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max diff --git a/builtins/target-sse4-16.ll b/builtins/target-sse4-16.ll index d7f3833d..72b81ff0 100644 --- a/builtins/target-sse4-16.ll +++ b/builtins/target-sse4-16.ll @@ -209,16 +209,9 @@ define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind r ;; svml ; FIXME - -declare <8 x float> @__svml_sin(<8 x float>) -declare <8 x float> @__svml_cos(<8 x float>) -declare void @__svml_sincos(<8 x float>, <8 x float> *, <8 x float> *) -declare <8 x float> @__svml_tan(<8 x float>) -declare <8 x float> @__svml_atan(<8 x float>) -declare <8 x float> @__svml_atan2(<8 x float>, <8 x float>) -declare <8 x float> @__svml_exp(<8 x float>) -declare <8 x float> @__svml_log(<8 x float>) -declare <8 x float> @__svml_pow(<8 x float>, <8 x float>) +include(`svml.m4') +svml_stubs(float,f,WIDTH) +svml_stubs(double,d,WIDTH) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; horizontal ops / reductions diff --git a/builtins/target-sse4-8.ll b/builtins/target-sse4-8.ll index fd4b74d7..69b355e3 100644 --- a/builtins/target-sse4-8.ll +++ b/builtins/target-sse4-8.ll @@ -222,15 +222,9 @@ define <16 x double> @__max_varying_double(<16 x double>, <16 x double>) nounwin ; FIXME -declare <16 x float> @__svml_sin(<16 x float>) -declare <16 x float> @__svml_cos(<16 x float>) -declare void @__svml_sincos(<16 x float>, <16 x float> *, <16 x float> *) -declare <16 x float> @__svml_tan(<16 x float>) -declare <16 x float> @__svml_atan(<16 x float>) -declare <16 x float> @__svml_atan2(<16 x float>, <16 x float>) -declare <16 x float> @__svml_exp(<16 x float>) -declare <16 x float> @__svml_log(<16 x float>) -declare <16 x float> @__svml_pow(<16 x float>, <16 x float>) +include(`svml.m4') +svml_stubs(float,f,WIDTH) +svml_stubs(double,d,WIDTH) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; horizontal ops / reductions diff --git a/builtins/target-sse4-common.ll b/builtins/target-sse4-common.ll index 4b8751b5..50dd0582 100644 --- a/builtins/target-sse4-common.ll +++ b/builtins/target-sse4-common.ll @@ -1,4 +1,4 @@ -;; Copyright (c) 2010-2011, Intel Corporation +;; Copyright (c) 2010-2013, Intel Corporation ;; All rights reserved. ;; ;; Redistribution and use in source and binary forms, with or without @@ -29,6 +29,9 @@ ;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; SSE4 target implementation. + ctlztz() define_prefetches() define_shuffles() @@ -67,7 +70,7 @@ define float @__round_uniform_float(float) nounwind readonly alwaysinline { define float @__floor_uniform_float(float) nounwind readonly alwaysinline { ; see above for round_ss instrinsic discussion... 
%xi = insertelement <4 x float> undef, float %0, i32 0 - ; roundps, round down 0b01 | don't signal precision exceptions 0b1010 = 9 + ; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9 %xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 9) %rs = extractelement <4 x float> %xr, i32 0 ret float %rs @@ -97,7 +100,7 @@ define double @__round_uniform_double(double) nounwind readonly alwaysinline { define double @__floor_uniform_double(double) nounwind readonly alwaysinline { ; see above for round_ss instrinsic discussion... %xi = insertelement <2 x double> undef, double %0, i32 0 - ; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9 + ; roundsd, round down 0b01 | don't signal precision exceptions 0b1001 = 9 %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 9) %rs = extractelement <2 x double> %xr, i32 0 ret double %rs @@ -106,7 +109,7 @@ define double @__floor_uniform_double(double) nounwind readonly alwaysinline { define double @__ceil_uniform_double(double) nounwind readonly alwaysinline { ; see above for round_ss instrinsic discussion... %xi = insertelement <2 x double> undef, double %0, i32 0 - ; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10 + ; roundsd, round up 0b10 | don't signal precision exceptions 0b1010 = 10 %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 10) %rs = extractelement <2 x double> %xr, i32 0 ret double %rs @@ -119,6 +122,8 @@ declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone define float @__rcp_uniform_float(float) nounwind readonly alwaysinline { ; do the rcpss call + ; uniform float iv = extract(__rcp_u(v), 0); + ; return iv * (2. 
- v * iv); %vecval = insertelement <4 x float> undef, float %0, i32 0 %call = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %vecval) %scall = extractelement <4 x float> %call, i32 0 @@ -130,9 +135,8 @@ define float @__rcp_uniform_float(float) nounwind readonly alwaysinline { ret float %iv_mul } - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; rsqrt +;; rsqrt declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone @@ -154,7 +158,7 @@ define float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline { ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; sqrt +;; sqrt declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone @@ -163,6 +167,16 @@ define float @__sqrt_uniform_float(float) nounwind readonly alwaysinline { ret float %ret } +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; double precision sqrt + +declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone + +define double @__sqrt_uniform_double(double) nounwind alwaysinline { + sse_unary_scalar(ret, 2, double, @llvm.x86.sse2.sqrt.sd, %0) + ret double %ret +} + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; fast math mode @@ -198,36 +212,25 @@ define float @__min_uniform_float(float, float) nounwind readonly alwaysinline { ret float %ret } -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; double precision sqrt - -declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone - -define double @__sqrt_uniform_double(double) nounwind alwaysinline { - sse_unary_scalar(ret, 2, double, @llvm.x86.sse2.sqrt.sd, %0) - ret double %ret -} - - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; double precision min/max declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone -define double @__min_uniform_double(double, double) nounwind readnone { +define double @__min_uniform_double(double, double) nounwind readnone alwaysinline { sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.min.sd, %0, %1) ret double %ret } - -define double @__max_uniform_double(double, double) nounwind readnone { +define double @__max_uniform_double(double, double) nounwind readnone alwaysinline { sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1) ret double %ret } + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; int32 min/max +;; int min/max declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone @@ -242,8 +245,9 @@ define i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline { ret i32 %ret } + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; unsigned int min/max +;; unsigned int min/max declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) nounwind readnone declare <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32>, <4 x i32>) nounwind readnone @@ -258,9 +262,8 @@ define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline { ret i32 %ret } - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; horizontal ops / reductions +;; horizontal ops / reductions declare i32 @llvm.ctpop.i32(i32) nounwind readnone diff --git a/builtins/target-sse4-x2.ll b/builtins/target-sse4-x2.ll 
index a7faddb3..842db53f 100644 --- a/builtins/target-sse4-x2.ll +++ b/builtins/target-sse4-x2.ll @@ -105,87 +105,14 @@ define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysin ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; svml stuff -declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone -declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone -declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone +include(`svml.m4') +;; single precision +svml_declare(float,f4,4) +svml_define_x(float,f4,4,f,8) - -define <8 x float> @__svml_sin(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_sinf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_cos(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_cosf4, %0) - ret <8 x float> %ret -} - -define void @__svml_sincos(<8 x float>, <8 x float> *, - <8 x float> *) nounwind readnone alwaysinline { - ; call svml_sincosf4 two times with the two 4-wide sub-vectors - %a = shufflevector <8 x float> %0, <8 x float> undef, - <4 x i32> - %b = shufflevector <8 x float> %0, <8 x float> undef, - <4 x i32> - - %cospa = alloca <4 x float> - %sa = call <4 x float> @__svml_sincosf4(<4 x float> * %cospa, <4 x float> %a) - - %cospb = alloca <4 x float> - %sb = call <4 x float> @__svml_sincosf4(<4 x float> * %cospb, <4 x float> %b) - - %sin = shufflevector <4 x float> %sa, <4 x float> %sb, - <8 x i32> - store <8 x float> %sin, <8 x float> * %1 - - %cosa = load <4 x float> * %cospa - %cosb = load <4 x float> * %cospb - %cos = shufflevector <4 x float> %cosa, <4 x float> %cosb, - <8 x i32> - store <8 x float> %cos, <8 x float> * %2 - - ret void -} - -define <8 x float> @__svml_tan(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_tanf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_atan(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_atanf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_atan2(<8 x float>, - <8 x float>) nounwind readnone alwaysinline { - binary4to8(ret, float, @__svml_atan2f4, %0, %1) - ret <8 x float> %ret -} - -define <8 x float> @__svml_exp(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_expf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_log(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_logf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_pow(<8 x float>, - <8 x float>) nounwind readnone alwaysinline { - binary4to8(ret, float, @__svml_powf4, %0, %1) - ret <8 x float> %ret -} +;; double precision +svml_declare(double,2,2) +svml_define_x(double,2,2,d,8) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git a/builtins/target-sse4.ll b/builtins/target-sse4.ll index e05b865f..16177b47 100644 --- a/builtins/target-sse4.ll +++ b/builtins/target-sse4.ll @@ -58,10 +58,10 @@ declare @__float_to_half_varying( %v) nounwind read declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) 
nounwind readnone define <4 x float> @__rcp_varying_float(<4 x float>) nounwind readonly alwaysinline { - %call = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %0) ; do one N-R iteration to improve precision ; float iv = __rcp_v(v); ; return iv * (2. - v * iv); + %call = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %0) %v_iv = fmul <4 x float> %0, %call %two_minus = fsub <4 x float> , %v_iv %iv_mul = fmul <4 x float> %call, %two_minus @@ -87,7 +87,7 @@ define <4 x float> @__rsqrt_varying_float(<4 x float> %v) nounwind readonly alwa } ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; sqrt +;; sqrt declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone @@ -154,16 +154,34 @@ define <4 x double> @__ceil_varying_double(<4 x double>) nounwind readonly alway declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone -define <4 x float> @__max_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline { +define <4 x float> @__max_varying_float(<4 x float>, + <4 x float>) nounwind readonly alwaysinline { %call = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %0, <4 x float> %1) ret <4 x float> %call } -define <4 x float> @__min_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline { +define <4 x float> @__min_varying_float(<4 x float>, + <4 x float>) nounwind readonly alwaysinline { %call = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %0, <4 x float> %1) ret <4 x float> %call } +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; double precision min/max + +declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone +declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone + +define <4 x double> @__min_varying_double(<4 x double>, <4 x double>) nounwind readnone { + binary2to4(ret, double, @llvm.x86.sse2.min.pd, %0, %1) + ret <4 x double> %ret +} + +define <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind readnone { + binary2to4(ret, double, @llvm.x86.sse2.max.pd, %0, %1) + ret <4 x double> %ret +} + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; int32 min/max @@ -191,83 +209,19 @@ define <4 x i32> @__max_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly a } ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; double precision min/max +;; svml stuff -declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone -declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone +include(`svml.m4') +;; single precision +svml_declare(float,f4,4) +svml_define(float,f4,4,f) -define <4 x double> @__min_varying_double(<4 x double>, <4 x double>) nounwind readnone { - binary2to4(ret, double, @llvm.x86.sse2.min.pd, %0, %1) - ret <4 x double> %ret -} - -define <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind readnone { - binary2to4(ret, double, @llvm.x86.sse2.max.pd, %0, %1) - ret <4 x double> %ret -} +;; double precision +svml_declare(double,2,2) +svml_define_x(double,2,2,d,4) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; svml stuff - -declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) 
nounwind readnone -declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone -declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone - - -define <4 x float> @__svml_sin(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_sinf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_cos(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_cosf4(<4 x float> %0) - ret <4 x float> %ret -} - -define void @__svml_sincos(<4 x float>, <4 x float> *, <4 x float> *) nounwind readnone alwaysinline { - %s = call <4 x float> @__svml_sincosf4(<4 x float> * %2, <4 x float> %0) - store <4 x float> %s, <4 x float> * %1 - ret void -} - -define <4 x float> @__svml_tan(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_tanf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_atan(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_atanf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_atan2(<4 x float>, <4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_atan2f4(<4 x float> %0, <4 x float> %1) - ret <4 x float> %ret -} - -define <4 x float> @__svml_exp(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_expf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_log(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_logf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_pow(<4 x float>, <4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_powf4(<4 x float> %0, <4 x float> %1) - ret <4 x float> %ret -} - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; horizontal ops / reductions +;; mask handling declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone @@ -299,6 +253,55 @@ define i1 @__none(<4 x i32>) nounwind readnone alwaysinline { ret i1 %cmp } +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; horizontal ops / reductions + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; horizontal float ops + +declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone + +define float @__reduce_add_float(<4 x float>) nounwind readonly alwaysinline { + %v1 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %0, <4 x float> %0) + %v2 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %v1, <4 x float> %v1) + %scalar = extractelement <4 x float> %v2, i32 0 + ret float %scalar +} + +define float @__reduce_min_float(<4 x float>) nounwind readnone alwaysinline { + reduce4(float, @__min_varying_float, @__min_uniform_float) +} + +define float @__reduce_max_float(<4 x float>) nounwind readnone alwaysinline { + reduce4(float, @__max_varying_float, @__max_uniform_float) +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; horizontal double ops + +define double @__reduce_add_double(<4 x double>) nounwind readnone alwaysinline { + %v0 = shufflevector <4 x double> %0, <4 x double> undef, + <2 x i32> + %v1 = shufflevector <4 x double> %0, <4 x 
double> undef, + <2 x i32> + %sum = fadd <2 x double> %v0, %v1 + %e0 = extractelement <2 x double> %sum, i32 0 + %e1 = extractelement <2 x double> %sum, i32 1 + %m = fadd double %e0, %e1 + ret double %m +} + +define double @__reduce_min_double(<4 x double>) nounwind readnone alwaysinline { + reduce4(double, @__min_varying_double, @__min_uniform_double) +} + +define double @__reduce_max_double(<4 x double>) nounwind readnone alwaysinline { + reduce4(double, @__max_varying_double, @__max_uniform_double) +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; horizontal int8 ops + declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone define i16 @__reduce_add_int8(<4 x i8>) nounwind readnone alwaysinline { @@ -314,6 +317,9 @@ define i16 @__reduce_add_int8(<4 x i8>) nounwind readnone alwaysinline { ret i16 %r16 } +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; horizontal int16 ops + define internal <4 x i16> @__add_varying_i16(<4 x i16>, <4 x i16>) nounwind readnone alwaysinline { %r = add <4 x i16> %0, %1 @@ -329,24 +335,11 @@ define i16 @__reduce_add_int16(<4 x i16>) nounwind readnone alwaysinline { reduce4(i16, @__add_varying_i16, @__add_uniform_i16) } -declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; horizontal int32 ops -define float @__reduce_add_float(<4 x float>) nounwind readonly alwaysinline { - %v1 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %0, <4 x float> %0) - %v2 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %v1, <4 x float> %v1) - %scalar = extractelement <4 x float> %v2, i32 0 - ret float %scalar -} - -define float @__reduce_min_float(<4 x float>) nounwind readnone { - reduce4(float, @__min_varying_float, @__min_uniform_float) -} - -define float @__reduce_max_float(<4 x float>) nounwind readnone { - reduce4(float, @__max_varying_float, @__max_uniform_float) -} - -define i32 @__reduce_add_int32(<4 x i32> %v) nounwind readnone { +;; reduction functions +define i32 @__reduce_add_int32(<4 x i32> %v) nounwind readnone alwaysinline { %v1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> %m1 = add <4 x i32> %v1, %v @@ -356,44 +349,27 @@ define i32 @__reduce_add_int32(<4 x i32> %v) nounwind readnone { ret i32 %sum } -define i32 @__reduce_min_int32(<4 x i32>) nounwind readnone { +define i32 @__reduce_min_int32(<4 x i32>) nounwind readnone alwaysinline { reduce4(i32, @__min_varying_int32, @__min_uniform_int32) } -define i32 @__reduce_max_int32(<4 x i32>) nounwind readnone { +define i32 @__reduce_max_int32(<4 x i32>) nounwind readnone alwaysinline { reduce4(i32, @__max_varying_int32, @__max_uniform_int32) } -define i32 @__reduce_min_uint32(<4 x i32>) nounwind readnone { +define i32 @__reduce_min_uint32(<4 x i32>) nounwind readnone alwaysinline { reduce4(i32, @__min_varying_uint32, @__min_uniform_uint32) } -define i32 @__reduce_max_uint32(<4 x i32>) nounwind readnone { +define i32 @__reduce_max_uint32(<4 x i32>) nounwind readnone alwaysinline { reduce4(i32, @__max_varying_uint32, @__max_uniform_uint32) } +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; horizontal int64 ops -define double @__reduce_add_double(<4 x double>) nounwind readnone { - %v0 = shufflevector <4 x double> %0, <4 x double> undef, - <2 x i32> - %v1 = shufflevector <4 x double> %0, <4 x double> undef, - <2 x i32> - %sum = fadd <2 x double> %v0, %v1 - %e0 = 
extractelement <2 x double> %sum, i32 0 - %e1 = extractelement <2 x double> %sum, i32 1 - %m = fadd double %e0, %e1 - ret double %m -} - -define double @__reduce_min_double(<4 x double>) nounwind readnone { - reduce4(double, @__min_varying_double, @__min_uniform_double) -} - -define double @__reduce_max_double(<4 x double>) nounwind readnone { - reduce4(double, @__max_varying_double, @__max_uniform_double) -} - -define i64 @__reduce_add_int64(<4 x i64>) nounwind readnone { +;; reduction functions +define i64 @__reduce_add_int64(<4 x i64>) nounwind readnone alwaysinline { %v0 = shufflevector <4 x i64> %0, <4 x i64> undef, <2 x i32> %v1 = shufflevector <4 x i64> %0, <4 x i64> undef, @@ -405,27 +381,50 @@ define i64 @__reduce_add_int64(<4 x i64>) nounwind readnone { ret i64 %m } -define i64 @__reduce_min_int64(<4 x i64>) nounwind readnone { +define i64 @__reduce_min_int64(<4 x i64>) nounwind readnone alwaysinline { reduce4(i64, @__min_varying_int64, @__min_uniform_int64) } -define i64 @__reduce_max_int64(<4 x i64>) nounwind readnone { +define i64 @__reduce_max_int64(<4 x i64>) nounwind readnone alwaysinline { reduce4(i64, @__max_varying_int64, @__max_uniform_int64) } -define i64 @__reduce_min_uint64(<4 x i64>) nounwind readnone { +define i64 @__reduce_min_uint64(<4 x i64>) nounwind readnone alwaysinline { reduce4(i64, @__min_varying_uint64, @__min_uniform_uint64) } -define i64 @__reduce_max_uint64(<4 x i64>) nounwind readnone { +define i64 @__reduce_max_uint64(<4 x i64>) nounwind readnone alwaysinline { reduce4(i64, @__max_varying_uint64, @__max_uniform_uint64) } reduce_equal(4) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; unaligned loads/loads+broadcasts + + +masked_load(i8, 1) +masked_load(i16, 2) +masked_load(i32, 4) +masked_load(float, 4) +masked_load(i64, 8) +masked_load(double, 8) + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; masked store +gen_masked_store(i8) +gen_masked_store(i16) +gen_masked_store(i32) +gen_masked_store(i64) + +masked_store_float_double() + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; masked store blend + +masked_store_blend_8_16_by_4() + declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone @@ -492,29 +491,6 @@ define void @__masked_store_blend_i64(<4 x i64>* nocapture %ptr, <4 x i64> %new, ret void } -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; masked store - -masked_store_blend_8_16_by_4() - -gen_masked_store(i8) -gen_masked_store(i16) -gen_masked_store(i32) -gen_masked_store(i64) - -masked_store_float_double() - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; unaligned loads/loads+broadcasts - - -masked_load(i8, 1) -masked_load(i16, 2) -masked_load(i32, 4) -masked_load(float, 4) -masked_load(i64, 8) -masked_load(double, 8) - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; gather/scatter diff --git a/builtins/util.m4 b/builtins/util.m4 index 95e3844d..e1c9bf97 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -797,6 +797,24 @@ not_const: ret %result } +define @__shift_$1(, i32) nounwind readnone alwaysinline { + %ptr = alloca , i32 3 + %ptr0 = getelementptr * %ptr, i32 0 + store zeroinitializer, * %ptr0 + %ptr1 = getelementptr * %ptr, i32 1 + store %0, * %ptr1 + %ptr2 = getelementptr * %ptr, i32 2 + store zeroinitializer, * %ptr2 + + %offset = add i32 %1, WIDTH + %ptr_as_elt_array = bitcast * %ptr to 
[eval(3*WIDTH) x $1] * + %load_ptr = getelementptr [eval(3*WIDTH) x $1] * %ptr_as_elt_array, i32 0, i32 %offset + %load_ptr_vec = bitcast $1 * %load_ptr to * + %result = load * %load_ptr_vec, align $2 + ret %result +} + + define @__shuffle_$1(, ) nounwind readnone alwaysinline { forloop(i, 0, eval(WIDTH-1), ` %index_`'i = extractelement %1, i32 i') @@ -947,6 +965,22 @@ define internal <$1 x i64> @convertmask_i32_i64_$1(<$1 x i32>) { %r = sext <$1 x i32> %0 to <$1 x i64> ret <$1 x i64> %r } + +define internal <$1 x i8> @convertmask_i64_i8_$1(<$1 x i64>) { + %r = trunc <$1 x i64> %0 to <$1 x i8> + ret <$1 x i8> %r +} +define internal <$1 x i16> @convertmask_i64_i16_$1(<$1 x i64>) { + %r = trunc <$1 x i64> %0 to <$1 x i16> + ret <$1 x i16> %r +} +define internal <$1 x i32> @convertmask_i64_i32_$1(<$1 x i64>) { + %r = trunc <$1 x i64> %0 to <$1 x i32> + ret <$1 x i32> %r +} +define internal <$1 x i64> @convertmask_i64_i64_$1(<$1 x i64>) { + ret <$1 x i64> %0 +} ') mask_converts(WIDTH) @@ -2689,9 +2723,13 @@ define i32 @__sext_uniform_bool(i1) nounwind readnone alwaysinline { } define @__sext_varying_bool() nounwind readnone alwaysinline { - ifelse(MASK,i32, `ret %0', - `%se = sext %0 to - ret %se') +;; ifelse(MASK,i32, `ret %0', +;; `%se = sext %0 to +;; ret %se') + ifelse(MASK,i32, `%se = bitcast %0 to ', + MASK,i64, `%se = trunc %0 to ', + `%se = sext %0 to ') + ret %se } @@ -3160,6 +3198,7 @@ define float @__stdlib_powf(float, float) nounwind readnone alwaysinline { } declare double @sin(double) nounwind readnone +declare double @asin(double) nounwind readnone declare double @cos(double) nounwind readnone declare void @sincos(double, double *, double *) nounwind readnone declare double @tan(double) nounwind readnone @@ -3174,6 +3213,11 @@ define double @__stdlib_sin(double) nounwind readnone alwaysinline { ret double %r } +define double @__stdlib_asin(double) nounwind readnone alwaysinline { + %r = call double @asin(double %0) + ret double %r +} + define double @__stdlib_cos(double) nounwind readnone alwaysinline { %r = call double @cos(double %0) ret double %r @@ -3502,6 +3546,56 @@ define void @__masked_store_blend_i16(<4 x i16>* nocapture, <4 x i16>, } ') +define(`masked_store_blend_8_16_by_4_mask64', ` +define void @__masked_store_blend_i8(<4 x i8>* nocapture, <4 x i8>, + <4 x i64>) nounwind alwaysinline { + %old = load <4 x i8> * %0, align 1 + ifelse(LLVM_VERSION,LLVM_3_0,` + %old32 = bitcast <4 x i8> %old to i32 + %new32 = bitcast <4 x i8> %1 to i32 + + %mask8 = trunc <4 x i64> %2 to <4 x i8> + %mask32 = bitcast <4 x i8> %mask8 to i32 + %notmask32 = xor i32 %mask32, -1 + + %newmasked = and i32 %new32, %mask32 + %oldmasked = and i32 %old32, %notmask32 + %result = or i32 %newmasked, %oldmasked + + %resultvec = bitcast i32 %result to <4 x i8> + ',` + %m = trunc <4 x i64> %2 to <4 x i1> + %resultvec = select <4 x i1> %m, <4 x i8> %1, <4 x i8> %old + ') + store <4 x i8> %resultvec, <4 x i8> * %0, align 1 + ret void +} + +define void @__masked_store_blend_i16(<4 x i16>* nocapture, <4 x i16>, + <4 x i64>) nounwind alwaysinline { + %old = load <4 x i16> * %0, align 2 + ifelse(LLVM_VERSION,LLVM_3_0,` + %old64 = bitcast <4 x i16> %old to i64 + %new64 = bitcast <4 x i16> %1 to i64 + + %mask16 = trunc <4 x i64> %2 to <4 x i16> + %mask64 = bitcast <4 x i16> %mask16 to i64 + %notmask64 = xor i64 %mask64, -1 + + %newmasked = and i64 %new64, %mask64 + %oldmasked = and i64 %old64, %notmask64 + %result = or i64 %newmasked, %oldmasked + + %resultvec = bitcast i64 %result to <4 x i16> + ',` + %m = trunc 
<4 x i64> %2 to <4 x i1> + %resultvec = select <4 x i1> %m, <4 x i16> %1, <4 x i16> %old + ') + store <4 x i16> %resultvec, <4 x i16> * %0, align 2 + ret void +} +') + define(`masked_store_blend_8_16_by_8', ` define void @__masked_store_blend_i8(<8 x i8>* nocapture, <8 x i8>, <8 x i32>) nounwind alwaysinline { @@ -3758,9 +3852,9 @@ domixed: %first = call i64 @llvm.cttz.i64(i64 %mm) %first32 = trunc i64 %first to i32 %baseval = extractelement <$1 x $2> %v, i32 %first32 - %basev1 = bitcast $2 %baseval to <1 x $2> + %basev1 = insertelement <$1 x $2> undef, $2 %baseval, i32 0 ; get a vector that is that value smeared across all elements - %basesmear = shufflevector <1 x $2> %basev1, <1 x $2> undef, + %basesmear = shufflevector <$1 x $2> %basev1, <$1 x $2> undef, <$1 x i32> < forloop(i, 0, eval($1-2), `i32 0, ') i32 0 > ; now to a blend of that vector with the original vector, such that the diff --git a/cbackend.cpp b/cbackend.cpp index 7d4b4cfc..40f87074 100644 --- a/cbackend.cpp +++ b/cbackend.cpp @@ -241,7 +241,11 @@ namespace { class CBEMCAsmInfo : public llvm::MCAsmInfo { public: CBEMCAsmInfo() { +#if defined(LLVM_3_5) + GlobalPrefix = '\0'; +#else GlobalPrefix = ""; +#endif PrivateGlobalPrefix = ""; } }; @@ -558,8 +562,15 @@ char CWriter::ID = 0; static std::string CBEMangle(const std::string &S) { std::string Result; - for (unsigned i = 0, e = S.size(); i != e; ++i) - if (isalnum(S[i]) || S[i] == '_' || S[i] == '<' || S[i] == '>') { + for (unsigned i = 0, e = S.size(); i != e; ++i) { + if (i+1 != e && ((S[i] == '>' && S[i+1] == '>') || + (S[i] == '<' && S[i+1] == '<'))) { + Result += '_'; + Result += 'A'+(S[i]&15); + Result += 'A'+((S[i]>>4)&15); + Result += '_'; + i++; + } else if (isalnum(S[i]) || S[i] == '_' || S[i] == '<' || S[i] == '>') { Result += S[i]; } else { Result += '_'; @@ -567,6 +578,7 @@ static std::string CBEMangle(const std::string &S) { Result += 'A'+((S[i]>>4)&15); Result += '_'; } + } return Result; } @@ -2188,7 +2200,7 @@ bool CWriter::doInitialization(llvm::Module &M) { #endif TAsm = new CBEMCAsmInfo(); MRI = new llvm::MCRegisterInfo(); -#if defined(LLVM_3_4) +#if defined(LLVM_3_4) || defined(LLVM_3_5) TCtx = new llvm::MCContext(TAsm, MRI, NULL); #else TCtx = new llvm::MCContext(*TAsm, *MRI, NULL); @@ -3066,7 +3078,7 @@ void CWriter::visitReturnInst(llvm::ReturnInst &I) { // Don't output a void return if this is the last basic block in the function if (I.getNumOperands() == 0 && &*--I.getParent()->getParent()->end() == I.getParent() && - !I.getParent()->size() == 1) { + (!I.getParent()->size()) == 1) { return; } diff --git a/check_env.py b/check_env.py new file mode 100755 index 00000000..8c90d895 --- /dev/null +++ b/check_env.py @@ -0,0 +1,102 @@ +#!/usr/bin/python +# +# Copyright (c) 2013, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. 
+# +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +# IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +# OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# // Author: Filippov Ilia + +import common +import sys +import os +import string +print_debug = common.print_debug +error = common.error +take_lines = common.take_lines + +exists = [False, False, False, False, False, False, False, False] +names = ["m4", "bison", "flex", "sde", "ispc", "clang", "gcc", "icc"] + +PATH_dir = string.split(os.getenv("PATH"), os.pathsep) +for counter in PATH_dir: + for i in range(0,8): + if os.path.exists(counter + os.sep + names[i]): + exists[i] = True + +print_debug("=== in PATH: ===\n", False, "") +print_debug("Tools:\n", False, "") +for i in range(0,3): + if exists[i]: + print_debug(take_lines(names[i] + " --version", "first"), False, "") + else: + error("you don't have " + names[i], 0) +if exists[0] and exists[1] and exists[2]: + if common.check_tools(2): + print_debug("Tools' versions are ok\n", False, "") +print_debug("\nSDE:\n", False, "") +if exists[3]: + print_debug(take_lines(names[3] + " --version", "first"), False, "") +else: + error("you don't have " + names[3], 2) +print_debug("\nISPC:\n", False, "") +if exists[4]: + print_debug(take_lines(names[4] + " --version", "first"), False, "") +else: + error("you don't have " + names[4], 2) +print_debug("\nC/C++ compilers:\n", False, "") +for i in range(5,8): + if exists[i]: + print_debug(take_lines(names[i] + " --version", "first"), False, "") + else: + error("you don't have " + names[i], 2) + +print_debug("\n=== in ISPC specific environment variables: ===\n", False, "") +if os.environ.get("LLVM_HOME") == None: + error("you have no LLVM_HOME", 2) +else: + print_debug("Your LLVM_HOME:" + os.environ.get("LLVM_HOME") + "\n", False, "") +if os.environ.get("ISPC_HOME") == None: + error("you have no ISPC_HOME", 2) +else: + print_debug("Your ISPC_HOME:" + os.environ.get("ISPC_HOME") + "\n", False, "") + if os.path.exists(os.environ.get("ISPC_HOME") + os.sep + "ispc"): + print_debug("You have ISPC in your ISPC_HOME: " + + take_lines(os.environ.get("ISPC_HOME") + os.sep + "ispc" + " --version", "first"), False, "") + else: + error("you don't have ISPC in your ISPC_HOME", 2) +if os.environ.get("SDE_HOME") == None: + error("You have no SDE_HOME", 2) +else: + print_debug("Your SDE_HOME:" + os.environ.get("SDE_HOME") + "\n", False, "") + if os.path.exists(os.environ.get("SDE_HOME") + os.sep + "sde"): + print_debug("You have sde in your SDE_HOME: " + + take_lines(os.environ.get("SDE_HOME") + os.sep + "sde" + " --version", "first"), False, "") + else: + error("you don't have any SDE in your ISPC_HOME", 2) diff --git a/check_isa.cpp b/check_isa.cpp new file mode 100644 index 00000000..a4d10606 --- /dev/null +++ b/check_isa.cpp @@ -0,0 +1,129 @@ +/* + Copyright (c) 2013, Intel Corporation + All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/////////////////////////////////////////////////////////////////////////////// +// // +// This file is a standalone program, which detects the best supported ISA. // +// // +/////////////////////////////////////////////////////////////////////////////// + + + +#include + +#if defined(_WIN32) || defined(_WIN64) +#define ISPC_IS_WINDOWS +#include +#endif + +#if !defined (__arm__) +#if !defined(ISPC_IS_WINDOWS) +static void __cpuid(int info[4], int infoType) { + __asm__ __volatile__ ("cpuid" + : "=a" (info[0]), "=b" (info[1]), "=c" (info[2]), "=d" (info[3]) + : "0" (infoType)); +} + +/* Save %ebx in case it's the PIC register */ +static void __cpuidex(int info[4], int level, int count) { + __asm__ __volatile__ ("xchg{l}\t{%%}ebx, %1\n\t" + "cpuid\n\t" + "xchg{l}\t{%%}ebx, %1\n\t" + : "=a" (info[0]), "=r" (info[1]), "=c" (info[2]), "=d" (info[3]) + : "0" (level), "2" (count)); +} +#endif // !ISPC_IS_WINDOWS + +static bool __os_has_avx_support() { +#if defined(ISPC_IS_WINDOWS) + // Check if the OS will save the YMM registers + unsigned long long xcrFeatureMask = _xgetbv(_XCR_XFEATURE_ENABLED_MASK); + return (xcrFeatureMask & 6) == 6; +#else // !defined(ISPC_IS_WINDOWS) + // Check xgetbv; this uses a .byte sequence instead of the instruction + // directly because older assemblers do not include support for xgetbv and + // there is no easy way to conditionally compile based on the assembler used. + int rEAX, rEDX; + __asm__ __volatile__ (".byte 0x0f, 0x01, 0xd0" : "=a" (rEAX), "=d" (rEDX) : "c" (0)); + return (rEAX & 6) == 6; +#endif // !defined(ISPC_IS_WINDOWS) +} +#endif // !__arm__ + + +static const char * +lGetSystemISA() { +#ifdef __arm__ + return "ARM NEON"; +#else + int info[4]; + __cpuid(info, 1); + + if ((info[2] & (1 << 28)) != 0 && + __os_has_avx_support()) { // AVX + // AVX1 for sure.... + // Ivy Bridge? + if ((info[2] & (1 << 29)) != 0 && // F16C + (info[2] & (1 << 30)) != 0) { // RDRAND + // So far, so good. AVX2? 
+ // Call cpuid with eax=7, ecx=0 + int info2[4]; + __cpuidex(info2, 7, 0); + if ((info2[1] & (1 << 5)) != 0) { + return "AVX2 (codename Haswell)"; + } + else { + return "AVX1.1 (codename Ivy Bridge)"; + } + } + // Regular AVX + return "AVX (codename Sandy Bridge)"; + } + else if ((info[2] & (1 << 19)) != 0) { + return "SSE4"; + } + else if ((info[3] & (1 << 26)) != 0) { + return "SSE2"; + } + else { + return "Error"; + } +#endif +} + +int main () { + const char* isa = lGetSystemISA(); + printf("ISA: %s\n", isa); + + return 0; +} diff --git a/common.py b/common.py new file mode 100644 index 00000000..be3e9526 --- /dev/null +++ b/common.py @@ -0,0 +1,124 @@ +#!/usr/bin/python +# +# Copyright (c) 2013, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +# IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +# OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +# // Author: Filippov Ilia +import sys +import os +import shutil + +def write_to_file(filename, line): + f = open(filename, 'a') + f.writelines(line) + f.close() + +#remove file if it exists +def remove_if_exists(filename): + if os.path.exists(filename): + if os.path.isdir(filename): + shutil.rmtree(filename) + else: + os.remove(filename) + +# detect version which is printed after command +def take_lines(command, which): + os.system(command + " > " + "temp_detect_version") + version = open("temp_detect_version") + if which == "first": + answer = version.readline() + if which == "all": + answer = version.readlines() + version.close() + remove_if_exists("temp_detect_version") + return answer + +# print versions of compilers +def print_version(ispc_test, ispc_ref, ref_compiler, s, perf_log, is_windows): + print_debug("\nUsing test compiler: " + take_lines(ispc_test + " --version", "first"), s, perf_log) + if ispc_ref != "": + print_debug("Using ref compiler: " + take_lines(ispc_ref + " --version", "first"), s, perf_log) + if is_windows == False: + temp1 = take_lines(ref_compiler + " --version", "first") + else: + os.system(ref_compiler + " 2>&1" + " 2> temp_detect_version > temp_detect_version1" ) + version = open("temp_detect_version") + temp1 = version.readline() + version.close() + remove_if_exists("temp_detect_version") + remove_if_exists("temp_detect_version1") + print_debug("Using C/C++ compiler: " + temp1 + "\n", s, perf_log) + +# print everything from scripts instead errors +def print_debug(line, silent, filename): + if silent == False: + sys.stdout.write(line) + sys.stdout.flush() + if os.environ.get("ISPC_HOME") != None: + if os.path.exists(os.environ.get("ISPC_HOME")): + write_to_file(os.environ["ISPC_HOME"] + os.sep + "notify_log.log", line) + if filename != "": + write_to_file(filename, line) + +# print errors from scripts +# type 1 for error in environment +# type 2 for warning +# type 3 for error of compiler or test which isn't the goal of script +def error(line, error_type): + line = line + "\n" + if error_type == 1: + sys.stderr.write("Fatal error: " + line) + sys.exit(1) + if error_type == 2: + sys.stderr.write("Warning: " + line) + if error_type == 0: + print_debug("FIND ERROR: " + line, False, "") + +def check_tools(m): + input_tools=[[[1,4],"m4 --version", "bad m4 version"], + [[2,4],"bison --version", "bad bison version"], + [[2,5], "flex --version", "bad flex version"]] + ret = 1 + for t in range(0,len(input_tools)): + t1 = ((take_lines(input_tools[t][1], "first"))[:-1].split(" ")) + for i in range(0,len(t1)): + t11 = t1[i].split(".") + f = True + for j in range(0,len(t11)): + if not t11[j].isdigit(): + f = False + if f == True: + for j in range(0,len(t11)): + if j < len(input_tools[t][0]): + if int(t11[j])" +"double precision floating point number, starting with dot, optional exponent +syn match cFloat display contained ".\d*d[-+]\=\d*\>" +"double precision floating point number, without dot, with exponent +syn match cFloat display contained "\d\+d[-+]\=\d\+\>" + " Default highlighting command -nargs=+ HiLink hi def link HiLink ispcStatement Statement diff --git a/ctx.cpp b/ctx.cpp index c50d22f9..c1a7e61a 100644 --- a/ctx.cpp +++ b/ctx.cpp @@ -348,7 +348,7 @@ FunctionEmitContext::FunctionEmitContext(Function *func, Symbol *funSym, AssertPos(currentPos, diSubprogramType.Verify()); } -#if defined(LLVM_3_4) +#if defined(LLVM_3_4) || defined(LLVM_3_5) Assert(diSubprogramType.isCompositeType()); llvm::DICompositeType diSubprogramType_n = static_cast(diSubprogramType); 
diff --git a/docs/ReleaseNotes.txt b/docs/ReleaseNotes.txt
index 007f283e..a8575ea0 100644
--- a/docs/ReleaseNotes.txt
+++ b/docs/ReleaseNotes.txt
@@ -1,3 +1,63 @@
+=== v1.5.0 === (27 September 2013)
+
+A major new version of ISPC with several new targets and important bug fixes.
+Here's a list of the most important changes, if you are using pre-built
+binaries (which are based on a patched version of LLVM 3.3):
+
+* The naming of targets was changed to explicitly include the data type width
+  and the number of program instances in the gang. For example, avx2-i32x8 is
+  an avx2 target that uses 32 bit types as a base and has 8 program instances
+  in a gang. The old naming scheme is still supported, but deprecated.
+
+* New SSE4 targets for calculations based on 8 bit and 16 bit data types:
+  sse4-i8x16 and sse4-i16x8.
+
+* New AVX1 target for calculations based on 64 bit data types: avx1-i64x4.
+
+* SVML support was extended and improved.
+
+* The behavior of the -g switch was changed to not affect the optimization level.
+
+* The ISPC debug infrastructure was redesigned. See --help-dev for more info and
+  enjoy the capabilities of the new --debug-phase= and --off-phase=
+  switches.
+
+* Fixed an auto-dispatch bug, which caused AVX code execution when the OS doesn't
+  support AVX (but the hardware does).
+
+* Fixed a bug which discarded the uniform/varying keyword in typedefs.
+
+* Several performance regressions were fixed.
+
+If you are building ISPC yourself, then the following changes are also available
+to you:
+
+* --cpu=slm for targeting Intel Atom codename Silvermont (if LLVM 3.4 is used).
+
+* ARM NEON targets are available (if enabled in build system).
+
+* --debug-ir= is available to generate debug information based on LLVM
+  IR (if LLVM 3.4 is used). In the debugger you'll see LLVM IR instead of source
+  code.
+
+* A redesigned and improved test and configuration management system is
+  available to facilitate the process of building LLVM and testing the ISPC
+  compiler.
+
+Standard library changes/fixes:
+
+* The __pause() function was removed from the standard library.
+
+* Fixed reduce_[min|max]_[float|double] intrinsics, which were producing
+  incorrect code under some conditions.
+
+Language changes:
+
+* By default a floating point constant without a suffix is a single precision
+  constant (32 bit). A new suffix "d" was introduced to allow double precision
+  constants (64 bit). Please refer to tests/double-consts.ispc for syntax
+  examples.
+
 === v1.4.4 === (19 July 2013)
 
 A minor version update with several stability fixes requested by the customers.
diff --git a/docs/ispc.rst b/docs/ispc.rst
index ff07f6d8..93b6ac9b 100644
--- a/docs/ispc.rst
+++ b/docs/ispc.rst
@@ -270,6 +270,14 @@ new reserved words: ``unmasked``, ``foreach_unique``, ``foreach_active``, and
 ``in``. Any program that happens to have a variable or function with one
 of these names must be modified to rename that symbol.
 
+Updating ISPC Programs For Changes In ISPC 1.5.0
+------------------------------------------------
+
+This release adds support for double precision floating point constants.
+Double precision floating point constants are floating point numbers with a
+``d`` suffix and an optional exponent part. Here are some examples: 3.14d,
+31.4d-1, 1.d, 1.0d, 1d-2. Note that a floating point number without a suffix
+is treated as a single precision constant.
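A minimal sketch of the constant syntax described in the hunk above (illustrative
only, not part of the patch; the variable names are invented, but every literal
form is taken from the examples listed there)::

    // 'd' marks a double precision (64 bit) constant
    uniform double x1 = 3.14d;    // plain double constant
    uniform double x2 = 31.4d-1;  // 'd' followed by an optional exponent part
    uniform double x3 = 1.d;      // trailing dot plus suffix
    uniform double x4 = 1d-2;     // no dot, exponent only
    uniform float  y  = 3.14;     // no suffix: still a single precision constant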
 Getting Started with ISPC
 =========================
@@ -1349,7 +1357,8 @@
 but are likely to be supported in future releases:
 
 * Bitfield members of ``struct`` types
 * Variable numbers of arguments to functions
 * Literal floating-point constants (even without a ``f`` suffix) are
-  currently treated as being ``float`` type, not ``double``
+  currently treated as being ``float`` type, not ``double``. To get a double
+  precision floating point constant, use the ``d`` suffix.
 * The ``volatile`` qualifier
 * The ``register`` storage class for variables. (Will be ignored).
@@ -2335,8 +2344,11 @@
 based on C++'s ``new`` and ``delete`` operators:
 
 In the above code, each program instance allocates its own ``count`` sized
 array of ``uniform int`` values, uses that memory, and then deallocates that
 memory. Uses of ``new`` and ``delete`` in ``ispc`` programs are
-serviced by corresponding calls the system C library's ``malloc()`` and
-``free()`` functions.
+implemented as calls to the C library's aligned memory allocation routines,
+which are platform dependent (``posix_memalign()`` and ``free()`` on Linux
+and Mac, and ``_aligned_malloc()`` and ``_aligned_free()`` on Windows). It is
+therefore advised to pair ``ispc``'s ``new`` and ``delete`` with each other,
+and not with C/C++ memory management functions.
 
 Note that the rules for ``uniform`` and ``varying`` for ``new`` are
 analogous to the corresponding rules for pointers (as described in
@@ -3710,6 +3722,22 @@
 the size of the gang (it is masked to ensure valid offsets).
 
     double rotate(double value, uniform int offset)
 
+The ``shift()`` function allows each program instance to read the given value
+from the program instance ``offset`` steps away. It is similar to ``rotate()``,
+except that values are not circularly shifted; instead, zeroes are shifted in
+where appropriate.
+
+::
+
+    int8 shift(int8 value, uniform int offset)
+    int16 shift(int16 value, uniform int offset)
+    int32 shift(int32 value, uniform int offset)
+    int64 shift(int64 value, uniform int offset)
+    float shift(float value, uniform int offset)
+    double shift(double value, uniform int offset)
+
+
 Finally, the ``shuffle()`` functions allow two variants of fully general
 shuffling of values among the program instances. For the first version, each
 program instance's value of permutation gives the program instance
@@ -3742,7 +3770,7 @@
 the last element of ``value1``, etc.)
 
     double shuffle(double value0, double value1, int permutation)
 
 Finally, there are primitive operations that extract and set values in the
-SIMD lanes. You can implement all of the broadcast, rotate, and shuffle
+SIMD lanes. You can implement all of the broadcast, rotate, shift, and shuffle
 operations described above in this section from these routines, though in
 general, not as efficiently. These routines are useful for implementing
 other reductions and cross-lane communication that isn't included in the
diff --git a/docs/news.rst b/docs/news.rst
index c1c35de3..7d78a662 100644
--- a/docs/news.rst
+++ b/docs/news.rst
@@ -2,6 +2,14 @@
 ispc News
 =========
 
+ispc 1.5.0 is Released
+----------------------
+
+A major update of ``ispc`` has been released, with several new targets
+available and a bunch of performance and stability fixes. The released
+binaries are built with a patched version of LLVM 3.3. Please refer to the
+Release Notes for the complete set of changes.
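A small sketch contrasting the new ``shift()`` stdlib function (documented in the ``ispc.rst`` hunk above) with ``rotate()``, assuming an 8-wide gang; the per-lane results are shown in the comments:

::

    int x = programIndex + 1;   // lanes hold: 1 2 3 4 5 6 7 8
    int r = rotate(x, 1);       //             2 3 4 5 6 7 8 1  (wraps around)
    int s = shift(x, 1);        //             2 3 4 5 6 7 8 0  (zero shifted in)
    int t = shift(x, -1);       //             0 1 2 3 4 5 6 7  (zero shifted in)

This matches the reference SHIFT macro added in examples/intrinsics/generic-16.h below: out-of-range neighbors contribute zero instead of wrapping around.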
+ ispc 1.4.4 is Released ---------------------- diff --git a/docs/template-news.txt b/docs/template-news.txt index 9a41fbdb..d5eebdd1 100644 --- a/docs/template-news.txt +++ b/docs/template-news.txt @@ -57,7 +57,7 @@ %(body)s
- diff --git a/docs/template-perf.txt b/docs/template-perf.txt index 4932e332..9537a836 100644 --- a/docs/template-perf.txt +++ b/docs/template-perf.txt @@ -57,7 +57,7 @@ %(body)s
- diff --git a/docs/template.txt b/docs/template.txt index 8cb4f5ab..b9041f19 100644 --- a/docs/template.txt +++ b/docs/template.txt @@ -57,7 +57,7 @@ %(body)s
- diff --git a/doxygen.cfg b/doxygen.cfg index 480d9331..a0ad3176 100644 --- a/doxygen.cfg +++ b/doxygen.cfg @@ -31,7 +31,7 @@ PROJECT_NAME = "Intel SPMD Program Compiler" # This could be handy for archiving the generated documentation or # if some version control system is used. -PROJECT_NUMBER = 1.4.5dev +PROJECT_NUMBER = 1.5.1dev # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) # base path where the generated documentation will be put. diff --git a/examples/README.txt b/examples/README.txt index 5b47df44..b67529c1 100644 --- a/examples/README.txt +++ b/examples/README.txt @@ -146,6 +146,11 @@ This is a simple "hello world" type program that shows a ~10 line application program calling out to a ~5 line ispc program to do a simple computation. +Sort +==== +This is a bucket sort of 32 bit unsigned integers. +By default 1000000 random elements get sorted. +Call ./sort N in order to sort N elements instead. Volume ====== diff --git a/examples/aobench/Makefile b/examples/aobench/Makefile index 7aba4f01..c8122c07 100644 --- a/examples/aobench/Makefile +++ b/examples/aobench/Makefile @@ -2,7 +2,7 @@ EXAMPLE=ao CPP_SRC=ao.cpp ao_serial.cpp ISPC_SRC=ao.ispc -ISPC_IA_TARGETS=sse2,sse4,avx +ISPC_IA_TARGETS=sse2-i32x4,sse4-i32x4,avx1-i32x8,avx2-i32x8 ISPC_ARM_TARGETS=neon include ../common.mk diff --git a/examples/aobench/aobench.vcxproj b/examples/aobench/aobench.vcxproj index 48e26e40..c46ee41a 100644 --- a/examples/aobench/aobench.vcxproj +++ b/examples/aobench/aobench.vcxproj @@ -1,180 +1,16 @@  - - - Debug - Win32 - - - Debug - x64 - - - Release - Win32 - - - Release - x64 - - + + {F29204CA-19DF-4F3C-87D5-03F4EEDAAFEB} + Win32Proj + aobench + ao + sse2,sse4,avx1-i32x8 + + - - - Document - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4,avx - - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4,avx - - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4,avx - - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4,avx - - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - - - - {F29204CA-19DF-4F3C-87D5-03F4EEDAAFEB} - Win32Proj - aobench - - - - Application - true - Unicode - - - Application - true - Unicode - - - Application - false - true - Unicode - - - Application - false - true - Unicode - - - - - - - - - - - - - - - - - - - true - $(ProjectDir)..\..;$(ExecutablePath) - ao - - - true - $(ExecutablePath);$(ProjectDir)..\.. 
- ao - - - false - $(ProjectDir)..\..;$(ExecutablePath) - ao - - - false - $(ProjectDir)..\..;$(ExecutablePath) - ao - - - - - - Level3 - Disabled - WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - true - Fast - - - Console - true - - - - - - - Level3 - Disabled - WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - true - Fast - - - Console - true - - - - - Level3 - - - MaxSpeed - true - true - WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - Fast - - - Console - true - true - true - - - - - Level3 - - - MaxSpeed - true - true - WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - Fast - - - Console - true - true - true - - - - - diff --git a/examples/aobench_instrumented/Makefile b/examples/aobench_instrumented/Makefile index 9921cf3e..d0b27cbf 100644 --- a/examples/aobench_instrumented/Makefile +++ b/examples/aobench_instrumented/Makefile @@ -1,5 +1,5 @@ -CXX=g++ -m64 +CXX=clang++ -m64 CXXFLAGS=-Iobjs/ -g3 -Wall ISPC=ispc ISPCFLAGS=-O2 --instrument --arch=x86-64 --target=sse2 diff --git a/examples/aobench_instrumented/aobench_instrumented.vcxproj b/examples/aobench_instrumented/aobench_instrumented.vcxproj index d54332b6..5247762c 100644 --- a/examples/aobench_instrumented/aobench_instrumented.vcxproj +++ b/examples/aobench_instrumented/aobench_instrumented.vcxproj @@ -26,15 +26,15 @@ Document - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename)_instrumented.obj -h $(TargetDir)%(Filename)_instrumented_ispc.h --arch=x86 --instrument --target=sse2 + $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename)_instrumented.obj -h $(TargetDir)%(Filename)_instrumented_ispc.h --arch=x86 --instrument --target=sse2 - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename)_instrumented.obj -h $(TargetDir)%(Filename)_instrumented_ispc.h --instrument --target=sse2 + $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename)_instrumented.obj -h $(TargetDir)%(Filename)_instrumented_ispc.h --instrument --target=sse2 $(TargetDir)%(Filename)_instrumented.obj;$(TargetDir)%(Filename)_instrumented_ispc.h $(TargetDir)%(Filename)_instrumented.obj;$(TargetDir)%(Filename)_instrumented_ispc.h - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename)_instrumented.obj -h $(TargetDir)%(Filename)_instrumented_ispc.h --arch=x86 --instrument --target=sse2 + $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename)_instrumented.obj -h $(TargetDir)%(Filename)_instrumented_ispc.h --arch=x86 --instrument --target=sse2 - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename)_instrumented.obj -h $(TargetDir)%(Filename)_instrumented_ispc.h --instrument --target=sse2 + $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename)_instrumented.obj -h $(TargetDir)%(Filename)_instrumented_ispc.h --instrument --target=sse2 $(TargetDir)%(Filename)_instrumented.obj;$(TargetDir)%(Filename)_instrumented_ispc.h $(TargetDir)%(Filename)_instrumented.obj;$(TargetDir)%(Filename)_instrumented_ispc.h @@ -44,6 +44,7 @@ {B3B4AE3D-6D5A-4CF9-AF5B-43CF2131B958} Win32Proj aobench_instrumented + ispc @@ -171,4 +172,4 @@ - \ No newline at end of file + diff --git a/examples/common.mk b/examples/common.mk index cdfc4c6a..04a566bb 100644 --- a/examples/common.mk +++ b/examples/common.mk @@ -3,24 +3,50 @@ TASK_CXX=../tasksys.cpp TASK_LIB=-lpthread TASK_OBJ=objs/tasksys.o -CXX=g++ -CXXFLAGS=-Iobjs/ -O2 -CC=gcc -CCFLAGS=-Iobjs/ -O2 +CXX=clang++ +CXXFLAGS+=-Iobjs/ -O2 +CC=clang +CCFLAGS+=-Iobjs/ -O2 LIBS=-lm $(TASK_LIB) -lstdc++ -ISPC=ispc -O2 $(ISPC_FLAGS) +ISPC=ispc 
+ISPC_FLAGS+=-O2 ISPC_HEADER=objs/$(ISPC_SRC:.ispc=_ispc.h) -ARCH:=$(shell uname -m | sed -e s/x86_64/x86/ -e s/arm.*/arm/ -e s/sa110/arm/) +ARCH:=$(shell uname -m | sed -e s/x86_64/x86/ -e s/i686/x86/ -e s/arm.*/arm/ -e s/sa110/arm/) ifeq ($(ARCH),x86) - ISPC_OBJS=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc.o $(ISPC_SRC:.ispc=)_ispc_sse2.o \ - $(ISPC_SRC:.ispc=)_ispc_sse4.o $(ISPC_SRC:.ispc=)_ispc_avx.o) + ISPC_OBJS=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc.o) + COMMA=, + ifneq (,$(findstring $(COMMA),$(ISPC_IA_TARGETS))) + #$(info multi-target detected: $(ISPC_IA_TARGETS)) + ifneq (,$(findstring sse2,$(ISPC_IA_TARGETS))) + ISPC_OBJS+=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc_sse2.o) + endif + ifneq (,$(findstring sse4,$(ISPC_IA_TARGETS))) + ISPC_OBJS+=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc_sse4.o) + endif + ifneq (,$(findstring avx1-,$(ISPC_IA_TARGETS))) + ISPC_OBJS+=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc_avx.o) + endif + ifneq (,$(findstring avx1.1,$(ISPC_IA_TARGETS))) + ISPC_OBJS+=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc_avx11.o) + endif + ifneq (,$(findstring avx2,$(ISPC_IA_TARGETS))) + ISPC_OBJS+=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc_avx2.o) + endif + endif ISPC_TARGETS=$(ISPC_IA_TARGETS) - ISPC_FLAGS += --arch=x86-64 - CXXFLAGS += -m64 - CCFLAGS += -m64 + ARCH_BIT:=$(shell getconf LONG_BIT) + ifeq ($(ARCH_BIT),32) + ISPC_FLAGS += --arch=x86 + CXXFLAGS += -m32 + CCFLAGS += -m32 + else + ISPC_FLAGS += --arch=x86-64 + CXXFLAGS += -m64 + CCFLAGS += -m64 + endif else ifeq ($(ARCH),arm) ISPC_OBJS=$(addprefix objs/, $(ISPC_SRC:.ispc=_ispc.o)) ISPC_TARGETS=$(ISPC_ARM_TARGETS) @@ -44,7 +70,7 @@ dirs: objs/%.cpp objs/%.o objs/%.h: dirs clean: - /bin/rm -rf objs *~ $(EXAMPLE) $(EXAMPLE)-sse4 $(EXAMPLE)-generic16 + /bin/rm -rf objs *~ $(EXAMPLE) $(EXAMPLE)-sse4 $(EXAMPLE)-generic16 ref test $(EXAMPLE): $(OBJS) $(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS) @@ -58,13 +84,13 @@ objs/%.o: %.c dirs $(ISPC_HEADER) objs/%.o: ../%.cpp dirs $(CXX) $< $(CXXFLAGS) -c -o $@ -objs/$(EXAMPLE).o: objs/$(EXAMPLE)_ispc.h +objs/$(EXAMPLE).o: objs/$(EXAMPLE)_ispc.h dirs -objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc - $(ISPC) --target=$(ISPC_TARGETS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h +objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o objs/%_ispc_avx11.o objs/%_ispc_avx2.o: %.ispc dirs + $(ISPC) $(ISPC_FLAGS) --target=$(ISPC_TARGETS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h objs/$(ISPC_SRC:.ispc=)_sse4.cpp: $(ISPC_SRC) - $(ISPC) $< -o $@ --target=generic-4 --emit-c++ --c++-include-file=sse4.h + $(ISPC) $(ISPC_FLAGS) $< -o $@ --target=generic-4 --emit-c++ --c++-include-file=sse4.h objs/$(ISPC_SRC:.ispc=)_sse4.o: objs/$(ISPC_SRC:.ispc=)_sse4.cpp $(CXX) -I../intrinsics -msse4.2 $< $(CXXFLAGS) -c -o $@ @@ -73,7 +99,7 @@ $(EXAMPLE)-sse4: $(CPP_OBJS) objs/$(ISPC_SRC:.ispc=)_sse4.o $(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS) objs/$(ISPC_SRC:.ispc=)_generic16.cpp: $(ISPC_SRC) - $(ISPC) $< -o $@ --target=generic-16 --emit-c++ --c++-include-file=generic-16.h + $(ISPC) $(ISPC_FLAGS) $< -o $@ --target=generic-16 --emit-c++ --c++-include-file=generic-16.h objs/$(ISPC_SRC:.ispc=)_generic16.o: objs/$(ISPC_SRC:.ispc=)_generic16.cpp $(CXX) -I../intrinsics $< $(CXXFLAGS) -c -o $@ @@ -82,7 +108,7 @@ $(EXAMPLE)-generic16: $(CPP_OBJS) objs/$(ISPC_SRC:.ispc=)_generic16.o $(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS) objs/$(ISPC_SRC:.ispc=)_scalar.o: $(ISPC_SRC) - $(ISPC) $< -o $@ --target=generic-1 + $(ISPC) $(ISPC_FLAGS) $< -o $@ --target=generic-1 
$(EXAMPLE)-scalar: $(CPP_OBJS) objs/$(ISPC_SRC:.ispc=)_scalar.o $(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS) diff --git a/examples/common.props b/examples/common.props new file mode 100644 index 00000000..7bf37005 --- /dev/null +++ b/examples/common.props @@ -0,0 +1,172 @@ + + + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + + + + Application + true + Unicode + + + Application + true + Unicode + + + Application + false + true + Unicode + + + Application + false + true + Unicode + + + + + + + + + + + + + + + + + + + true + $(ProjectDir)..\..;$(ExecutablePath) + + + true + $(ProjectDir)..\..;$(ExecutablePath) + + + false + $(ProjectDir)..\..;$(ExecutablePath) + + + false + $(ProjectDir)..\..;$(ExecutablePath) + + + + + + Level3 + Disabled + WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + $(TargetDir) + true + Fast + + + Console + true + + + + + + + Level3 + Disabled + WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + $(TargetDir) + true + Fast + + + Console + true + + + + + Level3 + + + MaxSpeed + true + true + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + $(TargetDir) + Fast + + + Console + true + true + true + + + + + Level3 + + + MaxSpeed + true + true + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + $(TargetDir) + Fast + + + Console + true + true + true + + + + ispc + $(default_targets) + $(TargetDir)$(ISPC_file).obj + $(Target_out);$(TargetDir)$(ISPC_file)_sse2.obj + $(Target_out);$(TargetDir)$(ISPC_file)_sse4.obj + $(Target_out);$(TargetDir)$(ISPC_file)_avx.obj + $(Target_out);$(TargetDir)$(ISPC_file)_avx11.obj + $(Target_out);$(TargetDir)$(ISPC_file)_avx2.obj + + + + Document + $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=$(Target_str) + $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=$(Target_str) + $(Target_out);$(TargetDir)%(Filename)_ispc.h + $(Target_out);$(TargetDir)%(Filename)_ispc.h + $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=$(Target_str) + $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=$(Target_str) + $(Target_out);$(TargetDir)%(Filename)_ispc.h + $(Target_out);$(TargetDir)%(Filename)_ispc.h + + + + + + diff --git a/examples/deferred/Makefile b/examples/deferred/Makefile index 09fa56f0..be8ce7c4 100644 --- a/examples/deferred/Makefile +++ b/examples/deferred/Makefile @@ -2,7 +2,7 @@ EXAMPLE=deferred_shading CPP_SRC=common.cpp main.cpp dynamic_c.cpp dynamic_cilk.cpp ISPC_SRC=kernels.ispc -ISPC_IA_TARGETS=sse2,sse4-x2,avx-x2 +ISPC_IA_TARGETS=sse2-i32x4,sse4-i32x8,avx1-i32x16,avx2-i32x16 ISPC_ARM_TARGETS=neon ISPC_FLAGS=--opt=fast-math diff --git a/examples/deferred/deferred_shading.vcxproj b/examples/deferred/deferred_shading.vcxproj index 9a2a64bf..cd361b26 100755 --- a/examples/deferred/deferred_shading.vcxproj +++ b/examples/deferred/deferred_shading.vcxproj @@ -1,153 +1,13 @@ - - - Debug - Win32 - - - Debug - x64 - - - Release - Win32 - - - Release - x64 - - {87f53c53-957e-4e91-878a-bc27828fb9eb} Win32Proj - mandelbrot + deferred + kernels + sse2,sse4-x2,avx1-x2 - - - Application - true - Unicode - - - Application - true - Unicode - - - Application - false - true - Unicode - - - Application - false - true - Unicode - - - - - - - - - - - - - - - - - - - true - $(ProjectDir)..\..;$(ExecutablePath) - - - true - $(ProjectDir)..\..;$(ExecutablePath) - - - 
false - $(ProjectDir)..\..;$(ExecutablePath) - - - false - $(ProjectDir)..\..;$(ExecutablePath) - - - - - - Level3 - Disabled - WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - true - Fast - - - Console - true - - - - - - - Level3 - Disabled - WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - true - Fast - - - Console - true - - - - - Level3 - - - MaxSpeed - true - true - WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - Fast - - - Console - true - true - true - - - - - Level3 - - - MaxSpeed - true - true - WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - Fast - - - Console - true - true - true - - + @@ -155,24 +15,4 @@ - - - Document - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2 - - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2 - - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2 - - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2 - - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - - - - - diff --git a/examples/examples.sln b/examples/examples.sln index e9992f76..2285f6a6 100755 --- a/examples/examples.sln +++ b/examples/examples.sln @@ -25,6 +25,8 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "deferred_shading", "deferre EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "perfbench", "perfbench\perfbench.vcxproj", "{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}" EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "sort", "sort\sort.vcxproj", "{6D3EF8C5-AE26-407B-9ECE-C27CB988D9C2}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Win32 = Debug|Win32 @@ -129,6 +131,14 @@ Global {D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}.Release|Win32.Build.0 = Release|Win32 {D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}.Release|x64.ActiveCfg = Release|x64 {D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}.Release|x64.Build.0 = Release|x64 + {6D3EF8C5-AE26-407B-9ECE-C27CB988D9C2}.Debug|Win32.ActiveCfg = Debug|Win32 + {6D3EF8C5-AE26-407B-9ECE-C27CB988D9C2}.Debug|Win32.Build.0 = Debug|Win32 + {6D3EF8C5-AE26-407B-9ECE-C27CB988D9C2}.Debug|x64.ActiveCfg = Debug|x64 + {6D3EF8C5-AE26-407B-9ECE-C27CB988D9C2}.Debug|x64.Build.0 = Debug|x64 + {6D3EF8C5-AE26-407B-9ECE-C27CB988D9C2}.Release|Win32.ActiveCfg = Release|Win32 + {6D3EF8C5-AE26-407B-9ECE-C27CB988D9C2}.Release|Win32.Build.0 = Release|Win32 + {6D3EF8C5-AE26-407B-9ECE-C27CB988D9C2}.Release|x64.ActiveCfg = Release|x64 + {6D3EF8C5-AE26-407B-9ECE-C27CB988D9C2}.Release|x64.Build.0 = Release|x64 EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE diff --git a/examples/gmres/Makefile b/examples/gmres/Makefile index 5b57cbf8..07765069 100644 
--- a/examples/gmres/Makefile +++ b/examples/gmres/Makefile @@ -3,7 +3,7 @@ EXAMPLE=gmres CPP_SRC=algorithm.cpp main.cpp matrix.cpp CC_SRC=mmio.c ISPC_SRC=matrix.ispc -ISPC_IA_TARGETS=sse2,sse4-x2,avx-x2 +ISPC_IA_TARGETS=sse2-i32x4,sse4-i32x8,avx1-i32x16,avx2-i32x16 ISPC_ARM_TARGETS=neon include ../common.mk diff --git a/examples/intrinsics/generic-16.h b/examples/intrinsics/generic-16.h index d81101f7..fa794276 100644 --- a/examples/intrinsics/generic-16.h +++ b/examples/intrinsics/generic-16.h @@ -311,6 +311,17 @@ static FORCEINLINE VTYPE __rotate_##NAME(VTYPE v, int index) { \ return ret; \ } \ +#define SHIFT(VTYPE, NAME, STYPE) \ +static FORCEINLINE VTYPE __shift_##NAME(VTYPE v, int index) { \ + VTYPE ret; \ + for (int i = 0; i < 16; ++i) { \ + int modIndex = i+index; \ + STYPE val = ((modIndex >= 0) && (modIndex < 16)) ? v.v[modIndex] : 0; \ + ret.v[i] = val; \ + } \ + return ret; \ +} \ + #define SHUFFLES(VTYPE, NAME, STYPE) \ static FORCEINLINE VTYPE __shuffle_##NAME(VTYPE v, __vec16_i32 index) { \ VTYPE ret; \ @@ -492,6 +503,7 @@ SETZERO(__vec16_i8, i8) UNDEF(__vec16_i8, i8) BROADCAST(__vec16_i8, i8, int8_t) ROTATE(__vec16_i8, i8, int8_t) +SHIFT(__vec16_i8, i8, int8_t) SHUFFLES(__vec16_i8, i8, int8_t) LOAD_STORE(__vec16_i8, int8_t) @@ -537,6 +549,7 @@ SETZERO(__vec16_i16, i16) UNDEF(__vec16_i16, i16) BROADCAST(__vec16_i16, i16, int16_t) ROTATE(__vec16_i16, i16, int16_t) +SHIFT(__vec16_i16, i16, int16_t) SHUFFLES(__vec16_i16, i16, int16_t) LOAD_STORE(__vec16_i16, int16_t) @@ -582,6 +595,7 @@ SETZERO(__vec16_i32, i32) UNDEF(__vec16_i32, i32) BROADCAST(__vec16_i32, i32, int32_t) ROTATE(__vec16_i32, i32, int32_t) +SHIFT(__vec16_i32, i32, int32_t) SHUFFLES(__vec16_i32, i32, int32_t) LOAD_STORE(__vec16_i32, int32_t) @@ -627,6 +641,7 @@ SETZERO(__vec16_i64, i64) UNDEF(__vec16_i64, i64) BROADCAST(__vec16_i64, i64, int64_t) ROTATE(__vec16_i64, i64, int64_t) +SHIFT(__vec16_i64, i64, int64_t) SHUFFLES(__vec16_i64, i64, int64_t) LOAD_STORE(__vec16_i64, int64_t) @@ -672,6 +687,7 @@ SETZERO(__vec16_f, float) UNDEF(__vec16_f, float) BROADCAST(__vec16_f, float, float) ROTATE(__vec16_f, float, float) +SHIFT(__vec16_f, float, float) SHUFFLES(__vec16_f, float, float) LOAD_STORE(__vec16_f, float) @@ -832,6 +848,7 @@ SETZERO(__vec16_d, double) UNDEF(__vec16_d, double) BROADCAST(__vec16_d, double, double) ROTATE(__vec16_d, double, double) +SHIFT(__vec16_d, double, double) SHUFFLES(__vec16_d, double, double) LOAD_STORE(__vec16_d, double) diff --git a/examples/intrinsics/knc-i1x16.h b/examples/intrinsics/knc-i1x16.h new file mode 100644 index 00000000..ef14d26e --- /dev/null +++ b/examples/intrinsics/knc-i1x16.h @@ -0,0 +1,2776 @@ +/** + Copyright (c) 2010-2013, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. 
+ + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include +#include +#include +#include +#include +#include + +#ifdef _MSC_VER +#define FORCEINLINE __forceinline +#define PRE_ALIGN(x) /*__declspec(align(x))*/ +#define POST_ALIGN(x) +#define roundf(x) (floorf(x + .5f)) +#define round(x) (floor(x + .5)) +#else +#define FORCEINLINE __forceinline +#define PRE_ALIGN(x) +#define POST_ALIGN(x) __attribute__ ((aligned(x))) +#endif + +#define KNC 1 +#if 0 +extern "C" +{ + int printf(const unsigned char *, ...); + int puts(unsigned char *); + unsigned int putchar(unsigned int); + int fflush(void *); + uint8_t *memcpy(uint8_t *, uint8_t *, uint64_t); + uint8_t *memset(uint8_t *, uint8_t, uint64_t); + void memset_pattern16(void *, const void *, uint64_t); +} +#endif + +typedef float __vec1_f; +typedef double __vec1_d; +typedef int8_t __vec1_i8; +typedef int16_t __vec1_i16; +typedef int32_t __vec1_i32; +typedef int64_t __vec1_i64; + +/************ mask **************/ + +struct __vec16_i1 +{ + __mmask16 v; + + FORCEINLINE __vec16_i1() { } + FORCEINLINE __vec16_i1(const __mmask16 &vv) : v(vv) { } + FORCEINLINE __vec16_i1(bool v0, bool v1, bool v2, bool v3, + bool v4, bool v5, bool v6, bool v7, + bool v8, bool v9, bool v10, bool v11, + bool v12, bool v13, bool v14, bool v15) { + v = ((v0 & 1) | + ((v1 & 1) << 1) | + ((v2 & 1) << 2) | + ((v3 & 1) << 3) | + ((v4 & 1) << 4) | + ((v5 & 1) << 5) | + ((v6 & 1) << 6) | + ((v7 & 1) << 7) | + ((v8 & 1) << 8) | + ((v9 & 1) << 9) | + ((v10 & 1) << 10) | + ((v11 & 1) << 11) | + ((v12 & 1) << 12) | + ((v13 & 1) << 13) | + ((v14 & 1) << 14) | + ((v15 & 1) << 15)); + } + + FORCEINLINE operator __mmask16() const { return v; } +}; + +/************ vector **************/ + +struct PRE_ALIGN(64) __vec16_i32 +{ + __m512i v; + FORCEINLINE operator __m512i() const { return v; } + FORCEINLINE __vec16_i32() : v(_mm512_undefined_epi32()) {} + FORCEINLINE __vec16_i32(const int32_t &in) : v(_mm512_set1_epi32(in)) {} + FORCEINLINE __vec16_i32(const __m512i &in) : v(in) {} + FORCEINLINE __vec16_i32(const __vec16_i32 &o) : v(o.v) {} + FORCEINLINE __vec16_i32& operator =(const __vec16_i32 &o) { v=o.v; return *this; } + FORCEINLINE __vec16_i32(int32_t v00, int32_t v01, int32_t v02, int32_t v03, + int32_t v04, int32_t v05, int32_t v06, int32_t v07, + int32_t v08, int32_t v09, int32_t v10, int32_t v11, + int32_t v12, int32_t v13, int32_t v14, int32_t v15) : + v ( _mm512_set_16to16_pi(v15, v14, v13, v12, v11, v10, v09, v08, v07, v06, v05, v04, v03, v02, v01, v00) ) {} + FORCEINLINE const int32_t& operator[](const int i) const { return ((int32_t*)this)[i]; } + FORCEINLINE int32_t& operator[](const int i) { return ((int32_t*)this)[i]; } +} POST_ALIGN(64); + +PRE_ALIGN(64) struct __vec16_f +{ + __m512 v; + FORCEINLINE operator __m512() const { return v; } + 
FORCEINLINE __vec16_f() : v(_mm512_undefined_ps()) { } + FORCEINLINE __vec16_f(const __m512 &in) : v(in) {} + FORCEINLINE __vec16_f(const __vec16_f &o) : v(o.v) {} + FORCEINLINE __vec16_f& operator =(const __vec16_f &o) { v=o.v; return *this; } + FORCEINLINE __vec16_f(float v00, float v01, float v02, float v03, + float v04, float v05, float v06, float v07, + float v08, float v09, float v10, float v11, + float v12, float v13, float v14, float v15) : + v ( _mm512_set_16to16_ps(v15, v14, v13, v12, v11, v10, v09, v08, v07, v06, v05, v04, v03, v02, v01, v00) ) {} + FORCEINLINE const float& operator[](const int i) const { return ((float*)this)[i]; } + FORCEINLINE float& operator[](const int i) { return ((float*)this)[i]; } +} POST_ALIGN(64); + +static void zmm2hilo(const __m512i v1, const __m512i v2, __m512i &_hi, __m512i &_lo) +{ + _hi = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xFF00, + _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), + v2); + _hi = _mm512_mask_permutevar_epi32(_hi, 0x00FF, + _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), + v1); + _lo = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xFF00, + _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), + v2); + _lo = _mm512_mask_permutevar_epi32(_lo, 0x00FF, + _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), + v1); +} +static void hilo2zmm(const __m512i v_hi, const __m512i v_lo, __m512i &_v1, __m512i &_v2) +{ + _v2 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA, + _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8), + v_hi); + _v2 = _mm512_mask_permutevar_epi32(_v2, 0x5555, + _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8), + v_lo); + _v1 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA, + _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0), + v_hi); + _v1 = _mm512_mask_permutevar_epi32(_v1, 0x5555, + _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0), + v_lo); +} + +struct PRE_ALIGN(128) __vec16_d +{ + union { + __m512d v1; + __m512d v_hi; + }; + union { + __m512d v2; + __m512d v_lo; + }; + FORCEINLINE __vec16_d() : v1(_mm512_undefined_pd()), v2(_mm512_undefined_pd()) {} + FORCEINLINE __vec16_d(const __m512d _v1, const __m512d _v2) : v1(_v1), v2(_v2) {} + FORCEINLINE __vec16_d(const __vec16_d &o) : v1(o.v1), v2(o.v2) {} + FORCEINLINE __vec16_d& operator =(const __vec16_d &o) { v1=o.v1; v2=o.v2; return *this; } + FORCEINLINE __vec16_d(double v00, double v01, double v02, double v03, + double v04, double v05, double v06, double v07, + double v08, double v09, double v10, double v11, + double v12, double v13, double v14, double v15) { + v1 = _mm512_set_8to8_pd(v15, v14, v13, v12, v11, v10, v09, v08); + v2 = _mm512_set_8to8_pd(v07, v06, v05, v04, v03, v02, v01, v00); + } + FORCEINLINE const double& operator[](const int i) const { return ((double*)this)[i]; } + FORCEINLINE double& operator[](const int i) { return ((double*)this)[i]; } + FORCEINLINE __vec16_d cvt2hilo() const + { + const __m512i _v1 = _mm512_castpd_si512(v1); + const __m512i _v2 = _mm512_castpd_si512(v2); + __m512i _hi, _lo; + zmm2hilo(_v1, _v2, _hi, _lo); + return __vec16_d(_mm512_castsi512_pd(_hi), _mm512_castsi512_pd(_lo)); + } + FORCEINLINE __vec16_d cvt2zmm() const + { + const __m512i _hi = _mm512_castpd_si512(v_hi); + const __m512i _lo = _mm512_castpd_si512(v_lo); + __m512i _v1, _v2; + hilo2zmm(_hi,_lo, _v1,_v2); + return __vec16_d(_mm512_castsi512_pd(_v1), _mm512_castsi512_pd(_v2)); + } +} POST_ALIGN(128); + +struct PRE_ALIGN(128) 
__vec16_i64 +{ + union { + __m512i v1; + __m512i v_hi; + }; + union + { + __m512i v2; + __m512i v_lo; + }; + FORCEINLINE __vec16_i64() : v1(_mm512_undefined_epi32()), v2(_mm512_undefined_epi32()) {} + FORCEINLINE __vec16_i64(const __m512i _v1, const __m512i _v2) : v1(_v1), v2(_v2) {} + FORCEINLINE __vec16_i64(const __vec16_i64 &o) : v1(o.v1), v2(o.v2) {} + FORCEINLINE __vec16_i64& operator =(const __vec16_i64 &o) { v1=o.v1; v2=o.v2; return *this; } + FORCEINLINE __vec16_i64(int64_t v00, int64_t v01, int64_t v02, int64_t v03, + int64_t v04, int64_t v05, int64_t v06, int64_t v07, + int64_t v08, int64_t v09, int64_t v10, int64_t v11, + int64_t v12, int64_t v13, int64_t v14, int64_t v15) { + v2 = _mm512_set_8to8_epi64(v15, v14, v13, v12, v11, v10, v09, v08); + v1 = _mm512_set_8to8_epi64(v07, v06, v05, v04, v03, v02, v01, v00); + } + FORCEINLINE const int64_t& operator[](const int i) const { return ((int64_t*)this)[i]; } + FORCEINLINE int64_t& operator[](const int i) { return ((int64_t*)this)[i]; } + FORCEINLINE __vec16_i64 cvt2hilo() const + { + __vec16_i64 ret; + zmm2hilo(v1,v2,ret.v_hi,ret.v_lo); + return ret; + } + FORCEINLINE __vec16_i64 cvt2zmm() const + { + __vec16_i64 ret; + hilo2zmm(v_hi,v_lo, ret.v1, ret.v2); + return ret; + } +} POST_ALIGN(128); + +/************ scalar **************/ + +template +struct vec16 +{ + FORCEINLINE vec16() { } + FORCEINLINE vec16(T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, + T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) { + data[0] = v0; data[1] = v1; data[2] = v2; data[3] = v3; + data[4] = v4; data[5] = v5; data[6] = v6; data[7] = v7; + data[8] = v8; data[9] = v9; data[10] = v10; data[11] = v11; + data[12] = v12; data[13] = v13; data[14] = v14; data[15] = v15; + } + T data[16]; + FORCEINLINE const T& operator[](const int i) const { return data[i]; } + FORCEINLINE T& operator[](const int i) { return data[i]; } +}; + +PRE_ALIGN(16) struct __vec16_i8 : public vec16 { + __vec16_i8() { } + __vec16_i8(int8_t v0, int8_t v1, int8_t v2, int8_t v3, + int8_t v4, int8_t v5, int8_t v6, int8_t v7, + int8_t v8, int8_t v9, int8_t v10, int8_t v11, + int8_t v12, int8_t v13, int8_t v14, int8_t v15) + : vec16(v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15) { } +} POST_ALIGN(16); + +PRE_ALIGN(32) struct __vec16_i16 : public vec16 { + __vec16_i16() { } + __vec16_i16(int16_t v0, int16_t v1, int16_t v2, int16_t v3, + int16_t v4, int16_t v5, int16_t v6, int16_t v7, + int16_t v8, int16_t v9, int16_t v10, int16_t v11, + int16_t v12, int16_t v13, int16_t v14, int16_t v15) + : vec16(v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15) { } +} POST_ALIGN(32); + +static inline int32_t __extract_element(__vec16_i32, int); + + +/////////////////////////////////////////////////////////////////////////// +// macros... 
+ +/* knc::macro::used */ +#define BINARY_OP(TYPE, NAME, OP) \ +static FORCEINLINE TYPE NAME(TYPE a, TYPE b) { \ + TYPE ret; \ + for (int i = 0; i < 16; ++i) \ + ret[i] = a[i] OP b[i]; \ + return ret; \ +} + +/* knc::macro::used */ +#define BINARY_OP_CAST(TYPE, CAST, NAME, OP) \ +static FORCEINLINE TYPE NAME(TYPE a, TYPE b) { \ + TYPE ret; \ + for (int i = 0; i < 16; ++i) \ + ret[i] = (CAST)(a[i]) OP (CAST)(b[i]); \ + return ret; \ +} + +/* knc::macro::used */ +#define BINARY_OP_FUNC(TYPE, NAME, FUNC) \ +static FORCEINLINE TYPE NAME(TYPE a, TYPE b) { \ + TYPE ret; \ + for (int i = 0; i < 16; ++i) \ + ret[i] = FUNC(a[i], b[i]); \ + return ret; \ +} + +/* knc::macro::used */ +#define CMP_OP(TYPE, SUFFIX, CAST, NAME, OP) \ +static FORCEINLINE __vec16_i1 NAME##_##SUFFIX(TYPE a, TYPE b) { \ + __vec16_i1 ret; \ + ret.v = 0; \ + for (int i = 0; i < 16; ++i) \ + ret.v |= ((CAST)(a[i]) OP (CAST)(b[i])) << i; \ + return ret; \ +} \ +static FORCEINLINE __vec16_i1 NAME##_##SUFFIX##_and_mask(TYPE a, TYPE b, \ + __vec16_i1 mask) { \ + __vec16_i1 ret; \ + ret.v = 0; \ + for (int i = 0; i < 16; ++i) \ + ret.v |= ((CAST)(a[i]) OP (CAST)(b[i])) << i; \ + ret.v &= mask.v; \ + return ret; \ +} + +/* knc::macro::used */ +#define INSERT_EXTRACT(VTYPE, STYPE) \ +static FORCEINLINE STYPE __extract_element(VTYPE v, int index) { \ + return ((STYPE *)&v)[index]; \ +} \ +static FORCEINLINE void __insert_element(VTYPE *v, int index, STYPE val) { \ + ((STYPE *)v)[index] = val; \ +} + +/* knc::macro::used */ +#define LOAD_STORE(VTYPE, STYPE) \ +template \ +static FORCEINLINE VTYPE __load(const VTYPE *p) { \ + STYPE *ptr = (STYPE *)p; \ + VTYPE ret; \ + for (int i = 0; i < 16; ++i) \ + ret[i] = ptr[i]; \ + return ret; \ +} \ +template \ +static FORCEINLINE void __store(VTYPE *p, VTYPE v) { \ + STYPE *ptr = (STYPE *)p; \ + for (int i = 0; i < 16; ++i) \ + ptr[i] = v[i]; \ +} + +/* knc::macro::used */ +#define REDUCE_ADD(TYPE, VTYPE, NAME) \ +static FORCEINLINE TYPE NAME(VTYPE v) { \ + TYPE ret = v[0]; \ + for (int i = 1; i < 16; ++i) \ + ret = ret + v[i]; \ + return ret; \ +} + +/* knc::macro::used */ +#define REDUCE_MINMAX(TYPE, VTYPE, NAME, OP) \ +static FORCEINLINE TYPE NAME(VTYPE v) { \ + TYPE ret = v[0]; \ + for (int i = 1; i < 16; ++i) \ + ret = (ret OP (TYPE)v[i]) ? 
ret : (TYPE)v[i]; \ + return ret; \ +} + +/* knc::macro::used */ +#define SELECT(TYPE) \ +static FORCEINLINE TYPE __select(__vec16_i1 mask, TYPE a, TYPE b) { \ + TYPE ret; \ + for (int i = 0; i < 16; ++i) \ + ret[i] = (mask.v & (1< VTYPE __smear_##NAME(STYPE); \ +template <> FORCEINLINE VTYPE __smear_##NAME(STYPE v) { \ + VTYPE ret; \ + for (int i = 0; i < 16; ++i) \ + ret[i] = v; \ + return ret; \ +} + +/* knc::macro::used */ +#define SETZERO(VTYPE, NAME) \ +template VTYPE __setzero_##NAME(); \ +template <> FORCEINLINE VTYPE __setzero_##NAME() { \ + VTYPE ret; \ + for (int i = 0; i < 16; ++i) \ + ret[i] = 0; \ + return ret; \ +} + +/* knc::macro::used */ +#define UNDEF(VTYPE, NAME) \ +template VTYPE __undef_##NAME(); \ +template <> FORCEINLINE VTYPE __undef_##NAME() { \ + return VTYPE(); \ +} + +/* knc::macro::used */ +#define BROADCAST(VTYPE, NAME, STYPE) \ +static FORCEINLINE VTYPE __broadcast_##NAME(VTYPE v, int index) { \ + VTYPE ret; \ + for (int i = 0; i < 16; ++i) \ + ret[i] = v[index & 0xf]; \ + return ret; \ +} \ + +/* knc::macro::used */ +#define ROTATE(VTYPE, NAME, STYPE) \ +static FORCEINLINE VTYPE __rotate_##NAME(VTYPE v, int index) { \ + VTYPE ret; \ + for (int i = 0; i < 16; ++i) \ + ret[i] = v[(i+index) & 0xf]; \ + return ret; \ +} \ + +#define SHIFT(VTYPE, NAME, STYPE) \ +static FORCEINLINE VTYPE __shift_##NAME(VTYPE v, int index) { \ + VTYPE ret; \ + for (int i = 0; i < 16; ++i) { \ + int modIndex = i+index; \ + STYPE val = ((modIndex >= 0) && (modIndex < 16)) ? v[modIndex] : 0; \ + ret[i] = val; \ + } \ + return ret; \ +} \ + +/* knc::macro::used */ +#define SHUFFLES(VTYPE, NAME, STYPE) \ +static FORCEINLINE VTYPE __shuffle_##NAME(VTYPE v, __vec16_i32 index) { \ + VTYPE ret; \ + for (int i = 0; i < 16; ++i) \ + ret[i] = v[__extract_element(index, i) & 0xf]; \ + return ret; \ +} \ +static FORCEINLINE VTYPE __shuffle2_##NAME(VTYPE v0, VTYPE v1, __vec16_i32 index) { \ + VTYPE ret; \ + for (int i = 0; i < 16; ++i) { \ + int ii = __extract_element(index, i) & 0x1f; \ + ret[i] = (ii < 16) ? 
v0[ii] : v1[ii-16]; \ + } \ + return ret; \ +} + +/////////////////////////////////////////////////////////////////////////// + +INSERT_EXTRACT(__vec1_i8, int8_t) +INSERT_EXTRACT(__vec1_i16, int16_t) +INSERT_EXTRACT(__vec1_i32, int32_t) +INSERT_EXTRACT(__vec1_i64, int64_t) +INSERT_EXTRACT(__vec1_f, float) +INSERT_EXTRACT(__vec1_d, double) + +/////////////////////////////////////////////////////////////////////////// +// mask +/////////////////////////////////////////////////////////////////////////// + +static FORCEINLINE __vec16_i1 __movmsk(__vec16_i1 mask) { return _mm512_kmov (mask); } +static FORCEINLINE bool __any (__vec16_i1 mask) { return !_mm512_kortestz(mask, mask); } +static FORCEINLINE bool __all (__vec16_i1 mask) { return _mm512_kortestc(mask, mask); } +static FORCEINLINE bool __none (__vec16_i1 mask) { return _mm512_kortestz(mask, mask); } +static FORCEINLINE __vec16_i1 __not (__vec16_i1 mask) { return _mm512_knot (mask); } + +static FORCEINLINE __vec16_i1 __equal_i1(__vec16_i1 a, __vec16_i1 b) { return _mm512_kxnor (a,b); } +static FORCEINLINE __vec16_i1 __and (__vec16_i1 a, __vec16_i1 b) { return _mm512_kand (a,b); } +static FORCEINLINE __vec16_i1 __xor (__vec16_i1 a, __vec16_i1 b) { return _mm512_kxor (a,b); } +static FORCEINLINE __vec16_i1 __or (__vec16_i1 a, __vec16_i1 b) { return _mm512_kor (a,b); } +static FORCEINLINE __vec16_i1 __and_not1(__vec16_i1 a, __vec16_i1 b) { return _mm512_kandn (a,b); } +static FORCEINLINE __vec16_i1 __and_not2(__vec16_i1 a, __vec16_i1 b) { return _mm512_kandnr(a,b); } + +static FORCEINLINE __vec16_i1 __select(__vec16_i1 mask, __vec16_i1 a, __vec16_i1 b) { return __or(__and(a, mask), __and_not2(b, mask)); } +static FORCEINLINE __vec16_i1 __select( bool cond, __vec16_i1 a, __vec16_i1 b) { return cond ? a : b; } + +static FORCEINLINE bool __extract_element(__vec16_i1 vec, int index) { return (vec.v & (1 << index)) ? 
true : false; } +static FORCEINLINE void __insert_element(__vec16_i1 *vec, int index, bool val) +{ + if (val == false) vec->v &= ~(1 << index); + else vec->v |= (1 << index); +} + +template static FORCEINLINE __vec16_i1 __load(const __vec16_i1 *p) +{ + return *p; +} + +template static FORCEINLINE void __store(__vec16_i1 *p, __vec16_i1 v) +{ + *p = v; +} + +template RetVecType __smear_i1(int i); +template <> static FORCEINLINE __vec16_i1 __smear_i1<__vec16_i1>(int i) { return i?0xFFFF:0x0; } + +template RetVecType __setzero_i1(); +template <> static FORCEINLINE __vec16_i1 __setzero_i1<__vec16_i1>() { return 0; } + +template __vec16_i1 __undef_i1(); +template <> FORCEINLINE __vec16_i1 __undef_i1<__vec16_i1>() { return __vec16_i1(); } + +/////////////////////////////////////////////////////////////////////////// +// int8 +/////////////////////////////////////////////////////////////////////////// + +BINARY_OP(__vec16_i8, __add, +) +BINARY_OP(__vec16_i8, __sub, -) +BINARY_OP(__vec16_i8, __mul, *) + +BINARY_OP(__vec16_i8, __or, |) +BINARY_OP(__vec16_i8, __and, &) +BINARY_OP(__vec16_i8, __xor, ^) +BINARY_OP(__vec16_i8, __shl, <<) + +BINARY_OP_CAST(__vec16_i8, uint8_t, __udiv, /) +BINARY_OP_CAST(__vec16_i8, int8_t, __sdiv, /) + +BINARY_OP_CAST(__vec16_i8, uint8_t, __urem, %) +BINARY_OP_CAST(__vec16_i8, int8_t, __srem, %) +BINARY_OP_CAST(__vec16_i8, uint8_t, __lshr, >>) +BINARY_OP_CAST(__vec16_i8, int8_t, __ashr, >>) + +SHIFT_UNIFORM(__vec16_i8, uint8_t, __lshr, >>) +SHIFT_UNIFORM(__vec16_i8, int8_t, __ashr, >>) +SHIFT_UNIFORM(__vec16_i8, int8_t, __shl, <<) + +CMP_OP(__vec16_i8, i8, int8_t, __equal, ==) +CMP_OP(__vec16_i8, i8, int8_t, __not_equal, !=) +CMP_OP(__vec16_i8, i8, uint8_t, __unsigned_less_equal, <=) +CMP_OP(__vec16_i8, i8, int8_t, __signed_less_equal, <=) +CMP_OP(__vec16_i8, i8, uint8_t, __unsigned_greater_equal, >=) +CMP_OP(__vec16_i8, i8, int8_t, __signed_greater_equal, >=) +CMP_OP(__vec16_i8, i8, uint8_t, __unsigned_less_than, <) +CMP_OP(__vec16_i8, i8, int8_t, __signed_less_than, <) +CMP_OP(__vec16_i8, i8, uint8_t, __unsigned_greater_than, >) +CMP_OP(__vec16_i8, i8, int8_t, __signed_greater_than, >) + +SELECT(__vec16_i8) +INSERT_EXTRACT(__vec16_i8, int8_t) +SMEAR(__vec16_i8, i8, int8_t) +SETZERO(__vec16_i8, i8) +UNDEF(__vec16_i8, i8) +BROADCAST(__vec16_i8, i8, int8_t) +ROTATE(__vec16_i8, i8, int8_t) +SHIFT(__vec16_i8, i8, int8_t) +SHUFFLES(__vec16_i8, i8, int8_t) +LOAD_STORE(__vec16_i8, int8_t) + +/////////////////////////////////////////////////////////////////////////// +// int16 +/////////////////////////////////////////////////////////////////////////// + +BINARY_OP(__vec16_i16, __add, +) +BINARY_OP(__vec16_i16, __sub, -) +BINARY_OP(__vec16_i16, __mul, *) + +BINARY_OP(__vec16_i16, __or, |) +BINARY_OP(__vec16_i16, __and, &) +BINARY_OP(__vec16_i16, __xor, ^) +BINARY_OP(__vec16_i16, __shl, <<) + +BINARY_OP_CAST(__vec16_i16, uint16_t, __udiv, /) +BINARY_OP_CAST(__vec16_i16, int16_t, __sdiv, /) + +BINARY_OP_CAST(__vec16_i16, uint16_t, __urem, %) +BINARY_OP_CAST(__vec16_i16, int16_t, __srem, %) +BINARY_OP_CAST(__vec16_i16, uint16_t, __lshr, >>) +BINARY_OP_CAST(__vec16_i16, int16_t, __ashr, >>) + +SHIFT_UNIFORM(__vec16_i16, uint16_t, __lshr, >>) +SHIFT_UNIFORM(__vec16_i16, int16_t, __ashr, >>) +SHIFT_UNIFORM(__vec16_i16, int16_t, __shl, <<) + +CMP_OP(__vec16_i16, i16, int16_t, __equal, ==) +CMP_OP(__vec16_i16, i16, int16_t, __not_equal, !=) +CMP_OP(__vec16_i16, i16, uint16_t, __unsigned_less_equal, <=) +CMP_OP(__vec16_i16, i16, int16_t, __signed_less_equal, <=) +CMP_OP(__vec16_i16, 
i16, uint16_t, __unsigned_greater_equal, >=) +CMP_OP(__vec16_i16, i16, int16_t, __signed_greater_equal, >=) +CMP_OP(__vec16_i16, i16, uint16_t, __unsigned_less_than, <) +CMP_OP(__vec16_i16, i16, int16_t, __signed_less_than, <) +CMP_OP(__vec16_i16, i16, uint16_t, __unsigned_greater_than, >) +CMP_OP(__vec16_i16, i16, int16_t, __signed_greater_than, >) + +SELECT(__vec16_i16) +INSERT_EXTRACT(__vec16_i16, int16_t) +SMEAR(__vec16_i16, i16, int16_t) +SETZERO(__vec16_i16, i16) +UNDEF(__vec16_i16, i16) +BROADCAST(__vec16_i16, i16, int16_t) +ROTATE(__vec16_i16, i16, int16_t) +SHIFT(__vec16_i16, i16, int16_t) +SHUFFLES(__vec16_i16, i16, int16_t) +LOAD_STORE(__vec16_i16, int16_t) + +/////////////////////////////////////////////////////////////////////////// +// int32 +/////////////////////////////////////////////////////////////////////////// + +static FORCEINLINE __vec16_i32 __add (__vec16_i32 a, __vec16_i32 b) { return _mm512_add_epi32 (a,b); } +static FORCEINLINE __vec16_i32 __sub (__vec16_i32 a, __vec16_i32 b) { return _mm512_sub_epi32 (a,b); } +static FORCEINLINE __vec16_i32 __mul (__vec16_i32 a, __vec16_i32 b) { return _mm512_mullo_epi32(a,b); } +static FORCEINLINE __vec16_i32 __udiv(__vec16_i32 a, __vec16_i32 b) { return _mm512_div_epu32 (a,b); } +static FORCEINLINE __vec16_i32 __sdiv(__vec16_i32 a, __vec16_i32 b) { return _mm512_div_epi32 (a,b); } +static FORCEINLINE __vec16_i32 __urem(__vec16_i32 a, __vec16_i32 b) { return _mm512_rem_epu32 (a,b); } +static FORCEINLINE __vec16_i32 __srem(__vec16_i32 a, __vec16_i32 b) { return _mm512_rem_epi32 (a,b); } +static FORCEINLINE __vec16_i32 __or (__vec16_i32 a, __vec16_i32 b) { return _mm512_or_epi32 (a,b); } +static FORCEINLINE __vec16_i32 __and (__vec16_i32 a, __vec16_i32 b) { return _mm512_and_epi32 (a,b); } +static FORCEINLINE __vec16_i32 __xor (__vec16_i32 a, __vec16_i32 b) { return _mm512_xor_epi32 (a,b); } +static FORCEINLINE __vec16_i32 __shl (__vec16_i32 a, __vec16_i32 b) { return _mm512_sllv_epi32 (a,b); } +static FORCEINLINE __vec16_i32 __lshr(__vec16_i32 a, __vec16_i32 b) { return _mm512_srlv_epi32 (a,b); } +static FORCEINLINE __vec16_i32 __ashr(__vec16_i32 a, __vec16_i32 b) { return _mm512_srav_epi32 (a,b); } +static FORCEINLINE __vec16_i32 __shl (__vec16_i32 a, int32_t n) { return _mm512_slli_epi32 (a,n); } +static FORCEINLINE __vec16_i32 __lshr(__vec16_i32 a, int32_t n) { return _mm512_srli_epi32 (a,n); } +static FORCEINLINE __vec16_i32 __ashr(__vec16_i32 a, int32_t n) { return _mm512_srai_epi32 (a,n); } + +static FORCEINLINE __vec16_i1 __equal_i32 (__vec16_i32 a, __vec16_i32 b) { return _mm512_cmpeq_epi32_mask (a,b); } +static FORCEINLINE __vec16_i1 __not_equal_i32 (__vec16_i32 a, __vec16_i32 b) { return _mm512_cmpneq_epi32_mask(a,b); } +static FORCEINLINE __vec16_i1 __unsigned_less_equal_i32 (__vec16_i32 a, __vec16_i32 b) { return _mm512_cmple_epu32_mask (a,b); } +static FORCEINLINE __vec16_i1 __signed_less_equal_i32 (__vec16_i32 a, __vec16_i32 b) { return _mm512_cmple_epi32_mask (a,b); } +static FORCEINLINE __vec16_i1 __unsigned_greater_equal_i32(__vec16_i32 a, __vec16_i32 b) { return _mm512_cmpge_epu32_mask (a,b); } +static FORCEINLINE __vec16_i1 __signed_greater_equal_i32 (__vec16_i32 a, __vec16_i32 b) { return _mm512_cmpge_epi32_mask (a,b); } +static FORCEINLINE __vec16_i1 __unsigned_less_than_i32 (__vec16_i32 a, __vec16_i32 b) { return _mm512_cmplt_epu32_mask (a,b); } +static FORCEINLINE __vec16_i1 __signed_less_than_i32 (__vec16_i32 a, __vec16_i32 b) { return _mm512_cmplt_epi32_mask (a,b); } +static FORCEINLINE __vec16_i1 
__unsigned_greater_than_i32 (__vec16_i32 a, __vec16_i32 b) { return _mm512_cmpgt_epu32_mask (a,b); } +static FORCEINLINE __vec16_i1 __signed_greater_than_i32 (__vec16_i32 a, __vec16_i32 b) { return _mm512_cmpgt_epi32_mask (a,b); } + +static FORCEINLINE __vec16_i1 __equal_i32_and_mask (__vec16_i32 a, __vec16_i32 b, __vec16_i1 m) { return _mm512_mask_cmpeq_epi32_mask (m,a,b); } +static FORCEINLINE __vec16_i1 __not_equal_i32_and_mask (__vec16_i32 a, __vec16_i32 b, __vec16_i1 m) { return _mm512_mask_cmpneq_epi32_mask(m,a,b); } +static FORCEINLINE __vec16_i1 __unsigned_less_equal_i32_and_mask (__vec16_i32 a, __vec16_i32 b, __vec16_i1 m) { return _mm512_mask_cmple_epu32_mask (m,a,b); } +static FORCEINLINE __vec16_i1 __signed_less_equal_i32_and_mask (__vec16_i32 a, __vec16_i32 b, __vec16_i1 m) { return _mm512_mask_cmple_epi32_mask (m,a,b); } +static FORCEINLINE __vec16_i1 __unsigned_greater_equal_i32_and_mask(__vec16_i32 a, __vec16_i32 b, __vec16_i1 m) { return _mm512_mask_cmpge_epu32_mask (m,a,b); } +static FORCEINLINE __vec16_i1 __signed_greater_equal_i32_and_mask (__vec16_i32 a, __vec16_i32 b, __vec16_i1 m) { return _mm512_mask_cmpge_epi32_mask (m,a,b); } +static FORCEINLINE __vec16_i1 __unsigned_less_than_i32_and_mask (__vec16_i32 a, __vec16_i32 b, __vec16_i1 m) { return _mm512_mask_cmplt_epu32_mask (m,a,b); } +static FORCEINLINE __vec16_i1 __signed_less_than_i32_and_mask (__vec16_i32 a, __vec16_i32 b, __vec16_i1 m) { return _mm512_mask_cmplt_epi32_mask (m,a,b); } +static FORCEINLINE __vec16_i1 __unsigned_greater_than_i32_and_mask (__vec16_i32 a, __vec16_i32 b, __vec16_i1 m) { return _mm512_mask_cmpgt_epu32_mask (m,a,b); } +static FORCEINLINE __vec16_i1 __signed_greater_than_i32_and_mask (__vec16_i32 a, __vec16_i32 b, __vec16_i1 m) { return _mm512_mask_cmpgt_epi32_mask (m,a,b); } + +static FORCEINLINE __vec16_i32 __select(__vec16_i1 mask, __vec16_i32 a, __vec16_i32 b) { return _mm512_mask_mov_epi32(b, mask, a); } +static FORCEINLINE __vec16_i32 __select( bool cond, __vec16_i32 a, __vec16_i32 b) { return cond ? 
a : b; } + +static FORCEINLINE int32_t __extract_element(__vec16_i32 v, int32_t index) { return v[index]; } +static FORCEINLINE void __insert_element (__vec16_i32 *v, uint32_t index, int32_t val) { (*v)[index] = val; } + +template RetVecType __smear_i32(int32_t i); +template <> static FORCEINLINE __vec16_i32 __smear_i32<__vec16_i32>(int32_t i) { return _mm512_set1_epi32(i); } + +static const __vec16_i32 __ispc_one = __smear_i32<__vec16_i32>(1); +static const __vec16_i32 __ispc_zero = __smear_i32<__vec16_i32>(0); +static const __vec16_i32 __ispc_thirty_two = __smear_i32<__vec16_i32>(32); +static const __vec16_i32 __ispc_ffffffff = __smear_i32<__vec16_i32>(-1); +static const __vec16_i32 __ispc_stride1(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + +template RetVecType __setzero_i32(); +template <> static FORCEINLINE __vec16_i32 __setzero_i32<__vec16_i32>() { return _mm512_setzero_epi32(); } + +template RetVecType __undef_i32(); +template <> static FORCEINLINE __vec16_i32 __undef_i32<__vec16_i32>() { return __vec16_i32(); } + +static FORCEINLINE __vec16_i32 __broadcast_i32(__vec16_i32 v, int index) { return _mm512_mask_permutevar_epi32(v, 0xFFFF, _mm512_set1_epi32(index), v); } + +static FORCEINLINE __vec16_i32 __rotate_i32(__vec16_i32 v, int index) +{ + __vec16_i32 idx = __smear_i32<__vec16_i32>(index); + __vec16_i32 shuffle = _mm512_and_epi32(_mm512_add_epi32(__ispc_stride1, idx), __smear_i32<__vec16_i32>(0xF)); + return _mm512_mask_permutevar_epi32(v, 0xFFFF, shuffle, v); +} + +SHIFT(__vec16_i32, i32, int32_t) + +static FORCEINLINE __vec16_i32 __shuffle_i32 (__vec16_i32 v, __vec16_i32 index) +{ + return _mm512_mask_permutevar_epi32(v, 0xFFFF, __and(index, __smear_i32<__vec16_i32>(0xF)), v); +} +static FORCEINLINE __vec16_i32 __shuffle2_i32(__vec16_i32 v0, __vec16_i32 v1, __vec16_i32 index) +{ + const __vec16_i1 mask = __signed_less_than_i32(index, __smear_i32<__vec16_i32>(0x10)); + index = __and(index, __smear_i32<__vec16_i32>(0xF)); + __vec16_i32 ret = __undef_i32<__vec16_i32>(); + ret = _mm512_mask_permutevar_epi32(ret, mask, index, v0); + ret = _mm512_mask_permutevar_epi32(ret, __not(mask), index, v1); + return ret; +} + +template static FORCEINLINE __vec16_i32 __load(const __vec16_i32 *p) +{ +#ifdef ISPC_FORCE_ALIGNED_MEMORY + // return __load<64>(p); + return _mm512_load_epi32(p); +#else + __vec16_i32 v; + v = _mm512_extloadunpacklo_epi32(v, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + v = _mm512_extloadunpackhi_epi32(v, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + return v; +#endif +} + +template static FORCEINLINE void __store(__vec16_i32 *p, __vec16_i32 v) +{ +#ifdef ISPC_FORCE_ALIGNED_MEMORY + // __store<64>(p,v); + _mm512_store_epi32(p, v); +#else + _mm512_extpackstorelo_epi32( p, v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_epi32((uint8_t*)p+64, v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); +#endif +} + +#if 0 /* knc::fails ./tests/foreach-25.ispc ./tests/forach-26.ispc ./tests/foreach-27.ispc */ +template <> static FORCEINLINE __vec16_i32 __load<64>(const __vec16_i32 *p) +{ + return _mm512_load_epi32(p); +} +template <> static FORCEINLINE void __store<64>(__vec16_i32 *p, __vec16_i32 v) +{ + _mm512_store_epi32(p, v); +} +#endif + +/////////////////////////////////////////////////////////////////////////// +// int64 +/////////////////////////////////////////////////////////////////////////// + +static FORCEINLINE __vec16_i64 __add(__vec16_i64 a, __vec16_i64 b) +{ + return __vec16_i64(_mm512_add_epi64(a.v1, b.v1), 
_mm512_add_epi64(a.v2,b.v2)); +} + +static FORCEINLINE __vec16_i64 __sub(__vec16_i64 _a, __vec16_i64 _b) +{ +#if __ICC >= 99999 /* compiler gate, icc >= 99999 will hopefully support _mm512_sub_epi64 */ + return __vec16_i64(_mm512_sub_epi64(_a.v1, _b.v1), _mm512_sub_epi64(_a.v2,_b.v2)); +#else + const __vec16_i64 a = _a.cvt2hilo(); + const __vec16_i64 b = _b.cvt2hilo(); + __vec16_i64 ret; + __mmask16 borrow = 0; + ret.v_lo = _mm512_subsetb_epi32(a.v_lo, b.v_lo, &borrow); + ret.v_hi = _mm512_sbb_epi32 (a.v_hi, borrow, b.v_hi, &borrow); + return ret.cvt2zmm(); +#endif +} + +static FORCEINLINE __vec16_i64 __mul(const __vec16_i32 &a, const __vec16_i64 &_b) +{ + const __vec16_i64 b = _b.cvt2hilo(); + return __vec16_i64(_mm512_mullo_epi32(a.v,b.v_lo), + _mm512_add_epi32(_mm512_mullo_epi32(a.v, b.v_hi), + _mm512_mulhi_epi32(a.v, b.v_lo))).cvt2zmm(); +} + +static FORCEINLINE __vec16_i64 __select(__vec16_i1 mask, __vec16_i64 a, __vec16_i64 b) +{ + __vec16_i64 ret; + ret.v1 = _mm512_mask_mov_epi64(b.v1, mask, a.v1); + ret.v2 = _mm512_mask_mov_epi64(b.v2, mask >> 8, a.v2); + return ret; +} + +#if __ICC >= 1400 /* compiler gate, icc >= 14.0.0 support _mm512_mullox_epi64 */ +static FORCEINLINE __vec16_i64 __mul(__vec16_i64 a, __vec16_i64 b) +{ + return __vec16_i64(_mm512_mullox_epi64(a.v1,b.v1), _mm512_mullox_epi64(a.v2,b.v2)); +} +#else /* __ICC >= 1400 */ +static FORCEINLINE void __abs_i32i64(__m512i &_hi, __m512i &_lo) +{ + /* abs(x) : + * mask = x >> 32; + * abs(x) = (x^mask) - mask + */ + const __vec16_i32 mask = __ashr(_hi, __ispc_thirty_two); + __vec16_i32 hi = __xor(_hi, mask); + __vec16_i32 lo = __xor(_lo, mask); + __mmask16 borrow = 0; + _lo = _mm512_subsetb_epi32(lo, mask, &borrow); + _hi = _mm512_sbb_epi32 (hi, borrow, mask, &borrow); +} +static FORCEINLINE __vec16_i64 __mul(__vec16_i64 _a, __vec16_i64 _b) +{ + __vec16_i64 a = _a.cvt2hilo(); + __vec16_i64 b = _b.cvt2hilo(); + /* sign = (a^b) >> 32, if sign == 0 then a*b >= 0, otherwise a*b < 0 */ + const __vec16_i1 sign = __not_equal_i32(__ashr(__xor(a.v_hi, b.v_hi), __ispc_thirty_two), __ispc_zero); + __abs_i32i64(a.v_hi, a.v_lo); /* abs(a) */ + __abs_i32i64(b.v_hi, b.v_lo); /* abs(b) */ + const __vec16_i32 lo_m1 = _mm512_mullo_epi32(a.v_lo, b.v_lo); + const __vec16_i32 hi_m1 = _mm512_mulhi_epu32(a.v_lo, b.v_lo); + const __vec16_i32 hi_m2 = _mm512_mullo_epi32(a.v_hi, b.v_lo); + const __vec16_i32 hi_m3 = _mm512_mullo_epi32(a.v_lo, b.v_hi); + __mmask16 carry; + const __vec16_i32 hi_p23 = _mm512_addsetc_epi32(hi_m2, hi_m3, &carry); + const __vec16_i32 hi = _mm512_adc_epi32(hi_p23, carry, hi_m1, &carry); + const __vec16_i32 lo = lo_m1; + const __vec16_i64 ret_abs = __vec16_i64(hi,lo).cvt2zmm(); + /* if sign != 0, means either a or b is negative, then negate the result */ + return __select(sign, __sub(__vec16_i64(__ispc_zero, __ispc_zero), ret_abs), ret_abs); +} +#endif /* __ICC >= 1400 */ + + +static FORCEINLINE __vec16_i64 __or (__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_or_epi64 (a.v1, b.v1), _mm512_or_epi64 (a.v2, b.v2)); } +static FORCEINLINE __vec16_i64 __and(__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_and_epi64(a.v1, b.v1), _mm512_and_epi64(a.v2, b.v2)); } +static FORCEINLINE __vec16_i64 __xor(__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_xor_epi64(a.v1, b.v1), _mm512_xor_epi64(a.v2, b.v2)); } + +static FORCEINLINE __vec16_i64 __udiv(__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_div_epu64(a.v1,b.v1), _mm512_div_epu64(a.v2,b.v2)); } +static FORCEINLINE __vec16_i64 __sdiv(__vec16_i64 a, 
__vec16_i64 b) { return __vec16_i64(_mm512_div_epi64(a.v1,b.v1), _mm512_div_epi64(a.v2,b.v2)); } + +static FORCEINLINE __vec16_i64 __urem(__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_rem_epu64(a.v1,b.v1), _mm512_rem_epu64(a.v2,b.v2)); } +static FORCEINLINE __vec16_i64 __srem(__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_rem_epi64(a.v1,b.v1), _mm512_rem_epi64(a.v2,b.v2)); } + + +static FORCEINLINE __vec16_i64 __shl(__vec16_i64 _a, __vec16_i64 _b) +{ + const __vec16_i64 a = _a.cvt2hilo(); + const __vec16_i64 b = _b.cvt2hilo(); + /* this is a safety gate in case b-shift >= 32 */ + const __vec16_i32 xfer = __select( + __signed_less_than_i32(b.v_lo, __ispc_thirty_two), + __lshr(a.v_lo, __sub(__ispc_thirty_two, b.v_lo)), + __shl (a.v_lo, __sub(b.v_lo, __ispc_thirty_two)) + ); + const __vec16_i32 hi = __or(__shl(a.v_hi, b.v_lo), xfer); + const __vec16_i32 lo = __shl(a.v_lo, b.v_lo); + return __vec16_i64(hi,lo).cvt2zmm(); +} +static FORCEINLINE __vec16_i64 __lshr(__vec16_i64 _a, __vec16_i64 _b) +{ + const __vec16_i64 a = _a.cvt2hilo(); + const __vec16_i64 b = _b.cvt2hilo(); + /* this is a safety gate in case b-shift >= 32 */ + const __vec16_i32 xfer = __select( + __signed_less_than_i32(b.v_lo, __ispc_thirty_two), + __shl (a.v_hi, __sub(__ispc_thirty_two, b.v_lo)), + __lshr(a.v_hi, __sub(b.v_lo, __ispc_thirty_two)) + ); + const __vec16_i32 lo = __or(__lshr(a.v_lo, b.v_lo), xfer); + const __vec16_i32 hi = __lshr(a.v_hi, b.v_lo); + return __vec16_i64(hi,lo).cvt2zmm(); +} +static FORCEINLINE __vec16_i64 __ashr(__vec16_i64 _a, __vec16_i64 _b) +{ + const __vec16_i64 a = _a.cvt2hilo(); + const __vec16_i64 b = _b.cvt2hilo(); + /* this is a safety gate in case b-shift >= 32 */ + const __vec16_i32 xfer = __select( + __signed_less_than_i32(b.v_lo, __ispc_thirty_two), + __shl (a.v_hi, __sub(__ispc_thirty_two, b.v_lo)), + __ashr(a.v_hi, __sub(b.v_lo, __ispc_thirty_two)) + ); + const __vec16_i32 lo = __or(__lshr(a.v_lo, b.v_lo), xfer); + const __vec16_i32 hi = __ashr(a.v_hi, b.v_lo); + return __vec16_i64(hi,lo).cvt2zmm(); +} + +template RetVecType __smear_i64(const int64_t &l); +template <> FORCEINLINE __vec16_i64 __smear_i64<__vec16_i64>(const int64_t &l) { return __vec16_i64(_mm512_set1_epi64(l), _mm512_set1_epi64(l)); } + +template RetVecType __setzero_i64(); +template <> FORCEINLINE __vec16_i64 __setzero_i64<__vec16_i64>() { return __vec16_i64(_mm512_setzero_epi32(), _mm512_setzero_epi32()); } + +template RetVecType __undef_i64(); +template <> FORCEINLINE __vec16_i64 __undef_i64<__vec16_i64>() { return __vec16_i64(_mm512_undefined_epi32(), _mm512_undefined_epi32()); } + +static FORCEINLINE __vec16_i64 __lshr(__vec16_i64 a, uint64_t shift) { return __lshr(a, __smear_i64<__vec16_i64>(shift)); } +static FORCEINLINE __vec16_i64 __ashr(__vec16_i64 a, int64_t shift) { return __ashr(a, __smear_i64<__vec16_i64>(shift)); } +static FORCEINLINE __vec16_i64 __shl (__vec16_i64 a, int64_t shift) { return __shl (a, __smear_i64<__vec16_i64>(shift)); } + +static FORCEINLINE __vec16_i1 __equal_i64(__vec16_i64 _a, __vec16_i64 _b) +{ + const __vec16_i64 a = _a.cvt2hilo(); + const __vec16_i64 b = _b.cvt2hilo(); + const __mmask16 lo_match = _mm512_cmpeq_epi32_mask(a.v_lo,b.v_lo); + return _mm512_mask_cmpeq_epi32_mask(lo_match,a.v_hi,b.v_hi); +} +static FORCEINLINE __vec16_i1 __equal_i64_and_mask(__vec16_i64 _a, __vec16_i64 _b, __vec16_i1 mask) +{ + const __vec16_i64 a = _a.cvt2hilo(); + const __vec16_i64 b = _b.cvt2hilo(); + __mmask16 lo_match = _mm512_cmpeq_epi32_mask(a.v_lo,b.v_lo); + __mmask16 
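// Sketch (illustrative, not from the patch) of the 64-bit shift emulation used
// by the __shl/__lshr/__ashr overloads above, which operate on (hi,lo) 32-bit
// halves.  The "safety gate" handles shift counts >= 32, where every bit of the
// result's high half comes from the low half.  vshl/vshr below model the
// per-lane 32-bit shifts, assumed (as the code above relies on) to return 0
// once the count reaches 32.
//
//   #include <cstdint>
//   uint64_t shl64_from_halves(uint32_t hi, uint32_t lo, uint32_t n) {   // 0 <= n < 64
//       auto vshl = [](uint32_t x, uint32_t k) { return k < 32 ? x << k : 0u; };
//       auto vshr = [](uint32_t x, uint32_t k) { return k < 32 ? x >> k : 0u; };
//       uint32_t xfer   = (n < 32) ? vshr(lo, 32 - n)    // bits crossing from lo into hi
//                                  : vshl(lo, n - 32);
//       uint32_t new_hi = vshl(hi, n) | xfer;
//       uint32_t new_lo = vshl(lo, n);
//       return ((uint64_t)new_hi << 32) | new_lo;
//   }
//
// __lshr and __ashr mirror this with the roles of the halves swapped and an
// arithmetic shift of the high half in the signed case.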
full_match = _mm512_mask_cmpeq_epi32_mask(lo_match,a.v_hi,b.v_hi); + return _mm512_kand(full_match, (__mmask16)mask); +} +static FORCEINLINE __vec16_i1 __not_equal_i64(__vec16_i64 a, __vec16_i64 b) +{ + return __not(__equal_i64(a,b)); +} +static FORCEINLINE __vec16_i1 __not_equal_i64_and_mask(__vec16_i64 a, __vec16_i64 b, __vec16_i1 mask) +{ + return __and(__not(__equal_i64(a,b)), mask); +} +CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_less_equal, <=) +CMP_OP(__vec16_i64, i64, int64_t, __signed_less_equal, <=) +CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_greater_equal, >=) +CMP_OP(__vec16_i64, i64, int64_t, __signed_greater_equal, >=) +CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_less_than, <) +CMP_OP(__vec16_i64, i64, int64_t, __signed_less_than, <) +CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_greater_than, >) +CMP_OP(__vec16_i64, i64, int64_t, __signed_greater_than, >) + + +INSERT_EXTRACT(__vec16_i64, int64_t) + + +#define CASTL2I(_v_, _v_hi_, _v_lo_) \ + __vec16_i32 _v_hi_, _v_lo_; \ + { \ + const __vec16_i64 v = _v_.cvt2hilo(); \ + _v_hi_ = v.v_hi; \ + _v_lo_ = v.v_lo; } +#define CASTI2L(_ret_hi_, _ret_lo_) \ + __vec16_i64(_ret_hi_, _ret_lo_).cvt2zmm() +static FORCEINLINE __vec16_i64 __broadcast_i64(__vec16_i64 _v, int index) +{ + CASTL2I(_v, v_hi, v_lo); + const __vec16_i32 ret_hi = __broadcast_i32(v_hi, index); + const __vec16_i32 ret_lo = __broadcast_i32(v_lo, index); + return CASTI2L(ret_hi, ret_lo); +} +static FORCEINLINE __vec16_i64 __rotate_i64(const __vec16_i64 _v, const int index) +{ + CASTL2I(_v, v_hi, v_lo); + const __vec16_i32 ret_hi = __rotate_i32(v_hi, index); + const __vec16_i32 ret_lo = __rotate_i32(v_lo, index); + return CASTI2L(ret_hi, ret_lo); +} +SHIFT(__vec16_i64, i64, int64_t) + +static FORCEINLINE __vec16_i64 __shuffle_double(__vec16_i64 _v, const __vec16_i32 index) +{ + CASTL2I(_v, v_hi, v_lo); + const __vec16_i32 ret_hi = __shuffle_i32(v_hi, index); + const __vec16_i32 ret_lo = __shuffle_i32(v_lo, index); + return CASTI2L(ret_hi, ret_lo); +} +static FORCEINLINE __vec16_i64 __shuffle2_double(__vec16_i64 _v0, __vec16_i64 _v1, const __vec16_i32 index) +{ + CASTL2I(_v0, v0_hi, v0_lo); + CASTL2I(_v1, v1_hi, v1_lo); + const __vec16_i32 ret_hi = __shuffle2_i32(v0_hi, v1_hi, index); + const __vec16_i32 ret_lo = __shuffle2_i32(v0_lo, v1_lo, index); + return CASTI2L(ret_hi, ret_lo); +} +#undef CASTI2L +#undef CASTL2I + +template static FORCEINLINE __vec16_i64 __load(const __vec16_i64 *p) +{ +#ifdef ISPC_FORCE_ALIGNED_MEMORY + // return __load<128>(p); + __m512i v2 = _mm512_load_epi32(p); + __m512i v1 = _mm512_load_epi32(((uint8_t*)p)+64); + return __vec16_i64(v2,v1); +#else + __vec16_i32 v1; + __vec16_i32 v2; + v2 = _mm512_extloadunpacklo_epi32(v2, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + v2 = _mm512_extloadunpackhi_epi32(v2, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + v1 = _mm512_extloadunpacklo_epi32(v1, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + v1 = _mm512_extloadunpackhi_epi32(v1, (uint8_t*)p+128, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + return __vec16_i64(v2,v1); +#endif +} + + +template static FORCEINLINE void __store(__vec16_i64 *p, __vec16_i64 v) +{ +#ifdef ISPC_FORCE_ALIGNED_MEMORY + // __store<128>(p,v); + __m512i v1 = v.v2; + __m512i v2 = v.v1; + _mm512_store_epi64(p, v2); + _mm512_store_epi64(((uint8_t*)p)+64, v1); +#else + __m512i v1 = v.v2; + __m512i v2 = v.v1; + _mm512_extpackstorelo_epi32(p, v2, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_epi32((uint8_t*)p+64, v2, _MM_DOWNCONV_EPI32_NONE, 
_MM_HINT_NONE); + _mm512_extpackstorelo_epi32((uint8_t*)p+64, v1, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_epi32((uint8_t*)p+128, v1, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); +#endif +} + +#if 0 /* knc::fails as with _i32 this may generate fails ... so commetining it out */ +template <> static FORCEINLINE __vec16_i64 __load<64>(const __vec16_i64 *p) +{ + __m512i v2 = _mm512_load_epi32(p); + __m512i v1 = _mm512_load_epi32(((uint8_t*)p)+64); + return __vec16_i64(v2,v1); +} +template <> static FORCEINLINE __vec16_i64 __load<128>(const __vec16_i64 *p) { return __load<64>(p); } +template <> static FORCEINLINE void __store<64>(__vec16_i64 *p, __vec16_i64 v) +{ + __m512i v1 = v.v2; + __m512i v2 = v.v1; + _mm512_store_epi64(p, v2); + _mm512_store_epi64(((uint8_t*)p)+64, v1); +} +template <> static FORCEINLINE void __store<128>(__vec16_i64 *p, __vec16_i64 v) { __store<64>(p, v); } +#endif + + +/////////////////////////////////////////////////////////////////////////// +// float +/////////////////////////////////////////////////////////////////////////// + +static FORCEINLINE __vec16_f __add(__vec16_f a, __vec16_f b) { return _mm512_add_ps(a,b); } +static FORCEINLINE __vec16_f __sub(__vec16_f a, __vec16_f b) { return _mm512_sub_ps(a,b); } +static FORCEINLINE __vec16_f __mul(__vec16_f a, __vec16_f b) { return _mm512_mul_ps(a,b); } +static FORCEINLINE __vec16_f __div(__vec16_f a, __vec16_f b) { return _mm512_div_ps(a,b); } + +static FORCEINLINE __vec16_i1 __equal_float (__vec16_f a, __vec16_f b) { return _mm512_cmpeq_ps_mask (a,b); } +static FORCEINLINE __vec16_i1 __not_equal_float (__vec16_f a, __vec16_f b) { return _mm512_cmpneq_ps_mask(a,b); } +static FORCEINLINE __vec16_i1 __less_than_float (__vec16_f a, __vec16_f b) { return _mm512_cmplt_ps_mask (a,b); } +static FORCEINLINE __vec16_i1 __less_equal_float (__vec16_f a, __vec16_f b) { return _mm512_cmple_ps_mask (a,b); } +static FORCEINLINE __vec16_i1 __greater_than_float (__vec16_f a, __vec16_f b) { return _mm512_cmp_ps_mask (a,b,_CMP_GT_OS); } +static FORCEINLINE __vec16_i1 __greater_equal_float(__vec16_f a, __vec16_f b) { return _mm512_cmp_ps_mask (a,b,_CMP_GE_OS); } + +static FORCEINLINE __vec16_i1 __equal_float_and_mask (__vec16_f a, __vec16_f b, __vec16_i1 m) { return _mm512_mask_cmpeq_ps_mask (m,a,b); } +static FORCEINLINE __vec16_i1 __not_equal_float_and_mask (__vec16_f a, __vec16_f b, __vec16_i1 m) { return _mm512_mask_cmpneq_ps_mask(m,a,b); } +static FORCEINLINE __vec16_i1 __less_than_float_and_mask (__vec16_f a, __vec16_f b, __vec16_i1 m) { return _mm512_mask_cmplt_ps_mask (m,a,b); } +static FORCEINLINE __vec16_i1 __less_equal_float_and_mask (__vec16_f a, __vec16_f b, __vec16_i1 m) { return _mm512_mask_cmple_ps_mask (m,a,b); } +static FORCEINLINE __vec16_i1 __greater_than_float_and_mask (__vec16_f a, __vec16_f b, __vec16_i1 m) { return _mm512_mask_cmp_ps_mask (m,a,b,_CMP_GT_OS); } +static FORCEINLINE __vec16_i1 __greater_equal_float_and_mask(__vec16_f a, __vec16_f b, __vec16_i1 m) { return _mm512_mask_cmp_ps_mask (m,a,b,_CMP_GE_OS); } + +static FORCEINLINE __vec16_i1 __ordered_float(__vec16_f a, __vec16_f b) { return _mm512_cmpord_ps_mask (a,b); } +static FORCEINLINE __vec16_i1 __unordered_float(__vec16_f a, __vec16_f b) { return _mm512_cmpunord_ps_mask(a,b); } + +static FORCEINLINE __vec16_f __select(__vec16_i1 mask, __vec16_f a, __vec16_f b) { return _mm512_mask_mov_ps(b, mask, a); } +static FORCEINLINE __vec16_f __select( bool cond, __vec16_f a, __vec16_f b) { return cond ? 
a : b; } + +static FORCEINLINE float __extract_element(__vec16_f v, uint32_t index) { return v[index]; } +static FORCEINLINE void __insert_element(__vec16_f *v, uint32_t index, float val) { (*v)[index] = val; } + +template RetVecType __smear_float(float f); +template <> static FORCEINLINE __vec16_f __smear_float<__vec16_f>(float f) { return _mm512_set_1to16_ps(f); } + +template RetVecType __setzero_float(); +template <> static FORCEINLINE __vec16_f __setzero_float<__vec16_f>() { return _mm512_setzero_ps(); } + +template RetVecType __undef_float(); +template <> static FORCEINLINE __vec16_f __undef_float<__vec16_f>() { return __vec16_f(); } + +static FORCEINLINE __vec16_f __broadcast_float(__vec16_f _v, int index) +{ + const __vec16_i32 v = _mm512_castps_si512(_v); + return _mm512_castsi512_ps(_mm512_mask_permutevar_epi32(v, 0xFFFF, _mm512_set1_epi32(index), v)); +} + +static FORCEINLINE __vec16_f __rotate_float(__vec16_f _v, int index) +{ + const __vec16_i32 v = _mm512_castps_si512(_v); + const __vec16_i32 idx = __smear_i32<__vec16_i32>(index); + const __vec16_i32 shuffle = _mm512_and_epi32(_mm512_add_epi32(__ispc_stride1, idx), __smear_i32<__vec16_i32>(0xF)); + return _mm512_castsi512_ps(_mm512_mask_permutevar_epi32(v, 0xFFFF, shuffle, v)); +} +SHIFT(__vec16_f, float, float) +static FORCEINLINE __vec16_f __shuffle_float(__vec16_f v, __vec16_i32 index) +{ + return _mm512_castsi512_ps(_mm512_mask_permutevar_epi32(_mm512_castps_si512(v), 0xffff, index, _mm512_castps_si512(v))); +} +static FORCEINLINE __vec16_f __shuffle2_float(__vec16_f _v0, __vec16_f _v1, __vec16_i32 index) +{ + const __vec16_i32 v0 = _mm512_castps_si512(_v0); + const __vec16_i32 v1 = _mm512_castps_si512(_v1); + const __vec16_i1 mask = __signed_less_than_i32(index, __smear_i32<__vec16_i32>(0x10)); + index = __and(index, __smear_i32<__vec16_i32>(0xF)); + __vec16_i32 ret = __undef_i32<__vec16_i32>(); + ret = _mm512_mask_permutevar_epi32(ret, mask, index, v0); + ret = _mm512_mask_permutevar_epi32(ret, __not(mask), index, v1); + return _mm512_castsi512_ps(ret); +} + +template static FORCEINLINE __vec16_f __load(const __vec16_f *p) +{ +#ifdef ISPC_FORCE_ALIGNED_MEMORY + // return __load<64>(p); + return _mm512_load_ps(p); +#else + __vec16_f v; + v = _mm512_extloadunpacklo_ps(v, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + v = _mm512_extloadunpackhi_ps(v, (uint8_t*)p+64, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + return v; +#endif +} + +template static FORCEINLINE void __store(__vec16_f *p, __vec16_f v) +{ +#ifdef ISPC_FORCE_ALIGNED_MEMORY + // __store<64>(p,v); + _mm512_store_ps(p, v); +#else + _mm512_extpackstorelo_ps( p, v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_ps((uint8_t*)p+64, v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); +#endif +} + +#if 0 /* knc::fails ./tests/gs-improve-progindex.ispc with segfault */ +template <> static FORCEINLINE __vec16_f __load<64>(const __vec16_f *p) +{ + return _mm512_load_ps(p); +} +/* this one doesn't fail but it is commented out for completeness, no aligned load/stores */ +template <> static FORCEINLINE void __store<64>(__vec16_f *p, __vec16_f v) +{ + _mm512_store_ps(p, v); +} +#endif + +/******** math ******/ + +/*** float ***/ +static FORCEINLINE float __exp_uniform_float(float v) { return expf(v);} +static FORCEINLINE __vec16_f __exp_varying_float(__vec16_f v) { return _mm512_exp_ps(v); } + +static FORCEINLINE float __log_uniform_float(float v) { return logf(v);} +static FORCEINLINE __vec16_f __log_varying_float(__vec16_f v) { return _mm512_log_ps(v); } + +static FORCEINLINE float 
__pow_uniform_float(float a, float b) { return powf(a, b);} +static FORCEINLINE __vec16_f __pow_varying_float(__vec16_f a, __vec16_f b) { return _mm512_pow_ps(a,b); } + +/*** double ***/ +static FORCEINLINE double __exp_uniform_double(double v) { return exp(v);} +static FORCEINLINE __vec16_d __exp_varying_double(__vec16_d v) { return __vec16_d(_mm512_exp_pd(v.v1),_mm512_exp_pd(v.v2)); } + +static FORCEINLINE double __log_uniform_double(double v) { return log(v);} +static FORCEINLINE __vec16_d __log_varying_double(__vec16_d v) { return __vec16_d(_mm512_log_pd(v.v1),_mm512_log_pd(v.v2)); } + +static FORCEINLINE double __pow_uniform_double(double a, double b) { return pow(a,b);} +static FORCEINLINE __vec16_d __pow_varying_double(__vec16_d a, __vec16_d b) { return __vec16_d(_mm512_pow_pd(a.v1,b.v1),_mm512_pow_pd(a.v2,b.v2)); } + +/******** bitcast ******/ + +static FORCEINLINE int __intbits(float v) { + union { + float f; + int i; + } u; + u.f = v; + return u.i; +} + +static FORCEINLINE float __floatbits(int v) { + union { + float f; + int i; + } u; + u.i = v; + return u.f; +} + +/////////////////////////////////////////////////////////////////////////// +// half<->float : this one passes the tests +// source : +// http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion +/////////////////////////////////////////////////////////////////////////// +class Float16Compressor +{ + union Bits + { + float f; + int32_t si; + uint32_t ui; + }; + + static int const shift = 13; + static int const shiftSign = 16; + + static int32_t const infN = 0x7F800000; // flt32 infinity + static int32_t const maxN = 0x477FE000; // max flt16 normal as a flt32 + static int32_t const minN = 0x38800000; // min flt16 normal as a flt32 + static int32_t const signN = 0x80000000; // flt32 sign bit + + static int32_t const infC = infN >> shift; + static int32_t const nanN = (infC + 1) << shift; // minimum flt16 nan as a flt32 + static int32_t const maxC = maxN >> shift; + static int32_t const minC = minN >> shift; + static int32_t const signC = signN >> shiftSign; // flt16 sign bit + + static int32_t const mulN = 0x52000000; // (1 << 23) / minN + static int32_t const mulC = 0x33800000; // minN / (1 << (23 - shift)) + + static int32_t const subC = 0x003FF; // max flt32 subnormal down shifted + static int32_t const norC = 0x00400; // min flt32 normal down shifted + + static int32_t const maxD = infC - maxC - 1; + static int32_t const minD = minC - subC - 1; + + public: + + static uint16_t compress(float value) + { + Bits v, s; + v.f = value; + uint32_t sign = v.si & signN; + v.si ^= sign; + sign >>= shiftSign; // logical shift + s.si = mulN; + s.si = s.f * v.f; // correct subnormals + v.si ^= (s.si ^ v.si) & -(minN > v.si); + v.si ^= (infN ^ v.si) & -((infN > v.si) & (v.si > maxN)); + v.si ^= (nanN ^ v.si) & -((nanN > v.si) & (v.si > infN)); + v.ui >>= shift; // logical shift + v.si ^= ((v.si - maxD) ^ v.si) & -(v.si > maxC); + v.si ^= ((v.si - minD) ^ v.si) & -(v.si > subC); + return v.ui | sign; + } + + static float decompress(uint16_t value) + { + Bits v; + v.ui = value; + int32_t sign = v.si & signC; + v.si ^= sign; + sign <<= shiftSign; + v.si ^= ((v.si + minD) ^ v.si) & -(v.si > subC); + v.si ^= ((v.si + maxD) ^ v.si) & -(v.si > maxC); + Bits s; + s.si = mulC; + s.f *= v.si; + int32_t mask = -(norC > v.si); + v.si <<= shift; + v.si ^= (s.si ^ v.si) & mask; + v.si |= sign; + return v.f; + } +}; + +static FORCEINLINE float __half_to_float_uniform(int16_t h) +{ + return 
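// Usage sketch (illustrative, not part of the patch) for the Float16Compressor
// helpers above: a float is packed into 16 bits and expanded back, with the
// round-tripped value accurate only to half precision (about 3 decimal digits).
//
//   uint16_t h = Float16Compressor::compress(3.14159f);
//   float    f = Float16Compressor::decompress(h);   // close to 3.1416, within ~1e-3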
Float16Compressor::decompress(h); +} +static FORCEINLINE __vec16_f __half_to_float_varying(__vec16_i16 v) +{ + __vec16_f ret; + for (int i = 0; i < 16; ++i) + ret[i] = __half_to_float_uniform(v[i]); + return ret; +} + + +static FORCEINLINE int16_t __float_to_half_uniform(float f) +{ + return Float16Compressor::compress(f); +} +static FORCEINLINE __vec16_i16 __float_to_half_varying(__vec16_f v) +{ + __vec16_i16 ret; + for (int i = 0; i < 16; ++i) + ret[i] = __float_to_half_uniform(v[i]); + return ret; +} + + +/////////////////////////////////////////////////////////////////////////// +// double +/////////////////////////////////////////////////////////////////////////// + +#define VECOP(OP) __vec16_d(_mm512_##OP(a.v1,b.v1),_mm512_##OP(a.v2,b.v2)) +static FORCEINLINE __vec16_d __add(__vec16_d a, __vec16_d b) { return VECOP(add_pd); } +static FORCEINLINE __vec16_d __sub(__vec16_d a, __vec16_d b) { return VECOP(sub_pd); } +static FORCEINLINE __vec16_d __mul(__vec16_d a, __vec16_d b) { return VECOP(mul_pd); } +static FORCEINLINE __vec16_d __div(__vec16_d a, __vec16_d b) { return VECOP(div_pd); } +#undef VECOP + +#define CMPOP(OP) _mm512_kmovlhb(_mm512_##OP(a.v1,b.v1),_mm512_##OP(a.v2,b.v2)) +static FORCEINLINE __vec16_i1 __equal_double (__vec16_d a, __vec16_d b) { return CMPOP(cmpeq_pd_mask); } +static FORCEINLINE __vec16_i1 __not_equal_double (__vec16_d a, __vec16_d b) { return CMPOP(cmpneq_pd_mask); } +static FORCEINLINE __vec16_i1 __less_than_double (__vec16_d a, __vec16_d b) { return CMPOP(cmplt_pd_mask); } +static FORCEINLINE __vec16_i1 __less_equal_double (__vec16_d a, __vec16_d b) { return CMPOP(cmple_pd_mask); } +static FORCEINLINE __vec16_i1 __greater_than_double (__vec16_d a, __vec16_d b) { return CMPOP(cmpnle_pd_mask); } +static FORCEINLINE __vec16_i1 __greater_equal_double(__vec16_d a, __vec16_d b) { return CMPOP(cmpnlt_pd_mask); } +static FORCEINLINE __vec16_i1 __ordered_double (__vec16_d a, __vec16_d b) { return CMPOP(cmpord_pd_mask); } +static FORCEINLINE __vec16_i1 __unordered_double (__vec16_d a, __vec16_d b) { return CMPOP(cmpunord_pd_mask); } +#undef CMPOP + +#define CMPOPMASK(OP) _mm512_kmovlhb(_mm512_mask_##OP(m,a.v1,b.v1),_mm512_mask_##OP(_mm512_kswapb(m,m),a.v2,b.v2)) +static FORCEINLINE __vec16_i1 __equal_double_and_mask (__vec16_d a, __vec16_d b, __vec16_i1 m) { return CMPOPMASK(cmpeq_pd_mask); } +static FORCEINLINE __vec16_i1 __not_equal_double_and_mask (__vec16_d a, __vec16_d b, __vec16_i1 m) { return CMPOPMASK(cmpneq_pd_mask); } +static FORCEINLINE __vec16_i1 __less_than_double_and_mask (__vec16_d a, __vec16_d b, __vec16_i1 m) { return CMPOPMASK(cmplt_pd_mask); } +static FORCEINLINE __vec16_i1 __less_equal_double_and_mask (__vec16_d a, __vec16_d b, __vec16_i1 m) { return CMPOPMASK(cmple_pd_mask); } +static FORCEINLINE __vec16_i1 __greater_than_double_and_mask (__vec16_d a, __vec16_d b, __vec16_i1 m) { return CMPOPMASK(cmpnle_pd_mask); } +static FORCEINLINE __vec16_i1 __greater_equal_double_and_mask(__vec16_d a, __vec16_d b, __vec16_i1 m) { return CMPOPMASK(cmpnlt_pd_mask); } +#undef CMOPMASK + + +static FORCEINLINE __vec16_d __select(__vec16_i1 m, __vec16_d a, __vec16_d b) +{ + return __vec16_d(_mm512_mask_mov_pd(b.v1, m, a.v1), _mm512_mask_mov_pd(b.v2, _mm512_kswapb(m, m), a.v2)); +} +static FORCEINLINE __vec16_d __select(bool cond, __vec16_d a, __vec16_d b) +{ + return cond ? 
a : b; +} + +static FORCEINLINE double __extract_element(__vec16_d v, uint32_t index) { return v[index]; } +static FORCEINLINE void __insert_element(__vec16_d *v, uint32_t index, double val) { (*v)[index] = val; } + +template RetVecType __smear_double(double d); +template <> static FORCEINLINE __vec16_d __smear_double<__vec16_d>(double d) { return __vec16_d(_mm512_set1_pd(d), _mm512_set1_pd(d)); } + +template RetVecType __setzero_double(); +template <> static FORCEINLINE __vec16_d __setzero_double<__vec16_d>() { return __vec16_d(_mm512_setzero_pd(), _mm512_setzero_pd()); } + +template RetVecType __undef_double(); +template <> static FORCEINLINE __vec16_d __undef_double<__vec16_d>() { return __vec16_d(); } + +#define CASTD2F(_v_, _v_hi_, _v_lo_) \ + __vec16_f _v_hi_, _v_lo_; \ + { \ + const __vec16_d v = _v_.cvt2hilo(); \ + _v_hi_ = _mm512_castpd_ps(v.v_hi); \ + _v_lo_ = _mm512_castpd_ps(v.v_lo); } +#define CASTF2D(_ret_hi_, _ret_lo_) \ + __vec16_d(_mm512_castps_pd(_ret_hi_), _mm512_castps_pd(_ret_lo_)).cvt2zmm() +static FORCEINLINE __vec16_d __broadcast_double(__vec16_d _v, int index) +{ + CASTD2F(_v, v_hi, v_lo); + const __vec16_f ret_hi = __broadcast_float(v_hi, index); + const __vec16_f ret_lo = __broadcast_float(v_lo, index); + return CASTF2D(ret_hi, ret_lo); +} +static FORCEINLINE __vec16_d __rotate_double(const __vec16_d _v, const int index) +{ + CASTD2F(_v, v_hi, v_lo); + const __vec16_f ret_hi = __rotate_float(v_hi, index); + const __vec16_f ret_lo = __rotate_float(v_lo, index); + return CASTF2D(ret_hi, ret_lo); +} +SHIFT(__vec16_d, double, double) +static FORCEINLINE __vec16_d __shuffle_double(__vec16_d _v, const __vec16_i32 index) +{ + CASTD2F(_v, v_hi, v_lo); + const __vec16_f ret_hi = __shuffle_float(v_hi, index); + const __vec16_f ret_lo = __shuffle_float(v_lo, index); + return CASTF2D(ret_hi, ret_lo); +} +static FORCEINLINE __vec16_d __shuffle2_double(__vec16_d _v0, __vec16_d _v1, const __vec16_i32 index) +{ + CASTD2F(_v0, v0_hi, v0_lo); + CASTD2F(_v1, v1_hi, v1_lo); + const __vec16_f ret_hi = __shuffle2_float(v0_hi, v1_hi, index); + const __vec16_f ret_lo = __shuffle2_float(v0_lo, v1_lo, index); + return CASTF2D(ret_hi, ret_lo); +} +#undef CASTF2D +#undef CASTD2F + +template static FORCEINLINE __vec16_d __load(const __vec16_d *p) \ +{ +#ifdef ISPC_FORCE_ALIGNED_MEMORY + // return __load<128>(p); + return __vec16_d(_mm512_load_pd(p), _mm512_load_pd(((uint8_t*)p)+64)); +#else + __vec16_d ret; + ret.v1 = _mm512_extloadunpacklo_pd(ret.v1, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + ret.v1 = _mm512_extloadunpackhi_pd(ret.v1, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + ret.v2 = _mm512_extloadunpacklo_pd(ret.v2, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + ret.v2 = _mm512_extloadunpackhi_pd(ret.v2, (uint8_t*)p+128, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + return ret; +#endif +} + +template static FORCEINLINE void __store(__vec16_d *p, __vec16_d v) +{ +#ifdef ISPC_FORCE_ALIGNED_MEMORY + // return __store<128>(p,v); + _mm512_store_pd(p, v.v1); + _mm512_store_pd(((uint8_t*)p)+64, v.v2); +#else + _mm512_extpackstorelo_pd(p, v.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_pd((uint8_t*)p+64, v.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); + _mm512_extpackstorelo_pd((uint8_t*)p+64, v.v2, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_pd((uint8_t*)p+128, v.v2, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); +#endif +} + + +#if 0 /* knc::fails as with _f this may generate fails ... 
so commetining it out */ +template <> static FORCEINLINE __vec16_d __load<64>(const __vec16_d *p) +{ + return __vec16_d(_mm512_load_pd(p), _mm512_load_pd(((uint8_t*)p)+64)); +} +template <> static FORCEINLINE void __store<64>(__vec16_d *p, __vec16_d v) +{ + _mm512_store_pd(p, v.v1); + _mm512_store_pd(((uint8_t*)p)+64, v.v2); +} +template <> static FORCEINLINE __vec16_d __load <128>(const __vec16_d *p) { return __load<64>(p); } +template <> static FORCEINLINE void __store<128>(__vec16_d *p, __vec16_d v) { __store<64>(p, v); } +#endif + +/////////////////////////////////////////////////////////////////////////// +// casts +/////////////////////////////////////////////////////////////////////////// + + +/* knc::macro::used */ +#define CAST(TO, STO, FROM, SFROM, FUNC) \ +static FORCEINLINE TO FUNC(TO, FROM val) { \ + TO ret; \ + for (int i = 0; i < 16; ++i) \ + ret[i] = (STO)((SFROM)(val[i])); \ + return ret; \ +} + +// sign extension conversions + +// CAST(__vec16_i64, int64_t, __vec16_i32, int32_t, __cast_sext) +static FORCEINLINE __vec16_i64 __cast_sext(const __vec16_i64 &, const __vec16_i32 &val) +{ + return __vec16_i64(_mm512_srai_epi32(val.v,31), val.v).cvt2zmm(); +} +CAST(__vec16_i64, int64_t, __vec16_i16, int16_t, __cast_sext) +CAST(__vec16_i64, int64_t, __vec16_i8, int8_t, __cast_sext) +CAST(__vec16_i32, int32_t, __vec16_i16, int16_t, __cast_sext) +CAST(__vec16_i32, int32_t, __vec16_i8, int8_t, __cast_sext) +CAST(__vec16_i16, int16_t, __vec16_i8, int8_t, __cast_sext) + +/* knc::macro::used */ +#define CAST_SEXT_I1(TYPE) \ +static FORCEINLINE TYPE __cast_sext(TYPE, __vec16_i1 v) { \ + TYPE ret; \ + for (int i = 0; i < 16; ++i) { \ + ret[i] = 0; \ + if (v.v & (1 << i)) \ + ret[i] = ~ret[i]; \ + } \ + return ret; \ +} + +CAST_SEXT_I1(__vec16_i8) +CAST_SEXT_I1(__vec16_i16) + +//CAST_SEXT_I1(__vec16_i32) +static FORCEINLINE __vec16_i32 __cast_sext(const __vec16_i32 &, const __vec16_i1 &val) +{ + __vec16_i32 ret = _mm512_setzero_epi32(); + __vec16_i32 one = _mm512_set1_epi32(-1); + return _mm512_mask_mov_epi32(ret, val, one); +} + +CAST_SEXT_I1(__vec16_i64) + +// zero extension +// CAST(__vec16_i64, uint64_t, __vec16_i32, uint32_t, __cast_zext) +static FORCEINLINE __vec16_i64 __cast_zext(const __vec16_i64 &, const __vec16_i32 &val) +{ + return __vec16_i64(_mm512_setzero_epi32(), val.v).cvt2zmm(); +} + +CAST(__vec16_i64, uint64_t, __vec16_i16, uint16_t, __cast_zext) +CAST(__vec16_i64, uint64_t, __vec16_i8, uint8_t, __cast_zext) +CAST(__vec16_i32, uint32_t, __vec16_i16, uint16_t, __cast_zext) +CAST(__vec16_i32, uint32_t, __vec16_i8, uint8_t, __cast_zext) +CAST(__vec16_i16, uint16_t, __vec16_i8, uint8_t, __cast_zext) + +/* knc::macro::used */ +#define CAST_ZEXT_I1(TYPE) \ +static FORCEINLINE TYPE __cast_zext(TYPE, __vec16_i1 v) { \ + TYPE ret; \ + for (int i = 0; i < 16; ++i) \ + ret[i] = (v.v & (1 << i)) ? 
1 : 0; \ + return ret; \ +} + +CAST_ZEXT_I1(__vec16_i8) +CAST_ZEXT_I1(__vec16_i16) + +//CAST_ZEXT_I1(__vec16_i32) +static FORCEINLINE __vec16_i32 __cast_zext(const __vec16_i32 &, const __vec16_i1 &val) +{ + __vec16_i32 ret = _mm512_setzero_epi32(); + __vec16_i32 one = _mm512_set1_epi32(1); + return _mm512_mask_mov_epi32(ret, val, one); +} + +CAST_ZEXT_I1(__vec16_i64) + +// truncations +CAST(__vec16_i32, int32_t, __vec16_i64, int64_t, __cast_trunc) +CAST(__vec16_i16, int16_t, __vec16_i64, int64_t, __cast_trunc) +CAST(__vec16_i8, int8_t, __vec16_i64, int64_t, __cast_trunc) +CAST(__vec16_i16, int16_t, __vec16_i32, int32_t, __cast_trunc) +CAST(__vec16_i8, int8_t, __vec16_i32, int32_t, __cast_trunc) +CAST(__vec16_i8, int8_t, __vec16_i16, int16_t, __cast_trunc) + +// signed int to float/double + +//CAST(__vec16_f, float, __vec16_i8, int8_t, __cast_sitofp) +static FORCEINLINE __vec16_f __cast_sitofp(__vec16_f, __vec16_i8 val) {return _mm512_extload_ps(&val, _MM_UPCONV_PS_SINT8, _MM_BROADCAST_16X16, _MM_HINT_NONE);} +//CAST(__vec16_f, float, __vec16_i16, int16_t, __cast_sitofp) +static FORCEINLINE __vec16_f __cast_sitofp(__vec16_f, __vec16_i16 val) {return _mm512_extload_ps(&val, _MM_UPCONV_PS_SINT16, _MM_BROADCAST_16X16, _MM_HINT_NONE);} +//CAST(__vec16_f, float, __vec16_i32, int32_t, __cast_sitofp) +static FORCEINLINE __vec16_f __cast_sitofp(__vec16_f, __vec16_i32 val) {return _mm512_cvtfxpnt_round_adjustepi32_ps(val, _MM_ROUND_MODE_NEAREST, _MM_EXPADJ_NONE);} + +CAST(__vec16_f, float, __vec16_i64, int64_t, __cast_sitofp) + +//CAST(__vec16_d, double, __vec16_i8, int8_t, __cast_sitofp) +static FORCEINLINE __vec16_d __cast_sitofp(__vec16_d, __vec16_i8 val) +{ + __vec16_i32 vi = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_SINT8, _MM_BROADCAST_16X16, _MM_HINT_NONE); + __vec16_d ret; + ret.v1 = _mm512_cvtepi32lo_pd(vi); + __vec16_i32 other8 = _mm512_permute4f128_epi32(vi, _MM_PERM_DCDC); + ret.v2 = _mm512_cvtepi32lo_pd(other8); + return ret; +} + +// CAST(__vec16_d, double, __vec16_i16, int16_t, __cast_sitofp) +static FORCEINLINE __vec16_d __cast_sitofp(__vec16_d, __vec16_i16 val) +{ + __vec16_i32 vi = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_SINT16, _MM_BROADCAST_16X16, _MM_HINT_NONE); + __vec16_d ret; + ret.v1 = _mm512_cvtepi32lo_pd(vi); + __vec16_i32 other8 = _mm512_permute4f128_epi32(vi, _MM_PERM_DCDC); + ret.v2 = _mm512_cvtepi32lo_pd(other8); + return ret; +} + +// CAST(__vec16_d, double, __vec16_i32, int32_t, __cast_sitofp) +static FORCEINLINE __vec16_d __cast_sitofp(__vec16_d, __vec16_i32 val) +{ + __vec16_d ret; + ret.v1 = _mm512_cvtepi32lo_pd(val); + __vec16_i32 other8 = _mm512_permute4f128_epi32(val, _MM_PERM_DCDC); + ret.v2 = _mm512_cvtepi32lo_pd(other8); + return ret; +} + +CAST(__vec16_d, double, __vec16_i64, int64_t, __cast_sitofp) + +// unsigned int to float/double + +// CAST(__vec16_f, float, __vec16_i8, uint8_t, __cast_uitofp) +static FORCEINLINE __vec16_f __cast_uitofp(__vec16_f, __vec16_i8 val) {return _mm512_extload_ps(&val, _MM_UPCONV_PS_UINT8, _MM_BROADCAST_16X16, _MM_HINT_NONE);} +//CAST(__vec16_f, float, __vec16_i16, uint16_t, __cast_uitofp) +static FORCEINLINE __vec16_f __cast_uitofp(__vec16_f, __vec16_i16 val) {return _mm512_extload_ps(&val, _MM_UPCONV_PS_UINT16, _MM_BROADCAST_16X16, _MM_HINT_NONE);} +//CAST(__vec16_f, float, __vec16_i32, uint32_t, __cast_uitofp) +static FORCEINLINE __vec16_f __cast_uitofp(__vec16_f, __vec16_i32 val) {return _mm512_cvtfxpnt_round_adjustepu32_ps(val, _MM_ROUND_MODE_NEAREST, _MM_EXPADJ_NONE);} + +CAST(__vec16_f, float, __vec16_i64, 
uint64_t, __cast_uitofp) + +// CAST(__vec16_d, double, __vec16_i8, uint8_t, __cast_uitofp) +static FORCEINLINE __vec16_d __cast_uitofp(__vec16_d, __vec16_i8 val) +{ + __vec16_i32 vi = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_UINT8, _MM_BROADCAST_16X16, _MM_HINT_NONE); + __vec16_d ret; + ret.v1 = _mm512_cvtepu32lo_pd(vi); + __vec16_i32 other8 = _mm512_permute4f128_epi32(vi, _MM_PERM_DCDC); + ret.v2 = _mm512_cvtepu32lo_pd(other8); + return ret; +} + +// CAST(__vec16_d, double, __vec16_i16, uint16_t, __cast_uitofp) +static FORCEINLINE __vec16_d __cast_uitofp(__vec16_d, __vec16_i16 val) +{ + __vec16_i32 vi = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_UINT16, _MM_BROADCAST_16X16, _MM_HINT_NONE); + __vec16_d ret; + ret.v1 = _mm512_cvtepu32lo_pd(vi); + __vec16_i32 other8 = _mm512_permute4f128_epi32(vi, _MM_PERM_DCDC); + ret.v2 = _mm512_cvtepu32lo_pd(other8); + return ret; +} + +// CAST(__vec16_d, double, __vec16_i32, uint32_t, __cast_uitofp) +static FORCEINLINE __vec16_d __cast_uitofp(__vec16_d, __vec16_i32 val) +{ + __vec16_d ret; + ret.v1 = _mm512_cvtepu32lo_pd(val); + __vec16_i32 other8 = _mm512_permute4f128_epi32(val, _MM_PERM_DCDC); + ret.v2 = _mm512_cvtepu32lo_pd(other8); + return ret; +} + +CAST(__vec16_d, double, __vec16_i64, uint64_t, __cast_uitofp) + +static FORCEINLINE __vec16_f __cast_uitofp(__vec16_f, __vec16_i1 v) +{ + const __m512 ret = _mm512_setzero_ps(); + const __m512 one = _mm512_set1_ps(1.0); + return _mm512_mask_mov_ps(ret, v, one); +} + +// float/double to signed int +CAST(__vec16_i8, int8_t, __vec16_f, float, __cast_fptosi) +CAST(__vec16_i16, int16_t, __vec16_f, float, __cast_fptosi) + +// CAST(__vec16_i32, int32_t, __vec16_f, float, __cast_fptosi) +static FORCEINLINE __vec16_i32 __cast_fptosi(__vec16_i32, __vec16_f val) +{ + return _mm512_cvtfxpnt_round_adjustps_epi32(val, _MM_ROUND_MODE_TOWARD_ZERO, _MM_EXPADJ_NONE); +} + +CAST(__vec16_i64, int64_t, __vec16_f, float, __cast_fptosi) +CAST(__vec16_i8, int8_t, __vec16_d, double, __cast_fptosi) +CAST(__vec16_i16, int16_t, __vec16_d, double, __cast_fptosi) +#if 0 /* knc::2implement */ +#else +CAST(__vec16_i32, int32_t, __vec16_d, double, __cast_fptosi) +#endif +CAST(__vec16_i64, int64_t, __vec16_d, double, __cast_fptosi) + +// float/double to unsigned int +CAST(__vec16_i8, uint8_t, __vec16_f, float, __cast_fptoui) +CAST(__vec16_i16, uint16_t, __vec16_f, float, __cast_fptoui) + +// CAST(__vec16_i32, uint32_t, __vec16_f, float, __cast_fptoui) +static FORCEINLINE __vec16_i32 __cast_fptoui(__vec16_i32, __vec16_f val) +{ + return _mm512_cvtfxpnt_round_adjustps_epu32(val, _MM_ROUND_MODE_TOWARD_ZERO, _MM_EXPADJ_NONE); +} + +CAST(__vec16_i64, uint64_t, __vec16_f, float, __cast_fptoui) +CAST(__vec16_i8, uint8_t, __vec16_d, double, __cast_fptoui) +CAST(__vec16_i16, uint16_t, __vec16_d, double, __cast_fptoui) +#if 0 /* knc::2implement */ +#else +CAST(__vec16_i32, uint32_t, __vec16_d, double, __cast_fptoui) +#endif +CAST(__vec16_i64, uint64_t, __vec16_d, double, __cast_fptoui) + +// float/double conversions + +// CAST(__vec16_f, float, __vec16_d, double, __cast_fptrunc) +static FORCEINLINE __vec16_f __cast_fptrunc(__vec16_f, __vec16_d val) +{ + __m512i r0i = _mm512_castps_si512(_mm512_cvtpd_pslo(val.v1)); + __m512i r1i = _mm512_castps_si512(_mm512_cvtpd_pslo(val.v2)); + + return _mm512_castsi512_ps(_mm512_mask_permute4f128_epi32(r0i, 0xFF00, r1i, _MM_PERM_BABA)); +} + +// CAST(__vec16_d, double, __vec16_f, float, __cast_fpext) +static FORCEINLINE __vec16_d __cast_fpext(__vec16_d, __vec16_f val) +{ + __vec16_d ret; + ret.v1 = 
_mm512_cvtpslo_pd(val.v);
+    __vec16_f other8 = _mm512_castsi512_ps(_mm512_permute4f128_epi32(_mm512_castps_si512(val.v), _MM_PERM_DCDC));
+    ret.v2 = _mm512_cvtpslo_pd(other8);
+    return ret;
+}
+
+typedef union {
+    int32_t i32;
+    float f;
+    int64_t i64;
+    double d;
+} BitcastUnion;
+
+/* knc::macro::not used */
+#define CAST_BITS(TO, TO_ELT, FROM, FROM_ELT)       \
+static FORCEINLINE TO __cast_bits(TO, FROM val) {   \
+    TO r;                                           \
+    for (int i = 0; i < 16; ++i) {                  \
+        BitcastUnion u;                             \
+        u.FROM_ELT = val[i];                        \
+        r[i] = u.TO_ELT;                            \
+    }                                               \
+    return r;                                       \
+}
+
+// CAST_BITS(__vec16_f, f, __vec16_i32, i32)
+static FORCEINLINE __vec16_f __cast_bits(__vec16_f, __vec16_i32 val) { return _mm512_castsi512_ps(val); }
+// CAST_BITS(__vec16_i32, i32, __vec16_f, f)
+static FORCEINLINE __vec16_i32 __cast_bits(__vec16_i32, __vec16_f val) { return _mm512_castps_si512(val); }
+
+// CAST_BITS(__vec16_d, d, __vec16_i64, i64)
+static FORCEINLINE __vec16_i64 __cast_bits(__vec16_i64, __vec16_d val) { return *(__vec16_i64*)&val; }
+// CAST_BITS(__vec16_i64, i64, __vec16_d, d)
+static FORCEINLINE __vec16_d __cast_bits(__vec16_d, __vec16_i64 val) { return *(__vec16_d*)&val; }
+
+/* knc::macro::used */
+#define CAST_BITS_SCALAR(TO, FROM)                  \
+static FORCEINLINE TO __cast_bits(TO, FROM v) {     \
+    union {                                         \
+        TO to;                                      \
+        FROM from;                                  \
+    } u;                                            \
+    u.from = v;                                     \
+    return u.to;                                    \
+}
+
+CAST_BITS_SCALAR(uint32_t, float)
+CAST_BITS_SCALAR(int32_t, float)
+CAST_BITS_SCALAR(float, uint32_t)
+CAST_BITS_SCALAR(float, int32_t)
+CAST_BITS_SCALAR(uint64_t, double)
+CAST_BITS_SCALAR(int64_t, double)
+CAST_BITS_SCALAR(double, uint64_t)
+CAST_BITS_SCALAR(double, int64_t)
+
+///////////////////////////////////////////////////////////////////////////
+// various math functions
+///////////////////////////////////////////////////////////////////////////
+
+static FORCEINLINE void __fastmath() {
+}
+
+static FORCEINLINE float __round_uniform_float(float v) {
+    return roundf(v);
+}
+
+static FORCEINLINE float __floor_uniform_float(float v) {
+    return floorf(v);
+}
+
+static FORCEINLINE float __ceil_uniform_float(float v) {
+    return ceilf(v);
+}
+
+static FORCEINLINE double __round_uniform_double(double v) {
+    return round(v);
+}
+
+static FORCEINLINE double __floor_uniform_double(double v) {
+    return floor(v);
+}
+
+static FORCEINLINE double __ceil_uniform_double(double v) {
+    return ceil(v);
+}
+
+static FORCEINLINE __vec16_f __round_varying_float(__vec16_f v) { return _mm512_round_ps(v, _MM_ROUND_MODE_NEAREST, _MM_EXPADJ_NONE); }
+static FORCEINLINE __vec16_f __floor_varying_float(__vec16_f v) { return _mm512_floor_ps(v); }
+static FORCEINLINE __vec16_f __ceil_varying_float(__vec16_f v) { return _mm512_ceil_ps(v); }
+
+static FORCEINLINE __vec16_d __round_varying_float(__vec16_d v) { return __vec16_d(_mm512_svml_round_pd(v.v1), _mm512_svml_round_pd(v.v2)); }
+static FORCEINLINE __vec16_d __floor_varying_float(__vec16_d v) { return __vec16_d(_mm512_floor_pd(v.v1), _mm512_floor_pd(v.v2)); }
+static FORCEINLINE __vec16_d __ceil_varying_float(__vec16_d v) { return __vec16_d(_mm512_ceil_pd(v.v1), _mm512_ceil_pd(v.v2)); }
+
+// min/max
+
+static FORCEINLINE float __min_uniform_float (float a, float b) { return (a<b) ? a : b; }
+static FORCEINLINE float __max_uniform_float (float a, float b) { return (a>b) ? a : b; }
+static FORCEINLINE double __min_uniform_double(double a, double b) { return (a<b) ? a : b; }
+static FORCEINLINE double __max_uniform_double(double a, double b) { return (a>b) ? a : b; }
+
+static FORCEINLINE int32_t __min_uniform_int32 ( int32_t a, int32_t b) { return (a<b) ? a : b; }
+static FORCEINLINE int32_t __max_uniform_int32 ( int32_t a, int32_t b) { return (a>b) ? a : b; }
+static FORCEINLINE int32_t __min_uniform_uint32(uint32_t a, uint32_t b) { return (a<b) ? a : b; }
+static FORCEINLINE int32_t __max_uniform_uint32(uint32_t a, uint32_t b) { return (a>b) ? a : b; }
+
+static FORCEINLINE int64_t __min_uniform_int64 ( int64_t a, int64_t b) { return (a<b) ? a : b; }
+static FORCEINLINE int64_t __max_uniform_int64 ( int64_t a, int64_t b) { return (a>b) ? a : b; }
+static FORCEINLINE int64_t __min_uniform_uint64(uint64_t a, uint64_t b) { return (a<b) ? a : b; }
+static FORCEINLINE int64_t __max_uniform_uint64(uint64_t a, uint64_t b) { return (a>b) ? a : b; }
+
+static FORCEINLINE __vec16_f __max_varying_float (__vec16_f v1, __vec16_f v2) { return _mm512_gmax_ps(v1, v2);}
+static FORCEINLINE __vec16_f __min_varying_float (__vec16_f v1, __vec16_f v2) { return _mm512_gmin_ps(v1, v2);}
+static FORCEINLINE __vec16_d __max_varying_double(__vec16_d v1, __vec16_d v2) { return __vec16_d(_mm512_gmax_pd(v1.v1, v2.v1),_mm512_gmax_pd(v1.v2,v2.v2));}
+static FORCEINLINE __vec16_d __min_varying_double(__vec16_d v1, __vec16_d v2) { return __vec16_d(_mm512_gmin_pd(v1.v1, v2.v1),_mm512_gmin_pd(v1.v2,v2.v2));}
+
+static FORCEINLINE __vec16_i32 __max_varying_int32 (__vec16_i32 v1, __vec16_i32 v2) { return _mm512_max_epi32(v1, v2);}
+static FORCEINLINE __vec16_i32 __min_varying_int32 (__vec16_i32 v1, __vec16_i32 v2) { return _mm512_min_epi32(v1, v2);}
+static FORCEINLINE __vec16_i32 __max_varying_uint32(__vec16_i32 v1, __vec16_i32 v2) { return _mm512_max_epu32(v1, v2);}
+static FORCEINLINE __vec16_i32 __min_varying_uint32(__vec16_i32 v1, __vec16_i32 v2) { return _mm512_min_epu32(v1, v2);}
+
+BINARY_OP_FUNC(__vec16_i64, __max_varying_int64, __max_uniform_int64)
+BINARY_OP_FUNC(__vec16_i64, __min_varying_int64, __min_uniform_int64)
+BINARY_OP_FUNC(__vec16_i64, __max_varying_uint64, __max_uniform_uint64)
+BINARY_OP_FUNC(__vec16_i64, __min_varying_uint64, __min_uniform_uint64)
+
+// sqrt/rsqrt/rcp
+
+static FORCEINLINE float __rsqrt_uniform_float(float v) { return 1.f / sqrtf(v); }
+static FORCEINLINE float __rcp_uniform_float (float v) { return 1.f / v; }
+static FORCEINLINE float __sqrt_uniform_float (float v) { return sqrtf(v); }
+static FORCEINLINE double __sqrt_uniform_double(double v) { return sqrt (v); }
+
+static FORCEINLINE __vec16_f __rcp_varying_float(__vec16_f v)
+{
+#ifdef ISPC_FAST_MATH
+    return _mm512_rcp23_ps(v); // Approximation with 23 bits of accuracy.
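// Note (sketch, not part of the patch): per lane these routines are intended to
// compute result[i] ~= 1.0f / v[i] and 1.0f / sqrtf(v[i]), i.e. the vector
// counterparts of __rcp_uniform_float / __rsqrt_uniform_float above:
//
//   float rcp_lane(float x)   { return 1.0f / x; }        // reference per-lane result
//   float rsqrt_lane(float x) { return 1.0f / sqrtf(x); }
//
// With ISPC_FAST_MATH defined the hardware approximation above is returned
// directly; the default path below falls back to the full-precision SVML
// reciprocal / inverse-square-root routines.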
+#else + return _mm512_recip_ps(v); +#endif +} + +static FORCEINLINE __vec16_f __rsqrt_varying_float(__vec16_f v) +{ +#ifdef ISPC_FAST_MATH + return _mm512_rsqrt23_ps(v); // Approximation with 0.775ULP accuracy +#else + return _mm512_invsqrt_ps(v); +#endif +} +static FORCEINLINE __vec16_f __sqrt_varying_float (__vec16_f v) { return _mm512_sqrt_ps(v);} +static FORCEINLINE __vec16_d __sqrt_varying_double(__vec16_d v) { return __vec16_d(_mm512_sqrt_pd(v.v1),_mm512_sqrt_pd(v.v2));} + +/////////////////////////////////////////////////////////////////////////// +// svml +/////////////////////////////////////////////////////////////////////////// + +static FORCEINLINE __vec16_f __svml_sinf (__vec16_f v) { return _mm512_sin_ps(v); } +static FORCEINLINE __vec16_f __svml_asinf (__vec16_f v) { return _mm512_asin_ps(v); } +static FORCEINLINE __vec16_f __svml_cosf (__vec16_f v) { return _mm512_cos_ps(v); } +static FORCEINLINE __vec16_f __svml_tanf (__vec16_f v) { return _mm512_tan_ps(v); } +static FORCEINLINE __vec16_f __svml_atanf (__vec16_f v) { return _mm512_atan_ps(v); } +static FORCEINLINE __vec16_f __svml_atan2f(__vec16_f a, __vec16_f b) { return _mm512_atan2_ps(a,b); } +static FORCEINLINE __vec16_f __svml_expf (__vec16_f v) { return _mm512_exp_ps(v); } +static FORCEINLINE __vec16_f __svml_logf (__vec16_f v) { return _mm512_log_ps(v); } +static FORCEINLINE __vec16_f __svml_powf (__vec16_f a, __vec16_f b) { return _mm512_pow_ps(a,b); } + +static FORCEINLINE __vec16_d __svml_sind (__vec16_d v) { return __vec16_d(_mm512_sin_pd(v.v1), _mm512_sin_pd(v.v2)); } +static FORCEINLINE __vec16_d __svml_asind (__vec16_d v) { return __vec16_d(_mm512_asin_pd(v.v1), _mm512_asin_pd(v.v2)); } +static FORCEINLINE __vec16_d __svml_cosd (__vec16_d v) { return __vec16_d(_mm512_cos_pd(v.v1), _mm512_cos_pd(v.v2)); } +static FORCEINLINE __vec16_d __svml_tand (__vec16_d v) { return __vec16_d(_mm512_tan_pd(v.v1), _mm512_tan_pd(v.v2)); } +static FORCEINLINE __vec16_d __svml_atand (__vec16_d v) { return __vec16_d(_mm512_atan_pd(v.v1), _mm512_atan_pd(v.v2)); } +static FORCEINLINE __vec16_d __svml_atan2d(__vec16_d a, __vec16_d b) { return __vec16_d(_mm512_atan2_pd(a.v1,b.v1), _mm512_atan2_pd(a.v2,b.v2)); } +static FORCEINLINE __vec16_d __svml_expd (__vec16_d v) { return __vec16_d(_mm512_exp_pd(v.v1), _mm512_exp_pd(v.v2)); } +static FORCEINLINE __vec16_d __svml_logd (__vec16_d v) { return __vec16_d(_mm512_log_pd(v.v1), _mm512_log_pd(v.v2)); } +static FORCEINLINE __vec16_d __svml_powd (__vec16_d a, __vec16_d b) { return __vec16_d(_mm512_pow_pd(a.v1,b.v1), _mm512_pow_pd(a.v2,b.v2)); } + +/////////////////////////////////////////////////////////////////////////// +// bit ops +/////////////////////////////////////////////////////////////////////////// + +static FORCEINLINE int32_t __popcnt_int32(uint32_t v) { + int count = 0; + for (; v != 0; v >>= 1) + count += (v & 1); + return count; +} + +static FORCEINLINE int32_t __popcnt_int64(uint64_t v) { + int count = 0; + for (; v != 0; v >>= 1) + count += (v & 1); + return count; +} + +static FORCEINLINE int32_t __count_trailing_zeros_i32(uint32_t v) { + if (v == 0) + return 32; + + int count = 0; + while ((v & 1) == 0) { + ++count; + v >>= 1; + } + return count; +} + +static FORCEINLINE int64_t __count_trailing_zeros_i64(uint64_t v) { + if (v == 0) + return 64; + + int count = 0; + while ((v & 1) == 0) { + ++count; + v >>= 1; + } + return count; +} + +static FORCEINLINE int32_t __count_leading_zeros_i32(uint32_t v) { + if (v == 0) + return 32; + + int count = 0; + while ((v & (1<<31)) 
== 0) { + ++count; + v <<= 1; + } + return count; +} + +static FORCEINLINE int64_t __count_leading_zeros_i64(uint64_t v) { + if (v == 0) + return 64; + + int count = 0; + while ((v & (1ull<<63)) == 0) { + ++count; + v <<= 1; + } + return count; +} + +/////////////////////////////////////////////////////////////////////////// +// reductions +/////////////////////////////////////////////////////////////////////////// + +static FORCEINLINE float __reduce_add_float(__vec16_f v) { return _mm512_reduce_add_ps(v); } +static FORCEINLINE float __reduce_min_float(__vec16_f v) { return _mm512_reduce_min_ps(v); } +static FORCEINLINE float __reduce_max_float(__vec16_f v) { return _mm512_reduce_max_ps(v); } + +static FORCEINLINE float __reduce_add_double(__vec16_d v) { return _mm512_reduce_add_pd(v.v1) + _mm512_reduce_add_pd(v.v2); } +static FORCEINLINE float __reduce_min_double(__vec16_d v) { return std::min(_mm512_reduce_min_pd(v.v1), _mm512_reduce_min_pd(v.v2)); } +static FORCEINLINE float __reduce_max_double(__vec16_d v) { return std::max(_mm512_reduce_max_pd(v.v1), _mm512_reduce_max_pd(v.v2)); } + + + +static FORCEINLINE int64_t __reduce_add_int32 (__vec16_i32 v) { return _mm512_reduce_add_epi32(v);} +static FORCEINLINE int32_t __reduce_min_int32 (__vec16_i32 v) { return _mm512_reduce_min_epi32(v);} +static FORCEINLINE int32_t __reduce_max_int32 (__vec16_i32 v) { return _mm512_reduce_max_epi32(v);} +static FORCEINLINE uint32_t __reduce_min_uint32 (__vec16_i32 v) { return _mm512_reduce_min_epu32(v);} +static FORCEINLINE uint32_t __reduce_max_uint32 (__vec16_i32 v) { return _mm512_reduce_max_epu32(v);} + +REDUCE_ADD ( int16_t, __vec16_i8, __reduce_add_int8) +REDUCE_ADD ( int32_t, __vec16_i16, __reduce_add_int16) +REDUCE_ADD ( int64_t, __vec16_i64, __reduce_add_int64) +REDUCE_MINMAX( int64_t, __vec16_i64, __reduce_min_int64, <) +REDUCE_MINMAX( int64_t, __vec16_i64, __reduce_max_int64, >) +REDUCE_MINMAX(uint64_t, __vec16_i64, __reduce_min_uint64, <) +REDUCE_MINMAX(uint64_t, __vec16_i64, __reduce_max_uint64, >) + +/////////////////////////////////////////////////////////////////////////// +// masked load/store +/////////////////////////////////////////////////////////////////////////// + +static FORCEINLINE __vec16_i8 __masked_load_i8(void *p, + __vec16_i1 mask) { + __vec16_i8 ret; + int8_t *ptr = (int8_t *)p; + for (int i = 0; i < 16; ++i) + if ((mask.v & (1 << i)) != 0) + ret[i] = ptr[i]; + return ret; +} + +static FORCEINLINE __vec16_i16 __masked_load_i16(void *p, + __vec16_i1 mask) { + __vec16_i16 ret; + int16_t *ptr = (int16_t *)p; + for (int i = 0; i < 16; ++i) + if ((mask.v & (1 << i)) != 0) + ret[i] = ptr[i]; + return ret; +} + +static FORCEINLINE __vec16_i32 __masked_load_i32(void *p, __vec16_i1 mask) +{ +#ifdef ISPC_FORCE_ALIGNED_MEMORY + return _mm512_mask_load_epi32(__vec16_i32(), mask, p); +#else + __vec16_i32 tmp; + tmp.v = _mm512_mask_extloadunpacklo_epi32(tmp.v, 0xFFFF, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + tmp.v = _mm512_mask_extloadunpackhi_epi32(tmp.v, 0xFFFF, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + __vec16_i32 ret; + return _mm512_mask_mov_epi32(ret.v, mask, tmp.v); +#endif +} + +static FORCEINLINE __vec16_f __masked_load_float(void *p, __vec16_i1 mask) +{ +#ifdef ISPC_FORCE_ALIGNED_MEMORY + return _mm512_mask_load_ps(_mm512_undefined_ps(), mask,p); +#else + __vec16_f tmp; + tmp.v = _mm512_mask_extloadunpacklo_ps(tmp.v, 0xFFFF, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + tmp.v = _mm512_mask_extloadunpackhi_ps(tmp.v, 0xFFFF, (uint8_t*)p+64, _MM_UPCONV_PS_NONE, 
_MM_HINT_NONE); + __vec16_f ret; + return _mm512_mask_mov_ps(ret.v, mask, tmp.v); +#endif +} + +static FORCEINLINE __vec16_i64 __masked_load_i64(void *p, + __vec16_i1 mask) { + __vec16_i64 ret; + int64_t *ptr = (int64_t *)p; + for (int i = 0; i < 16; ++i) + if ((mask.v & (1 << i)) != 0) + ret[i] = ptr[i]; + return ret; +} + +static FORCEINLINE __vec16_d __masked_load_double(void *p, __vec16_i1 mask) +{ +#ifdef ISPC_FORCE_ALIGNED_MEMORY + __vec16_d ret; + __vec16_i1 tmp_m = mask; + tmp_m = _mm512_kswapb(tmp_m, tmp_m); + ret.v1 = _mm512_mask_load_pd(ret.v1, mask, p); + ret.v2 = _mm512_mask_load_pd(ret.v2, tmp_m, (uint8_t*)p+64); + return ret; +#else + __vec16_d tmp; + tmp.v1 = _mm512_mask_extloadunpacklo_pd(tmp.v1, 0xFF, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v1 = _mm512_mask_extloadunpackhi_pd(tmp.v1, 0xFF, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v2 = _mm512_mask_extloadunpacklo_pd(tmp.v2, 0xFF, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v2 = _mm512_mask_extloadunpackhi_pd(tmp.v2, 0xFF, (uint8_t*)p+128, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + __vec16_d ret; + __vec16_i1 tmp_m = mask; + tmp_m = _mm512_kswapb(tmp_m, tmp_m); + ret.v1 = _mm512_mask_mov_pd(ret.v1, mask, tmp.v1); + ret.v2 = _mm512_mask_mov_pd(ret.v2, tmp_m, tmp.v2); + return ret; +#endif +} + + +static FORCEINLINE void __masked_store_i8(void *p, __vec16_i8 val, + __vec16_i1 mask) { + int8_t *ptr = (int8_t *)p; + for (int i = 0; i < 16; ++i) + if ((mask.v & (1 << i)) != 0) + ptr[i] = val[i]; +} + +static FORCEINLINE void __masked_store_i16(void *p, __vec16_i16 val, + __vec16_i1 mask) { + int16_t *ptr = (int16_t *)p; + for (int i = 0; i < 16; ++i) + if ((mask.v & (1 << i)) != 0) + ptr[i] = val[i]; +} + +static FORCEINLINE void __masked_store_i32(void *p, const __vec16_i32 val, const __vec16_i1 mask) +{ +#ifdef ISPC_FORCE_ALIGNED_MEMORY + _mm512_mask_store_epi32(p, mask, val.v); +#else + _mm512_mask_i32extscatter_epi32(p, mask, __ispc_stride1, val, _MM_DOWNCONV_EPI32_NONE, _MM_SCALE_4, _MM_HINT_NONE); +#endif +} + +static FORCEINLINE void __masked_store_float(void *p, const __vec16_f val, const __vec16_i1 mask) +{ +#ifdef ISPC_FORCE_ALIGNED_MEMORY + _mm512_mask_store_ps(p, mask, val.v); +#else + _mm512_mask_i32extscatter_ps(p, mask, __ispc_stride1, val, _MM_DOWNCONV_PS_NONE, _MM_SCALE_4, _MM_HINT_NONE); +#endif +} + +static FORCEINLINE void __masked_store_i64(void *p, const __vec16_i64 val, const __vec16_i1 mask) { +#ifdef ISPC_FORCE_ALIGNED_MEMORY + __vec16_i1 tmp_m = mask; + tmp_m = _mm512_kswapb(tmp_m, tmp_m); + _mm512_mask_store_epi64(p, mask, val.v1); + _mm512_mask_store_epi64((uint8_t*)p+64, tmp_m, val.v2); +#else + _mm512_mask_i32loextscatter_epi64( p, mask, __ispc_stride1, val.v1, _MM_DOWNCONV_EPI64_NONE, _MM_SCALE_8, _MM_HINT_NONE); + _mm512_mask_i32loextscatter_epi64((int64_t*)p+8, _mm512_kswapb(mask,mask), __ispc_stride1, val.v2, _MM_DOWNCONV_EPI64_NONE, _MM_SCALE_8, _MM_HINT_NONE); +#endif +} + +static FORCEINLINE void __masked_store_double(void *p, const __vec16_d val, const __vec16_i1 mask) +{ +#ifdef ISPC_FORCE_ALIGNED_MEMORY + __vec16_i1 tmp_m = mask; + tmp_m = _mm512_kswapb(tmp_m, tmp_m); + _mm512_mask_store_pd(p, mask, val.v1); + _mm512_mask_store_pd((uint8_t*)p+64, tmp_m, val.v2); +#else + _mm512_mask_i32loextscatter_pd( p, mask, __ispc_stride1, val.v1, _MM_DOWNCONV_PD_NONE, _MM_SCALE_8, _MM_HINT_NONE); + _mm512_mask_i32loextscatter_pd((double*)p+8, _mm512_kswapb(mask,mask), __ispc_stride1, val.v2, _MM_DOWNCONV_PD_NONE, _MM_SCALE_8, _MM_HINT_NONE); +#endif +} + +static 
FORCEINLINE void __masked_store_blend_i8(void *p, __vec16_i8 val, + __vec16_i1 mask) { + __masked_store_i8(p, val, mask); +} + +static FORCEINLINE void __masked_store_blend_i16(void *p, __vec16_i16 val, + __vec16_i1 mask) { + __masked_store_i16(p, val, mask); +} + +static FORCEINLINE void __masked_store_blend_i32(void *p, __vec16_i32 val, + __vec16_i1 mask) { + __masked_store_i32(p, val, mask); +} + +static FORCEINLINE void __masked_store_blend_float(void *p, __vec16_f val, + __vec16_i1 mask) { + __masked_store_float(p, val, mask); +} + +static FORCEINLINE void __masked_store_blend_i64(void *p, __vec16_i64 val, + __vec16_i1 mask) { + __masked_store_i64(p, val, mask); +} + +static FORCEINLINE void __masked_store_blend_double(void *p, __vec16_d val, + __vec16_i1 mask) { + __masked_store_double(p, val, mask); +} + +/////////////////////////////////////////////////////////////////////////// +// gather/scatter +/////////////////////////////////////////////////////////////////////////// + +// offsets * offsetScale is in bytes (for all of these) + +/* knc::macro::used */ +#define GATHER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ +static FORCEINLINE VTYPE FUNC(unsigned char *b, uint32_t scale, \ + OTYPE offset, __vec16_i1 mask) { \ + VTYPE ret; \ + int8_t *base = (int8_t *)b; \ + for (int i = 0; i < 16; ++i) \ + if ((mask.v & (1 << i)) != 0) { \ + STYPE *ptr = (STYPE *)(base + scale * offset[i]); \ + ret[i] = *ptr; \ + } \ + return ret; \ +} + + +/****************/ +// GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __gather_base_offsets32_i8) +static FORCEINLINE __vec16_i8 __gather_base_offsets32_i8(uint8_t *base, uint32_t scale, __vec16_i32 offsets, __vec16_i1 mask) +{ + // (iw): need to temporarily store as int because gathers can only return ints. + __vec16_i32 tmp = _mm512_mask_i32extgather_epi32(_mm512_undefined_epi32(), mask, offsets, base, + _MM_UPCONV_EPI32_SINT8, scale, + _MM_HINT_NONE); + // now, downconverting to chars into temporary char vector + __vec16_i8 ret; + _mm512_extstore_epi32(ret.data,tmp,_MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE); + return ret; +} +// GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __gather_base_offsets64_i8) +static FORCEINLINE __vec16_i8 __gather_base_offsets64_i8(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_i1 mask) +{ + const __vec16_i64 offsets = _offsets.cvt2hilo(); + __vec16_i1 still_to_do = mask; + __vec16_i32 tmp; + while (still_to_do) { + int first_active_lane = _mm_tzcnt_32((int)still_to_do); + const uint &hi32 = ((uint*)&offsets.v_hi)[first_active_lane]; + __vec16_i1 match = _mm512_mask_cmp_epi32_mask(mask,offsets.v_hi, + __smear_i32<__vec16_i32>((int32_t)hi32), + _MM_CMPINT_EQ); + + void * base = (void*)((unsigned long)_base + + ((scale*(unsigned long)hi32) << 32)); + tmp = _mm512_mask_i32extgather_epi32(tmp, match, offsets.v_lo, base, + _MM_UPCONV_EPI32_SINT8, scale, + _MM_HINT_NONE); + still_to_do = _mm512_kxor(match,still_to_do); + } + __vec16_i8 ret; + _mm512_extstore_epi32(ret.data,tmp,_MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE); + return ret; +} +/****************/ +GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __gather_base_offsets32_i16) +GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __gather_base_offsets64_i16) +/****************/ +// GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __gather_base_offsets32_i32) +static FORCEINLINE __vec16_i32 __gather_base_offsets32_i32(uint8_t *base, uint32_t scale, __vec16_i32 offsets, __vec16_i1 mask) +{ + return 
_mm512_mask_i32extgather_epi32(_mm512_undefined_epi32(), mask, offsets, + base, _MM_UPCONV_EPI32_NONE, scale, + _MM_HINT_NONE); +} +// GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __gather_base_offsets64_i32) +static FORCEINLINE __vec16_i32 __gather_base_offsets64_i32(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_i1 mask) +{ + const __vec16_i64 offsets = _offsets.cvt2hilo(); + // There is no gather instruction with 64-bit offsets in KNC. + // We have to manually iterate over the upper 32 bits ;-) + __vec16_i1 still_to_do = mask; + __vec16_i32 ret; + while (still_to_do) { + int first_active_lane = _mm_tzcnt_32((int)still_to_do); + const uint &hi32 = ((uint*)&offsets.v_hi)[first_active_lane]; + __vec16_i1 match = _mm512_mask_cmp_epi32_mask(mask,offsets.v_hi, + __smear_i32<__vec16_i32>((int32_t)hi32), + _MM_CMPINT_EQ); + + void * base = (void*)((unsigned long)_base + + ((scale*(unsigned long)hi32) << 32)); + ret = _mm512_mask_i32extgather_epi32(ret, match, offsets.v_lo, base, + _MM_UPCONV_EPI32_NONE, scale, + _MM_HINT_NONE); + still_to_do = _mm512_kxor(match, still_to_do); + } + + return ret; +} +/****************/ +// GATHER_BASE_OFFSETS(__vec16_f, float, __vec16_i32, __gather_base_offsets32_float) +static FORCEINLINE __vec16_f __gather_base_offsets32_float(uint8_t *base, uint32_t scale, __vec16_i32 offsets, __vec16_i1 mask) +{ + return _mm512_mask_i32extgather_ps(_mm512_undefined_ps(), mask, offsets, + base, _MM_UPCONV_PS_NONE, scale, + _MM_HINT_NONE); +} +// GATHER_BASE_OFFSETS(__vec16_f, float, __vec16_i64, __gather_base_offsets64_float) +static FORCEINLINE __vec16_f __gather_base_offsets64_float(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_i1 mask) +{ + const __vec16_i64 offsets = _offsets.cvt2hilo(); + // There is no gather instruction with 64-bit offsets in KNC. 
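// Sketch (illustrative, not part of the patch) of the strategy used here and in
// the other *_base_offsets64_* routines: lanes are handled in groups that share
// the same upper 32 offset bits, so each group can be serviced by the native
// 32-bit-offset gather against a rebased pointer.  __builtin_ctz stands in for
// _mm_tzcnt_32 on the lane mask; gather64_model is a hypothetical scalar model,
// not code from this header.
//
//   #include <cstdint>
//   void gather64_model(const uint8_t *base, uint32_t scale,
//                       const uint32_t off_hi[16], const uint32_t off_lo[16],
//                       uint16_t mask, float out[16]) {
//       uint16_t todo = mask;
//       while (todo) {
//           const uint32_t h = off_hi[__builtin_ctz(todo)];    // hi32 of first active lane
//           for (int i = 0; i < 16; ++i)
//               if (((mask >> i) & 1) && off_hi[i] == h) {     // every lane sharing this hi32
//                   uint64_t addr = (uint64_t)(uintptr_t)base
//                                 + (((uint64_t)scale * h) << 32)   // rebased pointer
//                                 + (uint64_t)scale * off_lo[i];    // 32-bit-offset gather
//                   out[i] = *(const float *)(uintptr_t)addr;
//                   todo = (uint16_t)(todo & ~(1u << i));
//               }
//       }
//   }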
+ // We have to manually iterate over the upper 32 bits ;-) + __vec16_i1 still_to_do = mask; + __vec16_f ret; + while (still_to_do) { + int first_active_lane = _mm_tzcnt_32((int)still_to_do); + const uint &hi32 = ((uint*)&offsets.v_hi)[first_active_lane]; + __vec16_i1 match = _mm512_mask_cmp_epi32_mask(mask,offsets.v_hi, + __smear_i32<__vec16_i32>((int32_t)hi32), + _MM_CMPINT_EQ); + + void * base = (void*)((unsigned long)_base + + ((scale*(unsigned long)hi32) << 32)); + ret = _mm512_mask_i32extgather_ps(ret, match, offsets.v_lo, base, + _MM_UPCONV_PS_NONE, scale, + _MM_HINT_NONE); + still_to_do = _mm512_kxor(match, still_to_do); + } + + return ret; +} +/****************/ +GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __gather_base_offsets32_i64) +GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __gather_base_offsets64_i64) +/****************/ +// GATHER_BASE_OFFSETS(__vec16_d, double, __vec16_i32, __gather_base_offsets32_double) +static FORCEINLINE __vec16_d __gather_base_offsets32_double(uint8_t *base, uint32_t scale, __vec16_i32 offsets, __vec16_i1 mask) +{ + __vec16_d ret; + ret.v1 = _mm512_mask_i32loextgather_pd(_mm512_undefined_pd(), mask, offsets, + base, _MM_UPCONV_PD_NONE, scale, + _MM_HINT_NONE); + __m512i shuffled_offsets = _mm512_permute4f128_epi32(offsets.v, _MM_PERM_DCDC); + const __mmask8 mask8 = 0x00FF & (mask >> 8); /* knc::testme */ + ret.v2 = _mm512_mask_i32loextgather_pd(_mm512_undefined_pd(), mask8, shuffled_offsets, + base, _MM_UPCONV_PD_NONE, scale, + _MM_HINT_NONE); + return ret; +} +GATHER_BASE_OFFSETS(__vec16_d, double, __vec16_i64, __gather_base_offsets64_double) + +/* knc::macro::used */ +#define GATHER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \ +static FORCEINLINE VTYPE FUNC(PTRTYPE ptrs, __vec16_i1 mask) { \ + VTYPE ret; \ + for (int i = 0; i < 16; ++i) \ + if ((mask.v & (1 << i)) != 0) { \ + STYPE *ptr = (STYPE *)ptrs[i]; \ + ret[i] = *ptr; \ + } \ + return ret; \ +} +/* knc::macro::used */ +#define GATHER_GENERALF(VTYPE, STYPE, PTRTYPE, FUNC,FUNC1) \ +static FORCEINLINE VTYPE FUNC(PTRTYPE ptrs, __vec16_i1 mask) { \ + return FUNC1(0, 1, ptrs, mask); \ +} + + +/***********/ +GATHER_GENERALF(__vec16_i8, int8_t, __vec16_i32, __gather32_i8, __gather_base_offsets32_i8) +GATHER_GENERALF(__vec16_i16, int16_t, __vec16_i32, __gather32_i16, __gather_base_offsets32_i16) +GATHER_GENERALF(__vec16_i32, int32_t, __vec16_i32, __gather32_i32, __gather_base_offsets32_i32) +GATHER_GENERALF(__vec16_i64, int64_t, __vec16_i32, __gather32_i64, __gather_base_offsets32_i64) +GATHER_GENERALF(__vec16_f, float, __vec16_i32, __gather32_float, __gather_base_offsets32_float) +GATHER_GENERALF(__vec16_d, double, __vec16_i32, __gather32_double, __gather_base_offsets32_double) +/***********/ +GATHER_GENERAL(__vec16_i8, int8_t, __vec16_i64, __gather64_i8); +GATHER_GENERAL(__vec16_i16, int16_t, __vec16_i64, __gather64_i16); +GATHER_GENERAL(__vec16_i32, int32_t, __vec16_i64, __gather64_i32); +GATHER_GENERAL(__vec16_i64, int64_t, __vec16_i64, __gather64_i64); +GATHER_GENERAL(__vec16_f, float, __vec16_i64, __gather64_float); +GATHER_GENERAL(__vec16_d, double, __vec16_i64, __gather64_double); +/***********/ + +// scatter + +/* knc::macro::used */ +#define SCATTER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ +static FORCEINLINE void FUNC(unsigned char *b, uint32_t scale, \ + OTYPE offset, VTYPE val, \ + __vec16_i1 mask) { \ + int8_t *base = (int8_t *)b; \ + for (int i = 0; i < 16; ++i) \ + if ((mask.v & (1 << i)) != 0) { \ + STYPE *ptr = (STYPE *)(base + scale * offset[i]); \ + *ptr = val[i]; 
\ + } \ +} + + +/*****************/ +SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __scatter_base_offsets32_i8) +SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __scatter_base_offsets64_i8) +/*****************/ +SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __scatter_base_offsets32_i16) +SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __scatter_base_offsets64_i16) +/*****************/ +// SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __scatter_base_offsets32_i32) +static FORCEINLINE void __scatter_base_offsets32_i32(uint8_t *b, uint32_t scale, __vec16_i32 offsets, __vec16_i32 val, __vec16_i1 mask) +{ + _mm512_mask_i32extscatter_epi32(b, mask, offsets, val, + _MM_DOWNCONV_EPI32_NONE, scale, + _MM_HINT_NONE); +} +// SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __scatter_base_offsets64_i32) +static FORCEINLINE void __scatter_base_offsets64_i32(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_i32 value, __vec16_i1 mask) +{ + const __vec16_i64 offsets = _offsets.cvt2hilo(); + + __vec16_i1 still_to_do = mask; + while (still_to_do) { + int first_active_lane = _mm_tzcnt_32((int)still_to_do); + const uint &hi32 = ((uint*)&offsets.v_hi)[first_active_lane]; + __vec16_i1 match = _mm512_mask_cmp_epi32_mask(mask,offsets.v_hi, + __smear_i32<__vec16_i32>((int32_t)hi32), + _MM_CMPINT_EQ); + + void * base = (void*)((unsigned long)_base + + ((scale*(unsigned long)hi32) << 32)); + _mm512_mask_i32extscatter_epi32(base, match, offsets.v_lo, + value, + _MM_DOWNCONV_EPI32_NONE, scale, + _MM_HINT_NONE); + still_to_do = _mm512_kxor(match,still_to_do); + } +} +/*****************/ +// SCATTER_BASE_OFFSETS(__vec16_f, float, __vec16_i32, __scatter_base_offsets32_float) +static FORCEINLINE void __scatter_base_offsets32_float(void *base, uint32_t scale, __vec16_i32 offsets, + __vec16_f val, __vec16_i1 mask) +{ + _mm512_mask_i32extscatter_ps(base, mask, offsets, val, + _MM_DOWNCONV_PS_NONE, scale, + _MM_HINT_NONE); +} +//SCATTER_BASE_OFFSETS(__vec16_f, float, __vec16_i64, __scatter_base_offsets64_float) +static FORCEINLINE void __scatter_base_offsets64_float(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_f value, __vec16_i1 mask) +{ + const __vec16_i64 offsets = _offsets.cvt2hilo(); + + __vec16_i1 still_to_do = mask; + while (still_to_do) { + int first_active_lane = _mm_tzcnt_32((int)still_to_do); + const uint &hi32 = ((uint*)&offsets.v_hi)[first_active_lane]; + __vec16_i1 match = _mm512_mask_cmp_epi32_mask(mask,offsets.v_hi, + __smear_i32<__vec16_i32>((int32_t)hi32), + _MM_CMPINT_EQ); + + void * base = (void*)((unsigned long)_base + + ((scale*(unsigned long)hi32) << 32)); + _mm512_mask_i32extscatter_ps(base, match, offsets.v_lo, + value, + _MM_DOWNCONV_PS_NONE, scale, + _MM_HINT_NONE); + still_to_do = _mm512_kxor(match,still_to_do); + } +} +/*****************/ +SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __scatter_base_offsets32_i64) +SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __scatter_base_offsets64_i64) +/*****************/ +// SCATTER_BASE_OFFSETS(__vec16_d, double, __vec16_i32, __scatter_base_offsets32_double) +static FORCEINLINE void __scatter_base_offsets32_double(void *base, uint32_t scale, __vec16_i32 offsets, + __vec16_d val, __vec16_i1 mask) +{ + _mm512_mask_i32loextscatter_pd(base, mask, offsets, val.v1, + _MM_DOWNCONV_PD_NONE, scale, + _MM_HINT_NONE); + __m512i shuffled_offsets = _mm512_permute4f128_epi32(offsets.v, _MM_PERM_DCDC); + const __mmask8 mask8 = 0x00FF & (mask >> 8); /* knc::testme */ + 
_mm512_mask_i32loextscatter_pd(base, mask8, shuffled_offsets, val.v2, + _MM_DOWNCONV_PD_NONE, scale, + _MM_HINT_NONE); +} +SCATTER_BASE_OFFSETS(__vec16_d, double, __vec16_i64, __scatter_base_offsets64_double) + +/* knc::macro::used */ +#define SCATTER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \ +static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec16_i1 mask) { \ + VTYPE ret; \ + for (int i = 0; i < 16; ++i) \ + if ((mask.v & (1 << i)) != 0) { \ + STYPE *ptr = (STYPE *)ptrs[i]; \ + *ptr = val[i]; \ + } \ +} +/* knc::macro::used */ +#define SCATTER_GENERALF(VTYPE, STYPE, PTRTYPE, FUNC,FUNC1) \ +static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec16_i1 mask) { \ + return FUNC1(0, 1, ptrs, val, mask); \ +} + +/***********/ +SCATTER_GENERALF(__vec16_i8, int8_t, __vec16_i32, __scatter32_i8, __scatter_base_offsets32_i8) +SCATTER_GENERALF(__vec16_i16, int16_t, __vec16_i32, __scatter32_i16, __scatter_base_offsets32_i16) +SCATTER_GENERALF(__vec16_i32, int32_t, __vec16_i32, __scatter32_i32, __scatter_base_offsets32_i32) +SCATTER_GENERALF(__vec16_i64, int64_t, __vec16_i32, __scatter32_i64, __scatter_base_offsets32_i64) +SCATTER_GENERALF(__vec16_f, float, __vec16_i32, __scatter32_float, __scatter_base_offsets32_float) +SCATTER_GENERALF(__vec16_d, double, __vec16_i32, __scatter32_double, __scatter_base_offsets32_double) +/***********/ +SCATTER_GENERAL(__vec16_i8, int8_t, __vec16_i64, __scatter64_i8) +SCATTER_GENERAL(__vec16_i16, int16_t, __vec16_i64, __scatter64_i16) +SCATTER_GENERAL(__vec16_i32, int32_t, __vec16_i64, __scatter64_i32) +SCATTER_GENERAL(__vec16_f, float, __vec16_i64, __scatter64_float) +SCATTER_GENERAL(__vec16_i64, int64_t, __vec16_i64, __scatter64_i64) +SCATTER_GENERAL(__vec16_d, double, __vec16_i64, __scatter64_double) +/***********/ + +/////////////////////////////////////////////////////////////////////////// +// packed load/store +/////////////////////////////////////////////////////////////////////////// + + +static FORCEINLINE int32_t __packed_load_active(uint32_t *p, __vec16_i32 *val, __vec16_i1 mask) +{ + __vec16_i32 v = __load<64>(val); + v = _mm512_mask_extloadunpacklo_epi32(v, mask, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + v = _mm512_mask_extloadunpackhi_epi32(v, mask, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + __store<64>(val, v); + return _mm_countbits_32(uint32_t(mask)); +} + +static FORCEINLINE int32_t __packed_store_active(uint32_t *p, __vec16_i32 val, __vec16_i1 mask) +{ + _mm512_mask_extpackstorelo_epi32(p, mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + return _mm_countbits_32(uint32_t(mask)); +} + +static FORCEINLINE int32_t __packed_load_active(int32_t *p, __vec16_i32 *val, __vec16_i1 mask) +{ + __vec16_i32 v = __load<64>(val); + v = _mm512_mask_extloadunpacklo_epi32(v, mask, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + v = _mm512_mask_extloadunpackhi_epi32(v, mask, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + __store<64>(val, v); + return _mm_countbits_32(uint32_t(mask)); +} + +static FORCEINLINE int32_t __packed_store_active(int32_t *p, __vec16_i32 val, __vec16_i1 mask) +{ + _mm512_mask_extpackstorelo_epi32(p, mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + return _mm_countbits_32(uint32_t(mask)); +} + +/////////////////////////////////////////////////////////////////////////// +// aos/soa 
+/////////////////////////////////////////////////////////////////////////// + +static FORCEINLINE void __soa_to_aos3_float(__vec16_f v0, __vec16_f v1, __vec16_f v2, + float *ptr) { + for (int i = 0; i < 16; ++i) { + *ptr++ = __extract_element(v0, i); + *ptr++ = __extract_element(v1, i); + *ptr++ = __extract_element(v2, i); + } +} + +static FORCEINLINE void __aos_to_soa3_float(float *ptr, __vec16_f *out0, __vec16_f *out1, + __vec16_f *out2) { + for (int i = 0; i < 16; ++i) { + __insert_element(out0, i, *ptr++); + __insert_element(out1, i, *ptr++); + __insert_element(out2, i, *ptr++); + } +} + +static FORCEINLINE void __soa_to_aos4_float(__vec16_f v0, __vec16_f v1, __vec16_f v2, + __vec16_f v3, float *ptr) { + for (int i = 0; i < 16; ++i) { + *ptr++ = __extract_element(v0, i); + *ptr++ = __extract_element(v1, i); + *ptr++ = __extract_element(v2, i); + *ptr++ = __extract_element(v3, i); + } +} + +static FORCEINLINE void __aos_to_soa4_float(float *ptr, __vec16_f *out0, __vec16_f *out1, + __vec16_f *out2, __vec16_f *out3) { + for (int i = 0; i < 16; ++i) { + __insert_element(out0, i, *ptr++); + __insert_element(out1, i, *ptr++); + __insert_element(out2, i, *ptr++); + __insert_element(out3, i, *ptr++); + } +} + +/////////////////////////////////////////////////////////////////////////// +// prefetch +/////////////////////////////////////////////////////////////////////////// + +static FORCEINLINE void __prefetch_read_uniform_1(unsigned char *p) { + _mm_prefetch((char *)p, _MM_HINT_T0); // prefetch into L1$ +} + +static FORCEINLINE void __prefetch_read_uniform_2(unsigned char *p) { + _mm_prefetch((char *)p, _MM_HINT_T1); // prefetch into L2$ +} + +static FORCEINLINE void __prefetch_read_uniform_3(unsigned char *p) { + // There is no L3$ on KNC, don't want to pollute L2$ unecessarily +} + +static FORCEINLINE void __prefetch_read_uniform_nt(unsigned char *p) { + _mm_prefetch((char *)p, _MM_HINT_T2); // prefetch into L2$ with non-temporal hint + // _mm_prefetch(p, _MM_HINT_NTA); // prefetch into L1$ with non-temporal hint +} + +/////////////////////////////////////////////////////////////////////////// +// atomics +/////////////////////////////////////////////////////////////////////////// + +static FORCEINLINE uint32_t __atomic_add(uint32_t *p, uint32_t v) { +#ifdef _MSC_VER + return InterlockedAdd((LONG volatile *)p, v) - v; +#else + return __sync_fetch_and_add(p, v); +#endif +} + +static FORCEINLINE uint32_t __atomic_sub(uint32_t *p, uint32_t v) { +#ifdef _MSC_VER + return InterlockedAdd((LONG volatile *)p, -v) + v; +#else + return __sync_fetch_and_sub(p, v); +#endif +} + +static FORCEINLINE uint32_t __atomic_and(uint32_t *p, uint32_t v) { +#ifdef _MSC_VER + return InterlockedAnd((LONG volatile *)p, v); +#else + return __sync_fetch_and_and(p, v); +#endif +} + +static FORCEINLINE uint32_t __atomic_or(uint32_t *p, uint32_t v) { +#ifdef _MSC_VER + return InterlockedOr((LONG volatile *)p, v); +#else + return __sync_fetch_and_or(p, v); +#endif +} + +static FORCEINLINE uint32_t __atomic_xor(uint32_t *p, uint32_t v) { +#ifdef _MSC_VER + return InterlockedXor((LONG volatile *)p, v); +#else + return __sync_fetch_and_xor(p, v); +#endif +} + +static FORCEINLINE uint32_t __atomic_min(uint32_t *p, uint32_t v) { + int32_t old, min; + do { + old = *((volatile int32_t *)p); + min = (old < (int32_t)v) ? 
old : (int32_t)v;
+#ifdef _MSC_VER
+    } while (InterlockedCompareExchange((LONG volatile *)p, min, old) != old);
+#else
+    } while (__sync_bool_compare_and_swap(p, old, min) == false);
+#endif
+    return old;
+}
+
+static FORCEINLINE uint32_t __atomic_max(uint32_t *p, uint32_t v) {
+    int32_t old, max;
+    do {
+        old = *((volatile int32_t *)p);
+        max = (old > (int32_t)v) ? old : (int32_t)v;
+#ifdef _MSC_VER
+    } while (InterlockedCompareExchange((LONG volatile *)p, max, old) != old);
+#else
+    } while (__sync_bool_compare_and_swap(p, old, max) == false);
+#endif
+    return old;
+}
+
+static FORCEINLINE uint32_t __atomic_umin(uint32_t *p, uint32_t v) {
+    uint32_t old, min;
+    do {
+        old = *((volatile uint32_t *)p);
+        min = (old < v) ? old : v;
+#ifdef _MSC_VER
+    } while (InterlockedCompareExchange((LONG volatile *)p, min, old) != old);
+#else
+    } while (__sync_bool_compare_and_swap(p, old, min) == false);
+#endif
+    return old;
+}
+
+static FORCEINLINE uint32_t __atomic_umax(uint32_t *p, uint32_t v) {
+    uint32_t old, max;
+    do {
+        old = *((volatile uint32_t *)p);
+        max = (old > v) ? old : v;
+#ifdef _MSC_VER
+    } while (InterlockedCompareExchange((LONG volatile *)p, max, old) != old);
+#else
+    } while (__sync_bool_compare_and_swap(p, old, max) == false);
+#endif
+    return old;
+}
+
+static FORCEINLINE uint32_t __atomic_xchg(uint32_t *p, uint32_t v) {
+#ifdef _MSC_VER
+    return InterlockedExchange((LONG volatile *)p, v);
+#else
+    return __sync_lock_test_and_set(p, v);
+#endif
+}
+
+static FORCEINLINE uint32_t __atomic_cmpxchg(uint32_t *p, uint32_t cmpval,
+                                             uint32_t newval) {
+#ifdef _MSC_VER
+    return InterlockedCompareExchange((LONG volatile *)p, newval, cmpval);
+#else
+    return __sync_val_compare_and_swap(p, cmpval, newval);
+#endif
+}
+
+static FORCEINLINE uint64_t __atomic_add(uint64_t *p, uint64_t v) {
+#ifdef _MSC_VER
+    return InterlockedAdd64((LONGLONG volatile *)p, v) - v;
+#else
+    return __sync_fetch_and_add(p, v);
+#endif
+}
+
+static FORCEINLINE uint64_t __atomic_sub(uint64_t *p, uint64_t v) {
+#ifdef _MSC_VER
+    return InterlockedAdd64((LONGLONG volatile *)p, -v) + v;
+#else
+    return __sync_fetch_and_sub(p, v);
+#endif
+}
+
+static FORCEINLINE uint64_t __atomic_and(uint64_t *p, uint64_t v) {
+#ifdef _MSC_VER
+    // InterlockedAnd64 already returns the original value; no "- v" adjustment.
+    return InterlockedAnd64((LONGLONG volatile *)p, v);
+#else
+    return __sync_fetch_and_and(p, v);
+#endif
+}
+
+static FORCEINLINE uint64_t __atomic_or(uint64_t *p, uint64_t v) {
+#ifdef _MSC_VER
+    return InterlockedOr64((LONGLONG volatile *)p, v);
+#else
+    return __sync_fetch_and_or(p, v);
+#endif
+}
+
+static FORCEINLINE uint64_t __atomic_xor(uint64_t *p, uint64_t v) {
+#ifdef _MSC_VER
+    return InterlockedXor64((LONGLONG volatile *)p, v);
+#else
+    return __sync_fetch_and_xor(p, v);
+#endif
+}
+
+static FORCEINLINE uint64_t __atomic_min(uint64_t *p, uint64_t v) {
+    int64_t old, min;
+    do {
+        old = *((volatile int64_t *)p);
+        min = (old < (int64_t)v) ? old : (int64_t)v;
+#ifdef _MSC_VER
+    } while (InterlockedCompareExchange64((LONGLONG volatile *)p, min, old) != old);
+#else
+    } while (__sync_bool_compare_and_swap(p, old, min) == false);
+#endif
+    return old;
+}
+
+static FORCEINLINE uint64_t __atomic_max(uint64_t *p, uint64_t v) {
+    int64_t old, max;
+    do {
+        old = *((volatile int64_t *)p);
+        max = (old > (int64_t)v) ?
old : (int64_t)v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange64((LONGLONG volatile *)p, max, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, max) == false); +#endif + return old; +} + +static FORCEINLINE uint64_t __atomic_umin(uint64_t *p, uint64_t v) { + uint64_t old, min; + do { + old = *((volatile uint64_t *)p); + min = (old < v) ? old : v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange64((LONGLONG volatile *)p, min, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, min) == false); +#endif + return old; +} + +static FORCEINLINE uint64_t __atomic_umax(uint64_t *p, uint64_t v) { + uint64_t old, max; + do { + old = *((volatile uint64_t *)p); + max = (old > v) ? old : v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange64((LONGLONG volatile *)p, max, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, max) == false); +#endif + return old; +} + +static FORCEINLINE uint64_t __atomic_xchg(uint64_t *p, uint64_t v) { +#ifdef _MSC_VER + return InterlockedExchange64((LONGLONG volatile *)p, v); +#else + return __sync_lock_test_and_set(p, v); +#endif +} + +static FORCEINLINE uint64_t __atomic_cmpxchg(uint64_t *p, uint64_t cmpval, + uint64_t newval) { +#ifdef _MSC_VER + return InterlockedCompareExchange64((LONGLONG volatile *)p, newval, cmpval); +#else + return __sync_val_compare_and_swap(p, cmpval, newval); +#endif +} + +#ifdef WIN32 +#include +#define __clock __rdtsc +#else // WIN32 +static FORCEINLINE uint64_t __clock() { + uint32_t low, high; +#ifdef __x86_64 + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%rax", "%rbx", "%rcx", "%rdx" ); +#else + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%eax", "%ebx", "%ecx", "%edx" ); +#endif + __asm__ __volatile__ ("rdtsc" : "=a" (low), "=d" (high)); + return (uint64_t)high << 32 | low; +} + +#endif // !WIN32 + +#undef FORCEINLINE +#undef PRE_ALIGN +#undef POST_ALIGN diff --git a/examples/intrinsics/knc-i1x8.h b/examples/intrinsics/knc-i1x8.h new file mode 100644 index 00000000..d7696117 --- /dev/null +++ b/examples/intrinsics/knc-i1x8.h @@ -0,0 +1,2818 @@ +/** + Copyright (c) 2010-2012, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include +#include +#include +#include +#include +#include + +#ifdef _MSC_VER +#define FORCEINLINE __forceinline +#define PRE_ALIGN(x) /*__declspec(align(x))*/ +#define POST_ALIGN(x) +#define roundf(x) (floorf(x + .5f)) +#define round(x) (floor(x + .5)) +#else +#define FORCEINLINE __forceinline +#define PRE_ALIGN(x) +#define POST_ALIGN(x) __attribute__ ((aligned(x))) +#endif + +#define KNC 1 +#if 0 +extern "C" +{ + int printf(const unsigned char *, ...); + int puts(unsigned char *); + unsigned int putchar(unsigned int); + int fflush(void *); + uint8_t *memcpy(uint8_t *, uint8_t *, uint64_t); + uint8_t *memset(uint8_t *, uint8_t, uint64_t); + void memset_pattern16(void *, const void *, uint64_t); +} +#endif + +typedef float __vec1_f; +typedef double __vec1_d; +typedef int8_t __vec1_i8; +typedef int16_t __vec1_i16; +typedef int32_t __vec1_i32; +typedef int64_t __vec1_i64; + +struct __vec8_i1 { + __vec8_i1() { } + __vec8_i1(const __mmask8 &vv) : v(vv) { } + __vec8_i1(bool v0, bool v1, bool v2, bool v3, + bool v4, bool v5, bool v6, bool v7) { + v = ((v0 & 1) | + ((v1 & 1) << 1) | + ((v2 & 1) << 2) | + ((v3 & 1) << 3) | + ((v4 & 1) << 4) | + ((v5 & 1) << 5) | + ((v6 & 1) << 6) | + ((v7 & 1) << 7) ); + } + + __mmask8 v; + FORCEINLINE operator __mmask8() const { return v; }//0xFF & v; } +}; + + +template +struct vec8 { + vec8() { } + vec8(T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7) { + data[0] = v0; data[1] = v1; data[2] = v2; data[3] = v3; + data[4] = v4; data[5] = v5; data[6] = v6; data[7] = v7; + } + T data[8]; + FORCEINLINE const T& operator[](const int i) const { return data[i]; } + FORCEINLINE T& operator[](const int i) { return data[i]; } +}; + +/****************/ + +struct PRE_ALIGN(32) __vec8_i32 +{ +#ifdef __ZMM64BIT__ + __m512i _data; + FORCEINLINE __vec8_i32(const __m512i &in) : _data(in) {} + FORCEINLINE operator __m512i() const { return _data; } +#else /* __ZMM64BIT__ */ + typedef int32_t _v8si __attribute__((vector_size(32))); + _v8si _data; + FORCEINLINE __vec8_i32(const __m512i &in) + { + _mm512_mask_extpackstorelo_epi32((__m512i*)&_data, 0xFF, in, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + } + FORCEINLINE operator __m512i() const + { + return _mm512_extloadunpacklo_epi32(_mm512_setzero_epi32(), (uint8_t*)&_data, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + } +#endif /* __ZMM64BIT__ */ + + __vec8_i32() { } + FORCEINLINE __vec8_i32(int32_t v0, int32_t v1, int32_t v2, int32_t v3, + int32_t v4, int32_t v5, int32_t v6, int32_t v7) + { + const __m512i v = _mm512_set_16to16_pi(0,0,0,0,0,0,0,0, v7, v6, v5, v4, v3, v2, v1, v0); + *this = __vec8_i32(v); + } + + FORCEINLINE const int32_t& operator[](const int i) const { return ((int32_t*)this)[i]; } + FORCEINLINE int32_t& operator[](const int i) { return ((int32_t*)this)[i]; } +} POST_ALIGN(32); + +PRE_ALIGN(32) struct __vec8_f +{ +#ifdef __ZMM64BIT__ + __m512 _data; + FORCEINLINE __vec8_f(const __m512 &in) : _data(in) {} + FORCEINLINE operator __m512() const { return _data; } +#else /* __ZMM64BIT__ */ + typedef 
float _v8sf __attribute__((vector_size(32))); + _v8sf _data; + FORCEINLINE __vec8_f(const __m512 &in) + { + _mm512_mask_extpackstorelo_ps((__m512*)&_data, 0xFF, in, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); + } + FORCEINLINE operator __m512() const + { + return _mm512_extloadunpacklo_ps(_mm512_setzero_ps(), (uint8_t*)&_data, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + } +#endif /* __ZMM64BIT__ */ + FORCEINLINE __vec8_f() { } + FORCEINLINE __vec8_f(float v0, float v1, float v2, float v3, + float v4, float v5, float v6, float v7) + { + const __m512 v = _mm512_set_16to16_ps(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0, v7, v6, v5, v4, v3, v2, v1, v0); + *this = __vec8_f(v); + } + + FORCEINLINE const float& operator[](const int i) const { return ((float*)this)[i]; } + FORCEINLINE float& operator[](const int i) { return ((float*)this)[i]; } +} POST_ALIGN(32); + +struct PRE_ALIGN(64) __vec8_d +{ + __m512d v; + FORCEINLINE __vec8_d() : v(_mm512_undefined_pd()) {} + FORCEINLINE __vec8_d(const __m512d _v) : v(_v) {} + FORCEINLINE __vec8_d(const __vec8_d &o) : v(o.v) {} + FORCEINLINE __vec8_d& operator =(const __vec8_d &o) { v=o.v; return *this; } + FORCEINLINE operator __m512d() const { return v; } + FORCEINLINE __vec8_d(double v00, double v01, double v02, double v03, + double v04, double v05, double v06, double v07) : + v ( _mm512_set_8to8_pd(v07, v06, v05, v04, v03, v02, v01, v00) ) {} + FORCEINLINE const double& operator[](const int i) const { return ((double*)this)[i]; } + FORCEINLINE double& operator[](const int i) { return ((double*)this)[i]; } +} POST_ALIGN(64); + +/****************/ + +PRE_ALIGN(64) struct __vec8_i64 : public vec8 { + __vec8_i64() { } + __vec8_i64(int64_t v0, int64_t v1, int64_t v2, int64_t v3, + int64_t v4, int64_t v5, int64_t v6, int64_t v7) + : vec8(v0, v1, v2, v3, v4, v5, v6, v7) { } +} POST_ALIGN(64); + +PRE_ALIGN(16) struct __vec8_i8 : public vec8 { + __vec8_i8() { } + __vec8_i8(int8_t v0, int8_t v1, int8_t v2, int8_t v3, + int8_t v4, int8_t v5, int8_t v6, int8_t v7) + : vec8(v0, v1, v2, v3, v4, v5, v6, v7) { } +} POST_ALIGN(16); + +PRE_ALIGN(32) struct __vec8_i16 : public vec8 { + __vec8_i16() { } + __vec8_i16(int16_t v0, int16_t v1, int16_t v2, int16_t v3, + int16_t v4, int16_t v5, int16_t v6, int16_t v7) + : vec8(v0, v1, v2, v3, v4, v5, v6, v7) { } +} POST_ALIGN(32); + +static inline int32_t __extract_element(__vec8_i32, int); + + +/////////////////////////////////////////////////////////////////////////// +// macros... 
+ +#define UNARY_OP(TYPE, NAME, OP) \ +static FORCEINLINE TYPE NAME(TYPE v) { \ + TYPE ret; \ + for (int i = 0; i < 8; ++i) \ + ret[i] = OP(v[i]); \ + return ret; \ +} + +#define BINARY_OP(TYPE, NAME, OP) \ +static FORCEINLINE TYPE NAME(TYPE a, TYPE b) { \ + TYPE ret; \ + for (int i = 0; i < 8; ++i) \ + ret[i] = a[i] OP b[i]; \ + return ret; \ +} + +#define BINARY_OP_CAST(TYPE, CAST, NAME, OP) \ +static FORCEINLINE TYPE NAME(TYPE a, TYPE b) { \ + TYPE ret; \ + for (int i = 0; i < 8; ++i) \ + ret[i] = (CAST)(a[i]) OP (CAST)(b[i]); \ + return ret; \ +} + +#define BINARY_OP_FUNC(TYPE, NAME, FUNC) \ +static FORCEINLINE TYPE NAME(TYPE a, TYPE b) { \ + TYPE ret; \ + for (int i = 0; i < 8; ++i) \ + ret[i] = FUNC(a[i], b[i]); \ + return ret; \ +} + +#define CMP_OP(TYPE, SUFFIX, CAST, NAME, OP) \ +static FORCEINLINE __vec8_i1 NAME##_##SUFFIX(TYPE a, TYPE b) { \ + __vec8_i1 ret; \ + ret.v = 0; \ + for (int i = 0; i < 8; ++i) \ + ret.v |= ((CAST)(a[i]) OP (CAST)(b[i])) << i; \ + return ret; \ +} \ +static FORCEINLINE __vec8_i1 NAME##_##SUFFIX##_and_mask(TYPE a, TYPE b, \ + __vec8_i1 mask) { \ + __vec8_i1 ret; \ + ret.v = 0; \ + for (int i = 0; i < 8; ++i) \ + ret.v |= ((CAST)(a[i]) OP (CAST)(b[i])) << i; \ + ret.v &= mask.v; \ + return ret; \ +} + +#define INSERT_EXTRACT(VTYPE, STYPE) \ +static FORCEINLINE STYPE __extract_element(VTYPE v, int index) { \ + return ((STYPE *)&v)[index]; \ +} \ +static FORCEINLINE void __insert_element(VTYPE *v, int index, STYPE val) { \ + ((STYPE *)v)[index] = val; \ +} + +#define LOAD_STORE(VTYPE, STYPE) \ +template \ +static FORCEINLINE VTYPE __load(const VTYPE *p) { \ + STYPE *ptr = (STYPE *)p; \ + VTYPE ret; \ + for (int i = 0; i < 8; ++i) \ + ret[i] = ptr[i]; \ + return ret; \ +} \ +template \ +static FORCEINLINE void __store(VTYPE *p, VTYPE v) { \ + STYPE *ptr = (STYPE *)p; \ + for (int i = 0; i < 8; ++i) \ + ptr[i] = v[i]; \ +} + +#define LOADS(VTYPE, STYPE) \ +template \ +static FORCEINLINE VTYPE __load(const VTYPE *p) { \ + STYPE *ptr = (STYPE *)p; \ + VTYPE ret; \ + for (int i = 0; i < 8; ++i) \ + ret[i] = ptr[i]; \ + return ret; \ +} \ + +#define STORES(VTYPE, STYPE) \ +template \ +static FORCEINLINE void __store(VTYPE *p, VTYPE v) { \ + STYPE *ptr = (STYPE *)p; \ + for (int i = 0; i < 8; ++i) \ + ptr[i] = v[i]; \ +} + +#define REDUCE_ADD(TYPE, VTYPE, NAME) \ +static FORCEINLINE TYPE NAME(VTYPE v) { \ + TYPE ret = v[0]; \ + for (int i = 1; i < 8; ++i) \ + ret = ret + v[i]; \ + return ret; \ +} + +#define REDUCE_MINMAX(TYPE, VTYPE, NAME, OP) \ +static FORCEINLINE TYPE NAME(VTYPE v) { \ + TYPE ret = v[0]; \ + for (int i = 1; i < 8; ++i) \ + ret = (ret OP (TYPE)v[i]) ? 
ret : (TYPE)v[i]; \
+    return ret; \
+}
+
+#define SELECT(TYPE) \
+static FORCEINLINE TYPE __select(__vec8_i1 mask, TYPE a, TYPE b) { \
+    TYPE ret; \
+    for (int i = 0; i < 8; ++i) \
+        ret[i] = (mask.v & (1<<i)) ? a[i] : b[i]; \
+    return ret; \
+} \
+ \
+static FORCEINLINE TYPE __select(bool cond, TYPE a, TYPE b) { \
+    return cond ? a : b; \
+}
+
+#define SHIFT_UNIFORM(TYPE, CAST, NAME, OP) \
+static FORCEINLINE TYPE NAME(TYPE a, int32_t b) { \
+    TYPE ret; \
+    for (int i = 0; i < 8; ++i) \
+        ret[i] = (CAST)(a[i]) OP b; \
+    return ret; \
+}
+
+#define SMEAR(VTYPE, NAME, STYPE) \
+template <class RetVecType> VTYPE __smear_##NAME(STYPE); \
+template <> FORCEINLINE VTYPE __smear_##NAME<VTYPE>(STYPE v) { \
+    VTYPE ret; \
+    for (int i = 0; i < 8; ++i) \
+        ret[i] = v; \
+    return ret; \
+}
+
+#define SETZERO(VTYPE, NAME) \
+template <class RetVecType> VTYPE __setzero_##NAME(); \
+template <> FORCEINLINE VTYPE __setzero_##NAME<VTYPE>() { \
+    VTYPE ret; \
+    for (int i = 0; i < 8; ++i) \
+        ret[i] = 0; \
+    return ret; \
+}
+
+#define UNDEF(VTYPE, NAME) \
+template <class RetVecType> VTYPE __undef_##NAME(); \
+template <> FORCEINLINE VTYPE __undef_##NAME<VTYPE>() { \
+    return VTYPE(); \
+}
+
+#define BROADCAST(VTYPE, NAME, STYPE) \
+static FORCEINLINE VTYPE __broadcast_##NAME(VTYPE v, int index) { \
+    VTYPE ret; \
+    for (int i = 0; i < 8; ++i) \
+        ret[i] = v[index & 0x7]; \
+    return ret; \
+} \
+
+#define ROTATE(VTYPE, NAME, STYPE) \
+static FORCEINLINE VTYPE __rotate_##NAME(VTYPE v, int index) { \
+    VTYPE ret; \
+    for (int i = 0; i < 8; ++i) \
+        ret[i] = v[(i+index) & 0x7]; \
+    return ret; \
+} \
+
+#define SHUFFLES(VTYPE, NAME, STYPE) \
+static FORCEINLINE VTYPE __shuffle_##NAME(VTYPE v, __vec8_i32 index) { \
+    VTYPE ret; \
+    for (int i = 0; i < 8; ++i) \
+        ret[i] = v[__extract_element(index, i) & 0x7]; \
+    return ret; \
+} \
+static FORCEINLINE VTYPE __shuffle2_##NAME(VTYPE v0, VTYPE v1, __vec8_i32 index) { \
+    VTYPE ret; \
+    for (int i = 0; i < 8; ++i) { \
+        int ii = __extract_element(index, i) & 0xf; \
+        ret[i] = (ii < 8) ? v0[ii] : v1[ii-8]; \
+    } \
+    return ret; \
+}
+
+#define SHUFFLE2(VTYPE, NAME, STYPE) \
+static FORCEINLINE VTYPE __shuffle2_##NAME(VTYPE v0, VTYPE v1, __vec8_i32 index) { \
+    VTYPE ret; \
+    for (int i = 0; i < 8; ++i) { \
+        int ii = __extract_element(index, i) & 0xf; \
+        ret[i] = (ii < 8) ? v0[ii] : v1[ii-8]; \
+    } \
+    return ret; \
+}
+
+///////////////////////////////////////////////////////////////////////////
+
+INSERT_EXTRACT(__vec1_i8, int8_t)
+INSERT_EXTRACT(__vec1_i16, int16_t)
+INSERT_EXTRACT(__vec1_i32, int32_t)
+INSERT_EXTRACT(__vec1_i64, int64_t)
+INSERT_EXTRACT(__vec1_f, float)
+INSERT_EXTRACT(__vec1_d, double)
+
+///////////////////////////////////////////////////////////////////////////
+// mask ops
+
+static FORCEINLINE __vec8_i1 __movmsk(__vec8_i1 mask) {
+    return mask.v;
+}
+
+static FORCEINLINE bool __any(__vec8_i1 mask) {
+    return (mask.v!=0);
+}
+
+static FORCEINLINE bool __all(__vec8_i1 mask) {
+    return (mask.v==0xFF);
+}
+
+static FORCEINLINE bool __none(__vec8_i1 mask) {
+    return (mask.v==0);
+}
+
+static FORCEINLINE __vec8_i1 __equal_i1(__vec8_i1 a, __vec8_i1 b) {
+    return (a.v & b.v) | (~a.v & ~b.v);
+}
+
+static FORCEINLINE __vec8_i1 __and(__vec8_i1 a, __vec8_i1 b) {
+    return a.v & b.v;
+}
+
+static FORCEINLINE __vec8_i1 __xor(__vec8_i1 a, __vec8_i1 b) {
+    return a.v ^ b.v;
+}
+
+static FORCEINLINE __vec8_i1 __or(__vec8_i1 a, __vec8_i1 b) {
+    return a.v | b.v;
+}
+
+static FORCEINLINE __vec8_i1 __not(__vec8_i1 v) {
+    return ~v;
+}
+
+static FORCEINLINE __vec8_i1 __and_not1(__vec8_i1 a, __vec8_i1 b) {
+    return ~a.v & b.v;
+}
+
+static FORCEINLINE __vec8_i1 __and_not2(__vec8_i1 a, __vec8_i1 b) {
+    return a.v & ~b.v;
+}
+
+static FORCEINLINE __vec8_i1 __select(__vec8_i1 mask, __vec8_i1 a,
+                                      __vec8_i1 b) {
+    return (a.v & mask.v) | (b.v & ~mask.v);
+}
+
+static FORCEINLINE __vec8_i1 __select(bool cond, __vec8_i1 a, __vec8_i1 b) {
+    return cond ?
a : b; +} + +static FORCEINLINE bool __extract_element(__vec8_i1 vec, int index) { + return (vec.v & (1 << index)) ? true : false; +} + +static FORCEINLINE void __insert_element(__vec8_i1 *vec, int index, + bool val) { + if (val == false) + vec->v &= ~(1 << index); + else + vec->v |= (1 << index); +} + +template static FORCEINLINE __vec8_i1 __load(const __vec8_i1 *p) { + uint8_t *ptr = (uint8_t *)p; + __vec8_i1 r; + r.v = *ptr; + return r; +} + +template static FORCEINLINE void __store(__vec8_i1 *p, __vec8_i1 v) { + uint8_t *ptr = (uint8_t *)p; + *ptr = v.v; +} + +template RetVecType __smear_i1(int i); +template <> static FORCEINLINE __vec8_i1 __smear_i1<__vec8_i1>(int i) { + return i?0xFF:0x0; +} + +template RetVecType __setzero_i1(); +template <> static FORCEINLINE __vec8_i1 __setzero_i1<__vec8_i1>() { + return 0; +} + +template __vec8_i1 __undef_i1(); +template <> FORCEINLINE __vec8_i1 __undef_i1<__vec8_i1>() { + return __vec8_i1(); +} + + +/////////////////////////////////////////////////////////////////////////// +// int8 + +BINARY_OP(__vec8_i8, __add, +) +BINARY_OP(__vec8_i8, __sub, -) +BINARY_OP(__vec8_i8, __mul, *) + +BINARY_OP(__vec8_i8, __or, |) +BINARY_OP(__vec8_i8, __and, &) +BINARY_OP(__vec8_i8, __xor, ^) +BINARY_OP(__vec8_i8, __shl, <<) + +BINARY_OP_CAST(__vec8_i8, uint8_t, __udiv, /) +BINARY_OP_CAST(__vec8_i8, int8_t, __sdiv, /) + +BINARY_OP_CAST(__vec8_i8, uint8_t, __urem, %) +BINARY_OP_CAST(__vec8_i8, int8_t, __srem, %) +BINARY_OP_CAST(__vec8_i8, uint8_t, __lshr, >>) +BINARY_OP_CAST(__vec8_i8, int8_t, __ashr, >>) + +SHIFT_UNIFORM(__vec8_i8, uint8_t, __lshr, >>) +SHIFT_UNIFORM(__vec8_i8, int8_t, __ashr, >>) +SHIFT_UNIFORM(__vec8_i8, int8_t, __shl, <<) + +CMP_OP(__vec8_i8, i8, int8_t, __equal, ==) +CMP_OP(__vec8_i8, i8, int8_t, __not_equal, !=) +CMP_OP(__vec8_i8, i8, uint8_t, __unsigned_less_equal, <=) +CMP_OP(__vec8_i8, i8, int8_t, __signed_less_equal, <=) +CMP_OP(__vec8_i8, i8, uint8_t, __unsigned_greater_equal, >=) +CMP_OP(__vec8_i8, i8, int8_t, __signed_greater_equal, >=) +CMP_OP(__vec8_i8, i8, uint8_t, __unsigned_less_than, <) +CMP_OP(__vec8_i8, i8, int8_t, __signed_less_than, <) +CMP_OP(__vec8_i8, i8, uint8_t, __unsigned_greater_than, >) +CMP_OP(__vec8_i8, i8, int8_t, __signed_greater_than, >) + +SELECT(__vec8_i8) +INSERT_EXTRACT(__vec8_i8, int8_t) +SMEAR(__vec8_i8, i8, int8_t) +SETZERO(__vec8_i8, i8) +UNDEF(__vec8_i8, i8) +BROADCAST(__vec8_i8, i8, int8_t) +ROTATE(__vec8_i8, i8, int8_t) +SHUFFLES(__vec8_i8, i8, int8_t) +LOAD_STORE(__vec8_i8, int8_t) + +/////////////////////////////////////////////////////////////////////////// +// int16 + +BINARY_OP(__vec8_i16, __add, +) +BINARY_OP(__vec8_i16, __sub, -) +BINARY_OP(__vec8_i16, __mul, *) + +BINARY_OP(__vec8_i16, __or, |) +BINARY_OP(__vec8_i16, __and, &) +BINARY_OP(__vec8_i16, __xor, ^) +BINARY_OP(__vec8_i16, __shl, <<) + +BINARY_OP_CAST(__vec8_i16, uint16_t, __udiv, /) +BINARY_OP_CAST(__vec8_i16, int16_t, __sdiv, /) + +BINARY_OP_CAST(__vec8_i16, uint16_t, __urem, %) +BINARY_OP_CAST(__vec8_i16, int16_t, __srem, %) +BINARY_OP_CAST(__vec8_i16, uint16_t, __lshr, >>) +BINARY_OP_CAST(__vec8_i16, int16_t, __ashr, >>) + +SHIFT_UNIFORM(__vec8_i16, uint16_t, __lshr, >>) +SHIFT_UNIFORM(__vec8_i16, int16_t, __ashr, >>) +SHIFT_UNIFORM(__vec8_i16, int16_t, __shl, <<) + +CMP_OP(__vec8_i16, i16, int16_t, __equal, ==) +CMP_OP(__vec8_i16, i16, int16_t, __not_equal, !=) +CMP_OP(__vec8_i16, i16, uint16_t, __unsigned_less_equal, <=) +CMP_OP(__vec8_i16, i16, int16_t, __signed_less_equal, <=) +CMP_OP(__vec8_i16, i16, uint16_t, 
__unsigned_greater_equal, >=) +CMP_OP(__vec8_i16, i16, int16_t, __signed_greater_equal, >=) +CMP_OP(__vec8_i16, i16, uint16_t, __unsigned_less_than, <) +CMP_OP(__vec8_i16, i16, int16_t, __signed_less_than, <) +CMP_OP(__vec8_i16, i16, uint16_t, __unsigned_greater_than, >) +CMP_OP(__vec8_i16, i16, int16_t, __signed_greater_than, >) + +SELECT(__vec8_i16) +INSERT_EXTRACT(__vec8_i16, int16_t) +SMEAR(__vec8_i16, i16, int16_t) +SETZERO(__vec8_i16, i16) +UNDEF(__vec8_i16, i16) +BROADCAST(__vec8_i16, i16, int16_t) +ROTATE(__vec8_i16, i16, int16_t) +SHUFFLES(__vec8_i16, i16, int16_t) +LOAD_STORE(__vec8_i16, int16_t) + +#if 0 /* evghenii::int32 */ +/////////////////////////////////////////////////////////////////////////// +// int32 + +BINARY_OP(__vec8_i32, __add, +) +BINARY_OP(__vec8_i32, __sub, -) +BINARY_OP(__vec8_i32, __mul, *) + +BINARY_OP(__vec8_i32, __or, |) +BINARY_OP(__vec8_i32, __and, &) +BINARY_OP(__vec8_i32, __xor, ^) +BINARY_OP(__vec8_i32, __shl, <<) + +BINARY_OP_CAST(__vec8_i32, uint32_t, __udiv, /) +BINARY_OP_CAST(__vec8_i32, int32_t, __sdiv, /) + +BINARY_OP_CAST(__vec8_i32, uint32_t, __urem, %) +BINARY_OP_CAST(__vec8_i32, int32_t, __srem, %) +BINARY_OP_CAST(__vec8_i32, uint32_t, __lshr, >>) +BINARY_OP_CAST(__vec8_i32, int32_t, __ashr, >>) + +SHIFT_UNIFORM(__vec8_i32, uint32_t, __lshr, >>) +SHIFT_UNIFORM(__vec8_i32, int32_t, __ashr, >>) +SHIFT_UNIFORM(__vec8_i32, int32_t, __shl, <<) + +CMP_OP(__vec8_i32, i32, int32_t, __equal, ==) +CMP_OP(__vec8_i32, i32, int32_t, __not_equal, !=) +CMP_OP(__vec8_i32, i32, uint32_t, __unsigned_less_equal, <=) +CMP_OP(__vec8_i32, i32, int32_t, __signed_less_equal, <=) +CMP_OP(__vec8_i32, i32, uint32_t, __unsigned_greater_equal, >=) +CMP_OP(__vec8_i32, i32, int32_t, __signed_greater_equal, >=) +CMP_OP(__vec8_i32, i32, uint32_t, __unsigned_less_than, <) +CMP_OP(__vec8_i32, i32, int32_t, __signed_less_than, <) +CMP_OP(__vec8_i32, i32, uint32_t, __unsigned_greater_than, >) +CMP_OP(__vec8_i32, i32, int32_t, __signed_greater_than, >) + +SELECT(__vec8_i32) +INSERT_EXTRACT(__vec8_i32, int32_t) +SMEAR(__vec8_i32, i32, int32_t) +SETZERO(__vec8_i32, i32) +UNDEF(__vec8_i32, i32) +BROADCAST(__vec8_i32, i32, int32_t) +ROTATE(__vec8_i32, i32, int32_t) +SHUFFLES(__vec8_i32, i32, int32_t) +LOAD_STORE(__vec8_i32, int32_t) + +#else /* evghenii::int32 */ +/////////////////////////////////////////////////////////////////////////// +// int32 +/////////////////////////////////////////////////////////////////////////// + +#define IZERO _mm512_setzero_epi32() +static FORCEINLINE __vec8_i32 __add(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_add_epi32(IZERO,0xFF, a, b); +} + +static FORCEINLINE __vec8_i32 __sub(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_sub_epi32(IZERO,0xFF, a, b); +} + +static FORCEINLINE __vec8_i32 __mul(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_mullo_epi32(IZERO,0xFF, a, b); +} + +static FORCEINLINE __vec8_i32 __udiv(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_div_epu32(IZERO,0xFF, a, b); +} + +static FORCEINLINE __vec8_i32 __sdiv(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_div_epi32(IZERO,0xFF, a, b); +} + +static FORCEINLINE __vec8_i32 __urem(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_rem_epu32(IZERO,0xFF, a, b); +} + +static FORCEINLINE __vec8_i32 __srem(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_rem_epi32(IZERO,0xFF, a, b); +} + +static FORCEINLINE __vec8_i32 __or(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_or_epi32(IZERO,0xFF, a, b); +} + +static FORCEINLINE __vec8_i32 __and(__vec8_i32 a, 
__vec8_i32 b) { + return _mm512_mask_and_epi32(IZERO,0xFF, a, b); +} + +static FORCEINLINE __vec8_i32 __xor(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_xor_epi32(IZERO,0xFF, a, b); +} + +static FORCEINLINE __vec8_i32 __shl(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_sllv_epi32(IZERO,0xFF, a, b); +} + +static FORCEINLINE __vec8_i32 __lshr(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_srlv_epi32(IZERO,0xFF, a, b); +} + +static FORCEINLINE __vec8_i32 __ashr(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_srav_epi32(IZERO,0xFF, a, b); +} + +static FORCEINLINE __vec8_i32 __shl(__vec8_i32 a, int32_t n) { + return _mm512_mask_slli_epi32(IZERO,0xFF, a, n); +} + +static FORCEINLINE __vec8_i32 __lshr(__vec8_i32 a, int32_t n) { + return _mm512_mask_srli_epi32(IZERO,0xFF, a, n); +} + +static FORCEINLINE __vec8_i32 __ashr(__vec8_i32 a, int32_t n) { + return _mm512_mask_srai_epi32(IZERO,0xFF, a, n); +} + +static FORCEINLINE __vec8_i1 __equal_i32(const __vec8_i32 &a, const __vec8_i32 &b) { + return _mm512_mask_cmpeq_epi32_mask(0xFF,a, b); +} + +static FORCEINLINE __vec8_i1 __equal_i32_and_mask(const __vec8_i32 &a, const __vec8_i32 &b, + __vec8_i1 m) { + return _mm512_mask_cmpeq_epi32_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __not_equal_i32(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_cmpneq_epi32_mask(0xFF,a, b); +} + +static FORCEINLINE __vec8_i1 __not_equal_i32_and_mask(__vec8_i32 a, __vec8_i32 b, + __vec8_i1 m) { + return _mm512_mask_cmpneq_epi32_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __unsigned_less_equal_i32(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_cmple_epu32_mask(0xFF,a, b); +} + +static FORCEINLINE __vec8_i1 __unsigned_less_equal_i32_and_mask(__vec8_i32 a, __vec8_i32 b, + __vec8_i1 m) { + return _mm512_mask_cmple_epu32_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __signed_less_equal_i32(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_cmple_epi32_mask(0xFF,a, b); +} + +static FORCEINLINE __vec8_i1 __signed_less_equal_i32_and_mask(__vec8_i32 a, __vec8_i32 b, + __vec8_i1 m) { + return _mm512_mask_cmple_epi32_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __unsigned_greater_equal_i32(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_cmpge_epu32_mask(0xFF,a, b); +} + +static FORCEINLINE __vec8_i1 __unsigned_greater_equal_i32_and_mask(__vec8_i32 a, __vec8_i32 b, + __vec8_i1 m) { + return _mm512_mask_cmpge_epu32_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __signed_greater_equal_i32(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_cmpge_epi32_mask(0xFF,a, b); +} + +static FORCEINLINE __vec8_i1 __signed_greater_equal_i32_and_mask(__vec8_i32 a, __vec8_i32 b, + __vec8_i1 m) { + return _mm512_mask_cmpge_epi32_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __unsigned_less_than_i32(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_cmplt_epu32_mask(0xFF,a, b); +} + +static FORCEINLINE __vec8_i1 __unsigned_less_than_i32_and_mask(__vec8_i32 a, __vec8_i32 b, + __vec8_i1 m) { + return _mm512_mask_cmplt_epu32_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __signed_less_than_i32(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_cmplt_epi32_mask(0xFF,a, b); +} + +static FORCEINLINE __vec8_i1 __signed_less_than_i32_and_mask(__vec8_i32 a, __vec8_i32 b, + __vec8_i1 m) { + return _mm512_mask_cmplt_epi32_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __unsigned_greater_than_i32(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_cmpgt_epu32_mask(0xFF,a, b); +} + +static FORCEINLINE __vec8_i1 
__unsigned_greater_than_i32_and_mask(__vec8_i32 a, __vec8_i32 b, + __vec8_i1 m) { + return _mm512_mask_cmpgt_epu32_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __signed_greater_than_i32(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_cmpgt_epi32_mask(0xFF,a, b); +} + +static FORCEINLINE __vec8_i1 __signed_greater_than_i32_and_mask(__vec8_i32 a, __vec8_i32 b, + __vec8_i1 m) { + return _mm512_mask_cmpgt_epi32_mask(m, a, b); +} + +static FORCEINLINE __vec8_i32 __select(__vec8_i1 mask, + __vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_mov_epi32(b, mask, a); +} + +static FORCEINLINE __vec8_i32 __select(bool cond, __vec8_i32 a, __vec8_i32 b) { + return cond ? a : b; +} + +static FORCEINLINE int32_t __extract_element(__vec8_i32 v, int index) { //uint32_t index) { + return ((int32_t *)&v)[index]; +} + +static FORCEINLINE void __insert_element(__vec8_i32 *v, uint32_t index, int32_t val) { + ((int32_t *)v)[index] = val; +} + +template RetVecType __smear_i32(int32_t i); +template <> static FORCEINLINE __vec8_i32 __smear_i32<__vec8_i32>(int32_t i) { + return _mm512_set_16to16_epi32(0,0,0,0,0,0,0,0, i,i,i,i,i,i,i,i); +} + +static const __vec8_i32 __ispc_one = __smear_i32<__vec8_i32>(1); +static const __vec8_i32 __ispc_thirty_two = __smear_i32<__vec8_i32>(32); +static const __vec8_i32 __ispc_ffffffff = __smear_i32<__vec8_i32>(-1); +static const __vec8_i32 __ispc_stride1(0, 1, 2, 3, 4, 5, 6, 7); + +template RetVecType __setzero_i32(); +template <> static FORCEINLINE __vec8_i32 __setzero_i32<__vec8_i32>() { + return _mm512_setzero_epi32(); +} + +template RetVecType __undef_i32(); +template <> static FORCEINLINE __vec8_i32 __undef_i32<__vec8_i32>() { + return __vec8_i32(); +} + +static FORCEINLINE __vec8_i32 __broadcast_i32(__vec8_i32 v, int index) { + int32_t val = __extract_element(v, index & 0xf); + return _mm512_set1_epi32(val); +} + +#if 0 /* evghenii::doesn't work */ +static FORCEINLINE __vec8_i32 __rotate_i32(__vec8_i32 v, int index) { + __vec8_i32 idx = __smear_i32<__vec8_i32>(index); + __vec8_i32 shuffle = _mm512_and_epi32(_mm512_add_epi32(__ispc_stride1, idx), __smear_i32<__vec8_i32>(0x7)); + return _mm512_mask_permutevar_epi32(v, 0xffff, shuffle, v); +} +#else +ROTATE(__vec8_i32, i32, int32_t) +#endif + +static FORCEINLINE __vec8_i32 __shuffle_i32(__vec8_i32 v, __vec8_i32 index) { + return _mm512_mask_permutevar_epi32(v, 0xffff, index, v); +} +SHUFFLE2(__vec8_i32, i32, int32_t) /* evghenii::to implement */ + +template static FORCEINLINE __vec8_i32 __load(const __vec8_i32 *p) { + __vec8_i32 v; + v = _mm512_extloadunpacklo_epi32(v, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + v = _mm512_extloadunpackhi_epi32(v, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + return __select(0xFF,v,IZERO); +} + + +template static FORCEINLINE void __store(__vec8_i32 *p, __vec8_i32 v) { + _mm512_mask_extpackstorelo_epi32( p, 0xFF, v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, 0xFF, v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); +} + +#if 0 +template <> static FORCEINLINE __vec8_i32 __load<64>(const __vec8_i32 *p) { + return _mm512_load_epi32(p); +} +template <> static FORCEINLINE void __store<64>(__vec8_i32 *p, __vec8_i32 v) { + _mm512_store_epi32(p, v); +} +#endif +#endif /* evghenii::int32 */ + +/////////////////////////////////////////////////////////////////////////// +// int64 + +BINARY_OP(__vec8_i64, __add, +) +BINARY_OP(__vec8_i64, __sub, -) +BINARY_OP(__vec8_i64, __mul, *) + +BINARY_OP(__vec8_i64, __or, |) +BINARY_OP(__vec8_i64, __and, &) 
+BINARY_OP(__vec8_i64, __xor, ^) +BINARY_OP(__vec8_i64, __shl, <<) + +BINARY_OP_CAST(__vec8_i64, uint64_t, __udiv, /) +BINARY_OP_CAST(__vec8_i64, int64_t, __sdiv, /) + +BINARY_OP_CAST(__vec8_i64, uint64_t, __urem, %) +BINARY_OP_CAST(__vec8_i64, int64_t, __srem, %) +BINARY_OP_CAST(__vec8_i64, uint64_t, __lshr, >>) +BINARY_OP_CAST(__vec8_i64, int64_t, __ashr, >>) + +SHIFT_UNIFORM(__vec8_i64, uint64_t, __lshr, >>) +SHIFT_UNIFORM(__vec8_i64, int64_t, __ashr, >>) +SHIFT_UNIFORM(__vec8_i64, int64_t, __shl, <<) + +CMP_OP(__vec8_i64, i64, int64_t, __equal, ==) +CMP_OP(__vec8_i64, i64, int64_t, __not_equal, !=) +CMP_OP(__vec8_i64, i64, uint64_t, __unsigned_less_equal, <=) +CMP_OP(__vec8_i64, i64, int64_t, __signed_less_equal, <=) +CMP_OP(__vec8_i64, i64, uint64_t, __unsigned_greater_equal, >=) +CMP_OP(__vec8_i64, i64, int64_t, __signed_greater_equal, >=) +CMP_OP(__vec8_i64, i64, uint64_t, __unsigned_less_than, <) +CMP_OP(__vec8_i64, i64, int64_t, __signed_less_than, <) +CMP_OP(__vec8_i64, i64, uint64_t, __unsigned_greater_than, >) +CMP_OP(__vec8_i64, i64, int64_t, __signed_greater_than, >) + +SELECT(__vec8_i64) +INSERT_EXTRACT(__vec8_i64, int64_t) +SMEAR(__vec8_i64, i64, int64_t) +SETZERO(__vec8_i64, i64) +UNDEF(__vec8_i64, i64) +BROADCAST(__vec8_i64, i64, int64_t) +ROTATE(__vec8_i64, i64, int64_t) +SHUFFLES(__vec8_i64, i64, int64_t) +LOAD_STORE(__vec8_i64, int64_t) + + +#if 0 /* evghenii::float */ +/////////////////////////////////////////////////////////////////////////// +// float + +BINARY_OP(__vec8_f, __add, +) +BINARY_OP(__vec8_f, __sub, -) +BINARY_OP(__vec8_f, __mul, *) +BINARY_OP(__vec8_f, __div, /) + +CMP_OP(__vec8_f, float, float, __equal, ==) +CMP_OP(__vec8_f, float, float, __not_equal, !=) +CMP_OP(__vec8_f, float, float, __less_than, <) +CMP_OP(__vec8_f, float, float, __less_equal, <=) +CMP_OP(__vec8_f, float, float, __greater_than, >) +CMP_OP(__vec8_f, float, float, __greater_equal, >=) + +static FORCEINLINE __vec8_i1 __ordered_float(__vec8_f a, __vec8_f b) { + __vec8_i1 ret; + ret.v = 0; + for (int i = 0; i < 8; ++i) + ret.v |= ((a[i] == a[i]) && (b[i] == b[i])) ? (1 << i) : 0; + return ret; +} + +static FORCEINLINE __vec8_i1 __unordered_float(__vec8_f a, __vec8_f b) { + __vec8_i1 ret; + ret.v = 0; + for (int i = 0; i < 8; ++i) + ret.v |= ((a[i] != a[i]) || (b[i] != b[i])) ? 
(1 << i) : 0; + return ret; +} + +#if 0 + case Instruction::FRem: intrinsic = "__frem"; break; +#endif + +SELECT(__vec8_f) +INSERT_EXTRACT(__vec8_f, float) +SMEAR(__vec8_f, float, float) +SETZERO(__vec8_f, float) +UNDEF(__vec8_f, float) +BROADCAST(__vec8_f, float, float) +ROTATE(__vec8_f, float, float) +SHUFFLES(__vec8_f, float, float) +LOAD_STORE(__vec8_f, float) +#else /* evghenii::float */ + +/////////////////////////////////////////////////////////////////////////// +// float +/////////////////////////////////////////////////////////////////////////// + +#define FZERO _mm512_setzero_ps() +static FORCEINLINE __vec8_f __add(__vec8_f a, __vec8_f b) { + return _mm512_mask_add_ps(FZERO, 0xFF, a, b); +} + +static FORCEINLINE __vec8_f __sub(__vec8_f a, __vec8_f b) { + return _mm512_mask_sub_ps(FZERO, 0xFF, a, b); +} + +static FORCEINLINE __vec8_f __mul(__vec8_f a, __vec8_f b) { + return _mm512_mask_mul_ps(FZERO, 0xFF, a, b); +} + +static FORCEINLINE __vec8_f __div(__vec8_f a, __vec8_f b) { + return _mm512_mask_div_ps(FZERO, 0xFF, a, b); +} + +static FORCEINLINE __vec8_i1 __equal_float(__vec8_f a, __vec8_f b) { + return _mm512_mask_cmpeq_ps_mask(0xFF, a, b); +} + +static FORCEINLINE __vec8_i1 __equal_float_and_mask(__vec8_f a, __vec8_f b, + __vec8_i1 m) { + return _mm512_mask_cmpeq_ps_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __not_equal_float(__vec8_f a, __vec8_f b) { + return _mm512_mask_cmpneq_ps_mask(0xFF, a, b); +} + +static FORCEINLINE __vec8_i1 __not_equal_float_and_mask(__vec8_f a, __vec8_f b, + __vec8_i1 m) { + return _mm512_mask_cmpneq_ps_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __less_than_float(__vec8_f a, __vec8_f b) { + return _mm512_mask_cmplt_ps_mask(0xFF, a, b); +} + +static FORCEINLINE __vec8_i1 __less_than_float_and_mask(__vec8_f a, __vec8_f b, + __vec8_i1 m) { + return _mm512_mask_cmplt_ps_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __less_equal_float(__vec8_f a, __vec8_f b) { + return _mm512_mask_cmple_ps_mask(0xFF, a, b); +} + +static FORCEINLINE __vec8_i1 __less_equal_float_and_mask(__vec8_f a, __vec8_f b, + __vec8_i1 m) { + return _mm512_mask_cmple_ps_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __greater_than_float(__vec8_f a, __vec8_f b) { + return _mm512_mask_cmp_ps_mask(0xFF, a, b,_CMP_GT_OS); +} + +static FORCEINLINE __vec8_i1 __greater_than_float_and_mask(__vec8_f a, __vec8_f b, + __vec8_i1 m) { + return _mm512_mask_cmp_ps_mask(m,a, b,_CMP_GT_OS); +} + +static FORCEINLINE __vec8_i1 __greater_equal_float(__vec8_f a, __vec8_f b) { + return _mm512_mask_cmp_ps_mask(0xFF, a, b,_CMP_GE_OS); +} + +static FORCEINLINE __vec8_i1 __greater_equal_float_and_mask(__vec8_f a, __vec8_f b, + __vec8_i1 m) { + return _mm512_mask_cmp_ps_mask(m,a, b,_CMP_GE_OS); +} + +static FORCEINLINE __vec8_i1 __ordered_float(__vec8_f a, __vec8_f b) { + return _mm512_mask_cmpord_ps_mask(0xFF, a, b); +} + +static FORCEINLINE __vec8_i1 __unordered_float(__vec8_f a, __vec8_f b) { + return _mm512_mask_cmpunord_ps_mask(0xFF,a, b); +} + +static FORCEINLINE __vec8_f __select(__vec8_i1 mask, __vec8_f a, __vec8_f b) { + return _mm512_mask_mov_ps(b, mask & 0xFF, a); +} + +static FORCEINLINE __vec8_f __select(bool cond, __vec8_f a, __vec8_f b) { + return cond ? 
a : b; +} + +static FORCEINLINE float __extract_element(__vec8_f v, uint32_t index) { + return v[index]; + // return ((float *)&v)[index]; +} + +static FORCEINLINE void __insert_element(__vec8_f *v, uint32_t index, float val) { + (*v)[index] = val; +// ((float *)v)[index] = val; +} + +template RetVecType __smear_float(float f); +template <> static FORCEINLINE __vec8_f __smear_float<__vec8_f>(float f) { + return _mm512_set_16to16_ps(0,0,0,0,0,0,0,0, f,f,f,f,f,f,f,f); +} + +template RetVecType __setzero_float(); +template <> static FORCEINLINE __vec8_f __setzero_float<__vec8_f>() { + return _mm512_setzero_ps(); +} + +template RetVecType __undef_float(); +template <> static FORCEINLINE __vec8_f __undef_float<__vec8_f>() { + return __vec8_f(); +} + +static FORCEINLINE __vec8_f __broadcast_float(__vec8_f v, int index) { + float val = __extract_element(v, index & 0x7); + return _mm512_set_16to16_ps(0,0,0,0,0,0,0,0, val,val,val,val,val,val,val,val); +} + +#if 1 +static FORCEINLINE __vec8_f __shuffle_float(__vec8_f v, __vec8_i32 index) { + return _mm512_castsi512_ps(_mm512_mask_permutevar_epi32(_mm512_castps_si512(v), 0xffff, index, _mm512_castps_si512(v))); +} +#endif +ROTATE(__vec8_f, float, float) +SHUFFLE2(__vec8_f, float, float) + +#if 0 +LOADS(__vec8_f, float) +#else +template static FORCEINLINE __vec8_f __load(const __vec8_f *p) { + __vec8_f v; + v = _mm512_extloadunpacklo_ps(v, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + v = _mm512_extloadunpackhi_ps(v, (uint8_t*)p+64, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + return __select(0xFF,v,FZERO); +} +#endif + +#if 0 +STORES(__vec8_f, float) +#else +template static FORCEINLINE void __store(__vec8_f *p, __vec8_f v) +{ + _mm512_mask_extpackstorelo_ps( p, 0xFF, v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); + _mm512_mask_extpackstorehi_ps((uint8_t*)p+64, 0xFF, v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); +} +#endif + +#endif /* evghenii::float */ + +static FORCEINLINE float __exp_uniform_float(float v) { return expf(v);} +static FORCEINLINE __vec8_f __exp_varying_float(__vec8_f v) { return _mm512_mask_exp_ps(FZERO, 0xFF, v); } + + +static FORCEINLINE float __log_uniform_float(float v) { return logf(v);} +static FORCEINLINE __vec8_f __log_varying_float(__vec8_f v) { return _mm512_mask_log_ps(FZERO, 0xFF, v); } + +static FORCEINLINE float __pow_uniform_float(float a, float b) { return powf(a, b);} +static FORCEINLINE __vec8_f __pow_varying_float(__vec8_f a, __vec8_f b) { return _mm512_mask_pow_ps(FZERO, 0xFF, a,b); } + + +static FORCEINLINE int __intbits(float v) { + union { + float f; + int i; + } u; + u.f = v; + return u.i; +} + +static FORCEINLINE float __floatbits(int v) { + union { + float f; + int i; + } u; + u.i = v; + return u.f; +} + +static FORCEINLINE float __half_to_float_uniform(int16_t h) { + static const uint32_t shifted_exp = 0x7c00 << 13; // exponent mask after shift + + int32_t o = ((int32_t)(h & 0x7fff)) << 13; // exponent/mantissa bits + uint32_t exp = shifted_exp & o; // just the exponent + o += (127 - 15) << 23; // exponent adjust + + // handle exponent special cases + if (exp == shifted_exp) // Inf/NaN? + o += (128 - 16) << 23; // extra exp adjust + else if (exp == 0) { // Zero/Denormal? 
+ o += 1 << 23; // extra exp adjust + o = __intbits(__floatbits(o) - __floatbits(113 << 23)); // renormalize + } + + o |= ((int32_t)(h & 0x8000)) << 16; // sign bit + return __floatbits(o); +} + + +static FORCEINLINE __vec8_f __half_to_float_varying(__vec8_i16 v) { + __vec8_f ret; + for (int i = 0; i < 8; ++i) + ret[i] = __half_to_float_uniform(v[i]); + return ret; +} + + +static FORCEINLINE int16_t __float_to_half_uniform(float f) { + uint32_t sign_mask = 0x80000000u; + int32_t o; + + int32_t fint = __intbits(f); + int32_t sign = fint & sign_mask; + fint ^= sign; + + int32_t f32infty = 255 << 23; + o = (fint > f32infty) ? 0x7e00 : 0x7c00; + + // (De)normalized number or zero + // update fint unconditionally to save the blending; we don't need it + // anymore for the Inf/NaN case anyway. + const uint32_t round_mask = ~0xfffu; + const int32_t magic = 15 << 23; + const int32_t f16infty = 31 << 23; + + int32_t fint2 = __intbits(__floatbits(fint & round_mask) * __floatbits(magic)) - round_mask; + fint2 = (fint2 > f16infty) ? f16infty : fint2; // Clamp to signed infinity if overflowed + + if (fint < f32infty) + o = fint2 >> 13; // Take the bits! + + return (o | (sign >> 16)); +} + + +static FORCEINLINE __vec8_i16 __float_to_half_varying(__vec8_f v) { + __vec8_i16 ret; + for (int i = 0; i < 8; ++i) + ret[i] = __float_to_half_uniform(v[i]); + return ret; +} + + +#if 0 /* evghenii::double */ +/////////////////////////////////////////////////////////////////////////// +// double + +BINARY_OP(__vec8_d, __add, +) +BINARY_OP(__vec8_d, __sub, -) +BINARY_OP(__vec8_d, __mul, *) +BINARY_OP(__vec8_d, __div, /) + +CMP_OP(__vec8_d, double, double, __equal, ==) +CMP_OP(__vec8_d, double, double, __not_equal, !=) +CMP_OP(__vec8_d, double, double, __less_than, <) +CMP_OP(__vec8_d, double, double, __less_equal, <=) +CMP_OP(__vec8_d, double, double, __greater_than, >) +CMP_OP(__vec8_d, double, double, __greater_equal, >=) + +static FORCEINLINE __vec8_i1 __ordered_double(__vec8_d a, __vec8_d b) { + __vec8_i1 ret; + ret.v = 0; + for (int i = 0; i < 8; ++i) + ret.v |= ((a[i] == a[i]) && (b[i] == b[i])) ? (1 << i) : 0; + return ret; +} + +static FORCEINLINE __vec8_i1 __unordered_double(__vec8_d a, __vec8_d b) { + __vec8_i1 ret; + ret.v = 0; + for (int i = 0; i < 8; ++i) + ret.v |= ((a[i] != a[i]) || (b[i] != b[i])) ? 
(1 << i) : 0; + return ret; +} + +#if 0 + case Instruction::FRem: intrinsic = "__frem"; break; +#endif + +SELECT(__vec8_d) +INSERT_EXTRACT(__vec8_d, double) +SMEAR(__vec8_d, double, double) +SETZERO(__vec8_d, double) +UNDEF(__vec8_d, double) +BROADCAST(__vec8_d, double, double) +ROTATE(__vec8_d, double, double) +SHUFFLES(__vec8_d, double, double) +LOAD_STORE(__vec8_d, double) +#else /* evghenii::double */ +/////////////////////////////////////////////////////////////////////////// +// double +/////////////////////////////////////////////////////////////////////////// + +static FORCEINLINE __vec8_d __add(__vec8_d a, __vec8_d b) { + return _mm512_add_pd(a, b); +} +static FORCEINLINE __vec8_d __sub(__vec8_d a, __vec8_d b) { + return _mm512_sub_pd(a, b); +} +static FORCEINLINE __vec8_d __mul(__vec8_d a, __vec8_d b) { + return _mm512_mul_pd(a, b); +} + +static FORCEINLINE __vec8_d __div(__vec8_d a, __vec8_d b) { + return _mm512_div_pd(a, b); +} + +static FORCEINLINE __vec8_i1 __equal_double(__vec8_d a, __vec8_d b) { + return _mm512_cmpeq_pd_mask(a, b); +} + +static FORCEINLINE __vec8_i1 __equal_double_and_mask(__vec8_d a, __vec8_d b, + __vec8_i1 m) { + return _mm512_mask_cmpeq_pd_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __not_equal_double(__vec8_d a, __vec8_d b) { + return _mm512_cmpneq_pd_mask(a, b); +} + +static FORCEINLINE __vec8_i1 __not_equal_double_and_mask(__vec8_d a, __vec8_d b, + __vec8_i1 m) { + return _mm512_mask_cmpneq_pd_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __less_than_double(__vec8_d a, __vec8_d b) { + return _mm512_cmplt_pd_mask(a, b); +} + +static FORCEINLINE __vec8_i1 __less_than_double_and_mask(__vec8_d a, __vec8_d b, + __vec8_i1 m) { + return _mm512_mask_cmplt_pd_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __less_equal_double(__vec8_d a, __vec8_d b) { + return _mm512_cmple_pd_mask(a, b); +} + +static FORCEINLINE __vec8_i1 __less_equal_double_and_mask(__vec8_d a, __vec8_d b, + __vec8_i1 m) { + return _mm512_mask_cmple_pd_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __greater_than_double(__vec8_d a, __vec8_d b) { + return _mm512_cmpnle_pd_mask(a, b); +} + +static FORCEINLINE __vec8_i1 __greater_than_double_and_mask(__vec8_d a, __vec8_d b, + __vec8_i1 m) { + return _mm512_mask_cmpnle_pd_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __greater_equal_double(__vec8_d a, __vec8_d b) { + return _mm512_cmpnlt_pd_mask(a, b); +} + +static FORCEINLINE __vec8_i1 __greater_equal_double_and_mask(__vec8_d a, __vec8_d b, + __vec8_i1 m) { + return _mm512_mask_cmpnlt_pd_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __ordered_double(__vec8_d a, __vec8_d b) { + return _mm512_cmpord_pd_mask(a, b); +} + +static FORCEINLINE __vec8_i1 __unordered_double(__vec8_d a, __vec8_d b) { + return _mm512_cmpunord_pd_mask(a, b); +} + +static FORCEINLINE __vec8_d __select(__vec8_i1 mask, __vec8_d a, __vec8_d b) { + return _mm512_mask_mov_pd(b, mask, a); +} + + +static FORCEINLINE __vec8_d __select(bool cond, __vec8_d a, __vec8_d b) { + return cond ? 
a : b; +} + +static FORCEINLINE double __extract_element(__vec8_d v, uint32_t index) { + return ((double *)&v)[index]; +} + +static FORCEINLINE void __insert_element(__vec8_d *v, uint32_t index, double val) { + ((double *)v)[index] = val; +} + +template RetVecType __smear_double(double d); +template <> static FORCEINLINE __vec8_d __smear_double<__vec8_d>(double d) { return _mm512_set1_pd(d); } + +template RetVecType __setzero_double(); +template <> static FORCEINLINE __vec8_d __setzero_double<__vec8_d>() { return _mm512_setzero_pd(); } + +template RetVecType __undef_double(); +template <> static FORCEINLINE __vec8_d __undef_double<__vec8_d>() { return __vec8_d();} + +static FORCEINLINE __vec8_d __broadcast_double(__vec8_d v, int index) { + double val = __extract_element(v, index & 0xf); + return _mm512_set1_pd(val); +} + +ROTATE(__vec8_d, double, double) +SHUFFLES(__vec8_d, double, double) + +template static FORCEINLINE __vec8_d __load(const __vec8_d *p) { + __vec8_d ret; + ret.v = _mm512_extloadunpacklo_pd(ret.v, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + ret.v = _mm512_extloadunpackhi_pd(ret.v, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + return ret; +} + +template static FORCEINLINE void __store(__vec8_d *p, __vec8_d v) { + _mm512_extpackstorelo_pd(p, v.v, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_pd((uint8_t*)p+64, v.v, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); +} + + +#if 0 +template <> static FORCEINLINE __vec8_d __load<64>(const __vec8_d *p) { + return _mm512_load_pd(p); +} +template <> static FORCEINLINE __vec8_d __load<128>(const __vec8_d *p) { + return __load<64>(p); +} +template <> static FORCEINLINE void __store<64>(__vec8_d *p, __vec8_d v) { + _mm512_store_pd(p, v.v); +} +template <> static FORCEINLINE void __store<128>(__vec8_d *p, __vec8_d v) { + __store<64>(p, v); +} +#endif +#endif /* evghenii::double */ + +/////////////////////////////////////////////////////////////////////////// +// casts + + +#define CAST(TO, STO, FROM, SFROM, FUNC) \ +static FORCEINLINE TO FUNC(TO, FROM val) { \ + TO ret; \ + for (int i = 0; i < 8; ++i) \ + ret[i] = (STO)((SFROM)(val[i])); \ + return ret; \ +} + +// sign extension conversions +CAST(__vec8_i64, int64_t, __vec8_i32, int32_t, __cast_sext) +CAST(__vec8_i64, int64_t, __vec8_i16, int16_t, __cast_sext) +CAST(__vec8_i64, int64_t, __vec8_i8, int8_t, __cast_sext) +CAST(__vec8_i32, int32_t, __vec8_i16, int16_t, __cast_sext) +CAST(__vec8_i32, int32_t, __vec8_i8, int8_t, __cast_sext) +CAST(__vec8_i16, int16_t, __vec8_i8, int8_t, __cast_sext) + +#define CAST_SEXT_I1(TYPE) \ +static FORCEINLINE TYPE __cast_sext(TYPE, __vec8_i1 v) { \ + TYPE ret; \ + for (int i = 0; i < 8; ++i) { \ + ret[i] = 0; \ + if (v.v & (1 << i)) \ + ret[i] = ~ret[i]; \ + } \ + return ret; \ +} + +CAST_SEXT_I1(__vec8_i8) +CAST_SEXT_I1(__vec8_i16) +#if 0 +CAST_SEXT_I1(__vec8_i32) +#else +static FORCEINLINE __vec8_i32 __cast_sext(const __vec8_i32 &, const __vec8_i1 &val) +{ + __vec8_i32 ret = _mm512_setzero_epi32(); + __vec8_i32 one = _mm512_set1_epi32(-1); + return _mm512_mask_mov_epi32(ret, 0xFF & val, one); +} +#endif +CAST_SEXT_I1(__vec8_i64) + +// zero extension +CAST(__vec8_i64, uint64_t, __vec8_i32, uint32_t, __cast_zext) +CAST(__vec8_i64, uint64_t, __vec8_i16, uint16_t, __cast_zext) +CAST(__vec8_i64, uint64_t, __vec8_i8, uint8_t, __cast_zext) +CAST(__vec8_i32, uint32_t, __vec8_i16, uint16_t, __cast_zext) +CAST(__vec8_i32, uint32_t, __vec8_i8, uint8_t, __cast_zext) +CAST(__vec8_i16, uint16_t, __vec8_i8, uint8_t, __cast_zext) + +#define 
CAST_ZEXT_I1(TYPE) \ +static FORCEINLINE TYPE __cast_zext(TYPE, __vec8_i1 v) { \ + TYPE ret; \ + for (int i = 0; i < 8; ++i) \ + ret[i] = (v.v & (1 << i)) ? 1 : 0; \ + return ret; \ +} + +CAST_ZEXT_I1(__vec8_i8) +CAST_ZEXT_I1(__vec8_i16) +#if 0 +CAST_ZEXT_I1(__vec8_i32) +#else +static FORCEINLINE __vec8_i32 __cast_zext(const __vec8_i32 &, const __vec8_i1 &val) +{ + __vec8_i32 ret = _mm512_setzero_epi32(); + __vec8_i32 one = _mm512_set1_epi32(1); + return _mm512_mask_mov_epi32(ret, 0xFF & val, one); +} +#endif +CAST_ZEXT_I1(__vec8_i64) + +// truncations +CAST(__vec8_i32, int32_t, __vec8_i64, int64_t, __cast_trunc) +CAST(__vec8_i16, int16_t, __vec8_i64, int64_t, __cast_trunc) +CAST(__vec8_i8, int8_t, __vec8_i64, int64_t, __cast_trunc) +CAST(__vec8_i16, int16_t, __vec8_i32, int32_t, __cast_trunc) +CAST(__vec8_i8, int8_t, __vec8_i32, int32_t, __cast_trunc) +CAST(__vec8_i8, int8_t, __vec8_i16, int16_t, __cast_trunc) + +// signed int to float/double +#if 0 +CAST(__vec8_f, float, __vec8_i8, int8_t, __cast_sitofp) +CAST(__vec8_f, float, __vec8_i16, int16_t, __cast_sitofp) +CAST(__vec8_f, float, __vec8_i32, int32_t, __cast_sitofp) +#else +static FORCEINLINE __vec8_f __cast_sitofp(__vec8_f, __vec8_i8 val) {return _mm512_mask_extload_ps(FZERO, 0xFF, &val, _MM_UPCONV_PS_SINT8, _MM_BROADCAST_16X16, _MM_HINT_NONE);} +static FORCEINLINE __vec8_f __cast_sitofp(__vec8_f, __vec8_i16 val) {return _mm512_mask_extload_ps(FZERO, 0xFF, &val, _MM_UPCONV_PS_SINT16, _MM_BROADCAST_16X16, _MM_HINT_NONE);} +static FORCEINLINE __vec8_f __cast_sitofp(__vec8_f, __vec8_i32 val) {return _mm512_mask_cvtfxpnt_round_adjustepi32_ps(FZERO, 0xFF, val, _MM_ROUND_MODE_NEAREST, _MM_EXPADJ_NONE);} +#endif +CAST(__vec8_f, float, __vec8_i64, int64_t, __cast_sitofp) +#if 0 +CAST(__vec8_d, double, __vec8_i8, int8_t, __cast_sitofp) +CAST(__vec8_d, double, __vec8_i16, int16_t, __cast_sitofp) +CAST(__vec8_d, double, __vec8_i32, int32_t, __cast_sitofp) +#else +static FORCEINLINE __vec8_d __cast_sitofp(__vec8_d, __vec8_i8 val) { + __vec8_i32 vi = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_SINT8, _MM_BROADCAST_16X16, _MM_HINT_NONE); + return _mm512_cvtepi32lo_pd(vi); +} + +static FORCEINLINE __vec8_d __cast_sitofp(__vec8_d, __vec8_i16 val) { + __vec8_i32 vi = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_SINT16, _MM_BROADCAST_16X16, _MM_HINT_NONE); + return _mm512_cvtepi32lo_pd(vi); +} + +static FORCEINLINE __vec8_d __cast_sitofp(__vec8_d, __vec8_i32 val) { + __vec8_d ret; + return _mm512_cvtepi32lo_pd(val); +} +#endif +CAST(__vec8_d, double, __vec8_i64, int64_t, __cast_sitofp) + +// unsigned int to float/double +#if 0 +CAST(__vec8_f, float, __vec8_i8, uint8_t, __cast_uitofp) +CAST(__vec8_f, float, __vec8_i16, uint16_t, __cast_uitofp) +CAST(__vec8_f, float, __vec8_i32, uint32_t, __cast_uitofp) +#else +static FORCEINLINE __vec8_f __cast_uitofp(__vec8_f, __vec8_i8 val) {return _mm512_mask_extload_ps(FZERO, 0xFF, &val, _MM_UPCONV_PS_UINT8, _MM_BROADCAST_16X16, _MM_HINT_NONE);} +static FORCEINLINE __vec8_f __cast_uitofp(__vec8_f, __vec8_i16 val) {return _mm512_mask_extload_ps(FZERO, 0xFF, &val, _MM_UPCONV_PS_UINT16, _MM_BROADCAST_16X16, _MM_HINT_NONE);} +static FORCEINLINE __vec8_f __cast_uitofp(__vec8_f, __vec8_i32 val) {return _mm512_mask_cvtfxpnt_round_adjustepu32_ps(FZERO, 0xFF, val, _MM_ROUND_MODE_NEAREST, _MM_EXPADJ_NONE);} +#endif +CAST(__vec8_f, float, __vec8_i64, uint64_t, __cast_uitofp) +#if 0 +CAST(__vec8_d, double, __vec8_i8, uint8_t, __cast_uitofp) +CAST(__vec8_d, double, __vec8_i16, uint16_t, __cast_uitofp) +CAST(__vec8_d, double, 
__vec8_i32, uint32_t, __cast_uitofp) +#else +static FORCEINLINE __vec8_d __cast_uitofp(__vec8_d, __vec8_i8 val) { + __vec8_i32 vi = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_UINT8, _MM_BROADCAST_16X16, _MM_HINT_NONE); + return _mm512_cvtepu32lo_pd(vi); +} + +static FORCEINLINE __vec8_d __cast_uitofp(__vec8_d, __vec8_i16 val) { + __vec8_i32 vi = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_UINT16, _MM_BROADCAST_16X16, _MM_HINT_NONE); + return _mm512_cvtepu32lo_pd(vi); +} + +static FORCEINLINE __vec8_d __cast_uitofp(__vec8_d, __vec8_i32 val) { + __vec8_d ret; + return _mm512_cvtepu32lo_pd(val); +} +#endif +CAST(__vec8_d, double, __vec8_i64, uint64_t, __cast_uitofp) + +#if 0 +static FORCEINLINE __vec8_f __cast_uitofp(__vec8_f, __vec8_i1 v) { + __vec8_f ret; + for (int i = 0; i < 8; ++i) + ret[i] = (v.v & (1 << i)) ? 1. : 0.; + return ret; +} +#else +static FORCEINLINE __vec8_f __cast_uitofp(__vec8_f, __vec8_i1 v) +{ + const __m512 ret = _mm512_setzero_ps(); + const __m512 one = _mm512_set1_ps(1.0); + return _mm512_mask_mov_ps(ret, v & 0xFF, one); +} +#endif + +// float/double to signed int +CAST(__vec8_i8, int8_t, __vec8_f, float, __cast_fptosi) +CAST(__vec8_i16, int16_t, __vec8_f, float, __cast_fptosi) +#if 0 +CAST(__vec8_i32, int32_t, __vec8_f, float, __cast_fptosi) +#else +static FORCEINLINE __vec8_i32 __cast_fptosi(__vec8_i32, __vec8_f val) { + return _mm512_mask_cvtfxpnt_round_adjustps_epi32(IZERO, 0xFF, val, _MM_ROUND_MODE_TOWARD_ZERO, _MM_EXPADJ_NONE); +} +#endif +CAST(__vec8_i64, int64_t, __vec8_f, float, __cast_fptosi) +CAST(__vec8_i8, int8_t, __vec8_d, double, __cast_fptosi) +CAST(__vec8_i16, int16_t, __vec8_d, double, __cast_fptosi) +#if 1 +CAST(__vec8_i32, int32_t, __vec8_d, double, __cast_fptosi) +#else +#endif +CAST(__vec8_i64, int64_t, __vec8_d, double, __cast_fptosi) + +// float/double to unsigned int +CAST(__vec8_i8, uint8_t, __vec8_f, float, __cast_fptoui) +CAST(__vec8_i16, uint16_t, __vec8_f, float, __cast_fptoui) +#if 0 +CAST(__vec8_i32, uint32_t, __vec8_f, float, __cast_fptoui) +#else +static FORCEINLINE __vec8_i32 __cast_fptoui(__vec8_i32, __vec8_f val) { + return _mm512_mask_cvtfxpnt_round_adjustps_epu32(IZERO, 0xFF, val, _MM_ROUND_MODE_TOWARD_ZERO, _MM_EXPADJ_NONE); +} +#endif +CAST(__vec8_i64, uint64_t, __vec8_f, float, __cast_fptoui) +CAST(__vec8_i8, uint8_t, __vec8_d, double, __cast_fptoui) +CAST(__vec8_i16, uint16_t, __vec8_d, double, __cast_fptoui) +#if 1 +CAST(__vec8_i32, uint32_t, __vec8_d, double, __cast_fptoui) +#else +#endif +CAST(__vec8_i64, uint64_t, __vec8_d, double, __cast_fptoui) + +// float/double conversions +#if 0 +CAST(__vec8_f, float, __vec8_d, double, __cast_fptrunc) +CAST(__vec8_d, double, __vec8_f, float, __cast_fpext) +#else +static FORCEINLINE __vec8_f __cast_fptrunc(__vec8_f, __vec8_d val) { + return _mm512_mask_cvtpd_pslo(FZERO, 0xFF, val); +} +static FORCEINLINE __vec8_d __cast_fpext(__vec8_d, __vec8_f val) { + return _mm512_cvtpslo_pd(val); +} +#endif + +typedef union { + int32_t i32; + float f; + int64_t i64; + double d; +} BitcastUnion; + +#define CAST_BITS(TO, TO_ELT, FROM, FROM_ELT) \ +static FORCEINLINE TO __cast_bits(TO, FROM val) { \ + TO r; \ + for (int i = 0; i < 8; ++i) { \ + BitcastUnion u; \ + u.FROM_ELT = val[i]; \ + r[i] = u.TO_ELT; \ + } \ + return r; \ +} + +#if 0 +CAST_BITS(__vec8_f, f, __vec8_i32, i32) +CAST_BITS(__vec8_i32, i32, __vec8_f, f) +#else +static FORCEINLINE __vec8_f __cast_bits(__vec8_f, __vec8_i32 val) { + return _mm512_castsi512_ps(val); +} +static FORCEINLINE __vec8_i32 __cast_bits(__vec8_i32, __vec8_f val) 
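+// Bit-level reinterpretation: the float lane patterns are reused unchanged as 32-bit integers (no numeric conversion).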
{ + return _mm512_castps_si512(val); +} +#endif + +#if 0 +CAST_BITS(__vec8_d, d, __vec8_i64, i64) +CAST_BITS(__vec8_i64, i64, __vec8_d, d) +#else +static FORCEINLINE __vec8_i64 __cast_bits(__vec8_i64, __vec8_d val) { + return *(__vec8_i64*)&val; +} +static FORCEINLINE __vec8_d __cast_bits(__vec8_d, __vec8_i64 val) { + return *(__vec8_d*)&val; +} +#endif + +#define CAST_BITS_SCALAR(TO, FROM) \ +static FORCEINLINE TO __cast_bits(TO, FROM v) { \ + union { \ + TO to; \ + FROM from; \ + } u; \ + u.from = v; \ + return u.to; \ +} + +CAST_BITS_SCALAR(uint32_t, float) +CAST_BITS_SCALAR(int32_t, float) +CAST_BITS_SCALAR(float, uint32_t) +CAST_BITS_SCALAR(float, int32_t) +CAST_BITS_SCALAR(uint64_t, double) +CAST_BITS_SCALAR(int64_t, double) +CAST_BITS_SCALAR(double, uint64_t) +CAST_BITS_SCALAR(double, int64_t) + +/////////////////////////////////////////////////////////////////////////// +// various math functions + +static FORCEINLINE void __fastmath() { +} + +static FORCEINLINE float __round_uniform_float(float v) { + return roundf(v); +} + +static FORCEINLINE float __floor_uniform_float(float v) { + return floorf(v); +} + +static FORCEINLINE float __ceil_uniform_float(float v) { + return ceilf(v); +} + +static FORCEINLINE double __round_uniform_double(double v) { + return round(v); +} + +static FORCEINLINE double __floor_uniform_double(double v) { + return floor(v); +} + +static FORCEINLINE double __ceil_uniform_double(double v) { + return ceil(v); +} + +#if 0 +UNARY_OP(__vec8_f, __round_varying_float, roundf) +UNARY_OP(__vec8_f, __floor_varying_float, floorf) +UNARY_OP(__vec8_f, __ceil_varying_float, ceilf) +#else +static FORCEINLINE __vec8_f __round_varying_float(__vec8_f v) { + return _mm512_mask_round_ps(FZERO, 0xFF, v, _MM_ROUND_MODE_NEAREST, _MM_EXPADJ_NONE); +} + +static FORCEINLINE __vec8_f __floor_varying_float(__vec8_f v) { + return _mm512_mask_floor_ps(FZERO, 0xFF, v); +} + +static FORCEINLINE __vec8_f __ceil_varying_float(__vec8_f v) { + return _mm512_mask_ceil_ps(FZERO, 0xFF, v); +} +#endif + +#if 0 +UNARY_OP(__vec8_d, __round_varying_double, round) +UNARY_OP(__vec8_d, __floor_varying_double, floor) +UNARY_OP(__vec8_d, __ceil_varying_double, ceil) +#else +static FORCEINLINE __vec8_d __round_varying_float(__vec8_d v) { + return _mm512_svml_round_pd(v); +} + +static FORCEINLINE __vec8_d __floor_varying_float(__vec8_d v) { + return _mm512_floor_pd(v); +} + +static FORCEINLINE __vec8_d __ceil_varying_float(__vec8_d v) { + return _mm512_ceil_pd(v); +} +#endif + + +// min/max + +static FORCEINLINE float __min_uniform_float(float a, float b) { return (ab) ? a : b; } +static FORCEINLINE double __min_uniform_double(double a, double b) { return (ab) ? a : b; } + +static FORCEINLINE int32_t __min_uniform_int32(int32_t a, int32_t b) { return (ab) ? a : b; } +static FORCEINLINE int32_t __min_uniform_uint32(uint32_t a, uint32_t b) { return (ab) ? a : b; } + +static FORCEINLINE int64_t __min_uniform_int64(int64_t a, int64_t b) { return (ab) ? a : b; } +static FORCEINLINE int64_t __min_uniform_uint64(uint64_t a, uint64_t b) { return (ab) ? 
a : b; } + + +#if 0 +BINARY_OP_FUNC(__vec8_f, __max_varying_float, __max_uniform_float) +BINARY_OP_FUNC(__vec8_f, __min_varying_float, __min_uniform_float) +#else +static FORCEINLINE __vec8_f __max_varying_float (__vec8_f v1, __vec8_f v2) { return _mm512_mask_gmax_ps(FZERO, 0xFF, v1, v2);} +static FORCEINLINE __vec8_f __min_varying_float (__vec8_f v1, __vec8_f v2) { return _mm512_mask_gmin_ps(FZERO, 0xFF, v1, v2);} +#endif + +#if 0 +BINARY_OP_FUNC(__vec8_d, __max_varying_double, __max_uniform_double) +BINARY_OP_FUNC(__vec8_d, __min_varying_double, __min_uniform_double) +#else +static FORCEINLINE __vec8_d __max_varying_double(__vec8_d v1, __vec8_d v2) { return _mm512_gmax_pd(v1,v2); } +static FORCEINLINE __vec8_d __min_varying_double(__vec8_d v1, __vec8_d v2) { return _mm512_gmin_pd(v1,v2); } +#endif + +#if 0 +BINARY_OP_FUNC(__vec8_i32, __max_varying_int32, __max_uniform_int32) +BINARY_OP_FUNC(__vec8_i32, __min_varying_int32, __min_uniform_int32) +BINARY_OP_FUNC(__vec8_i32, __max_varying_uint32, __max_uniform_uint32) +BINARY_OP_FUNC(__vec8_i32, __min_varying_uint32, __min_uniform_uint32) +#else +static FORCEINLINE __vec8_i32 __max_varying_int32 (__vec8_i32 v1, __vec8_i32 v2) { return _mm512_mask_max_epi32(IZERO,0xFF, v1, v2);} +static FORCEINLINE __vec8_i32 __min_varying_int32 (__vec8_i32 v1, __vec8_i32 v2) { return _mm512_mask_min_epi32(IZERO,0xFF, v1, v2);} +static FORCEINLINE __vec8_i32 __max_varying_uint32(__vec8_i32 v1, __vec8_i32 v2) { return _mm512_mask_max_epu32(IZERO,0xFF, v1, v2);} +static FORCEINLINE __vec8_i32 __min_varying_uint32(__vec8_i32 v1, __vec8_i32 v2) { return _mm512_mask_min_epu32(IZERO,0xFF, v1, v2);} +#endif + +BINARY_OP_FUNC(__vec8_i64, __max_varying_int64, __max_uniform_int64) +BINARY_OP_FUNC(__vec8_i64, __min_varying_int64, __min_uniform_int64) +BINARY_OP_FUNC(__vec8_i64, __max_varying_uint64, __max_uniform_uint64) +BINARY_OP_FUNC(__vec8_i64, __min_varying_uint64, __min_uniform_uint64) + +// sqrt/rsqrt/rcp + +static FORCEINLINE float __rsqrt_uniform_float(float v) { + return 1.f / sqrtf(v); +} + +static FORCEINLINE float __rcp_uniform_float(float v) { + return 1.f / v; +} + +static FORCEINLINE float __sqrt_uniform_float(float v) { + return sqrtf(v); +} + +static FORCEINLINE double __sqrt_uniform_double(double v) { + return sqrt(v); +} + +#if 0 +UNARY_OP(__vec8_f, __rcp_varying_float, __rcp_uniform_float) +UNARY_OP(__vec8_f, __rsqrt_varying_float, __rsqrt_uniform_float) +UNARY_OP(__vec8_f, __sqrt_varying_float, __sqrt_uniform_float) +#else +static FORCEINLINE __vec8_f __rcp_varying_float(__vec8_f v) { +#ifdef ISPC_FAST_MATH + return _mm512_mask_rcp23_ps(FZERO, 0xFF, v); // Approximation with 23 bits of accuracy. 
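+  // ISPC_FAST_MATH accepts the reduced-accuracy hardware approximation above;
+  // without it, the path below falls back to _mm512_mask_recip_ps.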
+#else + return _mm512_mask_recip_ps(FZERO, 0xFF, v); +#endif +} + +static FORCEINLINE __vec8_f __rsqrt_varying_float(__vec8_f v) { +#ifdef ISPC_FAST_MATH + return _mm512_mask_rsqrt23_ps(FZERO,0xFF,v); // Approximation with 0.775ULP accuracy +#else + return _mm512_mask_invsqrt_ps(FZERO,0xFF,v); +#endif +} +static FORCEINLINE __vec8_f __sqrt_varying_float (__vec8_f v) { return _mm512_mask_sqrt_ps(FZERO,0xFF,v);} +#endif + +#if 0 +UNARY_OP(__vec8_d, __sqrt_varying_double, __sqrt_uniform_double) +#else +static FORCEINLINE __vec8_d __sqrt_varying_double(__vec8_d v) { return _mm512_sqrt_pd(v); } +#endif + +/////////////////////////////////////////////////////////////////////////// +// svml +/////////////////////////////////////////////////////////////////////////// + +static FORCEINLINE __vec8_f __svml_logf(__vec8_f v) { return _mm512_mask_log_ps(FZERO,0xFF,v); } +static FORCEINLINE __vec8_f __svml_expf(__vec8_f v) { return _mm512_mask_exp_ps(FZERO,0xFF,v); } +static FORCEINLINE __vec8_f __svml_cosf(__vec8_f v) { return _mm512_mask_cos_ps(FZERO,0xFF,v); } +static FORCEINLINE __vec8_f __svml_powf(__vec8_f a, __vec8_f b) { return _mm512_mask_pow_ps(FZERO,0xFF,a,b); } + +static FORCEINLINE __vec8_d __svml_logd(__vec8_d v) { return _mm512_log_pd(v); } +static FORCEINLINE __vec8_d __svml_expd(__vec8_d v) { return _mm512_exp_pd(v); } +static FORCEINLINE __vec8_d __svml_cosd(__vec8_d v) { return _mm512_cos_pd(v); } +static FORCEINLINE __vec8_d __svml_powd(__vec8_d a, __vec8_d b) { return _mm512_pow_pd(a,b); } + +/////////////////////////////////////////////////////////////////////////// +// bit ops + +static FORCEINLINE int32_t __popcnt_int32(uint32_t v) { + int count = 0; + for (; v != 0; v >>= 1) + count += (v & 1); + return count; +} + +static FORCEINLINE int32_t __popcnt_int64(uint64_t v) { + int count = 0; + for (; v != 0; v >>= 1) + count += (v & 1); + return count; +} + +static FORCEINLINE int32_t __count_trailing_zeros_i32(uint32_t v) { + if (v == 0) + return 32; + + int count = 0; + while ((v & 1) == 0) { + ++count; + v >>= 1; + } + return count; +} + +static FORCEINLINE int64_t __count_trailing_zeros_i64(uint64_t v) { + if (v == 0) + return 64; + + int count = 0; + while ((v & 1) == 0) { + ++count; + v >>= 1; + } + return count; +} + +static FORCEINLINE int32_t __count_leading_zeros_i32(uint32_t v) { + if (v == 0) + return 32; + + int count = 0; + while ((v & (1<<31)) == 0) { + ++count; + v <<= 1; + } + return count; +} + +static FORCEINLINE int64_t __count_leading_zeros_i64(uint64_t v) { + if (v == 0) + return 64; + + int count = 0; + while ((v & (1ull<<63)) == 0) { + ++count; + v <<= 1; + } + return count; +} + +/////////////////////////////////////////////////////////////////////////// +// reductions + +#if 0 +REDUCE_ADD(float, __vec8_f, __reduce_add_float) +REDUCE_MINMAX(float, __vec8_f, __reduce_min_float, <) +REDUCE_MINMAX(float, __vec8_f, __reduce_max_float, >) +#else +static FORCEINLINE float __reduce_add_float(__vec8_f v) { return _mm512_mask_reduce_add_ps(0xFF,v); } +static FORCEINLINE float __reduce_min_float(__vec8_f v) { return _mm512_mask_reduce_min_ps(0xFF,v); } +static FORCEINLINE float __reduce_max_float(__vec8_f v) { return _mm512_mask_reduce_max_ps(0xFF,v); } +#endif + +#if 0 +REDUCE_ADD(double, __vec8_d, __reduce_add_double) +REDUCE_MINMAX(double, __vec8_d, __reduce_min_double, <) +REDUCE_MINMAX(double, __vec8_d, __reduce_max_double, >) +#else +static FORCEINLINE float __reduce_add_double(__vec8_d v) { return _mm512_reduce_add_pd(v); } +static FORCEINLINE float 
__reduce_min_double(__vec8_d v) { return _mm512_reduce_min_pd(v); } +static FORCEINLINE float __reduce_max_double(__vec8_d v) { return _mm512_reduce_max_pd(v); } +#endif + + + +#if 0 +REDUCE_ADD (int64_t, __vec8_i32, __reduce_add_int32) +REDUCE_MINMAX(int32_t, __vec8_i32, __reduce_min_int32, <) +REDUCE_MINMAX(int32_t, __vec8_i32, __reduce_max_int32, >) +REDUCE_MINMAX(uint32_t, __vec8_i32, __reduce_min_uint32, <) +REDUCE_MINMAX(uint32_t, __vec8_i32, __reduce_max_uint32, >) +#else +static FORCEINLINE int64_t __reduce_add_int32 (__vec8_i32 v) { return _mm512_mask_reduce_add_epi32(0xFF, v);} +static FORCEINLINE int32_t __reduce_min_int32 (__vec8_i32 v) { return _mm512_mask_reduce_min_epi32(0xFF, v);} +static FORCEINLINE int32_t __reduce_max_int32 (__vec8_i32 v) { return _mm512_mask_reduce_max_epi32(0xFF, v);} +static FORCEINLINE uint32_t __reduce_min_uint32 (__vec8_i32 v) { return _mm512_mask_reduce_min_epu32(0xFF, v);} +static FORCEINLINE uint32_t __reduce_max_uint32 (__vec8_i32 v) { return _mm512_mask_reduce_max_epu32(0xFF, v);} +#endif + +REDUCE_ADD ( int16_t, __vec8_i8, __reduce_add_int8) +REDUCE_ADD ( int32_t, __vec8_i16, __reduce_add_int16) +REDUCE_ADD ( int64_t, __vec8_i64, __reduce_add_int64) +REDUCE_MINMAX( int64_t, __vec8_i64, __reduce_min_int64, <) +REDUCE_MINMAX( int64_t, __vec8_i64, __reduce_max_int64, >) +REDUCE_MINMAX(uint64_t, __vec8_i64, __reduce_min_uint64, <) +REDUCE_MINMAX(uint64_t, __vec8_i64, __reduce_max_uint64, >) + +/////////////////////////////////////////////////////////////////////////// +// masked load/store + +static FORCEINLINE __vec8_i8 __masked_load_i8(void *p, + __vec8_i1 mask) { + __vec8_i8 ret; + int8_t *ptr = (int8_t *)p; + for (int i = 0; i < 8; ++i) + if ((mask.v & (1 << i)) != 0) + ret[i] = ptr[i]; + return ret; +} + +static FORCEINLINE __vec8_i16 __masked_load_i16(void *p, + __vec8_i1 mask) { + __vec8_i16 ret; + int16_t *ptr = (int16_t *)p; + for (int i = 0; i < 8; ++i) + if ((mask.v & (1 << i)) != 0) + ret[i] = ptr[i]; + return ret; +} + +#if 0 +static FORCEINLINE __vec8_i32 __masked_load_i32(void *p, + __vec8_i1 mask) { + __vec8_i32 ret; + int32_t *ptr = (int32_t *)p; + for (int i = 0; i < 8; ++i) + if ((mask.v & (1 << i)) != 0) + ret[i] = ptr[i]; + return ret; +} +#else +static FORCEINLINE __vec8_i32 __masked_load_i32(void *p, __vec8_i1 mask) { +#ifdef ISPC_FORCE_ALIGNED_MEMORY + return _mm512_mask_load_epi32(__vec8_i32(), mask, p); +#else + __vec8_i32 tmp; + tmp = _mm512_mask_extloadunpacklo_epi32(tmp, 0xFF, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + tmp = _mm512_mask_extloadunpackhi_epi32(tmp, 0xFF, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + __vec8_i32 ret; + return _mm512_mask_mov_epi32(ret, 0xFF & mask, tmp); +#endif +} +#endif + +#if 0 +static FORCEINLINE __vec8_f __masked_load_float(void *p, + __vec8_i1 mask) { + __vec8_f ret; + float *ptr = (float *)p; + for (int i = 0; i < 8; ++i) + if ((mask.v & (1 << i)) != 0) + ret[i] = ptr[i]; + return ret; +} +#else +static FORCEINLINE __vec8_f __masked_load_float(void *p, __vec8_i1 mask) { +#ifdef ISPC_FORCE_ALIGNED_MEMORY + return _mm512_mask_load_ps(_mm512_undefined_ps(), mask,p); +#else + __vec8_f tmp; + tmp = _mm512_mask_extloadunpacklo_ps(tmp, 0xFF, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + tmp = _mm512_mask_extloadunpackhi_ps(tmp, 0xFF, (uint8_t*)p+64, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + __vec8_f ret; + return _mm512_mask_mov_ps(ret, 0xFF & mask, tmp); +#endif +} +#endif + +static FORCEINLINE __vec8_i64 __masked_load_i64(void *p, + __vec8_i1 mask) { + __vec8_i64 ret; + int64_t *ptr 
= (int64_t *)p; + for (int i = 0; i < 8; ++i) + if ((mask.v & (1 << i)) != 0) + ret[i] = ptr[i]; + return ret; +} + +#if 0 +static FORCEINLINE __vec8_d __masked_load_double(void *p, + __vec8_i1 mask) { + __vec8_d ret; + double *ptr = (double *)p; + for (int i = 0; i < 8; ++i) + if ((mask.v & (1 << i)) != 0) + ret[i] = ptr[i]; + return ret; +} +#else +static FORCEINLINE __vec8_d __masked_load_double(void *p, __vec8_i1 mask) { +#ifdef ISPC_FORCE_ALIGNED_MEMORY + __vec8_d ret = FZERO; + ret = _mm512_mask_load_pd(ret, 0xFF & mask, p); + return ret; +#else + __vec8_d tmp = FZERO; + tmp.v = _mm512_mask_extloadunpacklo_pd(tmp.v, 0xFF, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v = _mm512_mask_extloadunpackhi_pd(tmp.v, 0xFF, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + __vec8_d ret = FZERO; + ret.v = _mm512_mask_mov_pd(ret.v, mask, tmp.v); + return ret; +#endif +} +#endif + + +static FORCEINLINE void __masked_store_i8(void *p, __vec8_i8 val, + __vec8_i1 mask) { + int8_t *ptr = (int8_t *)p; + for (int i = 0; i < 8; ++i) + if ((mask.v & (1 << i)) != 0) + ptr[i] = val[i]; +} + +static FORCEINLINE void __masked_store_i16(void *p, __vec8_i16 val, + __vec8_i1 mask) { + int16_t *ptr = (int16_t *)p; + for (int i = 0; i < 8; ++i) + if ((mask.v & (1 << i)) != 0) + ptr[i] = val[i]; +} + +#if 0 +static FORCEINLINE void __masked_store_i32(void *p, __vec8_i32 val, + __vec8_i1 mask) { + int32_t *ptr = (int32_t *)p; + for (int i = 0; i < 8; ++i) + if ((mask.v & (1 << i)) != 0) + ptr[i] = val[i]; +} +#else +static FORCEINLINE void __masked_store_i32(void *p, __vec8_i32 val, __vec8_i1 mask) { +#ifdef ISPC_FORCE_ALIGNED_MEMORY + _mm512_mask_store_epi32(p, mask, val.v); +#else + __vec8_i32 tmp; + tmp = _mm512_extloadunpacklo_epi32(tmp, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + tmp = _mm512_extloadunpackhi_epi32(tmp, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + tmp = _mm512_mask_mov_epi32(tmp, 0xFF & mask, val); + _mm512_mask_extpackstorelo_epi32( p, 0xFF, tmp, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, 0xFF, tmp, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); +#endif +} +#endif + +#if 0 +static FORCEINLINE void __masked_store_float(void *p, __vec8_f val, + __vec8_i1 mask) { + float *ptr = (float *)p; + for (int i = 0; i < 8; ++i) + if ((mask.v & (1 << i)) != 0) + ptr[i] = val[i]; +} +#else +static FORCEINLINE void __masked_store_float(void *p, __vec8_f val, + __vec8_i1 mask) { +#ifdef ISPC_FORCE_ALIGNED_MEMORY + _mm512_mask_store_ps(p, 0xFF & mask, val.v); +#else + __vec8_f tmp = FZERO; + tmp = _mm512_extloadunpacklo_ps(tmp, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + tmp = _mm512_extloadunpackhi_ps(tmp, (uint8_t*)p+64, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + tmp = _mm512_mask_mov_ps(tmp, 0xFF & mask, val); + _mm512_mask_extpackstorelo_ps( p, 0xFF, tmp, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); + _mm512_mask_extpackstorehi_ps((uint8_t*)p+64, 0xFF, tmp, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); +#endif +} +#endif + +static FORCEINLINE void __masked_store_i64(void *p, __vec8_i64 val, + __vec8_i1 mask) { + int64_t *ptr = (int64_t *)p; + for (int i = 0; i < 8; ++i) + if ((mask.v & (1 << i)) != 0) + ptr[i] = val[i]; +} + +#if 0 +static FORCEINLINE void __masked_store_double(void *p, __vec8_d val, + __vec8_i1 mask) { + double *ptr = (double *)p; + for (int i = 0; i < 8; ++i) + if ((mask.v & (1 << i)) != 0) + ptr[i] = val[i]; +} +#else +static FORCEINLINE void __masked_store_double(void *p, __vec8_d val, + __vec8_i1 mask) { +#ifdef ISPC_FORCE_ALIGNED_MEMORY + 
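+  // Aligned path: one masked 512-bit store, writing only the lanes selected by 'mask'.
+  // The unaligned path below instead loads the destination, blends 'val' in under the mask,
+  // and packs the merged vector back out to memory.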
_mm512_mask_store_pd(p, mask, val.v); +#else + __vec8_d tmp; + tmp.v = _mm512_extloadunpacklo_pd(tmp.v, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v = _mm512_extloadunpackhi_pd(tmp.v, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v = _mm512_mask_mov_pd(tmp.v, mask, val.v); + _mm512_extpackstorelo_pd(p, tmp.v, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_pd((uint8_t*)p+64, tmp.v, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); +#endif +} +#endif + +static FORCEINLINE void __masked_store_blend_i8(void *p, __vec8_i8 val, + __vec8_i1 mask) { + __masked_store_i8(p, val, mask); +} + +static FORCEINLINE void __masked_store_blend_i16(void *p, __vec8_i16 val, + __vec8_i1 mask) { + __masked_store_i16(p, val, mask); +} + +static FORCEINLINE void __masked_store_blend_i32(void *p, __vec8_i32 val, + __vec8_i1 mask) { + __masked_store_i32(p, val, mask); +} + +static FORCEINLINE void __masked_store_blend_float(void *p, __vec8_f val, + __vec8_i1 mask) { + __masked_store_float(p, val, mask); +} + +static FORCEINLINE void __masked_store_blend_i64(void *p, __vec8_i64 val, + __vec8_i1 mask) { + __masked_store_i64(p, val, mask); +} + +static FORCEINLINE void __masked_store_blend_double(void *p, __vec8_d val, + __vec8_i1 mask) { + __masked_store_double(p, val, mask); +} + +/////////////////////////////////////////////////////////////////////////// +// gather/scatter + +// offsets * offsetScale is in bytes (for all of these) + +#define GATHER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ +static FORCEINLINE VTYPE FUNC(unsigned char *b, uint32_t scale, \ + OTYPE offset, __vec8_i1 mask) { \ + VTYPE ret; \ + int8_t *base = (int8_t *)b; \ + for (int i = 0; i < 8; ++i) \ + if ((mask.v & (1 << i)) != 0) { \ + STYPE *ptr = (STYPE *)(base + scale * offset[i]); \ + ret[i] = *ptr; \ + } \ + return ret; \ +} + + +/****************/ +#if 0 +GATHER_BASE_OFFSETS(__vec8_i8, int8_t, __vec8_i32, __gather_base_offsets32_i8) +#else +static FORCEINLINE __vec8_i8 __gather_base_offsets32_i8(uint8_t *base, uint32_t scale, __vec8_i32 offsets, __vec8_i1 mask) +{ + // (iw): need to temporarily store as int because gathers can only return ints. 
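+  // _MM_UPCONV_EPI32_SINT8 makes the gather read single signed bytes from memory
+  // and sign-extend each one into a 32-bit lane of 'tmp'.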
+ __vec8_i32 tmp = _mm512_mask_i32extgather_epi32(_mm512_undefined_epi32(), 0xFF & mask, offsets, base, + _MM_UPCONV_EPI32_SINT8, scale, + _MM_HINT_NONE); + // now, downconverting to chars into temporary char vector + __vec8_i8 ret; + _mm512_mask_extstore_epi32(ret.data,0xFF,tmp,_MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE); + return ret; +} +#endif +GATHER_BASE_OFFSETS(__vec8_i8, int8_t, __vec8_i64, __gather_base_offsets64_i8) +/****************/ +GATHER_BASE_OFFSETS(__vec8_i16, int16_t, __vec8_i32, __gather_base_offsets32_i16) +GATHER_BASE_OFFSETS(__vec8_i16, int16_t, __vec8_i64, __gather_base_offsets64_i16) +/****************/ +#if 0 +GATHER_BASE_OFFSETS(__vec8_i32, int32_t, __vec8_i32, __gather_base_offsets32_i32) +#else +static FORCEINLINE __vec8_i32 __gather_base_offsets32_i32(uint8_t *base, uint32_t scale, __vec8_i32 offsets, __vec8_i1 mask) +{ + return _mm512_mask_i32extgather_epi32(_mm512_undefined_epi32(), 0xFF & mask, offsets, + base, _MM_UPCONV_EPI32_NONE, scale, + _MM_HINT_NONE); +} +#endif +GATHER_BASE_OFFSETS(__vec8_i32, int32_t, __vec8_i64, __gather_base_offsets64_i32) +/****************/ +#if 0 +GATHER_BASE_OFFSETS(__vec8_f, float, __vec8_i32, __gather_base_offsets32_float) +#else +static FORCEINLINE __vec8_f __gather_base_offsets32_float(uint8_t *base, uint32_t scale, __vec8_i32 offsets, __vec8_i1 mask) +{ + return _mm512_mask_i32extgather_ps(_mm512_undefined_ps(), 0xFF & mask, offsets, + base, _MM_UPCONV_PS_NONE, scale, + _MM_HINT_NONE); +} +#endif +GATHER_BASE_OFFSETS(__vec8_f, float, __vec8_i64, __gather_base_offsets64_float) +/****************/ +GATHER_BASE_OFFSETS(__vec8_i64, int64_t, __vec8_i32, __gather_base_offsets32_i64) +GATHER_BASE_OFFSETS(__vec8_i64, int64_t, __vec8_i64, __gather_base_offsets64_i64) +/****************/ +#if 0 +GATHER_BASE_OFFSETS(__vec8_d, double, __vec8_i32, __gather_base_offsets32_double) +#else +static FORCEINLINE __vec8_d __gather_base_offsets32_double(uint8_t *base, uint32_t scale, __vec8_i32 offsets, __vec8_i1 mask) +{ + __vec8_d ret; + ret.v = _mm512_mask_i32loextgather_pd(_mm512_undefined_pd(), mask, offsets, + base, _MM_UPCONV_PD_NONE, scale, + _MM_HINT_NONE); +#if 0 + __m512i shuffled_offsets = _mm512_permute4f128_epi32(offsets.v, _MM_PERM_DCDC); + const __mmask8 mask8 = 0x00FF & (mask >> 8); /* evghenii::testme */ + ret.v2 = _mm512_mask_i32loextgather_pd(_mm512_undefined_pd(), mask8, shuffled_offsets, + base, _MM_UPCONV_PD_NONE, scale, + _MM_HINT_NONE); +#endif + return ret; +} +#endif +GATHER_BASE_OFFSETS(__vec8_d, double, __vec8_i64, __gather_base_offsets64_double) + +#define GATHER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \ +static FORCEINLINE VTYPE FUNC(PTRTYPE ptrs, __vec8_i1 mask) { \ + VTYPE ret; \ + for (int i = 0; i < 8; ++i) \ + if ((mask.v & (1 << i)) != 0) { \ + STYPE *ptr = (STYPE *)ptrs[i]; \ + ret[i] = *ptr; \ + } \ + return ret; \ +} +#define GATHER_GENERALF(VTYPE, STYPE, PTRTYPE, FUNC,FUNC1) \ +static FORCEINLINE VTYPE FUNC(PTRTYPE ptrs, __vec8_i1 mask) { \ + return FUNC1(0, 1, ptrs, mask); \ +} + + +#if 1 +/***********/ +GATHER_GENERALF(__vec8_i8, int8_t, __vec8_i32, __gather32_i8, __gather_base_offsets32_i8) +GATHER_GENERALF(__vec8_i16, int16_t, __vec8_i32, __gather32_i16, __gather_base_offsets32_i16) +GATHER_GENERALF(__vec8_i32, int32_t, __vec8_i32, __gather32_i32, __gather_base_offsets32_i32) +GATHER_GENERALF(__vec8_i64, int64_t, __vec8_i32, __gather32_i64, __gather_base_offsets32_i64) +GATHER_GENERALF(__vec8_f, float, __vec8_i32, __gather32_float, __gather_base_offsets32_float) +GATHER_GENERALF(__vec8_d, double, 
__vec8_i32, __gather32_double, __gather_base_offsets32_double) +/***********/ +GATHER_GENERAL(__vec8_i8, int8_t, __vec8_i64, __gather64_i8); +GATHER_GENERAL(__vec8_i16, int16_t, __vec8_i64, __gather64_i16); +GATHER_GENERAL(__vec8_i32, int32_t, __vec8_i64, __gather64_i32); +GATHER_GENERAL(__vec8_i64, int64_t, __vec8_i64, __gather64_i64); +GATHER_GENERAL(__vec8_f, float, __vec8_i64, __gather64_float); +GATHER_GENERAL(__vec8_d, double, __vec8_i64, __gather64_double); +/***********/ +#endif + +// scatter + +#define SCATTER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ +static FORCEINLINE void FUNC(unsigned char *b, uint32_t scale, \ + OTYPE offset, VTYPE val, \ + __vec8_i1 mask) { \ + int8_t *base = (int8_t *)b; \ + for (int i = 0; i < 8; ++i) \ + if ((mask.v & (1 << i)) != 0) { \ + STYPE *ptr = (STYPE *)(base + scale * offset[i]); \ + *ptr = val[i]; \ + } \ +} + + +/*****************/ +SCATTER_BASE_OFFSETS(__vec8_i8, int8_t, __vec8_i32, __scatter_base_offsets32_i8) +SCATTER_BASE_OFFSETS(__vec8_i8, int8_t, __vec8_i64, __scatter_base_offsets64_i8) +/*****************/ +SCATTER_BASE_OFFSETS(__vec8_i16, int16_t, __vec8_i32, __scatter_base_offsets32_i16) +SCATTER_BASE_OFFSETS(__vec8_i16, int16_t, __vec8_i64, __scatter_base_offsets64_i16) +/*****************/ +#if 0 +SCATTER_BASE_OFFSETS(__vec8_i32, int32_t, __vec8_i32, __scatter_base_offsets32_i32) +#else +static FORCEINLINE void __scatter_base_offsets32_i32(uint8_t *b, uint32_t scale, __vec8_i32 offsets, __vec8_i32 val, __vec8_i1 mask) +{ + _mm512_mask_i32extscatter_epi32(b, 0xFF & mask, offsets, val, + _MM_DOWNCONV_EPI32_NONE, scale, + _MM_HINT_NONE); +} +#endif +SCATTER_BASE_OFFSETS(__vec8_i32, int32_t, __vec8_i64, __scatter_base_offsets64_i32) +/*****************/ +#if 0 +SCATTER_BASE_OFFSETS(__vec8_f, float, __vec8_i32, __scatter_base_offsets32_float) +#else +static FORCEINLINE void __scatter_base_offsets32_float(void *base, uint32_t scale, __vec8_i32 offsets, + __vec8_f val, __vec8_i1 mask) +{ + _mm512_mask_i32extscatter_ps(base, 0xFF & mask, offsets, val, + _MM_DOWNCONV_PS_NONE, scale, + _MM_HINT_NONE); +} +#endif +SCATTER_BASE_OFFSETS(__vec8_f, float, __vec8_i64, __scatter_base_offsets64_float) +/*****************/ +SCATTER_BASE_OFFSETS(__vec8_i64, int64_t, __vec8_i32, __scatter_base_offsets32_i64) +SCATTER_BASE_OFFSETS(__vec8_i64, int64_t, __vec8_i64, __scatter_base_offsets64_i64) +/*****************/ +#if 0 /* evghenii::to implement */ +SCATTER_BASE_OFFSETS(__vec8_d, double, __vec8_i32, __scatter_base_offsets32_double) +#else /* evghenii:testme */ +static FORCEINLINE void __scatter_base_offsets32_double(void *base, uint32_t scale, __vec8_i32 offsets, + __vec8_d val, __vec8_i1 mask) +{ + _mm512_mask_i32loextscatter_pd(base, mask, offsets, val.v, + _MM_DOWNCONV_PD_NONE, scale, + _MM_HINT_NONE); +} +#endif +SCATTER_BASE_OFFSETS(__vec8_d, double, __vec8_i64, __scatter_base_offsets64_double) + +#define SCATTER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \ +static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec8_i1 mask) { \ + VTYPE ret; \ + for (int i = 0; i < 8; ++i) \ + if ((mask.v & (1 << i)) != 0) { \ + STYPE *ptr = (STYPE *)ptrs[i]; \ + *ptr = val[i]; \ + } \ +} +#define SCATTER_GENERALF(VTYPE, STYPE, PTRTYPE, FUNC,FUNC1) \ +static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec8_i1 mask) { \ + return FUNC1(0, 1, ptrs, val, mask); \ +} + +#if 1 +/***********/ +SCATTER_GENERALF(__vec8_i8, int8_t, __vec8_i32, __scatter32_i8, __scatter_base_offsets32_i8) +SCATTER_GENERALF(__vec8_i16, int16_t, __vec8_i32, __scatter32_i16, 
__scatter_base_offsets32_i16) +SCATTER_GENERALF(__vec8_i32, int32_t, __vec8_i32, __scatter32_i32, __scatter_base_offsets32_i32) +SCATTER_GENERALF(__vec8_i64, int64_t, __vec8_i32, __scatter32_i64, __scatter_base_offsets32_i64) +SCATTER_GENERALF(__vec8_f, float, __vec8_i32, __scatter32_float, __scatter_base_offsets32_float) +SCATTER_GENERALF(__vec8_d, double, __vec8_i32, __scatter32_double, __scatter_base_offsets32_double) +/***********/ +SCATTER_GENERAL(__vec8_i8, int8_t, __vec8_i64, __scatter64_i8) +SCATTER_GENERAL(__vec8_i16, int16_t, __vec8_i64, __scatter64_i16) +SCATTER_GENERAL(__vec8_i32, int32_t, __vec8_i64, __scatter64_i32) +SCATTER_GENERAL(__vec8_f, float, __vec8_i64, __scatter64_float) +SCATTER_GENERAL(__vec8_i64, int64_t, __vec8_i64, __scatter64_i64) +SCATTER_GENERAL(__vec8_d, double, __vec8_i64, __scatter64_double) +/***********/ +#endif + +/////////////////////////////////////////////////////////////////////////// +// packed load/store + +#if 0 +static FORCEINLINE int32_t __packed_load_active(int32_t *ptr, __vec8_i32 *val, + __vec8_i1 mask) { + int count = 0; + for (int i = 0; i < 8; ++i) { + if ((mask.v & (1 << i)) != 0) { + val->operator[](i) = *ptr++; + ++count; + } + } + return count; +} +static FORCEINLINE int32_t __packed_store_active(int32_t *ptr, + __vec8_i32 val, + __vec8_i1 mask) { + int count = 0; + for (int i = 0; i < 8; ++i) { + if ((mask.v & (1 << i)) != 0) { + *ptr++ = val[i]; + ++count; + } + } + return count; +} +static FORCEINLINE int32_t __packed_load_active(uint32_t *ptr, + __vec8_i32 *val, + __vec8_i1 mask) { + int count = 0; + for (int i = 0; i < 8; ++i) { + if ((mask.v & (1 << i)) != 0) { + val->operator[](i) = *ptr++; + ++count; + } + } + return count; +} +static FORCEINLINE int32_t __packed_store_active(uint32_t *ptr, + __vec8_i32 val, + __vec8_i1 mask) { + int count = 0; + for (int i = 0; i < 8; ++i) { + if ((mask.v & (1 << i)) != 0) { + *ptr++ = val[i]; + ++count; + } + } + return count; +} +#else +static FORCEINLINE int32_t __packed_load_active(uint32_t *p, __vec8_i32 *val, + __vec8_i1 mask) { + __vec8_i32 v = __load<64>(val); + v = _mm512_mask_extloadunpacklo_epi32(v, 0xFF & mask, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + v = _mm512_mask_extloadunpackhi_epi32(v, 0xFF & mask, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + __store<64>(val, v); + return _mm_countbits_32(uint32_t(0xFF & mask)); +} +static FORCEINLINE int32_t __packed_store_active(uint32_t *p, __vec8_i32 val, + __vec8_i1 mask) { + _mm512_mask_extpackstorelo_epi32(p, 0xFF & mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, 0xFF & mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + return _mm_countbits_32(uint32_t(0xFF & mask)); +} +static FORCEINLINE int32_t __packed_load_active(int32_t *p, __vec8_i32 *val, + __vec8_i1 mask) { + __vec8_i32 v = __load<64>(val); + v = _mm512_mask_extloadunpacklo_epi32(v, 0xFF & mask, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + v = _mm512_mask_extloadunpackhi_epi32(v, 0xFF & mask, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + __store<64>(val, v); + return _mm_countbits_32(uint32_t(0xFF & mask)); +} +static FORCEINLINE int32_t __packed_store_active(int32_t *p, __vec8_i32 val, + __vec8_i1 mask) { + _mm512_mask_extpackstorelo_epi32(p, 0xFF & mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, 0xFF & mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + return _mm_countbits_32(uint32_t(0xFF & mask)); +} +#endif + 
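+// Note on the packed load/store pair above: these implement stream compaction and
+// expansion over the 8 program instances. __packed_store_active() writes the active
+// lanes of 'val' contiguously starting at 'p' and returns the number of lanes written
+// (the popcount of the 8-bit mask); __packed_load_active() is the inverse, reading that
+// many consecutive values back into the active lanes. For example, with mask = 0b00001101
+// and val = {10,11,12,13,14,15,16,17}, the store writes 10,12,13 to p[0],p[1],p[2] and
+// returns 3.
+//
+// Illustrative usage sketch (kept under #if 0 like the other reference code in this
+// file; it relies only on the __vec8_* types and __packed_store_active() defined here):
+#if 0
+static int32_t compact_active(int32_t *out, __vec8_i32 values, __vec8_i1 active) {
+    // Active lanes of 'values' land back-to-back starting at out[0];
+    // the return value tells the caller how many were written.
+    return __packed_store_active(out, values, active);
+}
+#endif
+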
+/////////////////////////////////////////////////////////////////////////// +// aos/soa + +static FORCEINLINE void __soa_to_aos3_float(__vec8_f v0, __vec8_f v1, __vec8_f v2, + float *ptr) { + for (int i = 0; i < 8; ++i) { + *ptr++ = __extract_element(v0, i); + *ptr++ = __extract_element(v1, i); + *ptr++ = __extract_element(v2, i); + } +} + +static FORCEINLINE void __aos_to_soa3_float(float *ptr, __vec8_f *out0, __vec8_f *out1, + __vec8_f *out2) { + for (int i = 0; i < 8; ++i) { + __insert_element(out0, i, *ptr++); + __insert_element(out1, i, *ptr++); + __insert_element(out2, i, *ptr++); + } +} + +static FORCEINLINE void __soa_to_aos4_float(__vec8_f v0, __vec8_f v1, __vec8_f v2, + __vec8_f v3, float *ptr) { + for (int i = 0; i < 8; ++i) { + *ptr++ = __extract_element(v0, i); + *ptr++ = __extract_element(v1, i); + *ptr++ = __extract_element(v2, i); + *ptr++ = __extract_element(v3, i); + } +} + +static FORCEINLINE void __aos_to_soa4_float(float *ptr, __vec8_f *out0, __vec8_f *out1, + __vec8_f *out2, __vec8_f *out3) { + for (int i = 0; i < 8; ++i) { + __insert_element(out0, i, *ptr++); + __insert_element(out1, i, *ptr++); + __insert_element(out2, i, *ptr++); + __insert_element(out3, i, *ptr++); + } +} + +/////////////////////////////////////////////////////////////////////////// +// prefetch + +static FORCEINLINE void __prefetch_read_uniform_1(unsigned char *p) { + _mm_prefetch((char *)p, _MM_HINT_T0); // prefetch into L1$ +} + +static FORCEINLINE void __prefetch_read_uniform_2(unsigned char *p) { + _mm_prefetch((char *)p, _MM_HINT_T1); // prefetch into L2$ +} + +static FORCEINLINE void __prefetch_read_uniform_3(unsigned char *p) { + // There is no L3$ on KNC, don't want to pollute L2$ unecessarily +} + +static FORCEINLINE void __prefetch_read_uniform_nt(unsigned char *p) { + _mm_prefetch((char *)p, _MM_HINT_T2); // prefetch into L2$ with non-temporal hint + // _mm_prefetch(p, _MM_HINT_NTA); // prefetch into L1$ with non-temporal hint +} + +/////////////////////////////////////////////////////////////////////////// +// atomics + +static FORCEINLINE uint32_t __atomic_add(uint32_t *p, uint32_t v) { +#ifdef _MSC_VER + return InterlockedAdd((LONG volatile *)p, v) - v; +#else + return __sync_fetch_and_add(p, v); +#endif +} + +static FORCEINLINE uint32_t __atomic_sub(uint32_t *p, uint32_t v) { +#ifdef _MSC_VER + return InterlockedAdd((LONG volatile *)p, -v) + v; +#else + return __sync_fetch_and_sub(p, v); +#endif +} + +static FORCEINLINE uint32_t __atomic_and(uint32_t *p, uint32_t v) { +#ifdef _MSC_VER + return InterlockedAnd((LONG volatile *)p, v); +#else + return __sync_fetch_and_and(p, v); +#endif +} + +static FORCEINLINE uint32_t __atomic_or(uint32_t *p, uint32_t v) { +#ifdef _MSC_VER + return InterlockedOr((LONG volatile *)p, v); +#else + return __sync_fetch_and_or(p, v); +#endif +} + +static FORCEINLINE uint32_t __atomic_xor(uint32_t *p, uint32_t v) { +#ifdef _MSC_VER + return InterlockedXor((LONG volatile *)p, v); +#else + return __sync_fetch_and_xor(p, v); +#endif +} + +static FORCEINLINE uint32_t __atomic_min(uint32_t *p, uint32_t v) { + int32_t old, min; + do { + old = *((volatile int32_t *)p); + min = (old < (int32_t)v) ? old : (int32_t)v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange((LONG volatile *)p, min, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, min) == false); +#endif + return old; +} + +static FORCEINLINE uint32_t __atomic_max(uint32_t *p, uint32_t v) { + int32_t old, max; + do { + old = *((volatile int32_t *)p); + max = (old > (int32_t)v) ? 
old : (int32_t)v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange((LONG volatile *)p, max, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, max) == false); +#endif + return old; +} + +static FORCEINLINE uint32_t __atomic_umin(uint32_t *p, uint32_t v) { + uint32_t old, min; + do { + old = *((volatile uint32_t *)p); + min = (old < v) ? old : v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange((LONG volatile *)p, min, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, min) == false); +#endif + return old; +} + +static FORCEINLINE uint32_t __atomic_umax(uint32_t *p, uint32_t v) { + uint32_t old, max; + do { + old = *((volatile uint32_t *)p); + max = (old > v) ? old : v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange((LONG volatile *)p, max, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, max) == false); +#endif + return old; +} + +static FORCEINLINE uint32_t __atomic_xchg(uint32_t *p, uint32_t v) { +#ifdef _MSC_VER + return InterlockedExchange((LONG volatile *)p, v); +#else + return __sync_lock_test_and_set(p, v); +#endif +} + +static FORCEINLINE uint32_t __atomic_cmpxchg(uint32_t *p, uint32_t cmpval, + uint32_t newval) { +#ifdef _MSC_VER + return InterlockedCompareExchange((LONG volatile *)p, newval, cmpval); +#else + return __sync_val_compare_and_swap(p, cmpval, newval); +#endif +} + +static FORCEINLINE uint64_t __atomic_add(uint64_t *p, uint64_t v) { +#ifdef _MSC_VER + return InterlockedAdd64((LONGLONG volatile *)p, v) - v; +#else + return __sync_fetch_and_add(p, v); +#endif +} + +static FORCEINLINE uint64_t __atomic_sub(uint64_t *p, uint64_t v) { +#ifdef _MSC_VER + return InterlockedAdd64((LONGLONG volatile *)p, -v) + v; +#else + return __sync_fetch_and_sub(p, v); +#endif +} + +static FORCEINLINE uint64_t __atomic_and(uint64_t *p, uint64_t v) { +#ifdef _MSC_VER + return InterlockedAnd64((LONGLONG volatile *)p, v) - v; +#else + return __sync_fetch_and_and(p, v); +#endif +} + +static FORCEINLINE uint64_t __atomic_or(uint64_t *p, uint64_t v) { +#ifdef _MSC_VER + return InterlockedOr64((LONGLONG volatile *)p, v) - v; +#else + return __sync_fetch_and_or(p, v); +#endif +} + +static FORCEINLINE uint64_t __atomic_xor(uint64_t *p, uint64_t v) { +#ifdef _MSC_VER + return InterlockedXor64((LONGLONG volatile *)p, v) - v; +#else + return __sync_fetch_and_xor(p, v); +#endif +} + +static FORCEINLINE uint64_t __atomic_min(uint64_t *p, uint64_t v) { + int64_t old, min; + do { + old = *((volatile int64_t *)p); + min = (old < (int64_t)v) ? old : (int64_t)v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange64((LONGLONG volatile *)p, min, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, min) == false); +#endif + return old; +} + +static FORCEINLINE uint64_t __atomic_max(uint64_t *p, uint64_t v) { + int64_t old, max; + do { + old = *((volatile int64_t *)p); + max = (old > (int64_t)v) ? old : (int64_t)v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange64((LONGLONG volatile *)p, max, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, max) == false); +#endif + return old; +} + +static FORCEINLINE uint64_t __atomic_umin(uint64_t *p, uint64_t v) { + uint64_t old, min; + do { + old = *((volatile uint64_t *)p); + min = (old < v) ? 
old : v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange64((LONGLONG volatile *)p, min, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, min) == false); +#endif + return old; +} + +static FORCEINLINE uint64_t __atomic_umax(uint64_t *p, uint64_t v) { + uint64_t old, max; + do { + old = *((volatile uint64_t *)p); + max = (old > v) ? old : v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange64((LONGLONG volatile *)p, max, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, max) == false); +#endif + return old; +} + +static FORCEINLINE uint64_t __atomic_xchg(uint64_t *p, uint64_t v) { +#ifdef _MSC_VER + return InterlockedExchange64((LONGLONG volatile *)p, v); +#else + return __sync_lock_test_and_set(p, v); +#endif +} + +static FORCEINLINE uint64_t __atomic_cmpxchg(uint64_t *p, uint64_t cmpval, + uint64_t newval) { +#ifdef _MSC_VER + return InterlockedCompareExchange64((LONGLONG volatile *)p, newval, cmpval); +#else + return __sync_val_compare_and_swap(p, cmpval, newval); +#endif +} + +#ifdef WIN32 +#include +#define __clock __rdtsc +#else // WIN32 +static FORCEINLINE uint64_t __clock() { + uint32_t low, high; +#ifdef __x86_64 + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%rax", "%rbx", "%rcx", "%rdx" ); +#else + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%eax", "%ebx", "%ecx", "%edx" ); +#endif + __asm__ __volatile__ ("rdtsc" : "=a" (low), "=d" (high)); + return (uint64_t)high << 32 | low; +} + +#endif // !WIN32 + +#undef FORCEINLINE +#undef PRE_ALIGN +#undef POST_ALIGN diff --git a/examples/intrinsics/knc-i1x8unsafe_fast.h b/examples/intrinsics/knc-i1x8unsafe_fast.h new file mode 100644 index 00000000..05be27bd --- /dev/null +++ b/examples/intrinsics/knc-i1x8unsafe_fast.h @@ -0,0 +1,86 @@ +#define __ZMM64BIT__ +#include "knc-i1x8.h" + +/* the following tests fails because on KNC native vec8_i32 and vec8_float are 512 and not 256 bit in size. 
+ * + * Using test compiler: Intel(r) SPMD Program Compiler (ispc), 1.4.5dev (build commit d68dbbc7bce74803 @ 20130919, LLVM 3.3) + * Using C/C++ compiler: icpc (ICC) 14.0.0 20130728 + * + */ + +/* knc-i1x8unsafe_fast.h fails: + * ---------------------------- +1 / 1206 tests FAILED compilation: + ./tests/ptr-assign-lhs-math-1.ispc +33 / 1206 tests FAILED execution: + ./tests/array-gather-simple.ispc + ./tests/array-gather-vary.ispc + ./tests/array-multidim-gather-scatter.ispc + ./tests/array-scatter-vary.ispc + ./tests/atomics-5.ispc + ./tests/atomics-swap.ispc + ./tests/cfor-array-gather-vary.ispc + ./tests/cfor-gs-improve-varying-1.ispc + ./tests/cfor-struct-gather-2.ispc + ./tests/cfor-struct-gather-3.ispc + ./tests/cfor-struct-gather.ispc + ./tests/gather-struct-vector.ispc + ./tests/global-array-4.ispc + ./tests/gs-improve-varying-1.ispc + ./tests/half-1.ispc + ./tests/half-3.ispc + ./tests/half.ispc + ./tests/launch-3.ispc + ./tests/launch-4.ispc + ./tests/masked-scatter-vector.ispc + ./tests/masked-struct-scatter-varying.ispc + ./tests/new-delete-6.ispc + ./tests/ptr-24.ispc + ./tests/ptr-25.ispc + ./tests/short-vec-15.ispc + ./tests/struct-gather-2.ispc + ./tests/struct-gather-3.ispc + ./tests/struct-gather.ispc + ./tests/struct-ref-lvalue.ispc + ./tests/struct-test-118.ispc + ./tests/struct-vary-index-expr.ispc + ./tests/typedef-2.ispc + ./tests/vector-varying-scatter.ispc +*/ + +/* knc-i1x8.h fails: + * ---------------------------- +1 / 1206 tests FAILED compilation: + ./tests/ptr-assign-lhs-math-1.ispc +3 / 1206 tests FAILED execution: + ./tests/half-1.ispc + ./tests/half-3.ispc + ./tests/half.ispc +*/ + +/* knc-i1x8.h fails: + * ---------------------------- +1 / 1206 tests FAILED compilation: + ./tests/ptr-assign-lhs-math-1.ispc +4 / 1206 tests FAILED execution: + ./tests/half-1.ispc + ./tests/half-3.ispc + ./tests/half.ispc + ./tests/test-141.ispc +*/ + +/* generic-16.h fails: (from these knc-i1x8.h & knc-i1x16.h are derived + * ---------------------------- +1 / 1206 tests FAILED compilation: + ./tests/ptr-assign-lhs-math-1.ispc +6 / 1206 tests FAILED execution: + ./tests/func-overload-max.ispc + ./tests/half-1.ispc + ./tests/half-3.ispc + ./tests/half.ispc + ./tests/test-141.ispc + ./tests/test-143.ispc +*/ + + + diff --git a/examples/intrinsics/sse4.h b/examples/intrinsics/sse4.h index ff00d920..919716be 100644 --- a/examples/intrinsics/sse4.h +++ b/examples/intrinsics/sse4.h @@ -108,22 +108,21 @@ struct __vec4_i64 { }; struct __vec4_i32 { - __vec4_i32() { } + FORCEINLINE __vec4_i32() { } FORCEINLINE __vec4_i32(__m128i vv) : v(vv) { } - FORCEINLINE __vec4_i32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) { + FORCEINLINE __vec4_i32(int32_t a, int32_t b, int32_t c, int32_t d) { v = _mm_set_epi32(d, c, b, a); } - FORCEINLINE __vec4_i32(uint32_t *p) { + FORCEINLINE __vec4_i32(int32_t *p) { v = _mm_loadu_si128((__m128i *)p); } - + FORCEINLINE __vec4_i32(const __vec4_i32 &other) : v(other.v) {} + FORCEINLINE __vec4_i32& operator =(const __vec4_i32 &o) { v=o.v; return *this; } FORCEINLINE operator __m128() const { return _mm_castsi128_ps(v); } - + __m128i v; }; -static inline int32_t __extract_element(__vec4_i32 v, int index); - struct __vec4_i16 { __vec4_i16() { } FORCEINLINE __vec4_i16(__m128i vv) : v(vv) { } @@ -215,6 +214,64 @@ INSERT_EXTRACT(__vec1_i64, int64_t) INSERT_EXTRACT(__vec1_f, float) INSERT_EXTRACT(__vec1_d, double) +static FORCEINLINE bool __extract_element(const __vec4_i1 &v, int index) { + return ((int32_t *)&v)[index] ? 
true : false; +} + +static FORCEINLINE void __insert_element(__vec4_i1 *v, int index, bool val) { + ((int32_t *)v)[index] = val ? -1 : 0; +} + +static FORCEINLINE int8_t __extract_element(const __vec4_i8 &v, int index) { + return ((int8_t *)&v)[index]; +} + +static FORCEINLINE void __insert_element(__vec4_i8 *v, int index, int8_t val) { + ((int8_t *)v)[index] = val; +} + +static FORCEINLINE int16_t __extract_element(const __vec4_i16 &v, int index) { + return ((int16_t *)&v)[index]; +} + +static FORCEINLINE void __insert_element(__vec4_i16 *v, int index, int16_t val) { + ((int16_t *)v)[index] = val; +} + +static FORCEINLINE int32_t __extract_element(const __vec4_i32 &v, int index) { + return ((int32_t *)&v)[index]; +} + +static FORCEINLINE void __insert_element(__vec4_i32 *v, int index, int32_t val) { + ((int32_t *)v)[index] = val; +} + +static FORCEINLINE int64_t __extract_element(const __vec4_i64 &v, int index) { + return ((int64_t *)&v)[index]; +} + +static FORCEINLINE void __insert_element(__vec4_i64 *v, int index, int64_t val) { + ((int64_t *)v)[index] = val; +} + +static FORCEINLINE float __extract_element(const __vec4_f &v, int index) { + return ((float *)&v)[index]; +} + +static FORCEINLINE void __insert_element(__vec4_f *v, int index, float val) { + ((float *)v)[index] = val; +} + +static FORCEINLINE double __extract_element(const __vec4_d &v, int index) { + return ((double *)&v)[index]; +} + +static FORCEINLINE void __insert_element(__vec4_d *v, int index, double val) { + ((double *)v)[index] = val; +} + + + #define CAST_BITS_SCALAR(TO, FROM) \ static FORCEINLINE TO __cast_bits(TO, FROM v) { \ union { \ @@ -313,13 +370,6 @@ static FORCEINLINE __vec4_i1 __select(__vec4_i1 mask, __vec4_i1 a, __vec4_i1 b) return _mm_blendv_ps(b.v, a.v, mask.v); } -static FORCEINLINE bool __extract_element(__vec4_i1 v, int index) { - return ((int32_t *)&v)[index] ? true : false; -} - -static FORCEINLINE void __insert_element(__vec4_i1 *v, int index, bool val) { - ((int32_t *)v)[index] = val ? -1 : 0; -} template static FORCEINLINE __vec4_i1 __load(const __vec4_i1 *v) { // FIXME: handle align of 16... @@ -564,13 +614,6 @@ static FORCEINLINE __vec4_i8 __select(__vec4_i1 mask, __vec4_i8 a, __vec4_i8 b) _mm_extract_epi8(b.v, 3)); } -static FORCEINLINE int8_t __extract_element(__vec4_i8 v, int index) { - return ((int8_t *)&v)[index]; -} - -static FORCEINLINE void __insert_element(__vec4_i8 *v, int index, int8_t val) { - ((int8_t *)v)[index] = val; -} template __vec4_i8 __smear_i8(int8_t v); template <> FORCEINLINE __vec4_i8 __smear_i8<__vec4_i8>(int8_t v) { @@ -598,6 +641,20 @@ static FORCEINLINE __vec4_i8 __rotate_i8(__vec4_i8 v, int delta) { __extract_element(v, (delta+3) & 0x3)); } +static FORCEINLINE __vec4_i8 __shift_i8(__vec4_i8 v, int delta) { + int8_t v1, v2, v3, v4; + int d1, d2, d3, d4; + d1 = delta+0; + d2 = delta+1; + d3 = delta+2; + d4 = delta+3; + v1 = ((d1 >= 0) && (d1 < 4)) ? __extract_element(v, d1) : 0; + v2 = ((d2 >= 0) && (d2 < 4)) ? __extract_element(v, d2) : 0; + v3 = ((d3 >= 0) && (d3 < 4)) ? __extract_element(v, d3) : 0; + v4 = ((d4 >= 0) && (d4 < 4)) ? 
__extract_element(v, d4) : 0; + return __vec4_i8(v1, v2, v3, v4); +} + static FORCEINLINE __vec4_i8 __shuffle_i8(__vec4_i8 v, __vec4_i32 index) { return __vec4_i8(__extract_element(v, __extract_element(index, 0) & 0x3), __extract_element(v, __extract_element(index, 1) & 0x3), @@ -836,13 +893,6 @@ static FORCEINLINE __vec4_i16 __select(__vec4_i1 mask, __vec4_i16 a, __vec4_i16 _mm_extract_epi16(b.v, 3)); } -static FORCEINLINE int16_t __extract_element(__vec4_i16 v, int index) { - return ((int16_t *)&v)[index]; -} - -static FORCEINLINE void __insert_element(__vec4_i16 *v, int index, int16_t val) { - ((int16_t *)v)[index] = val; -} template __vec4_i16 __smear_i16(int16_t v); template <> FORCEINLINE __vec4_i16 __smear_i16<__vec4_i16>(int16_t v) { @@ -870,6 +920,20 @@ static FORCEINLINE __vec4_i16 __rotate_i16(__vec4_i16 v, int delta) { __extract_element(v, (delta+3) & 0x3)); } +static FORCEINLINE __vec4_i16 __shift_i16(__vec4_i16 v, int delta) { + int16_t v1, v2, v3, v4; + int d1, d2, d3, d4; + d1 = delta+0; + d2 = delta+1; + d3 = delta+2; + d4 = delta+3; + v1 = ((d1 >= 0) && (d1 < 4)) ? __extract_element(v, d1) : 0; + v2 = ((d2 >= 0) && (d2 < 4)) ? __extract_element(v, d2) : 0; + v3 = ((d3 >= 0) && (d3 < 4)) ? __extract_element(v, d3) : 0; + v4 = ((d4 >= 0) && (d4 < 4)) ? __extract_element(v, d4) : 0; + return __vec4_i16(v1, v2, v3, v4); +} + static FORCEINLINE __vec4_i16 __shuffle_i16(__vec4_i16 v, __vec4_i32 index) { return __vec4_i16(__extract_element(v, __extract_element(index, 0) & 0x3), __extract_element(v, __extract_element(index, 1) & 0x3), @@ -1109,13 +1173,6 @@ template <> FORCEINLINE __vec4_i32 __undef_i32<__vec4_i32>() { return __vec4_i32(); } -static FORCEINLINE int32_t __extract_element(__vec4_i32 v, int index) { - return ((int32_t *)&v)[index]; -} - -static FORCEINLINE void __insert_element(__vec4_i32 *v, int index, int32_t val) { - ((int32_t *)v)[index] = val; -} static FORCEINLINE __vec4_i32 __broadcast_i32(__vec4_i32 v, int index) { return _mm_set1_epi32(__extract_element(v, index)); @@ -1128,6 +1185,21 @@ static FORCEINLINE __vec4_i32 __rotate_i32(__vec4_i32 v, int delta) { __extract_element(v, (delta+3) & 0x3)); } +#include +static FORCEINLINE __vec4_i32 __shift_i32(const __vec4_i32 &v, int delta) { + int32_t v1, v2, v3, v4; + int32_t d1, d2, d3, d4; + d1 = delta+0; + d2 = delta+1; + d3 = delta+2; + d4 = delta+3; + v1 = ((d1 >= 0) && (d1 < 4)) ? __extract_element(v, d1) : 0; + v2 = ((d2 >= 0) && (d2 < 4)) ? __extract_element(v, d2) : 0; + v3 = ((d3 >= 0) && (d3 < 4)) ? __extract_element(v, d3) : 0; + v4 = ((d4 >= 0) && (d4 < 4)) ? 
__extract_element(v, d4) : 0; + return __vec4_i32(v1, v2, v3, v4); +} + static FORCEINLINE __vec4_i32 __shuffle_i32(__vec4_i32 v, __vec4_i32 index) { return __vec4_i32(__extract_element(v, __extract_element(index, 0) & 0x3), __extract_element(v, __extract_element(index, 1) & 0x3), @@ -1383,13 +1455,6 @@ template <> FORCEINLINE __vec4_i64 __undef_i64<__vec4_i64>() { return __vec4_i64(); } -static FORCEINLINE int64_t __extract_element(__vec4_i64 v, int index) { - return ((int64_t *)&v)[index]; -} - -static FORCEINLINE void __insert_element(__vec4_i64 *v, int index, int64_t val) { - ((int64_t *)v)[index] = val; -} static FORCEINLINE __vec4_i64 __broadcast_i64(__vec4_i64 v, int index) { uint64_t val = __extract_element(v, index); @@ -1403,6 +1468,20 @@ static FORCEINLINE __vec4_i64 __rotate_i64(__vec4_i64 v, int delta) { __extract_element(v, (delta+3) & 0x3)); } +static FORCEINLINE __vec4_i64 __shift_i64(__vec4_i64 v, int delta) { + int64_t v1, v2, v3, v4; + int d1, d2, d3, d4; + d1 = delta+0; + d2 = delta+1; + d3 = delta+2; + d4 = delta+3; + v1 = ((d1 >= 0) && (d1 < 4)) ? __extract_element(v, d1) : 0; + v2 = ((d2 >= 0) && (d2 < 4)) ? __extract_element(v, d2) : 0; + v3 = ((d3 >= 0) && (d3 < 4)) ? __extract_element(v, d3) : 0; + v4 = ((d4 >= 0) && (d4 < 4)) ? __extract_element(v, d4) : 0; + return __vec4_i64(v1, v2, v3, v4); +} + static FORCEINLINE __vec4_i64 __shuffle_i64(__vec4_i64 v, __vec4_i32 index) { return __vec4_i64(__extract_element(v, __extract_element(index, 0) & 0x3), __extract_element(v, __extract_element(index, 1) & 0x3), @@ -1504,13 +1583,6 @@ template <> FORCEINLINE __vec4_f __undef_float<__vec4_f>() { return __vec4_f(); } -static FORCEINLINE float __extract_element(__vec4_f v, int index) { - return ((float *)&v)[index]; -} - -static FORCEINLINE void __insert_element(__vec4_f *v, int index, float val) { - ((float *)v)[index] = val; -} static FORCEINLINE __vec4_f __broadcast_float(__vec4_f v, int index) { return _mm_set1_ps(__extract_element(v, index)); @@ -1523,6 +1595,20 @@ static FORCEINLINE __vec4_f __rotate_float(__vec4_f v, int delta) { __extract_element(v, (delta+3) & 0x3)); } +static FORCEINLINE __vec4_f __shift_float(__vec4_f v, int delta) { + float v1, v2, v3, v4; + int d1, d2, d3, d4; + d1 = delta+0; + d2 = delta+1; + d3 = delta+2; + d4 = delta+3; + v1 = ((d1 >= 0) && (d1 < 4)) ? __extract_element(v, d1) : 0.f; + v2 = ((d2 >= 0) && (d2 < 4)) ? __extract_element(v, d2) : 0.f; + v3 = ((d3 >= 0) && (d3 < 4)) ? __extract_element(v, d3) : 0.f; + v4 = ((d4 >= 0) && (d4 < 4)) ? 
__extract_element(v, d4) : 0.f; + return __vec4_f(v1, v2, v3, v4); +} + static FORCEINLINE __vec4_f __shuffle_float(__vec4_f v, __vec4_i32 index) { return __vec4_f(__extract_element(v, __extract_element(index, 0) & 0x3), __extract_element(v, __extract_element(index, 1) & 0x3), @@ -1656,13 +1742,6 @@ template <> FORCEINLINE __vec4_d __undef_double<__vec4_d>() { return __vec4_d(); } -static FORCEINLINE double __extract_element(__vec4_d v, int index) { - return ((double *)&v)[index]; -} - -static FORCEINLINE void __insert_element(__vec4_d *v, int index, double val) { - ((double *)v)[index] = val; -} static FORCEINLINE __vec4_d __broadcast_double(__vec4_d v, int index) { return __vec4_d(_mm_set1_pd(__extract_element(v, index)), @@ -1676,6 +1755,20 @@ static FORCEINLINE __vec4_d __rotate_double(__vec4_d v, int delta) { __extract_element(v, (delta+3) & 0x3)); } +static FORCEINLINE __vec4_d __shift_double(__vec4_d v, int delta) { + double v1, v2, v3, v4; + int d1, d2, d3, d4; + d1 = delta+0; + d2 = delta+1; + d3 = delta+2; + d4 = delta+3; + v1 = ((d1 >= 0) && (d1 < 4)) ? __extract_element(v, d1) : 0; + v2 = ((d2 >= 0) && (d2 < 4)) ? __extract_element(v, d2) : 0; + v3 = ((d3 >= 0) && (d3 < 4)) ? __extract_element(v, d3) : 0; + v4 = ((d4 >= 0) && (d4 < 4)) ? __extract_element(v, d4) : 0; + return __vec4_d(v1, v2, v3, v4); +} + static FORCEINLINE __vec4_d __shuffle_double(__vec4_d v, __vec4_i32 index) { return __vec4_d(__extract_element(v, __extract_element(index, 0) & 0x3), __extract_element(v, __extract_element(index, 1) & 0x3), @@ -1889,7 +1982,7 @@ static FORCEINLINE __vec4_f __cast_sitofp(__vec4_f, __vec4_i16 val) { (float)((int16_t)_mm_extract_epi16(val.v, 3))); } -static FORCEINLINE __vec4_f __cast_sitofp(__vec4_f, __vec4_i32 val) { +static FORCEINLINE __vec4_f __cast_sitofp(__vec4_f, const __vec4_i32 &val) { return _mm_cvtepi32_ps(val.v); } diff --git a/examples/mandelbrot/Makefile b/examples/mandelbrot/Makefile index 7e83e618..d225037d 100644 --- a/examples/mandelbrot/Makefile +++ b/examples/mandelbrot/Makefile @@ -2,7 +2,7 @@ EXAMPLE=mandelbrot CPP_SRC=mandelbrot.cpp mandelbrot_serial.cpp ISPC_SRC=mandelbrot.ispc -ISPC_IA_TARGETS=sse2,sse4-x2,avx-x2 +ISPC_IA_TARGETS=sse2-i32x4,sse4-i32x8,avx1-i32x16,avx2-i32x16 ISPC_ARM_TARGETS=neon include ../common.mk diff --git a/examples/mandelbrot/mandelbrot.vcxproj b/examples/mandelbrot/mandelbrot.vcxproj index 7b78d1dd..e7703ad0 100644 --- a/examples/mandelbrot/mandelbrot.vcxproj +++ b/examples/mandelbrot/mandelbrot.vcxproj @@ -1,175 +1,15 @@  - - - Debug - Win32 - - - Debug - x64 - - - Release - Win32 - - - Release - x64 - - {6D3EF8C5-AE26-407B-9ECE-C27CB988D9C1} Win32Proj mandelbrot + mandelbrot + sse2,sse4-x2,avx1-x2 - - - Application - true - Unicode - - - Application - true - Unicode - - - Application - false - true - Unicode - - - Application - false - true - Unicode - - - - - - - - - - - - - - - - - - - true - $(ProjectDir)..\..;$(ExecutablePath) - - - true - $(ProjectDir)..\..;$(ExecutablePath) - - - false - $(ProjectDir)..\..;$(ExecutablePath) - - - false - $(ProjectDir)..\..;$(ExecutablePath) - - - - - - Level3 - Disabled - WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - true - Fast - - - Console - true - - - - - - - Level3 - Disabled - WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - true - Fast - - - Console - true - - - - - Level3 - - - MaxSpeed - true - true - WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - Fast - - - Console - true - true - true - - - - - Level3 - - - MaxSpeed - 
true - true - WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - Fast - - - Console - true - true - true - - + - - - Document - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2 - - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2 - - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2 - - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2 - - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - - - - - diff --git a/examples/mandelbrot_tasks/Makefile b/examples/mandelbrot_tasks/Makefile index 1a565ffd..51866b32 100644 --- a/examples/mandelbrot_tasks/Makefile +++ b/examples/mandelbrot_tasks/Makefile @@ -2,7 +2,7 @@ EXAMPLE=mandelbrot_tasks CPP_SRC=mandelbrot_tasks.cpp mandelbrot_tasks_serial.cpp ISPC_SRC=mandelbrot_tasks.ispc -ISPC_IA_TARGETS=sse2,sse4-x2,avx-x2 +ISPC_IA_TARGETS=sse2-i32x4,sse4-i32x8,avx1-i32x16,avx2-i32x16 ISPC_ARM_TARGETS=neon include ../common.mk diff --git a/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj b/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj index 3a8fca79..f8b8cfcb 100644 --- a/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj +++ b/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj @@ -1,180 +1,16 @@ - - - Debug - Win32 - - - Debug - x64 - - - Release - Win32 - - - Release - x64 - - {E80DA7D4-AB22-4648-A068-327307156BE6} Win32Proj mandelbrot_tasks + mandelbrot_tasks + sse2,sse4-x2,avx1-x2 - - - Application - true - Unicode - - - Application - true - Unicode - - - Application - false - true - Unicode - - - Application - false - true - Unicode - - - - - - - - - - - - - - - - - - - true - $(ProjectDir)..\..;$(ExecutablePath) - mandelbrot_tasks - - - true - $(ProjectDir)..\..;$(ExecutablePath) - mandelbrot_tasks - - - false - $(ProjectDir)..\..;$(ExecutablePath) - mandelbrot_tasks - - - false - $(ProjectDir)..\..;$(ExecutablePath) - mandelbrot_tasks - - - - - - Level3 - Disabled - WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - true - Fast - - - Console - true - - - - - - - Level3 - Disabled - WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - true - Fast - - - Console - true - - - - - Level3 - - - MaxSpeed - true - true - WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - Fast - - - Console - true - true - true - - - - - Level3 - - - MaxSpeed - true - true - WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - Fast - - - Console - true - true - true - - + - - - Document - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2 - - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h 
--target=sse2,sse4-x2,avx-x2 - - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2 - - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2 - - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - - - - - diff --git a/examples/noise/Makefile b/examples/noise/Makefile index 8cc72689..6dd5ae42 100644 --- a/examples/noise/Makefile +++ b/examples/noise/Makefile @@ -1,8 +1,8 @@ EXAMPLE=noise -CPP_SRC=$(EXAMPLE).cpp $(EXAMPLE)_serial.cpp +CPP_SRC=noise.cpp noise_serial.cpp ISPC_SRC=noise.ispc -ISPC_IA_TARGETS=sse2,sse4,avx-x2 +ISPC_IA_TARGETS=sse2-i32x4,sse4-i32x4,avx1-i32x16,avx2-i32x16 ISPC_ARM_TARGETS=neon include ../common.mk diff --git a/examples/noise/noise.vcxproj b/examples/noise/noise.vcxproj index 4e983759..7adc57f3 100644 --- a/examples/noise/noise.vcxproj +++ b/examples/noise/noise.vcxproj @@ -1,175 +1,15 @@ - - - - Debug - Win32 - - - Debug - x64 - - - Release - Win32 - - - Release - x64 - - - - {0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD} - Win32Proj - noise - - - - Application - true - Unicode - - - Application - true - Unicode - - - Application - false - true - Unicode - - - Application - false - true - Unicode - - - - - - - - - - - - - - - - - - - true - $(ProjectDir)..\..;$(ExecutablePath) - - - true - $(ProjectDir)..\..;$(ExecutablePath) - - - false - $(ProjectDir)..\..;$(ExecutablePath) - - - false - $(ProjectDir)..\..;$(ExecutablePath) - - - - - - Level3 - Disabled - WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - true - Fast - - - Console - true - - - - - - - Level3 - Disabled - WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - true - Fast - - - Console - true - - - - - Level3 - - - MaxSpeed - true - true - WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - Fast - - - Console - true - true - true - - - - - Level3 - - - MaxSpeed - true - true - WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - Fast - - - Console - true - true - true - - - - - - - - - Document - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4,avx-x2 - - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4,avx-x2 - - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4,avx-x2 - - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4,avx-x2 - - 
$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - - - - - + + + {0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD} + Win32Proj + noise + noise + sse2,sse4,avx1-x2 + + + + + + diff --git a/examples/options/Makefile b/examples/options/Makefile index 11d3d790..2da7e01a 100644 --- a/examples/options/Makefile +++ b/examples/options/Makefile @@ -2,7 +2,7 @@ EXAMPLE=options CPP_SRC=options.cpp options_serial.cpp ISPC_SRC=options.ispc -ISPC_IA_TARGETS=sse2,sse4-x2,avx-x2 +ISPC_IA_TARGETS=sse2-i32x4,sse4-i32x8,avx1-i32x16,avx2-i32x16 ISPC_ARM_TARGETS=neon include ../common.mk diff --git a/examples/options/options.vcxproj b/examples/options/options.vcxproj index b029b598..af336aa1 100644 --- a/examples/options/options.vcxproj +++ b/examples/options/options.vcxproj @@ -1,183 +1,17 @@ - - - Debug - Win32 - - - Debug - x64 - - - Release - Win32 - - - Release - x64 - - {8C7B5D29-1E76-44E6-BBB8-09830E5DEEAE} Win32Proj options + options + sse2,sse4-x2,avx1-x2 - - - Application - true - Unicode - - - Application - true - Unicode - - - Application - false - true - Unicode - - - Application - false - true - Unicode - - - - - - - - - - - - - - - - - - - true - $(ProjectDir)..\..;$(ExecutablePath) - - - true - $(ProjectDir)..\..;$(ExecutablePath) - - - false - $(ProjectDir)..\..;$(ExecutablePath) - - - false - $(ProjectDir)..\..;$(ExecutablePath) - - - - - - Level3 - Disabled - WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - 4305 - true - Fast - - - Console - true - - - - - - - Level3 - Disabled - WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - 4305 - true - Fast - - - Console - true - - - - - Level3 - - - MaxSpeed - true - true - WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - 4305 - Fast - - - Console - true - true - true - - - - - Level3 - - - MaxSpeed - true - true - WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - 4305 - Fast - - - Console - true - true - true - - + - - - - Document - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2 - - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2 - - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2 - - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2 - - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - - - - - - diff --git a/examples/perf.py b/examples/perf.py deleted file mode 100755 index 4b661b39..00000000 --- a/examples/perf.py +++ /dev/null @@ -1,374 +0,0 @@ 
-#!/usr/bin/python -# // Author: Filippov Ilia - -from optparse import OptionParser -import sys -import os -import operator -import time -import glob -import string -import platform - -def print_debug(line): - if options.silent == False: - sys.stdout.write(line) - -def print_file(line): - if options.output != "": - output = open(options.output, 'w') - output.writelines(line) - output.close() - -def build_test(): - global build_log - global is_windows - if is_windows == False: - os.system("make clean >> "+build_log) - return os.system("make CXX="+ref_compiler+" CC="+refc_compiler+" >> "+build_log+" 2>> "+build_log) - else: - os.system("msbuild /t:clean >> " + build_log) - return os.system("msbuild /V:m /p:Platform=x64 /p:Configuration=Release /p:TargetDir=.\ /t:rebuild >> " + build_log) - -def execute_test(command): - global perf_temp - r = 0 - if os.path.exists(perf_temp): - os.remove(perf_temp) - for k in range(int(options.number)): - r = r + os.system(command) - return r - -#gathers all tests results and made an item test from answer structure -def run_test(command, c1, c2, test, b_serial): - global perf_temp - if build_test() != 0: - sys.stdout.write("ERROR: Compilation fails\n") - return - if execute_test(command) != 0: - sys.stdout.write("ERROR: Execution fails\n") - return - tasks = [] #list of results with tasks, it will be test[2] - ispc = [] #list of results without tasks, it will be test[1] - absolute_tasks = [] #list of absolute results with tasks, it will be test[4] - absolute_ispc = [] #list of absolute results without tasks, ut will be test[3] - serial = [] #list serial times, it will be test[5] - j = 1 - for line in open(perf_temp): # we take test output - if "speedup" in line: # we are interested only in lines with speedup - if j == c1: # we are interested only in lines with c1 numbers - line = line.expandtabs(0) - line = line.replace("("," ") - line = line.split(",") - for i in range(len(line)): - subline = line[i].split(" ") - number = float(subline[1][:-1]) - if "speedup from ISPC + tasks" in line[i]: - tasks.append(number) - else: - ispc.append(number) - c1 = c1 + c2 - j+=1 - if "million cycles" in line: - if j == c1: - line = line.replace("]","[") - line = line.split("[") - number = float(line[3]) - if "tasks" in line[1]: - absolute_tasks.append(number) - else: - if "ispc" in line[1]: - absolute_ispc.append(number) - if "serial" in line[1]: - serial.append(number) - - if len(ispc) != 0: - if len(tasks) != 0: - print_debug("ISPC speedup / ISPC + tasks speedup / ISPC time / ISPC + tasks time / serial time\n") - for i in range(0,len(serial)): - print_debug("%10s /\t%10s\t /%9s / %10s\t /%10s\n" % - (ispc[i], tasks[i], absolute_ispc[i], absolute_tasks[i], serial[i])) - else: - print_debug("ISPC speedup / ISPC time / serial time\n") - for i in range(0,len(serial)): - print_debug("%10s /%9s /%10s\n" % (ispc[i], absolute_ispc[i], serial[i])) - else: - if len(tasks) != 0: - print_debug("ISPC + tasks speedup / ISPC + tasks time / serial time\n") - for i in range(0,len(serial)): - print_debug("%10s\t / %10s\t /%10s\n" % (tasks[i], absolute_tasks[i], serial[i])) - - test[1] = test[1] + ispc - test[2] = test[2] + tasks - test[3] = test[3] + absolute_ispc - test[4] = test[4] + absolute_tasks - if b_serial == True: - #if we concatenate outputs we should use only the first serial answer. 
- test[5] = test[5] + serial - -def cpu_get(): - p = open("/proc/stat", 'r') - cpu = p.readline() - p.close() - cpu = cpu.split(" ") - cpu_usage = (int(cpu[2]) + int(cpu[3]) + int(cpu[4])) - cpu_all = cpu_usage + int(cpu[5]) - return [cpu_usage, cpu_all] - -#returns cpu_usage -def cpu_check(): - if is_windows == False: - if is_mac == False: - cpu1 = cpu_get() - time.sleep(1) - cpu2 = cpu_get() - cpu_percent = (float(cpu1[0] - cpu2[0])/float(cpu1[1] - cpu2[1]))*100 - else: - os.system("sysctl -n vm.loadavg > cpu_temp") - c = open("cpu_temp", 'r') - c_line = c.readline() - c.close - os.remove("cpu_temp") - R = c_line.split(' ') - cpu_percent = float(R[1]) * 3 - else: - os.system("wmic cpu get loadpercentage /value > cpu_temp") - c = open("cpu_temp", 'r') - c_lines = c.readlines() - c.close() - os.remove("cpu_temp") - t = "0" - for i in c_lines[2]: - if i.isdigit(): - t = t + i - cpu_percent = int(t) - return cpu_percent - -#returns geomean of list -def geomean(par): - temp = 1 - l = len(par) - for i in range(l): - temp = temp * par[i] - temp = temp ** (1.0/l) - return round(temp, 2) - -#takes an answer struct and print it. -#answer struct: list answer contains lists test -#test[0] - name of test -#test[1] - list of results without tasks -#test[2] - list of results with tasks -#test[3] - list of absolute results without tasks -#test[4] - list of absolute results with tasks -#test[5] - list of absolute time without ISPC (serial) -#test[1..4] may be empty -def print_answer(answer): - filelist = [] - print_debug("--------------------------------------------------------------------------\n") - print_debug("test name:\t ISPC speedup: ISPC + tasks speedup: | " + - "ISPC time: ISPC + tasks time: serial:\n") - filelist.append("test name,ISPC speedup,diff," + - "ISPC + tasks speedup,diff,ISPC time,diff,ISPC + tasks time,diff,serial,diff\n") - max_t = [0,0,0,0,0] - diff_t = [0,0,0,0,0] - geomean_t = [0,0,0,0,0] - list_of_max = [[],[],[],[],[]] - for i in range(len(answer)): - for t in range(1,6): - if len(answer[i][t]) == 0: - max_t[t-1] = "n/a" - diff_t[t-1] = "n/a" - else: - if t < 3: - mm = max(answer[i][t]) - else: - mm = min(answer[i][t]) - max_t[t-1] = '%.2f' % mm - list_of_max[t-1].append(mm) - diff_t[t-1] = '%.2f' % (max(answer[i][t]) - min(answer[i][t])) - print_debug("%s:\n" % answer[i][0]) - print_debug("\t\tmax:\t%5s\t\t%10s\t|%10s\t%10s\t%10s\n" % - (max_t[0], max_t[1], max_t[2], max_t[3], max_t[4])) - print_debug("\t\tdiff:\t%5s\t\t%10s\t|%10s\t%10s\t%10s\n" % - (diff_t[0], diff_t[1], diff_t[2], diff_t[3], diff_t[4])) - for t in range(0,5): - if max_t[t] == "n/a": - max_t[t] = "" - if diff_t[t] == "n/a": - diff_t[t] = "" - filelist.append(answer[i][0] + "," + - max_t[0] + "," + diff_t[0] + "," + max_t[1] + "," + diff_t[1] + "," + - max_t[2] + "," + diff_t[2] + "," + max_t[3] + "," + diff_t[3] + "," + - max_t[4] + "," + diff_t[4] + "\n") - for i in range(0,5): - geomean_t[i] = geomean(list_of_max[i]) - print_debug("---------------------------------------------------------------------------------\n") - print_debug("Geomean:\t\t%5s\t\t%10s\t|%10s\t%10s\t%10s\n" % - (geomean_t[0], geomean_t[1], geomean_t[2], geomean_t[3], geomean_t[4])) - filelist.append("Geomean," + str(geomean_t[0]) + ",," + str(geomean_t[1]) - + ",," + str(geomean_t[2]) + ",," + str(geomean_t[3]) + ",," + str(geomean_t[4]) + "\n") - print_file(filelist) - - -###Main### -# parsing options -parser = OptionParser() -parser.add_option('-n', '--number', dest='number', - help='number of repeats', default="3") 
-parser.add_option('-c', '--config', dest='config', - help='config file of tests', default="./perf.ini") -parser.add_option('-p', '--path', dest='path', - help='path to examples directory', default="./") -parser.add_option('-s', '--silent', dest='silent', - help='silent mode, only table output', default=False, action="store_true") -parser.add_option('-o', '--output', dest='output', - help='output file for script reading', default="") -parser.add_option('--compiler', dest='compiler', - help='reference compiler', default="") -(options, args) = parser.parse_args() - -global is_windows -is_windows = (platform.system() == 'Windows' or - 'CYGWIN_NT' in platform.system()) -global is_mac -is_mac = (platform.system() == 'Darwin') - -# save corrent path -pwd = os.getcwd() -pwd = pwd + os.sep -if is_windows: - pwd = "..\\" - -# check if cpu usage is low now -cpu_percent = cpu_check() -if cpu_percent > 20: - sys.stdout.write("Warning: CPU Usage is very high.\n") - sys.stdout.write("Close other applications.\n") - -# check that required compilers exist -PATH_dir = string.split(os.getenv("PATH"), os.pathsep) -compiler_exists = False -ref_compiler_exists = False -if is_windows == False: - compiler = "ispc" - ref_compiler = "g++" - refc_compiler = "gcc" - if options.compiler != "": - if options.compiler == "clang" or options.compiler == "clang++": - ref_compiler = "clang++" - refc_compiler = "clang" - if options.compiler == "icc" or options.compiler == "icpc": - ref_compiler = "icpc" - refc_compiler = "icc" -else: - compiler = "ispc.exe" - ref_compiler = "cl.exe" -for counter in PATH_dir: - if os.path.exists(counter + os.sep + compiler): - compiler_exists = True - if os.path.exists(counter + os.sep + ref_compiler): - ref_compiler_exists = True -if not compiler_exists: - sys.stderr.write("Fatal error: ISPC compiler not found.\n") - sys.stderr.write("Added path to ispc compiler to your PATH variable.\n") - sys.exit() -if not ref_compiler_exists: - sys.stderr.write("Fatal error: reference compiler %s not found.\n" % ref_compiler) - sys.stderr.write("Added path to %s compiler to your PATH variable.\n" % ref_compiler) - sys.exit() - -# checks that config file exists -path_config = os.path.normpath(options.config) -if os.path.exists(path_config) == False: - sys.stderr.write("Fatal error: config file not found: %s.\n" % options.config) - sys.stderr.write("Set path to your config file in --config.\n") - sys.exit() - -# read lines from config file except comments -f = open(path_config, 'r') -f_lines = f.readlines() -f.close() -lines =[] -for i in range(len(f_lines)): - if f_lines[i][0] != "%": - lines.append(f_lines[i]) -length = len(lines) - -# prepare build.log and perf_temp files -global build_log -build_log = pwd + "build.log" -if is_windows == False: - if os.path.exists(build_log): - os.remove(build_log) -else: - if os.path.exists("build.log"): - os.remove("build.log") -global perf_temp -perf_temp = pwd + "perf_temp" - -i = 0 -answer = [] -print_debug("Okey go go go!\n\n") -os.system(compiler + " --version >" + build_log) -version = open(build_log) -print_debug("Using test compiler: " + version.readline()) -version.close() - -if is_windows == False: - os.system(ref_compiler + " --version >" + build_log) -else: - os.system(ref_compiler + " 2>" + build_log + " 1>&2") - -version = open(build_log) -print_debug("Using reference compiler: " + version.readline()) -version.close() - - -# loop for all tests -while i < length-2: - # we read name of test - print_debug("%s" % lines[i]) - test = 
[lines[i][:-1],[],[],[],[],[]] - # read location of test - folder = lines[i+1] - folder = folder[:-1] - folder = os.path.normpath(options.path + os.sep + folder) - # check that test exists - if os.path.exists(folder) == False: - sys.stdout.write("Fatal error: Can't find test %s. Your path is: \"%s\".\n" % (lines[i][:-1], options.path)) - sys.stdout.write("Change current location to /examples or set path to /examples in --path.\n") - exit(0) - os.chdir(folder) - # read parameters of test - command = lines[i+2] - command = command[:-1] - if is_windows == False: - command = "./"+command + " >> " + perf_temp - else: - command = "x64\\Release\\"+command + " >> " + perf_temp - # parsing config parameters - next_line = lines[i+3] - if next_line[0] == "!": # we should take only one part of test output - R = next_line.split(' ') - c1 = int(R[1]) #c1 is a number of string which we want to use in test output - c2 = int(R[2]) #c2 is total number of strings in test output - i = i+1 - else: - c1 = 1 - c2 = 1 - next_line = lines[i+3] - if next_line[0] == "^": #we should concatenate result of this test with previous one - run_test(command, c1, c2, answer[len(answer)-1], False) - i = i+1 - else: #we run this test and append it's result to answer structure - run_test(command, c1, c2, test, True) - answer.append(test) - # preparing next loop iteration - os.chdir(pwd) - i+=4 - -# delete temp file -if os.path.exists(perf_temp): - os.remove(perf_temp) -#print collected answer -print_answer(answer) diff --git a/examples/perfbench/Makefile b/examples/perfbench/Makefile index 02507c84..cc2e681f 100644 --- a/examples/perfbench/Makefile +++ b/examples/perfbench/Makefile @@ -2,7 +2,7 @@ EXAMPLE=perbench CPP_SRC=perfbench.cpp perfbench_serial.cpp ISPC_SRC=perfbench.ispc -ISPC_IA_TARGETS=sse2,sse4,avx +ISPC_IA_TARGETS=sse2-i32x4,sse4-i32x4,avx1-i32x8,avx2-i32x8 ISPC_ARM_TARGETS=neon include ../common.mk diff --git a/examples/perfbench/perfbench.vcxproj b/examples/perfbench/perfbench.vcxproj index 31974ac7..d94b753c 100644 --- a/examples/perfbench/perfbench.vcxproj +++ b/examples/perfbench/perfbench.vcxproj @@ -22,6 +22,7 @@ {d923bb7e-a7c8-4850-8fcf-0eb9ce35b4e8} Win32Proj perfbench + ispc @@ -155,15 +156,15 @@ Document - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4,avx + $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4,avx - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4,avx + $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4,avx $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4,avx + $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4,avx - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4,avx + $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h 
--target=sse2,sse4,avx $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h diff --git a/examples/rt/Makefile b/examples/rt/Makefile index 647086cb..e52b02e4 100644 --- a/examples/rt/Makefile +++ b/examples/rt/Makefile @@ -2,7 +2,7 @@ EXAMPLE=rt CPP_SRC=rt.cpp rt_serial.cpp ISPC_SRC=rt.ispc -ISPC_IA_TARGETS=sse2,sse4-x2,avx +ISPC_IA_TARGETS=sse2-i32x4,sse4-i32x8,avx1-i32x8,avx2-i32x8 ISPC_ARM_TARGETS=neon include ../common.mk diff --git a/examples/rt/rt.vcxproj b/examples/rt/rt.vcxproj index 4cfefb81..ea34de56 100644 --- a/examples/rt/rt.vcxproj +++ b/examples/rt/rt.vcxproj @@ -1,180 +1,16 @@ - - - Debug - Win32 - - - Debug - x64 - - - Release - Win32 - - - Release - x64 - - {E787BC3F-2D2E-425E-A64D-4721E2FF3DC9} Win32Proj rt + rt + sse2,sse4-x2,avx1-i32x8 - - - Application - true - Unicode - - - Application - true - Unicode - - - Application - false - true - Unicode - - - Application - false - true - Unicode - - - - - - - - - - - - - - - - - - - true - $(ProjectDir)..\..;$(ExecutablePath) - - - true - $(ProjectDir)..\..;$(ExecutablePath) - - - false - $(ProjectDir)..\..;$(ExecutablePath) - - - false - $(ProjectDir)..\..;$(ExecutablePath) - - - - - - Level3 - Disabled - WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - true - Fast - - - Console - true - - - - - - - Level3 - Disabled - WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - true - Fast - - - Console - true - - - - - Level3 - - - MaxSpeed - true - true - WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - Fast - - - Console - true - true - true - - - - - Level3 - - - MaxSpeed - true - true - WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - Fast - - - Console - true - true - true - - - - - Document - -ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx - - -ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx - - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - -ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx - - -ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx - - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - - + - - - diff --git a/examples/simple/Makefile b/examples/simple/Makefile index 80f09193..dce7942b 100644 --- a/examples/simple/Makefile +++ b/examples/simple/Makefile @@ -1,5 +1,5 @@ -CXX=g++ -m64 +CXX=clang++ -m64 CXXFLAGS=-Iobjs/ -O3 -Wall ISPC=ispc ISPCFLAGS=-O2 --arch=x86-64 --target=sse2 diff --git a/examples/simple/simple.vcxproj b/examples/simple/simple.vcxproj index 65af97bb..34908223 
100644 --- a/examples/simple/simple.vcxproj +++ b/examples/simple/simple.vcxproj @@ -25,18 +25,18 @@ Document -ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2 +$(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2 -ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2 +$(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2 $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_ispc.h $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_ispc.h -ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2 +$(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2 -ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2 +$(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2 $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_ispc.h $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_ispc.h @@ -46,6 +46,7 @@ ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filena {947C5311-8B78-4D05-BEE4-BCF342D4B367} Win32Proj simple + ispc diff --git a/examples/sort/Makefile b/examples/sort/Makefile index cf6bffa4..6ae43fae 100644 --- a/examples/sort/Makefile +++ b/examples/sort/Makefile @@ -2,7 +2,7 @@ EXAMPLE=sort CPP_SRC=sort.cpp sort_serial.cpp ISPC_SRC=sort.ispc -ISPC_IA_TARGETS=sse2,sse4-x2,avx +ISPC_IA_TARGETS=sse2-i32x4,sse4-i32x8,avx1-i32x8,avx2-i32x8 ISPC_ARM_TARGETS=neon #ISPC_FLAGS=-DDEBUG diff --git a/examples/sort/sort.cpp b/examples/sort/sort.cpp index 1d05b247..20221d90 100644 --- a/examples/sort/sort.cpp +++ b/examples/sort/sort.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2013, Intel Corporation + Copyright (c) 2013, Durham University All rights reserved. Redistribution and use in source and binary forms, with or without @@ -13,7 +13,7 @@ notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - * Neither the name of Intel Corporation nor the names of its + * Neither the name of Durham University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
@@ -78,7 +78,7 @@ int main (int argc, char *argv[]) for (i = 0; i < m; i ++) { - for (j = 0; j < n; j ++) code [j] = random() % l; + for (j = 0; j < n; j ++) code [j] = rand() % l; reset_and_start_timer(); @@ -86,7 +86,8 @@ int main (int argc, char *argv[]) tISPC1 += get_elapsed_mcycles(); - progressbar (i, m); + if (argc != 3) + progressbar (i, m); } printf("[sort ispc]:\t[%.3f] million cycles\n", tISPC1); @@ -95,7 +96,7 @@ int main (int argc, char *argv[]) for (i = 0; i < m; i ++) { - for (j = 0; j < n; j ++) code [j] = random() % l; + for (j = 0; j < n; j ++) code [j] = rand() % l; reset_and_start_timer(); @@ -103,16 +104,17 @@ int main (int argc, char *argv[]) tISPC2 += get_elapsed_mcycles(); - progressbar (i, m); + if (argc != 3) + progressbar (i, m); } - - printf("[sort ispc+tasks]:\t[%.3f] million cycles\n", tISPC2); + + printf("[sort ispc + tasks]:\t[%.3f] million cycles\n", tISPC2); srand (0); for (i = 0; i < m; i ++) { - for (j = 0; j < n; j ++) code [j] = random() % l; + for (j = 0; j < n; j ++) code [j] = rand() % l; reset_and_start_timer(); @@ -120,13 +122,13 @@ int main (int argc, char *argv[]) tSerial += get_elapsed_mcycles(); - progressbar (i, m); + if (argc != 3) + progressbar (i, m); } printf("[sort serial]:\t\t[%.3f] million cycles\n", tSerial); - printf("\t\t\t\t(%.2fx speedup from ISPC serial)\n", tSerial/tISPC1); - printf("\t\t\t\t(%.2fx speedup from ISPC with tasks)\n", tSerial/tISPC2); + printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n", tSerial/tISPC1, tSerial/tISPC2); delete code; delete order; diff --git a/examples/sort/sort.ispc b/examples/sort/sort.ispc index 65df4736..25ea90f4 100644 --- a/examples/sort/sort.ispc +++ b/examples/sort/sort.ispc @@ -1,5 +1,5 @@ /* - Copyright (c) 2013, Intel Corporation + Copyright (c) 2013, Durham University All rights reserved. Redistribution and use in source and binary forms, with or without @@ -13,7 +13,7 @@ notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - * Neither the name of Intel Corporation nor the names of its + * Neither the name of Durham University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. @@ -172,7 +172,7 @@ task void bumpup (uniform int h[], uniform int g[]) static void prefix_sum (uniform int num, uniform int h[]) { - uniform int * uniform g = uniform new int [num+1]; + uniform int * uniform g = uniform new uniform int [num+1]; uniform int i; launch[num] addup (h, g+1); @@ -191,9 +191,9 @@ export void sort_ispc (uniform int n, uniform unsigned int code[], uniform int o uniform int num = ntasks < 1 ? 
num_cores () : ntasks; uniform int span = n / num; uniform int hsize = 256*programCount*num; - uniform int * uniform hist = uniform new int [hsize]; - uniform int64 * uniform pair = uniform new int64 [n]; - uniform int64 * uniform temp = uniform new int64 [n]; + uniform int * uniform hist = uniform new uniform int [hsize]; + uniform int64 * uniform pair = uniform new uniform int64 [n]; + uniform int64 * uniform temp = uniform new uniform int64 [n]; uniform int pass, i; #if DEBUG diff --git a/examples/sort/sort.vcxproj b/examples/sort/sort.vcxproj new file mode 100644 index 00000000..43f2b439 --- /dev/null +++ b/examples/sort/sort.vcxproj @@ -0,0 +1,16 @@ + + + + {6D3EF8C5-AE26-407B-9ECE-C27CB988D9C2} + Win32Proj + sort + sort + sse2,sse4-x2,avx1-x2 + + + + + + + + diff --git a/examples/sort/sort_serial.cpp b/examples/sort/sort_serial.cpp index ba955c77..38bbdda6 100644 --- a/examples/sort/sort_serial.cpp +++ b/examples/sort/sort_serial.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2013, Intel Corporation + Copyright (c) 2013, Durham University All rights reserved. Redistribution and use in source and binary forms, with or without @@ -13,7 +13,7 @@ notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - * Neither the name of Intel Corporation nor the names of its + * Neither the name of Durham University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. diff --git a/examples/stencil/Makefile b/examples/stencil/Makefile index 097cd597..1b9c2717 100644 --- a/examples/stencil/Makefile +++ b/examples/stencil/Makefile @@ -2,7 +2,7 @@ EXAMPLE=stencil CPP_SRC=stencil.cpp stencil_serial.cpp ISPC_SRC=stencil.ispc -ISPC_IA_TARGETS=sse2,sse4-x2,avx-x2 +ISPC_IA_TARGETS=sse2-i32x4,sse4-i32x8,avx1-i32x16,avx2-i32x16 ISPC_ARM_TARGETS=neon include ../common.mk diff --git a/examples/stencil/stencil.vcxproj b/examples/stencil/stencil.vcxproj index ce5d7979..b5f5bb22 100644 --- a/examples/stencil/stencil.vcxproj +++ b/examples/stencil/stencil.vcxproj @@ -1,180 +1,16 @@ - - - Debug - Win32 - - - Debug - x64 - - - Release - Win32 - - - Release - x64 - - {2ef070a1-f62f-4e6a-944b-88d140945c3c} Win32Proj rt + stencil + sse2,sse4-x2,avx1-i32x8 - - - Application - true - Unicode - - - Application - true - Unicode - - - Application - false - true - Unicode - - - Application - false - true - Unicode - - - - - - - - - - - - - - - - - - - true - $(ProjectDir)..\..;$(ExecutablePath) - - - true - $(ProjectDir)..\..;$(ExecutablePath) - - - false - $(ProjectDir)..\..;$(ExecutablePath) - - - false - $(ProjectDir)..\..;$(ExecutablePath) - - - - - - Level3 - Disabled - WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - true - Fast - - - Console - true - - - - - - - Level3 - Disabled - WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - true - Fast - - - Console - true - - - - - Level3 - - - MaxSpeed - true - true - WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - Fast - - - Console - true - true - true - - - - - Level3 - - - MaxSpeed - true - true - WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - Fast - - - Console - true - true - true - - - - - Document - -ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx - - -ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h 
--target=sse2,sse4-x2,avx - - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - -ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx - - -ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx - - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - - + - - - diff --git a/examples/tasksys.cpp b/examples/tasksys.cpp index c9c2fa7b..b97c4bba 100644 --- a/examples/tasksys.cpp +++ b/examples/tasksys.cpp @@ -693,10 +693,20 @@ InitTaskSystem() { } char name[32]; - sprintf(name, "ispc_task.%d", (int)getpid()); - workerSemaphore = sem_open(name, O_CREAT, S_IRUSR|S_IWUSR, 0); - if (!workerSemaphore) { - fprintf(stderr, "Error creating semaphore: %s\n", strerror(err)); + bool success = false; + srand(time(NULL)); + for (int i = 0; i < 10; i++) { + sprintf(name, "ispc_task.%d.%d", (int)getpid(), (int)rand()); + workerSemaphore = sem_open(name, O_CREAT, S_IRUSR|S_IWUSR, 0); + if (workerSemaphore != SEM_FAILED) { + success = true; + break; + } + fprintf(stderr, "Failed to create %s\n", name); + } + + if (!success) { + fprintf(stderr, "Error creating semaphore (%s): %s\n", name, strerror(errno)); exit(1); } diff --git a/examples/volume_rendering/Makefile b/examples/volume_rendering/Makefile index 7bb86e10..1bc81e4e 100644 --- a/examples/volume_rendering/Makefile +++ b/examples/volume_rendering/Makefile @@ -2,7 +2,7 @@ EXAMPLE=volume CPP_SRC=volume.cpp volume_serial.cpp ISPC_SRC=volume.ispc -ISPC_IA_TARGETS=sse2,sse4-x2,avx +ISPC_IA_TARGETS=sse2-i32x4,sse4-i32x8,avx1-i32x8,avx2-i32x8 ISPC_ARM_TARGETS=neon include ../common.mk diff --git a/examples/volume_rendering/volume.vcxproj b/examples/volume_rendering/volume.vcxproj index 908cf734..cc738a7e 100644 --- a/examples/volume_rendering/volume.vcxproj +++ b/examples/volume_rendering/volume.vcxproj @@ -1,176 +1,16 @@ - - - Debug - Win32 - - - Debug - x64 - - - Release - Win32 - - - Release - x64 - - {dee5733a-e93e-449d-9114-9bffcaeb4df9} Win32Proj volume + volume + sse2,sse4-x2,avx1-i32x8 - - - Application - true - Unicode - - - Application - true - Unicode - - - Application - false - true - Unicode - - - Application - false - true - Unicode - - - - - - - - - - - - - - - - - - - true - $(ProjectDir)..\..;$(ExecutablePath) - - - true - $(ProjectDir)..\..;$(ExecutablePath) - - - false - $(ProjectDir)..\..;$(ExecutablePath) - - - false - $(ProjectDir)..\..;$(ExecutablePath) - - - - - - Level3 - Disabled - WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - true - Fast - - - Console - true - - - - - - - Level3 - Disabled - WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - true - Fast - - - Console - true - - - - - Level3 - - - MaxSpeed - true - true - WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - Fast - - - Console - true - true - true - - - - - Level3 - - - MaxSpeed - true - true - WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - Fast - 
- - Console - true - true - true - - + - - - Document - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx - - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx - - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx - - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx - - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - - - - - diff --git a/expr.cpp b/expr.cpp index 614cb5e5..1cbebad5 100644 --- a/expr.cpp +++ b/expr.cpp @@ -1660,6 +1660,64 @@ BinaryExpr::BinaryExpr(Op o, Expr *a, Expr *b, SourcePos p) arg1 = b; } +Expr *lCreateBinaryOperatorCall(const BinaryExpr::Op bop, + Expr *a0, Expr *a1, + const SourcePos &sp) +{ + if ((a0 == NULL) || (a1 == NULL)) { + return NULL; + } + Expr *arg0 = a0->TypeCheck(); + Expr *arg1 = a1->TypeCheck(); + if ((arg0 == NULL) || (arg1 == NULL)) { + return NULL; + } + const Type *type0 = arg0->GetType(); + const Type *type1 = arg1->GetType(); + + // If either operand is a reference, dereference it before we move + // forward + if (CastType(type0) != NULL) { + arg0 = new RefDerefExpr(arg0, arg0->pos); + type0 = arg0->GetType(); + } + if (CastType(type1) != NULL) { + arg1 = new RefDerefExpr(arg1, arg1->pos); + type1 = arg1->GetType(); + } + if ((type0 == NULL) || (type1 == NULL)) { + return NULL; + } + if (CastType(type0) != NULL || + CastType(type1) != NULL) { + std::string opName = std::string("operator") + lOpString(bop); + std::vector funs; + m->symbolTable->LookupFunction(opName.c_str(), &funs); + if (funs.size() == 0) { + Error(sp, "operator %s(%s, %s) is not defined.", + opName.c_str(), (type0->GetString()).c_str(), (type1->GetString()).c_str()); + return NULL; + } + Expr *func = new FunctionSymbolExpr(opName.c_str(), funs, sp); + ExprList *args = new ExprList(sp); + args->exprs.push_back(arg0); + args->exprs.push_back(arg1); + Expr *opCallExpr = new FunctionCallExpr(func, args, sp); + return opCallExpr; + } + return NULL; +} + + +Expr * MakeBinaryExpr(BinaryExpr::Op o, Expr *a, Expr *b, SourcePos p) { + Expr * op = lCreateBinaryOperatorCall(o, a, b, p); + if (op != NULL) { + return op; + } + op = new BinaryExpr(o, a, b, p); + return op; +} + /** Emit code for a && or || logical operator. In particular, the code here handles "short-circuit" evaluation, where the second expression @@ -2740,6 +2798,17 @@ BinaryExpr::TypeCheck() { } } +const Type * +BinaryExpr::GetLValueType() const { + const Type *t = GetType(); + if (CastType(t) != NULL) { + // Are we doing something like (basePtr + offset)[...] = ... 
+ return t; + } + else { + return NULL; + } +} int BinaryExpr::EstimateCost() const { @@ -2985,29 +3054,10 @@ AssignExpr::TypeCheck() { if (lvalueIsReference) lvalue = new RefDerefExpr(lvalue, lvalue->pos); - FunctionSymbolExpr *fse; - if ((fse = dynamic_cast(rvalue)) != NULL) { - // Special case to use the type of the LHS to resolve function - // overloads when we're assigning a function pointer where the - // function is overloaded. - const Type *lvalueType = lvalue->GetType(); - const FunctionType *ftype; - if (CastType(lvalueType) == NULL || - (ftype = CastType(lvalueType->GetBaseType())) == NULL) { - Error(lvalue->pos, "Can't assign function pointer to type \"%s\".", - lvalueType ? lvalueType->GetString().c_str() : ""); - return NULL; - } - - std::vector paramTypes; - for (int i = 0; i < ftype->GetNumParameters(); ++i) - paramTypes.push_back(ftype->GetParameterType(i)); - - if (!fse->ResolveOverloads(rvalue->pos, paramTypes)) { - Error(pos, "Unable to find overloaded function for function " - "pointer assignment."); - return NULL; - } + if (PossiblyResolveFunctionOverloads(rvalue, lvalue->GetType()) == false) { + Error(pos, "Unable to find overloaded function for function " + "pointer assignment."); + return NULL; } const Type *lhsType = lvalue->GetType(); @@ -3650,10 +3700,37 @@ FunctionCallExpr::GetLValue(FunctionEmitContext *ctx) const { return NULL; } } - + + +bool FullResolveOverloads(Expr * func, ExprList * args, + std::vector *argTypes, + std::vector *argCouldBeNULL, + std::vector *argIsConstant) { + for (unsigned int i = 0; i < args->exprs.size(); ++i) { + Expr *expr = args->exprs[i]; + if (expr == NULL) + return false; + const Type *t = expr->GetType(); + if (t == NULL) + return false; + argTypes->push_back(t); + argCouldBeNULL->push_back(lIsAllIntZeros(expr) || dynamic_cast(expr)); + argIsConstant->push_back(dynamic_cast(expr) || dynamic_cast(expr)); + } + return true; +} + const Type * FunctionCallExpr::GetType() const { + std::vector argTypes; + std::vector argCouldBeNULL, argIsConstant; + if (FullResolveOverloads(func, args, &argTypes, &argCouldBeNULL, &argIsConstant) == true) { + FunctionSymbolExpr *fse = dynamic_cast(func); + if (fse != NULL) { + fse->ResolveOverloads(args->pos, argTypes, &argCouldBeNULL, &argIsConstant); + } + } const FunctionType *ftype = lGetFunctionType(func); return ftype ? 
ftype->GetReturnType() : NULL; } @@ -3689,20 +3766,9 @@ FunctionCallExpr::TypeCheck() { std::vector argTypes; std::vector argCouldBeNULL, argIsConstant; - for (unsigned int i = 0; i < args->exprs.size(); ++i) { - Expr *expr = args->exprs[i]; - if (expr == NULL) - return NULL; - const Type *t = expr->GetType(); - if (t == NULL) - return NULL; - - argTypes.push_back(t); - argCouldBeNULL.push_back(lIsAllIntZeros(expr) || - dynamic_cast(expr)); - argIsConstant.push_back(dynamic_cast(expr) || - dynamic_cast(expr)); + if (FullResolveOverloads(func, args, &argTypes, &argCouldBeNULL, &argIsConstant) == false) { + return NULL; } FunctionSymbolExpr *fse = dynamic_cast(func); @@ -4211,8 +4277,9 @@ IndexExpr::GetValue(FunctionEmitContext *ctx) const { } else { Symbol *baseSym = GetBaseSymbol(); - if (dynamic_cast(baseExpr) == NULL) { - // Only check for non-function calls + if (dynamic_cast(baseExpr) == NULL && + dynamic_cast(baseExpr) == NULL) { + // Don't check if we're doing a function call or pointer arith AssertPos(pos, baseSym != NULL); } mask = lMaskForSymbol(baseSym, ctx); @@ -7010,8 +7077,22 @@ TypeCastExpr::GetLValue(FunctionEmitContext *ctx) const { const Type * TypeCastExpr::GetType() const { - AssertPos(pos, type->HasUnboundVariability() == false); - return type; + // Here we try to resolve the situation where (base_type) can be treated as + // (uniform base_type) or (varying base_type). This is a part of the function + // TypeCastExpr::TypeCheck. After the implementation of operators we + // have to have this functionality here. + const Type *toType = type, *fromType = expr->GetType(); + if (toType == NULL || fromType == NULL) + return NULL; + if (toType->HasUnboundVariability()) { + if (fromType->IsUniformType()) { + toType = type->ResolveUnboundVariability(Variability::Uniform); + } else { + toType = type->ResolveUnboundVariability(Variability::Varying); + } + } + AssertPos(pos, toType->HasUnboundVariability() == false); + return toType; } @@ -8190,6 +8271,9 @@ FunctionSymbolExpr::ResolveOverloads(SourcePos argPos, const std::vector *argCouldBeNULL, const std::vector *argIsConstant) { const char *funName = candidateFunctions.front()->name.c_str(); + if (triedToResolve == true) { + return true; + } triedToResolve = true; diff --git a/expr.h b/expr.h index 42fdff45..45780414 100644 --- a/expr.h +++ b/expr.h @@ -155,6 +155,7 @@ public: llvm::Value *GetValue(FunctionEmitContext *ctx) const; const Type *GetType() const; + const Type *GetLValueType() const; void Print() const; Expr *Optimize(); @@ -730,6 +731,8 @@ bool CanConvertTypes(const Type *fromType, const Type *toType, */ Expr *TypeConvertExpr(Expr *expr, const Type *toType, const char *errorMsgBase); +Expr * MakeBinaryExpr(BinaryExpr::Op o, Expr *a, Expr *b, SourcePos p); + /** Utility routine that emits code to initialize a symbol given an initializer expression. diff --git a/fail_db.txt b/fail_db.txt new file mode 100644 index 00000000..ff119d5a --- /dev/null +++ b/fail_db.txt @@ -0,0 +1,633 @@ +% List of known fails. +% The list is unordered and contains information about commonly used platforms / configurations. +% Our goal is to maintain this list for Linux, MacOS and Windows with reasonably new compilers. +% Note that it's important which C++ compiler was used. The currently supported C++ compilers are +% clang 3.3 on Linux and MacOS and cl (VS2010) on Windows. +% Please also note that it's very important to have correctly built LLVM. 
There are a number of +% LLVM bugs in released versions, that we have to workaround by applying patches (see llvm_patches +% folder). The recommended way to build LLVM on Unix is to use "alloy.py". +% +.\tests\exclusive-scan-add-9.ispc runfail x86 sse2-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-10.ispc runfail x86 sse2-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint64.ispc runfail x86 sse2-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max-1.ispc runfail x86 sse2-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max.ispc runfail x86 sse2-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min-1.ispc runfail x86 sse2-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min.ispc runfail x86 sse2-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-10.ispc runfail x86 sse2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-9.ispc runfail x86 sse2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-10.ispc runfail x86 sse2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint64.ispc runfail x86 sse2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max-1.ispc runfail x86 sse2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max.ispc runfail x86 sse2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min-1.ispc runfail x86 sse2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min.ispc runfail x86 sse2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-10.ispc runfail x86 sse4-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-9.ispc runfail x86 sse4-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint-1.ispc runfail x86 sse4-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint.ispc runfail x86 sse4-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\min-uint-2.ispc runfail x86 sse4-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\packed-load-1.ispc runfail x86 sse4-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\packed-store.ispc runfail x86 sse4-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint-1.ispc runfail x86 sse4-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint.ispc runfail x86 sse4-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64-1.ispc runfail x86 sse4-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64.ispc runfail x86 sse4-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint.ispc runfail x86 sse4-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint-1.ispc runfail x86 sse4-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint.ispc runfail x86 sse4-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\min-uint-2.ispc runfail x86 sse4-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\packed-load-1.ispc runfail x86 sse4-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\packed-store.ispc runfail x86 sse4-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint-1.ispc runfail x86 sse4-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint.ispc runfail x86 sse4-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64-1.ispc runfail x86 sse4-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64.ispc runfail x86 sse4-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint.ispc runfail x86 sse4-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-10.ispc runfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint-1.ispc runfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint.ispc runfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\min-uint-2.ispc runfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\packed-load-1.ispc runfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\packed-store.ispc runfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * 
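% A hedged reading of the entry format, inferred from the surrounding records (the file itself does not
% spell it out); "example-test.ispc" below is a hypothetical name used only for illustration:
%   <test path> <runfail|compfail> <arch> <ispc target> <OS> <LLVM version> <C++ compiler> <opt level> *
%   .\tests\example-test.ispc runfail x86 sse4-i32x4 Windows LLVM 3.3 cl -O2 *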
+.\tests\reduce-add-uint-1.ispc runfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint.ispc runfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64-1.ispc runfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64.ispc runfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint.ispc runfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\atomics-13.ispc compfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-10.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-9.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\funcptr-null-4.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\funcptr-null-5.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\funcptr-null-6.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint-1.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\min-uint-2.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\packed-load-1.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\packed-store.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint-1.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64-1.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint64.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-min-uint64.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\atomics-13.ispc compfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-10.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint-1.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\min-uint-2.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\packed-load-1.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\packed-store.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint-1.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64-1.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-10.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint64.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-min-uint64.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max-1.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min-1.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-10.ispc compfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-11.ispc compfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-12.ispc compfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-8.ispc compfail x86 
avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-9.ispc compfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-10.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint-1.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\min-uint-1.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\min-uint-2.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\packed-load-1.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\packed-store.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint-1.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64-1.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-10.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint64.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-min-uint.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-min-uint64.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max-1.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min-1.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-10.ispc compfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-11.ispc compfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-12.ispc compfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-8.ispc compfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-9.ispc compfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-10.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint-1.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\min-uint-2.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\packed-load-1.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\packed-store.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint-1.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64-1.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-10.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint64.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-min-uint64.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max-1.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min-1.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-10.ispc compfail x86 avx1.1-i32x8 Windows 
LLVM 3.3 cl -O2 * +.\tests\switch-11.ispc compfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-12.ispc compfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-8.ispc compfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-9.ispc compfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-10.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint-1.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\min-uint-1.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\min-uint-2.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\packed-load-1.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\packed-store.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint-1.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64-1.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-10.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint64.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-min-uint.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-min-uint64.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max-1.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min-1.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-10.ispc compfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-11.ispc compfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-12.ispc compfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-8.ispc compfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-9.ispc compfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\atomics-13.ispc compfail x86-64 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\funcptr-null-4.ispc runfail x86-64 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\funcptr-null-5.ispc runfail x86-64 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\funcptr-null-6.ispc runfail x86-64 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\atomics-13.ispc compfail x86-64 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-10.ispc compfail x86-64 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-11.ispc compfail x86-64 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-12.ispc compfail x86-64 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-8.ispc compfail x86-64 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-9.ispc compfail x86-64 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-10.ispc compfail x86-64 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-11.ispc compfail x86-64 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-12.ispc compfail x86-64 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-8.ispc compfail x86-64 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-9.ispc compfail x86-64 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-10.ispc compfail x86-64 avx1.1-i32x8 Windows LLVM 
3.3 cl -O2 * +.\tests\switch-11.ispc compfail x86-64 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-12.ispc compfail x86-64 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-8.ispc compfail x86-64 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-9.ispc compfail x86-64 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-10.ispc compfail x86-64 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-11.ispc compfail x86-64 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-12.ispc compfail x86-64 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-8.ispc compfail x86-64 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-9.ispc compfail x86-64 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-10.ispc compfail x86-64 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-11.ispc compfail x86-64 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-12.ispc compfail x86-64 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-8.ispc compfail x86-64 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-9.ispc compfail x86-64 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\test-141.ispc runfail x86-64 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-10.ispc compfail x86-64 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-11.ispc compfail x86-64 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-12.ispc compfail x86-64 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-8.ispc compfail x86-64 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-9.ispc compfail x86-64 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-10.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-9.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint-1.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\min-uint-2.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\packed-load-1.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\packed-store.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint-1.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64-1.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-10.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint64.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-min-uint64.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max-1.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min-1.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-10.ispc compfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-11.ispc compfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-12.ispc compfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-8.ispc compfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-9.ispc compfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-10.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-9.ispc runfail 
x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint-1.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\min-uint-1.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\min-uint-2.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\packed-load-1.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\packed-store.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint-1.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64-1.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-10.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint64.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-min-uint.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-min-uint64.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\test-141.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max-1.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min-1.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-10.ispc compfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-11.ispc compfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-12.ispc compfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-8.ispc compfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-9.ispc compfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-10.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-9.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint-1.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\min-uint-2.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\packed-load-1.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\packed-store.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint-1.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64-1.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-min-uint64.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max-1.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min-1.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-min-uint64.ispc runfail x86 avx1-i64x4 Windows LLVM 3.4 cl -O2 * +./tests/atomics-13.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 clang++3.3 -O2 * +./tests/atomics-13.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 clang++3.3 -O2 * 
+./tests/atomics-13.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 clang++3.3 -O2 * +./tests/atomics-13.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 clang++3.3 -O2 * +./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O2 * +./tests/short-vec-8.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O2 * +./tests/test-143.ispc runfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O2 * +./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O2 * +./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O2 * +./tests/short-vec-8.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O2 * +./tests/test-143.ispc runfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O2 * +./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O2 * +./tests/atomics-13.ispc compfail x86 sse4-i16x8 Mac LLVM 3.3 clang++3.3 -O2 * +./tests/atomics-13.ispc compfail x86-64 sse4-i16x8 Mac LLVM 3.3 clang++3.3 -O2 * +./tests/funcptr-null-4.ispc runfail x86 sse4-i8x16 Mac LLVM 3.3 clang++3.3 -O2 * +./tests/funcptr-null-5.ispc runfail x86 sse4-i8x16 Mac LLVM 3.3 clang++3.3 -O2 * +./tests/funcptr-null-6.ispc runfail x86 sse4-i8x16 Mac LLVM 3.3 clang++3.3 -O2 * +./tests/atomics-13.ispc compfail x86 sse4-i8x16 Mac LLVM 3.3 clang++3.3 -O2 * +./tests/funcptr-null-4.ispc runfail x86-64 sse4-i8x16 Mac LLVM 3.3 clang++3.3 -O2 * +./tests/funcptr-null-5.ispc runfail x86-64 sse4-i8x16 Mac LLVM 3.3 clang++3.3 -O2 * +./tests/funcptr-null-6.ispc runfail x86-64 sse4-i8x16 Mac LLVM 3.3 clang++3.3 -O2 * +./tests/atomics-13.ispc compfail x86-64 sse4-i8x16 Mac LLVM 3.3 clang++3.3 -O2 * +./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-4 Mac LLVM 3.3 clang++3.3 -O2 * +./tests/short-vec-8.ispc compfail x86-64 generic-4 Mac LLVM 3.3 clang++3.3 -O2 * +./tests/test-141.ispc runfail x86-64 generic-16 Mac LLVM 3.3 clang++3.3 -O2 * +./tests/test-143.ispc runfail x86-64 generic-16 Mac LLVM 3.3 clang++3.3 -O2 * +./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-16 Mac LLVM 3.3 clang++3.3 -O2 * +./tests/test-141.ispc runfail x86 avx2-i32x16 Mac LLVM 3.3 clang++3.3 -O2 * +./tests/test-141.ispc runfail x86-64 avx2-i32x16 Mac LLVM 3.3 clang++3.3 -O2 * +./tests/funcptr-null-4.ispc runfail x86 sse4-i8x16 Mac LLVM 3.4 clang++3.3 -O2 * +./tests/funcptr-null-5.ispc runfail x86 sse4-i8x16 Mac LLVM 3.4 clang++3.3 -O2 * +./tests/funcptr-null-6.ispc runfail x86 sse4-i8x16 Mac LLVM 3.4 clang++3.3 -O2 * +./tests/funcptr-null-4.ispc runfail x86-64 sse4-i8x16 Mac LLVM 3.4 clang++3.3 -O2 * +./tests/funcptr-null-5.ispc runfail x86-64 sse4-i8x16 Mac LLVM 3.4 clang++3.3 -O2 * +./tests/funcptr-null-6.ispc runfail x86-64 sse4-i8x16 Mac LLVM 3.4 clang++3.3 -O2 * +./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-4 Mac LLVM 3.4 clang++3.3 -O2 * +./tests/short-vec-8.ispc compfail x86-64 generic-4 Mac LLVM 3.4 clang++3.3 -O2 * +./tests/test-141.ispc runfail x86-64 generic-16 Mac LLVM 3.4 clang++3.3 -O2 * +./tests/test-143.ispc runfail x86-64 generic-16 Mac LLVM 3.4 clang++3.3 -O2 * +./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-16 Mac LLVM 3.4 clang++3.3 -O2 * +./tests/test-141.ispc runfail x86 avx2-i32x16 Mac LLVM 3.4 clang++3.3 -O2 * +./tests/test-141.ispc runfail x86-64 avx2-i32x16 Mac LLVM 3.4 clang++3.3 -O2 * +.\tests\exclusive-scan-add-10.ispc runfail x86 avx1.1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-9.ispc runfail x86 avx1.1-i64x4 Windows LLVM 3.3 cl -O2 * 
+.\tests\max-uint-1.ispc runfail x86 avx1.1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint.ispc runfail x86 avx1.1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\min-uint-2.ispc runfail x86 avx1.1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\packed-load-1.ispc runfail x86 avx1.1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\packed-store.ispc runfail x86 avx1.1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint-1.ispc runfail x86 avx1.1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint.ispc runfail x86 avx1.1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64-1.ispc runfail x86 avx1.1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64.ispc runfail x86 avx1.1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint.ispc runfail x86 avx1.1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-min-uint64.ispc runfail x86 avx1.1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max-1.ispc runfail x86 avx1.1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max.ispc runfail x86 avx1.1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min-1.ispc runfail x86 avx1.1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min.ispc runfail x86 avx1.1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-10.ispc runfail x86 avx2-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-9.ispc runfail x86 avx2-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint-1.ispc runfail x86 avx2-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint.ispc runfail x86 avx2-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\min-uint-2.ispc runfail x86 avx2-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\packed-load-1.ispc runfail x86 avx2-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\packed-store.ispc runfail x86 avx2-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint-1.ispc runfail x86 avx2-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint.ispc runfail x86 avx2-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64-1.ispc runfail x86 avx2-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64.ispc runfail x86 avx2-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint.ispc runfail x86 avx2-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-min-uint.ispc runfail x86 avx2-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-min-uint64.ispc runfail x86 avx2-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max-1.ispc runfail x86 avx2-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-min-uint64.ispc runfail x86 avx1.1-i64x4 Windows LLVM 3.4 cl -O2 * +.\tests\reduce-min-uint.ispc runfail x86 avx2-i64x4 Windows LLVM 3.4 cl -O2 * +.\tests\reduce-min-uint64.ispc runfail x86 avx2-i64x4 Windows LLVM 3.4 cl -O2 * +.\tests\funcptr-null-4.ispc runfail x86 sse4-i8x16 Windows LLVM 3.4 cl -O2 * +.\tests\funcptr-null-5.ispc runfail x86 sse4-i8x16 Windows LLVM 3.4 cl -O2 * +.\tests\funcptr-null-6.ispc runfail x86 sse4-i8x16 Windows LLVM 3.4 cl -O2 * +.\tests\reduce-equal-10.ispc runfail x86 avx1-i32x8 Windows LLVM 3.4 cl -O2 * +.\tests\switch-10.ispc compfail x86 avx1-i32x8 Windows LLVM 3.4 cl -O2 * +.\tests\switch-11.ispc compfail x86 avx1-i32x8 Windows LLVM 3.4 cl -O2 * +.\tests\switch-12.ispc compfail x86 avx1-i32x8 Windows LLVM 3.4 cl -O2 * +.\tests\switch-8.ispc compfail x86 avx1-i32x8 Windows LLVM 3.4 cl -O2 * +.\tests\switch-9.ispc compfail x86 avx1-i32x8 Windows LLVM 3.4 cl -O2 * +.\tests\reduce-add-uint-1.ispc runfail x86 avx1-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\reduce-min-uint64.ispc runfail x86 avx1-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\switch-10.ispc compfail x86 avx1-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\switch-11.ispc 
compfail x86 avx1-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\switch-12.ispc compfail x86 avx1-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\switch-8.ispc compfail x86 avx1-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\switch-9.ispc compfail x86 avx1-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\reduce-min-uint.ispc runfail x86 avx1-i64x4 Windows LLVM 3.4 cl -O2 * +.\tests\reduce-equal-10.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.4 cl -O2 * +.\tests\switch-10.ispc compfail x86 avx1.1-i32x8 Windows LLVM 3.4 cl -O2 * +.\tests\switch-11.ispc compfail x86 avx1.1-i32x8 Windows LLVM 3.4 cl -O2 * +.\tests\switch-12.ispc compfail x86 avx1.1-i32x8 Windows LLVM 3.4 cl -O2 * +.\tests\switch-8.ispc compfail x86 avx1.1-i32x8 Windows LLVM 3.4 cl -O2 * +.\tests\switch-9.ispc compfail x86 avx1.1-i32x8 Windows LLVM 3.4 cl -O2 * +.\tests\reduce-add-uint-1.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\reduce-min-uint64.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\switch-10.ispc compfail x86 avx1.1-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\switch-11.ispc compfail x86 avx1.1-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\switch-12.ispc compfail x86 avx1.1-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\switch-8.ispc compfail x86 avx1.1-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\switch-9.ispc compfail x86 avx1.1-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\reduce-min-uint.ispc runfail x86 avx1.1-i64x4 Windows LLVM 3.4 cl -O2 * +.\tests\exclusive-scan-add-9.ispc runfail x86 avx2-i32x8 Windows LLVM 3.4 cl -O2 * +.\tests\reduce-equal-10.ispc runfail x86 avx2-i32x8 Windows LLVM 3.4 cl -O2 * +.\tests\uint64-max.ispc runfail x86 avx2-i32x8 Windows LLVM 3.4 cl -O2 * +.\tests\uint64-min-1.ispc runfail x86 avx2-i32x8 Windows LLVM 3.4 cl -O2 * +.\tests\uint64-min.ispc runfail x86 avx2-i32x8 Windows LLVM 3.4 cl -O2 * +.\tests\switch-10.ispc compfail x86 avx2-i32x8 Windows LLVM 3.4 cl -O2 * +.\tests\switch-11.ispc compfail x86 avx2-i32x8 Windows LLVM 3.4 cl -O2 * +.\tests\switch-12.ispc compfail x86 avx2-i32x8 Windows LLVM 3.4 cl -O2 * +.\tests\switch-8.ispc compfail x86 avx2-i32x8 Windows LLVM 3.4 cl -O2 * +.\tests\switch-9.ispc compfail x86 avx2-i32x8 Windows LLVM 3.4 cl -O2 * +.\tests\exclusive-scan-add-9.ispc runfail x86 avx2-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\max-uint-1.ispc runfail x86 avx2-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\min-uint-1.ispc runfail x86 avx2-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\min-uint-2.ispc runfail x86 avx2-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\packed-load-1.ispc runfail x86 avx2-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\packed-store.ispc runfail x86 avx2-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\reduce-add-uint-1.ispc runfail x86 avx2-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\test-141.ispc runfail x86 avx2-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\uint64-max.ispc runfail x86 avx2-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\uint64-min-1.ispc runfail x86 avx2-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\uint64-min.ispc runfail x86 avx2-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\switch-10.ispc compfail x86 avx2-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\switch-11.ispc compfail x86 avx2-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\switch-12.ispc compfail x86 avx2-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\switch-8.ispc compfail x86 avx2-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\switch-9.ispc compfail x86 avx2-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\funcptr-null-4.ispc runfail x86-64 sse4-i8x16 Windows LLVM 3.4 cl -O2 * +.\tests\funcptr-null-5.ispc runfail x86-64 sse4-i8x16 
Windows LLVM 3.4 cl -O2 * +.\tests\funcptr-null-6.ispc runfail x86-64 sse4-i8x16 Windows LLVM 3.4 cl -O2 * +.\tests\switch-10.ispc compfail x86-64 avx1-i32x8 Windows LLVM 3.4 cl -O2 * +.\tests\switch-11.ispc compfail x86-64 avx1-i32x8 Windows LLVM 3.4 cl -O2 * +.\tests\switch-12.ispc compfail x86-64 avx1-i32x8 Windows LLVM 3.4 cl -O2 * +.\tests\switch-8.ispc compfail x86-64 avx1-i32x8 Windows LLVM 3.4 cl -O2 * +.\tests\switch-9.ispc compfail x86-64 avx1-i32x8 Windows LLVM 3.4 cl -O2 * +.\tests\switch-10.ispc compfail x86-64 avx1-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\switch-11.ispc compfail x86-64 avx1-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\switch-12.ispc compfail x86-64 avx1-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\switch-8.ispc compfail x86-64 avx1-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\switch-9.ispc compfail x86-64 avx1-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\switch-10.ispc compfail x86-64 avx1.1-i32x8 Windows LLVM 3.4 cl -O2 * +.\tests\switch-11.ispc compfail x86-64 avx1.1-i32x8 Windows LLVM 3.4 cl -O2 * +.\tests\switch-12.ispc compfail x86-64 avx1.1-i32x8 Windows LLVM 3.4 cl -O2 * +.\tests\switch-8.ispc compfail x86-64 avx1.1-i32x8 Windows LLVM 3.4 cl -O2 * +.\tests\switch-9.ispc compfail x86-64 avx1.1-i32x8 Windows LLVM 3.4 cl -O2 * +.\tests\switch-10.ispc compfail x86-64 avx1.1-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\switch-11.ispc compfail x86-64 avx1.1-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\switch-12.ispc compfail x86-64 avx1.1-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\switch-8.ispc compfail x86-64 avx1.1-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\switch-9.ispc compfail x86-64 avx1.1-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\switch-10.ispc compfail x86-64 avx2-i32x8 Windows LLVM 3.4 cl -O2 * +.\tests\switch-11.ispc compfail x86-64 avx2-i32x8 Windows LLVM 3.4 cl -O2 * +.\tests\switch-12.ispc compfail x86-64 avx2-i32x8 Windows LLVM 3.4 cl -O2 * +.\tests\switch-8.ispc compfail x86-64 avx2-i32x8 Windows LLVM 3.4 cl -O2 * +.\tests\switch-9.ispc compfail x86-64 avx2-i32x8 Windows LLVM 3.4 cl -O2 * +.\tests\test-141.ispc runfail x86-64 avx2-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\switch-10.ispc compfail x86-64 avx2-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\switch-11.ispc compfail x86-64 avx2-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\switch-12.ispc compfail x86-64 avx2-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\switch-8.ispc compfail x86-64 avx2-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\switch-9.ispc compfail x86-64 avx2-i32x16 Windows LLVM 3.4 cl -O2 * +.\tests\reduce-equal-10.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +./tests/half-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/ptr-15.ispc runfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/atomics-1.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/atomics-10.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/atomics-11.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/atomics-12.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/atomics-13.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/atomics-14.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/atomics-2.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/atomics-3.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/atomics-4.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/atomics-9.ispc compfail x86-64 generic-4 Linux LLVM 
3.3 clang++3.3 -O0 * +./tests/exclusive-scan-add-1.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/exclusive-scan-add-10.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/exclusive-scan-add-8.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/exclusive-scan-add-9.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/exclusive-scan-and-1.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/exclusive-scan-and-2.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/exclusive-scan-or-1.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/reduce-equal-1.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/reduce-equal-10.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/reduce-equal-12.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/reduce-equal-13.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/reduce-equal-2.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/reduce-equal-3.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/reduce-equal-4.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/reduce-equal-5.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/reduce-equal-6.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/reduce-equal-8.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/short-vec-8.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/half-1.ispc runfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/ptr-15.ispc runfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/ptr-19.ispc runfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/test-143.ispc runfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/atomics-1.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/atomics-10.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/atomics-11.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/atomics-12.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/atomics-13.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/atomics-14.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/atomics-2.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/atomics-3.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/atomics-4.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/atomics-9.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/exclusive-scan-add-1.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/exclusive-scan-add-10.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/exclusive-scan-add-8.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/exclusive-scan-add-9.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/exclusive-scan-and-1.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/exclusive-scan-and-2.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/exclusive-scan-or-1.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/reduce-equal-1.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 * 
+./tests/reduce-equal-10.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/reduce-equal-12.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/reduce-equal-13.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/reduce-equal-2.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/reduce-equal-3.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/reduce-equal-4.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/reduce-equal-5.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/reduce-equal-6.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/reduce-equal-8.ispc compfail x86-64 generic-16 Linux LLVM 3.3 clang++3.3 -O0 * +./tests/half-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/ptr-15.ispc runfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/atomics-1.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/atomics-10.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/atomics-11.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/atomics-12.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/atomics-13.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/atomics-14.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/atomics-2.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/atomics-3.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/atomics-4.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/atomics-9.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/exclusive-scan-add-1.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/exclusive-scan-add-10.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/exclusive-scan-add-8.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/exclusive-scan-add-9.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/exclusive-scan-and-1.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/exclusive-scan-and-2.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/exclusive-scan-or-1.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/reduce-equal-1.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/reduce-equal-10.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/reduce-equal-12.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/reduce-equal-13.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/reduce-equal-2.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/reduce-equal-3.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/reduce-equal-4.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/reduce-equal-5.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/reduce-equal-6.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/reduce-equal-8.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/short-vec-8.ispc compfail x86-64 generic-4 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/half-1.ispc runfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/ptr-15.ispc runfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 * 
+./tests/ptr-19.ispc runfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/test-143.ispc runfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/atomics-1.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/atomics-10.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/atomics-11.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/atomics-12.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/atomics-13.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/atomics-14.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/atomics-2.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/atomics-3.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/atomics-4.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/atomics-9.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/exclusive-scan-add-1.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/exclusive-scan-add-10.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/exclusive-scan-add-8.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/exclusive-scan-add-9.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/exclusive-scan-and-1.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/exclusive-scan-and-2.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/exclusive-scan-or-1.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/reduce-equal-1.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/reduce-equal-10.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/reduce-equal-12.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/reduce-equal-13.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/reduce-equal-2.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/reduce-equal-3.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/reduce-equal-4.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/reduce-equal-5.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/reduce-equal-6.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/reduce-equal-8.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O2 * +./tests/short-vec-8.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O2 * +./tests/half-1.ispc runfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/ptr-15.ispc runfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/atomics-1.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/atomics-10.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/atomics-11.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/atomics-12.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/atomics-13.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/atomics-14.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/atomics-2.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/atomics-3.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/atomics-4.ispc 
compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/atomics-9.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/exclusive-scan-add-1.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/exclusive-scan-add-10.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/exclusive-scan-add-8.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/exclusive-scan-add-9.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/exclusive-scan-and-1.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/exclusive-scan-and-2.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/exclusive-scan-or-1.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/reduce-equal-1.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/reduce-equal-10.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/reduce-equal-12.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/reduce-equal-13.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/reduce-equal-2.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/reduce-equal-3.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/reduce-equal-4.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/reduce-equal-5.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/reduce-equal-6.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/reduce-equal-8.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/short-vec-8.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/test-143.ispc runfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O2 * +./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O2 * +./tests/half-1.ispc runfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/ptr-15.ispc runfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/ptr-19.ispc runfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/test-143.ispc runfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/atomics-1.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/atomics-10.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/atomics-11.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/atomics-12.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/atomics-13.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/atomics-14.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/atomics-2.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/atomics-3.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/atomics-4.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/atomics-9.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/exclusive-scan-add-1.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/exclusive-scan-add-10.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/exclusive-scan-add-8.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/exclusive-scan-add-9.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/exclusive-scan-and-1.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * 
+./tests/exclusive-scan-and-2.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/exclusive-scan-or-1.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/reduce-equal-1.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/reduce-equal-10.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/reduce-equal-12.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/reduce-equal-13.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/reduce-equal-2.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/reduce-equal-3.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/reduce-equal-4.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/reduce-equal-5.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/reduce-equal-6.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/reduce-equal-8.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * diff --git a/ispc.cpp b/ispc.cpp index 6d4b063d..36d31580 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -102,6 +102,22 @@ static void __cpuidex(int info[4], int level, int count) { } #endif // !ISPC_IS_WINDOWS && !__ARM__ +#if !defined(__arm__) +static bool __os_has_avx_support() { +#if defined(ISPC_IS_WINDOWS) + // Check if the OS will save the YMM registers + unsigned long long xcrFeatureMask = _xgetbv(_XCR_XFEATURE_ENABLED_MASK); + return (xcrFeatureMask & 6) == 6; +#else // !defined(ISPC_IS_WINDOWS) + // Check xgetbv; this uses a .byte sequence instead of the instruction + // directly because older assemblers do not include support for xgetbv and + // there is no easy way to conditionally compile based on the assembler used. + int rEAX, rEDX; + __asm__ __volatile__ (".byte 0x0f, 0x01, 0xd0" : "=a" (rEAX), "=d" (rEDX) : "c" (0)); + return (rEAX & 6) == 6; +#endif // !defined(ISPC_IS_WINDOWS) +} +#endif // !__arm__ static const char * lGetSystemISA() { @@ -111,7 +127,8 @@ lGetSystemISA() { int info[4]; __cpuid(info, 1); - if ((info[2] & (1 << 28)) != 0) { // AVX + if ((info[2] & (1 << 28)) != 0 && + __os_has_avx_support()) { // AVX // AVX1 for sure.... // Ivy Bridge? 
if ((info[2] & (1 << 29)) != 0 && // F16C @@ -126,7 +143,7 @@ lGetSystemISA() { return "avx1.1-i32x8"; } // Regular AVX - return "avx-i32x8"; + return "avx1-i32x8"; } else if ((info[2] & (1 << 19)) != 0) return "sse4-i32x4"; @@ -151,6 +168,9 @@ static const char *supportedCPUs[] = { #if !defined(LLVM_3_1) , "core-avx-i", "core-avx2" #endif // LLVM 3.2+ +#if !defined(LLVM_3_1) && !defined(LLVM_3_2) && !defined(LLVM_3_3) + , "slm" +#endif // LLVM 3.4+ }; Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : @@ -171,6 +191,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : m_tf_attributes(NULL), #endif m_nativeVectorWidth(-1), + m_dataTypeWidth(-1), m_vectorWidth(-1), m_generatePIC(pic), m_maskingIsFree(false), @@ -196,9 +217,10 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : isa = "avx1.1-i32x8"; else if (!strcmp(cpu, "sandybridge") || !strcmp(cpu, "corei7-avx")) - isa = "avx-i32x8"; + isa = "avx1-i32x8"; else if (!strcmp(cpu, "corei7") || - !strcmp(cpu, "penryn")) + !strcmp(cpu, "penryn") || + !strcmp(cpu, "slm")) isa = "sse4-i32x4"; else isa = "sse2-i32x4"; @@ -287,9 +309,10 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : !strcasecmp(isa, "sse2-i32x4")) { this->m_isa = Target::SSE2; this->m_nativeVectorWidth = 4; + this->m_dataTypeWidth = 32; this->m_vectorWidth = 4; this->m_attributes = "+sse,+sse2,-sse3,-sse4a,-ssse3,-popcnt" -#if defined(LLVM_3_4) +#if defined(LLVM_3_4) || defined(LLVM_3_5) ",-sse4.1,-sse4.2" #else ",-sse41,-sse42" @@ -302,9 +325,10 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : !strcasecmp(isa, "sse2-i32x8")) { this->m_isa = Target::SSE2; this->m_nativeVectorWidth = 4; + this->m_dataTypeWidth = 32; this->m_vectorWidth = 8; this->m_attributes = "+sse,+sse2,-sse3,-sse4a,-ssse3,-popcnt" -#if defined(LLVM_3_4) +#if defined(LLVM_3_4) || defined(LLVM_3_5) ",-sse4.1,-sse4.2" #else ",-sse41,-sse42" @@ -317,11 +341,12 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : !strcasecmp(isa, "sse4-i32x4")) { this->m_isa = Target::SSE4; this->m_nativeVectorWidth = 4; + this->m_dataTypeWidth = 32; this->m_vectorWidth = 4; // TODO: why not sse42 and popcnt? 
this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov" -#if defined(LLVM_3_4) - ",+sse4.1,-sse4.2" +#if defined(LLVM_3_4) || defined(LLVM_3_5) + ",+sse4.1,-sse4.2" #else ",+sse41,-sse42" #endif @@ -334,10 +359,11 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : !strcasecmp(isa, "sse4-i32x8")) { this->m_isa = Target::SSE4; this->m_nativeVectorWidth = 4; + this->m_dataTypeWidth = 32; this->m_vectorWidth = 8; this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov" -#if defined(LLVM_3_4) - ",+sse4.1,-sse4.2" +#if defined(LLVM_3_4) || defined(LLVM_3_5) + ",+sse4.1,-sse4.2" #else ",+sse41,-sse42" #endif @@ -348,10 +374,11 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : else if (!strcasecmp(isa, "sse4-i8x16")) { this->m_isa = Target::SSE4; this->m_nativeVectorWidth = 16; + this->m_dataTypeWidth = 8; this->m_vectorWidth = 16; this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov" -#if defined(LLVM_3_4) - ",+sse4.1,-sse4.2" +#if defined(LLVM_3_4) || defined(LLVM_3_5) + ",+sse4.1,-sse4.2" #else ",+sse41,-sse42" #endif @@ -362,10 +389,11 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : else if (!strcasecmp(isa, "sse4-i16x8")) { this->m_isa = Target::SSE4; this->m_nativeVectorWidth = 8; + this->m_dataTypeWidth = 16; this->m_vectorWidth = 8; this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov" -#if defined(LLVM_3_4) - ",+sse4.1,-sse4.2" +#if defined(LLVM_3_4) || defined(LLVM_3_5) + ",+sse4.1,-sse4.2" #else ",+sse41,-sse42" #endif @@ -436,21 +464,42 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_maskingIsFree = false; this->m_maskBitCount = 32; } + else if (!strcasecmp(isa, "avx1-i32x4")) { + this->m_isa = Target::AVX; + this->m_nativeVectorWidth = 8; + this->m_dataTypeWidth = 32; + this->m_vectorWidth = 4; + this->m_attributes = "+avx,+popcnt,+cmov"; + this->m_maskingIsFree = false; + this->m_maskBitCount = 32; + } else if (!strcasecmp(isa, "avx") || !strcasecmp(isa, "avx1") || !strcasecmp(isa, "avx1-i32x8")) { this->m_isa = Target::AVX; this->m_nativeVectorWidth = 8; + this->m_dataTypeWidth = 32; this->m_vectorWidth = 8; this->m_attributes = "+avx,+popcnt,+cmov"; this->m_maskingIsFree = false; this->m_maskBitCount = 32; } + else if (!strcasecmp(isa, "avx-i64x4") || + !strcasecmp(isa, "avx1-i64x4")) { + this->m_isa = Target::AVX; + this->m_nativeVectorWidth = 8; /* native vector width in terms of floats */ + this->m_dataTypeWidth = 64; + this->m_vectorWidth = 4; + this->m_attributes = "+avx,+popcnt,+cmov"; + this->m_maskingIsFree = false; + this->m_maskBitCount = 64; + } else if (!strcasecmp(isa, "avx-x2") || !strcasecmp(isa, "avx1-x2") || !strcasecmp(isa, "avx1-i32x16")) { this->m_isa = Target::AVX; this->m_nativeVectorWidth = 8; + this->m_dataTypeWidth = 32; this->m_vectorWidth = 16; this->m_attributes = "+avx,+popcnt,+cmov"; this->m_maskingIsFree = false; @@ -460,9 +509,10 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : !strcasecmp(isa, "avx1.1-i32x8")) { this->m_isa = Target::AVX11; this->m_nativeVectorWidth = 8; + this->m_dataTypeWidth = 32; this->m_vectorWidth = 8; this->m_attributes = "+avx,+popcnt,+cmov,+f16c" -#if defined(LLVM_3_4) +#if defined(LLVM_3_4) || defined(LLVM_3_5) ",+rdrnd" #else ",+rdrand" @@ -480,29 +530,51 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : !strcasecmp(isa, "avx1.1-i32x16")) { this->m_isa = Target::AVX11; 
this->m_nativeVectorWidth = 8; + this->m_dataTypeWidth = 32; this->m_vectorWidth = 16; this->m_attributes = "+avx,+popcnt,+cmov,+f16c" -#if defined(LLVM_3_4) +#if defined(LLVM_3_4) || defined(LLVM_3_5) ",+rdrnd" #else ",+rdrand" #endif - ; + ; this->m_maskingIsFree = false; this->m_maskBitCount = 32; this->m_hasHalf = true; #if !defined(LLVM_3_1) // LLVM 3.2+ only this->m_hasRand = true; +#endif + } + else if (!strcasecmp(isa, "avx1.1-i64x4")) { + this->m_isa = Target::AVX11; + this->m_nativeVectorWidth = 8; /* native vector width in terms of floats */ + this->m_dataTypeWidth = 64; + this->m_vectorWidth = 4; + this->m_attributes = "+avx,+popcnt,+cmov,+f16c" +#if defined(LLVM_3_4) || defined(LLVM_3_5) + ",+rdrnd" +#else + ",+rdrand" +#endif + ; + this->m_maskingIsFree = false; + this->m_maskBitCount = 64; + this->m_hasHalf = true; +#if !defined(LLVM_3_1) + // LLVM 3.2+ only + this->m_hasRand = true; #endif } else if (!strcasecmp(isa, "avx2") || !strcasecmp(isa, "avx2-i32x8")) { this->m_isa = Target::AVX2; this->m_nativeVectorWidth = 8; + this->m_dataTypeWidth = 32; this->m_vectorWidth = 8; this->m_attributes = "+avx2,+popcnt,+cmov,+f16c" -#if defined(LLVM_3_4) +#if defined(LLVM_3_4) || defined(LLVM_3_5) ",+rdrnd" #else ",+rdrand" @@ -524,9 +596,10 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : !strcasecmp(isa, "avx2-i32x16")) { this->m_isa = Target::AVX2; this->m_nativeVectorWidth = 16; + this->m_dataTypeWidth = 32; this->m_vectorWidth = 16; this->m_attributes = "+avx2,+popcnt,+cmov,+f16c" -#if defined(LLVM_3_4) +#if defined(LLVM_3_4) || defined(LLVM_3_5) ",+rdrnd" #else ",+rdrand" @@ -542,12 +615,37 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : // LLVM 3.2+ only this->m_hasRand = true; this->m_hasGather = true; +#endif + } + else if (!strcasecmp(isa, "avx2-i64x4")) { + this->m_isa = Target::AVX2; + this->m_nativeVectorWidth = 8; /* native vector width in terms of floats */ + this->m_dataTypeWidth = 64; + this->m_vectorWidth = 4; + this->m_attributes = "+avx2,+popcnt,+cmov,+f16c" +#if defined(LLVM_3_4) || defined(LLVM_3_5) + ",+rdrnd" +#else + ",+rdrand" +#endif +#ifndef LLVM_3_1 + ",+fma" +#endif // !LLVM_3_1 + ; + this->m_maskingIsFree = false; + this->m_maskBitCount = 64; + this->m_hasHalf = true; +#if !defined(LLVM_3_1) + // LLVM 3.2+ only + this->m_hasRand = true; + this->m_hasGather = true; #endif } #ifdef ISPC_ARM_ENABLED else if (!strcasecmp(isa, "neon-i8x16")) { this->m_isa = Target::NEON8; this->m_nativeVectorWidth = 16; + this->m_dataTypeWidth = 8; this->m_vectorWidth = 16; this->m_attributes = "+neon,+fp16"; this->m_hasHalf = true; // ?? @@ -557,6 +655,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : else if (!strcasecmp(isa, "neon-i16x8")) { this->m_isa = Target::NEON16; this->m_nativeVectorWidth = 8; + this->m_dataTypeWidth = 16; this->m_vectorWidth = 8; this->m_attributes = "+neon,+fp16"; this->m_hasHalf = true; // ?? @@ -567,6 +666,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : !strcasecmp(isa, "neon-i32x4")) { this->m_isa = Target::NEON32; this->m_nativeVectorWidth = 4; + this->m_dataTypeWidth = 32; this->m_vectorWidth = 4; this->m_attributes = "+neon,+fp16"; this->m_hasHalf = true; // ?? @@ -651,6 +751,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : // Initialize target-specific "target-feature" attribute. 
if (!m_attributes.empty()) { llvm::AttrBuilder attrBuilder; + attrBuilder.addAttribute("target-cpu", this->m_cpu); attrBuilder.addAttribute("target-features", this->m_attributes); this->m_tf_attributes = new llvm::AttributeSet( llvm::AttributeSet::get( @@ -696,15 +797,16 @@ const char * Target::SupportedTargets() { return #ifdef ISPC_ARM_ENABLED - "neon-i8x16, neon-16x8, neon-32x4, " + "neon-i8x16, neon-i16x8, neon-i32x4, " #endif "sse2-i32x4, sse2-i32x8, " "sse4-i32x4, sse4-i32x8, sse4-i16x8, sse4-i8x16, " - "avx1-i32x8, avx1-i32x16, " - "avx1.1-i32x8, avx1.1-i32x16, " - "avx2-i32x8, avx2-i32x16, " + "avx1-i32x4, " + "avx1-i32x8, avx1-i32x16, avx1-i64x4, " + "avx1.1-i32x8, avx1.1-i32x16, avx1.1-i64x4 " + "avx2-i32x8, avx2-i32x16, avx2-i64x4, " "generic-x1, generic-x4, generic-x8, generic-x16, " - "generic-x32, generic-x64"; + "generic-x32, generic-x64"; } @@ -737,6 +839,9 @@ Target::GetTripleString() const { return triple.str(); } +// This function returns string representation of ISA for the purpose of +// mangling. And may return any unique string, preferably short, like +// sse4, avx and etc. const char * Target::ISAToString(ISA isa) { switch (isa) { @@ -772,6 +877,45 @@ Target::GetISAString() const { } +// This function returns string representation of default target corresponding +// to ISA. I.e. for SSE4 it's sse4-i32x4, for AVX11 it's avx1.1-i32x8. This +// string may be used to initialize Target. +const char * +Target::ISAToTargetString(ISA isa) { + switch (isa) { +#ifdef ISPC_ARM_ENABLED + case Target::NEON8: + return "neon-8"; + case Target::NEON16: + return "neon-16"; + case Target::NEON32: + return "neon-32"; +#endif + case Target::SSE2: + return "sse2-i32x4"; + case Target::SSE4: + return "sse4-i32x4"; + case Target::AVX: + return "avx1-i32x8"; + case Target::AVX11: + return "avx1.1-i32x8"; + case Target::AVX2: + return "avx2-i32x8"; + case Target::GENERIC: + return "generic-4"; + default: + FATAL("Unhandled target in ISAToTargetString()"); + } + return ""; +} + + +const char * +Target::GetISATargetString() const { + return ISAToString(m_isa); +} + + static bool lGenericTypeLayoutIndeterminate(llvm::Type *type) { if (type->isPrimitiveType() || type->isIntegerTy()) diff --git a/ispc.h b/ispc.h index 4804832f..b319d656 100644 --- a/ispc.h +++ b/ispc.h @@ -38,10 +38,10 @@ #ifndef ISPC_H #define ISPC_H -#define ISPC_VERSION "1.4.5dev" +#define ISPC_VERSION "1.5.1dev" -#if !defined(LLVM_3_1) && !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) -#error "Only LLVM 3.1, 3.2, 3.3 and the 3.4 development branch are supported" +#if !defined(LLVM_3_1) && !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) && !defined(LLVM_3_5) +#error "Only LLVM 3.1, 3.2, 3.3, 3.4 and the 3.5 development branch are supported" #endif #if defined(_WIN32) || defined(_WIN64) @@ -214,9 +214,16 @@ public: /** Convert ISA enum to string */ static const char *ISAToString(Target::ISA isa); - /** Returns a string like "avx" encoding the target. */ + /** Returns a string like "avx" encoding the target. Good for mangling. */ const char *GetISAString() const; + /** Convert ISA enum to string */ + static const char *ISAToTargetString(Target::ISA isa); + + /** Returns a string like "avx1.1-i32x8" encoding the target. + This may be used for Target initialization. 
*/ + const char *GetISATargetString() const; + /** Returns the size of the given type */ llvm::Value *SizeOf(llvm::Type *type, llvm::BasicBlock *insertAtEnd); @@ -253,6 +260,8 @@ public: int getNativeVectorWidth() const {return m_nativeVectorWidth;} + int getDataTypeWidth() const {return m_dataTypeWidth;} + int getVectorWidth() const {return m_vectorWidth;} bool getGeneratePIC() const {return m_generatePIC;} @@ -319,10 +328,14 @@ private: #endif /** Native vector width of the vector instruction set. Note that this - value is directly derived from the ISA Being used (e.g. it's 4 for + value is directly derived from the ISA being used (e.g. it's 4 for SSE, 8 for AVX, etc.) */ int m_nativeVectorWidth; + /** Data type with in bits. Typically it's 32, but could be 8, 16 or 64. + For generic it's -1, which means undefined. */ + int m_dataTypeWidth; + /** Actual vector width currently being compiled to. This may be an integer multiple of the native vector width, for example if we're "doubling up" and compiling 8-wide on a 4-wide SSE system. */ diff --git a/ispc.vcxproj b/ispc.vcxproj index b4a8b764..8aee2988 100755 --- a/ispc.vcxproj +++ b/ispc.vcxproj @@ -22,14 +22,20 @@ + + + + + + @@ -51,16 +57,17 @@ - - - - + + + + - - - - + + + + + 4146;4800;4996;4355;4624;4005;4003;4018 @@ -103,13 +110,14 @@ Document - %LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC_MASK_BITS=1 -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py mask1 > $(Configuration)/gen-stdlib-mask1.cpp; -%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC_MASK_BITS=8 -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py mask8 > $(Configuration)/gen-stdlib-mask8.cpp; -%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC_MASK_BITS=16 -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py mask16 > $(Configuration)/gen-stdlib-mask16.cpp; -%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC_MASK_BITS=32 -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py mask32 > $(Configuration)/gen-stdlib-mask32.cpp; + %LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC_MASK_BITS=1 -DISPC=1 -DPI=3.14159265358979 | python stdlib2cpp.py mask1 > $(Configuration)/gen-stdlib-mask1.cpp; +%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC_MASK_BITS=8 -DISPC=1 -DPI=3.14159265358979 | python stdlib2cpp.py mask8 > $(Configuration)/gen-stdlib-mask8.cpp; +%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC_MASK_BITS=16 -DISPC=1 -DPI=3.14159265358979 | python stdlib2cpp.py mask16 > $(Configuration)/gen-stdlib-mask16.cpp; +%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC_MASK_BITS=32 -DISPC=1 -DPI=3.14159265358979 | python stdlib2cpp.py mask32 > $(Configuration)/gen-stdlib-mask32.cpp; +%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC_MASK_BITS=64 -DISPC=1 -DPI=3.14159265358979 | python stdlib2cpp.py mask64 > $(Configuration)/gen-stdlib-mask64.cpp; - $(Configuration)/gen-stdlib-mask1.cpp;$(Configuration)/gen-stdlib-mask8.cpp;$(Configuration)/gen-stdlib-mask16.cpp;$(Configuration)/gen-stdlib-mask32.cpp - Building gen-stdlib-{mask1,8,16,32}.cpp + $(Configuration)/gen-stdlib-mask1.cpp;$(Configuration)/gen-stdlib-mask8.cpp;$(Configuration)/gen-stdlib-mask16.cpp;$(Configuration)/gen-stdlib-mask32.cpp;$(Configuration)/gen-stdlib-mask64.cpp + Building gen-stdlib-{mask1,8,16,32,64}.cpp @@ -117,336 +125,222 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins/dispatch.ll | python bitcode2cpp.py dispatch.ll > $(Configuration)/gen-bitcode-dispatch.cpp 
$(Configuration)/gen-bitcode-dispatch.cpp - builtins\util.m4 + builtins\util.m4;builtins\svml.m4 Building gen-bitcode-dispatch.cpp Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse4.ll | python bitcode2cpp.py builtins\target-sse4.ll 32bit > $(Configuration)/gen-bitcode-sse4-32bit.cpp - $(Configuration)/gen-bitcode-sse4-32bit.cpp - builtins\util.m4;builtins\target-sse4-common.ll - Building gen-bitcode-sse4-32bit.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse4.ll | python bitcode2cpp.py builtins\target-sse4.ll 32bit > $(Configuration)/gen-bitcode-sse4-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse4.ll | python bitcode2cpp.py builtins\target-sse4.ll 64bit > $(Configuration)/gen-bitcode-sse4-64bit.cpp + $(Configuration)/gen-bitcode-sse4-32bit.cpp; $(Configuration)/gen-bitcode-sse4-64bit.cpp + builtins\util.m4;builtins\svml.m4;builtins\target-sse4-common.ll + Building gen-bitcode-sse4-32bit.cpp and gen-bitcode-sse4-64bit.cpp - + Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse4.ll | python bitcode2cpp.py builtins\target-sse4.ll 64bit > $(Configuration)/gen-bitcode-sse4-64bit.cpp - $(Configuration)/gen-bitcode-sse4-64bit.cpp - builtins\util.m4;builtins\target-sse4-common.ll - Building gen-bitcode-sse4-64bit.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse4-8.ll | python bitcode2cpp.py builtins\target-sse4-8.ll 32bit > $(Configuration)/gen-bitcode-sse4-8-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse4-8.ll | python bitcode2cpp.py builtins\target-sse4-8.ll 64bit > $(Configuration)/gen-bitcode-sse4-8-64bit.cpp + $(Configuration)/gen-bitcode-sse4-8-32bit.cpp; $(Configuration)/gen-bitcode-sse4-8-64bit.cpp + builtins\util.m4;builtins\svml.m4;builtins\target-sse4-common.ll + Building gen-bitcode-sse4-8-32bit.cpp and gen-bitcode-sse4-8-64bit.cpp - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse4-8.ll | python bitcode2cpp.py builtins\target-sse4-8.ll 32bit > $(Configuration)/gen-bitcode-sse4-8-32bit.cpp - $(Configuration)/gen-bitcode-sse4-8-32bit.cpp - builtins\util.m4;builtins\target-sse4-common.ll - Building gen-bitcode-sse4-8-32bit.cpp - - - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse4-8.ll | python bitcode2cpp.py builtins\target-sse4-8.ll 64bit > $(Configuration)/gen-bitcode-sse4-8-64bit.cpp - $(Configuration)/gen-bitcode-sse4-8-64bit.cpp - builtins\util.m4;builtins\target-sse4-common.ll - Building gen-bitcode-sse4-8-64bit.cpp - - - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse4-16.ll | python bitcode2cpp.py builtins\target-sse4-16.ll 32bit > $(Configuration)/gen-bitcode-sse4-16-32bit.cpp - $(Configuration)/gen-bitcode-sse4-16-32bit.cpp - builtins\util.m4;builtins\target-sse4-common.ll - Building gen-bitcode-sse4-16-32bit.cpp - - - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse4-16.ll | python bitcode2cpp.py builtins\target-sse4-16.ll 64bit > $(Configuration)/gen-bitcode-sse4-16-64bit.cpp - $(Configuration)/gen-bitcode-sse4-16-64bit.cpp - builtins\util.m4;builtins\target-sse4-common.ll - Building 
gen-bitcode-sse4-16-64bit.cpp - - - + Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse4-x2.ll | python bitcode2cpp.py builtins\target-sse4-x2.ll 32bit > $(Configuration)/gen-bitcode-sse4-x2-32bit.cpp - $(Configuration)/gen-bitcode-sse4-x2-32bit.cpp - builtins\util.m4;builtins\target-sse4-common.ll - Building gen-bitcode-sse4-x2-32bit.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse4-16.ll | python bitcode2cpp.py builtins\target-sse4-16.ll 32bit > $(Configuration)/gen-bitcode-sse4-16-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse4-16.ll | python bitcode2cpp.py builtins\target-sse4-16.ll 64bit > $(Configuration)/gen-bitcode-sse4-16-64bit.cpp + $(Configuration)/gen-bitcode-sse4-16-32bit.cpp; $(Configuration)/gen-bitcode-sse4-16-64bit.cpp + builtins\util.m4;builtins\svml.m4;builtins\target-sse4-common.ll + Building gen-bitcode-sse4-16-32bit.cpp and gen-bitcode-sse4-16-64bit.cpp Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse4-x2.ll | python bitcode2cpp.py builtins\target-sse4-x2.ll 64bit > $(Configuration)/gen-bitcode-sse4-x2-64bit.cpp - $(Configuration)/gen-bitcode-sse4-x2-64bit.cpp - builtins\util.m4;builtins\target-sse4-common.ll - Building gen-bitcode-sse4-x2-64bit.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse4-x2.ll | python bitcode2cpp.py builtins\target-sse4-x2.ll 32bit > $(Configuration)/gen-bitcode-sse4-x2-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse4-x2.ll | python bitcode2cpp.py builtins\target-sse4-x2.ll 64bit > $(Configuration)/gen-bitcode-sse4-x2-64bit.cpp + $(Configuration)/gen-bitcode-sse4-x2-32bit.cpp; $(Configuration)/gen-bitcode-sse4-x2-64bit.cpp + builtins\util.m4;builtins\svml.m4;builtins\target-sse4-common.ll + Building gen-bitcode-sse4-x2-32bit.cpp and gen-bitcode-sse4-x2-64bit.cpp Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse2.ll | python bitcode2cpp.py builtins\target-sse2.ll 32bit > $(Configuration)/gen-bitcode-sse2-32bit.cpp - $(Configuration)/gen-bitcode-sse2-32bit.cpp - builtins\util.m4;builtins\target-sse2-common.ll - Building gen-bitcode-sse2-32bit.cpp - - - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse2.ll | python bitcode2cpp.py builtins\target-sse2.ll 64bit > $(Configuration)/gen-bitcode-sse2-64bit.cpp - $(Configuration)/gen-bitcode-sse2-64bit.cpp - builtins\util.m4;builtins\target-sse2-common.ll - Building gen-bitcode-sse2-64bit.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse2.ll | python bitcode2cpp.py builtins\target-sse2.ll 32bit > $(Configuration)/gen-bitcode-sse2-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse2.ll | python bitcode2cpp.py builtins\target-sse2.ll 64bit > $(Configuration)/gen-bitcode-sse2-64bit.cpp + $(Configuration)/gen-bitcode-sse2-32bit.cpp; $(Configuration)/gen-bitcode-sse2-64bit.cpp + builtins\util.m4;builtins\svml.m4;builtins\target-sse2-common.ll + Building gen-bitcode-sse2-32bit.cpp and gen-bitcode-sse2-64bit.cpp Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse2-x2.ll | python 
bitcode2cpp.py builtins\target-sse2-x2.ll 32bit > $(Configuration)/gen-bitcode-sse2-x2-32bit.cpp - $(Configuration)/gen-bitcode-sse2-x2-32bit.cpp - builtins\util.m4;builtins\target-sse2-common.ll - Building gen-bitcode-sse2-x2-32bit.cpp - - - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse2-x2.ll | python bitcode2cpp.py builtins\target-sse2-x2.ll 64bit > $(Configuration)/gen-bitcode-sse2-x2-64bit.cpp - $(Configuration)/gen-bitcode-sse2-x2-64bit.cpp - builtins\util.m4;builtins\target-sse2-common.ll - Building gen-bitcode-sse2-x2-64bit.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse2-x2.ll | python bitcode2cpp.py builtins\target-sse2-x2.ll 32bit > $(Configuration)/gen-bitcode-sse2-x2-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse2-x2.ll | python bitcode2cpp.py builtins\target-sse2-x2.ll 64bit > $(Configuration)/gen-bitcode-sse2-x2-64bit.cpp + $(Configuration)/gen-bitcode-sse2-x2-32bit.cpp; $(Configuration)/gen-bitcode-sse2-x2-64bit.cpp + builtins\util.m4;builtins\svml.m4;builtins\target-sse2-common.ll + Building gen-bitcode-sse2-x2-32bit.cpp and gen-bitcode-sse2-x2-64bit.cpp Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx1.ll | python bitcode2cpp.py builtins\target-avx1.ll 32bit > $(Configuration)/gen-bitcode-avx1-32bit.cpp - $(Configuration)/gen-bitcode-avx1-32bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll - Building gen-bitcode-avx1-32bit.cpp - - - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx1.ll | python bitcode2cpp.py builtins\target-avx1.ll 64bit > $(Configuration)/gen-bitcode-avx1-64bit.cpp - $(Configuration)/gen-bitcode-avx1-64bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll - Building gen-bitcode-avx1-64bit.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx1.ll | python bitcode2cpp.py builtins\target-avx1.ll 32bit > $(Configuration)/gen-bitcode-avx1-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx1.ll | python bitcode2cpp.py builtins\target-avx1.ll 64bit > $(Configuration)/gen-bitcode-avx1-64bit.cpp + $(Configuration)/gen-bitcode-avx1-32bit.cpp; $(Configuration)/gen-bitcode-avx1-64bit.cpp + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll + Building gen-bitcode-avx1-32bit.cpp and gen-bitcode-avx1-64bit.cpp Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx1-x2.ll | python bitcode2cpp.py builtins\target-avx1-x2.ll 32bit > $(Configuration)/gen-bitcode-avx1-x2-32bit.cpp - $(Configuration)/gen-bitcode-avx1-x2-32bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll - Building gen-bitcode-avx1-x2-32bit.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx1-x2.ll | python bitcode2cpp.py builtins\target-avx1-x2.ll 32bit > $(Configuration)/gen-bitcode-avx1-x2-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx1-x2.ll | python bitcode2cpp.py builtins\target-avx1-x2.ll 64bit > $(Configuration)/gen-bitcode-avx1-x2-64bit.cpp + $(Configuration)/gen-bitcode-avx1-x2-32bit.cpp; 
$(Configuration)/gen-bitcode-avx1-x2-64bit.cpp + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll + Building gen-bitcode-avx1-x2-32bit.cpp and gen-bitcode-avx1-x2-64bit.cpp - + Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx1-x2.ll | python bitcode2cpp.py builtins\target-avx1-x2.ll 64bit > $(Configuration)/gen-bitcode-avx1-x2-64bit.cpp - $(Configuration)/gen-bitcode-avx1-x2-64bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll - Building gen-bitcode-avx1-x2-64bit.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx1-i64x4.ll | python bitcode2cpp.py builtins\target-avx1-i64x4.ll 32bit > $(Configuration)/gen-bitcode-avx1-i64x4-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx1-i64x4.ll | python bitcode2cpp.py builtins\target-avx1-i64x4.ll 64bit > $(Configuration)/gen-bitcode-avx1-i64x4-64bit.cpp + $(Configuration)/gen-bitcode-avx1-i64x4-32bit.cpp; $(Configuration)/gen-bitcode-avx1-i64x4-64bit.cpp + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll;builtins\target-avx1-i64x4base.ll + Building gen-bitcode-avx1-i64x4-32bit.cpp and gen-bitcode-avx1-i64x4-64bit.cpp Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx11.ll | python bitcode2cpp.py builtins\target-avx11.ll 32bit > $(Configuration)/gen-bitcode-avx11-32bit.cpp - $(Configuration)/gen-bitcode-avx11-32bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll - Building gen-bitcode-avx11-32bit.cpp - - - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx11.ll | python bitcode2cpp.py builtins\target-avx11.ll 64bit > $(Configuration)/gen-bitcode-avx11-64bit.cpp - $(Configuration)/gen-bitcode-avx11-64bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll - Building gen-bitcode-avx11-64bit.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx11.ll | python bitcode2cpp.py builtins\target-avx11.ll 32bit > $(Configuration)/gen-bitcode-avx11-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx11.ll | python bitcode2cpp.py builtins\target-avx11.ll 64bit > $(Configuration)/gen-bitcode-avx11-64bit.cpp + $(Configuration)/gen-bitcode-avx11-32bit.cpp; $(Configuration)/gen-bitcode-avx11-64bit.cpp + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll + Building gen-bitcode-avx11-32bit.cpp and gen-bitcode-avx11-64bit.cpp Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx11-x2.ll | python bitcode2cpp.py builtins\target-avx11-x2.ll 32bit > $(Configuration)/gen-bitcode-avx11-x2-32bit.cpp - $(Configuration)/gen-bitcode-avx11-x2-32bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll - Building gen-bitcode-avx11-x2-32bit.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx11-x2.ll | python bitcode2cpp.py builtins\target-avx11-x2.ll 32bit > $(Configuration)/gen-bitcode-avx11-x2-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx11-x2.ll | python bitcode2cpp.py builtins\target-avx11-x2.ll 64bit > 
$(Configuration)/gen-bitcode-avx11-x2-64bit.cpp + $(Configuration)/gen-bitcode-avx11-x2-32bit.cpp; $(Configuration)/gen-bitcode-avx11-x2-64bit.cpp + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll + Building gen-bitcode-avx11-x2-32bit.cpp and gen-bitcode-avx11-x2-64bit.cpp - + Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx11-x2.ll | python bitcode2cpp.py builtins\target-avx11-x2.ll 64bit > $(Configuration)/gen-bitcode-avx11-x2-64bit.cpp - $(Configuration)/gen-bitcode-avx11-x2-64bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll - Building gen-bitcode-avx11-x2-64bit.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx11-i64x4.ll | python bitcode2cpp.py builtins\target-avx11-i64x4.ll 32bit > $(Configuration)/gen-bitcode-avx11-i64x4-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx11-i64x4.ll | python bitcode2cpp.py builtins\target-avx11-i64x4.ll 64bit > $(Configuration)/gen-bitcode-avx11-i64x4-64bit.cpp + $(Configuration)/gen-bitcode-avx11-i64x4-32bit.cpp; $(Configuration)/gen-bitcode-avx11-i64x4-64bit.cpp + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll;builtins\target-avx1-i64x4base.ll + Building gen-bitcode-avx11-i64x4-32bit.cpp and gen-bitcode-avx11-i64x4-64bit.cpp Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx2.ll | python bitcode2cpp.py builtins\target-avx2.ll 32bit > $(Configuration)/gen-bitcode-avx2-32bit.cpp - $(Configuration)/gen-bitcode-avx2-32bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll - Building gen-bitcode-avx2-32bit.cpp - - - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx2.ll | python bitcode2cpp.py builtins\target-avx2.ll 64bit > $(Configuration)/gen-bitcode-avx2-64bit.cpp - $(Configuration)/gen-bitcode-avx2-64bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll - Building gen-bitcode-avx2-64bit.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx2.ll | python bitcode2cpp.py builtins\target-avx2.ll 32bit > $(Configuration)/gen-bitcode-avx2-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx2.ll | python bitcode2cpp.py builtins\target-avx2.ll 64bit > $(Configuration)/gen-bitcode-avx2-64bit.cpp + $(Configuration)/gen-bitcode-avx2-32bit.cpp; $(Configuration)/gen-bitcode-avx2-64bit.cpp + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll + Building gen-bitcode-avx2-32bit.cpp and gen-bitcode-avx2-64bit.cpp Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx2-x2.ll | python bitcode2cpp.py builtins\target-avx2-x2.ll 32bit > $(Configuration)/gen-bitcode-avx2-x2-32bit.cpp - $(Configuration)/gen-bitcode-avx2-x2-32bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll - Building gen-bitcode-avx2-x2-32bit.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx2-x2.ll | python bitcode2cpp.py builtins\target-avx2-x2.ll 32bit > $(Configuration)/gen-bitcode-avx2-x2-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 
builtins/target-avx2-x2.ll | python bitcode2cpp.py builtins\target-avx2-x2.ll 64bit > $(Configuration)/gen-bitcode-avx2-x2-64bit.cpp + $(Configuration)/gen-bitcode-avx2-x2-32bit.cpp; $(Configuration)/gen-bitcode-avx2-x2-64bit.cpp + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll + Building gen-bitcode-avx2-x2-32bit.cpp and gen-bitcode-avx2-x2-64bit.cpp - + Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx2-x2.ll | python bitcode2cpp.py builtins\target-avx2-x2.ll 64bit > $(Configuration)/gen-bitcode-avx2-x2-64bit.cpp - $(Configuration)/gen-bitcode-avx2-x2-64bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll - Building gen-bitcode-avx2-x2-64bit.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx2-i64x4.ll | python bitcode2cpp.py builtins\target-avx2-i64x4.ll 32bit > $(Configuration)/gen-bitcode-avx2-i64x4-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx2-i64x4.ll | python bitcode2cpp.py builtins\target-avx2-i64x4.ll 64bit > $(Configuration)/gen-bitcode-avx2-i64x4-64bit.cpp + $(Configuration)/gen-bitcode-avx2-i64x4-32bit.cpp; $(Configuration)/gen-bitcode-avx2-i64x4-64bit.cpp + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll;builtins\target-avx1-i64x4base.ll + Building gen-bitcode-avx2-i64x4-32bit.cpp and gen-bitcode-avx2-i64x4-64bit.cpp Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-1.ll | python bitcode2cpp.py builtins\target-generic-1.ll 32bit > $(Configuration)/gen-bitcode-generic-1-32bit.cpp - $(Configuration)/gen-bitcode-generic-1-32bit.cpp - builtins\util.m4;builtins\target-generic-common.ll - Building gen-bitcode-generic-1-32bit.cpp - - - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-1.ll | python bitcode2cpp.py builtins\target-generic-1.ll 64bit > $(Configuration)/gen-bitcode-generic-1-64bit.cpp - $(Configuration)/gen-bitcode-generic-1-64bit.cpp - builtins\util.m4;builtins\target-generic-common.ll - Building gen-bitcode-generic-1-64bit.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-1.ll | python bitcode2cpp.py builtins\target-generic-1.ll 32bit > $(Configuration)/gen-bitcode-generic-1-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-1.ll | python bitcode2cpp.py builtins\target-generic-1.ll 64bit > $(Configuration)/gen-bitcode-generic-1-64bit.cpp + $(Configuration)/gen-bitcode-generic-1-32bit.cpp; $(Configuration)/gen-bitcode-generic-1-64bit.cpp + builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll + Building gen-bitcode-generic-1-32bit.cpp and gen-bitcode-generic-1-64bit.cpp Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-4.ll | python bitcode2cpp.py builtins\target-generic-4.ll 32bit > $(Configuration)/gen-bitcode-generic-4-32bit.cpp - $(Configuration)/gen-bitcode-generic-4-32bit.cpp - builtins\util.m4;builtins\target-generic-common.ll - Building gen-bitcode-generic-4-32bit.cpp - - - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-4.ll | python bitcode2cpp.py builtins\target-generic-4.ll 64bit > 
$(Configuration)/gen-bitcode-generic-4-64bit.cpp - $(Configuration)/gen-bitcode-generic-4-64bit.cpp - builtins\util.m4;builtins\target-generic-common.ll - Building gen-bitcode-generic-4-64bit.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-4.ll | python bitcode2cpp.py builtins\target-generic-4.ll 32bit > $(Configuration)/gen-bitcode-generic-4-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-4.ll | python bitcode2cpp.py builtins\target-generic-4.ll 64bit > $(Configuration)/gen-bitcode-generic-4-64bit.cpp + $(Configuration)/gen-bitcode-generic-4-32bit.cpp; $(Configuration)/gen-bitcode-generic-4-64bit.cpp + builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll + Building gen-bitcode-generic-4-32bit.cpp and gen-bitcode-generic-4-64bit.cpp Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-8.ll | python bitcode2cpp.py builtins\target-generic-8.ll 32bit > $(Configuration)/gen-bitcode-generic-8-32bit.cpp - $(Configuration)/gen-bitcode-generic-8-32bit.cpp - builtins\util.m4;builtins\target-generic-common.ll - Building gen-bitcode-generic-8-32bit.cpp - - - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-8.ll | python bitcode2cpp.py builtins\target-generic-8.ll 64bit > $(Configuration)/gen-bitcode-generic-8-64bit.cpp - $(Configuration)/gen-bitcode-generic-8-64bit.cpp - builtins\util.m4;builtins\target-generic-common.ll - Building gen-bitcode-generic-8-64bit.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-8.ll | python bitcode2cpp.py builtins\target-generic-8.ll 32bit > $(Configuration)/gen-bitcode-generic-8-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-8.ll | python bitcode2cpp.py builtins\target-generic-8.ll 64bit > $(Configuration)/gen-bitcode-generic-8-64bit.cpp + $(Configuration)/gen-bitcode-generic-8-32bit.cpp; $(Configuration)/gen-bitcode-generic-8-64bit.cpp + builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll + Building gen-bitcode-generic-8-32bit.cpp and gen-bitcode-generic-8-64bit.cpp Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-16.ll | python bitcode2cpp.py builtins\target-generic-16.ll 32bit > $(Configuration)/gen-bitcode-generic-16-32bit.cpp - $(Configuration)/gen-bitcode-generic-16-32bit.cpp - builtins\util.m4;builtins\target-generic-common.ll - Building gen-bitcode-generic-16-32bit.cpp - - - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-16.ll | python bitcode2cpp.py builtins\target-generic-16.ll 64bit > $(Configuration)/gen-bitcode-generic-16-64bit.cpp - $(Configuration)/gen-bitcode-generic-16-64bit.cpp - builtins\util.m4;builtins\target-generic-common.ll - Building gen-bitcode-generic-16-64bit.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-16.ll | python bitcode2cpp.py builtins\target-generic-16.ll 32bit > $(Configuration)/gen-bitcode-generic-16-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-16.ll | python bitcode2cpp.py builtins\target-generic-16.ll 64bit > $(Configuration)/gen-bitcode-generic-16-64bit.cpp + 
$(Configuration)/gen-bitcode-generic-16-32bit.cpp; $(Configuration)/gen-bitcode-generic-16-64bit.cpp + builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll + Building gen-bitcode-generic-16-32bit.cpp and gen-bitcode-generic-16-64bit.cpp Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-32.ll | python bitcode2cpp.py builtins\target-generic-32.ll 32bit > $(Configuration)/gen-bitcode-generic-32-32bit.cpp - $(Configuration)/gen-bitcode-generic-32-32bit.cpp - builtins\util.m4;builtins\target-generic-common.ll - Building gen-bitcode-generic-32-32bit.cpp - - - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-32.ll | python bitcode2cpp.py builtins\target-generic-32.ll 64bit > $(Configuration)/gen-bitcode-generic-32-64bit.cpp - $(Configuration)/gen-bitcode-generic-32-64bit.cpp - builtins\util.m4;builtins\target-generic-common.ll - Building gen-bitcode-generic-32-64bit.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-32.ll | python bitcode2cpp.py builtins\target-generic-32.ll 32bit > $(Configuration)/gen-bitcode-generic-32-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-32.ll | python bitcode2cpp.py builtins\target-generic-32.ll 64bit > $(Configuration)/gen-bitcode-generic-32-64bit.cpp + $(Configuration)/gen-bitcode-generic-32-32bit.cpp; $(Configuration)/gen-bitcode-generic-32-64bit.cpp + builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll + Building gen-bitcode-generic-32-32bit.cpp and gen-bitcode-generic-32-64bit.cpp Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-64.ll | python bitcode2cpp.py builtins\target-generic-64.ll 32bit > $(Configuration)/gen-bitcode-generic-64-32bit.cpp - $(Configuration)/gen-bitcode-generic-64-32bit.cpp - builtins\util.m4;builtins\target-generic-common.ll - Building gen-bitcode-generic-64-32bit.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-64.ll | python bitcode2cpp.py builtins\target-generic-64.ll 32bit > $(Configuration)/gen-bitcode-generic-64-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-64.ll | python bitcode2cpp.py builtins\target-generic-64.ll 64bit > $(Configuration)/gen-bitcode-generic-64-64bit.cpp + $(Configuration)/gen-bitcode-generic-64-32bit.cpp; $(Configuration)/gen-bitcode-generic-64-64bit.cpp + builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll + Building gen-bitcode-generic-64-32bit.cpp and gen-bitcode-generic-64-64bit.cpp - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-64.ll | python bitcode2cpp.py builtins\target-generic-64.ll 64bit > $(Configuration)/gen-bitcode-generic-64-64bit.cpp - $(Configuration)/gen-bitcode-generic-64-64bit.cpp - builtins\util.m4;builtins\target-generic-common.ll - Building gen-bitcode-generic-64-64bit.cpp - - - - + Document flex -t lex.ll > $(Configuration)\lex.cc $(Configuration)\lex.cc @@ -535,4 +429,4 @@ - + diff --git a/lex.ll b/lex.ll index 8baa627a..87a80145 100644 --- a/lex.ll +++ b/lex.ll @@ -76,7 +76,7 @@ static int allTokens[] = { TOKEN_TASK, TOKEN_TRUE, TOKEN_TYPEDEF, TOKEN_UNIFORM, TOKEN_UNMASKED, TOKEN_UNSIGNED, TOKEN_VARYING, TOKEN_VOID, TOKEN_WHILE, 
TOKEN_STRING_C_LITERAL, TOKEN_DOTDOTDOT, - TOKEN_FLOAT_CONSTANT, + TOKEN_FLOAT_CONSTANT, TOKEN_DOUBLE_CONSTANT, TOKEN_INT8_CONSTANT, TOKEN_UINT8_CONSTANT, TOKEN_INT16_CONSTANT, TOKEN_UINT16_CONSTANT, TOKEN_INT32_CONSTANT, TOKEN_UINT32_CONSTANT, @@ -152,6 +152,7 @@ void ParserInit() { tokenToName[TOKEN_STRING_C_LITERAL] = "\"C\""; tokenToName[TOKEN_DOTDOTDOT] = "..."; tokenToName[TOKEN_FLOAT_CONSTANT] = "TOKEN_FLOAT_CONSTANT"; + tokenToName[TOKEN_DOUBLE_CONSTANT] = "TOKEN_DOUBLE_CONSTANT"; tokenToName[TOKEN_INT8_CONSTANT] = "TOKEN_INT8_CONSTANT"; tokenToName[TOKEN_UINT8_CONSTANT] = "TOKEN_UINT8_CONSTANT"; tokenToName[TOKEN_INT16_CONSTANT] = "TOKEN_INT16_CONSTANT"; @@ -266,6 +267,7 @@ void ParserInit() { tokenNameRemap["TOKEN_STRING_C_LITERAL"] = "\"C\""; tokenNameRemap["TOKEN_DOTDOTDOT"] = "\'...\'"; tokenNameRemap["TOKEN_FLOAT_CONSTANT"] = "float constant"; + tokenNameRemap["TOKEN_DOUBLE_CONSTANT"] = "double constant"; tokenNameRemap["TOKEN_INT8_CONSTANT"] = "int8 constant"; tokenNameRemap["TOKEN_UINT8_CONSTANT"] = "unsigned int8 constant"; tokenNameRemap["TOKEN_INT16_CONSTANT"] = "int16 constant"; @@ -343,6 +345,9 @@ INT_NUMBER (([0-9]+)|(0x[0-9a-fA-F]+)|(0b[01]+))[uUlL]*[kMG]?[uUlL]* INT_NUMBER_DOTDOTDOT (([0-9]+)|(0x[0-9a-fA-F]+)|(0b[01]+))[uUlL]*[kMG]?[uUlL]*\.\.\. FLOAT_NUMBER (([0-9]+|(([0-9]+\.[0-9]*[fF]?)|(\.[0-9]+)))([eE][-+]?[0-9]+)?[fF]?) HEX_FLOAT_NUMBER (0x[01](\.[0-9a-fA-F]*)?p[-+]?[0-9]+[fF]?) +FORTRAN_DOUBLE_NUMBER (([0-9]+\.[0-9]*[dD])|([0-9]+\.[0-9]*[dD][-+]?[0-9]+)|([0-9]+[dD][-+]?[0-9]+)|(\.[0-9]*[dD][-+]?[0-9]+)) + + IDENT [a-zA-Z_][a-zA-Z_0-9]* ZO_SWIZZLE ([01]+[w-z]+)+|([01]+[rgba]+)+|([01]+[uv]+)+ @@ -414,6 +419,14 @@ while { RT; return TOKEN_WHILE; } \"C\" { RT; return TOKEN_STRING_C_LITERAL; } \.\.\. { RT; return TOKEN_DOTDOTDOT; } +"operator*" { return TOKEN_IDENTIFIER; } +"operator+" { return TOKEN_IDENTIFIER; } +"operator-" { return TOKEN_IDENTIFIER; } +"operator<<" { return TOKEN_IDENTIFIER; } +"operator>>" { return TOKEN_IDENTIFIER; } +"operator/" { return TOKEN_IDENTIFIER; } +"operator%" { return TOKEN_IDENTIFIER; } + L?\"(\\.|[^\\"])*\" { lStringConst(&yylval, &yylloc); return TOKEN_STRING_LITERAL; } {IDENT} { @@ -437,6 +450,17 @@ L?\"(\\.|[^\\"])*\" { lStringConst(&yylval, &yylloc); return TOKEN_STRING_LITERA return lParseInteger(true); } +{FORTRAN_DOUBLE_NUMBER} { + RT; + { + int i = 0; + while (yytext[i] != 'd' && yytext[i] != 'D') i++; + yytext[i] = 'E'; + } + yylval.doubleVal = atof(yytext); + return TOKEN_DOUBLE_CONSTANT; +} + {FLOAT_NUMBER} { RT; @@ -450,6 +474,8 @@ L?\"(\\.|[^\\"])*\" { lStringConst(&yylval, &yylloc); return TOKEN_STRING_LITERA return TOKEN_FLOAT_CONSTANT; } + + "++" { RT; return TOKEN_INC_OP; } "--" { RT; return TOKEN_DEC_OP; } "<<" { RT; return TOKEN_LEFT_OP; } diff --git a/llvm_patches/3_3_0001-Fix-PR16807.patch b/llvm_patches/3_3_0001-Fix-PR16807.patch new file mode 100644 index 00000000..daf1327c --- /dev/null +++ b/llvm_patches/3_3_0001-Fix-PR16807.patch @@ -0,0 +1,78 @@ +From b9c47f44691cb9a648b9fa1ae373f0defe53c757 Mon Sep 17 00:00:00 2001 +From: Michael Liao +Date: Thu, 10 Oct 2013 16:47:00 -0700 +Subject: [PATCH] Fix PR16807 + +- Lower signed division by constant powers-of-2 to target-independent + DAG operators instead of target-dependent ones to support them on + targets where vector types are legal but shift operators on that types + are illegal, e.g. on AVX, PSRAW is only available on <8 x i16> though + <16 x i16> is a legal type. 
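For reference, a minimal scalar sketch of the rounding sequence described above, i.e. what the lowering computes per lane for a signed division by a constant +/- 2^lg2. This is illustration only (not part of the patch); the helper name is hypothetical, it is written in C++, and it assumes arithmetic right shift of negative signed values, as x86 compilers provide.

#include <cstdint>

// Mirrors the DAG sequence in the patch: splat the sign bit (ISD::SRA),
// add (x < 0) ? 2^lg2 - 1 : 0 (ISD::SRL + ISD::ADD) so the result rounds
// toward zero, arithmetic-shift by lg2 (ISD::SRA), and negate at the end
// if the divisor was negative.
static int16_t sdivByPow2(int16_t x, unsigned lg2, bool negativeDivisor) {
    int16_t  sgn      = x >> 15;                      // all ones if x < 0, else 0
    uint16_t bias     = uint16_t(sgn) >> (16 - lg2);  // 2^lg2 - 1 if x < 0, else 0
    int16_t  adjusted = int16_t(x + bias);
    int16_t  q        = adjusted >> lg2;              // the actual division
    return negativeDivisor ? int16_t(-q) : q;
}

Because the arithmetic is expressed with generic ISD::SRA/ISD::SRL/ISD::ADD nodes rather than X86ISD::VSRAI/VSRLI, the same sequence stays legal for <16 x i16> on AVX, which is the point of the fix.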
+--- + lib/Target/X86/X86ISelLowering.cpp | 22 ++++++++++++++++------ + test/CodeGen/X86/pr16807.ll | 18 ++++++++++++++++++ + 2 files changed, 34 insertions(+), 6 deletions(-) + create mode 100644 test/CodeGen/X86/pr16807.ll + +diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp +index bd5ad4e..518bb90 100644 +--- lib/Target/X86/X86ISelLowering.cpp ++++ lib/Target/X86/X86ISelLowering.cpp +@@ -12462,14 +12462,24 @@ static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) { + (SplatValue.isPowerOf2() || (-SplatValue).isPowerOf2())) { + unsigned lg2 = SplatValue.countTrailingZeros(); + // Splat the sign bit. +- SDValue Sz = DAG.getConstant(EltTy.getSizeInBits()-1, MVT::i32); +- SDValue SGN = getTargetVShiftNode(X86ISD::VSRAI, dl, VT, N0, Sz, DAG); ++ SmallVector Sz(NumElts, ++ DAG.getConstant(EltTy.getSizeInBits() - 1, ++ EltTy)); ++ SDValue SGN = DAG.getNode(ISD::SRA, dl, VT, N0, ++ DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Sz[0], ++ NumElts)); + // Add (N0 < 0) ? abs2 - 1 : 0; +- SDValue Amt = DAG.getConstant(EltTy.getSizeInBits() - lg2, MVT::i32); +- SDValue SRL = getTargetVShiftNode(X86ISD::VSRLI, dl, VT, SGN, Amt, DAG); ++ SmallVector Amt(NumElts, ++ DAG.getConstant(EltTy.getSizeInBits() - lg2, ++ EltTy)); ++ SDValue SRL = DAG.getNode(ISD::SRL, dl, VT, SGN, ++ DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Amt[0], ++ NumElts)); + SDValue ADD = DAG.getNode(ISD::ADD, dl, VT, N0, SRL); +- SDValue Lg2Amt = DAG.getConstant(lg2, MVT::i32); +- SDValue SRA = getTargetVShiftNode(X86ISD::VSRAI, dl, VT, ADD, Lg2Amt, DAG); ++ SmallVector Lg2Amt(NumElts, DAG.getConstant(lg2, EltTy)); ++ SDValue SRA = DAG.getNode(ISD::SRA, dl, VT, ADD, ++ DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Lg2Amt[0], ++ NumElts)); + + // If we're dividing by a positive value, we're done. Otherwise, we must + // negate the result. +diff --git a/test/CodeGen/X86/pr16807.ll b/test/CodeGen/X86/pr16807.ll +new file mode 100644 +index 0000000..6d55d99 +--- /dev/null ++++ test/CodeGen/X86/pr16807.ll +@@ -0,0 +1,18 @@ ++; RUN: llc < %s -mtriple=x86_64-linux-gnu -mcpu=core-avx-i | FileCheck %s ++ ++define <16 x i16> @f_fu(<16 x i16> %bf) { ++allocas: ++ %avg.i.i = sdiv <16 x i16> %bf, ++ ret <16 x i16> %avg.i.i ++} ++ ++; CHECK: f_fu ++; CHECK: psraw ++; CHECK: psrlw ++; CHECK: paddw ++; CHECK: psraw ++; CHECK: psraw ++; CHECK: psrlw ++; CHECK: paddw ++; CHECK: psraw ++; CHECK: ret +-- +1.8.1.2 + diff --git a/llvm_patches/3_3_PR17764_reverse_operands_avx2.patch b/llvm_patches/3_3_PR17764_reverse_operands_avx2.patch new file mode 100644 index 00000000..2719633a --- /dev/null +++ b/llvm_patches/3_3_PR17764_reverse_operands_avx2.patch @@ -0,0 +1,51 @@ +From 13c33dd2931ae9d9c5c9f142677f025281fbefca Mon Sep 17 00:00:00 2001 +From: Michael Liao +Date: Fri, 1 Nov 2013 11:08:08 -0700 +Subject: [PATCH] Fix PR17764 + +- %ret = select %mask, %v1, %v2 is equivalent to + + %ret = %mask ? %v1 : %v2 + + but VPBLENDVB %mask, %v1, %v2, %ret (operands are in Intel assembly + order) is equivalent to + + %ret = %mask ? 
%v2 : %v1 +--- + lib/Target/X86/X86InstrSSE.td | 2 +- + test/CodeGen/X86/pr17764.ll | 10 ++++++++++ + 2 files changed, 11 insertions(+), 1 deletion(-) + create mode 100644 test/CodeGen/X86/pr17764.ll + +diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td +index 7cae485..bac88f9 100644 +--- lib/Target/X86/X86InstrSSE.td ++++ lib/Target/X86/X86InstrSSE.td +@@ -6965,7 +6965,7 @@ let Predicates = [HasAVX] in { + let Predicates = [HasAVX2] in { + def : Pat<(v32i8 (vselect (v32i8 VR256:$mask), (v32i8 VR256:$src1), + (v32i8 VR256:$src2))), +- (VPBLENDVBYrr VR256:$src1, VR256:$src2, VR256:$mask)>; ++ (VPBLENDVBYrr VR256:$src2, VR256:$src1, VR256:$mask)>; + def : Pat<(v16i16 (X86Blendi (v16i16 VR256:$src1), (v16i16 VR256:$src2), + (imm:$mask))), + (VPBLENDWYrri VR256:$src1, VR256:$src2, imm:$mask)>; +diff --git a/test/CodeGen/X86/pr17764.ll b/test/CodeGen/X86/pr17764.ll +new file mode 100644 +index 0000000..7a3fd6d +--- /dev/null ++++ test/CodeGen/X86/pr17764.ll +@@ -0,0 +1,10 @@ ++; RUN: llc < %s -mtriple=x86_64-linux -mcpu=core-avx2 | FileCheck %s ++ ++define <16 x i16> @foo(<16 x i1> %mask, <16 x i16> %x, <16 x i16> %y) { ++ %ret = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> %y ++ ret <16 x i16> %ret ++} ++ ++; CHECK: foo ++; CHECK: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0 ++; CHECK: ret +-- +1.8.1.2 + diff --git a/llvm_patches/3_3_r172868-vmovups-vinsertf128.patch b/llvm_patches/3_3_r172868-vmovups-vinsertf128.patch new file mode 100644 index 00000000..36bb5572 --- /dev/null +++ b/llvm_patches/3_3_r172868-vmovups-vinsertf128.patch @@ -0,0 +1,102 @@ +This patch needs to be applied to LLVM 3.3 to fix performance regression after r172868 revision. +This regression is due to increased register pressure after revision causing spills in case of multiple loads +This regression is fixed in 3.4 but the changes in 3.4 is not back portable, +so we roll back r172868 to avoid regression with 3.3. + +Index: test/CodeGen/X86/sandybridge-loads.ll +=================================================================== +--- test/CodeGen/X86/sandybridge-loads.ll (revision 191082) ++++ test/CodeGen/X86/sandybridge-loads.ll (working copy) +@@ -1,24 +1,5 @@ + ; RUN: llc -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -o - < %s | FileCheck %s + +-;CHECK: wideloads +-;CHECK: vmovaps +-;CHECK: vinsertf128 +-;CHECK: vmovaps +-;CHECK-NOT: vinsertf128 +-;CHECK: ret +- +-define void @wideloads(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwind uwtable noinline ssp { +- %v0 = load <8 x float>* %a, align 16 ; <---- unaligned! +- %v1 = load <8 x float>* %b, align 32 ; <---- aligned! +- %m0 = fcmp olt <8 x float> %v1, %v0 +- %v2 = load <8 x float>* %c, align 32 ; <---- aligned! 
+- %m1 = fcmp olt <8 x float> %v2, %v0 +- %mand = and <8 x i1> %m1, %m0 +- %r = zext <8 x i1> %mand to <8 x i32> +- store <8 x i32> %r, <8 x i32>* undef, align 32 +- ret void +-} +- + ; CHECK: widestores + ; loads: + ; CHECK: vmovaps +Index: test/CodeGen/X86/v8i1-masks.ll +=================================================================== +--- test/CodeGen/X86/v8i1-masks.ll (revision 172868) ++++ test/CodeGen/X86/v8i1-masks.ll (revision 172866) +@@ -1,7 +1,7 @@ + ; RUN: llc -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -o - < %s | FileCheck %s + + ;CHECK: and_masks +-;CHECK: vmovaps ++;CHECK: vmovups + ;CHECK: vcmpltp + ;CHECK: vcmpltp + ;CHECK: vandps +Index: lib/Target/X86/X86ISelLowering.cpp +=================================================================== +--- lib/Target/X86/X86ISelLowering.cpp (revision 191077) ++++ lib/Target/X86/X86ISelLowering.cpp (working copy) +@@ -16756,42 +16756,9 @@ + EVT MemVT = Ld->getMemoryVT(); + DebugLoc dl = Ld->getDebugLoc(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); +- unsigned RegSz = RegVT.getSizeInBits(); + +- // On Sandybridge unaligned 256bit loads are inefficient. + ISD::LoadExtType Ext = Ld->getExtensionType(); +- unsigned Alignment = Ld->getAlignment(); +- bool IsAligned = Alignment == 0 || Alignment >= MemVT.getSizeInBits()/8; +- if (RegVT.is256BitVector() && !Subtarget->hasInt256() && +- !DCI.isBeforeLegalizeOps() && !IsAligned && Ext == ISD::NON_EXTLOAD) { +- unsigned NumElems = RegVT.getVectorNumElements(); +- if (NumElems < 2) +- return SDValue(); + +- SDValue Ptr = Ld->getBasePtr(); +- SDValue Increment = DAG.getConstant(16, TLI.getPointerTy()); +- +- EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), +- NumElems/2); +- SDValue Load1 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, +- Ld->getPointerInfo(), Ld->isVolatile(), +- Ld->isNonTemporal(), Ld->isInvariant(), +- Alignment); +- Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); +- SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, +- Ld->getPointerInfo(), Ld->isVolatile(), +- Ld->isNonTemporal(), Ld->isInvariant(), +- std::min(16U, Alignment)); +- SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, +- Load1.getValue(1), +- Load2.getValue(1)); +- +- SDValue NewVec = DAG.getUNDEF(RegVT); +- NewVec = Insert128BitVector(NewVec, Load1, 0, DAG, dl); +- NewVec = Insert128BitVector(NewVec, Load2, NumElems/2, DAG, dl); +- return DCI.CombineTo(N, NewVec, TF, true); +- } +- + // If this is a vector EXT Load then attempt to optimize it using a + // shuffle. If SSSE3 is not available we may emit an illegal shuffle but the + // expansion is still better than scalar code. 
+@@ -16805,6 +16772,7 @@ + assert(MemVT.isVector() && "Must load a vector from memory"); + + unsigned NumElems = RegVT.getVectorNumElements(); ++ unsigned RegSz = RegVT.getSizeInBits(); + unsigned MemSz = MemVT.getSizeInBits(); + assert(RegSz > MemSz && "Register size must be greater than the mem size"); + diff --git a/llvm_patches/r183327-AVX2-GATHER.patch b/llvm_patches/3_3_r183327-AVX2-GATHER.patch similarity index 100% rename from llvm_patches/r183327-AVX2-GATHER.patch rename to llvm_patches/3_3_r183327-AVX2-GATHER.patch diff --git a/llvm_patches/r184575-x86-shift.patch b/llvm_patches/3_3_r184575-x86-shift.patch similarity index 100% rename from llvm_patches/r184575-x86-shift.patch rename to llvm_patches/3_3_r184575-x86-shift.patch diff --git a/llvm_patches/3_3_r193261_bug17631_win_vzeroupper.patch b/llvm_patches/3_3_r193261_bug17631_win_vzeroupper.patch new file mode 100644 index 00000000..b6abb1d3 --- /dev/null +++ b/llvm_patches/3_3_r193261_bug17631_win_vzeroupper.patch @@ -0,0 +1,69 @@ +From b9b016cda57d8afc26a150de7ee329b54a994c85 Mon Sep 17 00:00:00 2001 +From: Michael Liao +Date: Mon, 21 Oct 2013 17:47:58 -0700 +Subject: [PATCH] Fix PR17631 + +- Skip instructions added in prolog. For specific targets, prolog may + insert helper function calls (e.g. _chkstk will be called when + there're more than 4K bytes allocated on stack). However, these + helpers don't use/def YMM/XMM registers. +--- + lib/Target/X86/X86VZeroUpper.cpp | 11 ++++++++++- + test/CodeGen/X86/pr17631.ll | 22 ++++++++++++++++++++++ + 2 files changed, 32 insertions(+), 1 deletion(-) + create mode 100644 test/CodeGen/X86/pr17631.ll + +diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp +index 477f75a..0d37a7d 100644 +--- lib/Target/X86/X86VZeroUpper.cpp ++++ lib/Target/X86/X86VZeroUpper.cpp +@@ -231,8 +231,17 @@ bool VZeroUpperInserter::processBasicBlock(MachineFunction &MF, + bool BBHasCall = false; + + for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) { +- MachineInstr *MI = I; + DebugLoc dl = I->getDebugLoc(); ++ MachineInstr *MI = I; ++ ++ // Don't need to check instructions added in prolog. ++ // In prolog, special function calls may be added for specific targets ++ // (e.g. on Windows, a prolog helper '_chkstk' is called when the local ++ // variables exceed 4K bytes on stack.) These helpers won't use/def YMM/XMM ++ // registers. ++ if (MI->getFlag(MachineInstr::FrameSetup)) ++ continue; ++ + bool isControlFlow = MI->isCall() || MI->isReturn(); + + // Shortcut: don't need to check regular instructions in dirty state. 
+diff --git a/test/CodeGen/X86/pr17631.ll b/test/CodeGen/X86/pr17631.ll +new file mode 100644 +index 0000000..a572ff2 +--- /dev/null ++++ test/CodeGen/X86/pr17631.ll +@@ -0,0 +1,22 @@ ++; RUN: llc < %s -mcpu=core-avx-i -mtriple=i386-pc-win32 | FileCheck %s ++ ++%struct_type = type { [64 x <8 x float>], <8 x float> } ++ ++; Function Attrs: nounwind readnone ++declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) ++ ++; Function Attrs: nounwind ++define i32 @equal(<8 x i32> %A) { ++allocas: ++ %first_alloc = alloca [64 x <8 x i32>] ++ %second_alloc = alloca %struct_type ++ ++ %A1 = bitcast <8 x i32> %A to <8 x float> ++ %A2 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %A1) ++ ret i32 %A2 ++} ++ ++; CHECK: equal ++; CHECK-NOT: vzeroupper ++; CHECK: _chkstk ++; CHECK: ret +-- +1.8.1.2 + diff --git a/llvm_patches/3_3_r195476_r195779_i16_sext.patch b/llvm_patches/3_3_r195476_r195779_i16_sext.patch new file mode 100644 index 00000000..a49325c9 --- /dev/null +++ b/llvm_patches/3_3_r195476_r195779_i16_sext.patch @@ -0,0 +1,57 @@ +Two stability patches affecting sse4-i16x8 and sse4-i8x16 targets. See PR18014 and PR18054 for more details. + +Index: lib/Target/X86/X86ISelLowering.cpp +=================================================================== +--- lib/Target/X86/X86ISelLowering.cpp (revision 195862) ++++ lib/Target/X86/X86ISelLowering.cpp (working copy) +@@ -12099,19 +12099,27 @@ + // fall through + case MVT::v4i32: + case MVT::v8i16: { +- // (sext (vzext x)) -> (vsext x) + SDValue Op0 = Op.getOperand(0); + SDValue Op00 = Op0.getOperand(0); + SDValue Tmp1; + // Hopefully, this VECTOR_SHUFFLE is just a VZEXT. + if (Op0.getOpcode() == ISD::BITCAST && +- Op00.getOpcode() == ISD::VECTOR_SHUFFLE) ++ Op00.getOpcode() == ISD::VECTOR_SHUFFLE) { ++ // (sext (vzext x)) -> (vsext x) + Tmp1 = LowerVectorIntExtend(Op00, DAG); +- if (Tmp1.getNode()) { +- SDValue Tmp1Op0 = Tmp1.getOperand(0); +- assert(Tmp1Op0.getOpcode() == X86ISD::VZEXT && +- "This optimization is invalid without a VZEXT."); +- return DAG.getNode(X86ISD::VSEXT, dl, VT, Tmp1Op0.getOperand(0)); ++ if (Tmp1.getNode()) { ++ EVT ExtraEltVT = ExtraVT.getVectorElementType(); ++ // This folding is only valid when the in-reg type is a vector of i8, ++ // i16, or i32. ++ if (ExtraEltVT == MVT::i8 || ExtraEltVT == MVT::i16 || ++ ExtraEltVT == MVT::i32) { ++ SDValue Tmp1Op0 = Tmp1.getOperand(0); ++ assert(Tmp1Op0.getOpcode() == X86ISD::VZEXT && ++ "This optimization is invalid without a VZEXT."); ++ return DAG.getNode(X86ISD::VSEXT, dl, VT, Tmp1Op0.getOperand(0)); ++ } ++ Op0 = Tmp1; ++ } + } + + // If the above didn't work, then just use Shift-Left + Shift-Right. +@@ -15826,6 +15834,15 @@ + if (BitWidth == 1) + return SDValue(); + ++ // Check all uses of that condition operand to check whether it will be ++ // consumed by non-BLEND instructions, which may depend on all bits are set ++ // properly. ++ for (SDNode::use_iterator I = Cond->use_begin(), ++ E = Cond->use_end(); I != E; ++I) ++ if (I->getOpcode() != ISD::VSELECT) ++ // TODO: Add other opcodes eventually lowered into BLEND. ++ return SDValue(); ++ + assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size"); + APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1); + diff --git a/llvm_patches/3_4_r195476_r195779_i16_sext.patch b/llvm_patches/3_4_r195476_r195779_i16_sext.patch new file mode 100644 index 00000000..4e2c0f6b --- /dev/null +++ b/llvm_patches/3_4_r195476_r195779_i16_sext.patch @@ -0,0 +1,57 @@ +Two stability patches affecting sse4-i16x8 and sse4-i8x16 targets. 
See PR18014 and PR18054 for more details. + +Index: lib/Target/X86/X86ISelLowering.cpp +=================================================================== +--- lib/Target/X86/X86ISelLowering.cpp (revision 195863) ++++ lib/Target/X86/X86ISelLowering.cpp (working copy) +@@ -13120,19 +13120,27 @@ + // fall through + case MVT::v4i32: + case MVT::v8i16: { +- // (sext (vzext x)) -> (vsext x) + SDValue Op0 = Op.getOperand(0); + SDValue Op00 = Op0.getOperand(0); + SDValue Tmp1; + // Hopefully, this VECTOR_SHUFFLE is just a VZEXT. + if (Op0.getOpcode() == ISD::BITCAST && +- Op00.getOpcode() == ISD::VECTOR_SHUFFLE) ++ Op00.getOpcode() == ISD::VECTOR_SHUFFLE) { ++ // (sext (vzext x)) -> (vsext x) + Tmp1 = LowerVectorIntExtend(Op00, Subtarget, DAG); +- if (Tmp1.getNode()) { +- SDValue Tmp1Op0 = Tmp1.getOperand(0); +- assert(Tmp1Op0.getOpcode() == X86ISD::VZEXT && +- "This optimization is invalid without a VZEXT."); +- return DAG.getNode(X86ISD::VSEXT, dl, VT, Tmp1Op0.getOperand(0)); ++ if (Tmp1.getNode()) { ++ EVT ExtraEltVT = ExtraVT.getVectorElementType(); ++ // This folding is only valid when the in-reg type is a vector of i8, ++ // i16, or i32. ++ if (ExtraEltVT == MVT::i8 || ExtraEltVT == MVT::i16 || ++ ExtraEltVT == MVT::i32) { ++ SDValue Tmp1Op0 = Tmp1.getOperand(0); ++ assert(Tmp1Op0.getOpcode() == X86ISD::VZEXT && ++ "This optimization is invalid without a VZEXT."); ++ return DAG.getNode(X86ISD::VSEXT, dl, VT, Tmp1Op0.getOperand(0)); ++ } ++ Op0 = Tmp1; ++ } + } + + // If the above didn't work, then just use Shift-Left + Shift-Right. +@@ -17007,6 +17015,15 @@ + if (BitWidth == 1) + return SDValue(); + ++ // Check all uses of that condition operand to check whether it will be ++ // consumed by non-BLEND instructions, which may depend on all bits are set ++ // properly. ++ for (SDNode::use_iterator I = Cond->use_begin(), ++ E = Cond->use_end(); I != E; ++I) ++ if (I->getOpcode() != ISD::VSELECT) ++ // TODO: Add other opcodes eventually lowered into BLEND. 
++ return SDValue(); ++ + assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size"); + APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1); + diff --git a/llvmutil.cpp b/llvmutil.cpp index 180c8676..275cf794 100644 --- a/llvmutil.cpp +++ b/llvmutil.cpp @@ -132,6 +132,10 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target& target) { LLVMTypes::MaskType = LLVMTypes::BoolVectorType = llvm::VectorType::get(llvm::Type::getInt32Ty(*ctx), target.getVectorWidth()); break; + case 64: + LLVMTypes::MaskType = LLVMTypes::BoolVectorType = + llvm::VectorType::get(llvm::Type::getInt64Ty(*ctx), target.getVectorWidth()); + break; default: FATAL("Unhandled mask width for initializing MaskType"); } @@ -183,6 +187,10 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target& target) { onMask = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*ctx), -1, true /*signed*/); // 0xffffffff break; + case 64: + onMask = llvm::ConstantInt::get(llvm::Type::getInt64Ty(*ctx), -1, + true /*signed*/); // 0xffffffffffffffffull + break; default: FATAL("Unhandled mask width for onMask"); } @@ -210,6 +218,10 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target& target) { offMask = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*ctx), 0, true /*signed*/); break; + case 64: + offMask = llvm::ConstantInt::get(llvm::Type::getInt64Ty(*ctx), 0, + true /*signed*/); + break; default: FATAL("Unhandled mask width for offMask"); } @@ -480,7 +492,10 @@ LLVMUInt64Vector(const uint64_t *ivec) { llvm::Constant * LLVMBoolVector(bool b) { llvm::Constant *v; - if (LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + if (LLVMTypes::BoolVectorType == LLVMTypes::Int64VectorType) + v = llvm::ConstantInt::get(LLVMTypes::Int64Type, b ? 0xffffffffffffffffull : 0, + false /*unsigned*/); + else if (LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) v = llvm::ConstantInt::get(LLVMTypes::Int32Type, b ? 0xffffffff : 0, false /*unsigned*/); else if (LLVMTypes::BoolVectorType == LLVMTypes::Int16VectorType) @@ -506,7 +521,10 @@ LLVMBoolVector(const bool *bvec) { std::vector vals; for (int i = 0; i < g->target->getVectorWidth(); ++i) { llvm::Constant *v; - if (LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + if (LLVMTypes::BoolVectorType == LLVMTypes::Int64VectorType) + v = llvm::ConstantInt::get(LLVMTypes::Int64Type, bvec[i] ? 0xffffffffffffffffull : 0, + false /*unsigned*/); + else if (LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) v = llvm::ConstantInt::get(LLVMTypes::Int32Type, bvec[i] ? 0xffffffff : 0, false /*unsigned*/); else if (LLVMTypes::BoolVectorType == LLVMTypes::Int16VectorType) diff --git a/main.cpp b/main.cpp index 21a47de8..99497af5 100644 --- a/main.cpp +++ b/main.cpp @@ -70,6 +70,8 @@ lPrintVersion() { "3.3" #elif defined(LLVM_3_4) "3.4" +#elif defined(LLVM_3_5) + "3.5" #else #error "Unhandled LLVM version" #endif @@ -164,7 +166,7 @@ devUsage(int ret) { printf(" disable-uniform-memory-optimizations\tDisable uniform-based coherent memory access\n"); printf(" [--yydebug]\t\t\t\tPrint debugging information during parsing\n"); printf(" [--debug-phase=]\t\tSet optimization phases to dump. --debug-phase=first,210:220,300,305,310:last\n"); -#ifdef LLVM_3_4 +#if defined(LLVM_3_4) || defined(LLVM_3_5) printf(" [--debug-ir=]\t\tSet optimization phase to generate debugIR after it\n"); #endif printf(" [--off-phase=]\t\tSwitch off optimization phases. 
--off-phase=first,210:220,300,305,310:last\n"); @@ -547,7 +549,7 @@ int main(int Argc, char *Argv[]) { "away or introduce the new ones.\n"); g->debug_stages = ParsingPhases(argv[i] + strlen("--debug-phase=")); } -#ifdef LLVM_3_4 +#if defined(LLVM_3_4) || defined(LLVM_3_5) else if (strncmp(argv[i], "--debug-ir=", 11) == 0) { g->debugIR = ParsingPhaseName(argv[i] + strlen("--debug-ir=")); } diff --git a/module.cpp b/module.cpp index 3f197c1b..89f7a1e6 100644 --- a/module.cpp +++ b/module.cpp @@ -928,7 +928,7 @@ Module::AddExportedTypes(const std::vectorfinalize(); lStripUnusedDebugInfo(module); @@ -2096,7 +2096,7 @@ lAddExtractedGlobals(llvm::Module *module, // example, this happens with varying globals if we compile // to different vector widths. if (gv2->getType() != gv->getType()) - Error(rgi.pos, "Mismatch in size/layout of global " + Warning(rgi.pos, "Mismatch in size/layout of global " "variable \"%s\" with different targets. " "Globals must not include \"varying\" types or arrays " "with size based on programCount when compiling to " @@ -2446,7 +2446,7 @@ Module::CompileAndOutput(const char *srcFile, int i = 0; const char *firstISA; while (i < Target::NUM_ISAS && firstTargetMachine == NULL) { - firstISA = Target::ISAToString((Target::ISA) i); + firstISA = Target::ISAToTargetString((Target::ISA) i); firstTargetMachine = targetMachines[i++]; } Assert(firstTargetMachine != NULL); diff --git a/opt.cpp b/opt.cpp index 75eae20c..3e320b4b 100644 --- a/opt.cpp +++ b/opt.cpp @@ -63,7 +63,7 @@ #include #include #endif -#if defined (LLVM_3_4) +#if defined (LLVM_3_4) || defined(LLVM_3_5) #include #endif #include @@ -72,6 +72,7 @@ #include #include #include +#include #include #include #include @@ -124,6 +125,10 @@ static llvm::Pass *CreateMakeInternalFuncsStaticPass(); static llvm::Pass *CreateDebugPass(char * output); +static llvm::Pass *CreateReplaceStdlibShiftPass(); + +static llvm::Pass *CreateFixBooleanSelectPass(); + #define DEBUG_START_PASS(NAME) \ if (g->debugPrint && \ (getenv("FUNC") == NULL || \ @@ -438,7 +443,7 @@ DebugPassManager::add(llvm::Pass * P, int stage = -1) { number, P->getPassName()); PM.add(CreateDebugPass(buf)); } -#ifdef LLVM_3_4 +#if defined(LLVM_3_4) || defined(LLVM_3_5) if (g->debugIR == number) { // adding generating of LLVM IR debug after optimization char buf[100]; @@ -521,6 +526,7 @@ Optimize(llvm::Module *module, int optLevel) { optPM.add(llvm::createPromoteMemoryToRegisterPass()); optPM.add(llvm::createAggressiveDCEPass()); + if (g->opt.disableGatherScatterOptimizations == false && g->target->getVectorWidth() > 1) { optPM.add(llvm::createInstructionCombiningPass(), 210); @@ -546,7 +552,8 @@ Optimize(llvm::Module *module, int optLevel) { optPM.add(llvm::createGlobalOptimizerPass()); optPM.add(llvm::createReassociatePass()); optPM.add(llvm::createIPConstantPropagationPass()); - optPM.add(llvm::createDeadArgEliminationPass()); + optPM.add(CreateReplaceStdlibShiftPass(),229); + optPM.add(llvm::createDeadArgEliminationPass(),230); optPM.add(llvm::createInstructionCombiningPass()); optPM.add(llvm::createCFGSimplificationPass()); optPM.add(llvm::createPruneEHPass()); @@ -654,6 +661,9 @@ Optimize(llvm::Module *module, int optLevel) { optPM.add(CreateMakeInternalFuncsStaticPass()); optPM.add(llvm::createGlobalDCEPass()); optPM.add(llvm::createConstantMergePass()); + + // Should be the last + optPM.add(CreateFixBooleanSelectPass(), 400); } // Finish up by making sure we didn't mess anything up in the IR along @@ -665,6 +675,7 @@ Optimize(llvm::Module *module, int optLevel) 
{ printf("\n*****\nFINAL OUTPUT\n*****\n"); module->dump(); } + } @@ -1017,12 +1028,12 @@ InstructionSimplifyPass::simplifyBoolVec(llvm::Value *value) { if (trunc != NULL) { // Convert trunc({sext,zext}(i1 vector)) -> (i1 vector) llvm::SExtInst *sext = llvm::dyn_cast(value); - if (sext && + if (sext && sext->getOperand(0)->getType() == LLVMTypes::Int1VectorType) return sext->getOperand(0); llvm::ZExtInst *zext = llvm::dyn_cast(value); - if (zext && + if (zext && zext->getOperand(0)->getType() == LLVMTypes::Int1VectorType) return zext->getOperand(0); } @@ -1848,7 +1859,7 @@ lIs32BitSafeHelper(llvm::Value *v) { // handle Adds, SExts, Constant Vectors if (llvm::BinaryOperator *bop = llvm::dyn_cast(v)) { if (bop->getOpcode() == llvm::Instruction::Add) { - return lIs32BitSafeHelper(bop->getOperand(0)) + return lIs32BitSafeHelper(bop->getOperand(0)) && lIs32BitSafeHelper(bop->getOperand(1)); } return false; @@ -4879,6 +4890,7 @@ lMatchAvgDownInt16(llvm::Value *inst) { } #endif // !LLVM_3_1 && !LLVM_3_2 + bool PeepholePass::runOnBasicBlock(llvm::BasicBlock &bb) { DEBUG_START_PASS("PeepholePass"); @@ -4923,3 +4935,271 @@ static llvm::Pass * CreatePeepholePass() { return new PeepholePass; } + +/** Given an llvm::Value known to be an integer, return its value as + an int64_t. +*/ +static int64_t +lGetIntValue(llvm::Value *offset) { + llvm::ConstantInt *intOffset = llvm::dyn_cast(offset); + Assert(intOffset && (intOffset->getBitWidth() == 32 || + intOffset->getBitWidth() == 64)); + return intOffset->getSExtValue(); +} + +/////////////////////////////////////////////////////////////////////////// +// ReplaceStdlibShiftPass + +class ReplaceStdlibShiftPass : public llvm::BasicBlockPass { +public: + static char ID; + ReplaceStdlibShiftPass() : BasicBlockPass(ID) { + } + + const char *getPassName() const { return "Resolve \"replace extract insert chains\""; } + bool runOnBasicBlock(llvm::BasicBlock &BB); + +}; + +char ReplaceStdlibShiftPass::ID = 0; + +bool +ReplaceStdlibShiftPass::runOnBasicBlock(llvm::BasicBlock &bb) { + DEBUG_START_PASS("ReplaceStdlibShiftPass"); + bool modifiedAny = false; + + llvm::Function *shifts[6]; + shifts[0] = m->module->getFunction("__shift_i8"); + shifts[1] = m->module->getFunction("__shift_i16"); + shifts[2] = m->module->getFunction("__shift_i32"); + shifts[3] = m->module->getFunction("__shift_i64"); + shifts[4] = m->module->getFunction("__shift_float"); + shifts[5] = m->module->getFunction("__shift_double"); + + for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) { + llvm::Instruction *inst = &*iter; + + if (llvm::CallInst *ci = llvm::dyn_cast(inst)) { + llvm::Function *func = ci->getCalledFunction(); + for (int i = 0; i < 6; i++) { + if (shifts[i] && (shifts[i] == func)) { + // we matched a call + llvm::Value *shiftedVec = ci->getArgOperand(0); + llvm::Value *shiftAmt = ci->getArgOperand(1); + if (llvm::isa(shiftAmt)) { + int vectorWidth = g->target->getVectorWidth(); + int * shuffleVals = new int[vectorWidth]; + int shiftInt = lGetIntValue(shiftAmt); + for (int i = 0; i < vectorWidth; i++) { + int s = i + shiftInt; + s = (s < 0) ? vectorWidth : s; + s = (s >= vectorWidth) ? 
vectorWidth : s; + shuffleVals[i] = s; + } + llvm::Value *shuffleIdxs = LLVMInt32Vector(shuffleVals); + llvm::Value *zeroVec = llvm::ConstantAggregateZero::get(shiftedVec->getType()); + llvm::Value *shuffle = new llvm::ShuffleVectorInst(shiftedVec, zeroVec, + shuffleIdxs, "vecShift", ci); + ci->replaceAllUsesWith(shuffle); + modifiedAny = true; + delete [] shuffleVals; + } else { + PerformanceWarning(SourcePos(), "Stdlib shift() called without constant shift amount."); + } + } + } + } + + DEBUG_END_PASS("ReplaceStdlibShiftPass"); + + return modifiedAny; +} + + +static llvm::Pass * +CreateReplaceStdlibShiftPass() { + return new ReplaceStdlibShiftPass(); +} + + + +/////////////////////////////////////////////////////////////////////////////// +// FixBooleanSelect +// +// The problem is that in LLVM 3.3, the optimizer doesn't like +// the following instruction sequence: +// %cmp = fcmp olt <8 x float> %a, %b +// %sext_cmp = sext <8 x i1> %cmp to <8 x i32> +// %new_mask = and <8 x i32> %sext_cmp, %mask +// and optimizes it to the following: +// %cmp = fcmp olt <8 x float> %a, %b +// %cond = select <8 x i1> %cmp, <8 x i32> %mask, <8 x i32> zeroinitializer +// +// It wouldn't be a problem if codegen produced good code for it. But it +// doesn't, especially for vectors larger than native vectors. +// +// This optimization reverts this pattern and should be the last one before +// code gen. +// +// Note that this problem was introduced in LLVM 3.3. But in LLVM 3.4 it was +// fixed. See commit r194542. +// +// After LLVM 3.3 this optimization should probably stay for experimental +// purposes, and the code should be compared with and without this optimization from +// time to time to make sure that LLVM does the right thing. +/////////////////////////////////////////////////////////////////////////////// + +class FixBooleanSelectPass : public llvm::FunctionPass { +public: + static char ID; + FixBooleanSelectPass() :FunctionPass(ID) {} + + const char *getPassName() const { return "Resolve \"replace extract insert chains\""; } + bool runOnFunction(llvm::Function &F); + +private: + llvm::Instruction* fixSelect(llvm::SelectInst* sel, llvm::SExtInst* sext); +}; + +char FixBooleanSelectPass::ID = 0; + +llvm::Instruction* FixBooleanSelectPass::fixSelect(llvm::SelectInst* sel, llvm::SExtInst* sext) { + // Select instruction result type and its integer equivalent + llvm::VectorType *orig_type = llvm::dyn_cast<llvm::VectorType>(sel->getType()); + llvm::VectorType *int_type = llvm::VectorType::getInteger(orig_type); + + // Result value and optional pointer to instruction to delete + llvm::Instruction *result = 0, *optional_to_delete = 0; + + // It can be a vector of integers or a vector of floating point values. + if (orig_type->getElementType()->isIntegerTy()) { + // Generate sext+and, remove select. + result = llvm::BinaryOperator::CreateAnd(sext, sel->getTrueValue(), "and_mask", sel); + } else { + llvm::BitCastInst* bc = llvm::dyn_cast<llvm::BitCastInst>(sel->getTrueValue()); + + if (bc && bc->hasOneUse() && bc->getSrcTy()->isIntOrIntVectorTy() && bc->getSrcTy()->isVectorTy() && + llvm::isa<llvm::Instruction>(bc->getOperand(0)) && + llvm::dyn_cast<llvm::Instruction>(bc->getOperand(0))->getParent() == sel->getParent()) { + // The bitcast is casting from an integer type, and its operand is an instruction located in the same basic block (otherwise it's unsafe to use it).
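+            // i.e. select(%cmp, bitcast(%x), zeroinitializer) is rewritten below as bitcast(and(sext(%cmp), %x)); the mask is applied to the integer value directly and the original bitcast becomes dead.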
+ // bitcast+select => sext+and+bitcast + // Create and + llvm::BinaryOperator* and_inst = llvm::BinaryOperator::CreateAnd(sext, bc->getOperand(0), "and_mask", sel); + // Bitcast back to original type + result = new llvm::BitCastInst(and_inst, sel->getType(), "bitcast_mask_out", sel); + // Original bitcast will be removed + optional_to_delete = bc; + } else { + // General case: select => bitcast+sext+and+bitcast + // Bitcast + llvm::BitCastInst* bc_in = new llvm::BitCastInst(sel->getTrueValue(), int_type, "bitcast_mask_in", sel); + // And + llvm::BinaryOperator* and_inst = llvm::BinaryOperator::CreateAnd(sext, bc_in, "and_mask", sel); + // Bitcast back to original type + result = new llvm::BitCastInst(and_inst, sel->getType(), "bitcast_mask_out", sel); + } + } + + // Done, finalize. + sel->replaceAllUsesWith(result); + sel->eraseFromParent(); + if (optional_to_delete) { + optional_to_delete->eraseFromParent(); + } + + return result; +} + +bool +FixBooleanSelectPass::runOnFunction(llvm::Function &F) { + bool modifiedAny = false; + + // LLVM 3.3 only +#if defined(LLVM_3_3) + + for (llvm::Function::iterator I = F.begin(), E = F.end(); + I != E; ++I) { + llvm::BasicBlock* bb = &*I; + for (llvm::BasicBlock::iterator iter = bb->begin(), e = bb->end(); iter != e; ++iter) { + llvm::Instruction *inst = &*iter; + + llvm::CmpInst *cmp = llvm::dyn_cast<llvm::CmpInst>(inst); + + if (cmp && + cmp->getType()->isVectorTy() && + cmp->getType()->getVectorElementType()->isIntegerTy(1)) { + + // Search for select instruction uses. + int selects = 0; + llvm::VectorType* sext_type = 0; + for (llvm::Instruction::use_iterator it=cmp->use_begin(); it!=cmp->use_end(); ++it ) { + llvm::SelectInst* sel = llvm::dyn_cast<llvm::SelectInst>(*it); + if (sel && + sel->getType()->isVectorTy() && + sel->getType()->getScalarSizeInBits() > 1) { + selects++; + // We pick the first one; in the typical case all select types are the same. + sext_type = llvm::dyn_cast<llvm::VectorType>(sel->getType()); + break; + } + } + if (selects == 0) { + continue; + } + // Get an integer equivalent, if it's not yet an integer. + sext_type = llvm::VectorType::getInteger(sext_type); + + // Do the transformation + llvm::BasicBlock::iterator iter_copy=iter; + llvm::Instruction* next_inst = &*(++iter_copy); + // Create or reuse sext + llvm::SExtInst* sext = llvm::dyn_cast<llvm::SExtInst>(next_inst); + if (sext && + sext->getOperand(0) == cmp && + sext->getDestTy() == sext_type) { + // This sext can be reused + } else { + if (next_inst) { + sext = new llvm::SExtInst(cmp, sext_type, "sext_cmp", next_inst); + } else { + sext = new llvm::SExtInst(cmp, sext_type, "sext_cmp", bb); + } + } + + // Walk and fix selects + std::vector<llvm::SelectInst*> sel_uses; + for (llvm::Instruction::use_iterator it=cmp->use_begin(); it!=cmp->use_end(); ++it) { + llvm::SelectInst* sel = llvm::dyn_cast<llvm::SelectInst>(*it); + if (sel && + sel->getType()->getScalarSizeInBits() == sext_type->getScalarSizeInBits()) { + + // Check that the second operand is zero.
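+                    // (only select(cmp, x, 0) is equivalent to the and-mask pattern being restored; selects with a non-zero false value are left alone)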
+ llvm::Constant* false_cond = llvm::dyn_cast(sel->getFalseValue()); + if (false_cond && + false_cond->isZeroValue()) { + sel_uses.push_back(sel); + modifiedAny = true; + } + } + } + + for (int i=0; iGetAsConstType(), - (float)yylval.floatVal, @1); + yylval.floatVal, @1); + } + | TOKEN_DOUBLE_CONSTANT { + $$ = new ConstExpr(AtomicType::UniformDouble->GetAsConstType(), + yylval.doubleVal, @1); } | TOKEN_TRUE { $$ = new ConstExpr(AtomicType::UniformBool->GetAsConstType(), true, @1); @@ -463,27 +468,27 @@ cast_expression multiplicative_expression : cast_expression | multiplicative_expression '*' cast_expression - { $$ = new BinaryExpr(BinaryExpr::Mul, $1, $3, Union(@1, @3)); } + { $$ = MakeBinaryExpr(BinaryExpr::Mul, $1, $3, Union(@1, @3)); } | multiplicative_expression '/' cast_expression - { $$ = new BinaryExpr(BinaryExpr::Div, $1, $3, Union(@1, @3)); } + { $$ = MakeBinaryExpr(BinaryExpr::Div, $1, $3, Union(@1, @3)); } | multiplicative_expression '%' cast_expression - { $$ = new BinaryExpr(BinaryExpr::Mod, $1, $3, Union(@1, @3)); } + { $$ = MakeBinaryExpr(BinaryExpr::Mod, $1, $3, Union(@1, @3)); } ; additive_expression : multiplicative_expression | additive_expression '+' multiplicative_expression - { $$ = new BinaryExpr(BinaryExpr::Add, $1, $3, Union(@1, @3)); } + { $$ = MakeBinaryExpr(BinaryExpr::Add, $1, $3, Union(@1, @3)); } | additive_expression '-' multiplicative_expression - { $$ = new BinaryExpr(BinaryExpr::Sub, $1, $3, Union(@1, @3)); } + { $$ = MakeBinaryExpr(BinaryExpr::Sub, $1, $3, Union(@1, @3)); } ; shift_expression : additive_expression | shift_expression TOKEN_LEFT_OP additive_expression - { $$ = new BinaryExpr(BinaryExpr::Shl, $1, $3, Union(@1,@3)); } + { $$ = MakeBinaryExpr(BinaryExpr::Shl, $1, $3, Union(@1, @3)); } | shift_expression TOKEN_RIGHT_OP additive_expression - { $$ = new BinaryExpr(BinaryExpr::Shr, $1, $3, Union(@1,@3)); } + { $$ = MakeBinaryExpr(BinaryExpr::Shr, $1, $3, Union(@1, @3)); } ; relational_expression @@ -2183,6 +2188,9 @@ static void lAddMaskToSymbolTable(SourcePos pos) { case 32: t = AtomicType::VaryingUInt32; break; + case 64: + t = AtomicType::VaryingUInt64; + break; default: FATAL("Unhandled mask bitsize in lAddMaskToSymbolTable"); } diff --git a/examples/perf.ini b/perf.ini similarity index 84% rename from examples/perf.ini rename to perf.ini index d2a5c73e..249c25f4 100755 --- a/examples/perf.ini +++ b/perf.ini @@ -10,44 +10,48 @@ %**************************************************************************************************** AOBench aobench -ao 10 512 512 +10 512 512 #*** Deferred Shading deferred -deferred_shading data/pp1280x720.bin +data/pp1280x720.bin #*** Mandelbrot Set mandelbrot -mandelbrot + #*** Mandelbrot Set mandelbrot_tasks -mandelbrot_tasks + ^ #*** Perlin Noise Function noise -noise + #*** Binomial Options options -options + ! 1 2 #*** Black-Scholes Options options -options + ! 2 2 #*** Ray Tracer rt -rt sponza +sponza #*** 3D Stencil stencil -stencil + #*** Volume Rendering volume_rendering -volume camera.dat density_highres.vol +camera.dat density_highres.vol +#*** +Sort +sort +1000000 1 #*** diff --git a/perf.py b/perf.py new file mode 100755 index 00000000..d1134990 --- /dev/null +++ b/perf.py @@ -0,0 +1,575 @@ +#!/usr/bin/python +# +# Copyright (c) 2013, Intel Corporation +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +# IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +# OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# // Author: Filippov Ilia + +def print_file(line): + if options.output != "": + output = open(options.output, 'w') + output.writelines(line) + output.close() + +def build_test(commands): + os.system(commands[4]) + test = os.system(commands[1]) + if is_windows: + common.remove_if_exists(".\\X64\\Release1") + if (test == 0): + os.rename(".\\X64\\Release", ".\\X64\\Release1") + if options.ref: + ref = os.system(commands[3]) + return (options.ref and ref) or test + +def execute_test(commands): + r = 0 + common.remove_if_exists(perf_temp+"_test") + common.remove_if_exists(perf_temp+"_ref") + for k in range(int(options.number)): + r = r + os.system(commands[0]) + if options.ref: + r = r + os.system(commands[2]) + return r + +#gathers all test results and makes an item 'test' of the answer structure +def run_test(commands, c1, c2, test, test_ref, b_serial): + if build_test(commands) != 0: + error("Compilation of test %s failed\n" % test[0], 0) + return + if execute_test(commands) != 0: + error("Execution of test %s failed\n" % test[0], 0) + return + print_debug("TEST COMPILER:\n", s, perf_log) + analyse_test(c1, c2, test, b_serial, perf_temp+"_test") + if options.ref: + print_debug("REFERENCE COMPILER:\n", s, perf_log) + analyse_test(c1, c2, test_ref, b_serial, perf_temp+"_ref") + + +def analyse_test(c1, c2, test, b_serial, perf_temp_n): + tasks = [] #list of results with tasks, it will be test[2] + ispc = [] #list of results without tasks, it will be test[1] + absolute_tasks = [] #list of absolute results with tasks, it will be test[4] + absolute_ispc = [] #list of absolute results without tasks, it will be test[3] + serial = [] #list of serial times, it will be test[5] + j = 1 + for line in open(perf_temp_n): # we take test output + if "speedup" in line: # we are interested only in lines with speedup + if j == c1: # we are interested only in lines with c1 numbers + line = line.expandtabs(0) + line = line.replace("("," ") + line = line.split(",") + for i in range(len(line)): + subline = line[i].split(" ") + number =
float(subline[1][:-1]) + if "speedup from ISPC + tasks" in line[i]: + tasks.append(number) + else: + ispc.append(number) + c1 = c1 + c2 + j+=1 + if "million cycles" in line: + if j == c1: + line = line.replace("]","[") + line = line.split("[") + number = float(line[3]) + if "tasks" in line[1]: + absolute_tasks.append(number) + else: + if "ispc" in line[1]: + absolute_ispc.append(number) + if "serial" in line[1]: + serial.append(number) + + if len(ispc) != 0: + if len(tasks) != 0: + print_debug("ISPC speedup / ISPC + tasks speedup / ISPC time / ISPC + tasks time / serial time\n", s, perf_log) + for i in range(0,len(serial)): + print_debug("%10s /\t%10s\t /%9s / %10s\t /%10s\n" % + (ispc[i], tasks[i], absolute_ispc[i], absolute_tasks[i], serial[i]), s, perf_log) + else: + print_debug("ISPC speedup / ISPC time / serial time\n", s, perf_log) + for i in range(0,len(serial)): + print_debug("%10s /%9s /%10s\n" % (ispc[i], absolute_ispc[i], serial[i]), s, perf_log) + else: + if len(tasks) != 0: + print_debug("ISPC + tasks speedup / ISPC + tasks time / serial time\n", s, perf_log) + for i in range(0,len(serial)): + print_debug("%10s\t / %10s\t /%10s\n" % (tasks[i], absolute_tasks[i], serial[i]), s, perf_log) + + test[1] = test[1] + ispc + test[2] = test[2] + tasks + test[3] = test[3] + absolute_ispc + test[4] = test[4] + absolute_tasks + if b_serial == True: + #if we concatenate outputs we should use only the first serial answer. + test[5] = test[5] + serial + +def cpu_get(): + p = open("/proc/stat", 'r') + cpu = p.readline() + p.close() + cpu = cpu.split(" ") + cpu_usage = (int(cpu[2]) + int(cpu[3]) + int(cpu[4])) + cpu_all = cpu_usage + int(cpu[5]) + return [cpu_usage, cpu_all] + +#returns cpu_usage +def cpu_check(): + if is_windows == False: + if is_mac == False: + cpu1 = cpu_get() + time.sleep(1) + cpu2 = cpu_get() + cpu_percent = (float(cpu1[0] - cpu2[0])/float(cpu1[1] - cpu2[1]))*100 + else: + os.system("sysctl -n vm.loadavg > cpu_temp") + c = open("cpu_temp", 'r') + c_line = c.readline() + c.close + os.remove("cpu_temp") + R = c_line.split(' ') + cpu_percent = float(R[1]) * 3 + else: + os.system("wmic cpu get loadpercentage /value > cpu_temp") + c = open("cpu_temp", 'r') + c_lines = c.readlines() + c.close() + os.remove("cpu_temp") + t = "0" + for i in c_lines[2]: + if i.isdigit(): + t = t + i + cpu_percent = int(t) + return cpu_percent + +#returns geomean of list +def geomean(par): + temp = 1 + l = len(par) + for i in range(l): + temp = temp * par[i] + if l != 0: + temp = temp ** (1.0/l) + else: + temp = 0 + return round(temp, 2) + +#takes an answer struct and print it. 
+#answer struct: list answer contains lists test +#test[0] - name of test +#test[1] - list of results without tasks +#test[2] - list of results with tasks +#test[3] - list of absolute results without tasks +#test[4] - list of absolute results with tasks +#test[5] - list of absolute time without ISPC (serial) +#test[1..4] may be empty +def print_answer(answer, target_number): + filelist = [] + print_debug("--------------------------------------------------------------------------\n", s, perf_log) + print_debug("test name:\t ISPC speedup: ISPC + tasks speedup: | " + + " ISPC time: ISPC + tasks time: serial:\n", s, perf_log) + if target_number > 1: + if options.output == "": + options.output = "targets.csv" + filelist.append("test name,ISPC speedup" + "," * target_number + "ISPC + tasks speedup\n") + filelist.append("," + options.perf_target + "," + options.perf_target + "\n") + else: + filelist.append("test name,ISPC speedup,diff," + + "ISPC + tasks speedup,diff,ISPC time,diff,ISPC + tasks time,diff,serial,diff\n") + max_t = [0,0,0,0,0] + diff_t = [0,0,0,0,0] + geomean_t = [] + list_of_max = [] + for i1 in range(target_number): + geomean_t.append([0,0,0,0,0]) + list_of_max.append([[],[],[],[],[]]) + list_of_compare = [[],[],[],[],[],[]] + target_k = 0 + temp_str_1 = "" + temp_str_2 = "" + for i in range(len(answer)): + list_of_compare[0].append(answer[i][0]) + for t in range(1,6): + if len(answer[i][t]) == 0: + max_t[t-1] = "n/a" + diff_t[t-1] = "n/a" + list_of_compare[t].append(0); + else: + if t < 3: + mm = max(answer[i][t]) + else: + mm = min(answer[i][t]) + list_of_compare[t].append(mm) + max_t[t-1] = '%.2f' % mm + list_of_max[i % target_number][t-1].append(mm) + diff_t[t-1] = '%.2f' % (max(answer[i][t]) - min(answer[i][t])) + print_debug("%s:\n" % answer[i][0], s, perf_log) + print_debug("\t\tmax:\t%5s\t\t%10s\t|min:%10s\t%10s\t%10s\n" % + (max_t[0], max_t[1], max_t[2], max_t[3], max_t[4]), s, perf_log) + print_debug("\t\tdiff:\t%5s\t\t%10s\t|%14s\t%10s\t%10s\n" % + (diff_t[0], diff_t[1], diff_t[2], diff_t[3], diff_t[4]), s, perf_log) + for t in range(0,5): + if max_t[t] == "n/a": + max_t[t] = "" + if diff_t[t] == "n/a": + diff_t[t] = "" + if target_number > 1: + if target_k == 0: + temp_str_1 = answer[i][0] + "," + temp_str_2 = "" + temp_str_1 += max_t[0] + "," + temp_str_2 += max_t[1] + "," + target_k = target_k + 1 + if target_k == target_number: + filelist.append(temp_str_1 + temp_str_2[:-1] + "\n") + target_k = 0 + else: + filelist.append(answer[i][0] + "," + + max_t[0] + "," + diff_t[0] + "," + max_t[1] + "," + diff_t[1] + "," + + max_t[2] + "," + diff_t[2] + "," + max_t[3] + "," + diff_t[3] + "," + + max_t[4] + "," + diff_t[4] + "\n") + for i in range(0,5): + for i1 in range(target_number): + geomean_t[i1][i] = geomean(list_of_max[i1][i]) + print_debug("---------------------------------------------------------------------------------\n", s, perf_log) + print_debug("Geomean:\t\t%5s\t\t%10s\t|%14s\t%10s\t%10s\n" % + (geomean_t[0][0], geomean_t[0][1], geomean_t[0][2], geomean_t[0][3], geomean_t[0][4]), s, perf_log) + if target_number > 1: + temp_str_1 = "Geomean," + temp_str_2 = "" + for i in range(target_number): + temp_str_1 += str(geomean_t[i][0]) + "," + temp_str_2 += str(geomean_t[i][1]) + "," + filelist.append(temp_str_1 + temp_str_2[:-1] + "\n") + else: + filelist.append("Geomean," + str(geomean_t[0][0]) + ",," + str(geomean_t[0][1]) + + ",," + str(geomean_t[0][2]) + ",," + str(geomean_t[0][3]) + ",," + str(geomean_t[0][4]) + "\n") + print_file(filelist) + return 
list_of_compare + + +def compare(A, B): + print_debug("\n\n_____________________PERFORMANCE REPORT____________________________\n", False, "") + print_debug("test name: ISPC time: ISPC time ref: %:\n", False, "") + for i in range(0,len(A[0])): + if B[3][i] == 0: + p1 = 0 + else: + p1 = 100 - 100 * A[3][i]/B[3][i] + print_debug("%21s: %10.2f %10.2f %10.2f" % (A[0][i], A[3][i], B[3][i], abs(p1)), False, "") + if p1 < -1: + print_debug(" <+", False, "") + if p1 > 1: + print_debug(" <-", False, "") + print_debug("\n", False, "") + print_debug("\n", False, "") + + print_debug("test name: TASKS time: TASKS time ref: %:\n", False, "") + for i in range(0,len(A[0])): + if B[4][i] == 0: + p2 = 0 + else: + p2 = 100 - 100 * A[4][i]/B[4][i] + print_debug("%21s: %10.2f %10.2f %10.2f" % (A[0][i], A[4][i], B[4][i], abs(p2)), False, "") + if p2 < -1: + print_debug(" <+", False, "") + if p2 > 1: + print_debug(" <-", False, "") + print_debug("\n", False, "") + if "performance.log" in options.in_file: + print_debug("\n\n_________________Watch performance.log for details________________\n", False, "") + else: + print_debug("\n\n__________________________________________________________________\n", False, "") + + + +def perf(options1, args): + global options + options = options1 + global s + s = options.silent + + # save current OS + global is_windows + is_windows = (platform.system() == 'Windows' or + 'CYGWIN_NT' in platform.system()) + global is_mac + is_mac = (platform.system() == 'Darwin') + + # save current path + pwd = os.getcwd() + pwd = pwd + os.sep + pwd1 = pwd + if is_windows: + pwd1 = "..\\..\\" + + if options.perf_target != "": + test_only_r = " sse2-i32x4 sse2-i32x8 sse4-i32x4 sse4-i32x8 sse4-i16x8 \ + sse4-i8x16 avx1-i32x4 avx1-i32x8 avx1-i32x16 avx1-i64x4 avx1.1-i32x8 \ + avx1.1-i32x16 avx1.1-i64x4 avx2-i32x8 avx2-i32x16 avx2-i64x4 " + test_only = options.perf_target.split(",") + for iterator in test_only: + if not (" " + iterator + " " in test_only_r): + error("unknow option for target: " + iterator, 1) + + # check if cpu usage is low now + cpu_percent = cpu_check() + if cpu_percent > 20: + error("CPU Usage is very high.\nClose other applications.\n", 2) + + # prepare build.log, perf_temp and perf.log files + global perf_log + if options.in_file: + perf_log = pwd + options.in_file + common.remove_if_exists(perf_log) + else: + perf_log = "" + global build_log + build_log = pwd + os.sep + "logs" + os.sep + "perf_build.log" + common.remove_if_exists(build_log) + if os.path.exists(pwd + os.sep + "logs") == False: + os.makedirs(pwd + os.sep + "logs") + global perf_temp + perf_temp = pwd + "perf_temp" + + + global ispc_test + global ispc_ref + global ref_compiler + global refc_compiler + # check that required compilers exist + PATH_dir = string.split(os.getenv("PATH"), os.pathsep) + ispc_test_exists = False + ispc_ref_exists = False + ref_compiler_exists = False + if is_windows == False: + ispc_test = "ispc" + ref_compiler = "clang++" + refc_compiler = "clang" + if options.compiler != "": + if options.compiler == "clang" or options.compiler == "clang++": + ref_compiler = "clang++" + refc_compiler = "clang" + if options.compiler == "icc" or options.compiler == "icpc": + ref_compiler = "icpc" + refc_compiler = "icc" + if options.compiler == "gcc" or options.compiler == "g++": + ref_compiler = "g++" + refc_compiler = "gcc" + else: + ispc_test = "ispc.exe" + ref_compiler = "cl.exe" + ispc_ref = options.ref + if options.ref != "": + options.ref = True + if os.environ.get("ISPC_HOME") != None: + if 
is_windows == False: + if os.path.exists(os.environ["ISPC_HOME"] + os.sep + ispc_test): + ispc_test_exists = True + ispc_test = os.environ["ISPC_HOME"] + os.sep + ispc_test + else: + if os.path.exists(os.environ["ISPC_HOME"] + "\\Release\\" + ispc_test): + ispc_test_exists = True + ispc_test = os.environ["ISPC_HOME"] + "\\Release\\" + ispc_test + for counter in PATH_dir: + if ispc_test_exists == False: + if os.path.exists(counter + os.sep + ispc_test): + ispc_test_exists = True + ispc_test = counter + os.sep + ispc_test + if os.path.exists(counter + os.sep + ref_compiler): + ref_compiler_exists = True + if os.path.exists(counter + os.sep + ispc_ref): + ispc_ref_exists = True + if not ispc_test_exists: + error("ISPC compiler not found.\nAdded path to ispc compiler to your PATH variable or ISPC_HOME variable\n", 1) + if not ref_compiler_exists: + error("C/C++ compiler %s not found.\nAdded path to %s compiler to your PATH variable.\n" % (ref_compiler, ref_compiler), 1) + if options.ref: + if not ispc_ref_exists: + error("ISPC reference compiler not found.\nAdded path to ispc reference compiler to your PATH variable.\n", 1) + + # checks that config file exists + path_config = os.path.normpath(options.config) + if os.path.exists(path_config) == False: + error("config file not found: %s.\nSet path to your config file in --config.\n" % options.config, 1) + sys.exit() + + # read lines from config file except comments + f = open(path_config, 'r') + f_lines = f.readlines() + f.close() + lines =[] + for i in range(len(f_lines)): + if f_lines[i][0] != "%": + lines.append(f_lines[i]) + length = len(lines) + # end of preparations + + print_debug("Okey go go go!\n\n", s, perf_log) + # report command line + if __name__ == "__main__": + print_debug("Command line: %s\n" % " ".join(map(str, sys.argv)), s, perf_log) + # report used ispc + print_debug("Testing ispc: " + ispc_test + "\n", s, perf_log) + + #print compilers versions + common.print_version(ispc_test, ispc_ref, ref_compiler, False, perf_log, is_windows) + + # begin + i = 0 + answer = [] + answer_ref = [] + + # loop for all tests + while i < length-2: + # we read name of test + print_debug("%s" % lines[i], s, perf_log) + # read location of test + folder = lines[i+1] + folder = folder[:-1] + folder = os.path.normpath(options.path + os.sep + "examples" + os.sep + folder) + # check that test exists + if os.path.exists(folder) == False: + error("Can't find test %s. 
Your path is: \"%s\".\nChange current location to ISPC_HOME or set path to ISPC_HOME in --path.\n" % + (lines[i][:-1], options.path), 1) + os.chdir(folder) + # read parameters of test + command = lines[i+2] + command = command[:-1] + # handle conditional target argument + target_str_temp = "" + target_out_temp = "" + perf_targets = [""] + target_number = 1 + if options.perf_target != "": + perf_targets = options.perf_target.split(',') + target_str_temp = " ISPC_IA_TARGETS=" + target_out_temp = " /p:Target_str=" + target_number = len(perf_targets) + temp = 0 + for target_i in range(target_number): + test = [lines[i][:-1],[],[],[],[],[]] + test_ref = [lines[i][:-1],[],[],[],[],[]] + target_str = target_str_temp + perf_targets[target_i] + Target_out = target_out_temp + perf_targets[target_i] + if is_windows == False: + ex_command_ref = "./ref " + command + " >> " + perf_temp + "_ref" + ex_command = "./test " + command + " >> " + perf_temp + "_test" + bu_command_ref = "make CXX="+ref_compiler+" CC="+refc_compiler+ " EXAMPLE=ref ISPC="+ispc_ref+target_str+" >> "+build_log+" 2>> "+build_log + bu_command = "make CXX="+ref_compiler+" CC="+refc_compiler+ " EXAMPLE=test ISPC="+ispc_test+target_str+" >> "+build_log+" 2>> "+build_log + re_command = "make clean >> "+build_log + else: + ex_command_ref = "x64\\Release\\ref.exe " + command + " >> " + perf_temp + "_ref" + ex_command = "x64\\Release1\\test.exe " + command + " >> " + perf_temp + "_test" + bu_command_ref = "msbuild /V:m /p:Platform=x64 /p:Configuration=Release /p:TargetDir=.\ /p:TargetName=ref /p:ISPC_compiler=ispc_ref " + Target_out + " /t:rebuild >> " + build_log + bu_command = "msbuild /V:m /p:Platform=x64 /p:Configuration=Release /p:TargetDir=.\ /p:TargetName=test /p:ISPC_compiler=ispc " + Target_out + " /t:rebuild >> " + build_log + re_command = "msbuild /t:clean >> " + build_log + commands = [ex_command, bu_command, ex_command_ref, bu_command_ref, re_command] + # parsing config parameters + next_line = lines[i+3] + if next_line[0] == "!": # we should take only one part of test output + R = next_line.split(' ') + c1 = int(R[1]) #c1 is a number of string which we want to use in test output + c2 = int(R[2]) #c2 is total number of strings in test output + temp = 1 + else: + c1 = 1 + c2 = 1 + next_line = lines[i+3] + if next_line[0] == "^": + temp = 1 + if next_line[0] == "^" and target_number == 1: #we should concatenate result of this test with previous one + run_test(commands, c1, c2, answer[len(answer)-1], answer_ref[len(answer)-1], False) + else: #we run this test and append it's result to answer structure + run_test(commands, c1, c2, test, test_ref, True) + answer.append(test) + answer_ref.append(test_ref) + i = i + temp + # preparing next loop iteration + os.chdir(pwd1) + i+=4 + + # delete temp file + common.remove_if_exists(perf_temp+"_test") + common.remove_if_exists(perf_temp+"_ref") + + #print collected answer + if target_number > 1: + s = True + print_debug("\n\nTEST COMPILER:\n", s, perf_log) + A = print_answer(answer, target_number) + if options.ref != "": + print_debug("\n\nREFERENCE COMPILER:\n", s, perf_log) + B = print_answer(answer_ref) + # print perf report + compare(A,B) + + + +###Main### +from optparse import OptionParser +import sys +import os +import operator +import time +import glob +import string +import platform +# our functions +import common +print_debug = common.print_debug +error = common.error + +if __name__ == "__main__": + # parsing options + parser = OptionParser() + parser.add_option('-n', '--number', 
dest='number', + help='number of repeats', default="3") + parser.add_option('-c', '--config', dest='config', + help='config file of tests', default="./perf.ini") + parser.add_option('-p', '--path', dest='path', + help='path to ispc root', default=".") + parser.add_option('-s', '--silent', dest='silent', + help='silent mode, only table output', default=False, action="store_true") + parser.add_option('-o', '--output', dest='output', + help='output file for script reading', default="") + parser.add_option('--compiler', dest='compiler', + help='C/C++ compiler', default="") + parser.add_option('-r', '--ref', dest='ref', + help='set reference compiler for compare', default="") + parser.add_option('-f', '--file', dest='in_file', + help='file to save perf output', default="") + parser.add_option('-t', '--target', dest='perf_target', + help='set ispc target for building benchmarks (both test and ref)', default="") + (options, args) = parser.parse_args() + perf(options, args) diff --git a/run_tests.py b/run_tests.py index 9729930f..e6429861 100755 --- a/run_tests.py +++ b/run_tests.py @@ -1,165 +1,37 @@ #!/usr/bin/python +# +# Copyright (c) 2013, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +# IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +# OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # test-running driver for ispc - -from optparse import OptionParser -import multiprocessing -from ctypes import c_int -import os -import sys -import glob -import re -import signal -import random -import string -import subprocess -import shlex -import platform -import tempfile -import os.path -import time - -# disable fancy error/warning printing with ANSI colors, so grepping for error -# messages doesn't get confused -os.environ["TERM"] = "dumb" - -# This script is affected by http://bugs.python.org/issue5261 on OSX 10.5 Leopard -# git history has a workaround for that issue. 
- -is_windows = (platform.system() == 'Windows' or - 'CYGWIN_NT' in platform.system()) - -parser = OptionParser() -parser.add_option("-r", "--random-shuffle", dest="random", help="Randomly order tests", - default=False, action="store_true") -parser.add_option("-g", "--generics-include", dest="include_file", help="Filename for header implementing functions for generics", - default=None) -parser.add_option("-f", "--ispc-flags", dest="ispc_flags", help="Additional flags for ispc (-g, -O1, ...)", - default="") -parser.add_option('-t', '--target', dest='target', - help='Set compilation target (sse2-i32x4, sse2-i32x8, sse4-i32x4, sse4-i32x8, sse4-i16x8, sse4-i8x16, avx1-i32x8, avx1-i32x16, avx1.1-i32x8, avx1.1-i32x16, avx2-i32x8, avx2-i32x16, generic-x1, generic-x4, generic-x8, generic-x16, generic-x32, generic-x64)', - default="sse4") -parser.add_option('-a', '--arch', dest='arch', - help='Set architecture (arm, x86, x86-64)', - default="x86-64") -parser.add_option("-c", "--compiler", dest="compiler_exe", help="Compiler binary to use to run tests", - default=None) -parser.add_option('-o', '--no-opt', dest='no_opt', help='Disable optimization', - default=False, action="store_true") -parser.add_option('-j', '--jobs', dest='num_jobs', help='Maximum number of jobs to run in parallel', - default="1024", type="int") -parser.add_option('-v', '--verbose', dest='verbose', help='Enable verbose output', - default=False, action="store_true") -parser.add_option('--wrap-exe', dest='wrapexe', - help='Executable to wrap test runs with (e.g. "valgrind")', - default="") -parser.add_option('--time', dest='time', help='Enable time output', - default=False, action="store_true") -parser.add_option('--non-interactive', dest='non_interactive', help='Disable interactive status updates', - default=False, action="store_true") - -(options, args) = parser.parse_args() - -if options.target == 'neon': - options.arch = 'arm' - -# use relative path to not depend on host directory, which may possibly -# have white spaces and unicode characters. 
-if not is_windows: - ispc_exe = "./ispc" -else: - ispc_exe = ".\\Release\\ispc.exe" - -# checks the required ispc compiler otherwise prints an error message -if not os.path.exists(ispc_exe): - sys.stderr.write("Fatal error: missing ispc compiler: %s\n" % ispc_exe) - sys.exit() - -ispc_exe += " " + options.ispc_flags - -if __name__ == '__main__': - sys.stdout.write("ispc compiler: %s\n" % ispc_exe) - -is_generic_target = (options.target.find("generic-") != -1 and - options.target != "generic-1") -if is_generic_target and options.include_file == None: - if options.target == "generic-4": - sys.stderr.write("No generics #include specified; using examples/intrinsics/sse4.h\n") - options.include_file = "examples/intrinsics/sse4.h" - elif options.target == "generic-8": - sys.stderr.write("No generics #include specified and no default available for \"generic-8\" target.\n") - sys.exit(1) - elif options.target == "generic-16": - sys.stderr.write("No generics #include specified; using examples/intrinsics/generic-16.h\n") - options.include_file = "examples/intrinsics/generic-16.h" - elif options.target == "generic-32": - sys.stderr.write("No generics #include specified; using examples/intrinsics/generic-32.h\n") - options.include_file = "examples/intrinsics/generic-32.h" - elif options.target == "generic-64": - sys.stderr.write("No generics #include specified; using examples/intrinsics/generic-64.h\n") - options.include_file = "examples/intrinsics/generic-64.h" - -if options.compiler_exe == None: - if is_windows: - options.compiler_exe = "cl.exe" - else: - options.compiler_exe = "g++" - -# checks the required compiler otherwise prints an error message -PATH_dir = string.split(os.getenv("PATH"), os.pathsep) -compiler_exists = False - -for counter in PATH_dir: - if os.path.exists(counter + os.sep + options.compiler_exe): - compiler_exists = True - break - -if not compiler_exists: - sys.stderr.write("Fatal error: missing the required compiler: %s \n" % - options.compiler_exe) - sys.exit() - -ispc_root = "." - -# if no specific test files are specified, run all of the tests in tests/, -# failing_tests/, and tests_errors/ -if len(args) == 0: - files = glob.glob(ispc_root + os.sep + "tests" + os.sep + "*ispc") + \ - glob.glob(ispc_root + os.sep + "failing_tests" + os.sep + "*ispc") + \ - glob.glob(ispc_root + os.sep + "tests_errors" + os.sep + "*ispc") -else: - if is_windows: - argfiles = [ ] - for f in args: - # we have to glob ourselves if this is being run under a DOS - # shell, as it passes wildcard as is. - argfiles += glob.glob(f) - else: - argfiles = args - - files = [ ] - for f in argfiles: - if os.path.splitext(string.lower(f))[1] != ".ispc": - sys.stdout.write("Ignoring file %s, which doesn't have an .ispc extension.\n" % f) - else: - files += [ f ] - -# max_test_length is used to issue exact number of whitespace characters when -# updating status. Otherwise update causes new lines standard 80 char terminal -# on both Linux and Windows. -max_test_length = 0 -for f in files: - max_test_length = max(max_test_length, len(f)) - -# randomly shuffle the tests if asked to do so -if (options.random): - random.seed() - random.shuffle(files) - -# counter -total_tests = 0 - - # utility routine to print an update on the number of tests that have been # finished. Should be called with the lock held.. 
def update_progress(fn, total_tests_arg, counter, max_test_length_arg): @@ -176,7 +48,7 @@ def update_progress(fn, total_tests_arg, counter, max_test_length_arg): def run_command(cmd): if options.verbose: - sys.stdout.write("Running: %s\n" % cmd) + print_debug("Running: %s\n" % cmd, s, run_tests_log) # Here's a bit tricky part. To pass a command for execution we should # break down the line in to arguments. shlex class is designed exactly @@ -204,9 +76,9 @@ def run_cmds(compile_cmds, run_cmd, filename, expect_failure): (return_code, output) = run_command(cmd) compile_failed = (return_code != 0) if compile_failed: - sys.stdout.write("Compilation of test %s failed \n" % filename) + print_debug("Compilation of test %s failed \n" % filename, s, run_tests_log) if output != "": - sys.stdout.write("%s" % output.encode("utf-8")) + print_debug("%s" % output.encode("utf-8"), s, run_tests_log) return (1, 0) (return_code, output) = run_command(run_cmd) @@ -215,11 +87,11 @@ def run_cmds(compile_cmds, run_cmd, filename, expect_failure): surprise = ((expect_failure and not run_failed) or (not expect_failure and run_failed)) if surprise == True: - sys.stderr.write("Test %s %s (return code %d) \n" % \ + print_debug("Test %s %s (return code %d) \n" % \ (filename, "unexpectedly passed" if expect_failure else "failed", - return_code)) + return_code), s, run_tests_log) if output != "": - sys.stdout.write("%s\n" % output.encode("utf-8")) + print_debug("%s\n" % output.encode("utf-8"), s, run_tests_log) if surprise == True: return (0, 1) else: @@ -298,11 +170,11 @@ def run_test(testname): file.close() if re.search(firstline, output) == None: - sys.stderr.write("Didn't see expected error message %s from test %s.\nActual output:\n%s\n" % \ - (firstline, testname, output)) + print_debug("Didn't see expected error message %s from test %s.\nActual output:\n%s\n" % \ + (firstline, testname, output), s, run_tests_log) return (1, 0) elif got_error == False: - sys.stderr.write("Unexpectedly no errors issued from test %s\n" % testname) + print_debug("Unexpectedly no errors issued from test %s\n" % testname, s, run_tests_log) return (1, 0) else: return (0, 0) @@ -328,8 +200,7 @@ def run_test(testname): break file.close() if match == -1: - sys.stderr.write("Fatal error: unable to find function signature " + \ - "in test %s\n" % testname) + error("unable to find function signature in test %s\n" % testname, 0) return (1, 0) else: global is_generic_target @@ -362,10 +233,13 @@ def run_test(testname): gcc_isa="" if options.target == 'generic-4': gcc_isa = '-msse4.2' - if options.target == 'generic-8': + if (options.target == 'generic-8'): + if (options.include_file.find("knc-i1x8.h")!=-1 or options.include_file.find("knc-i1x8unsafe_fast.h")!=-1): + gcc_isa = '-mmic' + else: gcc_isa = '-mavx' if (options.target == 'generic-16' or options.target == 'generic-32' or options.target == 'generic-64') \ - and (options.include_file.find("knc.h")!=-1 or options.include_file.find("knc2x.h")!=-1): + and (options.include_file.find("knc-i1x16.h")!=-1 or options.include_file.find("knc.h")!=-1 or options.include_file.find("knc2x.h")!=-1): gcc_isa = '-mmic' cc_cmd = "%s -O2 -I. %s %s test_static.cpp -DTEST_SIG=%d %s -o %s" % \ @@ -404,7 +278,21 @@ def run_test(testname): # pull tests to run from the given queue and run them. Multiple copies of # this function will be running in parallel across all of the CPU cores of # the system. 
-def run_tasks_from_queue(queue, queue_ret, queue_skip, total_tests_arg, max_test_length_arg, counter, mutex): +def run_tasks_from_queue(queue, queue_ret, queue_skip, total_tests_arg, max_test_length_arg, counter, mutex, glob_var): + # This is needed on windows because windows doen't copy globals from parent process whili multiprocessing + global is_windows + is_windows = glob_var[0] + global options + options = glob_var[1] + global s + s = glob_var[2] + global ispc_exe + ispc_exe = glob_var[3] + global is_generic_target + is_generic_target = glob_var[4] + global run_tests_log + run_tests_log = glob_var[5] + if is_windows: tmpdir = "tmp%d" % os.getpid() os.mkdir(tmpdir) @@ -447,14 +335,282 @@ def run_tasks_from_queue(queue, queue_ret, queue_skip, total_tests_arg, max_test skip_files += [ filename ] -task_threads = [] - def sigint(signum, frame): for t in task_threads: t.terminate() sys.exit(1) -if __name__ == '__main__': + +def file_check(compfails, runfails): + errors = len(compfails) + len(runfails) + new_compfails = [] + new_runfails = [] + new_passes_compfails = [] + new_passes_runfails = [] +# Open file fail_db.txt + f = open(test_states, 'r') + f_lines = f.readlines() + f.close() +# Detect OS + if platform.system() == 'Windows' or 'CYGWIN_NT' in platform.system(): + OS = "Windows" + else: + if platform.system() == 'Darwin': + OS = "Mac" + else: + OS = "Linux" +# Detect opt_set + if options.no_opt == True: + opt = "-O0" + else: + opt = "-O2" +# Detect LLVM version + temp1 = common.take_lines(ispc_exe + " --version", "first") + llvm_version = temp1[-10:-2] +# Detect compiler version + if is_windows == False: + temp1 = common.take_lines(options.compiler_exe + " --version", "first") + temp2 = re.search("[0-9]*\.[0-9]*\.[0-9]", temp1) + if temp2 == None: + temp3 = re.search("[0-9]*\.[0-9]*", temp1) + else: + temp3 = re.search("[0-9]*\.[0-9]*", temp2.group()) + compiler_version = options.compiler_exe + temp3.group() + else: + compiler_version = "cl" + possible_compilers = ["g++4.4", "g++4.7", "clang++3.3", "cl"] + if not compiler_version in possible_compilers: + error("\n**********\nWe don't have history of fails for compiler " + + compiler_version + + "\nAll fails will be new!!!\n**********", 2) + new_line = " "+options.arch.rjust(6)+" "+options.target.rjust(14)+" "+OS.rjust(7)+" "+llvm_version+" "+compiler_version.rjust(10)+" "+opt+" *\n" + + new_compfails = compfails[:] + new_runfails = runfails[:] + new_f_lines = f_lines[:] + for j in range(0, len(f_lines)): + if (((" "+options.arch+" ") in f_lines[j]) and + ((" "+options.target+" ") in f_lines[j]) and + ((" "+OS+" ") in f_lines[j]) and + ((" "+llvm_version+" ") in f_lines[j]) and + ((" "+compiler_version+" ") in f_lines[j]) and + ((" "+opt+" ") in f_lines[j])): + if (" compfail " in f_lines[j]): + f = 0 + for i in range(0, len(compfails)): + if compfails[i] in f_lines[j]: + new_compfails.remove(compfails[i]) + else: + f = f + 1 + if f == len(compfails): + temp3 = f_lines[j].split(" ") + new_passes_compfails.append(temp3[0]) + if options.update == "FP": + new_f_lines.remove(f_lines[j]) + if (" runfail " in f_lines[j]): + f = 0 + for i in range(0, len(runfails)): + if runfails[i] in f_lines[j]: + new_runfails.remove(runfails[i]) + else: + f = f + 1 + if f == len(runfails): + temp3 = f_lines[j].split(" ") + new_passes_runfails.append(temp3[0]) + if options.update == "FP": + new_f_lines.remove(f_lines[j]) + if len(new_runfails) != 0: + print_debug("NEW RUNFAILS:\n", s, run_tests_log) + for i in range (0,len(new_runfails)): + 
new_f_lines.append(new_runfails[i] + " runfail " + new_line) + print_debug("\t" + new_runfails[i] + "\n", s, run_tests_log) + if len(new_compfails) != 0: + print_debug("NEW COMPFAILS:\n", s, run_tests_log) + for i in range (0,len(new_compfails)): + new_f_lines.append(new_compfails[i] + " compfail " + new_line) + print_debug("\t" + new_compfails[i] + "\n", s, run_tests_log) + if len(new_runfails) == 0 and len(new_compfails) == 0: + print_debug("No new fails\n", s, run_tests_log) + if len(new_passes_runfails) != 0: + print_debug("NEW PASSES after RUNFAILS:\n", s, run_tests_log) + for i in range (0,len(new_passes_runfails)): + print_debug("\t" + new_passes_runfails[i] + "\n", s, run_tests_log) + if len(new_passes_compfails) != 0: + print_debug("NEW PASSES after COMPFAILS:\n", s, run_tests_log) + for i in range (0,len(new_passes_compfails)): + print_debug("\t" + new_passes_compfails[i] + "\n", s, run_tests_log) + + if options.update != "": + output = open(test_states, 'w') + output.writelines(new_f_lines) + output.close() + return [new_runfails, new_compfails, new_passes_runfails, new_passes_compfails, new_line, errors] + +def verify(): + # Open file fail_db.txt + f = open(test_states, 'r') + f_lines = f.readlines() + f.close() + check = [["g++", "clang++", "cl"],["-O0", "-O2"],["x86","x86-64"], + ["Linux","Windows","Mac"],["LLVM 3.1","LLVM 3.2","LLVM 3.3","LLVM 3.4","LLVM trunk"], + ["sse2-i32x4", "sse2-i32x8", "sse4-i32x4", "sse4-i32x8", "sse4-i16x8", + "sse4-i8x16", "avx1-i32x4" "avx1-i32x8", "avx1-i32x16", "avx1-i64x4", "avx1.1-i32x8", + "avx1.1-i32x16", "avx1.1-i64x4", "avx2-i32x8", "avx2-i32x16", "avx2-i64x4", + "generic-1", "generic-4", "generic-8", + "generic-16", "generic-32", "generic-64"]] + for i in range (0,len(f_lines)): + if f_lines[i][0] == "%": + continue + for j in range(0,len(check)): + temp = 0 + for t in range(0,len(check[j])): + if " " + check[j][t] + " " in f_lines[i]: + temp = temp + 1 + if temp != 1: + print_debug("error in line " + str(i) + "\n", False, run_tests_log) + break + + +def run_tests(options1, args, print_version): + global options + options = options1 + global s + s = options.silent + + # prepare run_tests_log and fail_db files + global run_tests_log + if options.in_file: + run_tests_log = os.getcwd() + os.sep + options.in_file + if print_version == 1: + common.remove_if_exists(run_tests_log) + else: + run_tests_log = "" + global test_states + test_states = "fail_db.txt" + if options.verify: + verify() + return 0 + + # disable fancy error/warning printing with ANSI colors, so grepping for error + # messages doesn't get confused + os.environ["TERM"] = "dumb" + + # This script is affected by http://bugs.python.org/issue5261 on OSX 10.5 Leopard + # git history has a workaround for that issue. + global is_windows + is_windows = (platform.system() == 'Windows' or + 'CYGWIN_NT' in platform.system()) + + if options.target == 'neon': + options.arch = 'arm' + + # use relative path to not depend on host directory, which may possibly + # have white spaces and unicode characters. 
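+    # the ispc binary is looked up via ISPC_HOME first and then in each directory on PATH (on Windows, .\Release\ispc.exe is expected)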
+ global ispc_exe + ispc_exe = "" + if not is_windows: + if os.environ.get("ISPC_HOME") != None: + if os.path.exists(os.environ["ISPC_HOME"] + os.sep + "ispc"): + ispc_exe = os.environ["ISPC_HOME"] + os.sep + "ispc" + else: + PATH_dir = string.split(os.getenv("PATH"), os.pathsep) + for counter in PATH_dir: + if os.path.exists(counter + os.sep + "ispc"): + ispc_exe = counter + os.sep + "ispc" + else: + if os.path.exists(".\\Release\\ispc.exe"): + ispc_exe = ".\\Release\\ispc.exe" + else: + error("You don't have ispc.exe compiler in .\\Release.\n", 1) + # checks the required ispc compiler otherwise prints an error message + if ispc_exe == "": + error("ISPC compiler not found.\nAdded path to ispc compiler to your PATH variable or ISPC_HOME variable\n", 1) + print_debug("Testing ispc: " + ispc_exe + "\n", s, run_tests_log) + ispc_exe += " " + options.ispc_flags + + global is_generic_target + is_generic_target = (options.target.find("generic-") != -1 and + options.target != "generic-1" and options.target != "generic-x1") + if is_generic_target and options.include_file == None: + if options.target == "generic-4" or options.target == "generic-x4": + error("No generics #include specified; using examples/intrinsics/sse4.h\n", 2) + options.include_file = "examples/intrinsics/sse4.h" + options.target = "generic-4" + elif options.target == "generic-8" or options.target == "generic-x8": + error("No generics #include specified and no default available for \"generic-8\" target.\n", 1) + options.target = "generic-8" + elif options.target == "generic-16" or options.target == "generic-x16": + error("No generics #include specified; using examples/intrinsics/generic-16.h\n", 2) + options.include_file = "examples/intrinsics/generic-16.h" + options.target = "generic-16" + elif options.target == "generic-32" or options.target == "generic-x32": + error("No generics #include specified; using examples/intrinsics/generic-32.h\n", 2) + options.include_file = "examples/intrinsics/generic-32.h" + options.target = "generic-32" + elif options.target == "generic-64" or options.target == "generic-x64": + error("No generics #include specified; using examples/intrinsics/generic-64.h\n", 2) + options.include_file = "examples/intrinsics/generic-64.h" + options.target = "generic-64" + + if options.compiler_exe == None: + if is_windows: + options.compiler_exe = "cl.exe" + else: + options.compiler_exe = "clang++" + + # checks the required compiler otherwise prints an error message + PATH_dir = string.split(os.getenv("PATH"), os.pathsep) + compiler_exists = False + + for counter in PATH_dir: + if os.path.exists(counter + os.sep + options.compiler_exe): + compiler_exists = True + break + + if not compiler_exists: + error("missing the required compiler: %s \n" % options.compiler_exe, 1) + + # print compilers versions + if print_version > 0: + common.print_version(ispc_exe, "", options.compiler_exe, False, run_tests_log, is_windows) + + ispc_root = "." + + # if no specific test files are specified, run all of the tests in tests/, + # failing_tests/, and tests_errors/ + if len(args) == 0: + files = glob.glob(ispc_root + os.sep + "tests" + os.sep + "*ispc") + \ + glob.glob(ispc_root + os.sep + "tests_errors" + os.sep + "*ispc") + else: + if is_windows: + argfiles = [ ] + for f in args: + # we have to glob ourselves if this is being run under a DOS + # shell, as it passes wildcard as is. 
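+            # (e.g. "tests\*.ispc" arrives unexpanded from cmd.exe, unlike POSIX shells, which expand wildcards before exec)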
+ argfiles += glob.glob(f) + else: + argfiles = args + + files = [ ] + for f in argfiles: + if os.path.splitext(string.lower(f))[1] != ".ispc": + error("Ignoring file %s, which doesn't have an .ispc extension.\n" % f, 2) + else: + files += [ f ] + + # max_test_length is used to issue exact number of whitespace characters when + # updating status. Otherwise update causes new lines standard 80 char terminal + # on both Linux and Windows. + max_test_length = 0 + for f in files: + max_test_length = max(max_test_length, len(f)) + + # randomly shuffle the tests if asked to do so + if (options.random): + random.seed() + random.shuffle(files) + + # counter total_tests = len(files) compile_error_files = [ ] @@ -463,7 +619,7 @@ if __name__ == '__main__': nthreads = min(multiprocessing.cpu_count(), options.num_jobs) nthreads = min(nthreads, len(files)) - sys.stdout.write("Running %d jobs in parallel. Running %d tests.\n" % (nthreads, total_tests)) + print_debug("Running %d jobs in parallel. Running %d tests.\n" % (nthreads, total_tests), s, run_tests_log) # put each of the test filenames into a queue q = multiprocessing.Queue() @@ -483,45 +639,116 @@ if __name__ == '__main__': start_time = time.time() # launch jobs to run tests + glob_var = [is_windows, options, s, ispc_exe, is_generic_target, run_tests_log] + global task_threads + task_threads = [0] * nthreads for x in range(nthreads): - t = multiprocessing.Process(target=run_tasks_from_queue, args=(q, qret, qskip, total_tests, max_test_length, finished_tests_counter, finished_tests_counter_lock)) - task_threads.append(t) - t.start() - + task_threads[x] = multiprocessing.Process(target=run_tasks_from_queue, args=(q, qret, qskip, total_tests, + max_test_length, finished_tests_counter, finished_tests_counter_lock, glob_var)) + task_threads[x].start() # wait for them to all finish and then return the number that failed # (i.e. 
return 0 if all is ok) for t in task_threads: t.join() if options.non_interactive == False: - sys.stdout.write("\n") + print_debug("\n", s, run_tests_log) - elapsed_time = time.time() - start_time + temp_time = (time.time() - start_time) + elapsed_time = time.strftime('%Hh%Mm%Ssec.', time.gmtime(temp_time)) while not qret.empty(): - (c, r, s) = qret.get() + (c, r, skip) = qret.get() compile_error_files += c run_error_files += r - skip_files += s + skip_files += skip if options.non_interactive: - sys.stdout.write(" Done %d / %d\n" % (finished_tests_counter.value, total_tests)) + print_debug(" Done %d / %d\n" % (finished_tests_counter.value, total_tests), s, run_tests_log) if len(skip_files) > 0: skip_files.sort() - sys.stdout.write("%d / %d tests SKIPPED:\n" % (len(skip_files), total_tests)) + print_debug("%d / %d tests SKIPPED:\n" % (len(skip_files), total_tests), s, run_tests_log) for f in skip_files: - sys.stdout.write("\t%s\n" % f) + print_debug("\t%s\n" % f, s, run_tests_log) if len(compile_error_files) > 0: compile_error_files.sort() - sys.stdout.write("%d / %d tests FAILED compilation:\n" % (len(compile_error_files), total_tests)) + print_debug("%d / %d tests FAILED compilation:\n" % (len(compile_error_files), total_tests), s, run_tests_log) for f in compile_error_files: - sys.stdout.write("\t%s\n" % f) + print_debug("\t%s\n" % f, s, run_tests_log) if len(run_error_files) > 0: run_error_files.sort() - sys.stdout.write("%d / %d tests FAILED execution:\n" % (len(run_error_files), total_tests)) + print_debug("%d / %d tests FAILED execution:\n" % (len(run_error_files), total_tests), s, run_tests_log) for f in run_error_files: - sys.stdout.write("\t%s\n" % f) + print_debug("\t%s\n" % f, s, run_tests_log) + if len(compile_error_files) == 0 and len(run_error_files) == 0: + print_debug("No fails\n", s, run_tests_log) + + if len(args) == 0: + R = file_check(compile_error_files, run_error_files) + else: + error("don't check new fails for incomplete suite of tests", 2) + R = 0 if options.time: - sys.stdout.write("Elapsed time: %d s\n" % elapsed_time) + print_debug("Elapsed time: " + elapsed_time + "\n", s, run_tests_log) - sys.exit(len(compile_error_files) + len(run_error_files)) + return [R, elapsed_time] + + +from optparse import OptionParser +import multiprocessing +from ctypes import c_int +import os +import sys +import glob +import re +import signal +import random +import string +import subprocess +import shlex +import platform +import tempfile +import os.path +import time +# our functions +import common +print_debug = common.print_debug +error = common.error + +if __name__ == "__main__": + parser = OptionParser() + parser.add_option("-r", "--random-shuffle", dest="random", help="Randomly order tests", + default=False, action="store_true") + parser.add_option("-g", "--generics-include", dest="include_file", help="Filename for header implementing functions for generics", + default=None) + parser.add_option("-f", "--ispc-flags", dest="ispc_flags", help="Additional flags for ispc (-g, -O1, ...)", + default="") + parser.add_option('-t', '--target', dest='target', + help='Set compilation target (sse2-i32x4, sse2-i32x8, sse4-i32x4, sse4-i32x8, sse4-i16x8, sse4-i8x16, avx1-i32x8, avx1-i32x16, avx1.1-i32x8, avx1.1-i32x16, avx2-i32x8, avx2-i32x16, generic-x1, generic-x4, generic-x8, generic-x16, generic-x32, generic-x64)', + default="sse4") + parser.add_option('-a', '--arch', dest='arch', + help='Set architecture (arm, x86, x86-64)', + default="x86-64") + parser.add_option("-c", "--compiler", 
dest="compiler_exe", help="C/C++ compiler binary to use to run tests", + default=None) + parser.add_option('-o', '--no-opt', dest='no_opt', help='Disable optimization', + default=False, action="store_true") + parser.add_option('-j', '--jobs', dest='num_jobs', help='Maximum number of jobs to run in parallel', + default="1024", type="int") + parser.add_option('-v', '--verbose', dest='verbose', help='Enable verbose output', + default=False, action="store_true") + parser.add_option('--wrap-exe', dest='wrapexe', + help='Executable to wrap test runs with (e.g. "valgrind")', + default="") + parser.add_option('--time', dest='time', help='Enable time output', + default=False, action="store_true") + parser.add_option('--non-interactive', dest='non_interactive', help='Disable interactive status updates', + default=False, action="store_true") + parser.add_option('-u', "--update-errors", dest='update', help='Update file with fails (F of FP)', default="") + parser.add_option('-s', "--silent", dest='silent', help='enable silent mode without any output', default=False, + action = "store_true") + parser.add_option("--file", dest='in_file', help='file to save run_tests output', default="") + parser.add_option("--verify", dest='verify', help='verify the file fail_db.txt', default=False, action="store_true") + (options, args) = parser.parse_args() + L = run_tests(options, args, 1) + exit(0) diff --git a/stdlib.ispc b/stdlib.ispc index e4f8844f..6768594b 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -50,6 +50,9 @@ #elif (ISPC_MASK_BITS == 32) #define IntMaskType int32 #define UIntMaskType unsigned int32 +#elif (ISPC_MASK_BITS == 64) + #define IntMaskType int64 + #define UIntMaskType unsigned int64 #else #error Unknown value of ISPC_MASK_BITS #endif @@ -167,6 +170,60 @@ static inline int64 rotate(int64 v, uniform int i) { return __rotate_i64(v, i); } +__declspec(safe) +static inline float shift(float v, uniform int i) { + varying float result; + unmasked { + result = __shift_float(v, i); + } + return result; +} + +__declspec(safe) +static inline int8 shift(int8 v, uniform int i) { + varying int8 result; + unmasked { + result = __shift_i8(v, i); + } + return result; +} + +__declspec(safe) +static inline int16 shift(int16 v, uniform int i) { + varying int16 result; + unmasked { + result = __shift_i16(v, i); + } + return result; +} + +__declspec(safe) +static inline int32 shift(int32 v, uniform int i) { + varying int32 result; + unmasked { + result = __shift_i32(v, i); + } + return result; +} + +__declspec(safe) +static inline double shift(double v, uniform int i) { + varying double result; + unmasked { + result = __shift_double(v, i); + } + return result; +} + +__declspec(safe) +static inline int64 shift(int64 v, uniform int i) { + varying int64 result; + unmasked { + result = __shift_i64(v, i); + } + return result; +} + __declspec(safe) static inline float shuffle(float v, int i) { return __shuffle_float(v, i); @@ -1556,6 +1613,18 @@ static inline uniform float clamp(uniform float v, uniform float low, uniform fl return min(max(v, low), high); } +// double + +__declspec(safe,cost2) +static inline double clamp(double v, double low, double high) { + return min(max(v, low), high); +} + +__declspec(safe,cost2) +static inline uniform double clamp(uniform double v, uniform double low, uniform double high) { + return min(max(v, low), high); +} + // int8 __declspec(safe,cost2) @@ -2180,7 +2249,7 @@ static inline uniform float frexp(uniform float x, uniform int * uniform pw2) { __declspec(safe) static inline float 
sin(float x_full) { if (__math_lib == __math_lib_svml) { - return __svml_sin(x_full); + return __svml_sinf(x_full); } else if (__math_lib == __math_lib_system) { float ret; @@ -2313,8 +2382,10 @@ static inline float asin(float x) { bool isnan = (x > 1); float v; - if (__math_lib == __math_lib_svml || - __math_lib == __math_lib_system) { + if (__math_lib == __math_lib_svml) { + return __svml_asinf(x); + } + else if (__math_lib == __math_lib_system) { float ret; foreach_active (i) { uniform float r = __stdlib_asinf(extract(x, i)); @@ -2417,7 +2488,7 @@ static inline uniform float asin(uniform float x) { __declspec(safe) static inline float cos(float x_full) { if (__math_lib == __math_lib_svml) { - return __svml_cos(x_full); + return __svml_cosf(x_full); } else if (__math_lib == __math_lib_system) { float ret; @@ -2545,18 +2616,28 @@ static inline float acos(float v) { return 1.57079637050628662109375 - asin(v); } +__declspec(safe) +static inline double acos(const double v) { + return 1.57079637050628662109375d0 - asin(v); +} + __declspec(safe) static inline uniform float acos(uniform float v) { return 1.57079637050628662109375 - asin(v); } +__declspec(safe) +static inline uniform double acos(const uniform double v) { + return 1.57079637050628662109375d0 - asin(v); +} + __declspec(safe) static inline void sincos(float x_full, varying float * uniform sin_result, varying float * uniform cos_result) { if (__math_lib == __math_lib_svml) { - __svml_sincos(x_full, sin_result, cos_result); + __svml_sincosf(x_full, sin_result, cos_result); } else if (__math_lib == __math_lib_system) { foreach_active (i) { @@ -2688,7 +2769,7 @@ static inline void sincos(uniform float x_full, uniform float * uniform sin_resu __declspec(safe) static inline float tan(float x_full) { if (__math_lib == __math_lib_svml) { - return __svml_tan(x_full); + return __svml_tanf(x_full); } else if (__math_lib == __math_lib_system) { float ret; @@ -2839,7 +2920,7 @@ static inline uniform float tan(uniform float x_full) { __declspec(safe) static inline float atan(float x_full) { if (__math_lib == __math_lib_svml) { - return __svml_atan(x_full); + return __svml_atanf(x_full); } else if (__math_lib == __math_lib_system) { float ret; @@ -2934,7 +3015,7 @@ static inline uniform float atan(uniform float x_full) { __declspec(safe) static inline float atan2(float y, float x) { if (__math_lib == __math_lib_svml) { - return __svml_atan2(y, x); + return __svml_atan2f(y, x); } else if (__math_lib == __math_lib_system) { float ret; @@ -2997,7 +3078,7 @@ static inline float exp(float x_full) { return __exp_varying_float(x_full); } else if (__math_lib == __math_lib_svml) { - return __svml_exp(x_full); + return __svml_expf(x_full); } else if (__math_lib == __math_lib_system) { float ret; @@ -3204,7 +3285,7 @@ static inline float log(float x_full) { return __log_varying_float(x_full); } else if (__math_lib == __math_lib_svml) { - return __svml_log(x_full); + return __svml_logf(x_full); } else if (__math_lib == __math_lib_system) { float ret; @@ -3379,7 +3460,7 @@ static inline float pow(float a, float b) { return __pow_varying_float(a, b); } else if (__math_lib == __math_lib_svml) { - return __svml_pow(a, b); + return __svml_powf(a, b); } else if (__math_lib == __math_lib_system) { float ret; @@ -3469,7 +3550,11 @@ static inline uniform double frexp(uniform double x, uniform int * uniform pw2) __declspec(safe) static inline double sin(double x) { - if (__math_lib == __math_lib_ispc_fast) + if (__math_lib == __math_lib_svml) + { + return 
__svml_sind(x); + } + else if (__math_lib == __math_lib_ispc_fast) return sin((float)x); else { double ret; @@ -3490,8 +3575,30 @@ static inline uniform double sin(uniform double x) { } __declspec(safe) -static inline double cos(double x) { - if (__math_lib == __math_lib_ispc_fast) +static inline double asin(const double x) { + if (__math_lib == __math_lib_svml) + { + return __svml_asind(x); + } + else if (__math_lib == __math_lib_ispc_fast) + return asin((float)x); + else { + double ret; + foreach_active (i) { + uniform double r = __stdlib_asin(extract(x, i)); + ret = insert(ret, i, r); + } + return ret; + } +} + +__declspec(safe) +static inline double cos(const double x) { + if (__math_lib == __math_lib_svml) + { + return __svml_cosd(x); + } + else if (__math_lib == __math_lib_ispc_fast) return cos((float)x); else { double ret; @@ -3514,7 +3621,11 @@ static inline uniform double cos(uniform double x) { __declspec(safe) static inline void sincos(double x, varying double * uniform sin_result, varying double * uniform cos_result) { - if (__math_lib == __math_lib_ispc_fast) { + if (__math_lib == __math_lib_svml) + { + __svml_sincosd(x, sin_result, cos_result); + } + else if (__math_lib == __math_lib_ispc_fast) { float sr, cr; sincos((float)x, &sr, &cr); *sin_result = sr; @@ -3545,7 +3656,11 @@ static inline void sincos(uniform double x, uniform double * uniform sin_result, __declspec(safe) static inline double tan(double x) { - if (__math_lib == __math_lib_ispc_fast) + if (__math_lib == __math_lib_svml) + { + return __svml_tand(x); + } + else if (__math_lib == __math_lib_ispc_fast) return tan((float)x); else { double ret; @@ -3589,7 +3704,11 @@ static inline uniform double atan(uniform double x) { __declspec(safe) static inline double atan2(double y, double x) { - if (__math_lib == __math_lib_ispc_fast) + if (__math_lib == __math_lib_svml) + { + return __svml_atan2d(y,x); + } + else if (__math_lib == __math_lib_ispc_fast) return atan2((float)y, (float)x); else { double ret; @@ -3611,7 +3730,11 @@ static inline uniform double atan2(uniform double y, uniform double x) { __declspec(safe) static inline double exp(double x) { - if (__math_lib == __math_lib_ispc_fast) + if (__math_lib == __math_lib_svml) + { + return __svml_expd(x); + } + else if (__math_lib == __math_lib_ispc_fast) return exp((float)x); else { double ret; @@ -3633,7 +3756,11 @@ static inline uniform double exp(uniform double x) { __declspec(safe) static inline double log(double x) { - if (__math_lib == __math_lib_ispc_fast) + if (__math_lib == __math_lib_svml) + { + return __svml_logd(x); + } + else if (__math_lib == __math_lib_ispc_fast) return log((float)x); else { double ret; @@ -3655,7 +3782,11 @@ static inline uniform double log(uniform double x) { __declspec(safe) static inline double pow(double a, double b) { - if (__math_lib == __math_lib_ispc_fast) + if (__math_lib == __math_lib_svml) + { + return __svml_powd(a,b); + } + else if (__math_lib == __math_lib_ispc_fast) return pow((float)a, (float)b); else { double ret; diff --git a/tests/chkstk.ispc b/tests/chkstk.ispc new file mode 100644 index 00000000..bd0a8299 --- /dev/null +++ b/tests/chkstk.ispc @@ -0,0 +1,49 @@ +//test for 17631 bug in LLVM. + +export uniform int width() { return programCount; } + +struct s_temp +{ + float temp[64]; +}; + +int CompressBlockBC7(int A, uniform float b) +{ + // This declaration caused problem because LLVM inserted + // _chkstk after declaration and vzeroupper before it's call. + // A will be in ymm at avx, so we lose a half of it. 
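+ // _state is the declaration referred to above: a varying struct holding
+ // 64 floats, i.e. 64 * 4 * programCount bytes of stack per call.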
+ s_temp _state; + // These two loops are here to prevent elimination of declaration + for (int i=0; i<64; i++) { + float ii = i; + _state.temp[i] = b + sin(ii); + } + float r = 0; + for (int j=0; j<64; j+=9) { + r += _state.temp[j] + j; + } + + // Here upper bits of A in ymm can be zeros. This will crash the test. + int B; + if (A!=0) { + B = 20; + } + else { + B = 30; + } + if(A == 1) { + B = r; + } + return B; +} + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + int A = programIndex; + RET[programIndex] = CompressBlockBC7(A, b); +} + +export void result(uniform float RET[]) { + RET[programIndex] = 20; + RET[0] = 30; + RET[1] = 292; +} diff --git a/tests/double-consts.ispc b/tests/double-consts.ispc new file mode 100644 index 00000000..5f9a66d5 --- /dev/null +++ b/tests/double-consts.ispc @@ -0,0 +1,24 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { + double a = aFOO[programIndex]; + // Test parsing of double constants. + double d1 = 1.0d40; + double d2 = 1.d40; + double d3 = 1d40; + double d4 = .1d41; + double d5 = 10000000000000000000000000000000000000000.d; + double d6 = 10000000000000000000000000000000000000000.0d; + + // All the constants should be equal and if it's evaluated as "float", + // then sqrt will evaluate to +inf. + if (d1 == d2 && d1 == d3 && d1 == d4 && d1 == d5 && d1 == d6 && + ((float)sqrt(d1)) < 2e20) { + RET[programIndex] = a; + } +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1 + programIndex; +} diff --git a/tests/operators.ispc b/tests/operators.ispc new file mode 100644 index 00000000..95502bdd --- /dev/null +++ b/tests/operators.ispc @@ -0,0 +1,70 @@ + +export uniform int width() { return programCount; } + +struct S { + float a; +}; + +// References "struct&" were put in random order to test them. +struct S operator*(struct S& rr, struct S rv) { + struct S c; + c.a = rr.a + rv.a + 2; + return c; +} + +struct S operator/(struct S& rr, struct S& rv) { + struct S c; + c.a = rr.a - rr.a + 2; + return c; +} + +struct S operator%(struct S rr, struct S& rv) { + struct S c; + c.a = rr.a + rv.a + 2; + return c; +} + +struct S operator+(struct S rr, struct S rv) { + struct S c; + c.a = rr.a / rv.a + 3; + return c; +} + +struct S operator-(struct S rr, struct S& rv) { + struct S c; + c.a = rr.a + rv.a + 2; + return c; +} + +struct S operator>>(struct S& rr, struct S rv) { + struct S c; + c.a = rr.a / rv.a + 3; + return c; +} + +struct S operator<<(struct S& rr, struct S& rv) { + struct S c; + c.a = rr.a + rv.a + 2; + return c; +} + +struct S a, a1; +struct S b, b1; +struct S d1, d2, d3, d4, d5, d6, d7; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + a.a = aFOO[programIndex]; + b.a = -aFOO[programIndex]; + d1 = a * b; + d2 = a / b; + d3 = a % b; + d4 = a + b; + d5 = a - b; + d6 = a >> b; + d7 = a << b; + RET[programIndex] = d1.a + d2.a + d3.a + d4.a + d5.a + d6.a + d7.a; +} + +export void result(uniform float RET[4]) { + RET[programIndex] = 14; +} diff --git a/tests/operators1.ispc b/tests/operators1.ispc new file mode 100644 index 00000000..f52c4c35 --- /dev/null +++ b/tests/operators1.ispc @@ -0,0 +1,64 @@ + +export uniform int width() { return programCount; } + +struct S { + float a; +}; + +// References "struct&" were put in random order to test them. 
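+// (i.e. each overload below takes some mix of by-value "struct S" and
+// by-reference "struct S&" parameters, so both argument-passing forms of
+// the overloaded operators are exercised)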
+struct S operator*(struct S& rr, struct S rv) { + struct S c; + c.a = rr.a + rv.a + 2; + return c; +} + +struct S operator/(struct S& rr, struct S& rv) { + struct S c; + c.a = rr.a - rr.a + 2; + return c; +} + +struct S operator%(struct S rr, struct S& rv) { + struct S c; + c.a = rr.a + rv.a + 2; + return c; +} + +struct S operator+(struct S rr, struct S rv) { + struct S c; + c.a = rr.a / rv.a + 3; + return c; +} + +struct S operator-(struct S rr, struct S& rv) { + struct S c; + c.a = rr.a + rv.a + 2; + return c; +} + +struct S operator>>(struct S& rr, struct S rv) { + struct S c; + c.a = rr.a / rv.a + 3; + return c; +} + +struct S operator<<(struct S& rr, struct S& rv) { + struct S c; + c.a = rr.a + rv.a + 2; + return c; +} + +struct S a; +struct S b; +struct S d; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + a.a = 5; + b.a = -5; + d = a * b + b / a - a << (b - b) - a; + RET[programIndex] = d.a; +} + +export void result(uniform float RET[4]) { + RET[programIndex] = 12; +} diff --git a/tests/operators2.ispc b/tests/operators2.ispc new file mode 100644 index 00000000..b732b24a --- /dev/null +++ b/tests/operators2.ispc @@ -0,0 +1,51 @@ +int off; + +export uniform int width() { return programCount; } + +struct S { + float a; +}; + +struct S operator+(struct S rr, struct S rv) { + struct S c; + c.a = rr.a / rv.a + 3; + if (off == 1) + c.a = 22; + return c; +} + +struct S operator/(struct S rr, struct S rv) { + struct S c; + c.a = rr.a + rv.a + 10; + if (off == 1) + c.a = 33; + return c; +} + +struct S a; +struct S b; +struct S d; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + int T = programIndex; + a.a = aFOO[programIndex]; + b.a = -aFOO[programIndex]; + if (programIndex == 3) + off = 1; + else + off = 0; + if (T % 2) + d = a + b; + else + d = a / b; + + RET[programIndex] = d.a; +} + +export void result(uniform float RET[4]) { + if (programIndex % 2) + RET[programIndex] = 2; + else + RET[programIndex] = 10; + RET[3] = 22; +} diff --git a/tests/ptr-arith-indexing.ispc b/tests/ptr-arith-indexing.ispc new file mode 100644 index 00000000..9f62a2c9 --- /dev/null +++ b/tests/ptr-arith-indexing.ispc @@ -0,0 +1,16 @@ + +export uniform int width() { return programCount; } + +int foo(uniform float * uniform base, uniform int uOfs, varying int vOfs) { + return (base+uOfs)[vOfs]; +} + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform float * uniform ptr = &aFOO[0]; + int val = foo(ptr, programCount, programIndex); + RET[programIndex] = val; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1+programCount+programIndex; +} diff --git a/tests/shift-1.ispc b/tests/shift-1.ispc new file mode 100644 index 00000000..2062e36b --- /dev/null +++ b/tests/shift-1.ispc @@ -0,0 +1,14 @@ + +export uniform int width() { return programCount; } + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + int a = aFOO[programIndex]; + int rot = shift(a, -1); + RET[programIndex] = rot; +} + +export void result(uniform float RET[]) { + varying int val = programIndex; + if (val < 0) val = 0; + RET[programIndex] = val; +} diff --git a/tests/shift-2.ispc b/tests/shift-2.ispc new file mode 100644 index 00000000..6cb88e8a --- /dev/null +++ b/tests/shift-2.ispc @@ -0,0 +1,15 @@ + +export uniform int width() { return programCount; } + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + int a = aFOO[programIndex]; + uniform int delta = b - 6; // -1 + int rot = shift(a, delta); + RET[programIndex] = 
rot; +} + +export void result(uniform float RET[]) { + varying int val = programIndex; + if (val < 0) val = 0; + RET[programIndex] = val; +} diff --git a/tests/shift-3.ispc b/tests/shift-3.ispc new file mode 100644 index 00000000..827d076f --- /dev/null +++ b/tests/shift-3.ispc @@ -0,0 +1,14 @@ + +export uniform int width() { return programCount; } + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + int a = aFOO[programIndex]; + int rot = shift(a, 1); + RET[programIndex] = rot; +} + +export void result(uniform float RET[]) { + varying int val = 2 + programIndex; + if (val > programCount) val = 0; + RET[programIndex] = val; +} diff --git a/tests/test-141.ispc b/tests/test-141.ispc index a533b605..b69be1fa 100644 --- a/tests/test-141.ispc +++ b/tests/test-141.ispc @@ -3,8 +3,9 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - float a = aFOO[programIndex]; - RET[programIndex] = (exp(-log(1/a)) - a) < 1e-7 ? 1 : 0; + float a = aFOO[programIndex]; + // calculation error 1e-6 is the same as in icc + RET[programIndex] = (exp(-log(1/a)) - a) < 1e-6 ? 1 : 0; } export void result(uniform float RET[4]) { diff --git a/type.cpp b/type.cpp index 5dc6a21e..8a21f3d9 100644 --- a/type.cpp +++ b/type.cpp @@ -2897,7 +2897,7 @@ FunctionType::GetDIType(llvm::DIDescriptor scope) const { for (int i = 0; i < GetNumParameters(); ++i) { const Type *t = GetParameterType(i); if (t == NULL) -#if defined(LLVM_3_4) +#if defined(LLVM_3_4) || defined(LLVM_3_5) return llvm::DICompositeType(); #else return llvm::DIType();