983 Commits

Author SHA1 Message Date
Dmitry Babokin
8a39af8f72 Release 1.5.0 2013-09-27 23:27:05 +04:00
Dmitry Babokin
570246a016 Merge pull request #608 from dbabokin/fail_db
Fail_db.txt update
2013-09-27 07:16:52 -07:00
Dmitry Babokin
8e71dbd6c1 Adding comments to fail_db.txt 2013-09-27 18:12:12 +04:00
Dmitry Babokin
da52ae844f Adding AVX2 fails on Windows 2013-09-27 18:06:28 +04:00
Dmitry Babokin
396aaae098 Add fails with VS2010 on Windows 2013-09-27 17:00:17 +04:00
Dmitry Babokin
5855ae7460 Add fails with gcc 4.7 on Mac 2013-09-27 02:32:01 +04:00
Dmitry Babokin
2a83cefd5b Add fails with gcc 4.7 on Linux 2013-09-26 19:07:38 +04:00
Dmitry Babokin
dfc723bc19 Add fails with gcc 4.4 on Linux 2013-09-26 16:34:49 +04:00
Dmitry Babokin
23cb59427d Merge pull request #607 from ifilippov/testing
correction of test system
2013-09-26 04:02:49 -07:00
Ilia Filippov
1c858c34f7 correction of test system 2013-09-26 14:54:15 +04:00
Dmitry Babokin
4285b5a89e Merge pull request #591 from dbabokin/dispatch
Adding check for OS AVX support to auto-dispatch code
2013-09-23 08:42:22 -07:00
Dmitry Babokin
a80696f98f Merge pull request #594 from ifilippov/testing
Incremental fixes for initial implementation of test system
2013-09-23 07:26:19 -07:00
Ilia Filippov
af5da885a5 small corrections of test system 2013-09-23 18:18:48 +04:00
Dmitry Babokin
349062d89c Merge pull request #596 from ifilippov/patch_3_
adding patch for LLVM 3.3 which increases performance after regression
2013-09-23 07:10:26 -07:00
Ilia Filippov
5a9b3b3abb adding patch for LLVM 3.3 which increases performance after regression 2013-09-23 18:01:03 +04:00
Dmitry Babokin
ea50609829 Merge pull request #595 from ifilippov/sort
adding sort to performance checking
2013-09-23 03:22:28 -07:00
jbrodman
1913199a45 Merge pull request #597 from pgurd/master
- Add Silvermont (--cpu=slm) option for llvm 3.4+.
2013-09-20 14:08:35 -07:00
Preston Gurd
4b26b8b430 Remove redundant "slm". 2013-09-20 16:44:01 -04:00
Preston Gurd
9e0e9dbecc - Add Silvermont (--cpu=slm) option for llvm 3.4+.
- Change default Sandybridge isa name to avx1-i32x8 from avx-i32x8,
  to conform with replacement of avx-i32x8 by avx1-i32x8 everywhere else.
- Add "target-cpu" attribute, when using AttrBuilder, to correct a problem
  whereby llvm would switch from the command line cpu setting
  to the native (auto-detected) cpu setting on second and subsequent
  functions. e.g. if I wanted to build for Silvermont on a Sandy Bridge
  machine, ispc/llvm would correctly use Silvermont and turn on the
  Silvermont scheduler. For the second and subsequent functions,
  it would auto-detect Sandy Bridge, but still run the Silvermont
  scheduler.
2013-09-20 14:42:46 -04:00
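A minimal sketch of the per-function fix described above, using the modern llvm::Function::addFnAttr() helper rather than the 2013-era AttrBuilder code; the helper name lPinTargetCPU is hypothetical:

    #include "llvm/ADT/StringRef.h"
    #include "llvm/IR/Function.h"
    #include "llvm/IR/Module.h"

    // Pin the CPU choice on every emitted function so the backend keeps
    // honoring --cpu=slm instead of falling back to the native
    // (auto-detected) CPU on the second and subsequent functions.
    void lPinTargetCPU(llvm::Module &M, llvm::StringRef cpu) {
        for (llvm::Function &F : M) {
            if (F.isDeclaration())
                continue;
            F.addFnAttr("target-cpu", cpu);
        }
    }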
Ilia Filippov
87cecddabb adding sort to performance checking 2013-09-20 18:57:20 +04:00
Dmitry Babokin
0647c02561 Merge pull request #593 from ifilippov/testing
change head to trunk
2013-09-19 06:56:08 -07:00
Ilia Filippov
491c58aef3 change head to trunk 2013-09-19 17:47:10 +04:00
Dmitry Babokin
0663cb41ff Merge pull request #592 from dbabokin/double
Typo fix in tests/double-consts.ispc
2013-09-19 06:29:14 -07:00
Dmitry Babokin
b2678b4338 Typo fix in tests/double-consts.ispc 2013-09-19 17:27:58 +04:00
Dmitry Babokin
43245bbc11 Adding check for OS AVX support to auto-dispatch code 2013-09-19 15:39:56 +04:00
Dmitry Babokin
971d5f8d12 Merge pull request #590 from ifilippov/testing
Redesigned, enhanced and improved test system.
2013-09-19 04:23:20 -07:00
Ilia Filippov
bb8f7d4e3f removing LLVM 3.1 and 3.2 from default testing 2013-09-19 14:37:26 +04:00
Dmitry Babokin
67362935f4 Merge pull request #589 from egaburov/master
fixed support for numbers like .1d2
2013-09-19 01:59:45 -07:00
egaburov
3b8c7c9126 merge with upstream/master 2013-09-19 10:57:58 +02:00
Dmitry Babokin
a5782dfb2d Merge pull request #586 from dbabokin/double
Test, documentation and vim support for double precision constants
2013-09-19 01:51:13 -07:00
Dmitry Babokin
1c527ae34c Adding tests and vim support for double constant of the form .1d41 2013-09-19 12:49:45 +04:00
Dmitry Babokin
f45f6cb32a Test, documentation and vim support for double precision constants 2013-09-19 12:49:45 +04:00
Ilia Filippov
00cd90c6b0 test system 2013-09-19 12:26:57 +04:00
egaburov
406e2eb8d0 fix double precision input to support .123d321 type of input 2013-09-19 09:16:37 +02:00
egaburov
0672289cb9 Merge branch 'double' of https://github.com/dbabokin/ispc 2013-09-18 15:24:35 +02:00
Dmitry Babokin
ff547fc6cb Merge pull request #584 from egaburov/d-suffix
Added Fortran-notation double precision support
2013-09-17 12:46:59 -07:00
Dmitry Babokin
fa78d548cc Test, documentation and vim support for double precision constants 2013-09-17 23:36:16 +04:00
Dmitry Babokin
191d9dede5 Merge pull request #585 from tkoziara/master
Sort description.
2013-09-16 10:08:49 -07:00
Tomasz Koziara
6e0b9ddc74 Sort description. 2013-09-16 18:02:07 +01:00
egaburov
eef4e11768 now it is also case-insensitive 2013-09-16 17:25:13 +02:00
Evghenii
6fd21d988d fixed lexer to properly read fortran-notation double constants 2013-09-16 17:15:02 +02:00
egaburov
2332490481 added fortran_double_constant 2013-09-16 16:31:41 +02:00
egaburov
e2a91e6de5 added support for "d"-suffix 2013-09-16 15:54:32 +02:00
Dmitry Babokin
b258027061 Merge pull request #582 from tkoziara/master
Uniform memory allocation in sort example is fixed.
2013-09-16 03:29:43 -07:00
Tomasz Koziara
97068765e8 Copyright reversed. 2013-09-14 18:09:04 +01:00
Dmitry Babokin
ec5cbb8117 Merge pull request #583 from dbabokin/win-fix
Fix for Windows build to include new target: avx-i64x4
2013-09-13 15:02:09 -07:00
Dmitry Babokin
ce99b17616 Fix for Windows build to include new target: avx-i64x4 2013-09-14 02:00:23 +04:00
Dmitry Babokin
06aa2067d9 Merge pull request #578 from egaburov/master
added --target=avx-i64x4 & svml support for all sse/avx modes
2013-09-13 09:40:24 -07:00
Evghenii
36886971e3 revert lex.ll parse.yy stdlib.ispc to state when all constants are floats 2013-09-13 16:02:53 +02:00
Evghenii
9861375f0c renamed avx-i64x4 -> avx1-i64x4 2013-09-13 15:07:14 +02:00
Tomasz Koziara
ed825b3773 Uniform memory allocation fixed. 2013-09-13 13:14:31 +01:00
egaburov
a9913c8337 changed lexer/parser to be able to read float constants, if they have "f"-suffix 2013-09-13 10:26:15 +02:00
Evghenii
a97eb7b7cb added clamp in double precision 2013-09-13 09:32:59 +02:00
egaburov
715b828266 fixed float constants to be read as doubles 2013-09-13 09:25:52 +02:00
Evghenii
40af8d6ed5 fixed segfault in tests/launch-*.ispc. nativeVectorWidth in avx-i64x4 was set to 4. Fixed 2013-09-12 20:25:44 +02:00
Evghenii
059d80cc11 included suggested changes, ./tests/launch-*.ispc still fails. something is mask64 related, not sure what. help... 2013-09-12 17:18:12 +02:00
egaburov
7364e06387 added mask64 2013-09-12 12:02:42 +02:00
egaburov
efc20c2110 added svml support to all sse/avx modes 2013-09-11 17:07:54 +02:00
egaburov
19379db3b6 svml cleanup 2013-09-11 16:48:56 +02:00
egaburov
9cf8e8cbf3 builtins fix for double precision svml and __stdlib_asin 2013-09-11 15:23:45 +02:00
egaburov
7a32699573 added svml.m4 2013-09-11 15:18:03 +02:00
egaburov
320c41ffcf added svml support. experimental. for some reason all symbols are visible... 2013-09-11 15:16:50 +02:00
egaburov
9c79d4d182 added avxh with vectorWidth=4 support, use --target=avxh to enable it 2013-09-11 12:58:02 +02:00
jbrodman
582cfe55b6 Merge pull request #575 from jbrodman/master
Revert "Remove support for using SVML for math lib routines."
2013-09-05 10:31:23 -07:00
james.brodman
8db378b265 Revert "Remove support for using SVML for math lib routines."
This reverts commit d9c38b5c1f.
2013-09-04 16:01:58 -04:00
jbrodman
71a7564317 Merge pull request #574 from jbrodman/uniftypedef
Fix to respect uniform/varying qualifiers inside of typedefs.
2013-09-03 13:14:00 -07:00
jbrodman
c14b035a46 Merge pull request #572 from ifilippov/master
correction of adding -Werror option
2013-08-30 11:17:01 -07:00
jbrodman
cf2eaa0014 Merge pull request #569 from dbabokin/unmasked
Fix for incorrect implementation of reduce_[min|max]_[float|double]
2013-08-30 11:16:42 -07:00
jbrodman
cb92d54808 Merge pull request #570 from dbabokin/docs
Minor docs fixes.
2013-08-30 11:16:26 -07:00
james.brodman
97d430d5cd Fix to respect uniform/varying qualifiers inside of typedefs. 2013-08-30 14:13:08 -04:00
Ilia Filippov
320b1700ff correction of adding -Werror option 2013-08-30 16:01:01 +04:00
Dmitry Babokin
e06267ef1b Fix for incorrect implementation of reduce_[min|max]_[float|double]; it showed up at -O0 2013-08-29 16:16:02 +04:00
Dmitry Babokin
501a23ad20 Typos fixes in docs 2013-08-29 14:48:09 +04:00
Dmitry Babokin
c1cc80b1d5 Merge pull request #568 from jbrodman/master
Fix against LLVM ToT
2013-08-27 14:08:12 -07:00
james.brodman
28080b0c22 Fix build against 3.4 2013-08-27 16:56:00 -04:00
james.brodman
be3a40e70b Fix for 3.4 2013-08-27 15:15:16 -04:00
Dmitry Babokin
5d8ebf3ca1 Fixing r183327-AVX2-GATHER.patch file permissions 2013-08-27 18:27:06 +04:00
Dmitry Babokin
443987f536 fixing ispc.rst file properties (should not be executable) 2013-08-27 15:33:44 +04:00
Dmitry Babokin
f6ce969d9f Merge pull request #567 from ifilippov/master
Changes in perf.py functionality, unification of examples, correction of build warnings
2013-08-26 03:26:28 -07:00
Ilia Filippov
f620cdbaa1 Changes in perf.py functionality, unification of examples, correction of build warnings 2013-08-26 14:04:59 +04:00
Dmitry Babokin
3f2217646e Merge pull request #562 from mmp/arm
New target naming scheme, new targets (SSE4-i8x16 and SSE4-i16x8), plus some cleanup and improvements.
2013-08-22 08:33:25 -07:00
Matt Pharr
611477e214 Revert change to lEmitVaryingSelect().
Using vector select versus a store and masked load for varying vector
selects seems to give worse code.  This may be related to
http://llvm.org/bugs/show_bug.cgi?id=16941.
2013-08-22 07:50:25 -07:00
Dmitry Babokin
9bb5c314cd Merge pull request #565 from dbabokin/run_tests
run_tests.py fix and new switch.
2013-08-22 01:48:22 -07:00
Dmitry Babokin
f31a31478b Moving time calculation earlier 2013-08-22 12:41:57 +04:00
Dmitry Babokin
5fb30939be Fix for #564, using wrong ispc in run_tests.py 2013-08-21 19:46:18 +04:00
Dmitry Babokin
60b413a9cb Adding --non-interactive switch to run_tests.py 2013-08-21 19:25:30 +04:00
Matt Pharr
502f8fd76b Reduce debug spew on failing idiv.ispc tests 2013-08-20 09:22:09 -07:00
Matt Pharr
2b2905b567 Fix (preexisting) bugs in generic-32/64.h with type of "__any", etc.
This should be a bool, not a one-wide vector of bools.  The equivalent
fix was previously made in generic-16.h, but not made here.  (Note that
many tests are still failing with these targets, but at least they
compile properly now.)
2013-08-20 09:05:50 -07:00
Matt Pharr
e7f067d70c Fix handling of __clock() builtin for "generic" targets. 2013-08-20 09:04:52 -07:00
Matt Pharr
d976da7559 Speed up idiv test (don't test int32 as thoroughly) 2013-08-20 08:49:51 -07:00
Dmitry Babokin
84dbd66d10 Merge pull request #563 from jbrodman/debugopt
Separate -O and -g
2013-08-15 13:10:13 -07:00
james.brodman
6be3c24ee5 Separate -O and -g 2013-08-15 15:24:46 -04:00
Matt Pharr
42f31aed69 Another attempt at fixing the Windows build (added sse4-8/sse4-16 targets). 2013-08-14 11:02:45 -07:00
Matt Pharr
ed017c42f1 Fix ispc.vcxproj for Windows builds 2013-08-11 07:47:20 -07:00
Matt Pharr
4766467271 Revert ispc.vcxproj to version from top-of-tree. 2013-08-10 11:23:39 -07:00
Matt Pharr
ea8591a85a Fix build with LLVM top-of-tree (link libcurses) 2013-08-10 11:22:43 -07:00
Matt Pharr
7ab4c5391c Fix build with LLVM 3.2 and generic-4 / examples/sse4.h target. 2013-08-09 19:56:43 -07:00
Matt Pharr
0c5742b6f8 Implement new naming scheme for --target.
Now targets are named like "<isa>-i<mask size>x<gang size>", e.g.
"sse4-i8x16", or "avx2-i32x16".

The old target names are still supported.
2013-08-08 19:23:44 -07:00
Matt Pharr
1d76f74b16 Fix compiler warnings 2013-08-07 12:53:39 -07:00
Matt Pharr
5e5d42b918 Fix build with LLVM 3.1 2013-08-06 17:55:37 -07:00
Matt Pharr
cd9afe946c Merge branch 'master' into arm
Conflicts:
	Makefile
	builtins.cpp
	ispc.cpp
	ispc.h
	ispc.vcxproj
	opt.cpp
2013-08-06 17:39:21 -07:00
Matt Pharr
1276ea9844 Revert "Remove support for building with LLVM 3.1"
This reverts commit d3c567503b.

Conflicts:
	opt.cpp
2013-08-06 17:00:35 -07:00
jbrodman
0755e4f8ff Merge pull request #561 from dbabokin/neon_condition
Fix for Windows build and making NEON target optional
2013-08-06 13:45:30 -07:00
Matt Pharr
ccdbddd388 Add peephole optimization to match int8/int16 averages.
Match the following patterns in IR, turning them into target-specific
intrinsics (e.g. PAVGB on x86) when possible.

(unsigned int8)(((unsigned int16)a + (unsigned int16)b + 1)/2)
(unsigned int8)(((unsigned int16)a + (unsigned int16)b)/2)
(unsigned int16)(((unsigned int32)a + (unsigned int32)b + 1)/2)
(unsigned int16)(((unsigned int32)a + (unsigned int32)b)/2)
(int8)(((int16)a + (int16)b + 1)/2)
(int8)(((int16)a + (int16)b)/2)
(int16)(((int32)a + (int32)b + 1)/2)
(int16)(((int32)a + (int32)b)/2)
2013-08-06 08:59:46 -07:00
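For reference, the first two patterns in scalar C++ form (a sketch; per lane, the rounding-up variant is exactly what PAVGB computes):

    #include <cstdint>

    // Rounding-up unsigned int8 average: (a + b + 1) / 2 in a wider type.
    static inline uint8_t avg_up_u8(uint8_t a, uint8_t b) {
        return (uint8_t)(((uint16_t)a + (uint16_t)b + 1) / 2);
    }

    // Rounding-down variant: omit the +1, so 3 and 4 average to 3, not 4.
    static inline uint8_t avg_down_u8(uint8_t a, uint8_t b) {
        return (uint8_t)(((uint16_t)a + (uint16_t)b) / 2);
    }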
Matt Pharr
5b20b06bd9 Add avg_{up,down}_int{8,16} routines to stdlib
These compute the average of two given values, rounding up and down,
respectively, if the result isn't exact.  When possible, these are
mapped to target-specific intrinsics (PAVG[BW] on IA and VH[R]ADD[US]
on NEON.)

A subsequent commit will add pattern-matching to generate calls to
these intrinsics when the corresponding patterns are detected in the
IR.
2013-08-06 08:41:12 -07:00
Dmitry Babokin
dff7735af9 Fix for Windows build and making NEON target optional 2013-08-02 19:24:34 -07:00
Dmitry Babokin
fb34fc5a85 Merge pull request #559 from ifilippov/debug_phases
Supporting dumping, switching off and debug printing of optimization phases.
2013-08-01 14:55:07 -07:00
Dmitry Babokin
43423c276f Merge pull request #560 from ifilippov/perf
Supporting perf.py on Mac OS
2013-08-01 13:20:01 -07:00
jbrodman
5ffc3a8f4c Merge pull request #558 from dbabokin/win_examples
Fix for examples to make them work on Windows properly
2013-08-01 08:02:42 -07:00
Ilia Filippov
3c06924a02 Supporting perf.py on Mac OS 2013-08-01 12:47:37 +04:00
Ilia Filippov
a174a90f86 Supporting dumping, switching off and debug printing of optimization phases 2013-08-01 11:37:52 +04:00
Matt Pharr
4f48d3258a Documentation updates for NEON 2013-07-31 20:06:04 -07:00
Matt Pharr
d9c38b5c1f Remove support for using SVML for math lib routines.
This path was poorly maintained and wasn't actually available on most
targets.
2013-07-31 06:56:48 -07:00
Matt Pharr
d3c567503b Remove support for building with LLVM 3.1 2013-07-31 06:46:45 -07:00
Matt Pharr
d7562d3836 Merge branch 'master' into arm 2013-07-31 06:38:17 -07:00
Dmitry Babokin
220f0b0b40 Renaming mandelbrot_tasks files to be different from mandelbrot 2013-07-30 19:53:12 -07:00
Matt Pharr
48ff03112f Remove __pause from stdlib_core() in utils.m4.
It wasn't ever being used, and was breaking compilation on ARM.
2013-07-30 08:44:22 -07:00
Matt Pharr
ab3b633733 Add 8-bit and 16-bit specialized NEON targets.
Like SSE4-8 and SSE4-16, these use 8-bit and 16-bit values for mask
elements, respectively, and thus should generate the best code when used
for computation with datatypes of those sizes.
2013-07-30 08:44:16 -07:00
Dmitry Babokin
fa93cb7d0b InterlockedAdd -> InterlockedExchangeAdd for better portability (InterlockedAdd is not always supported) 2013-07-29 22:46:36 -07:00
Matt Pharr
b6df447b55 Add reduce_add() for int8 and int16 types.
This maps to specialized instructions (e.g. PSADBW) when available.
2013-07-25 09:46:01 -07:00
Matt Pharr
2d063925a1 Explicitly call the PBLENDVB intrinsic for i8 blending with sse4-8.
This is slightly cleaner than trunc-ing the i8 mask to i1 and using
a vector select.  (And is probably safer in terms of the generated code.)
2013-07-25 09:46:01 -07:00
Matt Pharr
bba84f247c Improved optimization of vector select instructions.
Various LLVM optimization passes are turning code like:

%cmp = icmp lt <8 x i32> %foo, %bar
%cmp32 = sext <8 x i1> %cmp to <8 x i32>
. . .
%cmp1 = trunc <8 x i32> %cmp32 to <8 x i1>
%result = select <8 x i1> %cmp1, . . .

Into:

%cmp = icmp lt <8 x i32> %foo, %bar
%cmp32 = zext <8 x i1> %cmp to <8 x i32>   # note: zext
. . .
%cmp1 = icmp ne <8 x i32> %cmp32, zeroinitializer
%result = select <8 x i1> %cmp1, …

Which in turn isn't matched well by the LLVM code generators, which
in turn leads to fairly inefficient code.  (i.e. it doesn't just emit
a vector compare and blend instruction.)

Also, renamed VSelMovmskOptPass to InstructionSimplifyPass to better
describe its functionality.
2013-07-25 09:46:01 -07:00
Matt Pharr
780b0dfe47 Add SSE4-16 target.
Along the lines of sse4-8, this is an 8-wide target for SSE4, using
16-bit elements for the mask.  It's thus (in principle) the best
target for SIMD computation with 16-bit datatypes.
2013-07-25 09:46:01 -07:00
Matt Pharr
04d61afa23 Fix bug in lEmitVaryingSelect() for targets with i1 mask types.
Commit 53414f12e6 introduced a bug where lEmitVaryingSelect() would
try to truncate a vector of i1s to a vector of i1s, which in turn
made LLVM's IR analyzer unhappy.
2013-07-25 09:45:20 -07:00
Dmitry Babokin
663ebf7857 Merge pull request #551 from mmp/constfold
Improvements to constant folding.
2013-07-24 10:27:04 -07:00
Matt Pharr
53414f12e6 Add SSE4 target optimized for computation with 8-bit datatypes.
This change adds a new 'sse4-8' target, where programCount is 16 and
the mask element size is 8-bits.  (i.e. the most appropriate sizing of
the mask for SIMD computation with 8-bit datatypes.)
2013-07-23 17:30:32 -07:00
Matt Pharr
15a3ef370a Use @llvm.readcyclecounter to implement stdlib clock() function.
Also added a test for the clock builtin.
2013-07-23 17:24:57 -07:00
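Emitting the intrinsic is essentially a one-liner; a hedged sketch using the C++ IRBuilder API (the helper name lEmitClock is illustrative):

    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/Intrinsics.h"
    #include "llvm/IR/Module.h"

    // @llvm.readcyclecounter() returns an i64 cycle count (e.g. TSC on
    // x86), which is all a stdlib clock() needs.
    llvm::Value *lEmitClock(llvm::IRBuilder<> &B, llvm::Module *M) {
        llvm::Function *rcc = llvm::Intrinsic::getDeclaration(
            M, llvm::Intrinsic::readcyclecounter);
        return B.CreateCall(rcc);
    }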
Matt Pharr
c14659c675 Fix bug in lGetConstantInt() in parse.yy.
Previously, we weren't handling signed/unsigned constant types correctly.
2013-07-23 17:24:57 -07:00
Matt Pharr
f7f281a256 Choose type for integer literals to match the target mask size (if possible).
On a target with a 16-bit mask (for example), we would choose the type
of an integer literal "1024" to be an int16.  Previously, we used an int32,
which is a worse fit and leads to less efficient code than an int16
on a 16-bit mask target.  (However, we'd still give an integer literal
1000000 the type int32, even in a 16-bit target.)

Updated the tests to still pass with 8 and 16-bit targets, given this
change.
2013-07-23 17:24:50 -07:00
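The described rule, as a scalar sketch (the names are illustrative, not the actual ispc code):

    #include <cstdint>

    enum LiteralType { LT_Int16, LT_Int32 };

    // On a target with a 16-bit mask, prefer int16 for literals that fit;
    // otherwise fall back to int32 (so 1024 -> int16, 1000000 -> int32).
    LiteralType chooseLiteralType(int64_t value, int maskBits) {
        if (maskBits == 16 && value >= INT16_MIN && value <= INT16_MAX)
            return LT_Int16;
        return LT_Int32;
    }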
Matt Pharr
9ba49eabb2 Reduce estimated costs for 8 and 16-bit min() and max() in stdlib.
These actually compile to a single instruction.
2013-07-23 16:52:43 -07:00
Matt Pharr
e7abf3f2ea Add support for mask vectors of 8 and 16-bit element types.
There were a number of places throughout the system that assumed that the
execution mask would only have either 32-bit or 1-bit elements.  This
commit makes it possible to have a target with an 8- or 16-bit mask.
2013-07-23 16:50:11 -07:00
Matt Pharr
83e1630fbc Add support for fast division of varying int values by small constants.
For varying int8/16/32 types, divides by small constants can be
implemented efficiently through multiplies and shifts with integer
types of twice the bit-width; this commit adds this optimization.
    
(Implementation is based on Halide.)
2013-07-23 16:49:56 -07:00
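A scalar illustration of the trick for one divisor (this mirrors the idea, not the generated code): 171 = ceil(2^9 / 3), so a uint8 divide by 3 becomes a widened multiply and a shift.

    #include <cassert>
    #include <cstdint>

    static inline uint8_t div3_u8(uint8_t x) {
        // (x * 171) >> 9 == x / 3 for every x in [0, 255].
        return (uint8_t)(((uint16_t)x * 171) >> 9);
    }

    int main() {
        for (int x = 0; x <= 255; ++x)
            assert(div3_u8((uint8_t)x) == x / 3);
    }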
Matt Pharr
0277ba1aaa Improve warnings for right shift by varying amounts.
Fixes:
- Don't issue a warning when the shift is by the same amount in all
  vector lanes.
- Do issue a warning when it's a compile-time constant but the values
  are different in different lanes.

Previously, we warned iff the shift amount wasn't a compile-time constant.
2013-07-23 16:49:07 -07:00
Matt Pharr
753c001e69 Merge branch 'master' of https://github.com/ispc/ispc into constfold 2013-07-23 16:12:04 -07:00
Dmitry Babokin
10c0b42d0d Merge pull request #549 from mmp/fix-tot
Fix build with LLVM top-of-tree.
2013-07-23 09:14:08 -07:00
Matt Pharr
564e61c828 Improvements to constant folding.
We can now do constant folding with all basic datatypes (the previous
implementation handled int32 well, but had limited, if any, coverage
for other datatypes.)

Reduced a bit of repeated code in the constant folding implementation
through template helper functions.
2013-07-22 16:12:02 -07:00
Matt Pharr
946c39a5df Fix build with LLVM top-of-tree.
The DIBuilder::getCU() method has been removed; we now just store the
compilation unit returned when we call DIBuilder::createCompileUnit.
2013-07-22 15:42:52 -07:00
Jean-Luc Duprat
2948e84846 Merge pull request #547 from mmp/arm-merge
Add ARM NEON support
2013-07-22 09:24:16 -07:00
Matt Pharr
068fd8098c Explicitly set armv7-eabi target triple on ARM.
This lets the compiler generate FMA instructions, which seems
desirable.
2013-07-20 11:19:10 -07:00
Matt Pharr
d7b0c5794e Add support for ARM NEON targets.
Initial support for ARM NEON on Cortex-A9 and A15 CPUs.  All but ~10 tests
pass, and all examples compile and run correctly.  Most of the examples
show a ~2x speedup on a single A15 core versus scalar code.

Current open issues/TODOs
- Code quality looks decent, but hasn't been carefully examined.  Known
  issues/opportunities for improvement include:
  - fp32 vector divide is done as a series of scalar divides rather than
    a vector divide (which I believe exists, but I may be mistaken.)
    This is particularly harmful to examples/rt, which only runs ~1.5x
    faster with ispc, likely due to long chains of scalar divides.
  - The compiler isn't generating a vmin.f32 for e.g. the final scalar
    min in reduce_min(); instead it's generating a compare and then a
    select instruction (and similarly elsewhere).
  - There are some additional FIXMEs in builtins/target-neon.ll that
    include both a few pieces of missing functionality (e.g. rounding
    doubles) as well as places that deserve attention for possible
    code quality improvements.

- Currently only the "cortex-a9" and "cortex-a15" CPU targets are
  supported; LLVM supports many other ARM CPUs and ispc should provide
  access to all of the ones that have NEON support (and aren't too
  obscure.)

- ~5 of the reduce-* tests hit an assertion inside LLVM (unfortunately
   only when the compiler runs on an ARM host, though).

- The Windows build hasn't been tested (though I've tried to update
  ispc.vcxproj appropriately).  It may just work, but will more likely
  have various small issues.

- Anything related to 64-bit ARM has seen no attention.
2013-07-19 23:07:24 -07:00
Matt Pharr
b007bba59f Replace inline assembly in task system with equivalent gcc intrinsics.
gcc/icc build only: the Windows build still uses the Win32 calls for
these.
2013-07-19 23:07:24 -07:00
Dmitry Babokin
abf43ad01d Merge pull request #546 from dbabokin/release
Release 1.4.4
2013-07-19 18:49:07 -07:00
Dmitry Babokin
922895de69 Changing ISPC version to 1.4.5dev 2013-07-19 18:47:43 -07:00
Dmitry Babokin
28f0bce9f2 Release 1.4.4 2013-07-19 16:22:10 -07:00
Dmitry Babokin
0f82f216a2 Merge pull request #544 from mmp/master
Handle SHL with a constant vector in LLVMVectorIsLinear().
2013-07-18 11:46:11 -07:00
Matt Pharr
7454b1399c Handle SHL with a constant vector in LLVMVectorIsLinear().
LLVM3.4 seems to be turning multiplies by a constant power of 2 into
the equivalent SHL, which was in turn thwarting the pattern matching
for turning gathers/scatters into vector loads/stores.
2013-07-17 14:12:43 -07:00
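Since an shl by a splatted constant c is the same linear transform as a multiply by 2^c, the linearity check just needs to accept the canonicalized form too; a hedged sketch (helper name illustrative):

    #include <cstdint>
    #include "llvm/IR/Constants.h"
    #include "llvm/IR/Instructions.h"

    // Recognize "shl %x, <c, c, ...>" and report the shift amount, so the
    // caller can treat it exactly like "mul %x, <2^c, 2^c, ...>".
    bool lIsShlBySplatConstant(llvm::Value *v, uint64_t *shiftOut) {
        auto *bop = llvm::dyn_cast<llvm::BinaryOperator>(v);
        if (!bop || bop->getOpcode() != llvm::Instruction::Shl)
            return false;
        auto *c = llvm::dyn_cast<llvm::Constant>(bop->getOperand(1));
        llvm::Constant *splat = c ? c->getSplatValue() : nullptr;
        auto *ci = llvm::dyn_cast_or_null<llvm::ConstantInt>(splat);
        if (!ci)
            return false;
        *shiftOut = ci->getZExtValue();
        return true;
    }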
jbrodman
4ebf46bd63 Merge pull request #543 from mmp/master
Fix build with LLVM top-of-tree
2013-07-17 10:38:06 -07:00
Matt Pharr
f1cce0ef5f Fix build with LLVM top-of-tree 2013-07-17 09:25:00 -07:00
Dmitry Babokin
8c9e873c10 Merge pull request #540 from dbabokin/embree_bug
Fix for the bug introduced by --instrumentation fix
2013-07-04 10:45:06 -07:00
Dmitry Babokin
c85439e7bb Fix for the bug introduced by --instrumentation fix 2013-07-04 21:41:57 +04:00
Ilia Filippov
fd7f87b55e Supporting perf.py on Windows and some small corrections in it 2013-07-02 19:23:18 +04:00
Dmitry Babokin
8be4128c5a Merge pull request #534 from ifilippov/perf
add script for measuring performance
2013-07-01 05:09:03 -07:00
Ilia Filippov
806e37338c add script for measuring performance 2013-07-01 13:30:49 +04:00
Dmitry Babokin
ec1095624a Merge pull request #527 from tkoziara/master
examples/sort added
2013-06-25 10:11:39 -07:00
Tomasz Koziara
a23d69ebe8 Copyright changed to simplify legal matters. 2013-06-25 17:28:27 +01:00
Dmitry Babokin
0aff61ffc6 Merge pull request #533 from dbabokin/patch
Quick fix for LLVM 3.3 patch
2013-06-25 08:50:32 -07:00
Dmitry Babokin
05aa540984 Quick fix for LLVM 3.3 patch 2013-06-25 19:49:41 +04:00
Dmitry Babokin
033e83e490 Merge pull request #532 from dbabokin/release_1_4_3
Release 1.4.3
2013-06-25 07:42:08 -07:00
Dmitry Babokin
594485c38c Release 1.4.3 2013-06-25 18:38:21 +04:00
Dmitry Babokin
d52e2d5a8d License update (just dates) 2013-06-25 17:02:42 +04:00
Dmitry Babokin
1e5d852e2f Merge pull request #531 from ifilippov/qsize_fail
replacement of qsize due to its failures on MacOS
2013-06-25 05:36:45 -07:00
Ilia Filippov
cc32d913a0 replacement of qsize due to its failures on MacOS 2013-06-25 16:27:25 +04:00
Dmitry Babokin
fc66066d4d Merge pull request #530 from dbabokin/llvm_fix
Adding LLVM patch to fix #519 with LLVM 3.3
2013-06-25 05:22:09 -07:00
Dmitry Babokin
6169338815 Adding LLVM patch to fix #519 with LLVM 3.3 2013-06-25 16:21:14 +04:00
Tomasz Koziara
86ee8db778 Parallel prefix sum added + minor amendments. 2013-06-25 12:45:51 +01:00
Dmitry Babokin
6bc8cb1ff1 Merge pull request #529 from ifilippov/instrument_fix
correction of --instrument option support
2013-06-25 03:08:02 -07:00
Dmitry Babokin
0fc49b1c37 Merge pull request #528 from ifilippov/test3
Reapplying lost commits
2013-06-25 02:14:24 -07:00
Ilia Filippov
9fb981e9a0 correction of --instrument option support 2013-06-25 12:33:23 +04:00
Ilia Filippov
cba1b3cedd additional libraries for LLVM_3_4 build 2013-06-25 12:22:53 +04:00
Ilia Filippov
12c4512932 adding two additional libraries for LLVM_3_4 build 2013-06-25 12:22:53 +04:00
Tomasz Koziara
f2452f040d First commit of the radix sort example. 2013-06-24 18:37:44 +01:00
Dmitry Babokin
0dd1dbb568 Merge pull request #526 from dbabokin/master
Tracking LLVM trunk: removing llvm::createSimplifyLibCallsPass() call
2013-06-23 23:10:19 -07:00
Dmitry Babokin
fdcec5a219 Tracking LLVM trunk: removing llvm::createSimplifyLibCallsPass() call 2013-06-24 10:08:06 +04:00
Dmitry Babokin
bebab7ab0d Merge pull request #525 from dbabokin/debug
--debug output: stdout instead of stderr
2013-06-21 03:56:17 -07:00
Dmitry Babokin
fb771b6aa3 --debug output: stdout instead of stderr 2013-06-20 22:47:29 +04:00
jbrodman
8156559475 Merge pull request #522 from dbabokin/broadcast
Fix for #520
2013-06-18 11:47:24 -07:00
jbrodman
9f5e51cd01 Merge pull request #523 from dbabokin/tot
Tracking ToT changes
2013-06-18 11:47:16 -07:00
Dmitry Babokin
27daab2f1b Fix for #520 2013-06-18 22:15:49 +04:00
Dmitry Babokin
c4d404b15f Tracking ToT changes: changes in MCContext interface 2013-06-18 22:13:14 +04:00
Dmitry Babokin
95fcdc36ee Tracking ToT changes, which now require to link option library. This is Unix only. Windows will be fixed separately 2013-06-18 22:12:33 +04:00
Dmitry Babokin
2fdaba53c1 Merge pull request #517 from ifilippov/bug_34
Fix for tests/soa-22 on x86/sse4 - cleanup in function LLVMFlattenInsertChain().
2013-06-14 08:40:01 -07:00
Ilia Filippov
5c89080469 changes in function LLVMFlattenInsertChain 2013-06-14 16:38:54 +04:00
Ilia Filippov
d92f9df17c changes in function LLVMFlattenInsertChain 2013-06-14 15:21:45 +04:00
Dmitry Babokin
f551390420 Merge pull request #516 from ifilippov/master
Changes to support skipping tests.
2013-06-13 08:48:29 -07:00
Ilia Filippov
8642b4d89f changing run_tests to support skipping tests and time 2013-06-13 19:25:34 +04:00
Ilia Filippov
6fb70c307d changing run_tests to support skipping tests and time 2013-06-13 19:00:02 +04:00
Ilia Filippov
d08346fbcf changes to support skipping tests 2013-06-13 16:47:10 +04:00
jbrodman
141d240a91 Merge pull request #513 from dbabokin/release_142
Release 1.4.2, 11 June 2013
2013-06-11 07:47:37 -07:00
Dmitry Babokin
cf9ceb6bf9 Release 1.4.2, 11 June 2013 2013-06-11 17:18:54 +04:00
Dmitry Babokin
7589ae0de5 Merge pull request #512 from ifilippov/bug_34
Fix to track LLVM 3.4 ToT changes
2013-06-04 07:10:04 -07:00
jbrodman
f46e5b37e9 Merge pull request #511 from dbabokin/win32
Fix for #503 - avoid omitting frame pointer on Win32
2013-06-04 06:43:53 -07:00
Ilia Filippov
560acd5017 changes to support createFunction() with DICompositeType argument in LLVM_3_4 2013-06-04 15:48:39 +04:00
Dmitry Babokin
2267f278d2 Fix for #503 - avoid omitting frame pointer on Win32 2013-06-04 14:51:36 +04:00
jbrodman
0feeef585c Merge pull request #509 from jbrodman/master
Change generic-16's knc.h to use __mmask16 instead of a struct.
2013-05-30 13:21:23 -07:00
james.brodman
6211966c55 Change mask to use __mmask16 instead of a struct. 2013-05-30 16:04:44 -04:00
Dmitry Babokin
92f591b4bd Merge pull request #508 from dbabokin/master
Bumping version to 1.4.1dev
2013-05-28 08:59:13 -07:00
Dmitry Babokin
29ceb42b7b Bumping version to 1.4.1dev 2013-05-28 19:58:27 +04:00
Dmitry Babokin
adaabe5993 Merge pull request #507 from dbabokin/master
Bumping up to 1.4.1 version
2013-05-28 08:49:14 -07:00
Dmitry Babokin
6c392ee4a1 Changes for 1.4.1 release 2013-05-28 19:46:30 +04:00
jbrodman
7699eda5ba Merge pull request #506 from jbrodman/master
Typo Fix
2013-05-28 08:13:03 -07:00
james.brodman
d8b5fd5409 Typo fix. 2013-05-28 11:13:43 -04:00
Dmitry Babokin
b37ffdbe85 Merge pull request #505 from dbabokin/release
Changes for 1.4.0 release
2013-05-27 06:03:22 -07:00
Dmitry Babokin
481bcc732b Changes for 1.4.0 release 2013-05-27 16:48:41 +04:00
jbrodman
ce175aee4c Merge pull request #504 from dbabokin/attr
Adding missing attributes on exported functions
2013-05-24 07:38:03 -07:00
jbrodman
50896b373b Merge pull request #502 from dbabokin/malloc
Aligned memory allocation generalization.
2013-05-24 07:37:41 -07:00
Dmitry Babokin
1a40f936df Windows build cleanup: moving generated lex.cc and parse.cc to build dir 2013-05-24 10:29:01 +04:00
Dmitry Babokin
1024ba9b0f Parameter cast for posix_memalign(), otherwise gcc issues an error 2013-05-24 10:29:01 +04:00
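The gcc complaint is about posix_memalign()'s void ** out-parameter; one common way to satisfy it (a sketch, not the repo's code):

    #include <cstdlib>

    float *allocAlignedFloats(size_t count) {
        void *ptr = nullptr;
        // posix_memalign() takes a void **; going through a void *
        // temporary avoids the pointer conversion gcc rejects.
        if (posix_memalign(&ptr, 16, count * sizeof(float)) != 0)
            return nullptr;
        return (float *)ptr;
    }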
Dmitry Babokin
1a7ac8b804 Enable memory alignment management via compiler options 2013-05-24 10:29:01 +04:00
Dmitry Babokin
7bedb4a081 Add memory alignment dependant on the platform (16/32/64/etc) 2013-05-24 10:29:01 +04:00
Dmitry Babokin
630215f56f Defining memory routines completely separately for Windows/Unix 32/64 bit. 2013-05-24 10:29:01 +04:00
Dmitry Babokin
6f0e5fd402 Adding RUNTIME define to gen-bitcode-* files generation command 2013-05-24 10:29:01 +04:00
Dmitry Babokin
66ec43739a Moving temporary files to Debug/Release folder on Windows 2013-05-24 10:29:01 +04:00
Dmitry Babokin
44f9d1ed78 Fix for CYGWIN build warnings (Windows style slashes instead of Unix style) 2013-05-24 10:29:01 +04:00
Dmitry Babokin
c6d479b8ad Enabling 32/64 bit version library build on Windows 2013-05-24 10:29:00 +04:00
Dmitry Babokin
80e2f4e342 Removing redundant Debug/Release records in VS proj file - they are unified 2013-05-24 10:29:00 +04:00
Dmitry Babokin
4b388edca9 Splitting .ll files to be compiled in two versions - 32 and 64 bit. Unix only 2013-05-24 10:29:00 +04:00
Dmitry Babokin
5362dade37 Fixing util.m4 to declare nothing unless some macro is instantiated 2013-05-24 10:29:00 +04:00
Dmitry Babokin
3d24265d50 Adding missing attributes on exported functions 2013-05-24 10:28:06 +04:00
Dmitry Babokin
7057fc2930 Merge pull request #501 from jbrodman/sven
Fix branch tests for breaks and returns.
2013-05-21 08:07:10 -07:00
jbrodman
e661ee95ff Merge pull request #499 from dbabokin/debug
Fix for #462: broken debug info support with LLVM 3.3+
2013-05-21 08:00:14 -07:00
jbrodman
333b89fa07 Merge pull request #500 from dbabokin/docs
Bringing docs/ispc.rst in sync with ispc.html at web site
2013-05-21 07:59:18 -07:00
jbrodman
04e888548e Merge pull request #497 from dbabokin/knc-vector
Minor fix for generic DataLayout
2013-05-21 07:59:01 -07:00
james.brodman
403d9e1059 Update break/continue test to use contribution of function mask. 2013-05-21 10:52:38 -04:00
james.brodman
4ea02c59d8 Disable break optimization and change return check to use full mask. 2013-05-21 10:00:22 -04:00
Dmitry Babokin
de7ba7a55b Bringing docs/ispc.rst in sync with ispc.html at web site (some changes were done there directly) 2013-05-21 16:44:46 +04:00
Dmitry Babokin
23ba61e76f Fix for #462: broken debug info support with LLVM 3.3+ 2013-05-20 22:28:47 +04:00
Dmitry Babokin
c0aa7e0314 Merge pull request #498 from jbrodman/master
Codegen Cleanup
2013-05-17 11:31:31 -07:00
james.brodman
9f44e597d6 Additional Not -> Xor w/ MaskAllOn 2013-05-15 18:15:41 -04:00
james.brodman
60c5bef90f Simplify ~mask codegen to emit single XOR like other places in the code. 2013-05-15 16:57:41 -04:00
Dmitry Babokin
a38fcf1127 Merge pull request #496 from dbabokin/llvm34
Enabling llvm 3.4
2013-05-15 04:40:47 -07:00
Dmitry Babokin
f22e237381 Minor fix for generic DataLayout 2013-05-13 20:24:51 +04:00
Dmitry Babokin
b6b9daa3c5 Enabling llvm 3.4 2013-05-13 19:25:31 +04:00
jbrodman
d958b0b9d6 Merge pull request #495 from jbrodman/master
knc.h cleanup
2013-05-10 13:24:54 -07:00
james.brodman
7b2eaf63af knc.h cleanup 2013-05-10 13:36:18 -04:00
Dmitry Babokin
aef20b536a Merge pull request #489 from jbrodman/gacteon
Fix for cases where valid lvalues were not being computed.
2013-05-04 02:38:11 -07:00
jbrodman
1a34b1410f Merge pull request #488 from dbabokin/broadcast_library
Efficient library implementation of broadcast
2013-05-03 11:04:26 -07:00
james.brodman
5af2f80bc5 Fix for cases where valid lvalues were not being computed. 2013-05-03 12:12:42 -04:00
Dmitry Babokin
4d241736f0 Merge pull request #487 from dbabokin/win_output
Typos and formatting fixes on Windows
2013-05-01 17:07:28 -07:00
Dmitry Babokin
a47460b4c3 Efficient library implementation of broadcast 2013-05-02 00:12:16 +02:00
Dmitry Babokin
32be338f60 Minor indentation fix 2013-05-02 00:05:17 +02:00
Dmitry Babokin
549655bff4 Adding new line to error/warning message on Windows and fixing some typos. 2013-05-01 20:22:01 +02:00
jbrodman
3e18cec691 Merge pull request #486 from jbrodman/master
Add check for Enum type in Assert
2013-04-30 13:12:46 -07:00
james.brodman
658dd3486b Add check for enum type in Assert. 2013-04-30 16:10:57 -04:00
jbrodman
018e9a12a3 Merge pull request #484 from dbabokin/malloc
Fix for aligned move of unaligned data in 32 bit platforms.
2013-04-30 12:02:04 -07:00
jbrodman
2027a6ac12 Merge pull request #483 from dbabokin/win_rm_files
Fix for removing temp files on Windows
2013-04-30 11:03:48 -07:00
Dmitry Babokin
26bec62daf Removing duplicate free definition on Linux 2013-04-27 00:29:51 +04:00
Dmitry Babokin
7497e86902 Adding Windows support for aligned memory allocation on Windows 2013-04-26 22:07:30 +02:00
Dmitry Babokin
e084f1c311 Adding missing copyright info in Makefile 2013-04-26 19:11:20 +02:00
Dmitry Babokin
95950885cf Use posix_memalign to allocate 16-byte aligned memory on Linux/MacOS. 2013-04-26 20:33:24 +04:00
Dmitry Babokin
9cd84aeea9 Fix for removing temp files on Windows 2013-04-25 22:50:37 +02:00
jbrodman
d324ec247e Merge pull request #482 from dbabokin/sprintf
Some more clean and stability fixes
2013-04-25 12:07:27 -07:00
Dmitry Babokin
cbb0d6ce06 Don't run many threads when only one test is specified 2013-04-25 21:12:16 +04:00
Dmitry Babokin
d36ab4cc3c Adding noalias attribute to malloc return 2013-04-25 20:39:01 +04:00
Dmitry Babokin
1069a3c77e Removing some sources of warnings in sse4.h and trailing spaces 2013-04-25 03:40:32 +04:00
Dmitry Babokin
36da23c9c5 MacOS-specific fix: enable translation of symbols with variant names. It's needed for fputs() translation from __do_print() for generic targets. 2013-04-25 03:11:39 +04:00
Dmitry Babokin
e756daa261 Remove sprintf warnings on Windows and fix sprintf-related fails on Mac 2013-04-24 22:36:48 +02:00
jbrodman
65ac336211 Merge pull request #481 from dbabokin/win_cleanup
Set of changes for better build and testing experience on Windows
2013-04-24 08:36:53 -07:00
Dmitry Babokin
14fe987956 More portable way of doing print in run_tests.py 2013-04-24 17:11:30 +02:00
Dmitry Babokin
c5acf239f2 Pass lock as a parameter to subprocesses to make task counter work on Windows 2013-04-24 01:14:46 +02:00
Dmitry Babokin
a02500b112 Make the status update look good in a standard 80-char terminal (print on a single line).
Issue a message about the compiler used only once on Windows.
2013-04-24 00:21:18 +02:00
Dmitry Babokin
8fea85a85c Pass total number of tests as an explicit parameter to subprocesses, so it works on Windows 2013-04-23 23:47:21 +02:00
Dmitry Babokin
6f42bfc640 Fixing native testing on Windows
All temporary files are stored in tmp* directories, including generic targets
Generic targets are handled correctly on Windows now (they still fail for various reasons)
2013-04-23 22:47:38 +02:00
Dmitry Babokin
6b8741dbd7 Removing 4244 warning and cleanup of linked libraries list 2013-04-23 15:21:33 +02:00
jbrodman
ce5d4b6ccc Merge pull request #477 from dbabokin/broadcast
One more opportunity to do better broadcast
2013-04-18 07:57:47 -07:00
Dmitry Babokin
eb2e5f378c Comment fixes 2013-04-18 15:36:35 +04:00
jbrodman
9251ea2b22 Merge pull request #476 from dbabokin/win_dl
Fix for DataLayout inconsistency on win32
2013-04-17 13:15:03 -07:00
Dmitry Babokin
cb650d6100 One more opportunity to do better broadcast 2013-04-17 20:56:32 +04:00
jbrodman
3f3d9219ea Merge pull request #475 from dbabokin/path
Fix for #474: colon separated path in -I
2013-04-17 07:44:37 -07:00
Dmitry Babokin
11528b0def Fix for #474: colon separated path in -I 2013-04-17 18:38:57 +04:00
Dmitry Babokin
a5f0e713d6 Fix for DataLayout inconsistency on win32 2013-04-17 17:20:17 +04:00
jbrodman
de3a864bd6 Merge pull request #473 from dbabokin/win_test
Avoid double spaces in error messages to get same messages on Win and Unix
2013-04-16 07:22:11 -07:00
Dmitry Babokin
af551b3c09 Avoid double spaces in error messages to get same messages on Windows and Unix 2013-04-15 19:10:10 +04:00
jbrodman
5d64d23b61 Merge pull request #472 from dbabokin/gcc47
Fix for the bug with gcc4.7: incorrect usage of ArrayRef
2013-04-15 04:37:55 -07:00
Dmitry Babokin
269a00f9ec Fix for the bug with gcc4.7: incorrect usage of ArrayRef 2013-04-15 13:16:28 +04:00
jbrodman
29c482789f Merge pull request #471 from dbabokin/multi_targets
#469: Fix for multi-target compilation
2013-04-12 07:58:05 -07:00
Dmitry Babokin
a0462fe1ee #469: Fix for multi-target compilation 2013-04-12 14:06:12 +04:00
jbrodman
78a840f48d Merge pull request #467 from dbabokin/broadcast
Broadcast implementation as InsertElement+Shuffle and related improvements
2013-04-11 13:42:56 -07:00
Dmitry Babokin
7371d82bdf Changes in cbackend.cpp to match broadcast generation changes 2013-04-12 00:10:41 +04:00
Dmitry Babokin
4c35d9456a Additional cleanup to enable more broadcasts 2013-04-10 15:34:21 +04:00
Dmitry Babokin
0704081d91 ctx.h copyright fix 2013-04-10 02:33:41 +04:00
Dmitry Babokin
5898532605 Broadcast implementation as InsertElement+Shuffle and related improvements 2013-04-10 02:18:24 +04:00
Dmitry Babokin
603abf70dc Merge pull request #463 from jbrodman/master
Fix to avoid generating 64-bit gather/scatter when force32BitAddressing is set
2013-04-08 05:14:25 -07:00
james.brodman
0a3822f2e5 Fix to make sure we're generating 32-bit gather/scatter when force32bitaddressing is set. 2013-04-05 16:21:05 -04:00
Dmitry Babokin
f76eb2b7f5 Merge pull request #460 from Vsevolod-Livinskij/master
Fix for issue #453
2013-04-04 08:59:49 -07:00
jbrodman
08bbf5f5ef Merge pull request #458 from dbabokin/dl_check
DataLayout consistency check
2013-04-03 11:03:29 -07:00
Vsevolod Livinskij
4ea08116b8 Issue #453: now run_tests.py checks ispc_exe availability 2013-04-03 18:04:11 +04:00
Vsevolod Livinskij
78e03c6402 Issue #453: now run_tests.py checks ispc_exe availability otherwise prints an error message and exits 2013-04-03 16:51:16 +04:00
Vsevolod Livinskij
4ab89de343 Issue #453: now run_tests.py checks ispc_exe availability otherwise prints an error message 2013-04-03 02:58:16 +04:00
Vsevolod Livinskij
6db460fb81 Issue #453: now run_tests.py checks ispc_exe availability otherwise prints an error message 2013-04-03 02:34:42 +04:00
Dmitry Babokin
be859df51e Fix for #457 - issue with compiler Unicode output 2013-04-03 02:23:06 +04:00
Dmitry Babokin
3c533f2ba4 Fix for the issue #449: warning due to DataLayout mismatch 2013-04-02 19:08:44 +04:00
Dmitry Babokin
9cf037c90f Merge pull request #451 from Vsevolod-Livinskij/master
Issue #436
2013-04-02 02:12:57 -07:00
Vsevolod Livinskij
9e0425e824 Checks for the required compiler; otherwise prints an error message and exits the program 2013-03-29 18:42:02 +04:00
Vsevolod Livinskij
2960479095 Issue 2013-03-29 18:30:23 +04:00
jbrodman
baf1ff033a Merge pull request #450 from jbrodman/master
Added implementations of 3 intrinsics for double precision vectors
2013-03-28 08:57:02 -07:00
james.brodman
52dcbf087a Implemented 3 more intrinsics on double precision vectors 2013-03-28 11:55:53 -04:00
jbrodman
d1c6331924 Merge pull request #448 from dbabokin/target_class
Redesign of Target class with several fixes
2013-03-25 10:21:00 -07:00
Dmitry Babokin
0af2a13349 DataLayout is changed to be managed from single place. v4-128-128 is added to generic DataLayout 2013-03-23 14:38:51 +04:00
Dmitry Babokin
7f0c92eb4d Fix for #431: memory leak due to multiple TargetMachine creation 2013-03-23 14:33:45 +04:00
Dmitry Babokin
0f86255279 Target class redesign: data moved to private. Also empty target-feature attribute is not added anymore (generic targets). 2013-03-23 14:28:05 +04:00
jbrodman
b2517c8a18 Merge pull request #445 from dbabokin/master
Adding information about compiler used to build ISPC to make output.  Also adds line endings cleanup.
2013-03-19 08:09:31 -07:00
Dmitry Babokin
95d0c5e67b Merge branch 'master' of https://github.com/dbabokin/ispc 2013-03-18 16:40:54 +04:00
Dmitry Babokin
0885b2bf23 Merge pull request #444 from dbabokin/master
Fixing tabs and trailing white spaces
2013-03-18 05:39:13 -07:00
Dmitry Babokin
706337bb5a Doxygen fix: don't identify the words Type and Target as class names 2013-03-18 16:31:56 +04:00
Dmitry Babokin
3f8a678c5a Editorial change: fixing trailing white spaces and tabs 2013-03-18 16:17:55 +04:00
Dmitry Babokin
0f631ad49b Add info about compiler used for ispc build to Makefile output 2013-03-18 12:30:06 +04:00
Dmitry Babokin
5bc3b4f768 Merge pull request #442 from dbabokin/master
Preprocessor should consider "line comments"
2013-03-14 01:48:49 -07:00
jbrodman
e8bd464ab2 Merge pull request #443 from jbrodman/master
Tweak to sse4.h intrinsics include file.
2013-03-13 07:59:27 -07:00
james.brodman
ef1af547e2 Change sse4.h to enable inlining. 2013-03-13 10:55:53 -04:00
Dmitry Babokin
f2dcad27bb Fix for LLVM 3.1 and #441 2013-03-12 21:13:08 +04:00
Dmitry Babokin
01992006b2 Fix for #441: Preprocessor complains about code commented out by // 2013-03-12 18:56:32 +04:00
jbrodman
487820fb4d Merge pull request #437 from dbabokin/docs
Docs
2013-03-04 10:48:35 -08:00
Dmitry Babokin
d760d67598 Doxygen fix: don't identify the word Declaration as a class 2013-03-04 03:28:02 +04:00
Dmitry Babokin
3cb827ac56 Fix for some typos in User's Guide 2013-03-04 03:04:46 +04:00
pengtu
15c5dfc12b Merge pull request #435 from dbabokin/master
Tracking ToT changes in DIBuilder interface
2013-02-28 12:25:45 -08:00
Dmitry Babokin
524939dc5b Fix for issue #430 2013-02-27 18:03:07 +04:00
Dmitry Babokin
51fdff208e Tracking ToT changes in DIBuilder interface 2013-02-25 14:50:33 +04:00
jbrodman
65a309648b Merge pull request #434 from dbabokin/master
Fix for #433
Reading the LLVM lists makes me concur.  The functionality (if it was even needed) was merged into other existing infrastructure.
2013-02-23 09:09:30 -08:00
Dmitry Babokin
7d08eeb8dd Fix for #433: fix for ToT changes, removal of llvm::createGCInfoDeleter() 2013-02-23 20:49:56 +04:00
Dmitry Babokin
9e0428ba0d One more missing file in doxygen.cfg: cbackend.cpp 2013-02-23 20:45:31 +04:00
Jean-Luc Duprat
15461a2460 Merge pull request #432 from dbabokin/master
Some minor changes in Makefile and doxygen.cfg
2013-02-22 10:29:17 -08:00
Dmitry Babokin
88f21b5c57 Update for doxygen.cfg: adding ast.* and func.* and fixing builtins-c.c => builtins/builtins.c 2013-02-21 18:13:22 +04:00
Dmitry Babokin
bee3029764 Adding debug and clang targets, changing asan target 2013-02-21 17:26:21 +04:00
Dmitry Babokin
150d6d1f56 Adding Address Sanitizer build 2013-02-15 06:50:26 -08:00
jbrodman
d03e4ac100 Merge pull request #429 from dbabokin/master
Fix for #428
2013-02-11 11:10:32 -08:00
Dmitry Babokin
8d8d9c63fe Fix for #349: build issue when no git found 2013-02-11 11:01:46 -08:00
Dmitry Babokin
52147ce631 Fixing issue #428: need to specify LLVM libs explicitly 2013-02-11 04:15:50 -08:00
jbrodman
5af41df8a5 Merge pull request #427 from jbrodman/master
Clang ToT fixes.
2013-02-04 10:56:37 -08:00
james.brodman
775ecd6dfe Tracking ToT changes. Clang PP APIs changed. 2013-01-30 11:57:33 -05:00
jbrodman
b139235c62 Merge pull request #425 from jbrodman/master
Tracking Attribute API changes in ToT
2013-01-23 07:55:33 -08:00
james.brodman
8f2c910600 Tracking ToT changes to Attribute API 2013-01-23 10:57:05 -05:00
jbrodman
7e67f01d4b Merge pull request #424 from jbrodman/master
Tracking Attribute API changes in ToT
2013-01-22 07:45:20 -08:00
james.brodman
ad7e800446 Tracking Attribute API Changes in ToT 2013-01-22 10:46:42 -05:00
Jean-Luc Duprat
6326924de7 Fixes to the implementations of any() and none() in the stdlib.
These make sure that inactive vector lanes do not interfere with the results
2013-01-18 11:19:54 -08:00
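A scalar model of the corrected semantics (illustrative, not the stdlib source): only lanes active in the execution mask may contribute, so stale values in inactive lanes cannot flip the result.

    bool any_model(const bool *v, const bool *mask, int n) {
        for (int i = 0; i < n; ++i)
            if (mask[i] && v[i])   // inactive lanes are ignored entirely
                return true;
        return false;
    }

    bool none_model(const bool *v, const bool *mask, int n) {
        return !any_model(v, mask, n);
    }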
jbrodman
801a3a6ff5 Merge pull request #423 from jbrodman/master
Properly size double/int64 initializers.
2013-01-18 08:56:48 -08:00
james.brodman
a4e94a26ba Tweak to not oversize short vec types for 64 bit values 2013-01-17 15:45:51 -05:00
jbrodman
44e6be7914 Merge pull request #422 from jbrodman/master
Fixes to build with ToT / M4 Macro Fix
2013-01-14 11:57:25 -08:00
james.brodman
3aaf2ef2d4 ToT Fixes / M4 macro fix 2013-01-14 14:55:10 -05:00
jbrodman
8f902fde9c Merge pull request #420 from jbrodman/master
Fix for c++ backend
2013-01-08 11:55:03 -08:00
james.brodman
b6023c517e Fix/Hack to avoid the cbackend generating spurious array type declarations. 2013-01-08 14:53:17 -05:00
james.brodman
42d77e9191 Modified to mirror asin.ispc and not fail. 2013-01-08 14:33:32 -05:00
jbrodman
312a7582df Merge pull request #419 from jbrodman/master
Fix to acos.ispc test
2013-01-08 11:32:34 -08:00
jbrodman
dc939eba78 Merge pull request #418 from mmp/master
Fix build with LLVM top-of-tree, fix warnings, remove LLVM 3.0 support
2013-01-08 10:28:02 -08:00
jbrodman
f8bec51de2 Merge pull request #411 from pengtu/master
Simple fixes to allow SOA pointers and arrays to be passed as function arguments.
2013-01-08 08:40:01 -08:00
Matt Pharr
0bf1320a32 Remove support for building with LLVM 3.0 2013-01-06 12:27:53 -08:00
Matt Pharr
81dbd504aa Small fixes to eliminate compiler warnings when using clang 2013-01-06 12:10:54 -08:00
Matt Pharr
63dd7d9859 Fix build to work with LLVM top-of-tree again 2013-01-06 12:02:08 -08:00
Jean-Luc Duprat
2063d34f3e Merge pull request #414 from jbrodman/master
Fix to build with 3.2
2013-01-03 11:00:45 -08:00
james.brodman
83fdc2e5ad Fix to build with 3.2. LLVM API Change? 2013-01-03 13:43:47 -05:00
Peng Tu
6ba7368ab0 Fix two compile-time errors to allow SOA pointers and arrays to be passed as function arguments. 2012-12-11 17:20:15 -08:00
Jean-Luc Duprat
c2805942a9 Merge pull request #409 from mmp/master
Bugfix for issue #408.
2012-12-06 09:46:34 -08:00
Matt Pharr
9892c8bf9a Fix logic for ordering of struct declarations in generated header files.
When a struct had an array of another struct type as a member, we weren't
detecting that the struct type in the array needed to be declared before the
enclosing struct type.

Fixes issue #408.
2012-12-06 11:39:22 -05:00
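The shape of the case being fixed, in generated-header terms (struct names are illustrative): B embeds an array of A, so A's declaration must be emitted first.

    struct A { float x, y, z; };

    struct B {
        struct A elements[4];  // array member: A must already be declared
        int count;
    };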
Matt Pharr
23e5877509 merge 2012-12-02 14:32:52 -08:00
Matt Pharr
8cbfde6092 Small fixes to build with LLVM top-of-tree (now numbered as version 3.3) 2012-12-02 14:29:24 -08:00
Jean-Luc Duprat
24087ff3cc Expose none() in the ISPC standard library.
On KNC: all(), any() and none() do not generate a redundant movmsk instruction.
2012-11-27 13:38:28 -08:00
Jean-Luc Duprat
6827001c1d Merge pull request #406 from pengtu/master
Fix ISPC with LLVM TOT build problem
2012-11-22 09:27:10 -08:00
Peng Tu
16b0806d40 Fix LLVM TOT build issue. 2012-11-21 19:09:10 -08:00
Jean-Luc Duprat
2129b1e27d knc.h: Fixed __rsqrt_varying_float() to use _mm512_invsqrt_ps() instead of _mm512_invsqrt_pd()
This was a typo.
2012-11-21 15:40:35 -08:00
Jean-Luc Duprat
a267762f59 Merge pull request #404 from mmp/master
Fix build with LLVM top-of-tree
2012-11-21 10:37:40 -08:00
Jean-Luc Duprat
65ca795030 Merge pull request #405 from jbrodman/master
Tweaked Scalar Repl of Aggregates Optimization
2012-11-19 13:12:26 -08:00
Matt Pharr
e82b649ec0 Fix build with LLVM top-of-tree (various changes to clang entrypoints). 2012-11-16 11:04:11 -08:00
james.brodman
275cdb1713 Merge branch 'master' of https://github.com/ispc/ispc 2012-11-14 13:30:45 -05:00
Jean-Luc Duprat
d3b86dcc90 KNC: fix implementation of __all() to use KNCni mask test instructions... 2012-11-14 09:24:01 -08:00
james.brodman
c736b75075 Merge branch 'master' of https://github.com/ispc/ispc 2012-11-13 17:08:09 -05:00
Jean-Luc Duprat
b601331362 Approximation for inverse sqrt and reciprocal provided in fast math mode.
RCP was actually slow in fast math mode
Inverse sqrt did not expose fast approximation
2012-11-13 14:01:35 -08:00
ptu1
32d44a5b9e Merge branch 'master' of ssh://fmygit6001.fm.intel.com:29418/ssg_dpd_tpi_ispc-ispc_git 2012-11-13 12:47:13 -08:00
ptu1
810784da1f Set the ScalarReplAggregate maximum structure size based on target vector width. 2012-11-13 12:35:45 -08:00
james.brodman
d517b37f3f Merge branch 'master' of https://github.com/ispc/ispc 2012-11-09 10:14:18 -05:00
Jean-Luc Duprat
adeef0af01 Merge pull request #403 from jbrodman/master
Fixed =/== error for KNC intrinsic implementation of __all()
2012-11-08 13:57:42 -08:00
james.brodman
97ddc1ed10 Fixed =/== error in __all() 2012-11-08 16:30:12 -05:00
james.brodman
bf580648a1 Merge branch 'master' of https://github.com/ispc/ispc 2012-11-06 12:03:27 -05:00
Jean-Luc Duprat
ecc54fa0eb Merge pull request #402 from pengtu/master
Fix a bug where an unsigned index variable in a subscript is sign-extended to 64 bits
2012-11-05 21:51:38 -08:00
Peng Tu
04d32ae3e6 Inside LLVM, both signed and unsigned integers are represented with the same type - i32 - effectively a signed int32. On a 64-bit target, we must generate an explicit sext/zext during LLVM IR creation to promote the array index to 64 bits. Otherwise, an unsigned int index becomes a signed int index in the LLVM IR.
I limit the fix to uniform indices to avoid widening a varying index vector to 64 bits. This means that the 32-bit values in varying indices must be positive and smaller than 2^31 at runtime for a program to behave correctly.
2012-11-05 15:02:15 -08:00
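A hedged IRBuilder sketch of the promotion rule (helper name illustrative): zero-extend unsigned uniform indices and sign-extend signed ones when widening to the 64-bit pointer width.

    #include "llvm/IR/IRBuilder.h"

    llvm::Value *lPromoteIndexTo64(llvm::IRBuilder<> &B, llvm::Value *idx,
                                   bool isUnsigned) {
        llvm::Type *i64 = B.getInt64Ty();
        // zext preserves unsigned values; sext would reinterpret them
        // as signed.
        return isUnsigned ? B.CreateZExt(idx, i64) : B.CreateSExt(idx, i64);
    }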
james.brodman
a18b3ae88e Merge branch 'master' of https://github.com/ispc/ispc 2012-10-31 15:25:41 -04:00
james.brodman
e57801a5d1 Typo Fix 2012-10-31 15:25:26 -04:00
ingowald
da4390aede Merge pull request #401 from pengtu/master
Fix a "continue" handling bug in foreach_unique/foreach_active
2012-10-30 01:46:01 -07:00
Peng Tu
9e85667219 Merge remote branch 'upstream/master' 2012-10-29 22:51:22 -07:00
Peng Tu
b80867d473 Move the call to RestoreContinuedLanes from bbBody to the correct place of bbCheckForMore for foreach_unique and foreach_active. 2012-10-29 17:27:11 -07:00
Jean-Luc Duprat
d742dcce59 Merge pull request #400 from jbrodman/master
Fixes a compile error in examples/intrinsics/sse4.h. Assignment was used instead of equality comparison.
2012-10-26 14:08:56 -07:00
james.brodman
7a7af3d5f9 Merge branch 'master' of https://github.com/jbrodman/ispc 2012-10-26 16:55:53 -04:00
jbrodman
e323b1d0ad Fixed compile error: == instead of = 2012-10-26 16:55:28 -04:00
james.brodman
3c18c7a713 Fixed compile error: == instead of = 2012-10-26 16:52:54 -04:00
james.brodman
7c16292cb7 Merge branch 'master' of https://github.com/ispc/ispc 2012-10-24 13:49:04 -04:00
Gerrit Code Review
d665e2e85b Initial empty repository 2012-10-24 09:53:29 -07:00
Matt Pharr
172a189c6f Fix build with LLVM top-of-tree 2012-10-17 11:11:50 -07:00
Matt Pharr
406fbab40e Fix bugs in declarations of __any, __all, and __none in examples/intrinsics.
They return bool, not vector of bool.
2012-10-17 10:55:50 -07:00
Matt Pharr
09dc217f8c Fix hex constant in lParseInteger() (missing an f) 2012-10-16 06:03:33 -07:00
Matt Pharr
9002837750 Remove incorrect assert in tasksys.cpp 2012-10-15 10:43:46 -07:00
Matt Pharr
411d5b44ef Add ISPC_HAS_RAND definition on targets that have a HW RNG.
This lets us check for a functioning rdrand() call in the stdlib
more reliably.  Fixes issue #333.
2012-10-03 09:18:12 -07:00
Matt Pharr
360cc8044e Improve RNG documentation.
Issue #390.
2012-10-03 08:33:43 -07:00
Matt Pharr
ec2e9b5e79 Fix typo in assert() documentation.
Issue #388.
2012-10-03 08:26:38 -07:00
Matt Pharr
881dba61e4 Fix build with LLVM top-of-tree 2012-09-28 06:07:01 -07:00
Matt Pharr
6412876f64 Remove unused __reduce_add_uint{32,64} target functions.
The stdlib code just calls the signed int{32,64} functions,
which gives the right result for the unsigned case anyway.
The various targets didn't consistently define the unsigned
variants in any case.
2012-09-28 05:55:41 -07:00
Matt Pharr
538d51cbfe Add GMRES example 2012-09-20 14:06:55 -07:00
Jean-Luc Duprat
3dd9ff3d84 knc.h:
	Properly pick up on ISPC_FORCE_ALIGNED_MEMORY when --opt=force-aligned-memory is used
	Fixed usage of loadunpack and packstore to use proper memory offsets
	Fixed implementations of __masked_load_*()/__masked_store_*() that were incorrectly (un)packing the loaded lanes
	Cleaned up usage of _mm512_undefined_*(); it is now mostly confined to constructors
	Minor cleanups

knc2x.h:
	Fixed usage of loadunpack and packstore to use proper memory offsets
	Fixed implementations of __masked_load_*()/__masked_store_*() that were incorrectly (un)packing the loaded lanes
	Properly pick up on ISPC_FORCE_ALIGNED_MEMORY when --opt=force-aligned-memory is used
	__any() and __none() speedups
	Cleaned up usage of _mm512_undefined_*(); it is now mostly confined to constructors
2012-09-19 17:11:04 -07:00
Ingo Wald
7f386923b0 Merge branch 'master' of https://github.com/ispc/ispc 2012-09-17 15:54:25 +02:00
Ingo Wald
d2312b1fbd now using the ASSUME_ALIGNED flag in knc.h 2012-09-17 15:54:00 +02:00
Ingo Wald
6655373ac3 commit test 2012-09-17 15:51:37 +02:00
Ingo Wald
d492af7bc0 64-bit gather/scatter, aligned load/store, i8 support 2012-09-17 03:39:02 +02:00
Matt Pharr
230a7b7374 Fix bug with floating-point constant zero vectors.
Issue #377.
2012-09-14 14:24:51 -07:00
Jean-Luc Duprat
4204a752f7 Merge branch 'master' of https://github.com/ispc/ispc 2012-09-14 14:12:49 -07:00
Jean-Luc Duprat
0e88d5f97f Fixed unaligned masked stores on KNC 2012-09-14 14:11:41 -07:00
Matt Pharr
a13e7f2435 #define ISPC_FORCE_ALIGNED_MEMORY, if appropriate, in C++ output. 2012-09-14 13:53:12 -07:00
Matt Pharr
be2108260e Add --opt=force-aligned-memory option.
This forces all vector loads/stores to be done assuming that the given
pointer is aligned to the vector size, thus allowing the use of sometimes
more-efficient instructions.  (If it isn't the case that the memory is
aligned, the program will fail!).
2012-09-14 13:49:45 -07:00
Matt Pharr
59b0a2b208 Mark __any(), __all(), and __none() as internal after they're linked in.
This fixes multiple symbol definition errors when compiling a single binary
for multiple ISA targets.
2012-09-14 13:32:42 -07:00
Matt Pharr
05a5a42a08 Don't force loads/stores from varying types to be unaligned.
These should always actually be aligned in memory.
2012-09-14 12:17:33 -07:00
Jean-Luc Duprat
f0b0618484 Added the following mask tests: __any(), __all(), __none() for all supported targets.
This allows for more efficient code generation of KNC.
2012-09-14 11:06:18 -07:00
Ingo Wald
4ecdbe4bd9 two changes:
- exported structs now get protected with #ifdef/#define blocks (allows including multiple ispc-generated header files into the same C source)
- when creating offload stubs, encountering an 'export' function for which we cannot produce a stub will only trigger a warning, not an error.
2012-09-08 16:09:04 +02:00
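The emitted guards look roughly like this (the guard name and struct are illustrative), which is what lets two ispc-generated headers coexist in one C/C++ translation unit:

    #ifndef __ISPC_STRUCT_Foo__
    #define __ISPC_STRUCT_Foo__
    struct Foo {
        float value;
        int count;
    };
    #endif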
Matt Pharr
9e9f266e52 Add files inadvertently missed in c58d92d46b.
Truly fixes issue #363.
2012-09-07 13:27:07 -07:00
Matt Pharr
0ce67f37ac Use LLVM_VERSION env variable to get LLVM version with MSVC build.
Previously, it was set directly in the ispc.vcxproj file.

Issue #371.
2012-09-06 06:04:32 -07:00
Matt Pharr
ddcd0a49ec Fix bugs with handling of 'continue' statements in foreach_* loops. 2012-09-05 10:16:58 -07:00
Matt Pharr
63b8fac852 Improve naming of temporary variable in IR 2012-09-05 10:13:45 -07:00
Matt Pharr
def8d7850b Fix crasher with malformed programs 2012-09-05 08:43:46 -07:00
Jean-Luc Duprat
0442efc856 Merge branch 'master' of https://github.com/ispc/ispc 2012-09-04 11:00:03 -07:00
Jean-Luc Duprat
f928bbb53c Updated usage of Intel® Initial Many Core Instructions (Intel® IMCI). 2012-09-04 10:57:25 -07:00
Jean-Luc Duprat
1ab7500dbb Updated user's guide to comply with Intel® Xeon Phi™ brand usage guidelines 2012-09-04 10:53:01 -07:00
Matt Pharr
c58d92d46b Issue error if a vector-typed parameter is used in an exported function.
Issue #363.
2012-08-31 06:59:58 -07:00
Matt Pharr
8276e912fd Switch to LLVM 3.1 for default for MSVC builds. Also fixes issue #374 2012-08-31 05:58:39 -07:00
Jean-Luc Duprat
e0490d0df5 Minor fixes needed for building on windows. 2012-08-30 10:56:13 -07:00
Jean-Luc Duprat
11db466a88 Implement the KNC prefetch API so that ISPC prefetch_*() stdlib functions may be used. 2012-08-30 10:24:31 -07:00
Matt Pharr
caaee0b666 Fix crash when using launch with non-task-qualified function 2012-08-29 09:06:47 -07:00
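
(For reference, launch is only valid with 'task'-qualified functions; the crash was on the error path. A hedged ispc sketch, with do_row illustrative:)

    task void do_row(uniform int row);

    void run_all() {
        launch do_row(0);     // OK: do_row is task-qualified
        // launch run_all();  // error (no longer a crash): not a task function
    }
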
Matt Pharr
f2f470f369 Merge pull request #369 from jduprat/master
Task system updates
2012-08-28 14:01:37 -07:00
Jean-Luc Duprat
09bb36f58c Updated the task system in the example directory to support:
Cilk (cilk_for), OpenMP (#pragma omp parallel for), TBB (tbb::task_group and tbb::parallel_for),
as well as a new pthreads-based model that fully subscribes the machine (good for KNC).
With major contributions from Ingo Wald and James Brodman.
2012-08-28 11:13:12 -07:00
Matt Pharr
21719df6fd remove assert that hit with fast-math if user defined their own functions named rcp() 2012-08-21 16:39:36 -07:00
Matt Pharr
39329809dd fix crash with malformed program 2012-08-21 16:35:31 -07:00
Matt Pharr
44797e2925 remove incorrect assert 2012-08-21 16:27:49 -07:00
Jean-Luc Duprat
c8f373d119 Merge branch 'master' of https://github.com/ispc/ispc 2012-08-15 17:42:00 -07:00
Jean-Luc Duprat
8a22c63889 knc2x.h
Introduced knc2x.h, which supports 2x interleaved code generation for KNC (use the target generic-32).
This implementation is even more experimental and incomplete than knc.h, but is already useful (mandelbrot works, for example).

knc.h:
Switch to new intrinsic names: _mm512_set_1to16_epi32() -> _mm512_set1_epi32(), etc.
Fix the declaration of the unspecialized template for __smear_*(), __setzero_*(), __undef_*()
Specifically mark a few vectors as _mm512_undefined_*() in __load<>()
Fixed some implementations of __smear_*(), __setzero_*(), __undef_*() to remove unnecessary dependent instructions.
Implemented ISPC reductions by simply calling the existing intrinsic reductions, which are slightly more efficient than our previous implementation.  Also added reductions for double types.
2012-08-15 17:41:10 -07:00
Matt Pharr
1a4434d314 Fix build with LLVM top-of-tree 2012-08-11 09:28:48 -07:00
Jean-Luc Duprat
165a13b13e knc.h:
vec16_i64 improved with the addition of the following: __extract_element(), __insert_element(), __sub(), __mul(),
		   __sdiv(), __udiv(), __and(), __or(), __xor(), __shl(), __lshr(), __ashr(), __select()
	Fixed a bug in the __mul(__vec16_i64, __vec16_i32) implementation
	Constructors are all explicitly inlined; copy constructor and operator=() explicitly provided
	Loads and stores for __vec16_i64 and __vec16_d use aligned instructions when possible
	__rotate_i32() now has a vector implementation
	Added several reductions: __reduce_add_i32(), __reduce_min_i32(), __reduce_max_i32(),
	       __reduce_add_f(), __reduce_min_f(), __reduce_max_f()
2012-08-10 12:20:10 -07:00
Matt Pharr
43364b2d69 Loosen tolerances to test passes with FMA on AVX2 2012-08-10 06:52:14 -07:00
Matt Pharr
6eaecd20d5 Mark __{get,set}_system_isa builtins as "internal" functions.
This ensures that they have static linkage, which in turn lets one
have multiple object files compiled to multiple targets without having
those cause link errors.

Issue #355.
2012-08-09 16:12:07 -07:00
Matt Pharr
c80bfeacf6 Fix crashes when input program tried to access undefined struct types.
(This in particular would happen when there was an error in the body of a struct
definition and we were left with an UndefinedStructType and then later tried to
do loads/stores from/to it.)

Issue #356.
2012-08-09 14:59:29 -07:00
Matt Pharr
2a19cc1758 Fix cases where we were trying to type cast instead of type convert.
Also, removed erroneous checks about the type of the test expression
in DoStmt and ForStmt.

These together were preventing conversion of pointer types to boolean
values, so things like "while (ptr)" would improperly not compile.

Issue #346.
2012-08-03 12:47:53 -07:00
Matt Pharr
8f5189f606 Type convert arrays in select expressions to pointers to the first element.
Fixes issue #345.
2012-08-03 11:53:59 -07:00
Matt Pharr
49dde7c6f2 Fix bug in declaration of double-precision sqrt intrinsic for AVX targets.
This was preventing sqrts of uniform double values from being compiled
properly.

Issue #344.
2012-08-03 11:43:31 -07:00
Matt Pharr
765a0d8896 Use puts() rather than printf() for printing assertion failure strings.
This way, we don't lose '%'s in the assertion strings.

Issue #342.
2012-08-03 11:31:38 -07:00
Matt Pharr
19d8f2e258 Generate FMA instructions with AVX2 (when possible).
Issue #320.
2012-08-03 10:43:41 -07:00
Matt Pharr
e6aec96e05 Fix build with LLVM top-of-tree 2012-08-03 09:59:41 -07:00
Jean-Luc Duprat
a2d42c3242 KNC: all masked_load_*() and masked_store_*() functions need to do unaligned accesses 2012-08-01 14:37:25 -07:00
Jean-Luc Duprat
52836aae87 Minor documentation clarification on the impact of the ICC -fp-model except option. 2012-08-01 10:24:35 -07:00
Matt Pharr
bda566d6a7 Fix incorrect assertion 2012-08-01 08:11:32 -07:00
Jean-Luc Duprat
63ed90b0fd docs/build.sh runs rst2html rather than rst2html.py
Explicitly documented the fact that ICC needs the -mmic flag to compile for KNC.
Updated ISPC User Guide with details on ICC compiler options that impact FP performance in generated code.
2012-07-30 11:47:25 -07:00
Matt Pharr
0bb4d282e2 Add sys/types.h include for linux/osx. 2012-07-23 08:32:41 -07:00
Matt Pharr
ae89a65dad Fix bug that caused unterminated basic blocks.
Issue #339.
2012-07-23 08:24:18 -07:00
Matt Pharr
e9fe9f5043 Add cpu strings for Ivy Bridge and HSW.
Default to avx2 ISA for HSW CPUs.
2012-07-23 08:24:18 -07:00
Matt Pharr
ce8dc5927c Fix bug in FunctionEmitContext::MatchIntegerTypes
Cause of issue #329.
2012-07-20 10:05:17 -07:00
Matt Pharr
f6989cce38 Disallow native output with generic targets, C++ output with non-generic targets.
Also wrote FAQs about why this is the way it is.
Issue #334.
2012-07-20 09:55:50 -07:00
Jean-Luc Duprat
6dbbf9aa80 Merge branch 'master' of https://github.com/ispc/ispc 2012-07-19 17:33:00 -07:00
Jean-Luc Duprat
fe6282e837 Fixed small issue with name mangling introduced in aecd6e08 2012-07-19 17:32:49 -07:00
Matt Pharr
51210a869b Support core-avx-i and core-avx2 CPU types.
(And map them to avx1.1 and avx2 targets, respectively.)
2012-07-19 10:15:59 -07:00
Matt Pharr
658652a9ff Merge pull request #331 from jduprat/master
New templated API for __setzero() __undef() and __smear()
2012-07-18 16:39:38 -07:00
Jean-Luc Duprat
aecd6e0878 All the smear(), setzero() and undef() APIs are now templated on the return type.
Modified ISPC's internal mangling to pass these through unchanged.
Tried hard to make sure this is not going to introduce an ABI change.
2012-07-17 17:06:36 -07:00
Jean-Luc Duprat
1334a84861 Merge branch 'master' of https://github.com/ispc/ispc 2012-07-17 11:46:30 -07:00
Matt Pharr
6a410fc30e Emit gather instructions for the AVX2 targets.
Issue #308.
2012-07-13 12:29:05 -07:00
Matt Pharr
984a68c3a9 Rename gen_gather() macro to gen_gather_factored() 2012-07-13 12:24:12 -07:00
Matt Pharr
daf5aa8e8b Run inst combine before memory optimizations.
We were previously emitting 64-bit indexing for some gathers where
32-bit was actually fine, due to some adds of constant vectors
that hadn't yet been simplified to a constant result.
2012-07-13 12:14:53 -07:00
Matt Pharr
98b2e0e426 Fixes for intrinsics unsupported in earlier LLVM versions.
Specifically, don't use the half/float conversion routines with
LLVM 3.0, and don't try to use RDRAND with anything before LLVM 3.2.
2012-07-13 12:14:10 -07:00
Matt Pharr
9a1932eaf7 Only set gcc's "-msse4.2", etc, option when compiling for generic targets.
We don't need it when ispc is just generating an object file directly, and gcc
on OS X doesn't recognize -mavx.
2012-07-13 12:02:05 -07:00
Matt Pharr
371d4be8ef Fix bugs in detection of Ivy Bridge systems.
We were incorrectly characterizing them as basic AVX1 without further
extensions, due to a bug in the logic to check CPU features.
2012-07-12 14:11:15 -07:00
Matt Pharr
d180031ef0 Add more tests of basic gather functionality. 2012-07-12 14:05:38 -07:00
Jean-Luc Duprat
e09e953bbb Added a few functions: __setzero_i64(), __cast_sext(__vec16_i64, __vec16_i32), __cast_zext(__vec16_i32),
__min_varying_int32(), __min_varying_uint32(), __max_varying_int32(), __max_varying_uint32()
Fixed the signature of __smear_i64() to match current codegen
2012-07-12 10:32:38 -07:00
Matt Pharr
2c640f7e52 Add support for RDRAND in IvyBridge.
The standard library now provides a variety of rdrand() functions
that call out to RDRAND, when available.

Issue #263.
2012-07-12 06:07:07 -07:00
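
(A hedged usage sketch; the log doesn't show the exact overload set, so the pointer-based, bool-returning form below is an assumption:)

    void fill_random(uniform float * uniform out) {
        uniform float r;
        if (rdrand(&r))       // true if the hardware RDRAND succeeded
            *out = r;
        else
            *out = 0.0;       // RDRAND unavailable; fall back as needed
    }
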
Matt Pharr
2bacebb1fb Doc fixes (Crystal Lemire). 2012-07-11 19:51:28 -07:00
Jean-Luc Duprat
df18b2a150 Fixed missing tmp var needed for use with gather intrinsic 2012-07-11 15:43:11 -07:00
Matt Pharr
216ac4b1a4 Stop factoring out constant offsets for gather/scatter if instr is available.
For KNC (gather/scatter), it's not helpful to factor base+offsets gathers
and scatters into base_ptr + {1/2/4/8} * varying_offsets + const_offsets.
Now, if a HW instruction is available for gather/scatter, we just factor
into base + {1/2/4/8} * offsets (if possible).  Not only is this simpler,
but it also lets us pass a value through to the scale-by-{2/4/8} that's
available directly in those instructions.

Finishes issue #325.
2012-07-11 14:52:29 -07:00
Jean-Luc Duprat
898cded646 Merge branch 'master' of https://github.com/ispc/ispc
Conflicts:
	examples/intrinsics/knc.h
2012-07-11 14:45:00 -07:00
Matt Pharr
c09c87873e Whitespace / indentation fixes. 2012-07-11 14:29:46 -07:00
Matt Pharr
10b79fb41b Add support for non-factored variants of gather/scatter functions.
We now have two ways of approaching gather/scatters with a common base
pointer and with offset vectors.  For targets with native gather/scatter,
we just turn those into base + {1/2/4/8}*offsets.  For targets without,
we turn those into base + {1/2/4/8}*varying_offsets + const_offsets,
where const_offsets is a compile-time constant.

Infrastructure for issue #325.
2012-07-11 14:29:42 -07:00
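
(An ispc-level sketch of the access pattern being factored; names are illustrative:)

    uniform float base[1024];

    float gather_example(int offset) {
        // Native gather/scatter targets: base + {1/2/4/8}*offsets, with the
        // scale encoded in the instruction.  Other targets: base +
        // {1/2/4/8}*varying_offsets + const_offsets, the latter a
        // compile-time constant.
        return base[offset];
    }
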
Matt Pharr
ec0280be11 Rename gather/scatter_base_offsets functions to *factored_base_offsets*.
No functional change; just preparation for having a path that doesn't
factor the offsets into constant and varying parts, which will be better
for AVX2 and KNC.
2012-07-11 14:16:39 -07:00
Matt Pharr
8e19d54e75 Merge pull request #328 from jduprat/explicit_isa_in_tests
Explicit isa in tests
2012-07-10 20:49:37 -07:00
Jean-Luc Duprat
3c070e5e20 run_tests.py will only attempt to use the -mmic flag when the knc.h header is used 2012-07-10 17:07:56 -07:00
Jean-Luc Duprat
dde599f48f run_tests.py now picks the ISA via a -m flag based on the target selected, rather than always picking -msse4.2;
this is needed because -msse4.2 is not supported on KNC.
2012-07-10 16:39:18 -07:00
Jean-Luc Duprat
cc15ecfb3a Merge branch 'master' of https://github.com/ispc/ispc
Conflicts:
	cbackend.cpp
	examples/intrinsics/generic-16.h
	examples/intrinsics/generic-32.h
	examples/intrinsics/generic-64.h
	examples/intrinsics/knc.h
	examples/intrinsics/sse4.h
2012-07-10 16:36:08 -07:00
Jean-Luc Duprat
7a7c54bd59 Minor fixes to knc.h that resulted from integrating bea88ab122 2012-07-10 16:10:48 -07:00
Jean-Luc Duprat
bea88ab122 Integrated changes from mmp/and-fold-opt:
Add peephole optimization to eliminate some mask AND operations.

On KNC, the various vector comparison instructions can optionally
be masked; if a mask is provided, the result is effectively that
the value returned is the AND of the mask with the result of the
comparison.

This change adds an optimization pass to the C++ backend that looks
for vector ANDs where one operand is a comparison and rewrites
them--e.g. "__and(__equal_float(a, b), c)" is changed to
"__equal_float_and_mask(a, b, c)", saving an instruction in the end.

Issue #319.

Merge commit '8ef6bc16364d4c08aa5972141748110160613087'

Conflicts:
	examples/intrinsics/knc.h
	examples/intrinsics/sse4.h
2012-07-10 10:33:24 -07:00
Matt Pharr
926b3b9ee3 Fix bugs with mask-handling for switch/do/for/while statements.
All of these pass the current mask to FunctionEmitContext::SetBlockEntryMask()
so that when a break/continue/return is encountered, it can test to see if all
lanes have followed that path and then return; this in turn ensures that we never
run statements with an all-off execution mask.

These functions were passing the function internal mask, not the full mask, and
thus could end up executing code with the mask all off if some lanes were
disabled by an outer function.  (The new tests test this case.)
2012-07-09 15:13:30 -07:00
Matt Pharr
bc7775aef2 Fix __ordered and __unordered floating point functions for C++ target.
Fixes include adding "_float" and "_double" suffixes as appropriate as well
as providing a number of missing implementations.

This fixes a number of failures in the half* tests.
2012-07-09 14:35:51 -07:00
Matt Pharr
107669686c Fix naming of some comparison ops in knc.h 2012-07-09 12:43:15 -07:00
Matt Pharr
bb11b3ab66 Fix build with LLVM 3.0 2012-07-09 10:45:36 -07:00
Jean-Luc Duprat
516ba85abd Merge pull request #322 from mmp/vector-constants
Vector constants
2012-07-09 09:28:26 -07:00
Jean-Luc Duprat
098277b4f0 Merge pull request #321 from mmp/setzero
More varied support for constant vectors from C++ backend.
2012-07-09 08:57:05 -07:00
Matt Pharr
950a989744 Add test that was supposed to go with 080241b7d1 2012-07-09 08:21:15 -07:00
Matt Pharr
fb8b893b10 Fix incorrect LLVM_3_1svn tests.
1. For some time now, we provide the version without the 'svn'
2. We should be testing "not LLVM 3.0" in these cases, since they
   apply to LLVM 3.2 and beyond as well...
2012-07-09 07:09:25 -07:00
Matt Pharr
9ca80debb8 Remove stale LLVM 2.9 support from builtins/util.m4 2012-07-09 06:54:29 -07:00
Matt Pharr
080241b7d1 Fix bugs with handling types of integer constants.
We now follow the rule that the type of an integer constant is
the first of int32, uint32, int64, uint64, that can hold the
value.  (Unless 'u' or 'l' suffixes have been provided.)

Fixes issue #299.
2012-07-08 08:43:03 -07:00
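
(Illustrating the rule with unsuffixed constants; the values are chosen to cross each boundary:)

    int32  a = 1000;                  // fits in int32
    uint32 b = 3000000000;            // exceeds int32 max  -> uint32
    int64  c = 5000000000;            // exceeds uint32 max -> int64
    uint64 d = 10000000000000000000;  // exceeds int64 max  -> uint64
    uint32 e = 7u;                    // 'u' suffix overrides the rule
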
Matt Pharr
0d534720bb Fix bug with constant folding of select expressions.
We would sometimes pass an int32_t * to the ConstExpr constructor
but claim the underlying type was uint32, which made it grumpy.
2012-07-08 08:36:51 -07:00
Matt Pharr
1dc4424a30 Only override module datalayout for generic targets.
Doing it for all targets was causing a number of tests to fail.
(Actual root cause not determined.)
2012-07-07 15:12:50 -07:00
Matt Pharr
57f0cf30c0 Fix small typos in documentation. 2012-07-07 11:19:57 -07:00
Matt Pharr
8ef6bc1636 Add peephole optimization to eliminate some mask AND operations.
On KNC, the various vector comparison instructions can optionally
be masked; if a mask is provided, the result is effectively that
the value returned is the AND of the mask with the result of the
comparison.

This change adds an optimization pass to the C++ backend that looks
for vector ANDs where one operand is a comparison and rewrites
them--e.g. "__and(__equal_float(a, b), c)" is changed to
"__equal_float_and_mask(a, b, c)", saving an instruction in the end.

Issue #319.
2012-07-07 08:35:38 -07:00
Matt Pharr
974b40c8af Add type suffix to comparison ops in C++ output.
e.g. "__equal()" -> "__equal_float()", etc.

No functional change; this is necessary groundwork for a forthcoming
peephole optimization that eliminates ANDs of masks in some cases.
2012-07-07 07:50:59 -07:00
Matt Pharr
45e9e0be0b Map comparison predicates to strings for C++ output in a stand-alone function. 2012-07-06 16:00:09 -07:00
Matt Pharr
ec0918045d Issue error if compiling for multiple targets and program is coming from stdin.
We currently don't support this, so at least now we issue an intelligible error
message in this case.

Issue #269.
2012-07-06 13:21:53 -07:00
Matt Pharr
38bcecd2f3 Print a useful error if llvm-config isn't found when building.
Previously, there was a ton of unintelligible error spew.

Issue #273.
2012-07-06 13:18:11 -07:00
Matt Pharr
aabbdba068 Switch a few remaining fprintf() calls to use Warning()/Error(). 2012-07-06 12:56:45 -07:00
Matt Pharr
84c183da1f Issue error if a non "generic" target is used with C++ emission.
Issue #314.
2012-07-06 12:56:24 -07:00
Matt Pharr
b363b98211 Improve handling of datalayout for generic targets.
Flag 32-bit vector types as only requiring 32-bit alignment (preemptive
bug fix for 32xi1 vectors).

Force module datalayouts to be the same before linking them to silence
an LLVM warning.

Finishes issue #309.
2012-07-06 12:51:17 -07:00
Matt Pharr
8defbeb248 Handle llvm.objectsize intrinsic in C++ backend.
Partially addresses issue #309.
2012-07-06 12:29:23 -07:00
Matt Pharr
f52d227d80 Remove extra newline in error message 2012-07-06 11:31:29 -07:00
Matt Pharr
78cb45fb25 Improve error message with ambiguous function overloads.
Issue #316.
2012-07-06 11:25:57 -07:00
Matt Pharr
2d8026625b Always check the execution mask after break/continue/return.
When "break", "continue", or "return" is used under varying control flow,
we now always check the execution mask to see if all of the program
instances are executing it.  (Previously, this was only done with "cbreak",
"ccontinue", and "creturn", which are now deprecated.)

An important effect of this change is that it fixes a family of cases
where we could end up running with an "all off" execution mask, which isn't
supposed to happen, as it leads to all sorts of invalid behavior.

This change does cause the volume rendering example to run 9% slower, but
doesn't affect the other examples.

Issue #257.
2012-07-06 11:09:11 -07:00
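
(A hedged sketch of the pattern this affects; data and limit are illustrative:)

    void scan(uniform float * uniform data, float limit) {
        float pos = 0;
        for (uniform int i = 0; i < 100; ++i) {
            pos += data[i];
            if (pos > limit)   // varying condition -> varying control flow
                break;         // formerly needed 'cbreak' to get the all-instances check
        }
    }
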
Matt Pharr
73afab464f Provide mask at block entry for switch statements.
This fixes a crash if 'cbreak' was used in a 'switch'.  Renamed
FunctionEmitContext::SetLoopMask() to SetBlockEntryMask(), and
similarly the loopMask member variable.
2012-07-06 11:08:05 -07:00
Matt Pharr
8aa139b6be For C++ output, store constant vector values in local arrays.
When we have a constant vector of primitive types, we now generate
a definition of a static const array of the individual values.  This
in turn allows us to emit a simple aligned vector load to get the
constant vector value, rather than inefficiently inserting the values
into a vector.

Issue #318.
2012-07-06 08:57:09 -07:00
Matt Pharr
e5fe0eabdc Update __load() builtins to take const pointers. 2012-07-06 08:47:47 -07:00
Matt Pharr
0d3993fa25 More varied support for constant vectors from C++ backend.
If we have a vector of all zeros, a __setzero_* function call is emitted,
permitting calling specialized intrinsics for this.  Undefined values
are reflected with an __undef_* call, which similarly allows passing that
information along.

This change also includes a cleanup to the signature of the __smear_*
functions; since they already have different names depending on the
scalar value type, we don't need to use the trick of passing an
undefined value of the return vector type as the first parameter as
an indirect way to overload by return value.

Issue #317.
2012-07-05 20:19:11 -07:00
Jean-Luc Duprat
ac421f68e2 Ongoing support for int64 for KNC:
Fixes to __load and __store.
Added __add, __mul, __equal, __not_equal, __extract_elements, __smear_i64, __cast_sext, __cast_zext,
and __scatter_base_offsets32_float.

__rcp_varying_float now has a fast-math and full-precision implementation.
2012-07-05 17:05:42 -07:00
Jean-Luc Duprat
b9d1f0db18 Ongoing support for int64 for KNC:
Fixes to __load and __store.
Added __add, __mul, __equal, __not_equal, __extract_elements, __smear_i64, __cast_sext, __cast_zext,
and __scatter_base_offsets32_float.

__rcp_varying_float now has a fast-math and full-precision implementation.
2012-07-05 16:56:13 -07:00
Matt Pharr
6aad4c7a39 Bump version number to 1.3.1dev 2012-07-05 13:35:34 -07:00
Matt Pharr
4186ef204d Fix build with LLVM top of tree. 2012-07-05 13:35:01 -07:00
Matt Pharr
ae7a094ee0 Merge pull request #315 from NicolasT/master
Fix build on Fedora 17
2012-07-04 08:21:03 -07:00
Nicolas Trangez
3a007f939a Build: Include unistd.h where required
Some modules require an include of unistd.h (e.g. for getcwd and isatty
definitions).

These changes were required to build successfully on a Fedora 17 system,
using GCC 4.7.0 & glibc-headers 2.15.
2012-07-04 14:49:00 +02:00
Matt Pharr
b8503b9255 News and doxygen version number bump for 1.3.0 2012-06-29 08:38:38 -07:00
Matt Pharr
b7bc76d3cc Documentation updates for 1.3.0. 2012-06-29 08:35:29 -07:00
Matt Pharr
27d6c12972 Bump ISPC_MINOR_VERSION to 3 2012-06-28 16:15:46 -07:00
Matt Pharr
b69d783e09 Bump version to 1.3.0 2012-06-28 15:35:52 -07:00
Matt Pharr
3b2ff6301c Use fputs() rather than puts() for printing final result from print().
puts() sillily adds an undesired newline.
2012-06-28 12:29:40 -07:00
Matt Pharr
6c7043916e Silence bogus compiler warning 2012-06-28 12:11:56 -07:00
Matt Pharr
96a6e75b71 Fix issues with LLVM 3.0 and 3.1 build in cbackend.cpp
Should fix issue #312.
2012-06-28 12:11:27 -07:00
Matt Pharr
a91e4e7981 Fix missing ;s from 66d4c2ddd9 2012-06-28 12:04:58 -07:00
Jean-Luc Duprat
95d8f76ec3 Added preliminary support for Intel's Xeon Phi KNC processor.
float, int32 and double support is included; int8, int16 and int64
are not supported yet.

This is work in progress and not considered stable yet.
2012-06-28 12:00:55 -07:00
Jean-Luc Duprat
66d4c2ddd9 When the --emit-c++ option is used, the state of the --opt=fast-math option is passed into the generated C++ code.
If --opt=fast-math is used then the generated code contains:
   #define ISPC_FAST_MATH 1
Otherwise it contains:
   #undef ISPC_FAST_MATH

This allows the generic headers to support the user's request.
2012-06-28 11:17:11 -07:00
Jean-Luc Duprat
8115ca739a Added preliminary support for Intel's Xeon Phi KNC processor.
float, int32 and double support is included; int8, int16 and int64 are not supported yet.
This is work in progress and not considered stable yet.
2012-06-28 10:54:09 -07:00
Jean-Luc Duprat
ec4021bbf4 When the --emit-c++ option is used, the state of the --opt=fast-math option is passed into the generated C++ code.
If --opt=fast-math is used then the generated code contains:
   #define ISPC_FAST_MATH 1
Otherwise it contains:
   #undef ISPC_FAST_MATH

This allows the generic headers to support the user's request.
2012-06-28 10:42:29 -07:00
Jean-Luc Duprat
e431b07e04 Changed the C API to use templates to indicate memory alignment to the C compiler
This should help with performance of the generated code.
Updated the relevant header files (sse4.h, generic-16.h, generic-32.h, generic-64.h)

Updated generic-32.h and generic-64.h to the new memory API
2012-06-28 09:29:15 -07:00
Matt Pharr
d34a87404d Provide (undocumented for now) __pause() call to emit PAUSE inst. 2012-06-28 09:28:25 -07:00
Matt Pharr
f38770bf2a Fix build with LLVM ToT 2012-06-28 07:36:10 -07:00
Jean-Luc Duprat
dc9998ccaf Missed a few minor fixes to generic-64.h in previous commit 2012-06-27 17:14:03 -07:00
Jean-Luc Duprat
f1b3703389 Changed the C API to use templates to indicate memory alignment to the C compiler
This should help with performance of the generated code.
Updated the relevant header files (sse4.h, generic-16.h, generic-32.h, generic-64.h)

Updated generic-32.h and generic-64.h to the new memory API
2012-06-27 16:59:26 -07:00
Jean-Luc Duprat
b6a8d0ee7f Merge branch 'master' of git://github.com/ispc/ispc 2012-06-27 10:15:24 -07:00
Jean-Luc Duprat
2a4dff38d0 cbackend.cpp now makes explicit use of the llvm namespace
(Rather than implicitly with a using declaration.)  This will
allow for some further changes to ISPC's C backend, without collision
with ISPC's namespace. This change aims to have no effect on the code
generated by the compiler; it should be a big no-op, except for its
side effects on maintainability.
2012-06-27 08:30:30 -07:00
Jean-Luc Duprat
665c564dcf cbackend.cpp now makes explicit use of the llvm namespace, rather than implicitly with a using declaration.
This will allow for some further changes to ISPC's C backend, without collision with ISPC's namespace.
This change aims to have no effect on the code generated by the compiler; it should be a big no-op, except
for its side effects on maintainability.
2012-06-26 22:15:31 -07:00
Jean-Luc Duprat
ed71413e04 Merge branch 'master' of git://github.com/ispc/ispc 2012-06-26 14:32:27 -07:00
Jean-Luc Duprat
4b5e49b00b Merge branch 'master' of github.com:jduprat/ispc 2012-06-26 14:32:01 -07:00
Matt Pharr
f558ee788e Fix bug with generating implicit zero initializer values.
Issue #300.
2012-06-26 11:58:16 -07:00
Matt Pharr
ceb8ca680c Fix crash in codegen for assert() with malformed program.
Issue #302.
2012-06-26 11:54:55 -07:00
Matt Pharr
79ebcbec4b Fix crash in SwitchStmt::TypeCheck() with malformed programs. 2012-06-26 11:21:33 -07:00
Matt Pharr
2c7b650240 Add FAQ to explain how to launch per-instance tasks with foreach_active and unmasked.
Issue #227.
2012-06-22 14:32:05 -07:00
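
(Roughly the FAQ's pattern; do_task is a hypothetical task function:)

    task void do_task(uniform int which);

    void run(int work_id) {                    // one work_id per program instance
        foreach_active (i) {
            uniform int id = extract(work_id, i);
            unmasked {
                launch do_task(id);            // task starts with an all-on mask
            }
        }
    }
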
Matt Pharr
54459255d4 Add unmasked { } statement.
This reestablishes an "all on" execution mask for the gang, which can
be useful for nested parallelism.
2012-06-22 14:30:58 -07:00
Matt Pharr
b4a078e2f6 Add foreach_active iteration statement.
Issue #298.
2012-06-22 10:35:43 -07:00
Matt Pharr
ed13dd066b Distinguish between 'regular' foreach and foreach_unique in FunctionEmitContext
We need to do this since it's illegal to have nested foreach statements, but
nested foreach_unique, or foreach_unique inside foreach, etc., are all fine.
2012-06-22 06:04:00 -07:00
Matt Pharr
2b4a3b22bf Issue an error if the user has nested foreach statements.
Partially addresses issue #280.  (We should support them properly,
but at least now we don't silently generate incorrect code.)
2012-06-21 16:53:27 -07:00
Matt Pharr
8b891da628 Allow referring to the struct type being defined in its members.
It's now legal to write:

struct Foo { Foo *next; };

Previously, a predeclaration "struct Foo;" was required.  This fixes
issue #287.

This change also fixes a bug where multiple forward declarations 
"struct Foo; struct Foo;" would incorrectly issue an error on the
second one.
2012-06-21 16:44:04 -07:00
Matt Pharr
5a2c8342eb Allow structs with no members.
Issue #289.
2012-06-21 16:07:31 -07:00
Matt Pharr
50eb4bf53a Change print() implementation to accumulate string locally before printing.
The string to be printed is accumulated into a local buffer before being sent to
puts().  This ensures that if multiple threads are running and printing at the
same time, their output won't be interleaved within an individual print
statement (it still may be interleaved across different print statements, just as in C).

Issue #293.
2012-06-21 14:41:53 -07:00
Matt Pharr
3c10ddd46a Fix declaration of size_t.
It should be an unsigned integer type.
2012-06-21 14:40:24 -07:00
Matt Pharr
0b7f9acc70 Align <16 x i1> vectors to just 16 bits for generic targets.
Partially addresses issue #259.
2012-06-21 10:25:33 -07:00
Matt Pharr
10fbaec247 Fix C++ output for unordered fp compares.
Fixes a bug introduced in 46716aada3.
2012-06-21 09:57:19 -07:00
Matt Pharr
007a734595 Add support for 'unmasked' function qualifier. 2012-06-20 15:36:00 -07:00
Matt Pharr
46716aada3 Switch to unordered floating point compares.
In particular, this gives us desired behavior for NaNs (all compares
involving a NaN evaluate to true).  This in turn allows writing the
canonical isnan() function as "v != v".

Added isnan() to the standard library as well.
2012-06-20 13:25:53 -07:00
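
(The idiom the message refers to, as a minimal sketch:)

    bool my_isnan(float v) {
        // Unordered compares make every comparison involving a NaN true,
        // so NaN != NaN holds; the new stdlib isnan() is the equivalent.
        return v != v;
    }
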
Matt Pharr
3bc66136b2 Add foreach_unique iteration construct.
Idea via Ingo Wald / IVL compiler.
2012-06-20 10:04:24 -07:00
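
(A minimal usage sketch; shade_id is illustrative:)

    void shade(int shade_id) {
        foreach_unique (id in shade_id) {
            // The body runs once per distinct value of shade_id across the
            // gang; 'id' is uniform and only the lanes holding it are active.
        }
    }
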
Matt Pharr
fae47e0dfc Update stdlib to not use "in" as a variable name.
Preparation for foreach_unique, which uses that as a keyword.
2012-06-20 10:04:24 -07:00
Matt Pharr
bd52e86486 Issue error on attempt to dereference void pointer types.
Issue #288.
2012-06-18 19:51:19 -07:00
Matt Pharr
b2f6ed7209 Fix usage of CastType 2012-06-18 16:26:31 -07:00
Matt Pharr
4b334fd2e2 Fix linkage for programIndex et al. when not debugging.
We now use InternalLinkage for the 'programIndex' symbol (and similar)
if we're not compiling with debugging symbols.  This prevents those
symbol names/definitions from polluting the global namespace for
the common case.

Basically addresses Issue #274.
2012-06-15 11:50:16 -07:00
Matt Pharr
a23a7006e3 Don't issue error incorrectly with forward decl. of exported function.
Issue #281.
2012-06-15 10:54:50 -07:00
Matt Pharr
f47171a17c Don't check for "all off" mask at function entry.
We should never be running with an all off mask and thus should never
enter a function with an all off mask.  No performance change from
removing this, however.

Issue #282.
2012-06-15 10:14:53 -07:00
Matt Pharr
4945dc3682 Add contributors link to docs HTML templates 2012-06-13 06:11:08 -07:00
Matt Pharr
ada66b5313 Make more attempts to pull out constant offsets for gather/scatter.
The "base+offsets" variants of gather decompose the integer offsets into
compile-time constant and compile-time unknown elements.  (The coalescing
optimization, then, depends on this decomposition being done well--having
as much as possible in the constant component.)  We now make multiple
efforts to improve this decomposition as we run optimization passes; in
some cases we're able to move more over to the constant side than was
first possible.

This in particular fixes issue #276, a case where coalescing was expected
but didn't actually happen.
2012-06-12 16:21:14 -07:00
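
(A source-level sketch of the decomposition being improved; grid is illustrative:)

    uniform float grid[64][64];

    float sample(int x, int y) {
        // The offset splits into a varying part (y*64 + x) and a compile-time
        // constant part (+1); the more that lands on the constant side, the
        // more gathers the coalescing pass can merge.
        return grid[y][x + 1];
    }
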
Matt Pharr
96450e17a3 Do all memory op improvements in a single optimization pass.
Rather than having separate passes to do conversion, when possible, of:

- General gather/scatter of a vector of pointers to g/s of
  a base pointer and integer offsets
- Gather/scatter to masked load/store, load+broadcast
- Masked load/store to regular load/store

Now all are done in a single ImproveMemoryOps pass.  This change was in
particular to address some phase ordering issues that showed up with
multidimensional array access wherein after determining that an outer
dimension had the same index value, we previously weren't able to take
advantage of the uniformity of the resulting pointer.
2012-06-12 13:56:17 -07:00
Matt Pharr
40a295e951 Fix bug where "avx-x2" target would cause AVX1.1 to be used. 2012-06-12 13:37:38 -07:00
Matt Pharr
d6c6f95373 Do all replacements of __pseudo* memory ops in a single optimization pass.
Collected the old PseudoGSToGSPass and PseudoMaskedStorePass into a single
pass, ReplacePseudoMemoryOpsPass, which handles both of their tasks.
2012-06-12 13:10:03 -07:00
Matt Pharr
19b46be20d Remove load_and_broadcast from built-ins.
Now that we never ever run with the mask all off, we no longer need
that logic in a built-in function so that we can check the mask.  In
the one place where it was used (turning gathers to the same location
into a load and broadcast), we now just emit the code for that
directly.
2012-06-12 12:30:57 -07:00
Ingo Wald
789e04ce90 Add support for host/device stub functions for offload. 2012-06-12 10:23:49 -07:00
Matt Pharr
dd4f0a600b Update AVX1.1 targets to not include declarations of half/float routines in bit code. 2012-06-08 15:57:36 -07:00
Matt Pharr
6c7df4cb6b Add initial support for "avx1.1" targets for Ivy Bridge.
So far, only the use of the float/half conversion instructions distinguishes
this from the "avx1" target.

Partial work on issue #263.
2012-06-08 15:55:00 -07:00
Matt Pharr
79e0a9f32a Fix codegen bug with foreach_tiled.
When the outermost dimension(s) were partially active, but the innermost
dimension was all on, we'd inadvertently use an incorrect "all on"
execution mask.

Fixes issues #177 and #200.
2012-06-08 14:56:18 -07:00
Matt Pharr
6c9bc63a1c Improve SourcePos reporting of the origin of the gather for gather warnings. 2012-06-08 13:33:11 -07:00
Matt Pharr
28a821df7d Improve wording of gather/scatter performance warnings. 2012-06-08 13:32:57 -07:00
Matt Pharr
27e39954d6 Fix a number of issues in examples/intrinsics/sse4.h.
This had gotten fairly out of date, after recent changes to C++ output.
Roughly 15 tests still fail with this target.

Issue #278.
2012-06-08 12:52:36 -07:00
Matt Pharr
e730a5364b Issue error if any complex assignment operator is used with a struct type.
Issue #275.
2012-06-08 11:29:02 -07:00
Matt Pharr
92b3ae41dd Don't print request to file bug on fatal error twice. 2012-06-08 11:23:45 -07:00
Matt Pharr
89a2566e01 Add separate variants of memory built-ins for floats and doubles.
Previously, we'd bitcast e.g. a vector of floats to a vector of i32s and then
use the i32 variant of masked_load/masked_store/gather/scatter.  Now, we have
separate float/double variants of each of those.
2012-06-07 14:47:16 -07:00
Matt Pharr
1ac3e03171 Gather/scatter function improvements in builtins.
More naming consistency: _i32 rather than i32, now.

Also improved the m4 macros to generate these sequences to not require as
many parameters.
2012-06-07 14:19:23 -07:00
Matt Pharr
b86d40091a Improve naming of masked load/store instructions in builtins.
Now, use _i32 suffixes, rather than _32, etc.  Also cleaned up the m4
macro to generate these functions, using WIDTH to get the target width,
etc.
2012-06-07 13:58:31 -07:00
Matt Pharr
91d22d150f Update load_and_broadcast built-in
Change function suffix to "_i32", etc, from "_32"

Improve load_and_broadcast macro in util.m4 to grab vector width from 
WIDTH variable rather than taking it as a parameter.
2012-06-07 13:33:17 -07:00
Matt Pharr
1d29991268 Indentation fixes in builtins/ 2012-06-07 13:23:07 -07:00
Matt Pharr
6f0a2686dc Use %a format for printf() for float constants on non-Windows platforms. 2012-06-07 13:20:03 -07:00
Matt Pharr
f06caabb07 Generate better code for break statements in varying loops (sometimes).
If we have a simple varying 'if' statement where the only code in the body is
a single 'break', then emit special case code that just updates the execution
mask directly.

Surprisingly, this leads to better generated code (e.g. Mandelbrot 7.1x on AVX
vs 5.8x before).  It's not clear why the general code generation path for
break doesn't generate the equivalent code; this topic should be investigated
further.  (Issue #277).
2012-06-06 11:08:42 -07:00
Matt Pharr
3c869802fb Always store multiply-used vector compares in temporary variables (C++ output). 2012-06-06 11:08:42 -07:00
Matt Pharr
7b6bd90903 Remove various equality checks between GetInternalMask() and LLVMMaskAllOn
These were never kicking in, since GetInternalMask() always loads from the
mask storage memory.
2012-06-06 11:08:42 -07:00
Matt Pharr
967bfa9c92 Silence compiler warning. 2012-06-06 08:08:55 -07:00
Matt Pharr
592affb984 Add experimental (and undocumented for now) export syntax.
This allows adding types to the list of types included in the
automatically-generated header files.

struct Foo { . . . };
struct Bar { . . . };

export { Foo, Bar };
2012-06-05 12:51:21 -07:00
Matt Pharr
96aaf6d53b Fix build with LLVM top of tree. 2012-06-05 12:28:05 -07:00
Matt Pharr
1397dbdabc Don't generate colorized output escapes when stderr isn't a TTY.
When piping to a file, more/less, etc., this is generally undesirable.

This behavior can be overridden with the --colorized-output command-line
flag.
2012-06-04 09:20:57 -07:00
Matt Pharr
6118643232 Handle more error cases if the user tries to declare a method. 2012-06-04 09:07:13 -07:00
Matt Pharr
71198a0b54 Don't indent too much in errors/warnings if the filename is long. 2012-06-04 08:53:43 -07:00
Matt Pharr
22cb80399f Issue error if user tries to declare a method. 2012-06-04 08:50:13 -07:00
Jean-Luc Duprat
fa1fd8a576 Merged Upstream 2012-06-01 11:13:16 -07:00
Matt Pharr
6df7d31a5b Fix incorrect assertion.
Issue #272.
2012-05-30 16:34:59 -07:00
Matt Pharr
ef049e92ef Handle undefined struct types when generating headers. 2012-05-30 16:28:21 -07:00
Matt Pharr
fe8b109ca5 Fix more tests for 32 and 64-wide execution. 2012-05-30 13:06:07 -07:00
Matt Pharr
8fd9b84a80 Update seed_rng() in stdlib to take a varying seed.
Previously, we were trying to take a uniform seed and then shuffle that
around to initialize the state for each of the program instances.  This
was becoming increasingly untenable and brittle.

Now a varying seed is expected and used.
2012-05-30 10:35:41 -07:00
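
(Post-change usage, hedged: RNGState and frandom are recalled stdlib names, so treat the exact spellings as assumptions:)

    void init_and_draw() {
        RNGState state;
        seed_rng(&state, 0x5eed + programIndex);  // varying seed: distinct per instance
        float r = frandom(&state);                // per-lane pseudo-random float
    }
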
Matt Pharr
5cb53f52c3 Fix various tests/[frs]* files to be correct with 32 and 64-wide targets.
Still todo: tests/c*, tests/test-*
2012-05-30 10:31:12 -07:00
Matt Pharr
d86653668e Fix a number of tests to work correctly with 32/64-wide targets.
Still to be reviewed/fixed: tests/test-*, tests/[cfrs]*
2012-05-29 10:16:43 -07:00
Matt Pharr
5084712a15 Fix bugs in examples/intrinsics/generic-64.h
There were a number of situations where we were left-shifting 1 by a
lane index that were failing due to shifting beyond 32-bits.  Fixed
by shifting the 64-bit constant value 1ull.
2012-05-29 08:31:10 -07:00
Jean-Luc Duprat
ece65cab18 Fix some tests for up to 64-wide gangs 2012-05-29 07:52:50 -07:00
Matt Pharr
1f6075506c Fix linux build (Jean-Luc Duprat) 2012-05-28 19:45:16 -07:00
Matt Pharr
51ade48e3d Fix some of the reduce-* tests for 32 and 64-wide targets 2012-05-25 14:47:06 -07:00
Matt Pharr
21c43737fe Fix bug in examples/intrinsics/generic-32.h 2012-05-25 14:27:30 -07:00
Matt Pharr
6c7bcf00e7 Add examples/intrinsics/generic-64.h. 2012-05-25 14:27:19 -07:00
Matt Pharr
7a2142075c Add examples/intrinsics/generic-32.h implementation.
Roughly 100 tests fail with this; all the tests need to be audited
for assumptions that 16 is the widest width possible…
2012-05-25 12:37:59 -07:00
Matt Pharr
e8e9baa417 Update test_static.cpp to handle up to 64-wide 2012-05-25 12:14:58 -07:00
Matt Pharr
449d956966 Add support for generic-64 target. 2012-05-25 11:57:28 -07:00
Matt Pharr
90db01d038 Represent MOVMSK'ed masks with int64s rather than int32s.
This allows us to scale up to 64-wide execution.
2012-05-25 11:57:23 -07:00
Matt Pharr
38cea6dc71 Issue error if "typedef" is inadvertently included in function definition.
Issue #267.
2012-05-25 11:09:26 -07:00
Matt Pharr
64807dfb3b Add AssertPos() macro that provides rough source location in error
It can sometimes be useful to know the general place we were in the program
when an assertion hit; when the position is available / applicable, this
macro is now used.

Issue #268.
2012-05-25 10:59:45 -07:00
Matt Pharr
d943455e10 Issue error on overloaded "export"ed functions.
Issue #270.
2012-05-25 10:35:34 -07:00
Matt Pharr
fd03ba7586 Export reference parameters as C++ references, not pointers. 2012-05-24 07:12:48 -07:00
Matt Pharr
2c5a57e386 Fix bugs related to varying pointers to functions that return void. 2012-05-23 14:29:17 -07:00
Matt Pharr
e8858150cb Allow redundant semicolons at global scope. (Ingo Wald) 2012-05-23 14:20:20 -07:00
Matt Pharr
333f901187 Fix build with LLVM 3.2 dev top-of-tree 2012-05-23 14:19:50 -07:00
Matt Pharr
7dd4d6c75e Update for LLVM 3.2dev API change 2012-05-22 15:53:14 -07:00
Matt Pharr
99f57cfda6 Issue more sensible error message for varying pointers in exported functions. 2012-05-18 12:00:11 -07:00
Matt Pharr
4d1eb94dfd Fix bug in AddElementOffset() error checking. 2012-05-18 11:57:05 -07:00
Matt Pharr
22d584f302 Don't issue perf. warnings for various conversions with generic target. 2012-05-18 11:56:11 -07:00
Matt Pharr
72c41f104e Fix various malformed program crashes. 2012-05-18 10:44:45 -07:00
Matt Pharr
8d3ac3ac1e Fix build with LLVM ToT 2012-05-18 10:09:09 -07:00
Matt Pharr
299ae186f1 Expect support for half and transcendentals from all generic targets 2012-05-18 06:13:45 -07:00
Matt Pharr
f4df2fb176 Improvements to mask update code for generic targets.
Rather than XOR'ing with a temporary 'all-on' vector, we call
__not.  Also, we call out to __and_not1 and __and_not2, for an
AND where the first or second operand, respectively, has had
NOT applied to it.
2012-05-16 13:52:51 -07:00
Matt Pharr
625fbef613 Fix Windows build 2012-05-15 12:19:10 -07:00
Matt Pharr
fbed0ac56b Remove allOffMaskIsSafe from Target
The intent of this was to indicate whether it was safe to run code
with an 'all off' mask on the given target (and then sometimes be
more flexible about e.g. running both true and false blocks of if
statements, etc.)

The problem is that even if the architecture has full native mask support,
it's still not safe to run 'uniform' memory operations with the mask all
off.  Even more tricky, we sometimes transform masked varying memory operations
to uniform ones during optimization (e.g. gather->load and broadcast).

This fixes a number of the tests/switch-* tests that were failing on the
generic targets due to this issue.
2012-05-09 14:18:47 -07:00
Matt Pharr
dc120f3962 Fix regression in masked_store_blend for generic target.
In ee1fe3aa9f, the LLVM_VERSION define was updated to never
have the 'svn' suffix and the build was updated to handle LLVM
3.2.  This file had a check for LLVM_3_1svn that was no longer
hitting.

This fixes some issues with unnecessary loads and stores
in generated C++ code for the generic targets.
2012-05-09 14:18:47 -07:00
Matt Pharr
4f053e5b83 Pass OPT flags when linking 2012-05-08 13:25:09 -07:00
Matt Pharr
c6241581a0 Add an extra parameter to __smear functions to encode return type.
Now, the __smear* functions in generated C++ code have an unused first
parameter of the desired return type; this allows us to have headers
that include variants of __smear for multiple target widths.  (This
approach is necessary since we can't overload by return type in C++.)

Issue #256.
2012-05-08 09:54:23 -07:00
Nipunn Koorapati
041ade66d5 Placated compiler by initializing variable 2012-05-06 06:59:17 -07:00
Nipunn Koorapati
067a2949ba Added syntax highlighting for 'uniform' and 'varying' types. 2012-05-06 06:58:53 -07:00
Matt Pharr
55c754750e Remove a number of redundant/unneeded optimization passes.
Performance and code quality of performance suite is unchanged,
compilation times are improved by another 20% or so for simple
programs (e.g. rt.ispc).  One very complex program compiles
about 2.4x faster now.
2012-05-05 15:47:24 -07:00
Matt Pharr
72b6c12856 Notify LLVM pass mgr that the MakeInternalFuncsStaticPass doesn't change the CFG. 2012-05-05 15:47:24 -07:00
Matt Pharr
15ea0af687 Add -f option to run_tests.py
This allows providing additional command-line arguments to ispc,
e.g. to force compilation with -O1, -g, etc.
2012-05-05 15:47:24 -07:00
Matt Pharr
ee7e367981 Do global dead code elimination early in optimization.
This gives a 15-20% speedup in compilation time for simple
programs (but only ~2% for the big 21k monster program).
2012-05-05 15:47:19 -07:00
Matt Pharr
8006589828 Use llvm::SmallVectors for struct member types and function types.
Further reduction of dynamic memory allocation...
2012-05-04 13:55:38 -07:00
Matt Pharr
413264eaae Make return values const &s to save copying. 2012-05-04 13:55:38 -07:00
Matt Pharr
7db8824da2 Reduce dynamic memory allocation in getting unif/varying variants of AtomicTypes 2012-05-04 13:55:38 -07:00
Matt Pharr
e1bc010bd1 More reduction of dynamic allocations in lDoTypeConv() 2012-05-04 13:55:38 -07:00
Matt Pharr
bff02017da Cache const/non-const variants of Atomic and ReferenceTypes.
More reduction of dynamic memory allocation.
2012-05-04 13:55:38 -07:00
Matt Pharr
c0019bd8e5 Cache type and lvalue type in IndexExpr and MemberExpr
This saves a bunch of redundant work and unnecessary duplicated
memory allocations.
2012-05-04 13:55:38 -07:00
Matt Pharr
e495ef2c48 Reduce dynamic memory allocation by reusing scope maps in symbol table. 2012-05-04 13:55:38 -07:00
Matt Pharr
78d62705cc Cache element types in StructType.
Previously, GetElementType() would end up causing dynamic allocation to
happen to compute the final element type (turning types with unbound
variability into the same type with the struct's variability) each time it was
called, which was wasteful and slow.  Now we cache the result.

Another 20% perf on compiling that problematic program.
2012-05-04 13:55:38 -07:00
Matt Pharr
2791bd0015 Improve performance of lCheckTypeEquality()
We don't need to explicitly create the non-const Types to do type
comparison when ignoring const-ness in the check.

We can also save some unnecessary dynamic memory allocation by
keeping strings returned from GetStructName() as references to strings.

This gives another 10% on front-end perf on that big program.
2012-05-04 13:55:38 -07:00
Matt Pharr
7cf66eb61f Small optimizations to various AtomicType methods. 2012-05-04 13:55:38 -07:00
Matt Pharr
944c53bff1 Stop using dynamic_cast for Types.
We now have a set of template functions CastType<AtomicType>, etc., that in
turn use a new typeId field in each Type instance, allowing them to be inlined
and to be quite efficient.

This improves front-end performance for a particular large program by 28%.
2012-05-04 13:55:38 -07:00
Matt Pharr
c756c855ea Compile with -O2 by default on Linux/OSX. 2012-05-04 13:55:37 -07:00
Matt Pharr
58bb2826b2 Perf: cache connection between const/non-const struct variants.
In one very large program, we were spending quite a bit of time repeatedly
getting const variants of StructTypes.  This speeds up the front-end by
about 40% for that test case.

(This is something of a band-aid, pending uniquing types.)
2012-05-04 13:55:37 -07:00
Nipunn Koorapati
b7bef87a4d Added README for vim syntax highlighting. 2012-05-03 14:23:33 -07:00
Matt Pharr
0c1b206185 Pass log/exp/pow transcendentals through to targets that support them.
Currently, this is the generic targets.
2012-05-03 13:49:56 -07:00
Matt Pharr
7d7e99a92c Update ISPC_MINOR_VERSION to 2
(This should have been done with the 1.2.0 release!)
2012-05-03 12:04:24 -07:00
Matt Pharr
1ba8d7ef74 Fix test that had undefined behavior. 2012-05-03 11:11:21 -07:00
Matt Pharr
d99bd279e8 Add generic-32 target. 2012-05-03 11:11:06 -07:00
Matt Pharr
ee1fe3aa9f Update build to handle existence of LLVM 3.2 dev branch.
We now compile with LLVM 3.0, 3.1, and 3.2svn.
2012-05-03 08:25:25 -07:00
Matt Pharr
c4b1d79c5c When a function is defined, set its symbol's position to the code position.
Before, if the function was declared before being defined, then the symbol's
SourcePos would be left set to the position of the declaration.  This ended
up getting the debugging symbols mixed up in this case, which was undesirable.
2012-04-28 20:28:39 -07:00
Matt Pharr
a1a43cdfe0 Fix bug so that programIndex (et al.) are available in the debugger.
It's now possible to successfully print out the value of programIndex,
programCount, etc., in the debugger.  The issue was that they were
defined as having InternalLinkage, which meant that DCE removed them
at the end of compilation.  Now they're declared to have WeakODRLinkage,
which ensures that one copy survives (but there aren't multiply-defined
symbols when compiling multiple files.)
2012-04-28 17:12:57 -07:00
Matt Pharr
27b62781cc Fix bug in lStripUnusedDebugInfo().
This was causing an assert to hit in llvm's DwarfDebug.cpp.
2012-04-28 13:06:29 -10:00
Matt Pharr
0c5d7ff8f2 Add rygorous's float->srgb8 conversion routine to the stdlib.
Issue #230
2012-04-27 10:03:19 -10:00
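
(A hedged usage sketch, assuming the stdlib spells it float_to_srgb8 and returns the 8-bit value as an int:)

    void to_srgb(uniform float src[], uniform int dst[], uniform int n) {
        foreach (i = 0 ... n) {
            dst[i] = float_to_srgb8(src[i]);   // linear [0,1] float -> sRGB [0,255]
        }
    }
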
Matt Pharr
0e2b315ded Add FAQ about foreach code generation.
(i.e. "why's there that extra stuff at the end and what can I do
about it if it's not necessary?)

Issue #231.
2012-04-27 09:35:37 -10:00
Matt Pharr
3e74d1c544 Fix documentation bug with typedef. 2012-04-25 17:15:20 -10:00
Matt Pharr
da690acce5 Fix build with LLVM 3.0 2012-04-25 14:27:33 -10:00
Matt Pharr
0baa2b484d Fix multiple bugs related to DIBuilder::createFunction() call.
The DIType passed to this method should correspond to the
FunctionType of the function, not its return type.

The first parameter should be the DIScope for the compile unit,
not the DIFile.

We previously had the unmangled function name and the mangled
function name interchanged.

The argument corresponding to "first line number of the function" was
missing, which in turn led to subsequent arguments being off, and thus
providing bogus values vs. what was supposed to be passed.

Rename FunctionEmitContext::diFunction to diSubprogram, to better
reflect its type.
2012-04-25 08:43:11 -10:00
Matt Pharr
260d7298c3 Strip unused debugging metadata after done with compilation.
Debugging information for functions that are inlined or static and
not used still hangs around after compilation; now we go through the
debugging info and remove the entries for any DISubprograms that
don't have their original functions left in the Module after
optimization.
2012-04-25 08:43:11 -10:00
Matt Pharr
d5cc2ad643 Call Verify() methods of various debugging llvm::DI* types after creation. 2012-04-25 08:43:11 -10:00
Matt Pharr
12706cd37f Debugging optimization pass updates
Don't run mem2reg with -O0 anymore, but do run the intrinsics opt pass, which
allows some CFG simplification due to the mask being all on, etc.
2012-04-25 08:43:11 -10:00
Matt Pharr
7167442d6e Debugging info: include parameter number for function params. 2012-04-25 08:43:11 -10:00
Matt Pharr
8547101c4b Debugging info: produce more descriptive producer string 2012-04-25 08:43:11 -10:00
Matt Pharr
5d58a9e4c2 Merge pull request #250 from jfpoole/master
Fix 32-bit samples on Mac OS X.
2012-04-23 17:12:46 -07:00
John Poole
cd98a29a4b Fix 32-bit samples on Mac OS X.
On Mac OS X and Linux rdtsc() didn't save and restore 32-bit registers.

This patch fixes issue #87.
2012-04-23 16:00:07 -07:00
Matt Pharr
903714fd40 Merge pull request #248 from nipunn1313/master
Goto with incorrect label now suggests labels based on string distance
2012-04-21 14:43:57 -07:00
Nipunn Koorapati
138c7acf22 Error() and Warning() functions for reporting compiler errors/warnings now respect newlines as part of valid error messages. 2012-04-21 01:44:10 -04:00
Matt Pharr
03b2b8ae8f Bump version number to 1.2.3dev 2012-04-20 14:31:46 -07:00
Matt Pharr
016b502d46 Update release notes for 1.2.2, bump version number in doxygen 2012-04-20 14:26:00 -07:00
Matt Pharr
c5f6653564 Bump version number to 1.2.2 2012-04-20 11:54:12 -07:00
Matt Pharr
cf9a4e209e Fix malformed program crash. 2012-04-20 11:53:43 -07:00
Nipunn Koorapati
040421942f Goto statements with a bad label produce an error message.
Now they also produce a short list of suggestions based on string distance.
2012-04-20 14:42:14 -04:00
Matt Pharr
4dfc596d38 Fix MSVC warnings. 2012-04-20 10:50:39 -07:00
Matt Pharr
fe83ef7635 Merge pull request #247 from nipunn1313/master
Fixed compiler warning
2012-04-20 09:26:57 -07:00
Nipunn Koorapati
db8b08131f Fixed compile error which shows up on LLVM 3.0 2012-04-20 12:17:09 -04:00
Matt Pharr
32815e628d Improve naming of llvm Instructions created.
We now try harder to keep the names of instructions related to the
initial names of variables they're derived from and so forth.  This
is useful for making both LLVM IR as well as generated C++ code
easier to correlate back to the original ispc source code.

Issue #244.
2012-04-19 16:36:46 -07:00
Matt Pharr
71bdc67a45 Add LLVMGetName() utility routines.
Infrastructure for issue #244.
2012-04-19 16:24:40 -07:00
Matt Pharr
cb9f50ef63 C++ backend: mangle variable names less.
This makes the generated code a little easier to connect with the
original program.
2012-04-19 13:11:47 -07:00
Matt Pharr
12c754c92b Improved handling of splatted constant vectors in C++ backend.
Now, when we're printing out a constant vector value, we check to see
if it's a splat and call out to one of the __splat_* functions in
the generated code if so.
2012-04-19 13:11:15 -07:00
Matt Pharr
e4b3d03da5 When available, use ANSI escapes to colorize diagnostic output.
Issue #245.
2012-04-19 11:36:28 -07:00
Matt Pharr
cc26b66e99 Improve source position reporting for scatters.
Now, we only highlight the memory write--not both sides of the
assignment expression.
2012-04-19 11:23:20 -07:00
Matt Pharr
34d81fa522 Fix bugs in tests.
These two tests were walking past the end of the aFOO[] array, which
in turn was leading to failures with the generic-16/c++ output path.
2012-04-19 10:33:33 -07:00
Matt Pharr
49f1a5c2b3 Add print() statements to tests to indicate failure details.
These tests all fail with generic-16/c++ output currently; however, the
output indicates that it's just small floating-point differences.
(Though the question remains, why are those differences popping up?)
2012-04-19 10:32:55 -07:00
Matt Pharr
326c45fa17 Fix bugs in LLVMExtractFirstVectorElement().
When we're manually scalarizing the extraction of the first element
of a vector value, we need to be careful about handling constant values
and about where new instructions are inserted.  The old code was
sloppy about this, which in turn led to invalid IR in some cases.
For example, the two bugs below were essentially due to generating
an extractelement inst from a zeroinitializer value and then inserting
it in the wrong bblock such that a phi node that used that value was
malformed.

Fixes issues #240 and #229.
2012-04-19 09:45:04 -07:00
Matt Pharr
a2bb899a6b Opt debug printing improvement
Now, just match the prefix of the provided function name of interest,
which allows us to not worry about managing details.
2012-04-19 09:34:54 -07:00
Matt Pharr
9fedb1674e Improve basic block dumping from optimization passes.
Now done via a macro, which is cleaner.  It's also now possible to
specify a single function to watch, which is useful for debugging.
2012-04-18 15:46:18 -07:00
Matt Pharr
7c91b01125 Handle more forms of constant vectors in lGetMask().
Various optimization passes depend on turning a compile-time constant
mask into a bit vector; it turns out that in LLVM 3.1, constant vectors
of ints/floats are represented with llvm::ConstantDataVector, but
constant vectors of bools use llvm::ConstantVector (which is what LLVM
3.0 uses for all constant vectors).  Now lGetMask() always does the
llvm::ConstantVector path, to cover this case.

This improves generated C++ code by eliminating things like select
with an all on/off mask, turning movmask calls with constants into
constant values, etc.
2012-04-18 11:39:11 -07:00
Matt Pharr
c202e9e106 Add debugging printing code to optimization passes.
Now all of the passes dump out the basic block before and after
they do their thing when --debug is enabled.
2012-04-18 11:39:10 -07:00
Matt Pharr
645a8c9349 Fix serious bug in VSelMovmskOpt
When the mask was all off, we'd choose the incorrect operand!

(This bug was masked since this optimization wasn't triggering as
intended, due to other issues to be fixed in a forthcoming commit.
2012-04-18 11:39:10 -07:00
Jean-Luc Duprat
093fdcf3df Fixed bad integration 2012-04-18 09:39:54 -07:00
Jean-Luc Duprat
7abda5e8c2 Merge branch 'master' of git://github.com/ispc/ispc 2012-04-18 09:24:35 -07:00
Matt Pharr
abf7c423bb Fix build with LLVM 3.0 2012-04-18 06:14:55 -07:00
Matt Pharr
55d5c07d00 Issue errors when doing illegal things with incomplete struct types.
Issue an error, rather than crashing, if the user has declared a
struct type but not defined it and subsequently tries to:

- dynamically allocate an instance of the struct type
- do pointer math with a pointer to the struct type
- compute the size of the struct type
2012-04-18 06:08:05 -07:00
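
(The now-diagnosed cases, sketched; the 'new' and 'sizeof' spellings assume ispc's C-like forms:)

    struct Foo;                // declared but never defined

    void f(Foo * uniform p) {
        // Each of these now produces an error instead of a crash:
        // p = p + 1;                            // pointer math needs Foo's size
        // uniform int s = sizeof(uniform Foo);  // size of an incomplete type
        // Foo * uniform q = uniform new Foo;    // allocating an incomplete type
    }
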
Jean-Luc Duprat
0a9b272fe4 Merge branch 'master' of git://github.com/ispc/ispc 2012-04-17 15:34:36 -07:00
Matt Pharr
b9d6ba2aa0 Always set target info, even when compiling to generic targets.
This allows the SROA pass to eliminate a lot of allocas and loads and
stores, which helps a lot for performance.
2012-04-17 15:10:30 -07:00
Matt Pharr
a0c9f7823b C++ backend fixes.
Handle calls to llvm.trap()
Declare functions before globals
Handle memset()
2012-04-17 15:09:42 -07:00
Jean-Luc Duprat
4477a9c59a Merge branch 'master' of git://github.com/ispc/ispc
Conflicts:
	decl.cpp
2012-04-17 10:38:07 -07:00
Matt Pharr
99a27fe241 Add support for forward declarations of structures.
Now a declaration like 'struct Foo;' can be used to establish the
name of a struct type, without providing a definition.  One can
pass pointers to such types around the system, but can't do much
else with them (as in C/C++).

Issue #125.
2012-04-16 06:27:21 -07:00
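For illustration, a minimal sketch of what this permits and still forbids (names hypothetical):

    struct Foo;                      // establishes the name; no definition yet
    void visit(uniform Foo * uniform node);  // fine: only a pointer is passed
    // 'new Foo', pointer math on a Foo *, and sizeof(Foo) all remain
    // errors until a full 'struct Foo { ... };' definition is seen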
Matt Pharr
fefa86e0cf Remove LLVM_TYPE_CONST #define / usage.
Now with LLVM 3.0 and beyond, types aren't const.
2012-04-15 20:11:27 -07:00
Matt Pharr
098c4910de Remove support for building with LLVM 2.9.
A forthcoming change uses some features of LLVM 3.0's new type
system, and it's not worth back-porting this to also work
with LLVM 2.9.
2012-04-15 20:08:51 -07:00
Matt Pharr
17b7148300 Initial implementation of FunctionType::GetDIType 2012-04-13 19:50:45 -07:00
Matt Pharr
f4a2ef28e3 Fix crashes from malformed programs. 2012-04-13 19:42:07 -07:00
Matt Pharr
f0d013ee76 Fix incorrect assert. Issue #241 2012-04-12 20:19:41 -07:00
Matt Pharr
5ece6fec04 Substantial rewrite (again) of decl handling.
The decl.* code now no longer interacts with Symbols, but just returns
names, types, initializer expressions, etc., as needed.  This makes the
code a bit more understandable.

Fixes issues #171 and #130.
2012-04-12 17:28:30 -07:00
Matt Pharr
d88dbf3612 Fix two bugs with resolving unbound variability.
We still need to call ResolveUnboundVariability even if the
type returns false from HasUnboundVariability; we may have,
for example, a pointer type where the pointer is resolved,
but the pointed-to type is unresolved.

Fixes issue #228.
2012-04-12 11:40:28 -07:00
Matt Pharr
2a18efef82 Use type conversion machinery when processing expr lists for initializers.
Once we're down to something that's not another nested expr list, use 
TypeConvertExpr() to convert the expression to the type we need.  This should
allow simplifying a number of the GetConstant() implementations, to remove
partial reimplementation of type conversion there.

For now, this change finishes off issue #220.
2012-04-12 11:23:02 -07:00
Matt Pharr
fd846fbe77 Fix bug in __gather_base_offsets_32.
In short, we weren't correctly zeroing the compile-time constant portion
of the offsets for lanes that aren't executing. (!)

Fixes issue #235.
2012-04-12 10:28:15 -07:00
Matt Pharr
ca7cc4744e Fix bug with taking references of temporaries.
Previously, the compiler would crash if e.g. the program passed a
temporary value to a function taking a const reference.  This change
fixes ReferenceExpr::GetValue() to handle this case and allocate
temporary storage for the temporary so that the pointer to that
storage can be used for the reference value.
2012-04-12 06:08:19 -07:00
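For example, a sketch under the rule above (names hypothetical):

    float twice(const float &x) { return x + x; }
    float a = 1, b = 2;
    float r = twice(a + b);   // 'a + b' is a temporary; storage is now
                              // allocated for it so the reference is valid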
Matt Pharr
491fa239bd Add atomic swap and cmpxchg for void * as well.
Issue #232.
2012-04-11 06:12:31 -07:00
Matt Pharr
66765dc123 Fix printing of function overload candidates in error message. 2012-04-11 06:11:52 -07:00
Matt Pharr
70a5348f43 Add size_t, ptrdiff_t, and [u]intptr_t types. 2012-04-11 05:32:53 -07:00
Matt Pharr
2aa61007c6 Remove memory_barrier() calls from atomics.
This was unnecessary overhead to impose on all callers; the user
should handle these as needed on their own.

Also added some explanatory text to the documentation that highlights
that memory_barrier() is only needed across HW threads/cores, not
across program instances in a gang.
2012-04-10 19:37:03 -07:00
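A sketch of the resulting caller-side pattern (a hypothetical shared counter; atomic_add_global() and memory_barrier() are existing stdlib functions):

    extern uniform int32 counter;      // shared across HW threads
    atomic_add_global(&counter, 1);    // no implicit barrier anymore
    memory_barrier();                  // insert explicitly, and only when
                                       // ordering across threads/cores matters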
Matt Pharr
acfbe77ffc Fix typo. 2012-04-10 19:27:37 -07:00
Matt Pharr
08696653ca Don't include struct member types in mangled string.
Not only was this quite verbose, it was unnecessary since we do type
equality by name.  This also needed to be fixed before we could
handle structs declared like "struct Foo;", when we then e.g. have
other structs with Foo * members.
2012-04-10 19:27:31 -07:00
Matt Pharr
8a1a214ca9 Provide required alignment when generating debug info for pointer types. 2012-04-09 14:36:39 -07:00
Matt Pharr
7aaeb27e0f Remove duplicate test. 2012-04-09 14:23:17 -07:00
Matt Pharr
972043c146 Fix serious bug in handling constant-valued initializers.
In InitSymbol(), we try to be smart and emit a memcpy when there
are a number of values to store (e.g. for arrays, structs, etc.)

Unfortunately, this wasn't working as desired for bools (i.e. i1 types),
since the SizeOf() call that tried to figure out how many bytes to
copy would return 0 bytes, due to dividing the number of bits to copy
by 8.

Fixes issue #234.
2012-04-09 14:23:08 -07:00
Matt Pharr
8475dc082a Bump version number to 1.2.2dev 2012-04-06 16:16:50 -07:00
Matt Pharr
d0e583b29c Release notes and doxygen version number bump for 1.2.1 2012-04-06 16:02:19 -07:00
Matt Pharr
c8feee238b Bump release number to 1.2.1 2012-04-06 15:30:54 -07:00
Matt Pharr
6712ecd928 Merge pull request #233 from nipunn1313/master
Ability to point build to custom version of llvm and clang
2012-04-06 15:24:12 -07:00
Nipunn Koorapati
d0c7b5d35c Merge remote-tracking branch 'upstream/master' 2012-04-06 17:58:21 -04:00
Nipunn Koorapati
802add1f97 Added to the Makefile the ability to point to a
custom installation of llvm and clang.
2012-04-06 17:54:55 -04:00
Matt Pharr
95556811fa Fix linux build 2012-04-05 20:39:39 -07:00
Matt Pharr
581472564d Print "friendly" ispc message when abort/seg fault signal is thrown.
Make crashes that happen in LLVM less inscrutable.

Issue #222.
2012-04-05 15:51:44 -07:00
Matt Pharr
c7dc8862a5 Add FAQs about various language details.
One of these finishes off issue #225.
2012-04-05 15:24:26 -07:00
Matt Pharr
4f8cf019ca Add pass to verify module before starting optimizations. 2012-04-05 08:49:39 -07:00
Matt Pharr
4c9ac7fcf1 Fix build with LLVM 2.9. 2012-04-05 08:22:40 -07:00
Matt Pharr
1dac05960a Fix build with LLVM 3.1 ToT 2012-04-05 08:17:56 -07:00
Matt Pharr
c27418da77 Add checks about references to non-lvalues.
Both ReturnStmt and DeclStmt now check the values being associated
with references to make sure that they are legal (e.g. it's illegal
to assign a varying lvalue, or a compile-time constant to a reference
type).  Previously we didn't catch this and would end up hitting
assertions in LLVM when code did this stuff.

Mostly fixes issue #225 (except for adding a FAQ about what this
error message means.)
2012-04-04 05:56:22 -07:00
Matt Pharr
637d076e99 Remove half/float conversion functions from AVX2 output.
(We were leaving around unused/unnecessary __half_to_float_uniform 
and the like, which in turn called out to the corresponding instruction.)
2012-04-03 12:18:38 -07:00
Matt Pharr
391678a5b3 Update function overload resolution logic.
Closer compatibility with C++: given a non-reference type, treat matching
to a non-const reference of that type as a better match than a const
reference of that type (rather than both being equal cost).

Issue #224.
2012-04-03 10:40:41 -07:00
Matt Pharr
4cd0cf1650 Revamp handling of function types, conversion to function ptr types.
Implicit conversion to function types is now a more standard part of
the type conversion infrastructure, rather than special cases of things
like FunctionSymbolExpr immediately returning a pointer type, etc.

Improved AddressOfExpr::TypeCheck() to actually issue errors in cases
where it's illegal to take the address of an expression.

Added AddressOfExpr::GetConstant() implementation that handles taking
the address of functions.

Issue #223.
2012-04-03 10:09:07 -07:00
Matt Pharr
b813452d33 Don't issue a slew of warnings if a bogus cpu type is specified.
Issue #221.
2012-04-03 06:13:28 -07:00
Matt Pharr
eb85da81e1 Further improvements to error reporting with function types.
Issue #219.
2012-04-03 05:55:50 -07:00
Matt Pharr
920cf63201 Improve error message about incompatible function types.
When reporting that a function has illegally been overloaded only
by return type, include "task", "export", and "extern "C"", as appropriate
in the error message to make clear what the issue is.

Finishes issue #216.
2012-04-03 05:43:23 -07:00
Matt Pharr
dc09d46bf4 Don't emit type declarations for extern'ed globals in generated header files.
This actually wasn't a good idea, since we'd like ispc programs to be able to
have varying globals that they use internally among ispc code, without having
errors about varying globals when generating headers.

Issue #214.
2012-04-03 05:36:21 -07:00
Matt Pharr
05d1b06eeb Fixes to get the C++ backend more working again. 2012-03-30 16:56:30 -07:00
Matt Pharr
c1661eb06b Allow calling GetAs{Non}ConstType() for FunctionTypes.
It's just a no-op, though, rather than an assertion failure as before.
2012-03-30 16:56:30 -07:00
Jean-Luc Duprat
e9626a1d10 Added macro PRId64 to opt.cpp for compilation on Windows 2012-03-30 16:56:30 -07:00
Matt Pharr
560bf5ca09 Updated logic for selecting target ISA when not specified.
Now, if the user specified a CPU then we base the ISA choice on that--only
if no CPU and no target is specified do we use the CPUID-based check to
pick a vector ISA.

Improvement to fix to #205.
2012-03-30 16:36:12 -07:00
Jean-Luc Duprat
512f8d8b60 Fixed binary AND to logical AND 2012-03-29 17:03:22 -07:00
Matt Pharr
87c8a89349 Make 'export' a type qualifier, not a storage class.
In particular, this makes it legal to do "extern export foo()", among
other things.

Partially addresses issue #216.
2012-03-29 13:16:55 -07:00
Matt Pharr
255791f18e Fix to get correct variable names for extern globals that are later defined. 2012-03-29 11:50:15 -07:00
Matt Pharr
d5e3416e8e Fix bug in default argument handling introduced in 540fc6c2f3 2012-03-28 14:29:58 -07:00
Matt Pharr
5b2d43f665 Fix global variable code to correctly handle extern declarations.
When we have an "extern" global, now we no longer inadvertently define
storage for it.  Further, we now successfully do define storage when we
encounter a definition following one or more extern declarations.

Issues #215 and #217.
2012-03-28 14:15:49 -07:00
Matt Pharr
540fc6c2f3 Fix bugs with default parameter values for pointer-typed function parameters.
In particular "void foo(int * ptr = NULL)" and the like work now.

Issue #197.
2012-03-28 11:51:56 -07:00
Matt Pharr
b3c5043dcc Don't enable llvm's UnsafeFPMath option when --opt=fast-math is supplied.
This was causing functions like round() to fail on SSE2, since it has code
that does:

    x += 0x1.0p23f;
    x -= 0x1.0p23f;

which was in turn being undesirably optimized away.

Fixes issue #211.
2012-03-28 10:26:39 -07:00
Matt Pharr
d0d9aae968 Fix parser so that spaces aren't needed around "..." in foreach statements.
Issue #207.
2012-03-28 10:10:51 -07:00
Matt Pharr
3270e2bf5a Call CPUID to more reliably detect level of SSE/AVX that the host supports.
Fixes, I hope, issue #205.
2012-03-28 09:20:06 -07:00
Matt Pharr
013a3e7567 Support concatenation of adjacent string literals in the parser.
Fixes issue #208.
2012-03-28 08:52:09 -07:00
Matt Pharr
8368ba8539 Add missing checks for NULL current basic block in stmt code.
Fixes crashes if, for example, these statement types appeared after early
returns in the middle of functions.
2012-03-28 08:48:33 -07:00
Matt Pharr
ca0310e335 Merge pull request #213 from nipunn1313/master
Fixed compiler warning in expression type caster
2012-03-28 06:41:00 -07:00
Nipunn Koorapati
4690a678c1 Added parentheses around a || b && c statement in TypeCastExpr
to placate the compiler warning and make the code easier to understand.
2012-03-28 02:44:28 -04:00
Matt Pharr
f8a39402a2 Implement new, simpler function overload resolution algorithm.
We now give each conversion a cost and then find the minimum sum
of costs for all of the possible overloads.

Fixes issue #194.
2012-03-27 13:25:11 -07:00
Jean-Luc Duprat
b923e4daea Added macro PRId64 to opt.cpp for compilation on Windows 2012-03-27 12:46:59 -07:00
Matt Pharr
247775d1ec Fix type conversion to allow array -> void * conversions.
Fixes issue #193.
2012-03-27 10:07:54 -07:00
Matt Pharr
6e9fea377d Type convert NULL to other pointer types for function call arguments.
Fixes issue #198.
2012-03-27 09:50:21 -07:00
Matt Pharr
ca5c65d032 Fix bugs where typecasting an expression to void would cause it to disappear.
This was obviously problematic in cases where the expression was a function
call or the like, with side effects.

Fixes issue #199.
2012-03-27 09:33:43 -07:00
Matt Pharr
f9dc621ebe Fix bug when doing pointer math with varying integer offsets.
We were incorrectly trying to type convert the varying offset to a
uniform value, which in turn led to an incorrect compile-time error.

Fixes issue #201.
2012-03-27 09:17:40 -07:00
Matt Pharr
ffe484c31e Implement simpler approach for header file struct emission.
Rather than explicitly building a DAG and doing a topological sort,
just traverse structs recursively and emit declarations for all of
their dependent structs before emitting the original struct declaration.

Not only is this simpler than the previous implementation, but it
fixes a bug where we'd hit an assert if we had a struct with multiple
contained members of another struct type.
2012-03-27 09:06:10 -07:00
Matt Pharr
62cd3418ca Add test for the bug in issue #204. 2012-03-27 09:04:45 -07:00
Matt Pharr
d8a8f3a996 For symbols that are references, return uniform ptr type as lvalue type.
Fixes issue #204.
2012-03-27 08:52:14 -07:00
Matt Pharr
0ad8dbbfc9 Fix documentation bug: atan2 arguments were reversed.
Issue #203.
2012-03-27 08:03:02 -07:00
Matt Pharr
e15a1946c6 Documentation: add ISPC_TARGET_AVX2 as a possible target #define 2012-03-27 08:02:39 -07:00
Matt Pharr
8878826661 Add non-short-circuiting and(), or(), select() to stdlib. 2012-03-26 09:37:59 -07:00
Matt Pharr
95a8b6e5e8 Fix & vs. && in logical test.
Issue #196.
2012-03-25 17:38:34 -07:00
Matt Pharr
388d0d2cfd Add #include <string.h>
Fixes build on Linux and Windows.  (Strangely, this didn't break the
OSX build.)

Issue #195.
2012-03-25 17:38:15 -07:00
Matt Pharr
d3a374e71c Fix malformed program crasher. 2012-03-25 13:10:23 -07:00
Matt Pharr
1da2834b1e Allow the last member of a struct to be an unsized/zero-length array.
This enables the C trick of allocating a dynamic amount of storage for
the struct in order to extend out the array to the desired length.
2012-03-25 13:10:12 -07:00
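A sketch of the idiom, mirroring C's flexible array member (names hypothetical; the allocation comment is illustrative):

    struct Line {
        uniform int count;
        float pts[];          // unsized array as the last member
    };
    // storage is then over-allocated so pts[] has 'n' usable elements,
    // e.g. from C: malloc(sizeof(Line) + n * sizeof(float))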
Matt Pharr
ca3100874f Add FAQ about why varying values can't be passed to exported functions. 2012-03-25 11:35:28 -07:00
Matt Pharr
117f48a331 Don't include foreach stmts in cost estimates from EstimateCost().
Because they reestablish an 'all on' mask inside their body, it doesn't
make sense to include their cost when evaluating whether it's worth
re-establishing an 'all on' mask dynamically.  (This does mean that
EstimateCost()'s return value isn't the most obvious thing, but currently
in all the cases where we need it, this is the more appropriate value to
return.)
2012-03-25 10:32:44 -07:00
Matt Pharr
89bbceefee Make sure that foreach() statements never execute with an "all off" mask. 2012-03-25 10:07:12 -07:00
Matt Pharr
7e18f0e247 Small improvement to float->half function in stdlib.
Rewrite things to be able to do a float MINPS, for slightly
better code on SSE2 (which has that but not a signed int
min).  SSE2 code is now 23 instructions (vs. 21 with intrinsics).
2012-03-23 16:09:32 -07:00
Jean-Luc Duprat
29c2f24faf Merge branch 'master' of git://github.com/ispc/ispc 2012-03-22 16:33:05 -07:00
Matt Pharr
3bb2dee275 Update float_to_half() with more efficient version from @rygorous 2012-03-22 13:36:26 -07:00
Matt Pharr
88cd5584e8 Add Debug() statement to report on if stmt cost/safety test results. 2012-03-22 13:36:26 -07:00
Jean-Luc Duprat
41f9ce2560 Merge branch 'master' of git://github.com/ispc/ispc 2012-03-22 10:02:05 -07:00
Matt Pharr
20044f5749 Distinguish between dereferencing pointers and references.
We now have separate Expr implementations for dereferencing pointers
and automatically dereferencing references.  This is in particular
necessary so that we can detect attempts to dereference references
with the '*' operator in programs and issue an error in that case.

Fixes issue #192.
2012-03-22 06:48:02 -07:00
Jean-Luc Duprat
833f0a6aa7 Merge branch 'master' of git://github.com/ispc/ispc 2012-03-21 17:07:18 -07:00
Matt Pharr
10c5ba140c Much more efficient half_to_float() code, via @rygorous.
Also, switch deferred shading example to use it. (Rather than
the "fast" half to float that doesn't handle deforms, etc.)
2012-03-21 16:13:04 -07:00
Matt Pharr
316de0b880 Make various Expr::EstimateCost() implementations return 0 if operand(s) are constants.
(Assume that constant folding will make these be free.)
2012-03-21 16:12:35 -07:00
Matt Pharr
989966f81b Annotate std lib functions with __declspec safe, cost, as appropriate. 2012-03-21 16:12:32 -07:00
Matt Pharr
ccd550dc52 __declspec support for function declarations.
safe: indicates that the function can safely be called with an "all off"
execution mask.

costN: (N an integer) overrides the cost estimate for the function with
the given value.
2012-03-21 16:11:50 -07:00
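For illustration, a hypothetical declaration using both qualifiers (assuming the comma-separated __declspec list syntax described above):

    __declspec(safe,cost1)
    inline float min2(float a, float b) {   // hypothetical function
        return a < b ? a : b;
    }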
Matt Pharr
ddf350839a Add ability to parse __declspec lists to parser. 2012-03-21 16:11:50 -07:00
Matt Pharr
6a7dd2787a Fix bug in check for varying parameters in exported functions.
In particular, we weren't checking to see if the pointed-to type of
pointer parameters was varying.

Fixes issue #191.
2012-03-21 10:06:53 -07:00
Jean-Luc Duprat
385771e73e Merge branch 'master' of git://github.com/ispc/ispc 2012-03-20 13:31:31 -07:00
Matt Pharr
349ab0b9c5 Bump version number to 1.2.1dev 2012-03-20 12:46:23 -07:00
Matt Pharr
b5e6c6a2f3 update news to include paper 2012-03-20 12:05:23 -07:00
Matt Pharr
2832ea641f Release notes, bump doxygen version for 1.2.0 release 2012-03-20 11:58:39 -07:00
Matt Pharr
cb7edf2725 Set version to 1.2.0 for release builds 2012-03-20 11:13:50 -07:00
Matt Pharr
f1f1be2822 Remove twine op that caused crash on Windows, fix warning 2012-03-20 11:13:02 -07:00
Matt Pharr
7dffd65609 Add __foreach_active statement to loop over active prog. instances.
For now this has the __ prefix, as an experimental feature currently only
used in the standard library implementation.  It's probably worth making
something along these lines an official feature, but I'm not sure if this
in its current form is quite the right thing.
2012-03-20 08:46:00 -07:00
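A sketch of how this might be used (experimental syntax; assumes the iteration variable takes on the programIndex of each active instance in turn):

    uniform int total = 0;
    __foreach_active (i) {
        // body runs serially, once per active program instance
        total += extract(val, i);   // 'val' is a hypothetical varying int
    }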
Matt Pharr
2c8a44e28b Merge pull request #189 from guanqun/fix-extern-c-error
calls to C/C++ functions should not be mangled.
2012-03-20 05:55:09 -07:00
Matt Pharr
39bb95a6ee Merge pull request #190 from guanqun/fix-output-option
fix --outfile option error
2012-03-20 05:54:29 -07:00
Lu Guanqun
da9dba80a0 fix --outfile option error 2012-03-20 09:44:49 +08:00
Lu Guanqun
12f3285f9b calls to C/C++ functions should not be mangled.
Otherwise, the linker will never find the correct function.
2012-03-20 09:27:57 +08:00
Matt Pharr
7e954e4248 Don't issue gather/scatter warnings in the 'extra' bits of foreach loops.
With AOS data, we can often coalesce the accesses into gathers for the main
part of foreach loops but only fail on the last bits where the mask is not
all on (since the coalescing code doesn't handle mixed masks, yet.) Before,
we'd report success with coalescing and then also report that gathers were needed
for the same accesses that were coalesced, which was a) confusing, and b)
didn't accurately represent what was going on for the majority of the loop
iterations.
2012-03-19 15:08:35 -07:00
Matt Pharr
d74cc6397b Fix significant bug in mask management in code generated for 'foreach'.
In particular, we 1. weren't setting the function mask to 'all on', such that
any mixed function mask would in turn apply inside the foreach loop, and 2.
weren't always setting the internal mask to 'all on' before doing any additional
masking based on the iteration variables.
2012-03-19 15:06:35 -07:00
Matt Pharr
777343331e Print numeric version number with --version. 2012-03-19 14:41:25 -07:00
Matt Pharr
a062653743 Add patterns to better-match code generated when accessing SOA data.
In particular, LLVMVectorIsLinear() and LLVMVectorValuesAllEqual() are able
to reason a bit about the effects of the shifts and the ANDs that are
generated from SOA indexing calculations, so that they can detect more cases
where a linear sequence of locations are in fact being accessed in
the presence of SOA data.
2012-03-19 12:04:39 -07:00
Matt Pharr
57af0eb64f Still do the gather/scatter -> load store pass even if leaving 'pseudo' mem opts unchanged. 2012-03-19 12:04:38 -07:00
Matt Pharr
60aae16752 Move check for linear vector to LLVMVectorIsLinear() function. 2012-03-19 11:57:04 -07:00
Matt Pharr
e264d95019 LLVMVectorValuesAllEqual() improvements.
Clean up the API, so the caller doesn't have to pass in a vector so
the function can track PHI nodes (do that internally instead.)

Handle casts in lValuesAreEqual().
2012-03-19 11:54:18 -07:00
Matt Pharr
0664f5a724 Add LLVMExtractVectorInts() function, use it in the opt code. 2012-03-19 11:48:38 -07:00
Matt Pharr
17c6a19527 Add LLVMExtractFirstVectorElement() function (and use it).
For cases where it turns out that we just need the first element of
a vector (e.g. because we've determined that all of the values are
equal), it's often more efficient to only compute that one value
with scalar operations than to compute the whole vector's worth and
then just use one value.  This function tries to rewrite a vector
computation to the scalar equivalent, if possible.

(Partial work-around to http://llvm.org/bugs/show_bug.cgi?id=11775.)

Note that sometimes this is the wrong thing to do--if we need the entire
vector value for other purposes, for example.
2012-03-19 11:48:33 -07:00
Matt Pharr
cbc8b8259b Use LLVMIntAsType() in opt code instead of locally-defined equivalent. 2012-03-19 11:36:00 -07:00
Matt Pharr
1067a2e4be Add LLVMShuffleVectors() and LLVMConcatVectors() functions.
These were local functions in opt.cpp that are now public via the
llvmutil.* files.
2012-03-19 11:34:52 -07:00
Matt Pharr
74a031a759 Small improvements to debug info printing in opt.cpp 2012-03-19 11:32:08 -07:00
Matt Pharr
ee437193fb Add LLVMDumpValue() utility routine 2012-03-19 11:31:27 -07:00
Matt Pharr
436c53037e Fix assertion in FunctionEmitContext::storeUniformToSOA() 2012-03-19 11:29:14 -07:00
Matt Pharr
f55ba9d3cb Remove (highly verbose) Debug() call for type conversions. 2012-03-19 11:28:55 -07:00
Matt Pharr
8adb99b768 Improve source locations reported with warnings. 2012-03-19 11:28:34 -07:00
Matt Pharr
13c42412d2 Issue perf. warning if SOA width narrower than gang size is used. 2012-03-19 11:28:16 -07:00
Matt Pharr
75507d8b35 Remove error message if old 'reference' keyword is used. 2012-03-19 11:27:53 -07:00
Matt Pharr
ddfe4932ac Fix parsing of 'launch' so that angle brackets can be removed.
Issue #6.
2012-03-19 11:27:32 -07:00
Jean-Luc Duprat
cf208cc2e3 Merge branch 'master' of git://github.com/ispc/ispc 2012-03-17 12:59:43 -07:00
Matt Pharr
28ac016928 Fix bugs in checks for varying parameters in exported functions.
In short, we inadvertently weren't checking whether pointers themselves
were varying, which in turn led to an assertion later if an exported
function did have a varying parameter.

Issue #187.
2012-03-15 07:20:36 -05:00
Jean-Luc Duprat
f4ae41d006 Merge branch 'master' of git://github.com/ispc/ispc 2012-03-14 09:58:52 -07:00
Matt Pharr
9ec8e5a275 Fix compile warnings on Linux 2012-03-12 13:12:23 -07:00
Matt Pharr
a473046058 Once again fix for LLVM 3.1 TOT API changes 2012-03-11 15:04:26 -07:00
Matt Pharr
a69b7a5a01 Fix build with LLVM 3.1 TOT 2012-03-10 13:06:53 -08:00
Matt Pharr
640918bcc0 Call fclose() in deferred example. (Andy Zhang). 2012-03-07 08:50:10 -08:00
Matt Pharr
f39fbdb3fc Add various new functions to "internal" functions list.
Building with multiple compilation targets in a single binary was
broken due to multiple symbol definitions.
2012-03-05 16:41:20 -08:00
Matt Pharr
50d4d81062 Add file in docs/ for news page on website 2012-03-05 16:10:20 -08:00
Matt Pharr
3b95452481 Add memcpy(), memmove() and memset() to the standard library.
Issue #183.
2012-03-05 16:09:00 -08:00
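Intended usage mirrors C; a minimal sketch (sizes arbitrary, assuming C-style signatures):

    uniform float src[16], dst[16];
    memcpy(dst, src, 16 * sizeof(uniform float));
    memset(dst, 0, 16 * sizeof(uniform float));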
Matt Pharr
c152ae3c32 Add single-precision asin() and acos() to stdlib.
Issue #184.
2012-03-05 13:32:13 -08:00
Matt Pharr
f6cbaa78e8 Update stdlib documentation to match recent pointed-to default variability changes 2012-03-05 13:32:12 -08:00
Matt Pharr
7adb250b59 Added tests and documentation for soa<> rate qualifier. 2012-03-05 09:58:10 -08:00
Matt Pharr
db5db5aefd Add native support for (AO)SOA data layout.
There's now a SOA variability class (in addition to uniform,
varying, and unbound variability); the SOA factor must be a
positive power of 2.

When applied to a type, the leaf elements of the type (i.e.
atomic types, pointer types, and enum types) are widened out
into arrays of the given SOA factor.  For example, given

struct Point { float x, y, z; };

Then "soa<8> Point" has a memory layout of "float x[8], y[8],
z[8]".

Furthermore, array indexing syntax has been augmented so that
when indexing into arrays of SOA-variability data, the two-stage
indexing (first into the array of soa<> elements and then into
the leaf arrays of SOA data) is performed automatically.
2012-03-05 09:58:10 -08:00
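Continuing the Point example above, a sketch of the automatic two-stage indexing (array length hypothetical):

    soa<8> Point pts[64];   // laid out as 8 groups of {float x[8], y[8], z[8]}
    float d = pts[i].y;     // indexes the soa<> group, then the leaf array,
                            // automatically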
Matt Pharr
8fdf84de04 Disable debugging printing code. 2012-03-05 09:58:09 -08:00
Matt Pharr
ff5cbe80d1 Add more files to .gitignore 2012-03-05 09:58:09 -08:00
Matt Pharr
e013e0a374 Handle extract instructions in the lGetBasePtrAndOffsets() pattern matching code. 2012-03-05 09:58:09 -08:00
Matt Pharr
b7df312ca7 Small improvements to error location reporting, assertions in expr.cpp 2012-03-05 09:58:09 -08:00
Matt Pharr
ce82c3c0ae Return from function after storing initializer value. 2012-03-05 09:58:09 -08:00
Matt Pharr
2f958cfbda Fix cases where malformed program could cause crash. 2012-03-05 09:58:09 -08:00
Matt Pharr
8ef41dfd97 Represent variability with small helper class rather than an enum.
This provides part of the basis for representing SOA width in terms
of variability, but there should be no functional changes in this
checkin.
2012-03-05 09:58:09 -08:00
Matt Pharr
3082ea4765 Require Type::Equal() for all type equality comparisons.
Previously, we uniqued AtomicTypes, so that they could be compared
by pointer equality, but with forthcoming SOA variability changes,
this would become too unwieldy (lacking a more general / ubiquitous
type uniquing implementation.)
2012-03-05 09:58:09 -08:00
Matt Pharr
e482d29951 Add LLVM{U}IntAsType() utility routine 2012-03-05 09:58:09 -08:00
Matt Pharr
ff48dd7bfb Remove unused SOAArrayType class and Type::GetSOAType() methods. 2012-03-05 09:58:09 -08:00
Matt Pharr
7bf9c11822 Add uniform variants of RNG functions to stdlib 2012-03-05 09:56:30 -08:00
Matt Pharr
f7937f1e4b Fix build with LLVM2.9/3.0 2012-03-03 10:30:56 -08:00
Matt Pharr
0115eeabfe Update deferred example to take advantage of new pointer variability rules. 2012-02-29 14:27:53 -08:00
Matt Pharr
4b9c3ec0da Fix bug in StructType::GetElementType().
We were only resolving unbound variability for the top-level type,
which isn't enough if we have e.g. an unbound-variability pointer
pointing to some type with unbound variability.
2012-02-29 14:27:53 -08:00
Matt Pharr
55b81e35a7 Modify rules for default variability of pointed-to types.
Now, the pointed-to type is always uniform by default (if an explicit
rate qualifier isn't provided).  This rule is easier to remember and
seems to work well in more cases than the previous rule from 6d7ff7eba2.
2012-02-29 14:27:53 -08:00
Matt Pharr
2a1c7f2d47 Fix bug with indexing into varying pointer w/uniform index.
Issue #182.
2012-02-25 10:19:21 -08:00
Matt Pharr
8603f9838f Issue an error if "uniform" or "varying" qualifiers are applied to void types.
Issue #179.
2012-02-21 12:26:42 -08:00
Matt Pharr
95224f3f11 Improve detection of cases where 32-bit gather/scatter can be used.
Previously, we weren't noticing that an <n x i64> zero vector could
be represented as an <n x i32> without error.
2012-02-21 12:13:25 -08:00
Matt Pharr
f81acbfe80 Implement unbound variability for struct types.
Now, if a struct member has an explicit 'uniform' or 'varying'
qualifier, then that member has that variability, regardless of
the variability of the enclosing struct.  Members without
'uniform' or 'varying' have unbound variability, and in turn
inherit the variability of the struct.

As a result of this, now structs can properly be 'varying' by default,
just like all the other types, while still having sensible semantics.
2012-02-21 10:28:31 -08:00
Matt Pharr
6d7ff7eba2 Update defaults for variability of pointed-to types.
Now, if rate qualifiers aren't used to specify otherwise, varying
pointers point to uniform types by default.  As before, uniform
pointers point to varying types by default.

   float *foo;  // varying pointer to uniform float
   float * uniform foo;  // uniform pointer to varying float

These defaults seem to require the least amount of explicit
uniform/varying qualifiers for most common cases, though TBD if it
would be easier to have a single rule that e.g. the pointed-to type
is always uniform by default.
2012-02-21 06:27:34 -08:00
Matt Pharr
ad429db7e8 Generate more efficient code for variable initializers.
If the initializer is a compile-time constant (or at least a part of it
is), then store the constant value in a module-local constant global
value and then memcpy the value into the variable.  This, in turn,
turns into much better assembly in the end.

Issue #176.
2012-02-14 13:51:23 -08:00
Matt Pharr
4c07abbaf4 Support returning NULL pointer values from ConstExpr::GetConstant() 2012-02-14 13:49:18 -08:00
Matt Pharr
e3c0551129 Handle uniform short-vector types in ExprList::GetConstant() 2012-02-14 13:48:43 -08:00
Matt Pharr
8971baa42b Fix silly bug in ConstExpr::GetConstant() with enum types.
(They would be incorrectly matched as int8 types.)
2012-02-14 13:48:10 -08:00
Matt Pharr
317a1f51f7 Allow fewer initializer values in initializer expr lists than expected.
We now match C's behavior, where if we have an initializer list with
too-few values for the underlying type, any additional elements are
initialized to zero.

Fixes issue #123.
2012-02-14 13:47:11 -08:00
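For example:

    uniform int a[4] = { 1, 2 };   // a[2] and a[3] are zero-initialized, as in C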
Matt Pharr
c63d139482 Add FunctionEmitContext::MemcpyInst() 2012-02-14 13:43:59 -08:00
Matt Pharr
9e682362e9 Fix bug in ArrayType::SizeUnsizedArrays().
If given an initializer list with too many elements for the actual array
size, in some cases we would incorrectly resize the explicitly sized array
to be the size implied by the initializer list.
2012-02-14 13:43:38 -08:00
Matt Pharr
56ec939692 Add perfbench to examples.sln for Windows 2012-02-14 10:07:08 -08:00
Matt Pharr
a86b942730 Fix cases in coalesce opt where offsets would be truncated to 32 bits 2012-02-14 10:05:07 -08:00
Matt Pharr
52eb4c6014 Fix warnings with Windows build 2012-02-14 10:01:45 -08:00
Matt Pharr
f4adbbf90c Merge a number of cbackend changes from the LLVM dev tree.
This fixes a number of failing tests with LLVM 3.1svn when
using the generic targets.

Issue #175.
2012-02-13 16:52:38 -08:00
Matt Pharr
cc86e4a7d2 Disable coalescing optimizations when using generic target.
The main issue is that they end up generating a number of smaller
vector ops (e.g. 4-wide and 8-wide ops on the 16-wide generic target),
which the examples/intrinsics implementations don't currently
support.

This fixes a number of failing tests for now; it may be worth
generalizing the stuff in examples/intrinsics at some point,
since as a general principle, e.g. if generating LLVM IR output,
the coalescing optimizations are still desirable.

Issue #175.
2012-02-13 16:52:01 -08:00
Matt Pharr
e864447e4a Fix silly bug in vector scale extraction optimization.
(Introduced in f20a2d2ee.  How did this ever pass tests?)
2012-02-13 12:06:45 -08:00
Matt Pharr
73bf552cd6 Add support for coalescing memory accesses from gathers.
There are two related optimizations that happen now.  (These
currently only apply for gathers where the mask is known to be
all on, and to gathers that are accessing 32-bit sized elements,
but both of these may be generalized in the future.)

First, for any single gather, we are now more flexible in mapping it
to individual memory operations.  Previously, we would only either map
it to a general gather (one scalar load per SIMD lane), or an 
unaligned vector load (if the program instances could be determined
to be accessing a sequential set of locations in memory.)

Now, we are able to break gathers into scalar, 2-wide (i.e. 64-bit),
4-wide, or 8-wide loads.  Further, we now generate code that shuffles
these loads around.  Doing fewer, larger loads in this manner, when
possible, can be more efficient.

Second, we can coalesce memory accesses across multiple gathers. If 
we have a series of gathers without any memory writes in the middle,
then we try to analyze their reads collectively and choose an efficient
set of loads for them.  Not only does this help if different gathers
reuse values from the same location in memory, but it's specifically
helpful when data with AOS layout is being accessed; in this case,
we're often able to generate wide vector loads and appropriate shuffles
automatically.
2012-02-10 13:10:39 -08:00
Matt Pharr
f20a2d2ee9 Generalize code to extract scales by 2/4/8 from addressing calculations.
Now, if we have a scale by 16, say, we extract out the scalar scale
of 8 and leave an explicit scale by 2.
2012-02-10 12:35:44 -08:00
Matt Pharr
0c25bc063c Add lGEPInst() utility routine to opt.cpp.
Deal with the messiness of LLVM API changes when creating
these in a single place.
2012-02-10 12:32:15 -08:00
Matt Pharr
db72781d2a Fix C++ backend to not assert with LLVM 3.1 svn builds. 2012-02-10 12:30:31 -08:00
Matt Pharr
0c8ad09040 Fix placement of ParserInit() call
This makes it possible to use fuzz testing even without --nostdlib!
2012-02-10 12:29:57 -08:00
Matt Pharr
49880ab761 Constant fold more cases in SelectExpr::Optimize()
Specifically, if both of the expressions are compile-time constants
and the condition is a varying compile-time constant (even if not 
all true or all false), then we can assemble a compile-time constant
result.
2012-02-10 12:28:54 -08:00
Matt Pharr
fe2d9aa600 Add perfbench to examples: a few small microbenchmarks. 2012-02-10 12:27:13 -08:00
Matt Pharr
1dead425e4 Don't indent *too* much on continued lines with warnings/errors. 2012-02-10 12:26:35 -08:00
Matt Pharr
adb1e47a59 Add FAQ about how to cross-inline ispc and C/C++ code. 2012-02-10 12:26:19 -08:00
Matt Pharr
ffba8580c1 Make sure that non-zero exit code is returned when input file not found.
Fixes issue #174.
2012-02-08 19:53:05 -08:00
Alex Reece
ea18427d29 Remove UnwindInst
Code no longer builds against head of LLVM branch after revision 149906
removed the unwind instruction.
2012-02-07 15:46:22 -08:00
Jean-Luc Duprat
97d42f5c53 Merge remote-tracking branch 'matt/master' 2012-02-07 12:50:31 -08:00
Matt Pharr
f3089df086 Improve error handling and reporting in the parser.
Add a number of additional error cases in the grammar.

Enable bison's extended error reporting, to get better messages about the
context of errors and the expected (but not found) tokens at errors.

Improve the printing of these by providing an implementation of yytnamerr
that rewrites things like "TOKEN_MUL_ASSIGN" to "*=" in error messages.

Print the source location (using Error()) when yyerror() is called; wiring
this up seems to require no longer building a 'pure parser' but having
yylloc as a global, which in turn led to having to update all of the uses of
it (which previously accessed it as a pointer).

Updated a number of tests_errors for the resulting changes in error text.
2012-02-07 11:13:32 -08:00
Matt Pharr
157e7c97ae Fix a variety of cases in the parser that could crash with malformed programs. 2012-02-07 11:08:00 -08:00
Matt Pharr
bb8e13e3c9 Add support for -I command-line argument to specify #include search directories. 2012-02-07 08:39:01 -08:00
Matt Pharr
5b4673e8eb Fix build with LLVM 2.9. 2012-02-07 08:37:13 -08:00
Matt Pharr
5b9de8cc07 Fix test to account for updated error message. 2012-02-07 08:36:56 -08:00
Matt Pharr
33ea934c8f Fix over-aggressive check in DereferenceExpr::TypeCheck()
(Reference types are allowed as well.)
2012-02-07 08:18:33 -08:00
Matt Pharr
6b3e14b0a4 Add command-line option to enable debugging output from parser. 2012-02-06 15:35:43 -08:00
Matt Pharr
098ceb5567 Issue error on attempted type convert from/to function type. 2012-02-06 15:35:43 -08:00
Matt Pharr
8e2b0632e8 Issue an error if an array of references is declared.
(More malformed program fixes.)
2012-02-06 15:35:43 -08:00
Matt Pharr
420d373d89 Move assert so that an error is issued for "break" outside of loops. 2012-02-06 15:35:43 -08:00
Matt Pharr
a59fd7eeb3 Fix a missing return value in the parser. 2012-02-06 15:35:43 -08:00
Matt Pharr
ee91fa1228 Make sure the program doesn't have a dereference of a non-pointer type. 2012-02-06 15:35:43 -08:00
Matt Pharr
a2b5ce0172 Add --help-dev option, only print developer options when it is used. 2012-02-06 15:35:43 -08:00
Matt Pharr
3efbc71a01 Add fuzz testing of input programs.
When the --fuzz-test command-line option is given, the input program
will be randomly perturbed by the lexer in an effort to trigger
assertions or crashes in the compiler (neither of which should ever
happen, even for malformed programs.)
2012-02-06 15:34:47 -08:00
Matt Pharr
b7c5af7e64 Prohibit returning functions from functions.
(Fix malformed program crasher)
2012-02-06 14:46:03 -08:00
Matt Pharr
f939015b97 Default to int32 for declarations without specified types.
(e.g. "uniform foo" == "uniform int32 foo")
2012-02-06 14:46:03 -08:00
Matt Pharr
a9ed71f553 Bug fixes to avoid NULL pointer derefs with malformed programs. 2012-02-06 14:45:58 -08:00
Matt Pharr
96a429694f 80 column fixes 2012-02-06 14:44:55 -08:00
Matt Pharr
fddc5e022e Fix typo in IfStmt::EstimateCost() 2012-02-06 14:44:54 -08:00
Matt Pharr
2236d53def Issue error if &=, |=, ^=, <<=, or >>= used with floats. 2012-02-06 14:44:54 -08:00
Matt Pharr
4e018d0a20 Improve tracking of source position in the presence of /* */ comments.
Don't let the preprocessor remove comments anymore, so that the rules
in lex.ll can handle them.  Fix lCComment() to update the source
position as it eats characters in comments.
2012-02-06 14:44:54 -08:00
Matt Pharr
977b983771 Issue error on "void" typed variable, function parameter, or struct member. 2012-02-06 14:44:48 -08:00
Matt Pharr
fa7a7fe23e Fix error handling in type code. 2012-02-06 12:39:14 -08:00
Matt Pharr
724a843bbd Add --quiet option to suppress all diagnostic output 2012-02-06 12:39:09 -08:00
Matt Pharr
a9ec745275 Release notes, bump doxygen release number for 1.1.4 2012-02-04 15:38:17 -08:00
Matt Pharr
c2ecc15b93 Add missing "varying/varying" atomic_compare_exchange_global() functions. 2012-02-03 13:19:15 -08:00
Matt Pharr
83c8650b36 Add support for "local" atomics.
Also updated aobench example to use them, which in turn allows using
foreach() and thence a much cleaner implementation.

Issue #58.
2012-02-03 13:15:21 -08:00
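A sketch of a hypothetical use, assuming atomic_add_local() follows the naming of the existing *_global functions (a "local" atomic is atomic with respect to the program instances in the gang):

    uniform int32 slot = 0;
    int32 mine = atomic_add_local(&slot, 1);   // each active instance receives
                                               // a distinct value, as if the
                                               // adds had run serially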
Matt Pharr
89cb809922 Short-circuit evaluation of ? : operator for varying tests.
? : now short-circuits evaluation of the expressions following
the boolean test for varying test types.  (It already did this
for uniform tests).

Issue #169.
2012-02-01 11:03:58 -08:00
Matt Pharr
fdb4eaf437 Fix bug in &&/|| short-circuiting.
Use full mask, not internal mask when checking "any lanes running"
before evaluating expressions.

Added some more tests to try to cover this case.
2012-02-01 08:17:25 -08:00
Matt Pharr
0432f97555 Fix build with LLVM 3.1 TOT 2012-01-31 14:10:07 -08:00
Matt Pharr
8d1631b714 Constant fold in SelectExpr::Optimize().
Resolves issue #170.
2012-01-31 12:22:11 -08:00
Matt Pharr
dac091552d Fix errors in tests for scalar target.
Issue #167.
2012-01-31 11:57:12 -08:00
Matt Pharr
ea027a95a8 Fix various places in deferred shading example that assumed programCount >= 4.
This gets deferred closer to working with the scalar target, but there are still
some issues.  (Partially in gamma correction / final clamping, it seems.)

This fix causes a ~0.5% performance degradation with e.g. the AVX target, 
though it's not clear that it's worth having a separate code path in order to
not lose this small amount of perf.

(Partially addresses issue #167)
2012-01-31 11:46:33 -08:00
Matt Pharr
f73abb05a7 Fix bug in handling scatters where all instances go to the same location.
Previously, we'd pick one lane and generate a regular store for its value.
This was the wrong thing to do, since we also should have been checking
that the mask was on (for the lane that was chosen).  This bug didn't
become evident until the scalar target was added, since many stores fall
into this case with that target.

Now, we just leave those as regular scatters.

Fixes most of the failing tests for the scalar target listed in issue #167.
2012-01-31 11:06:14 -08:00
Matt Pharr
d71c49494f Missed pass that should be skipped when pseudo memory ops are supposed to be left unchanged. 2012-01-31 11:02:23 -08:00
Matt Pharr
25665f0841 Implement NullPointerExpr::GetConstant()
Also reworked TypeCastExpr::GetConstant() to just forward the request along
and moved the code that was previously there to handle uniform->varying
smears of function pointers to FunctionSymbolExpr::GetConstant().

Fixes issue #168.
2012-01-31 09:37:39 -08:00
Matt Pharr
1eec27f890 Scalar target fixes.
Don't issue warnings about all instances writing to the same location if
there is only one program instance in the gang.

Be sure to report that all values are equal in one-element vectors in
LLVMVectorValuesAllEqual().

Issue #166.
2012-01-31 08:52:11 -08:00
Matt Pharr
950f86200b Fix examples/tasksys.cpp to compile with 32-bit targets.
(Change a cmpxchgq to a cmpxchgl.)  Note that a number of the examples
still don't work with 32-bit compilation, why still TBD.
2012-01-30 15:03:54 -08:00
Matt Pharr
e19f4931d1 Short-circuit evaluation of && and || operators.
We now follow C's approach of evaluating these: we don't evaluate
the second expression in the operator if the value of the first one
determines the overall result.  Thus, these can now be used 
idiomatically like (index < limit && array[index] > 0) and such.

For varying expressions, the mask is set appropriately when evaluating
the second expression.

(For expressions that can be determined to be both simple and safe to
evaluate with the mask all off, we still evaluate both sides and compute
the logical op result directly, which saves a number of branches and tests.
However, the effect of this should never be visible to the programmer.)

Issue #4.
2012-01-30 05:58:41 -08:00
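Concretely, the idiom from above is now safe to write:

    if (index < limit && array[index] > 0)   // array[index] is only evaluated
        sum += array[index];                 // in lanes where index < limit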
Matt Pharr
0575b1f38d Update run_tests and examples makefile for scalar target.
Fixed a number of tests that didn't handle the programCount == 1
case correctly.
2012-01-29 16:22:25 -08:00
Matt Pharr
f6cd01f7cf Windows build support for scalar target. 2012-01-29 13:48:01 -08:00
Matt Pharr
f2fbc168af Scalar target builtins bugfixes.
Typo in __max_varying_double.
Add declarations for half functions.
Use the gen_scatter macro to get the scatter functions.
2012-01-29 13:47:44 -08:00
Matt Pharr
b50f6f1730 Fix RNG seed code in stdlib for scalar target. 2012-01-29 13:46:57 -08:00
Matt Pharr
f8a7120d9c Detect division by 0 during constant folding and issue a sensible error. 2012-01-29 13:46:38 -08:00
Matt Pharr
20dbf59420 Don't lose source position when returning values of constant symbols. 2012-01-29 13:46:17 -08:00
Gabe Weisz
c67a286aa6 Add support for 1-wide scalar target.
Issue #40.
2012-01-29 06:36:07 -08:00
Matt Pharr
c96fef6bc8 Fix silly error in generic-16.h example C++ bindings. 2012-01-27 17:04:57 -08:00
Matt Pharr
bba02f87ea Improve implementations of unsigned <=, >= in sse4 intrinsics file. 2012-01-27 16:49:41 -08:00
Matt Pharr
12dc3f5c28 Fixes to c++ backend for new and delete
Don't include declarations of malloc/free in the generated code (get
the standard ones from system headers instead).

Add a cast to (uint8_t *) before calls to malloc, which C++ requires,
since proper malloc returns a void *.
2012-01-27 16:49:09 -08:00
Matt Pharr
0f01a5dcbe Handle undef values in LLVMVectorValuesAllEqual() 2012-01-27 16:48:14 -08:00
Matt Pharr
664dc3bdda Add support for "new" and "delete" to the language.
Issue #139.
2012-01-27 14:47:06 -08:00
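A minimal sketch (struct and names hypothetical; each executing program instance gets its own allocation from a plain 'new'):

    struct Point { float x, y, z; };
    Point *p = new Point;   // one allocation per executing program instance
    p->x = 0;
    delete p;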
Matt Pharr
bdba3cd97d Bugfix: add per-lane offsets when accessing varying data through a pointer! 2012-01-27 14:44:52 -08:00
Matt Pharr
d9c0f9315a Fix generic targets: half conversion functions weren't declared.
(Broken by 1867b5b31).
2012-01-27 14:44:43 -08:00
Matt Pharr
b7f17d435f Fix crash in gather/scatter optimization pass. 2012-01-27 14:44:35 -08:00
Matt Pharr
37cdc18639 Issue error instead of crashing given attempted function call through non-function.
Fixes issue #163.
2012-01-27 10:01:06 -08:00
Matt Pharr
5893a9c49d Remove incorrect assert 2012-01-27 09:14:45 -08:00
Matt Pharr
24f58fa16a Update per_lane macro to not use ID for lane number in macro expansion
This was leading to unintended consequences if WIDTH was used in macro code,
which was undesirable.
2012-01-27 09:12:13 -08:00
Matt Pharr
56ffc78fa4 Require semicolons after sync, assert, and print statements.
(Silly parser oversight.)
2012-01-27 09:12:13 -08:00
Matt Pharr
061e68bc77 Fix compiler crash from malformed program. 2012-01-27 09:12:13 -08:00
Matt Pharr
177e6312b4 Fix build with LLVM ToT (ConstantVector::getVectorElements() is gone now). 2012-01-27 09:07:58 -08:00
Matt Pharr
1acf4032c2 Merge branch 'master' of https://github.com/jduprat/ispc 2012-01-26 14:18:25 -08:00
Jean-Luc Duprat
0db752f3a2 Merge branch 'master' of github.com:jduprat/ispc 2012-01-26 13:43:15 -08:00
Jean-Luc Duprat
9c5444698e run_tests.py fixes:
- Python 3 fixes (can't use print)
- Fixed for running tests on Windows
2012-01-26 13:39:54 -08:00
Matt Pharr
65f3252760 Various fixes to test running script for Windows.
Also, removed the --valgrind option and replaced it with a more
general --wrap-exe option, which can be used both for running
Valgrind and SDE.
2012-01-26 10:56:29 -08:00
Matt Pharr
e612abe4ba Fix parsing of 64-bit integer constants on Windows.
(i.e., use the 64-bit unsigned integer parsing function,
not the 64-bit signed one.)

Fixes bug #68.
2012-01-26 10:56:28 -08:00
Jean-Luc Duprat
ee8b6ebbf6 Merge remote-tracking branch 'matt/master' 2012-01-26 10:41:13 -08:00
Jean-Luc Duprat
34352e4e0e beefed up stdin.h on Windows so it compiles ispc 1.1.3 2012-01-25 15:04:19 -08:00
Matt Pharr
1867b5b317 Use native float/half conversion instructions with the AVX2 target. 2012-01-24 15:33:38 -08:00
Matt Pharr
a5b7fca7e0 Extract constant offsets from gather/scatter base+offsets offset vectors.
When we're able to turn a general gather/scatter into the "base + offsets"
form, we now try to extract out any constant components of the offsets and
then pass them as a separate parameter to the gather/scatter function
implementation.

We then in turn carefully emit code for the addressing calculation so that
these constant offsets match LLVM's patterns to detect this case, such that
we get the constant offsets directly encoded in the instruction's addressing
calculation in many cases, saving arithmetic instructions to do these
calculations.

Improves performance of stencil by ~15%.  Other workloads unchanged.
2012-01-24 14:41:15 -08:00
Matt Pharr
7be2c399b1 Rename various optimization passes to have more descriptive names.
No functionality change.
2012-01-23 14:49:48 -08:00
Matt Pharr
d6337b3b22 Code cleanups in opt.cpp; no functional change 2012-01-23 14:36:32 -08:00
Matt Pharr
d2f8b0ace5 Add __clock to list of symbols to make internal from builtins. 2012-01-23 06:19:16 -08:00
Matt Pharr
d805e8b183 Add clock() function to standard library.
Also corrected the declaration of num_cores() to return a
uniform value.
2012-01-22 13:05:27 -08:00
Matt Pharr
1f0f2ec05f Include AVX2 in supported ISAs 2012-01-22 07:05:47 -08:00
Matt Pharr
91ac3b9d7c Back out WIP changes to opt.cpp that were inadvertently checked in. 2012-01-21 07:34:53 -08:00
Matt Pharr
d65bf2eb2f Doxygen number bump and release notes for 1.1.3 2012-01-20 17:04:16 -08:00
Matt Pharr
1bba9d4307 Improve atomic_swap_global() to take advantage of associativity.
We now do a single atomic hardware swap and then effectively do 
swaps between the running program instances such that the result
is the same as if they had happened to run a particular ordering
of hardware swaps themselves.

Also cleaned up __atomic_swap_uniform_* built-in implementations
to not take the mask, which they weren't using anyway.

Finishes Issue #56.
2012-01-20 10:37:33 -08:00
Matt Pharr
4388338dad Fix performance regression introduced in be0c77d556
Effectively, the patterns that detected when given a gather or
scatter in base+offsets form, the offsets were actually a multiple
of 2/4/8, were no longer working.

This change not only fixes this, but also expands the set of
patterns that are matched by this.  For example, given offsets of
the form 4*v1 + 16*v2, it identifies a scale of 4 and new offsets
of v1 + 4*v2.

This fix makes the volume renderer run 1.19x faster, and noise 1.54x
faster.
2012-01-19 17:57:59 -08:00
Matt Pharr
2fb59c90cf Fix C++ backend bug introduced in d14a2de168.
(This was causing a number of tests to fail with the generic
targets.)
2012-01-19 11:35:02 -07:00
Matt Pharr
68f6ea8def For << and >> with C++, detect when all instances are shifting by the same amount.
In this case, we now emit calls to potentially-specialized functions for the
left/right shifts that take a single integer value for the shift amount.  These
in turn can be matched to the corresponding intrinsics for the SSE target.

Issue #145.
2012-01-19 10:04:32 -07:00
Matt Pharr
3f89295d10 Update RNG code in stdlib to use -> operator where appropriate. 2012-01-19 10:02:47 -07:00
Matt Pharr
748b292e77 Improve code for uniform switches with a 'break' under varying control flow.
Previously, when we had a switch statement with a uniform switch condition
but a 'break' statement that was under varying control flow inside the
switch, we'd promote the switch condition to be varying so that the
break would work correctly.

Now, we leave the condition as uniform and are thus able to use the
more-efficient LLVM switch instruction in this case.

Issue #156.
2012-01-19 08:41:19 -07:00
Matt Pharr
6451c3d99d Fix bug with code for initializers for static arrays in generated C++ code.
(This was preventing aobench from compiling successfully with the generic
target.)
2012-01-18 16:55:09 -07:00
Matt Pharr
d14a2de168 Fix generic code emission when building with LLVM3.0/2.9.
Specifically, don't use vector select for masked store blend there,
but emit a call to a undefined __masked_store_blend_*() functions.

Added implementations of these functions to the sse4.h and generic-16.h
in examples/intrinsics.  (Calls to these will never be generated with
LLVM 3.1).
2012-01-17 23:42:22 -07:00
Matt Pharr
642150095d Include LLVM version used to build in version info printed out. 2012-01-17 23:42:22 -07:00
Matt Pharr
3bf3ac7922 Be more conservative about using blending in place of masked store.
More specifically, we do a proper masked store (rather than a load-
blend-store) unless we can determine that we're accessing a stack-allocated
"varying" variable.  This fixes a number of nefarious bugs where given
code like:

    uniform float a[21];
    foreach (i = 0 … 21)
        a[i] = 0;

We'd use a blend and in turn read past the end of a[] in the last
iteration.

Also made slight changes to inlining in aobench; this keeps compiles
to ~5s, versus ~45s without them (with this change).

Fixes issue #160.
2012-01-17 23:42:22 -07:00
Matt Pharr
c6d1cebad4 Update masked_load/store implementations for generic targets to take void *s
(Fixes compile errors when we try to actually use these!)
2012-01-17 23:42:22 -07:00
Matt Pharr
08189ce08c Update "inline" qualifiers in a few examples. 2012-01-17 23:42:22 -07:00
Matt Pharr
7013d7d52f Small documentation updates and cleanups 2012-01-17 23:42:21 -07:00
Matt Pharr
7045b76f84 Improvements to code generation for "foreach"
Specialize the code for the innermost loop to not do any masking
computations for the innermost dimension for the iterations where
we are certainly working on a full vector's worth of data.

This fix improves performance/code quality of "foreach" such that
it's essentially the same as the equivalent "for" loop.

Fixes issue #151.
2012-01-17 11:34:00 -08:00
Matt Pharr
58a0b4a20d Add separate set of builtins for AVX2.
(i.e., stop just reusing the ones for AVX1).

For now the only difference is that the int/uint min/max
functions call the new intrinsic for that.  Once gather is
available from LLVM, that will go here as well.
2012-01-13 14:40:01 -08:00
Matt Pharr
0f8eee9809 Fix cases in optimization code to not inadvertently match calls to func ptrs.
If we call a function pointer, CallInst::getCalledFunction() returns NULL; we
need to be careful about this case when we're matching various function calls
in optimization passes.

(Fixes a crash.)
2012-01-12 10:33:06 -08:00
Matt Pharr
0740299860 Fix switch test 2012-01-12 09:45:31 -08:00
Matt Pharr
652215861e Update dynamic target dispatch code to support AVX2. 2012-01-12 08:37:18 -08:00
Matt Pharr
602209e5a8 Tiny updates to documentation, comment for switch stuff. 2012-01-12 05:55:42 -08:00
Matt Pharr
b60f8b4f70 Fix merge conflicts 2012-01-11 17:13:51 -08:00
Jean-Luc Duprat
f2b99ccb08 Made run_tests.py executable 2012-01-11 10:06:41 -08:00
Matt Pharr
b67446d998 Add support for "switch" statements.
Switches with both uniform and varying "switch" expressions are
supported.  Switch statements with varying expressions and very
large numbers of labels may not perform well; some issues to be
filed shortly will track opportunities for improving these.
2012-01-11 09:16:31 -08:00
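A sketch (names hypothetical; with a varying expression, instances are masked per label):

    int result;
    switch (x) {              // 'x' may be uniform or varying
        case 0:  result = 10; break;
        case 1:  result = 20; break;
        default: result = 0;  break;
    }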
Matt Pharr
9670ab0887 Add missing cases to watch out for in lCheckAllOffSafety()
Previously, we weren't checking for member expressions that dereferenced
a pointer or pointer dereference expressions--only array indexing!
2012-01-11 09:16:31 -08:00
Matt Pharr
0223bb85ee Fix bug in StmtList::EmitCode()
Previously, we would return immediately if the current basic block
was NULL; however, this is the wrong thing to do in that goto labels
and case/default labels in switch statements will establish a new
current basic block even if the current one is NULL.
2012-01-11 09:14:39 -08:00
Jean-Luc Duprat
fd81255db1 Removed mutex support for OSX 10.5
Allow to run from the build directory even if it is not on the path
properly decode subprocess stdout/stderr as UTF-8
Added newlines that were mistakenly left out of the print->sys.stdout.write() conversion in the previous CL
Python 3:
 - fixed error message comparison
 - explicit list creation
Windows:
 - forward/back slash annoyances
 - added stdint.h with definitions for int32_t, int64_t
 - compile_error_files and run_error_files were being appended to improperly
2012-01-10 16:55:00 -08:00
Matt Pharr
8a8e1a7f73 Fix bug with multiple EmitCode() calls due to missing braces.
In short, we were inadvertently trying to emit each function's
code a second time if the function had a mask check at the start
of it.  StmtList::EmitCode() was covering this error up by
not emitting code if the current basic block is NULL.
2012-01-10 16:50:13 -08:00
Jean-Luc Duprat
ef05fbf424 run_tests.py more compatible with python 3.x
except for the mutex class...
2012-01-10 13:12:38 -08:00
Jean-Luc Duprat
fa01b63fa5 Remove assumption that . is in the PATH in run_tests.py 2012-01-10 11:41:08 -08:00
Jean-Luc Duprat
63d3d25030 Fixed off by one error in array size generated by bitcode2cpp.py 2012-01-10 11:22:13 -08:00
Jean-Luc Duprat
a8db866228 Python build compatible on both python 2 and 3 2012-01-10 10:42:15 -08:00
Jean-Luc Duprat
0519eea951 Makefile does not hardcode link paths on Linux
Link statically for both x86 and x86-64
2012-01-10 10:34:57 -08:00
Jean-Luc Duprat
5d67252ed0 Python scripts now compatible with both 2.x and 3.x releases of python 2012-01-09 13:56:05 -08:00
Jean-Luc Duprat
59f4c9985e Python files compatible with python 3 2012-01-06 16:56:09 -08:00
730 changed files with 143005 additions and 14395 deletions

.gitignore (16 changed lines)

@@ -3,6 +3,20 @@
depend
ispc
ispc_test
ispc_ref
objs
docs/doxygen
docs/ispc.html
docs/*.html
tests*/*cpp
tests*/*run
logs/
notify_log.log
alloy_results_*
examples/*/*.png
examples/*/*.ppm
examples/*/objs/*
examples/*/ref
examples/*/test
*.swp

LICENSE.txt

@@ -1,4 +1,4 @@
Copyright (c) 2010-2011, Intel Corporation
Copyright (c) 2010-2013, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -77,7 +77,7 @@ covered by the following license:
University of Illinois/NCSA
Open Source License
Copyright (c) 2003-2010 University of Illinois at Urbana-Champaign.
Copyright (c) 2003-2013 University of Illinois at Urbana-Champaign.
All rights reserved.
Developed by:

227
Makefile

@@ -1,7 +1,52 @@
#
# Copyright (c) 2010-2013, Intel Corporation
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# * Neither the name of Intel Corporation nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
# IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
# OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# ispc Makefile
#
# If you have your own special version of llvm and/or clang, change
# these variables to match.
LLVM_CONFIG=$(shell which llvm-config)
CLANG_INCLUDE=$(shell $(LLVM_CONFIG) --includedir)
# Enable ARM by request
# To enable: make ARM_ENABLED=1
ARM_ENABLED=0
# Add llvm bin to the path so any scripts run will go to the right llvm-config
LLVM_BIN= $(shell $(LLVM_CONFIG) --bindir)
export PATH:=$(LLVM_BIN):$(PATH)
ARCH_OS = $(shell uname)
ifeq ($(ARCH_OS), Darwin)
ARCH_OS2 = "OSX"
@@ -10,29 +55,34 @@ else
endif
ARCH_TYPE = $(shell arch)
ifeq ($(shell llvm-config --version), 3.1svn)
LLVM_LIBS=-lLLVMAsmParser -lLLVMInstrumentation -lLLVMLinker \
-lLLVMArchive -lLLVMBitReader -lLLVMDebugInfo -lLLVMJIT -lLLVMipo \
-lLLVMBitWriter -lLLVMTableGen -lLLVMCBackendInfo \
-lLLVMX86Disassembler -lLLVMX86CodeGen -lLLVMSelectionDAG \
-lLLVMAsmPrinter -lLLVMX86AsmParser -lLLVMX86Desc -lLLVMX86Info \
-lLLVMX86AsmPrinter -lLLVMX86Utils -lLLVMMCDisassembler -lLLVMMCParser \
-lLLVMCodeGen -lLLVMScalarOpts -lLLVMInstCombine -lLLVMTransformUtils \
-lLLVMipa -lLLVMAnalysis -lLLVMMCJIT -lLLVMRuntimeDyld \
-lLLVMExecutionEngine -lLLVMTarget -lLLVMMC -lLLVMObject -lLLVMCore \
-lLLVMSupport
else
LLVM_LIBS=$(shell llvm-config --libs)
LLVM_CXXFLAGS=$(shell $(LLVM_CONFIG) --cppflags)
LLVM_VERSION=LLVM_$(shell $(LLVM_CONFIG) --version | sed -e s/\\./_/ -e s/svn//)
LLVM_VERSION_DEF=-D$(LLVM_VERSION)
LLVM_COMPONENTS = engine ipo bitreader bitwriter instrumentation linker
# Component "option" was introduced in 3.3 and starting with 3.4 it is required for the link step.
# We check if it's available before adding it (to not break 3.2 and earlier).
ifeq ($(shell $(LLVM_CONFIG) --components |grep -c option), 1)
LLVM_COMPONENTS+=option
endif
ifneq ($(ARM_ENABLED), 0)
LLVM_COMPONENTS+=arm
endif
LLVM_LIBS=$(shell $(LLVM_CONFIG) --libs $(LLVM_COMPONENTS))
CLANG=clang
CLANG_LIBS = -lclangFrontend -lclangDriver \
-lclangSerialization -lclangParse -lclangSema \
-lclangAnalysis -lclangAST -lclangLex -lclangBasic
-lclangAnalysis -lclangAST -lclangBasic \
-lclangEdit -lclangLex
ISPC_LIBS=$(shell llvm-config --ldflags) $(CLANG_LIBS) $(LLVM_LIBS) \
ISPC_LIBS=$(shell $(LLVM_CONFIG) --ldflags) $(CLANG_LIBS) $(LLVM_LIBS) \
-lpthread
ifeq ($(LLVM_VERSION),LLVM_3_4)
ISPC_LIBS += -lcurses
endif
ifeq ($(ARCH_OS),Linux)
ISPC_LIBS += -ldl
endif
@@ -41,28 +91,44 @@ ifeq ($(ARCH_OS2),Msys)
ISPC_LIBS += -lshlwapi -limagehlp -lpsapi
endif
LLVM_CXXFLAGS=$(shell llvm-config --cppflags)
LLVM_VERSION=LLVM_$(shell llvm-config --version | sed s/\\./_/)
LLVM_VERSION_DEF=-D$(LLVM_VERSION)
# Define build time stamp and revision.
# For revision we use GIT or SVN info.
BUILD_DATE=$(shell date +%Y%m%d)
BUILD_VERSION=$(shell git log --abbrev-commit --abbrev=16 | head -1)
GIT_REVISION:=$(shell git log --abbrev-commit --abbrev=16 2>/dev/null | head -1)
ifeq (${GIT_REVISION},)
SVN_REVISION:=$(shell svn log -l 1 2>/dev/null | grep -o \^r[[:digit:]]\* )
ifeq (${SVN_REVISION},)
# Failed to get revision info
BUILD_VERSION:="no_version_info"
else
# SVN revision info
BUILD_VERSION:=$(SVN_REVISION)
endif
else
# GIT revision info
BUILD_VERSION:=$(GIT_REVISION)
endif
CXX=g++
CPP=cpp
OPT=-g3
CXXFLAGS=$(OPT) $(LLVM_CXXFLAGS) -I. -Iobjs/ -Wall $(LLVM_VERSION_DEF) \
-DBUILD_DATE="\"$(BUILD_DATE)\"" -DBUILD_VERSION="\"$(BUILD_VERSION)\""
OPT=-O2
CXXFLAGS=$(OPT) $(LLVM_CXXFLAGS) -I. -Iobjs/ -I$(CLANG_INCLUDE) \
$(LLVM_VERSION_DEF) \
-Wall \
-DBUILD_DATE="\"$(BUILD_DATE)\"" -DBUILD_VERSION="\"$(BUILD_VERSION)\"" \
-Wno-sign-compare
ifneq ($(LLVM_VERSION),LLVM_3_1)
CXXFLAGS+=-Werror
endif
ifneq ($(ARM_ENABLED), 0)
CXXFLAGS+=-DISPC_ARM_ENABLED
endif
LDFLAGS=
ifeq ($(ARCH_OS),Linux)
# try to link everything statically under Linux (including libstdc++) so
# that the binaries we generate will be portable across distributions...
ifeq ($(ARCH_TYPE),x86_64)
LDFLAGS=-static -L/usr/lib/gcc/x86_64-linux-gnu/4.4
else
LDFLAGS=-L/usr/lib/gcc/i686-redhat-linux/4.6.0
endif
# LDFLAGS=-static
endif
LEX=flex
@@ -75,26 +141,36 @@ CXX_SRC=ast.cpp builtins.cpp cbackend.cpp ctx.cpp decl.cpp expr.cpp func.cpp \
type.cpp util.cpp
HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \
opt.h stmt.h sym.h type.h util.h
TARGETS=avx avx-x2 sse2 sse2-x2 sse4 sse4-x2 generic-4 generic-8 generic-16
BUILTINS_SRC=$(addprefix builtins/target-, $(addsuffix .ll, $(TARGETS))) \
builtins/dispatch.ll
BUILTINS_OBJS=$(addprefix builtins-, $(notdir $(BUILTINS_SRC:.ll=.o))) \
builtins-c-32.cpp builtins-c-64.cpp
TARGETS=avx1-i64x4 avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 \
sse2 sse2-x2 sse4-8 sse4-16 sse4 sse4-x2 \
generic-4 generic-8 generic-16 generic-32 generic-64 generic-1
ifneq ($(ARM_ENABLED), 0)
TARGETS+=neon-32 neon-16 neon-8
endif
# These files need to be compiled in two versions - 32 and 64 bits.
BUILTINS_SRC_TARGET=$(addprefix builtins/target-, $(addsuffix .ll, $(TARGETS)))
# These files are compiled in a single version.
BUILTINS_SRC_COMMON=builtins/dispatch.ll
BUILTINS_OBJS_32=$(addprefix builtins-, $(notdir $(BUILTINS_SRC_TARGET:.ll=-32bit.o)))
BUILTINS_OBJS_64=$(addprefix builtins-, $(notdir $(BUILTINS_SRC_TARGET:.ll=-64bit.o)))
BUILTINS_OBJS=$(addprefix builtins-, $(notdir $(BUILTINS_SRC_COMMON:.ll=.o))) \
$(BUILTINS_OBJS_32) $(BUILTINS_OBJS_64) \
builtins-c-32.cpp builtins-c-64.cpp
BISON_SRC=parse.yy
FLEX_SRC=lex.ll
OBJS=$(addprefix objs/, $(CXX_SRC:.cpp=.o) $(BUILTINS_OBJS) \
stdlib_generic_ispc.o stdlib_x86_ispc.o \
stdlib_mask1_ispc.o stdlib_mask8_ispc.o stdlib_mask16_ispc.o stdlib_mask32_ispc.o stdlib_mask64_ispc.o \
$(BISON_SRC:.yy=.o) $(FLEX_SRC:.ll=.o))
default: ispc
.PHONY: dirs clean depend doxygen print_llvm_src
.PHONY: dirs clean depend doxygen print_llvm_src llvm_check
.PRECIOUS: objs/builtins-%.cpp
depend: $(CXX_SRC) $(HEADERS)
depend: llvm_check $(CXX_SRC) $(HEADERS)
@echo Updating dependencies
@gcc -MM $(CXXFLAGS) $(CXX_SRC) | sed 's_^\([a-z]\)_objs/\1_g' > depend
@$(CXX) -MM $(CXXFLAGS) $(CXX_SRC) | sed 's_^\([a-z]\)_objs/\1_g' > depend
-include depend
@@ -102,8 +178,17 @@ dirs:
@echo Creating objs/ directory
@/bin/mkdir -p objs
print_llvm_src:
llvm_check:
@llvm-config --version > /dev/null || \
(echo; \
echo "******************************************"; \
echo "ERROR: llvm-config not found in your PATH"; \
echo "******************************************"; \
echo; exit 1)
print_llvm_src: llvm_check
@echo Using LLVM `llvm-config --version` from `llvm-config --libdir`
@echo Using compiler to build: `$(CXX) --version | head -1`
clean:
/bin/rm -rf objs ispc
@@ -114,7 +199,20 @@ doxygen:
ispc: print_llvm_src dirs $(OBJS)
@echo Creating ispc executable
@$(CXX) $(LDFLAGS) -o $@ $(OBJS) $(ISPC_LIBS)
@$(CXX) $(OPT) $(LDFLAGS) -o $@ $(OBJS) $(ISPC_LIBS)
# Use clang as a default compiler, instead of gcc
clang: ispc
clang: CXX=clang++
# Build ispc with address sanitizer instrumentation using clang compiler
# Note that this is not a portable build
asan: clang
asan: OPT+=-fsanitize=address
# Do debug build, i.e. -O0 -g
debug: ispc
debug: OPT=-O0 -g
objs/%.o: %.cpp
@echo Compiling $<
@@ -124,6 +222,10 @@ objs/cbackend.o: cbackend.cpp
@echo Compiling $<
@$(CXX) -fno-rtti -fno-exceptions $(CXXFLAGS) -o $@ -c $<
objs/opt.o: opt.cpp
@echo Compiling $<
@$(CXX) -fno-rtti $(CXXFLAGS) -o $@ -c $<
objs/%.o: objs/%.cpp
@echo Compiling $<
@$(CXX) $(CXXFLAGS) -o $@ -c $<
@@ -144,24 +246,47 @@ objs/lex.o: objs/lex.cpp $(HEADERS) objs/parse.cc
@echo Compiling $<
@$(CXX) $(CXXFLAGS) -o $@ -c $<
objs/builtins-%.cpp: builtins/%.ll builtins/util.m4 $(wildcard builtins/*common.ll)
objs/builtins-dispatch.cpp: builtins/dispatch.ll builtins/util.m4 builtins/svml.m4 $(wildcard builtins/*common.ll)
@echo Creating C++ source from builtins definition file $<
@m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) $< | python bitcode2cpp.py $< > $@
@m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) -DBUILD_OS=UNIX $< | python bitcode2cpp.py $< > $@
objs/builtins-%-32bit.cpp: builtins/%.ll builtins/util.m4 builtins/svml.m4 $(wildcard builtins/*common.ll)
@echo Creating C++ source from builtins definition file $< \(32 bit version\)
@m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) -DBUILD_OS=UNIX -DRUNTIME=32 $< | python bitcode2cpp.py $< 32bit > $@
objs/builtins-%-64bit.cpp: builtins/%.ll builtins/util.m4 builtins/svml.m4 $(wildcard builtins/*common.ll)
@echo Creating C++ source from builtins definition file $< \(64 bit version\)
@m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) -DBUILD_OS=UNIX -DRUNTIME=64 $< | python bitcode2cpp.py $< 64bit > $@
objs/builtins-c-32.cpp: builtins/builtins.c
@echo Creating C++ source from builtins definition file $<
@$(CLANG) -m32 -emit-llvm -c $< -o - | llvm-dis - | python bitcode2cpp.py c-32 > $@
@$(CLANG) -m32 -emit-llvm -c $< -o - | llvm-dis - | python bitcode2cpp.py c 32 > $@
objs/builtins-c-64.cpp: builtins/builtins.c
@echo Creating C++ source from builtins definition file $<
@$(CLANG) -m64 -emit-llvm -c $< -o - | llvm-dis - | python bitcode2cpp.py c-64 > $@
@$(CLANG) -m64 -emit-llvm -c $< -o - | llvm-dis - | python bitcode2cpp.py c 64 > $@
objs/stdlib_generic_ispc.cpp: stdlib.ispc
@echo Creating C++ source from $< for generic
@$(CLANG) -E -x c -DISPC_TARGET_GENERIC=1 -DISPC=1 -DPI=3.1415926536 $< -o - | \
python stdlib2cpp.py generic > $@
objs/stdlib_mask1_ispc.cpp: stdlib.ispc
@echo Creating C++ source from $< for mask1
@$(CLANG) -E -x c -DISPC_MASK_BITS=1 -DISPC=1 -DPI=3.14159265358979 $< -o - | \
python stdlib2cpp.py mask1 > $@
objs/stdlib_x86_ispc.cpp: stdlib.ispc
@echo Creating C++ source from $< for x86
@$(CLANG) -E -x c -DISPC=1 -DPI=3.1415926536 $< -o - | \
python stdlib2cpp.py x86 > $@
objs/stdlib_mask8_ispc.cpp: stdlib.ispc
@echo Creating C++ source from $< for mask8
@$(CLANG) -E -x c -DISPC_MASK_BITS=8 -DISPC=1 -DPI=3.14159265358979 $< -o - | \
python stdlib2cpp.py mask8 > $@
objs/stdlib_mask16_ispc.cpp: stdlib.ispc
@echo Creating C++ source from $< for mask16
@$(CLANG) -E -x c -DISPC_MASK_BITS=16 -DISPC=1 -DPI=3.14159265358979 $< -o - | \
python stdlib2cpp.py mask16 > $@
objs/stdlib_mask32_ispc.cpp: stdlib.ispc
@echo Creating C++ source from $< for mask32
@$(CLANG) -E -x c -DISPC_MASK_BITS=32 -DISPC=1 -DPI=3.14159265358979 $< -o - | \
python stdlib2cpp.py mask32 > $@
objs/stdlib_mask64_ispc.cpp: stdlib.ispc
@echo Creating C++ source from $< for mask64
@$(CLANG) -E -x c -DISPC_MASK_BITS=64 -DISPC=1 -DPI=3.14159265358979 $< -o - | \
python stdlib2cpp.py mask64 > $@


@@ -47,7 +47,7 @@ remarkable `LLVM Compiler Infrastructure <http://llvm.org>`_ for back-end
code generation and optimization and is `hosted on
github <http://github.com/ispc/ispc/>`_. It supports Windows, Mac, and
Linux, with both x86 and x86-64 targets. It currently supports the SSE2,
SSE4, and AVX instruction sets.
SSE4, AVX1, and AVX2 instruction sets.
Features
--------

656
alloy.py (new executable file)

@@ -0,0 +1,656 @@
#!/usr/bin/python
#
# Copyright (c) 2013, Intel Corporation
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# * Neither the name of Intel Corporation nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
# IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
# OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# Author: Filippov Ilia
def attach_mail_file(msg, filename, name):
if os.path.exists(filename):
fp = open(filename, "rb")
to_attach = MIMEBase("application", "octet-stream")
to_attach.set_payload(fp.read())
encode_base64(to_attach)
to_attach.add_header("Content-Disposition", "attachment", filename=name)
fp.close()
msg.attach(to_attach)
def setting_paths(llvm, ispc, sde):
if llvm != "":
os.environ["LLVM_HOME"]=llvm
if ispc != "":
os.environ["ISPC_HOME"]=ispc
if sde != "":
os.environ["SDE_HOME"]=sde
def check_LLVM(which_LLVM):
answer = []
if which_LLVM[0] == " ":
return answer
p = os.environ["LLVM_HOME"]
for i in range(0,len(which_LLVM)):
if not os.path.exists(p + os.sep + "bin-" + which_LLVM[i] + os.sep + "bin"):
answer.append(which_LLVM[i])
return answer
def try_do_LLVM(text, command, from_validation):
if from_validation == True:
text = text + "\n"
print_debug("Trying to " + text, from_validation, alloy_build)
if os.system(command + " >> " + alloy_build + " 2>> " + alloy_build) != 0:
print_debug("ERROR.\n", from_validation, alloy_build)
error("can't " + text, 1)
print_debug("DONE.\n", from_validation, alloy_build)
def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, from_validation, force, make):
print_debug("Building LLVM. Version: " + version_LLVM + ". ", from_validation, alloy_build)
if revision != "":
print_debug("Revision: " + revision + ".\n", from_validation, alloy_build)
else:
print_debug("\n", from_validation, alloy_build)
# Here we work out what to build and where
current_path = os.getcwd()
llvm_home = os.environ["LLVM_HOME"]
os.chdir(llvm_home)
FOLDER_NAME=version_LLVM
if version_LLVM == "trunk":
SVN_PATH="trunk"
if version_LLVM == "3.3":
SVN_PATH="tags/RELEASE_33/final"
version_LLVM = "3_3"
if version_LLVM == "3.2":
SVN_PATH="tags/RELEASE_32/final"
version_LLVM = "3_2"
if version_LLVM == "3.1":
SVN_PATH="tags/RELEASE_31/final"
version_LLVM = "3_1"
if revision != "":
FOLDER_NAME = FOLDER_NAME + "_" + revision
revision = "-" + revision
if folder == "":
folder = FOLDER_NAME
LLVM_SRC="llvm-" + folder
LLVM_BUILD="build-" + folder
LLVM_BIN="bin-" + folder
if os.path.exists(LLVM_BIN + os.sep + "bin") and not force:
error("you have folder " + LLVM_BIN + ".\nIf you want to rebuild use --force", 1)
LLVM_BUILD_selfbuild = LLVM_BUILD + "_temp"
LLVM_BIN_selfbuild = LLVM_BIN + "_temp"
common.remove_if_exists(LLVM_SRC)
common.remove_if_exists(LLVM_BUILD)
common.remove_if_exists(LLVM_BIN)
if selfbuild:
common.remove_if_exists(LLVM_BUILD_selfbuild)
common.remove_if_exists(LLVM_BIN_selfbuild)
print_debug("Using folders: " + LLVM_SRC + " " + LLVM_BUILD + " " + LLVM_BIN + " in " +
llvm_home + "\n", from_validation, alloy_build)
# load llvm
if tarball == "":
try_do_LLVM("load LLVM from http://llvm.org/svn/llvm-project/llvm/" + SVN_PATH + " ",
"svn co " + revision + " http://llvm.org/svn/llvm-project/llvm/" + SVN_PATH + " " + LLVM_SRC,
from_validation)
os.chdir(LLVM_SRC + "/tools")
try_do_LLVM("load clang from http://llvm.org/svn/llvm-project/cfe/" + SVN_PATH + " ",
"svn co " + revision + " http://llvm.org/svn/llvm-project/cfe/" + SVN_PATH + " clang",
from_validation)
os.chdir("../")
else:
tar = tarball.split(" ")
os.makedirs(LLVM_SRC)
os.chdir(LLVM_SRC)
try_do_LLVM("untar LLVM from " + tar[0] + " ",
"tar -xvzf " + tar[0] + " --strip-components 1", from_validation)
os.chdir("./tools")
os.makedirs("clang")
os.chdir("./clang")
try_do_LLVM("untar clang from " + tar[1] + " ",
"tar -xvzf " + tar[1] + " --strip-components 1", from_validation)
os.chdir("../../")
# patching llvm
patches = glob.glob(os.environ["ISPC_HOME"] + "/llvm_patches/*.*")
for patch in patches:
if version_LLVM in os.path.basename(patch):
try_do_LLVM("patch LLVM with patch" + patch + " ", "patch -p0 < " + patch, from_validation)
os.chdir("../")
# configuring llvm, build first part of selfbuild
os.makedirs(LLVM_BUILD)
os.makedirs(LLVM_BIN)
selfbuild_compiler = ""
if selfbuild:
print_debug("Making selfbuild and use folders " + LLVM_BUILD_selfbuild + " and " +
LLVM_BIN_selfbuild + "\n", from_validation, alloy_build)
os.makedirs(LLVM_BUILD_selfbuild)
os.makedirs(LLVM_BIN_selfbuild)
os.chdir(LLVM_BUILD_selfbuild)
try_do_LLVM("configure release version for selfbuild ",
"../" + LLVM_SRC + "/configure --prefix=" + llvm_home + "/" +
LLVM_BIN_selfbuild + " --enable-optimized",
from_validation)
try_do_LLVM("build release version for selfbuild ",
make, from_validation)
try_do_LLVM("install release version for selfbuild ",
"make install",
from_validation)
os.chdir("../")
selfbuild_compiler = " CC="+llvm_home+ "/" + LLVM_BIN_selfbuild + "/bin/clang"
print_debug("Now we have compiler for selfbuild: " + selfbuild_compiler + "\n", from_validation, alloy_build)
os.chdir(LLVM_BUILD)
if debug == False:
try_do_LLVM("configure release version ",
"../" + LLVM_SRC + "/configure --prefix=" + llvm_home + "/" +
LLVM_BIN + " --enable-optimized" + selfbuild_compiler,
from_validation)
else:
try_do_LLVM("configure debug version ",
"../" + LLVM_SRC + "/configure --prefix=" + llvm_home + "/" + LLVM_BIN +
" --enable-debug-runtime --enable-debug-symbols --enable-keep-symbols" + selfbuild_compiler,
from_validation)
# building llvm
try_do_LLVM("build LLVM ", make, from_validation)
try_do_LLVM("install LLVM ", "make install", from_validation)
os.chdir(current_path)
def check_targets():
answer = []
answer_sde = []
SSE2 = False;
SSE4 = False;
AVX = False;
AVX11 = False;
AVX2 = False;
if current_OS == "Linux":
cpu = open("/proc/cpuinfo")
f_lines = cpu.readlines()
cpu.close()
# check which native targets we have
for i in range(0,len(f_lines)):
if SSE2 == False and "sse2" in f_lines[i]:
SSE2 = True;
answer = answer + ["sse2-i32x4", "sse2-i32x8"]
if SSE4 == False and "sse4_1" in f_lines[i]:
SSE4 = True;
answer = answer + ["sse4-i32x4", "sse4-i32x8", "sse4-i16x8", "sse4-i8x16"]
if AVX == False and "avx" in f_lines[i]:
AVX = True;
answer = answer + ["avx1-i32x8", "avx1-i32x16"]
if AVX11 == False and "rdrand" in f_lines[i]:
AVX11 = True;
answer = answer + ["avx1.1-i32x8", "avx1.1-i32x16"]
if AVX2 == False and "avx2" in f_lines[i]:
AVX2 = True;
answer = answer + ["avx2-i32x8", "avx2-i32x16"]
if current_OS == "MacOS":
f_lines = take_lines("sysctl machdep.cpu.features", "first")
if "SSE2" in f_lines:
SSE2 = True;
answer = answer + ["sse2-i32x4", "sse2-i32x8"]
if "SSE4.1" in f_lines:
SSE4 = True;
answer = answer + ["sse4-i32x4", "sse4-i32x8", "sse4-i16x8", "sse4-i8x16"]
if "AVX1.0" in f_lines:
AVX = True;
answer = answer + ["avx1-i32x8", "avx1-i32x16"]
if "RDRAND" in f_lines:
AVX11 = True;
answer = answer + ["avx1.1-i32x8", "avx1.1-i32x16"]
if "AVX2.0" in f_lines:
AVX2 = True;
answer = answer + ["avx2-i32x8", "avx2-i32x16"]
answer = answer + ["generic-4", "generic-16", "generic-8", "generic-1", "generic-32", "generic-64"]
# now check what targets we have with the help of SDE
sde_exists = ""
PATH_dir = string.split(os.getenv("PATH"), os.pathsep)
for counter in PATH_dir:
if os.path.exists(counter + os.sep + "sde") and sde_exists == "":
sde_exists = counter + os.sep + "sde"
if os.environ.get("SDE_HOME") != None:
if os.path.exists(os.environ.get("SDE_HOME") + os.sep + "sde"):
sde_exists = os.environ.get("SDE_HOME") + os.sep + "sde"
if sde_exists == "":
error("you haven't got sde neither in SDE_HOME nor in your PATH.\n" +
"To test all platforms please set SDE_HOME to path containing SDE.\n" +
"Please refer to http://www.intel.com/software/sde for SDE download information.", 2)
return [answer, answer_sde]
# here we have SDE
f_lines = take_lines(sde_exists + " -help", "all")
for i in range(0,len(f_lines)):
if SSE4 == False and "wsm" in f_lines[i]:
answer_sde = answer_sde + [["-wsm", "sse4-i32x4"], ["-wsm", "sse4-i32x8"], ["-wsm", "sse4-i16x8"], ["-wsm", "sse4-i8x16"]]
if AVX == False and "snb" in f_lines[i]:
answer_sde = answer_sde + [["-snb", "avx1-i32x8"], ["-snb", "avx1-i32x16"]]
if AVX11 == False and "ivb" in f_lines[i]:
answer_sde = answer_sde + [["-ivb", "avx1.1-i32x8"], ["-ivb", "avx1.1-i32x16"]]
if AVX2 == False and "hsw" in f_lines[i]:
answer_sde = answer_sde + [["-hsw", "avx2-i32x8"], ["-hsw", "avx2-i32x16"]]
return [answer, answer_sde]
def build_ispc(version_LLVM, make):
current_path = os.getcwd()
os.chdir(os.environ["ISPC_HOME"])
p_temp = os.getenv("PATH")
os.environ["PATH"] = os.environ["LLVM_HOME"] + "/bin-" + version_LLVM + "/bin:" + os.environ["PATH"]
try_do_LLVM("clean ISPC for building", "make clean", True)
try_do_LLVM("build ISPC with LLVM version " + version_LLVM + " ", make, True)
os.environ["PATH"] = p_temp
os.chdir(current_path)
def execute_stability(stability, R, print_version):
stability1 = copy.deepcopy(stability)
temp = run_tests.run_tests(stability1, [], print_version)
for j in range(0,4):
R[j][0] = R[j][0] + temp[j]
for i in range(0,len(temp[j])):
R[j][1].append(temp[4])
number_of_fails = temp[5]
number_of_new_fails = len(temp[0]) + len(temp[1])
if number_of_fails == 0:
str_fails = ". No fails"
else:
str_fails = ". Fails: " + str(number_of_fails)
if number_of_new_fails == 0:
str_new_fails = ", No new fails.\n"
else:
str_new_fails = ", New fails: " + str(number_of_new_fails) + ".\n"
print_debug(temp[4][1:-3] + str_fails + str_new_fails, False, stability_log)
def run_special_tests():
i = 5
def validation_run(only, only_targets, reference_branch, number, notify, update, make):
os.chdir(os.environ["ISPC_HOME"])
os.environ["PATH"] = os.environ["ISPC_HOME"] + ":" + os.environ["PATH"]
if options.notify != "":
common.remove_if_exists(os.environ["ISPC_HOME"] + os.sep + "notify_log.log")
smtp_server = os.environ["SMTP_ISPC"]
msg = MIMEMultipart()
msg['Subject'] = 'ISPC test system results'
msg['From'] = 'ISPC_test_system'
msg['To'] = options.notify
print_debug("Command: " + ' '.join(sys.argv) + "\n", False, "")
print_debug("Folder: " + os.environ["ISPC_HOME"] + "\n", False, "")
date = datetime.datetime.now()
print_debug("Date: " + date.strftime('%H:%M %d/%m/%Y') + "\n", False, "")
class options_for_drivers:
pass
# *** *** ***
# Stability validation run
# *** *** ***
if ((("stability" in only) == True) or ("performance" in only) == False):
print_debug("\n\nStability validation run\n\n", False, "")
stability = options_for_drivers()
# stability constant options
stability.random = False
stability.ispc_flags = ""
stability.compiler_exe = None
stability.num_jobs = 1024
stability.verbose = False
stability.time = False
stability.non_interactive = True
stability.update = update
stability.include_file = None
stability.silent = True
stability.in_file = "." + os.sep + f_date + os.sep + "run_tests_log.log"
stability.verify = False
# stability varying options
stability.target = ""
stability.arch = ""
stability.no_opt = False
stability.wrapexe = ""
# prepare parameters of run
[targets_t, sde_targets_t] = check_targets()
rebuild = True
opts = []
archs = []
LLVM = []
targets = []
sde_targets = []
# parse the 'only' option and update the run parameters
if "-O2" in only:
opts.append(False)
if "-O0" in only:
opts.append(True)
if "x86" in only and not ("x86-64" in only):
archs.append("x86")
if "x86-64" in only:
archs.append("x86-64")
if "native" in only:
sde_targets_t = []
for i in ["3.1", "3.2", "3.3", "trunk"]:
if i in only:
LLVM.append(i)
if "current" in only:
LLVM = [" "]
rebuild = False
else:
common.check_tools(1)
if only_targets != "":
only_targets_t = only_targets.split(" ")
for i in only_targets_t:
err = True
for j in range(0,len(targets_t)):
if i in targets_t[j]:
targets.append(targets_t[j])
err = False
for j in range(0,len(sde_targets_t)):
if i in sde_targets_t[j][1]:
sde_targets.append(sde_targets_t[j])
err = False
if err == True:
error("You haven't sde for target " + i, 1)
else:
targets = targets_t[:-4]
sde_targets = sde_targets_t
if "build" in only:
targets = []
sde_targets = []
only = only + " stability "
# finish parameters of run, prepare LLVM
if len(opts) == 0:
opts = [False]
if len(archs) == 0:
archs = ["x86", "x86-64"]
if len(LLVM) == 0:
LLVM = ["3.3", "trunk"]
gen_archs = ["x86-64"]
need_LLVM = check_LLVM(LLVM)
for i in range(0,len(need_LLVM)):
build_LLVM(need_LLVM[i], "", "", "", False, False, True, False, make)
# begin validation run for stability
common.remove_if_exists(stability.in_file)
R = [[[],[]],[[],[]],[[],[]],[[],[]]]
print_debug("\n_________________________STABILITY REPORT_________________________\n", False, stability_log)
for i in range(0,len(LLVM)):
print_version = 2
if rebuild:
build_ispc(LLVM[i], make)
for j in range(0,len(targets)):
stability.target = targets[j]
stability.wrapexe = ""
if "generic" in targets[j]:
arch = gen_archs
else:
arch = archs
for i1 in range(0,len(arch)):
for i2 in range(0,len(opts)):
stability.arch = arch[i1]
stability.no_opt = opts[i2]
execute_stability(stability, R, print_version)
print_version = 0
for j in range(0,len(sde_targets)):
stability.target = sde_targets[j][1]
stability.wrapexe = os.environ["SDE_HOME"] + "/sde " + sde_targets[j][0] + " -- "
for i1 in range(0,len(archs)):
for i2 in range(0,len(opts)):
stability.arch = archs[i1]
stability.no_opt = opts[i2]
execute_stability(stability, R, print_version)
print_version = 0
# run special tests like embree
#
run_special_tests()
ttt = ["NEW RUNFAILS: ", "NEW COMPFAILS: ", "NEW PASSES RUNFAILS: ", "NEW PASSES COMPFAILS: "]
for j in range(0,4):
if len(R[j][0]) == 0:
print_debug("NO " + ttt[j][:-2] + "\n", False, stability_log)
else:
print_debug(ttt[j] + str(len(R[j][0])) + "\n", False, stability_log)
temp5 = [[],[]]
for i in range(0,len(R[j][0])):
er = True
for k in range(0,len(temp5[0])):
if R[j][0][i] == temp5[0][k]:
temp5[1][k].append(R[j][1][i])
er = False
if er == True:
temp5[0].append(R[j][0][i])
temp5[1].append([R[j][1][i]])
for i in range(0,len(temp5[0])):
print_debug("\t" + temp5[0][i] + "\n", True, stability_log)
for k in range(0,len(temp5[1][i])):
print_debug("\t\t\t" + temp5[1][i][k], True, stability_log)
print_debug("__________________Watch stability.log for details_________________\n", False, stability_log)
if options.notify != "":
attach_mail_file(msg, stability.in_file, "run_tests_log.log")
attach_mail_file(msg, stability_log, "stability.log")
# *** *** ***
# Performance validation run
# *** *** ***
if ((("performance" in only) == True) or ("stability" in only) == False):
print_debug("\n\nPerformance validation run\n\n", False, "")
common.check_tools(1)
performance = options_for_drivers()
# performance constant options
performance.number = number
performance.config = "./perf.ini"
performance.path = "./"
performance.silent = True
performance.output = ""
performance.compiler = ""
performance.ref = "ispc_ref"
performance.in_file = "." + os.sep + f_date + os.sep + "performance.log"
# prepare LLVM 3.3 as newest LLVM
need_LLVM = check_LLVM(["3.3"])
if len(need_LLVM) != 0:
build_LLVM(need_LLVM[i], "", "", "", False, False, True, False, make)
# prepare reference point. build both test and reference compilers
try_do_LLVM("apply git", "git branch", True)
temp4 = take_lines("git branch", "all")
for line in temp4:
if "*" in line:
current_branch = line[2:-1]
stashing = True
sys.stdout.write("Please, don't interrupt script here! You can have not sync git status after interruption!\n")
if "No local changes" in take_lines("git stash", "first"):
stashing = False
#try_do_LLVM("stash current branch ", "git stash", True)
try_do_LLVM("checkout reference branch " + reference_branch + " ", "git checkout " + reference_branch, True)
sys.stdout.write(".\n")
build_ispc("3.3", make)
sys.stdout.write(".\n")
os.rename("ispc", "ispc_ref")
try_do_LLVM("checkout test branch " + current_branch + " ", "git checkout " + current_branch, True)
if stashing:
try_do_LLVM("return current branch ", "git stash pop", True)
sys.stdout.write("You can interrupt script now.\n")
build_ispc("3.3", make)
# begin validation run for performance. output is inserted into perf()
perf.perf(performance, [])
if options.notify != "":
attach_mail_file(msg, performance.in_file, "performance.log")
attach_mail_file(msg, "." + os.sep + "logs" + os.sep + "perf_build.log", "perf_build.log")
# sending e-mail with results
if options.notify != "":
fp = open(os.environ["ISPC_HOME"] + os.sep + "notify_log.log", 'rb')
f_lines = fp.readlines()
fp.close()
line = ""
for i in range(0,len(f_lines)):
line = line + f_lines[i][:-1]
line = line + ' \n'
text = MIMEText(line, "", "KOI-8")
msg.attach(text)
attach_mail_file(msg, alloy_build, "alloy_build.log")
s = smtplib.SMTP(smtp_server)
s.sendmail('ISPC_test_system', options.notify, msg.as_string())
s.quit()
def Main():
global current_OS
if (platform.system() == 'Windows' or 'CYGWIN_NT' in platform.system()) == True:
current_OS = "Windows"
error("Windows isn't supported now", 1)
else:
if (platform.system() == 'Darwin'):
current_OS = "MacOS"
else:
current_OS = "Linux"
if (options.build_llvm == False and options.validation_run == False):
parser.print_help()
exit(0)
setting_paths(options.llvm_home, options.ispc_home, options.sde_home)
if os.environ.get("LLVM_HOME") == None:
error("you have no LLVM_HOME", 1)
if os.environ.get("ISPC_HOME") == None:
error("you have no ISPC_HOME", 1)
if options.notify != "":
if os.environ.get("SMTP_ISPC") == None:
error("you have no SMTP_ISPC in your environment for option notify", 1)
if options.only != "":
test_only_r = " 3.1 3.2 3.3 trunk current build stability performance x86 x86-64 -O0 -O2 native "
test_only = options.only.split(" ")
for iterator in test_only:
if not (" " + iterator + " " in test_only_r):
error("unknow option for only: " + iterator, 1)
global f_date
f_date = "logs"
common.remove_if_exists(f_date)
os.makedirs(f_date)
global alloy_build
alloy_build = os.getcwd() + os.sep + f_date + os.sep + "alloy_build.log"
global stability_log
stability_log = os.getcwd() + os.sep + f_date + os.sep + "stability.log"
current_path = os.getcwd()
make = "make -j" + options.speed
try:
if options.build_llvm:
build_LLVM(options.version, options.revision, options.folder, options.tarball,
options.debug, options.selfbuild, False, options.force, make)
if options.validation_run:
validation_run(options.only, options.only_targets, options.branch,
options.number_for_performance, options.notify, options.update, make)
finally:
os.chdir(current_path)
date_name = "alloy_results_" + datetime.datetime.now().strftime('%d_%m_%Y_%H_%M_%S')
if os.path.exists(date_name):
error("It's forbidden to run alloy two times in a second, logs are in ./logs", 1)
os.rename(f_date, date_name)
print_debug("Logs are in " + date_name + "\n", False, "")
###Main###
from optparse import OptionParser
from optparse import OptionGroup
import sys
import os
import operator
import time
import glob
import string
import platform
import smtplib
import datetime
import copy
from email.MIMEMultipart import MIMEMultipart
from email.MIMEBase import MIMEBase
from email.mime.text import MIMEText
from email.Encoders import encode_base64
# our drivers
import run_tests
import perf
import common
error = common.error
take_lines = common.take_lines
print_debug = common.print_debug
# parsing options
class MyParser(OptionParser):
def format_epilog(self, formatter):
return self.epilog
examples = ("Examples:\n" +
"Load and build LLVM from trunk\n\talloy.py -b\n" +
"Load and build LLVM 3.3. Rewrite LLVM folders\n\talloy.py -b --version=3.3 --force\n" +
"Untar files llvm.tgz clang.tgz, build LLVM from them in folder bin-from_tar\n\talloy.py -b --tarball='llvm.tgz clang.tgz' --folder=from_tar\n" +
"Load LLVM from trunk, revision r172870. Build it. Do selfbuild\n\talloy.py -b --revision=r172870 --selfbuild\n" +
"Validation run with LLVM 3.3, trunk; x86, x86-64; -O2;\nall supported targets; performance\n\talloy.py -r\n" +
"Validation run with all avx targets and sse4-i8x16 without performance\n\talloy.py -r --only=stability --only-targets='avx sse4-i8x16'\n" +
"Validation run with avx2-i32x8, all sse4 and sse2 targets\nand all targets with i32x16\n\talloy.py -r --only-targets='avx2-i32x8 sse4 i32x16 sse2'\n" +
"Stability validation run with LLVM 3.2, 3.3; -O0; x86,\nupdate fail_db.txt with passes and fails\n\talloy.py -r --only='3.2 -O0 stability 3.3 x86' --update-errors=FP\n" +
"Try to build compiler with all LLVM\n\talloy.py -r --only=build\n" +
"Performance validation run with 10 runs of each test and comparing to branch 'old'\n\talloy.py -r --only=performance --compare-with=old --number=10\n" +
"Validation run. Update fail_db.txt with new fails, send results to my@my.com\n\talloy.py -r --update-errors=F --notify='my@my.com'\n")
parser = MyParser(usage="Usage: alloy.py -r/-b [options]", epilog=examples)
parser.add_option('-b', '--build-llvm', dest='build_llvm',
help='ask to build LLVM', default=False, action="store_true")
parser.add_option('-r', '--run', dest='validation_run',
help='ask for validation run', default=False, action="store_true")
parser.add_option('-j', dest='speed',
help='set -j for make', default="8")
# options for activity "build LLVM"
llvm_group = OptionGroup(parser, "Options for building LLVM",
"These options must be used with -b option.")
llvm_group.add_option('--version', dest='version',
help='version of llvm to build: 3.1 3.2 3.3 trunk. Default: trunk', default="trunk")
llvm_group.add_option('--revision', dest='revision',
help='revision of llvm to build in format r172870', default="")
llvm_group.add_option('--debug', dest='debug',
help='debug build of LLVM?', default=False, action="store_true")
llvm_group.add_option('--folder', dest='folder',
help='folder to build LLVM in', default="")
llvm_group.add_option('--tarball', dest='tarball',
help='"llvm_tarball clang_tarball"', default="")
llvm_group.add_option('--selfbuild', dest='selfbuild',
help='make selfbuild of LLVM and clang', default=False, action="store_true")
llvm_group.add_option('--force', dest='force',
help='rebuild LLVM', default=False, action='store_true')
parser.add_option_group(llvm_group)
# options for activity "validation run"
run_group = OptionGroup(parser, "Options for validation run",
"These options must be used with -r option.")
run_group.add_option('--compare-with', dest='branch',
help='set performance reference point. Default: master', default="master")
run_group.add_option('--number', dest='number_for_performance',
help='number of performance runs for each test. Default: 5', default=5)
run_group.add_option('--notify', dest='notify',
help='email to send results to', default="")
run_group.add_option('--update-errors', dest='update',
help='rewrite fail_db.txt file according to received results (F or FP)', default="")
run_group.add_option('--only-targets', dest='only_targets',
help='set list of targets to test. Possible values - all subnames of targets.',
default="")
run_group.add_option('--only', dest='only',
help='set types of tests. Possible values:\n' +
'-O0, -O2, x86, x86-64, stability (test only stability), performance (test only performance)\n' +
'build (only build with different LLVM), 3.1, 3.2, 3.3, trunk, native (do not use SDE), current (do not rebuild ISPC).',
default="")
parser.add_option_group(run_group)
# options for activity "setup PATHS"
setup_group = OptionGroup(parser, "Options for setup",
"These options must be use with -r or -b to setup environment variables")
setup_group.add_option('--llvm_home', dest='llvm_home',help='path to LLVM',default="")
setup_group.add_option('--ispc_home', dest='ispc_home',help='path to ISPC',default="")
setup_group.add_option('--sde_home', dest='sde_home',help='path to SDE',default="")
parser.add_option_group(setup_group)
(options, args) = parser.parse_args()
Main()

239
ast.cpp

@@ -1,5 +1,5 @@
/*
Copyright (c) 2011, Intel Corporation
Copyright (c) 2011-2012, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -28,12 +28,14 @@
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/** @file ast.cpp
@brief
*/
@brief General functionality related to abstract syntax trees and
traversal of them.
*/
#include "ast.h"
#include "expr.h"
@@ -53,10 +55,10 @@ ASTNode::~ASTNode() {
// AST
void
AST::AddFunction(Symbol *sym, const std::vector<Symbol *> &args, Stmt *code) {
AST::AddFunction(Symbol *sym, Stmt *code) {
if (sym == NULL)
return;
functions.push_back(new Function(sym, args, code));
functions.push_back(new Function(sym, code));
}
@@ -90,30 +92,37 @@ WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc,
DoStmt *dos;
ForStmt *fs;
ForeachStmt *fes;
ForeachActiveStmt *fas;
ForeachUniqueStmt *fus;
CaseStmt *cs;
DefaultStmt *defs;
SwitchStmt *ss;
ReturnStmt *rs;
LabeledStmt *ls;
StmtList *sl;
PrintStmt *ps;
AssertStmt *as;
DeleteStmt *dels;
UnmaskedStmt *ums;
if ((es = dynamic_cast<ExprStmt *>(node)) != NULL)
es->expr = (Expr *)WalkAST(es->expr, preFunc, postFunc, data);
else if ((ds = dynamic_cast<DeclStmt *>(node)) != NULL) {
for (unsigned int i = 0; i < ds->vars.size(); ++i)
ds->vars[i].init = (Expr *)WalkAST(ds->vars[i].init, preFunc,
ds->vars[i].init = (Expr *)WalkAST(ds->vars[i].init, preFunc,
postFunc, data);
}
else if ((is = dynamic_cast<IfStmt *>(node)) != NULL) {
is->test = (Expr *)WalkAST(is->test, preFunc, postFunc, data);
is->trueStmts = (Stmt *)WalkAST(is->trueStmts, preFunc,
is->trueStmts = (Stmt *)WalkAST(is->trueStmts, preFunc,
postFunc, data);
is->falseStmts = (Stmt *)WalkAST(is->falseStmts, preFunc,
is->falseStmts = (Stmt *)WalkAST(is->falseStmts, preFunc,
postFunc, data);
}
else if ((dos = dynamic_cast<DoStmt *>(node)) != NULL) {
dos->testExpr = (Expr *)WalkAST(dos->testExpr, preFunc,
dos->testExpr = (Expr *)WalkAST(dos->testExpr, preFunc,
postFunc, data);
dos->bodyStmts = (Stmt *)WalkAST(dos->bodyStmts, preFunc,
dos->bodyStmts = (Stmt *)WalkAST(dos->bodyStmts, preFunc,
postFunc, data);
}
else if ((fs = dynamic_cast<ForStmt *>(node)) != NULL) {
@@ -124,13 +133,28 @@ WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc,
}
else if ((fes = dynamic_cast<ForeachStmt *>(node)) != NULL) {
for (unsigned int i = 0; i < fes->startExprs.size(); ++i)
fes->startExprs[i] = (Expr *)WalkAST(fes->startExprs[i], preFunc,
fes->startExprs[i] = (Expr *)WalkAST(fes->startExprs[i], preFunc,
postFunc, data);
for (unsigned int i = 0; i < fes->endExprs.size(); ++i)
fes->endExprs[i] = (Expr *)WalkAST(fes->endExprs[i], preFunc,
fes->endExprs[i] = (Expr *)WalkAST(fes->endExprs[i], preFunc,
postFunc, data);
fes->stmts = (Stmt *)WalkAST(fes->stmts, preFunc, postFunc, data);
}
else if ((fas = dynamic_cast<ForeachActiveStmt *>(node)) != NULL) {
fas->stmts = (Stmt *)WalkAST(fas->stmts, preFunc, postFunc, data);
}
else if ((fus = dynamic_cast<ForeachUniqueStmt *>(node)) != NULL) {
fus->expr = (Expr *)WalkAST(fus->expr, preFunc, postFunc, data);
fus->stmts = (Stmt *)WalkAST(fus->stmts, preFunc, postFunc, data);
}
else if ((cs = dynamic_cast<CaseStmt *>(node)) != NULL)
cs->stmts = (Stmt *)WalkAST(cs->stmts, preFunc, postFunc, data);
else if ((defs = dynamic_cast<DefaultStmt *>(node)) != NULL)
defs->stmts = (Stmt *)WalkAST(defs->stmts, preFunc, postFunc, data);
else if ((ss = dynamic_cast<SwitchStmt *>(node)) != NULL) {
ss->expr = (Expr *)WalkAST(ss->expr, preFunc, postFunc, data);
ss->stmts = (Stmt *)WalkAST(ss->stmts, preFunc, postFunc, data);
}
else if (dynamic_cast<BreakStmt *>(node) != NULL ||
dynamic_cast<ContinueStmt *>(node) != NULL ||
dynamic_cast<GotoStmt *>(node) != NULL) {
@@ -139,7 +163,7 @@ WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc,
else if ((ls = dynamic_cast<LabeledStmt *>(node)) != NULL)
ls->stmt = (Stmt *)WalkAST(ls->stmt, preFunc, postFunc, data);
else if ((rs = dynamic_cast<ReturnStmt *>(node)) != NULL)
rs->val = (Expr *)WalkAST(rs->val, preFunc, postFunc, data);
rs->expr = (Expr *)WalkAST(rs->expr, preFunc, postFunc, data);
else if ((sl = dynamic_cast<StmtList *>(node)) != NULL) {
std::vector<Stmt *> &sls = sl->stmts;
for (unsigned int i = 0; i < sls.size(); ++i)
@@ -149,6 +173,10 @@ WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc,
ps->values = (Expr *)WalkAST(ps->values, preFunc, postFunc, data);
else if ((as = dynamic_cast<AssertStmt *>(node)) != NULL)
as->expr = (Expr *)WalkAST(as->expr, preFunc, postFunc, data);
else if ((dels = dynamic_cast<DeleteStmt *>(node)) != NULL)
dels->expr = (Expr *)WalkAST(dels->expr, preFunc, postFunc, data);
else if ((ums = dynamic_cast<UnmaskedStmt *>(node)) != NULL)
ums->stmts = (Stmt *)WalkAST(ums->stmts, preFunc, postFunc, data);
else
FATAL("Unhandled statement type in WalkAST()");
}
@@ -166,9 +194,11 @@ WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc,
MemberExpr *me;
TypeCastExpr *tce;
ReferenceExpr *re;
DereferenceExpr *dre;
PtrDerefExpr *ptrderef;
RefDerefExpr *refderef;
SizeOfExpr *soe;
AddressOfExpr *aoe;
NewExpr *newe;
if ((ue = dynamic_cast<UnaryExpr *>(node)) != NULL)
ue->expr = (Expr *)WalkAST(ue->expr, preFunc, postFunc, data);
@@ -187,7 +217,7 @@ WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc,
}
else if ((el = dynamic_cast<ExprList *>(node)) != NULL) {
for (unsigned int i = 0; i < el->exprs.size(); ++i)
el->exprs[i] = (Expr *)WalkAST(el->exprs[i], preFunc,
el->exprs[i] = (Expr *)WalkAST(el->exprs[i], preFunc,
postFunc, data);
}
else if ((fce = dynamic_cast<FunctionCallExpr *>(node)) != NULL) {
@@ -206,20 +236,30 @@ WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc,
tce->expr = (Expr *)WalkAST(tce->expr, preFunc, postFunc, data);
else if ((re = dynamic_cast<ReferenceExpr *>(node)) != NULL)
re->expr = (Expr *)WalkAST(re->expr, preFunc, postFunc, data);
else if ((dre = dynamic_cast<DereferenceExpr *>(node)) != NULL)
dre->expr = (Expr *)WalkAST(dre->expr, preFunc, postFunc, data);
else if ((ptrderef = dynamic_cast<PtrDerefExpr *>(node)) != NULL)
ptrderef->expr = (Expr *)WalkAST(ptrderef->expr, preFunc, postFunc,
data);
else if ((refderef = dynamic_cast<RefDerefExpr *>(node)) != NULL)
refderef->expr = (Expr *)WalkAST(refderef->expr, preFunc, postFunc,
data);
else if ((soe = dynamic_cast<SizeOfExpr *>(node)) != NULL)
soe->expr = (Expr *)WalkAST(soe->expr, preFunc, postFunc, data);
else if ((aoe = dynamic_cast<AddressOfExpr *>(node)) != NULL)
aoe->expr = (Expr *)WalkAST(aoe->expr, preFunc, postFunc, data);
else if ((newe = dynamic_cast<NewExpr *>(node)) != NULL) {
newe->countExpr = (Expr *)WalkAST(newe->countExpr, preFunc,
postFunc, data);
newe->initExpr = (Expr *)WalkAST(newe->initExpr, preFunc,
postFunc, data);
}
else if (dynamic_cast<SymbolExpr *>(node) != NULL ||
dynamic_cast<ConstExpr *>(node) != NULL ||
dynamic_cast<FunctionSymbolExpr *>(node) != NULL ||
dynamic_cast<SyncExpr *>(node) != NULL ||
dynamic_cast<NullPointerExpr *>(node) != NULL) {
// nothing to do
// nothing to do
}
else
else
FATAL("Unhandled expression type in WalkAST().");
}
@@ -279,18 +319,165 @@ TypeCheck(Stmt *stmt) {
}
struct CostData {
CostData() { cost = foreachDepth = 0; }
int cost;
int foreachDepth;
};
static bool
lCostCallback(ASTNode *node, void *c) {
int *cost = (int *)c;
*cost += node->EstimateCost();
lCostCallbackPre(ASTNode *node, void *d) {
CostData *data = (CostData *)d;
if (dynamic_cast<ForeachStmt *>(node) != NULL)
++data->foreachDepth;
if (data->foreachDepth == 0)
data->cost += node->EstimateCost();
return true;
}
static ASTNode *
lCostCallbackPost(ASTNode *node, void *d) {
CostData *data = (CostData *)d;
if (dynamic_cast<ForeachStmt *>(node) != NULL)
--data->foreachDepth;
return node;
}
int
EstimateCost(ASTNode *root) {
int cost = 0;
WalkAST(root, lCostCallback, NULL, &cost);
return cost;
CostData data;
WalkAST(root, lCostCallbackPre, lCostCallbackPost, &data);
return data.cost;
}
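To see the effect of the new pre/post callbacks, consider a hypothetical ispc fragment (not from the diff): nodes visited while foreachDepth is non-zero no longer add to the total, so only the code outside the foreach contributes to the estimate.

    void scale(uniform float a[], uniform int n) {
        uniform float s = 2.0;  // counted: visited at foreachDepth == 0
        foreach (i = 0 ... n) {
            a[i] *= s;          // inside the foreach body: not counted
        }
    }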
/** Given an AST node, check to see if it's safe if we happen to run the
code for that node with the execution mask all off.
*/
static bool
lCheckAllOffSafety(ASTNode *node, void *data) {
bool *okPtr = (bool *)data;
FunctionCallExpr *fce;
if ((fce = dynamic_cast<FunctionCallExpr *>(node)) != NULL) {
if (fce->func == NULL)
return false;
const Type *type = fce->func->GetType();
const PointerType *pt = CastType<PointerType>(type);
if (pt != NULL)
type = pt->GetBaseType();
const FunctionType *ftype = CastType<FunctionType>(type);
Assert(ftype != NULL);
if (ftype->isSafe == false) {
*okPtr = false;
return false;
}
}
if (dynamic_cast<AssertStmt *>(node) != NULL) {
// While it's fine to run the assert for varying tests, it's not
// desirable to check an assert on a uniform variable if all of the
// lanes are off.
*okPtr = false;
return false;
}
if (dynamic_cast<NewExpr *>(node) != NULL ||
dynamic_cast<DeleteStmt *>(node) != NULL) {
// We definitely don't want to run the uniform variants of these if
// the mask is all off. It's also worth skipping the overhead of
// executing the varying versions of them in the all-off mask case.
*okPtr = false;
return false;
}
if (dynamic_cast<ForeachStmt *>(node) != NULL ||
dynamic_cast<ForeachActiveStmt *>(node) != NULL ||
dynamic_cast<ForeachUniqueStmt *>(node) != NULL ||
dynamic_cast<UnmaskedStmt *>(node) != NULL) {
// The various foreach statements also shouldn't be run with an
// all-off mask. Since they can re-establish an 'all on' mask,
// this would be pretty unintuitive. (More generally, it's
// possibly a little strange to allow foreach in the presence of
// any non-uniform control flow...)
//
// Similarly, the implementation of foreach_unique assumes as a
// precondition that the mask won't be all off going into it, so
// we'll enforce that here...
*okPtr = false;
return false;
}
IndexExpr *ie;
if ((ie = dynamic_cast<IndexExpr *>(node)) != NULL && ie->baseExpr != NULL) {
const Type *type = ie->baseExpr->GetType();
if (type == NULL)
return true;
if (CastType<ReferenceType>(type) != NULL)
type = type->GetReferenceTarget();
ConstExpr *ce = dynamic_cast<ConstExpr *>(ie->index);
if (ce == NULL) {
// indexing with a variable... -> not safe
*okPtr = false;
return false;
}
const PointerType *pointerType = CastType<PointerType>(type);
if (pointerType != NULL) {
// pointer[index] -> can't be sure -> not safe
*okPtr = false;
return false;
}
const SequentialType *seqType = CastType<SequentialType>(type);
Assert(seqType != NULL);
int nElements = seqType->GetElementCount();
if (nElements == 0) {
// Unsized array, so we can't be sure -> not safe
*okPtr = false;
return false;
}
int32_t indices[ISPC_MAX_NVEC];
int count = ce->GetValues(indices);
for (int i = 0; i < count; ++i) {
if (indices[i] < 0 || indices[i] >= nElements) {
// Index is out of bounds -> not safe
*okPtr = false;
return false;
}
}
// All indices are in-bounds
return true;
}
MemberExpr *me;
if ((me = dynamic_cast<MemberExpr *>(node)) != NULL &&
me->dereferenceExpr) {
*okPtr = false;
return false;
}
if (dynamic_cast<PtrDerefExpr *>(node) != NULL) {
*okPtr = false;
return false;
}
return true;
}
bool
SafeToRunWithMaskAllOff(ASTNode *root) {
bool safe = true;
WalkAST(root, lCheckAllOffSafety, NULL, &safe);
return safe;
}
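Pulling these cases together, a hedged ispc sketch of what the walk classifies (hypothetical example; the unsafe constructs are shown commented out):

    void h(float * uniform p, int idx) {
        float a[8];
        float x = a[3];              // constant, in-bounds index: safe
        // float y = a[idx];         // variable index: flagged not safe
        // float z = *p;             // pointer dereference: flagged not safe
        // assert(idx >= 0);         // assert: flagged not safe
        // foreach (i = 0 ... 8) { } // foreach: flagged not safe
    }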

17
ast.h

@@ -1,5 +1,5 @@
/*
Copyright (c) 2011, Intel Corporation
Copyright (c) 2011-2012, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -28,11 +28,11 @@
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/** @file ast.h
@brief
@brief
*/
#ifndef ISPC_AST_H
@@ -84,8 +84,7 @@ class AST {
public:
/** Add the AST for a function described by the given declaration
information and source code. */
void AddFunction(Symbol *sym, const std::vector<Symbol *> &args,
Stmt *code);
void AddFunction(Symbol *sym, Stmt *code);
/** Generate LLVM IR for all of the functions into the current
module. */
@@ -122,12 +121,12 @@ extern ASTNode *Optimize(ASTNode *root);
/** Convenience version of Optimize() for Expr *s that returns an Expr *
(rather than an ASTNode *, which would require the caller to cast back
to an Expr *). */
to an Expr *). */
extern Expr *Optimize(Expr *);
/** Convenience version of Optimize() for Expr *s that returns an Stmt *
(rather than an ASTNode *, which would require the caller to cast back
to a Stmt *). */
to a Stmt *). */
extern Stmt *Optimize(Stmt *);
/** Perform type-checking on the given AST (or portion of one), returning a
@@ -144,4 +143,8 @@ extern Stmt *TypeCheck(Stmt *);
the given root. */
extern int EstimateCost(ASTNode *root);
/** Returns true if it would be safe to run the given code with an "all
off" mask. */
extern bool SafeToRunWithMaskAllOff(ASTNode *root);
#endif // ISPC_AST_H

bitcode2cpp.py

@@ -10,6 +10,8 @@ import os
length=0
src=str(sys.argv[1])
if (len(sys.argv) > 2):
runtime=str(sys.argv[2])
target = re.sub("builtins/target-", "", src)
target = re.sub(r"builtins\\target-", "", target)
@@ -26,17 +28,24 @@ if platform.system() == 'Windows' or string.find(platform.system(), "CYGWIN_NT")
try:
as_out=subprocess.Popen([llvm_as, "-", "-o", "-"], stdout=subprocess.PIPE)
except IOError:
print >> sys.stderr, "Couldn't open " + src
sys.stderr.write("Couldn't open " + src)
sys.exit(1)
print "unsigned char builtins_bitcode_" + target + "[] = {"
for line in as_out.stdout.readlines():
length = length + len(line)
for c in line:
print ord(c)
print ", "
print " 0 };\n\n"
print "int builtins_bitcode_" + target + "_length = " + str(length) + ";\n"
name = target
if (len(sys.argv) > 2):
name += "_" + runtime;
width = 16;
sys.stdout.write("unsigned char builtins_bitcode_" + name + "[] = {\n")
data = as_out.stdout.read()
for i in range(0, len(data), 1):
sys.stdout.write("0x%0.2X, " % ord(data[i:i+1]))
if i%width == (width-1):
sys.stdout.write("\n")
sys.stdout.write("0x00 };\n\n")
sys.stdout.write("int builtins_bitcode_" + name + "_length = " + str(len(data)) + ";\n")
as_out.wait()


@@ -2,8 +2,8 @@
REM If LLVM_INSTALL_DIR isn't set globally in your environment,
REM it can be set here.
set LLVM_INSTALL_DIR=c:\users\mmp\llvm-dev
set LLVM_VERSION=3.1svn
REM set LLVM_INSTALL_DIR=c:\users\mmp\llvm-dev
REM set LLVM_VERSION=LLVM_3_2
REM Both the LLVM binaries and python need to be in the path
set path=%LLVM_INSTALL_DIR%\bin;%PATH%;c:\cygwin\bin

builtins.cpp

@@ -1,5 +1,5 @@
/*
Copyright (c) 2010-2011, Intel Corporation
Copyright (c) 2010-2013, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -28,11 +28,11 @@
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/** @file builtins.cpp
@brief Definitions of functions related to setting up the standard library
@brief Definitions of functions related to setting up the standard library
and other builtins.
*/
@@ -47,12 +47,25 @@
#include <math.h>
#include <stdlib.h>
#include <llvm/LLVMContext.h>
#include <llvm/Module.h>
#include <llvm/Type.h>
#include <llvm/DerivedTypes.h>
#include <llvm/Instructions.h>
#include <llvm/Intrinsics.h>
#if defined(LLVM_3_2)
#include <llvm/Attributes.h>
#endif
#if defined(LLVM_3_1) || defined(LLVM_3_2)
#include <llvm/LLVMContext.h>
#include <llvm/Module.h>
#include <llvm/Type.h>
#include <llvm/Instructions.h>
#include <llvm/Intrinsics.h>
#include <llvm/DerivedTypes.h>
#else
#include <llvm/IR/Attributes.h>
#include <llvm/IR/LLVMContext.h>
#include <llvm/IR/Module.h>
#include <llvm/IR/Type.h>
#include <llvm/IR/Instructions.h>
#include <llvm/IR/Intrinsics.h>
#include <llvm/IR/DerivedTypes.h>
#endif
#include <llvm/Linker.h>
#include <llvm/Target/TargetMachine.h>
#include <llvm/ADT/Triple.h>
@@ -99,10 +112,7 @@ lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) {
return intAsUnsigned ? AtomicType::UniformUInt64 : AtomicType::UniformInt64;
// varying
if (LLVMTypes::MaskType != LLVMTypes::Int32VectorType &&
t == LLVMTypes::MaskType)
return AtomicType::VaryingBool;
else if (t == LLVMTypes::Int8VectorType)
if (t == LLVMTypes::Int8VectorType)
return intAsUnsigned ? AtomicType::VaryingUInt8 : AtomicType::VaryingInt8;
else if (t == LLVMTypes::Int16VectorType)
return intAsUnsigned ? AtomicType::VaryingUInt16 : AtomicType::VaryingInt16;
@@ -114,6 +124,8 @@ lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) {
return AtomicType::VaryingDouble;
else if (t == LLVMTypes::Int64VectorType)
return intAsUnsigned ? AtomicType::VaryingUInt64 : AtomicType::VaryingInt64;
else if (t == LLVMTypes::MaskType)
return AtomicType::VaryingBool;
// pointers to uniform
else if (t == LLVMTypes::Int8PointerType)
@@ -156,9 +168,9 @@ lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) {
static void
lCreateSymbol(const std::string &name, const Type *returnType,
const std::vector<const Type *> &argTypes,
const llvm::FunctionType *ftype, llvm::Function *func,
lCreateSymbol(const std::string &name, const Type *returnType,
llvm::SmallVector<const Type *, 8> &argTypes,
const llvm::FunctionType *ftype, llvm::Function *func,
SymbolTable *symbolTable) {
SourcePos noPos;
noPos.name = "__stdlib";
@@ -197,9 +209,9 @@ lCreateISPCSymbol(llvm::Function *func, SymbolTable *symbolTable) {
// symbol creation code below assumes that any LLVM vector of i32s is a
// varying int32. Here, we need that to be interpreted as a varying
// bool, so just have a one-off override for that one...
if (g->target.maskBitCount != 1 && name == "__sext_varying_bool") {
if (g->target->getMaskBitCount() != 1 && name == "__sext_varying_bool") {
const Type *returnType = AtomicType::VaryingInt32;
std::vector<const Type *> argTypes;
llvm::SmallVector<const Type *, 8> argTypes;
argTypes.push_back(AtomicType::VaryingBool);
FunctionType *funcType = new FunctionType(returnType, argTypes, noPos);
@@ -229,7 +241,7 @@ lCreateISPCSymbol(llvm::Function *func, SymbolTable *symbolTable) {
// Iterate over the arguments and try to find their equivalent ispc
// types. Track if any of the arguments has an integer type.
bool anyIntArgs = false;
std::vector<const Type *> argTypes;
llvm::SmallVector<const Type *, 8> argTypes;
for (unsigned int j = 0; j < ftype->getNumParams(); ++j) {
const llvm::Type *llvmArgType = ftype->getParamType(j);
const Type *type = lLLVMTypeToISPCType(llvmArgType, intAsUnsigned);
@@ -238,7 +250,7 @@ lCreateISPCSymbol(llvm::Function *func, SymbolTable *symbolTable) {
"representable for builtin %s", j, name.c_str());
return false;
}
anyIntArgs |=
anyIntArgs |=
(Type::Equal(type, lLLVMTypeToISPCType(llvmArgType, !intAsUnsigned)) == false);
argTypes.push_back(type);
}
@@ -273,7 +285,7 @@ lAddModuleSymbols(llvm::Module *module, SymbolTable *symbolTable) {
/** In many of the builtins-*.ll files, we have declarations of various LLVM
intrinsics that are then used in the implementation of various target-
specific functions. This function loops over all of the intrinsic
declarations and makes sure that the signature we have in our .ll file
matches the signature of the actual intrinsic.
*/
@@ -290,8 +302,9 @@ lCheckModuleIntrinsics(llvm::Module *module) {
// check the llvm.x86.* intrinsics for now...
if (!strncmp(funcName.c_str(), "llvm.x86.", 9)) {
llvm::Intrinsic::ID id = (llvm::Intrinsic::ID)func->getIntrinsicID();
if (id == 0) fprintf(stderr, "FATAL: intrinsic is not found: %s \n", funcName.c_str());
Assert(id != 0);
llvm::Type *intrinsicType =
llvm::Intrinsic::getType(*g->ctx, id);
intrinsicType = llvm::PointerType::get(intrinsicType, 0);
Assert(func->getType() == intrinsicType);
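// Illustrative example (not part of this change): for "llvm.x86.sse2.sqrt.sd",
// llvm::Intrinsic::getType() returns the function type
// "<2 x double> (<2 x double>)", so a declaration with any other signature in
// one of our builtins-*.ll files would trip the Assert above.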
@@ -322,6 +335,8 @@ lSetInternalFunctions(llvm::Module *module) {
"__add_varying_double",
"__add_varying_int32",
"__add_varying_int64",
"__all",
"__any",
"__aos_to_soa3_float",
"__aos_to_soa3_float16",
"__aos_to_soa3_float4",
@@ -386,13 +401,18 @@ lSetInternalFunctions(llvm::Module *module) {
"__ceil_uniform_float",
"__ceil_varying_double",
"__ceil_varying_float",
"__clock",
"__count_trailing_zeros_i32",
"__count_trailing_zeros_i64",
"__count_leading_zeros_i32",
"__count_leading_zeros_i64",
"__delete_uniform_32rt",
"__delete_uniform_64rt",
"__delete_varying_32rt",
"__delete_varying_64rt",
"__do_assert_uniform",
"__do_assert_varying",
"__do_print",
"__do_print",
"__doublebits_uniform_int64",
"__doublebits_varying_int64",
"__exclusive_scan_add_double",
@@ -408,12 +428,17 @@ lSetInternalFunctions(llvm::Module *module) {
"__extract_int64",
"__extract_int8",
"__fastmath",
"__float_to_half_uniform",
"__float_to_half_varying",
"__floatbits_uniform_int32",
"__floatbits_varying_int32",
"__floor_uniform_double",
"__floor_uniform_float",
"__floor_varying_double",
"__floor_varying_float",
"__get_system_isa",
"__half_to_float_uniform",
"__half_to_float_varying",
"__insert_int16",
"__insert_int32",
"__insert_int64",
@@ -435,6 +460,12 @@ lSetInternalFunctions(llvm::Module *module) {
"__max_varying_uint32",
"__max_varying_uint64",
"__memory_barrier",
"__memcpy32",
"__memcpy64",
"__memmove32",
"__memmove64",
"__memset32",
"__memset64",
"__min_uniform_double",
"__min_uniform_float",
"__min_uniform_int32",
@@ -448,6 +479,12 @@ lSetInternalFunctions(llvm::Module *module) {
"__min_varying_uint32",
"__min_varying_uint64",
"__movmsk",
"__new_uniform_32rt",
"__new_uniform_64rt",
"__new_varying32_32rt",
"__new_varying32_64rt",
"__new_varying64_64rt",
"__none",
"__num_cores",
"__packed_load_active",
"__packed_store_active",
@@ -459,12 +496,15 @@ lSetInternalFunctions(llvm::Module *module) {
"__prefetch_read_uniform_nt",
"__rcp_uniform_float",
"__rcp_varying_float",
"__rdrand_i16",
"__rdrand_i32",
"__rdrand_i64",
"__reduce_add_double",
"__reduce_add_float",
"__reduce_add_int8",
"__reduce_add_int16",
"__reduce_add_int32",
"__reduce_add_int64",
"__reduce_add_uint32",
"__reduce_add_uint64",
"__reduce_equal_double",
"__reduce_equal_float",
"__reduce_equal_int32",
@@ -493,6 +533,7 @@ lSetInternalFunctions(llvm::Module *module) {
"__round_varying_float",
"__rsqrt_uniform_float",
"__rsqrt_varying_float",
"__set_system_isa",
"__sext_uniform_bool",
"__sext_varying_bool",
"__shuffle2_double",
@@ -521,6 +562,8 @@ lSetInternalFunctions(llvm::Module *module) {
"__sqrt_uniform_float",
"__sqrt_varying_double",
"__sqrt_varying_float",
"__stdlib_acosf",
"__stdlib_asinf",
"__stdlib_atan",
"__stdlib_atan2",
"__stdlib_atan2f",
@@ -534,20 +577,34 @@ lSetInternalFunctions(llvm::Module *module) {
"__stdlib_pow",
"__stdlib_powf",
"__stdlib_sin",
"__stdlib_asin",
"__stdlib_sincos",
"__stdlib_sincosf",
"__stdlib_sinf",
"__stdlib_tan",
"__stdlib_tanf",
"__svml_sin",
"__svml_cos",
"__svml_sincos",
"__svml_tan",
"__svml_atan",
"__svml_atan2",
"__svml_exp",
"__svml_log",
"__svml_pow",
"__svml_sind",
"__svml_asind",
"__svml_cosd",
"__svml_acosd",
"__svml_sincosd",
"__svml_tand",
"__svml_atand",
"__svml_atan2d",
"__svml_expd",
"__svml_logd",
"__svml_powd",
"__svml_sinf",
"__svml_asinf",
"__svml_cosf",
"__svml_acosf",
"__svml_sincosf",
"__svml_tanf",
"__svml_atanf",
"__svml_atan2f",
"__svml_expf",
"__svml_logf",
"__svml_powf",
"__undef_uniform",
"__undef_varying",
"__vec4_add_float",
@@ -559,8 +616,10 @@ lSetInternalFunctions(llvm::Module *module) {
int count = sizeof(names) / sizeof(names[0]);
for (int i = 0; i < count; ++i) {
llvm::Function *f = module->getFunction(names[i]);
if (f != NULL && f->empty() == false) {
f->setLinkage(llvm::GlobalValue::InternalLinkage);
g->target->markFuncWithTargetAttr(f);
}
}
}
@@ -594,17 +653,57 @@ AddBitcodeToModule(const unsigned char *bitcode, int length,
// linking together modules with incompatible target triples..
llvm::Triple mTriple(m->module->getTargetTriple());
llvm::Triple bcTriple(bcModule->getTargetTriple());
Debug(SourcePos(), "module triple: %s\nbitcode triple: %s\n",
mTriple.str().c_str(), bcTriple.str().c_str());
#if defined(ISPC_ARM_ENABLED) && !defined(__arm__)
// FIXME: More ugly and dangerous stuff. We really haven't set up
// proper build and runtime infrastructure for ispc to do
// cross-compilation, yet it's at minimum useful to be able to emit
// ARM code from x86 for ispc development. One side-effect is that
// when the build process turns builtins/builtins.c to LLVM bitcode
// for us to link in at runtime, that bitcode has been compiled for
// an IA target, which in turn causes the checks in the following
// code to (appropriately) fail.
//
// In order to be able to have some ability to generate ARM code on
// IA, we'll just skip those tests in that case and allow the
// setTargetTriple() and setDataLayout() calls below to shove in
// the values for an ARM target. This maybe won't cause problems
// in the generated code, since builtins.c doesn't do anything too
// complex w.r.t. struct layouts, etc.
if (g->target->getISA() != Target::NEON32 &&
g->target->getISA() != Target::NEON16 &&
g->target->getISA() != Target::NEON8)
#endif // !__arm__
{
Assert(bcTriple.getArch() == llvm::Triple::UnknownArch ||
mTriple.getArch() == bcTriple.getArch());
Assert(bcTriple.getVendor() == llvm::Triple::UnknownVendor ||
mTriple.getVendor() == bcTriple.getVendor());
// We unconditionally set module DataLayout to library, but we must
// ensure that library and module DataLayouts are compatible.
// If they are not, we should recompile the library for problematic
// architecture and investigate what happened.
// Generally we allow library DataLayout to be subset of module
// DataLayout or library DataLayout to be empty.
if (!VerifyDataLayoutCompatibility(module->getDataLayout(),
bcModule->getDataLayout())) {
Warning(SourcePos(), "Module DataLayout is incompatible with "
"library DataLayout:\n"
"Module DL: %s\n"
"Library DL: %s\n",
module->getDataLayout().c_str(),
bcModule->getDataLayout().c_str());
}
}
bcModule->setTargetTriple(mTriple.str());
bcModule->setDataLayout(module->getDataLayout());
std::string linkError;
if (llvm::Linker::LinkModules(module, bcModule,
llvm::Linker::DestroySource,
&linkError))
Error(SourcePos(), "Error linking stdlib bitcode: %s", linkError.c_str());
lSetInternalFunctions(module);
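// A minimal sketch (an assumption for illustration, not the actual
// VerifyDataLayoutCompatibility() implementation) of the "library DataLayout
// is a subset of the module DataLayout" rule described above: split both
// strings on '-' and require every library component to appear among the
// module's components (using <sstream> and <string>):
//
//   static bool lLayoutIsSubset(const std::string &libDL, const std::string &modDL) {
//       std::stringstream lib(libDL);
//       std::string comp;
//       while (std::getline(lib, comp, '-'))
//           if (("-" + modDL + "-").find("-" + comp + "-") == std::string::npos)
//               return false;  // library component missing from module layout
//       return true;           // an empty library layout is trivially a subset
//   }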
@@ -621,15 +720,37 @@ AddBitcodeToModule(const unsigned char *bitcode, int length,
static void
lDefineConstantInt(const char *name, int val, llvm::Module *module,
SymbolTable *symbolTable) {
Symbol *sym =
new Symbol(name, SourcePos(), AtomicType::UniformInt32->GetAsConstType(),
SC_STATIC);
sym->constValue = new ConstExpr(sym->type, val, SourcePos());
llvm::Type *ltype = LLVMTypes::Int32Type;
llvm::Constant *linit = LLVMInt32(val);
// Use WeakODRLinkage rather than InternalLinkage so that a definition
// survives even if it's not used in the module, so that the symbol is
// there in the debugger.
llvm::GlobalValue::LinkageTypes linkage = g->generateDebuggingSymbols ?
llvm::GlobalValue::WeakODRLinkage : llvm::GlobalValue::InternalLinkage;
sym->storagePtr = new llvm::GlobalVariable(*module, ltype, true, linkage,
linit, name);
symbolTable->AddVariable(sym);
if (m->diBuilder != NULL) {
llvm::DIFile file;
llvm::DIType diType = sym->type->GetDIType(file);
Assert(diType.Verify());
// FIXME? DWARF says that this (and programIndex below) should
// have the DW_AT_artificial attribute. It's not clear if this
// matters for anything though.
llvm::DIGlobalVariable var =
m->diBuilder->createGlobalVariable(name,
file,
0 /* line */,
diType,
true /* static */,
sym->storagePtr);
Assert(var.Verify());
}
}
@@ -637,13 +758,17 @@ lDefineConstantInt(const char *name, int val, llvm::Module *module,
static void
lDefineConstantIntFunc(const char *name, int val, llvm::Module *module,
SymbolTable *symbolTable) {
llvm::SmallVector<const Type *, 8> args;
FunctionType *ft = new FunctionType(AtomicType::UniformInt32, args, SourcePos());
Symbol *sym = new Symbol(name, SourcePos(), ft, SC_STATIC);
llvm::Function *func = module->getFunction(name);
Assert(func != NULL); // it should be declared already...
#if defined(LLVM_3_2)
func->addFnAttr(llvm::Attributes::AlwaysInline);
#else // LLVM 3.1 and 3.3+
func->addFnAttr(llvm::Attribute::AlwaysInline);
#endif
llvm::BasicBlock *bblock = llvm::BasicBlock::Create(*g->ctx, "entry", func, 0);
llvm::ReturnInst::Create(*g->ctx, LLVMInt32(val), bblock);
@@ -655,131 +780,297 @@ lDefineConstantIntFunc(const char *name, int val, llvm::Module *module,
static void
lDefineProgramIndex(llvm::Module *module, SymbolTable *symbolTable) {
Symbol *pidx = new Symbol("programIndex", SourcePos(),
AtomicType::VaryingConstInt32, SC_STATIC);
Symbol *sym =
new Symbol("programIndex", SourcePos(),
AtomicType::VaryingInt32->GetAsConstType(), SC_STATIC);
int pi[ISPC_MAX_NVEC];
for (int i = 0; i < g->target->getVectorWidth(); ++i)
pi[i] = i;
sym->constValue = new ConstExpr(sym->type, pi, SourcePos());
llvm::Type *ltype = LLVMTypes::Int32VectorType;
llvm::Constant *linit = LLVMInt32Vector(pi);
// See comment in lDefineConstantInt() for why WeakODRLinkage is used here
llvm::GlobalValue::LinkageTypes linkage = g->generateDebuggingSymbols ?
llvm::GlobalValue::WeakODRLinkage : llvm::GlobalValue::InternalLinkage;
sym->storagePtr = new llvm::GlobalVariable(*module, ltype, true, linkage,
linit, sym->name.c_str());
symbolTable->AddVariable(sym);
if (m->diBuilder != NULL) {
llvm::DIFile file;
llvm::DIType diType = sym->type->GetDIType(file);
Assert(diType.Verify());
llvm::DIGlobalVariable var =
m->diBuilder->createGlobalVariable(sym->name.c_str(),
file,
0 /* line */,
diType,
false /* static */,
sym->storagePtr);
Assert(var.Verify());
}
}
void
DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *module,
bool includeStdlibISPC) {
bool runtime32 = g->target->is32Bit();
#define EXPORT_MODULE(export_module) \
extern unsigned char export_module[]; \
extern int export_module##_length; \
AddBitcodeToModule(export_module, export_module##_length, \
module, symbolTable);
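// For reference, EXPORT_MODULE(builtins_bitcode_c_32) expands to:
//
//   extern unsigned char builtins_bitcode_c_32[];
//   extern int builtins_bitcode_c_32_length;
//   AddBitcodeToModule(builtins_bitcode_c_32, builtins_bitcode_c_32_length,
//                      module, symbolTable);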
// Add the definitions from the compiled builtins-c.c file
if (runtime32) {
EXPORT_MODULE(builtins_bitcode_c_32);
}
else {
EXPORT_MODULE(builtins_bitcode_c_64);
}
// Next, add the target's custom implementations of the various needed
// builtin functions (e.g. __masked_store_32(), etc).
switch (g->target->getISA()) {
#ifdef ISPC_ARM_ENABLED
case Target::NEON8: {
if (runtime32) {
EXPORT_MODULE(builtins_bitcode_neon_8_32bit);
}
else {
EXPORT_MODULE(builtins_bitcode_neon_8_64bit);
}
break;
}
case Target::NEON16: {
if (runtime32) {
EXPORT_MODULE(builtins_bitcode_neon_16_32bit);
}
else {
EXPORT_MODULE(builtins_bitcode_neon_16_64bit);
}
break;
}
case Target::NEON32: {
if (runtime32) {
EXPORT_MODULE(builtins_bitcode_neon_32_32bit);
}
else {
EXPORT_MODULE(builtins_bitcode_neon_32_64bit);
}
break;
}
#endif
case Target::SSE2: {
switch (g->target->getVectorWidth()) {
case 4:
if (runtime32) {
EXPORT_MODULE(builtins_bitcode_sse2_32bit);
}
else {
EXPORT_MODULE(builtins_bitcode_sse2_64bit);
}
break;
case 8:
if (runtime32) {
EXPORT_MODULE(builtins_bitcode_sse2_x2_32bit);
}
else {
EXPORT_MODULE(builtins_bitcode_sse2_x2_64bit);
}
break;
default:
FATAL("logic error in DefineStdlib");
}
break;
}
case Target::SSE4: {
switch (g->target->getVectorWidth()) {
case 4:
if (runtime32) {
EXPORT_MODULE(builtins_bitcode_sse4_32bit);
}
else {
EXPORT_MODULE(builtins_bitcode_sse4_64bit);
}
break;
case 8:
if (runtime32) {
if (g->target->getMaskBitCount() == 16) {
EXPORT_MODULE(builtins_bitcode_sse4_16_32bit);
}
else {
Assert(g->target->getMaskBitCount() == 32);
EXPORT_MODULE(builtins_bitcode_sse4_x2_32bit);
}
}
else {
if (g->target->getMaskBitCount() == 16) {
EXPORT_MODULE(builtins_bitcode_sse4_16_64bit);
}
else {
Assert(g->target->getMaskBitCount() == 32);
EXPORT_MODULE(builtins_bitcode_sse4_x2_64bit);
}
}
break;
case 16:
Assert(g->target->getMaskBitCount() == 8);
if (runtime32) {
EXPORT_MODULE(builtins_bitcode_sse4_8_32bit);
}
else {
EXPORT_MODULE(builtins_bitcode_sse4_8_64bit);
}
break;
default:
FATAL("logic error in DefineStdlib");
}
break;
}
case Target::AVX: {
switch (g->target->getVectorWidth()) {
case 4:
if (runtime32) {
EXPORT_MODULE(builtins_bitcode_avx1_i64x4_32bit);
}
else {
EXPORT_MODULE(builtins_bitcode_avx1_i64x4_64bit);
}
break;
case 8:
if (runtime32) {
EXPORT_MODULE(builtins_bitcode_avx1_32bit);
}
else {
EXPORT_MODULE(builtins_bitcode_avx1_64bit);
}
break;
case 16:
if (runtime32) {
EXPORT_MODULE(builtins_bitcode_avx1_x2_32bit);
}
else {
EXPORT_MODULE(builtins_bitcode_avx1_x2_64bit);
}
break;
default:
FATAL("logic error in DefineStdlib");
}
break;
}
case Target::AVX11: {
switch (g->target->getVectorWidth()) {
case 8:
if (runtime32) {
EXPORT_MODULE(builtins_bitcode_avx11_32bit);
}
else {
EXPORT_MODULE(builtins_bitcode_avx11_64bit);
}
break;
case 16:
if (runtime32) {
EXPORT_MODULE(builtins_bitcode_avx11_x2_32bit);
}
else {
EXPORT_MODULE(builtins_bitcode_avx11_x2_64bit);
}
break;
default:
FATAL("logic error in DefineStdlib");
}
break;
}
case Target::AVX2: {
switch (g->target->getVectorWidth()) {
case 8:
if (runtime32) {
EXPORT_MODULE(builtins_bitcode_avx2_32bit);
}
else {
EXPORT_MODULE(builtins_bitcode_avx2_64bit);
}
break;
case 16:
if (runtime32) {
EXPORT_MODULE(builtins_bitcode_avx2_x2_32bit);
}
else {
EXPORT_MODULE(builtins_bitcode_avx2_x2_64bit);
}
break;
default:
FATAL("logic error in DefineStdlib");
}
break;
}
case Target::GENERIC: {
switch (g->target->getVectorWidth()) {
case 4:
if (runtime32) {
EXPORT_MODULE(builtins_bitcode_generic_4_32bit);
}
else {
EXPORT_MODULE(builtins_bitcode_generic_4_64bit);
}
break;
case 8:
if (runtime32) {
EXPORT_MODULE(builtins_bitcode_generic_8_32bit);
}
else {
EXPORT_MODULE(builtins_bitcode_generic_8_64bit);
}
break;
case 16:
if (runtime32) {
EXPORT_MODULE(builtins_bitcode_generic_16_32bit);
}
else {
EXPORT_MODULE(builtins_bitcode_generic_16_64bit);
}
break;
case 32:
if (runtime32) {
EXPORT_MODULE(builtins_bitcode_generic_32_32bit);
}
else {
EXPORT_MODULE(builtins_bitcode_generic_32_64bit);
}
break;
case 64:
if (runtime32) {
EXPORT_MODULE(builtins_bitcode_generic_64_32bit);
}
else {
EXPORT_MODULE(builtins_bitcode_generic_64_64bit);
}
break;
case 1:
if (runtime32) {
EXPORT_MODULE(builtins_bitcode_generic_1_32bit);
}
else {
EXPORT_MODULE(builtins_bitcode_generic_1_64bit);
}
break;
default:
FATAL("logic error in DefineStdlib");
}
break;
}
default:
FATAL("logic error");
}
// define the 'programCount' builtin variable
lDefineConstantInt("programCount", g->target.vectorWidth, module, symbolTable);
lDefineConstantInt("programCount", g->target->getVectorWidth(), module, symbolTable);
// define the 'programIndex' builtin
lDefineProgramIndex(module, symbolTable);
@@ -789,28 +1080,58 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
lDefineConstantInt("__math_lib", (int)g->mathLib, module, symbolTable);
lDefineConstantInt("__math_lib_ispc", (int)Globals::Math_ISPC, module,
symbolTable);
lDefineConstantInt("__math_lib_ispc_fast", (int)Globals::Math_ISPCFast,
lDefineConstantInt("__math_lib_ispc_fast", (int)Globals::Math_ISPCFast,
module, symbolTable);
lDefineConstantInt("__math_lib_svml", (int)Globals::Math_SVML, module,
symbolTable);
lDefineConstantInt("__math_lib_system", (int)Globals::Math_System, module,
symbolTable);
lDefineConstantIntFunc("__fast_masked_vload", (int)g->opt.fastMaskedVload, module,
symbolTable);
lDefineConstantIntFunc("__fast_masked_vload", (int)g->opt.fastMaskedVload,
module, symbolTable);
lDefineConstantInt("__have_native_half", g->target->hasHalf(), module,
symbolTable);
lDefineConstantInt("__have_native_rand", g->target->hasRand(), module,
symbolTable);
lDefineConstantInt("__have_native_transcendentals", g->target->hasTranscendentals(),
module, symbolTable);
if (g->forceAlignment != -1) {
llvm::GlobalVariable *alignment = module->getGlobalVariable("memory_alignment", true);
alignment->setInitializer(LLVMInt32(g->forceAlignment));
}
if (includeStdlibISPC) {
// If the user wants the standard library to be included, parse the
// serialized version of the stdlib.ispc file to get its
// definitions added.
extern char stdlib_mask1_code[], stdlib_mask8_code[];
extern char stdlib_mask16_code[], stdlib_mask32_code[], stdlib_mask64_code[];
if (g->target->getISA() == Target::GENERIC &&
g->target->getVectorWidth() == 1) { // 1 wide uses 32 stdlib
yy_scan_string(stdlib_mask32_code);
}
else {
switch (g->target->getMaskBitCount()) {
case 1:
yy_scan_string(stdlib_mask1_code);
break;
case 8:
yy_scan_string(stdlib_mask8_code);
break;
case 16:
yy_scan_string(stdlib_mask16_code);
break;
case 32:
yy_scan_string(stdlib_mask32_code);
break;
case 64:
yy_scan_string(stdlib_mask64_code);
break;
default:
FATAL("Unhandled mask bit size for stdlib.ispc");
}
}
yyparse();
}
}


@@ -28,11 +28,11 @@
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/** @file builtins.h
@brief Declarations of functions related to builtins and the
standard library
*/


@@ -1,5 +1,5 @@
/*
Copyright (c) 2010-2012, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -28,7 +28,7 @@
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/** @file builtins-c.c
@@ -50,6 +50,16 @@
available to ispc programs at compile time automatically.
*/
#ifdef _MSC_VER
// We do want old school sprintf and don't want secure Microsoft extensions.
// And we also don't want warnings about it, so the define.
#define _CRT_SECURE_NO_WARNINGS
#else
// Some versions of glibc have a "fortification" feature, which expands sprintf
// to __builtin___sprintf_chk(..., __builtin_object_size(...), ...).
// We don't want this kind of expansion, as we don't support these intrinsics.
#define _FORTIFY_SOURCE 0
#endif
#ifndef _MSC_VER
#include <unistd.h>
@@ -59,22 +69,39 @@
#include <stdio.h>
#include <stdlib.h>
#include <stdarg.h>
#include <string.h>
typedef int Bool;
#define PRINT_BUF_SIZE 4096
#define APPEND(str) \
do { \
int offset = bufp - &printString[0]; \
*bufp = '\0'; \
strncat(bufp, str, PRINT_BUF_SIZE-offset); \
bufp += strlen(str); \
if (bufp >= &printString[PRINT_BUF_SIZE]) \
goto done; \
} while (0) /* eat semicolon */
#define PRINT_SCALAR(fmt, type) \
sprintf(tmpBuf, fmt, *((type *)ptr)); \
APPEND(tmpBuf); \
break
#define PRINT_VECTOR(fmt, type) \
*bufp++ = '['; \
if (bufp == &printString[PRINT_BUF_SIZE]) break; \
for (int i = 0; i < width; ++i) { \
/* only print the value if the current lane is executing */ \
if (mask & (1ull<<i)) \
sprintf(tmpBuf, fmt, ((type *)ptr)[i]); \
else \
sprintf(tmpBuf, "((" fmt "))", ((type *)ptr)[i]); \
APPEND(tmpBuf); \
*bufp++ = (i != width-1 ? ',' : ']'); \
} \
break
@@ -84,21 +111,23 @@ typedef int Bool;
@param format Print format string
@param types Encoded types of the values being printed.
(See lEncodeType()).
@param width Vector width of the compilation target
@param mask Current lane mask when the print statement is called
@param args Array of pointers to the values to be printed
*/
void __do_print(const char *format, const char *types, int width, uint64_t mask,
void **args) {
if (mask == 0)
return;
char printString[PRINT_BUF_SIZE+1]; // +1 for trailing NUL
char *bufp = &printString[0];
char tmpBuf[256];
int argCount = 0;
while (*format && bufp < &printString[PRINT_BUF_SIZE]) {
// Format strings are just single percent signs.
if (*format != '%') {
*bufp++ = *format;
}
else {
if (*types) {
void *ptr = args[argCount++];
@@ -107,17 +136,22 @@ void __do_print(const char *format, const char *types, int width, int mask,
// printf() formatting string.
switch (*types) {
case 'b': {
printf("%s", *((Bool *)ptr) ? "true" : "false");
sprintf(tmpBuf, "%s", *((Bool *)ptr) ? "true" : "false");
APPEND(tmpBuf);
break;
}
case 'B': {
*bufp++ = '[';
if (bufp == &printString[PRINT_BUF_SIZE])
break;
for (int i = 0; i < width; ++i) {
if (mask & (1ull << i)) {
sprintf(tmpBuf, "%s", ((Bool *)ptr)[i] ? "true" : "false");
APPEND(tmpBuf);
}
else
printf("_________");
putchar(i != width-1 ? ',' : ']');
APPEND("_________");
*bufp++ = (i != width-1) ? ',' : ']';
}
break;
}
@@ -136,14 +170,18 @@ void __do_print(const char *format, const char *types, int width, int mask,
case 'p': PRINT_SCALAR("%p", void *);
case 'P': PRINT_VECTOR("%p", void *);
default:
printf("UNKNOWN TYPE ");
putchar(*types);
APPEND("UNKNOWN TYPE ");
*bufp++ = *types;
}
++types;
}
}
++format;
}
done:
*bufp = '\0';
fputs(printString, stdout);
fflush(stdout);
}
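/* Illustrative call (hypothetical values; assumes 'I' is lEncodeType()'s
   encoding for a varying int32): on an 8-wide target with only lanes 0 and 1
   active,
       int v[8] = { 10, 20, 30, 40, 50, 60, 70, 80 };
       void *args[1] = { v };
       __do_print("v = %\n", "I", 8, 0x3ull, args);
   would buffer and print "v = [10,20,((30)),((40)),((50)),((60)),((70)),((80))]",
   wrapping the values of inactive lanes in ((...)). */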


@@ -1,4 +1,4 @@
;; Copyright (c) 2011-2013, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
@@ -41,30 +41,63 @@
@__system_best_isa = internal global i32 -1
;; The below is the result of running "clang -O2 -emit-llvm -c -o -" on the
;; following code... Specifically, __get_system_isa should return a value
;; corresponding to one of the Target::ISA enumerant values that gives the
;; most capable ISA that the current system can run.
;;
;; Note: clang from LLVM 3.1 should be used if this is updated, for maximum
;; backwards compatibility for anyone building ispc with LLVM 3.1
;;
;; #include <stdint.h>
;; #include <stdlib.h>
;;
;; static void __cpuid(int info[4], int infoType) {
;; __asm__ __volatile__ ("cpuid"
;; : "=a" (info[0]), "=b" (info[1]), "=c" (info[2]), "=d" (info[3])
;; : "0" (infoType));
;; }
;;
;; // Save %ebx in case it's the PIC register.
;; static void __cpuid_count(int info[4], int level, int count) {
;; __asm__ __volatile__ ("xchg{l}\t{%%}ebx, %1\n\t"
;; "cpuid\n\t"
;; "xchg{l}\t{%%}ebx, %1\n\t"
;; : "=a" (info[0]), "=r" (info[1]), "=c" (info[2]), "=d" (info[3])
;; : "0" (level), "2" (count));
;; }
;;
;; static int __os_has_avx_support() {
;; // Check xgetbv; this uses a .byte sequence instead of the instruction
;; // directly because older assemblers do not include support for xgetbv and
;; // there is no easy way to conditionally compile based on the assembler used.
;; int rEAX, rEDX;
;; __asm__ __volatile__ (".byte 0x0f, 0x01, 0xd0" : "=a" (rEAX), "=d" (rEDX) : "c" (0));
;; return (rEAX & 6) == 6;
;; }
;;
;; int32_t __get_system_isa() {
;; int info[4];
;; __cpuid(info, 1);
;;
;; // NOTE: the values returned below must be the same as the
;; // corresponding enumerant values in Target::ISA.
;; if ((info[2] & (1 << 28)) != 0 &&
;; __os_has_avx_support()) {
;; if ((info[2] & (1 << 29)) != 0 && // F16C
;; (info[2] & (1 << 30)) != 0) { // RDRAND
;; // So far, so good. AVX2?
;; // Call cpuid with eax=7, ecx=0
;; int info2[4];
;; __cpuid_count(info2, 7, 0);
;; if ((info2[1] & (1 << 5)) != 0)
;; return 4;
;; else
;; return 3;
;; }
;; // Regular AVX
;; return 2;
;; }
;; else if ((info[2] & (1 << 19)) != 0)
;; return 1; // SSE4
;; else if ((info[3] & (1 << 26)) != 0)
@@ -73,36 +106,57 @@ declare void @abort() noreturn
;; abort();
;; }
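;; To summarize the return values above (they must stay in sync with the
;; Target::ISA enumerants): 4 = AVX2, 3 = AVX1.1 (AVX plus F16C and RDRAND),
;; 2 = AVX, 1 = SSE4, 0 = SSE2; anything less capable aborts.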
define i32 @__get_system_isa() nounwind uwtable {
entry:
%0 = tail call { i32, i32, i32, i32 } asm sideeffect "cpuid", "={ax},={bx},={cx},={dx},0,~{dirflag},~{fpsr},~{flags}"(i32 1) nounwind
%asmresult5.i = extractvalue { i32, i32, i32, i32 } %0, 2
%asmresult6.i = extractvalue { i32, i32, i32, i32 } %0, 3
%and = and i32 %asmresult5.i, 268435456
%cmp = icmp eq i32 %and, 0
br i1 %cmp, label %if.else14, label %land.lhs.true
land.lhs.true: ; preds = %entry
%1 = tail call { i32, i32 } asm sideeffect ".byte 0x0f, 0x01, 0xd0", "={ax},={dx},{cx},~{dirflag},~{fpsr},~{flags}"(i32 0) nounwind
%asmresult.i25 = extractvalue { i32, i32 } %1, 0
%and.i = and i32 %asmresult.i25, 6
%cmp.i = icmp eq i32 %and.i, 6
br i1 %cmp.i, label %if.then, label %if.else14
if.then: ; preds = %land.lhs.true
%2 = and i32 %asmresult5.i, 1610612736
%3 = icmp eq i32 %2, 1610612736
br i1 %3, label %if.then8, label %return
if.then8: ; preds = %if.then
%4 = tail call { i32, i32, i32, i32 } asm sideeffect "xchg$(l$)\09$(%$)ebx, $1\0A\09cpuid\0A\09xchg$(l$)\09$(%$)ebx, $1\0A\09", "={ax},=r,={cx},={dx},0,2,~{dirflag},~{fpsr},~{flags}"(i32 7, i32 0) nounwind
%asmresult4.i30 = extractvalue { i32, i32, i32, i32 } %4, 1
%and11 = lshr i32 %asmresult4.i30, 5
%5 = and i32 %and11, 1
%6 = add i32 %5, 3
br label %return
if.else14: ; preds = %land.lhs.true, %entry
%and16 = and i32 %asmresult5.i, 524288
%cmp17 = icmp eq i32 %and16, 0
br i1 %cmp17, label %if.else19, label %return
if.else19: ; preds = %if.else14
%and21 = and i32 %asmresult6.i, 67108864
%cmp22 = icmp eq i32 %and21, 0
br i1 %cmp22, label %if.else24, label %return
if.else24: ; preds = %if.else19
tail call void @abort() noreturn nounwind
unreachable
return: ; preds = %if.else19, %if.else14, %if.then8, %if.then
%retval.0 = phi i32 [ %6, %if.then8 ], [ 2, %if.then ], [ 1, %if.else14 ], [ 0, %if.else19 ]
ret i32 %retval.0
}
declare void @abort() noreturn nounwind
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; This function is called by each of the dispatch functions we generate;
;; it sets @__system_best_isa if it is unset.

builtins/svml.m4 Normal file

@@ -0,0 +1,217 @@
;; copyright stub :)
;; Copyright (c) 2013, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;; * Redistributions of source code must retain the above copyright
;; notice, this list of conditions and the following disclaimer.
;;
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;;
;; * Neither the name of Intel Corporation nor the names of its
;; contributors may be used to endorse or promote products derived from
;; this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;; svml macro
;; svml_stubs : stubs for svml calls
;; $1 - type ("float" or "double")
;; $2 - svml internal function suffix ("f" for float, "d" for double)
;; $3 - vector width
define(`svml_stubs',`
declare <$3 x $1> @__svml_sin$2(<$3 x $1>) nounwind readnone alwaysinline
declare <$3 x $1> @__svml_asin$2(<$3 x $1>) nounwind readnone alwaysinline
declare <$3 x $1> @__svml_cos$2(<$3 x $1>) nounwind readnone alwaysinline
declare void @__svml_sincos$2(<$3 x $1>, <$3 x $1> *, <$3 x $1> *) nounwind readnone alwaysinline
declare <$3 x $1> @__svml_tan$2(<$3 x $1>) nounwind readnone alwaysinline
declare <$3 x $1> @__svml_atan$2(<$3 x $1>) nounwind readnone alwaysinline
declare <$3 x $1> @__svml_atan2$2(<$3 x $1>, <$3 x $1>) nounwind readnone alwaysinline
declare <$3 x $1> @__svml_exp$2(<$3 x $1>) nounwind readnone alwaysinline
declare <$3 x $1> @__svml_log$2(<$3 x $1>) nounwind readnone alwaysinline
declare <$3 x $1> @__svml_pow$2(<$3 x $1>, <$3 x $1>) nounwind readnone alwaysinline
')
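;; For example, svml_stubs(float,f,8) expands to 8-wide float stubs such as:
;;   declare <8 x float> @__svml_sinf(<8 x float>) nounwind readnone alwaysinline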
;; svml_declare : declaration of __svml_* intrinsics
;; $1 - type ("float" or "double")
;; $2 - __svml_* intrinsic function suffix
;; float: "f4"(sse) "f8"(avx) "f16"(avx512)
;; double: "2"(sse) "4"(avx) "8"(avx512)
;; $3 - vector width
define(`svml_declare',`
declare <$3 x $1> @__svml_sin$2(<$3 x $1>) nounwind readnone
declare <$3 x $1> @__svml_asin$2(<$3 x $1>) nounwind readnone
declare <$3 x $1> @__svml_cos$2(<$3 x $1>) nounwind readnone
declare <$3 x $1> @__svml_sincos$2(<$3 x $1> *, <$3 x $1>) nounwind readnone
declare <$3 x $1> @__svml_tan$2(<$3 x $1>) nounwind readnone
declare <$3 x $1> @__svml_atan$2(<$3 x $1>) nounwind readnone
declare <$3 x $1> @__svml_atan2$2(<$3 x $1>, <$3 x $1>) nounwind readnone
declare <$3 x $1> @__svml_exp$2(<$3 x $1>) nounwind readnone
declare <$3 x $1> @__svml_log$2(<$3 x $1>) nounwind readnone
declare <$3 x $1> @__svml_pow$2(<$3 x $1>, <$3 x $1>) nounwind readnone
');
;; svml_define : definition of __svml_* internal functions
;; $1 - type ("float" or "double")
;; $2 - __svml_* intrinsic function suffix
;; float: "f4"(sse) "f8"(avx) "f16"(avx512)
;; double: "2"(sse) "4"(avx) "8"(avx512)
;; $3 - vector width
;; $4 - svml internal function suffix ("f" for float, "d" for double)
define(`svml_define',`
define <$3 x $1> @__svml_sin$4(<$3 x $1>) nounwind readnone alwaysinline {
%ret = call <$3 x $1> @__svml_sin$2(<$3 x $1> %0)
ret <$3 x $1> %ret
}
define <$3 x $1> @__svml_asin$4(<$3 x $1>) nounwind readnone alwaysinline {
%ret = call <$3 x $1> @__svml_asin$2(<$3 x $1> %0)
ret <$3 x $1> %ret
}
define <$3 x $1> @__svml_cos$4(<$3 x $1>) nounwind readnone alwaysinline {
%ret = call <$3 x $1> @__svml_cos$2(<$3 x $1> %0)
ret <$3 x $1> %ret
}
define void @__svml_sincos$4(<$3 x $1>, <$3 x $1> *, <$3 x $1> *) nounwind readnone alwaysinline {
%s = call <$3 x $1> @__svml_sincos$2(<$3 x $1> * %2, <$3 x $1> %0)
store <$3 x $1> %s, <$3 x $1> * %1
ret void
}
define <$3 x $1> @__svml_tan$4(<$3 x $1>) nounwind readnone alwaysinline {
%ret = call <$3 x $1> @__svml_tan$2(<$3 x $1> %0)
ret <$3 x $1> %ret
}
define <$3 x $1> @__svml_atan$4(<$3 x $1>) nounwind readnone alwaysinline {
%ret = call <$3 x $1> @__svml_atan$2(<$3 x $1> %0)
ret <$3 x $1> %ret
}
define <$3 x $1> @__svml_atan2$4(<$3 x $1>, <$3 x $1>) nounwind readnone alwaysinline {
%ret = call <$3 x $1> @__svml_atan2$2(<$3 x $1> %0, <$3 x $1> %1)
ret <$3 x $1> %ret
}
define <$3 x $1> @__svml_exp$4(<$3 x $1>) nounwind readnone alwaysinline {
%ret = call <$3 x $1> @__svml_exp$2(<$3 x $1> %0)
ret <$3 x $1> %ret
}
define <$3 x $1> @__svml_log$4(<$3 x $1>) nounwind readnone alwaysinline {
%ret = call <$3 x $1> @__svml_log$2(<$3 x $1> %0)
ret <$3 x $1> %ret
}
define <$3 x $1> @__svml_pow$4(<$3 x $1>, <$3 x $1>) nounwind readnone alwaysinline {
%ret = call <$3 x $1> @__svml_pow$2(<$3 x $1> %0, <$3 x $1> %1)
ret <$3 x $1> %ret
}
')
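;; For example, svml_define(float,f8,8,f) (as used by the 8-wide AVX target
;; below) emits wrappers like:
;;   define <8 x float> @__svml_sinf(<8 x float>) nounwind readnone alwaysinline {
;;     %ret = call <8 x float> @__svml_sinf8(<8 x float> %0)
;;     ret <8 x float> %ret
;;   }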
;; svml_define_x : definition of __svml_* internal functions operating on an extended width
;; $1 - type ("float" or "double")
;; $2 - __svml_* intrinsic function suffix
;; float: "f4"(sse) "f8"(avx) "f16"(avx512)
;; double: "2"(sse) "4"(avx) "8"(avx512)
;; $3 - vector width
;; $4 - svml internal function suffix ("f" for float, "d" for double)
;; $5 - extended width, must be at least twice the native vector width
;; contingent on the existence of the unary$3to$5 and binary$3to$5 macros
;; *todo*: in sincos, use the __svml_sincos[f][2,4,8,16] call, e.g.
;;define void @__svml_sincosf(<8 x float>, <8 x float> *,
;; <8 x float> *) nounwind readnone alwaysinline {
;; ; call svml_sincosf4 two times with the two 4-wide sub-vectors
;; %a = shufflevector <8 x float> %0, <8 x float> undef,
;; <4 x i32> <i32 0, i32 1, i32 2, i32 3>
;; %b = shufflevector <8 x float> %0, <8 x float> undef,
;; <4 x i32> <i32 4, i32 5, i32 6, i32 7>
;;
;; %cospa = alloca <4 x float>
;; %sa = call <4 x float> @__svml_sincosf4(<4 x float> * %cospa, <4 x float> %a)
;;
;; %cospb = alloca <4 x float>
;; %sb = call <4 x float> @__svml_sincosf4(<4 x float> * %cospb, <4 x float> %b)
;;
;; %sin = shufflevector <4 x float> %sa, <4 x float> %sb,
;; <8 x i32> <i32 0, i32 1, i32 2, i32 3,
;; i32 4, i32 5, i32 6, i32 7>
;; store <8 x float> %sin, <8 x float> * %1
;;
;; %cosa = load <4 x float> * %cospa
;; %cosb = load <4 x float> * %cospb
;; %cos = shufflevector <4 x float> %cosa, <4 x float> %cosb,
;; <8 x i32> <i32 0, i32 1, i32 2, i32 3,
;; i32 4, i32 5, i32 6, i32 7>
;; store <8 x float> %cos, <8 x float> * %2
;;
;; ret void
;;}
define(`svml_define_x',`
define <$5 x $1> @__svml_sin$4(<$5 x $1>) nounwind readnone alwaysinline {
unary$3to$5(ret, $1, @__svml_sin$2, %0)
ret <$5 x $1> %ret
}
define <$5 x $1> @__svml_asin$4(<$5 x $1>) nounwind readnone alwaysinline {
unary$3to$5(ret, $1, @__svml_asin$2, %0)
ret <$5 x $1> %ret
}
define <$5 x $1> @__svml_cos$4(<$5 x $1>) nounwind readnone alwaysinline {
unary$3to$5(ret, $1, @__svml_cos$2, %0)
ret <$5 x $1> %ret
}
define void @__svml_sincos$4(<$5 x $1>,<$5 x $1>*,<$5 x $1>*) nounwind readnone alwaysinline
{
%s = call <$5 x $1> @__svml_sin$4(<$5 x $1> %0)
%c = call <$5 x $1> @__svml_cos$4(<$5 x $1> %0)
store <$5 x $1> %s, <$5 x $1> * %1
store <$5 x $1> %c, <$5 x $1> * %2
ret void
}
define <$5 x $1> @__svml_tan$4(<$5 x $1>) nounwind readnone alwaysinline {
unary$3to$5(ret, $1, @__svml_tan$2, %0)
ret <$5 x $1> %ret
}
define <$5 x $1> @__svml_atan$4(<$5 x $1>) nounwind readnone alwaysinline {
unary$3to$5(ret, $1, @__svml_atan$2, %0)
ret <$5 x $1> %ret
}
define <$5 x $1> @__svml_atan2$4(<$5 x $1>,<$5 x $1>) nounwind readnone alwaysinline {
binary$3to$5(ret, $1, @__svml_atan2$2, %0, %1)
ret <$5 x $1> %ret
}
define <$5 x $1> @__svml_exp$4(<$5 x $1>) nounwind readnone alwaysinline {
unary$3to$5(ret, $1, @__svml_exp$2, %0)
ret <$5 x $1> %ret
}
define <$5 x $1> @__svml_log$4(<$5 x $1>) nounwind readnone alwaysinline {
unary$3to$5(ret, $1, @__svml_log$2, %0)
ret <$5 x $1> %ret
}
define <$5 x $1> @__svml_pow$4(<$5 x $1>,<$5 x $1>) nounwind readnone alwaysinline {
binary$3to$5(ret, $1, @__svml_pow$2, %0, %1)
ret <$5 x $1> %ret
}
')
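;; For example, svml_define_x(float,f8,8,f,16) (as used by the 16-wide AVX
;; target below) defines @__svml_sinf on <16 x float> by splitting the vector
;; and calling the 8-wide intrinsic @__svml_sinf8 twice via unary8to16.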


@@ -254,10 +254,10 @@ define i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision sqrt
declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
define double @__sqrt_uniform_double(double) nounwind alwaysinline {
sse_unary_scalar(ret, 2, double, @llvm.x86.sse2.sqrt.sd, %0)
ret double %ret
}
@@ -277,3 +277,9 @@ define double @__max_uniform_double(double, double) nounwind readnone alwaysinli
sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1)
ret double %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int8/int16 builtins
define_avgs()


@@ -1,4 +1,4 @@
;; Copyright (c) 2010-2012, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
@@ -137,19 +137,14 @@ define <16 x float> @__sqrt_varying_float(<16 x float>) nounwind readonly always
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; svml
include(`svml.m4')
;; single precision
svml_declare(float,f8,8)
svml_define_x(float,f8,8,f,16)
;; double precision
svml_declare(double,4,4)
svml_define_x(double,4,4,d,16)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float min/max
@@ -158,51 +153,24 @@ declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind
declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind readnone
define <16 x float> @__max_varying_float(<16 x float>,
<16 x float>) nounwind readonly alwaysinline {
binary8to16(call, float, @llvm.x86.avx.max.ps.256, %0, %1)
ret <16 x float> %call
}
define <16 x float> @__min_varying_float(<16 x float>,
<16 x float>) nounwind readonly alwaysinline {
binary8to16(call, float, @llvm.x86.avx.min.ps.256, %0, %1)
ret <16 x float> %call
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max
define <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary4to16(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
ret <16 x i32> %ret
}
define <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary4to16(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
ret <16 x i32> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unsigned int min/max
define <16 x i32> @__min_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary4to16(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
ret <16 x i32> %ret
}
define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary4to16(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
ret <16 x i32> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; horizontal ops
declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone
define i64 @__movmsk(<16 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <16 x i32> %0 to <16 x float>
%mask0 = shufflevector <16 x float> %floatmask, <16 x float> undef,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -213,9 +181,57 @@ define i32 @__movmsk(<16 x i32>) nounwind readnone alwaysinline {
%v1shift = shl i32 %v1, 8
%v = or i32 %v1shift, %v0
%v64 = zext i32 %v to i64
ret i64 %v64
}
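;; e.g., with only lanes 0 and 9 active, %v0 = 0x001 and %v1 = 0x002, so the
;; combined 16-bit mask is (0x002 << 8) | 0x001 = 0x201.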
define i1 @__any(<16 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <16 x i32> %0 to <16 x float>
%mask0 = shufflevector <16 x float> %floatmask, <16 x float> undef,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%v0 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %mask0) nounwind readnone
%mask1 = shufflevector <16 x float> %floatmask, <16 x float> undef,
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%v1 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %mask1) nounwind readnone
%v1shift = shl i32 %v1, 8
%v = or i32 %v1shift, %v0
%cmp = icmp ne i32 %v, 0
ret i1 %cmp
}
define i1 @__all(<16 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <16 x i32> %0 to <16 x float>
%mask0 = shufflevector <16 x float> %floatmask, <16 x float> undef,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%v0 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %mask0) nounwind readnone
%mask1 = shufflevector <16 x float> %floatmask, <16 x float> undef,
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%v1 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %mask1) nounwind readnone
%v1shift = shl i32 %v1, 8
%v = or i32 %v1shift, %v0
%cmp = icmp eq i32 %v, 65535
ret i1 %cmp
}
define i1 @__none(<16 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <16 x i32> %0 to <16 x float>
%mask0 = shufflevector <16 x float> %floatmask, <16 x float> undef,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%v0 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %mask0) nounwind readnone
%mask1 = shufflevector <16 x float> %floatmask, <16 x float> undef,
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%v1 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %mask1) nounwind readnone
%v1shift = shl i32 %v1, 8
%v = or i32 %v1shift, %v0
%cmp = icmp eq i32 %v, 0
ret i1 %cmp
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal float ops
@@ -250,8 +266,35 @@ reduce_equal(16)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal int32 ops
declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone
define i16 @__reduce_add_int8(<16 x i8>) nounwind readnone alwaysinline {
%rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %0,
<16 x i8> zeroinitializer)
%r0 = extractelement <2 x i64> %rv, i32 0
%r1 = extractelement <2 x i64> %rv, i32 1
%r = add i64 %r0, %r1
%r16 = trunc i64 %r to i16
ret i16 %r16
}
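;; Note: psad.bw against zero yields, in each 64-bit half, the sum of that
;; half's eight unsigned byte lanes, so adding the two extracted halves gives
;; the full horizontal byte sum (e.g. sixteen 1s reduce to 8 + 8 = 16).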
define internal <16 x i16> @__add_varying_i16(<16 x i16>,
<16 x i16>) nounwind readnone alwaysinline {
%r = add <16 x i16> %0, %1
ret <16 x i16> %r
}
define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline {
%r = add i16 %0, %1
ret i16 %r
}
define i16 @__reduce_add_int16(<16 x i16>) nounwind readnone alwaysinline {
reduce16(i16, @__add_varying_i16, @__add_uniform_i16)
}
define <16 x i32> @__add_varying_int32(<16 x i32>,
<16 x i32>) nounwind readnone alwaysinline {
%s = add <16 x i32> %0, %1
ret <16 x i32> %s
}
@@ -279,11 +322,6 @@ define i32 @__reduce_max_int32(<16 x i32>) nounwind readnone alwaysinline {
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; horizontal uint32 ops
define i32 @__reduce_min_uint32(<16 x i32>) nounwind readnone alwaysinline {
reduce16(i32, @__min_varying_uint32, @__min_uniform_uint32)
}
@@ -361,11 +399,6 @@ define i64 @__reduce_max_int64(<16 x i64>) nounwind readnone alwaysinline {
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; horizontal uint64 ops
define i64 @__reduce_min_uint64(<16 x i64>) nounwind readnone alwaysinline {
reduce16(i64, @__min_varying_uint64, @__min_uniform_uint64)
}
@@ -379,19 +412,14 @@ define i64 @__reduce_max_uint64(<16 x i64>) nounwind readnone alwaysinline {
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts
; no masked load instruction for i8 and i16 types??
masked_load(i8, 1)
masked_load(i16, 2)
declare <8 x float> @llvm.x86.avx.maskload.ps.256(i8 *, <8 x float> %mask)
declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8 *, <4 x double> %mask)
define <16 x i32> @__masked_load_i32(i8 *, <16 x i32> %mask) nounwind alwaysinline {
%floatmask = bitcast <16 x i32> %mask to <16 x float>
%mask0 = shufflevector <16 x float> %floatmask, <16 x float> undef,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -409,7 +437,7 @@ define <16 x i32> @__masked_load_32(i8 *, <16 x i32> %mask) nounwind alwaysinlin
}
define <16 x i64> @__masked_load_i64(i8 *, <16 x i32> %mask) nounwind alwaysinline {
; double up masks, bitcast to doubles
%mask0 = shufflevector <16 x i32> %mask, <16 x i32> undef,
<8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
@@ -443,6 +471,7 @@ define <16 x i64> @__masked_load_64(i8 *, <16 x i32> %mask) nounwind alwaysinlin
ret <16 x i64> %val
}
masked_load_float_double()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store
@@ -450,15 +479,15 @@ define <16 x i64> @__masked_load_64(i8 *, <16 x i32> %mask) nounwind alwaysinlin
; FIXME: there is no AVX instruction for these, but we could be clever
; by packing the bits down and setting the last 3/4 or half, respectively,
; of the mask to zero... Not sure if this would be a win in the end
gen_masked_store(i8)
gen_masked_store(i16)
; note that mask is the 2nd parameter, not the 3rd one!!
declare void @llvm.x86.avx.maskstore.ps.256(i8 *, <8 x float>, <8 x float>)
declare void @llvm.x86.avx.maskstore.pd.256(i8 *, <4 x double>, <4 x double>)
define void @__masked_store_i32(<16 x i32>* nocapture, <16 x i32>,
<16 x i32>) nounwind alwaysinline {
%ptr = bitcast <16 x i32> * %0 to i8 *
%val = bitcast <16 x i32> %1 to <16 x float>
%mask = bitcast <16 x i32> %2 to <16 x float>
@@ -480,8 +509,8 @@ define void @__masked_store_32(<16 x i32>* nocapture, <16 x i32>,
ret void
}
define void @__masked_store_i64(<16 x i64>* nocapture, <16 x i64>,
<16 x i32> %mask) nounwind alwaysinline {
%ptr = bitcast <16 x i64> * %0 to i8 *
%val = bitcast <16 x i64> %1 to <16 x double>
@@ -519,14 +548,15 @@ define void @__masked_store_64(<16 x i64>* nocapture, <16 x i64>,
ret void
}
masked_store_float_double()
masked_store_blend_8_16_by_16()
declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>,
<8 x float>) nounwind readnone
define void @__masked_store_blend_i32(<16 x i32>* nocapture, <16 x i32>,
<16 x i32>) nounwind alwaysinline {
%maskAsFloat = bitcast <16 x i32> %2 to <16 x float>
%oldValue = load <16 x i32>* %0, align 4
%oldAsFloat = bitcast <16 x i32> %oldValue to <16 x float>
@@ -563,8 +593,8 @@ define void @__masked_store_blend_32(<16 x i32>* nocapture, <16 x i32>,
declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>,
<4 x double>) nounwind readnone
define void @__masked_store_blend_i64(<16 x i64>* nocapture %ptr, <16 x i64> %newi64,
<16 x i32> %mask) nounwind alwaysinline {
%oldValue = load <16 x i64>* %ptr, align 8
%old = bitcast <16 x i64> %oldValue to <16 x double>
%old0d = shufflevector <16 x double> %old, <16 x double> undef,
@@ -622,17 +652,14 @@ define void @__masked_store_blend_64(<16 x i64>* nocapture %ptr, <16 x i64> %new
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; scatter
gen_scatter(i8)
gen_scatter(i16)
gen_scatter(i32)
gen_scatter(float)
gen_scatter(i64)
gen_scatter(double)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision sqrt


@@ -1,4 +1,4 @@
;; Copyright (c) 2010-2012, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
@@ -137,19 +137,14 @@ define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysin
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; svml
include(`svml.m4')
;; single precision
svml_declare(float,f8,8)
svml_define(float,f8,8,f)
;; double precision
svml_declare(double,4,4)
svml_define_x(double,4,4,d,8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float min/max
@@ -158,54 +153,49 @@ declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind
declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind readnone
define <8 x float> @__max_varying_float(<8 x float>,
<8 x float>) nounwind readonly alwaysinline {
%call = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %0, <8 x float> %1)
ret <8 x float> %call
}
define <8 x float> @__min_varying_float(<8 x float>,
<8 x float>) nounwind readonly alwaysinline {
%call = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %0, <8 x float> %1)
ret <8 x float> %call
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max
define <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
binary4to8(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
ret <8 x i32> %ret
}
define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
binary4to8(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
ret <8 x i32> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unsigned int min/max
define <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
binary4to8(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
ret <8 x i32> %ret
}
define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
binary4to8(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
ret <8 x i32> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; horizontal ops
declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone
define i64 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <8 x i32> %0 to <8 x float>
%v = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask) nounwind readnone
%v64 = zext i32 %v to i64
ret i64 %v64
}
define i1 @__any(<8 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <8 x i32> %0 to <8 x float>
%v = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask) nounwind readnone
%cmp = icmp ne i32 %v, 0
ret i1 %cmp
}
define i1 @__all(<8 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <8 x i32> %0 to <8 x float>
%v = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask) nounwind readnone
%cmp = icmp eq i32 %v, 255
ret i1 %cmp
}
define i1 @__none(<8 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <8 x i32> %0 to <8 x float>
%v = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask) nounwind readnone
%cmp = icmp eq i32 %v, 0
ret i1 %cmp
}
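;; Note: movmsk.ps.256 packs the sign bit of each of the 8 lanes into the low
;; 8 bits of its result, so __all tests against 0xff (255), __none against 0,
;; and __movmsk zero-extends the 8-bit mask to its i64 return type.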
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -222,7 +212,6 @@ define float @__reduce_add_float(<8 x float>) nounwind readonly alwaysinline {
ret float %sum
}
define float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline {
reduce8(float, @__min_varying_float, @__min_uniform_float)
}
@@ -234,11 +223,47 @@ define float @__reduce_max_float(<8 x float>) nounwind readnone alwaysinline {
reduce_equal(8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal int8 ops
declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone
define i16 @__reduce_add_int8(<8 x i8>) nounwind readnone alwaysinline {
%wide8 = shufflevector <8 x i8> %0, <8 x i8> zeroinitializer,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
%rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8,
<16 x i8> zeroinitializer)
%r0 = extractelement <2 x i64> %rv, i32 0
%r1 = extractelement <2 x i64> %rv, i32 1
%r = add i64 %r0, %r1
%r16 = trunc i64 %r to i16
ret i16 %r16
}
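;; psadbw computes sums of absolute differences of unsigned bytes; against a
;; zero vector that just sums the bytes of each 8-byte half into a 64-bit
;; lane. The shuffle above pads the 8 inputs out to 16 bytes with zeros
;; (index 8 selects the first element of the zero vector), so adding the two
;; lanes and truncating yields the 16-bit total.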
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal int16 ops
define internal <8 x i16> @__add_varying_i16(<8 x i16>,
<8 x i16>) nounwind readnone alwaysinline {
%r = add <8 x i16> %0, %1
ret <8 x i16> %r
}
define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline {
%r = add i16 %0, %1
ret i16 %r
}
define i16 @__reduce_add_int16(<8 x i16>) nounwind readnone alwaysinline {
reduce8(i16, @__add_varying_i16, @__add_uniform_i16)
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal int32 ops
define <8 x i32> @__add_varying_int32(<8 x i32>,
<8 x i32>) nounwind readnone alwaysinline {
%s = add <8 x i32> %0, %1
ret <8 x i32> %s
}
@@ -262,25 +287,14 @@ define i32 @__reduce_max_int32(<8 x i32>) nounwind readnone alwaysinline {
reduce8(i32, @__max_varying_int32, @__max_uniform_int32)
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; horizontal uint32 ops
define i32 @__reduce_add_uint32(<8 x i32> %v) nounwind readnone alwaysinline {
%r = call i32 @__reduce_add_int32(<8 x i32> %v)
ret i32 %r
}
define i32 @__reduce_min_uint32(<8 x i32>) nounwind readnone alwaysinline {
reduce8(i32, @__min_varying_uint32, @__min_uniform_uint32)
}
define i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone alwaysinline {
reduce8(i32, @__max_varying_uint32, @__max_uniform_uint32)
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal double ops
@@ -314,7 +328,7 @@ define double @__reduce_max_double(<8 x double>) nounwind readnone alwaysinline
;; horizontal int64 ops
define <8 x i64> @__add_varying_int64(<8 x i64>,
<8 x i64>) nounwind readnone alwaysinline {
%s = add <8 x i64> %0, %1
ret <8 x i64> %s
}
@@ -339,14 +353,6 @@ define i64 @__reduce_max_int64(<8 x i64>) nounwind readnone alwaysinline {
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; horizontal uint64 ops
define i64 @__reduce_add_uint64(<8 x i64> %v) nounwind readnone alwaysinline {
%r = call i64 @__reduce_add_int64(<8 x i64> %v)
ret i64 %r
}
define i64 @__reduce_min_uint64(<8 x i64>) nounwind readnone alwaysinline {
reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64)
}
@@ -360,19 +366,15 @@ define i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone alwaysinline {
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts
load_and_broadcast(8, i8, 8)
load_and_broadcast(8, i16, 16)
load_and_broadcast(8, i32, 32)
load_and_broadcast(8, i64, 64)
; no masked load instruction for i8 and i16 types??
masked_load(i8, 1)
masked_load(i16, 2)
declare <8 x float> @llvm.x86.avx.maskload.ps.256(i8 *, <8 x float> %mask)
declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8 *, <4 x double> %mask)
define <8 x i32> @__masked_load_i32(i8 *, <8 x i32> %mask) nounwind alwaysinline {
%floatmask = bitcast <8 x i32> %mask to <8 x float>
%floatval = call <8 x float> @llvm.x86.avx.maskload.ps.256(i8 * %0, <8 x float> %floatmask)
%retval = bitcast <8 x float> %floatval to <8 x i32>
@@ -380,7 +382,7 @@ define <8 x i32> @__masked_load_32(i8 *, <8 x i32> %mask) nounwind alwaysinline
}
define <8 x i64> @__masked_load_i64(i8 *, <8 x i32> %mask) nounwind alwaysinline {
; double up masks, bitcast to doubles
%mask0 = shufflevector <8 x i32> %mask, <8 x i32> undef,
<8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
@@ -399,22 +401,20 @@ define <8 x i64> @__masked_load_64(i8 *, <8 x i32> %mask) nounwind alwaysinline
ret <8 x i64> %val
}
masked_load_float_double()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store
; FIXME: there is no AVX instruction for these, but we could be clever
; by packing the bits down and setting the last 3/4 or half, respectively,
; of the mask to zero... Not sure if this would be a win in the end
gen_masked_store(i8)
gen_masked_store(i16)
; note that mask is the 2nd parameter, not the 3rd one!!
declare void @llvm.x86.avx.maskstore.ps.256(i8 *, <8 x float>, <8 x float>)
declare void @llvm.x86.avx.maskstore.pd.256(i8 *, <4 x double>, <4 x double>)
define void @__masked_store_i32(<8 x i32>* nocapture, <8 x i32>,
<8 x i32>) nounwind alwaysinline {
%ptr = bitcast <8 x i32> * %0 to i8 *
%val = bitcast <8 x i32> %1 to <8 x float>
%mask = bitcast <8 x i32> %2 to <8 x float>
@@ -422,8 +422,8 @@ define void @__masked_store_32(<8 x i32>* nocapture, <8 x i32>,
ret void
}
define void @__masked_store_i64(<8 x i64>* nocapture, <8 x i64>,
<8 x i32> %mask) nounwind alwaysinline {
%ptr = bitcast <8 x i64> * %0 to i8 *
%val = bitcast <8 x i64> %1 to <8 x double>
@@ -447,14 +447,13 @@ define void @__masked_store_64(<8 x i64>* nocapture, <8 x i64>,
}
masked_store_blend_8_16_by_8()
declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>,
<8 x float>) nounwind readnone
define void @__masked_store_blend_i32(<8 x i32>* nocapture, <8 x i32>,
<8 x i32>) nounwind alwaysinline {
%mask_as_float = bitcast <8 x i32> %2 to <8 x float>
%oldValue = load <8 x i32>* %0, align 4
%oldAsFloat = bitcast <8 x i32> %oldValue to <8 x float>
@@ -468,8 +467,8 @@ define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
}
define void @__masked_store_blend_i64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
<8 x i32> %i32mask) nounwind alwaysinline {
%oldValue = load <8 x i64>* %ptr, align 8
%mask = bitcast <8 x i32> %i32mask to <8 x float>
@@ -518,19 +517,17 @@ define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
ret void
}
masked_store_float_double()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; scatter
gen_scatter(i8)
gen_scatter(i16)
gen_scatter(i32)
gen_scatter(float)
gen_scatter(i64)
gen_scatter(double)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision sqrt


@@ -0,0 +1,81 @@
;; Copyright (c) 2013, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;; * Redistributions of source code must retain the above copyright
;; notice, this list of conditions and the following disclaimer.
;;
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;;
;; * Neither the name of Intel Corporation nor the names of its
;; contributors may be used to endorse or promote products derived from
;; this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
include(`target-avx1-i64x4base.ll')
rdrand_decls()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max
define <4 x i32> @__min_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
%call = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %0, <4 x i32> %1)
ret <4 x i32> %call
}
define <4 x i32> @__max_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
%call = call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %0, <4 x i32> %1)
ret <4 x i32> %call
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unsigned int min/max
define <4 x i32> @__min_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
%call = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %0, <4 x i32> %1)
ret <4 x i32> %call
}
define <4 x i32> @__max_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
%call = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %0, <4 x i32> %1)
ret <4 x i32> %call
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines
ifelse(NO_HALF_DECLARES, `1', `', `
declare float @__half_to_float_uniform(i16 %v) nounwind readnone
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather
gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)


@@ -0,0 +1,513 @@
;; Copyright (c) 2013, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;; * Redistributions of source code must retain the above copyright
;; notice, this list of conditions and the following disclaimer.
;;
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;;
;; * Neither the name of Intel Corporation nor the names of its
;; contributors may be used to endorse or promote products derived from
;; this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Basic 4-wide definitions
define(`WIDTH',`4')
define(`MASK',`i64')
include(`util.m4')
stdlib_core()
packed_load_and_store()
scans()
int64minmax()
include(`target-avx-common.ll')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp
;; sse intrinsic
declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
define <4 x float> @__rcp_varying_float(<4 x float>) nounwind readonly alwaysinline {
; float iv = __rcp_v(v);
; return iv * (2. - v * iv);
%call = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %0)
; do one N-R iteration
%v_iv = fmul <4 x float> %0, %call
%two_minus = fsub <4 x float> <float 2., float 2., float 2., float 2.>, %v_iv
%iv_mul = fmul <4 x float> %call, %two_minus
ret <4 x float> %iv_mul
}
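;; A single Newton-Raphson step for y ~= 1/x is y' = y * (2 - x*y); it
;; roughly doubles the accurate bits of the ~12-bit rcpps hardware estimate.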
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding floats
;; sse intrinsic
declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone
define <4 x float> @__round_varying_float(<4 x float>) nounwind readonly alwaysinline {
; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
%call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 8)
ret <4 x float> %call
}
define <4 x float> @__floor_varying_float(<4 x float>) nounwind readonly alwaysinline {
; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
%call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 9)
ret <4 x float> %call
}
define <4 x float> @__ceil_varying_float(<4 x float>) nounwind readonly alwaysinline {
; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
%call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 10)
ret <4 x float> %call
}
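;; roundps immediate encoding: bits 1:0 select the rounding mode (00 nearest,
;; 01 down, 10 up), bit 2 clear means "use the immediate mode rather than
;; MXCSR", and bit 3 suppresses the precision (inexact) exception -- hence
;; the constants 8, 9 and 10 used here and in the roundpd versions below.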
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding doubles
;; avx intrinsic
declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind readnone
define <4 x double> @__round_varying_double(<4 x double>) nounwind readonly alwaysinline {
%call = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %0, i32 8)
ret <4 x double> %call
}
define <4 x double> @__floor_varying_double(<4 x double>) nounwind readonly alwaysinline {
; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
%call = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %0, i32 9)
ret <4 x double> %call
}
define <4 x double> @__ceil_varying_double(<4 x double>) nounwind readonly alwaysinline {
; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
%call = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %0, i32 10)
ret <4 x double> %call
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rsqrt
;; sse intrinsic
declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
define <4 x float> @__rsqrt_varying_float(<4 x float> %v) nounwind readonly alwaysinline {
; float is = __rsqrt_v(v);
%is = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %v)
; Newton-Raphson iteration to improve precision
; return 0.5 * is * (3. - (v * is) * is);
%v_is = fmul <4 x float> %v, %is
%v_is_is = fmul <4 x float> %v_is, %is
%three_sub = fsub <4 x float> <float 3., float 3., float 3., float 3.>, %v_is_is
%is_mul = fmul <4 x float> %is, %three_sub
%half_scale = fmul <4 x float> <float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
ret <4 x float> %half_scale
}
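;; The Newton-Raphson step for y ~= 1/sqrt(x) is y' = 0.5 * y * (3 - x*y*y),
;; refining the low-precision rsqrtps hardware estimate computed above.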
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; sqrt
;; sse intrinsic
declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
define <4 x float> @__sqrt_varying_float(<4 x float>) nounwind readonly alwaysinline {
%call = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %0)
ret <4 x float> %call
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision sqrt
;; avx intrinsic
declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone
define <4 x double> @__sqrt_varying_double(<4 x double>) nounwind alwaysinline {
%call = call <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double> %0)
ret <4 x double> %call
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; svml
include(`svml.m4')
;; single precision
svml_declare(float,f4,4)
svml_define(float,f4,4,f)
;; double precision
svml_declare(double,4,4)
svml_define(double,4,4,d)
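;; svml_declare/svml_define come from svml.m4 and bind the standard library's
;; transcendental calls to Intel SVML entry points; the arguments appear to
;; select element type, name suffix and vector width (so float,f4,4 would map
;; to the 4-wide single-precision entry points such as __svml_sinf4 -- an
;; assumption from the macro arguments, not verified against svml.m4).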
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float min/max
;; sse intrinsics
declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
define <4 x float> @__max_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline {
%call = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %0, <4 x float> %1)
ret <4 x float> %call
}
define <4 x float> @__min_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline {
%call = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %0, <4 x float> %1)
ret <4 x float> %call
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; horizontal ops
;; avx intrinsic
declare i32 @llvm.x86.avx.movmsk.pd.256(<4 x double>) nounwind readnone
define i64 @__movmsk(<4 x i64>) nounwind readnone alwaysinline {
%floatmask = bitcast <4 x i64> %0 to <4 x double>
%v = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %floatmask) nounwind readnone
%v64 = zext i32 %v to i64
ret i64 %v64
}
define i1 @__any(<4 x i64>) nounwind readnone alwaysinline {
%floatmask = bitcast <4 x i64> %0 to <4 x double>
%v = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %floatmask) nounwind readnone
%cmp = icmp ne i32 %v, 0
ret i1 %cmp
}
define i1 @__all(<4 x i64>) nounwind readnone alwaysinline {
%floatmask = bitcast <4 x i64> %0 to <4 x double>
%v = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %floatmask) nounwind readnone
%cmp = icmp eq i32 %v, 15
ret i1 %cmp
}
define i1 @__none(<4 x i64>) nounwind readnone alwaysinline {
%floatmask = bitcast <4 x i64> %0 to <4 x double>
%v = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %floatmask) nounwind readnone
%cmp = icmp eq i32 %v, 0
ret i1 %cmp
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal float ops
;; sse intrinsic
declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone
define float @__reduce_add_float(<4 x float>) nounwind readonly alwaysinline {
%v1 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %0, <4 x float> %0)
%v2 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %v1, <4 x float> %v1)
%scalar = extractelement <4 x float> %v2, i32 0
ret float %scalar
}
define float @__reduce_min_float(<4 x float>) nounwind readnone {
reduce4(float, @__min_varying_float, @__min_uniform_float)
}
define float @__reduce_max_float(<4 x float>) nounwind readnone {
reduce4(float, @__max_varying_float, @__max_uniform_float)
}
reduce_equal(4)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal int8 ops
declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone
define i16 @__reduce_add_int8(<4 x i8>) nounwind readnone alwaysinline
{
%wide8 = shufflevector <4 x i8> %0, <4 x i8> zeroinitializer,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4,
i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
%rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8,
<16 x i8> zeroinitializer)
%r0 = extractelement <2 x i64> %rv, i32 0
%r1 = extractelement <2 x i64> %rv, i32 1
%r = add i64 %r0, %r1
%r16 = trunc i64 %r to i16
ret i16 %r16
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal int16 ops
define internal <4 x i16> @__add_varying_i16(<4 x i16>,
<4 x i16>) nounwind readnone alwaysinline {
%r = add <4 x i16> %0, %1
ret <4 x i16> %r
}
define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline {
%r = add i16 %0, %1
ret i16 %r
}
define i16 @__reduce_add_int16(<4 x i16>) nounwind readnone alwaysinline {
reduce4(i16, @__add_varying_i16, @__add_uniform_i16)
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal int32 ops
define <4 x i32> @__add_varying_int32(<4 x i32>,
<4 x i32>) nounwind readnone alwaysinline {
%s = add <4 x i32> %0, %1
ret <4 x i32> %s
}
define i32 @__add_uniform_int32(i32, i32) nounwind readnone alwaysinline {
%s = add i32 %0, %1
ret i32 %s
}
define i32 @__reduce_add_int32(<4 x i32>) nounwind readnone alwaysinline {
reduce4(i32, @__add_varying_int32, @__add_uniform_int32)
}
define i32 @__reduce_min_int32(<4 x i32>) nounwind readnone alwaysinline {
reduce4(i32, @__min_varying_int32, @__min_uniform_int32)
}
define i32 @__reduce_max_int32(<4 x i32>) nounwind readnone alwaysinline {
reduce4(i32, @__max_varying_int32, @__max_uniform_int32)
}
define i32 @__reduce_min_uint32(<4 x i32>) nounwind readnone alwaysinline {
reduce4(i32, @__min_varying_uint32, @__min_uniform_uint32)
}
define i32 @__reduce_max_uint32(<4 x i32>) nounwind readnone alwaysinline {
reduce4(i32, @__max_varying_uint32, @__max_uniform_uint32)
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal double ops
declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounwind readnone
define double @__reduce_add_double(<4 x double>) nounwind readonly alwaysinline {
%v0 = shufflevector <4 x double> %0, <4 x double> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v1 = shufflevector <4 x double> <double 0.,double 0.,double 0.,double 0.>, <4 x double> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
;; %v1 = <4 x double> <double 0., double 0., double 0., double 0.>
%sum0 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %v0, <4 x double> %v1)
%sum1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %sum0, <4 x double> %sum0)
%final0 = extractelement <4 x double> %sum1, i32 0
%final1 = extractelement <4 x double> %sum1, i32 2
%sum = fadd double %final0, %final1
ret double %sum
}
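;; hadd.pd.256 adds pairs within each 128-bit lane, so after two rounds the
;; low-lane and high-lane partial sums end up in elements 0 and 2, and one
;; scalar fadd finishes the reduction; the zero vector %v1 merely supplies a
;; second operand that contributes nothing to the sum.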
define double @__reduce_min_double(<4 x double>) nounwind readnone alwaysinline {
reduce4(double, @__min_varying_double, @__min_uniform_double)
}
define double @__reduce_max_double(<4 x double>) nounwind readnone alwaysinline {
reduce4(double, @__max_varying_double, @__max_uniform_double)
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal int64 ops
define <4 x i64> @__add_varying_int64(<4 x i64>,
<4 x i64>) nounwind readnone alwaysinline {
%s = add <4 x i64> %0, %1
ret <4 x i64> %s
}
define i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline {
%s = add i64 %0, %1
ret i64 %s
}
define i64 @__reduce_add_int64(<4 x i64>) nounwind readnone alwaysinline {
reduce4(i64, @__add_varying_int64, @__add_uniform_int64)
}
define i64 @__reduce_min_int64(<4 x i64>) nounwind readnone alwaysinline {
reduce4(i64, @__min_varying_int64, @__min_uniform_int64)
}
define i64 @__reduce_max_int64(<4 x i64>) nounwind readnone alwaysinline {
reduce4(i64, @__max_varying_int64, @__max_uniform_int64)
}
define i64 @__reduce_min_uint64(<4 x i64>) nounwind readnone alwaysinline {
reduce4(i64, @__min_varying_uint64, @__min_uniform_uint64)
}
define i64 @__reduce_max_uint64(<4 x i64>) nounwind readnone alwaysinline {
reduce4(i64, @__max_varying_uint64, @__max_uniform_uint64)
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts
; no masked load instruction for i8 and i16 types??
masked_load(i8, 1)
masked_load(i16, 2)
;; avx intrinsics
declare <4 x float> @llvm.x86.avx.maskload.ps(i8 *, <4 x float> %mask)
declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8 *, <4 x double> %mask)
define <4 x i32> @__masked_load_i32(i8 *, <4 x i64> %mask64) nounwind alwaysinline {
%mask = trunc <4 x i64> %mask64 to <4 x i32>
%floatmask = bitcast <4 x i32> %mask to <4 x float>
%floatval = call <4 x float> @llvm.x86.avx.maskload.ps(i8 * %0, <4 x float> %floatmask)
%retval = bitcast <4 x float> %floatval to <4 x i32>
ret <4 x i32> %retval
}
define <4 x i64> @__masked_load_i64(i8 *, <4 x i64> %mask) nounwind alwaysinline {
%doublemask = bitcast <4 x i64> %mask to <4 x double>
%doubleval = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %0, <4 x double> %doublemask)
%retval = bitcast <4 x double> %doubleval to <4 x i64>
ret <4 x i64> %retval
}
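;; maskload.ps/maskload.pd load an element only where the sign bit of the
;; corresponding mask lane is set, so the all-ones/all-zeros i64 mask can be
;; bitcast directly to <4 x double>; the i32 variant first truncates each
;; 64-bit mask lane to 32 bits to match the 128-bit ps form.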
masked_load_float_double()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store
gen_masked_store(i8)
gen_masked_store(i16)
; note that mask is the 2nd parameter, not the 3rd one!!
;; avx intrinsics
declare void @llvm.x86.avx.maskstore.ps (i8 *, <4 x float>, <4 x float>)
declare void @llvm.x86.avx.maskstore.pd.256(i8 *, <4 x double>, <4 x double>)
define void @__masked_store_i32(<4 x i32>* nocapture, <4 x i32>,
<4 x i64>) nounwind alwaysinline {
%mask32 = trunc <4 x i64> %2 to <4 x i32>
%ptr = bitcast <4 x i32> * %0 to i8 *
%val = bitcast <4 x i32> %1 to <4 x float>
%mask = bitcast <4 x i32> %mask32 to <4 x float>
call void @llvm.x86.avx.maskstore.ps(i8 * %ptr, <4 x float> %mask, <4 x float> %val)
ret void
}
define void @__masked_store_i64(<4 x i64>* nocapture, <4 x i64>,
<4 x i64>) nounwind alwaysinline {
%ptr = bitcast <4 x i64> * %0 to i8 *
%val = bitcast <4 x i64> %1 to <4 x double>
%mask = bitcast <4 x i64> %2 to <4 x double>
call void @llvm.x86.avx.maskstore.pd.256(i8 * %ptr, <4 x double> %mask, <4 x double> %val)
ret void
}
masked_store_blend_8_16_by_4_mask64()
;; sse intrinsic
declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>,
<4 x float>) nounwind readnone
define void @__masked_store_blend_i32(<4 x i32>* nocapture, <4 x i32>,
<4 x i64>) nounwind alwaysinline {
%mask = trunc <4 x i64> %2 to <4 x i32>
%mask_as_float = bitcast <4 x i32> %mask to <4 x float>
%oldValue = load <4 x i32>* %0, align 4
%oldAsFloat = bitcast <4 x i32> %oldValue to <4 x float>
%newAsFloat = bitcast <4 x i32> %1 to <4 x float>
%blend = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %oldAsFloat,
<4 x float> %newAsFloat,
<4 x float> %mask_as_float)
%blendAsInt = bitcast <4 x float> %blend to <4 x i32>
store <4 x i32> %blendAsInt, <4 x i32>* %0, align 4
ret void
}
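;; blendvps selects the new value wherever the sign bit of the mask lane is
;; set and keeps the old value elsewhere, implementing the masked store as a
;; read-modify-write blend.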
;; avx intrinsic
declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>,
<4 x double>) nounwind readnone
define void @__masked_store_blend_i64(<4 x i64>* nocapture , <4 x i64>,
<4 x i64>) nounwind alwaysinline {
%mask_as_double = bitcast <4 x i64> %2 to <4 x double>
%oldValue = load <4 x i64>* %0, align 4
%oldAsDouble = bitcast <4 x i64> %oldValue to <4 x double>
%newAsDouble = bitcast <4 x i64> %1 to <4 x double>
%blend = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %oldAsDouble,
<4 x double> %newAsDouble,
<4 x double> %mask_as_double)
%blendAsInt = bitcast <4 x double> %blend to <4 x i64>
store <4 x i64> %blendAsInt, <4 x i64>* %0, align 4
ret void
}
masked_store_float_double()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; scatter
gen_scatter(i8)
gen_scatter(i16)
gen_scatter(i32)
gen_scatter(float)
gen_scatter(i64)
gen_scatter(double)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision min/max
declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone
declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone
define <4 x double> @__min_varying_double(<4 x double>, <4 x double>) nounwind readnone alwaysinline {
%call = call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %0, <4 x double> %1)
ret <4 x double> %call
}
define <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind readnone alwaysinline {
%call = call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %0, <4 x double> %1)
ret <4 x double> %call
}


@@ -0,0 +1,81 @@
;; Copyright (c) 2010-2011, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;; * Redistributions of source code must retain the above copyright
;; notice, this list of conditions and the following disclaimer.
;;
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;;
;; * Neither the name of Intel Corporation nor the names of its
;; contributors may be used to endorse or promote products derived from
;; this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
include(`target-avx-x2.ll')
rdrand_decls()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max
define <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary4to16(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
ret <16 x i32> %ret
}
define <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary4to16(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
ret <16 x i32> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unsigned int min/max
define <16 x i32> @__min_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary4to16(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
ret <16 x i32> %ret
}
define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary4to16(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
ret <16 x i32> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines
ifelse(NO_HALF_DECLARES, `1', `', `
declare float @__half_to_float_uniform(i16 %v) nounwind readnone
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather
gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)

builtins/target-avx1.ll

@@ -0,0 +1,81 @@
;; Copyright (c) 2010-2011, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;; * Redistributions of source code must retain the above copyright
;; notice, this list of conditions and the following disclaimer.
;;
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;;
;; * Neither the name of Intel Corporation nor the names of its
;; contributors may be used to endorse or promote products derived from
;; this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
include(`target-avx.ll')
rdrand_decls()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max
define <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
binary4to8(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
ret <8 x i32> %ret
}
define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
binary4to8(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
ret <8 x i32> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unsigned int min/max
define <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
binary4to8(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
ret <8 x i32> %ret
}
define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
binary4to8(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
ret <8 x i32> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines
ifelse(NO_HALF_DECLARES, `1', `', `
declare float @__half_to_float_uniform(i16 %v) nounwind readnone
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather
gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)

builtins/target-avx11-x2.ll

@@ -0,0 +1,132 @@
;; Copyright (c) 2012, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;; * Redistributions of source code must retain the above copyright
;; notice, this list of conditions and the following disclaimer.
;;
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;;
;; * Neither the name of Intel Corporation nor the names of its
;; contributors may be used to endorse or promote products derived from
;; this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
include(`target-avx-x2.ll')
ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()',
LLVM_VERSION, `LLVM_3_1', `rdrand_decls()',
`rdrand_definition()')
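;; LLVM 3.0/3.1 only get declarations for the rdrand builtins, while newer
;; versions get full definitions -- presumably because the RDRAND intrinsics
;; are not available in LLVM before 3.2.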
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max
define <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary4to16(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
ret <16 x i32> %ret
}
define <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary4to16(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
ret <16 x i32> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unsigned int min/max
define <16 x i32> @__min_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary4to16(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
ret <16 x i32> %ret
}
define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary4to16(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
ret <16 x i32> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather
gen_gather(i8)
gen_gather(i16)
gen_gather(i32)
gen_gather(float)
gen_gather(i64)
gen_gather(double)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float/half conversions
ifelse(LLVM_VERSION, `LLVM_3_0', `
;; nothing to define...
', `
declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
; 0 is round nearest even
declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone
define <16 x float> @__half_to_float_varying(<16 x i16> %v) nounwind readnone {
%r_0 = shufflevector <16 x i16> %v, <16 x i16> undef,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%vr_0 = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %r_0)
%r_1 = shufflevector <16 x i16> %v, <16 x i16> undef,
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%vr_1 = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %r_1)
%r = shufflevector <8 x float> %vr_0, <8 x float> %vr_1,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <16 x float> %r
}
define <16 x i16> @__float_to_half_varying(<16 x float> %v) nounwind readnone {
%r_0 = shufflevector <16 x float> %v, <16 x float> undef,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%vr_0 = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %r_0, i32 0)
%r_1 = shufflevector <16 x float> %v, <16 x float> undef,
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%vr_1 = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %r_1, i32 0)
%r = shufflevector <8 x i16> %vr_0, <8 x i16> %vr_1,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <16 x i16> %r
}
define float @__half_to_float_uniform(i16 %v) nounwind readnone {
%v1 = bitcast i16 %v to <1 x i16>
%vv = shufflevector <1 x i16> %v1, <1 x i16> undef,
<8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef>
%rv = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %vv)
%r = extractelement <8 x float> %rv, i32 0
ret float %r
}
define i16 @__float_to_half_uniform(float %v) nounwind readnone {
%v1 = bitcast float %v to <1 x float>
%vv = shufflevector <1 x float> %v1, <1 x float> undef,
<8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef>
; round to nearest even
%rv = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %vv, i32 0)
%r = extractelement <8 x i16> %rv, i32 0
ret i16 %r
}
'
)
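;; The immediate to vcvtps2ph selects the rounding mode (0 = round to nearest
;; even). The uniform variants splat the scalar into lane 0 of an 8-wide
;; vector, since only the 256-bit conversion forms are declared here, and
;; then extract lane 0 of the result.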

builtins/target-avx11.ll

@@ -0,0 +1,115 @@
;; Copyright (c) 2012, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;; * Redistributions of source code must retain the above copyright
;; notice, this list of conditions and the following disclaimer.
;;
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;;
;; * Neither the name of Intel Corporation nor the names of its
;; contributors may be used to endorse or promote products derived from
;; this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
include(`target-avx.ll')
ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()',
LLVM_VERSION, `LLVM_3_1', `rdrand_decls()',
`rdrand_definition()')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max
define <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
binary4to8(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
ret <8 x i32> %ret
}
define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
binary4to8(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
ret <8 x i32> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unsigned int min/max
define <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
binary4to8(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
ret <8 x i32> %ret
}
define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
binary4to8(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
ret <8 x i32> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather
gen_gather(i8)
gen_gather(i16)
gen_gather(i32)
gen_gather(float)
gen_gather(i64)
gen_gather(double)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float/half conversions
ifelse(LLVM_VERSION, `LLVM_3_0', `
;; nothing to define...
', `
declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
; 0 is round nearest even
declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone
define <8 x float> @__half_to_float_varying(<8 x i16> %v) nounwind readnone {
%r = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %v)
ret <8 x float> %r
}
define <8 x i16> @__float_to_half_varying(<8 x float> %v) nounwind readnone {
%r = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %v, i32 0)
ret <8 x i16> %r
}
define float @__half_to_float_uniform(i16 %v) nounwind readnone {
%v1 = bitcast i16 %v to <1 x i16>
%vv = shufflevector <1 x i16> %v1, <1 x i16> undef,
<8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef>
%rv = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %vv)
%r = extractelement <8 x float> %rv, i32 0
ret float %r
}
define i16 @__float_to_half_uniform(float %v) nounwind readnone {
%v1 = bitcast float %v to <1 x float>
%vv = shufflevector <1 x float> %v1, <1 x float> undef,
<8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef>
; round to nearest even
%rv = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %vv, i32 0)
%r = extractelement <8 x i16> %rv, i32 0
ret i16 %r
}
')

builtins/target-avx2-x2.ll

@@ -0,0 +1,561 @@
;; Copyright (c) 2010-2012, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;; * Redistributions of source code must retain the above copyright
;; notice, this list of conditions and the following disclaimer.
;;
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;;
;; * Neither the name of Intel Corporation nor the names of its
;; contributors may be used to endorse or promote products derived from
;; this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ifelse(LLVM_VERSION, `LLVM_3_0', `',
LLVM_VERSION, `LLVM_3_1', `',
`define(`HAVE_GATHER', `1')')
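;; HAVE_GATHER is left undefined on LLVM 3.0/3.1 and defined for newer
;; versions; it presumably lets the included common code select the hardware
;; AVX2 gather intrinsics rather than generic fallbacks.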
include(`target-avx-x2.ll')
ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()',
LLVM_VERSION, `LLVM_3_1', `rdrand_decls()',
`rdrand_definition()')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max
declare <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32>, <8 x i32>) nounwind readonly
declare <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32>, <8 x i32>) nounwind readonly
define <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary8to16(m, i32, @llvm.x86.avx2.pmins.d, %0, %1)
ret <16 x i32> %m
}
define <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary8to16(m, i32, @llvm.x86.avx2.pmaxs.d, %0, %1)
ret <16 x i32> %m
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unsigned int min/max
declare <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32>, <8 x i32>) nounwind readonly
declare <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32>, <8 x i32>) nounwind readonly
define <16 x i32> @__min_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary8to16(m, i32, @llvm.x86.avx2.pminu.d, %0, %1)
ret <16 x i32> %m
}
define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary8to16(m, i32, @llvm.x86.avx2.pmaxu.d, %0, %1)
ret <16 x i32> %m
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float/half conversions
ifelse(LLVM_VERSION, `LLVM_3_0', `
;; nothing to define...
', `
declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
; 0 is round nearest even
declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone
define <16 x float> @__half_to_float_varying(<16 x i16> %v) nounwind readnone {
%r_0 = shufflevector <16 x i16> %v, <16 x i16> undef,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%vr_0 = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %r_0)
%r_1 = shufflevector <16 x i16> %v, <16 x i16> undef,
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%vr_1 = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %r_1)
%r = shufflevector <8 x float> %vr_0, <8 x float> %vr_1,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <16 x float> %r
}
define <16 x i16> @__float_to_half_varying(<16 x float> %v) nounwind readnone {
%r_0 = shufflevector <16 x float> %v, <16 x float> undef,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%vr_0 = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %r_0, i32 0)
%r_1 = shufflevector <16 x float> %v, <16 x float> undef,
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%vr_1 = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %r_1, i32 0)
%r = shufflevector <8 x i16> %vr_0, <8 x i16> %vr_1,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <16 x i16> %r
}
define float @__half_to_float_uniform(i16 %v) nounwind readnone {
%v1 = bitcast i16 %v to <1 x i16>
%vv = shufflevector <1 x i16> %v1, <1 x i16> undef,
<8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef>
%rv = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %vv)
%r = extractelement <8 x float> %rv, i32 0
ret float %r
}
define i16 @__float_to_half_uniform(float %v) nounwind readnone {
%v1 = bitcast float %v to <1 x float>
%vv = shufflevector <1 x float> %v1, <1 x float> undef,
<8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef>
; round to nearest even
%rv = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %vv, i32 0)
%r = extractelement <8 x i16> %rv, i32 0
ret i16 %r
}
')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather
declare void @llvm.trap() noreturn nounwind
; $1: type
; $2: var base name
define(`extract_4s', `
%$2_1 = shufflevector <16 x $1> %$2, <16 x $1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%$2_2 = shufflevector <16 x $1> %$2, <16 x $1> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%$2_3 = shufflevector <16 x $1> %$2, <16 x $1> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
%$2_4 = shufflevector <16 x $1> %$2, <16 x $1> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
')
; $1: type
; $2: var base name
define(`extract_8s', `
%$2_1 = shufflevector <16 x $1> %$2, <16 x $1> undef,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%$2_2 = shufflevector <16 x $1> %$2, <16 x $1> undef,
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
')
; $1: element type
; $2: ret name
; $3: v1
; $4: v2
define(`assemble_8s', `
%$2 = shufflevector <8 x $1> %$3, <8 x $1> %$4,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
')
; $1: element type
; $2: ret name
; $3: v1
; $4: v2
; $5: v3
; $6: v4
define(`assemble_4s', `
%$2_1 = shufflevector <4 x $1> %$3, <4 x $1> %$4,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%$2_2 = shufflevector <4 x $1> %$5, <4 x $1> %$6,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
assemble_8s($1, $2, $2_1, $2_2)
')
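;; The extract_4s/extract_8s helpers split a 16-wide vector into the 4- and
;; 8-wide pieces that the 256-bit AVX2 gather intrinsics can consume, and the
;; assemble_4s/assemble_8s helpers stitch the partial results back into a
;; 16-wide result.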
ifelse(LLVM_VERSION, `LLVM_3_0', `
gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)',
LLVM_VERSION, `LLVM_3_1', `
gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)', `
gen_gather(i8)
gen_gather(i16)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int32 gathers
declare <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> %target, i8 * %ptr,
<8 x i32> %indices, <8 x i32> %mask, i8 %scale) readonly nounwind
declare <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> %target, i8 * %ptr,
<4 x i64> %indices, <4 x i32> %mask, i8 %scale) readonly nounwind
define <16 x i32> @__gather_base_offsets32_i32(i8 * %ptr, i32 %scale, <16 x i32> %offsets,
<16 x i32> %vecmask) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
extract_8s(i32, offsets)
extract_8s(i32, vecmask)
%v1 = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * %ptr,
<8 x i32> %offsets_1, <8 x i32> %vecmask_1, i8 %scale8)
%v2 = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * %ptr,
<8 x i32> %offsets_2, <8 x i32> %vecmask_2, i8 %scale8)
assemble_8s(i32, v, v1, v2)
ret <16 x i32> %v
}
define <16 x i32> @__gather_base_offsets64_i32(i8 * %ptr,
i32 %scale, <16 x i64> %offsets,
<16 x i32> %vecmask) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
extract_4s(i32, vecmask)
extract_4s(i64, offsets)
%v1 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr,
<4 x i64> %offsets_1, <4 x i32> %vecmask_1, i8 %scale8)
%v2 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr,
<4 x i64> %offsets_2, <4 x i32> %vecmask_2, i8 %scale8)
%v3 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr,
<4 x i64> %offsets_3, <4 x i32> %vecmask_3, i8 %scale8)
%v4 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr,
<4 x i64> %offsets_4, <4 x i32> %vecmask_4, i8 %scale8)
assemble_4s(i32, v, v1, v2, v3, v4)
ret <16 x i32> %v
}
define <16 x i32> @__gather32_i32(<16 x i32> %ptrs,
<16 x i32> %vecmask) nounwind readonly alwaysinline {
extract_8s(i32, ptrs)
extract_8s(i32, vecmask)
%v1 = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * null,
<8 x i32> %ptrs_1, <8 x i32> %vecmask_1, i8 1)
%v2 = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * null,
<8 x i32> %ptrs_2, <8 x i32> %vecmask_2, i8 1)
assemble_8s(i32, v, v1, v2)
ret <16 x i32> %v
}
define <16 x i32> @__gather64_i32(<16 x i64> %ptrs,
<16 x i32> %vecmask) nounwind readonly alwaysinline {
extract_4s(i64, ptrs)
extract_4s(i32, vecmask)
%v1 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null,
<4 x i64> %ptrs_1, <4 x i32> %vecmask_1, i8 1)
%v2 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null,
<4 x i64> %ptrs_2, <4 x i32> %vecmask_2, i8 1)
%v3 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null,
<4 x i64> %ptrs_3, <4 x i32> %vecmask_3, i8 1)
%v4 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null,
<4 x i64> %ptrs_4, <4 x i32> %vecmask_4, i8 1)
assemble_4s(i32, v, v1, v2, v3, v4)
ret <16 x i32> %v
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float gathers
declare <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> %target, i8 * %ptr,
<8 x i32> %indices, <8 x float> %mask, i8 %scale8) readonly nounwind
declare <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> %target, i8 * %ptr,
<4 x i64> %indices, <4 x float> %mask, i8 %scale8) readonly nounwind
define <16 x float> @__gather_base_offsets32_float(i8 * %ptr,
i32 %scale, <16 x i32> %offsets,
<16 x i32> %vecmask) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%mask = bitcast <16 x i32> %vecmask to <16 x float>
extract_8s(i32, offsets)
extract_8s(float, mask)
%v1 = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8 * %ptr,
<8 x i32> %offsets_1, <8 x float> %mask_1, i8 %scale8)
%v2 = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8 * %ptr,
<8 x i32> %offsets_2, <8 x float> %mask_2, i8 %scale8)
assemble_8s(float, v, v1, v2)
ret <16 x float> %v
}
define <16 x float> @__gather_base_offsets64_float(i8 * %ptr,
i32 %scale, <16 x i64> %offsets,
<16 x i32> %vecmask) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%mask = bitcast <16 x i32> %vecmask to <16 x float>
extract_4s(i64, offsets)
extract_4s(float, mask)
%v1 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * %ptr,
<4 x i64> %offsets_1, <4 x float> %mask_1, i8 %scale8)
%v2 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * %ptr,
<4 x i64> %offsets_2, <4 x float> %mask_2, i8 %scale8)
%v3 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * %ptr,
<4 x i64> %offsets_3, <4 x float> %mask_3, i8 %scale8)
%v4 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * %ptr,
<4 x i64> %offsets_4, <4 x float> %mask_4, i8 %scale8)
assemble_4s(float, v, v1, v2, v3, v4)
ret <16 x float> %v
}
define <16 x float> @__gather32_float(<16 x i32> %ptrs,
<16 x i32> %vecmask) nounwind readonly alwaysinline {
%mask = bitcast <16 x i32> %vecmask to <16 x float>
extract_8s(float, mask)
extract_8s(i32, ptrs)
%v1 = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8 * null,
<8 x i32> %ptrs_1, <8 x float> %mask_1, i8 1)
%v2 = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8 * null,
<8 x i32> %ptrs_2, <8 x float> %mask_2, i8 1)
assemble_8s(float, v, v1, v2)
ret <16 x float> %v
}
define <16 x float> @__gather64_float(<16 x i64> %ptrs,
<16 x i32> %vecmask) nounwind readonly alwaysinline {
%mask = bitcast <16 x i32> %vecmask to <16 x float>
extract_4s(i64, ptrs)
extract_4s(float, mask)
%v1 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * null,
<4 x i64> %ptrs_1, <4 x float> %mask_1, i8 1)
%v2 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * null,
<4 x i64> %ptrs_2, <4 x float> %mask_2, i8 1)
%v3 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * null,
<4 x i64> %ptrs_3, <4 x float> %mask_3, i8 1)
%v4 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * null,
<4 x i64> %ptrs_4, <4 x float> %mask_4, i8 1)
assemble_4s(float, v, v1, v2, v3, v4)
ret <16 x float> %v
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int64 gathers
declare <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> %target, i8 * %ptr,
<4 x i32> %indices, <4 x i64> %mask, i8 %scale) readonly nounwind
declare <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> %target, i8 * %ptr,
<4 x i64> %indices, <4 x i64> %mask, i8 %scale) readonly nounwind
define <16 x i64> @__gather_base_offsets32_i64(i8 * %ptr,
i32 %scale, <16 x i32> %offsets,
<16 x i32> %mask32) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%vecmask = sext <16 x i32> %mask32 to <16 x i64>
extract_4s(i32, offsets)
extract_4s(i64, vecmask)
%v1 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * %ptr,
<4 x i32> %offsets_1, <4 x i64> %vecmask_1, i8 %scale8)
%v2 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * %ptr,
<4 x i32> %offsets_2, <4 x i64> %vecmask_2, i8 %scale8)
%v3 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * %ptr,
<4 x i32> %offsets_3, <4 x i64> %vecmask_3, i8 %scale8)
%v4 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * %ptr,
<4 x i32> %offsets_4, <4 x i64> %vecmask_4, i8 %scale8)
assemble_4s(i64, v, v1, v2, v3, v4)
ret <16 x i64> %v
}
define <16 x i64> @__gather_base_offsets64_i64(i8 * %ptr,
i32 %scale, <16 x i64> %offsets,
<16 x i32> %mask32) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%vecmask = sext <16 x i32> %mask32 to <16 x i64>
extract_4s(i64, offsets)
extract_4s(i64, vecmask)
%v1 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * %ptr,
<4 x i64> %offsets_1, <4 x i64> %vecmask_1, i8 %scale8)
%v2 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * %ptr,
<4 x i64> %offsets_2, <4 x i64> %vecmask_2, i8 %scale8)
%v3 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * %ptr,
<4 x i64> %offsets_3, <4 x i64> %vecmask_3, i8 %scale8)
%v4 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * %ptr,
<4 x i64> %offsets_4, <4 x i64> %vecmask_4, i8 %scale8)
assemble_4s(i64, v, v1, v2, v3, v4)
ret <16 x i64> %v
}
define <16 x i64> @__gather32_i64(<16 x i32> %ptrs,
<16 x i32> %mask32) nounwind readonly alwaysinline {
%vecmask = sext <16 x i32> %mask32 to <16 x i64>
extract_4s(i32, ptrs)
extract_4s(i64, vecmask)
%v1 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * null,
<4 x i32> %ptrs_1, <4 x i64> %vecmask_1, i8 1)
%v2 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * null,
<4 x i32> %ptrs_2, <4 x i64> %vecmask_2, i8 1)
%v3 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * null,
<4 x i32> %ptrs_3, <4 x i64> %vecmask_3, i8 1)
%v4 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * null,
<4 x i32> %ptrs_4, <4 x i64> %vecmask_4, i8 1)
assemble_4s(i64, v, v1, v2, v3, v4)
ret <16 x i64> %v
}
define <16 x i64> @__gather64_i64(<16 x i64> %ptrs,
<16 x i32> %mask32) nounwind readonly alwaysinline {
%vecmask = sext <16 x i32> %mask32 to <16 x i64>
extract_4s(i64, ptrs)
extract_4s(i64, vecmask)
%v1 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * null,
<4 x i64> %ptrs_1, <4 x i64> %vecmask_1, i8 1)
%v2 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * null,
<4 x i64> %ptrs_2, <4 x i64> %vecmask_2, i8 1)
%v3 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * null,
<4 x i64> %ptrs_3, <4 x i64> %vecmask_3, i8 1)
%v4 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * null,
<4 x i64> %ptrs_4, <4 x i64> %vecmask_4, i8 1)
assemble_4s(i64, v, v1, v2, v3, v4)
ret <16 x i64> %v
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double gathers
declare <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> %target, i8 * %ptr,
<4 x i64> %indices, <4 x double> %mask, i8 %scale) readonly nounwind
declare <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> %target, i8 * %ptr,
<4 x i32> %indices, <4 x double> %mask, i8 %scale) readonly nounwind
define <16 x double> @__gather_base_offsets32_double(i8 * %ptr,
i32 %scale, <16 x i32> %offsets,
<16 x i32> %mask32) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%vecmask64 = sext <16 x i32> %mask32 to <16 x i64>
%vecmask = bitcast <16 x i64> %vecmask64 to <16 x double>
extract_4s(i32, offsets)
extract_4s(double, vecmask)
%v1 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * %ptr,
<4 x i32> %offsets_1, <4 x double> %vecmask_1, i8 %scale8)
%v2 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * %ptr,
<4 x i32> %offsets_2, <4 x double> %vecmask_2, i8 %scale8)
%v3 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * %ptr,
<4 x i32> %offsets_3, <4 x double> %vecmask_3, i8 %scale8)
%v4 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * %ptr,
<4 x i32> %offsets_4, <4 x double> %vecmask_4, i8 %scale8)
assemble_4s(double, v, v1, v2, v3, v4)
ret <16 x double> %v
}
define <16 x double> @__gather_base_offsets64_double(i8 * %ptr,
i32 %scale, <16 x i64> %offsets,
<16 x i32> %mask32) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%vecmask64 = sext <16 x i32> %mask32 to <16 x i64>
%vecmask = bitcast <16 x i64> %vecmask64 to <16 x double>
extract_4s(i64, offsets)
extract_4s(double, vecmask)
%v1 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * %ptr,
<4 x i64> %offsets_1, <4 x double> %vecmask_1, i8 %scale8)
%v2 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * %ptr,
<4 x i64> %offsets_2, <4 x double> %vecmask_2, i8 %scale8)
%v3 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * %ptr,
<4 x i64> %offsets_3, <4 x double> %vecmask_3, i8 %scale8)
%v4 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * %ptr,
<4 x i64> %offsets_4, <4 x double> %vecmask_4, i8 %scale8)
assemble_4s(double, v, v1, v2, v3, v4)
ret <16 x double> %v
}
define <16 x double> @__gather32_double(<16 x i32> %ptrs,
<16 x i32> %mask32) nounwind readonly alwaysinline {
%vecmask64 = sext <16 x i32> %mask32 to <16 x i64>
%vecmask = bitcast <16 x i64> %vecmask64 to <16 x double>
extract_4s(i32, ptrs)
extract_4s(double, vecmask)
%v1 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * null,
<4 x i32> %ptrs_1, <4 x double> %vecmask_1, i8 1)
%v2 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * null,
<4 x i32> %ptrs_2, <4 x double> %vecmask_2, i8 1)
%v3 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * null,
<4 x i32> %ptrs_3, <4 x double> %vecmask_3, i8 1)
%v4 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * null,
<4 x i32> %ptrs_4, <4 x double> %vecmask_4, i8 1)
assemble_4s(double, v, v1, v2, v3, v4)
ret <16 x double> %v
}
define <16 x double> @__gather64_double(<16 x i64> %ptrs,
<16 x i32> %mask32) nounwind readonly alwaysinline {
%vecmask64 = sext <16 x i32> %mask32 to <16 x i64>
%vecmask = bitcast <16 x i64> %vecmask64 to <16 x double>
extract_4s(i64, ptrs)
extract_4s(double, vecmask)
%v1 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * null,
<4 x i64> %ptrs_1, <4 x double> %vecmask_1, i8 1)
%v2 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * null,
<4 x i64> %ptrs_2, <4 x double> %vecmask_2, i8 1)
%v3 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * null,
<4 x i64> %ptrs_3, <4 x double> %vecmask_3, i8 1)
%v4 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * null,
<4 x i64> %ptrs_4, <4 x double> %vecmask_4, i8 1)
assemble_4s(double, v, v1, v2, v3, v4)
ret <16 x double> %v
}
')

builtins/target-avx2.ll Normal file
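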

@@ -0,0 +1,433 @@
;; Copyright (c) 2010-2012, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;; * Redistributions of source code must retain the above copyright
;; notice, this list of conditions and the following disclaimer.
;;
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;;
;; * Neither the name of Intel Corporation nor the names of its
;; contributors may be used to endorse or promote products derived from
;; this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ifelse(LLVM_VERSION, `LLVM_3_0', `',
LLVM_VERSION, `LLVM_3_1', `',
`define(`HAVE_GATHER', `1')')
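;; The llvm.x86.avx2.gather.* intrinsics are unavailable before LLVM 3.2, so
;; HAVE_GATHER stays undefined for LLVM 3.0/3.1 and those versions fall back
;; to the factored gather implementations.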
include(`target-avx.ll')
ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()',
LLVM_VERSION, `LLVM_3_1', `rdrand_decls()',
`rdrand_definition()')
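;; Likewise, only LLVM 3.2+ gets a real rdrand definition; LLVM 3.0/3.1 get
;; external declarations.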
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max
declare <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32>, <8 x i32>) nounwind readonly
declare <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32>, <8 x i32>) nounwind readonly
define <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
%m = call <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32> %0, <8 x i32> %1)
ret <8 x i32> %m
}
define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
%m = call <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32> %0, <8 x i32> %1)
ret <8 x i32> %m
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unsigned int min/max
declare <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32>, <8 x i32>) nounwind readonly
declare <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32>, <8 x i32>) nounwind readonly
define <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
%m = call <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32> %0, <8 x i32> %1)
ret <8 x i32> %m
}
define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
%m = call <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32> %0, <8 x i32> %1)
ret <8 x i32> %m
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float/half conversions
ifelse(LLVM_VERSION, `LLVM_3_0', `
;; nothing to define...
', `
declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
; 0 is round nearest even
declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone
define <8 x float> @__half_to_float_varying(<8 x i16> %v) nounwind readnone {
%r = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %v)
ret <8 x float> %r
}
define <8 x i16> @__float_to_half_varying(<8 x float> %v) nounwind readnone {
%r = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %v, i32 0)
ret <8 x i16> %r
}
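;; The uniform (scalar) conversions reuse the 8-wide instructions: put the
;; value in lane 0 of an 8-wide vector, convert, and extract lane 0.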
define float @__half_to_float_uniform(i16 %v) nounwind readnone {
%v1 = bitcast i16 %v to <1 x i16>
%vv = shufflevector <1 x i16> %v1, <1 x i16> undef,
<8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef>
%rv = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %vv)
%r = extractelement <8 x float> %rv, i32 0
ret float %r
}
define i16 @__float_to_half_uniform(float %v) nounwind readnone {
%v1 = bitcast float %v to <1 x float>
%vv = shufflevector <1 x float> %v1, <1 x float> undef,
<8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef>
; round to nearest even
%rv = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %vv, i32 0)
%r = extractelement <8 x i16> %rv, i32 0
ret i16 %r
}
')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather
declare void @llvm.trap() noreturn nounwind
define(`extract_4s', `
%$2_1 = shufflevector <8 x $1> %$2, <8 x $1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%$2_2 = shufflevector <8 x $1> %$2, <8 x $1> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
')
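;; extract_4s(type, name) splits the 8-wide vector %name into two 4-wide
;; halves, %name_1 (lanes 0-3) and %name_2 (lanes 4-7).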
ifelse(LLVM_VERSION, `LLVM_3_0', `
gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)',
LLVM_VERSION, `LLVM_3_1', `
gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)', `
gen_gather(i8)
gen_gather(i16)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int32 gathers
declare <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> %target, i8 * %ptr,
<8 x i32> %indices, <8 x i32> %mask, i8 %scale) readonly nounwind
declare <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> %target, i8 * %ptr,
<4 x i64> %indices, <4 x i32> %mask, i8 %scale) readonly nounwind
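;; 32-bit-offset gathers map directly onto the 8-lane d.d intrinsic; the q.d
;; form takes 64-bit indices but fills only 4 lanes, so 64-bit-offset gathers
;; are done as two 4-wide halves recombined with shufflevector.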
define <8 x i32> @__gather_base_offsets32_i32(i8 * %ptr,
i32 %scale, <8 x i32> %offsets,
<8 x i32> %vecmask) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%v = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * %ptr,
<8 x i32> %offsets, <8 x i32> %vecmask, i8 %scale8)
ret <8 x i32> %v
}
define <8 x i32> @__gather_base_offsets64_i32(i8 * %ptr,
i32 %scale, <8 x i64> %offsets,
<8 x i32> %vecmask) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
extract_4s(i32, vecmask)
extract_4s(i64, offsets)
%v1 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr,
<4 x i64> %offsets_1, <4 x i32> %vecmask_1, i8 %scale8)
%v2 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr,
<4 x i64> %offsets_2, <4 x i32> %vecmask_2, i8 %scale8)
%v = shufflevector <4 x i32> %v1, <4 x i32> %v2,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i32> %v
}
define <8 x i32> @__gather32_i32(<8 x i32> %ptrs,
<8 x i32> %vecmask) nounwind readonly alwaysinline {
%v = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * null,
<8 x i32> %ptrs, <8 x i32> %vecmask, i8 1)
ret <8 x i32> %v
}
define <8 x i32> @__gather64_i32(<8 x i64> %ptrs,
<8 x i32> %vecmask) nounwind readonly alwaysinline {
extract_4s(i64, ptrs)
extract_4s(i32, vecmask)
%v1 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null,
<4 x i64> %ptrs_1, <4 x i32> %vecmask_1, i8 1)
%v2 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null,
<4 x i64> %ptrs_2, <4 x i32> %vecmask_2, i8 1)
%v = shufflevector <4 x i32> %v1, <4 x i32> %v2,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i32> %v
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float gathers
declare <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> %target, i8 * %ptr,
<8 x i32> %indices, <8 x float> %mask, i8 %scale8) readonly nounwind
declare <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> %target, i8 * %ptr,
<4 x i64> %indices, <4 x float> %mask, i8 %scale8) readonly nounwind
define <8 x float> @__gather_base_offsets32_float(i8 * %ptr,
i32 %scale, <8 x i32> %offsets,
<8 x i32> %vecmask) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%mask = bitcast <8 x i32> %vecmask to <8 x float>
%v = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8 * %ptr,
<8 x i32> %offsets, <8 x float> %mask, i8 %scale8)
ret <8 x float> %v
}
define <8 x float> @__gather_base_offsets64_float(i8 * %ptr,
i32 %scale, <8 x i64> %offsets,
<8 x i32> %vecmask) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%mask = bitcast <8 x i32> %vecmask to <8 x float>
extract_4s(i64, offsets)
extract_4s(float, mask)
%v1 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * %ptr,
<4 x i64> %offsets_1, <4 x float> %mask_1, i8 %scale8)
%v2 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * %ptr,
<4 x i64> %offsets_2, <4 x float> %mask_2, i8 %scale8)
%v = shufflevector <4 x float> %v1, <4 x float> %v2,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x float> %v
}
define <8 x float> @__gather32_float(<8 x i32> %ptrs,
<8 x i32> %vecmask) nounwind readonly alwaysinline {
%mask = bitcast <8 x i32> %vecmask to <8 x float>
%v = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8 * null,
<8 x i32> %ptrs, <8 x float> %mask, i8 1)
ret <8 x float> %v
}
define <8 x float> @__gather64_float(<8 x i64> %ptrs,
<8 x i32> %vecmask) nounwind readonly alwaysinline {
%mask = bitcast <8 x i32> %vecmask to <8 x float>
extract_4s(i64, ptrs)
extract_4s(float, mask)
%v1 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * null,
<4 x i64> %ptrs_1, <4 x float> %mask_1, i8 1)
%v2 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * null,
<4 x i64> %ptrs_2, <4 x float> %mask_2, i8 1)
%v = shufflevector <4 x float> %v1, <4 x float> %v2,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x float> %v
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int64 gathers
declare <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> %target, i8 * %ptr,
<4 x i32> %indices, <4 x i64> %mask, i8 %scale) readonly nounwind
declare <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> %target, i8 * %ptr,
<4 x i64> %indices, <4 x i64> %mask, i8 %scale) readonly nounwind
define <8 x i64> @__gather_base_offsets32_i64(i8 * %ptr,
i32 %scale, <8 x i32> %offsets,
<8 x i32> %mask32) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%vecmask = sext <8 x i32> %mask32 to <8 x i64>
extract_4s(i32, offsets)
extract_4s(i64, vecmask)
%v1 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * %ptr,
<4 x i32> %offsets_1, <4 x i64> %vecmask_1, i8 %scale8)
%v2 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * %ptr,
<4 x i32> %offsets_2, <4 x i64> %vecmask_2, i8 %scale8)
%v = shufflevector <4 x i64> %v1, <4 x i64> %v2,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i64> %v
}
define <8 x i64> @__gather_base_offsets64_i64(i8 * %ptr,
i32 %scale, <8 x i64> %offsets,
<8 x i32> %mask32) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%vecmask = sext <8 x i32> %mask32 to <8 x i64>
extract_4s(i64, offsets)
extract_4s(i64, vecmask)
%v1 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * %ptr,
<4 x i64> %offsets_1, <4 x i64> %vecmask_1, i8 %scale8)
%v2 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * %ptr,
<4 x i64> %offsets_2, <4 x i64> %vecmask_2, i8 %scale8)
%v = shufflevector <4 x i64> %v1, <4 x i64> %v2,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i64> %v
}
define <8 x i64> @__gather32_i64(<8 x i32> %ptrs,
<8 x i32> %mask32) nounwind readonly alwaysinline {
%vecmask = sext <8 x i32> %mask32 to <8 x i64>
extract_4s(i32, ptrs)
extract_4s(i64, vecmask)
%v1 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * null,
<4 x i32> %ptrs_1, <4 x i64> %vecmask_1, i8 1)
%v2 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * null,
<4 x i32> %ptrs_2, <4 x i64> %vecmask_2, i8 1)
%v = shufflevector <4 x i64> %v1, <4 x i64> %v2,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i64> %v
}
define <8 x i64> @__gather64_i64(<8 x i64> %ptrs,
<8 x i32> %mask32) nounwind readonly alwaysinline {
%vecmask = sext <8 x i32> %mask32 to <8 x i64>
extract_4s(i64, ptrs)
extract_4s(i64, vecmask)
%v1 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * null,
<4 x i64> %ptrs_1, <4 x i64> %vecmask_1, i8 1)
%v2 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * null,
<4 x i64> %ptrs_2, <4 x i64> %vecmask_2, i8 1)
%v = shufflevector <4 x i64> %v1, <4 x i64> %v2,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i64> %v
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double gathers
declare <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> %target, i8 * %ptr,
<4 x i64> %indices, <4 x double> %mask, i8 %scale) readonly nounwind
declare <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> %target, i8 * %ptr,
<4 x i32> %indices, <4 x double> %mask, i8 %scale) readonly nounwind
define <8 x double> @__gather_base_offsets32_double(i8 * %ptr,
i32 %scale, <8 x i32> %offsets,
<8 x i32> %mask32) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%vecmask64 = sext <8 x i32> %mask32 to <8 x i64>
%vecmask = bitcast <8 x i64> %vecmask64 to <8 x double>
extract_4s(i32, offsets)
extract_4s(double, vecmask)
%v1 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * %ptr,
<4 x i32> %offsets_1, <4 x double> %vecmask_1, i8 %scale8)
%v2 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * %ptr,
<4 x i32> %offsets_2, <4 x double> %vecmask_2, i8 %scale8)
%v = shufflevector <4 x double> %v1, <4 x double> %v2,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x double> %v
}
define <8 x double> @__gather_base_offsets64_double(i8 * %ptr,
i32 %scale, <8 x i64> %offsets,
<8 x i32> %mask32) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%vecmask64 = sext <8 x i32> %mask32 to <8 x i64>
%vecmask = bitcast <8 x i64> %vecmask64 to <8 x double>
extract_4s(i64, offsets)
extract_4s(double, vecmask)
%v1 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * %ptr,
<4 x i64> %offsets_1, <4 x double> %vecmask_1, i8 %scale8)
%v2 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * %ptr,
<4 x i64> %offsets_2, <4 x double> %vecmask_2, i8 %scale8)
%v = shufflevector <4 x double> %v1, <4 x double> %v2,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x double> %v
}
define <8 x double> @__gather32_double(<8 x i32> %ptrs,
<8 x i32> %mask32) nounwind readonly alwaysinline {
%vecmask64 = sext <8 x i32> %mask32 to <8 x i64>
%vecmask = bitcast <8 x i64> %vecmask64 to <8 x double>
extract_4s(i32, ptrs)
extract_4s(double, vecmask)
%v1 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * null,
<4 x i32> %ptrs_1, <4 x double> %vecmask_1, i8 1)
%v2 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * null,
<4 x i32> %ptrs_2, <4 x double> %vecmask_2, i8 1)
%v = shufflevector <4 x double> %v1, <4 x double> %v2,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x double> %v
}
define <8 x double> @__gather64_double(<8 x i64> %ptrs,
<8 x i32> %mask32) nounwind readonly alwaysinline {
%vecmask64 = sext <8 x i32> %mask32 to <8 x i64>
%vecmask = bitcast <8 x i64> %vecmask64 to <8 x double>
extract_4s(i64, ptrs)
extract_4s(double, vecmask)
%v1 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * null,
<4 x i64> %ptrs_1, <4 x double> %vecmask_1, i8 1)
%v2 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * null,
<4 x i64> %ptrs_2, <4 x double> %vecmask_2, i8 1)
%v = shufflevector <4 x double> %v1, <4 x double> %v2,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x double> %v
}
')


@@ -0,0 +1,993 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Define the standard library builtins for the NOVEC target
define(`MASK',`i32')
define(`WIDTH',`1')
include(`util.m4')
; Define some basics for a 1-wide target
stdlib_core()
packed_load_and_store()
scans()
int64minmax()
aossoa()
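;; With WIDTH=1 every varying value is a <1 x ...> vector, so the "vector"
;; operations below reduce to scalar operations on the single lane.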
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store
gen_masked_store(i8)
gen_masked_store(i16)
gen_masked_store(i32)
gen_masked_store(i64)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts
masked_load(i8, 1)
masked_load(i16, 2)
masked_load(i32, 4)
masked_load(float, 4)
masked_load(i64, 8)
masked_load(double, 8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter
; define these with the macros from util.m4
gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)
gen_scatter(i8)
gen_scatter(i16)
gen_scatter(i32)
gen_scatter(float)
gen_scatter(i64)
gen_scatter(double)
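;; These macros expand to generic per-lane gather/scatter code; with a single
;; lane there is no hardware gather to exploit.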
define <1 x i8> @__vselect_i8(<1 x i8>, <1 x i8> ,
<1 x i32> %mask) nounwind readnone alwaysinline {
; %mv = trunc <1 x i32> %mask to <1 x i8>
; %notmask = xor <1 x i8> %mv, <i8 -1>
; %cleared_old = and <1 x i8> %0, %notmask
; %masked_new = and <1 x i8> %1, %mv
; %new = or <1 x i8> %cleared_old, %masked_new
; ret <1 x i8> %new
; not doing this the easy way because of problems with LLVM's scalarizer
; %cmp = icmp eq <1 x i32> %mask, <i32 0>
; %sel = select <1 x i1> %cmp, <1 x i8> %0, <1 x i8> %1
%m = extractelement <1 x i32> %mask, i32 0
%cmp = icmp eq i32 %m, 0
%d0 = extractelement <1 x i8> %0, i32 0
%d1 = extractelement <1 x i8> %1, i32 0
%sel = select i1 %cmp, i8 %d0, i8 %d1
%r = insertelement <1 x i8> undef, i8 %sel, i32 0
ret <1 x i8> %r
}
define <1 x i16> @__vselect_i16(<1 x i16>, <1 x i16> ,
<1 x i32> %mask) nounwind readnone alwaysinline {
; %mv = trunc <1 x i32> %mask to <1 x i16>
; %notmask = xor <1 x i16> %mv, <i16 -1>
; %cleared_old = and <1 x i16> %0, %notmask
; %masked_new = and <1 x i16> %1, %mv
; %new = or <1 x i16> %cleared_old, %masked_new
; ret <1 x i16> %new
; %cmp = icmp eq <1 x i32> %mask, <i32 0>
; %sel = select <1 x i1> %cmp, <1 x i16> %0, <1 x i16> %1
%m = extractelement <1 x i32> %mask, i32 0
%cmp = icmp eq i32 %m, 0
%d0 = extractelement <1 x i16> %0, i32 0
%d1 = extractelement <1 x i16> %1, i32 0
%sel = select i1 %cmp, i16 %d0, i16 %d1
%r = insertelement <1 x i16> undef, i16 %sel, i32 0
ret <1 x i16> %r
; ret <1 x i16> %sel
}
define <1 x i32> @__vselect_i32(<1 x i32>, <1 x i32> ,
<1 x i32> %mask) nounwind readnone alwaysinline {
; %notmask = xor <1 x i32> %mask, <i32 -1>
; %cleared_old = and <1 x i32> %0, %notmask
; %masked_new = and <1 x i32> %1, %mask
; %new = or <1 x i32> %cleared_old, %masked_new
; ret <1 x i32> %new
; %cmp = icmp eq <1 x i32> %mask, <i32 0>
; %sel = select <1 x i1> %cmp, <1 x i32> %0, <1 x i32> %1
; ret <1 x i32> %sel
%m = extractelement <1 x i32> %mask, i32 0
%cmp = icmp eq i32 %m, 0
%d0 = extractelement <1 x i32> %0, i32 0
%d1 = extractelement <1 x i32> %1, i32 0
%sel = select i1 %cmp, i32 %d0, i32 %d1
%r = insertelement <1 x i32> undef, i32 %sel, i32 0
ret <1 x i32> %r
}
define <1 x i64> @__vselect_i64(<1 x i64>, <1 x i64> ,
<1 x i32> %mask) nounwind readnone alwaysinline {
; %newmask = zext <1 x i32> %mask to <1 x i64>
; %notmask = xor <1 x i64> %newmask, <i64 -1>
; %cleared_old = and <1 x i64> %0, %notmask
; %masked_new = and <1 x i64> %1, %newmask
; %new = or <1 x i64> %cleared_old, %masked_new
; ret <1 x i64> %new
; %cmp = icmp eq <1 x i32> %mask, <i32 0>
; %sel = select <1 x i1> %cmp, <1 x i64> %0, <1 x i64> %1
; ret <1 x i64> %sel
%m = extractelement <1 x i32> %mask, i32 0
%cmp = icmp eq i32 %m, 0
%d0 = extractelement <1 x i64> %0, i32 0
%d1 = extractelement <1 x i64> %1, i32 0
%sel = select i1 %cmp, i64 %d0, i64 %d1
%r = insertelement <1 x i64> undef, i64 %sel, i32 0
ret <1 x i64> %r
}
define <1 x float> @__vselect_float(<1 x float>, <1 x float>,
<1 x i32> %mask) nounwind readnone alwaysinline {
; %v0 = bitcast <1 x float> %0 to <1 x i32>
; %v1 = bitcast <1 x float> %1 to <1 x i32>
; %r = call <1 x i32> @__vselect_i32(<1 x i32> %v0, <1 x i32> %v1, <1 x i32> %mask)
; %rf = bitcast <1 x i32> %r to <1 x float>
; ret <1 x float> %rf
; %cmp = icmp eq <1 x i32> %mask, <i32 0>
; %sel = select <1 x i1> %cmp, <1 x float> %0, <1 x float> %1
; ret <1 x float> %sel
%m = extractelement <1 x i32> %mask, i32 0
%cmp = icmp eq i32 %m, 0
%d0 = extractelement <1 x float> %0, i32 0
%d1 = extractelement <1 x float> %1, i32 0
%sel = select i1 %cmp, float %d0, float %d1
%r = insertelement <1 x float> undef, float %sel, i32 0
ret <1 x float> %r
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store
define void @__masked_store_blend_i8(<1 x i8>* nocapture, <1 x i8>,
<1 x i32> %mask) nounwind alwaysinline {
%val = load <1 x i8> * %0, align 4
%newval = call <1 x i8> @__vselect_i8(<1 x i8> %val, <1 x i8> %1, <1 x i32> %mask)
store <1 x i8> %newval, <1 x i8> * %0, align 4
ret void
}
define void @__masked_store_blend_i16(<1 x i16>* nocapture, <1 x i16>,
<1 x i32> %mask) nounwind alwaysinline {
%val = load <1 x i16> * %0, align 4
%newval = call <1 x i16> @__vselect_i16(<1 x i16> %val, <1 x i16> %1, <1 x i32> %mask)
store <1 x i16> %newval, <1 x i16> * %0, align 4
ret void
}
define void @__masked_store_blend_i32(<1 x i32>* nocapture, <1 x i32>,
<1 x i32> %mask) nounwind alwaysinline {
%val = load <1 x i32> * %0, align 4
%newval = call <1 x i32> @__vselect_i32(<1 x i32> %val, <1 x i32> %1, <1 x i32> %mask)
store <1 x i32> %newval, <1 x i32> * %0, align 4
ret void
}
define void @__masked_store_blend_i64(<1 x i64>* nocapture, <1 x i64>,
<1 x i32> %mask) nounwind alwaysinline {
%val = load <1 x i64> * %0, align 4
%newval = call <1 x i64> @__vselect_i64(<1 x i64> %val, <1 x i64> %1, <1 x i32> %mask)
store <1 x i64> %newval, <1 x i64> * %0, align 4
ret void
}
masked_store_float_double()
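;; The mask convention keeps true/false in the sign bit, so movmsk/any/all/
;; none below test bit 31 of the single mask lane.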
define i64 @__movmsk(<1 x i32>) nounwind readnone alwaysinline {
%item = extractelement <1 x i32> %0, i32 0
%v = lshr i32 %item, 31
%v64 = zext i32 %v to i64
ret i64 %v64
}
define i1 @__any(<1 x i32>) nounwind readnone alwaysinline {
%item = extractelement <1 x i32> %0, i32 0
%v = lshr i32 %item, 31
%cmp = icmp ne i32 %v, 0
ret i1 %cmp
}
define i1 @__all(<1 x i32>) nounwind readnone alwaysinline {
%item = extractelement <1 x i32> %0, i32 0
%v = lshr i32 %item, 31
%cmp = icmp eq i32 %v, 1
ret i1 %cmp
}
define i1 @__none(<1 x i32>) nounwind readnone alwaysinline {
%item = extractelement <1 x i32> %0, i32 0
%v = lshr i32 %item, 31
%cmp = icmp eq i32 %v, 0
ret i1 %cmp
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding
;;
;; There are no rounding instructions in SSE2 (where this code originates),
;; so we have to emulate the functionality with multiple instructions...
; The code for __round_* is the result of compiling the following source
; code.
;
; export float Round(float x) {
; unsigned int sign = signbits(x);
; unsigned int ix = intbits(x);
; ix ^= sign;
; x = floatbits(ix);
; x += 0x1.0p23f;
; x -= 0x1.0p23f;
; ix = intbits(x);
; ix ^= sign;
; x = floatbits(ix);
; return x;
;}
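; Adding and then subtracting 2^23 (8.388608e+06) forces rounding to an
; integer, since a float's 23-bit mantissa holds no fractional bits at that
; magnitude; the sign bit is cleared first and restored afterwards so the
; trick also works for negative inputs.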
define <1 x float> @__round_varying_float(<1 x float>) nounwind readonly alwaysinline {
%float_to_int_bitcast.i.i.i.i = bitcast <1 x float> %0 to <1 x i32>
%bitop.i.i = and <1 x i32> %float_to_int_bitcast.i.i.i.i, <i32 -2147483648>
%bitop.i = xor <1 x i32> %float_to_int_bitcast.i.i.i.i, %bitop.i.i
%int_to_float_bitcast.i.i40.i = bitcast <1 x i32> %bitop.i to <1 x float>
%binop.i = fadd <1 x float> %int_to_float_bitcast.i.i40.i, <float 8.388608e+06>
%binop21.i = fadd <1 x float> %binop.i, <float -8.388608e+06>
%float_to_int_bitcast.i.i.i = bitcast <1 x float> %binop21.i to <1 x i32>
%bitop31.i = xor <1 x i32> %float_to_int_bitcast.i.i.i, %bitop.i.i
%int_to_float_bitcast.i.i.i = bitcast <1 x i32> %bitop31.i to <1 x float>
ret <1 x float> %int_to_float_bitcast.i.i.i
}
;; Similarly, for implementations of the __floor* functions below, we have the
;; bitcode from compiling the following source code...
;export float Floor(float x) {
; float y = Round(x);
; unsigned int cmp = y > x ? 0xffffffff : 0;
; float delta = -1.f;
; unsigned int idelta = intbits(delta);
; idelta &= cmp;
; delta = floatbits(idelta);
; return y + delta;
;}
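; -1082130432 is 0xBF800000, the bit pattern of -1.0f; ANDing it with the
; all-ones/all-zeros comparison mask yields a delta of -1.0 or 0.0.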
define <1 x float> @__floor_varying_float(<1 x float>) nounwind readonly alwaysinline {
%calltmp.i = tail call <1 x float> @__round_varying_float(<1 x float> %0) nounwind
%bincmp.i = fcmp ogt <1 x float> %calltmp.i, %0
%val_to_boolvec32.i = sext <1 x i1> %bincmp.i to <1 x i32>
%bitop.i = and <1 x i32> %val_to_boolvec32.i, <i32 -1082130432>
%int_to_float_bitcast.i.i.i = bitcast <1 x i32> %bitop.i to <1 x float>
%binop.i = fadd <1 x float> %calltmp.i, %int_to_float_bitcast.i.i.i
ret <1 x float> %binop.i
}
;; And here is the code we compiled to get the __ceil* functions below
;
;export uniform float Ceil(uniform float x) {
; uniform float y = Round(x);
; uniform int yltx = y < x ? 0xffffffff : 0;
; uniform float delta = 1.f;
; uniform int idelta = intbits(delta);
; idelta &= yltx;
; delta = floatbits(idelta);
; return y + delta;
;}
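; 1065353216 is 0x3F800000, the bit pattern of 1.0f, selected the same way.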
define <1 x float> @__ceil_varying_float(<1 x float>) nounwind readonly alwaysinline {
%calltmp.i = tail call <1 x float> @__round_varying_float(<1 x float> %0) nounwind
%bincmp.i = fcmp olt <1 x float> %calltmp.i, %0
%val_to_boolvec32.i = sext <1 x i1> %bincmp.i to <1 x i32>
%bitop.i = and <1 x i32> %val_to_boolvec32.i, <i32 1065353216>
%int_to_float_bitcast.i.i.i = bitcast <1 x i32> %bitop.i to <1 x float>
%binop.i = fadd <1 x float> %calltmp.i, %int_to_float_bitcast.i.i.i
ret <1 x float> %binop.i
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding doubles
; expecting math lib to provide this
declare double @ceil (double) nounwind readnone
declare double @floor (double) nounwind readnone
declare double @round (double) nounwind readnone
;declare float @llvm.sqrt.f32(float %Val)
declare double @llvm.sqrt.f64(double %Val)
declare float @llvm.sin.f32(float %Val)
declare float @llvm.asin.f32(float %Val)
declare float @llvm.cos.f32(float %Val)
declare float @llvm.sqrt.f32(float %Val)
declare float @llvm.exp.f32(float %Val)
declare float @llvm.log.f32(float %Val)
declare float @llvm.pow.f32(float %f, float %e)
;; stuff that could be in builtins ...
define(`unary1to1', `
%v_0 = extractelement <1 x $1> %0, i32 0
%r_0 = call $1 $2($1 %v_0)
%ret_0 = insertelement <1 x $1> undef, $1 %r_0, i32 0
ret <1 x $1> %ret_0
')
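;; unary1to1(type, fn) applies the scalar function fn to the single lane of a
;; <1 x type> argument and rewraps the result in a 1-wide vector.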
;; dummy 1-wide vector ops
define void
@__aos_to_soa4_float1(<1 x float> %v0, <1 x float> %v1, <1 x float> %v2,
<1 x float> %v3, <1 x float> * noalias %out0,
<1 x float> * noalias %out1, <1 x float> * noalias %out2,
<1 x float> * noalias %out3) nounwind alwaysinline {
store <1 x float> %v0, <1 x float > * %out0
store <1 x float> %v1, <1 x float > * %out1
store <1 x float> %v2, <1 x float > * %out2
store <1 x float> %v3, <1 x float > * %out3
ret void
}
define void
@__soa_to_aos4_float1(<1 x float> %v0, <1 x float> %v1, <1 x float> %v2,
<1 x float> %v3, <1 x float> * noalias %out0,
<1 x float> * noalias %out1, <1 x float> * noalias %out2,
<1 x float> * noalias %out3) nounwind alwaysinline {
call void @__aos_to_soa4_float1(<1 x float> %v0, <1 x float> %v1,
<1 x float> %v2, <1 x float> %v3, <1 x float> * %out0,
<1 x float> * %out1, <1 x float> * %out2, <1 x float> * %out3)
ret void
}
define void
@__aos_to_soa3_float1(<1 x float> %v0, <1 x float> %v1,
<1 x float> %v2, <1 x float> * %out0, <1 x float> * %out1,
<1 x float> * %out2) {
store <1 x float> %v0, <1 x float > * %out0
store <1 x float> %v1, <1 x float > * %out1
store <1 x float> %v2, <1 x float > * %out2
ret void
}
define void
@__soa_to_aos3_float1(<1 x float> %v0, <1 x float> %v1,
<1 x float> %v2, <1 x float> * %out0, <1 x float> * %out1,
<1 x float> * %out2) {
call void @__aos_to_soa3_float1(<1 x float> %v0, <1 x float> %v1,
<1 x float> %v2, <1 x float> * %out0, <1 x float> * %out1,
<1 x float> * %out2)
ret void
}
;; end builtins
define <1 x double> @__round_varying_double(<1 x double>) nounwind readonly alwaysinline {
unary1to1(double, @round)
}
define <1 x double> @__floor_varying_double(<1 x double>) nounwind readonly alwaysinline {
unary1to1(double, @floor)
}
define <1 x double> @__ceil_varying_double(<1 x double>) nounwind readonly alwaysinline {
unary1to1(double, @ceil)
}
; To do vector integer min and max, we do the vector compare and then sign
; extend the i1 vector result to an i32 mask. The __vselect does the
; rest...
define <1 x i32> @__min_varying_int32(<1 x i32>, <1 x i32>) nounwind readonly alwaysinline {
%c = icmp slt <1 x i32> %0, %1
%mask = sext <1 x i1> %c to <1 x i32>
%v = call <1 x i32> @__vselect_i32(<1 x i32> %1, <1 x i32> %0, <1 x i32> %mask)
ret <1 x i32> %v
}
define i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
%c = icmp slt i32 %0, %1
%r = select i1 %c, i32 %0, i32 %1
ret i32 %r
}
define <1 x i32> @__max_varying_int32(<1 x i32>, <1 x i32>) nounwind readonly alwaysinline {
%c = icmp sgt <1 x i32> %0, %1
%mask = sext <1 x i1> %c to <1 x i32>
%v = call <1 x i32> @__vselect_i32(<1 x i32> %1, <1 x i32> %0, <1 x i32> %mask)
ret <1 x i32> %v
}
define i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
%c = icmp sgt i32 %0, %1
%r = select i1 %c, i32 %0, i32 %1
ret i32 %r
}
; The functions for unsigned ints are similar, just with unsigned
; comparison functions...
define <1 x i32> @__min_varying_uint32(<1 x i32>, <1 x i32>) nounwind readonly alwaysinline {
%c = icmp ult <1 x i32> %0, %1
%mask = sext <1 x i1> %c to <1 x i32>
%v = call <1 x i32> @__vselect_i32(<1 x i32> %1, <1 x i32> %0, <1 x i32> %mask)
ret <1 x i32> %v
}
define i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
%c = icmp ult i32 %0, %1
%r = select i1 %c, i32 %0, i32 %1
ret i32 %r
}
define <1 x i32> @__max_varying_uint32(<1 x i32>, <1 x i32>) nounwind readonly alwaysinline {
%c = icmp ugt <1 x i32> %0, %1
%mask = sext <1 x i1> %c to <1 x i32>
%v = call <1 x i32> @__vselect_i32(<1 x i32> %1, <1 x i32> %0, <1 x i32> %mask)
ret <1 x i32> %v
}
define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
%c = icmp ugt i32 %0, %1
%r = select i1 %c, i32 %0, i32 %1
ret i32 %r
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; horizontal ops / reductions
declare i32 @llvm.ctpop.i32(i32) nounwind readnone
define i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
%call = call i32 @llvm.ctpop.i32(i32 %0)
ret i32 %call
}
declare i64 @llvm.ctpop.i64(i64) nounwind readnone
define i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
%call = call i64 @llvm.ctpop.i64(i64 %0)
ret i64 %call
}
define i8 @__reduce_add_int8(<1 x i8> %v) nounwind readonly alwaysinline {
%r = extractelement <1 x i8> %v, i32 0
ret i8 %r
}
define i16 @__reduce_add_int16(<1 x i16> %v) nounwind readonly alwaysinline {
%r = extractelement <1 x i16> %v, i32 0
ret i16 %r
}
define float @__reduce_add_float(<1 x float> %v) nounwind readonly alwaysinline {
%r = extractelement <1 x float> %v, i32 0
ret float %r
}
define float @__reduce_min_float(<1 x float>) nounwind readnone {
%r = extractelement <1 x float> %0, i32 0
ret float %r
}
define float @__reduce_max_float(<1 x float>) nounwind readnone {
%r = extractelement <1 x float> %0, i32 0
ret float %r
}
define i32 @__reduce_add_int32(<1 x i32> %v) nounwind readnone {
%r = extractelement <1 x i32> %v, i32 0
ret i32 %r
}
define i32 @__reduce_min_int32(<1 x i32>) nounwind readnone {
%r = extractelement <1 x i32> %0, i32 0
ret i32 %r
}
define i32 @__reduce_max_int32(<1 x i32>) nounwind readnone {
%r = extractelement <1 x i32> %0, i32 0
ret i32 %r
}
define i32 @__reduce_min_uint32(<1 x i32>) nounwind readnone {
%r = extractelement <1 x i32> %0, i32 0
ret i32 %r
}
define i32 @__reduce_max_uint32(<1 x i32>) nounwind readnone {
%r = extractelement <1 x i32> %0, i32 0
ret i32 %r
}
define double @__reduce_add_double(<1 x double>) nounwind readnone {
%m = extractelement <1 x double> %0, i32 0
ret double %m
}
define double @__reduce_min_double(<1 x double>) nounwind readnone {
%m = extractelement <1 x double> %0, i32 0
ret double %m
}
define double @__reduce_max_double(<1 x double>) nounwind readnone {
%m = extractelement <1 x double> %0, i32 0
ret double %m
}
define i64 @__reduce_add_int64(<1 x i64>) nounwind readnone {
%m = extractelement <1 x i64> %0, i32 0
ret i64 %m
}
define i64 @__reduce_min_int64(<1 x i64>) nounwind readnone {
%m = extractelement <1 x i64> %0, i32 0
ret i64 %m
}
define i64 @__reduce_max_int64(<1 x i64>) nounwind readnone {
%m = extractelement <1 x i64> %0, i32 0
ret i64 %m
}
define i64 @__reduce_min_uint64(<1 x i64>) nounwind readnone {
%m = extractelement <1 x i64> %0, i32 0
ret i64 %m
}
define i64 @__reduce_max_uint64(<1 x i64>) nounwind readnone {
%m = extractelement <1 x i64> %0, i32 0
ret i64 %m
}
define i1 @__reduce_equal_int32(<1 x i32> %vv, i32 * %samevalue,
<1 x i32> %mask) nounwind alwaysinline {
%v=extractelement <1 x i32> %vv, i32 0
store i32 %v, i32 * %samevalue
ret i1 true
}
define i1 @__reduce_equal_float(<1 x float> %vv, float * %samevalue,
<1 x i32> %mask) nounwind alwaysinline {
%v=extractelement <1 x float> %vv, i32 0
store float %v, float * %samevalue
ret i1 true
}
define i1 @__reduce_equal_int64(<1 x i64> %vv, i64 * %samevalue,
<1 x i32> %mask) nounwind alwaysinline {
%v=extractelement <1 x i64> %vv, i32 0
store i64 %v, i64 * %samevalue
ret i1 true
}
define i1 @__reduce_equal_double(<1 x double> %vv, double * %samevalue,
<1 x i32> %mask) nounwind alwaysinline {
%v=extractelement <1 x double> %vv, i32 0
store double %v, double * %samevalue
ret i1 true
}
; extracting/reinserting elements because I want to be able to remove vectors later on
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp
define <1 x float> @__rcp_varying_float(<1 x float>) nounwind readonly alwaysinline {
;%call = call <1 x float> @llvm.x86.sse.rcp.ps(<1 x float> %0)
; do one N-R iteration to improve precision
; float iv = __rcp_v(v);
; return iv * (2. - v * iv);
;%v_iv = fmul <1 x float> %0, %call
;%two_minus = fsub <1 x float> <float 2., float 2., float 2., float 2.>, %v_iv
;%iv_mul = fmul <1 x float> %call, %two_minus
;ret <1 x float> %iv_mul
%d = extractelement <1 x float> %0, i32 0
%r = fdiv float 1.,%d
%rv = insertelement <1 x float> undef, float %r, i32 0
ret <1 x float> %rv
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; sqrt
define <1 x float> @__sqrt_varying_float(<1 x float>) nounwind readonly alwaysinline {
;%call = call <1 x float> @llvm.x86.sse.sqrt.ps(<1 x float> %0)
;ret <1 x float> %call
%d = extractelement <1 x float> %0, i32 0
%r = call float @llvm.sqrt.f32(float %d)
%rv = insertelement <1 x float> undef, float %r, i32 0
ret <1 x float> %rv
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; rsqrt
define <1 x float> @__rsqrt_varying_float(<1 x float> %v) nounwind readonly alwaysinline {
; float is = __rsqrt_v(v);
;%is = call <1 x float> @llvm.x86.sse.rsqrt.ps(<1 x float> %v)
; Newton-Raphson iteration to improve precision
; return 0.5 * is * (3. - (v * is) * is);
;%v_is = fmul <1 x float> %v, %is
;%v_is_is = fmul <1 x float> %v_is, %is
;%three_sub = fsub <1 x float> <float 3., float 3., float 3., float 3.>, %v_is_is
;%is_mul = fmul <1 x float> %is, %three_sub
;%half_scale = fmul <1 x float> <float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
;ret <1 x float> %half_scale
%s = call <1 x float> @__sqrt_varying_float(<1 x float> %v)
%r = call <1 x float> @__rcp_varying_float(<1 x float> %s)
ret <1 x float> %r
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; svml stuff
declare <1 x float> @__svml_sind(<1 x float>) nounwind readnone alwaysinline
declare <1 x float> @__svml_asind(<1 x float>) nounwind readnone alwaysinline
declare <1 x float> @__svml_cosd(<1 x float>) nounwind readnone alwaysinline
declare void @__svml_sincosd(<1 x float>, <1 x float> *, <1 x float> *) nounwind readnone alwaysinline
declare <1 x float> @__svml_tand(<1 x float>) nounwind readnone alwaysinline
declare <1 x float> @__svml_atand(<1 x float>) nounwind readnone alwaysinline
declare <1 x float> @__svml_atan2d(<1 x float>, <1 x float>) nounwind readnone alwaysinline
declare <1 x float> @__svml_expd(<1 x float>) nounwind readnone alwaysinline
declare <1 x float> @__svml_logd(<1 x float>) nounwind readnone alwaysinline
declare <1 x float> @__svml_powd(<1 x float>, <1 x float>) nounwind readnone alwaysinline
define <1 x float> @__svml_sinf(<1 x float>) nounwind readnone alwaysinline {
;%ret = call <1 x float> @__svml_sinf4(<1 x float> %0)
;ret <1 x float> %ret
;%r = extractelement <1 x float> %0, i32 0
;%s = call float @llvm.sin.f32(float %r)
;%rv = insertelement <1 x float> undef, float %r, i32 0
;ret <1 x float> %rv
unary1to1(float,@llvm.sin.f32)
}
define <1 x float> @__svml_asinf(<1 x float>) nounwind readnone alwaysinline {
;%ret = call <1 x float> @__svml_asinf4(<1 x float> %0)
;ret <1 x float> %ret
;%r = extractelement <1 x float> %0, i32 0
;%s = call float @llvm.asin.f32(float %r)
;%rv = insertelement <1 x float> undef, float %r, i32 0
;ret <1 x float> %rv
unary1to1(float,@llvm.asin.f32)
}
define <1 x float> @__svml_cosf(<1 x float>) nounwind readnone alwaysinline {
;%ret = call <1 x float> @__svml_cosf4(<1 x float> %0)
;ret <1 x float> %ret
;%r = extractelement <1 x float> %0, i32 0
;%s = call float @llvm.cos.f32(float %r)
;%rv = insertelement <1 x float> undef, float %r, i32 0
;ret <1 x float> %rv
unary1to1(float, @llvm.cos.f32)
}
define void @__svml_sincosf(<1 x float>, <1 x float> *, <1 x float> *) nounwind readnone alwaysinline {
; %s = call <1 x float> @__svml_sincosf4(<1 x float> * %2, <1 x float> %0)
; store <1 x float> %s, <1 x float> * %1
; ret void
%sin = call <1 x float> @__svml_sinf(<1 x float> %0)
%cos = call <1 x float> @__svml_cosf(<1 x float> %0)
store <1 x float> %sin, <1 x float> * %1
store <1 x float> %cos, <1 x float> * %2
ret void
}
define <1 x float> @__svml_tanf(<1 x float>) nounwind readnone alwaysinline {
;%ret = call <1 x float> @__svml_tanf4(<1 x float> %0)
;ret <1 x float> %ret
;%r = extractelement <1 x float> %0, i32 0
;%s = call float @llvm_tan_f32(float %r)
;%rv = insertelement <1 x float> undef, float %r, i32 0
;ret <1 x float> %rv
;unary1to1(float, @llvm.tan.f32)
; UNSUPPORTED!
ret <1 x float > %0
}
define <1 x float> @__svml_atanf(<1 x float>) nounwind readnone alwaysinline {
; %ret = call <1 x float> @__svml_atanf4(<1 x float> %0)
; ret <1 x float> %ret
;%r = extractelement <1 x float> %0, i32 0
;%s = call float @llvm_atan_f32(float %r)
;%rv = insertelement <1 x float> undef, float %r, i32 0
;ret <1 x float> %rv
;unary1to1(float, @llvm.atan.f32)
;UNSUPPORTED!
ret <1 x float > %0
}
define <1 x float> @__svml_atan2f(<1 x float>, <1 x float>) nounwind readnone alwaysinline {
;%ret = call <1 x float> @__svml_atan2f4(<1 x float> %0, <1 x float> %1)
;ret <1 x float> %ret
;%y = extractelement <1 x float> %0, i32 0
;%x = extractelement <1 x float> %1, i32 0
;%q = fdiv float %y, %x
;%a = call float @llvm.atan.f32 (float %q)
;%rv = insertelement <1 x float> undef, float %a, i32 0
;ret <1 x float> %rv
; UNSUPPORTED!
ret <1 x float > %0
}
define <1 x float> @__svml_expf(<1 x float>) nounwind readnone alwaysinline {
;%ret = call <1 x float> @__svml_expf4(<1 x float> %0)
;ret <1 x float> %ret
unary1to1(float, @llvm.exp.f32)
}
define <1 x float> @__svml_logf(<1 x float>) nounwind readnone alwaysinline {
;%ret = call <1 x float> @__svml_logf4(<1 x float> %0)
;ret <1 x float> %ret
unary1to1(float, @llvm.log.f32)
}
define <1 x float> @__svml_powf(<1 x float>, <1 x float>) nounwind readnone alwaysinline {
;%ret = call <1 x float> @__svml_powf4(<1 x float> %0, <1 x float> %1)
;ret <1 x float> %ret
%r = extractelement <1 x float> %0, i32 0
%e = extractelement <1 x float> %1, i32 0
%s = call float @llvm.pow.f32(float %r,float %e)
%rv = insertelement <1 x float> undef, float %s, i32 0
ret <1 x float> %rv
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float min/max
define <1 x float> @__max_varying_float(<1 x float>, <1 x float>) nounwind readonly alwaysinline {
; %call = call <1 x float> @llvm.x86.sse.max.ps(<1 x float> %0, <1 x float> %1)
; ret <1 x float> %call
%a = extractelement <1 x float> %0, i32 0
%b = extractelement <1 x float> %1, i32 0
%d = fcmp ogt float %a, %b
%r = select i1 %d, float %a, float %b
%rv = insertelement <1 x float> undef, float %r, i32 0
ret <1 x float> %rv
}
define <1 x float> @__min_varying_float(<1 x float>, <1 x float>) nounwind readonly alwaysinline {
; %call = call <1 x float> @llvm.x86.sse.min.ps(<1 x float> %0, <1 x float> %1)
; ret <1 x float> %call
%a = extractelement <1 x float> %0, i32 0
%b = extractelement <1 x float> %1, i32 0
%d = fcmp olt float %a, %b
%r = select i1 %d, float %a, float %b
%rv = insertelement <1 x float> undef, float %r, i32 0
ret <1 x float> %rv
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision sqrt
;declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
define <1 x double> @__sqrt_varying_double(<1 x double>) nounwind alwaysinline {
;unary2to4(ret, double, @llvm.x86.sse2.sqrt.pd, %0)
;ret <1 x double> %ret
unary1to1(double, @llvm.sqrt.f64)
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision min/max
;declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
;declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
define <1 x double> @__min_varying_double(<1 x double>, <1 x double>) nounwind readnone {
;binary2to4(ret, double, @llvm.x86.sse2.min.pd, %0, %1)
;ret <1 x double> %ret
%a = extractelement <1 x double> %0, i32 0
%b = extractelement <1 x double> %1, i32 0
%d = fcmp olt double %a, %b
%r = select i1 %d, double %a, double %b
%rv = insertelement <1 x double> undef, double %r, i32 0
ret <1 x double> %rv
}
define <1 x double> @__max_varying_double(<1 x double>, <1 x double>) nounwind readnone {
;binary2to4(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
;ret <1 x double> %ret
%a = extractelement <1 x double> %0, i32 0
%b = extractelement <1 x double> %1, i32 0
%d = fcmp ogt double %a, %b
%r = select i1 %d, double %a, double %b
%rv = insertelement <1 x double> undef, double %r, i32 0
ret <1 x double> %rv
}
define float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
; uniform float iv = extract(__rcp_u(v), 0);
; return iv * (2. - v * iv);
%r = fdiv float 1.,%0
ret float %r
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding floats
define float @__round_uniform_float(float) nounwind readonly alwaysinline {
; No hardware rounding instruction is assumed on this target, so wrap the
; scalar in a 1-wide vector, round it with __round_varying_float above, and
; extract the result.
%v = insertelement<1 x float> undef, float %0, i32 0
%rv = call <1 x float> @__round_varying_float(<1 x float> %v)
%r=extractelement <1 x float> %rv, i32 0
ret float %r
}
define float @__floor_uniform_float(float) nounwind readonly alwaysinline {
%v = insertelement<1 x float> undef, float %0, i32 0
%rv = call <1 x float> @__floor_varying_float(<1 x float> %v)
%r=extractelement <1 x float> %rv, i32 0
ret float %r
}
define float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
%v = insertelement<1 x float> undef, float %0, i32 0
%rv = call <1 x float> @__ceil_varying_float(<1 x float> %v)
%r=extractelement <1 x float> %rv, i32 0
ret float %r
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding doubles
define double @__round_uniform_double(double) nounwind readonly alwaysinline {
%rs=call double @round(double %0)
ret double %rs
}
define double @__floor_uniform_double(double) nounwind readonly alwaysinline {
%rs = call double @floor(double %0)
ret double %rs
}
define double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
%rs = call double @ceil(double %0)
ret double %rs
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; sqrt
define float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
%ret = call float @llvm.sqrt.f32(float %0)
ret float %ret
}
define double @__sqrt_uniform_double(double) nounwind readonly alwaysinline {
%ret = call double @llvm.sqrt.f64(double %0)
ret double %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rsqrt
define float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline {
%s = call float @__sqrt_uniform_float(float %0)
%r = call float @__rcp_uniform_float(float %s)
ret float %r
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; fastmath
define void @__fastmath() nounwind alwaysinline {
; no-op
ret void
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float min/max
define float @__max_uniform_float(float, float) nounwind readonly alwaysinline {
%d = fcmp ogt float %0, %1
%r = select i1 %d, float %0, float %1
ret float %r
}
define float @__min_uniform_float(float, float) nounwind readonly alwaysinline {
%d = fcmp olt float %0, %1
%r = select i1 %d, float %0, float %1
ret float %r
}
define double @__max_uniform_double(double, double) nounwind readonly alwaysinline {
%d = fcmp ogt double %0, %1
%r = select i1 %d, double %0, double %1
ret double %r
}
define double @__min_uniform_double(double, double) nounwind readonly alwaysinline {
%d = fcmp olt double %0, %1
%r = select i1 %d, double %0, double %1
ret double %r
}
define_shuffles()
ctlztz()
define_prefetches()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines
declare float @__half_to_float_uniform(i16 %v) nounwind readnone
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int8/int16 builtins
define_avgs()


@@ -0,0 +1,33 @@
;; Copyright (c) 2010-2012, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;; * Redistributions of source code must retain the above copyright
;; notice, this list of conditions and the following disclaimer.
;;
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;;
;; * Neither the name of Intel Corporation nor the names of its
;; contributors may be used to endorse or promote products derived from
;; this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
define(`WIDTH',`32')
include(`target-generic-common.ll')


@@ -0,0 +1,33 @@
;; Copyright (c) 2010-2012, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;; * Redistributions of source code must retain the above copyright
;; notice, this list of conditions and the following disclaimer.
;;
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;;
;; * Neither the name of Intel Corporation nor the names of its
;; contributors may be used to endorse or promote products derived from
;; this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
define(`WIDTH',`64')
include(`target-generic-common.ll')


@@ -1,4 +1,4 @@
;; Copyright (c) 2010-2011, Intel Corporation
;; Copyright (c) 2010-2012, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
@@ -29,12 +29,18 @@
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128-v16:16:16-v32:32:32-v4:128:128";
define(`MASK',`i1')
define(`HAVE_GATHER',`1')
define(`HAVE_SCATTER',`1')
include(`util.m4')
stdlib_core()
scans()
reduce_equal(WIDTH)
rdrand_decls()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; broadcast/rotate/shuffle
@@ -46,6 +52,20 @@ declare <WIDTH x i16> @__smear_i16(i16) nounwind readnone
declare <WIDTH x i32> @__smear_i32(i32) nounwind readnone
declare <WIDTH x i64> @__smear_i64(i64) nounwind readnone
declare <WIDTH x float> @__setzero_float() nounwind readnone
declare <WIDTH x double> @__setzero_double() nounwind readnone
declare <WIDTH x i8> @__setzero_i8() nounwind readnone
declare <WIDTH x i16> @__setzero_i16() nounwind readnone
declare <WIDTH x i32> @__setzero_i32() nounwind readnone
declare <WIDTH x i64> @__setzero_i64() nounwind readnone
declare <WIDTH x float> @__undef_float() nounwind readnone
declare <WIDTH x double> @__undef_double() nounwind readnone
declare <WIDTH x i8> @__undef_i8() nounwind readnone
declare <WIDTH x i16> @__undef_i16() nounwind readnone
declare <WIDTH x i32> @__undef_i32() nounwind readnone
declare <WIDTH x i64> @__undef_i64() nounwind readnone
declare <WIDTH x float> @__broadcast_float(<WIDTH x float>, i32) nounwind readnone
declare <WIDTH x double> @__broadcast_double(<WIDTH x double>, i32) nounwind readnone
declare <WIDTH x i8> @__broadcast_i8(<WIDTH x i8>, i32) nounwind readnone
@@ -98,6 +118,14 @@ declare void @__aos_to_soa4_float(float * noalias %p, <WIDTH x float> * noalias
<WIDTH x float> * noalias %out2,
<WIDTH x float> * noalias %out3) nounwind
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines
declare float @__half_to_float_uniform(i16 %v) nounwind readnone
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; math
@@ -174,36 +202,34 @@ declare i64 @__count_trailing_zeros_i64(i64) nounwind readnone
declare i32 @__count_leading_zeros_i32(i32) nounwind readnone
declare i64 @__count_leading_zeros_i64(i64) nounwind readnone
;; svml
; FIXME: need either to wire these up to the 8-wide SVML entrypoints,
; or, use the macro to call the 4-wide ones twice with our 8-wide
; vectors...
declare <WIDTH x float> @__svml_sin(<WIDTH x float>)
declare <WIDTH x float> @__svml_cos(<WIDTH x float>)
declare void @__svml_sincos(<WIDTH x float>, <WIDTH x float> *, <WIDTH x float> *)
declare <WIDTH x float> @__svml_tan(<WIDTH x float>)
declare <WIDTH x float> @__svml_atan(<WIDTH x float>)
declare <WIDTH x float> @__svml_atan2(<WIDTH x float>, <WIDTH x float>)
declare <WIDTH x float> @__svml_exp(<WIDTH x float>)
declare <WIDTH x float> @__svml_log(<WIDTH x float>)
declare <WIDTH x float> @__svml_pow(<WIDTH x float>, <WIDTH x float>)
;; svml
include(`svml.m4')
svml_stubs(float,f,WIDTH)
svml_stubs(double,d,WIDTH)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; reductions
declare i32 @__movmsk(<WIDTH x i1>) nounwind readnone
declare i64 @__movmsk(<WIDTH x i1>) nounwind readnone
declare i1 @__any(<WIDTH x i1>) nounwind readnone
declare i1 @__all(<WIDTH x i1>) nounwind readnone
declare i1 @__none(<WIDTH x i1>) nounwind readnone
declare i16 @__reduce_add_int8(<WIDTH x i8>) nounwind readnone
declare i32 @__reduce_add_int16(<WIDTH x i16>) nounwind readnone
declare float @__reduce_add_float(<WIDTH x float>) nounwind readnone
declare float @__reduce_min_float(<WIDTH x float>) nounwind readnone
declare float @__reduce_max_float(<WIDTH x float>) nounwind readnone
declare i32 @__reduce_add_int32(<WIDTH x i32>) nounwind readnone
declare i64 @__reduce_add_int32(<WIDTH x i32>) nounwind readnone
declare i32 @__reduce_min_int32(<WIDTH x i32>) nounwind readnone
declare i32 @__reduce_max_int32(<WIDTH x i32>) nounwind readnone
declare i32 @__reduce_add_uint32(<WIDTH x i32>) nounwind readnone
declare i32 @__reduce_min_uint32(<WIDTH x i32>) nounwind readnone
declare i32 @__reduce_max_uint32(<WIDTH x i32>) nounwind readnone
@@ -214,82 +240,113 @@ declare double @__reduce_max_double(<WIDTH x double>) nounwind readnone
declare i64 @__reduce_add_int64(<WIDTH x i64>) nounwind readnone
declare i64 @__reduce_min_int64(<WIDTH x i64>) nounwind readnone
declare i64 @__reduce_max_int64(<WIDTH x i64>) nounwind readnone
declare i64 @__reduce_add_uint64(<WIDTH x i64>) nounwind readnone
declare i64 @__reduce_min_uint64(<WIDTH x i64>) nounwind readnone
declare i64 @__reduce_max_uint64(<WIDTH x i64>) nounwind readnone
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts
load_and_broadcast(WIDTH, i8, 8)
load_and_broadcast(WIDTH, i16, 16)
load_and_broadcast(WIDTH, i32, 32)
load_and_broadcast(WIDTH, i64, 64)
declare <WIDTH x i8> @__masked_load_8(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
declare <WIDTH x i16> @__masked_load_16(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
declare <WIDTH x i32> @__masked_load_32(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
declare <WIDTH x i64> @__masked_load_64(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
declare <WIDTH x i8> @__masked_load_i8(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
declare <WIDTH x i16> @__masked_load_i16(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
declare <WIDTH x i32> @__masked_load_i32(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
declare <WIDTH x float> @__masked_load_float(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
declare <WIDTH x i64> @__masked_load_i64(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
declare <WIDTH x double> @__masked_load_double(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
declare void @__masked_store_8(<WIDTH x i8>* nocapture, <WIDTH x i8>,
declare void @__masked_store_i8(<WIDTH x i8>* nocapture, <WIDTH x i8>,
<WIDTH x i1>) nounwind
declare void @__masked_store_16(<WIDTH x i16>* nocapture, <WIDTH x i16>,
<WIDTH x i1>) nounwind
declare void @__masked_store_32(<WIDTH x i32>* nocapture, <WIDTH x i32>,
<WIDTH x i1>) nounwind
declare void @__masked_store_64(<WIDTH x i64>* nocapture, <WIDTH x i64>,
<WIDTH x i1> %mask) nounwind
declare void @__masked_store_i16(<WIDTH x i16>* nocapture, <WIDTH x i16>,
<WIDTH x i1>) nounwind
declare void @__masked_store_i32(<WIDTH x i32>* nocapture, <WIDTH x i32>,
<WIDTH x i1>) nounwind
declare void @__masked_store_float(<WIDTH x float>* nocapture, <WIDTH x float>,
<WIDTH x i1>) nounwind
declare void @__masked_store_i64(<WIDTH x i64>* nocapture, <WIDTH x i64>,
<WIDTH x i1> %mask) nounwind
declare void @__masked_store_double(<WIDTH x double>* nocapture, <WIDTH x double>,
<WIDTH x i1> %mask) nounwind
define void @__masked_store_blend_8(<WIDTH x i8>* nocapture, <WIDTH x i8>,
<WIDTH x i1>) nounwind {
ifelse(LLVM_VERSION, `LLVM_3_0', `
declare void @__masked_store_blend_i8(<WIDTH x i8>* nocapture, <WIDTH x i8>,
<WIDTH x i1>) nounwind
declare void @__masked_store_blend_i16(<WIDTH x i16>* nocapture, <WIDTH x i16>,
<WIDTH x i1>) nounwind
declare void @__masked_store_blend_i32(<WIDTH x i32>* nocapture, <WIDTH x i32>,
<WIDTH x i1>) nounwind
declare void @__masked_store_blend_float(<WIDTH x float>* nocapture, <WIDTH x float>,
<WIDTH x i1>) nounwind
declare void @__masked_store_blend_i64(<WIDTH x i64>* nocapture, <WIDTH x i64>,
<WIDTH x i1> %mask) nounwind
declare void @__masked_store_blend_double(<WIDTH x double>* nocapture, <WIDTH x double>,
<WIDTH x i1> %mask) nounwind
', `
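;; A blend store is a read-modify-write: load the old contents, select
;; the new lanes where the mask is set, and store the merged vector back.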
define void @__masked_store_blend_i8(<WIDTH x i8>* nocapture, <WIDTH x i8>,
<WIDTH x i1>) nounwind alwaysinline {
%v = load <WIDTH x i8> * %0
%v1 = select <WIDTH x i1> %2, <WIDTH x i8> %1, <WIDTH x i8> %v
store <WIDTH x i8> %v1, <WIDTH x i8> * %0
ret void
}
define void @__masked_store_blend_16(<WIDTH x i16>* nocapture, <WIDTH x i16>,
<WIDTH x i1>) nounwind {
define void @__masked_store_blend_i16(<WIDTH x i16>* nocapture, <WIDTH x i16>,
<WIDTH x i1>) nounwind alwaysinline {
%v = load <WIDTH x i16> * %0
%v1 = select <WIDTH x i1> %2, <WIDTH x i16> %1, <WIDTH x i16> %v
store <WIDTH x i16> %v1, <WIDTH x i16> * %0
ret void
}
define void @__masked_store_blend_32(<WIDTH x i32>* nocapture, <WIDTH x i32>,
<WIDTH x i1>) nounwind {
define void @__masked_store_blend_i32(<WIDTH x i32>* nocapture, <WIDTH x i32>,
<WIDTH x i1>) nounwind alwaysinline {
%v = load <WIDTH x i32> * %0
%v1 = select <WIDTH x i1> %2, <WIDTH x i32> %1, <WIDTH x i32> %v
store <WIDTH x i32> %v1, <WIDTH x i32> * %0
ret void
}
define void @__masked_store_blend_64(<WIDTH x i64>* nocapture,
<WIDTH x i64>, <WIDTH x i1>) nounwind {
define void @__masked_store_blend_float(<WIDTH x float>* nocapture, <WIDTH x float>,
<WIDTH x i1>) nounwind alwaysinline {
%v = load <WIDTH x float> * %0
%v1 = select <WIDTH x i1> %2, <WIDTH x float> %1, <WIDTH x float> %v
store <WIDTH x float> %v1, <WIDTH x float> * %0
ret void
}
define void @__masked_store_blend_i64(<WIDTH x i64>* nocapture,
<WIDTH x i64>, <WIDTH x i1>) nounwind alwaysinline {
%v = load <WIDTH x i64> * %0
%v1 = select <WIDTH x i1> %2, <WIDTH x i64> %1, <WIDTH x i64> %v
store <WIDTH x i64> %v1, <WIDTH x i64> * %0
ret void
}
define void @__masked_store_blend_double(<WIDTH x double>* nocapture,
<WIDTH x double>, <WIDTH x i1>) nounwind alwaysinline {
%v = load <WIDTH x double> * %0
%v1 = select <WIDTH x i1> %2, <WIDTH x double> %1, <WIDTH x double> %v
store <WIDTH x double> %v1, <WIDTH x double> * %0
ret void
}
')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter
define(`gather_scatter', `
declare <WIDTH x $1> @__gather_base_offsets32_$1(i8 * nocapture, <WIDTH x i32>,
i32, <WIDTH x i1>) nounwind readonly
declare <WIDTH x $1> @__gather_base_offsets64_$1(i8 * nocapture, <WIDTH x i64>,
i32, <WIDTH x i1>) nounwind readonly
declare <WIDTH x $1> @__gather_base_offsets32_$1(i8 * nocapture, i32, <WIDTH x i32>,
<WIDTH x i1>) nounwind readonly
declare <WIDTH x $1> @__gather_base_offsets64_$1(i8 * nocapture, i32, <WIDTH x i64>,
<WIDTH x i1>) nounwind readonly
declare <WIDTH x $1> @__gather32_$1(<WIDTH x i32>,
<WIDTH x i1>) nounwind readonly
declare <WIDTH x $1> @__gather64_$1(<WIDTH x i64>,
<WIDTH x i1>) nounwind readonly
declare void @__scatter_base_offsets32_$1(i8* nocapture, <WIDTH x i32>,
i32, <WIDTH x $1>, <WIDTH x i1>) nounwind
declare void @__scatter_base_offsets64_$1(i8* nocapture, <WIDTH x i64>,
i32, <WIDTH x $1>, <WIDTH x i1>) nounwind
declare void @__scatter_base_offsets32_$1(i8* nocapture, i32, <WIDTH x i32>,
<WIDTH x $1>, <WIDTH x i1>) nounwind
declare void @__scatter_base_offsets64_$1(i8* nocapture, i32, <WIDTH x i64>,
<WIDTH x $1>, <WIDTH x i1>) nounwind
declare void @__scatter32_$1(<WIDTH x i32>, <WIDTH x $1>,
<WIDTH x i1>) nounwind
declare void @__scatter64_$1(<WIDTH x i64>, <WIDTH x $1>,
@@ -299,7 +356,9 @@ declare void @__scatter64_$1(<WIDTH x i64>, <WIDTH x $1>,
gather_scatter(i8)
gather_scatter(i16)
gather_scatter(i32)
gather_scatter(float)
gather_scatter(i64)
gather_scatter(double)
declare i32 @__packed_load_active(i32 * nocapture, <WIDTH x i32> * nocapture,
<WIDTH x i1>) nounwind
@@ -315,3 +374,8 @@ declare void @__prefetch_read_uniform_2(i8 * nocapture) nounwind
declare void @__prefetch_read_uniform_3(i8 * nocapture) nounwind
declare void @__prefetch_read_uniform_nt(i8 * nocapture) nounwind
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int8/int16 builtins
define_avgs()

builtins/target-neon-16.ll (new file, 517 lines)

@@ -0,0 +1,517 @@
;;
;; target-neon-16.ll
;;
;; Copyright(c) 2013 Google, Inc.
;;
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;; * Redistributions of source code must retain the above copyright
;; notice, this list of conditions and the following disclaimer.
;;
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;;
;; * Neither the name of Matt Pharr nor the names of its
;; contributors may be used to endorse or promote products derived from
;; this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
define(`WIDTH',`8')
define(`MASK',`i16')
include(`util.m4')
include(`target-neon-common.ll')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines
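;; NEON's half<->float conversions (vcvthf2fp/vcvtfp2hf) operate on four
;; lanes at a time, so these 8-wide versions apply them to each 4-wide half.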
define <8 x float> @__half_to_float_varying(<8 x i16> %v) nounwind readnone {
unary4to8conv(r, i16, float, @llvm.arm.neon.vcvthf2fp, %v)
ret <8 x float> %r
}
define <8 x i16> @__float_to_half_varying(<8 x float> %v) nounwind readnone {
unary4to8conv(r, float, i16, @llvm.arm.neon.vcvtfp2hf, %v)
ret <8 x i16> %r
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; math
;; round/floor/ceil
;; FIXME: grabbed these from the sse2 target, which does not have native
;; instructions for these. Is there a better approach for NEON?
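;; The trick below rounds via the float format: for |x| < 2^23, adding
;; and then subtracting 8388608.0 (2^23) leaves the value rounded to the
;; nearest integer, since floats in [2^23, 2^24) carry no fraction bits.
;; The sign bit is masked off first and xor'ed back in at the end so the
;; same path handles negative inputs.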
define <8 x float> @__round_varying_float(<8 x float>) nounwind readonly alwaysinline {
%float_to_int_bitcast.i.i.i.i = bitcast <8 x float> %0 to <8 x i32>
%bitop.i.i = and <8 x i32> %float_to_int_bitcast.i.i.i.i,
<i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648,
i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
%bitop.i = xor <8 x i32> %float_to_int_bitcast.i.i.i.i, %bitop.i.i
%int_to_float_bitcast.i.i40.i = bitcast <8 x i32> %bitop.i to <8 x float>
%binop.i = fadd <8 x float> %int_to_float_bitcast.i.i40.i,
<float 8.388608e+06, float 8.388608e+06, float 8.388608e+06, float 8.388608e+06,
float 8.388608e+06, float 8.388608e+06, float 8.388608e+06, float 8.388608e+06>
%binop21.i = fadd <8 x float> %binop.i,
<float -8.388608e+06, float -8.388608e+06, float -8.388608e+06, float -8.388608e+06,
float -8.388608e+06, float -8.388608e+06, float -8.388608e+06, float -8.388608e+06>
%float_to_int_bitcast.i.i.i = bitcast <8 x float> %binop21.i to <8 x i32>
%bitop31.i = xor <8 x i32> %float_to_int_bitcast.i.i.i, %bitop.i.i
%int_to_float_bitcast.i.i.i = bitcast <8 x i32> %bitop31.i to <8 x float>
ret <8 x float> %int_to_float_bitcast.i.i.i
}
define <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline {
%calltmp.i = tail call <8 x float> @__round_varying_float(<8 x float> %0) nounwind
%bincmp.i = fcmp ogt <8 x float> %calltmp.i, %0
%val_to_boolvec32.i = sext <8 x i1> %bincmp.i to <8 x i32>
%bitop.i = and <8 x i32> %val_to_boolvec32.i,
<i32 -1082130432, i32 -1082130432, i32 -1082130432, i32 -1082130432,
i32 -1082130432, i32 -1082130432, i32 -1082130432, i32 -1082130432>
%int_to_float_bitcast.i.i.i = bitcast <8 x i32> %bitop.i to <8 x float>
%binop.i = fadd <8 x float> %calltmp.i, %int_to_float_bitcast.i.i.i
ret <8 x float> %binop.i
}
define <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline {
%calltmp.i = tail call <8 x float> @__round_varying_float(<8 x float> %0) nounwind
%bincmp.i = fcmp olt <8 x float> %calltmp.i, %0
%val_to_boolvec32.i = sext <8 x i1> %bincmp.i to <8 x i32>
%bitop.i = and <8 x i32> %val_to_boolvec32.i,
<i32 1065353216, i32 1065353216, i32 1065353216, i32 1065353216,
i32 1065353216, i32 1065353216, i32 1065353216, i32 1065353216>
%int_to_float_bitcast.i.i.i = bitcast <8 x i32> %bitop.i to <8 x float>
%binop.i = fadd <8 x float> %calltmp.i, %int_to_float_bitcast.i.i.i
ret <8 x float> %binop.i
}
;; FIXME: rounding doubles and double vectors needs to be implemented
declare <WIDTH x double> @__round_varying_double(<WIDTH x double>) nounwind readnone
declare <WIDTH x double> @__floor_varying_double(<WIDTH x double>) nounwind readnone
declare <WIDTH x double> @__ceil_varying_double(<WIDTH x double>) nounwind readnone
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; min/max
declare <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float>, <4 x float>) nounwind readnone
define <WIDTH x float> @__max_varying_float(<WIDTH x float>,
<WIDTH x float>) nounwind readnone {
binary4to8(r, float, @llvm.arm.neon.vmaxs.v4f32, %0, %1)
ret <WIDTH x float> %r
}
define <WIDTH x float> @__min_varying_float(<WIDTH x float>,
<WIDTH x float>) nounwind readnone {
binary4to8(r, float, @llvm.arm.neon.vmins.v4f32, %0, %1)
ret <WIDTH x float> %r
}
declare <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
define <WIDTH x i32> @__min_varying_int32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone {
binary4to8(r, i32, @llvm.arm.neon.vmins.v4i32, %0, %1)
ret <WIDTH x i32> %r
}
define <WIDTH x i32> @__max_varying_int32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone {
binary4to8(r, i32, @llvm.arm.neon.vmaxs.v4i32, %0, %1)
ret <WIDTH x i32> %r
}
define <WIDTH x i32> @__min_varying_uint32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone {
binary4to8(r, i32, @llvm.arm.neon.vminu.v4i32, %0, %1)
ret <WIDTH x i32> %r
}
define <WIDTH x i32> @__max_varying_uint32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone {
binary4to8(r, i32, @llvm.arm.neon.vmaxu.v4i32, %0, %1)
ret <WIDTH x i32> %r
}
;; sqrt/rsqrt/rcp
declare <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float>, <4 x float>) nounwind readnone
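;; vrecpe produces a rough initial estimate of 1/d; vrecps(d, x) computes
;; (2 - d*x), so x * vrecps(d, x) is one Newton-Raphson refinement step.
;; Two steps bring the estimate to nearly full float precision.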
define <WIDTH x float> @__rcp_varying_float(<WIDTH x float> %d) nounwind readnone {
unary4to8(x0, float, @llvm.arm.neon.vrecpe.v4f32, %d)
binary4to8(x0_nr, float, @llvm.arm.neon.vrecps.v4f32, %d, %x0)
%x1 = fmul <WIDTH x float> %x0, %x0_nr
binary4to8(x1_nr, float, @llvm.arm.neon.vrecps.v4f32, %d, %x1)
%x2 = fmul <WIDTH x float> %x1, %x1_nr
ret <WIDTH x float> %x2
}
declare <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float>, <4 x float>) nounwind readnone
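;; Same scheme for reciprocal square root: vrsqrts(d, x*x) computes
;; (3 - d*x*x)/2, so multiplying the estimate by it is one Newton-Raphson
;; step for 1/sqrt(d).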
define <WIDTH x float> @__rsqrt_varying_float(<WIDTH x float> %d) nounwind readnone {
unary4to8(x0, float, @llvm.arm.neon.vrsqrte.v4f32, %d)
%x0_2 = fmul <WIDTH x float> %x0, %x0
binary4to8(x0_nr, float, @llvm.arm.neon.vrsqrts.v4f32, %d, %x0_2)
%x1 = fmul <WIDTH x float> %x0, %x0_nr
%x1_2 = fmul <WIDTH x float> %x1, %x1
binary4to8(x1_nr, float, @llvm.arm.neon.vrsqrts.v4f32, %d, %x1_2)
%x2 = fmul <WIDTH x float> %x1, %x1_nr
ret <WIDTH x float> %x2
}
define float @__rsqrt_uniform_float(float) nounwind readnone {
%v1 = bitcast float %0 to <1 x float>
%vs = shufflevector <1 x float> %v1, <1 x float> undef,
<8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef>
%vr = call <8 x float> @__rsqrt_varying_float(<8 x float> %vs)
%r = extractelement <8 x float> %vr, i32 0
ret float %r
}
define float @__rcp_uniform_float(float) nounwind readnone {
%v1 = bitcast float %0 to <1 x float>
%vs = shufflevector <1 x float> %v1, <1 x float> undef,
<8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef>
%vr = call <8 x float> @__rcp_varying_float(<8 x float> %vs)
%r = extractelement <8 x float> %vr, i32 0
ret float %r
}
declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)
define <WIDTH x float> @__sqrt_varying_float(<WIDTH x float>) nounwind readnone {
unary4to8(result, float, @llvm.sqrt.v4f32, %0)
;; this returns nan for v=0, which is undesirable..
;; %rsqrt = call <WIDTH x float> @__rsqrt_varying_float(<WIDTH x float> %0)
;; %result = fmul <4 x float> %rsqrt, %0
ret <8 x float> %result
}
declare <4 x double> @llvm.sqrt.v4f64(<4 x double>)
define <WIDTH x double> @__sqrt_varying_double(<WIDTH x double>) nounwind readnone {
unary4to8(r, double, @llvm.sqrt.v4f64, %0)
ret <WIDTH x double> %r
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; reductions
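;; __movmsk packs the 8-lane mask into a bitmask: each lane is and'ed
;; with its bit (1, 2, 4, ..., 128), then the lanes are summed with
;; pairwise widening adds. Since each lane contributes a distinct bit,
;; adding and or'ing are equivalent here.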
define i64 @__movmsk(<WIDTH x MASK>) nounwind readnone {
%and_mask = and <WIDTH x i16> %0,
<i16 1, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128>
%v4 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %and_mask)
%v2 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %v4)
%va = extractelement <2 x i64> %v2, i32 0
%vb = extractelement <2 x i64> %v2, i32 1
%v = or i64 %va, %vb
ret i64 %v
}
define i1 @__any(<WIDTH x MASK>) nounwind readnone alwaysinline {
v8tov4(MASK, %0, %v0123, %v4567)
%vor = or <4 x MASK> %v0123, %v4567
%v0 = extractelement <4 x MASK> %vor, i32 0
%v1 = extractelement <4 x MASK> %vor, i32 1
%v2 = extractelement <4 x MASK> %vor, i32 2
%v3 = extractelement <4 x MASK> %vor, i32 3
%v01 = or MASK %v0, %v1
%v23 = or MASK %v2, %v3
%v = or MASK %v01, %v23
%cmp = icmp ne MASK %v, 0
ret i1 %cmp
}
define i1 @__all(<WIDTH x MASK>) nounwind readnone alwaysinline {
v8tov4(MASK, %0, %v0123, %v4567)
%vand = and <4 x MASK> %v0123, %v4567
%v0 = extractelement <4 x MASK> %vand, i32 0
%v1 = extractelement <4 x MASK> %vand, i32 1
%v2 = extractelement <4 x MASK> %vand, i32 2
%v3 = extractelement <4 x MASK> %vand, i32 3
%v01 = and MASK %v0, %v1
%v23 = and MASK %v2, %v3
%v = and MASK %v01, %v23
%cmp = icmp ne MASK %v, 0
ret i1 %cmp
}
define i1 @__none(<WIDTH x MASK>) nounwind readnone alwaysinline {
%any = call i1 @__any(<WIDTH x MASK> %0)
%none = icmp eq i1 %any, 0
ret i1 %none
}
;; $1: scalar type
;; $2: vector/vector reduce function (2 x <WIDTH x vec> -> <WIDTH x vec>)
;; $3: pairwise vector reduce function (2 x <2 x vec> -> <2 x vec>)
;; $4: scalar reduce function
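;; The reduction runs in three stages: a full-width op combines the two
;; 4-wide halves, a pairwise op combines the remaining pairs, and a
;; scalar op finishes off the last two elements.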
define(`neon_reduce', `
v8tov4($1, %0, %v0123, %v4567)
%v0123_8 = shufflevector <4 x $1> %v0123, <4 x $1> undef,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
%v4567_8 = shufflevector <4 x $1> %v4567, <4 x $1> undef,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
%vfirst = call <8 x $1> $2(<8 x $1> %v0123_8, <8 x $1> %v4567_8)
%vfirst_4 = shufflevector <8 x $1> %vfirst, <8 x $1> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
v4tov2($1, %vfirst_4, %v0, %v1)
%vh = call <2 x $1> $3(<2 x $1> %v0, <2 x $1> %v1)
%vh0 = extractelement <2 x $1> %vh, i32 0
%vh1 = extractelement <2 x $1> %vh, i32 1
%r = call $1 $4($1 %vh0, $1 %vh1)
ret $1 %r
')
declare <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float>, <2 x float>) nounwind readnone
define internal float @add_f32(float, float) {
%r = fadd float %0, %1
ret float %r
}
define internal <WIDTH x float> @__add_varying_float(<WIDTH x float>, <WIDTH x float>) {
%r = fadd <WIDTH x float> %0, %1
ret <WIDTH x float> %r
}
define float @__reduce_add_float(<WIDTH x float>) nounwind readnone {
neon_reduce(float, @__add_varying_float, @llvm.arm.neon.vpadd.v2f32, @add_f32)
}
declare <2 x float> @llvm.arm.neon.vpmins.v2f32(<2 x float>, <2 x float>) nounwind readnone
define internal float @min_f32(float, float) {
%cmp = fcmp olt float %0, %1
%r = select i1 %cmp, float %0, float %1
ret float %r
}
define float @__reduce_min_float(<WIDTH x float>) nounwind readnone {
neon_reduce(float, @__min_varying_float, @llvm.arm.neon.vpmins.v2f32, @min_f32)
}
declare <2 x float> @llvm.arm.neon.vpmaxs.v2f32(<2 x float>, <2 x float>) nounwind readnone
define internal float @max_f32(float, float) {
%cmp = fcmp ugt float %0, %1
%r = select i1 %cmp, float %0, float %1
ret float %r
}
define float @__reduce_max_float(<WIDTH x float>) nounwind readnone {
neon_reduce(float, @__max_varying_float, @llvm.arm.neon.vpmaxs.v2f32, @max_f32)
}
declare <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16>) nounwind readnone
define i16 @__reduce_add_int8(<WIDTH x i8>) nounwind readnone {
%a16 = call <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8> %0)
%a32 = call <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> %a16)
%a0 = extractelement <2 x i32> %a32, i32 0
%a1 = extractelement <2 x i32> %a32, i32 1
%r = add i32 %a0, %a1
%r16 = trunc i32 %r to i16
ret i16 %r16
}
declare <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<WIDTH x i16>)
define i64 @__reduce_add_int16(<WIDTH x i16>) nounwind readnone {
%a1 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<WIDTH x i16> %0)
%a2 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %a1)
%aa = extractelement <2 x i64> %a2, i32 0
%ab = extractelement <2 x i64> %a2, i32 1
%r = add i64 %aa, %ab
ret i64 %r
}
declare <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32>) nounwind readnone
define i64 @__reduce_add_int32(<WIDTH x i32>) nounwind readnone {
v8tov4(i32, %0, %va, %vb)
%pa = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %va)
%pb = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %vb)
%psum = add <2 x i64> %pa, %pb
%a0 = extractelement <2 x i64> %psum, i32 0
%a1 = extractelement <2 x i64> %psum, i32 1
%r = add i64 %a0, %a1
ret i64 %r
}
declare <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
define internal i32 @min_si32(i32, i32) {
%cmp = icmp slt i32 %0, %1
%r = select i1 %cmp, i32 %0, i32 %1
ret i32 %r
}
define i32 @__reduce_min_int32(<WIDTH x i32>) nounwind readnone {
neon_reduce(i32, @__min_varying_int32, @llvm.arm.neon.vpmins.v2i32, @min_si32)
}
declare <2 x i32> @llvm.arm.neon.vpmaxs.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
define internal i32 @max_si32(i32, i32) {
%cmp = icmp sgt i32 %0, %1
%r = select i1 %cmp, i32 %0, i32 %1
ret i32 %r
}
define i32 @__reduce_max_int32(<WIDTH x i32>) nounwind readnone {
neon_reduce(i32, @__max_varying_int32, @llvm.arm.neon.vpmaxs.v2i32, @max_si32)
}
declare <2 x i32> @llvm.arm.neon.vpminu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
define internal i32 @min_ui32(i32, i32) {
%cmp = icmp ult i32 %0, %1
%r = select i1 %cmp, i32 %0, i32 %1
ret i32 %r
}
define i32 @__reduce_min_uint32(<WIDTH x i32>) nounwind readnone {
neon_reduce(i32, @__min_varying_uint32, @llvm.arm.neon.vpminu.v2i32, @min_ui32)
}
declare <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
define internal i32 @max_ui32(i32, i32) {
%cmp = icmp ugt i32 %0, %1
%r = select i1 %cmp, i32 %0, i32 %1
ret i32 %r
}
define i32 @__reduce_max_uint32(<WIDTH x i32>) nounwind readnone {
neon_reduce(i32, @__max_varying_uint32, @llvm.arm.neon.vpmaxu.v2i32, @max_ui32)
}
define double @__reduce_add_double(<WIDTH x double>) nounwind readnone {
v8tov2(double, %0, %v0, %v1, %v2, %v3)
%v01 = fadd <2 x double> %v0, %v1
%v23 = fadd <2 x double> %v2, %v3
%sum = fadd <2 x double> %v01, %v23
%e0 = extractelement <2 x double> %sum, i32 0
%e1 = extractelement <2 x double> %sum, i32 1
%m = fadd double %e0, %e1
ret double %m
}
define double @__reduce_min_double(<WIDTH x double>) nounwind readnone {
reduce8(double, @__min_varying_double, @__min_uniform_double)
}
define double @__reduce_max_double(<WIDTH x double>) nounwind readnone {
reduce8(double, @__max_varying_double, @__max_uniform_double)
}
define i64 @__reduce_add_int64(<WIDTH x i64>) nounwind readnone {
v8tov2(i64, %0, %v0, %v1, %v2, %v3)
%v01 = add <2 x i64> %v0, %v1
%v23 = add <2 x i64> %v2, %v3
%sum = add <2 x i64> %v01, %v23
%e0 = extractelement <2 x i64> %sum, i32 0
%e1 = extractelement <2 x i64> %sum, i32 1
%m = add i64 %e0, %e1
ret i64 %m
}
define i64 @__reduce_min_int64(<WIDTH x i64>) nounwind readnone {
reduce8(i64, @__min_varying_int64, @__min_uniform_int64)
}
define i64 @__reduce_max_int64(<WIDTH x i64>) nounwind readnone {
reduce8(i64, @__max_varying_int64, @__max_uniform_int64)
}
define i64 @__reduce_min_uint64(<WIDTH x i64>) nounwind readnone {
reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64)
}
define i64 @__reduce_max_uint64(<WIDTH x i64>) nounwind readnone {
reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64)
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int8/int16
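;; vrhadd is a rounding halving add, (a + b + 1) >> 1, and vhadd a
;; truncating one, (a + b) >> 1; both compute the intermediate sum
;; without overflow. The s/u suffix selects signed or unsigned lanes,
;; matching the avg_up (round up) and avg_down (round down) semantics.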
declare <8 x i8> @llvm.arm.neon.vrhaddu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
define <8 x i8> @__avg_up_uint8(<8 x i8>, <8 x i8>) nounwind readnone {
%r = call <8 x i8> @llvm.arm.neon.vrhaddu.v8i8(<8 x i8> %0, <8 x i8> %1)
ret <8 x i8> %r
}
declare <8 x i8> @llvm.arm.neon.vrhadds.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
define <8 x i8> @__avg_up_int8(<8 x i8>, <8 x i8>) nounwind readnone {
%r = call <8 x i8> @llvm.arm.neon.vrhadds.v8i8(<8 x i8> %0, <8 x i8> %1)
ret <8 x i8> %r
}
declare <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
define <8 x i8> @__avg_down_uint8(<8 x i8>, <8 x i8>) nounwind readnone {
%r = call <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8> %0, <8 x i8> %1)
ret <8 x i8> %r
}
declare <8 x i8> @llvm.arm.neon.vhadds.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
define <8 x i8> @__avg_down_int8(<8 x i8>, <8 x i8>) nounwind readnone {
%r = call <8 x i8> @llvm.arm.neon.vhadds.v8i8(<8 x i8> %0, <8 x i8> %1)
ret <8 x i8> %r
}
declare <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
define <8 x i16> @__avg_up_uint16(<8 x i16>, <8 x i16>) nounwind readnone {
%r = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> %0, <8 x i16> %1)
ret <8 x i16> %r
}
declare <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
define <8 x i16> @__avg_up_int16(<8 x i16>, <8 x i16>) nounwind readnone {
%r = call <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16> %0, <8 x i16> %1)
ret <8 x i16> %r
}
declare <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
define <8 x i16> @__avg_down_uint16(<8 x i16>, <8 x i16>) nounwind readnone {
%r = call <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16> %0, <8 x i16> %1)
ret <8 x i16> %r
}
declare <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
define <8 x i16> @__avg_down_int16(<8 x i16>, <8 x i16>) nounwind readnone {
%r = call <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16> %0, <8 x i16> %1)
ret <8 x i16> %r
}

builtins/target-neon-32.ll (new file, 487 lines)

@@ -0,0 +1,487 @@
;;
;; target-neon-32.ll
;;
;; Copyright(c) 2012-2013 Matt Pharr
;; Copyright(c) 2013 Google, Inc.
;;
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;; * Redistributions of source code must retain the above copyright
;; notice, this list of conditions and the following disclaimer.
;;
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;;
;; * Neither the name of Matt Pharr nor the names of its
;; contributors may be used to endorse or promote products derived from
;; this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
define(`WIDTH',`4')
define(`MASK',`i32')
include(`util.m4')
include(`target-neon-common.ll')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines
define <4 x float> @__half_to_float_varying(<4 x i16> %v) nounwind readnone {
%r = call <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16> %v)
ret <4 x float> %r
}
define <4 x i16> @__float_to_half_varying(<4 x float> %v) nounwind readnone {
%r = call <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float> %v)
ret <4 x i16> %r
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; math
;; round/floor/ceil
;; FIXME: grabbed these from the sse2 target, which does not have native
;; instructions for these. Is there a better approach for NEON?
define <4 x float> @__round_varying_float(<4 x float>) nounwind readonly alwaysinline {
%float_to_int_bitcast.i.i.i.i = bitcast <4 x float> %0 to <4 x i32>
%bitop.i.i = and <4 x i32> %float_to_int_bitcast.i.i.i.i, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
%bitop.i = xor <4 x i32> %float_to_int_bitcast.i.i.i.i, %bitop.i.i
%int_to_float_bitcast.i.i40.i = bitcast <4 x i32> %bitop.i to <4 x float>
%binop.i = fadd <4 x float> %int_to_float_bitcast.i.i40.i, <float 8.388608e+06, float 8.388608e+06, float 8.388608e+06, float 8.388608e+06>
%binop21.i = fadd <4 x float> %binop.i, <float -8.388608e+06, float -8.388608e+06, float -8.388608e+06, float -8.388608e+06>
%float_to_int_bitcast.i.i.i = bitcast <4 x float> %binop21.i to <4 x i32>
%bitop31.i = xor <4 x i32> %float_to_int_bitcast.i.i.i, %bitop.i.i
%int_to_float_bitcast.i.i.i = bitcast <4 x i32> %bitop31.i to <4 x float>
ret <4 x float> %int_to_float_bitcast.i.i.i
}
define <4 x float> @__floor_varying_float(<4 x float>) nounwind readonly alwaysinline {
%calltmp.i = tail call <4 x float> @__round_varying_float(<4 x float> %0) nounwind
%bincmp.i = fcmp ogt <4 x float> %calltmp.i, %0
%val_to_boolvec32.i = sext <4 x i1> %bincmp.i to <4 x i32>
%bitop.i = and <4 x i32> %val_to_boolvec32.i, <i32 -1082130432, i32 -1082130432, i32 -1082130432, i32 -1082130432>
%int_to_float_bitcast.i.i.i = bitcast <4 x i32> %bitop.i to <4 x float>
%binop.i = fadd <4 x float> %calltmp.i, %int_to_float_bitcast.i.i.i
ret <4 x float> %binop.i
}
define <4 x float> @__ceil_varying_float(<4 x float>) nounwind readonly alwaysinline {
%calltmp.i = tail call <4 x float> @__round_varying_float(<4 x float> %0) nounwind
%bincmp.i = fcmp olt <4 x float> %calltmp.i, %0
%val_to_boolvec32.i = sext <4 x i1> %bincmp.i to <4 x i32>
%bitop.i = and <4 x i32> %val_to_boolvec32.i, <i32 1065353216, i32 1065353216, i32 1065353216, i32 1065353216>
%int_to_float_bitcast.i.i.i = bitcast <4 x i32> %bitop.i to <4 x float>
%binop.i = fadd <4 x float> %calltmp.i, %int_to_float_bitcast.i.i.i
ret <4 x float> %binop.i
}
;; FIXME: rounding doubles and double vectors needs to be implemented
declare <WIDTH x double> @__round_varying_double(<WIDTH x double>) nounwind readnone
declare <WIDTH x double> @__floor_varying_double(<WIDTH x double>) nounwind readnone
declare <WIDTH x double> @__ceil_varying_double(<WIDTH x double>) nounwind readnone
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; min/max
declare <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float>, <4 x float>) nounwind readnone
define <WIDTH x float> @__max_varying_float(<WIDTH x float>,
<WIDTH x float>) nounwind readnone {
%r = call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %0, <4 x float> %1)
ret <WIDTH x float> %r
}
define <WIDTH x float> @__min_varying_float(<WIDTH x float>,
<WIDTH x float>) nounwind readnone {
%r = call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %0, <4 x float> %1)
ret <WIDTH x float> %r
}
declare <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
define <WIDTH x i32> @__min_varying_int32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone {
%r = call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %0, <4 x i32> %1)
ret <4 x i32> %r
}
define <WIDTH x i32> @__max_varying_int32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone {
%r = call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %0, <4 x i32> %1)
ret <4 x i32> %r
}
define <WIDTH x i32> @__min_varying_uint32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone {
%r = call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %0, <4 x i32> %1)
ret <4 x i32> %r
}
define <WIDTH x i32> @__max_varying_uint32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone {
%r = call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %0, <4 x i32> %1)
ret <4 x i32> %r
}
;; sqrt/rsqrt/rcp
declare <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float>, <4 x float>) nounwind readnone
define <WIDTH x float> @__rcp_varying_float(<WIDTH x float> %d) nounwind readnone {
%x0 = call <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float> %d)
%x0_nr = call <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float> %d, <4 x float> %x0)
%x1 = fmul <4 x float> %x0, %x0_nr
%x1_nr = call <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float> %d, <4 x float> %x1)
%x2 = fmul <4 x float> %x1, %x1_nr
ret <4 x float> %x2
}
declare <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float>, <4 x float>) nounwind readnone
define <WIDTH x float> @__rsqrt_varying_float(<WIDTH x float> %d) nounwind readnone {
%x0 = call <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float> %d)
%x0_2 = fmul <4 x float> %x0, %x0
%x0_nr = call <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float> %d, <4 x float> %x0_2)
%x1 = fmul <4 x float> %x0, %x0_nr
%x1_2 = fmul <4 x float> %x1, %x1
%x1_nr = call <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float> %d, <4 x float> %x1_2)
%x2 = fmul <4 x float> %x1, %x1_nr
ret <4 x float> %x2
}
define float @__rsqrt_uniform_float(float) nounwind readnone {
%v1 = bitcast float %0 to <1 x float>
%vs = shufflevector <1 x float> %v1, <1 x float> undef,
<4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
%vr = call <4 x float> @__rsqrt_varying_float(<4 x float> %vs)
%r = extractelement <4 x float> %vr, i32 0
ret float %r
}
define float @__rcp_uniform_float(float) nounwind readnone {
%v1 = bitcast float %0 to <1 x float>
%vs = shufflevector <1 x float> %v1, <1 x float> undef,
<4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
%vr = call <4 x float> @__rcp_varying_float(<4 x float> %vs)
%r = extractelement <4 x float> %vr, i32 0
ret float %r
}
declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)
define <WIDTH x float> @__sqrt_varying_float(<WIDTH x float>) nounwind readnone {
%result = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %0)
;; this returns nan for v=0, which is undesirable..
;; %rsqrt = call <WIDTH x float> @__rsqrt_varying_float(<WIDTH x float> %0)
;; %result = fmul <4 x float> %rsqrt, %0
ret <4 x float> %result
}
declare <4 x double> @llvm.sqrt.v4f64(<4 x double>)
define <WIDTH x double> @__sqrt_varying_double(<WIDTH x double>) nounwind readnone {
%r = call <4 x double> @llvm.sqrt.v4f64(<4 x double> %0)
ret <4 x double> %r
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; reductions
define i64 @__movmsk(<4 x MASK>) nounwind readnone {
%and_mask = and <4 x MASK> %0, <MASK 1, MASK 2, MASK 4, MASK 8>
%v01 = shufflevector <4 x i32> %and_mask, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
%v23 = shufflevector <4 x i32> %and_mask, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%vor = or <2 x i32> %v01, %v23
%v0 = extractelement <2 x i32> %vor, i32 0
%v1 = extractelement <2 x i32> %vor, i32 1
%v = or i32 %v0, %v1
%mask64 = zext i32 %v to i64
ret i64 %mask64
}
define i1 @__any(<4 x i32>) nounwind readnone alwaysinline {
%v01 = shufflevector <4 x i32> %0, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
%v23 = shufflevector <4 x i32> %0, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%vor = or <2 x i32> %v01, %v23
%v0 = extractelement <2 x i32> %vor, i32 0
%v1 = extractelement <2 x i32> %vor, i32 1
%v = or i32 %v0, %v1
%cmp = icmp ne i32 %v, 0
ret i1 %cmp
}
define i1 @__all(<4 x i32>) nounwind readnone alwaysinline {
%v01 = shufflevector <4 x i32> %0, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
%v23 = shufflevector <4 x i32> %0, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%vor = and <2 x i32> %v01, %v23
%v0 = extractelement <2 x i32> %vor, i32 0
%v1 = extractelement <2 x i32> %vor, i32 1
%v = and i32 %v0, %v1
%cmp = icmp ne i32 %v, 0
ret i1 %cmp
}
define i1 @__none(<4 x i32>) nounwind readnone alwaysinline {
%any = call i1 @__any(<4 x i32> %0)
%none = icmp eq i1 %any, 0
ret i1 %none
}
;; $1: scalar type
;; $2: vector reduce function (2 x <2 x vec> -> <2 x vec>)
;; $3: scalar reduce function
define(`neon_reduce', `
%v0 = shufflevector <4 x $1> %0, <4 x $1> undef, <2 x i32> <i32 0, i32 1>
%v1 = shufflevector <4 x $1> %0, <4 x $1> undef, <2 x i32> <i32 2, i32 3>
%vh = call <2 x $1> $2(<2 x $1> %v0, <2 x $1> %v1)
%vh0 = extractelement <2 x $1> %vh, i32 0
%vh1 = extractelement <2 x $1> %vh, i32 1
%r = call $1 $3($1 %vh0, $1 %vh1)
ret $1 %r
')
declare <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float>, <2 x float>) nounwind readnone
define internal float @add_f32(float, float) {
%r = fadd float %0, %1
ret float %r
}
define float @__reduce_add_float(<4 x float>) nounwind readnone {
neon_reduce(float, @llvm.arm.neon.vpadd.v2f32, @add_f32)
}
declare <2 x float> @llvm.arm.neon.vpmins.v2f32(<2 x float>, <2 x float>) nounwind readnone
define internal float @min_f32(float, float) {
%cmp = fcmp olt float %0, %1
%r = select i1 %cmp, float %0, float %1
ret float %r
}
define float @__reduce_min_float(<4 x float>) nounwind readnone {
neon_reduce(float, @llvm.arm.neon.vpmins.v2f32, @min_f32)
}
declare <2 x float> @llvm.arm.neon.vpmaxs.v2f32(<2 x float>, <2 x float>) nounwind readnone
define internal float @max_f32(float, float) {
%cmp = fcmp ugt float %0, %1
%r = select i1 %cmp, float %0, float %1
ret float %r
}
define float @__reduce_max_float(<4 x float>) nounwind readnone {
neon_reduce(float, @llvm.arm.neon.vpmaxs.v2f32, @max_f32)
}
declare <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8>) nounwind readnone
define i16 @__reduce_add_int8(<WIDTH x i8>) nounwind readnone {
%v8 = shufflevector <4 x i8> %0, <4 x i8> zeroinitializer,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4>
%a16 = call <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8> %v8)
%a32 = call <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> %a16)
%a0 = extractelement <2 x i32> %a32, i32 0
%a1 = extractelement <2 x i32> %a32, i32 1
%r = add i32 %a0, %a1
%r16 = trunc i32 %r to i16
ret i16 %r16
}
declare <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16>) nounwind readnone
define i32 @__reduce_add_int16(<WIDTH x i16>) nounwind readnone {
%a32 = call <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> %0)
%a0 = extractelement <2 x i32> %a32, i32 0
%a1 = extractelement <2 x i32> %a32, i32 1
%r = add i32 %a0, %a1
ret i32 %r
}
declare <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32>) nounwind readnone
define i64 @__reduce_add_int32(<WIDTH x i32>) nounwind readnone {
%a64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %0)
%a0 = extractelement <2 x i64> %a64, i32 0
%a1 = extractelement <2 x i64> %a64, i32 1
%r = add i64 %a0, %a1
ret i64 %r
}
declare <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
define internal i32 @min_si32(i32, i32) {
%cmp = icmp slt i32 %0, %1
%r = select i1 %cmp, i32 %0, i32 %1
ret i32 %r
}
define i32 @__reduce_min_int32(<4 x i32>) nounwind readnone {
neon_reduce(i32, @llvm.arm.neon.vpmins.v2i32, @min_si32)
}
declare <2 x i32> @llvm.arm.neon.vpmaxs.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
define internal i32 @max_si32(i32, i32) {
%cmp = icmp sgt i32 %0, %1
%r = select i1 %cmp, i32 %0, i32 %1
ret i32 %r
}
define i32 @__reduce_max_int32(<4 x i32>) nounwind readnone {
neon_reduce(i32, @llvm.arm.neon.vpmaxs.v2i32, @max_si32)
}
declare <2 x i32> @llvm.arm.neon.vpminu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
define internal i32 @min_ui32(i32, i32) {
%cmp = icmp ult i32 %0, %1
%r = select i1 %cmp, i32 %0, i32 %1
ret i32 %r
}
define i32 @__reduce_min_uint32(<4 x i32>) nounwind readnone {
neon_reduce(i32, @llvm.arm.neon.vpminu.v2i32, @min_ui32)
}
declare <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
define internal i32 @max_ui32(i32, i32) {
%cmp = icmp ugt i32 %0, %1
%r = select i1 %cmp, i32 %0, i32 %1
ret i32 %r
}
define i32 @__reduce_max_uint32(<4 x i32>) nounwind readnone {
neon_reduce(i32, @llvm.arm.neon.vpmaxu.v2i32, @max_ui32)
}
define double @__reduce_add_double(<4 x double>) nounwind readnone {
%v0 = shufflevector <4 x double> %0, <4 x double> undef,
<2 x i32> <i32 0, i32 1>
%v1 = shufflevector <4 x double> %0, <4 x double> undef,
<2 x i32> <i32 2, i32 3>
%sum = fadd <2 x double> %v0, %v1
%e0 = extractelement <2 x double> %sum, i32 0
%e1 = extractelement <2 x double> %sum, i32 1
%m = fadd double %e0, %e1
ret double %m
}
define double @__reduce_min_double(<4 x double>) nounwind readnone {
reduce4(double, @__min_varying_double, @__min_uniform_double)
}
define double @__reduce_max_double(<4 x double>) nounwind readnone {
reduce4(double, @__max_varying_double, @__max_uniform_double)
}
define i64 @__reduce_add_int64(<4 x i64>) nounwind readnone {
%v0 = shufflevector <4 x i64> %0, <4 x i64> undef,
<2 x i32> <i32 0, i32 1>
%v1 = shufflevector <4 x i64> %0, <4 x i64> undef,
<2 x i32> <i32 2, i32 3>
%sum = add <2 x i64> %v0, %v1
%e0 = extractelement <2 x i64> %sum, i32 0
%e1 = extractelement <2 x i64> %sum, i32 1
%m = add i64 %e0, %e1
ret i64 %m
}
define i64 @__reduce_min_int64(<4 x i64>) nounwind readnone {
reduce4(i64, @__min_varying_int64, @__min_uniform_int64)
}
define i64 @__reduce_max_int64(<4 x i64>) nounwind readnone {
reduce4(i64, @__max_varying_int64, @__max_uniform_int64)
}
define i64 @__reduce_min_uint64(<4 x i64>) nounwind readnone {
reduce4(i64, @__min_varying_uint64, @__min_uniform_uint64)
}
define i64 @__reduce_max_uint64(<4 x i64>) nounwind readnone {
reduce4(i64, @__max_varying_uint64, @__max_uniform_uint64)
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int8/int16
declare <4 x i8> @llvm.arm.neon.vrhaddu.v4i8(<4 x i8>, <4 x i8>) nounwind readnone
define <4 x i8> @__avg_up_uint8(<4 x i8>, <4 x i8>) nounwind readnone {
%r = call <4 x i8> @llvm.arm.neon.vrhaddu.v4i8(<4 x i8> %0, <4 x i8> %1)
ret <4 x i8> %r
}
declare <4 x i8> @llvm.arm.neon.vrhadds.v4i8(<4 x i8>, <4 x i8>) nounwind readnone
define <4 x i8> @__avg_up_int8(<4 x i8>, <4 x i8>) nounwind readnone {
%r = call <4 x i8> @llvm.arm.neon.vrhadds.v4i8(<4 x i8> %0, <4 x i8> %1)
ret <4 x i8> %r
}
declare <4 x i8> @llvm.arm.neon.vhaddu.v4i8(<4 x i8>, <4 x i8>) nounwind readnone
define <4 x i8> @__avg_down_uint8(<4 x i8>, <4 x i8>) nounwind readnone {
%r = call <4 x i8> @llvm.arm.neon.vhaddu.v4i8(<4 x i8> %0, <4 x i8> %1)
ret <4 x i8> %r
}
declare <4 x i8> @llvm.arm.neon.vhadds.v4i8(<4 x i8>, <4 x i8>) nounwind readnone
define <4 x i8> @__avg_down_int8(<4 x i8>, <4 x i8>) nounwind readnone {
%r = call <4 x i8> @llvm.arm.neon.vhadds.v4i8(<4 x i8> %0, <4 x i8> %1)
ret <4 x i8> %r
}
declare <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
define <4 x i16> @__avg_up_uint16(<4 x i16>, <4 x i16>) nounwind readnone {
%r = call <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16> %0, <4 x i16> %1)
ret <4 x i16> %r
}
declare <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
define <4 x i16> @__avg_up_int16(<4 x i16>, <4 x i16>) nounwind readnone {
%r = call <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16> %0, <4 x i16> %1)
ret <4 x i16> %r
}
declare <4 x i16> @llvm.arm.neon.vhaddu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
define <4 x i16> @__avg_down_uint16(<4 x i16>, <4 x i16>) nounwind readnone {
%r = call <4 x i16> @llvm.arm.neon.vhaddu.v4i16(<4 x i16> %0, <4 x i16> %1)
ret <4 x i16> %r
}
declare <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
define <4 x i16> @__avg_down_int16(<4 x i16>, <4 x i16>) nounwind readnone {
%r = call <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16> %0, <4 x i16> %1)
ret <4 x i16> %r
}

builtins/target-neon-8.ll (new file, 583 lines)

@@ -0,0 +1,583 @@
;;
;; target-neon-8.ll
;;
;; Copyright(c) 2013 Google, Inc.
;;
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;; * Redistributions of source code must retain the above copyright
;; notice, this list of conditions and the following disclaimer.
;;
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;;
;; * Neither the name of Matt Pharr nor the names of its
;; contributors may be used to endorse or promote products derived from
;; this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
define(`WIDTH',`16')
define(`MASK',`i8')
include(`util.m4')
include(`target-neon-common.ll')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines
define <16 x float> @__half_to_float_varying(<16 x i16> %v) nounwind readnone {
unary4to16conv(r, i16, float, @llvm.arm.neon.vcvthf2fp, %v)
ret <16 x float> %r
}
define <16 x i16> @__float_to_half_varying(<16 x float> %v) nounwind readnone {
unary4to16conv(r, float, i16, @llvm.arm.neon.vcvtfp2hf, %v)
ret <16 x i16> %r
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; math
;; round/floor/ceil
;; FIXME: grabbed these from the sse2 target, which does not have native
;; instructions for these. Is there a better approach for NEON?
define <16 x float> @__round_varying_float(<16 x float>) nounwind readonly alwaysinline {
%float_to_int_bitcast.i.i.i.i = bitcast <16 x float> %0 to <16 x i32>
%bitop.i.i = and <16 x i32> %float_to_int_bitcast.i.i.i.i,
<i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648,
i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648,
i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648,
i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
%bitop.i = xor <16 x i32> %float_to_int_bitcast.i.i.i.i, %bitop.i.i
%int_to_float_bitcast.i.i40.i = bitcast <16 x i32> %bitop.i to <16 x float>
%binop.i = fadd <16 x float> %int_to_float_bitcast.i.i40.i,
<float 8.388608e+06, float 8.388608e+06, float 8.388608e+06, float 8.388608e+06,
float 8.388608e+06, float 8.388608e+06, float 8.388608e+06, float 8.388608e+06,
float 8.388608e+06, float 8.388608e+06, float 8.388608e+06, float 8.388608e+06,
float 8.388608e+06, float 8.388608e+06, float 8.388608e+06, float 8.388608e+06>
%binop21.i = fadd <16 x float> %binop.i,
<float -8.388608e+06, float -8.388608e+06, float -8.388608e+06, float -8.388608e+06,
float -8.388608e+06, float -8.388608e+06, float -8.388608e+06, float -8.388608e+06,
float -8.388608e+06, float -8.388608e+06, float -8.388608e+06, float -8.388608e+06,
float -8.388608e+06, float -8.388608e+06, float -8.388608e+06, float -8.388608e+06>
%float_to_int_bitcast.i.i.i = bitcast <16 x float> %binop21.i to <16 x i32>
%bitop31.i = xor <16 x i32> %float_to_int_bitcast.i.i.i, %bitop.i.i
%int_to_float_bitcast.i.i.i = bitcast <16 x i32> %bitop31.i to <16 x float>
ret <16 x float> %int_to_float_bitcast.i.i.i
}
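;; A scalar C sketch of the magic-number trick above (assumes the default
;; round-to-nearest mode and |x| < 2^23, where float runs out of
;; fractional bits):
;;   #include <math.h>
;;   float round_nearest(float x) {
;;       float s = copysignf(8388608.0f, x);   /* 8388608 = 2^23 */
;;       return (x + s) - s;   /* the add forces rounding in the mantissa */
;;   }
;; The and/xor pairs above implement the copysign on the raw bits.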
define <16 x float> @__floor_varying_float(<16 x float>) nounwind readonly alwaysinline {
%calltmp.i = tail call <16 x float> @__round_varying_float(<16 x float> %0) nounwind
%bincmp.i = fcmp ogt <16 x float> %calltmp.i, %0
%val_to_boolvec32.i = sext <16 x i1> %bincmp.i to <16 x i32>
%bitop.i = and <16 x i32> %val_to_boolvec32.i,
<i32 -1082130432, i32 -1082130432, i32 -1082130432, i32 -1082130432,
i32 -1082130432, i32 -1082130432, i32 -1082130432, i32 -1082130432,
i32 -1082130432, i32 -1082130432, i32 -1082130432, i32 -1082130432,
i32 -1082130432, i32 -1082130432, i32 -1082130432, i32 -1082130432>
%int_to_float_bitcast.i.i.i = bitcast <16 x i32> %bitop.i to <16 x float>
%binop.i = fadd <16 x float> %calltmp.i, %int_to_float_bitcast.i.i.i
ret <16 x float> %binop.i
}
define <16 x float> @__ceil_varying_float(<16 x float>) nounwind readonly alwaysinline {
%calltmp.i = tail call <16 x float> @__round_varying_float(<16 x float> %0) nounwind
%bincmp.i = fcmp olt <16 x float> %calltmp.i, %0
%val_to_boolvec32.i = sext <16 x i1> %bincmp.i to <16 x i32>
%bitop.i = and <16 x i32> %val_to_boolvec32.i,
<i32 1065353216, i32 1065353216, i32 1065353216, i32 1065353216,
i32 1065353216, i32 1065353216, i32 1065353216, i32 1065353216,
i32 1065353216, i32 1065353216, i32 1065353216, i32 1065353216,
i32 1065353216, i32 1065353216, i32 1065353216, i32 1065353216>
%int_to_float_bitcast.i.i.i = bitcast <16 x i32> %bitop.i to <16 x float>
%binop.i = fadd <16 x float> %calltmp.i, %int_to_float_bitcast.i.i.i
ret <16 x float> %binop.i
}
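;; -1082130432 and 1065353216 are the bit patterns of -1.0f (0xBF800000)
;; and 1.0f (0x3F800000): the sign-extended compare mask selects a +/-1.0f
;; correction. Scalar C sketch (round_nearest as sketched above):
;;   float floor_f(float x) {
;;       float r = round_nearest(x);
;;       return r > x ? r - 1.0f : r;
;;   }
;;   float ceil_f(float x) {
;;       float r = round_nearest(x);
;;       return r < x ? r + 1.0f : r;
;;   }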
;; FIXME: rounding doubles and double vectors needs to be implemented
declare <WIDTH x double> @__round_varying_double(<WIDTH x double>) nounwind readnone
declare <WIDTH x double> @__floor_varying_double(<WIDTH x double>) nounwind readnone
declare <WIDTH x double> @__ceil_varying_double(<WIDTH x double>) nounwind readnone
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; min/max
declare <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float>, <4 x float>) nounwind readnone
define <WIDTH x float> @__max_varying_float(<WIDTH x float>,
<WIDTH x float>) nounwind readnone {
binary4to16(r, float, @llvm.arm.neon.vmaxs.v4f32, %0, %1)
ret <WIDTH x float> %r
}
define <WIDTH x float> @__min_varying_float(<WIDTH x float>,
<WIDTH x float>) nounwind readnone {
binary4to16(r, float, @llvm.arm.neon.vmins.v4f32, %0, %1)
ret <WIDTH x float> %r
}
declare <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
define <WIDTH x i32> @__min_varying_int32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone {
binary4to16(r, i32, @llvm.arm.neon.vmins.v4i32, %0, %1)
ret <WIDTH x i32> %r
}
define <WIDTH x i32> @__max_varying_int32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone {
binary4to16(r, i32, @llvm.arm.neon.vmaxs.v4i32, %0, %1)
ret <WIDTH x i32> %r
}
define <WIDTH x i32> @__min_varying_uint32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone {
binary4to16(r, i32, @llvm.arm.neon.vminu.v4i32, %0, %1)
ret <WIDTH x i32> %r
}
define <WIDTH x i32> @__max_varying_uint32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone {
binary4to16(r, i32, @llvm.arm.neon.vmaxu.v4i32, %0, %1)
ret <WIDTH x i32> %r
}
;; sqrt/rsqrt/rcp
declare <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float>, <4 x float>) nounwind readnone
define <WIDTH x float> @__rcp_varying_float(<WIDTH x float> %d) nounwind readnone {
unary4to16(x0, float, @llvm.arm.neon.vrecpe.v4f32, %d)
binary4to16(x0_nr, float, @llvm.arm.neon.vrecps.v4f32, %d, %x0)
%x1 = fmul <WIDTH x float> %x0, %x0_nr
binary4to16(x1_nr, float, @llvm.arm.neon.vrecps.v4f32, %d, %x1)
%x2 = fmul <WIDTH x float> %x1, %x1_nr
ret <WIDTH x float> %x2
}
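;; vrecpe gives a rough reciprocal estimate; vrecps(d, x) evaluates the
;; Newton-Raphson term (2 - d*x), so the sequence above is two refinement
;; steps. Scalar C sketch (rcp_estimate stands in for vrecpe and is not a
;; real function):
;;   float rcp(float d) {
;;       float x = rcp_estimate(d);
;;       x = x * (2.0f - d * x);   /* first N-R step */
;;       x = x * (2.0f - d * x);   /* second N-R step */
;;       return x;
;;   }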
declare <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float>, <4 x float>) nounwind readnone
define <WIDTH x float> @__rsqrt_varying_float(<WIDTH x float> %d) nounwind readnone {
unary4to16(x0, float, @llvm.arm.neon.vrsqrte.v4f32, %d)
%x0_2 = fmul <WIDTH x float> %x0, %x0
binary4to16(x0_nr, float, @llvm.arm.neon.vrsqrts.v4f32, %d, %x0_2)
%x1 = fmul <WIDTH x float> %x0, %x0_nr
%x1_2 = fmul <WIDTH x float> %x1, %x1
binary4to16(x1_nr, float, @llvm.arm.neon.vrsqrts.v4f32, %d, %x1_2)
%x2 = fmul <WIDTH x float> %x1, %x1_nr
ret <WIDTH x float> %x2
}
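;; Same pattern for 1/sqrt: vrsqrte is the estimate and vrsqrts(d, x*x)
;; evaluates (3 - d*x*x)/2. Scalar C sketch (rsqrt_estimate stands in for
;; vrsqrte):
;;   float rsqrt(float d) {
;;       float x = rsqrt_estimate(d);
;;       x = x * ((3.0f - d * x * x) * 0.5f);
;;       x = x * ((3.0f - d * x * x) * 0.5f);
;;       return x;
;;   }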
define float @__rsqrt_uniform_float(float) nounwind readnone {
%v1 = bitcast float %0 to <1 x float>
%vs = shufflevector <1 x float> %v1, <1 x float> undef,
<16 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef>
%vr = call <16 x float> @__rsqrt_varying_float(<16 x float> %vs)
%r = extractelement <16 x float> %vr, i32 0
ret float %r
}
define float @__rcp_uniform_float(float) nounwind readnone {
%v1 = bitcast float %0 to <1 x float>
%vs = shufflevector <1 x float> %v1, <1 x float> undef,
<16 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef>
%vr = call <16 x float> @__rcp_varying_float(<16 x float> %vs)
%r = extractelement <16 x float> %vr, i32 0
ret float %r
}
declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)
define <WIDTH x float> @__sqrt_varying_float(<WIDTH x float>) nounwind readnone {
unary4to16(result, float, @llvm.sqrt.v4f32, %0)
;; the rsqrt-based alternative below returns NaN for v = 0, which is undesirable:
;; %rsqrt = call <WIDTH x float> @__rsqrt_varying_float(<WIDTH x float> %0)
;; %result = fmul <4 x float> %rsqrt, %0
ret <16 x float> %result
}
declare <4 x double> @llvm.sqrt.v4f64(<4 x double>)
define <WIDTH x double> @__sqrt_varying_double(<WIDTH x double>) nounwind readnone {
unary4to16(r, double, @llvm.sqrt.v4f64, %0)
ret <WIDTH x double> %r
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; reductions
define i64 @__movmsk(<WIDTH x MASK>) nounwind readnone {
%and_mask = and <WIDTH x i8> %0,
<i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64, i8 128,
i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64, i8 128>
%v8 = call <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8> %and_mask)
%v4 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %v8)
%v2 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %v4)
%va = extractelement <2 x i64> %v2, i32 0
%vb = extractelement <2 x i64> %v2, i32 1
%vbshift = shl i64 %vb, 8
%v = or i64 %va, %vbshift
ret i64 %v
}
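;; The AND gives each lane its bit (1, 2, ..., 128, repeated per 8-lane
;; half), the three widening pairwise adds (vpaddlu) sum each half into an
;; i64 lane, and the shift-by-8 places the upper half's bits at 8-15.
;; Semantically, a C sketch:
;;   #include <stdint.h>
;;   uint64_t movmsk(const uint8_t m[16]) {   /* each m[i] is 0x00 or 0xFF */
;;       uint64_t bits = 0;
;;       for (int i = 0; i < 16; i++)
;;           bits |= (uint64_t)(m[i] & 1) << i;
;;       return bits;
;;   }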
define i1 @__any(<WIDTH x MASK>) nounwind readnone alwaysinline {
v16tov8(MASK, %0, %v8a, %v8b)
%vor8 = or <8 x MASK> %v8a, %v8b
%v16 = sext <8 x i8> %vor8 to <8 x i16>
v8tov4(i16, %v16, %v16a, %v16b)
%vor16 = or <4 x i16> %v16a, %v16b
%v32 = sext <4 x i16> %vor16 to <4 x i32>
v4tov2(i32, %v32, %v32a, %v32b)
%vor32 = or <2 x i32> %v32a, %v32b
%v0 = extractelement <2 x i32> %vor32, i32 0
%v1 = extractelement <2 x i32> %vor32, i32 1
%v = or i32 %v0, %v1
%cmp = icmp ne i32 %v, 0
ret i1 %cmp
}
define i1 @__all(<WIDTH x MASK>) nounwind readnone alwaysinline {
v16tov8(MASK, %0, %v8a, %v8b)
%vand8 = and <8 x MASK> %v8a, %v8b
%v16 = sext <8 x i8> %vand8 to <8 x i16>
v8tov4(i16, %v16, %v16a, %v16b)
%vand16 = and <4 x i16> %v16a, %v16b
%v32 = sext <4 x i16> %vand16 to <4 x i32>
v4tov2(i32, %v32, %v32a, %v32b)
%vand32 = and <2 x i32> %v32a, %v32b
%v0 = extractelement <2 x i32> %vand32, i32 0
%v1 = extractelement <2 x i32> %vand32, i32 1
%v = and i32 %v0, %v1
%cmp = icmp ne i32 %v, 0
ret i1 %cmp
}
define i1 @__none(<WIDTH x MASK>) nounwind readnone alwaysinline {
%any = call i1 @__any(<WIDTH x MASK> %0)
%none = icmp eq i1 %any, 0
ret i1 %none
}
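;; __any and __all fold the mask with a halving or/and tree rather than
;; extracting lanes one by one. The semantics, as a scalar C sketch:
;;   int any(const unsigned char m[16]) {
;;       unsigned char acc = 0;
;;       for (int i = 0; i < 16; i++) acc |= m[i];
;;       return acc != 0;
;;   }
;; (__all is the same with &= and acc initialized to 0xFF.)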
;; $1: scalar type
;; $2: vector/vector reduce function (2 x <WIDTH x vec> -> <WIDTH x vec>)
;; $3: pairwise vector reduce function (2 x <2 x vec> -> <2 x vec>)
;; $4: scalar reduce function
define(`neon_reduce', `
v16tov8($1, %0, %va, %vb)
%va_16 = shufflevector <8 x $1> %va, <8 x $1> undef,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 undef, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef>
%vb_16 = shufflevector <8 x $1> %vb, <8 x $1> undef,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 undef, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef>
%v8 = call <16 x $1> $2(<16 x $1> %va_16, <16 x $1> %vb_16)
%v8a = shufflevector <16 x $1> %v8, <16 x $1> undef,
<16 x i32> <i32 0, i32 1, i32 2, i32 3,
i32 undef, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef>
%v8b = shufflevector <16 x $1> %v8, <16 x $1> undef,
<16 x i32> <i32 4, i32 5, i32 6, i32 7,
i32 undef, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef>
%v4 = call <16 x $1> $2(<16 x $1> %v8a, <16 x $1> %v8b)
%vfirst_4 = shufflevector <16 x $1> %v4, <16 x $1> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
v4tov2($1, %vfirst_4, %v0, %v1)
%vh = call <2 x $1> $3(<2 x $1> %v0, <2 x $1> %v1)
%vh0 = extractelement <2 x $1> %vh, i32 0
%vh1 = extractelement <2 x $1> %vh, i32 1
%r = call $1 $4($1 %vh0, $1 %vh1)
ret $1 %r
')
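;; neon_reduce folds the 16 lanes down a binary tree: $2 combines the two
;; 8-lane halves (then the 4-lane halves), $3 combines the last two pairs,
;; and $4 finishes in scalar. The shape, as C-like pseudocode with
;; illustrative names only:
;;   v8 = op(lo8, hi8);                  /* $2, 8 lanes remain useful */
;;   v4 = op(v8_lo4, v8_hi4);            /* $2, 4 lanes remain useful */
;;   v2 = pairwise_op(v4_lo2, v4_hi2);   /* $3 */
;;   return scalar_op(v2[0], v2[1]);     /* $4 */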
declare <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float>, <2 x float>) nounwind readnone
define internal float @add_f32(float, float) {
%r = fadd float %0, %1
ret float %r
}
define internal <WIDTH x float> @__add_varying_float(<WIDTH x float>, <WIDTH x float>) {
%r = fadd <WIDTH x float> %0, %1
ret <WIDTH x float> %r
}
define float @__reduce_add_float(<WIDTH x float>) nounwind readnone {
neon_reduce(float, @__add_varying_float, @llvm.arm.neon.vpadd.v2f32, @add_f32)
}
declare <2 x float> @llvm.arm.neon.vpmins.v2f32(<2 x float>, <2 x float>) nounwind readnone
define internal float @min_f32(float, float) {
%cmp = fcmp olt float %0, %1
%r = select i1 %cmp, float %0, float %1
ret float %r
}
define float @__reduce_min_float(<WIDTH x float>) nounwind readnone {
neon_reduce(float, @__min_varying_float, @llvm.arm.neon.vpmins.v2f32, @min_f32)
}
declare <2 x float> @llvm.arm.neon.vpmaxs.v2f32(<2 x float>, <2 x float>) nounwind readnone
define internal float @max_f32(float, float) {
%cmp = fcmp ugt float %0, %1
%r = select i1 %cmp, float %0, float %1
ret float %r
}
define float @__reduce_max_float(<WIDTH x float>) nounwind readnone {
neon_reduce(float, @__max_varying_float, @llvm.arm.neon.vpmaxs.v2f32, @max_f32)
}
declare <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32>) nounwind readnone
define i64 @__reduce_add_int8(<WIDTH x i8>) nounwind readnone {
%a16 = call <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8> %0)
%a32 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %a16)
%a64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %a32)
%a0 = extractelement <2 x i64> %a64, i32 0
%a1 = extractelement <2 x i64> %a64, i32 1
%r = add i64 %a0, %a1
ret i64 %r
}
define i64 @__reduce_add_int16(<WIDTH x i16>) nounwind readnone {
v16tov8(i16, %0, %va, %vb)
%a32 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %va)
%b32 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %vb)
%a64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %a32)
%b64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %b32)
%sum = add <2 x i64> %a64, %b64
%a0 = extractelement <2 x i64> %sum, i32 0
%a1 = extractelement <2 x i64> %sum, i32 1
%r = add i64 %a0, %a1
ret i64 %r
}
define i64 @__reduce_add_int32(<WIDTH x i32>) nounwind readnone {
v16tov4(i32, %0, %va, %vb, %vc, %vd)
%a64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %va)
%b64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %vb)
%c64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %vc)
%d64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %vd)
%ab = add <2 x i64> %a64, %b64
%cd = add <2 x i64> %c64, %d64
%sum = add <2 x i64> %ab, %cd
%a0 = extractelement <2 x i64> %sum, i32 0
%a1 = extractelement <2 x i64> %sum, i32 1
%r = add i64 %a0, %a1
ret i64 %r
}
declare <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
define internal i32 @min_si32(i32, i32) {
%cmp = icmp slt i32 %0, %1
%r = select i1 %cmp, i32 %0, i32 %1
ret i32 %r
}
define i32 @__reduce_min_int32(<WIDTH x i32>) nounwind readnone {
neon_reduce(i32, @__min_varying_int32, @llvm.arm.neon.vpmins.v2i32, @min_si32)
}
declare <2 x i32> @llvm.arm.neon.vpmaxs.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
define internal i32 @max_si32(i32, i32) {
%cmp = icmp sgt i32 %0, %1
%r = select i1 %cmp, i32 %0, i32 %1
ret i32 %r
}
define i32 @__reduce_max_int32(<WIDTH x i32>) nounwind readnone {
neon_reduce(i32, @__max_varying_int32, @llvm.arm.neon.vpmaxs.v2i32, @max_si32)
}
declare <2 x i32> @llvm.arm.neon.vpminu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
define internal i32 @min_ui32(i32, i32) {
%cmp = icmp ult i32 %0, %1
%r = select i1 %cmp, i32 %0, i32 %1
ret i32 %r
}
define i32 @__reduce_min_uint32(<WIDTH x i32>) nounwind readnone {
neon_reduce(i32, @__min_varying_uint32, @llvm.arm.neon.vpminu.v2i32, @min_ui32)
}
declare <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
define internal i32 @max_ui32(i32, i32) {
%cmp = icmp ugt i32 %0, %1
%r = select i1 %cmp, i32 %0, i32 %1
ret i32 %r
}
define i32 @__reduce_max_uint32(<WIDTH x i32>) nounwind readnone {
neon_reduce(i32, @__max_varying_uint32, @llvm.arm.neon.vpmaxu.v2i32, @max_ui32)
}
define internal double @__add_uniform_double(double, double) {
%r = fadd double %0, %1
ret double %r
}
define internal <WIDTH x double> @__add_varying_double(<WIDTH x double>, <WIDTH x double>) {
%r = fadd <WIDTH x double> %0, %1
ret <WIDTH x double> %r
}
define double @__reduce_add_double(<WIDTH x double>) nounwind readnone {
reduce16(double, @__add_varying_double, @__add_uniform_double)
}
define double @__reduce_min_double(<WIDTH x double>) nounwind readnone {
reduce16(double, @__min_varying_double, @__min_uniform_double)
}
define double @__reduce_max_double(<WIDTH x double>) nounwind readnone {
reduce16(double, @__max_varying_double, @__max_uniform_double)
}
define internal i64 @__add_uniform_int64(i64, i64) {
%r = add i64 %0, %1
ret i64 %r
}
define internal <WIDTH x i64> @__add_varying_int64(<WIDTH x i64>, <WIDTH x i64>) {
%r = add <WIDTH x i64> %0, %1
ret <WIDTH x i64> %r
}
define i64 @__reduce_add_int64(<WIDTH x i64>) nounwind readnone {
reduce16(i64, @__add_varying_int64, @__add_uniform_int64)
}
define i64 @__reduce_min_int64(<WIDTH x i64>) nounwind readnone {
reduce16(i64, @__min_varying_int64, @__min_uniform_int64)
}
define i64 @__reduce_max_int64(<WIDTH x i64>) nounwind readnone {
reduce16(i64, @__max_varying_int64, @__max_uniform_int64)
}
define i64 @__reduce_min_uint64(<WIDTH x i64>) nounwind readnone {
reduce16(i64, @__min_varying_uint64, @__min_uniform_uint64)
}
define i64 @__reduce_max_uint64(<WIDTH x i64>) nounwind readnone {
reduce16(i64, @__max_varying_uint64, @__max_uniform_uint64)
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int8/int16 builtins
declare <16 x i8> @llvm.arm.neon.vrhaddu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
define <16 x i8> @__avg_up_uint8(<16 x i8>, <16 x i8>) nounwind readnone {
%r = call <16 x i8> @llvm.arm.neon.vrhaddu.v16i8(<16 x i8> %0, <16 x i8> %1)
ret <16 x i8> %r
}
declare <16 x i8> @llvm.arm.neon.vrhadds.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
define <16 x i8> @__avg_up_int8(<16 x i8>, <16 x i8>) nounwind readnone {
%r = call <16 x i8> @llvm.arm.neon.vrhadds.v16i8(<16 x i8> %0, <16 x i8> %1)
ret <16 x i8> %r
}
declare <16 x i8> @llvm.arm.neon.vhaddu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
define <16 x i8> @__avg_down_uint8(<16 x i8>, <16 x i8>) nounwind readnone {
%r = call <16 x i8> @llvm.arm.neon.vhaddu.v16i8(<16 x i8> %0, <16 x i8> %1)
ret <16 x i8> %r
}
declare <16 x i8> @llvm.arm.neon.vhadds.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
define <16 x i8> @__avg_down_int8(<16 x i8>, <16 x i8>) nounwind readnone {
%r = call <16 x i8> @llvm.arm.neon.vhadds.v16i8(<16 x i8> %0, <16 x i8> %1)
ret <16 x i8> %r
}
declare <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i16> @__avg_up_uint16(<16 x i16>, <16 x i16>) nounwind readnone {
v16tov8(i16, %0, %a0, %b0)
v16tov8(i16, %1, %a1, %b1)
%r0 = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> %a0, <8 x i16> %a1)
%r1 = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> %b0, <8 x i16> %b1)
v8tov16(i16, %r0, %r1, %r)
ret <16 x i16> %r
}
declare <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i16> @__avg_up_int16(<16 x i16>, <16 x i16>) nounwind readnone {
v16tov8(i16, %0, %a0, %b0)
v16tov8(i16, %1, %a1, %b1)
%r0 = call <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16> %a0, <8 x i16> %a1)
%r1 = call <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16> %b0, <8 x i16> %b1)
v8tov16(i16, %r0, %r1, %r)
ret <16 x i16> %r
}
declare <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i16> @__avg_down_uint16(<16 x i16>, <16 x i16>) nounwind readnone {
v16tov8(i16, %0, %a0, %b0)
v16tov8(i16, %1, %a1, %b1)
%r0 = call <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16> %a0, <8 x i16> %a1)
%r1 = call <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16> %b0, <8 x i16> %b1)
v8tov16(i16, %r0, %r1, %r)
ret <16 x i16> %r
}
declare <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i16> @__avg_down_int16(<16 x i16>, <16 x i16>) nounwind readnone {
v16tov8(i16, %0, %a0, %b0)
v16tov8(i16, %1, %a1, %b1)
%r0 = call <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16> %a0, <8 x i16> %a1)
%r1 = call <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16> %b0, <8 x i16> %b1)
v8tov16(i16, %r0, %r1, %r)
ret <16 x i16> %r
}
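;; vrhadd/vhadd compute the average in widened precision, so a+b cannot
;; overflow. Their scalar C equivalents:
;;   unsigned char avg_up(unsigned char a, unsigned char b) {
;;       return (unsigned char)(((unsigned)a + b + 1) >> 1);   /* vrhadd */
;;   }
;;   unsigned char avg_down(unsigned char a, unsigned char b) {
;;       return (unsigned char)(((unsigned)a + b) >> 1);       /* vhadd */
;;   }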
@@ -0,0 +1,346 @@
;;
;; target-neon-common.ll
;;
;; Copyright(c) 2013 Google, Inc.
;;
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;; * Redistributions of source code must retain the above copyright
;; notice, this list of conditions and the following disclaimer.
;;
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;;
;; * Neither the name of Matt Pharr nor the names of its
;; contributors may be used to endorse or promote products derived from
;; this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
target datalayout = "e-p:32:32:32-S32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f16:16:16-f32:32:32-f64:32:64-f128:128:128-v64:32:64-v128:32:128-a0:0:64-n32"
stdlib_core()
scans()
reduce_equal(WIDTH)
rdrand_decls()
define_shuffles()
aossoa()
ctlztz()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines
declare <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16>) nounwind readnone
define float @__half_to_float_uniform(i16 %v) nounwind readnone {
%v1 = bitcast i16 %v to <1 x i16>
%vec = shufflevector <1 x i16> %v1, <1 x i16> undef,
<4 x i32> <i32 0, i32 0, i32 0, i32 0>
%h = call <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16> %vec)
%r = extractelement <4 x float> %h, i32 0
ret float %r
}
define i16 @__float_to_half_uniform(float %v) nounwind readnone {
%v1 = bitcast float %v to <1 x float>
%vec = shufflevector <1 x float> %v1, <1 x float> undef,
<4 x i32> <i32 0, i32 0, i32 0, i32 0>
%h = call <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float> %vec)
%r = extractelement <4 x i16> %h, i32 0
ret i16 %r
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; math
define void @__fastmath() nounwind {
ret void
}
;; round/floor/ceil
;; FIXME: grabbed these from the sse2 target, which does not have native
;; instructions for these. Is there a better approach for NEON?
define float @__round_uniform_float(float) nounwind readonly alwaysinline {
%float_to_int_bitcast.i.i.i.i = bitcast float %0 to i32
%bitop.i.i = and i32 %float_to_int_bitcast.i.i.i.i, -2147483648
%bitop.i = xor i32 %bitop.i.i, %float_to_int_bitcast.i.i.i.i
%int_to_float_bitcast.i.i40.i = bitcast i32 %bitop.i to float
%binop.i = fadd float %int_to_float_bitcast.i.i40.i, 8.388608e+06
%binop21.i = fadd float %binop.i, -8.388608e+06
%float_to_int_bitcast.i.i.i = bitcast float %binop21.i to i32
%bitop31.i = xor i32 %float_to_int_bitcast.i.i.i, %bitop.i.i
%int_to_float_bitcast.i.i.i = bitcast i32 %bitop31.i to float
ret float %int_to_float_bitcast.i.i.i
}
define float @__floor_uniform_float(float) nounwind readonly alwaysinline {
%calltmp.i = tail call float @__round_uniform_float(float %0) nounwind
%bincmp.i = fcmp ogt float %calltmp.i, %0
%selectexpr.i = sext i1 %bincmp.i to i32
%bitop.i = and i32 %selectexpr.i, -1082130432
%int_to_float_bitcast.i.i.i = bitcast i32 %bitop.i to float
%binop.i = fadd float %calltmp.i, %int_to_float_bitcast.i.i.i
ret float %binop.i
}
define float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
%calltmp.i = tail call float @__round_uniform_float(float %0) nounwind
%bincmp.i = fcmp olt float %calltmp.i, %0
%selectexpr.i = sext i1 %bincmp.i to i32
%bitop.i = and i32 %selectexpr.i, 1065353216
%int_to_float_bitcast.i.i.i = bitcast i32 %bitop.i to float
%binop.i = fadd float %calltmp.i, %int_to_float_bitcast.i.i.i
ret float %binop.i
}
;; FIXME: rounding doubles and double vectors needs to be implemented
declare double @__round_uniform_double(double) nounwind readnone
declare double @__floor_uniform_double(double) nounwind readnone
declare double @__ceil_uniform_double(double) nounwind readnone
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; min/max
define float @__max_uniform_float(float, float) nounwind readnone {
%cmp = fcmp ugt float %0, %1
%r = select i1 %cmp, float %0, float %1
ret float %r
}
define float @__min_uniform_float(float, float) nounwind readnone {
%cmp = fcmp ult float %0, %1
%r = select i1 %cmp, float %0, float %1
ret float %r
}
define i32 @__min_uniform_int32(i32, i32) nounwind readnone {
%cmp = icmp slt i32 %0, %1
%r = select i1 %cmp, i32 %0, i32 %1
ret i32 %r
}
define i32 @__max_uniform_int32(i32, i32) nounwind readnone {
%cmp = icmp sgt i32 %0, %1
%r = select i1 %cmp, i32 %0, i32 %1
ret i32 %r
}
define i32 @__min_uniform_uint32(i32, i32) nounwind readnone {
%cmp = icmp ult i32 %0, %1
%r = select i1 %cmp, i32 %0, i32 %1
ret i32 %r
}
define i32 @__max_uniform_uint32(i32, i32) nounwind readnone {
%cmp = icmp ugt i32 %0, %1
%r = select i1 %cmp, i32 %0, i32 %1
ret i32 %r
}
define i64 @__min_uniform_int64(i64, i64) nounwind readnone {
%cmp = icmp slt i64 %0, %1
%r = select i1 %cmp, i64 %0, i64 %1
ret i64 %r
}
define i64 @__max_uniform_int64(i64, i64) nounwind readnone {
%cmp = icmp sgt i64 %0, %1
%r = select i1 %cmp, i64 %0, i64 %1
ret i64 %r
}
define i64 @__min_uniform_uint64(i64, i64) nounwind readnone {
%cmp = icmp ult i64 %0, %1
%r = select i1 %cmp, i64 %0, i64 %1
ret i64 %r
}
define i64 @__max_uniform_uint64(i64, i64) nounwind readnone {
%cmp = icmp ugt i64 %0, %1
%r = select i1 %cmp, i64 %0, i64 %1
ret i64 %r
}
define double @__min_uniform_double(double, double) nounwind readnone {
%cmp = fcmp olt double %0, %1
%r = select i1 %cmp, double %0, double %1
ret double %r
}
define double @__max_uniform_double(double, double) nounwind readnone {
%cmp = fcmp ogt double %0, %1
%r = select i1 %cmp, double %0, double %1
ret double %r
}
define <WIDTH x i64> @__min_varying_int64(<WIDTH x i64>, <WIDTH x i64>) nounwind readnone {
%m = icmp slt <WIDTH x i64> %0, %1
%r = select <WIDTH x i1> %m, <WIDTH x i64> %0, <WIDTH x i64> %1
ret <WIDTH x i64> %r
}
define <WIDTH x i64> @__max_varying_int64(<WIDTH x i64>, <WIDTH x i64>) nounwind readnone {
%m = icmp sgt <WIDTH x i64> %0, %1
%r = select <WIDTH x i1> %m, <WIDTH x i64> %0, <WIDTH x i64> %1
ret <WIDTH x i64> %r
}
define <WIDTH x i64> @__min_varying_uint64(<WIDTH x i64>, <WIDTH x i64>) nounwind readnone {
%m = icmp ult <WIDTH x i64> %0, %1
%r = select <WIDTH x i1> %m, <WIDTH x i64> %0, <WIDTH x i64> %1
ret <WIDTH x i64> %r
}
define <WIDTH x i64> @__max_varying_uint64(<WIDTH x i64>, <WIDTH x i64>) nounwind readnone {
%m = icmp ugt <WIDTH x i64> %0, %1
%r = select <WIDTH x i1> %m, <WIDTH x i64> %0, <WIDTH x i64> %1
ret <WIDTH x i64> %r
}
define <WIDTH x double> @__min_varying_double(<WIDTH x double>,
<WIDTH x double>) nounwind readnone {
%m = fcmp olt <WIDTH x double> %0, %1
%r = select <WIDTH x i1> %m, <WIDTH x double> %0, <WIDTH x double> %1
ret <WIDTH x double> %r
}
define <WIDTH x double> @__max_varying_double(<WIDTH x double>,
<WIDTH x double>) nounwind readnone {
%m = fcmp ogt <WIDTH x double> %0, %1
%r = select <WIDTH x i1> %m, <WIDTH x double> %0, <WIDTH x double> %1
ret <WIDTH x double> %r
}
;; sqrt/rsqrt/rcp
declare float @llvm.sqrt.f32(float)
define float @__sqrt_uniform_float(float) nounwind readnone {
%r = call float @llvm.sqrt.f32(float %0)
ret float %r
}
declare double @llvm.sqrt.f64(double)
define double @__sqrt_uniform_double(double) nounwind readnone {
%r = call double @llvm.sqrt.f64(double %0)
ret double %r
}
;; bit ops
declare i32 @llvm.ctpop.i32(i32) nounwind readnone
declare i64 @llvm.ctpop.i64(i64) nounwind readnone
define i32 @__popcnt_int32(i32) nounwind readnone {
%v = call i32 @llvm.ctpop.i32(i32 %0)
ret i32 %v
}
define i64 @__popcnt_int64(i64) nounwind readnone {
%v = call i64 @llvm.ctpop.i64(i64 %0)
ret i64 %v
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts
masked_load(i8, 1)
masked_load(i16, 2)
masked_load(i32, 4)
masked_load(float, 4)
masked_load(i64, 8)
masked_load(double, 8)
gen_masked_store(i8)
gen_masked_store(i16)
gen_masked_store(i32)
gen_masked_store(i64)
masked_store_float_double()
define void @__masked_store_blend_i8(<WIDTH x i8>* nocapture %ptr, <WIDTH x i8> %new,
<WIDTH x MASK> %mask) nounwind alwaysinline {
%old = load <WIDTH x i8> * %ptr
%mask1 = trunc <WIDTH x MASK> %mask to <WIDTH x i1>
%result = select <WIDTH x i1> %mask1, <WIDTH x i8> %new, <WIDTH x i8> %old
store <WIDTH x i8> %result, <WIDTH x i8> * %ptr
ret void
}
define void @__masked_store_blend_i16(<WIDTH x i16>* nocapture %ptr, <WIDTH x i16> %new,
<WIDTH x MASK> %mask) nounwind alwaysinline {
%old = load <WIDTH x i16> * %ptr
%mask1 = trunc <WIDTH x MASK> %mask to <WIDTH x i1>
%result = select <WIDTH x i1> %mask1, <WIDTH x i16> %new, <WIDTH x i16> %old
store <WIDTH x i16> %result, <WIDTH x i16> * %ptr
ret void
}
define void @__masked_store_blend_i32(<WIDTH x i32>* nocapture %ptr, <WIDTH x i32> %new,
<WIDTH x MASK> %mask) nounwind alwaysinline {
%old = load <WIDTH x i32> * %ptr
%mask1 = trunc <WIDTH x MASK> %mask to <WIDTH x i1>
%result = select <WIDTH x i1> %mask1, <WIDTH x i32> %new, <WIDTH x i32> %old
store <WIDTH x i32> %result, <WIDTH x i32> * %ptr
ret void
}
define void @__masked_store_blend_i64(<WIDTH x i64>* nocapture %ptr,
<WIDTH x i64> %new, <WIDTH x MASK> %mask) nounwind alwaysinline {
%old = load <WIDTH x i64> * %ptr
%mask1 = trunc <WIDTH x MASK> %mask to <WIDTH x i1>
%result = select <WIDTH x i1> %mask1, <WIDTH x i64> %new, <WIDTH x i64> %old
store <WIDTH x i64> %result, <WIDTH x i64> * %ptr
ret void
}
;; yuck. We need declarations of these, even though we shouldn't ever
;; actually generate calls to them for the NEON target...
include(`svml.m4')
svml_stubs(float,f,WIDTH)
svml_stubs(double,d,WIDTH)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather
gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)
gen_scatter(i8)
gen_scatter(i16)
gen_scatter(i32)
gen_scatter(float)
gen_scatter(i64)
gen_scatter(double)
packed_load_and_store(4)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; prefetch
define_prefetches()
@@ -33,6 +33,7 @@ ctlztz()
define_prefetches()
define_shuffles()
aossoa()
rdrand_decls()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp
@@ -268,4 +269,8 @@ define i64 @__popcnt_int64(i64) nounwind readnone alwaysinline {
ret i64 %val
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int8/int16 builtins
define_avgs()
@@ -1,4 +1,4 @@
-;; Copyright (c) 2010-2011, Intel Corporation
+;; Copyright (c) 2010-2012, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
@@ -47,6 +47,14 @@ int64minmax()
include(`target-sse2-common.ll')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines
declare float @__half_to_float_uniform(i16 %v) nounwind readnone
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp
@@ -97,87 +105,14 @@ define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysin
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; svml stuff
-declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone
-declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone
-declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone
-declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone
-declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone
-declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone
-declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone
-declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone
-declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone
+include(`svml.m4')
+;; single precision
+svml_declare(float,f4,4)
+svml_define_x(float,f4,4,f,8)
-define <8 x float> @__svml_sin(<8 x float>) nounwind readnone alwaysinline {
-unary4to8(ret, float, @__svml_sinf4, %0)
-ret <8 x float> %ret
-}
-define <8 x float> @__svml_cos(<8 x float>) nounwind readnone alwaysinline {
-unary4to8(ret, float, @__svml_cosf4, %0)
-ret <8 x float> %ret
-}
-define void @__svml_sincos(<8 x float>, <8 x float> *,
-<8 x float> *) nounwind readnone alwaysinline {
-; call svml_sincosf4 two times with the two 4-wide sub-vectors
-%a = shufflevector <8 x float> %0, <8 x float> undef,
-<4 x i32> <i32 0, i32 1, i32 2, i32 3>
-%b = shufflevector <8 x float> %0, <8 x float> undef,
-<4 x i32> <i32 4, i32 5, i32 6, i32 7>
-%cospa = alloca <4 x float>
-%sa = call <4 x float> @__svml_sincosf4(<4 x float> * %cospa, <4 x float> %a)
-%cospb = alloca <4 x float>
-%sb = call <4 x float> @__svml_sincosf4(<4 x float> * %cospb, <4 x float> %b)
-%sin = shufflevector <4 x float> %sa, <4 x float> %sb,
-<8 x i32> <i32 0, i32 1, i32 2, i32 3,
-i32 4, i32 5, i32 6, i32 7>
-store <8 x float> %sin, <8 x float> * %1
-%cosa = load <4 x float> * %cospa
-%cosb = load <4 x float> * %cospb
-%cos = shufflevector <4 x float> %cosa, <4 x float> %cosb,
-<8 x i32> <i32 0, i32 1, i32 2, i32 3,
-i32 4, i32 5, i32 6, i32 7>
-store <8 x float> %cos, <8 x float> * %2
-ret void
-}
-define <8 x float> @__svml_tan(<8 x float>) nounwind readnone alwaysinline {
-unary4to8(ret, float, @__svml_tanf4, %0)
-ret <8 x float> %ret
-}
-define <8 x float> @__svml_atan(<8 x float>) nounwind readnone alwaysinline {
-unary4to8(ret, float, @__svml_atanf4, %0)
-ret <8 x float> %ret
-}
-define <8 x float> @__svml_atan2(<8 x float>,
-<8 x float>) nounwind readnone alwaysinline {
-binary4to8(ret, float, @__svml_atan2f4, %0, %1)
-ret <8 x float> %ret
-}
-define <8 x float> @__svml_exp(<8 x float>) nounwind readnone alwaysinline {
-unary4to8(ret, float, @__svml_expf4, %0)
-ret <8 x float> %ret
-}
-define <8 x float> @__svml_log(<8 x float>) nounwind readnone alwaysinline {
-unary4to8(ret, float, @__svml_logf4, %0)
-ret <8 x float> %ret
-}
-define <8 x float> @__svml_pow(<8 x float>,
-<8 x float>) nounwind readnone alwaysinline {
-binary4to8(ret, float, @__svml_powf4, %0, %1)
-ret <8 x float> %ret
-}
+;; double precision
+svml_declare(double,2,2)
+svml_define_x(double,2,2,d,8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -287,7 +222,7 @@ define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
-define i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
+define i64 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
; first do two 4-wide movmsk calls
%floatmask = bitcast <8 x i32> %0 to <8 x float>
%m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
@@ -301,7 +236,92 @@ define i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
; of the second one
%v1s = shl i32 %v1, 4
%v = or i32 %v0, %v1s
-ret i32 %v
+%v64 = zext i32 %v to i64
+ret i64 %v64
}
define i1 @__any(<8 x i32>) nounwind readnone alwaysinline {
; first do two 4-wide movmsk calls
%floatmask = bitcast <8 x i32> %0 to <8 x float>
%m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v0 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m0) nounwind readnone
%m1 = shufflevector <8 x float> %floatmask, <8 x float> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
%v1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m1) nounwind readnone
; and shift the first one over by 4 before ORing it with the value
; of the second one
%v1s = shl i32 %v1, 4
%v = or i32 %v0, %v1s
%cmp = icmp ne i32 %v, 0
ret i1 %cmp
}
define i1 @__all(<8 x i32>) nounwind readnone alwaysinline {
; first do two 4-wide movmsk calls
%floatmask = bitcast <8 x i32> %0 to <8 x float>
%m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v0 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m0) nounwind readnone
%m1 = shufflevector <8 x float> %floatmask, <8 x float> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
%v1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m1) nounwind readnone
; and shift the first one over by 4 before ORing it with the value
; of the second one
%v1s = shl i32 %v1, 4
%v = or i32 %v0, %v1s
%cmp = icmp eq i32 %v, 255
ret i1 %cmp
}
define i1 @__none(<8 x i32>) nounwind readnone alwaysinline {
; first do two 4-wide movmsk calls
%floatmask = bitcast <8 x i32> %0 to <8 x float>
%m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v0 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m0) nounwind readnone
%m1 = shufflevector <8 x float> %floatmask, <8 x float> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
%v1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m1) nounwind readnone
; and shift the first one over by 4 before ORing it with the value
; of the second one
%v1s = shl i32 %v1, 4
%v = or i32 %v0, %v1s
%cmp = icmp eq i32 %v, 0
ret i1 %cmp
}
declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone
define i16 @__reduce_add_int8(<8 x i8>) nounwind readnone alwaysinline {
%wide8 = shufflevector <8 x i8> %0, <8 x i8> zeroinitializer,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
%rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8,
<16 x i8> zeroinitializer)
%r0 = extractelement <2 x i64> %rv, i32 0
%r1 = extractelement <2 x i64> %rv, i32 1
%r = add i64 %r0, %r1
%r16 = trunc i64 %r to i16
ret i16 %r16
}
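;; psadbw against zero sums |b - 0| = b over each 8-byte group, so it
;; horizontally adds unsigned bytes into the two 64-bit lanes. A sketch
;; with SSE2 intrinsics:
;;   #include <emmintrin.h>
;;   unsigned hsum_u8(__m128i v) {     /* sums all 16 bytes, unsigned */
;;       __m128i s = _mm_sad_epu8(v, _mm_setzero_si128());
;;       return (unsigned)(_mm_cvtsi128_si32(s) +
;;                         _mm_cvtsi128_si32(_mm_srli_si128(s, 8)));
;;   }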
define internal <8 x i16> @__add_varying_i16(<8 x i16>,
<8 x i16>) nounwind readnone alwaysinline {
%r = add <8 x i16> %0, %1
ret <8 x i16> %r
}
define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline {
%r = add i16 %0, %1
ret i16 %r
}
define i16 @__reduce_add_int16(<8 x i16>) nounwind readnone alwaysinline {
reduce8(i16, @__add_varying_i16, @__add_uniform_i16)
}
define <4 x float> @__vec4_add_float(<4 x float> %v0,
@@ -352,11 +372,6 @@ define i32 @__reduce_max_int32(<8 x i32>) nounwind readnone alwaysinline {
reduce8(i32, @__max_varying_int32, @__max_uniform_int32)
}
-define i32 @__reduce_add_uint32(<8 x i32> %v) nounwind readnone alwaysinline {
-%r = call i32 @__reduce_add_int32(<8 x i32> %v)
-ret i32 %r
-}
define i32 @__reduce_min_uint32(<8 x i32>) nounwind readnone alwaysinline {
reduce8(i32, @__min_varying_uint32, @__min_uniform_uint32)
}
@@ -389,7 +404,7 @@ define double @__reduce_max_double(<8 x double>) nounwind readnone {
}
define <4 x i64> @__add_varying_int64(<4 x i64>,
-<4 x i64>) nounwind readnone alwaysinline {
+<4 x i64>) nounwind readnone alwaysinline {
%r = add <4 x i64> %0, %1
ret <4 x i64> %r
}
@@ -424,28 +439,30 @@ reduce_equal(8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts
-load_and_broadcast(8, i8, 8)
-load_and_broadcast(8, i16, 16)
-load_and_broadcast(8, i32, 32)
-load_and_broadcast(8, i64, 64)
-masked_load(8, i8, 8, 1)
-masked_load(8, i16, 16, 2)
-masked_load(8, i32, 32, 4)
-masked_load(8, i64, 64, 8)
+masked_load(i8, 1)
+masked_load(i16, 2)
+masked_load(i32, 4)
+masked_load(float, 4)
+masked_load(i64, 8)
+masked_load(double, 8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter
-gen_gather(8, i8)
-gen_gather(8, i16)
-gen_gather(8, i32)
-gen_gather(8, i64)
+gen_gather_factored(i8)
+gen_gather_factored(i16)
+gen_gather_factored(i32)
+gen_gather_factored(float)
+gen_gather_factored(i64)
+gen_gather_factored(double)
-gen_scatter(8, i8)
-gen_scatter(8, i16)
-gen_scatter(8, i32)
-gen_scatter(8, i64)
+gen_scatter(i8)
+gen_scatter(i16)
+gen_scatter(i32)
+gen_scatter(float)
+gen_scatter(i64)
+gen_scatter(double)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float rounding
@@ -549,23 +566,23 @@ define <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alway
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store
-gen_masked_store(8, i8, 8)
-gen_masked_store(8, i16, 16)
-gen_masked_store(8, i32, 32)
-gen_masked_store(8, i64, 64)
+gen_masked_store(i8)
+gen_masked_store(i16)
+gen_masked_store(i32)
+gen_masked_store(i64)
masked_store_blend_8_16_by_8()
-define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
-<8 x i32> %mask) nounwind alwaysinline {
+define void @__masked_store_blend_i32(<8 x i32>* nocapture, <8 x i32>,
+<8 x i32> %mask) nounwind alwaysinline {
%val = load <8 x i32> * %0, align 4
%newval = call <8 x i32> @__vselect_i32(<8 x i32> %val, <8 x i32> %1, <8 x i32> %mask)
store <8 x i32> %newval, <8 x i32> * %0, align 4
ret void
}
-define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
-<8 x i32> %mask) nounwind alwaysinline {
+define void @__masked_store_blend_i64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
+<8 x i32> %mask) nounwind alwaysinline {
%oldValue = load <8 x i64>* %ptr, align 8
; Do 8x64-bit blends by doing two <8 x i32> blends, where the <8 x i32> values
@@ -608,6 +625,8 @@ define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
ret void
}
masked_store_float_double()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision sqrt
@@ -1,4 +1,4 @@
-;; Copyright (c) 2010-2011, Intel Corporation
+;; Copyright (c) 2010-2012, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
@@ -44,6 +44,14 @@ int64minmax()
include(`target-sse2-common.ll')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines
declare float @__half_to_float_uniform(i16 %v) nounwind readnone
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding
;;
@@ -231,10 +239,62 @@ define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
-define i32 @__movmsk(<4 x i32>) nounwind readnone alwaysinline {
+define i64 @__movmsk(<4 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <4 x i32> %0 to <4 x float>
%v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
-ret i32 %v
+%v64 = zext i32 %v to i64
+ret i64 %v64
}
define i1 @__any(<4 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <4 x i32> %0 to <4 x float>
%v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
%cmp = icmp ne i32 %v, 0
ret i1 %cmp
}
define i1 @__all(<4 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <4 x i32> %0 to <4 x float>
%v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
%cmp = icmp eq i32 %v, 15
ret i1 %cmp
}
define i1 @__none(<4 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <4 x i32> %0 to <4 x float>
%v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
%cmp = icmp eq i32 %v, 0
ret i1 %cmp
}
declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone
define i16 @__reduce_add_int8(<4 x i8>) nounwind readnone alwaysinline {
%wide8 = shufflevector <4 x i8> %0, <4 x i8> zeroinitializer,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4,
i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
%rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8,
<16 x i8> zeroinitializer)
%r0 = extractelement <2 x i64> %rv, i32 0
%r1 = extractelement <2 x i64> %rv, i32 1
%r = add i64 %r0, %r1
%r16 = trunc i64 %r to i16
ret i16 %r16
}
define internal <4 x i16> @__add_varying_i16(<4 x i16>,
<4 x i16>) nounwind readnone alwaysinline {
%r = add <4 x i16> %0, %1
ret <4 x i16> %r
}
define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline {
%r = add i16 %0, %1
ret i16 %r
}
define i16 @__reduce_add_int16(<4 x i16>) nounwind readnone alwaysinline {
reduce4(i16, @__add_varying_i16, @__add_uniform_i16)
}
define float @__reduce_add_float(<4 x float> %v) nounwind readonly alwaysinline {
@@ -273,18 +333,13 @@ define i32 @__reduce_max_int32(<4 x i32>) nounwind readnone {
reduce4(i32, @__max_varying_int32, @__max_uniform_int32)
}
-define i32 @__reduce_add_uint32(<4 x i32> %v) nounwind readnone {
-%r = call i32 @__reduce_add_int32(<4 x i32> %v)
-ret i32 %r
-}
define i32 @__reduce_min_uint32(<4 x i32>) nounwind readnone {
reduce4(i32, @__min_varying_uint32, @__min_uniform_uint32)
}
define i32 @__reduce_max_uint32(<4 x i32>) nounwind readnone {
reduce4(i32, @__max_varying_uint32, @__max_uniform_uint32)
}
define double @__reduce_add_double(<4 x double>) nounwind readnone {
@@ -341,16 +396,16 @@ reduce_equal(4)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store
-define void @__masked_store_blend_32(<4 x i32>* nocapture, <4 x i32>,
-<4 x i32> %mask) nounwind alwaysinline {
+define void @__masked_store_blend_i32(<4 x i32>* nocapture, <4 x i32>,
+<4 x i32> %mask) nounwind alwaysinline {
%val = load <4 x i32> * %0, align 4
%newval = call <4 x i32> @__vselect_i32(<4 x i32> %val, <4 x i32> %1, <4 x i32> %mask)
store <4 x i32> %newval, <4 x i32> * %0, align 4
ret void
}
-define void @__masked_store_blend_64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
-<4 x i32> %mask) nounwind alwaysinline {
+define void @__masked_store_blend_i64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
+<4 x i32> %mask) nounwind alwaysinline {
%oldValue = load <4 x i64>* %ptr, align 8
; Do 4x64-bit blends by doing two <4 x i32> blends, where the <4 x i32> values
@@ -392,6 +447,8 @@ define void @__masked_store_blend_64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
}
masked_store_float_double()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp
@@ -439,62 +496,15 @@ define <4 x float> @__sqrt_varying_float(<4 x float>) nounwind readonly alwaysin
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; svml stuff
-declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone
-declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone
-declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone
-declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone
-declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone
-declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone
-declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone
-declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone
-declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone
+include(`svml.m4')
+;; single precision
+svml_declare(float,f4,4)
+svml_define(float,f4,4,f)
+;; double precision
+svml_declare(double,2,2)
+svml_define_x(double,2,2,d,4)
-define <4 x float> @__svml_sin(<4 x float>) nounwind readnone alwaysinline {
-%ret = call <4 x float> @__svml_sinf4(<4 x float> %0)
-ret <4 x float> %ret
-}
-define <4 x float> @__svml_cos(<4 x float>) nounwind readnone alwaysinline {
-%ret = call <4 x float> @__svml_cosf4(<4 x float> %0)
-ret <4 x float> %ret
-}
-define void @__svml_sincos(<4 x float>, <4 x float> *, <4 x float> *) nounwind readnone alwaysinline {
-%s = call <4 x float> @__svml_sincosf4(<4 x float> * %2, <4 x float> %0)
-store <4 x float> %s, <4 x float> * %1
-ret void
-}
-define <4 x float> @__svml_tan(<4 x float>) nounwind readnone alwaysinline {
-%ret = call <4 x float> @__svml_tanf4(<4 x float> %0)
-ret <4 x float> %ret
-}
-define <4 x float> @__svml_atan(<4 x float>) nounwind readnone alwaysinline {
-%ret = call <4 x float> @__svml_atanf4(<4 x float> %0)
-ret <4 x float> %ret
-}
-define <4 x float> @__svml_atan2(<4 x float>, <4 x float>) nounwind readnone alwaysinline {
-%ret = call <4 x float> @__svml_atan2f4(<4 x float> %0, <4 x float> %1)
-ret <4 x float> %ret
-}
-define <4 x float> @__svml_exp(<4 x float>) nounwind readnone alwaysinline {
-%ret = call <4 x float> @__svml_expf4(<4 x float> %0)
-ret <4 x float> %ret
-}
-define <4 x float> @__svml_log(<4 x float>) nounwind readnone alwaysinline {
-%ret = call <4 x float> @__svml_logf4(<4 x float> %0)
-ret <4 x float> %ret
-}
-define <4 x float> @__svml_pow(<4 x float>, <4 x float>) nounwind readnone alwaysinline {
-%ret = call <4 x float> @__svml_powf4(<4 x float> %0, <4 x float> %1)
-ret <4 x float> %ret
-}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float min/max
@@ -543,35 +553,37 @@ define <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind r
masked_store_blend_8_16_by_4()
-gen_masked_store(4, i8, 8)
-gen_masked_store(4, i16, 16)
-gen_masked_store(4, i32, 32)
-gen_masked_store(4, i64, 64)
+gen_masked_store(i8)
+gen_masked_store(i16)
+gen_masked_store(i32)
+gen_masked_store(i64)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts
-load_and_broadcast(4, i8, 8)
-load_and_broadcast(4, i16, 16)
-load_and_broadcast(4, i32, 32)
-load_and_broadcast(4, i64, 64)
-masked_load(4, i8, 8, 1)
-masked_load(4, i16, 16, 2)
-masked_load(4, i32, 32, 4)
-masked_load(4, i64, 64, 8)
+masked_load(i8, 1)
+masked_load(i16, 2)
+masked_load(i32, 4)
+masked_load(float, 4)
+masked_load(i64, 8)
+masked_load(double, 8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter
; define these with the macros from stdlib.m4
-gen_gather(4, i8)
-gen_gather(4, i16)
-gen_gather(4, i32)
-gen_gather(4, i64)
+gen_gather_factored(i8)
+gen_gather_factored(i16)
+gen_gather_factored(i32)
+gen_gather_factored(float)
+gen_gather_factored(i64)
+gen_gather_factored(double)
-gen_scatter(4, i8)
-gen_scatter(4, i16)
-gen_scatter(4, i32)
-gen_scatter(4, i64)
+gen_scatter(i8)
+gen_scatter(i16)
+gen_scatter(i32)
+gen_scatter(float)
+gen_scatter(i64)
+gen_scatter(double)
builtins/target-sse4-16.ll
@@ -0,0 +1,490 @@
;; Copyright (c) 2013, Google, Inc.
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;; * Redistributions of source code must retain the above copyright
;; notice, this list of conditions and the following disclaimer.
;;
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;;
;; * Neither the name of Google, Inc. nor the names of its
;; contributors may be used to endorse or promote products derived from
;; this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Define an 8-wide target with a 16-bit mask, built out of 4-wide operations
define(`WIDTH',`8')
define(`MASK',`i16')
include(`util.m4')
stdlib_core()
packed_load_and_store()
scans()
int64minmax()
include(`target-sse4-common.ll')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines
declare float @__half_to_float_uniform(i16 %v) nounwind readnone
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp
declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
define <WIDTH x float> @__rcp_varying_float(<WIDTH x float>) nounwind readonly alwaysinline {
unary4to8(call, float, @llvm.x86.sse.rcp.ps, %0)
; do one N-R iteration to improve precision
; float iv = __rcp_v(v);
; return iv * (2. - v * iv);
%v_iv = fmul <8 x float> %0, %call
%two_minus = fsub <8 x float> <float 2., float 2., float 2., float 2.,
float 2., float 2., float 2., float 2.>, %v_iv
%iv_mul = fmul <8 x float> %call, %two_minus
ret <8 x float> %iv_mul
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; rsqrt
declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
define <WIDTH x float> @__rsqrt_varying_float(<WIDTH x float> %v) nounwind readonly alwaysinline {
; float is = __rsqrt_v(v);
unary4to8(is, float, @llvm.x86.sse.rsqrt.ps, %v)
; Newton-Raphson iteration to improve precision
; return 0.5 * is * (3. - (v * is) * is);
%v_is = fmul <8 x float> %v, %is
%v_is_is = fmul <8 x float> %v_is, %is
%three_sub = fsub <8 x float> <float 3., float 3., float 3., float 3.,
float 3., float 3., float 3., float 3.>, %v_is_is
%is_mul = fmul <8 x float> %is, %three_sub
%half_scale = fmul <8 x float> <float 0.5, float 0.5, float 0.5, float 0.5,
float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
ret <8 x float> %half_scale
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; sqrt
declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysinline {
unary4to8(call, float, @llvm.x86.sse.sqrt.ps, %0)
ret <8 x float> %call
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision sqrt
declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
define <8 x double> @__sqrt_varying_double(<8 x double>) nounwind
alwaysinline {
unary2to8(ret, double, @llvm.x86.sse2.sqrt.pd, %0)
ret <8 x double> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding floats
declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone
define <8 x float> @__round_varying_float(<8 x float>) nounwind readonly alwaysinline {
; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
round4to8(%0, 8)
}
define <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline {
; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
round4to8(%0, 9)
}
define <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline {
; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
round4to8(%0, 10)
}
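;; In the roundps immediate, bits 1:0 select the mode (00 nearest, 01
;; down, 10 up, 11 truncate) and bit 3 (value 8) suppresses precision
;; exceptions, giving 8, 9, and 10 above. The floor case with intrinsics,
;; as a sketch:
;;   #include <smmintrin.h>
;;   __m128 floor4(__m128 x) {
;;       return _mm_round_ps(x, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
;;   }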
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding doubles
declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone
define <8 x double> @__round_varying_double(<8 x double>) nounwind readonly alwaysinline {
round2to8double(%0, 8)
}
define <8 x double> @__floor_varying_double(<8 x double>) nounwind readonly alwaysinline {
round2to8double(%0, 9)
}
define <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alwaysinline {
round2to8double(%0, 10)
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float min/max
declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
define <8 x float> @__max_varying_float(<8 x float>, <8 x float>) nounwind readonly alwaysinline {
binary4to8(call, float, @llvm.x86.sse.max.ps, %0, %1)
ret <8 x float> %call
}
define <8 x float> @__min_varying_float(<8 x float>, <8 x float>) nounwind readonly alwaysinline {
binary4to8(call, float, @llvm.x86.sse.min.ps, %0, %1)
ret <8 x float> %call
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int32 min/max
define <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
binary4to8(call, i32, @llvm.x86.sse41.pminsd, %0, %1)
ret <8 x i32> %call
}
define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
binary4to8(call, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
ret <8 x i32> %call
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; unsigned int min/max
define <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
binary4to8(call, i32, @llvm.x86.sse41.pminud, %0, %1)
ret <8 x i32> %call
}
define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
binary4to8(call, i32, @llvm.x86.sse41.pmaxud, %0, %1)
ret <8 x i32> %call
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision min/max
declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
define <8 x double> @__min_varying_double(<8 x double>, <8 x double>) nounwind readnone {
binary2to8(ret, double, @llvm.x86.sse2.min.pd, %0, %1)
ret <8 x double> %ret
}
define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind readnone {
binary2to8(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
ret <8 x double> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; svml
; FIXME
include(`svml.m4')
svml_stubs(float,f,WIDTH)
svml_stubs(double,d,WIDTH)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; horizontal ops / reductions
declare i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8>) nounwind readnone
define i64 @__movmsk(<8 x MASK>) nounwind readnone alwaysinline {
%m8 = trunc <8 x MASK> %0 to <8 x i8>
%mask8 = shufflevector <8 x i8> %m8, <8 x i8> zeroinitializer,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
%m = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %mask8)
%m64 = zext i32 %m to i64
ret i64 %m64
}
define i1 @__any(<8 x MASK>) nounwind readnone alwaysinline {
%m = call i64 @__movmsk(<8 x MASK> %0)
%mne = icmp ne i64 %m, 0
ret i1 %mne
}
define i1 @__all(<8 x MASK>) nounwind readnone alwaysinline {
%m = call i64 @__movmsk(<8 x MASK> %0)
%meq = icmp eq i64 %m, ALL_ON_MASK
ret i1 %meq
}
define i1 @__none(<8 x MASK>) nounwind readnone alwaysinline {
%m = call i64 @__movmsk(<8 x MASK> %0)
%meq = icmp eq i64 %m, 0
ret i1 %meq
}
declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone
define i16 @__reduce_add_int8(<8 x i8>) nounwind readnone alwaysinline {
%wide8 = shufflevector <8 x i8> %0, <8 x i8> zeroinitializer,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
%rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8,
<16 x i8> zeroinitializer)
%r0 = extractelement <2 x i64> %rv, i32 0
%r1 = extractelement <2 x i64> %rv, i32 1
%r = add i64 %r0, %r1
%r16 = trunc i64 %r to i16
ret i16 %r16
}
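__reduce_add_int8 leans on a psadbw trick: the sum of absolute differences against an all-zero vector is exactly the horizontal sum of the bytes, delivered as one 64-bit partial sum per 8-byte half. A rough Python model of the reduction (illustrative only, not part of the build):

    def psadbw(a, b):
        # one 64-bit sum of absolute byte differences per 8-byte half
        return (sum(abs(x - y) for x, y in zip(a[:8], b[:8])),
                sum(abs(x - y) for x, y in zip(a[8:], b[8:])))

    def reduce_add_int8(v8):
        wide = v8 + [0] * 8                  # widen <8 x i8> with zero lanes
        r0, r1 = psadbw(wide, [0] * 16)      # SAD against zero == horizontal sum
        return (r0 + r1) & 0xffff            # trunc i64 -> i16

    print reduce_add_int8([1, 2, 3, 4, 5, 6, 7, 8])   # 36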
define internal <8 x i16> @__add_varying_i16(<8 x i16>,
<8 x i16>) nounwind readnone alwaysinline {
%r = add <8 x i16> %0, %1
ret <8 x i16> %r
}
define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline {
%r = add i16 %0, %1
ret i16 %r
}
define i16 @__reduce_add_int16(<8 x i16>) nounwind readnone alwaysinline {
reduce8(i16, @__add_varying_i16, @__add_uniform_i16)
}
define internal <8 x float> @__add_varying_float(<8 x float>, <8 x float>) {
%r = fadd <8 x float> %0, %1
ret <8 x float> %r
}
define internal float @__add_uniform_float(float, float) {
%r = fadd float %0, %1
ret float %r
}
define float @__reduce_add_float(<8 x float>) nounwind readonly alwaysinline {
reduce8(float, @__add_varying_float, @__add_uniform_float)
}
define float @__reduce_min_float(<8 x float>) nounwind readnone {
reduce8(float, @__min_varying_float, @__min_uniform_float)
}
define float @__reduce_max_float(<8 x float>) nounwind readnone {
reduce8(float, @__max_varying_float, @__max_uniform_float)
}
define internal <8 x i32> @__add_varying_int32(<8 x i32>, <8 x i32>) {
%r = add <8 x i32> %0, %1
ret <8 x i32> %r
}
define internal i32 @__add_uniform_int32(i32, i32) {
%r = add i32 %0, %1
ret i32 %r
}
define i32 @__reduce_add_int32(<8 x i32>) nounwind readnone {
reduce8(i32, @__add_varying_int32, @__add_uniform_int32)
}
define i32 @__reduce_min_int32(<8 x i32>) nounwind readnone {
reduce8(i32, @__min_varying_int32, @__min_uniform_int32)
}
define i32 @__reduce_max_int32(<8 x i32>) nounwind readnone {
reduce8(i32, @__max_varying_int32, @__max_uniform_int32)
}
define i32 @__reduce_min_uint32(<8 x i32>) nounwind readnone {
reduce8(i32, @__min_varying_uint32, @__min_uniform_uint32)
}
define i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone {
reduce8(i32, @__max_varying_uint32, @__max_uniform_uint32)
}
define internal <8 x double> @__add_varying_double(<8 x double>, <8 x double>) {
%r = fadd <8 x double> %0, %1
ret <8 x double> %r
}
define internal double @__add_uniform_double(double, double) {
%r = fadd double %0, %1
ret double %r
}
define double @__reduce_add_double(<8 x double>) nounwind readnone {
reduce8(double, @__add_varying_double, @__add_uniform_double)
}
define double @__reduce_min_double(<8 x double>) nounwind readnone {
reduce8(double, @__min_varying_double, @__min_uniform_double)
}
define double @__reduce_max_double(<8 x double>) nounwind readnone {
reduce8(double, @__max_varying_double, @__max_uniform_double)
}
define internal <8 x i64> @__add_varying_int64(<8 x i64>, <8 x i64>) {
%r = add <8 x i64> %0, %1
ret <8 x i64> %r
}
define internal i64 @__add_uniform_int64(i64, i64) {
%r = add i64 %0, %1
ret i64 %r
}
define i64 @__reduce_add_int64(<8 x i64>) nounwind readnone {
reduce8(i64, @__add_varying_int64, @__add_uniform_int64)
}
define i64 @__reduce_min_int64(<8 x i64>) nounwind readnone {
reduce8(i64, @__min_varying_int64, @__min_uniform_int64)
}
define i64 @__reduce_max_int64(<8 x i64>) nounwind readnone {
reduce8(i64, @__max_varying_int64, @__max_uniform_int64)
}
define i64 @__reduce_min_uint64(<8 x i64>) nounwind readnone {
reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64)
}
define i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone {
reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64)
}
reduce_equal(8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store
define void @__masked_store_blend_i64(<8 x i64>* nocapture, <8 x i64>,
<8 x MASK> %mask) nounwind
alwaysinline {
%mask_as_i1 = trunc <8 x MASK> %mask to <8 x i1>
%old = load <8 x i64>* %0, align 4
%blend = select <8 x i1> %mask_as_i1, <8 x i64> %1, <8 x i64> %old
store <8 x i64> %blend, <8 x i64>* %0, align 4
ret void
}
define void @__masked_store_blend_i32(<8 x i32>* nocapture, <8 x i32>,
<8 x MASK> %mask) nounwind alwaysinline {
%mask_as_i1 = trunc <8 x MASK> %mask to <8 x i1>
%old = load <8 x i32>* %0, align 4
%blend = select <8 x i1> %mask_as_i1, <8 x i32> %1, <8 x i32> %old
store <8 x i32> %blend, <8 x i32>* %0, align 4
ret void
}
define void @__masked_store_blend_i16(<8 x i16>* nocapture, <8 x i16>,
<8 x MASK> %mask) nounwind alwaysinline {
%mask_as_i1 = trunc <8 x MASK> %mask to <8 x i1>
%old = load <8 x i16>* %0, align 4
%blend = select <8 x i1> %mask_as_i1, <8 x i16> %1, <8 x i16> %old
store <8 x i16> %blend, <8 x i16>* %0, align 4
ret void
}
define void @__masked_store_blend_i8(<8 x i8>* nocapture, <8 x i8>,
<8 x MASK> %mask) nounwind alwaysinline {
%mask_as_i1 = trunc <8 x MASK> %mask to <8 x i1>
%old = load <8 x i8>* %0, align 4
%blend = select <8 x i1> %mask_as_i1, <8 x i8> %1, <8 x i8> %old
store <8 x i8> %blend, <8 x i8>* %0, align 4
ret void
}
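Each blend store above is a read-modify-write: load the old vector, select lane-wise on the mask, and store the whole blended vector back. The equivalent scalar logic, as an illustrative Python sketch:

    def masked_store_blend(mem, new, mask):
        old = mem[:]                             # %old   = load
        mem[:] = [n if m else o                  # %blend = select on the mask
                  for n, o, m in zip(new, old, mask)]

    buf = [0, 0, 0, 0]
    masked_store_blend(buf, [10, 20, 30, 40], [1, 0, 1, 0])
    print buf   # [10, 0, 30, 0]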
gen_masked_store(i8)
gen_masked_store(i16)
gen_masked_store(i32)
gen_masked_store(i64)
masked_store_float_double()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts
masked_load(i8, 1)
masked_load(i16, 2)
masked_load(i32, 4)
masked_load(float, 4)
masked_load(i64, 8)
masked_load(double, 8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter
; define these with the macros from stdlib.m4
gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)
gen_scatter(i8)
gen_scatter(i16)
gen_scatter(i32)
gen_scatter(float)
gen_scatter(i64)
gen_scatter(double)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int8/int16 builtins
declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8>, <16 x i8>) nounwind readnone
define <8 x i8> @__avg_up_uint8(<8 x i8>, <8 x i8>) {
%v0 = shufflevector <8 x i8> %0, <8 x i8> undef,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 undef, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef>
%v1 = shufflevector <8 x i8> %1, <8 x i8> undef,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 undef, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef>
%r16 = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %v0, <16 x i8> %v1)
%r = shufflevector <16 x i8> %r16, <16 x i8> undef,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i8> %r
}
declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone
define <8 x i16> @__avg_up_uint16(<8 x i16>, <8 x i16>) {
%r = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %0, <8 x i16> %1)
ret <8 x i16> %r
}
define_avg_up_int8()
define_avg_up_int16()
define_down_avgs()

builtins/target-sse4-8.ll (new file)
@@ -0,0 +1,492 @@
;; Copyright (c) 2013, Google, Inc.
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;; * Redistributions of source code must retain the above copyright
;; notice, this list of conditions and the following disclaimer.
;;
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;;
;; * Neither the name of Google, Inc. nor the names of its
;; contributors may be used to endorse or promote products derived from
;; this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Define common 16-wide stuff
define(`WIDTH',`16')
define(`MASK',`i8')
include(`util.m4')
stdlib_core()
packed_load_and_store()
scans()
int64minmax()
include(`target-sse4-common.ll')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines
declare float @__half_to_float_uniform(i16 %v) nounwind readnone
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp
declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
define <WIDTH x float> @__rcp_varying_float(<WIDTH x float>) nounwind readonly alwaysinline {
unary4to16(call, float, @llvm.x86.sse.rcp.ps, %0)
; do one N-R iteration to improve precision
; float iv = __rcp_v(v);
; return iv * (2. - v * iv);
%v_iv = fmul <16 x float> %0, %call
%two_minus = fsub <16 x float> <float 2., float 2., float 2., float 2.,
float 2., float 2., float 2., float 2.,
float 2., float 2., float 2., float 2.,
float 2., float 2., float 2., float 2.>, %v_iv
%iv_mul = fmul <16 x float> %call, %two_minus
ret <16 x float> %iv_mul
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; rsqrt
declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
define <16 x float> @__rsqrt_varying_float(<16 x float> %v) nounwind readonly alwaysinline {
; float is = __rsqrt_v(v);
unary4to16(is, float, @llvm.x86.sse.rsqrt.ps, %v)
; Newton-Raphson iteration to improve precision
; return 0.5 * is * (3. - (v * is) * is);
%v_is = fmul <16 x float> %v, %is
%v_is_is = fmul <16 x float> %v_is, %is
%three_sub = fsub <16 x float> <float 3., float 3., float 3., float 3.,
float 3., float 3., float 3., float 3.,
float 3., float 3., float 3., float 3.,
float 3., float 3., float 3., float 3.>, %v_is_is
%is_mul = fmul <16 x float> %is, %three_sub
%half_scale = fmul <16 x float> <float 0.5, float 0.5, float 0.5, float 0.5,
float 0.5, float 0.5, float 0.5, float 0.5,
float 0.5, float 0.5, float 0.5, float 0.5,
float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
ret <16 x float> %half_scale
}
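Both refinements above are single Newton-Raphson steps: iv*(2 - v*iv) converges toward 1/v and 0.5*is*(3 - v*is*is) toward 1/sqrt(v), each roughly squaring the relative error of the hardware estimate. A quick numeric check in illustrative Python (the starting estimates are made-up stand-ins for rcpps/rsqrtps output):

    v = 3.0
    iv = 0.333                                   # stand-in rcpps estimate of 1/3
    print abs(1.0 / v - iv)                      # ~3.3e-04 initial error
    print abs(1.0 / v - iv * (2.0 - v * iv))     # ~3.3e-07: error roughly squared

    w = 2.0
    is_ = 0.7071                                 # stand-in rsqrtps estimate of 1/sqrt(2)
    print abs(w ** -0.5 - 0.5 * is_ * (3.0 - w * is_ * is_))   # similarly tiny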
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; sqrt
declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
define <16 x float> @__sqrt_varying_float(<16 x float>) nounwind readonly alwaysinline {
unary4to16(call, float, @llvm.x86.sse.sqrt.ps, %0)
ret <16 x float> %call
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision sqrt
declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
define <16 x double> @__sqrt_varying_double(<16 x double>) nounwind
alwaysinline {
unary2to16(ret, double, @llvm.x86.sse2.sqrt.pd, %0)
ret <16 x double> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding floats
declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone
define <16 x float> @__round_varying_float(<16 x float>) nounwind readonly alwaysinline {
; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
round4to16(%0, 8)
}
define <16 x float> @__floor_varying_float(<16 x float>) nounwind readonly alwaysinline {
; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
round4to16(%0, 9)
}
define <16 x float> @__ceil_varying_float(<16 x float>) nounwind readonly alwaysinline {
; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
round4to16(%0, 10)
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding doubles
declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone
define <16 x double> @__round_varying_double(<16 x double>) nounwind readonly alwaysinline {
; XXXround2to4double(%0, 8)
; FIXME: need round2to16double in util.m4...
ret <16 x double> undef
}
define <16 x double> @__floor_varying_double(<16 x double>) nounwind readonly alwaysinline {
; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
; XXXround2to4double(%0, 9)
ret <16 x double> undef
}
define <16 x double> @__ceil_varying_double(<16 x double>) nounwind readonly alwaysinline {
; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
; XXXround2to4double(%0, 10)
ret <16 x double> undef
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float min/max
declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
define <16 x float> @__max_varying_float(<16 x float>, <16 x float>) nounwind readonly alwaysinline {
binary4to16(call, float, @llvm.x86.sse.max.ps, %0, %1)
ret <16 x float> %call
}
define <16 x float> @__min_varying_float(<16 x float>, <16 x float>) nounwind readonly alwaysinline {
binary4to16(call, float, @llvm.x86.sse.min.ps, %0, %1)
ret <16 x float> %call
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int32 min/max
define <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary4to16(call, i32, @llvm.x86.sse41.pminsd, %0, %1)
ret <16 x i32> %call
}
define <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary4to16(call, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
ret <16 x i32> %call
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; unsigned int min/max
define <16 x i32> @__min_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary4to16(call, i32, @llvm.x86.sse41.pminud, %0, %1)
ret <16 x i32> %call
}
define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary4to16(call, i32, @llvm.x86.sse41.pmaxud, %0, %1)
ret <16 x i32> %call
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision min/max
declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
define <16 x double> @__min_varying_double(<16 x double>, <16 x double>) nounwind readnone {
binary2to16(ret, double, @llvm.x86.sse2.min.pd, %0, %1)
ret <16 x double> %ret
}
define <16 x double> @__max_varying_double(<16 x double>, <16 x double>) nounwind readnone {
binary2to16(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
ret <16 x double> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; svml
; FIXME
include(`svml.m4')
svml_stubs(float,f,WIDTH)
svml_stubs(double,d,WIDTH)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; horizontal ops / reductions
declare i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8>) nounwind readnone
define i64 @__movmsk(<16 x i8>) nounwind readnone alwaysinline {
%m = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %0)
%m64 = zext i32 %m to i64
ret i64 %m64
}
define i1 @__any(<16 x i8>) nounwind readnone alwaysinline {
%m = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %0)
%mne = icmp ne i32 %m, 0
ret i1 %mne
}
define i1 @__all(<16 x i8>) nounwind readnone alwaysinline {
%m = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %0)
%meq = icmp eq i32 %m, ALL_ON_MASK
ret i1 %meq
}
define i1 @__none(<16 x i8>) nounwind readnone alwaysinline {
%m = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %0)
%meq = icmp eq i32 %m, 0
ret i1 %meq
}
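pmovmskb gathers the top bit of each byte lane into an integer, so with an i8 mask the any/all/none tests reduce to integer compares against 0 and ALL_ON_MASK. A scalar Python model (illustrative only):

    def movmsk(lanes):
        # collect the sign bit of each of the 16 i8 lanes into bit i
        m = 0
        for i, b in enumerate(lanes):
            m |= ((b >> 7) & 1) << i
        return m

    ALL_ON_MASK = (1 << 16) - 1
    m = movmsk([0xff] * 16)
    print m != 0, m == ALL_ON_MASK, m == 0   # __any, __all, __none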
declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone
define i16 @__reduce_add_int8(<16 x i8>) nounwind readnone alwaysinline {
%rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %0,
<16 x i8> zeroinitializer)
%r0 = extractelement <2 x i64> %rv, i32 0
%r1 = extractelement <2 x i64> %rv, i32 1
%r = add i64 %r0, %r1
%r16 = trunc i64 %r to i16
ret i16 %r16
}
define internal <16 x i16> @__add_varying_i16(<16 x i16>,
<16 x i16>) nounwind readnone alwaysinline {
%r = add <16 x i16> %0, %1
ret <16 x i16> %r
}
define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline {
%r = add i16 %0, %1
ret i16 %r
}
define i16 @__reduce_add_int16(<16 x i16>) nounwind readnone alwaysinline {
reduce16(i16, @__add_varying_i16, @__add_uniform_i16)
}
define internal <16 x float> @__add_varying_float(<16 x float>, <16 x float>) {
%r = fadd <16 x float> %0, %1
ret <16 x float> %r
}
define internal float @__add_uniform_float(float, float) {
%r = fadd float %0, %1
ret float %r
}
define float @__reduce_add_float(<16 x float>) nounwind readonly alwaysinline {
reduce16(float, @__add_varying_float, @__add_uniform_float)
}
define float @__reduce_min_float(<16 x float>) nounwind readnone {
reduce16(float, @__min_varying_float, @__min_uniform_float)
}
define float @__reduce_max_float(<16 x float>) nounwind readnone {
reduce16(float, @__max_varying_float, @__max_uniform_float)
}
define internal <16 x i32> @__add_varying_int32(<16 x i32>, <16 x i32>) {
%r = add <16 x i32> %0, %1
ret <16 x i32> %r
}
define internal i32 @__add_uniform_int32(i32, i32) {
%r = add i32 %0, %1
ret i32 %r
}
define i32 @__reduce_add_int32(<16 x i32>) nounwind readnone {
reduce16(i32, @__add_varying_int32, @__add_uniform_int32)
}
define i32 @__reduce_min_int32(<16 x i32>) nounwind readnone {
reduce16(i32, @__min_varying_int32, @__min_uniform_int32)
}
define i32 @__reduce_max_int32(<16 x i32>) nounwind readnone {
reduce16(i32, @__max_varying_int32, @__max_uniform_int32)
}
define i32 @__reduce_min_uint32(<16 x i32>) nounwind readnone {
reduce16(i32, @__min_varying_uint32, @__min_uniform_uint32)
}
define i32 @__reduce_max_uint32(<16 x i32>) nounwind readnone {
reduce16(i32, @__max_varying_uint32, @__max_uniform_uint32)
}
define internal <16 x double> @__add_varying_double(<16 x double>, <16 x double>) {
%r = fadd <16 x double> %0, %1
ret <16 x double> %r
}
define internal double @__add_uniform_double(double, double) {
%r = fadd double %0, %1
ret double %r
}
define double @__reduce_add_double(<16 x double>) nounwind readnone {
reduce16(double, @__add_varying_double, @__add_uniform_double)
}
define double @__reduce_min_double(<16 x double>) nounwind readnone {
reduce16(double, @__min_varying_double, @__min_uniform_double)
}
define double @__reduce_max_double(<16 x double>) nounwind readnone {
reduce16(double, @__max_varying_double, @__max_uniform_double)
}
define internal <16 x i64> @__add_varying_int64(<16 x i64>, <16 x i64>) {
%r = add <16 x i64> %0, %1
ret <16 x i64> %r
}
define internal i64 @__add_uniform_int64(i64, i64) {
%r = add i64 %0, %1
ret i64 %r
}
define i64 @__reduce_add_int64(<16 x i64>) nounwind readnone {
reduce16(i64, @__add_varying_int64, @__add_uniform_int64)
}
define i64 @__reduce_min_int64(<16 x i64>) nounwind readnone {
reduce16(i64, @__min_varying_int64, @__min_uniform_int64)
}
define i64 @__reduce_max_int64(<16 x i64>) nounwind readnone {
reduce16(i64, @__max_varying_int64, @__max_uniform_int64)
}
define i64 @__reduce_min_uint64(<16 x i64>) nounwind readnone {
reduce16(i64, @__min_varying_uint64, @__min_uniform_uint64)
}
define i64 @__reduce_max_uint64(<16 x i64>) nounwind readnone {
reduce16(i64, @__max_varying_uint64, @__max_uniform_uint64)
}
reduce_equal(16)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store
define void @__masked_store_blend_i64(<16 x i64>* nocapture, <16 x i64>,
<16 x MASK> %mask) nounwind
alwaysinline {
%mask_as_i1 = trunc <16 x MASK> %mask to <16 x i1>
%old = load <16 x i64>* %0, align 4
%blend = select <16 x i1> %mask_as_i1, <16 x i64> %1, <16 x i64> %old
store <16 x i64> %blend, <16 x i64>* %0, align 4
ret void
}
define void @__masked_store_blend_i32(<16 x i32>* nocapture, <16 x i32>,
<16 x MASK> %mask) nounwind alwaysinline {
%mask_as_i1 = trunc <16 x MASK> %mask to <16 x i1>
%old = load <16 x i32>* %0, align 4
%blend = select <16 x i1> %mask_as_i1, <16 x i32> %1, <16 x i32> %old
store <16 x i32> %blend, <16 x i32>* %0, align 4
ret void
}
define void @__masked_store_blend_i16(<16 x i16>* nocapture, <16 x i16>,
<16 x MASK> %mask) nounwind alwaysinline {
%mask_as_i1 = trunc <16 x MASK> %mask to <16 x i1>
%old = load <16 x i16>* %0, align 4
%blend = select <16 x i1> %mask_as_i1, <16 x i16> %1, <16 x i16> %old
store <16 x i16> %blend, <16 x i16>* %0, align 4
ret void
}
declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
define void @__masked_store_blend_i8(<16 x i8>* nocapture, <16 x i8>,
<16 x MASK> %mask) nounwind alwaysinline {
%old = load <16 x i8>* %0, align 4
%blend = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %old, <16 x i8> %1,
<16 x i8> %mask)
store <16 x i8> %blend, <16 x i8>* %0, align 4
ret void
}
gen_masked_store(i8)
gen_masked_store(i16)
gen_masked_store(i32)
gen_masked_store(i64)
masked_store_float_double()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts
masked_load(i8, 1)
masked_load(i16, 2)
masked_load(i32, 4)
masked_load(float, 4)
masked_load(i64, 8)
masked_load(double, 8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter
; define these with the macros from stdlib.m4
gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)
gen_scatter(i8)
gen_scatter(i16)
gen_scatter(i32)
gen_scatter(float)
gen_scatter(i64)
gen_scatter(double)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int8/int16 builtins
declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8>, <16 x i8>) nounwind readnone
define <16 x i8> @__avg_up_uint8(<16 x i8>, <16 x i8>) nounwind readnone {
%r = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %0, <16 x i8> %1)
ret <16 x i8> %r
}
declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i16> @__avg_up_uint16(<16 x i16>, <16 x i16>) nounwind readnone {
v16tov8(i16, %0, %a0, %b0)
v16tov8(i16, %1, %a1, %b1)
%r0 = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %a0, <8 x i16> %a1)
%r1 = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %b0, <8 x i16> %b1)
v8tov16(i16, %r0, %r1, %r)
ret <16 x i16> %r
}
define_avg_up_int8()
define_avg_up_int16()
define_down_avgs()
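pavgb/pavgw compute the unsigned average rounded up, (a + b + 1) >> 1, in a wider intermediate so the add cannot overflow; that is why the 16-wide i16 case above only needs to split into two 8-wide pavgw calls. A scalar model (illustrative):

    def avg_up(a, b):
        # unsigned average, ties rounding up: matches pavgb/pavgw
        return (a + b + 1) >> 1

    print avg_up(10, 13)   # 12, not 11: the .5 case rounds up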


@@ -33,6 +33,7 @@ ctlztz()
define_prefetches()
define_shuffles()
aossoa()
rdrand_decls()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding floats


@@ -1,4 +1,4 @@
;; Copyright (c) 2010-2011, Intel Corporation
;; Copyright (c) 2010-2012, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
@@ -47,6 +47,14 @@ int64minmax()
include(`target-sse4-common.ll')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines
declare float @__half_to_float_uniform(i16 %v) nounwind readnone
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp
@@ -97,87 +105,14 @@ define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysin
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; svml stuff
declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone
declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone
include(`svml.m4')
;; single precision
svml_declare(float,f4,4)
svml_define_x(float,f4,4,f,8)
define <8 x float> @__svml_sin(<8 x float>) nounwind readnone alwaysinline {
unary4to8(ret, float, @__svml_sinf4, %0)
ret <8 x float> %ret
}
define <8 x float> @__svml_cos(<8 x float>) nounwind readnone alwaysinline {
unary4to8(ret, float, @__svml_cosf4, %0)
ret <8 x float> %ret
}
define void @__svml_sincos(<8 x float>, <8 x float> *,
<8 x float> *) nounwind readnone alwaysinline {
; call svml_sincosf4 two times with the two 4-wide sub-vectors
%a = shufflevector <8 x float> %0, <8 x float> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%b = shufflevector <8 x float> %0, <8 x float> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
%cospa = alloca <4 x float>
%sa = call <4 x float> @__svml_sincosf4(<4 x float> * %cospa, <4 x float> %a)
%cospb = alloca <4 x float>
%sb = call <4 x float> @__svml_sincosf4(<4 x float> * %cospb, <4 x float> %b)
%sin = shufflevector <4 x float> %sa, <4 x float> %sb,
<8 x i32> <i32 0, i32 1, i32 2, i32 3,
i32 4, i32 5, i32 6, i32 7>
store <8 x float> %sin, <8 x float> * %1
%cosa = load <4 x float> * %cospa
%cosb = load <4 x float> * %cospb
%cos = shufflevector <4 x float> %cosa, <4 x float> %cosb,
<8 x i32> <i32 0, i32 1, i32 2, i32 3,
i32 4, i32 5, i32 6, i32 7>
store <8 x float> %cos, <8 x float> * %2
ret void
}
define <8 x float> @__svml_tan(<8 x float>) nounwind readnone alwaysinline {
unary4to8(ret, float, @__svml_tanf4, %0)
ret <8 x float> %ret
}
define <8 x float> @__svml_atan(<8 x float>) nounwind readnone alwaysinline {
unary4to8(ret, float, @__svml_atanf4, %0)
ret <8 x float> %ret
}
define <8 x float> @__svml_atan2(<8 x float>,
<8 x float>) nounwind readnone alwaysinline {
binary4to8(ret, float, @__svml_atan2f4, %0, %1)
ret <8 x float> %ret
}
define <8 x float> @__svml_exp(<8 x float>) nounwind readnone alwaysinline {
unary4to8(ret, float, @__svml_expf4, %0)
ret <8 x float> %ret
}
define <8 x float> @__svml_log(<8 x float>) nounwind readnone alwaysinline {
unary4to8(ret, float, @__svml_logf4, %0)
ret <8 x float> %ret
}
define <8 x float> @__svml_pow(<8 x float>,
<8 x float>) nounwind readnone alwaysinline {
binary4to8(ret, float, @__svml_powf4, %0, %1)
ret <8 x float> %ret
}
;; double precision
svml_declare(double,2,2)
svml_define_x(double,2,2,d,8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -213,13 +148,13 @@ define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly al
; unsigned int min/max
define <8 x i32> @__min_varying_uint32(<8 x i32>,
<8 x i32>) nounwind readonly alwaysinline {
binary4to8(call, i32, @llvm.x86.sse41.pminud, %0, %1)
ret <8 x i32> %call
}
define <8 x i32> @__max_varying_uint32(<8 x i32>,
<8 x i32>) nounwind readonly alwaysinline {
binary4to8(call, i32, @llvm.x86.sse41.pmaxud, %0, %1)
ret <8 x i32> %call
}
@@ -229,7 +164,7 @@ define <8 x i32> @__max_varying_uint32(<8 x i32>,
declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
define i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
define i64 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
; first do two 4-wide movmsk calls
%floatmask = bitcast <8 x i32> %0 to <8 x float>
%m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
@@ -243,7 +178,92 @@ define i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
; of the second one
%v1s = shl i32 %v1, 4
%v = or i32 %v0, %v1s
ret i32 %v
%v64 = zext i32 %v to i64
ret i64 %v64
}
define i1 @__any(<8 x i32>) nounwind readnone alwaysinline {
; first do two 4-wide movmsk calls
%floatmask = bitcast <8 x i32> %0 to <8 x float>
%m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v0 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m0) nounwind readnone
%m1 = shufflevector <8 x float> %floatmask, <8 x float> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
%v1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m1) nounwind readnone
; and shift the first one over by 4 before ORing it with the value
; of the second one
%v1s = shl i32 %v1, 4
%v = or i32 %v0, %v1s
%cmp = icmp ne i32 %v, 0
ret i1 %cmp
}
define i1 @__all(<8 x i32>) nounwind readnone alwaysinline {
; first do two 4-wide movmsk calls
%floatmask = bitcast <8 x i32> %0 to <8 x float>
%m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v0 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m0) nounwind readnone
%m1 = shufflevector <8 x float> %floatmask, <8 x float> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
%v1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m1) nounwind readnone
; and shift the first one over by 4 before ORing it with the value
; of the second one
%v1s = shl i32 %v1, 4
%v = or i32 %v0, %v1s
%cmp = icmp eq i32 %v, 255
ret i1 %cmp
}
define i1 @__none(<8 x i32>) nounwind readnone alwaysinline {
; first do two 4-wide movmsk calls
%floatmask = bitcast <8 x i32> %0 to <8 x float>
%m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v0 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m0) nounwind readnone
%m1 = shufflevector <8 x float> %floatmask, <8 x float> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
%v1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m1) nounwind readnone
; and shift the first one over by 4 before ORing it with the value
; of the second one
%v1s = shl i32 %v1, 4
%v = or i32 %v0, %v1s
%cmp = icmp eq i32 %v, 0
ret i1 %cmp
}
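Without an 8-wide movmsk, the mask test above is two movmskps calls over the two 4-float halves, fused by shifting the high nibble left by 4 and ORing before the final compare. Condensed into illustrative Python:

    def movmsk8(lo4, hi4):
        # lo4, hi4: 4-bit movmskps results for lanes 0-3 and 4-7
        return lo4 | (hi4 << 4)

    m = movmsk8(0xf, 0xf)
    print m == 255   # __all: all 8 lanes on
    print m != 0     # __any
    print m == 0     # __none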
declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone
define i16 @__reduce_add_int8(<8 x i8>) nounwind readnone alwaysinline {
%wide8 = shufflevector <8 x i8> %0, <8 x i8> zeroinitializer,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
%rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8,
<16 x i8> zeroinitializer)
%r0 = extractelement <2 x i64> %rv, i32 0
%r1 = extractelement <2 x i64> %rv, i32 1
%r = add i64 %r0, %r1
%r16 = trunc i64 %r to i16
ret i16 %r16
}
define internal <8 x i16> @__add_varying_i16(<8 x i16>,
<8 x i16>) nounwind readnone alwaysinline {
%r = add <8 x i16> %0, %1
ret <8 x i16> %r
}
define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline {
%r = add i16 %0, %1
ret i16 %r
}
define i16 @__reduce_add_int16(<8 x i16>) nounwind readnone alwaysinline {
reduce8(i16, @__add_varying_i16, @__add_uniform_i16)
}
define float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline {
@@ -279,11 +299,6 @@ define i32 @__reduce_max_int32(<8 x i32>) nounwind readnone alwaysinline {
reduce8by4(i32, @llvm.x86.sse41.pmaxsd, @__max_uniform_int32)
}
define i32 @__reduce_add_uint32(<8 x i32> %v) nounwind readnone alwaysinline {
%r = call i32 @__reduce_add_int32(<8 x i32> %v)
ret i32 %r
}
define i32 @__reduce_min_uint32(<8 x i32>) nounwind readnone alwaysinline {
reduce8by4(i32, @llvm.x86.sse41.pminud, @__min_uniform_uint32)
}
@@ -316,7 +331,7 @@ define double @__reduce_max_double(<8 x double>) nounwind readnone {
}
define <4 x i64> @__add_varying_int64(<4 x i64>,
<4 x i64>) nounwind readnone alwaysinline {
%r = add <4 x i64> %0, %1
ret <4 x i64> %r
}
@@ -351,28 +366,30 @@ reduce_equal(8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts
load_and_broadcast(8, i8, 8)
load_and_broadcast(8, i16, 16)
load_and_broadcast(8, i32, 32)
load_and_broadcast(8, i64, 64)
masked_load(8, i8, 8, 1)
masked_load(8, i16, 16, 2)
masked_load(8, i32, 32, 4)
masked_load(8, i64, 64, 8)
masked_load(i8, 1)
masked_load(i16, 2)
masked_load(i32, 4)
masked_load(float, 4)
masked_load(i64, 8)
masked_load(double, 8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter
gen_gather(8, i8)
gen_gather(8, i16)
gen_gather(8, i32)
gen_gather(8, i64)
gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)
gen_scatter(8, i8)
gen_scatter(8, i16)
gen_scatter(8, i32)
gen_scatter(8, i64)
gen_scatter(i8)
gen_scatter(i16)
gen_scatter(i32)
gen_scatter(float)
gen_scatter(i64)
gen_scatter(double)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float rounding
@@ -435,18 +452,18 @@ define float @__reduce_add_float(<8 x float>) nounwind readonly alwaysinline {
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store
gen_masked_store(8, i8, 8)
gen_masked_store(8, i16, 16)
gen_masked_store(8, i32, 32)
gen_masked_store(8, i64, 64)
gen_masked_store(i8)
gen_masked_store(i16)
gen_masked_store(i32)
gen_masked_store(i64)
masked_store_blend_8_16_by_8()
declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>,
<4 x float>) nounwind readnone
define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
<8 x i32> %mask) nounwind alwaysinline {
define void @__masked_store_blend_i32(<8 x i32>* nocapture, <8 x i32>,
<8 x i32> %mask) nounwind alwaysinline {
; do two 4-wide blends with blendvps
%mask_as_float = bitcast <8 x i32> %mask to <8 x float>
%mask_a = shufflevector <8 x float> %mask_as_float, <8 x float> undef,
@@ -475,8 +492,8 @@ define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
ret void
}
define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
<8 x i32> %mask) nounwind alwaysinline {
define void @__masked_store_blend_i64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
<8 x i32> %mask) nounwind alwaysinline {
; implement this as 4 blends of <4 x i32>s, which are actually bitcast
; <2 x i64>s...
@@ -542,6 +559,7 @@ define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
ret void
}
masked_store_float_double()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision sqrt
@@ -568,3 +586,9 @@ define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind r
binary2to8(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
ret <8 x double> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int8/int16 builtins
define_avgs()


@@ -1,4 +1,4 @@
;; Copyright (c) 2010-2011, Intel Corporation
;; Copyright (c) 2010-2012, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
@@ -44,6 +44,14 @@ int64minmax()
include(`target-sse4-common.ll')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines
declare float @__half_to_float_uniform(i16 %v) nounwind readnone
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp
@@ -201,72 +209,76 @@ define <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind r
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; svml stuff
declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone
declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone
include(`svml.m4')
;; single precision
svml_declare(float,f4,4)
svml_define(float,f4,4,f)
define <4 x float> @__svml_sin(<4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_sinf4(<4 x float> %0)
ret <4 x float> %ret
}
define <4 x float> @__svml_cos(<4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_cosf4(<4 x float> %0)
ret <4 x float> %ret
}
define void @__svml_sincos(<4 x float>, <4 x float> *, <4 x float> *) nounwind readnone alwaysinline {
%s = call <4 x float> @__svml_sincosf4(<4 x float> * %2, <4 x float> %0)
store <4 x float> %s, <4 x float> * %1
ret void
}
define <4 x float> @__svml_tan(<4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_tanf4(<4 x float> %0)
ret <4 x float> %ret
}
define <4 x float> @__svml_atan(<4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_atanf4(<4 x float> %0)
ret <4 x float> %ret
}
define <4 x float> @__svml_atan2(<4 x float>, <4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_atan2f4(<4 x float> %0, <4 x float> %1)
ret <4 x float> %ret
}
define <4 x float> @__svml_exp(<4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_expf4(<4 x float> %0)
ret <4 x float> %ret
}
define <4 x float> @__svml_log(<4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_logf4(<4 x float> %0)
ret <4 x float> %ret
}
define <4 x float> @__svml_pow(<4 x float>, <4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_powf4(<4 x float> %0, <4 x float> %1)
ret <4 x float> %ret
}
;; double precision
svml_declare(double,2,2)
svml_define_x(double,2,2,d,4)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; horizontal ops / reductions
declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
define i32 @__movmsk(<4 x i32>) nounwind readnone alwaysinline {
define i64 @__movmsk(<4 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <4 x i32> %0 to <4 x float>
%v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
ret i32 %v
%v64 = zext i32 %v to i64
ret i64 %v64
}
define i1 @__any(<4 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <4 x i32> %0 to <4 x float>
%v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
%cmp = icmp ne i32 %v, 0
ret i1 %cmp
}
define i1 @__all(<4 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <4 x i32> %0 to <4 x float>
%v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
%cmp = icmp eq i32 %v, 15
ret i1 %cmp
}
define i1 @__none(<4 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <4 x i32> %0 to <4 x float>
%v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
%cmp = icmp eq i32 %v, 0
ret i1 %cmp
}
declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone
define i16 @__reduce_add_int8(<4 x i8>) nounwind readnone alwaysinline {
%wide8 = shufflevector <4 x i8> %0, <4 x i8> zeroinitializer,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4,
i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
%rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8,
<16 x i8> zeroinitializer)
%r0 = extractelement <2 x i64> %rv, i32 0
%r1 = extractelement <2 x i64> %rv, i32 1
%r = add i64 %r0, %r1
%r16 = trunc i64 %r to i16
ret i16 %r16
}
define internal <4 x i16> @__add_varying_i16(<4 x i16>,
<4 x i16>) nounwind readnone alwaysinline {
%r = add <4 x i16> %0, %1
ret <4 x i16> %r
}
define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline {
%r = add i16 %0, %1
ret i16 %r
}
define i16 @__reduce_add_int16(<4 x i16>) nounwind readnone alwaysinline {
reduce4(i16, @__add_varying_i16, @__add_uniform_i16)
}
declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone
@@ -304,18 +316,13 @@ define i32 @__reduce_max_int32(<4 x i32>) nounwind readnone {
reduce4(i32, @__max_varying_int32, @__max_uniform_int32)
}
define i32 @__reduce_add_uint32(<4 x i32> %v) nounwind readnone {
%r = call i32 @__reduce_add_int32(<4 x i32> %v)
ret i32 %r
}
define i32 @__reduce_min_uint32(<4 x i32>) nounwind readnone {
reduce4(i32, @__min_varying_uint32, @__min_uniform_uint32)
}
define i32 @__reduce_max_uint32(<4 x i32>) nounwind readnone {
reduce4(i32, @__max_varying_uint32, @__max_uniform_uint32)
}
define double @__reduce_add_double(<4 x double>) nounwind readnone {
@@ -375,8 +382,8 @@ declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>,
<4 x float>) nounwind readnone
define void @__masked_store_blend_32(<4 x i32>* nocapture, <4 x i32>,
<4 x i32> %mask) nounwind alwaysinline {
define void @__masked_store_blend_i32(<4 x i32>* nocapture, <4 x i32>,
<4 x i32> %mask) nounwind alwaysinline {
%mask_as_float = bitcast <4 x i32> %mask to <4 x float>
%oldValue = load <4 x i32>* %0, align 4
%oldAsFloat = bitcast <4 x i32> %oldValue to <4 x float>
@@ -390,8 +397,8 @@ define void @__masked_store_blend_32(<4 x i32>* nocapture, <4 x i32>,
}
define void @__masked_store_blend_64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
<4 x i32> %i32mask) nounwind alwaysinline {
define void @__masked_store_blend_i64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
<4 x i32> %i32mask) nounwind alwaysinline {
%oldValue = load <4 x i64>* %ptr, align 8
%mask = bitcast <4 x i32> %i32mask to <4 x float>
@@ -442,35 +449,45 @@ define void @__masked_store_blend_64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
masked_store_blend_8_16_by_4()
gen_masked_store(4, i8, 8)
gen_masked_store(4, i16, 16)
gen_masked_store(4, i32, 32)
gen_masked_store(4, i64, 64)
gen_masked_store(i8)
gen_masked_store(i16)
gen_masked_store(i32)
gen_masked_store(i64)
masked_store_float_double()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts
load_and_broadcast(4, i8, 8)
load_and_broadcast(4, i16, 16)
load_and_broadcast(4, i32, 32)
load_and_broadcast(4, i64, 64)
masked_load(4, i8, 8, 1)
masked_load(4, i16, 16, 2)
masked_load(4, i32, 32, 4)
masked_load(4, i64, 64, 8)
masked_load(i8, 1)
masked_load(i16, 2)
masked_load(i32, 4)
masked_load(float, 4)
masked_load(i64, 8)
masked_load(double, 8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter
; define these with the macros from stdlib.m4
gen_gather(4, i8)
gen_gather(4, i16)
gen_gather(4, i32)
gen_gather(4, i64)
gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)
gen_scatter(i8)
gen_scatter(i16)
gen_scatter(i32)
gen_scatter(float)
gen_scatter(i64)
gen_scatter(double)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int8/int16 builtins
define_avgs()
gen_scatter(4, i8)
gen_scatter(4, i16)
gen_scatter(4, i32)
gen_scatter(4, i64)

(two file diffs suppressed because they are too large)

check_env.py (new executable file)

@@ -0,0 +1,102 @@
#!/usr/bin/python
#
# Copyright (c) 2013, Intel Corporation
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# * Neither the name of Intel Corporation nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
# IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
# OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# Author: Filippov Ilia
import common
import sys
import os
import string
print_debug = common.print_debug
error = common.error
take_lines = common.take_lines
exists = [False, False, False, False, False, False, False, False]
names = ["m4", "bison", "flex", "sde", "ispc", "clang", "gcc", "icc"]
PATH_dir = string.split(os.getenv("PATH"), os.pathsep)
for counter in PATH_dir:
for i in range(0,8):
if os.path.exists(counter + os.sep + names[i]):
exists[i] = True
print_debug("=== in PATH: ===\n", False, "")
print_debug("Tools:\n", False, "")
for i in range(0,3):
if exists[i]:
print_debug(take_lines(names[i] + " --version", "first"), False, "")
else:
error("you don't have " + names[i], 0)
if exists[0] and exists[1] and exists[2]:
if common.check_tools(2):
print_debug("Tools' versions are ok\n", False, "")
print_debug("\nSDE:\n", False, "")
if exists[3]:
print_debug(take_lines(names[3] + " --version", "first"), False, "")
else:
error("you don't have " + names[3], 2)
print_debug("\nISPC:\n", False, "")
if exists[4]:
print_debug(take_lines(names[4] + " --version", "first"), False, "")
else:
error("you don't have " + names[4], 2)
print_debug("\nC/C++ compilers:\n", False, "")
for i in range(5,8):
if exists[i]:
print_debug(take_lines(names[i] + " --version", "first"), False, "")
else:
error("you don't have " + names[i], 2)
print_debug("\n=== in ISPC specific environment variables: ===\n", False, "")
if os.environ.get("LLVM_HOME") == None:
error("you have no LLVM_HOME", 2)
else:
print_debug("Your LLVM_HOME:" + os.environ.get("LLVM_HOME") + "\n", False, "")
if os.environ.get("ISPC_HOME") == None:
error("you have no ISPC_HOME", 2)
else:
print_debug("Your ISPC_HOME:" + os.environ.get("ISPC_HOME") + "\n", False, "")
if os.path.exists(os.environ.get("ISPC_HOME") + os.sep + "ispc"):
print_debug("You have ISPC in your ISPC_HOME: " +
take_lines(os.environ.get("ISPC_HOME") + os.sep + "ispc" + " --version", "first"), False, "")
else:
error("you don't have ISPC in your ISPC_HOME", 2)
if os.environ.get("SDE_HOME") == None:
error("You have no SDE_HOME", 2)
else:
print_debug("Your SDE_HOME:" + os.environ.get("SDE_HOME") + "\n", False, "")
if os.path.exists(os.environ.get("SDE_HOME") + os.sep + "sde"):
print_debug("You have sde in your SDE_HOME: " +
take_lines(os.environ.get("SDE_HOME") + os.sep + "sde" + " --version", "first"), False, "")
else:
error("you don't have any SDE in your SDE_HOME", 2)

common.py (new file)

@@ -0,0 +1,123 @@
#!/usr/bin/python
#
# Copyright (c) 2013, Intel Corporation
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# * Neither the name of Intel Corporation nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
# IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
# OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# Author: Filippov Ilia
import sys
import os
import shutil
def write_to_file(filename, line):
f = open(filename, 'a')
f.writelines(line)
f.close()
# remove a file or directory if it exists
def remove_if_exists(filename):
if os.path.exists(filename):
if os.path.isdir(filename):
shutil.rmtree(filename)
else:
os.remove(filename)
# run a command and capture its output (first line or all lines); used to detect tool versions
def take_lines(command, which):
os.system(command + " > " + "temp_detect_version")
version = open("temp_detect_version")
if which == "first":
answer = version.readline()
if which == "all":
answer = version.readlines()
version.close()
remove_if_exists("temp_detect_version")
return answer
# print versions of compilers
def print_version(ispc_test, ispc_ref, ref_compiler, s, perf_log, is_windows):
print_debug("\nUsing test compiler: " + take_lines(ispc_test + " --version", "first"), s, perf_log)
if ispc_ref != "":
print_debug("Using ref compiler: " + take_lines(ispc_ref + " --version", "first"), s, perf_log)
if is_windows == False:
temp1 = take_lines(ref_compiler + " --version", "first")
else:
os.system(ref_compiler + " 2>&1" + " 2> temp_detect_version > temp_detect_version1" )
version = open("temp_detect_version")
temp1 = version.readline()
version.close()
remove_if_exists("temp_detect_version")
remove_if_exists("temp_detect_version1")
print_debug("Using C/C++ compiler: " + temp1 + "\n", s, perf_log)
# print normal (non-error) output from the scripts and mirror it to log files
def print_debug(line, silent, filename):
if silent == False:
sys.stdout.write(line)
sys.stdout.flush()
if os.environ.get("ISPC_HOME") != None:
write_to_file(os.environ["ISPC_HOME"] + os.sep + "notify_log.log", line)
if filename != "":
write_to_file(filename, line)
# print errors from scripts
# type 1 for a fatal error in the environment
# type 2 for a warning
# type 0 for an error of a compiler or test which isn't the goal of the script
def error(line, error_type):
line = line + "\n"
if error_type == 1:
sys.stderr.write("Fatal error: " + line)
sys.exit(1)
if error_type == 2:
sys.stderr.write("Warning: " + line)
if error_type == 0:
print_debug("FIND ERROR: " + line, False, "")
def check_tools(m):
input_tools=[[[1,4],"m4 --version", "bad m4 version"],
[[2,4],"bison --version", "bad bison version"],
[[2,5], "flex --version", "bad flex version"]]
ret = 1
for t in range(0,len(input_tools)):
t1 = ((take_lines(input_tools[t][1], "first"))[:-1].split(" "))
for i in range(0,len(t1)):
t11 = t1[i].split(".")
f = True
for j in range(0,len(t11)):
if not t11[j].isdigit():
f = False
if f == True:
for j in range(0,len(t11)):
if j < len(input_tools[t][0]):
if int(t11[j])<input_tools[t][0][j]:
error(input_tools[t][2], m)
ret = 0
return ret
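Tracing check_tools on a single tool: the first output line of, say, m4 --version is split into tokens, every all-numeric dotted token is treated as a version, and its leading components are compared one by one against the minimum ([1, 4] for m4). A condensed model of that loop (the sample version string is an assumption):

    minimum = [1, 4]                       # required m4 version prefix
    line = "m4 (GNU M4) 1.4.16"            # assumed first line of "m4 --version"
    for token in line.split(" "):
        parts = token.split(".")
        if all(p.isdigit() for p in parts):
            version = [int(p) for p in parts]   # [1, 4, 16]
            ok = all(version[j] >= minimum[j]
                     for j in range(min(len(version), len(minimum))))
            print ok                            # True: 1.4.16 meets the 1.4 minimum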

contrib/ispc.vim

@@ -17,7 +17,14 @@ syn keyword ispcStatement cbreak ccontinue creturn launch print reference soa sy
syn keyword ispcConditional cif
syn keyword ispcRepeat cdo cfor cwhile
syn keyword ispcBuiltin programCount programIndex
syn keyword ispcType export int8 int16 int32 int64
syn keyword ispcType export uniform varying int8 int16 int32 int64
"double precision floating point number, with dot, optional exponent
syn match cFloat display contained "\d\+\.\d*d[-+]\=\d*\>"
"double precision floating point number, starting with dot, optional exponent
syn match cFloat display contained "\.\d*d[-+]\=\d*\>"
"double precision floating point number, without dot, with exponent
syn match cFloat display contained "\d\+d[-+]\=\d\+\>"
" Default highlighting
command -nargs=+ HiLink hi def link <args>

contrib/ispc.vim.README (new file)

@@ -0,0 +1,8 @@
To install vim syntax highlighting for ispc files:
1) Copy ispc.vim into ~/.vim/syntax/ispc.vim (create if necessary)
2) Create a filetype for ispc files to correspond to that syntax file
To do this, create and append the following line to ~/.vim/ftdetect/ispc.vim
au BufRead,BufNewFile *.ispc set filetype=ispc

ctx.cpp (diff suppressed: file too large)

ctx.h

@@ -1,5 +1,5 @@
/*
Copyright (c) 2010-2011, Intel Corporation
Copyright (c) 2010-2013, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -28,11 +28,11 @@
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/** @file ctx.h
@brief Declaration of the FunctionEmitContext class
@brief %Declaration of the FunctionEmitContext class
*/
#ifndef ISPC_CTX_H
@@ -40,10 +40,20 @@
#include "ispc.h"
#include <map>
#include <llvm/InstrTypes.h>
#include <llvm/Instructions.h>
#include <llvm/Analysis/DIBuilder.h>
#include <llvm/Analysis/DebugInfo.h>
#if defined(LLVM_3_1) || defined(LLVM_3_2)
#include <llvm/InstrTypes.h>
#include <llvm/Instructions.h>
#else
#include <llvm/IR/InstrTypes.h>
#include <llvm/IR/Instructions.h>
#endif
#if defined(LLVM_3_1)
#include <llvm/Analysis/DebugInfo.h>
#include <llvm/Analysis/DIBuilder.h>
#else
#include <llvm/DebugInfo.h>
#include <llvm/DIBuilder.h>
#endif
struct CFInfo;
@@ -65,7 +75,7 @@ public:
@param firstStmtPos Source file position of the first statement in the
function
*/
FunctionEmitContext(Function *function, Symbol *funSym,
llvm::Function *llvmFunction,
SourcePos firstStmtPos);
~FunctionEmitContext();
@@ -77,9 +87,9 @@ public:
/** @name Current basic block management
@{
*/
/** Returns the current basic block pointer */
llvm::BasicBlock *GetCurrentBasicBlock();
/** Set the given llvm::BasicBlock to be the basic block to emit
forthcoming instructions into. */
void SetCurrentBasicBlock(llvm::BasicBlock *bblock);
@@ -87,7 +97,7 @@ public:
/** @name Mask management
@{
*/
/** Returns the mask value at entry to the current function. */
/** Returns the mask value at entry to the current function. */
llvm::Value *GetFunctionMask();
/** Returns the mask value corresponding to "varying" control flow
@@ -96,7 +106,7 @@ public:
llvm::Value *GetInternalMask();
/** Returns the complete current mask value--i.e. the logical AND of
the function entry mask and the internal mask. */
the function entry mask and the internal mask. */
llvm::Value *GetFullMask();
/** Returns a pointer to storage in memory that stores the current full
@@ -149,22 +159,21 @@ public:
'continue' statements should jump to (if all running lanes want to
break or continue), uniformControlFlow indicates whether the loop
condition is 'uniform'. */
void StartLoop(llvm::BasicBlock *breakTarget, llvm::BasicBlock *continueTarget,
void StartLoop(llvm::BasicBlock *breakTarget, llvm::BasicBlock *continueTarget,
bool uniformControlFlow);
/** Informs FunctionEmitContext of the value of the mask at the start
of a loop body. */
void SetLoopMask(llvm::Value *mask);
of a loop body or switch statement. */
void SetBlockEntryMask(llvm::Value *mask);
/** Informs FunctionEmitContext that code generation for a loop is
finished. */
void EndLoop();
/** Indicates that code generation for a 'foreach' or 'foreach_tiled'
loop is about to start. The provided basic block pointer indicates
where control flow should go if a 'continue' statement is executed
in the loop. */
void StartForeach(llvm::BasicBlock *continueTarget);
/** Indicates that code generation for a 'foreach', 'foreach_tiled',
'foreach_active', or 'foreach_unique' loop is about to start. */
enum ForeachType { FOREACH_REGULAR, FOREACH_ACTIVE, FOREACH_UNIQUE };
void StartForeach(ForeachType ft);
void EndForeach();
/** Emit code for a 'break' statement in a loop. If doCoherenceCheck
@@ -187,12 +196,60 @@ public:
previous iteration. */
void RestoreContinuedLanes();
/** Indicates that code generation for a "switch" statement is about to
start. isUniform indicates whether the "switch" value is uniform,
and bbAfterSwitch gives the basic block immediately following the
"switch" statement. (For example, if the switch condition is
uniform, we jump here upon executing a "break" statement.) */
void StartSwitch(bool isUniform, llvm::BasicBlock *bbAfterSwitch);
/** Indicates the end of code generation for a "switch" statement. */
void EndSwitch();
/** Emits code for a "switch" statement in the program.
@param expr Gives the value of the expression after the "switch"
@param defaultBlock Basic block to execute for the "default" case. This
should be NULL if there is no "default" label inside
the switch.
@param caseBlocks vector that stores the mapping from label values
after "case" statements to basic blocks corresponding
to the "case" labels.
@param nextBlocks For each basic block for a "case" or "default"
label, this gives the basic block for the
immediately-following "case" or "default" label (or
the basic block after the "switch" statement for the
last label.)
*/
void SwitchInst(llvm::Value *expr, llvm::BasicBlock *defaultBlock,
const std::vector<std::pair<int, llvm::BasicBlock *> > &caseBlocks,
const std::map<llvm::BasicBlock *, llvm::BasicBlock *> &nextBlocks);
/** Generates code for a "default" label after a "switch" statement.
The checkMask parameter indicates whether additional code should be
generated to check to see if the execution mask is all off after
the default label (in which case a jump to the following label will
be issued). */
void EmitDefaultLabel(bool checkMask, SourcePos pos);
/** Generates code for a "case" label after a "switch" statement. See
the documentation for EmitDefaultLabel() for discussion of the
checkMask parameter. */
void EmitCaseLabel(int value, bool checkMask, SourcePos pos);
/** Returns the current number of nested levels of 'varying' control
flow */
int VaryingCFDepth() const;
bool InForeachLoop() const;
/** Temporarily disables emission of performance warnings from gathers
and scatters from subsequent code. */
void DisableGatherScatterWarnings();
/** Reenables emission of gather/scatter performance warnings. */
void EnableGatherScatterWarnings();
void SetContinueTarget(llvm::BasicBlock *bb) { continueTarget = bb; }
/** Step through the code and find label statements; create a basic
block for each one, so that subsequent calls to
GetLabeledBasicBlock() return the corresponding basic block. */
@@ -202,6 +259,10 @@ public:
new basic block that it starts. */
llvm::BasicBlock *GetLabeledBasicBlock(const std::string &label);
/** Returns a vector of all labels in the context. This is
simply the key set of the labelMap */
std::vector<std::string> GetLabels();
/** Called to generate code for 'return' statement; value is the
expression in the return statement (if non-NULL), and
doCoherenceCheck indicates whether instructions should be generated
@@ -211,7 +272,7 @@ public:
/** @} */
/** @name Small helper/utility routines
@{
@{
*/
/** Given a boolean mask value of type LLVMTypes::MaskType, return an
i1 value that indicates if any of the mask lanes are on. */
@@ -222,7 +283,11 @@ public:
llvm::Value *All(llvm::Value *mask);
/** Given a boolean mask value of type LLVMTypes::MaskType, return an
i32 value wherein the i'th bit is on if and only if the i'th lane
i1 value that indicates if all of the mask lanes are off. */
llvm::Value *None(llvm::Value *mask);
/** Given a boolean mask value of type LLVMTypes::MaskType, return an
i64 value wherein the i'th bit is on if and only if the i'th lane
of the mask is on. */
llvm::Value *LaneMask(llvm::Value *mask);
@@ -230,6 +295,10 @@ public:
that indicates whether the two masks are equal. */
llvm::Value *MasksAllEqual(llvm::Value *mask1, llvm::Value *mask2);
/** Generate ConstantVector, which contains ProgramIndex, i.e.
< i32 0, i32 1, i32 2, i32 3> */
llvm::Value *ProgramIndexVector(bool is32bits = true);
/** Given a string, create an anonymous global variable to hold its
value and return the pointer to the string. */
llvm::Value *GetStringPtr(const std::string &str);
@@ -267,7 +336,7 @@ public:
llvm::Instruction for convenience; in calling code we often have
Instructions stored using Value pointers; the code here returns
silently if it's not actually given an instruction. */
void AddDebugPos(llvm::Value *instruction, const SourcePos *pos = NULL,
void AddDebugPos(llvm::Value *instruction, const SourcePos *pos = NULL,
llvm::DIScope *scope = NULL);
/** Inform the debugging information generation code that a new scope
@@ -288,7 +357,7 @@ public:
/** Emits debugging information for the function parameter represented
by sym. */
void EmitFunctionParameterDebugInfo(Symbol *sym);
void EmitFunctionParameterDebugInfo(Symbol *sym, int parameterNum);
/** @} */
/** @name IR instruction emission
@@ -296,7 +365,7 @@ public:
instructions. See the LLVM assembly language reference manual
(http://llvm.org/docs/LangRef.html) and the LLVM doxygen documentation
(http://llvm.org/doxygen) for more information. Here we will only
document significant generalizations to the functionality of the
document significant generalizations to the functionality of the
corresponding basic LLVM instructions.
Beyond actually emitting the instruction, the implementations of
@@ -312,7 +381,7 @@ public:
this also handles applying the given operation to the vector
elements. */
llvm::Value *BinaryOperator(llvm::Instruction::BinaryOps inst,
llvm::Value *v0, llvm::Value *v1,
llvm::Value *v0, llvm::Value *v1,
const char *name = NULL);
/** Emit the "not" operator. Like BinaryOperator(), this also handles
@@ -322,7 +391,7 @@ public:
/** Emit a comparison instruction. If the operands are VectorTypes,
then a value for the corresponding boolean VectorType is
returned. */
llvm::Value *CmpInst(llvm::Instruction::OtherOps inst,
llvm::Value *CmpInst(llvm::Instruction::OtherOps inst,
llvm::CmpInst::Predicate pred,
llvm::Value *v0, llvm::Value *v1, const char *name = NULL);
@@ -330,25 +399,35 @@ public:
array, for pointer types). */
llvm::Value *SmearUniform(llvm::Value *value, const char *name = NULL);
llvm::Value *BitCastInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
llvm::Value *BitCastInst(llvm::Value *value, llvm::Type *type,
const char *name = NULL);
llvm::Value *PtrToIntInst(llvm::Value *value, const char *name = NULL);
llvm::Value *PtrToIntInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
llvm::Value *PtrToIntInst(llvm::Value *value, llvm::Type *type,
const char *name = NULL);
llvm::Value *IntToPtrInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
llvm::Value *IntToPtrInst(llvm::Value *value, llvm::Type *type,
const char *name = NULL);
llvm::Instruction *TruncInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
llvm::Instruction *TruncInst(llvm::Value *value, llvm::Type *type,
const char *name = NULL);
llvm::Instruction *CastInst(llvm::Instruction::CastOps op, llvm::Value *value,
LLVM_TYPE_CONST llvm::Type *type, const char *name = NULL);
llvm::Instruction *FPCastInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
llvm::Type *type, const char *name = NULL);
llvm::Instruction *FPCastInst(llvm::Value *value, llvm::Type *type,
const char *name = NULL);
llvm::Instruction *SExtInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
llvm::Instruction *SExtInst(llvm::Value *value, llvm::Type *type,
const char *name = NULL);
llvm::Instruction *ZExtInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
llvm::Instruction *ZExtInst(llvm::Value *value, llvm::Type *type,
const char *name = NULL);
/** Given two integer-typed values (but possibly one vector and the
other not, and/or of possibly-different bit-widths), update their
values as needed so that the two have the same (more general)
type. */
void MatchIntegerTypes(llvm::Value **v0, llvm::Value **v1);
/** Create a new slice pointer out of the given pointer to an soa type
and an integer offset to a slice within that type. */
llvm::Value *MakeSlicePointer(llvm::Value *ptr, llvm::Value *offset);
/** These GEP methods are generalizations of the standard ones in LLVM;
they support both uniform and varying basePtr values as well as
uniform and varying index values (arrays of indices). Varying base
@@ -369,7 +448,8 @@ public:
the type of the pointer, though it may be NULL if the base pointer
is uniform. */
llvm::Value *AddElementOffset(llvm::Value *basePtr, int elementNum,
const Type *ptrType, const char *name = NULL);
const Type *ptrType, const char *name = NULL,
const PointerType **resultPtrType = NULL);
/** Load from the memory location(s) given by lvalue, using the given
mask. The lvalue may be varying, in which case this corresponds to
@@ -386,9 +466,9 @@ public:
allocated at the given alignment. By default, the alloca
instruction is added at the start of the function in the entry
basic block; if it should be added to the current basic block, then
the atEntryBlock parameter should be false. */
llvm::Value *AllocaInst(LLVM_TYPE_CONST llvm::Type *llvmType,
const char *name = NULL, int align = 0,
the atEntryBlock parameter should be false. */
llvm::Value *AllocaInst(llvm::Type *llvmType,
const char *name = NULL, int align = 0,
bool atEntryBlock = true);
/** Standard store instruction; for this variant, the lvalue must be a
@@ -400,7 +480,14 @@ public:
varying, the given storeMask is used to mask the stores so that
they only execute for the active program instances. */
void StoreInst(llvm::Value *value, llvm::Value *ptr,
llvm::Value *storeMask, const Type *ptrType);
llvm::Value *storeMask, const Type *valueType,
const Type *ptrType);
/** Copy count bytes of memory from the location pointed to by src to
the location pointed to by dest. (src and dest must not be
overlapping.) */
void MemcpyInst(llvm::Value *dest, llvm::Value *src, llvm::Value *count,
llvm::Value *align = NULL);
void BranchInst(llvm::BasicBlock *block);
void BranchInst(llvm::BasicBlock *trueBlock, llvm::BasicBlock *falseBlock,
@@ -414,10 +501,20 @@ public:
/** This convenience method maps to an llvm::InsertElementInst if the
given value is a llvm::VectorType, and to an llvm::InsertValueInst
otherwise. */
llvm::Value *InsertInst(llvm::Value *v, llvm::Value *eltVal, int elt,
llvm::Value *InsertInst(llvm::Value *v, llvm::Value *eltVal, int elt,
const char *name = NULL);
llvm::PHINode *PhiNode(LLVM_TYPE_CONST llvm::Type *type, int count,
/** This convenience method maps to an llvm::ShuffleVectorInst. */
llvm::Value *ShuffleInst(llvm::Value *v1, llvm::Value *v2, llvm::Value *mask,
const char *name = NULL);
/** This convenience method generates a broadcast pattern: it takes a
value and a vector type; the type of the value must match the element
type of the vector. */
llvm::Value *BroadcastValue(llvm::Value *v, llvm::Type *vecType,
const char *name = NULL);
llvm::PHINode *PhiNode(llvm::Type *type, int count,
const char *name = NULL);
llvm::Instruction *SelectInst(llvm::Value *test, llvm::Value *val0,
llvm::Value *val1, const char *name = NULL);
@@ -443,7 +540,7 @@ public:
/** Launch an asynchronous task to run the given function, passing it
the given argument values. */
llvm::Value *LaunchInst(llvm::Value *callee,
llvm::Value *LaunchInst(llvm::Value *callee,
std::vector<llvm::Value *> &argVals,
llvm::Value *launchCount);
@@ -488,14 +585,14 @@ private:
for error messages and debugging symbols. */
SourcePos funcStartPos;
/** If currently in a loop body, the value of the mask at the start of
the loop. */
llvm::Value *loopMask;
/** If currently in a loop body or switch statement, the value of the
mask at the start of it. */
llvm::Value *blockEntryMask;
/** If currently in a loop body, this is a pointer to memory to store a
mask value that represents which of the lanes have executed a
'break' statement. If we're not in a loop body, this should be
NULL. */
/** If currently in a loop body or switch statement, this is a pointer
to memory to store a mask value that represents which of the lanes
have executed a 'break' statement. If we're not in a loop body or
switch, this should be NULL. */
llvm::Value *breakLanesPtr;
/** Similar to breakLanesPtr, if we're inside a loop, this is a pointer
@@ -503,16 +600,49 @@ private:
'continue' statement. */
llvm::Value *continueLanesPtr;
/** If we're inside a loop, this gives the basic block immediately
after the current loop, which we will jump to if all of the lanes
have executed a break statement or are otherwise done with the
loop. */
/** If we're inside a loop or switch statement, this gives the basic
block immediately after the current loop or switch, which we will
jump to if all of the lanes have executed a break statement or are
otherwise done with it. */
llvm::BasicBlock *breakTarget;
/** If we're inside a loop, this gives the block to jump to if all of
the running lanes have executed a 'continue' statement. */
llvm::BasicBlock *continueTarget;
/** @name Switch statement state
These variables store various state that's active when we're
generating code for a switch statement. They should all be NULL
outside of a switch.
@{
*/
/** The value of the expression used to determine which case in the
statements after the switch to execute. */
llvm::Value *switchExpr;
/** Map from case label numbers to the basic block that will hold code
for that case. */
const std::vector<std::pair<int, llvm::BasicBlock *> > *caseBlocks;
/** The basic block of code to run for the "default" label in the
switch statement. */
llvm::BasicBlock *defaultBlock;
/** For each basic block for the code for cases (and the default label,
if present), this map gives the basic block for the immediately
following case/default label. */
const std::map<llvm::BasicBlock *, llvm::BasicBlock *> *nextBlocks;
/** Records whether the switch condition was uniform; this is a
distinct notion from whether the switch represents uniform or
varying control flow; we may have varying control flow from a
uniform switch condition if there is a 'break' inside the switch
that's under varying control flow. */
bool switchConditionWasUniform;
/** @} */
/** A pointer to memory that records which of the program instances
have executed a 'return' statement (and are thus really truly done
running any more instructions in this function). */
@@ -531,12 +661,12 @@ private:
std::vector<CFInfo *> controlFlowInfo;
/** DIFile object corresponding to the source file where the current
function was defined (used for debugging info0. */
function was defined (used for debugging info). */
llvm::DIFile diFile;
/** DISubprogram corresponding to this function (used for debugging
info). */
llvm::DISubprogram diFunction;
llvm::DISubprogram diSubprogram;
/** These correspond to the current set of nested scopes in the
function. */
@@ -550,27 +680,43 @@ private:
tasks launched from the current function. */
llvm::Value *launchGroupHandlePtr;
/** Nesting count of the number of times calling code has disabled (and
not yet reenabled) gather/scatter performance warnings. */
int disableGSWarningCount;
std::map<std::string, llvm::BasicBlock *> labelMap;
static bool initLabelBBlocks(ASTNode *node, void *data);
llvm::Value *pointerVectorToVoidPointers(llvm::Value *value);
static void addGSMetadata(llvm::Value *inst, SourcePos pos);
bool ifsInLoopAllUniform() const;
bool ifsInCFAllUniform(int cfType) const;
void jumpIfAllLoopLanesAreDone(llvm::BasicBlock *target);
llvm::Value *emitGatherCallback(llvm::Value *lvalue, llvm::Value *retPtr);
llvm::Value *applyVaryingGEP(llvm::Value *basePtr, llvm::Value *index,
llvm::Value *applyVaryingGEP(llvm::Value *basePtr, llvm::Value *index,
const Type *ptrType);
void restoreMaskGivenReturns(llvm::Value *oldMask);
void addSwitchMaskCheck(llvm::Value *mask);
bool inSwitchStatement() const;
llvm::Value *getMaskAtSwitchEntry();
void scatter(llvm::Value *value, llvm::Value *ptr, const Type *ptrType,
llvm::Value *mask);
CFInfo *popCFState();
void scatter(llvm::Value *value, llvm::Value *ptr, const Type *valueType,
const Type *ptrType, llvm::Value *mask);
void maskedStore(llvm::Value *value, llvm::Value *ptr, const Type *ptrType,
llvm::Value *mask);
llvm::Value *gather(llvm::Value *ptr, const Type *ptrType, llvm::Value *mask,
const char *name);
void storeUniformToSOA(llvm::Value *value, llvm::Value *ptr,
llvm::Value *mask, const Type *valueType,
const PointerType *ptrType);
llvm::Value *loadUniformFromSOA(llvm::Value *ptr, llvm::Value *mask,
const PointerType *ptrType, const char *name);
llvm::Value *gather(llvm::Value *ptr, const PointerType *ptrType,
llvm::Value *mask, const char *name);
llvm::Value *addVaryingOffsetsIfNeeded(llvm::Value *ptr, const Type *ptrType);
};
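Taken together, the switch-related additions above (StartSwitch() / EndSwitch(),
SwitchInst(), EmitCaseLabel() / EmitDefaultLabel(), and the new switch state
fields) implement 'switch' statements under possibly-varying control flow. A
minimal ispc-level sketch of the kind of code this machinery compiles
(hypothetical example, not taken from the repository):

    int value = programIndex & 3;   // varying: lanes may take different cases
    int result;
    switch (value) {
    case 0:  result = 0;   break;
    case 1:  result = 10;  break;
    default: result = -1;  break;
    }

When 'value' is uniform, a 'break' jumps directly to the block after the
switch; when it is varying, the emitted code updates the per-lane break masks
described in the comments above.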

decl.cpp

@@ -1,5 +1,5 @@
/*
Copyright (c) 2010-2011, Intel Corporation
Copyright (c) 2010-2013, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -28,12 +28,12 @@
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/** @file decl.cpp
@brief Implementations of classes related to turning declarations into
symbols and types.
@brief Implementations of classes related to turning declarations into
symbol names and types.
*/
#include "decl.h"
@@ -44,6 +44,7 @@
#include "stmt.h"
#include "expr.h"
#include <stdio.h>
#include <string.h>
#include <set>
static void
@@ -55,26 +56,45 @@ lPrintTypeQualifiers(int typeQualifiers) {
if (typeQualifiers & TYPEQUAL_TASK) printf("task ");
if (typeQualifiers & TYPEQUAL_SIGNED) printf("signed ");
if (typeQualifiers & TYPEQUAL_UNSIGNED) printf("unsigned ");
if (typeQualifiers & TYPEQUAL_EXPORT) printf("export ");
if (typeQualifiers & TYPEQUAL_UNMASKED) printf("unmasked ");
}
/** Given a Type and a set of type qualifiers, apply the type qualifiers to
the type, returning the type that is the result.
the type, returning the type that is the result.
*/
static const Type *
lApplyTypeQualifiers(int typeQualifiers, const Type *type, SourcePos pos) {
if (type == NULL)
return NULL;
if ((typeQualifiers & TYPEQUAL_CONST) != 0)
if ((typeQualifiers & TYPEQUAL_CONST) != 0) {
type = type->GetAsConstType();
}
if ((typeQualifiers & TYPEQUAL_UNIFORM) != 0)
type = type->GetAsUniformType();
else if ((typeQualifiers & TYPEQUAL_VARYING) != 0)
type = type->GetAsVaryingType();
else
type = type->GetAsUnboundVariabilityType();
if ( ((typeQualifiers & TYPEQUAL_UNIFORM) != 0)
&& ((typeQualifiers & TYPEQUAL_VARYING) != 0) ) {
Error(pos, "Type \"%s\" cannot be qualified with both uniform and varying.",
type->GetString().c_str());
}
if ((typeQualifiers & TYPEQUAL_UNIFORM) != 0) {
if (Type::Equal(type, AtomicType::Void))
Error(pos, "\"uniform\" qualifier is illegal with \"void\" type.");
else
type = type->GetAsUniformType();
}
else if ((typeQualifiers & TYPEQUAL_VARYING) != 0) {
if (Type::Equal(type, AtomicType::Void))
Error(pos, "\"varying\" qualifier is illegal with \"void\" type.");
else
type = type->GetAsVaryingType();
}
else {
if (Type::Equal(type, AtomicType::Void) == false)
type = type->GetAsUnboundVariabilityType();
}
if ((typeQualifiers & TYPEQUAL_UNSIGNED) != 0) {
if ((typeQualifiers & TYPEQUAL_SIGNED) != 0)
@@ -84,15 +104,20 @@ lApplyTypeQualifiers(int typeQualifiers, const Type *type, SourcePos pos) {
const Type *unsignedType = type->GetAsUnsignedType();
if (unsignedType != NULL)
type = unsignedType;
else
else {
const Type *resolvedType =
type->ResolveUnboundVariability(Variability::Varying);
Error(pos, "\"unsigned\" qualifier is illegal with \"%s\" type.",
type->ResolveUnboundVariability(Type::Varying)->GetString().c_str());
resolvedType->GetString().c_str());
}
}
if ((typeQualifiers & TYPEQUAL_SIGNED) != 0 && type->IsIntType() == false)
if ((typeQualifiers & TYPEQUAL_SIGNED) != 0 && type->IsIntType() == false) {
const Type *resolvedType =
type->ResolveUnboundVariability(Variability::Varying);
Error(pos, "\"signed\" qualifier is illegal with non-integer type "
"\"%s\".",
type->ResolveUnboundVariability(Type::Varying)->GetString().c_str());
"\"%s\".", resolvedType->GetString().c_str());
}
return type;
}
@@ -107,23 +132,75 @@ DeclSpecs::DeclSpecs(const Type *t, StorageClass sc, int tq) {
typeQualifiers = tq;
soaWidth = 0;
vectorSize = 0;
if (t != NULL) {
if (m->symbolTable->ContainsType(t)) {
// Typedefs might have uniform/varying qualifiers inside.
if (t->IsVaryingType()) {
typeQualifiers |= TYPEQUAL_VARYING;
}
else if (t->IsUniformType()) {
typeQualifiers |= TYPEQUAL_UNIFORM;
}
}
}
}
const Type *
DeclSpecs::GetBaseType(SourcePos pos) const {
const Type *bt = baseType;
const Type *retType = baseType;
if (retType == NULL) {
Warning(pos, "No type specified in declaration. Assuming int32.");
retType = AtomicType::UniformInt32->GetAsUnboundVariabilityType();
}
if (vectorSize > 0) {
const AtomicType *atomicType = dynamic_cast<const AtomicType *>(bt);
const AtomicType *atomicType = CastType<AtomicType>(retType);
if (atomicType == NULL) {
Error(pos, "Only atomic types (int, float, ...) are legal for vector "
"types.");
return NULL;
}
bt = new VectorType(atomicType, vectorSize);
retType = new VectorType(atomicType, vectorSize);
}
return lApplyTypeQualifiers(typeQualifiers, bt, pos);
retType = lApplyTypeQualifiers(typeQualifiers, retType, pos);
if (soaWidth > 0) {
const StructType *st = CastType<StructType>(retType);
if (st == NULL) {
Error(pos, "Illegal to provide soa<%d> qualifier with non-struct "
"type \"%s\".", soaWidth, retType->GetString().c_str());
return NULL;
}
else if (soaWidth <= 0 || (soaWidth & (soaWidth - 1)) != 0) {
Error(pos, "soa<%d> width illegal. Value must be positive power "
"of two.", soaWidth);
return NULL;
}
if (st->IsUniformType()) {
Error(pos, "\"uniform\" qualifier and \"soa<%d>\" qualifier can't "
"both be used in a type declaration.", soaWidth);
return NULL;
}
else if (st->IsVaryingType()) {
Error(pos, "\"varying\" qualifier and \"soa<%d>\" qualifier can't "
"both be used in a type declaration.", soaWidth);
return NULL;
}
else
retType = st->GetAsSOAType(soaWidth);
if (soaWidth < g->target->getVectorWidth())
PerformanceWarning(pos, "soa<%d> width smaller than gang size %d "
"currently leads to inefficient code to access "
"soa types.", soaWidth, g->target->getVectorWidth());
}
return retType;
}
@@ -133,7 +210,6 @@ lGetStorageClassName(StorageClass storageClass) {
case SC_NONE: return "";
case SC_EXTERN: return "extern";
case SC_EXTERN_C: return "extern \"C\"";
case SC_EXPORT: return "export";
case SC_STATIC: return "static";
case SC_TYPEDEF: return "typedef";
default: FATAL("Unhandled storage class in lGetStorageClassName");
@@ -158,35 +234,35 @@ DeclSpecs::Print() const {
///////////////////////////////////////////////////////////////////////////
// Declarator
Declarator::Declarator(DeclaratorKind dk, SourcePos p)
: pos(p), kind(dk) {
Declarator::Declarator(DeclaratorKind dk, SourcePos p)
: pos(p), kind(dk) {
child = NULL;
typeQualifiers = 0;
storageClass = SC_NONE;
arraySize = -1;
sym = NULL;
type = NULL;
initExpr = NULL;
}
void
Declarator::InitFromDeclSpecs(DeclSpecs *ds) {
const Type *t = GetType(ds);
Symbol *sym = GetSymbol();
if (sym != NULL) {
sym->type = t;
sym->storageClass = ds->storageClass;
const Type *baseType = ds->GetBaseType(pos);
InitFromType(baseType, ds);
if (type == NULL) {
AssertPos(pos, m->errorCount > 0);
return;
}
}
storageClass = ds->storageClass;
Symbol *
Declarator::GetSymbol() const {
// The symbol lives at the last child in the chain, so walk down there
// and return the one there.
const Declarator *d = this;
while (d->child != NULL)
d = d->child;
return d->sym;
if (ds->declSpecList.size() > 0 &&
CastType<FunctionType>(type) == NULL) {
Error(pos, "__declspec specifiers for non-function type \"%s\" are "
"not used.", type->GetString().c_str());
}
}
@@ -196,11 +272,11 @@ Declarator::Print(int indent) const {
pos.Print();
lPrintTypeQualifiers(typeQualifiers);
Symbol *sym = GetSymbol();
if (sym != NULL)
printf("%s", sym->name.c_str());
printf("%s ", lGetStorageClassName(storageClass));
if (name.size() > 0)
printf("%s", name.c_str());
else
printf("(null symbol)");
printf("(unnamed)");
printf(", array size = %d", arraySize);
@@ -234,115 +310,112 @@ Declarator::Print(int indent) const {
}
Symbol *
Declarator::GetFunctionInfo(DeclSpecs *ds, std::vector<Symbol *> *funArgs) {
const FunctionType *type =
dynamic_cast<const FunctionType *>(GetType(ds));
if (type == NULL)
return NULL;
Symbol *declSym = GetSymbol();
Assert(declSym != NULL);
// Get the symbol for the function from the symbol table. (It should
// already have been added to the symbol table by AddGlobal() by the
// time we get here.)
Symbol *funSym = m->symbolTable->LookupFunction(declSym->name.c_str(), type);
if (funSym != NULL)
// May be NULL due to error earlier in compilation
funSym->pos = pos;
// Walk down to the declarator for the function. (We have to get past
// the stuff that specifies the function's return type before we get to
// the function's declarator.)
Declarator *d = this;
while (d != NULL && d->kind != DK_FUNCTION)
d = d->child;
Assert(d != NULL);
for (unsigned int i = 0; i < d->functionParams.size(); ++i) {
Symbol *sym = d->GetSymbolForFunctionParameter(i);
sym->type = sym->type->ResolveUnboundVariability(Type::Varying);
funArgs->push_back(sym);
}
funSym->type = funSym->type->ResolveUnboundVariability(Type::Varying);
return funSym;
}
const Type *
Declarator::GetType(const Type *base, DeclSpecs *ds) const {
void
Declarator::InitFromType(const Type *baseType, DeclSpecs *ds) {
bool hasUniformQual = ((typeQualifiers & TYPEQUAL_UNIFORM) != 0);
bool hasVaryingQual = ((typeQualifiers & TYPEQUAL_VARYING) != 0);
bool isTask = ((typeQualifiers & TYPEQUAL_TASK) != 0);
bool isExported = ((typeQualifiers & TYPEQUAL_EXPORT) != 0);
bool isConst = ((typeQualifiers & TYPEQUAL_CONST) != 0);
bool isUnmasked = ((typeQualifiers & TYPEQUAL_UNMASKED) != 0);
if (hasUniformQual && hasVaryingQual) {
Error(pos, "Can't provide both \"uniform\" and \"varying\" qualifiers.");
return NULL;
return;
}
if (kind != DK_FUNCTION && isTask)
if (kind != DK_FUNCTION && isTask) {
Error(pos, "\"task\" qualifier illegal in variable declaration.");
return;
}
if (kind != DK_FUNCTION && isUnmasked) {
Error(pos, "\"unmasked\" qualifier illegal in variable declaration.");
return;
}
if (kind != DK_FUNCTION && isExported) {
Error(pos, "\"export\" qualifier illegal in variable declaration.");
return;
}
Type::Variability variability = Type::Unbound;
Variability variability(Variability::Unbound);
if (hasUniformQual)
variability = Type::Uniform;
variability = Variability::Uniform;
else if (hasVaryingQual)
variability = Type::Varying;
variability = Variability::Varying;
const Type *type = base;
switch (kind) {
case DK_BASE:
if (kind == DK_BASE) {
// All of the type qualifiers should be in the DeclSpecs for the
// base declarator
Assert(typeQualifiers == 0);
Assert(child == NULL);
return type;
case DK_POINTER:
type = new PointerType(type, variability, isConst);
if (child != NULL)
return child->GetType(type, ds);
AssertPos(pos, typeQualifiers == 0);
AssertPos(pos, child == NULL);
type = baseType;
}
else if (kind == DK_POINTER) {
/* For now, any pointer to an SOA type gets the slice property; if
we add the capability to declare pointers as slices or not,
we'll want to set this based on a type qualifier here. */
const Type *ptrType = new PointerType(baseType, variability, isConst,
baseType->IsSOAType());
if (child != NULL) {
child->InitFromType(ptrType, ds);
type = child->type;
name = child->name;
}
else
return type;
break;
case DK_REFERENCE:
if (hasUniformQual)
type = ptrType;
}
else if (kind == DK_REFERENCE) {
if (hasUniformQual) {
Error(pos, "\"uniform\" qualifier is illegal to apply to references.");
if (hasVaryingQual)
return;
}
if (hasVaryingQual) {
Error(pos, "\"varying\" qualifier is illegal to apply to references.");
if (isConst)
return;
}
if (isConst) {
Error(pos, "\"const\" qualifier is illegal to apply to references.");
return;
}
// The parser should disallow this already, but double check.
if (dynamic_cast<const ReferenceType *>(type) != NULL) {
if (CastType<ReferenceType>(baseType) != NULL) {
Error(pos, "References to references are illegal.");
return NULL;
return;
}
type = new ReferenceType(type);
if (child != NULL)
return child->GetType(type, ds);
const Type *refType = new ReferenceType(baseType);
if (child != NULL) {
child->InitFromType(refType, ds);
type = child->type;
name = child->name;
}
else
return type;
break;
type = refType;
}
else if (kind == DK_ARRAY) {
if (Type::Equal(baseType, AtomicType::Void)) {
Error(pos, "Arrays of \"void\" type are illegal.");
return;
}
if (CastType<ReferenceType>(baseType)) {
Error(pos, "Arrays of references (type \"%s\") are illegal.",
baseType->GetString().c_str());
return;
}
case DK_ARRAY:
type = new ArrayType(type, arraySize);
if (child)
return child->GetType(type, ds);
const Type *arrayType = new ArrayType(baseType, arraySize);
if (child != NULL) {
child->InitFromType(arrayType, ds);
type = child->type;
name = child->name;
}
else
return type;
break;
case DK_FUNCTION: {
std::vector<const Type *> args;
std::vector<std::string> argNames;
std::vector<ConstExpr *> argDefaults;
std::vector<SourcePos> argPos;
type = arrayType;
}
else if (kind == DK_FUNCTION) {
llvm::SmallVector<const Type *, 8> args;
llvm::SmallVector<std::string, 8> argNames;
llvm::SmallVector<Expr *, 8> argDefaults;
llvm::SmallVector<SourcePos, 8> argPos;
// Loop over the function arguments and store the names, types,
// default values (if any), and source file positions each one in
@@ -350,15 +423,44 @@ Declarator::GetType(const Type *base, DeclSpecs *ds) const {
for (unsigned int i = 0; i < functionParams.size(); ++i) {
Declaration *d = functionParams[i];
Symbol *sym = GetSymbolForFunctionParameter(i);
if (d == NULL) {
AssertPos(pos, m->errorCount > 0);
continue;
}
if (d->declarators.size() == 0) {
// function declaration like foo(float), w/o a name for the
// parameter; wire up a placeholder Declarator for it
d->declarators.push_back(new Declarator(DK_BASE, pos));
d->declarators[0]->InitFromDeclSpecs(d->declSpecs);
}
AssertPos(pos, d->declarators.size() == 1);
Declarator *decl = d->declarators[0];
if (decl == NULL || decl->type == NULL) {
AssertPos(pos, m->errorCount > 0);
continue;
}
if (decl->name == "") {
// Give a name to any anonymous parameter declarations
char buf[32];
sprintf(buf, "__anon_parameter_%d", i);
decl->name = buf;
}
decl->type = decl->type->ResolveUnboundVariability(Variability::Varying);
if (d->declSpecs->storageClass != SC_NONE)
Error(sym->pos, "Storage class \"%s\" is illegal in "
"function parameter declaration for parameter \"%s\".",
Error(decl->pos, "Storage class \"%s\" is illegal in "
"function parameter declaration for parameter \"%s\".",
lGetStorageClassName(d->declSpecs->storageClass),
sym->name.c_str());
decl->name.c_str());
if (Type::Equal(decl->type, AtomicType::Void)) {
Error(decl->pos, "Parameter with type \"void\" illegal in function "
"parameter list.");
decl->type = NULL;
}
const ArrayType *at = dynamic_cast<const ArrayType *>(sym->type);
const ArrayType *at = CastType<ArrayType>(decl->type);
if (at != NULL) {
// As in C, arrays are passed to functions as pointers to
// their element type. We'll just immediately make this
@@ -368,144 +470,124 @@ Declarator::GetType(const Type *base, DeclSpecs *ds) const {
// report this differently than it was originally declared
// in the function, but it's not clear that this is a
// significant problem.)
sym->type = PointerType::GetUniform(at->GetElementType());
const Type *targetType = at->GetElementType();
if (targetType == NULL) {
AssertPos(pos, m->errorCount > 0);
return;
}
decl->type = PointerType::GetUniform(targetType, at->IsSOAType());
// Make sure there are no unsized arrays (other than the
// first dimension) in function parameter lists.
at = dynamic_cast<const ArrayType *>(at->GetElementType());
at = CastType<ArrayType>(targetType);
while (at != NULL) {
if (at->GetElementCount() == 0)
Error(sym->pos, "Arrays with unsized dimensions in "
Error(decl->pos, "Arrays with unsized dimensions in "
"dimensions after the first one are illegal in "
"function parameter lists.");
at = dynamic_cast<const ArrayType *>(at->GetElementType());
at = CastType<ArrayType>(at->GetElementType());
}
}
args.push_back(sym->type);
argNames.push_back(sym->name);
argPos.push_back(sym->pos);
args.push_back(decl->type);
argNames.push_back(decl->name);
argPos.push_back(decl->pos);
ConstExpr *init = NULL;
if (d->declarators.size()) {
// Try to find an initializer expression; if there is one,
// it lives down to the base declarator.
Declarator *decl = d->declarators[0];
while (decl->child != NULL) {
Assert(decl->initExpr == NULL);
Expr *init = NULL;
// Try to find an initializer expression.
while (decl != NULL) {
if (decl->initExpr != NULL) {
decl->initExpr = TypeCheck(decl->initExpr);
decl->initExpr = Optimize(decl->initExpr);
if (decl->initExpr != NULL) {
init = dynamic_cast<ConstExpr *>(decl->initExpr);
if (init == NULL)
init = dynamic_cast<NullPointerExpr *>(decl->initExpr);
if (init == NULL)
Error(decl->initExpr->pos, "Default value for parameter "
"\"%s\" must be a compile-time constant.",
decl->name.c_str());
}
break;
}
else
decl = decl->child;
}
if (decl->initExpr != NULL &&
(decl->initExpr = TypeCheck(decl->initExpr)) != NULL &&
(decl->initExpr = Optimize(decl->initExpr)) != NULL &&
(init = dynamic_cast<ConstExpr *>(decl->initExpr)) == NULL) {
Error(decl->initExpr->pos, "Default value for parameter "
"\"%s\" must be a compile-time constant.",
sym->name.c_str());
}
}
argDefaults.push_back(init);
}
const Type *returnType = type;
const Type *returnType = baseType;
if (returnType == NULL) {
Error(pos, "No return type provided in function declaration.");
return NULL;
return;
}
bool isExported = ds && (ds->storageClass == SC_EXPORT);
if (CastType<FunctionType>(returnType) != NULL) {
Error(pos, "Illegal to return function type from function.");
return;
}
returnType = returnType->ResolveUnboundVariability(Variability::Varying);
bool isExternC = ds && (ds->storageClass == SC_EXTERN_C);
bool isExported = ds && ((ds->typeQualifiers & TYPEQUAL_EXPORT) != 0);
bool isTask = ds && ((ds->typeQualifiers & TYPEQUAL_TASK) != 0);
bool isUnmasked = ds && ((ds->typeQualifiers & TYPEQUAL_UNMASKED) != 0);
if (isExported && isTask) {
Error(pos, "Function can't have both \"task\" and \"export\" "
"qualifiers");
return NULL;
return;
}
if (isExternC && isTask) {
Error(pos, "Function can't have both \"extern \"C\"\" and \"task\" "
"qualifiers");
return NULL;
return;
}
if (isExternC && isExported) {
Error(pos, "Function can't have both \"extern \"C\"\" and \"export\" "
"qualifiers");
return NULL;
return;
}
if (isUnmasked && isExported)
Warning(pos, "\"unmasked\" qualifier is redundant for exported "
"functions.");
if (child == NULL) {
AssertPos(pos, m->errorCount > 0);
return;
}
const Type *functionType =
const FunctionType *functionType =
new FunctionType(returnType, args, argNames, argDefaults,
argPos, isTask, isExported, isExternC);
functionType = functionType->ResolveUnboundVariability(Type::Varying);
return child->GetType(functionType, ds);
}
default:
FATAL("Unexpected decl kind");
return NULL;
}
argPos, isTask, isExported, isExternC, isUnmasked);
#if 0
// Make sure we actually have an array of structs ..
const StructType *childStructType =
dynamic_cast<const StructType *>(childType);
if (childStructType == NULL) {
Error(pos, "Illegal to provide soa<%d> qualifier with non-struct "
"type \"%s\".", soaWidth, childType->GetString().c_str());
return new ArrayType(childType, arraySize == -1 ? 0 : arraySize);
// handle any explicit __declspecs on the function
if (ds != NULL) {
for (int i = 0; i < (int)ds->declSpecList.size(); ++i) {
std::string str = ds->declSpecList[i].first;
SourcePos pos = ds->declSpecList[i].second;
if (str == "safe")
(const_cast<FunctionType *>(functionType))->isSafe = true;
else if (!strncmp(str.c_str(), "cost", 4)) {
int cost = atoi(str.c_str() + 4);
if (cost < 0)
Error(pos, "Negative function cost %d is illegal.",
cost);
(const_cast<FunctionType *>(functionType))->costOverride = cost;
}
else
Error(pos, "__declspec parameter \"%s\" unknown.", str.c_str());
}
else if ((soaWidth & (soaWidth - 1)) != 0) {
Error(pos, "soa<%d> width illegal. Value must be power of two.",
soaWidth);
return NULL;
}
else if (arraySize != -1 && (arraySize % soaWidth) != 0) {
Error(pos, "soa<%d> width must evenly divide array size %d.",
soaWidth, arraySize);
return NULL;
}
return new SOAArrayType(childStructType, arraySize == -1 ? 0 : arraySize,
soaWidth);
#endif
}
const Type *
Declarator::GetType(DeclSpecs *ds) const {
const Type *baseType = ds->GetBaseType(pos);
const Type *type = GetType(baseType, ds);
return type;
}
Symbol *
Declarator::GetSymbolForFunctionParameter(int paramNum) const {
Assert(paramNum < (int)functionParams.size());
Declaration *d = functionParams[paramNum];
char buf[32];
Symbol *sym;
if (d->declarators.size() == 0) {
// function declaration like foo(float), w/o a name for
// the parameter
sprintf(buf, "__anon_parameter_%d", paramNum);
sym = new Symbol(buf, pos);
sym->type = d->declSpecs->GetBaseType(pos);
}
else {
Assert(d->declarators.size() == 1);
sym = d->declarators[0]->GetSymbol();
if (sym == NULL) {
// Handle more complex anonymous declarations like
// float (float **).
sprintf(buf, "__anon_parameter_%d", paramNum);
sym = new Symbol(buf, d->declarators[0]->pos);
sym->type = d->declarators[0]->GetType(d->declSpecs);
}
}
return sym;
}
child->InitFromType(functionType, ds);
type = child->type;
name = child->name;
}
}
///////////////////////////////////////////////////////////////////////////
// Declaration
@@ -529,6 +611,7 @@ Declaration::Declaration(DeclSpecs *ds, Declarator *d) {
}
std::vector<VariableDeclaration>
Declaration::GetVariableDeclarations() const {
Assert(declSpecs->storageClass != SC_TYPEDEF);
@@ -536,18 +619,23 @@ Declaration::GetVariableDeclarations() const {
for (unsigned int i = 0; i < declarators.size(); ++i) {
Declarator *decl = declarators[i];
if (decl == NULL)
if (decl == NULL || decl->type == NULL) {
// Ignore earlier errors
Assert(m->errorCount > 0);
continue;
}
Symbol *sym = decl->GetSymbol();
sym->type = sym->type->ResolveUnboundVariability(Type::Varying);
if (dynamic_cast<const FunctionType *>(sym->type) == NULL) {
if (Type::Equal(decl->type, AtomicType::Void))
Error(decl->pos, "\"void\" type variable illegal in declaration.");
else if (CastType<FunctionType>(decl->type) == NULL) {
decl->type = decl->type->ResolveUnboundVariability(Variability::Varying);
Symbol *sym = new Symbol(decl->name, decl->pos, decl->type,
decl->storageClass);
m->symbolTable->AddVariable(sym);
vars.push_back(VariableDeclaration(sym, decl->initExpr));
}
}
return vars;
}
@@ -558,18 +646,19 @@ Declaration::DeclareFunctions() {
for (unsigned int i = 0; i < declarators.size(); ++i) {
Declarator *decl = declarators[i];
if (decl == NULL)
if (decl == NULL || decl->type == NULL) {
// Ignore earlier errors
Assert(m->errorCount > 0);
continue;
}
Symbol *sym = decl->GetSymbol();
sym->type = sym->type->ResolveUnboundVariability(Type::Varying);
if (dynamic_cast<const FunctionType *>(sym->type) == NULL)
const FunctionType *ftype = CastType<FunctionType>(decl->type);
if (ftype == NULL)
continue;
bool isInline = (declSpecs->typeQualifiers & TYPEQUAL_INLINE);
m->AddFunctionDeclaration(sym, isInline);
m->AddFunctionDeclaration(decl->name, ftype, decl->storageClass,
isInline, decl->pos);
}
}
@@ -583,13 +672,14 @@ Declaration::Print(int indent) const {
declarators[i]->Print(indent+4);
}
///////////////////////////////////////////////////////////////////////////
void
GetStructTypesNamesPositions(const std::vector<StructDeclaration *> &sd,
std::vector<const Type *> *elementTypes,
std::vector<std::string> *elementNames,
std::vector<SourcePos> *elementPositions) {
llvm::SmallVector<const Type *, 8> *elementTypes,
llvm::SmallVector<std::string, 8> *elementNames,
llvm::SmallVector<SourcePos, 8> *elementPositions) {
std::set<std::string> seenNames;
for (unsigned int i = 0; i < sd.size(); ++i) {
const Type *type = sd[i]->type;
@@ -599,35 +689,41 @@ GetStructTypesNamesPositions(const std::vector<StructDeclaration *> &sd,
// FIXME: making this fake little DeclSpecs here is really
// disgusting
DeclSpecs ds(type);
if (type->IsUniformType())
ds.typeQualifiers |= TYPEQUAL_UNIFORM;
else if (type->IsVaryingType())
ds.typeQualifiers |= TYPEQUAL_VARYING;
if (Type::Equal(type, AtomicType::Void) == false) {
if (type->IsUniformType())
ds.typeQualifiers |= TYPEQUAL_UNIFORM;
else if (type->IsVaryingType())
ds.typeQualifiers |= TYPEQUAL_VARYING;
else if (type->GetSOAWidth() != 0)
ds.soaWidth = type->GetSOAWidth();
// FIXME: ds.vectorSize?
}
for (unsigned int j = 0; j < sd[i]->declarators->size(); ++j) {
Declarator *d = (*sd[i]->declarators)[j];
d->InitFromDeclSpecs(&ds);
Symbol *sym = d->GetSymbol();
if (Type::Equal(d->type, AtomicType::Void))
Error(d->pos, "\"void\" type illegal for struct member.");
const ArrayType *arrayType =
dynamic_cast<const ArrayType *>(sym->type);
if (arrayType != NULL && arrayType->GetElementCount() == 0) {
Error(d->pos, "Unsized arrays aren't allowed in struct "
"definitions.");
elementTypes->push_back(NULL);
}
else
elementTypes->push_back(sym->type);
elementTypes->push_back(d->type);
if (seenNames.find(sym->name) != seenNames.end())
if (seenNames.find(d->name) != seenNames.end())
Error(d->pos, "Struct member \"%s\" has same name as a "
"previously-declared member.", sym->name.c_str());
"previously-declared member.", d->name.c_str());
else
seenNames.insert(sym->name);
seenNames.insert(d->name);
elementNames->push_back(sym->name);
elementPositions->push_back(sym->pos);
elementNames->push_back(d->name);
elementPositions->push_back(d->pos);
}
}
for (int i = 0; i < (int)elementTypes->size() - 1; ++i) {
const ArrayType *arrayType = CastType<ArrayType>((*elementTypes)[i]);
if (arrayType != NULL && arrayType->GetElementCount() == 0)
Error((*elementPositions)[i], "Unsized arrays aren't allowed except "
"for the last member in a struct definition.");
}
}

decl.h

@@ -1,5 +1,5 @@
/*
Copyright (c) 2010-2011, Intel Corporation
Copyright (c) 2010-2012, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -28,7 +28,7 @@
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/** @file decl.h
@@ -47,30 +47,21 @@
variables--here, that the declaration has the 'static' and 'uniform'
qualifiers, and that its basic type is 'int'. Then for each variable
declaration, the Declaration class holds an instance of a Declarator,
which in turn records the per-variable information like the symbol
name, array size (if any), initializer expression, etc.
which in turn records the per-variable information like the name, array
size (if any), initializer expression, etc.
*/
#ifndef ISPC_DECL_H
#define ISPC_DECL_H
#include "ispc.h"
#include <llvm/ADT/SmallVector.h>
struct VariableDeclaration;
class Declaration;
class Declarator;
enum StorageClass {
SC_NONE,
SC_EXTERN,
SC_EXPORT,
SC_STATIC,
SC_TYPEDEF,
SC_EXTERN_C
};
/* Multiple qualifiers can be provided with types in declarations;
therefore, they are set up so that they can be ANDed together into an
int. */
@@ -82,6 +73,8 @@ enum StorageClass {
#define TYPEQUAL_SIGNED (1<<4)
#define TYPEQUAL_UNSIGNED (1<<5)
#define TYPEQUAL_INLINE (1<<6)
#define TYPEQUAL_EXPORT (1<<7)
#define TYPEQUAL_UNMASKED (1<<8)
/** @brief Representation of the declaration specifiers in a declaration.
@@ -90,7 +83,8 @@ enum StorageClass {
*/
class DeclSpecs {
public:
DeclSpecs(const Type *t = NULL, StorageClass sc = SC_NONE, int tq = TYPEQUAL_NONE);
DeclSpecs(const Type *t = NULL, StorageClass sc = SC_NONE,
int tq = TYPEQUAL_NONE);
void Print() const;
@@ -117,6 +111,8 @@ public:
SOA width specified. Otherwise this is zero.
*/
int soaWidth;
std::vector<std::pair<std::string, SourcePos> > declSpecList;
};
@@ -128,7 +124,7 @@ enum DeclaratorKind {
DK_FUNCTION
};
/** @brief Representation of the declaration of a single variable.
/** @brief Representation of the declaration of a single variable.
In conjunction with an instance of the DeclSpecs, this gives us
everything we need for a full variable declaration.
@@ -138,25 +134,11 @@ public:
Declarator(DeclaratorKind dk, SourcePos p);
/** Once a DeclSpecs instance is available, this method completes the
initialization of the Symbol, setting its Type accordingly.
initialization of the type member.
*/
void InitFromDeclSpecs(DeclSpecs *ds);
/** Get the actual type of the combination of Declarator and the given
DeclSpecs. If an explicit base type is provided, the declarator is
applied to that type; otherwise the base type from the DeclSpecs is
used. */
const Type *GetType(DeclSpecs *ds) const;
const Type *GetType(const Type *base, DeclSpecs *ds) const;
/** Returns the symbol corresponding to the function declared by this
declarator and symbols for its arguments in *args. */
Symbol *GetFunctionInfo(DeclSpecs *ds, std::vector<Symbol *> *args);
Symbol *GetSymbolForFunctionParameter(int paramNum) const;
/** Returns the symbol associated with the declarator. */
Symbol *GetSymbol() const;
void InitFromType(const Type *base, DeclSpecs *ds);
void Print(int indent) const;
@@ -177,18 +159,24 @@ public:
/** Type qualifiers provided with the declarator. */
int typeQualifiers;
StorageClass storageClass;
/** For array declarators, this gives the declared size of the array.
Unsized arrays have arraySize == 0. */
Unsized arrays have arraySize == 0. */
int arraySize;
/** Symbol associated with the declarator. */
Symbol *sym;
/** Name associated with the declarator. */
std::string name;
/** Initialization expression for the variable. May be NULL. */
Expr *initExpr;
/** Type of the declarator. This is NULL until InitFromDeclSpecs() or
InitFromType() is called. */
const Type *type;
/** For function declarations, this holds the Declaration *s for the
funciton's parameters. */
function's parameters. */
std::vector<Declaration *> functionParams;
};
@@ -233,8 +221,8 @@ struct StructDeclaration {
/** Given a set of StructDeclaration instances, this returns the types of
the elements of the corresponding struct and their names. */
extern void GetStructTypesNamesPositions(const std::vector<StructDeclaration *> &sd,
std::vector<const Type *> *elementTypes,
std::vector<std::string> *elementNames,
std::vector<SourcePos> *elementPositions);
llvm::SmallVector<const Type *, 8> *elementTypes,
llvm::SmallVector<std::string, 8> *elementNames,
llvm::SmallVector<SourcePos, 8> *elementPositions);
#endif // ISPC_DECL_H


@@ -1,3 +1,377 @@
=== v1.5.0 === (27 September 2013)
A major new version of ISPC with several new targets and important bug fixes.
Here's a list of the most important changes, if you are using the pre-built
binaries (which are based on a patched version of LLVM 3.3):
* The naming of targets was changed to explicitly include the data type width
and the number of program instances in the gang. For example, avx2-i32x8 is
an AVX2 target, which uses 32 bit types as a base and has 8 program instances
in a gang. The old naming scheme is still supported, but deprecated.
* New SSE4 targets for calculations based on 8 bit and 16 bit data types:
sse4-i8x16 and sse4-i16x8.
* New AVX1 target for calculations based on 64 bit data types: avx1-i64x4.
* SVML support was extended and improved.
* The behavior of the -g switch was changed to not affect the optimization
level.
* The ISPC debug infrastructure was redesigned. See --help-dev for more info
on the new --debug-phase=<value> and --off-phase=<value> switches.
* Fixed an auto-dispatch bug which caused AVX code to execute when the OS
doesn't support AVX (but the hardware does).
* Fixed a bug that discarded the uniform/varying keyword in typedefs.
* Several performance regressions were fixed.
If you are building ISPC yourself, then the following changes are also
available to you:
* --cpu=slm for targeting Intel Atom codename Silvermont (if LLVM 3.4 is used).
* ARM NEON targets are available (if enabled in the build system).
* --debug-ir=<value> is available to generate debug information based on LLVM
IR (if LLVM 3.4 is used). In the debugger you'll see LLVM IR instead of source
code.
* A redesigned and improved test and configuration management system is
available to facilitate the process of building LLVM and testing the ISPC
compiler.
Standard library changes/fixes:
* The __pause() function was removed from the standard library.
* Fixed reduce_[min|max]_[float|double] intrinsics, which were producing
incorrect code under some conditions.
Language changes:
* By default, a floating point constant without a suffix is a single precision
constant (32 bit). A new suffix "d" was introduced to allow double precision
constants (64 bit). Please refer to tests/double-consts.ispc for syntax
examples.
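A minimal sketch of the new syntax (hypothetical snippet; see
tests/double-consts.ispc in the repository for the authoritative examples):

    uniform float  f = 3.14;     // no suffix: single precision (32 bit)
    uniform double d1 = 3.14d;   // "d" suffix: double precision (64 bit)
    uniform double d2 = 1.5d-3;  // "d" also carries an exponent: 1.5e-3 as a double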
=== v1.4.4 === (19 July 2013)
A minor version update with several stability fixes requested by customers.
=== v1.4.3 === (25 June 2013)
A minor version update with several stability improvements:
* Two bugs were fixed (including a bug in LLVM) to improve stability on 32 bit
platforms.
* A bug affecting several examples was fixed.
* The --instrument switch was fixed.
All tests and examples now properly compile and execute on native targets on
Unix platforms (Linux and MacOS).
=== v1.4.2 === (11 June 2013)
A minor version update with a few important changes:
* Stability fix for the AVX2 target (Haswell): a problem with gather
instructions. The fix was released in LLVM 3.4; if you build with LLVM 3.2 or
3.3, it's available in our repository
(llvm_patches/r183327-AVX2-GATHER.patch) and needs to be applied manually.
* Stability fix for widespread issue on Win32 platform (#503).
* Performance improvements for Xeon Phi related to mask representation.
Also, LLVM 3.3 has been released and is now the recommended version for building ISPC.
Precompiled binaries are also built with LLVM 3.3.
=== v1.4.1 === (28 May 2013)
A major new version of ispc has been released with stability and performance
improvements on all supported platforms (Windows, Linux and MacOS).
This version supports LLVM 3.1, 3.2, 3.3 and 3.4. The released binaries are built with 3.2.
New compiler features:
* ISPC memory allocation by default returns memory aligned to the platform's
natural vector register alignment. Alignment can also be managed via
--force-alignment=<value>.
Important bug fixes/changes:
* ISPC was fixed to be fully functional when built with GCC 4.7.
* Major cleanup of build and test scripts on Windows.
* Gather/scatter performance improvements on Xeon Phi.
* FMA instructions are enabled for the AVX2 instruction set.
* Support for the RDRAND instruction, when available, via the library
function rdrand() (Ivy Bridge).
The release also contains numerous bug fixes and minor improvements.
=== v1.3.0 === (29 June 2012)
This is a major new release of ispc, with support for more compilation
targets and a number of additions to the language. As usual, the quality
of generated code has also been improved in a number of cases and a number
of small bugs have been fixed.
New targets:
* This release provides "beta" support for compiling to Intel® Xeon
Phi™ processor, code named Knights Corner, the first processor in
the Intel® Many Integrated Core Architecture. See
http://ispc.github.com/ispc.html#compiling-for-the-intel-xeon-phi-architecture
for more details on this support.
* This release also has an "avx1.1" target, which provides support for the
new instructions in the Intel Ivy Bridge microarchitecture.
New language features:
* The foreach_active statement allows iteration over the active program
instances in a gang. (See
http://ispc.github.com/ispc.html#iteration-over-active-program-instances-foreach-active)
* foreach_unique allows iterating over subsets of program instances in a
gang that share the same value of a variable. (See
http://ispc.github.com/ispc.html#iteration-over-unique-elements-foreach-unique)
* An "unmasked" function qualifier and statement in the language allow
re-activating execution of all program instances in a gang; a combined
sketch of these three features follows this list. (See
http://ispc.github.com/ispc.html#re-establishing-the-execution-mask.)
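A minimal combined sketch (hypothetical code; the varying variable x is an
assumption for illustration, not from the release notes):

    varying int x = programIndex % 3;
    uniform int sum = 0;
    foreach_active (i) {
        sum += extract(x, i);    // runs once per active instance; i is its programIndex
    }
    foreach_unique (val in x) {
        // instances where x == val are active together; val is uniform here
        print("unique value: %\n", val);
    }
    unmasked {
        // statements here execute with all program instances enabled
    }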
Standard library updates:
* The seed_rng() function has been modified to take a "varying" seed value
when a varying RNGState is being initialized.
* An isnan() function has been added, to check for floating-point "not a
number" values.
* The float_to_srgb8() routine does high performance conversion of
floating-point color values to SRGB8 format.
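A short usage sketch of these library routines (hypothetical code; exact
signatures are documented in the ispc standard library documentation):

    RNGState state;
    seed_rng(&state, (unsigned int)programIndex);   // varying seed, per the note above
    float v = frandom(&state);
    if (isnan(v))                                   // new isnan() routine
        v = 0;
    int srgb = float_to_srgb8(v);                   // fast float -> SRGB8 conversion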
Other changes:
* A number of bugfixes have been made for compiler crashes with malformed
programs.
* Floating-point comparisons are now "unordered", so that any comparison
where one of the operands is a "not a number" value returns false. (This
matches standard IEEE floating-point behavior.)
* The code generated for 'break' statements in "varying" loops has been
improved for some common cases.
* Compile time and compiler memory use have both been improved,
particularly for large input programs.
* A number of bugs have been fixed in the debugging information generated
by the compiler when the "-g" command-line flag is used.
=== v1.2.2 === (20 April 2012)
This release includes a number of small additions to functionality and a
number of bugfixes. New functionality includes:
* It's now possible to forward declare structures as in C/C++: "struct
Foo;". After such a declaration, structs with pointers to "Foo" and
functions that take pointers or references to Foo structs can be declared
without the entire definition of Foo being available (see the sketch after
this list).
* New built-in types size_t, ptrdiff_t, and [u]intptr_t are now available,
corresponding to the equivalent types in C.
* The standard library now provides atomic_swap*() and
atomic_compare_exchange*() functions for void * types.
* The C++ backend has seen a number of improvements to the quality and
readability of generated code.
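For example, forward declarations now work much as in C/C++ (a minimal
hypothetical sketch):

    struct Foo;                   // forward declaration
    struct Bar { Foo *fptr; };    // pointers to the incomplete type are legal
    void use(Foo &f);             // so are references in function declarations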
A number of bugs have been fixed in this release as well. The most
significant are:
* Fixed a bug where nested loops could cause a compiler crash in some
circumstances (issues #240 and #229).
* Gathers could access invalid memory (and cause the program to crash) in
some circumstances (#235).
* References to temporary values are now handled properly when passed to a
function that takes a reference-typed parameter.
* A case where incorrect code could be generated for compile-time-constant
initializers has been fixed (#234).
=== v1.2.1 === (6 April 2012)
This release contains only minor new functionality and is mostly for many
small bugfixes and improvements to error handling and error reporting.
The new functionality that is present is:
* Significantly more efficient versions of the float / half conversion
routines are now available in the standard library, thanks to Fabian
Giesen.
* The last member of a struct can now be a zero-length array; this allows
the trick of dynamically allocating enough storage for the struct and
some number of array elements at the end of it (sketched below).
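A sketch of the trick (FloatTable is a hypothetical name):

  struct FloatTable {
      uniform int count;
      uniform float vals[0];  // zero-length array as the last member
  };
  // The allocation provides sizeof(FloatTable) bytes plus storage for
  // "count" floats; vals[] then indexes into that extra storage.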
Significant bugs fixed include:
* Issue #205: When a target ISA isn't specified, the host system's
capabilities are now used to choose a target whose generated code the
host will be able to run.
* Issues #215 and #217: Don't allocate storage for global variables that
are declared "extern".
* Issue #197: Allow NULL as a default argument value in a function
declaration.
* Issue #223: Fix bugs where taking the address of a function wouldn't work
as expected.
* Issue #224: When there are overloaded variants of a function that take
both reference and const reference parameters, give the non-const
reference preference when matching values of that underlying type.
* Issue #225: An error is issued when a varying lvalue is assigned to a
reference type (rather than crashing).
* Issue #193: Permit conversions from array types to void *, not just the
pointer type of the underlying array element.
* Issue #199: Still evaluate expressions that are cast to (void).
The documentation has also been improved, with FAQs added to clarify some
aspects of the ispc pointer model.
=== v1.2.0 === (20 March 2012)
This is a major new release of ispc, with a number of significant
improvements to functionality, performance, and compiler robustness. It
does, however, include three small changes to language syntax and semantics
that may require changes to existing programs:
* Syntax for the "launch" keyword has been cleaned up; it's now no longer
necessary to bracket the launched function call with angle brackets.
(In other words, now use "launch foo();", rather than "launch < foo() >;".)
* When using pointers, the pointed-to data type is now "uniform" by
default. Use the varying keyword to specify varying pointed-to types when
needed. (i.e. "float *ptr" is a varying pointer to uniform float data,
whereas previously it was a varying pointer to varying float values.)
Use "varying float *" to specify a varying pointer to varying float data,
and so forth.
* The details of "uniform" and "varying" and how they interact with struct
types have been cleaned up. Now, when a struct type is declared, if the
struct elements don't have explicit "uniform" or "varying" qualifiers,
they are said to have "unbound" variability. When a struct type is
instantiated, any unbound variability elements inherit the variability of
the parent struct type. See http://ispc.github.com/ispc.html#struct-types
for more details.
ispc has a new language feature that makes it much easier to use the
efficient "(array of) structure of arrays" (AoSoA, or SoA) memory layout of
data. A new "soa<n>" qualifier can be applied to structure types to
specify an n-wide SoA version of the corresponding type. Array indexing
and pointer operations with arrays of SoA types automatically handle the
two-stage indexing calculation needed to access the data. See
http://ispc.github.com/ispc.html#structure-of-array-types for more details.
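A minimal sketch of the syntax (Point and scalePoints() are hypothetical
names):

  struct Point { float x, y, z; };  // elements have "unbound" variability

  soa<4> Point pts[16];             // 16 Points in 4-wide SOA layout

  void scalePoints() {
      foreach (i = 0 ... 16)
          pts[i].x *= 2;  // the compiler does the two-stage index math
  }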
For more efficient access of data that is still in "array of structures"
(AoS) format, ispc has a new "memory coalescing" optimization that
automatically detects series of strided loads and/or gathers that can be
transformed into a more efficient set of vector loads and shuffles. A
diagnostic is emitted when this optimization is successfully applied.
Smaller changes in this release:
* The standard library now provides memcpy(), memmove() and memset()
functions, as well as single-precision asin() and acos() functions.
* -I can now be specified on the command-line to specify a search path for
#include files.
* A number of improvements have been made to error reporting from the
parser, and a number of cases where malformed programs could cause the
compiler to crash have been fixed.
* A number of small improvements to the quality and performance of generated
code have been made, including finding more cases where 32-bit addressing
calculations can be safely done on 64-bit systems and generating better
code for initializer expressions.
=== v1.1.4 === (4 February 2012)
There are two major bugfixes for Windows in this release. First, a number
of failures in AVX code generation on Windows have been fixed; AVX on
Windows now has no known issues. Second, a longstanding bug in parsing 64-bit
integer constants on Windows has been fixed.
This release features a new experimental scalar target, contributed by Gabe
Weisz <gweisz@cs.cmu.edu>. This target ("--target=generic-1") compiles
gangs of single program instances (i.e. programCount == 1); it can be
useful for debugging ispc programs.
The compiler now supports dynamic memory allocation in ispc programs (with
"new" and "delete" operators based on C++). See
http://ispc.github.com/ispc.html#dynamic-memory-allocation in the
documentation for more information.
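A sketch of the syntax (zeroBuffer() is a hypothetical name; see the
documentation linked above for the full rules):

  void zeroBuffer(uniform int count) {
      // "uniform new" performs a single allocation on behalf of the gang
      uniform float * uniform buf = uniform new uniform float[count];
      foreach (i = 0 ... count)
          buf[i] = 0;
      delete buf;
  }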
ispc now performs "short circuit" evaluation of the || and && logical
operators and the ? : selection operator. (This represents the correction
of a major incompatibility with C.) Code like "(index < arraySize &&
array[index] == 1)" thus now executes as in C, where "array[index]" won't
be evaluated unless "index" is less than "arraySize".
The standard library now provides "local" atomic operations, which are
atomic across the gang of program instances (but not across other gangs or
other hardware threads). See the updated documentation on atomics for more
information:
http://ispc.github.com/ispc.html#atomic-operations-and-memory-fences.
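For example (a sketch; counter and tally() are hypothetical names):

  uniform int counter = 0;

  void tally() {
      // Atomic only with respect to the program instances in this gang;
      // each active instance's increment is applied without races.
      int before = atomic_add_local(&counter, 1);
  }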
The standard library now offers a clock() function, which returns a uniform
int64 value that counts processor cycles; it can be used for
fine-resolution timing measurements.
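A sketch of its use for timing (timed() is a hypothetical name):

  void timed() {
      uniform int64 start = clock();
      // ... code to be measured ...
      uniform int64 cycles = clock() - start;
      print("elapsed: % cycles\n", cycles);
  }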
Finally (of limited immediate interest): ispc now supports the forthcoming
AVX2 instruction set, due to arrive with Haswell-generation CPUs. All tests
and examples
compile and execute correctly with AVX2. (Thanks specifically to Craig
Topper and Nadav Rotem for work on AVX2 support in LLVM, which made this
possible.)
=== v1.1.3 === (20 January 2012)
With this release, the language now supports "switch" statements, with the
same semantics and syntax as in C.
This release includes fixes for two important performance-related issues:
the quality of code generated for "foreach" statements has been
substantially improved (https://github.com/ispc/ispc/issues/151), and a
performance regression with code for "gathers" introduced in v1.1.2 has
been fixed.
A number of other small bugs were fixed in this release as well, including
one where invalid memory would sometimes be incorrectly accessed
(https://github.com/ispc/ispc/issues/160).
Thanks to Jean-Luc Duprat for a number of patches that improve support for
building on various platforms, and to Pierre-Antoine Lacaze for patches so
that ispc builds under MinGW.
=== v1.1.2 === (9 January 2012)
The major new feature in this release is support for "generic" C++

View File

@@ -1,11 +1,16 @@
#!/bin/bash
rst2html=rst2html.py
for i in ispc perfguide faq; do
rst2html.py --template=template.txt --link-stylesheet \
$rst2html --template=template.txt --link-stylesheet \
--stylesheet-path=css/style.css $i.rst > $i.html
done
rst2html.py --template=template-perf.txt --link-stylesheet \
$rst2html --template=template-news.txt --link-stylesheet \
--stylesheet-path=css/style.css news.rst > news.html
$rst2html --template=template-perf.txt --link-stylesheet \
--stylesheet-path=css/style.css perf.rst > perf.html
#rst2latex --section-numbering --documentclass=article --documentoptions=DIV=9,10pt,letterpaper ispc.txt > ispc.tex

View File

@@ -1,10 +1,10 @@
=============================================================
Intel® SPMD Program Compiler Frequently Asked Questions (FAQ)
=============================================================
=====================================
Frequently Asked Questions About ispc
=====================================
This document includes a number of frequently (and not frequently) asked
questions about ispc, the Intel® SPMD Program Compiler. The source to this
document is in the file ``docs/faq.txt`` in the ``ispc`` source
document is in the file ``docs/faq.rst`` in the ``ispc`` source
distribution.
* Understanding ispc's Output
@@ -14,11 +14,24 @@ distribution.
+ `Why are there multiple versions of exported ispc functions in the assembly output?`_
+ `How can I more easily see gathers and scatters in generated assembly?`_
* Running The Compiler
+ `Why is it required to use one of the "generic" targets with C++ output?`_
+ `Why won't the compiler generate an object file or assembly output with the "generic" targets?`_
* Language Details
+ `What is the difference between "int *foo" and "int foo[]"?`_
+ `Why are pointed-to types "uniform" by default?`_
+ `Why am I getting an error about assigning a varying lvalue to a reference type?`_
* Interoperability
+ `How can I supply an initial execution mask in the call from the application?`_
+ `How can I generate a single binary executable with support for multiple instruction sets?`_
+ `How can I determine at run-time which vector instruction set's instructions were selected to execute?`_
+ `Is it possible to inline ispc functions in C/C++ code?`_
+ `Why is it illegal to pass "varying" values from C/C++ to ispc functions?`_
* Programming Techniques
@@ -26,6 +39,8 @@ distribution.
+ `How can a gang of program instances generate variable amounts of output efficiently?`_
+ `Is it possible to use ispc for explicit vector programming?`_
+ `How can I debug my ispc programs using Valgrind?`_
+ `foreach statements generate more complex assembly than I'd expect; what's going on?`_
+ `How do I launch an individual task for each active program instance?`_
Understanding ispc's Output
===========================
@@ -212,6 +227,174 @@ easier to understand:
jmp ___pseudo_scatter_base_offsets32_32 ## TAILCALL
Running The Compiler
====================
Why is it required to use one of the "generic" targets with C++ output?
-----------------------------------------------------------------------
The C++ output option transforms the provided ``ispc`` program source into
C++ code where each basic operation in the program (addition, comparison,
etc.) is represented as a function call to an as-yet-undefined function,
chaining the results of these calls together to perform the required
computations. It is then expected that the user will provide the
implementation of these functions via a header file with ``inline``
functions defined for each of these functions and then use a C++ compiler
to generate a final object file. (Examples of these headers include
``examples/intrinsics/sse4.h`` and ``examples/intrinsics/knc.h`` in the
``ispc`` distribution.)
If a target other than one of the "generic" ones is used with C++ output,
then the compiler will transform certain operations into particular code
sequences that may not be desired for the actual final target; for example,
SSE targets that don't have hardware "gather" instructions will transform a
gather into a sequence of scalar load instructions. When this in turn is
transformed to C++ code, the fact that the loads were originally a gather
is lost, and the header file of function definitions wouldn't have a chance
to map the "gather" to a target-specific operation, as the ``knc.h`` header
does, for example. Thus, the "generic" targets exist to provide basic
targets of various vector widths, without imposing any limitations on the
final target's capabilities.
Why won't the compiler generate an object file or assembly output with the "generic" targets?
---------------------------------------------------------------------------------------------
As described in the above FAQ entry, when compiling to the "generic"
targets, ``ispc`` generates vector code for the source program that
transforms every basic operation in the program (addition, comparison,
etc.) into a separate function call.
While there is no fundamental reason that the compiler couldn't generate
target-specific object code with a function call to an undefined function
for each primitive operation, doing so wouldn't actually be useful in
practice--providing definitions of these functions in a separate object
file and actually performing function calls for each of them (versus
turning them into inline function calls) would be a highly inefficient way
to run the program.
Therefore, in the interests of encouraging the use of the system,
these types of output are disallowed.
Language Details
================
What is the difference between "int \*foo" and "int foo[]"?
-----------------------------------------------------------
In C and C++, declaring a function to take a parameter ``int *foo`` and
``int foo[]`` results in the same type for the parameter. Both are
pointers to integers. In ``ispc``, these are different types. The first
one is a varying pointer to a uniform integer value in memory, while the
second results in a uniform pointer to the start of an array of varying
integer values in memory.
To understand why the first is a varying pointer to a uniform integer,
first recall that types without explicit rate qualifiers (``uniform``,
``varying``, or ``soa<>``) are ``varying`` by default. Second, recall from
the `discussion of pointer types in the ispc User's Guide`_ that pointed-to
types without rate qualifiers are ``uniform`` by default. (This second
rule is discussed further below, in `Why are pointed-to types "uniform" by
default?`_.) The type of ``int *foo`` follows from these.
.. _discussion of pointer types in the ispc User's Guide: ispc.html#pointer-types
Conversely, in a function body, ``int foo[10]`` declares a 10-element
array of varying ``int`` values. Given that we'd certainly like to be able
to pass such an array to a function that takes an ``int []`` parameter, the
natural type for an ``int []`` parameter is a uniform pointer to varying
integer values.
In terms of compatibility with C/C++, it's unfortunate that this
distinction exists, though any other set of rules seems to introduce more
awkwardness than this one. (We're always interested to hear ideas for
improving these rules, though!)
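To summarize with a short sketch (the comments give the resulting
parameter types):
::

    void f(int *p);   // p: varying pointer to uniform int
    void g(int p[]);  // p: uniform pointer to varying int

    void h() {
        int a[10];    // a 10-element array of varying int
        g(a);         // ok: a decays to a uniform pointer to varying int
    }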
Why are pointed-to types "uniform" by default?
----------------------------------------------
In ``ispc``, types without rate qualifiers are "varying" by default, but
types pointed to by pointers without rate qualifiers are "uniform" by
default. Why this difference?
::
int foo; // no rate qualifier, "varying int".
uniform int *foo; // pointer type has no rate qualifier, pointed-to does.
// "varying pointer to uniform int".
int *foo; // neither pointer type nor pointed-to type ("int") have
// rate qualifiers. Pointer type is varying by default,
// pointed-to is uniform. "varying pointer to uniform int".
varying int *foo; // varying pointer to varying int
The first rule, that types without rate qualifiers are varying by default,
keeps the number of "uniform" and "varying" qualifiers in ``ispc`` programs
low. Most ``ispc`` programs use mostly "varying" variables, so this rule
allows most variables to be declared without rate qualifiers.
On a related note, this rule allows many C/C++ functions to be reused, with
little or no modification, to define equivalent functions in the SPMD
execution model that ``ispc`` provides:
::
// scalar add in C/C++, SPMD/vector add in ispc
int add(int a, int b) { return a + b; }
This motivation also explains why ``uniform int *foo`` represents a varying
pointer; having pointers be varying by default if they don't have rate
qualifiers similarly helps with porting code from C/C++ to ``ispc``.
The trickier issue is why pointed-to types are "uniform" by default. In our
experience, data in memory that is accessed via pointers is most often
uniform; this generally includes all data that has been allocated and
initialized by the C/C++ application code. In practice, "varying" types are
most often (but not exclusively) used for local data in ``ispc``
functions. Thus, making the pointed-to type uniform by default leads to
more concise code for the most common cases.
Why am I getting an error about assigning a varying lvalue to a reference type?
--------------------------------------------------------------------------------
Given code like the following:
::
uniform float a[...];
int index = ...;
float &r = a[index];
``ispc`` issues the error "Initializer for reference-type variable "r" must
have a uniform lvalue type." The underlying issue stems from how
references are represented in the code generated by ``ispc``. Recall that
``ispc`` supports both uniform and varying pointer types--a uniform pointer
points to the same location in memory for all program instances in the
gang, while a varying pointer allows each program instance to have its own
pointer value.
References are represented as a pointer in the code generated by ``ispc``,
though this is generally opaque to the user; in ``ispc``, they are
specifically uniform pointers. This design decision was made so that given
code like this:
::
extern void func(float &val);
float foo = ...;
func(foo);
Then the reference is handled efficiently as a single pointer, rather than
unnecessarily being turned into a gang's worth of pointers.
However, an implication of this decision is that it's not possible for
references to refer to completely different locations for each of the
program instances (hence the error that is issued). In cases where a
unique per-program-instance pointer is needed, a varying pointer should be
used instead of a reference.
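For example, a sketch of the workaround (the index value here is
hypothetical):
::

    uniform float a[64];
    int index = programIndex;  // per-instance index; one element each
    float * ptr = &a[index];   // varying pointer to uniform float
    *ptr += 1;                 // each instance updates its own element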
Interoperability
================
@@ -346,6 +529,92 @@ In a similar fashion, it's possible to find out at run-time the value of
export uniform int width() { return programCount; }
Is it possible to inline ispc functions in C/C++ code?
------------------------------------------------------
If you're willing to use the ``clang`` C/C++ compiler that's part of the
LLVM tool suite, then it is possible to inline ``ispc`` code with C/C++
(and conversely, to inline C/C++ calls in ``ispc``). Doing so can provide
performance advantages when calling out to short functions written in the
"other" language. Note that you don't need to use ``clang`` to compile all
of your C/C++ code, but only for the files where you want to be able to
inline. In order to do this, you must have a full installation of LLVM
version 3.0 or later, including the ``clang`` compiler.
The basic approach is to have the various compilers emit LLVM intermediate
representation (IR) code and to then use tools from LLVM to link together
the IR from the compilers and then re-optimize it, which gives the LLVM
optimizer the opportunity to do additional inlining and cross-function
optimizations. If you have source files ``foo.ispc`` and ``foo.cpp``,
first emit LLVM IR:
::
ispc --emit-llvm -o foo_ispc.bc foo.ispc
clang -O2 -c -emit-llvm -o foo_cpp.bc foo.cpp
Next, link the two IR files into a single file and run the LLVM optimizer
on the result:
::
llvm-link foo_ispc.bc foo_cpp.bc -o - | opt -O3 -o foo_opt.bc
And finally, generate a native object file:
::
llc -filetype=obj foo_opt.bc -o foo.o
This file can in turn be linked in with the rest of your object files when
linking your application.
(Note that if you're using the AVX instruction set, you must provide the
``-mattr=+avx`` flag to ``llc``.)
Why is it illegal to pass "varying" values from C/C++ to ispc functions?
------------------------------------------------------------------------
If any of the types in the parameter list of an exported function is
"varying" (including, recursively, members of structure types, and so
forth), then ``ispc`` will issue an error and refuse to compile the
function:
::
% echo "export int add(int x) { return ++x; }" | ispc
<stdin>:1:12: Error: Illegal to return a "varying" type from exported function "add"
<stdin>:1:20: Error: Varying parameter "x" is illegal in an exported function.
While there's no fundamental reason why this isn't possible, recall the
definition of "varying" variables: they have one value for each program
instance in the gang. As such, the number of values and amount of storage
required to represent a varying variable depends on the gang size
(i.e. ``programCount``), which can have different values depending on the
compilation target.
``ispc`` therefore prohibits passing "varying" values between the
application and the ``ispc`` program; this prevents the application-side
code from depending on a particular gang size and encourages portability
across gang sizes (a generally desirable programming practice).
For cases where the size of data is actually fixed from the application
side, the value can be passed via a pointer to a short ``uniform`` array,
as follows:
::
export void add4(uniform int ptr[4]) {
foreach (i = 0 ... 4)
ptr[i]++;
}
On the 4-wide SSE instruction set, this compiles to a single vector add
instruction (and associated move instructions), while it still also
efficiently computes the correct result on 8-wide AVX targets.
Programming Techniques
======================
@@ -480,3 +749,131 @@ you can use ``--target=sse4`` when compiling to run with ``valgrind``.
Note that ``valgrind`` does not yet support programs that use the AVX
instruction set.
foreach statements generate more complex assembly than I'd expect; what's going on?
-----------------------------------------------------------------------------------
Given a simple ``foreach`` loop like the following:
::
void foo(uniform float a[], uniform int count) {
foreach (i = 0 ... count)
a[i] *= 2;
}
the ``ispc`` compiler generates approximately 40 instructions--why isn't
the generated code simpler?
There are two main components to the code: one handles
``programCount``-sized chunks of elements of the array, and the other
handles any excess elements at the end of the array that don't completely
fill a gang. The code for the main loop is essentially what one would
expect: a vector of values is loaded from the array, the multiply is done,
and the result is stored.
::
LBB0_2: ## %foreach_full_body
movslq %edx, %rdx
vmovups (%rdi,%rdx), %ymm1
vmulps %ymm0, %ymm1, %ymm1
vmovups %ymm1, (%rdi,%rdx)
addl $32, %edx
addl $8, %eax
cmpl %ecx, %eax
jl LBB0_2
Then, there is a sequence of instructions that handles any additional
elements at the end of the array. (These instructions don't execute if
there aren't any left-over values to process, but they do lengthen the
amount of generated code.)
::
## BB#4: ## %partial_inner_only
vmovd %eax, %xmm0
vinsertf128 $1, %xmm0, %ymm0, %ymm0
vpermilps $0, %ymm0, %ymm0 ## ymm0 = ymm0[0,0,0,0,4,4,4,4]
vextractf128 $1, %ymm0, %xmm3
vmovd %esi, %xmm2
vmovaps LCPI0_1(%rip), %ymm1
vextractf128 $1, %ymm1, %xmm4
vpaddd %xmm4, %xmm3, %xmm3
# ....
vmulps LCPI0_0(%rip), %ymm1, %ymm1
vmaskmovps %ymm1, %ymm0, (%rdi,%rax)
If you know that the number of elements to be processed will always be an
exact multiple of the gang size (8, 16, etc.), then adding a simple assignment to
``count`` like the one below gives the compiler enough information to be
able to eliminate the code for the additional array elements.
::
void foo(uniform float a[], uniform int count) {
// This assignment doesn't change the value of count
// if it's a multiple of 16, but it gives the compiler
// insight into this fact, allowing for simpler code to
// be generated for the foreach loop.
count = (count & ~(16-1));
foreach (i = 0 ... count)
a[i] *= 2;
}
With this new version of ``foo()``, only the code for the first loop above
is generated.
How do I launch an individual task for each active program instance?
--------------------------------------------------------------------
Recall from the `discussion of "launch" in the ispc User's Guide`_ that a
``launch`` statement launches a single task corresponding to a single gang
of executing program instances, where the indices of the active program
instances are the same as were active when the ``launch`` statement
executed.
.. _discussion of "launch" in the ispc User's Guide: ispc.html#task-parallelism-launch-and-sync-statements
In some situations, it's desirable to be able to launch an individual task
for each executing program instance. For example, we might be performing
an iterative computation where a subset of the program instances determine
that an item they are responsible for requires additional processing.
::
bool itemNeedsMoreProcessing(int);
int itemNum = ...;
if (itemNeedsMoreProcessing(itemNum)) {
// do additional work
}
For performance reasons, it may be desirable to apply an entire gang's
worth of computation to each item that needs additional processing;
there may be available parallelism in this computation such that we'd like
to process each of the items with SPMD computation.
In this case, the ``foreach_active`` and ``unmasked`` constructs can be
applied together to accomplish this goal.
::
// do additional work
task void doWork(uniform int index);
foreach_active (index) {
unmasked {
launch doWork(extract(itemNum, index));
}
}
Recall that the body of the ``foreach_active`` loop runs once for each
active program instance, with each active program instance's
``programIndex`` value available in ``index`` in the above. In the loop,
we can re-establish an "all on" execution mask, enabling execution in all
of the program instances in the gang, such that execution in ``doWork()``
starts with all instances running. (Alternatively, the ``unmasked`` block
could be in the definition of ``doWork()``.)

File diff suppressed because it is too large

docs/news.rst Normal file
View File

@@ -0,0 +1,113 @@
=========
ispc News
=========
ispc 1.5.0 is Released
----------------------
A major update of ``ispc`` has been released, with several new targets
available and a number of performance and stability fixes. The released
binaries are built with a patched version of LLVM 3.3. Please refer to the
Release Notes for the complete set of changes.
ispc 1.4.4 is Released
----------------------
A minor update of ``ispc`` has been released with several stability
improvements. The released binaries are built with a patched version of
LLVM 3.3. Starting with this release, we also distribute 32-bit Linux
binaries.
ispc 1.4.3 is Released
----------------------
A minor update of ``ispc`` has been released with several stability
improvements. All tests and examples now properly compile and execute on
native targets on Unix platforms (Linux and MacOS).
The released binaries are built with a patched version of LLVM 3.3.
ispc 1.4.2 is Released
----------------------
A minor update of ``ispc`` has been released with a stability fix for AVX2
(Haswell), a fix for the Win32 platform, and performance improvements on
Xeon Phi. As usual, it's available on all supported platforms (Windows,
Linux and MacOS).
This version supports LLVM 3.1, 3.2, 3.3 and 3.4, but we now recommend
avoiding 3.1, as it's known to contain a number of stability problems, and
we plan to deprecate its support soon.
The released binaries are built with LLVM 3.3.
ispc 1.4.1 is Released
----------------------
A major new version of ``ispc`` has been released with stability and
performance improvements on all supported platforms (Windows, Linux and MacOS).
This version supports LLVM 3.1, 3.2, 3.3 and 3.4. The released binaries
are built with LLVM 3.2.
ispc 1.3.0 is Released
----------------------
A major new version of ``ispc`` has been released. In addition to a number
of new language features, this release notably features initial support for
compiling to the Intel Xeon Phi (Many Integrated Core) architecture.
ispc 1.2.1 is Released
----------------------
This is a bugfix release, fixing approximately 20 bugs in the system and
improving error handling and error reporting. New functionality includes
very efficient float/half conversion routines thanks to Fabian
Giesen. See the `1.2.1 release notes`_ for details.
.. _1.2.1 release notes: https://github.com/ispc/ispc/tree/master/docs/ReleaseNotes.txt
ispc 1.2.0 is Released
-----------------------
A new major release was posted on March 20, 2012. This release includes
significant new functionality for cleanly handling "structure of arrays"
(SoA) data layout and a new model for how uniform and varying are handled
with structure types.
Paper on ispc To Appear in InPar 2012
-------------------------------------
A technical paper on ``ispc``, `ispc: A SPMD Compiler for High-Performance
CPU Programming`_, by Matt Pharr and William R. Mark, has been accepted to
the `InPar 2012`_ conference. This paper describes a number of the design
features and key characteristics of the ``ispc`` implementation.
(© 2012 IEEE. Personal use of this material is permitted. Permission from
IEEE must be obtained for all other uses, in any current or future media,
including reprinting/republishing this material for advertising or
promotional purposes, creating new collective works, for resale or
redistribution to servers or lists, or reuse of any copyrighted component
of this work in other works.)
.. _ispc\: A SPMD Compiler for High-Performance CPU Programming: https://github.com/downloads/ispc/ispc/ispc_inpar_2012.pdf
.. _InPar 2012: http://innovativeparallel.org/
ispc 1.1.4 is Released
----------------------
On February 4, 2012, the 1.1.4 release of ``ispc`` was posted; new features
include ``new`` and ``delete`` for dynamic memory allocation in ``ispc``
programs, "local" atomic operations in the standard library, and a new
scalar compilation target. See the `1.1.4 release notes`_ for details.
.. _1.1.4 release notes: https://github.com/ispc/ispc/tree/master/docs/ReleaseNotes.txt
ispc 1.1.3 is Released
----------------------
With this release, the language now supports "switch" statements, with the same semantics and syntax as in C.
This release includes fixes for two important performance-related issues:
the quality of code generated for "foreach" statements has been
substantially improved, and a performance regression with code for
"gathers" introduced in v1.1.2 has been fixed.
Thanks to Jean-Luc Duprat for a number of patches that improve support for
building on various platforms, and to Pierre-Antoine Lacaze for patches so
that ispc builds under MinGW.

View File

@@ -13,6 +13,7 @@ the most out of ``ispc`` in practice.
+ `Improving Control Flow Coherence With "foreach_tiled"`_
+ `Using Coherent Control Flow Constructs`_
+ `Use "uniform" Whenever Appropriate`_
+ `Use "Structure of Arrays" Layout When Possible`_
* `Tips and Techniques`_
@@ -20,6 +21,7 @@ the most out of ``ispc`` in practice.
+ `Avoid 64-bit Addressing Calculations When Possible`_
+ `Avoid Computation With 8 and 16-bit Integer Types`_
+ `Implementing Reductions Efficiently`_
+ `Using "foreach_active" Effectively`_
+ `Using Low-level Vector Tricks`_
+ `The "Fast math" Option`_
+ `"inline" Aggressively`_
@@ -247,6 +249,76 @@ but it's always best to provide the compiler with as much help as possible
to understand the actual form of your computation.
Use "Structure of Arrays" Layout When Possible
----------------------------------------------
In general, memory access performance (for both reads and writes) is best
when the running program instances access a contiguous region of memory; in
this case efficient vector load and store instructions can often be used
rather than gathers and scatters. As an example of this issue, consider an
array of a simple point datatype laid out and accessed in conventional
"array of structures" (AOS) layout:
::
struct Point { float x, y, z; };
uniform Point pts[...];
float v = pts[programIndex].x;
In the above code, the access to ``pts[programIndex].x`` touches
non-sequential memory locations, since the ``y`` and ``z`` values lie
between the desired ``x`` values in memory. A "gather" is required to load
the value of ``v``, with a corresponding decrease in performance.
If ``Point`` were instead defined as a "structure of arrays" (SOA) type,
the access could be much more efficient:
::
struct Point8 { float x[8], y[8], z[8]; };
uniform Point8 pts8[...];
int majorIndex = programIndex / 8;
int minorIndex = programIndex % 8;
float v = pts8[majorIndex].x[minorIndex];
In this case, each ``Point8`` has 8 ``x`` values contiguous in memory
before 8 ``y`` values and then 8 ``z`` values. If the gang size is 8 or
less, the access for ``v`` will have the same value of ``majorIndex`` for
all program instances and will access consecutive elements of the ``x[8]``
array with a vector load. (For larger gang sizes, two 8-wide vector loads
would be issued, which is also quite efficient.)
However, the syntax in the above code is messy; accessing SOA data in this
fashion is much less elegant than the corresponding code for accessing the
data with AOS layout. The ``soa`` qualifier in ``ispc`` can be used to
cause the corresponding transformation to be made to the ``Point`` type,
while preserving the clean syntax for data access that comes with AOS
layout:
::
soa<8> Point pts[...];
float v = pts[programIndex].x;
Thanks to SOA layout being a first-class concept in the language's type
system, it's easy to write functions that convert data between the two
layouts. For example, the ``aos_to_soa`` function below converts ``count``
elements of the given ``Point`` type from AOS to 8-wide SOA layout. (It
assumes that the caller has pre-allocated sufficient space in the
``pts_soa`` output array.)
::
void aos_to_soa(uniform Point pts_aos[], uniform int count,
soa<8> pts_soa[]) {
foreach (i = 0 ... count)
pts_soa[i] = pts_aos[i];
}
Analogously, a function could be written to convert back from SOA to AOS if
needed.
Tips and Techniques
===================
@@ -339,6 +411,12 @@ based on the index, it can be worth doing. See the example
``examples/volume_rendering`` in the ``ispc`` distribution for the use of
this technique in an instance where it is beneficial to performance.
Understanding Memory Read Coalescing
------------------------------------
XXXX todo
Avoid 64-bit Addressing Calculations When Possible
--------------------------------------------------
@@ -433,6 +511,43 @@ values--very efficient code in the end.
return reduce_add(sum);
}
Using "foreach_active" Effectively
----------------------------------
For high-performance code, it is worth understanding how ``foreach_active``
can be used to improve the code generated for operations that must be
serialized over the active program instances.
For example, consider this segment of code, from the introduction of
``foreach_active`` in the ispc User's Guide:
::
uniform float array[...] = { ... };
int index = ...;
foreach_active (i) {
++array[index];
}
Here, ``index`` may have the same value for multiple program instances, so
the updates to ``array[index]`` are serialized by the ``foreach_active``
statement so that the results are well-defined when ``index`` values do
collide.
The code generated by the compiler can be improved in this case by making
it clear that each iteration accesses only a single element of the array,
and thus that a general gather or scatter isn't required.
Specifically, by using the ``extract()`` function from the standard library
to extract the current program instance's value of ``index`` into a
``uniform`` variable and then using that to index into ``array``, as below,
more efficient code is generated.
::
foreach_active (instanceNum) {
uniform int unifIndex = extract(index, instanceNum);
++array[unifIndex];
}
Using Low-level Vector Tricks
-----------------------------
@@ -547,7 +662,7 @@ gathers happen.)
extern "C" {
void ISPCInstrument(const char *fn, const char *note,
int line, int mask);
int line, uint64_t mask);
}
This function is passed the file name of the ``ispc`` file running, a short
@@ -560,7 +675,7 @@ as follows:
::
ISPCInstrument("foo.ispc", "function entry", 55, 0xf);
ISPCInstrument("foo.ispc", "function entry", 55, 0xfull);
This call indicates that the currently executing program has just entered
the function defined at line 55 of the file ``foo.ispc``, with a

docs/template-news.txt Normal file
View File

@@ -0,0 +1,66 @@
%(head_prefix)s
%(head)s
<script type="text/javascript">
var _gaq = _gaq || [];
_gaq.push(['_setAccount', 'UA-1486404-4']);
_gaq.push(['_trackPageview']);
(function() {
var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
})();
</script>
%(stylesheet)s
%(body_prefix)s
<div id="wrap">
<div id="wrap2">
<div id="header">
<h1 id="logo">Intel SPMD Program Compiler</h1>
<div id="slogan">An open-source compiler for high-performance SIMD programming on
the CPU</div>
</div>
<div id="nav">
<div id="nbar">
<ul>
<li><a href="index.html">Overview</a></li>
<li id="selected"><a href="news.html">News</a></li>
<li><a href="features.html">Features</a></li>
<li><a href="downloads.html">Downloads</a></li>
<li><a href="documentation.html">Documentation</a></li>
<li><a href="perf.html">Performance</a></li>
<li><a href="contrib.html">Contributors</a></li>
</ul>
</div>
</div>
<div id="content-wrap">
<div id="sidebar">
<div class="widgetspace">
<h1>Resources</h1>
<ul class="menu">
<li><a href="http://github.com/ispc/ispc/">ispc page on github</a></li>
<li><a href="http://groups.google.com/group/ispc-users/">ispc
users mailing list</a></li>
<li><a href="http://groups.google.com/group/ispc-dev/">ispc
developers mailing list</a></li>
<li><a href="http://github.com/ispc/ispc/wiki/">Wiki</a></li>
<li><a href="http://github.com/ispc/ispc/issues/">Bug tracking</a></li>
<li><a href="doxygen/index.html">Doxygen</a></li>
</ul>
</div>
</div>
%(body_pre_docinfo)s
%(docinfo)s
<div id="content">
%(body)s
</div>
<div class="clearfix"></div>
<div id="footer"> &copy; 2011-2012 <strong>Intel Corporation</strong> | Valid <a href="http://validator.w3.org/check?uri=referer">XHTML</a> | <a href="http://jigsaw.w3.org/css-validator/check/referer">CSS</a> | ClearBlue by: <a href="http://www.themebin.com/">ThemeBin</a>
<!-- Please Do Not remove this link, thank u -->
</div>
</div>
</div>
</div>
%(body_suffix)s

View File

@@ -26,10 +26,12 @@
<div id="nbar">
<ul>
<li><a href="index.html">Overview</a></li>
<li><a href="news.html">News</a></li>
<li><a href="features.html">Features</a></li>
<li><a href="downloads.html">Downloads</a></li>
<li><a href="documentation.html">Documentation</a></li>
<li id="selected"><a href="perf.html">Performance</a></li>
<li><a href="contrib.html">Contributors</a></li>
</ul>
</div>
</div>
@@ -45,8 +47,7 @@
developers mailing list</a></li>
<li><a href="http://github.com/ispc/ispc/wiki/">Wiki</a></li>
<li><a href="http://github.com/ispc/ispc/issues/">Bug tracking</a></li>
<li><a href="doxygen/index.html">Doxygen documentation of
<tt>ispc</tt> source code</a></li>
<li><a href="doxygen/index.html">Doxygen</a></li>
</ul>
</div>
</div>
@@ -56,7 +57,7 @@
%(body)s
</div>
<div class="clearfix"></div>
<div id="footer"> &copy; 2011 <strong>Intel Corporation</strong> | Valid <a href="http://validator.w3.org/check?uri=referer">XHTML</a> | <a href="http://jigsaw.w3.org/css-validator/check/referer">CSS</a> | ClearBlue by: <a href="http://www.themebin.com/">ThemeBin</a>
<div id="footer"> &copy; 2011-2012 <strong>Intel Corporation</strong> | Valid <a href="http://validator.w3.org/check?uri=referer">XHTML</a> | <a href="http://jigsaw.w3.org/css-validator/check/referer">CSS</a> | ClearBlue by: <a href="http://www.themebin.com/">ThemeBin</a>
<!-- Please Do Not remove this link, thank u -->
</div>
</div>

View File

@@ -26,10 +26,12 @@
<div id="nbar">
<ul>
<li><a href="index.html">Overview</a></li>
<li><a href="news.html">News</a></li>
<li><a href="features.html">Features</a></li>
<li><a href="downloads.html">Downloads</a></li>
<li id="selected"><a href="documentation.html">Documentation</a></li>
<li><a href="perf.html">Performance</a></li>
<li><a href="contrib.html">Contributors</a></li>
</ul>
</div>
</div>
@@ -45,8 +47,7 @@
developers mailing list</a></li>
<li><a href="http://github.com/ispc/ispc/wiki/">Wiki</a></li>
<li><a href="http://github.com/ispc/ispc/issues/">Bug tracking</a></li>
<li><a href="doxygen/index.html">Doxygen documentation of
<tt>ispc</tt> source code</a></li>
<li><a href="doxygen/index.html">Doxygen</a></li>
</ul>
</div>
</div>
@@ -56,7 +57,7 @@
%(body)s
</div>
<div class="clearfix"></div>
<div id="footer"> &copy; 2011 <strong>Intel Corporation</strong> | Valid <a href="http://validator.w3.org/check?uri=referer">XHTML</a> | <a href="http://jigsaw.w3.org/css-validator/check/referer">CSS</a> | ClearBlue by: <a href="http://www.themebin.com/">ThemeBin</a>
<div id="footer"> &copy; 2011-2012 <strong>Intel Corporation</strong> | Valid <a href="http://validator.w3.org/check?uri=referer">XHTML</a> | <a href="http://jigsaw.w3.org/css-validator/check/referer">CSS</a> | ClearBlue by: <a href="http://www.themebin.com/">ThemeBin</a>
<!-- Please Do Not remove this link, thank u -->
</div>
</div>

View File

@@ -31,7 +31,7 @@ PROJECT_NAME = "Intel SPMD Program Compiler"
# This could be handy for archiving the generated documentation or
# if some version control system is used.
PROJECT_NUMBER = 1.1.2
PROJECT_NUMBER = 1.5.0
# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
# base path where the generated documentation will be put.
@@ -581,10 +581,12 @@ WARN_LOGFILE =
# directories like "/usr/src/myproject". Separate the files or directories
# with spaces.
INPUT = builtins.h \
INPUT = ast.h \
builtins.h \
ctx.h \
decl.h \
expr.h \
func.h \
ispc.h \
llvmutil.h \
module.h \
@@ -593,10 +595,13 @@ INPUT = builtins.h \
sym.h \
type.h \
util.h \
ast.cpp \
builtins.cpp \
cbackend.cpp \
ctx.cpp \
decl.cpp \
expr.cpp \
func.cpp \
ispc.cpp \
llvmutil.cpp \
main.cpp \
@@ -608,7 +613,7 @@ INPUT = builtins.h \
util.cpp \
parse.yy \
lex.ll \
builtins-c.c
builtins/builtins.c
# This tag can be used to specify the character encoding of the source files
# that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is

View File

@@ -39,9 +39,6 @@ example implementation of this function that counts the number of times the
callback is made and records some statistics about control flow coherence
is provided in the instrument.cpp file.
*** Note: on Linux, this example currently hits an assertion in LLVM during
*** compilation
Deferred
========
@@ -76,6 +73,14 @@ This directory includes three implementations of the algorithm:
light culling and shading.
GMRES
=====
An implementation of the generalized minimal residual method for solving
sparse matrix equations.
(http://en.wikipedia.org/wiki/Generalized_minimal_residual_method)
Mandelbrot
==========
@@ -110,6 +115,13 @@ This program implements both the Black-Scholes and Binomial options pricing
models in both ispc and regular serial C++ code.
Perfbench
=========
This runs a number of microbenchmarks to measure system performance and
code generation quality.
RT
==
@@ -134,6 +146,11 @@ This is a simple "hello world" type program that shows a ~10 line
application program calling out to a ~5 line ispc program to do a simple
computation.
Sort
====
This is a bucket sort of 32 bit unsigned integers.
By default 1000000 random elements get sorted.
Call ./sort N in order to sort N elements instead.
Volume
======

View File

@@ -2,6 +2,7 @@
EXAMPLE=ao
CPP_SRC=ao.cpp ao_serial.cpp
ISPC_SRC=ao.ispc
ISPC_TARGETS=sse2,sse4,avx
ISPC_IA_TARGETS=sse2,sse4,avx
ISPC_ARM_TARGETS=neon
include ../common.mk

View File

@@ -138,7 +138,7 @@ int main(int argc, char **argv)
}
// Report results and save image
printf("[aobench ispc]:\t\t\t[%.3f] M cycles (%d x %d image)\n",
printf("[aobench ispc]:\t\t\t[%.3f] million cycles (%d x %d image)\n",
minTimeISPC, width, height);
savePPM("ao-ispc.ppm", width, height);
@@ -158,7 +158,7 @@ int main(int argc, char **argv)
}
// Report results and save image
printf("[aobench ispc + tasks]:\t\t[%.3f] M cycles (%d x %d image)\n",
printf("[aobench ispc + tasks]:\t\t[%.3f] million cycles (%d x %d image)\n",
minTimeISPCTasks, width, height);
savePPM("ao-ispc-tasks.ppm", width, height);
@@ -176,7 +176,7 @@ int main(int argc, char **argv)
}
// Report more results, save another image...
printf("[aobench serial]:\t\t[%.3f] M cycles (%d x %d image)\n", minTimeSerial,
printf("[aobench serial]:\t\t[%.3f] million cycles (%d x %d image)\n", minTimeSerial,
width, height);
printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n",
minTimeSerial / minTimeISPC, minTimeSerial / minTimeISPCTasks);

View File

@@ -50,7 +50,6 @@ struct Isect {
struct Sphere {
vec center;
float radius;
};
struct Plane {
@@ -82,8 +81,8 @@ static inline void vnormalize(vec &v) {
}
static inline void
ray_plane_intersect(Isect &isect, Ray &ray, Plane &plane) {
static void
ray_plane_intersect(Isect &isect, Ray &ray, uniform Plane &plane) {
float d = -dot(plane.p, plane.n);
float v = dot(ray.dir, plane.n);
@@ -103,7 +102,7 @@ ray_plane_intersect(Isect &isect, Ray &ray, Plane &plane) {
static inline void
ray_sphere_intersect(Isect &isect, Ray &ray, Sphere &sphere) {
ray_sphere_intersect(Isect &isect, Ray &ray, uniform Sphere &sphere) {
vec rs = ray.org - sphere.center;
float B = dot(rs, ray.dir);
@@ -124,7 +123,7 @@ ray_sphere_intersect(Isect &isect, Ray &ray, Sphere &sphere) {
}
static inline void
static void
orthoBasis(vec basis[3], vec n) {
basis[2] = n;
basis[1].x = 0.0; basis[1].y = 0.0; basis[1].z = 0.0;
@@ -147,8 +146,8 @@ orthoBasis(vec basis[3], vec n) {
}
static inline float
ambient_occlusion(Isect &isect, Plane &plane, Sphere spheres[3],
static float
ambient_occlusion(Isect &isect, uniform Plane &plane, uniform Sphere spheres[3],
RNGState &rngstate) {
float eps = 0.0001f;
vec p, n;
@@ -204,112 +203,52 @@ ambient_occlusion(Isect &isect, Plane &plane, Sphere spheres[3],
static void ao_scanlines(uniform int y0, uniform int y1, uniform int w,
uniform int h, uniform int nsubsamples,
uniform float image[]) {
static Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } };
static Sphere spheres[3] = {
static uniform Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } };
static uniform Sphere spheres[3] = {
{ { -2.0f, 0.0f, -3.5f }, 0.5f },
{ { -0.5f, 0.0f, -3.0f }, 0.5f },
{ { 1.0f, 0.0f, -2.2f }, 0.5f } };
RNGState rngstate;
seed_rng(&rngstate, y0);
seed_rng(&rngstate, programIndex + (y0 << (programIndex & 15)));
float invSamples = 1.f / nsubsamples;
// Compute the mapping between the 'programCount'-wide program
// instances running in parallel and samples in the image.
//
// For now, we'll always take four samples per pixel, so start by
// initializing du and dv with offsets into subpixel samples. We'll
// take care of further updating du and dv for the case where we're
// doing more than 4 program instances in parallel shortly.
uniform float uSteps[4] = { 0, 1, 0, 1 };
uniform float vSteps[4] = { 0, 0, 1, 1 };
float du = uSteps[programIndex % 4] / nsubsamples;
float dv = vSteps[programIndex % 4] / nsubsamples;
foreach_tiled(y = y0 ... y1, x = 0 ... w,
u = 0 ... nsubsamples, v = 0 ... nsubsamples) {
float du = (float)u * invSamples, dv = (float)v * invSamples;
// Now handle the case where we are able to do more than one pixel's
// worth of work at once. nx records the number of pixels in the x
// direction we do per iteration and ny the number in y.
uniform int nx = 1, ny = 1;
// Figure out x,y pixel in NDC
float px = (x + du - (w / 2.0f)) / (w / 2.0f);
float py = -(y + dv - (h / 2.0f)) / (h / 2.0f);
float ret = 0.f;
Ray ray;
Isect isect;
// FIXME: We actually need ny to be 1 regardless of the decomposition,
// since the task decomposition is one scanline high.
ray.org = 0.f;
if (programCount == 8) {
// Do two pixels at once in the x direction
nx = 2;
if (programIndex >= 4)
// And shift the offsets for the second pixel's worth of work
++du;
}
else if (programCount == 16) {
nx = 4;
ny = 1;
if (programIndex >= 4 && programIndex < 8)
++du;
if (programIndex >= 8 && programIndex < 12)
du += 2;
if (programIndex >= 12)
du += 3;
}
// Poor man's perspective projection
ray.dir.x = px;
ray.dir.y = py;
ray.dir.z = -1.0;
vnormalize(ray.dir);
// Now loop over all of the pixels, stepping in x and y as calculated
// above. (Assumes that ny divides y and nx divides x...)
for (uniform int y = y0; y < y1; y += ny) {
for (uniform int x = 0; x < w; x += nx) {
// Figure out x,y pixel in NDC
float px = (x + du - (w / 2.0f)) / (w / 2.0f);
float py = -(y + dv - (h / 2.0f)) / (h / 2.0f);
float ret = 0.f;
Ray ray;
Isect isect;
isect.t = 1.0e+17;
isect.hit = 0;
ray.org = 0.f;
for (uniform int snum = 0; snum < 3; ++snum)
ray_sphere_intersect(isect, ray, spheres[snum]);
ray_plane_intersect(isect, ray, plane);
// Poor man's perspective projection
ray.dir.x = px;
ray.dir.y = py;
ray.dir.z = -1.0;
vnormalize(ray.dir);
// Note use of 'coherent' if statement; the set of rays we
// trace will often all hit or all miss the scene
cif (isect.hit) {
ret = ambient_occlusion(isect, plane, spheres, rngstate);
ret *= invSamples * invSamples;
isect.t = 1.0e+17;
isect.hit = 0;
for (uniform int snum = 0; snum < 3; ++snum)
ray_sphere_intersect(isect, ray, spheres[snum]);
ray_plane_intersect(isect, ray, plane);
// Note use of 'coherent' if statement; the set of rays we
// trace will often all hit or all miss the scene
cif (isect.hit)
ret = ambient_occlusion(isect, plane, spheres, rngstate);
// This is a little grungy; we have results for
// programCount-worth of values. Because we're doing 2x2
// subsamples, we need to peel them off in groups of four,
// average the four values for each pixel, and update the
// output image.
//
// Store the varying value to a uniform array of the same size.
// See the discussion about communication among program
// instances in the ispc user's manual for more discussion on
// this idiom.
uniform float retArray[programCount];
retArray[programIndex] = ret;
// offset to the first pixel in the image
uniform int offset = 3 * (y * w + x);
for (uniform int p = 0; p < programCount; p += 4, offset += 3) {
// Get the four sample values for this pixel
uniform float sumret = retArray[p] + retArray[p+1] + retArray[p+2] +
retArray[p+3];
// Normalize by number of samples taken
sumret /= nsubsamples * nsubsamples;
// Store result in the image
image[offset+0] = sumret;
image[offset+1] = sumret;
image[offset+2] = sumret;
}
int offset = 3 * (y * w + x);
atomic_add_local(&image[offset], ret);
atomic_add_local(&image[offset+1], ret);
atomic_add_local(&image[offset+2], ret);
}
}
}
@@ -329,5 +268,5 @@ static void task ao_task(uniform int width, uniform int height,
export void ao_ispc_tasks(uniform int w, uniform int h, uniform int nsubsamples,
uniform float image[]) {
launch[h] < ao_task(w, h, nsubsamples, image) >;
launch[h] ao_task(w, h, nsubsamples, image);
}

View File

@@ -87,18 +87,22 @@
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<LinkIncremental>true</LinkIncremental>
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
<TargetName>ao</TargetName>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<LinkIncremental>true</LinkIncremental>
<ExecutablePath>$(ExecutablePath);$(ProjectDir)..\..</ExecutablePath>
<TargetName>ao</TargetName>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
<LinkIncremental>false</LinkIncremental>
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
<TargetName>ao</TargetName>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<LinkIncremental>false</LinkIncremental>
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
<TargetName>ao</TargetName>
</PropertyGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<ClCompile>
@@ -173,4 +177,4 @@
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
</ImportGroup>
</Project>
</Project>

View File

@@ -211,7 +211,7 @@ static void ao_scanlines(uniform int y0, uniform int y1, uniform int w,
{ { 1.0f, 0.0f, -2.2f }, 0.5f } };
RNGState rngstate;
seed_rng(&rngstate, y0);
seed_rng(&rngstate, programIndex + (y0 << (programIndex & 15)));
// Compute the mapping between the 'programCount'-wide program
// instances running in parallel and samples in the image.
@@ -329,5 +329,5 @@ static void task ao_task(uniform int width, uniform int height,
export void ao_ispc_tasks(uniform int w, uniform int h, uniform int nsubsamples,
uniform float image[]) {
launch[h] < ao_task(w, h, nsubsamples, image) >;
launch[h] ao_task(w, h, nsubsamples, image);
}

View File

@@ -60,7 +60,7 @@ int countbits(int i) {
// Callback function that ispc compiler emits calls to when --instrument
// command-line flag is given while compiling.
void
ISPCInstrument(const char *fn, const char *note, int line, int mask) {
ISPCInstrument(const char *fn, const char *note, int line, uint64_t mask) {
char sline[16];
sprintf(sline, "%04d", line);
std::string s = std::string(fn) + std::string("(") + std::string(sline) +

View File

@@ -28,7 +28,7 @@
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef INSTRUMENT_H
@@ -36,8 +36,8 @@
#include <stdint.h>
extern "C" {
void ISPCInstrument(const char *fn, const char *note, int line, int mask);
extern "C" {
void ISPCInstrument(const char *fn, const char *note, int line, uint64_t mask);
}
void ISPCPrintInstrument();

View File

@@ -1,20 +1,40 @@
TASK_CXX=../tasksys.cpp
TASK_LIB=-lpthread
TASK_OBJ=tasksys.o
TASK_OBJ=objs/tasksys.o
CXX=g++
CXXFLAGS=-Iobjs/ -O2 -m64
CXXFLAGS=-Iobjs/ -O2
CC=gcc
CCFLAGS=-Iobjs/ -O2
LIBS=-lm $(TASK_LIB) -lstdc++
ISPC=ispc -O2 --arch=x86-64 $(ISPC_FLAGS)
ISPC_OBJS=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc.o $(ISPC_SRC:.ispc=)_ispc_sse2.o \
$(ISPC_SRC:.ispc=)_ispc_sse4.o $(ISPC_SRC:.ispc=)_ispc_avx.o)
ISPC=ispc -O2 $(ISPC_FLAGS)
ISPC_HEADER=objs/$(ISPC_SRC:.ispc=_ispc.h)
CPP_OBJS=$(addprefix objs/, $(CPP_SRC:.cpp=.o) $(TASK_OBJ))
ARCH:=$(shell uname -m | sed -e s/x86_64/x86/ -e s/arm.*/arm/ -e s/sa110/arm/)
ifeq ($(ARCH),x86)
ISPC_OBJS=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc.o $(ISPC_SRC:.ispc=)_ispc_sse2.o \
$(ISPC_SRC:.ispc=)_ispc_sse4.o $(ISPC_SRC:.ispc=)_ispc_avx.o)
ISPC_TARGETS=$(ISPC_IA_TARGETS)
ISPC_FLAGS += --arch=x86-64
CXXFLAGS += -m64
CCFLAGS += -m64
else ifeq ($(ARCH),arm)
ISPC_OBJS=$(addprefix objs/, $(ISPC_SRC:.ispc=_ispc.o))
ISPC_TARGETS=$(ISPC_ARM_TARGETS)
else
$(error Unknown architecture $(ARCH) from uname -m)
endif
CPP_OBJS=$(addprefix objs/, $(CPP_SRC:.cpp=.o))
CC_OBJS=$(addprefix objs/, $(CC_SRC:.c=.o))
OBJS=$(CPP_OBJS) $(CC_OBJS) $(TASK_OBJ) $(ISPC_OBJS)
default: $(EXAMPLE)
all: $(EXAMPLE) $(EXAMPLE)-sse4 $(EXAMPLE)-generic16
all: $(EXAMPLE) $(EXAMPLE)-sse4 $(EXAMPLE)-generic16 $(EXAMPLE)-scalar
.PHONY: dirs clean
@@ -24,14 +44,17 @@ dirs:
objs/%.cpp objs/%.o objs/%.h: dirs
clean:
/bin/rm -rf objs *~ $(EXAMPLE) $(EXAMPLE)-sse4 $(EXAMPLE)-generic16
/bin/rm -rf objs *~ $(EXAMPLE) $(EXAMPLE)-sse4 $(EXAMPLE)-generic16 ref test
$(EXAMPLE): $(CPP_OBJS) $(ISPC_OBJS)
$(EXAMPLE): $(OBJS)
$(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS)
objs/%.o: %.cpp dirs $(ISPC_HEADER)
$(CXX) $< $(CXXFLAGS) -c -o $@
objs/%.o: %.c dirs $(ISPC_HEADER)
$(CC) $< $(CCFLAGS) -c -o $@
objs/%.o: ../%.cpp dirs
$(CXX) $< $(CXXFLAGS) -c -o $@
@@ -57,3 +80,9 @@ objs/$(ISPC_SRC:.ispc=)_generic16.o: objs/$(ISPC_SRC:.ispc=)_generic16.cpp
$(EXAMPLE)-generic16: $(CPP_OBJS) objs/$(ISPC_SRC:.ispc=)_generic16.o
$(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS)
objs/$(ISPC_SRC:.ispc=)_scalar.o: $(ISPC_SRC)
$(ISPC) $< -o $@ --target=generic-1
$(EXAMPLE)-scalar: $(CPP_OBJS) objs/$(ISPC_SRC:.ispc=)_scalar.o
$(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS)

View File

@@ -2,7 +2,8 @@
EXAMPLE=deferred_shading
CPP_SRC=common.cpp main.cpp dynamic_c.cpp dynamic_cilk.cpp
ISPC_SRC=kernels.ispc
ISPC_TARGETS=sse2,sse4-x2,avx-x2
ISPC_IA_TARGETS=sse2,sse4-x2,avx-x2
ISPC_ARM_TARGETS=neon
ISPC_FLAGS=--opt=fast-math
include ../common.mk

View File

@@ -204,6 +204,7 @@ void WriteFrame(const char *filename, const InputData *input,
fprintf(out, "P6 %d %d 255\n", input->header.framebufferWidth,
input->header.framebufferHeight);
fwrite(framebufferAOS, imageBytes, 1, out);
fclose(out);
lAlignedFree(framebufferAOS);
}

View File

@@ -35,35 +35,35 @@
struct InputDataArrays
{
uniform float * uniform zBuffer;
uniform unsigned int16 * uniform normalEncoded_x; // half float
uniform unsigned int16 * uniform normalEncoded_y; // half float
uniform unsigned int16 * uniform specularAmount; // half float
uniform unsigned int16 * uniform specularPower; // half float
uniform unsigned int8 * uniform albedo_x; // unorm8
uniform unsigned int8 * uniform albedo_y; // unorm8
uniform unsigned int8 * uniform albedo_z; // unorm8
uniform float * uniform lightPositionView_x;
uniform float * uniform lightPositionView_y;
uniform float * uniform lightPositionView_z;
uniform float * uniform lightAttenuationBegin;
uniform float * uniform lightColor_x;
uniform float * uniform lightColor_y;
uniform float * uniform lightColor_z;
uniform float * uniform lightAttenuationEnd;
float *zBuffer;
unsigned int16 *normalEncoded_x; // half float
unsigned int16 *normalEncoded_y; // half float
unsigned int16 *specularAmount; // half float
unsigned int16 *specularPower; // half float
unsigned int8 *albedo_x; // unorm8
unsigned int8 *albedo_y; // unorm8
unsigned int8 *albedo_z; // unorm8
float *lightPositionView_x;
float *lightPositionView_y;
float *lightPositionView_z;
float *lightAttenuationBegin;
float *lightColor_x;
float *lightColor_y;
float *lightColor_z;
float *lightAttenuationEnd;
};
struct InputHeader
{
uniform float cameraProj[4][4];
uniform float cameraNear;
uniform float cameraFar;
float cameraProj[4][4];
float cameraNear;
float cameraFar;
uniform int32 framebufferWidth;
uniform int32 framebufferHeight;
uniform int32 numLights;
uniform int32 inputDataChunkSize;
uniform int32 inputDataArrayOffsets[idaNum];
int32 framebufferWidth;
int32 framebufferHeight;
int32 numLights;
int32 inputDataChunkSize;
int32 inputDataArrayOffsets[idaNum];
};
@@ -158,38 +158,22 @@ IntersectLightsWithTileMinMax(
uniform float gBufferScale_x = 0.5f * (float)gBufferWidth;
uniform float gBufferScale_y = 0.5f * (float)gBufferHeight;
// Parallelize across frustum planes.
// We really only have four side planes here, but write the code to
// handle programCount > 4 robustly
uniform float frustumPlanes_xy[programCount];
uniform float frustumPlanes_z[programCount];
uniform float frustumPlanes_xy[4] = {
-(cameraProj_11 * gBufferScale_x),
(cameraProj_11 * gBufferScale_x),
(cameraProj_22 * gBufferScale_y),
-(cameraProj_22 * gBufferScale_y) };
uniform float frustumPlanes_z[4] = {
tileEndX - gBufferScale_x,
-tileStartX + gBufferScale_x,
tileEndY - gBufferScale_y,
-tileStartY + gBufferScale_y };
// TODO: If programIndex < 4 here? Don't care about masking off the
// rest but if interleaving ("x2" modes) the other lanes should ideally
// not be emitted...
{
// This one is totally constant over the whole screen... worth pulling it up at all?
float frustumPlanes_xy_v;
frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 0, -(cameraProj_11 * gBufferScale_x));
frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 1, (cameraProj_11 * gBufferScale_x));
frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 2, (cameraProj_22 * gBufferScale_y));
frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 3, -(cameraProj_22 * gBufferScale_y));
float frustumPlanes_z_v;
frustumPlanes_z_v = insert(frustumPlanes_z_v, 0, tileEndX - gBufferScale_x);
frustumPlanes_z_v = insert(frustumPlanes_z_v, 1, -tileStartX + gBufferScale_x);
frustumPlanes_z_v = insert(frustumPlanes_z_v, 2, tileEndY - gBufferScale_y);
frustumPlanes_z_v = insert(frustumPlanes_z_v, 3, -tileStartY + gBufferScale_y);
// Normalize
float norm = rsqrt(frustumPlanes_xy_v * frustumPlanes_xy_v +
frustumPlanes_z_v * frustumPlanes_z_v);
frustumPlanes_xy_v *= norm;
frustumPlanes_z_v *= norm;
// Save out for uniform use later
frustumPlanes_xy[programIndex] = frustumPlanes_xy_v;
frustumPlanes_z[programIndex] = frustumPlanes_z_v;
for (uniform int i = 0; i < 4; ++i) {
uniform float norm = rsqrt(frustumPlanes_xy[i] * frustumPlanes_xy[i] +
frustumPlanes_z[i] * frustumPlanes_z[i]);
frustumPlanes_xy[i] *= norm;
frustumPlanes_z[i] *= norm;
}
uniform int32 tileNumLights = 0;
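In equation form, the normalization loop above rescales each stored plane so that its 2-D normal has unit length (rsqrt computes $1/\sqrt{x}$):
$$(p_{xy},\, p_z) \;\leftarrow\; \frac{(p_{xy},\, p_z)}{\sqrt{p_{xy}^{2} + p_z^{2}}}$$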
@@ -343,8 +327,8 @@ ShadeTile(
// Reconstruct normal from G-buffer
float surface_normal_x, surface_normal_y, surface_normal_z;
float normal_x = half_to_float_fast(inputData.normalEncoded_x[gBufferOffset]);
float normal_y = half_to_float_fast(inputData.normalEncoded_y[gBufferOffset]);
float normal_x = half_to_float(inputData.normalEncoded_x[gBufferOffset]);
float normal_y = half_to_float(inputData.normalEncoded_y[gBufferOffset]);
float f = (normal_x - normal_x * normal_x) + (normal_y - normal_y * normal_y);
float m = sqrt(4.0f * f - 1.0f);
@@ -355,9 +339,9 @@ ShadeTile(
// Load other G-buffer parameters
float surface_specularAmount =
half_to_float_fast(inputData.specularAmount[gBufferOffset]);
half_to_float(inputData.specularAmount[gBufferOffset]);
float surface_specularPower =
half_to_float_fast(inputData.specularPower[gBufferOffset]);
half_to_float(inputData.specularPower[gBufferOffset]);
float surface_albedo_x = Unorm8ToFloat32(inputData.albedo_x[gBufferOffset]);
float surface_albedo_y = Unorm8ToFloat32(inputData.albedo_y[gBufferOffset]);
float surface_albedo_z = Unorm8ToFloat32(inputData.albedo_z[gBufferOffset]);
@@ -530,9 +514,9 @@ RenderStatic(uniform InputHeader &inputHeader,
// Launch a task to render each tile, each of which is MIN_TILE_WIDTH
// by MIN_TILE_HEIGHT pixels.
launch[num_groups] < RenderTile(num_groups_x, num_groups_y,
inputHeader, inputData, visualizeLightCount,
framebuffer_r, framebuffer_g, framebuffer_b) >;
launch[num_groups] RenderTile(num_groups_x, num_groups_y,
inputHeader, inputData, visualizeLightCount,
framebuffer_r, framebuffer_g, framebuffer_b);
}
@@ -591,8 +575,6 @@ SplitTileMinMax(
uniform float light_positionView_z_array[],
uniform float light_attenuationEnd_array[],
// Outputs
// TODO: ISPC doesn't currently like multidimensionsal arrays so we'll do the
// indexing math ourselves
uniform int32 subtileIndices[],
uniform int32 subtileIndicesPitch,
uniform int32 subtileNumLights[]
@@ -601,30 +583,20 @@ SplitTileMinMax(
uniform float gBufferScale_x = 0.5f * (float)gBufferWidth;
uniform float gBufferScale_y = 0.5f * (float)gBufferHeight;
// Parallelize across frustum planes
// Only have 2 frustum split planes here so may not be worth it, but
// we'll do it for now for consistency
uniform float frustumPlanes_xy[programCount];
uniform float frustumPlanes_z[programCount];
// This one is totally constant over the whole screen... worth pulling it up at all?
float frustumPlanes_xy_v;
frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 0, -(cameraProj_11 * gBufferScale_x));
frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 1, (cameraProj_22 * gBufferScale_y));
float frustumPlanes_z_v;
frustumPlanes_z_v = insert(frustumPlanes_z_v, 0, tileMidX - gBufferScale_x);
frustumPlanes_z_v = insert(frustumPlanes_z_v, 1, tileMidY - gBufferScale_y);
uniform float frustumPlanes_xy[2] = { -(cameraProj_11 * gBufferScale_x),
(cameraProj_22 * gBufferScale_y) };
uniform float frustumPlanes_z[2] = { tileMidX - gBufferScale_x,
tileMidY - gBufferScale_y };
// Normalize
float norm = rsqrt(frustumPlanes_xy_v * frustumPlanes_xy_v +
frustumPlanes_z_v * frustumPlanes_z_v);
frustumPlanes_xy_v *= norm;
frustumPlanes_z_v *= norm;
// Save out for uniform use later
frustumPlanes_xy[programIndex] = frustumPlanes_xy_v;
frustumPlanes_z[programIndex] = frustumPlanes_z_v;
uniform float norm[2] = { rsqrt(frustumPlanes_xy[0] * frustumPlanes_xy[0] +
frustumPlanes_z[0] * frustumPlanes_z[0]),
rsqrt(frustumPlanes_xy[1] * frustumPlanes_xy[1] +
frustumPlanes_z[1] * frustumPlanes_z[1]) };
frustumPlanes_xy[0] *= norm[0];
frustumPlanes_xy[1] *= norm[1];
frustumPlanes_z[0] *= norm[0];
frustumPlanes_z[1] *= norm[1];
// Initialize
uniform int32 subtileLightOffset[4];

@@ -87,7 +87,7 @@ int main(int argc, char** argv) {
framebuffer.clear();
reset_and_start_timer();
for (int j = 0; j < nframes; ++j)
ispc::RenderStatic(&input->header, &input->arrays,
ispc::RenderStatic(input->header, input->arrays,
VISUALIZE_LIGHT_COUNT,
framebuffer.r, framebuffer.g, framebuffer.b);
double mcycles = get_elapsed_mcycles() / nframes;
@@ -130,7 +130,7 @@ int main(int argc, char** argv) {
printf("\t\t\t\t(%.2fx speedup from static ISPC, %.2fx from Cilk+ISPC)\n",
serialCycles/ispcCycles, serialCycles/dynamicCilkCycles);
#else
printf("\t\t\t\t(%.2fx speedup from ISPC)\n", serialCycles/ispcCycles);
printf("\t\t\t\t(%.2fx speedup from ISPC + tasks)\n", serialCycles/ispcCycles);
#endif // __cilk
DeleteInputData(input);

@@ -23,6 +23,8 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "stencil", "stencil\stencil.
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "deferred_shading", "deferred\deferred_shading.vcxproj", "{87F53C53-957E-4E91-878A-BC27828FB9EB}"
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "perfbench", "perfbench\perfbench.vcxproj", "{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Win32 = Debug|Win32
@@ -119,6 +121,14 @@ Global
{87F53C53-957E-4E91-878A-BC27828FB9EB}.Release|Win32.Build.0 = Release|Win32
{87F53C53-957E-4E91-878A-BC27828FB9EB}.Release|x64.ActiveCfg = Release|x64
{87F53C53-957E-4E91-878A-BC27828FB9EB}.Release|x64.Build.0 = Release|x64
{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}.Debug|Win32.ActiveCfg = Debug|Win32
{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}.Debug|Win32.Build.0 = Debug|Win32
{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}.Debug|x64.ActiveCfg = Debug|x64
{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}.Debug|x64.Build.0 = Debug|x64
{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}.Release|Win32.ActiveCfg = Release|Win32
{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}.Release|Win32.Build.0 = Release|Win32
{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}.Release|x64.ActiveCfg = Release|x64
{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE

examples/gmres/Makefile Normal file
@@ -0,0 +1,9 @@
EXAMPLE=gmres
CPP_SRC=algorithm.cpp main.cpp matrix.cpp
CC_SRC=mmio.c
ISPC_SRC=matrix.ispc
ISPC_IA_TARGETS=sse2,sse4-x2,avx-x2
ISPC_ARM_TARGETS=neon
include ../common.mk

@@ -0,0 +1,231 @@
/*
Copyright (c) 2012, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*===========================================================================*\
|* Includes
\*===========================================================================*/
#include "algorithm.h"
#include "stdio.h"
#include "debug.h"
/*===========================================================================*\
|* GMRES
\*===========================================================================*/
/* upper_triangular_right_solve:
* ----------------------------
* Given an upper triangular matrix R and rhs vector b, solve for x.
* This "solve" ignores the rows and columns of R beyond the dimension
* of x.
*/
void upper_triangular_right_solve (const DenseMatrix &R, const Vector &b, Vector &x)
{
// Dimensionality check
ASSERT(R.rows() >= b.size());
ASSERT(R.cols() >= x.size());
ASSERT(b.size() >= x.size());
int max_row = x.size() - 1;
// first solve step:
x[max_row] = b[max_row] / R(max_row, max_row);
for (int row = max_row - 1; row >= 0; row--) {
double xi = b[row];
for (int col = max_row; col > row; col--)
xi -= x[col] * R(row, col);
x[row] = xi / R(row, row);
}
}
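For reference, this is standard back substitution restricted to the leading $n \times n$ block of $R$, where $n$ is the length of $x$:
$$x_{n-1} = \frac{b_{n-1}}{R_{n-1,n-1}}, \qquad x_i = \frac{1}{R_{ii}}\Big(b_i - \sum_{j=i+1}^{n-1} R_{ij}\, x_j\Big) \quad \text{for } i = n-2, \dots, 0.$$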
/* create_rotation (used in gmres):
* -------------------------------
* Construct a Givens rotation to zero out the lowest non-zero entry in a partially
* factored Hessenberg matrix. Note that the previous Givens rotations should be
* applied to this column before creating a new rotation.
*/
void create_rotation (const DenseMatrix &H, size_t col, Vector &Cn, Vector &Sn)
{
double a = H(col, col);
double b = H(col + 1, col);
double r;
if (b == 0) {
Cn[col] = copysign(1, a);
Sn[col] = 0;
}
else if (a == 0) {
Cn[col] = 0;
Sn[col] = copysign(1, b);
}
else {
r = sqrt(a*a + b*b);
Sn[col] = -b / r;
Cn[col] = a / r;
}
}
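In matrix terms, with $r=\sqrt{a^2+b^2}$, the stored parameters $c = a/r$ and $s = -b/r$ define a rotation that annihilates the subdiagonal entry:
$$\begin{pmatrix} c & -s \\ s & c \end{pmatrix}\begin{pmatrix} a \\ b \end{pmatrix} = \begin{pmatrix} r \\ 0 \end{pmatrix},$$
which is exactly the update that apply_rotation performs on the $(col,\, col{+}1)$ row pair.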
/* Applies the 'col'th Givens rotation stored in vectors Sn and Cn to the 'col'th
* column of the DenseMatrix H. (Previous columns don't need the rotation applied
* because, presumably, the first col-1 columns are already upper triangular, so
* their entries in rows col and col+1 are 0.)
*/
void apply_rotation (DenseMatrix &H, size_t col, Vector &Cn, Vector &Sn)
{
double c = Cn[col];
double s = Sn[col];
double tmp = c * H(col, col) - s * H(col+1, col);
H(col+1, col) = s * H(col, col) + c * H(col+1, col);
H(col, col) = tmp;
}
/* Applies the 'col'th Givens rotation to the vector.
*/
void apply_rotation (Vector &v, size_t col, Vector &Cn, Vector &Sn)
{
double a = v[col];
double b = v[col + 1];
double c = Cn[col];
double s = Sn[col];
v[col] = c * a - s * b;
v[col + 1] = s * a + c * b;
}
/* Applies the first 'col' Givens rotations to the newly-created column
* of H. (Leaves other columns alone.)
*/
void update_column (DenseMatrix &H, size_t col, Vector &Cn, Vector &Sn)
{
for (int i = 0; i < col; i++) {
double c = Cn[i];
double s = Sn[i];
double t = c * H(i,col) - s * H(i+1,col);
H(i+1, col) = s * H(i,col) + c * H(i+1,col);
H(i, col) = t;
}
}
/* After a new column has been added to the Hessenberg matrix, factor it back into
* an upper-triangular matrix by:
* - applying the previous Givens rotations to the new column
* - computing the new Givens rotation to make the column upper triangular
* - applying the new Givens rotation to the column, and
* - applying the new Givens rotation to the solution vector
*/
void update_qr_decomp (DenseMatrix &H, Vector &s, size_t col, Vector &Cn, Vector &Sn)
{
update_column( H, col, Cn, Sn);
create_rotation(H, col, Cn, Sn);
apply_rotation( H, col, Cn, Sn);
apply_rotation( s, col, Cn, Sn);
}
void gmres (const Matrix &A, const Vector &b, Vector &x, int num_iters, double max_err)
{
DEBUG_PRINT("gmres starting!\n");
x.zero();
ASSERT(A.rows() == A.cols());
DenseMatrix Qstar(num_iters + 1, A.rows());
DenseMatrix H(num_iters + 1, num_iters);
// arrays for storing parameters of givens rotations
Vector Sn(num_iters);
Vector Cn(num_iters);
// array for storing the rhs projected onto the Hessenberg's column space
Vector G(num_iters+1);
G.zero();
double beta = b.norm();
G[0] = beta;
// temp vector, stores Aqi
Vector w(A.rows());
w.copy(b);
w.normalize();
Qstar.set_row(0, w);
int iter = 0;
Vector temp(A.rows(), false);
double rel_err;
while (iter < num_iters)
{
// w = Aqi
Qstar.row(iter, temp);
A.multiply(temp, w);
// construct ith column of H, i+1th row of Qstar:
for (int row = 0; row <= iter; row++) {
Qstar.row(row, temp);
H(row, iter) = temp.dot(w);
w.add_ax(-H(row, iter), temp);
}
H(iter+1, iter) = w.norm();
w.divide(H(iter+1, iter));
Qstar.set_row(iter+1, w);
update_qr_decomp (H, G, iter, Cn, Sn);
rel_err = fabs(G[iter+1] / beta);
if (rel_err < max_err)
break;
if (iter % 100 == 0)
DEBUG_PRINT("Iter %d: %f err\n", iter, rel_err);
iter++;
}
if (iter == num_iters) {
fprintf(stderr, "Error: gmres failed to converge in %d iterations (relative err: %f)\n", num_iters, rel_err);
exit(-1);
}
// We've reached an acceptable solution:
DEBUG_PRINT("gmres completed in %d iterations (rel. resid. %f, max %f)\n", iter + 1, rel_err, max_err);
Vector y(iter+1);
upper_triangular_right_solve(H, G, y);
for (int i = 0; i < iter + 1; i++) {
Qstar.row(i, temp);
x.add_ax(y[i], temp);
}
}
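For orientation, the loop above is the standard Arnoldi/GMRES recurrence: with the rows of Qstar holding the orthonormal basis $q_0,\dots,q_k$, it maintains $A Q_k = Q_{k+1} \tilde H_k$, keeps $\tilde H_k$ triangular via the accumulated Givens rotations, and solves
$$\min_y \;\bigl\| \beta e_1 - \tilde H_k\, y \bigr\|_2, \qquad x = Q_k\, y,$$
whose residual norm appears directly as $|G[k+1]|$ — hence the rel_err $= |G[k{+}1]|/\beta$ convergence test.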

@@ -0,0 +1,50 @@
/*
Copyright (c) 2012, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __ALGORITHM_H__
#define __ALGORITHM_H__
#include "matrix.h"
/* Generalized Minimal Residual Method:
* -----------------------------------
* Takes a square matrix and an rhs and uses GMRES to find an estimate for x.
* The specified error is relative.
*/
void gmres (const Matrix &A, const Vector &b, Vector &x, int num_iters, double err);
#endif

File diff suppressed because it is too large (8 files)

examples/gmres/debug.h Normal file
@@ -0,0 +1,55 @@
/*
Copyright (c) 2012, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __DEBUG_H__
#define __DEBUG_H__
#include <cassert>
/**************************************************************\
| Macros
\**************************************************************/
#define DEBUG
#ifdef DEBUG
#define ASSERT(expr) assert(expr)
#define DEBUG_PRINT(...) printf(__VA_ARGS__)
#else
#define ASSERT(expr)
#define DEBUG_PRINT(...)
#endif
#endif

examples/gmres/main.cpp Normal file
@@ -0,0 +1,79 @@
/*
Copyright (c) 2012, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "matrix.h"
#include "algorithm.h"
#include "util.h"
#include <cmath>
#include "../timing.h"
int main (int argc, char **argv)
{
if (argc < 4) {
printf("usage: %s <input-matrix> <input-rhs> <output-file>\n", argv[0]);
return -1;
}
double gmres_cycles;
DEBUG_PRINT("Loading A...\n");
Matrix *A = CRSMatrix::matrix_from_mtf(argv[1]);
if (A == NULL)
return -1;
DEBUG_PRINT("... size: %lu\n", A->cols());
DEBUG_PRINT("Loading b...\n");
Vector *b = Vector::vector_from_mtf(argv[2]);
if (b == NULL)
return -1;
Vector x(A->cols());
DEBUG_PRINT("Beginning gmres...\n");
gmres(*A, *b, x, A->cols() / 2, .01);
// Write result out to file
x.to_mtf(argv[argc-1]);
// Compute residual (double-check)
#ifdef DEBUG
Vector bprime(b->size());
A->multiply(x, bprime);
Vector resid(bprime.size(), &(bprime[0]));
resid.subtract(*b);
DEBUG_PRINT("residual error check: %lg\n", resid.norm() / b->norm());
#endif
// Print profiling results
DEBUG_PRINT("-- Total mcycles to solve : %.03f --\n", gmres_cycles);
}

examples/gmres/matrix.cpp Normal file
@@ -0,0 +1,246 @@
/*
Copyright (c) 2012, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/**************************************************************\
| Includes
\**************************************************************/
#include "matrix.h"
#include "matrix_ispc.h"
extern "C" {
#include "mmio.h"
}
/**************************************************************\
| DenseMatrix methods
\**************************************************************/
void DenseMatrix::multiply (const Vector &v, Vector &r) const
{
// Dimensionality check
ASSERT(v.size() == cols());
ASSERT(r.size() == rows());
for (int i = 0; i < rows(); i++)
r[i] = v.dot(entries + i * num_cols);
}
const Vector *DenseMatrix::row (size_t row) const {
return new Vector(num_cols, entries + row * num_cols, true);
}
void DenseMatrix::row (size_t row, Vector &r) {
r.entries = entries + row * cols();
r._size = cols();
}
void DenseMatrix::set_row(size_t row, const Vector &v)
{
ASSERT(v.size() == num_cols);
memcpy(entries + row * num_cols, v.entries, num_cols * sizeof(double));
}
/**************************************************************\
| CRSMatrix Methods
\**************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <vector>
#include <algorithm>
struct entry {
int row;
int col;
double val;
};
bool compare_entries(struct entry i, struct entry j) {
if (i.row < j.row)
return true;
if (i.row > j.row)
return false;
return i.col < j.col;
}
#define ERR_OUT(...) { fprintf(stderr, __VA_ARGS__); return NULL; }
CRSMatrix *CRSMatrix::matrix_from_mtf (char *path) {
FILE *f;
MM_typecode matcode;
int m, n, nz;
if ((f = fopen(path, "r")) == NULL)
ERR_OUT("Error: %s does not name a valid/readable file.\n", path);
if (mm_read_banner(f, &matcode) != 0)
ERR_OUT("Error: Could not process Matrix Market banner.\n");
if (mm_is_complex(matcode))
ERR_OUT("Error: Application does not support complex numbers.\n")
if (mm_is_dense(matcode))
ERR_OUT("Error: supplied matrix is dense (should be sparse.)\n");
if (!mm_is_matrix(matcode))
ERR_OUT("Error: %s does not encode a matrix.\n", path)
if (mm_read_mtx_crd_size(f, &m, &n, &nz) != 0)
ERR_OUT("Error: could not read matrix size from file.\n");
if (m != n)
ERR_OUT("Error: Application does not support non-square matrices.");
std::vector<struct entry> entries;
entries.resize(nz);
for (int i = 0; i < nz; i++) {
fscanf(f, "%d %d %lg\n", &entries[i].row, &entries[i].col, &entries[i].val);
// Adjust from 1-based to 0-based
entries[i].row--;
entries[i].col--;
}
sort(entries.begin(), entries.end(), compare_entries);
CRSMatrix *M = new CRSMatrix(m, n, nz);
int cur_row = -1;
for (int i = 0; i < nz; i++) {
while (entries[i].row > cur_row)
M->row_offsets[++cur_row] = i;
M->entries[i] = entries[i].val;
M->columns[i] = entries[i].col;
}
return M;
}
Vector *Vector::vector_from_mtf (char *path) {
FILE *f;
MM_typecode matcode;
int m, n, nz;
if ((f = fopen(path, "r")) == NULL)
ERR_OUT("Error: %s does not name a valid/readable file.\n", path);
if (mm_read_banner(f, &matcode) != 0)
ERR_OUT("Error: Could not process Matrix Market banner.\n");
if (mm_is_complex(matcode))
ERR_OUT("Error: Application does not support complex numbers.\n")
if (mm_is_dense(matcode)) {
if (mm_read_mtx_array_size(f, &m, &n) != 0)
ERR_OUT("Error: could not read matrix size from file.\n");
} else {
if (mm_read_mtx_crd_size(f, &m, &n, &nz) != 0)
ERR_OUT("Error: could not read matrix size from file.\n");
}
if (n != 1)
ERR_OUT("Error: %s does not describe a vector.\n", path);
Vector *x = new Vector(m);
if (mm_is_dense(matcode)) {
double val;
for (int i = 0; i < m; i++) {
fscanf(f, "%lg\n", &val);
(*x)[i] = val;
}
}
else {
x->zero();
double val;
int row;
int col;
for (int i = 0; i < nz; i++) {
fscanf(f, "%d %d %lg\n", &row, &col, &val);
(*x)[row-1] = val;
}
}
return x;
}
#define ERR(...) { fprintf(stderr, __VA_ARGS__); exit(-1); }
void Vector::to_mtf (char *path) {
FILE *f;
MM_typecode matcode;
mm_initialize_typecode(&matcode);
mm_set_matrix(&matcode);
mm_set_real(&matcode);
mm_set_dense(&matcode);
mm_set_general(&matcode);
if ((f = fopen(path, "w")) == NULL)
ERR("Error: cannot open/write to %s\n", path);
mm_write_banner(f, matcode);
mm_write_mtx_array_size(f, size(), 1);
for (int i = 0; i < size(); i++)
fprintf(f, "%lg\n", entries[i]);
fclose(f);
}
void CRSMatrix::multiply (const Vector &v, Vector &r) const
{
ASSERT(v.size() == cols());
ASSERT(r.size() == rows());
for (int row = 0; row < rows(); row++)
{
int row_offset = row_offsets[row];
int next_offset = ((row + 1 == rows()) ? _nonzeroes : row_offsets[row + 1]);
double sum = 0;
for (int i = row_offset; i < next_offset; i++)
{
sum += v[columns[i]] * entries[i];
}
r[row] = sum;
}
}
void CRSMatrix::zero ( )
{
entries.clear();
row_offsets.clear();
columns.clear();
_nonzeroes = 0;
}

examples/gmres/matrix.h Normal file
@@ -0,0 +1,279 @@
/*
Copyright (c) 2012, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __MATRIX_H__
#define __MATRIX_H__
/**************************************************************\
| Includes
\**************************************************************/
#include <cstring> // size_t
#include <cstdlib> // malloc, memcpy, etc.
#include <cmath> // sqrt
#include <vector>
#include "debug.h"
#include "matrix_ispc.h"
class DenseMatrix;
/**************************************************************\
| Vector class
\**************************************************************/
class Vector {
public:
static Vector *vector_from_mtf(char *path);
void to_mtf (char *path);
Vector(size_t size, bool alloc_mem=true)
{
shared_ptr = false;
_size = size;
if (alloc_mem)
entries = (double *) malloc(sizeof(double) * _size);
else {
shared_ptr = true;
entries = NULL;
}
}
Vector(size_t size, double *content, bool share_ptr=false)
{
_size = size;
if (share_ptr) {
entries = content;
shared_ptr = true;
}
else {
shared_ptr = false;
entries = (double *) malloc(sizeof(double) * _size);
memcpy(entries, content, sizeof(double) * _size);
}
}
~Vector() { if (!shared_ptr) free(entries); }
const double & operator [] (size_t index) const
{
ASSERT(index < _size);
return *(entries + index);
}
double &operator [] (size_t index)
{
ASSERT(index < _size);
return *(entries + index);
}
bool operator == (const Vector &v) const
{
if (v.size() != _size)
return false;
for (int i = 0; i < _size; i++)
if (entries[i] != v[i])
return false;
return true;
}
size_t size() const {return _size; }
double dot (const Vector &b) const
{
ASSERT(b.size() == this->size());
return ispc::vector_dot(entries, b.entries, size());
}
double dot (const double * const b) const
{
return ispc::vector_dot(entries, b, size());
}
void zero ()
{
ispc::zero(entries, size());
}
double norm () const { return sqrt(dot(entries)); } // sqrt, not sqrtf: keep double precision
void normalize () { this->divide(this->norm()); }
void add (const Vector &a)
{
ASSERT(size() == a.size());
ispc::vector_add(entries, a.entries, size());
}
void subtract (const Vector &s)
{
ASSERT(size() == s.size());
ispc::vector_sub(entries, s.entries, size());
}
void multiply (double scalar)
{
ispc::vector_mult(entries, scalar, size());
}
void divide (double scalar)
{
ispc::vector_div(entries, scalar, size());
}
// Note: x may be longer than *(this)
void add_ax (double a, const Vector &x) {
ASSERT(x.size() >= size());
ispc::vector_add_ax(entries, a, x.entries, size());
}
// Note that copy only copies the first size() elements of the
// supplied vector, i.e. the supplied vector can be longer than
// this one. This is useful in least squares calculations.
void copy (const Vector &other) {
ASSERT(other.size() >= size());
memcpy(entries, other.entries, size() * sizeof(double));
}
friend class DenseMatrix;
private:
size_t _size;
bool shared_ptr;
double *entries;
};
/**************************************************************\
| Matrix base class
\**************************************************************/
class Matrix {
friend class Vector;
public:
Matrix(size_t size_r, size_t size_c)
{
num_rows = size_r;
num_cols = size_c;
}
~Matrix(){}
size_t rows() const { return num_rows; }
size_t cols() const { return num_cols; }
virtual void multiply (const Vector &v, Vector &r) const = 0;
virtual void zero () = 0;
protected:
size_t num_rows;
size_t num_cols;
};
/**************************************************************\
| DenseMatrix class
\**************************************************************/
class DenseMatrix : public Matrix {
friend class Vector;
public:
DenseMatrix(size_t size_r, size_t size_c) : Matrix(size_r, size_c)
{
entries = (double *) malloc(size_r * size_c * sizeof(double));
}
DenseMatrix(size_t size_r, size_t size_c, const double *content) : Matrix (size_r, size_c)
{
entries = (double *) malloc(size_r * size_c * sizeof(double));
memcpy(entries, content, size_r * size_c * sizeof(double));
}
virtual void multiply (const Vector &v, Vector &r) const;
double &operator () (unsigned int r, unsigned int c)
{
return *(entries + r * num_cols + c);
}
const double &operator () (unsigned int r, unsigned int c) const
{
return *(entries + r * num_cols + c);
}
const Vector *row(size_t row) const;
void row(size_t row, Vector &r);
void set_row(size_t row, const Vector &v);
virtual void zero() { ispc::zero(entries, rows() * cols()); }
void copy (const DenseMatrix &other)
{
ASSERT(rows() == other.rows());
ASSERT(cols() == other.cols());
memcpy(entries, other.entries, rows() * cols() * sizeof(double));
}
private:
double *entries;
bool shared_ptr;
};
/**************************************************************\
| CSRMatrix (compressed row storage, a sparse matrix format)
\**************************************************************/
class CRSMatrix : public Matrix {
public:
CRSMatrix (size_t size_r, size_t size_c, size_t nonzeroes) :
Matrix(size_r, size_c)
{
_nonzeroes = nonzeroes;
entries.resize(nonzeroes);
columns.resize(nonzeroes);
row_offsets.resize(size_r);
}
virtual void multiply(const Vector &v, Vector &r) const;
virtual void zero();
static CRSMatrix *matrix_from_mtf (char *path);
private:
unsigned int _nonzeroes;
std::vector<double> entries;
std::vector<int> row_offsets;
std::vector<int> columns;
};
#endif
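To make the CRS layout concrete, here is a small self-contained sketch (hypothetical values) of the same three-array representation and the same row-offset convention used by CRSMatrix::multiply, where the end of the last row is implied by the nonzero count:
#include <cstdio>
#include <vector>
// The 3x3 sparse matrix
//   [10 0 0]
//   [ 0 0 3]
//   [ 0 4 5]
// stored in compressed-row form.
int main() {
    std::vector<double> entries     = {10, 3, 4, 5}; // nonzero values, row by row
    std::vector<int>    columns     = {0, 2, 1, 2};  // column index of each value
    std::vector<int>    row_offsets = {0, 1, 2};     // index of each row's first value
    const int rows = 3, nonzeroes = 4;
    std::vector<double> v = {1, 2, 3}, r(rows);
    for (int row = 0; row < rows; row++) {
        int begin = row_offsets[row];
        int end = (row + 1 == rows) ? nonzeroes : row_offsets[row + 1];
        double sum = 0;
        for (int i = begin; i < end; i++)
            sum += v[columns[i]] * entries[i];
        r[row] = sum;
    }
    printf("%g %g %g\n", r[0], r[1], r[2]); // prints: 10 9 23
}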

examples/gmres/matrix.ispc Normal file
@@ -0,0 +1,122 @@
/*
Copyright (c) 2012, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/**************************************************************\
| General
\**************************************************************/
export void zero (uniform double data[],
uniform int size)
{
foreach (i = 0 ... size)
data[i] = 0.0;
}
/**************************************************************\
| Vector helpers
\**************************************************************/
export void vector_add (uniform double a[],
const uniform double b[],
const uniform int size)
{
foreach (i = 0 ... size)
a[i] += b[i];
}
export void vector_sub (uniform double a[],
const uniform double b[],
const uniform int size)
{
foreach (i = 0 ... size)
a[i] -= b[i];
}
export void vector_mult (uniform double a[],
const uniform double b,
const uniform int size)
{
foreach (i = 0 ... size)
a[i] *= b;
}
export void vector_div (uniform double a[],
const uniform double b,
const uniform int size)
{
foreach (i = 0 ... size)
a[i] /= b;
}
export void vector_add_ax (uniform double r[],
const uniform double a,
const uniform double x[],
const uniform int size)
{
foreach (i = 0 ... size)
r[i] += a * x[i];
}
export uniform double vector_dot (const uniform double a[],
const uniform double b[],
const uniform int size)
{
varying double sum = 0.0;
foreach (i = 0 ... size)
sum += a[i] * b[i];
return reduce_add(sum);
}
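Note that vector_dot keeps a varying accumulator, i.e. one partial sum per program instance, while foreach strides over the array; reduce_add() then folds the lanes into the scalar $\sum_i a_i b_i$ that is returned.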
/**************************************************************\
| Matrix helpers
\**************************************************************/
export void sparse_multiply (const uniform double entries[],
const uniform double columns[],
const uniform double row_offsets[],
const uniform int rows,
const uniform int cols,
const uniform int nonzeroes,
const uniform double v[],
uniform double r[])
{
foreach (row = 0 ... rows) {
int row_offset = row_offsets[row];
int next_offset = ((row + 1 == rows) ? nonzeroes : row_offsets[row+1]);
double sum = 0;
for (int j = row_offset; j < next_offset; j++)
sum += v[columns[j]] * entries[j];
r[row] = sum;
}
}

examples/gmres/mmio.c Normal file
@@ -0,0 +1,511 @@
/*
* Matrix Market I/O library for ANSI C
*
* See http://math.nist.gov/MatrixMarket for details.
*
*
*/
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <ctype.h>
#include "mmio.h"
int mm_read_unsymmetric_sparse(const char *fname, int *M_, int *N_, int *nz_,
double **val_, int **I_, int **J_)
{
FILE *f;
MM_typecode matcode;
int M, N, nz;
int i;
double *val;
int *I, *J;
if ((f = fopen(fname, "r")) == NULL)
return -1;
if (mm_read_banner(f, &matcode) != 0)
{
printf("mm_read_unsymetric: Could not process Matrix Market banner ");
printf(" in file [%s]\n", fname);
return -1;
}
if ( !(mm_is_real(matcode) && mm_is_matrix(matcode) &&
mm_is_sparse(matcode)))
{
fprintf(stderr, "Sorry, this application does not support ");
fprintf(stderr, "Market Market type: [%s]\n",
mm_typecode_to_str(matcode));
return -1;
}
/* find out size of sparse matrix: M, N, nz .... */
if (mm_read_mtx_crd_size(f, &M, &N, &nz) !=0)
{
fprintf(stderr, "read_unsymmetric_sparse(): could not parse matrix size.\n");
return -1;
}
*M_ = M;
*N_ = N;
*nz_ = nz;
/* reserve memory for matrices */
I = (int *) malloc(nz * sizeof(int));
J = (int *) malloc(nz * sizeof(int));
val = (double *) malloc(nz * sizeof(double));
*val_ = val;
*I_ = I;
*J_ = J;
/* NOTE: when reading in doubles, ANSI C requires the use of the "l" */
/* specifier as in "%lg", "%lf", "%le", otherwise errors will occur */
/* (ANSI C X3.159-1989, Sec. 4.9.6.2, p. 136 lines 13-15) */
for (i=0; i<nz; i++)
{
fscanf(f, "%d %d %lg\n", &I[i], &J[i], &val[i]);
I[i]--; /* adjust from 1-based to 0-based */
J[i]--;
}
fclose(f);
return 0;
}
int mm_is_valid(MM_typecode matcode)
{
if (!mm_is_matrix(matcode)) return 0;
if (mm_is_dense(matcode) && mm_is_pattern(matcode)) return 0;
if (mm_is_real(matcode) && mm_is_hermitian(matcode)) return 0;
if (mm_is_pattern(matcode) && (mm_is_hermitian(matcode) ||
mm_is_skew(matcode))) return 0;
return 1;
}
int mm_read_banner(FILE *f, MM_typecode *matcode)
{
char line[MM_MAX_LINE_LENGTH];
char banner[MM_MAX_TOKEN_LENGTH];
char mtx[MM_MAX_TOKEN_LENGTH];
char crd[MM_MAX_TOKEN_LENGTH];
char data_type[MM_MAX_TOKEN_LENGTH];
char storage_scheme[MM_MAX_TOKEN_LENGTH];
char *p;
mm_clear_typecode(matcode);
if (fgets(line, MM_MAX_LINE_LENGTH, f) == NULL)
return MM_PREMATURE_EOF;
if (sscanf(line, "%s %s %s %s %s", banner, mtx, crd, data_type,
storage_scheme) != 5)
return MM_PREMATURE_EOF;
for (p=mtx; *p!='\0'; *p=tolower(*p),p++); /* convert to lower case */
for (p=crd; *p!='\0'; *p=tolower(*p),p++);
for (p=data_type; *p!='\0'; *p=tolower(*p),p++);
for (p=storage_scheme; *p!='\0'; *p=tolower(*p),p++);
/* check for banner */
if (strncmp(banner, MatrixMarketBanner, strlen(MatrixMarketBanner)) != 0)
return MM_NO_HEADER;
/* first field should be "mtx" */
if (strcmp(mtx, MM_MTX_STR) != 0)
return MM_UNSUPPORTED_TYPE;
mm_set_matrix(matcode);
/* second field describes whether this is a sparse matrix (in coordinate
storage) or a dense array */
if (strcmp(crd, MM_SPARSE_STR) == 0)
mm_set_sparse(matcode);
else
if (strcmp(crd, MM_DENSE_STR) == 0)
mm_set_dense(matcode);
else
return MM_UNSUPPORTED_TYPE;
/* third field */
if (strcmp(data_type, MM_REAL_STR) == 0)
mm_set_real(matcode);
else
if (strcmp(data_type, MM_COMPLEX_STR) == 0)
mm_set_complex(matcode);
else
if (strcmp(data_type, MM_PATTERN_STR) == 0)
mm_set_pattern(matcode);
else
if (strcmp(data_type, MM_INT_STR) == 0)
mm_set_integer(matcode);
else
return MM_UNSUPPORTED_TYPE;
/* fourth field */
if (strcmp(storage_scheme, MM_GENERAL_STR) == 0)
mm_set_general(matcode);
else
if (strcmp(storage_scheme, MM_SYMM_STR) == 0)
mm_set_symmetric(matcode);
else
if (strcmp(storage_scheme, MM_HERM_STR) == 0)
mm_set_hermitian(matcode);
else
if (strcmp(storage_scheme, MM_SKEW_STR) == 0)
mm_set_skew(matcode);
else
return MM_UNSUPPORTED_TYPE;
return 0;
}
int mm_write_mtx_crd_size(FILE *f, int M, int N, int nz)
{
if (fprintf(f, "%d %d %d\n", M, N, nz) != 3)
return MM_COULD_NOT_WRITE_FILE;
else
return 0;
}
int mm_read_mtx_crd_size(FILE *f, int *M, int *N, int *nz )
{
char line[MM_MAX_LINE_LENGTH];
int num_items_read;
/* set return null parameter values, in case we exit with errors */
*M = *N = *nz = 0;
/* now continue scanning until you reach the end-of-comments */
do
{
if (fgets(line,MM_MAX_LINE_LENGTH,f) == NULL)
return MM_PREMATURE_EOF;
}while (line[0] == '%');
/* line[] is either blank or has M,N, nz */
if (sscanf(line, "%d %d %d", M, N, nz) == 3)
return 0;
else
do
{
num_items_read = fscanf(f, "%d %d %d", M, N, nz);
if (num_items_read == EOF) return MM_PREMATURE_EOF;
}
while (num_items_read != 3);
return 0;
}
int mm_read_mtx_array_size(FILE *f, int *M, int *N)
{
char line[MM_MAX_LINE_LENGTH];
int num_items_read;
/* set return null parameter values, in case we exit with errors */
*M = *N = 0;
/* now continue scanning until you reach the end-of-comments */
do
{
if (fgets(line,MM_MAX_LINE_LENGTH,f) == NULL)
return MM_PREMATURE_EOF;
}while (line[0] == '%');
/* line[] is either blank or has M,N, nz */
if (sscanf(line, "%d %d", M, N) == 2)
return 0;
else /* we have a blank line */
do
{
num_items_read = fscanf(f, "%d %d", M, N);
if (num_items_read == EOF) return MM_PREMATURE_EOF;
}
while (num_items_read != 2);
return 0;
}
int mm_write_mtx_array_size(FILE *f, int M, int N)
{
if (fprintf(f, "%d %d\n", M, N) != 2)
return MM_COULD_NOT_WRITE_FILE;
else
return 0;
}
/*-------------------------------------------------------------------------*/
/******************************************************************/
/* use when I[], J[], and val[] are already allocated */
/******************************************************************/
int mm_read_mtx_crd_data(FILE *f, int M, int N, int nz, int I[], int J[],
double val[], MM_typecode matcode)
{
int i;
if (mm_is_complex(matcode))
{
for (i=0; i<nz; i++)
if (fscanf(f, "%d %d %lg %lg", &I[i], &J[i], &val[2*i], &val[2*i+1])
!= 4) return MM_PREMATURE_EOF;
}
else if (mm_is_real(matcode))
{
for (i=0; i<nz; i++)
{
if (fscanf(f, "%d %d %lg\n", &I[i], &J[i], &val[i])
!= 3) return MM_PREMATURE_EOF;
}
}
else if (mm_is_pattern(matcode))
{
for (i=0; i<nz; i++)
if (fscanf(f, "%d %d", &I[i], &J[i])
!= 2) return MM_PREMATURE_EOF;
}
else
return MM_UNSUPPORTED_TYPE;
return 0;
}
int mm_read_mtx_crd_entry(FILE *f, int *I, int *J,
double *real, double *imag, MM_typecode matcode)
{
if (mm_is_complex(matcode))
{
if (fscanf(f, "%d %d %lg %lg", I, J, real, imag)
!= 4) return MM_PREMATURE_EOF;
}
else if (mm_is_real(matcode))
{
if (fscanf(f, "%d %d %lg\n", I, J, real)
!= 3) return MM_PREMATURE_EOF;
}
else if (mm_is_pattern(matcode))
{
if (fscanf(f, "%d %d", I, J) != 2) return MM_PREMATURE_EOF;
}
else
return MM_UNSUPPORTED_TYPE;
return 0;
}
/************************************************************************
mm_read_mtx_crd() fills M, N, nz, the array of values, and the
matrix typecode, e.g. 'MCRS'.
If the matrix is complex, values[] is of size 2*nz
(nz pairs of real/imaginary values).
************************************************************************/
int mm_read_mtx_crd(char *fname, int *M, int *N, int *nz, int **I, int **J,
double **val, MM_typecode *matcode)
{
int ret_code;
FILE *f;
if (strcmp(fname, "stdin") == 0) f=stdin;
else
if ((f = fopen(fname, "r")) == NULL)
return MM_COULD_NOT_READ_FILE;
if ((ret_code = mm_read_banner(f, matcode)) != 0)
return ret_code;
if (!(mm_is_valid(*matcode) && mm_is_sparse(*matcode) &&
mm_is_matrix(*matcode)))
return MM_UNSUPPORTED_TYPE;
if ((ret_code = mm_read_mtx_crd_size(f, M, N, nz)) != 0)
return ret_code;
*I = (int *) malloc(*nz * sizeof(int));
*J = (int *) malloc(*nz * sizeof(int));
*val = NULL;
if (mm_is_complex(*matcode))
{
*val = (double *) malloc(*nz * 2 * sizeof(double));
ret_code = mm_read_mtx_crd_data(f, *M, *N, *nz, *I, *J, *val,
*matcode);
if (ret_code != 0) return ret_code;
}
else if (mm_is_real(*matcode))
{
*val = (double *) malloc(*nz * sizeof(double));
ret_code = mm_read_mtx_crd_data(f, *M, *N, *nz, *I, *J, *val,
*matcode);
if (ret_code != 0) return ret_code;
}
else if (mm_is_pattern(*matcode))
{
ret_code = mm_read_mtx_crd_data(f, *M, *N, *nz, *I, *J, *val,
*matcode);
if (ret_code != 0) return ret_code;
}
if (f != stdin) fclose(f);
return 0;
}
int mm_write_banner(FILE *f, MM_typecode matcode)
{
char *str = mm_typecode_to_str(matcode);
int ret_code;
ret_code = fprintf(f, "%s %s\n", MatrixMarketBanner, str);
free(str);
if (ret_code < 0) /* fprintf returns chars written; negative means error */
return MM_COULD_NOT_WRITE_FILE;
else
return 0;
}
int mm_write_mtx_crd(char fname[], int M, int N, int nz, int I[], int J[],
double val[], MM_typecode matcode)
{
FILE *f;
int i;
if (strcmp(fname, "stdout") == 0)
f = stdout;
else
if ((f = fopen(fname, "w")) == NULL)
return MM_COULD_NOT_WRITE_FILE;
/* print banner followed by typecode */
fprintf(f, "%s ", MatrixMarketBanner);
fprintf(f, "%s\n", mm_typecode_to_str(matcode));
/* print matrix sizes and nonzeros */
fprintf(f, "%d %d %d\n", M, N, nz);
/* print values */
if (mm_is_pattern(matcode))
for (i=0; i<nz; i++)
fprintf(f, "%d %d\n", I[i], J[i]);
else
if (mm_is_real(matcode))
for (i=0; i<nz; i++)
fprintf(f, "%d %d %20.16g\n", I[i], J[i], val[i]);
else
if (mm_is_complex(matcode))
for (i=0; i<nz; i++)
fprintf(f, "%d %d %20.16g %20.16g\n", I[i], J[i], val[2*i],
val[2*i+1]);
else
{
if (f != stdout) fclose(f);
return MM_UNSUPPORTED_TYPE;
}
if (f !=stdout) fclose(f);
return 0;
}
/**
* Create a new copy of a string s. mm_strdup() is a common routine, but
* not part of ANSI C, so it is included here. Used by mm_typecode_to_str().
*
*/
char *mm_strdup(const char *s)
{
int len = strlen(s);
char *s2 = (char *) malloc((len+1)*sizeof(char));
return strcpy(s2, s);
}
char *mm_typecode_to_str(MM_typecode matcode)
{
char buffer[MM_MAX_LINE_LENGTH];
char *types[4];
char *mm_strdup(const char *);
int error =0;
/* check for MTX type */
if (mm_is_matrix(matcode))
types[0] = MM_MTX_STR;
else
error=1;
/* check for CRD or ARR matrix */
if (mm_is_sparse(matcode))
types[1] = MM_SPARSE_STR;
else
if (mm_is_dense(matcode))
types[1] = MM_DENSE_STR;
else
return NULL;
/* check for element data type */
if (mm_is_real(matcode))
types[2] = MM_REAL_STR;
else
if (mm_is_complex(matcode))
types[2] = MM_COMPLEX_STR;
else
if (mm_is_pattern(matcode))
types[2] = MM_PATTERN_STR;
else
if (mm_is_integer(matcode))
types[2] = MM_INT_STR;
else
return NULL;
/* check for symmetry type */
if (mm_is_general(matcode))
types[3] = MM_GENERAL_STR;
else
if (mm_is_symmetric(matcode))
types[3] = MM_SYMM_STR;
else
if (mm_is_hermitian(matcode))
types[3] = MM_HERM_STR;
else
if (mm_is_skew(matcode))
types[3] = MM_SKEW_STR;
else
return NULL;
sprintf(buffer,"%s %s %s %s", types[0], types[1], types[2], types[3]);
return mm_strdup(buffer);
}

examples/gmres/mmio.h Normal file
@@ -0,0 +1,135 @@
/*
* Matrix Market I/O library for ANSI C
*
* See http://math.nist.gov/MatrixMarket for details.
*
*
*/
#ifndef MM_IO_H
#define MM_IO_H
#define MM_MAX_LINE_LENGTH 1025
#define MatrixMarketBanner "%%MatrixMarket"
#define MM_MAX_TOKEN_LENGTH 64
typedef char MM_typecode[4];
#include <stdio.h>
char *mm_typecode_to_str(MM_typecode matcode);
int mm_read_banner(FILE *f, MM_typecode *matcode);
int mm_read_mtx_crd_size(FILE *f, int *M, int *N, int *nz);
int mm_read_mtx_array_size(FILE *f, int *M, int *N);
int mm_write_banner(FILE *f, MM_typecode matcode);
int mm_write_mtx_crd_size(FILE *f, int M, int N, int nz);
int mm_write_mtx_array_size(FILE *f, int M, int N);
/********************* MM_typecode query functions ***************************/
#define mm_is_matrix(typecode) ((typecode)[0]=='M')
#define mm_is_sparse(typecode) ((typecode)[1]=='C')
#define mm_is_coordinate(typecode)((typecode)[1]=='C')
#define mm_is_dense(typecode) ((typecode)[1]=='A')
#define mm_is_array(typecode) ((typecode)[1]=='A')
#define mm_is_complex(typecode) ((typecode)[2]=='C')
#define mm_is_real(typecode) ((typecode)[2]=='R')
#define mm_is_pattern(typecode) ((typecode)[2]=='P')
#define mm_is_integer(typecode) ((typecode)[2]=='I')
#define mm_is_symmetric(typecode)((typecode)[3]=='S')
#define mm_is_general(typecode) ((typecode)[3]=='G')
#define mm_is_skew(typecode) ((typecode)[3]=='K')
#define mm_is_hermitian(typecode)((typecode)[3]=='H')
int mm_is_valid(MM_typecode matcode); /* too complex for a macro */
/********************* MM_typecode modify functions ***************************/
#define mm_set_matrix(typecode) ((*typecode)[0]='M')
#define mm_set_coordinate(typecode) ((*typecode)[1]='C')
#define mm_set_array(typecode) ((*typecode)[1]='A')
#define mm_set_dense(typecode) mm_set_array(typecode)
#define mm_set_sparse(typecode) mm_set_coordinate(typecode)
#define mm_set_complex(typecode)((*typecode)[2]='C')
#define mm_set_real(typecode) ((*typecode)[2]='R')
#define mm_set_pattern(typecode)((*typecode)[2]='P')
#define mm_set_integer(typecode)((*typecode)[2]='I')
#define mm_set_symmetric(typecode)((*typecode)[3]='S')
#define mm_set_general(typecode)((*typecode)[3]='G')
#define mm_set_skew(typecode) ((*typecode)[3]='K')
#define mm_set_hermitian(typecode)((*typecode)[3]='H')
#define mm_clear_typecode(typecode) ((*typecode)[0]=(*typecode)[1]= \
(*typecode)[2]=' ',(*typecode)[3]='G')
#define mm_initialize_typecode(typecode) mm_clear_typecode(typecode)
/********************* Matrix Market error codes ***************************/
#define MM_COULD_NOT_READ_FILE 11
#define MM_PREMATURE_EOF 12
#define MM_NOT_MTX 13
#define MM_NO_HEADER 14
#define MM_UNSUPPORTED_TYPE 15
#define MM_LINE_TOO_LONG 16
#define MM_COULD_NOT_WRITE_FILE 17
/******************** Matrix Market internal definitions ********************
MM_matrix_typecode: 4-character sequence
                    object    sparse/    data       storage
                              dense      type       scheme
string position:    [0]       [1]        [2]        [3]
Matrix typecode:    M(atrix)  C(oord)    R(eal)     G(eneral)
                              A(rray)    C(omplex)  H(ermitian)
                                         P(attern)  S(ymmetric)
                                         I(nteger)  K(skew)
***********************************************************************/
#define MM_MTX_STR "matrix"
#define MM_ARRAY_STR "array"
#define MM_DENSE_STR "array"
#define MM_COORDINATE_STR "coordinate"
#define MM_SPARSE_STR "coordinate"
#define MM_COMPLEX_STR "complex"
#define MM_REAL_STR "real"
#define MM_INT_STR "integer"
#define MM_GENERAL_STR "general"
#define MM_SYMM_STR "symmetric"
#define MM_HERM_STR "hermitian"
#define MM_SKEW_STR "skew-symmetric"
#define MM_PATTERN_STR "pattern"
/* high level routines */
int mm_write_mtx_crd(char fname[], int M, int N, int nz, int I[], int J[],
double val[], MM_typecode matcode);
int mm_read_mtx_crd_data(FILE *f, int M, int N, int nz, int I[], int J[],
double val[], MM_typecode matcode);
int mm_read_mtx_crd_entry(FILE *f, int *I, int *J, double *real, double *img,
MM_typecode matcode);
int mm_read_unsymmetric_sparse(const char *fname, int *M_, int *N_, int *nz_,
double **val_, int **I_, int **J_);
#endif
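A short driver sketch (hypothetical, not part of the library) showing how the query macros and mm_typecode_to_str() fit together:
#include <stdio.h>
#include "mmio.h"
int main(void) {
    MM_typecode tc = {'M', 'C', 'R', 'G'}; /* matrix, coordinate, real, general */
    /* Each query macro tests a single character of the 4-character code. */
    printf("%d %d %d %d\n", mm_is_matrix(tc), mm_is_sparse(tc),
           mm_is_real(tc), mm_is_general(tc));  /* prints: 1 1 1 1 */
    printf("%s\n", mm_typecode_to_str(tc));     /* matrix coordinate real general */
    return 0;
}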

examples/gmres/util.h Normal file
@@ -0,0 +1,53 @@
/*
Copyright (c) 2012, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __UTIL_H__
#define __UTIL_H__
#include <stdio.h>
#include "matrix.h"
inline void printMatrix (DenseMatrix &M, const char *name) {
printf("Matrix %s:\n", name);
for (int row = 0; row < M.rows(); row++) {
printf("row %2d: ", row + 1);
for (int col = 0; col < M.cols(); col++)
printf("%6f ", M(row, col));
printf("\n");
}
printf("\n");
}
#endif

@@ -1,5 +1,5 @@
/*
Copyright (c) 2010-2011, Intel Corporation
Copyright (c) 2010-2012, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -55,6 +55,7 @@ typedef int64_t __vec1_i64;
struct __vec16_i1 {
__vec16_i1() { }
__vec16_i1(const uint16_t &vv) : v(vv) { }
__vec16_i1(uint32_t v0, uint32_t v1, uint32_t v2, uint32_t v3,
uint32_t v4, uint32_t v5, uint32_t v6, uint32_t v7,
uint32_t v8, uint32_t v9, uint32_t v10, uint32_t v11,
@@ -193,13 +194,22 @@ static FORCEINLINE TYPE NAME(TYPE a, TYPE b) { \
return ret; \
}
#define CMP_OP(TYPE, CAST, NAME, OP) \
static FORCEINLINE __vec16_i1 NAME(TYPE a, TYPE b) { \
#define CMP_OP(TYPE, SUFFIX, CAST, NAME, OP) \
static FORCEINLINE __vec16_i1 NAME##_##SUFFIX(TYPE a, TYPE b) { \
__vec16_i1 ret; \
ret.v = 0; \
for (int i = 0; i < 16; ++i) \
ret.v |= ((CAST)(a.v[i]) OP (CAST)(b.v[i])) << i; \
return ret; \
} \
static FORCEINLINE __vec16_i1 NAME##_##SUFFIX##_and_mask(TYPE a, TYPE b, \
__vec16_i1 mask) { \
__vec16_i1 ret; \
ret.v = 0; \
for (int i = 0; i < 16; ++i) \
ret.v |= ((CAST)(a.v[i]) OP (CAST)(b.v[i])) << i; \
ret.v &= mask.v; \
return ret; \
}
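// A hedged usage sketch for the two expansions above, written against the
// __vec16_i32 instantiation that appears later in this header:
//
//   __vec16_i32 a  = __smear_i32<__vec16_i32>(3);
//   __vec16_i32 b  = __smear_i32<__vec16_i32>(3);
//   __vec16_i1 on  = __smear_i1<__vec16_i1>(1);         // all lanes active
//   __vec16_i1 eq  = __equal_i32(a, b);                 // plain compare
//   __vec16_i1 meq = __equal_i32_and_mask(a, b, on);    // compare, then mask
//
// The _and_mask form folds the AND with the execution mask into the compare
// so the code generator does not need a separate __and() on the result.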
#define INSERT_EXTRACT(VTYPE, STYPE) \
@@ -211,14 +221,16 @@ static FORCEINLINE void __insert_element(VTYPE *v, int index, STYPE val) { \
}
#define LOAD_STORE(VTYPE, STYPE) \
static FORCEINLINE VTYPE __load(VTYPE *p, int align) { \
template <int ALIGN> \
static FORCEINLINE VTYPE __load(const VTYPE *p) { \
STYPE *ptr = (STYPE *)p; \
VTYPE ret; \
for (int i = 0; i < 16; ++i) \
ret.v[i] = ptr[i]; \
return ret; \
} \
static FORCEINLINE void __store(VTYPE *p, VTYPE v, int align) { \
template <int ALIGN> \
static FORCEINLINE void __store(VTYPE *p, VTYPE v) { \
STYPE *ptr = (STYPE *)p; \
for (int i = 0; i < 16; ++i) \
ptr[i] = v.v[i]; \
@@ -251,13 +263,37 @@ static FORCEINLINE TYPE __select(bool cond, TYPE a, TYPE b) { \
return cond ? a : b; \
}
#define SMEAR(VTYPE, NAME, STYPE) \
static FORCEINLINE VTYPE __smear_##NAME(STYPE v) { \
VTYPE ret; \
for (int i = 0; i < 16; ++i) \
ret.v[i] = v; \
return ret; \
}
#define SHIFT_UNIFORM(TYPE, CAST, NAME, OP) \
static FORCEINLINE TYPE NAME(TYPE a, int32_t b) { \
TYPE ret; \
for (int i = 0; i < 16; ++i) \
ret.v[i] = (CAST)(a.v[i]) OP b; \
return ret; \
}
#define SMEAR(VTYPE, NAME, STYPE) \
template <class RetVecType> VTYPE __smear_##NAME(STYPE); \
template <> FORCEINLINE VTYPE __smear_##NAME<VTYPE>(STYPE v) { \
VTYPE ret; \
for (int i = 0; i < 16; ++i) \
ret.v[i] = v; \
return ret; \
}
#define SETZERO(VTYPE, NAME) \
template <class RetVecType> VTYPE __setzero_##NAME(); \
template <> FORCEINLINE VTYPE __setzero_##NAME<VTYPE>() { \
VTYPE ret; \
for (int i = 0; i < 16; ++i) \
ret.v[i] = 0; \
return ret; \
}
#define UNDEF(VTYPE, NAME) \
template <class RetVecType> VTYPE __undef_##NAME(); \
template <> FORCEINLINE VTYPE __undef_##NAME<VTYPE>() { \
return VTYPE(); \
}
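// The RetVecType template parameter exists so that generated code names the
// return type explicitly at the call site; only the matching specialization
// is defined, so a mismatched type is a compile-time error. A hedged sketch
// using instantiations that appear later in this header:
//
//   __vec16_i32 z = __setzero_i32<__vec16_i32>();
//   __vec16_f   s = __smear_float<__vec16_f>(1.0f);
//   __vec16_d   u = __undef_double<__vec16_d>();   // contents unspecified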
#define BROADCAST(VTYPE, NAME, STYPE) \
static FORCEINLINE VTYPE __broadcast_##NAME(VTYPE v, int index) { \
@@ -303,11 +339,23 @@ INSERT_EXTRACT(__vec1_d, double)
///////////////////////////////////////////////////////////////////////////
// mask ops
static FORCEINLINE uint32_t __movmsk(__vec16_i1 mask) {
return mask.v;
static FORCEINLINE uint64_t __movmsk(__vec16_i1 mask) {
return (uint64_t)mask.v;
}
static FORCEINLINE __vec16_i1 __equal(__vec16_i1 a, __vec16_i1 b) {
static FORCEINLINE bool __any(__vec16_i1 mask) {
return (mask.v!=0);
}
static FORCEINLINE bool __all(__vec16_i1 mask) {
return (mask.v==0xFFFF);
}
static FORCEINLINE bool __none(__vec16_i1 mask) {
return (mask.v==0);
}
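// With one bit per lane in a 16-bit field, the predicates above reduce to
// plain integer tests. A hedged sketch:
//
//   __vec16_i1 m = __setzero_i1<__vec16_i1>();   // no lanes active
//   // __none(m) holds; __any(m) and __all(m) do not
//   m = __not(m);                                // all 16 lanes active
//   // __all(m) holds and __movmsk(m) == 0xFFFF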
static FORCEINLINE __vec16_i1 __equal_i1(__vec16_i1 a, __vec16_i1 b) {
__vec16_i1 r;
r.v = (a.v & b.v) | (~a.v & ~b.v);
return r;
@@ -331,6 +379,24 @@ static FORCEINLINE __vec16_i1 __or(__vec16_i1 a, __vec16_i1 b) {
return r;
}
static FORCEINLINE __vec16_i1 __not(__vec16_i1 v) {
__vec16_i1 r;
r.v = ~v.v;
return r;
}
static FORCEINLINE __vec16_i1 __and_not1(__vec16_i1 a, __vec16_i1 b) {
__vec16_i1 r;
r.v = ~a.v & b.v;
return r;
}
static FORCEINLINE __vec16_i1 __and_not2(__vec16_i1 a, __vec16_i1 b) {
__vec16_i1 r;
r.v = a.v & ~b.v;
return r;
}
static FORCEINLINE __vec16_i1 __select(__vec16_i1 mask, __vec16_i1 a,
__vec16_i1 b) {
__vec16_i1 r;
@@ -354,18 +420,36 @@ static FORCEINLINE void __insert_element(__vec16_i1 *vec, int index,
vec->v |= (1 << index);
}
static FORCEINLINE __vec16_i1 __load(__vec16_i1 *p, int align) {
template <int ALIGN> static FORCEINLINE __vec16_i1 __load(const __vec16_i1 *p) {
uint16_t *ptr = (uint16_t *)p;
__vec16_i1 r;
r.v = *ptr;
return r;
}
static FORCEINLINE void __store(__vec16_i1 *p, __vec16_i1 v, int align) {
template <int ALIGN> static FORCEINLINE void __store(__vec16_i1 *p, __vec16_i1 v) {
uint16_t *ptr = (uint16_t *)p;
*ptr = v.v;
}
template <class RetVecType> __vec16_i1 __smear_i1(int i);
template <> FORCEINLINE __vec16_i1 __smear_i1<__vec16_i1>(int v) {
return __vec16_i1(v, v, v, v, v, v, v, v,
v, v, v, v, v, v, v, v);
}
template <class RetVecType> __vec16_i1 __setzero_i1();
template <> FORCEINLINE __vec16_i1 __setzero_i1<__vec16_i1>() {
return __vec16_i1(0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0);
}
template <class RetVecType> __vec16_i1 __undef_i1();
template <> FORCEINLINE __vec16_i1 __undef_i1<__vec16_i1>() {
return __vec16_i1();
}
///////////////////////////////////////////////////////////////////////////
// int8
@@ -386,20 +470,26 @@ BINARY_OP_CAST(__vec16_i8, int8_t, __srem, %)
BINARY_OP_CAST(__vec16_i8, uint8_t, __lshr, >>)
BINARY_OP_CAST(__vec16_i8, int8_t, __ashr, >>)
CMP_OP(__vec16_i8, int8_t, __equal, ==)
CMP_OP(__vec16_i8, int8_t, __not_equal, !=)
CMP_OP(__vec16_i8, uint8_t, __unsigned_less_equal, <=)
CMP_OP(__vec16_i8, int8_t, __signed_less_equal, <=)
CMP_OP(__vec16_i8, uint8_t, __unsigned_greater_equal, >=)
CMP_OP(__vec16_i8, int8_t, __signed_greater_equal, >=)
CMP_OP(__vec16_i8, uint8_t, __unsigned_less_than, <)
CMP_OP(__vec16_i8, int8_t, __signed_less_than, <)
CMP_OP(__vec16_i8, uint8_t, __unsigned_greater_than, >)
CMP_OP(__vec16_i8, int8_t, __signed_greater_than, >)
SHIFT_UNIFORM(__vec16_i8, uint8_t, __lshr, >>)
SHIFT_UNIFORM(__vec16_i8, int8_t, __ashr, >>)
SHIFT_UNIFORM(__vec16_i8, int8_t, __shl, <<)
CMP_OP(__vec16_i8, i8, int8_t, __equal, ==)
CMP_OP(__vec16_i8, i8, int8_t, __not_equal, !=)
CMP_OP(__vec16_i8, i8, uint8_t, __unsigned_less_equal, <=)
CMP_OP(__vec16_i8, i8, int8_t, __signed_less_equal, <=)
CMP_OP(__vec16_i8, i8, uint8_t, __unsigned_greater_equal, >=)
CMP_OP(__vec16_i8, i8, int8_t, __signed_greater_equal, >=)
CMP_OP(__vec16_i8, i8, uint8_t, __unsigned_less_than, <)
CMP_OP(__vec16_i8, i8, int8_t, __signed_less_than, <)
CMP_OP(__vec16_i8, i8, uint8_t, __unsigned_greater_than, >)
CMP_OP(__vec16_i8, i8, int8_t, __signed_greater_than, >)
SELECT(__vec16_i8)
INSERT_EXTRACT(__vec16_i8, int8_t)
SMEAR(__vec16_i8, i8, int8_t)
SETZERO(__vec16_i8, i8)
UNDEF(__vec16_i8, i8)
BROADCAST(__vec16_i8, i8, int8_t)
ROTATE(__vec16_i8, i8, int8_t)
SHUFFLES(__vec16_i8, i8, int8_t)
@@ -425,20 +515,26 @@ BINARY_OP_CAST(__vec16_i16, int16_t, __srem, %)
BINARY_OP_CAST(__vec16_i16, uint16_t, __lshr, >>)
BINARY_OP_CAST(__vec16_i16, int16_t, __ashr, >>)
CMP_OP(__vec16_i16, int16_t, __equal, ==)
CMP_OP(__vec16_i16, int16_t, __not_equal, !=)
CMP_OP(__vec16_i16, uint16_t, __unsigned_less_equal, <=)
CMP_OP(__vec16_i16, int16_t, __signed_less_equal, <=)
CMP_OP(__vec16_i16, uint16_t, __unsigned_greater_equal, >=)
CMP_OP(__vec16_i16, int16_t, __signed_greater_equal, >=)
CMP_OP(__vec16_i16, uint16_t, __unsigned_less_than, <)
CMP_OP(__vec16_i16, int16_t, __signed_less_than, <)
CMP_OP(__vec16_i16, uint16_t, __unsigned_greater_than, >)
CMP_OP(__vec16_i16, int16_t, __signed_greater_than, >)
SHIFT_UNIFORM(__vec16_i16, uint16_t, __lshr, >>)
SHIFT_UNIFORM(__vec16_i16, int16_t, __ashr, >>)
SHIFT_UNIFORM(__vec16_i16, int16_t, __shl, <<)
CMP_OP(__vec16_i16, i16, int16_t, __equal, ==)
CMP_OP(__vec16_i16, i16, int16_t, __not_equal, !=)
CMP_OP(__vec16_i16, i16, uint16_t, __unsigned_less_equal, <=)
CMP_OP(__vec16_i16, i16, int16_t, __signed_less_equal, <=)
CMP_OP(__vec16_i16, i16, uint16_t, __unsigned_greater_equal, >=)
CMP_OP(__vec16_i16, i16, int16_t, __signed_greater_equal, >=)
CMP_OP(__vec16_i16, i16, uint16_t, __unsigned_less_than, <)
CMP_OP(__vec16_i16, i16, int16_t, __signed_less_than, <)
CMP_OP(__vec16_i16, i16, uint16_t, __unsigned_greater_than, >)
CMP_OP(__vec16_i16, i16, int16_t, __signed_greater_than, >)
SELECT(__vec16_i16)
INSERT_EXTRACT(__vec16_i16, int16_t)
SMEAR(__vec16_i16, i16, int16_t)
SETZERO(__vec16_i16, i16)
UNDEF(__vec16_i16, i16)
BROADCAST(__vec16_i16, i16, int16_t)
ROTATE(__vec16_i16, i16, int16_t)
SHUFFLES(__vec16_i16, i16, int16_t)
@@ -464,20 +560,26 @@ BINARY_OP_CAST(__vec16_i32, int32_t, __srem, %)
BINARY_OP_CAST(__vec16_i32, uint32_t, __lshr, >>)
BINARY_OP_CAST(__vec16_i32, int32_t, __ashr, >>)
CMP_OP(__vec16_i32, int32_t, __equal, ==)
CMP_OP(__vec16_i32, int32_t, __not_equal, !=)
CMP_OP(__vec16_i32, uint32_t, __unsigned_less_equal, <=)
CMP_OP(__vec16_i32, int32_t, __signed_less_equal, <=)
CMP_OP(__vec16_i32, uint32_t, __unsigned_greater_equal, >=)
CMP_OP(__vec16_i32, int32_t, __signed_greater_equal, >=)
CMP_OP(__vec16_i32, uint32_t, __unsigned_less_than, <)
CMP_OP(__vec16_i32, int32_t, __signed_less_than, <)
CMP_OP(__vec16_i32, uint32_t, __unsigned_greater_than, >)
CMP_OP(__vec16_i32, int32_t, __signed_greater_than, >)
SHIFT_UNIFORM(__vec16_i32, uint32_t, __lshr, >>)
SHIFT_UNIFORM(__vec16_i32, int32_t, __ashr, >>)
SHIFT_UNIFORM(__vec16_i32, int32_t, __shl, <<)
CMP_OP(__vec16_i32, i32, int32_t, __equal, ==)
CMP_OP(__vec16_i32, i32, int32_t, __not_equal, !=)
CMP_OP(__vec16_i32, i32, uint32_t, __unsigned_less_equal, <=)
CMP_OP(__vec16_i32, i32, int32_t, __signed_less_equal, <=)
CMP_OP(__vec16_i32, i32, uint32_t, __unsigned_greater_equal, >=)
CMP_OP(__vec16_i32, i32, int32_t, __signed_greater_equal, >=)
CMP_OP(__vec16_i32, i32, uint32_t, __unsigned_less_than, <)
CMP_OP(__vec16_i32, i32, int32_t, __signed_less_than, <)
CMP_OP(__vec16_i32, i32, uint32_t, __unsigned_greater_than, >)
CMP_OP(__vec16_i32, i32, int32_t, __signed_greater_than, >)
SELECT(__vec16_i32)
INSERT_EXTRACT(__vec16_i32, int32_t)
SMEAR(__vec16_i32, i32, int32_t)
SETZERO(__vec16_i32, i32)
UNDEF(__vec16_i32, i32)
BROADCAST(__vec16_i32, i32, int32_t)
ROTATE(__vec16_i32, i32, int32_t)
SHUFFLES(__vec16_i32, i32, int32_t)
@@ -503,20 +605,26 @@ BINARY_OP_CAST(__vec16_i64, int64_t, __srem, %)
BINARY_OP_CAST(__vec16_i64, uint64_t, __lshr, >>)
BINARY_OP_CAST(__vec16_i64, int64_t, __ashr, >>)
CMP_OP(__vec16_i64, int64_t, __equal, ==)
CMP_OP(__vec16_i64, int64_t, __not_equal, !=)
CMP_OP(__vec16_i64, uint64_t, __unsigned_less_equal, <=)
CMP_OP(__vec16_i64, int64_t, __signed_less_equal, <=)
CMP_OP(__vec16_i64, uint64_t, __unsigned_greater_equal, >=)
CMP_OP(__vec16_i64, int64_t, __signed_greater_equal, >=)
CMP_OP(__vec16_i64, uint64_t, __unsigned_less_than, <)
CMP_OP(__vec16_i64, int64_t, __signed_less_than, <)
CMP_OP(__vec16_i64, uint64_t, __unsigned_greater_than, >)
CMP_OP(__vec16_i64, int64_t, __signed_greater_than, >)
SHIFT_UNIFORM(__vec16_i64, uint64_t, __lshr, >>)
SHIFT_UNIFORM(__vec16_i64, int64_t, __ashr, >>)
SHIFT_UNIFORM(__vec16_i64, int64_t, __shl, <<)
CMP_OP(__vec16_i64, i64, int64_t, __equal, ==)
CMP_OP(__vec16_i64, i64, int64_t, __not_equal, !=)
CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_less_equal, <=)
CMP_OP(__vec16_i64, i64, int64_t, __signed_less_equal, <=)
CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_greater_equal, >=)
CMP_OP(__vec16_i64, i64, int64_t, __signed_greater_equal, >=)
CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_less_than, <)
CMP_OP(__vec16_i64, i64, int64_t, __signed_less_than, <)
CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_greater_than, >)
CMP_OP(__vec16_i64, i64, int64_t, __signed_greater_than, >)
SELECT(__vec16_i64)
INSERT_EXTRACT(__vec16_i64, int64_t)
SMEAR(__vec16_i64, i64, int64_t)
SETZERO(__vec16_i64, i64)
UNDEF(__vec16_i64, i64)
BROADCAST(__vec16_i64, i64, int64_t)
ROTATE(__vec16_i64, i64, int64_t)
SHUFFLES(__vec16_i64, i64, int64_t)
@@ -530,14 +638,14 @@ BINARY_OP(__vec16_f, __sub, -)
BINARY_OP(__vec16_f, __mul, *)
BINARY_OP(__vec16_f, __div, /)
CMP_OP(__vec16_f, float, __equal, ==)
CMP_OP(__vec16_f, float, __not_equal, !=)
CMP_OP(__vec16_f, float, __less_than, <)
CMP_OP(__vec16_f, float, __less_equal, <=)
CMP_OP(__vec16_f, float, __greater_than, >)
CMP_OP(__vec16_f, float, __greater_equal, >=)
CMP_OP(__vec16_f, float, float, __equal, ==)
CMP_OP(__vec16_f, float, float, __not_equal, !=)
CMP_OP(__vec16_f, float, float, __less_than, <)
CMP_OP(__vec16_f, float, float, __less_equal, <=)
CMP_OP(__vec16_f, float, float, __greater_than, >)
CMP_OP(__vec16_f, float, float, __greater_equal, >=)
static FORCEINLINE __vec16_i1 __ordered(__vec16_f a, __vec16_f b) {
static FORCEINLINE __vec16_i1 __ordered_float(__vec16_f a, __vec16_f b) {
__vec16_i1 ret;
ret.v = 0;
for (int i = 0; i < 16; ++i)
@@ -545,6 +653,14 @@ static FORCEINLINE __vec16_i1 __ordered(__vec16_f a, __vec16_f b) {
return ret;
}
static FORCEINLINE __vec16_i1 __unordered_float(__vec16_f a, __vec16_f b) {
__vec16_i1 ret;
ret.v = 0;
for (int i = 0; i < 16; ++i)
ret.v |= ((a.v[i] != a.v[i]) || (b.v[i] != b.v[i])) ? (1 << i) : 0;
return ret;
}
#if 0
case Instruction::FRem: intrinsic = "__frem"; break;
#endif
@@ -552,11 +668,128 @@ static FORCEINLINE __vec16_i1 __ordered(__vec16_f a, __vec16_f b) {
SELECT(__vec16_f)
INSERT_EXTRACT(__vec16_f, float)
SMEAR(__vec16_f, float, float)
SETZERO(__vec16_f, float)
UNDEF(__vec16_f, float)
BROADCAST(__vec16_f, float, float)
ROTATE(__vec16_f, float, float)
SHUFFLES(__vec16_f, float, float)
LOAD_STORE(__vec16_f, float)
static FORCEINLINE float __exp_uniform_float(float v) {
return expf(v);
}
static FORCEINLINE __vec16_f __exp_varying_float(__vec16_f v) {
__vec16_f ret;
for (int i = 0; i < 16; ++i)
ret.v[i] = expf(v.v[i]);
return ret;
}
static FORCEINLINE float __log_uniform_float(float v) {
return logf(v);
}
static FORCEINLINE __vec16_f __log_varying_float(__vec16_f v) {
__vec16_f ret;
for (int i = 0; i < 16; ++i)
ret.v[i] = logf(v.v[i]);
return ret;
}
static FORCEINLINE float __pow_uniform_float(float a, float b) {
return powf(a, b);
}
static FORCEINLINE __vec16_f __pow_varying_float(__vec16_f a, __vec16_f b) {
__vec16_f ret;
for (int i = 0; i < 16; ++i)
ret.v[i] = powf(a.v[i], b.v[i]);
return ret;
}
static FORCEINLINE int __intbits(float v) {
union {
float f;
int i;
} u;
u.f = v;
return u.i;
}
static FORCEINLINE float __floatbits(int v) {
union {
float f;
int i;
} u;
u.i = v;
return u.f;
}
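// The unions above reinterpret bits without pointer-based type punning;
// __floatbits(__intbits(x)) reproduces x's bit pattern exactly, NaN
// payloads included.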
static FORCEINLINE float __half_to_float_uniform(int16_t h) {
static const uint32_t shifted_exp = 0x7c00 << 13; // exponent mask after shift
int32_t o = ((int32_t)(h & 0x7fff)) << 13; // exponent/mantissa bits
uint32_t exp = shifted_exp & o; // just the exponent
o += (127 - 15) << 23; // exponent adjust
// handle exponent special cases
if (exp == shifted_exp) // Inf/NaN?
o += (128 - 16) << 23; // extra exp adjust
else if (exp == 0) { // Zero/Denormal?
o += 1 << 23; // extra exp adjust
o = __intbits(__floatbits(o) - __floatbits(113 << 23)); // renormalize
}
o |= ((int32_t)(h & 0x8000)) << 16; // sign bit
return __floatbits(o);
}
static FORCEINLINE __vec16_f __half_to_float_varying(__vec16_i16 v) {
__vec16_f ret;
for (int i = 0; i < 16; ++i)
ret.v[i] = __half_to_float_uniform(v.v[i]);
return ret;
}
static FORCEINLINE int16_t __float_to_half_uniform(float f) {
uint32_t sign_mask = 0x80000000u;
int32_t o;
int32_t fint = __intbits(f);
int32_t sign = fint & sign_mask;
fint ^= sign;
int32_t f32infty = 255 << 23;
o = (fint > f32infty) ? 0x7e00 : 0x7c00;
// (De)normalized number or zero
// update fint unconditionally to save the blending; we don't need it
// anymore for the Inf/NaN case anyway.
const uint32_t round_mask = ~0xfffu;
const int32_t magic = 15 << 23;
const int32_t f16infty = 31 << 23;
int32_t fint2 = __intbits(__floatbits(fint & round_mask) * __floatbits(magic)) - round_mask;
fint2 = (fint2 > f16infty) ? f16infty : fint2; // Clamp to signed infinity if overflowed
if (fint < f32infty)
o = fint2 >> 13; // Take the bits!
return (o | (sign >> 16));
}
static FORCEINLINE __vec16_i16 __float_to_half_varying(__vec16_f v) {
__vec16_i16 ret;
for (int i = 0; i < 16; ++i)
ret.v[i] = __float_to_half_uniform(v.v[i]);
return ret;
}
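// A hedged round-trip sketch for the half conversions above:
//
//   int16_t h = __float_to_half_uniform(0.5f);
//   float f   = __half_to_float_uniform(h);   // exactly 0.5f again
//
// 0.5f is exactly representable in fp16, so the round trip is lossless;
// out-of-range magnitudes clamp to infinity and NaN maps to 0x7e00 on the
// way down.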
///////////////////////////////////////////////////////////////////////////
// double
@@ -565,14 +798,14 @@ BINARY_OP(__vec16_d, __sub, -)
BINARY_OP(__vec16_d, __mul, *)
BINARY_OP(__vec16_d, __div, /)
CMP_OP(__vec16_d, double, __equal, ==)
CMP_OP(__vec16_d, double, __not_equal, !=)
CMP_OP(__vec16_d, double, __less_than, <)
CMP_OP(__vec16_d, double, __less_equal, <=)
CMP_OP(__vec16_d, double, __greater_than, >)
CMP_OP(__vec16_d, double, __greater_equal, >=)
CMP_OP(__vec16_d, double, double, __equal, ==)
CMP_OP(__vec16_d, double, double, __not_equal, !=)
CMP_OP(__vec16_d, double, double, __less_than, <)
CMP_OP(__vec16_d, double, double, __less_equal, <=)
CMP_OP(__vec16_d, double, double, __greater_than, >)
CMP_OP(__vec16_d, double, double, __greater_equal, >=)
static FORCEINLINE __vec16_i1 __ordered(__vec16_d a, __vec16_d b) {
static FORCEINLINE __vec16_i1 __ordered_double(__vec16_d a, __vec16_d b) {
__vec16_i1 ret;
ret.v = 0;
for (int i = 0; i < 16; ++i)
@@ -580,6 +813,14 @@ static FORCEINLINE __vec16_i1 __ordered(__vec16_d a, __vec16_d b) {
return ret;
}
static FORCEINLINE __vec16_i1 __unordered_double(__vec16_d a, __vec16_d b) {
__vec16_i1 ret;
ret.v = 0;
for (int i = 0; i < 16; ++i)
ret.v |= ((a.v[i] != a.v[i]) || (b.v[i] != b.v[i])) ? (1 << i) : 0;
return ret;
}
#if 0
case Instruction::FRem: intrinsic = "__frem"; break;
#endif
@@ -587,6 +828,8 @@ static FORCEINLINE __vec16_i1 __ordered(__vec16_d a, __vec16_d b) {
SELECT(__vec16_d)
INSERT_EXTRACT(__vec16_d, double)
SMEAR(__vec16_d, double, double)
SETZERO(__vec16_d, double)
UNDEF(__vec16_d, double)
BROADCAST(__vec16_d, double, double)
ROTATE(__vec16_d, double, double)
SHUFFLES(__vec16_d, double, double)
@@ -919,27 +1162,28 @@ REDUCE_ADD(double, __vec16_d, __reduce_add_double)
REDUCE_MINMAX(double, __vec16_d, __reduce_min_double, <)
REDUCE_MINMAX(double, __vec16_d, __reduce_max_double, >)
REDUCE_ADD(uint32_t, __vec16_i32, __reduce_add_int32)
REDUCE_ADD(int16_t, __vec16_i8, __reduce_add_int8)
REDUCE_ADD(int32_t, __vec16_i16, __reduce_add_int16)
REDUCE_ADD(int64_t, __vec16_i32, __reduce_add_int32)
REDUCE_MINMAX(int32_t, __vec16_i32, __reduce_min_int32, <)
REDUCE_MINMAX(int32_t, __vec16_i32, __reduce_max_int32, >)
REDUCE_ADD(uint32_t, __vec16_i32, __reduce_add_uint32)
REDUCE_MINMAX(uint32_t, __vec16_i32, __reduce_min_uint32, <)
REDUCE_MINMAX(uint32_t, __vec16_i32, __reduce_max_uint32, >)
REDUCE_ADD(uint64_t, __vec16_i64, __reduce_add_int64)
REDUCE_ADD(int64_t, __vec16_i64, __reduce_add_int64)
REDUCE_MINMAX(int64_t, __vec16_i64, __reduce_min_int64, <)
REDUCE_MINMAX(int64_t, __vec16_i64, __reduce_max_int64, >)
REDUCE_ADD(uint64_t, __vec16_i64, __reduce_add_uint64)
REDUCE_MINMAX(uint64_t, __vec16_i64, __reduce_min_uint64, <)
REDUCE_MINMAX(uint64_t, __vec16_i64, __reduce_max_uint64, >)
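// The add reductions return a type wide enough to hold the worst-case sum:
// sixteen int8 lanes can reach +/-2032, which overflows int8_t but fits
// int16_t, and the int16/int32 variants widen likewise. A hedged sketch:
//
//   __vec16_i8 v = __smear_i8<__vec16_i8>(127);
//   int16_t sum  = __reduce_add_int8(v);   // 16 * 127 == 2032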
///////////////////////////////////////////////////////////////////////////
// masked load/store
static FORCEINLINE __vec16_i8 __masked_load_8(unsigned char *p,
__vec16_i1 mask) {
static FORCEINLINE __vec16_i8 __masked_load_i8(void *p,
__vec16_i1 mask) {
__vec16_i8 ret;
int8_t *ptr = (int8_t *)p;
for (int i = 0; i < 16; ++i)
@@ -948,8 +1192,8 @@ static FORCEINLINE __vec16_i8 __masked_load_8(unsigned char *p,
return ret;
}
static FORCEINLINE __vec16_i16 __masked_load_16(unsigned char *p,
__vec16_i1 mask) {
static FORCEINLINE __vec16_i16 __masked_load_i16(void *p,
__vec16_i1 mask) {
__vec16_i16 ret;
int16_t *ptr = (int16_t *)p;
for (int i = 0; i < 16; ++i)
@@ -958,8 +1202,8 @@ static FORCEINLINE __vec16_i16 __masked_load_16(unsigned char *p,
return ret;
}
static FORCEINLINE __vec16_i32 __masked_load_32(unsigned char *p,
__vec16_i1 mask) {
static FORCEINLINE __vec16_i32 __masked_load_i32(void *p,
__vec16_i1 mask) {
__vec16_i32 ret;
int32_t *ptr = (int32_t *)p;
for (int i = 0; i < 16; ++i)
@@ -968,8 +1212,18 @@ static FORCEINLINE __vec16_i32 __masked_load_32(unsigned char *p,
return ret;
}
static FORCEINLINE __vec16_i64 __masked_load_64(unsigned char *p,
__vec16_i1 mask) {
static FORCEINLINE __vec16_f __masked_load_float(void *p,
__vec16_i1 mask) {
__vec16_f ret;
float *ptr = (float *)p;
for (int i = 0; i < 16; ++i)
if ((mask.v & (1 << i)) != 0)
ret.v[i] = ptr[i];
return ret;
}
static FORCEINLINE __vec16_i64 __masked_load_i64(void *p,
__vec16_i1 mask) {
__vec16_i64 ret;
int64_t *ptr = (int64_t *)p;
for (int i = 0; i < 16; ++i)
@@ -978,31 +1232,49 @@ static FORCEINLINE __vec16_i64 __masked_load_64(unsigned char *p,
return ret;
}
static FORCEINLINE void __masked_store_8(unsigned char *p, __vec16_i8 val,
__vec16_i1 mask) {
static FORCEINLINE __vec16_d __masked_load_double(void *p,
__vec16_i1 mask) {
__vec16_d ret;
double *ptr = (double *)p;
for (int i = 0; i < 16; ++i)
if ((mask.v & (1 << i)) != 0)
ret.v[i] = ptr[i];
return ret;
}
static FORCEINLINE void __masked_store_i8(void *p, __vec16_i8 val,
__vec16_i1 mask) {
int8_t *ptr = (int8_t *)p;
for (int i = 0; i < 16; ++i)
if ((mask.v & (1 << i)) != 0)
ptr[i] = val.v[i];
}
static FORCEINLINE void __masked_store_16(unsigned char *p, __vec16_i16 val,
__vec16_i1 mask) {
static FORCEINLINE void __masked_store_i16(void *p, __vec16_i16 val,
__vec16_i1 mask) {
int16_t *ptr = (int16_t *)p;
for (int i = 0; i < 16; ++i)
if ((mask.v & (1 << i)) != 0)
ptr[i] = val.v[i];
}
static FORCEINLINE void __masked_store_32(unsigned char *p, __vec16_i32 val,
__vec16_i1 mask) {
static FORCEINLINE void __masked_store_i32(void *p, __vec16_i32 val,
__vec16_i1 mask) {
int32_t *ptr = (int32_t *)p;
for (int i = 0; i < 16; ++i)
if ((mask.v & (1 << i)) != 0)
ptr[i] = val.v[i];
}
static FORCEINLINE void __masked_store_64(unsigned char *p, __vec16_i64 val,
static FORCEINLINE void __masked_store_float(void *p, __vec16_f val,
__vec16_i1 mask) {
float *ptr = (float *)p;
for (int i = 0; i < 16; ++i)
if ((mask.v & (1 << i)) != 0)
ptr[i] = val.v[i];
}
static FORCEINLINE void __masked_store_i64(void *p, __vec16_i64 val,
__vec16_i1 mask) {
int64_t *ptr = (int64_t *)p;
for (int i = 0; i < 16; ++i)
@@ -1010,33 +1282,75 @@ static FORCEINLINE void __masked_store_64(unsigned char *p, __vec16_i64 val,
ptr[i] = val.v[i];
}
static FORCEINLINE void __masked_store_double(void *p, __vec16_d val,
__vec16_i1 mask) {
double *ptr = (double *)p;
for (int i = 0; i < 16; ++i)
if ((mask.v & (1 << i)) != 0)
ptr[i] = val.v[i];
}
static FORCEINLINE void __masked_store_blend_i8(void *p, __vec16_i8 val,
__vec16_i1 mask) {
__masked_store_i8(p, val, mask);
}
static FORCEINLINE void __masked_store_blend_i16(void *p, __vec16_i16 val,
__vec16_i1 mask) {
__masked_store_i16(p, val, mask);
}
static FORCEINLINE void __masked_store_blend_i32(void *p, __vec16_i32 val,
__vec16_i1 mask) {
__masked_store_i32(p, val, mask);
}
static FORCEINLINE void __masked_store_blend_float(void *p, __vec16_f val,
__vec16_i1 mask) {
__masked_store_float(p, val, mask);
}
static FORCEINLINE void __masked_store_blend_i64(void *p, __vec16_i64 val,
__vec16_i1 mask) {
__masked_store_i64(p, val, mask);
}
static FORCEINLINE void __masked_store_blend_double(void *p, __vec16_d val,
__vec16_i1 mask) {
__masked_store_double(p, val, mask);
}
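// Here the _blend variants simply forward to the element-wise masked
// stores; on real vector hardware a blend store may read the whole
// destination, blend, and write all lanes back, so it is only safe when
// every lane's memory is accessible. A hedged sketch:
//
//   float buf[16] = { 0 };
//   __vec16_i1 m = __smear_i1<__vec16_i1>(1);
//   __masked_store_blend_float(buf, __smear_float<__vec16_f>(2.0f), m);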
///////////////////////////////////////////////////////////////////////////
// gather/scatter
// offsets * offsetScale is in bytes (for all of these)
#define GATHER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \
static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE offsets, uint32_t scale,\
__vec16_i1 mask) { \
static FORCEINLINE VTYPE FUNC(unsigned char *b, uint32_t scale, \
OTYPE offset, __vec16_i1 mask) { \
VTYPE ret; \
int8_t *base = (int8_t *)b; \
for (int i = 0; i < 16; ++i) \
if ((mask.v & (1 << i)) != 0) { \
STYPE *ptr = (STYPE *)(base + scale * offsets.v[i]); \
STYPE *ptr = (STYPE *)(base + scale * offset.v[i]); \
ret.v[i] = *ptr; \
} \
return ret; \
}
GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __gather_base_offsets32_i8)
GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __gather_base_offsets64_i8)
GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __gather_base_offsets32_i8)
GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __gather_base_offsets64_i8)
GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __gather_base_offsets32_i16)
GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __gather_base_offsets64_i16)
GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __gather_base_offsets32_i32)
GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __gather_base_offsets64_i32)
GATHER_BASE_OFFSETS(__vec16_f, float, __vec16_i32, __gather_base_offsets32_float)
GATHER_BASE_OFFSETS(__vec16_f, float, __vec16_i64, __gather_base_offsets64_float)
GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __gather_base_offsets32_i64)
GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __gather_base_offsets64_i64)
GATHER_BASE_OFFSETS(__vec16_d, double, __vec16_i32, __gather_base_offsets32_double)
GATHER_BASE_OFFSETS(__vec16_d, double, __vec16_i64, __gather_base_offsets64_double)
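// Per the comment above, offset * scale is a byte offset from the base
// pointer. A hedged gather sketch over a plain float array:
//
//   float data[16] = { /* ... */ };
//   __vec16_i32 off = ...;   // lane i holding int32_t(i * sizeof(float))
//   __vec16_i1 m = __smear_i1<__vec16_i1>(1);
//   __vec16_f g = __gather_base_offsets32_float((unsigned char *)data,
//                                               1, off, m);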
#define GATHER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \
static FORCEINLINE VTYPE FUNC(PTRTYPE ptrs, __vec16_i1 mask) { \
@@ -1049,37 +1363,46 @@ static FORCEINLINE VTYPE FUNC(PTRTYPE ptrs, __vec16_i1 mask) { \
return ret; \
}
GATHER_GENERAL(__vec16_i8, int8_t, __vec16_i32, __gather32_i8)
GATHER_GENERAL(__vec16_i8, int8_t, __vec16_i64, __gather64_i8)
GATHER_GENERAL(__vec16_i8, int8_t, __vec16_i32, __gather32_i8)
GATHER_GENERAL(__vec16_i8, int8_t, __vec16_i64, __gather64_i8)
GATHER_GENERAL(__vec16_i16, int16_t, __vec16_i32, __gather32_i16)
GATHER_GENERAL(__vec16_i16, int16_t, __vec16_i64, __gather64_i16)
GATHER_GENERAL(__vec16_i32, int32_t, __vec16_i32, __gather32_i32)
GATHER_GENERAL(__vec16_i32, int32_t, __vec16_i64, __gather64_i32)
GATHER_GENERAL(__vec16_f, float, __vec16_i32, __gather32_float)
GATHER_GENERAL(__vec16_f, float, __vec16_i64, __gather64_float)
GATHER_GENERAL(__vec16_i64, int64_t, __vec16_i32, __gather32_i64)
GATHER_GENERAL(__vec16_i64, int64_t, __vec16_i64, __gather64_i64)
GATHER_GENERAL(__vec16_d, double, __vec16_i32, __gather32_double)
GATHER_GENERAL(__vec16_d, double, __vec16_i64, __gather64_double)
// scatter
#define SCATTER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \
static FORCEINLINE void FUNC(unsigned char *b, OTYPE offsets, uint32_t scale,\
VTYPE val, __vec16_i1 mask) { \
static FORCEINLINE void FUNC(unsigned char *b, uint32_t scale, \
OTYPE offset, VTYPE val, \
__vec16_i1 mask) { \
int8_t *base = (int8_t *)b; \
for (int i = 0; i < 16; ++i) \
if ((mask.v & (1 << i)) != 0) { \
STYPE *ptr = (STYPE *)(base + scale * offsets.v[i]); \
STYPE *ptr = (STYPE *)(base + scale * offset.v[i]); \
*ptr = val.v[i]; \
} \
}
SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __scatter_base_offsets32_i8)
SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __scatter_base_offsets64_i8)
SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __scatter_base_offsets32_i8)
SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __scatter_base_offsets64_i8)
SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __scatter_base_offsets32_i16)
SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __scatter_base_offsets64_i16)
SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __scatter_base_offsets32_i32)
SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __scatter_base_offsets64_i32)
SCATTER_BASE_OFFSETS(__vec16_f, float, __vec16_i32, __scatter_base_offsets32_float)
SCATTER_BASE_OFFSETS(__vec16_f, float, __vec16_i64, __scatter_base_offsets64_float)
SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __scatter_base_offsets32_i64)
SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __scatter_base_offsets64_i64)
SCATTER_BASE_OFFSETS(__vec16_d, double, __vec16_i32, __scatter_base_offsets32_double)
SCATTER_BASE_OFFSETS(__vec16_d, double, __vec16_i64, __scatter_base_offsets64_double)
#define SCATTER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \
static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec16_i1 mask) { \
@@ -1091,14 +1414,18 @@ static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec16_i1 mask) { \
} \
}
SCATTER_GENERAL(__vec16_i8, int8_t, __vec16_i32, __scatter32_i8)
SCATTER_GENERAL(__vec16_i8, int8_t, __vec16_i64, __scatter64_i8)
SCATTER_GENERAL(__vec16_i8, int8_t, __vec16_i32, __scatter32_i8)
SCATTER_GENERAL(__vec16_i8, int8_t, __vec16_i64, __scatter64_i8)
SCATTER_GENERAL(__vec16_i16, int16_t, __vec16_i32, __scatter32_i16)
SCATTER_GENERAL(__vec16_i16, int16_t, __vec16_i64, __scatter64_i16)
SCATTER_GENERAL(__vec16_i32, int32_t, __vec16_i32, __scatter32_i32)
SCATTER_GENERAL(__vec16_i32, int32_t, __vec16_i64, __scatter64_i32)
SCATTER_GENERAL(__vec16_f, float, __vec16_i32, __scatter32_float)
SCATTER_GENERAL(__vec16_f, float, __vec16_i64, __scatter64_float)
SCATTER_GENERAL(__vec16_i64, int64_t, __vec16_i32, __scatter32_i64)
SCATTER_GENERAL(__vec16_i64, int64_t, __vec16_i64, __scatter64_i64)
SCATTER_GENERAL(__vec16_d, double, __vec16_i32, __scatter32_double)
SCATTER_GENERAL(__vec16_d, double, __vec16_i64, __scatter64_double)
///////////////////////////////////////////////////////////////////////////
// packed load/store
@@ -1432,3 +1759,23 @@ static FORCEINLINE uint64_t __atomic_cmpxchg(uint64_t *p, uint64_t cmpval,
return __sync_val_compare_and_swap(p, cmpval, newval);
#endif
}
#ifdef WIN32
#include <windows.h>
#define __clock __rdtsc
#else // WIN32
static FORCEINLINE uint64_t __clock() {
uint32_t low, high;
#ifdef __x86_64
__asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid"
::: "%rax", "%rbx", "%rcx", "%rdx" );
#else
__asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid"
::: "%eax", "%ebx", "%ecx", "%edx" );
#endif
__asm__ __volatile__ ("rdtsc" : "=a" (low), "=d" (high));
return (uint64_t)high << 32 | low;
}
#endif // !WIN32
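// The cpuid above serializes the pipeline so rdtsc cannot sample the
// counter before earlier instructions retire. A hedged timing sketch:
//
//   uint64_t t0 = __clock();
//   /* ... code under measurement ... */
//   uint64_t cycles = __clock() - t0;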

File diff suppressed because it is too large.

File diff suppressed because it is too large.

examples/intrinsics/knc.h (new file, 2144 lines): diff suppressed because it is too large.

examples/intrinsics/knc2x.h (new file, 2078 lines): diff suppressed because it is too large.

File diff suppressed because it is too large.

Some files were not shown because too many files have changed in this diff.